#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Make a guess at a transliteration mapping, given a lexicon.

Usage: python <this script> LEXFILE

LEXFILE is a UTF-8 text file with one tab-separated entry per line:
a code, the target-script word and the source-script word, e.g.

    en<TAB>बिहार<TAB>Bihar

Lexicons are extracted from Wikipedia using extractterms.py; the examples
here are English/Hindi (e.g. '/media/SEAGATE/pat/l/lex/en2hi.txt').

The script pairs up the letters of each source word with the letters of its
target word position by position, counts how often every letter pair occurs,
and prints the most frequent pairs.
"""

import codecs
import io
import sys
from unicodedata import name

from textual import freq

# Number of most frequent letter pairs that are reported.
TOP_PAIRS = 100

# Printed in place of a character name for code points that have none;
# without a default, unicodedata.name() raises ValueError and aborts the run.
UNNAMED = '<unnamed>'


def load_lexicon(path):
    """Read the lexicon at *path* into a {lowercased source: target} dict.

    Lines that do not consist of exactly three tab-separated fields are
    skipped.  When the same (lowercased) source word occurs more than once,
    the last entry wins.
    """
    # The context manager closes the file even if decoding fails.
    with io.open(path, encoding='utf-8') as handle:
        text = handle.read()

    lexicon = {}
    # splitlines() on the decoded text, as before, so every Unicode line
    # boundary is honoured.
    for line in text.splitlines():
        fields = line.split('\t')
        if len(fields) != 3:
            continue
        _code, target, source = fields
        lexicon[source.lower()] = target
    # lexicon now looks like: { u'bihar' : u'बिहार' , ... }
    return lexicon


def letter_correspondences(lexicon):
    """Return every positional letter pair found in *lexicon*.

    The result is a list such as [('b', 'ब'), ('h', 'ह'), ...].  Each word
    pair is zipped twice: once from the front and once from the back,
    because letter-letter correspondences may be better at the end of a
    word than at the beginning.  zip() stops at the shorter word, so surplus
    letters of the longer word are ignored.
    """
    pairs = []
    for source, target in lexicon.items():
        pairs.extend(zip(source, target))
        pairs.extend(zip(reversed(source), reversed(target)))
    return pairs


def main(argv):
    """Print the TOP_PAIRS most frequent letter pairs of the lexicon argv[1].

    Each output line holds the Unicode name of the source letter, the source
    letter and the target letter, separated by single spaces.  Lines are
    ordered by ascending frequency (ties broken by comparing the pairs), so
    the most frequent pair is printed last.
    """
    if len(argv) < 2:
        sys.exit('usage: %s LEXFILE' % argv[0])

    # Force UTF-8 on the standard streams.  Python 3 text streams expose the
    # underlying byte stream as .buffer; Python 2 file objects are already
    # byte streams and are wrapped directly.
    sys.stdout = codecs.getwriter('utf-8')(
        getattr(sys.stdout, 'buffer', sys.stdout))
    sys.stdin = codecs.getreader('utf-8')(
        getattr(sys.stdin, 'buffer', sys.stdin))

    # NOTE(review): freq() comes from the project-local `textual` module;
    # this code relies only on its result having .items() that yields
    # (pair, count) tuples -- confirm against textual.freq.
    counts = freq(letter_correspondences(load_lexicon(argv[1])))

    ranked = sorted((count, pair) for pair, count in counts.items())
    for _count, (source_char, target_char) in ranked[-TOP_PAIRS:]:
        label = name(source_char, UNNAMED)
        print(' '.join((label, source_char, target_char)))


if __name__ == '__main__':
    main(sys.argv)