#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Learn a letter-to-letter transliteration scheme from a bilingual lexicon.

The lexicon is a UTF-8 text file with one tab-separated word pair per line.
Pairs whose two words share the same letter-repetition pattern are treated
as transliterations of each other; their letters are aligned position by
position, and every source letter is mapped to the target letter it was
aligned with most often.

Usage: pass the lexicon path as the first command-line argument.  The
resulting mapping is printed as one "source target" line per source letter.

The script runs on both Python 2.7 and Python 3.
"""

import codecs
import io
import sys
from collections import defaultdict


def readlexicon(lexiconfile):
    """Read the word pairs worth training on from a tab-separated lexicon.

    A line is kept only when, after stripping surrounding whitespace, it has
    exactly two tab-separated fields, the two words differ, and both words
    are longer than 3 and shorter than 10 characters.

    :param lexiconfile: path to a UTF-8 encoded lexicon file.
    :returns: list of ``(e, k)`` tuples of unicode strings, in file order.
    """
    # io.open decodes while reading on both Python 2 and 3, and the context
    # manager guarantees the file handle is closed.
    with io.open(lexiconfile, encoding='utf-8') as handle:
        lines = handle.read().splitlines()

    pairs = []
    for line in lines:
        fields = line.strip().split('\t')
        if len(fields) != 2:
            continue
        e, k = fields
        if e != k and 3 < len(e) < 10 and 3 < len(k) < 10:
            pairs.append((e, k))
    return pairs


def patternize(word):
    """Return the repetition pattern of *word* as a string of indices.

    Every letter is replaced by the index of its first occurrence, so
    ``'abca'`` becomes ``'0120'``.  The indices are concatenated without a
    separator, which is unambiguous only while every index is a single
    digit; the words returned by :func:`readlexicon` are shorter than 10
    characters, so that holds for them.
    """
    first_seen = {}
    for position, letter in enumerate(word):
        # setdefault keeps the earliest position, matching word.index().
        first_seen.setdefault(letter, position)
    return ''.join(str(first_seen[letter]) for letter in word)


def freq(seq):
    """Count how often each element occurs in *seq*.

    :returns: a ``defaultdict(int)`` mapping element -> occurrence count.
    """
    fq = defaultdict(int)
    for elem in seq:
        fq[elem] += 1
    return fq


def build_scheme(pairs):
    """Derive the most frequent letter mapping from *pairs*.

    Only pairs whose two words have the same :func:`patternize` pattern are
    used.  Their letters are aligned with ``zip`` and each source letter is
    mapped to the target letter it was aligned with most often; on a tie the
    candidate encountered first wins.

    :param pairs: iterable of ``(e, k)`` word pairs.
    :returns: dict mapping each source letter to its chosen target letter.
    """
    matches = []
    for e, k in pairs:
        if patternize(e) == patternize(k):
            matches.extend(zip(e, k))

    # Group the (target letter, count) candidates by source letter.
    candidates = defaultdict(list)
    for (e, k), count in freq(matches).items():
        candidates[e].append((k, count))

    # max() returns the first maximal item, i.e. the same winner as a
    # "strictly greater than" scan over the candidates.
    return {
        e: max(options, key=lambda option: option[1])[0]
        for e, options in candidates.items()
    }


def main(argv=None):
    """Train on the lexicon named by the first argument and print the scheme.

    :param argv: argument list without the program name; defaults to
        ``sys.argv[1:]``.  Arguments after the first are ignored.
    """
    if argv is None:
        argv = sys.argv[1:]
    if not argv:
        sys.exit('usage: %s LEXICONFILE' % sys.argv[0])

    if sys.version_info[0] < 3:
        # Python 2 needs an encoding writer to print unicode objects.
        sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
    elif hasattr(sys.stdout, 'reconfigure'):
        # Python 3.7+: keep the output UTF-8 regardless of the locale.
        sys.stdout.reconfigure(encoding='utf-8')

    scheme = build_scheme(readlexicon(argv[0]))
    for e in scheme:
        # A single parenthesised string prints the same on Python 2 and 3.
        print('%s %s' % (e, scheme[e]))


if __name__ == '__main__':
    main()