#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Derive a character-level transliteration scheme from a bilingual lexicon.

The lexicon is a UTF-8 text file holding one word pair per line.  Character
correspondences are collected from the pairs, counted, and written out as a
scheme file with one "source target" mapping per line.

Example: ``python xlit.py corpora/ru2en.txt`` writes
``schemes/ru2en.scheme``.
"""

from collections import defaultdict
import codecs
import io
import sys
from operator import itemgetter

# Python 2 only: wrap stdout so unicode text is encoded as UTF-8 when it is
# printed.  Python 3's sys.stdout is already a text stream, and wrapping it
# with a byte-oriented codec writer would make every print fail.
if sys.version_info[0] < 3:
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout)


def read_lexicon(lexicon_file):
    """Read word pairs from a UTF-8 encoded lexicon file.

    Each line is split on whitespace; only lines that consist of exactly two
    fields are kept, all other lines are skipped.

    Args:
        lexicon_file: path of the lexicon file.

    Returns:
        A list of 2-tuples of text strings, in file order.
    """
    lexicon = []
    with io.open(lexicon_file, mode='r', encoding='utf-8') as infile:
        for line in infile:
            # The same split is used for filtering and for building the
            # tuple, so every kept entry is guaranteed to unpack as a pair.
            fields = line.split()
            if len(fields) == 2:
                lexicon.append(tuple(fields))
    return lexicon


def write_scheme(schemefile, scheme=None):
    """Write a scheme mapping to a UTF-8 file, one "key value" pair per line.

    Args:
        schemefile: path of the file to write (overwritten if it exists).
        scheme: mapping to write.  When omitted, the module-level name
            ``scheme`` is used, which keeps one-argument calls working.

    Raises:
        NameError: if ``scheme`` is omitted and no module-level ``scheme``
            exists.
    """
    if scheme is None:
        try:
            scheme = globals()['scheme']
        except KeyError:
            raise NameError("global name 'scheme' is not defined")
    # newline='\n' keeps the line terminator exactly as written.
    with io.open(schemefile, mode='w', encoding='utf-8', newline='\n') as out:
        for a, b in scheme.items():
            out.write(u"%s %s\n" % (a, b))
    print("wrote scheme to " + schemefile)


def pattern(w):
    """Return, for each element of ``w``, the index of its first occurrence.

    Two words have equal patterns exactly when their repeated letters fall
    in the same positions, e.g. ``pattern('hello') == (0, 1, 2, 2, 4)``.
    """
    first_seen = {}
    return tuple(first_seen.setdefault(c, i) for i, c in enumerate(w))


def samepattern(a, b):
    """Return True if ``a`` and ``b`` have the same repetition pattern."""
    return pattern(a) == pattern(b)


def freq(seq):
    """Count the occurrences of each element of ``seq``.

    Returns:
        A ``defaultdict(int)`` mapping element -> count.
    """
    fq = defaultdict(int)
    for elem in seq:
        fq[elem] += 1
    return fq


def ngrams(sequence, n):
    """Return all contiguous length-``n`` slices of ``sequence``, in order.

    The result is empty when ``n`` is larger than ``len(sequence)``.
    """
    return [sequence[i:i + n] for i in range(len(sequence) + 1 - n)]


def bigrams(sequence):
    """Return all contiguous length-2 slices of ``sequence``."""
    return ngrams(sequence, 2)


def backwards(word):
    """Return the string ``word`` with its characters in reverse order."""
    return ''.join(reversed(word))


def might_have_digraph(a, b, pairs):
    """Unfinished helper; currently always returns None.

    NOTE(review): only the forward and backward character alignments of
    ``a`` and ``b`` are computed; ``pairs`` is not used and no decision is
    made yet.  Nothing in this file calls this function.
    """
    # list() keeps the concatenation valid on Python 3, where zip() is lazy.
    alignments = list(zip(a, b)) + list(zip(backwards(a), backwards(b)))
    return None


def collectpairs(lexicon):
    """Yield aligned correspondences from the word pairs in ``lexicon``.

    * Two different words with the same repetition pattern are aligned
      character by character, yielding ``(char_of_a, char_of_b)``.
    * Words whose lengths differ by exactly one are aligned character to
      bigram, yielding ``(char_of_shorter, bigram_of_longer)``.
    * All other pairs contribute nothing.

    NOTE(review): in the bigram case the pair is ordered shorter-to-longer
    regardless of which word came first in the lexicon entry - confirm that
    this direction is intended.
    """
    for a, b in lexicon:
        if samepattern(a, b) and a != b:
            for pair in zip(a, b):
                yield pair
        elif abs(len(a) - len(b)) == 1:
            shorter, longer = sorted([a, b], key=len)
            digraphs = bigrams(longer)
            for pair in zip(shorter, digraphs):
                yield pair
            # Walk the alignment from the end as well.  The list of bigrams
            # is reversed as a list; joining it into one string (as was done
            # before) paired single characters with unrelated characters.
            # NOTE(review): len(digraphs) == len(shorter), so this pass
            # yields the same pairs as the forward pass in reverse order and
            # each pair is counted twice - confirm that weighting is wanted.
            for pair in zip(reversed(shorter), reversed(digraphs)):
                yield pair


def genscheme(pairs, report_threshold=10):
    """Build a scheme that maps each source item to a target item.

    Pairs are processed in order of ascending frequency, so for every source
    the most frequent target is assigned last and wins.

    Args:
        pairs: iterable of ``(source, target)`` pairs.
        report_threshold: pairs seen more often than this are printed as
            "source target count".  Pass None to print nothing.

    Returns:
        A dict mapping source -> target.
    """
    pairsbyfreq = sorted(freq(pairs).items(), key=itemgetter(1))
    scheme = {}
    for (a, b), fq in pairsbyfreq:
        if report_threshold is not None and fq > report_threshold:
            print("%s %s %s" % (a, b, fq))
        scheme[a] = b
    return scheme


def default_scheme_path(lexfile):
    """Derive the scheme file path that belongs to a lexicon path.

    'corpora' is replaced by 'schemes' and '.txt' by '.scheme'.  If neither
    substitution changes the path, '.scheme' is appended so that the output
    can never overwrite the lexicon itself.
    """
    schemefile = lexfile.replace('corpora', 'schemes').replace('.txt', '.scheme')
    if schemefile == lexfile:
        schemefile = lexfile + '.scheme'
    return schemefile


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: argument list including the program name; defaults to
            ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 1 on a usage error.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        print("Usage: python xlit.py <lexicon_file>")
        return 1

    lexfile = argv[1]
    schemefile = default_scheme_path(lexfile)

    lexicon = read_lexicon(lexfile)
    pairs = collectpairs(lexicon)
    scheme = genscheme(pairs)
    write_scheme(schemefile, scheme)
    return 0


if __name__ == "__main__":
    sys.exit(main())