#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Compare the letter-repetition patterns of paired words from a lexicon.

The lexicon is a UTF-8 text file with tab-separated fields, one entry per
line (the default path suggests English/Greek pairs -- TODO confirm).  Run
as a script, the module prints one random entry, then picks a random pair
whose two words differ in length by exactly one character and reports where
their repetition patterns first diverge, scanning from the front and from
the back.
"""
import codecs
import io
import sys
from random import choice

DEFAULT_LEXICON_PATH = 'corpora/en2el.txt'


class Lexicon:
    """A list of tab-separated entries loaded from a UTF-8 text file.

    Attributes:
        path: the file the entries were read from.
        lexicon: list of entries; each entry is the list of fields obtained
            by stripping a line and splitting it on tabs.
    """

    def __init__(self, path):
        self.path = path
        self.lexicon = self.loadlexicon(path)

    def __iter__(self):
        """Iterate over the loaded entries, in file order."""
        return iter(self.lexicon)

    def loadlexicon(self, path):
        """Read *path* and return its lines as lists of tab-separated fields.

        The file is decoded as UTF-8 and closed before returning.
        """
        with io.open(path, encoding='utf-8') as handle:
            return [line.strip().split('\t') for line in handle]

    def randpair(self):
        """Return one entry chosen at random.

        Raises:
            IndexError: if the lexicon is empty.
        """
        return choice(self.lexicon)


def backwards(word):
    """Return the items of *word* in reverse order, as a list."""
    return list(reversed(word))


def pattern(word):
    """Return, for each item of *word*, the index of its first occurrence.

    For example ``pattern('hello')`` is ``[0, 1, 2, 2, 4]``.
    """
    return [word.index(letter) for letter in word]


def onset(pairs):
    """Return the index of the first pair whose two members differ.

    Raises:
        IndexError: if every pair holds two equal values (including when
            *pairs* is empty).
    """
    for position, (left, right) in enumerate(pairs):
        if left != right:
            return position
    raise IndexError('the patterns never diverge')


def patternzip(a, b):
    """Pair up the patterns of *a* and *b*, truncated to the shorter one.

    Always returns a list, so the result can be indexed or traversed more
    than once on both Python 2 and Python 3.
    """
    return list(zip(pattern(a), pattern(b)))


def despace(w):
    """Return *w* with every space character removed."""
    return w.replace(' ', '')


def _onset_or_none(a, b):
    """Return the onset of divergence between *a* and *b*, or None if none."""
    try:
        return onset(patternzip(a, b))
    except IndexError:
        return None


def main(path=DEFAULT_LEXICON_PATH):
    """Print a random entry, then a pattern comparison of one random pair.

    Exits with an error message when no entry has two fields whose lengths
    differ by exactly one.
    """
    lexicon = Lexicon(path)
    # Kept from the original script, which printed one random entry before
    # doing anything else.
    print(lexicon.randpair())

    # Entries without exactly two fields (blank or malformed lines) cannot
    # be unpacked into a pair, so they are skipped.  The length test is
    # applied before spaces are removed, as in the original.
    candidates = [
        (despace(entry[0]), despace(entry[1]))
        for entry in lexicon
        if len(entry) == 2 and abs(len(entry[0]) - len(entry[1])) == 1
    ]
    if not candidates:
        sys.exit('no word pairs differing in length by one in %s' % path)

    en, el = choice(candidates)
    ne, le = backwards(en), backwards(el)
    i = _onset_or_none(ne, le)
    j = _onset_or_none(en, el)

    print(en)
    print(el)
    # NOTE(review): i is measured on the reversed words but is used below as
    # a forward slice index, exactly as in the original -- confirm intent.
    if i is not None and j is not None and abs(i - j) < 3:
        print(en[i:j + 1])
        print(el[i:j + 1])
    print('-')


if __name__ == "__main__":
    if sys.version_info[0] < 3:
        # Python 2 only: encode unicode output as UTF-8.  Python 3 text
        # streams already accept str, and wrapping them would make print()
        # fail by writing bytes to a text stream.
        sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
    main()