from unicodedata import category from collections import defaultdict import sys sys.path.append('/home/pat/repo/translit') import translit def ngrams(text,n): return [text[i:i+n] for i in range(len(text)-n+1)] def bigrams(text): return ngrams(text,2) def trigrams(text): return ngrams(text,3) def depunc(text): return ''.join([c for c in text if not category(c).startswith('P')]) def tokenize(text): return depunc(text).split() class Text: def __init__(self,text): self.text = text self.letters = list(text) self.alphabet = sorted(set(self.letters)) self.words = tokenize(text) self.bigrams = bigrams(self.letters) self.trigrams = trigrams(self.letters) self.hits = self.letters2words() def letters2words(self): hits = defaultdict(set) for word in self.words: for letter in word: hits[letter].add(word) return hits def pattern(word): return tuple([word.index(letter) for letter in word]) def samepattern(a,b): return pattern(a) == pattern(b) """ and not tuple(sorted(set(pattern(a)))) == pattern(a) \ and not tuple(sorted(set(pattern(b)))) == pattern(b) \ """ if __name__ == "__main__": from urllib import urlopen import sys matches = [] lex = {} inlex = sys.argv[1] left2right = open(inlex).read().decode('utf-8') for line in left2right.splitlines(): line = line.strip() try: (b, left, right) = line.split('\t') if samepattern(left, right): matches.append(right) lex[right] = left except: continue right = Text(' '.join(matches)) print "