#!/usr/bin/env python """ Learn a new alphabet by deciphering a list of words in an efficient way. """ import re, sys sys.path.append('/home/user/.pylib/') from utils import freq import codecs from unicodedata import category sys.stdin = codecs.getreader('utf-8')(sys.stdin) sys.stdout = codecs.getwriter('utf-8')(sys.stdout) def regexify(letters): letters = u''.join([letter for letter in letters if category(letter)[0] not in (u'PZC')]) return re.compile('^[%s]+$' % letters) words = [word.split('\t')[1] for word in sys.stdin] text = ''.join(words) letterfq = dict(freq(list(text))) common = sorted(list(set(letterfq)), key=lambda letter: letterfq[letter], reverse=True) def find_matches(letters): return filter(lambda w: re.match(regexify(letters), w), words) for i in range(len(common)): print u' '.join(common[:i]) pattern = ''.join(common[:i]) if len(pattern)>0: for match in find_matches(pattern): print "\t" + match