"""A tool to generate a suite of lessons to learn an alphabet.

Usage:
    python <this script> [LEXICON [LANGUAGE]]

LEXICON is a UTF-8 text file with one tab-separated pair per line; the
second column holds the words written in the alphabet being learned.
It defaults to ``en2ru.matches``; get en2ru.matches at
http://ruphus.com/svn/translit/matches/en2ru.matches

LANGUAGE defaults to ``russian``.  It is accepted on the command line but
is not used by the lesson generation yet.

Letters are ranked from most to least frequent across the lexicon.  Lesson
level ``i`` covers the ``i`` most frequent letters, and every word is filed
under the first level whose letters are enough to spell it.  For each
non-empty level the script prints the level's letters on one line and the
words that become readable at that level on the next.

Runs on Python 2.7 and Python 3.
"""

import io
import sys
from collections import defaultdict
from random import choice  # not used below; retained from the original script

try:  # not used below; retained from the original script
    from urllib import urlopen  # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3

DEFAULT_LEXICON = 'en2ru.matches'
DEFAULT_LANGUAGE = 'russian'


def parse_args(argv):
    """Return ``(lexicon, language)`` taken from an ``sys.argv``-style list.

    A missing or empty argument falls back to its default, so the script
    can be started without any arguments instead of raising IndexError.
    """
    lexicon = argv[1] if len(argv) > 1 and argv[1] else DEFAULT_LEXICON
    language = argv[2] if len(argv) > 2 and argv[2] else DEFAULT_LANGUAGE
    return lexicon, language


def parse_lexicon(lines):
    """Return the ``(source, target)`` pairs found in *lines*.

    Each line is stripped and split on tabs; lines that do not yield
    exactly two fields are ignored.
    """
    pairs = []
    for line in lines:
        fields = line.strip().split('\t')
        if len(fields) == 2:
            pairs.append(tuple(fields))
    return pairs


def load_lexicon(path):
    """Read the UTF-8 lexicon file at *path* and return its word pairs."""
    # The context manager closes the file even if decoding fails.
    with io.open(path, encoding='utf-8') as handle:
        return parse_lexicon(handle.read().splitlines())


def freq(sequence):
    """Return a ``defaultdict(int)`` mapping each element to its count."""
    fq = defaultdict(int)
    for element in sequence:
        fq[element] += 1
    return fq


def letters_by_frequency(words):
    """Return the distinct characters of *words*, most frequent first.

    Every character of every word is counted, including spaces or
    punctuation inside a word.  Ties are broken by the character itself so
    that the ranking, and therefore the lessons, are reproducible.
    """
    counts = freq(''.join(words))
    return sorted(counts, key=lambda letter: (-counts[letter], letter))


def build_pages(words):
    """Group *words* into lesson pages, easiest level first.

    Returns a list of ``(pattern, page_words)`` tuples.  ``pattern`` is the
    set of the ``i`` most frequent letters for level ``i``; ``page_words``
    lists the distinct words whose rarest letter is the ``i``-th one,
    sorted by length and then alphabetically.  Levels without any word are
    omitted, and empty words are skipped.
    """
    ranking = letters_by_frequency(words)
    # 1-based rank of each letter.  The level of a word is the rank of its
    # rarest letter, i.e. the smallest prefix of the ranking containing it.
    # Ranks run up to len(ranking), so the full alphabet is a valid level.
    rank = {letter: index for index, letter in enumerate(ranking, 1)}

    grouped = defaultdict(set)
    for word in words:
        if word:
            grouped[max(rank[letter] for letter in word)].add(word)

    return [
        (
            set(ranking[:level]),
            sorted(grouped[level], key=lambda word: (len(word), word)),
        )
        for level in sorted(grouped)
    ]


def format_pages(pages):
    """Return the output lines for *pages*: two lines per page.

    The first line is the page's letters in sorted order, the second is
    the page's words separated by single spaces.
    """
    lines = []
    for pattern, words in pages:
        lines.append(''.join(sorted(pattern)))
        lines.append(' '.join(words))
    return lines


def main(argv=None):
    """Generate the lessons for the lexicon named on the command line.

    *argv* defaults to ``sys.argv``.
    """
    lexicon, _language = parse_args(sys.argv if argv is None else argv)
    targets = [target for _source, target in load_lexicon(lexicon)]
    for line in format_pages(build_pages(targets)):
        print(line)


if __name__ == '__main__':
    main()