#!/usr/bin/env python
# coding: utf-8
"""Generate graded .html files for learning an alphabet.

The script reads tab-separated word pairs from ``en2ru.matches`` (presumably
English -> Russian, going by the file name), ranks the letters used by the
target-language words by frequency, and buckets those words into "levels"
keyed by the set of most frequent letters needed to spell them. The .html
files are written to the current directory.

Original author's note: it's a mess.
"""

import sys
import os

# These entries are appended before ``textual`` is imported below; presumably
# that module lives in one of these directories, so keep this order.
sys.path.append(os.environ['HOME'])
sys.path.append(os.environ['HOME'] + os.sep + '.pylib')

import codecs
from glob import glob
from collections import defaultdict
from textual import freq
from string import Template

# Read and decode the word list in one step, closing the file promptly.
with codecs.open('en2ru.matches', encoding='utf-8') as matches:
    raw_lines = matches.read().splitlines()

# Keep only well-formed "source<TAB>target" rows, lower-cased and trimmed.
lexicon = []
for raw_line in raw_lines:
    fields = raw_line.lower().strip().split('\t')
    if len(fields) == 2:
        lexicon.append(fields)

en2ru = dict(lexicon)
# Reverse mapping: target word -> source word.
ru2en = dict((ru, en) for en, ru in en2ru.items())

# Letter statistics over every target word; ``byfreq`` lists the letters from
# most to least frequent.
targetfreq = dict(freq(''.join(ru2en.keys())))
byfreq = sorted(targetfreq, key=lambda letter: targetfreq[letter],
                reverse=True)

# One candidate alphabet per level: the 1, 2, 3, ... most frequent letters,
# stored as sorted tuples so they can be used as dictionary keys.
# NOTE(review): range() stops one short of the letter count, so (assuming each
# key of ``targetfreq`` is a single character) no prefix contains every letter
# and words using the least frequent letter never get a level. Confirm whether
# that is intended before changing the bound.
prefixes = []
for i in range(1, len(''.join(byfreq))):
    prefixes.append(tuple(sorted(byfreq[:i])))

# Build each prefix's letter set once instead of once per word.
prefix_sets = [(prefix, set(prefix)) for prefix in prefixes]

# Assign every target word to the first (smallest) prefix that covers it.
levels = defaultdict(list)
for word in ru2en.keys():
    letters = set(word)
    for prefix, prefix_letters in prefix_sets:
        # Strict subset: a word whose letters equal the prefix exactly is
        # placed in a later level, not this one.
        if letters < prefix_letters:
            levels[prefix].append(word)
            break


def makepage(pattern, words):
    words = sorted(words, key=len)
    template = open('russian.html').read().decode('utf-8')
    # NOTE(review): the reviewed source is cut off inside the string literal
    # on the next line, so the remainder of this function is unknown. The
    # literal is closed only to keep the module parseable; restore the
    # original tail of makepage() here.
    content = u""