#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This is the beginnings of a more readable and functional version of
autotranslit.py.

Pipeline (the numbers match the step markers on the functions below):
read a tab-separated lexicon of word pairs, keep the pairs whose letters
repeat in the same positions, zip those pairs letter by letter, count the
resulting letter pairs and group the counts by source letter.
"""

import codecs
import io
import sys
from collections import defaultdict

# Python 2 only: make ``print`` of unicode text emit UTF-8 bytes.  Python 3's
# text-mode stdout already encodes, and wrapping it there would break writes.
if sys.version_info[0] < 3:
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout)


def build_lexicon_from_file(filename, min_length=4, max_length=9):  # 1
    """Read a tab-separated lexicon file and do some sanity checking.

    Each useful line holds exactly two tab-separated fields:
    ``sourceword<TAB>targetword``.  A line is dropped when it does not
    split into exactly two fields, when both words are identical, or when
    either word is shorter than ``min_length`` or longer than
    ``max_length`` characters.

    filename   -- path of a UTF-8 encoded lexicon file
    min_length -- shortest word length kept (inclusive)
    max_length -- longest word length kept (inclusive)

    returns: list of tuples with (sourceword, targetword)

    @@TODO: generalize this to handle messier input.
    """
    # io.open decodes while reading on both Python 2 and 3, and the context
    # manager closes the handle even if decoding fails part-way through.
    with io.open(filename, encoding='utf-8') as handle:
        lines = handle.read().splitlines()

    pairs = []
    for line in lines:
        fields = line.strip().split('\t')
        if len(fields) != 2:
            continue
        source, target = fields
        if source == target:
            continue
        if not min_length <= len(source) <= max_length:
            continue
        if not min_length <= len(target) <= max_length:
            continue
        pairs.append((source, target))
    return pairs


def build_alphabet(text):  # 2
    """Return the distinct characters of ``text`` as a sorted list."""
    return sorted(set(text))


def build_alphabets(lexicon):  # 2
    """Collect source and target alphabets from all terms.

    lexicon -- iterable of (sourceword, targetword) pairs

    returns: two sorted lists of single-character strings,
    (sourcealphabet, targetalphabet).
    """
    lexicon = list(lexicon)  # iterated twice below
    sourcewords = ''.join(source for source, _ in lexicon)
    targetwords = ''.join(target for _, target in lexicon)
    return build_alphabet(sourcewords), build_alphabet(targetwords)


def _first_positions(word):
    """Return, for each letter of ``word``, the index of its first occurrence."""
    first_seen = {}
    return tuple(first_seen.setdefault(letter, position)
                 for position, letter in enumerate(word))


def patternize(word):
    """Convert a word into a numeric representation.

    Every letter is replaced by the index of its first occurrence in the
    word and the indices are concatenated, e.g. ``hello`` -> ``01224``.

    NOTE: the digits are joined without a separator, so the result is only
    unambiguous for words of at most ten letters.

    returns: a string of digits.
    """
    return ''.join(str(position) for position in _first_positions(word))


def collect_word_matches(lexicon):  # 3
    """Collect word pairs which have the same letter pattern.

    Two words match when their letters repeat in the same positions.
    Pairs whose two words are identical are skipped.

    returns: list of (sourceword, targetword) tuples.
    """
    # Compare the index tuples rather than the patternize() strings: the
    # concatenated digits collide once an index reaches 10 (indices 1, 0 and
    # the single index 10 both render as "10").
    return [(sourceword, targetword)
            for sourceword, targetword in lexicon
            if sourceword != targetword
            and _first_positions(sourceword) == _first_positions(targetword)]


def zip_up_words(sourceword, targetword):
    """Pair up the letters of two words position by position.

    Example: zip_up_words(u"Mir", u"Мир") gives
    [(u"M", u"М"), (u"i", u"и"), (u"r", u"р")]

    returns: list of (sourceletter, targetletter) tuples; surplus letters
    of the longer word are dropped.
    """
    # list() keeps the documented list result on Python 3, where zip is lazy.
    return list(zip(sourceword, targetword))


def collect_letter_matches(matchedwords):  # 4
    """Zip up all the word matches and collect letter matches into a big list.

    matchedwords -- iterable of (sourceword, targetword) pairs

    returns: flat list of (sourceletter, targetletter) tuples, in input order.
    """
    return [letterpair
            for sourceword, targetword in matchedwords
            for letterpair in zip(sourceword, targetword)]


def frequencies(sequence):
    """Count how often each element occurs in ``sequence``.

    returns: defaultdict(int) mapping element -> number of occurrences.
    """
    freq = defaultdict(int)
    for element in sequence:
        freq[element] += 1
    return freq


def collect_matched_letter_frequencies(matchedletters):  # 5
    """Count the occurrences of every (sourceletter, targetletter) pair."""
    return frequencies(matchedletters)


def build_model(letter_pair_frequencies):
    """Group letter-pair frequencies by source letter.

    letter_pair_frequencies -- mapping of (sourceletter, targetletter) ->
        frequency, as returned by collect_matched_letter_frequencies(), or
        an iterable of ((sourceletter, targetletter), frequency) pairs

    returns: defaultdict(list) mapping each source letter to a list of
    (targetletter, frequency) tuples.
    """
    # Iterating a mapping directly yields only its keys, so ask for the
    # items when they are available.
    if hasattr(letter_pair_frequencies, 'items'):
        entries = letter_pair_frequencies.items()
    else:
        entries = letter_pair_frequencies

    model = defaultdict(list)
    for (sourceletter, targetletter), frequency in entries:
        model[sourceletter].append((targetletter, frequency))
    return model


def check_alphabet_against_model(alphabet, model):  # 5, 7
    """Not implemented yet; currently does nothing and returns None."""
    pass


def main(argv=None):
    """Build the letter model for the lexicon file named on the command line.

    argv -- argument list including the program name; defaults to sys.argv

    returns: process exit status (0 on success, 2 on a usage error).
    """
    from pprint import pprint

    argv = sys.argv if argv is None else argv
    if len(argv) != 2:
        sys.stderr.write("usage: %s LEXICON_FILE\n" % argv[0])
        return 2

    lexicon = build_lexicon_from_file(argv[1])
    wordmatches = collect_word_matches(lexicon)
    lettermatches = collect_letter_matches(wordmatches)
    pairmodel = collect_matched_letter_frequencies(lettermatches)
    model = build_model(pairmodel)
    pprint(model)
    return 0


if __name__ == "__main__":
    sys.exit(main())


"""
def build_lexicon_from_file(filename): #1
def build_alphabet(text): # 2
def build_alphabets(lexicon): # 2
def patternize(word):
def collect_word_matches(lexicon): # 3
def zip_up_words(sourceword, targetword):
def collect_letter_matches(matchedwords): # 4
def frequencies(sequence):
    freq = defaultdict(int)
def matched_letter_frequency_model(matchedletters): # 5
def check_alphabet_against_model(alphabet, model): # 5, 7
def build_model(alphabet, matches):
"""