#!/usr/bin/env python # -*- coding: utf-8 -*- import sys from genscheme import pattern, freq from operator import itemgetter from pprint import pprint """ Examples for introduction to transliteration code. """ # open the source file def build_lexicon(a2b): lexicon = open(a2b).read().decode('utf-8').splitlines() return [line.strip() for line in lexicon] def collect_matches(lexicon): matches = [] for line in lexicon: if len(line.split('\t')) == 2: a, b = line.split('\t') a, b = a.lower().strip(), b.lower().strip() if pattern(a) == pattern(b) and a != b: matches.append((a,b)) return matches def collect_pairs(matches): pairs = [] for a,b in matches: for x,y in zip(a,b): pairs.append((x,y)) return pairs def display_matches(matches): for i, (a,b) in enumerate(sorted(matches)): #if i % 5 == 0: print i, a,b print a,'\n', b, '\n' def count_pairs(pairs): return freq(pairs).items() def display_pairs(pair_freq): byfreq = sorted(count_pairs(pairs), key=itemgetter(1), reverse=True) for (x,y), fq in byfreq: if fq> 1: print x, y, '\t', fq if __name__ == "__main__": a2b = 'corpora/' + sys.argv[1] + '.txt' lexicon = build_lexicon(a2b) matches = collect_matches(lexicon) display_matches(matches) pairs = collect_pairs(matches) #display_pairs(count_pairs(pairs))