#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Find letter-to-letter correspondences in a tab-separated bilingual lexicon.

Each lexicon line holds a left-hand entry and a right-hand entry separated by
a tab.  Pairs whose left side is plain ASCII letters, whose two sides share no
characters, and whose letter-repetition patterns agree are aligned character
by character; the resulting (left letter, right letter) pairs are counted and
the rare ones are discarded.
"""

import sys
import codecs
from collections import defaultdict
from string import whitespace, ascii_letters


def get_matches(lexicon):
    """Return the (left, right) lexicon entries usable for letter alignment.

    An entry is kept when, ignoring whitespace, every character of ``left``
    is an ASCII letter, ``left`` and ``right`` have no character in common,
    and both strings have the same repetition pattern (see ``same_pattern``).

    Entries that do not consist of exactly two fields (for example the result
    of splitting a blank line) are skipped instead of raising ``ValueError``.
    """
    blanks = set(whitespace)
    normal = set(ascii_letters + whitespace)
    matches = []
    for entry in lexicon:
        if len(entry) != 2:
            # Malformed line: nothing to align.
            continue
        left, right = entry
        leftset = set(left) - blanks
        rightset = set(right) - blanks
        if (leftset.issubset(normal)
                and not leftset.intersection(rightset)
                and same_pattern(left, right)):
            matches.append((left, right))
    return matches


def _first_indices(word):
    """Return, for each item of ``word``, the index of its first occurrence."""
    first_seen = {}
    # setdefault stores the position only the first time an item is met and
    # returns the stored position on every later occurrence.
    return [first_seen.setdefault(item, position)
            for position, item in enumerate(word)]


def pattern(word):
    """Return the first-occurrence indices of ``word`` concatenated as digits.

    For example ``pattern('abca')`` is ``'0120'``.  Because the indices are
    joined without a separator the string is ambiguous once an index reaches
    10; use ``same_pattern`` to compare two words reliably.
    """
    return ''.join([str(index) for index in _first_indices(word)])


def same_pattern(left, right):
    """Return True when ``left`` and ``right`` repeat items at the same places.

    The comparison is made on the lists of first-occurrence indices rather
    than on the ``pattern`` strings, so words of different lengths can never
    be reported as matching.
    """
    return _first_indices(left) == _first_indices(right)


def build_lexicon(filename):
    """Read a UTF-8 file and return one list of tab-separated fields per line.

    Each line is stripped of surrounding whitespace before being split on
    tabs.  The file is closed before the function returns.
    """
    with codecs.open(filename, encoding='utf-8', mode='r') as handle:
        return [line.strip().split('\t') for line in handle]


def zip_pair(left, right):
    """Return the list of (left item, right item) pairs, aligned by position."""
    return list(zip(left, right))


def get_letter_pairs(matches):
    """Return every aligned letter pair from all (left, right) matches."""
    zipped = []
    for left, right in matches:
        zipped.extend(zip_pair(left, right))
    return zipped


def count_letter_pairs(pairs, threshhold=5):
    """Count how often each pair occurs, keeping counts >= ``threshhold``.

    Prints the threshold in use, then returns a ``defaultdict(int)`` mapping
    each surviving pair to its number of occurrences.
    """
    print('threshhold is: ' + str(threshhold))
    freq = defaultdict(int)
    for pair in pairs:
        freq[pair] += 1
    freq = filter_flukes(freq, threshhold=threshhold)
    return freq


def filter_flukes(distribution, threshhold=2):
    """Remove entries whose count is below ``threshhold``.

    Uncommon letter pairs are treated as flukes.  ``distribution`` is
    modified in place and the same object is returned.
    """
    # Iterate over a snapshot: removing keys while iterating a live dict view
    # raises RuntimeError on Python 3.
    for key, count in list(distribution.items()):
        if count < threshhold:
            distribution.pop(key)
    return distribution


def _use_utf8_stdout():
    """Make ``sys.stdout`` emit UTF-8 regardless of the terminal's locale."""
    if sys.version_info[0] < 3:
        # Python 2: wrap the byte stream so unicode objects get encoded.
        sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
    elif hasattr(sys.stdout, 'reconfigure'):
        # Python 3.7+: stdout is already a text stream; only set its encoding.
        sys.stdout.reconfigure(encoding='utf-8')


def main():
    """Print every aligned letter pair, then the frequent pairs with counts."""
    _use_utf8_stdout()
    en2el = build_lexicon('corpora/en2el.txt')
    matches = get_matches(en2el)
    # Debug aid: list the matched entries themselves.
    # for i, (en, el) in enumerate(matches): print(u'%d %s %s' % (i, en, el))
    pairs = get_letter_pairs(matches)
    for i, (en, el) in enumerate(pairs):
        print(u'%d %s %s' % (i, en, el))
    for (fromletter, toletter), fq in count_letter_pairs(pairs).items():
        print(u'%d \t %s %s' % (fq, fromletter, toletter))


if __name__ == '__main__':
    main()