#!/usr/bin/env python # -*- coding: utf-8 -*- import align entext = '/home/pat/l/udhr/data/udhr_eng.txt' pttext = '/home/pat/l/udhr/data/udhr_por.txt' entext = 'data/faq-fr.txt' pttext = 'data/faq-en.txt' entext = 'data/gainsbourg-en.txt' pttext = 'data/gainsbourg-fr.txt' entext = 'data/mis/en.txt' pttext = 'data/mis/fr.txt' entext = 'data/notredame/en.txt' pttext = 'data/notredame/fr.txt' entext = "/home/pat/l/pt/doingbusinessinbrazil/brazilbiz-en.txt" pttext = "/home/pat/l/pt/doingbusinessinbrazil/brazilbiz-pt.txt" entext = 'data/a.txt' pttext = 'data/b.txt' endex = align.Collection() ptdex = align.Collection() en = align.UnicodeDammit(open(entext, 'U').read()).unicode pt = align.UnicodeDammit(open(pttext, 'U').read()).unicode entokens = en.split() pttokens = pt.split() enwords = set(entokens) ptwords = set(pttokens) for i,w in enumerate(entokens): endex[w] = i for i,w in enumerate(pttokens): ptdex[w] = i def normalize(seq): maximum = max(seq) return [ int(float(n)/maximum * 100) for n in seq] def hitfreq(word, langdex): return align.freq(normalize(langdex[word])) def compare(ptwd, enwd): ptvec = hitfreq(ptwd, ptdex) envec = hitfreq(enwd, endex) answer = align.sim(ptvec, envec) if answer > 0: return answer def findsims(enword): sims = [] for ptword in list(ptwords)[:8000]: res = (compare(ptword, enword), enword, ptword) sims.append(res) return sorted(sims) def top(n, dictionary): vals = set(dictionary.values()) tops = sorted(list(vals))[-n:] return [k for k in dictionary if dictionary[k] in tops] if __name__ == "__main__": import sys query = unicode(sys.argv[1]) biz = findsims(query) for x, e, p in biz[-10:]: print "ยป %f\t%s\t%s" % (x, e, p) print u"out of " + unicode(len(endex[query]))