#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Rank target-text words against a query word using the ``align`` helpers.

Both texts are read when the module is loaded, split on whitespace, and every
distinct word is registered in an ``align.Collection``.  Run as a script with
one argument (the query word) to print the ten highest-scoring target words.

Every ``print`` call passes a single pre-formatted argument, which behaves the
same as a Python 2 print statement and as the Python 3 print function.
"""
import align

try:
    _text_type = unicode  # Python 2
except NameError:
    _text_type = str  # Python 3

# Corpus pairs this script has been pointed at before (source / target):
#   data/faq-fr.txt / data/faq-en.txt
#   data/gainsbourg-en.txt / data/gainsbourg-fr.txt
#   /home/pat/l/pt/doingbusinessinbrazil/brazilbiz-en.txt
#       / /home/pat/l/pt/doingbusinessinbrazil/brazilbiz-pt.txt
#   data/mis/en.txt / data/mis/fr.txt
#   data/notredame/en.txt / data/notredame/fr.txt
#   /home/pat/l/udhr/data/udhr_eng.txt / /home/pat/l/udhr/data/udhr_por.txt
sourcetext = 'data/x.txt'
targettext = 'data/y.txt'


def _read_text(path):
    """Return the contents of *path* decoded by ``align.UnicodeDammit``."""
    # Binary mode plus explicit newline folding replaces open(path, 'U'):
    # the 'U' flag no longer exists on Python 3.11+, and the context manager
    # guarantees the handle is closed.
    with open(path, 'rb') as handle:
        raw = handle.read()
    raw = raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
    return align.UnicodeDammit(raw).unicode


source = _read_text(sourcetext)
target = _read_text(targettext)

sourcetokens = source.split()
for w in sourcetokens:
    print(w)
targettokens = target.split()

sourcewords = set(sourcetokens)
targetwords = set(targettokens)

sourcedistributions = align.Collection()
targetdistributions = align.Collection()

# NOTE(review): the indices below come from enumerating the *sets* of distinct
# words, so each word is registered once with an arbitrary index rather than
# with the positions of its occurrences.  Enumerating sourcetokens /
# targettokens may have been intended -- confirm against align.Collection.
for i, w in enumerate(sourcewords):
    sourcedistributions[w] = i
    print(u"%s %s %s" % (i, w, sourcedistributions[w]))

for i, w in enumerate(targetwords):
    targetdistributions[w] = i


def normalize(seq):
    """Scale *seq* to integer percentages of its largest element.

    Returns a list with ``int(float(n) / max(seq) * 100)`` for every ``n``.

    Raises:
        ValueError: if the largest element is zero (nothing to scale by), or,
            from ``max`` itself, if *seq* is empty.
    """
    print(seq)
    maximum = max(seq)
    if maximum == 0:
        # The original raised a plain string here, which Python 2.6+ rejects
        # with a TypeError that hides the real problem.
        raise ValueError("doh! maximum is: %d" % maximum)
    return [int(float(n) / maximum * 100) for n in seq]


def hitfreq(word, distributions):
    """Return ``align.freq`` of the normalized entry stored for *word*."""
    return align.freq(normalize(distributions[word]))


def compare(a, b):
    """Score *a* (looked up in the target collection) against *b* (source).

    Returns the ``align.sim`` result when it is greater than zero, otherwise
    ``None``.
    """
    target_side = hitfreq(a, targetdistributions)
    source_side = hitfreq(b, sourcedistributions)
    answer = align.sim(target_side, source_side)
    if answer > 0:
        return answer
    return None


def _score_order(result):
    """Sort key that places ``None`` scores first, then ascending score."""
    # Python 2 already ordered None below every number; spelling it out keeps
    # that order and avoids a TypeError when Python 3 compares None to a float.
    return (result[0] is not None, result)


def findsims(sourceword):
    """Compare *sourceword* with every distinct target word.

    Returns a list of ``(score, sourceword, targetword)`` tuples sorted in
    ascending order, with ``None`` scores (no positive similarity) first.
    """
    sims = [
        (compare(targetword, sourceword), sourceword, targetword)
        for targetword in targetwords
    ]
    return sorted(sims, key=_score_order)


def top(n, dictionary):
    """Return the keys of *dictionary* whose value is among the *n* largest.

    Ties are all included, so more than *n* keys may be returned.  A
    non-positive *n* yields an empty list.
    """
    if n <= 0:
        # Without this guard the [-0:] slice would select every value.
        return []
    # The original sliced the set itself, which always raised TypeError
    # because sets are unordered and cannot be indexed.
    tops = set(sorted(set(dictionary.values()))[-n:])
    return [k for k in dictionary if dictionary[k] in tops]


def main(argv=None):
    """Print the ten highest-scoring target words for the query in *argv*."""
    import sys

    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.exit("usage: %s WORD" % argv[0])

    query = argv[1]
    if not isinstance(query, _text_type):
        # Python 2 hands over bytes; decode them explicitly instead of
        # relying on the implicit ASCII decoding that rejects accents.
        query = query.decode(sys.getfilesystemencoding() or 'utf-8')

    biz = findsims(query)
    for score, src, tgt in biz[-10:]:
        if score is None:
            # No positive similarity: nothing that "%f" could format.
            continue
        # A unicode literal keeps the marker from being ASCII-decoded when it
        # is combined with unicode words.
        print(u"\u00bb %f\t%s\t%s" % (score, src, tgt))
    print(u"out of " + _text_type(len(sourcedistributions[query])))


if __name__ == "__main__":
    main()