#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Toy positional word-similarity experiment.

Every word of a text is mapped to the list of relative positions (scaled to
the range ``0..k``) at which it occurs.  Two words are then compared through
the cosine similarity of the frequency vectors built from their position
lists.

The module runs unchanged on Python 2 and Python 3.
"""

from collections import Counter
from math import sqrt

pease = """Pease-porridge hot,
Pease-porridge cold,
Pease-porridge in the pot,
Nine days old;
Some like it hot,
Some like it cold,
Some like it in the pot,
Nine days old.
"""

mary = """Mary had a little lamb,
little lamb, little lamb,
Mary had a little lamb,
its fleece was white as snow.
And everywhere that Mary went,
Mary went, Mary went,
and everywhere that Mary went,
the lamb was sure to go.
"""


def wordlist(text):
    """Return the lower-cased, whitespace-separated tokens of *text*.

    Punctuation is not stripped, so ``"hot,"`` and ``"hot"`` are distinct
    tokens.
    """
    return text.lower().split()


class Collection(dict):
    """A dict whose values are lists and whose item assignment appends.

    ``c[key] = value`` appends *value* to the list stored under *key*,
    creating that list on first use.  Values supplied to the constructor are
    each wrapped in a one-element list.
    """

    def __init__(self, *args, **kargs):
        dict.__init__(self, *args, **kargs)
        # Iterate over a snapshot: the values are rewritten inside the loop.
        for key, value in list(self.items()):
            dict.__setitem__(self, key, [value])

    def __setitem__(self, item, value):
        # dict.setdefault bypasses this override, so there is no recursion.
        dict.setdefault(self, item, []).append(value)


def normalize(n, maximum, k=100):
    """Scale *n* from the range ``0..maximum`` to ``0..k`` as a float.

    Raises:
        ZeroDivisionError: if *maximum* is zero.
    """
    return float(n) / maximum * k


def distribution(words, k=100):
    """Map each word to the list of its scaled positions in *words*.

    The position of the i-th word is ``int(normalize(i, len(words), k))``.

    Args:
        words: sequence of tokens (must support ``len``).
        k: upper bound of the scaled position range.

    Returns:
        A ``Collection`` mapping word -> list of int positions, in order of
        occurrence.
    """
    vec = Collection()
    numwords = len(words)
    for index, word in enumerate(words):
        vec[word] = int(normalize(index, numwords, k))
    return vec


peasewords = wordlist(pease)
marywords = wordlist(mary)

peasedist = distribution(peasewords)
marydist = distribution(marywords)


def freq(seq):
    """Return a plain dict mapping each element of *seq* to its count."""
    return dict(Counter(seq))


def scalar(vec):
    """Return the Euclidean norm of the values of the mapping *vec*."""
    return sqrt(sum(value * value for value in vec.values()))


def sim(v, w):
    """Return the cosine similarity of two sparse vectors.

    Args:
        v, w: mappings from dimension key to numeric weight.

    Returns:
        The dot product over the shared keys divided by the product of the
        two norms, as a float.  When either vector has zero magnitude (for
        example an empty mapping) the similarity is undefined and ``0.0`` is
        returned instead of raising ``ZeroDivisionError``.
    """
    magnitude = scalar(v) * scalar(w)
    if not magnitude:
        return 0.0
    dot = sum(v[key] * w[key] for key in v if key in w)
    return float(dot) / magnitude


def compare(query, source, target, k=100):
    """Rank every word of *target* by positional similarity to *query*.

    Args:
        query: word to look up in *source*.
        source: mapping word -> list of positions (see ``distribution``).
        target: mapping word -> list of positions to compare against.
        k: unused; kept so existing callers that pass it keep working.

    Returns:
        A list of ``(similarity, query, word)`` tuples sorted in ascending
        order.

    Raises:
        KeyError: if *query* is not a key of *source*.
    """
    # The query's frequency vector is the same for every target word.
    queryfreq = freq(source[query])
    return sorted(
        (sim(queryfreq, freq(positions)), query, word)
        for word, positions in target.items()
    )


def main():
    """Print the ranking of every word of ``mary`` against 'it' in ``pease``."""
    for score, query, word in compare('it', peasedist, marydist):
        # One pre-formatted argument keeps the output identical on Python 2
        # and Python 3.
        print("%s %s %s" % (score, query, word))


if __name__ == "__main__":
    main()


# NOTE(review): disabled draft of a file-based command-line driver, kept for
# reference.  It targets Python 2 (``unicode``) and calls ``hitvector``,
# which is not defined in the visible code -- presumably an earlier name for
# ``distribution``; confirm before reviving.
#
# if __name__ == "__main__":
#     import sys
#     en = unicode(open('brazilbiz-en.txt', 'U').read())
#     pt = unicode(open('brazilbiz-pt.txt', 'U').read())
#     enwords = wordlist(en)
#     ptwords = wordlist(pt)
#     envec = hitvector(enwords)
#     ptvec = hitvector(ptwords)
#     q = unicode(sys.argv[1])
#     for a in compare(q, envec, ptvec, k=100):
#         print a