#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Rank the words of a target text by positional similarity to a query word.

Every word of a text is described by the list of (bucketed) positions at
which it occurs.  Two words are compared with the cosine similarity of the
occurrence counts of those buckets.

Usage: SCRIPT SOURCEFILE TARGETFILE QUERY
"""

import io
import sys
from math import sqrt


def wordlist(text):
    """Return the whitespace-separated, lower-cased words of *text*."""
    return text.lower().split()


class Collection(dict):
    """A dict whose values are lists that grow on every assignment.

    ``c[key] = value`` appends *value* to the list stored under *key*
    (creating the list on first use) instead of replacing it.  Values passed
    to the constructor are each wrapped in a one-element list.
    """

    def __init__(self, *args, **kargs):
        dict.__init__(self, *args, **kargs)
        # Snapshot the items so the mapping is not re-assigned while a live
        # view of it is being iterated.
        for key, value in list(self.items()):
            dict.__setitem__(self, key, [value])

    def __setitem__(self, item, value):
        if item in self:
            self[item].append(value)
        else:
            dict.__setitem__(self, item, [value])


def normalize(n, maximum, k=100):
    """Scale *n* from the range ``0..maximum`` to ``1..k+1`` as a float.

    Raises ZeroDivisionError when *maximum* is zero.
    """
    return float(n) / maximum * k + 1


def distribution(words, k=100):
    """Map each word to the list of position buckets at which it occurs.

    The bucket of the word at index ``i`` is
    ``int(normalize(i, len(words), k))``.  Returns a :class:`Collection`;
    an empty *words* sequence yields an empty collection.
    """
    vec = Collection()
    numwords = len(words)
    for i, word in enumerate(words):
        # Collection.__setitem__ appends, so repeated words accumulate.
        vec[word] = int(normalize(i, numwords, k))
    return vec


def freq(seq):
    """Return a dict mapping each element of *seq* to its occurrence count."""
    fq = {}
    for e in seq:
        fq[e] = fq.get(e, 0) + 1
    return fq


def scalar(vec):
    """Return the Euclidean norm of the values of the mapping *vec*."""
    return sqrt(sum(vec[elem] * vec[elem] for elem in vec))


def sim(v, w):
    """Return the cosine similarity of the sparse vectors *v* and *w*.

    Both arguments are mappings from dimension to numeric weight.  When
    either vector is empty or has zero norm the similarity is defined as
    ``0.0`` instead of raising ZeroDivisionError.
    """
    total = sum(v[elem] * w[elem] for elem in v if elem in w)
    norm = scalar(v) * scalar(w)
    if norm == 0:
        return 0.0
    return float(total) / norm


def compare(query, source, target, k=100):
    """Compare *query*'s distribution in *source* to every word in *target*.

    *source* and *target* map words to lists of position buckets (as built
    by :func:`distribution`).  Returns a list of
    ``(similarity, query, word)`` tuples sorted in ascending order.

    *k* is accepted for backward compatibility but is not used.

    Raises KeyError when *query* is not a key of *source*.
    """
    # The query's bucket counts do not change between target words.
    queryfreq = freq(source[query])
    comparisons = [
        (sim(queryfreq, freq(target[word])), query, word)
        for word in target
    ]
    return sorted(comparisons)


def _read_text(path):
    """Return the content of the file at *path* decoded as UTF-8."""
    # io.open behaves the same on Python 2 and 3, translates newlines by
    # default and, used as a context manager, closes the file.  UTF-8 is a
    # superset of ASCII, so ASCII-only input still decodes unchanged.
    with io.open(path, encoding="utf-8") as handle:
        return handle.read()


def main(argv=None):
    """Run the command line tool and return a process exit status.

    *argv* is the argument list without the program name; it defaults to
    ``sys.argv[1:]``.  Prints the query followed by (at most) the ten most
    similar target words, least similar first.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 3:
        sys.stderr.write(
            "usage: %s SOURCEFILE TARGETFILE QUERY\n" % sys.argv[0]
        )
        return 2
    sourcefile, targetfile, q = args[0], args[1], args[2]

    sourcedist = distribution(wordlist(_read_text(sourcefile)))
    targetdist = distribution(wordlist(_read_text(targetfile)))

    # On Python 2 command line arguments are byte strings while the texts
    # are unicode; decode so that non-ASCII queries can match.
    query = q.decode("utf-8") if isinstance(q, bytes) else q
    # wordlist() lower-cases every word, so the lookup key must be too.
    query = query.lower()
    if query not in sourcedist:
        sys.stderr.write(
            "query word %r not found in %s\n" % (q, sourcefile)
        )
        return 1

    print(q)
    best = compare(query, sourcedist, targetdist, k=100)[-10:]
    print(" ".join(
        "%.2f %s" % (similarity, matched)
        for similarity, _, matched in best
    ))
    return 0


if __name__ == "__main__":
    sys.exit(main())