#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Compare words of two texts by where they occur in each text.

Every word is mapped to the list of (scaled) positions at which it occurs.
Two words are then compared with the cosine similarity of the frequency
counts of those positions.
"""

import codecs
import re
import sys
from math import sqrt
from string import punctuation, whitespace

sys.path.append('/home/pat/.pylib')

# NOTE(review): the writer created here is discarded, so this statement does
# not change how sys.stdout encodes output. It is kept unchanged because
# rebinding sys.stdout would alter the module's import-time side effects.
codecs.getwriter('utf-8')(sys.stdout)

try:
    from BeautifulSoup import UnicodeDammit
except ImportError:
    # No code visible in this file uses UnicodeDammit, so the helpers below
    # remain importable when the package is not installed.
    UnicodeDammit = None

# re.escape keeps characters such as "\", "]", "^" and "-" literal inside the
# character class; without it the "\]" pair in string.punctuation is read as
# an escaped "]" and the backslash itself is never matched.
_PUNCT_RE = re.compile('[' + re.escape(punctuation + whitespace) + ']',
                       re.UNICODE)


def depunctuate(text):
    """Return text with each punctuation/whitespace character replaced.

    Every character found in string.punctuation or string.whitespace is
    replaced by exactly one space; runs are not collapsed.
    """
    return _PUNCT_RE.sub(' ', text)


def wordlist(text):
    """Return the lower-cased, whitespace-separated words of text."""
    return text.lower().split()


class Collection(dict):
    """A dict whose values are lists that grow on item assignment.

    Values supplied to the constructor are each wrapped in a one-element
    list. Assigning ``c[key] = value`` appends value to the existing list
    for key, or starts a new list when key is absent.
    """

    def __init__(self, *args, **kargs):
        dict.__init__(self, *args, **kargs)
        # Snapshot the items so the dict is not iterated while being rewritten.
        for key, value in list(self.items()):
            dict.__setitem__(self, key, [value])

    def __setitem__(self, item, value):
        if item in self:
            self[item].append(value)
        else:
            dict.__setitem__(self, item, [value])


def normalize(n, maximum, k=100):
    """Scale n from the range 0..maximum onto 1..k+1 and return a float.

    Raises ZeroDivisionError when maximum is 0.
    """
    return float(n) / maximum * k + 1


def distribution(words, k=100):
    """Map each word to the list of scaled positions where it occurs.

    The position i of a word is converted with
    ``int(normalize(i, len(words), k))``, so the first word gets 1 and
    later words get proportionally larger integers.

    Returns a Collection of word -> list of ints (empty for empty input).
    """
    positions = Collection()
    total = len(words)
    for index, word in enumerate(words):
        positions[word] = int(normalize(index, total, k))
    return positions


def freq(seq):
    """Return a dict counting how often each element occurs in seq."""
    counts = {}
    for element in seq:
        counts[element] = counts.get(element, 0) + 1
    return counts


def scalar(vec):
    """Return the Euclidean norm of the values of the mapping vec."""
    return sqrt(sum(vec[key] * vec[key] for key in vec))


def sim(v, w):
    """Return the cosine similarity of two sparse vectors.

    v and w are mappings from keys to numbers; only keys present in both
    contribute to the dot product. When either vector has zero norm (for
    example, it is empty) the similarity is undefined and 0.0 is returned
    instead of raising ZeroDivisionError.
    """
    dot = sum(v[key] * w[key] for key in v if key in w)
    norm = scalar(v) * scalar(w)
    if norm == 0:
        return 0.0
    return float(dot) / norm


def compare(query, source, target, k=100):
    """Rank every word of target by similarity to query.

    source and target map words to lists of positions (as produced by
    distribution). The position list of query in source is compared with
    the position list of each target word via sim() over their frequency
    counts.

    Returns a list of (similarity, query, word) tuples sorted ascending, so
    the best matches are last. Raises KeyError when query is not in source.
    The k parameter is accepted but not used by this function.
    """
    query_counts = freq(source[query])
    return sorted(
        (sim(query_counts, freq(target[word])), query, word)
        for word in target
    )


if __name__ == "__main__":
    from unify import uwrite, uread, unify

    if len(sys.argv) < 3:
        sys.exit("usage: %s SOURCEFILE TARGETFILE" % sys.argv[0])
    sourcefile, targetfile = sys.argv[1], sys.argv[2]

    source = depunctuate(uread(sourcefile))
    target = depunctuate(uread(targetfile))

    sourcewords = wordlist(source)
    targetwords = wordlist(target)

    sourcedist = distribution(sourcewords)
    targetdist = distribution(targetwords)

    # Disabled HTML report generation, kept for reference as an inert string.
    # NOTE(review): several string literals below look truncated (markup
    # presumably lost) -- confirm against history before re-enabling.
    """
    out = open('out.html', 'w')
    header = uread('header')
    uwrite(header, filehandle=out)
    uwrite(u"\n", out)
    for i, q in enumerate(sorted(list(set(sourcewords)))):
        print "%d/%d\t%s" % (i, len(sourcewords), q)
        out.write("%s\n" % q)
        if len(sourcedist[q]) > 2:
            for similarity, query, matched, in sorted(compare(q, sourcedist, targetdist, k=100))[-5:]:
                out.write(u"%s\n" % (matched.encode('utf-8')),)
                print "\t%s" % matched
        out.write("\n")
    out.write(open('tailer').read())
    out.close()
    """