#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Small text and sparse-vector helpers.

Provides punctuation stripping, frequency counting, scaling of numbers to
the 0-100 range, and the cosine similarity of two vectors stored as
mappings from key to numeric weight.
"""

from math import sqrt
from string import punctuation

# Neither of the two names below is referenced in this module.  The imports
# are kept in case other code expects to find the names here, but a missing
# package no longer prevents the pure helpers below from being imported.
try:
    from collection import Collection
except ImportError:
    Collection = None

try:
    from BeautifulSoup import UnicodeDammit
except ImportError:
    UnicodeDammit = None

# Set form of string.punctuation for constant-time membership tests.
_PUNCTUATION = frozenset(punctuation)


def depunctuate(text):
    """Return *text* with every character of ``string.punctuation`` removed.

    Whitespace and all other characters are left untouched.

    >>> depunctuate('Hello, world!')
    'Hello world'
    """
    # One pass over the text instead of one full replace() per punctuation
    # character.
    return ''.join(ch for ch in text if ch not in _PUNCTUATION)


def freq(seq):
    """Count how many times each element occurs in *seq*.

    Returns a plain dict mapping element -> number of occurrences.  The
    elements must be hashable.

    >>> freq('aab') == {'a': 2, 'b': 1}
    True
    """
    counts = {}
    for item in seq:
        counts[item] = counts.get(item, 0) + 1
    return counts


def normalize(seq):
    """Scale the numbers in *seq* so that the largest one becomes 100.

    Each value is divided by the maximum, multiplied by 100 and truncated
    with ``int()``.  An empty input gives an empty list.  When the maximum
    is 0 the ratio is undefined, so a list of zeros of the same length is
    returned instead of raising ZeroDivisionError.

    >>> normalize([1, 2, 4])
    [25, 50, 100]
    >>> normalize([])
    []
    """
    values = list(seq)  # allow any iterable; it is traversed twice below
    if not values:
        return []
    # The original assigned to ``self.maximum`` inside a plain function,
    # which raised NameError on every call.
    maximum = max(values)
    if maximum == 0:
        return [0 for _ in values]
    return [int(float(n) / maximum * 100) for n in values]


def scalar(vec):
    """Return the Euclidean length of *vec*.

    *vec* is iterated for its keys and indexed with each key, so it is
    expected to be a mapping from key to numeric weight.

    >>> scalar({'a': 3, 'b': 4})
    5.0
    """
    return sqrt(sum(vec[key] * vec[key] for key in vec))


def sim(v, w):
    """Return the cosine similarity of the vectors *v* and *w*.

    Both arguments are mappings from key to numeric weight; only keys
    present in both contribute to the dot product.  If either vector has
    zero length (empty, or all weights zero) the similarity is defined
    here as 0.0 rather than raising ZeroDivisionError.

    >>> sim({'a': 1}, {'b': 1})
    0.0
    >>> sim({}, {'a': 1})
    0.0
    """
    dot = sum(v[key] * w[key] for key in v if key in w)
    denominator = scalar(v) * scalar(w)
    if not denominator:
        return 0.0
    return float(dot) / denominator


def _test():
    """Run the doctests of the module that is executed as ``__main__``."""
    import doctest
    # NOTE(review): ``align`` is imported but never used in this function;
    # it is kept in case the import is needed for its side effects - confirm.
    import align
    myoptionflags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
    doctest.testmod(optionflags=myoptionflags)


if __name__ == "__main__":
    _test()