#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Small helpers for counting items and building n-grams over sequences."""

from collections import defaultdict
from operator import itemgetter
import sys
import re
from string import punctuation, whitespace

# Character class matching any single ASCII punctuation or whitespace
# character.  The characters are passed through re.escape because
# string.punctuation contains regex metacharacters (backslash, ']', '^',
# '-', '[') that would otherwise be misread inside the brackets.
_PUNC_OR_SPACE = re.compile('[' + re.escape(punctuation + whitespace) + ']')


def freq(seq):
    """Count how many times each element occurs in *seq*.

    Elements must be hashable.  Returns a ``defaultdict(int)`` mapping each
    element to its number of occurrences.
    """
    counts = defaultdict(int)
    for element in seq:
        counts[element] += 1
    return counts


def ngrams(seq, n):
    """Return every contiguous length-*n* slice of *seq*, in order.

    *seq* must support ``len()`` and slicing; each n-gram has whatever type
    slicing *seq* produces.  The result is an empty list when *n* is larger
    than ``len(seq)``.
    """
    return [seq[i:i + n] for i in range(len(seq) - n + 1)]


def bigrams(seq):
    """Return every contiguous pair of *seq* (n-grams with n == 2)."""
    return ngrams(seq, 2)


def tuplize(lists):
    """Convert each item of *lists* to a tuple and return them as a list."""
    return [tuple(li) for li in lists]


def byfreq(seq):
    """Return ``(element, count)`` pairs for *seq*, least frequent first.

    Sorting is stable, so elements with equal counts keep the order in which
    the frequency table yields them.
    """
    fq = freq(seq).items()
    return sorted(fq, key=itemgetter(1))


def tokenize(text):
    """Naive tokenizer: split *text* on runs of whitespace."""
    return text.split()


def depunc(text):
    """Replace each punctuation or whitespace character with one space.

    The substitution is one-for-one, so the result has the same length as
    *text*; runs of punctuation become runs of spaces.
    """
    return _PUNC_OR_SPACE.sub(' ', text)