#!/usr/bin/env python # -*- coding: utf-8 -*- from textual import ngrams from textual import freq import re def viz(s): return s.replace(' ', u'␣') blob = unicode(open('blob','U').read()) #blob = viz(blob) figrams = [] [figrams.extend(ngrams(blob, i)) for i in range(1,5)] fifq = freq(figrams) byfq = sorted([(v,k) for k,v in fifq.iteritems()]) byfq[-100:] wordRE = re.compile('^\w+$') for fq, gram in byfq[-1000:]: whitespaceRE = re.compile("\s+") if not whitespaceRE.match(gram): #print "%d\t%s" % (fq, gram) print gram