#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Build a bigram frequency table from text files and print the top entries.

Usage: pass one or more file paths on the command line.  Each file is decoded
to unicode, the texts are combined into a single model, and the most frequent
bigrams are written to standard output.
"""

from textual import freq, ngrams
from unicodedata import name
import codecs
import sys

# Python 2 needs an encoding writer around stdout so unicode can be printed
# to a pipe or file.  On Python 3 sys.stdout is already a text stream, and
# wrapping it in a byte-producing StreamWriter would make every write fail.
if sys.version_info[0] == 2:
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout)


def _describe_gram(gram):
    """Return the Unicode character names of the items in *gram*.

    Assumes *gram* is a sequence of single characters (a short string or a
    tuple of characters) -- TODO confirm against ``textual.ngrams``.  Items
    that have no Unicode name, or that are not a single unicode character,
    are reported as 'UNKNOWN.'.
    """
    labels = []
    for char in gram:
        try:
            labels.append(name(char))
        except (TypeError, ValueError):
            # ValueError: the character has no name.
            # TypeError: the item is not a single unicode character.
            labels.append('UNKNOWN.')
    return ' + '.join(labels)


class LanguageModel(object):
    """Bigram frequency model.

    ``model`` and ``byfreq`` are class attributes, so they are shared by
    every instance.
    """

    # Maps each bigram to the value reported for it by ``freq``.
    model = {}
    # (value, bigram) pairs in ascending order; rebuilt by ``sortdump``.
    byfreq = []

    def __init__(self, text):
        """Compute the bigrams of *text* and merge them into the shared model.

        NOTE(review): the merge uses ``dict.update``, so a bigram that is
        already in the model has its value replaced, not summed.  Build one
        instance from the combined text when totals are wanted.
        """
        self.text = text
        self.bigrams = ngrams(text, 2)
        LanguageModel.model.update(freq(self.bigrams))

    @staticmethod
    def load(f):
        """Read the file at path *f* and return its contents as unicode.

        The encoding is detected with BeautifulSoup's ``UnicodeDammit``.
        The text is returned rather than merged into the model, so the
        caller decides how to combine several files.

        Raises:
            ValueError: if no encoding could be detected for the file.
        """
        # Imported here so the class also works when this file is imported
        # as a module, not only when it is run as a script.
        from BeautifulSoup import UnicodeDammit

        with open(f, 'rb') as handle:
            raw = handle.read()
        text = UnicodeDammit(raw).unicode
        if text is None:
            raise ValueError("could not detect the encoding of %r" % (f,))
        return text

    def dump(self):
        """Print every model entry as ``value<TAB>bigram``, in dict order."""
        for gram, count in LanguageModel.model.items():
            print("%s\t%s" % (count, gram))

    def sortdump(self, limit=20):
        """Print the *limit* highest-valued bigrams, highest last.

        Rebuilds ``LanguageModel.byfreq`` from the current model and prints
        one line per entry: ``value<TAB>bigram<TAB>character names``.

        Args:
            limit: maximum number of entries to print; values below 1 print
                nothing.
        """
        ranked = sorted(
            (count, gram) for gram, count in LanguageModel.model.items()
        )
        # Replace the contents in place: repeated calls must not pile up
        # duplicates, and existing references to the list stay valid.
        LanguageModel.byfreq[:] = ranked

        # A slice of [-0:] would return the whole list, so guard it.
        top = LanguageModel.byfreq[-limit:] if limit > 0 else []
        for count, gram in top:
            print("%s\t%s\t%s" % (count, gram, _describe_gram(gram)))


def main(argv=None):
    """Build a model from the files named in *argv* and print the top bigrams.

    Args:
        argv: list of file paths; defaults to ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 on success, 2 when no files were given.
    """
    paths = sys.argv[1:] if argv is None else list(argv)
    if not paths:
        sys.stderr.write("usage: %s FILE [FILE ...]\n" % sys.argv[0])
        return 2

    texts = [LanguageModel.load(path) for path in paths]
    # One model over the combined text keeps the totals correct, because
    # merging separate models would overwrite values instead of adding them.
    # The newline separator stops the last character of one file from
    # pairing with the first character of the next.
    content = u"\n".join(texts)
    model = LanguageModel(content)
    model.sortdump()
    return 0


if __name__ == "__main__":
    sys.exit(main())