#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Language samples and their bigram histograms.

A ``Language`` reads one or more sample text files, concatenates them into
a single unicode string and derives a bigram histogram from that text.

NOTE(review): this module appears to target Python 2 (it imports
``UnicodeDammit`` from the ``BeautifulSoup`` 3 package and opens files in
``'U'`` mode) -- confirm before running it under another interpreter.
"""

import os

from BeautifulSoup import UnicodeDammit


class Corpus:
    """Container holding a collection of languages and an (initially empty) library."""

    def __init__(self, languages):
        # Stored as given; nothing in this file reads either attribute yet.
        self.languages = languages
        self.library = {}


class Language:
    """A language described by sample files, a code and a bigram histogram.

    Attributes:
        files: list of paths to the sample text files.
        code: language code used to look up the name in ``ethnologue``.
        name: display name (explicit ``name`` argument, else looked up).
        nativename: placeholder, always ``u'Not yet implemented'``.
        sampletext: concatenated unicode contents of ``files``.
        histogram: filtered bigram counts built by ``model()``.
    """

    def __init__(self, files, code, name=None):
        """Load the sample files and build the histogram.

        Args:
            files: iterable of paths to sample text files.
            code: language code; looked up in ``ethnologue.code2lang``.
            name: optional display name. When given it is used as-is;
                otherwise the name is resolved from ``code`` by ``getname()``.
        """
        # Copy so that feed() never appends to a list owned by the caller.
        self.files = list(files)
        self.code = code
        # Honour an explicit name; previously the argument was accepted but
        # always overwritten by the ethnologue lookup.
        self.name = name if name is not None else self.getname()
        self.nativename = u'Not yet implemented'
        self.sampletext = self.gettext()
        self.histogram = self.model()

    def getname(self):
        """Resolve the name from ``self.code``, store it in ``self.name`` and return it.

        Falls back to the code itself when ``ethnologue.code2lang`` has no
        entry for it.
        """
        # Imported lazily, as in the original code.
        from ethnologue import code2lang

        if self.code in code2lang:
            self.name = code2lang[self.code]
        else:
            self.name = self.code
        return self.name

    def feed(self, files):
        """Add more sample files, then rebuild ``sampletext`` and ``histogram``.

        Paths that are not existing regular files are skipped silently.
        """
        for path in files:
            if os.path.isfile(path):
                self.files.append(path)
        self.sampletext = self.gettext()
        self.histogram = self.model()

    def gettext(self):
        """Return the contents of every file in ``self.files`` joined into one string."""
        # join() avoids re-copying the accumulated text for every file.
        return u''.join(uread(path) for path in self.files)

    def model(self):
        """Build the bigram histogram of ``self.sampletext``.

        Calls ``textual.freq`` on ``textual.bigrams`` of the sample text, then
        removes every entry whose count equals 1 or whose key contains a space.

        Returns:
            The object returned by ``freq`` after filtering (presumably a
            dict-like mapping of bigram to count -- it must support
            ``items()`` and ``pop()``).
        """
        # Imported lazily, as in the original code.
        from textual import freq, bigrams

        histogram = freq(bigrams(self.sampletext))
        # Iterate over a snapshot: entries are removed from the mapping
        # inside the loop, which is unsafe on a live items() view.
        for bigram, count in list(histogram.items()):
            if count == 1 or ' ' in bigram:
                histogram.pop(bigram)
        return histogram

    def __unicode__(self):
        """Return ``"<name> [<code>]"``."""
        return "%s [%s]" % (self.name, self.code)


def uread(fname):
    """Read ``fname`` and return its contents decoded by ``UnicodeDammit``.

    The file is opened in universal-newline (``'U'``) mode and is closed
    before decoding, so no file handle is leaked.

    Returns:
        The ``unicode`` attribute of the ``UnicodeDammit`` result.
    """
    with open(fname, 'U') as handle:
        raw = handle.read()
    return UnicodeDammit(raw).unicode


if __name__ == "__main__":
    # Manual smoke test; expects the sample files below to exist relative to
    # the current working directory.
    en = Language(['udhr/udhr_eng.txt'], 'eng', name="English")
    print(en.files)
    print(en.code)
    print(en.name)
    print(en.nativename)
    print(en.sampletext)
    print(en.histogram)
    print(en.histogram['th'])
    en.feed(['samples/en.txt', 'samples/foo-en.txt'])
    print(en.histogram['th'])
    # NOTE(review): Language defines __unicode__ but not __str__, so on
    # Python 2 this prints the default instance representation.
    print(en)
    """
    hin = Language(['udhr/udhr_hin.txt'], 'hin', name="Hindi")
    print hin.files
    print hin.code
    print hin.name
    print hin.sampletext
    for k,v in hin.histogram.items():
        print k,v
    """