#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""udhr.py - build bigram models of all the files in the UDHR corpus.

The models are pickled to ``udhrbigrams.db`` by default (see ``save``,
``load`` and ``rebuild``).
"""

import os
import re
from glob import glob

try:
    import cPickle
except ImportError:
    # Interpreters without cPickle: fall back to pickle under the same name.
    import pickle as cPickle

from vecid import freq, uread, bigrams, modeltext

UDHRDATA = './udhr/udhr_*.txt'  # pattern for glob

# Kept for backward compatibility: list of matching files at import time.
udhr = glob(UDHRDATA)

# Marks the end of the header block at the top of each UDHR file.
_HEADER_DELIM_RE = re.compile(u'---\n\n', re.UNICODE)


def _langcode(fname):
    """Return the language code embedded in a ``udhr_<code>.txt`` path.

    Returns None when the file name holds more than one underscore;
    such files are skipped by ``modeludhr``.
    """
    base = os.path.basename(fname)
    if base.count('_') > 1:
        return None
    stem = os.path.splitext(base)[0]
    # Everything after the first underscore is the code, whatever its length.
    return stem.partition('_')[2].strip()


def modeludhr():
    """Build a model for every file matching ``UDHRDATA``.

    Returns a dict mapping each language code to the result of
    ``modeltext`` applied to that file's text with the header removed.
    """
    models = {}
    for fname in glob(UDHRDATA):
        code = _langcode(fname)
        if code is None:
            continue
        models[code] = modeltext(nixheader(fname))  # bigram model
    return models


def nixheader(fudhr):
    """Return the text of *fudhr* with everything up to ``---`` removed.

    The file is read with ``uread`` and split at the first occurrence of
    ``---`` followed by an empty line; later occurrences stay in the text.

    Raises ValueError when the delimiter is not found.
    """
    raw = uread(fudhr)
    parts = _HEADER_DELIM_RE.split(raw, maxsplit=1)
    if len(parts) != 2:
        raise ValueError("no '---' header delimiter found in %r" % (fudhr,))
    return parts[1]


def save(stuff, dbname="udhrbigrams.db"):
    """Pickle *stuff* into the file *dbname*, overwriting it."""
    # Binary mode keeps the pickle byte-exact on every platform.
    with open(dbname, 'wb') as out:
        cPickle.Pickler(out).dump(stuff)


def load(dbname="udhrbigrams.db"):
    """Return the object pickled in the file *dbname*.

    NOTE: unpickling can execute arbitrary code; only load files that
    were produced by ``save`` from a trusted source.
    """
    with open(dbname, 'rb') as infile:
        return cPickle.load(infile)


def rebuild(dbname='udhrbigrams.db'):
    """Rebuild all models with ``modeludhr`` and save them to *dbname*."""
    save(modeludhr(), dbname=dbname)


if __name__ == "__main__":
    rebuild()