#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Word-frequency helpers for a local directory of UDHR text files.

Importing this module loads every ``../udhr/*.txt`` file (relative to the
current working directory) into ``docs`` and indexes the resulting
``RightsDoc`` objects by language code in ``udhr``.
"""
import codecs
import os
import random
import sys
import unicodedata
from collections import defaultdict
from glob import glob

# Python 2's stdout is a byte stream, so wrap it to emit UTF-8.  Python 3's
# stdout is already a text stream; wrapping it there would break printing.
if sys.version_info[0] < 3:
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

# Directory that relative file names passed to RightsDoc are resolved against.
UDHR = '/home/pat/repo/udhr'


def depunc(text):
    """Return *text* with every Unicode punctuation character removed.

    A character counts as punctuation when its ``unicodedata`` general
    category starts with ``'P'``.
    """
    return ''.join(
        c for c in text if not unicodedata.category(c).startswith('P')
    )


def freq(seq):
    """Return a ``defaultdict(int)`` mapping each element of *seq* to its count."""
    counts = defaultdict(int)
    for element in seq:
        counts[element] += 1
    return counts


def byfreq(fq):
    """Return ``(count, item)`` pairs from mapping *fq*, sorted ascending.

    Pairs are ordered by count first and by item second, so the most
    frequent item is last.
    """
    return sorted((count, item) for item, count in fq.items())


def random_udhr():
    """Return the name of a randomly chosen entry in the ``UDHR`` directory."""
    return random.choice(os.listdir(UDHR))


def most_frequent_word(words):
    """Return the most frequent word in *words*, or ``None`` if it is empty.

    Ties are broken in favour of the word that sorts last.
    """
    ranked = byfreq(freq(words))
    return ranked[-1][1] if ranked else None


def second_most_frequent_word(words):
    """Return the runner-up by frequency in *words*.

    Returns ``None`` when *words* holds fewer than two distinct words.
    """
    ranked = byfreq(freq(words))
    return ranked[-2][1] if len(ranked) >= 2 else None


def word_lengths(words):
    """Return a list of ``(length, word)`` pairs, one per entry of *words*."""
    return [(len(w), w) for w in words]


def words(text):
    """Strip punctuation from *text* and split it on whitespace."""
    return depunc(text).split()


class RightsDoc:
    """One UDHR text file together with simple word statistics.

    Attributes set by ``__init__``:
        filename: the name the document was created with.
        text: the decoded (UTF-8) file contents.
        letters: ``text`` as a list of characters.
        language: language name parsed from the first line of the file.
        code: language code parsed from the file name.
        words: punctuation-free, whitespace-split words of ``text``.
        wordfreq: mapping of word -> count.
        wordsbyfreq: ``(count, word)`` pairs sorted ascending.
        topword / secondword: the two most frequent words (``None`` when
            the text does not contain enough distinct words).
    """

    def __init__(self, filename):
        """Read *filename* (resolved against ``UDHR``) and compute statistics.

        An absolute *filename* is opened as-is, because ``os.path.join``
        discards ``UDHR`` when the second component is absolute.
        """
        self.filename = filename
        # Read bytes and decode explicitly: this behaves the same on
        # Python 2 and 3, and the ``with`` block closes the handle.
        with open(os.path.join(UDHR, filename), 'rb') as handle:
            self.text = handle.read().decode('utf-8')
        self.letters = list(self.text)
        self.language = self.get_language_name()
        self.code = self.get_language_code()
        self.words = words(self.text)
        self.wordfreq = freq(self.words)
        self.wordsbyfreq = byfreq(self.wordfreq)
        self.topword = most_frequent_word(self.words)
        self.secondword = second_most_frequent_word(self.words)

    def get_language_code(self):
        """Return the language code embedded in the file name.

        The code is everything after the first underscore of the base name
        with ``.txt`` removed (``udhr_eng.txt`` -> ``eng``).  Only the base
        name is inspected so underscores in directory names cannot leak
        into the code.
        """
        bits = os.path.basename(self.filename).split('_')[1:]
        return '_'.join(bits).replace('.txt', '')

    def get_language_name(self):
        """Return the text after the first ``' - '`` on the first line.

        Falls back to a fixed placeholder string when the file is empty or
        its first line contains no ``' - '`` separator.
        """
        try:
            return self.text.splitlines()[0].split(' - ')[1]
        except IndexError:
            return 'UNABLE TO DETERMINE LANGUAGE NAME'


docs = [RightsDoc(lang) for lang in sorted(glob('../udhr/*.txt'))]
udhr = dict((doc.code, doc) for doc in docs)

# Example usage:
#   print(udhr['huu'].words)
#   topwords = sorted(udhr[code].topword for code in udhr)
#   for w in topwords: print(w)