"""Word-frequency report for the text stored in ``haitian.txt``.

The script strips Unicode punctuation from the text, splits it on
whitespace, and prints three listings to stdout (encoded as UTF-8):

1. every distinct word with its count, in dictionary iteration order;
2. under the heading ``word frequencies: ``, every word token in text
   order, preceded by the total count of that word;
3. under the same heading, every distinct word with its count, sorted by
   ``(count, word)``.

The code runs unchanged on Python 2 and Python 3.  All I/O happens in
``main()``, so the helper functions can be imported without side effects.
"""
import codecs
import io
import sys
import unicodedata
from collections import defaultdict

# Input file, resolved relative to the current working directory.
CORPUS_PATH = 'haitian.txt'


def freq(seq):
    """Return a ``defaultdict(int)`` mapping each element of *seq* to the
    number of times it occurs.

    Elements must be hashable.  An empty *seq* yields an empty mapping.
    """
    counts = defaultdict(int)
    for element in seq:
        counts[element] += 1
    return counts


def occurrences(seq):
    """Return a ``defaultdict(list)`` mapping each element of *seq* to the
    ascending list of 0-based positions at which it appears.

    ``len(occurrences(seq)[x])`` is therefore the frequency of ``x``.
    """
    postings = defaultdict(list)
    for position, element in enumerate(seq):
        postings[element].append(position)
    return postings


def depunc(text):
    """Return *text* with every punctuation character removed.

    A character counts as punctuation when its Unicode general category
    starts with ``P`` (Pc, Pd, Ps, Pe, Pi, Pf, Po).  Symbols such as ``$``
    or ``+`` (categories ``S*``) and all whitespace are kept.  *text* must
    be a Unicode string.
    """
    letters = [c for c in text
               if not unicodedata.category(c).startswith('P')]
    return ''.join(letters)


def main():
    """Read ``CORPUS_PATH`` and print the three frequency listings."""
    # Emit UTF-8 regardless of the terminal/locale encoding.  On Python 3
    # the codec writer must wrap the underlying byte stream; on Python 2
    # sys.stdout has no ``buffer`` attribute and is itself a byte stream.
    sys.stdout = codecs.getwriter('utf-8')(
        getattr(sys.stdout, 'buffer', sys.stdout))

    # Read and decode the corpus exactly once, closing the handle promptly.
    with io.open(CORPUS_PATH, encoding='utf-8') as corpus:
        hc = corpus.read()

    words = depunc(hc).split()
    hcoc = occurrences(words)  # word -> list of token positions

    # Listing 1: distinct words in dictionary iteration order.
    for word, positions in hcoc.items():
        print("%d %s" % (len(positions), word))

    # Listing 2: one line per token, in text order.
    print("word frequencies: ")
    for word in words:
        print("%d %s" % (len(hcoc[word]), word))

    # Listing 3: distinct words sorted by (count, word).
    print("word frequencies: ")
    for count, word in sorted((len(v), k) for k, v in hcoc.items()):
        print("%d %s" % (count, word))


if __name__ == '__main__':
    main()