#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""List the words on stdin that begin with one of the text's most frequent characters.

The script (Python 2: it decodes the bytes read from stdin and wraps stdout
in a UTF-8 writer):

1. reads all of stdin as UTF-8, passes it through ``depunc`` and
   ``remove_accents_from_text`` and lower-cases the result;
2. ranks the distinct non-whitespace characters by how often they occur;
3. keeps roughly the top 5% of that ranking as the "seed" characters;
4. prints, in sorted order and one per line, every distinct word whose
   first character is a seed character.
"""

import codecs
import re
import sys

from textual import freq, byfreq, depunc
from accents import remove_accents_from_text

# Emit UTF-8 regardless of the terminal/locale encoding.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

text = sys.stdin.read().decode('utf-8')
# NOTE(review): depunc / remove_accents_from_text are project helpers;
# presumably they strip punctuation and diacritics -- confirm in their modules.
text = remove_accents_from_text(depunc(text))
text = text.lower()

words = text.split()
alphabet = set(text)
letters = list(text)


def invert(pairs):
    """Return a list with the two members of every pair swapped."""
    return [(b, a) for a, b in pairs]


def alphabet_by_freq(alphabet):
    """Return ``(symbol, count)`` pairs ordered from most to least frequent.

    ``alphabet`` is the sequence of symbols to count; it is handed to
    ``freq``, whose result must offer ``.items()`` yielding
    ``(symbol, count)`` pairs.  Ties on count are broken by symbol, in
    descending order.
    """
    return sorted(
        freq(alphabet).items(),
        key=lambda pair: (pair[1], pair[0]),
        reverse=True,
    )


# Whitespace is left out of the ranking: words come from text.split() and so
# never contain it, which means a whitespace seed could never match a word and
# would only take a slot away from a real letter.
freqbet = alphabet_by_freq([c for c in letters if not c.isspace()])


def percent(seq, percentage=10):
    """Return approximately the first ``percentage`` percent of ``seq``.

    The number of items kept is truncated towards zero, so a short ``seq``
    may yield an empty result.
    """
    n = int(float(percentage) / 100 * len(seq))
    return seq[:n]


seed = ''.join(k for k, v in percent(freqbet, percentage=5))

if seed:
    # re.escape keeps characters such as ']', '\\', '^' or '-' literal inside
    # the character class.
    seedRE = re.compile(u'^[' + re.escape(seed) + u']+', re.UNICODE)
else:
    # With fewer than 20 distinct characters the 5% cut is empty.  '^[]+' is
    # not a valid pattern, so use one that never matches: no seed, no output.
    seedRE = re.compile(u'(?!)', re.UNICODE)

for w in sorted(set(words)):
    if seedRE.match(w):
        # Parenthesised form prints the same under Python 2 and also parses
        # under Python 3.
        print(w)