"""Rank UDHR translations by their type-token ratio.

Reads every ``udhr_<code>.txt`` file in the current directory, computes the
type-token ratio of its text, and prints one ``<score> <language>`` line per
file in ascending order of score.
"""
import io
from glob import glob

from ethnologue import code2lang  # maps iso-639-3 codes to language names

results = []


def type_token_ratio(text):
    """Return the number of distinct word forms divided by the word count.

    Words are whatever ``text.split()`` yields, i.e. maximal runs of
    non-whitespace characters.

    Args:
        text: The text to measure.

    Returns:
        A float in ``[0.0, 1.0]``; ``0.0`` when *text* contains no words, so
        that an empty file does not abort the run with a division by zero.
    """
    words = text.split()  # this doesn't work for languages without spaces
    if not words:
        return 0.0
    # float() keeps this a true division on Python 2 as well as Python 3.
    return float(len(set(words))) / len(words)


for filename in glob('udhr_*.txt'):
    # The glob pattern guarantees the prefix and suffix, so slice them off
    # rather than replacing every occurrence anywhere in the name.
    code = filename[len('udhr_'):-len('.txt')]
    try:
        language = code2lang[code]
    except (KeyError, ValueError):
        # A mapping lookup signals a missing key with KeyError; ValueError is
        # still caught for compatibility with the original handler.
        language = code
    # Decode explicitly instead of relying on the implicit ASCII codec, and
    # close the file deterministically.
    # NOTE(review): assumes the UDHR files are UTF-8 encoded -- confirm.
    with io.open(filename, encoding='utf-8') as handle:
        text = handle.read()
    results.append((type_token_ratio(text), language))

# Tuples sort by score first, then by language name on ties.
for score, language in sorted(results):
    # Single-argument print form behaves the same on Python 2 and Python 3.
    print('%s %s' % (score, language))