#!/usr/bin/env python # -*- coding: utf-8 -*- from vector import scalar, sim from collections import defaultdict from glob import glob import cPickle import sys, codecs #sys.stdout = codecs.getwriter('utf-8')(sys.stdout) #LOGFILE = open('languagelog.txt', 'a') #LOGFILE = codecs.getwriter('utf-8')(LOGFILE) def load(dbname="/home/pat/.pylib/udhrbigrams.db"): return cPickle.load(open(dbname)) def freq(seq): fq = defaultdict(int) for elem in seq: fq[elem] += 1 return fq def ngrams(s,n): """ >>> ngrams('abcddlagk', 3) ['abc', 'bcd', 'cdd', 'ddl', 'dla', 'lag', 'agk'] """ return [(s[i:i+n]) for i in range(len(s)-n+1)] def bigrams(s): return ngrams(s, 2) def top(d, n): """ return all keys of dictionary d which have one of the top n values of all values in d >>> top(list('aaabbc',1)) 'a' """ #if not hasattr(d, 'values'): d = freq(d) topvals = sorted(set(d.values()))[-n:] return [e for e in d if d[e] in topvals] def uread(fname): """ read file, return unicode text, we hope """ try: from BeautifulSoup import UnicodeDammit return UnicodeDammit(open(fname, 'U').read()).unicode except ImportError: return unicode(open(fname, 'U').read()) def ethnologue(): from LanguageCodes import languages codes = languages.splitlines() codes = [unicode(line.strip()) for line in codes] code2lang = {} for entry in codes: code, country, huh, language = entry.split('\t') code = code.strip() code2lang[code] = language.strip() return code2lang code2lang = ethnologue() def modeltext(text): fq = freq(bigrams(text)) for k, v in fq.items(): if v == 1 or ' ' in k: baleeted = fq.pop(k) return fq def id(unknown, threshhold=1): UNKNOWN = True allmodels = load() unknownmodel = modeltext(unknown) sims = {} for code, known in allmodels.items(): sims[code] = sim(known, unknownmodel) res = [] #LOGFILE.write("==========" + '\n\n') #LOGFILE.write(unknown.replace('\n', '') + '\n\n') for score, code in sorted([(v,k) for k,v in sims.items()]): if code in code2lang: lgname = code2lang[code] else: lgname = code res.append(( score, lgname )) if score > 0.4: #LOGFILE.write( "%.2f\t%s\n" % (score, lgname) ) UNKNOWN = False else: UNKNOWN = True #if UNKNOWN: LOGFILE.write( "Unknown.\n" ) return res def _test(): pass if __name__ == "__main__": import sys if len(sys.argv) < 2: _test() # TODO fix this res = id(uread(sys.argv[1])) res = dict(res) for score, code in sorted([(k,v) for k,v in res.items()]): if code in code2lang: lgname = code2lang[code] else: lgname = code #print "%.2f" % score + '\t' + lgname print score, print lgname #else: #for answer in id(uread(sys.argv[1]), threshhold=3): #print code2lang[answer], answer