#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Count one translated word pair in the English/Portuguese texts.

When run as a script, reads ``brazilbiz-en.txt`` and ``brazilbiz-pt.txt``
from the current directory, collapses their whitespace, splits them into
whitespace-delimited tokens, and prints two lines: the number of tokens
exactly equal to ``commerce`` in the English text, then the number of
tokens exactly equal to ``comércio`` in the Portuguese text.
"""

import io
import re

EN_PATH = 'brazilbiz-en.txt'
PT_PATH = 'brazilbiz-pt.txt'

# The original decoded with the implicit ASCII codec, which fails on the
# first accented character.
# NOTE(review): assumes the text files are UTF-8, like this source file --
# confirm against the actual data.
TEXT_ENCODING = 'utf-8'

whitespaceRE = re.compile(r"\s+", re.UNICODE)


def squeeze(text):
    """Return *text* stripped, with each whitespace run replaced by one space.

    Args:
        text: The unicode string to normalize.

    Returns:
        The normalized string; empty if *text* is empty or all whitespace.
    """
    return whitespaceRE.sub(u' ', text.strip())


def read_words(path, encoding=TEXT_ENCODING):
    """Read the file at *path* and return its whitespace-delimited tokens.

    Args:
        path: Path of the text file to read.
        encoding: Codec used to decode the file's bytes.

    Returns:
        A list of unicode tokens, in file order.

    Raises:
        IOError/OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in *encoding*.
    """
    # io.open decodes explicitly on both Python 2 and 3; the with-block
    # closes the handle even if decoding raises.
    with io.open(path, encoding=encoding) as handle:
        return squeeze(handle.read()).split()


def count_word(words, target):
    """Return how many items of *words* are exactly equal to *target*.

    Matching is exact: case and attached punctuation are significant, so
    ``commerce,`` does not count as ``commerce``.
    """
    return sum(1 for word in words if word == target)


def main():
    """Print the counts of ``commerce`` and ``comércio``, one per line."""
    enwords = read_words(EN_PATH)
    ptwords = read_words(PT_PATH)

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(count_word(enwords, u'commerce'))
    print(count_word(ptwords, u'comércio'))


if __name__ == '__main__':
    main()