"""Compare where shared words occur in an English and a Spanish text.

Cleaned-up version of an interactive session.  The script reads
``en.txt`` and ``es.txt`` (UTF-8), tokenizes both, finds the words that
occur in both texts and prints three reports:

1. for every English occurrence of a shared word: its English token
   index, the index of the word's first Spanish occurrence, and the word;
2. the same two positions expressed as a percentage of each text's length;
3. for every shared word (once, in order of first English appearance):
   the word followed by all of its English and all of its Spanish indices.

Superseded or broken attempts from the session (a misplaced parenthesis
in the percentage print, ``spread`` applied to the raw string instead of
the word list, a ``seen`` loop that could never print) are not reproduced.
"""

import io
import re
from collections import defaultdict
from string import punctuation

# Compiled once at import time.  ``punctuation`` is escaped because it
# contains ``\``, ``]``, ``^`` and ``-``, which are special inside a
# character class; unescaped, the backslash silently escaped the ``]`` and
# was itself never treated as a delimiter.  The trailing ``+`` collapses
# runs of delimiters (e.g. ", ") into a single split point.
_DELIMITERS = re.compile('[' + re.escape(punctuation) + r'\s]+')


def tokenize(text):
    """Split *text* into a list of words.

    Words are the maximal runs of characters that are neither whitespace
    nor ASCII punctuation.  Empty strings (produced by leading/trailing
    delimiters) are dropped so that they are never counted as a word.

    Returns a list of strings in the order they appear in *text*; an empty
    or delimiter-only *text* yields ``[]``.
    """
    return [token for token in _DELIMITERS.split(text) if token]


def wordset(text):
    """Return the set of distinct words in *text* (see ``tokenize``)."""
    return set(tokenize(text))


def spread(wordlist):
    """Map each word to the list of indices at which it occurs.

    *wordlist* is a sequence of words (e.g. the result of ``tokenize``).
    Returns a ``defaultdict(list)`` whose index lists are in ascending
    order.  Because it is a defaultdict, looking up a word that is absent
    yields (and stores) an empty list.
    """
    positions = defaultdict(list)
    for index, word in enumerate(wordlist):
        positions[word].append(index)
    return positions


def _read_text(path):
    """Read the file at *path* as UTF-8 text, closing it afterwards."""
    with io.open(path, encoding='utf-8') as handle:
        return handle.read()


def main(en_path='en.txt', es_path='es.txt'):
    """Print the three shared-word position reports for the two files."""
    # Tokenize each text exactly once; the session re-tokenized both texts
    # on every loop iteration.
    enwords = tokenize(_read_text(en_path))
    eswords = tokenize(_read_text(es_path))
    enspread = spread(enwords)
    esspread = spread(eswords)
    # Shared vocabulary, computed once rather than per iteration.
    common = set(enwords) & set(eswords)

    # Report 1: absolute positions.  ``esspread[w][0]`` is the first
    # Spanish occurrence, i.e. what ``eswords.index(w)`` would return.
    for i, w in enumerate(enwords):
        if w in common:
            print('%d %d %s' % (i, esspread[w][0], w))

    # Report 2: the same positions as a percentage of each text's length.
    # No division by zero is possible: the loop body only runs when both
    # word lists are non-empty.
    for i, w in enumerate(enwords):
        if w in common:
            en_pct = float(i) / len(enwords) * 100
            es_pct = float(esspread[w][0]) / len(eswords) * 100
            print('%s %s %s' % (en_pct, es_pct, w))

    # Report 3: every occurrence of each shared word, each word reported
    # once, in order of first appearance in the English text.
    seen = set()
    for w in enwords:
        if w in common and w not in seen:
            seen.add(w)
            print(w)
            print(enspread[w])
            print(esspread[w])


if __name__ == '__main__':
    main()