"""Guess affixes by fuzzy-matching the vocabularies of two texts.

Each text is reduced to a sorted list of unique words; words from one
list are paired with close matches from the other (via ``difflib``), and
every group of at least two similar words is printed.

The code is written to run unchanged on Python 2 and Python 3.
"""
import doctest
import re
import sys
from difflib import get_close_matches

# Compiled once at import time; UNICODE makes the word class cover non-ASCII
# letters on Python 2 as well (it is already the default for str on Python 3).
_WORD_RE = re.compile(r'\w+', re.UNICODE)

try:
    _STRING_TYPES = (basestring,)  # Python 2: covers both str and unicode
except NameError:
    _STRING_TYPES = (str,)  # Python 3


def stem(short, long):
    """
    Stemming for minimalists.

    Split the longer of the two words on the shorter one and return the
    non-empty pieces (the affixes).  The arguments are swapped when ``long``
    is actually the shorter word.  Returns an empty list when the stem is
    empty or does not occur in the longer word.

    >>> stem('foo', 'fooed')
    ['ed']
    >>> stem('foo', 'qux')
    []
    >>> stem('foo', 'befooulated')
    ['be', 'ulated']
    >>> stem('fooed', 'foo')
    ['ed']
    >>> stem('', 'foo')
    []
    """
    if len(long) < len(short):
        short, long = long, short
    # str.split('') raises ValueError, and a stem that does not occur in the
    # word would otherwise hand back the whole word as a bogus "affix".
    if not short or short not in long:
        return []
    return [affix for affix in long.split(short) if affix]


def get_all_affixes(matches):
    """
    Collect the affixes of every word relative to the shortest word.

    The words are ordered with ``sort_by_length``; the first one is used as
    the stem and ``stem`` is applied to each remaining word.  The input list
    is not modified.  An empty input yields an empty list.

    >>> get_all_affixes(['foo', 'fooed', 'qux', 'befooified'])
    ['ed', 'be', 'ified']
    >>> get_all_affixes([])
    []
    """
    ordered = sort_by_length(matches)
    if not ordered:
        return []
    word, rest = ordered[0], ordered[1:]
    affixes = []
    for match in rest:
        affixes.extend(stem(word, match))
    return affixes


def flatten_list(matched, matches):
    """
    Return one flat list: ``matched`` followed by everything in ``matches``.

    A string ``matched`` is kept as a single item (it is not split into
    characters); any other iterable contributes its elements.

    >>> flatten_list('a', ['b', 'c', 'd'])
    ['a', 'b', 'c', 'd']
    >>> flatten_list('foo', ['fooed'])
    ['foo', 'fooed']
    """
    if isinstance(matched, _STRING_TYPES):
        li = [matched]
    else:
        li = list(matched)
    li.extend(matches)
    return li


def sort_by_length(wordlist):
    """
    Return the words ordered by length; equal lengths are ordered by value.

    >>> sort_by_length(['aaa','a','aa'])
    ['a', 'aa', 'aaa']
    """
    return sorted(wordlist, key=lambda w: (len(w), w))


def words(text):
    """
    Return every run of word characters in ``text``, in order of appearance.

    >>> words('Call me Ishmael, dammit.')
    ['Call', 'me', 'Ishmael', 'dammit']
    """
    return _WORD_RE.findall(text)


def unique(seq):
    """
    Return the distinct items of ``seq``, keeping first-occurrence order.

    Items must be hashable.

    >>> unique('a a a b a'.split())
    ['a', 'b']
    """
    seen = set()
    result = []
    for item in seq:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result


def wordlist(text, lower=False):
    """
    Return the sorted list of unique words in ``text``.

    With ``lower=True`` the words are lower-cased before duplicates are
    dropped, so differently-cased spellings collapse to a single entry.

    >>> wordlist('Call me Ishmael, Ishmael, dammit.')
    ['Call', 'Ishmael', 'dammit', 'me']
    >>> wordlist('Call call me', lower=True)
    ['call', 'me']
    """
    tokens = words(text)
    if lower:
        tokens = [w.lower() for w in tokens]
    wl = unique(tokens)
    wl.sort()
    return wl


def plaintext(fname):
    """Read the file ``fname`` and return its content decoded as UTF-8.

    Undecodable bytes are silently dropped (``errors='ignore'``).
    """
    # Read bytes and decode explicitly so the result is text on both
    # Python 2 and Python 3; ``with`` guarantees the handle is closed.
    with open(fname, 'rb') as fh:
        return fh.read().decode('utf-8', 'ignore')


def collect_matches(wl1, wl2):
    """Return the close matches of every word, in both directions.

    The result holds one list per word of ``wl1`` (its close matches in
    ``wl2``) followed by one list per word of ``wl2`` (its close matches in
    ``wl1``).  Lists may be empty.

    NOTE(review): the two directions use different cutoffs (0.9 versus the
    difflib default); this is preserved from the original -- confirm whether
    the asymmetry is intentional.
    """
    all_matches = []
    for word in wl1:
        all_matches.append(get_close_matches(word, wl2, cutoff=0.9))
    for word in wl2:
        all_matches.append(get_close_matches(word, wl1))
    return all_matches


def guess_affixes(text1, text2):
    """Print every group of two or more similar words found across the texts.

    Each group is printed one word per line and followed by a blank line.
    """
    words1 = wordlist(text1)
    words2 = wordlist(text2)
    for match in collect_matches(words1, words2):
        if len(match) > 1:
            for word in match:
                print(word)
            print("")  # blank separator line between groups


def _test():
    """Run the doctests embedded in this module."""
    doctest.testmod()


if __name__ == "__main__":
    testing = 0  # set to 1 to run the doctests instead of the command line
    if testing:
        _test()
        sys.exit(0)
    if len(sys.argv) != 3:
        print("Usage: python guess_affixes.py FILE1 FILE2")
        sys.exit(1)  # non-zero status: the invocation was wrong
    text1 = plaintext(sys.argv[1])
    text2 = plaintext(sys.argv[2])
    guess_affixes(text1, text2)