#!/usr/bin/env python
# -*- coding: utf-8 -*- 
import sys, codecs
from snag import snag
sys.path.append('/home/pat/repo/wikipedia')
from extract_names import interwiki_re
#sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

"""
A crude implementation of part of:

AUTHOR: Koehn, Philipp and Knight, Kevin
TITLE: Learning a Translation Lexicon from Monolingual Corpora
Abstract: This paper presents work on the task of constructing a word-level translation lexicon purely from unrelated monolingual corpora. We combine various clues such as cognates, similar context, preservation of word similarity, and word frequency. Experimental results for the construction of a German-English noun lexicon are reported.
Booktitle: Proceedings of ACL Workshop on Unsupervised Lexical Acquisition
Citeulike-article-id: 2324020
Keywords: lexical_acquisition, nlp
Priority: 4
Year: 2002
URL: http://citeseer.ist.psu.edu/509449.html 
"""


# Collect the page titles to fetch from the command line.  The arguments are
# joined and then re-split on whitespace, so a single quoted argument that
# contains spaces contributes several queries.
# NOTE: `input` shadows the builtin of the same name; the module-level name is
# kept unchanged so that anything else reading it keeps working.
input = ' '.join(sys.argv[1:])
print("query is of type: ")
# This used to print type(query), but `query` is not bound until the loop
# below, so the script died with a NameError before fetching anything.
print(type(input))

# Maps each query string to whatever snag() returns for it.
pages = {}

for query in input.split():
  pages[query] = snag(query)

# Sanity check: .index() raises ValueError when a page has no '<text' opening
# tag or no '</text>' closing tag, which aborts the run.
# NOTE(review): start/end are computed but never used afterwards; presumably
# the page was meant to be trimmed to its <text> element -- confirm before
# slicing, since that would change what the word extraction below sees.
for v in pages.values():
  start = v.index('<text')
  end = v.index('</text>')


def nopunc(w):
  """Tell whether *w* is free of markup punctuation.

  Returns True when no character of *w* is one of ``: < > { } / ? = [ ]``,
  and False as soon as at least one of them occurs in *w*.
  """
  punctuation = u":<>{}<>/?=[]"
  return set(punctuation).isdisjoint(w)

# For every fetched page, build the set of whitespace-separated tokens that
# are not matched by interwiki_re and contain no markup punctuation (nopunc).
wordsets = {}

for query, text in pages.items():
  tokens = text.decode('utf-8').split()
  wordsets[query] = set(
    w for w in tokens if not interwiki_re.match(w) and nopunc(w))

# With fewer than two distinct queries the old wordsets.values()[1] lookup
# crashed with an IndexError; stop with a readable message instead.
if len(wordsets) < 2:
  sys.exit("need at least two distinct queries to compare")

# Words shared by all pages.  For exactly two pages this is the same
# intersection as before; with more than two, the previous code compared an
# arbitrary pair chosen by dict ordering.
common = set.intersection(*wordsets.values())

# NOTE(review): printing unicode under Python 2 can raise UnicodeEncodeError
# when stdout is not a terminal (e.g. piped to a file) -- compare the
# commented-out codecs writer near the imports; confirm how this is run.
for x in sorted(common):
  print(x)
