#!/usr/bin/env python
# -*- coding: utf-8 -*-
import wpalign
import urllib2
import codecs
import sys

# Replace standard output with a stream writer that encodes everything
# written to it as UTF-8.
sys.stdout = codecs.lookup('utf-8').streamwriter(sys.stdout)

def wikiurl(code, topic):
    """Build the Special:Export URL for a Wikipedia article.

    Args:
        code: Subdomain placed in front of ``.wikipedia.org`` (e.g. ``en``).
        topic: Article title, either as text or as UTF-8 encoded bytes.

    Returns:
        ``http://<code>.wikipedia.org/wiki/Special:Export/<topic>`` with the
        title percent-encoded.
    """
    # Percent-encoding operates on bytes, so encode first and quote second.
    # Quoting the unicode title first fails under Python 2 as soon as the
    # title contains a non-ASCII character.
    if not isinstance(topic, bytes):
        topic = topic.encode('utf-8')
    return u"http://%s.wikipedia.org/wiki/Special:Export/%s" % (
        code, urllib2.quote(topic))

def download_article(code, topic, userAgent="innocentrobot"):
    """Fetch the Special:Export page of an article and return it as text.

    Args:
        code: Wikipedia subdomain passed on to ``wikiurl``.
        topic: Article title passed on to ``wikiurl``.
        userAgent: Value sent in the ``User-agent`` request header.

    Returns:
        The response body decoded from UTF-8.

    Errors raised by ``urllib2.urlopen`` or by the UTF-8 decoding propagate
    to the caller.
    """
    url = wikiurl(code, topic)
    req = urllib2.Request(url, None, {'User-agent': userAgent})
    response = urllib2.urlopen(req)
    try:
        wikitext = response.read()
    finally:
        # Release the connection even when reading the body fails.
        response.close()
    return wikitext.decode('utf-8')

# Command line: two arguments, each of the form <language code>:<title>.
if (len(sys.argv) < 3
        or ':' not in sys.argv[1]
        or ':' not in sys.argv[2]):
    sys.exit("usage: %s SOURCE_CODE:TITLE TARGET_CODE:TITLE" % sys.argv[0])

# Split on the first colon only, so a title that itself contains a colon
# (e.g. "Star Trek: The Next Generation") is kept whole.
source_code, source_title = sys.argv[1].decode('utf-8').split(':', 1)
target_code, target_title = sys.argv[2].decode('utf-8').split(':', 1)

# Download both articles first, then hand the raw text to wpalign.
source = download_article(source_code, source_title)
target = download_article(target_code, target_title)

source = wpalign.Article(source)
target = wpalign.Article(target)

# Collect every (source sentence, target sentence) pair that shares more
# than `threshhold` links.
similar = []
threshhold = 2

for s in source.sentences:
    for t in target.sentences:
        common = len(s.links.intersection(t.links))
        if common > threshhold:
            similar.append((common, s, t))

# Single-argument parenthesised prints produce the same output as the
# Python 2 print statement.
print(similar)

# Order by the number of shared links only; pairs with equal counts keep
# the order in which they were found instead of depending on how the
# sentence objects compare with each other.
for common, s, t in sorted(similar, key=lambda match: match[0]):
    print(s.text)
    print('')
    print(t.text)
    print('\n-------\n')


