#!/usr/bin/env python
# -*- coding:utf-8 -*-
from languages import names2codes, codes2names
import urllib2 
import sys
import re
from collections import defaultdict
from BeautifulSoup import BeautifulStoneSoup

def wikiurl(code, topic):
  """Build the Special:Export URL of an article on one Wikipedia edition.

  code  -- language subdomain of the wiki, e.g. 'en' or 'it'.
  topic -- article title, as text or as UTF-8 encoded bytes.

  Returns the URL as a unicode string; the title is percent-encoded.
  """
  # Percent-encoding operates on bytes, so text has to be UTF-8 encoded
  # *before* quoting.  (Encoding after quoting, as was done previously, is
  # too late: quoting a non-ASCII unicode title fails under Python 2.)
  if not isinstance(topic, bytes):
    topic = topic.encode('utf-8')
  quoted = urllib2.quote(topic)
  return u"http://%s.wikipedia.org/wiki/Special:Export/%s" % (code, quoted)

def download_article(code, topic, userAgent="innocentrobot"):
  """Download the Special:Export document for one article.

  code      -- language subdomain of the wiki, e.g. 'en'.
  topic     -- article title, passed on to wikiurl().
  userAgent -- value sent in the User-agent request header.

  Returns the raw response body.  As a debugging aid the body is also
  written to /tmp/foo.txt, overwriting any previous dump.
  Errors raised by urllib2.urlopen() propagate to the caller.
  """
  url = wikiurl(code, topic)
  req = urllib2.Request(url, None, {'User-agent': userAgent})
  response = urllib2.urlopen(req)
  try:
    wikitext = response.read()
  finally:
    # Release the connection even if reading fails.
    response.close()
  # Close the dump file deterministically instead of leaking the handle.
  with open('/tmp/foo.txt', 'w') as dump:
    dump.write(wikitext)
  return wikitext

def extract_target_topic(code, wikitext):
  """Find the title an article links to on another language's Wikipedia.

  code     -- language code of the target wiki; must be a key of
              codes2names.
  wikitext -- text of the source article, searched for an interlanguage
              link of the form [[code:Title]].

  Returns the title from the first such link.
  Writes a message to stdout and exits the program when `code` is not a
  known language code.  Raises ValueError when the text holds no link for
  that language.
  """
  # A membership test never raises KeyError, so the check has to be an
  # explicit `if`; wrapped in try/except it silently let every code through.
  if code not in codes2names:
    sys.stdout.write('language code for pattern unknown.')
    sys.exit()
  # Escape the code so characters in it cannot act as regex syntax.
  linkRE = re.compile(r'\[\[%s:([^\]]+)\]\]' % re.escape(code))
  match = linkRE.search(wikitext)
  if match is None:
    raise ValueError('no [[%s:...]] interlanguage link found' % code)
  return match.group(1)

def download_article_pair(sourcecode, targetcode, sourcetopic):
  """Download an article and its counterpart in a second language.

  For a request such as ('en', 'it', 'Italy') the en:Italy article is
  fetched first; the title of the matching article is read from it and
  that article (it:Italia) is fetched as well.

  Returns a dict mapping each language code to its downloaded article.
  """
  pair = {}
  pair[sourcecode] = download_article(sourcecode, sourcetopic)
  # The source article itself names its counterpart on the target wiki.
  counterpart = extract_target_topic(targetcode, pair[sourcecode])
  pair[targetcode] = download_article(targetcode, counterpart)
  return pair

"""
5. Extract sentences from both pages.
"""

def strip_info_boxes(wikitext):
  """Remove {{template}} markup (which includes infoboxes) from wikitext.

  Templates may be nested, e.g. {{Infobox | flag = {{flag|Italy}} }}, so
  the innermost templates are removed repeatedly until none are left.
  A single pass that stops at the first '}}' would leave the tail of the
  outer template behind.

  Returns the text with every balanced template removed; unbalanced
  braces are left untouched.
  """
  # An innermost template: '{{' ... '}}' whose content holds no further
  # '{{' or '}}'.  Lone '{' and '}' characters are allowed inside.
  innermost = re.compile(r'\{\{(?:[^{}]|\{(?!\{)|\}(?!\}))*\}\}')
  stripped, removed = innermost.subn('', wikitext)
  while removed:
    # Removing inner templates can turn their parents into innermost ones.
    stripped, removed = innermost.subn('', stripped)
  return stripped

def extract_text_from_article(article):
  """Return the rendered contents of page/revision/text in an export document.

  article -- markup of an exported page, parsed with BeautifulStoneSoup.
  """
  revision = BeautifulStoneSoup(article).page.revision
  return revision.text.renderContents()

def sentences(text):
  """Split text into a list of sentences.

  A sentence ends at '.', '!' or '?' followed by whitespace.  The closing
  punctuation stays attached to its sentence, and empty pieces (from an
  empty text or trailing whitespace) are dropped.

  This is a plain heuristic: abbreviations such as 'e.g. ' are treated as
  sentence ends too.
  """
  # The look-behind keeps the punctuation out of the separator, so it is
  # not returned as a list item of its own.
  pieces = re.split(r'(?<=[.!?])\s+', text)
  return [piece for piece in pieces if piece]

def annotate_sentences_with_links(sentence):
  """Map every sentence of a text to the links found in that sentence.

  sentence -- the text to process.  Despite the (historical) parameter
              name, the text is split with sentences() first, so it may
              hold any number of sentences.

  Returns a dict {sentence: result of extract_links(sentence)}.
  """
  # Read the text from the parameter (the former code referenced an
  # undefined name) and hand the mapping back to the caller.
  links = {}
  for piece in sentences(sentence):
    links[piece] = extract_links(piece)
  return links

#def remove_wikitext_quoting(wikitext):
#  """wikipedia markup uses the notation '''bold''' and ''italic'', 
#  which conflicts with Python's commenting syntax, just nuke it"""
#  multiplequoteRE = re.compile(('""+'|"''+")
#  multiplequoteRE.sub(  

def extract_links(sentence):
  """Return the set of wiki-link targets found in a sentence.

  For a piped link [[target|label]] the target is collected; for a plain
  link [[name]] the name is.  Only names built from word characters and
  spaces (plus parentheses in a piped target) are recognised.
  """
  pattern = re.compile(r"""
    \[\[
    (([ \w\(\)]+)\|)?
    ([ \w]+)
  \]\]""", re.VERBOSE)
  targets = set()
  for piped, target, label in pattern.findall(sentence):
    # `piped` is 'target|' for a piped link and '' for a plain one.
    targets.add(target if piped.endswith('|') else label)
  return targets

# Permissive wiki-link matcher: captures everything between '[[' and ']]',
# including the '|label' part of a piped link.
anotherlinkRE = re.compile(r"""
    \[\[
       ([^\]]+)
    \]\]
  """, flags=re.VERBOSE)

def compare_sentences(source, target):
  """Score every source sentence against every target sentence.

  source, target -- collections of sentence objects accepted by
                    similarity(), i.e. objects with a `links` set.

  Returns a defaultdict(list) mapping each source sentence to its list of
  similarity scores, one per target sentence, in target order.
  """
  ranks = defaultdict(list)
  for s in source:
    for t in target:
      # similarity() is the scoring function this module defines.
      ranks[s].append(similarity(s, t))
  return ranks

class LinkedSentence:
  """A Wikipedia sentence bundled with the set of links extracted from it."""

  def __init__(self, sentence):
    """Store the sentence and the result of extract_links() for it."""
    self.sentence = sentence
    found = extract_links(self.sentence)
    self.links = found

def similarity(source,target):
  """Return how many links the two sentences have in common.

  Both arguments must expose a `links` attribute; `source.links` must be
  a set.
  """
  shared = source.links.intersection(target.links)
  return len(shared)

# Hand-written sample sentences and their LinkedSentence wrappers, built at
# import time for experimenting with the comparison functions.
wikisentence = "Here is a typical sentence in Wikipedia with a [[link]] and [[another|some other link]]."
wikisentence2 = "Here is a different one with a [[link]] and [[Bruce Lee]]."
wikisentence3 = "[[Bruce Lee]] was never in a film with [[Bambi]]."

samples = tuple(map(LinkedSentence, (wikisentence, wikisentence2, wikisentence3)))
linked, linked2, linked3 = samples


#terminal_punctuationRE = re.compile(u"""[,，﹐ ، ߸ ᠂ ᠈ 、﹑､ ;;；﹔ ؛ \:：﹕ ։ ܃-܈ ፡ ፣-፦ ᠄ ᠅ ៖ ᭝ ᛫-᛭ !！﹗ ‼ ⁉ ߹ ᥄ ?？﹖ ⁈ ⁇ ؟ ܉ ፧ ᥅ ‽ .．﹒ ۔ ܁ ܂ ። ᠃ ᠉ ᙮ 。｡ · । ॥ ꡶ ꡷ ၊ ။ ។ ៕ ᭞ ᭟ ܀ ፨ ᭚ ᭛ 𐎟 𐏐 𐤟 𒑰-𒑳 ׃ ܊ ܌ ๚ ๛ ༈ །-༒ ៚ ᙭]""")

# Disabled command-line entry point: the code below sits inside a bare
# string literal, so it is never executed.
# NOTE(review): it reads the arguments as (sourcecode, topic, targetcode)
# and passes them on in that order, while download_article_pair() is
# declared as (sourcecode, targetcode, sourcetopic) -- reconcile the order
# before re-enabling this block.
"""
if __name__ == "__main__":
  (sourcecode, topic, targetcode) = sys.argv[1:4]
  for x in (sourcecode, topic, targetcode): print x
  pair = download_article_pair(sourcecode, topic, targetcode)

  print pair.keys()
  for v in pair.values(): print v[:1000]



"""


