#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Extract content of "interwiki links" from Wikipedia dumps. 

Dumps are large files available at download.wikipedia.org. For
testing purposes it's best to use a small Wikipedia.


Usage: 
 python extract_terms.py xxwiki-dddddddd-pages-articles.xml.bz2 

"""
from xml.sax import ContentHandler
from xml.sax import make_parser
from pprint import pprint
import re, sys
import bz2
import sys, codecs
# Wrap the standard streams in UTF-8 codecs: text written to sys.stdout is
# encoded as UTF-8 and data read from sys.stdin is decoded from UTF-8.
# NOTE(review): wrapping the raw streams like this presumably targets the
# Python 2 byte-oriented sys.stdout/sys.stdin -- confirm before running
# under Python 3. sys.stdin is not read anywhere in the visible code.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
sys.stdin = codecs.getreader('utf-8')(sys.stdin)


# Module-level registry filled by the Interlex handler: every page title it
# sees becomes a key mapped to an empty dict.
lexicon = {}
# Not referenced anywhere in the visible code; kept so the name stays defined.
doc = u''

# Verbose regex matching one link of the form [[code:term]].
#   group 1: the prefix before the last colon (lowercase letters, '-' and
#            ':'; at least two characters)
#   group 2: everything up to the closing ]]
# A raw string is used so that \[, \s and \] reach the regex engine as
# written instead of being treated as (invalid) string escape sequences.
pattern = r"""
    \[\[                       # opening [[
    (
        [-a-z][-a-z:]+
    )                          # language code 
         \s?:                  # some whitespace?:
    (
        [^\]]*
    )                          # $2: the term
    \]\]                       # closing ]]
    [\s]*                      # some space (why?)
""" 


# Compiled once at import time; used for every <text> element.
interwiki_re = re.compile(pattern, re.VERBOSE)


class Interlex(ContentHandler):
    """SAX content handler that prints the interwiki links of each page.

    For every <text> element, each match of the module-level
    ``interwiki_re`` is written to sys.stdout as one tab-separated line::

        <code> TAB <title of the enclosing page> TAB <linked term>

    Every complete page title is also registered as a key in the
    module-level ``lexicon`` dict. A progress counter is written to
    sys.stderr after every 10000 links.
    """

    def __init__(self):
        # ContentHandler is an old-style class on Python 2, so call the base
        # initialiser explicitly instead of through super().
        ContentHandler.__init__(self)

        self.title = u''
        self.text = u''
        self.titleparts = []  # chunks of the current <title>, joined in endElement()
        self.textparts = []  # these are joined in endElement()
        self.title_flag = False
        self.text_flag = False
        self.i = 0  # number of links written so far

    def startElement(self, name, attrs):
        """Start collecting character data when a <title> or <text> opens."""
        if name == 'title':
            self.titleparts = []
            self.title_flag = True
        elif name == 'text':
            self.text_flag = True

    def characters(self, content):
        """Buffer one chunk of character data for the element being read.

        A SAX parser may split the text of a single element across several
        calls (for instance around an entity such as ``&amp;``), so chunks
        are only collected here and joined once the element is closed.
        """
        if self.title_flag:
            self.titleparts.append(content)
        if self.text_flag:
            self.textparts.append(content)

    def endElement(self, name):
        """Finish a <title> or <text>; emit the links found in a <text>."""
        if name == 'title':
            self.title_flag = False
            self.title = ''.join(self.titleparts)
            self.titleparts = []
            if self.title:
                lexicon[self.title] = {}
        elif name == 'text':
            self.text_flag = False
            self.text = ''.join(self.textparts)
            for code, translation in interwiki_re.findall(self.text):
                entry = "%s\t%s\t%s" % (code, self.title, translation)
                sys.stdout.write(entry + '\n')
                self.i += 1
                if self.i % 10000 == 0:
                    # Progress indicator on stderr so it never mixes with
                    # the data written to stdout.
                    sys.stderr.write("%d\n" % self.i)
            # Reset the per-page state before the next page starts.
            self.title = ''
            self.textparts = []
            self.text = ''

class SGMLSyntaxError(Exception):
    """Raised by ErrorHandler.fatalError when parsing cannot continue.

    This used to be a plain string ("SGML syntax error"); string exceptions
    cannot be raised on Python 2.6+ or Python 3, so it is now a real
    exception class under the same module-level name.
    """


class ErrorHandler:
    """SAX error handler that reports parser problems on sys.stderr.

    Warnings and recoverable errors are only logged; a fatal error is
    logged and then aborts parsing by raising SGMLSyntaxError.
    """

    def error(self, exception):
        "Handle a recoverable error."
        sys.stderr.write("ERROR: %s\n" % exception)

    def fatalError(self, exception):
        "Handle a non-recoverable error."
        sys.stderr.write("FATAL ERROR: %s\n" % exception)
        raise SGMLSyntaxError("%s" % exception)

    def warning(self, exception):
        "Handle a warning."
        sys.stderr.write("Warning: %s\n" % exception)




if __name__ == "__main__":
    # Script entry point: parse the dump named on the command line and let
    # the Interlex handler print the interwiki links it finds.
    import os  # only needed here, to strip directories from the dump path

    if len(sys.argv) < 2:
        sys.stderr.write(
            "Usage: python %s xxwiki-dddddddd-pages-articles.xml.bz2\n"
            % sys.argv[0])
        sys.exit(1)

    dump = sys.argv[1]

    # The part of the file name in front of "wiki" is printed as the first
    # output line. Only the base name is inspected so that directory
    # components of the path never end up in it.
    basename = os.path.basename(dump)
    marker = basename.find('wiki')
    if marker == -1:
        sys.stderr.write(
            "Warning: no 'wiki' in file name %s\n" % basename)
        sourcecode = ''
    else:
        sourcecode = basename[:marker]
    print(sourcecode)

    # Dumps are read as bytes; the XML parser handles the decoding.
    if dump.endswith('.bz2'):
        corpus = bz2.BZ2File(dump, "r")
    else:
        corpus = open(dump, "rb")

    try:
        lex = Interlex()
        parser = make_parser()
        parser.setErrorHandler(ErrorHandler())
        parser.setContentHandler(lex)

        parser.parse(corpus)
    finally:
        # Close the dump even when parsing aborts with an exception.
        corpus.close()

