#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Extract content of "interwiki links" from Wikipedia dumps and insert
into a database.

Dumps are large files available at download.wikipedia.org. For testing
purposes it's best to use a small Wikipedia.

Usage:
    python extract_terms.py xxwiki-dddddddd-pages-articles.xml.bz2

The database settings default to the values that used to be hardcoded
(localhost / root / empty password / database "lex"); pass other values
to ``Interlex`` to override them.
"""

from xml.sax import ContentHandler
from xml.sax import make_parser
from pprint import pprint
import bz2
import codecs
import os
import re
import sys

try:
    import MySQLdb
except ImportError:
    # Keep the parsing code importable without the driver installed;
    # Interlex.__init__ raises a clear error when a connection is needed.
    MySQLdb = None

if sys.version_info[0] < 3:
    # Python 2 only: make the standard streams speak UTF-8.  On Python 3
    # the streams are already text streams and must not be wrapped.
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
    sys.stdin = codecs.getreader('utf-8')(sys.stdin)

# NOTE(review): `lexicon` only ever receives one empty dict per page title
# and is never read in this file, yet it grows with the size of the dump.
# Kept for backward compatibility -- confirm nobody imports it, then drop.
lexicon = {}
doc = u''

# Raw string so the regex escapes are not interpreted as (invalid) Python
# string escapes.  Compiled with re.VERBOSE: whitespace and "#" comments
# inside the pattern are ignored by the regex engine.
pattern = r"""
    \[\[                # opening [[
    ( [-a-z][-a-z:]+ )  # target language code
    \s?:                # some whitespace?:
    ( [^\]]* )          # $2: source language the term
    \]\]                # closing ]]
    [\s]*               # some space (why?)
"""

insertion = """
    INSERT INTO lex
        (source_code, target_code, source_term, target_term )
    VALUES (%s, %s, %s, %s)""".encode('utf-8')

interwiki_re = re.compile(pattern, re.VERBOSE | re.UNICODE)


class SGMLSyntaxError(Exception):
    """Raised by ErrorHandler.fatalError for a non-recoverable parse error.

    This used to be a plain string; raising a string is itself a TypeError
    on Python 2.6 and later, so it is now a real exception class that keeps
    the same module-level name.
    """


def dump_language_code(dumpfile):
    """Return the language code encoded in a dump file name.

    The code is everything in the file's base name that precedes the first
    occurrence of ``wiki``, e.g. ``de`` for ``dewiki-20080101-...xml.bz2``.
    Only the base name is inspected, so a directory component that happens
    to contain ``wiki`` does not leak into the result.

    Raises ValueError if the base name does not contain ``wiki``.
    """
    basename = os.path.basename(dumpfile)
    end = basename.find('wiki')
    if end == -1:
        raise ValueError(
            "cannot derive a language code from %r: "
            "expected a name like 'xxwiki-...'" % (dumpfile,))
    return basename[:end]


class Interlex(ContentHandler):
    """SAX handler that stores the interwiki links of every page in MySQL.

    For each ``<text>`` element, every link matched by ``interwiki_re`` is
    inserted into the ``lex`` table as
    (source_code, target_code, source_term, target_term), where the source
    term is the most recently seen ``<title>``.
    """

    def __init__(self, dumpfile, host="localhost", user="root", passwd="",
                 db="lex"):
        """Prepare parser state and open the database connection.

        dumpfile -- path of the dump; its base name supplies the source
                    language code (see dump_language_code).
        host, user, passwd, db -- MySQL connection settings.

        Raises ValueError if no language code can be derived from
        `dumpfile`, and ImportError if the MySQLdb driver is missing.
        """
        ContentHandler.__init__(self)
        self.title = u''
        self.text = u''
        self.titleparts = []  # joined in endElement('title')
        self.textparts = []   # joined in endElement('text')
        self.title_flag = False
        self.text_flag = False
        self.i = 0            # number of <text> elements processed
        self.source_code = dump_language_code(dumpfile)
        if MySQLdb is None:
            raise ImportError(
                "the MySQLdb module is required to store extracted links")
        self.db = MySQLdb.connect(host=host, user=user, passwd=passwd,
                                  db=db, use_unicode=True)
        self.cursor = self.db.cursor()

    def close(self):
        """Close the cursor and the database connection."""
        self.cursor.close()
        self.db.close()

    def startElement(self, name, attrs):
        """Start collecting character data for <title> and <text>."""
        if name == 'title':
            self.title_flag = True
            self.titleparts = []
        elif name == 'text':
            self.text_flag = True

    def characters(self, content):
        """Buffer one chunk of character data.

        The parser may deliver the content of a single element in several
        chunks, so chunks are only accumulated here and joined when the
        element ends.
        """
        if self.title_flag:
            self.titleparts.append(content)
        if self.text_flag:
            self.textparts.append(content)

    def endElement(self, name):
        """Finish a <title>, or store the links of a finished <text>."""
        if name == 'title':
            self.title_flag = False
            self.title = u''.join(self.titleparts)
            self.titleparts = []
            lexicon[self.title] = {}
        elif name == 'text':
            self.text_flag = False
            self.text = u''.join(self.textparts)
            for target_code, translation in interwiki_re.findall(self.text):
                entry = (self.source_code, target_code,
                         self.title, translation)
                entry = tuple(e.encode('utf-8').strip() for e in entry)
                self.cursor.execute(insertion, entry)
            self.db.commit()
            self.i += 1
            if self.i % 10000 == 0:
                # Progress indicator on stderr every 10000 pages.
                sys.stderr.write("%d\n" % self.i)
            self.title = ''
            self.textparts = []
            self.text = ''


class ErrorHandler:
    """Report SAX parser problems on stderr; abort on fatal errors."""

    def error(self, exception):
        "Report a recoverable error and carry on."
        sys.stderr.write(u"ERROR: %s\n" % exception)

    def fatalError(self, exception):
        "Handle a non-recoverable error."
        sys.stderr.write("FATAL ERROR: %s\n" % exception)
        raise SGMLSyntaxError(exception)

    def warning(self, exception):
        "Handle a warning."
        sys.stderr.write("Warning: %s\n" % exception)


def main(argv):
    """Parse the dump named by argv[1] and store its interwiki links.

    Returns a process exit status: 0 on success, 2 when the dump argument
    is missing.
    """
    if len(argv) < 2:
        sys.stderr.write(__doc__)
        return 2
    dump = argv[1]
    if dump.endswith('.bz2'):
        corpus = bz2.BZ2File(dump, "r")
    else:
        # Binary mode: the XML parser does its own decoding.
        corpus = open(dump, "rb")
    try:
        lex = Interlex(dump)
        try:
            parser = make_parser()
            parser.setErrorHandler(ErrorHandler())
            parser.setContentHandler(lex)
            parser.parse(corpus)
        finally:
            lex.close()
    finally:
        corpus.close()
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))