#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Extract the content of "interwiki links" from Wikipedia dumps.

Dumps are large files available at download.wikipedia.org. For testing
purposes it is best to use the dump of a small Wikipedia.

Every interwiki link found is written to standard output as one
tab-separated line: language code, article title, linked term.

Usage:

    python extract_terms.py xxwiki-dddddddd-pages-articles.xml.bz2
"""

from xml.sax import ContentHandler
from xml.sax import make_parser
from pprint import pprint
import bz2
import codecs
import os
import re
import sys

if sys.version_info[0] < 3:
    # Python 2 only: byte-oriented stdio needs an explicit codec so that
    # unicode strings can be written to pipes and files.  On Python 3 the
    # standard streams are already text streams and must not be wrapped.
    sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
    sys.stdin = codecs.getreader('utf-8')(sys.stdin)

# Maps every article title seen so far to an (empty) dict.  Nothing in this
# file reads it back; it is kept because it is part of the module namespace.
lexicon = {}
doc = u''

# Raw string: the value is identical to the original pattern, but the
# backslash escapes no longer rely on Python passing unknown escapes through.
pattern = r"""
    \[\[                    # opening [[
    ( [-a-z][-a-z:]+ )      # language code
    \s?:                    # some whitespace?:
    ( [^\]]* )              # $2: the term
    \]\]                    # closing ]]
    [\s]*                   # some space (why?)
"""

interwiki_re = re.compile(pattern, re.VERBOSE)


class SGMLSyntaxError(Exception):
    """Raised by ErrorHandler.fatalError() when parsing cannot continue."""


class Interlex(ContentHandler):
    """SAX handler that prints the interwiki links of every <text> element.

    The handler remembers the content of the most recent <title> element
    and, at the end of each <text> element, writes one line per link to
    sys.stdout: language code, title and linked term, separated by tabs.

    Attributes:
        title: content of the last completed <title> element ('' once the
            following <text> element has been processed).
        text: content of the <text> element while it is being processed.
        titleparts, textparts: character chunks collected so far.
        title_flag, text_flag: True while inside the respective element.
        i: progress counter, reported on sys.stderr every 10000 steps.
    """

    def __init__(self):
        ContentHandler.__init__(self)
        self.title = u''
        self.text = u''
        self.titleparts = []    # joined in endElement()
        self.textparts = []     # joined in endElement()
        self.title_flag = False
        self.text_flag = False
        self.i = 0

    def startElement(self, name, attrs):
        """Start collecting characters for <title> and <text> elements."""
        if name == 'title':
            self.title_flag = True
            self.titleparts = []
        elif name == 'text':
            self.text_flag = True

    def characters(self, content):
        """Collect a chunk of character data.

        SAX parsers may split the content of one element over several
        calls, so chunks are only accumulated here and joined once the
        element is closed.
        """
        if self.title_flag:
            self.titleparts.append(content)
        if self.text_flag:
            self.textparts.append(content)

    def endElement(self, name):
        """Finish a <title> or <text> element.

        Closing <title> stores the complete title and registers it in
        ``lexicon``.  Closing <text> writes all interwiki links found in
        the text to sys.stdout and resets the per-page state.
        """
        if name == 'title':
            self.title_flag = False
            self.title = u''.join(self.titleparts)
            self.titleparts = []
            lexicon[self.title] = {}
        elif name == 'text':
            self.text_flag = False
            self.text = u''.join(self.textparts)
            for code, translation in interwiki_re.findall(self.text):
                entry = "%s\t%s\t%s" % (code, self.title, translation)
                sys.stdout.write(entry + '\n')
            # NOTE(review): the original indentation was lost, so it is not
            # certain whether the counter was meant to count <text> elements
            # or individual links; it counts <text> elements here.
            self.i += 1
            if self.i % 10000 == 0:
                sys.stderr.write(str(self.i) + '\n')
            self.title = u''
            self.textparts = []
            self.text = u''


class ErrorHandler:
    """SAX error handler that reports problems on sys.stderr."""

    def error(self, exception):
        "Report a recoverable error."
        sys.stderr.write("ERROR: %s\n" % exception)

    def fatalError(self, exception):
        "Handle a non-recoverable error."
        sys.stderr.write("FATAL ERROR: %s\n" % exception)
        # A real exception class: string exceptions cannot be raised on
        # Python 2.6 and later.
        raise SGMLSyntaxError("SGML syntax error: %s" % exception)

    def warning(self, exception):
        "Handle a warning."
        sys.stderr.write("Warning: %s\n" % exception)


def main(argv=None):
    """Parse the dump named in argv[1] and print its interwiki links.

    The first output line is the part of the dump's file name that
    precedes 'wiki' (e.g. 'xx' for 'xxwiki-...'); it is empty when the
    file name does not contain 'wiki'.

    Args:
        argv: argument list including the program name; defaults to
            sys.argv.

    Returns:
        0 on success, 2 when no dump file was given.

    Raises:
        SGMLSyntaxError: if the parser reports a fatal error.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.stderr.write("Usage: python extract_terms.py "
                         "xxwiki-dddddddd-pages-articles.xml.bz2\n")
        return 2

    dump = argv[1]
    # Only look at the file name so that a 'wiki' in a directory name does
    # not end up in the source code.
    filename = os.path.basename(dump)
    sourcecode, found, _rest = filename.partition('wiki')
    if not found:
        sys.stderr.write("Warning: no 'wiki' in dump file name %r\n"
                         % filename)
        sourcecode = ''

    # Python 3.7+: emit UTF-8 regardless of the locale, matching the codec
    # wrapper used on Python 2.  Streams without reconfigure() are left alone.
    reconfigure = getattr(sys.stdout, 'reconfigure', None)
    if reconfigure is not None:
        reconfigure(encoding='utf-8')

    sys.stdout.write(sourcecode + '\n')

    # Binary mode: the XML parser decodes the bytes itself.
    if dump.endswith('.bz2'):
        corpus = bz2.BZ2File(dump, "r")
    else:
        corpus = open(dump, "rb")
    try:
        parser = make_parser()
        parser.setErrorHandler(ErrorHandler())
        parser.setContentHandler(Interlex())
        parser.parse(corpus)
    finally:
        corpus.close()
    return 0


if __name__ == "__main__":
    sys.exit(main())