""" Convert from mediawiki xml to git commits using a pull parser. The pull parser is because loading the entire xml file at once is probably going to require too much memory (not to mention the memory requirements for storing a parsed version of the xml file). """ import sys from io import BytesIO import lxml.etree import yaml namespaces = { "mw": "http://www.mediawiki.org/xml/export-0.6/", } usernames = set() revision_titles = {} def iterparse(some_file): """ Incrementally parse an XML file without first loading the entire file into memory. """ for (event, element) in lxml.etree.iterparse(some_file): if element.tag == "{http://www.mediawiki.org/xml/export-0.6/}page": # get the title of the page title = element.xpath(".//mw:title", namespaces={"mw": "http://www.mediawiki.org/xml/export-0.6/"})[0].text # page id is useful page_id = element.xpath(".//mw:id", namespaces=namespaces)[0].text page_id = page_id.zfill(8) try: print "page: ", title except Exception as exception: print "page: (id) ", page_id for revision in element.xpath(".//mw:revision", namespaces=namespaces): revision_id = revision.xpath(".//mw:id", namespaces=namespaces)[0].text revision_id = revision_id.zfill(8) revision_titles[revision_id] = title print ".. has revision: ", revision_id #output = lxml.etree.tostring(revision) #with open("revisions/{id}.xml".format(id=revision_id), "w") as file_handler: # file_handler.write(output) #for contributor in revision.xpath(".//mw:contributor/mw:username", namespaces=namespaces): # username = contributor.text # usernames.add(username) element.clear() del element # done with the experiment, quit now #sys.exit() def dump_revision_titles(): """ Store the revision title data in a separate file. """ with open("./revision_titles.yaml", "w") as output: output.write(yaml.dump(revision_titles)) if __name__ == "__main__": try: iterparse(open("./p2pfoundationnet-20140403-history.xml", "r")) except Exception as exc: pass dump_revision_titles()