"""
Convert from MediaWiki XML to git commits using a pull parser. A pull parser
is used because loading the entire XML file at once would probably require too
much memory (not to mention the memory needed to store a parsed version of the
XML file).
"""

import sys
from io import BytesIO

import lxml.etree
import yaml

# XML namespace of the MediaWiki export schema (version 0.6).
_MEDIAWIKI_EXPORT_NS = "http://www.mediawiki.org/xml/export-0.6/"

# Prefix map handed to every xpath() call in this script.
namespaces = {"mw": _MEDIAWIKI_EXPORT_NS}

# Contributor usernames; no active code shown in this script adds to it.
usernames = set()

# Zero-padded revision id -> title of the page that revision belongs to.
revision_titles = {}

def iterparse(some_file):
    """
    Incrementally parse a MediaWiki XML export without first loading the
    entire file into memory.

    For every <page> element the page title is printed (or the zero-padded
    page id when the title cannot be encoded for stdout), and the zero-padded
    id of each of its revisions is printed and mapped to the page title in
    the module-level ``revision_titles`` dict.

    :param some_file: filename or file-like object, handed unchanged to
        ``lxml.etree.iterparse``.
    :raises IndexError: if a page has no title/id child, or a revision has no
        id child. Errors raised by the parser itself propagate unchanged.
    """

    page_tag = "{%s}page" % namespaces["mw"]

    # Only "end" events of <page> elements are requested, so the parser skips
    # everything else internally and each element seen here is a complete page.
    pages = lxml.etree.iterparse(some_file, events=("end",), tag=page_tag)

    for _event, page in pages:
        # Direct-child paths: a descendant search would also collect the ids
        # nested inside every revision and contributor of the page.
        title = page.xpath("mw:title", namespaces=namespaces)[0].text
        page_id = page.xpath("mw:id", namespaces=namespaces)[0].text.zfill(8)

        # A single pre-formatted argument prints the same text whether print
        # is the Python 2 statement or the Python 3 function.
        try:
            print("page:  %s" % (title,))
        except UnicodeError:
            # the title cannot be encoded for stdout; identify the page by id
            print("page: (id)  %s" % (page_id,))

        for revision in page.xpath("mw:revision", namespaces=namespaces):
            revision_id = revision.xpath("mw:id", namespaces=namespaces)[0].text
            revision_id = revision_id.zfill(8)

            revision_titles[revision_id] = title

            print(".. has revision:  %s" % (revision_id,))

        # Drop the page's content, then detach every earlier sibling from the
        # parent. Cleared elements that stay attached to the root would still
        # accumulate over the whole dump.
        page.clear()
        parent = page.getparent()
        if parent is not None:
            while page.getprevious() is not None:
                del parent[0]

def dump_revision_titles():
    """
    Serialize the module-level ``revision_titles`` mapping to YAML and write
    it to ./revision_titles.yaml, overwriting any previous content.
    """

    with open("./revision_titles.yaml", "w") as yaml_file:
        serialized = yaml.dump(revision_titles)
        yaml_file.write(serialized)

if __name__ == "__main__":
    try:
        # Binary mode: the XML parser decodes the bytes itself. The with-block
        # closes the dump even when parsing fails.
        with open("./p2pfoundationnet-20140403-history.xml", "rb") as dump_file:
            iterparse(dump_file)
    except Exception as exc:
        # Best effort: whatever was collected before the failure is still
        # written below, but the failure is reported instead of being hidden.
        sys.stderr.write("parsing stopped early: %r\n" % (exc,))

    dump_revision_titles()