"""Get a list of all pages from the p2pfoundation wiki."""

import io

import requests
import lxml.etree

BASE_URL = "http://p2pfoundation.net/"
ALLPAGES_URL = BASE_URL + "Special:AllPages"


class ParseHTMLException(Exception):
    """Raised when parse_html is given an unsupported content type."""


def parse_html(content):
    """Parse HTML into an element tree using lxml.

    @param content: html content
    @type content: str, bytes, or a file-like object with ``read``
    @returns: parsed lxml element tree
    @raises ParseHTMLException: if content is none of the supported types
    """
    if isinstance(content, bytes):
        # e.g. requests.Response.content
        content = io.BytesIO(content)
    elif isinstance(content, str):
        content = io.StringIO(content)
    elif not hasattr(content, "read"):
        # duck-typed: any readable file-like object (io.StringIO, io.BytesIO, ...)
        raise ParseHTMLException(
            "parse_html must be given a type str, bytes or a file-like "
            "object instead of {}".format(type(content))
        )
    parser = lxml.etree.HTMLParser()
    tree = lxml.etree.parse(content, parser)
    return tree


def get_and_parse_page(url):
    """Request a page over HTTP and parse the HTML.

    @param url: url to fetch
    @returns: parsed lxml element tree
    @raises requests.HTTPError: on a 4xx/5xx response
    """
    response = requests.get(url)
    # fail loudly instead of silently parsing an error page
    response.raise_for_status()
    return parse_html(response.content)


def extract_link_elements(tree):
    """Return the HTML anchor elements found anywhere in the tree."""
    return tree.xpath("//a")


def extract_links(tree):
    """Yield the href target of every anchor that has one."""
    for element in extract_link_elements(tree):
        href = element.attrib.get("href")
        if href is not None:
            yield href


def get_allpages_index_links(url=ALLPAGES_URL):
    """Return links to the index pages that list the actual wiki pages.

    There are multiple paginated index pages; MediaWiki marks the
    pagination links with an ``&from`` query parameter.
    """
    tree = get_and_parse_page(url)
    # de-duplicate, then keep only the pagination links
    unique_links = set(extract_links(tree))
    return [link for link in unique_links if "&from" in link]


def get_links_to_all_pages(url=ALLPAGES_URL):
    """Return a set of links to all pages on the mediawiki target site.

    @param url: the Special:AllPages url to start from
    @returns: set of link hrefs (deduplicated, unordered)
    """
    links = []
    for indexlink in get_allpages_index_links(url):
        # index links may be site-relative; make them absolute
        if BASE_URL not in indexlink:
            indexlink = BASE_URL + indexlink
        tree = get_and_parse_page(indexlink)
        links.extend(extract_links(tree))
    # TODO: filter out navigation/chrome links, not just duplicates
    return set(links)


def main(url=ALLPAGES_URL):
    """Dump links from Special:AllPages to stdout, one per line."""
    for link in get_links_to_all_pages(url):
        print(link)


if __name__ == "__main__":
    main()