"""Get a list of all pages from the p2pfoundation wiki."""

import io

import requests
import lxml.etree

BASE_URL = "http://p2pfoundation.net/"
ALLPAGES_URL = BASE_URL + "Special:AllPages"


class ParseHTMLException(Exception):
    """Raised when parse_html is given an unsupported content type."""


def parse_html(content):
    """Parse HTML into an element tree using lxml.

    @param content: html content
    @type content: str, bytes, or a file-like object with ``read``
    @returns: parsed lxml element tree
    @raises ParseHTMLException: if content is none of the supported types
    """
    if isinstance(content, bytes):
        # e.g. requests.Response.content
        content = io.BytesIO(content)
    elif isinstance(content, str):
        content = io.StringIO(content)
    elif not hasattr(content, "read"):
        # duck-typed: any readable file-like object (io.StringIO, io.BytesIO, ...)
        raise ParseHTMLException(
            "parse_html must be given a type str, bytes or a file-like "
            "object instead of {}".format(type(content))
        )
    parser = lxml.etree.HTMLParser()
    tree = lxml.etree.parse(content, parser)
    return tree


def get_and_parse_page(url):
    """Request a page over HTTP and parse the HTML.

    @param url: url to fetch
    @returns: parsed lxml element tree
    @raises requests.HTTPError: on a 4xx/5xx response
    """
    response = requests.get(url)
    # fail loudly instead of silently parsing an error page
    response.raise_for_status()
    return parse_html(response.content)


def extract_link_elements(tree):
    """Return the HTML anchor elements found anywhere in the tree."""
    return tree.xpath("//a")


def extract_links(tree):
    """Yield the href target of every anchor that has one."""
    for element in extract_link_elements(tree):
        href = element.attrib.get("href")
        if href is not None:
            yield href


def get_allpages_index_links(url=ALLPAGES_URL):
    """Return links to the index pages that list the actual wiki pages.

    There are multiple paginated index pages; MediaWiki marks the
    pagination links with an ``&from`` query parameter.
    """
    tree = get_and_parse_page(url)
    # de-duplicate, then keep only the pagination links
    unique_links = set(extract_links(tree))
    return [link for link in unique_links if "&from" in link]


def get_links_to_all_pages(url=ALLPAGES_URL):
    """Return a set of links to all pages on the mediawiki target site.

    @param url: the Special:AllPages url to start from
    @returns: set of link hrefs (deduplicated, unordered)
    """
    links = []
    for indexlink in get_allpages_index_links(url):
        # index links may be site-relative; make them absolute
        if BASE_URL not in indexlink:
            indexlink = BASE_URL + indexlink
        tree = get_and_parse_page(indexlink)
        links.extend(extract_links(tree))
    # TODO: filter out navigation/chrome links, not just duplicates
    return set(links)


def main(url=ALLPAGES_URL):
    """Dump links from Special:AllPages to stdout, one per line."""
    for link in get_links_to_all_pages(url):
        print(link)


if __name__ == "__main__":
    main()