#author: Bryan Bishop <kanzure@gmail.com>
#date: 2012-03-16
import os
import json
import requests #from python-requests.org
from BeautifulSoup import BeautifulSoup

class Protocol():
    """One protocol entry taken from a category listing page.

    The constructor stores its arguments unchanged as attributes and then
    derives ``jump_id``: the text following "?ID=" in ``url``, or None when
    the url is missing or has no such marker.
    """
    def __init__(self, url=None, name=None, attribution=None, description=None, cache_url=None):
        self.url = url
        self.name = name
        self.attribution = attribution
        self.description = description
        self.cache_url = cache_url
        self.setup_jump_id()
    def setup_jump_id(self):
        """Set ``self.jump_id`` from the "?ID=" part of ``self.url``."""
        # url defaults to None in __init__; testing `in` against None would
        # raise TypeError, so a missing url simply means "no jump id".
        if self.url is not None and "?ID=" in self.url:
            self.jump_id = self.url.split("?ID=")[1]
        else:
            self.jump_id = None
    @staticmethod
    def extract_protocols(page):
        """Parse the protocol entries out of a parsed page.

        page -- object offering BeautifulSoup's findAll(); every
                <li class="link"> element found is treated as one protocol.

        Returns a list of Protocol objects in the order findAll() yields
        them. Raises Exception when one entry has more than one cache link.
        """
        print("extract_protocols")
        protocols = []
        for section in page.findAll(name="li", attrs={"class": "link"}):
            anchor = section.find(name="a")
            url = anchor["href"]
            # fall back to the anchor's first child when the anchor itself
            # has no direct string
            name = anchor.string or anchor.first().string
            # Reset for every entry: otherwise an entry without these spans
            # either raises UnboundLocalError (first entry) or silently
            # inherits the values of the previous entry.
            attribution = None
            description = None
            pieces = section.findAll(name="span", attrs={"class": "des"})
            if len(pieces) > 0:
                attribution = pieces[0].string
            if len(pieces) > 1:
                description = pieces[1].string
            cacheds = section.findAll(name="a", attrs={"class": "smalllink", "target": "_blank"})
            if len(cacheds) > 1:
                raise Exception("there should only be one cache link")
            cache_url = None
            if len(cacheds) == 1:
                cache_url = cacheds[0]["href"]
            protocols.append(Protocol(url=url, name=name, attribution=attribution, description=description, cache_url=cache_url))
        return protocols

class Category():
    """A category page with its sub-categories and listed protocols.

    Constructing a Category immediately downloads ``url`` and builds a
    Category for every sub-category link found on it, so a single
    construction walks the whole tree below ``url``.
    """
    def __init__(self, name=None, url=None, parent=None):
        print("Category.__init__")
        self.parent = parent
        self.url = url
        self.name = name
        self.sub_categories = []
        self.protocols = []
        self.set_mangled_url_name()
        #now go through each subcategory and get protocols on each page
        self.get_sub_categories()
    def set_mangled_url_name(self):
        """Set ``self.mangled_name`` from ``self.url``.

        Without a parent the name is everything after the first "prot/" in
        the url; with a parent it is the second-to-last "/"-separated piece.
        """
        print("set_mangled_url_name")
        if self.parent is None:
            self.mangled_name = self.url.split("prot/")[1]
        else:
            self.mangled_name = self.url.split("/")[-2]
        print("mangled_name: " + self.mangled_name)
    def get(self, url):
        """Download ``url`` and return the response text parsed by BeautifulSoup."""
        print("get")
        response = requests.get(url)
        return BeautifulSoup(response.text, fromEncoding="utf8")
    def get_sub_categories(self):
        """Fetch this category's page and rebuild its children.

        Every <li class="catli"> element becomes a child Category (whose
        constructor in turn fetches its own page), and the protocols listed
        on this page are stored in ``self.protocols``.

        Returns the list of child Category objects.
        """
        print("get_sub_categories")
        self.sub_categories = []
        page = self.get(self.url)
        for item in page.findAll(name="li", attrs={"class": "catli"}):
            link = item.first().first()
            child = Category(name=link.string, url=link["href"], parent=self)
            self.sub_categories.append(child)
        # Replace rather than extend: sub_categories is rebuilt on every
        # call, so extending here would duplicate protocols on a re-run.
        self.protocols = Protocol.extract_protocols(page)
        return self.sub_categories

class ProtocolOnlineScraper():
    """Entry point that discovers the top-level categories of the site.

    ``run()`` fetches the overview page and builds a Category for every
    matching link; each Category then crawls its own subtree when it is
    constructed.
    """
    # sent as the User-Agent header by get()
    user_agent = "Mozilla/5.0 " \
               + "(Macintosh; U; Intel Mac OS X 10_5_8; en-us) " \
               + "AppleWebKit/531.21.8 (KHTML, like Gecko) " \
               + "Version/4.0.4 Safari/5"
    url = {
        "overview": "http://www.protocol-online.org/prot/",
    }
    def get(self, url):
        """Download ``url`` and return the response text parsed by BeautifulSoup."""
        print("get")
        # user_agent was previously defined but never sent with the request
        response = requests.get(url, headers={"User-Agent": self.user_agent})
        return BeautifulSoup(response.text, fromEncoding="utf8")
    def get_links(self, url):
        """Return ``[text, href]`` pairs for every <a> with an href on ``url``.

        ``text`` is the anchor's own string, otherwise the string of its
        first child, otherwise None. Anchors without an href are skipped.
        """
        print("get_links")
        links = []
        doc = self.get(url)
        for anchor in doc.findAll(name="a"):
            if not anchor.has_key("href"):
                continue
            text = anchor.string
            if text is None:
                # an anchor with no children has no first child either;
                # record it with None as text instead of crashing
                child = anchor.first()
                text = child.string if child is not None else None
            links.append([text, anchor["href"]])
        return links
    def get_categories(self):
        """Build ``self.categories`` from the overview page and return it.

        A link counts as a category when its url contains "/prot/" and
        exactly four "/" characters.
        """
        print("get_categories")
        self.categories = []
        for text, url in self.get_links(self.url["overview"]):
            if url.count("/") == 4 and "/prot/" in url:
                self.categories.append(Category(url=url, name=text))
        return self.categories
    def run(self):
        """Collect all categories; results are left in ``self.categories``."""
        print("run")
        self.get_categories()

# Script entry point: build the scraper and fetch the category tree,
# starting from the overview url configured on ProtocolOnlineScraper.
if __name__ == "__main__":
    scraper = ProtocolOnlineScraper()
    scraper.run()