#author: Bryan Bishop
#date: 2012-03-16
"""Scraper for the category and protocol listings on protocol-online.org.

Running this module as a script builds a ``ProtocolOnlineScraper`` and
calls ``run()``, which fetches the overview page and constructs a
``Category`` for every matching link. Each ``Category`` fetches its own
page (and those of its sub-categories) as soon as it is constructed.
"""

import os
import json
import requests #from python-requests.org
from BeautifulSoup import BeautifulSoup


def _first_child_string(tag):
    """Return ``tag.first().string``, or None when ``first()`` finds nothing.

    The callers fall back to this when the anchor itself has no ``string``.
    Without the None check an anchor that contains no nested tag would
    raise AttributeError and abort the whole scrape.
    """
    child = tag.first()
    if child is None:
        return None
    return child.string


class Protocol():
    """A single protocol entry taken from a category page."""

    def __init__(self, url=None, name=None, attribution=None,
                 description=None, cache_url=None):
        """Store the given fields and derive ``jump_id`` from ``url``.

        All arguments are optional and default to None.
        """
        self.url = url
        self.name = name
        self.attribution = attribution
        self.description = description
        self.cache_url = cache_url
        self.setup_jump_id()

    def setup_jump_id(self):
        """Set ``self.jump_id`` to the text following ``?ID=`` in the url.

        ``jump_id`` is None when the url is missing/empty or does not
        contain ``?ID=``.
        """
        # url defaults to None in __init__; a bare "in" test on None
        # would raise TypeError, so check for a usable url first.
        if self.url and "?ID=" in self.url:
            self.jump_id = self.url.split("?ID=")[1]
        else:
            self.jump_id = None

    @staticmethod
    def extract_protocols(page):
        """Parse the protocols out of a page.

        ``page`` is a parsed document supporting ``findAll``. Every
        ``<li class="link">`` element yields one ``Protocol``:

        * url / name come from the first ``<a>`` in the element,
        * attribution / description come from the first and second
          ``<span class="des">`` (None when absent),
        * cache_url comes from the single ``<a class="smalllink"
          target="_blank">`` if there is one.

        Returns the list of ``Protocol`` objects.
        Raises ``Exception`` when an element has more than one cache link.
        """
        print("extract_protocols")
        protocols = []

        for section in page.findAll(name="li", attrs={"class": "link"}):
            anchor = section.find(name="a")
            url = anchor["href"]
            name = anchor.string or _first_child_string(anchor)

            # Reset per section: previously these were only assigned when
            # enough spans existed, so a section with fewer spans either
            # raised NameError or inherited the previous section's values.
            attribution = None
            description = None
            pieces = section.findAll(name="span", attrs={"class": "des"})
            if len(pieces) > 0:
                attribution = pieces[0].string
            if len(pieces) > 1:
                description = pieces[1].string

            cacheds = section.findAll(
                name="a",
                attrs={"class": "smalllink", "target": "_blank"},
            )
            if len(cacheds) > 1:
                raise Exception("there should only be one cache link")
            cache_url = cacheds[0]["href"] if cacheds else None

            protocols.append(Protocol(
                url=url,
                name=name,
                attribution=attribution,
                description=description,
                cache_url=cache_url,
            ))

        return protocols


class Category():
    """A category page together with its sub-categories and protocols.

    Constructing a ``Category`` immediately downloads its page and
    recursively constructs a ``Category`` for every sub-category listed
    on it.
    """

    def __init__(self, name=None, url=None, parent=None):
        """Record the fields, compute ``mangled_name`` and load the page.

        ``parent`` is the enclosing ``Category`` or None for a top-level
        one.
        """
        print("Category.__init__")
        self.parent = parent
        self.url = url
        self.name = name
        self.sub_categories = []
        self.protocols = []

        self.set_mangled_url_name()

        #now go through each subcategory and get protocols on each page
        self.get_sub_categories()

    def set_mangled_url_name(self):
        """Derive ``self.mangled_name`` from ``self.url`` and print it.

        Top-level categories (no parent) use everything after the first
        ``prot/``; nested ones use the second-to-last ``/``-separated
        piece of the url.
        """
        print("set_mangled_url_name")
        if self.parent is None:
            self.mangled_name = self.url.split("prot/")[1]
        else:
            self.mangled_name = self.url.split("/")[-2]
        print("mangled_name: " + self.mangled_name)

    def get(self, url):
        """Download ``url`` and return it parsed with BeautifulSoup."""
        print("get")
        r = requests.get(url)
        return BeautifulSoup(r.text, fromEncoding="utf8")

    def get_sub_categories(self):
        """Fetch this category's page and populate children and protocols.

        Every ``<li class="catli">`` becomes a child ``Category`` (which
        in turn loads its own page on construction). The protocols found
        on this page are appended to ``self.protocols``.

        Returns ``self.sub_categories``.
        """
        print("get_sub_categories")
        self.sub_categories = []
        page = self.get(self.url)

        for li in page.findAll(name="li", attrs={"class": "catli"}):
            # The link is looked up two levels down inside the list item.
            link = li.first().first()
            cat = Category(name=link.string, url=link["href"], parent=self)
            self.sub_categories.append(cat)

        # NOTE(review): assumed to run once per page, after the loop;
        # confirm against the original (un-collapsed) indentation.
        self.protocols.extend(Protocol.extract_protocols(page))

        return self.sub_categories


class ProtocolOnlineScraper():
    """Entry point that collects the categories linked from the overview."""

    # NOTE(review): neither get() method passes this to requests, so it
    # is not currently sent with any request.
    user_agent = (
        "Mozilla/5.0 "
        "(Macintosh; U; Intel Mac OS X 10_5_8; en-us) "
        "AppleWebKit/531.21.8 (KHTML, like Gecko) "
        "Version/4.0.4 Safari/5"
    )

    url = {
        "overview": "http://www.protocol-online.org/prot/",
    }

    def get(self, url):
        """Download ``url`` and return it parsed with BeautifulSoup."""
        print("get")
        r = requests.get(url)
        return BeautifulSoup(r.text, fromEncoding="utf8")

    def get_links(self, url):
        """Return ``[text, href]`` pairs for every ``<a href>`` at ``url``.

        Anchors without an ``href`` attribute are skipped. ``text`` is
        the anchor's own ``string`` or, failing that, the ``string`` of
        the first tag found inside it (possibly None).
        """
        print("get_links")
        links = []
        doc = self.get(url)

        for anchor in doc.findAll(name="a"):
            if not anchor.has_key("href"):
                continue

            text = anchor.string
            if text is None:
                text = _first_child_string(anchor)

            links.append([text, anchor["href"]])

        return links

    def get_categories(self):
        """Build ``self.categories`` from the overview page and return it.

        A link is treated as a category when its href contains exactly
        four ``/`` characters and includes ``/prot/``.
        """
        print("get_categories")
        self.categories = []

        for text, url in self.get_links(self.url["overview"]):
            if url.count("/") == 4 and "/prot/" in url:
                self.categories.append(Category(url=url, name=text))

        return self.categories

    def run(self):
        """Run the scrape (currently just ``get_categories``)."""
        print("run")
        self.get_categories()


if __name__ == "__main__":
    scraper = ProtocolOnlineScraper()
    scraper.run()