diff options
author | kanzure <kanzure@gmail.com> | 2009-07-24 21:26:39 -0500 |
---|---|---|
committer | kanzure <kanzure@gmail.com> | 2009-07-24 21:26:39 -0500 |
commit | 8a45f872aa57c30d08022da498ec6dbdac082f01 (patch) | |
tree | ca3e2cc1f68f3220af9ea2c5b386774c5843a7d6 | |
parent | c72303b99a64afc22a6d39e28352e99e69fe93ea (diff) | |
download | pyscholar-8a45f872aa57c30d08022da498ec6dbdac082f01.tar.gz pyscholar-8a45f872aa57c30d08022da498ec6dbdac082f01.zip |
cleaned up some comments, prints out yaml, accepts an argument (query string)
-rw-r--r-- | pyscholar.py | 32 |
1 file changed, 16 insertions, 16 deletions
diff --git a/pyscholar.py b/pyscholar.py index 43edb28..c118468 100644 --- a/pyscholar.py +++ b/pyscholar.py @@ -2,11 +2,14 @@ import copy import re +import sys import yaml import httplib import urllib from BeautifulSoup import BeautifulSoup +if len(sys.argv) == 0: exit() + class Paper(yaml.YAMLObject): yaml_tag='!paper' def __init__(self, title="paper title goes here", journal_href="http://google.com/", potential_PDF_link="", cites_link="", cites_link_name="", diff_versions_link="", diff_versions_link_name="", related_papers_link="", view_as_html_link="", authors=[], publication="", pub_year="0001", server=""): @@ -18,23 +21,23 @@ class Paper(yaml.YAMLObject): SEARCH_HOST = "scholar.google.com" SEARCH_BASE_URL = "/scholar" -terms = ["PDMS"] -limit = 10 #100 +terms = [sys.argv[1]] +limit = 100 #10 params = urllib.urlencode({'q': "+".join(terms), 'num':limit}) headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US ; rv:1.9.0.9) Gecko/2009050519 Iceweasel/3.0.6 (Debian-3.0.6-1)'} url = SEARCH_BASE_URL + "?" + params -#conn = httplib.HTTPConnection(SEARCH_HOST) -#conn.request("GET", url, {}, headers) -#resp = conn.getresponse() -status = 200#resp.status +conn = httplib.HTTPConnection(SEARCH_HOST) +conn.request("GET", url, {}, headers) +resp = conn.getresponse() +status = resp.status #200 papers = [] if status==200: - #html = resp.read() - file2 = open("scholar.htm","r") - html = file2.read() - file2.close() + html = resp.read() + #file2 = open("scholar.htm","r") + #html = file2.read() + #file2.close() results = [] html = html.decode('ascii', 'ignore') soup = BeautifulSoup(html) @@ -64,7 +67,6 @@ if status==200: link_title = stuff.renderContents() if not link_title == "BL Direct": #then it might be useful. if href[-3:] == "pdf": - #TODO: add to record object as potential PDF link potential_PDF_link = href pass else: @@ -76,19 +78,17 @@ if status==200: #ISI Web of Knowledge integration? Nah. 
cites_link = href cites_link_name = stuff.renderContents() - #TODO: add to record object elif href.count("cluster") > 0: #All x versions cluster_link = href cluster_link_name = stuff.renderContents() - #TODO: parse number of papers that have cited this paper - #TODO: add to record object elif href.count("related") > 0: #related articles related_link = href related_link_name = stuff.renderContents() - #TODO: add to record object elif stuff.renderContents() == "View as HTML": view_as_html_link = href - #TODO: add to record object + elif stuff.renderContents() == "Cached": + pass + #TODO: add to record or object else: print "ERROR: was not a citation nor a 'see all versions' nor a related-papers link" print "title was = ", stuff.renderContents() |