From 8a45f872aa57c30d08022da498ec6dbdac082f01 Mon Sep 17 00:00:00 2001
From: kanzure <kanzure@gmail.com>
Date: Fri, 24 Jul 2009 21:26:39 -0500
Subject: cleaned up some comments, prints out yaml, accepts an argument (query
 string)

---
 pyscholar.py | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/pyscholar.py b/pyscholar.py
index 43edb28..c118468 100644
--- a/pyscholar.py
+++ b/pyscholar.py
@@ -2,11 +2,14 @@
 
 import copy
 import re
+import sys
 import yaml
 import httplib
 import urllib
 from BeautifulSoup import BeautifulSoup
 
+if len(sys.argv) < 2: sys.exit("usage: pyscholar.py QUERY")  # argv[0] is always the script name, so a missing query means len == 1
+
 class Paper(yaml.YAMLObject):
     yaml_tag='!paper'
     def __init__(self, title="paper title goes here", journal_href="http://google.com/", potential_PDF_link="", cites_link="", cites_link_name="", diff_versions_link="", diff_versions_link_name="", related_papers_link="", view_as_html_link="", authors=[], publication="", pub_year="0001", server=""):
@@ -18,23 +21,23 @@ class Paper(yaml.YAMLObject):
 
 SEARCH_HOST = "scholar.google.com"
 SEARCH_BASE_URL = "/scholar"
-terms = ["PDMS"]
-limit = 10 #100
+terms = [sys.argv[1]]
+limit = 100 #10
 
 params = urllib.urlencode({'q': "+".join(terms), 'num':limit})
 headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US    ; rv:1.9.0.9) Gecko/2009050519 Iceweasel/3.0.6 (Debian-3.0.6-1)'}
 url = SEARCH_BASE_URL + "?" + params
 
-#conn = httplib.HTTPConnection(SEARCH_HOST)
-#conn.request("GET", url, {}, headers)
-#resp = conn.getresponse()
-status = 200#resp.status
+conn = httplib.HTTPConnection(SEARCH_HOST)
+conn.request("GET", url, {}, headers)
+resp = conn.getresponse()
+status = resp.status #200
 papers = []
 if status==200:
-    #html = resp.read()
-    file2 = open("scholar.htm","r")
-    html = file2.read()
-    file2.close()
+    html = resp.read()
+    #file2 = open("scholar.htm","r")
+    #html = file2.read()
+    #file2.close()
     results = []
     html = html.decode('ascii', 'ignore')
     soup = BeautifulSoup(html)
@@ -64,7 +67,6 @@ if status==200:
                     link_title = stuff.renderContents()
                     if not link_title == "BL Direct": #then it might be useful.
                         if href[-3:] == "pdf":
-                            #TODO: add to record object as potential PDF link
                             potential_PDF_link = href
                             pass
                         else:
@@ -76,19 +78,17 @@ if status==200:
                         #ISI Web of Knowledge integration? Nah.
                         cites_link = href
                         cites_link_name = stuff.renderContents()
-                        #TODO: add to record object
                     elif href.count("cluster") > 0: #All x versions
                         cluster_link = href
                         cluster_link_name = stuff.renderContents()
-                        #TODO: parse number of papers that have cited this paper
-                        #TODO: add to record object
                     elif href.count("related") > 0: #related articles
                         related_link = href
                         related_link_name = stuff.renderContents()
-                        #TODO: add to record object
                     elif stuff.renderContents() == "View as HTML":
                         view_as_html_link = href
-                        #TODO: add to record object
+                    elif stuff.renderContents() == "Cached":
+                        pass
+                        #TODO: add the "Cached" link to the record object
                     else: 
                         print "ERROR: was not a citation nor a 'see all versions' nor a related-papers link"
                         print "title was = ", stuff.renderContents()
-- 
cgit v1.2.3