#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import sys
import urllib
import httplib

import yaml
from BeautifulSoup import BeautifulSoup

# sys.argv always contains the script name, so the original "len(sys.argv) == 0"
# check could never fire; require an actual search term.
if len(sys.argv) < 2:
    sys.exit("usage: %s <search term>" % sys.argv[0])

class Paper(yaml.YAMLObject):
    yaml_tag = '!paper'

    def __init__(self, title="paper title goes here", journal_href="http://google.com/",
                 potential_PDF_link="", cites_link="", cites_link_name="",
                 diff_versions_link="", diff_versions_link_name="",
                 related_papers_link="", view_as_html_link="", authors=None,
                 publication="", pub_year="0001", server=""):
        self.title = title
        self.journal_href = journal_href
        self.potential_PDF_link = potential_PDF_link
        self.cites_link = cites_link
        self.cites_link_name = cites_link_name
        self.diff_versions_link = diff_versions_link
        self.diff_versions_link_name = diff_versions_link_name
        self.related_papers_link = related_papers_link
        self.view_as_html_link = view_as_html_link
        self.authors = authors if authors is not None else []  # avoid a shared mutable default
        self.publication = publication
        self.pub_year = pub_year
        self.server = server

    def __repr__(self):
        return ("Paper(title=%r, journal_href=%r, potential_PDF_link=%r, cites_link=%r, "
                "cites_link_name=%r, diff_versions_link=%r, diff_versions_link_name=%r, "
                "related_papers_link=%r, view_as_html_link=%r, authors=%r, publication=%r, "
                "pub_year=%r, server=%r)"
                % (self.title, self.journal_href, self.potential_PDF_link, self.cites_link,
                   self.cites_link_name, self.diff_versions_link, self.diff_versions_link_name,
                   self.related_papers_link, self.view_as_html_link, self.authors,
                   self.publication, self.pub_year, self.server))

SEARCH_HOST = "scholar.google.com"
SEARCH_BASE_URL = "/scholar"
terms = [sys.argv[1]]
limit = 100  # number of results to request per page (Scholar's default is 10)
params = urllib.urlencode({'q': "+".join(terms), 'num':limit})
headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US ; rv:1.9.0.9) Gecko/2009050519 Iceweasel/3.0.6 (Debian-3.0.6-1)'}
url = SEARCH_BASE_URL + "?" + params
conn = httplib.HTTPConnection(SEARCH_HOST)
conn.request("GET", url, None, headers)  # a GET has no body, so pass None rather than {}
resp = conn.getresponse()
status = resp.status #200
papers = []
if status == 200:
    html = resp.read()
    # For offline testing, read a saved results page instead:
    #   html = open("scholar.htm").read()
    html = html.decode('ascii', 'ignore')  # crude: silently drops any non-ASCII characters
    soup = BeautifulSoup(html)
    for record in soup('p'):
        # Reset per-result fields so values from a previous record cannot leak
        # into the next Paper.
        paper_title = ""
        journal_href = ""  # link to something like sciencedirect
        potential_PDF_link = ""
        cites_link = ""
        cites_link_name = ""
        cluster_link = ""
        cluster_link_name = ""
        related_link = ""
        related_link_name = ""
        view_as_html_link = ""
        authors = []
        publication = ""
        pub_year = ""
        server = ""
        title_tag = record.find(name=re.compile("h3"))
        form_tag = record.find(name=re.compile("form"))
        if title_tag and form_tag:  # only process paragraphs that look like result entries
            for stuff in record('a'):
                attrs = stuff._getAttrMap()
                if "onmousedown" in attrs and "class" not in attrs:
                    # the main result link: title plus journal/publisher URL
                    paper_title = stuff.renderContents()
                    journal_href = attrs["href"]
                elif "onmousedown" in attrs:  # but it has a class
                    href = attrs["href"]
                    link_title = stuff.renderContents()
                    if link_title != "BL Direct":  # then it might be useful
                        if href[-3:] == "pdf":
                            potential_PDF_link = href
                        else:
                            print "ERROR: what do I want to do with this? href = ", href
                else:  # no onmousedown and no class: cites, cluster, related, ...
                    href = attrs["href"]
                    if href.count("cites") > 0:  # "Cited by N"
                        # ISI Web of Knowledge integration? Nah.
                        cites_link = href
                        cites_link_name = stuff.renderContents()
                    elif href.count("cluster") > 0:  # "All N versions"
                        cluster_link = href
                        cluster_link_name = stuff.renderContents()
                    elif href.count("related") > 0:  # related articles
                        related_link = href
                        related_link_name = stuff.renderContents()
                    elif stuff.renderContents() == "View as HTML":
                        view_as_html_link = href
                    elif stuff.renderContents() == "Cached":
                        pass  # TODO: add to record or object
                    else:
                        print "ERROR: was not a citation nor a 'see all versions' nor a related-papers link"
                        print "title was = ", stuff.renderContents()
            # grab the description line: "authors - publication, year - server"
            for more in record('font', {'size': '-1'}):
                # grab authors
                for jiggie in more('span', {'class': 'a'}):
                    content = jiggie.renderContents()
                    content_array = content.split(" - ")
                    author_data = content_array[0]
                    # strip bold tags from author_data by keeping only the text nodes
                    author_data = ''.join(BeautifulSoup(author_data).findAll(text=True))
                    author_data = author_data.replace("…", "")  # drop ellipses
                    authors = author_data.split(",")
                    if len(content_array) > 1:
                        if len(content_array[1]) > 6:
                            publication = content_array[1][:-6]
                        # last four characters should be the year
                        pub_year = content_array[1][-4:]
                    if len(content_array) > 2:
                        server = content_array[2]
            diff_versions_link = cluster_link
            diff_versions_link_name = cluster_link_name
            my_paper = Paper(title=paper_title, journal_href=journal_href,
                             potential_PDF_link=potential_PDF_link,
                             cites_link=cites_link, cites_link_name=cites_link_name,
                             diff_versions_link=diff_versions_link,
                             diff_versions_link_name=diff_versions_link_name,
                             related_papers_link=related_link,
                             view_as_html_link=view_as_html_link,
                             authors=authors, publication=publication,
                             pub_year=pub_year, server=server)
            papers.append(my_paper)
print yaml.dump(papers)
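
The script prints a YAML list of !paper objects, so a later process can load the results back as Paper instances. Below is a minimal sketch of such a consumer, assuming the scraper above is saved as scholar_scrape.py with its module-level code behind an "if __name__ == '__main__':" guard, and that its output was redirected to scholar_results.yaml; both names are hypothetical.

#!/usr/bin/python
# Sketch of a consumer for the YAML emitted above (filenames are placeholders).
import yaml

from scholar_scrape import Paper  # importing Paper registers the !paper tag with PyYAML

papers = yaml.load(open("scholar_results.yaml"))  # list of Paper instances
for paper in papers:
    print paper.pub_year, "-", paper.title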