other-code/google_scholar_proxy.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90

#!/usr/bin/env python
import re, sys, urllib, urllib2, cookielib 
from BeautifulSoup import BeautifulSoup, Tag
from urllib2 import HTTPError
import BaseHTTPServer

# Shared HTTP opener for the script: cookies are kept in an in-memory jar,
# and a browser-like User-agent header is attached to the opener.
cj = cookielib.CookieJar()
cookie_handler = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_handler)
opener.addheaders = [('User-agent', 'Mozilla/5.0')]

# Mode switch: 0 = normal proxy mode (read a URL from stdin and fetch it);
# any other value = read raw HTML from stdin instead of fetching.
testing = 0

if testing == 0:
	# The first line of stdin is the URL to fetch; blank means the Scholar
	# front page.  Example queries:
	#   http://scholar.google.com/scholar?hl=en&lr=&q=author%3A%22FG+GALLAGHER%22&btnG=Search
	#   http://scholar.google.com/scholar?hl=en&lr=&q=author%3A%22a+einstein%22+-pdf&btnG=Search
	#   http://scholar.google.com/scholar?hl=en&lr=&q=author%3A%22feynman%22+-pdf&btnG=Search
	#   http://scholar.google.com/scholar?hl=en&lr=&q=drosophila+-pdf&btnG=Search
	url = sys.stdin.readline().strip()
	if url == "":
		url = "http://scholar.google.com/scholar"
	# NOTE(review): the URL is opened exactly as received.  The default
	# urllib2 opener also handles non-HTTP schemes (e.g. file:), so the
	# scheme/host should be validated if stdin can carry untrusted input.
	try:
		f = opener.open(url)
		try:
			data = f.read()
		finally:
			# Release the connection even if the read fails.
			f.close()
	except HTTPError as e:
		code = e.code
		# .get() returns None for a status code missing from the table, so
		# the "error (2)" fallback below is actually reachable (a plain []
		# lookup raised KeyError before that branch could ever run).
		msg = BaseHTTPServer.BaseHTTPRequestHandler.responses.get(code)
		if msg:
			# msg is a (short reason, long explanation) pair.
			print("Google Scholar error: %s %s (%s)" % (code, msg[0], msg[1]))
		else:
			print("Google Scholar error (2):  %s" % code)
		# Exit status is kept at 0, as before; the message on stdout is the
		# only error signal.  NOTE(review): confirm the caller relies on this.
		sys.exit(0)
	# Only HTTPError is handled here; any other failure (e.g. a URLError from
	# an unreachable host) still propagates as an uncaught exception.
else:
	data = sys.stdin.read()

# Parse the page into a tree that the rest of the script edits in place.
soup = BeautifulSoup(data)

# Make sure all root-relative image URLs point to Google.
for img in soup.findAll("img"):
	src = img.get('src')
	# Only root-relative paths ("/path/x.gif") need the host prepended.
	# Protocol-relative URLs ("//host/x.gif") already name their own host;
	# prefixing them would produce a broken "http://scholar.google.com//host/..."
	# address, so they are left untouched.
	if src and src.startswith('/') and not src.startswith('//'):
		img['src'] = "http://scholar.google.com" + src

#
# Might be more robust to trawl ALL <A HREF=".."> (as class="w" might
# break) and replace those that start with absolute URL "http://"
# (filtering out any matching http://xxx.google.xxx/, just to be sure!)
#
# Candidate entries are every <p class="g">; the link that gets decorated is
# the first <a> inside that entry's <span class="w">.
items = soup.findAll("p", { "class" : "g" })

for item in items:
	wspan = item.find("span", {"class" : "w"})
	# Hmm, this should never happen, but it does!
	if not wspan:
		continue
	a = wspan.find('a')
	if not a:
		continue
	# .get() rather than a['href']: an <a> without an href attribute would
	# otherwise raise KeyError and abort processing of the whole page.
	href = a.get('href')
	if not href:
		continue

	# Build <a href="/posturl?url=..."><img .../></a> pointing at the
	# site-relative /posturl handler, with the target URL percent-encoded.
	cul = Tag(soup, "a")
	# Python 2's urllib.quote() raises KeyError on non-ASCII unicode input,
	# so encode to UTF-8 bytes first (assumes the parser yields unicode
	# attribute values -- TODO confirm).
	cul['href'] = "/posturl?url=" + urllib.quote(href.encode('utf-8'))
	img = Tag(soup, "img")
	img['src'] = "http://static.citeulike.org/favicon.gif"
	img['style'] = "border:0"
	cul.insert(0, img)
	# Append after the span's existing children; using the real length
	# instead of a fixed 99 keeps this correct for spans of any size.
	wspan.insert(len(wspan.contents), cul)
#	print wspan.prettify()

# Write the rewritten page to stdout: pretty-printed when testing,
# otherwise serialised as-is.
if testing != 0:
	print(soup.prettify())
else:
	print(soup)