"""
scHolar index
http://insitu.lri.fr/~roussel/moulinette/h/h.cgi

The following Python code makes it possible to compute H indices by
using Beautiful Soup to process Google Scholar results. This code is in
the public domain.

For more information, see:
    http://en.wikipedia.org/wiki/Hirsch_number
    http://scholar.google.com/
    http://www.crummy.com/software/BeautifulSoup/

Comments and questions should be sent to Nicolas Roussel
(roussel@lri.fr).
"""

import os
import sys
import socket
import re
import string

import BeautifulSoup


def debugInfo(text):
    """Print text on standard output and flush it right away."""
    print(text)
    sys.stdout.flush()


# ------------------------------------------------------------------------------

class GoogleScholarReference:
    """One search result, used as a plain attribute bag.

    _parseResults() sets the attributes title, info, citedby, year and
    gid on each instance it creates.
    """

    def getURL(self):
        """Return the "cluster" URL built from self.gid, or None if gid is None."""
        if self.gid is None:
            return None
        return "http://scholar.google.com/scholar?cluster=%s" % self.gid

    def getCitedByURL(self):
        """Return the "cites" URL built from self.gid, or None if gid is None."""
        if self.gid is None:
            return None
        return "http://scholar.google.com/scholar?cites=%s" % self.gid

    def get(self, key, defval=None):
        """Return the instance attribute named key, or defval if it is not set.

        Only attributes stored on the instance are considered; methods and
        class attributes are not.
        """
        return self.__dict__.get(key, defval)


_citedby = re.compile(r'.*Cited by (\d+).*')
_pubyear = re.compile(r'.*\D(\d\d\d\d)\D.*')
_id_cites = re.compile(r'.*cites=(\d+).*')
_id_cluster = re.compile(r'.*cluster=(\d+).*')


def _deHTML(line, group):
    """Return line with HTML tags removed and surrounding whitespace stripped.

    When group is true, everything from "- all " onwards is dropped as well.
    """
    # NOTE(review): the reviewed copy replaced a plain space with a space
    # (a no-op); the entity below is presumed to be what an HTML renderer
    # decoded away - confirm against upstream.
    text = line.replace("&nbsp;", " ")
    text = re.sub("<[^<]+>", "", text)
    if group:
        text = re.sub("- all .+", "", text)
    return text.strip()


def _splitDescription(reference):
    """Return the pieces of one result node, split at its line breaks.

    reference is converted with unicode(); newlines are flattened to
    spaces, paragraph/font markup and bracketed text are removed, and the
    remainder is split into a list of strings.
    """
    text = unicode(reference).replace("\n", " ")
    # NOTE(review): the tag names in the patterns below were eaten by an
    # HTML tag stripper in the reviewed copy (only "]*>", an empty pattern
    # and raw line breaks were left).  <p>, <font> and <br> are the
    # presumed originals - confirm against upstream.
    text = re.sub("<p[^>]*>", "", text)
    text = re.sub("</p>", "", text)
    text = re.sub("<font[^>]*>", "", text)
    text = re.sub("</font>", "", text)
    text = re.sub(r"\[.+\]", "", text)
    # Accept "<br>", "<br/>" and "<br />" so the split does not depend on
    # how the parser serialises empty elements.
    return re.split(r"<br\s*/?>", text)


def _tryExp(text, exp, defval):
    """Return group 1 of exp matched against text, or defval without a match."""
    m = exp.match(text)
    if m:
        return m.group(1)
    return defval


def _parseResults(soup):
    """Return a list of GoogleScholarReference built from a parsed page.

    Starts at the first <p class="g"> element and walks its following
    siblings.  Any sibling that cannot be interpreted as a result is
    skipped silently.
    """
    result = []
    reference = soup.first('p', {'class': 'g'})
    while reference is not None:
        try:
            r = GoogleScholarReference()
            d = _splitDescription(reference)
            r.title = _deHTML(d[0], True)
            r.info = _deHTML(d[1], False)
            r.citedby = int(_tryExp(d[-1], _citedby, 0))
            r.year = int(_tryExp(d[1], _pubyear, 0))
            r.gid = _tryExp(d[-1], _id_cites, None)
            if not r.gid:
                r.gid = _tryExp(d[-1], _id_cluster, None)
            result.append(r)
        except Exception:
            # Deliberate best effort: siblings that do not split into at
            # least two pieces are not results.  Unlike a bare "except:",
            # this lets KeyboardInterrupt and SystemExit through.
            pass
        reference = reference.nextSibling
    return result


# ------------------------------------------------------------------------------

_qServer, _qPort, _qTimeout = "scholar.google.com", 80, 5.0
_qTemplate = "/scholar?num=%d&"


def _fetchPage(q):
    """Send one GET request for path q and return the raw response.

    The returned string is everything read from the socket, HTTP headers
    included.  Returns None (after reporting through debugInfo) when the
    connection cannot be established.  The socket is always closed.
    """
    request = "\r\n".join([
        "GET %s HTTP/1.1" % q,
        "Host: %s" % _qServer,
        "Connection: close",  # since we don't parse the HTTP response
        "Referer: http://scholar.google.com/advanced_scholar_search?hl=en&lr=",
        "User-Agent: %s" % os.environ.get("HTTP_USER_AGENT", "Mozilla/5.0"),
        "",
        "",
    ])
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.settimeout(_qTimeout)
        try:
            err = sock.connect_ex((_qServer, _qPort))
        except socket.gaierror:
            debugInfo(u"unable to connect")
            return None
        if err != 0:
            debugInfo(u"connection timed out (%s seconds)..." % _qTimeout)
            return None
        # sendall() retries until the whole request is written; send() may
        # stop after a partial write.
        sock.sendall(request)
        print("query =  %s" % q)
        debugInfo("reading...")
        chunks = []
        while True:
            moredata = sock.recv(4096)
            if not moredata:
                break
            chunks.append(moredata)
    finally:
        sock.close()
    return "".join(chunks)


def _failureMessage(data, refs):
    """Return the message reported when a page yielded no result at all.

    data is the raw response of that page, refs the references collected
    from earlier pages.
    """
    location = None
    for line in data.split("\n"):
        if line.find("Location: ") != -1:
            location = line.split()[-1]
            break
    message = "query failed..."
    # NOTE(review): both hints read the same in the reviewed copy; the
    # redirect case presumably linked to the Location target originally.
    if location or refs:
        message = message + ' (try again after a while)'
    return message


def doQuery(query):
    """Run query against Google Scholar and return the references found.

    query is the URL query string appended to "/scholar?num=100&".  Pages
    of 100 results are requested until one comes back short.  Each raw
    response is also written to /tmp/scholar-results.html.

    Returns a list of GoogleScholarReference, or an empty list as soon as
    a connection attempt fails (references from earlier pages are
    discarded in that case).
    """
    refs, offset, inc = [], 0, 100
    while True:
        q = _qTemplate % inc + query
        if offset == 0:
            # NOTE(review): the reviewed copy had no placeholders in this
            # format string, so the % operator raised TypeError on the
            # first request.  The original markup is lost; the URL is
            # shown in plain text instead.
            debugInfo(u'Sending query to Google Scholar (http://%s%s)...'
                      % (_qServer, q))
        else:
            q = q + "&start=%d" % offset

        data = _fetchPage(q)
        if data is None:
            return []

        dump = open("/tmp/scholar-results.html", "w")
        try:
            dump.write(data)
        finally:
            dump.close()

        debugInfo("parsing...")
        soup = BeautifulSoup.BeautifulSoup(data)
        morerefs = _parseResults(soup)
        refs = refs + morerefs

        if len(morerefs) != inc:
            # A short page ends the loop; an empty one is reported as a
            # failure.
            if not morerefs:
                debugInfo(_failureMessage(data, refs))
            break
        offset = offset + inc
    return refs


def fakeQuery(filename):
    """Parse a saved result page from filename and return its references."""
    debugInfo(u'Opening %s...' % os.path.basename(filename))
    f = open(filename)
    try:
        data = f.read()
    finally:
        f.close()
    soup = BeautifulSoup.BeautifulSoup(data)
    return _parseResults(soup)


# ------------------------------------------------------------------------------

def computeHindex(references):
    """Return the h-index of references.

    Each reference must expose an integer citedby attribute.  The result
    is the largest h such that at least h references have citedby >= h
    (0 for an empty list).
    """
    h = 0
    while True:
        h = h + 1
        n = 0
        for r in references:
            if r.citedby >= h:
                n = n + 1
        # NOTE(review): the reviewed copy was cut off right after "if n";
        # the stop condition below is the standard h-index one - confirm
        # against upstream.
        if n < h:
            return h - 1