""" Fetches papers. """ import re import os import json import random import requests import lxml.etree from StringIO import StringIO import modules.scihub import urllib import traceback import pdfparanoia logchannel = os.environ.get("LOGGING", None) PROXY = 'http://ec2-54-218-13-46.us-west-2.compute.amazonaws.com:8500/plsget' USER_AGENT = 'Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11' ARCHIVE_DIR = '/home/bryan/public_html/papers2/paperbot/' ARCHIVE_BASE = 'http://diyhpl.us/~bryan/papers2/paperbot/' IEEE_EXPLORE_BASE = 'http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=' HEADERS_TM_1 = {"User-Agent": "time-machine/1.0"} HEADERS_TM_11 = {"User-Agent": "time-machine/1.1"} HEADERS_TM_2 = {"User-Agent": "time-machine/2.0"} HEADERS_TEAPOT = {"User-Agent": "pdf-teapot"} HEADERS_DEFENSE = {"User-Agent": "pdf-defense-force"} LIBGEN_FORM = "http://libgen.info/scimag/librarian/form.php" URL_REGEX = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' proxy_list = [ { 'proxy_url': None, 'proxy_type': 'normal'}, # { # 'proxy_url': PROXY, # 'proxy_type': 'custom_flask_json' # } ] def nullLog(msg): pass def make_jstor_url(document_id): """Return the url to a document from its ID.""" PREFIX = 'http://www.jstor.org/stable/pdfplus/' SUFFIX = '.pdf?acceptTC=true' return PREFIX + document_id + SUFFIX class paperbot_download_request(object): _log = nullLog def get(self, pdf_url, use_generator=False, **kwargs): proxies_left_to_try = len(proxy_list) extension = ".txt" request_iteration = 0 proxy_url_index = 0 user_agent = USER_AGENT headers = { "User-Agent": user_agent, } _log = self._log _log('paperbot_download_request pdf_url: %s' % pdf_url) while proxies_left_to_try: proxy_url = proxy_list[proxy_url_index]['proxy_url'] proxy_type = proxy_list[proxy_url_index]['proxy_type'] _log('proxies_left_to_try: %d proxy_url_index %d' % (proxies_left_to_try, proxy_url_index)) _log('request_iteration: %d' % request_iteration) # perform default behaviour if proxy is None if proxy_list[proxy_url_index]['proxy_url'] is None: if pdf_url.startswith("https://"): response = requests.get(pdf_url, verify=False, **kwargs) else: response = requests.get(pdf_url, **kwargs) else: # check type of proxy if proxy_type == 'custom_flask_json': data = { 'pdf_url': pdf_url, 'headers': kwargs.get('headers', None), 'request_iteration': request_iteration } headers["Content-Type"] = "application/json" _log('trying custom_flask_json, proxy_url %s' % proxy_url) response = requests.get(proxy_url, data=json.dumps(data), headers=headers) elif proxy_type == 'normal': # i'm not even checking if http or https is in the pdf_url, # since the default proxy of None is already being tried in # this loop proxies = { "http": proxy_url, "https": proxy_url, } headers = kwargs.get('headers', None) # I don't know if passing None or {} for headers is bad, so # I put this if: if headers is not None: response = requests.get(pdf_url, headers=headers, proxies=proxies) else: response = requests.get(pdf_url, proxies=proxies) if use_generator: yield response else: _log('checking \'PDF\' in response.headers') if "pdf" in response.headers["content-type"]: extension = ".pdf" _log('yielding tuply with PDF in response') # yield (response, extension) proxies_left_to_try = 0 break # return if 'proxies_remaining' in response.headers: remaining = response.headers['proxies_remaining'] _log('proxies_remaining in headers: %s' % remaining) # decrement the index if the custom proxy 
                if response.headers['proxies_remaining'] == 0 or \
                        response.headers['proxies_remaining'] == '0':
                    proxies_left_to_try -= 1
                    request_iteration = 0
                    proxy_url_index += 1
                else:
                    _log('request_iteration+=1')
                    request_iteration += 1
            else:
                # decrement the index to move on to the next proxy in our
                # proxy_list
                proxies_left_to_try -= 1
                request_iteration = 0
                proxy_url_index += 1

        if use_generator:
            return

        _log('last yield in paperbot_download_request')
        yield (response, extension)


def download(phenny, input, verbose=True):
    """
    Downloads a paper.
    """
    if logchannel:
        _log = lambda x: phenny.msg("#%s" % logchannel, x)
    else:
        _log = lambda x: None

    # only accept requests in a channel
    if not input.sender.startswith('#'):
        # unless the user is an admin, of course
        if not input.admin:
            phenny.say("i only take requests in the ##hplusroadmap channel.")
            return
        else:
            # just give a warning message to the admin.. not a big deal.
            phenny.say("okay i'll try, but please send me requests in ##hplusroadmap in the future.")

    # get the input
    line = input.group()

    # was this an explicit command?
    explicit = False
    if line.startswith(phenny.nick):
        explicit = True
        line = line[len(phenny.nick):]

        if line.startswith(",") or line.startswith(":"):
            line = line[1:]

        if line.startswith(" "):
            line = line.strip()

    # don't bother if there's nothing there
    if len(line) < 5 or ("http://" not in line and "https://" not in line) or \
            not line.startswith("http"):
        return

    for line in re.findall(URL_REGEX, line):
        # fix an UnboundLocalError problem
        shurl = None

        line = filter_fix(line)

        # fix for login.jsp links to ieee xplore
        line = fix_ieee_login_urls(line)
        line = fix_jstor_pdf_urls(line)

        translation_url = "http://localhost:1969/web"

        headers = {
            "Content-Type": "application/json",
        }

        data = {
            "url": line,
            "sessionid": "what"
        }

        data = json.dumps(data)

        response = requests.post(translation_url, data=data, headers=headers)

        if response.status_code == 200 and response.content != "[]":
            # see if there are any attachments
            content = json.loads(response.content)
            item = content[0]
            title = item["title"]

            if "DOI" in item:
                _log("Translator DOI")
                lgre = requests.post(LIBGEN_FORM, data={"doi": item["DOI"]})
                tree = parse_html(lgre.content)
                if tree.xpath("//h1")[0].text != "No file selected":
                    phenny.say("http://libgen.info/scimag/get.php?doi=%s" %
                               urllib.quote_plus(item["DOI"]))
                    return

            if "attachments" in item:
                pdf_url = None
                for attachment in item["attachments"]:
                    if "mimeType" in attachment and \
                            "application/pdf" in attachment["mimeType"]:
                        pdf_url = attachment["url"]
                        break

                if pdf_url:
                    user_agent = USER_AGENT

                    paperbot_download_request_obj = paperbot_download_request()
                    paperbot_download_request_obj._log = _log

                    gen = paperbot_download_request_obj.get(pdf_url,
                                                            use_generator=False,
                                                            headers=headers)

                    # this is stupidly ugly
                    for genresponse in gen:
                        response, extension = genresponse

                        # detect failure
                        if response.status_code != 200:
                            shurl, _ = modules.scihub.scihubber(pdf_url)
                            if shurl:
                                if "libgen" in shurl:
                                    phenny.say("http://libgen.info/scimag/get.php?doi=%s" %
                                               urllib.quote_plus(item["DOI"]))
                                elif "pdfcache" not in shurl:
                                    phenny.say(shurl)
                                else:
                                    pdfstr = modules.scihub.scihub_dl(shurl)
                                    phenny.say(modules.scihub.libgen(pdfstr, item["DOI"]))
                            return

                        data = response.content

                        if "pdf" in response.headers["content-type"]:
                            try:
                                data = pdfparanoia.scrub(StringIO(data))
                                try:
                                    _log('after pdfparanoia.scrub')
                                    requests.get('http://localhost:8500/remoteprint',
                                                 headers={'msg': 'after pdfparanoia.scrub'})
                                except:
                                    pass
                                break
                            except:
                                # this is to avoid a PDFNotImplementedError
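                                # (pdfparanoia.scrub can raise on PDFs it does
                                # not understand; when that happens the
                                # unscrubbed bytes already stored in `data`
                                # are used as-is.)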
                                pass

                    if "DOI" in item:
                        phenny.say(modules.scihub.libgen(data, item["DOI"]))
                        return

                    # grr..
                    title = title.encode("ascii", "ignore")

                    path = os.path.join(ARCHIVE_DIR, title + ".pdf")

                    file_handler = open(path, "w")
                    file_handler.write(data)
                    file_handler.close()

                    filename = requests.utils.quote(title)

                    # Remove an ending period, which sometimes happens when the
                    # title of the paper has a period at the end.
                    if filename[-1] == ".":
                        filename = filename[:-1]

                    url = "http://diyhpl.us/~bryan/papers2/paperbot/" + filename + ".pdf"
                    phenny.say(url)
                    continue
                elif verbose and explicit:
                    _log("Translation server PDF fail")
                    shurl, doi = modules.scihub.scihubber(line)
                    continue
            elif verbose and explicit:
                _log("Translation server PDF fail")
                shurl, doi = modules.scihub.scihubber(line)
                phenny.say(download_url(line, _log))
                continue
        elif verbose and explicit:
            _log("Translation server fail")
            shurl, doi = modules.scihub.scihubber(line)
            _log("Scihubber -> (%s, %s)" % (shurl, doi))

        if shurl:
            if "pdfcache" in shurl:
                if doi:
                    pdfstr = modules.scihub.scihub_dl(shurl)
                    phenny.say(modules.scihub.libgen(pdfstr, doi))
                else:
                    phenny.say(download_url(shurl, _log,
                                            cookies=modules.scihub.shcookie))
            else:
                phenny.say(shurl)
        elif verbose and explicit:
            _log("All approaches failed")
            phenny.say(download_url(line, _log))
    return
download.commands = ["fetch", "get", "download"]
download.priority = "high"
download.rule = r'(.*)'


def download_ieee(url):
    """
    Downloads an IEEE paper. The Zotero translator requires frames/windows to
    be available. Eventually translation-server will be fixed, but until then
    it might be nice to have an IEEE workaround.
    """
    # url = "http://ieeexplore.ieee.org:80/xpl/freeabs_all.jsp?reload=true&arnumber=901261"
    # url = "http://ieeexplore.ieee.org/iel5/27/19498/00901261.pdf?arnumber=901261"
    raise NotImplementedError


def download_url(url, _log=nullLog, **kwargs):
    paperbot_download_request_obj = paperbot_download_request()
    paperbot_download_request_obj._log = _log

    response_generator = paperbot_download_request_obj.get(url,
                                                           use_generator=True,
                                                           headers={"User-Agent": "origami-pdf"})

    cc = 0
    for response in response_generator:
        _log('using generator for %s time' % cc)
        cc += 1

    paperbot_download_request_obj2 = paperbot_download_request()
    paperbot_download_request_obj2._log = _log

    content = response.content
    # response = requests.get(url, headers={"User-Agent": "origami-pdf"}, **kwargs)
    # content = response.content

    # just make up a default filename
    title = "%0.2x" % random.getrandbits(128)

    # default extension
    extension = ".txt"

    if "pdf" in response.headers["content-type"]:
        extension = ".pdf"
    elif check_if_html(response):
        # parse the html string with lxml.etree
        tree = parse_html(content)

        # extract some metadata with xpaths
        citation_pdf_url = find_citation_pdf_url(tree, url)
        citation_title = find_citation_title(tree)

        # aip.org sucks, citation_pdf_url is wrong
        if citation_pdf_url and "link.aip.org/" in citation_pdf_url:
            citation_pdf_url = None

        if citation_pdf_url and "ieeexplore.ieee.org" in citation_pdf_url:
            content = requests.get(citation_pdf_url).content
            tree = parse_html(content)
            # citation_title = ...
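        # (at this point citation_pdf_url / citation_title, if set, came from
        # the page's <meta name="citation_pdf_url"> and
        # <meta name="citation_title"> tags; the branches below fall back to
        # publisher-specific scraping when those tags are missing or unusable.)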
        # wow, this seriously needs to be cleaned up
        if citation_pdf_url and citation_title and \
                "ieeexplore.ieee.org" not in citation_pdf_url:
            citation_title = citation_title.encode("ascii", "ignore")
            response = requests.get(citation_pdf_url, headers=HEADERS_DEFENSE)
            content = response.content
            if "pdf" in response.headers["content-type"]:
                extension = ".pdf"
                title = citation_title
        else:
            if "sciencedirect.com" in url and "ShoppingCart" not in url:
                _log('download_url got a sciencedirect URL')
                try:
                    try:
                        title_xpath = "//h1[@class='svTitle']"
                        title = tree.xpath(title_xpath)[0].text
                        pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]
                    except IndexError:
                        title = tree.xpath("//title")[0].text
                        pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]

                    if 'http' not in pdf_url:
                        main_url_split = response.url.split('//')
                        http_prefix = main_url_split[0]
                        if 'http' in http_prefix:
                            domain_url = main_url_split[1].split('/')[0]
                            slash = '/' if pdf_url[0] != '/' else ''
                            pdf_url = http_prefix + '//' + domain_url + slash + pdf_url

                    gen = paperbot_download_request_obj2.get(pdf_url,
                                                             use_generator=False,
                                                             headers={"User-Agent": "sdf-macross"})

                    # this is stupidly ugly
                    for genresponse in gen:
                        new_response, extension = genresponse
                        new_content = new_response.content
                        _log('paperbot_download_request_obj2 content-type: %s' %
                             new_response.headers["content-type"])
                        if "pdf" in new_response.headers["content-type"]:
                            extension = ".pdf"
                            break
                except Exception as e:
                    _log(traceback.format_exc())
                    pass
                else:
                    content = new_content
                    response = new_response
            elif "jstor.org/" in url:
                # clean up the url
                if "?" in url:
                    url = url[0:url.find("?")]

                # not all pages have the element
                try:
                    title = tree.xpath("//div[@class='hd title']")[0].text
                except Exception:
                    try:
                        input_xpath = "//input[@name='ppv-title']/@value"
                        title = tree.xpath(input_xpath)[0]
                    except Exception:
                        pass

                # get the document id
                document_id = None
                if url[-1] != "/":
                    # if "stable/" in url:
                    # elif "discover/" in url:
                    # elif "action/showShelf?candidate=" in url:
                    # elif "pss/" in url:
                    document_id = url.split("/")[-1]

                if document_id.isdigit():
                    try:
                        pdf_url = make_jstor_url(document_id)
                        new_response = requests.get(pdf_url, headers=HEADERS_TM_11)
                        new_content = new_response.content
                        if "pdf" in new_response.headers["content-type"]:
                            extension = ".pdf"
                    except Exception:
                        pass
                    else:
                        content = new_content
                        response = new_response
            elif ".aip.org/" in url:
                try:
                    title = tree.xpath("//title/text()")[0].split(" | ")[0]
                    pdf_url = [link for link in tree.xpath("//a/@href")
                               if "getpdf" in link][0]
                    new_response = requests.get(pdf_url, headers=HEADERS_TM_1)
                    new_content = new_response.content
                    if "pdf" in new_response.headers["content-type"]:
                        extension = ".pdf"
                except Exception:
                    pass
                else:
                    content = new_content
                    response = new_response
            elif "ieeexplore.ieee.org" in url:
                try:
                    pdf_url = [url for url in tree.xpath("//frame/@src")
                               if "pdf" in url][0]
                    new_response = requests.get(pdf_url, headers=HEADERS_TM_2)
                    new_content = new_response.content
                    if "pdf" in new_response.headers["content-type"]:
                        extension = ".pdf"
                except Exception:
                    pass
                else:
                    content = new_content
                    response = new_response
            elif "h1 class=\"articleTitle" in content:
                try:
                    title_xpath = "//h1[@class='articleTitle']"
                    title = tree.xpath(title_xpath)[0].text
                    title = title.encode("ascii", "ignore")
                    url_xpath = "//a[@title='View the Full Text PDF']/@href"
                    pdf_url = tree.xpath(url_xpath)[0]
                except:
                    pass
                else:
                    if pdf_url.startswith("/"):
                        url_start = url[:url.find("/", 8)]
                        pdf_url = url_start + pdf_url
                    response = requests.get(pdf_url, headers=HEADERS_TEAPOT)
                    content = response.content
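                    # (only treat the follow-up fetch as a PDF if its
                    # content-type header says so; otherwise whatever came
                    # back is archived with the default .txt extension.)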
                    if "pdf" in response.headers["content-type"]:
                        extension = ".pdf"
            # raise Exception("problem with citation_pdf_url or citation_title")
            # well, at least save the contents from the original url
            pass

    # make the title again just in case
    if not title:
        title = "%0.2x" % random.getrandbits(128)

    # can't create directories
    title = title.replace("/", "_")

    path = os.path.join(ARCHIVE_DIR, title + extension)

    if extension in [".pdf", "pdf"]:
        try:
            content = pdfparanoia.scrub(StringIO(content))
        except:
            # this is to avoid a PDFNotImplementedError
            pass

    file_handler = open(path, "w")
    file_handler.write(content)
    file_handler.close()

    title = title.encode("ascii", "ignore")
    url = ARCHIVE_BASE + requests.utils.quote(title) + extension

    return url


def parse_html(content):
    if not isinstance(content, StringIO):
        content = StringIO(content)
    parser = lxml.etree.HTMLParser()
    tree = lxml.etree.parse(content, parser)
    return tree


def check_if_html(response):
    return "text/html" in response.headers["content-type"]


def find_citation_pdf_url(tree, url):
    """
    Returns the content attribute of the citation_pdf_url <meta> tag,
    made absolute when the page gives only a path.
    """
    citation_pdf_url = extract_meta_content(tree, "citation_pdf_url")
    if citation_pdf_url and not citation_pdf_url.startswith("http"):
        if citation_pdf_url.startswith("/"):
            url_start = url[:url.find("/", 8)]
            citation_pdf_url = url_start + citation_pdf_url
        else:
            raise Exception("unhandled situation (citation_pdf_url)")
    return citation_pdf_url


def find_citation_title(tree):
    """
    Returns the content attribute of the citation_title <meta> tag.
    """
    citation_title = extract_meta_content(tree, "citation_title")
    return citation_title


def extract_meta_content(tree, meta_name):
    try:
        content = tree.xpath("//meta[@name='" + meta_name + "']/@content")[0]
    except:
        return None
    else:
        return content


def filter_fix(url):
    """
    Fixes some common problems in urls.
    """
    if ".proxy.lib.pdx.edu" in url:
        url = url.replace(".proxy.lib.pdx.edu", "")
    return url


def fix_ieee_login_urls(url):
    """
    Fixes urls pointing to login.jsp on IEEE Xplore. When someone browses to
    the abstracts page on IEEE Xplore, they are sometimes sent to the
    login.jsp page, and then this link is given to paperbot. The actual link
    is based on the arnumber.

    example:
    http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=806324&url=http%3A%2F%2Fieeexplore.ieee.org%2Fxpls%2Fabs_all.jsp%3Farnumber%3D806324
    """
    if "ieeexplore.ieee.org/xpl/login.jsp" in url:
        if "arnumber=" in url:
            parts = url.split("arnumber=")

            # i guess the url might not look like the example in the docstring
            if "&" in parts[1]:
                arnumber = parts[1].split("&")[0]
            else:
                arnumber = parts[1]

            return IEEE_EXPLORE_BASE + arnumber

    # default case when things go wrong
    return url


def fix_jstor_pdf_urls(url):
    """
    Fixes urls pointing to jstor pdfs.
    """
    if "jstor.org/" in url:
        if ".pdf" in url and "?acceptTC=true" not in url:
            url += "?acceptTC=true"
    return url
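# Minimal usage sketch (not part of the bot's normal flow, which goes through
# download() via phenny): running this module directly with a url argument
# would fetch and archive that paper with download_url(). The command-line
# invocation and the stderr logger below are illustrative assumptions only.
if __name__ == "__main__":
    import sys

    def _stderr_log(msg):
        # mirror the IRC-channel logging to stderr for local debugging
        sys.stderr.write("%s\n" % msg)

    if len(sys.argv) > 1:
        # prints the public ARCHIVE_BASE url that the bot would announce
        print download_url(sys.argv[1], _log=_stderr_log)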