blob: 7185fb5a349a8f5b7eb2b47bff824c2787bf5c93 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
|
"""
Utilities related to HTTP requests.
"""
import logging
# Module-level logger used for the debug output emitted by the url helpers.
log = logging.getLogger("paperbot.httptools")
# NOTE(review): `unquote` and `quote_plus` are importable directly from
# `urllib` on Python 2 only; on Python 3 they live in `urllib.parse`.
# Neither name is used in the visible part of this module -- confirm whether
# other code imports them from here before touching this.
from urllib import (
unquote,
quote_plus,
)
# NOTE(review): `requests` is not referenced in the visible part of this
# module either; presumably kept for callers or later code -- verify.
import requests
def run_url_fixers(url):
    """
    Pass a url through every known fixer and return the cleaned-up result.

    The fixers are applied in order (IEEE login redirect first, then JSTOR
    pdf), each one receiving the output of the previous one. A debug message
    is logged before fixing, and a second one only if the url was changed.
    """
    log.debug("Running possible fixes on url: {}".format(url))

    fixed = url
    for fixer in (fix_ieee_login_url, fix_jstor_pdf_url):
        fixed = fixer(fixed)

    if fixed != url:
        log.debug("Fixed url to: {}".format(fixed))

    return fixed
def is_same_url(url1, url2):
    """
    Tell whether two urls reference the same resource.

    Both urls are first normalized with run_url_fixers (url1, then url2) and
    the normalized strings are compared for exact equality.
    """
    log.debug("Comparing two urls:\nurl1: {}\nurl2: {}".format(url1, url2))

    normalized = [run_url_fixers(candidate) for candidate in (url1, url2)]
    return normalized[0] == normalized[1]
def fix_ieee_login_url(url):
    """
    Fixes urls pointing to login.jsp on IEEE Xplore. When someone browses to
    the abstracts page on IEEE Xplore, they are sometimes sent to the
    login.jsp page, and then this link is given to paperbot. The actual link
    is based on the arnumber.

    example:
    http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=806324&url=http%3A%2F%2Fieeexplore.ieee.org%2Fxpls%2Fabs_all.jsp%3Farnumber%3D806324

    Returns the articleDetails.jsp url built from the first "arnumber="
    value found in the url. The url is returned unchanged when it is not an
    IEEE Xplore login url, when it carries no arnumber, or when the arnumber
    value is empty.
    """
    if "ieeexplore.ieee.org/xpl/login.jsp" not in url:
        return url
    if "arnumber=" not in url:
        return url

    # Everything after the first "arnumber=" marker; the url might not look
    # like the example in the docstring, so a following "&" is optional.
    tail = url.split("arnumber=", 1)[1]

    # Cut the value at the next parameter, and drop any "#fragment" so it
    # does not end up inside the article number.
    arnumber = tail.split("&", 1)[0].split("#", 1)[0]

    # An empty value would yield a dead "articleDetails.jsp?arnumber=" link;
    # treat it as the "things went wrong" case and keep the original url.
    if not arnumber:
        return url

    return "http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber={}".format(arnumber)
def fix_jstor_pdf_url(url):
    """
    Fixes urls pointing to jstor pdfs.

    Adds the "acceptTC=true" query parameter to jstor.org urls that contain
    ".pdf". The parameter is joined with "?" or "&" depending on whether the
    url already has a query string, is placed before any "#fragment", and is
    not added again when it is already present. Any other url is returned
    unchanged.
    """
    if "jstor.org/" not in url or ".pdf" not in url:
        return url

    # Keep the fragment aside: query parameters must come before the "#".
    base, hash_mark, fragment = url.partition("#")

    # Already flagged, whether as the first parameter or a later one.
    if "acceptTC=true" in base:
        return url

    if "?" not in base:
        separator = "?"
    elif base.endswith(("?", "&")):
        # The query string is open-ended already; no extra joiner needed.
        separator = ""
    else:
        separator = "&"

    return base + separator + "acceptTC=true" + hash_mark + fragment
|