1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
|
"""
Orchestration for downloading paper metadata and downloading the pdf. Also,
storage and debugging of failed requests.
"""
import os
import random
from StringIO import StringIO
import logging
log = logging.getLogger("paperbot.orchestrate")
import requests
import pdfparanoia
from logstuff import loghijack
from paper import Paper
from storage import (
store,
store_json,
store_logs,
)
from ezproxy import EZPROXY_CONFIG
from httptools import (
run_url_fixers,
is_same_url,
)
from htmltools import (
parse_html,
populate_metadata_from_tree,
)
from libgen import (
make_libgen_doi_url,
check_libgen_has_paper,
upload_to_libgen,
)
# Two-hex-digit random suffix (getrandbits(8) -> 0..255) so that separate bot
# instances present distinct user agents by default.
USER_AGENT_RAND = "%0.2x" % random.getrandbits(8)
# The USER_AGENT environment variable overrides the generated default.
USER_AGENT = os.environ.get("USER_AGENT",
                            "pdf-defense-force-" + USER_AGENT_RAND)
# Headers sent with every outgoing HTTP request unless a caller overrides them.
DEFAULT_HEADERS = {
    "User-Agent": USER_AGENT,
}
def is_response_pdf(response):
    """
    Determine if the response contains a pdf.

    :param response: HTTP response object exposing a ``headers`` mapping
    :type response: requests.Response
    :rtype: bool
    """
    # Servers are not guaranteed to send a content-type header; treat a
    # missing header as "not a pdf" instead of raising KeyError.
    content_type = response.headers.get("content-type", "")
    return "pdf" in content_type
def remove_watermarks(pdfcontent):
    """
    Strip publisher watermarks from raw pdf bytes via pdfparanoia.

    :param pdfcontent: raw bytes of a downloaded pdf
    :returns: the scrubbed pdf content
    """
    log.debug("Removing pdf watermarks.")
    return pdfparanoia.scrub(StringIO(pdfcontent))
def iterdownload(url, paper, headers=DEFAULT_HEADERS,
                 ezproxy_config=EZPROXY_CONFIG):
    """
    Yield (attempted_url, response) pairs for successive download attempts.

    The first attempt is a plain HTTP GET of the url. After that, each
    configured ezproxy endpoint is tried by POSTing its login data to the
    ezproxy url prefixed onto the target url. Not every attempt succeeds;
    the caller decides which yielded response, if any, is acceptable.
    Every response is also recorded in ``paper.history``.
    """
    # reset the list of responses for this download run
    paper.history = []
    # attempt to get without using ezproxy
    log.debug("Attempting HTTP GET {}".format(url))
    plain_response = requests.get(url, headers=headers)
    paper.history.append(plain_response)
    yield (url, plain_response)
    for conf in ezproxy_config:
        # the ezproxy url acts as a prefix in front of the desired url
        proxied_url = conf["url"] + url
        # ezproxy attempt: POST the login data for this proxy
        log.debug("Attempting ezproxy HTTP {}".format(proxied_url))
        proxied_response = requests.post(proxied_url, data=conf["data"],
                                         headers=headers)
        paper.history.append(proxied_response)
        # maybe this response is acceptable?
        yield (proxied_url, proxied_response)
def download(url, paper=None):
    """
    Main entry point for executing paperbot's primary function, paper fetching.

    The given url may be to a pdf file, which should be archived, or it may be
    to an academic publisher's website which points to a paper. The paper needs
    to be downloaded and the metadata should be stored.

    Returns a tuple of (paper, json_path, pdf_path, logpath).

    :param url: url to fetch and examine
    :type url: str
    :param paper: pre-existing Paper object to populate, or None to create one
    """
    # store logs in tempfile so they can be archived alongside the paper
    (templogpath, loghandler) = loghijack()
    if paper is None:
        paper = Paper.create({})
    # clean up url if necessary
    url = run_url_fixers(url)
    # whether or not metadata has already been populated
    populated_metadata = False
    for (url2, response) in iterdownload(url, paper=paper):
        if is_response_pdf(response):
            log.debug("Got pdf.")
            pdfcontent = remove_watermarks(response.content)
            paper.pdf = pdfcontent
            store(paper)
            break
        paper.html = response.content
        # Was not pdf. Attempt to parse the HTML based on normal expected
        # HTML elements. The HTML elements may say that the actual pdf url
        # is something else. If this happens, then attempt to download that
        # pdf url instead and then break out of this loop.
        # no reason to get same metadata on every iteration of loop
        if not populated_metadata:
            tree = parse_html(response.content)
            # most publishers show paper metadata in html in same way because ?
            populate_metadata_from_tree(tree, paper)
            # TODO: better way to check if populate_metadata_from_tree did
            # anything useful?
            if paper.title in [None, ""]:
                log.debug("# TODO: parse metadata from html using plugins here")
            else:
                populated_metadata = True
        # can't try anything else if the url is still bad
        if paper.pdf_url in [None, ""]:
            continue
        # Normalize the two urls. The url from the metadata on the page
        # might be different from the url that was originally passed in,
        # even though both urls might still refer to the same resource.
        if is_same_url(url, paper.pdf_url):
            # pdf_url is same as original url, no pdf found yet. This
            # happens when the pdf url is correct, but the publisher is
            # returning html instead. And the html happens to reference the
            # url that was originally requested in the first place. Argh.
            continue
        log.debug("Switching activity to pdf_url {}".format(paper.pdf_url))
        # paper pdf is stored at a different url. Attempt to fetch that
        # url now. Only do this if pdf_url != url because otherwise
        # this will be an endless loop.
        for (url3, response2) in iterdownload(paper.pdf_url, paper=paper):
            if is_response_pdf(response2):
                log.debug("Got pdf on second-level page.")
                # BUG FIX: scrub the second-level response (response2); the
                # previous code passed the outer loop's stale html response,
                # so the stored "pdf" was never the fetched pdf content.
                pdfcontent = remove_watermarks(response2.content)
                paper.pdf = pdfcontent
                store(paper)
                break
        else:
            log.debug("Couldn't download pdf from {}".format(paper.pdf_url))
        break
    # was pdf downloaded?
    if (hasattr(paper, "pdf") and paper.pdf not in [None, ""]) or \
            os.path.exists(paper.file_path_pdf):
        fetched = True
    else:
        fetched = False
    hasdoi = (paper.doi not in [None, ""])
    if hasdoi:
        # check if libgen has this paper already
        libgenhas = check_libgen_has_paper(paper.doi)
        if fetched and not libgenhas:
            # upload if libgen doesn't already have it
            upload_to_libgen(paper.file_path_pdf, paper.doi)
        elif not fetched and libgenhas:
            urldoi = make_libgen_doi_url(paper.doi)
            # get from libgen
            log.debug("Haven't yet fetched paper. Have doi. Also, libgenhas.")
            log.debug("HTTP GET {}".format(urldoi))
            response = requests.get(urldoi, headers=DEFAULT_HEADERS)
            if is_response_pdf(response):
                log.debug("Got pdf from libgen.")
                # skip pdfparanoia because it's from libgen
                pdfcontent = response.content
                paper.pdf = pdfcontent
                store(paper)
                fetched = True
            else:
                log.debug("libgen lied about haspdf :(")
    else:
        log.debug("Don't know doi, can't check if libgen has this paper.")
        libgenhas = None
    # store(paper) usually handles json but in case of failure there needs to
    # be an explicit save of paper metadata.
    if not fetched:
        store_json(paper)
    # move logs into position
    logpath = store_logs(paper, templogpath)
    # remove loghandler from logger
    mainlogger = logging.getLogger("paperbot")
    mainlogger.handlers.remove(loghandler)
    return (paper, paper.file_path_json, paper.file_path_pdf, logpath)
|