"""
Orchestration for fetching paper metadata and downloading the pdf, plus
storage and debugging of failed requests.
"""

import os
import random
from StringIO import StringIO
import logging
log = logging.getLogger("paperbot.orchestrate")

import requests
import pdfparanoia

from logstuff import loghijack
from paper import Paper

from storage import (
    store,
    store_json,
    store_logs,
)

from ezproxy import EZPROXY_CONFIG

from httptools import (
    run_url_fixers,
    is_same_url,
)

from htmltools import (
    parse_html,
    populate_metadata_from_tree,
)

from libgen import (
    make_libgen_doi_url,
    check_libgen_has_paper,
    upload_to_libgen,
)

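# Default User-Agent: honor the USER_AGENT environment variable if set,
# otherwise use "pdf-defense-force-" plus a random two-hex-digit suffix
# generated once per process.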
USER_AGENT_RAND = "%0.2x" % random.getrandbits(8)
USER_AGENT = os.environ.get("USER_AGENT",
                            "pdf-defense-force-" + USER_AGENT_RAND)

DEFAULT_HEADERS = {
    "User-Agent": USER_AGENT,
}


def is_response_pdf(response):
    """
    Determines if the response contains a pdf.
    """
    return "pdf" in response.headers["content-type"]


def remove_watermarks(pdfcontent):
    """
    Use pdfparanoia to remove watermarks from the pdf.
    """
    log.debug("Removing pdf watermarks.")
    pdfcontent = pdfparanoia.scrub(StringIO(pdfcontent))
    return pdfcontent


def iterdownload(url, paper, headers=DEFAULT_HEADERS,
                 ezproxy_config=EZPROXY_CONFIG):
    """
    Attempt to download the content at the remote url using a variety of
    methods: first a direct request, then each configured ezproxy. Yields
    (attempted_url, response) tuples; not every method is always necessary,
    and sometimes none of them returns the desired content.
    """
    # list of responses
    paper.history = []

    # attempt to get without using ezproxy
    log.debug("Attempting HTTP GET {}".format(url))
    response = requests.get(url, headers=headers)
    paper.history.append(response)
    yield (url, response)

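    # ezproxy_config is an iterable of proxy configurations, each a dict
    # with the ezproxy login "url" and the POSTable credential "data"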
    for ezproxyconf in ezproxy_config:
        ezproxyurl = ezproxyconf["url"]

        # POSTable data to login to this ezproxy
        proxydata = ezproxyconf["data"]

        # construct url based on ezproxy url plus desired url
        attempturl = ezproxyurl + url

        # ezproxy attempt
        log.debug("Attempting ezproxy HTTP {}".format(attempturl))
        response = requests.post(attempturl, data=proxydata, headers=headers)
        paper.history.append(response)

        # maybe this response is acceptable?
        yield (attempturl, response)


def download(url, paper=None):
    """
    Main entry point for executing paperbot's primary function, paper fetching.
    The given url may be to a pdf file, which should be archived, or it may be
    to an academic publisher's website which points to a paper. The paper needs
    to be downloaded and the metadata should be stored.

    Returns a tuple of (paper, json_path, pdf_path, logpath).

    :param url: url to fetch and examine
    :type url: str
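
    Example (hypothetical url):

        paper, json_path, pdf_path, logpath = download("http://example.com/some-paper")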
    """
    # store logs in tempfile
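    # loghijack attaches a temporary handler to the "paperbot" logger; the
    # handler is detached again at the end of this function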
    (templogpath, loghandler) = loghijack()

    if paper is None:
        paper = Paper.create({})

    # clean up url if necessary
    url = run_url_fixers(url)

    # whether or not metadata has already been populated
    populated_metadata = False

    for (url2, response) in iterdownload(url, paper=paper):
        if is_response_pdf(response):
            log.debug("Got pdf.")
            pdfcontent = remove_watermarks(response.content)
            paper.pdf = pdfcontent
            store(paper)
            break

        paper.html = response.content

        # Was not pdf. Attempt to parse the HTML based on normal expected
        # HTML elements. The HTML elements may say that the actual pdf url
        # is something else. If this happens, then attempt to download that
        # pdf url instead and then break out of this loop.

        # no reason to get same metadata on every iteration of loop
        if not populated_metadata:
            tree = parse_html(response.content)

            # most publishers expose paper metadata in their html in much the
            # same way (likely because search engines and indexers expect it)
            populate_metadata_from_tree(tree, paper)

            # TODO: better way to check if populate_metadata_from_tree did
            # anything useful?
            if paper.title in [None, ""]:
                log.debug("# TODO: parse metadata from html using plugins here")
            else:
                populated_metadata = True

        # can't try anything else if the url is still bad
        if paper.pdf_url in [None, ""]:
            continue

        # Normalize the two urls. The url from the metadata on the page
        # might be different from the url that was originally passed in,
        # even though both urls might still refer to the same resource.
        if is_same_url(url, paper.pdf_url):
            # pdf_url is same as original url, no pdf found yet. This
            # happens when the pdf url is correct, but the publisher is
            # returning html instead. And the html happens to reference the
            # url that was originally requested in the first place. Argh.
            continue

        log.debug("Switching activity to pdf_url {}".format(paper.pdf_url))

        # paper pdf is stored at a different url. Attempt to fetch that
        # url now. Only do this if pdf_url != url because otherwise
        # this will be an endless loop.
        for (url3, response2) in iterdownload(paper.pdf_url, paper=paper):
            if is_response_pdf(response2):
                log.debug("Got pdf on second-level page.")
                pdfcontent = remove_watermarks(response2.content)
                paper.pdf = pdfcontent
                store(paper)
                break
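        # for/else: the else branch runs only if the loop above finished
        # without a break, i.e. no pdf was found at pdf_url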
        else:
            log.debug("Couldn't download pdf from {}".format(paper.pdf_url))

        break

    # was pdf downloaded?
    if (hasattr(paper, "pdf") and paper.pdf not in [None, ""]) or \
       os.path.exists(paper.file_path_pdf):
        fetched = True
    else:
        fetched = False

    hasdoi = (paper.doi not in [None, ""])

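    # Mirror logic: if the pdf was fetched and libgen lacks it, upload it;
    # if it was not fetched but libgen has it, try downloading from libgen.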
    if hasdoi:
        # check if libgen has this paper already
        libgenhas = check_libgen_has_paper(paper.doi)

        if fetched and not libgenhas:
            # upload if libgen doesn't already have it
            upload_to_libgen(paper.file_path_pdf, paper.doi)
        elif not fetched and libgenhas:
            urldoi = make_libgen_doi_url(paper.doi)

            # get from libgen
            log.debug("Haven't yet fetched paper. Have doi. Also, libgenhas.")
            log.debug("HTTP GET {}".format(urldoi))
            response = requests.get(urldoi, headers=DEFAULT_HEADERS)

            if is_response_pdf(response):
                log.debug("Got pdf from libgen.")

                # skip pdfparanoia because it's from libgen
                pdfcontent = response.content
                paper.pdf = pdfcontent

                store(paper)

                fetched = True
            else:
                log.debug("libgen lied about haspdf :(")
    else:
        log.debug("Don't know doi, can't check if libgen has this paper.")
        libgenhas = None

    # store(paper) usually handles json but in case of failure there needs to
    # be an explicit save of paper metadata.
    if not fetched:
        store_json(paper)

    # move logs into position
    logpath = store_logs(paper, templogpath)

    # remove loghandler from logger
    mainlogger = logging.getLogger("paperbot")
    mainlogger.handlers.remove(loghandler)

    return (paper, paper.file_path_json, paper.file_path_pdf, logpath)