summaryrefslogtreecommitdiff
path: root/paperbot/httptools.py
blob: 7185fb5a349a8f5b7eb2b47bff824c2787bf5c93 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
"""
Utilities related to HTTP requests.
"""

import logging
# Module-level logger used by the url helpers in this file for debug output.
log = logging.getLogger("paperbot.httptools")

from urllib import (
    unquote,
    quote_plus,
)

import requests


def run_url_fixers(url):
    """
    Pass the url through every known fixer and return the result.

    The IEEE Xplore login fixer runs first, followed by the JSTOR pdf fixer.
    A debug message is logged before the fixers run, and a second one only
    when the url came out different from what was passed in.
    """
    log.debug("Running possible fixes on url: {}".format(url))

    fixed = url
    # order matters: each fixer receives the output of the previous one
    for fixer in (fix_ieee_login_url, fix_jstor_pdf_url):
        fixed = fixer(fixed)

    if url != fixed:
        log.debug("Fixed url to: {}".format(fixed))

    return fixed


def is_same_url(url1, url2):
    """
    Report whether two urls reference the same resource.

    Both urls are run through the url fixers first, and the fixed results
    are compared for equality.
    """
    log.debug("Comparing two urls:\nurl1: {}\nurl2: {}".format(url1, url2))
    return run_url_fixers(url1) == run_url_fixers(url2)


def fix_ieee_login_url(url):
    """
    Fixes urls pointing to login.jsp on IEEE Xplore. When someone browses to
    the abstracts page on IEEE Xplore, they are sometimes sent to the
    login.jsp page, and then this link is given to paperbot. The actual link
    is based on the arnumber.

    example:
    http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=806324&url=http%3A%2F%2Fieeexplore.ieee.org%2Fxpls%2Fabs_all.jsp%3Farnumber%3D806324

    Returns the articleDetails.jsp url built from the arnumber. The url is
    returned unchanged when it is not an IEEE Xplore login url, when it has
    no "arnumber=" parameter, or when the arnumber value is empty.
    """
    if "ieeexplore.ieee.org/xpl/login.jsp" not in url:
        return url

    _, marker, remainder = url.partition("arnumber=")
    if not marker:
        # no arnumber parameter at all, nothing to rebuild the link from
        return url

    # The url might not look like the example in the docstring: the value
    # ends at the next query parameter ("&") or at a fragment ("#"),
    # whichever comes first, or runs to the end of the url.
    arnumber = remainder.partition("&")[0].partition("#")[0]

    if not arnumber:
        # "arnumber=" with no value would produce a dead articleDetails
        # link, so fall back to the url we were given
        return url

    return "http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber={}".format(arnumber)


def fix_jstor_pdf_url(url):
    """
    Fixes urls pointing to jstor pdfs by adding the "acceptTC=true" query
    parameter when it is missing.

    The parameter is joined with "?" when the url has no query string yet
    and with "&" when it already has one, and it is inserted before any
    "#fragment" so that it stays part of the query. Urls that are not jstor
    pdf urls, or that already carry "acceptTC=true", are returned unchanged.
    """
    if "jstor.org/" not in url or ".pdf" not in url:
        return url

    # already present, whether as the first parameter or a later one
    if "acceptTC=true" in url:
        return url

    # keep the fragment aside; anything appended after "#" would not be
    # part of the query string
    base, hash_sep, fragment = url.partition("#")

    if base.endswith(("?", "&")):
        # a dangling separator is already there, do not add a second one
        separator = ""
    elif "?" in base:
        separator = "&"
    else:
        separator = "?"

    return base + separator + "acceptTC=true" + hash_sep + fragment