summaryrefslogtreecommitdiff
path: root/paperbot/htmltools.py
blob: 316b0435b6c2d5f7b9f57291e99eb7684407ab63 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from StringIO import StringIO
import lxml.etree

from paper import meta_attribute_mapping

import logging
# Module-level logger shared by every helper in this file.
# NOTE(review): the logger name says "htmlstuff" while the module appears to be
# named htmltools -- confirm whether logging configuration depends on this name.
log = logging.getLogger("paperbot.htmlstuff")


def is_html(response):
    """
    Check if a python-requests Response object contains a text/html response.

    A response that carries no content-type header is reported as not being
    html instead of raising KeyError, and the media type is matched
    case-insensitively.

    :param response: object exposing a dict-like ``headers`` attribute
    :rtype: bool
    """
    # ``or ""`` also covers a header that is present but set to None.
    content_type = response.headers.get("content-type") or ""
    return "text/html" in content_type.lower()


def parse_html(content):
    """
    Build an lxml.etree tree out of html text.

    :param content: html text, either raw or already wrapped in a StringIO
    :type content: str or StringIO
    :return: the tree produced by lxml.etree.parse with an HTMLParser
    """
    log.debug("parse_html")
    # Raw text is wrapped so that lxml.etree.parse receives a file-like object.
    source = content if isinstance(content, StringIO) else StringIO(content)
    html_parser = lxml.etree.HTMLParser()
    return lxml.etree.parse(source, html_parser)


def extract_meta_content(tree, meta_name):
    """
    Return the content attribute of the first <meta name="..."> element.

    The name is handed to lxml as an XPath variable rather than being pasted
    into the expression text, so names containing quote characters are
    matched literally instead of breaking (or altering) the query.

    :param tree: parsed document exposing an lxml-style ``xpath`` method
    :param meta_name: value of the meta element's name attribute
    :raises IndexError: when no matching meta element with a content
        attribute is found
    """
    matches = tree.xpath("//meta[@name=$name]/@content", name=meta_name)
    # Indexing (rather than returning a default) is deliberate: callers detect
    # a missing tag through the resulting exception.
    return matches[0]


def get_citation_title(tree):
    """
    Look up the content attribute of <meta name="citation_title">.
    """
    return extract_meta_content(tree, "citation_title")


def get_citation_pdf_url(tree):
    """
    Look up the content attribute of <meta name="citation_pdf_url">.
    """
    return extract_meta_content(tree, "citation_pdf_url")


def extract_metadata(tree, meta_attribute_mapping=meta_attribute_mapping):
    """
    Extract common metadata from the HTML document.

    Every key of the mapping is looked up as a <meta name="..."> tag. When
    the lookup succeeds the tag's content is stored in the result under the
    mapped key; when it fails the key is skipped.

    :param tree: parsed html tree
    :param meta_attribute_mapping: dict of meta tag name -> output key
    :rtype: dict
    """
    output = {}

    # items() instead of iteritems(): works on both Python 2 and Python 3.
    for (metakey, paperkey) in meta_attribute_mapping.items():
        try:
            value = extract_meta_content(tree, metakey)
        except Exception:
            # Best effort: any lookup failure just means "not present".
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            log.debug("Couldn't find %s in the html.", metakey)
        else:
            # Lazy %-style arguments: formatting a non-ASCII unicode value
            # with str.format on a byte-string template raises
            # UnicodeEncodeError on Python 2, which used to abort extraction.
            log.debug("Found %s with value %s", metakey, value)
            output[paperkey] = value

    return output


def populate_metadata_from_tree(tree, paper,
                                meta_attribute_mapping=meta_attribute_mapping):
    """
    Update paper metadata based on data from parsing the html tree.

    Each key/value pair returned by extract_metadata is set as an attribute
    on ``paper``; attributes for metadata that was not found are left
    untouched.

    :param tree: parsed html tree
    :param paper: object whose attributes are set in place
    :param meta_attribute_mapping: dict of meta tag name -> attribute name
    """
    data = extract_metadata(tree,
                            meta_attribute_mapping=meta_attribute_mapping)

    # items() instead of iteritems(): works on both Python 2 and Python 3.
    for (key, value) in data.items():
        # Lazy %-style arguments avoid the UnicodeEncodeError that str.format
        # on a byte-string template raises for non-ASCII values on Python 2.
        log.debug("metadata | %s => %s", key, value)
        setattr(paper, key, value)