paperbot/plugins/plugin.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35

"""
Basic plugin system for different scrapers.
"""

import logging
# Module-level logger, registered under this module's dotted import path.
log = logging.getLogger("paperbot.plugins.plugin")


class Plugin(object):
    """
    Base class for publisher-specific scraper plugins.

    The base class carries no behavior of its own: both hooks below raise
    NotImplementedError, so a concrete plugin has to override each of them.
    """

    @staticmethod
    def check_url(url):
        """
        Report whether this plugin is responsible for the given url.

        :param url: url of the page under consideration
        :rtype: bool
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError

    @staticmethod
    def scrape(url, tree, pdfmeta):
        """
        Pull extra paper metadata (the pdf url among it) out of the page and
        hand pdfmeta back to the caller.

        :param url: url the page content was fetched from
        :param tree: lxml.etree (parsed html)
        :param pdfmeta: a Paper model holding the data extracted so far
        :type pdfmeta: Paper
        :rtype: Paper
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError