blob: 08f30599f48ccb8bb2036f8f151af97064f1d1ee (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
|
"""
Basic plugin system for different scrapers.
"""
import logging
# Module-level logger registered under the explicit dotted name
# "paperbot.plugins.plugin". Nothing in the visible part of this file uses it.
log = logging.getLogger("paperbot.plugins.plugin")
class Plugin(object):
    """
    Pluggable system for loading different scrapers for different publishers.

    This base class only declares the interface: both hooks below are static
    methods that unconditionally raise ``NotImplementedError``. A concrete
    plugin subclasses ``Plugin`` and overrides both of them.
    """

    @staticmethod
    def check_url(url):
        """
        Check the url to determine if this plugin handles the url.

        :param url: url of the page under consideration
        :rtype: bool
        :raises NotImplementedError: always, in this base class
        """
        # Name the missing hook so a plugin author sees what to override.
        raise NotImplementedError(
            "Plugin subclasses must implement check_url()"
        )

    @staticmethod
    def scrape(url, tree, pdfmeta):
        """
        Extract additional metadata about the paper from the page, including
        its pdf url. Returns pdfmeta.

        :param url: url where page content is from
        :param tree: lxml.etree (parsed html)
        :param pdfmeta: a Paper model representing extracted data
        :type pdfmeta: Paper
        :rtype: Paper
        :raises NotImplementedError: always, in this base class
        """
        # Name the missing hook so a plugin author sees what to override.
        raise NotImplementedError(
            "Plugin subclasses must implement scrape()"
        )
|