1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
|
#!/usr/bin/python
import yaml
import re
import lxml.etree
from BeautifulSoup import BeautifulSoup
#import BSXPath.BSXPathEvaluator, BSXPath.XPathResult
from BSXPath import BSXPathEvaluator, XPathResult
#http://www.crummy.com/software/BeautifulSoup/documentation.html
class Nature(yaml.YAMLObject):
    """Placeholder translator for Nature pages, YAML-tagged '!nature'."""

    yaml_tag = '!nature'

    def __init__(self):
        # No per-instance state yet.
        pass

    def yaml_repr(self):
        # Custom YAML representation not implemented yet.
        pass

    def __repr__(self):
        return "nature object"
class ScienceDirect(yaml.YAMLObject):
yaml_tag='!sciencedirect'
def __init__(self):
pass
def yaml_repr(self):
pass
def __repr__(self):
return "sciencedirect object"
def detectWeb(self, doc, url):
if type(doc) == type(""):
doc = BSXPathEvaluator(doc)
if url.count("_ob=DownloadURL") != 0 or doc.title == "ScienceDirect Login":
return False
if ((not re.match("pdf",url)) and url.count("_ob=ArticleURL")==0 and url.count("/article/")==0) or url.count("/journal/") != 0:
return "multiple"
elif not re.match("pdf",url):
return "journalArticle"
return False
def doWeb(self, doc, url):
if type(doc) == type("huh"): #then it's not BeautifulSoup
tree = lxml.etree.fromstring(doc, lxml.etree.HTMLParser())
links = tree.xpath("/html/body/div[1]/div/div[@id='sdBody']/div/div[@id='rightCol']/div/div[@id='searchResults']/div[@id='bodyMainResults']")
#print "links = ", links
#for each in links:
# print type(links[0])
document = BSXPathEvaluator(doc)
else: document = doc
if document.evaluate("//*[contains(@src, \"exportarticle_a.gif\")]", document, None, XPathResult.ANY_TYPE, None):
articles = []
if (self.detectWeb(doc, url) == "multiple"):
#search page
items = {}
xpath = None
if (url.count("_ob=PublicationURL") > 0):
xpath = '//table[@class="resultRow"]/tbody/tr/td[2]/a'
else:
xpath = '//div[@class="font3"][@id="bodyMainResults"]/table/tbody/tr/td[2]/a'
rows = document.evaluate(xpath, document, None, XPathResult.ANY_TYPE, None)
print rows
next_row = None
#for next_row in rows.iterateNext():
isTrue = True
next_row = rows
while isTrue:
try: next_row=rows.iterateNext()
except IndexError: isTrue=False
#while (next_row = rows.iterateNext()):
print next_row.__dict__
title = "some title here" #next_row.text
link = "some href here" #next_row.href
if not re.match("PDF \(",title) and not re.match("Related Articles",title): items[link] = title;
#items = zotero.SelectItems(items)
#let's assume we want all of them
[articles.append(i) for i in items]
result_sets = []
for article in articles:
result_sets.append({'article':article})
else:
articles = [url]
return_sets = [{"currentdoc":doc}]
if len(articles) == 0:
print "ERROR: no items were found"
return
print "articles = ", articles
print "result_sets = ", result_sets
return result_sets #return all articles or the currentdoc in a dict for stuff that we want to grab
|