summaryrefslogtreecommitdiff
path: root/other-code/sciencedirect-notes.py
blob: 05cc890a23f3ed53cfb4894a695f13cdb5f43cc3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/python
import gsearch
import re

#related articles href =
#gs("/html/body/div[1]/div/div[@id='sdBody']/div/div[@id='rightCol']/div/div[@id='searchResults']/div[@id='bodyMainResults']/table[1]/tr/td[2]/div/a[2]")[0].values()[0]

#sciencedirect results page
#paper_title and paper_link
#paper_temp = gs("/html/body/div[1]/div/div[@id='sdBody']/div/div[@id='rightCol']/div/div[@id='searchResults']/div[@id='bodyMainResults']/table/tr/td[2]/a")
#for each in paper_temp:
#    paper_title = ""
#    paper_link = ""
#    for piece in paper_temp.itertext():
#        paper_title = paper_title + piece
#    paper_link = paper_temp.items()[0][1]


results = gsearch.gs("/html/body/div[1]/div/div[@id='sdBody']/div/div[@id='rightCol']/div/div[@id='searchResults']/div[@id='bodyMainResults']/table")
number_of_results = len(results)
for result in results:
    paper_title = ""
    paper_link = ""
    related_articles_href = result.xpath("tr/td[2]/div/a[contains(.,'Related')]")[0].values()[0]
    title_temp = result.xpath("tr/td[2]/a[1]")
    for one in title_temp:
        for piece in one.itertext():
            paper_title = paper_title + piece
        paper_link = one.items()[0][1]
    #now for the PDF link
    pdf_link = ""
    pdf_results = result.xpath("tr/td[2]/div/a[contains(.,'PDF')]")
    for piece in pdf_results:
        query = re.compile("PDF")
        text = ""
        for piece2 in piece.itertext():
            text = text + piece2
        if len(query.findall(text))>0:
            pdf_link = piece.values()[0]

    #paper_title
    #paper_link
    #related_articles_href
    #pdf_link
    print "paper_title = ", paper_title.encode("utf-8","backslashreplace")
    print "paper_link = ", paper_link.decode("iso-8859-15")
    print "related_link = ", related_articles_href
    print "pdf = ", pdf_link