""" Orchestration for downloading paper metadata and downloading the pdf. Also, storage and debugging of failed requests. """ import os import random from StringIO import StringIO import logging log = logging.getLogger("paperbot.orchestrate") import requests import pdfparanoia from logstuff import loghijack from paper import Paper from storage import ( store, store_json, store_logs, ) from ezproxy import EZPROXY_CONFIG from httptools import ( run_url_fixers, is_same_url, ) from htmltools import ( parse_html, populate_metadata_from_tree, ) from libgen import ( make_libgen_doi_url, check_libgen_has_paper, upload_to_libgen, ) USER_AGENT_RAND = "%0.2x" % random.getrandbits(8) USER_AGENT = os.environ.get("USER_AGENT", "pdf-defense-force-" + USER_AGENT_RAND) DEFAULT_HEADERS = { "User-Agent": USER_AGENT, } def is_response_pdf(response): """ Determines if the response contains a pdf. """ return "pdf" in response.headers["content-type"] def remove_watermarks(pdfcontent): """ Use pdfparanoia to remove watermarks from the pdf. """ log.debug("Removing pdf watermarks.") pdfcontent = pdfparanoia.scrub(StringIO(pdfcontent)) return pdfcontent def iterdownload(url, paper, headers=DEFAULT_HEADERS, ezproxy_config=EZPROXY_CONFIG): """ Download the content at the remote url. Use a variety of methods. Not all methods are always necessary. Sometimes none of the methods will return the desired content. """ # list of responses paper.history = [] # attempt to get without using ezproxy log.debug("Attempting HTTP GET {}".format(url)) response = requests.get(url, headers=headers) paper.history.append(response) yield (url, response) for ezproxyconf in ezproxy_config: ezproxyurl = ezproxyconf["url"] # POSTable data to login to this ezproxy proxydata = ezproxyconf["data"] # construct url based on ezproxy url plus desired url attempturl = ezproxyurl + url # ezproxy attempt log.debug("Attempting ezproxy HTTP {}".format(attempturl)) response = requests.post(attempturl, data=proxydata, headers=headers) paper.history.append(response) # maybe this response is acceptable? yield (attempturl, response) def download(url, paper=None): """ Main entry point for executing paperbot's primary function, paper fetching. The given url may be to a pdf file, which should be archived, or it may be to an academic publisher's website which points to a paper. The paper needs to be downloaded and the metadata should be stored. Returns a tuple of (paper, json_path, pdf_path, logpath). :param url: url to fetch and examine :type url: str """ # store logs in tempfile (templogpath, loghandler) = loghijack() if paper is None: paper = Paper.create({}) # clean up url if necessary url = run_url_fixers(url) # whether or not metadata has already been populated populated_metadata = False for (url2, response) in iterdownload(url, paper=paper): if is_response_pdf(response): log.debug("Got pdf.") pdfcontent = remove_watermarks(response.content) paper.pdf = pdfcontent store(paper) break paper.html = response.content # Was not pdf. Attempt to parse the HTML based on normal expected # HTML elements. The HTML elements may say that the actual pdf url # is something else. If this happens, then attempt to download that # pdf url instead and then break out of this loop. # no reason to get same metadata on every iteration of loop if not populated_metadata: tree = parse_html(response.content) # most publishers show paper metadata in html in same way because ? 
            populate_metadata_from_tree(tree, paper)

            # TODO: better way to check if populate_metadata_from_tree did
            # anything useful?
            if paper.title in [None, ""]:
                log.debug("# TODO: parse metadata from html using plugins here")
            else:
                populated_metadata = True

        # can't try anything else if the url is still bad
        if paper.pdf_url in [None, ""]:
            continue

        # Normalize the two urls. The url from the metadata on the page
        # might be different from the url that was originally passed in,
        # even though both urls might still refer to the same resource.
        if is_same_url(url, paper.pdf_url):
            # pdf_url is the same as the original url, and no pdf has been
            # found yet. This happens when the pdf url is correct but the
            # publisher returns html instead, and that html happens to
            # reference the url that was originally requested. Argh.
            continue

        log.debug("Switching activity to pdf_url {}".format(paper.pdf_url))

        # The paper pdf is stored at a different url. Attempt to fetch that
        # url now. Only do this when pdf_url != url, because otherwise this
        # would be an endless loop.
        for (url3, response2) in iterdownload(paper.pdf_url, paper=paper):
            if is_response_pdf(response2):
                log.debug("Got pdf on second-level page.")
                # note: scrub the second-level response, not the first one
                pdfcontent = remove_watermarks(response2.content)
                paper.pdf = pdfcontent
                store(paper)
                break
        else:
            log.debug("Couldn't download pdf from {}".format(paper.pdf_url))
        break

    # was pdf downloaded?
    if (hasattr(paper, "pdf") and paper.pdf not in [None, ""]) or \
       os.path.exists(paper.file_path_pdf):
        fetched = True
    else:
        fetched = False

    hasdoi = (paper.doi not in [None, ""])

    if hasdoi:
        # check if libgen has this paper already
        libgenhas = check_libgen_has_paper(paper.doi)

        if fetched and not libgenhas:
            # upload if libgen doesn't already have it
            upload_to_libgen(paper.file_path_pdf, paper.doi)
        elif not fetched and libgenhas:
            urldoi = make_libgen_doi_url(paper.doi)

            # get from libgen
            log.debug("Haven't yet fetched paper. Have doi. Also, libgenhas.")
            log.debug("HTTP GET {}".format(urldoi))
            response = requests.get(urldoi, headers=DEFAULT_HEADERS)

            if is_response_pdf(response):
                log.debug("Got pdf from libgen.")
                # skip pdfparanoia because it's from libgen
                pdfcontent = response.content
                paper.pdf = pdfcontent
                store(paper)
                fetched = True
            else:
                log.debug("libgen lied about haspdf :(")
    else:
        log.debug("Don't know doi, can't check if libgen has this paper.")
        libgenhas = None

    # store(paper) usually handles json, but in case of failure there needs
    # to be an explicit save of the paper metadata.
    if not fetched:
        store_json(paper)

    # move logs into position
    logpath = store_logs(paper, templogpath)

    # remove loghandler from logger
    mainlogger = logging.getLogger("paperbot")
    mainlogger.handlers.remove(loghandler)

    return (paper, paper.file_path_json, paper.file_path_pdf, logpath)
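

# A minimal usage sketch, not part of the original module: download() is the
# only entry point callers need. The default url below is hypothetical, and
# logging configuration is assumed to happen elsewhere in paperbot.
if __name__ == "__main__":
    import sys
    target_url = sys.argv[1] if len(sys.argv) > 1 else "http://example.com/paper"
    (paper, json_path, pdf_path, logpath) = download(target_url)
    print("json: {0}, pdf: {1}, logs: {2}".format(json_path, pdf_path, logpath))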