#!/usr/bin/env python
"""Load a URL in a headless QtWebKit page, inject jQuery and run demo scrapes.

Usage: <script> --url URL --file PATH
"""
import os
import sys
import signal
from optparse import OptionParser
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import QWebPage

JQUERY_URL = "http://jquery.com/src/jquery-latest.js"
JQUERY_FILE = "jquery.js"
# The jQuery source is read from a file sitting next to this script.
JQUERY_PATH = os.path.join(os.path.dirname(__file__), JQUERY_FILE)


class WebPage(QWebPage):
    """QWebPage that echoes JavaScript console messages to stdout."""

    def javaScriptConsoleMessage(self, message, line_number, source_id):
        """Write *message* followed by a newline to stdout.

        ``line_number`` and ``source_id`` are accepted but not used.
        """
        sys.stdout.write(message)
        sys.stdout.write("\n")


class Crawler(WebPage):
    """Load a single URL and run :meth:`scraper` once loading finishes."""

    def __init__(self, url, file):
        """Store the target *url* and the destination *file* path."""
        WebPage.__init__(self)
        self._url = url
        self._file = file

    def crawl(self):
        """Start loading the configured URL.

        The scrape itself happens in :meth:`_finished_loading`, which is
        connected to the page's ``loadFinished(bool)`` signal.
        """
        # Restore the default SIGINT action so Ctrl+C terminates the process.
        signal.signal(signal.SIGINT, signal.SIG_DFL)
        self.connect(self, SIGNAL("loadFinished(bool)"), self._finished_loading)
        self.mainFrame().load(QUrl(self._url))

    def _finished_loading(self, status):
        """Inject jQuery into the loaded page, then hand over to the scraper.

        ``status`` is the boolean carried by ``loadFinished(bool)``.  A false
        value is reported on stderr, but scraping is still attempted.
        """
        if not status:
            sys.stderr.write(
                "warning: page load reported failure for %s\n" % self._url)

        # Inject jQuery; ``with`` closes the file even if read() raises.
        with open(JQUERY_PATH, "r") as jq_file_handler:
            jquery = jq_file_handler.read()
        self.mainFrame().evaluateJavaScript(jquery)

        # Pass it on.
        self.scraper()

    def scraper(self):
        """Print a few facts about the loaded page, then exit the process."""
        frame = self.mainFrame()

        # NOTE(review): the destination file (self._file) is never written;
        # the code that would save the page HTML is disabled below.
        #file = open(self._file, "w")
        #file.write(self.mainFrame().toHtml())
        #file.close()

        print("main tag name: " + frame.documentElement().tagName())
        print("current document title is: " + frame.title())

        # This is how you handle a hash: the JS object comes back as a map.
        js_map = frame.evaluateJavaScript(
            "x={'key': 'value', 'key2': 'value2'};x").toMap()
        print("result is: " + str(js_map.keys()))

        # Let's test console output (routed via javaScriptConsoleMessage).
        frame.evaluateJavaScript(
            "console.log(\"this message indicates that console.log() is working.\")")

        # How about something more complicated?
        frame.evaluateJavaScript("function thinker() { return 55; }")
        # NOTE(review): this evaluates the identifier `thinker`, not a call
        # `thinker()` -- confirm whether calling the function was intended.
        result = frame.evaluateJavaScript("thinker").toString()
        print("result is: " + result)

        #self.mainFrame().evaluateJavaScript("function changer() { document.title = 'billy bob'; }")
        #changer = self.globalObject().property("changer")

        # Scraping is done; end the process (main() is still in app.exec_()).
        sys.exit(0)


def main():
    """Create the Qt application, start the crawl and run the event loop."""
    app = QApplication(sys.argv)
    options = get_cmd_options()
    crawler = Crawler(options.url, options.file)
    crawler.crawl()
    sys.exit(app.exec_())


def get_cmd_options():
    """Parse and validate the command-line options.

    Returns the optparse options object, with ``url`` and ``file`` set.
    Prints a message and exits with status 1 if either option is missing.
    """
    usage = "usage: %prog [options] args"
    parser = OptionParser(usage)
    parser.add_option("-u", "--url", dest="url",
                      help="URL to fetch data from")
    parser.add_option("-f", "--file", dest="file",
                      help="Local file path to save data to")
    options, _args = parser.parse_args()

    if not options.url:
        print("You must specify an URL. %s --help for more details"
              % sys.argv[0])
        # sys.exit rather than the site-injected exit() builtin, which is
        # not guaranteed to exist (e.g. under ``python -S``).
        sys.exit(1)
    if not options.file:
        print("You must specify a destination file. %s --help for more details"
              % sys.argv[0])
        sys.exit(1)
    return options


if __name__ == '__main__':
    main()