#!/usr/bin/python
#author: Bryan Bishop
#date: 2011-03-11
"""Walk cyrket.com search-result pages and print the links found on them.

Written for Python 2 (``StringIO``, ``unicode``, BeautifulSoup 3).
"""
import os
import sys
import pycurl
from urllib import urlencode
from StringIO import StringIO
from BeautifulSoup import BeautifulSoup

user_agent = "nook browser/1.0"

# The crawl requests offsets 0, 30, 60, ... for this many pages.
PAGE_SIZE = 30
PAGE_COUNT = 1667


def construct_url(iter=0):
    """Return the search URL whose ``o`` query parameter is set to *iter*.

    The parameter keeps its original name (it shadows the ``iter`` builtin)
    because callers pass it by keyword.
    """
    return "http://www.cyrket.com/search?category=6953&market=android&sort=popular&o=%s" % (iter,)


def grab_html(url, url_params="", use_cookies=True, follow_redirects=False, verbosity=0):
    """Fetch *url* with pycurl and return the response body as a string.

    url_params: already-encoded query string appended to *url*; it is joined
        with "&" when *url* already contains a "?", otherwise with "?".
    use_cookies: when true, cookies are read from and written to
        "cookies.txt" (a relative path).
    follow_redirects: when true, sets curl's FOLLOWLOCATION option.
    verbosity: value handed to curl's VERBOSE option.

    The returned body has leading and trailing whitespace stripped.
    Errors raised by ``curl.perform()`` propagate to the caller; the curl
    handle is closed in either case.
    """
    body = StringIO()
    curl = pycurl.Curl()
    try:
        if url_params:
            # Do not emit a second "?" when the URL already has a query string.
            separator = "&" if "?" in url else "?"
            curl.setopt(curl.URL, url + separator + url_params)
        else:
            curl.setopt(curl.URL, url)

        if use_cookies:
            cookies = "cookies.txt"
            curl.setopt(curl.COOKIEJAR, cookies)
            curl.setopt(curl.COOKIEFILE, cookies)

        if follow_redirects:
            curl.setopt(curl.FOLLOWLOCATION, 1)

        curl.setopt(curl.VERBOSE, verbosity)
        curl.setopt(curl.USERAGENT, user_agent)
        curl.setopt(curl.TIMEOUT, 20)
        curl.setopt(curl.WRITEFUNCTION, body.write)
        curl.perform()
    finally:
        # Release the handle even when the transfer fails.
        curl.close()

    return body.getvalue().strip()


def grab_android_links(input_html):
    """Return, and print, every anchor href in *input_html* containing "com".

    input_html: byte string of HTML; bytes that cannot be decoded with the
        default codec are dropped before parsing.

    Each ``<a>`` tag is examined once. Tags without an ``href`` attribute are
    skipped. The filter is a plain substring test for "com", so it is not
    limited to any particular site.
    """
    html = BeautifulSoup(unicode(input_html, errors="ignore"))

    links = []
    for link in html.findAll(name="a"):
        href = link.get("href")
        if not href:
            continue
        if "com" in href:
            links.append(href)
            # Parenthesised form prints the same under Python 2 and also
            # parses under Python 3.
            print(href)
    return links


def main():
    """Fetch every search-result page in turn and print the links on each."""
    for page in range(PAGE_COUNT):
        url = construct_url(iter=page * PAGE_SIZE)
        html = grab_html(url)
        grab_android_links(html)


if __name__ == "__main__":
    main()