#!/usr/bin/python
#author: Bryan Bishop
#url: http://heybryan.org/
#date: 2010-03-26
#filename: thingiverse.py
#license: gpl2+
#purpose: simple demo interface to thingiverse.com, not to be used in production environments
import os
from BeautifulSoup import BeautifulSoup
import pycurl
from StringIO import StringIO
from string import join
import yaml
#sudo apt-get install python-pip; pip install -e git+git://github.com/kanzure/optfunc.git#egg=optfunc
import optfunc

def download_thingiverse_partial(start=0, end=434):
    '''creates a local backup of thing:start through thing:end-1
    (originally used to mirror thing:434 through thing:2073)'''
    if not os.path.exists("thingiverse_data"):
        os.mkdir("thingiverse_data")
    for current in range(start, end):
        os.system("cd thingiverse_data; wget http://thingiverse.com/thing:%s --output-document \"%s\"" % (str(current), str(current)))

def title_fixer(messy_title):
    '''turns a messy thingiverse page title into a url-safe unix name'''
    to_space = ["[", "]", "...", " | ", ". ", "!", "<", ">", "&", ";", ":-)", ":)", "(", ")", "/", "...", " ", "?"]
    to_blank = ["$", "'"]
    to_dash = [" - ", " ", "--", u"\xe3\x80\x80"] #last entry is a mis-decoded ideographic space (the utf-8 bytes of u+3000)
    if type(messy_title) == unicode:
        messy_title = messy_title.encode('ascii', 'replace')
    for char in to_space:
        messy_title = messy_title.replace(char, " ")
    for char in to_blank:
        messy_title = messy_title.replace(char, "")
    for char in to_dash:
        messy_title = messy_title.replace(char, "-")
    #trim at the first comma and strip stray leading/trailing punctuation
    if messy_title.count(",") > 0:
        messy_title = messy_title.split(",")[0]
    if messy_title[-1] == "-":
        messy_title = messy_title[:-1]
    if messy_title[0] == "-":
        messy_title = messy_title[1:]
    if messy_title[-1] == ".":
        messy_title = messy_title[:-1]
    return messy_title.lower()

#keep track of which unix names are already taken
thingiverse_thing_names = []

def extract_image_links(html, http=False):
    '''finds all links to /image: pages. set http to True if you want the
    full thingiverse.com link'''
    relevant_links = []
    all_links = html.findAll(name="a")
    for link in all_links:
        for arg in link.attrs:
            if arg[0] == "href" and arg[1][0:7] == "/image:":
                if http:
                    relevant_links.append("http://thingiverse.com" + arg[1])
                else:
                    relevant_links.append(arg[1])
    return relevant_links

def extract_images(html):
    '''finds all thumbnail images'''
    relevant_images = []
    all_images = html.findAll(name="img")
    for image in all_images:
        for arg in image.attrs:
            if arg[0] == "src" and "renders" in arg[1]:
                relevant_images.append(arg[1])
    return relevant_images

def extract_h2(html, name):
    '''returns the text of the section introduced by an <h2> matching name'''
    desc = html.findAll(name="h2", text=name)
    if len(desc) == 0:
        return ""
    desc = desc[0].parent
    guh = []
    for each in desc.nextSibling.nextSibling.contents:
        guh.append(str(each))
    actual_description = join(guh).strip()
    return actual_description

def extract_license(html):
    license_link = html.findAll(name="a", attrs={"rel": "license"})[0]
    license_link = license_link.attrs[1][1] #value of "href"
    return license_link

def extract_description(html):
    return extract_h2(html, "Description")

def extract_instructions(html):
    return extract_h2(html, "Instructions")

def extract_files(soup):
    '''returns a list of {filename, link, size, downloads} dictionaries,
    one per file attached to the thing'''
    files = []
    h3s = soup.findAll(attrs={"class": "file_info"})
    for h3 in h3s:
        filename = h3.contents[0].strip()
        link = h3.parent.parent.findAll(name="a")[0].attrs[0][1]
        stats = h3.parent.contents[2].strip().split("/")
        messy_downloads = stats[1]
        downloads = ''.join([letter for letter in messy_downloads if letter.isdigit()])
        messy_size = stats[0]
        size = ''.join([letter for letter in messy_size if letter.isdigit()])
        if "MB" in messy_size:
            size = size + " MB"
        if "kb" in messy_size:
            size = size + " kb"
        files.append({"filename": filename, "link": link, "size": size, "downloads": downloads})
    return files
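#a quick doctest-style illustration of title_fixer(); both sample titles are
#made up for demonstration and are not taken from thingiverse:
#
#    >>> print title_fixer("Hello World!")
#    hello-world
#    >>> print title_fixer("Hello, World!")  #everything after the first comma is dropped
#    hello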
open("thingiverse_data/" + str(thing_id)) original_html = file_handler.read() file_handler.close() html = BeautifulSoup(original_html) messy_title, messy_author = html.findAll(attrs={"id": "pageTitle"})[0].contents #fix up the title a bit messy_title = messy_title[:-4] #get rid of the " by " at the end unix_name = title_fixer(messy_title) #make sure the name isn't already used #also a cheap hack to fix the name in those awkward situations if unix_name in thingiverse_thing_names: current = 0 while unix_name in thingiverse_thing_names: unix_name = unix_name + str(current) current = current + 1 #make sure nothing else uses this name thingiverse_thing_names.append(unix_name) #get the author deets author = messy_author.contents[0] image_links = extract_image_links(html) image_thumbnails = extract_images(html) description = extract_description(html) instructions = extract_instructions(html) license = extract_license(html) files = extract_files(html) #a list of filenames and the link #TODO: discussion #print "unix_name: ", unix_name #print "author: ", author #print "image_links: ", image_links #print "image_thumbnails: ", image_thumbnails #if description and not description == "": print "description: ", description #if instructions and not instructions == "": print "instructions: ", instructions #print "files: ", files if not os.path.exists("thingiverse_packages/" + str(unix_name)): os.mkdir("thingiverse_packages/" + str(unix_name)) metadata = {"name": str(unix_name).encode("ascii", "replace"), "author": str(author).encode('ascii', 'replace'), "urls": ["http://thingiverse.com/thing:" + str(thing_id)], "image_links": image_links, "image_thumbnails": image_thumbnails, "description": str(description).encode("ascii", "replace"), "files": files, "license": str(license)} #metadata = {"author": author} output = yaml.dump(metadata).encode("ascii", "replace") file_handle = open("thingiverse_packages/" + str(unix_name) + "/metadata.yaml", "w") file_handle.write(output) file_handle.close() #download the files for file in files: link = file["link"] os.system("cd thingiverse_packages/" + str(unix_name) + "/; wget http://thingiverse.com" + str(link) +";") def tester(): #just process one for now #process_thing("1446") #exit() for each in os.listdir("thingiverse_data"): try: print "each: ", each process_thing(each) except: pass def thingiverse_get(thing_id): files = [] if thing_id.count("thing:") > 0: thing_id = thing_id.replace("thing:", "") buffer1 = StringIO() curl = pycurl.Curl() curl.setopt(curl.URL, "http://www.thingiverse.com/thing:" + str(thing_id)) curl.setopt(curl.VERBOSE, 0) curl.setopt(curl.WRITEFUNCTION, buffer1.write) curl.perform() curl.close() html = buffer1.getvalue().strip() soup = BeautifulSoup(html) h3s = soup.findAll(attrs={"class": "file_info"}) for h3 in h3s: filename = h3.contents[0].strip() link = h3.parent.parent.findAll(name="a")[0].attrs[0][1] files.append((filename, link)) output = "Which file do you want to download?\n\n" counter = 0 for file in files: filename = file[0] output = output + "\t[" + str(counter) + "] " + filename + "\n" counter = counter + 1 output = output + "\n" print output selection = raw_input("Please type a number: ") files_to_get = [int(selection)] #if selection.count(",") > 0: # files_to_get = selection.split(",") for file in files_to_get: print "Beginning download of: ", files[file][0] os.system("wget \"http://thingiverse.com" + str(files[file][1]) + "\" --output-document=\"" + files[file][0] + "\"") if __name__ == "__main__": optfunc.run(thingiverse_get) 
if __name__ == "__main__":
    optfunc.run(thingiverse_get)
    #optfunc.run(download_thingiverse_partial)
    #optfunc.run(tester)
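#a sketch of the batch workflow using the other two entry points: swap in one
#of the commented optfunc.run() calls above, or run both steps from an
#interactive session (the 434..2073 range comes from
#download_thingiverse_partial's original use):
#
#    >>> download_thingiverse_partial(start=434, end=2073) #mirror raw pages into thingiverse_data/
#    >>> tester() #parse them all into thingiverse_packages/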