#!/usr/bin/python
"""Scrapers for locally cached Thingiverse HTML pages.

Two ``thing`` subclasses are defined here:

* ``thingi_index`` parses a cached listing page and registers one
  ``thingi`` (plus its thumbnail and author) per ``thing_float`` element.
* ``thingi`` parses a cached detail page and registers its tags,
  downloads and images.

Importing this module creates the cache directory tree under ``folder``.
"""
import os
import re
import string
import time

from BeautifulSoup import BeautifulSoup

from thing import thing
from tag import tag
from download import download
from thumb import thumb
from image import image
from like import like
from author import author

# Root of the on-disk cache and the sub-directories expected below it.
folder = 'cache/thingiverse_data'
sub_folders = ['newest', 'things', 'thumbs', 'download', 'images', 'authors']


def _ensure_dir(path):
    """Create *path* (and any missing parents) unless it already exists."""
    if not os.path.isdir(path):
        os.makedirs(path)


# Build the cache tree at import time. makedirs is used so that a missing
# parent directory ('cache') no longer makes the import fail.
_ensure_dir(folder)
for _sub in sub_folders:
    _ensure_dir(folder + '/' + _sub)

# Collapses every run of non-word characters into one space (used for names).
rx = re.compile(r'\W+')


class thingi_index(thing):
    """A cached listing page containing several ``thing_float`` entries."""

    def __init__(self, sdb, parent=''):
        """Initialise the base ``thing`` and mark the file type.

        ``sdb`` and ``parent`` are forwarded unchanged to ``thing.__init__``
        (presumably a database handle and a parent uuid -- confirm in
        ``thing``).
        """
        thing.__init__(self, sdb, parent)
        self.ftype = 'thingi_index'

    def check(self, sdb):
        """Always report success; no validation is performed here."""
        return True

    def process(self):
        """Parse the file at ``self.path`` and scrape every listed thing.

        Returns True unconditionally; the result of each individual
        ``float_scrape`` call is ignored.
        """
        print('do stuff with file ' + self.path)
        with open(self.path) as handle:
            data = handle.read()
        try:
            soup = BeautifulSoup(data)
        except Exception:
            # Best effort: treat unparseable markup as an empty page.
            soup = BeautifulSoup('')
        for entry in soup.findAll(attrs={'class': 'thing_float'}):
            self.float_scrape(entry, self.sdb)
        return True

    def float_scrape(self, i, sdb):
        """Register the thing, thumbnail and author found in one entry.

        ``i`` is the parsed ``thing_float`` element. Returns False when the
        entry has no usable numeric id in its link, or when inserting the
        thing reports ``False``; otherwise returns None.
        """
        t = thingi(sdb)

        # The detail link lives in the 'thing_info' element; the numeric id
        # is whatever follows the last ':' in its href.
        info = i.find(attrs={'class': 'thing_info'})
        anchor = info.find(name='a') if info is not None else None
        href = anchor.get('href') if anchor is not None else None
        t.url = href or ''
        try:
            t.id = int(t.url.split(':')[-1])
        except ValueError:
            # Missing or malformed link: skip this entry instead of letting
            # int('') abort the scrape of the whole index page.
            return False
        t.path = folder + '/things/' + str(t.id) + '.html'

        # get thumbnail
        t2 = thumb(sdb, t.uuid)
        t2.id = t.id
        t2.url = i.find(name='img').get('src')
        t2.ftype = 'jpg'
        t2.path = folder + '/thumbs/' + str(t.id) + '.jpg'
        t2.insert_line(sdb)

        r = i.findAll(name='a')
        if len(r) > 1:
            try:
                # Second link carries the thing name, third the author.
                t.name = rx.sub(' ', r[1].string)
                t.author = r[2].string
                t3 = author(sdb)
                t3.url = 'http://thingiverse.com/' + t.author
                t3.path = folder + '/authors/' + str(t.author)
                t3.name = t.author
                t3.insert_line(sdb)
            except Exception:
                # Best effort: dump the offending link and carry on.
                print(dir(r[1]))
                print(r[1])

        # NOTE(review): the original indentation was lost; this check is
        # assumed to run for every entry, not only those with a name link.
        if t.insert_line(sdb) == False:
            return False


class thingi(thing):
    """A cached detail page for a single thing."""

    def __init__(self, sdb, parent=''):
        """Initialise the base ``thing`` and mark the file type."""
        thing.__init__(self, sdb, parent)
        self.ftype = 'thingi'

    def process(self):
        """Parse the file at ``self.path`` and register what it references.

        Populates ``self.s`` (the parsed document), ``self.tags``,
        ``self.likes``, ``self.description``, ``self.instructions``,
        ``self.downloads`` and ``self.small_images``, and inserts one
        ``tag``, ``download`` and ``image`` record per item found.

        Returns False when the page has fewer than two ``div`` elements,
        True otherwise. An IndexError is raised if the page has fewer than
        three ``h3`` headings.
        """
        with open(self.path) as handle:
            data = handle.read()
        s = BeautifulSoup(data)
        parent = self.uuid
        sdb = self.sdb
        self.s = s

        divs = s.findAll(name='div')
        if len(divs) < 2:
            return False
        m = divs[1]

        # tags
        self.tags = {}
        tag_list = m.find(name='ul', attrs={'class': 'tags'})
        if tag_list is not None:
            # Only look at links inside the list itself; the tag name is the
            # part of the href after the first ':'.
            for anchor in tag_list.findAll(name='a'):
                pieces = (anchor.get('href') or '').split(':')
                if len(pieces) > 1:
                    # Key by the parsed name (the original keyed by the
                    # imported ``tag`` class, losing every real tag).
                    self.tags[pieces[1]] = '.'
        for tag_name in self.tags:
            t = tag(sdb, parent)
            t.processed = 1
            t.name = str(tag_name)
            t.insert_line(sdb)

        # likes
        # NOTE: likes are collected on the instance but not persisted; the
        # code that inserted ``like`` records was already disabled.
        self.likes = {}
        like_list = s.findAll(name='div', attrs={'id': 'like_content'})
        if len(like_list) > 0:
            for anchor in like_list[0].findAll(name='a'):
                self.likes[anchor.get('href').split('/')[-1]] = '.'

        # The element following the 2nd / 3rd <h3> holds the description /
        # instructions markup respectively.
        sections = s.findAll(name='h3')
        self.description = sections[1].findNext().decodeContents().strip()
        self.instructions = sections[2].findNext().decodeContents().strip()

        # downloads: every table after the first two describes one file.
        self.downloads = {}
        for table in s.findAll(name='table')[2:]:
            name = table.find(name='h3').string.strip()
            self.downloads[name] = table.find(name='a').get('href')
        for name in self.downloads:
            link = self.downloads[name]
            t = download(sdb, parent)
            # No separator is added: the href is concatenated as-is
            # (presumably it starts with '/' -- confirm against real pages).
            t.path = folder + '/download' + link
            t.url = 'http://www.thingiverse.com' + link
            t.name = str(name)
            t.insert_line(sdb)

        # small images, keyed by the last path segment of the parent's href
        self.small_images = {}
        for img in s.findAll(attrs={'class': 'render'}):
            target = str(img.parent.get('href'))
            self.small_images[target.split('/')[-1]] = img.get('src')
        for name in self.small_images:
            t = image(sdb, parent)
            t.ftype = 'image'
            t.url = self.small_images[name]
            t.name = str(name)
            t.path = folder + '/images/' + str(name)
            t.insert_line(sdb)

        return True

    def __astr__(self):
        """Return description, downloads, tags and likes as one text blob.

        Each part is followed by a blank line. Requires ``process`` to have
        populated the attributes first.

        NOTE(review): the name is ``__astr__``, so ``str()`` does not use
        this method -- possibly a typo for ``__str__``; left unchanged.
        """
        parts = [
            self.description,
            str(self.downloads),
            str(self.tags),
            str(self.likes),
        ]
        return ''.join(part + '\n\n' for part in parts)