import urllib from BeautifulSoup import BeautifulSoup import os.path import os home = "http://threadforthought.net" filesUrl = "http://threadforthought.files.wordpress.com" saveDir = "./oldimages" visited = [] imgUrls = [] def prepPath(path): if not os.path.exists(path): print "making a path to " + path os.makedirs(path) def shouldFollow(url,storage, domain): return url.startswith(domain) and (not url in storage) def getDoc(url): if shouldFollow(url,visited,home): print "visiting ",url sock = urllib.urlopen(url) html = sock.read() sock.close() visited.append(url) handleDoc(html) def handleDoc(doc): soup = BeautifulSoup(''.join(doc)) "first get all the images" images = soup.findAll('img') [handleImage(img['src']) for img in images] links = soup.findAll('a') [getDoc(link['href']) for link in links] def handleImage(imgUrl): if shouldFollow(imgUrl,imgUrls,filesUrl): imgUrls.append(imgUrl) imgDirs = imgUrl.split('/')[3:] imgPath = os.path.join( saveDir, os.sep.join(imgDirs[:-1])) imgFileName = imgDirs[-1].split('?')[0] prepPath(imgPath) newImgUrl = imgUrl.split('?')[0] urllib.urlretrieve(newImgUrl,imgPath+os.sep+imgFileName) prepPath(saveDir) getDoc(home) print "done spidering"