Home > Code > pigtoddler.py – Culling Files By Riding Spiders

pigtoddler.py – Culling Files By Riding Spiders

A while ago I put out a tweet asking how people were going about the task of downloading examples (lots) of a specific filetype from teh internetz.  I got a couple of replies, which I certainly appreciated.  The most interesting one came from @shawnmoyer and pointed to @rwnin’s tool ‘datapyning’.  rwnin’s script definitely has some strengths, particularly regarding surgical searches related to specific domains.  I wanted a troubled little script that didn’t require much care and had a little more of a “dice roll” attitude.  If the utility I describe below doesn’t fit the bill, you might check out datapyning.

My criteria looked more like this:

  • Must be able to pull a large (until disk reaches capacity) number of files
  • Must run unattended for long periods of time
  • Must be able to pull files based on regex for URL
  • Must allow me to determine all known URLs where a particular file was found

I chose to maintain the links and file tracking information in a sqlite database, hopefully making it simple for anyone to use. That said, this is my first piece of Python code, so ease of maintenance is up for debate.

So basically run something like:

./pigtoddler.py -s ./spiderpoo -d searchdb -f "\.pdf$" -u http://www.somesite.org -p .pdf

Get something like this in the console:

Get a folder that looks something like this:

Get a listing of all the places a specific file was found:

Code: [download]

?View Code PYTHON
#!/usr/bin/python
"""
Version:
	0.8.22
 
TODO:
	- Performance enhancements by using sqlite transactions correctly
	- Implement option for users to go after text/html files for download and also spider targets !teh b0rken!
	- Implement option for reading and honoring robots.txt (See [1])
	- Create sensible data access layer to easily allow a variety of DB storage options
	- SAFE distributed spidering
	- Exclusion option
 
Notes:
	[1] This spider is currently susceptible to spidertraps.  We are not honoring robots.txt
		AND we do not have a legit algorithm for detection of traps.  Balancing is our only
		defense and your mileage will vary.
	[2] The database has 2 tables, one tracking links and visited/not and the other tracks file
		URLs and their MD5s.
	[3] The file extension takes a regex argument.  If you want to do something like \.pdf$ quote it
		"\.pdf$" for sexy results.
	[4] No attempts are made to follow JS links, so ajaxy pages are pretty much death
	[5] I'm doing something nasty with a catchall exception handler for main().  Logging this shit might make sense ;)
 
"""
 
import urllib2
import hashlib
import time
import sgmllib
import httplib
from urllib2 import HTTPError, URLError
import urllib, re, sys, os
from optparse import OptionParser
from sqlite3 import *
from sgmllib import SGMLParser
from urlparse import urljoin, urlparse
 
# Command-line interface.  The parser and the parsed options are module-level
# globals because main() and every database helper read them directly.
theparser = OptionParser()
theparser.add_option(
	"-u", "--urls",
	dest="listofurls",
	type="string",
	help="Provide a seed list of comma separated URLS (this or file required)")
# Seeding from a file is not implemented yet; the option is kept here, disabled.
#theparser.add_option("-U","--url-file",dest="urlfile",type="string",help="Provide a seed list from a text file, 1 URL per line (this or list required)")
theparser.add_option(
	"-f", "--filext",
	dest="fileextension",
	type="string",
	help="The file extension you are hoping to find..regex style")
theparser.add_option(
	"-a", "--user-agent",
	dest="useragent",
	type="string",
	help="Custom user-agent.  Default is IE8")
theparser.add_option(
	"-d", "--database",
	dest="database",
	type="string",
	help="Database to create or connect to (required)")
theparser.add_option(
	"-s", "--storage",
	dest="storage",
	type="string",
	help="The storage location for your files")
theparser.add_option(
	"-t", "--throttle",
	dest="throttle",
	type="int",
	help="Number (integer) of seconds to wait between requests for a single domain")
theparser.add_option(
	"-p", "--postfix",
	dest="postfix",
	type="string",
	help="Name all files saved with this postfix")

# Parsed once at import time from sys.argv.
options, args = theparser.parse_args()
 
def main():
	store = options.storage
 
	#Quick check of required stuff.  If fail, print help.
	if options.listofurls == None or options.database == None:
		theparser.print_help()
		sys.exit(0)
 
	database_setup()
 
	#if we have a list on the command line split it and add the URLs to the db
	if options.listofurls != None:
		urlsFromList = options.listofurls.split(',')
		for site in urlsFromList:
			addurlToDB(site.strip())
 
	#set default throttle if not specified
	if options.throttle != None:
		throttle = options.throttle
	else:
		throttle = 15
 
	lastRequestTime = {}
 
	mylasturl = ''
	while True:
 
		opener = urllib2.build_opener()
		myurl = grabFreshURL(mylasturl)
		myurl = myurl[0]
 
		mylasturl = myurl
 
		request = urllib2.Request(myurl)
 
		#set user-agent or use IE8 default
		if options.useragent != None:
			request.add_header('User-Agent',options.useragent)
		else:
			#better ways to do multiline strings (cosmetic)?
			defaultua = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; MRA 5.5 (build 02842);'
			defaultua = defaultua + ' GTB6.3; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2)'
			request.add_header('User-Agent', defaultua )
 
		#First thing we make a HEAD request and check that the page is text/html.
		#This helps save time/data on pulling content that we can't parse and that doesn't match our extension.
		try:
			currentTime = time.time()
			thisHost = urlparse(myurl).hostname
			#Throttling
			if ((thisHost in lastRequestTime) and (currentTime - lastRequestTime[thisHost] < throttle)):
				throttleTime = throttle - (currentTime - lastRequestTime[thisHost])
				time.sleep(throttleTime)
 
			inresponsetoyourrequestforhead = urllib2.urlopen(HeadRequest(myurl))	
 
			lastRequestTime[thisHost] = currentTime
			print '[*] Making request for ' + myurl
 
			headinfo = inresponsetoyourrequestforhead.info()
 
			textregex = re.compile(r'text/html',re.IGNORECASE)
			pagetextmatch = textregex.search(headinfo['content-type'])
 
			#We feel confident in marking it as visited at this point
			markURLvisited(myurl)
			#If the resource is text/html we parse it for links and add those links to the db
			try:
				if pagetextmatch != None:
					response = opener.open(request).read()
					currentTime = time.asctime()
					html = response
					parser = URLLister()
					parser.feed(html)
					for url in parser.urls:
						thisHost = urlparse(url).hostname
						currentTime = time.time()
						#Check if the link ends in the specified extension.  If so, get it.
						findshit = re.search(options.fileextension, url)
						if findshit != None:
							if ((thisHost in lastRequestTime) and (currentTime - lastRequestTime[thisHost] < throttle)):
								throttleTime = throttle - (currentTime - lastRequestTime[thisHost])
								time.sleep(throttleTime)
				 			# Entering download territory
				 			prettyurl = urljoin(myurl,url).strip()
				 			boolres = prevDownloaded(prettyurl)
				 			if prevDownloaded(prettyurl) == False:
					 			print "[+] Attempting to DL " + prettyurl
					 			addurlToDB(unicode(urljoin(myurl,url).strip()))
					 			# Note that if we fail, we still mark it as visited and don't try again.
					 			try:
									urllib.urlretrieve(prettyurl,store +"/"+ prettyurl.split('/')[-1])
									print "[+] Downloaded " + prettyurl
									f = open(store+"/"+prettyurl.split('/')[-1])
									fileMD5 = md5_for_file(f)
									#rename the file with hash + extension
									if options.postfix != None:
										os.rename(store+"/"+prettyurl.split('/')[-1],store+"/"+fileMD5+options.postfix)
									else:
										os.rename(store+"/"+prettyurl.split('/')[-1],store+"/"+fileMD5+options.fileextension)
									lastRequestTime[thisHost] = currentTime
									trackFileInDB(prettyurl,fileMD5)
								except IOError,e:
									print "[!] Error downloading " + prettyurl
								finally:
									markURLvisited(prettyurl)
							else:
								print "[?] You've already downloaded "+prettyurl+ ". Skipping"
						else:
							addurlToDB(unicode(urljoin(myurl,url).strip()))
			except sgmllib.SGMLParseError, e:
				print "[!] Parsing failed: " + str(e)
		# shit happens
		except HTTPError, e:
			print "[!] HTTP Error " , e.code , " from " + myurl
			markURLvisited(myurl)
		except URLError, e:
			print "[!] URL Error " , e.reason , " from " + myurl
			markURLvisited(myurl)
		except httplib.BadStatusLine, e:
			print "[!] Received a bad http status: " + str(e)
			markURLvisited(myurl)
		except Exception, e:
			print "[!] Houston we have a problem: " + str(e)
			markURLvisited(myurl)
 
#this method checks that the sqlite schema is ready to roll
def database_setup():
	"""Make sure the `frontier` and `files` tables exist in options.database.

	Both tables are checked, and each is created only if absent, so a database
	holding just one of them is completed without touching existing rows.
	The connection is closed even when a statement fails.
	"""
	conn = connect(options.database)
	try:
		mycursor = conn.cursor()
		mycursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name IN ('frontier','files')")
		if len(mycursor.fetchall()) < 2:
			print('[*] Missing table.  One will be created for you')
			mycursor.execute("create table if not exists frontier (url text, downloaded integer)")
			mycursor.execute("create table if not exists files (originalURL text, md5 text)")
			conn.commit()
			print('[+] Table created successfully')
		else:
			print('[+] Existing table will be used')
		mycursor.close()
	finally:
		conn.close()
 
def addurlToDB(thisURL):
	"""Queue thisURL in the frontier table as not-yet-visited, unless it is already there."""
	conn = connect(options.database)
	mycursor = conn.cursor()
	mycursor.execute('select url from frontier where url = ?', [thisURL])
	already_known = mycursor.fetchone() != None
	if not already_known:
		# downloaded = 0 marks the URL as unvisited
		mycursor.execute('insert into frontier values (?,?)', [thisURL, 0])
		conn.commit()
		print("[+] Added " + thisURL + " to the database")
	mycursor.close()
	conn.close()
 
def prevDownloaded(thisURL):
	"""Return True when thisURL is already recorded in the files table, else False."""
	conn = connect(options.database)
	mycursor = conn.cursor()
	mycursor.execute('select originalURL from files where originalURL = ?', [thisURL])
	found = mycursor.fetchone() != None
	mycursor.close()
	conn.close()
	return found
 
# So, if you find something interesting you can know where you found it.
def trackFileInDB(thisURL,md5):
	"""Record in the files table that the file with hash `md5` was found at thisURL.

	The cursor and connection are closed explicitly, like the other database
	helpers do, and the connection is closed even when the insert fails.
	"""
	conn = connect(options.database)
	try:
		mycursor = conn.cursor()
		mycursor.execute('insert into files values (?,?)', [thisURL, md5])
		conn.commit()
		mycursor.close()
	finally:
		conn.close()
	print("[+] Tracking file: " + md5)
 
# Get a new URL to spider and look for files on.
def grabFreshURL(lasturl):
	"""Return one unvisited frontier row, preferring a different domain than lasturl.

	The result is the row as returned by fetchone(), i.e. a one-column sequence
	holding the URL.  When no unvisited URL is left, a message is printed and
	the process exits through sys.exit(0).
	"""
	unvisited = 'select url from frontier where downloaded = 0'
	conn = connect(options.database)
	mycursor = conn.cursor()
	netloc = urlparse(lasturl)[1]
	if netloc == '':
		# No previous domain to balance against
		mycursor.execute(unvisited)
	else:
		# We attempt to do some balancing, it's not great, but its better than request
		# after request going to one domain.
		mycursor.execute(unvisited + ' and url not like ?', ["%" + netloc + "%"])
	value = mycursor.fetchone()

	if value == None:
		# We try here again, just in case we only have URLs of one domain. !last ditch effort!
		mycursor.execute(unvisited)
		value = mycursor.fetchone()
	if value == None:
		print('[!] Sorry, I\'m fresh out of URLs to follow')
		sys.exit(0)
	mycursor.close()
	conn.close()
	return value
 
def markURLvisited(thisURL):
	"""Set downloaded = 1 on the frontier rows whose url equals thisURL."""
	conn = connect(options.database)
	mycursor = conn.cursor()
	mycursor.execute('update frontier set downloaded = 1 where url = ?', [thisURL])
	conn.commit()
	print("[-] Marked " + thisURL + " as visited")
	mycursor.close()
	conn.close()
 
#Grabbed from Lars Wirzenius's example, just added encoding
#Reads file and feeds mb chunks
def md5_for_file(f, block_size=2**20):
    """Return the hex MD5 digest of everything that can be read from f.

    f is an open file-like object; it is read from its current position in
    chunks of block_size until read() returns an empty result.  The file is
    not closed here.
    """
    digest = hashlib.md5()
    chunk = f.read(block_size)
    while chunk:
        digest.update(chunk)
        chunk = f.read(block_size)
    return digest.hexdigest()
 
#Overriding a bunch of SGMLParser stuff
class URLLister(SGMLParser):
    """SGML parser that collects the URL-bearing attributes of common tags.

    After feed(), self.urls holds the raw attribute values in document order:
    href of <a> and <link>, src of <img>, <frame>, <iframe> and <script>,
    codebase of <object>, code of <applet> and action of <form>.  Values are
    not resolved against a base URL.
    """

    def reset(self):
        SGMLParser.reset(self)
        self.urls = []

    def _collect(self, attrs, wanted):
        """Append every value of attribute `wanted` found in attrs to self.urls."""
        self.urls.extend([v for k, v in attrs if k == wanted])

    def start_a(self, attrs):
        self._collect(attrs, 'href')

    def start_img(self, attrs):
        self._collect(attrs, 'src')

    def start_link(self, attrs):
        self._collect(attrs, 'href')

    def start_object(self, attrs):
        self._collect(attrs, 'codebase')

    def start_applet(self, attrs):
        self._collect(attrs, 'code')

    def start_form(self, attrs):
        self._collect(attrs, 'action')

    def start_frame(self, attrs):
        self._collect(attrs, 'src')

    def start_iframe(self, attrs):
        self._collect(attrs, 'src')

    # Defined at class level so it is a real tag handler; it was previously
    # indented into the body of start_iframe and therefore never called.
    def start_script(self, attrs):
        self._collect(attrs, 'src')
 
class HeadRequest(urllib2.Request):
	"""A urllib2.Request that reports HEAD as its HTTP method."""

	def get_method(self):
		"""Return the HTTP method name for this request, always "HEAD"."""
		return "HEAD"
 
# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
	main()

Code