#!/usr/bin/python """ Description: Dirty hack to scour the interwebs for a specific filetype and download what it finds. Downloading tons of content may piss people off. Please be courteous ;) You can always find the latest version @ http://www.l1pht.com Author: Saint Patrick Version: 0.8.22 TODO: - Performance enhancements by using sqlite transactions correctly - Implement option for users to go after text/html files for download and also spider targets !teh b0rken! - Implement option for reading and honoring robots.txt (See [1]) - Create sensible data access layer to easily allow a variety of DB storage options - SAFE distributed spidering - Exclusion option Notes: [1] This spider is currently susceptible to spidertraps. We are not honoring robots.txt AND we do not have a legit algorithm for detection of traps. Balancing is our only defense and your mileage will vary. [2] The database has 2 tables, one tracking links and visited/not and the other tracks file URLs and their MD5s. [3] The file extension takes a regex argument. If you want to do something like \.pdf$ quote it "\.pdf$" for sexy results. [4] No attempts are made to follow JS links, so ajaxy pages are pretty much death [5] I'm doing something nasty with a catchall exception handler for main(). Logging this shit might make sense ;) """ import urllib2 import hashlib import time import sgmllib import httplib from urllib2 import HTTPError, URLError import urllib, re, sys, os from optparse import OptionParser from sqlite3 import * from sgmllib import SGMLParser from urlparse import urljoin, urlparse #option parsing theparser = OptionParser() theparser.add_option("-u","--urls",dest="listofurls",type="string",help="Provide a seed list of comma separated URLS (this or file required)") #theparser.add_option("-U","--url-file",dest="urlfile",type="string",help="Provide a seed list from a text file, 1 URL per line (this or list required)") theparser.add_option("-f","--filext",dest="fileextension",type="string",help="The file extension you are hoping to find..regex style") theparser.add_option("-a","--user-agent",dest="useragent",type="string",help="Custom user-agent. Default is IE8") theparser.add_option("-d","--database",dest="database",type="string",help="Database to create or connect to (required)") theparser.add_option("-s","--storage",dest="storage",type="string",help="The storage location for your files") theparser.add_option("-t","--throttle",dest="throttle",type="int",help="Number (integer) of seconds to wait between requests for a single domain") theparser.add_option("-p","--postfix",dest="postfix",type="string",help="Name all files saved with this postfix") (options, args) = theparser.parse_args() def main(): store = options.storage #Quick check of required stuff. If fail, print help. if options.listofurls == None or options.database == None: theparser.print_help() sys.exit(0) database_setup() #if we have a list on the command line split it and add the URLs to the db if options.listofurls != None: urlsFromList = options.listofurls.split(',') for site in urlsFromList: addurlToDB(site.strip()) #set default throttle if not specified if options.throttle != None: throttle = options.throttle else: throttle = 15 lastRequestTime = {} mylasturl = '' while True: opener = urllib2.build_opener() myurl = grabFreshURL(mylasturl) myurl = myurl[0] mylasturl = myurl request = urllib2.Request(myurl) #set user-agent or use IE8 default if options.useragent != None: request.add_header('User-Agent',options.useragent) else: #better ways to do multiline strings (cosmetic)? defaultua = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; MRA 5.5 (build 02842);' defaultua = defaultua + ' GTB6.3; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2)' request.add_header('User-Agent', defaultua ) #First thing we make a HEAD request and check that the page is text/html. #This helps save time/data on pulling content that we can't parse and that doesn't match our extension. try: currentTime = time.time() thisHost = urlparse(myurl).hostname #Throttling if ((thisHost in lastRequestTime) and (currentTime - lastRequestTime[thisHost] < throttle)): throttleTime = throttle - (currentTime - lastRequestTime[thisHost]) time.sleep(throttleTime) inresponsetoyourrequestforhead = urllib2.urlopen(HeadRequest(myurl)) lastRequestTime[thisHost] = currentTime print '[*] Making request for ' + myurl headinfo = inresponsetoyourrequestforhead.info() textregex = re.compile(r'text/html',re.IGNORECASE) pagetextmatch = textregex.search(headinfo['content-type']) #We feel confident in marking it as visited at this point markURLvisited(myurl) #If the resource is text/html we parse it for links and add those links to the db try: if pagetextmatch != None: response = opener.open(request).read() currentTime = time.asctime() html = response parser = URLLister() parser.feed(html) for url in parser.urls: thisHost = urlparse(url).hostname currentTime = time.time() #Check if the link ends in the specified extension. If so, get it. findshit = re.search(options.fileextension, url) if findshit != None: if ((thisHost in lastRequestTime) and (currentTime - lastRequestTime[thisHost] < throttle)): throttleTime = throttle - (currentTime - lastRequestTime[thisHost]) time.sleep(throttleTime) # Entering download territory prettyurl = urljoin(myurl,url).strip() boolres = prevDownloaded(prettyurl) if prevDownloaded(prettyurl) == False: print "[+] Attempting to DL " + prettyurl addurlToDB(unicode(urljoin(myurl,url).strip())) # Note that if we fail, we still mark it as visited and don't try again. try: urllib.urlretrieve(prettyurl,store +"/"+ prettyurl.split('/')[-1]) print "[+] Downloaded " + prettyurl f = open(store+"/"+prettyurl.split('/')[-1]) fileMD5 = md5_for_file(f) #rename the file with hash + extension if options.postfix != None: os.rename(store+"/"+prettyurl.split('/')[-1],store+"/"+fileMD5+options.postfix) else: os.rename(store+"/"+prettyurl.split('/')[-1],store+"/"+fileMD5+options.fileextension) lastRequestTime[thisHost] = currentTime trackFileInDB(prettyurl,fileMD5) except IOError,e: print "[!] Error downloading " + prettyurl finally: markURLvisited(prettyurl) else: print "[?] You've already downloaded "+prettyurl+ ". Skipping" else: addurlToDB(unicode(urljoin(myurl,url).strip())) except sgmllib.SGMLParseError, e: print "[!] Parsing failed: " + str(e) # shit happens except HTTPError, e: print "[!] HTTP Error " , e.code , " from " + myurl markURLvisited(myurl) except URLError, e: print "[!] URL Error " , e.reason , " from " + myurl markURLvisited(myurl) except httplib.BadStatusLine, e: print "[!] Received a bad http status: " + str(e) markURLvisited(myurl) except Exception, e: print "[!] Houston we have a problem: " + str(e) markURLvisited(myurl) #this method checks that the sqlite schema is ready to roll def database_setup(): conn = connect(options.database) mycursor = conn.cursor() #this is my method of checking if the table is there, better pythony ways? mycursor.execute("SELECT name FROM sqlite_master WHERE name='frontier'") if mycursor.fetchone() == None: print '[*] Missing table. One will be created for you' mycursor.execute("create table frontier (url text, downloaded integer)") mycursor.execute("create table files (originalURL text, md5 text)") print '[+] Table created successfully' else: print '[+] Existing table will be used' mycursor.close() conn.close() def addurlToDB(thisURL): conn = connect(options.database) mycursor = conn.cursor() localtuple = [thisURL] mycursor.execute('select url from frontier where url = ?',localtuple) if mycursor.fetchone() == None: localtuple = [thisURL,0] mycursor.execute('insert into frontier values (?,?)',localtuple) conn.commit() print "[+] Added " + thisURL + " to the database" mycursor.close() conn.close() def prevDownloaded(thisURL): conn = connect(options.database) mycursor = conn.cursor() localtuple = [thisURL] mycursor.execute('select originalURL from files where originalURL = ?',localtuple) if mycursor.fetchone() == None: mycursor.close() conn.close() return False else: mycursor.close() conn.close() return True # So, if you find something interesting you can know where you found it. def trackFileInDB(thisURL,md5): conn = connect(options.database) mycursor = conn.cursor() localtuple = [thisURL,md5] mycursor.execute('insert into files values (?,?)',localtuple) conn.commit() print "[+] Tracking file: " + md5 # Get a new URL to spider and look for files on. def grabFreshURL(lasturl): conn = connect(options.database) mycursor = conn.cursor() urlpieces = urlparse(lasturl) # Building the like lastdomain = "%"+urlpieces[1]+"%" localtuple = [lastdomain] if lastdomain == '%%': mycursor.execute('select url from frontier where downloaded = 0') else: # We attempt to do some balancing, it's not great, but its better than request # after request going to one domain. mycursor.execute('select url from frontier where downloaded = 0 and url not like ?',localtuple) #print "[*] balancing off of "+ lastdomain value = mycursor.fetchone() if value == None: # We try here again, just in case we only have URLs of one domain. !last ditch effort! mycursor.execute('select url from frontier where downloaded = 0') value = mycursor.fetchone() #print "FAIL"+str(value) if value == None: print '[!] Sorry, I\'m fresh out of URLs to follow' sys.exit(0) #else: mycursor.close() conn.close() #print str(value) return value def markURLvisited(thisURL): conn = connect(options.database) mycursor = conn.cursor() localtuple = [thisURL] mycursor.execute('update frontier set downloaded = 1 where url = ?',localtuple) conn.commit() print "[-] Marked " + thisURL + " as visited" mycursor.close() conn.close() #Grabbed from Lars Wirzenius's example, just added encoding #Reads file and feeds mb chunks def md5_for_file(f, block_size=2**20): md5 = hashlib.md5() while True: data = f.read(block_size) if not data: break md5.update(data) return md5.hexdigest() #Overriding a bunch of SGMLParser stuff class URLLister(SGMLParser): def reset(self): SGMLParser.reset(self) self.urls = [] def start_a(self, attrs): href = [v for k, v in attrs if k=='href'] if href: self.urls.extend(href) def start_img(self,attrs): href = [v for k, v in attrs if k=='src'] if href: self.urls.extend(href) def start_link(self,attrs): href = [v for k, v in attrs if k=='href'] if href: self.urls.extend(href) def start_object(self,attrs): href = [v for k, v in attrs if k=='codebase'] if href: self.urls.extend(href) def start_applet(self,attrs): href = [v for k, v in attrs if k=='code'] if href: self.urls.extend(href) def start_form(self,attrs): href = [v for k, v in attrs if k=='action'] if href: self.urls.extend(href) def start_frame(self,attrs): href = [v for k, v in attrs if k=='src'] if href: self.urls.extend(href) def start_iframe(self,attrs): href = [v for k, v in attrs if k=='src'] if href: self.urls.extend(href) def start_script(self,attrs): href = [v for k, v in attrs if k=='src'] if href: self.urls.extend(href) class HeadRequest(urllib2.Request): def get_method(self): return "HEAD" if __name__ == "__main__": main()