#!/usr/bin/env python
"""Spider to try and find bugs in the parser. Requires httplib2 and html5lib.

usage:
import spider
s = spider.Spider()
s.run("http://www.google.com", maxURLs=100)
"""

import hashlib
import urllib.request, urllib.error, urllib.parse
import urllib.robotparser

import httplib2

import html5lib


class Spider(object):
    def __init__(self):
        self.unvisitedURLs = set()
        self.visitedURLs = set()
        self.buggyURLs = set()
        self.robotParser = urllib.robotparser.RobotFileParser()
        self.contentDigest = {}
        self.http = httplib2.Http(".cache")

    def run(self, initialURL, maxURLs=1000):
        urlNumber = 0
        self.visitedURLs.add(initialURL)
        content = self.loadURL(initialURL)
        while maxURLs is None or urlNumber < maxURLs:
            if content is not None:
                self.parse(content)
                urlNumber += 1
            if not self.unvisitedURLs:
                break
            content = self.loadURL(self.unvisitedURLs.pop())

    def parse(self, content):
        failed = False
        # Build an unnamespaced etree so the .//a lookups in updateURLs work
        p = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("etree"),
                                namespaceHTMLElements=False)
        try:
            tree = p.parse(content)
        except Exception:
            # Any exception here is a parser bug we want to record
            self.buggyURLs.add(self.currentURL)
            failed = True
            print("BUGGY:", self.currentURL)
        self.visitedURLs.add(self.currentURL)
        if not failed:
            self.updateURLs(tree)

    def loadURL(self, url):
        resp, content = self.http.request(url, "GET")
        self.currentURL = url
        # Skip pages whose content we have already seen under another URL
        digest = hashlib.md5(content).hexdigest()
        if digest in self.contentDigest:
            content = None
            self.visitedURLs.add(url)
        else:
            self.contentDigest[digest] = url

        if resp['status'] != "200":
            content = None

        return content

    def updateURLs(self, tree):
        """Take all the links in the current document, extract the URLs and
        update the sets of visited and unvisited URLs according to whether we
        have seen them before or not"""
        urls = set()
        # Skip all links we have already visited or queued
        for link in tree.findall(".//a"):
            try:
                url = urllib.parse.urldefrag(link.attrib['href'])[0]
                if (url and url not in self.unvisitedURLs and
                        url not in self.visitedURLs):
                    urls.add(url)
            except KeyError:
                pass

        # Remove all non-http URLs and add a suitable base URL where that is
        # missing
        newUrls = set()
        for url in urls:
            splitURL = list(urllib.parse.urlsplit(url))
            if splitURL[0] != "http":
                continue
            if splitURL[1] == "":
                splitURL[1] = urllib.parse.urlsplit(self.currentURL)[1]
            newUrls.add(urllib.parse.urlunsplit(splitURL))
        urls = newUrls

        responseHeaders = {}
        # Now we want to find the content types of the links we haven't visited
        for url in urls:
            try:
                resp, content = self.http.request(url, "HEAD")
                responseHeaders[url] = resp
            except (AttributeError, KeyError):
                # Don't know why this happens
                pass

        # Remove links that are not content-type html or were not found
        # XXX - need to deal with other status codes?
        toVisit = set(url for url in urls if url in responseHeaders and
                      "html" in responseHeaders[url].get('content-type', "") and
                      responseHeaders[url]['status'] == "200")

        # Now check we are allowed to spider the page
        allowed = set()
        for url in toVisit:
            robotURL = list(urllib.parse.urlsplit(url)[:2])
            robotURL.extend(["robots.txt", "", ""])
            robotURL = urllib.parse.urlunsplit(robotURL)
            self.robotParser.set_url(robotURL)
            try:
                self.robotParser.read()
            except urllib.error.URLError:
                continue
            if self.robotParser.can_fetch("*", url):
                allowed.add(url)
        toVisit = allowed

        self.visitedURLs.update(urls)
        self.unvisitedURLs.update(toVisit)
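
# Example driver: a minimal, illustrative sketch of how the Spider class above
# might be run from the command line. The default start URL and the maxURLs
# limit below are assumptions chosen for demonstration, not values mandated by
# the module.
if __name__ == "__main__":
    import sys

    startURL = sys.argv[1] if len(sys.argv) > 1 else "http://www.google.com"
    s = Spider()
    s.run(startURL, maxURLs=100)
    print("Visited %d URLs; %d triggered parser errors:" %
          (len(s.visitedURLs), len(s.buggyURLs)))
    for url in sorted(s.buggyURLs):
        print(url)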