#!/usr/bin/env python
"""Spider to try to find bugs in the parser. Requires httplib2 and html5lib.

usage:
import spider
s = spider.Spider()
s.run("http://www.google.com", maxURLs=100)
"""

import hashlib
import urllib.error
import urllib.parse
import urllib.robotparser

import httplib2

import html5lib

class Spider(object):
    def __init__(self):
        self.unvisitedURLs = set()
        self.visitedURLs = set()
        self.buggyURLs = set()
        self.robotParser = urllib.robotparser.RobotFileParser()
        self.contentDigest = {}
        self.http = httplib2.Http(".cache")

    def run(self, initialURL, maxURLs=1000):
        urlNumber = 0
        self.visitedURLs.add(initialURL)
        content = self.loadURL(initialURL)
        while maxURLs is None or urlNumber < maxURLs:
            if content is not None:
                self.parse(content)
                urlNumber += 1
            if not self.unvisitedURLs:
                break
            content = self.loadURL(self.unvisitedURLs.pop())

    def parse(self, content):
        failed = False
        # namespaceHTMLElements=False keeps tag names plain so that the
        # findall(".//a") query in updateURLs matches
        p = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("etree"),
                                namespaceHTMLElements=False)
        try:
            tree = p.parse(content)
        except Exception:
            self.buggyURLs.add(self.currentURL)
            failed = True
            print("BUGGY:", self.currentURL)
        self.visitedURLs.add(self.currentURL)
        if not failed:
            self.updateURLs(tree)

    def loadURL(self, url):
        resp, content = self.http.request(url, "GET")
        self.currentURL = url
        # Hash the body so identical content served from several URLs is
        # only parsed once
        digest = hashlib.md5(content).hexdigest()
        if digest in self.contentDigest:
            content = None
            self.visitedURLs.add(url)
        else:
            self.contentDigest[digest] = url

        if resp['status'] != "200":
            content = None

        return content

    def updateURLs(self, tree):
        """Take all the links in the current document, extract the URLs and
        update the sets of visited and unvisited URLs according to whether we
        have seen them before or not"""
        urls = set()
        # Collect the links we have not already visited
        for link in tree.findall(".//a"):
            try:
                url = urllib.parse.urldefrag(link.attrib['href'])[0]
                if (url and url not in self.unvisitedURLs and url
                    not in self.visitedURLs):
                    urls.add(url)
            except KeyError:
                # <a> element without an href attribute
                pass

        # Resolve relative references against the current URL and drop all
        # non-http(s) URLs; e.g. urljoin("http://example.com/a/b", "c")
        # gives "http://example.com/a/c"
        newUrls = set()
        for url in urls:
            absURL = urllib.parse.urljoin(self.currentURL, url)
            if urllib.parse.urlsplit(absURL)[0] in ("http", "https"):
                newUrls.add(absURL)
        urls = newUrls

        responseHeaders = {}
        # Now we want to find the content types of the links we haven't visited
        for url in urls:
            try:
                resp, content = self.http.request(url, "HEAD")
                responseHeaders[url] = resp
            except (AttributeError, KeyError):
                # httplib2 occasionally raises these; skip the URL
                pass

        # Keep only links that were found (status 200) and serve HTML
        # XXX - need to deal with other status codes?
        toVisit = {url for url in urls
                   if url in responseHeaders
                   and "html" in responseHeaders[url].get('content-type', '')
                   and responseHeaders[url]['status'] == "200"}

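        # For each remaining link, consult the site's robots.txt; the
        # construction below maps e.g. "http://example.com/some/page" to
        # "http://example.com/robots.txt"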
        # robots.txt has to be fetched with read() before can_fetch() gives
        # a meaningful answer
        allowed = set()
        for url in toVisit:
            robotURL = list(urllib.parse.urlsplit(url)[:2])
            robotURL.extend(["robots.txt", "", ""])
            self.robotParser.set_url(urllib.parse.urlunsplit(robotURL))
            try:
                self.robotParser.read()
            except urllib.error.URLError:
                # Treat an unreachable robots.txt as a disallow
                continue
            if self.robotParser.can_fetch("*", url):
                allowed.add(url)

        self.visitedURLs.update(urls)
        self.unvisitedURLs.update(allowed)
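
if __name__ == "__main__":
    # Minimal command-line sketch; the start URL and the 100-page limit are
    # the docstring's example values, not anything the Spider class requires.
    s = Spider()
    s.run("http://www.google.com", maxURLs=100)
    print("Visited %d URLs" % len(s.visitedURLs))
    print("Buggy URLs:")
    for url in sorted(s.buggyURLs):
        print("  ", url)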