#!/usr/bin/env python
"""Spider to try and find bugs in the parser. Requires httplib2 and html5lib.

usage:
import spider
s = spider.Spider()
s.run("http://www.google.com", maxURLs=100)
"""

import hashlib
import urllib.request, urllib.error, urllib.parse
import urllib.robotparser

import httplib2

import html5lib


class Spider(object):
    def __init__(self):
        self.unvisitedURLs = set()
        self.visitedURLs = set()
        self.buggyURLs = set()
        self.robotParser = urllib.robotparser.RobotFileParser()
        self.contentDigest = {}
        self.http = httplib2.Http(".cache")

    def run(self, initialURL, maxURLs=1000):
        urlNumber = 0
        self.visitedURLs.add(initialURL)
        content = self.loadURL(initialURL)
        while maxURLs is None or urlNumber < maxURLs:
            if content is not None:
                self.parse(content)
                urlNumber += 1
            if not self.unvisitedURLs:
                break
            content = self.loadURL(self.unvisitedURLs.pop())

    def parse(self, content):
        failed = False
        # Build an unnamespaced etree so the .//a lookups in updateURLs work
        p = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("etree"),
                                namespaceHTMLElements=False)
        try:
            tree = p.parse(content)
        except Exception:
            # Any exception here is a parser bug we want to record
            self.buggyURLs.add(self.currentURL)
            failed = True
            print("BUGGY:", self.currentURL)
        self.visitedURLs.add(self.currentURL)
        if not failed:
            self.updateURLs(tree)

    def loadURL(self, url):
        resp, content = self.http.request(url, "GET")
        self.currentURL = url
        # Skip pages whose content we have already seen under another URL
        digest = hashlib.md5(content).hexdigest()
        if digest in self.contentDigest:
            content = None
            self.visitedURLs.add(url)
        else:
            self.contentDigest[digest] = url

        if resp['status'] != "200":
            content = None

        return content

    def updateURLs(self, tree):
        """Take all the links in the current document, extract the URLs and
        update the sets of visited and unvisited URLs according to whether we
        have seen them before or not"""
        urls = set()
        # Skip all links we have already visited or queued
        for link in tree.findall(".//a"):
            try:
                url = urllib.parse.urldefrag(link.attrib['href'])[0]
                if (url and url not in self.unvisitedURLs and
                        url not in self.visitedURLs):
                    urls.add(url)
            except KeyError:
                pass

        # Remove all non-http URLs and add a suitable base URL where that is
        # missing
        newUrls = set()
        for url in urls:
            splitURL = list(urllib.parse.urlsplit(url))
            if splitURL[0] != "http":
                continue
            if splitURL[1] == "":
                splitURL[1] = urllib.parse.urlsplit(self.currentURL)[1]
            newUrls.add(urllib.parse.urlunsplit(splitURL))
        urls = newUrls

        responseHeaders = {}
        # Now we want to find the content types of the links we haven't visited
        for url in urls:
            try:
                resp, content = self.http.request(url, "HEAD")
                responseHeaders[url] = resp
            except (AttributeError, KeyError):
                # Don't know why this happens
                pass

        # Remove links that are not content-type html or were not found
        # XXX - need to deal with other status codes?
        toVisit = set(url for url in urls if url in responseHeaders and
                      "html" in responseHeaders[url].get('content-type', "") and
                      responseHeaders[url]['status'] == "200")

        # Now check we are allowed to spider the page
        allowed = set()
        for url in toVisit:
            robotURL = list(urllib.parse.urlsplit(url)[:2])
            robotURL.extend(["robots.txt", "", ""])
            robotURL = urllib.parse.urlunsplit(robotURL)
            self.robotParser.set_url(robotURL)
            try:
                self.robotParser.read()
            except urllib.error.URLError:
                continue
            if self.robotParser.can_fetch("*", url):
                allowed.add(url)
        toVisit = allowed

        self.visitedURLs.update(urls)
        self.unvisitedURLs.update(toVisit)
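
# Example driver: a minimal, illustrative sketch of how the Spider class above
# might be run from the command line. The default start URL and the maxURLs
# limit below are assumptions chosen for demonstration, not values mandated by
# the module.
if __name__ == "__main__":
    import sys

    startURL = sys.argv[1] if len(sys.argv) > 1 else "http://www.google.com"
    s = Spider()
    s.run(startURL, maxURLs=100)
    print("Visited %d URLs; %d triggered parser errors:" %
          (len(s.visitedURLs), len(s.buggyURLs)))
    for url in sorted(s.buggyURLs):
        print(url)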