1""" robotparser.py
2
3    Copyright (C) 2000  Bastian Kleineidam
4
5    You can choose between two licenses when using this package:
6    1) GNU GPLv2
7    2) PSF license for Python 2.2
8
9    The robots.txt Exclusion Protocol is implemented as specified in
10    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
11"""
12import urlparse
13import urllib
14
15__all__ = ["RobotFileParser"]
16
17
18class RobotFileParser:
19    """ This class provides a set of methods to read, parse and answer
20    questions about a single robots.txt file.
21
22    """
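
    # Typical usage (illustrative sketch; the host below is hypothetical):
    #
    #   rp = RobotFileParser("http://www.example.com/robots.txt")
    #   rp.read()
    #   rp.can_fetch("MyCrawler", "http://www.example.com/private/page.html")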

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = [line.strip() for line in f]
        f.close()
        self.errcode = opener.errcode
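        # A 401 or 403 means the robots.txt is actively protected, so the
        # whole site is treated as off limits; any other status >= 400 is
        # treated as "no robots.txt", i.e. everything is allowed.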
        if self.errcode in (401, 403):
            self.disallow_all = True
        elif self.errcode >= 400:
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.
           A user-agent: line is accepted even when it is not preceded
           by one or more blank lines."""
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
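        #
        # For example (illustrative input), the lines
        #
        #   User-agent: *
        #   Disallow: /cgi-bin/
        #   Allow: /cgi-bin/public/
        #
        # produce a single Entry with useragents == ['*'], which
        # _add_entry() stores as the default entry.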
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber += 1
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            self._add_entry(entry)


    def can_fetch(self, useragent, url):
        """Use the parsed robots.txt to decide if useragent can fetch url."""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        parsed_url = urlparse.urlparse(urllib.unquote(url))
        url = urlparse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.quote(url)
        if not url:
            url = "/"
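        # at this point url holds only the quoted path portion, e.g. a
        # request for "http://www.example.com/a%20page?x=1" (hypothetical)
        # has been reduced to "/a%20page%3Fx%3D1"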
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True


    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
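    # e.g. RuleLine("/private/", False) matches (and disallows) any path
    # starting with "/private/"; an empty Disallow path is normalised to
    # an allow-all rule in __init__ below (illustrative example)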
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
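        # (e.g. an agent string of "FooBot/1.2" reduces to "foobot"
        # before the substring check below -- illustrative name)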
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True

class URLopener(urllib.FancyURLopener):
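    """FancyURLopener that stores the last HTTP error code in self.errcode
    (200 if no error occurred) and never prompts for credentials."""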
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)