1""" robotparser.py
2
3    Copyright (C) 2000  Bastian Kleineidam
4
5    You can choose between two licenses when using this package:
6    1) GNU GPLv2
7    2) PSF license for Python 2.2
8
9    The robots.txt Exclusion Protocol is implemented as specified in
10    http://www.robotstxt.org/norobots-rfc.txt
11
12"""
13import urlparse
14import urllib
15
16__all__ = ["RobotFileParser"]
17
18
class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    Typical use is set_url() (or passing the URL to the constructor),
    then read() or parse(), then any number of can_fetch() calls.

    """

    def __init__(self, url=''):
        # entries for explicitly named user agents, in file order
        self.entries = []
        # the first "User-agent: *" entry, consulted only as a fallback
        self.default_entry = None
        # blanket verdicts set by read() from the HTTP status code
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        # 0 means "nothing has been parsed yet"; see can_fetch()
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        The value is 0 until modified() has been called.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        # elements 1 and 2 of the parse result are netloc and path
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser.

        The outcome depends on the status recorded by the opener:
        401/403 disallow everything, any other 4xx allows everything,
        and 200 parses the body.  Any other status leaves the parser
        untouched.

        """
        opener = URLopener()
        f = opener.open(self.url)
        try:
            lines = [line.strip() for line in f]
        finally:
            # release the connection even if reading the body fails
            f.close()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            self.disallow_all = True
        elif 400 <= self.errcode < 500:
            self.allow_all = True
        elif self.errcode == 200:
            # Parse even an empty body: parse() is what marks the file
            # as checked, and skipping it would make can_fetch() refuse
            # every URL for a site whose robots.txt is simply empty.
            self.parse(lines)

    def _add_entry(self, entry):
        """Files a finished entry as the default or a named entry."""
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """parse the input lines from a robots.txt file.
           We allow that a user-agent: line is not preceded by
           one or more blank lines.

           lines is an iterable of strings.  Only a completely empty
           string counts as a record separator, because that test runs
           before the line is stripped.  Calling this method also marks
           the file as checked (see modified())."""
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                # blank line: close the current record
                if state == 1:
                    # user agents without any rule are dropped
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        # a new record starts without a separating blank
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    # rules before any user-agent line are ignored
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            # the last record is not terminated by a blank line
            self._add_entry(entry)


    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url

        Returns True or False.  The blanket verdicts from read() take
        precedence; otherwise the first named entry matching useragent
        decides, then the default entry, and with no match at all the
        fetch is allowed."""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True

        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False

        # search for given user agent matches
        # the first match counts
        # Drop scheme and host, then re-quote so the path is in the same
        # form as the rule paths it is compared against.
        parsed_url = urlparse.urlparse(urllib.unquote(url))
        url = urlparse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry is not None:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True


    def __str__(self):
        """Returns the named entries followed by the default entry."""
        entries = self.entries
        if self.default_entry is not None:
            entries = entries + [self.default_entry]
        return '\n'.join(map(str, entries)) + '\n'
167
168
class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        """Stores the rule's path in quoted form and its verdict.

        path is the text after the colon; allowance is True for an
        "Allow:" line and False for a "Disallow:" line."""
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urlparse.urlunparse(urlparse.urlparse(path))
        if path == "*":
            # quote() would turn a bare "*" into "%2A", which made the
            # wildcard test in applies_to() unreachable; keep it as is.
            self.path = path
        else:
            self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        """Returns True if the rule is the "*" wildcard or its path is
        a prefix of filename."""
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path
185
186
class Entry:
    """One robots.txt record: the user-agents it names plus the rule
       lines (possibly none) that apply to them"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        # one "User-agent:" line per agent, then one line per rule
        agent_part = ["User-agent: " + name + "\n"
                      for name in self.useragents]
        rule_part = [str(rule) + "\n" for rule in self.rulelines]
        return ''.join(agent_part + rule_part)

    def applies_to(self, useragent):
        """tell whether this entry covers the given user agent"""
        # only the lower-cased name token before the first "/" is compared
        token = useragent.split("/")[0].lower()
        # "*" covers every agent; otherwise the entry's agent name has
        # to occur somewhere inside the token
        return any(name == '*' or name.lower() in token
                   for name in self.useragents)

    def allowance(self, filename):
        """Return the verdict of the first rule line matching filename,
        or True when no rule line matches.

        Preconditions:
        - our agent applies to this entry
        - filename is the URL path to test"""
        matching = (rule.allowance for rule in self.rulelines
                    if rule.applies_to(filename))
        return next(matching, True)
222
class URLopener(urllib.FancyURLopener):
    """FancyURLopener that never prompts for credentials and remembers
       the status code passed to its default HTTP error handler."""
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        # stays at 200 unless http_error_default() records another code
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## No credentials are ever supplied: a robots.txt file that is
        ## only accessible with a password is treated as if it
        ## wasn't there.
        return (None, None)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        # record the status, then let the base class handle the error
        self.errcode = errcode
        base_handler = urllib.FancyURLopener.http_error_default
        return base_handler(self, url, fp, errcode, errmsg, headers)
237