""" robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
12ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehimport urlparse
13ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsiehimport urllib
14ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
# Names exported by ``from <this module> import *``.
__all__ = ["RobotFileParser"]
16ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
17ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
class RobotFileParser:
    """Read, parse and answer questions about a single robots.txt file.

    Attributes:
        entries: parsed records for specific user agents, in file order.
        default_entry: the first record naming the "*" agent, or None.
        disallow_all: True when the last fetch means nothing may be fetched.
        allow_all: True when the last fetch means everything may be fetched.
        last_checked: timestamp stored by modified(); 0 until it is called.
        errcode: HTTP status seen by the last read() (set by read() only).
    """

    def __init__(self, url=''):
        """Create a parser for the robots.txt file located at *url*."""
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        # Elements 1 and 2 of the parse result are the network location
        # and the path.
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser.

        The HTTP status reported by the opener is stored in self.errcode
        and then interpreted by _apply_status().  Errors raised while
        opening or reading the URL propagate to the caller.
        """
        opener = URLopener()
        f = opener.open(self.url)
        try:
            lines = [line.strip() for line in f]
        finally:
            # Release the response even if reading it fails part-way.
            f.close()
        self.errcode = opener.errcode
        self._apply_status(self.errcode, lines)

    def _apply_status(self, errcode, lines):
        """Set the blanket access flags, or parse *lines*, for *errcode*.

        - 401/403: access is restricted, so nothing may be fetched.
        - 5xx: the file is unreachable because of a server error; this is
          treated as "disallow all" rather than as permission to crawl.
        - other codes >= 400: no usable robots.txt, everything is allowed.
        - 200 with content: the lines are handed to parse().
        """
        # The flags describe the outcome of the most recent fetch only, so
        # a previous 404 or 5xx must not stick once the situation changes.
        self.disallow_all = False
        self.allow_all = False
        if errcode in (401, 403) or errcode >= 500:
            self.disallow_all = True
        elif errcode >= 400:
            self.allow_all = True
        elif errcode == 200 and lines:
            self.parse(lines)

    def _add_entry(self, entry):
        """Store *entry* as the default record or as an agent-specific one."""
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.

        *lines* is an iterable of strings, one per line of the file.
        Completed records are stored through _add_entry(); a record that
        has user-agent lines but no allow/disallow line is dropped.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        for line in lines:
            if not line:
                # An empty line terminates the record being built.
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        # A new user-agent line after rules starts a new
                        # record even without a separating blank line.
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    # Rules seen before any user-agent line are ignored.
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        # The last record has no trailing blank line to flush it.
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url

        Returns False when disallow_all is set and True when allow_all is
        set.  Otherwise the first agent-specific entry that applies to
        *useragent* decides; failing that the default ("*") entry decides;
        if there is none, access is granted.
        """
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        # Drop scheme and host, then unquote/re-quote the remainder so the
        # URL is compared in one consistent escaping.
        parsed_url = urlparse.urlparse(urllib.unquote(url))
        url = urlparse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        """Return every stored record, the default ("*") record last."""
        entries = self.entries
        if self.default_entry is not None:
            # The default record is kept apart from self.entries; without
            # this it would be missing from the textual form.
            entries = entries + [self.default_entry]
        return ''.join([str(entry) + "\n" for entry in entries])
154ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
155ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path.

       The path is stored URL-quoted, except for the literal wildcard "*",
       which is kept verbatim so that applies_to() can recognise it."""
    def __init__(self, path, allowance):
        """Store *path* and *allowance*; an empty Disallow becomes Allow."""
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        if path == "*":
            # Quoting would turn "*" into "%2A" and make the wildcard
            # check in applies_to() unreachable.
            self.path = path
        else:
            self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        """Return True if this rule covers *filename* (prefix match or "*")."""
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        """Return the rule in robots.txt syntax, e.g. "Disallow: /tmp"."""
        label = "Allow" if self.allowance else "Disallow"
        return label + ": " + self.path
171ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
172ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
class Entry:
    """One robots.txt record: a list of user-agent names plus the rule
    lines that belong to them."""
    def __init__(self):
        self.rulelines = []
        self.useragents = []

    def __str__(self):
        """Render the record: user-agent lines first, then the rule lines."""
        agent_lines = ["User-agent: " + name + "\n"
                       for name in self.useragents]
        rule_lines = [str(rule) + "\n" for rule in self.rulelines]
        return ''.join(agent_lines + rule_lines)

    def applies_to(self, useragent):
        """Tell whether this record is meant for *useragent*.

        Only the part of *useragent* before the first "/" is used, lower
        cased.  A record matches when it names "*" or when one of its
        agent names, lower cased, occurs inside that token.
        """
        token = useragent.split("/")[0].lower()
        for name in self.useragents:
            # "*" is the catch-all agent; otherwise do a substring match
            if name == '*' or name.lower() in token:
                return True
        return False

    def allowance(self, filename):
        """Return the allowance of the first rule line that applies to
        *filename*; if no rule line applies, return True.

        The caller is expected to have checked applies_to() already.
        """
        for rule in self.rulelines:
            if not rule.applies_to(filename):
                continue
            return rule.allowance
        return True
208ffab958fd8d42ed7227d83007350e61555a1fa36Andrew Hsieh
class URLopener(urllib.FancyURLopener):
    """FancyURLopener that never prompts for credentials and records the
    HTTP status of error responses in ``self.errcode``."""
    def __init__(self, *args, **kwargs):
        """Accept and forward the same arguments as FancyURLopener.

        Keyword arguments (for example ``proxies``) are passed through
        unchanged.  ``errcode`` starts at 200 and is only overwritten when
        http_error_default() is invoked.
        """
        urllib.FancyURLopener.__init__(self, *args, **kwargs)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        """Return (None, None) instead of asking the user for credentials."""
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Remember *errcode*, then defer to the FancyURLopener handling."""
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)
223