""" robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
try:
    # Python 2 module layout.
    import urlparse
    import urllib
    from urllib import quote as _quote, unquote as _unquote
    _FancyURLopener = urllib.FancyURLopener
except ImportError:
    # Python 3 moved these helpers into urllib.parse / urllib.request.
    # FancyURLopener is deprecated there, so the opener defined at the
    # bottom of this module is built on urlopen() instead.
    import urllib.parse as urlparse
    import urllib.request
    import urllib.error
    from urllib.parse import quote as _quote, unquote as _unquote
    _FancyURLopener = None

__all__ = ["RobotFileParser"]


def _to_text(line):
    """Return *line* as a native ``str``.

    Lines that already are ``str`` (always the case on Python 2) are
    returned untouched; anything else is assumed to be ``bytes`` and is
    decoded as UTF-8, replacing undecodable bytes.
    """
    if isinstance(line, str):
        return line
    return line.decode("utf-8", "replace")


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        """Create a parser for the robots.txt file located at *url*."""
        # Entries naming specific user agents, in file order.
        self.entries = []
        # The first "User-agent: *" entry, consulted after self.entries.
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser.

        A 401 or 403 response disallows everything, any other status
        of 400 or above allows everything, and a 200 response with
        content is handed to parse().
        """
        opener = URLopener()
        f = opener.open(self.url)
        try:
            lines = [_to_text(line).strip() for line in f]
        finally:
            # Release the connection even if reading the body fails.
            f.close()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            self.disallow_all = True
        elif self.errcode >= 400:
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)

    def _add_entry(self, entry):
        """Store *entry*, keeping the catch-all entry apart from the rest."""
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """parse the input lines from a robots.txt file.
           We allow that a user-agent: line is not preceded by
           one or more blank lines."""
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        for line in lines:
            if not line:
                # A blank line terminates the current record; a record
                # without any rule line (state 1) is simply discarded.
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            fields = line.split(':', 1)
            if len(fields) != 2:
                # Not a "field: value" line; ignore it.
                continue
            key = fields[0].strip().lower()
            value = _unquote(fields[1].strip())
            if key == "user-agent":
                if state == 2:
                    # A new record starts without a separating blank line.
                    self._add_entry(entry)
                    entry = Entry()
                entry.useragents.append(value)
                state = 1
            elif key == "disallow":
                # Rule lines before any user-agent line are ignored.
                if state != 0:
                    entry.rulelines.append(RuleLine(value, False))
                    state = 2
            elif key == "allow":
                if state != 0:
                    entry.rulelines.append(RuleLine(value, True))
                    state = 2
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        parsed_url = urlparse.urlparse(_unquote(url))
        # Drop scheme and host, then re-quote so the result is compared
        # in the same form RuleLine stores its path.
        url = urlparse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = _quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry is not None:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        """Render the parsed entries, the catch-all entry last."""
        entries = self.entries
        if self.default_entry is not None:
            # The catch-all entry is stored separately by _add_entry();
            # include it so the rendering reflects every parsed rule.
            entries = entries + [self.default_entry]
        return ''.join([str(entry) + "\n" for entry in entries])


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = _quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        """Return True if this rule covers *filename* (prefix match)."""
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        # The first matching rule line decides; no match means allowed.
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True


if _FancyURLopener is not None:
    class URLopener(_FancyURLopener):
        """Opener that records the HTTP status of the last request."""
        def __init__(self, *args):
            _FancyURLopener.__init__(self, *args)
            self.errcode = 200

        def prompt_user_passwd(self, host, realm):
            ## If robots.txt file is accessible only with a password,
            ## we act as if the file wasn't there.
            return None, None

        def http_error_default(self, url, fp, errcode, errmsg, headers):
            self.errcode = errcode
            return _FancyURLopener.http_error_default(self, url, fp, errcode,
                                                      errmsg, headers)
else:
    class URLopener(object):
        """Opener that records the HTTP status of the last request.

        Python 3 variant built on urllib.request.urlopen(); it offers
        the ``open()`` method and ``errcode`` attribute that
        RobotFileParser.read() relies on.
        """
        def __init__(self, *args):
            self.errcode = 200

        def open(self, url):
            """Open *url* and return a file-like response object.

            HTTP error responses are returned rather than raised, with
            their status stored in ``errcode``.
            """
            try:
                return urllib.request.urlopen(url)
            except urllib.error.HTTPError as err:
                self.errcode = err.code
                return err