import io
import os
import unittest
import urllib.robotparser
from collections import namedtuple
from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer
try:
    import threading
except ImportError:
    threading = None


class BaseRobotTest:
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


class BaseRequestRateTest(BaseRobotTest):

    def test_request_rate(self):
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                if self.crawl_delay:
                    self.assertEqual(
                        self.parser.crawl_delay(agent), self.crawl_delay
                    )
                if self.request_rate:
                    self.assertEqual(
                        self.parser.request_rate(agent).requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        self.parser.request_rate(agent).seconds,
                        self.request_rate.seconds
                    )


class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
    agent = 'figtree'
    request_rate = namedtuple('req_rate', 'requests seconds')(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'
    # the rate values are not actually tested for this agent; setting them
    # to None skips the inherited rate checks while the URL tests still run
    request_rate = None
    crawl_delay = None
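

# Illustrative sketch, not part of the original suite: drives the same
# parse()/can_fetch() API that the BaseRobotTest machinery above exercises,
# only directly. The class name and robots.txt content are invented for
# demonstration.
class ParseAndQuerySketchTest(unittest.TestCase):

    def test_parse_and_query(self):
        parser = urllib.robotparser.RobotFileParser()
        parser.parse(io.StringIO(
            'User-agent: *\nDisallow: /private/'
        ).readlines())
        # paths under the disallowed prefix are rejected for any agent
        self.assertFalse(parser.can_fetch('anybot', '/private/page.html'))
        # everything else stays fetchable
        self.assertTrue(parser.can_fetch('anybot', '/public/page.html'))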


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # also test that Allow and Disallow work well with each other
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
    """
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # the parser should obey the first matching User-agent entry; note
    # that this file is ill-formed because "Googlebot" is a substring of
    # "Googlebot-Mobile"
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong: you need to specify the URLs from
    # more specific to more general
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']
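

# Another hedged sketch, not from the original suite: shows how the
# Crawl-delay and Request-rate lines tested above surface through the
# parser API. The agent name and the rate values are arbitrary.
class RateAccessorsSketchTest(unittest.TestCase):

    def test_rate_accessors(self):
        parser = urllib.robotparser.RobotFileParser()
        parser.parse(io.StringIO(
            'User-agent: *\nCrawl-delay: 2\nRequest-rate: 3/15'
        ).readlines())
        # crawl_delay() returns the delay for a matching agent as an int
        self.assertEqual(parser.crawl_delay('anybot'), 2)
        # request_rate() returns a named tuple with requests and seconds
        rate = parser.request_rate('anybot')
        self.assertEqual((rate.requests, rate.seconds), (3, 15))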


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    request_rate = namedtuple('req_rate', 'requests seconds')(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


@unittest.skipUnless(threading, 'threading required for this test')
class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @support.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))


class NetworkTestCase(unittest.TestCase):

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)
        self.assertFalse(self.parser.crawl_delay('*'))
        self.assertFalse(self.parser.request_rate('*'))

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)
        self.assertIsNone(parser.crawl_delay('*'))
        self.assertIsNone(parser.request_rate('*'))


if __name__ == '__main__':
    unittest.main()