import io
import os
import unittest
import urllib.robotparser
from collections import namedtuple
from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer
try:
    import threading
except ImportError:
    threading = None


class BaseRobotTest:
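    # Harness: subclasses supply a robots.txt body, a user agent, and
    # lists of URLs that agent should (good) and should not (bad) be
    # able to fetch.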
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
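        # Entries in good/bad may be plain URL strings or (agent, url)
        # tuples that override the class-level agent.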
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = []
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


class BaseRequestRateTest(BaseRobotTest):

    def test_request_rate(self):
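        # crawl_delay and request_rate come from the concrete subclass;
        # a false value (e.g. None) skips the matching assertions below.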
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                if self.crawl_delay:
                    self.assertEqual(
                        self.parser.crawl_delay(agent), self.crawl_delay
                    )
                if self.request_rate:
                    self.assertEqual(
                        self.parser.request_rate(agent).requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        self.parser.request_rate(agent).seconds,
                        self.request_rate.seconds
                    )


class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
    agent = 'figtree'
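    # request_rate() returns a named tuple with requests and seconds
    # fields; this expected value mirrors that shape.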
    request_rate = namedtuple('req_rate', 'requests seconds')(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'
    # The rates are deliberately None so that test_request_rate skips
    # its assertions; the robots.txt must still parse for this agent.
    request_rate = None
    crawl_delay = None


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
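    # A malformed Request-rate value ('9/banana') must be ignored
    # without raising, while the rest of the entry still parses.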
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # Also test that Allow and Disallow work well with each other.
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
    """
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # The order of User-agent entries matters: the first matching entry
    # wins. This robots.txt is mis-ordered, since "Googlebot" is a
    # substring of "Googlebot-Mobile" and therefore shadows the later,
    # more specific entry.
    robots_txt = """\
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-Mobile'


class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
    # Google also got the order wrong. You need
    # to specify the URLs from more specific to more general
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
    """
    agent = 'googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
    """
    good = ['/some/path']
    bad = ['/some/path?name=value']


class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    # obey first * entry (#4108)
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/another/path']
    bad = ['/some/path']


class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
    # normalize the URL first (#17403)
    robots_txt = """\
User-agent: *
Allow: /some/path?
Disallow: /another/path?
    """
    good = ['/some/path?']
    bad = ['/another/path?']


class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
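    # With no agent-specific entry, the crawl delay and request rate
    # from the default ('*') entry apply to any agent.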
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    request_rate = namedtuple('req_rate', 'requests seconds')(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


class RobotHandler(BaseHTTPRequestHandler):

    def do_GET(self):
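        # Refuse every request, so robots.txt itself is inaccessible.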
        self.send_error(403, "Forbidden access")

    def log_message(self, format, *args):
        pass


@unittest.skipUnless(threading, 'threading required for this test')
class PasswordProtectedSiteTestCase(unittest.TestCase):

    def setUp(self):
        self.server = HTTPServer((support.HOST, 0), RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval': 0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()

    @support.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + support.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
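        # An HTTP 401/403 on robots.txt makes the parser deny everything.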
        self.assertFalse(parser.can_fetch("*", robots_url))


class NetworkTestCase(unittest.TestCase):

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with support.transient_internet(cls.base_url):
            cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
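        # Append a trailing slash unless the path looks like a file
        # (i.e. it has an extension).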
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)
        self.assertFalse(self.parser.crawl_delay('*'))
        self.assertFalse(self.parser.request_rate('*'))

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
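        # A 404 on robots.txt means everything is allowed.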
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)
        self.assertIsNone(parser.crawl_delay('*'))
        self.assertIsNone(parser.request_rate('*'))


if __name__ == '__main__':
    unittest.main()