diff options
author | Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> | 2019-06-16 00:10:06 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-06-16 00:10:06 -0700 |
commit | 45d6547acfb9ae1639adbe03dd14f38cd0642ca2 (patch) | |
tree | 28e30a63d8ff5b43b22152c3a93153f97e15ae51 | |
parent | 159ae24895272dce5fd53dd8e54809743e4f394f (diff) | |
download | cpython-git-45d6547acfb9ae1639adbe03dd14f38cd0642ca2.tar.gz |
bpo-35922: Fix RobotFileParser when robots.txt has no relevant crawl delay or request rate (GH-11791)
Co-Authored-By: Tal Einat <taleinat+github@gmail.com>
(cherry picked from commit 8047e0e1c620f69cc21f9ca48b24bf2cdd5c3668)
Co-authored-by: Rémi Lapeyre <remi.lapeyre@henki.fr>
-rw-r--r-- | Lib/test/test_robotparser.py | 28 | ||||
-rw-r--r-- | Lib/urllib/robotparser.py | 8 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Library/2019-06-11-19-34-29.bpo-35922.rxpzWr.rst | 4 |
3 files changed, 26 insertions, 14 deletions
diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index 140636590a..d478e7f127 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -76,30 +76,38 @@ Disallow: /
 
 
 class BaseRequestRateTest(BaseRobotTest):
+    request_rate = None
+    crawl_delay = None
 
     def test_request_rate(self):
+        parser = self.parser
         for url in self.good + self.bad:
             agent, url = self.get_agent_and_url(url)
             with self.subTest(url=url, agent=agent):
-                if self.crawl_delay:
-                    self.assertEqual(
-                        self.parser.crawl_delay(agent), self.crawl_delay
-                    )
-                if self.request_rate:
+                self.assertEqual(parser.crawl_delay(agent), self.crawl_delay)
+
+                parsed_request_rate = parser.request_rate(agent)
+                self.assertEqual(parsed_request_rate, self.request_rate)
+                if self.request_rate is not None:
                     self.assertIsInstance(
-                        self.parser.request_rate(agent),
+                        parsed_request_rate,
                         urllib.robotparser.RequestRate
                     )
                     self.assertEqual(
-                        self.parser.request_rate(agent).requests,
+                        parsed_request_rate.requests,
                         self.request_rate.requests
                     )
                     self.assertEqual(
-                        self.parser.request_rate(agent).seconds,
+                        parsed_request_rate.seconds,
                         self.request_rate.seconds
                     )
 
 
+class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
+    robots_txt = ''
+    good = ['/foo']
+
+
 class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
     robots_txt = """\
 User-agent: figtree
@@ -120,10 +128,6 @@ Disallow: /%7ejoe/index.html
 
 class DifferentAgentTest(CrawlDelayAndRequestRateTest):
     agent = 'FigTree Robot libwww-perl/5.04'
-    # these are not actually tested, but we still need to parse it
-    # in order to accommodate the input parameters
-    request_rate = None
-    crawl_delay = None
 
 
 class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 883ef24921..f3bd806f07 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -179,7 +179,9 @@ class RobotFileParser:
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.delay
-        return self.default_entry.delay
+        if self.default_entry:
+            return self.default_entry.delay
+        return None
 
     def request_rate(self, useragent):
         if not self.mtime():
@@ -187,7 +189,9 @@ class RobotFileParser:
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.req_rate
-        return self.default_entry.req_rate
+        if self.default_entry:
+            return self.default_entry.req_rate
+        return None
 
     def __str__(self):
         entries = self.entries
diff --git a/Misc/NEWS.d/next/Library/2019-06-11-19-34-29.bpo-35922.rxpzWr.rst b/Misc/NEWS.d/next/Library/2019-06-11-19-34-29.bpo-35922.rxpzWr.rst
new file mode 100644
index 0000000000..5271a49562
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-06-11-19-34-29.bpo-35922.rxpzWr.rst
@@ -0,0 +1,4 @@
+Fix :meth:`RobotFileParser.crawl_delay` and
+:meth:`RobotFileParser.request_rate` to return ``None`` rather than
+raise :exc:`AttributeError` when no relevant rule is defined in the
+robots.txt file. Patch by Rémi Lapeyre.