summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- Lib/test/test_robotparser.py 12
-rw-r--r-- Lib/urllib/robotparser.py 1
-rw-r--r-- Misc/NEWS 4
3 files changed, 17 insertions, 0 deletions
diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index 8c09e74..d1dfd9e 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -234,6 +234,18 @@ bad = ['/some/path']
RobotTest(15, doc, good, bad)
+# 16. Empty query (issue #17403). Normalizing the url first.
+doc = """
+User-agent: *
+Allow: /some/path?
+Disallow: /another/path?
+"""
+
+good = ['/some/path?']
+bad = ['/another/path?']
+
+RobotTest(16, doc, good, bad)
+
class NetworkTestCase(unittest.TestCase):
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 75be4af..978ba58 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -157,6 +157,7 @@ class RuleLine:
if path == '' and not allowance:
# an empty value means allow all
allowance = True
+ path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
self.path = urllib.parse.quote(path)
self.allowance = allowance
diff --git a/Misc/NEWS b/Misc/NEWS
index be1be6a..3b9416b 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -96,6 +96,10 @@ Core and Builtins
Library
-------
+- Issue #17403: urllib.robotparser normalizes the urls before adding to
+ ruleline. This helps in handling certain types of invalid urls in a
+ conservative manner. Patch contributed by Mher Movsisyan.
+
- Issue #18070: Have importlib.util.module_for_loader() set attributes
unconditionally in order to properly support reloading.