summary | refs | log | tree | commit | diff | stats
path: root/Lib/urllib/robotparser.py
diff options
context:
space:
mode:
author: Jeremy Hylton <jeremy@alum.mit.edu> 2008-07-18 20:59:44 (GMT)
committer: Jeremy Hylton <jeremy@alum.mit.edu> 2008-07-18 20:59:44 (GMT)
commit: 73fd46d24e45c34f0fb87261e5471584a7c273df (patch)
tree: cf36eca08149e5fe933a90b71e7b3b3a1521305a /Lib/urllib/robotparser.py
parent: 48577d1944c6b03be12bd7b144eb22db6bd6d296 (diff)
download: cpython-73fd46d24e45c34f0fb87261e5471584a7c273df.zip
cpython-73fd46d24e45c34f0fb87261e5471584a7c273df.tar.gz
cpython-73fd46d24e45c34f0fb87261e5471584a7c273df.tar.bz2
Bug 3347: robotparser failed because it didn't convert bytes to string.
The solution is to decode the bytes to text as UTF-8. I'm not entirely sure if this is safe, but it looks like robots.txt is expected to be ASCII.
Diffstat (limited to 'Lib/urllib/robotparser.py')
-rw-r--r-- Lib/urllib/robotparser.py 8
1 files changed, 6 insertions, 2 deletions
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index a91df8d..c55fb50 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -60,7 +60,8 @@ class RobotFileParser:
elif err.code >= 400:
self.allow_all = True
else:
- self.parse(f.read().splitlines())
+ raw = f.read()
+ self.parse(raw.decode("utf-8").splitlines())
def _add_entry(self, entry):
if "*" in entry.useragents:
@@ -123,7 +124,10 @@ class RobotFileParser:
return True
# search for given user agent matches
# the first match counts
- url = urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/"
+ url = urllib.parse.quote(
+ urllib.parse.urlparse(urllib.parse.unquote(url))[2])
+ if not url:
+ url = "/"
for entry in self.entries:
if entry.applies_to(useragent):
return entry.allowance(url)