summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVictor Stinner <victor.stinner@haypocalc.com>2011-01-14 13:05:21 (GMT)
committerVictor Stinner <victor.stinner@haypocalc.com>2011-01-14 13:05:21 (GMT)
commit5c23b8e6ea7cdb0002842d16dbce4b4d716dd35a (patch)
treef426907a4cf85dcbf58941f4c78a855554e4fce9
parent1d87deb60546050f3ef3c136ae124efb96de7517 (diff)
downloadcpython-5c23b8e6ea7cdb0002842d16dbce4b4d716dd35a.zip
cpython-5c23b8e6ea7cdb0002842d16dbce4b4d716dd35a.tar.gz
cpython-5c23b8e6ea7cdb0002842d16dbce4b4d716dd35a.tar.bz2
Issue #4953: cgi.FieldStorage and cgi.parse() parse the request as bytes, not
as unicode, and accept binary files. Add encoding and errors attributes to cgi.FieldStorage.
-rwxr-xr-xLib/cgi.py232
-rw-r--r--Lib/test/test_cgi.py33
-rw-r--r--Misc/NEWS4
3 files changed, 190 insertions, 79 deletions
diff --git a/Lib/cgi.py b/Lib/cgi.py
index 8786e58..d17ed96 100755
--- a/Lib/cgi.py
+++ b/Lib/cgi.py
@@ -31,13 +31,15 @@ __version__ = "2.6"
# Imports
# =======
-from io import StringIO
+from io import StringIO, BytesIO, TextIOWrapper
import sys
import os
import urllib.parse
-import email.parser
+from email.parser import FeedParser
from warnings import warn
import html
+import locale
+import tempfile
__all__ = ["MiniFieldStorage", "FieldStorage",
"parse", "parse_qs", "parse_qsl", "parse_multipart",
@@ -109,7 +111,7 @@ def parse(fp=None, environ=os.environ, keep_blank_values=0, strict_parsing=0):
Arguments, all optional:
- fp : file pointer; default: sys.stdin
+ fp : file pointer; default: sys.stdin.buffer
environ : environment dictionary; default: os.environ
@@ -126,6 +128,18 @@ def parse(fp=None, environ=os.environ, keep_blank_values=0, strict_parsing=0):
"""
if fp is None:
fp = sys.stdin
+
+ # field keys and values (except for files) are returned as strings
+ # an encoding is required to decode the bytes read from self.fp
+ if hasattr(fp,'encoding'):
+ encoding = fp.encoding
+ else:
+ encoding = 'latin-1'
+
+ # fp.read() must return bytes
+ if isinstance(fp, TextIOWrapper):
+ fp = fp.buffer
+
if not 'REQUEST_METHOD' in environ:
environ['REQUEST_METHOD'] = 'GET' # For testing stand-alone
if environ['REQUEST_METHOD'] == 'POST':
@@ -136,7 +150,7 @@ def parse(fp=None, environ=os.environ, keep_blank_values=0, strict_parsing=0):
clength = int(environ['CONTENT_LENGTH'])
if maxlen and clength > maxlen:
raise ValueError('Maximum content length exceeded')
- qs = fp.read(clength)
+ qs = fp.read(clength).decode(encoding)
else:
qs = '' # Unknown content-type
if 'QUERY_STRING' in environ:
@@ -154,7 +168,8 @@ def parse(fp=None, environ=os.environ, keep_blank_values=0, strict_parsing=0):
else:
qs = ""
environ['QUERY_STRING'] = qs # XXX Shouldn't, really
- return urllib.parse.parse_qs(qs, keep_blank_values, strict_parsing)
+ return urllib.parse.parse_qs(qs, keep_blank_values, strict_parsing,
+ encoding=encoding)
# parse query string function called from urlparse,
@@ -236,8 +251,8 @@ def parse_multipart(fp, pdict):
if not line:
terminator = lastpart # End outer loop
break
- if line[:2] == "--":
- terminator = line.strip()
+ if line.startswith("--"):
+ terminator = line.rstrip()
if terminator in (nextpart, lastpart):
break
lines.append(line)
@@ -352,9 +367,10 @@ class FieldStorage:
value: the value as a *string*; for file uploads, this
transparently reads the file every time you request the value
+ and returns *bytes*
- file: the file(-like) object from which you can read the data;
- None if the data is stored a simple string
+ file: the file(-like) object from which you can read the data *as
+ bytes* ; None if the data is stored a simple string
type: the content-type, or None if not specified
@@ -375,15 +391,18 @@ class FieldStorage:
directory and unlinking them as soon as they have been opened.
"""
-
- def __init__(self, fp=None, headers=None, outerboundary="",
- environ=os.environ, keep_blank_values=0, strict_parsing=0):
+ def __init__(self, fp=None, headers=None, outerboundary=b'',
+ environ=os.environ, keep_blank_values=0, strict_parsing=0,
+ limit=None, encoding='utf-8', errors='replace'):
"""Constructor. Read multipart/* until last part.
Arguments, all optional:
- fp : file pointer; default: sys.stdin
+ fp : file pointer; default: sys.stdin.buffer
(not used when the request method is GET)
+ Can be :
+ 1. a TextIOWrapper object
+ 2. an object whose read() and readline() methods return bytes
headers : header dictionary-like object; default:
taken from environ as per CGI spec
@@ -404,6 +423,16 @@ class FieldStorage:
If false (the default), errors are silently ignored.
If true, errors raise a ValueError exception.
+ limit : used internally to read parts of multipart/form-data forms,
+ to exit from the reading loop when reached. It is the difference
+ between the form content-length and the number of bytes already
+ read
+
+ encoding, errors : the encoding and error handler used to decode the
+ binary stream to strings. Must be the same as the charset defined
+ for the page sending the form (content-type : meta http-equiv or
+ header)
+
"""
method = 'GET'
self.keep_blank_values = keep_blank_values
@@ -418,7 +447,8 @@ class FieldStorage:
qs = sys.argv[1]
else:
qs = ""
- fp = StringIO(qs)
+ qs = qs.encode(locale.getpreferredencoding(), 'surrogateescape')
+ fp = BytesIO(qs)
if headers is None:
headers = {'content-type':
"application/x-www-form-urlencoded"}
@@ -433,10 +463,26 @@ class FieldStorage:
self.qs_on_post = environ['QUERY_STRING']
if 'CONTENT_LENGTH' in environ:
headers['content-length'] = environ['CONTENT_LENGTH']
- self.fp = fp or sys.stdin
+ if fp is None:
+ self.fp = sys.stdin.buffer
+ # self.fp.read() must return bytes
+ elif isinstance(fp, TextIOWrapper):
+ self.fp = fp.buffer
+ else:
+ self.fp = fp
+
+ self.encoding = encoding
+ self.errors = errors
+
self.headers = headers
+ if not isinstance(outerboundary, bytes):
+ raise TypeError('outerboundary must be bytes, not %s'
+ % type(outerboundary).__name__)
self.outerboundary = outerboundary
+ self.bytes_read = 0
+ self.limit = limit
+
# Process content-disposition header
cdisp, pdict = "", {}
if 'content-disposition' in self.headers:
@@ -449,6 +495,7 @@ class FieldStorage:
self.filename = None
if 'filename' in pdict:
self.filename = pdict['filename']
+ self._binary_file = self.filename is not None
# Process content-type header
#
@@ -470,9 +517,11 @@ class FieldStorage:
ctype, pdict = 'application/x-www-form-urlencoded', {}
self.type = ctype
self.type_options = pdict
- self.innerboundary = ""
if 'boundary' in pdict:
- self.innerboundary = pdict['boundary']
+ self.innerboundary = pdict['boundary'].encode(self.encoding)
+ else:
+ self.innerboundary = b""
+
clen = -1
if 'content-length' in self.headers:
try:
@@ -482,6 +531,8 @@ class FieldStorage:
if maxlen and clen > maxlen:
raise ValueError('Maximum content length exceeded')
self.length = clen
+ if self.limit is None and clen:
+ self.limit = clen
self.list = self.file = None
self.done = 0
@@ -582,12 +633,18 @@ class FieldStorage:
def read_urlencoded(self):
"""Internal: read data in query string format."""
qs = self.fp.read(self.length)
+ if not isinstance(qs, bytes):
+ raise ValueError("%s should return bytes, got %s" \
+ % (self.fp, type(qs).__name__))
+ qs = qs.decode(self.encoding, self.errors)
if self.qs_on_post:
qs += '&' + self.qs_on_post
- self.list = list = []
- for key, value in urllib.parse.parse_qsl(qs, self.keep_blank_values,
- self.strict_parsing):
- list.append(MiniFieldStorage(key, value))
+ self.list = []
+ query = urllib.parse.parse_qsl(
+ qs, self.keep_blank_values, self.strict_parsing,
+ encoding=self.encoding, errors=self.errors)
+ for key, value in query:
+ self.list.append(MiniFieldStorage(key, value))
self.skip_lines()
FieldStorageClass = None
@@ -599,24 +656,42 @@ class FieldStorage:
raise ValueError('Invalid boundary in multipart form: %r' % (ib,))
self.list = []
if self.qs_on_post:
- for key, value in urllib.parse.parse_qsl(self.qs_on_post,
- self.keep_blank_values, self.strict_parsing):
+ query = urllib.parse.parse_qsl(
+ self.qs_on_post, self.keep_blank_values, self.strict_parsing,
+ encoding=self.encoding, errors=self.errors)
+ for key, value in query:
self.list.append(MiniFieldStorage(key, value))
FieldStorageClass = None
klass = self.FieldStorageClass or self.__class__
- parser = email.parser.FeedParser()
- # Create bogus content-type header for proper multipart parsing
- parser.feed('Content-Type: %s; boundary=%s\r\n\r\n' % (self.type, ib))
- parser.feed(self.fp.read())
- full_msg = parser.close()
- # Get subparts
- msgs = full_msg.get_payload()
- for msg in msgs:
- fp = StringIO(msg.get_payload())
- part = klass(fp, msg, ib, environ, keep_blank_values,
- strict_parsing)
+ first_line = self.fp.readline() # bytes
+ if not isinstance(first_line, bytes):
+ raise ValueError("%s should return bytes, got %s" \
+ % (self.fp, type(first_line).__name__))
+ self.bytes_read += len(first_line)
+ # first line holds boundary ; ignore it, or check that
+ # b"--" + ib == first_line.strip() ?
+ while True:
+ parser = FeedParser()
+ hdr_text = b""
+ while True:
+ data = self.fp.readline()
+ hdr_text += data
+ if not data.strip():
+ break
+ if not hdr_text:
+ break
+ # parser takes strings, not bytes
+ self.bytes_read += len(hdr_text)
+ parser.feed(hdr_text.decode(self.encoding, self.errors))
+ headers = parser.close()
+ part = klass(self.fp, headers, ib, environ, keep_blank_values,
+ strict_parsing,self.limit-self.bytes_read,
+ self.encoding, self.errors)
+ self.bytes_read += part.bytes_read
self.list.append(part)
+ if self.bytes_read >= self.length:
+ break
self.skip_lines()
def read_single(self):
@@ -636,7 +711,11 @@ class FieldStorage:
todo = self.length
if todo >= 0:
while todo > 0:
- data = self.fp.read(min(todo, self.bufsize))
+ data = self.fp.read(min(todo, self.bufsize)) # bytes
+ if not isinstance(data, bytes):
+ raise ValueError("%s should return bytes, got %s"
+ % (self.fp, type(data).__name__))
+ self.bytes_read += len(data)
if not data:
self.done = -1
break
@@ -645,59 +724,77 @@ class FieldStorage:
def read_lines(self):
"""Internal: read lines until EOF or outerboundary."""
- self.file = self.__file = StringIO()
+ if self._binary_file:
+ self.file = self.__file = BytesIO() # store data as bytes for files
+ else:
+ self.file = self.__file = StringIO() # as strings for other fields
if self.outerboundary:
self.read_lines_to_outerboundary()
else:
self.read_lines_to_eof()
def __write(self, line):
+ """line is always bytes, not string"""
if self.__file is not None:
if self.__file.tell() + len(line) > 1000:
self.file = self.make_file()
data = self.__file.getvalue()
self.file.write(data)
self.__file = None
- self.file.write(line)
+ if self._binary_file:
+ # keep bytes
+ self.file.write(line)
+ else:
+ # decode to string
+ self.file.write(line.decode(self.encoding, self.errors))
def read_lines_to_eof(self):
"""Internal: read lines until EOF."""
while 1:
- line = self.fp.readline(1<<16)
+ line = self.fp.readline(1<<16) # bytes
+ self.bytes_read += len(line)
if not line:
self.done = -1
break
self.__write(line)
def read_lines_to_outerboundary(self):
- """Internal: read lines until outerboundary."""
- next = "--" + self.outerboundary
- last = next + "--"
- delim = ""
+ """Internal: read lines until outerboundary.
+ Data is read as bytes: boundaries and line ends must be converted
+ to bytes for comparisons.
+ """
+ next_boundary = b"--" + self.outerboundary
+ last_boundary = next_boundary + b"--"
+ delim = b""
last_line_lfend = True
+ _read = 0
while 1:
- line = self.fp.readline(1<<16)
+ if _read >= self.limit:
+ break
+ line = self.fp.readline(1<<16) # bytes
+ self.bytes_read += len(line)
+ _read += len(line)
if not line:
self.done = -1
break
- if line[:2] == "--" and last_line_lfend:
- strippedline = line.strip()
- if strippedline == next:
+ if line.startswith(b"--") and last_line_lfend:
+ strippedline = line.rstrip()
+ if strippedline == next_boundary:
break
- if strippedline == last:
+ if strippedline == last_boundary:
self.done = 1
break
odelim = delim
- if line[-2:] == "\r\n":
- delim = "\r\n"
+ if line.endswith(b"\r\n"):
+ delim = b"\r\n"
line = line[:-2]
last_line_lfend = True
- elif line[-1] == "\n":
- delim = "\n"
+ elif line.endswith(b"\n"):
+ delim = b"\n"
line = line[:-1]
last_line_lfend = True
else:
- delim = ""
+ delim = b""
last_line_lfend = False
self.__write(odelim + line)
@@ -705,22 +802,23 @@ class FieldStorage:
"""Internal: skip lines until outer boundary if defined."""
if not self.outerboundary or self.done:
return
- next = "--" + self.outerboundary
- last = next + "--"
+ next_boundary = b"--" + self.outerboundary
+ last_boundary = next_boundary + b"--"
last_line_lfend = True
- while 1:
+ while True:
line = self.fp.readline(1<<16)
+ self.bytes_read += len(line)
if not line:
self.done = -1
break
- if line[:2] == "--" and last_line_lfend:
+ if line.endswith(b"--") and last_line_lfend:
strippedline = line.strip()
- if strippedline == next:
+ if strippedline == next_boundary:
break
- if strippedline == last:
+ if strippedline == last_boundary:
self.done = 1
break
- last_line_lfend = line.endswith('\n')
+ last_line_lfend = line.endswith(b'\n')
def make_file(self):
"""Overridable: return a readable & writable file.
@@ -730,7 +828,8 @@ class FieldStorage:
- seek(0)
- data is read from it
- The file is always opened in text mode.
+ The file is opened in binary mode for files, in text mode
+ for other fields
This version opens a temporary file for reading and writing,
and immediately deletes (unlinks) it. The trick (on Unix!) is
@@ -745,8 +844,11 @@ class FieldStorage:
which unlinks the temporary files you have created.
"""
- import tempfile
- return tempfile.TemporaryFile("w+", encoding="utf-8", newline="\n")
+ if self._binary_file:
+ return tempfile.TemporaryFile("wb+")
+ else:
+ return tempfile.TemporaryFile("w+",
+ encoding=self.encoding, newline = '\n')
# Test/debug code
@@ -910,8 +1012,12 @@ def escape(s, quote=None):
return s
-def valid_boundary(s, _vb_pattern="^[ -~]{0,200}[!-~]$"):
+def valid_boundary(s, _vb_pattern=None):
import re
+ if isinstance(s, bytes):
+ _vb_pattern = b"^[ -~]{0,200}[!-~]$"
+ else:
+ _vb_pattern = "^[ -~]{0,200}[!-~]$"
return re.match(_vb_pattern, s)
# Invoke mainline
diff --git a/Lib/test/test_cgi.py b/Lib/test/test_cgi.py
index 2dcbfc1..3b6f59b 100644
--- a/Lib/test/test_cgi.py
+++ b/Lib/test/test_cgi.py
@@ -4,7 +4,7 @@ import os
import sys
import tempfile
import unittest
-from io import StringIO
+from io import StringIO, BytesIO
class HackedSysModule:
# The regression test will have real values in sys.argv, which
@@ -14,7 +14,6 @@ class HackedSysModule:
cgi.sys = HackedSysModule()
-
class ComparableException:
def __init__(self, err):
self.err = err
@@ -38,7 +37,7 @@ def do_test(buf, method):
env['REQUEST_METHOD'] = 'GET'
env['QUERY_STRING'] = buf
elif method == "POST":
- fp = StringIO(buf)
+ fp = BytesIO(buf.encode('latin-1')) # FieldStorage expects bytes
env['REQUEST_METHOD'] = 'POST'
env['CONTENT_TYPE'] = 'application/x-www-form-urlencoded'
env['CONTENT_LENGTH'] = str(len(buf))
@@ -106,9 +105,10 @@ def first_second_elts(list):
return [(p[0], p[1][0]) for p in list]
def gen_result(data, environ):
- fake_stdin = StringIO(data)
+ encoding = 'latin-1'
+ fake_stdin = BytesIO(data.encode(encoding))
fake_stdin.seek(0)
- form = cgi.FieldStorage(fp=fake_stdin, environ=environ)
+ form = cgi.FieldStorage(fp=fake_stdin, environ=environ, encoding=encoding)
result = {}
for k, v in dict(form).items():
@@ -122,9 +122,9 @@ class CgiTests(unittest.TestCase):
for orig, expect in parse_strict_test_cases:
# Test basic parsing
d = do_test(orig, "GET")
- self.assertEqual(d, expect, "Error parsing %s" % repr(orig))
+ self.assertEqual(d, expect, "Error parsing %s method GET" % repr(orig))
d = do_test(orig, "POST")
- self.assertEqual(d, expect, "Error parsing %s" % repr(orig))
+ self.assertEqual(d, expect, "Error parsing %s method POST" % repr(orig))
env = {'QUERY_STRING': orig}
fs = cgi.FieldStorage(environ=env)
@@ -181,9 +181,9 @@ class CgiTests(unittest.TestCase):
setattr(self, name, a)
return a
- f = TestReadlineFile(tempfile.TemporaryFile("w+"))
+ f = TestReadlineFile(tempfile.TemporaryFile("wb+"))
self.addCleanup(f.close)
- f.write('x' * 256 * 1024)
+ f.write(b'x' * 256 * 1024)
f.seek(0)
env = {'REQUEST_METHOD':'PUT'}
fs = cgi.FieldStorage(fp=f, environ=env)
@@ -192,6 +192,7 @@ class CgiTests(unittest.TestCase):
# (by read_binary); if we are chunking properly, it will be called 5 times
# as long as the chunksize is 1 << 16.
self.assertTrue(f.numcalls > 2)
+ f.close()
def test_fieldstorage_multipart(self):
#Test basic FieldStorage multipart parsing
@@ -216,11 +217,13 @@ Content-Disposition: form-data; name="submit"
Add\x20
-----------------------------721837373350705526688164684--
"""
- fs = cgi.FieldStorage(fp=StringIO(postdata), environ=env)
+ encoding = 'ascii'
+ fp = BytesIO(postdata.encode(encoding))
+ fs = cgi.FieldStorage(fp, environ=env, encoding=encoding)
self.assertEqual(len(fs.list), 4)
expect = [{'name':'id', 'filename':None, 'value':'1234'},
{'name':'title', 'filename':None, 'value':''},
- {'name':'file', 'filename':'test.txt', 'value':'Testing 123.'},
+ {'name':'file', 'filename':'test.txt', 'value':b'Testing 123.\n'},
{'name':'submit', 'filename':None, 'value':' Add '}]
for x in range(len(fs.list)):
for k, exp in expect[x].items():
@@ -245,8 +248,7 @@ Content-Disposition: form-data; name="submit"
self.assertEqual(self._qs_result, v)
def testQSAndFormData(self):
- data = """
----123
+ data = """---123
Content-Disposition: form-data; name="key2"
value2y
@@ -270,8 +272,7 @@ value4
self.assertEqual(self._qs_result, v)
def testQSAndFormDataFile(self):
- data = """
----123
+ data = """---123
Content-Disposition: form-data; name="key2"
value2y
@@ -299,7 +300,7 @@ this is the content of the fake file
}
result = self._qs_result.copy()
result.update({
- 'upload': 'this is the content of the fake file'
+ 'upload': b'this is the content of the fake file\n'
})
v = gen_result(data, environ)
self.assertEqual(result, v)
diff --git a/Misc/NEWS b/Misc/NEWS
index 3b0e813..dff100b 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -43,6 +43,10 @@ Core and Builtins
Library
-------
+- Issue #4953: cgi.FieldStorage and cgi.parse() parse the request as bytes, not
+ as unicode, and accept binary files. Add encoding and errors attributes to
+ cgi.FieldStorage.
+
- Add encoding and errors arguments to urllib.parse_qs() and urllib.parse_qsl()
- Issue #10899: No function type annotations in the standard library.