diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2015-05-31 17:21:00 (GMT) |
---|---|---|
committer | Serhiy Storchaka <storchaka@gmail.com> | 2015-05-31 17:21:00 (GMT) |
commit | c7797dc7482035ee166ca2e941b623382b92e1fc (patch) | |
tree | 526e26fa4dac506f02859fdbe946d33ed4165f5e | |
parent | cfb7028df4bdf12325786e48ebef3b4982efa119 (diff) | |
download | cpython-c7797dc7482035ee166ca2e941b623382b92e1fc.zip cpython-c7797dc7482035ee166ca2e941b623382b92e1fc.tar.gz cpython-c7797dc7482035ee166ca2e941b623382b92e1fc.tar.bz2 |
Issue #19543: Emit deprecation warning for known non-text encodings.
Backported issue #19619: encode() and decode() methods and constructors
of str, unicode and bytearray classes now emit a deprecation warning for known
non-text encodings when Python is run with the -3 option.
Backported issue #20404: io.TextIOWrapper (and hence io.open()) now uses the
internal codec marking system added to emit a deprecation warning for known non-text
encodings at stream construction time when Python is run with the -3 option.
-rw-r--r-- | Include/codecs.h | 45 | ||||
-rw-r--r-- | Lib/_pyio.py | 6 | ||||
-rw-r--r-- | Lib/codecs.py | 14 | ||||
-rw-r--r-- | Lib/encodings/base64_codec.py | 1 | ||||
-rw-r--r-- | Lib/encodings/bz2_codec.py | 1 | ||||
-rw-r--r-- | Lib/encodings/hex_codec.py | 1 | ||||
-rw-r--r-- | Lib/encodings/quopri_codec.py | 1 | ||||
-rwxr-xr-x | Lib/encodings/rot_13.py | 1 | ||||
-rw-r--r-- | Lib/encodings/uu_codec.py | 1 | ||||
-rw-r--r-- | Lib/encodings/zlib_codec.py | 1 | ||||
-rw-r--r-- | Lib/json/decoder.py | 6 | ||||
-rw-r--r-- | Lib/test/string_tests.py | 12 | ||||
-rw-r--r-- | Lib/test/test_calendar.py | 4 | ||||
-rw-r--r-- | Lib/test/test_codecs.py | 48 | ||||
-rw-r--r-- | Lib/test/test_fileinput.py | 7 | ||||
-rw-r--r-- | Lib/test/test_io.py | 43 | ||||
-rw-r--r-- | Misc/NEWS | 8 | ||||
-rw-r--r-- | Modules/_io/textio.c | 34 | ||||
-rw-r--r-- | Objects/bytearrayobject.c | 6 | ||||
-rw-r--r-- | Objects/stringobject.c | 4 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 8 | ||||
-rw-r--r-- | Python/codecs.c | 209 |
22 files changed, 391 insertions, 70 deletions
diff --git a/Include/codecs.h b/Include/codecs.h index c038c6a..8a9041b 100644 --- a/Include/codecs.h +++ b/Include/codecs.h @@ -81,6 +81,51 @@ PyAPI_FUNC(PyObject *) PyCodec_Decode( const char *errors ); +/* Text codec specific encoding and decoding API. + + Checks the encoding against a list of codecs which do not + implement a unicode<->bytes encoding before attempting the + operation. + + Please note that these APIs are internal and should not + be used in Python C extensions. + + XXX (ncoghlan): should we make these, or something like them, public + in Python 3.5+? + + */ +PyAPI_FUNC(PyObject *) _PyCodec_LookupTextEncoding( + const char *encoding, + const char *alternate_command + ); + +PyAPI_FUNC(PyObject *) _PyCodec_EncodeText( + PyObject *object, + const char *encoding, + const char *errors + ); + +PyAPI_FUNC(PyObject *) _PyCodec_DecodeText( + PyObject *object, + const char *encoding, + const char *errors + ); + +/* These two aren't actually text encoding specific, but _io.TextIOWrapper + * is the only current API consumer. 
+ */ +PyAPI_FUNC(PyObject *) _PyCodecInfo_GetIncrementalDecoder( + PyObject *codec_info, + const char *errors + ); + +PyAPI_FUNC(PyObject *) _PyCodecInfo_GetIncrementalEncoder( + PyObject *codec_info, + const char *errors + ); + + + /* --- Codec Lookup APIs -------------------------------------------------- All APIs return a codec object with incremented refcount and are diff --git a/Lib/_pyio.py b/Lib/_pyio.py index a7f4301..694b778 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -7,6 +7,7 @@ from __future__ import (print_function, unicode_literals) import os import abc import codecs +import sys import warnings import errno # Import thread instead of threading to reduce startup cost @@ -1497,6 +1498,11 @@ class TextIOWrapper(TextIOBase): if not isinstance(encoding, basestring): raise ValueError("invalid encoding: %r" % encoding) + if sys.py3kwarning and not codecs.lookup(encoding)._is_text_encoding: + msg = ("%r is not a text encoding; " + "use codecs.open() to handle arbitrary codecs") + warnings.warnpy3k(msg % encoding, stacklevel=2) + if errors is None: errors = "strict" else: diff --git a/Lib/codecs.py b/Lib/codecs.py index 049a3f0..12213e2 100644 --- a/Lib/codecs.py +++ b/Lib/codecs.py @@ -79,9 +79,19 @@ BOM64_BE = BOM_UTF32_BE ### Codec base classes (defining the API) class CodecInfo(tuple): + """Codec details when looking up the codec registry""" + + # Private API to allow Python to blacklist the known non-Unicode + # codecs in the standard library. 
A more general mechanism to + # reliably distinguish test encodings from other codecs will hopefully + # be defined for Python 3.5 + # + # See http://bugs.python.org/issue19619 + _is_text_encoding = True # Assume codecs are text encodings by default def __new__(cls, encode, decode, streamreader=None, streamwriter=None, - incrementalencoder=None, incrementaldecoder=None, name=None): + incrementalencoder=None, incrementaldecoder=None, name=None, + _is_text_encoding=None): self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter)) self.name = name self.encode = encode @@ -90,6 +100,8 @@ class CodecInfo(tuple): self.incrementaldecoder = incrementaldecoder self.streamwriter = streamwriter self.streamreader = streamreader + if _is_text_encoding is not None: + self._is_text_encoding = _is_text_encoding return self def __repr__(self): diff --git a/Lib/encodings/base64_codec.py b/Lib/encodings/base64_codec.py index f84e780..34ac555 100644 --- a/Lib/encodings/base64_codec.py +++ b/Lib/encodings/base64_codec.py @@ -76,4 +76,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_text_encoding=False, ) diff --git a/Lib/encodings/bz2_codec.py b/Lib/encodings/bz2_codec.py index 054b36b..136503a 100644 --- a/Lib/encodings/bz2_codec.py +++ b/Lib/encodings/bz2_codec.py @@ -99,4 +99,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_text_encoding=False, ) diff --git a/Lib/encodings/hex_codec.py b/Lib/encodings/hex_codec.py index 91b38d9..154488c 100644 --- a/Lib/encodings/hex_codec.py +++ b/Lib/encodings/hex_codec.py @@ -76,4 +76,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_text_encoding=False, ) diff --git a/Lib/encodings/quopri_codec.py b/Lib/encodings/quopri_codec.py index d8683fd..f259149 100644 --- a/Lib/encodings/quopri_codec.py +++ 
b/Lib/encodings/quopri_codec.py @@ -72,4 +72,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_text_encoding=False, ) diff --git a/Lib/encodings/rot_13.py b/Lib/encodings/rot_13.py index 52b6431..4eaf433 100755 --- a/Lib/encodings/rot_13.py +++ b/Lib/encodings/rot_13.py @@ -44,6 +44,7 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader, + _is_text_encoding=False, ) ### Decoding Map diff --git a/Lib/encodings/uu_codec.py b/Lib/encodings/uu_codec.py index 4b137a5..5cb0d2b 100644 --- a/Lib/encodings/uu_codec.py +++ b/Lib/encodings/uu_codec.py @@ -126,4 +126,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_text_encoding=False, ) diff --git a/Lib/encodings/zlib_codec.py b/Lib/encodings/zlib_codec.py index 3419f9f..0c2599d 100644 --- a/Lib/encodings/zlib_codec.py +++ b/Lib/encodings/zlib_codec.py @@ -99,4 +99,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter, + _is_text_encoding=False, ) diff --git a/Lib/json/decoder.py b/Lib/json/decoder.py index 1b43238..5141f87 100644 --- a/Lib/json/decoder.py +++ b/Lib/json/decoder.py @@ -15,10 +15,8 @@ __all__ = ['JSONDecoder'] FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL def _floatconstants(): - _BYTES = '7FF80000000000007FF0000000000000'.decode('hex') - if sys.byteorder != 'big': - _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1] - nan, inf = struct.unpack('dd', _BYTES) + nan, = struct.unpack('>d', b'\x7f\xf8\x00\x00\x00\x00\x00\x00') + inf, = struct.unpack('>d', b'\x7f\xf0\x00\x00\x00\x00\x00\x00') return nan, inf, -inf NaN, PosInf, NegInf = _floatconstants() diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 6d87eb6..b2f837b 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -1295,8 +1295,10 @@ class 
MixinStrUserStringTest: ('hex', '68656c6c6f20776f726c64'), ('uu', 'begin 666 <data>\n+:&5L;&\\@=V]R;&0 \n \nend\n')] for encoding, data in codecs: - self.checkequal(data, 'hello world', 'encode', encoding) - self.checkequal('hello world', data, 'decode', encoding) + with test_support.check_py3k_warnings(): + self.checkequal(data, 'hello world', 'encode', encoding) + with test_support.check_py3k_warnings(): + self.checkequal('hello world', data, 'decode', encoding) # zlib is optional, so we make the test optional too... try: import zlib @@ -1304,8 +1306,10 @@ class MixinStrUserStringTest: pass else: data = 'x\x9c\xcbH\xcd\xc9\xc9W(\xcf/\xcaI\x01\x00\x1a\x0b\x04]' - self.checkequal(data, 'hello world', 'encode', 'zlib') - self.checkequal('hello world', data, 'decode', 'zlib') + with test_support.check_py3k_warnings(): + self.checkequal(data, 'hello world', 'encode', 'zlib') + with test_support.check_py3k_warnings(): + self.checkequal('hello world', data, 'decode', 'zlib') self.checkraises(TypeError, 'xyz', 'decode', 42) self.checkraises(TypeError, 'xyz', 'encode', 42) diff --git a/Lib/test/test_calendar.py b/Lib/test/test_calendar.py index 5692642..46c4a6f 100644 --- a/Lib/test/test_calendar.py +++ b/Lib/test/test_calendar.py @@ -513,8 +513,8 @@ class CommandLineTestCase(unittest.TestCase): def test_option_encoding(self): self.assertFailure('-e') self.assertFailure('--encoding') - stdout = self.run_ok('--encoding', 'rot-13', '2004') - self.assertEqual(stdout.strip(), conv(result_2004_text.encode('rot-13')).strip()) + stdout = self.run_ok('--encoding', 'utf-16-le', '2004') + self.assertEqual(stdout.strip(), conv(result_2004_text.encode('utf-16-le')).strip()) def test_option_locale(self): self.assertFailure('-L') diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index de80b07..c7072a6 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1395,14 +1395,14 @@ class EncodedFileTest(unittest.TestCase): class Str2StrTest(unittest.TestCase): def 
test_read(self): - sin = "\x80".encode("base64_codec") + sin = codecs.encode("\x80", "base64_codec") reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin)) sout = reader.read() self.assertEqual(sout, "\x80") self.assertIsInstance(sout, str) def test_readline(self): - sin = "\x80".encode("base64_codec") + sin = codecs.encode("\x80", "base64_codec") reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin)) sout = reader.readline() self.assertEqual(sout, "\x80") @@ -1536,6 +1536,9 @@ broken_unicode_with_streams = [ ] broken_incremental_coders = broken_unicode_with_streams[:] +if sys.flags.py3k_warning: + broken_unicode_with_streams.append("rot_13") + # The following encodings only support "strict" mode only_strict_mode = [ "idna", @@ -2135,6 +2138,47 @@ def test_main(): # Missing "begin" line self.assertRaises(ValueError, codecs.decode, "", "uu-codec") + def test_text_to_binary_blacklists_binary_transforms(self): + # Check binary -> binary codecs give a good error for str input + bad_input = "bad input type" + for encoding in bytes_transform_encodings: + fmt = (r"{!r} is not a text encoding; " + r"use codecs.encode\(\) to handle arbitrary codecs") + msg = fmt.format(encoding) + with self.assertRaisesRegex(LookupError, msg) as failure: + bad_input.encode(encoding) + self.assertIsNone(failure.exception.__cause__) + + def test_text_to_binary_blacklists_text_transforms(self): + # Check str.encode gives a good error message for str -> str codecs + msg = (r"^'rot_13' is not a text encoding; " + r"use codecs.encode\(\) to handle arbitrary codecs") + with self.assertRaisesRegex(LookupError, msg): + "just an example message".encode("rot_13") + + def test_binary_to_text_blacklists_binary_transforms(self): + # Check bytes.decode and bytearray.decode give a good error + # message for binary -> binary codecs + data = b"encode first to ensure we meet any format restrictions" + for encoding in bytes_transform_encodings: + encoded_data = codecs.encode(data, 
encoding) + fmt = (r"{!r} is not a text encoding; " + r"use codecs.decode\(\) to handle arbitrary codecs") + msg = fmt.format(encoding) + with self.assertRaisesRegex(LookupError, msg): + encoded_data.decode(encoding) + with self.assertRaisesRegex(LookupError, msg): + bytearray(encoded_data).decode(encoding) + + def test_binary_to_text_blacklists_text_transforms(self): + # Check str -> str codec gives a good error for binary input + for bad_input in (b"immutable", bytearray(b"mutable")): + msg = (r"^'rot_13' is not a text encoding; " + r"use codecs.decode\(\) to handle arbitrary codecs") + with self.assertRaisesRegex(LookupError, msg) as failure: + bad_input.decode("rot_13") + self.assertIsNone(failure.exception.__cause__) + if __name__ == "__main__": test_main() diff --git a/Lib/test/test_fileinput.py b/Lib/test/test_fileinput.py index c15ad84..facc56e 100644 --- a/Lib/test/test_fileinput.py +++ b/Lib/test/test_fileinput.py @@ -211,10 +211,11 @@ class FileInputTests(unittest.TestCase): except ValueError: pass try: - t1 = writeTmp(1, ["A\nB"], mode="wb") - fi = FileInput(files=t1, openhook=hook_encoded("rot13")) + # UTF-7 is a convenient, seldom used encoding + t1 = writeTmp(1, ['+AEE-\n+AEI-'], mode="wb") + fi = FileInput(files=t1, openhook=hook_encoded("utf-7")) lines = list(fi) - self.assertEqual(lines, ["N\n", "O"]) + self.assertEqual(lines, [u'A\n', u'B']) finally: remove_tempfiles(t1) diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py index bbc804b..1a17d81 100644 --- a/Lib/test/test_io.py +++ b/Lib/test/test_io.py @@ -2001,6 +2001,15 @@ class TextIOWrapperTest(unittest.TestCase): t.__init__(self.MockRawIO()) self.assertEqual(t.read(0), u'') + def test_non_text_encoding_codecs_are_rejected(self): + # Ensure the constructor complains if passed a codec that isn't + # marked as a text encoding + # http://bugs.python.org/issue20404 + r = self.BytesIO() + b = self.BufferedWriter(r) + with support.check_py3k_warnings(): + self.TextIOWrapper(b, 
encoding="hex_codec") + def test_detach(self): r = self.BytesIO() b = self.BufferedWriter(r) @@ -2617,19 +2626,39 @@ class TextIOWrapperTest(unittest.TestCase): def test_illegal_decoder(self): # Issue #17106 + # Bypass the early encoding check added in issue 20404 + def _make_illegal_wrapper(): + quopri = codecs.lookup("quopri_codec") + quopri._is_text_encoding = True + try: + t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), + newline='\n', encoding="quopri_codec") + finally: + quopri._is_text_encoding = False + return t # Crash when decoder returns non-string - t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n', - encoding='quopri_codec') + with support.check_py3k_warnings(): + t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n', + encoding='quopri_codec') with self.maybeRaises(TypeError): t.read(1) - t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n', - encoding='quopri_codec') + with support.check_py3k_warnings(): + t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n', + encoding='quopri_codec') with self.maybeRaises(TypeError): t.readline() - t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n', - encoding='quopri_codec') + with support.check_py3k_warnings(): + t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n', + encoding='quopri_codec') with self.maybeRaises(TypeError): t.read() + #else: + #t = _make_illegal_wrapper() + #self.assertRaises(TypeError, t.read, 1) + #t = _make_illegal_wrapper() + #self.assertRaises(TypeError, t.readline) + #t = _make_illegal_wrapper() + #self.assertRaises(TypeError, t.read) class CTextIOWrapperTest(TextIOWrapperTest): @@ -3002,9 +3031,11 @@ class MiscIOTest(unittest.TestCase): class CMiscIOTest(MiscIOTest): io = io + shutdown_error = "RuntimeError: could not find io module state" class PyMiscIOTest(MiscIOTest): io = pyio + shutdown_error = "LookupError: unknown encoding: ascii" @unittest.skipIf(os.name == 'nt', 'POSIX signals required for this test.') @@ -10,6 +10,10 @@ What's 
New in Python 2.7.11? Core and Builtins ----------------- +- Issue #19543: encode() and decode() methods and constructors of str, + unicode and bytearray classes now emit deprecation warning for known + non-text encodings when Python is ran with the -3 option. + - Issue #24115: Update uses of PyObject_IsTrue(), PyObject_Not(), PyObject_IsInstance(), PyObject_RichCompareBool() and _PyDict_Contains() to check for and handle errors correctly. @@ -26,6 +30,10 @@ Core and Builtins Library ------- +- Issue #19543: io.TextIOWrapper (and hence io.open()) now uses the internal + codec marking system added to emit deprecation warning for known non-text + encodings at stream construction time when Python is ran with the -3 option. + - Issue #24264: Fixed buffer overflow in the imageop module. - Issue #5633: Fixed timeit when the statement is a string and the setup is not. diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c index 8ac8a4a..9981d4c 100644 --- a/Modules/_io/textio.c +++ b/Modules/_io/textio.c @@ -826,7 +826,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) char *kwlist[] = {"buffer", "encoding", "errors", "newline", "line_buffering", NULL}; - PyObject *buffer, *raw; + PyObject *buffer, *raw, *codec_info = NULL; char *encoding = NULL; char *errors = NULL; char *newline = NULL; @@ -909,6 +909,17 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) "could not determine default encoding"); } + /* Check we have been asked for a real text encoding */ + codec_info = _PyCodec_LookupTextEncoding(encoding, "codecs.open()"); + if (codec_info == NULL) { + Py_CLEAR(self->encoding); + goto error; + } + + /* XXX: Failures beyond this point have the potential to leak elements + * of the partially constructed object (like self->encoding) + */ + if (errors == NULL) errors = "strict"; self->errors = PyBytes_FromString(errors); @@ -922,7 +933,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) if (newline) { self->readnl = 
PyString_FromString(newline); if (self->readnl == NULL) - return -1; + goto error; } self->writetranslate = (newline == NULL || newline[0] != '\0'); if (!self->readuniversal && self->writetranslate) { @@ -944,8 +955,8 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) if (r == -1) goto error; if (r == 1) { - self->decoder = PyCodec_IncrementalDecoder( - encoding, errors); + self->decoder = _PyCodecInfo_GetIncrementalDecoder(codec_info, + errors); if (self->decoder == NULL) goto error; @@ -969,17 +980,12 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) if (r == -1) goto error; if (r == 1) { - PyObject *ci; - self->encoder = PyCodec_IncrementalEncoder( - encoding, errors); + self->encoder = _PyCodecInfo_GetIncrementalEncoder(codec_info, + errors); if (self->encoder == NULL) goto error; /* Get the normalized named of the codec */ - ci = _PyCodec_Lookup(encoding); - if (ci == NULL) - goto error; - res = PyObject_GetAttrString(ci, "name"); - Py_DECREF(ci); + res = PyObject_GetAttrString(codec_info, "name"); if (res == NULL) { if (PyErr_ExceptionMatches(PyExc_AttributeError)) PyErr_Clear(); @@ -999,6 +1005,9 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) Py_XDECREF(res); } + /* Finished sorting out the codec details */ + Py_DECREF(codec_info); + self->buffer = buffer; Py_INCREF(buffer); @@ -1059,6 +1068,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) return 0; error: + Py_XDECREF(codec_info); return -1; } diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c index fd201ca..5f57580 100644 --- a/Objects/bytearrayobject.c +++ b/Objects/bytearrayobject.c @@ -783,7 +783,7 @@ bytearray_init(PyByteArrayObject *self, PyObject *args, PyObject *kwds) if (PyBytes_Check(arg)) { PyObject *new, *encoded; if (encoding != NULL) { - encoded = PyCodec_Encode(arg, encoding, errors); + encoded = _PyCodec_EncodeText(arg, encoding, errors); if (encoded == NULL) return -1; 
assert(PyBytes_Check(encoded)); @@ -809,7 +809,7 @@ bytearray_init(PyByteArrayObject *self, PyObject *args, PyObject *kwds) "unicode argument without an encoding"); return -1; } - encoded = PyCodec_Encode(arg, encoding, errors); + encoded = _PyCodec_EncodeText(arg, encoding, errors); if (encoded == NULL) return -1; assert(PyBytes_Check(encoded)); @@ -2567,7 +2567,7 @@ bytearray_decode(PyObject *self, PyObject *args, PyObject *kwargs) return NULL; #endif } - return PyCodec_Decode(self, encoding, errors); + return _PyCodec_DecodeText(self, encoding, errors); } PyDoc_STRVAR(alloc_doc, diff --git a/Objects/stringobject.c b/Objects/stringobject.c index 46f46db..c1e12a7 100644 --- a/Objects/stringobject.c +++ b/Objects/stringobject.c @@ -449,7 +449,7 @@ PyObject *PyString_AsDecodedObject(PyObject *str, } /* Decode via the codec registry */ - v = PyCodec_Decode(str, encoding, errors); + v = _PyCodec_DecodeText(str, encoding, errors); if (v == NULL) goto onError; @@ -529,7 +529,7 @@ PyObject *PyString_AsEncodedObject(PyObject *str, } /* Encode via the codec registry */ - v = PyCodec_Encode(str, encoding, errors); + v = _PyCodec_EncodeText(str, encoding, errors); if (v == NULL) goto onError; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 91e7524..08723ac 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1259,7 +1259,7 @@ PyObject *PyUnicode_Decode(const char *s, buffer = PyBuffer_FromMemory((void *)s, size); if (buffer == NULL) goto onError; - unicode = PyCodec_Decode(buffer, encoding, errors); + unicode = _PyCodec_DecodeText(buffer, encoding, errors); if (unicode == NULL) goto onError; if (!PyUnicode_Check(unicode)) { @@ -1292,7 +1292,7 @@ PyObject *PyUnicode_AsDecodedObject(PyObject *unicode, encoding = PyUnicode_GetDefaultEncoding(); /* Decode via the codec registry */ - v = PyCodec_Decode(unicode, encoding, errors); + v = _PyCodec_DecodeText(unicode, encoding, errors); if (v == NULL) goto onError; return v; @@ -1331,7 +1331,7 
@@ PyObject *PyUnicode_AsEncodedObject(PyObject *unicode, encoding = PyUnicode_GetDefaultEncoding(); /* Encode via the codec registry */ - v = PyCodec_Encode(unicode, encoding, errors); + v = _PyCodec_EncodeText(unicode, encoding, errors); if (v == NULL) goto onError; return v; @@ -1369,7 +1369,7 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode, } /* Encode via the codec registry */ - v = PyCodec_Encode(unicode, encoding, errors); + v = _PyCodec_EncodeText(unicode, encoding, errors); if (v == NULL) goto onError; if (!PyString_Check(v)) { diff --git a/Python/codecs.c b/Python/codecs.c index 184d147..d672362 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -217,20 +217,15 @@ PyObject *codec_getitem(const char *encoding, int index) return v; } -/* Helper function to create an incremental codec. */ - +/* Helper functions to create an incremental codec. */ static -PyObject *codec_getincrementalcodec(const char *encoding, - const char *errors, - const char *attrname) +PyObject *codec_makeincrementalcodec(PyObject *codec_info, + const char *errors, + const char *attrname) { - PyObject *codecs, *ret, *inccodec; + PyObject *ret, *inccodec; - codecs = _PyCodec_Lookup(encoding); - if (codecs == NULL) - return NULL; - inccodec = PyObject_GetAttrString(codecs, attrname); - Py_DECREF(codecs); + inccodec = PyObject_GetAttrString(codec_info, attrname); if (inccodec == NULL) return NULL; if (errors) @@ -241,6 +236,21 @@ PyObject *codec_getincrementalcodec(const char *encoding, return ret; } +static +PyObject *codec_getincrementalcodec(const char *encoding, + const char *errors, + const char *attrname) +{ + PyObject *codec_info, *ret; + + codec_info = _PyCodec_Lookup(encoding); + if (codec_info == NULL) + return NULL; + ret = codec_makeincrementalcodec(codec_info, errors, attrname); + Py_DECREF(codec_info); + return ret; +} + /* Helper function to create a stream codec. 
*/ static @@ -264,6 +274,24 @@ PyObject *codec_getstreamcodec(const char *encoding, return streamcodec; } +/* Helpers to work with the result of _PyCodec_Lookup + + */ +PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info, + const char *errors) +{ + return codec_makeincrementalcodec(codec_info, errors, + "incrementaldecoder"); +} + +PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info, + const char *errors) +{ + return codec_makeincrementalcodec(codec_info, errors, + "incrementalencoder"); +} + + /* Convenience APIs to query the Codec registry. All APIs return a codec object with incremented refcount. @@ -311,18 +339,15 @@ PyObject *PyCodec_StreamWriter(const char *encoding, errors is passed to the encoder factory as argument if non-NULL. */ -PyObject *PyCodec_Encode(PyObject *object, - const char *encoding, - const char *errors) +static PyObject * +_PyCodec_EncodeInternal(PyObject *object, + PyObject *encoder, + const char *encoding, + const char *errors) { - PyObject *encoder = NULL; PyObject *args = NULL, *result = NULL; PyObject *v; - encoder = PyCodec_Encoder(encoding); - if (encoder == NULL) - goto onError; - args = args_tuple(object, errors); if (args == NULL) goto onError; @@ -358,18 +383,15 @@ PyObject *PyCodec_Encode(PyObject *object, errors is passed to the decoder factory as argument if non-NULL. 
*/ -PyObject *PyCodec_Decode(PyObject *object, - const char *encoding, - const char *errors) +static PyObject * +_PyCodec_DecodeInternal(PyObject *object, + PyObject *decoder, + const char *encoding, + const char *errors) { - PyObject *decoder = NULL; PyObject *args = NULL, *result = NULL; PyObject *v; - decoder = PyCodec_Decoder(encoding); - if (decoder == NULL) - goto onError; - args = args_tuple(object, errors); if (args == NULL) goto onError; @@ -399,6 +421,139 @@ PyObject *PyCodec_Decode(PyObject *object, return NULL; } +/* Generic encoding/decoding API */ +PyObject *PyCodec_Encode(PyObject *object, + const char *encoding, + const char *errors) +{ + PyObject *encoder; + + encoder = PyCodec_Encoder(encoding); + if (encoder == NULL) + return NULL; + + return _PyCodec_EncodeInternal(object, encoder, encoding, errors); +} + +PyObject *PyCodec_Decode(PyObject *object, + const char *encoding, + const char *errors) +{ + PyObject *decoder; + + decoder = PyCodec_Decoder(encoding); + if (decoder == NULL) + return NULL; + + return _PyCodec_DecodeInternal(object, decoder, encoding, errors); +} + +/* Text encoding/decoding API */ +PyObject * _PyCodec_LookupTextEncoding(const char *encoding, + const char *alternate_command) +{ + PyObject *codec; + PyObject *attr; + int is_text_codec; + + codec = _PyCodec_Lookup(encoding); + if (codec == NULL) + return NULL; + + /* Backwards compatibility: assume any raw tuple describes a text + * encoding, and the same for anything lacking the private + * attribute. 
+ */ + if (Py_Py3kWarningFlag && !PyTuple_CheckExact(codec)) { + attr = PyObject_GetAttrString(codec, "_is_text_encoding"); + if (attr == NULL) { + if (!PyErr_ExceptionMatches(PyExc_AttributeError)) + goto onError; + PyErr_Clear(); + } else { + is_text_codec = PyObject_IsTrue(attr); + Py_DECREF(attr); + if (is_text_codec < 0) + goto onError; + if (!is_text_codec) { + PyObject *msg = PyString_FromFormat( + "'%.400s' is not a text encoding; " + "use %s to handle arbitrary codecs", + encoding, alternate_command); + if (msg == NULL) + goto onError; + if (PyErr_WarnPy3k(PyString_AS_STRING(msg), 1) < 0) { + Py_DECREF(msg); + goto onError; + } + Py_DECREF(msg); + } + } + } + + /* This appears to be a valid text encoding */ + return codec; + + onError: + Py_DECREF(codec); + return NULL; +} + + +static +PyObject *codec_getitem_checked(const char *encoding, + const char *alternate_command, + int index) +{ + PyObject *codec; + PyObject *v; + + codec = _PyCodec_LookupTextEncoding(encoding, alternate_command); + if (codec == NULL) + return NULL; + + v = PyTuple_GET_ITEM(codec, index); + Py_INCREF(v); + Py_DECREF(codec); + return v; +} + +static PyObject * _PyCodec_TextEncoder(const char *encoding) +{ + return codec_getitem_checked(encoding, "codecs.encode()", 0); +} + +static PyObject * _PyCodec_TextDecoder(const char *encoding) +{ + return codec_getitem_checked(encoding, "codecs.decode()", 1); +} + +PyObject *_PyCodec_EncodeText(PyObject *object, + const char *encoding, + const char *errors) +{ + PyObject *encoder; + + encoder = _PyCodec_TextEncoder(encoding); + if (encoder == NULL) + return NULL; + + return _PyCodec_EncodeInternal(object, encoder, encoding, errors); +} + +PyObject *_PyCodec_DecodeText(PyObject *object, + const char *encoding, + const char *errors) +{ + PyObject *decoder; + + decoder = _PyCodec_TextDecoder(encoding); + if (decoder == NULL) + return NULL; + + return _PyCodec_DecodeInternal(object, decoder, encoding, errors); +} + /* Register the error handling 
callback function error under the name name. This function will be called by the codec when it encounters an unencodable characters/undecodable bytes and doesn't know the |