summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSerhiy Storchaka <storchaka@gmail.com>2015-05-31 17:21:00 (GMT)
committerSerhiy Storchaka <storchaka@gmail.com>2015-05-31 17:21:00 (GMT)
commitc7797dc7482035ee166ca2e941b623382b92e1fc (patch)
tree526e26fa4dac506f02859fdbe946d33ed4165f5e
parentcfb7028df4bdf12325786e48ebef3b4982efa119 (diff)
downloadcpython-c7797dc7482035ee166ca2e941b623382b92e1fc.zip
cpython-c7797dc7482035ee166ca2e941b623382b92e1fc.tar.gz
cpython-c7797dc7482035ee166ca2e941b623382b92e1fc.tar.bz2
Issue #19543: Emit deprecation warning for known non-text encodings.
Backported issue #19619: encode() and decode() methods and constructors of str, unicode and bytearray classes now emit a deprecation warning for known non-text encodings when Python is run with the -3 option. Backported issue #20404: io.TextIOWrapper (and hence io.open()) now uses the internal codec marking system to emit a deprecation warning for known non-text encodings at stream construction time when Python is run with the -3 option.
-rw-r--r--Include/codecs.h45
-rw-r--r--Lib/_pyio.py6
-rw-r--r--Lib/codecs.py14
-rw-r--r--Lib/encodings/base64_codec.py1
-rw-r--r--Lib/encodings/bz2_codec.py1
-rw-r--r--Lib/encodings/hex_codec.py1
-rw-r--r--Lib/encodings/quopri_codec.py1
-rwxr-xr-xLib/encodings/rot_13.py1
-rw-r--r--Lib/encodings/uu_codec.py1
-rw-r--r--Lib/encodings/zlib_codec.py1
-rw-r--r--Lib/json/decoder.py6
-rw-r--r--Lib/test/string_tests.py12
-rw-r--r--Lib/test/test_calendar.py4
-rw-r--r--Lib/test/test_codecs.py48
-rw-r--r--Lib/test/test_fileinput.py7
-rw-r--r--Lib/test/test_io.py43
-rw-r--r--Misc/NEWS8
-rw-r--r--Modules/_io/textio.c34
-rw-r--r--Objects/bytearrayobject.c6
-rw-r--r--Objects/stringobject.c4
-rw-r--r--Objects/unicodeobject.c8
-rw-r--r--Python/codecs.c209
22 files changed, 391 insertions, 70 deletions
diff --git a/Include/codecs.h b/Include/codecs.h
index c038c6a..8a9041b 100644
--- a/Include/codecs.h
+++ b/Include/codecs.h
@@ -81,6 +81,51 @@ PyAPI_FUNC(PyObject *) PyCodec_Decode(
const char *errors
);
+/* Text codec specific encoding and decoding API.
+
+ Checks the encoding against a list of codecs which do not
+ implement a unicode<->bytes encoding before attempting the
+ operation.
+
+ Please note that these APIs are internal and should not
+ be used in Python C extensions.
+
+ XXX (ncoghlan): should we make these, or something like them, public
+ in Python 3.5+?
+
+ */
+PyAPI_FUNC(PyObject *) _PyCodec_LookupTextEncoding(
+ const char *encoding,
+ const char *alternate_command
+ );
+
+PyAPI_FUNC(PyObject *) _PyCodec_EncodeText(
+ PyObject *object,
+ const char *encoding,
+ const char *errors
+ );
+
+PyAPI_FUNC(PyObject *) _PyCodec_DecodeText(
+ PyObject *object,
+ const char *encoding,
+ const char *errors
+ );
+
+/* These two aren't actually text encoding specific, but _io.TextIOWrapper
+ * is the only current API consumer.
+ */
+PyAPI_FUNC(PyObject *) _PyCodecInfo_GetIncrementalDecoder(
+ PyObject *codec_info,
+ const char *errors
+ );
+
+PyAPI_FUNC(PyObject *) _PyCodecInfo_GetIncrementalEncoder(
+ PyObject *codec_info,
+ const char *errors
+ );
+
+
+
/* --- Codec Lookup APIs --------------------------------------------------
All APIs return a codec object with incremented refcount and are
diff --git a/Lib/_pyio.py b/Lib/_pyio.py
index a7f4301..694b778 100644
--- a/Lib/_pyio.py
+++ b/Lib/_pyio.py
@@ -7,6 +7,7 @@ from __future__ import (print_function, unicode_literals)
import os
import abc
import codecs
+import sys
import warnings
import errno
# Import thread instead of threading to reduce startup cost
@@ -1497,6 +1498,11 @@ class TextIOWrapper(TextIOBase):
if not isinstance(encoding, basestring):
raise ValueError("invalid encoding: %r" % encoding)
+ if sys.py3kwarning and not codecs.lookup(encoding)._is_text_encoding:
+ msg = ("%r is not a text encoding; "
+ "use codecs.open() to handle arbitrary codecs")
+ warnings.warnpy3k(msg % encoding, stacklevel=2)
+
if errors is None:
errors = "strict"
else:
diff --git a/Lib/codecs.py b/Lib/codecs.py
index 049a3f0..12213e2 100644
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -79,9 +79,19 @@ BOM64_BE = BOM_UTF32_BE
### Codec base classes (defining the API)
class CodecInfo(tuple):
+ """Codec details when looking up the codec registry"""
+
+ # Private API to allow Python to blacklist the known non-Unicode
+ # codecs in the standard library. A more general mechanism to
+ # reliably distinguish text encodings from other codecs will hopefully
+ # be defined for Python 3.5
+ #
+ # See http://bugs.python.org/issue19619
+ _is_text_encoding = True # Assume codecs are text encodings by default
def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
- incrementalencoder=None, incrementaldecoder=None, name=None):
+ incrementalencoder=None, incrementaldecoder=None, name=None,
+ _is_text_encoding=None):
self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
self.name = name
self.encode = encode
@@ -90,6 +100,8 @@ class CodecInfo(tuple):
self.incrementaldecoder = incrementaldecoder
self.streamwriter = streamwriter
self.streamreader = streamreader
+ if _is_text_encoding is not None:
+ self._is_text_encoding = _is_text_encoding
return self
def __repr__(self):
diff --git a/Lib/encodings/base64_codec.py b/Lib/encodings/base64_codec.py
index f84e780..34ac555 100644
--- a/Lib/encodings/base64_codec.py
+++ b/Lib/encodings/base64_codec.py
@@ -76,4 +76,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
+ _is_text_encoding=False,
)
diff --git a/Lib/encodings/bz2_codec.py b/Lib/encodings/bz2_codec.py
index 054b36b..136503a 100644
--- a/Lib/encodings/bz2_codec.py
+++ b/Lib/encodings/bz2_codec.py
@@ -99,4 +99,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
+ _is_text_encoding=False,
)
diff --git a/Lib/encodings/hex_codec.py b/Lib/encodings/hex_codec.py
index 91b38d9..154488c 100644
--- a/Lib/encodings/hex_codec.py
+++ b/Lib/encodings/hex_codec.py
@@ -76,4 +76,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
+ _is_text_encoding=False,
)
diff --git a/Lib/encodings/quopri_codec.py b/Lib/encodings/quopri_codec.py
index d8683fd..f259149 100644
--- a/Lib/encodings/quopri_codec.py
+++ b/Lib/encodings/quopri_codec.py
@@ -72,4 +72,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
+ _is_text_encoding=False,
)
diff --git a/Lib/encodings/rot_13.py b/Lib/encodings/rot_13.py
index 52b6431..4eaf433 100755
--- a/Lib/encodings/rot_13.py
+++ b/Lib/encodings/rot_13.py
@@ -44,6 +44,7 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamwriter=StreamWriter,
streamreader=StreamReader,
+ _is_text_encoding=False,
)
### Decoding Map
diff --git a/Lib/encodings/uu_codec.py b/Lib/encodings/uu_codec.py
index 4b137a5..5cb0d2b 100644
--- a/Lib/encodings/uu_codec.py
+++ b/Lib/encodings/uu_codec.py
@@ -126,4 +126,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _is_text_encoding=False,
)
diff --git a/Lib/encodings/zlib_codec.py b/Lib/encodings/zlib_codec.py
index 3419f9f..0c2599d 100644
--- a/Lib/encodings/zlib_codec.py
+++ b/Lib/encodings/zlib_codec.py
@@ -99,4 +99,5 @@ def getregentry():
incrementaldecoder=IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
+ _is_text_encoding=False,
)
diff --git a/Lib/json/decoder.py b/Lib/json/decoder.py
index 1b43238..5141f87 100644
--- a/Lib/json/decoder.py
+++ b/Lib/json/decoder.py
@@ -15,10 +15,8 @@ __all__ = ['JSONDecoder']
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
def _floatconstants():
- _BYTES = '7FF80000000000007FF0000000000000'.decode('hex')
- if sys.byteorder != 'big':
- _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1]
- nan, inf = struct.unpack('dd', _BYTES)
+ nan, = struct.unpack('>d', b'\x7f\xf8\x00\x00\x00\x00\x00\x00')
+ inf, = struct.unpack('>d', b'\x7f\xf0\x00\x00\x00\x00\x00\x00')
return nan, inf, -inf
NaN, PosInf, NegInf = _floatconstants()
diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py
index 6d87eb6..b2f837b 100644
--- a/Lib/test/string_tests.py
+++ b/Lib/test/string_tests.py
@@ -1295,8 +1295,10 @@ class MixinStrUserStringTest:
('hex', '68656c6c6f20776f726c64'),
('uu', 'begin 666 <data>\n+:&5L;&\\@=V]R;&0 \n \nend\n')]
for encoding, data in codecs:
- self.checkequal(data, 'hello world', 'encode', encoding)
- self.checkequal('hello world', data, 'decode', encoding)
+ with test_support.check_py3k_warnings():
+ self.checkequal(data, 'hello world', 'encode', encoding)
+ with test_support.check_py3k_warnings():
+ self.checkequal('hello world', data, 'decode', encoding)
# zlib is optional, so we make the test optional too...
try:
import zlib
@@ -1304,8 +1306,10 @@ class MixinStrUserStringTest:
pass
else:
data = 'x\x9c\xcbH\xcd\xc9\xc9W(\xcf/\xcaI\x01\x00\x1a\x0b\x04]'
- self.checkequal(data, 'hello world', 'encode', 'zlib')
- self.checkequal('hello world', data, 'decode', 'zlib')
+ with test_support.check_py3k_warnings():
+ self.checkequal(data, 'hello world', 'encode', 'zlib')
+ with test_support.check_py3k_warnings():
+ self.checkequal('hello world', data, 'decode', 'zlib')
self.checkraises(TypeError, 'xyz', 'decode', 42)
self.checkraises(TypeError, 'xyz', 'encode', 42)
diff --git a/Lib/test/test_calendar.py b/Lib/test/test_calendar.py
index 5692642..46c4a6f 100644
--- a/Lib/test/test_calendar.py
+++ b/Lib/test/test_calendar.py
@@ -513,8 +513,8 @@ class CommandLineTestCase(unittest.TestCase):
def test_option_encoding(self):
self.assertFailure('-e')
self.assertFailure('--encoding')
- stdout = self.run_ok('--encoding', 'rot-13', '2004')
- self.assertEqual(stdout.strip(), conv(result_2004_text.encode('rot-13')).strip())
+ stdout = self.run_ok('--encoding', 'utf-16-le', '2004')
+ self.assertEqual(stdout.strip(), conv(result_2004_text.encode('utf-16-le')).strip())
def test_option_locale(self):
self.assertFailure('-L')
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index de80b07..c7072a6 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -1395,14 +1395,14 @@ class EncodedFileTest(unittest.TestCase):
class Str2StrTest(unittest.TestCase):
def test_read(self):
- sin = "\x80".encode("base64_codec")
+ sin = codecs.encode("\x80", "base64_codec")
reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
sout = reader.read()
self.assertEqual(sout, "\x80")
self.assertIsInstance(sout, str)
def test_readline(self):
- sin = "\x80".encode("base64_codec")
+ sin = codecs.encode("\x80", "base64_codec")
reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
sout = reader.readline()
self.assertEqual(sout, "\x80")
@@ -1536,6 +1536,9 @@ broken_unicode_with_streams = [
]
broken_incremental_coders = broken_unicode_with_streams[:]
+if sys.flags.py3k_warning:
+ broken_unicode_with_streams.append("rot_13")
+
# The following encodings only support "strict" mode
only_strict_mode = [
"idna",
@@ -2135,6 +2138,47 @@ def test_main():
# Missing "begin" line
self.assertRaises(ValueError, codecs.decode, "", "uu-codec")
+ def test_text_to_binary_blacklists_binary_transforms(self):
+ # Check binary -> binary codecs give a good error for str input
+ bad_input = "bad input type"
+ for encoding in bytes_transform_encodings:
+ fmt = (r"{!r} is not a text encoding; "
+ r"use codecs.encode\(\) to handle arbitrary codecs")
+ msg = fmt.format(encoding)
+ with self.assertRaisesRegex(LookupError, msg) as failure:
+ bad_input.encode(encoding)
+ self.assertIsNone(failure.exception.__cause__)
+
+ def test_text_to_binary_blacklists_text_transforms(self):
+ # Check str.encode gives a good error message for str -> str codecs
+ msg = (r"^'rot_13' is not a text encoding; "
+ r"use codecs.encode\(\) to handle arbitrary codecs")
+ with self.assertRaisesRegex(LookupError, msg):
+ "just an example message".encode("rot_13")
+
+ def test_binary_to_text_blacklists_binary_transforms(self):
+ # Check bytes.decode and bytearray.decode give a good error
+ # message for binary -> binary codecs
+ data = b"encode first to ensure we meet any format restrictions"
+ for encoding in bytes_transform_encodings:
+ encoded_data = codecs.encode(data, encoding)
+ fmt = (r"{!r} is not a text encoding; "
+ r"use codecs.decode\(\) to handle arbitrary codecs")
+ msg = fmt.format(encoding)
+ with self.assertRaisesRegex(LookupError, msg):
+ encoded_data.decode(encoding)
+ with self.assertRaisesRegex(LookupError, msg):
+ bytearray(encoded_data).decode(encoding)
+
+ def test_binary_to_text_blacklists_text_transforms(self):
+ # Check str -> str codec gives a good error for binary input
+ for bad_input in (b"immutable", bytearray(b"mutable")):
+ msg = (r"^'rot_13' is not a text encoding; "
+ r"use codecs.decode\(\) to handle arbitrary codecs")
+ with self.assertRaisesRegex(LookupError, msg) as failure:
+ bad_input.decode("rot_13")
+ self.assertIsNone(failure.exception.__cause__)
+
if __name__ == "__main__":
test_main()
diff --git a/Lib/test/test_fileinput.py b/Lib/test/test_fileinput.py
index c15ad84..facc56e 100644
--- a/Lib/test/test_fileinput.py
+++ b/Lib/test/test_fileinput.py
@@ -211,10 +211,11 @@ class FileInputTests(unittest.TestCase):
except ValueError:
pass
try:
- t1 = writeTmp(1, ["A\nB"], mode="wb")
- fi = FileInput(files=t1, openhook=hook_encoded("rot13"))
+ # UTF-7 is a convenient, seldom used encoding
+ t1 = writeTmp(1, ['+AEE-\n+AEI-'], mode="wb")
+ fi = FileInput(files=t1, openhook=hook_encoded("utf-7"))
lines = list(fi)
- self.assertEqual(lines, ["N\n", "O"])
+ self.assertEqual(lines, [u'A\n', u'B'])
finally:
remove_tempfiles(t1)
diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py
index bbc804b..1a17d81 100644
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@@ -2001,6 +2001,15 @@ class TextIOWrapperTest(unittest.TestCase):
t.__init__(self.MockRawIO())
self.assertEqual(t.read(0), u'')
+ def test_non_text_encoding_codecs_are_rejected(self):
+ # Ensure the constructor complains if passed a codec that isn't
+ # marked as a text encoding
+ # http://bugs.python.org/issue20404
+ r = self.BytesIO()
+ b = self.BufferedWriter(r)
+ with support.check_py3k_warnings():
+ self.TextIOWrapper(b, encoding="hex_codec")
+
def test_detach(self):
r = self.BytesIO()
b = self.BufferedWriter(r)
@@ -2617,19 +2626,39 @@ class TextIOWrapperTest(unittest.TestCase):
def test_illegal_decoder(self):
# Issue #17106
+ # Bypass the early encoding check added in issue 20404
+ def _make_illegal_wrapper():
+ quopri = codecs.lookup("quopri_codec")
+ quopri._is_text_encoding = True
+ try:
+ t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'),
+ newline='\n', encoding="quopri_codec")
+ finally:
+ quopri._is_text_encoding = False
+ return t
# Crash when decoder returns non-string
- t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
- encoding='quopri_codec')
+ with support.check_py3k_warnings():
+ t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
+ encoding='quopri_codec')
with self.maybeRaises(TypeError):
t.read(1)
- t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
- encoding='quopri_codec')
+ with support.check_py3k_warnings():
+ t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
+ encoding='quopri_codec')
with self.maybeRaises(TypeError):
t.readline()
- t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
- encoding='quopri_codec')
+ with support.check_py3k_warnings():
+ t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n',
+ encoding='quopri_codec')
with self.maybeRaises(TypeError):
t.read()
+ #else:
+ #t = _make_illegal_wrapper()
+ #self.assertRaises(TypeError, t.read, 1)
+ #t = _make_illegal_wrapper()
+ #self.assertRaises(TypeError, t.readline)
+ #t = _make_illegal_wrapper()
+ #self.assertRaises(TypeError, t.read)
class CTextIOWrapperTest(TextIOWrapperTest):
@@ -3002,9 +3031,11 @@ class MiscIOTest(unittest.TestCase):
class CMiscIOTest(MiscIOTest):
io = io
+ shutdown_error = "RuntimeError: could not find io module state"
class PyMiscIOTest(MiscIOTest):
io = pyio
+ shutdown_error = "LookupError: unknown encoding: ascii"
@unittest.skipIf(os.name == 'nt', 'POSIX signals required for this test.')
diff --git a/Misc/NEWS b/Misc/NEWS
index 4de6d0d..05cb0cc 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,10 @@ What's New in Python 2.7.11?
Core and Builtins
-----------------
+- Issue #19543: encode() and decode() methods and constructors of str,
+ unicode and bytearray classes now emit a deprecation warning for known
+ non-text encodings when Python is run with the -3 option.
+
- Issue #24115: Update uses of PyObject_IsTrue(), PyObject_Not(),
PyObject_IsInstance(), PyObject_RichCompareBool() and _PyDict_Contains()
to check for and handle errors correctly.
@@ -26,6 +30,10 @@ Core and Builtins
Library
-------
+- Issue #19543: io.TextIOWrapper (and hence io.open()) now uses the internal
+ codec marking system to emit a deprecation warning for known non-text
+ encodings at stream construction time when Python is run with the -3 option.
+
- Issue #24264: Fixed buffer overflow in the imageop module.
- Issue #5633: Fixed timeit when the statement is a string and the setup is not.
diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c
index 8ac8a4a..9981d4c 100644
--- a/Modules/_io/textio.c
+++ b/Modules/_io/textio.c
@@ -826,7 +826,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
char *kwlist[] = {"buffer", "encoding", "errors",
"newline", "line_buffering",
NULL};
- PyObject *buffer, *raw;
+ PyObject *buffer, *raw, *codec_info = NULL;
char *encoding = NULL;
char *errors = NULL;
char *newline = NULL;
@@ -909,6 +909,17 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
"could not determine default encoding");
}
+ /* Check we have been asked for a real text encoding */
+ codec_info = _PyCodec_LookupTextEncoding(encoding, "codecs.open()");
+ if (codec_info == NULL) {
+ Py_CLEAR(self->encoding);
+ goto error;
+ }
+
+ /* XXX: Failures beyond this point have the potential to leak elements
+ * of the partially constructed object (like self->encoding)
+ */
+
if (errors == NULL)
errors = "strict";
self->errors = PyBytes_FromString(errors);
@@ -922,7 +933,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
if (newline) {
self->readnl = PyString_FromString(newline);
if (self->readnl == NULL)
- return -1;
+ goto error;
}
self->writetranslate = (newline == NULL || newline[0] != '\0');
if (!self->readuniversal && self->writetranslate) {
@@ -944,8 +955,8 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
if (r == -1)
goto error;
if (r == 1) {
- self->decoder = PyCodec_IncrementalDecoder(
- encoding, errors);
+ self->decoder = _PyCodecInfo_GetIncrementalDecoder(codec_info,
+ errors);
if (self->decoder == NULL)
goto error;
@@ -969,17 +980,12 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
if (r == -1)
goto error;
if (r == 1) {
- PyObject *ci;
- self->encoder = PyCodec_IncrementalEncoder(
- encoding, errors);
+ self->encoder = _PyCodecInfo_GetIncrementalEncoder(codec_info,
+ errors);
if (self->encoder == NULL)
goto error;
/* Get the normalized named of the codec */
- ci = _PyCodec_Lookup(encoding);
- if (ci == NULL)
- goto error;
- res = PyObject_GetAttrString(ci, "name");
- Py_DECREF(ci);
+ res = PyObject_GetAttrString(codec_info, "name");
if (res == NULL) {
if (PyErr_ExceptionMatches(PyExc_AttributeError))
PyErr_Clear();
@@ -999,6 +1005,9 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
Py_XDECREF(res);
}
+ /* Finished sorting out the codec details */
+ Py_DECREF(codec_info);
+
self->buffer = buffer;
Py_INCREF(buffer);
@@ -1059,6 +1068,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds)
return 0;
error:
+ Py_XDECREF(codec_info);
return -1;
}
diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c
index fd201ca..5f57580 100644
--- a/Objects/bytearrayobject.c
+++ b/Objects/bytearrayobject.c
@@ -783,7 +783,7 @@ bytearray_init(PyByteArrayObject *self, PyObject *args, PyObject *kwds)
if (PyBytes_Check(arg)) {
PyObject *new, *encoded;
if (encoding != NULL) {
- encoded = PyCodec_Encode(arg, encoding, errors);
+ encoded = _PyCodec_EncodeText(arg, encoding, errors);
if (encoded == NULL)
return -1;
assert(PyBytes_Check(encoded));
@@ -809,7 +809,7 @@ bytearray_init(PyByteArrayObject *self, PyObject *args, PyObject *kwds)
"unicode argument without an encoding");
return -1;
}
- encoded = PyCodec_Encode(arg, encoding, errors);
+ encoded = _PyCodec_EncodeText(arg, encoding, errors);
if (encoded == NULL)
return -1;
assert(PyBytes_Check(encoded));
@@ -2567,7 +2567,7 @@ bytearray_decode(PyObject *self, PyObject *args, PyObject *kwargs)
return NULL;
#endif
}
- return PyCodec_Decode(self, encoding, errors);
+ return _PyCodec_DecodeText(self, encoding, errors);
}
PyDoc_STRVAR(alloc_doc,
diff --git a/Objects/stringobject.c b/Objects/stringobject.c
index 46f46db..c1e12a7 100644
--- a/Objects/stringobject.c
+++ b/Objects/stringobject.c
@@ -449,7 +449,7 @@ PyObject *PyString_AsDecodedObject(PyObject *str,
}
/* Decode via the codec registry */
- v = PyCodec_Decode(str, encoding, errors);
+ v = _PyCodec_DecodeText(str, encoding, errors);
if (v == NULL)
goto onError;
@@ -529,7 +529,7 @@ PyObject *PyString_AsEncodedObject(PyObject *str,
}
/* Encode via the codec registry */
- v = PyCodec_Encode(str, encoding, errors);
+ v = _PyCodec_EncodeText(str, encoding, errors);
if (v == NULL)
goto onError;
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 91e7524..08723ac 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1259,7 +1259,7 @@ PyObject *PyUnicode_Decode(const char *s,
buffer = PyBuffer_FromMemory((void *)s, size);
if (buffer == NULL)
goto onError;
- unicode = PyCodec_Decode(buffer, encoding, errors);
+ unicode = _PyCodec_DecodeText(buffer, encoding, errors);
if (unicode == NULL)
goto onError;
if (!PyUnicode_Check(unicode)) {
@@ -1292,7 +1292,7 @@ PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
encoding = PyUnicode_GetDefaultEncoding();
/* Decode via the codec registry */
- v = PyCodec_Decode(unicode, encoding, errors);
+ v = _PyCodec_DecodeText(unicode, encoding, errors);
if (v == NULL)
goto onError;
return v;
@@ -1331,7 +1331,7 @@ PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
encoding = PyUnicode_GetDefaultEncoding();
/* Encode via the codec registry */
- v = PyCodec_Encode(unicode, encoding, errors);
+ v = _PyCodec_EncodeText(unicode, encoding, errors);
if (v == NULL)
goto onError;
return v;
@@ -1369,7 +1369,7 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
}
/* Encode via the codec registry */
- v = PyCodec_Encode(unicode, encoding, errors);
+ v = _PyCodec_EncodeText(unicode, encoding, errors);
if (v == NULL)
goto onError;
if (!PyString_Check(v)) {
diff --git a/Python/codecs.c b/Python/codecs.c
index 184d147..d672362 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -217,20 +217,15 @@ PyObject *codec_getitem(const char *encoding, int index)
return v;
}
-/* Helper function to create an incremental codec. */
-
+/* Helper functions to create an incremental codec. */
static
-PyObject *codec_getincrementalcodec(const char *encoding,
- const char *errors,
- const char *attrname)
+PyObject *codec_makeincrementalcodec(PyObject *codec_info,
+ const char *errors,
+ const char *attrname)
{
- PyObject *codecs, *ret, *inccodec;
+ PyObject *ret, *inccodec;
- codecs = _PyCodec_Lookup(encoding);
- if (codecs == NULL)
- return NULL;
- inccodec = PyObject_GetAttrString(codecs, attrname);
- Py_DECREF(codecs);
+ inccodec = PyObject_GetAttrString(codec_info, attrname);
if (inccodec == NULL)
return NULL;
if (errors)
@@ -241,6 +236,21 @@ PyObject *codec_getincrementalcodec(const char *encoding,
return ret;
}
+static
+PyObject *codec_getincrementalcodec(const char *encoding,
+ const char *errors,
+ const char *attrname)
+{
+ PyObject *codec_info, *ret;
+
+ codec_info = _PyCodec_Lookup(encoding);
+ if (codec_info == NULL)
+ return NULL;
+ ret = codec_makeincrementalcodec(codec_info, errors, attrname);
+ Py_DECREF(codec_info);
+ return ret;
+}
+
/* Helper function to create a stream codec. */
static
@@ -264,6 +274,24 @@ PyObject *codec_getstreamcodec(const char *encoding,
return streamcodec;
}
+/* Helpers to work with the result of _PyCodec_Lookup
+
+ */
+PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
+ const char *errors)
+{
+ return codec_makeincrementalcodec(codec_info, errors,
+ "incrementaldecoder");
+}
+
+PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
+ const char *errors)
+{
+ return codec_makeincrementalcodec(codec_info, errors,
+ "incrementalencoder");
+}
+
+
/* Convenience APIs to query the Codec registry.
All APIs return a codec object with incremented refcount.
@@ -311,18 +339,15 @@ PyObject *PyCodec_StreamWriter(const char *encoding,
errors is passed to the encoder factory as argument if non-NULL. */
-PyObject *PyCodec_Encode(PyObject *object,
- const char *encoding,
- const char *errors)
+static PyObject *
+_PyCodec_EncodeInternal(PyObject *object,
+ PyObject *encoder,
+ const char *encoding,
+ const char *errors)
{
- PyObject *encoder = NULL;
PyObject *args = NULL, *result = NULL;
PyObject *v;
- encoder = PyCodec_Encoder(encoding);
- if (encoder == NULL)
- goto onError;
-
args = args_tuple(object, errors);
if (args == NULL)
goto onError;
@@ -358,18 +383,15 @@ PyObject *PyCodec_Encode(PyObject *object,
errors is passed to the decoder factory as argument if non-NULL. */
-PyObject *PyCodec_Decode(PyObject *object,
- const char *encoding,
- const char *errors)
+static PyObject *
+_PyCodec_DecodeInternal(PyObject *object,
+ PyObject *decoder,
+ const char *encoding,
+ const char *errors)
{
- PyObject *decoder = NULL;
PyObject *args = NULL, *result = NULL;
PyObject *v;
- decoder = PyCodec_Decoder(encoding);
- if (decoder == NULL)
- goto onError;
-
args = args_tuple(object, errors);
if (args == NULL)
goto onError;
@@ -399,6 +421,139 @@ PyObject *PyCodec_Decode(PyObject *object,
return NULL;
}
+/* Generic encoding/decoding API */
+PyObject *PyCodec_Encode(PyObject *object,
+ const char *encoding,
+ const char *errors)
+{
+ PyObject *encoder;
+
+ encoder = PyCodec_Encoder(encoding);
+ if (encoder == NULL)
+ return NULL;
+
+ return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
+}
+
+PyObject *PyCodec_Decode(PyObject *object,
+ const char *encoding,
+ const char *errors)
+{
+ PyObject *decoder;
+
+ decoder = PyCodec_Decoder(encoding);
+ if (decoder == NULL)
+ return NULL;
+
+ return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
+}
+
+/* Text encoding/decoding API */
+PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
+ const char *alternate_command)
+{
+ PyObject *codec;
+ PyObject *attr;
+ int is_text_codec;
+
+ codec = _PyCodec_Lookup(encoding);
+ if (codec == NULL)
+ return NULL;
+
+ /* Backwards compatibility: assume any raw tuple describes a text
+ * encoding, and the same for anything lacking the private
+ * attribute.
+ */
+ if (Py_Py3kWarningFlag && !PyTuple_CheckExact(codec)) {
+ attr = PyObject_GetAttrString(codec, "_is_text_encoding");
+ if (attr == NULL) {
+ if (!PyErr_ExceptionMatches(PyExc_AttributeError))
+ goto onError;
+ PyErr_Clear();
+ } else {
+ is_text_codec = PyObject_IsTrue(attr);
+ Py_DECREF(attr);
+ if (is_text_codec < 0)
+ goto onError;
+ if (!is_text_codec) {
+ PyObject *msg = PyString_FromFormat(
+ "'%.400s' is not a text encoding; "
+ "use %s to handle arbitrary codecs",
+ encoding, alternate_command);
+ if (msg == NULL)
+ goto onError;
+ if (PyErr_WarnPy3k(PyString_AS_STRING(msg), 1) < 0) {
+ Py_DECREF(msg);
+ goto onError;
+ }
+ Py_DECREF(msg);
+ }
+ }
+ }
+
+ /* This appears to be a valid text encoding */
+ return codec;
+
+ onError:
+ Py_DECREF(codec);
+ return NULL;
+}
+
+
+static
+PyObject *codec_getitem_checked(const char *encoding,
+ const char *alternate_command,
+ int index)
+{
+ PyObject *codec;
+ PyObject *v;
+
+ codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
+ if (codec == NULL)
+ return NULL;
+
+ v = PyTuple_GET_ITEM(codec, index);
+ Py_INCREF(v);
+ Py_DECREF(codec);
+ return v;
+}
+
+static PyObject * _PyCodec_TextEncoder(const char *encoding)
+{
+ return codec_getitem_checked(encoding, "codecs.encode()", 0);
+}
+
+static PyObject * _PyCodec_TextDecoder(const char *encoding)
+{
+ return codec_getitem_checked(encoding, "codecs.decode()", 1);
+}
+
+PyObject *_PyCodec_EncodeText(PyObject *object,
+ const char *encoding,
+ const char *errors)
+{
+ PyObject *encoder;
+
+ encoder = _PyCodec_TextEncoder(encoding);
+ if (encoder == NULL)
+ return NULL;
+
+ return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
+}
+
+PyObject *_PyCodec_DecodeText(PyObject *object,
+ const char *encoding,
+ const char *errors)
+{
+ PyObject *decoder;
+
+ decoder = _PyCodec_TextDecoder(encoding);
+ if (decoder == NULL)
+ return NULL;
+
+ return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
+}
+
/* Register the error handling callback function error under the name
name. This function will be called by the codec when it encounters
an unencodable characters/undecodable bytes and doesn't know the