summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Doc/whatsnew/3.6.rst3
-rw-r--r--Lib/test/test_codecs.py60
-rw-r--r--Misc/NEWS4
-rw-r--r--Objects/unicodeobject.c16
4 files changed, 83 insertions, 0 deletions
diff --git a/Doc/whatsnew/3.6.rst b/Doc/whatsnew/3.6.rst
index 0ea3f3b..16cdca0 100644
--- a/Doc/whatsnew/3.6.rst
+++ b/Doc/whatsnew/3.6.rst
@@ -117,6 +117,9 @@ Optimizations
* The ASCII decoder is now up to 60 times as fast for error handlers:
``surrogateescape``, ``ignore`` and ``replace``.
+* The ASCII and the Latin1 encoders are now up to 3 times as fast for the error
+ handler ``surrogateescape``.
+
Build and C API Changes
=======================
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index e0e3119..254c0c1 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -3060,7 +3060,31 @@ class CodePageTest(unittest.TestCase):
class ASCIITest(unittest.TestCase):
+ def test_encode(self):
+ self.assertEqual('abc123'.encode('ascii'), b'abc123')
+
+ def test_encode_error(self):
+ for data, error_handler, expected in (
+ ('[\x80\xff\u20ac]', 'ignore', b'[]'),
+ ('[\x80\xff\u20ac]', 'replace', b'[???]'),
+ ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
+ ('[\x80\xff\u20ac]', 'backslashreplace', b'[\\x80\\xff\\u20ac]'),
+ ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
+ ):
+ with self.subTest(data=data, error_handler=error_handler,
+ expected=expected):
+ self.assertEqual(data.encode('ascii', error_handler),
+ expected)
+
+ def test_encode_surrogateescape_error(self):
+ with self.assertRaises(UnicodeEncodeError):
+ # the first character can be encoded, but not the second
+ '\udc80\xff'.encode('ascii', 'surrogateescape')
+
def test_decode(self):
+ self.assertEqual(b'abc'.decode('ascii'), 'abc')
+
+ def test_decode_error(self):
for data, error_handler, expected in (
(b'[\x80\xff]', 'ignore', '[]'),
(b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
@@ -3073,5 +3097,41 @@ class ASCIITest(unittest.TestCase):
expected)
+class Latin1Test(unittest.TestCase):
+ def test_encode(self):
+ for data, expected in (
+ ('abc', b'abc'),
+ ('\x80\xe9\xff', b'\x80\xe9\xff'),
+ ):
+ with self.subTest(data=data, expected=expected):
+ self.assertEqual(data.encode('latin1'), expected)
+
+ def test_encode_errors(self):
+ for data, error_handler, expected in (
+ ('[\u20ac\udc80]', 'ignore', b'[]'),
+ ('[\u20ac\udc80]', 'replace', b'[??]'),
+ ('[\u20ac\udc80]', 'backslashreplace', b'[\\u20ac\\udc80]'),
+ ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
+ ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
+ ):
+ with self.subTest(data=data, error_handler=error_handler,
+ expected=expected):
+ self.assertEqual(data.encode('latin1', error_handler),
+ expected)
+
+ def test_encode_surrogateescape_error(self):
+ with self.assertRaises(UnicodeEncodeError):
+ # the first character can be encoded, but not the second
+ '\udc80\u20ac'.encode('latin1', 'surrogateescape')
+
+ def test_decode(self):
+ for data, expected in (
+ (b'abc', 'abc'),
+ (b'[\x80\xff]', '[\x80\xff]'),
+ ):
+ with self.subTest(data=data, expected=expected):
+ self.assertEqual(data.decode('latin1'), expected)
+
+
if __name__ == "__main__":
unittest.main()
diff --git a/Misc/NEWS b/Misc/NEWS
index 72e521d..ceac0a3 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,10 @@ Release date: XXXX-XX-XX
Core and Builtins
-----------------
+- Issue #25227: Optimize ASCII and latin1 encoders with the ``surrogateescape``
+ error handler: the encoders are now up to 3 times as fast. Initial patch
+ written by Serhiy Storchaka.
+
- Issue #25003: On Solaris 11.3 or newer, os.urandom() now uses the
getrandom() function instead of the getentropy() function. The getentropy()
function is blocking to generate very good quality entropy, os.urandom()
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index da2aac7..6657cd4 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6532,6 +6532,22 @@ unicode_encode_ucs1(PyObject *unicode,
pos = collend;
break;
+ case _Py_ERROR_SURROGATEESCAPE:
+ for (i = collstart; i < collend; ++i) {
+ ch = PyUnicode_READ(kind, data, i);
+ if (ch < 0xdc80 || 0xdcff < ch) {
+ /* Not a UTF-8b surrogate */
+ break;
+ }
+ *str++ = (char)(ch - 0xdc00);
+ ++pos;
+ }
+ if (i >= collend)
+ break;
+ collstart = pos;
+ assert(collstart != collend);
+ /* fall back to general error handling (intentional fall-through) */
+
default:
repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj,
encoding, reason, unicode, &exc,