[3.14] gh-133767: Fix use-after-free in the unicode-escape decoder with an error handler (GH-129648) (GH-133942)

If the error handler is used, a new bytes object is created to set as the object attribute of UnicodeDecodeError, and that bytes object then replaces the original data. A pointer to the decoded data will become invalid after destroying that temporary bytes object. So we need another way to return the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal(). _PyBytes_DecodeEscape() does not have such an issue, because it does not use the error handlers registry, but it should be changed for compatibility with _PyUnicode_DecodeUnicodeEscapeInternal(). (cherry picked from commit 9f69a58623bd01349a18ba0c7a9cb1dad6a51e8e) Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
author: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> 2025-05-13 13:25:08 (GMT)
committer: GitHub <noreply@github.com> 2025-05-13 13:25:08 (GMT)
commit: 69b4387f78f413e8c47572a85b3478c47eba8142 (patch)
tree: 30bc6e580011bda809575c500cf3692edcb6a220 /Lib/test/test_codeccallbacks.py
parent: f0a7a6c2cc066523fc03d312cfebff8135d81aa2 (diff)
download: cpython-69b4387f78f413e8c47572a85b3478c47eba8142.zip
cpython-69b4387f78f413e8c47572a85b3478c47eba8142.tar.gz
cpython-69b4387f78f413e8c47572a85b3478c47eba8142.tar.bz2
1 files changed, 38 insertions, 1 deletions
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
index 86e5e5c..a767f67 100644
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -2,6 +2,7 @@ from _codecs import _unregister_error as _codecs_unregister_error
 import codecs
 import html.entities
 import itertools
+import re
 import sys
 import unicodedata
 import unittest
@@ -1125,7 +1126,7 @@ class CodecCallbackTest(unittest.TestCase):
             text = 'abc<def>ghi'*n
             text.translate(charmap)
 
-    def test_mutatingdecodehandler(self):
+    def test_mutating_decode_handler(self):
         baddata = [
             ("ascii", b"\xff"),
             ("utf-7", b"++"),
@@ -1160,6 +1161,42 @@ class CodecCallbackTest(unittest.TestCase):
         for (encoding, data) in baddata:
             self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
 
+    def test_mutating_decode_handler_unicode_escape(self):
+        decode = codecs.unicode_escape_decode
+        def mutating(exc):
+            if isinstance(exc, UnicodeDecodeError):
+                r = data.get(exc.object[:exc.end])
+                if r is not None:
+                    exc.object = r[0] + exc.object[exc.end:]
+                    return ('\u0404', r[1])
+            raise AssertionError("don't know how to handle %r" % exc)
+
+        codecs.register_error('test.mutating2', mutating)
+        data = {
+            br'\x0': (b'\\', 0),
+            br'\x3': (b'xxx\\', 3),
+            br'\x5': (b'x\\', 1),
+        }
+        def check(input, expected, msg):
+            with self.assertWarns(DeprecationWarning) as cm:
+                self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input)))
+            self.assertIn(msg, str(cm.warning))
+
+        check(br'\x0n\z', '\u0404\n\\z', r'"\z" is an invalid escape sequence')
+        check(br'\x0n\501', '\u0404\n\u0141', r'"\501" is an invalid octal escape sequence')
+        check(br'\x0z', '\u0404\\z', r'"\z" is an invalid escape sequence')
+
+        check(br'\x3n\zr', '\u0404\n\\zr', r'"\z" is an invalid escape sequence')
+        check(br'\x3zr', '\u0404\\zr', r'"\z" is an invalid escape sequence')
+        check(br'\x3z5', '\u0404\\z5', r'"\z" is an invalid escape sequence')
+        check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r'"\z" is an invalid escape sequence')
+        check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r'"\z" is an invalid escape sequence')
+
+        check(br'\x5n\z', '\u0404\n\\z', r'"\z" is an invalid escape sequence')
+        check(br'\x5n\501', '\u0404\n\u0141', r'"\501" is an invalid octal escape sequence')
+        check(br'\x5z', '\u0404\\z', r'"\z" is an invalid escape sequence')
+        check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r'"\z" is an invalid escape sequence')
+
     # issue32583
     def test_crashing_decode_handler(self):
         # better generating one more character to fill the extra space slot
author	Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>	2025-05-13 13:25:08 (GMT)
committer	GitHub <noreply@github.com>	2025-05-13 13:25:08 (GMT)
commit	69b4387f78f413e8c47572a85b3478c47eba8142 (patch)
tree	30bc6e580011bda809575c500cf3692edcb6a220 /Lib/test/test_codeccallbacks.py
parent	f0a7a6c2cc066523fc03d312cfebff8135d81aa2 (diff)
download	cpython-69b4387f78f413e8c47572a85b3478c47eba8142.zip cpython-69b4387f78f413e8c47572a85b3478c47eba8142.tar.gz cpython-69b4387f78f413e8c47572a85b3478c47eba8142.tar.bz2