From 797485e10135ca323565b22b4fabf1e161a5ec7a Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 9 Oct 2015 03:17:30 +0200 Subject: Issue #25318: Avoid sprintf() in backslashreplace() Rewrite backslashreplace() to be closer to PyCodec_BackslashReplaceErrors(). Add also unit tests for non-BMP characters. --- Lib/test/test_codecs.py | 6 ++++-- Objects/unicodeobject.c | 25 ++++++++++++++++++------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 7b6883f..ff314b1 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3155,7 +3155,8 @@ class ASCIITest(unittest.TestCase): ('[\x80\xff\u20ac]', 'ignore', b'[]'), ('[\x80\xff\u20ac]', 'replace', b'[???]'), ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'), - ('[\x80\xff\u20ac]', 'backslashreplace', b'[\\x80\\xff\\u20ac]'), + ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace', + b'[\\x80\\xff\\u20ac\\U000abcde]'), ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'), ): with self.subTest(data=data, error_handler=error_handler, @@ -3197,7 +3198,8 @@ class Latin1Test(unittest.TestCase): for data, error_handler, expected in ( ('[\u20ac\udc80]', 'ignore', b'[]'), ('[\u20ac\udc80]', 'replace', b'[??]'), - ('[\u20ac\udc80]', 'backslashreplace', b'[\\u20ac\\udc80]'), + ('[\u20ac\U000abcde]', 'backslashreplace', + b'[\\u20ac\\U000abcde]'), ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'), ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'), ): diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 614d214..10cdcc0 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -610,14 +610,25 @@ backslashreplace(_PyBytesWriter *writer, Py_ssize_t prealloc_per_char, /* generate replacement */ for (i = collstart; i < collend; ++i) { ch = PyUnicode_READ(kind, data, i); - if (ch < 0x100) - str += sprintf(str, "\\x%02x", ch); - else if (ch < 0x10000) - str += sprintf(str, "\\u%04x", ch); - else { - assert(ch <= MAX_UNICODE); 
- str += sprintf(str, "\\U%08x", ch); + *str++ = '\\'; + if (ch >= 0x00010000) { + *str++ = 'U'; + *str++ = Py_hexdigits[(ch>>28)&0xf]; + *str++ = Py_hexdigits[(ch>>24)&0xf]; + *str++ = Py_hexdigits[(ch>>20)&0xf]; + *str++ = Py_hexdigits[(ch>>16)&0xf]; + *str++ = Py_hexdigits[(ch>>12)&0xf]; + *str++ = Py_hexdigits[(ch>>8)&0xf]; + } + else if (ch >= 0x100) { + *str++ = 'u'; + *str++ = Py_hexdigits[(ch>>12)&0xf]; + *str++ = Py_hexdigits[(ch>>8)&0xf]; } + else + *str++ = 'x'; + *str++ = Py_hexdigits[(ch>>4)&0xf]; + *str++ = Py_hexdigits[ch&0xf]; } return str; } -- cgit v0.12