summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSerhiy Storchaka <storchaka@gmail.com>2014-11-25 11:57:17 (GMT)
committerSerhiy Storchaka <storchaka@gmail.com>2014-11-25 11:57:17 (GMT)
commit166ebc4e5dd09f005c6144b7568da83728b8b893 (patch)
treef6b9deb3cb72095ef55bcef31637f4aaafe95248
parent6cecf68c7b51390429a2488846b1d0c29581987a (diff)
downloadcpython-166ebc4e5dd09f005c6144b7568da83728b8b893.zip
cpython-166ebc4e5dd09f005c6144b7568da83728b8b893.tar.gz
cpython-166ebc4e5dd09f005c6144b7568da83728b8b893.tar.bz2
Issue #19676: Added the "namereplace" error handler.
-rw-r--r--Doc/c-api/codec.rst5
-rw-r--r--Doc/howto/unicode.rst7
-rw-r--r--Doc/library/codecs.rst17
-rw-r--r--Doc/library/functions.rst3
-rw-r--r--Doc/library/io.rst7
-rw-r--r--Include/codecs.h3
-rw-r--r--Lib/codecs.py3
-rw-r--r--Lib/test/test_codeccallbacks.py100
-rw-r--r--Lib/test/test_codecs.py7
-rw-r--r--Misc/NEWS2
-rw-r--r--Python/codecs.c108
11 files changed, 255 insertions, 7 deletions
diff --git a/Doc/c-api/codec.rst b/Doc/c-api/codec.rst
index 83252af..5bb56e3 100644
--- a/Doc/c-api/codec.rst
+++ b/Doc/c-api/codec.rst
@@ -116,3 +116,8 @@ Registry API for Unicode encoding error handlers
Replace the unicode encode error with backslash escapes (``\x``, ``\u`` and
``\U``).
+.. c:function:: PyObject* PyCodec_NameReplaceErrors(PyObject *exc)
+
+ Replace the unicode encode error with ``\N{...}`` escapes.
+
+ .. versionadded:: 3.5
diff --git a/Doc/howto/unicode.rst b/Doc/howto/unicode.rst
index 50bca5a..aac2373 100644
--- a/Doc/howto/unicode.rst
+++ b/Doc/howto/unicode.rst
@@ -325,8 +325,9 @@ The *errors* parameter is the same as the parameter of the
:meth:`~bytes.decode` method but supports a few more possible handlers. As well as
``'strict'``, ``'ignore'``, and ``'replace'`` (which in this case
inserts a question mark instead of the unencodable character), there is
-also ``'xmlcharrefreplace'`` (inserts an XML character reference) and
-``backslashreplace`` (inserts a ``\uNNNN`` escape sequence).
+also ``'xmlcharrefreplace'`` (inserts an XML character reference),
+``backslashreplace`` (inserts a ``\uNNNN`` escape sequence) and
+``namereplace`` (inserts a ``\N{...}`` escape sequence).
The following example shows the different results::
@@ -346,6 +347,8 @@ The following example shows the different results::
b'&#40960;abcd&#1972;'
>>> u.encode('ascii', 'backslashreplace')
b'\\ua000abcd\\u07b4'
+ >>> u.encode('ascii', 'namereplace')
+ b'\\N{YI SYLLABLE IT}abcd\\u07b4'
The low-level routines for registering and accessing the available
encodings are found in the :mod:`codecs` module. Implementing new
diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst
index 4c2a023..ea4c450 100644
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@@ -98,6 +98,8 @@ It defines the following functions:
reference (for encoding only)
* ``'backslashreplace'``: replace with backslashed escape sequences (for
encoding only)
+ * ``'namereplace'``: replace with ``\N{...}`` escape sequences (for
+ encoding only)
* ``'surrogateescape'``: on decoding, replace with code points in the Unicode
Private Use Area ranging from U+DC80 to U+DCFF. These private code
points will then be turned back into the same bytes when the
@@ -232,6 +234,11 @@ functions which use :func:`lookup` for the codec lookup:
Implements the ``backslashreplace`` error handling (for encoding only): the
unencodable character is replaced by a backslashed escape sequence.
+.. function:: namereplace_errors(exception)
+
+ Implements the ``namereplace`` error handling (for encoding only): the
+ unencodable character is replaced by a ``\N{...}`` escape sequence.
+
To simplify working with encoded files or stream, the module also defines these
utility functions:
@@ -363,6 +370,9 @@ and implemented by all standard Python codecs:
| ``'backslashreplace'`` | Replace with backslashed escape sequences |
| | (only for encoding). |
+-------------------------+-----------------------------------------------+
+| ``'namereplace'`` | Replace with ``\N{...}`` escape sequences |
+| | (only for encoding). |
++-------------------------+-----------------------------------------------+
| ``'surrogateescape'`` | Replace byte with surrogate U+DCxx, as defined|
| | in :pep:`383`. |
+-------------------------+-----------------------------------------------+
@@ -384,6 +394,9 @@ schemes:
.. versionchanged:: 3.4
The ``'surrogatepass'`` error handlers now works with utf-16\* and utf-32\* codecs.
+.. versionadded:: 3.5
+ The ``'namereplace'`` error handler.
+
The set of allowed values can be extended via :meth:`register_error`.
@@ -477,6 +490,8 @@ define in order to be compatible with the Python codec registry.
* ``'backslashreplace'`` Replace with backslashed escape sequences.
+ * ``'namereplace'`` Replace with ``\N{...}`` escape sequences.
+
The *errors* argument will be assigned to an attribute of the same name.
Assigning to this attribute makes it possible to switch between different error
handling strategies during the lifetime of the :class:`IncrementalEncoder`
@@ -625,6 +640,8 @@ compatible with the Python codec registry.
* ``'backslashreplace'`` Replace with backslashed escape sequences.
+ * ``'namereplace'`` Replace with ``\N{...}`` escape sequences.
+
The *errors* argument will be assigned to an attribute of the same name.
Assigning to this attribute makes it possible to switch between different error
handling strategies during the lifetime of the :class:`StreamWriter` object.
diff --git a/Doc/library/functions.rst b/Doc/library/functions.rst
index 9e38d6f..d1e3407 100644
--- a/Doc/library/functions.rst
+++ b/Doc/library/functions.rst
@@ -975,6 +975,9 @@ are always available. They are listed here in alphabetical order.
replaces unsupported characters with Python's backslashed escape
sequences.
+ * ``'namereplace'`` (also only supported when writing)
+ replaces unsupported characters with ``\N{...}`` escape sequences.
+
.. index::
single: universal newlines; open() built-in function
diff --git a/Doc/library/io.rst b/Doc/library/io.rst
index 0054286..c77db90 100644
--- a/Doc/library/io.rst
+++ b/Doc/library/io.rst
@@ -827,9 +827,10 @@ Text I/O
errors can lead to data loss.) ``'replace'`` causes a replacement marker
(such as ``'?'``) to be inserted where there is malformed data. When
writing, ``'xmlcharrefreplace'`` (replace with the appropriate XML character
- reference) or ``'backslashreplace'`` (replace with backslashed escape
- sequences) can be used. Any other error handling name that has been
- registered with :func:`codecs.register_error` is also valid.
+ reference), ``'backslashreplace'`` (replace with backslashed escape
+ sequences) or ``'namereplace'`` (replace with ``\N{...}`` escape sequences)
+ can be used. Any other error handling name that has been registered with
+ :func:`codecs.register_error` is also valid.
.. index::
single: universal newlines; io.TextIOWrapper class
diff --git a/Include/codecs.h b/Include/codecs.h
index b3088e4..4669135 100644
--- a/Include/codecs.h
+++ b/Include/codecs.h
@@ -225,6 +225,9 @@ PyAPI_FUNC(PyObject *) PyCodec_XMLCharRefReplaceErrors(PyObject *exc);
/* replace the unicode encode error with backslash escapes (\x, \u and \U) */
PyAPI_FUNC(PyObject *) PyCodec_BackslashReplaceErrors(PyObject *exc);
+/* replace the unicode encode error with backslash escapes (\N, \x, \u and \U) */
+PyAPI_FUNC(PyObject *) PyCodec_NameReplaceErrors(PyObject *exc);
+
PyAPI_DATA(const char *) Py_hexdigits;
#ifdef __cplusplus
diff --git a/Lib/codecs.py b/Lib/codecs.py
index 9934517..85df89a 100644
--- a/Lib/codecs.py
+++ b/Lib/codecs.py
@@ -22,6 +22,7 @@ __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
"BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
"strict_errors", "ignore_errors", "replace_errors",
"xmlcharrefreplace_errors",
+ "backslashreplace_errors", "namereplace_errors",
"register_error", "lookup_error"]
### Constants
@@ -1085,6 +1086,7 @@ try:
replace_errors = lookup_error("replace")
xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
backslashreplace_errors = lookup_error("backslashreplace")
+ namereplace_errors = lookup_error("namereplace")
except LookupError:
# In --disable-unicode builds, these error handler are missing
strict_errors = None
@@ -1092,6 +1094,7 @@ except LookupError:
replace_errors = None
xmlcharrefreplace_errors = None
backslashreplace_errors = None
+ namereplace_errors = None
# Tell modulefinder that using codecs probably needs the encodings
# package
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
index a1ce9cf..9743791 100644
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -158,6 +158,22 @@ class CodecCallbackTest(unittest.TestCase):
sout = b"a\xac\\u1234\xa4\\u8000\\U0010ffff"
self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
+ def test_nameescape(self):
+ # Does the same as backslashescape, but prefers ``\N{...}`` escape
+ # sequences.
+ sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
+ sout = (b'a\\N{NOT SIGN}\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
+ b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
+ self.assertEqual(sin.encode("ascii", "namereplace"), sout)
+
+ sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
+ b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
+ self.assertEqual(sin.encode("latin-1", "namereplace"), sout)
+
+ sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\xa4'
+ b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
+ self.assertEqual(sin.encode("iso-8859-15", "namereplace"), sout)
+
def test_decoding_callbacks(self):
# This is a test for a decoding callback handler
# that allows the decoding of the invalid sequence
@@ -297,7 +313,7 @@ class CodecCallbackTest(unittest.TestCase):
def test_longstrings(self):
# test long strings to check for memory overflow problems
errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
- "backslashreplace"]
+ "backslashreplace", "namereplace"]
# register the handlers under different names,
# to prevent the codec from recognizing the name
for err in errors:
@@ -611,6 +627,81 @@ class CodecCallbackTest(unittest.TestCase):
("\\udfff", 1)
)
+ def test_badandgoodnamereplaceexceptions(self):
+ # Calls codecs.namereplace_errors directly: unsuitable arguments must
+ # raise TypeError, and a UnicodeEncodeError must produce a
+ # (replacement string, resume position) tuple.
+ # "namereplace" complains about a non-exception passed in
+ self.assertRaises(
+ TypeError,
+ codecs.namereplace_errors,
+ 42
+ )
+ # "namereplace" complains about the wrong exception types
+ self.assertRaises(
+ TypeError,
+ codecs.namereplace_errors,
+ UnicodeError("ouch")
+ )
+ # "namereplace" can only be used for encoding
+ self.assertRaises(
+ TypeError,
+ codecs.namereplace_errors,
+ UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
+ )
+ self.assertRaises(
+ TypeError,
+ codecs.namereplace_errors,
+ UnicodeTranslateError("\u3042", 0, 1, "ouch")
+ )
+ # Use the correct exception
+ self.assertEqual(
+ codecs.namereplace_errors(
+ UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")),
+ ("\\N{HIRAGANA LETTER A}", 1)
+ )
+ # U+0000 is expected to come back as a two-digit \x escape rather
+ # than a \N{...} escape.
+ self.assertEqual(
+ codecs.namereplace_errors(
+ UnicodeEncodeError("ascii", "\x00", 0, 1, "ouch")),
+ ("\\x00", 1)
+ )
+ self.assertEqual(
+ codecs.namereplace_errors(
+ UnicodeEncodeError("ascii", "\xff", 0, 1, "ouch")),
+ ("\\N{LATIN SMALL LETTER Y WITH DIAERESIS}", 1)
+ )
+ self.assertEqual(
+ codecs.namereplace_errors(
+ UnicodeEncodeError("ascii", "\u0100", 0, 1, "ouch")),
+ ("\\N{LATIN CAPITAL LETTER A WITH MACRON}", 1)
+ )
+ # U+FFFF is expected to come back as a four-digit \u escape.
+ self.assertEqual(
+ codecs.namereplace_errors(
+ UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")),
+ ("\\uffff", 1)
+ )
+ # NOTE(review): SIZEOF_WCHAR_T is defined outside this hunk, and the
+ # indentation is not visible here, so confirm against the file which
+ # of the following assertions this guard actually covers.
+ if SIZEOF_WCHAR_T > 0:
+ self.assertEqual(
+ codecs.namereplace_errors(
+ UnicodeEncodeError("ascii", "\U00010000",
+ 0, 1, "ouch")),
+ ("\\N{LINEAR B SYLLABLE B008 A}", 1)
+ )
+ self.assertEqual(
+ codecs.namereplace_errors(
+ UnicodeEncodeError("ascii", "\U0010ffff",
+ 0, 1, "ouch")),
+ ("\\U0010ffff", 1)
+ )
+ # Lone surrogates (regardless of unicode width)
+ self.assertEqual(
+ codecs.namereplace_errors(
+ UnicodeEncodeError("ascii", "\ud800", 0, 1, "ouch")),
+ ("\\ud800", 1)
+ )
+ self.assertEqual(
+ codecs.namereplace_errors(
+ UnicodeEncodeError("ascii", "\udfff", 0, 1, "ouch")),
+ ("\\udfff", 1)
+ )
+
def test_badhandlerresults(self):
results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
@@ -651,6 +742,10 @@ class CodecCallbackTest(unittest.TestCase):
codecs.backslashreplace_errors,
codecs.lookup_error("backslashreplace")
)
+ self.assertEqual(
+ codecs.namereplace_errors,
+ codecs.lookup_error("namereplace")
+ )
def test_unencodablereplacement(self):
def unencrepl(exc):
@@ -804,7 +899,8 @@ class CodecCallbackTest(unittest.TestCase):
class D(dict):
def __getitem__(self, key):
raise ValueError
- for err in ("strict", "replace", "xmlcharrefreplace", "backslashreplace", "test.posreturn"):
+ for err in ("strict", "replace", "xmlcharrefreplace",
+ "backslashreplace", "namereplace", "test.posreturn"):
self.assertRaises(UnicodeError, codecs.charmap_encode, "\xff", err, {0xff: None})
self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D())
self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300})
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 7d0eeb6..e03a1db 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -349,6 +349,8 @@ class ReadTest(MixInCheckStateHandling):
self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
"[\\udc80]".encode(self.encoding))
+ self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
+ "[\\udc80]".encode(self.encoding))
self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
"[&#56448;]".encode(self.encoding))
self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
@@ -808,6 +810,7 @@ class CP65001Test(ReadTest, unittest.TestCase):
('\udc80', 'ignore', b''),
('\udc80', 'replace', b'?'),
('\udc80', 'backslashreplace', b'\\udc80'),
+ ('\udc80', 'namereplace', b'\\udc80'),
('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
))
else:
@@ -869,6 +872,8 @@ class CP65001Test(ReadTest, unittest.TestCase):
self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
b'[\\udc80]')
+ self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
+ b'[\\udc80]')
self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
b'[&#56448;]')
self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
@@ -2824,6 +2829,8 @@ class CodePageTest(unittest.TestCase):
('[\xff]', 'replace', b'[y]'),
('[\u20ac]', 'replace', b'[?]'),
('[\xff]', 'backslashreplace', b'[\\xff]'),
+ ('[\xff]', 'namereplace',
+ b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
('\udcff', 'strict', None),
('[\udcff]', 'surrogateescape', b'[\xff]'),
diff --git a/Misc/NEWS b/Misc/NEWS
index bf117c2..2662e60 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -191,6 +191,8 @@ Core and Builtins
Library
-------
+- Issue #19676: Added the "namereplace" error handler.
+
- Issue #22788: Add *context* parameter to logging.handlers.HTTPHandler.
- Issue #22921: Allow SSLContext to take the *hostname* parameter even if
diff --git a/Python/codecs.c b/Python/codecs.c
index 151fea7..b09ea3a 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -9,6 +9,7 @@ Copyright (c) Corporation for National Research Initiatives.
------------------------------------------------------------------------ */
#include "Python.h"
+#include "ucnhash.h"
#include <ctype.h>
const char *Py_hexdigits = "0123456789abcdef";
@@ -933,6 +934,97 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
}
}
+/* Name-lookup C API imported through PyUnicodeData_CAPSULE_NAME.  Remains
+   NULL when the import failed, in which case only numeric escapes are used. */
+static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
+/* Set after the first import attempt so that the import is never retried. */
+static int ucnhash_initialized = 0;
+
+/* Implements the "namereplace" error handler (encoding only).
+
+   exc must be a UnicodeEncodeError; for anything else
+   wrong_exception_type() is called and NULL is returned.
+
+   Every character in the error range [start, end) is replaced by \N{NAME}
+   when the name lookup yields a name for it, otherwise by a \x, \u or \U
+   escape with 2, 4 or 8 hex digits depending on the code point.
+
+   Returns a new (replacement, end) tuple, or NULL with an exception set.
+   If the replacement would not fit in Py_ssize_t, only a prefix of the
+   range is converted and the returned position points just past it. */
+PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
+{
+    PyObject *restuple;
+    PyObject *object;
+    PyObject *res;
+    Py_ssize_t i;
+    Py_ssize_t start;
+    Py_ssize_t end;
+    Py_ssize_t ressize;
+    Py_ssize_t replsize;
+    size_t namelen;
+    unsigned char *outp;
+    Py_UCS4 c;
+    char buffer[256]; /* NAME_MAXLEN */
+
+    if (!PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
+        wrong_exception_type(exc);
+        return NULL;
+    }
+    if (PyUnicodeEncodeError_GetStart(exc, &start))
+        return NULL;
+    if (PyUnicodeEncodeError_GetEnd(exc, &end))
+        return NULL;
+    /* 'object' is an owned reference from here on; every exit path below
+       must release it. */
+    if (!(object = PyUnicodeEncodeError_GetObject(exc)))
+        return NULL;
+    if (!ucnhash_initialized) {
+        /* load the unicode data module */
+        ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
+                                        PyUnicodeData_CAPSULE_NAME, 1);
+        /* A failed import only disables the \N{...} form.  Drop the pending
+           exception so a result is never returned with an error set. */
+        if (ucnhash_CAPI == NULL)
+            PyErr_Clear();
+        ucnhash_initialized = 1;
+    }
+
+    /* First pass: compute the exact size of the replacement string. */
+    for (i = start, ressize = 0; i < end; ++i) {
+        /* object is guaranteed to be "ready" */
+        c = PyUnicode_READ_CHAR(object, i);
+        if (ucnhash_CAPI &&
+            ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
+            /* backslash, 'N', '{', name, '}' */
+            replsize = 1+1+1+(Py_ssize_t)strlen(buffer)+1;
+        }
+        else if (c >= 0x10000) {
+            replsize = 1+1+8;
+        }
+        else if (c >= 0x100) {
+            replsize = 1+1+4;
+        }
+        else
+            replsize = 1+1+2;
+        /* Stop before the total overflows; the rest of the range is left
+           for a later call of the handler. */
+        if (ressize > PY_SSIZE_T_MAX - replsize)
+            break;
+        ressize += replsize;
+    }
+    if (i < end)
+        end = i;
+
+    res = PyUnicode_New(ressize, 127);
+    if (res == NULL) {
+        Py_DECREF(object);
+        return NULL;
+    }
+
+    /* Second pass: write the escapes.  The name lookup is repeated so the
+       names do not have to be stored between the passes. */
+    for (i = start, outp = PyUnicode_1BYTE_DATA(res);
+        i < end; ++i) {
+        c = PyUnicode_READ_CHAR(object, i);
+        *outp++ = '\\';
+        if (ucnhash_CAPI &&
+            ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
+            namelen = strlen(buffer);
+            *outp++ = 'N';
+            *outp++ = '{';
+            memcpy(outp, buffer, namelen);
+            outp += namelen;
+            *outp++ = '}';
+            continue;
+        }
+        /* Numeric fallback: each branch emits its leading hex digits and
+           the two lowest digits are written by the shared tail below. */
+        if (c >= 0x00010000) {
+            *outp++ = 'U';
+            *outp++ = Py_hexdigits[(c>>28)&0xf];
+            *outp++ = Py_hexdigits[(c>>24)&0xf];
+            *outp++ = Py_hexdigits[(c>>20)&0xf];
+            *outp++ = Py_hexdigits[(c>>16)&0xf];
+            *outp++ = Py_hexdigits[(c>>12)&0xf];
+            *outp++ = Py_hexdigits[(c>>8)&0xf];
+        }
+        else if (c >= 0x100) {
+            *outp++ = 'u';
+            *outp++ = Py_hexdigits[(c>>12)&0xf];
+            *outp++ = Py_hexdigits[(c>>8)&0xf];
+        }
+        else
+            *outp++ = 'x';
+        *outp++ = Py_hexdigits[(c>>4)&0xf];
+        *outp++ = Py_hexdigits[c&0xf];
+    }
+
+    assert(_PyUnicode_CheckConsistency(res, 1));
+    /* "N" hands the reference to res over to the tuple. */
+    restuple = Py_BuildValue("(Nn)", res, end);
+    Py_DECREF(object);
+    return restuple;
+}
+
#define ENC_UNKNOWN -1
#define ENC_UTF8 0
#define ENC_UTF16BE 1
@@ -1276,6 +1368,11 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
return PyCodec_BackslashReplaceErrors(exc);
}
+/* Callable wrapper around PyCodec_NameReplaceErrors(); the self argument
+   is not used and exc is passed through unchanged. */
+static PyObject *
+namereplace_errors(PyObject *self, PyObject *exc)
+{
+    PyObject *result = PyCodec_NameReplaceErrors(exc);
+
+    return result;
+}
+
static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
{
return PyCodec_SurrogatePassErrors(exc);
@@ -1346,6 +1443,17 @@ static int _PyCodecRegistry_Init(void)
}
},
{
+ "namereplace",
+ {
+ "namereplace_errors",
+ namereplace_errors,
+ METH_O,
+ PyDoc_STR("Implements the 'namereplace' error handling, "
+ "which replaces an unencodable character with a "
+ "\\N{...} escape sequence.")
+ }
+ },
+ {
"surrogatepass",
{
"surrogatepass",