summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVictor Stinner <vstinner@redhat.com>2018-08-29 20:21:32 (GMT)
committerGitHub <noreply@github.com>2018-08-29 20:21:32 (GMT)
commit3d4226a832cabc630402589cc671cc4035d504e5 (patch)
treea1c5b1c51cbbca3aedd52593c979a5c15d72dd52
parentc5989cd87659acbfd4d19dc00dbe99c3a0fc9bd2 (diff)
downloadcpython-3d4226a832cabc630402589cc671cc4035d504e5.zip
cpython-3d4226a832cabc630402589cc671cc4035d504e5.tar.gz
cpython-3d4226a832cabc630402589cc671cc4035d504e5.tar.bz2
bpo-34523: Support surrogatepass in locale codecs (GH-8995)
Add support for the "surrogatepass" error handler in PyUnicode_DecodeFSDefault() and PyUnicode_EncodeFSDefault() for the UTF-8 encoding.

Changes:

* _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex() now support the surrogatepass error handler (_Py_ERROR_SURROGATEPASS).
* _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx() now use the _Py_error_handler enum instead of "int surrogateescape" to pass the error handler. These functions now return -3 if the error handler is unknown.
* Add unit tests on _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx() in test_codecs.
* Rename get_error_handler() to _Py_GetErrorHandler() and expose it as a private function.
* _freeze_importlib doesn't need the config.filesystem_errors="strict" workaround anymore.
-rw-r--r--Include/fileutils.h29
-rw-r--r--Lib/test/test_codecs.py118
-rw-r--r--Modules/_testcapimodule.c94
-rw-r--r--Objects/stringlib/codecs.h2
-rw-r--r--Objects/unicodeobject.c173
-rw-r--r--Programs/_freeze_importlib.c8
-rw-r--r--Python/fileutils.c112
7 files changed, 421 insertions, 115 deletions
diff --git a/Include/fileutils.h b/Include/fileutils.h
index 3708784..f0a8e2c 100644
--- a/Include/fileutils.h
+++ b/Include/fileutils.h
@@ -5,6 +5,24 @@
extern "C" {
#endif
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03080000
+typedef enum {
+ _Py_ERROR_UNKNOWN=0,
+ _Py_ERROR_STRICT,
+ _Py_ERROR_SURROGATEESCAPE,
+ _Py_ERROR_REPLACE,
+ _Py_ERROR_IGNORE,
+ _Py_ERROR_BACKSLASHREPLACE,
+ _Py_ERROR_SURROGATEPASS,
+ _Py_ERROR_XMLCHARREFREPLACE,
+ _Py_ERROR_OTHER
+} _Py_error_handler;
+
+PyAPI_FUNC(_Py_error_handler) _Py_GetErrorHandler(const char *errors);
+#endif
+
+
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
PyAPI_FUNC(wchar_t *) Py_DecodeLocale(
const char *arg,
@@ -26,7 +44,7 @@ PyAPI_FUNC(int) _Py_DecodeUTF8Ex(
wchar_t **wstr,
size_t *wlen,
const char **reason,
- int surrogateescape);
+ _Py_error_handler errors);
PyAPI_FUNC(int) _Py_EncodeUTF8Ex(
const wchar_t *text,
@@ -34,19 +52,22 @@ PyAPI_FUNC(int) _Py_EncodeUTF8Ex(
size_t *error_pos,
const char **reason,
int raw_malloc,
- int surrogateescape);
+ _Py_error_handler errors);
PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape(
const char *arg,
Py_ssize_t arglen);
+#endif
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03080000
PyAPI_FUNC(int) _Py_DecodeLocaleEx(
const char *arg,
wchar_t **wstr,
size_t *wlen,
const char **reason,
int current_locale,
- int surrogateescape);
+ _Py_error_handler errors);
PyAPI_FUNC(int) _Py_EncodeLocaleEx(
const wchar_t *text,
@@ -54,7 +75,7 @@ PyAPI_FUNC(int) _Py_EncodeLocaleEx(
size_t *error_pos,
const char **reason,
int current_locale,
- int surrogateescape);
+ _Py_error_handler errors);
#endif
#ifndef Py_LIMITED_API
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
index 86d0dde..00b5d31 100644
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -10,6 +10,11 @@ from unittest import mock
from test import support
try:
+ import _testcapi
+except ImportError as exc:
+ _testcapi = None
+
+try:
import ctypes
except ImportError:
ctypes = None
@@ -2051,13 +2056,12 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
@support.cpython_only
def test_basics_capi(self):
- from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
s = "abc123" # all codecs should be able to encode these
for encoding in all_unicode_encodings:
if encoding not in broken_unicode_with_stateful:
# check incremental decoder/encoder (fetched via the C API)
try:
- cencoder = codec_incrementalencoder(encoding)
+ cencoder = _testcapi.codec_incrementalencoder(encoding)
except LookupError: # no IncrementalEncoder
pass
else:
@@ -2066,7 +2070,7 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
for c in s:
encodedresult += cencoder.encode(c)
encodedresult += cencoder.encode("", True)
- cdecoder = codec_incrementaldecoder(encoding)
+ cdecoder = _testcapi.codec_incrementaldecoder(encoding)
decodedresult = ""
for c in encodedresult:
decodedresult += cdecoder.decode(bytes([c]))
@@ -2077,12 +2081,12 @@ class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
if encoding not in ("idna", "mbcs"):
# check incremental decoder/encoder with errors argument
try:
- cencoder = codec_incrementalencoder(encoding, "ignore")
+ cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
except LookupError: # no IncrementalEncoder
pass
else:
encodedresult = b"".join(cencoder.encode(c) for c in s)
- cdecoder = codec_incrementaldecoder(encoding, "ignore")
+ cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
decodedresult = "".join(cdecoder.decode(bytes([c]))
for c in encodedresult)
self.assertEqual(decodedresult, s,
@@ -3263,5 +3267,109 @@ class Latin1Test(unittest.TestCase):
self.assertEqual(data.decode('latin1'), expected)
+@unittest.skipIf(_testcapi is None, 'need _testcapi module')
+class LocaleCodecTest(unittest.TestCase):
+ """
+ Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().
+ """
+ ENCODING = sys.getfilesystemencoding()
+ STRINGS = ("ascii", "ulatin1:\xa7\xe9",
+ "u255:\xff",
+ "UCS:\xe9\u20ac\U0010ffff",
+ "surrogates:\uDC80\uDCFF")
+ BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
+ SURROGATES = "\uDC80\uDCFF"
+
+ def encode(self, text, errors="strict"):
+ return _testcapi.EncodeLocaleEx(text, 0, errors)
+
+ def check_encode_strings(self, errors):
+ for text in self.STRINGS:
+ with self.subTest(text=text):
+ try:
+ expected = text.encode(self.ENCODING, errors)
+ except UnicodeEncodeError:
+ with self.assertRaises(RuntimeError) as cm:
+ self.encode(self.SURROGATES)
+ errmsg = str(cm.exception)
+ self.assertTrue(errmsg.startswith("encode error: pos=0, reason="), errmsg)
+ else:
+ encoded = self.encode(text, errors)
+ self.assertEqual(encoded, expected)
+
+ def test_encode_strict(self):
+ self.check_encode_strings("strict")
+
+ def test_encode_surrogateescape(self):
+ self.check_encode_strings("surrogateescape")
+
+ def test_encode_surrogatepass(self):
+ try:
+ self.encode('', 'surrogatepass')
+ except ValueError as exc:
+ if str(exc) == 'unsupported error handler':
+ self.skipTest(f"{self.ENCODING!r} encoder doesn't support "
+ f"surrogatepass error handler")
+ else:
+ raise
+
+ self.check_encode_strings("surrogatepass")
+
+ def decode(self, encoded, errors="strict"):
+ return _testcapi.DecodeLocaleEx(encoded, 0, errors)
+
+ def check_decode_strings(self, errors):
+ is_utf8 = (self.ENCODING == "utf-8")
+ if is_utf8:
+ encode_errors = 'surrogateescape'
+ else:
+ encode_errors = 'strict'
+
+ strings = list(self.BYTES_STRINGS)
+ for text in self.STRINGS:
+ try:
+ encoded = text.encode(self.ENCODING, encode_errors)
+ if encoded not in strings:
+ strings.append(encoded)
+ except UnicodeEncodeError:
+ encoded = None
+
+ if is_utf8:
+ encoded2 = text.encode(self.ENCODING, 'surrogatepass')
+ if encoded2 != encoded:
+ strings.append(encoded2)
+
+ for encoded in strings:
+ with self.subTest(encoded=encoded):
+ try:
+ expected = encoded.decode(self.ENCODING, errors)
+ except UnicodeDecodeError:
+ with self.assertRaises(RuntimeError) as cm:
+ self.decode(encoded, errors)
+ errmsg = str(cm.exception)
+ self.assertTrue(errmsg.startswith("decode error: "), errmsg)
+ else:
+ decoded = self.decode(encoded, errors)
+ self.assertEqual(decoded, expected)
+
+ def test_decode_strict(self):
+ self.check_decode_strings("strict")
+
+ def test_decode_surrogateescape(self):
+ self.check_decode_strings("surrogateescape")
+
+ def test_decode_surrogatepass(self):
+ try:
+ self.decode(b'', 'surrogatepass')
+ except ValueError as exc:
+ if str(exc) == 'unsupported error handler':
+ self.skipTest(f"{self.ENCODING!r} decoder doesn't support "
+ f"surrogatepass error handler")
+ else:
+ raise
+
+ self.check_decode_strings("surrogatepass")
+
+
if __name__ == "__main__":
unittest.main()
diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c
index 014c2f3..7c2c57b 100644
--- a/Modules/_testcapimodule.c
+++ b/Modules/_testcapimodule.c
@@ -4550,6 +4550,98 @@ new_hamt(PyObject *self, PyObject *args)
}
+static PyObject *
+encode_locale_ex(PyObject *self, PyObject *args)
+{
+ PyObject *unicode;
+ int current_locale = 0;
+ wchar_t *wstr;
+ PyObject *res = NULL;
+ const char *errors = NULL;
+
+ if (!PyArg_ParseTuple(args, "U|is", &unicode, &current_locale, &errors)) {
+ return NULL;
+ }
+ wstr = PyUnicode_AsWideCharString(unicode, NULL);
+ if (wstr == NULL) {
+ return NULL;
+ }
+ _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
+
+ char *str = NULL;
+ size_t error_pos;
+ const char *reason = NULL;
+ int ret = _Py_EncodeLocaleEx(wstr,
+ &str, &error_pos, &reason,
+ current_locale, error_handler);
+ PyMem_Free(wstr);
+
+ switch(ret) {
+ case 0:
+ res = PyBytes_FromString(str);
+ PyMem_RawFree(str);
+ break;
+ case -1:
+ PyErr_NoMemory();
+ break;
+ case -2:
+ PyErr_Format(PyExc_RuntimeError, "encode error: pos=%zu, reason=%s",
+ error_pos, reason);
+ break;
+ case -3:
+ PyErr_SetString(PyExc_ValueError, "unsupported error handler");
+ break;
+ default:
+ PyErr_SetString(PyExc_ValueError, "unknow error code");
+ break;
+ }
+ return res;
+}
+
+
+static PyObject *
+decode_locale_ex(PyObject *self, PyObject *args)
+{
+ char *str;
+ int current_locale = 0;
+ PyObject *res = NULL;
+ const char *errors = NULL;
+
+ if (!PyArg_ParseTuple(args, "y|is", &str, &current_locale, &errors)) {
+ return NULL;
+ }
+ _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
+
+ wchar_t *wstr = NULL;
+ size_t wlen = 0;
+ const char *reason = NULL;
+ int ret = _Py_DecodeLocaleEx(str,
+ &wstr, &wlen, &reason,
+ current_locale, error_handler);
+
+ switch(ret) {
+ case 0:
+ res = PyUnicode_FromWideChar(wstr, wlen);
+ PyMem_RawFree(wstr);
+ break;
+ case -1:
+ PyErr_NoMemory();
+ break;
+ case -2:
+ PyErr_Format(PyExc_RuntimeError, "decode error: pos=%zu, reason=%s",
+ wlen, reason);
+ break;
+ case -3:
+ PyErr_SetString(PyExc_ValueError, "unsupported error handler");
+ break;
+ default:
+ PyErr_SetString(PyExc_ValueError, "unknow error code");
+ break;
+ }
+ return res;
+}
+
+
static PyMethodDef TestMethods[] = {
{"raise_exception", raise_exception, METH_VARARGS},
{"raise_memoryerror", raise_memoryerror, METH_NOARGS},
@@ -4771,6 +4863,8 @@ static PyMethodDef TestMethods[] = {
{"get_mapping_items", get_mapping_items, METH_O},
{"test_pythread_tss_key_state", test_pythread_tss_key_state, METH_VARARGS},
{"hamt", new_hamt, METH_NOARGS},
+ {"EncodeLocaleEx", encode_locale_ex, METH_VARARGS},
+ {"DecodeLocaleEx", decode_locale_ex, METH_VARARGS},
{NULL, NULL} /* sentinel */
};
diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h
index f019d9a..0abb4c8 100644
--- a/Objects/stringlib/codecs.h
+++ b/Objects/stringlib/codecs.h
@@ -313,7 +313,7 @@ STRINGLIB(utf8_encoder)(PyObject *unicode,
Py_ssize_t startpos, endpos, newpos;
Py_ssize_t k;
if (error_handler == _Py_ERROR_UNKNOWN) {
- error_handler = get_error_handler(errors);
+ error_handler = _Py_GetErrorHandler(errors);
}
startpos = i-1;
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 60adcd9..a797f83 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -318,20 +318,8 @@ static int convert_uc(PyObject *obj, void *addr);
#include "clinic/unicodeobject.c.h"
-typedef enum {
- _Py_ERROR_UNKNOWN=0,
- _Py_ERROR_STRICT,
- _Py_ERROR_SURROGATEESCAPE,
- _Py_ERROR_REPLACE,
- _Py_ERROR_IGNORE,
- _Py_ERROR_BACKSLASHREPLACE,
- _Py_ERROR_SURROGATEPASS,
- _Py_ERROR_XMLCHARREFREPLACE,
- _Py_ERROR_OTHER
-} _Py_error_handler;
-
-static _Py_error_handler
-get_error_handler(const char *errors)
+_Py_error_handler
+_Py_GetErrorHandler(const char *errors)
{
if (errors == NULL || strcmp(errors, "strict") == 0) {
return _Py_ERROR_STRICT;
@@ -3327,34 +3315,12 @@ PyUnicode_AsEncodedObject(PyObject *unicode,
return NULL;
}
-static int
-locale_error_handler(const char *errors, int *surrogateescape)
-{
- _Py_error_handler error_handler = get_error_handler(errors);
- switch (error_handler)
- {
- case _Py_ERROR_STRICT:
- *surrogateescape = 0;
- return 0;
- case _Py_ERROR_SURROGATEESCAPE:
- *surrogateescape = 1;
- return 0;
- default:
- PyErr_Format(PyExc_ValueError,
- "only 'strict' and 'surrogateescape' error handlers "
- "are supported, not '%s'",
- errors);
- return -1;
- }
-}
static PyObject *
unicode_encode_locale(PyObject *unicode, const char *errors,
int current_locale)
{
- int surrogateescape;
- if (locale_error_handler(errors, &surrogateescape) < 0)
- return NULL;
+ _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
Py_ssize_t wlen;
wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
@@ -3373,7 +3339,7 @@ unicode_encode_locale(PyObject *unicode, const char *errors,
size_t error_pos;
const char *reason;
int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
- current_locale, surrogateescape);
+ current_locale, error_handler);
if (res != 0) {
if (res == -2) {
PyObject *exc;
@@ -3388,6 +3354,9 @@ unicode_encode_locale(PyObject *unicode, const char *errors,
}
return NULL;
}
+ else if (res == -3) {
+ PyErr_SetString(PyExc_ValueError, "unsupported error handler");
+ }
else {
PyErr_NoMemory();
PyMem_Free(wstr);
@@ -3571,9 +3540,7 @@ static PyObject*
unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
int current_locale)
{
- int surrogateescape;
- if (locale_error_handler(errors, &surrogateescape) < 0)
- return NULL;
+ _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
if (str[len] != '\0' || (size_t)len != strlen(str)) {
PyErr_SetString(PyExc_ValueError, "embedded null byte");
@@ -3584,7 +3551,7 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
size_t wlen;
const char *reason;
int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
- current_locale, surrogateescape);
+ current_locale, error_handler);
if (res != 0) {
if (res == -2) {
PyObject *exc;
@@ -3598,6 +3565,9 @@ unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
Py_DECREF(exc);
}
}
+ else if (res == -3) {
+ PyErr_SetString(PyExc_ValueError, "unsupported error handler");
+ }
else {
PyErr_NoMemory();
}
@@ -4863,7 +4833,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
}
if (error_handler == _Py_ERROR_UNKNOWN)
- error_handler = get_error_handler(errors);
+ error_handler = _Py_GetErrorHandler(errors);
switch (error_handler) {
case _Py_ERROR_IGNORE:
@@ -4932,13 +4902,29 @@ onError:
is not NULL, write the decoding error message into *reason. */
int
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
- const char **reason, int surrogateescape)
+ const char **reason, _Py_error_handler errors)
{
const char *orig_s = s;
const char *e;
wchar_t *unicode;
Py_ssize_t outpos;
+ int surrogateescape = 0;
+ int surrogatepass = 0;
+ switch (errors)
+ {
+ case _Py_ERROR_STRICT:
+ break;
+ case _Py_ERROR_SURROGATEESCAPE:
+ surrogateescape = 1;
+ break;
+ case _Py_ERROR_SURROGATEPASS:
+ surrogatepass = 1;
+ break;
+ default:
+ return -3;
+ }
+
/* Note: size will always be longer than the resulting Unicode
character count */
if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
@@ -4971,31 +4957,47 @@ _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
#endif
}
else {
- if (!ch && s == e)
+ if (!ch && s == e) {
break;
- if (!surrogateescape) {
- PyMem_RawFree(unicode );
- if (reason != NULL) {
- switch (ch) {
- case 0:
- *reason = "unexpected end of data";
- break;
- case 1:
- *reason = "invalid start byte";
- break;
- /* 2, 3, 4 */
- default:
- *reason = "invalid continuation byte";
- break;
- }
+ }
+
+ if (surrogateescape) {
+ unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
+ }
+ else {
+ /* Is it a valid three-byte code? */
+ if (surrogatepass
+ && (e - s) >= 3
+ && (s[0] & 0xf0) == 0xe0
+ && (s[1] & 0xc0) == 0x80
+ && (s[2] & 0xc0) == 0x80)
+ {
+ ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
+ s += 3;
+ unicode[outpos++] = ch;
}
- if (wlen != NULL) {
- *wlen = s - orig_s;
+ else {
+ PyMem_RawFree(unicode );
+ if (reason != NULL) {
+ switch (ch) {
+ case 0:
+ *reason = "unexpected end of data";
+ break;
+ case 1:
+ *reason = "invalid start byte";
+ break;
+ /* 2, 3, 4 */
+ default:
+ *reason = "invalid continuation byte";
+ break;
+ }
+ }
+ if (wlen != NULL) {
+ *wlen = s - orig_s;
+ }
+ return -2;
}
- return -2;
}
- /* surrogateescape */
- unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
}
}
unicode[outpos] = L'\0';
@@ -5030,13 +5032,29 @@ _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
On memory allocation failure, return -1. */
int
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
- const char **reason, int raw_malloc, int surrogateescape)
+ const char **reason, int raw_malloc, _Py_error_handler errors)
{
const Py_ssize_t max_char_size = 4;
Py_ssize_t len = wcslen(text);
assert(len >= 0);
+ int surrogateescape = 0;
+ int surrogatepass = 0;
+ switch (errors)
+ {
+ case _Py_ERROR_STRICT:
+ break;
+ case _Py_ERROR_SURROGATEESCAPE:
+ surrogateescape = 1;
+ break;
+ case _Py_ERROR_SURROGATEPASS:
+ surrogatepass = 1;
+ break;
+ default:
+ return -3;
+ }
+
if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
return -1;
}
@@ -5053,8 +5071,19 @@ _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
char *p = bytes;
Py_ssize_t i;
- for (i = 0; i < len; i++) {
+ for (i = 0; i < len; ) {
+ Py_ssize_t ch_pos = i;
Py_UCS4 ch = text[i];
+ i++;
+#if Py_UNICODE_SIZE == 2
+ if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
+ && i < len
+ && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
+ {
+ ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
+ i++;
+ }
+#endif
if (ch < 0x80) {
/* Encode ASCII */
@@ -5066,11 +5095,11 @@ _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
*p++ = (char)(0xc0 | (ch >> 6));
*p++ = (char)(0x80 | (ch & 0x3f));
}
- else if (Py_UNICODE_IS_SURROGATE(ch)) {
+ else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
/* surrogateescape error handler */
if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
if (error_pos != NULL) {
- *error_pos = (size_t)i;
+ *error_pos = (size_t)ch_pos;
}
if (reason != NULL) {
*reason = "encoding error";
@@ -6741,7 +6770,7 @@ unicode_encode_ucs1(PyObject *unicode,
/* cache callback name lookup (if not done yet, i.e. it's the first error) */
if (error_handler == _Py_ERROR_UNKNOWN)
- error_handler = get_error_handler(errors);
+ error_handler = _Py_GetErrorHandler(errors);
switch (error_handler) {
case _Py_ERROR_STRICT:
@@ -6945,7 +6974,7 @@ PyUnicode_DecodeASCII(const char *s,
/* byte outside range 0x00..0x7f: call the error handler */
if (error_handler == _Py_ERROR_UNKNOWN)
- error_handler = get_error_handler(errors);
+ error_handler = _Py_GetErrorHandler(errors);
switch (error_handler)
{
@@ -8404,7 +8433,7 @@ charmap_encoding_error(
/* cache callback name lookup
* (if not done yet, i.e. it's the first error) */
if (*error_handler == _Py_ERROR_UNKNOWN)
- *error_handler = get_error_handler(errors);
+ *error_handler = _Py_GetErrorHandler(errors);
switch (*error_handler) {
case _Py_ERROR_STRICT:
diff --git a/Programs/_freeze_importlib.c b/Programs/_freeze_importlib.c
index 2621a76..8830d13 100644
--- a/Programs/_freeze_importlib.c
+++ b/Programs/_freeze_importlib.c
@@ -82,14 +82,6 @@ main(int argc, char *argv[])
/* Don't install importlib, since it could execute outdated bytecode. */
config._install_importlib = 0;
config._frozen = 1;
-#ifdef MS_WINDOWS
- /* bpo-34523: initfsencoding() is not called if _install_importlib=0,
- so interp->fscodec_initialized value remains 0.
- PyUnicode_EncodeFSDefault() doesn't support the "surrogatepass" error
- handler in such case, whereas it's the default error handler on Windows.
- Force the "strict" error handler to work around this bootstrap issue. */
- config.filesystem_errors = "strict";
-#endif
_PyInitError err = _Py_InitializeFromConfig(&config);
/* No need to call _PyCoreConfig_Clear() since we didn't allocate any
diff --git a/Python/fileutils.c b/Python/fileutils.c
index 9a3c334..0486f86 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -32,6 +32,24 @@ extern int winerror_to_errno(int);
int _Py_open_cloexec_works = -1;
#endif
+
+static int
+get_surrogateescape(_Py_error_handler errors, int *surrogateescape)
+{
+ switch (errors)
+ {
+ case _Py_ERROR_STRICT:
+ *surrogateescape = 0;
+ return 0;
+ case _Py_ERROR_SURROGATEESCAPE:
+ *surrogateescape = 1;
+ return 0;
+ default:
+ return -1;
+ }
+}
+
+
PyObject *
_Py_device_encoding(int fd)
{
@@ -215,12 +233,17 @@ _Py_GetForceASCII(void)
static int
encode_ascii(const wchar_t *text, char **str,
size_t *error_pos, const char **reason,
- int raw_malloc, int surrogateescape)
+ int raw_malloc, _Py_error_handler errors)
{
char *result = NULL, *out;
size_t len, i;
wchar_t ch;
+ int surrogateescape;
+ if (get_surrogateescape(errors, &surrogateescape) < 0) {
+ return -3;
+ }
+
len = wcslen(text);
/* +1 for NULL byte */
@@ -278,13 +301,18 @@ _Py_GetForceASCII(void)
#if !defined(HAVE_MBRTOWC) || defined(USE_FORCE_ASCII)
static int
decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen,
- const char **reason, int surrogateescape)
+ const char **reason, _Py_error_handler errors)
{
wchar_t *res;
unsigned char *in;
wchar_t *out;
size_t argsize = strlen(arg) + 1;
+ int surrogateescape;
+ if (get_surrogateescape(errors, &surrogateescape) < 0) {
+ return -3;
+ }
+
if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
return -1;
}
@@ -325,7 +353,7 @@ decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen,
static int
decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
- const char **reason, int surrogateescape)
+ const char **reason, _Py_error_handler errors)
{
wchar_t *res;
size_t argsize;
@@ -336,6 +364,11 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
mbstate_t mbs;
#endif
+ int surrogateescape;
+ if (get_surrogateescape(errors, &surrogateescape) < 0) {
+ return -3;
+ }
+
#ifdef HAVE_BROKEN_MBSTOWCS
/* Some platforms have a broken implementation of
* mbstowcs which does not count the characters that
@@ -456,7 +489,7 @@ decode_error:
/* Cannot use C locale for escaping; manually escape as if charset
is ASCII (i.e. escape all bytes >= 128). This will still roundtrip
correctly in the locale's charset, which must be an ASCII superset. */
- return decode_ascii(arg, wstr, wlen, reason, surrogateescape);
+ return decode_ascii(arg, wstr, wlen, reason, errors);
#endif /* HAVE_MBRTOWC */
}
@@ -479,33 +512,35 @@ decode_error:
invalid byte sequence in the input string into *wlen. If reason is not NULL,
write the decoding error message into *reason.
+ Return -3 if the error handler 'errors' is not supported.
+
Use the _Py_EncodeLocaleEx() function to encode the character string back to
a byte string. */
int
_Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen,
const char **reason,
- int current_locale, int surrogateescape)
+ int current_locale, _Py_error_handler errors)
{
if (current_locale) {
#ifdef __ANDROID__
return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
- surrogateescape);
+ errors);
#else
- return decode_current_locale(arg, wstr, wlen, reason, surrogateescape);
+ return decode_current_locale(arg, wstr, wlen, reason, errors);
#endif
}
#if defined(__APPLE__) || defined(__ANDROID__)
return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
- surrogateescape);
+ errors);
#else
int use_utf8 = (Py_UTF8Mode == 1);
#ifdef MS_WINDOWS
use_utf8 |= !Py_LegacyWindowsFSEncodingFlag;
#endif
if (use_utf8) {
- return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen,
- reason, surrogateescape);
+ return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
+ errors);
}
#ifdef USE_FORCE_ASCII
@@ -515,11 +550,11 @@ _Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen,
if (force_ascii) {
/* force ASCII encoding to workaround mbstowcs() issue */
- return decode_ascii(arg, wstr, wlen, reason, surrogateescape);
+ return decode_ascii(arg, wstr, wlen, reason, errors);
}
#endif
- return decode_current_locale(arg, wstr, wlen, reason, surrogateescape);
+ return decode_current_locale(arg, wstr, wlen, reason, errors);
#endif /* __APPLE__ or __ANDROID__ */
}
@@ -547,8 +582,11 @@ wchar_t*
Py_DecodeLocale(const char* arg, size_t *wlen)
{
wchar_t *wstr;
- int res = _Py_DecodeLocaleEx(arg, &wstr, wlen, NULL, 0, 1);
+ int res = _Py_DecodeLocaleEx(arg, &wstr, wlen,
+ NULL, 0,
+ _Py_ERROR_SURROGATEESCAPE);
if (res != 0) {
+ assert(res != -3);
if (wlen != NULL) {
*wlen = (size_t)res;
}
@@ -561,13 +599,18 @@ Py_DecodeLocale(const char* arg, size_t *wlen)
static int
encode_current_locale(const wchar_t *text, char **str,
size_t *error_pos, const char **reason,
- int raw_malloc, int surrogateescape)
+ int raw_malloc, _Py_error_handler errors)
{
const size_t len = wcslen(text);
char *result = NULL, *bytes = NULL;
size_t i, size, converted;
wchar_t c, buf[2];
+ int surrogateescape;
+ if (get_surrogateescape(errors, &surrogateescape) < 0) {
+ return -3;
+ }
+
/* The function works in two steps:
1. compute the length of the output buffer in bytes (size)
2. outputs the bytes */
@@ -646,32 +689,50 @@ encode_error:
return -2;
}
+
+/* Encode a string to the locale encoding.
+
+ Parameters:
+
+ * raw_malloc: if non-zero, allocate memory using PyMem_RawMalloc() instead
+ of PyMem_Malloc().
+ * current_locale: if non-zero, use the current LC_CTYPE, otherwise use
+ Python filesystem encoding.
+ * errors: error handler like "strict" or "surrogateescape".
+
+ Return value:
+
+ 0: success, *str is set to a newly allocated encoded string.
+ -1: memory allocation failure.
+ -2: encoding error, *error_pos and *reason are set (if not NULL).
+ -3: the error handler 'errors' is not supported.
+ */
static int
encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos,
const char **reason,
- int raw_malloc, int current_locale, int surrogateescape)
+ int raw_malloc, int current_locale, _Py_error_handler errors)
{
if (current_locale) {
#ifdef __ANDROID__
return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
- raw_malloc, surrogateescape);
+ raw_malloc, errors);
#else
return encode_current_locale(text, str, error_pos, reason,
- raw_malloc, surrogateescape);
+ raw_malloc, errors);
#endif
}
#if defined(__APPLE__) || defined(__ANDROID__)
return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
- raw_malloc, surrogateescape);
-#else /* __APPLE__ */
+ raw_malloc, errors);
+#else
int use_utf8 = (Py_UTF8Mode == 1);
#ifdef MS_WINDOWS
use_utf8 |= !Py_LegacyWindowsFSEncodingFlag;
#endif
if (use_utf8) {
return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
- raw_malloc, surrogateescape);
+ raw_malloc, errors);
}
#ifdef USE_FORCE_ASCII
@@ -681,12 +742,12 @@ encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos,
if (force_ascii) {
return encode_ascii(text, str, error_pos, reason,
- raw_malloc, surrogateescape);
+ raw_malloc, errors);
}
#endif
return encode_current_locale(text, str, error_pos, reason,
- raw_malloc, surrogateescape);
+ raw_malloc, errors);
#endif /* __APPLE__ or __ANDROID__ */
}
@@ -696,7 +757,8 @@ encode_locale(const wchar_t *text, size_t *error_pos,
{
char *str;
int res = encode_locale_ex(text, &str, error_pos, NULL,
- raw_malloc, current_locale, 1);
+ raw_malloc, current_locale,
+ _Py_ERROR_SURROGATEESCAPE);
if (res != -2 && error_pos) {
*error_pos = (size_t)-1;
}
@@ -737,10 +799,10 @@ _Py_EncodeLocaleRaw(const wchar_t *text, size_t *error_pos)
int
_Py_EncodeLocaleEx(const wchar_t *text, char **str,
size_t *error_pos, const char **reason,
- int current_locale, int surrogateescape)
+ int current_locale, _Py_error_handler errors)
{
return encode_locale_ex(text, str, error_pos, reason, 1,
- current_locale, surrogateescape);
+ current_locale, errors);
}