bpo-34523: Support surrogatepass in locale codecs (GH-8995)

Add support for the "surrogatepass" error handler in
PyUnicode_DecodeFSDefault() and PyUnicode_EncodeFSDefault()
for the UTF-8 encoding.

Changes:

* _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex() now support the
  surrogatepass error handler (_Py_ERROR_SURROGATEPASS).
* _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx() now use the
  _Py_error_handler enum instead of "int surrogateescape" to pass
  the error handler. These functions now return -3 if the error
  handler is unknown.
* Add unit tests on _Py_DecodeLocaleEx() and _Py_EncodeLocaleEx()
  in test_codecs.
* Rename get_error_handler() to _Py_GetErrorHandler() and expose it
  as a private function.
* _freeze_importlib doesn't need config.filesystem_errors="strict"
  workaround anymore.
author: Victor Stinner <vstinner@redhat.com> 2018-08-29 20:21:32 (GMT)
committer: GitHub <noreply@github.com> 2018-08-29 20:21:32 (GMT)
commit: 3d4226a832cabc630402589cc671cc4035d504e5 (patch)
tree: a1c5b1c51cbbca3aedd52593c979a5c15d72dd52 /Python/fileutils.c
parent: c5989cd87659acbfd4d19dc00dbe99c3a0fc9bd2 (diff)
download: cpython-3d4226a832cabc630402589cc671cc4035d504e5.zip
cpython-3d4226a832cabc630402589cc671cc4035d504e5.tar.gz
cpython-3d4226a832cabc630402589cc671cc4035d504e5.tar.bz2
1 files changed, 87 insertions, 25 deletions
diff --git a/Python/fileutils.c b/Python/fileutils.c
index 9a3c334..0486f86 100644
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -32,6 +32,24 @@ extern int winerror_to_errno(int);
 int _Py_open_cloexec_works = -1;
 #endif
 
+/* Set *surrogateescape from 'errors'; return 0, or -1 if unsupported. */
+static int
+get_surrogateescape(_Py_error_handler errors, int *surrogateescape)
+{
+    switch (errors)
+    {
+    case _Py_ERROR_STRICT:
+        *surrogateescape = 0;
+        return 0;
+    case _Py_ERROR_SURROGATEESCAPE:
+        *surrogateescape = 1;
+        return 0;
+    default:  /* any other handler: *surrogateescape is left untouched */
+        return -1;
+    }
+}
+
+
 PyObject *
 _Py_device_encoding(int fd)
 {
@@ -215,12 +233,17 @@ _Py_GetForceASCII(void)
 static int
 encode_ascii(const wchar_t *text, char **str,
              size_t *error_pos, const char **reason,
-             int raw_malloc, int surrogateescape)
+             int raw_malloc, _Py_error_handler errors)
 {
     char *result = NULL, *out;
     size_t len, i;
     wchar_t ch;
 
+    int surrogateescape;
+    if (get_surrogateescape(errors, &surrogateescape) < 0) {
+        return -3;
+    }
+
     len = wcslen(text);
 
     /* +1 for NULL byte */
@@ -278,13 +301,18 @@ _Py_GetForceASCII(void)
 #if !defined(HAVE_MBRTOWC) || defined(USE_FORCE_ASCII)
 static int
 decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen,
-             const char **reason, int surrogateescape)
+             const char **reason, _Py_error_handler errors)
 {
     wchar_t *res;
     unsigned char *in;
     wchar_t *out;
     size_t argsize = strlen(arg) + 1;
 
+    int surrogateescape;
+    if (get_surrogateescape(errors, &surrogateescape) < 0) {
+        return -3;
+    }
+
     if (argsize > PY_SSIZE_T_MAX / sizeof(wchar_t)) {
         return -1;
     }
@@ -325,7 +353,7 @@ decode_ascii(const char *arg, wchar_t **wstr, size_t *wlen,
 
 static int
 decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
-                      const char **reason, int surrogateescape)
+                      const char **reason, _Py_error_handler errors)
 {
     wchar_t *res;
     size_t argsize;
@@ -336,6 +364,11 @@ decode_current_locale(const char* arg, wchar_t **wstr, size_t *wlen,
     mbstate_t mbs;
 #endif
 
+    int surrogateescape;
+    if (get_surrogateescape(errors, &surrogateescape) < 0) {
+        return -3;
+    }
+
 #ifdef HAVE_BROKEN_MBSTOWCS
     /* Some platforms have a broken implementation of
      * mbstowcs which does not count the characters that
@@ -456,7 +489,7 @@ decode_error:
     /* Cannot use C locale for escaping; manually escape as if charset
        is ASCII (i.e. escape all bytes > 128. This will still roundtrip
        correctly in the locale's charset, which must be an ASCII superset. */
-    return decode_ascii(arg, wstr, wlen, reason, surrogateescape);
+    return decode_ascii(arg, wstr, wlen, reason, errors);
 #endif   /* HAVE_MBRTOWC */
 }
 
@@ -479,33 +512,35 @@ decode_error:
    invalid byte sequence in the input string into *wlen. If reason is not NULL,
    write the decoding error message into *reason.
 
+   Return -3 if the error handler 'errors' is not supported.
+
    Use the Py_EncodeLocaleEx() function to encode the character string back to
    a byte string. */
 int
 _Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen,
                    const char **reason,
-                   int current_locale, int surrogateescape)
+                   int current_locale, _Py_error_handler errors)
 {
     if (current_locale) {
 #ifdef __ANDROID__
         return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
-                                surrogateescape);
+                                errors);
 #else
-        return decode_current_locale(arg, wstr, wlen, reason, surrogateescape);
+        return decode_current_locale(arg, wstr, wlen, reason, errors);
 #endif
     }
 
 #if defined(__APPLE__) || defined(__ANDROID__)
     return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
-                            surrogateescape);
+                            errors);
 #else
     int use_utf8 = (Py_UTF8Mode == 1);
 #ifdef MS_WINDOWS
     use_utf8 |= !Py_LegacyWindowsFSEncodingFlag;
 #endif
     if (use_utf8) {
-        return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen,
-                                reason, surrogateescape);
+        return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
+                                errors);
     }
 
 #ifdef USE_FORCE_ASCII
@@ -515,11 +550,11 @@ _Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen,
 
     if (force_ascii) {
         /* force ASCII encoding to workaround mbstowcs() issue */
-        return decode_ascii(arg, wstr, wlen, reason, surrogateescape);
+        return decode_ascii(arg, wstr, wlen, reason, errors);
     }
 #endif
 
-    return decode_current_locale(arg, wstr, wlen, reason, surrogateescape);
+    return decode_current_locale(arg, wstr, wlen, reason, errors);
 #endif   /* __APPLE__ or __ANDROID__ */
 }
 
@@ -547,8 +582,11 @@ wchar_t*
 Py_DecodeLocale(const char* arg, size_t *wlen)
 {
     wchar_t *wstr;
-    int res = _Py_DecodeLocaleEx(arg, &wstr, wlen, NULL, 0, 1);
+    int res = _Py_DecodeLocaleEx(arg, &wstr, wlen,
+                                 NULL, 0,
+                                 _Py_ERROR_SURROGATEESCAPE);
     if (res != 0) {
+        assert(res != -3);
         if (wlen != NULL) {
             *wlen = (size_t)res;
         }
@@ -561,13 +599,18 @@ Py_DecodeLocale(const char* arg, size_t *wlen)
 static int
 encode_current_locale(const wchar_t *text, char **str,
                       size_t *error_pos, const char **reason,
-                      int raw_malloc, int surrogateescape)
+                      int raw_malloc, _Py_error_handler errors)
 {
     const size_t len = wcslen(text);
     char *result = NULL, *bytes = NULL;
     size_t i, size, converted;
     wchar_t c, buf[2];
 
+    int surrogateescape;
+    if (get_surrogateescape(errors, &surrogateescape) < 0) {
+        return -3;
+    }
+
     /* The function works in two steps:
        1. compute the length of the output buffer in bytes (size)
        2. outputs the bytes */
@@ -646,32 +689,50 @@ encode_error:
     return -2;
 }
 
+
+/* Encode a string to the locale encoding.
+
+   Parameters:
+
+   * raw_malloc: if non-zero, allocate memory using PyMem_RawMalloc() instead
+     of PyMem_Malloc().
+   * current_locale: if non-zero, use the current LC_CTYPE, otherwise use
+     Python filesystem encoding.
+   * errors: error handler like "strict" or "surrogateescape".
+
+   Return value:
+
+    0: success, *str is set to a newly allocated encoded string.
+   -1: memory allocation failure
+   -2: encoding error, set *error_pos and *reason (if set).
+   -3: the error handler 'errors' is not supported.
+ */
 static int
 encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos,
                  const char **reason,
-                 int raw_malloc, int current_locale, int surrogateescape)
+                 int raw_malloc, int current_locale, _Py_error_handler errors)
 {
     if (current_locale) {
 #ifdef __ANDROID__
         return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
-                                raw_malloc, surrogateescape);
+                                raw_malloc, errors);
 #else
         return encode_current_locale(text, str, error_pos, reason,
-                                     raw_malloc, surrogateescape);
+                                     raw_malloc, errors);
 #endif
     }
 
 #if defined(__APPLE__) || defined(__ANDROID__)
     return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
-                            raw_malloc, surrogateescape);
-#else   /* __APPLE__ */
+                            raw_malloc, errors);
+#else
     int use_utf8 = (Py_UTF8Mode == 1);
 #ifdef MS_WINDOWS
     use_utf8 |= !Py_LegacyWindowsFSEncodingFlag;
 #endif
     if (use_utf8) {
         return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
-                                raw_malloc, surrogateescape);
+                                raw_malloc, errors);
     }
 
 #ifdef USE_FORCE_ASCII
@@ -681,12 +742,12 @@ encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos,
 
     if (force_ascii) {
         return encode_ascii(text, str, error_pos, reason,
-                            raw_malloc, surrogateescape);
+                            raw_malloc, errors);
     }
 #endif
 
     return encode_current_locale(text, str, error_pos, reason,
-                                 raw_malloc, surrogateescape);
+                                 raw_malloc, errors);
 #endif   /* __APPLE__ or __ANDROID__ */
 }
 
@@ -696,7 +757,8 @@ encode_locale(const wchar_t *text, size_t *error_pos,
 {
     char *str;
     int res = encode_locale_ex(text, &str, error_pos, NULL,
-                               raw_malloc, current_locale, 1);
+                               raw_malloc, current_locale,
+                               _Py_ERROR_SURROGATEESCAPE);
     if (res != -2 && error_pos) {
         *error_pos = (size_t)-1;
     }
@@ -737,10 +799,10 @@ _Py_EncodeLocaleRaw(const wchar_t *text, size_t *error_pos)
 int
 _Py_EncodeLocaleEx(const wchar_t *text, char **str,
                    size_t *error_pos, const char **reason,
-                   int current_locale, int surrogateescape)
+                   int current_locale, _Py_error_handler errors)
 {
     return encode_locale_ex(text, str, error_pos, reason, 1,
-                            current_locale, surrogateescape);
+                            current_locale, errors);
 }
author	Victor Stinner <vstinner@redhat.com>	2018-08-29 20:21:32 (GMT)
committer	GitHub <noreply@github.com>	2018-08-29 20:21:32 (GMT)
commit	3d4226a832cabc630402589cc671cc4035d504e5 (patch)
tree	a1c5b1c51cbbca3aedd52593c979a5c15d72dd52 /Python/fileutils.c
parent	c5989cd87659acbfd4d19dc00dbe99c3a0fc9bd2 (diff)
download	cpython-3d4226a832cabc630402589cc671cc4035d504e5.zip cpython-3d4226a832cabc630402589cc671cc4035d504e5.tar.gz cpython-3d4226a832cabc630402589cc671cc4035d504e5.tar.bz2