diff options
author | Victor Stinner <victor.stinner@gmail.com> | 2017-12-13 11:29:09 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-12-13 11:29:09 (GMT) |
commit | 91106cd9ff2f321c0f60fbaa09fd46c80aa5c266 (patch) | |
tree | ff002e0532736a97f3ddd367c1491e7b04611816 /Objects | |
parent | c3e070f84931c847d1b35e7fb36aa71edd6215f6 (diff) | |
download | cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.zip cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.gz cpython-91106cd9ff2f321c0f60fbaa09fd46c80aa5c266.tar.bz2 |
bpo-29240: PEP 540: Add a new UTF-8 Mode (#855)
* Add -X utf8 command line option, PYTHONUTF8 environment variable
and a new sys.flags.utf8_mode flag.
* If the LC_CTYPE locale is "C" at startup: enable automatically the
UTF-8 mode.
* Add _winapi.GetACP(). encodings._alias_mbcs() now calls
_winapi.GetACP() to get the ANSI code page
* locale.getpreferredencoding() now returns 'UTF-8' in the UTF-8
mode. As a side effect, open() now uses the UTF-8 encoding by
default in this mode.
* Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding
in the UTF-8 Mode.
* Update subprocess._args_from_interpreter_flags() to handle -X utf8
* Skip some tests relying on the current locale if the UTF-8 mode is
enabled.
* Add test_utf8mode.py.
* _Py_DecodeUTF8_surrogateescape() gets a new optional parameter to
return also the length (number of wide characters).
* pymain_get_global_config() and pymain_set_global_config() now
always copy flag values, rather than only copying if the new value
is greater than the old value.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 30 |
1 files changed, 21 insertions, 9 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 8d4fea8..c7480a0 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -5079,16 +5079,17 @@ onError: return NULL; } -#if defined(__APPLE__) || defined(__ANDROID__) -/* Simplified UTF-8 decoder using surrogateescape error handler, - used to decode the command line arguments on Mac OS X and Android. +/* UTF-8 decoder using the surrogateescape error handler . - Return a pointer to a newly allocated wide character string (use - PyMem_RawFree() to free the memory), or NULL on memory allocation error. */ + On success, return a pointer to a newly allocated wide character string (use + PyMem_RawFree() to free the memory) and write the output length (in number + of wchar_t units) into *p_wlen (if p_wlen is set). + On memory allocation failure, return -1 and write (size_t)-1 into *p_wlen + (if p_wlen is set). */ wchar_t* -_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) +_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen) { const char *e; wchar_t *unicode; @@ -5096,11 +5097,20 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) /* Note: size will always be longer than the resulting Unicode character count */ - if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) + if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) { + if (p_wlen) { + *p_wlen = (size_t)-1; + } return NULL; + } + unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t)); - if (!unicode) + if (!unicode) { + if (p_wlen) { + *p_wlen = (size_t)-1; + } return NULL; + } /* Unpack UTF-8 encoded data */ e = s + size; @@ -5130,10 +5140,12 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) } } unicode[outpos] = L'\0'; + if (p_wlen) { + *p_wlen = outpos; + } return unicode; } -#endif /* __APPLE__ or __ANDROID__ */ /* Primary internal function which creates utf8 encoded bytes objects. |