From 37296e89a5119eb3af8344796ce653b2d89e403a Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 10 Jun 2010 13:36:23 +0000 Subject: Fix r81869: ISO-8859-15 was seen as an alias to ISO-8859-1 Don't use normalize_encoding() result if it is truncated. --- Objects/unicodeobject.c | 84 ++++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 39 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 86fd153..aa0b4c6 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1294,11 +1294,12 @@ PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, } /* Convert encoding to lower case and replace '_' with '-' in order to - catch e.g. UTF_8. Truncate the string if it is longer than lower_len-1 - characters. */ -static void normalize_encoding(const char *encoding, - char *lower, - size_t lower_len) + catch e.g. UTF_8. Return 0 on error (encoding is longer than lower_len-1), + 1 on success. */ +static int +normalize_encoding(const char *encoding, + char *lower, + size_t lower_len) { const char *e; char *l; @@ -1307,7 +1308,9 @@ static void normalize_encoding(const char *encoding, e = encoding; l = lower; l_end = &lower[lower_len - 1]; - while (*e && l < l_end) { + while (*e) { + if (l == l_end) + return 0; if (ISUPPER(*e)) { *l++ = TOLOWER(*e++); } @@ -1320,6 +1323,7 @@ static void normalize_encoding(const char *encoding, } } *l = '\0'; + return 1; } PyObject *PyUnicode_Decode(const char *s, @@ -1335,22 +1339,23 @@ PyObject *PyUnicode_Decode(const char *s, encoding = PyUnicode_GetDefaultEncoding(); /* Shortcuts for common default encodings */ - normalize_encoding(encoding, lower, sizeof(lower)); - if (strcmp(lower, "utf-8") == 0) - return PyUnicode_DecodeUTF8(s, size, errors); - else if ((strcmp(lower, "latin-1") == 0) || - (strcmp(lower, "iso-8859-1") == 0)) - return PyUnicode_DecodeLatin1(s, size, errors); + if (normalize_encoding(encoding, lower, sizeof(lower))) { + if (strcmp(lower, "utf-8") == 0) + return PyUnicode_DecodeUTF8(s, size, errors); + else if ((strcmp(lower, "latin-1") == 0) || + (strcmp(lower, "iso-8859-1") == 0)) + return PyUnicode_DecodeLatin1(s, size, errors); #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) - else if (strcmp(lower, "mbcs") == 0) - return PyUnicode_DecodeMBCS(s, size, errors); + else if (strcmp(lower, "mbcs") == 0) + return PyUnicode_DecodeMBCS(s, size, errors); #endif - else if (strcmp(lower, "ascii") == 0) - return PyUnicode_DecodeASCII(s, size, errors); - else if (strcmp(lower, "utf-16") == 0) - return PyUnicode_DecodeUTF16(s, size, errors, 0); - else if (strcmp(lower, "utf-32") == 0) - return PyUnicode_DecodeUTF32(s, size, errors, 0); + else if (strcmp(lower, "ascii") == 0) + return PyUnicode_DecodeASCII(s, size, errors); + else if (strcmp(lower, "utf-16") == 0) + return PyUnicode_DecodeUTF16(s, size, errors, 0); + else if (strcmp(lower, "utf-32") == 0) + return PyUnicode_DecodeUTF32(s, size, errors, 0); + } /* Decode via the codec registry */ buffer = NULL; @@ -1499,26 +1504,27 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode, encoding = PyUnicode_GetDefaultEncoding(); /* Shortcuts for common default encodings */ - normalize_encoding(encoding, lower, sizeof(lower)); - if (strcmp(lower, "utf-8") == 0) - return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - errors); - else if ((strcmp(lower, "latin-1") == 0) || - (strcmp(lower, "iso-8859-1") == 0)) - return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - errors); + if (normalize_encoding(encoding, lower, sizeof(lower))) { + if (strcmp(lower, "utf-8") == 0) + return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), + errors); + else if ((strcmp(lower, "latin-1") == 0) || + (strcmp(lower, "iso-8859-1") == 0)) + return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), + errors); #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T) - else if (strcmp(lower, "mbcs") == 0) - return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - errors); + else if (strcmp(lower, "mbcs") == 0) + return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), + errors); #endif - else if (strcmp(lower, "ascii") == 0) - return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), - PyUnicode_GET_SIZE(unicode), - errors); + else if (strcmp(lower, "ascii") == 0) + return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), + errors); + } /* During bootstrap, we may need to find the encodings package, to load the file system encoding, and require the file system encoding in order to load the encodings @@ -1528,7 +1534,7 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode, the encodings module is ASCII-only. XXX could try wcstombs instead, if the file system encoding is the locale's encoding. */ - else if (Py_FileSystemDefaultEncoding && + if (Py_FileSystemDefaultEncoding && strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 && !PyThreadState_GET()->interp->codecs_initialized) return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode), -- cgit v0.12