From 7e303731262b9bd7c468c943c9d8f5359be76e4b Mon Sep 17 00:00:00 2001 From: Benjamin Peterson Date: Mon, 10 Jun 2013 09:19:46 -0700 Subject: remove MAX_MAXCHAR because it's unsafe for computing maximum codepoint value (see #18183) --- Lib/test/test_unicode.py | 3 +++ Misc/NEWS | 3 +++ Objects/unicodeobject.c | 57 ++++++++++++++++++++++-------------------------- 3 files changed, 32 insertions(+), 31 deletions(-) diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index bef64aa..bf0ddca 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -566,6 +566,9 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual('\U0008fffe'.lower(), '\U0008fffe') self.assertEqual('\u2177'.lower(), '\u2177') + # See issue #18183 for this one. + '\U00010000\U00100000'.lower() + def test_casefold(self): self.assertEqual('hello'.casefold(), 'hello') self.assertEqual('hELlo'.casefold(), 'hello') diff --git a/Misc/NEWS b/Misc/NEWS index a204bb3..00f1ac0 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -12,6 +12,9 @@ What's New in Python 3.3.3 release candidate 1? Core and Builtins ----------------- +- Issue #18183: Fix various unicode operations on strings with large unicode + codepoints. + - Issue #18180: Fix ref leak in _PyImport_GetDynLoadWindows(). 
- Issue #18038: SyntaxError raised during compilation sources with illegal diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 8d6cda5..1c48197 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -112,11 +112,6 @@ extern "C" { #define _PyUnicode_DATA_ANY(op) \ (((PyUnicodeObject*)(op))->data.any) -/* Optimized version of Py_MAX() to compute the maximum character: - use it when your are computing the second argument of PyUnicode_New() */ -#define MAX_MAXCHAR(maxchar1, maxchar2) \ - ((maxchar1) | (maxchar2)) - #undef PyUnicode_READY #define PyUnicode_READY(op) \ (assert(_PyUnicode_CHECK(op)), \ @@ -2495,7 +2490,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) case 'c': { Py_UCS4 ordinal = va_arg(count, int); - maxchar = MAX_MAXCHAR(maxchar, ordinal); + maxchar = Py_MAX(maxchar, ordinal); n++; break; } @@ -2591,7 +2586,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) /* since PyUnicode_DecodeUTF8 returns already flexible unicode objects, there is no need to call ready on them */ argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); - maxchar = MAX_MAXCHAR(maxchar, argmaxchar); + maxchar = Py_MAX(maxchar, argmaxchar); n += PyUnicode_GET_LENGTH(str); /* Remember the str and switch to the next slot */ *callresult++ = str; @@ -2604,7 +2599,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) if (PyUnicode_READY(obj) == -1) goto fail; argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); - maxchar = MAX_MAXCHAR(maxchar, argmaxchar); + maxchar = Py_MAX(maxchar, argmaxchar); n += PyUnicode_GET_LENGTH(obj); break; } @@ -2619,7 +2614,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) if (PyUnicode_READY(obj) == -1) goto fail; argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); - maxchar = MAX_MAXCHAR(maxchar, argmaxchar); + maxchar = Py_MAX(maxchar, argmaxchar); n += PyUnicode_GET_LENGTH(obj); *callresult++ = NULL; } @@ -2632,7 +2627,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) goto fail; } argmaxchar = 
PyUnicode_MAX_CHAR_VALUE(str_obj); - maxchar = MAX_MAXCHAR(maxchar, argmaxchar); + maxchar = Py_MAX(maxchar, argmaxchar); n += PyUnicode_GET_LENGTH(str_obj); *callresult++ = str_obj; } @@ -2651,7 +2646,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) goto fail; } argmaxchar = PyUnicode_MAX_CHAR_VALUE(str); - maxchar = MAX_MAXCHAR(maxchar, argmaxchar); + maxchar = Py_MAX(maxchar, argmaxchar); n += PyUnicode_GET_LENGTH(str); /* Remember the str and switch to the next slot */ *callresult++ = str; @@ -2670,7 +2665,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) goto fail; } argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr); - maxchar = MAX_MAXCHAR(maxchar, argmaxchar); + maxchar = Py_MAX(maxchar, argmaxchar); n += PyUnicode_GET_LENGTH(repr); /* Remember the repr and switch to the next slot */ *callresult++ = repr; @@ -2689,7 +2684,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) goto fail; } argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii); - maxchar = MAX_MAXCHAR(maxchar, argmaxchar); + maxchar = Py_MAX(maxchar, argmaxchar); n += PyUnicode_GET_LENGTH(ascii); /* Remember the repr and switch to the next slot */ *callresult++ = ascii; @@ -8628,11 +8623,11 @@ fix_decimal_and_space_to_ascii(PyObject *self) } if (fixed != 0) { modified = 1; - maxchar = MAX_MAXCHAR(maxchar, fixed); + maxchar = Py_MAX(maxchar, fixed); PyUnicode_WRITE(kind, data, i, fixed); } else - maxchar = MAX_MAXCHAR(maxchar, ch); + maxchar = Py_MAX(maxchar, ch); } } @@ -8673,7 +8668,7 @@ PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, int decimal = Py_UNICODE_TODECIMAL(ch); if (decimal >= 0) ch = '0' + decimal; - maxchar = MAX_MAXCHAR(maxchar, ch); + maxchar = Py_MAX(maxchar, ch); } } @@ -8914,7 +8909,7 @@ _PyUnicode_InsertThousandsGrouping( if (unicode == NULL) { *maxchar = 127; if (len != n_digits) { - *maxchar = MAX_MAXCHAR(*maxchar, + *maxchar = Py_MAX(*maxchar, PyUnicode_MAX_CHAR_VALUE(thousands_sep)); } } @@ -9309,14 +9304,14 @@ do_capitalize(int kind, void *data, 
Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *ma c = PyUnicode_READ(kind, data, 0); n_res = _PyUnicode_ToUpperFull(c, mapped); for (j = 0; j < n_res; j++) { - *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); + *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; } for (i = 1; i < length; i++) { c = PyUnicode_READ(kind, data, i); n_res = lower_ucs4(kind, data, length, i, c, mapped); for (j = 0; j < n_res; j++) { - *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); + *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; } } @@ -9341,7 +9336,7 @@ do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxc mapped[0] = c; } for (j = 0; j < n_res; j++) { - *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); + *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; } } @@ -9362,7 +9357,7 @@ do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, else n_res = _PyUnicode_ToUpperFull(c, mapped); for (j = 0; j < n_res; j++) { - *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); + *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; } } @@ -9391,7 +9386,7 @@ do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxc Py_UCS4 mapped[3]; int j, n_res = _PyUnicode_ToFoldedFull(c, mapped); for (j = 0; j < n_res; j++) { - *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); + *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; } } @@ -9416,7 +9411,7 @@ do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar n_res = _PyUnicode_ToTitleFull(c, mapped); for (j = 0; j < n_res; j++) { - *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]); + *maxchar = Py_MAX(*maxchar, mapped[j]); res[k++] = mapped[j]; } @@ -9571,7 +9566,7 @@ PyUnicode_Join(PyObject *separator, PyObject *seq) goto onError; sz += PyUnicode_GET_LENGTH(item); item_maxchar = PyUnicode_MAX_CHAR_VALUE(item); - maxchar = MAX_MAXCHAR(maxchar, item_maxchar); + maxchar = Py_MAX(maxchar, item_maxchar); if (i != 0) sz += seplen; if (sz < old_sz || sz > 
PY_SSIZE_T_MAX) { @@ -9747,7 +9742,7 @@ pad(PyObject *self, return NULL; } maxchar = PyUnicode_MAX_CHAR_VALUE(self); - maxchar = MAX_MAXCHAR(maxchar, fill); + maxchar = Py_MAX(maxchar, fill); u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar); if (!u) return NULL; @@ -10061,7 +10056,7 @@ replace(PyObject *self, PyObject *str1, /* Replacing str1 with str2 may cause a maxchar reduction in the result string. */ mayshrink = (maxchar_str2 < maxchar); - maxchar = MAX_MAXCHAR(maxchar, maxchar_str2); + maxchar = Py_MAX(maxchar, maxchar_str2); if (len1 == len2) { /* same length */ @@ -10647,7 +10642,7 @@ PyUnicode_Concat(PyObject *left, PyObject *right) maxchar = PyUnicode_MAX_CHAR_VALUE(u); maxchar2 = PyUnicode_MAX_CHAR_VALUE(v); - maxchar = MAX_MAXCHAR(maxchar, maxchar2); + maxchar = Py_MAX(maxchar, maxchar2); /* Concat the two Unicode strings */ w = PyUnicode_New(new_len, maxchar); @@ -10734,7 +10729,7 @@ PyUnicode_Append(PyObject **p_left, PyObject *right) else { maxchar = PyUnicode_MAX_CHAR_VALUE(left); maxchar2 = PyUnicode_MAX_CHAR_VALUE(right); - maxchar = MAX_MAXCHAR(maxchar, maxchar2); + maxchar = Py_MAX(maxchar, maxchar2); /* Concat the two Unicode strings */ res = PyUnicode_New(new_len, maxchar); @@ -13846,15 +13841,15 @@ PyUnicode_Format(PyObject *format, PyObject *args) if (!(flags & F_LJUST)) { if (sign) { if ((width-1) > len) - bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill); + bufmaxchar = Py_MAX(bufmaxchar, fill); } else { if (width > len) - bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill); + bufmaxchar = Py_MAX(bufmaxchar, fill); } } maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len); - bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar); + bufmaxchar = Py_MAX(bufmaxchar, maxchar); buflen = width; if (sign && len == width) -- cgit v0.12