diff options
author | Benjamin Peterson <benjamin@python.org> | 2012-01-11 23:17:06 (GMT) |
---|---|---|
committer | Benjamin Peterson <benjamin@python.org> | 2012-01-11 23:17:06 (GMT) |
commit | b2bf01d824ea5a13b375d0aa79211c01f8ab726a (patch) | |
tree | c2e840d182aff5a4ae272ca9a80b6a1cf3c1db3d /Objects/unicodeobject.c | |
parent | 9007f72db095212a169b3234194fcc08bd14bf6e (diff) | |
download | cpython-b2bf01d824ea5a13b375d0aa79211c01f8ab726a.zip cpython-b2bf01d824ea5a13b375d0aa79211c01f8ab726a.tar.gz cpython-b2bf01d824ea5a13b375d0aa79211c01f8ab726a.tar.bz2 |
use full unicode mappings for upper/lower/title case (#12736)
Also broaden the category of characters that count as lowercase/uppercase.
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- | Objects/unicodeobject.c | 345 |
1 files changed, 196 insertions, 149 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5ca2c53..cff97f7 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -41,6 +41,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #define PY_SSIZE_T_CLEAN #include "Python.h" #include "ucnhash.h" +#include "bytes_methods.h" #ifdef MS_WINDOWS #include <windows.h> @@ -9428,188 +9429,222 @@ fixup(PyObject *self, return v; } -static Py_UCS4 -fixupper(PyObject *self) +static PyObject * +ascii_upper_or_lower(PyObject *self, int lower) { - /* No need to call PyUnicode_READY(self) because this function is only - called as a callback from fixup() which does it already. */ - const Py_ssize_t len = PyUnicode_GET_LENGTH(self); - const int kind = PyUnicode_KIND(self); - void *data = PyUnicode_DATA(self); - int touched = 0; - Py_UCS4 maxchar = 0; - Py_ssize_t i; - - for (i = 0; i < len; ++i) { - const Py_UCS4 ch = PyUnicode_READ(kind, data, i); - const Py_UCS4 up = Py_UNICODE_TOUPPER(ch); - if (up != ch) { - if (up > maxchar) - maxchar = up; - PyUnicode_WRITE(kind, data, i, up); - touched = 1; - } - else if (ch > maxchar) - maxchar = ch; - } + Py_ssize_t len = PyUnicode_GET_LENGTH(self); + char *resdata, *data = PyUnicode_DATA(self); + PyObject *res; - if (touched) - return maxchar; + res = PyUnicode_New(len, 127); + if (res == NULL) + return NULL; + resdata = PyUnicode_DATA(res); + if (lower) + _Py_bytes_lower(resdata, data, len); else - return 0; + _Py_bytes_upper(resdata, data, len); + return res; } static Py_UCS4 -fixlower(PyObject *self) +handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i) { - /* No need to call PyUnicode_READY(self) because fixup() which does it. */ - const Py_ssize_t len = PyUnicode_GET_LENGTH(self); - const int kind = PyUnicode_KIND(self); - void *data = PyUnicode_DATA(self); - int touched = 0; - Py_UCS4 maxchar = 0; - Py_ssize_t i; + Py_ssize_t j; + int final_sigma; + Py_UCS4 c; + /* U+03A3 is in the Final_Sigma context when, it is found like this: - for(i = 0; i < len; ++i) { - const Py_UCS4 ch = PyUnicode_READ(kind, data, i); - const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); - if (lo != ch) { - if (lo > maxchar) - maxchar = lo; - PyUnicode_WRITE(kind, data, i, lo); - touched = 1; + \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased}) + + where ! is a negation and \p{xxx} is a character with property xxx. + */ + for (j = i - 1; j >= 0; j--) { + c = PyUnicode_READ(kind, data, j); + if (!_PyUnicode_IsCaseIgnorable(c)) + break; + } + final_sigma = j >= 0 && _PyUnicode_IsCased(c); + if (final_sigma) { + for (j = i + 1; j < length; j++) { + c = PyUnicode_READ(kind, data, j); + if (!_PyUnicode_IsCaseIgnorable(c)) + break; } - else if (ch > maxchar) - maxchar = ch; + final_sigma = j == length || !_PyUnicode_IsCased(c); } + return (final_sigma) ? 0x3C2 : 0x3C3; +} - if (touched) - return maxchar; - else - return 0; +static int +lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i, + Py_UCS4 c, Py_UCS4 *mapped) +{ + /* Obscure special case. */ + if (c == 0x3A3) { + mapped[0] = handle_capital_sigma(kind, data, length, i); + return 1; + } + return _PyUnicode_ToLowerFull(c, mapped); } -static Py_UCS4 -fixswapcase(PyObject *self) +static Py_ssize_t +do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { - /* No need to call PyUnicode_READY(self) because fixup() which does it. */ - const Py_ssize_t len = PyUnicode_GET_LENGTH(self); - const int kind = PyUnicode_KIND(self); - void *data = PyUnicode_DATA(self); - int touched = 0; - Py_UCS4 maxchar = 0; - Py_ssize_t i; + Py_ssize_t i, k = 0; + int n_res, j; + Py_UCS4 c, mapped[3]; - for(i = 0; i < len; ++i) { - const Py_UCS4 ch = PyUnicode_READ(kind, data, i); - Py_UCS4 nu = 0; + c = PyUnicode_READ(kind, data, 0); + n_res = _PyUnicode_ToUpperFull(c, mapped); + for (j = 0; j < n_res; j++) { + if (mapped[j] > *maxchar) + *maxchar = mapped[j]; + res[k++] = mapped[j]; + } + for (i = 1; i < length; i++) { + c = PyUnicode_READ(kind, data, i); + n_res = lower_ucs4(kind, data, length, i, c, mapped); + for (j = 0; j < n_res; j++) { + if (mapped[j] > *maxchar) + *maxchar = mapped[j]; + res[k++] = mapped[j]; + } + } + return k; +} - if (Py_UNICODE_ISUPPER(ch)) - nu = Py_UNICODE_TOLOWER(ch); - else if (Py_UNICODE_ISLOWER(ch)) - nu = Py_UNICODE_TOUPPER(ch); +static Py_ssize_t +do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { + Py_ssize_t i, k = 0; - if (nu != 0) { - if (nu > maxchar) - maxchar = nu; - PyUnicode_WRITE(kind, data, i, nu); - touched = 1; + for (i = 0; i < length; i++) { + Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; + int n_res, j; + if (Py_UNICODE_ISUPPER(c)) { + n_res = lower_ucs4(kind, data, length, i, c, mapped); + } + else if (Py_UNICODE_ISLOWER(c)) { + n_res = _PyUnicode_ToUpperFull(c, mapped); + } + else { + n_res = 1; + mapped[0] = c; + } + for (j = 0; j < n_res; j++) { + if (mapped[j] > *maxchar) + *maxchar = mapped[j]; + res[k++] = mapped[j]; } - else if (ch > maxchar) - maxchar = ch; } - - if (touched) - return maxchar; - else - return 0; + return k; } -static Py_UCS4 -fixcapitalize(PyObject *self) +static Py_ssize_t +do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, + Py_UCS4 *maxchar, int lower) { - /* No need to call PyUnicode_READY(self) because fixup() which does it. */ - const Py_ssize_t len = PyUnicode_GET_LENGTH(self); - const int kind = PyUnicode_KIND(self); - void *data = PyUnicode_DATA(self); - int touched = 0; - Py_UCS4 maxchar = 0; - Py_ssize_t i = 0; - Py_UCS4 ch; + Py_ssize_t i, k = 0; - if (len == 0) - return 0; - - ch = PyUnicode_READ(kind, data, i); - if (!Py_UNICODE_ISUPPER(ch)) { - maxchar = Py_UNICODE_TOUPPER(ch); - PyUnicode_WRITE(kind, data, i, maxchar); - touched = 1; - } - ++i; - for(; i < len; ++i) { - ch = PyUnicode_READ(kind, data, i); - if (!Py_UNICODE_ISLOWER(ch)) { - const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch); - if (lo > maxchar) - maxchar = lo; - PyUnicode_WRITE(kind, data, i, lo); - touched = 1; + for (i = 0; i < length; i++) { + Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3]; + int n_res, j; + if (lower) + n_res = lower_ucs4(kind, data, length, i, c, mapped); + else + n_res = _PyUnicode_ToUpperFull(c, mapped); + for (j = 0; j < n_res; j++) { + if (mapped[j] > *maxchar) + *maxchar = mapped[j]; + res[k++] = mapped[j]; } - else if (ch > maxchar) - maxchar = ch; } + return k; +} - if (touched) - return maxchar; - else - return 0; +static Py_ssize_t +do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) +{ + return do_upper_or_lower(kind, data, length, res, maxchar, 0); } -static Py_UCS4 -fixtitle(PyObject *self) +static Py_ssize_t +do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) { - /* No need to call PyUnicode_READY(self) because fixup() which does it. */ - const Py_ssize_t len = PyUnicode_GET_LENGTH(self); - const int kind = PyUnicode_KIND(self); - void *data = PyUnicode_DATA(self); - Py_UCS4 maxchar = 0; - Py_ssize_t i = 0; - int previous_is_cased; + return do_upper_or_lower(kind, data, length, res, maxchar, 1); +} - /* Shortcut for single character strings */ - if (len == 1) { - const Py_UCS4 ch = PyUnicode_READ(kind, data, i); - const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch); - if (ti != ch) { - PyUnicode_WRITE(kind, data, i, ti); - return ti; - } - else - return 0; +static PyObject * +case_operation(PyObject *self, + Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *)) +{ + PyObject *res = NULL; + Py_ssize_t length, newlength = 0; + int kind, outkind; + void *data, *outdata; + Py_UCS4 maxchar = 0, *tmp, *tmpend; + + if (PyUnicode_READY(self) == -1) + return NULL; + + kind = PyUnicode_KIND(self); + data = PyUnicode_DATA(self); + length = PyUnicode_GET_LENGTH(self); + tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length); + if (tmp == NULL) + return PyErr_NoMemory(); + newlength = perform(kind, data, length, tmp, &maxchar); + res = PyUnicode_New(newlength, maxchar); + if (res == NULL) + goto leave; + tmpend = tmp + newlength; + outdata = PyUnicode_DATA(res); + outkind = PyUnicode_KIND(res); + switch (outkind) { + case PyUnicode_1BYTE_KIND: + _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata); + break; + case PyUnicode_2BYTE_KIND: + _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata); + break; + case PyUnicode_4BYTE_KIND: + memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength); + break; + default: + assert(0); + break; } + leave: + PyMem_FREE(tmp); + return res; +} + +static Py_ssize_t +do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) +{ + Py_ssize_t i, k = 0; + int previous_is_cased; + previous_is_cased = 0; - for(; i < len; ++i) { - const Py_UCS4 ch = PyUnicode_READ(kind, data, i); - Py_UCS4 nu; + for (i = 0; i < length; i++) { + const Py_UCS4 c = PyUnicode_READ(kind, data, i); + Py_UCS4 mapped[3]; + int n_res, j; if (previous_is_cased) - nu = Py_UNICODE_TOLOWER(ch); + n_res = lower_ucs4(kind, data, length, i, c, mapped); else - nu = Py_UNICODE_TOTITLE(ch); + n_res = _PyUnicode_ToTitleFull(c, mapped); - if (nu > maxchar) - maxchar = nu; - PyUnicode_WRITE(kind, data, i, nu); + for (j = 0; j < n_res; j++) { + if (mapped[j] > *maxchar) + *maxchar = mapped[j]; + res[k++] = mapped[j]; + } - if (Py_UNICODE_ISLOWER(ch) || - Py_UNICODE_ISUPPER(ch) || - Py_UNICODE_ISTITLE(ch)) - previous_is_cased = 1; - else - previous_is_cased = 0; + previous_is_cased = _PyUnicode_IsCased(c); } - return maxchar; + return k; } PyObject * @@ -10445,7 +10480,7 @@ characters, all remaining cased characters have lower case."); static PyObject* unicode_title(PyObject *self) { - return fixup(self, fixtitle); + return case_operation(self, do_title); } PyDoc_STRVAR(capitalize__doc__, @@ -10457,7 +10492,11 @@ have upper case and the rest lower case."); static PyObject* unicode_capitalize(PyObject *self) { - return fixup(self, fixcapitalize); + if (PyUnicode_READY(self) == -1) + return NULL; + if (PyUnicode_GET_LENGTH(self) == 0) + return unicode_result_unchanged(self); + return case_operation(self, do_capitalize); } #if 0 @@ -11715,7 +11754,11 @@ Return a copy of the string S converted to lowercase."); static PyObject* unicode_lower(PyObject *self) { - return fixup(self, fixlower); + if (PyUnicode_READY(self) == -1) + return NULL; + if (PyUnicode_IS_ASCII(self)) + return ascii_upper_or_lower(self, 1); + return case_operation(self, do_lower); } #define LEFTSTRIP 0 @@ -12604,7 +12647,7 @@ and vice versa."); static PyObject* unicode_swapcase(PyObject *self) { - return fixup(self, fixswapcase); + return case_operation(self, do_swapcase); } PyDoc_STRVAR(maketrans__doc__, @@ -12750,7 +12793,11 @@ Return a copy of S converted to uppercase."); static PyObject* unicode_upper(PyObject *self) { - return fixup(self, fixupper); + if (PyUnicode_READY(self) == -1) + return NULL; + if (PyUnicode_IS_ASCII(self)) + return ascii_upper_or_lower(self, 0); + return case_operation(self, do_upper); } PyDoc_STRVAR(zfill__doc__, |