summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Benjamin Peterson <benjamin@python.org> 2012-01-14 18:23:30 (GMT)
committer Benjamin Peterson <benjamin@python.org> 2012-01-14 18:23:30 (GMT)
commit d5890c8db5ed67d41719543a34b33f6a0e0a6f7f (patch)
tree 9bf92ee63587d31ed44e8a37d90582913d37d95c
parent 94d5a7174aaa107617b208ebc511a8f360196b1a (diff)
download cpython-d5890c8db5ed67d41719543a34b33f6a0e0a6f7f.zip
cpython-d5890c8db5ed67d41719543a34b33f6a0e0a6f7f.tar.gz
cpython-d5890c8db5ed67d41719543a34b33f6a0e0a6f7f.tar.bz2
add str.casefold() (closes #13752)
-rw-r--r-- Doc/library/stdtypes.rst 8
-rw-r--r-- Include/unicodeobject.h 5
-rw-r--r-- Lib/test/test_unicode.py 8
-rw-r--r-- Misc/NEWS 2
-rw-r--r-- Objects/unicodectype.c 25
-rw-r--r-- Objects/unicodeobject.c 35
-rw-r--r-- Objects/unicodetype_db.h 497
-rw-r--r-- Tools/unicode/makeunicodedata.py 50
8 files changed, 493 insertions, 137 deletions
diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst
index 04fd57f..8fba7b8 100644
--- a/Doc/library/stdtypes.rst
+++ b/Doc/library/stdtypes.rst
@@ -1002,6 +1002,14 @@ functions based on regular expressions.
rest lowercased.
+.. method:: str.casefold()
+
+ Return a casefolded copy of the string. Casefolded strings may be used for
+ caseless matching. For example, ``"MASSE".casefold() == "maße".casefold()``.
+
+ .. versionadded:: 3.3
+
+
.. method:: str.center(width[, fillchar])
Return centered in a string of length *width*. Padding is done using the
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index e42631c..29d927f 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -2023,6 +2023,11 @@ PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
Py_UCS4 *res
);
+PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
+ Py_UCS4 ch, /* Unicode character */
+ Py_UCS4 *res
+ );
+
PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
Py_UCS4 ch /* Unicode character */
);
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index cc933b6..33d7b35 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -565,6 +565,14 @@ class UnicodeTest(string_tests.CommonTest,
self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
self.assertEqual('\u2177'.lower(), '\u2177')
+ def test_casefold(self):
+ self.assertEqual('hello'.casefold(), 'hello')
+ self.assertEqual('hELlo'.casefold(), 'hello')
+ self.assertEqual('ß'.casefold(), 'ss')
+ self.assertEqual('\ufb01'.casefold(), 'fi')
+ self.assertEqual('\u03a3'.casefold(), '\u03c3')
+ self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
+
def test_upper(self):
string_tests.CommonTest.test_upper(self)
self.assertEqual('\U0001044F'.upper(), '\U00010427')
diff --git a/Misc/NEWS b/Misc/NEWS
index adb6e45..1afe584 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
Core and Builtins
-----------------
+- Issue #13752: Add a casefold() method to str.
+
- Issue #13761: Add a "flush" keyword argument to the print() function,
used to ensure flushing the output stream.
diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c
index 05b63cc..0ebdedb 100644
--- a/Objects/unicodectype.c
+++ b/Objects/unicodectype.c
@@ -185,7 +185,7 @@ Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
if (ctype->flags & EXTENDED_CASE_MASK)
- return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFFFF];
+ return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF];
return ctype->upper ? ctype->upper : ch;
}
@@ -197,7 +197,7 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
if (ctype->flags & EXTENDED_CASE_MASK)
- return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFFFF];
+ return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF];
return ctype->lower ? ctype->lower : ch;
}
@@ -206,7 +206,7 @@ int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
if (ctype->flags & EXTENDED_CASE_MASK) {
- int index = ctype->lower & 0xFFFFFF;
+ int index = ctype->lower & 0xFFFF;
int n = ctype->lower >> 24;
int i;
for (i = 0; i < n; i++)
@@ -222,7 +222,7 @@ int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
if (ctype->flags & EXTENDED_CASE_MASK) {
- int index = ctype->title & 0xFFFFFF;
+ int index = ctype->title & 0xFFFF;
int n = ctype->title >> 24;
int i;
for (i = 0; i < n; i++)
@@ -238,7 +238,7 @@ int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
if (ctype->flags & EXTENDED_CASE_MASK) {
- int index = ctype->upper & 0xFFFFFF;
+ int index = ctype->upper & 0xFFFF;
int n = ctype->upper >> 24;
int i;
for (i = 0; i < n; i++)
@@ -249,6 +249,21 @@ int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
return 1;
}
+int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
+{
+ const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+
+ if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) {
+ int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24);
+ int n = (ctype->lower >> 20) & 7;
+ int i;
+ for (i = 0; i < n; i++)
+ res[i] = _PyUnicode_ExtendedCase[index + i];
+ return n;
+ }
+ return _PyUnicode_ToLowerFull(ch, res);
+}
+
int _PyUnicode_IsCased(Py_UCS4 ch)
{
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 471d98b..6d9df18 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -9577,6 +9577,24 @@ do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar
}
static Py_ssize_t
+do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
+{
+ Py_ssize_t i, k = 0;
+
+ for (i = 0; i < length; i++) {
+ Py_UCS4 c = PyUnicode_READ(kind, data, i);
+ Py_UCS4 mapped[3];
+ int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
+ for (j = 0; j < n_res; j++) {
+ if (mapped[j] > *maxchar)
+ *maxchar = mapped[j];
+ res[k++] = mapped[j];
+ }
+ }
+ return k;
+}
+
+static Py_ssize_t
do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
{
Py_ssize_t i, k = 0;
@@ -10501,6 +10519,22 @@ unicode_capitalize(PyObject *self)
return case_operation(self, do_capitalize);
}
+PyDoc_STRVAR(casefold__doc__,
+ "S.casefold() -> str\n\
+\n\
+Return a version of S suitable for caseless comparisons.");
+
+static PyObject *
+unicode_casefold(PyObject *self)
+{
+ if (PyUnicode_READY(self) == -1)
+ return NULL;
+ if (PyUnicode_IS_ASCII(self))
+ return ascii_upper_or_lower(self, 1);
+ return case_operation(self, do_casefold);
+}
+
+
/* Argument converter. Coerces to a single unicode character */
static int
@@ -12998,6 +13032,7 @@ static PyMethodDef unicode_methods[] = {
{"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
{"join", (PyCFunction) unicode_join, METH_O, join__doc__},
{"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
+ {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
{"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
{"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
{"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
diff --git a/Objects/unicodetype_db.h b/Objects/unicodetype_db.h
index 4786e25..fbfc98a 100644
--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
@@ -76,7 +76,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{0, 0, 0, 0, 0, 4096},
{0, 0, 0, 0, 2, 3076},
{0, 0, 0, 0, 3, 3076},
- {924, 181, 924, 0, 0, 9993},
+ {16777218, 17825792, 16777218, 0, 0, 26377},
{0, 0, 0, 0, 0, 5632},
{0, 0, 0, 0, 1, 3076},
{0, 0, 0, 0, 0, 3072},
@@ -110,7 +110,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{220, 252, 220, 0, 0, 10113},
{221, 253, 221, 0, 0, 10113},
{222, 254, 222, 0, 0, 10113},
- {33554433, 16777216, 33554435, 0, 0, 26377},
+ {33554438, 18874371, 33554440, 0, 0, 26377},
{192, 224, 192, 0, 0, 9993},
{193, 225, 193, 0, 0, 9993},
{194, 226, 194, 0, 0, 9993},
@@ -190,7 +190,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{300, 301, 300, 0, 0, 9993},
{302, 303, 302, 0, 0, 10113},
{302, 303, 302, 0, 0, 9993},
- {16777223, 33554437, 16777223, 0, 0, 26497},
+ {16777228, 33554442, 16777228, 0, 0, 26497},
{73, 305, 73, 0, 0, 9993},
{306, 307, 306, 0, 0, 10113},
{306, 307, 306, 0, 0, 9993},
@@ -214,7 +214,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{325, 326, 325, 0, 0, 9993},
{327, 328, 327, 0, 0, 10113},
{327, 328, 327, 0, 0, 9993},
- {33554441, 16777224, 33554441, 0, 0, 26377},
+ {33554448, 18874381, 33554448, 0, 0, 26377},
{330, 331, 330, 0, 0, 10113},
{330, 331, 330, 0, 0, 9993},
{332, 333, 332, 0, 0, 10113},
@@ -268,7 +268,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{379, 380, 379, 0, 0, 9993},
{381, 382, 381, 0, 0, 10113},
{381, 382, 381, 0, 0, 9993},
- {83, 383, 83, 0, 0, 9993},
+ {16777236, 17825810, 16777236, 0, 0, 26377},
{579, 384, 579, 0, 0, 9993},
{385, 595, 385, 0, 0, 10113},
{386, 387, 386, 0, 0, 10113},
@@ -371,7 +371,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{492, 493, 492, 0, 0, 9993},
{494, 495, 494, 0, 0, 10113},
{494, 495, 494, 0, 0, 9993},
- {33554444, 16777227, 33554444, 0, 0, 26377},
+ {33554456, 18874389, 33554456, 0, 0, 26377},
{497, 499, 498, 0, 0, 10113},
{497, 499, 498, 0, 0, 10049},
{497, 499, 498, 0, 0, 9993},
@@ -490,7 +490,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{439, 658, 439, 0, 0, 9993},
{0, 0, 0, 0, 0, 14089},
{0, 0, 0, 0, 0, 5889},
- {921, 837, 921, 0, 0, 13832},
+ {16777244, 17825818, 16777244, 0, 0, 30216},
{880, 881, 880, 0, 0, 10113},
{880, 881, 880, 0, 0, 9993},
{882, 883, 882, 0, 0, 10113},
@@ -508,7 +508,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{908, 972, 908, 0, 0, 10113},
{910, 973, 910, 0, 0, 10113},
{911, 974, 911, 0, 0, 10113},
- {50331663, 16777230, 50331663, 0, 0, 26377},
+ {50331681, 19922973, 50331681, 0, 0, 26377},
{913, 945, 913, 0, 0, 10113},
{914, 946, 914, 0, 0, 10113},
{915, 947, 915, 0, 0, 10113},
@@ -539,7 +539,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{904, 941, 904, 0, 0, 9993},
{905, 942, 905, 0, 0, 9993},
{906, 943, 906, 0, 0, 9993},
- {50331667, 16777234, 50331667, 0, 0, 26377},
+ {50331688, 19922980, 50331688, 0, 0, 26377},
{913, 945, 913, 0, 0, 9993},
{914, 946, 914, 0, 0, 9993},
{915, 947, 915, 0, 0, 9993},
@@ -557,7 +557,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{927, 959, 927, 0, 0, 9993},
{928, 960, 928, 0, 0, 9993},
{929, 961, 929, 0, 0, 9993},
- {931, 962, 931, 0, 0, 9993},
+ {16777261, 17825835, 16777261, 0, 0, 26377},
{931, 963, 931, 0, 0, 9993},
{932, 964, 932, 0, 0, 9993},
{933, 965, 933, 0, 0, 9993},
@@ -571,11 +571,11 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{910, 973, 910, 0, 0, 9993},
{911, 974, 911, 0, 0, 9993},
{975, 983, 975, 0, 0, 10113},
- {914, 976, 914, 0, 0, 9993},
- {920, 977, 920, 0, 0, 9993},
+ {16777264, 17825838, 16777264, 0, 0, 26377},
+ {16777267, 17825841, 16777267, 0, 0, 26377},
{0, 0, 0, 0, 0, 10113},
- {934, 981, 934, 0, 0, 9993},
- {928, 982, 928, 0, 0, 9993},
+ {16777270, 17825844, 16777270, 0, 0, 26377},
+ {16777273, 17825847, 16777273, 0, 0, 26377},
{975, 983, 975, 0, 0, 9993},
{984, 985, 984, 0, 0, 10113},
{984, 985, 984, 0, 0, 9993},
@@ -601,11 +601,11 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{1004, 1005, 1004, 0, 0, 9993},
{1006, 1007, 1006, 0, 0, 10113},
{1006, 1007, 1006, 0, 0, 9993},
- {922, 1008, 922, 0, 0, 9993},
- {929, 1009, 929, 0, 0, 9993},
+ {16777276, 17825850, 16777276, 0, 0, 26377},
+ {16777279, 17825853, 16777279, 0, 0, 26377},
{1017, 1010, 1017, 0, 0, 9993},
{1012, 952, 1012, 0, 0, 10113},
- {917, 1013, 917, 0, 0, 9993},
+ {16777282, 17825856, 16777282, 0, 0, 26377},
{1015, 1016, 1015, 0, 0, 10113},
{1015, 1016, 1015, 0, 0, 9993},
{1017, 1010, 1017, 0, 0, 10113},
@@ -978,7 +978,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{1364, 1412, 1364, 0, 0, 9993},
{1365, 1413, 1365, 0, 0, 9993},
{1366, 1414, 1366, 0, 0, 9993},
- {33554455, 16777238, 33554457, 0, 0, 26377},
+ {33554502, 18874435, 33554504, 0, 0, 26377},
{0, 0, 0, 0, 0, 1537},
{4256, 11520, 4256, 0, 0, 10113},
{4257, 11521, 4257, 0, 0, 10113},
@@ -1180,13 +1180,13 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{7826, 7827, 7826, 0, 0, 9993},
{7828, 7829, 7828, 0, 0, 10113},
{7828, 7829, 7828, 0, 0, 9993},
- {33554460, 16777243, 33554460, 0, 0, 26377},
- {33554463, 16777246, 33554463, 0, 0, 26377},
- {33554466, 16777249, 33554466, 0, 0, 26377},
- {33554469, 16777252, 33554469, 0, 0, 26377},
- {33554472, 16777255, 33554472, 0, 0, 26377},
- {7776, 7835, 7776, 0, 0, 9993},
- {7838, 223, 7838, 0, 0, 10113},
+ {33554509, 18874442, 33554509, 0, 0, 26377},
+ {33554514, 18874447, 33554514, 0, 0, 26377},
+ {33554519, 18874452, 33554519, 0, 0, 26377},
+ {33554524, 18874457, 33554524, 0, 0, 26377},
+ {33554529, 18874462, 33554529, 0, 0, 26377},
+ {16777317, 17825891, 16777317, 0, 0, 26377},
+ {16777321, 18874470, 16777321, 0, 0, 26497},
{7840, 7841, 7840, 0, 0, 10113},
{7840, 7841, 7840, 0, 0, 9993},
{7842, 7843, 7842, 0, 0, 10113},
@@ -1355,13 +1355,13 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{8011, 8003, 8011, 0, 0, 10113},
{8012, 8004, 8012, 0, 0, 10113},
{8013, 8005, 8013, 0, 0, 10113},
- {33554475, 16777258, 33554475, 0, 0, 26377},
+ {33554541, 18874474, 33554541, 0, 0, 26377},
{8025, 8017, 8025, 0, 0, 9993},
- {50331694, 16777261, 50331694, 0, 0, 26377},
+ {50331763, 19923055, 50331763, 0, 0, 26377},
{8027, 8019, 8027, 0, 0, 9993},
- {50331698, 16777265, 50331698, 0, 0, 26377},
+ {50331770, 19923062, 50331770, 0, 0, 26377},
{8029, 8021, 8029, 0, 0, 9993},
- {50331702, 16777269, 50331702, 0, 0, 26377},
+ {50331777, 19923069, 50331777, 0, 0, 26377},
{8031, 8023, 8031, 0, 0, 9993},
{8025, 8017, 8025, 0, 0, 10113},
{8027, 8019, 8027, 0, 0, 10113},
@@ -1397,110 +1397,110 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{8171, 8059, 8171, 0, 0, 9993},
{8186, 8060, 8186, 0, 0, 9993},
{8187, 8061, 8187, 0, 0, 9993},
- {33554490, 16777273, 16777276, 0, 0, 26377},
- {33554494, 16777277, 16777280, 0, 0, 26377},
- {33554498, 16777281, 16777284, 0, 0, 26377},
- {33554502, 16777285, 16777288, 0, 0, 26377},
- {33554506, 16777289, 16777292, 0, 0, 26377},
- {33554510, 16777293, 16777296, 0, 0, 26377},
- {33554514, 16777297, 16777300, 0, 0, 26377},
- {33554518, 16777301, 16777304, 0, 0, 26377},
- {33554522, 16777305, 16777308, 0, 0, 26433},
- {33554526, 16777309, 16777312, 0, 0, 26433},
- {33554530, 16777313, 16777316, 0, 0, 26433},
- {33554534, 16777317, 16777320, 0, 0, 26433},
- {33554538, 16777321, 16777324, 0, 0, 26433},
- {33554542, 16777325, 16777328, 0, 0, 26433},
- {33554546, 16777329, 16777332, 0, 0, 26433},
- {33554550, 16777333, 16777336, 0, 0, 26433},
- {33554554, 16777337, 16777340, 0, 0, 26377},
- {33554558, 16777341, 16777344, 0, 0, 26377},
- {33554562, 16777345, 16777348, 0, 0, 26377},
- {33554566, 16777349, 16777352, 0, 0, 26377},
- {33554570, 16777353, 16777356, 0, 0, 26377},
- {33554574, 16777357, 16777360, 0, 0, 26377},
- {33554578, 16777361, 16777364, 0, 0, 26377},
- {33554582, 16777365, 16777368, 0, 0, 26377},
- {33554586, 16777369, 16777372, 0, 0, 26433},
- {33554590, 16777373, 16777376, 0, 0, 26433},
- {33554594, 16777377, 16777380, 0, 0, 26433},
- {33554598, 16777381, 16777384, 0, 0, 26433},
- {33554602, 16777385, 16777388, 0, 0, 26433},
- {33554606, 16777389, 16777392, 0, 0, 26433},
- {33554610, 16777393, 16777396, 0, 0, 26433},
- {33554614, 16777397, 16777400, 0, 0, 26433},
- {33554618, 16777401, 16777404, 0, 0, 26377},
- {33554622, 16777405, 16777408, 0, 0, 26377},
- {33554626, 16777409, 16777412, 0, 0, 26377},
- {33554630, 16777413, 16777416, 0, 0, 26377},
- {33554634, 16777417, 16777420, 0, 0, 26377},
- {33554638, 16777421, 16777424, 0, 0, 26377},
- {33554642, 16777425, 16777428, 0, 0, 26377},
- {33554646, 16777429, 16777432, 0, 0, 26377},
- {33554650, 16777433, 16777436, 0, 0, 26433},
- {33554654, 16777437, 16777440, 0, 0, 26433},
- {33554658, 16777441, 16777444, 0, 0, 26433},
- {33554662, 16777445, 16777448, 0, 0, 26433},
- {33554666, 16777449, 16777452, 0, 0, 26433},
- {33554670, 16777453, 16777456, 0, 0, 26433},
- {33554674, 16777457, 16777460, 0, 0, 26433},
- {33554678, 16777461, 16777464, 0, 0, 26433},
+ {33554567, 18874500, 16777353, 0, 0, 26377},
+ {33554573, 18874506, 16777359, 0, 0, 26377},
+ {33554579, 18874512, 16777365, 0, 0, 26377},
+ {33554585, 18874518, 16777371, 0, 0, 26377},
+ {33554591, 18874524, 16777377, 0, 0, 26377},
+ {33554597, 18874530, 16777383, 0, 0, 26377},
+ {33554603, 18874536, 16777389, 0, 0, 26377},
+ {33554609, 18874542, 16777395, 0, 0, 26377},
+ {33554615, 18874548, 16777401, 0, 0, 26433},
+ {33554621, 18874554, 16777407, 0, 0, 26433},
+ {33554627, 18874560, 16777413, 0, 0, 26433},
+ {33554633, 18874566, 16777419, 0, 0, 26433},
+ {33554639, 18874572, 16777425, 0, 0, 26433},
+ {33554645, 18874578, 16777431, 0, 0, 26433},
+ {33554651, 18874584, 16777437, 0, 0, 26433},
+ {33554657, 18874590, 16777443, 0, 0, 26433},
+ {33554663, 18874596, 16777449, 0, 0, 26377},
+ {33554669, 18874602, 16777455, 0, 0, 26377},
+ {33554675, 18874608, 16777461, 0, 0, 26377},
+ {33554681, 18874614, 16777467, 0, 0, 26377},
+ {33554687, 18874620, 16777473, 0, 0, 26377},
+ {33554693, 18874626, 16777479, 0, 0, 26377},
+ {33554699, 18874632, 16777485, 0, 0, 26377},
+ {33554705, 18874638, 16777491, 0, 0, 26377},
+ {33554711, 18874644, 16777497, 0, 0, 26433},
+ {33554717, 18874650, 16777503, 0, 0, 26433},
+ {33554723, 18874656, 16777509, 0, 0, 26433},
+ {33554729, 18874662, 16777515, 0, 0, 26433},
+ {33554735, 18874668, 16777521, 0, 0, 26433},
+ {33554741, 18874674, 16777527, 0, 0, 26433},
+ {33554747, 18874680, 16777533, 0, 0, 26433},
+ {33554753, 18874686, 16777539, 0, 0, 26433},
+ {33554759, 18874692, 16777545, 0, 0, 26377},
+ {33554765, 18874698, 16777551, 0, 0, 26377},
+ {33554771, 18874704, 16777557, 0, 0, 26377},
+ {33554777, 18874710, 16777563, 0, 0, 26377},
+ {33554783, 18874716, 16777569, 0, 0, 26377},
+ {33554789, 18874722, 16777575, 0, 0, 26377},
+ {33554795, 18874728, 16777581, 0, 0, 26377},
+ {33554801, 18874734, 16777587, 0, 0, 26377},
+ {33554807, 18874740, 16777593, 0, 0, 26433},
+ {33554813, 18874746, 16777599, 0, 0, 26433},
+ {33554819, 18874752, 16777605, 0, 0, 26433},
+ {33554825, 18874758, 16777611, 0, 0, 26433},
+ {33554831, 18874764, 16777617, 0, 0, 26433},
+ {33554837, 18874770, 16777623, 0, 0, 26433},
+ {33554843, 18874776, 16777629, 0, 0, 26433},
+ {33554849, 18874782, 16777635, 0, 0, 26433},
{8120, 8112, 8120, 0, 0, 9993},
{8121, 8113, 8121, 0, 0, 9993},
- {33554682, 16777465, 33554684, 0, 0, 26377},
- {33554687, 16777470, 16777473, 0, 0, 26377},
- {33554691, 16777474, 33554693, 0, 0, 26377},
- {33554696, 16777479, 33554696, 0, 0, 26377},
- {50331915, 16777482, 50331918, 0, 0, 26377},
+ {33554855, 18874788, 33554857, 0, 0, 26377},
+ {33554862, 18874795, 16777648, 0, 0, 26377},
+ {33554868, 18874801, 33554870, 0, 0, 26377},
+ {33554875, 18874808, 33554875, 0, 0, 26377},
+ {50332097, 19923389, 50332100, 0, 0, 26377},
{8120, 8112, 8120, 0, 0, 10113},
{8121, 8113, 8121, 0, 0, 10113},
{8122, 8048, 8122, 0, 0, 10113},
{8123, 8049, 8123, 0, 0, 10113},
- {33554706, 16777489, 16777492, 0, 0, 26433},
- {921, 8126, 921, 0, 0, 9993},
- {33554710, 16777493, 33554712, 0, 0, 26377},
- {33554715, 16777498, 16777501, 0, 0, 26377},
- {33554719, 16777502, 33554721, 0, 0, 26377},
- {33554724, 16777507, 33554724, 0, 0, 26377},
- {50331943, 16777510, 50331946, 0, 0, 26377},
+ {33554890, 18874823, 16777676, 0, 0, 26433},
+ {16777679, 17826253, 16777679, 0, 0, 26377},
+ {33554899, 18874832, 33554901, 0, 0, 26377},
+ {33554906, 18874839, 16777692, 0, 0, 26377},
+ {33554912, 18874845, 33554914, 0, 0, 26377},
+ {33554919, 18874852, 33554919, 0, 0, 26377},
+ {50332141, 19923433, 50332144, 0, 0, 26377},
{8136, 8050, 8136, 0, 0, 10113},
{8137, 8051, 8137, 0, 0, 10113},
{8138, 8052, 8138, 0, 0, 10113},
{8139, 8053, 8139, 0, 0, 10113},
- {33554734, 16777517, 16777520, 0, 0, 26433},
+ {33554934, 18874867, 16777720, 0, 0, 26433},
{8152, 8144, 8152, 0, 0, 9993},
{8153, 8145, 8153, 0, 0, 9993},
- {50331954, 16777521, 50331954, 0, 0, 26377},
- {50331958, 16777525, 50331958, 0, 0, 26377},
- {33554746, 16777529, 33554746, 0, 0, 26377},
- {50331965, 16777532, 50331965, 0, 0, 26377},
+ {50332157, 19923449, 50332157, 0, 0, 26377},
+ {50332164, 19923456, 50332164, 0, 0, 26377},
+ {33554954, 18874887, 33554954, 0, 0, 26377},
+ {50332176, 19923468, 50332176, 0, 0, 26377},
{8152, 8144, 8152, 0, 0, 10113},
{8153, 8145, 8153, 0, 0, 10113},
{8154, 8054, 8154, 0, 0, 10113},
{8155, 8055, 8155, 0, 0, 10113},
{8168, 8160, 8168, 0, 0, 9993},
{8169, 8161, 8169, 0, 0, 9993},
- {50331969, 16777536, 50331969, 0, 0, 26377},
- {50331973, 16777540, 50331973, 0, 0, 26377},
- {33554761, 16777544, 33554761, 0, 0, 26377},
+ {50332183, 19923475, 50332183, 0, 0, 26377},
+ {50332190, 19923482, 50332190, 0, 0, 26377},
+ {33554980, 18874913, 33554980, 0, 0, 26377},
{8172, 8165, 8172, 0, 0, 9993},
- {33554764, 16777547, 33554764, 0, 0, 26377},
- {50331983, 16777550, 50331983, 0, 0, 26377},
+ {33554985, 18874918, 33554985, 0, 0, 26377},
+ {50332207, 19923499, 50332207, 0, 0, 26377},
{8168, 8160, 8168, 0, 0, 10113},
{8169, 8161, 8169, 0, 0, 10113},
{8170, 8058, 8170, 0, 0, 10113},
{8171, 8059, 8171, 0, 0, 10113},
{8172, 8165, 8172, 0, 0, 10113},
- {33554771, 16777554, 33554773, 0, 0, 26377},
- {33554776, 16777559, 16777562, 0, 0, 26377},
- {33554780, 16777563, 33554782, 0, 0, 26377},
- {33554785, 16777568, 33554785, 0, 0, 26377},
- {50332004, 16777571, 50332007, 0, 0, 26377},
+ {33554997, 18874930, 33554999, 0, 0, 26377},
+ {33555004, 18874937, 16777790, 0, 0, 26377},
+ {33555010, 18874943, 33555012, 0, 0, 26377},
+ {33555017, 18874950, 33555017, 0, 0, 26377},
+ {50332239, 19923531, 50332242, 0, 0, 26377},
{8184, 8056, 8184, 0, 0, 10113},
{8185, 8057, 8185, 0, 0, 10113},
{8186, 8060, 8186, 0, 0, 10113},
{8187, 8061, 8187, 0, 0, 10113},
- {33554795, 16777578, 16777581, 0, 0, 26433},
+ {33555032, 18874965, 16777818, 0, 0, 26433},
{0, 0, 0, 0, 0, 3076},
{0, 0, 0, 0, 4, 3076},
{0, 0, 0, 0, 5, 3076},
@@ -2037,18 +2037,18 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{42918, 42919, 42918, 0, 0, 9993},
{42920, 42921, 42920, 0, 0, 10113},
{42920, 42921, 42920, 0, 0, 9993},
- {33554799, 16777582, 33554801, 0, 0, 26377},
- {33554804, 16777587, 33554806, 0, 0, 26377},
- {33554809, 16777592, 33554811, 0, 0, 26377},
- {50332030, 16777597, 50332033, 0, 0, 26377},
- {50332037, 16777604, 50332040, 0, 0, 26377},
- {33554828, 16777611, 33554830, 0, 0, 26377},
- {33554833, 16777616, 33554835, 0, 0, 26377},
- {33554838, 16777621, 33554840, 0, 0, 26377},
- {33554843, 16777626, 33554845, 0, 0, 26377},
- {33554848, 16777631, 33554850, 0, 0, 26377},
- {33554853, 16777636, 33554855, 0, 0, 26377},
- {33554858, 16777641, 33554860, 0, 0, 26377},
+ {33555038, 18874971, 33555040, 0, 0, 26377},
+ {33555045, 18874978, 33555047, 0, 0, 26377},
+ {33555052, 18874985, 33555054, 0, 0, 26377},
+ {50332276, 19923568, 50332279, 0, 0, 26377},
+ {50332286, 19923578, 50332289, 0, 0, 26377},
+ {33555079, 18875012, 33555081, 0, 0, 26377},
+ {33555086, 18875019, 33555088, 0, 0, 26377},
+ {33555093, 18875026, 33555095, 0, 0, 26377},
+ {33555100, 18875033, 33555102, 0, 0, 26377},
+ {33555107, 18875040, 33555109, 0, 0, 26377},
+ {33555114, 18875047, 33555116, 0, 0, 26377},
+ {33555121, 18875054, 33555123, 0, 0, 26377},
{0, 0, 0, 0, 0, 1025},
{65313, 65345, 65313, 0, 0, 10113},
{65314, 65346, 65314, 0, 0, 10113},
@@ -2188,7 +2188,12 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
/* extended case mappings */
const Py_UCS4 _PyUnicode_ExtendedCase[] = {
+ 181,
+ 956,
+ 924,
223,
+ 115,
+ 115,
83,
83,
83,
@@ -2198,263 +2203,440 @@ const Py_UCS4 _PyUnicode_ExtendedCase[] = {
304,
329,
700,
+ 110,
+ 700,
78,
+ 383,
+ 115,
+ 83,
496,
+ 106,
+ 780,
74,
780,
+ 837,
+ 953,
+ 921,
912,
+ 953,
+ 776,
+ 769,
921,
776,
769,
944,
+ 965,
+ 776,
+ 769,
933,
776,
769,
+ 962,
+ 963,
+ 931,
+ 976,
+ 946,
+ 914,
+ 977,
+ 952,
+ 920,
+ 981,
+ 966,
+ 934,
+ 982,
+ 960,
+ 928,
+ 1008,
+ 954,
+ 922,
+ 1009,
+ 961,
+ 929,
+ 1013,
+ 949,
+ 917,
1415,
+ 1381,
+ 1410,
1333,
1362,
1333,
1410,
7830,
+ 104,
+ 817,
72,
817,
7831,
+ 116,
+ 776,
84,
776,
7832,
+ 119,
+ 778,
87,
778,
7833,
+ 121,
+ 778,
89,
778,
7834,
+ 97,
+ 702,
65,
702,
+ 7835,
+ 7777,
+ 7776,
+ 223,
+ 115,
+ 115,
+ 7838,
8016,
+ 965,
+ 787,
933,
787,
8018,
+ 965,
+ 787,
+ 768,
933,
787,
768,
8020,
+ 965,
+ 787,
+ 769,
933,
787,
769,
8022,
+ 965,
+ 787,
+ 834,
933,
787,
834,
8064,
+ 7936,
+ 953,
7944,
921,
8072,
8065,
+ 7937,
+ 953,
7945,
921,
8073,
8066,
+ 7938,
+ 953,
7946,
921,
8074,
8067,
+ 7939,
+ 953,
7947,
921,
8075,
8068,
+ 7940,
+ 953,
7948,
921,
8076,
8069,
+ 7941,
+ 953,
7949,
921,
8077,
8070,
+ 7942,
+ 953,
7950,
921,
8078,
8071,
+ 7943,
+ 953,
7951,
921,
8079,
8064,
+ 7936,
+ 953,
7944,
921,
8072,
8065,
+ 7937,
+ 953,
7945,
921,
8073,
8066,
+ 7938,
+ 953,
7946,
921,
8074,
8067,
+ 7939,
+ 953,
7947,
921,
8075,
8068,
+ 7940,
+ 953,
7948,
921,
8076,
8069,
+ 7941,
+ 953,
7949,
921,
8077,
8070,
+ 7942,
+ 953,
7950,
921,
8078,
8071,
+ 7943,
+ 953,
7951,
921,
8079,
8080,
+ 7968,
+ 953,
7976,
921,
8088,
8081,
+ 7969,
+ 953,
7977,
921,
8089,
8082,
+ 7970,
+ 953,
7978,
921,
8090,
8083,
+ 7971,
+ 953,
7979,
921,
8091,
8084,
+ 7972,
+ 953,
7980,
921,
8092,
8085,
+ 7973,
+ 953,
7981,
921,
8093,
8086,
+ 7974,
+ 953,
7982,
921,
8094,
8087,
+ 7975,
+ 953,
7983,
921,
8095,
8080,
+ 7968,
+ 953,
7976,
921,
8088,
8081,
+ 7969,
+ 953,
7977,
921,
8089,
8082,
+ 7970,
+ 953,
7978,
921,
8090,
8083,
+ 7971,
+ 953,
7979,
921,
8091,
8084,
+ 7972,
+ 953,
7980,
921,
8092,
8085,
+ 7973,
+ 953,
7981,
921,
8093,
8086,
+ 7974,
+ 953,
7982,
921,
8094,
8087,
+ 7975,
+ 953,
7983,
921,
8095,
8096,
+ 8032,
+ 953,
8040,
921,
8104,
8097,
+ 8033,
+ 953,
8041,
921,
8105,
8098,
+ 8034,
+ 953,
8042,
921,
8106,
8099,
+ 8035,
+ 953,
8043,
921,
8107,
8100,
+ 8036,
+ 953,
8044,
921,
8108,
8101,
+ 8037,
+ 953,
8045,
921,
8109,
8102,
+ 8038,
+ 953,
8046,
921,
8110,
8103,
+ 8039,
+ 953,
8047,
921,
8111,
8096,
+ 8032,
+ 953,
8040,
921,
8104,
8097,
+ 8033,
+ 953,
8041,
921,
8105,
8098,
+ 8034,
+ 953,
8042,
921,
8106,
8099,
+ 8035,
+ 953,
8043,
921,
8107,
8100,
+ 8036,
+ 953,
8044,
921,
8108,
8101,
+ 8037,
+ 953,
8045,
921,
8109,
8102,
+ 8038,
+ 953,
8046,
921,
8110,
8103,
+ 8039,
+ 953,
8047,
921,
8111,
8114,
+ 8048,
+ 953,
8122,
921,
8122,
837,
8115,
+ 945,
+ 953,
913,
921,
8124,
8116,
+ 940,
+ 953,
902,
921,
902,
837,
8118,
+ 945,
+ 834,
913,
834,
8119,
+ 945,
+ 834,
+ 953,
913,
834,
921,
@@ -2462,27 +2644,43 @@ const Py_UCS4 _PyUnicode_ExtendedCase[] = {
834,
837,
8115,
+ 945,
+ 953,
913,
921,
8124,
+ 8126,
+ 953,
+ 921,
8130,
+ 8052,
+ 953,
8138,
921,
8138,
837,
8131,
+ 951,
+ 953,
919,
921,
8140,
8132,
+ 942,
+ 953,
905,
921,
905,
837,
8134,
+ 951,
+ 834,
919,
834,
8135,
+ 951,
+ 834,
+ 953,
919,
834,
921,
@@ -2490,60 +2688,97 @@ const Py_UCS4 _PyUnicode_ExtendedCase[] = {
834,
837,
8131,
+ 951,
+ 953,
919,
921,
8140,
8146,
+ 953,
+ 776,
+ 768,
921,
776,
768,
8147,
+ 953,
+ 776,
+ 769,
921,
776,
769,
8150,
+ 953,
+ 834,
921,
834,
8151,
+ 953,
+ 776,
+ 834,
921,
776,
834,
8162,
+ 965,
+ 776,
+ 768,
933,
776,
768,
8163,
+ 965,
+ 776,
+ 769,
933,
776,
769,
8164,
+ 961,
+ 787,
929,
787,
8166,
+ 965,
+ 834,
933,
834,
8167,
+ 965,
+ 776,
+ 834,
933,
776,
834,
8178,
+ 8060,
+ 953,
8186,
921,
8186,
837,
8179,
+ 969,
+ 953,
937,
921,
8188,
8180,
+ 974,
+ 953,
911,
921,
911,
837,
8182,
+ 969,
+ 834,
937,
834,
8183,
+ 969,
+ 834,
+ 953,
937,
834,
921,
@@ -2551,25 +2786,36 @@ const Py_UCS4 _PyUnicode_ExtendedCase[] = {
834,
837,
8179,
+ 969,
+ 953,
937,
921,
8188,
64256,
+ 102,
+ 102,
70,
70,
70,
102,
64257,
+ 102,
+ 105,
70,
73,
70,
105,
64258,
+ 102,
+ 108,
70,
76,
70,
108,
64259,
+ 102,
+ 102,
+ 105,
70,
70,
73,
@@ -2577,6 +2823,9 @@ const Py_UCS4 _PyUnicode_ExtendedCase[] = {
102,
105,
64260,
+ 102,
+ 102,
+ 108,
70,
70,
76,
@@ -2584,36 +2833,50 @@ const Py_UCS4 _PyUnicode_ExtendedCase[] = {
102,
108,
64261,
+ 115,
+ 116,
83,
84,
83,
116,
64262,
+ 115,
+ 116,
83,
84,
83,
116,
64275,
+ 1396,
+ 1398,
1348,
1350,
1348,
1398,
64276,
+ 1396,
+ 1381,
1348,
1333,
1348,
1381,
64277,
+ 1396,
+ 1387,
1348,
1339,
1348,
1387,
64278,
+ 1406,
+ 1398,
1358,
1350,
1358,
1398,
64279,
+ 1396,
+ 1389,
1348,
1341,
1348,
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index 140fc64..0795d9e 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -49,6 +49,7 @@ LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
+CASE_FOLDING = "CaseFolding%s.txt"
# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
@@ -424,28 +425,36 @@ def makeunicodetype(unicode, trace):
if "Case_Ignorable" in properties:
flags |= CASE_IGNORABLE_MASK
sc = unicode.special_casing.get(char)
+ cf = unicode.case_folding.get(char, [char])
+ if record[12]:
+ upper = int(record[12], 16)
+ else:
+ upper = char
+ if record[13]:
+ lower = int(record[13], 16)
+ else:
+ lower = char
+ if record[14]:
+ title = int(record[14], 16)
+ else:
+ title = upper
+ if sc is None and cf != [lower]:
+ sc = ([lower], [title], [upper])
if sc is None:
- if record[12]:
- upper = int(record[12], 16)
- else:
- upper = char
- if record[13]:
- lower = int(record[13], 16)
- else:
- lower = char
- if record[14]:
- title = int(record[14], 16)
- else:
- title = upper
if upper == lower == title:
upper = lower = title = 0
else:
- # This happens when some character maps to more than one
- # character in uppercase, lowercase, or titlecase. The extra
- # characters are stored in a different array.
+ # This happens either when some character maps to more than one
+ # character in uppercase, lowercase, or titlecase or the
+ # casefolded version of the character is different from the
+ # lowercase. The extra characters are stored in a different
+ # array.
flags |= EXTENDED_CASE_MASK
lower = len(extra_casing) | (len(sc[0]) << 24)
extra_casing.extend(sc[0])
+ if cf != sc[0]:
+ lower |= len(cf) << 20
+ extra_casing.extend(cf)
upper = len(extra_casing) | (len(sc[2]) << 24)
extra_casing.extend(sc[2])
# Title is probably equal to upper.
@@ -1107,6 +1116,17 @@ class UnicodeData:
title = [int(char, 16) for char in data[2].split()]
upper = [int(char, 16) for char in data[3].split()]
sc[c] = (lower, title, upper)
+ cf = self.case_folding = {}
+ if version != '3.2.0':
+ with open_data(CASE_FOLDING, version) as file:
+ for s in file:
+ s = s[:-1].split('#', 1)[0]
+ if not s:
+ continue
+ data = s.split("; ")
+ if data[1] in "CF":
+ c = int(data[0], 16)
+ cf[c] = [int(char, 16) for char in data[2].split()]
def uselatin1(self):
# restrict character range to ISO Latin 1