diff options
author | Antoine Pitrou <solipsis@pitrou.net> | 2009-04-27 22:31:40 (GMT) |
---|---|---|
committer | Antoine Pitrou <solipsis@pitrou.net> | 2009-04-27 22:31:40 (GMT) |
commit | 7a0fedfd1d387424c9e19059a126939d5f7bdea2 (patch) | |
tree | 0a180a7793e5b590a846869d01379a41e3078565 /Modules/unicodedata.c | |
parent | 57f3d93552edf5f4f5d5e8fad5aff9f72788bc7a (diff) | |
download | cpython-7a0fedfd1d387424c9e19059a126939d5f7bdea2.zip cpython-7a0fedfd1d387424c9e19059a126939d5f7bdea2.tar.gz cpython-7a0fedfd1d387424c9e19059a126939d5f7bdea2.tar.bz2 |
Merged revisions 72054 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk
........
r72054 | antoine.pitrou | 2009-04-27 23:53:26 +0200 (lun., 27 avril 2009) | 5 lines
Issue #1734234: Massively speed up `unicodedata.normalize()` when the
string is already in normalized form, by performing a quick check beforehand.
Original patch by Rauli Ruohonen.
........
Diffstat (limited to 'Modules/unicodedata.c')
-rw-r--r-- | Modules/unicodedata.c | 63 |
1 files changed, 58 insertions, 5 deletions
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index ba6d0b1..301cee7 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -27,6 +27,7 @@ typedef struct {
     const unsigned char mirrored; /* true if mirrored in bidir mode */
     const unsigned char east_asian_width; /* index into
                                              _PyUnicode_EastAsianWidth */
+    const unsigned char normalization_quick_check; /* see is_normalized() */
 } _PyUnicode_DatabaseRecord;
 
 typedef struct change_record {
@@ -722,7 +723,39 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
     PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
     return result;
 }
-
+
+/* Return 1 if the input is certainly normalized, 0 if it might not be. */
+static int
+is_normalized(PyObject *self, PyObject *input, int nfc, int k)
+{
+    Py_UNICODE *i, *end;
+    unsigned char prev_combining = 0, quickcheck_mask;
+
+    /* An older version of the database is requested, quickchecks must be
+       disabled. */
+    if (self && UCD_Check(self))
+        return 0;
+
+    /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
+       as described in http://unicode.org/reports/tr15/#Annex8. */
+    quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
+
+    i = PyUnicode_AS_UNICODE(input);
+    end = i + PyUnicode_GET_SIZE(input);
+    while (i < end) {
+        const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
+        unsigned char combining = record->combining;
+        unsigned char quickcheck = record->normalization_quick_check;
+
+        if (quickcheck & quickcheck_mask)
+            return 0; /* this string might need normalization */
+        if (combining && prev_combining > combining)
+            return 0; /* non-canonical sort order, not normalized */
+        prev_combining = combining;
+    }
+    return 1; /* certainly normalized */
+}
+
 PyDoc_STRVAR(unicodedata_normalize__doc__,
 "normalize(form, unistr)\n\
 \n\
@@ -746,14 +779,34 @@ unicodedata_normalize(PyObject *self, PyObject *args)
         return input;
     }
 
-    if (strcmp(form, "NFC") == 0)
+    if (strcmp(form, "NFC") == 0) {
+        if (is_normalized(self, input, 1, 0)) {
+            Py_INCREF(input);
+            return input;
+        }
         return nfc_nfkc(self, input, 0);
-    if (strcmp(form, "NFKC") == 0)
+    }
+    if (strcmp(form, "NFKC") == 0) {
+        if (is_normalized(self, input, 1, 1)) {
+            Py_INCREF(input);
+            return input;
+        }
         return nfc_nfkc(self, input, 1);
-    if (strcmp(form, "NFD") == 0)
+    }
+    if (strcmp(form, "NFD") == 0) {
+        if (is_normalized(self, input, 0, 0)) {
+            Py_INCREF(input);
+            return input;
+        }
         return nfd_nfkd(self, input, 0);
-    if (strcmp(form, "NFKD") == 0)
+    }
+    if (strcmp(form, "NFKD") == 0) {
+        if (is_normalized(self, input, 0, 1)) {
+            Py_INCREF(input);
+            return input;
+        }
         return nfd_nfkd(self, input, 1);
+    }
     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
     return NULL;
 }