From 7186cc29be352bed6f1110873283d073fd0643e4 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 5 May 2017 10:42:46 +0300 Subject: bpo-30277: Replace _sre.getlower() with _sre.ascii_tolower() and _sre.unicode_tolower(). (#1468) --- Lib/sre_compile.py | 23 ++++++++++------------- Lib/test/test_re.py | 26 ++++++++++++++++---------- Modules/_sre.c | 29 ++++++++++++++++++++--------- Modules/clinic/_sre.c.h | 49 +++++++++++++++++++++++++++++++++++++------------ 4 files changed, 83 insertions(+), 44 deletions(-) diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index d7ee4e8..db8b8a2 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -69,13 +69,14 @@ def _compile(code, pattern, flags): REPEATING_CODES = _REPEATING_CODES SUCCESS_CODES = _SUCCESS_CODES ASSERT_CODES = _ASSERT_CODES - if (flags & SRE_FLAG_IGNORECASE and - not (flags & SRE_FLAG_LOCALE) and - flags & SRE_FLAG_UNICODE and - not (flags & SRE_FLAG_ASCII)): - fixes = _ignorecase_fixes - else: - fixes = None + tolower = None + fixes = None + if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE: + if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII: + tolower = _sre.unicode_tolower + fixes = _ignorecase_fixes + else: + tolower = _sre.ascii_tolower for op, av in pattern: if op in LITERAL_CODES: if not flags & SRE_FLAG_IGNORECASE: @@ -85,7 +86,7 @@ def _compile(code, pattern, flags): emit(OP_LOC_IGNORE[op]) emit(av) else: - lo = _sre.getlower(av, flags) + lo = tolower(av) if fixes and lo in fixes: emit(IN_IGNORE) skip = _len(code); emit(0) @@ -102,16 +103,12 @@ def _compile(code, pattern, flags): elif op is IN: if not flags & SRE_FLAG_IGNORECASE: emit(op) - fixup = None elif flags & SRE_FLAG_LOCALE: emit(IN_LOC_IGNORE) - fixup = None else: emit(IN_IGNORE) - def fixup(literal, flags=flags): - return _sre.getlower(literal, flags) skip = _len(code); emit(0) - _compile_charset(av, flags, code, fixup, fixes) + _compile_charset(av, flags, code, tolower, fixes) code[skip] = _len(code) - skip elif op is ANY: if flags & SRE_FLAG_DOTALL: diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 7601dc8..b5b7cff 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -883,17 +883,23 @@ class ReTests(unittest.TestCase): def test_category(self): self.assertEqual(re.match(r"(\s)", " ").group(1), " ") - def test_getlower(self): + @cpython_only + def test_case_helpers(self): import _sre - self.assertEqual(_sre.getlower(ord('A'), 0), ord('a')) - self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a')) - self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a')) - self.assertEqual(_sre.getlower(ord('A'), re.ASCII), ord('a')) - - self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") - self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC") - self.assertEqual(re.match("abc", "ABC", re.I|re.A).group(0), "ABC") - self.assertEqual(re.match(b"abc", b"ABC", re.I|re.L).group(0), b"ABC") + for i in range(128): + c = chr(i) + lo = ord(c.lower()) + self.assertEqual(_sre.ascii_tolower(i), lo) + self.assertEqual(_sre.unicode_tolower(i), lo) + + for i in list(range(128, 0x1000)) + [0x10400, 0x10428]: + c = chr(i) + self.assertEqual(_sre.ascii_tolower(i), i) + if i != 0x0130: + self.assertEqual(_sre.unicode_tolower(i), ord(c.lower())) + + self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130) + self.assertEqual(_sre.unicode_tolower(0x0130), ord('i')) def test_not_literal(self): self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b") diff --git a/Modules/_sre.c b/Modules/_sre.c index afb2bce..a86c5f2 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -274,25 +274,35 @@ _sre_getcodesize_impl(PyObject *module) } /*[clinic input] -_sre.getlower -> int +_sre.ascii_tolower -> int character: int - flags: int / [clinic start generated code]*/ static int -_sre_getlower_impl(PyObject *module, int character, int flags) -/*[clinic end generated code: output=47eebc4c1214feb5 input=087d2f1c44bbca6f]*/ +_sre_ascii_tolower_impl(PyObject *module, int character) +/*[clinic end generated code: output=228294ed6ff2a612 input=272c609b5b61f136]*/ { - if (flags & SRE_FLAG_LOCALE) - return sre_lower_locale(character); - if (flags & SRE_FLAG_UNICODE) - return sre_lower_unicode(character); return sre_lower(character); } +/*[clinic input] +_sre.unicode_tolower -> int + + character: int + / + +[clinic start generated code]*/ + +static int +_sre_unicode_tolower_impl(PyObject *module, int character) +/*[clinic end generated code: output=6422272d7d7fee65 input=91d708c5f3c2045a]*/ +{ + return sre_lower_unicode(character); +} + LOCAL(void) state_reset(SRE_STATE* state) { @@ -2740,7 +2750,8 @@ static PyTypeObject Scanner_Type = { static PyMethodDef _functions[] = { _SRE_COMPILE_METHODDEF _SRE_GETCODESIZE_METHODDEF - _SRE_GETLOWER_METHODDEF + _SRE_ASCII_TOLOWER_METHODDEF + _SRE_UNICODE_TOLOWER_METHODDEF {NULL, NULL} }; diff --git a/Modules/clinic/_sre.c.h b/Modules/clinic/_sre.c.h index 5278323..8056eda 100644 --- a/Modules/clinic/_sre.c.h +++ b/Modules/clinic/_sre.c.h @@ -29,34 +29,59 @@ exit: return return_value; } -PyDoc_STRVAR(_sre_getlower__doc__, -"getlower($module, character, flags, /)\n" +PyDoc_STRVAR(_sre_ascii_tolower__doc__, +"ascii_tolower($module, character, /)\n" "--\n" "\n"); -#define _SRE_GETLOWER_METHODDEF \ - {"getlower", (PyCFunction)_sre_getlower, METH_FASTCALL, _sre_getlower__doc__}, +#define _SRE_ASCII_TOLOWER_METHODDEF \ + {"ascii_tolower", (PyCFunction)_sre_ascii_tolower, METH_O, _sre_ascii_tolower__doc__}, static int -_sre_getlower_impl(PyObject *module, int character, int flags); +_sre_ascii_tolower_impl(PyObject *module, int character); static PyObject * -_sre_getlower(PyObject *module, PyObject **args, Py_ssize_t nargs, PyObject *kwnames) +_sre_ascii_tolower(PyObject *module, PyObject *arg) { PyObject *return_value = NULL; int character; - int flags; int _return_value; - if (!_PyArg_ParseStack(args, nargs, "ii:getlower", - &character, &flags)) { + if (!PyArg_Parse(arg, "i:ascii_tolower", &character)) { + goto exit; + } + _return_value = _sre_ascii_tolower_impl(module, character); + if ((_return_value == -1) && PyErr_Occurred()) { goto exit; } + return_value = PyLong_FromLong((long)_return_value); + +exit: + return return_value; +} + +PyDoc_STRVAR(_sre_unicode_tolower__doc__, +"unicode_tolower($module, character, /)\n" +"--\n" +"\n"); + +#define _SRE_UNICODE_TOLOWER_METHODDEF \ + {"unicode_tolower", (PyCFunction)_sre_unicode_tolower, METH_O, _sre_unicode_tolower__doc__}, + +static int +_sre_unicode_tolower_impl(PyObject *module, int character); + +static PyObject * +_sre_unicode_tolower(PyObject *module, PyObject *arg) +{ + PyObject *return_value = NULL; + int character; + int _return_value; - if (!_PyArg_NoStackKeywords("getlower", kwnames)) { + if (!PyArg_Parse(arg, "i:unicode_tolower", &character)) { goto exit; } - _return_value = _sre_getlower_impl(module, character, flags); + _return_value = _sre_unicode_tolower_impl(module, character); if ((_return_value == -1) && PyErr_Occurred()) { goto exit; } @@ -690,4 +715,4 @@ _sre_SRE_Scanner_search(ScannerObject *self, PyObject *Py_UNUSED(ignored)) { return _sre_SRE_Scanner_search_impl(self); } -/*[clinic end generated code: output=e6dab3ba8864da9e input=a9049054013a1b77]*/ +/*[clinic end generated code: output=811e67d7f8f5052e input=a9049054013a1b77]*/ -- cgit v0.12