summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSerhiy Storchaka <storchaka@gmail.com>2024-11-11 16:27:26 (GMT)
committerGitHub <noreply@github.com>2024-11-11 16:27:26 (GMT)
commit819830f34a11ecaa3aada174ca8eedeb3f260630 (patch)
treeb327077f72d7a8a2e9f5ae7b7121f9fa57b4d67b
parent9fc2808eaf4e74a9f52f44d20a7d1110bd949d41 (diff)
downloadcpython-819830f34a11ecaa3aada174ca8eedeb3f260630.zip
cpython-819830f34a11ecaa3aada174ca8eedeb3f260630.tar.gz
cpython-819830f34a11ecaa3aada174ca8eedeb3f260630.tar.bz2
gh-126505: Fix bugs in compiling case-insensitive character classes (GH-126557)
* upper-case non-BMP character was ignored
* the ASCII flag was ignored when matching a character range whose upper bound is beyond the BMP region
-rw-r--r--Lib/re/_compiler.py23
-rw-r--r--Lib/test/test_re.py55
-rw-r--r--Misc/NEWS.d/next/Library/2024-11-07-22-41-47.gh-issue-126505.iztYE1.rst4
3 files changed, 73 insertions, 9 deletions
diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py
index 29109f8..20dd561 100644
--- a/Lib/re/_compiler.py
+++ b/Lib/re/_compiler.py
@@ -255,11 +255,11 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
while True:
try:
if op is LITERAL:
- if fixup:
- lo = fixup(av)
- charmap[lo] = 1
- if fixes and lo in fixes:
- for k in fixes[lo]:
+ if fixup: # IGNORECASE and not LOCALE
+ av = fixup(av)
+ charmap[av] = 1
+ if fixes and av in fixes:
+ for k in fixes[av]:
charmap[k] = 1
if not hascased and iscased(av):
hascased = True
@@ -267,7 +267,7 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
charmap[av] = 1
elif op is RANGE:
r = range(av[0], av[1]+1)
- if fixup:
+ if fixup: # IGNORECASE and not LOCALE
if fixes:
for i in map(fixup, r):
charmap[i] = 1
@@ -298,8 +298,7 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
# Character set contains non-BMP character codes.
# For range, all BMP characters in the range are already
# proceeded.
- if fixup:
- hascased = True
+ if fixup: # IGNORECASE and not LOCALE
# For now, IN_UNI_IGNORE+LITERAL and
# IN_UNI_IGNORE+RANGE_UNI_IGNORE work for all non-BMP
# characters, because two characters (at least one of
@@ -310,7 +309,13 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
# Also, both c.lower() and c.lower().upper() are single
# characters for every non-BMP character.
if op is RANGE:
- op = RANGE_UNI_IGNORE
+ if fixes: # not ASCII
+ op = RANGE_UNI_IGNORE
+ hascased = True
+ else:
+ assert op is LITERAL
+ if not hascased and iscased(av):
+ hascased = True
tail.append((op, av))
break
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index ff95f54..7bc702e 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -1136,6 +1136,39 @@ class ReTests(unittest.TestCase):
self.assertTrue(re.match(br'[19a]', b'a', re.I))
self.assertTrue(re.match(br'[19a]', b'A', re.I))
self.assertTrue(re.match(br'[19A]', b'a', re.I))
+ self.assertTrue(re.match(r'[19\xc7]', '\xc7', re.I))
+ self.assertTrue(re.match(r'[19\xc7]', '\xe7', re.I))
+ self.assertTrue(re.match(r'[19\xe7]', '\xc7', re.I))
+ self.assertTrue(re.match(r'[19\xe7]', '\xe7', re.I))
+ self.assertTrue(re.match(r'[19\u0400]', '\u0400', re.I))
+ self.assertTrue(re.match(r'[19\u0400]', '\u0450', re.I))
+ self.assertTrue(re.match(r'[19\u0450]', '\u0400', re.I))
+ self.assertTrue(re.match(r'[19\u0450]', '\u0450', re.I))
+ self.assertTrue(re.match(r'[19\U00010400]', '\U00010400', re.I))
+ self.assertTrue(re.match(r'[19\U00010400]', '\U00010428', re.I))
+ self.assertTrue(re.match(r'[19\U00010428]', '\U00010400', re.I))
+ self.assertTrue(re.match(r'[19\U00010428]', '\U00010428', re.I))
+
+ self.assertTrue(re.match(br'[19A]', b'A', re.I))
+ self.assertTrue(re.match(br'[19a]', b'a', re.I))
+ self.assertTrue(re.match(br'[19a]', b'A', re.I))
+ self.assertTrue(re.match(br'[19A]', b'a', re.I))
+ self.assertTrue(re.match(r'[19A]', 'A', re.I|re.A))
+ self.assertTrue(re.match(r'[19a]', 'a', re.I|re.A))
+ self.assertTrue(re.match(r'[19a]', 'A', re.I|re.A))
+ self.assertTrue(re.match(r'[19A]', 'a', re.I|re.A))
+ self.assertTrue(re.match(r'[19\xc7]', '\xc7', re.I|re.A))
+ self.assertIsNone(re.match(r'[19\xc7]', '\xe7', re.I|re.A))
+ self.assertIsNone(re.match(r'[19\xe7]', '\xc7', re.I|re.A))
+ self.assertTrue(re.match(r'[19\xe7]', '\xe7', re.I|re.A))
+ self.assertTrue(re.match(r'[19\u0400]', '\u0400', re.I|re.A))
+ self.assertIsNone(re.match(r'[19\u0400]', '\u0450', re.I|re.A))
+ self.assertIsNone(re.match(r'[19\u0450]', '\u0400', re.I|re.A))
+ self.assertTrue(re.match(r'[19\u0450]', '\u0450', re.I|re.A))
+ self.assertTrue(re.match(r'[19\U00010400]', '\U00010400', re.I|re.A))
+ self.assertIsNone(re.match(r'[19\U00010400]', '\U00010428', re.I|re.A))
+ self.assertIsNone(re.match(r'[19\U00010428]', '\U00010400', re.I|re.A))
+ self.assertTrue(re.match(r'[19\U00010428]', '\U00010428', re.I|re.A))
# Two different characters have the same lowercase.
assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K'
@@ -1172,8 +1205,10 @@ class ReTests(unittest.TestCase):
self.assertTrue(re.match(br'[9-a]', b'_', re.I))
self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
+ self.assertTrue(re.match(r'[\xc0-\xde]', '\xe7', re.I))
self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
+ self.assertTrue(re.match(r'[\xe0-\xfe]', '\xc7', re.I))
self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
@@ -1184,6 +1219,26 @@ class ReTests(unittest.TestCase):
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
+ self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I|re.A))
+ self.assertIsNone(re.match(r'[\xc0-\xde]', '\xe7', re.I|re.A))
+ self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I|re.A))
+ self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xc7', re.I|re.A))
+ self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I|re.A))
+ self.assertIsNone(re.match(r'[\u0430-\u045f]', '\u0400', re.I|re.A))
+ self.assertIsNone(re.match(r'[\u0400-\u042f]', '\u0450', re.I|re.A))
+ self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I|re.A))
+ self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I|re.A))
+ self.assertIsNone(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I|re.A))
+ self.assertIsNone(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I|re.A))
+ self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I|re.A))
+
+ self.assertTrue(re.match(r'[N-\x7f]', 'A', re.I|re.A))
+ self.assertTrue(re.match(r'[n-\x7f]', 'Z', re.I|re.A))
+ self.assertTrue(re.match(r'[N-\uffff]', 'A', re.I|re.A))
+ self.assertTrue(re.match(r'[n-\uffff]', 'Z', re.I|re.A))
+ self.assertTrue(re.match(r'[N-\U00010000]', 'A', re.I|re.A))
+ self.assertTrue(re.match(r'[n-\U00010000]', 'Z', re.I|re.A))
+
# Two different characters have the same lowercase.
assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K'
self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
diff --git a/Misc/NEWS.d/next/Library/2024-11-07-22-41-47.gh-issue-126505.iztYE1.rst b/Misc/NEWS.d/next/Library/2024-11-07-22-41-47.gh-issue-126505.iztYE1.rst
new file mode 100644
index 0000000..0a0f893
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-11-07-22-41-47.gh-issue-126505.iztYE1.rst
@@ -0,0 +1,4 @@
+Fix bugs in compiling case-insensitive :mod:`regular expressions <re>` with
+character classes containing non-BMP characters: upper-case non-BMP
+character was ignored and the ASCII flag was ignored when
+matching a character range whose upper bound is beyond the BMP region.