diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2024-11-11 16:27:26 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-11-11 16:27:26 (GMT) |
commit | 819830f34a11ecaa3aada174ca8eedeb3f260630 (patch) | |
tree | b327077f72d7a8a2e9f5ae7b7121f9fa57b4d67b /Lib/re/_compiler.py | |
parent | 9fc2808eaf4e74a9f52f44d20a7d1110bd949d41 (diff) | |
download | cpython-819830f34a11ecaa3aada174ca8eedeb3f260630.zip cpython-819830f34a11ecaa3aada174ca8eedeb3f260630.tar.gz cpython-819830f34a11ecaa3aada174ca8eedeb3f260630.tar.bz2 |
gh-126505: Fix bugs in compiling case-insensitive character classes (GH-126557)
* upper-case non-BMP character was ignored
* the ASCII flag was ignored when matching a character range whose
upper bound is beyond the BMP region
Diffstat (limited to 'Lib/re/_compiler.py')
-rw-r--r-- | Lib/re/_compiler.py | 23 |
1 files changed, 14 insertions, 9 deletions
```diff
diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py
index 29109f8..20dd561 100644
--- a/Lib/re/_compiler.py
+++ b/Lib/re/_compiler.py
@@ -255,11 +255,11 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
         while True:
             try:
                 if op is LITERAL:
-                    if fixup:
-                        lo = fixup(av)
-                        charmap[lo] = 1
-                        if fixes and lo in fixes:
-                            for k in fixes[lo]:
+                    if fixup: # IGNORECASE and not LOCALE
+                        av = fixup(av)
+                        charmap[av] = 1
+                        if fixes and av in fixes:
+                            for k in fixes[av]:
                                 charmap[k] = 1
                         if not hascased and iscased(av):
                             hascased = True
@@ -267,7 +267,7 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
                         charmap[av] = 1
                 elif op is RANGE:
                     r = range(av[0], av[1]+1)
-                    if fixup:
+                    if fixup: # IGNORECASE and not LOCALE
                         if fixes:
                             for i in map(fixup, r):
                                 charmap[i] = 1
@@ -298,8 +298,7 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
                 # Character set contains non-BMP character codes.
                 # For range, all BMP characters in the range are already
                 # proceeded.
-                if fixup:
-                    hascased = True
+                if fixup: # IGNORECASE and not LOCALE
                     # For now, IN_UNI_IGNORE+LITERAL and
                     # IN_UNI_IGNORE+RANGE_UNI_IGNORE work for all non-BMP
                     # characters, because two characters (at least one of
@@ -310,7 +309,13 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
                     # Also, both c.lower() and c.lower().upper() are single
                     # characters for every non-BMP character.
                     if op is RANGE:
-                        op = RANGE_UNI_IGNORE
+                        if fixes: # not ASCII
+                            op = RANGE_UNI_IGNORE
+                        hascased = True
+                    else:
+                        assert op is LITERAL
+                        if not hascased and iscased(av):
+                            hascased = True
                 tail.append((op, av))
             break
```