diff options
author | Serhiy Storchaka <storchaka@gmail.com> | 2017-10-24 20:31:42 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-10-24 20:31:42 (GMT) |
commit | 3557b05c5a7dfd7d97ddfd3b79aefd53d25e5132 (patch) | |
tree | aa741f0d09293f6dfe9668a5b328658ce13c8279 /Lib | |
parent | fdd9b217c60b454ac6a82f02c8b0b551caeac88b (diff) | |
download | cpython-3557b05c5a7dfd7d97ddfd3b79aefd53d25e5132.zip cpython-3557b05c5a7dfd7d97ddfd3b79aefd53d25e5132.tar.gz cpython-3557b05c5a7dfd7d97ddfd3b79aefd53d25e5132.tar.bz2 |
bpo-31690: Allow the inline flags "a", "L", and "u" to be used as group flags for RE. (#3885)
Diffstat (limited to 'Lib')
-rw-r--r-- | Lib/sre_compile.py | 59 | ||||
-rw-r--r-- | Lib/sre_constants.py | 40 | ||||
-rw-r--r-- | Lib/sre_parse.py | 24 | ||||
-rw-r--r-- | Lib/test/test_re.py | 22 |
4 files changed, 101 insertions, 44 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 144620c..e5216b7 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -62,6 +62,12 @@ _equivalences = ( _ignorecase_fixes = {i: tuple(j for j in t if i != j) for t in _equivalences for i in t} +def _combine_flags(flags, add_flags, del_flags, + TYPE_FLAGS=sre_parse.TYPE_FLAGS): + if add_flags & TYPE_FLAGS: + flags &= ~TYPE_FLAGS + return (flags | add_flags) & ~del_flags + def _compile(code, pattern, flags): # internal: compile a (sub)pattern emit = code.append @@ -87,15 +93,21 @@ def _compile(code, pattern, flags): emit(op) emit(av) elif flags & SRE_FLAG_LOCALE: - emit(OP_LOC_IGNORE[op]) + emit(OP_LOCALE_IGNORE[op]) emit(av) elif not iscased(av): emit(op) emit(av) else: lo = tolower(av) - if fixes and lo in fixes: - emit(IN_IGNORE) + if not fixes: # ascii + emit(OP_IGNORE[op]) + emit(lo) + elif lo not in fixes: + emit(OP_UNICODE_IGNORE[op]) + emit(lo) + else: + emit(IN_UNI_IGNORE) skip = _len(code); emit(0) if op is NOT_LITERAL: emit(NEGATE) @@ -104,17 +116,16 @@ def _compile(code, pattern, flags): emit(k) emit(FAILURE) code[skip] = _len(code) - skip - else: - emit(OP_IGNORE[op]) - emit(lo) elif op is IN: charset, hascased = _optimize_charset(av, iscased, tolower, fixes) if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: emit(IN_LOC_IGNORE) - elif hascased: + elif not hascased: + emit(IN) + elif not fixes: # ascii emit(IN_IGNORE) else: - emit(IN) + emit(IN_UNI_IGNORE) skip = _len(code); emit(0) _compile_charset(charset, flags, code) code[skip] = _len(code) - skip @@ -153,8 +164,8 @@ def _compile(code, pattern, flags): if group: emit(MARK) emit((group-1)*2) - # _compile_info(code, p, (flags | add_flags) & ~del_flags) - _compile(code, p, (flags | add_flags) & ~del_flags) + # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags)) + _compile(code, p, _combine_flags(flags, add_flags, del_flags)) if group: emit(MARK) emit((group-1)*2+1) @@ -210,10 +221,14 @@ def _compile(code, pattern, flags): av = CH_UNICODE[av] emit(av) elif op is GROUPREF: - if flags & SRE_FLAG_IGNORECASE: - emit(OP_IGNORE[op]) - else: + if not flags & SRE_FLAG_IGNORECASE: emit(op) + elif flags & SRE_FLAG_LOCALE: + emit(GROUPREF_LOC_IGNORE) + elif not fixes: # ascii + emit(GROUPREF_IGNORE) + else: + emit(GROUPREF_UNI_IGNORE) emit(av-1) elif op is GROUPREF_EXISTS: emit(op) @@ -240,7 +255,7 @@ def _compile_charset(charset, flags, code): pass elif op is LITERAL: emit(av) - elif op is RANGE or op is RANGE_IGNORE: + elif op is RANGE or op is RANGE_UNI_IGNORE: emit(av[0]) emit(av[1]) elif op is CHARSET: @@ -309,9 +324,9 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None): hascased = True # There are only two ranges of cased non-BMP characters: # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), - # and for both ranges RANGE_IGNORE works. + # and for both ranges RANGE_UNI_IGNORE works. if op is RANGE: - op = RANGE_IGNORE + op = RANGE_UNI_IGNORE tail.append((op, av)) break @@ -456,7 +471,7 @@ def _get_literal_prefix(pattern, flags): prefixappend(av) elif op is SUBPATTERN: group, add_flags, del_flags, p = av - flags1 = (flags | add_flags) & ~del_flags + flags1 = _combine_flags(flags, add_flags, del_flags) if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE: break prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1) @@ -482,7 +497,7 @@ def _get_charset_prefix(pattern, flags): if op is not SUBPATTERN: break group, add_flags, del_flags, pattern = av - flags = (flags | add_flags) & ~del_flags + flags = _combine_flags(flags, add_flags, del_flags) if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE: return None @@ -631,6 +646,7 @@ def dis(code): print_(op) elif op in (LITERAL, NOT_LITERAL, LITERAL_IGNORE, NOT_LITERAL_IGNORE, + LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE, LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE): arg = code[i] i += 1 @@ -647,12 +663,12 @@ def dis(code): arg = str(CHCODES[arg]) assert arg[:9] == 'CATEGORY_' print_(op, arg[9:]) - elif op in (IN, IN_IGNORE, IN_LOC_IGNORE): + elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE): skip = code[i] print_(op, skip, to=i+skip) dis_(i+1, i+skip) i += skip - elif op in (RANGE, RANGE_IGNORE): + elif op in (RANGE, RANGE_UNI_IGNORE): lo, hi = code[i: i+2] i += 2 print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi))) @@ -671,7 +687,8 @@ def dis(code): print_2(_hex_code(code[i: i + 256//_CODEBITS])) i += 256//_CODEBITS level -= 1 - elif op in (MARK, GROUPREF, GROUPREF_IGNORE): + elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE, + GROUPREF_LOC_IGNORE): arg = code[i] i += 1 print_(op, arg) diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index 1daa7bd..13deb00 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -13,7 +13,7 @@ # update when constants are added or removed -MAGIC = 20170530 +MAGIC = 20171005 from _sre import MAXREPEAT, MAXGROUPS @@ -84,25 +84,37 @@ OPCODES = _makecodes(""" CALL CATEGORY CHARSET BIGCHARSET - GROUPREF GROUPREF_EXISTS GROUPREF_IGNORE - IN IN_IGNORE + GROUPREF GROUPREF_EXISTS + IN INFO JUMP - LITERAL LITERAL_IGNORE + LITERAL MARK MAX_UNTIL MIN_UNTIL - NOT_LITERAL NOT_LITERAL_IGNORE + NOT_LITERAL NEGATE RANGE REPEAT REPEAT_ONE SUBPATTERN MIN_REPEAT_ONE - RANGE_IGNORE + + GROUPREF_IGNORE + IN_IGNORE + LITERAL_IGNORE + NOT_LITERAL_IGNORE + + GROUPREF_LOC_IGNORE + IN_LOC_IGNORE LITERAL_LOC_IGNORE NOT_LITERAL_LOC_IGNORE - IN_LOC_IGNORE + + GROUPREF_UNI_IGNORE + IN_UNI_IGNORE + LITERAL_UNI_IGNORE + NOT_LITERAL_UNI_IGNORE + RANGE_UNI_IGNORE MIN_REPEAT MAX_REPEAT """) @@ -113,7 +125,9 @@ ATCODES = _makecodes(""" AT_BEGINNING AT_BEGINNING_LINE AT_BEGINNING_STRING AT_BOUNDARY AT_NON_BOUNDARY AT_END AT_END_LINE AT_END_STRING + AT_LOC_BOUNDARY AT_LOC_NON_BOUNDARY + AT_UNI_BOUNDARY AT_UNI_NON_BOUNDARY """) @@ -123,7 +137,9 @@ CHCODES = _makecodes(""" CATEGORY_SPACE CATEGORY_NOT_SPACE CATEGORY_WORD CATEGORY_NOT_WORD CATEGORY_LINEBREAK CATEGORY_NOT_LINEBREAK + CATEGORY_LOC_WORD CATEGORY_LOC_NOT_WORD + CATEGORY_UNI_DIGIT CATEGORY_UNI_NOT_DIGIT CATEGORY_UNI_SPACE CATEGORY_UNI_NOT_SPACE CATEGORY_UNI_WORD CATEGORY_UNI_NOT_WORD @@ -133,18 +149,20 @@ CHCODES = _makecodes(""" # replacement operations for "ignore case" mode OP_IGNORE = { - GROUPREF: GROUPREF_IGNORE, - IN: IN_IGNORE, LITERAL: LITERAL_IGNORE, NOT_LITERAL: NOT_LITERAL_IGNORE, - RANGE: RANGE_IGNORE, } -OP_LOC_IGNORE = { +OP_LOCALE_IGNORE = { LITERAL: LITERAL_LOC_IGNORE, NOT_LITERAL: NOT_LITERAL_LOC_IGNORE, } +OP_UNICODE_IGNORE = { + LITERAL: LITERAL_UNI_IGNORE, + NOT_LITERAL: NOT_LITERAL_UNI_IGNORE, +} + AT_MULTILINE = { AT_BEGINNING: AT_BEGINNING_LINE, AT_END: AT_END_LINE diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 5452520..8527412 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -65,8 +65,8 @@ FLAGS = { "u": SRE_FLAG_UNICODE, } -GLOBAL_FLAGS = (SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE | - SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE) +TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE +GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE class Verbose(Exception): pass @@ -822,7 +822,19 @@ def _parse_flags(source, state, char): del_flags = 0 if char != "-": while True: - add_flags |= FLAGS[char] + flag = FLAGS[char] + if source.istext: + if char == 'L': + msg = "bad inline flags: cannot use 'L' flag with a str pattern" + raise source.error(msg) + else: + if char == 'u': + msg = "bad inline flags: cannot use 'u' flag with a bytes pattern" + raise source.error(msg) + add_flags |= flag + if (flag & TYPE_FLAGS) and (add_flags & TYPE_FLAGS) != flag: + msg = "bad inline flags: flags 'a', 'u' and 'L' are incompatible" + raise source.error(msg) char = sourceget() if char is None: raise source.error("missing -, : or )") @@ -844,7 +856,11 @@ def _parse_flags(source, state, char): msg = "unknown flag" if char.isalpha() else "missing flag" raise source.error(msg, len(char)) while True: - del_flags |= FLAGS[char] + flag = FLAGS[char] + if flag & TYPE_FLAGS: + msg = "bad inline flags: cannot turn off flags 'a', 'u' and 'L'" + raise source.error(msg) + del_flags |= flag char = sourceget() if char is None: raise source.error("missing :") diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 9cb426a..fc015e4 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -1470,11 +1470,11 @@ class ReTests(unittest.TestCase): self.assertIsNone(pat.match(b'\xe0')) # Incompatibilities self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE) - self.assertRaises(ValueError, re.compile, br'(?u)\w') + self.assertRaises(re.error, re.compile, br'(?u)\w') self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII) self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII) self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE) - self.assertRaises(ValueError, re.compile, r'(?au)\w') + self.assertRaises(re.error, re.compile, r'(?au)\w') def test_locale_flag(self): import locale @@ -1516,11 +1516,11 @@ class ReTests(unittest.TestCase): self.assertIsNone(pat.match(bletter)) # Incompatibilities self.assertRaises(ValueError, re.compile, '', re.LOCALE) - self.assertRaises(ValueError, re.compile, '(?L)') + self.assertRaises(re.error, re.compile, '(?L)') self.assertRaises(ValueError, re.compile, b'', re.LOCALE | re.ASCII) self.assertRaises(ValueError, re.compile, b'(?L)', re.ASCII) self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE) - self.assertRaises(ValueError, re.compile, b'(?aL)') + self.assertRaises(re.error, re.compile, b'(?aL)') def test_scoped_flags(self): self.assertTrue(re.match(r'(?i:a)b', 'Ab')) @@ -1535,12 +1535,18 @@ class ReTests(unittest.TestCase): self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE)) self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE)) - self.checkPatternError(r'(?a:\w)', - 'bad inline flags: cannot turn on global flag', 3) + self.assertTrue(re.match(r'\w(?a:\W)\w', '\xe0\xe0\xe0')) + self.assertTrue(re.match(r'(?a:\W(?u:\w)\W)', '\xe0\xe0\xe0')) + self.assertTrue(re.match(r'\W(?u:\w)\W', '\xe0\xe0\xe0', re.ASCII)) + self.checkPatternError(r'(?a)(?-a:\w)', - 'bad inline flags: cannot turn off global flag', 8) + "bad inline flags: cannot turn off flags 'a', 'u' and 'L'", 8) self.checkPatternError(r'(?i-i:a)', - 'bad inline flags: flag turned on and off', 5) + 'bad inline flags: flag turned on and off', 5) + self.checkPatternError(r'(?au:a)', + "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4) + self.checkPatternError(br'(?aL:a)', + "bad inline flags: flags 'a', 'u' and 'L' are incompatible", 4) self.checkPatternError(r'(?-', 'missing flag', 3) self.checkPatternError(r'(?-+', 'missing flag', 3) |