diff options
Diffstat (limited to 'Lib/sre_compile.py')
-rw-r--r-- | Lib/sre_compile.py | 57 |
1 files changed, 12 insertions, 45 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index f24f681..1b3e9f8 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -169,13 +169,13 @@ def _compile(code, pattern, flags): def _compile_charset(charset, flags, code, fixup=None): # compile charset subprogram emit = code.append - for op, av in _optimize_charset(charset, fixup, flags & SRE_FLAG_UNICODE): + for op, av in _optimize_charset(charset, fixup): emit(OPCODES[op]) if op is NEGATE: pass elif op is LITERAL: emit(av) - elif op is RANGE: + elif op is RANGE or op is RANGE_IGNORE: emit(av[0]) emit(av[1]) elif op is CHARSET: @@ -193,7 +193,7 @@ def _compile_charset(charset, flags, code, fixup=None): raise error("internal: unsupported set operator") emit(OPCODES[FAILURE]) -def _optimize_charset(charset, fixup, isunicode): +def _optimize_charset(charset, fixup): # internal: optimize character set out = [] tail = [] @@ -202,10 +202,9 @@ def _optimize_charset(charset, fixup, isunicode): while True: try: if op is LITERAL: - i = av if fixup: - i = fixup(i) - charmap[i] = 1 + av = fixup(av) + charmap[av] = 1 elif op is RANGE: r = range(av[0], av[1]+1) if fixup: @@ -221,21 +220,13 @@ def _optimize_charset(charset, fixup, isunicode): # character set contains non-UCS1 character codes charmap += b'\0' * 0xff00 continue - # character set contains non-BMP character codes - if fixup and isunicode and op is RANGE: - lo, hi = av - ranges = [av] - # There are only two ranges of cased astral characters: - # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi). - _fixup_range(max(0x10000, lo), min(0x11fff, hi), - ranges, fixup) - for lo, hi in ranges: - if lo == hi: - tail.append((LITERAL, hi)) - else: - tail.append((RANGE, (lo, hi))) - else: - tail.append((op, av)) + # Character set contains non-BMP character codes. + # There are only two ranges of cased non-BMP characters: + # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi), + # and for both ranges RANGE_IGNORE works. + if fixup and op is RANGE: + op = RANGE_IGNORE + tail.append((op, av)) break # compress character map @@ -313,24 +304,6 @@ def _optimize_charset(charset, fixup, isunicode): out += tail return out -def _fixup_range(lo, hi, ranges, fixup): - for i in map(fixup, range(lo, hi+1)): - for k, (lo, hi) in enumerate(ranges): - if i < lo: - if l == lo - 1: - ranges[k] = (i, hi) - else: - ranges.insert(k, (i, i)) - break - elif i > hi: - if i == hi + 1: - ranges[k] = (lo, i) - break - else: - break - else: - ranges.append((i, i)) - _CODEBITS = _sre.CODESIZE * 8 _BITS_TRANS = b'0' + b'1' * 255 def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int): @@ -504,12 +477,6 @@ def compile(p, flags=0): # print code - # XXX: <fl> get rid of this limitation! - if p.pattern.groups > 100: - raise AssertionError( - "sorry, but this version only supports 100 named groups" - ) - # map in either direction groupindex = p.pattern.groupdict indexgroup = [None] * p.pattern.groups |