summary | refs | log | tree | commit | diff | stats
path: root/Lib/sre_compile.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/sre_compile.py')
-rw-r--r-- Lib/sre_compile.py 57
1 file changed, 12 insertions, 45 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index f24f681..1b3e9f8 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -169,13 +169,13 @@ def _compile(code, pattern, flags):
def _compile_charset(charset, flags, code, fixup=None):
# compile charset subprogram
emit = code.append
- for op, av in _optimize_charset(charset, fixup, flags & SRE_FLAG_UNICODE):
+ for op, av in _optimize_charset(charset, fixup):
emit(OPCODES[op])
if op is NEGATE:
pass
elif op is LITERAL:
emit(av)
- elif op is RANGE:
+ elif op is RANGE or op is RANGE_IGNORE:
emit(av[0])
emit(av[1])
elif op is CHARSET:
@@ -193,7 +193,7 @@ def _compile_charset(charset, flags, code, fixup=None):
raise error("internal: unsupported set operator")
emit(OPCODES[FAILURE])
-def _optimize_charset(charset, fixup, isunicode):
+def _optimize_charset(charset, fixup):
# internal: optimize character set
out = []
tail = []
@@ -202,10 +202,9 @@ def _optimize_charset(charset, fixup, isunicode):
while True:
try:
if op is LITERAL:
- i = av
if fixup:
- i = fixup(i)
- charmap[i] = 1
+ av = fixup(av)
+ charmap[av] = 1
elif op is RANGE:
r = range(av[0], av[1]+1)
if fixup:
@@ -221,21 +220,13 @@ def _optimize_charset(charset, fixup, isunicode):
# character set contains non-UCS1 character codes
charmap += b'\0' * 0xff00
continue
- # character set contains non-BMP character codes
- if fixup and isunicode and op is RANGE:
- lo, hi = av
- ranges = [av]
- # There are only two ranges of cased astral characters:
- # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi).
- _fixup_range(max(0x10000, lo), min(0x11fff, hi),
- ranges, fixup)
- for lo, hi in ranges:
- if lo == hi:
- tail.append((LITERAL, hi))
- else:
- tail.append((RANGE, (lo, hi)))
- else:
- tail.append((op, av))
+ # Character set contains non-BMP character codes.
+ # There are only two ranges of cased non-BMP characters:
+ # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
+ # and for both ranges RANGE_IGNORE works.
+ if fixup and op is RANGE:
+ op = RANGE_IGNORE
+ tail.append((op, av))
break
# compress character map
@@ -313,24 +304,6 @@ def _optimize_charset(charset, fixup, isunicode):
out += tail
return out
-def _fixup_range(lo, hi, ranges, fixup):
- for i in map(fixup, range(lo, hi+1)):
- for k, (lo, hi) in enumerate(ranges):
- if i < lo:
- if l == lo - 1:
- ranges[k] = (i, hi)
- else:
- ranges.insert(k, (i, i))
- break
- elif i > hi:
- if i == hi + 1:
- ranges[k] = (lo, i)
- break
- else:
- break
- else:
- ranges.append((i, i))
-
_CODEBITS = _sre.CODESIZE * 8
_BITS_TRANS = b'0' + b'1' * 255
def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
@@ -504,12 +477,6 @@ def compile(p, flags=0):
# print code
- # XXX: <fl> get rid of this limitation!
- if p.pattern.groups > 100:
- raise AssertionError(
- "sorry, but this version only supports 100 named groups"
- )
-
# map in either direction
groupindex = p.pattern.groupdict
indexgroup = [None] * p.pattern.groups