summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Serhiy Storchaka <storchaka@gmail.com> 2017-05-14 06:05:13 (GMT)
committer GitHub <noreply@github.com> 2017-05-14 06:05:13 (GMT)
commit 4ab6abfca4d6e444cca04821b24701cde6993f4e (patch)
tree 3fa4eb3ddab7c00ec87cf3a369a03b5e5fcce518
parent 821a9d146bc04a1bc1a9807962990a1f59d692b8 (diff)
download cpython-4ab6abfca4d6e444cca04821b24701cde6993f4e.zip
cpython-4ab6abfca4d6e444cca04821b24701cde6993f4e.tar.gz
cpython-4ab6abfca4d6e444cca04821b24701cde6993f4e.tar.bz2
bpo-30299: Display the bytecode when compiling a regex in debug mode. (#1491)
`re.compile(..., re.DEBUG)` now displays the compiled bytecode in human-readable form.
-rw-r--r-- Lib/sre_compile.py 148
-rw-r--r-- Lib/test/test_re.py 27
-rw-r--r-- Misc/NEWS 3
3 files changed, 177 insertions, 1 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index aeb89bc..144620c 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -595,6 +595,150 @@ def _code(p, flags):
return code
+def _hex_code(code):
+ return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code)
+
+def dis(code):
+ import sys
+
+ labels = set()
+ level = 0
+ offset_width = len(str(len(code) - 1))
+
+ def dis_(start, end):
+ def print_(*args, to=None):
+ if to is not None:
+ labels.add(to)
+ args += ('(to %d)' % (to,),)
+ print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),
+ end=' '*(level-1))
+ print(*args)
+
+ def print_2(*args):
+ print(end=' '*(offset_width + 2*level))
+ print(*args)
+
+ nonlocal level
+ level += 1
+ i = start
+ while i < end:
+ start = i
+ op = code[i]
+ i += 1
+ op = OPCODES[op]
+ if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
+ MAX_UNTIL, MIN_UNTIL, NEGATE):
+ print_(op)
+ elif op in (LITERAL, NOT_LITERAL,
+ LITERAL_IGNORE, NOT_LITERAL_IGNORE,
+ LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
+ arg = code[i]
+ i += 1
+ print_(op, '%#02x (%r)' % (arg, chr(arg)))
+ elif op is AT:
+ arg = code[i]
+ i += 1
+ arg = str(ATCODES[arg])
+ assert arg[:3] == 'AT_'
+ print_(op, arg[3:])
+ elif op is CATEGORY:
+ arg = code[i]
+ i += 1
+ arg = str(CHCODES[arg])
+ assert arg[:9] == 'CATEGORY_'
+ print_(op, arg[9:])
+ elif op in (IN, IN_IGNORE, IN_LOC_IGNORE):
+ skip = code[i]
+ print_(op, skip, to=i+skip)
+ dis_(i+1, i+skip)
+ i += skip
+ elif op in (RANGE, RANGE_IGNORE):
+ lo, hi = code[i: i+2]
+ i += 2
+ print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
+ elif op is CHARSET:
+ print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
+ i += 256//_CODEBITS
+ elif op is BIGCHARSET:
+ arg = code[i]
+ i += 1
+ mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)
+ for x in code[i: i + 256//_sre.CODESIZE]))
+ print_(op, arg, mapping)
+ i += 256//_sre.CODESIZE
+ level += 1
+ for j in range(arg):
+ print_2(_hex_code(code[i: i + 256//_CODEBITS]))
+ i += 256//_CODEBITS
+ level -= 1
+ elif op in (MARK, GROUPREF, GROUPREF_IGNORE):
+ arg = code[i]
+ i += 1
+ print_(op, arg)
+ elif op is JUMP:
+ skip = code[i]
+ print_(op, skip, to=i+skip)
+ i += 1
+ elif op is BRANCH:
+ skip = code[i]
+ print_(op, skip, to=i+skip)
+ while skip:
+ dis_(i+1, i+skip)
+ i += skip
+ start = i
+ skip = code[i]
+ if skip:
+ print_('branch', skip, to=i+skip)
+ else:
+ print_(FAILURE)
+ i += 1
+ elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE):
+ skip, min, max = code[i: i+3]
+ if max == MAXREPEAT:
+ max = 'MAXREPEAT'
+ print_(op, skip, min, max, to=i+skip)
+ dis_(i+3, i+skip)
+ i += skip
+ elif op is GROUPREF_EXISTS:
+ arg, skip = code[i: i+2]
+ print_(op, arg, skip, to=i+skip)
+ i += 2
+ elif op in (ASSERT, ASSERT_NOT):
+ skip, arg = code[i: i+2]
+ print_(op, skip, arg, to=i+skip)
+ dis_(i+2, i+skip)
+ i += skip
+ elif op is INFO:
+ skip, flags, min, max = code[i: i+4]
+ if max == MAXREPEAT:
+ max = 'MAXREPEAT'
+ print_(op, skip, bin(flags), min, max, to=i+skip)
+ start = i+4
+ if flags & SRE_INFO_PREFIX:
+ prefix_len, prefix_skip = code[i+4: i+6]
+ print_2(' prefix_skip', prefix_skip)
+ start = i + 6
+ prefix = code[start: start+prefix_len]
+ print_2(' prefix',
+ '[%s]' % ', '.join('%#02x' % x for x in prefix),
+ '(%r)' % ''.join(map(chr, prefix)))
+ start += prefix_len
+ print_2(' overlap', code[start: start+prefix_len])
+ start += prefix_len
+ if flags & SRE_INFO_CHARSET:
+ level += 1
+ print_2('in')
+ dis_(start, i+skip)
+ level -= 1
+ i += skip
+ else:
+ raise ValueError(op)
+
+ level -= 1
+
+ dis_(0, len(code))
+
+
def compile(p, flags=0):
# internal: convert pattern list to internal format
@@ -606,7 +750,9 @@ def compile(p, flags=0):
code = _code(p, flags)
- # print(code)
+ if flags & SRE_FLAG_DEBUG:
+ print()
+ dis(code)
# map in either direction
groupindex = p.pattern.groupdict
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 5d36b54..1bb2654 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -1688,10 +1688,12 @@ class ReTests(unittest.TestCase):
self.assertEqual(m.group(1), "")
self.assertEqual(m.group(2), "y")
+ @cpython_only
def test_debug_flag(self):
pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
with captured_stdout() as out:
re.compile(pat, re.DEBUG)
+ self.maxDiff = None
dump = '''\
SUBPATTERN 1 0 0
LITERAL 46
@@ -1707,6 +1709,31 @@ GROUPREF_EXISTS 1
ELSE
LITERAL 58
LITERAL 32
+
+ 0. INFO 8 0b1 2 5 (to 9)
+ prefix_skip 0
+ prefix [0x2e] ('.')
+ overlap [0]
+ 9: MARK 0
+11. LITERAL 0x2e ('.')
+13. MARK 1
+15. BRANCH 10 (to 26)
+17. IN 6 (to 24)
+19. LITERAL 0x63 ('c')
+21. LITERAL 0x68 ('h')
+23. FAILURE
+24: JUMP 9 (to 34)
+26: branch 7 (to 33)
+27. LITERAL 0x70 ('p')
+29. LITERAL 0x79 ('y')
+31. JUMP 2 (to 34)
+33: FAILURE
+34: GROUPREF_EXISTS 0 6 (to 41)
+37. AT END
+39. JUMP 5 (to 45)
+41: LITERAL 0x3a (':')
+43. LITERAL 0x20 (' ')
+45: SUCCESS
'''
self.assertEqual(out.getvalue(), dump)
# Debug output is output again even a second time (bypassing
diff --git a/Misc/NEWS b/Misc/NEWS
index e6b4ced..bf19b25 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -323,6 +323,9 @@ Extension Modules
Library
-------
+- bpo-30299: Compiling a regular expression in debug mode on CPython now
+ displays the compiled bytecode in human-readable form.
+
- bpo-30048: Fixed ``Task.cancel()`` can be ignored when the task is
running coroutine and the coroutine returned without any more ``await``.