From 4ab6abfca4d6e444cca04821b24701cde6993f4e Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka <storchaka@gmail.com>
Date: Sun, 14 May 2017 09:05:13 +0300
Subject: bpo-30299: Display the bytecode when compiling a regex in debug mode.
 (#1491)

`re.compile(..., re.DEBUG)` now displays the compiled bytecode in
human-readable form.
---
 Lib/sre_compile.py  | 148 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 Lib/test/test_re.py |  27 ++++++++++
 Misc/NEWS           |   3 ++
 3 files changed, 177 insertions(+), 1 deletion(-)

diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index aeb89bc..144620c 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -595,6 +595,150 @@ def _code(p, flags):
 
     return code
 
+def _hex_code(code):  # Render items of `code` as 0x-prefixed hex, zero-padded to CODESIZE*2+2 chars, in a bracketed list.
+    return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code)
+
+def dis(code):  # Print a human-readable listing of the code sequence `code` to stdout.
+    import sys
+
+    labels = set()  # offsets recorded as targets through print_(to=...)
+    level = 0  # current nesting depth; drives the indentation of printed lines
+    offset_width = len(str(len(code) - 1))  # digits needed for the largest offset
+
+    def dis_(start, end):  # list code[start:end], recursing into nested spans
+        def print_(*args, to=None):  # print one line prefixed with the current `start` offset
+            if to is not None:
+                labels.add(to)
+                args += ('(to %d)' % (to,),)  # show the target as a trailing field
+            print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),  # ':' marks an offset already recorded as a target
+                  end='  '*(level-1))  # two spaces of indent per level below the first
+            print(*args)
+
+        def print_2(*args):  # print a continuation line without an offset column
+            print(end=' '*(offset_width + 2*level))
+            print(*args)
+
+        nonlocal level  # shared with dis() and every recursive dis_ call
+        level += 1
+        i = start
+        while i < end:
+            start = i  # print_ reads `start` from this scope, so keep it at the current opcode
+            op = code[i]
+            i += 1
+            op = OPCODES[op]  # look up the opcode object for the numeric value
+            if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
+                      MAX_UNTIL, MIN_UNTIL, NEGATE):
+                print_(op)
+            elif op in (LITERAL, NOT_LITERAL,
+                        LITERAL_IGNORE, NOT_LITERAL_IGNORE,
+                        LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
+                arg = code[i]
+                i += 1
+                print_(op, '%#02x (%r)' % (arg, chr(arg)))
+            elif op is AT:
+                arg = code[i]
+                i += 1
+                arg = str(ATCODES[arg])
+                assert arg[:3] == 'AT_'
+                print_(op, arg[3:])  # drop the 'AT_' prefix from the printed name
+            elif op is CATEGORY:
+                arg = code[i]
+                i += 1
+                arg = str(CHCODES[arg])
+                assert arg[:9] == 'CATEGORY_'
+                print_(op, arg[9:])  # drop the 'CATEGORY_' prefix from the printed name
+            elif op in (IN, IN_IGNORE, IN_LOC_IGNORE):
+                skip = code[i]
+                print_(op, skip, to=i+skip)  # skip is counted from the offset of the skip word itself
+                dis_(i+1, i+skip)
+                i += skip  # continue after the nested span
+            elif op in (RANGE, RANGE_IGNORE):
+                lo, hi = code[i: i+2]
+                i += 2
+                print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
+            elif op is CHARSET:
+                print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
+                i += 256//_CODEBITS
+            elif op is BIGCHARSET:
+                arg = code[i]
+                i += 1
+                mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)  # split each word into its bytes, native byte order
+                                        for x in code[i: i + 256//_sre.CODESIZE]))
+                print_(op, arg, mapping)
+                i += 256//_sre.CODESIZE
+                level += 1
+                for j in range(arg):  # `arg` blocks follow, each printed as one hex row
+                    print_2(_hex_code(code[i: i + 256//_CODEBITS]))
+                    i += 256//_CODEBITS
+                level -= 1
+            elif op in (MARK, GROUPREF, GROUPREF_IGNORE):
+                arg = code[i]
+                i += 1
+                print_(op, arg)
+            elif op is JUMP:
+                skip = code[i]
+                print_(op, skip, to=i+skip)
+                i += 1
+            elif op is BRANCH:
+                skip = code[i]
+                print_(op, skip, to=i+skip)
+                while skip:  # a zero skip word ends the chain of alternatives
+                    dis_(i+1, i+skip)
+                    i += skip
+                    start = i  # so the next print_ shows the offset of this skip word
+                    skip = code[i]
+                    if skip:
+                        print_('branch', skip, to=i+skip)
+                    else:
+                        print_(FAILURE)
+                i += 1  # step over the terminating zero word
+            elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE):
+                skip, min, max = code[i: i+3]
+                if max == MAXREPEAT:
+                    max = 'MAXREPEAT'  # print the name rather than the number
+                print_(op, skip, min, max, to=i+skip)
+                dis_(i+3, i+skip)
+                i += skip
+            elif op is GROUPREF_EXISTS:
+                arg, skip = code[i: i+2]
+                print_(op, arg, skip, to=i+skip)
+                i += 2
+            elif op in (ASSERT, ASSERT_NOT):
+                skip, arg = code[i: i+2]
+                print_(op, skip, arg, to=i+skip)
+                dis_(i+2, i+skip)
+                i += skip
+            elif op is INFO:
+                skip, flags, min, max = code[i: i+4]
+                if max == MAXREPEAT:
+                    max = 'MAXREPEAT'  # print the name rather than the number
+                print_(op, skip, bin(flags), min, max, to=i+skip)
+                start = i+4  # first word after the four INFO header words
+                if flags & SRE_INFO_PREFIX:
+                    prefix_len, prefix_skip = code[i+4: i+6]
+                    print_2('  prefix_skip', prefix_skip)
+                    start = i + 6
+                    prefix = code[start: start+prefix_len]
+                    print_2('  prefix',
+                            '[%s]' % ', '.join('%#02x' % x for x in prefix),
+                            '(%r)' % ''.join(map(chr, prefix)))
+                    start += prefix_len  # move past the prefix words
+                    print_2('  overlap', code[start: start+prefix_len])
+                    start += prefix_len  # move past the overlap words (same count as the prefix)
+                if flags & SRE_INFO_CHARSET:
+                    level += 1
+                    print_2('in')
+                    dis_(start, i+skip)  # the rest of the INFO span is listed as nested code
+                    level -= 1
+                i += skip
+            else:
+                raise ValueError(op)  # opcode not handled by any branch above
+
+        level -= 1
+
+    dis_(0, len(code))  # list the whole sequence
+
+
 def compile(p, flags=0):
     # internal: convert pattern list to internal format
 
@@ -606,7 +750,9 @@ def compile(p, flags=0):
 
     code = _code(p, flags)
 
-    # print(code)
+    if flags & SRE_FLAG_DEBUG:
+        print()
+        dis(code)
 
     # map in either direction
     groupindex = p.pattern.groupdict
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 5d36b54..1bb2654 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -1688,10 +1688,12 @@ class ReTests(unittest.TestCase):
                 self.assertEqual(m.group(1), "")
                 self.assertEqual(m.group(2), "y")
 
+    @cpython_only
     def test_debug_flag(self):
         pat = r'(\.)(?:[ch]|py)(?(1)$|: )'
         with captured_stdout() as out:
             re.compile(pat, re.DEBUG)
+        self.maxDiff = None
         dump = '''\
 SUBPATTERN 1 0 0
   LITERAL 46
@@ -1707,6 +1709,31 @@ GROUPREF_EXISTS 1
 ELSE
   LITERAL 58
   LITERAL 32
+
+ 0. INFO 8 0b1 2 5 (to 9)
+      prefix_skip 0
+      prefix [0x2e] ('.')
+      overlap [0]
+ 9: MARK 0
+11. LITERAL 0x2e ('.')
+13. MARK 1
+15. BRANCH 10 (to 26)
+17.   IN 6 (to 24)
+19.     LITERAL 0x63 ('c')
+21.     LITERAL 0x68 ('h')
+23.     FAILURE
+24:   JUMP 9 (to 34)
+26: branch 7 (to 33)
+27.   LITERAL 0x70 ('p')
+29.   LITERAL 0x79 ('y')
+31.   JUMP 2 (to 34)
+33: FAILURE
+34: GROUPREF_EXISTS 0 6 (to 41)
+37. AT END
+39. JUMP 5 (to 45)
+41: LITERAL 0x3a (':')
+43. LITERAL 0x20 (' ')
+45: SUCCESS
 '''
         self.assertEqual(out.getvalue(), dump)
         # Debug output is output again even a second time (bypassing
diff --git a/Misc/NEWS b/Misc/NEWS
index e6b4ced..bf19b25 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -323,6 +323,9 @@ Extension Modules
 Library
 -------
 
+- bpo-30299: Compiling a regular expression in debug mode on CPython now
+  displays the compiled bytecode in human-readable form.
+
 - bpo-30048: Fixed ``Task.cancel()`` can be ignored when the task is
   running coroutine and the coroutine returned without any more ``await``.
 
-- 
cgit v0.12