Clean up and improve the regex tokenizer example. (GH-10426)

1) Convert weird field name "typ" to the more standard "type". 2) For the NUMBER type, convert the value to an int() or float(). 3) Simplify ``group(kind)`` to the shorter and faster ``group()`` call. 4) Simplify the logic to a single if-elif chain to make this easier to extend. 5) Reorder the tests to match the order in which the tokens are specified. This isn't necessary for correctness but does make the example easier to follow. 6) Move the "column" calculation before the if-elif chain so that users have the option of using this value in error messages.
author: Raymond Hettinger <rhettinger@users.noreply.github.com> 2018-11-09 09:19:33 (GMT)
committer: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> 2018-11-09 09:19:33 (GMT)
commit: b83942c755a78f6d917743b73ed87a8fd9f367de (patch)
tree: c19b245c1e930d1ac66a1a587f72b12ce16811ac
parent: 216aaaa0564951f2ca408e2086ba60d849d52902 (diff)
download: cpython-b83942c755a78f6d917743b73ed87a8fd9f367de.zip
cpython-b83942c755a78f6d917743b73ed87a8fd9f367de.tar.gz
cpython-b83942c755a78f6d917743b73ed87a8fd9f367de.tar.bz2
1 files changed, 38 insertions, 36 deletions
diff --git a/Doc/library/re.rst b/Doc/library/re.rst
index 31fb628..2f82955 100644
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -1609,38 +1609,40 @@ successive matches::
     import collections
     import re
 
-    Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column'])
+    Token = collections.namedtuple('Token', ['type', 'value', 'line', 'column'])
 
     def tokenize(code):
         keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
         token_specification = [
-            ('NUMBER',  r'\d+(\.\d*)?'),  # Integer or decimal number
-            ('ASSIGN',  r':='),           # Assignment operator
-            ('END',     r';'),            # Statement terminator
-            ('ID',      r'[A-Za-z]+'),    # Identifiers
-            ('OP',      r'[+\-*/]'),      # Arithmetic operators
-            ('NEWLINE', r'\n'),           # Line endings
-            ('SKIP',    r'[ \t]+'),       # Skip over spaces and tabs
-            ('MISMATCH',r'.'),            # Any other character
+            ('NUMBER',   r'\d+(\.\d*)?'),  # Integer or decimal number
+            ('ASSIGN',   r':='),           # Assignment operator
+            ('END',      r';'),            # Statement terminator
+            ('ID',       r'[A-Za-z]+'),    # Identifiers
+            ('OP',       r'[+\-*/]'),      # Arithmetic operators
+            ('NEWLINE',  r'\n'),           # Line endings
+            ('SKIP',     r'[ \t]+'),       # Skip over spaces and tabs
+            ('MISMATCH', r'.'),            # Any other character
         ]
         tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
         line_num = 1
         line_start = 0
         for mo in re.finditer(tok_regex, code):
             kind = mo.lastgroup
-            value = mo.group(kind)
-            if kind == 'NEWLINE':
+            value = mo.group()
+            column = mo.start() - line_start
+            if kind == 'NUMBER':
+                value = float(value) if '.' in value else int(value)
+            elif kind == 'ID' and value in keywords:
+                kind = value
+            elif kind == 'NEWLINE':
                 line_start = mo.end()
                 line_num += 1
+                continue
             elif kind == 'SKIP':
-                pass
+                continue
             elif kind == 'MISMATCH':
                 raise RuntimeError(f'{value!r} unexpected on line {line_num}')
-            else:
-                if kind == 'ID' and value in keywords:
-                    kind = value
-                column = mo.start() - line_start
-                yield Token(kind, value, line_num, column)
+            yield Token(kind, value, line_num, column)
 
     statements = '''
         IF quantity THEN
@@ -1654,25 +1656,25 @@ successive matches::
 
 The tokenizer produces the following output::
 
-    Token(typ='IF', value='IF', line=2, column=4)
-    Token(typ='ID', value='quantity', line=2, column=7)
-    Token(typ='THEN', value='THEN', line=2, column=16)
-    Token(typ='ID', value='total', line=3, column=8)
-    Token(typ='ASSIGN', value=':=', line=3, column=14)
-    Token(typ='ID', value='total', line=3, column=17)
-    Token(typ='OP', value='+', line=3, column=23)
-    Token(typ='ID', value='price', line=3, column=25)
-    Token(typ='OP', value='*', line=3, column=31)
-    Token(typ='ID', value='quantity', line=3, column=33)
-    Token(typ='END', value=';', line=3, column=41)
-    Token(typ='ID', value='tax', line=4, column=8)
-    Token(typ='ASSIGN', value=':=', line=4, column=12)
-    Token(typ='ID', value='price', line=4, column=15)
-    Token(typ='OP', value='*', line=4, column=21)
-    Token(typ='NUMBER', value='0.05', line=4, column=23)
-    Token(typ='END', value=';', line=4, column=27)
-    Token(typ='ENDIF', value='ENDIF', line=5, column=4)
-    Token(typ='END', value=';', line=5, column=9)
+    Token(type='IF', value='IF', line=2, column=4)
+    Token(type='ID', value='quantity', line=2, column=7)
+    Token(type='THEN', value='THEN', line=2, column=16)
+    Token(type='ID', value='total', line=3, column=8)
+    Token(type='ASSIGN', value=':=', line=3, column=14)
+    Token(type='ID', value='total', line=3, column=17)
+    Token(type='OP', value='+', line=3, column=23)
+    Token(type='ID', value='price', line=3, column=25)
+    Token(type='OP', value='*', line=3, column=31)
+    Token(type='ID', value='quantity', line=3, column=33)
+    Token(type='END', value=';', line=3, column=41)
+    Token(type='ID', value='tax', line=4, column=8)
+    Token(type='ASSIGN', value=':=', line=4, column=12)
+    Token(type='ID', value='price', line=4, column=15)
+    Token(type='OP', value='*', line=4, column=21)
+    Token(type='NUMBER', value=0.05, line=4, column=23)
+    Token(type='END', value=';', line=4, column=27)
+    Token(type='ENDIF', value='ENDIF', line=5, column=4)
+    Token(type='END', value=';', line=5, column=9)
 
 
 .. [Frie09] Friedl, Jeffrey. Mastering Regular Expressions. 3rd ed., O'Reilly
author	Raymond Hettinger <rhettinger@users.noreply.github.com>	2018-11-09 09:19:33 (GMT)
committer	Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>	2018-11-09 09:19:33 (GMT)
commit	b83942c755a78f6d917743b73ed87a8fd9f367de (patch)
tree	c19b245c1e930d1ac66a1a587f72b12ce16811ac
parent	216aaaa0564951f2ca408e2086ba60d849d52902 (diff)
download	cpython-b83942c755a78f6d917743b73ed87a8fd9f367de.zip cpython-b83942c755a78f6d917743b73ed87a8fd9f367de.tar.gz cpython-b83942c755a78f6d917743b73ed87a8fd9f367de.tar.bz2