summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Raymond Hettinger <python@rcn.com>, 2014-08-04 06:38:54 (GMT)
committer: Raymond Hettinger <python@rcn.com>, 2014-08-04 06:38:54 (GMT)
commit: c566431bf0e91c5a235dbf42b2d80ae2afa44548 (patch)
tree: 446ac3fb19842dbc73d5db13c8b47ecd5e5a1962
parent: 4036d87f4dfae69b0ff8850a3f5d6ce65cecd172 (diff)
download: cpython-c566431bf0e91c5a235dbf42b2d80ae2afa44548.zip
          cpython-c566431bf0e91c5a235dbf42b2d80ae2afa44548.tar.gz
          cpython-c566431bf0e91c5a235dbf42b2d80ae2afa44548.tar.bz2
Improve regex tokenizer example by using re.finditer().
Also, improve variable names and fix column numbers in the generated output.
-rw-r--r--  Doc/library/re.rst  75
1 file changed, 38 insertions, 37 deletions
diff --git a/Doc/library/re.rst b/Doc/library/re.rst
index a835c14..48495a2 100644
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -1333,7 +1333,7 @@ successive matches::
Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column'])
- def tokenize(s):
+ def tokenize(code):
keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
token_specification = [
('NUMBER', r'\d+(\.\d*)?'), # Integer or decimal number
@@ -1343,26 +1343,27 @@ successive matches::
('OP', r'[+\-*/]'), # Arithmetic operators
('NEWLINE', r'\n'), # Line endings
('SKIP', r'[ \t]+'), # Skip over spaces and tabs
+ ('MISMATCH',r'.'), # Any other character
]
tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
get_token = re.compile(tok_regex).match
- line = 1
- pos = line_start = 0
- mo = get_token(s)
- while mo is not None:
- typ = mo.lastgroup
- if typ == 'NEWLINE':
- line_start = pos
- line += 1
- elif typ != 'SKIP':
- val = mo.group(typ)
- if typ == 'ID' and val in keywords:
- typ = val
- yield Token(typ, val, line, mo.start()-line_start)
- pos = mo.end()
- mo = get_token(s, pos)
- if pos != len(s):
- raise RuntimeError('Unexpected character %r on line %d' %(s[pos], line))
+ line_num = 1
+ line_start = 0
+ for mo in re.finditer(tok_regex, code):
+ kind = mo.lastgroup
+ value = mo.group(kind)
+ if kind == 'NEWLINE':
+ line_start = mo.end()
+ line_num += 1
+ elif kind == 'SKIP':
+ pass
+ elif kind == 'MISMATCH':
+ raise RuntimeError('%r unexpected on line %d' % (value, line_num))
+ else:
+ if kind == 'ID' and value in keywords:
+ kind = value
+ column = mo.start() - line_start
+ yield Token(kind, value, line_num, column)
statements = '''
IF quantity THEN
@@ -1376,22 +1377,22 @@ successive matches::
The tokenizer produces the following output::
- Token(typ='IF', value='IF', line=2, column=5)
- Token(typ='ID', value='quantity', line=2, column=8)
- Token(typ='THEN', value='THEN', line=2, column=17)
- Token(typ='ID', value='total', line=3, column=9)
- Token(typ='ASSIGN', value=':=', line=3, column=15)
- Token(typ='ID', value='total', line=3, column=18)
- Token(typ='OP', value='+', line=3, column=24)
- Token(typ='ID', value='price', line=3, column=26)
- Token(typ='OP', value='*', line=3, column=32)
- Token(typ='ID', value='quantity', line=3, column=34)
- Token(typ='END', value=';', line=3, column=42)
- Token(typ='ID', value='tax', line=4, column=9)
- Token(typ='ASSIGN', value=':=', line=4, column=13)
- Token(typ='ID', value='price', line=4, column=16)
- Token(typ='OP', value='*', line=4, column=22)
- Token(typ='NUMBER', value='0.05', line=4, column=24)
- Token(typ='END', value=';', line=4, column=28)
- Token(typ='ENDIF', value='ENDIF', line=5, column=5)
- Token(typ='END', value=';', line=5, column=10)
+ Token(typ='IF', value='IF', line=2, column=4)
+ Token(typ='ID', value='quantity', line=2, column=7)
+ Token(typ='THEN', value='THEN', line=2, column=16)
+ Token(typ='ID', value='total', line=3, column=8)
+ Token(typ='ASSIGN', value=':=', line=3, column=14)
+ Token(typ='ID', value='total', line=3, column=17)
+ Token(typ='OP', value='+', line=3, column=23)
+ Token(typ='ID', value='price', line=3, column=25)
+ Token(typ='OP', value='*', line=3, column=31)
+ Token(typ='ID', value='quantity', line=3, column=33)
+ Token(typ='END', value=';', line=3, column=41)
+ Token(typ='ID', value='tax', line=4, column=8)
+ Token(typ='ASSIGN', value=':=', line=4, column=12)
+ Token(typ='ID', value='price', line=4, column=15)
+ Token(typ='OP', value='*', line=4, column=21)
+ Token(typ='NUMBER', value='0.05', line=4, column=23)
+ Token(typ='END', value=';', line=4, column=27)
+ Token(typ='ENDIF', value='ENDIF', line=5, column=4)
+ Token(typ='END', value=';', line=5, column=9)