diff options
author | Raymond Hettinger <python@rcn.com> | 2014-08-04 06:38:54 (GMT) |
---|---|---|
committer | Raymond Hettinger <python@rcn.com> | 2014-08-04 06:38:54 (GMT) |
commit | c566431bf0e91c5a235dbf42b2d80ae2afa44548 (patch) | |
tree | 446ac3fb19842dbc73d5db13c8b47ecd5e5a1962 | |
parent | 4036d87f4dfae69b0ff8850a3f5d6ce65cecd172 (diff) | |
download | cpython-c566431bf0e91c5a235dbf42b2d80ae2afa44548.zip cpython-c566431bf0e91c5a235dbf42b2d80ae2afa44548.tar.gz cpython-c566431bf0e91c5a235dbf42b2d80ae2afa44548.tar.bz2 |
Improve regex tokenizer example by using re.finditer().
Also, improve variable names and fix column numbers
in the generated output.
-rw-r--r-- | Doc/library/re.rst | 75 |
1 file changed, 38 insertions, 37 deletions
diff --git a/Doc/library/re.rst b/Doc/library/re.rst
index a835c14..48495a2 100644
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -1333,7 +1333,7 @@ successive matches::
 
     Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column'])
 
-    def tokenize(s):
+    def tokenize(code):
         keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
         token_specification = [
             ('NUMBER', r'\d+(\.\d*)?'), # Integer or decimal number
@@ -1343,26 +1343,27 @@ successive matches::
             ('OP', r'[+\-*/]'), # Arithmetic operators
             ('NEWLINE', r'\n'), # Line endings
             ('SKIP', r'[ \t]+'), # Skip over spaces and tabs
+            ('MISMATCH',r'.'), # Any other character
         ]
         tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
         get_token = re.compile(tok_regex).match
-        line = 1
-        pos = line_start = 0
-        mo = get_token(s)
-        while mo is not None:
-            typ = mo.lastgroup
-            if typ == 'NEWLINE':
-                line_start = pos
-                line += 1
-            elif typ != 'SKIP':
-                val = mo.group(typ)
-                if typ == 'ID' and val in keywords:
-                    typ = val
-                yield Token(typ, val, line, mo.start()-line_start)
-            pos = mo.end()
-            mo = get_token(s, pos)
-        if pos != len(s):
-            raise RuntimeError('Unexpected character %r on line %d' %(s[pos], line))
+        line_num = 1
+        line_start = 0
+        for mo in re.finditer(tok_regex, code):
+            kind = mo.lastgroup
+            value = mo.group(kind)
+            if kind == 'NEWLINE':
+                line_start = mo.end()
+                line_num += 1
+            elif kind == 'SKIP':
+                pass
+            elif kind == 'MISMATCH':
+                raise RuntimeError('%r unexpected on line %d' % (value, line_num))
+            else:
+                if kind == 'ID' and value in keywords:
+                    kind = value
+                column = mo.start() - line_start
+                yield Token(kind, value, line_num, column)
 
     statements = '''
         IF quantity THEN
@@ -1376,22 +1377,22 @@ successive matches::
 
 The tokenizer produces the following output::
 
-    Token(typ='IF', value='IF', line=2, column=5)
-    Token(typ='ID', value='quantity', line=2, column=8)
-    Token(typ='THEN', value='THEN', line=2, column=17)
-    Token(typ='ID', value='total', line=3, column=9)
-    Token(typ='ASSIGN', value=':=', line=3, column=15)
-    Token(typ='ID', value='total', line=3, column=18)
-    Token(typ='OP', value='+', line=3, column=24)
-    Token(typ='ID', value='price', line=3, column=26)
-    Token(typ='OP', value='*', line=3, column=32)
-    Token(typ='ID', value='quantity', line=3, column=34)
-    Token(typ='END', value=';', line=3, column=42)
-    Token(typ='ID', value='tax', line=4, column=9)
-    Token(typ='ASSIGN', value=':=', line=4, column=13)
-    Token(typ='ID', value='price', line=4, column=16)
-    Token(typ='OP', value='*', line=4, column=22)
-    Token(typ='NUMBER', value='0.05', line=4, column=24)
-    Token(typ='END', value=';', line=4, column=28)
-    Token(typ='ENDIF', value='ENDIF', line=5, column=5)
-    Token(typ='END', value=';', line=5, column=10)
+    Token(typ='IF', value='IF', line=2, column=4)
+    Token(typ='ID', value='quantity', line=2, column=7)
+    Token(typ='THEN', value='THEN', line=2, column=16)
+    Token(typ='ID', value='total', line=3, column=8)
+    Token(typ='ASSIGN', value=':=', line=3, column=14)
+    Token(typ='ID', value='total', line=3, column=17)
+    Token(typ='OP', value='+', line=3, column=23)
+    Token(typ='ID', value='price', line=3, column=25)
+    Token(typ='OP', value='*', line=3, column=31)
+    Token(typ='ID', value='quantity', line=3, column=33)
+    Token(typ='END', value=';', line=3, column=41)
+    Token(typ='ID', value='tax', line=4, column=8)
+    Token(typ='ASSIGN', value=':=', line=4, column=12)
+    Token(typ='ID', value='price', line=4, column=15)
+    Token(typ='OP', value='*', line=4, column=21)
+    Token(typ='NUMBER', value='0.05', line=4, column=23)
+    Token(typ='END', value=';', line=4, column=27)
+    Token(typ='ENDIF', value='ENDIF', line=5, column=4)
+    Token(typ='END', value=';', line=5, column=9)