author     Raymond Hettinger <python@rcn.com>        2010-09-16 12:02:17 (GMT)
committer  Raymond Hettinger <python@rcn.com>        2010-09-16 12:02:17 (GMT)
commit     37ade9cb61f8d527a2ae380a5358264693bbbc59 (patch)
tree       d3e541555a784e6f2a38e74acb53bcb44212062c /Doc
parent     c1cc0d08d4f713472728aee78cf07d0066c9a298 (diff)
Add tokenizer example to regex docs.
Diffstat (limited to 'Doc')
 -rw-r--r--  Doc/library/re.rst | 63
 1 file changed, 63 insertions, 0 deletions
diff --git a/Doc/library/re.rst b/Doc/library/re.rst
index d40ddec..487d0e5 100644
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -1282,3 +1282,66 @@ functionally identical:
    <_sre.SRE_Match object at ...>
    >>> re.match("\\\\", r"\\")
    <_sre.SRE_Match object at ...>
+
+
+Writing a Tokenizer
+^^^^^^^^^^^^^^^^^^^
+
+A `tokenizer or scanner <http://en.wikipedia.org/wiki/Lexical_analysis>`_
+analyzes a string to categorize groups of characters.  This is a useful first
+step in writing a compiler or interpreter.
+
+The text categories are specified with regular expressions.  The technique is
+to combine those into a single master regular expression and to loop over
+successive matches::
+
+    Token = collections.namedtuple('Token', 'typ value line column')
+
+    def tokenize(s):
+        tok_spec = [
+            ('NUMBER',  r'\d+(\.\d+)?'),  # Integer or decimal number
+            ('ASSIGN',  r':='),           # Assignment operator
+            ('END',     r';'),            # Statement terminator
+            ('ID',      r'[A-Za-z]+'),    # Identifiers
+            ('OP',      r'[+\-*/]'),      # Arithmetic operators
+            ('NEWLINE', r'\n'),           # Line endings
+            ('SKIP',    r'[ \t]'),        # Skip over spaces and tabs
+        ]
+        tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
+        gettok = re.compile(tok_re).match
+        line = 1
+        pos = line_start = 0
+        mo = gettok(s)
+        while mo is not None:
+            typ = mo.lastgroup
+            if typ == 'NEWLINE':
+                line_start = pos
+                line += 1
+            elif typ != 'SKIP':
+                yield Token(typ, mo.group(typ), line, mo.start() - line_start)
+            pos = mo.end()
+            mo = gettok(s, pos)
+        if pos != len(s):
+            raise RuntimeError('Unexpected character %r on line %d' % (s[pos], line))
+
+    >>> statements = '''\
+            total := total + price * quantity;
+            tax := price * 0.05;
+    '''
+    >>> for token in tokenize(statements):
+    ...     print(token)
+    ...
+    Token(typ='ID', value='total', line=1, column=8)
+    Token(typ='ASSIGN', value=':=', line=1, column=14)
+    Token(typ='ID', value='total', line=1, column=17)
+    Token(typ='OP', value='+', line=1, column=23)
+    Token(typ='ID', value='price', line=1, column=25)
+    Token(typ='OP', value='*', line=1, column=31)
+    Token(typ='ID', value='quantity', line=1, column=33)
+    Token(typ='END', value=';', line=1, column=41)
+    Token(typ='ID', value='tax', line=2, column=9)
+    Token(typ='ASSIGN', value=':=', line=2, column=13)
+    Token(typ='ID', value='price', line=2, column=16)
+    Token(typ='OP', value='*', line=2, column=22)
+    Token(typ='NUMBER', value='0.05', line=2, column=24)
+    Token(typ='END', value=';', line=2, column=28)
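
The added example leans on two behaviors of the re module: alternatives joined
with | are tried left to right, and match.lastgroup names the group that
matched. A minimal sketch of the same master-pattern technique, not part of
the commit, extends it with keyword recognition; the KEYWORDS set and the
scan() helper are illustrative names, and it uses re.finditer(), which scans
past unmatchable characters instead of detecting them the way the commit's
match() loop does::

    import re

    # Sketch only: KEYWORDS and scan() are hypothetical, not part of
    # the committed documentation.
    KEYWORDS = {'IF', 'THEN', 'ENDIF', 'PRINT'}

    # Same idea as the commit: one master pattern built from named groups.
    # re.VERBOSE allows the comments and layout inside the pattern.
    master_pat = re.compile(r'''
          (?P<NUMBER>\d+(\.\d+)?)   # integer or decimal number
        | (?P<ASSIGN>:=)            # assignment operator
        | (?P<END>;)                # statement terminator
        | (?P<ID>[A-Za-z]+)         # identifiers (keywords caught below)
        | (?P<OP>[+\-*/])           # arithmetic operators
        | (?P<SKIP>\s+)             # whitespace
    ''', re.VERBOSE)

    def scan(s):
        # finditer() silently skips text that no alternative matches,
        # unlike the commit's loop, which raises on stray characters.
        for mo in master_pat.finditer(s):
            kind = mo.lastgroup            # name of the matching alternative
            if kind == 'SKIP':
                continue
            if kind == 'ID' and mo.group() in KEYWORDS:
                kind = mo.group()          # promote keywords to own category
            yield kind, mo.group()

    for tok in scan('IF quantity THEN total := total + 1;'):
        print(tok)
    # ('IF', 'IF')
    # ('ID', 'quantity')
    # ('THEN', 'THEN')
    # ('ID', 'total')
    # ('ASSIGN', ':=')
    # ('ID', 'total')
    # ('OP', '+')
    # ('NUMBER', '1')
    # ('END', ';')

Listing each keyword as its own alternative ahead of ID would also work;
checking the matched text against a set keeps the master pattern short.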