From fc6f5339a99d103928bce9eda605564f2a9e8477 Mon Sep 17 00:00:00 2001
From: Guido van Rossum
Date: Fri, 7 Mar 1997 00:21:12 +0000
Subject: Ka-Ping's version.

---
 Lib/tokenize.py | 177 ++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 132 insertions(+), 45 deletions(-)

diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 8f16115..d6985e0 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -1,63 +1,150 @@
-# This module compiles a regular expression that recognizes Python tokens.
-# It is designed to match the working of the Python tokenizer exactly.
-# It takes care of everything except indentation;
-# note that un-escaped newlines are tokens, too.
-# tokenprog.regs[3] gives the location of the token without whitespace
-# It also defines various subexpressions, but doesn't compile them.
-# See the function test() below for an example of how to use.
+"""tokenize.py (Ka-Ping Yee, 4 March 1997)

-import regex
+This module compiles a regular expression that recognizes Python tokens
+in individual lines of text.  The regular expression handles everything
+except indentation, continuations, and triple-quoted strings.  The function
+'tokenize.tokenize()' takes care of these things for streams of text.  It
+accepts a file-like object's readline() method and a function; it uses
+readline() to scan the file and calls the function once for each token
+found, passing its type, a string containing the token, the line number,
+the line, and the starting and ending positions of the token within the
+line.  It is designed to match the working of the Python tokenizer exactly."""

-# Note: to get a quoted backslash in a regexp, it must be quadrupled.
+import string, regex
+from token import *

-Ignore = '[ \t]*\(\\\\\n[ \t]*\)*\(#.*\)?'
+def group(*choices): return '\(' + string.join(choices, '\|') + '\)'
+
+Ignore = '[ \f\t]*\([\]\r?\n[ \t]*\)*\(#.*\)?'
 Name = '[a-zA-Z_][a-zA-Z0-9_]*'

 Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
 Octnumber = '0[0-7]*[lL]?'
 Decnumber = '[1-9][0-9]*[lL]?'
-Intnumber = Hexnumber + '\|' + Octnumber + '\|' + Decnumber
+Intnumber = group(Hexnumber, Octnumber, Decnumber)
 Exponent = '[eE][-+]?[0-9]+'
-Pointfloat = '\([0-9]+\.[0-9]*\|\.[0-9]+\)\(' + Exponent + '\)?'
+Pointfloat = group('[0-9]+\.[0-9]*', '\.[0-9]+') + group(Exponent) + '?'
 Expfloat = '[0-9]+' + Exponent
-Floatnumber = Pointfloat + '\|' + Expfloat
-Number = Floatnumber + '\|' + Intnumber
+Floatnumber = group(Pointfloat, Expfloat)
+Number = group(Floatnumber, Intnumber)

-String = '\'\(\\\\.\|[^\\\n\']\)*\'' + '\|' + '"\(\\\\.\|[^\\\n"]\)*"'
-# Note: this module *recognizes* double quotes, but for backward
-# compatibility, it doesn't *use* them!
+Single = group('^\'', '[^\]\'')
+Double = group('^"', '[^\]"')
+Tsingle = group('^\'\'\'', '[^\]\'\'\'')
+Tdouble = group('^"""', '[^\]"""')
+Triple = group('\'\'\'', '"""')
+String = group('\'' + group('[\].', '[^\'\]') + '*' + group('\'', '[\]\n'),
+               '"' + group('[\].', '[^"\]') + '*' + group('"', '[\]\n'))

-Operator = '~\|\+\|-\|\*\|/\|%\|\^\|&\||\|<<\|>>\|==\|<=\|<>\|!=\|>=\|=\|<\|>'
+Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '|',
+                 '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
 Bracket = '[][(){}]'
-Special = '[:;.,`\n]'
-Funny = Operator + '\|' + Bracket + '\|' + Special
+Special = group('[\]?\r?\n', '[:;.,`\f]')
+Funny = group(Operator, Bracket, Special)

-PlainToken = Name + '\|' + Number + '\|' + String + '\|' + Funny
-
-Token = Ignore + '\(' + PlainToken + '\)'
+PlainToken = group(Name, Number, Triple, String, Funny)
+Token = Ignore + PlainToken

 try:
-    save_syntax = regex.set_syntax(0)       # Use default syntax
-    tokenprog = regex.compile(Token)
+    save_syntax = regex.set_syntax(0)       # use default syntax
+    tokenprog = regex.compile(Token)
+    endprogs = { '\'': regex.compile(Single), '"': regex.compile(Double),
+                 '\'\'\'': regex.compile(Tsingle), '"""': regex.compile(Tdouble) }
 finally:
-    if save_syntax != 0:
-        dummy = regex.set_syntax(save_syntax)   # Restore original syntax
-
-
-def test(file):
-    f = open(file, 'r')
-    while 1:
-        line = f.readline()
-        if not line: break
-        i, n = 0, len(line)
-        while i < n:
-            j = tokenprog.match(line, i)
-            if j < 0:
-                print 'No token at', `line[i:i+20]` + '...'
-                i = i+1
-            else:
-                i = i+j
-                a, b = tokenprog.regs[3]
-                if a < b:
-                    print 'Token:', `line[a:b]`
+    regex.set_syntax(save_syntax)           # restore original syntax
+
+tabsize = 8
+TokenError = 'TokenError'
+def printtoken(type, string, linenum, line, start, end):    # for testing
+    print `linenum` + ':', tok_name[type], repr(string)
+
+def tokenize(readline, tokeneater = printtoken):
+    linenum = parenlev = continued = 0
+    namechars, numchars = string.letters + '_', string.digits
+    contstr = ''
+    indents = [0]
+    while 1:                                # loop over lines in stream
+        line = readline()
+        linenum = linenum + 1
+        if line[-2:] == '\r\n': line = line[:-2] + '\n'
+        pos, max = 0, len(line)
+
+        if contstr:                         # continued string
+            if not line: raise TokenError, "EOF within multi-line string"
+            if contstr[-2:] == '\\\n': contstr = contstr[:-2] + '\n'
+            if endprog.search(line) >= 0:
+                pos = end = endprog.regs[0][1]
+                tokeneater(STRING, contstr + line[:end], linenum, line, 0, 0)
+                contstr = ''
+            else:
+                contstr = contstr + line
+            continue
+
+        elif parenlev == 0 and not continued:   # this is a new statement
+            if not line: break
+            column = 0
+            while 1:                        # measure leading whitespace
+                if line[pos] == ' ': column = column + 1
+                elif line[pos] == '\t': column = (column/tabsize + 1) * tabsize
+                elif line[pos] == '\f': column = 0
+                else: break
+                pos = pos + 1
+            if line[pos] in '#\n': continue # skip comments or blank lines
+
+            if column > indents[-1]:        # count indents or dedents
+                indents.append(column)
+                tokeneater(INDENT, '\t', linenum, line, 0, 0)
+            while column < indents[-1]:
+                indents = indents[:-1]
+                tokeneater(DEDENT, '\t', linenum, line, 0, 0)
+
+        else:                               # continued statement
+            if not line: raise TokenError, "EOF within multi-line statement"
+            continued = 0
+
+        while pos < max:
+            if tokenprog.match(line, pos) > 0:          # scan for tokens
+                start, end = tokenprog.regs[3]
+                token = line[start:end]
+                pos = end
+
+                if token[0] in namechars:               # ordinary name
+                    tokeneater(NAME, token, linenum, line, start, end)
+                elif token[0] in numchars:              # ordinary number
+                    tokeneater(NUMBER, token, linenum, line, start, end)
+
+                elif token in ('\'\'\'', '"""'):        # triple-quoted
+                    endprog = endprogs[token]
+                    if endprog.search(line, pos) >= 0:  # all on one line
+                        pos = endprog.regs[0][1]
+                        tokeneater(STRING, token, linenum, line, start, pos)
+                    else:
+                        contstr = line[start:]          # multiple lines
+                        break
+                elif token[0] in '\'"':
+                    if token[-1] == '\n':               # continued string
+                        endprog, contstr = endprogs[token[0]], line[start:]
+                        break
+                    else:                               # ordinary string
+                        tokeneater(STRING, token, linenum, line, start, end)
+
+                elif token[0] == '\n':
+                    tokeneater(NEWLINE, token, linenum, line, start, end)
+                elif token[0] == '\\':                  # continued stmt
+                    continued = 1
+
+                else:
+                    if token[0] in '([{': parenlev = parenlev + 1
+                    if token[0] in ')]}': parenlev = parenlev - 1
+                    tokeneater(OP, token, linenum, line, start, end)
+            else:
+                tokeneater(ERRORTOKEN, line[pos], linenum, line, pos, pos + 1)
+                pos = pos + 1
+
+    for indent in indents[1:]:              # pop remaining indent levels
+        tokeneater(DEDENT, '\t', linenum, line, 0, 0)
+
+if __name__ == '__main__':                  # testing
+    import sys
+    file = open(sys.argv[-1])
+    tokenize(file.readline)
--
cgit v0.12
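
Usage sketch (not part of the patch above; 'example.py' and 'eater' are
placeholder names): the new tokenize() entry point takes the readline()
method of an open file plus an optional tokeneater callback, which defaults
to printtoken().  A caller that only wants NAME tokens might look like this,
written in the same Python 1.x style as the module:

    import tokenize

    def eater(type, token, linenum, line, start, end):
        # called once per token; report just the names with line numbers
        if type == tokenize.NAME:
            print linenum, token

    f = open('example.py')
    tokenize.tokenize(f.readline, eater)
    f.close()

Passing the bound readline method, rather than the file object itself, is
what lets tokenize() pull in extra lines on demand whenever 'contstr' or
'continued' indicates that a triple-quoted string or a backslash-continued
statement is still open.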