Diffstat (limited to 'Lib/lib2to3/pgen2/tokenize.py')
-rw-r--r--  Lib/lib2to3/pgen2/tokenize.py | 197
1 file changed, 68 insertions(+), 129 deletions(-)
diff --git a/Lib/lib2to3/pgen2/tokenize.py b/Lib/lib2to3/pgen2/tokenize.py
index 7924ff3..8cae873 100644
--- a/Lib/lib2to3/pgen2/tokenize.py
+++ b/Lib/lib2to3/pgen2/tokenize.py
@@ -48,26 +48,22 @@ except NameError:
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
-def _combinations(*l):
- return set(
- x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()
- )
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
-Name = r'\w+'
+Name = r'[a-zA-Z_]\w*'
-Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
-Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
-Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
-Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
+Binnumber = r'0[bB][01]*'
+Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
+Octnumber = r'0[oO]?[0-7]*[lL]?'
+Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
-Exponent = r'[eE][-+]?\d+(?:_\d+)*'
-Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
-Expfloat = r'\d+(?:_\d+)*' + Exponent
+Exponent = r'[eE][-+]?\d+'
+Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
+Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
-Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
+Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
# Tail end of ' string.
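The alternatives dropped in this hunk are the PEP 515 underscore-separator forms; what survives accepts plain digit runs only (plus the Python 2 l/L long suffix). A standalone sketch, not part of the patch, of the behavioral difference:

    import re

    dec_old = r'[1-9]\d*(?:_\d+)*[lL]?'   # left side: accepts 1_000_000
    dec_new = r'[1-9]\d*[lL]?'            # right side: plain digit runs only
    print(re.match(dec_old, '1_000_000').group())  # 1_000_000
    print(re.match(dec_new, '1_000_000').group())  # 1 -- stops at the underscore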
@@ -78,11 +74,10 @@ Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
-Triple = group(_litprefix + "'''", _litprefix + '"""')
+Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
-String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
- _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
+String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
+ r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
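_litprefix covered the f-string and rb/br prefix spellings; its replacement recognizes only u/b with an optional trailing r, matching the Python 2 grammar. A standalone sketch (the group helper is the one defined earlier in this file):

    import re

    def group(*choices): return '(' + '|'.join(choices) + ')'

    _litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"     # left side
    triple_old = group(_litprefix + "'''", _litprefix + '"""')
    triple_new = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')    # right side
    print(bool(re.match(triple_old, "rb'''")))  # True
    print(bool(re.match(triple_new, "rb'''")))  # False: 'rb' order not allowed
    print(bool(re.match(triple_new, "br'''")))  # True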
@@ -100,38 +95,55 @@ PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
# First (or only) line of ' or " string.
-ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
+ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
group("'", r'\\\r?\n'),
- _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
+ r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
tokenprog, pseudoprog, single3prog, double3prog = map(
re.compile, (Token, PseudoToken, Single3, Double3))
-
-_strprefixes = (
- _combinations('r', 'R', 'f', 'F') |
- _combinations('r', 'R', 'b', 'B') |
- {'u', 'U', 'ur', 'uR', 'Ur', 'UR'}
-)
-
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
"'''": single3prog, '"""': double3prog,
- **{f"{prefix}'''": single3prog for prefix in _strprefixes},
- **{f'{prefix}"""': double3prog for prefix in _strprefixes},
- **{prefix: None for prefix in _strprefixes}}
-
-triple_quoted = (
- {"'''", '"""'} |
- {f"{prefix}'''" for prefix in _strprefixes} |
- {f'{prefix}"""' for prefix in _strprefixes}
-)
-single_quoted = (
- {"'", '"'} |
- {f"{prefix}'" for prefix in _strprefixes} |
- {f'{prefix}"' for prefix in _strprefixes}
-)
+ "r'''": single3prog, 'r"""': double3prog,
+ "u'''": single3prog, 'u"""': double3prog,
+ "b'''": single3prog, 'b"""': double3prog,
+ "ur'''": single3prog, 'ur"""': double3prog,
+ "br'''": single3prog, 'br"""': double3prog,
+ "R'''": single3prog, 'R"""': double3prog,
+ "U'''": single3prog, 'U"""': double3prog,
+ "B'''": single3prog, 'B"""': double3prog,
+ "uR'''": single3prog, 'uR"""': double3prog,
+ "Ur'''": single3prog, 'Ur"""': double3prog,
+ "UR'''": single3prog, 'UR"""': double3prog,
+ "bR'''": single3prog, 'bR"""': double3prog,
+ "Br'''": single3prog, 'Br"""': double3prog,
+ "BR'''": single3prog, 'BR"""': double3prog,
+ 'r': None, 'R': None,
+ 'u': None, 'U': None,
+ 'b': None, 'B': None}
+
+triple_quoted = {}
+for t in ("'''", '"""',
+ "r'''", 'r"""', "R'''", 'R"""',
+ "u'''", 'u"""', "U'''", 'U"""',
+ "b'''", 'b"""', "B'''", 'B"""',
+ "ur'''", 'ur"""', "Ur'''", 'Ur"""',
+ "uR'''", 'uR"""', "UR'''", 'UR"""',
+ "br'''", 'br"""', "Br'''", 'Br"""',
+ "bR'''", 'bR"""', "BR'''", 'BR"""',):
+ triple_quoted[t] = t
+single_quoted = {}
+for t in ("'", '"',
+ "r'", 'r"', "R'", 'R"',
+ "u'", 'u"', "U'", 'U"',
+ "b'", 'b"', "B'", 'B"',
+ "ur'", 'ur"', "Ur'", 'Ur"',
+ "uR'", 'uR"', "UR'", 'UR"',
+ "br'", 'br"', "Br'", 'Br"',
+ "bR'", 'bR"', "BR'", 'BR"', ):
+ single_quoted[t] = t
tabsize = 8
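Both sides build the same kind of lookup tables; the left derives them from _combinations, the right spells every casing out. The tokenizer below only ever tests membership and fetches the matching end-pattern. A hypothetical dispatch mirroring that use:

    opener = "ur'''"                 # a valid Python 2 prefix in these tables
    assert opener in triple_quoted   # membership is all generate_tokens() checks
    endprog = endprogs[opener]       # single3prog, compiled from Single3
    m = endprog.match("body of string''' tail")
    print(m.end())                   # 17 -- one past the closing quotes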
@@ -139,11 +151,11 @@ class TokenError(Exception): pass
class StopTokenizing(Exception): pass
-def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
- (srow, scol) = xxx_todo_changeme
- (erow, ecol) = xxx_todo_changeme1
- print("%d,%d-%d,%d:\t%s\t%s" % \
- (srow, scol, erow, ecol, tok_name[type], repr(token)))
+def printtoken(type, token, start, end, line): # for testing
+ (srow, scol) = start
+ (erow, ecol) = end
+ print "%d,%d-%d,%d:\t%s\t%s" % \
+ (srow, scol, erow, ecol, tok_name[type], repr(token))
def tokenize(readline, tokeneater=printtoken):
"""
@@ -208,7 +220,7 @@ class Untokenizer:
for tok in iterable:
toknum, tokval = tok[:2]
- if toknum in (NAME, NUMBER, ASYNC, AWAIT):
+ if toknum in (NAME, NUMBER):
tokval += ' '
if toknum == INDENT:
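Dropping ASYNC and AWAIT here tracks their removal from this grammar; NAME and NUMBER still get a trailing space in the positionless 2-tuple path so adjacent tokens cannot fuse. A sketch of the effect, assuming the module as patched:

    from lib2to3.pgen2.tokenize import untokenize
    from lib2to3.pgen2.token import NAME, NUMBER, NEWLINE

    # 2-tuples force the compat() path shown above
    out = untokenize([(NAME, 'return'), (NUMBER, '42'), (NEWLINE, '\n')])
    print(repr(out))  # 'return 42 \n' -- without the spaces: 'return42\n'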
@@ -224,8 +236,8 @@ class Untokenizer:
startline = False
toks_append(tokval)
-cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
-blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
+cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)')
+blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)')
def _get_normal_name(orig_enc):
"""Imitates get_normal_name in tokenizer.c."""
@@ -321,7 +333,7 @@ def untokenize(iterable):
Round-trip invariant for full input:
Untokenized source will match input source exactly
- Round-trip invariant for limited input:
+ Round-trip invariant for limited intput:
# Output text will tokenize back to the input
t1 = [tok[:2] for tok in generate_tokens(f.readline)]
newcode = untokenize(t1)
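A hypothetical round-trip exercising the limited (2-tuple) invariant quoted above, with StringIO standing in for the file f:

    from StringIO import StringIO           # Python 2 module
    from lib2to3.pgen2.tokenize import generate_tokens, untokenize

    src = "if x:\n    y = 1\n"
    t1 = [tok[:2] for tok in generate_tokens(StringIO(src).readline)]
    newcode = untokenize(t1)
    t2 = [tok[:2] for tok in generate_tokens(StringIO(newcode).readline)]
    assert t1 == t2                         # spacing may differ; tokens do not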
@@ -346,19 +358,14 @@ def generate_tokens(readline):
column where the token begins in the source; a 2-tuple (erow, ecol) of
ints specifying the row and column where the token ends in the source;
and the line on which the token was found. The line passed is the
- physical line.
+ logical line; continuation lines are included.
"""
lnum = parenlev = continued = 0
+ namechars, numchars = string.ascii_letters + '_', '0123456789'
contstr, needcont = '', 0
contline = None
indents = [0]
- # 'stashed' and 'async_*' are used for async/await parsing
- stashed = None
- async_def = False
- async_def_indent = 0
- async_def_nl = False
-
while 1: # loop over lines in stream
try:
line = readline()
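Each yielded item is the 5-tuple described in the docstring; a sketch of unpacking one (tok_name maps token numbers to their names):

    from StringIO import StringIO           # Python 2 module
    from lib2to3.pgen2.tokenize import generate_tokens, tok_name

    for tup in generate_tokens(StringIO("a + b\n").readline):
        type, token, (srow, scol), (erow, ecol), logical = tup
        print("%s %r at %d,%d-%d,%d" % (tok_name[type], token,
                                        srow, scol, erow, ecol))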
@@ -369,7 +376,7 @@ def generate_tokens(readline):
if contstr: # continued string
if not line:
- raise TokenError("EOF in multi-line string", strstart)
+ raise TokenError, ("EOF in multi-line string", strstart)
endmatch = endprog.match(line)
if endmatch:
pos = end = endmatch.end(0)
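An unterminated triple-quoted string reaches this branch when readline is exhausted; strstart records where the literal opened. A sketch (Python 2 except syntax, matching the raise form above):

    from StringIO import StringIO
    from lib2to3.pgen2.tokenize import generate_tokens, TokenError

    try:
        list(generate_tokens(StringIO("s = '''never closed\n").readline))
    except TokenError, e:
        print(e.args)   # ('EOF in multi-line string', (1, 4))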
@@ -399,10 +406,6 @@ def generate_tokens(readline):
pos = pos + 1
if pos == max: break
- if stashed:
- yield stashed
- stashed = None
-
if line[pos] in '#\r\n': # skip comments or blank lines
if line[pos] == '#':
comment_token = line[pos:].rstrip('\r\n')
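Comment-only and blank lines bypass indentation tracking entirely, so they produce COMMENT/NL rather than NEWLINE and never open an INDENT. For example:

    from StringIO import StringIO
    from lib2to3.pgen2.tokenize import generate_tokens, tok_name

    toks = list(generate_tokens(StringIO("# just a comment\n").readline))
    print([tok_name[t[0]] for t in toks])   # ['COMMENT', 'NL', 'ENDMARKER']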
@@ -425,22 +428,11 @@ def generate_tokens(readline):
"unindent does not match any outer indentation level",
("<tokenize>", lnum, pos, line))
indents = indents[:-1]
-
- if async_def and async_def_indent >= indents[-1]:
- async_def = False
- async_def_nl = False
- async_def_indent = 0
-
yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
- if async_def and async_def_nl and async_def_indent >= indents[-1]:
- async_def = False
- async_def_nl = False
- async_def_indent = 0
-
else: # continued statement
if not line:
- raise TokenError("EOF in multi-line statement", (lnum, 0))
+ raise TokenError, ("EOF in multi-line statement", (lnum, 0))
continued = 0
while pos < max:
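The dedent check above rejects a column that matches no enclosing indentation level; a sketch that triggers it:

    from StringIO import StringIO
    from lib2to3.pgen2.tokenize import generate_tokens

    bad = "if x:\n        y\n    z\n"       # dedents to a level never opened
    try:
        list(generate_tokens(StringIO(bad).readline))
    except IndentationError, e:             # Python 2 syntax
        print(e.args[0])  # unindent does not match any outer indentation level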
@@ -450,25 +442,16 @@ def generate_tokens(readline):
spos, epos, pos = (lnum, start), (lnum, end), end
token, initial = line[start:end], line[start]
- if initial in string.digits or \
+ if initial in numchars or \
(initial == '.' and token != '.'): # ordinary number
yield (NUMBER, token, spos, epos, line)
elif initial in '\r\n':
newline = NEWLINE
if parenlev > 0:
newline = NL
- elif async_def:
- async_def_nl = True
- if stashed:
- yield stashed
- stashed = None
yield (newline, token, spos, epos, line)
-
elif initial == '#':
assert not token.endswith("\n")
- if stashed:
- yield stashed
- stashed = None
yield (COMMENT, token, spos, epos, line)
elif token in triple_quoted:
endprog = endprogs[token]
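numchars/namechars replace the str.isidentifier() test, so classification is by first character against explicit ASCII sets; this pairs with the narrowed Name = r'[a-zA-Z_]\w*' pattern earlier in the patch. A standalone sketch of the dispatch:

    import string

    namechars, numchars = string.ascii_letters + '_', '0123456789'
    for token in ('spam', '_private', '42', '.5', '.'):
        initial = token[0]
        if initial in numchars or (initial == '.' and token != '.'):
            print(token + ' -> NUMBER')
        elif initial in namechars:
            print(token + ' -> NAME')
        else:
            print(token + ' -> operator / delimiter')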
@@ -476,9 +459,6 @@ def generate_tokens(readline):
if endmatch: # all on one line
pos = endmatch.end(0)
token = line[start:pos]
- if stashed:
- yield stashed
- stashed = None
yield (STRING, token, spos, (lnum, pos), line)
else:
strstart = (lnum, start) # multiple lines
@@ -496,63 +476,22 @@ def generate_tokens(readline):
contline = line
break
else: # ordinary string
- if stashed:
- yield stashed
- stashed = None
yield (STRING, token, spos, epos, line)
- elif initial.isidentifier(): # ordinary name
- if token in ('async', 'await'):
- if async_def:
- yield (ASYNC if token == 'async' else AWAIT,
- token, spos, epos, line)
- continue
-
- tok = (NAME, token, spos, epos, line)
- if token == 'async' and not stashed:
- stashed = tok
- continue
-
- if token == 'def':
- if (stashed
- and stashed[0] == NAME
- and stashed[1] == 'async'):
-
- async_def = True
- async_def_indent = indents[-1]
-
- yield (ASYNC, stashed[1],
- stashed[2], stashed[3],
- stashed[4])
- stashed = None
-
- if stashed:
- yield stashed
- stashed = None
-
- yield tok
+ elif initial in namechars: # ordinary name
+ yield (NAME, token, spos, epos, line)
elif initial == '\\': # continued stmt
# This yield is new; needed for better idempotency:
- if stashed:
- yield stashed
- stashed = None
yield (NL, token, spos, (lnum, pos), line)
continued = 1
else:
if initial in '([{': parenlev = parenlev + 1
elif initial in ')]}': parenlev = parenlev - 1
- if stashed:
- yield stashed
- stashed = None
yield (OP, token, spos, epos, line)
else:
yield (ERRORTOKEN, line[pos],
(lnum, pos), (lnum, pos+1), line)
pos = pos + 1
- if stashed:
- yield stashed
- stashed = None
-
for indent in indents[1:]: # pop remaining indent levels
yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
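At end of input the generator unwinds every open indentation level before the final ENDMARKER; with the async bookkeeping gone, nothing else is flushed. For example:

    from StringIO import StringIO
    from lib2to3.pgen2.tokenize import generate_tokens, tok_name

    toks = list(generate_tokens(StringIO("if x:\n    y\n").readline))
    print([tok_name[t[0]] for t in toks[-3:]])
    # ['NEWLINE', 'DEDENT', 'ENDMARKER']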