summaryrefslogtreecommitdiffstats
path: root/Lib/tokenize.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/tokenize.py')
-rw-r--r--Lib/tokenize.py157
1 files changed, 109 insertions, 48 deletions
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 0db3867..ff57b12 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -24,7 +24,7 @@ each time a new token is found."""
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
- 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
+ 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro, Raymond Hettinger'
import string, re
from token import *
@@ -50,10 +50,11 @@ Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'
-Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
-Octnumber = r'0[0-7]*[lL]?'
+Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
+Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
+Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
-Intnumber = group(Hexnumber, Octnumber, Decnumber)
+Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
@@ -109,21 +110,34 @@ endprogs = {"'": re.compile(Single), '"': re.compile(Double),
"uR'''": single3prog, 'uR"""': double3prog,
"Ur'''": single3prog, 'Ur"""': double3prog,
"UR'''": single3prog, 'UR"""': double3prog,
- 'r': None, 'R': None, 'u': None, 'U': None}
+ "b'''": single3prog, 'b"""': double3prog,
+ "br'''": single3prog, 'br"""': double3prog,
+ "B'''": single3prog, 'B"""': double3prog,
+ "bR'''": single3prog, 'bR"""': double3prog,
+ "Br'''": single3prog, 'Br"""': double3prog,
+ "BR'''": single3prog, 'BR"""': double3prog,
+ 'r': None, 'R': None, 'u': None, 'U': None,
+ 'b': None, 'B': None}
triple_quoted = {}
for t in ("'''", '"""',
"r'''", 'r"""', "R'''", 'R"""',
"u'''", 'u"""', "U'''", 'U"""',
"ur'''", 'ur"""', "Ur'''", 'Ur"""',
- "uR'''", 'uR"""', "UR'''", 'UR"""'):
+ "uR'''", 'uR"""', "UR'''", 'UR"""',
+ "b'''", 'b"""', "B'''", 'B"""',
+ "br'''", 'br"""', "Br'''", 'Br"""',
+ "bR'''", 'bR"""', "BR'''", 'BR"""'):
triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
"r'", 'r"', "R'", 'R"',
"u'", 'u"', "U'", 'U"',
"ur'", 'ur"', "Ur'", 'Ur"',
- "uR'", 'uR"', "UR'", 'UR"' ):
+ "uR'", 'uR"', "UR'", 'UR"',
+ "b'", 'b"', "B'", 'B"',
+ "br'", 'br"', "Br'", 'Br"',
+ "bR'", 'bR"', "BR'", 'BR"' ):
single_quoted[t] = t
tabsize = 8
@@ -132,7 +146,9 @@ class TokenError(Exception): pass
class StopTokenizing(Exception): pass
-def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
+def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
+ srow, scol = srow_scol
+ erow, ecol = erow_ecol
print "%d,%d-%d,%d:\t%s\t%s" % \
(srow, scol, erow, ecol, tok_name[type], repr(token))
@@ -159,14 +175,82 @@ def tokenize_loop(readline, tokeneater):
for token_info in generate_tokens(readline):
tokeneater(*token_info)
+class Untokenizer:
+
+ def __init__(self):
+ self.tokens = []
+ self.prev_row = 1
+ self.prev_col = 0
+
+ def add_whitespace(self, start):
+ row, col = start
+ assert row <= self.prev_row
+ col_offset = col - self.prev_col
+ if col_offset:
+ self.tokens.append(" " * col_offset)
+
+ def untokenize(self, iterable):
+ for t in iterable:
+ if len(t) == 2:
+ self.compat(t, iterable)
+ break
+ tok_type, token, start, end, line = t
+ self.add_whitespace(start)
+ self.tokens.append(token)
+ self.prev_row, self.prev_col = end
+ if tok_type in (NEWLINE, NL):
+ self.prev_row += 1
+ self.prev_col = 0
+ return "".join(self.tokens)
+
+ def compat(self, token, iterable):
+ startline = False
+ indents = []
+ toks_append = self.tokens.append
+ toknum, tokval = token
+ if toknum in (NAME, NUMBER):
+ tokval += ' '
+ if toknum in (NEWLINE, NL):
+ startline = True
+ prevstring = False
+ for tok in iterable:
+ toknum, tokval = tok[:2]
+
+ if toknum in (NAME, NUMBER):
+ tokval += ' '
+
+ # Insert a space between two consecutive strings
+ if toknum == STRING:
+ if prevstring:
+ tokval = ' ' + tokval
+ prevstring = True
+ else:
+ prevstring = False
+
+ if toknum == INDENT:
+ indents.append(tokval)
+ continue
+ elif toknum == DEDENT:
+ indents.pop()
+ continue
+ elif toknum in (NEWLINE, NL):
+ startline = True
+ elif startline and indents:
+ toks_append(indents[-1])
+ startline = False
+ toks_append(tokval)
def untokenize(iterable):
"""Transform tokens back into Python source code.
Each element returned by the iterable must be a token sequence
- with at least two elements, a token number and token value.
+ with at least two elements, a token number and token value. If
+ only two tokens are passed, the resulting output is poor.
+
+ Round-trip invariant for full input:
+ Untokenized source will match input source exactly
- Round-trip invariant:
+ Round-trip invariant for limited intput:
# Output text will tokenize the back to the input
t1 = [tok[:2] for tok in generate_tokens(f.readline)]
newcode = untokenize(t1)
@@ -174,40 +258,8 @@ def untokenize(iterable):
t2 = [tok[:2] for tok in generate_tokens(readline)]
assert t1 == t2
"""
-
- startline = False
- prevstring = False
- indents = []
- toks = []
- toks_append = toks.append
- for tok in iterable:
- toknum, tokval = tok[:2]
-
- if toknum in (NAME, NUMBER):
- tokval += ' '
-
- # Insert a space between two consecutive strings
- if toknum == STRING:
- if prevstring:
- tokval = ' ' + tokval
- prevstring = True
- else:
- prevstring = False
-
- if toknum == INDENT:
- indents.append(tokval)
- continue
- elif toknum == DEDENT:
- indents.pop()
- continue
- elif toknum in (NEWLINE, COMMENT, NL):
- startline = True
- elif startline and indents:
- toks_append(indents[-1])
- startline = False
- toks_append(tokval)
- return ''.join(toks)
-
+ ut = Untokenizer()
+ return ut.untokenize(iterable)
def generate_tokens(readline):
"""
@@ -246,7 +298,7 @@ def generate_tokens(readline):
if endmatch:
pos = end = endmatch.end(0)
yield (STRING, contstr + line[:end],
- strstart, (lnum, end), contline + line)
+ strstart, (lnum, end), contline + line)
contstr, needcont = '', 0
contline = None
elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
@@ -272,7 +324,15 @@ def generate_tokens(readline):
if pos == max: break
if line[pos] in '#\r\n': # skip comments or blank lines
- yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
+ if line[pos] == '#':
+ comment_token = line[pos:].rstrip('\r\n')
+ nl_pos = pos + len(comment_token)
+ yield (COMMENT, comment_token,
+ (lnum, pos), (lnum, pos + len(comment_token)), line)
+ yield (NL, line[nl_pos:],
+ (lnum, nl_pos), (lnum, len(line)), line)
+ else:
+ yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
(lnum, pos), (lnum, len(line)), line)
continue
@@ -303,9 +363,10 @@ def generate_tokens(readline):
(initial == '.' and token != '.'): # ordinary number
yield (NUMBER, token, spos, epos, line)
elif initial in '\r\n':
- yield (parenlev > 0 and NL or NEWLINE,
- token, spos, epos, line)
+ yield (NL if parenlev > 0 else NEWLINE,
+ token, spos, epos, line)
elif initial == '#':
+ assert not token.endswith("\n")
yield (COMMENT, token, spos, epos, line)
elif token in triple_quoted:
endprog = endprogs[token]