author     Tim Peters <tim.peters@gmail.com>   2000-10-07 05:09:39 (GMT)
committer  Tim Peters <tim.peters@gmail.com>   2000-10-07 05:09:39 (GMT)
commit     de49583a0d59f806b88b0f6a869f470047b3cbce (patch)
tree       eb9b53a60d7e3425a7266ced49a421ff2bcebd68
parent     70d87d73291ca6263955c57842c07e98be768352 (diff)
Possible fix for Skip's bug 116136 (sre recursion limit hit in tokenize.py).
tokenize.py has always used naive regexps for matching string literals,
and that appears to trigger the sre recursion limit on Skip's platform (he
has very long single-line string literals). Replaced all of tokenize.py's
string regexps with the "unrolled" forms used in IDLE, where they're known to
handle even absurd (multi-megabyte!) string literals without trouble. See
Friedl's book for explanation (at heart, the naive regexps create a backtracking
choice point for each character in the literal, while the unrolled forms create
none).
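
Rough sketch (not part of the patch) of the difference the message describes. The group()/any() helpers below mirror the ones defined near the top of tokenize.py; the pattern names and the generated literal are made up for illustration. The naive form wraps every character of the literal in an alternation, which the old recursive sre engine turned into one recursion level per character; the unrolled form consumes runs of ordinary characters in one greedy scan and only loops at escape sequences.

    import re

    # Helpers mirroring the ones defined near the top of tokenize.py.
    def group(*choices): return '(' + '|'.join(choices) + ')'
    def any(*choices): return group(*choices) + '*'

    # Naive form: one alternation (one backtracking choice point) per
    # character of the literal.
    NaiveSingle = any(r"[^'\\]", r'\\.') + "'"
    # Unrolled form used by the patch: a single greedy scan, with an
    # inner loop entered only at escape sequences.
    UnrolledSingle = r"[^'\\]*(?:\\.[^'\\]*)*'"

    # A long single-line literal body (about a megabyte) ending in '.
    body = "spam\\'eggs" * 100000 + "'"

    # Both patterns accept the same strings; on the 2000-era recursive
    # sre the NaiveSingle match is what hit the recursion limit, while
    # the unrolled form matched without trouble.  Current re engines
    # handle both, so this only demonstrates the equivalence.
    assert re.match(UnrolledSingle, body).end() == len(body)
    assert re.match(NaiveSingle, body).end() == len(body)

The unrolled idiom is the normal*(special normal*)* shape of Friedl's "unrolling the loop" technique that the message points to.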
-rw-r--r--  Lib/tokenize.py | 32
1 file changed, 20 insertions, 12 deletions
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 30bb557..f2ba0a2 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -46,18 +46,25 @@ Floatnumber = group(Pointfloat, Expfloat)
 Imagnumber = group(r'0[jJ]', r'[1-9]\d*[jJ]', Floatnumber + r'[jJ]')
 Number = group(Imagnumber, Floatnumber, Intnumber)
 
-Single = any(r"[^'\\]", r'\\.') + "'"
-Double = any(r'[^"\\]', r'\\.') + '"'
-Single3 = any(r"[^'\\]",r'\\.',r"'[^'\\]",r"'\\.",r"''[^'\\]",r"''\\.") + "'''"
-Double3 = any(r'[^"\\]',r'\\.',r'"[^"\\]',r'"\\.',r'""[^"\\]',r'""\\.') + '"""'
+# Tail end of ' string.
+Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
+# Tail end of " string.
+Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
+# Tail end of ''' string.
+Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
+# Tail end of """ string.
+Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
 Triple = group("[rR]?'''", '[rR]?"""')
-String = group("[rR]?'" + any(r"[^\n'\\]", r'\\.') + "'",
-               '[rR]?"' + any(r'[^\n"\\]', r'\\.') + '"')
+# Single-line ' or " string.
+String = group(r"[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
+               r'[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
 
-Operator = group('\+=', '\-=', '\*=', '%=', '/=', '\*\*=', '&=', '\|=',
-                 '\^=', '>>=', '<<=', '\+', '\-', '\*\*', '\*', '\^', '~',
-                 '/', '%', '&', '\|', '<<', '>>', '==', '<=', '<>', '!=',
-                 '>=', '=', '<', '>')
+# Because of leftmost-then-longest match semantics, be sure to put the
+# longest operators first (e.g., if = came before ==, == would get
+# recognized as two instances of =).
+Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
+                 r"[+\-*/%&|^=<>]=?",
+                 r"~")
 
 Bracket = '[][(){}]'
 Special = group(r'\r?\n', r'[:;.,`]')
@@ -66,8 +73,9 @@ Funny = group(Operator, Bracket, Special)
 PlainToken = group(Number, Funny, String, Name)
 Token = Ignore + PlainToken
 
-ContStr = group("[rR]?'" + any(r'\\.', r"[^\n'\\]") + group("'", r'\\\r?\n'),
-                '[rR]?"' + any(r'\\.', r'[^\n"\\]') + group('"', r'\\\r?\n'))
+# First (or only) line of ' or " string.
+ContStr = group(r"[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r'\\\r?\n'),
+                r'[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r'\\\r?\n'))
 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
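
Aside (not from the commit): the ordering caveat in the new Operator comment is easy to see directly. Python's re tries alternatives left to right and takes the first one that matches at a position, so if '=' preceded '==' in the alternation, '==' would tokenize as two '=' tokens. A small sketch, with bad/good as illustrative names and group() again mirroring tokenize.py's helper:

    import re

    def group(*choices): return '(' + '|'.join(choices) + ')'

    # Wrong order: '=' is tried, and succeeds, before '==' is considered.
    bad = group('=', '==')
    # Order from the patch: longer operators first, then the one-or-two
    # character forms collapsed into a class with an optional trailing '='.
    good = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

    print(re.findall(bad, "a == b"))    # ['=', '='] -- '==' split in two
    print(re.findall(good, "a == b"))   # ['==']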