summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Meador Inge <meadori@gmail.com> 2012-06-17 02:05:50 (GMT)
committer Meador Inge <meadori@gmail.com> 2012-06-17 02:05:50 (GMT)
commit 43f42fc3cb67433c88e31268767c0cab36422351 (patch)
tree 5f378d62132769aa2c1c2fe4d5d9cf94784c3360
parent 7cf66996992eeb7f3ad4c19f960b967e1beb5fa3 (diff)
download cpython-43f42fc3cb67433c88e31268767c0cab36422351.zip
cpython-43f42fc3cb67433c88e31268767c0cab36422351.tar.gz
cpython-43f42fc3cb67433c88e31268767c0cab36422351.tar.bz2
Issue #15054: Fix incorrect tokenization of 'b' and 'br' string literals.
Patch by Serhiy Storchaka.
-rw-r--r-- Lib/test/test_tokenize.py | 25
-rw-r--r-- Lib/tokenize.py | 10
-rw-r--r-- Misc/NEWS | 4
3 files changed, 34 insertions, 5 deletions
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 38da106..a51e781 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -278,6 +278,31 @@ String literals
OP '+' (1, 32) (1, 33)
STRING 'UR"ABC"' (1, 34) (1, 41)
+ >>> dump_tokens("b'abc' + B'abc'")
+ STRING "b'abc'" (1, 0) (1, 6)
+ OP '+' (1, 7) (1, 8)
+ STRING "B'abc'" (1, 9) (1, 15)
+ >>> dump_tokens('b"abc" + B"abc"')
+ STRING 'b"abc"' (1, 0) (1, 6)
+ OP '+' (1, 7) (1, 8)
+ STRING 'B"abc"' (1, 9) (1, 15)
+ >>> dump_tokens("br'abc' + bR'abc' + Br'abc' + BR'abc'")
+ STRING "br'abc'" (1, 0) (1, 7)
+ OP '+' (1, 8) (1, 9)
+ STRING "bR'abc'" (1, 10) (1, 17)
+ OP '+' (1, 18) (1, 19)
+ STRING "Br'abc'" (1, 20) (1, 27)
+ OP '+' (1, 28) (1, 29)
+ STRING "BR'abc'" (1, 30) (1, 37)
+ >>> dump_tokens('br"abc" + bR"abc" + Br"abc" + BR"abc"')
+ STRING 'br"abc"' (1, 0) (1, 7)
+ OP '+' (1, 8) (1, 9)
+ STRING 'bR"abc"' (1, 10) (1, 17)
+ OP '+' (1, 18) (1, 19)
+ STRING 'Br"abc"' (1, 20) (1, 27)
+ OP '+' (1, 28) (1, 29)
+ STRING 'BR"abc"' (1, 30) (1, 37)
+
Operators
>>> dump_tokens("def d22(a, b, c=2, d=2, *k): pass")
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index ae3de54..1cba6e5 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -70,10 +70,10 @@ Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
+Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
-String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
- r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
+String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
+ r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
@@ -91,9 +91,9 @@ PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
# First (or only) line of ' or " string.
-ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
+ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
group("'", r'\\\r?\n'),
- r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
+ r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
diff --git a/Misc/NEWS b/Misc/NEWS
index c4b27fb..c50f6cb 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -67,6 +67,10 @@ Core and Builtins
Library
-------
+- Issue #15054: A bug in tokenize.tokenize that caused string literals
+ with 'b' and 'br' prefixes to be incorrectly tokenized has been fixed.
+ Patch by Serhiy Storchaka.
+
- Issue #15036: Allow removing or changing multiple items in
single-file mailboxes (mbox, MMDF, Babyl) flushing the mailbox
between the changes.