From 0189a46ec59ca02feadcfde2840f4236756598da Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka
Date: Thu, 3 Oct 2013 12:08:22 +0300
Subject: Issue #18037: 2to3 now escapes '\u' and '\U' in native strings.

---
 Lib/lib2to3/fixes/fix_unicode.py | 32 +++++++++++++++++++++++++-------
 Lib/lib2to3/tests/test_fixers.py | 37 +++++++++++++++++++++++++++++++++++++
 Misc/NEWS                        |  2 ++
 3 files changed, 64 insertions(+), 7 deletions(-)

diff --git a/Lib/lib2to3/fixes/fix_unicode.py b/Lib/lib2to3/fixes/fix_unicode.py
index 6c89576..922486b 100644
--- a/Lib/lib2to3/fixes/fix_unicode.py
+++ b/Lib/lib2to3/fixes/fix_unicode.py
@@ -1,25 +1,43 @@
-"""Fixer that changes unicode to str, unichr to chr, and u"..." into "...".
+r"""Fixer for unicode.
+
+* Changes unicode to str and unichr to chr.
+
+* If "...\u..." is not unicode literal change it into "...\\u...".
+
+* Change u"..." into "...".
 
 """
 
-import re
 from ..pgen2 import token
 from .. import fixer_base
 
 _mapping = {u"unichr" : u"chr", u"unicode" : u"str"}
-_literal_re = re.compile(ur"[uU][rR]?[\'\"]")
 
 class FixUnicode(fixer_base.BaseFix):
     BM_compatible = True
     PATTERN = "STRING | 'unicode' | 'unichr'"
 
+    def start_tree(self, tree, filename):
+        super(FixUnicode, self).start_tree(tree, filename)
+        self.unicode_literals = 'unicode_literals' in tree.future_features
+
     def transform(self, node, results):
         if node.type == token.NAME:
             new = node.clone()
             new.value = _mapping[node.value]
             return new
         elif node.type == token.STRING:
-            if _literal_re.match(node.value):
-                new = node.clone()
-                new.value = new.value[1:]
-                return new
+            val = node.value
+            if (not self.unicode_literals and val[0] in u'rR\'"' and
+                u'\\' in val):
+                val = ur'\\'.join([
+                    v.replace(u'\\u', ur'\\u').replace(u'\\U', ur'\\U')
+                    for v in val.split(ur'\\')
+                ])
+            if val[0] in u'uU':
+                val = val[1:]
+            if val == node.value:
+                return node
+            new = node.clone()
+            new.value = val
+            return new
diff --git a/Lib/lib2to3/tests/test_fixers.py b/Lib/lib2to3/tests/test_fixers.py
index 1817208..5f283a8 100644
--- a/Lib/lib2to3/tests/test_fixers.py
+++ b/Lib/lib2to3/tests/test_fixers.py
@@ -2824,6 +2824,43 @@ class Test_unicode(FixerTestCase):
         a = """R'''x''' """
         self.check(b, a)
 
+    def test_native_literal_escape_u(self):
+        b = """'\\\\\\u20ac\\U0001d121\\\\u20ac'"""
+        a = """'\\\\\\\\u20ac\\\\U0001d121\\\\u20ac'"""
+        self.check(b, a)
+
+        b = """r'\\\\\\u20ac\\U0001d121\\\\u20ac'"""
+        a = """r'\\\\\\\\u20ac\\\\U0001d121\\\\u20ac'"""
+        self.check(b, a)
+
+    def test_bytes_literal_escape_u(self):
+        b = """b'\\\\\\u20ac\\U0001d121\\\\u20ac'"""
+        a = """b'\\\\\\u20ac\\U0001d121\\\\u20ac'"""
+        self.check(b, a)
+
+        b = """br'\\\\\\u20ac\\U0001d121\\\\u20ac'"""
+        a = """br'\\\\\\u20ac\\U0001d121\\\\u20ac'"""
+        self.check(b, a)
+
+    def test_unicode_literal_escape_u(self):
+        b = """u'\\\\\\u20ac\\U0001d121\\\\u20ac'"""
+        a = """'\\\\\\u20ac\\U0001d121\\\\u20ac'"""
+        self.check(b, a)
+
+        b = """ur'\\\\\\u20ac\\U0001d121\\\\u20ac'"""
+        a = """r'\\\\\\u20ac\\U0001d121\\\\u20ac'"""
+        self.check(b, a)
+
+    def test_native_unicode_literal_escape_u(self):
+        f = 'from __future__ import unicode_literals\n'
+        b = f + """'\\\\\\u20ac\\U0001d121\\\\u20ac'"""
+        a = f + """'\\\\\\u20ac\\U0001d121\\\\u20ac'"""
+        self.check(b, a)
+
+        b = f + """r'\\\\\\u20ac\\U0001d121\\\\u20ac'"""
+        a = f + """r'\\\\\\u20ac\\U0001d121\\\\u20ac'"""
+        self.check(b, a)
+
 class Test_callable(FixerTestCase):
     fixer = "callable"
 
diff --git a/Misc/NEWS b/Misc/NEWS
index feb3f08..9aad84f 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -32,6 +32,8 @@ Core and Builtins
 Library
 -------
 
+- Issue #18037: 2to3 now escapes '\u' and '\U' in native strings.
+
 - Issue #19137: The pprint module now correctly formats empty set and frozenset
   and instances of set and frozenset subclasses.
 
-- 
cgit v0.12