diff options
-rw-r--r-- | Lib/test/test_pep3131.py | 3 | ||||
-rw-r--r-- | Lib/test/test_unicode.py | 1 | ||||
-rw-r--r-- | Misc/NEWS | 3 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 31 |
4 files changed, 30 insertions, 8 deletions
diff --git a/Lib/test/test_pep3131.py b/Lib/test/test_pep3131.py index 9d5f217..ed7558a 100644 --- a/Lib/test/test_pep3131.py +++ b/Lib/test/test_pep3131.py @@ -8,9 +8,12 @@ class PEP3131Test(unittest.TestCase): ä = 1 µ = 2 # this is a compatibility character 蟒 = 3 + 𝔘𝔫𝔦𝔠𝔬𝔡𝔢 = 4 self.assertEqual(getattr(T, "\xe4"), 1) self.assertEqual(getattr(T, "\u03bc"), 2) self.assertEqual(getattr(T, '\u87d2'), 3) + v = getattr(T, "\U0001d518\U0001d52b\U0001d526\U0001d520\U0001d52c\U0001d521\U0001d522") + self.assertEqual(v, 4) def test_invalid(self): try: diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 1da44b0..19b31c0 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -404,6 +404,7 @@ class UnicodeTest(string_tests.CommonTest, self.assertTrue("bc".isidentifier()) self.assertTrue("b_".isidentifier()) self.assertTrue("µ".isidentifier()) + self.assertTrue("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".isidentifier()) self.assertFalse(" ".isidentifier()) self.assertFalse("[".isidentifier()) @@ -10,6 +10,9 @@ What's New in Python 3.3 Alpha 1? Core and Builtins ----------------- +- Issue #12732: In narrow unicode builds, allow Unicode identifiers which fall + outside the BMP. + - Issue #12575: Validate user-generated AST before it is compiled. - Make type(None), type(Ellipsis), and type(NotImplemented) callable. 
They diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 0918671..61b253d 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -8044,14 +8044,30 @@ unicode_isnumeric(PyUnicodeObject *self) return PyBool_FromLong(1); } +static Py_UCS4 +decode_ucs4(const Py_UNICODE *s, Py_ssize_t *i, Py_ssize_t size) +{ + Py_UCS4 ch; + assert(*i < size); + ch = s[(*i)++]; +#ifndef Py_UNICODE_WIDE + if ((ch & 0xfffffc00) == 0xd800 && + *i < size + && (s[*i] & 0xFFFFFC00) == 0xDC00) + ch = ((Py_UCS4)ch << 10UL) + (Py_UCS4)(s[(*i)++]) - 0x35fdc00; +#endif + return ch; +} + int PyUnicode_IsIdentifier(PyObject *self) { - register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); - register const Py_UNICODE *e; + Py_ssize_t i = 0, size = PyUnicode_GET_SIZE(self); + Py_UCS4 first; + const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); /* Special case for empty strings */ - if (PyUnicode_GET_SIZE(self) == 0) + if (!size) return 0; /* PEP 3131 says that the first character must be in @@ -8062,14 +8078,13 @@ PyUnicode_IsIdentifier(PyObject *self) definition of XID_Start and XID_Continue, it is sufficient to check just for these, except that _ must be allowed as starting an identifier. */ - if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */) + first = decode_ucs4(p, &i, size); + if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) return 0; - e = p + PyUnicode_GET_SIZE(self); - for (p++; p < e; p++) { - if (!_PyUnicode_IsXidContinue(*p)) + while (i < size) + if (!_PyUnicode_IsXidContinue(decode_ucs4(p, &i, size))) return 0; - } return 1; } |