diff options
-rw-r--r-- | Doc/lib/libstdtypes.tex | 5 | ||||
-rw-r--r-- | Include/errcode.h | 1 | ||||
-rw-r--r-- | Include/unicodeobject.h | 6 | ||||
-rw-r--r-- | Lib/test/badsyntax_3131.py | 2 | ||||
-rw-r--r-- | Lib/test/test_pep3131.py | 29 | ||||
-rw-r--r-- | Lib/test/test_unicode.py | 13 | ||||
-rw-r--r-- | Misc/NEWS | 2 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 45 | ||||
-rw-r--r-- | Parser/tokenizer.c | 29 | ||||
-rw-r--r-- | Python/ast.c | 21 | ||||
-rw-r--r-- | Python/pythonrun.c | 4 |
11 files changed, 152 insertions, 5 deletions
diff --git a/Doc/lib/libstdtypes.tex b/Doc/lib/libstdtypes.tex index 99c54d8..d8abce9 100644 --- a/Doc/lib/libstdtypes.tex +++ b/Doc/lib/libstdtypes.tex @@ -653,6 +653,11 @@ is at least one character, false otherwise. For 8-bit strings, this method is locale-dependent. \end{methoddesc} +\begin{methoddesc}[str]{isidentifier}{} +Return True if S is a valid identifier according +to the language definition. +\end{methoddesc} + \begin{methoddesc}[str]{islower}{} Return true if all cased characters in the string are lowercase and there is at least one cased character, false otherwise. diff --git a/Include/errcode.h b/Include/errcode.h index becec80..6bb3cc1 100644 --- a/Include/errcode.h +++ b/Include/errcode.h @@ -29,6 +29,7 @@ extern "C" { #define E_EOFS 23 /* EOF in triple-quoted string */ #define E_EOLS 24 /* EOL in single-quoted string */ #define E_LINECONT 25 /* Unexpected characters after a line continuation */ +#define E_IDENTIFIER 26 /* Invalid characters in identifier */ #ifdef __cplusplus } diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 54b7129..5545344 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -182,6 +182,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding # define PyUnicode_GetMax PyUnicodeUCS2_GetMax # define PyUnicode_GetSize PyUnicodeUCS2_GetSize +# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier # define PyUnicode_Join PyUnicodeUCS2_Join # define PyUnicode_Partition PyUnicodeUCS2_Partition # define PyUnicode_RPartition PyUnicodeUCS2_RPartition @@ -268,6 +269,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding # define PyUnicode_GetMax PyUnicodeUCS4_GetMax # define PyUnicode_GetSize PyUnicodeUCS4_GetSize +# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier # define PyUnicode_Join PyUnicodeUCS4_Join # define PyUnicode_Partition PyUnicodeUCS4_Partition # define 
PyUnicode_RPartition PyUnicodeUCS4_RPartition @@ -1250,6 +1252,10 @@ PyAPI_FUNC(int) PyUnicode_Contains( PyObject *element /* Element string */ ); +/* Checks whether argument is a valid identifier. */ + +PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); + /* Externally visible for str.strip(unicode) */ PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( PyUnicodeObject *self, diff --git a/Lib/test/badsyntax_3131.py b/Lib/test/badsyntax_3131.py new file mode 100644 index 0000000..901d374 --- /dev/null +++ b/Lib/test/badsyntax_3131.py @@ -0,0 +1,2 @@ +# -*- coding: utf-8 -*- +€ = 2 diff --git a/Lib/test/test_pep3131.py b/Lib/test/test_pep3131.py new file mode 100644 index 0000000..94801c9 --- /dev/null +++ b/Lib/test/test_pep3131.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- +import unittest +from test import test_support + +class PEP3131Test(unittest.TestCase): + + def test_valid(self): + class T: + ä = 1 + µ = 2 # this is a compatibility character + 蟒 = 3 + self.assertEquals(getattr(T, "\xe4"), 1) + self.assertEquals(getattr(T, "\u03bc"), 2) + self.assertEquals(getattr(T, '\u87d2'), 3) + + def test_invalid(self): + try: + from test import badsyntax_3131 + except SyntaxError as s: + self.assertEquals(str(s), + "invalid character in identifier (badsyntax_3131.py, line 2)") + else: + self.fail("expected exception didn't occur") + +def test_main(): + test_support.run_unittest(PEP3131Test) + +if __name__=="__main__": + test_main() diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 3032f79..4b582de 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -313,6 +313,19 @@ class UnicodeTest( self.assertRaises(TypeError, "abc".isnumeric, 42) + def test_isidentifier(self): + self.assertTrue("a".isidentifier()) + self.assertTrue("Z".isidentifier()) + self.assertTrue("_".isidentifier()) + self.assertTrue("b0".isidentifier()) + self.assertTrue("bc".isidentifier()) + self.assertTrue("b_".isidentifier()) + self.assertTrue("µ".isidentifier()) + + 
self.assertFalse(" ".isidentifier()) + self.assertFalse("[".isidentifier()) + self.assertFalse("©".isidentifier()) + def test_contains(self): # Testing Unicode contains method self.assert_('a' in 'abdb') diff --git a/Misc/NEWS b/Misc/NEWS --- a/Misc/NEWS +++ b/Misc/NEWS @@ -26,6 +26,8 @@ TO DO Core and Builtins ----------------- +- PEP 3131: Support non-ASCII identifiers. + - PEP 3120: Change default encoding to UTF-8. - PEP 3123: Use proper C inheritance for PyObject. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 47109a5..e9f97df 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -227,7 +227,8 @@ int unicode_resize(register PyUnicodeObject *unicode, } /* We allocate one more byte to make sure the string is - Ux0000 terminated -- XXX is this needed ? + Ux0000 terminated; some code (e.g. new_identifier) + relies on that. XXX This allocator could further be enhanced by assuring that the free list never reduces its size below 1. @@ -6679,6 +6680,47 @@ unicode_isnumeric(PyUnicodeObject *self) return PyBool_FromLong(1); } +int +PyUnicode_IsIdentifier(PyObject *self) +{ + register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); + register const Py_UNICODE *e; + + /* Special case for empty strings */ + if (PyUnicode_GET_SIZE(self) == 0) + return 0; + + /* PEP 3131 says that the first character must be in + XID_Start and subsequent characters in XID_Continue, + and for the ASCII range, the 2.x rules apply (i.e. + start with letters and underscore, continue with + letters, digits, underscore). However, given the current + definition of XID_Start and XID_Continue, it is sufficient + to check just for these, except that _ must be allowed + as starting an identifier. 
*/ + if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */) + return 0; + + e = p + PyUnicode_GET_SIZE(self); + for (p++; p < e; p++) { + if (!_PyUnicode_IsXidContinue(*p)) + return 0; + } + return 1; +} + +PyDoc_STRVAR(isidentifier__doc__, +"S.isidentifier() -> bool\n\ +\n\ +Return True if S is a valid identifier according\n\ +to the language definition."); + +static PyObject* +unicode_isidentifier(PyObject *self) +{ + return PyBool_FromLong(PyUnicode_IsIdentifier(self)); +} + PyDoc_STRVAR(join__doc__, "S.join(sequence) -> unicode\n\ \n\ @@ -7714,6 +7756,7 @@ static PyMethodDef unicode_methods[] = { {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__}, {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__}, {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__}, + {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__}, {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__}, #if 0 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__}, diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 2e700bc..8f30fef 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -21,13 +21,15 @@ #define is_potential_identifier_start(c) (\ (c >= 'a' && c <= 'z')\ || (c >= 'A' && c <= 'Z')\ - || c == '_') + || c == '_'\ + || (c >= 128)) #define is_potential_identifier_char(c) (\ (c >= 'a' && c <= 'z')\ || (c >= 'A' && c <= 'Z')\ || (c >= '0' && c <= '9')\ - || c == '_') + || c == '_'\ + || (c >= 128)) extern char *PyOS_Readline(FILE *, FILE *, char *); /* Return malloc'ed string including trailing \n; @@ -1070,6 +1072,19 @@ indenterror(struct tok_state *tok) return 0; } +#ifdef PGEN +#define verify_identifier(s,e) 1 +#else +/* Verify that the identifier follows PEP 3131. 
*/ +static int +verify_identifier(char *start, char *end) +{ + PyObject *s = PyUnicode_DecodeUTF8(start, end-start, NULL); + int result = PyUnicode_IsIdentifier(s); + Py_DECREF(s); + return result; +} +#endif /* Get next token, after space stripping etc. */ @@ -1077,7 +1092,7 @@ static int tok_get(register struct tok_state *tok, char **p_start, char **p_end) { register int c; - int blankline; + int blankline, nonascii; *p_start = *p_end = NULL; nextline: @@ -1195,6 +1210,7 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end) } /* Identifier (most frequent token!) */ + nonascii = 0; if (is_potential_identifier_start(c)) { /* Process r"", u"" and ur"" */ switch (c) { @@ -1214,9 +1230,16 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end) break; } while (is_potential_identifier_char(c)) { + if (c >= 128) + nonascii = 1; c = tok_nextc(tok); } tok_backup(tok, c); + if (nonascii && + !verify_identifier(tok->start, tok->cur)) { + tok->done = E_IDENTIFIER; + return ERRORTOKEN; + } *p_start = tok->start; *p_end = tok->cur; return NAME; diff --git a/Python/ast.c b/Python/ast.c index 5426c02..c13d093 100644 --- a/Python/ast.c +++ b/Python/ast.c @@ -47,8 +47,27 @@ static PyObject *parsestrplus(struct compiling *, const node *n, #define COMP_SETCOMP 2 static identifier -new_identifier(const char* n, PyArena *arena) { +new_identifier(const char* n, PyArena *arena) +{ PyObject* id = PyUnicode_DecodeUTF8(n, strlen(n), NULL); + Py_UNICODE *u = PyUnicode_AS_UNICODE(id); + /* Check whether there are non-ASCII characters in the + identifier; if so, normalize to NFKC. 
*/ + for (; *u; u++) { + if (*u >= 128) { + PyObject *m = PyImport_ImportModule("unicodedata"); + PyObject *id2; + if (!m) + return NULL; + id2 = PyObject_CallMethod(m, "normalize", "sO", "NFKC", id); + Py_DECREF(m); + if (!id2) + return NULL; + Py_DECREF(id); + id = id2; + break; + } + } PyUnicode_InternInPlace(&id); PyArena_AddPyObject(arena, id); return id; diff --git a/Python/pythonrun.c b/Python/pythonrun.c index cc0926a..32bc6f7 100644 --- a/Python/pythonrun.c +++ b/Python/pythonrun.c @@ -1530,6 +1530,10 @@ err_input(perrdetail *err) case E_LINECONT: msg = "unexpected character after line continuation character"; break; + + case E_IDENTIFIER: + msg = "invalid character in identifier"; + break; default: fprintf(stderr, "error=%d\n", err->error); msg = "unknown parsing error"; |