summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Doc/lib/libstdtypes.tex5
-rw-r--r--Include/errcode.h1
-rw-r--r--Include/unicodeobject.h6
-rw-r--r--Lib/test/badsyntax_3131.py2
-rw-r--r--Lib/test/test_pep3131.py29
-rw-r--r--Lib/test/test_unicode.py13
-rw-r--r--Misc/NEWS2
-rw-r--r--Objects/unicodeobject.c45
-rw-r--r--Parser/tokenizer.c29
-rw-r--r--Python/ast.c21
-rw-r--r--Python/pythonrun.c4
11 files changed, 152 insertions, 5 deletions
diff --git a/Doc/lib/libstdtypes.tex b/Doc/lib/libstdtypes.tex
index 99c54d8..d8abce9 100644
--- a/Doc/lib/libstdtypes.tex
+++ b/Doc/lib/libstdtypes.tex
@@ -653,6 +653,11 @@ is at least one character, false otherwise.
For 8-bit strings, this method is locale-dependent.
\end{methoddesc}
+\begin{methoddesc}[str]{isidentifier}{}
+Return true if the string is a valid identifier according
+to the language definition.
+\end{methoddesc}
+
\begin{methoddesc}[str]{islower}{}
Return true if all cased characters in the string are lowercase and
there is at least one cased character, false otherwise.
diff --git a/Include/errcode.h b/Include/errcode.h
index becec80..6bb3cc1 100644
--- a/Include/errcode.h
+++ b/Include/errcode.h
@@ -29,6 +29,7 @@ extern "C" {
#define E_EOFS 23 /* EOF in triple-quoted string */
#define E_EOLS 24 /* EOL in single-quoted string */
#define E_LINECONT 25 /* Unexpected characters after a line continuation */
+#define E_IDENTIFIER 26 /* Invalid characters in identifier */
#ifdef __cplusplus
}
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 54b7129..5545344 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -182,6 +182,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
+# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier
# define PyUnicode_Join PyUnicodeUCS2_Join
# define PyUnicode_Partition PyUnicodeUCS2_Partition
# define PyUnicode_RPartition PyUnicodeUCS2_RPartition
@@ -268,6 +269,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
+# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier
# define PyUnicode_Join PyUnicodeUCS4_Join
# define PyUnicode_Partition PyUnicodeUCS4_Partition
# define PyUnicode_RPartition PyUnicodeUCS4_RPartition
@@ -1250,6 +1252,10 @@ PyAPI_FUNC(int) PyUnicode_Contains(
PyObject *element /* Element string */
);
+/* Checks whether argument is a valid identifier. */
+
+PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
+
/* Externally visible for str.strip(unicode) */
PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
PyUnicodeObject *self,
diff --git a/Lib/test/badsyntax_3131.py b/Lib/test/badsyntax_3131.py
new file mode 100644
index 0000000..901d374
--- /dev/null
+++ b/Lib/test/badsyntax_3131.py
@@ -0,0 +1,2 @@
+# -*- coding: utf-8 -*-
+€ = 2
diff --git a/Lib/test/test_pep3131.py b/Lib/test/test_pep3131.py
new file mode 100644
index 0000000..94801c9
--- /dev/null
+++ b/Lib/test/test_pep3131.py
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+import unittest
+from test import test_support
+
+class PEP3131Test(unittest.TestCase):
+
+ def test_valid(self):
+ class T:
+ ä = 1
+ µ = 2 # this is a compatibility character
+ 蟒 = 3
+ self.assertEquals(getattr(T, "\xe4"), 1)
+ self.assertEquals(getattr(T, "\u03bc"), 2)
+ self.assertEquals(getattr(T, '\u87d2'), 3)
+
+ def test_invalid(self):
+ try:
+ from test import badsyntax_3131
+ except SyntaxError as s:
+ self.assertEquals(str(s),
+ "invalid character in identifier (badsyntax_3131.py, line 2)")
+ else:
+ self.fail("expected exception didn't occur")
+
+def test_main():
+ test_support.run_unittest(PEP3131Test)
+
+if __name__=="__main__":
+ test_main()
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 3032f79..4b582de 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -313,6 +313,19 @@ class UnicodeTest(
self.assertRaises(TypeError, "abc".isnumeric, 42)
+ def test_isidentifier(self):
+ self.assertTrue("a".isidentifier())
+ self.assertTrue("Z".isidentifier())
+ self.assertTrue("_".isidentifier())
+ self.assertTrue("b0".isidentifier())
+ self.assertTrue("bc".isidentifier())
+ self.assertTrue("b_".isidentifier())
+ self.assertTrue("µ".isidentifier())
+
+ self.assertFalse(" ".isidentifier())
+ self.assertFalse("[".isidentifier())
+ self.assertFalse("©".isidentifier())
+
def test_contains(self):
# Testing Unicode contains method
self.assert_('a' in 'abdb')
diff --git a/Misc/NEWS b/Misc/NEWS
index 342ca2f..0745f8d 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -26,6 +26,8 @@ TO DO
Core and Builtins
-----------------
+- PEP 3131: Support non-ASCII identifiers.
+
- PEP 3120: Change default encoding to UTF-8.
- PEP 3123: Use proper C inheritance for PyObject.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 47109a5..e9f97df 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -227,7 +227,8 @@ int unicode_resize(register PyUnicodeObject *unicode,
}
/* We allocate one more byte to make sure the string is
- Ux0000 terminated -- XXX is this needed ?
+ Ux0000 terminated; some code (e.g. new_identifier)
+ relies on that.
XXX This allocator could further be enhanced by assuring that the
free list never reduces its size below 1.
@@ -6679,6 +6680,47 @@ unicode_isnumeric(PyUnicodeObject *self)
return PyBool_FromLong(1);
}
+/* Report whether the unicode object `self` spells a valid identifier.
+
+   Returns 1 if it does and 0 otherwise; the empty string is never an
+   identifier.
+
+   PEP 3131 says that the first character must be in XID_Start and
+   subsequent characters in XID_Continue, and for the ASCII range the
+   2.x rules apply (i.e. start with letters and underscore, continue
+   with letters, digits, underscore).  Given the current definition of
+   XID_Start and XID_Continue, it is sufficient to check just for
+   these, except that _ must be allowed as starting an identifier. */
+int
+PyUnicode_IsIdentifier(PyObject *self)
+{
+    register const Py_UNICODE *cur = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
+    register const Py_UNICODE *end = cur + PyUnicode_GET_SIZE(self);
+
+    /* Nothing to inspect: an empty string is rejected. */
+    if (cur == end)
+        return 0;
+
+    /* First character: XID_Start, or 0x5F (LOW LINE). */
+    if (!_PyUnicode_IsXidStart(*cur) && *cur != 0x5F)
+        return 0;
+
+    /* Every remaining character must be in XID_Continue. */
+    while (++cur < end) {
+        if (!_PyUnicode_IsXidContinue(*cur))
+            return 0;
+    }
+    return 1;
+}
+
+PyDoc_STRVAR(isidentifier__doc__,
+"S.isidentifier() -> bool\n\
+\n\
+Return True if S is a valid identifier according\n\
+to the language definition.");
+
+/* Implementation of the isidentifier() method: converts the int result
+   of PyUnicode_IsIdentifier into a Python bool object. */
+static PyObject*
+unicode_isidentifier(PyObject *self)
+{
+    int valid = PyUnicode_IsIdentifier(self);
+
+    return PyBool_FromLong(valid);
+}
+
PyDoc_STRVAR(join__doc__,
"S.join(sequence) -> unicode\n\
\n\
@@ -7714,6 +7756,7 @@ static PyMethodDef unicode_methods[] = {
{"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
+ {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
#if 0
{"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 2e700bc..8f30fef 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -21,13 +21,15 @@
#define is_potential_identifier_start(c) (\
(c >= 'a' && c <= 'z')\
|| (c >= 'A' && c <= 'Z')\
- || c == '_')
+ || c == '_'\
+ || (c >= 128))
#define is_potential_identifier_char(c) (\
(c >= 'a' && c <= 'z')\
|| (c >= 'A' && c <= 'Z')\
|| (c >= '0' && c <= '9')\
- || c == '_')
+ || c == '_'\
+ || (c >= 128))
extern char *PyOS_Readline(FILE *, FILE *, char *);
/* Return malloc'ed string including trailing \n;
@@ -1070,6 +1072,19 @@ indenterror(struct tok_state *tok)
return 0;
}
+#ifdef PGEN
+#define verify_identifier(s,e) 1
+#else
+/* Verify that the identifier follows PEP 3131.
+
+   `start` and `end` delimit the UTF-8 encoded candidate; its length is
+   taken as end-start.  Returns the result of PyUnicode_IsIdentifier on
+   the decoded text (1 for a valid identifier, 0 otherwise).
+
+   If the bytes cannot be decoded, the pending exception is cleared and
+   0 is returned, so the caller treats the token as an invalid
+   identifier instead of NULL being passed on to
+   PyUnicode_IsIdentifier and Py_DECREF. */
+static int
+verify_identifier(char *start, char *end)
+{
+    PyObject *s;
+    int result;
+
+    s = PyUnicode_DecodeUTF8(start, end-start, NULL);
+    if (s == NULL) {
+        /* Decoding failed; report "not an identifier" rather than
+           leaving an exception set behind the tokenizer's back. */
+        PyErr_Clear();
+        return 0;
+    }
+    result = PyUnicode_IsIdentifier(s);
+    Py_DECREF(s);
+    return result;
+}
+#endif
/* Get next token, after space stripping etc. */
@@ -1077,7 +1092,7 @@ static int
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
register int c;
- int blankline;
+ int blankline, nonascii;
*p_start = *p_end = NULL;
nextline:
@@ -1195,6 +1210,7 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end)
}
/* Identifier (most frequent token!) */
+ nonascii = 0;
if (is_potential_identifier_start(c)) {
/* Process r"", u"" and ur"" */
switch (c) {
@@ -1214,9 +1230,16 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end)
break;
}
while (is_potential_identifier_char(c)) {
+ if (c >= 128)
+ nonascii = 1;
c = tok_nextc(tok);
}
tok_backup(tok, c);
+ if (nonascii &&
+ !verify_identifier(tok->start, tok->cur)) {
+ tok->done = E_IDENTIFIER;
+ return ERRORTOKEN;
+ }
*p_start = tok->start;
*p_end = tok->cur;
return NAME;
diff --git a/Python/ast.c b/Python/ast.c
index 5426c02..c13d093 100644
--- a/Python/ast.c
+++ b/Python/ast.c
@@ -47,8 +47,27 @@ static PyObject *parsestrplus(struct compiling *, const node *n,
#define COMP_SETCOMP 2
static identifier
-new_identifier(const char* n, PyArena *arena) {
+new_identifier(const char* n, PyArena *arena)
+{
PyObject* id = PyUnicode_DecodeUTF8(n, strlen(n), NULL);
+ Py_UNICODE *u = PyUnicode_AS_UNICODE(id);
+ /* Check whether there are non-ASCII characters in the
+ identifier; if so, normalize to NFKC. */
+ for (; *u; u++) {
+ if (*u >= 128) {
+ PyObject *m = PyImport_ImportModule("unicodedata");
+ PyObject *id2;
+ if (!m)
+ return NULL;
+ id2 = PyObject_CallMethod(m, "normalize", "sO", "NFKC", id);
+ Py_DECREF(m);
+ if (!id2)
+ return NULL;
+ Py_DECREF(id);
+ id = id2;
+ break;
+ }
+ }
PyUnicode_InternInPlace(&id);
PyArena_AddPyObject(arena, id);
return id;
diff --git a/Python/pythonrun.c b/Python/pythonrun.c
index cc0926a..32bc6f7 100644
--- a/Python/pythonrun.c
+++ b/Python/pythonrun.c
@@ -1530,6 +1530,10 @@ err_input(perrdetail *err)
case E_LINECONT:
msg = "unexpected character after line continuation character";
break;
+
+ case E_IDENTIFIER:
+ msg = "invalid character in identifier";
+ break;
default:
fprintf(stderr, "error=%d\n", err->error);
msg = "unknown parsing error";