summary | refs | log | tree | commit | diff | stats
path: root/Parser/tokenizer.c
diff options
context:
space:
mode:
author: Martin v. Löwis <martin@v.loewis.de>, 2007-08-15 07:32:56 (GMT)
committer: Martin v. Löwis <martin@v.loewis.de>, 2007-08-15 07:32:56 (GMT)
commit: 47383403a0a11259acb640406a8efc38981d2255 (patch)
tree: ad461e275dc3f2607bab86bb596366d71489b453 /Parser/tokenizer.c
parent: 32c4ac014387d3bffea5461339b8ad3044d0dafb (diff)
download: cpython-47383403a0a11259acb640406a8efc38981d2255.zip
cpython-47383403a0a11259acb640406a8efc38981d2255.tar.gz
cpython-47383403a0a11259acb640406a8efc38981d2255.tar.bz2
Implement PEP 3131. Add isidentifier to str.
Diffstat (limited to 'Parser/tokenizer.c')
-rw-r--r-- Parser/tokenizer.c | 29
1 files changed, 26 insertions, 3 deletions
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 2e700bc..8f30fef 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -21,13 +21,15 @@
#define is_potential_identifier_start(c) (\
(c >= 'a' && c <= 'z')\
|| (c >= 'A' && c <= 'Z')\
- || c == '_')
+ || c == '_'\
+ || (c >= 128))
#define is_potential_identifier_char(c) (\
(c >= 'a' && c <= 'z')\
|| (c >= 'A' && c <= 'Z')\
|| (c >= '0' && c <= '9')\
- || c == '_')
+ || c == '_'\
+ || (c >= 128))
extern char *PyOS_Readline(FILE *, FILE *, char *);
/* Return malloc'ed string including trailing \n;
@@ -1070,6 +1072,19 @@ indenterror(struct tok_state *tok)
return 0;
}
+#ifdef PGEN
+#define verify_identifier(s,e) 1
+#else
+/* Verify that the identifier follows PEP 3131. */
+static int
+verify_identifier(char *start, char *end)
+{
+ PyObject *s = PyUnicode_DecodeUTF8(start, end-start, NULL);
+ int result = PyUnicode_IsIdentifier(s);
+ Py_DECREF(s);
+ return result;
+}
+#endif
/* Get next token, after space stripping etc. */
@@ -1077,7 +1092,7 @@ static int
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
register int c;
- int blankline;
+ int blankline, nonascii;
*p_start = *p_end = NULL;
nextline:
@@ -1195,6 +1210,7 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end)
}
/* Identifier (most frequent token!) */
+ nonascii = 0;
if (is_potential_identifier_start(c)) {
/* Process r"", u"" and ur"" */
switch (c) {
@@ -1214,9 +1230,16 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end)
break;
}
while (is_potential_identifier_char(c)) {
+ if (c >= 128)
+ nonascii = 1;
c = tok_nextc(tok);
}
tok_backup(tok, c);
+ if (nonascii &&
+ !verify_identifier(tok->start, tok->cur)) {
+ tok->done = E_IDENTIFIER;
+ return ERRORTOKEN;
+ }
*p_start = tok->start;
*p_end = tok->cur;
return NAME;