summary refs log tree commit diff stats
path: root/Parser/tokenizer.c
diff options
context:
space:
mode:
author Hye-Shik Chang <hyeshik@gmail.com> 2004-08-04 17:36:41 (GMT)
committer Hye-Shik Chang <hyeshik@gmail.com> 2004-08-04 17:36:41 (GMT)
commit 7df44b384a4391cfed0a4d26b7e314a06ae4d595 (patch)
tree ca296981c3244abf8c42ac8f813e540fe9833e24 /Parser/tokenizer.c
parent 5910d81c979b79a98f3d5ac8dea81e84ab721c37 (diff)
download cpython-7df44b384a4391cfed0a4d26b7e314a06ae4d595.zip
cpython-7df44b384a4391cfed0a4d26b7e314a06ae4d595.tar.gz
cpython-7df44b384a4391cfed0a4d26b7e314a06ae4d595.tar.bz2
SF #941229: Decode source code with sys.stdin.encoding in interactive
modes like non-interactive modes. This allows for non-latin-1 users to write unicode strings directly and sets Japanese users free from weird manual escaping <wink> in shift_jis environments. (Reviewed by Martin v. Loewis)
Diffstat (limited to 'Parser/tokenizer.c')
-rw-r--r-- Parser/tokenizer.c 61
1 files changed, 61 insertions, 0 deletions
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 4fdc2e6..8fc2c26 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -651,6 +651,63 @@ PyTokenizer_Free(struct tok_state *tok)
PyMem_DEL(tok);
}
+#if !defined(PGEN) && defined(Py_USING_UNICODE)
+static int /* 0: *inp was converted or deliberately left untouched; -1: tok->done was set to E_NOMEM */
+tok_stdin_decode(struct tok_state *tok, char **inp) /* Re-encode the NUL-terminated text in *inp from sys.stdin's encoding to UTF-8 */
+{
+ PyObject *enc, *sysstdin, *decoded, *utf8;
+ const char *encoding;
+ char *converted;
+
+ if (PySys_GetFile((char *)"stdin", NULL) != stdin) /* skip unless sys.stdin yields the C stdin stream */
+ return 0;
+ sysstdin = PySys_GetObject("stdin");
+ if (sysstdin == NULL || !PyFile_Check(sysstdin)) /* sys.stdin missing or not a file object: nothing to do */
+ return 0;
+
+ enc = ((PyFileObject *)sysstdin)->f_encoding; /* read straight from the file object; no reference taken yet */
+ if (enc == NULL || !PyString_Check(enc)) /* no usable encoding name: leave *inp as is */
+ return 0;
+ Py_INCREF(enc); /* held until every exit path below DECREFs it */
+
+ encoding = PyString_AsString(enc);
+ decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL); /* length comes from strlen, so input stops at the first NUL */
+ if (decoded == NULL)
+ goto error_clear;
+
+ utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
+ Py_DECREF(decoded); /* released before the NULL check, so both outcomes drop it */
+ if (utf8 == NULL)
+ goto error_clear;
+
+ converted = new_string(PyString_AsString(utf8), PyString_Size(utf8)); /* copy made via new_string() so utf8 can be released next */
+ Py_DECREF(utf8);
+ if (converted == NULL)
+ goto error_nomem;
+
+ PyMem_FREE(*inp); /* conversion succeeded: discard the original buffer */
+ *inp = converted; /* caller now sees the UTF-8 buffer */
+ if (tok->encoding != NULL)
+ PyMem_DEL(tok->encoding); /* drop any previously stored encoding name */
+ tok->encoding = new_string(encoding, strlen(encoding)); /* store a copy of the encoding name on the tokenizer state */
+ if (tok->encoding == NULL)
+ goto error_nomem;
+
+ Py_DECREF(enc);
+ return 0;
+
+error_nomem: /* reached when new_string() returned NULL */
+ Py_DECREF(enc);
+ tok->done = E_NOMEM;
+ return -1; /* the visible caller in tok_nextc frees the buffer when the result is non-zero */
+
+error_clear:
+ /* Decoding or UTF-8 encoding failed: discard the Python error and return 0 with *inp unconverted (fallback to iso-8859-1, for backward compatibility) */
+ Py_DECREF(enc);
+ PyErr_Clear();
+ return 0;
+}
+#endif
/* Get next char, updating state; error code goes into tok->done */
@@ -690,6 +747,10 @@ tok_nextc(register struct tok_state *tok)
PyMem_FREE(new);
tok->done = E_EOF;
}
+#if !defined(PGEN) && defined(Py_USING_UNICODE)
+ else if (tok_stdin_decode(tok, &new) != 0)
+ PyMem_FREE(new);
+#endif
else if (tok->start != NULL) {
size_t start = tok->start - tok->buf;
size_t oldlen = tok->cur - tok->buf;