diff options
author | Hye-Shik Chang <hyeshik@gmail.com> | 2004-08-04 17:36:41 (GMT) |
---|---|---|
committer | Hye-Shik Chang <hyeshik@gmail.com> | 2004-08-04 17:36:41 (GMT) |
commit | 7df44b384a4391cfed0a4d26b7e314a06ae4d595 (patch) | |
tree | ca296981c3244abf8c42ac8f813e540fe9833e24 /Parser/tokenizer.c | |
parent | 5910d81c979b79a98f3d5ac8dea81e84ab721c37 (diff) | |
download | cpython-7df44b384a4391cfed0a4d26b7e314a06ae4d595.zip cpython-7df44b384a4391cfed0a4d26b7e314a06ae4d595.tar.gz cpython-7df44b384a4391cfed0a4d26b7e314a06ae4d595.tar.bz2 |
SF #941229: In interactive mode, decode source code using
sys.stdin.encoding, so that interactive input is decoded just as
non-interactive input already is. This allows non-Latin-1 users to write
Unicode strings directly and frees Japanese users from weird manual
escaping <wink> in Shift_JIS environments.
(Reviewed by Martin v. Loewis)
Diffstat (limited to 'Parser/tokenizer.c')
-rw-r--r-- | Parser/tokenizer.c | 61 |
1 files changed, 61 insertions, 0 deletions
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 4fdc2e6..8fc2c26 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -651,6 +651,63 @@ PyTokenizer_Free(struct tok_state *tok) PyMem_DEL(tok); } +#if !defined(PGEN) && defined(Py_USING_UNICODE) +static int +tok_stdin_decode(struct tok_state *tok, char **inp) +{ + PyObject *enc, *sysstdin, *decoded, *utf8; + const char *encoding; + char *converted; + + if (PySys_GetFile((char *)"stdin", NULL) != stdin) + return 0; + sysstdin = PySys_GetObject("stdin"); + if (sysstdin == NULL || !PyFile_Check(sysstdin)) + return 0; + + enc = ((PyFileObject *)sysstdin)->f_encoding; + if (enc == NULL || !PyString_Check(enc)) + return 0; + Py_INCREF(enc); + + encoding = PyString_AsString(enc); + decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL); + if (decoded == NULL) + goto error_clear; + + utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL); + Py_DECREF(decoded); + if (utf8 == NULL) + goto error_clear; + + converted = new_string(PyString_AsString(utf8), PyString_Size(utf8)); + Py_DECREF(utf8); + if (converted == NULL) + goto error_nomem; + + PyMem_FREE(*inp); + *inp = converted; + if (tok->encoding != NULL) + PyMem_DEL(tok->encoding); + tok->encoding = new_string(encoding, strlen(encoding)); + if (tok->encoding == NULL) + goto error_nomem; + + Py_DECREF(enc); + return 0; + +error_nomem: + Py_DECREF(enc); + tok->done = E_NOMEM; + return -1; + +error_clear: + /* Fallback to iso-8859-1: for backward compatibility */ + Py_DECREF(enc); + PyErr_Clear(); + return 0; +} +#endif /* Get next char, updating state; error code goes into tok->done */ @@ -690,6 +747,10 @@ tok_nextc(register struct tok_state *tok) PyMem_FREE(new); tok->done = E_EOF; } +#if !defined(PGEN) && defined(Py_USING_UNICODE) + else if (tok_stdin_decode(tok, &new) != 0) + PyMem_FREE(new); +#endif else if (tok->start != NULL) { size_t start = tok->start - tok->buf; size_t oldlen = tok->cur - tok->buf; |