From 011e8420339245f9b55d41082ec6036f2f83a182 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20v=2E=20L=C3=B6wis?= Date: Tue, 5 May 2009 04:43:17 +0000 Subject: Issue #5915: Implement PEP 383, Non-decodable Bytes in System Character Interfaces. --- Doc/library/codecs.rst | 4 +- Doc/library/os.rst | 38 +++- Include/unicodeobject.h | 48 +++-- Lib/test/test_codecs.py | 29 +++ Lib/test/test_os.py | 39 +++- Misc/NEWS | 2 + Modules/_io/fileio.c | 2 +- Modules/posixmodule.c | 520 +++++++++++++++++++++++++++++------------------- Modules/python.c | 113 ++++++++--- Objects/unicodeobject.c | 89 ++++++++- Python/codecs.c | 89 +++++++++ Python/pythonrun.c | 32 +-- configure | 5 +- configure.in | 2 +- pyconfig.h.in | 3 + 15 files changed, 726 insertions(+), 289 deletions(-) diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst index ab578ea..3f1a5fe 100644 --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -322,6 +322,8 @@ and implemented by all standard Python codecs: | ``'backslashreplace'`` | Replace with backslashed escape sequences | | | (only for encoding). | +-------------------------+-----------------------------------------------+ +| ``'utf8b'`` | Replace byte with surrogate U+DCxx. | ++-------------------------+-----------------------------------------------+ In addition, the following error handlers are specific to a single codec: @@ -333,7 +335,7 @@ In addition, the following error handlers are specific to a single codec: +------------------+---------+--------------------------------------------+ .. versionadded:: 3.1 - The ``'surrogates'`` error handler. + The ``'utf8b'`` and ``'surrogates'`` error handlers. The set of allowed values can be extended via :meth:`register_error`. diff --git a/Doc/library/os.rst b/Doc/library/os.rst index c686baf..83f5ee9 100644 --- a/Doc/library/os.rst +++ b/Doc/library/os.rst @@ -51,6 +51,30 @@ the :mod:`os` module, but using them is of course a threat to portability! ``'ce'``, ``'java'``. +.. 
_os-filenames: + +File Names, Command Line Arguments, and Environment Variables +------------------------------------------------------------- + +In Python, file names, command line arguments, and environment +variables are represented using the string type. On some systems, +converting these strings to and from bytes is necessary before passing +them to the operating system. Python uses the file system encoding to +perform this conversion (see :func:`sys.getfilesystemencoding`). + +.. versionchanged:: 3.1 + On some systems, conversion using the file system encoding may + fail. In this case, Python uses the ``utf8b`` encoding error + handler, which means that undecodable bytes are replaced by a + Unicode character U+DCxx on decoding, and these are again + translated to the original byte on encoding. + + +The file system encoding must guarantee to successfully decode all +bytes below 128. If the file system encoding fails to provide this +guarantee, API functions may raise UnicodeErrors. + + .. _os-procinfo: Process Parameters @@ -688,12 +712,8 @@ Files and Directories .. function:: getcwd() - Return a string representing the current working directory. On Unix - platforms, this function may raise :exc:`UnicodeDecodeError` if the name of - the current directory is not decodable in the file system encoding. Use - :func:`getcwdb` if you need the call to never fail. Availability: Unix, - Windows. - + Return a string representing the current working directory. + Availability: Unix, Windows. .. function:: getcwdb() @@ -800,10 +820,8 @@ Files and Directories entries ``'.'`` and ``'..'`` even if they are present in the directory. Availability: Unix, Windows. - This function can be called with a bytes or string argument. In the bytes - case, all filenames will be listed as returned by the underlying API. In the - string case, filenames will be decoded using the file system encoding, and - skipped if a decoding error occurs. 
+ This function can be called with a bytes or string argument, and returns + filenames of the same datatype. .. function:: lstat(path) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 9c11873..08b518a 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -198,6 +198,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize # define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode # define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar +# define PyUnicode_FSConverter PyUnicodeUCS2_FSConverter # define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding # define PyUnicode_GetMax PyUnicodeUCS2_GetMax # define PyUnicode_GetSize PyUnicodeUCS2_GetSize @@ -296,6 +297,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize # define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode # define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar +# define PyUnicode_FSConverter PyUnicodeUCS4_FSConverter # define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding # define PyUnicode_GetMax PyUnicodeUCS4_GetMax # define PyUnicode_GetSize PyUnicodeUCS4_GetSize @@ -693,25 +695,6 @@ PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString( PyObject *unicode, const char *errors); -/* Decode a null-terminated string using Py_FileSystemDefaultEncoding. - - If the encoding is supported by one of the built-in codecs (i.e., UTF-8, - UTF-16, UTF-32, Latin-1 or MBCS), otherwise fallback to UTF-8 and replace - invalid characters with '?'. - - The function is intended to be used for paths and file names only - during bootstrapping process where the codecs are not set up. 
-*/ - -PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( - const char *s /* encoded string */ - ); - -PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( - const char *s, /* encoded string */ - Py_ssize_t size /* size */ - ); - /* Returns a pointer to the default encoding (normally, UTF-8) of the Unicode object unicode and the size of the encoded representation in bytes stored in *size. @@ -1252,6 +1235,33 @@ PyAPI_FUNC(int) PyUnicode_EncodeDecimal( const char *errors /* error handling */ ); +/* --- File system encoding ---------------------------------------------- */ + +/* ParseTuple converter which converts a Unicode object into the file + system encoding, using the PEP 383 error handler; bytes objects are + output as-is. */ + +PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); + +/* Decode a null-terminated string using Py_FileSystemDefaultEncoding. + + If the encoding is supported by one of the built-in codecs (i.e., UTF-8, + UTF-16, UTF-32, Latin-1 or MBCS), otherwise fallback to UTF-8 and replace + invalid characters with '?'. + + The function is intended to be used for paths and file names only + during bootstrapping process where the codecs are not set up. 
+*/ + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( + const char *s /* encoded string */ + ); + +PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( + const char *s, /* encoded string */ + Py_ssize_t size /* size */ + ); + /* --- Methods & Slots ---------------------------------------------------- These are capable of handling Unicode objects and strings on input diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 6706507..5a3834d 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1516,6 +1516,34 @@ class TypesTest(unittest.TestCase): self.assertEquals(codecs.raw_unicode_escape_decode(r"\u1234"), ("\u1234", 6)) self.assertEquals(codecs.raw_unicode_escape_decode(br"\u1234"), ("\u1234", 6)) +class Utf8bTest(unittest.TestCase): + + def test_utf8(self): + # Bad byte + self.assertEqual(b"foo\x80bar".decode("utf-8", "utf8b"), + "foo\udc80bar") + self.assertEqual("foo\udc80bar".encode("utf-8", "utf8b"), + b"foo\x80bar") + # bad-utf-8 encoded surrogate + self.assertEqual(b"\xed\xb0\x80".decode("utf-8", "utf8b"), + "\udced\udcb0\udc80") + self.assertEqual("\udced\udcb0\udc80".encode("utf-8", "utf8b"), + b"\xed\xb0\x80") + + def test_ascii(self): + # bad byte + self.assertEqual(b"foo\x80bar".decode("ascii", "utf8b"), + "foo\udc80bar") + self.assertEqual("foo\udc80bar".encode("ascii", "utf8b"), + b"foo\x80bar") + + def test_charmap(self): + # bad byte: \xa5 is unmapped in iso-8859-3 + self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "utf8b"), + "foo\udca5bar") + self.assertEqual("foo\udca5bar".encode("iso-8859-3", "utf8b"), + b"foo\xa5bar") + def test_main(): support.run_unittest( @@ -1543,6 +1571,7 @@ def test_main(): CharmapTest, WithStmtTest, TypesTest, + Utf8bTest, ) diff --git a/Lib/test/test_os.py b/Lib/test/test_os.py index 91e0432..a380505 100644 --- a/Lib/test/test_os.py +++ b/Lib/test/test_os.py @@ -7,6 +7,7 @@ import errno import unittest import warnings import sys +import shutil from test import support # Tests 
creating TESTFN @@ -698,9 +699,44 @@ if sys.platform != 'win32': self.assertRaises(os.error, os.setregid, 0, 0) self.assertRaises(OverflowError, os.setregid, 1<<32, 0) self.assertRaises(OverflowError, os.setregid, 0, 1<<32) + + class Pep383Tests(unittest.TestCase): + filenames = [b'foo\xf6bar', 'foo\xf6bar'.encode("utf-8")] + + def setUp(self): + self.fsencoding = sys.getfilesystemencoding() + sys.setfilesystemencoding("utf-8") + self.dir = support.TESTFN + self.bdir = self.dir.encode("utf-8", "utf8b") + os.mkdir(self.dir) + self.unicodefn = [] + for fn in self.filenames: + f = open(os.path.join(self.bdir, fn), "w") + f.close() + self.unicodefn.append(fn.decode("utf-8", "utf8b")) + + def tearDown(self): + shutil.rmtree(self.dir) + sys.setfilesystemencoding(self.fsencoding) + + def test_listdir(self): + expected = set(self.unicodefn) + found = set(os.listdir(support.TESTFN)) + self.assertEquals(found, expected) + + def test_open(self): + for fn in self.unicodefn: + f = open(os.path.join(self.dir, fn)) + f.close() + + def test_stat(self): + for fn in self.unicodefn: + os.stat(os.path.join(self.dir, fn)) else: class PosixUidGidTests(unittest.TestCase): pass + class Pep383Tests(unittest.TestCase): + pass def test_main(): support.run_unittest( @@ -714,7 +750,8 @@ def test_main(): ExecTests, Win32ErrorTests, TestInvalidFD, - PosixUidGidTests + PosixUidGidTests, + Pep383Tests ) if __name__ == "__main__": diff --git a/Misc/NEWS b/Misc/NEWS index a384c41..2e4c6bd 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -12,6 +12,8 @@ What's New in Python 3.1 beta 1? Core and Builtins ----------------- +- Implement PEP 383, Non-decodable Bytes in System Character Interfaces. + - Issue #5890: in subclasses of 'property' the __doc__ attribute was shadowed by classtype's, even if it was None. property now inserts the __doc__ into the subclass instance __dict__. 
diff --git a/Modules/_io/fileio.c b/Modules/_io/fileio.c index 4499ee2..164f7e4 100644 --- a/Modules/_io/fileio.c +++ b/Modules/_io/fileio.c @@ -245,7 +245,7 @@ fileio_init(PyObject *oself, PyObject *args, PyObject *kwds) return -1; stringobj = PyUnicode_AsEncodedString( - u, Py_FileSystemDefaultEncoding, NULL); + u, Py_FileSystemDefaultEncoding, "utf8b"); Py_DECREF(u); if (stringobj == NULL) return -1; diff --git a/Modules/posixmodule.c b/Modules/posixmodule.c index 0575be2..d38a4db 100644 --- a/Modules/posixmodule.c +++ b/Modules/posixmodule.c @@ -493,12 +493,14 @@ convertenviron(void) char *p = strchr(*e, '='); if (p == NULL) continue; - k = PyUnicode_FromStringAndSize(*e, (int)(p-*e)); + k = PyUnicode_Decode(*e, (int)(p-*e), + Py_FileSystemDefaultEncoding, "utf8b"); if (k == NULL) { PyErr_Clear(); continue; } - v = PyUnicode_FromString(p+1); + v = PyUnicode_Decode(p+1, strlen(p+1), + Py_FileSystemDefaultEncoding, "utf8b"); if (v == NULL) { PyErr_Clear(); Py_DECREF(k); @@ -534,6 +536,37 @@ convertenviron(void) return d; } +/* Convert a bytes object to a char*. Optionally lock the buffer if it is a + bytes array. */ + +static char* +bytes2str(PyObject* o, int lock) +{ + if(PyBytes_Check(o)) + return PyBytes_AsString(o); + else if(PyByteArray_Check(o)) { + if (lock && PyObject_GetBuffer(o, NULL, 0) < 0) + /* On a bytearray, this should not fail. */ + PyErr_BadInternalCall(); + return PyByteArray_AsString(o); + } else { + /* The FS converter should have verified that this + is either bytes or bytearray. */ + Py_FatalError("bad object passed to bytes2str"); + /* not reached. */ + return ""; + } +} + +/* Release the lock, decref the object. 
*/ +static void +release_bytes(PyObject* o) +{ + if (PyByteArray_Check(o)) + o->ob_type->tp_as_buffer->bf_releasebuffer(o, 0); + Py_DECREF(o); +} + /* Set a POSIX-specific error from errno, and return NULL */ @@ -558,10 +591,11 @@ posix_error_with_unicode_filename(Py_UNICODE* name) static PyObject * -posix_error_with_allocated_filename(char* name) +posix_error_with_allocated_filename(PyObject* name) { - PyObject *rc = PyErr_SetFromErrnoWithFilename(PyExc_OSError, name); - PyMem_Free(name); + PyObject *rc = PyErr_SetFromErrnoWithFilename(PyExc_OSError, + bytes2str(name, 0)); + release_bytes(name); return rc; } @@ -728,17 +762,19 @@ unicode_file_names(void) static PyObject * posix_1str(PyObject *args, char *format, int (*func)(const char*)) { - char *path1 = NULL; + PyObject *opath1 = NULL; + char *path1; int res; if (!PyArg_ParseTuple(args, format, - Py_FileSystemDefaultEncoding, &path1)) + PyUnicode_FSConverter, &opath1)) return NULL; + path1 = bytes2str(opath1, 1); Py_BEGIN_ALLOW_THREADS res = (*func)(path1); Py_END_ALLOW_THREADS if (res < 0) - return posix_error_with_allocated_filename(path1); - PyMem_Free(path1); + return posix_error_with_allocated_filename(opath1); + release_bytes(opath1); Py_INCREF(Py_None); return Py_None; } @@ -748,17 +784,20 @@ posix_2str(PyObject *args, char *format, int (*func)(const char *, const char *)) { - char *path1 = NULL, *path2 = NULL; + PyObject *opath1, *opath2; + char *path1, *path2; int res; if (!PyArg_ParseTuple(args, format, - Py_FileSystemDefaultEncoding, &path1, - Py_FileSystemDefaultEncoding, &path2)) + PyUnicode_FSConverter, &opath1, + PyUnicode_FSConverter, &opath2)) return NULL; + path1 = bytes2str(opath1, 1); + path2 = bytes2str(opath2, 1); Py_BEGIN_ALLOW_THREADS res = (*func)(path1, path2); Py_END_ALLOW_THREADS - PyMem_Free(path1); - PyMem_Free(path2); + release_bytes(opath1); + release_bytes(opath2); if (res != 0) /* XXX how to report both path1 and path2??? 
*/ return posix_error(); @@ -1560,8 +1599,8 @@ posix_do_stat(PyObject *self, PyObject *args, int (*wstatfunc)(const Py_UNICODE *, STRUCT_STAT *)) { STRUCT_STAT st; - char *path = NULL; /* pass this to stat; do not free() it */ - char *pathfree = NULL; /* this memory must be free'd */ + PyObject *opath; + char *path; int res; PyObject *result; @@ -1590,25 +1629,24 @@ posix_do_stat(PyObject *self, PyObject *args, #endif if (!PyArg_ParseTuple(args, format, - Py_FileSystemDefaultEncoding, &path)) + PyUnicode_FSConverter, &opath)) return NULL; - pathfree = path; - + path = bytes2str(opath, 1); Py_BEGIN_ALLOW_THREADS res = (*statfunc)(path, &st); Py_END_ALLOW_THREADS if (res != 0) { #ifdef MS_WINDOWS - result = win32_error("stat", pathfree); + result = win32_error("stat", path); #else - result = posix_error_with_filename(pathfree); + result = posix_error_with_filename(path); #endif } else result = _pystat_fromstructstat(&st); - PyMem_Free(pathfree); + release_bytes(opath); return result; } @@ -1625,6 +1663,7 @@ existence, or the inclusive-OR of R_OK, W_OK, and X_OK."); static PyObject * posix_access(PyObject *self, PyObject *args) { + PyObject *opath; char *path; int mode; @@ -1644,13 +1683,14 @@ posix_access(PyObject *self, PyObject *args) are also valid. 
*/ PyErr_Clear(); } - if (!PyArg_ParseTuple(args, "eti:access", - Py_FileSystemDefaultEncoding, &path, &mode)) + if (!PyArg_ParseTuple(args, "O&i:access", + PyUnicode_FSConverter, &opath, &mode)) return 0; + path = bytes2str(opath, 1); Py_BEGIN_ALLOW_THREADS attr = GetFileAttributesA(path); Py_END_ALLOW_THREADS - PyMem_Free(path); + release_bytes(opath); finish: if (attr == 0xFFFFFFFF) /* File does not exist, or cannot read attributes */ @@ -1663,13 +1703,14 @@ finish: || (attr & FILE_ATTRIBUTE_DIRECTORY)); #else int res; - if (!PyArg_ParseTuple(args, "eti:access", - Py_FileSystemDefaultEncoding, &path, &mode)) + if (!PyArg_ParseTuple(args, "O&i:access", + PyUnicode_FSConverter, &opath, &mode)) return NULL; + path = bytes2str(opath, 1); Py_BEGIN_ALLOW_THREADS res = access(path, mode); Py_END_ALLOW_THREADS - PyMem_Free(path); + release_bytes(opath); return PyBool_FromLong(res == 0); #endif } @@ -1750,11 +1791,11 @@ posix_chdir(PyObject *self, PyObject *args) #ifdef MS_WINDOWS return win32_1str(args, "chdir", "y:chdir", win32_chdir, "U:chdir", win32_wchdir); #elif defined(PYOS_OS2) && defined(PYCC_GCC) - return posix_1str(args, "et:chdir", _chdir2); + return posix_1str(args, "O&:chdir", _chdir2); #elif defined(__VMS) - return posix_1str(args, "et:chdir", (int (*)(const char *))chdir); + return posix_1str(args, "O&:chdir", (int (*)(const char *))chdir); #else - return posix_1str(args, "et:chdir", chdir); + return posix_1str(args, "O&:chdir", chdir); #endif } @@ -1779,6 +1820,7 @@ Change the access permissions of a file."); static PyObject * posix_chmod(PyObject *self, PyObject *args) { + PyObject *opath = NULL; char *path = NULL; int i; int res; @@ -1809,9 +1851,10 @@ posix_chmod(PyObject *self, PyObject *args) are also valid. 
*/ PyErr_Clear(); } - if (!PyArg_ParseTuple(args, "eti:chmod", Py_FileSystemDefaultEncoding, - &path, &i)) + if (!PyArg_ParseTuple(args, "O&i:chmod", PyUnicode_FSConverter, + &opath, &i)) return NULL; + path = bytes2str(opath, 1); Py_BEGIN_ALLOW_THREADS attr = GetFileAttributesA(path); if (attr != 0xFFFFFFFF) { @@ -1826,22 +1869,23 @@ posix_chmod(PyObject *self, PyObject *args) Py_END_ALLOW_THREADS if (!res) { win32_error("chmod", path); - PyMem_Free(path); + release_bytes(opath); return NULL; } - PyMem_Free(path); + release_bytes(opath); Py_INCREF(Py_None); return Py_None; #else /* Py_WIN_WIDE_FILENAMES */ - if (!PyArg_ParseTuple(args, "eti:chmod", Py_FileSystemDefaultEncoding, - &path, &i)) + if (!PyArg_ParseTuple(args, "O&i:chmod", PyUnicode_FSConverter, + &opath, &i)) return NULL; + path = bytes2str(opath, 1); Py_BEGIN_ALLOW_THREADS res = chmod(path, i); Py_END_ALLOW_THREADS if (res < 0) - return posix_error_with_allocated_filename(path); - PyMem_Free(path); + return posix_error_with_allocated_filename(opath); + release_bytes(opath); Py_INCREF(Py_None); return Py_None; #endif @@ -1877,18 +1921,20 @@ affects the link itself rather than the target."); static PyObject * posix_lchmod(PyObject *self, PyObject *args) { - char *path = NULL; + PyObject *opath; + char *path; int i; int res; - if (!PyArg_ParseTuple(args, "eti:lchmod", Py_FileSystemDefaultEncoding, - &path, &i)) + if (!PyArg_ParseTuple(args, "O&i:lchmod", PyUnicode_FSConverter, + &opath, &i)) return NULL; + path = bytes2str(opath, 1); Py_BEGIN_ALLOW_THREADS res = lchmod(path, i); Py_END_ALLOW_THREADS if (res < 0) - return posix_error_with_allocated_filename(path); - PyMem_Free(path); + return posix_error_with_allocated_filename(opath); + release_bytes(opath); Py_RETURN_NONE; } #endif /* HAVE_LCHMOD */ @@ -1902,18 +1948,20 @@ Set file flags."); static PyObject * posix_chflags(PyObject *self, PyObject *args) { + PyObject *opath; char *path; unsigned long flags; int res; - if (!PyArg_ParseTuple(args, 
"etk:chflags", - Py_FileSystemDefaultEncoding, &path, &flags)) + if (!PyArg_ParseTuple(args, "O&k:chflags", + PyUnicode_FSConverter, &opath, &flags)) return NULL; + path = bytes2str(opath, 1); Py_BEGIN_ALLOW_THREADS res = chflags(path, flags); Py_END_ALLOW_THREADS if (res < 0) - return posix_error_with_allocated_filename(path); - PyMem_Free(path); + return posix_error_with_allocated_filename(opath); + release_bytes(opath); Py_INCREF(Py_None); return Py_None; } @@ -1928,18 +1976,20 @@ This function will not follow symbolic links."); static PyObject * posix_lchflags(PyObject *self, PyObject *args) { + PyObject *opath; char *path; unsigned long flags; int res; - if (!PyArg_ParseTuple(args, "etk:lchflags", - Py_FileSystemDefaultEncoding, &path, &flags)) + if (!PyArg_ParseTuple(args, "O&k:lchflags", + PyUnicode_FSConverter, &opath, &flags)) return NULL; + path = bytes2str(opath, 1); Py_BEGIN_ALLOW_THREADS res = lchflags(path, flags); Py_END_ALLOW_THREADS if (res < 0) - return posix_error_with_allocated_filename(path); - PyMem_Free(path); + return posix_error_with_allocated_filename(opath); + release_bytes(opath); Py_INCREF(Py_None); return Py_None; } @@ -1953,7 +2003,7 @@ Change root directory to path."); static PyObject * posix_chroot(PyObject *self, PyObject *args) { - return posix_1str(args, "et:chroot", chroot); + return posix_1str(args, "O&:chroot", chroot); } #endif @@ -1996,19 +2046,21 @@ Change the owner and group id of path to the numeric uid and gid."); static PyObject * posix_chown(PyObject *self, PyObject *args) { - char *path = NULL; + PyObject *opath; + char *path; long uid, gid; int res; - if (!PyArg_ParseTuple(args, "etll:chown", - Py_FileSystemDefaultEncoding, &path, + if (!PyArg_ParseTuple(args, "O&ll:chown", + PyUnicode_FSConverter, &opath, &uid, &gid)) return NULL; + path = bytes2str(opath, 1); Py_BEGIN_ALLOW_THREADS res = chown(path, (uid_t) uid, (gid_t) gid); Py_END_ALLOW_THREADS if (res < 0) - return posix_error_with_allocated_filename(path); - 
PyMem_Free(path); + return posix_error_with_allocated_filename(opath); + release_bytes(opath); Py_INCREF(Py_None); return Py_None; } @@ -2045,19 +2097,21 @@ This function will not follow symbolic links."); static PyObject * posix_lchown(PyObject *self, PyObject *args) { - char *path = NULL; + PyObject *opath; + char *path; int uid, gid; int res; - if (!PyArg_ParseTuple(args, "etii:lchown", - Py_FileSystemDefaultEncoding, &path, + if (!PyArg_ParseTuple(args, "O&ii:lchown", + PyUnicode_FSConverter, &opath, &uid, &gid)) return NULL; + path = bytes2str(opath, 1); Py_BEGIN_ALLOW_THREADS res = lchown(path, (uid_t) uid, (gid_t) gid); Py_END_ALLOW_THREADS if (res < 0) - return posix_error_with_allocated_filename(path); - PyMem_Free(path); + return posix_error_with_allocated_filename(opath); + release_bytes(opath); Py_INCREF(Py_None); return Py_None; } @@ -2113,7 +2167,7 @@ posix_getcwd(int use_bytes) return posix_error(); if (use_bytes) return PyBytes_FromStringAndSize(buf, strlen(buf)); - return PyUnicode_Decode(buf, strlen(buf), Py_FileSystemDefaultEncoding,"strict"); + return PyUnicode_Decode(buf, strlen(buf), Py_FileSystemDefaultEncoding,"utf8b"); } PyDoc_STRVAR(posix_getcwd__doc__, @@ -2146,7 +2200,7 @@ Create a hard link to a file."); static PyObject * posix_link(PyObject *self, PyObject *args) { - return posix_2str(args, "etet:link", link); + return posix_2str(args, "O&O&:link", link); } #endif /* HAVE_LINK */ @@ -2171,6 +2225,7 @@ posix_listdir(PyObject *self, PyObject *args) HANDLE hFindFile; BOOL result; WIN32_FIND_DATA FileData; + PyObject *opath; char namebuf[MAX_PATH+5]; /* Overallocate for \\*.*\0 */ char *bufptr = namebuf; Py_ssize_t len = sizeof(namebuf)-5; /* only claim to have space for MAX_PATH */ @@ -2260,9 +2315,16 @@ posix_listdir(PyObject *self, PyObject *args) } #endif - if (!PyArg_ParseTuple(args, "et#:listdir", - Py_FileSystemDefaultEncoding, &bufptr, &len)) + if (!PyArg_ParseTuple(args, "O&:listdir", + PyUnicode_FSConverter, &opath)) + return 
NULL; + if (PyObject_Size(opath)+1 > MAX_PATH) { + PyErr_SetString(PyExc_ValueError, "path too long"); + Py_DECREF(opath); return NULL; + } + strcpy(namebuf, bytes2str(opath, 0)); + len = PyObject_Size(opath); if (len > 0) { char ch = namebuf[len-1]; if (ch != SEP && ch != ALTSEP && ch != ':') @@ -2324,6 +2386,7 @@ posix_listdir(PyObject *self, PyObject *args) #ifndef MAX_PATH #define MAX_PATH CCHMAXPATH #endif + PyObject *oname; char *name, *pt; Py_ssize_t len; PyObject *d, *v; @@ -2333,11 +2396,13 @@ posix_listdir(PyObject *self, PyObject *args) FILEFINDBUF3 ep; APIRET rc; - if (!PyArg_ParseTuple(args, "et#:listdir", - Py_FileSystemDefaultEncoding, &name, &len)) + if (!PyArg_ParseTuple(args, "O&:listdir", + PyUnicode_FSConverter, &oname)) return NULL; + name = bytes2str(oname, 1); + len = PyObject_Size(oname); if (len >= MAX_PATH) { - PyMem_Free(name); + release_bytes(oname); PyErr_SetString(PyExc_ValueError, "path too long"); return NULL; } @@ -2350,7 +2415,7 @@ posix_listdir(PyObject *self, PyObject *args) strcpy(namebuf + len, "*.*"); if ((d = PyList_New(0)) == NULL) { - PyMem_Free(name); + release_bytes(oname); return NULL; } @@ -2363,7 +2428,7 @@ posix_listdir(PyObject *self, PyObject *args) if (rc != NO_ERROR) { errno = ENOENT; - return posix_error_with_allocated_filename(name); + return posix_error_with_allocated_filename(oname); } if (srchcnt > 0) { /* If Directory is NOT Totally Empty, */ @@ -2393,11 +2458,11 @@ posix_listdir(PyObject *self, PyObject *args) } while (DosFindNext(hdir, &ep, sizeof(ep), &srchcnt) == NO_ERROR && srchcnt > 0); } - PyMem_Free(name); + release_bytes(oname); return d; #else - - char *name = NULL; + PyObject *oname; + char *name; PyObject *d, *v; DIR *dirp; struct dirent *ep; @@ -2408,14 +2473,15 @@ posix_listdir(PyObject *self, PyObject *args) arg_is_unicode = 0; PyErr_Clear(); } - if (!PyArg_ParseTuple(args, "et:listdir", Py_FileSystemDefaultEncoding, &name)) + if (!PyArg_ParseTuple(args, "O&:listdir", PyUnicode_FSConverter, 
&oname)) return NULL; + name = bytes2str(oname, 1); if ((dirp = opendir(name)) == NULL) { - return posix_error_with_allocated_filename(name); + return posix_error_with_allocated_filename(oname); } if ((d = PyList_New(0)) == NULL) { closedir(dirp); - PyMem_Free(name); + release_bytes(oname); return NULL; } for (;;) { @@ -2429,7 +2495,7 @@ posix_listdir(PyObject *self, PyObject *args) } else { closedir(dirp); Py_DECREF(d); - return posix_error_with_allocated_filename(name); + return posix_error_with_allocated_filename(oname); } } if (ep->d_name[0] == '.' && @@ -2447,18 +2513,16 @@ posix_listdir(PyObject *self, PyObject *args) w = PyUnicode_FromEncodedObject(v, Py_FileSystemDefaultEncoding, - "strict"); - if (w != NULL) { - Py_DECREF(v); + "utf8b"); + Py_DECREF(v); + if (w != NULL) v = w; - } else { - /* Ignore undecodable filenames, as discussed - * in issue 3187. To include these, - * use getcwdb(). */ - PyErr_Clear(); - Py_DECREF(v); - continue; + /* Encoding failed to decode ASCII bytes. + Raise exception. 
*/ + Py_DECREF(d); + d = NULL; + break; } } if (PyList_Append(d, v) != 0) { @@ -2470,7 +2534,7 @@ posix_listdir(PyObject *self, PyObject *args) Py_DECREF(v); } closedir(dirp); - PyMem_Free(name); + release_bytes(oname); return d; @@ -2482,10 +2546,8 @@ posix_listdir(PyObject *self, PyObject *args) static PyObject * posix__getfullpathname(PyObject *self, PyObject *args) { - /* assume encoded strings won't more than double no of chars */ - char inbuf[MAX_PATH*2]; - char *inbufp = inbuf; - Py_ssize_t insize = sizeof(inbuf); + PyObject *opath; + char *path; char outbuf[MAX_PATH*2]; char *temp; #ifdef Py_WIN_WIDE_FILENAMES @@ -2519,13 +2581,17 @@ posix__getfullpathname(PyObject *self, PyObject *args) PyErr_Clear(); } #endif - if (!PyArg_ParseTuple (args, "et#:_getfullpathname", - Py_FileSystemDefaultEncoding, &inbufp, - &insize)) + if (!PyArg_ParseTuple (args, "O&:_getfullpathname", + PyUnicode_FSConverter, &opath)) return NULL; - if (!GetFullPathName(inbuf, sizeof(outbuf)/sizeof(outbuf[0]), - outbuf, &temp)) - return win32_error("GetFullPathName", inbuf); + path = bytes2str(opath, 1); + if (!GetFullPathName(path, sizeof(outbuf)/sizeof(outbuf[0]), + outbuf, &temp)) { + win32_error("GetFullPathName", path); + release_bytes(opath); + return NULL; + } + release_bytes(opath); if (PyUnicode_Check(PyTuple_GetItem(args, 0))) { return PyUnicode_Decode(outbuf, strlen(outbuf), Py_FileSystemDefaultEncoding, NULL); @@ -2542,7 +2608,8 @@ static PyObject * posix_mkdir(PyObject *self, PyObject *args) { int res; - char *path = NULL; + PyObject *opath; + char *path; int mode = 0777; #ifdef Py_WIN_WIDE_FILENAMES @@ -2563,9 +2630,10 @@ posix_mkdir(PyObject *self, PyObject *args) are also valid. 
*/ PyErr_Clear(); } - if (!PyArg_ParseTuple(args, "et|i:mkdir", - Py_FileSystemDefaultEncoding, &path, &mode)) + if (!PyArg_ParseTuple(args, "O&|i:mkdir", + PyUnicode_FSConverter, &opath, &mode)) return NULL; + path = bytes2str(opath, 1); Py_BEGIN_ALLOW_THREADS /* PyUnicode_AS_UNICODE OK without thread lock as it is a simple dereference. */ @@ -2573,17 +2641,18 @@ posix_mkdir(PyObject *self, PyObject *args) Py_END_ALLOW_THREADS if (!res) { win32_error("mkdir", path); - PyMem_Free(path); + release_bytes(opath); return NULL; } - PyMem_Free(path); + release_bytes(opath); Py_INCREF(Py_None); return Py_None; #else - if (!PyArg_ParseTuple(args, "et|i:mkdir", - Py_FileSystemDefaultEncoding, &path, &mode)) + if (!PyArg_ParseTuple(args, "O&|i:mkdir", + PyUnicode_FSConverter, &opath, &mode)) return NULL; + path = bytes2str(opath, 1); Py_BEGIN_ALLOW_THREADS #if ( defined(__WATCOMC__) || defined(PYCC_VACPP) ) && !defined(__QNX__) res = mkdir(path); @@ -2592,8 +2661,8 @@ posix_mkdir(PyObject *self, PyObject *args) #endif Py_END_ALLOW_THREADS if (res < 0) - return posix_error_with_allocated_filename(path); - PyMem_Free(path); + return posix_error_with_allocated_filename(opath); + release_bytes(opath); Py_INCREF(Py_None); return Py_None; #endif @@ -2685,7 +2754,7 @@ error: Py_INCREF(Py_None); return Py_None; #else - return posix_2str(args, "etet:rename", rename); + return posix_2str(args, "O&O&:rename", rename); #endif } @@ -2700,7 +2769,7 @@ posix_rmdir(PyObject *self, PyObject *args) #ifdef MS_WINDOWS return win32_1str(args, "rmdir", "y:rmdir", RemoveDirectoryA, "U:rmdir", RemoveDirectoryW); #else - return posix_1str(args, "et:rmdir", rmdir); + return posix_1str(args, "O&:rmdir", rmdir); #endif } @@ -2713,9 +2782,9 @@ static PyObject * posix_stat(PyObject *self, PyObject *args) { #ifdef MS_WINDOWS - return posix_do_stat(self, args, "et:stat", STAT, "U:stat", win32_wstat); + return posix_do_stat(self, args, "O&:stat", STAT, "U:stat", win32_wstat); #else - return 
posix_do_stat(self, args, "et:stat", STAT, NULL, NULL); + return posix_do_stat(self, args, "O&:stat", STAT, NULL, NULL); #endif } @@ -2781,7 +2850,7 @@ posix_unlink(PyObject *self, PyObject *args) #ifdef MS_WINDOWS return win32_1str(args, "remove", "y:remove", DeleteFileA, "U:remove", DeleteFileW); #else - return posix_1str(args, "et:remove", unlink); + return posix_1str(args, "O&:remove", unlink); #endif } @@ -2853,7 +2922,8 @@ posix_utime(PyObject *self, PyObject *args) PyObject *arg; PyUnicodeObject *obwpath; wchar_t *wpath = NULL; - char *apath = NULL; + PyObject *oapath; + char *apath; HANDLE hFile; long atimesec, mtimesec, ausec, musec; FILETIME atime, mtime; @@ -2875,9 +2945,10 @@ posix_utime(PyObject *self, PyObject *args) PyErr_Clear(); } if (!wpath) { - if (!PyArg_ParseTuple(args, "etO:utime", - Py_FileSystemDefaultEncoding, &apath, &arg)) + if (!PyArg_ParseTuple(args, "O&O:utime", + PyUnicode_FSConverter, &oapath, &arg)) return NULL; + apath = bytes2str(oapath, 1); Py_BEGIN_ALLOW_THREADS hFile = CreateFileA(apath, FILE_WRITE_ATTRIBUTES, 0, NULL, OPEN_EXISTING, @@ -2885,10 +2956,10 @@ posix_utime(PyObject *self, PyObject *args) Py_END_ALLOW_THREADS if (hFile == INVALID_HANDLE_VALUE) { win32_error("utime", apath); - PyMem_Free(apath); + release_bytes(oapath); return NULL; } - PyMem_Free(apath); + release_bytes(oapath); } if (arg == Py_None) { @@ -2929,7 +3000,8 @@ done: return result; #else /* Py_WIN_WIDE_FILENAMES */ - char *path = NULL; + PyObject *opath; + char *path; long atime, mtime, ausec, musec; int res; PyObject* arg; @@ -2952,9 +3024,10 @@ done: #endif /* HAVE_UTIMES */ - if (!PyArg_ParseTuple(args, "etO:utime", - Py_FileSystemDefaultEncoding, &path, &arg)) + if (!PyArg_ParseTuple(args, "O&O:utime", + PyUnicode_FSConverter, &opath, &arg)) return NULL; + path = bytes2str(opath, 1); if (arg == Py_None) { /* optional time values not given */ Py_BEGIN_ALLOW_THREADS @@ -2964,18 +3037,18 @@ done: else if (!PyTuple_Check(arg) || PyTuple_Size(arg) != 2) 
{ PyErr_SetString(PyExc_TypeError, "utime() arg 2 must be a tuple (atime, mtime)"); - PyMem_Free(path); + release_bytes(opath); return NULL; } else { if (extract_time(PyTuple_GET_ITEM(arg, 0), &atime, &ausec) == -1) { - PyMem_Free(path); + release_bytes(opath); return NULL; } if (extract_time(PyTuple_GET_ITEM(arg, 1), &mtime, &musec) == -1) { - PyMem_Free(path); + release_bytes(opath); return NULL; } ATIME = atime; @@ -2993,9 +3066,9 @@ done: #endif /* HAVE_UTIMES */ } if (res < 0) { - return posix_error_with_allocated_filename(path); + return posix_error_with_allocated_filename(opath); } - PyMem_Free(path); + release_bytes(opath); Py_INCREF(Py_None); return Py_None; #undef UTIME_ARG @@ -3030,6 +3103,22 @@ free_string_array(char **array, Py_ssize_t count) PyMem_Free(array[i]); PyMem_DEL(array); } + +int fsconvert_strdup(PyObject *o, char**out) +{ + PyObject *bytes; + Py_ssize_t size; + if (!PyUnicode_FSConverter(o, &bytes)) + return 0; + size = PyObject_Size(bytes); + *out = PyMem_Malloc(size+1); + if (!*out) + return 0; + /* Don't lock bytes, as we hold the GIL */ + memcpy(*out, bytes2str(bytes, 0), size+1); + Py_DECREF(bytes); + return 1; +} #endif @@ -3044,6 +3133,7 @@ Execute an executable path with arguments, replacing current process.\n\ static PyObject * posix_execv(PyObject *self, PyObject *args) { + PyObject *opath; char *path; PyObject *argv; char **argvlist; @@ -3053,10 +3143,11 @@ posix_execv(PyObject *self, PyObject *args) /* execv has two arguments: (path, argv), where argv is a list or tuple of strings. 
*/ - if (!PyArg_ParseTuple(args, "etO:execv", - Py_FileSystemDefaultEncoding, - &path, &argv)) + if (!PyArg_ParseTuple(args, "O&O:execv", + PyUnicode_FSConverter, + &opath, &argv)) return NULL; + path = bytes2str(opath, 1); if (PyList_Check(argv)) { argc = PyList_Size(argv); getitem = PyList_GetItem; @@ -3067,28 +3158,27 @@ posix_execv(PyObject *self, PyObject *args) } else { PyErr_SetString(PyExc_TypeError, "execv() arg 2 must be a tuple or list"); - PyMem_Free(path); + release_bytes(opath); return NULL; } if (argc < 1) { PyErr_SetString(PyExc_ValueError, "execv() arg 2 must not be empty"); - PyMem_Free(path); + release_bytes(opath); return NULL; } argvlist = PyMem_NEW(char *, argc+1); if (argvlist == NULL) { - PyMem_Free(path); + release_bytes(opath); return PyErr_NoMemory(); } for (i = 0; i < argc; i++) { - if (!PyArg_Parse((*getitem)(argv, i), "et", - Py_FileSystemDefaultEncoding, - &argvlist[i])) { + if (!fsconvert_strdup((*getitem)(argv, i), + &argvlist[i])) { free_string_array(argvlist, i); PyErr_SetString(PyExc_TypeError, "execv() arg 2 must contain only strings"); - PyMem_Free(path); + release_bytes(opath); return NULL; } @@ -3100,7 +3190,7 @@ posix_execv(PyObject *self, PyObject *args) /* If we get here it's definitely an error */ free_string_array(argvlist, argc); - PyMem_Free(path); + release_bytes(opath); return posix_error(); } @@ -3116,6 +3206,7 @@ Execute a path with arguments and environment, replacing current process.\n\ static PyObject * posix_execve(PyObject *self, PyObject *args) { + PyObject *opath; char *path; PyObject *argv, *env; char **argvlist; @@ -3129,10 +3220,11 @@ posix_execve(PyObject *self, PyObject *args) argv is a list or tuple of strings and env is a dictionary like posix.environ. 
*/ - if (!PyArg_ParseTuple(args, "etOO:execve", - Py_FileSystemDefaultEncoding, - &path, &argv, &env)) + if (!PyArg_ParseTuple(args, "O&OO:execve", + PyUnicode_FSConverter, + &opath, &argv, &env)) return NULL; + path = bytes2str(opath, 1); if (PyList_Check(argv)) { argc = PyList_Size(argv); getitem = PyList_GetItem; @@ -3158,10 +3250,8 @@ posix_execve(PyObject *self, PyObject *args) goto fail_0; } for (i = 0; i < argc; i++) { - if (!PyArg_Parse((*getitem)(argv, i), - "et;execve() arg 2 must contain only strings", - Py_FileSystemDefaultEncoding, - &argvlist[i])) + if (!fsconvert_strdup((*getitem)(argv, i), + &argvlist[i])) { lastarg = i; goto fail_1; @@ -3243,7 +3333,7 @@ posix_execve(PyObject *self, PyObject *args) Py_XDECREF(vals); Py_XDECREF(keys); fail_0: - PyMem_Free(path); + release_bytes(opath); return NULL; } #endif /* HAVE_EXECV */ @@ -3261,6 +3351,7 @@ Execute the program 'path' in a new process.\n\ static PyObject * posix_spawnv(PyObject *self, PyObject *args) { + PyObject *opath; char *path; PyObject *argv; char **argvlist; @@ -3272,10 +3363,11 @@ posix_spawnv(PyObject *self, PyObject *args) /* spawnv has three arguments: (mode, path, argv), where argv is a list or tuple of strings. 
*/ - if (!PyArg_ParseTuple(args, "ietO:spawnv", &mode, - Py_FileSystemDefaultEncoding, - &path, &argv)) + if (!PyArg_ParseTuple(args, "iO&O:spawnv", &mode, + PyUnicode_FSConverter, + &opath, &argv)) return NULL; + path = bytes2str(opath, 1); if (PyList_Check(argv)) { argc = PyList_Size(argv); getitem = PyList_GetItem; @@ -3287,24 +3379,23 @@ posix_spawnv(PyObject *self, PyObject *args) else { PyErr_SetString(PyExc_TypeError, "spawnv() arg 2 must be a tuple or list"); - PyMem_Free(path); + release_bytes(opath); return NULL; } argvlist = PyMem_NEW(char *, argc+1); if (argvlist == NULL) { - PyMem_Free(path); + release_bytes(opath); return PyErr_NoMemory(); } for (i = 0; i < argc; i++) { - if (!PyArg_Parse((*getitem)(argv, i), "et", - Py_FileSystemDefaultEncoding, - &argvlist[i])) { + if (!fsconvert_strdup((*getitem)(argv, i), + &argvlist[i])) { free_string_array(argvlist, i); PyErr_SetString( PyExc_TypeError, "spawnv() arg 2 must contain only strings"); - PyMem_Free(path); + release_bytes(opath); return NULL; } } @@ -3324,7 +3415,7 @@ posix_spawnv(PyObject *self, PyObject *args) #endif free_string_array(argvlist, argc); - PyMem_Free(path); + release_bytes(opath); if (spawnval == -1) return posix_error(); @@ -3349,6 +3440,7 @@ Execute the program 'path' in a new process.\n\ static PyObject * posix_spawnve(PyObject *self, PyObject *args) { + PyObject *opath; char *path; PyObject *argv, *env; char **argvlist; @@ -3364,10 +3456,11 @@ posix_spawnve(PyObject *self, PyObject *args) argv is a list or tuple of strings and env is a dictionary like posix.environ. 
*/ - if (!PyArg_ParseTuple(args, "ietOO:spawnve", &mode, - Py_FileSystemDefaultEncoding, - &path, &argv, &env)) + if (!PyArg_ParseTuple(args, "iO&OO:spawnve", &mode, + PyUnicode_FSConverter, + &opath, &argv, &env)) return NULL; + path = bytes2str(opath, 1); if (PyList_Check(argv)) { argc = PyList_Size(argv); getitem = PyList_GetItem; @@ -3393,10 +3486,8 @@ posix_spawnve(PyObject *self, PyObject *args) goto fail_0; } for (i = 0; i < argc; i++) { - if (!PyArg_Parse((*getitem)(argv, i), - "et;spawnve() arg 2 must contain only strings", - Py_FileSystemDefaultEncoding, - &argvlist[i])) + if (!fsconvert_strdup((*getitem)(argv, i), + &argvlist[i])) { lastarg = i; goto fail_1; @@ -3486,7 +3577,7 @@ posix_spawnve(PyObject *self, PyObject *args) Py_XDECREF(vals); Py_XDECREF(keys); fail_0: - PyMem_Free(path); + release_bytes(opath); return res; } @@ -3504,6 +3595,7 @@ search path to find the file.\n\ static PyObject * posix_spawnvp(PyObject *self, PyObject *args) { + PyObject *opath; char *path; PyObject *argv; char **argvlist; @@ -3514,10 +3606,11 @@ posix_spawnvp(PyObject *self, PyObject *args) /* spawnvp has three arguments: (mode, path, argv), where argv is a list or tuple of strings. 
*/ - if (!PyArg_ParseTuple(args, "ietO:spawnvp", &mode, - Py_FileSystemDefaultEncoding, - &path, &argv)) + if (!PyArg_ParseTuple(args, "iO&O:spawnvp", &mode, + PyUnicode_FSConverter, + &opath, &argv)) return NULL; + path = bytes2str(opath, 1); if (PyList_Check(argv)) { argc = PyList_Size(argv); getitem = PyList_GetItem; @@ -3529,24 +3622,23 @@ posix_spawnvp(PyObject *self, PyObject *args) else { PyErr_SetString(PyExc_TypeError, "spawnvp() arg 2 must be a tuple or list"); - PyMem_Free(path); + release_bytes(opath); return NULL; } argvlist = PyMem_NEW(char *, argc+1); if (argvlist == NULL) { - PyMem_Free(path); + release_bytes(opath); return PyErr_NoMemory(); } for (i = 0; i < argc; i++) { - if (!PyArg_Parse((*getitem)(argv, i), "et", - Py_FileSystemDefaultEncoding, - &argvlist[i])) { + if (!fsconvert_strdup((*getitem)(argv, i), + &argvlist[i])) { free_string_array(argvlist, i); PyErr_SetString( PyExc_TypeError, "spawnvp() arg 2 must contain only strings"); - PyMem_Free(path); + release_bytes(opath); return NULL; } } @@ -3561,7 +3653,7 @@ posix_spawnvp(PyObject *self, PyObject *args) Py_END_ALLOW_THREADS free_string_array(argvlist, argc); - PyMem_Free(path); + release_bytes(opath); if (spawnval == -1) return posix_error(); @@ -3583,6 +3675,7 @@ search path to find the file.\n\ static PyObject * posix_spawnvpe(PyObject *self, PyObject *args) { + PyObject *opath; char *path; PyObject *argv, *env; char **argvlist; @@ -3598,9 +3691,10 @@ posix_spawnvpe(PyObject *self, PyObject *args) like posix.environ. 
*/ - if (!PyArg_ParseTuple(args, "ietOO:spawnvpe", &mode, - Py_FileSystemDefaultEncoding, - &path, &argv, &env)) + if (!PyArg_ParseTuple(args, "iO&OO:spawnvpe", &mode, + PyUnicode_FSConverter, + &opath, &argv, &env)) return NULL; + path = bytes2str(opath, 1); if (PyList_Check(argv)) { argc = PyList_Size(argv); getitem = PyList_GetItem; @@ -3626,10 +3720,8 @@ posix_spawnvpe(PyObject *self, PyObject *args) goto fail_0; } for (i = 0; i < argc; i++) { - if (!PyArg_Parse((*getitem)(argv, i), - "et;spawnvpe() arg 2 must contain only strings", - Py_FileSystemDefaultEncoding, - &argvlist[i])) + if (!fsconvert_strdup((*getitem)(argv, i), + &argvlist[i])) { lastarg = i; goto fail_1; @@ -3710,7 +3802,7 @@ posix_spawnvpe(PyObject *self, PyObject *args) Py_XDECREF(vals); Py_XDECREF(keys); fail_0: - PyMem_Free(path); + release_bytes(opath); return res; } #endif /* PYOS_OS2 */ @@ -4549,12 +4641,12 @@ static PyObject * posix_lstat(PyObject *self, PyObject *args) { #ifdef HAVE_LSTAT - return posix_do_stat(self, args, "et:lstat", lstat, NULL, NULL); + return posix_do_stat(self, args, "O&:lstat", lstat, NULL, NULL); #else /* !HAVE_LSTAT */ #ifdef MS_WINDOWS - return posix_do_stat(self, args, "et:lstat", STAT, "U:lstat", win32_wstat); + return posix_do_stat(self, args, "O&:lstat", STAT, "U:lstat", win32_wstat); #else - return posix_do_stat(self, args, "et:lstat", STAT, NULL, NULL); + return posix_do_stat(self, args, "O&:lstat", STAT, NULL, NULL); #endif #endif /* !HAVE_LSTAT */ } @@ -4570,16 +4662,18 @@ posix_readlink(PyObject *self, PyObject *args) { PyObject* v; char buf[MAXPATHLEN]; + PyObject *opath; char *path; int n; int arg_is_unicode = 0; - if (!PyArg_ParseTuple(args, "et:readlink", - Py_FileSystemDefaultEncoding, &path)) + if (!PyArg_ParseTuple(args, "O&:readlink", + PyUnicode_FSConverter, &opath)) return NULL; + path = bytes2str(opath, 1); v = PySequence_GetItem(args, 0); if (v == NULL) { - PyMem_Free(path); + release_bytes(opath); return NULL; } @@ -4592,16 +4686,16 @@ posix_readlink(PyObject *self, PyObject *args) n = 
readlink(path, buf, (int) sizeof buf); Py_END_ALLOW_THREADS if (n < 0) - return posix_error_with_allocated_filename(path); + return posix_error_with_allocated_filename(opath); - PyMem_Free(path); + release_bytes(opath); v = PyBytes_FromStringAndSize(buf, n); if (arg_is_unicode) { PyObject *w; w = PyUnicode_FromEncodedObject(v, Py_FileSystemDefaultEncoding, - "strict"); + "utf8b"); if (w != NULL) { Py_DECREF(v); v = w; @@ -4623,7 +4717,7 @@ Create a symbolic link pointing to src named dst."); static PyObject * posix_symlink(PyObject *self, PyObject *args) { - return posix_2str(args, "etet:symlink", symlink); + return posix_2str(args, "O&O&:symlink", symlink); } #endif /* HAVE_SYMLINK */ @@ -4811,7 +4905,8 @@ Open a file (for low level IO)."); static PyObject * posix_open(PyObject *self, PyObject *args) { - char *file = NULL; + PyObject *ofile; + char *file; int flag; int mode = 0777; int fd; @@ -4835,17 +4930,17 @@ posix_open(PyObject *self, PyObject *args) } #endif - if (!PyArg_ParseTuple(args, "eti|i", - Py_FileSystemDefaultEncoding, &file, + if (!PyArg_ParseTuple(args, "O&i|i", + PyUnicode_FSConverter, &ofile, &flag, &mode)) return NULL; - + file = bytes2str(ofile, 1); Py_BEGIN_ALLOW_THREADS fd = open(file, flag, mode); Py_END_ALLOW_THREADS if (fd < 0) - return posix_error_with_allocated_filename(file); - PyMem_Free(file); + return posix_error_with_allocated_filename(ofile); + release_bytes(ofile); return PyLong_FromLong((long)fd); } @@ -5289,20 +5384,27 @@ posix_putenv(PyObject *self, PyObject *args) wchar_t *s1, *s2; wchar_t *newenv; #else + PyObject *os1, *os2; char *s1, *s2; char *newenv; #endif PyObject *newstr; size_t len; - if (!PyArg_ParseTuple(args, #ifdef MS_WINDOWS + if (!PyArg_ParseTuple(args, "uu:putenv", -#else - "ss:putenv", -#endif &s1, &s2)) return NULL; +#else + if (!PyArg_ParseTuple(args, + "O&O&:putenv", + PyUnicode_FSConverter, &os1, + PyUnicode_FSConverter, &os2)) + return NULL; + s1 = bytes2str(os1, 1); + s2 = bytes2str(os2, 1); +#endif #if 
defined(PYOS_OS2) if (stricmp(s1, "BEGINLIBPATH") == 0) { @@ -5345,6 +5447,8 @@ posix_putenv(PyObject *self, PyObject *args) PyOS_snprintf(newenv, len, "%s=%s", s1, s2); if (putenv(newenv)) { Py_DECREF(newstr); + release_bytes(os1); + release_bytes(os2); posix_error(); return NULL; } @@ -5365,6 +5469,10 @@ posix_putenv(PyObject *self, PyObject *args) #if defined(PYOS_OS2) } #endif +#ifndef MS_WINDOWS + release_bytes(os1); + release_bytes(os2); +#endif Py_INCREF(Py_None); return Py_None; } @@ -6688,6 +6796,7 @@ the underlying Win32 ShellExecute function doesn't work if it is."); static PyObject * win32_startfile(PyObject *self, PyObject *args) { + PyObject *ofilepath; char *filepath; char *operation = NULL; HINSTANCE rc; @@ -6729,20 +6838,21 @@ win32_startfile(PyObject *self, PyObject *args) #endif normal: - if (!PyArg_ParseTuple(args, "et|s:startfile", - Py_FileSystemDefaultEncoding, &filepath, + if (!PyArg_ParseTuple(args, "O&|s:startfile", + PyUnicode_FSConverter, &ofilepath, &operation)) return NULL; + filepath = bytes2str(ofilepath, 1); Py_BEGIN_ALLOW_THREADS rc = ShellExecute((HWND)0, operation, filepath, NULL, NULL, SW_SHOWNORMAL); Py_END_ALLOW_THREADS if (rc <= (HINSTANCE)32) { PyObject *errval = win32_error("startfile", filepath); - PyMem_Free(filepath); + release_bytes(ofilepath); return errval; } - PyMem_Free(filepath); + release_bytes(ofilepath); Py_INCREF(Py_None); return Py_None; } diff --git a/Modules/python.c b/Modules/python.c index f6da86f..4c0a55b 100644 --- a/Modules/python.c +++ b/Modules/python.c @@ -14,6 +14,93 @@ wmain(int argc, wchar_t **argv) return Py_Main(argc, argv); } #else +static wchar_t* +char2wchar(char* arg) +{ + wchar_t *res; +#ifdef HAVE_BROKEN_MBSTOWCS + /* Some platforms have a broken implementation of + * mbstowcs which does not count the characters that + * would result from conversion. Use an upper bound. 
+ */ + size_t argsize = strlen(arg); +#else + size_t argsize = mbstowcs(NULL, arg, 0); +#endif + size_t count; + unsigned char *in; + wchar_t *out; +#ifdef HAVE_MBRTOWC + mbstate_t mbs; +#endif + if (argsize != (size_t)-1) { + res = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t)); + if (!res) + goto oom; + count = mbstowcs(res, arg, argsize+1); + if (count != (size_t)-1) + return res; + PyMem_Free(res); + } + /* Conversion failed. Fall back to escaping with utf8b. */ +#ifdef HAVE_MBRTOWC + /* Try conversion with mbrtowc (C99), and escape non-decodable bytes. */ + + /* Overallocate; as multi-byte characters are in the argument, the + actual output could use less memory. */ + argsize = strlen(arg) + 1; + res = PyMem_Malloc(argsize*sizeof(wchar_t)); + if (!res) goto oom; + in = (unsigned char*)arg; + out = res; + memset(&mbs, 0, sizeof mbs); + while (argsize) { + size_t converted = mbrtowc(out, (char*)in, argsize, &mbs); + if (converted == 0) + /* Reached end of string; null char stored. */ + break; + if (converted == (size_t)-2) { + /* Incomplete character. This should never happen, + since we provide everything that we have - + unless there is a bug in the C library, or I + misunderstood how mbrtowc works. */ + fprintf(stderr, "unexpected mbrtowc result -2\n"); + return NULL; + } + if (converted == (size_t)-1) { + /* Conversion error. Escape as UTF-8b, and start over + in the initial shift state. */ + *out++ = 0xdc00 + *in++; + argsize--; + memset(&mbs, 0, sizeof mbs); + continue; + } + /* successfully converted some bytes */ + in += converted; + argsize -= converted; + out++; + } +#else + /* Cannot use C locale for escaping; manually escape as if charset + is ASCII (i.e. escape all bytes >= 128). This will still roundtrip + correctly in the locale's charset, which must be an ASCII superset. 
*/ + res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t)); + if (!res) goto oom; + in = (unsigned char*)arg; + out = res; + while(*in) + if(*in < 128) + *out++ = *in++; + else + *out++ = 0xdc00 + *in++; + *out = 0; +#endif + return res; +oom: + fprintf(stderr, "out of memory\n"); + return NULL; +} + int main(int argc, char **argv) { @@ -40,31 +127,9 @@ main(int argc, char **argv) oldloc = strdup(setlocale(LC_ALL, NULL)); setlocale(LC_ALL, ""); for (i = 0; i < argc; i++) { -#ifdef HAVE_BROKEN_MBSTOWCS - /* Some platforms have a broken implementation of - * mbstowcs which does not count the characters that - * would result from conversion. Use an upper bound. - */ - size_t argsize = strlen(argv[i]); -#else - size_t argsize = mbstowcs(NULL, argv[i], 0); -#endif - size_t count; - if (argsize == (size_t)-1) { - fprintf(stderr, "Could not convert argument %d to string\n", i); + argv_copy2[i] = argv_copy[i] = char2wchar(argv[i]); + if (!argv_copy[i]) return 1; - } - argv_copy[i] = (wchar_t *)PyMem_Malloc((argsize+1)*sizeof(wchar_t)); - argv_copy2[i] = argv_copy[i]; - if (!argv_copy[i]) { - fprintf(stderr, "out of memory\n"); - return 1; - } - count = mbstowcs(argv_copy[i], argv[i], argsize+1); - if (count == (size_t)-1) { - fprintf(stderr, "Could not convert argument %d to string\n", i); - return 1; - } } setlocale(LC_ALL, oldloc); free(oldloc); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 18b6fa2..218e70b 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1530,6 +1530,53 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) } } +/* Convert the argument to a bytes object, according to the file + system encoding */ + +int +PyUnicode_FSConverter(PyObject* arg, void* addr) +{ + PyObject *output = NULL; + Py_ssize_t size; + void *data; + if (PyBytes_Check(arg) || PyByteArray_Check(arg)) { + output = arg; + Py_INCREF(output); + } + else { + arg = PyUnicode_FromObject(arg); + if (!arg) + return 0; + output = 
PyUnicode_AsEncodedObject(arg, + Py_FileSystemDefaultEncoding, + "utf8b"); + Py_DECREF(arg); + if (!output) + return 0; + if (!PyBytes_Check(output)) { + Py_DECREF(output); + PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes"); + return 0; + } + } + if (PyBytes_Check(output)) { + size = PyBytes_GET_SIZE(output); + data = PyBytes_AS_STRING(output); + } + else { + size = PyByteArray_GET_SIZE(output); + data = PyByteArray_AS_STRING(output); + } + if (size != strlen(data)) { + PyErr_SetString(PyExc_TypeError, "embedded NUL character"); + Py_DECREF(output); + return 0; + } + *(PyObject**)addr = output; + return 1; +} + + char* _PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize) { @@ -4154,11 +4201,22 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, collstart-startp, collend-startp, &newpos); if (repunicode == NULL) goto onError; - if (!PyUnicode_Check(repunicode)) { - /* Implementation limitation: byte results not supported yet. */ - PyErr_SetString(PyExc_TypeError, "error handler should return unicode"); + if (PyBytes_Check(repunicode)) { + /* Directly copy bytes result to output. */ + repsize = PyBytes_Size(repunicode); + if (repsize > 1) { + /* Make room for all additional bytes. */ + if (_PyBytes_Resize(&res, ressize+repsize-1)) { + Py_DECREF(repunicode); + goto onError; + } + ressize += repsize-1; + } + memcpy(str, PyBytes_AsString(repunicode), repsize); + str += repsize; + p = startp + newpos; Py_DECREF(repunicode); - goto onError; + break; } /* need more space? (at least enough for what we have+the replacement+the rest of the string, so @@ -5123,11 +5181,24 @@ int charmap_encoding_error( collstartpos, collendpos, &newpos); if (repunicode == NULL) return -1; - if (!PyUnicode_Check(repunicode)) { - /* Implementation limitation: byte results not supported yet. */ - PyErr_SetString(PyExc_TypeError, "error handler should return unicode"); + if (PyBytes_Check(repunicode)) { + /* Directly copy bytes result to output. 
*/ + Py_ssize_t outsize = PyBytes_Size(*res); + Py_ssize_t requiredsize; + repsize = PyBytes_Size(repunicode); + requiredsize = *respos + repsize; + if (requiredsize > outsize) + /* Make room for all additional bytes. */ + if (charmapencode_resize(res, respos, requiredsize)) { + Py_DECREF(repunicode); + return -1; + } + memcpy(PyBytes_AsString(*res) + *respos, + PyBytes_AsString(repunicode), repsize); + *respos += repsize; + *inpos = newpos; Py_DECREF(repunicode); - return -1; + break; } /* generate replacement */ repsize = PyUnicode_GET_SIZE(repunicode); @@ -5691,7 +5762,7 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s, if (repunicode == NULL) goto onError; if (!PyUnicode_Check(repunicode)) { - /* Implementation limitation: byte results not supported yet. */ + /* Byte results not supported, since they have no decimal property. */ PyErr_SetString(PyExc_TypeError, "error handler should return unicode"); Py_DECREF(repunicode); goto onError; diff --git a/Python/codecs.c b/Python/codecs.c index 633a24c..7e3ff8a 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -829,6 +829,82 @@ PyCodec_SurrogateErrors(PyObject *exc) } } +static PyObject * +PyCodec_UTF8bErrors(PyObject *exc) +{ + PyObject *restuple; + PyObject *object; + Py_ssize_t start; + Py_ssize_t end; + PyObject *res; + if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { + Py_UNICODE *p; + Py_UNICODE *startp; + char *outp; + if (PyUnicodeEncodeError_GetStart(exc, &start)) + return NULL; + if (PyUnicodeEncodeError_GetEnd(exc, &end)) + return NULL; + if (!(object = PyUnicodeEncodeError_GetObject(exc))) + return NULL; + startp = PyUnicode_AS_UNICODE(object); + res = PyBytes_FromStringAndSize(NULL, end-start); + if (!res) { + Py_DECREF(object); + return NULL; + } + outp = PyBytes_AsString(res); + for (p = startp+start; p < startp+end; p++) { + Py_UNICODE ch = *p; + if (ch < 0xdc80 || ch > 0xdcff) { + /* Not a UTF-8b surrogate, fail with original exception */ + PyErr_SetObject(PyExceptionInstance_Class(exc), 
exc); + Py_DECREF(res); + Py_DECREF(object); + return NULL; + } + *outp++ = ch - 0xdc00; + } + restuple = Py_BuildValue("(On)", res, end); + Py_DECREF(res); + Py_DECREF(object); + return restuple; + } + else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { + unsigned char *p; + Py_UNICODE ch[4]; /* decode up to 4 bad bytes. */ + int consumed = 0; + if (PyUnicodeDecodeError_GetStart(exc, &start)) + return NULL; + if (PyUnicodeDecodeError_GetEnd(exc, &end)) + return NULL; + if (!(object = PyUnicodeDecodeError_GetObject(exc))) + return NULL; + if (!(p = (unsigned char*)PyBytes_AsString(object))) { + Py_DECREF(object); + return NULL; + } + while (consumed < 4 && consumed < end-start) { + /* Refuse to escape ASCII bytes. */ + if (p[start+consumed] < 128) + break; + ch[consumed] = 0xdc00 + p[start+consumed]; + consumed++; + } + Py_DECREF(object); + if (!consumed) { + /* codec complained about ASCII byte. */ + PyErr_SetObject(PyExceptionInstance_Class(exc), exc); + return NULL; + } + return Py_BuildValue("(u#n)", ch, consumed, start+consumed); + } + else { + wrong_exception_type(exc); + return NULL; + } +} + static PyObject *strict_errors(PyObject *self, PyObject *exc) { @@ -864,6 +940,11 @@ static PyObject *surrogates_errors(PyObject *self, PyObject *exc) return PyCodec_SurrogateErrors(exc); } +static PyObject *utf8b_errors(PyObject *self, PyObject *exc) +{ + return PyCodec_UTF8bErrors(exc); +} + static int _PyCodecRegistry_Init(void) { static struct { @@ -918,6 +999,14 @@ static int _PyCodecRegistry_Init(void) surrogates_errors, METH_O } + }, + { + "utf8b", + { + "utf8b", + utf8b_errors, + METH_O + } } }; diff --git a/Python/pythonrun.c b/Python/pythonrun.c index f93403b..c75f55f 100644 --- a/Python/pythonrun.c +++ b/Python/pythonrun.c @@ -262,6 +262,22 @@ Py_InitializeEx(int install_sigs) _PyImportHooks_Init(); +#if defined(HAVE_LANGINFO_H) && defined(CODESET) + /* On Unix, set the file system encoding according to the + user's preference, if the CODESET names 
a well-known + Python codec, and Py_FileSystemDefaultEncoding isn't + initialized by other means. Also set the encoding of + stdin and stdout if these are terminals. */ + + codeset = get_codeset(); + if (codeset) { + if (!Py_FileSystemDefaultEncoding) + Py_FileSystemDefaultEncoding = codeset; + else + free(codeset); + } +#endif + if (install_sigs) initsigs(); /* Signal handling stuff, including initintr() */ @@ -285,22 +301,6 @@ Py_InitializeEx(int install_sigs) #ifdef WITH_THREAD _PyGILState_Init(interp, tstate); #endif /* WITH_THREAD */ - -#if defined(HAVE_LANGINFO_H) && defined(CODESET) - /* On Unix, set the file system encoding according to the - user's preference, if the CODESET names a well-known - Python codec, and Py_FileSystemDefaultEncoding isn't - initialized by other means. Also set the encoding of - stdin and stdout if these are terminals. */ - - codeset = get_codeset(); - if (codeset) { - if (!Py_FileSystemDefaultEncoding) - Py_FileSystemDefaultEncoding = codeset; - else - free(codeset); - } -#endif } void diff --git a/configure b/configure index d1da285..cdc9515 100755 --- a/configure +++ b/configure @@ -1,5 +1,5 @@ #! /bin/sh -# From configure.in Revision: 71731 . +# From configure.in Revision: 72144 . # Guess values for system-dependent variables and create Makefiles. # Generated by GNU Autoconf 2.61 for python 3.1. 
# @@ -16299,11 +16299,12 @@ echo "${ECHO_T}MACHDEP_OBJS" >&6; } + for ac_func in alarm setitimer getitimer bind_textdomain_codeset chown \ clock confstr ctermid execv fchmod fchown fork fpathconf ftime ftruncate \ gai_strerror getgroups getlogin getloadavg getpeername getpgid getpid \ getpriority getpwent getspnam getspent getsid getwd \ - kill killpg lchmod lchown lstat mkfifo mknod mktime \ + kill killpg lchmod lchown lstat mbrtowc mkfifo mknod mktime \ mremap nice pathconf pause plock poll pthread_init \ putenv readlink realpath \ select sem_open sem_timedwait sem_getvalue sem_unlink setegid seteuid \ diff --git a/configure.in b/configure.in index 6a1e231..ba43b21 100644 --- a/configure.in +++ b/configure.in @@ -2403,7 +2403,7 @@ AC_CHECK_FUNCS(alarm setitimer getitimer bind_textdomain_codeset chown \ clock confstr ctermid execv fchmod fchown fork fpathconf ftime ftruncate \ gai_strerror getgroups getlogin getloadavg getpeername getpgid getpid \ getpriority getpwent getspnam getspent getsid getwd \ - kill killpg lchmod lchown lstat mkfifo mknod mktime \ + kill killpg lchmod lchown lstat mbrtowc mkfifo mknod mktime \ mremap nice pathconf pause plock poll pthread_init \ putenv readlink realpath \ select sem_open sem_timedwait sem_getvalue sem_unlink setegid seteuid \ diff --git a/pyconfig.h.in b/pyconfig.h.in index 01bc235..4c77900 100644 --- a/pyconfig.h.in +++ b/pyconfig.h.in @@ -419,6 +419,9 @@ /* Define this if you have the makedev macro. */ #undef HAVE_MAKEDEV +/* Define to 1 if you have the `mbrtowc' function. */ +#undef HAVE_MBRTOWC + /* Define to 1 if you have the `memmove' function. */ #undef HAVE_MEMMOVE -- cgit v0.12