29 files changed, 953 insertions, 499 deletions
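The first file touched below, Doc/library/http.client.rst, rewords the HTTPConnection.endheaders() documentation. A minimal usage sketch of the step-by-step request API being documented (the host, path and body here are made up for illustration and are not part of the patch; the code performs a real network round trip if actually run):

    import http.client

    # Hypothetical endpoint, for illustration only.
    conn = http.client.HTTPConnection("www.example.com")
    conn.putrequest("POST", "/form")
    body = "key=value"
    conn.putheader("Content-Type", "application/x-www-form-urlencoded")
    conn.putheader("Content-Length", str(len(body)))
    # Per the revised wording: a str body may be sent in the same packet
    # as the headers; any other body is sent in a separate packet.
    conn.endheaders(message_body=body)
    print(conn.getresponse().status)
    conn.close()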
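Most of the remaining hunks follow from PEP 393 (the flexible string representation): Include/unicodeobject.h now documents the 1-, 2- and 4-byte storage kinds, _PyUnicode_CheckConsistency gains an optional content check, an ASCII-specialized stringlib header (asciilib.h) is added, and the tests drop their narrow-build special cases. A small Python-level sketch of the behaviour those test changes rely on, assuming a PEP 393 interpreter (CPython 3.3 or later); the getsizeof comparison is indicative only and exact sizes vary by platform:

    import sys

    # There is no narrow build any more: astral code points are single
    # characters and sys.maxunicode is always 0x10FFFF.
    assert sys.maxunicode == 0x10FFFF
    assert len("\U00010427") == 1

    # Case mapping of non-BMP characters works, matching the test_unicode
    # hunks below that remove @requires_wide_build.
    assert "\U00010427".lower() == "\U0001044F"
    assert "\U0001044F".upper() == "\U00010427"

    # Storage adapts to the widest character present: roughly 1 byte per
    # character for latin-1 text, 2 for BMP, 4 for astral.
    ascii_s  = "a" * 100
    bmp_s    = "\u20ac" * 100
    astral_s = "\U00010427" * 100
    assert sys.getsizeof(ascii_s) < sys.getsizeof(bmp_s) < sys.getsizeof(astral_s)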
diff --git a/Doc/library/http.client.rst b/Doc/library/http.client.rst index 9ef7956..9b1ab0f 100644 --- a/Doc/library/http.client.rst +++ b/Doc/library/http.client.rst @@ -475,11 +475,10 @@ also send your request step by step, by using the four functions below. .. method:: HTTPConnection.endheaders(message_body=None) Send a blank line to the server, signalling the end of the headers. The - optional message_body argument can be used to pass message body - associated with the request. The message body will be sent in - the same packet as the message headers if possible. The - message_body should be a string. - + optional *message_body* argument can be used to pass a message body + associated with the request. The message body will be sent in the same + packet as the message headers if it is string, otherwise it is sent in a + separate packet. .. method:: HTTPConnection.send(data) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 1b4522d..75dec86 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -85,7 +85,7 @@ Copyright (c) Corporation for National Research Initiatives. /* Py_UNICODE was the native Unicode storage format (code unit) used by Python and represents a single Unicode element in the Unicode type. - With PEP 393, Py_UNICODE is deprected and replaced with a + With PEP 393, Py_UNICODE is deprecated and replaced with a typedef to wchar_t. */ #ifndef Py_LIMITED_API @@ -115,7 +115,7 @@ typedef wchar_t Py_UNICODE; # include <wchar.h> #endif -/* Py_UCS4 and Py_UCS2 are typdefs for the respecitve +/* Py_UCS4 and Py_UCS2 are typedefs for the respective unicode representations. */ #if SIZEOF_INT >= 4 typedef unsigned int Py_UCS4; @@ -288,10 +288,27 @@ typedef struct { unsigned int interned:2; /* Character size: - PyUnicode_WCHAR_KIND (0): wchar_t* - PyUnicode_1BYTE_KIND (1): Py_UCS1* - PyUnicode_2BYTE_KIND (2): Py_UCS2* - PyUnicode_4BYTE_KIND (3): Py_UCS4* + - PyUnicode_WCHAR_KIND (0): + + * character type = wchar_t (16 or 32 bits, depending on the + platform) + + - PyUnicode_1BYTE_KIND (1): + + * character type = Py_UCS1 (8 bits, unsigned) + * if ascii is set, all characters must be in range + U+0000-U+007F, otherwise at least one character must be in range + U+0080-U+00FF + + - PyUnicode_2BYTE_KIND (2): + + * character type = Py_UCS2 (16 bits, unsigned) + * at least one character must be in range U+0100-U+FFFF + + - PyUnicode_4BYTE_KIND (3): + + * character type = Py_UCS4 (32 bits, unsigned) + * at least one character must be in range U+10000-U+10FFFF */ unsigned int kind:2; /* Compact is with respect to the allocation scheme. Compact unicode @@ -299,9 +316,9 @@ typedef struct { one block for the PyUnicodeObject struct and another for its data buffer. */ unsigned int compact:1; - /* kind is PyUnicode_1BYTE_KIND but data contains only ASCII - characters. If ascii is 1 and compact is 1, use the PyASCIIObject - structure. */ + /* The string only contains characters in range U+0000-U+007F (ASCII) + and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is + set, use the PyASCIIObject structure. */ unsigned int ascii:1; /* The ready flag indicates whether the object layout is initialized completely. This means that this is either a compact object, or @@ -313,7 +330,7 @@ typedef struct { } PyASCIIObject; /* Non-ASCII strings allocated through PyUnicode_New use the - PyCompactUnicodeOject structure. state.compact is set, and the data + PyCompactUnicodeObject structure. state.compact is set, and the data immediately follow the structure. 
*/ typedef struct { PyASCIIObject _base; @@ -382,7 +399,7 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; ((const char *)(PyUnicode_AS_UNICODE(op))) -/* --- Flexible String Representaion Helper Macros (PEP 393) -------------- */ +/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */ /* Values for PyUnicodeObject.state: */ @@ -426,7 +443,7 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; #define PyUnicode_CHARACTER_SIZE(op) \ (1 << (PyUnicode_KIND(op) - 1)) -/* Return pointers to the canonical representation casted as unsigned char, +/* Return pointers to the canonical representation cast to unsigned char, Py_UCS2, or Py_UCS4 for direct character access. No checks are performed, use PyUnicode_CHARACTER_SIZE or PyUnicode_KIND() before to ensure these will work correctly. */ @@ -468,9 +485,9 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; /* Write into the canonical representation, this macro does not do any sanity checks and is intended for usage in loops. The caller should cache the - kind and data pointers optained form other macro calls. + kind and data pointers obtained from other macro calls. index is the index in the string (starts at 0) and value is the new - code point value which shoule be written to that location. */ + code point value which should be written to that location. */ #define PyUnicode_WRITE(kind, data, index, value) \ do { \ switch ((kind)) { \ @@ -489,7 +506,7 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; } \ } while (0) -/* Read a code point form the string's canonical representation. No checks +/* Read a code point from the string's canonical representation. No checks or ready calls are performed. */ #define PyUnicode_READ(kind, data, index) \ ((Py_UCS4) \ @@ -542,7 +559,7 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; /* Return a maximum character value which is suitable for creating another string based on op. This is always an approximation but more efficient - than interating over the string. */ + than iterating over the string. */ #define PyUnicode_MAX_CHAR_VALUE(op) \ (assert(PyUnicode_IS_READY(op)), \ (PyUnicode_IS_COMPACT_ASCII(op) ? 0x7f: \ @@ -654,6 +671,8 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromString( const char *u /* UTF-8 encoded string */ ); +/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters. + Scan the string to find the maximum character. */ #ifndef Py_LIMITED_API PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData( int kind, @@ -934,8 +953,8 @@ PyAPI_FUNC(int) PyUnicode_ClearFreeList(void); In case of an error, no *size is set. - This funcation caches the UTF-8 encoded string in the unicodeobject - and subsequent calls will return the same string. The memory is relased + This function caches the UTF-8 encoded string in the unicodeobject + and subsequent calls will return the same string. The memory is released when the unicodeobject is deallocated. _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to @@ -1585,7 +1604,7 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( These are capable of handling Unicode objects and strings on input (we refer to them as strings in the descriptions) and return - Unicode objects or integers as apporpriate. */ + Unicode objects or integers as appropriate. */ /* Concat two strings giving a new Unicode string. 
*/ @@ -1765,7 +1784,7 @@ PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString( /* Rich compare two strings and return one of the following: - NULL in case an exception was raised - - Py_True or Py_False for successfuly comparisons + - Py_True or Py_False for successfully comparisons - Py_NotImplemented in case the type combination is unknown Note that Py_EQ and Py_NE comparisons can cause a UnicodeWarning in @@ -1833,6 +1852,7 @@ PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buff see Objects/stringlib/localeutil.h */ #ifndef Py_LIMITED_API PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping( + PyObject *unicode, int kind, void *buffer, Py_ssize_t n_buffer, @@ -2011,6 +2031,13 @@ PyAPI_FUNC(Py_UNICODE*) PyUnicode_AsUnicodeCopy( ); #endif /* Py_LIMITED_API */ +#if defined(Py_DEBUG) && !defined(Py_LIMITED_API) +/* FIXME: use PyObject* type for op */ +PyAPI_FUNC(int) _PyUnicode_CheckConsistency( + void *op, + int check_content); +#endif + #ifdef __cplusplus } #endif diff --git a/Lib/http/client.py b/Lib/http/client.py index 5775573..a490e2b 100644 --- a/Lib/http/client.py +++ b/Lib/http/client.py @@ -947,11 +947,11 @@ class HTTPConnection: def endheaders(self, message_body=None): """Indicate that the last header line has been sent to the server. - This method sends the request to the server. The optional - message_body argument can be used to pass message body - associated with the request. The message body will be sent in - the same packet as the message headers if possible. The - message_body should be a string. + This method sends the request to the server. The optional message_body + argument can be used to pass a message body associated with the + request. The message body will be sent in the same packet as the + message headers if it is a string, otherwise it is sent as a separate + packet. 
""" if self.__state == _CS_REQ_STARTED: self.__state = _CS_REQ_SENT diff --git a/Lib/idlelib/configHandler.py b/Lib/idlelib/configHandler.py index 73b8db5..79315ef 100644 --- a/Lib/idlelib/configHandler.py +++ b/Lib/idlelib/configHandler.py @@ -145,7 +145,8 @@ class IdleUserConfParser(IdleConfParser): except IOError: os.unlink(fname) cfgFile = open(fname, 'w') - self.write(cfgFile) + with cfgFile: + self.write(cfgFile) else: self.RemoveFile() diff --git a/Lib/idlelib/rpc.py b/Lib/idlelib/rpc.py index 0c56ccd..53f4aa8 100644 --- a/Lib/idlelib/rpc.py +++ b/Lib/idlelib/rpc.py @@ -534,6 +534,10 @@ class RPCClient(SocketIO): def get_remote_proxy(self, oid): return RPCProxy(self, oid) + def close(self): + self.listening_sock.close() + SocketIO.close(self) + class RPCProxy(object): __methods = None diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index f52ea01..75f3a09 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -318,11 +318,13 @@ def _optimize_unicode(charset, fixup): # XXX: could expand category return charset # cannot compress except IndexError: - # non-BMP characters + # non-BMP characters; XXX now they should work return charset if negate: if sys.maxunicode != 65535: # XXX: negation does not work with big charsets + # XXX2: now they should work, but removing this will make the + # charmap 17 times bigger return charset for i in range(65536): charmap[i] = not charmap[i] diff --git a/Lib/test/test_builtin.py b/Lib/test/test_builtin.py index 4e33c23..587c1c0 100644 --- a/Lib/test/test_builtin.py +++ b/Lib/test/test_builtin.py @@ -249,8 +249,7 @@ class BuiltinTest(unittest.TestCase): self.assertEqual(chr(0xff), '\xff') self.assertRaises(ValueError, chr, 1<<24) self.assertEqual(chr(sys.maxunicode), - str(('\\U%08x' % (sys.maxunicode)).encode("ascii"), - 'unicode-escape')) + str('\\U0010ffff'.encode("ascii"), 'unicode-escape')) self.assertRaises(TypeError, chr) self.assertEqual(chr(0x0000FFFF), "\U0000FFFF") self.assertEqual(chr(0x00010000), "\U00010000") diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index 486f49e..caa1b96 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -1,8 +1,13 @@ import test.support, unittest import sys, codecs, html.entities, unicodedata -import ctypes -SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar) +try: + import ctypes +except ImportError: + ctypes = None + SIZEOF_WCHAR_T = -1 +else: + SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar) class PosReturn: # this can be used for configurable callbacks @@ -138,22 +143,14 @@ class CodecCallbackTest(unittest.TestCase): def test_backslashescape(self): # Does the same as the "unicode-escape" encoding, but with different # base encodings. 
- sin = "a\xac\u1234\u20ac\u8000" - if sys.maxunicode > 0xffff: - sin += chr(sys.maxunicode) - sout = b"a\\xac\\u1234\\u20ac\\u8000" - if sys.maxunicode > 0xffff: - sout += bytes("\\U%08x" % sys.maxunicode, "ascii") + sin = "a\xac\u1234\u20ac\u8000\U0010ffff" + sout = b"a\\xac\\u1234\\u20ac\\u8000\\U0010ffff" self.assertEqual(sin.encode("ascii", "backslashreplace"), sout) - sout = b"a\xac\\u1234\\u20ac\\u8000" - if sys.maxunicode > 0xffff: - sout += bytes("\\U%08x" % sys.maxunicode, "ascii") + sout = b"a\xac\\u1234\\u20ac\\u8000\\U0010ffff" self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout) - sout = b"a\xac\\u1234\xa4\\u8000" - if sys.maxunicode > 0xffff: - sout += bytes("\\U%08x" % sys.maxunicode, "ascii") + sout = b"a\xac\\u1234\xa4\\u8000\\U0010ffff" self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout) def test_decoding_callbacks(self): @@ -580,33 +577,34 @@ class CodecCallbackTest(unittest.TestCase): UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")), ("\\uffff", 1) ) - if ctypes.sizeof(ctypes.c_wchar) == 2: + if SIZEOF_WCHAR_T == 2: len_wide = 2 else: len_wide = 1 - self.assertEqual( - codecs.backslashreplace_errors( - UnicodeEncodeError("ascii", "\U00010000", - 0, len_wide, "ouch")), - ("\\U00010000", len_wide) - ) - self.assertEqual( - codecs.backslashreplace_errors( - UnicodeEncodeError("ascii", "\U0010ffff", - 0, len_wide, "ouch")), - ("\\U0010ffff", len_wide) - ) - # Lone surrogates (regardless of unicode width) - self.assertEqual( - codecs.backslashreplace_errors( - UnicodeEncodeError("ascii", "\ud800", 0, 1, "ouch")), - ("\\ud800", 1) - ) - self.assertEqual( - codecs.backslashreplace_errors( - UnicodeEncodeError("ascii", "\udfff", 0, 1, "ouch")), - ("\\udfff", 1) - ) + if SIZEOF_WCHAR_T > 0: + self.assertEqual( + codecs.backslashreplace_errors( + UnicodeEncodeError("ascii", "\U00010000", + 0, len_wide, "ouch")), + ("\\U00010000", len_wide) + ) + self.assertEqual( + codecs.backslashreplace_errors( + UnicodeEncodeError("ascii", "\U0010ffff", + 0, len_wide, "ouch")), + ("\\U0010ffff", len_wide) + ) + # Lone surrogates (regardless of unicode width) + self.assertEqual( + codecs.backslashreplace_errors( + UnicodeEncodeError("ascii", "\ud800", 0, 1, "ouch")), + ("\\ud800", 1) + ) + self.assertEqual( + codecs.backslashreplace_errors( + UnicodeEncodeError("ascii", "\udfff", 0, 1, "ouch")), + ("\\udfff", 1) + ) def test_badhandlerresults(self): results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) ) diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index f70ae33..e9ce95a 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -3,9 +3,14 @@ import unittest import codecs import locale import sys, _testcapi, io -import ctypes -SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar) +try: + import ctypes +except ImportError: + ctypes = None + SIZEOF_WCHAR_T = -1 +else: + SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar) class Queue(object): """ diff --git a/Lib/test/test_import.py b/Lib/test/test_import.py index 6cbd218..338f1d4 100644 --- a/Lib/test/test_import.py +++ b/Lib/test/test_import.py @@ -4,6 +4,7 @@ from importlib.test.import_ import test_relative_imports from importlib.test.import_ import util as importlib_util import marshal import os +import platform import py_compile import random import stat @@ -544,6 +545,8 @@ class PycacheTests(unittest.TestCase): @unittest.skipUnless(os.name == 'posix', "test meaningful only on posix systems") + @unittest.skipIf(hasattr(os, 'geteuid') 
and os.geteuid() == 0, + "due to varying filesystem permission semantics (issue #11956)") def test_unwritable_directory(self): # When the umask causes the new __pycache__ directory to be # unwritable, the import still succeeds but no .pyc file is written. diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py index 455eda3..a9e3c37 100644 --- a/Lib/test/test_io.py +++ b/Lib/test/test_io.py @@ -2421,6 +2421,21 @@ class CTextIOWrapperTest(TextIOWrapperTest): with self.open(support.TESTFN, "rb") as f: self.assertEqual(f.read(), b"456def") + def test_rwpair_cleared_before_textio(self): + # Issue 13070: TextIOWrapper's finalization would crash when called + # after the reference to the underlying BufferedRWPair's writer got + # cleared by the GC. + for i in range(1000): + b1 = self.BufferedRWPair(self.MockRawIO(), self.MockRawIO()) + t1 = self.TextIOWrapper(b1, encoding="ascii") + b2 = self.BufferedRWPair(self.MockRawIO(), self.MockRawIO()) + t2 = self.TextIOWrapper(b2, encoding="ascii") + # circular references + t1.buddy = t2 + t2.buddy = t1 + support.gc_collect() + + class PyTextIOWrapperTest(TextIOWrapperTest): pass diff --git a/Lib/test/test_multibytecodec.py b/Lib/test/test_multibytecodec.py index 4448072..feb7bd5 100644 --- a/Lib/test/test_multibytecodec.py +++ b/Lib/test/test_multibytecodec.py @@ -247,14 +247,9 @@ class Test_ISO2022(unittest.TestCase): self.assertFalse(any(x > 0x80 for x in e)) def test_bug1572832(self): - if sys.maxunicode >= 0x10000: - myunichr = chr - else: - myunichr = lambda x: chr(0xD7C0+(x>>10)) + chr(0xDC00+(x&0x3FF)) - for x in range(0x10000, 0x110000): # Any ISO 2022 codec will cause the segfault - myunichr(x).encode('iso_2022_jp', 'ignore') + chr(x).encode('iso_2022_jp', 'ignore') class TestStateful(unittest.TestCase): text = '\u4E16\u4E16' diff --git a/Lib/test/test_sys_settrace.py b/Lib/test/test_sys_settrace.py index acb8e29..578e95d 100644 --- a/Lib/test/test_sys_settrace.py +++ b/Lib/test/test_sys_settrace.py @@ -283,11 +283,11 @@ class TraceTestCase(unittest.TestCase): self.compare_events(func.__code__.co_firstlineno, tracer.events, func.events) - def set_and_retrieve_none(self): + def test_set_and_retrieve_none(self): sys.settrace(None) assert sys.gettrace() is None - def set_and_retrieve_func(self): + def test_set_and_retrieve_func(self): def fn(*args): pass diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 840b76f..870853e 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -13,10 +13,6 @@ import warnings from test import support, string_tests import _string -# decorator to skip tests on narrow builds -requires_wide_build = unittest.skipIf(sys.maxunicode == 65535, - 'requires wide build') - # Error handling (bad decoder return) def search_function(encoding): def decode1(input, errors="strict"): @@ -519,7 +515,6 @@ class UnicodeTest(string_tests.CommonTest, self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name)) - @requires_wide_build def test_lower(self): string_tests.CommonTest.test_lower(self) self.assertEqual('\U00010427'.lower(), '\U0001044F') @@ -530,7 +525,6 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual('X\U00010427x\U0001044F'.lower(), 'x\U0001044Fx\U0001044F') - @requires_wide_build def test_upper(self): string_tests.CommonTest.test_upper(self) self.assertEqual('\U0001044F'.upper(), '\U00010427') @@ -541,7 +535,6 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual('X\U00010427x\U0001044F'.upper(), 'X\U00010427X\U00010427') - @requires_wide_build def 
test_capitalize(self): string_tests.CommonTest.test_capitalize(self) self.assertEqual('\U0001044F'.capitalize(), '\U00010427') @@ -554,7 +547,6 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual('X\U00010427x\U0001044F'.capitalize(), 'X\U0001044Fx\U0001044F') - @requires_wide_build def test_title(self): string_tests.MixinStrUnicodeUserStringTest.test_title(self) self.assertEqual('\U0001044F'.title(), '\U00010427') @@ -569,7 +561,6 @@ class UnicodeTest(string_tests.CommonTest, self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(), 'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F') - @requires_wide_build def test_swapcase(self): string_tests.CommonTest.test_swapcase(self) self.assertEqual('\U0001044F'.swapcase(), '\U00010427') @@ -1114,15 +1105,12 @@ class UnicodeTest(string_tests.CommonTest, def test_codecs_utf8(self): self.assertEqual(''.encode('utf-8'), b'') self.assertEqual('\u20ac'.encode('utf-8'), b'\xe2\x82\xac') - if sys.maxunicode == 65535: - self.assertEqual('\ud800\udc02'.encode('utf-8'), b'\xf0\x90\x80\x82') - self.assertEqual('\ud84d\udc56'.encode('utf-8'), b'\xf0\xa3\x91\x96') + self.assertEqual('\U00010002'.encode('utf-8'), b'\xf0\x90\x80\x82') + self.assertEqual('\U00023456'.encode('utf-8'), b'\xf0\xa3\x91\x96') self.assertEqual('\ud800'.encode('utf-8', 'surrogatepass'), b'\xed\xa0\x80') self.assertEqual('\udc00'.encode('utf-8', 'surrogatepass'), b'\xed\xb0\x80') - if sys.maxunicode == 65535: - self.assertEqual( - ('\ud800\udc02'*1000).encode('utf-8'), - b'\xf0\x90\x80\x82'*1000) + self.assertEqual(('\U00010002'*10).encode('utf-8'), + b'\xf0\x90\x80\x82'*10) self.assertEqual( '\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f' '\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00' diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 671708a..77637a6 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -1058,6 +1058,10 @@ class Utility_Tests(unittest.TestCase): self.assertEqual(('user', 'a\vb'),urllib.parse.splitpasswd('user:a\vb')) self.assertEqual(('user', 'a:b'),urllib.parse.splitpasswd('user:a:b')) + def test_thishost(self): + """Test the urllib.request.thishost utility function returns a tuple""" + self.assertIsInstance(urllib.request.thishost(), tuple) + class URLopener_Tests(unittest.TestCase): """Testcase to test the open method of URLopener class.""" diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index 4a571e8..671ab68 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -2125,7 +2125,7 @@ def thishost(): """Return the IP addresses of the current host.""" global _thishost if _thishost is None: - _thishost = tuple(socket.gethostbyname_ex(socket.gethostname()[2])) + _thishost = tuple(socket.gethostbyname_ex(socket.gethostname())[2]) return _thishost _ftperrors = None @@ -1316,6 +1316,10 @@ Tools/Demos Extension Modules ----------------- +- Issue #13070: Fix a crash when a TextIOWrapper caught in a reference cycle + would be finalized after the reference to its underlying BufferedRWPair's + writer got cleared by the GC. + - Issue #12881: ctypes: Fix segfault with large structure field names. - Issue #13058: ossaudiodev: fix a file descriptor leak on error. 
Patch by diff --git a/Modules/_io/bufferedio.c b/Modules/_io/bufferedio.c index b8043d4..86f7412 100644 --- a/Modules/_io/bufferedio.c +++ b/Modules/_io/bufferedio.c @@ -2307,6 +2307,11 @@ bufferedrwpair_isatty(rwpair *self, PyObject *args) static PyObject * bufferedrwpair_closed_get(rwpair *self, void *context) { + if (self->writer == NULL) { + PyErr_SetString(PyExc_RuntimeError, + "the BufferedRWPair object is being garbage-collected"); + return NULL; + } return PyObject_GetAttr((PyObject *) self->writer, _PyIO_str_closed); } diff --git a/Modules/_io/stringio.c b/Modules/_io/stringio.c index c40163f..a4536b1 100644 --- a/Modules/_io/stringio.c +++ b/Modules/_io/stringio.c @@ -131,6 +131,10 @@ write_str(stringio *self, PyObject *obj) return -1; assert(PyUnicode_Check(decoded)); + if (PyUnicode_READY(decoded)) { + Py_DECREF(decoded); + return -1; + } len = PyUnicode_GET_LENGTH(decoded); assert(len >= 0); diff --git a/Objects/exceptions.c b/Objects/exceptions.c index 703e72e..60b340b 100644 --- a/Objects/exceptions.c +++ b/Objects/exceptions.c @@ -963,8 +963,13 @@ static PyObject* my_basename(PyObject *name) { Py_ssize_t i, size, offset; - int kind = PyUnicode_KIND(name); - void *data = PyUnicode_DATA(name); + int kind; + void *data; + + if (PyUnicode_READY(name)) + return NULL; + kind = PyUnicode_KIND(name); + data = PyUnicode_DATA(name); size = PyUnicode_GET_LENGTH(name); offset = 0; for(i=0; i < size; i++) { diff --git a/Objects/stringlib/asciilib.h b/Objects/stringlib/asciilib.h new file mode 100644 index 0000000..935a9c7 --- /dev/null +++ b/Objects/stringlib/asciilib.h @@ -0,0 +1,34 @@ +/* this is sort of a hack. there's at least one place (formatting + floats) where some stringlib code takes a different path if it's + compiled as unicode. */ +#define STRINGLIB_IS_UNICODE 1 + +#define FASTSEARCH asciilib_fastsearch +#define STRINGLIB(F) asciilib_##F +#define STRINGLIB_OBJECT PyUnicodeObject +#define STRINGLIB_CHAR Py_UCS1 +#define STRINGLIB_TYPE_NAME "unicode" +#define STRINGLIB_PARSE_CODE "U" +#define STRINGLIB_EMPTY unicode_empty +#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE +#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK +#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL +#define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL +#define STRINGLIB_TOUPPER Py_UNICODE_TOUPPER +#define STRINGLIB_TOLOWER Py_UNICODE_TOLOWER +#define STRINGLIB_FILL Py_UNICODE_FILL +#define STRINGLIB_STR PyUnicode_1BYTE_DATA +#define STRINGLIB_LEN PyUnicode_GET_LENGTH +#define STRINGLIB_NEW unicode_fromascii +#define STRINGLIB_RESIZE not_supported +#define STRINGLIB_CHECK PyUnicode_Check +#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact +#define STRINGLIB_GROUPING _PyUnicode_InsertThousandsGrouping +#define STRINGLIB_GROUPING_LOCALE _PyUnicode_InsertThousandsGroupingLocale + +#define STRINGLIB_TOSTR PyObject_Str +#define STRINGLIB_TOASCII PyObject_ASCII + +#define _Py_InsertThousandsGrouping _PyUnicode_ascii_InsertThousandsGrouping +#define _Py_InsertThousandsGroupingLocale _PyUnicode_ascii_InsertThousandsGroupingLocale + diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index af4ce63..72007d9 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -46,6 +46,10 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
#include <windows.h> #endif +#ifdef Py_DEBUG +# define DONT_MAKE_RESULT_READY +#endif + /* Limit for the Unicode object free list */ #define PyUnicode_MAXFREELIST 1024 @@ -90,7 +94,7 @@ extern "C" { #endif #ifdef Py_DEBUG -# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op) +# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) #else # define _PyUnicode_CHECK(op) PyUnicode_Check(op) #endif @@ -235,6 +239,13 @@ const unsigned char _Py_ascii_whitespace[] = { /* forward */ static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); static PyObject* get_latin1_char(unsigned char ch); +static void copy_characters( + PyObject *to, Py_ssize_t to_start, + PyObject *from, Py_ssize_t from_start, + Py_ssize_t how_many); +#ifdef Py_DEBUG +static int unicode_is_singleton(PyObject *unicode); +#endif static PyObject * unicode_encode_call_errorhandler(const char *errors, @@ -292,8 +303,9 @@ PyUnicode_GetMax(void) } #ifdef Py_DEBUG -static int -_PyUnicode_CheckConsistency(void *op) +int +/* FIXME: use PyObject* type for op */ +_PyUnicode_CheckConsistency(void *op, int check_content) { PyASCIIObject *ascii; unsigned int kind; @@ -367,12 +379,31 @@ _PyUnicode_CheckConsistency(void *op) if (ascii->wstr == NULL) assert(compact->wstr_length == 0); } - return 1; -} -#else -static int -_PyUnicode_CheckConsistency(void *op) -{ + /* check that the best kind is used */ + if (check_content && kind != PyUnicode_WCHAR_KIND) + { + Py_ssize_t i; + Py_UCS4 maxchar = 0; + void *data = PyUnicode_DATA(ascii); + for (i=0; i < ascii->length; i++) + { + Py_UCS4 ch = PyUnicode_READ(kind, data, i); + if (ch > maxchar) + maxchar = ch; + } + if (kind == PyUnicode_1BYTE_KIND) { + if (ascii->state.ascii == 0) + assert(maxchar >= 128); + else + assert(maxchar < 128); + } + else if (kind == PyUnicode_2BYTE_KIND) + assert(maxchar >= 0x100); + else + assert(maxchar >= 0x10000); + } + if (check_content && !unicode_is_singleton((PyObject*)ascii)) + assert(ascii->hash == -1); return 1; } #endif @@ -428,7 +459,7 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len) /* --- Unicode Object ----------------------------------------------------- */ static PyObject * -fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s)); +fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s)); Py_LOCAL_INLINE(char *) findchar(void *s, int kind, Py_ssize_t size, Py_UCS4 ch, @@ -542,7 +573,7 @@ resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length) _PyUnicode_LENGTH(unicode) = length; PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) { - _PyUnicode_CheckConsistency(unicode); + assert(_PyUnicode_CheckConsistency(unicode, 0)); return 0; } } @@ -562,7 +593,7 @@ resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length) _PyUnicode_WSTR(unicode) = wstr; _PyUnicode_WSTR(unicode)[length] = 0; _PyUnicode_WSTR_LENGTH(unicode) = length; - _PyUnicode_CheckConsistency(unicode); + assert(_PyUnicode_CheckConsistency(unicode, 0)); return 0; } @@ -579,13 +610,7 @@ resize_copy(PyObject *unicode, Py_ssize_t length) return NULL; copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); - if (PyUnicode_CopyCharacters(copy, 0, - unicode, 0, - copy_length) < 0) - { - Py_DECREF(copy); - return NULL; - } + copy_characters(copy, 0, unicode, 0, copy_length); return copy; } else { @@ -875,13 +900,14 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) _PyUnicode_WSTR(unicode) = NULL; } } + assert(_PyUnicode_CheckConsistency(unicode, 0)); return obj; } #if SIZEOF_WCHAR_T == 2 /* Helper 
function to convert a 16-bits wchar_t representation to UCS4, this will decode surrogate pairs, the other conversions are implemented as macros - for efficency. + for efficiency. This function assumes that unicode can hold one more code point than wstr characters for a terminating null character. */ @@ -930,47 +956,55 @@ _PyUnicode_Dirty(PyObject *unicode) return 0; } -Py_ssize_t -PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, - PyObject *from, Py_ssize_t from_start, - Py_ssize_t how_many) +static int +_copy_characters(PyObject *to, Py_ssize_t to_start, + PyObject *from, Py_ssize_t from_start, + Py_ssize_t how_many, int check_maxchar) { unsigned int from_kind, to_kind; void *from_data, *to_data; + int fast; - if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { - PyErr_BadInternalCall(); - return -1; - } + assert(PyUnicode_Check(from)); + assert(PyUnicode_Check(to)); + assert(PyUnicode_IS_READY(from)); + assert(PyUnicode_IS_READY(to)); - if (PyUnicode_READY(from)) - return -1; - if (PyUnicode_READY(to)) - return -1; + assert(PyUnicode_GET_LENGTH(from) >= how_many); + assert(to_start + how_many <= PyUnicode_GET_LENGTH(to)); + assert(0 <= how_many); - how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); - if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { - PyErr_Format(PyExc_SystemError, - "Cannot write %zi characters at %zi " - "in a string of %zi characters", - how_many, to_start, PyUnicode_GET_LENGTH(to)); - return -1; - } if (how_many == 0) return 0; - if (_PyUnicode_Dirty(to)) - return -1; - from_kind = PyUnicode_KIND(from); from_data = PyUnicode_DATA(from); to_kind = PyUnicode_KIND(to); to_data = PyUnicode_DATA(to); - if (from_kind == to_kind - /* deny latin1 => ascii */ - && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from)) +#ifdef Py_DEBUG + if (!check_maxchar + && (from_kind > to_kind + || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))) + { + const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); + Py_UCS4 ch; + Py_ssize_t i; + for (i=0; i < how_many; i++) { + ch = PyUnicode_READ(from_kind, from_data, from_start + i); + assert(ch <= to_maxchar); + } + } +#endif + fast = (from_kind == to_kind); + if (check_maxchar + && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) { + /* deny latin1 => ascii */ + fast = 0; + } + + if (fast) { Py_MEMCPY((char*)to_data + PyUnicode_KIND_SIZE(to_kind, to_start), (char*)from_data @@ -1008,45 +1042,96 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, ); } else { - int invalid_kinds; - /* check if max_char(from substring) <= max_char(to) */ if (from_kind > to_kind /* latin1 => ascii */ - || (PyUnicode_IS_ASCII(to) - && to_kind == PyUnicode_1BYTE_KIND - && !PyUnicode_IS_ASCII(from))) + || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))) { /* slow path to check for character overflow */ const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to); - Py_UCS4 ch, maxchar; + Py_UCS4 ch; Py_ssize_t i; - maxchar = 0; - invalid_kinds = 0; +#ifdef Py_DEBUG for (i=0; i < how_many; i++) { ch = PyUnicode_READ(from_kind, from_data, from_start + i); - if (ch > maxchar) { - maxchar = ch; - if (maxchar > to_maxchar) { - invalid_kinds = 1; - break; - } - } + assert(ch <= to_maxchar); PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); } +#else + if (!check_maxchar) { + for (i=0; i < how_many; i++) { + ch = PyUnicode_READ(from_kind, from_data, from_start + i); + PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); + } + } + else { + for (i=0; i < how_many; i++) { + ch = PyUnicode_READ(from_kind, from_data, 
from_start + i); + if (ch > to_maxchar) + return 1; + PyUnicode_WRITE(to_kind, to_data, to_start + i, ch); + } + } +#endif } - else - invalid_kinds = 1; - if (invalid_kinds) { - PyErr_Format(PyExc_SystemError, - "Cannot copy %s characters " - "into a string of %s characters", - unicode_kind_name(from), - unicode_kind_name(to)); - return -1; + else { + assert(0 && "inconsistent state"); + return 1; } } + return 0; +} + +static void +copy_characters(PyObject *to, Py_ssize_t to_start, + PyObject *from, Py_ssize_t from_start, + Py_ssize_t how_many) +{ + (void)_copy_characters(to, to_start, from, from_start, how_many, 0); +} + +Py_ssize_t +PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start, + PyObject *from, Py_ssize_t from_start, + Py_ssize_t how_many) +{ + int err; + + if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) { + PyErr_BadInternalCall(); + return -1; + } + + if (PyUnicode_READY(from)) + return -1; + if (PyUnicode_READY(to)) + return -1; + + how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many); + if (to_start + how_many > PyUnicode_GET_LENGTH(to)) { + PyErr_Format(PyExc_SystemError, + "Cannot write %zi characters at %zi " + "in a string of %zi characters", + how_many, to_start, PyUnicode_GET_LENGTH(to)); + return -1; + } + + if (how_many == 0) + return 0; + + if (_PyUnicode_Dirty(to)) + return -1; + + err = _copy_characters(to, to_start, from, from_start, how_many, 1); + if (err) { + PyErr_Format(PyExc_SystemError, + "Cannot copy %s characters " + "into a string of %s characters", + unicode_kind_name(from), + unicode_kind_name(to)); + return -1; + } return how_many; } @@ -1062,19 +1147,17 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, const wchar_t *iter; assert(num_surrogates != NULL && maxchar != NULL); - if (num_surrogates == NULL || maxchar == NULL) { - PyErr_SetString(PyExc_SystemError, - "unexpected NULL arguments to " - "PyUnicode_FindMaxCharAndNumSurrogatePairs"); - return -1; - } - *num_surrogates = 0; *maxchar = 0; for (iter = begin; iter < end; ) { - if (*iter > *maxchar) + if (*iter > *maxchar) { *maxchar = *iter; +#if SIZEOF_WCHAR_T != 2 + if (*maxchar >= 0x10000) + return 0; +#endif + } #if SIZEOF_WCHAR_T == 2 if (*iter >= 0xD800 && *iter <= 0xDBFF && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF) @@ -1114,7 +1197,7 @@ unicode_ready(PyObject **p_obj, int replace) assert(p_obj != NULL); unicode = (PyUnicodeObject *)*p_obj; - /* _PyUnicode_Ready() is only intented for old-style API usage where + /* _PyUnicode_Ready() is only intended for old-style API usage where strings were created using _PyObject_New() and where no canonical representation (the str field) has been set yet aka strings which are not yet ready. 
*/ @@ -1255,6 +1338,7 @@ unicode_ready(PyObject **p_obj, int replace) PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0'; } _PyUnicode_STATE(unicode).ready = 1; + assert(_PyUnicode_CheckConsistency(unicode, 1)); return 0; } @@ -1307,6 +1391,23 @@ unicode_dealloc(register PyUnicodeObject *unicode) } } +#ifdef Py_DEBUG +static int +unicode_is_singleton(PyObject *unicode) +{ + PyASCIIObject *ascii = (PyASCIIObject *)unicode; + if (unicode == unicode_empty) + return 1; + if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) + { + Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); + if (ch < 256 && unicode_latin1[ch] == unicode) + return 1; + } + return 0; +} +#endif + static int unicode_resizable(PyObject *unicode) { @@ -1314,15 +1415,9 @@ unicode_resizable(PyObject *unicode) return 0; if (PyUnicode_CHECK_INTERNED(unicode)) return 0; - assert(unicode != unicode_empty); #ifdef Py_DEBUG - if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND - && PyUnicode_GET_LENGTH(unicode) == 1) - { - Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); - if (ch < 256 && unicode_latin1[ch] == unicode) - return 0; - } + /* singleton refcount is greater than 1 */ + assert(!unicode_is_singleton(unicode)); #endif return 1; } @@ -1360,7 +1455,7 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length) *p_unicode = resize_compact(unicode, length); if (*p_unicode == NULL) return -1; - _PyUnicode_CheckConsistency(*p_unicode); + assert(_PyUnicode_CheckConsistency(*p_unicode, 0)); return 0; } return resize_inplace((PyUnicodeObject*)unicode, length); @@ -1393,6 +1488,7 @@ get_latin1_char(unsigned char ch) if (!unicode) return NULL; PyUnicode_1BYTE_DATA(unicode)[0] = ch; + assert(_PyUnicode_CheckConsistency(unicode, 1)); unicode_latin1[ch] = unicode; } Py_INCREF(unicode); @@ -1461,6 +1557,7 @@ PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) assert(0 && "Impossible state"); } + assert(_PyUnicode_CheckConsistency(unicode, 1)); return (PyObject *)unicode; } @@ -1515,21 +1612,58 @@ PyUnicode_FromString(const char *u) } static PyObject* +unicode_fromascii(const unsigned char* s, Py_ssize_t size) +{ + PyObject *res; +#ifdef Py_DEBUG + const unsigned char *p; + const unsigned char *end = s + size; + for (p=s; p < end; p++) { + assert(*p < 128); + } +#endif + res = PyUnicode_New(size, 127); + if (!res) + return NULL; + memcpy(PyUnicode_1BYTE_DATA(res), s, size); + return res; +} + +static Py_UCS4 +kind_maxchar_limit(unsigned int kind) +{ + switch(kind) { + case PyUnicode_1BYTE_KIND: + return 0x80; + case PyUnicode_2BYTE_KIND: + return 0x100; + case PyUnicode_4BYTE_KIND: + return 0x10000; + default: + assert(0 && "invalid kind"); + return 0x10ffff; + } +} + +static PyObject* _PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) { PyObject *res; - unsigned char max = 127; + unsigned char max_char = 127; Py_ssize_t i; + + assert(size >= 0); for (i = 0; i < size; i++) { if (u[i] & 0x80) { - max = 255; + max_char = 255; break; } } - res = PyUnicode_New(size, max); + res = PyUnicode_New(size, max_char); if (!res) return NULL; memcpy(PyUnicode_1BYTE_DATA(res), u, size); + assert(_PyUnicode_CheckConsistency(res, 1)); return res; } @@ -1537,19 +1671,26 @@ static PyObject* _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) { PyObject *res; - Py_UCS2 max = 0; + Py_UCS2 max_char = 0; Py_ssize_t i; - for (i = 0; i < size; i++) - if (u[i] > max) - max = u[i]; - res = PyUnicode_New(size, max); + + assert(size >= 0); + for (i = 0; i < size; i++) { + if (u[i] > max_char) { + max_char = u[i]; + if (max_char 
>= 256) + break; + } + } + res = PyUnicode_New(size, max_char); if (!res) return NULL; - if (max >= 256) + if (max_char >= 256) memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size); else for (i = 0; i < size; i++) PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i]; + assert(_PyUnicode_CheckConsistency(res, 1)); return res; } @@ -1557,15 +1698,21 @@ static PyObject* _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) { PyObject *res; - Py_UCS4 max = 0; + Py_UCS4 max_char = 0; Py_ssize_t i; - for (i = 0; i < size; i++) - if (u[i] > max) - max = u[i]; - res = PyUnicode_New(size, max); + + assert(size >= 0); + for (i = 0; i < size; i++) { + if (u[i] > max_char) { + max_char = u[i]; + if (max_char >= 0x10000) + break; + } + } + res = PyUnicode_New(size, max_char); if (!res) return NULL; - if (max >= 0x10000) + if (max_char >= 0x10000) memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size); else { int kind = PyUnicode_KIND(res); @@ -1573,6 +1720,7 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) for (i = 0; i < size; i++) PyUnicode_WRITE(kind, data, i, u[i]); } + assert(_PyUnicode_CheckConsistency(res, 1)); return res; } @@ -1586,9 +1734,68 @@ PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size) return _PyUnicode_FromUCS2(buffer, size); case PyUnicode_4BYTE_KIND: return _PyUnicode_FromUCS4(buffer, size); + default: + assert(0 && "invalid kind"); + PyErr_SetString(PyExc_SystemError, "invalid kind"); + return NULL; } - PyErr_SetString(PyExc_SystemError, "invalid kind"); - return NULL; +} + +/* Ensure that a string uses the most efficient storage, if it is not the + case: create a new string with of the right kind. Write NULL into *p_unicode + on error. */ +void +unicode_adjust_maxchar(PyObject **p_unicode) +{ + PyObject *unicode, *copy; + Py_UCS4 max_char; + Py_ssize_t i, len; + unsigned int kind; + + assert(p_unicode != NULL); + unicode = *p_unicode; + assert(PyUnicode_IS_READY(unicode)); + if (PyUnicode_IS_ASCII(unicode)) + return; + + len = PyUnicode_GET_LENGTH(unicode); + kind = PyUnicode_KIND(unicode); + if (kind == PyUnicode_1BYTE_KIND) { + const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode); + for (i = 0; i < len; i++) { + if (u[i] & 0x80) + return; + } + max_char = 127; + } + else if (kind == PyUnicode_2BYTE_KIND) { + const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode); + max_char = 0; + for (i = 0; i < len; i++) { + if (u[i] > max_char) { + max_char = u[i]; + if (max_char >= 256) + return; + } + } + } + else { + assert(kind == PyUnicode_4BYTE_KIND); + const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode); + max_char = 0; + for (i = 0; i < len; i++) { + if (u[i] > max_char) { + max_char = u[i]; + if (max_char >= 0x10000) + return; + } + } + } + assert(max_char > PyUnicode_MAX_CHAR_VALUE(unicode)); + copy = PyUnicode_New(len, max_char); + copy_characters(copy, 0, unicode, 0, len); + Py_DECREF(unicode); + *p_unicode = copy; } PyObject* @@ -1627,6 +1834,7 @@ PyUnicode_Copy(PyObject *unicode) assert(0); break; } + assert(_PyUnicode_CheckConsistency(copy, 1)); return copy; } @@ -1895,7 +2103,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) int precision = 0; int zeropad; const char* f; - PyUnicodeObject *string; + PyObject *string; /* used by sprintf */ char fmt[61]; /* should be enough for %0width.precisionlld */ Py_UCS4 maxchar = 127; /* result is ASCII by default */ @@ -1912,8 +2120,8 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/ * PyUnicode_DecodeUTF8() for these objects once 
during step 3 and put the * result in an array) - * also esimate a upper bound for all the number formats in the string, - * numbers will be formated in step 3 and be keept in a '\0'-separated + * also estimate a upper bound for all the number formats in the string, + * numbers will be formatted in step 3 and be kept in a '\0'-separated * buffer before putting everything together. */ for (f = format; *f; f++) { if (*f == '%') { @@ -2120,6 +2328,10 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace"); if (!str_obj) goto fail; + if (PyUnicode_READY(str_obj)) { + Py_DECREF(str_obj); + goto fail; + } argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj); maxchar = Py_MAX(maxchar, argmaxchar); n += PyUnicode_GET_LENGTH(str_obj); @@ -2190,7 +2402,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) /* Since we've analyzed how much space we need, we don't have to resize the string. There can be no errors beyond this point. */ - string = (PyUnicodeObject *)PyUnicode_New(n, maxchar); + string = PyUnicode_New(n, maxchar); if (!string) goto fail; kind = PyUnicode_KIND(string); @@ -2241,10 +2453,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) (void) va_arg(vargs, char *); size = PyUnicode_GET_LENGTH(*callresult); assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); - if (PyUnicode_CopyCharacters((PyObject*)string, i, - *callresult, 0, - size) < 0) - goto fail; + copy_characters(string, i, *callresult, 0, size); i += size; /* We're done with the unicode()/repr() => forget it */ Py_DECREF(*callresult); @@ -2258,10 +2467,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) Py_ssize_t size; assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); size = PyUnicode_GET_LENGTH(obj); - if (PyUnicode_CopyCharacters((PyObject*)string, i, - obj, 0, - size) < 0) - goto fail; + copy_characters(string, i, obj, 0, size); i += size; break; } @@ -2273,19 +2479,13 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) if (obj) { size = PyUnicode_GET_LENGTH(obj); assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string)); - if (PyUnicode_CopyCharacters((PyObject*)string, i, - obj, 0, - size) < 0) - goto fail; + copy_characters(string, i, obj, 0, size); i += size; } else { size = PyUnicode_GET_LENGTH(*callresult); assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); - if (PyUnicode_CopyCharacters((PyObject*)string, i, - *callresult, - 0, size) < 0) - goto fail; + copy_characters(string, i, *callresult, 0, size); i += size; Py_DECREF(*callresult); } @@ -2296,14 +2496,12 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) case 'R': case 'A': { + Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult); /* unused, since we already have the result */ (void) va_arg(vargs, PyObject *); assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string)); - if (PyUnicode_CopyCharacters((PyObject*)string, i, - *callresult, 0, - PyUnicode_GET_LENGTH(*callresult)) < 0) - goto fail; - i += PyUnicode_GET_LENGTH(*callresult); + copy_characters(string, i, *callresult, 0, size); + i += size; /* We're done with the unicode()/repr() => forget it */ Py_DECREF(*callresult); /* switch to next unicode()/repr() result */ @@ -2332,6 +2530,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs) PyObject_Free(callresults); if (numberresults) PyObject_Free(numberresults); + assert(_PyUnicode_CheckConsistency(string, 1)); return (PyObject *)string; fail: if (callresults) { @@ -2462,6 +2661,7 @@ PyUnicode_FromOrdinal(int ordinal) if (v == 
NULL) return NULL; PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal); + assert(_PyUnicode_CheckConsistency(v, 1)); return v; } @@ -2625,10 +2825,13 @@ PyUnicode_Decode(const char *s, goto onError; } Py_DECREF(buffer); +#ifndef DONT_MAKE_RESULT_READY if (_PyUnicode_READY_REPLACE(&unicode)) { Py_DECREF(unicode); return NULL; } +#endif + assert(_PyUnicode_CheckConsistency(unicode, 1)); return unicode; onError: @@ -2655,6 +2858,7 @@ PyUnicode_AsDecodedObject(PyObject *unicode, v = PyCodec_Decode(unicode, encoding, errors); if (v == NULL) goto onError; + assert(_PyUnicode_CheckConsistency(v, 1)); return v; onError: @@ -2687,6 +2891,7 @@ PyUnicode_AsDecodedUnicode(PyObject *unicode, Py_DECREF(v); goto onError; } + assert(_PyUnicode_CheckConsistency(v, 1)); return v; onError: @@ -3674,10 +3879,13 @@ utf7Error: Py_XDECREF(errorHandler); Py_XDECREF(exc); +#ifndef DONT_MAKE_RESULT_READY if (_PyUnicode_READY_REPLACE(&unicode)) { Py_DECREF(unicode); return NULL; } +#endif + assert(_PyUnicode_CheckConsistency(unicode, 1)); return (PyObject *)unicode; onError: @@ -3921,7 +4129,7 @@ utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size, err = 1; } /* Instead of number of overall bytes for this code point, - n containts the number of following bytes: */ + n contains the number of following bytes: */ --n; /* Check if the follow up chars are all valid continuation bytes */ if (n >= 1) { @@ -4244,10 +4452,13 @@ PyUnicode_DecodeUTF8Stateful(const char *s, Py_XDECREF(errorHandler); Py_XDECREF(exc); +#ifndef DONT_MAKE_RESULT_READY if (_PyUnicode_READY_REPLACE(&unicode)) { Py_DECREF(unicode); return NULL; } +#endif + assert(_PyUnicode_CheckConsistency(unicode, 1)); return (PyObject *)unicode; onError: @@ -4747,10 +4958,13 @@ PyUnicode_DecodeUTF32Stateful(const char *s, Py_XDECREF(errorHandler); Py_XDECREF(exc); +#ifndef DONT_MAKE_RESULT_READY if (_PyUnicode_READY_REPLACE(&unicode)) { Py_DECREF(unicode); return NULL; } +#endif + assert(_PyUnicode_CheckConsistency(unicode, 1)); return (PyObject *)unicode; onError: @@ -5145,10 +5359,13 @@ PyUnicode_DecodeUTF16Stateful(const char *s, Py_XDECREF(errorHandler); Py_XDECREF(exc); +#ifndef DONT_MAKE_RESULT_READY if (_PyUnicode_READY_REPLACE(&unicode)) { Py_DECREF(unicode); return NULL; } +#endif + assert(_PyUnicode_CheckConsistency(unicode, 1)); return (PyObject *)unicode; onError: @@ -5604,10 +5821,13 @@ PyUnicode_DecodeUnicodeEscape(const char *s, } Py_XDECREF(errorHandler); Py_XDECREF(exc); +#ifndef DONT_MAKE_RESULT_READY if (_PyUnicode_READY_REPLACE(&v)) { Py_DECREF(v); return NULL; } +#endif + assert(_PyUnicode_CheckConsistency(v, 1)); return (PyObject *)v; ucnhashError: @@ -5905,10 +6125,13 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, goto onError; Py_XDECREF(errorHandler); Py_XDECREF(exc); +#ifndef DONT_MAKE_RESULT_READY if (_PyUnicode_READY_REPLACE(&v)) { Py_DECREF(v); return NULL; } +#endif + assert(_PyUnicode_CheckConsistency(v, 1)); return (PyObject *)v; onError: @@ -6093,10 +6316,13 @@ _PyUnicode_DecodeUnicodeInternal(const char *s, goto onError; Py_XDECREF(errorHandler); Py_XDECREF(exc); +#ifndef DONT_MAKE_RESULT_READY if (_PyUnicode_READY_REPLACE(&v)) { Py_DECREF(v); return NULL; } +#endif + assert(_PyUnicode_CheckConsistency(v, 1)); return (PyObject *)v; onError: @@ -6457,72 +6683,91 @@ PyUnicode_DecodeASCII(const char *s, { const char *starts = s; PyUnicodeObject *v; - Py_UNICODE *p; + Py_UNICODE *u; Py_ssize_t startinpos; Py_ssize_t endinpos; Py_ssize_t outpos; const char *e; - unsigned char* d; + int has_error; + 
const unsigned char *p = (const unsigned char *)s; + const unsigned char *end = p + size; + const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK); PyObject *errorHandler = NULL; PyObject *exc = NULL; - Py_ssize_t i; /* ASCII is equivalent to the first 128 ordinals in Unicode. */ - if (size == 1 && *(unsigned char*)s < 128) - return PyUnicode_FromOrdinal(*(unsigned char*)s); - - /* Fast path. Assume the input actually *is* ASCII, and allocate - a single-block Unicode object with that assumption. If there is - an error, drop the object and start over. */ - v = (PyUnicodeObject*)PyUnicode_New(size, 127); - if (v == NULL) - goto onError; - d = PyUnicode_1BYTE_DATA(v); - for (i = 0; i < size; i++) { - unsigned char ch = ((unsigned char*)s)[i]; - if (ch < 128) - d[i] = ch; - else + if (size == 1 && (unsigned char)s[0] < 128) + return get_latin1_char((unsigned char)s[0]); + + has_error = 0; + while (p < end && !has_error) { + /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for + an explanation. */ + if (!((size_t) p & LONG_PTR_MASK)) { + /* Help register allocation */ + register const unsigned char *_p = p; + while (_p < aligned_end) { + unsigned long value = *(unsigned long *) _p; + if (value & ASCII_CHAR_MASK) { + has_error = 1; + break; + } + _p += SIZEOF_LONG; + } + if (_p == end) + break; + if (has_error) + break; + p = _p; + } + if (*p & 0x80) { + has_error = 1; break; + } + else { + ++p; + } } - if (i == size) - return (PyObject*)v; - Py_DECREF(v); /* start over */ + if (!has_error) + return unicode_fromascii((const unsigned char *)s, size); v = _PyUnicode_New(size); if (v == NULL) goto onError; if (size == 0) return (PyObject *)v; - p = PyUnicode_AS_UNICODE(v); + u = PyUnicode_AS_UNICODE(v); e = s + size; while (s < e) { register unsigned char c = (unsigned char)*s; if (c < 128) { - *p++ = c; + *u++ = c; ++s; } else { startinpos = s-starts; endinpos = startinpos + 1; - outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); + outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v); if (unicode_decode_call_errorhandler( errors, &errorHandler, "ascii", "ordinal not in range(128)", &starts, &e, &startinpos, &endinpos, &exc, &s, - &v, &outpos, &p)) + &v, &outpos, &u)) goto onError; } } - if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) - if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) + if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) + if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0) goto onError; Py_XDECREF(errorHandler); Py_XDECREF(exc); +#ifndef DONT_MAKE_RESULT_READY if (_PyUnicode_READY_REPLACE(&v)) { Py_DECREF(v); return NULL; } +#endif + assert(_PyUnicode_CheckConsistency(v, 1)); return (PyObject *)v; onError: @@ -6713,10 +6958,13 @@ PyUnicode_DecodeMBCSStateful(const char *s, goto retry; } #endif +#ifndef DONT_MAKE_RESULT_READY if (_PyUnicode_READY_REPLACE(&v)) { Py_DECREF(v); return NULL; } +#endif + assert(_PyUnicode_CheckConsistency(v, 1)); return (PyObject *)v; } @@ -7012,10 +7260,13 @@ PyUnicode_DecodeCharmap(const char *s, goto onError; Py_XDECREF(errorHandler); Py_XDECREF(exc); +#ifndef DONT_MAKE_RESULT_READY if (_PyUnicode_READY_REPLACE(&v)) { Py_DECREF(v); return NULL; } +#endif + assert(_PyUnicode_CheckConsistency(v, 1)); return (PyObject *)v; onError: @@ -7982,7 +8233,7 @@ PyUnicode_Translate(PyObject *str, } static Py_UCS4 -fix_decimal_and_space_to_ascii(PyUnicodeObject *self) +fix_decimal_and_space_to_ascii(PyObject *self) { /* No need to call PyUnicode_READY(self) because 
this function is only called as a callback from fixup() which does it already. */ @@ -8032,7 +8283,7 @@ _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) Py_INCREF(unicode); return unicode; } - return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii); + return fixup(unicode, fix_decimal_and_space_to_ascii); } PyObject * @@ -8057,10 +8308,13 @@ PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, p[i] = '0' + decimal; } } - if (PyUnicode_READY((PyUnicodeObject*)result) == -1) { +#ifndef DONT_MAKE_RESULT_READY + if (_PyUnicode_READY_REPLACE(&result)) { Py_DECREF(result); return NULL; } +#endif + assert(_PyUnicode_CheckConsistency(result, 1)); return result; } /* --- Decimal Encoder ---------------------------------------------------- */ @@ -8203,6 +8457,15 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s, /* --- Helpers ------------------------------------------------------------ */ +#include "stringlib/asciilib.h" +#include "stringlib/fastsearch.h" +#include "stringlib/partition.h" +#include "stringlib/split.h" +#include "stringlib/count.h" +#include "stringlib/find.h" +#include "stringlib/localeutil.h" +#include "stringlib/undef.h" + #include "stringlib/ucs1lib.h" #include "stringlib/fastsearch.h" #include "stringlib/partition.h" @@ -8231,7 +8494,10 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s, #include "stringlib/undef.h" static Py_ssize_t -any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t, +any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ascii)(const Py_UCS1*, Py_ssize_t, + const Py_UCS1*, Py_ssize_t, + Py_ssize_t, Py_ssize_t), + Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t, const Py_UCS1*, Py_ssize_t, Py_ssize_t, Py_ssize_t), Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t, @@ -8268,7 +8534,10 @@ any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t, switch(kind) { case PyUnicode_1BYTE_KIND: - result = ucs1(buf1, len1, buf2, len2, start, end); + if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) + result = ascii(buf1, len1, buf2, len2, start, end); + else + result = ucs1(buf1, len1, buf2, len2, start, end); break; case PyUnicode_2BYTE_KIND: result = ucs2(buf1, len1, buf2, len2, start, end); @@ -8289,7 +8558,7 @@ any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t, } Py_ssize_t -_PyUnicode_InsertThousandsGrouping(int kind, void *data, +_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data, Py_ssize_t n_buffer, void *digits, Py_ssize_t n_digits, Py_ssize_t min_width, @@ -8298,9 +8567,14 @@ _PyUnicode_InsertThousandsGrouping(int kind, void *data, { switch(kind) { case PyUnicode_1BYTE_KIND: - return _PyUnicode_ucs1_InsertThousandsGrouping( - (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, - min_width, grouping, thousands_sep); + if (unicode != NULL && PyUnicode_IS_ASCII(unicode)) + return _PyUnicode_ascii_InsertThousandsGrouping( + (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, + min_width, grouping, thousands_sep); + else + return _PyUnicode_ucs1_InsertThousandsGrouping( + (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits, + min_width, grouping, thousands_sep); case PyUnicode_2BYTE_KIND: return _PyUnicode_ucs2_InsertThousandsGrouping( (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits, @@ -8377,10 +8651,16 @@ PyUnicode_Count(PyObject *str, ADJUST_INDICES(start, end, len1); switch(kind) { case PyUnicode_1BYTE_KIND: - result = ucs1lib_count( - ((Py_UCS1*)buf1) + start, end - start, - buf2, len2, PY_SSIZE_T_MAX - ); + if 
(PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj)) + result = asciilib_count( + ((Py_UCS1*)buf1) + start, end - start, + buf2, len2, PY_SSIZE_T_MAX + ); + else + result = ucs1lib_count( + ((Py_UCS1*)buf1) + start, end - start, + buf2, len2, PY_SSIZE_T_MAX + ); break; case PyUnicode_2BYTE_KIND: result = ucs2lib_count( @@ -8437,12 +8717,14 @@ PyUnicode_Find(PyObject *str, if (direction > 0) result = any_find_slice( - ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice, + asciilib_find_slice, ucs1lib_find_slice, + ucs2lib_find_slice, ucs4lib_find_slice, str, sub, start, end ); else result = any_find_slice( - ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice, + asciilib_find_slice, ucs1lib_rfind_slice, + ucs2lib_rfind_slice, ucs4lib_rfind_slice, str, sub, start, end ); @@ -8575,8 +8857,8 @@ PyUnicode_Tailmatch(PyObject *str, reference to the modified object */ static PyObject * -fixup(PyUnicodeObject *self, - Py_UCS4 (*fixfct)(PyUnicodeObject *s)) +fixup(PyObject *self, + Py_UCS4 (*fixfct)(PyObject *s)) { PyObject *u; Py_UCS4 maxchar_old, maxchar_new = 0; @@ -8596,7 +8878,7 @@ fixup(PyUnicodeObject *self, if the kind of the resulting unicode object does not change, everything is fine. Otherwise we need to change the string kind and re-run the fix function. */ - maxchar_new = fixfct((PyUnicodeObject*)u); + maxchar_new = fixfct(u); if (maxchar_new == 0) /* do nothing, keep maxchar_new at 0 which means no changes. */; else if (maxchar_new <= 127) @@ -8631,33 +8913,22 @@ fixup(PyUnicodeObject *self, /* If the maxchar increased so that the kind changed, not all characters are representable anymore and we need to fix the string again. This only happens in very few cases. */ - if (PyUnicode_CopyCharacters(v, 0, - (PyObject*)self, 0, - PyUnicode_GET_LENGTH(self)) < 0) - { - Py_DECREF(u); - return NULL; - } - maxchar_old = fixfct((PyUnicodeObject*)v); + copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self)); + maxchar_old = fixfct(v); assert(maxchar_old > 0 && maxchar_old <= maxchar_new); } else { - if (PyUnicode_CopyCharacters(v, 0, - u, 0, - PyUnicode_GET_LENGTH(self)) < 0) - { - Py_DECREF(u); - return NULL; - } + copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self)); } Py_DECREF(u); + assert(_PyUnicode_CheckConsistency(v, 1)); return v; } } static Py_UCS4 -fixupper(PyUnicodeObject *self) +fixupper(PyObject *self) { /* No need to call PyUnicode_READY(self) because this function is only called as a callback from fixup() which does it already. */ @@ -8688,7 +8959,7 @@ fixupper(PyUnicodeObject *self) } static Py_UCS4 -fixlower(PyUnicodeObject *self) +fixlower(PyObject *self) { /* No need to call PyUnicode_READY(self) because fixup() which does it. */ const Py_ssize_t len = PyUnicode_GET_LENGTH(self); @@ -8718,7 +8989,7 @@ fixlower(PyUnicodeObject *self) } static Py_UCS4 -fixswapcase(PyUnicodeObject *self) +fixswapcase(PyObject *self) { /* No need to call PyUnicode_READY(self) because fixup() which does it. */ const Py_ssize_t len = PyUnicode_GET_LENGTH(self); @@ -8754,7 +9025,7 @@ fixswapcase(PyUnicodeObject *self) } static Py_UCS4 -fixcapitalize(PyUnicodeObject *self) +fixcapitalize(PyObject *self) { /* No need to call PyUnicode_READY(self) because fixup() which does it. */ const Py_ssize_t len = PyUnicode_GET_LENGTH(self); @@ -8795,7 +9066,7 @@ fixcapitalize(PyUnicodeObject *self) } static Py_UCS4 -fixtitle(PyUnicodeObject *self) +fixtitle(PyObject *self) { /* No need to call PyUnicode_READY(self) because fixup() which does it. 
*/ const Py_ssize_t len = PyUnicode_GET_LENGTH(self); @@ -8851,7 +9122,7 @@ PyUnicode_Join(PyObject *separator, PyObject *seq) PyObject **items; PyObject *item; Py_ssize_t sz, i, res_offset; - Py_UCS4 maxchar = 0; + Py_UCS4 maxchar; Py_UCS4 item_maxchar; fseq = PySequence_Fast(seq, ""); @@ -8866,18 +9137,22 @@ PyUnicode_Join(PyObject *separator, PyObject *seq) seqlen = PySequence_Fast_GET_SIZE(fseq); /* If empty sequence, return u"". */ if (seqlen == 0) { - res = PyUnicode_New(0, 0); - goto Done; + Py_DECREF(fseq); + Py_INCREF(unicode_empty); + res = unicode_empty; + return res; } - items = PySequence_Fast_ITEMS(fseq); + /* If singleton sequence with an exact Unicode, return that. */ + items = PySequence_Fast_ITEMS(fseq); if (seqlen == 1) { - item = items[0]; - if (PyUnicode_CheckExact(item)) { - Py_INCREF(item); - res = item; - goto Done; + if (PyUnicode_CheckExact(items[0])) { + res = items[0]; + Py_INCREF(res); + Py_DECREF(fseq); + return res; } + sep = NULL; } else { /* Set up sep and seplen */ @@ -8886,6 +9161,7 @@ PyUnicode_Join(PyObject *separator, PyObject *seq) sep = PyUnicode_FromOrdinal(' '); if (!sep) goto onError; + maxchar = 32; } else { if (!PyUnicode_Check(separator)) { @@ -8900,7 +9176,7 @@ PyUnicode_Join(PyObject *separator, PyObject *seq) sep = separator; seplen = PyUnicode_GET_LENGTH(separator); maxchar = PyUnicode_MAX_CHAR_VALUE(separator); - /* inc refcount to keep this code path symetric with the + /* inc refcount to keep this code path symmetric with the above case of a blank separator */ Py_INCREF(sep); } @@ -8943,38 +9219,24 @@ PyUnicode_Join(PyObject *separator, PyObject *seq) /* Catenate everything. */ for (i = 0, res_offset = 0; i < seqlen; ++i) { - Py_ssize_t itemlen, copied; + Py_ssize_t itemlen; item = items[i]; /* Copy item, and maybe the separator. 
*/ if (i && seplen != 0) { - copied = PyUnicode_CopyCharacters(res, res_offset, - sep, 0, seplen); - if (copied < 0) - goto onError; -#ifdef Py_DEBUG - res_offset += copied; -#else + copy_characters(res, res_offset, sep, 0, seplen); res_offset += seplen; -#endif } itemlen = PyUnicode_GET_LENGTH(item); if (itemlen != 0) { - copied = PyUnicode_CopyCharacters(res, res_offset, - item, 0, itemlen); - if (copied < 0) - goto onError; -#ifdef Py_DEBUG - res_offset += copied; -#else + copy_characters(res, res_offset, item, 0, itemlen); res_offset += itemlen; -#endif } } assert(res_offset == PyUnicode_GET_LENGTH(res)); - Done: Py_DECREF(fseq); Py_XDECREF(sep); + assert(_PyUnicode_CheckConsistency(res, 1)); return res; onError: @@ -9007,8 +9269,8 @@ PyUnicode_Join(PyObject *separator, PyObject *seq) } \ } while (0) -static PyUnicodeObject * -pad(PyUnicodeObject *self, +static PyObject * +pad(PyObject *self, Py_ssize_t left, Py_ssize_t right, Py_UCS4 fill) @@ -9046,15 +9308,9 @@ pad(PyUnicodeObject *self, FILL(kind, data, fill, 0, left); if (right) FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right); - if (PyUnicode_CopyCharacters(u, left, - (PyObject*)self, 0, - _PyUnicode_LENGTH(self)) < 0) - { - Py_DECREF(u); - return NULL; - } - - return (PyUnicodeObject*)u; + copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self)); + assert(_PyUnicode_CheckConsistency(u, 1)); + return u; } #undef FILL @@ -9069,9 +9325,14 @@ PyUnicode_Splitlines(PyObject *string, int keepends) switch(PyUnicode_KIND(string)) { case PyUnicode_1BYTE_KIND: - list = ucs1lib_splitlines( - (PyObject*) string, PyUnicode_1BYTE_DATA(string), - PyUnicode_GET_LENGTH(string), keepends); + if (PyUnicode_IS_ASCII(string)) + list = asciilib_splitlines( + (PyObject*) string, PyUnicode_1BYTE_DATA(string), + PyUnicode_GET_LENGTH(string), keepends); + else + list = ucs1lib_splitlines( + (PyObject*) string, PyUnicode_1BYTE_DATA(string), + PyUnicode_GET_LENGTH(string), keepends); break; case PyUnicode_2BYTE_KIND: list = ucs2lib_splitlines( @@ -9092,8 +9353,8 @@ PyUnicode_Splitlines(PyObject *string, int keepends) } static PyObject * -split(PyUnicodeObject *self, - PyUnicodeObject *substring, +split(PyObject *self, + PyObject *substring, Py_ssize_t maxcount) { int kind1, kind2, kind; @@ -9110,10 +9371,16 @@ split(PyUnicodeObject *self, if (substring == NULL) switch(PyUnicode_KIND(self)) { case PyUnicode_1BYTE_KIND: - return ucs1lib_split_whitespace( - (PyObject*) self, PyUnicode_1BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount - ); + if (PyUnicode_IS_ASCII(self)) + return asciilib_split_whitespace( + (PyObject*) self, PyUnicode_1BYTE_DATA(self), + PyUnicode_GET_LENGTH(self), maxcount + ); + else + return ucs1lib_split_whitespace( + (PyObject*) self, PyUnicode_1BYTE_DATA(self), + PyUnicode_GET_LENGTH(self), maxcount + ); case PyUnicode_2BYTE_KIND: return ucs2lib_split_whitespace( (PyObject*) self, PyUnicode_2BYTE_DATA(self), @@ -9152,8 +9419,12 @@ split(PyUnicodeObject *self, switch(kind) { case PyUnicode_1BYTE_KIND: - out = ucs1lib_split( - (PyObject*) self, buf1, len1, buf2, len2, maxcount); + if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) + out = asciilib_split( + (PyObject*) self, buf1, len1, buf2, len2, maxcount); + else + out = ucs1lib_split( + (PyObject*) self, buf1, len1, buf2, len2, maxcount); break; case PyUnicode_2BYTE_KIND: out = ucs2lib_split( @@ -9174,8 +9445,8 @@ split(PyUnicodeObject *self, } static PyObject * -rsplit(PyUnicodeObject *self, - PyUnicodeObject *substring, +rsplit(PyObject *self, + 
PyObject *substring, Py_ssize_t maxcount) { int kind1, kind2, kind; @@ -9192,10 +9463,16 @@ rsplit(PyUnicodeObject *self, if (substring == NULL) switch(PyUnicode_KIND(self)) { case PyUnicode_1BYTE_KIND: - return ucs1lib_rsplit_whitespace( - (PyObject*) self, PyUnicode_1BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount - ); + if (PyUnicode_IS_ASCII(self)) + return asciilib_rsplit_whitespace( + (PyObject*) self, PyUnicode_1BYTE_DATA(self), + PyUnicode_GET_LENGTH(self), maxcount + ); + else + return ucs1lib_rsplit_whitespace( + (PyObject*) self, PyUnicode_1BYTE_DATA(self), + PyUnicode_GET_LENGTH(self), maxcount + ); case PyUnicode_2BYTE_KIND: return ucs2lib_rsplit_whitespace( (PyObject*) self, PyUnicode_2BYTE_DATA(self), @@ -9234,8 +9511,12 @@ rsplit(PyUnicodeObject *self, switch(kind) { case PyUnicode_1BYTE_KIND: - out = ucs1lib_rsplit( - (PyObject*) self, buf1, len1, buf2, len2, maxcount); + if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) + out = asciilib_rsplit( + (PyObject*) self, buf1, len1, buf2, len2, maxcount); + else + out = ucs1lib_rsplit( + (PyObject*) self, buf1, len1, buf2, len2, maxcount); break; case PyUnicode_2BYTE_KIND: out = ucs2lib_rsplit( @@ -9256,12 +9537,15 @@ rsplit(PyUnicodeObject *self, } static Py_ssize_t -anylib_find(int kind, void *buf1, Py_ssize_t len1, - void *buf2, Py_ssize_t len2, Py_ssize_t offset) +anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1, + PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset) { switch(kind) { case PyUnicode_1BYTE_KIND: - return ucs1lib_find(buf1, len1, buf2, len2, offset); + if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2)) + return asciilib_find(buf1, len1, buf2, len2, offset); + else + return ucs1lib_find(buf1, len1, buf2, len2, offset); case PyUnicode_2BYTE_KIND: return ucs2lib_find(buf1, len1, buf2, len2, offset); case PyUnicode_4BYTE_KIND: @@ -9272,12 +9556,15 @@ anylib_find(int kind, void *buf1, Py_ssize_t len1, } static Py_ssize_t -anylib_count(int kind, void* sbuf, Py_ssize_t slen, - void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) +anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen, + PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount) { switch(kind) { case PyUnicode_1BYTE_KIND: - return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); + if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1)) + return asciilib_count(sbuf, slen, buf1, len1, maxcount); + else + return ucs1lib_count(sbuf, slen, buf1, len1, maxcount); case PyUnicode_2BYTE_KIND: return ucs2lib_count(sbuf, slen, buf1, len1, maxcount); case PyUnicode_4BYTE_KIND: @@ -9329,20 +9616,16 @@ replace(PyObject *self, PyObject *str1, maxchar = PyUnicode_MAX_CHAR_VALUE(self); /* Replacing u1 with u2 may cause a maxchar reduction in the result string. 
*/ - mayshrink = maxchar > 127; if (u2 > maxchar) { maxchar = u2; mayshrink = 0; } + else + mayshrink = maxchar > 127; u = PyUnicode_New(slen, maxchar); if (!u) goto error; - if (PyUnicode_CopyCharacters(u, 0, - (PyObject*)self, 0, slen) < 0) - { - Py_DECREF(u); - return NULL; - } + copy_characters(u, 0, self, 0, slen); rkind = PyUnicode_KIND(u); for (i = 0; i < PyUnicode_GET_LENGTH(u); i++) if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) { @@ -9351,21 +9634,23 @@ replace(PyObject *self, PyObject *str1, PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2); } if (mayshrink) { - PyObject *tmp = u; - u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp), - PyUnicode_GET_LENGTH(tmp)); - Py_DECREF(tmp); + unicode_adjust_maxchar(&u); + if (u == NULL) + goto error; } } else { int rkind = skind; char *res; + PyObject *rstr; + Py_UCS4 maxchar; + if (kind1 < rkind) { /* widen substring */ buf1 = _PyUnicode_AsKind(str1, rkind); if (!buf1) goto error; release1 = 1; } - i = anylib_find(rkind, sbuf, slen, buf1, len1, 0); + i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0); if (i < 0) goto nothing; if (rkind > kind2) { @@ -9385,11 +9670,13 @@ replace(PyObject *self, PyObject *str1, if (!buf1) goto error; release1 = 1; } - res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen)); - if (!res) { - PyErr_NoMemory(); + maxchar = PyUnicode_MAX_CHAR_VALUE(self); + maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(str2)); + rstr = PyUnicode_New(slen, maxchar); + if (!rstr) goto error; - } + res = PyUnicode_DATA(rstr); + memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen)); /* change everything in-place, starting with this one */ memcpy(res + PyUnicode_KIND_SIZE(rkind, i), @@ -9398,9 +9685,9 @@ replace(PyObject *self, PyObject *str1, i += len1; while ( --maxcount > 0) { - i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i), - slen-i, - buf1, len1, i); + i = anylib_find(rkind, self, + sbuf+PyUnicode_KIND_SIZE(rkind, i), slen-i, + str1, buf1, len1, i); if (i == -1) break; memcpy(res + PyUnicode_KIND_SIZE(rkind, i), @@ -9409,23 +9696,26 @@ replace(PyObject *self, PyObject *str1, i += len1; } - u = PyUnicode_FromKindAndData(rkind, res, slen); - PyMem_Free(res); - if (!u) goto error; + u = rstr; + unicode_adjust_maxchar(&u); + if (!u) + goto error; } } else { Py_ssize_t n, i, j, ires; Py_ssize_t product, new_size; int rkind = skind; + PyObject *rstr; char *res; + Py_UCS4 maxchar; if (kind1 < rkind) { buf1 = _PyUnicode_AsKind(str1, rkind); if (!buf1) goto error; release1 = 1; } - n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount); + n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount); if (n == 0) goto nothing; if (kind2 < rkind) { @@ -9457,16 +9747,19 @@ replace(PyObject *self, PyObject *str1, "replace string is too long"); goto error; } - res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size)); - if (!res) + maxchar = PyUnicode_MAX_CHAR_VALUE(self); + maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(str2)); + rstr = PyUnicode_New(new_size, maxchar); + if (!rstr) goto error; + res = PyUnicode_DATA(rstr); ires = i = 0; if (len1 > 0) { while (n-- > 0) { /* look for next match */ - j = anylib_find(rkind, - sbuf + PyUnicode_KIND_SIZE(rkind, i), - slen-i, buf1, len1, i); + j = anylib_find(rkind, self, + sbuf + PyUnicode_KIND_SIZE(rkind, i), slen-i, + str1, buf1, len1, i); if (j == -1) break; else if (j > i) { @@ -9509,8 +9802,10 @@ replace(PyObject *self, PyObject *str1, sbuf + PyUnicode_KIND_SIZE(rkind, i), PyUnicode_KIND_SIZE(rkind, slen-i)); } - u = 
PyUnicode_FromKindAndData(rkind, res, new_size); - PyMem_Free(res); + u = rstr; + unicode_adjust_maxchar(&u); + if (u == NULL) + goto error; } if (srelease) PyMem_FREE(sbuf); @@ -9518,6 +9813,7 @@ replace(PyObject *self, PyObject *str1, PyMem_FREE(buf1); if (release2) PyMem_FREE(buf2); + assert(_PyUnicode_CheckConsistency(u, 1)); return u; nothing: @@ -9552,7 +9848,7 @@ Return a titlecased version of S, i.e. words start with title case\n\ characters, all remaining cased characters have lower case."); static PyObject* -unicode_title(PyUnicodeObject *self) +unicode_title(PyObject *self) { return fixup(self, fixtitle); } @@ -9564,7 +9860,7 @@ Return a capitalized version of S, i.e. make the first character\n\ have upper case and the rest lower case."); static PyObject* -unicode_capitalize(PyUnicodeObject *self) +unicode_capitalize(PyObject *self) { return fixup(self, fixcapitalize); } @@ -9639,7 +9935,7 @@ Return S centered in a string of length width. Padding is\n\ done using the specified fill character (default is a space)"); static PyObject * -unicode_center(PyUnicodeObject *self, PyObject *args) +unicode_center(PyObject *self, PyObject *args) { Py_ssize_t marg, left; Py_ssize_t width; @@ -9659,7 +9955,7 @@ unicode_center(PyUnicodeObject *self, PyObject *args) marg = width - _PyUnicode_LENGTH(self); left = marg / 2 + (marg & width & 1); - return (PyObject*) pad(self, left, marg - left, fillchar); + return pad(self, left, marg - left, fillchar); } #if 0 @@ -9957,14 +10253,11 @@ PyUnicode_Concat(PyObject *left, PyObject *right) maxchar); if (w == NULL) goto onError; - if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0) - goto onError; - if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u), - v, 0, - PyUnicode_GET_LENGTH(v)) < 0) - goto onError; + copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)); + copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v)); Py_DECREF(u); Py_DECREF(v); + assert(_PyUnicode_CheckConsistency(w, 1)); return w; onError: @@ -9977,9 +10270,6 @@ static void unicode_append_inplace(PyObject **p_left, PyObject *right) { Py_ssize_t left_len, right_len, new_len; -#ifdef Py_DEBUG - Py_ssize_t copied; -#endif assert(PyUnicode_IS_READY(*p_left)); assert(PyUnicode_IS_READY(right)); @@ -10006,14 +10296,8 @@ unicode_append_inplace(PyObject **p_left, PyObject *right) goto error; } /* copy 'right' into the newly allocated area of 'left' */ -#ifdef Py_DEBUG - copied = PyUnicode_CopyCharacters(*p_left, left_len, - right, 0, - right_len); - assert(0 <= copied); -#else - PyUnicode_CopyCharacters(*p_left, left_len, right, 0, right_len); -#endif + copy_characters(*p_left, left_len, right, 0, right_len); + _PyUnicode_DIRTY(*p_left); return; error: @@ -10038,24 +10322,26 @@ PyUnicode_Append(PyObject **p_left, PyObject *right) goto error; } + if (PyUnicode_READY(left)) + goto error; + if (PyUnicode_READY(right)) + goto error; + if (PyUnicode_CheckExact(left) && left != unicode_empty && PyUnicode_CheckExact(right) && right != unicode_empty && unicode_resizable(left) && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left) || _PyUnicode_WSTR(left) != NULL)) { - if (PyUnicode_READY(left)) - goto error; - if (PyUnicode_READY(right)) - goto error; - /* Don't resize for ascii += latin1. 
Convert ascii to latin1 requires to change the structure size, but characters are stored just after - the structure, and so it requires to move all charactres which is + the structure, and so it requires to move all characters which is not so different than duplicating the string. */ if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) { unicode_append_inplace(p_left, right); + if (p_left != NULL) + assert(_PyUnicode_CheckConsistency(*p_left, 1)); return; } } @@ -10201,6 +10487,9 @@ unicode_expandtabs(PyUnicodeObject *self, PyObject *args) if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize)) return NULL; + if (PyUnicode_READY(self) == -1) + return NULL; + /* First pass: determine size of output string */ src_len = PyUnicode_GET_LENGTH(self); i = j = line_pos = 0; @@ -10262,10 +10551,13 @@ unicode_expandtabs(PyUnicodeObject *self, PyObject *args) } } assert (j == PyUnicode_GET_LENGTH(u)); - if (PyUnicode_READY(u)) { +#ifndef DONT_MAKE_RESULT_READY + if (_PyUnicode_READY_REPLACE(&u)) { Py_DECREF(u); return NULL; } +#endif + assert(_PyUnicode_CheckConsistency(u, 1)); return (PyObject*) u; overflow: @@ -10300,7 +10592,8 @@ unicode_find(PyObject *self, PyObject *args) return NULL; result = any_find_slice( - ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice, + asciilib_find_slice, ucs1lib_find_slice, + ucs2lib_find_slice, ucs4lib_find_slice, self, (PyObject*)substring, start, end ); @@ -10393,7 +10686,8 @@ unicode_index(PyObject *self, PyObject *args) return NULL; result = any_find_slice( - ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice, + asciilib_find_slice, ucs1lib_find_slice, + ucs2lib_find_slice, ucs4lib_find_slice, self, (PyObject*)substring, start, end ); @@ -10871,7 +11165,7 @@ Return S left-justified in a Unicode string of length width. 
Padding is\n\ done using the specified fill character (default is a space)."); static PyObject * -unicode_ljust(PyUnicodeObject *self, PyObject *args) +unicode_ljust(PyObject *self, PyObject *args) { Py_ssize_t width; Py_UCS4 fillchar = ' '; @@ -10896,7 +11190,7 @@ PyDoc_STRVAR(lower__doc__, Return a copy of the string S converted to lowercase."); static PyObject* -unicode_lower(PyUnicodeObject *self) +unicode_lower(PyObject *self) { return fixup(self, fixlower); } @@ -10980,11 +11274,18 @@ PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) return NULL; } - kind = PyUnicode_KIND(self); - data = PyUnicode_1BYTE_DATA(self); - return PyUnicode_FromKindAndData(kind, - data + PyUnicode_KIND_SIZE(kind, start), - length); + if (PyUnicode_IS_ASCII(self)) { + kind = PyUnicode_KIND(self); + data = PyUnicode_1BYTE_DATA(self); + return unicode_fromascii(data + start, length); + } + else { + kind = PyUnicode_KIND(self); + data = PyUnicode_1BYTE_DATA(self); + return PyUnicode_FromKindAndData(kind, + data + PyUnicode_KIND_SIZE(kind, start), + length); + } } static PyObject * @@ -11149,6 +11450,7 @@ unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) } } + assert(_PyUnicode_CheckConsistency(u, 1)); return (PyObject*) u; } @@ -11366,6 +11668,7 @@ unicode_repr(PyObject *unicode) } } /* Closing quote already added at the beginning */ + assert(_PyUnicode_CheckConsistency(repr, 1)); return repr; } @@ -11396,7 +11699,8 @@ unicode_rfind(PyObject *self, PyObject *args) return NULL; result = any_find_slice( - ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice, + asciilib_rfind_slice, ucs1lib_rfind_slice, + ucs2lib_rfind_slice, ucs4lib_rfind_slice, self, (PyObject*)substring, start, end ); @@ -11431,7 +11735,8 @@ unicode_rindex(PyObject *self, PyObject *args) return NULL; result = any_find_slice( - ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice, + asciilib_rfind_slice, ucs1lib_rfind_slice, + ucs2lib_rfind_slice, ucs4lib_rfind_slice, self, (PyObject*)substring, start, end ); @@ -11455,7 +11760,7 @@ Return S right-justified in a string of length width. Padding is\n\ done using the specified fill character (default is a space)."); static PyObject * -unicode_rjust(PyUnicodeObject *self, PyObject *args) +unicode_rjust(PyObject *self, PyObject *args) { Py_ssize_t width; Py_UCS4 fillchar = ' '; @@ -11490,7 +11795,7 @@ PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) } } - result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); + result = split(s, sep, maxsplit); Py_DECREF(s); Py_XDECREF(sep); @@ -11507,7 +11812,7 @@ whitespace string is a separator and empty strings are\n\ removed from the result."); static PyObject* -unicode_split(PyUnicodeObject *self, PyObject *args) +unicode_split(PyObject *self, PyObject *args) { PyObject *substring = Py_None; Py_ssize_t maxcount = -1; @@ -11518,7 +11823,7 @@ unicode_split(PyUnicodeObject *self, PyObject *args) if (substring == Py_None) return split(self, NULL, maxcount); else if (PyUnicode_Check(substring)) - return split(self, (PyUnicodeObject *)substring, maxcount); + return split(self, substring, maxcount); else return PyUnicode_Split((PyObject *)self, substring, maxcount); } @@ -11542,12 +11847,12 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) return NULL; } - kind1 = PyUnicode_KIND(str_in); + kind1 = PyUnicode_KIND(str_obj); kind2 = PyUnicode_KIND(sep_obj); - kind = kind1 > kind2 ? 
kind1 : kind2; - buf1 = PyUnicode_DATA(str_in); + kind = Py_MAX(kind1, kind2); + buf1 = PyUnicode_DATA(str_obj); if (kind1 != kind) - buf1 = _PyUnicode_AsKind(str_in, kind); + buf1 = _PyUnicode_AsKind(str_obj, kind); if (!buf1) goto onError; buf2 = PyUnicode_DATA(sep_obj); @@ -11558,9 +11863,12 @@ PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) len1 = PyUnicode_GET_LENGTH(str_obj); len2 = PyUnicode_GET_LENGTH(sep_obj); - switch(PyUnicode_KIND(str_in)) { + switch(PyUnicode_KIND(str_obj)) { case PyUnicode_1BYTE_KIND: - out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); + if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) + out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); + else + out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); break; case PyUnicode_2BYTE_KIND: out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2); @@ -11629,7 +11937,10 @@ PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in) switch(PyUnicode_KIND(str_in)) { case PyUnicode_1BYTE_KIND: - out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); + if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) + out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); + else + out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); break; case PyUnicode_2BYTE_KIND: out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2); @@ -11668,9 +11979,9 @@ the separator itself, and the part after it. If the separator is not\n\ found, return S and two empty strings."); static PyObject* -unicode_partition(PyUnicodeObject *self, PyObject *separator) +unicode_partition(PyObject *self, PyObject *separator) { - return PyUnicode_Partition((PyObject *)self, separator); + return PyUnicode_Partition(self, separator); } PyDoc_STRVAR(rpartition__doc__, @@ -11681,9 +11992,9 @@ the part before it, the separator itself, and the part after it. If the\n\ separator is not found, return two empty strings and S."); static PyObject* -unicode_rpartition(PyUnicodeObject *self, PyObject *separator) +unicode_rpartition(PyObject *self, PyObject *separator) { - return PyUnicode_RPartition((PyObject *)self, separator); + return PyUnicode_RPartition(self, separator); } PyObject * @@ -11702,7 +12013,7 @@ PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) } } - result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit); + result = rsplit(s, sep, maxsplit); Py_DECREF(s); Py_XDECREF(sep); @@ -11719,7 +12030,7 @@ splits are done. 
If sep is not specified, any whitespace string\n\ is a separator."); static PyObject* -unicode_rsplit(PyUnicodeObject *self, PyObject *args) +unicode_rsplit(PyObject *self, PyObject *args) { PyObject *substring = Py_None; Py_ssize_t maxcount = -1; @@ -11730,9 +12041,9 @@ unicode_rsplit(PyUnicodeObject *self, PyObject *args) if (substring == Py_None) return rsplit(self, NULL, maxcount); else if (PyUnicode_Check(substring)) - return rsplit(self, (PyUnicodeObject *)substring, maxcount); + return rsplit(self, substring, maxcount); else - return PyUnicode_RSplit((PyObject *)self, substring, maxcount); + return PyUnicode_RSplit(self, substring, maxcount); } PyDoc_STRVAR(splitlines__doc__, @@ -11773,7 +12084,7 @@ Return a copy of S with uppercase characters converted to lowercase\n\ and vice versa."); static PyObject* -unicode_swapcase(PyUnicodeObject *self) +unicode_swapcase(PyObject *self) { return fixup(self, fixswapcase); } @@ -11915,7 +12226,7 @@ PyDoc_STRVAR(upper__doc__, Return a copy of S converted to uppercase."); static PyObject* -unicode_upper(PyUnicodeObject *self) +unicode_upper(PyObject *self) { return fixup(self, fixupper); } @@ -11927,10 +12238,10 @@ Pad a numeric string S with zeros on the left, to fill a field\n\ of the specified width. The string S is never truncated."); static PyObject * -unicode_zfill(PyUnicodeObject *self, PyObject *args) +unicode_zfill(PyObject *self, PyObject *args) { Py_ssize_t fill; - PyUnicodeObject *u; + PyObject *u; Py_ssize_t width; int kind; void *data; @@ -11968,6 +12279,7 @@ unicode_zfill(PyUnicodeObject *self, PyObject *args) PyUnicode_WRITE(kind, data, fill, '0'); } + assert(_PyUnicode_CheckConsistency(u, 1)); return (PyObject*) u; } @@ -12092,13 +12404,14 @@ The substitutions are identified by braces ('{' and '}')."); static PyObject * unicode__format__(PyObject* self, PyObject* args) { - PyObject *format_spec; + PyObject *format_spec, *out; if (!PyArg_ParseTuple(args, "U:__format__", &format_spec)) return NULL; - return _PyUnicode_FormatAdvanced(self, format_spec, 0, + out = _PyUnicode_FormatAdvanced(self, format_spec, 0, PyUnicode_GET_LENGTH(format_spec)); + return out; } PyDoc_STRVAR(p_format__doc__, @@ -12253,9 +12566,10 @@ unicode_subscript(PyUnicodeObject* self, PyObject* item) return unicode_getitem((PyObject*)self, i); } else if (PySlice_Check(item)) { Py_ssize_t start, stop, step, slicelength, cur, i; - const Py_UNICODE* source_buf; - Py_UNICODE* result_buf; - PyObject* result; + PyObject *result; + void *src_data, *dest_data; + int src_kind, dest_kind; + Py_UCS4 ch, max_char, kind_limit; if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self), &start, &stop, &step, &slicelength) < 0) { @@ -12272,22 +12586,32 @@ unicode_subscript(PyUnicodeObject* self, PyObject* item) } else if (step == 1) { return PyUnicode_Substring((PyObject*)self, start, start + slicelength); - } else { - source_buf = PyUnicode_AS_UNICODE((PyObject*)self); - result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength* - sizeof(Py_UNICODE)); - - if (result_buf == NULL) - return PyErr_NoMemory(); - - for (cur = start, i = 0; i < slicelength; cur += step, i++) { - result_buf[i] = source_buf[cur]; + } + /* General case */ + max_char = 0; + src_kind = PyUnicode_KIND(self); + kind_limit = kind_maxchar_limit(src_kind); + src_data = PyUnicode_DATA(self); + for (cur = start, i = 0; i < slicelength; cur += step, i++) { + ch = PyUnicode_READ(src_kind, src_data, cur); + if (ch > max_char) { + max_char = ch; + if (max_char >= kind_limit) + break; } + } + result = 
PyUnicode_New(slicelength, max_char); + if (result == NULL) + return NULL; + dest_kind = PyUnicode_KIND(result); + dest_data = PyUnicode_DATA(result); - result = PyUnicode_FromUnicode(result_buf, slicelength); - PyObject_FREE(result_buf); - return result; + for (cur = start, i = 0; i < slicelength; cur += step, i++) { + Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur); + PyUnicode_WRITE(dest_kind, dest_data, i, ch); } + assert(_PyUnicode_CheckConsistency(result, 1)); + return result; } else { PyErr_SetString(PyExc_TypeError, "string indices must be integers"); return NULL; @@ -12864,6 +13188,7 @@ PyUnicode_Format(PyObject *format, PyObject *args) Py_DECREF(args); } Py_DECREF(uformat); + assert(_PyUnicode_CheckConsistency(result, 1)); return (PyObject *)result; onError: @@ -12914,7 +13239,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) if (unicode == NULL) return NULL; assert(_PyUnicode_CHECK(unicode)); - if (_PyUnicode_READY_REPLACE(&unicode)) + if (PyUnicode_READY(unicode)) return NULL; self = (PyUnicodeObject *) type->tp_alloc(type, 0); @@ -12926,7 +13251,11 @@ unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) length = PyUnicode_GET_LENGTH(unicode); _PyUnicode_LENGTH(self) = length; +#ifdef Py_DEBUG + _PyUnicode_HASH(self) = -1; +#else _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); +#endif _PyUnicode_STATE(self).interned = 0; _PyUnicode_STATE(self).kind = kind; _PyUnicode_STATE(self).compact = 0; @@ -12981,6 +13310,10 @@ unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds) Py_MEMCPY(data, PyUnicode_DATA(unicode), PyUnicode_KIND_SIZE(kind, length + 1)); Py_DECREF(unicode); + assert(_PyUnicode_CheckConsistency(self, 1)); +#ifdef Py_DEBUG + _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode); +#endif return (PyObject *)self; onError: @@ -13062,6 +13395,7 @@ void _PyUnicode_Init(void) /* Init the implementation */ unicode_empty = PyUnicode_New(0, 0); + assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); if (!unicode_empty) Py_FatalError("Can't create empty string"); @@ -13122,7 +13456,7 @@ PyUnicode_InternInPlace(PyObject **p) if (PyUnicode_CHECK_INTERNED(s)) return; if (_PyUnicode_READY_REPLACE(p)) { - assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace"); + assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace"); return; } s = (PyUnicodeObject *)(*p); @@ -13208,8 +13542,10 @@ _Py_ReleaseInternedUnicodeStrings(void) n); for (i = 0; i < n; i++) { s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i); - if (PyUnicode_READY(s) == -1) + if (PyUnicode_READY(s) == -1) { + assert(0 && "could not ready string"); fprintf(stderr, "could not ready string\n"); + } switch (PyUnicode_CHECK_INTERNED(s)) { case SSTATE_NOT_INTERNED: /* XXX Shouldn't happen */ diff --git a/Python/_warnings.c b/Python/_warnings.c index 2bcca91..792e3ed 100644 --- a/Python/_warnings.c +++ b/Python/_warnings.c @@ -497,9 +497,16 @@ setup_context(Py_ssize_t stack_level, PyObject **filename, int *lineno, /* Setup filename. 
*/ *filename = PyDict_GetItemString(globals, "__file__"); if (*filename != NULL && PyUnicode_Check(*filename)) { - Py_ssize_t len = PyUnicode_GetSize(*filename); - int kind = PyUnicode_KIND(*filename); - void *data = PyUnicode_DATA(*filename); + Py_ssize_t len; + int kind; + void *data; + + if (PyUnicode_READY(*filename)) + goto handle_error; + + len = PyUnicode_GetSize(*filename); + kind = PyUnicode_KIND(*filename); + data = PyUnicode_DATA(*filename); /* if filename.lower().endswith((".pyc", ".pyo")): */ if (len >= 4 && diff --git a/Python/formatter_unicode.c b/Python/formatter_unicode.c index 609df64..a389734 100644 --- a/Python/formatter_unicode.c +++ b/Python/formatter_unicode.c @@ -501,7 +501,7 @@ calc_number_widths(NumberFieldWidths *spec, Py_ssize_t n_prefix, spec->n_grouped_digits = 0; else spec->n_grouped_digits = _PyUnicode_InsertThousandsGrouping( - PyUnicode_1BYTE_KIND, NULL, 0, NULL, + NULL, PyUnicode_1BYTE_KIND, NULL, 0, NULL, spec->n_digits, spec->n_min_width, locale->grouping, locale->thousands_sep); @@ -603,7 +603,7 @@ fill_number(PyObject *out, Py_ssize_t pos, const NumberFieldWidths *spec, r = #endif _PyUnicode_InsertThousandsGrouping( - kind, + out, kind, (char*)data + PyUnicode_KIND_SIZE(kind, pos), spec->n_grouped_digits, pdigits + PyUnicode_KIND_SIZE(kind, d_pos), @@ -1284,33 +1284,31 @@ _PyUnicode_FormatAdvanced(PyObject *obj, Py_ssize_t start, Py_ssize_t end) { InternalFormatSpec format; - PyObject *result = NULL; + PyObject *result; /* check for the special case of zero length format spec, make it equivalent to str(obj) */ - if (start == end) { - result = PyObject_Str(obj); - goto done; - } + if (start == end) + return PyObject_Str(obj); /* parse the format_spec */ if (!parse_internal_render_format_spec(format_spec, start, end, &format, 's', '<')) - goto done; + return NULL; /* type conversion? */ switch (format.type) { case 's': /* no type conversion needed, already a string. 
do the formatting */ result = format_string_internal(obj, &format); + if (result != NULL) + assert(_PyUnicode_CheckConsistency(result, 1)); break; default: /* unknown */ unknown_presentation_type(format.type, obj->ob_type->tp_name); - goto done; + result = NULL; } - -done: return result; } diff --git a/Python/getargs.c b/Python/getargs.c index 0e7d9c4..2c2db36 100644 --- a/Python/getargs.c +++ b/Python/getargs.c @@ -834,14 +834,21 @@ convertsimple(PyObject *arg, const char **p_format, va_list *p_va, int flags, case 'C': {/* unicode char */ int *p = va_arg(*p_va, int *); - if (PyUnicode_Check(arg) && - PyUnicode_GET_LENGTH(arg) == 1) { - int kind = PyUnicode_KIND(arg); - void *data = PyUnicode_DATA(arg); - *p = PyUnicode_READ(kind, data, 0); - } - else + int kind; + void *data; + + if (!PyUnicode_Check(arg)) + return converterr("a unicode character", arg, msgbuf, bufsize); + + if (PyUnicode_READY(arg)) + RETURN_ERR_OCCURRED; + + if (PyUnicode_GET_LENGTH(arg) != 1) return converterr("a unicode character", arg, msgbuf, bufsize); + + kind = PyUnicode_KIND(arg); + data = PyUnicode_DATA(arg); + *p = PyUnicode_READ(kind, data, 0); break; } diff --git a/Python/import.c b/Python/import.c index 9f094c0..5f84ac2 100644 --- a/Python/import.c +++ b/Python/import.c @@ -1785,6 +1785,9 @@ find_module_path(PyObject *fullname, PyObject *name, PyObject *path, else return 0; + if (PyUnicode_READY(path_unicode)) + return -1; + len = PyUnicode_GET_LENGTH(path_unicode); if (!PyUnicode_AsUCS4(path_unicode, buf, Py_ARRAY_LENGTH(buf), 1)) { Py_DECREF(path_unicode); diff --git a/Python/traceback.c b/Python/traceback.c index 9a11bf2..b66c96c 100644 --- a/Python/traceback.c +++ b/Python/traceback.c @@ -483,7 +483,8 @@ dump_ascii(int fd, PyObject *text) Py_ssize_t i, size; int truncated; int kind; - void *data; + void *data = NULL; + wchar_t *wstr = NULL; Py_UCS4 ch; size = ascii->length; @@ -494,11 +495,17 @@ dump_ascii(int fd, PyObject *text) else data = ((PyCompactUnicodeObject*)text) + 1; } - else { + else if (kind != PyUnicode_WCHAR_KIND) { data = ((PyUnicodeObject *)text)->data.any; if (data == NULL) return; } + else { + wstr = ((PyASCIIObject *)text)->wstr; + if (wstr == NULL) + return; + size = ((PyCompactUnicodeObject *)text)->wstr_length; + } if (MAX_STRING_LENGTH < size) { size = MAX_STRING_LENGTH; @@ -508,7 +515,10 @@ dump_ascii(int fd, PyObject *text) truncated = 0; for (i=0; i < size; i++) { - ch = PyUnicode_READ(kind, data, i); + if (kind != PyUnicode_WCHAR_KIND) + ch = PyUnicode_READ(kind, data, i); + else + ch = wstr[i]; if (ch < 128) { char c = (char)ch; write(fd, &c, 1); diff --git a/Tools/pybench/pybench.py b/Tools/pybench/pybench.py index 8eaad63..cc1e55c 100755 --- a/Tools/pybench/pybench.py +++ b/Tools/pybench/pybench.py @@ -107,6 +107,7 @@ def get_machine_details(): print('Getting machine details...') buildno, builddate = platform.python_build() python = platform.python_version() + # XXX this is now always UCS4, maybe replace it with 'PEP393' in 3.3+? if sys.maxunicode == 65535: # UCS2 build (standard) unitype = 'UCS2' diff --git a/Tools/unicode/comparecodecs.py b/Tools/unicode/comparecodecs.py index 0f5c1e2..7de14fd 100644 --- a/Tools/unicode/comparecodecs.py +++ b/Tools/unicode/comparecodecs.py @@ -14,7 +14,7 @@ def compare_codecs(encoding1, encoding2): print('Comparing encoding/decoding of %r and %r' % (encoding1, encoding2)) mismatch = 0 # Check encoding - for i in range(sys.maxunicode): + for i in range(sys.maxunicode+1): u = chr(i) try: c1 = u.encode(encoding1) |
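
A recurring change in the hunks above is the new ASCII fast path in every PyUnicode_1BYTE_KIND arm (anylib_find, anylib_count, split, rsplit, splitlines, the partition functions and any_find_slice): when both operands have the ascii flag set, the asciilib_* instantiation of the stringlib templates is called instead of ucs1lib_*. The stand-alone sketch below only illustrates that dispatch shape and is not CPython's stringlib; the names UCS1, all_ascii, naive_find, ascii_find, latin1_find and onebyte_find are hypothetical, the real code tests the cached PyUnicode_IS_ASCII() flag rather than rescanning the buffer, and both stand-in "specializations" here are the same naive search.

#include <stdio.h>
#include <string.h>

typedef unsigned char UCS1;

static int all_ascii(const UCS1 *s, size_t n)
{
    for (size_t i = 0; i < n; i++)
        if (s[i] >= 0x80)
            return 0;
    return 1;
}

static long naive_find(const UCS1 *hay, size_t hn, const UCS1 *nee, size_t nn)
{
    if (nn == 0)
        return 0;
    if (nn > hn)
        return -1;
    for (size_t i = 0; i + nn <= hn; i++)
        if (memcmp(hay + i, nee, nn) == 0)
            return (long)i;
    return -1;
}

/* stand-ins for the asciilib_find / ucs1lib_find template instantiations */
static long ascii_find(const UCS1 *h, size_t hn, const UCS1 *n, size_t nn)
{ return naive_find(h, hn, n, nn); }

static long latin1_find(const UCS1 *h, size_t hn, const UCS1 *n, size_t nn)
{ return naive_find(h, hn, n, nn); }

/* mirrors the shape of the PyUnicode_1BYTE_KIND arm added to anylib_find() */
static long onebyte_find(const UCS1 *h, size_t hn, const UCS1 *n, size_t nn)
{
    if (all_ascii(h, hn) && all_ascii(n, nn))
        return ascii_find(h, hn, n, nn);    /* ASCII specialization */
    else
        return latin1_find(h, hn, n, nn);   /* general Latin-1 path */
}

int main(void)
{
    const UCS1 *s = (const UCS1 *)"hello world";
    printf("%ld\n", onebyte_find(s, strlen((const char *)s),
                                 (const UCS1 *)"world", 5));  /* prints 6 */
    return 0;
}

In the patch the check is a single flag read on the operands (the ascii bit in PyASCIIObject.state), so the dispatch adds no per-character cost; scanning as in all_ascii() above is only done here to keep the sketch self-contained.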
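The new general-case slice path in unicode_subscript() works the other way around: it scans the selected characters for the largest code point, stops early via kind_maxchar_limit() once the result is known to need the source's kind, and then allocates the narrowest matching kind with PyUnicode_New(). A minimal, self-contained sketch of that scan follows, under the assumption of illustrative names (narrowest_width and an inline width-to-limit mapping) rather than the real kind constants, and without the extra ASCII/Latin-1 distinction the real code gets for free from PyUnicode_New(maxchar).

#include <stdint.h>
#include <stdio.h>

/* Smallest storage width (bytes per character) that can hold every code point
   in buf[0..len).  src_width is the width of the source string (1, 2 or 4);
   once max_ch reaches the first code point that already requires that width,
   the result is known to be as wide as the source and the scan stops early,
   mirroring the kind_maxchar_limit() check in the patch. */
static int narrowest_width(const uint32_t *buf, size_t len, int src_width)
{
    uint32_t limit = (src_width == 1) ? 0x80u
                   : (src_width == 2) ? 0x100u
                   : 0x10000u;
    uint32_t max_ch = 0;
    for (size_t i = 0; i < len; i++) {
        if (buf[i] > max_ch) {
            max_ch = buf[i];
            if (max_ch >= limit)
                break;             /* result cannot be wider than the source */
        }
    }
    if (max_ch < 0x100u)
        return 1;                  /* Py_UCS1-sized result */
    if (max_ch < 0x10000u)
        return 2;                  /* Py_UCS2-sized result */
    return 4;                      /* Py_UCS4-sized result */
}

int main(void)
{
    /* 'a', U+00E9, U+20AC, 'z' -- the widest character fits in 16 bits */
    uint32_t sample[] = { 0x61, 0xE9, 0x20AC, 0x7A };
    printf("%d\n", narrowest_width(sample, 4, 4));   /* prints 2 */
    return 0;
}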