From bd1c68c94f0d5bc25b48f62c7527f51c754f2b6b Mon Sep 17 00:00:00 2001
From: Georg Brandl
Date: Wed, 24 Oct 2007 18:55:37 +0000
Subject: Patch #1303: Adapt str8 constructor to bytes (now buffer) one.

str8() now takes (object, encoding, errors):
 * no argument gives the empty string; encoding/errors alone is a
   TypeError;
 * a str argument must come with an encoding and is encoded through
   the codec registry;
 * an object supporting the buffer interface is copied;
 * any other iterable is read as integers in range(0, 256).
Callers in Lib/ are updated to pass bytes literals, integer lists or an
explicit encoding.

Review fixes folded into string_new() (line counts unchanged):
 * copy the encoder's bytes result by explicit length, so encoded data
   containing NUL bytes is no longer truncated;
 * use Py_XDECREF(new) on the error path, because a failed
   _PyString_Resize() leaves new == NULL;
 * replace "//" comments with C89 comments and correct the comment on
   the error label.
The index line of Objects/stringobject.c still names the original
post-image blob.

NOTE(review): in the copy under review, text is missing between
"unpack('" (last Lib/modulefinder.py hunk) and ">> import pickle"
(Lib/pickletools.py hunk), including the pickletools diff header.  The
span is reproduced as found; restore it from the upstream commit before
applying.  Indentation was rebuilt from a whitespace-collapsed copy and
should be checked against upstream as well.
---
 Lib/modulefinder.py             |  12 ++--
 Lib/pickletools.py              |   2 +-
 Lib/struct.py                   |   2 +-
 Lib/test/test_builtin.py        |   3 +-
 Lib/test/test_bytes.py          |  62 ++++++++---------
 Lib/test/test_codeccallbacks.py |   6 +-
 Lib/test/test_compile.py        |   2 +-
 Lib/test/test_io.py             |   2 +-
 Lib/test/test_locale.py         |   2 +-
 Lib/test/test_struct.py         |  40 ++++++-----
 Lib/test/test_sys.py            |   4 +-
 Lib/test/test_unicode.py        |   4 +-
 Lib/test/test_unicodedata.py    |   2 +-
 Lib/test/testcodec.py           |   2 +-
 Objects/stringobject.c          | 145 ++++++++++++++++++++++++++++++++++++++--
 15 files changed, 215 insertions(+), 75 deletions(-)

diff --git a/Lib/modulefinder.py b/Lib/modulefinder.py
index cd9906c..cc5ad19 100644
--- a/Lib/modulefinder.py
+++ b/Lib/modulefinder.py
@@ -17,12 +17,12 @@ else:
     READ_MODE = "r"
 
 # XXX Clean up once str8's cstor matches bytes.
-LOAD_CONST = str8(chr(dis.opname.index('LOAD_CONST')))
-IMPORT_NAME = str8(chr(dis.opname.index('IMPORT_NAME')))
-STORE_NAME = str8(chr(dis.opname.index('STORE_NAME')))
-STORE_GLOBAL = str8(chr(dis.opname.index('STORE_GLOBAL')))
+LOAD_CONST = str8([dis.opname.index('LOAD_CONST')])
+IMPORT_NAME = str8([dis.opname.index('IMPORT_NAME')])
+STORE_NAME = str8([dis.opname.index('STORE_NAME')])
+STORE_GLOBAL = str8([dis.opname.index('STORE_GLOBAL')])
 STORE_OPS = [STORE_NAME, STORE_GLOBAL]
-HAVE_ARGUMENT = str8(chr(dis.HAVE_ARGUMENT))
+HAVE_ARGUMENT = str8([dis.HAVE_ARGUMENT])
 
 # Modulefinder does a good job at simulating Python's, but it can not
 # handle __path__ modifications packages make at runtime. Therefore there
@@ -368,7 +368,7 @@ class ModuleFinder:
         consts = co.co_consts
         LOAD_LOAD_AND_IMPORT = LOAD_CONST + LOAD_CONST + IMPORT_NAME
         while code:
-            c = str8(chr(code[0]))
+            c = str8([code[0]])
             if c in STORE_OPS:
                 oparg, = unpack('>> import pickle
->>> x = [1, 2, (3, 4), {str8('abc'): "def"}]
+>>> x = [1, 2, (3, 4), {str8(b'abc'): "def"}]
 >>> pkl = pickle.dumps(x, 0)
 >>> dis(pkl)
     0: (    MARK
diff --git a/Lib/struct.py b/Lib/struct.py
index 027caa2..10085b7 100644
--- a/Lib/struct.py
+++ b/Lib/struct.py
@@ -36,7 +36,7 @@ from _struct import Struct as _Struct, error
 class Struct(_Struct):
     def __init__(self, fmt):
         if isinstance(fmt, str):
-            fmt = str8(fmt)
+            fmt = str8(fmt, 'latin1')
         _Struct.__init__(self, fmt)
 
 _MAXCACHE = 100
diff --git a/Lib/test/test_builtin.py b/Lib/test/test_builtin.py
index b6b45ee..9670be0 100644
--- a/Lib/test/test_builtin.py
+++ b/Lib/test/test_builtin.py
@@ -580,7 +580,8 @@ class BuiltinTest(unittest.TestCase):
         self.assertEqual(hash(1), hash(1))
         self.assertEqual(hash(1), hash(1.0))
         hash('spam')
-        self.assertEqual(hash('spam'), hash(str8('spam')))
+        self.assertEqual(hash('spam'), hash(str8(b'spam'))) # remove str8()
+                                                            # when b"" is immutable
         hash((0,1,2,3))
         def f(): pass
         self.assertRaises(TypeError, hash, [])
diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py
index c7c6bd3..2a4b6d8 100644
--- a/Lib/test/test_bytes.py
+++ b/Lib/test/test_bytes.py
@@ -103,33 +103,33 @@ class BytesTest(unittest.TestCase):
         self.failIf(b3 <= b2)
 
     def test_compare_to_str(self):
-        self.assertEqual(b"abc" == str8("abc"), True)
-        self.assertEqual(b"ab" != str8("abc"), True)
-        self.assertEqual(b"ab" <= str8("abc"), True)
-        self.assertEqual(b"ab" < str8("abc"), True)
-        self.assertEqual(b"abc" >= str8("ab"), True)
-        self.assertEqual(b"abc" > str8("ab"), True)
-
-        self.assertEqual(b"abc" != str8("abc"), False)
-        self.assertEqual(b"ab" == str8("abc"), False)
-        self.assertEqual(b"ab" > str8("abc"), False)
-        self.assertEqual(b"ab" >= str8("abc"), False)
-        self.assertEqual(b"abc" < str8("ab"), False)
-        self.assertEqual(b"abc" <= str8("ab"), False)
-
-        self.assertEqual(str8("abc") == b"abc", True)
-        self.assertEqual(str8("ab") != b"abc", True)
-        self.assertEqual(str8("ab") <= b"abc", True)
-        self.assertEqual(str8("ab") < b"abc", True)
-        self.assertEqual(str8("abc") >= b"ab", True)
-        self.assertEqual(str8("abc") > b"ab", True)
-
-        self.assertEqual(str8("abc") != b"abc", False)
-        self.assertEqual(str8("ab") == b"abc", False)
-        self.assertEqual(str8("ab") > b"abc", False)
-        self.assertEqual(str8("ab") >= b"abc", False)
-        self.assertEqual(str8("abc") < b"ab", False)
-        self.assertEqual(str8("abc") <= b"ab", False)
+        self.assertEqual(b"abc" == str8(b"abc"), True)
+        self.assertEqual(b"ab" != str8(b"abc"), True)
+        self.assertEqual(b"ab" <= str8(b"abc"), True)
+        self.assertEqual(b"ab" < str8(b"abc"), True)
+        self.assertEqual(b"abc" >= str8(b"ab"), True)
+        self.assertEqual(b"abc" > str8(b"ab"), True)
+
+        self.assertEqual(b"abc" != str8(b"abc"), False)
+        self.assertEqual(b"ab" == str8(b"abc"), False)
+        self.assertEqual(b"ab" > str8(b"abc"), False)
+        self.assertEqual(b"ab" >= str8(b"abc"), False)
+        self.assertEqual(b"abc" < str8(b"ab"), False)
+        self.assertEqual(b"abc" <= str8(b"ab"), False)
+
+        self.assertEqual(str8(b"abc") == b"abc", True)
+        self.assertEqual(str8(b"ab") != b"abc", True)
+        self.assertEqual(str8(b"ab") <= b"abc", True)
+        self.assertEqual(str8(b"ab") < b"abc", True)
+        self.assertEqual(str8(b"abc") >= b"ab", True)
+        self.assertEqual(str8(b"abc") > b"ab", True)
+
+        self.assertEqual(str8(b"abc") != b"abc", False)
+        self.assertEqual(str8(b"ab") == b"abc", False)
+        self.assertEqual(str8(b"ab") > b"abc", False)
+        self.assertEqual(str8(b"ab") >= b"abc", False)
+        self.assertEqual(str8(b"abc") < b"ab", False)
+        self.assertEqual(str8(b"abc") <= b"ab", False)
 
         # Byte comparisons with unicode should always fail!
         # Test this for all expected byte orders and Unicode character sizes
@@ -345,7 +345,7 @@ class BytesTest(unittest.TestCase):
         self.assertEqual(b.decode("utf8", "ignore"), "Hello world\n")
 
     def test_from_buffer(self):
-        sample = str8("Hello world\n\x80\x81\xfe\xff")
+        sample = str8(b"Hello world\n\x80\x81\xfe\xff")
         buf = memoryview(sample)
         b = bytes(buf)
         self.assertEqual(b, bytes(sample))
@@ -367,8 +367,8 @@ class BytesTest(unittest.TestCase):
         b1 = b"abc"
         b2 = b"def"
         self.assertEqual(b1 + b2, b"abcdef")
-        self.assertEqual(b1 + str8("def"), b"abcdef")
-        self.assertEqual(str8("def") + b1, b"defabc")
+        self.assertEqual(b1 + str8(b"def"), b"abcdef")
+        self.assertEqual(str8(b"def") + b1, b"defabc")
         self.assertRaises(TypeError, lambda: b1 + "def")
         self.assertRaises(TypeError, lambda: "abc" + b2)
 
@@ -391,7 +391,7 @@ class BytesTest(unittest.TestCase):
         self.assertEqual(b, b"abcdef")
         self.assertEqual(b, b1)
         self.failUnless(b is b1)
-        b += str8("xyz")
+        b += str8(b"xyz")
         self.assertEqual(b, b"abcdefxyz")
         try:
             b += ""
diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
index eedd48a..0256eb6 100644
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -181,7 +181,7 @@ class CodecCallbackTest(unittest.TestCase):
         # mapped through the encoding again. This means, that
         # to be able to use e.g. the "replace" handler, the
         # charmap has to have a mapping for "?".
-        charmap = dict((ord(c), str8(2*c.upper())) for c in "abcdefgh")
+        charmap = dict((ord(c), str8(2*c.upper(), 'ascii')) for c in "abcdefgh")
         sin = "abc"
         sout = b"AABBCC"
         self.assertEquals(codecs.charmap_encode(sin, "strict", charmap)[0], sout)
@@ -189,7 +189,7 @@ class CodecCallbackTest(unittest.TestCase):
         sin = "abcA"
         self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)
 
-        charmap[ord("?")] = str8("XYZ")
+        charmap[ord("?")] = str8(b"XYZ")
         sin = "abcDEF"
         sout = b"AABBCCXYZXYZXYZ"
         self.assertEquals(codecs.charmap_encode(sin, "replace", charmap)[0], sout)
@@ -309,7 +309,7 @@ class CodecCallbackTest(unittest.TestCase):
         # check with one argument too much
         self.assertRaises(TypeError, exctype, *(args + ["too much"]))
         # check with one argument of the wrong type
-        wrongargs = [ "spam", str8("eggs"), b"spam", 42, 1.0, None ]
+        wrongargs = [ "spam", str8(b"eggs"), b"spam", 42, 1.0, None ]
         for i in range(len(args)):
             for wrongarg in wrongargs:
                 if type(wrongarg) is type(args[i]):
diff --git a/Lib/test/test_compile.py b/Lib/test/test_compile.py
index d6a51dd..1d54953 100644
--- a/Lib/test/test_compile.py
+++ b/Lib/test/test_compile.py
@@ -157,7 +157,7 @@ if 1:
         s256 = "".join(["\n"] * 256 + ["spam"])
         co = compile(s256, 'fn', 'exec')
         self.assertEqual(co.co_firstlineno, 257)
-        self.assertEqual(co.co_lnotab, str8(''))
+        self.assertEqual(co.co_lnotab, str8())
 
     def test_literals_with_leading_zeroes(self):
         for arg in ["077787", "0xj", "0x.", "0e", "090000000000000",
diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py
index 644593d..e826ff4 100644
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@@ -88,7 +88,7 @@ class IOTest(unittest.TestCase):
         self.assertEqual(f.tell(), 6)
         self.assertEqual(f.seek(-1, 1), 5)
         self.assertEqual(f.tell(), 5)
-        self.assertEqual(f.write(str8(" world\n\n\n")), 9)
+        self.assertEqual(f.write(str8(b" world\n\n\n")), 9)
         self.assertEqual(f.seek(0), 0)
         self.assertEqual(f.write(b"h"), 1)
         self.assertEqual(f.seek(-1, 2), 13)
diff --git a/Lib/test/test_locale.py b/Lib/test/test_locale.py
index 1bfb3b3..5fcba6f 100644
--- a/Lib/test/test_locale.py
+++ b/Lib/test/test_locale.py
@@ -82,7 +82,7 @@ finally:
 
 # Test BSD Rune locale's bug for isctype functions.
 def teststrop(s, method, output):
-    s = str8(s)
+    s = str8(s, 'latin1') # XXX
     if verbose:
         print("%s.%s() =? %s ..." % (repr(s), method, repr(output)), end=' ')
     result = getattr(s, method)()
diff --git a/Lib/test/test_struct.py b/Lib/test/test_struct.py
index 7942045..d2b5643 100644
--- a/Lib/test/test_struct.py
+++ b/Lib/test/test_struct.py
@@ -101,7 +101,7 @@ s = struct.pack('ii', 1, 2)
 simple_err(struct.unpack, 'iii', s)
 simple_err(struct.unpack, 'i', s)
 
-c = str8('a')
+c = str8(b'a')
 b = 1
 h = 255
 i = 65535
@@ -186,7 +186,7 @@ for fmt, arg, big, lil, asy in tests:
         if isinstance(arg, str):
             # Strings are returned as str8 since you can't know the encoding of
             # the string when packed.
-            arg = str8(arg)
+            arg = str8(arg, 'latin1')
         if rev != arg and not asy:
             raise TestFailed("unpack(%r, %r) -> (%r,) # expected (%r,)" % (
                 fmt, res, rev, arg))
@@ -428,14 +428,14 @@ for args in [("bB", 1),
 
 def test_p_code():
     for code, input, expected, expectedback in [
-        ('p','abc', '\x00', str8('')),
-        ('1p', 'abc', '\x00', str8('')),
-        ('2p', 'abc', '\x01a', str8('a')),
-        ('3p', 'abc', '\x02ab', str8('ab')),
-        ('4p', 'abc', '\x03abc', str8('abc')),
-        ('5p', 'abc', '\x03abc\x00', str8('abc')),
-        ('6p', 'abc', '\x03abc\x00\x00', str8('abc')),
-        ('1000p', 'x'*1000, '\xff' + 'x'*999, str8('x'*255))]:
+        ('p','abc', '\x00', str8()),
+        ('1p', 'abc', '\x00', str8()),
+        ('2p', 'abc', '\x01a', str8(b'a')),
+        ('3p', 'abc', '\x02ab', str8(b'ab')),
+        ('4p', 'abc', '\x03abc', str8(b'abc')),
+        ('5p', 'abc', '\x03abc\x00', str8(b'abc')),
+        ('6p', 'abc', '\x03abc\x00\x00', str8(b'abc')),
+        ('1000p', 'x'*1000, '\xff' + 'x'*999, str8(b'x'*255))]:
         expected = bytes(expected, "latin-1")
         got = struct.pack(code, input)
         if got != expected:
@@ -564,20 +564,24 @@ def test_unpack_from():
         if verbose:
             print("test_unpack_from using", cls.__name__)
         data = cls(test_string)
-        vereq(s.unpack_from(data), (str8('abcd'),))
-        vereq(s.unpack_from(data, 2), (str8('cd01'),))
-        vereq(s.unpack_from(data, 4), (str8('0123'),))
+        if not isinstance(data, (str8, bytes)):
+            bytes_data = str8(data, 'latin1')
+        else:
+            bytes_data = data
+        vereq(s.unpack_from(data), (str8(b'abcd'),))
+        vereq(s.unpack_from(data, 2), (str8(b'cd01'),))
+        vereq(s.unpack_from(data, 4), (str8(b'0123'),))
         for i in range(6):
-            vereq(s.unpack_from(data, i), (str8(data[i:i+4]),))
+            vereq(s.unpack_from(data, i), (bytes_data[i:i+4],))
         for i in range(6, len(test_string) + 1):
             simple_err(s.unpack_from, data, i)
     for cls in (str, str8, bytes): # XXX + memoryview
         data = cls(test_string)
-        vereq(struct.unpack_from(fmt, data), (str8('abcd'),))
-        vereq(struct.unpack_from(fmt, data, 2), (str8('cd01'),))
-        vereq(struct.unpack_from(fmt, data, 4), (str8('0123'),))
+        vereq(struct.unpack_from(fmt, data), (str8(b'abcd'),))
+        vereq(struct.unpack_from(fmt, data, 2), (str8(b'cd01'),))
+        vereq(struct.unpack_from(fmt, data, 4), (str8(b'0123'),))
         for i in range(6):
-            vereq(struct.unpack_from(fmt, data, i), (str8(data[i:i+4]),))
+            vereq(struct.unpack_from(fmt, data, i), (bytes_data[i:i+4],))
         for i in range(6, len(test_string) + 1):
             simple_err(struct.unpack_from, fmt, data, i)
 
diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py
index ddf1876..8741830 100644
--- a/Lib/test/test_sys.py
+++ b/Lib/test/test_sys.py
@@ -300,7 +300,7 @@ class SysModuleTest(unittest.TestCase):
 
     def test_intern(self):
         self.assertRaises(TypeError, sys.intern)
-        s = str8("never interned before")
+        s = str8(b"never interned before")
         self.assert_(sys.intern(s) is s)
         s2 = s.swapcase().swapcase()
         self.assert_(sys.intern(s2) is s)
@@ -314,7 +314,7 @@ class SysModuleTest(unittest.TestCase):
             def __hash__(self):
                 return 123
 
-        self.assertRaises(TypeError, sys.intern, S("abc"))
+        self.assertRaises(TypeError, sys.intern, S(b"abc"))
 
         s = "never interned as unicode before"
         self.assert_(sys.intern(s) is s)
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 9aad59a..4970845 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -203,8 +203,8 @@ class UnicodeTest(
         self.assertRaises(TypeError, 'replace'.replace, "r", 42)
 
     def test_str8_comparison(self):
-        self.assertEqual('abc' == str8('abc'), False)
-        self.assertEqual('abc' != str8('abc'), True)
+        self.assertEqual('abc' == str8(b'abc'), False)
+        self.assertEqual('abc' != str8(b'abc'), True)
 
     def test_comparison(self):
         # Comparisons:
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
index 58fc73d..ff2dcf5 100644
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -176,7 +176,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
 
     def test_east_asian_width(self):
         eaw = self.db.east_asian_width
-        self.assertRaises(TypeError, eaw, str8('a'))
+        self.assertRaises(TypeError, eaw, str8(b'a'))
         self.assertRaises(TypeError, eaw, '')
         self.assertRaises(TypeError, eaw, 'ra')
         self.assertEqual(eaw('\x1e'), 'N')
diff --git a/Lib/test/testcodec.py b/Lib/test/testcodec.py
index e7f836b..0db18c1 100644
--- a/Lib/test/testcodec.py
+++ b/Lib/test/testcodec.py
@@ -36,7 +36,7 @@ def getregentry():
 decoding_map = codecs.make_identity_dict(range(256))
 decoding_map.update({
         0x78: "abc", # 1-n decoding mapping
-        str8("abc"): 0x0078,# 1-n encoding mapping
+        str8(b"abc"): 0x0078,# 1-n encoding mapping
         0x01: None, # decoding mapping to
         0x79: "", # decoding mapping to
 })
diff --git a/Objects/stringobject.c b/Objects/stringobject.c
index 6371aa3..eba3284 100644
--- a/Objects/stringobject.c
+++ b/Objects/stringobject.c
@@ -3020,16 +3020,151 @@ str_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
 static PyObject *
 string_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 {
-	PyObject *x = NULL;
-	static char *kwlist[] = {"object", 0};
+	PyObject *x = NULL, *it;
+	PyObject *(*iternext)(PyObject *);
+	const char *encoding = NULL;
+	const char *errors = NULL;
+	PyObject *new = NULL;
+	Py_ssize_t i, size;
+	static char *kwlist[] = {"object", "encoding", "errors", 0};
 
 	if (type != &PyString_Type)
 		return str_subtype_new(type, args, kwds);
-	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O:str8", kwlist, &x))
+	if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str8", kwlist, &x,
+					 &encoding, &errors))
 		return NULL;
-	if (x == NULL)
+	if (x == NULL) {
+		if (encoding != NULL || errors != NULL) {
+			PyErr_SetString(PyExc_TypeError,
+					"encoding or errors without sequence "
+					"argument");
+			return NULL;
+		}
 		return PyString_FromString("");
-	return PyObject_Str(x);
+	}
+
+	if (PyUnicode_Check(x)) {
+		/* Encode via the codec registry */
+		if (encoding == NULL) {
+			PyErr_SetString(PyExc_TypeError,
+				"string argument without an encoding");
+			return NULL;
+		}
+		new = PyCodec_Encode(x, encoding, errors);
+		if (new == NULL)
+			return NULL;
+		/* XXX(gb): must accept bytes here since codecs output bytes
+		   at the moment */
+		if (PyBytes_Check(new)) {
+			/* Copy by explicit length: the encoded data may
+			   contain NUL bytes (e.g. UTF-16 output). */
+			PyObject *str = PyString_FromStringAndSize(
+				PyBytes_AsString(new), PyBytes_Size(new));
+			Py_DECREF(new);
+			return str;
+		}
+		if (!PyString_Check(new)) {
+			PyErr_Format(PyExc_TypeError,
+				     "encoder did not return a str8 "
+				     "object (type=%.400s)",
+				     Py_Type(new)->tp_name);
+			Py_DECREF(new);
+			return NULL;
+		}
+		return new;
+	}
+
+	/* If it's not unicode, there can't be encoding or errors */
+	if (encoding != NULL || errors != NULL) {
+		PyErr_SetString(PyExc_TypeError,
+			"encoding or errors without a string argument");
+		return NULL;
+	}
+
+	/* Use the modern buffer interface */
+	if (PyObject_CheckBuffer(x)) {
+		Py_buffer view;
+		if (PyObject_GetBuffer(x, &view, PyBUF_FULL_RO) < 0)
+			return NULL;
+		new = PyString_FromStringAndSize(NULL, view.len);
+		if (!new)
+			goto fail;
+		/* XXX(brett.cannon): Better way to get to internal buffer? */
+		if (PyBuffer_ToContiguous(((PyStringObject *)new)->ob_sval,
+					  &view, view.len, 'C') < 0)
+			goto fail;
+		PyObject_ReleaseBuffer(x, &view);
+		return new;
+	  fail:
+		Py_XDECREF(new);
+		PyObject_ReleaseBuffer(x, &view);
+		return NULL;
+	}
+
+	/* For the iterator version, create a string object and resize as needed. */
+	/* XXX(gb): is 64 a good value? also, optimize this if length is known */
+	size = 64;
+	new = PyString_FromStringAndSize(NULL, size);
+	if (new == NULL)
+		return NULL;
+
+	/* XXX Optimize this if the argument is a list or tuple */
+
+	/* Get the iterator */
+	it = PyObject_GetIter(x);
+	if (it == NULL)
+		goto error;
+	/* XXX(brett.cannon): No API for this? */
+	iternext = *Py_Type(it)->tp_iternext;
+
+	/* Run the iterator to exhaustion */
+	for (i = 0; ; i++) {
+		PyObject *item;
+		Py_ssize_t value;
+
+		/* Get the next item */
+		item = iternext(it);
+		if (item == NULL) {
+			if (PyErr_Occurred()) {
+				if (!PyErr_ExceptionMatches(PyExc_StopIteration))
+					goto error;
+				PyErr_Clear();
+			}
+			break;
+		}
+
+		/* Interpret it as an int (__index__) */
+		value = PyNumber_AsSsize_t(item, PyExc_ValueError);
+		Py_DECREF(item);
+		if (value == -1 && PyErr_Occurred())
+			goto error;
+
+		/* Range check */
+		if (value < 0 || value >= 256) {
+			PyErr_SetString(PyExc_ValueError,
+					"bytes must be in range(0, 256)");
+			goto error;
+		}
+
+		/* Append the byte */
+		if (i >= size) {
+			size *= 2;
+			if (_PyString_Resize(&new, size) < 0)
+				goto error;
+		}
+		((PyStringObject *)new)->ob_sval[i] = value;
+	}
+	_PyString_Resize(&new, i);	/* on failure new is NULL, error set */
+
+	/* Clean up and return success */
+	Py_DECREF(it);
+	return new;
+
+  error:
+	/* Error handling; 'it' and 'new' may each be NULL here */
+	Py_XDECREF(it);
+	Py_XDECREF(new);
+	return NULL;
 }
 
 static PyObject *
-- 
cgit v0.12