diff options
author | Guido van Rossum <guido@python.org> | 2007-11-06 21:34:58 (GMT) |
---|---|---|
committer | Guido van Rossum <guido@python.org> | 2007-11-06 21:34:58 (GMT) |
commit | 98297ee7815939b124156e438b22bd652d67b5db (patch) | |
tree | a9d239ebd87c73af2571ab48003984c4e18e27e5 /Objects/unicodeobject.c | |
parent | a19f80c6df2df5e8a5d0cff37131097835ef971e (diff) | |
download | cpython-98297ee7815939b124156e438b22bd652d67b5db.zip cpython-98297ee7815939b124156e438b22bd652d67b5db.tar.gz cpython-98297ee7815939b124156e438b22bd652d67b5db.tar.bz2 |
Merging the py3k-pep3137 branch back into the py3k branch.
No detailed change log; just check out the change log for the py3k-pep3137
branch. The most obvious changes:
- str8 renamed to bytes (PyString at the C level);
- bytes renamed to buffer (PyBytes at the C level);
- PyString and PyUnicode are no longer compatible.
I.e. we now have an immutable bytes type and a mutable bytes type.
The behavior of PyString was modified quite a bit, to make it more
bytes-like. Some changes are still on the to-do list.
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r-- | Objects/unicodeobject.c | 225 |
1 files changed, 94 insertions, 131 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index c568a8e..ae34c9e 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -101,7 +101,7 @@ extern "C" { function will delete the reference from this dictionary. Another way to look at this is that to say that the actual reference - count of a string is: s->ob_refcnt + (s->ob_sstate?2:0) + count of a string is: s->ob_refcnt + (s->state ? 2 : 0) */ static PyObject *interned; @@ -998,7 +998,10 @@ PyObject *PyUnicode_FromObject(register PyObject *obj) return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj), PyUnicode_GET_SIZE(obj)); } - return PyUnicode_FromEncodedObject(obj, NULL, "strict"); + PyErr_Format(PyExc_TypeError, + "Can't convert '%.100s' object to str implicitly", + Py_Type(obj)->tp_name); + return NULL; } PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, @@ -1219,22 +1222,7 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode, v = PyCodec_Encode(unicode, encoding, errors); if (v == NULL) goto onError; - if (!PyBytes_Check(v)) { - if (PyString_Check(v)) { - /* Old codec, turn it into bytes */ - PyObject *b = PyBytes_FromObject(v); - Py_DECREF(v); - return b; - } - PyErr_Format(PyExc_TypeError, - "encoder did not return a bytes object " - "(type=%.400s, encoding=%.20s, errors=%.20s)", - v->ob_type->tp_name, - encoding ? encoding : "NULL", - errors ? errors : "NULL"); - Py_DECREF(v); - goto onError; - } + assert(PyString_Check(v)); return v; onError: @@ -1245,19 +1233,15 @@ PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode, const char *errors) { PyObject *v = ((PyUnicodeObject *)unicode)->defenc; - PyObject *b; if (v) return v; if (errors != NULL) Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString"); - b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), + v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), NULL); - if (!b) + if (!v) return NULL; - v = PyString_FromStringAndSize(PyBytes_AsString(b), - PyBytes_Size(b)); - Py_DECREF(b); ((PyUnicodeObject *)unicode)->defenc = v; return v; } @@ -1420,11 +1404,11 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject); if (!inputobj) goto onError; - if (!PyBytes_Check(inputobj)) { + if (!PyString_Check(inputobj)) { PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes"); } - *input = PyBytes_AS_STRING(inputobj); - insize = PyBytes_GET_SIZE(inputobj); + *input = PyString_AS_STRING(inputobj); + insize = PyString_GET_SIZE(inputobj); *inend = *input + insize; /* we can DECREF safely, as the exception has another reference, so the object won't go away. */ @@ -1674,7 +1658,7 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, int encodeWhiteSpace, const char *errors) { - PyObject *v; + PyObject *v, *result; /* It might be possible to tighten this worst case */ Py_ssize_t cbAllocated = 5 * size; int inShift = 0; @@ -1685,7 +1669,7 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, char * start; if (size == 0) - return PyBytes_FromStringAndSize(NULL, 0); + return PyString_FromStringAndSize(NULL, 0); v = PyBytes_FromStringAndSize(NULL, cbAllocated); if (v == NULL) @@ -1757,11 +1741,9 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, *out++ = '-'; } - if (PyBytes_Resize(v, out - start)) { - Py_DECREF(v); - return NULL; - } - return v; + result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), out - start); + Py_DECREF(v); + return result; } #undef SPECIAL @@ -2001,11 +1983,11 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s, { #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ - Py_ssize_t i; /* index into s of next input byte */ - PyObject *v; /* result string object */ - char *p; /* next free byte in output buffer */ - Py_ssize_t nallocated; /* number of result bytes allocated */ - Py_ssize_t nneeded; /* number of result bytes needed */ + Py_ssize_t i; /* index into s of next input byte */ + PyObject *result; /* result string object */ + char *p; /* next free byte in output buffer */ + Py_ssize_t nallocated; /* number of result bytes allocated */ + Py_ssize_t nneeded; /* number of result bytes needed */ char stackbuf[MAX_SHORT_UNICHARS * 4]; assert(s != NULL); @@ -2017,7 +1999,7 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s, * turns out we need. */ nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); - v = NULL; /* will allocate after we're done */ + result = NULL; /* will allocate after we're done */ p = stackbuf; } else { @@ -2025,10 +2007,10 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s, nallocated = size * 4; if (nallocated / 4 != size) /* overflow! */ return PyErr_NoMemory(); - v = PyBytes_FromStringAndSize(NULL, nallocated); - if (v == NULL) + result = PyString_FromStringAndSize(NULL, nallocated); + if (result == NULL) return NULL; - p = PyBytes_AS_STRING(v); + p = PyString_AS_STRING(result); } for (i = 0; i < size;) { @@ -2072,19 +2054,19 @@ encodeUCS4: } } - if (v == NULL) { + if (result == NULL) { /* This was stack allocated. */ nneeded = p - stackbuf; assert(nneeded <= nallocated); - v = PyBytes_FromStringAndSize(stackbuf, nneeded); + result = PyString_FromStringAndSize(stackbuf, nneeded); } else { /* Cut back to size actually needed. */ - nneeded = p - PyBytes_AS_STRING(v); + nneeded = p - PyString_AS_STRING(result); assert(nneeded <= nallocated); - PyBytes_Resize(v, nneeded); + _PyString_Resize(&result, nneeded); } - return v; + return result; #undef MAX_SHORT_UNICHARS } @@ -2279,7 +2261,7 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s, const char *errors, int byteorder) { - PyObject *v; + PyObject *v, *result; unsigned char *p; #ifndef Py_UNICODE_WIDE int i, pairs; @@ -2319,7 +2301,7 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s, if (byteorder == 0) STORECHAR(0xFEFF); if (size == 0) - return v; + goto done; if (byteorder == -1) { /* force LE */ @@ -2350,7 +2332,11 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s, #endif STORECHAR(ch); } - return v; + + done: + result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_Size(v)); + Py_DECREF(v); + return result; #undef STORECHAR } @@ -2549,7 +2535,7 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s, const char *errors, int byteorder) { - PyObject *v; + PyObject *v, *result; unsigned char *p; #ifdef Py_UNICODE_WIDE int i, pairs; @@ -2584,7 +2570,7 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s, if (byteorder == 0) STORECHAR(0xFEFF); if (size == 0) - return v; + goto done; if (byteorder == -1) { /* force LE */ @@ -2610,7 +2596,11 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s, if (ch2) STORECHAR(ch2); } - return v; + + done: + result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_Size(v)); + Py_DECREF(v); + return result; #undef STORECHAR } @@ -2900,7 +2890,7 @@ static const char *hexdigits = "0123456789abcdef"; PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, Py_ssize_t size) { - PyObject *repr; + PyObject *repr, *result; char *p; /* XXX(nnorwitz): rather than over-allocating, it would be @@ -3023,12 +3013,10 @@ PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, *p++ = (char) ch; } - *p = '\0'; - if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) { - Py_DECREF(repr); - return NULL; - } - return repr; + result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr), + p - PyBytes_AS_STRING(repr)); + Py_DECREF(repr); + return result; } PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) @@ -3159,7 +3147,7 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s, PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, Py_ssize_t size) { - PyObject *repr; + PyObject *repr, *result; char *p; char *q; @@ -3171,7 +3159,7 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, if (repr == NULL) return NULL; if (size == 0) - return repr; + goto done; p = q = PyBytes_AS_STRING(repr); while (size-- > 0) { @@ -3205,12 +3193,12 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, else *p++ = (char) ch; } - *p = '\0'; - if (PyBytes_Resize(repr, p - q)) { - Py_DECREF(repr); - return NULL; - } - return repr; + size = p - q; + + done: + result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr), size); + Py_DECREF(repr); + return result; } PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) @@ -3445,23 +3433,23 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, /* pointer into the output */ char *str; /* current output position */ - Py_ssize_t respos = 0; Py_ssize_t ressize; const char *encoding = (limit == 256) ? "latin-1" : "ascii"; const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; PyObject *errorHandler = NULL; PyObject *exc = NULL; + PyObject *result = NULL; /* the following variable is used for caching string comparisons * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ int known_errorHandler = -1; /* allocate enough for a simple encoding without replacements, if we need more, we'll resize */ + if (size == 0) + return PyString_FromStringAndSize(NULL, 0); res = PyBytes_FromStringAndSize(NULL, size); if (res == NULL) - goto onError; - if (size == 0) - return res; + return NULL; str = PyBytes_AS_STRING(res); ressize = size; @@ -3589,20 +3577,13 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p, } } } - /* Resize if we allocated to much */ - respos = str - PyBytes_AS_STRING(res); - if (respos<ressize) - /* If this falls res will be NULL */ - PyBytes_Resize(res, respos); - Py_XDECREF(errorHandler); - Py_XDECREF(exc); - return res; - - onError: - Py_XDECREF(res); + result = PyString_FromStringAndSize(PyBytes_AS_STRING(res), + str - PyBytes_AS_STRING(res)); + onError: + Py_DECREF(res); Py_XDECREF(errorHandler); Py_XDECREF(exc); - return NULL; + return result; } PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, @@ -3848,20 +3829,20 @@ static int encode_mbcs(PyObject **repr, if (*repr == NULL) { /* Create string object */ - *repr = PyBytes_FromStringAndSize(NULL, mbcssize); + *repr = PyString_FromStringAndSize(NULL, mbcssize); if (*repr == NULL) return -1; } else { /* Extend string object */ - n = PyBytes_Size(*repr); - if (PyBytes_Resize(*repr, n + mbcssize) < 0) + n = PyString_Size(*repr); + if (_PyString_Resize(repr, n + mbcssize) < 0) return -1; } /* Do the conversion */ if (size > 0) { - char *s = PyBytes_AS_STRING(*repr) + n; + char *s = PyString_AS_STRING(*repr) + n; if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { PyErr_SetFromWindowsErrWithFilename(0, NULL); return -1; @@ -4341,16 +4322,14 @@ static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping) } static int -charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) +charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize) { - Py_ssize_t outsize = PyBytes_GET_SIZE( outobj); + Py_ssize_t outsize = PyString_GET_SIZE(*outobj); /* exponentially overallocate to minimize reallocations */ if (requiredsize < 2*outsize) requiredsize = 2*outsize; - if (PyBytes_Resize(outobj, requiredsize)) { - Py_DECREF(outobj); + if (_PyString_Resize(outobj, requiredsize)) return -1; - } return 0; } @@ -4365,21 +4344,21 @@ typedef enum charmapencode_result { reallocation error occurred. The caller must decref the result */ static charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, - PyObject *outobj, Py_ssize_t *outpos) + PyObject **outobj, Py_ssize_t *outpos) { PyObject *rep; char *outstart; - Py_ssize_t outsize = PyBytes_GET_SIZE(outobj); + Py_ssize_t outsize = PyString_GET_SIZE(*outobj); if (Py_Type(mapping) == &EncodingMapType) { int res = encoding_map_lookup(c, mapping); Py_ssize_t requiredsize = *outpos+1; if (res == -1) return enc_FAILED; - if (outsize<requiredsize) + if (outsize<requiredsize) if (charmapencode_resize(outobj, outpos, requiredsize)) return enc_EXCEPTION; - outstart = PyBytes_AS_STRING(outobj); + outstart = PyString_AS_STRING(*outobj); outstart[(*outpos)++] = (char)res; return enc_SUCCESS; } @@ -4398,7 +4377,7 @@ charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, Py_DECREF(rep); return enc_EXCEPTION; } - outstart = PyBytes_AS_STRING(outobj); + outstart = PyString_AS_STRING(*outobj); outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep); } else { @@ -4410,7 +4389,7 @@ charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping, Py_DECREF(rep); return enc_EXCEPTION; } - outstart = PyBytes_AS_STRING(outobj); + outstart = PyString_AS_STRING(*outobj); memcpy(outstart + *outpos, repchars, repsize); *outpos += repsize; } @@ -4426,7 +4405,7 @@ int charmap_encoding_error( const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping, PyObject **exceptionObject, int *known_errorHandler, PyObject **errorHandler, const char *errors, - PyObject *res, Py_ssize_t *respos) + PyObject **res, Py_ssize_t *respos) { PyObject *repunicode = NULL; /* initialize to prevent gcc warning */ Py_ssize_t repsize; @@ -4561,7 +4540,7 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, /* allocate enough for a simple encoding without replacements, if we need more, we'll resize */ - res = PyBytes_FromStringAndSize(NULL, size); + res = PyString_FromStringAndSize(NULL, size); if (res == NULL) goto onError; if (size == 0) @@ -4569,14 +4548,14 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, while (inpos<size) { /* try to encode it */ - charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos); + charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos); if (x==enc_EXCEPTION) /* error */ goto onError; if (x==enc_FAILED) { /* unencodable character */ if (charmap_encoding_error(p, size, &inpos, mapping, &exc, &known_errorHandler, &errorHandler, errors, - res, &respos)) { + &res, &respos)) { goto onError; } } @@ -4586,10 +4565,9 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p, } /* Resize if we allocated to much */ - if (respos<PyBytes_GET_SIZE(res)) { - if (PyBytes_Resize(res, respos)) - goto onError; - } + if (respos<PyString_GET_SIZE(res)) + _PyString_Resize(&res, respos); + Py_XDECREF(exc); Py_XDECREF(errorHandler); return res; @@ -5483,20 +5461,14 @@ PyUnicode_Join(PyObject *separator, PyObject *seq) item = PySequence_Fast_GET_ITEM(fseq, i); /* Convert item to Unicode. */ - if (!PyString_Check(item) && !PyUnicode_Check(item)) - { - if (PyBytes_Check(item)) - { - PyErr_Format(PyExc_TypeError, - "sequence item %d: join() will not operate on " - "bytes objects", i); - goto onError; - } - item = PyObject_Unicode(item); + if (!PyUnicode_Check(item)) { + PyErr_Format(PyExc_TypeError, + "sequence item %zd: expected str instance," + " %.80s found", + i, Py_Type(item)->tp_name); + goto onError; } - else - item = PyUnicode_FromObject(item); - + item = PyUnicode_FromObject(item); if (item == NULL) goto onError; /* We own a reference to item from here on. */ @@ -6396,9 +6368,6 @@ PyObject *PyUnicode_Concat(PyObject *left, { PyUnicodeObject *u = NULL, *v = NULL, *w; - if (PyBytes_Check(left) || PyBytes_Check(right)) - return PyBytes_Concat(left, right); - /* Coerce the two arguments */ u = (PyUnicodeObject *)PyUnicode_FromObject(left); if (u == NULL) @@ -6515,7 +6484,7 @@ unicode_encode(PyUnicodeObject *self, PyObject *args) v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors); if (v == NULL) goto onError; - if (!PyBytes_Check(v)) { + if (!PyString_Check(v)) { PyErr_Format(PyExc_TypeError, "encoder did not return a bytes object " "(type=%.400s)", @@ -8232,12 +8201,6 @@ getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx) return NULL; } -#define F_LJUST (1<<0) -#define F_SIGN (1<<1) -#define F_BLANK (1<<2) -#define F_ALT (1<<3) -#define F_ZERO (1<<4) - static Py_ssize_t strtounicode(Py_UNICODE *buffer, const char *charbuffer) { |