summaryrefslogtreecommitdiffstats
path: root/Objects/unicodeobject.c
diff options
context:
space:
mode:
Diffstat (limited to 'Objects/unicodeobject.c')
-rw-r--r--Objects/unicodeobject.c225
1 files changed, 94 insertions, 131 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index c568a8e..ae34c9e 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -101,7 +101,7 @@ extern "C" {
function will delete the reference from this dictionary.
Another way to look at this is that to say that the actual reference
- count of a string is: s->ob_refcnt + (s->ob_sstate?2:0)
+ count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;
@@ -998,7 +998,10 @@ PyObject *PyUnicode_FromObject(register PyObject *obj)
return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
PyUnicode_GET_SIZE(obj));
}
- return PyUnicode_FromEncodedObject(obj, NULL, "strict");
+ PyErr_Format(PyExc_TypeError,
+ "Can't convert '%.100s' object to str implicitly",
+ Py_Type(obj)->tp_name);
+ return NULL;
}
PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
@@ -1219,22 +1222,7 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
v = PyCodec_Encode(unicode, encoding, errors);
if (v == NULL)
goto onError;
- if (!PyBytes_Check(v)) {
- if (PyString_Check(v)) {
- /* Old codec, turn it into bytes */
- PyObject *b = PyBytes_FromObject(v);
- Py_DECREF(v);
- return b;
- }
- PyErr_Format(PyExc_TypeError,
- "encoder did not return a bytes object "
- "(type=%.400s, encoding=%.20s, errors=%.20s)",
- v->ob_type->tp_name,
- encoding ? encoding : "NULL",
- errors ? errors : "NULL");
- Py_DECREF(v);
- goto onError;
- }
+ assert(PyString_Check(v));
return v;
onError:
@@ -1245,19 +1233,15 @@ PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
const char *errors)
{
PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
- PyObject *b;
if (v)
return v;
if (errors != NULL)
Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
- b = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
+ v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
PyUnicode_GET_SIZE(unicode),
NULL);
- if (!b)
+ if (!v)
return NULL;
- v = PyString_FromStringAndSize(PyBytes_AsString(b),
- PyBytes_Size(b));
- Py_DECREF(b);
((PyUnicodeObject *)unicode)->defenc = v;
return v;
}
@@ -1420,11 +1404,11 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
if (!inputobj)
goto onError;
- if (!PyBytes_Check(inputobj)) {
+ if (!PyString_Check(inputobj)) {
PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
}
- *input = PyBytes_AS_STRING(inputobj);
- insize = PyBytes_GET_SIZE(inputobj);
+ *input = PyString_AS_STRING(inputobj);
+ insize = PyString_GET_SIZE(inputobj);
*inend = *input + insize;
/* we can DECREF safely, as the exception has another reference,
so the object won't go away. */
@@ -1674,7 +1658,7 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
int encodeWhiteSpace,
const char *errors)
{
- PyObject *v;
+ PyObject *v, *result;
/* It might be possible to tighten this worst case */
Py_ssize_t cbAllocated = 5 * size;
int inShift = 0;
@@ -1685,7 +1669,7 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
char * start;
if (size == 0)
- return PyBytes_FromStringAndSize(NULL, 0);
+ return PyString_FromStringAndSize(NULL, 0);
v = PyBytes_FromStringAndSize(NULL, cbAllocated);
if (v == NULL)
@@ -1757,11 +1741,9 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
*out++ = '-';
}
- if (PyBytes_Resize(v, out - start)) {
- Py_DECREF(v);
- return NULL;
- }
- return v;
+ result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), out - start);
+ Py_DECREF(v);
+ return result;
}
#undef SPECIAL
@@ -2001,11 +1983,11 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
{
#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
- Py_ssize_t i; /* index into s of next input byte */
- PyObject *v; /* result string object */
- char *p; /* next free byte in output buffer */
- Py_ssize_t nallocated; /* number of result bytes allocated */
- Py_ssize_t nneeded; /* number of result bytes needed */
+ Py_ssize_t i; /* index into s of next input byte */
+ PyObject *result; /* result string object */
+ char *p; /* next free byte in output buffer */
+ Py_ssize_t nallocated; /* number of result bytes allocated */
+ Py_ssize_t nneeded; /* number of result bytes needed */
char stackbuf[MAX_SHORT_UNICHARS * 4];
assert(s != NULL);
@@ -2017,7 +1999,7 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
* turns out we need.
*/
nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
- v = NULL; /* will allocate after we're done */
+ result = NULL; /* will allocate after we're done */
p = stackbuf;
}
else {
@@ -2025,10 +2007,10 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
nallocated = size * 4;
if (nallocated / 4 != size) /* overflow! */
return PyErr_NoMemory();
- v = PyBytes_FromStringAndSize(NULL, nallocated);
- if (v == NULL)
+ result = PyString_FromStringAndSize(NULL, nallocated);
+ if (result == NULL)
return NULL;
- p = PyBytes_AS_STRING(v);
+ p = PyString_AS_STRING(result);
}
for (i = 0; i < size;) {
@@ -2072,19 +2054,19 @@ encodeUCS4:
}
}
- if (v == NULL) {
+ if (result == NULL) {
/* This was stack allocated. */
nneeded = p - stackbuf;
assert(nneeded <= nallocated);
- v = PyBytes_FromStringAndSize(stackbuf, nneeded);
+ result = PyString_FromStringAndSize(stackbuf, nneeded);
}
else {
/* Cut back to size actually needed. */
- nneeded = p - PyBytes_AS_STRING(v);
+ nneeded = p - PyString_AS_STRING(result);
assert(nneeded <= nallocated);
- PyBytes_Resize(v, nneeded);
+ _PyString_Resize(&result, nneeded);
}
- return v;
+ return result;
#undef MAX_SHORT_UNICHARS
}
@@ -2279,7 +2261,7 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s,
const char *errors,
int byteorder)
{
- PyObject *v;
+ PyObject *v, *result;
unsigned char *p;
#ifndef Py_UNICODE_WIDE
int i, pairs;
@@ -2319,7 +2301,7 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s,
if (byteorder == 0)
STORECHAR(0xFEFF);
if (size == 0)
- return v;
+ goto done;
if (byteorder == -1) {
/* force LE */
@@ -2350,7 +2332,11 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s,
#endif
STORECHAR(ch);
}
- return v;
+
+ done:
+ result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_Size(v));
+ Py_DECREF(v);
+ return result;
#undef STORECHAR
}
@@ -2549,7 +2535,7 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s,
const char *errors,
int byteorder)
{
- PyObject *v;
+ PyObject *v, *result;
unsigned char *p;
#ifdef Py_UNICODE_WIDE
int i, pairs;
@@ -2584,7 +2570,7 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s,
if (byteorder == 0)
STORECHAR(0xFEFF);
if (size == 0)
- return v;
+ goto done;
if (byteorder == -1) {
/* force LE */
@@ -2610,7 +2596,11 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s,
if (ch2)
STORECHAR(ch2);
}
- return v;
+
+ done:
+ result = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_Size(v));
+ Py_DECREF(v);
+ return result;
#undef STORECHAR
}
@@ -2900,7 +2890,7 @@ static const char *hexdigits = "0123456789abcdef";
PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Py_ssize_t size)
{
- PyObject *repr;
+ PyObject *repr, *result;
char *p;
/* XXX(nnorwitz): rather than over-allocating, it would be
@@ -3023,12 +3013,10 @@ PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
*p++ = (char) ch;
}
- *p = '\0';
- if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
- Py_DECREF(repr);
- return NULL;
- }
- return repr;
+ result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr),
+ p - PyBytes_AS_STRING(repr));
+ Py_DECREF(repr);
+ return result;
}
PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
@@ -3159,7 +3147,7 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Py_ssize_t size)
{
- PyObject *repr;
+ PyObject *repr, *result;
char *p;
char *q;
@@ -3171,7 +3159,7 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
if (repr == NULL)
return NULL;
if (size == 0)
- return repr;
+ goto done;
p = q = PyBytes_AS_STRING(repr);
while (size-- > 0) {
@@ -3205,12 +3193,12 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
else
*p++ = (char) ch;
}
- *p = '\0';
- if (PyBytes_Resize(repr, p - q)) {
- Py_DECREF(repr);
- return NULL;
- }
- return repr;
+ size = p - q;
+
+ done:
+ result = PyString_FromStringAndSize(PyBytes_AS_STRING(repr), size);
+ Py_DECREF(repr);
+ return result;
}
PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
@@ -3445,23 +3433,23 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
/* pointer into the output */
char *str;
/* current output position */
- Py_ssize_t respos = 0;
Py_ssize_t ressize;
const char *encoding = (limit == 256) ? "latin-1" : "ascii";
const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
+ PyObject *result = NULL;
/* the following variable is used for caching string comparisons
* -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
int known_errorHandler = -1;
/* allocate enough for a simple encoding without
replacements, if we need more, we'll resize */
+ if (size == 0)
+ return PyString_FromStringAndSize(NULL, 0);
res = PyBytes_FromStringAndSize(NULL, size);
if (res == NULL)
- goto onError;
- if (size == 0)
- return res;
+ return NULL;
str = PyBytes_AS_STRING(res);
ressize = size;
@@ -3589,20 +3577,13 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
}
}
}
- /* Resize if we allocated to much */
- respos = str - PyBytes_AS_STRING(res);
- if (respos<ressize)
- /* If this falls res will be NULL */
- PyBytes_Resize(res, respos);
- Py_XDECREF(errorHandler);
- Py_XDECREF(exc);
- return res;
-
- onError:
- Py_XDECREF(res);
+ result = PyString_FromStringAndSize(PyBytes_AS_STRING(res),
+ str - PyBytes_AS_STRING(res));
+ onError:
+ Py_DECREF(res);
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
- return NULL;
+ return result;
}
PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
@@ -3848,20 +3829,20 @@ static int encode_mbcs(PyObject **repr,
if (*repr == NULL) {
/* Create string object */
- *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
+ *repr = PyString_FromStringAndSize(NULL, mbcssize);
if (*repr == NULL)
return -1;
}
else {
/* Extend string object */
- n = PyBytes_Size(*repr);
- if (PyBytes_Resize(*repr, n + mbcssize) < 0)
+ n = PyString_Size(*repr);
+ if (_PyString_Resize(repr, n + mbcssize) < 0)
return -1;
}
/* Do the conversion */
if (size > 0) {
- char *s = PyBytes_AS_STRING(*repr) + n;
+ char *s = PyString_AS_STRING(*repr) + n;
if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
PyErr_SetFromWindowsErrWithFilename(0, NULL);
return -1;
@@ -4341,16 +4322,14 @@ static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
}
static int
-charmapencode_resize(PyObject *outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
+charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
{
- Py_ssize_t outsize = PyBytes_GET_SIZE( outobj);
+ Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
/* exponentially overallocate to minimize reallocations */
if (requiredsize < 2*outsize)
requiredsize = 2*outsize;
- if (PyBytes_Resize(outobj, requiredsize)) {
- Py_DECREF(outobj);
+ if (_PyString_Resize(outobj, requiredsize))
return -1;
- }
return 0;
}
@@ -4365,21 +4344,21 @@ typedef enum charmapencode_result {
reallocation error occurred. The caller must decref the result */
static
charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
- PyObject *outobj, Py_ssize_t *outpos)
+ PyObject **outobj, Py_ssize_t *outpos)
{
PyObject *rep;
char *outstart;
- Py_ssize_t outsize = PyBytes_GET_SIZE(outobj);
+ Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
if (Py_Type(mapping) == &EncodingMapType) {
int res = encoding_map_lookup(c, mapping);
Py_ssize_t requiredsize = *outpos+1;
if (res == -1)
return enc_FAILED;
- if (outsize<requiredsize)
+ if (outsize<requiredsize)
if (charmapencode_resize(outobj, outpos, requiredsize))
return enc_EXCEPTION;
- outstart = PyBytes_AS_STRING(outobj);
+ outstart = PyString_AS_STRING(*outobj);
outstart[(*outpos)++] = (char)res;
return enc_SUCCESS;
}
@@ -4398,7 +4377,7 @@ charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Py_DECREF(rep);
return enc_EXCEPTION;
}
- outstart = PyBytes_AS_STRING(outobj);
+ outstart = PyString_AS_STRING(*outobj);
outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
}
else {
@@ -4410,7 +4389,7 @@ charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Py_DECREF(rep);
return enc_EXCEPTION;
}
- outstart = PyBytes_AS_STRING(outobj);
+ outstart = PyString_AS_STRING(*outobj);
memcpy(outstart + *outpos, repchars, repsize);
*outpos += repsize;
}
@@ -4426,7 +4405,7 @@ int charmap_encoding_error(
const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
PyObject **exceptionObject,
int *known_errorHandler, PyObject **errorHandler, const char *errors,
- PyObject *res, Py_ssize_t *respos)
+ PyObject **res, Py_ssize_t *respos)
{
PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Py_ssize_t repsize;
@@ -4561,7 +4540,7 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
/* allocate enough for a simple encoding without
replacements, if we need more, we'll resize */
- res = PyBytes_FromStringAndSize(NULL, size);
+ res = PyString_FromStringAndSize(NULL, size);
if (res == NULL)
goto onError;
if (size == 0)
@@ -4569,14 +4548,14 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
while (inpos<size) {
/* try to encode it */
- charmapencode_result x = charmapencode_output(p[inpos], mapping, res, &respos);
+ charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
if (x==enc_EXCEPTION) /* error */
goto onError;
if (x==enc_FAILED) { /* unencodable character */
if (charmap_encoding_error(p, size, &inpos, mapping,
&exc,
&known_errorHandler, &errorHandler, errors,
- res, &respos)) {
+ &res, &respos)) {
goto onError;
}
}
@@ -4586,10 +4565,9 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
}
/* Resize if we allocated to much */
- if (respos<PyBytes_GET_SIZE(res)) {
- if (PyBytes_Resize(res, respos))
- goto onError;
- }
+ if (respos<PyString_GET_SIZE(res))
+ _PyString_Resize(&res, respos);
+
Py_XDECREF(exc);
Py_XDECREF(errorHandler);
return res;
@@ -5483,20 +5461,14 @@ PyUnicode_Join(PyObject *separator, PyObject *seq)
item = PySequence_Fast_GET_ITEM(fseq, i);
/* Convert item to Unicode. */
- if (!PyString_Check(item) && !PyUnicode_Check(item))
- {
- if (PyBytes_Check(item))
- {
- PyErr_Format(PyExc_TypeError,
- "sequence item %d: join() will not operate on "
- "bytes objects", i);
- goto onError;
- }
- item = PyObject_Unicode(item);
+ if (!PyUnicode_Check(item)) {
+ PyErr_Format(PyExc_TypeError,
+ "sequence item %zd: expected str instance,"
+ " %.80s found",
+ i, Py_Type(item)->tp_name);
+ goto onError;
}
- else
- item = PyUnicode_FromObject(item);
-
+ item = PyUnicode_FromObject(item);
if (item == NULL)
goto onError;
/* We own a reference to item from here on. */
@@ -6396,9 +6368,6 @@ PyObject *PyUnicode_Concat(PyObject *left,
{
PyUnicodeObject *u = NULL, *v = NULL, *w;
- if (PyBytes_Check(left) || PyBytes_Check(right))
- return PyBytes_Concat(left, right);
-
/* Coerce the two arguments */
u = (PyUnicodeObject *)PyUnicode_FromObject(left);
if (u == NULL)
@@ -6515,7 +6484,7 @@ unicode_encode(PyUnicodeObject *self, PyObject *args)
v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
if (v == NULL)
goto onError;
- if (!PyBytes_Check(v)) {
+ if (!PyString_Check(v)) {
PyErr_Format(PyExc_TypeError,
"encoder did not return a bytes object "
"(type=%.400s)",
@@ -8232,12 +8201,6 @@ getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
return NULL;
}
-#define F_LJUST (1<<0)
-#define F_SIGN (1<<1)
-#define F_BLANK (1<<2)
-#define F_ALT (1<<3)
-#define F_ZERO (1<<4)
-
static Py_ssize_t
strtounicode(Py_UNICODE *buffer, const char *charbuffer)
{