summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Marc-André Lemburg <mal@egenix.com> 2008-06-06 12:18:17 (GMT)
committer Marc-André Lemburg <mal@egenix.com> 2008-06-06 12:18:17 (GMT)
commit b2750b5d334e9c8d262009069bce41c15803eca0 (patch)
tree 2c501adf96b37d3afbc32e6fdc344fade85cf3d5
parent 4efb518185d32d573ff65f11b94c6031340a018a (diff)
download cpython-b2750b5d334e9c8d262009069bce41c15803eca0.zip
cpython-b2750b5d334e9c8d262009069bce41c15803eca0.tar.gz
cpython-b2750b5d334e9c8d262009069bce41c15803eca0.tar.bz2
Move the codec decode type checks to bytes/bytearray.decode().
Use faster PyUnicode_FromEncodedObject() for bytes/bytearray.decode(). Add new PyCodec_KnownEncoding() API. Add new PyUnicode_AsDecodedUnicode() and PyUnicode_AsEncodedUnicode() APIs. Add missing PyUnicode_AsDecodedObject() to unicodeobject.h. Fix punycode codec to also work on memoryviews.
-rw-r--r-- Include/codecs.h 13
-rw-r--r-- Include/unicodeobject.h 36
-rw-r--r-- Lib/encodings/punycode.py 2
-rw-r--r-- Objects/bytearrayobject.c 4
-rw-r--r-- Objects/bytesobject.c 4
-rw-r--r-- Objects/unicodeobject.c 96
-rw-r--r-- Python/codecs.c 45
-rw-r--r-- Python/pythonrun.c 12
8 files changed, 171 insertions, 41 deletions
diff --git a/Include/codecs.h b/Include/codecs.h
index 0d76241..c979e86 100644
--- a/Include/codecs.h
+++ b/Include/codecs.h
@@ -27,7 +27,7 @@ PyAPI_FUNC(int) PyCodec_Register(
PyObject *search_function
);
-/* Codec register lookup API.
+/* Codec registry lookup API.
Looks up the given encoding and returns a CodecInfo object with
function attributes which implement the different aspects of
@@ -49,6 +49,17 @@ PyAPI_FUNC(PyObject *) _PyCodec_Lookup(
const char *encoding
);
+/* Codec registry encoding check API.
+
+ Returns 1/0 depending on whether there is a registered codec for
+ the given encoding.
+
+*/
+
+PyAPI_FUNC(int) PyCodec_KnownEncoding(
+ const char *encoding
+ );
+
/* Generic codec based encoding API.
object is passed through the encoder function found for the given
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 384cd55..7af2eba 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -139,8 +139,11 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
+# define PyUnicode_AsDecodedObject PyUnicodeUCS2_AsDecodedObject
+# define PyUnicode_AsDecodedUnicode PyUnicodeUCS2_AsDecodedUnicode
# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
+# define PyUnicode_AsEncodedUnicode PyUnicodeUCS2_AsEncodedUnicode
# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
@@ -233,8 +236,11 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
+# define PyUnicode_AsDecodedObject PyUnicodeUCS4_AsDecodedObject
+# define PyUnicode_AsDecodedUnicode PyUnicodeUCS4_AsDecodedUnicode
# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
+# define PyUnicode_AsEncodedUnicode PyUnicodeUCS4_AsEncodedUnicode
# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
@@ -744,6 +750,24 @@ PyAPI_FUNC(PyObject*) PyUnicode_Decode(
const char *errors /* error handling */
);
+/* Decodes a Unicode object and returns the result as a Python
+ object. */
+
+PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
+ PyObject *unicode, /* Unicode object */
+ const char *encoding, /* encoding */
+ const char *errors /* error handling */
+ );
+
+/* Decodes a Unicode object and returns the result as a Unicode
+ object. */
+
+PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
+ PyObject *unicode, /* Unicode object */
+ const char *encoding, /* encoding */
+ const char *errors /* error handling */
+ );
+
/* Encodes a Py_UNICODE buffer of the given size and returns a
Python string object. */
@@ -772,11 +796,21 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
const char *errors /* error handling */
);
+/* Encodes a Unicode object and returns the result as Unicode
+ object. */
+
+PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
+ PyObject *unicode, /* Unicode object */
+ const char *encoding, /* encoding */
+ const char *errors /* error handling */
+ );
+
+/* Build an encoding map. */
+
PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
PyObject* string /* 256 character map */
);
-
/* --- UTF-7 Codecs ------------------------------------------------------- */
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py
index b801a46..8129af2 100644
--- a/Lib/encodings/punycode.py
+++ b/Lib/encodings/punycode.py
@@ -183,6 +183,8 @@ def insertion_sort(base, extended, errors):
def punycode_decode(text, errors):
if isinstance(text, str):
text = text.encode("ascii")
+ if isinstance(text, memoryview):
+ text = bytes(text)
pos = text.rfind(b"-")
if pos == -1:
base = ""
diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c
index 75a8eef..70921c0 100644
--- a/Objects/bytearrayobject.c
+++ b/Objects/bytearrayobject.c
@@ -725,7 +725,7 @@ bytes_init(PyByteArrayObject *self, PyObject *args, PyObject *kwds)
"string argument without an encoding");
return -1;
}
- encoded = PyCodec_Encode(arg, encoding, errors);
+ encoded = PyUnicode_AsEncodedString(arg, encoding, errors);
if (encoded == NULL)
return -1;
assert(PyBytes_Check(encoded));
@@ -2854,7 +2854,7 @@ bytes_decode(PyObject *self, PyObject *args)
return NULL;
if (encoding == NULL)
encoding = PyUnicode_GetDefaultEncoding();
- return PyCodec_Decode(self, encoding, errors);
+ return PyUnicode_FromEncodedObject(self, encoding, errors);
}
PyDoc_STRVAR(alloc_doc,
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c
index ab6207b..471d09c 100644
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
@@ -2713,7 +2713,7 @@ string_decode(PyObject *self, PyObject *args)
return NULL;
if (encoding == NULL)
encoding = PyUnicode_GetDefaultEncoding();
- return PyCodec_Decode(self, encoding, errors);
+ return PyUnicode_FromEncodedObject(self, encoding, errors);
}
@@ -2899,7 +2899,7 @@ string_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
"string argument without an encoding");
return NULL;
}
- new = PyCodec_Encode(x, encoding, errors);
+ new = PyUnicode_AsEncodedString(x, encoding, errors);
if (new == NULL)
return NULL;
assert(PyBytes_Check(new));
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 78e38b5..fc8c8a9 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1099,14 +1099,18 @@ PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
/* Coerce object */
if (PyBytes_Check(obj)) {
- s = PyBytes_AS_STRING(obj);
- len = PyBytes_GET_SIZE(obj);
- }
+ s = PyBytes_AS_STRING(obj);
+ len = PyBytes_GET_SIZE(obj);
+ }
+ else if (PyByteArray_Check(obj)) {
+ s = PyByteArray_AS_STRING(obj);
+ len = PyByteArray_GET_SIZE(obj);
+ }
else if (PyObject_AsCharBuffer(obj, &s, &len)) {
/* Overwrite the error message with something more useful in
case of a TypeError. */
if (PyErr_ExceptionMatches(PyExc_TypeError))
- PyErr_Format(PyExc_TypeError,
+ PyErr_Format(PyExc_TypeError,
"coercing to Unicode: need string or buffer, "
"%.80s found",
Py_TYPE(obj)->tp_name);
@@ -1188,7 +1192,7 @@ PyObject *PyUnicode_Decode(const char *s,
goto onError;
if (!PyUnicode_Check(unicode)) {
PyErr_Format(PyExc_TypeError,
- "decoder did not return an unicode object (type=%.400s)",
+ "decoder did not return a unicode object (type=%.400s)",
Py_TYPE(unicode)->tp_name);
Py_DECREF(unicode);
goto onError;
@@ -1225,6 +1229,37 @@ PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
return NULL;
}
+PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
+ const char *encoding,
+ const char *errors)
+{
+ PyObject *v;
+
+ if (!PyUnicode_Check(unicode)) {
+ PyErr_BadArgument();
+ goto onError;
+ }
+
+ if (encoding == NULL)
+ encoding = PyUnicode_GetDefaultEncoding();
+
+ /* Decode via the codec registry */
+ v = PyCodec_Decode(unicode, encoding, errors);
+ if (v == NULL)
+ goto onError;
+ if (!PyUnicode_Check(v)) {
+ PyErr_Format(PyExc_TypeError,
+ "decoder did not return a unicode object (type=%.400s)",
+ Py_TYPE(v)->tp_name);
+ Py_DECREF(v);
+ goto onError;
+ }
+ return v;
+
+ onError:
+ return NULL;
+}
+
PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Py_ssize_t size,
const char *encoding,
@@ -1296,7 +1331,54 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
v = PyCodec_Encode(unicode, encoding, errors);
if (v == NULL)
goto onError;
- assert(PyBytes_Check(v));
+ if (PyByteArray_Check(v)) {
+ char msg[100];
+ PyOS_snprintf(msg, sizeof(msg),
+ "encoder %s returned buffer instead of bytes",
+ encoding);
+ if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
+ v = NULL;
+ goto onError;
+ }
+ v = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
+ }
+ else if (!PyBytes_Check(v)) {
+ PyErr_Format(PyExc_TypeError,
+ "encoder did not return a bytes object (type=%.400s)",
+ Py_TYPE(v)->tp_name);
+ v = NULL;
+ }
+ return v;
+
+ onError:
+ return NULL;
+}
+
+PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
+ const char *encoding,
+ const char *errors)
+{
+ PyObject *v;
+
+ if (!PyUnicode_Check(unicode)) {
+ PyErr_BadArgument();
+ goto onError;
+ }
+
+ if (encoding == NULL)
+ encoding = PyUnicode_GetDefaultEncoding();
+
+ /* Encode via the codec registry */
+ v = PyCodec_Encode(unicode, encoding, errors);
+ if (v == NULL)
+ goto onError;
+ if (!PyUnicode_Check(v)) {
+ PyErr_Format(PyExc_TypeError,
+ "encoder did not return an unicode object (type=%.400s)",
+ Py_TYPE(v)->tp_name);
+ Py_DECREF(v);
+ goto onError;
+ }
return v;
onError:
@@ -6617,7 +6699,7 @@ unicode_encode(PyUnicodeObject *self, PyObject *args)
if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
return NULL;
- v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
+ v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
if (v == NULL)
goto onError;
if (!PyBytes_Check(v)) {
diff --git a/Python/codecs.c b/Python/codecs.c
index 33f0733..66576c4 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -183,6 +183,23 @@ PyObject *_PyCodec_Lookup(const char *encoding)
return NULL;
}
+/* Codec registry encoding check API. */
+
+int PyCodec_KnownEncoding(const char *encoding)
+{
+ PyObject *codecs;
+
+ codecs = _PyCodec_Lookup(encoding);
+ if (!codecs) {
+ PyErr_Clear();
+ return 0;
+ }
+ else {
+ Py_DECREF(codecs);
+ return 1;
+ }
+}
+
static
PyObject *args_tuple(PyObject *object,
const char *errors)
@@ -344,32 +361,20 @@ PyObject *PyCodec_Encode(PyObject *object,
"encoder must return a tuple (object, integer)");
goto onError;
}
- v = PyTuple_GET_ITEM(result, 0);
- if (PyByteArray_Check(v)) {
- char msg[100];
- PyOS_snprintf(msg, sizeof(msg),
- "encoder %s returned buffer instead of bytes",
- encoding);
- if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
- v = NULL;
- goto onError;
- }
- v = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
- }
- else if (PyBytes_Check(v))
- Py_INCREF(v);
- else {
- PyErr_SetString(PyExc_TypeError,
- "encoding must return a tuple(bytes, integer)");
- v = NULL;
- }
+ v = PyTuple_GET_ITEM(result,0);
+ Py_INCREF(v);
/* We don't check or use the second (integer) entry. */
+ Py_DECREF(args);
+ Py_DECREF(encoder);
+ Py_DECREF(result);
+ return v;
+
onError:
Py_XDECREF(result);
Py_XDECREF(args);
Py_XDECREF(encoder);
- return v;
+ return NULL;
}
/* Decode an object (usually a Python string) using the given encoding
diff --git a/Python/pythonrun.c b/Python/pythonrun.c
index 7fe4cce..24517e4 100644
--- a/Python/pythonrun.c
+++ b/Python/pythonrun.c
@@ -261,14 +261,10 @@ Py_InitializeEx(int install_sigs)
codeset = nl_langinfo(CODESET);
if (codeset && *codeset) {
- PyObject *enc = PyCodec_Encoder(codeset);
- if (enc) {
- codeset = strdup(codeset);
- Py_DECREF(enc);
- } else {
- codeset = NULL;
- PyErr_Clear();
- }
+ if (PyCodec_KnownEncoding(codeset))
+ codeset = strdup(codeset);
+ else
+ codeset = NULL;
} else
codeset = NULL;