summaryrefslogtreecommitdiffstats
path: root/Python/codecs.c
diff options
context:
space:
mode:
Diffstat (limited to 'Python/codecs.c')
-rw-r--r--Python/codecs.c938
1 files changed, 226 insertions, 712 deletions
diff --git a/Python/codecs.c b/Python/codecs.c
index 08e9b91..dbecd1d 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -9,12 +9,8 @@ Copyright (c) Corporation for National Research Initiatives.
------------------------------------------------------------------------ */
#include "Python.h"
-#include "pycore_pystate.h"
-#include "ucnhash.h"
#include <ctype.h>
-const char *Py_hexdigits = "0123456789abcdef";
-
/* --- Codec Registry ----------------------------------------------------- */
/* Import the standard encodings package which will register the first
@@ -32,7 +28,7 @@ static int _PyCodecRegistry_Init(void); /* Forward */
int PyCodec_Register(PyObject *search_function)
{
- PyInterpreterState *interp = _PyInterpreterState_Get();
+ PyInterpreterState *interp = PyThreadState_GET()->interp;
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
goto onError;
if (search_function == NULL) {
@@ -49,16 +45,15 @@ int PyCodec_Register(PyObject *search_function)
return -1;
}
-extern int _Py_normalize_encoding(const char *, char *, size_t);
-
-/* Convert a string to a normalized Python string(decoded from UTF-8): all characters are
- converted to lower case, spaces and hyphens are replaced with underscores. */
+/* Convert a string to a normalized Python string: all characters are
+ converted to lower case, spaces are replaced with underscores. */
static
PyObject *normalizestring(const char *string)
{
+ register size_t i;
size_t len = strlen(string);
- char *encoding;
+ char *p;
PyObject *v;
if (len > PY_SSIZE_T_MAX) {
@@ -66,19 +61,18 @@ PyObject *normalizestring(const char *string)
return NULL;
}
- encoding = PyMem_Malloc(len + 1);
- if (encoding == NULL)
- return PyErr_NoMemory();
-
- if (!_Py_normalize_encoding(string, encoding, len + 1))
- {
- PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
- PyMem_Free(encoding);
+ v = PyString_FromStringAndSize(NULL, len);
+ if (v == NULL)
return NULL;
+ p = PyString_AS_STRING(v);
+ for (i = 0; i < len; i++) {
+ register char ch = string[i];
+ if (ch == ' ')
+ ch = '-';
+ else
+ ch = Py_TOLOWER(Py_CHARMASK(ch));
+ p[i] = ch;
}
-
- v = PyUnicode_FromString(encoding);
- PyMem_Free(encoding);
return v;
}
@@ -99,38 +93,42 @@ PyObject *normalizestring(const char *string)
PyObject *_PyCodec_Lookup(const char *encoding)
{
+ PyInterpreterState *interp;
+ PyObject *result, *args = NULL, *v;
+ Py_ssize_t i, len;
+
if (encoding == NULL) {
PyErr_BadArgument();
- return NULL;
+ goto onError;
}
- PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
- if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) {
- return NULL;
- }
+ interp = PyThreadState_GET()->interp;
+ if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
+ goto onError;
/* Convert the encoding to a normalized Python string: all
characters are converted to lower case, spaces and hyphens are
replaced with underscores. */
- PyObject *v = normalizestring(encoding);
- if (v == NULL) {
- return NULL;
- }
- PyUnicode_InternInPlace(&v);
+ v = normalizestring(encoding);
+ if (v == NULL)
+ goto onError;
+ PyString_InternInPlace(&v);
/* First, try to lookup the name in the registry dictionary */
- PyObject *result = PyDict_GetItemWithError(interp->codec_search_cache, v);
+ result = PyDict_GetItem(interp->codec_search_cache, v);
if (result != NULL) {
Py_INCREF(result);
Py_DECREF(v);
return result;
}
- else if (PyErr_Occurred()) {
- goto onError;
- }
/* Next, scan the search functions in order of registration */
- const Py_ssize_t len = PyList_Size(interp->codec_search_path);
+ args = PyTuple_New(1);
+ if (args == NULL)
+ goto onError;
+ PyTuple_SET_ITEM(args,0,v);
+
+ len = PyList_Size(interp->codec_search_path);
if (len < 0)
goto onError;
if (len == 0) {
@@ -140,14 +138,13 @@ PyObject *_PyCodec_Lookup(const char *encoding)
goto onError;
}
- Py_ssize_t i;
for (i = 0; i < len; i++) {
PyObject *func;
func = PyList_GetItem(interp->codec_search_path, i);
if (func == NULL)
goto onError;
- result = _PyObject_CallOneArg(func, v);
+ result = PyEval_CallObject(func, args);
if (result == NULL)
goto onError;
if (result == Py_None) {
@@ -170,60 +167,15 @@ PyObject *_PyCodec_Lookup(const char *encoding)
}
/* Cache and return the result */
- if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
- Py_DECREF(result);
- goto onError;
- }
- Py_DECREF(v);
+ PyDict_SetItem(interp->codec_search_cache, v, result);
+ Py_DECREF(args);
return result;
onError:
- Py_DECREF(v);
+ Py_XDECREF(args);
return NULL;
}
-int _PyCodec_Forget(const char *encoding)
-{
- PyObject *v;
- int result;
-
- PyInterpreterState *interp = _PyInterpreterState_Get();
- if (interp->codec_search_path == NULL) {
- return -1;
- }
-
- /* Convert the encoding to a normalized Python string: all
- characters are converted to lower case, spaces and hyphens are
- replaced with underscores. */
- v = normalizestring(encoding);
- if (v == NULL) {
- return -1;
- }
-
- /* Drop the named codec from the internal cache */
- result = PyDict_DelItem(interp->codec_search_cache, v);
- Py_DECREF(v);
-
- return result;
-}
-
-/* Codec registry encoding check API. */
-
-int PyCodec_KnownEncoding(const char *encoding)
-{
- PyObject *codecs;
-
- codecs = _PyCodec_Lookup(encoding);
- if (!codecs) {
- PyErr_Clear();
- return 0;
- }
- else {
- Py_DECREF(codecs);
- return 1;
- }
-}
-
static
PyObject *args_tuple(PyObject *object,
const char *errors)
@@ -238,7 +190,7 @@ PyObject *args_tuple(PyObject *object,
if (errors) {
PyObject *v;
- v = PyUnicode_FromString(errors);
+ v = PyString_FromString(errors);
if (v == NULL) {
Py_DECREF(args);
return NULL;
@@ -279,7 +231,7 @@ PyObject *codec_makeincrementalcodec(PyObject *codec_info,
if (errors)
ret = PyObject_CallFunction(inccodec, "s", errors);
else
- ret = _PyObject_CallNoArg(inccodec);
+ ret = PyObject_CallFunction(inccodec, NULL);
Py_DECREF(inccodec);
return ret;
}
@@ -317,7 +269,7 @@ PyObject *codec_getstreamcodec(const char *encoding,
if (errors != NULL)
streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
else
- streamcodec = _PyObject_CallOneArg(codeccls, stream);
+ streamcodec = PyObject_CallFunction(codeccls, "O", stream);
Py_DECREF(codecs);
return streamcodec;
}
@@ -382,22 +334,6 @@ PyObject *PyCodec_StreamWriter(const char *encoding,
return codec_getstreamcodec(encoding, stream, errors, 3);
}
-/* Helper that tries to ensure the reported exception chain indicates the
- * codec that was invoked to trigger the failure without changing the type
- * of the exception raised.
- */
-static void
-wrap_codec_error(const char *operation,
- const char *encoding)
-{
- /* TrySetFromCause will replace the active exception with a suitably
- * updated clone if it can, otherwise it will leave the original
- * exception alone.
- */
- _PyErr_TrySetFromCause("%s with '%s' codec failed",
- operation, encoding);
-}
-
/* Encode an object (e.g. a Unicode object) using the given encoding
and return the resulting encoded object (usually a Python string).
@@ -410,22 +346,20 @@ _PyCodec_EncodeInternal(PyObject *object,
const char *errors)
{
PyObject *args = NULL, *result = NULL;
- PyObject *v = NULL;
+ PyObject *v;
args = args_tuple(object, errors);
if (args == NULL)
goto onError;
- result = PyObject_Call(encoder, args, NULL);
- if (result == NULL) {
- wrap_codec_error("encoding", encoding);
+ result = PyEval_CallObject(encoder,args);
+ if (result == NULL)
goto onError;
- }
if (!PyTuple_Check(result) ||
PyTuple_GET_SIZE(result) != 2) {
PyErr_SetString(PyExc_TypeError,
- "encoder must return a tuple (object, integer)");
+ "encoder must return a tuple (object,integer)");
goto onError;
}
v = PyTuple_GET_ITEM(result,0);
@@ -462,11 +396,9 @@ _PyCodec_DecodeInternal(PyObject *object,
if (args == NULL)
goto onError;
- result = PyObject_Call(decoder, args, NULL);
- if (result == NULL) {
- wrap_codec_error("decoding", encoding);
+ result = PyEval_CallObject(decoder,args);
+ if (result == NULL)
goto onError;
- }
if (!PyTuple_Check(result) ||
PyTuple_GET_SIZE(result) != 2) {
PyErr_SetString(PyExc_TypeError,
@@ -520,7 +452,6 @@ PyObject *PyCodec_Decode(PyObject *object,
PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
const char *alternate_command)
{
- _Py_IDENTIFIER(_is_text_encoding);
PyObject *codec;
PyObject *attr;
int is_text_codec;
@@ -533,28 +464,39 @@ PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
* encoding, and the same for anything lacking the private
* attribute.
*/
- if (!PyTuple_CheckExact(codec)) {
- if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) {
- Py_DECREF(codec);
- return NULL;
- }
- if (attr != NULL) {
+ if (Py_Py3kWarningFlag && !PyTuple_CheckExact(codec)) {
+ attr = PyObject_GetAttrString(codec, "_is_text_encoding");
+ if (attr == NULL) {
+ if (!PyErr_ExceptionMatches(PyExc_AttributeError))
+ goto onError;
+ PyErr_Clear();
+ } else {
is_text_codec = PyObject_IsTrue(attr);
Py_DECREF(attr);
- if (is_text_codec <= 0) {
- Py_DECREF(codec);
- if (!is_text_codec)
- PyErr_Format(PyExc_LookupError,
- "'%.400s' is not a text encoding; "
- "use %s to handle arbitrary codecs",
- encoding, alternate_command);
- return NULL;
+ if (is_text_codec < 0)
+ goto onError;
+ if (!is_text_codec) {
+ PyObject *msg = PyString_FromFormat(
+ "'%.400s' is not a text encoding; "
+ "use %s to handle arbitrary codecs",
+ encoding, alternate_command);
+ if (msg == NULL)
+ goto onError;
+ if (PyErr_WarnPy3k(PyString_AS_STRING(msg), 1) < 0) {
+ Py_DECREF(msg);
+ goto onError;
+ }
+ Py_DECREF(msg);
}
}
}
/* This appears to be a valid text encoding */
return codec;
+
+ onError:
+ Py_DECREF(codec);
+ return NULL;
}
@@ -620,7 +562,7 @@ PyObject *_PyCodec_DecodeText(PyObject *object,
Return 0 on success, -1 on error */
int PyCodec_RegisterError(const char *name, PyObject *error)
{
- PyInterpreterState *interp = _PyInterpreterState_Get();
+ PyInterpreterState *interp = PyThreadState_GET()->interp;
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
return -1;
if (!PyCallable_Check(error)) {
@@ -628,7 +570,7 @@ int PyCodec_RegisterError(const char *name, PyObject *error)
return -1;
}
return PyDict_SetItemString(interp->codec_error_registry,
- name, error);
+ (char *)name, error);
}
/* Lookup the error handling callback function registered under the
@@ -638,27 +580,37 @@ PyObject *PyCodec_LookupError(const char *name)
{
PyObject *handler = NULL;
- PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
+ PyInterpreterState *interp = PyThreadState_GET()->interp;
if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
return NULL;
if (name==NULL)
name = "strict";
- handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
- if (handler) {
- Py_INCREF(handler);
- }
- else if (!PyErr_Occurred()) {
+ handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
+ if (!handler)
PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
- }
+ else
+ Py_INCREF(handler);
return handler;
}
static void wrong_exception_type(PyObject *exc)
{
- PyErr_Format(PyExc_TypeError,
- "don't know how to handle %.200s in error callback",
- exc->ob_type->tp_name);
+ PyObject *type = PyObject_GetAttrString(exc, "__class__");
+ if (type != NULL) {
+ PyObject *name = PyObject_GetAttrString(type, "__name__");
+ Py_DECREF(type);
+ if (name != NULL) {
+ PyObject *string = PyObject_Str(name);
+ Py_DECREF(name);
+ if (string != NULL) {
+ PyErr_Format(PyExc_TypeError,
+ "don't know how to handle %.400s in error callback",
+ PyString_AS_STRING(string));
+ Py_DECREF(string);
+ }
+ }
+ }
}
PyObject *PyCodec_StrictErrors(PyObject *exc)
@@ -671,6 +623,7 @@ PyObject *PyCodec_StrictErrors(PyObject *exc)
}
+#ifdef Py_USING_UNICODE
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
{
Py_ssize_t end;
@@ -691,58 +644,57 @@ PyObject *PyCodec_IgnoreErrors(PyObject *exc)
wrong_exception_type(exc);
return NULL;
}
- return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
+ /* ouch: passing NULL, 0, pos gives None instead of u'' */
+ return Py_BuildValue("(u#n)", &end, 0, end);
}
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
{
- Py_ssize_t start, end, i, len;
+ PyObject *restuple;
+ Py_ssize_t start;
+ Py_ssize_t end;
+ Py_ssize_t i;
if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
PyObject *res;
- int kind;
- void *data;
+ Py_UNICODE *p;
if (PyUnicodeEncodeError_GetStart(exc, &start))
return NULL;
if (PyUnicodeEncodeError_GetEnd(exc, &end))
return NULL;
- len = end - start;
- res = PyUnicode_New(len, '?');
+ res = PyUnicode_FromUnicode(NULL, end-start);
if (res == NULL)
return NULL;
- kind = PyUnicode_KIND(res);
- data = PyUnicode_DATA(res);
- for (i = 0; i < len; ++i)
- PyUnicode_WRITE(kind, data, i, '?');
- assert(_PyUnicode_CheckConsistency(res, 1));
- return Py_BuildValue("(Nn)", res, end);
+ for (p = PyUnicode_AS_UNICODE(res), i = start;
+ i<end; ++p, ++i)
+ *p = '?';
+ restuple = Py_BuildValue("(On)", res, end);
+ Py_DECREF(res);
+ return restuple;
}
else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
+ Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER;
if (PyUnicodeDecodeError_GetEnd(exc, &end))
return NULL;
- return Py_BuildValue("(Cn)",
- (int)Py_UNICODE_REPLACEMENT_CHARACTER,
- end);
+ return Py_BuildValue("(u#n)", &res, (Py_ssize_t)1, end);
}
else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
PyObject *res;
- int kind;
- void *data;
+ Py_UNICODE *p;
if (PyUnicodeTranslateError_GetStart(exc, &start))
return NULL;
if (PyUnicodeTranslateError_GetEnd(exc, &end))
return NULL;
- len = end - start;
- res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
+ res = PyUnicode_FromUnicode(NULL, end-start);
if (res == NULL)
return NULL;
- kind = PyUnicode_KIND(res);
- data = PyUnicode_DATA(res);
- for (i=0; i < len; i++)
- PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
- assert(_PyUnicode_CheckConsistency(res, 1));
- return Py_BuildValue("(Nn)", res, end);
+ for (p = PyUnicode_AS_UNICODE(res), i = start;
+ i<end; ++p, ++i)
+ *p = Py_UNICODE_REPLACEMENT_CHARACTER;
+ restuple = Py_BuildValue("(On)", res, end);
+ Py_DECREF(res);
+ return restuple;
}
else {
wrong_exception_type(exc);
@@ -755,74 +707,96 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
PyObject *restuple;
PyObject *object;
- Py_ssize_t i;
Py_ssize_t start;
Py_ssize_t end;
PyObject *res;
- unsigned char *outp;
+ Py_UNICODE *p;
+ Py_UNICODE *startp;
+ Py_UNICODE *e;
+ Py_UNICODE *outp;
Py_ssize_t ressize;
- Py_UCS4 ch;
if (PyUnicodeEncodeError_GetStart(exc, &start))
return NULL;
if (PyUnicodeEncodeError_GetEnd(exc, &end))
return NULL;
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
return NULL;
- if (end - start > PY_SSIZE_T_MAX / (2+7+1))
+ startp = PyUnicode_AS_UNICODE(object);
+ if (end - start > PY_SSIZE_T_MAX / (2+7+1)) {
end = start + PY_SSIZE_T_MAX / (2+7+1);
- for (i = start, ressize = 0; i < end; ++i) {
- /* object is guaranteed to be "ready" */
- ch = PyUnicode_READ_CHAR(object, i);
- if (ch<10)
+#ifndef Py_UNICODE_WIDE
+ if (0xD800 <= startp[end - 1] && startp[end - 1] <= 0xDBFF)
+ end--;
+#endif
+ }
+ e = startp + end;
+ for (p = startp+start, ressize = 0; p < e;) {
+ Py_UCS4 ch = *p++;
+#ifndef Py_UNICODE_WIDE
+ if ((0xD800 <= ch && ch <= 0xDBFF) &&
+ (p < e) &&
+ (0xDC00 <= *p && *p <= 0xDFFF)) {
+ ch = ((((ch & 0x03FF) << 10) |
+ ((Py_UCS4)*p++ & 0x03FF)) + 0x10000);
+ }
+#endif
+ if (ch < 10)
ressize += 2+1+1;
- else if (ch<100)
+ else if (ch < 100)
ressize += 2+2+1;
- else if (ch<1000)
+ else if (ch < 1000)
ressize += 2+3+1;
- else if (ch<10000)
+ else if (ch < 10000)
ressize += 2+4+1;
- else if (ch<100000)
+ else if (ch < 100000)
ressize += 2+5+1;
- else if (ch<1000000)
+ else if (ch < 1000000)
ressize += 2+6+1;
else
ressize += 2+7+1;
}
/* allocate replacement */
- res = PyUnicode_New(ressize, 127);
+ res = PyUnicode_FromUnicode(NULL, ressize);
if (res == NULL) {
Py_DECREF(object);
return NULL;
}
- outp = PyUnicode_1BYTE_DATA(res);
/* generate replacement */
- for (i = start; i < end; ++i) {
+ for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); p < e;) {
int digits;
int base;
- ch = PyUnicode_READ_CHAR(object, i);
+ Py_UCS4 ch = *p++;
+#ifndef Py_UNICODE_WIDE
+ if ((0xD800 <= ch && ch <= 0xDBFF) &&
+ (p < startp+end) &&
+ (0xDC00 <= *p && *p <= 0xDFFF)) {
+ ch = ((((ch & 0x03FF) << 10) |
+ ((Py_UCS4)*p++ & 0x03FF)) + 0x10000);
+ }
+#endif
*outp++ = '&';
*outp++ = '#';
- if (ch<10) {
+ if (ch < 10) {
digits = 1;
base = 1;
}
- else if (ch<100) {
+ else if (ch < 100) {
digits = 2;
base = 10;
}
- else if (ch<1000) {
+ else if (ch < 1000) {
digits = 3;
base = 100;
}
- else if (ch<10000) {
+ else if (ch < 10000) {
digits = 4;
base = 1000;
}
- else if (ch<100000) {
+ else if (ch < 100000) {
digits = 5;
base = 10000;
}
- else if (ch<1000000) {
+ else if (ch < 1000000) {
digits = 6;
base = 100000;
}
@@ -837,8 +811,8 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
}
*outp++ = ';';
}
- assert(_PyUnicode_CheckConsistency(res, 1));
- restuple = Py_BuildValue("(Nn)", res, end);
+ restuple = Py_BuildValue("(On)", res, end);
+ Py_DECREF(res);
Py_DECREF(object);
return restuple;
}
@@ -848,517 +822,87 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
}
}
-PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
-{
- PyObject *object;
- Py_ssize_t i;
- Py_ssize_t start;
- Py_ssize_t end;
- PyObject *res;
- unsigned char *outp;
- int ressize;
- Py_UCS4 c;
-
- if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
- const unsigned char *p;
- if (PyUnicodeDecodeError_GetStart(exc, &start))
- return NULL;
- if (PyUnicodeDecodeError_GetEnd(exc, &end))
- return NULL;
- if (!(object = PyUnicodeDecodeError_GetObject(exc)))
- return NULL;
- p = (const unsigned char*)PyBytes_AS_STRING(object);
- res = PyUnicode_New(4 * (end - start), 127);
- if (res == NULL) {
- Py_DECREF(object);
- return NULL;
- }
- outp = PyUnicode_1BYTE_DATA(res);
- for (i = start; i < end; i++, outp += 4) {
- unsigned char c = p[i];
- outp[0] = '\\';
- outp[1] = 'x';
- outp[2] = Py_hexdigits[(c>>4)&0xf];
- outp[3] = Py_hexdigits[c&0xf];
- }
+static Py_UNICODE hexdigits[] = {
+ '0', '1', '2', '3', '4', '5', '6', '7',
+ '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
+};
- assert(_PyUnicode_CheckConsistency(res, 1));
- Py_DECREF(object);
- return Py_BuildValue("(Nn)", res, end);
- }
- if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
- if (PyUnicodeEncodeError_GetStart(exc, &start))
- return NULL;
- if (PyUnicodeEncodeError_GetEnd(exc, &end))
- return NULL;
- if (!(object = PyUnicodeEncodeError_GetObject(exc)))
- return NULL;
- }
- else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
- if (PyUnicodeTranslateError_GetStart(exc, &start))
- return NULL;
- if (PyUnicodeTranslateError_GetEnd(exc, &end))
- return NULL;
- if (!(object = PyUnicodeTranslateError_GetObject(exc)))
- return NULL;
- }
- else {
- wrong_exception_type(exc);
- return NULL;
- }
-
- if (end - start > PY_SSIZE_T_MAX / (1+1+8))
- end = start + PY_SSIZE_T_MAX / (1+1+8);
- for (i = start, ressize = 0; i < end; ++i) {
- /* object is guaranteed to be "ready" */
- c = PyUnicode_READ_CHAR(object, i);
- if (c >= 0x10000) {
- ressize += 1+1+8;
- }
- else if (c >= 0x100) {
- ressize += 1+1+4;
- }
- else
- ressize += 1+1+2;
- }
- res = PyUnicode_New(ressize, 127);
- if (res == NULL) {
- Py_DECREF(object);
- return NULL;
- }
- outp = PyUnicode_1BYTE_DATA(res);
- for (i = start; i < end; ++i) {
- c = PyUnicode_READ_CHAR(object, i);
- *outp++ = '\\';
- if (c >= 0x00010000) {
- *outp++ = 'U';
- *outp++ = Py_hexdigits[(c>>28)&0xf];
- *outp++ = Py_hexdigits[(c>>24)&0xf];
- *outp++ = Py_hexdigits[(c>>20)&0xf];
- *outp++ = Py_hexdigits[(c>>16)&0xf];
- *outp++ = Py_hexdigits[(c>>12)&0xf];
- *outp++ = Py_hexdigits[(c>>8)&0xf];
- }
- else if (c >= 0x100) {
- *outp++ = 'u';
- *outp++ = Py_hexdigits[(c>>12)&0xf];
- *outp++ = Py_hexdigits[(c>>8)&0xf];
- }
- else
- *outp++ = 'x';
- *outp++ = Py_hexdigits[(c>>4)&0xf];
- *outp++ = Py_hexdigits[c&0xf];
- }
-
- assert(_PyUnicode_CheckConsistency(res, 1));
- Py_DECREF(object);
- return Py_BuildValue("(Nn)", res, end);
-}
-
-static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
-
-PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
+PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
PyObject *restuple;
PyObject *object;
- Py_ssize_t i;
Py_ssize_t start;
Py_ssize_t end;
PyObject *res;
- unsigned char *outp;
+ Py_UNICODE *p;
+ Py_UNICODE *startp;
+ Py_UNICODE *outp;
Py_ssize_t ressize;
- int replsize;
- Py_UCS4 c;
- char buffer[256]; /* NAME_MAXLEN */
if (PyUnicodeEncodeError_GetStart(exc, &start))
return NULL;
if (PyUnicodeEncodeError_GetEnd(exc, &end))
return NULL;
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
return NULL;
- if (!ucnhash_CAPI) {
- /* load the unicode data module */
- ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
- PyUnicodeData_CAPSULE_NAME, 1);
- if (!ucnhash_CAPI)
- return NULL;
- }
- for (i = start, ressize = 0; i < end; ++i) {
- /* object is guaranteed to be "ready" */
- c = PyUnicode_READ_CHAR(object, i);
- if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
- replsize = 1+1+1+(int)strlen(buffer)+1;
- }
- else if (c >= 0x10000) {
- replsize = 1+1+8;
- }
- else if (c >= 0x100) {
- replsize = 1+1+4;
+ if (end - start > PY_SSIZE_T_MAX / (1+1+8))
+ end = start + PY_SSIZE_T_MAX / (1+1+8);
+ startp = PyUnicode_AS_UNICODE(object);
+ for (p = startp+start, ressize = 0; p < startp+end; ++p) {
+#ifdef Py_UNICODE_WIDE
+ if (*p >= 0x00010000)
+ ressize += 1+1+8;
+ else
+#endif
+ if (*p >= 0x100) {
+ ressize += 1+1+4;
}
else
- replsize = 1+1+2;
- if (ressize > PY_SSIZE_T_MAX - replsize)
- break;
- ressize += replsize;
+ ressize += 1+1+2;
}
- end = i;
- res = PyUnicode_New(ressize, 127);
- if (res==NULL)
+ res = PyUnicode_FromUnicode(NULL, ressize);
+ if (res == NULL) {
+ Py_DECREF(object);
return NULL;
- for (i = start, outp = PyUnicode_1BYTE_DATA(res);
- i < end; ++i) {
- c = PyUnicode_READ_CHAR(object, i);
+ }
+ for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
+ p < startp+end; ++p) {
+ Py_UNICODE c = *p;
*outp++ = '\\';
- if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
- *outp++ = 'N';
- *outp++ = '{';
- strcpy((char *)outp, buffer);
- outp += strlen(buffer);
- *outp++ = '}';
- continue;
- }
+#ifdef Py_UNICODE_WIDE
if (c >= 0x00010000) {
*outp++ = 'U';
- *outp++ = Py_hexdigits[(c>>28)&0xf];
- *outp++ = Py_hexdigits[(c>>24)&0xf];
- *outp++ = Py_hexdigits[(c>>20)&0xf];
- *outp++ = Py_hexdigits[(c>>16)&0xf];
- *outp++ = Py_hexdigits[(c>>12)&0xf];
- *outp++ = Py_hexdigits[(c>>8)&0xf];
+ *outp++ = hexdigits[(c>>28)&0xf];
+ *outp++ = hexdigits[(c>>24)&0xf];
+ *outp++ = hexdigits[(c>>20)&0xf];
+ *outp++ = hexdigits[(c>>16)&0xf];
+ *outp++ = hexdigits[(c>>12)&0xf];
+ *outp++ = hexdigits[(c>>8)&0xf];
}
- else if (c >= 0x100) {
+ else
+#endif
+ if (c >= 0x100) {
*outp++ = 'u';
- *outp++ = Py_hexdigits[(c>>12)&0xf];
- *outp++ = Py_hexdigits[(c>>8)&0xf];
+ *outp++ = hexdigits[(c>>12)&0xf];
+ *outp++ = hexdigits[(c>>8)&0xf];
}
else
*outp++ = 'x';
- *outp++ = Py_hexdigits[(c>>4)&0xf];
- *outp++ = Py_hexdigits[c&0xf];
+ *outp++ = hexdigits[(c>>4)&0xf];
+ *outp++ = hexdigits[c&0xf];
}
- assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
- assert(_PyUnicode_CheckConsistency(res, 1));
- restuple = Py_BuildValue("(Nn)", res, end);
- Py_DECREF(object);
- return restuple;
- }
- else {
- wrong_exception_type(exc);
- return NULL;
- }
-}
-
-#define ENC_UNKNOWN -1
-#define ENC_UTF8 0
-#define ENC_UTF16BE 1
-#define ENC_UTF16LE 2
-#define ENC_UTF32BE 3
-#define ENC_UTF32LE 4
-
-static int
-get_standard_encoding(const char *encoding, int *bytelength)
-{
- if (Py_TOLOWER(encoding[0]) == 'u' &&
- Py_TOLOWER(encoding[1]) == 't' &&
- Py_TOLOWER(encoding[2]) == 'f') {
- encoding += 3;
- if (*encoding == '-' || *encoding == '_' )
- encoding++;
- if (encoding[0] == '8' && encoding[1] == '\0') {
- *bytelength = 3;
- return ENC_UTF8;
- }
- else if (encoding[0] == '1' && encoding[1] == '6') {
- encoding += 2;
- *bytelength = 2;
- if (*encoding == '\0') {
-#ifdef WORDS_BIGENDIAN
- return ENC_UTF16BE;
-#else
- return ENC_UTF16LE;
-#endif
- }
- if (*encoding == '-' || *encoding == '_' )
- encoding++;
- if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
- if (Py_TOLOWER(encoding[0]) == 'b')
- return ENC_UTF16BE;
- if (Py_TOLOWER(encoding[0]) == 'l')
- return ENC_UTF16LE;
- }
- }
- else if (encoding[0] == '3' && encoding[1] == '2') {
- encoding += 2;
- *bytelength = 4;
- if (*encoding == '\0') {
-#ifdef WORDS_BIGENDIAN
- return ENC_UTF32BE;
-#else
- return ENC_UTF32LE;
-#endif
- }
- if (*encoding == '-' || *encoding == '_' )
- encoding++;
- if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
- if (Py_TOLOWER(encoding[0]) == 'b')
- return ENC_UTF32BE;
- if (Py_TOLOWER(encoding[0]) == 'l')
- return ENC_UTF32LE;
- }
- }
- }
- else if (strcmp(encoding, "CP_UTF8") == 0) {
- *bytelength = 3;
- return ENC_UTF8;
- }
- return ENC_UNKNOWN;
-}
-
-/* This handler is declared static until someone demonstrates
- a need to call it directly. */
-static PyObject *
-PyCodec_SurrogatePassErrors(PyObject *exc)
-{
- PyObject *restuple;
- PyObject *object;
- PyObject *encode;
- const char *encoding;
- int code;
- int bytelength;
- Py_ssize_t i;
- Py_ssize_t start;
- Py_ssize_t end;
- PyObject *res;
-
- if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
- unsigned char *outp;
- if (PyUnicodeEncodeError_GetStart(exc, &start))
- return NULL;
- if (PyUnicodeEncodeError_GetEnd(exc, &end))
- return NULL;
- if (!(object = PyUnicodeEncodeError_GetObject(exc)))
- return NULL;
- if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
- Py_DECREF(object);
- return NULL;
- }
- if (!(encoding = PyUnicode_AsUTF8(encode))) {
- Py_DECREF(object);
- Py_DECREF(encode);
- return NULL;
- }
- code = get_standard_encoding(encoding, &bytelength);
- Py_DECREF(encode);
- if (code == ENC_UNKNOWN) {
- /* Not supported, fail with original exception */
- PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
- Py_DECREF(object);
- return NULL;
- }
-
- if (end - start > PY_SSIZE_T_MAX / bytelength)
- end = start + PY_SSIZE_T_MAX / bytelength;
- res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
- if (!res) {
- Py_DECREF(object);
- return NULL;
- }
- outp = (unsigned char*)PyBytes_AsString(res);
- for (i = start; i < end; i++) {
- /* object is guaranteed to be "ready" */
- Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
- if (!Py_UNICODE_IS_SURROGATE(ch)) {
- /* Not a surrogate, fail with original exception */
- PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
- Py_DECREF(res);
- Py_DECREF(object);
- return NULL;
- }
- switch (code) {
- case ENC_UTF8:
- *outp++ = (unsigned char)(0xe0 | (ch >> 12));
- *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
- *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
- break;
- case ENC_UTF16LE:
- *outp++ = (unsigned char) ch;
- *outp++ = (unsigned char)(ch >> 8);
- break;
- case ENC_UTF16BE:
- *outp++ = (unsigned char)(ch >> 8);
- *outp++ = (unsigned char) ch;
- break;
- case ENC_UTF32LE:
- *outp++ = (unsigned char) ch;
- *outp++ = (unsigned char)(ch >> 8);
- *outp++ = (unsigned char)(ch >> 16);
- *outp++ = (unsigned char)(ch >> 24);
- break;
- case ENC_UTF32BE:
- *outp++ = (unsigned char)(ch >> 24);
- *outp++ = (unsigned char)(ch >> 16);
- *outp++ = (unsigned char)(ch >> 8);
- *outp++ = (unsigned char) ch;
- break;
- }
- }
restuple = Py_BuildValue("(On)", res, end);
Py_DECREF(res);
Py_DECREF(object);
return restuple;
}
- else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
- const unsigned char *p;
- Py_UCS4 ch = 0;
- if (PyUnicodeDecodeError_GetStart(exc, &start))
- return NULL;
- if (PyUnicodeDecodeError_GetEnd(exc, &end))
- return NULL;
- if (!(object = PyUnicodeDecodeError_GetObject(exc)))
- return NULL;
- p = (const unsigned char*)PyBytes_AS_STRING(object);
- if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
- Py_DECREF(object);
- return NULL;
- }
- if (!(encoding = PyUnicode_AsUTF8(encode))) {
- Py_DECREF(object);
- Py_DECREF(encode);
- return NULL;
- }
- code = get_standard_encoding(encoding, &bytelength);
- Py_DECREF(encode);
- if (code == ENC_UNKNOWN) {
- /* Not supported, fail with original exception */
- PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
- Py_DECREF(object);
- return NULL;
- }
-
- /* Try decoding a single surrogate character. If
- there are more, let the codec call us again. */
- p += start;
- if (PyBytes_GET_SIZE(object) - start >= bytelength) {
- switch (code) {
- case ENC_UTF8:
- if ((p[0] & 0xf0) == 0xe0 &&
- (p[1] & 0xc0) == 0x80 &&
- (p[2] & 0xc0) == 0x80) {
- /* it's a three-byte code */
- ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
- }
- break;
- case ENC_UTF16LE:
- ch = p[1] << 8 | p[0];
- break;
- case ENC_UTF16BE:
- ch = p[0] << 8 | p[1];
- break;
- case ENC_UTF32LE:
- ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
- break;
- case ENC_UTF32BE:
- ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
- break;
- }
- }
-
- Py_DECREF(object);
- if (!Py_UNICODE_IS_SURROGATE(ch)) {
- /* it's not a surrogate - fail */
- PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
- return NULL;
- }
- res = PyUnicode_FromOrdinal(ch);
- if (res == NULL)
- return NULL;
- return Py_BuildValue("(Nn)", res, start + bytelength);
- }
- else {
- wrong_exception_type(exc);
- return NULL;
- }
-}
-
-static PyObject *
-PyCodec_SurrogateEscapeErrors(PyObject *exc)
-{
- PyObject *restuple;
- PyObject *object;
- Py_ssize_t i;
- Py_ssize_t start;
- Py_ssize_t end;
- PyObject *res;
-
- if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
- char *outp;
- if (PyUnicodeEncodeError_GetStart(exc, &start))
- return NULL;
- if (PyUnicodeEncodeError_GetEnd(exc, &end))
- return NULL;
- if (!(object = PyUnicodeEncodeError_GetObject(exc)))
- return NULL;
- res = PyBytes_FromStringAndSize(NULL, end-start);
- if (!res) {
- Py_DECREF(object);
- return NULL;
- }
- outp = PyBytes_AsString(res);
- for (i = start; i < end; i++) {
- /* object is guaranteed to be "ready" */
- Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
- if (ch < 0xdc80 || ch > 0xdcff) {
- /* Not a UTF-8b surrogate, fail with original exception */
- PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
- Py_DECREF(res);
- Py_DECREF(object);
- return NULL;
- }
- *outp++ = ch - 0xdc00;
- }
- restuple = Py_BuildValue("(On)", res, end);
- Py_DECREF(res);
- Py_DECREF(object);
- return restuple;
- }
- else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
- PyObject *str;
- const unsigned char *p;
- Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
- int consumed = 0;
- if (PyUnicodeDecodeError_GetStart(exc, &start))
- return NULL;
- if (PyUnicodeDecodeError_GetEnd(exc, &end))
- return NULL;
- if (!(object = PyUnicodeDecodeError_GetObject(exc)))
- return NULL;
- p = (const unsigned char*)PyBytes_AS_STRING(object);
- while (consumed < 4 && consumed < end-start) {
- /* Refuse to escape ASCII bytes. */
- if (p[start+consumed] < 128)
- break;
- ch[consumed] = 0xdc00 + p[start+consumed];
- consumed++;
- }
- Py_DECREF(object);
- if (!consumed) {
- /* codec complained about ASCII byte. */
- PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
- return NULL;
- }
- str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
- if (str == NULL)
- return NULL;
- return Py_BuildValue("(Nn)", str, start+consumed);
- }
else {
wrong_exception_type(exc);
return NULL;
}
}
-
+#endif
static PyObject *strict_errors(PyObject *self, PyObject *exc)
{
@@ -1366,6 +910,7 @@ static PyObject *strict_errors(PyObject *self, PyObject *exc)
}
+#ifdef Py_USING_UNICODE
static PyObject *ignore_errors(PyObject *self, PyObject *exc)
{
return PyCodec_IgnoreErrors(exc);
@@ -1388,21 +933,7 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
{
return PyCodec_BackslashReplaceErrors(exc);
}
-
-static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
-{
- return PyCodec_NameReplaceErrors(exc);
-}
-
-static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
-{
- return PyCodec_SurrogatePassErrors(exc);
-}
-
-static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
-{
- return PyCodec_SurrogateEscapeErrors(exc);
-}
+#endif
static int _PyCodecRegistry_Init(void)
{
@@ -1421,6 +952,7 @@ static int _PyCodecRegistry_Init(void)
"raises a UnicodeError on coding errors.")
}
},
+#ifdef Py_USING_UNICODE
{
"ignore",
{
@@ -1459,40 +991,14 @@ static int _PyCodecRegistry_Init(void)
backslashreplace_errors,
METH_O,
PyDoc_STR("Implements the 'backslashreplace' error handling, "
- "which replaces malformed data with a backslashed "
- "escape sequence.")
- }
- },
- {
- "namereplace",
- {
- "namereplace_errors",
- namereplace_errors,
- METH_O,
- PyDoc_STR("Implements the 'namereplace' error handling, "
"which replaces an unencodable character with a "
- "\\N{...} escape sequence.")
- }
- },
- {
- "surrogatepass",
- {
- "surrogatepass",
- surrogatepass_errors,
- METH_O
- }
- },
- {
- "surrogateescape",
- {
- "surrogateescape",
- surrogateescape_errors,
- METH_O
+ "backslashed escape sequence.")
}
}
+#endif
};
- PyInterpreterState *interp = _PyInterpreterState_Get();
+ PyInterpreterState *interp = PyThreadState_GET()->interp;
PyObject *mod;
unsigned i;
@@ -1504,8 +1010,8 @@ static int _PyCodecRegistry_Init(void)
interp->codec_error_registry = PyDict_New();
if (interp->codec_error_registry) {
- for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
- PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
+ for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) {
+ PyObject *func = PyCFunction_New(&methods[i].def, NULL);
int res;
if (!func)
Py_FatalError("can't initialize codec error registry");
@@ -1521,11 +1027,19 @@ static int _PyCodecRegistry_Init(void)
interp->codec_error_registry == NULL)
Py_FatalError("can't initialize codec registry");
- mod = PyImport_ImportModuleNoBlock("encodings");
+ mod = PyImport_ImportModuleLevel("encodings", NULL, NULL, NULL, 0);
if (mod == NULL) {
+ if (PyErr_ExceptionMatches(PyExc_ImportError)) {
+ /* Ignore ImportErrors... this is done so that
+ distributions can disable the encodings package. Note
+ that other errors are not masked, e.g. SystemErrors
+ raised to inform the user of an error in the Python
+ configuration are still reported back to the user. */
+ PyErr_Clear();
+ return 0;
+ }
return -1;
}
Py_DECREF(mod);
- interp->codecs_initialized = 1;
return 0;
}