diff options
Diffstat (limited to 'Python/codecs.c')
| -rw-r--r-- | Python/codecs.c | 286 | 
1 files changed, 230 insertions, 56 deletions
| diff --git a/Python/codecs.c b/Python/codecs.c index ea33c49..27f2aeb 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -53,7 +53,7 @@ int PyCodec_Register(PyObject *search_function)  static  PyObject *normalizestring(const char *string)  { -    register size_t i; +    size_t i;      size_t len = strlen(string);      char *p;      PyObject *v; @@ -65,9 +65,9 @@ PyObject *normalizestring(const char *string)      p = PyMem_Malloc(len + 1);      if (p == NULL) -        return NULL; +        return PyErr_NoMemory();      for (i = 0; i < len; i++) { -        register char ch = string[i]; +        char ch = string[i];          if (ch == ' ')              ch = '-';          else @@ -185,6 +185,32 @@ PyObject *_PyCodec_Lookup(const char *encoding)      return NULL;  } +int _PyCodec_Forget(const char *encoding) +{ +    PyInterpreterState *interp; +    PyObject *v; +    int result; + +    interp = PyThreadState_GET()->interp; +    if (interp->codec_search_path == NULL) { +        return -1; +    } + +    /* Convert the encoding to a normalized Python string: all +       characters are converted to lower case, spaces and hyphens are +       replaced with underscores. */ +    v = normalizestring(encoding); +    if (v == NULL) { +        return -1; +    } + +    /* Drop the named codec from the internal cache */ +    result = PyDict_DelItem(interp->codec_search_cache, v); +    Py_DECREF(v); + +    return result; +} +  /* Codec registry encoding check API. */  int PyCodec_KnownEncoding(const char *encoding) @@ -360,6 +386,22 @@ PyObject *PyCodec_StreamWriter(const char *encoding,      return codec_getstreamcodec(encoding, stream, errors, 3);  } +/* Helper that tries to ensure the reported exception chain indicates the + * codec that was invoked to trigger the failure without changing the type + * of the exception raised. + */ +static void +wrap_codec_error(const char *operation, +                 const char *encoding) +{ +    /* TrySetFromCause will replace the active exception with a suitably +     * updated clone if it can, otherwise it will leave the original +     * exception alone. +     */ +    _PyErr_TrySetFromCause("%s with '%s' codec failed", +                           operation, encoding); +} +  /* Encode an object (e.g. an Unicode object) using the given encoding     and return the resulting encoded object (usually a Python string). @@ -379,8 +421,10 @@ _PyCodec_EncodeInternal(PyObject *object,          goto onError;      result = PyEval_CallObject(encoder, args); -    if (result == NULL) +    if (result == NULL) { +        wrap_codec_error("encoding", encoding);          goto onError; +    }      if (!PyTuple_Check(result) ||          PyTuple_GET_SIZE(result) != 2) { @@ -423,8 +467,10 @@ _PyCodec_DecodeInternal(PyObject *object,          goto onError;      result = PyEval_CallObject(decoder,args); -    if (result == NULL) +    if (result == NULL) { +        wrap_codec_error("decoding", encoding);          goto onError; +    }      if (!PyTuple_Check(result) ||          PyTuple_GET_SIZE(result) != 2) {          PyErr_SetString(PyExc_TypeError, @@ -503,12 +549,13 @@ PyObject * _PyCodec_LookupTextEncoding(const char *encoding,          } else {              is_text_codec = PyObject_IsTrue(attr);              Py_DECREF(attr); -            if (!is_text_codec) { +            if (is_text_codec <= 0) {                  Py_DECREF(codec); -                PyErr_Format(PyExc_LookupError, -                             "'%.400s' is not a text encoding; " -                             "use %s to handle arbitrary codecs", -                             encoding, alternate_command); +                if (!is_text_codec) +                    PyErr_Format(PyExc_LookupError, +                                 "'%.400s' is not a text encoding; " +                                 "use %s to handle arbitrary codecs", +                                 encoding, alternate_command);                  return NULL;              }          } @@ -589,7 +636,7 @@ int PyCodec_RegisterError(const char *name, PyObject *error)          return -1;      }      return PyDict_SetItemString(interp->codec_error_registry, -                                (char *)name, error); +                                name, error);  }  /* Lookup the error handling callback function registered under the @@ -605,7 +652,7 @@ PyObject *PyCodec_LookupError(const char *name)      if (name==NULL)          name = "strict"; -    handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name); +    handler = PyDict_GetItemString(interp->codec_error_registry, name);      if (!handler)          PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);      else @@ -615,18 +662,9 @@ PyObject *PyCodec_LookupError(const char *name)  static void wrong_exception_type(PyObject *exc)  { -    _Py_IDENTIFIER(__class__); -    _Py_IDENTIFIER(__name__); -    PyObject *type = _PyObject_GetAttrId(exc, &PyId___class__); -    if (type != NULL) { -        PyObject *name = _PyObject_GetAttrId(type, &PyId___name__); -        Py_DECREF(type); -        if (name != NULL) { -            PyErr_Format(PyExc_TypeError, -                         "don't know how to handle %S in error callback", name); -            Py_DECREF(name); -        } -    } +    PyErr_Format(PyExc_TypeError, +                 "don't know how to handle %.200s in error callback", +                 exc->ob_type->tp_name);  }  PyObject *PyCodec_StrictErrors(PyObject *exc) @@ -642,15 +680,16 @@ PyObject *PyCodec_StrictErrors(PyObject *exc)  PyObject *PyCodec_IgnoreErrors(PyObject *exc)  {      Py_ssize_t end; -    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { + +    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {          if (PyUnicodeEncodeError_GetEnd(exc, &end))              return NULL;      } -    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { +    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {          if (PyUnicodeDecodeError_GetEnd(exc, &end))              return NULL;      } -    else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) { +    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {          if (PyUnicodeTranslateError_GetEnd(exc, &end))              return NULL;      } @@ -666,7 +705,7 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)  {      Py_ssize_t start, end, i, len; -    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { +    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {          PyObject *res;          int kind;          void *data; @@ -685,14 +724,14 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)          assert(_PyUnicode_CheckConsistency(res, 1));          return Py_BuildValue("(Nn)", res, end);      } -    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { +    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {          if (PyUnicodeDecodeError_GetEnd(exc, &end))              return NULL;          return Py_BuildValue("(Cn)",                               (int)Py_UNICODE_REPLACEMENT_CHARACTER,                               end);      } -    else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) { +    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {          PyObject *res;          int kind;          void *data; @@ -719,7 +758,7 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc)  PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)  { -    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { +    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {          PyObject *restuple;          PyObject *object;          Py_ssize_t i; @@ -817,7 +856,7 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)  PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)  { -    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { +    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {          PyObject *restuple;          PyObject *object;          Py_ssize_t i; @@ -848,8 +887,10 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)                  ressize += 1+1+2;          }          res = PyUnicode_New(ressize, 127); -        if (res==NULL) +        if (res == NULL) { +            Py_DECREF(object);              return NULL; +        }          for (i = start, outp = PyUnicode_1BYTE_DATA(res);              i < end; ++i) {              c = PyUnicode_READ_CHAR(object, i); @@ -885,6 +926,65 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)      }  } +#define ENC_UTF8        0 +#define ENC_UTF16BE     1 +#define ENC_UTF16LE     2 +#define ENC_UTF32BE     3 +#define ENC_UTF32LE     4 + +static int +get_standard_encoding(const char *encoding, int *bytelength) +{ +    if (Py_TOLOWER(encoding[0]) == 'u' && +        Py_TOLOWER(encoding[1]) == 't' && +        Py_TOLOWER(encoding[2]) == 'f') { +        encoding += 3; +        if (*encoding == '-' || *encoding == '_' ) +            encoding++; +        if (encoding[0] == '1' && encoding[1] == '6') { +            encoding += 2; +            *bytelength = 2; +            if (*encoding == '\0') { +#ifdef WORDS_BIGENDIAN +                return ENC_UTF16BE; +#else +                return ENC_UTF16LE; +#endif +            } +            if (*encoding == '-' || *encoding == '_' ) +                encoding++; +            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') { +                if (Py_TOLOWER(encoding[0]) == 'b') +                    return ENC_UTF16BE; +                if (Py_TOLOWER(encoding[0]) == 'l') +                    return ENC_UTF16LE; +            } +        } +        else if (encoding[0] == '3' && encoding[1] == '2') { +            encoding += 2; +            *bytelength = 4; +            if (*encoding == '\0') { +#ifdef WORDS_BIGENDIAN +                return ENC_UTF32BE; +#else +                return ENC_UTF32LE; +#endif +            } +            if (*encoding == '-' || *encoding == '_' ) +                encoding++; +            if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') { +                if (Py_TOLOWER(encoding[0]) == 'b') +                    return ENC_UTF32BE; +                if (Py_TOLOWER(encoding[0]) == 'l') +                    return ENC_UTF32LE; +            } +        } +    } +    /* utf-8 */ +    *bytelength = 3; +    return ENC_UTF8; +} +  /* This handler is declared static until someone demonstrates     a need to call it directly. */  static PyObject * @@ -892,76 +992,149 @@ PyCodec_SurrogatePassErrors(PyObject *exc)  {      PyObject *restuple;      PyObject *object; +    PyObject *encode; +    char *encoding; +    int code; +    int bytelength;      Py_ssize_t i;      Py_ssize_t start;      Py_ssize_t end;      PyObject *res; -    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { -        char *outp; + +    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { +        unsigned char *outp;          if (PyUnicodeEncodeError_GetStart(exc, &start))              return NULL;          if (PyUnicodeEncodeError_GetEnd(exc, &end))              return NULL;          if (!(object = PyUnicodeEncodeError_GetObject(exc)))              return NULL; -        res = PyBytes_FromStringAndSize(NULL, 3*(end-start)); +        if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) { +            Py_DECREF(object); +            return NULL; +        } +        if (!(encoding = PyUnicode_AsUTF8(encode))) { +            Py_DECREF(object); +            Py_DECREF(encode); +            return NULL; +        } +        code = get_standard_encoding(encoding, &bytelength); +        Py_DECREF(encode); + +        if (end - start > PY_SSIZE_T_MAX / bytelength) +            end = start + PY_SSIZE_T_MAX / bytelength; +        res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));          if (!res) {              Py_DECREF(object);              return NULL;          } -        outp = PyBytes_AsString(res); +        outp = (unsigned char*)PyBytes_AsString(res);          for (i = start; i < end; i++) {              /* object is guaranteed to be "ready" */              Py_UCS4 ch = PyUnicode_READ_CHAR(object, i); -            if (ch < 0xd800 || ch > 0xdfff) { +            if (!Py_UNICODE_IS_SURROGATE(ch)) {                  /* Not a surrogate, fail with original exception */                  PyErr_SetObject(PyExceptionInstance_Class(exc), exc);                  Py_DECREF(res);                  Py_DECREF(object);                  return NULL;              } -            *outp++ = (char)(0xe0 | (ch >> 12)); -            *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f)); -            *outp++ = (char)(0x80 | (ch & 0x3f)); +            switch (code) { +            case ENC_UTF8: +                *outp++ = (unsigned char)(0xe0 | (ch >> 12)); +                *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f)); +                *outp++ = (unsigned char)(0x80 | (ch & 0x3f)); +                break; +            case ENC_UTF16LE: +                *outp++ = (unsigned char) ch; +                *outp++ = (unsigned char)(ch >> 8); +                break; +            case ENC_UTF16BE: +                *outp++ = (unsigned char)(ch >> 8); +                *outp++ = (unsigned char) ch; +                break; +            case ENC_UTF32LE: +                *outp++ = (unsigned char) ch; +                *outp++ = (unsigned char)(ch >> 8); +                *outp++ = (unsigned char)(ch >> 16); +                *outp++ = (unsigned char)(ch >> 24); +                break; +            case ENC_UTF32BE: +                *outp++ = (unsigned char)(ch >> 24); +                *outp++ = (unsigned char)(ch >> 16); +                *outp++ = (unsigned char)(ch >> 8); +                *outp++ = (unsigned char) ch; +                break; +            }          }          restuple = Py_BuildValue("(On)", res, end);          Py_DECREF(res);          Py_DECREF(object);          return restuple;      } -    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { +    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {          unsigned char *p;          Py_UCS4 ch = 0;          if (PyUnicodeDecodeError_GetStart(exc, &start))              return NULL; +        if (PyUnicodeDecodeError_GetEnd(exc, &end)) +            return NULL;          if (!(object = PyUnicodeDecodeError_GetObject(exc)))              return NULL;          if (!(p = (unsigned char*)PyBytes_AsString(object))) {              Py_DECREF(object);              return NULL;          } +        if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) { +            Py_DECREF(object); +            return NULL; +        } +        if (!(encoding = PyUnicode_AsUTF8(encode))) { +            Py_DECREF(object); +            Py_DECREF(encode); +            return NULL; +        } +        code = get_standard_encoding(encoding, &bytelength); +        Py_DECREF(encode); +          /* Try decoding a single surrogate character. If             there are more, let the codec call us again. */          p += start; -        if (PyBytes_GET_SIZE(object) - start >= 3 && -            (p[0] & 0xf0) == 0xe0 && -            (p[1] & 0xc0) == 0x80 && -            (p[2] & 0xc0) == 0x80) { -            /* it's a three-byte code */ -            ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); -            if (ch < 0xd800 || ch > 0xdfff) -                /* it's not a surrogate - fail */ -                ch = 0; +        if (PyBytes_GET_SIZE(object) - start >= bytelength) { +            switch (code) { +            case ENC_UTF8: +                if ((p[0] & 0xf0) == 0xe0 && +                    (p[1] & 0xc0) == 0x80 && +                    (p[2] & 0xc0) == 0x80) { +                    /* it's a three-byte code */ +                    ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f); +                } +                break; +            case ENC_UTF16LE: +                ch = p[1] << 8 | p[0]; +                break; +            case ENC_UTF16BE: +                ch = p[0] << 8 | p[1]; +                break; +            case ENC_UTF32LE: +                ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]; +                break; +            case ENC_UTF32BE: +                ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; +                break; +            }          } +          Py_DECREF(object); -        if (ch == 0) { +        if (!Py_UNICODE_IS_SURROGATE(ch)) { +            /* it's not a surrogate - fail */              PyErr_SetObject(PyExceptionInstance_Class(exc), exc);              return NULL;          }          res = PyUnicode_FromOrdinal(ch);          if (res == NULL)              return NULL; -        return Py_BuildValue("(Nn)", res, start+3); +        return Py_BuildValue("(Nn)", res, start + bytelength);      }      else {          wrong_exception_type(exc); @@ -978,7 +1151,8 @@ PyCodec_SurrogateEscapeErrors(PyObject *exc)      Py_ssize_t start;      Py_ssize_t end;      PyObject *res; -    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { + +    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {          char *outp;          if (PyUnicodeEncodeError_GetStart(exc, &start))              return NULL; @@ -1009,7 +1183,7 @@ PyCodec_SurrogateEscapeErrors(PyObject *exc)          Py_DECREF(object);          return restuple;      } -    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { +    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {          PyObject *str;          unsigned char *p;          Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */ @@ -1178,7 +1352,7 @@ static int _PyCodecRegistry_Init(void)      if (interp->codec_error_registry) {          for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) { -            PyObject *func = PyCFunction_New(&methods[i].def, NULL); +            PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);              int res;              if (!func)                  Py_FatalError("can't initialize codec error registry"); | 
