diff options
Diffstat (limited to 'Python/codecs.c')
| -rw-r--r-- | Python/codecs.c | 233 | 
1 files changed, 206 insertions, 27 deletions
diff --git a/Python/codecs.c b/Python/codecs.c
index ea33c49..6849c0f 100644
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -53,7 +53,7 @@ int PyCodec_Register(PyObject *search_function)
 static
 PyObject *normalizestring(const char *string)
 {
-    register size_t i;
+    size_t i;
     size_t len = strlen(string);
     char *p;
     PyObject *v;
@@ -65,9 +65,9 @@ PyObject *normalizestring(const char *string)
 
     p = PyMem_Malloc(len + 1);
     if (p == NULL)
-        return NULL;
+        return PyErr_NoMemory();
     for (i = 0; i < len; i++) {
-        register char ch = string[i];
+        char ch = string[i];
         if (ch == ' ')
             ch = '-';
         else
@@ -185,6 +185,32 @@ PyObject *_PyCodec_Lookup(const char *encoding)
     return NULL;
 }
 
+int _PyCodec_Forget(const char *encoding)
+{
+    PyInterpreterState *interp;
+    PyObject *v;
+    int result;
+
+    interp = PyThreadState_GET()->interp;
+    if (interp->codec_search_path == NULL) {
+        return -1;
+    }
+
+    /* Convert the encoding to the normalized form used as the cache
+       key: all characters are converted to lower case and spaces are
+       replaced with hyphens. */
+    v = normalizestring(encoding);
+    if (v == NULL) {
+        return -1;
+    }
+
+    /* Drop the named codec from the internal cache */
+    result = PyDict_DelItem(interp->codec_search_cache, v);
+    Py_DECREF(v);
+
+    return result;
+}
+
 /* Codec registry encoding check API. */
 
 int PyCodec_KnownEncoding(const char *encoding)
@@ -360,6 +386,22 @@ PyObject *PyCodec_StreamWriter(const char *encoding,
     return codec_getstreamcodec(encoding, stream, errors, 3);
 }
 
+/* Helper that tries to ensure the reported exception chain indicates the
+ * codec that was invoked to trigger the failure without changing the type
+ * of the exception raised.
+ */
+static void
+wrap_codec_error(const char *operation,
+                 const char *encoding)
+{
+    /* TrySetFromCause will replace the active exception with a suitably
+     * updated clone if it can, otherwise it will leave the original
+     * exception alone.
+     */
+    _PyErr_TrySetFromCause("%s with '%s' codec failed",
+                           operation, encoding);
+}
+
 /* Encode an object (e.g. an Unicode object) using the given encoding
    and return the resulting encoded object (usually a Python string).
 
@@ -379,8 +421,10 @@ _PyCodec_EncodeInternal(PyObject *object,
         goto onError;
 
     result = PyEval_CallObject(encoder, args);
-    if (result == NULL)
+    if (result == NULL) {
+        wrap_codec_error("encoding", encoding);
         goto onError;
+    }
 
     if (!PyTuple_Check(result) ||
         PyTuple_GET_SIZE(result) != 2) {
@@ -423,8 +467,10 @@ _PyCodec_DecodeInternal(PyObject *object,
         goto onError;
 
     result = PyEval_CallObject(decoder,args);
-    if (result == NULL)
+    if (result == NULL) {
+        wrap_codec_error("decoding", encoding);
         goto onError;
+    }
     if (!PyTuple_Check(result) ||
         PyTuple_GET_SIZE(result) != 2) {
         PyErr_SetString(PyExc_TypeError,
@@ -589,7 +635,7 @@ int PyCodec_RegisterError(const char *name, PyObject *error)
         return -1;
     }
     return PyDict_SetItemString(interp->codec_error_registry,
-                                (char *)name, error);
+                                name, error);
 }
 
 /* Lookup the error handling callback function registered under the
@@ -605,7 +651,7 @@ PyObject *PyCodec_LookupError(const char *name)
 
     if (name==NULL)
         name = "strict";
-    handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
+    handler = PyDict_GetItemString(interp->codec_error_registry, name);
     if (!handler)
         PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
     else
@@ -848,8 +894,10 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
                 ressize += 1+1+2;
         }
         res = PyUnicode_New(ressize, 127);
-        if (res==NULL)
+        if (res == NULL) {
+            Py_DECREF(object);
             return NULL;
+        }
         for (i = start, outp = PyUnicode_1BYTE_DATA(res);
             i < end; ++i) {
             c = PyUnicode_READ_CHAR(object, i);
@@ -885,6 +933,65 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
     }
 }
 
+#define ENC_UTF8        0
+#define ENC_UTF16BE     1
+#define ENC_UTF16LE     2
+#define ENC_UTF32BE     3
+#define ENC_UTF32LE     4
+
+static int
+get_standard_encoding(const char *encoding, int *bytelength)
+{
+    if (Py_TOLOWER(encoding[0]) == 'u' &&
+        Py_TOLOWER(encoding[1]) == 't' &&
+        Py_TOLOWER(encoding[2]) == 'f') {
+        encoding += 3;
+        if (*encoding == '-' || *encoding == '_' )
+            encoding++;
+        if (encoding[0] == '1' && encoding[1] == '6') {
+            encoding += 2;
+            *bytelength = 2;
+            if (*encoding == '\0') {
+#ifdef WORDS_BIGENDIAN
+                return ENC_UTF16BE;
+#else
+                return ENC_UTF16LE;
+#endif
+            }
+            if (*encoding == '-' || *encoding == '_' )
+                encoding++;
+            if (Py_TOLOWER(encoding[0]) == 'b' &&  /* [0] first: may be NUL */
+                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0')
+                return ENC_UTF16BE;
+            if (Py_TOLOWER(encoding[0]) == 'l' &&
+                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0')
+                return ENC_UTF16LE;
+        }
+        else if (encoding[0] == '3' && encoding[1] == '2') {
+            encoding += 2;
+            *bytelength = 4;
+            if (*encoding == '\0') {
+#ifdef WORDS_BIGENDIAN
+                return ENC_UTF32BE;
+#else
+                return ENC_UTF32LE;
+#endif
+            }
+            if (*encoding == '-' || *encoding == '_' )
+                encoding++;
+            if (Py_TOLOWER(encoding[0]) == 'b' &&  /* [0] first: may be NUL */
+                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0')
+                return ENC_UTF32BE;
+            if (Py_TOLOWER(encoding[0]) == 'l' &&
+                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0')
+                return ENC_UTF32LE;
+        }
+    }
+    /* utf-8, and the fallback for any name not matched above */
+    *bytelength = 3;
+    return ENC_UTF8;
+}
+
 /* This handler is declared static until someone demonstrates
    a need to call it directly. */
 static PyObject *
@@ -892,37 +999,79 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
 {
     PyObject *restuple;
     PyObject *object;
+    PyObject *encode;
+    char *encoding;
+    int code;
+    int bytelength;
     Py_ssize_t i;
     Py_ssize_t start;
     Py_ssize_t end;
     PyObject *res;
     if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
-        char *outp;
+        unsigned char *outp;
         if (PyUnicodeEncodeError_GetStart(exc, &start))
             return NULL;
         if (PyUnicodeEncodeError_GetEnd(exc, &end))
             return NULL;
         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
             return NULL;
-        res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
+        if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
+            Py_DECREF(object);
+            return NULL;
+        }
+        if (!(encoding = PyUnicode_AsUTF8(encode))) {
+            Py_DECREF(object);
+            Py_DECREF(encode);
+            return NULL;
+        }
+        code = get_standard_encoding(encoding, &bytelength);
+        Py_DECREF(encode);
+
+        if (end - start > PY_SSIZE_T_MAX / bytelength)
+            end = start + PY_SSIZE_T_MAX / bytelength;
+        res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
         if (!res) {
             Py_DECREF(object);
             return NULL;
         }
-        outp = PyBytes_AsString(res);
+        outp = (unsigned char*)PyBytes_AsString(res);
         for (i = start; i < end; i++) {
             /* object is guaranteed to be "ready" */
             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
-            if (ch < 0xd800 || ch > 0xdfff) {
+            if (!Py_UNICODE_IS_SURROGATE(ch)) {
                 /* Not a surrogate, fail with original exception */
                 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
                 Py_DECREF(res);
                 Py_DECREF(object);
                 return NULL;
             }
-            *outp++ = (char)(0xe0 | (ch >> 12));
-            *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
-            *outp++ = (char)(0x80 | (ch & 0x3f));
+            switch (code) {
+            case ENC_UTF8:
+                *outp++ = (unsigned char)(0xe0 | (ch >> 12));
+                *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
+                *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
+                break;
+            case ENC_UTF16LE:
+                *outp++ = (unsigned char) ch;
+                *outp++ = (unsigned char)(ch >> 8);
+                break;
+            case ENC_UTF16BE:
+                *outp++ = (unsigned char)(ch >> 8);
+                *outp++ = (unsigned char) ch;
+                break;
+            case ENC_UTF32LE:
+                *outp++ = (unsigned char) ch;
+                *outp++ = (unsigned char)(ch >> 8);
+                *outp++ = (unsigned char)(ch >> 16);
+                *outp++ = (unsigned char)(ch >> 24);
+                break;
+            case ENC_UTF32BE:
+                *outp++ = (unsigned char)(ch >> 24);
+                *outp++ = (unsigned char)(ch >> 16);
+                *outp++ = (unsigned char)(ch >> 8);
+                *outp++ = (unsigned char) ch;
+                break;
+            }
         }
         restuple = Py_BuildValue("(On)", res, end);
         Py_DECREF(res);
@@ -934,34 +1083,64 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
         Py_UCS4 ch = 0;
         if (PyUnicodeDecodeError_GetStart(exc, &start))
             return NULL;
+        if (PyUnicodeDecodeError_GetEnd(exc, &end))
+            return NULL;
         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
             return NULL;
         if (!(p = (unsigned char*)PyBytes_AsString(object))) {
             Py_DECREF(object);
             return NULL;
         }
+        if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
+            Py_DECREF(object);
+            return NULL;
+        }
+        if (!(encoding = PyUnicode_AsUTF8(encode))) {
+            Py_DECREF(object);
+            Py_DECREF(encode);
+            return NULL;
+        }
+        code = get_standard_encoding(encoding, &bytelength);
+        Py_DECREF(encode);
+
         /* Try decoding a single surrogate character. If
            there are more, let the codec call us again. */
         p += start;
-        if (PyBytes_GET_SIZE(object) - start >= 3 &&
-            (p[0] & 0xf0) == 0xe0 &&
-            (p[1] & 0xc0) == 0x80 &&
-            (p[2] & 0xc0) == 0x80) {
-            /* it's a three-byte code */
-            ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
-            if (ch < 0xd800 || ch > 0xdfff)
-                /* it's not a surrogate - fail */
-                ch = 0;
+        if (PyBytes_GET_SIZE(object) - start >= bytelength) {
+            switch (code) {
+            case ENC_UTF8:
+                if ((p[0] & 0xf0) == 0xe0 &&
+                    (p[1] & 0xc0) == 0x80 &&
+                    (p[2] & 0xc0) == 0x80) {
+                    /* it's a three-byte code */
+                    ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
+                }
+                break;
+            case ENC_UTF16LE:
+                ch = p[1] << 8 | p[0];
+                break;
+            case ENC_UTF16BE:
+                ch = p[0] << 8 | p[1];
+                break;
+            case ENC_UTF32LE:  /* cast first: p[3] << 24 can overflow int */
+                ch = ((Py_UCS4)p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
+                break;
+            case ENC_UTF32BE:  /* cast first: p[0] << 24 can overflow int */
+                ch = ((Py_UCS4)p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
+                break;
+            }
         }
+
         Py_DECREF(object);
-        if (ch == 0) {
+        if (!Py_UNICODE_IS_SURROGATE(ch)) {
+            /* it's not a surrogate - fail */
             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
             return NULL;
         }
         res = PyUnicode_FromOrdinal(ch);
         if (res == NULL)
             return NULL;
-        return Py_BuildValue("(Nn)", res, start+3);
+        return Py_BuildValue("(Nn)", res, start + bytelength);
     }
     else {
         wrong_exception_type(exc);
@@ -1178,7 +1357,7 @@ static int _PyCodecRegistry_Init(void)
 
     if (interp->codec_error_registry) {
         for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
-            PyObject *func = PyCFunction_New(&methods[i].def, NULL);
+            PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
             int res;
             if (!func)
                 Py_FatalError("can't initialize codec error registry");
