diff options
Diffstat (limited to 'Python/codecs.c')
| -rw-r--r-- | Python/codecs.c | 199 | 
1 files changed, 185 insertions, 14 deletions
| diff --git a/Python/codecs.c b/Python/codecs.c index 74445b0..38b0c2c 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -9,6 +9,7 @@ Copyright (c) Corporation for National Research Initiatives.     ------------------------------------------------------------------------ */  #include "Python.h" +#include "ucnhash.h"  #include <ctype.h>  const char *Py_hexdigits = "0123456789abcdef"; @@ -855,6 +856,119 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)  PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)  { +    PyObject *object; +    Py_ssize_t i; +    Py_ssize_t start; +    Py_ssize_t end; +    PyObject *res; +    unsigned char *outp; +    int ressize; +    Py_UCS4 c; + +    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) { +        unsigned char *p; +        if (PyUnicodeDecodeError_GetStart(exc, &start)) +            return NULL; +        if (PyUnicodeDecodeError_GetEnd(exc, &end)) +            return NULL; +        if (!(object = PyUnicodeDecodeError_GetObject(exc))) +            return NULL; +        if (!(p = (unsigned char*)PyBytes_AsString(object))) { +            Py_DECREF(object); +            return NULL; +        } +        res = PyUnicode_New(4 * (end - start), 127); +        if (res == NULL) { +            Py_DECREF(object); +            return NULL; +        } +        outp = PyUnicode_1BYTE_DATA(res); +        for (i = start; i < end; i++, outp += 4) { +            unsigned char c = p[i]; +            outp[0] = '\\'; +            outp[1] = 'x'; +            outp[2] = Py_hexdigits[(c>>4)&0xf]; +            outp[3] = Py_hexdigits[c&0xf]; +        } + +        assert(_PyUnicode_CheckConsistency(res, 1)); +        Py_DECREF(object); +        return Py_BuildValue("(Nn)", res, end); +    } +    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) { +        if (PyUnicodeEncodeError_GetStart(exc, &start)) +            return NULL; +        if (PyUnicodeEncodeError_GetEnd(exc, &end)) +            return NULL; +        if (!(object = PyUnicodeEncodeError_GetObject(exc))) +            return NULL; +    } +    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) { +        if (PyUnicodeTranslateError_GetStart(exc, &start)) +            return NULL; +        if (PyUnicodeTranslateError_GetEnd(exc, &end)) +            return NULL; +        if (!(object = PyUnicodeTranslateError_GetObject(exc))) +            return NULL; +    } +    else { +        wrong_exception_type(exc); +        return NULL; +    } + +    if (end - start > PY_SSIZE_T_MAX / (1+1+8)) +        end = start + PY_SSIZE_T_MAX / (1+1+8); +    for (i = start, ressize = 0; i < end; ++i) { +        /* object is guaranteed to be "ready" */ +        c = PyUnicode_READ_CHAR(object, i); +        if (c >= 0x10000) { +            ressize += 1+1+8; +        } +        else if (c >= 0x100) { +            ressize += 1+1+4; +        } +        else +            ressize += 1+1+2; +    } +    res = PyUnicode_New(ressize, 127); +    if (res == NULL) { +        Py_DECREF(object); +        return NULL; +    } +    outp = PyUnicode_1BYTE_DATA(res); +    for (i = start; i < end; ++i) { +        c = PyUnicode_READ_CHAR(object, i); +        *outp++ = '\\'; +        if (c >= 0x00010000) { +            *outp++ = 'U'; +            *outp++ = Py_hexdigits[(c>>28)&0xf]; +            *outp++ = Py_hexdigits[(c>>24)&0xf]; +            *outp++ = Py_hexdigits[(c>>20)&0xf]; +            *outp++ = Py_hexdigits[(c>>16)&0xf]; +            *outp++ = Py_hexdigits[(c>>12)&0xf]; +            *outp++ = Py_hexdigits[(c>>8)&0xf]; +        } +        else if (c >= 0x100) { +            *outp++ = 'u'; +            *outp++ = Py_hexdigits[(c>>12)&0xf]; +            *outp++ = Py_hexdigits[(c>>8)&0xf]; +        } +        else +            *outp++ = 'x'; +        *outp++ = Py_hexdigits[(c>>4)&0xf]; +        *outp++ = Py_hexdigits[c&0xf]; +    } + +    assert(_PyUnicode_CheckConsistency(res, 1)); +    Py_DECREF(object); +    return Py_BuildValue("(Nn)", res, end); +} + +static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; +static int ucnhash_initialized = 0; + +PyObject *PyCodec_NameReplaceErrors(PyObject *exc) +{      if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {          PyObject *restuple;          PyObject *object; @@ -864,36 +978,57 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)          PyObject *res;          unsigned char *outp;          Py_ssize_t ressize; +        int replsize;          Py_UCS4 c; +        char buffer[256]; /* NAME_MAXLEN */          if (PyUnicodeEncodeError_GetStart(exc, &start))              return NULL;          if (PyUnicodeEncodeError_GetEnd(exc, &end))              return NULL;          if (!(object = PyUnicodeEncodeError_GetObject(exc)))              return NULL; -        if (end - start > PY_SSIZE_T_MAX / (1+1+8)) -            end = start + PY_SSIZE_T_MAX / (1+1+8); +        if (!ucnhash_initialized) { +            /* load the unicode data module */ +            ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import( +                                            PyUnicodeData_CAPSULE_NAME, 1); +            ucnhash_initialized = 1; +        }          for (i = start, ressize = 0; i < end; ++i) {              /* object is guaranteed to be "ready" */              c = PyUnicode_READ_CHAR(object, i); -            if (c >= 0x10000) { -                ressize += 1+1+8; +            if (ucnhash_CAPI && +                ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { +                replsize = 1+1+1+(int)strlen(buffer)+1; +            } +            else if (c >= 0x10000) { +                replsize = 1+1+8;              }              else if (c >= 0x100) { -                ressize += 1+1+4; +                replsize = 1+1+4;              }              else -                ressize += 1+1+2; +                replsize = 1+1+2; +            if (ressize > PY_SSIZE_T_MAX - replsize) +                break; +            ressize += replsize;          } +        end = i;          res = PyUnicode_New(ressize, 127); -        if (res == NULL) { -            Py_DECREF(object); +        if (res==NULL)              return NULL; -        }          for (i = start, outp = PyUnicode_1BYTE_DATA(res);              i < end; ++i) {              c = PyUnicode_READ_CHAR(object, i);              *outp++ = '\\'; +            if (ucnhash_CAPI && +                ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) { +                *outp++ = 'N'; +                *outp++ = '{'; +                strcpy((char *)outp, buffer); +                outp += strlen(buffer); +                *outp++ = '}'; +                continue; +            }              if (c >= 0x00010000) {                  *outp++ = 'U';                  *outp++ = Py_hexdigits[(c>>28)&0xf]; @@ -914,6 +1049,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)              *outp++ = Py_hexdigits[c&0xf];          } +        assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);          assert(_PyUnicode_CheckConsistency(res, 1));          restuple = Py_BuildValue("(Nn)", res, end);          Py_DECREF(object); @@ -925,6 +1061,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)      }  } +#define ENC_UNKNOWN     -1  #define ENC_UTF8        0  #define ENC_UTF16BE     1  #define ENC_UTF16LE     2 @@ -940,7 +1077,11 @@ get_standard_encoding(const char *encoding, int *bytelength)          encoding += 3;          if (*encoding == '-' || *encoding == '_' )              encoding++; -        if (encoding[0] == '1' && encoding[1] == '6') { +        if (encoding[0] == '8' && encoding[1] == '\0') { +            *bytelength = 3; +            return ENC_UTF8; +        } +        else if (encoding[0] == '1' && encoding[1] == '6') {              encoding += 2;              *bytelength = 2;              if (*encoding == '\0') { @@ -979,9 +1120,11 @@ get_standard_encoding(const char *encoding, int *bytelength)              }          }      } -    /* utf-8 */ -    *bytelength = 3; -    return ENC_UTF8; +    else if (strcmp(encoding, "CP_UTF8") == 0) { +        *bytelength = 3; +        return ENC_UTF8; +    } +    return ENC_UNKNOWN;  }  /* This handler is declared static until someone demonstrates @@ -1019,6 +1162,12 @@ PyCodec_SurrogatePassErrors(PyObject *exc)          }          code = get_standard_encoding(encoding, &bytelength);          Py_DECREF(encode); +        if (code == ENC_UNKNOWN) { +            /* Not supported, fail with original exception */ +            PyErr_SetObject(PyExceptionInstance_Class(exc), exc); +            Py_DECREF(object); +            return NULL; +        }          if (end - start > PY_SSIZE_T_MAX / bytelength)              end = start + PY_SSIZE_T_MAX / bytelength; @@ -1095,6 +1244,12 @@ PyCodec_SurrogatePassErrors(PyObject *exc)          }          code = get_standard_encoding(encoding, &bytelength);          Py_DECREF(encode); +        if (code == ENC_UNKNOWN) { +            /* Not supported, fail with original exception */ +            PyErr_SetObject(PyExceptionInstance_Class(exc), exc); +            Py_DECREF(object); +            return NULL; +        }          /* Try decoding a single surrogate character. If             there are more, let the codec call us again. */ @@ -1251,6 +1406,11 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)      return PyCodec_BackslashReplaceErrors(exc);  } +static PyObject *namereplace_errors(PyObject *self, PyObject *exc) +{ +    return PyCodec_NameReplaceErrors(exc); +} +  static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)  {      return PyCodec_SurrogatePassErrors(exc); @@ -1316,8 +1476,19 @@ static int _PyCodecRegistry_Init(void)                  backslashreplace_errors,                  METH_O,                  PyDoc_STR("Implements the 'backslashreplace' error handling, " +                          "which replaces malformed data with a backslashed " +                          "escape sequence.") +            } +        }, +        { +            "namereplace", +            { +                "namereplace_errors", +                namereplace_errors, +                METH_O, +                PyDoc_STR("Implements the 'namereplace' error handling, "                            "which replaces an unencodable character with a " -                          "backslashed escape sequence.") +                          "\\N{...} escape sequence.")              }          },          { | 
