diff options
Diffstat (limited to 'Objects/unicodeobject.c')
| -rw-r--r-- | Objects/unicodeobject.c | 266 | 
1 files changed, 266 insertions, 0 deletions
| diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index f686ba6..b78bfc0 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1504,6 +1504,272 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)  				NULL);  } +/* --- UTF-32 Codec ------------------------------------------------------- */ + +PyObject * +PyUnicode_DecodeUTF32(const char *s, +		      Py_ssize_t size, +		      const char *errors, +		      int *byteorder) +{ +    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); +} + +PyObject * +PyUnicode_DecodeUTF32Stateful(const char *s, +			      Py_ssize_t size, +			      const char *errors, +			      int *byteorder, +			      Py_ssize_t *consumed) +{ +    const char *starts = s; +    Py_ssize_t startinpos; +    Py_ssize_t endinpos; +    Py_ssize_t outpos; +    PyUnicodeObject *unicode; +    Py_UNICODE *p; +#ifndef Py_UNICODE_WIDE +    int i, pairs; +#else +    const int pairs = 0; +#endif +    const unsigned char *q, *e; +    int bo = 0;       /* assume native ordering by default */ +    const char *errmsg = ""; +    /* On narrow builds we split characters outside the BMP into two +       codepoints => count how much extra space we need. */ +#ifndef Py_UNICODE_WIDE +    for (i = pairs = 0; i < size/4; i++) +	if (((Py_UCS4 *)s)[i] >= 0x10000) +	    pairs++; +#endif +    /* Offsets from q for retrieving bytes in the right order. */ +#ifdef BYTEORDER_IS_LITTLE_ENDIAN +    int iorder[] = {0, 1, 2, 3}; +#else +    int iorder[] = {3, 2, 1, 0}; +#endif +    PyObject *errorHandler = NULL; +    PyObject *exc = NULL; + +    /* This might be one to much, because of a BOM */ +    unicode = _PyUnicode_New((size+3)/4+pairs); +    if (!unicode) +        return NULL; +    if (size == 0) +        return (PyObject *)unicode; + +    /* Unpack UTF-32 encoded data */ +    p = unicode->str; +    q = (unsigned char *)s; +    e = q + size; + +    if (byteorder) +        bo = *byteorder; + +    /* Check for BOM marks (U+FEFF) in the input and adjust current +       byte order setting accordingly. In native mode, the leading BOM +       mark is skipped, in all other modes, it is copied to the output +       stream as-is (giving a ZWNBSP character). */ +    if (bo == 0) { +        if (size >= 4) { +            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | +                                (q[iorder[1]] << 8) | q[iorder[0]]; +#ifdef BYTEORDER_IS_LITTLE_ENDIAN +	    if (bom == 0x0000FEFF) { +		q += 4; +		bo = -1; +	    } +	    else if (bom == 0xFFFE0000) { +		q += 4; +		bo = 1; +	    } +#else +	    if (bom == 0x0000FEFF) { +		q += 4; +		bo = 1; +	    } +	    else if (bom == 0xFFFE0000) { +		q += 4; +		bo = -1; +	    } +#endif +	} +    } + +    if (bo == -1) { +        /* force LE */ +        iorder[0] = 0; +        iorder[1] = 1; +        iorder[2] = 2; +        iorder[3] = 3; +    } +    else if (bo == 1) { +        /* force BE */ +        iorder[0] = 3; +        iorder[1] = 2; +        iorder[2] = 1; +        iorder[3] = 0; +    } + +    while (q < e) { +	Py_UCS4 ch; +	/* remaining bytes at the end? (size should be divisible by 4) */ +	if (e-q<4) { +	    if (consumed) +		break; +	    errmsg = "truncated data"; +	    startinpos = ((const char *)q)-starts; +	    endinpos = ((const char *)e)-starts; +	    goto utf32Error; +	    /* The remaining input chars are ignored if the callback +	       chooses to skip the input */ +	} +	ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | +	     (q[iorder[1]] << 8) | q[iorder[0]]; + +	if (ch >= 0x110000) +	{ +	    errmsg = "codepoint not in range(0x110000)"; +	    startinpos = ((const char *)q)-starts; +	    endinpos = startinpos+4; +	    goto utf32Error; +	} +#ifndef Py_UNICODE_WIDE +	if (ch >= 0x10000) +	{ +	    *p++ = 0xD800 | ((ch-0x10000) >> 10); +	    *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); +	} +	else +#endif +	    *p++ = ch; +	q += 4; +	continue; +    utf32Error: +	outpos = p-PyUnicode_AS_UNICODE(unicode); +    if (unicode_decode_call_errorhandler( +         errors, &errorHandler, +         "utf32", errmsg, +         starts, size, &startinpos, &endinpos, &exc, &s, +         (PyObject **)&unicode, &outpos, &p)) +	    goto onError; +    } + +    if (byteorder) +        *byteorder = bo; + +    if (consumed) +	*consumed = (const char *)q-starts; + +    /* Adjust length */ +    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) +        goto onError; + +    Py_XDECREF(errorHandler); +    Py_XDECREF(exc); +    return (PyObject *)unicode; + +onError: +    Py_DECREF(unicode); +    Py_XDECREF(errorHandler); +    Py_XDECREF(exc); +    return NULL; +} + +PyObject * +PyUnicode_EncodeUTF32(const Py_UNICODE *s, +		      Py_ssize_t size, +		      const char *errors, +		      int byteorder) +{ +    PyObject *v; +    unsigned char *p; +#ifndef Py_UNICODE_WIDE +    int i, pairs; +#else +    const int pairs = 0; +#endif +    /* Offsets from p for storing byte pairs in the right order. */ +#ifdef BYTEORDER_IS_LITTLE_ENDIAN +    int iorder[] = {0, 1, 2, 3}; +#else +    int iorder[] = {3, 2, 1, 0}; +#endif + +#define STORECHAR(CH)                       \ +    do {                                    \ +        p[iorder[3]] = ((CH) >> 24) & 0xff; \ +        p[iorder[2]] = ((CH) >> 16) & 0xff; \ +        p[iorder[1]] = ((CH) >> 8) & 0xff;  \ +        p[iorder[0]] = (CH) & 0xff;         \ +        p += 4;                             \ +    } while(0) + +    /* In narrow builds we can output surrogate pairs as one codepoint, +       so we need less space. */ +#ifndef Py_UNICODE_WIDE +    for (i = pairs = 0; i < size-1; i++) +	if (0xD800 <= s[i] && s[i] <= 0xDBFF && +	    0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) +	    pairs++; +#endif +    v = PyString_FromStringAndSize(NULL, +		  4 * (size - pairs + (byteorder == 0))); +    if (v == NULL) +        return NULL; + +    p = (unsigned char *)PyString_AS_STRING(v); +    if (byteorder == 0) +	STORECHAR(0xFEFF); +    if (size == 0) +        return v; + +    if (byteorder == -1) { +        /* force LE */ +        iorder[0] = 0; +        iorder[1] = 1; +        iorder[2] = 2; +        iorder[3] = 3; +    } +    else if (byteorder == 1) { +        /* force BE */ +        iorder[0] = 3; +        iorder[1] = 2; +        iorder[2] = 1; +        iorder[3] = 0; +    } + +    while (size-- > 0) { +	Py_UCS4 ch = *s++; +#ifndef Py_UNICODE_WIDE +	if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { +	    Py_UCS4 ch2 = *s; +	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { +		ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; +		s++; +		size--; +	    } +	} +#endif +        STORECHAR(ch); +    } +    return v; +#undef STORECHAR +} + +PyObject *PyUnicode_AsUTF32String(PyObject *unicode) +{ +    if (!PyUnicode_Check(unicode)) { +        PyErr_BadArgument(); +        return NULL; +    } +    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), +				 PyUnicode_GET_SIZE(unicode), +				 NULL, +				 0); +} +  /* --- UTF-16 Codec ------------------------------------------------------- */  PyObject * | 
