diff options
author | Walter Dörwald <walter@livinglogic.de> | 2007-08-17 16:41:28 (GMT) |
---|---|---|
committer | Walter Dörwald <walter@livinglogic.de> | 2007-08-17 16:41:28 (GMT) |
commit | 6e390806495cf30c836615996b94e5ffa258cbef (patch) | |
tree | eef913ca3061a114ff6d301a042408d4d3243ecc /Objects | |
parent | 437e6a3b1588ece44abbb4d65f74f9a841638e1d (diff) | |
download | cpython-6e390806495cf30c836615996b94e5ffa258cbef.zip cpython-6e390806495cf30c836615996b94e5ffa258cbef.tar.gz cpython-6e390806495cf30c836615996b94e5ffa258cbef.tar.bz2 |
Backport r57105 and r57145 from the py3k branch: UTF-32 codecs.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 266 |
1 files changed, 266 insertions, 0 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index f686ba6..b78bfc0 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1504,6 +1504,272 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode) NULL); } +/* --- UTF-32 Codec ------------------------------------------------------- */ + +PyObject * +PyUnicode_DecodeUTF32(const char *s, + Py_ssize_t size, + const char *errors, + int *byteorder) +{ + return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL); +} + +PyObject * +PyUnicode_DecodeUTF32Stateful(const char *s, + Py_ssize_t size, + const char *errors, + int *byteorder, + Py_ssize_t *consumed) +{ + const char *starts = s; + Py_ssize_t startinpos; + Py_ssize_t endinpos; + Py_ssize_t outpos; + PyUnicodeObject *unicode; + Py_UNICODE *p; +#ifndef Py_UNICODE_WIDE + int i, pairs; +#else + const int pairs = 0; +#endif + const unsigned char *q, *e; + int bo = 0; /* assume native ordering by default */ + const char *errmsg = ""; + /* On narrow builds we split characters outside the BMP into two + codepoints => count how much extra space we need. */ +#ifndef Py_UNICODE_WIDE + for (i = pairs = 0; i < size/4; i++) + if (((Py_UCS4 *)s)[i] >= 0x10000) + pairs++; +#endif + /* Offsets from q for retrieving bytes in the right order. */ +#ifdef BYTEORDER_IS_LITTLE_ENDIAN + int iorder[] = {0, 1, 2, 3}; +#else + int iorder[] = {3, 2, 1, 0}; +#endif + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + + /* This might be one to much, because of a BOM */ + unicode = _PyUnicode_New((size+3)/4+pairs); + if (!unicode) + return NULL; + if (size == 0) + return (PyObject *)unicode; + + /* Unpack UTF-32 encoded data */ + p = unicode->str; + q = (unsigned char *)s; + e = q + size; + + if (byteorder) + bo = *byteorder; + + /* Check for BOM marks (U+FEFF) in the input and adjust current + byte order setting accordingly. In native mode, the leading BOM + mark is skipped, in all other modes, it is copied to the output + stream as-is (giving a ZWNBSP character). */ + if (bo == 0) { + if (size >= 4) { + const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | + (q[iorder[1]] << 8) | q[iorder[0]]; +#ifdef BYTEORDER_IS_LITTLE_ENDIAN + if (bom == 0x0000FEFF) { + q += 4; + bo = -1; + } + else if (bom == 0xFFFE0000) { + q += 4; + bo = 1; + } +#else + if (bom == 0x0000FEFF) { + q += 4; + bo = 1; + } + else if (bom == 0xFFFE0000) { + q += 4; + bo = -1; + } +#endif + } + } + + if (bo == -1) { + /* force LE */ + iorder[0] = 0; + iorder[1] = 1; + iorder[2] = 2; + iorder[3] = 3; + } + else if (bo == 1) { + /* force BE */ + iorder[0] = 3; + iorder[1] = 2; + iorder[2] = 1; + iorder[3] = 0; + } + + while (q < e) { + Py_UCS4 ch; + /* remaining bytes at the end? (size should be divisible by 4) */ + if (e-q<4) { + if (consumed) + break; + errmsg = "truncated data"; + startinpos = ((const char *)q)-starts; + endinpos = ((const char *)e)-starts; + goto utf32Error; + /* The remaining input chars are ignored if the callback + chooses to skip the input */ + } + ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) | + (q[iorder[1]] << 8) | q[iorder[0]]; + + if (ch >= 0x110000) + { + errmsg = "codepoint not in range(0x110000)"; + startinpos = ((const char *)q)-starts; + endinpos = startinpos+4; + goto utf32Error; + } +#ifndef Py_UNICODE_WIDE + if (ch >= 0x10000) + { + *p++ = 0xD800 | ((ch-0x10000) >> 10); + *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF); + } + else +#endif + *p++ = ch; + q += 4; + continue; + utf32Error: + outpos = p-PyUnicode_AS_UNICODE(unicode); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "utf32", errmsg, + starts, size, &startinpos, &endinpos, &exc, &s, + (PyObject **)&unicode, &outpos, &p)) + goto onError; + } + + if (byteorder) + *byteorder = bo; + + if (consumed) + *consumed = (const char *)q-starts; + + /* Adjust length */ + if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0) + goto onError; + + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + return (PyObject *)unicode; + +onError: + Py_DECREF(unicode); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + return NULL; +} + +PyObject * +PyUnicode_EncodeUTF32(const Py_UNICODE *s, + Py_ssize_t size, + const char *errors, + int byteorder) +{ + PyObject *v; + unsigned char *p; +#ifndef Py_UNICODE_WIDE + int i, pairs; +#else + const int pairs = 0; +#endif + /* Offsets from p for storing byte pairs in the right order. */ +#ifdef BYTEORDER_IS_LITTLE_ENDIAN + int iorder[] = {0, 1, 2, 3}; +#else + int iorder[] = {3, 2, 1, 0}; +#endif + +#define STORECHAR(CH) \ + do { \ + p[iorder[3]] = ((CH) >> 24) & 0xff; \ + p[iorder[2]] = ((CH) >> 16) & 0xff; \ + p[iorder[1]] = ((CH) >> 8) & 0xff; \ + p[iorder[0]] = (CH) & 0xff; \ + p += 4; \ + } while(0) + + /* In narrow builds we can output surrogate pairs as one codepoint, + so we need less space. */ +#ifndef Py_UNICODE_WIDE + for (i = pairs = 0; i < size-1; i++) + if (0xD800 <= s[i] && s[i] <= 0xDBFF && + 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF) + pairs++; +#endif + v = PyString_FromStringAndSize(NULL, + 4 * (size - pairs + (byteorder == 0))); + if (v == NULL) + return NULL; + + p = (unsigned char *)PyString_AS_STRING(v); + if (byteorder == 0) + STORECHAR(0xFEFF); + if (size == 0) + return v; + + if (byteorder == -1) { + /* force LE */ + iorder[0] = 0; + iorder[1] = 1; + iorder[2] = 2; + iorder[3] = 3; + } + else if (byteorder == 1) { + /* force BE */ + iorder[0] = 3; + iorder[1] = 2; + iorder[2] = 1; + iorder[3] = 0; + } + + while (size-- > 0) { + Py_UCS4 ch = *s++; +#ifndef Py_UNICODE_WIDE + if (0xD800 <= ch && ch <= 0xDBFF && size > 0) { + Py_UCS4 ch2 = *s; + if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { + ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000; + s++; + size--; + } + } +#endif + STORECHAR(ch); + } + return v; +#undef STORECHAR +} + +PyObject *PyUnicode_AsUTF32String(PyObject *unicode) +{ + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + return NULL; + } + return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode), + PyUnicode_GET_SIZE(unicode), + NULL, + 0); +} + /* --- UTF-16 Codec ------------------------------------------------------- */ PyObject * |