summaryrefslogtreecommitdiffstats
path: root/Objects
diff options
context:
space:
mode:
authorWalter Dörwald <walter@livinglogic.de>2007-08-17 16:41:28 (GMT)
committerWalter Dörwald <walter@livinglogic.de>2007-08-17 16:41:28 (GMT)
commit6e390806495cf30c836615996b94e5ffa258cbef (patch)
treeeef913ca3061a114ff6d301a042408d4d3243ecc /Objects
parent437e6a3b1588ece44abbb4d65f74f9a841638e1d (diff)
downloadcpython-6e390806495cf30c836615996b94e5ffa258cbef.zip
cpython-6e390806495cf30c836615996b94e5ffa258cbef.tar.gz
cpython-6e390806495cf30c836615996b94e5ffa258cbef.tar.bz2
Backport r57105 and r57145 from the py3k branch: UTF-32 codecs.
Diffstat (limited to 'Objects')
-rw-r--r--Objects/unicodeobject.c266
1 files changed, 266 insertions, 0 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index f686ba6..b78bfc0 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1504,6 +1504,272 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
NULL);
}
+/* --- UTF-32 Codec ------------------------------------------------------- */
+
+/* Decode `size` bytes of UTF-32 data at `s` in one shot.
+
+   Forwards all arguments to PyUnicode_DecodeUTF32Stateful() with
+   consumed == NULL, so the stateful decoder treats 1-3 leftover bytes at
+   the end of the input as a "truncated data" error rather than stopping
+   before them.  Returns whatever the stateful decoder returns. */
+PyObject *
+PyUnicode_DecodeUTF32(const char *s,
+ Py_ssize_t size,
+ const char *errors,
+ int *byteorder)
+{
+ return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
+}
+
+/* Decode `size` bytes of UTF-32 data starting at `s` into a unicode object.
+
+   errors    - error handler name, forwarded unchanged to
+               unicode_decode_call_errorhandler().
+   byteorder - may be NULL.  On entry *byteorder selects the byte order:
+               -1 little endian, 1 big endian, 0 native unless a leading
+               BOM says otherwise (that BOM is then skipped).  After the
+               decode loop finishes it receives the resulting setting
+               (still 0 if no BOM was seen).
+   consumed  - may be NULL.  If non-NULL, 1-3 leftover bytes at the end are
+               not an error: decoding stops before them and *consumed
+               receives the number of input bytes processed.
+
+   Returns the decoded unicode object, or NULL on failure. */
+PyObject *
+PyUnicode_DecodeUTF32Stateful(const char *s,
+                              Py_ssize_t size,
+                              const char *errors,
+                              int *byteorder,
+                              Py_ssize_t *consumed)
+{
+    const char *starts = s;
+    Py_ssize_t startinpos;
+    Py_ssize_t endinpos;
+    Py_ssize_t outpos;
+    PyUnicodeObject *unicode;
+    Py_UNICODE *p;
+    /* Extra output slots needed for surrogate pairs; stays 0 on wide
+       builds. */
+    Py_ssize_t pairs = 0;
+#ifndef Py_UNICODE_WIDE
+    const unsigned char *qq;
+#endif
+    const unsigned char *q, *e;
+    int bo = 0;       /* assume native ordering by default */
+    const char *errmsg = "";
+    /* Offsets from q for retrieving bytes in the right order. */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+    int iorder[] = {0, 1, 2, 3};
+#else
+    int iorder[] = {3, 2, 1, 0};
+#endif
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+
+    q = (const unsigned char *)s;
+    e = q + size;
+
+    if (byteorder)
+        bo = *byteorder;
+
+    /* Check for BOM marks (U+FEFF) in the input and adjust current
+       byte order setting accordingly. In native mode, the leading BOM
+       mark is skipped, in all other modes, it is copied to the output
+       stream as-is (giving a ZWNBSP character). */
+    if (bo == 0) {
+        if (size >= 4) {
+            /* Widen each byte before shifting: shifting a promoted int
+               left by 24 overflows for bytes >= 0x80. */
+            const Py_UCS4 bom = ((Py_UCS4)q[iorder[3]] << 24) |
+                                ((Py_UCS4)q[iorder[2]] << 16) |
+                                ((Py_UCS4)q[iorder[1]] << 8) |
+                                (Py_UCS4)q[iorder[0]];
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+            if (bom == 0x0000FEFF) {
+                q += 4;
+                bo = -1;
+            }
+            else if (bom == 0xFFFE0000) {
+                q += 4;
+                bo = 1;
+            }
+#else
+            if (bom == 0x0000FEFF) {
+                q += 4;
+                bo = 1;
+            }
+            else if (bom == 0xFFFE0000) {
+                q += 4;
+                bo = -1;
+            }
+#endif
+        }
+    }
+
+    if (bo == -1) {
+        /* force LE */
+        iorder[0] = 0;
+        iorder[1] = 1;
+        iorder[2] = 2;
+        iorder[3] = 3;
+    }
+    else if (bo == 1) {
+        /* force BE */
+        iorder[0] = 3;
+        iorder[1] = 2;
+        iorder[2] = 1;
+        iorder[3] = 0;
+    }
+
+    /* On narrow builds we split characters outside the BMP into two
+       code units => count how much extra space we need.  This has to
+       happen after the byte order is known and must look at individual
+       bytes: reading the input as native Py_UCS4 values would miscount
+       data in the other byte order (under-allocating the output) and
+       could perform misaligned reads. */
+#ifndef Py_UNICODE_WIDE
+    for (qq = q; e - qq >= 4; qq += 4)
+        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
+            pairs++;
+#endif
+
+    /* This might be one too many, because of a BOM */
+    unicode = _PyUnicode_New((size+3)/4+pairs);
+    if (!unicode)
+        return NULL;
+    if (size == 0)
+        return (PyObject *)unicode;
+
+    /* Unpack UTF-32 encoded data */
+    p = unicode->str;
+
+    while (q < e) {
+        Py_UCS4 ch;
+        /* remaining bytes at the end? (size should be divisible by 4) */
+        if (e-q<4) {
+            if (consumed)
+                break;
+            errmsg = "truncated data";
+            startinpos = ((const char *)q)-starts;
+            endinpos = ((const char *)e)-starts;
+            goto utf32Error;
+            /* The remaining input chars are ignored if the callback
+               chooses to skip the input */
+        }
+        ch = ((Py_UCS4)q[iorder[3]] << 24) |
+             ((Py_UCS4)q[iorder[2]] << 16) |
+             ((Py_UCS4)q[iorder[1]] << 8) |
+             (Py_UCS4)q[iorder[0]];
+
+        if (ch >= 0x110000)
+        {
+            errmsg = "codepoint not in range(0x110000)";
+            startinpos = ((const char *)q)-starts;
+            endinpos = startinpos+4;
+            goto utf32Error;
+        }
+#ifndef Py_UNICODE_WIDE
+        if (ch >= 0x10000)
+        {
+            *p++ = 0xD800 | ((ch-0x10000) >> 10);
+            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
+        }
+        else
+#endif
+            *p++ = ch;
+        q += 4;
+        continue;
+    utf32Error:
+        outpos = p-PyUnicode_AS_UNICODE(unicode);
+        /* Hand the callback the read cursor q itself so that the position
+           it chooses to resume at takes effect.  Passing a different
+           pointer would leave q on the offending bytes and the loop would
+           never terminate for non-strict handlers. */
+        if (unicode_decode_call_errorhandler(
+                errors, &errorHandler,
+                "utf32", errmsg,
+                starts, size, &startinpos, &endinpos, &exc,
+                (const char **)&q,
+                (PyObject **)&unicode, &outpos, &p))
+            goto onError;
+    }
+
+    if (byteorder)
+        *byteorder = bo;
+
+    if (consumed)
+        *consumed = (const char *)q-starts;
+
+    /* Adjust length */
+    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
+        goto onError;
+
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return (PyObject *)unicode;
+
+onError:
+    Py_DECREF(unicode);
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return NULL;
+}
+
+/* Encode `size` Py_UNICODE code units at `s` as UTF-32 into a new string
+   object.
+
+   errors    - accepted for interface symmetry; not used by this function.
+   byteorder - -1: little endian output, 1: big endian output,
+               0: native byte order with a BOM (U+FEFF) written first.
+
+   On narrow builds a high surrogate immediately followed by a low
+   surrogate is combined into a single 4-byte code point; any other
+   surrogate is written out as-is.
+
+   Returns the new string object, or NULL if it cannot be allocated. */
+PyObject *
+PyUnicode_EncodeUTF32(const Py_UNICODE *s,
+                      Py_ssize_t size,
+                      const char *errors,
+                      int byteorder)
+{
+    PyObject *v;
+    unsigned char *p;
+    Py_ssize_t nsize;
+    /* Number of surrogate pairs in the input; stays 0 on wide builds. */
+    Py_ssize_t pairs = 0;
+#ifndef Py_UNICODE_WIDE
+    Py_ssize_t i;   /* same width as size, so long inputs do not truncate */
+#endif
+    /* Offsets from p for storing the four bytes in the right order. */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+    int iorder[] = {0, 1, 2, 3};
+#else
+    int iorder[] = {3, 2, 1, 0};
+#endif
+
+#define STORECHAR(CH)                       \
+    do {                                    \
+        p[iorder[3]] = ((CH) >> 24) & 0xff; \
+        p[iorder[2]] = ((CH) >> 16) & 0xff; \
+        p[iorder[1]] = ((CH) >> 8) & 0xff;  \
+        p[iorder[0]] = (CH) & 0xff;         \
+        p += 4;                             \
+    } while(0)
+
+    /* In narrow builds we can output surrogate pairs as one codepoint,
+       so we need less space. */
+#ifndef Py_UNICODE_WIDE
+    for (i = 0; i < size-1; i++)
+        if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
+            0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
+            pairs++;
+#endif
+    /* Output code points: input units minus merged pairs, plus the BOM. */
+    nsize = size - pairs + (byteorder == 0);
+    /* Refuse sizes whose byte count would overflow Py_ssize_t. */
+    if (nsize > PY_SSIZE_T_MAX / 4)
+        return PyErr_NoMemory();
+    v = PyString_FromStringAndSize(NULL, 4 * nsize);
+    if (v == NULL)
+        return NULL;
+
+    p = (unsigned char *)PyString_AS_STRING(v);
+    /* The BOM is only written in native mode, so iorder still holds the
+       native layout here. */
+    if (byteorder == 0)
+        STORECHAR(0xFEFF);
+    if (size == 0)
+        return v;
+
+    if (byteorder == -1) {
+        /* force LE */
+        iorder[0] = 0;
+        iorder[1] = 1;
+        iorder[2] = 2;
+        iorder[3] = 3;
+    }
+    else if (byteorder == 1) {
+        /* force BE */
+        iorder[0] = 3;
+        iorder[1] = 2;
+        iorder[2] = 1;
+        iorder[3] = 0;
+    }
+
+    while (size-- > 0) {
+        Py_UCS4 ch = *s++;
+#ifndef Py_UNICODE_WIDE
+        /* Merge a high surrogate with a directly following low surrogate;
+           this mirrors the pair count used to size the buffer above. */
+        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
+            Py_UCS4 ch2 = *s;
+            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
+                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
+                s++;
+                size--;
+            }
+        }
+#endif
+        STORECHAR(ch);
+    }
+    return v;
+#undef STORECHAR
+}
+
+/* Encode a unicode object as UTF-32.
+
+   If `unicode` fails PyUnicode_Check(), calls PyErr_BadArgument() and
+   returns NULL.  Otherwise passes the object's buffer and length to
+   PyUnicode_EncodeUTF32() with errors == NULL and byteorder == 0, which
+   in that function means native byte order with a leading BOM. */
+PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
+{
+ if (!PyUnicode_Check(unicode)) {
+ PyErr_BadArgument();
+ return NULL;
+ }
+ return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
+ PyUnicode_GET_SIZE(unicode),
+ NULL,
+ 0);
+}
+
/* --- UTF-16 Codec ------------------------------------------------------- */
PyObject *