diff options
author | Walter Dörwald <walter@livinglogic.de> | 2005-08-30 10:23:14 (GMT) |
---|---|---|
committer | Walter Dörwald <walter@livinglogic.de> | 2005-08-30 10:23:14 (GMT) |
commit | a47d1c08d0911f2f49d92b8c6035593a672af436 (patch) | |
tree | b89cf4f689e9037da807a5e2509d87715d64057f /Objects | |
parent | 523c9f0709d5e7af4d45817b92cf5ce01609269c (diff) | |
download | cpython-a47d1c08d0911f2f49d92b8c6035593a672af436.zip cpython-a47d1c08d0911f2f49d92b8c6035593a672af436.tar.gz cpython-a47d1c08d0911f2f49d92b8c6035593a672af436.tar.bz2 |
SF bug #1251300: On UCS-4 builds the "unicode-internal" codec will now complain
about illegal code points. The codec now supports PEP 293 style error handlers.
(This is a variant of Nik Haldimann's patch that also detects truncated data.)
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 75 |
1 files changed, 75 insertions, 0 deletions
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5e5dac5..5d096ed 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2273,6 +2273,81 @@ PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) PyUnicode_GET_SIZE(unicode)); } +/* --- Unicode Internal Codec ------------------------------------------- */ + +PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s, + int size, + const char *errors) +{ + const char *starts = s; + int startinpos; + int endinpos; + int outpos; + Py_UNICODE unimax; + PyUnicodeObject *v; + Py_UNICODE *p; + const char *end; + const char *reason; + PyObject *errorHandler = NULL; + PyObject *exc = NULL; + + unimax = PyUnicode_GetMax(); + v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); + if (v == NULL) + goto onError; + if (PyUnicode_GetSize((PyObject *)v) == 0) + return (PyObject *)v; + p = PyUnicode_AS_UNICODE(v); + end = s + size; + + while (s < end) { + *p = *(Py_UNICODE *)s; + /* We have to sanity check the raw data, otherwise doom looms for + some malformed UCS-4 data. 
*/ + if ( + #ifdef Py_UNICODE_WIDE + *p > unimax || *p < 0 || + #endif + end-s < Py_UNICODE_SIZE + ) + { + startinpos = s - starts; + if (end-s < Py_UNICODE_SIZE) { + endinpos = end-starts; + reason = "truncated input"; + } + else { + endinpos = s - starts + Py_UNICODE_SIZE; + reason = "illegal code point (> 0x10FFFF)"; + } + outpos = p - PyUnicode_AS_UNICODE(v); + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "unicode_internal", reason, + starts, size, &startinpos, &endinpos, &exc, &s, + (PyObject **)&v, &outpos, &p)) { + goto onError; + } + } + else { + p++; + s += Py_UNICODE_SIZE; + } + } + + if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0) + goto onError; + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + return (PyObject *)v; + + onError: + Py_XDECREF(v); + Py_XDECREF(errorHandler); + Py_XDECREF(exc); + return NULL; +} + /* --- Latin-1 Codec ------------------------------------------------------ */ PyObject *PyUnicode_DecodeLatin1(const char *s, |