diff options
author | Marc-André Lemburg <mal@egenix.com> | 2001-09-20 10:35:46 (GMT) |
---|---|---|
committer | Marc-André Lemburg <mal@egenix.com> | 2001-09-20 10:35:46 (GMT) |
commit | c60e6f777114f43c64f1b83f9ad2b6e4efd220e7 (patch) | |
tree | c7c600ed692c243edbd520872a2648cb9c01a8c1 | |
parent | 26e3b681b26c9978c819396e278f43d356d86f9e (diff) | |
download | cpython-c60e6f777114f43c64f1b83f9ad2b6e4efd220e7.zip cpython-c60e6f777114f43c64f1b83f9ad2b6e4efd220e7.tar.gz cpython-c60e6f777114f43c64f1b83f9ad2b6e4efd220e7.tar.bz2 |
Patch #435971: UTF-7 codec by Brian Quinlan.
-rw-r--r-- | Include/unicodeobject.h | 18 | ||||
-rw-r--r-- | Lib/encodings/aliases.py | 4 | ||||
-rw-r--r-- | Lib/test/test_unicode.py | 29 | ||||
-rw-r--r-- | Modules/_codecsmodule.c | 42 | ||||
-rw-r--r-- | Objects/unicodeobject.c | 300 |
5 files changed, 392 insertions, 1 deletions
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index d7540fb..41feae2 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -607,6 +607,24 @@ extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString( const char *errors /* error handling */ ); +/* --- UTF-7 Codecs ------------------------------------------------------- */ + +extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF7( + const char *string, /* UTF-7 encoded string */ + int length, /* size of string */ + const char *errors /* error handling */ + ); + +extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF7( + const Py_UNICODE *data, /* Unicode char buffer */ + int length, /* number of Py_UNICODE chars to encode */ + int encodeSetO, /* force the encoder to encode characters in + Set O, as described in RFC2152 */ + int encodeWhiteSpace, /* force the encoder to encode space, tab, + carriage return and linefeed characters */ + const char *errors /* error handling */ + ); + /* --- UTF-8 Codecs ------------------------------------------------------- */ extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8( diff --git a/Lib/encodings/aliases.py b/Lib/encodings/aliases.py index c3c49b4..65d8fef 100644 --- a/Lib/encodings/aliases.py +++ b/Lib/encodings/aliases.py @@ -14,6 +14,10 @@ aliases = { 'latin': 'latin_1', 'latin1': 'latin_1', + # UTF-7 + 'utf7': 'utf_7', + 'u7': 'utf_7', + # UTF-8 'utf': 'utf_8', 'utf8': 'utf_8', diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index dde16ef..d57328d 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -377,6 +377,32 @@ print 'done.' 
# Test builtin codecs print 'Testing builtin codecs...', +# UTF-7 specific encoding tests: +utfTests = [(u'A\u2262\u0391.', 'A+ImIDkQ.'), # RFC2152 example + (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'), # RFC2152 example + (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'), # RFC2152 example + (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example + (u'+', '+-'), + (u'+-', '+--'), + (u'+?', '+-?'), + (u'\?', '+AFw?'), + (u'+?', '+-?'), + (ur'\\?', '+AFwAXA?'), + (ur'\\\?', '+AFwAXABc?'), + (ur'++--', '+-+---')] + +for x,y in utfTests: + verify( x.encode('utf-7') == y ) + +try: + unicode('+3ADYAA-', 'utf-7') # surrogates not supported +except UnicodeError: + pass +else: + raise TestFailed, "unicode('+3ADYAA-', 'utf-7') failed to raise an exception" + +verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd') + # UTF-8 specific encoding tests: verify(u'\u20ac'.encode('utf-8') == \ ''.join((chr(0xe2), chr(0x82), chr(0xac))) ) @@ -439,6 +465,7 @@ verify(unicode('Andr\202 x','ascii','ignore') == u"Andr x") verify(unicode('Andr\202 x','ascii','replace') == u'Andr\uFFFD x') verify(u'hello'.encode('ascii') == 'hello') +verify(u'hello'.encode('utf-7') == 'hello') verify(u'hello'.encode('utf-8') == 'hello') verify(u'hello'.encode('utf8') == 'hello') verify(u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000') @@ -447,7 +474,7 @@ verify(u'hello'.encode('latin-1') == 'hello') # Roundtrip safety for BMP (just the first 1024 chars) u = u''.join(map(unichr, range(1024))) -for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be', +for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be', 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'): verify(unicode(u.encode(encoding),encoding) == u) diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c index a085bcf..29b0686 100644 --- a/Modules/_codecsmodule.c +++ b/Modules/_codecsmodule.c @@ -124,6 +124,22 @@ unicode_internal_decode(PyObject *self, } static PyObject * +utf_7_decode(PyObject *self, + 
PyObject *args) +{ + const char *data; + int size; + const char *errors = NULL; + + if (!PyArg_ParseTuple(args, "t#|z:utf_7_decode", + &data, &size, &errors)) + return NULL; + + return codec_tuple(PyUnicode_DecodeUTF7(data, size, errors), + size); +} + +static PyObject * utf_8_decode(PyObject *self, PyObject *args) { @@ -382,6 +398,30 @@ unicode_internal_encode(PyObject *self, } static PyObject * +utf_7_encode(PyObject *self, + PyObject *args) +{ + PyObject *str, *v; + const char *errors = NULL; + + if (!PyArg_ParseTuple(args, "O|z:utf_7_encode", + &str, &errors)) + return NULL; + + str = PyUnicode_FromObject(str); + if (str == NULL) + return NULL; + v = codec_tuple(PyUnicode_EncodeUTF7(PyUnicode_AS_UNICODE(str), + PyUnicode_GET_SIZE(str), + 0, + 0, + errors), + PyUnicode_GET_SIZE(str)); + Py_DECREF(str); + return v; +} + +static PyObject * utf_8_encode(PyObject *self, PyObject *args) { @@ -632,6 +672,8 @@ static PyMethodDef _codecs_functions[] = { #ifdef Py_USING_UNICODE {"utf_8_encode", utf_8_encode, 1}, {"utf_8_decode", utf_8_decode, 1}, + {"utf_7_encode", utf_7_encode, 1}, + {"utf_7_decode", utf_7_decode, 1}, {"utf_16_encode", utf_16_encode, 1}, {"utf_16_le_encode", utf_16_le_encode, 1}, {"utf_16_be_encode", utf_16_be_encode, 1}, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 5080eb8..50f2f5c 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -635,6 +635,306 @@ int PyUnicode_SetDefaultEncoding(const char *encoding) return -1; } +/* --- UTF-7 Codec -------------------------------------------------------- */ + +/* see RFC2152 for details */ + +static +char utf7_special[128] = { + /* indicate whether a UTF-7 character is special i.e. 
cannot be directly + encoded: + 0 - not special + 1 - special + 2 - whitespace (optional) + 3 - RFC2152 Set O (optional) */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, + +}; + +#define SPECIAL(c, encodeO, encodeWS) \ + (((c)>127 || utf7_special[(c)] == 1) || \ + (encodeWS && (utf7_special[(c)] == 2)) || \ + (encodeO && (utf7_special[(c)] == 3))) + +#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]) +#define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/') +#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \ + (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4) + +#define ENCODE(out, ch, bits) \ + while (bits >= 6) { \ + *out++ = B64(ch >> (bits-6)); \ + bits -= 6; \ + } + +#define DECODE(out, ch, bits, surrogate) \ + while (bits >= 16) { \ + Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \ + bits -= 16; \ + if (surrogate) { \ + /* We have already generated an error for the high surrogate + so let's not bother seeing if the low surrogate is correct or not */\ + surrogate = 0; \ + } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \ + /* This is a surrogate pair. 
Unfortunately we can't represent \ + it in a 16-bit character */ \ + surrogate = 1; \ + errmsg = "code pairs are not supported"; \ + goto utf7Error; \ + } else { \ + *out++ = outCh; \ + } \ + } \ + +static +int utf7_decoding_error(Py_UNICODE **dest, + const char *errors, + const char *details) +{ + if ((errors == NULL) || + (strcmp(errors,"strict") == 0)) { + PyErr_Format(PyExc_UnicodeError, + "UTF-7 decoding error: %.400s", + details); + return -1; + } + else if (strcmp(errors,"ignore") == 0) { + return 0; + } + else if (strcmp(errors,"replace") == 0) { + if (dest != NULL) { + **dest = Py_UNICODE_REPLACEMENT_CHARACTER; + (*dest)++; + } + return 0; + } + else { + PyErr_Format(PyExc_ValueError, + "UTF-7 decoding error; unknown error handling code: %.400s", + errors); + return -1; + } +} + +PyObject *PyUnicode_DecodeUTF7(const char *s, + int size, + const char *errors) +{ + const char *e; + PyUnicodeObject *unicode; + Py_UNICODE *p; + const char *errmsg = ""; + int inShift = 0; + unsigned int bitsleft = 0; + unsigned long charsleft = 0; + int surrogate = 0; + + unicode = _PyUnicode_New(size); + if (!unicode) + return NULL; + if (size == 0) + return (PyObject *)unicode; + + p = unicode->str; + e = s + size; + + while (s < e) { + Py_UNICODE ch = *s; + + if (inShift) { + if ((ch == '-') || !B64CHAR(ch)) { + inShift = 0; + s++; + + /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); + if (bitsleft >= 6) { + /* The shift sequence has a partial character in it. If + bitsleft < 6 then we could just classify it as padding + but that is not the case here */ + + errmsg = "partial character in shift sequence"; + goto utf7Error; + } + /* According to RFC2152 the remaining bits should be zero. We + choose to signal an error/insert a replacement character + here to indicate the potential of a misencoded character. 
*/ + + /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ + if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) { + errmsg = "non-zero padding bits in shift sequence"; + goto utf7Error; + } + + if (ch == '-') { + if ((s < e) && (*(s) == '-')) { + *p++ = '-'; + inShift = 1; + } + } else if (SPECIAL(ch,0,0)) { + errmsg = "unexpected special character"; + goto utf7Error; + } else { + *p++ = ch; + } + } else { + charsleft = (charsleft << 6) | UB64(ch); + bitsleft += 6; + s++; + /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); + } + } + else if ( ch == '+' ) { + s++; + if (s < e && *s == '-') { + s++; + *p++ = '+'; + } else + { + inShift = 1; + bitsleft = 0; + } + } + else if (SPECIAL(ch,0,0)) { + errmsg = "unexpected special character"; + s++; + goto utf7Error; + } + else { + *p++ = ch; + s++; + } + continue; + utf7Error: + if (utf7_decoding_error(&p, errors, errmsg)) + goto onError; + } + + if (inShift) { + if (utf7_decoding_error(&p, errors, "unterminated shift sequence")) + goto onError; + } + + if (_PyUnicode_Resize(&unicode, p - unicode->str)) + goto onError; + + return (PyObject *)unicode; + +onError: + Py_DECREF(unicode); + return NULL; +} + + +PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s, + int size, + int encodeSetO, + int encodeWhiteSpace, + const char *errors) +{ + PyObject *v; + /* It might be possible to tighten this worst case */ + unsigned int cbAllocated = 5 * size; + int inShift = 0; + int i = 0; + unsigned int bitsleft = 0; + unsigned long charsleft = 0; + char * out; + char * start; + + if (size == 0) + return PyString_FromStringAndSize(NULL, 0); + + v = PyString_FromStringAndSize(NULL, cbAllocated); + if (v == NULL) + return NULL; + + start = out = PyString_AS_STRING(v); + for (;i < size; ++i) { + Py_UNICODE ch = s[i]; + + if (!inShift) { + if (ch == '+') { + *out++ = '+'; + *out++ = '-'; + } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { + charsleft = ch; + bitsleft = 
16; + *out++ = '+'; + /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); + inShift = bitsleft > 0; + } else { + *out++ = (char) ch; + } + } else { + if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { + *out++ = B64(charsleft << (6-bitsleft)); + charsleft = 0; + bitsleft = 0; + /* Characters not in the BASE64 set implicitly unshift the sequence + so no '-' is required, except if the character is itself a '-' */ + if (B64CHAR(ch) || ch == '-') { + *out++ = '-'; + } + inShift = 0; + *out++ = (char) ch; + } else { + bitsleft += 16; + charsleft = (charsleft << 16) | ch; + /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft); + + /* If the next character is special then we don't need to terminate + the shift sequence. If the next character is not a BASE64 character + or '-' then the shift sequence will be terminated implicitly and we + don't have to insert a '-'. */ + + if (bitsleft == 0) { + if (i + 1 < size) { + Py_UNICODE ch2 = s[i+1]; + + if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { + + } else if (B64CHAR(ch2) || ch2 == '-') { + *out++ = '-'; + inShift = 0; + } else { + inShift = 0; + } + + } + else { + *out++ = '-'; + inShift = 0; + } + } + } + } + } + if (bitsleft) { + *out++= B64(charsleft << (6-bitsleft) ); + *out++ = '-'; + } + + if (_PyString_Resize(&v, out - start)) { + Py_DECREF(v); + return NULL; + } + return v; +} + +#undef SPECIAL +#undef B64 +#undef B64CHAR +#undef UB64 +#undef ENCODE +#undef DECODE + /* --- UTF-8 Codec -------------------------------------------------------- */ static |