Patch #435971: UTF-7 codec by Brian Quinlan.

author: Marc-André Lemburg <mal@egenix.com> 2001-09-20 10:35:46 (GMT)
committer: Marc-André Lemburg <mal@egenix.com> 2001-09-20 10:35:46 (GMT)
commit: c60e6f777114f43c64f1b83f9ad2b6e4efd220e7 (patch)
tree: c7c600ed692c243edbd520872a2648cb9c01a8c1
parent: 26e3b681b26c9978c819396e278f43d356d86f9e (diff)
download: cpython-c60e6f777114f43c64f1b83f9ad2b6e4efd220e7.zip
cpython-c60e6f777114f43c64f1b83f9ad2b6e4efd220e7.tar.gz
cpython-c60e6f777114f43c64f1b83f9ad2b6e4efd220e7.tar.bz2
5 files changed, 392 insertions, 1 deletions
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index d7540fb..41feae2 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -607,6 +607,24 @@ extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
     const char *errors		/* error handling */
     );
 
+/* --- UTF-7 Codecs ------------------------------------------------------- */
+
+extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF7(
+    const char *string, 	/* UTF-7 encoded string */
+    int length,	 		/* size of string */
+    const char *errors		/* error handling */
+    );
+
+extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF7(
+    const Py_UNICODE *data, 	/* Unicode char buffer */
+    int length,	 		/* number of Py_UNICODE chars to encode */
+    int encodeSetO,             /* force the encoder to encode characters in
+                                   Set O, as described in RFC2152 */
+    int encodeWhiteSpace,       /* force the encoder to encode space, tab,
+                                   carriage return and linefeed characters */
+    const char *errors		/* error handling */
+    );
+
 /* --- UTF-8 Codecs ------------------------------------------------------- */
 
 extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
diff --git a/Lib/encodings/aliases.py b/Lib/encodings/aliases.py
index c3c49b4..65d8fef 100644
--- a/Lib/encodings/aliases.py
+++ b/Lib/encodings/aliases.py
@@ -14,6 +14,10 @@ aliases = {
     'latin': 'latin_1',
     'latin1': 'latin_1',
     
+    # UTF-7
+    'utf7': 'utf_7',
+    'u7': 'utf_7',
+    
     # UTF-8
     'utf': 'utf_8',
     'utf8': 'utf_8',
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index dde16ef..d57328d 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -377,6 +377,32 @@ print 'done.'
 # Test builtin codecs
 print 'Testing builtin codecs...',
 
+# UTF-7 specific encoding tests:
+utfTests = [(u'A\u2262\u0391.', 'A+ImIDkQ.'),  # RFC2152 example
+ (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'),     # RFC2152 example
+ (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'),        # RFC2152 example
+ (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
+ (u'+', '+-'),
+ (u'+-', '+--'),
+ (u'+?', '+-?'),
+ (u'\?', '+AFw?'),
+ (u'+?', '+-?'),
+ (ur'\\?', '+AFwAXA?'),
+ (ur'\\\?', '+AFwAXABc?'),
+ (ur'++--', '+-+---')]
+
+for x,y in utfTests:
+    verify( x.encode('utf-7') == y )
+
+try:        
+    unicode('+3ADYAA-', 'utf-7') # surrogates not supported
+except UnicodeError:
+    pass
+else:
+    raise TestFailed, "unicode('+3ADYAA-', 'utf-7') failed to raise an exception"
+
+verify(unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd')
+
 # UTF-8 specific encoding tests:
 verify(u'\u20ac'.encode('utf-8') == \
        ''.join((chr(0xe2), chr(0x82), chr(0xac))) )
@@ -439,6 +465,7 @@ verify(unicode('Andr\202 x','ascii','ignore') == u"Andr x")
 verify(unicode('Andr\202 x','ascii','replace') == u'Andr\uFFFD x')
 
 verify(u'hello'.encode('ascii') == 'hello')
+verify(u'hello'.encode('utf-7') == 'hello')
 verify(u'hello'.encode('utf-8') == 'hello')
 verify(u'hello'.encode('utf8') == 'hello')
 verify(u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000')
@@ -447,7 +474,7 @@ verify(u'hello'.encode('latin-1') == 'hello')
 
 # Roundtrip safety for BMP (just the first 1024 chars)
 u = u''.join(map(unichr, range(1024)))
-for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
+for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
                  'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
     verify(unicode(u.encode(encoding),encoding) == u)
 
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c
index a085bcf..29b0686 100644
--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@@ -124,6 +124,22 @@ unicode_internal_decode(PyObject *self,
 }
 
 static PyObject *
+utf_7_decode(PyObject *self,
+	    PyObject *args)
+{
+    const char *data;
+    int size;
+    const char *errors = NULL;
+    
+    if (!PyArg_ParseTuple(args, "t#|z:utf_7_decode",
+			  &data, &size, &errors))
+	return NULL;
+
+    return codec_tuple(PyUnicode_DecodeUTF7(data, size, errors),
+		       size);
+}
+
+static PyObject *
 utf_8_decode(PyObject *self,
 	    PyObject *args)
 {
@@ -382,6 +398,30 @@ unicode_internal_encode(PyObject *self,
 }
 
 static PyObject *
+utf_7_encode(PyObject *self,
+	    PyObject *args)
+{
+    PyObject *str, *v;
+    const char *errors = NULL;
+
+    if (!PyArg_ParseTuple(args, "O|z:utf_7_encode",
+			  &str, &errors))
+	return NULL;
+
+    str = PyUnicode_FromObject(str);
+    if (str == NULL)
+	return NULL;
+    v = codec_tuple(PyUnicode_EncodeUTF7(PyUnicode_AS_UNICODE(str),
+					 PyUnicode_GET_SIZE(str),
+                     0,
+                     0,
+					 errors),
+		    PyUnicode_GET_SIZE(str));
+    Py_DECREF(str);
+    return v;
+}
+
+static PyObject *
 utf_8_encode(PyObject *self,
 	    PyObject *args)
 {
@@ -632,6 +672,8 @@ static PyMethodDef _codecs_functions[] = {
 #ifdef Py_USING_UNICODE
     {"utf_8_encode",		utf_8_encode,			1},
     {"utf_8_decode",		utf_8_decode,			1},
+    {"utf_7_encode",		utf_7_encode,			1},
+    {"utf_7_decode",		utf_7_decode,			1},
     {"utf_16_encode",		utf_16_encode,			1},
     {"utf_16_le_encode",	utf_16_le_encode,		1},
     {"utf_16_be_encode",	utf_16_be_encode,		1},
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 5080eb8..50f2f5c 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -635,6 +635,306 @@ int PyUnicode_SetDefaultEncoding(const char *encoding)
     return -1;
 }
 
+/* --- UTF-7 Codec -------------------------------------------------------- */
+
+/* see RFC2152 for details */
+
+static 
+char utf7_special[128] = {
+    /* indicate whether a UTF-7 character is special i.e. cannot be directly
+       encoded:
+	   0 - not special
+	   1 - special
+	   2 - whitespace (optional)
+	   3 - RFC2152 Set O (optional) */
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
+    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
+    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
+
+};
+
+#define SPECIAL(c, encodeO, encodeWS) \
+	(((c)>127 || utf7_special[(c)] == 1) || \
+	 (encodeWS && (utf7_special[(c)] == 2)) || \
+     (encodeO && (utf7_special[(c)] == 3)))
+
+#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
+#define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/')
+#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
+                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)
+
+#define ENCODE(out, ch, bits) \
+    while (bits >= 6) { \
+        *out++ = B64(ch >> (bits-6)); \
+        bits -= 6; \
+    }
+
+#define DECODE(out, ch, bits, surrogate) \
+    while (bits >= 16) { \
+        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
+        bits -= 16; \
+		if (surrogate) { \
+			/* We have already generated an error for the high surrogate
+               so let's not bother seeing if the low surrogate is correct or not */\
+			surrogate = 0; \
+		} else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
+            /* This is a surrogate pair. Unfortunately we can't represent \
+               it in a 16-bit character */ \
+			surrogate = 1; \
+            errmsg = "code pairs are not supported"; \
+	        goto utf7Error; \
+		} else { \
+				*out++ = outCh; \
+		} \
+    } \
+
+static
+int utf7_decoding_error(Py_UNICODE **dest,
+                        const char *errors,
+                        const char *details) 
+{
+    if ((errors == NULL) ||
+        (strcmp(errors,"strict") == 0)) {
+        PyErr_Format(PyExc_UnicodeError,
+                     "UTF-7 decoding error: %.400s",
+                     details);
+        return -1;
+    }
+    else if (strcmp(errors,"ignore") == 0) {
+        return 0;
+    }
+    else if (strcmp(errors,"replace") == 0) {
+        if (dest != NULL) {
+            **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
+            (*dest)++;
+        }
+        return 0;
+    }
+    else {
+        PyErr_Format(PyExc_ValueError,
+                     "UTF-7 decoding error; unknown error handling code: %.400s",
+                     errors);
+        return -1;
+    }
+}
+
+PyObject *PyUnicode_DecodeUTF7(const char *s,
+			       int size,
+			       const char *errors)
+{
+    const char *e;
+    PyUnicodeObject *unicode;
+    Py_UNICODE *p;
+    const char *errmsg = "";
+    int inShift = 0;
+    unsigned int bitsleft = 0;
+    unsigned long charsleft = 0;
+	int surrogate = 0;
+
+    unicode = _PyUnicode_New(size);
+    if (!unicode)
+        return NULL;
+    if (size == 0)
+        return (PyObject *)unicode;
+
+    p = unicode->str;
+    e = s + size;
+
+    while (s < e) {
+        Py_UNICODE ch = *s;
+
+        if (inShift) {
+            if ((ch == '-') || !B64CHAR(ch)) {
+                inShift = 0;
+                s++;
+                    
+                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
+                if (bitsleft >= 6) {
+                    /* The shift sequence has a partial character in it. If
+                       bitsleft < 6 then we could just classify it as padding
+                       but that is not the case here */
+
+                    errmsg = "partial character in shift sequence";
+                    goto utf7Error; 
+                }
+                /* According to RFC2152 the remaining bits should be zero. We
+                   choose to signal an error/insert a replacement character 
+                   here so indicate the potential of a misencoded character. */
+
+                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
+                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
+                    errmsg = "non-zero padding bits in shift sequence";
+                    goto utf7Error; 
+                }
+
+                if (ch == '-') {
+                    if ((s < e) && (*(s) == '-')) {
+                        *p++ = '-';   
+                        inShift = 1;
+                    }
+                } else if (SPECIAL(ch,0,0)) {
+                    errmsg = "unexpected special character";
+	                goto utf7Error;  
+                } else  {
+                    *p++ = ch;
+                }
+            } else {
+                charsleft = (charsleft << 6) | UB64(ch);
+                bitsleft += 6;
+                s++;
+                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
+            }
+        }
+        else if ( ch == '+' ) {
+            s++;
+            if (s < e && *s == '-') {
+                s++;
+                *p++ = '+';
+            } else
+            {
+                inShift = 1;
+                bitsleft = 0;
+            }
+        }
+        else if (SPECIAL(ch,0,0)) {
+            errmsg = "unexpected special character";
+            s++;
+	        goto utf7Error;  
+        }
+        else {
+            *p++ = ch;
+            s++;
+        }
+        continue;
+    utf7Error:
+      if (utf7_decoding_error(&p, errors, errmsg))
+          goto onError;
+    }
+
+    if (inShift) {
+        if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
+            goto onError;
+    }
+
+    if (_PyUnicode_Resize(&unicode, p - unicode->str))
+        goto onError;
+
+    return (PyObject *)unicode;
+
+onError:
+    Py_DECREF(unicode);
+    return NULL;
+}
+
+
+PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
+                   int size,
+                   int encodeSetO,
+                   int encodeWhiteSpace,
+                   const char *errors)
+{
+    PyObject *v;
+    /* It might be possible to tighten this worst case */
+    unsigned int cbAllocated = 5 * size;
+    int inShift = 0;
+    int i = 0;
+    unsigned int bitsleft = 0;
+    unsigned long charsleft = 0;
+    char * out;
+    char * start;
+
+    if (size == 0)
+		return PyString_FromStringAndSize(NULL, 0);
+
+    v = PyString_FromStringAndSize(NULL, cbAllocated);
+    if (v == NULL)
+        return NULL;
+
+    start = out = PyString_AS_STRING(v);
+    for (;i < size; ++i) {
+        Py_UNICODE ch = s[i];
+
+        if (!inShift) {
+			if (ch == '+') {
+				*out++ = '+';
+                *out++ = '-';
+            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
+                charsleft = ch;
+                bitsleft = 16;
+                *out++ = '+';
+				/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
+                inShift = bitsleft > 0;
+			} else {
+				*out++ = (char) ch;
+			}
+		} else {
+            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
+                *out++ = B64(charsleft << (6-bitsleft));
+                charsleft = 0;
+                bitsleft = 0;
+                /* Characters not in the BASE64 set implicitly unshift the sequence
+                   so no '-' is required, except if the character is itself a '-' */
+                if (B64CHAR(ch) || ch == '-') {
+                    *out++ = '-';
+                }
+                inShift = 0;
+                *out++ = (char) ch;
+            } else {
+                bitsleft += 16;
+                charsleft = (charsleft << 16) | ch;
+                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
+
+                /* If the next character is special then we dont' need to terminate
+                   the shift sequence. If the next character is not a BASE64 character 
+                   or '-' then the shift sequence will be terminated implicitly and we
+                   don't have to insert a '-'. */
+
+                if (bitsleft == 0) {
+                    if (i + 1 < size) {
+                        Py_UNICODE ch2 = s[i+1];
+
+                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
+                           
+                        } else if (B64CHAR(ch2) || ch2 == '-') {
+                            *out++ = '-';
+                            inShift = 0;
+                        } else {
+                            inShift = 0;
+                        }
+
+                    }
+                    else {
+                        *out++ = '-';
+                        inShift = 0;
+                    }
+                }
+            }            
+        }
+	}
+    if (bitsleft) {
+        *out++= B64(charsleft << (6-bitsleft) );
+        *out++ = '-';
+    }
+
+    if (_PyString_Resize(&v, out - start)) {
+        Py_DECREF(v);
+        return NULL;
+    }
+    return v;
+}
+
+#undef SPECIAL
+#undef B64
+#undef B64CHAR
+#undef UB64
+#undef ENCODE
+#undef DECODE
+
 /* --- UTF-8 Codec -------------------------------------------------------- */
 
 static
author	Marc-André Lemburg <mal@egenix.com>	2001-09-20 10:35:46 (GMT)
committer	Marc-André Lemburg <mal@egenix.com>	2001-09-20 10:35:46 (GMT)
commit	c60e6f777114f43c64f1b83f9ad2b6e4efd220e7 (patch)
tree	c7c600ed692c243edbd520872a2648cb9c01a8c1
parent	26e3b681b26c9978c819396e278f43d356d86f9e (diff)
download	cpython-c60e6f777114f43c64f1b83f9ad2b6e4efd220e7.zip cpython-c60e6f777114f43c64f1b83f9ad2b6e4efd220e7.tar.gz cpython-c60e6f777114f43c64f1b83f9ad2b6e4efd220e7.tar.bz2