summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- Include/unicodeobject.h 6
-rw-r--r-- Lib/test/test_unicode.py 20
-rw-r--r-- Misc/ACKS 1
-rw-r--r-- Misc/NEWS 3
-rw-r--r-- Objects/unicodeobject.c 425
5 files changed, 266 insertions, 189 deletions
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 696c1c7..9c11873 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -858,10 +858,8 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
const Py_UNICODE *data, /* Unicode char buffer */
Py_ssize_t length, /* number of Py_UNICODE chars to encode */
- int encodeSetO, /* force the encoder to encode characters in
- Set O, as described in RFC2152 */
- int encodeWhiteSpace, /* force the encoder to encode space, tab,
- carriage return and linefeed characters */
+ int base64SetO, /* Encode RFC2152 Set O characters in base64 */
+ int base64WhiteSpace, /* Encode whitespace (sp, ht, nl, cr) in base64 */
const char *errors /* error handling */
);
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 220a8eb..21bb922 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -867,19 +867,31 @@ class UnicodeTest(
('+?', b'+-?'),
(r'\\?', b'+AFwAXA?'),
(r'\\\?', b'+AFwAXABc?'),
- (r'++--', b'+-+---')
+ (r'++--', b'+-+---'),
+ ('\U000abcde', b'+2m/c3g-'), # surrogate pairs
+ ('/', b'/'),
]
for (x, y) in utfTests:
self.assertEqual(x.encode('utf-7'), y)
- # surrogates not supported
+ # Unpaired surrogates not supported
self.assertRaises(UnicodeError, str, b'+3ADYAA-', 'utf-7')
- self.assertEqual(str(b'+3ADYAA-', 'utf-7', 'replace'), '\ufffd')
+ self.assertEqual(str(b'+3ADYAA-', 'utf-7', 'replace'), '\ufffd\ufffd')
# Issue #2242: crash on some Windows/MSVC versions
- self.assertRaises(UnicodeDecodeError, b'+\xc1'.decode, 'utf-7')
+ self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1')
+
+ # Directly encoded characters (RFC 2152 Set D)
+ set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
+ # Optional direct characters (RFC 2152 Set O)
+ set_o = '!"#$%&*;<=>@[]^_`{|}'
+ for c in set_d:
+ self.assertEqual(c.encode('utf7'), c.encode('ascii'))
+ self.assertEqual(c.encode('ascii').decode('utf7'), c)
+ for c in set_o:
+ self.assertEqual(c.encode('ascii').decode('utf7'), c)
def test_codecs_utf8(self):
self.assertEqual(''.encode('utf-8'), b'')
diff --git a/Misc/ACKS b/Misc/ACKS
index 7eab407..e26940a 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -35,6 +35,7 @@ Luigi Ballabio
Jeff Balogh
Michael J. Barber
Chris Barker
+Nick Barnes
Quentin Barnes
Richard Barran
Cesar Eduardo Barros
diff --git a/Misc/NEWS b/Misc/NEWS
index befef21..378ed32 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,9 @@ What's New in Python 3.1 beta 1?
Core and Builtins
-----------------
+- Issue #4426: The UTF-7 decoder was too strict and didn't accept some legal
+ sequences. Patch by Nick Barnes and Victor Stinner.
+
- Issue #3672: Reject surrogates in utf-8 codec; add surrogates error handler.
- Issue #5883: In the io module, the BufferedIOBase and TextIOBase ABCs have
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 6ad73e0..18b6fa2 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1702,69 +1702,84 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
/* --- UTF-7 Codec -------------------------------------------------------- */
-/* see RFC2152 for details */
+/* See RFC2152 for details. We encode conservatively and decode liberally. */
-static
-char utf7_special[128] = {
- /* indicate whether a UTF-7 character is special i.e. cannot be directly
- encoded:
- 0 - not special
- 1 - special
- 2 - whitespace (optional)
- 3 - RFC2152 Set O (optional) */
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
- 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
- 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
+/* Three simple macros defining base-64. */
-};
+/* Is c a base-64 character? */
+
+#define IS_BASE64(c) \
+ (((c) >= 'A' && (c) <= 'Z') || \
+ ((c) >= 'a' && (c) <= 'z') || \
+ ((c) >= '0' && (c) <= '9') || \
+ (c) == '+' || (c) == '/')
-/* Note: The comparison (c) <= 0 is a trick to work-around gcc
- warnings about the comparison always being false; since
- utf7_special[0] is 1, we can safely make that one comparison
- true */
+/* Given that c is a base-64 character, what is its base-64 value? */
-#define SPECIAL(c, encodeO, encodeWS) \
- ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
- (encodeWS && (utf7_special[(c)] == 2)) || \
- (encodeO && (utf7_special[(c)] == 3)))
+#define FROM_BASE64(c) \
+ (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
+ ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
+ ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
+ (c) == '+' ? 62 : 63)
-#define B64(n) \
+/* What is the base-64 character of the bottom 6 bits of n? */
+
+#define TO_BASE64(n) \
("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
-#define B64CHAR(c) \
- (ISALNUM(c) || (c) == '+' || (c) == '/')
-#define UB64(c) \
- ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
- (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
-
-#define ENCODE(out, ch, bits) \
- while (bits >= 6) { \
- *out++ = B64(ch >> (bits-6)); \
- bits -= 6; \
- }
-
-#define DECODE(out, ch, bits, surrogate) \
- while (bits >= 16) { \
- Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
- bits -= 16; \
- if (surrogate) { \
- /* We have already generated an error for the high surrogate \
- so let's not bother seeing if the low surrogate is correct or not */ \
- surrogate = 0; \
- } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
- /* This is a surrogate pair. Unfortunately we can't represent \
- it in a 16-bit character */ \
- surrogate = 1; \
- errmsg = "code pairs are not supported"; \
- goto utf7Error; \
- } else { \
- *out++ = outCh; \
- } \
- }
+
+/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
+ * decoded as itself. We are permissive on decoding; the only ASCII
+ * byte not decoding to itself is the + which begins a base64
+ * string. */
+
+#define DECODE_DIRECT(c) \
+ ((c) <= 127 && (c) != '+')
+
+/* The UTF-7 encoder treats ASCII characters differently according to
+ * whether they are Set D, Set O, Whitespace, or special (i.e. none of
+ * the above). See RFC2152. This array identifies these different
+ * sets:
+ * 0 : "Set D"
+ * alphanumeric and '(),-./:?
+ * 1 : "Set O"
+ * !"#$%&*;<=>@[]^_`{|}
+ * 2 : "whitespace"
+ * ht nl cr sp
+ * 3 : special (must be base64 encoded)
+ * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
+ */
+
+static
+char utf7_category[128] = {
+/* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
+/* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+/* sp ! " # $ % & ' ( ) * + , - . / */
+ 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
+/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
+/* @ A B C D E F G H I J K L M N O */
+ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/* P Q R S T U V W X Y Z [ \ ] ^ _ */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
+/* ` a b c d e f g h i j k l m n o */
+ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+/* p q r s t u v w x y z { | } ~ del */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
+};
+
+/* ENCODE_DIRECT: this character should be encoded as itself. The
+ * answer depends on whether we are encoding set O as itself, and also
+ * on whether we are encoding whitespace as itself. RFC2152 makes it
+ * clear that the answers to these questions vary between
+ * applications, so this code needs to be flexible. */
+
+#define ENCODE_DIRECT(c, directO, directWS) \
+ ((c) < 128 && (c) > 0 && \
+ ((utf7_category[(c)] == 0) || \
+ (directWS && (utf7_category[(c)] == 2)) || \
+ (directO && (utf7_category[(c)] == 1))))
PyObject *PyUnicode_DecodeUTF7(const char *s,
Py_ssize_t size,
@@ -1773,6 +1788,13 @@ PyObject *PyUnicode_DecodeUTF7(const char *s,
return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}
+/* The decoder. The only state we preserve is our read position,
+ * i.e. how many input bytes we have consumed. So if the input ends
+ * in the middle of a shift sequence and the caller asked for the
+ * consumed count, we have to back off the read position and the
+ * output to the beginning of that sequence, otherwise we lose all
+ * the shift state (seen bits, number of bits seen, high
+ * surrogate). */
+
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Py_ssize_t size,
const char *errors,
@@ -1787,9 +1809,10 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Py_UNICODE *p;
const char *errmsg = "";
int inShift = 0;
- unsigned int bitsleft = 0;
- unsigned long charsleft = 0;
- int surrogate = 0;
+ Py_UNICODE *shiftOutStart;
+ unsigned int base64bits = 0;
+ unsigned long base64buffer = 0;
+ Py_UNICODE surrogate = 0;
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
@@ -1803,6 +1826,7 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
}
p = unicode->str;
+ shiftOutStart = p;
e = s + size;
while (s < e) {
@@ -1810,72 +1834,101 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
restart:
ch = (unsigned char) *s;
- if (inShift) {
- if ((ch == '-') || !B64CHAR(ch)) {
- inShift = 0;
+ if (inShift) { /* in a base-64 section */
+ if (IS_BASE64(ch)) { /* consume a base-64 character */
+ base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
+ base64bits += 6;
s++;
-
- /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
- if (bitsleft >= 6) {
- /* The shift sequence has a partial character in it. If
- bitsleft < 6 then we could just classify it as padding
- but that is not the case here */
-
- errmsg = "partial character in shift sequence";
- goto utf7Error;
+ if (base64bits >= 16) {
+ /* we have enough bits for a UTF-16 value */
+ Py_UNICODE outCh = (Py_UNICODE)
+ (base64buffer >> (base64bits-16));
+ base64bits -= 16;
+ base64buffer &= (1 << base64bits) - 1; /* clear high bits */
+ if (surrogate) {
+ /* expecting a second surrogate */
+ if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
+#ifdef Py_UNICODE_WIDE
+ *p++ = (((surrogate & 0x3FF)<<10)
+ | (outCh & 0x3FF)) + 0x10000;
+#else
+ *p++ = surrogate;
+ *p++ = outCh;
+#endif
+ surrogate = 0;
+ }
+ else {
+ surrogate = 0;
+ errmsg = "second surrogate missing";
+ goto utf7Error;
+ }
+ }
+ else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
+ /* first surrogate */
+ surrogate = outCh;
+ }
+ else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
+ errmsg = "unexpected second surrogate";
+ goto utf7Error;
+ }
+ else {
+ *p++ = outCh;
+ }
}
- /* According to RFC2152 the remaining bits should be zero. We
- choose to signal an error/insert a replacement character
- here so indicate the potential of a misencoded character. */
-
- /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
- if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
- errmsg = "non-zero padding bits in shift sequence";
+ }
+ else { /* now leaving a base-64 section */
+ inShift = 0;
+ s++;
+ if (surrogate) {
+ errmsg = "second surrogate missing at end of shift sequence";
goto utf7Error;
}
-
- if (ch == '-') {
- if ((s < e) && (*(s) == '-')) {
- *p++ = '-';
- inShift = 1;
+ if (base64bits > 0) { /* left-over bits */
+ if (base64bits >= 6) {
+ /* At least one whole base-64 character is left over; that is too much to be padding */
+ errmsg = "partial character in shift sequence";
+ goto utf7Error;
}
- } else if (SPECIAL(ch,0,0)) {
- errmsg = "unexpected special character";
- goto utf7Error;
- } else {
+ else {
+ /* Some bits remain; they should be zero */
+ if (base64buffer != 0) {
+ errmsg = "non-zero padding bits in shift sequence";
+ goto utf7Error;
+ }
+ }
+ }
+ if (ch != '-') {
+ /* '-' is absorbed; other terminating
+ characters are preserved */
*p++ = ch;
}
- } else {
- charsleft = (charsleft << 6) | UB64(ch);
- bitsleft += 6;
- s++;
- /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
}
}
else if ( ch == '+' ) {
startinpos = s-starts;
- s++;
- if (s < e && *s == '-') {
+ s++; /* consume '+' */
+ if (s < e && *s == '-') { /* '+-' encodes '+' */
s++;
*p++ = '+';
- } else
- {
+ }
+ else { /* begin base64-encoded section */
inShift = 1;
- bitsleft = 0;
+ shiftOutStart = p;
+ base64bits = 0;
}
}
- else if (SPECIAL(ch,0,0)) {
- startinpos = s-starts;
- errmsg = "unexpected special character";
+ else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
+ *p++ = ch;
s++;
- goto utf7Error;
}
else {
- *p++ = ch;
+ startinpos = s-starts;
s++;
+ errmsg = "unexpected special character";
+ goto utf7Error;
}
continue;
- utf7Error:
+utf7Error:
outpos = p-PyUnicode_AS_UNICODE(unicode);
endinpos = s-starts;
if (unicode_decode_call_errorhandler(
@@ -1886,23 +1939,35 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
goto onError;
}
- if (inShift && !consumed) {
- outpos = p-PyUnicode_AS_UNICODE(unicode);
- endinpos = size;
- if (unicode_decode_call_errorhandler(
- errors, &errorHandler,
- "utf7", "unterminated shift sequence",
- &starts, &e, &startinpos, &endinpos, &exc, &s,
- &unicode, &outpos, &p))
- goto onError;
- if (s < e)
- goto restart;
+ /* end of string */
+
+ if (inShift && !consumed) { /* in shift sequence, no more to follow */
+ /* if we're in an inconsistent state, that's an error */
+ if (surrogate ||
+ (base64bits >= 6) ||
+ (base64bits > 0 && base64buffer != 0)) {
+ outpos = p-PyUnicode_AS_UNICODE(unicode);
+ endinpos = size;
+ if (unicode_decode_call_errorhandler(
+ errors, &errorHandler,
+ "utf7", "unterminated shift sequence",
+ &starts, &e, &startinpos, &endinpos, &exc, &s,
+ &unicode, &outpos, &p))
+ goto onError;
+ if (s < e)
+ goto restart;
+ }
}
+
+ /* return state */
if (consumed) {
- if(inShift)
+ if (inShift) {
+ p = shiftOutStart; /* back off output */
*consumed = startinpos;
- else
+ }
+ else {
*consumed = s-starts;
+ }
}
if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
@@ -1922,27 +1987,27 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Py_ssize_t size,
- int encodeSetO,
- int encodeWhiteSpace,
+ int base64SetO,
+ int base64WhiteSpace,
const char *errors)
{
PyObject *v;
/* It might be possible to tighten this worst case */
- Py_ssize_t cbAllocated = 5 * size;
+ Py_ssize_t allocated = 5 * size;
int inShift = 0;
Py_ssize_t i = 0;
- unsigned int bitsleft = 0;
- unsigned long charsleft = 0;
+ unsigned int base64bits = 0;
+ unsigned long base64buffer = 0;
char * out;
char * start;
if (size == 0)
return PyBytes_FromStringAndSize(NULL, 0);
- if (cbAllocated / 5 != size)
+ if (allocated / 5 != size)
return PyErr_NoMemory();
- v = PyBytes_FromStringAndSize(NULL, cbAllocated);
+ v = PyBytes_FromStringAndSize(NULL, allocated);
if (v == NULL)
return NULL;
@@ -1950,78 +2015,76 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
for (;i < size; ++i) {
Py_UNICODE ch = s[i];
- if (!inShift) {
- if (ch == '+') {
- *out++ = '+';
- *out++ = '-';
- } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
- charsleft = ch;
- bitsleft = 16;
- *out++ = '+';
- /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
- inShift = bitsleft > 0;
- } else {
- *out++ = (char) ch;
- }
- } else {
- if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
- *out++ = B64(charsleft << (6-bitsleft));
- charsleft = 0;
- bitsleft = 0;
+ if (inShift) {
+ if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
+ /* shifting out */
+ if (base64bits) { /* output remaining bits */
+ *out++ = TO_BASE64(base64buffer << (6-base64bits));
+ base64buffer = 0;
+ base64bits = 0;
+ }
+ inShift = 0;
/* Characters not in the BASE64 set implicitly unshift the sequence
so no '-' is required, except if the character is itself a '-' */
- if (B64CHAR(ch) || ch == '-') {
+ if (IS_BASE64(ch) || ch == '-') {
*out++ = '-';
}
- inShift = 0;
*out++ = (char) ch;
- } else {
- bitsleft += 16;
- charsleft = (charsleft << 16) | ch;
- /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
-
- /* If the next character is special then we don't need to terminate
- the shift sequence. If the next character is not a BASE64 character
- or '-' then the shift sequence will be terminated implicitly and we
- don't have to insert a '-'. */
-
- if (bitsleft == 0) {
- if (i + 1 < size) {
- Py_UNICODE ch2 = s[i+1];
-
- if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
-
- } else if (B64CHAR(ch2) || ch2 == '-') {
- *out++ = '-';
- inShift = 0;
- } else {
- inShift = 0;
- }
-
- }
- else {
+ }
+ else {
+ goto encode_char;
+ }
+ }
+ else { /* not in a shift sequence */
+ if (ch == '+') {
+ *out++ = '+';
*out++ = '-';
- inShift = 0;
- }
- }
+ }
+ else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
+ *out++ = (char) ch;
+ }
+ else {
+ *out++ = '+';
+ inShift = 1;
+ goto encode_char;
}
}
+ continue;
+encode_char:
+#ifdef Py_UNICODE_WIDE
+ if (ch >= 0x10000) {
+ /* code first surrogate */
+ base64bits += 16;
+ base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
+ while (base64bits >= 6) {
+ *out++ = TO_BASE64(base64buffer >> (base64bits-6));
+ base64bits -= 6;
+ }
+ /* prepare second surrogate */
+ ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
+ }
+#endif
+ base64bits += 16;
+ base64buffer = (base64buffer << 16) | ch;
+ while (base64bits >= 6) {
+ *out++ = TO_BASE64(base64buffer >> (base64bits-6));
+ base64bits -= 6;
+ }
}
- if (bitsleft) {
- *out++= B64(charsleft << (6-bitsleft) );
+ if (base64bits)
+ *out++= TO_BASE64(base64buffer << (6-base64bits) );
+ if (inShift)
*out++ = '-';
- }
if (_PyBytes_Resize(&v, out - start) < 0)
return NULL;
return v;
}
-#undef SPECIAL
-#undef B64
-#undef B64CHAR
-#undef UB64
-#undef ENCODE
-#undef DECODE
+#undef IS_BASE64
+#undef FROM_BASE64
+#undef TO_BASE64
+#undef DECODE_DIRECT
+#undef ENCODE_DIRECT
/* --- UTF-8 Codec -------------------------------------------------------- */