Patch #462190, patch #464070: Support quoted printable in the binascii module.

Decode and encode underscores for header style encoding. Fixes bug #463996.
author: Martin v. Löwis <martin@v.loewis.de> 2001-09-30 20:32:11 (GMT)
committer: Martin v. Löwis <martin@v.loewis.de> 2001-09-30 20:32:11 (GMT)
commit: 16dc7f44b1116aab58897bc7e94cb972488206fc (patch)
tree: b1e90d9ca27e6dbdd0cd6b6d66fcb8a333a746a3 /Modules/binascii.c
parent: 5f12d755a82312673c35e8224b2bde7ced159c52 (diff)
download: cpython-16dc7f44b1116aab58897bc7e94cb972488206fc.zip
cpython-16dc7f44b1116aab58897bc7e94cb972488206fc.tar.gz
cpython-16dc7f44b1116aab58897bc7e94cb972488206fc.tar.bz2
1 files changed, 296 insertions, 0 deletions
diff --git a/Modules/binascii.c b/Modules/binascii.c
index 00a2805..484f656 100644
--- a/Modules/binascii.c
+++ b/Modules/binascii.c
@@ -42,6 +42,15 @@
 ** does make the performance sub-optimal. Oh well, too bad...
 **
 ** Jack Jansen, CWI, July 1995.
+** 
+** Added support for quoted-printable encoding, based on RFC 1521 et al.
+** Quoted-printable encoding specifies that non-printable characters (anything
+** below 32 or above 126) be encoded as =XX, where XX is the hexadecimal value
+** of the character.  It also specifies some other behavior so that 8-bit data
+** can be carried in a mail message with little difficulty (maximum line
+** sizes, protecting some cases of whitespace, etc.).
+**
+** Brandon Long, September 2001.
 */
 
 
@@ -971,6 +980,289 @@ static char doc_unhexlify[] =
 hexstr must contain an even number of hex digits (upper or lower case).\n\
 This function is also available as \"unhexlify()\"";
 
+static int table_hex[128] = {	/* ASCII code -> hex digit value; -1 marks a non-hex character */
+  -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,	/* 0x00-0x0F */
+  -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,	/* 0x10-0x1F */
+  -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,	/* 0x20-0x2F */
+   0, 1, 2, 3,  4, 5, 6, 7,  8, 9,-1,-1, -1,-1,-1,-1,	/* 0x30-0x3F: '0'-'9' */
+  -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,	/* 0x40-0x4F: 'A'-'F' */
+  -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,	/* 0x50-0x5F */
+  -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,	/* 0x60-0x6F: 'a'-'f' */
+  -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1	/* 0x70-0x7F */
+};
+
+#define hexval(c) table_hex[(unsigned int)(c)]	/* no bounds check: c must be below 128 */
+
+#define MAXLINESIZE 76	/* line-length limit b2a_qp tests before inserting a soft line break */
+
+static char doc_a2b_qp[] = "Decode a string of qp-encoded data";
+
+/* Decode quoted-printable data (Python args: data, header=0).
+ *   "=XX" (two hex digits) decodes to the byte with that value.
+ *   "=" followed by space, tab, CR or LF is a soft line break: input is
+ *       skipped up to and including the next '\n'.
+ *   "==" decodes to one '='; any other '=' that does not start a valid
+ *       escape is copied through, and a '=' ending the input is dropped.
+ *   '_' decodes to a space when header is true.
+ * Returns a new string object, or NULL with an exception set.
+ */
+static PyObject *
+binascii_a2b_qp(PyObject *self, PyObject *args, PyObject *kwargs)
+{
+	unsigned int in, out;
+	char ch;
+	unsigned char *data, *odata;
+	unsigned int datalen = 0;
+	PyObject *rv;
+	static char *kwlist[] = {"data", "header", NULL};
+	int header = 0;
+
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|i", kwlist, &data,
+	      &datalen, &header))
+		return NULL;
+
+	/* Output never exceeds the input size.  Request at least one byte:
+	 * calloc(1, 0) may return NULL, which is not an out-of-memory error. */
+	odata = (unsigned char *) calloc(1, datalen ? datalen : 1);
+	if (odata == NULL)
+		return PyErr_NoMemory();
+
+	in = out = 0;
+	while (in < datalen) {
+		if (data[in] == '=') {
+			in++;
+			if (in >= datalen) break;
+			/* Soft line breaks */
+			if ((data[in] == '\n') || (data[in] == '\r') ||
+			    (data[in] == ' ') || (data[in] == '\t')) {
+				if (data[in] != '\n') {
+					while (in < datalen && data[in] != '\n') in++;
+				}
+				if (in < datalen) in++;
+			}
+			else if (data[in] == '=') {
+				/* broken case from broken python qp */
+				odata[out++] = '=';
+				in++;
+			}
+			else if ((in + 1 < datalen) &&
+			         (data[in] < 128) && (hexval(data[in]) >= 0) &&
+			         (data[in+1] < 128) && (hexval(data[in+1]) >= 0)) {
+				/* both digits are in range and inside table_hex */
+				ch = hexval(data[in]) << 4;
+				in++;
+				ch |= hexval(data[in]);
+				in++;
+				odata[out++] = ch;
+			}
+			else {
+				/* Not an escape: emit '=' and reprocess this byte */
+				odata[out++] = '=';
+			}
+		}
+		else if (header && data[in] == '_') {
+			odata[out++] = ' ';
+			in++;
+		}
+		else {
+			odata[out++] = data[in++];
+		}
+	}
+	rv = PyString_FromStringAndSize((char *) odata, out);
+	free(odata);
+	return rv;
+}
+
+static int /* Writes ch as two uppercase hex digits into s[0] and s[1]; no terminator is stored. */
+to_hex (unsigned char ch, unsigned char *s)
+{
+	unsigned int uvalue = ch;
+
+	s[1] = "0123456789ABCDEF"[uvalue % 16];	/* low nibble */
+	uvalue = (uvalue / 16);
+	s[0] = "0123456789ABCDEF"[uvalue % 16];	/* high nibble */
+	return 0;	/* always 0; the callers in b2a_qp ignore it */
+}
+
+static char doc_b2a_qp[] =
+"b2a_qp(data, quotetabs=0, istext=1, header=0) -> s; \n\
+ Encode a string using quoted-printable encoding. \n\
+\n\
+On encoding, when istext is set, newlines are not encoded, and white \n\
+space at end of lines is.  When istext is not set, \\r and \\n (CR/LF) are \n\
+both encoded.  When quotetabs is set, space and tabs are encoded.";
+
+/* Encode data as quoted-printable (Python args: data, quotetabs=0,
+ * istext=1, header=0).
+ *
+ * Works in two passes that must make identical decisions: the first only
+ * computes the output size, the second fills the buffer.  Any change to
+ * the escaping or line-breaking rules has to be made in both loops, or
+ * the second pass can write past the allocation.
+ *
+ * XXX: This is ridiculously complicated to be backward compatible
+ * (mostly) with the quopri module.  It doesn't re-create the quopri
+ * module bug where text ending in CRLF has the CR encoded.
+ *
+ * Returns a new string object, or NULL with an exception set.
+ */
+static PyObject *
+binascii_b2a_qp (PyObject *self, PyObject *args, PyObject *kwargs)
+{
+	unsigned int in, out;
+	unsigned char *data, *odata;
+	unsigned int datalen = 0, odatalen = 0;
+	PyObject *rv;
+	unsigned int linelen = 0;
+	static char *kwlist[] = {"data", "quotetabs", "istext", "header", NULL};
+	int istext = 1;
+	int quotetabs = 0;
+	int header = 0;
+	unsigned char ch;
+	int crlf = 0;
+	unsigned char *p;
+
+	if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|iii", kwlist, &data,
+	      &datalen, &quotetabs, &istext, &header))
+		return NULL;
+
+	/* See if this string is using CRLF line ends.  Only the first '\n'
+	 * is examined, and the result decides how every line end is written.
+	 * memchr is bounded by datalen, so a NUL byte inside the data does
+	 * not cut the search short the way strchr would. */
+	p = (unsigned char *) memchr(data, '\n', datalen);
+	if ((p != NULL) && (p > data) && (*(p-1) == '\r'))
+		crlf = 1;
+
+	/* First, scan to see how many characters need to be encoded */
+	in = 0;
+	while (in < datalen) {
+		/* Escaped as =XX: bytes above 126, '=', '_' in header mode,
+		 * '.' when linelen is 1, CR/LF when istext is off, a space
+		 * or tab ending the data, and control characters (space
+		 * and tab only when quotetabs is set). */
+		if ((data[in] > 126) ||
+		    (data[in] == '=') ||
+		    (header && data[in] == '_') ||
+		    ((data[in] == '.') && (linelen == 1)) ||
+		    (!istext && ((data[in] == '\r') || (data[in] == '\n'))) ||
+		    ((data[in] == '\t' || data[in] == ' ') && (in + 1 == datalen)) ||
+		    ((data[in] < 33) &&
+		     (data[in] != '\r') && (data[in] != '\n') &&
+		     (quotetabs || ((data[in] != '\t') && (data[in] != ' ')))))
+		{
+			if ((linelen + 3) >= MAXLINESIZE) {
+				/* soft break: '=' plus the line end */
+				linelen = 0;
+				odatalen += crlf ? 3 : 2;
+			}
+			linelen += 3;
+			odatalen += 3;
+			in++;
+		}
+		else {
+			if (istext &&
+			    ((data[in] == '\n') ||
+			     ((in+1 < datalen) && (data[in] == '\r') &&
+			     (data[in+1] == '\n'))))
+			{
+				linelen = 0;
+				/* Protect against whitespace on end of line.  This
+				 * looks at the input byte while the second pass looks
+				 * at the output byte, so it can only over-estimate. */
+				if (in && ((data[in-1] == ' ') || (data[in-1] == '\t')))
+					odatalen += 2;
+				odatalen += crlf ? 2 : 1;
+				in += (data[in] == '\r') ? 2 : 1;
+			}
+			else {
+				if ((in + 1 != datalen) &&
+				    (data[in+1] != '\n') &&
+				    (linelen + 1) >= MAXLINESIZE) {
+					linelen = 0;
+					odatalen += crlf ? 3 : 2;
+				}
+				linelen++;
+				odatalen++;
+				in++;
+			}
+		}
+	}
+
+	/* Ask for at least one byte: calloc(1, 0) may return NULL, which
+	 * would turn empty input into a spurious MemoryError. */
+	odata = (unsigned char *) calloc(1, odatalen ? odatalen : 1);
+	if (odata == NULL)
+		return PyErr_NoMemory();
+
+	/* Second pass: same decisions as above, now writing the output */
+	in = out = linelen = 0;
+	while (in < datalen) {
+		if ((data[in] > 126) ||
+		    (data[in] == '=') ||
+		    (header && data[in] == '_') ||
+		    ((data[in] == '.') && (linelen == 1)) ||
+		    (!istext && ((data[in] == '\r') || (data[in] == '\n'))) ||
+		    ((data[in] == '\t' || data[in] == ' ') && (in + 1 == datalen)) ||
+		    ((data[in] < 33) &&
+		     (data[in] != '\r') && (data[in] != '\n') &&
+		     (quotetabs || ((data[in] != '\t') && (data[in] != ' ')))))
+		{
+			if ((linelen + 3) >= MAXLINESIZE) {
+				odata[out++] = '=';
+				if (crlf) odata[out++] = '\r';
+				odata[out++] = '\n';
+				linelen = 0;
+			}
+			odata[out++] = '=';
+			to_hex(data[in], &odata[out]);
+			out += 2;
+			in++;
+			linelen += 3;
+		}
+		else {
+			if (istext &&
+			    ((data[in] == '\n') ||
+			     ((in+1 < datalen) && (data[in] == '\r') &&
+			     (data[in+1] == '\n'))))
+			{
+				linelen = 0;
+				/* Protect against whitespace on end of line */
+				if (out && ((odata[out-1] == ' ') || (odata[out-1] == '\t'))) {
+					ch = odata[out-1];
+					odata[out-1] = '=';
+					to_hex(ch, &odata[out]);
+					out += 2;
+				}
+
+				if (crlf) odata[out++] = '\r';
+				odata[out++] = '\n';
+				in += (data[in] == '\r') ? 2 : 1;
+			}
+			else {
+				if ((in + 1 != datalen) &&
+				    (data[in+1] != '\n') &&
+				    (linelen + 1) >= MAXLINESIZE) {
+					odata[out++] = '=';
+					if (crlf) odata[out++] = '\r';
+					odata[out++] = '\n';
+					linelen = 0;
+				}
+				linelen++;
+				if (header && data[in] == ' ') {
+					odata[out++] = '_';
+					in++;
+				}
+				else {
+					odata[out++] = data[in++];
+				}
+			}
+		}
+	}
+	rv = PyString_FromStringAndSize((char *) odata, out);
+	free(odata);
+	return rv;
+}
 
 /* List of functions defined in the module */
 
@@ -990,6 +1282,10 @@ static struct PyMethodDef binascii_module_methods[] = {
 	 doc_rledecode_hqx},
 	{"crc_hqx",    binascii_crc_hqx,    METH_VARARGS, doc_crc_hqx},
 	{"crc32",      binascii_crc32,      METH_VARARGS, doc_crc32},
+	{"a2b_qp", (PyCFunction)binascii_a2b_qp, METH_VARARGS | METH_KEYWORDS, 
+	  doc_a2b_qp},
+	{"b2a_qp", (PyCFunction)binascii_b2a_qp, METH_VARARGS | METH_KEYWORDS, 
+          doc_b2a_qp},
 	{NULL, NULL}			     /* sentinel */
 };
author	Martin v. Löwis <martin@v.loewis.de>	2001-09-30 20:32:11 (GMT)
committer	Martin v. Löwis <martin@v.loewis.de>	2001-09-30 20:32:11 (GMT)
commit	16dc7f44b1116aab58897bc7e94cb972488206fc (patch)
tree	b1e90d9ca27e6dbdd0cd6b6d66fcb8a333a746a3 /Modules/binascii.c
parent	5f12d755a82312673c35e8224b2bde7ced159c52 (diff)
download	cpython-16dc7f44b1116aab58897bc7e94cb972488206fc.zip cpython-16dc7f44b1116aab58897bc7e94cb972488206fc.tar.gz cpython-16dc7f44b1116aab58897bc7e94cb972488206fc.tar.bz2