summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Sergey Fedoseev <fedoseev.sergey@gmail.com>, 2019-07-14 12:15:32 (GMT)
committer: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>, 2019-07-14 12:15:32 (GMT)
commit: 1c5e68e7145f0825f9b952389141edb9436eb43d (patch)
tree: bd9cffe7c14cd5ce3018681cd2795809d3062848
parent: 0d4f4352efecf1b044c88e234e71774fe04b7d6c (diff)
download: cpython-1c5e68e7145f0825f9b952389141edb9436eb43d.zip
cpython-1c5e68e7145f0825f9b952389141edb9436eb43d.tar.gz
cpython-1c5e68e7145f0825f9b952389141edb9436eb43d.tar.bz2
bpo-34749: Improved performance of binascii.a2b_base64(). (GH-9444)
https://bugs.python.org/issue34749
-rw-r--r-- Misc/NEWS.d/next/Library/2018-09-21-13-23-29.bpo-34749.B0k819.rst | 2
-rw-r--r-- Modules/binascii.c | 142
2 files changed, 58 insertions, 86 deletions
diff --git a/Misc/NEWS.d/next/Library/2018-09-21-13-23-29.bpo-34749.B0k819.rst b/Misc/NEWS.d/next/Library/2018-09-21-13-23-29.bpo-34749.B0k819.rst
new file mode 100644
index 0000000..5a5e5b4
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-09-21-13-23-29.bpo-34749.B0k819.rst
@@ -0,0 +1,2 @@
+:func:`binascii.a2b_base64` is now up to 2 times faster. Patch by Sergey
+Fedoseev.
diff --git a/Modules/binascii.c b/Modules/binascii.c
index 1c7dc35..94b0732 100644
--- a/Modules/binascii.c
+++ b/Modules/binascii.c
@@ -130,7 +130,7 @@ static const unsigned char table_a2b_hqx[256] = {
static const unsigned char table_b2a_hqx[] =
"!\"#$%&'()*+,-012345689@ABCDEFGHIJKLMNPQRSTUVXYZ[`abcdefhijklmpqr";
-static const char table_a2b_base64[] = {
+static const unsigned char table_a2b_base64[] = {
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
@@ -138,7 +138,16 @@ static const char table_a2b_base64[] = {
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14,
15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
-1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
- 41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
+ 41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1,
+
+ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
};
#define BASE64_PAD '='
@@ -413,32 +422,6 @@ binascii_b2a_uu_impl(PyObject *module, Py_buffer *data, int backtick)
return _PyBytesWriter_Finish(&writer, ascii_data);
}
-
-static int
-binascii_find_valid(const unsigned char *s, Py_ssize_t slen, int num)
-{
- /* Finds & returns the (num+1)th
- ** valid character for base64, or -1 if none.
- */
-
- int ret = -1;
- unsigned char c, b64val;
-
- while ((slen > 0) && (ret == -1)) {
- c = *s;
- b64val = table_a2b_base64[c & 0x7f];
- if ( ((c <= 0x7f) && (b64val != (unsigned char)-1)) ) {
- if (num == 0)
- ret = *s;
- num--;
- }
-
- s++;
- slen--;
- }
- return ret;
-}
-
/*[clinic input]
binascii.a2b_base64
@@ -452,88 +435,74 @@ static PyObject *
binascii_a2b_base64_impl(PyObject *module, Py_buffer *data)
/*[clinic end generated code: output=0628223f19fd3f9b input=5872acf6e1cac243]*/
{
- const unsigned char *ascii_data;
- unsigned char *bin_data;
- unsigned char *bin_data_start;
- int leftbits = 0;
- unsigned char this_ch;
- unsigned int leftchar = 0;
- Py_ssize_t ascii_len, bin_len;
- int quad_pos = 0;
- _PyBytesWriter writer;
- binascii_state *state;
-
- ascii_data = data->buf;
- ascii_len = data->len;
+ assert(data->len >= 0);
- assert(ascii_len >= 0);
-
- if (ascii_len > PY_SSIZE_T_MAX - 3)
- return PyErr_NoMemory();
-
- bin_len = ((ascii_len+3)/4)*3; /* Upper bound, corrected later */
-
- _PyBytesWriter_Init(&writer);
+ const unsigned char *ascii_data = data->buf;
+ size_t ascii_len = data->len;
/* Allocate the buffer */
- bin_data = _PyBytesWriter_Alloc(&writer, bin_len);
+ Py_ssize_t bin_len = ((ascii_len+3)/4)*3; /* Upper bound, corrected later */
+ _PyBytesWriter writer;
+ _PyBytesWriter_Init(&writer);
+ unsigned char *bin_data = _PyBytesWriter_Alloc(&writer, bin_len);
if (bin_data == NULL)
return NULL;
- bin_data_start = bin_data;
-
- for( ; ascii_len > 0; ascii_len--, ascii_data++) {
- this_ch = *ascii_data;
+ unsigned char *bin_data_start = bin_data;
- if (this_ch > 0x7f ||
- this_ch == '\r' || this_ch == '\n' || this_ch == ' ')
- continue;
+ int quad_pos = 0;
+ unsigned char leftchar = 0;
+ int pads = 0;
+ for (size_t i = 0; i < ascii_len; i++) {
+ unsigned char this_ch = ascii_data[i];
/* Check for pad sequences and ignore
** the invalid ones.
*/
if (this_ch == BASE64_PAD) {
- if ( (quad_pos < 2) ||
- ((quad_pos == 2) &&
- (binascii_find_valid(ascii_data, ascii_len, 1)
- != BASE64_PAD)) )
- {
- continue;
- }
- else {
+ if (quad_pos >= 2 && quad_pos + ++pads >= 4) {
/* A pad sequence means no more input.
** We've already interpreted the data
** from the quad at this point.
*/
- leftbits = 0;
- break;
+ goto done;
}
+ continue;
}
- this_ch = table_a2b_base64[*ascii_data];
- if ( this_ch == (unsigned char) -1 )
+ this_ch = table_a2b_base64[this_ch];
+ if (this_ch >= 64) {
continue;
+ }
+ pads = 0;
- /*
- ** Shift it in on the low end, and see if there's
- ** a byte ready for output.
- */
- quad_pos = (quad_pos + 1) & 0x03;
- leftchar = (leftchar << 6) | (this_ch);
- leftbits += 6;
-
- if ( leftbits >= 8 ) {
- leftbits -= 8;
- *bin_data++ = (leftchar >> leftbits) & 0xff;
- leftchar &= ((1 << leftbits) - 1);
+ switch (quad_pos) {
+ case 0:
+ quad_pos = 1;
+ leftchar = this_ch;
+ break;
+ case 1:
+ quad_pos = 2;
+ *bin_data++ = (leftchar << 2) | (this_ch >> 4);
+ leftchar = this_ch & 0x0f;
+ break;
+ case 2:
+ quad_pos = 3;
+ *bin_data++ = (leftchar << 4) | (this_ch >> 2);
+ leftchar = this_ch & 0x03;
+ break;
+ case 3:
+ quad_pos = 0;
+ *bin_data++ = (leftchar << 6) | (this_ch);
+ leftchar = 0;
+ break;
}
}
- if (leftbits != 0) {
- state = PyModule_GetState(module);
+ if (quad_pos != 0) {
+ binascii_state *state = PyModule_GetState(module);
if (state == NULL) {
- return NULL;
- }
- if (leftbits == 6) {
+ /* error already set, from PyModule_GetState */
+ } else if (quad_pos == 1) {
/*
** There is exactly one extra valid, non-padding, base64 character.
** This is an invalid length, as there is no possible input that
@@ -551,6 +520,7 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data)
return NULL;
}
+done:
return _PyBytesWriter_Finish(&writer, bin_data);
}