summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVictor Stinner <victor.stinner@gmail.com>2015-10-14 09:25:33 (GMT)
committerVictor Stinner <victor.stinner@gmail.com>2015-10-14 09:25:33 (GMT)
commit2bf8993db966256d564d87865ceddf0e33c02500 (patch)
tree8b172dcec9ee6d9584c75ecc933b418b5210963b
parentebcf9edc05c03af38c01d8aeb05494b68169756c (diff)
downloadcpython-2bf8993db966256d564d87865ceddf0e33c02500.zip
cpython-2bf8993db966256d564d87865ceddf0e33c02500.tar.gz
cpython-2bf8993db966256d564d87865ceddf0e33c02500.tar.bz2
Optimize bytes.fromhex() and bytearray.fromhex()
Issue #25401: Optimize bytes.fromhex() and bytearray.fromhex(): they are now between 2x and 3.5x faster. Changes: * Use a fast-path working on a char* string for ASCII string * Use a slow-path for non-ASCII string * Replace slow hex_digit_to_int() function with a O(1) lookup in _PyLong_DigitValue precomputed table * Use _PyBytesWriter API to handle the buffer * Add unit tests to check the error position in error messages
-rw-r--r--Doc/whatsnew/3.6.rst3
-rw-r--r--Include/bytesobject.h3
-rw-r--r--Include/longobject.h2
-rw-r--r--Lib/test/test_bytes.py14
-rw-r--r--Misc/NEWS3
-rw-r--r--Objects/bytearrayobject.c43
-rw-r--r--Objects/bytesobject.c128
7 files changed, 101 insertions, 95 deletions
diff --git a/Doc/whatsnew/3.6.rst b/Doc/whatsnew/3.6.rst
index d1e5045..edacea1 100644
--- a/Doc/whatsnew/3.6.rst
+++ b/Doc/whatsnew/3.6.rst
@@ -161,6 +161,9 @@ Optimizations
* ``bytearray % args`` is now between 2.5 and 5 times faster. (Contributed by
Victor Stinner in :issue:`25399`).
+* Optimize :meth:`bytes.fromhex` and :meth:`bytearray.fromhex`: they are now
+ between 2x and 3.5x faster. (Contributed by Victor Stinner in :issue:`25401`).
+
Build and C API Changes
=======================
diff --git a/Include/bytesobject.h b/Include/bytesobject.h
index b5b37ef..4046c1c 100644
--- a/Include/bytesobject.h
+++ b/Include/bytesobject.h
@@ -67,6 +67,9 @@ PyAPI_FUNC(PyObject*) _PyBytes_FormatEx(
Py_ssize_t format_len,
PyObject *args,
int use_bytearray);
+PyAPI_FUNC(PyObject*) _PyBytes_FromHex(
+ PyObject *string,
+ int use_bytearray);
#endif
PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t,
const char *, Py_ssize_t,
diff --git a/Include/longobject.h b/Include/longobject.h
index ab92495..9574f05 100644
--- a/Include/longobject.h
+++ b/Include/longobject.h
@@ -65,7 +65,7 @@ PyAPI_FUNC(PyObject *) PyLong_GetInfo(void);
# error "void* different in size from int, long and long long"
#endif /* SIZEOF_VOID_P */
-/* Used by Python/mystrtoul.c. */
+/* Used by Python/mystrtoul.c and _PyBytes_FromHex(). */
#ifndef Py_LIMITED_API
PyAPI_DATA(unsigned char) _PyLong_DigitValue[256];
#endif
diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py
index 947d959..0fe33b5 100644
--- a/Lib/test/test_bytes.py
+++ b/Lib/test/test_bytes.py
@@ -301,6 +301,20 @@ class BaseBytesTest:
self.assertRaises(ValueError, self.type2test.fromhex, '\x00')
self.assertRaises(ValueError, self.type2test.fromhex, '12 \x00 34')
+ for data, pos in (
+ # invalid first hexadecimal character
+ ('12 x4 56', 3),
+ # invalid second hexadecimal character
+ ('12 3x 56', 4),
+ # two invalid hexadecimal characters
+ ('12 xy 56', 3),
+ # test non-ASCII string
+ ('12 3\xff 56', 4),
+ ):
+ with self.assertRaises(ValueError) as cm:
+ self.type2test.fromhex(data)
+ self.assertIn('at position %s' % pos, str(cm.exception))
+
def test_hex(self):
self.assertRaises(TypeError, self.type2test.hex)
self.assertRaises(TypeError, self.type2test.hex, 1)
diff --git a/Misc/NEWS b/Misc/NEWS
index 97e5e77..312cde0 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,9 @@ Release date: XXXX-XX-XX
Core and Builtins
-----------------
+- Issue #25401: Optimize bytes.fromhex() and bytearray.fromhex(): they are now
+ between 2x and 3.5x faster.
+
- Issue #25399: Optimize bytearray % args using the new private _PyBytesWriter
API. Formatting is now between 2.5 and 5 times faster.
diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c
index e535bce..b270fcc 100644
--- a/Objects/bytearrayobject.c
+++ b/Objects/bytearrayobject.c
@@ -2823,48 +2823,7 @@ static PyObject *
bytearray_fromhex_impl(PyObject*cls, PyObject *string)
/*[clinic end generated code: output=df3da60129b3700c input=907bbd2d34d9367a]*/
{
- PyObject *newbytes;
- char *buf;
- Py_ssize_t hexlen, byteslen, i, j;
- int top, bot;
- void *data;
- unsigned int kind;
-
- assert(PyUnicode_Check(string));
- if (PyUnicode_READY(string))
- return NULL;
- kind = PyUnicode_KIND(string);
- data = PyUnicode_DATA(string);
- hexlen = PyUnicode_GET_LENGTH(string);
-
- byteslen = hexlen/2; /* This overestimates if there are spaces */
- newbytes = PyByteArray_FromStringAndSize(NULL, byteslen);
- if (!newbytes)
- return NULL;
- buf = PyByteArray_AS_STRING(newbytes);
- for (i = j = 0; i < hexlen; i += 2) {
- /* skip over spaces in the input */
- while (PyUnicode_READ(kind, data, i) == ' ')
- i++;
- if (i >= hexlen)
- break;
- top = hex_digit_to_int(PyUnicode_READ(kind, data, i));
- bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1));
- if (top == -1 || bot == -1) {
- PyErr_Format(PyExc_ValueError,
- "non-hexadecimal number found in "
- "fromhex() arg at position %zd", i);
- goto error;
- }
- buf[j++] = (top << 4) + bot;
- }
- if (PyByteArray_Resize(newbytes, j) < 0)
- goto error;
- return newbytes;
-
- error:
- Py_DECREF(newbytes);
- return NULL;
+ return _PyBytes_FromHex(string, 1);
}
PyDoc_STRVAR(hex__doc__,
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c
index 20b11fb..2d4cf4b 100644
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
@@ -30,6 +30,10 @@ static PyBytesObject *nullstring;
*/
#define PyBytesObject_SIZE (offsetof(PyBytesObject, ob_sval) + 1)
+/* Forward declaration */
+Py_LOCAL_INLINE(Py_ssize_t) _PyBytesWriter_GetSize(_PyBytesWriter *writer,
+ char *str);
+
/*
For PyBytes_FromString(), the parameter `str' points to a null-terminated
string containing exactly `size' bytes.
@@ -3078,22 +3082,6 @@ bytes_splitlines_impl(PyBytesObject*self, int keepends)
);
}
-static int
-hex_digit_to_int(Py_UCS4 c)
-{
- if (c >= 128)
- return -1;
- if (Py_ISDIGIT(c))
- return c - '0';
- else {
- if (Py_ISUPPER(c))
- c = Py_TOLOWER(c);
- if (c >= 'a' && c <= 'f')
- return c - 'a' + 10;
- }
- return -1;
-}
-
/*[clinic input]
@classmethod
bytes.fromhex
@@ -3111,47 +3099,83 @@ static PyObject *
bytes_fromhex_impl(PyTypeObject *type, PyObject *string)
/*[clinic end generated code: output=0973acc63661bb2e input=bf4d1c361670acd3]*/
{
- PyObject *newstring;
+ return _PyBytes_FromHex(string, 0);
+}
+
+PyObject*
+_PyBytes_FromHex(PyObject *string, int use_bytearray)
+{
char *buf;
- Py_ssize_t hexlen, byteslen, i, j;
- int top, bot;
- void *data;
- unsigned int kind;
+ Py_ssize_t hexlen, invalid_char;
+ unsigned int top, bot;
+ Py_UCS1 *str, *end;
+ _PyBytesWriter writer;
+
+ _PyBytesWriter_Init(&writer);
+ writer.use_bytearray = use_bytearray;
assert(PyUnicode_Check(string));
if (PyUnicode_READY(string))
return NULL;
- kind = PyUnicode_KIND(string);
- data = PyUnicode_DATA(string);
hexlen = PyUnicode_GET_LENGTH(string);
- byteslen = hexlen/2; /* This overestimates if there are spaces */
- newstring = PyBytes_FromStringAndSize(NULL, byteslen);
- if (!newstring)
+ if (!PyUnicode_IS_ASCII(string)) {
+ void *data = PyUnicode_DATA(string);
+ unsigned int kind = PyUnicode_KIND(string);
+ Py_ssize_t i;
+
+ /* search for the first non-ASCII character */
+ for (i = 0; i < hexlen; i++) {
+ if (PyUnicode_READ(kind, data, i) >= 128)
+ break;
+ }
+ invalid_char = i;
+ goto error;
+ }
+
+ assert(PyUnicode_KIND(string) == PyUnicode_1BYTE_KIND);
+ str = PyUnicode_1BYTE_DATA(string);
+
+ /* This overestimates if there are spaces */
+ buf = _PyBytesWriter_Alloc(&writer, hexlen / 2);
+ if (buf == NULL)
return NULL;
- buf = PyBytes_AS_STRING(newstring);
- for (i = j = 0; i < hexlen; i += 2) {
+
+ end = str + hexlen;
+ while (str < end) {
/* skip over spaces in the input */
- while (PyUnicode_READ(kind, data, i) == ' ')
- i++;
- if (i >= hexlen)
- break;
- top = hex_digit_to_int(PyUnicode_READ(kind, data, i));
- bot = hex_digit_to_int(PyUnicode_READ(kind, data, i+1));
- if (top == -1 || bot == -1) {
- PyErr_Format(PyExc_ValueError,
- "non-hexadecimal number found in "
- "fromhex() arg at position %zd", i);
+ if (*str == ' ') {
+ do {
+ str++;
+ } while (*str == ' ');
+ if (str >= end)
+ break;
+ }
+
+ top = _PyLong_DigitValue[*str];
+ if (top >= 16) {
+ invalid_char = str - PyUnicode_1BYTE_DATA(string);
goto error;
}
- buf[j++] = (top << 4) + bot;
+ str++;
+
+ bot = _PyLong_DigitValue[*str];
+ if (bot >= 16) {
+ invalid_char = str - PyUnicode_1BYTE_DATA(string);
+ goto error;
+ }
+ str++;
+
+ *buf++ = (unsigned char)((top << 4) + bot);
}
- if (j != byteslen && _PyBytes_Resize(&newstring, j) < 0)
- goto error;
- return newstring;
+
+ return _PyBytesWriter_Finish(&writer, buf);
error:
- Py_XDECREF(newstring);
+ PyErr_Format(PyExc_ValueError,
+ "non-hexadecimal number found in "
+ "fromhex() arg at position %zd", invalid_char);
+ _PyBytesWriter_Dealloc(&writer);
return NULL;
}
@@ -3888,7 +3912,7 @@ _PyBytesWriter_AsString(_PyBytesWriter *writer)
}
Py_LOCAL_INLINE(Py_ssize_t)
-_PyBytesWriter_GetPos(_PyBytesWriter *writer, char *str)
+_PyBytesWriter_GetSize(_PyBytesWriter *writer, char *str)
{
char *start = _PyBytesWriter_AsString(writer);
assert(str != NULL);
@@ -3963,7 +3987,7 @@ _PyBytesWriter_Prepare(_PyBytesWriter *writer, void *str, Py_ssize_t size)
allocated += allocated / OVERALLOCATE_FACTOR;
}
- pos = _PyBytesWriter_GetPos(writer, str);
+ pos = _PyBytesWriter_GetSize(writer, str);
if (!writer->use_small_buffer) {
if (writer->use_bytearray) {
if (PyByteArray_Resize(writer->buffer, allocated))
@@ -4041,33 +4065,33 @@ _PyBytesWriter_Alloc(_PyBytesWriter *writer, Py_ssize_t size)
PyObject *
_PyBytesWriter_Finish(_PyBytesWriter *writer, void *str)
{
- Py_ssize_t pos;
+ Py_ssize_t size;
PyObject *result;
_PyBytesWriter_CheckConsistency(writer, str);
- pos = _PyBytesWriter_GetPos(writer, str);
- if (pos == 0 && !writer->use_bytearray) {
+ size = _PyBytesWriter_GetSize(writer, str);
+ if (size == 0 && !writer->use_bytearray) {
Py_CLEAR(writer->buffer);
/* Get the empty byte string singleton */
result = PyBytes_FromStringAndSize(NULL, 0);
}
else if (writer->use_small_buffer) {
- result = PyBytes_FromStringAndSize(writer->small_buffer, pos);
+ result = PyBytes_FromStringAndSize(writer->small_buffer, size);
}
else {
result = writer->buffer;
writer->buffer = NULL;
- if (pos != writer->allocated) {
+ if (size != writer->allocated) {
if (writer->use_bytearray) {
- if (PyByteArray_Resize(result, pos)) {
+ if (PyByteArray_Resize(result, size)) {
Py_DECREF(result);
return NULL;
}
}
else {
- if (_PyBytes_Resize(&result, pos)) {
+ if (_PyBytes_Resize(&result, size)) {
assert(result == NULL);
return NULL;
}