diff options
author | Inada Naoki <songofacandy@gmail.com> | 2024-12-13 16:21:46 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-12-13 16:21:46 (GMT) |
commit | 5dd775bed086909722ec7014a7c4f77a35f74a80 (patch) | |
tree | 3d897cbbbcb64bf35a93b2c9a8b519311e6b1d44 /Objects | |
parent | 8bc18182a7c28f86265c9d82bd0338137480921c (diff) | |
download | cpython-5dd775bed086909722ec7014a7c4f77a35f74a80.zip cpython-5dd775bed086909722ec7014a7c4f77a35f74a80.tar.gz cpython-5dd775bed086909722ec7014a7c4f77a35f74a80.tar.bz2 |
gh-126024: unicodeobject: optimize find_first_nonascii (GH-127790)
Remove 1 branch.
Diffstat (limited to 'Objects')
-rw-r--r-- | Objects/unicodeobject.c | 23 |
1 file changed, 14 insertions, 9 deletions
```diff
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 33c4747..b7aeb06 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -5077,21 +5077,24 @@ load_unaligned(const unsigned char *p, size_t size)
 static Py_ssize_t
 find_first_nonascii(const unsigned char *start, const unsigned char *end)
 {
+    // The search is done in `size_t` chunks.
+    // The start and end might not be aligned at `size_t` boundaries,
+    // so they're handled specially.
+
     const unsigned char *p = start;
 
     if (end - start >= SIZEOF_SIZE_T) {
-        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
+        // Avoid unaligned read.
 #if PY_LITTLE_ENDIAN && HAVE_CTZ
-        if (p < p2) {
-            size_t u;
-            memcpy(&u, p, sizeof(size_t));
-            u &= ASCII_CHAR_MASK;
-            if (u) {
-                return (ctz(u) - 7) / 8;
-            }
-            p = p2;
+        size_t u;
+        memcpy(&u, p, sizeof(size_t));
+        u &= ASCII_CHAR_MASK;
+        if (u) {
+            return (ctz(u) - 7) / 8;
         }
+        p = _Py_ALIGN_DOWN(p + SIZEOF_SIZE_T, SIZEOF_SIZE_T);
 #else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
+        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
         while (p < p2) {
             if (*p & 0x80) {
                 return p - start;
@@ -5099,6 +5102,7 @@ find_first_nonascii(const unsigned char *start, const unsigned char *end)
             p++;
         }
 #endif
+
         const unsigned char *e = end - SIZEOF_SIZE_T;
         while (p <= e) {
             size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
@@ -5115,6 +5119,7 @@ find_first_nonascii(const unsigned char *start, const unsigned char *end)
         }
     }
 #if PY_LITTLE_ENDIAN && HAVE_CTZ
+    assert((end - p) < SIZEOF_SIZE_T);
     // we can not use *(const size_t*)p to avoid buffer overrun.
     size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
     if (u) {
```