diff options
author | INADA Naoki <methane@users.noreply.github.com> | 2018-01-28 00:59:12 (GMT) |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-01-28 00:59:12 (GMT) |
commit | bea57060c863d0c3474c79350bd9c557f2ff0e7c (patch) | |
tree | aacefc63e6f0fecbc24fb7767ce21ecc293c8b16 | |
parent | ea8fc52e75363276db23c6a8d7a689f79efce4f9 (diff) | |
download | cpython-bea57060c863d0c3474c79350bd9c557f2ff0e7c.zip cpython-bea57060c863d0c3474c79350bd9c557f2ff0e7c.tar.gz cpython-bea57060c863d0c3474c79350bd9c557f2ff0e7c.tar.bz2 |
bpo-32677: Optimize str.isascii() (GH-5356)
-rw-r--r-- | Lib/test/string_tests.py | 7 | ||||
-rw-r--r-- | Objects/bytes_methods.c | 40 |
2 files changed, 43 insertions, 4 deletions
```diff
diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py
index 4be1d21..561b09a 100644
--- a/Lib/test/string_tests.py
+++ b/Lib/test/string_tests.py
@@ -916,6 +916,13 @@ class BaseTest:
         self.checkequal(True, '\x00\x7f', 'isascii')
         self.checkequal(False, '\x80', 'isascii')
         self.checkequal(False, '\xe9', 'isascii')
+        # bytes.isascii() and bytearray.isascii() has optimization which
+        # check 4 or 8 bytes at once. So check some alignments.
+        for p in range(8):
+            self.checkequal(True, ' '*p + '\x7f', 'isascii')
+            self.checkequal(False, ' '*p + '\x80', 'isascii')
+            self.checkequal(True, ' '*p + '\x7f' + ' '*8, 'isascii')
+            self.checkequal(False, ' '*p + '\x80' + ' '*8, 'isascii')

     def test_isdigit(self):
         self.checkequal(False, '', 'isdigit')
diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c
index 149650f..07842f7 100644
--- a/Objects/bytes_methods.c
+++ b/Objects/bytes_methods.c
@@ -98,19 +98,51 @@ PyDoc_STRVAR_shared(_Py_isascii__doc__,
 Return True if B is empty or all characters in B are ASCII,\n\
 False otherwise.");

+// Optimization is copied from ascii_decode in unicodeobject.c
+/* Mask to quickly check whether a C 'long' contains a
+   non-ASCII, UTF8-encoded char. */
+#if (SIZEOF_LONG == 8)
+# define ASCII_CHAR_MASK 0x8080808080808080UL
+#elif (SIZEOF_LONG == 4)
+# define ASCII_CHAR_MASK 0x80808080UL
+#else
+# error C 'long' size should be either 4 or 8!
+#endif
+
 PyObject*
 _Py_bytes_isascii(const char *cptr, Py_ssize_t len)
 {
-    const unsigned char *p = (unsigned char *) cptr;
-    const unsigned char *e = p + len;
-    for (; p < e; p++) {
-        if (*p >= 128) {
+    const char *p = cptr;
+    const char *end = p + len;
+    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
+
+    while (p < end) {
+        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
+           for an explanation. */
+        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
+            /* Help allocation */
+            const char *_p = p;
+            while (_p < aligned_end) {
+                unsigned long value = *(unsigned long *) _p;
+                if (value & ASCII_CHAR_MASK) {
+                    Py_RETURN_FALSE;
+                }
+                _p += SIZEOF_LONG;
+            }
+            p = _p;
+            if (_p == end)
+                break;
+        }
+        if ((unsigned char)*p & 0x80) {
             Py_RETURN_FALSE;
         }
+        p++;
     }
     Py_RETURN_TRUE;
 }

+#undef ASCII_CHAR_MASK
+
 PyDoc_STRVAR_shared(_Py_isdigit__doc__,
 "B.isdigit() -> bool\n\
```