summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: INADA Naoki <methane@users.noreply.github.com> 2018-01-28 00:59:12 (GMT)
committer: GitHub <noreply@github.com> 2018-01-28 00:59:12 (GMT)
commit: bea57060c863d0c3474c79350bd9c557f2ff0e7c (patch)
tree: aacefc63e6f0fecbc24fb7767ce21ecc293c8b16
parent: ea8fc52e75363276db23c6a8d7a689f79efce4f9 (diff)
download: cpython-bea57060c863d0c3474c79350bd9c557f2ff0e7c.zip
cpython-bea57060c863d0c3474c79350bd9c557f2ff0e7c.tar.gz
cpython-bea57060c863d0c3474c79350bd9c557f2ff0e7c.tar.bz2
bpo-32677: Optimize str.isascii() (GH-5356)
-rw-r--r-- Lib/test/string_tests.py 7
-rw-r--r-- Objects/bytes_methods.c 40
2 files changed, 43 insertions, 4 deletions
diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py
index 4be1d21..561b09a 100644
--- a/Lib/test/string_tests.py
+++ b/Lib/test/string_tests.py
@@ -916,6 +916,13 @@ class BaseTest:
self.checkequal(True, '\x00\x7f', 'isascii')
self.checkequal(False, '\x80', 'isascii')
self.checkequal(False, '\xe9', 'isascii')
+ # bytes.isascii() and bytearray.isascii() have an optimization which
+ # checks 4 or 8 bytes at once, so check some alignments.
+ for p in range(8):
+ self.checkequal(True, ' '*p + '\x7f', 'isascii')
+ self.checkequal(False, ' '*p + '\x80', 'isascii')
+ self.checkequal(True, ' '*p + '\x7f' + ' '*8, 'isascii')
+ self.checkequal(False, ' '*p + '\x80' + ' '*8, 'isascii')
def test_isdigit(self):
self.checkequal(False, '', 'isdigit')
diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c
index 149650f..07842f7 100644
--- a/Objects/bytes_methods.c
+++ b/Objects/bytes_methods.c
@@ -98,19 +98,51 @@ PyDoc_STRVAR_shared(_Py_isascii__doc__,
Return True if B is empty or all characters in B are ASCII,\n\
False otherwise.");
+// Optimization is copied from ascii_decode in unicodeobject.c
+/* Mask to quickly check whether a C 'long' contains a
+ non-ASCII byte (any byte with its high bit set). */
+#if (SIZEOF_LONG == 8)
+# define ASCII_CHAR_MASK 0x8080808080808080UL
+#elif (SIZEOF_LONG == 4)
+# define ASCII_CHAR_MASK 0x80808080UL
+#else
+# error C 'long' size should be either 4 or 8!
+#endif
+
PyObject*
_Py_bytes_isascii(const char *cptr, Py_ssize_t len)
{
- const unsigned char *p = (unsigned char *) cptr;
- const unsigned char *e = p + len;
- for (; p < e; p++) {
- if (*p >= 128) {
+ const char *p = cptr;
+ const char *end = p + len;
+ const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
+
+ while (p < end) {
+ /* Fast path; see STRINGLIB(utf8_decode) in stringlib/codecs.h
+ for an explanation. */
+ if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
+ /* Help register allocation */
+ const char *_p = p;
+ while (_p < aligned_end) {
+ unsigned long value = *(unsigned long *) _p;
+ if (value & ASCII_CHAR_MASK) {
+ Py_RETURN_FALSE;
+ }
+ _p += SIZEOF_LONG;
+ }
+ p = _p;
+ if (_p == end)
+ break;
+ }
+ if ((unsigned char)*p & 0x80) {
Py_RETURN_FALSE;
}
+ p++;
}
Py_RETURN_TRUE;
}
+#undef ASCII_CHAR_MASK
+
PyDoc_STRVAR_shared(_Py_isdigit__doc__,
"B.isdigit() -> bool\n\