From 7108bdf27c7a460cf83c4a01dea54ae4591d8aea Mon Sep 17 00:00:00 2001 From: goldsteinn <35538541+goldsteinn@users.noreply.github.com> Date: Mon, 23 May 2022 20:45:31 -0500 Subject: gh-93033: Use wmemchr in stringlib (GH-93034) Generally comparable perf for the "good" case where memchr doesn't return any collisions (false matches on lower byte) but clearly faster with collisions. --- .../2022-05-22-23-46-18.gh-issue-93033.wZfiL-.rst | 1 + Objects/bytearrayobject.c | 1 + Objects/bytes_methods.c | 1 + Objects/stringlib/asciilib.h | 1 + Objects/stringlib/fastsearch.h | 34 ++++++++++++++-------- Objects/stringlib/replace.h | 4 +-- Objects/stringlib/stringdefs.h | 1 + Objects/stringlib/ucs1lib.h | 1 + Objects/stringlib/ucs2lib.h | 4 +++ Objects/stringlib/ucs4lib.h | 4 +++ Objects/stringlib/undef.h | 1 + 11 files changed, 39 insertions(+), 14 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-05-22-23-46-18.gh-issue-93033.wZfiL-.rst diff --git a/Misc/NEWS.d/next/Library/2022-05-22-23-46-18.gh-issue-93033.wZfiL-.rst b/Misc/NEWS.d/next/Library/2022-05-22-23-46-18.gh-issue-93033.wZfiL-.rst new file mode 100644 index 0000000..3cee530 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-05-22-23-46-18.gh-issue-93033.wZfiL-.rst @@ -0,0 +1 @@ +Search in some strings (platform dependent i.e [U+0xFFFF, U+0x0100] on Windows or [U+0xFFFFFFFF, U+0x00010000] on Linux 64-bit) are now up to 10 times faster. diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c index b9436d9..d017962 100644 --- a/Objects/bytearrayobject.c +++ b/Objects/bytearrayobject.c @@ -1096,6 +1096,7 @@ bytearray_dealloc(PyByteArrayObject *self) #define STRINGLIB_ISSPACE Py_ISSPACE #define STRINGLIB_ISLINEBREAK(x) ((x == '\n') || (x == '\r')) #define STRINGLIB_CHECK_EXACT PyByteArray_CheckExact +#define STRINGLIB_FAST_MEMCHR memchr #define STRINGLIB_MUTABLE 1 #include "stringlib/fastsearch.h" diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 994fb8a..6b81663 100644 --- a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -431,6 +431,7 @@ _Py_bytes_maketrans(Py_buffer *frm, Py_buffer *to) #define STRINGLIB(F) stringlib_##F #define STRINGLIB_CHAR char #define STRINGLIB_SIZEOF_CHAR 1 +#define STRINGLIB_FAST_MEMCHR memchr #include "stringlib/fastsearch.h" #include "stringlib/count.h" diff --git a/Objects/stringlib/asciilib.h b/Objects/stringlib/asciilib.h index eebe888..b3016bf 100644 --- a/Objects/stringlib/asciilib.h +++ b/Objects/stringlib/asciilib.h @@ -21,6 +21,7 @@ #define STRINGLIB_CHECK PyUnicode_Check #define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact #define STRINGLIB_MUTABLE 0 +#define STRINGLIB_FAST_MEMCHR memchr #define STRINGLIB_TOSTR PyObject_Str #define STRINGLIB_TOASCII PyObject_ASCII diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index b91082b..a838c66 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -39,7 +39,7 @@ #define STRINGLIB_BLOOM(mask, ch) \ ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) -#if STRINGLIB_SIZEOF_CHAR == 1 +#ifdef STRINGLIB_FAST_MEMCHR # define MEMCHR_CUT_OFF 15 #else # define MEMCHR_CUT_OFF 40 @@ -53,8 +53,8 @@ STRINGLIB(find_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) p = s; e = s + n; if (n > MEMCHR_CUT_OFF) { -#if STRINGLIB_SIZEOF_CHAR == 1 - p = memchr(s, ch, n); +#ifdef STRINGLIB_FAST_MEMCHR + p = STRINGLIB_FAST_MEMCHR(s, ch, n); if (p != NULL) return (p - s); return -1; @@ -102,16 +102,26 @@ STRINGLIB(find_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) return -1; } +#undef MEMCHR_CUT_OFF + +#if STRINGLIB_SIZEOF_CHAR == 1 +# define MEMRCHR_CUT_OFF 15 +#else +# define MEMRCHR_CUT_OFF 40 +#endif + + Py_LOCAL_INLINE(Py_ssize_t) STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) { const STRINGLIB_CHAR *p; #ifdef HAVE_MEMRCHR - /* memrchr() is a GNU extension, available since glibc 2.1.91. - it doesn't seem as optimized as memchr(), but is still quite - faster than our hand-written loop below */ + /* memrchr() is a GNU extension, available since glibc 2.1.91. it + doesn't seem as optimized as memchr(), but is still quite + faster than our hand-written loop below. There is no wmemrchr + for 4-byte chars. */ - if (n > MEMCHR_CUT_OFF) { + if (n > MEMRCHR_CUT_OFF) { #if STRINGLIB_SIZEOF_CHAR == 1 p = memrchr(s, ch, n); if (p != NULL) @@ -139,11 +149,11 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) if (*p == ch) return n; /* False positive */ - if (n1 - n > MEMCHR_CUT_OFF) + if (n1 - n > MEMRCHR_CUT_OFF) continue; - if (n <= MEMCHR_CUT_OFF) + if (n <= MEMRCHR_CUT_OFF) break; - s1 = p - MEMCHR_CUT_OFF; + s1 = p - MEMRCHR_CUT_OFF; while (p > s1) { p--; if (*p == ch) @@ -151,7 +161,7 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) } n = p - s; } - while (n > MEMCHR_CUT_OFF); + while (n > MEMRCHR_CUT_OFF); } #endif } @@ -165,7 +175,7 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch) return -1; } -#undef MEMCHR_CUT_OFF +#undef MEMRCHR_CUT_OFF /* Change to a 1 to see logging comments walk through the algorithm. */ #if 0 && STRINGLIB_SIZEOF_CHAR == 1 diff --git a/Objects/stringlib/replace.h b/Objects/stringlib/replace.h index ef318ed..123c9f8 100644 --- a/Objects/stringlib/replace.h +++ b/Objects/stringlib/replace.h @@ -29,9 +29,9 @@ STRINGLIB(replace_1char_inplace)(STRINGLIB_CHAR* s, STRINGLIB_CHAR* end, if (!--attempts) { /* if u1 was not found for attempts iterations, use FASTSEARCH() or memchr() */ -#if STRINGLIB_SIZEOF_CHAR == 1 +#ifdef STRINGLIB_FAST_MEMCHR s++; - s = memchr(s, u1, end - s); + s = STRINGLIB_FAST_MEMCHR(s, u1, end - s); if (s == NULL) return; #else diff --git a/Objects/stringlib/stringdefs.h b/Objects/stringlib/stringdefs.h index 88641b2..484b98b 100644 --- a/Objects/stringlib/stringdefs.h +++ b/Objects/stringlib/stringdefs.h @@ -24,4 +24,5 @@ #define STRINGLIB_CHECK_EXACT PyBytes_CheckExact #define STRINGLIB_TOSTR PyObject_Str #define STRINGLIB_TOASCII PyObject_Repr +#define STRINGLIB_FAST_MEMCHR memchr #endif /* !STRINGLIB_STRINGDEFS_H */ diff --git a/Objects/stringlib/ucs1lib.h b/Objects/stringlib/ucs1lib.h index 026ab11..1b9b65e 100644 --- a/Objects/stringlib/ucs1lib.h +++ b/Objects/stringlib/ucs1lib.h @@ -20,6 +20,7 @@ #define STRINGLIB_NEW _PyUnicode_FromUCS1 #define STRINGLIB_CHECK PyUnicode_Check #define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact +#define STRINGLIB_FAST_MEMCHR memchr #define STRINGLIB_MUTABLE 0 #define STRINGLIB_TOSTR PyObject_Str diff --git a/Objects/stringlib/ucs2lib.h b/Objects/stringlib/ucs2lib.h index 75f11bc..4b49bbb 100644 --- a/Objects/stringlib/ucs2lib.h +++ b/Objects/stringlib/ucs2lib.h @@ -21,6 +21,10 @@ #define STRINGLIB_CHECK PyUnicode_Check #define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact #define STRINGLIB_MUTABLE 0 +#if SIZEOF_WCHAR_T == 2 +#define STRINGLIB_FAST_MEMCHR(s, c, n) \ + (Py_UCS2 *)wmemchr((const wchar_t *)(s), c, n) +#endif #define STRINGLIB_TOSTR PyObject_Str #define STRINGLIB_TOASCII PyObject_ASCII diff --git a/Objects/stringlib/ucs4lib.h b/Objects/stringlib/ucs4lib.h index 57344f2..def4ca5 100644 --- a/Objects/stringlib/ucs4lib.h +++ b/Objects/stringlib/ucs4lib.h @@ -21,6 +21,10 @@ #define STRINGLIB_CHECK PyUnicode_Check #define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact #define STRINGLIB_MUTABLE 0 +#if SIZEOF_WCHAR_T == 4 +#define STRINGLIB_FAST_MEMCHR(s, c, n) \ + (Py_UCS4 *)wmemchr((const wchar_t *)(s), c, n) +#endif #define STRINGLIB_TOSTR PyObject_Str #define STRINGLIB_TOASCII PyObject_ASCII diff --git a/Objects/stringlib/undef.h b/Objects/stringlib/undef.h index bf32298..cc873a2 100644 --- a/Objects/stringlib/undef.h +++ b/Objects/stringlib/undef.h @@ -8,3 +8,4 @@ #undef STRINGLIB_NEW #undef STRINGLIB_IS_UNICODE #undef STRINGLIB_MUTABLE +#undef STRINGLIB_FAST_MEMCHR -- cgit v0.12