summaryrefslogtreecommitdiffstats
path: root/Objects/stringlib
diff options
context:
space:
mode:
authord.grigonis <dgrigonis@users.noreply.github.com>2024-06-04 07:44:49 (GMT)
committerGitHub <noreply@github.com>2024-06-04 07:44:49 (GMT)
commita8f1152b70d707340b394689cd09aa0831da3601 (patch)
treec7daf81a20dfc5b853332c4c7c98f4f9c58091dc /Objects/stringlib
parent8d63c8d47b9edd8ac2f0b395b2fa0ae5f571252d (diff)
downloadcpython-a8f1152b70d707340b394689cd09aa0831da3601.zip
cpython-a8f1152b70d707340b394689cd09aa0831da3601.tar.gz
cpython-a8f1152b70d707340b394689cd09aa0831da3601.tar.bz2
gh-119879: str.find(): Utilize last character gap for two-way periodic needles (#119880)
Diffstat (limited to 'Objects/stringlib')
-rw-r--r--Objects/stringlib/fastsearch.h63
1 files changed, 35 insertions, 28 deletions
diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h
index 257b7bd..309ed15 100644
--- a/Objects/stringlib/fastsearch.h
+++ b/Objects/stringlib/fastsearch.h
@@ -256,7 +256,7 @@ STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle,
The local period of the cut is the minimal length of a string w
such that (left endswith w or w endswith left)
- and (right startswith w or w startswith left).
+ and (right startswith w or w startswith right).
The Critical Factorization Theorem says that this maximal local
period is the global period of the string.
@@ -337,21 +337,20 @@ STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle,
if (p->is_periodic) {
assert(p->cut <= len_needle/2);
assert(p->cut < p->period);
- p->gap = 0; // unused
}
else {
// A lower bound on the period
p->period = Py_MAX(p->cut, len_needle - p->cut) + 1;
- // The gap between the last character and the previous
- // occurrence of an equivalent character (modulo TABLE_SIZE)
- p->gap = len_needle;
- STRINGLIB_CHAR last = needle[len_needle - 1] & TABLE_MASK;
- for (Py_ssize_t i = len_needle - 2; i >= 0; i--) {
- STRINGLIB_CHAR x = needle[i] & TABLE_MASK;
- if (x == last) {
- p->gap = len_needle - 1 - i;
- break;
- }
+ }
+ // The gap between the last character and the previous
+ // occurrence of an equivalent character (modulo TABLE_SIZE)
+ p->gap = len_needle;
+ STRINGLIB_CHAR last = needle[len_needle - 1] & TABLE_MASK;
+ for (Py_ssize_t i = len_needle - 2; i >= 0; i--) {
+ STRINGLIB_CHAR x = needle[i] & TABLE_MASK;
+ if (x == last) {
+ p->gap = len_needle - 1 - i;
+ break;
}
}
// Fill up a compressed Boyer-Moore "Bad Character" table
@@ -383,6 +382,8 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
const STRINGLIB_CHAR *window;
LOG("===== Two-way: \"%s\" in \"%s\". =====\n", needle, haystack);
+ Py_ssize_t gap = p->gap;
+ Py_ssize_t gap_jump_end = Py_MIN(len_needle, cut + gap);
if (p->is_periodic) {
LOG("Needle is periodic.\n");
Py_ssize_t memory = 0;
@@ -408,8 +409,16 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
Py_ssize_t i = Py_MAX(cut, memory);
for (; i < len_needle; i++) {
if (needle[i] != window[i]) {
- LOG("Right half does not match.\n");
- window_last += i - cut + 1;
+ if (i < gap_jump_end) {
+ LOG("Early right half mismatch: jump by gap.\n");
+ assert(gap >= i - cut + 1);
+ window_last += gap;
+ }
+ else {
+ LOG("Late right half mismatch: jump by n (>gap)\n");
+ assert(i - cut + 1 > gap);
+ window_last += i - cut + 1;
+ }
memory = 0;
goto periodicwindowloop;
}
@@ -442,10 +451,8 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
}
}
else {
- Py_ssize_t gap = p->gap;
period = Py_MAX(gap, period);
LOG("Needle is not periodic.\n");
- Py_ssize_t gap_jump_end = Py_MIN(len_needle, cut + gap);
windowloop:
while (window_last < haystack_end) {
for (;;) {
@@ -463,19 +470,19 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
window = window_last - len_needle + 1;
assert((window[len_needle - 1] & TABLE_MASK) ==
(needle[len_needle - 1] & TABLE_MASK));
- for (Py_ssize_t i = cut; i < gap_jump_end; i++) {
- if (needle[i] != window[i]) {
- LOG("Early right half mismatch: jump by gap.\n");
- assert(gap >= i - cut + 1);
- window_last += gap;
- goto windowloop;
- }
- }
- for (Py_ssize_t i = gap_jump_end; i < len_needle; i++) {
+ Py_ssize_t i = cut;
+ for (; i < len_needle; i++) {
if (needle[i] != window[i]) {
- LOG("Late right half mismatch.\n");
- assert(i - cut + 1 > gap);
- window_last += i - cut + 1;
+ if (i < gap_jump_end) {
+ LOG("Early right half mismatch: jump by gap.\n");
+ assert(gap >= i - cut + 1);
+ window_last += gap;
+ }
+ else {
+ LOG("Late right half mismatch: jump by n (>gap)\n");
+ assert(i - cut + 1 > gap);
+ window_last += i - cut + 1;
+ }
goto windowloop;
}
}