author     Dennis Sweeney <36520290+sweeneyde@users.noreply.github.com>  2021-02-28 18:20:50 (GMT)
committer  GitHub <noreply@github.com>  2021-02-28 18:20:50 (GMT)
commit     73a85c4e1da42db28e3de57c868d24a089b8d277 (patch)
tree       ed2b044a2b8bd42bb611922bbcacbaf3599e91f9 /Objects
parent     2183d06bc8a481098d62a4ebed8d6982b3d1602a (diff)
bpo-41972: Use the two-way algorithm for string searching (GH-22904)
Implement an enhanced variant of Crochemore and Perrin's Two-Way string searching algorithm, which reduces worst-case time from quadratic (the product of the string and pattern lengths) to linear. This applies to forward searches (like ``find``, ``index``, ``replace``); the algorithm for reverse searches (like ``rfind``) is not changed.

Co-authored-by: Tim Peters <tim.peters@gmail.com>
Diffstat (limited to 'Objects')
-rw-r--r--  Objects/stringlib/fastsearch.h                      | 490
-rw-r--r--  Objects/stringlib/stringlib_find_two_way_notes.txt  | 431
2 files changed, 901 insertions, 20 deletions
diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h
index 56a4467..6574720 100644
--- a/Objects/stringlib/fastsearch.h
+++ b/Objects/stringlib/fastsearch.h
@@ -9,10 +9,16 @@
/* note: fastsearch may access s[n], which isn't a problem when using
Python's ordinary string types, but may cause problems if you're
using this code in other contexts. also, the count mode returns -1
- if there cannot possible be a match in the target string, and 0 if
+ if there cannot possibly be a match in the target string, and 0 if
it has actually checked for matches, but didn't find any. callers
beware! */
+/* If the strings are long enough, use Crochemore and Perrin's Two-Way
+ algorithm, which has worst-case O(n) runtime and best-case O(n/k).
+ Also compute a table of shifts to achieve O(n/k) in more cases,
+ and often (data dependent) deduce larger shifts than pure C&P can
+ deduce. */
+
#define FAST_COUNT 0
#define FAST_SEARCH 1
#define FAST_RSEARCH 2
@@ -160,6 +166,353 @@ STRINGLIB(rfind_char)(const STRINGLIB_CHAR* s, Py_ssize_t n, STRINGLIB_CHAR ch)
#undef MEMCHR_CUT_OFF
+/* Change to a 1 to see logging comments walk through the algorithm. */
+#if 0 && STRINGLIB_SIZEOF_CHAR == 1
+# define LOG(...) printf(__VA_ARGS__)
+# define LOG_STRING(s, n) printf("\"%.*s\"", n, s)
+#else
+# define LOG(...)
+# define LOG_STRING(s, n)
+#endif
+
+Py_LOCAL_INLINE(Py_ssize_t)
+STRINGLIB(_lex_search)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle,
+ Py_ssize_t *return_period, int invert_alphabet)
+{
+ /* Do a lexicographic search. Essentially this:
+ >>> max(needle[i:] for i in range(len(needle)+1))
+ Also find the period of the right half. */
+ Py_ssize_t max_suffix = 0;
+ Py_ssize_t candidate = 1;
+ Py_ssize_t k = 0;
+ // The period of the right half.
+ Py_ssize_t period = 1;
+
+ while (candidate + k < len_needle) {
+ // each loop increases candidate + k + max_suffix
+ STRINGLIB_CHAR a = needle[candidate + k];
+ STRINGLIB_CHAR b = needle[max_suffix + k];
+ // check if the suffix at candidate is better than max_suffix
+ if (invert_alphabet ? (b < a) : (a < b)) {
+ // Fell short of max_suffix.
+ // The next k + 1 characters are non-increasing
+ // from candidate, so they won't start a maximal suffix.
+ candidate += k + 1;
+ k = 0;
+ // We've ruled out any period smaller than what's
+ // been scanned since max_suffix.
+ period = candidate - max_suffix;
+ }
+ else if (a == b) {
+ if (k + 1 != period) {
+ // Keep scanning the equal strings
+ k++;
+ }
+ else {
+ // Matched a whole period.
+ // Start matching the next period.
+ candidate += period;
+ k = 0;
+ }
+ }
+ else {
+ // Did better than max_suffix, so replace it.
+ max_suffix = candidate;
+ candidate++;
+ k = 0;
+ period = 1;
+ }
+ }
+ *return_period = period;
+ return max_suffix;
+}
+
+Py_LOCAL_INLINE(Py_ssize_t)
+STRINGLIB(_factorize)(const STRINGLIB_CHAR *needle,
+ Py_ssize_t len_needle,
+ Py_ssize_t *return_period)
+{
+ /* Do a "critical factorization", making it so that:
+ >>> needle = (left := needle[:cut]) + (right := needle[cut:])
+ where the "local period" of the cut is maximal.
+
+ The local period of the cut is the minimal length of a string w
+ such that (left endswith w or w endswith left)
+ and (right startswith w or w startswith right).
+
+ The Critical Factorization Theorem says that this maximal local
+ period is the global period of the string.
+
+ Crochemore and Perrin (1991) show that this cut can be computed
+ as the later of two cuts: one that gives a lexicographically
+ maximal right half, and one that gives the same with
+ respect to a reversed alphabet-ordering.
+
+ This is what we want to happen:
+ >>> x = "GCAGAGAG"
+ >>> cut, period = factorize(x)
+ >>> x[:cut], (right := x[cut:])
+ ('GC', 'AGAGAG')
+ >>> period # right half period
+ 2
+ >>> right[period:] == right[:-period]
+ True
+
+ This is how the local period lines up in the above example:
+ GC | AGAGAG
+ AGAGAGC = AGAGAGC
+ The length of this minimal repetition is 7, which is indeed the
+ period of the original string. */
+
+ Py_ssize_t cut1, period1, cut2, period2, cut, period;
+ cut1 = STRINGLIB(_lex_search)(needle, len_needle, &period1, 0);
+ cut2 = STRINGLIB(_lex_search)(needle, len_needle, &period2, 1);
+
+ // Take the later cut.
+ if (cut1 > cut2) {
+ period = period1;
+ cut = cut1;
+ }
+ else {
+ period = period2;
+ cut = cut2;
+ }
+
+ LOG("split: "); LOG_STRING(needle, cut);
+ LOG(" + "); LOG_STRING(needle + cut, len_needle - cut);
+ LOG("\n");
+
+ *return_period = period;
+ return cut;
+}
+
+#define SHIFT_TYPE uint8_t
+#define NOT_FOUND ((1U<<(8*sizeof(SHIFT_TYPE))) - 1U)
+#define SHIFT_OVERFLOW (NOT_FOUND - 1U)
+
+#define TABLE_SIZE_BITS 6
+#define TABLE_SIZE (1U << TABLE_SIZE_BITS)
+#define TABLE_MASK (TABLE_SIZE - 1U)
+
+typedef struct STRINGLIB(_pre) {
+ const STRINGLIB_CHAR *needle;
+ Py_ssize_t len_needle;
+ Py_ssize_t cut;
+ Py_ssize_t period;
+ int is_periodic;
+ SHIFT_TYPE table[TABLE_SIZE];
+} STRINGLIB(prework);
+
+
+Py_LOCAL_INLINE(void)
+STRINGLIB(_preprocess)(const STRINGLIB_CHAR *needle, Py_ssize_t len_needle,
+ STRINGLIB(prework) *p)
+{
+ p->needle = needle;
+ p->len_needle = len_needle;
+ p->cut = STRINGLIB(_factorize)(needle, len_needle, &(p->period));
+ assert(p->period + p->cut <= len_needle);
+ p->is_periodic = (0 == memcmp(needle,
+ needle + p->period,
+ p->cut * STRINGLIB_SIZEOF_CHAR));
+ if (p->is_periodic) {
+ assert(p->cut <= len_needle/2);
+ assert(p->cut < p->period);
+ }
+ else {
+ // A lower bound on the period
+ p->period = Py_MAX(p->cut, len_needle - p->cut) + 1;
+ }
+ // Now fill up a table
+ memset(&(p->table[0]), 0xff, TABLE_SIZE*sizeof(SHIFT_TYPE));
+ assert(p->table[0] == NOT_FOUND);
+ assert(p->table[TABLE_MASK] == NOT_FOUND);
+ for (Py_ssize_t i = 0; i < len_needle; i++) {
+ Py_ssize_t shift = len_needle - i;
+ if (shift > SHIFT_OVERFLOW) {
+ shift = SHIFT_OVERFLOW;
+ }
+ p->table[needle[i] & TABLE_MASK] = Py_SAFE_DOWNCAST(shift,
+ Py_ssize_t,
+ SHIFT_TYPE);
+ }
+}
+
+Py_LOCAL_INLINE(Py_ssize_t)
+STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
+ STRINGLIB(prework) *p)
+{
+ // Crochemore and Perrin's (1991) Two-Way algorithm.
+ // See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
+ Py_ssize_t len_needle = p->len_needle;
+ Py_ssize_t cut = p->cut;
+ Py_ssize_t period = p->period;
+ const STRINGLIB_CHAR *needle = p->needle;
+ const STRINGLIB_CHAR *window = haystack;
+ const STRINGLIB_CHAR *last_window = haystack + len_haystack - len_needle;
+ SHIFT_TYPE *table = p->table;
+ LOG("===== Two-way: \"%s\" in \"%s\". =====\n", needle, haystack);
+
+ if (p->is_periodic) {
+ LOG("Needle is periodic.\n");
+ Py_ssize_t memory = 0;
+ periodicwindowloop:
+ while (window <= last_window) {
+ Py_ssize_t i = Py_MAX(cut, memory);
+
+ // Visualize the line-up:
+ LOG("> "); LOG_STRING(haystack, len_haystack);
+ LOG("\n> "); LOG("%*s", window - haystack, "");
+ LOG_STRING(needle, len_needle);
+ LOG("\n> "); LOG("%*s", window - haystack + i, "");
+ LOG(" ^ <-- cut\n");
+
+ if (window[i] != needle[i]) {
+ // Sunday's trick: if we're going to jump, we might
+ // as well jump to line up the character *after* the
+ // current window.
+ STRINGLIB_CHAR first_outside = window[len_needle];
+ SHIFT_TYPE shift = table[first_outside & TABLE_MASK];
+ if (shift == NOT_FOUND) {
+ LOG("\"%c\" not found. Skipping entirely.\n",
+ first_outside);
+ window += len_needle + 1;
+ }
+ else {
+ LOG("Shifting to line up \"%c\".\n", first_outside);
+ Py_ssize_t memory_shift = i - cut + 1;
+ window += Py_MAX(shift, memory_shift);
+ }
+ memory = 0;
+ goto periodicwindowloop;
+ }
+ for (i = i + 1; i < len_needle; i++) {
+ if (needle[i] != window[i]) {
+ LOG("Right half does not match. Jump ahead by %d.\n",
+ i - cut + 1);
+ window += i - cut + 1;
+ memory = 0;
+ goto periodicwindowloop;
+ }
+ }
+ for (i = memory; i < cut; i++) {
+ if (needle[i] != window[i]) {
+ LOG("Left half does not match. Jump ahead by period %d.\n",
+ period);
+ window += period;
+ memory = len_needle - period;
+ goto periodicwindowloop;
+ }
+ }
+ LOG("Left half matches. Returning %d.\n",
+ window - haystack);
+ return window - haystack;
+ }
+ }
+ else {
+ LOG("Needle is not periodic.\n");
+ assert(cut < len_needle);
+ STRINGLIB_CHAR needle_cut = needle[cut];
+ windowloop:
+ while (window <= last_window) {
+
+ // Visualize the line-up:
+ LOG("> "); LOG_STRING(haystack, len_haystack);
+ LOG("\n> "); LOG("%*s", window - haystack, "");
+ LOG_STRING(needle, len_needle);
+ LOG("\n> "); LOG("%*s", window - haystack + cut, "");
+ LOG(" ^ <-- cut\n");
+
+ if (window[cut] != needle_cut) {
+ // Sunday's trick: if we're going to jump, we might
+ // as well jump to line up the character *after* the
+ // current window.
+ STRINGLIB_CHAR first_outside = window[len_needle];
+ SHIFT_TYPE shift = table[first_outside & TABLE_MASK];
+ if (shift == NOT_FOUND) {
+ LOG("\"%c\" not found. Skipping entirely.\n",
+ first_outside);
+ window += len_needle + 1;
+ }
+ else {
+ LOG("Shifting to line up \"%c\".\n", first_outside);
+ window += shift;
+ }
+ goto windowloop;
+ }
+ for (Py_ssize_t i = cut + 1; i < len_needle; i++) {
+ if (needle[i] != window[i]) {
+ LOG("Right half does not match. Advance by %d.\n",
+ i - cut + 1);
+ window += i - cut + 1;
+ goto windowloop;
+ }
+ }
+ for (Py_ssize_t i = 0; i < cut; i++) {
+ if (needle[i] != window[i]) {
+ LOG("Left half does not match. Advance by period %d.\n",
+ period);
+ window += period;
+ goto windowloop;
+ }
+ }
+ LOG("Left half matches. Returning %d.\n", window - haystack);
+ return window - haystack;
+ }
+ }
+ LOG("Not found. Returning -1.\n");
+ return -1;
+}
+
+Py_LOCAL_INLINE(Py_ssize_t)
+STRINGLIB(_two_way_find)(const STRINGLIB_CHAR *haystack,
+ Py_ssize_t len_haystack,
+ const STRINGLIB_CHAR *needle,
+ Py_ssize_t len_needle)
+{
+ LOG("###### Finding \"%s\" in \"%s\".\n", needle, haystack);
+ STRINGLIB(prework) p;
+ STRINGLIB(_preprocess)(needle, len_needle, &p);
+ return STRINGLIB(_two_way)(haystack, len_haystack, &p);
+}
+
+Py_LOCAL_INLINE(Py_ssize_t)
+STRINGLIB(_two_way_count)(const STRINGLIB_CHAR *haystack,
+ Py_ssize_t len_haystack,
+ const STRINGLIB_CHAR *needle,
+ Py_ssize_t len_needle,
+ Py_ssize_t maxcount)
+{
+ LOG("###### Counting \"%s\" in \"%s\".\n", needle, haystack);
+ STRINGLIB(prework) p;
+ STRINGLIB(_preprocess)(needle, len_needle, &p);
+ Py_ssize_t index = 0, count = 0;
+ while (1) {
+ Py_ssize_t result;
+ result = STRINGLIB(_two_way)(haystack + index,
+ len_haystack - index, &p);
+ if (result == -1) {
+ return count;
+ }
+ count++;
+ if (count == maxcount) {
+ return maxcount;
+ }
+ index += result + len_needle;
+ }
+ return count;
+}
+
+#undef SHIFT_TYPE
+#undef NOT_FOUND
+#undef SHIFT_OVERFLOW
+#undef TABLE_SIZE_BITS
+#undef TABLE_SIZE
+#undef TABLE_MASK
+
+#undef LOG
+#undef LOG_STRING
+
Py_LOCAL_INLINE(Py_ssize_t)
FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
const STRINGLIB_CHAR* p, Py_ssize_t m,
@@ -195,10 +548,22 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
}
mlast = m - 1;
- skip = mlast - 1;
+ skip = mlast;
mask = 0;
if (mode != FAST_RSEARCH) {
+ if (m >= 100 && w >= 2000 && w / m >= 5) {
+ /* For larger problems where the needle isn't a huge
+ percentage of the size of the haystack, the relatively
+ expensive O(m) startup cost of the two-way algorithm
+ will surely pay off. */
+ if (mode == FAST_SEARCH) {
+ return STRINGLIB(_two_way_find)(s, n, p, m);
+ }
+ else {
+ return STRINGLIB(_two_way_count)(s, n, p, m, maxcount);
+ }
+ }
const STRINGLIB_CHAR *ss = s + m - 1;
const STRINGLIB_CHAR *pp = p + m - 1;
@@ -207,41 +572,118 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
/* process pattern[:-1] */
for (i = 0; i < mlast; i++) {
STRINGLIB_BLOOM_ADD(mask, p[i]);
- if (p[i] == p[mlast])
+ if (p[i] == p[mlast]) {
skip = mlast - i - 1;
+ }
}
/* process pattern[-1] outside the loop */
STRINGLIB_BLOOM_ADD(mask, p[mlast]);
+ if (m >= 100 && w >= 8000) {
+ /* To ensure that we have good worst-case behavior,
+ here's an adaptive version of the algorithm, where if
+ we match O(m) characters without any matches of the
+ entire needle, then we predict that the startup cost of
+ the two-way algorithm will probably be worth it. */
+ Py_ssize_t hits = 0;
+ for (i = 0; i <= w; i++) {
+ if (ss[i] == pp[0]) {
+ /* candidate match */
+ for (j = 0; j < mlast; j++) {
+ if (s[i+j] != p[j]) {
+ break;
+ }
+ }
+ if (j == mlast) {
+ /* got a match! */
+ if (mode != FAST_COUNT) {
+ return i;
+ }
+ count++;
+ if (count == maxcount) {
+ return maxcount;
+ }
+ i = i + mlast;
+ continue;
+ }
+ /* miss: check if next character is part of pattern */
+ if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
+ i = i + m;
+ }
+ else {
+ i = i + skip;
+ }
+ hits += j + 1;
+ if (hits >= m / 4 && i < w - 1000) {
+ /* We've done O(m) fruitless comparisons
+ anyway, so spend the O(m) cost on the
+ setup for the two-way algorithm. */
+ Py_ssize_t res;
+ if (mode == FAST_COUNT) {
+ res = STRINGLIB(_two_way_count)(
+ s+i, n-i, p, m, maxcount-count);
+ return count + res;
+ }
+ else {
+ res = STRINGLIB(_two_way_find)(s+i, n-i, p, m);
+ if (res == -1) {
+ return -1;
+ }
+ return i + res;
+ }
+ }
+ }
+ else {
+ /* skip: check if next character is part of pattern */
+ if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
+ i = i + m;
+ }
+ }
+ }
+ if (mode != FAST_COUNT) {
+ return -1;
+ }
+ return count;
+ }
+ /* The standard, non-adaptive version of the algorithm. */
for (i = 0; i <= w; i++) {
/* note: using mlast in the skip path slows things down on x86 */
if (ss[i] == pp[0]) {
/* candidate match */
- for (j = 0; j < mlast; j++)
- if (s[i+j] != p[j])
+ for (j = 0; j < mlast; j++) {
+ if (s[i+j] != p[j]) {
break;
+ }
+ }
if (j == mlast) {
/* got a match! */
- if (mode != FAST_COUNT)
+ if (mode != FAST_COUNT) {
return i;
+ }
count++;
- if (count == maxcount)
+ if (count == maxcount) {
return maxcount;
+ }
i = i + mlast;
continue;
}
/* miss: check if next character is part of pattern */
- if (!STRINGLIB_BLOOM(mask, ss[i+1]))
+ if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
i = i + m;
- else
+ }
+ else {
i = i + skip;
- } else {
+ }
+ }
+ else {
/* skip: check if next character is part of pattern */
- if (!STRINGLIB_BLOOM(mask, ss[i+1]))
+ if (!STRINGLIB_BLOOM(mask, ss[i+1])) {
i = i + m;
+ }
}
}
- } else { /* FAST_RSEARCH */
+ }
+ else { /* FAST_RSEARCH */
/* create compressed boyer-moore delta 1 table */
@@ -250,28 +692,36 @@ FASTSEARCH(const STRINGLIB_CHAR* s, Py_ssize_t n,
/* process pattern[:0:-1] */
for (i = mlast; i > 0; i--) {
STRINGLIB_BLOOM_ADD(mask, p[i]);
- if (p[i] == p[0])
+ if (p[i] == p[0]) {
skip = i - 1;
+ }
}
for (i = w; i >= 0; i--) {
if (s[i] == p[0]) {
/* candidate match */
- for (j = mlast; j > 0; j--)
- if (s[i+j] != p[j])
+ for (j = mlast; j > 0; j--) {
+ if (s[i+j] != p[j]) {
break;
- if (j == 0)
+ }
+ }
+ if (j == 0) {
/* got a match! */
return i;
+ }
/* miss: check if previous character is part of pattern */
- if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1]))
+ if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) {
i = i - m;
- else
+ }
+ else {
i = i - skip;
- } else {
+ }
+ }
+ else {
/* skip: check if previous character is part of pattern */
- if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1]))
+ if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1])) {
i = i - m;
+ }
}
}
}
diff --git a/Objects/stringlib/stringlib_find_two_way_notes.txt b/Objects/stringlib/stringlib_find_two_way_notes.txt
new file mode 100644
index 0000000..afe4543
--- /dev/null
+++ b/Objects/stringlib/stringlib_find_two_way_notes.txt
@@ -0,0 +1,431 @@
+This document explains Crochemore and Perrin's Two-Way string matching
+algorithm, in which a smaller string (the "pattern" or "needle")
+is searched for in a longer string (the "text" or "haystack"),
+determining whether the needle is a substring of the haystack, and if
+so, at what index(es). It is to be used by Python's string
+(and bytes-like) objects when calling `find`, `index`, `__contains__`,
+or implicitly in methods like `replace` or `partition`.
+
+This is essentially a re-telling of the paper
+
+ Crochemore M., Perrin D., 1991, Two-way string-matching,
+ Journal of the ACM 38(3):651-675.
+
+focused more on understanding and examples than on rigor. See also
+the code sample here:
+
+ http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
+
+The algorithm runs in O(len(needle) + len(haystack)) time and with
+O(1) space. However, since there is a larger preprocessing cost than
+simpler algorithms, this Two-Way algorithm is to be used only when the
+needle and haystack lengths meet certain thresholds.
+
+
+These are the basic steps of the algorithm:
+
+ * "Very carefully" cut the needle in two.
+ * For each alignment attempted:
+ 1. match the right part
+ * On failure, jump by the amount matched + 1
+ 2. then match the left part.
+ * On failure jump by max(len(left), len(right)) + 1
+ * If the needle is periodic, don't re-do comparisons; maintain
+ a "memory" of how many characters you already know match.
+
+
+-------- Matching the right part --------
+
+We first scan the right part of the needle to check if it matches
+the aligned characters in the haystack. We scan left-to-right,
+and if a mismatch occurs, we jump ahead by the amount matched plus 1.
+
+Example:
+
+ text: ........EFGX...................
+ pattern: ....abcdEFGH....
+ cut: <<<<>>>>
+
+Matched 3, so jump ahead by 4:
+
+ text: ........EFGX...................
+ pattern: ....abcdEFGH....
+ cut: <<<<>>>>
+
+Why are we allowed to do this? Because we cut the needle very
+carefully, in such a way that if the cut is ...abcd + EFGH... then
+we have
+
+ d != E
+ cd != EF
+ bcd != EFG
+ abcd != EFGH
+ ... and so on.
+
+If this is true for every pair of equal-length substrings around the
+cut, then the following alignments do not work, so we can skip them:
+
+ text: ........EFG....................
+ pattern: ....abcdEFGH....
+ ^ (Bad because d != E)
+ text: ........EFG....................
+ pattern: ....abcdEFGH....
+ ^^ (Bad because cd != EF)
+ text: ........EFG....................
+ pattern: ....abcdEFGH....
+ ^^^ (Bad because bcd != EFG)
+
+Skip 3 alignments => increment alignment by 4.
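
As a quick sanity check of those inequalities, here is a small helper
in Python (illustrative only, not part of the patch):

    def cut_inequalities(needle, cut):
        # Compare needle[cut-k:cut] with needle[cut:cut+k] for every k
        # where both chunks fit entirely inside the needle.
        for k in range(1, min(cut, len(needle) - cut) + 1):
            left, right = needle[cut - k:cut], needle[cut:cut + k]
            print(f"{left!r} != {right!r}: {left != right}")

    cut_inequalities("abcdEFGH", 4)   # prints True for k = 1, 2, 3, 4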
+
+
+-------- If len(left_part) < len(right_part) --------
+
+Above is the core idea, and it begins to suggest how the algorithm can
+be linear-time. There is one bit of subtlety involving what to do
+around the end of the needle: if the left half is shorter than the
+right, then we could run into something like this:
+
+ text: .....EFG......
+ pattern: cdEFGH
+
+The same argument holds that we can skip ahead by 4, so long as
+
+ d != E
+ cd != EF
+ ?cd != EFG
+ ??cd != EFGH
+ etc.
+
+The question marks represent "wildcards" that always match; they're
+outside the limits of the needle, so there's no way for them to
+invalidate a match. To ensure that the inequalities above are always
+true, we need them to be true for all possible '?' values. We thus
+need cd != FG and cd != GH, etc.
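
The wildcard-tolerant version of the check only needs a mismatch among
the positions that actually fall inside the needle. A small
illustrative helper (not part of the patch):

    def holds_with_wildcards(needle, cut, k):
        # True if the length-k chunks around the cut must differ no
        # matter what the out-of-range "wildcard" positions hold, i.e.
        # some in-range pair of characters mismatches.
        m = len(needle)
        return any(0 <= cut - k + j and cut + j < m
                   and needle[cut - k + j] != needle[cut + j]
                   for j in range(k))

    # For 'cdEFGH' cut as 'cd' + 'EFGH', the checks reduce to d != E,
    # cd != EF, cd != FG, cd != GH, as described above.
    print([holds_with_wildcards("cdEFGH", 2, k) for k in range(1, 5)])
    # [True, True, True, True]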
+
+
+-------- Matching the left part --------
+
+Once we have ensured the right part matches, we scan the left part
+(order doesn't matter, but traditionally right-to-left), and if we
+find a mismatch, we jump ahead by
+max(len(left_part), len(right_part)) + 1. We have already seen that
+we can jump by at least len(right_part) + 1:
+
+ text: .....EFG.....
+ pattern: abcdEFG
+ Matched 3, so jump by 4,
+ using the fact that d != E, cd != EF, and bcd != EFG.
+
+But we can also jump by at least len(left_part) + 1:
+
+ text: ....cdEF.....
+ pattern: abcdEF
+ Jump by len('abcd') + 1 = 5.
+
+ Skip the alignments:
+ text: ....cdEF.....
+ pattern: abcdEF
+ text: ....cdEF.....
+ pattern: abcdEF
+ text: ....cdEF.....
+ pattern: abcdEF
+ text: ....cdEF.....
+ pattern: abcdEF
+
+This requires the following facts:
+ d != E
+ cd != EF
+ bcd != EF?
+ abcd != EF??
+ etc., for all values of ?s, as above.
+
+If we have both sets of inequalities, then we can indeed jump by
+max(len(left_part), len(right_part)) + 1. Under the assumption of such
+a nice splitting of the needle, we now have enough to prove linear
+time for the search: consider the forward-progress/comparisons ratio
+at each alignment position. If a mismatch occurs in the right part,
+the ratio is 1 position forward per comparison. On the other hand,
+if a mismatch occurs in the left half, we advance by more than
+len(needle)//2 positions for at most len(needle) comparisons,
+so this ratio is more than 1/2. This average "movement speed" is
+bounded below by the constant "1 position per 2 comparisons", so we
+have linear time.
+
+
+-------- The periodic case --------
+
+The sets of inequalities listed so far seem too good to be true in
+the general case. Indeed, they fail when a needle is periodic:
+there's no way to split 'AAbAAbAAbA' in two such that
+
+ (the stuff n characters to the left of the split)
+ cannot equal
+ (the stuff n characters to the right of the split)
+ for all n.
+
+This is because no matter how you cut it, you'll get
+s[cut-3:cut] == s[cut:cut+3]. So what do we do? We still cut the
+needle in two so that n can be as big as possible. If we were to
+split it as
+
+ AAbA + AbAAbA
+
+then A == A at the split, so this is bad (we failed at length 1), but
+if we split it as
+
+ AA + bAAbAAbA
+
+we at least have A != b and AA != bA, and we fail at length 3
+since ?AA == bAA. We already knew that a cut to make length-3
+mismatch was impossible due to the period, but we now see that the
+bound is sharp; we can get length-1 and length-2 to mismatch.
+
+This is exactly the content of the *critical factorization theorem*:
+that no matter the period of the original needle, you can cut it in
+such a way that (with the appropriate question marks),
+needle[cut-k:cut] mismatches needle[cut:cut+k] for all k < the period.
+
+Even "non-periodic" strings are periodic with a period equal to
+their length, so for such needles, the CFT already guarantees that
+the algorithm described so far will work, since we can cut the needle
+so that the length-k chunks on either side of the cut mismatch for all
+k < len(needle). Looking closer at the algorithm, we only actually
+require that k go up to max(len(left_part), len(right_part)).
+So long as the period exceeds that, we're good.
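
The claims about 'AAbAAbAAbA' -- period 3, with guaranteed mismatches
only for lengths 1 and 2 at the cut AA + bAAbAAbA -- can be checked
directly; period_of below is an illustrative brute-force helper, not
code from the patch:

    def period_of(s):
        # Smallest p > 0 with s[i] == s[i+p] wherever both are in range.
        return next(p for p in range(1, len(s) + 1)
                    if all(s[i] == s[i + p] for i in range(len(s) - p)))

    needle = "AAbAAbAAbA"
    print(period_of(needle))           # 3
    print(needle[1:2] != needle[2:3])  # True:  A != b
    print(needle[0:2] != needle[2:4])  # True:  AA != bA
    print(needle[0:2] == needle[3:5])  # True:  ?AA == bAA on the real
                                       # characters, so length 3 cannot
                                       # be forced to mismatch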
+
+The more general shorter-period case is a bit harder. The essentials
+are the same, except we use the periodicity to our advantage by
+"remembering" periods that we've already compared. In our running
+example, say we're computing
+
+ "AAbAAbAAbA" in "bbbAbbAAbAAbAAbbbAAbAAbAAbAA".
+
+We cut as AA + bAAbAAbA, and then the algorithm runs as follows:
+
+ First alignment:
+ bbbAbbAAbAAbAAbbbAAbAAbAAbAA
+ AAbAAbAAbA
+ ^^X
+ - Mismatch at third position, so jump by 3.
+ - This requires that A!=b and AA != bA.
+
+ Second alignment:
+ bbbAbbAAbAAbAAbbbAAbAAbAAbAA
+ AAbAAbAAbA
+ ^^^^^^^^
+ X
+ - Matched entire right part
+ - Mismatch at left part.
+ - Jump forward a period, remembering the existing comparisons
+
+ Third alignment:
+ bbbAbbAAbAAbAAbbbAAbAAbAAbAA
+ AAbAAbAAbA
+ mmmmmmm^^X
+ - There's "memory": a bunch of characters were already matched.
+ - Two more characters match beyond that.
+ - The 8th character of the right part mismatched, so jump by 8
+ - The above rule is more complicated than usual: we don't have
+ the right inequalities for lengths 1 through 7, but we do have
+ shifted copies of the length-1 and length-2 inequalities,
+ along with knowledge of the mismatch. We can skip all of these
+ alignments at once:
+
+ bbbAbbAAbAAbAAbbbAAbAAbAAbAA
+ AAbAAbAAbA
+ ~ A != b at the cut
+ bbbAbbAAbAAbAAbbbAAbAAbAAbAA
+ AAbAAbAAbA
+ ~~ AA != bA at the cut
+ bbbAbbAAbAAbAAbbbAAbAAbAAbAA
+ AAbAAbAAbA
+ ^^^^X 7-3=4 match, and the 5th misses.
+ bbbAbbAAbAAbAAbbbAAbAAbAAbAA
+ AAbAAbAAbA
+ ~ A != b at the cut
+ bbbAbbAAbAAbAAbbbAAbAAbAAbAA
+ AAbAAbAAbA
+ ~~ AA != bA at the cut
+ bbbAbbAAbAAbAAbbbAAbAAbAAbAA
+ AAbAAbAAbA
+ ^X 7-3-3=1 match and the 2nd misses.
+ bbbAbbAAbAAbAAbbbAAbAAbAAbAA
+ AAbAAbAAbA
+ ~ A != b at the cut
+
+ Fourth alignment:
+ bbbAbbAAbAAbAAbbbAAbAAbAAbAA
+ AAbAAbAAbA
+ ^X
+ - Second character mismatches, so jump by 2.
+
+ Fifth alignment:
+ bbbAbbAAbAAbAAbbbAAbAAbAAbAA
+ AAbAAbAAbA
+ ^^^^^^^^
+ X
+ - Right half matches, so use memory and skip ahead by period=3
+
+ Sixth alignment:
+ bbbAbbAAbAAbAAbbbAAbAAbAAbAA
+ AAbAAbAAbA
+ mmmmmmmm^^
+ - Right part matches, left part is remembered, found a match!
+
+The one tricky skip by 8 here generalizes: if we have a period of p,
+then the CFT says we can ensure the cut has the inequality property
+for lengths 1 through p-1, and jumping by p would line up the
+matching characters and mismatched character one period earlier.
+Inductively, this proves that we can skip by the number of characters
+matched in the right half, plus 1, just as in the original algorithm.
+
+To make it explicit, the memory is set whenever the entire right part
+is matched and is then used as a starting point in the next alignment.
+In such a case, the alignment jumps forward one period, and the right
+half matches all except possibly the last period. Additionally,
+if we cut so that the left part has a length strictly less than the
+period (we always can!), then we can know that the left part already
+matches. The memory is reset to 0 whenever there is a mismatch in the
+right part.
+
+To prove linearity for the periodic case, note that if a right-part
+character mismatches, then we advance forward 1 unit per comparison.
+On the other hand, if the entire right part matches, then the skipping
+forward by one period "defers" some of the comparisons to the next
+alignment, where they will then be spent at the usual rate of
+one comparison per step forward. Even if left-half comparisons
+are always "wasted", they constitute less than half of all
+comparisons, so the average rate is certainly at least 1 move forward
+per 2 comparisons.
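
Putting the memory handling together, here is a minimal Python sketch
of the periodic case. It mirrors the structure of the C code but omits
the shift-table trick, and the function name and the hand-fed
cut/period in the usage lines are illustrative:

    def two_way_periodic(haystack, needle, cut, period):
        # Periodic case: cut < period, and needle[:cut] continues the
        # repeating pattern, so matches can be "remembered" across
        # alignments.
        n, m = len(haystack), len(needle)
        window, memory = 0, 0
        while window + m <= n:
            # Match the right part, skipping remembered characters.
            i = max(cut, memory)
            while i < m and haystack[window + i] == needle[i]:
                i += 1
            if i < m:
                window += i - cut + 1    # mismatch in the right part
                memory = 0
                continue
            # Right part matched; check what is left of the left part.
            j = memory
            while j < cut and haystack[window + j] == needle[j]:
                j += 1
            if j >= cut:
                return window            # full match
            window += period             # mismatch in the left part
            memory = m - period          # that many characters still line up
        return -1

    haystack = "bbbAbbAAbAAbAAbbbAAbAAbAAbAA"
    needle = "AAbAAbAAbA"
    print(two_way_periodic(haystack, needle, cut=2, period=3))
    print(haystack.find(needle))         # same index both times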
+
+
+-------- When to choose the periodic algorithm ---------
+
+The periodic algorithm is always valid but has an overhead of one
+more "memory" register and some memory computation steps, so the
+non-periodic/long-period algorithm described first above -- skipping by
+max(len(left_part), len(right_part)) + 1 rather than the period --
+should be preferred when possible.
+
+Interestingly, the long-period algorithm does not require an exact
+computation of the period; it works even for some needles that are
+undeniably "periodic", provided their period is long enough:
+
+ Cut: AbcdefAbc == Abcde + fAbc
+
+This cut gives these inequalities:
+
+ e != f
+ de != fA
+ cde != fAb
+ bcde != fAbc
+ Abcde != fAbc?
+ The first failure is a period long, per the CFT:
+ ?Abcde == fAbc??
+
+A sufficient condition for using the long-period algorithm is having
+the period of the needle be greater than
+max(len(left_part), len(right_part)). This way, after choosing a good
+split, we get all of the max(len(left_part), len(right_part))
+inequalities around the cut that were required in the long-period
+version of the algorithm.
+
+With all of this in mind, here's how we choose:
+
+ (1) Choose a "critical factorization" of the needle -- a cut
+ where we have period minus 1 inequalities in a row.
+ More specifically, choose a cut so that the left_part
+ is less than one period long.
+ (2) Determine the period P_r of the right_part.
+ (3) Check if the left part is just an extension of the pattern of
+ the right part, so that the whole needle has period P_r.
+ Explicitly, check if
+ needle[0:cut] == needle[0+P_r:cut+P_r]
+ If so, we use the periodic algorithm. If not equal, we use the
+ long-period algorithm.
+
+Note that if equality holds in (3), then the period of the whole
+string is P_r. On the other hand, suppose equality does not hold.
+The period of the needle is then strictly greater than P_r. Here's
+a general fact:
+
+ If p is a substring of s and p has period r, then the period
+ of s is either equal to r or greater than len(p).
+
+We know that needle_period != P_r,
+and therefore needle_period > len(right_part).
+Additionally, we'll choose the cut (see below)
+so that len(left_part) < needle_period.
+
+Thus, in the case where equality does not hold, we have that
+needle_period >= max(len(left_part), len(right_part)) + 1,
+so the long-period algorithm works, but otherwise, we know the period
+of the needle.
+
+Note that this decision process doesn't always require an exact
+computation of the period -- we can get away with only computing P_r!
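
Here is a brute-force Python version of this decision procedure. It is
quadratic and purely illustrative (the real preprocessing in
fastsearch.h does the equivalent work in linear time), and the
function name choose_algorithm is made up:

    def choose_algorithm(needle):
        n = len(needle)
        # (1) critical factorization: the later of the two maximal-suffix cuts
        cut1 = max(range(n), key=lambda i: needle[i:])
        inverted = [-ord(c) for c in needle]
        cut2 = max(range(n), key=lambda i: inverted[i:])
        cut = max(cut1, cut2)
        # (2) period P_r of the right part
        right = needle[cut:]
        p_r = next(p for p in range(1, len(right) + 1)
                   if all(right[i] == right[i + p]
                          for i in range(len(right) - p)))
        # (3) does the left part continue the same periodic pattern?
        if needle[:cut] == needle[p_r:p_r + cut]:
            return "periodic", cut, p_r
        # Otherwise use the lower bound on the period, as the C code does.
        return "long-period", cut, max(cut, n - cut) + 1

    print(choose_algorithm("AAbAAbAAbA"))   # ('periodic', 2, 3)
    print(choose_algorithm("AbcdefAbc"))    # ('long-period', 5, 6)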
+
+
+-------- Computing the cut --------
+
+Our remaining tasks are now to compute a cut of the needle with as
+many inequalities as possible, ensuring that cut < needle_period.
+Meanwhile, we must also compute the period P_r of the right_part.
+
+The computation is relatively simple, essentially doing this:
+
+ suffix1 = max(needle[i:] for i in range(len(needle)))
+ suffix2 = ... # the same as above, but invert the alphabet
+ cut1 = len(needle) - len(suffix1)
+ cut2 = len(needle) - len(suffix2)
+ cut = max(cut1, cut2) # the later cut
+
+For cut2, "invert the alphabet" is different than saying min(...),
+since in lexicographic order, we still put "py" < "python", even
+if the alphabet is inverted. Computing these, along with the method
+of computing the period of the right half, is easiest to read directly
+from the source code in fastsearch.h, in which these are computed
+in linear time.
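
For reference, here is a rough Python transliteration of that
linear-time computation, mirroring _lex_search() and _factorize() in
fastsearch.h; treat it as a sketch rather than a drop-in equivalent:

    def lex_search(needle, invert_alphabet=False):
        # Returns (start of the maximal suffix, period of that suffix).
        max_suffix, candidate, k, period = 0, 1, 0, 1
        while candidate + k < len(needle):
            a, b = needle[candidate + k], needle[max_suffix + k]
            if (b < a) if invert_alphabet else (a < b):
                # Fell short of the current best suffix.
                candidate += k + 1
                k = 0
                period = candidate - max_suffix
            elif a == b:
                if k + 1 != period:
                    k += 1               # keep scanning the equal stretch
                else:
                    candidate += period  # matched a whole period
                    k = 0
            else:
                # Did better than the current best suffix; replace it.
                max_suffix = candidate
                candidate += 1
                k = 0
                period = 1
        return max_suffix, period

    def factorize(needle):
        # The later of the two cuts, as in _factorize().
        cut1, period1 = lex_search(needle)
        cut2, period2 = lex_search(needle, invert_alphabet=True)
        return (cut1, period1) if cut1 > cut2 else (cut2, period2)

    print(factorize("GCAGAGAG"))   # (2, 2): 'GC' + 'AGAGAG', right period 2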
+
+Crochemore & Perrin's Theorem 3.1 gives that the "cut" above is a
+critical factorization at a position less than the period, but a very brief sketch
+of their proof goes something like this (this is far from complete):
+
+ * If this cut splits the needle as some
+ needle == (a + w) + (w + b), meaning there's a bad equality
+ w == w, it's impossible for w + b to be bigger than both
+ b and w + w + b, so this can't happen. We thus have all of
+ the inequalities with no question marks.
+ * By maximality, the right part is not a substring of the left
+ part. Thus, we have all of the inequalities involving no
+ left-side question marks.
+ * If you have all of the inequalities without right-side question
+ marks, we have a critical factorization.
+ * If one such inequality fails, then there's a smaller period,
+ but the factorization is nonetheless critical. Here's where
+ you need the redundancy coming from computing both cuts and
+ choosing the later one.
+
+
+-------- Some more Bells and Whistles --------
+
+Beyond Crochemore & Perrin's original algorithm, we can use a couple
+more tricks for speed in fastsearch.h:
+
+ 1. Even though C&P has a best-case O(n/m) time, this doesn't occur
+ very often, so we add a Boyer-Moore bad character table to
+ achieve sublinear time in more cases.
+
+ 2. The prework of computing the cut/period is expensive per
+ needle character, so we shouldn't do it if it won't pay off.
+ For this reason, even when the needle and haystack are long
+ enough, we start with the two-way algorithm right away only if
+ the needle's length is a small percentage of the haystack's length.
+
+ 3. In cases where the needle and haystack are large but the needle
+ makes up a significant percentage of the length of the
+ haystack, don't pay the expensive two-way preprocessing cost
+ if you don't need to. Instead, keep track of how many
+ character comparisons come back equal, and once that count
+ reaches a constant fraction of len(needle), pay that cost, since
+ the simpler algorithm isn't doing very well.
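
As an illustration of the bad-character table mentioned in (1), here
is a Python sketch of how _preprocess() fills its 64-bucket shift
table; the constants mirror fastsearch.h, while the helper name
build_shift_table is made up:

    TABLE_SIZE = 64     # 1 << TABLE_SIZE_BITS
    NOT_FOUND = 255     # sentinel for characters absent from the needle

    def build_shift_table(needle):
        # Keyed on the low 6 bits of each character; later occurrences
        # overwrite earlier ones, and values are capped at 254.
        table = [NOT_FOUND] * TABLE_SIZE
        for i, ch in enumerate(needle):
            table[ord(ch) & (TABLE_SIZE - 1)] = min(len(needle) - i,
                                                    NOT_FOUND - 1)
        return table

    # If the character just past the window is absent from the needle,
    # the whole window can be skipped; otherwise shift so that the last
    # occurrence of (a character sharing its bucket) lines up with it.
    table = build_shift_table("abcdEFGH")
    print(table[ord('H') & 63], table[ord('z') & 63])   # 1 255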