bpo-38530: Refactor and improve AttributeError suggestions (GH-25776)

- Make case-swaps half the cost of any other edit - Refactor Levenshtein code to not use memory allocator, and to bail early on no match. - Add comments to Levenshtein distance code - Add test cases for Levenshtein distance behind a debug macro - Set threshold to `(name_size + item_size + 3) * MOVE_COST / 6`. - Reasoning: similar to `difflib.SequenceMatcher.ratio()` >= 2/3: ``` "Multiset Jaccard similarity" >= 2/3 matching letters / total letters >= 2/3 (name_size - distance + item_size - distance) / (name_size + item_size) >= 2/3 1 - (2*distance) / (name_size + item_size) >= 2/3 1/3 >= (2*distance) / (name_size + item_size) (name_size + item_size) / 6 >= distance With rounding: (name_size + item_size + 3) // 6 >= distance ``` Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
author: Dennis Sweeney <36520290+sweeneyde@users.noreply.github.com> 2021-05-03 15:47:27 (GMT)
committer: GitHub <noreply@github.com> 2021-05-03 15:47:27 (GMT)
commit: 80a2a4ed7d090fff2584302f07315d567109bca9 (patch)
tree: cfaec858b722cf629fdb488a02baeb8e823bf185 /Python
parent: c715b524210050bdd2a2e233817246d443bbb236 (diff)
download: cpython-80a2a4ed7d090fff2584302f07315d567109bca9.zip
cpython-80a2a4ed7d090fff2584302f07315d567109bca9.tar.gz
cpython-80a2a4ed7d090fff2584302f07315d567109bca9.tar.bz2
1 files changed, 130 insertions, 59 deletions
diff --git a/Python/suggestions.c b/Python/suggestions.c
index 2fd6714..6fb01f1 100644
--- a/Python/suggestions.c
+++ b/Python/suggestions.c
@@ -3,78 +3,129 @@
 
 #include "pycore_pyerrors.h"
 
-#define MAX_DISTANCE 3
-#define MAX_CANDIDATE_ITEMS 160
-#define MAX_STRING_SIZE 25
+#define MAX_CANDIDATE_ITEMS 750
+#define MAX_STRING_SIZE 40
 
-/* Calculate the Levenshtein distance between string1 and string2 */
-static Py_ssize_t
-levenshtein_distance(const char *a, size_t a_size,
-                     const char *b, size_t b_size) {
+#define MOVE_COST 2
+#define CASE_COST 1
 
-    if (a_size > MAX_STRING_SIZE || b_size > MAX_STRING_SIZE) {
+#define LEAST_FIVE_BITS(n) ((n) & 31)
+
+static inline int
+substitution_cost(char a, char b)
+{
+    if (LEAST_FIVE_BITS(a) != LEAST_FIVE_BITS(b)) {
+        // Not the same, not a case flip.
+        return MOVE_COST;
+    }
+    if (a == b) {
         return 0;
     }
+    if ('A' <= a && a <= 'Z') {
+        a += ('a' - 'A');
+    }
+    if ('A' <= b && b <= 'Z') {
+        b += ('a' - 'A');
+    }
+    if (a == b) {
+        return CASE_COST;
+    }
+    return MOVE_COST;
+}
+
+/* Calculate the Levenshtein distance between string1 and string2 */
+static Py_ssize_t
+levenshtein_distance(const char *a, size_t a_size,
+                     const char *b, size_t b_size,
+                     size_t max_cost)
+{
+    static size_t buffer[MAX_STRING_SIZE];
 
     // Both strings are the same (by identity)
     if (a == b) {
         return 0;
     }
 
-    // The first string is empty
-    if (a_size == 0) {
-        return b_size;
+    // Trim away common affixes.
+    while (a_size && b_size && a[0] == b[0]) {
+        a++; a_size--;
+        b++; b_size--;
+    }
+    while (a_size && b_size && a[a_size-1] == b[b_size-1]) {
+        a_size--;
+        b_size--;
+    }
+    if (a_size == 0 || b_size == 0) {
+        return (a_size + b_size) * MOVE_COST;
+    }
+    if (a_size > MAX_STRING_SIZE || b_size > MAX_STRING_SIZE) {
+        return max_cost + 1;
     }
 
-    // The second string is empty
-    if (b_size == 0) {
-        return a_size;
+    // Prefer shorter buffer
+    if (b_size < a_size) {
+        const char *t = a; a = b; b = t;
+        size_t t_size = a_size; a_size = b_size; b_size = t_size;
     }
 
-    size_t *buffer = PyMem_Calloc(a_size, sizeof(size_t));
-    if (buffer == NULL) {
-        return -1;
+    // quick fail when a match is impossible.
+    if ((b_size - a_size) * MOVE_COST > max_cost) {
+        return max_cost + 1;
     }
 
+    // Instead of producing the whole traditional len(a)-by-len(b)
+    // matrix, we can update just one row in place.
     // Initialize the buffer row
-    size_t index = 0;
-    while (index < a_size) {
-        buffer[index] = index + 1;
-        index++;
+    for (size_t i = 0; i < a_size; i++) {
+        // cost from b[:0] to a[:i+1]
+        buffer[i] = (i + 1) * MOVE_COST;
     }
 
-    size_t b_index = 0;
     size_t result = 0;
-    while (b_index < b_size) {
+    for (size_t b_index = 0; b_index < b_size; b_index++) {
         char code = b[b_index];
-        size_t distance = result = b_index++;
-        index = SIZE_MAX;
-        while (++index < a_size) {
-            size_t b_distance = code == a[index] ? distance : distance + 1;
+        // cost(b[:b_index], a[:0]) == b_index * MOVE_COST
+        size_t distance = result = b_index * MOVE_COST;
+        size_t minimum = SIZE_MAX;
+        for (size_t index = 0; index < a_size; index++) {
+
+            // cost(b[:b_index+1], a[:index+1]) = min(
+            //     // 1) substitute
+            //     cost(b[:b_index], a[:index])
+            //         + substitution_cost(b[b_index], a[index]),
+            //     // 2) delete from b
+            //     cost(b[:b_index], a[:index+1]) + MOVE_COST,
+            //     // 3) delete from a
+            //     cost(b[:b_index+1], a[index]) + MOVE_COST
+            // )
+
+            // 1) Previous distance in this row is cost(b[:b_index], a[:index])
+            size_t substitute = distance + substitution_cost(code, a[index]);
+            // 2) cost(b[:b_index], a[:index+1]) from previous row
             distance = buffer[index];
-            if (distance > result) {
-                if (b_distance > result) {
-                    result = result + 1;
-                } else {
-                    result = b_distance;
-                }
-            } else {
-                if (b_distance > distance) {
-                    result = distance + 1;
-                } else {
-                    result = b_distance;
-                }
-            }
+            // 3) existing result is cost(b[:b_index+1], a[index])
+
+            size_t insert_delete = Py_MIN(result, distance) + MOVE_COST;
+            result = Py_MIN(insert_delete, substitute);
+
+            // cost(b[:b_index+1], a[:index+1])
             buffer[index] = result;
+            if (result < minimum) {
+                minimum = result;
+            }
+        }
+        if (minimum > max_cost) {
+            // Everything in this row is too big, so bail early.
+            return max_cost + 1;
         }
     }
-    PyMem_Free(buffer);
     return result;
 }
 
 static inline PyObject *
 calculate_suggestions(PyObject *dir,
-                      PyObject *name) {
+                      PyObject *name)
+{
     assert(!PyErr_Occurred());
     assert(PyList_CheckExact(dir));
 
@@ -83,13 +134,14 @@ calculate_suggestions(PyObject *dir,
         return NULL;
     }
 
-    Py_ssize_t suggestion_distance = PyUnicode_GetLength(name);
+    Py_ssize_t suggestion_distance = PY_SSIZE_T_MAX;
     PyObject *suggestion = NULL;
     Py_ssize_t name_size;
     const char *name_str = PyUnicode_AsUTF8AndSize(name, &name_size);
     if (name_str == NULL) {
         return NULL;
     }
+
     for (int i = 0; i < dir_size; ++i) {
         PyObject *item = PyList_GET_ITEM(dir, i);
         Py_ssize_t item_size;
@@ -97,15 +149,14 @@ calculate_suggestions(PyObject *dir,
         if (item_str == NULL) {
             return NULL;
         }
-        Py_ssize_t current_distance = levenshtein_distance(
-                name_str, name_size, item_str, item_size);
-        if (current_distance == -1) {
-            return NULL;
-        }
-        if (current_distance == 0 ||
-            current_distance > MAX_DISTANCE ||
-            current_distance * 2 > name_size)
-        {
+        // No more than 1/3 of the involved characters should need changed.
+        Py_ssize_t max_distance = (name_size + item_size + 3) * MOVE_COST / 6;
+        // Don't take matches we've already beaten.
+        max_distance = Py_MIN(max_distance, suggestion_distance - 1);
+        Py_ssize_t current_distance =
+            levenshtein_distance(name_str, name_size,
+                                 item_str, item_size, max_distance);
+        if (current_distance > max_distance) {
             continue;
         }
         if (!suggestion || current_distance < suggestion_distance) {
@@ -113,15 +164,13 @@ calculate_suggestions(PyObject *dir,
             suggestion_distance = current_distance;
         }
     }
-    if (!suggestion) {
-        return NULL;
-    }
-    Py_INCREF(suggestion);
+    Py_XINCREF(suggestion);
     return suggestion;
 }
 
 static PyObject *
-offer_suggestions_for_attribute_error(PyAttributeErrorObject *exc) {
+offer_suggestions_for_attribute_error(PyAttributeErrorObject *exc)
+{
     PyObject *name = exc->name; // borrowed reference
     PyObject *obj = exc->obj; // borrowed reference
 
@@ -142,7 +191,8 @@ offer_suggestions_for_attribute_error(PyAttributeErrorObject *exc) {
 
 
 static PyObject *
-offer_suggestions_for_name_error(PyNameErrorObject *exc) {
+offer_suggestions_for_name_error(PyNameErrorObject *exc)
+{
     PyObject *name = exc->name; // borrowed reference
     PyTracebackObject *traceback = (PyTracebackObject *) exc->traceback; // borrowed reference
     // Abort if we don't have a variable name or we have an invalid one
@@ -194,7 +244,9 @@ offer_suggestions_for_name_error(PyNameErrorObject *exc) {
 // Offer suggestions for a given exception. Returns a python string object containing the
 // suggestions. This function returns NULL if no suggestion was found or if an exception happened,
 // users must call PyErr_Occurred() to disambiguate.
-PyObject *_Py_Offer_Suggestions(PyObject *exception) {
+PyObject *
+_Py_Offer_Suggestions(PyObject *exception)
+{
     PyObject *result = NULL;
     assert(!PyErr_Occurred());
     if (Py_IS_TYPE(exception, (PyTypeObject*)PyExc_AttributeError)) {
@@ -205,3 +257,22 @@ PyObject *_Py_Offer_Suggestions(PyObject *exception) {
     return result;
 }
 
+Py_ssize_t
+_Py_UTF8_Edit_Cost(PyObject *a, PyObject *b, Py_ssize_t max_cost)
+{
+    assert(PyUnicode_Check(a) && PyUnicode_Check(b));
+    Py_ssize_t size_a, size_b;
+    const char *utf8_a = PyUnicode_AsUTF8AndSize(a, &size_a);
+    if (utf8_a == NULL) {
+        return -1;
+    }
+    const char *utf8_b = PyUnicode_AsUTF8AndSize(b, &size_b);
+    if (utf8_b == NULL) {
+        return -1;
+    }
+    if (max_cost == -1) {
+        max_cost = MOVE_COST * Py_MAX(size_a, size_b);
+    }
+    return levenshtein_distance(utf8_a, size_a, utf8_b, size_b, max_cost);
+}
+
author	Dennis Sweeney <36520290+sweeneyde@users.noreply.github.com>	2021-05-03 15:47:27 (GMT)
committer	GitHub <noreply@github.com>	2021-05-03 15:47:27 (GMT)
commit	80a2a4ed7d090fff2584302f07315d567109bca9 (patch)
tree	cfaec858b722cf629fdb488a02baeb8e823bf185 /Python
parent	c715b524210050bdd2a2e233817246d443bbb236 (diff)
download	cpython-80a2a4ed7d090fff2584302f07315d567109bca9.zip cpython-80a2a4ed7d090fff2584302f07315d567109bca9.tar.gz cpython-80a2a4ed7d090fff2584302f07315d567109bca9.tar.bz2