summary | refs | log | tree | commit | diff | stats
path: root/src/edit_distance.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/edit_distance.cc')
-rw-r--r-- src/edit_distance.cc 68
1 files changed, 68 insertions, 0 deletions
diff --git a/src/edit_distance.cc b/src/edit_distance.cc
new file mode 100644
index 0000000..fe05f64
--- /dev/null
+++ b/src/edit_distance.cc
@@ -0,0 +1,68 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "edit_distance.h"
+
+#include <vector>
+
+#include "string_piece.h"
+
+int EditDistance(const StringPiece& s1,
+                 const StringPiece& s2,
+                 bool allow_replacements,
+                 int max_edit_distance) {
+  // Row-by-row dynamic-programming Levenshtein distance; see
+  //   http://en.wikipedia.org/wiki/Levenshtein_distance
+  // The textbook formulation fills an m x n table, but each row depends
+  // only on the one above it, so two rolling rows are enough.  When
+  // allow_replacements is false a mismatch can only be paid for by an
+  // insertion or a deletion.  A non-zero max_edit_distance enables an
+  // early exit that returns max_edit_distance + 1.
+  const int rows = s1.len_;
+  const int cols = s2.len_;
+
+  std::vector<int> above(cols + 1);
+  std::vector<int> row(cols + 1);
+
+  // Row zero: turning an empty prefix of s1 into col characters of s2.
+  for (int col = 0; col <= cols; ++col)
+    above[col] = col;
+
+  for (int r = 1; r <= rows; ++r) {
+    row[0] = r;
+    int row_min = r;
+
+    for (int c = 1; c <= cols; ++c) {
+      const bool same = s1.str_[r - 1] == s2.str_[c - 1];
+      // Cheapest way to reach this cell via an insertion or a deletion.
+      const int indel = min(row[c - 1], above[c]) + 1;
+
+      int cost;
+      if (allow_replacements)
+        cost = min(above[c - 1] + (same ? 0 : 1), indel);
+      else
+        cost = same ? above[c - 1] : indel;
+
+      row[c] = cost;
+      row_min = min(row_min, cost);
+    }
+
+    // Even the best cell of this row is over the limit: stop early.
+    if (max_edit_distance != 0 && row_min > max_edit_distance)
+      return max_edit_distance + 1;
+
+    above.swap(row);
+  }
+
+  return above[cols];
+}