From d0a3c5c3735c52aa2fd4ecfb0d2c84dc9ebbb45a Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 10 Nov 2011 12:58:00 -0800 Subject: Add a EditDistance() function based on the one in llvm/lib/Support/StringRef.cpp. --- configure.py | 6 +++-- src/edit_distance.cc | 68 +++++++++++++++++++++++++++++++++++++++++++++++ src/edit_distance.h | 25 +++++++++++++++++ src/edit_distance_test.cc | 49 ++++++++++++++++++++++++++++++++++ src/string_piece.h | 2 +- 5 files changed, 147 insertions(+), 3 deletions(-) create mode 100644 src/edit_distance.cc create mode 100644 src/edit_distance.h create mode 100644 src/edit_distance_test.cc diff --git a/configure.py b/configure.py index 17afea4..be8eea6 100755 --- a/configure.py +++ b/configure.py @@ -140,8 +140,9 @@ if platform not in ('mingw'): n.newline() n.comment('Core source files all build into ninja library.') -for name in ['build', 'build_log', 'clean', 'eval_env', 'graph', 'graphviz', - 'parsers', 'util', 'stat_cache', 'disk_interface', 'state']: +for name in ['build', 'build_log', 'clean', 'edit_distance', 'eval_env', + 'graph', 'graphviz', 'parsers', 'util', 'stat_cache', + 'disk_interface', 'state']: objs += cxx(name) if platform == 'mingw': objs += cxx('subprocess-win32') @@ -176,6 +177,7 @@ for name in ['build_log_test', 'build_test', 'clean_test', 'disk_interface_test', + 'edit_distance_test', 'eval_env_test', 'graph_test', 'parsers_test', diff --git a/src/edit_distance.cc b/src/edit_distance.cc new file mode 100644 index 0000000..fe05f64 --- /dev/null +++ b/src/edit_distance.cc @@ -0,0 +1,68 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "edit_distance.h" + +#include <vector> + +#include "string_piece.h" + +int EditDistance(const StringPiece& s1, + const StringPiece& s2, + bool allow_replacements, + int max_edit_distance) { + // The algorithm implemented below is the "classic" + // dynamic-programming algorithm for computing the Levenshtein + // distance, which is described here: + // + // http://en.wikipedia.org/wiki/Levenshtein_distance + // + // Although the algorithm is typically described using an m x n + // array, only two rows are used at a time, so this implementation + // just keeps two separate vectors for those two rows. + int m = s1.len_; + int n = s2.len_; + + std::vector<int> previous(n + 1); + std::vector<int> current(n + 1); + + for (int i = 0; i <= n; ++i) + previous[i] = i; + + for (int y = 1; y <= m; ++y) { + current[0] = y; + int best_this_row = current[0]; + + for (int x = 1; x <= n; ++x) { + if (allow_replacements) { + current[x] = min(previous[x-1] + (s1.str_[y-1] == s2.str_[x-1] ? 0 : 1), + min(current[x-1], previous[x])+1); + } + else { + if (s1.str_[y-1] == s2.str_[x-1]) + current[x] = previous[x-1]; + else + current[x] = min(current[x-1], previous[x]) + 1; + } + best_this_row = min(best_this_row, current[x]); + } + + if (max_edit_distance && best_this_row > max_edit_distance) + return max_edit_distance + 1; + + current.swap(previous); + } + + return previous[n]; +} diff --git a/src/edit_distance.h b/src/edit_distance.h new file mode 100644 index 0000000..186a0d7 --- /dev/null +++ b/src/edit_distance.h @@ -0,0 +1,25 @@ +// Copyright 2011 Google Inc. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef NINJA_EDIT_DISTANCE_H_ +#define NINJA_EDIT_DISTANCE_H_ + +struct StringPiece; + +int EditDistance(const StringPiece& s1, + const StringPiece& s2, + bool allow_replacements = true, + int max_edit_distance = 0); + +#endif // NINJA_EDIT_DISTANCE_H_ diff --git a/src/edit_distance_test.cc b/src/edit_distance_test.cc new file mode 100644 index 0000000..a4c0486 --- /dev/null +++ b/src/edit_distance_test.cc @@ -0,0 +1,49 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "edit_distance.h" + +#include "string_piece.h" +#include "test.h" + +TEST(EditDistanceTest, TestEmpty) { + EXPECT_EQ(5, EditDistance("", "ninja")); + EXPECT_EQ(5, EditDistance("ninja", "")); + EXPECT_EQ(0, EditDistance("", "")); +} + +TEST(EditDistanceTest, TestMaxDistance) { + const bool allow_replacements = true; + for (int max_distance = 1; max_distance < 7; ++max_distance) { + EXPECT_EQ(max_distance + 1, + EditDistance("abcdefghijklmnop", "ponmlkjihgfedcba", + allow_replacements, max_distance)); + } +} + +TEST(EditDistanceTest, TestAllowReplacements) { + bool allow_replacements = true; + EXPECT_EQ(1, EditDistance("ninja", "njnja", allow_replacements)); + EXPECT_EQ(1, EditDistance("njnja", "ninja", allow_replacements)); + + allow_replacements = false; + EXPECT_EQ(2, EditDistance("ninja", "njnja", allow_replacements)); + EXPECT_EQ(2, EditDistance("njnja", "ninja", allow_replacements)); +} + +TEST(EditDistanceTest, TestBasics) { + EXPECT_EQ(0, EditDistance("browser_tests", "browser_tests")); + EXPECT_EQ(1, EditDistance("browser_test", "browser_tests")); + EXPECT_EQ(1, EditDistance("browser_tests", "browser_test")); +} diff --git a/src/string_piece.h b/src/string_piece.h index 0e55afb..3b94ce3 100644 --- a/src/string_piece.h +++ b/src/string_piece.h @@ -48,4 +48,4 @@ struct StringPiece { int len_; }; -#endif // NINJA_BROWSE_H_ +#endif // NINJA_STRINGPIECE_H_ -- cgit v0.12