diff options
Diffstat (limited to 'googletest/src/gtest-edit-distance.cc')
-rw-r--r-- | googletest/src/gtest-edit-distance.cc | 507 |
1 files changed, 0 insertions, 507 deletions
diff --git a/googletest/src/gtest-edit-distance.cc b/googletest/src/gtest-edit-distance.cc deleted file mode 100644 index 331a3b9..0000000 --- a/googletest/src/gtest-edit-distance.cc +++ /dev/null @@ -1,507 +0,0 @@ -// Copyright 2018, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// -// Internal helper functions for finding optimal edit transformations -// between strings. - -#include "gtest/gtest.h" - -#include <functional> -#include <list> -#include <ostream> // NOLINT -#include <queue> -#include <vector> - -namespace testing { -namespace internal { -namespace { - -// The following implement data structures and code for a Dijkstra-search -// based implementation of optimal edit distance. - -// Posible states a node can be in. Either a node is unsettled (it hasn't been -// drawn from the priority queue yet), or it is settled and a back-link to its -// parent node is fixed. -enum EditSearchState { - kUnsettled, - kMatchParent, - kAddParent, - kRemoveParent, - kReplaceParent -}; - -// Custom container for search states. This is smaller and faster than a hash -// map, because the used states are dense along diagonals. -// Specifically, each state requires only 1 byte, whereas a hash_map would -// require storing the key, which would come to at least 8 bytes. std::map has -// an extra 32 bytes per node (3 pointers + 1 byte, padded), so even though -// there are circumstances where this class can have kBlockSize overhead per -// state, on average it does better than 40 bytes of overhead per state. -// In addition, in unopt builds (the usual way tests are run) the fewer -// allocations + better locality has this method running 10-50x faster than -// std::map for inputs that are large enough to measure. -class EditSearchMap { - public: - EditSearchMap(size_t left_size, size_t right_size) - : left_size_(left_size), right_size_(right_size) { - GTEST_CHECK_(left_size_ == left_size && right_size_ == right_size) - << "Overflow in size: Arguments too large"; - } - - // Gets a mutable reference to a state - this is actually of type - // EditSearchState - inserting if it does not exist. - unsigned char& insert(UInt32 left, UInt32 right) { - std::vector<UInt32>* vec; - size_t index1; - size_t index2; - if (left > right) { - vec = &left_nodes_; - index1 = left - right - 1; - index2 = right; - } else { - vec = &right_nodes_; - index1 = right - left; - index2 = left; - } - if (vec->size() <= index1) { - GTEST_CHECK_(vec->size() == index1) - << "Array diagonals should only grow by one " << vec->size() << " vs " - << index1; - vec->push_back(block_indices_.size()); - // Round up - block_indices_.resize( - block_indices_.size() + - (DiagonalLength(left, right) + kBlockSize - 1) / kBlockSize, - kUnallocatedBlock); - } - const size_t bucket = index2 / kBlockSize; - const size_t pos_in_bucket = index2 % kBlockSize; - UInt32& level2 = block_indices_[(*vec)[index1] + bucket]; - if (level2 == kUnallocatedBlock) { - level2 = nodes_.size(); - size_t diagonal_length = DiagonalLength(left, right); - GTEST_CHECK_(diagonal_length > index2) - << diagonal_length << " " << index2; - size_t block_size = kBlockSize; - if (diagonal_length / kBlockSize == bucket) { - // We can never get here if diagonal_length is a multiple of - // kBlockSize, which is what we want, since this would evaluate to 0. - block_size = diagonal_length % kBlockSize; - } - nodes_.resize(nodes_.size() + block_size); - } - return nodes_[level2 + pos_in_bucket]; - } - - size_t MemoryUsage() const { - return nodes_.capacity() + - sizeof(UInt32) * (left_nodes_.capacity() + right_nodes_.capacity() + - block_indices_.capacity()); - } - - private: - enum { kBlockSize = 1024, kUnallocatedBlock = 0xFFFFFFFFul }; - - size_t DiagonalLength(UInt32 left, UInt32 right) const { - return std::min(left_size_ - left, right_size_ - right) + - (left < right ? left : right); - } - - // The state space is conceptually a left_size_ by right_size_ sparse matrix - // of EditSearchStates. However, due to the access pattern of the search, it - // is much better to store the nodes per diagonal rather than per row. - UInt32 left_size_; - UInt32 right_size_; - // The nodes are stored by diagonals, split in two: Those to the left of the - // main diagonal are in left_nodes_, and everything else is in right_nodes_. - // The values are indices into block_indices_. - std::vector<UInt32> left_nodes_; - std::vector<UInt32> right_nodes_; - // Every entry here is an offset into the beginning of a kBlockSize-sized - // block in nodes_. An entire diagonal is allocated together here; for a - // diagonal of length <= kBlockSize, that's just a single entry, but for - // longer diagonals multiple contiguous index entries will be reserved at - // once. Unused entries will be assigned kUnallocatedBlock; this - // double-indirect scheme is used to save memory in the cases when an entire - // diagonal isn't needed. - std::vector<UInt32> block_indices_; - // This stores the actual EditSearchState data, pointed to by block_indices_. - std::vector<unsigned char> nodes_; -}; - -struct EditHeapEntry { - EditHeapEntry(UInt32 l, UInt32 r, UInt64 c, EditSearchState s) - : left(l), right(r), cost(c), state(s) {} - - UInt32 left; - UInt32 right; - UInt64 cost : 61; - // The state that the node will get when this entry is settled. Therefore, - // this can never be kUnsettled. - UInt64 state : 3; - - bool operator>(const EditHeapEntry& other) const { return cost > other.cost; } -}; - -// Need a min-queue, so invert the comparator. -typedef std::priority_queue<EditHeapEntry, std::vector<EditHeapEntry>, - std::greater<EditHeapEntry>> - EditHeap; - -} // namespace - -std::vector<EditType> CalculateOptimalEdits(const std::vector<size_t>& left, - const std::vector<size_t>& right, - size_t* memory_usage) { - const UInt64 kBaseCost = 100000; - // We make replace a little more expensive than add/remove to lower - // their priority. - const UInt64 kReplaceCost = 100001; - // In the common case where the vectors are the same (or almost the same) - // size, we know that an add will have to be followed by some later remove - // (or vice versa) in order to get the lengths to balance. We "borrow" some - // of the cost of the later operation and bring it forward into the earlier - // operation, to increase the cost of exploring (usually fruitlessly) around - // the beginning of the graph. - // However, there is a trade-off: This cheapens the cost of exploring around - // the beginning of the graph (in one direction) when the vectors are - // unequal in length. So we don't steal *all* the cost. - // You can view this as a form of A*, using an admissable heuristic that has - // been re-cast as a cost function that can be used in Dijkstra. - const UInt64 kTowardsGoalCost = 50003; - const UInt64 kAwayFromGoalCost = 2 * kBaseCost - kTowardsGoalCost; - - EditSearchMap node_map(left.size() + 1, right.size() + 1); - EditHeap heap; - heap.push(EditHeapEntry(0, 0, 0, kReplaceParent)); - - while (!heap.empty()) { - const EditHeapEntry current_entry = heap.top(); - heap.pop(); - - UInt32 left_pos = current_entry.left; - UInt32 right_pos = current_entry.right; - unsigned char& current_state = node_map.insert(left_pos, right_pos); - if (current_state != kUnsettled) { - // Node was already settled by a previous entry in the priority queue, - // this is a suboptimal path that should be ignored. - continue; - } - current_state = current_entry.state; - - if (left_pos == left.size() && right_pos == right.size()) { - // This is the normal exit point; if we terminate due to the heap being - // empty, we'll fail a check later. - break; - } - - // Special case: Since the cost of a match is zero, we can immediately - // settle the new node without putting it in the queue, since nothing can - // have a smaller cost than it. Furthermore, we don't need to relax the - // other two edges, since we know we don't need them: Any path from this - // node that would use them has an path via the match that is at least as - // cheap. Together, this means we can loop here until we stop matching. - while (left_pos < left.size() && right_pos < right.size() && - left[left_pos] == right[right_pos]) { - left_pos++; - right_pos++; - unsigned char& fast_forward_state = node_map.insert(left_pos, right_pos); - if (fast_forward_state != kUnsettled) { - // The search reached around and settled this node before settling the - // base node. This means we're completely done with this iteration; - // abort to the outer loop. - goto outer_loop_bottom; - // Otherwise, when can settle this node, even if it was created from - // another state - we know the cost of settling it now is optimal. - } - fast_forward_state = kMatchParent; - } - - // Relax adjacent nodes. We have no way to find or lower the cost of - // existing entries in the heap, so we just push new entries and throw - // them out at the top if the node is already settled. We *could* check to - // see if they're already settled before pushing, but it turns out to be - // ~not any faster, and more complicated to do so. - // - // If we're at an edge, there's only one node to relax. - if (left_pos >= left.size()) { - if (right_pos >= right.size()) { - break; // Can happen due to the fast-path loop above. - } - heap.push(EditHeapEntry(left_pos, right_pos + 1, - current_entry.cost + kTowardsGoalCost, - kAddParent)); - continue; - } - if (right_pos >= right.size()) { - heap.push(EditHeapEntry(left_pos + 1, right_pos, - current_entry.cost + kTowardsGoalCost, - kRemoveParent)); - continue; - } - // General case: Relax 3 edges. - heap.push(EditHeapEntry( - left_pos, right_pos + 1, - current_entry.cost + (right.size() + left_pos > right_pos + left.size() - ? kTowardsGoalCost - : kAwayFromGoalCost), - kAddParent)); - heap.push(EditHeapEntry( - left_pos + 1, right_pos, - current_entry.cost + (right.size() + left_pos < right_pos + left.size() - ? kTowardsGoalCost - : kAwayFromGoalCost), - kRemoveParent)); - heap.push(EditHeapEntry(left_pos + 1, right_pos + 1, - current_entry.cost + kReplaceCost, kReplaceParent)); - outer_loop_bottom : {} // Need the curlies to form a statement. - } - - // Reconstruct the best path. We do it in reverse order. - std::vector<EditType> best_path; - UInt32 left_pos = left.size(); - UInt32 right_pos = right.size(); - while (left_pos != 0 || right_pos != 0) { - GTEST_CHECK_(left_pos <= left.size() && right_pos <= right.size()); - // The node must already exist, but if it somehow doesn't, it will be - // added as kUnsettled, which will crash below. - const unsigned char state = node_map.insert(left_pos, right_pos); - switch (state) { - case kAddParent: - right_pos--; - break; - case kRemoveParent: - left_pos--; - break; - case kMatchParent: - case kReplaceParent: - left_pos--; - right_pos--; - break; - default: - GTEST_LOG_(FATAL) << "Unsettled node at " << left_pos << "," - << right_pos; - } - best_path.push_back(static_cast<EditType>(state - 1)); - } - std::reverse(best_path.begin(), best_path.end()); - if (memory_usage != NULL) { - *memory_usage = node_map.MemoryUsage(); - } - return best_path; -} - -namespace { - -// Helper class to convert string into ids with deduplication. -class InternalStrings { - public: - size_t GetId(const std::string* str) { - IdMap::iterator it = ids_.find(str); - if (it != ids_.end()) return it->second; - size_t id = ids_.size(); - return ids_[str] = id; - } - - private: - struct IdMapCmp { - bool operator()(const std::string* first, const std::string* second) const { - return *first < *second; - } - }; - typedef std::map<const std::string*, size_t, IdMapCmp> IdMap; - IdMap ids_; -}; - -} // namespace - -std::vector<EditType> CalculateOptimalEdits( - const std::vector<std::string>& left, - const std::vector<std::string>& right) { - std::vector<size_t> left_ids, right_ids; - { - InternalStrings intern_table; - for (size_t i = 0; i < left.size(); ++i) { - left_ids.push_back(intern_table.GetId(&left[i])); - } - for (size_t i = 0; i < right.size(); ++i) { - right_ids.push_back(intern_table.GetId(&right[i])); - } - } - return CalculateOptimalEdits(left_ids, right_ids); -} - -namespace { - -// Helper class that holds the state for one hunk and prints it out to the -// stream. -// It reorders adds/removes when possible to group all removes before all -// adds. It also adds the hunk header before printing into the stream. -class Hunk { - public: - Hunk(size_t left_start, size_t right_start) - : left_start_(left_start), - right_start_(right_start), - adds_(), - removes_(), - common_() {} - - void PushLine(char edit, const char* line) { - switch (edit) { - case ' ': - ++common_; - FlushEdits(); - hunk_.push_back(std::make_pair(' ', line)); - break; - case '-': - ++removes_; - hunk_removes_.push_back(std::make_pair('-', line)); - break; - case '+': - ++adds_; - hunk_adds_.push_back(std::make_pair('+', line)); - break; - } - } - - void PrintTo(std::ostream* os) { - PrintHeader(os); - FlushEdits(); - for (std::list<std::pair<char, const char*> >::const_iterator it = - hunk_.begin(); - it != hunk_.end(); ++it) { - *os << it->first << it->second << "\n"; - } - } - - bool has_edits() const { return adds_ || removes_; } - - private: - void FlushEdits() { - hunk_.splice(hunk_.end(), hunk_removes_); - hunk_.splice(hunk_.end(), hunk_adds_); - } - - // Print a unified diff header for one hunk. - // The format is - // "@@ -<left_start>,<left_length> +<right_start>,<right_length> @@" - // where the left/right parts are omitted if unnecessary. - void PrintHeader(std::ostream* ss) const { - *ss << "@@ "; - if (removes_) { - *ss << "-" << left_start_ << "," << (removes_ + common_); - } - if (removes_ && adds_) { - *ss << " "; - } - if (adds_) { - *ss << "+" << right_start_ << "," << (adds_ + common_); - } - *ss << " @@\n"; - } - - size_t left_start_, right_start_; - size_t adds_, removes_, common_; - std::list<std::pair<char, const char*> > hunk_, hunk_adds_, hunk_removes_; -}; - -} // namespace - -// Create a list of diff hunks in Unified diff format. -// Each hunk has a header generated by PrintHeader above plus a body with -// lines prefixed with ' ' for no change, '-' for deletion and '+' for -// addition. -// 'context' represents the desired unchanged prefix/suffix around the diff. -// If two hunks are close enough that their contexts overlap, then they are -// joined into one hunk. -std::string CreateUnifiedDiff(const std::vector<std::string>& left, - const std::vector<std::string>& right, - size_t context) { - const std::vector<EditType> edits = CalculateOptimalEdits(left, right); - - size_t l_i = 0, r_i = 0, edit_i = 0; - std::stringstream ss; - while (edit_i < edits.size()) { - // Find first edit. - while (edit_i < edits.size() && edits[edit_i] == kEditMatch) { - ++l_i; - ++r_i; - ++edit_i; - } - - // Find the first line to include in the hunk. - const size_t prefix_context = std::min(l_i, context); - Hunk hunk(l_i - prefix_context + 1, r_i - prefix_context + 1); - for (size_t i = prefix_context; i > 0; --i) { - hunk.PushLine(' ', left[l_i - i].c_str()); - } - - // Iterate the edits until we found enough suffix for the hunk or the input - // is over. - size_t n_suffix = 0; - for (; edit_i < edits.size(); ++edit_i) { - if (n_suffix >= context) { - // Continue only if the next hunk is very close. - std::vector<EditType>::const_iterator it = edits.begin() + edit_i; - while (it != edits.end() && *it == kEditMatch) ++it; - if (it == edits.end() || (it - edits.begin()) - edit_i >= context) { - // There is no next edit or it is too far away. - break; - } - } - - EditType edit = edits[edit_i]; - // Reset count when a non match is found. - n_suffix = edit == kEditMatch ? n_suffix + 1 : 0; - - if (edit == kEditMatch || edit == kEditRemove || edit == kEditReplace) { - hunk.PushLine(edit == kEditMatch ? ' ' : '-', left[l_i].c_str()); - } - if (edit == kEditAdd || edit == kEditReplace) { - hunk.PushLine('+', right[r_i].c_str()); - } - - // Advance indices, depending on edit type. - l_i += edit != kEditAdd; - r_i += edit != kEditRemove; - } - - if (!hunk.has_edits()) { - // We are done. We don't want this hunk. - break; - } - - hunk.PrintTo(&ss); - } - return ss.str(); -} - -} // namespace internal -} // namespace testing |