// Copyright 2018, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Internal helper functions for finding optimal edit transformations
// between strings.

#include "gtest/gtest.h"

#include <algorithm>
#include <list>
#include <map>
#include <queue>  // NOLINT
#include <sstream>
#include <string>
#include <utility>
#include <vector>

namespace testing {
namespace internal {
namespace {

// The following implement data structures and code for a Dijkstra-search
// based implementation of optimal edit distance.

// Possible states a node can be in. Either a node is unsettled (it hasn't
// been drawn from the priority queue yet), or it is settled and a back-link
// to its parent node is fixed.
enum EditSearchState {
  kUnsettled,
  kMatchParent,
  kAddParent,
  kRemoveParent,
  kReplaceParent
};

// Custom container for search states. This is smaller and faster than a hash
// map, because the used states are dense along diagonals.
// Specifically, each state requires only 1 byte, whereas a hash_map would
// require storing the key, which would come to at least 8 bytes. std::map has
// an extra 32 bytes per node (3 pointers + 1 byte, padded), so even though
// there are circumstances where this class can have kBlockSize overhead per
// state, on average it does better than 40 bytes of overhead per state.
// In addition, in unoptimized builds (the usual way tests are run) the fewer
// allocations and better locality make this class run 10-50x faster than
// std::map for inputs that are large enough to measure.
class EditSearchMap {
 public:
  EditSearchMap(size_t left_size, size_t right_size)
      : left_size_(left_size), right_size_(right_size) {
    GTEST_CHECK_(left_size_ == left_size && right_size_ == right_size)
        << "Overflow in size: Arguments too large";
  }

  // Gets a mutable reference to a state - this is actually of type
  // EditSearchState - inserting if it does not exist.
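  //
  // Worked example (added for illustration; the values follow the index math
  // in the body below): states are stored per diagonal, keyed by
  // |left - right|. With left_size_ = right_size_ = 4, the state
  // (left=3, right=1) lands in left_nodes_ at diagonal index 3 - 1 - 1 = 1,
  // offset 1, while the state (left=1, right=3) lands in right_nodes_ at
  // diagonal index 3 - 1 = 2, offset 1.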
  unsigned char& insert(UInt32 left, UInt32 right) {
    std::vector<UInt32>* vec;
    size_t index1;
    size_t index2;
    if (left > right) {
      vec = &left_nodes_;
      index1 = left - right - 1;
      index2 = right;
    } else {
      vec = &right_nodes_;
      index1 = right - left;
      index2 = left;
    }
    if (vec->size() <= index1) {
      GTEST_CHECK_(vec->size() == index1)
          << "Array diagonals should only grow by one " << vec->size()
          << " vs " << index1;
      vec->push_back(block_indices_.size());
      // Round up.
      block_indices_.resize(
          block_indices_.size() +
              (DiagonalLength(left, right) + kBlockSize - 1) / kBlockSize,
          kUnallocatedBlock);
    }
    const size_t bucket = index2 / kBlockSize;
    const size_t pos_in_bucket = index2 % kBlockSize;
    UInt32& level2 = block_indices_[(*vec)[index1] + bucket];
    if (level2 == kUnallocatedBlock) {
      level2 = nodes_.size();
      size_t diagonal_length = DiagonalLength(left, right);
      GTEST_CHECK_(diagonal_length > index2) << diagonal_length << " "
                                             << index2;
      size_t block_size = kBlockSize;
      if (diagonal_length / kBlockSize == bucket) {
        // We can never get here if diagonal_length is a multiple of
        // kBlockSize, which is what we want, since this would evaluate to 0.
        block_size = diagonal_length % kBlockSize;
      }
      nodes_.resize(nodes_.size() + block_size);
    }
    return nodes_[level2 + pos_in_bucket];
  }

  size_t MemoryUsage() const {
    return nodes_.capacity() +
           sizeof(UInt32) * (left_nodes_.capacity() + right_nodes_.capacity() +
                             block_indices_.capacity());
  }

 private:
  enum { kBlockSize = 1024, kUnallocatedBlock = 0xFFFFFFFFul };

  size_t DiagonalLength(UInt32 left, UInt32 right) const {
    return std::min(left_size_ - left, right_size_ - right) +
           (left < right ? left : right);
  }

  // The state space is conceptually a left_size_ by right_size_ sparse matrix
  // of EditSearchStates. However, due to the access pattern of the search, it
  // is much better to store the nodes per diagonal rather than per row.
  UInt32 left_size_;
  UInt32 right_size_;

  // The nodes are stored by diagonals, split in two: Those to the left of the
  // main diagonal are in left_nodes_, and everything else is in right_nodes_.
  // The values are indices into block_indices_.
  std::vector<UInt32> left_nodes_;
  std::vector<UInt32> right_nodes_;

  // Every entry here is an offset into the beginning of a kBlockSize-sized
  // block in nodes_. An entire diagonal is allocated together here; for a
  // diagonal of length <= kBlockSize, that's just a single entry, but for
  // longer diagonals multiple contiguous index entries will be reserved at
  // once. Unused entries will be assigned kUnallocatedBlock; this
  // double-indirect scheme is used to save memory in the cases when an entire
  // diagonal isn't needed.
  std::vector<UInt32> block_indices_;

  // This stores the actual EditSearchState data, pointed to by
  // block_indices_.
  std::vector<unsigned char> nodes_;
};

struct EditHeapEntry {
  EditHeapEntry(UInt32 l, UInt32 r, UInt64 c, EditSearchState s)
      : left(l), right(r), cost(c), state(s) {}
  UInt32 left;
  UInt32 right;
  UInt64 cost : 61;
  // The state that the node will get when this entry is settled. Therefore,
  // this can never be kUnsettled.
  UInt64 state : 3;

  bool operator>(const EditHeapEntry& other) const {
    return cost > other.cost;
  }
};

// Need a min-queue, so invert the comparator.
typedef std::priority_queue<EditHeapEntry, std::vector<EditHeapEntry>,
                            std::greater<EditHeapEntry> > EditHeap;

}  // namespace
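// Note added for illustration (not part of the original documentation): for
// inputs left = {1, 2, 3} and right = {1, 4, 3}, the cheapest path through
// the search graph below is match, replace, match, so the function returns
// {kEditMatch, kEditReplace, kEditMatch}; the competing add + remove pair
// costs kTowardsGoalCost + kAwayFromGoalCost = 2 * kBaseCost, which loses to
// the single kReplaceCost edge.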
std::vector<EditType> CalculateOptimalEdits(const std::vector<size_t>& left,
                                            const std::vector<size_t>& right,
                                            size_t* memory_usage) {
  const UInt64 kBaseCost = 100000;
  // We make replaces a little more expensive than adds/removes to lower
  // their priority.
  const UInt64 kReplaceCost = 100001;
  // In the common case where the vectors are the same (or almost the same)
  // size, we know that an add will have to be followed by some later remove
  // (or vice versa) in order to get the lengths to balance. We "borrow" some
  // of the cost of the later operation and bring it forward into the earlier
  // operation, to increase the cost of exploring (usually fruitlessly) around
  // the beginning of the graph.
  // However, there is a trade-off: This cheapens the cost of exploring around
  // the beginning of the graph (in one direction) when the vectors are
  // unequal in length. So we don't steal *all* the cost.
  // You can view this as a form of A*, using an admissible heuristic that has
  // been re-cast as a cost function that can be used in Dijkstra.
  const UInt64 kTowardsGoalCost = 50003;
  const UInt64 kAwayFromGoalCost = 2 * kBaseCost - kTowardsGoalCost;

  EditSearchMap node_map(left.size() + 1, right.size() + 1);
  EditHeap heap;
  heap.push(EditHeapEntry(0, 0, 0, kReplaceParent));
  while (!heap.empty()) {
    const EditHeapEntry current_entry = heap.top();
    heap.pop();
    UInt32 left_pos = current_entry.left;
    UInt32 right_pos = current_entry.right;
    unsigned char& current_state = node_map.insert(left_pos, right_pos);
    if (current_state != kUnsettled) {
      // The node was already settled by a previous entry in the priority
      // queue, so this is a suboptimal path that should be ignored.
      continue;
    }
    current_state = current_entry.state;

    if (left_pos == left.size() && right_pos == right.size()) {
      // This is the normal exit point; if we terminate due to the heap being
      // empty, we'll fail a check later.
      break;
    }

    // Special case: Since the cost of a match is zero, we can immediately
    // settle the new node without putting it in the queue, since nothing can
    // have a smaller cost than it. Furthermore, we don't need to relax the
    // other two edges, since we know we don't need them: Any path from this
    // node that would use them has a path via the match that is at least as
    // cheap. Together, this means we can loop here until we stop matching.
    while (left_pos < left.size() && right_pos < right.size() &&
           left[left_pos] == right[right_pos]) {
      left_pos++;
      right_pos++;
      unsigned char& fast_forward_state = node_map.insert(left_pos, right_pos);
      if (fast_forward_state != kUnsettled) {
        // The search reached around and settled this node before settling the
        // base node. This means we're completely done with this iteration;
        // abort to the outer loop.
        goto outer_loop_bottom;
      }
      // Otherwise, we can settle this node, even if it was created from
      // another state - we know the cost of settling it now is optimal.
      fast_forward_state = kMatchParent;
    }

    // Relax adjacent nodes. We have no way to find or lower the cost of
    // existing entries in the heap, so we just push new entries and throw
    // them out at the top if the node is already settled. We *could* check to
    // see if they're already settled before pushing, but it turns out to be
    // no faster, and more complicated to do so.
    //
    // If we're at an edge, there's only one node to relax.
    if (left_pos >= left.size()) {
      if (right_pos >= right.size()) {
        break;  // Can happen due to the fast-path loop above.
      }
      heap.push(EditHeapEntry(left_pos, right_pos + 1,
                              current_entry.cost + kTowardsGoalCost,
                              kAddParent));
      continue;
    }
    if (right_pos >= right.size()) {
      heap.push(EditHeapEntry(left_pos + 1, right_pos,
                              current_entry.cost + kTowardsGoalCost,
                              kRemoveParent));
      continue;
    }

    // General case: Relax 3 edges.
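    // Note on the cost bias below (explanatory comment added):
    // "right.size() + left_pos > right_pos + left.size()" is a rearrangement
    // of "right.size() - right_pos > left.size() - left_pos", i.e. an add is
    // charged the cheaper kTowardsGoalCost only when more of the right input
    // than of the left input remains to be consumed; the mirrored condition
    // does the same for removes.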
    heap.push(EditHeapEntry(
        left_pos, right_pos + 1,
        current_entry.cost + (right.size() + left_pos > right_pos + left.size()
                                  ? kTowardsGoalCost
                                  : kAwayFromGoalCost),
        kAddParent));
    heap.push(EditHeapEntry(
        left_pos + 1, right_pos,
        current_entry.cost + (right.size() + left_pos < right_pos + left.size()
                                  ? kTowardsGoalCost
                                  : kAwayFromGoalCost),
        kRemoveParent));
    heap.push(EditHeapEntry(left_pos + 1, right_pos + 1,
                            current_entry.cost + kReplaceCost,
                            kReplaceParent));

  outer_loop_bottom: {}  // Need the curlies to form a statement.
  }

  // Reconstruct the best path. We do it in reverse order.
  std::vector<EditType> best_path;
  UInt32 left_pos = left.size();
  UInt32 right_pos = right.size();
  while (left_pos != 0 || right_pos != 0) {
    GTEST_CHECK_(left_pos <= left.size() && right_pos <= right.size());
    // The node must already exist, but if it somehow doesn't, it will be
    // added as kUnsettled, which will crash below.
    const unsigned char state = node_map.insert(left_pos, right_pos);
    switch (state) {
      case kAddParent:
        right_pos--;
        break;
      case kRemoveParent:
        left_pos--;
        break;
      case kMatchParent:
      case kReplaceParent:
        left_pos--;
        right_pos--;
        break;
      default:
        GTEST_LOG_(FATAL) << "Unsettled node at " << left_pos << ","
                          << right_pos;
    }
    best_path.push_back(static_cast<EditType>(state - 1));
  }
  std::reverse(best_path.begin(), best_path.end());

  if (memory_usage != NULL) {
    *memory_usage = node_map.MemoryUsage();
  }
  return best_path;
}

namespace {

// Helper class to convert strings into ids with deduplication.
class InternalStrings {
 public:
  size_t GetId(const std::string* str) {
    IdMap::iterator it = ids_.find(str);
    if (it != ids_.end()) return it->second;
    size_t id = ids_.size();
    return ids_[str] = id;
  }

 private:
  struct IdMapCmp {
    bool operator()(const std::string* first,
                    const std::string* second) const {
      return *first < *second;
    }
  };
  typedef std::map<const std::string*, size_t, IdMapCmp> IdMap;
  IdMap ids_;
};

}  // namespace

std::vector<EditType> CalculateOptimalEdits(
    const std::vector<std::string>& left,
    const std::vector<std::string>& right) {
  std::vector<size_t> left_ids, right_ids;
  {
    InternalStrings intern_table;
    for (size_t i = 0; i < left.size(); ++i) {
      left_ids.push_back(intern_table.GetId(&left[i]));
    }
    for (size_t i = 0; i < right.size(); ++i) {
      right_ids.push_back(intern_table.GetId(&right[i]));
    }
  }
  return CalculateOptimalEdits(left_ids, right_ids, NULL);
}

namespace {

// Helper class that holds the state for one hunk and prints it out to the
// stream.
// It reorders adds/removes when possible to group all removes before all
// adds. It also adds the hunk header before printing into the stream.
class Hunk {
 public:
  Hunk(size_t left_start, size_t right_start)
      : left_start_(left_start),
        right_start_(right_start),
        adds_(),
        removes_(),
        common_() {}

  void PushLine(char edit, const char* line) {
    switch (edit) {
      case ' ':
        ++common_;
        FlushEdits();
        hunk_.push_back(std::make_pair(' ', line));
        break;
      case '-':
        ++removes_;
        hunk_removes_.push_back(std::make_pair('-', line));
        break;
      case '+':
        ++adds_;
        hunk_adds_.push_back(std::make_pair('+', line));
        break;
    }
  }

  void PrintTo(std::ostream* os) {
    PrintHeader(os);
    FlushEdits();
    for (std::list<std::pair<char, const char*> >::const_iterator it =
             hunk_.begin();
         it != hunk_.end(); ++it) {
      *os << it->first << it->second << "\n";
    }
  }

  bool has_edits() const { return adds_ || removes_; }

 private:
  void FlushEdits() {
    hunk_.splice(hunk_.end(), hunk_removes_);
    hunk_.splice(hunk_.end(), hunk_adds_);
  }

  // Print a unified diff header for one hunk.
  // The format is
  //   "@@ -<left_start>,<left_length> +<right_start>,<right_length> @@"
  // where the left/right parts are omitted if unnecessary.
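  // For example (added illustration): a hunk that replaces a single line at
  // line 4, with one line of context on each side, has removes_ = adds_ = 1
  // and common_ = 2, and is headed "@@ -3,3 +3,3 @@".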
  void PrintHeader(std::ostream* ss) const {
    *ss << "@@ ";
    if (removes_) {
      *ss << "-" << left_start_ << "," << (removes_ + common_);
    }
    if (removes_ && adds_) {
      *ss << " ";
    }
    if (adds_) {
      *ss << "+" << right_start_ << "," << (adds_ + common_);
    }
    *ss << " @@\n";
  }

  size_t left_start_, right_start_;
  size_t adds_, removes_, common_;
  std::list<std::pair<char, const char*> > hunk_, hunk_adds_, hunk_removes_;
};

}  // namespace

// Create a list of diff hunks in Unified diff format.
// Each hunk has a header generated by PrintHeader above plus a body with
// lines prefixed with ' ' for no change, '-' for deletion and '+' for
// addition.
// 'context' represents the desired unchanged prefix/suffix around the diff.
// If two hunks are close enough that their contexts overlap, then they are
// joined into one hunk.
std::string CreateUnifiedDiff(const std::vector<std::string>& left,
                              const std::vector<std::string>& right,
                              size_t context) {
  const std::vector<EditType> edits = CalculateOptimalEdits(left, right);

  size_t l_i = 0, r_i = 0, edit_i = 0;
  std::stringstream ss;
  while (edit_i < edits.size()) {
    // Find first edit.
    while (edit_i < edits.size() && edits[edit_i] == kEditMatch) {
      ++l_i;
      ++r_i;
      ++edit_i;
    }

    // Find the first line to include in the hunk.
    const size_t prefix_context = std::min(l_i, context);
    Hunk hunk(l_i - prefix_context + 1, r_i - prefix_context + 1);
    for (size_t i = prefix_context; i > 0; --i) {
      hunk.PushLine(' ', left[l_i - i].c_str());
    }

    // Iterate the edits until we have found enough suffix for the hunk or the
    // input is over.
    size_t n_suffix = 0;
    for (; edit_i < edits.size(); ++edit_i) {
      if (n_suffix >= context) {
        // Continue only if the next hunk is very close.
        std::vector<EditType>::const_iterator it = edits.begin() + edit_i;
        while (it != edits.end() && *it == kEditMatch) ++it;
        if (it == edits.end() ||
            static_cast<size_t>(it - edits.begin()) - edit_i >= context) {
          // There is no next edit or it is too far away.
          break;
        }
      }

      EditType edit = edits[edit_i];
      // Reset count when a non-match is found.
      n_suffix = edit == kEditMatch ? n_suffix + 1 : 0;

      if (edit == kEditMatch || edit == kEditRemove || edit == kEditReplace) {
        hunk.PushLine(edit == kEditMatch ? ' ' : '-', left[l_i].c_str());
      }
      if (edit == kEditAdd || edit == kEditReplace) {
        hunk.PushLine('+', right[r_i].c_str());
      }

      // Advance indices, depending on edit type.
      l_i += edit != kEditAdd;
      r_i += edit != kEditRemove;
    }

    if (!hunk.has_edits()) {
      // We are done. We don't want this hunk.
      break;
    }

    hunk.PrintTo(&ss);
  }
  return ss.str();
}

}  // namespace internal
}  // namespace testing