Googletest export

Fix Theta(N^2) memory usage of EXPECT_EQ(string) when the strings don't match. The underlying CalculateOptimalEdits() implementation used a simple dynamic-programming approach that always used N^2 memory and time. This meant that tests for equality of large strings were ticking time bombs: They'd work fine as long as the test passed, but as soon as the strings differed the test would OOM, which is very hard to debug. I switched it out for a Dijkstra search, which is still worst-case O(N^2), but in the usual case of mostly-matching strings, it is much closer to linear. PiperOrigin-RevId: 210405025
author: Abseil Team <absl-team@google.com> 2018-08-27 18:53:25 (GMT)
committer: Gennadiy Civil <misterg@google.com> 2018-08-28 20:53:30 (GMT)
commit: 167c5e8188beb5dae002ac7571457e3c26eb6a3f (patch)
tree: 3ac7d65d7a5d9eb4be3883cd3032d29cd93e1e21 /googletest/src/gtest-edit-distance.cc
parent: 1bb76182caee8239b71b9d6d21f479014d37ad5b (diff)
download: googletest-167c5e8188beb5dae002ac7571457e3c26eb6a3f.zip
googletest-167c5e8188beb5dae002ac7571457e3c26eb6a3f.tar.gz
googletest-167c5e8188beb5dae002ac7571457e3c26eb6a3f.tar.bz2
1 files changed, 507 insertions, 0 deletions
diff --git a/googletest/src/gtest-edit-distance.cc b/googletest/src/gtest-edit-distance.cc
new file mode 100644
index 0000000..331a3b9
--- /dev/null
+++ b/googletest/src/gtest-edit-distance.cc
@@ -0,0 +1,507 @@
+// Copyright 2018, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// Internal helper functions for finding optimal edit transformations
+// between strings.
+
+#include "gtest/gtest.h"
+
+#include <functional>
+#include <list>
+#include <ostream>  // NOLINT
+#include <queue>
+#include <vector>
+
+namespace testing {
+namespace internal {
+namespace {
+
+// The following implement data structures and code for a Dijkstra-search
+// based implementation of optimal edit distance.
+
+// Posible states a node can be in. Either a node is unsettled (it hasn't been
+// drawn from the priority queue yet), or it is settled and a back-link to its
+// parent node is fixed.
+enum EditSearchState {
+  kUnsettled,
+  kMatchParent,
+  kAddParent,
+  kRemoveParent,
+  kReplaceParent
+};
+
+// Custom container for search states. This is smaller and faster than a hash
+// map, because the used states are dense along diagonals.
+// Specifically, each state requires only 1 byte, whereas a hash_map would
+// require storing the key, which would come to at least 8 bytes. std::map has
+// an extra 32 bytes per node (3 pointers + 1 byte, padded), so even though
+// there are circumstances where this class can have kBlockSize overhead per
+// state, on average it does better than 40 bytes of overhead per state.
+// In addition, in unopt builds (the usual way tests are run) the fewer
+// allocations + better locality has this method running 10-50x faster than
+// std::map for inputs that are large enough to measure.
+class EditSearchMap {
+ public:
+  EditSearchMap(size_t left_size, size_t right_size)
+      : left_size_(left_size), right_size_(right_size) {
+    GTEST_CHECK_(left_size_ == left_size && right_size_ == right_size)
+        << "Overflow in size: Arguments too large";
+  }
+
+  // Gets a mutable reference to a state - this is actually of type
+  // EditSearchState - inserting if it does not exist.
+  unsigned char& insert(UInt32 left, UInt32 right) {
+    std::vector<UInt32>* vec;
+    size_t index1;
+    size_t index2;
+    if (left > right) {
+      vec = &left_nodes_;
+      index1 = left - right - 1;
+      index2 = right;
+    } else {
+      vec = &right_nodes_;
+      index1 = right - left;
+      index2 = left;
+    }
+    if (vec->size() <= index1) {
+      GTEST_CHECK_(vec->size() == index1)
+          << "Array diagonals should only grow by one " << vec->size() << " vs "
+          << index1;
+      vec->push_back(block_indices_.size());
+      // Round up
+      block_indices_.resize(
+          block_indices_.size() +
+              (DiagonalLength(left, right) + kBlockSize - 1) / kBlockSize,
+          kUnallocatedBlock);
+    }
+    const size_t bucket = index2 / kBlockSize;
+    const size_t pos_in_bucket = index2 % kBlockSize;
+    UInt32& level2 = block_indices_[(*vec)[index1] + bucket];
+    if (level2 == kUnallocatedBlock) {
+      level2 = nodes_.size();
+      size_t diagonal_length = DiagonalLength(left, right);
+      GTEST_CHECK_(diagonal_length > index2)
+          << diagonal_length << " " << index2;
+      size_t block_size = kBlockSize;
+      if (diagonal_length / kBlockSize == bucket) {
+        // We can never get here if diagonal_length is a multiple of
+        // kBlockSize, which is what we want, since this would evaluate to 0.
+        block_size = diagonal_length % kBlockSize;
+      }
+      nodes_.resize(nodes_.size() + block_size);
+    }
+    return nodes_[level2 + pos_in_bucket];
+  }
+
+  size_t MemoryUsage() const {
+    return nodes_.capacity() +
+           sizeof(UInt32) * (left_nodes_.capacity() + right_nodes_.capacity() +
+                             block_indices_.capacity());
+  }
+
+ private:
+  enum { kBlockSize = 1024, kUnallocatedBlock = 0xFFFFFFFFul };
+
+  size_t DiagonalLength(UInt32 left, UInt32 right) const {
+    return std::min(left_size_ - left, right_size_ - right) +
+           (left < right ? left : right);
+  }
+
+  // The state space is conceptually a left_size_ by right_size_ sparse matrix
+  // of EditSearchStates. However, due to the access pattern of the search, it
+  // is much better to store the nodes per diagonal rather than per row.
+  UInt32 left_size_;
+  UInt32 right_size_;
+  // The nodes are stored by diagonals, split in two: Those to the left of the
+  // main diagonal are in left_nodes_, and everything else is in right_nodes_.
+  // The values are indices into block_indices_.
+  std::vector<UInt32> left_nodes_;
+  std::vector<UInt32> right_nodes_;
+  // Every entry here is an offset into the beginning of a kBlockSize-sized
+  // block in nodes_. An entire diagonal is allocated together here; for a
+  // diagonal of length <= kBlockSize, that's just a single entry, but for
+  // longer diagonals multiple contiguous index entries will be reserved at
+  // once. Unused entries will be assigned kUnallocatedBlock; this
+  // double-indirect scheme is used to save memory in the cases when an entire
+  // diagonal isn't needed.
+  std::vector<UInt32> block_indices_;
+  // This stores the actual EditSearchState data, pointed to by block_indices_.
+  std::vector<unsigned char> nodes_;
+};
+
+struct EditHeapEntry {
+  EditHeapEntry(UInt32 l, UInt32 r, UInt64 c, EditSearchState s)
+      : left(l), right(r), cost(c), state(s) {}
+
+  UInt32 left;
+  UInt32 right;
+  UInt64 cost : 61;
+  // The state that the node will get when this entry is settled. Therefore,
+  // this can never be kUnsettled.
+  UInt64 state : 3;
+
+  bool operator>(const EditHeapEntry& other) const { return cost > other.cost; }
+};
+
+// Need a min-queue, so invert the comparator.
+typedef std::priority_queue<EditHeapEntry, std::vector<EditHeapEntry>,
+                            std::greater<EditHeapEntry>>
+    EditHeap;
+
+}  // namespace
+
+std::vector<EditType> CalculateOptimalEdits(const std::vector<size_t>& left,
+                                            const std::vector<size_t>& right,
+                                            size_t* memory_usage) {
+  const UInt64 kBaseCost = 100000;
+  // We make replace a little more expensive than add/remove to lower
+  // their priority.
+  const UInt64 kReplaceCost = 100001;
+  // In the common case where the vectors are the same (or almost the same)
+  // size, we know that an add will have to be followed by some later remove
+  // (or vice versa) in order to get the lengths to balance. We "borrow" some
+  // of the cost of the later operation and bring it forward into the earlier
+  // operation, to increase the cost of exploring (usually fruitlessly) around
+  // the beginning of the graph.
+  // However, there is a trade-off: This cheapens the cost of exploring around
+  // the beginning of the graph (in one direction) when the vectors are
+  // unequal in length. So we don't steal *all* the cost.
+  // You can view this as a form of A*, using an admissable heuristic that has
+  // been re-cast as a cost function that can be used in Dijkstra.
+  const UInt64 kTowardsGoalCost = 50003;
+  const UInt64 kAwayFromGoalCost = 2 * kBaseCost - kTowardsGoalCost;
+
+  EditSearchMap node_map(left.size() + 1, right.size() + 1);
+  EditHeap heap;
+  heap.push(EditHeapEntry(0, 0, 0, kReplaceParent));
+
+  while (!heap.empty()) {
+    const EditHeapEntry current_entry = heap.top();
+    heap.pop();
+
+    UInt32 left_pos = current_entry.left;
+    UInt32 right_pos = current_entry.right;
+    unsigned char& current_state = node_map.insert(left_pos, right_pos);
+    if (current_state != kUnsettled) {
+      // Node was already settled by a previous entry in the priority queue,
+      // this is a suboptimal path that should be ignored.
+      continue;
+    }
+    current_state = current_entry.state;
+
+    if (left_pos == left.size() && right_pos == right.size()) {
+      // This is the normal exit point; if we terminate due to the heap being
+      // empty, we'll fail a check later.
+      break;
+    }
+
+    // Special case: Since the cost of a match is zero, we can immediately
+    // settle the new node without putting it in the queue, since nothing can
+    // have a smaller cost than it. Furthermore, we don't need to relax the
+    // other two edges, since we know we don't need them: Any path from this
+    // node that would use them has an path via the match that is at least as
+    // cheap. Together, this means we can loop here until we stop matching.
+    while (left_pos < left.size() && right_pos < right.size() &&
+           left[left_pos] == right[right_pos]) {
+      left_pos++;
+      right_pos++;
+      unsigned char& fast_forward_state = node_map.insert(left_pos, right_pos);
+      if (fast_forward_state != kUnsettled) {
+        // The search reached around and settled this node before settling the
+        // base node. This means we're completely done with this iteration;
+        // abort to the outer loop.
+        goto outer_loop_bottom;
+        // Otherwise, when can settle this node, even if it was created from
+        // another state - we know the cost of settling it now is optimal.
+      }
+      fast_forward_state = kMatchParent;
+    }
+
+    // Relax adjacent nodes. We have no way to find or lower the cost of
+    // existing entries in the heap, so we just push new entries and throw
+    // them out at the top if the node is already settled. We *could* check to
+    // see if they're already settled before pushing, but it turns out to be
+    // ~not any faster, and more complicated to do so.
+    //
+    // If we're at an edge, there's only one node to relax.
+    if (left_pos >= left.size()) {
+      if (right_pos >= right.size()) {
+        break;  // Can happen due to the fast-path loop above.
+      }
+      heap.push(EditHeapEntry(left_pos, right_pos + 1,
+                              current_entry.cost + kTowardsGoalCost,
+                              kAddParent));
+      continue;
+    }
+    if (right_pos >= right.size()) {
+      heap.push(EditHeapEntry(left_pos + 1, right_pos,
+                              current_entry.cost + kTowardsGoalCost,
+                              kRemoveParent));
+      continue;
+    }
+    // General case: Relax 3 edges.
+    heap.push(EditHeapEntry(
+        left_pos, right_pos + 1,
+        current_entry.cost + (right.size() + left_pos > right_pos + left.size()
+                                  ? kTowardsGoalCost
+                                  : kAwayFromGoalCost),
+        kAddParent));
+    heap.push(EditHeapEntry(
+        left_pos + 1, right_pos,
+        current_entry.cost + (right.size() + left_pos < right_pos + left.size()
+                                  ? kTowardsGoalCost
+                                  : kAwayFromGoalCost),
+        kRemoveParent));
+    heap.push(EditHeapEntry(left_pos + 1, right_pos + 1,
+                            current_entry.cost + kReplaceCost, kReplaceParent));
+  outer_loop_bottom : {}  // Need the curlies to form a statement.
+  }
+
+  // Reconstruct the best path. We do it in reverse order.
+  std::vector<EditType> best_path;
+  UInt32 left_pos = left.size();
+  UInt32 right_pos = right.size();
+  while (left_pos != 0 || right_pos != 0) {
+    GTEST_CHECK_(left_pos <= left.size() && right_pos <= right.size());
+    // The node must already exist, but if it somehow doesn't, it will be
+    // added as kUnsettled, which will crash below.
+    const unsigned char state = node_map.insert(left_pos, right_pos);
+    switch (state) {
+      case kAddParent:
+        right_pos--;
+        break;
+      case kRemoveParent:
+        left_pos--;
+        break;
+      case kMatchParent:
+      case kReplaceParent:
+        left_pos--;
+        right_pos--;
+        break;
+      default:
+        GTEST_LOG_(FATAL) << "Unsettled node at " << left_pos << ","
+                          << right_pos;
+    }
+    best_path.push_back(static_cast<EditType>(state - 1));
+  }
+  std::reverse(best_path.begin(), best_path.end());
+  if (memory_usage != NULL) {
+    *memory_usage = node_map.MemoryUsage();
+  }
+  return best_path;
+}
+
+namespace {
+
+// Helper class to convert string into ids with deduplication.
+class InternalStrings {
+ public:
+  size_t GetId(const std::string* str) {
+    IdMap::iterator it = ids_.find(str);
+    if (it != ids_.end()) return it->second;
+    size_t id = ids_.size();
+    return ids_[str] = id;
+  }
+
+ private:
+  struct IdMapCmp {
+    bool operator()(const std::string* first, const std::string* second) const {
+      return *first < *second;
+    }
+  };
+  typedef std::map<const std::string*, size_t, IdMapCmp> IdMap;
+  IdMap ids_;
+};
+
+}  // namespace
+
+std::vector<EditType> CalculateOptimalEdits(
+    const std::vector<std::string>& left,
+    const std::vector<std::string>& right) {
+  std::vector<size_t> left_ids, right_ids;
+  {
+    InternalStrings intern_table;
+    for (size_t i = 0; i < left.size(); ++i) {
+      left_ids.push_back(intern_table.GetId(&left[i]));
+    }
+    for (size_t i = 0; i < right.size(); ++i) {
+      right_ids.push_back(intern_table.GetId(&right[i]));
+    }
+  }
+  return CalculateOptimalEdits(left_ids, right_ids);
+}
+
+namespace {
+
+// Helper class that holds the state for one hunk and prints it out to the
+// stream.
+// It reorders adds/removes when possible to group all removes before all
+// adds. It also adds the hunk header before printing into the stream.
+class Hunk {
+ public:
+  Hunk(size_t left_start, size_t right_start)
+      : left_start_(left_start),
+        right_start_(right_start),
+        adds_(),
+        removes_(),
+        common_() {}
+
+  void PushLine(char edit, const char* line) {
+    switch (edit) {
+      case ' ':
+        ++common_;
+        FlushEdits();
+        hunk_.push_back(std::make_pair(' ', line));
+        break;
+      case '-':
+        ++removes_;
+        hunk_removes_.push_back(std::make_pair('-', line));
+        break;
+      case '+':
+        ++adds_;
+        hunk_adds_.push_back(std::make_pair('+', line));
+        break;
+    }
+  }
+
+  void PrintTo(std::ostream* os) {
+    PrintHeader(os);
+    FlushEdits();
+    for (std::list<std::pair<char, const char*> >::const_iterator it =
+             hunk_.begin();
+         it != hunk_.end(); ++it) {
+      *os << it->first << it->second << "\n";
+    }
+  }
+
+  bool has_edits() const { return adds_ || removes_; }
+
+ private:
+  void FlushEdits() {
+    hunk_.splice(hunk_.end(), hunk_removes_);
+    hunk_.splice(hunk_.end(), hunk_adds_);
+  }
+
+  // Print a unified diff header for one hunk.
+  // The format is
+  //   "@@ -<left_start>,<left_length> +<right_start>,<right_length> @@"
+  // where the left/right parts are omitted if unnecessary.
+  void PrintHeader(std::ostream* ss) const {
+    *ss << "@@ ";
+    if (removes_) {
+      *ss << "-" << left_start_ << "," << (removes_ + common_);
+    }
+    if (removes_ && adds_) {
+      *ss << " ";
+    }
+    if (adds_) {
+      *ss << "+" << right_start_ << "," << (adds_ + common_);
+    }
+    *ss << " @@\n";
+  }
+
+  size_t left_start_, right_start_;
+  size_t adds_, removes_, common_;
+  std::list<std::pair<char, const char*> > hunk_, hunk_adds_, hunk_removes_;
+};
+
+}  // namespace
+
+// Create a list of diff hunks in Unified diff format.
+// Each hunk has a header generated by PrintHeader above plus a body with
+// lines prefixed with ' ' for no change, '-' for deletion and '+' for
+// addition.
+// 'context' represents the desired unchanged prefix/suffix around the diff.
+// If two hunks are close enough that their contexts overlap, then they are
+// joined into one hunk.
+std::string CreateUnifiedDiff(const std::vector<std::string>& left,
+                              const std::vector<std::string>& right,
+                              size_t context) {
+  const std::vector<EditType> edits = CalculateOptimalEdits(left, right);
+
+  size_t l_i = 0, r_i = 0, edit_i = 0;
+  std::stringstream ss;
+  while (edit_i < edits.size()) {
+    // Find first edit.
+    while (edit_i < edits.size() && edits[edit_i] == kEditMatch) {
+      ++l_i;
+      ++r_i;
+      ++edit_i;
+    }
+
+    // Find the first line to include in the hunk.
+    const size_t prefix_context = std::min(l_i, context);
+    Hunk hunk(l_i - prefix_context + 1, r_i - prefix_context + 1);
+    for (size_t i = prefix_context; i > 0; --i) {
+      hunk.PushLine(' ', left[l_i - i].c_str());
+    }
+
+    // Iterate the edits until we found enough suffix for the hunk or the input
+    // is over.
+    size_t n_suffix = 0;
+    for (; edit_i < edits.size(); ++edit_i) {
+      if (n_suffix >= context) {
+        // Continue only if the next hunk is very close.
+        std::vector<EditType>::const_iterator it = edits.begin() + edit_i;
+        while (it != edits.end() && *it == kEditMatch) ++it;
+        if (it == edits.end() || (it - edits.begin()) - edit_i >= context) {
+          // There is no next edit or it is too far away.
+          break;
+        }
+      }
+
+      EditType edit = edits[edit_i];
+      // Reset count when a non match is found.
+      n_suffix = edit == kEditMatch ? n_suffix + 1 : 0;
+
+      if (edit == kEditMatch || edit == kEditRemove || edit == kEditReplace) {
+        hunk.PushLine(edit == kEditMatch ? ' ' : '-', left[l_i].c_str());
+      }
+      if (edit == kEditAdd || edit == kEditReplace) {
+        hunk.PushLine('+', right[r_i].c_str());
+      }
+
+      // Advance indices, depending on edit type.
+      l_i += edit != kEditAdd;
+      r_i += edit != kEditRemove;
+    }
+
+    if (!hunk.has_edits()) {
+      // We are done. We don't want this hunk.
+      break;
+    }
+
+    hunk.PrintTo(&ss);
+  }
+  return ss.str();
+}
+
+}  // namespace internal
+}  // namespace testing
author	Abseil Team <absl-team@google.com>	2018-08-27 18:53:25 (GMT)
committer	Gennadiy Civil <misterg@google.com>	2018-08-28 20:53:30 (GMT)
commit	167c5e8188beb5dae002ac7571457e3c26eb6a3f (patch)
tree	3ac7d65d7a5d9eb4be3883cd3032d29cd93e1e21 /googletest/src/gtest-edit-distance.cc
parent	1bb76182caee8239b71b9d6d21f479014d37ad5b (diff)
download	googletest-167c5e8188beb5dae002ac7571457e3c26eb6a3f.zip googletest-167c5e8188beb5dae002ac7571457e3c26eb6a3f.tar.gz googletest-167c5e8188beb5dae002ac7571457e3c26eb6a3f.tar.bz2