From b6a9a1c8adbb444c2489d884f06e5bd39627c3e9 Mon Sep 17 00:00:00 2001 From: Evan Martin Date: Mon, 17 Dec 2012 09:08:15 -0800 Subject: add DepsLog, a new data structure for dependency information DepsLog is a compact serialization of dependency information. It can be used to replace depfiles for faster loading. --- configure.py | 2 + src/deps_log.cc | 149 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/deps_log.h | 91 +++++++++++++++++++++++++++++++ src/deps_log_test.cc | 63 ++++++++++++++++++++++ src/graph.h | 9 +++- src/state.cc | 5 +- 6 files changed, 316 insertions(+), 3 deletions(-) create mode 100644 src/deps_log.cc create mode 100644 src/deps_log.h create mode 100644 src/deps_log_test.cc diff --git a/configure.py b/configure.py index 10c6994..8f5a497 100755 --- a/configure.py +++ b/configure.py @@ -269,6 +269,7 @@ for name in ['build', 'build_log', 'clean', 'depfile_parser', + 'deps_log', 'disk_interface', 'edit_distance', 'eval_env', @@ -348,6 +349,7 @@ for name in ['build_log_test', 'build_test', 'clean_test', 'depfile_parser_test', + 'deps_log_test', 'disk_interface_test', 'edit_distance_test', 'graph_test', diff --git a/src/deps_log.cc b/src/deps_log.cc new file mode 100644 index 0000000..ca7fd4b --- /dev/null +++ b/src/deps_log.cc @@ -0,0 +1,149 @@ +// Copyright 2012 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "deps_log.h" + +#include +#include +#include +#include + +#include "graph.h" +#include "state.h" +#include "util.h" + +bool DepsLog::OpenForWrite(const string& path, string* err) { + file_ = fopen(path.c_str(), "ab"); + if (!file_) { + *err = strerror(errno); + return false; + } + SetCloseOnExec(fileno(file_)); + + // Opening a file in append mode doesn't set the file pointer to the file's + // end on Windows. Do that explicitly. + fseek(file_, 0, SEEK_END); + + /* XXX + if (ftell(log_file_) == 0) { + if (fprintf(log_file_, kFileSignature, kCurrentVersion) < 0) { + *err = strerror(errno); + return false; + } + } + */ + + return true; +} + +bool DepsLog::RecordDeps(Node* node, TimeStamp mtime, + const vector& nodes) { + // Assign ids to all nodes that are missing one. + if (node->id() < 0) + RecordId(node); + for (vector::const_iterator i = nodes.begin(); + i != nodes.end(); ++i) { + if ((*i)->id() < 0) + RecordId(*i); + } + + uint16_t size = 4 * (1 + 1 + nodes.size()); + size |= 0x8000; // Deps record: set high bit. 
+ fwrite(&size, 2, 1, file_); + int id = node->id(); + fwrite(&id, 4, 1, file_); + int timestamp = node->mtime(); + fwrite(&timestamp, 4, 1, file_); + for (vector::const_iterator i = nodes.begin(); + i != nodes.end(); ++i) { + id = node->id(); + fwrite(&id, 4, 1, file_); + } + + return true; +} + +void DepsLog::Close() { + fclose(file_); + file_ = NULL; +} + +bool DepsLog::Load(const string& path, State* state, string* err) { + char buf[32 << 10]; + FILE* f = fopen(path.c_str(), "rb"); + if (!f) { + *err = strerror(errno); + return false; + } + + int id = 0; + for (;;) { + uint16_t size; + if (fread(&size, 2, 1, f) < 1) + break; + bool is_deps = (size >> 15) != 0; + size = size & 0x7FFF; + + if (fread(buf, size, 1, f) < 1) + break; + + if (is_deps) { + assert(size % 4 == 0); + int* deps_data = reinterpret_cast(buf); + int out_id = deps_data[0]; + int mtime = deps_data[1]; + deps_data += 2; + int deps_count = (size / 4) - 2; + + Deps* deps = new Deps; + deps->mtime = mtime; + deps->node_count = deps_count; + deps->nodes = new Node*[deps_count]; + for (int i = 0; i < deps_count; ++i) { + assert(deps_data[i] < (int)nodes_.size()); + assert(nodes_[deps_data[i]]); + deps->nodes[i] = nodes_[deps_data[i]]; + } + + if (out_id >= (int)deps_.size()) + deps_.resize(out_id + 1); + if (deps_[out_id]) + delete deps_[out_id]; + deps_[out_id] = deps; + } else { + StringPiece path(buf, size); + Node* node = state->GetNode(path); + assert(node->id() < 0); + node->set_id(id); + ++id; + } + } + if (ferror(f)) { + *err = strerror(ferror(f)); + return false; + } + fclose(f); + return true; +} + +bool DepsLog::RecordId(Node* node) { + uint16_t size = node->path().size(); + fwrite(&size, 2, 1, file_); + fwrite(node->path().data(), node->path().size(), 1, file_); + + node->set_id(nodes_.size()); + nodes_.push_back(node); + + return true; +} diff --git a/src/deps_log.h b/src/deps_log.h new file mode 100644 index 0000000..45d2cea --- /dev/null +++ b/src/deps_log.h @@ -0,0 +1,91 @@ +// Copyright 
2012 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef NINJA_DEPS_LOG_H_ +#define NINJA_DEPS_LOG_H_ + +#include +#include +using namespace std; + +#include + +#include "timestamp.h" + +struct Node; +struct State; + +/// As build commands run they can output extra dependency information +/// (e.g. header dependencies for C source) via a pipe. DepsLog collects +/// that information at build time and reloads it at startup. +/// +/// The on-disk format is based on two primary constraints: +/// - it must be written to as a stream (during the build, which may be +/// interrupted); +/// - it can be read all at once on startup. (Alternative designs, where +/// it contains indexing information, were considered and discarded as +/// too complicated to implement; if the file is small then reading it +/// fully on startup is acceptable.) +/// Here are some stats from the Windows Chrome dependency files, to +/// help guide the design space. The total text in the files sums to +/// 90mb so some compression is warranted to keep load-time fast. +/// There's about 10k files worth of dependencies that reference about +/// 40k total paths totalling 2mb of unique strings. +/// +/// Based on the above, the file is structured as a sequence of records. +/// Each record is either a path string or a dependency list. +/// Numbering the path strings in file order gives them dense integer ids. 
+/// A dependency list maps an output id to a list of input ids. +/// +/// Concretely, a record is: +/// two bytes record length, high bit indicates record type +/// (implies max record length 32k) +/// path records contain just the string name of the path +/// dependency records are an array of 4-byte integers +/// [output path id, output path mtime, input path id, input path id...] +/// (The mtime is compared against the on-disk output path mtime +/// to verify the stored data is up-to-date.) +/// If two records reference the same output the latter one in the file +/// wins, allowing updates to just be appended to the file. A separate +/// repacking step can run occasionally to remove dead records. +struct DepsLog { + + // Writing (build-time) interface. + bool OpenForWrite(const string& path, string* err); + bool RecordDeps(Node* node, TimeStamp mtime, const vector& nodes); + void Close(); + + // Reading (startup-time) interface. + bool Load(const string& path, State* state, string* err); + + private: + // Write a node name record, assigning it an id. + bool RecordId(Node* node); + + struct Deps { + Deps() : mtime(-1), node_count(0), nodes(NULL) {} + ~Deps() { delete [] nodes; } + int mtime; + int node_count; + Node** nodes; + }; + + FILE* file_; + vector nodes_; + vector deps_; + + friend struct DepsLogTest; +}; + +#endif // NINJA_DEPS_LOG_H_ diff --git a/src/deps_log_test.cc b/src/deps_log_test.cc new file mode 100644 index 0000000..540865b --- /dev/null +++ b/src/deps_log_test.cc @@ -0,0 +1,63 @@ +// Copyright 2012 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "deps_log.h" + +#include "graph.h" +#include "util.h" +#include "test.h" + +namespace { + +const char kTestFilename[] = "DepsLogTest-tempfile"; + +struct DepsLogTest : public testing::Test { + virtual void SetUp() { + // In case a crashing test left a stale file behind. + unlink(kTestFilename); + } + virtual void TearDown() { + //unlink(kTestFilename); + } +}; + +TEST_F(DepsLogTest, WriteRead) { + State state1; + DepsLog log1; + string err; + EXPECT_TRUE(log1.OpenForWrite(kTestFilename, &err)); + ASSERT_EQ("", err); + + vector deps; + deps.push_back(state1.GetNode("foo.h")); + deps.push_back(state1.GetNode("bar.h")); + log1.RecordDeps(state1.GetNode("out.o"), 1, deps); + + deps.clear(); + deps.push_back(state1.GetNode("foo.h")); + deps.push_back(state1.GetNode("bar2.h")); + log1.RecordDeps(state1.GetNode("out2.o"), 2, deps); + + log1.Close(); + + State state2; + DepsLog log2; + EXPECT_TRUE(log1.Load(kTestFilename, &state2, &err)); + ASSERT_EQ("", err); + state2.Dump(); + + state2.GetNode("out2.o")->Dump(); +} + +} // anonymous namespace diff --git a/src/graph.h b/src/graph.h index 8b93e29..4ef05ec 100644 --- a/src/graph.h +++ b/src/graph.h @@ -32,7 +32,8 @@ struct Node { : path_(path), mtime_(-1), dirty_(false), - in_edge_(NULL) {} + in_edge_(NULL), + id_(-1) {} /// Return true if the file exists (mtime_ got a value). 
bool Stat(DiskInterface* disk_interface); @@ -74,6 +75,9 @@ struct Node { Edge* in_edge() const { return in_edge_; } void set_in_edge(Edge* edge) { in_edge_ = edge; } + int id() const { return id_; } + void set_id(int id) { id_ = id; } + const vector& out_edges() const { return out_edges_; } void AddOutEdge(Edge* edge) { out_edges_.push_back(edge); } @@ -98,6 +102,9 @@ private: /// All Edges that use this Node as an input. vector out_edges_; + + /// A dense integer id for the node, assigned and used by DepsLog. + int id_; }; /// An invokable build command and associated metadata (description, etc.). diff --git a/src/state.cc b/src/state.cc index 9f46fee..d2d5ebe 100644 --- a/src/state.cc +++ b/src/state.cc @@ -202,10 +202,11 @@ void State::Reset() { void State::Dump() { for (Paths::iterator i = paths_.begin(); i != paths_.end(); ++i) { Node* node = i->second; - printf("%s %s\n", + printf("%s %s [id:%d]\n", node->path().c_str(), node->status_known() ? (node->dirty() ? "dirty" : "clean") - : "unknown"); + : "unknown", + node->id()); } if (!pools_.empty()) { printf("resource_pools:\n"); -- cgit v0.12