From 5be83d0b2e4909f39998c98dde9a1393d8e5f1c2 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 14 Jun 2012 21:52:17 -0700 Subject: Only store command hashes in the build log. .build_log load time 350ms -> 17ms, filesize 197MB -> 1.6MB on Mac. On Windows, it's 500ms -> 20ms. Makes the build log a lot less useful for scripts, but there could be a tool for use cases that need log information. A prototype of such a tool is in https://github.com/nico/ninja/commit/1b243d311 The hash function is 64bit murmurhash2. Assuming that that different commands get the same hash only by chance, it's is very unlikely for two different commands to hash to the same value with a 64bit hash. --- src/build_log.cc | 64 +++++++++++++++++++++++++++++++++++++++++++++++---- src/build_log.h | 6 +++-- src/build_log_test.cc | 12 +++++----- src/build_test.cc | 6 +++-- src/graph.cc | 4 ++-- src/test.cc | 5 ++++ src/test.h | 1 + src/util.h | 1 + 8 files changed, 82 insertions(+), 17 deletions(-) diff --git a/src/build_log.cc b/src/build_log.cc index 5803ada..97c3344 100644 --- a/src/build_log.cc +++ b/src/build_log.cc @@ -37,10 +37,57 @@ namespace { const char kFileSignature[] = "# ninja log v%d\n"; -const int kCurrentVersion = 4; +const int kCurrentVersion = 5; + +// 64bit MurmurHash2, by Austin Appleby +#if defined(_MSC_VER) +#define BIG_CONSTANT(x) (x) +#else // defined(_MSC_VER) +#define BIG_CONSTANT(x) (x##LLU) +#endif // !defined(_MSC_VER) +inline +uint64_t MurmurHash64A(const void* key, int len) { + static const uint64_t seed = 0xDECAFBADDECAFBADull; + const uint64_t m = BIG_CONSTANT(0xc6a4a7935bd1e995); + const int r = 47; + uint64_t h = seed ^ (len * m); + const uint64_t * data = (const uint64_t *)key; + const uint64_t * end = data + (len/8); + while(data != end) { + uint64_t k = *data++; + k *= m; + k ^= k >> r; + k *= m; + h ^= k; + h *= m; + } + const unsigned char* data2 = (const unsigned char*)data; + switch(len & 7) + { + case 7: h ^= uint64_t(data2[6]) << 48; + case 6: h ^= uint64_t(data2[5]) << 40; + case 5: h ^= uint64_t(data2[4]) << 32; + case 4: h ^= uint64_t(data2[3]) << 24; + case 3: h ^= uint64_t(data2[2]) << 16; + case 2: h ^= uint64_t(data2[1]) << 8; + case 1: h ^= uint64_t(data2[0]); + h *= m; + }; + h ^= h >> r; + h *= m; + h ^= h >> r; + return h; +} +#undef BIG_CONSTANT + } // namespace +// static +uint64_t BuildLog::LogEntry::HashCommand(StringPiece command) { + return MurmurHash64A(command.str_, command.len_); +} + BuildLog::BuildLog() : log_file_(NULL), config_(NULL), needs_recompaction_(false) {} @@ -95,7 +142,7 @@ void BuildLog::RecordCommand(Edge* edge, int start_time, int end_time, log_entry->output = path; log_.insert(Log::value_type(log_entry->output, log_entry)); } - log_entry->command = command; + log_entry->command_hash = LogEntry::HashCommand(command); log_entry->start_time = start_time; log_entry->end_time = end_time; log_entry->restat_mtime = restat_mtime; @@ -239,7 +286,14 @@ bool BuildLog::Load(const string& path, string* err) { entry->start_time = start_time; entry->end_time = end_time; entry->restat_mtime = restat_mtime; - entry->command = string(start, end - start); + if (log_version >= 5) { + char c = *end; *end = '\0'; + entry->command_hash = (uint64_t)strtoull(start, NULL, 10); + *end = c; + } + else + entry->command_hash = LogEntry::HashCommand(StringPiece(start, + end - start)); } // Decide whether it's time to rebuild the log: @@ -267,9 +321,9 @@ BuildLog::LogEntry* BuildLog::LookupByOutput(const string& path) { } void BuildLog::WriteEntry(FILE* f, const LogEntry& entry) { - fprintf(f, "%d\t%d\t%ld\t%s\t%s\n", + fprintf(f, "%d\t%d\t%ld\t%s\t%llu\n", entry.start_time, entry.end_time, (long) entry.restat_mtime, - entry.output.c_str(), entry.command.c_str()); + entry.output.c_str(), entry.command_hash); } bool BuildLog::Recompact(const string& path, string* err) { diff --git a/src/build_log.h b/src/build_log.h index da8e726..98ef7b3 100644 --- a/src/build_log.h +++ b/src/build_log.h @@ -49,14 +49,16 @@ struct BuildLog { struct LogEntry { string output; - string command; + uint64_t command_hash; int start_time; int end_time; TimeStamp restat_mtime; + static uint64_t HashCommand(StringPiece command); + // Used by tests. bool operator==(const LogEntry& o) { - return output == o.output && command == o.command && + return output == o.output && command_hash == o.command_hash && start_time == o.start_time && end_time == o.end_time && restat_mtime == o.restat_mtime; } diff --git a/src/build_log_test.cc b/src/build_log_test.cc index 199e016..afd3b81 100644 --- a/src/build_log_test.cc +++ b/src/build_log_test.cc @@ -110,7 +110,7 @@ TEST_F(BuildLogTest, DoubleEntry) { BuildLog::LogEntry* e = log.LookupByOutput("out"); ASSERT_TRUE(e); - ASSERT_EQ("command def", e->command); + ASSERT_NO_FATAL_FAILURE(AssertHash("command def", e->command_hash)); } TEST_F(BuildLogTest, Truncate) { @@ -164,7 +164,7 @@ TEST_F(BuildLogTest, UpgradeV3) { ASSERT_EQ(123, e->start_time); ASSERT_EQ(456, e->end_time); ASSERT_EQ(0, e->restat_mtime); - ASSERT_EQ("command", e->command); + ASSERT_NO_FATAL_FAILURE(AssertHash("command", e->command_hash)); } TEST_F(BuildLogTest, SpacesInOutputV4) { @@ -183,7 +183,7 @@ TEST_F(BuildLogTest, SpacesInOutputV4) { ASSERT_EQ(123, e->start_time); ASSERT_EQ(456, e->end_time); ASSERT_EQ(456, e->restat_mtime); - ASSERT_EQ("command", e->command); + ASSERT_NO_FATAL_FAILURE(AssertHash("command", e->command_hash)); } TEST_F(BuildLogTest, DuplicateVersionHeader) { @@ -207,14 +207,14 @@ TEST_F(BuildLogTest, DuplicateVersionHeader) { ASSERT_EQ(123, e->start_time); ASSERT_EQ(456, e->end_time); ASSERT_EQ(456, e->restat_mtime); - ASSERT_EQ("command", e->command); + ASSERT_NO_FATAL_FAILURE(AssertHash("command", e->command_hash)); e = log.LookupByOutput("out2"); ASSERT_TRUE(e); ASSERT_EQ(456, e->start_time); ASSERT_EQ(789, e->end_time); ASSERT_EQ(789, e->restat_mtime); - ASSERT_EQ("command2", e->command); + ASSERT_NO_FATAL_FAILURE(AssertHash("command2", e->command_hash)); } TEST_F(BuildLogTest, VeryLongInputLine) { @@ -242,5 +242,5 @@ TEST_F(BuildLogTest, VeryLongInputLine) { ASSERT_EQ(456, e->start_time); ASSERT_EQ(789, e->end_time); ASSERT_EQ(789, e->restat_mtime); - ASSERT_EQ("command2", e->command); + ASSERT_NO_FATAL_FAILURE(AssertHash("command2", e->command_hash)); } diff --git a/src/build_test.cc b/src/build_test.cc index c45f2b3..23f1909 100644 --- a/src/build_test.cc +++ b/src/build_test.cc @@ -953,8 +953,10 @@ TEST_F(BuildWithLogTest, RspFileCmdLineChange) { // (to simulate a change in the command line between 2 builds) BuildLog::LogEntry * log_entry = build_log_.LookupByOutput("out"); ASSERT_TRUE(NULL != log_entry); - ASSERT_EQ("cat out.rsp > out;rspfile=Original very long command", log_entry->command); - log_entry->command = "cat out.rsp > out;rspfile=Altered very long command"; + ASSERT_NO_FATAL_FAILURE(AssertHash( + "cat out.rsp > out;rspfile=Original very long command", + log_entry->command_hash)); + log_entry->command_hash++; // Change the command hash to something else. // Now expect the target to be rebuilt commands_ran_.clear(); state_.Reset(); diff --git a/src/graph.cc b/src/graph.cc index 3531e86..f381d8a 100644 --- a/src/graph.cc +++ b/src/graph.cc @@ -157,7 +157,7 @@ bool Edge::RecomputeOutputDirty(BuildLog* build_log, // dirty. if (!rule_->generator() && build_log && (entry || (entry = build_log->LookupByOutput(output->path())))) { - if (command != entry->command) { + if (BuildLog::LogEntry::HashCommand(command) != entry->command_hash) { EXPLAIN("command line changed for %s", output->path().c_str()); return true; } @@ -200,7 +200,7 @@ string EdgeEnv::LookupVariable(const string& var) { } else if (edge_->env_) { return edge_->env_->LookupVariable(var); } else { - // XXX shoudl we warn here? + // XXX should we warn here? return string(); } } diff --git a/src/test.cc b/src/test.cc index 2fbb4df..afedd10 100644 --- a/src/test.cc +++ b/src/test.cc @@ -18,6 +18,7 @@ #include +#include "build_log.h" #include "parsers.h" #include "util.h" @@ -89,6 +90,10 @@ void AssertParse(State* state, const char* input) { ASSERT_EQ("", err); } +void AssertHash(const char* expected, uint64_t actual) { + ASSERT_EQ(BuildLog::LogEntry::HashCommand(expected), actual); +} + void VirtualFileSystem::Create(const string& path, int time, const string& contents) { files_[path].mtime = time; diff --git a/src/test.h b/src/test.h index 97f7cb1..37ca1f9 100644 --- a/src/test.h +++ b/src/test.h @@ -35,6 +35,7 @@ struct StateTestWithBuiltinRules : public testing::Test { }; void AssertParse(State* state, const char* input); +void AssertHash(const char* expected, uint64_t actual); /// An implementation of DiskInterface that uses an in-memory representation /// of disk state. It also logs file accesses and directory creations diff --git a/src/util.h b/src/util.h index 399913e..82f4850 100644 --- a/src/util.h +++ b/src/util.h @@ -77,6 +77,7 @@ double GetLoadAverage(); #define fileno _fileno #define unlink _unlink #define chdir _chdir +#define strtoull _strtoui64 #endif #ifdef _WIN32 -- cgit v0.12