summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/util.cc149
-rw-r--r--src/util_test.cc101
2 files changed, 208 insertions, 42 deletions
diff --git a/src/util.cc b/src/util.cc
index 0553de3..5f67fcf 100644
--- a/src/util.cc
+++ b/src/util.cc
@@ -143,20 +143,19 @@ void CanonicalizePath(char* path, size_t* len, uint64_t* slash_bits) {
return;
}
- const int kMaxPathComponents = 60;
- char* components[kMaxPathComponents];
- int component_count = 0;
-
char* start = path;
char* dst = start;
+ char* dst_start = dst;
const char* src = start;
const char* end = start + *len;
+ const char* src_next;
+ // For absolute paths, skip the leading directory separator
+ // as this one should never be removed from the result.
if (IsPathSeparator(*src)) {
#ifdef _WIN32
-
- // network path starts with //
- if (*len > 1 && IsPathSeparator(*(src + 1))) {
+ // Windows network path starts with //
+ if (src + 2 <= end && IsPathSeparator(src[1])) {
src += 2;
dst += 2;
} else {
@@ -167,50 +166,126 @@ void CanonicalizePath(char* path, size_t* len, uint64_t* slash_bits) {
++src;
++dst;
#endif
+ dst_start = dst;
+ } else {
+ // For relative paths, skip any leading ../ as these are quite common
+ // to reference source files in build plans, and doing this here makes
+ // the loop work below faster in general.
+ while (src + 3 <= end && src[0] == '.' && src[1] == '.' &&
+ IsPathSeparator(src[2])) {
+ src += 3;
+ dst += 3;
+ }
}
- while (src < end) {
- if (*src == '.') {
- if (src + 1 == end || IsPathSeparator(src[1])) {
- // '.' component; eliminate.
- src += 2;
- continue;
- } else if (src[1] == '.' && (src + 2 == end || IsPathSeparator(src[2]))) {
- // '..' component. Back up if possible.
+ // Loop over all components of the paths _except_ the last one, in
+ // order to simplify the loop's code and make it faster.
+ int component_count = 0;
+ char* dst0 = dst;
+ for (; src < end; src = src_next) {
+#ifndef _WIN32
+ // Use memchr() for faster lookups thanks to optimized C library
+ // implementation. `hyperfine canon_perftest` shows a significant
+ // difference (e,g, 484ms vs 437ms).
+ const char* next_sep =
+ static_cast<const char*>(::memchr(src, '/', end - src));
+ if (!next_sep) {
+ // This is the last component, will be handled out of the loop.
+ break;
+ }
+#else
+ // Need to check for both '/' and '\\' so do not use memchr().
+ // Cannot use strpbrk() because end[0] can be \0 or something else!
+ const char* next_sep = src;
+ while (next_sep != end && !IsPathSeparator(*next_sep))
+ ++next_sep;
+ if (next_sep == end) {
+ // This is the last component, will be handled out of the loop.
+ break;
+ }
+#endif
+ // Position for next loop iteration.
+ src_next = next_sep + 1;
+ // Length of the component, excluding trailing directory.
+ size_t component_len = next_sep - src;
+
+ if (component_len <= 2) {
+ if (component_len == 0) {
+ continue; // Ignore empty component, e.g. 'foo//bar' -> 'foo/bar'.
+ }
+ if (src[0] == '.') {
+ if (component_len == 1) {
+ continue; // Ignore '.' component, e.g. './foo' -> 'foo'.
+ } else if (src[1] == '.') {
+ // Process the '..' component if found. Back up if possible.
+ if (component_count > 0) {
+ // Move back to start of previous component.
+ --component_count;
+ while (--dst > dst0 && !IsPathSeparator(dst[-1])) {
+ // nothing to do here, decrement happens before condition check.
+ }
+ } else {
+ dst[0] = '.';
+ dst[1] = '.';
+ dst[2] = src[2];
+ dst += 3;
+ }
+ continue;
+ }
+ }
+ }
+ ++component_count;
+
+ // Copy or skip component, including trailing directory separator.
+ if (dst != src) {
+ ::memmove(dst, src, src_next - src);
+ }
+ dst += src_next - src;
+ }
+
+ // Handling the last component that does not have a trailing separator.
+ // The logic here is _slightly_ different since there is no trailing
+ // directory separator.
+ size_t component_len = end - src;
+ do {
+ if (component_len == 0)
+ break; // Ignore empty component (e.g. 'foo//' -> 'foo/')
+ if (src[0] == '.') {
+ if (component_len == 1)
+ break; // Ignore trailing '.' (e.g. 'foo/.' -> 'foo/')
+ if (src[1] == '.') {
+ // Handle '..'. Back up if possible.
if (component_count > 0) {
- dst = components[component_count - 1];
- src += 3;
- --component_count;
+ while (--dst > dst0 && !IsPathSeparator(dst[-1])) {
+ // nothing to do here, decrement happens before condition check.
+ }
} else {
- *dst++ = *src++;
- *dst++ = *src++;
- *dst++ = *src++;
+ dst[0] = '.';
+ dst[1] = '.';
+ dst += 2;
+ // No separator to add here.
}
- continue;
+ break;
}
}
-
- if (IsPathSeparator(*src)) {
- src++;
- continue;
+ // Skip or copy last component, no trailing separator.
+ if (dst != src) {
+ ::memmove(dst, src, component_len);
}
+ dst += component_len;
+ } while (0);
- if (component_count == kMaxPathComponents)
- Fatal("path has too many components : %s", path);
- components[component_count] = dst;
- ++component_count;
-
- while (src != end && !IsPathSeparator(*src))
- *dst++ = *src++;
- *dst++ = *src++; // Copy '/' or final \0 character as well.
- }
+ // Remove trailing path separator if any, but keep the initial
+ // path separator(s) if there was one (or two on Windows).
+ if (dst > dst_start && IsPathSeparator(dst[-1]))
+ dst--;
if (dst == start) {
+ // Handle special cases like "aa/.." -> "."
*dst++ = '.';
- *dst++ = '\0';
}
- *len = dst - start - 1;
+ *len = dst - start; // dst points after the trailing char here.
#ifdef _WIN32
uint64_t bits = 0;
uint64_t bits_mask = 1;
diff --git a/src/util_test.cc b/src/util_test.cc
index d58b170..8467e2a 100644
--- a/src/util_test.cc
+++ b/src/util_test.cc
@@ -89,13 +89,57 @@ TEST(CanonicalizePath, PathSamples) {
EXPECT_EQ("/foo", path);
#endif
+ path = "..";
+ CanonicalizePath(&path);
+ EXPECT_EQ("..", path);
+
+ path = "../";
+ CanonicalizePath(&path);
+ EXPECT_EQ("..", path);
+
+ path = "../foo";
+ CanonicalizePath(&path);
+ EXPECT_EQ("../foo", path);
+
+ path = "../foo/";
+ CanonicalizePath(&path);
+ EXPECT_EQ("../foo", path);
+
+ path = "../..";
+ CanonicalizePath(&path);
+ EXPECT_EQ("../..", path);
+
+ path = "../../";
+ CanonicalizePath(&path);
+ EXPECT_EQ("../..", path);
+
+ path = "./../";
+ CanonicalizePath(&path);
+ EXPECT_EQ("..", path);
+
+ path = "/..";
+ CanonicalizePath(&path);
+ EXPECT_EQ("/..", path);
+
+ path = "/../";
+ CanonicalizePath(&path);
+ EXPECT_EQ("/..", path);
+
+ path = "/../..";
+ CanonicalizePath(&path);
+ EXPECT_EQ("/../..", path);
+
+ path = "/../../";
+ CanonicalizePath(&path);
+ EXPECT_EQ("/../..", path);
+
path = "/";
CanonicalizePath(&path);
- EXPECT_EQ("", path);
+ EXPECT_EQ("/", path);
path = "/foo/..";
CanonicalizePath(&path);
- EXPECT_EQ("", path);
+ EXPECT_EQ("/", path);
path = ".";
CanonicalizePath(&path);
@@ -171,7 +215,7 @@ TEST(CanonicalizePath, PathSamplesWindows) {
path = "\\";
CanonicalizePath(&path);
- EXPECT_EQ("", path);
+ EXPECT_EQ("/", path);
}
TEST(CanonicalizePath, SlashTracking) {
@@ -321,8 +365,53 @@ TEST(CanonicalizePath, TooManyComponents) {
EXPECT_EQ(58, std::count(path.begin(), path.end(), '\\'));
CanonicalizePath(&path, &slash_bits);
EXPECT_EQ(slash_bits, 0x3ffffffffffffff);
+
+ // More than 60 components is now completely ok too.
+ path =
+ "a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\"
+ "a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\"
+ "a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\"
+ "a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\"
+ "a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\"
+ "a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\"
+ "a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\"
+ "a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\"
+ "a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\"
+ "a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\"
+ "a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\"
+ "a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\"
+ "a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\a\\"
+ "a\\a\\a\\a\\a\\a\\a\\a\\a\\x\\y.h";
+ EXPECT_EQ(218, std::count(path.begin(), path.end(), '\\'));
+ CanonicalizePath(&path, &slash_bits);
+ EXPECT_EQ(slash_bits, 0xffffffffffffffff);
}
-#endif
+#else // !_WIN32
+TEST(CanonicalizePath, TooManyComponents) {
+ string path;
+ uint64_t slash_bits;
+
+ // More than 60 components is now completely ok.
+ path =
+ "a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/"
+ "a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/"
+ "a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/"
+ "a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/"
+ "a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/"
+ "a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/"
+ "a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/"
+ "a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/"
+ "a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/"
+ "a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/"
+ "a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/"
+ "a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/"
+ "a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/"
+ "a/a/a/a/a/a/a/a/a/x/y.h";
+ EXPECT_EQ(218, std::count(path.begin(), path.end(), '/'));
+ CanonicalizePath(&path, &slash_bits);
+ EXPECT_EQ(slash_bits, 0x0);
+}
+#endif // !_WIN32
TEST(CanonicalizePath, UpDir) {
string path, err;
@@ -353,11 +442,13 @@ TEST(CanonicalizePath, NotNullTerminated) {
EXPECT_EQ(strlen("foo"), len);
EXPECT_EQ("foo/. bar/.", string(path));
+ // Verify that foo/..file gets canonicalized to 'file' without
+ // touching the rest of the string.
path = "foo/../file bar/.";
len = strlen("foo/../file");
CanonicalizePath(&path[0], &len, &unused);
EXPECT_EQ(strlen("file"), len);
- EXPECT_EQ("file ./file bar/.", string(path));
+ EXPECT_EQ("file../file bar/.", string(path));
}
TEST(PathEscaping, TortureTest) {