From 8a37bfffcbea27ffbb9168b2740476336cd4963a Mon Sep 17 00:00:00 2001 From: Evan Martin Date: Tue, 17 Jul 2012 17:38:48 -0700 Subject: disallow crlf in manifest files It turns out to be trickier than expected to process these correctly. It turns out to also be trickier than expected to give a nice error message on encountering these. But the behavior prior to this patch would just be silent failures where we attempted to examine paths that accidentally contained embedded \r. For now, fix all regexes of the form [^...] to include \r in the excluded block, then assert that we get a vague lexer error near the problem. In the future perhaps we can open manifest files in text mode on Windows or just disallow Windows-style CRLF in the manual. --- src/lexer.cc | 110 ++++++++++++++++++++++++-------------------- src/lexer.in.cc | 4 +- src/manifest_parser_test.cc | 20 ++++++++ 3 files changed, 81 insertions(+), 53 deletions(-) diff --git a/src/lexer.cc b/src/lexer.cc index b3efe22..ca6f367 100644 --- a/src/lexer.cc +++ b/src/lexer.cc @@ -127,7 +127,7 @@ Lexer::Token Lexer::ReadToken() { unsigned int yyaccept = 0; static const unsigned char yybm[] = { 0, 64, 64, 64, 64, 64, 64, 64, - 64, 64, 0, 64, 64, 64, 64, 64, + 64, 64, 0, 64, 64, 0, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 192, 64, 64, 64, 64, 64, 64, 64, @@ -216,7 +216,8 @@ yy3: yy4: yyaccept = 1; yych = *(q = ++p); - if (yych >= 0x01) goto yy60; + if (yych <= 0x00) goto yy5; + if (yych != '\r') goto yy60; yy5: { token = ERROR; break; } yy6: @@ -354,7 +355,9 @@ yy60: if (yybm[0+yych] & 64) { goto yy59; } - if (yych >= 0x01) goto yy62; + if (yych <= 0x00) goto yy61; + if (yych <= '\f') goto yy62; +yy61: p = q; if (yyaccept <= 0) { goto yy3; @@ -576,7 +579,7 @@ bool Lexer::ReadEvalString(EvalString* eval, bool path, string* err) { unsigned char yych; static const unsigned char yybm[] = { 0, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 0, 128, 128, 128, 128, 128, + 128, 128, 0, 128, 128, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 16, 128, 128, 128, 0, 128, 128, 128, @@ -609,24 +612,25 @@ bool Lexer::ReadEvalString(EvalString* eval, bool path, string* err) { 128, 128, 128, 128, 128, 128, 128, 128, }; yych = *p; - if (yych <= '#') { + if (yych <= ' ') { if (yych <= '\n') { if (yych <= 0x00) goto yy96; if (yych >= '\n') goto yy92; } else { - if (yych == ' ') goto yy92; + if (yych == '\r') goto yy98; + if (yych >= ' ') goto yy92; } } else { - if (yych <= ':') { - if (yych <= '$') goto yy94; - if (yych >= ':') goto yy92; + if (yych <= '9') { + if (yych == '$') goto yy94; } else { + if (yych <= ':') goto yy92; if (yych == '|') goto yy92; } } ++p; yych = *p; - goto yy120; + goto yy121; yy91: { eval->AddText(StringPiece(start, p - start)); @@ -649,39 +653,40 @@ yy94: ++p; if ((yych = *p) <= '/') { if (yych <= ' ') { - if (yych == '\n') goto yy109; - if (yych <= 0x1F) goto yy98; - goto yy100; + if (yych == '\n') goto yy110; + if (yych <= 0x1F) goto yy99; + goto yy101; } else { if (yych <= '$') { - if (yych <= '#') goto yy98; - goto yy102; + if (yych <= '#') goto yy99; + goto yy103; } else { - if (yych == '-') goto yy104; - goto yy98; + if (yych == '-') goto yy105; + goto yy99; } } } else { if (yych <= '^') { if (yych <= ':') { - if (yych <= '9') goto yy104; - goto yy106; + if (yych <= '9') goto yy105; + goto yy107; } else { - if (yych <= '@') goto yy98; - if (yych <= 'Z') goto yy104; - goto yy98; + if (yych <= '@') goto yy99; + if (yych <= 'Z') goto yy105; + goto yy99; } } else { if (yych <= '`') { - if (yych <= '_') goto yy104; - goto yy98; + if (yych <= '_') goto yy105; + goto yy99; } else { - if (yych <= 'z') goto yy104; - if (yych <= '{') goto yy108; - goto yy98; + if (yych <= 'z') goto yy105; + if (yych <= '{') goto yy109; + goto yy99; } } } +yy95: { last_token_ = start; return Error("lexing error", err); @@ -693,83 +698,86 @@ yy96: return Error("unexpected EOF", err); } yy98: - ++p; + yych = *++p; + goto yy95; yy99: + ++p; +yy100: { last_token_ = start; return Error("bad $-escape (literal $ must be written as $$)", err); } -yy100: +yy101: ++p; { eval->AddText(StringPiece(" ", 1)); continue; } -yy102: +yy103: ++p; { eval->AddText(StringPiece("$", 1)); continue; } -yy104: +yy105: ++p; yych = *p; - goto yy118; -yy105: + goto yy119; +yy106: { eval->AddSpecial(StringPiece(start + 1, p - start - 1)); continue; } -yy106: +yy107: ++p; { eval->AddText(StringPiece(":", 1)); continue; } -yy108: +yy109: yych = *(q = ++p); if (yybm[0+yych] & 32) { - goto yy112; + goto yy113; } - goto yy99; -yy109: + goto yy100; +yy110: ++p; yych = *p; if (yybm[0+yych] & 16) { - goto yy109; + goto yy110; } { continue; } -yy112: +yy113: ++p; yych = *p; if (yybm[0+yych] & 32) { - goto yy112; + goto yy113; } - if (yych == '}') goto yy115; + if (yych == '}') goto yy116; p = q; - goto yy99; -yy115: + goto yy100; +yy116: ++p; { eval->AddSpecial(StringPiece(start + 2, p - start - 3)); continue; } -yy117: +yy118: ++p; yych = *p; -yy118: +yy119: if (yybm[0+yych] & 64) { - goto yy117; + goto yy118; } - goto yy105; -yy119: + goto yy106; +yy120: ++p; yych = *p; -yy120: +yy121: if (yybm[0+yych] & 128) { - goto yy119; + goto yy120; } goto yy91; } diff --git a/src/lexer.in.cc b/src/lexer.in.cc index e478921..852d6e9 100644 --- a/src/lexer.in.cc +++ b/src/lexer.in.cc @@ -130,7 +130,7 @@ Lexer::Token Lexer::ReadToken() { simple_varname = [a-zA-Z0-9_-]+; varname = [a-zA-Z0-9_.-]+; - [ ]*"#"[^\000\n]*"\n" { continue; } + [ ]*"#"[^\000\r\n]*"\n" { continue; } [ ]*[\n] { token = NEWLINE; break; } [ ]+ { token = INDENT; break; } "build" { token = BUILD; break; } @@ -200,7 +200,7 @@ bool Lexer::ReadEvalString(EvalString* eval, bool path, string* err) { for (;;) { start = p; /*!re2c - [^$ :\n|\000]+ { + [^$ :\r\n|\000]+ { eval->AddText(StringPiece(start, p - start)); continue; } diff --git a/src/manifest_parser_test.cc b/src/manifest_parser_test.cc index 9c6644c..3261d39 100644 --- a/src/manifest_parser_test.cc +++ b/src/manifest_parser_test.cc @@ -682,3 +682,23 @@ TEST_F(ParserTest, UTF8) { " command = true\n" " description = compilaci\xC3\xB3\n")); } + +// We might want to eventually allow CRLF to be nice to Windows developers, +// but for now just verify we error out with a nice message. +TEST_F(ParserTest, CRLF) { + State state; + ManifestParser parser(&state, NULL); + string err; + + EXPECT_FALSE(parser.ParseTest("# comment with crlf\r\n", + &err)); + EXPECT_EQ("input:1: lexing error\n", + err); + + EXPECT_FALSE(parser.ParseTest("foo = foo\nbar = bar\r\n", + &err)); + EXPECT_EQ("input:2: lexing error\n" + "bar = bar\r\n" + " ^ near here", + err); +} -- cgit v0.12