From dbd933365ec780d27ab7c0dfba30dc1af1094607 Mon Sep 17 00:00:00 2001 From: Brad King Date: Mon, 14 Oct 2013 15:13:11 -0400 Subject: cmListFileLexer: Allow a leading UTF-8 Byte-Order-Mark (#11137) Teach the lexer to read a UTF-8, UTF-16 BE/LE, or UTF-32 BE/LE Byte-Order-Mark from the start of a file if any is present. Report an error on files using UTF-16 or UTF-32 and accept a UTF-8 or missing BOM. --- Source/cmListFileCache.cxx | 15 ++++++- Source/cmListFileLexer.c | 57 +++++++++++++++++++++++-- Source/cmListFileLexer.h | 14 +++++- Source/cmListFileLexer.in.l | 57 +++++++++++++++++++++++-- Tests/RunCMake/Syntax/BOM-UTF-16-BE-result.txt | 1 + Tests/RunCMake/Syntax/BOM-UTF-16-BE-stderr.txt | 6 +++ Tests/RunCMake/Syntax/BOM-UTF-16-BE.cmake | Bin 0 -> 54 bytes Tests/RunCMake/Syntax/BOM-UTF-16-LE-result.txt | 1 + Tests/RunCMake/Syntax/BOM-UTF-16-LE-stderr.txt | 6 +++ Tests/RunCMake/Syntax/BOM-UTF-16-LE.cmake | Bin 0 -> 54 bytes Tests/RunCMake/Syntax/BOM-UTF-32-BE-result.txt | 1 + Tests/RunCMake/Syntax/BOM-UTF-32-BE-stderr.txt | 6 +++ Tests/RunCMake/Syntax/BOM-UTF-32-BE.cmake | Bin 0 -> 108 bytes Tests/RunCMake/Syntax/BOM-UTF-32-LE-result.txt | 1 + Tests/RunCMake/Syntax/BOM-UTF-32-LE-stderr.txt | 6 +++ Tests/RunCMake/Syntax/BOM-UTF-32-LE.cmake | Bin 0 -> 108 bytes Tests/RunCMake/Syntax/BOM-UTF-8-stdout.txt | 1 + Tests/RunCMake/Syntax/BOM-UTF-8.cmake | 1 + Tests/RunCMake/Syntax/RunCMakeTest.cmake | 5 +++ 19 files changed, 168 insertions(+), 10 deletions(-) create mode 100644 Tests/RunCMake/Syntax/BOM-UTF-16-BE-result.txt create mode 100644 Tests/RunCMake/Syntax/BOM-UTF-16-BE-stderr.txt create mode 100644 Tests/RunCMake/Syntax/BOM-UTF-16-BE.cmake create mode 100644 Tests/RunCMake/Syntax/BOM-UTF-16-LE-result.txt create mode 100644 Tests/RunCMake/Syntax/BOM-UTF-16-LE-stderr.txt create mode 100644 Tests/RunCMake/Syntax/BOM-UTF-16-LE.cmake create mode 100644 Tests/RunCMake/Syntax/BOM-UTF-32-BE-result.txt create mode 100644 Tests/RunCMake/Syntax/BOM-UTF-32-BE-stderr.txt create mode 100644 Tests/RunCMake/Syntax/BOM-UTF-32-BE.cmake create mode 100644 Tests/RunCMake/Syntax/BOM-UTF-32-LE-result.txt create mode 100644 Tests/RunCMake/Syntax/BOM-UTF-32-LE-stderr.txt create mode 100644 Tests/RunCMake/Syntax/BOM-UTF-32-LE.cmake create mode 100644 Tests/RunCMake/Syntax/BOM-UTF-8-stdout.txt create mode 100644 Tests/RunCMake/Syntax/BOM-UTF-8.cmake diff --git a/Source/cmListFileCache.cxx b/Source/cmListFileCache.cxx index 898f379..f6ea4b1 100644 --- a/Source/cmListFileCache.cxx +++ b/Source/cmListFileCache.cxx @@ -57,13 +57,26 @@ cmListFileParser::~cmListFileParser() bool cmListFileParser::ParseFile() { // Open the file. - if(!cmListFileLexer_SetFileName(this->Lexer, this->FileName)) + cmListFileLexer_BOM bom; + if(!cmListFileLexer_SetFileName(this->Lexer, this->FileName, &bom)) { cmSystemTools::Error("cmListFileCache: error can not open file ", this->FileName); return false; } + // Verify the Byte-Order-Mark, if any. + if(bom != cmListFileLexer_BOM_None && + bom != cmListFileLexer_BOM_UTF8) + { + cmListFileLexer_SetFileName(this->Lexer, 0, 0); + cmOStringStream m; + m << "File\n " << this->FileName << "\n" + << "starts with a Byte-Order-Mark that is not UTF-8."; + this->Makefile->IssueMessage(cmake::FATAL_ERROR, m.str()); + return false; + } + // Use a simple recursive-descent parser to process the token // stream. bool haveNewline = true; diff --git a/Source/cmListFileLexer.c b/Source/cmListFileLexer.c index ad5a83d..394bd17 100644 --- a/Source/cmListFileLexer.c +++ b/Source/cmListFileLexer.c @@ -2307,19 +2307,68 @@ cmListFileLexer* cmListFileLexer_New() /*--------------------------------------------------------------------------*/ void cmListFileLexer_Delete(cmListFileLexer* lexer) { - cmListFileLexer_SetFileName(lexer, 0); + cmListFileLexer_SetFileName(lexer, 0, 0); free(lexer); } /*--------------------------------------------------------------------------*/ -int cmListFileLexer_SetFileName(cmListFileLexer* lexer, const char* name) +static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f) +{ + unsigned char b[2]; + if(fread(b, 1, 2, f) == 2) + { + if(b[0] == 0xEF && b[1] == 0xBB) + { + if(fread(b, 1, 1, f) == 1 && b[0] == 0xBF) + { + return cmListFileLexer_BOM_UTF8; + } + } + else if(b[0] == 0xFE && b[1] == 0xFF) + { + /* UTF-16 BE */ + return cmListFileLexer_BOM_UTF16BE; + } + else if(b[0] == 0 && b[1] == 0) + { + if(fread(b, 1, 2, f) == 2 && b[0] == 0xFE && b[1] == 0xFF) + { + return cmListFileLexer_BOM_UTF32BE; + } + } + else if(b[0] == 0xFF && b[1] == 0xFE) + { + fpos_t p; + fgetpos(f, &p); + if(fread(b, 1, 2, f) == 2 && b[0] == 0 && b[1] == 0) + { + return cmListFileLexer_BOM_UTF32LE; + } + fsetpos(f, &p); + return cmListFileLexer_BOM_UTF16LE; + } + } + rewind(f); + return cmListFileLexer_BOM_None; +} + +/*--------------------------------------------------------------------------*/ +int cmListFileLexer_SetFileName(cmListFileLexer* lexer, const char* name, + cmListFileLexer_BOM* bom) { int result = 1; cmListFileLexerDestroy(lexer); if(name) { lexer->file = fopen(name, "r"); - if(!lexer->file) + if(lexer->file) + { + if(bom) + { + *bom = cmListFileLexer_ReadBOM(lexer->file); + } + } + else { result = 0; } @@ -2365,7 +2414,7 @@ cmListFileLexer_Token* cmListFileLexer_Scan(cmListFileLexer* lexer) } else { - cmListFileLexer_SetFileName(lexer, 0); + cmListFileLexer_SetFileName(lexer, 0, 0); return 0; } } diff --git a/Source/cmListFileLexer.h b/Source/cmListFileLexer.h index cc78b5c..719347c 100644 --- a/Source/cmListFileLexer.h +++ b/Source/cmListFileLexer.h @@ -36,6 +36,17 @@ struct cmListFileLexer_Token_s int column; }; +enum cmListFileLexer_BOM_e +{ + cmListFileLexer_BOM_None, + cmListFileLexer_BOM_UTF8, + cmListFileLexer_BOM_UTF16BE, + cmListFileLexer_BOM_UTF16LE, + cmListFileLexer_BOM_UTF32BE, + cmListFileLexer_BOM_UTF32LE +}; +typedef enum cmListFileLexer_BOM_e cmListFileLexer_BOM; + typedef struct cmListFileLexer_s cmListFileLexer; #ifdef __cplusplus @@ -44,7 +55,8 @@ extern "C" #endif cmListFileLexer* cmListFileLexer_New(); -int cmListFileLexer_SetFileName(cmListFileLexer*, const char*); +int cmListFileLexer_SetFileName(cmListFileLexer*, const char*, + cmListFileLexer_BOM* bom); int cmListFileLexer_SetString(cmListFileLexer*, const char*); cmListFileLexer_Token* cmListFileLexer_Scan(cmListFileLexer*); long cmListFileLexer_GetCurrentLine(cmListFileLexer*); diff --git a/Source/cmListFileLexer.in.l b/Source/cmListFileLexer.in.l index 89f2917..a660d37 100644 --- a/Source/cmListFileLexer.in.l +++ b/Source/cmListFileLexer.in.l @@ -328,19 +328,68 @@ cmListFileLexer* cmListFileLexer_New() /*--------------------------------------------------------------------------*/ void cmListFileLexer_Delete(cmListFileLexer* lexer) { - cmListFileLexer_SetFileName(lexer, 0); + cmListFileLexer_SetFileName(lexer, 0, 0); free(lexer); } /*--------------------------------------------------------------------------*/ -int cmListFileLexer_SetFileName(cmListFileLexer* lexer, const char* name) +static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f) +{ + unsigned char b[2]; + if(fread(b, 1, 2, f) == 2) + { + if(b[0] == 0xEF && b[1] == 0xBB) + { + if(fread(b, 1, 1, f) == 1 && b[0] == 0xBF) + { + return cmListFileLexer_BOM_UTF8; + } + } + else if(b[0] == 0xFE && b[1] == 0xFF) + { + /* UTF-16 BE */ + return cmListFileLexer_BOM_UTF16BE; + } + else if(b[0] == 0 && b[1] == 0) + { + if(fread(b, 1, 2, f) == 2 && b[0] == 0xFE && b[1] == 0xFF) + { + return cmListFileLexer_BOM_UTF32BE; + } + } + else if(b[0] == 0xFF && b[1] == 0xFE) + { + fpos_t p; + fgetpos(f, &p); + if(fread(b, 1, 2, f) == 2 && b[0] == 0 && b[1] == 0) + { + return cmListFileLexer_BOM_UTF32LE; + } + fsetpos(f, &p); + return cmListFileLexer_BOM_UTF16LE; + } + } + rewind(f); + return cmListFileLexer_BOM_None; +} + +/*--------------------------------------------------------------------------*/ +int cmListFileLexer_SetFileName(cmListFileLexer* lexer, const char* name, + cmListFileLexer_BOM* bom) { int result = 1; cmListFileLexerDestroy(lexer); if(name) { lexer->file = fopen(name, "r"); - if(!lexer->file) + if(lexer->file) + { + if(bom) + { + *bom = cmListFileLexer_ReadBOM(lexer->file); + } + } + else { result = 0; } @@ -386,7 +435,7 @@ cmListFileLexer_Token* cmListFileLexer_Scan(cmListFileLexer* lexer) } else { - cmListFileLexer_SetFileName(lexer, 0); + cmListFileLexer_SetFileName(lexer, 0, 0); return 0; } } diff --git a/Tests/RunCMake/Syntax/BOM-UTF-16-BE-result.txt b/Tests/RunCMake/Syntax/BOM-UTF-16-BE-result.txt new file mode 100644 index 0000000..d00491f --- /dev/null +++ b/Tests/RunCMake/Syntax/BOM-UTF-16-BE-result.txt @@ -0,0 +1 @@ +1 diff --git a/Tests/RunCMake/Syntax/BOM-UTF-16-BE-stderr.txt b/Tests/RunCMake/Syntax/BOM-UTF-16-BE-stderr.txt new file mode 100644 index 0000000..b3f1e47 --- /dev/null +++ b/Tests/RunCMake/Syntax/BOM-UTF-16-BE-stderr.txt @@ -0,0 +1,6 @@ +CMake Error at CMakeLists.txt:3 \(include\): + File + + .*/Tests/RunCMake/Syntax/BOM-UTF-16-BE.cmake + + starts with a Byte-Order-Mark that is not UTF-8. diff --git a/Tests/RunCMake/Syntax/BOM-UTF-16-BE.cmake b/Tests/RunCMake/Syntax/BOM-UTF-16-BE.cmake new file mode 100644 index 0000000..c51f6e6 Binary files /dev/null and b/Tests/RunCMake/Syntax/BOM-UTF-16-BE.cmake differ diff --git a/Tests/RunCMake/Syntax/BOM-UTF-16-LE-result.txt b/Tests/RunCMake/Syntax/BOM-UTF-16-LE-result.txt new file mode 100644 index 0000000..d00491f --- /dev/null +++ b/Tests/RunCMake/Syntax/BOM-UTF-16-LE-result.txt @@ -0,0 +1 @@ +1 diff --git a/Tests/RunCMake/Syntax/BOM-UTF-16-LE-stderr.txt b/Tests/RunCMake/Syntax/BOM-UTF-16-LE-stderr.txt new file mode 100644 index 0000000..c08c902 --- /dev/null +++ b/Tests/RunCMake/Syntax/BOM-UTF-16-LE-stderr.txt @@ -0,0 +1,6 @@ +CMake Error at CMakeLists.txt:3 \(include\): + File + + .*/Tests/RunCMake/Syntax/BOM-UTF-16-LE.cmake + + starts with a Byte-Order-Mark that is not UTF-8. diff --git a/Tests/RunCMake/Syntax/BOM-UTF-16-LE.cmake b/Tests/RunCMake/Syntax/BOM-UTF-16-LE.cmake new file mode 100644 index 0000000..b57446f Binary files /dev/null and b/Tests/RunCMake/Syntax/BOM-UTF-16-LE.cmake differ diff --git a/Tests/RunCMake/Syntax/BOM-UTF-32-BE-result.txt b/Tests/RunCMake/Syntax/BOM-UTF-32-BE-result.txt new file mode 100644 index 0000000..d00491f --- /dev/null +++ b/Tests/RunCMake/Syntax/BOM-UTF-32-BE-result.txt @@ -0,0 +1 @@ +1 diff --git a/Tests/RunCMake/Syntax/BOM-UTF-32-BE-stderr.txt b/Tests/RunCMake/Syntax/BOM-UTF-32-BE-stderr.txt new file mode 100644 index 0000000..5dde4e3 --- /dev/null +++ b/Tests/RunCMake/Syntax/BOM-UTF-32-BE-stderr.txt @@ -0,0 +1,6 @@ +CMake Error at CMakeLists.txt:3 \(include\): + File + + .*/Tests/RunCMake/Syntax/BOM-UTF-32-BE.cmake + + starts with a Byte-Order-Mark that is not UTF-8. diff --git a/Tests/RunCMake/Syntax/BOM-UTF-32-BE.cmake b/Tests/RunCMake/Syntax/BOM-UTF-32-BE.cmake new file mode 100644 index 0000000..23c57f3 Binary files /dev/null and b/Tests/RunCMake/Syntax/BOM-UTF-32-BE.cmake differ diff --git a/Tests/RunCMake/Syntax/BOM-UTF-32-LE-result.txt b/Tests/RunCMake/Syntax/BOM-UTF-32-LE-result.txt new file mode 100644 index 0000000..d00491f --- /dev/null +++ b/Tests/RunCMake/Syntax/BOM-UTF-32-LE-result.txt @@ -0,0 +1 @@ +1 diff --git a/Tests/RunCMake/Syntax/BOM-UTF-32-LE-stderr.txt b/Tests/RunCMake/Syntax/BOM-UTF-32-LE-stderr.txt new file mode 100644 index 0000000..eb054ec --- /dev/null +++ b/Tests/RunCMake/Syntax/BOM-UTF-32-LE-stderr.txt @@ -0,0 +1,6 @@ +CMake Error at CMakeLists.txt:3 \(include\): + File + + .*/Tests/RunCMake/Syntax/BOM-UTF-32-LE.cmake + + starts with a Byte-Order-Mark that is not UTF-8. diff --git a/Tests/RunCMake/Syntax/BOM-UTF-32-LE.cmake b/Tests/RunCMake/Syntax/BOM-UTF-32-LE.cmake new file mode 100644 index 0000000..c330f5b Binary files /dev/null and b/Tests/RunCMake/Syntax/BOM-UTF-32-LE.cmake differ diff --git a/Tests/RunCMake/Syntax/BOM-UTF-8-stdout.txt b/Tests/RunCMake/Syntax/BOM-UTF-8-stdout.txt new file mode 100644 index 0000000..5776d6e --- /dev/null +++ b/Tests/RunCMake/Syntax/BOM-UTF-8-stdout.txt @@ -0,0 +1 @@ +-- message diff --git a/Tests/RunCMake/Syntax/BOM-UTF-8.cmake b/Tests/RunCMake/Syntax/BOM-UTF-8.cmake new file mode 100644 index 0000000..bdff83b --- /dev/null +++ b/Tests/RunCMake/Syntax/BOM-UTF-8.cmake @@ -0,0 +1 @@ +message(STATUS "message") diff --git a/Tests/RunCMake/Syntax/RunCMakeTest.cmake b/Tests/RunCMake/Syntax/RunCMakeTest.cmake index 2d87328..d1a15c8 100644 --- a/Tests/RunCMake/Syntax/RunCMakeTest.cmake +++ b/Tests/RunCMake/Syntax/RunCMakeTest.cmake @@ -1,5 +1,10 @@ include(RunCMake) +run_cmake(BOM-UTF-8) +run_cmake(BOM-UTF-16-LE) +run_cmake(BOM-UTF-16-BE) +run_cmake(BOM-UTF-32-LE) +run_cmake(BOM-UTF-32-BE) run_cmake(CommandSpaces) run_cmake(CommandTabs) run_cmake(CommandNewlines) -- cgit v0.12