diff --git a/Source/LexerParser/cmListFileLexer.c b/Source/LexerParser/cmListFileLexer.c index 8bf12a630e..e945afcfb7 100644 --- a/Source/LexerParser/cmListFileLexer.c +++ b/Source/LexerParser/cmListFileLexer.c @@ -2715,6 +2715,7 @@ void cmListFileLexer_Delete(cmListFileLexer* lexer) static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f) { unsigned char b[2]; + size_t n; if (fread(b, 1, 2, f) == 2) { if (b[0] == 0xEF && b[1] == 0xBB) { if (fread(b, 1, 1, f) == 1 && b[0] == 0xBF) { @@ -2730,13 +2731,21 @@ static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f) } else if (b[0] == 0xFF && b[1] == 0xFE) { fpos_t p; fgetpos(f, &p); - if (fread(b, 1, 2, f) == 2 && b[0] == 0 && b[1] == 0) { + n = fread(b, 1, 2, f); + if (n == 2 && b[0] == 0 && b[1] == 0) { return cmListFileLexer_BOM_UTF32LE; } if (fsetpos(f, &p) != 0) { return cmListFileLexer_BOM_Broken; } - return cmListFileLexer_BOM_UTF16LE; + /* In case we were able to subsequently read only a single byte out of two + (i.e., three in total), the file must be corrupt and the BOM cannot + represent a UTF-16-LE BOM since each code unit must consist of two + bytes. This avoids incorrectly detecting an incomplete UTF-32-LE BOM as + UTF-16-LE input. */ + if (n % 2 == 0) { + return cmListFileLexer_BOM_UTF16LE; + } } } if (fseek(f, 0, SEEK_SET) != 0) { diff --git a/Source/LexerParser/cmListFileLexer.in.l b/Source/LexerParser/cmListFileLexer.in.l index 21e611fdf8..4ae18aae4f 100644 --- a/Source/LexerParser/cmListFileLexer.in.l +++ b/Source/LexerParser/cmListFileLexer.in.l @@ -442,6 +442,7 @@ void cmListFileLexer_Delete(cmListFileLexer* lexer) static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f) { unsigned char b[2]; + size_t n; if (fread(b, 1, 2, f) == 2) { if (b[0] == 0xEF && b[1] == 0xBB) { if (fread(b, 1, 1, f) == 1 && b[0] == 0xBF) { @@ -457,13 +458,21 @@ static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f) } else if (b[0] == 0xFF && b[1] == 0xFE) { fpos_t p; fgetpos(f, &p); - if (fread(b, 1, 2, f) == 2 && b[0] == 0 && b[1] == 0) { + n = fread(b, 1, 2, f); + if (n == 2 && b[0] == 0 && b[1] == 0) { return cmListFileLexer_BOM_UTF32LE; } if (fsetpos(f, &p) != 0) { return cmListFileLexer_BOM_Broken; } - return cmListFileLexer_BOM_UTF16LE; + /* In case we were able to subsequently read only a single byte out of two + (i.e., three in total), the file must be corrupt and the BOM cannot + represent a UTF-16-LE BOM since each code unit must consist of two + bytes. This avoids incorrectly detecting an incomplete UTF-32-LE BOM as + UTF-16-LE input. */ + if (n % 2 == 0) { + return cmListFileLexer_BOM_UTF16LE; + } } } if (fseek(f, 0, SEEK_SET) != 0) { diff --git a/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-BE-result.txt b/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-BE-result.txt new file mode 100644 index 0000000000..d00491fd7e --- /dev/null +++ b/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-BE-result.txt @@ -0,0 +1 @@ +1 diff --git a/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-BE-stderr.txt b/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-BE-stderr.txt new file mode 100644 index 0000000000..d6c4874bff Binary files /dev/null and b/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-BE-stderr.txt differ diff --git a/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-BE.cmake b/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-BE.cmake new file mode 100644 index 0000000000..da856c17e0 Binary files /dev/null and b/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-BE.cmake differ diff --git a/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-LE-result.txt b/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-LE-result.txt new file mode 100644 index 0000000000..d00491fd7e --- /dev/null +++ b/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-LE-result.txt @@ -0,0 +1 @@ +1 diff --git a/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-LE-stderr.txt b/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-LE-stderr.txt new file mode 100644 index 0000000000..c6968f9a76 Binary files /dev/null and b/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-LE-stderr.txt differ diff --git a/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-LE.cmake b/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-LE.cmake new file mode 100644 index 0000000000..6e00d25c6c Binary files /dev/null and b/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-LE.cmake differ diff --git a/Tests/RunCMake/Syntax/RunCMakeTest.cmake b/Tests/RunCMake/Syntax/RunCMakeTest.cmake index 2817f764f3..42ea532f12 100644 --- a/Tests/RunCMake/Syntax/RunCMakeTest.cmake +++ b/Tests/RunCMake/Syntax/RunCMakeTest.cmake @@ -5,6 +5,8 @@ run_cmake(BOM-UTF-16-LE) run_cmake(BOM-UTF-16-BE) run_cmake(BOM-UTF-32-LE) run_cmake(BOM-UTF-32-BE) +run_cmake(Broken-BOM-UTF-32-LE) +run_cmake(Broken-BOM-UTF-32-BE) run_cmake(CommandSpaces) run_cmake(CommandTabs) run_cmake(CommandNewlines)