From ca072e3734f35511f989111e79db981ba1eea8e2 Mon Sep 17 00:00:00 2001 From: Sergiu Deitsch Date: Thu, 11 Sep 2025 15:13:49 +0200 Subject: [PATCH] cmListFileLexer: Test for broken UTF-32-(BE|LE) BOM --- Source/LexerParser/cmListFileLexer.c | 13 +++++++++++-- Source/LexerParser/cmListFileLexer.in.l | 13 +++++++++++-- .../Syntax/Broken-BOM-UTF-32-BE-result.txt | 1 + .../Syntax/Broken-BOM-UTF-32-BE-stderr.txt | Bin 0 -> 121 bytes Tests/RunCMake/Syntax/Broken-BOM-UTF-32-BE.cmake | Bin 0 -> 3 bytes .../Syntax/Broken-BOM-UTF-32-LE-result.txt | 1 + .../Syntax/Broken-BOM-UTF-32-LE-stderr.txt | Bin 0 -> 121 bytes Tests/RunCMake/Syntax/Broken-BOM-UTF-32-LE.cmake | Bin 0 -> 3 bytes Tests/RunCMake/Syntax/RunCMakeTest.cmake | 2 ++ 9 files changed, 26 insertions(+), 4 deletions(-) create mode 100644 Tests/RunCMake/Syntax/Broken-BOM-UTF-32-BE-result.txt create mode 100644 Tests/RunCMake/Syntax/Broken-BOM-UTF-32-BE-stderr.txt create mode 100644 Tests/RunCMake/Syntax/Broken-BOM-UTF-32-BE.cmake create mode 100644 Tests/RunCMake/Syntax/Broken-BOM-UTF-32-LE-result.txt create mode 100644 Tests/RunCMake/Syntax/Broken-BOM-UTF-32-LE-stderr.txt create mode 100644 Tests/RunCMake/Syntax/Broken-BOM-UTF-32-LE.cmake diff --git a/Source/LexerParser/cmListFileLexer.c b/Source/LexerParser/cmListFileLexer.c index 8bf12a630e..e945afcfb7 100644 --- a/Source/LexerParser/cmListFileLexer.c +++ b/Source/LexerParser/cmListFileLexer.c @@ -2715,6 +2715,7 @@ void cmListFileLexer_Delete(cmListFileLexer* lexer) static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f) { unsigned char b[2]; + size_t n; if (fread(b, 1, 2, f) == 2) { if (b[0] == 0xEF && b[1] == 0xBB) { if (fread(b, 1, 1, f) == 1 && b[0] == 0xBF) { @@ -2730,13 +2731,21 @@ static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f) } else if (b[0] == 0xFF && b[1] == 0xFE) { fpos_t p; fgetpos(f, &p); - if (fread(b, 1, 2, f) == 2 && b[0] == 0 && b[1] == 0) { + n = fread(b, 1, 2, f); + if (n == 2 && b[0] == 0 && b[1] == 0) { return cmListFileLexer_BOM_UTF32LE; } if (fsetpos(f, &p) != 0) { return cmListFileLexer_BOM_Broken; } - return cmListFileLexer_BOM_UTF16LE; + /* In case we were able to subsequently read only a single byte out of two + (i.e., three in total), the file must be corrupt and the BOM cannot + represent a UTF-16-LE BOM since each code unit must consist of two + bytes. This avoids incorrectly detecting an incomplete UTF-32-LE BOM as + UTF-16-LE input. */ + if (n % 2 == 0) { + return cmListFileLexer_BOM_UTF16LE; + } } } if (fseek(f, 0, SEEK_SET) != 0) { diff --git a/Source/LexerParser/cmListFileLexer.in.l b/Source/LexerParser/cmListFileLexer.in.l index 21e611fdf8..4ae18aae4f 100644 --- a/Source/LexerParser/cmListFileLexer.in.l +++ b/Source/LexerParser/cmListFileLexer.in.l @@ -442,6 +442,7 @@ void cmListFileLexer_Delete(cmListFileLexer* lexer) static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f) { unsigned char b[2]; + size_t n; if (fread(b, 1, 2, f) == 2) { if (b[0] == 0xEF && b[1] == 0xBB) { if (fread(b, 1, 1, f) == 1 && b[0] == 0xBF) { @@ -457,13 +458,21 @@ static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f) } else if (b[0] == 0xFF && b[1] == 0xFE) { fpos_t p; fgetpos(f, &p); - if (fread(b, 1, 2, f) == 2 && b[0] == 0 && b[1] == 0) { + n = fread(b, 1, 2, f); + if (n == 2 && b[0] == 0 && b[1] == 0) { return cmListFileLexer_BOM_UTF32LE; } if (fsetpos(f, &p) != 0) { return cmListFileLexer_BOM_Broken; } - return cmListFileLexer_BOM_UTF16LE; + /* In case we were able to subsequently read only a single byte out of two + (i.e., three in total), the file must be corrupt and the BOM cannot + represent a UTF-16-LE BOM since each code unit must consist of two + bytes. This avoids incorrectly detecting an incomplete UTF-32-LE BOM as + UTF-16-LE input. */ + if (n % 2 == 0) { + return cmListFileLexer_BOM_UTF16LE; + } } } if (fseek(f, 0, SEEK_SET) != 0) { diff --git a/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-BE-result.txt b/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-BE-result.txt new file mode 100644 index 0000000000..d00491fd7e --- /dev/null +++ b/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-BE-result.txt @@ -0,0 +1 @@ +1 diff --git a/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-BE-stderr.txt b/Tests/RunCMake/Syntax/Broken-BOM-UTF-32-BE-stderr.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6c4874bffbbd9ca73da7ed58ba2d27aff9d7f77 GIT binary patch literal 121 zcmWm4u?~VT5C&jop5hNPZ31qLHFYC8-~*iPMGR?6&P(9w4fxI9b)<#iMT&qFPRhdB zxcg|H)6HCZ