cmListFileLexer: Do not require files to be seekable

Read the BOM sequentially and keep any bytes that turn out not to be part
of a BOM so that they can be consumed later. This allows a FIFO to be used
as input, e.g., for piped input or Bash process substitution.
This commit is contained in:
Sergiu Deitsch
2025-09-08 17:07:47 +02:00
parent 9a53a28596
commit 54161c70d5
13 changed files with 178 additions and 61 deletions

View File

@@ -773,6 +773,9 @@ struct cmListFileLexer_s
size_t size;
FILE* file;
size_t cr;
char read_buffer[4];
size_t read_size;
size_t read_position;
char* string_buffer;
char* string_position;
size_t string_left;
@@ -2626,9 +2629,26 @@ static int cmListFileLexerInput(cmListFileLexer* lexer, char* buffer,
does not convert newlines on all platforms. Move any
trailing CR to the start of the buffer for the next read. */
size_t cr = lexer->cr;
size_t n;
size_t n = 0;
buffer[0] = '\r';
n = fread(buffer + cr, 1, bufferSize - cr, lexer->file);
size_t actualBufferSize = bufferSize - cr;
char* p = buffer + cr;
size_t readLeft = lexer->read_size - lexer->read_position;
/* Absorb the bytes that were read during BOM detection, if any. */
if (readLeft > 0) {
size_t actualReadSize =
actualBufferSize >= readLeft ? readLeft : actualBufferSize;
memcpy(p, lexer->read_buffer + lexer->read_position, actualReadSize);
lexer->read_position += actualReadSize;
p += actualReadSize;
n += actualReadSize;
actualBufferSize -= actualReadSize;
}
n += fread(p, 1, actualBufferSize, lexer->file);
if (n) {
char* o = buffer;
const char* i = buffer;
@@ -2682,6 +2702,11 @@ static void cmListFileLexerDestroy(cmListFileLexer* lexer)
fclose(lexer->file);
lexer->file = 0;
}
if (lexer->read_size != 0) {
memset(lexer->read_buffer, 0, sizeof(lexer->read_buffer));
lexer->read_size = 0;
lexer->read_position = 0;
}
if (lexer->string_buffer) {
free(lexer->string_buffer);
lexer->string_buffer = 0;
@@ -2712,45 +2737,66 @@ void cmListFileLexer_Delete(cmListFileLexer* lexer)
}
/*--------------------------------------------------------------------------*/
static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f)
static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f,
unsigned char readBuffer[4],
size_t* readSize)
{
unsigned char b[2];
size_t n;
if (fread(b, 1, 2, f) == 2) {
/* Read the up to four bytes that might correspond to a BOM. In case these
bytes turn out not to represent a BOM, save them for later consumption in
order to avoid seeking the file (which might not be seekable, e.g., if
it's a pipe). */
unsigned char* b = readBuffer;
size_t n = fread(b, 1, 2, f);
*readSize = n; /* Initialize first and then accumulate */
if (n == 2) {
if (b[0] == 0xEF && b[1] == 0xBB) {
if (fread(b, 1, 1, f) == 1 && b[0] == 0xBF) {
return cmListFileLexer_BOM_UTF8;
n = fread(b + 2, 1, 1, f);
*readSize += n;
if (n == 1) {
if (b[2] == 0xBF) {
*readSize = 0; /* We consumed the BOM: discard it */
return cmListFileLexer_BOM_UTF8;
}
}
} else if (b[0] == 0xFE && b[1] == 0xFF) {
*readSize = 0; /* We consumed the BOM: discard it */
/* UTF-16 BE */
return cmListFileLexer_BOM_UTF16BE;
} else if (b[0] == 0 && b[1] == 0) {
if (fread(b, 1, 2, f) == 2 && b[0] == 0xFE && b[1] == 0xFF) {
return cmListFileLexer_BOM_UTF32BE;
n = fread(b + 2, 1, 2, f);
*readSize += n;
if (n == 2) {
if (b[2] == 0xFE && b[3] == 0xFF) {
*readSize = 0; /* We consumed the BOM: discard it */
return cmListFileLexer_BOM_UTF32BE;
}
}
} else if (b[0] == 0xFF && b[1] == 0xFE) {
fpos_t p;
fgetpos(f, &p);
n = fread(b, 1, 2, f);
if (n == 2 && b[0] == 0 && b[1] == 0) {
n = fread(b + 2, 1, 2, f);
*readSize += n;
if (n == 2 && b[2] == 0 && b[3] == 0) {
*readSize = 0; /* We consumed the BOM: discard it */
return cmListFileLexer_BOM_UTF32LE;
}
if (fsetpos(f, &p) != 0) {
return cmListFileLexer_BOM_Broken;
}
/* In case we were able to subsequently read only a single byte out of two
(i.e., three in total), the file must be corrupt and the BOM cannot
represent a UTF-16-LE BOM since each code unit must consist of two
bytes. This avoids incorrectly detecting an incomplete UTF-32-LE BOM as
UTF-16-LE input. */
if (n % 2 == 0) {
*readSize = n; /* We consumed the read bytes as BOM only partially */
memmove(b, b + 2, n);
return cmListFileLexer_BOM_UTF16LE;
}
}
}
if (fseek(f, 0, SEEK_SET) != 0) {
return cmListFileLexer_BOM_Broken;
}
return cmListFileLexer_BOM_None;
}
@@ -2770,7 +2816,13 @@ int cmListFileLexer_SetFileName(cmListFileLexer* lexer, const char* name,
#endif
if (lexer->file) {
if (bom) {
*bom = cmListFileLexer_ReadBOM(lexer->file);
*bom = cmListFileLexer_ReadBOM(
lexer->file, (unsigned char*)lexer->read_buffer, &lexer->read_size);
lexer->read_position = 0;
} else {
memset(lexer->read_buffer, 0, sizeof(lexer->read_buffer));
lexer->read_size = 0;
lexer->read_position = 0;
}
} else {
result = 0;
@@ -2789,10 +2841,15 @@ int cmListFileLexer_SetString(cmListFileLexer* lexer, char const* text,
/* text might be not NULL while length is 0. However, on some platforms
malloc(0) will return NULL. To avoid signaling an error to the caller in
such cases, ensure nonzero length. */
if (length > 0) {
lexer->string_buffer = (char*)malloc(length);
size_t read_size = lexer->read_size - lexer->read_position;
size_t string_size = read_size + length;
if (string_size > 0) {
lexer->string_buffer = (char*)malloc(string_size);
if (lexer->string_buffer) {
memcpy(lexer->string_buffer, text, length);
memcpy(lexer->string_buffer, lexer->read_buffer + lexer->read_position,
read_size);
memcpy(lexer->string_buffer + read_size, text, length);
lexer->read_position += read_size;
lexer->string_position = lexer->string_buffer;
lexer->string_left = length;
} else {

View File

@@ -39,6 +39,9 @@ struct cmListFileLexer_s
size_t size;
FILE* file;
size_t cr;
char read_buffer[4];
size_t read_size;
size_t read_position;
char* string_buffer;
char* string_position;
size_t string_left;
@@ -353,9 +356,26 @@ static int cmListFileLexerInput(cmListFileLexer* lexer, char* buffer,
does not convert newlines on all platforms. Move any
trailing CR to the start of the buffer for the next read. */
size_t cr = lexer->cr;
size_t n;
size_t n = 0;
buffer[0] = '\r';
n = fread(buffer + cr, 1, bufferSize - cr, lexer->file);
size_t actualBufferSize = bufferSize - cr;
char* p = buffer + cr;
size_t readLeft = lexer->read_size - lexer->read_position;
/* Absorb the bytes that were read during BOM detection, if any. */
if (readLeft > 0) {
size_t actualReadSize =
actualBufferSize >= readLeft ? readLeft : actualBufferSize;
memcpy(p, lexer->read_buffer + lexer->read_position, actualReadSize);
lexer->read_position += actualReadSize;
p += actualReadSize;
n += actualReadSize;
actualBufferSize -= actualReadSize;
}
n += fread(p, 1, actualBufferSize, lexer->file);
if (n) {
char* o = buffer;
const char* i = buffer;
@@ -409,6 +429,11 @@ static void cmListFileLexerDestroy(cmListFileLexer* lexer)
fclose(lexer->file);
lexer->file = 0;
}
if (lexer->read_size != 0) {
memset(lexer->read_buffer, 0, sizeof(lexer->read_buffer));
lexer->read_size = 0;
lexer->read_position = 0;
}
if (lexer->string_buffer) {
free(lexer->string_buffer);
lexer->string_buffer = 0;
@@ -439,45 +464,66 @@ void cmListFileLexer_Delete(cmListFileLexer* lexer)
}
/*--------------------------------------------------------------------------*/
static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f)
static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f,
unsigned char readBuffer[4],
size_t* readSize)
{
unsigned char b[2];
size_t n;
if (fread(b, 1, 2, f) == 2) {
/* Read the up to four bytes that might correspond to a BOM. In case these
bytes turn out not to represent a BOM, save them for later consumption in
order to avoid seeking the file (which might not be seekable, e.g., if
it's a pipe). */
unsigned char* b = readBuffer;
size_t n = fread(b, 1, 2, f);
*readSize = n; /* Initialize first and then accumulate */
if (n == 2) {
if (b[0] == 0xEF && b[1] == 0xBB) {
if (fread(b, 1, 1, f) == 1 && b[0] == 0xBF) {
return cmListFileLexer_BOM_UTF8;
n = fread(b + 2, 1, 1, f);
*readSize += n;
if (n == 1) {
if (b[2] == 0xBF) {
*readSize = 0; /* We consumed the BOM: discard it */
return cmListFileLexer_BOM_UTF8;
}
}
} else if (b[0] == 0xFE && b[1] == 0xFF) {
*readSize = 0; /* We consumed the BOM: discard it */
/* UTF-16 BE */
return cmListFileLexer_BOM_UTF16BE;
} else if (b[0] == 0 && b[1] == 0) {
if (fread(b, 1, 2, f) == 2 && b[0] == 0xFE && b[1] == 0xFF) {
return cmListFileLexer_BOM_UTF32BE;
n = fread(b + 2, 1, 2, f);
*readSize += n;
if (n == 2) {
if (b[2] == 0xFE && b[3] == 0xFF) {
*readSize = 0; /* We consumed the BOM: discard it */
return cmListFileLexer_BOM_UTF32BE;
}
}
} else if (b[0] == 0xFF && b[1] == 0xFE) {
fpos_t p;
fgetpos(f, &p);
n = fread(b, 1, 2, f);
if (n == 2 && b[0] == 0 && b[1] == 0) {
n = fread(b + 2, 1, 2, f);
*readSize += n;
if (n == 2 && b[2] == 0 && b[3] == 0) {
*readSize = 0; /* We consumed the BOM: discard it */
return cmListFileLexer_BOM_UTF32LE;
}
if (fsetpos(f, &p) != 0) {
return cmListFileLexer_BOM_Broken;
}
/* In case we were able to subsequently read only a single byte out of two
(i.e., three in total), the file must be corrupt and the BOM cannot
represent a UTF-16-LE BOM since each code unit must consist of two
bytes. This avoids incorrectly detecting an incomplete UTF-32-LE BOM as
UTF-16-LE input. */
if (n % 2 == 0) {
*readSize = n; /* We consumed the read bytes as BOM only partially */
memmove(b, b + 2, n);
return cmListFileLexer_BOM_UTF16LE;
}
}
}
if (fseek(f, 0, SEEK_SET) != 0) {
return cmListFileLexer_BOM_Broken;
}
return cmListFileLexer_BOM_None;
}
@@ -497,7 +543,13 @@ int cmListFileLexer_SetFileName(cmListFileLexer* lexer, const char* name,
#endif
if (lexer->file) {
if (bom) {
*bom = cmListFileLexer_ReadBOM(lexer->file);
*bom = cmListFileLexer_ReadBOM(
lexer->file, (unsigned char*)lexer->read_buffer, &lexer->read_size);
lexer->read_position = 0;
} else {
memset(lexer->read_buffer, 0, sizeof(lexer->read_buffer));
lexer->read_size = 0;
lexer->read_position = 0;
}
} else {
result = 0;
@@ -516,10 +568,15 @@ int cmListFileLexer_SetString(cmListFileLexer* lexer, char const* text,
/* text might be not NULL while length is 0. However, on some platforms
malloc(0) will return NULL. To avoid signaling an error to the caller in
such cases, ensure nonzero length. */
if (length > 0) {
lexer->string_buffer = (char*)malloc(length);
size_t read_size = lexer->read_size - lexer->read_position;
size_t string_size = read_size + length;
if (string_size > 0) {
lexer->string_buffer = (char*)malloc(string_size);
if (lexer->string_buffer) {
memcpy(lexer->string_buffer, text, length);
memcpy(lexer->string_buffer, lexer->read_buffer + lexer->read_position,
read_size);
memcpy(lexer->string_buffer + read_size, text, length);
lexer->read_position += read_size;
lexer->string_position = lexer->string_buffer;
lexer->string_left = length;
} else {

View File

@@ -126,13 +126,6 @@ bool cmListFileParser::ParseFile(char const* filename)
return false;
}
if (bom == cmListFileLexer_BOM_Broken) {
cmListFileLexer_SetFileName(this->Lexer.get(), nullptr, nullptr);
this->IssueFileOpenError("Error while reading Byte-Order-Mark. "
"File not seekable?");
return false;
}
// Verify the Byte-Order-Mark, if any.
if (bom != cmListFileLexer_BOM_None && bom != cmListFileLexer_BOM_UTF8) {
cmListFileLexer_SetFileName(this->Lexer.get(), nullptr, nullptr);

View File

@@ -40,7 +40,6 @@ struct cmListFileLexer_Token_s
enum cmListFileLexer_BOM_e
{
cmListFileLexer_BOM_None,
cmListFileLexer_BOM_Broken,
cmListFileLexer_BOM_UTF8,
cmListFileLexer_BOM_UTF16BE,
cmListFileLexer_BOM_UTF16LE,

View File

@@ -1100,15 +1100,19 @@ set(CMAKE_RELATIVE_PATH_TOP_BINARY \"${RunCMake_TEST_BINARY_DIR}\")
endfunction()
run_cmake_depends()
function(reject_fifo)
function(accept_fifo)
find_program(BASH_EXECUTABLE bash)
if(BASH_EXECUTABLE)
set(BASH_COMMAND_ARGUMENT "'${CMAKE_COMMAND}' -P <(echo 'return()')")
run_cmake_command(reject_fifo ${BASH_EXECUTABLE} -c ${BASH_COMMAND_ARGUMENT})
run_cmake_command(accept_fifo ${BASH_EXECUTABLE} -c ${BASH_COMMAND_ARGUMENT})
set(source_dir ${RunCMake_SOURCE_DIR}/Toolchain)
run_cmake_command(fifo_empty_initial_cache_process_substitution ${BASH_EXECUTABLE}
-c "\"${CMAKE_COMMAND}\" -C <(echo) -S \"${source_dir}\" -B \"${RunCMake_BINARY_DIR}/fifo-empty-initial-cache\"")
endif()
endfunction()
if(CMAKE_HOST_UNIX AND NOT CMAKE_SYSTEM_NAME STREQUAL "CYGWIN" AND NOT CMAKE_SYSTEM_NAME STREQUAL "MSYS")
reject_fifo()
accept_fifo()
run_cmake_command(closed_stdin sh -c "\"${CMAKE_COMMAND}\" --version <&-")
run_cmake_command(closed_stdout sh -c "\"${CMAKE_COMMAND}\" --version >&-")
run_cmake_command(closed_stderr sh -c "\"${CMAKE_COMMAND}\" --version 2>&-")

View File

@@ -0,0 +1 @@
0

View File

@@ -0,0 +1 @@
^$

View File

@@ -1,2 +0,0 @@
CMake Error in .*
Error while reading Byte-Order-Mark\. File not seekable\?

View File

@@ -1,3 +1,4 @@
CommandTabs.cmake whitespace=-tab-in-indent
StringCRLF.cmake eol=crlf
BracketCRLF.cmake eol=crlf
OneCharacter.cmake binary

View File

@@ -0,0 +1,4 @@
CMake Error at OneCharacter.cmake:1:
Unexpected end of file.
Parse error. Function missing opening "\(".

View File

@@ -0,0 +1 @@
a

View File

@@ -7,6 +7,7 @@ run_cmake(BOM-UTF-32-LE)
run_cmake(BOM-UTF-32-BE)
run_cmake(Broken-BOM-UTF-32-LE)
run_cmake(Broken-BOM-UTF-32-BE)
run_cmake(OneCharacter)
run_cmake(CommandSpaces)
run_cmake(CommandTabs)
run_cmake(CommandNewlines)