cmListFileLexer: Do not require files to be seekable

Read the BOM sequentially and keep the bytes that were read for later use if they do not correspond to a BOM. This allows a FIFO to be used as input, e.g., when piping input or using Bash process substitution.
2026-01-06 05:40:54 -06:00 · 2025-09-08 17:07:47 +02:00
parent 9a53a28596
commit 54161c70d5
13 changed files with 178 additions and 61 deletions
--- a/Source/LexerParser/cmListFileLexer.c
+++ b/Source/LexerParser/cmListFileLexer.c
@@ -773,6 +773,9 @@ struct cmListFileLexer_s
  size_t size;
  FILE* file;
  size_t cr;
+  char read_buffer[4];
+  size_t read_size;
+  size_t read_position;
  char* string_buffer;
  char* string_position;
  size_t string_left;
@@ -2626,9 +2629,26 @@ static int cmListFileLexerInput(cmListFileLexer* lexer, char* buffer,
         does not convert newlines on all platforms.  Move any
         trailing CR to the start of the buffer for the next read. */
      size_t cr = lexer->cr;
-      size_t n;
+      size_t n = 0;
      buffer[0] = '\r';
-      n = fread(buffer + cr, 1, bufferSize - cr, lexer->file);
+
+      size_t actualBufferSize = bufferSize - cr;
+      char* p = buffer + cr;
+      size_t readLeft = lexer->read_size - lexer->read_position;
+
+      /* Absorb the bytes that were read during BOM detection, if any. */
+      if (readLeft > 0) {
+        size_t actualReadSize =
+          actualBufferSize >= readLeft ? readLeft : actualBufferSize;
+        memcpy(p, lexer->read_buffer + lexer->read_position, actualReadSize);
+        lexer->read_position += actualReadSize;
+        p += actualReadSize;
+        n += actualReadSize;
+        actualBufferSize -= actualReadSize;
+      }
+
+      n += fread(p, 1, actualBufferSize, lexer->file);
+
      if (n) {
        char* o = buffer;
        const char* i = buffer;
@@ -2682,6 +2702,11 @@ static void cmListFileLexerDestroy(cmListFileLexer* lexer)
      fclose(lexer->file);
      lexer->file = 0;
    }
+    if (lexer->read_size != 0) {
+      memset(lexer->read_buffer, 0, sizeof(lexer->read_buffer));
+      lexer->read_size = 0;
+      lexer->read_position = 0;
+    }
    if (lexer->string_buffer) {
      free(lexer->string_buffer);
      lexer->string_buffer = 0;
@@ -2712,45 +2737,66 @@ void cmListFileLexer_Delete(cmListFileLexer* lexer)
 }

 /*--------------------------------------------------------------------------*/
-static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f)
+static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f,
+                                                   unsigned char readBuffer[4],
+                                                   size_t* readSize)
 {
-  unsigned char b[2];
-  size_t n;
-  if (fread(b, 1, 2, f) == 2) {
+  /* Read up to four bytes that might correspond to a BOM. If these bytes
+     turn out not to represent a BOM, keep them in readBuffer (with their
+     count in *readSize) for later consumption instead of seeking the file,
+     which might not be seekable (e.g., if it's a pipe). */
+  unsigned char* b = readBuffer;
+
+  size_t n = fread(b, 1, 2, f);
+  *readSize = n; /* Initialize first and then accumulate */
+
+  if (n == 2) {
    if (b[0] == 0xEF && b[1] == 0xBB) {
-      if (fread(b, 1, 1, f) == 1 && b[0] == 0xBF) {
-        return cmListFileLexer_BOM_UTF8;
+      n = fread(b + 2, 1, 1, f);
+      *readSize += n;
+
+      if (n == 1) {
+        if (b[2] == 0xBF) {
+          *readSize = 0; /* We consumed the BOM: discard it */
+          return cmListFileLexer_BOM_UTF8;
+        }
      }
    } else if (b[0] == 0xFE && b[1] == 0xFF) {
+      *readSize = 0; /* We consumed the BOM: discard it */
      /* UTF-16 BE */
      return cmListFileLexer_BOM_UTF16BE;
    } else if (b[0] == 0 && b[1] == 0) {
-      if (fread(b, 1, 2, f) == 2 && b[0] == 0xFE && b[1] == 0xFF) {
-        return cmListFileLexer_BOM_UTF32BE;
+      n = fread(b + 2, 1, 2, f);
+      *readSize += n;
+
+      if (n == 2) {
+        if (b[2] == 0xFE && b[3] == 0xFF) {
+          *readSize = 0; /* We consumed the BOM: discard it */
+          return cmListFileLexer_BOM_UTF32BE;
+        }
      }
    } else if (b[0] == 0xFF && b[1] == 0xFE) {
-      fpos_t p;
-      fgetpos(f, &p);
-      n = fread(b, 1, 2, f);
-      if (n == 2 && b[0] == 0 && b[1] == 0) {
+      n = fread(b + 2, 1, 2, f);
+      *readSize += n;
+
+      if (n == 2 && b[2] == 0 && b[3] == 0) {
+        *readSize = 0; /* We consumed the BOM: discard it */
        return cmListFileLexer_BOM_UTF32LE;
      }
-      if (fsetpos(f, &p) != 0) {
-        return cmListFileLexer_BOM_Broken;
-      }
+
      /* In case we were able to subsequently read only a single byte out of two
         (i.e., three in total), the file must be corrupt and the BOM cannot
         represent a UTF-16-LE BOM since each code unit must consist of two
         bytes. This avoids incorrectly detecting an incomplete UTF-32-LE BOM as
         UTF-16-LE input. */
      if (n % 2 == 0) {
+        *readSize = n; /* Keep the bytes that followed the 2-byte BOM */
+        memmove(b, b + 2, n);
        return cmListFileLexer_BOM_UTF16LE;
      }
    }
  }
-  if (fseek(f, 0, SEEK_SET) != 0) {
-    return cmListFileLexer_BOM_Broken;
-  }
+
  return cmListFileLexer_BOM_None;
 }

@@ -2770,7 +2816,13 @@ int cmListFileLexer_SetFileName(cmListFileLexer* lexer, const char* name,
 #endif
    if (lexer->file) {
      if (bom) {
-        *bom = cmListFileLexer_ReadBOM(lexer->file);
+        *bom = cmListFileLexer_ReadBOM(
+          lexer->file, (unsigned char*)lexer->read_buffer, &lexer->read_size);
+        lexer->read_position = 0;
+      } else {
+        memset(lexer->read_buffer, 0, sizeof(lexer->read_buffer));
+        lexer->read_size = 0;
+        lexer->read_position = 0;
      }
    } else {
      result = 0;
@@ -2789,10 +2841,15 @@ int cmListFileLexer_SetString(cmListFileLexer* lexer, char const* text,
  /* text might be not NULL while length is 0. However, on some platforms
     malloc(0) will return NULL. To avoid signaling an error to the caller in
     such cases, ensure nonzero length. */
-  if (length > 0) {
-    lexer->string_buffer = (char*)malloc(length);
+  size_t read_size = lexer->read_size - lexer->read_position;
+  size_t string_size = read_size + length;
+  if (string_size > 0) {
+    lexer->string_buffer = (char*)malloc(string_size);
    if (lexer->string_buffer) {
-      memcpy(lexer->string_buffer, text, length);
+      memcpy(lexer->string_buffer, lexer->read_buffer + lexer->read_position,
+             read_size);
+      memcpy(lexer->string_buffer + read_size, text, length);
+      lexer->read_position += read_size;
      lexer->string_position = lexer->string_buffer;
      lexer->string_left = length;
    } else {
--- a/Source/LexerParser/cmListFileLexer.in.l
+++ b/Source/LexerParser/cmListFileLexer.in.l
@@ -39,6 +39,9 @@ struct cmListFileLexer_s
  size_t size;
  FILE* file;
  size_t cr;
+  char read_buffer[4];
+  size_t read_size;
+  size_t read_position;
  char* string_buffer;
  char* string_position;
  size_t string_left;
@@ -353,9 +356,26 @@ static int cmListFileLexerInput(cmListFileLexer* lexer, char* buffer,
         does not convert newlines on all platforms.  Move any
         trailing CR to the start of the buffer for the next read. */
      size_t cr = lexer->cr;
-      size_t n;
+      size_t n = 0;
      buffer[0] = '\r';
-      n = fread(buffer + cr, 1, bufferSize - cr, lexer->file);
+
+      size_t actualBufferSize = bufferSize - cr;
+      char* p = buffer + cr;
+      size_t readLeft = lexer->read_size - lexer->read_position;
+
+      /* Absorb the bytes that were read during BOM detection, if any. */
+      if (readLeft > 0) {
+        size_t actualReadSize =
+          actualBufferSize >= readLeft ? readLeft : actualBufferSize;
+        memcpy(p, lexer->read_buffer + lexer->read_position, actualReadSize);
+        lexer->read_position += actualReadSize;
+        p += actualReadSize;
+        n += actualReadSize;
+        actualBufferSize -= actualReadSize;
+      }
+
+      n += fread(p, 1, actualBufferSize, lexer->file);
+
      if (n) {
        char* o = buffer;
        const char* i = buffer;
@@ -409,6 +429,11 @@ static void cmListFileLexerDestroy(cmListFileLexer* lexer)
      fclose(lexer->file);
      lexer->file = 0;
    }
+    if (lexer->read_size != 0) {
+      memset(lexer->read_buffer, 0, sizeof(lexer->read_buffer));
+      lexer->read_size = 0;
+      lexer->read_position = 0;
+    }
    if (lexer->string_buffer) {
      free(lexer->string_buffer);
      lexer->string_buffer = 0;
@@ -439,45 +464,66 @@ void cmListFileLexer_Delete(cmListFileLexer* lexer)
 }

 /*--------------------------------------------------------------------------*/
-static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f)
+static cmListFileLexer_BOM cmListFileLexer_ReadBOM(FILE* f,
+                                                   unsigned char readBuffer[4],
+                                                   size_t* readSize)
 {
-  unsigned char b[2];
-  size_t n;
-  if (fread(b, 1, 2, f) == 2) {
+  /* Read up to four bytes that might correspond to a BOM. If these bytes
+     turn out not to represent a BOM, keep them in readBuffer (with their
+     count in *readSize) for later consumption instead of seeking the file,
+     which might not be seekable (e.g., if it's a pipe). */
+  unsigned char* b = readBuffer;
+
+  size_t n = fread(b, 1, 2, f);
+  *readSize = n; /* Initialize first and then accumulate */
+
+  if (n == 2) {
    if (b[0] == 0xEF && b[1] == 0xBB) {
-      if (fread(b, 1, 1, f) == 1 && b[0] == 0xBF) {
-        return cmListFileLexer_BOM_UTF8;
+      n = fread(b + 2, 1, 1, f);
+      *readSize += n;
+
+      if (n == 1) {
+        if (b[2] == 0xBF) {
+          *readSize = 0; /* We consumed the BOM: discard it */
+          return cmListFileLexer_BOM_UTF8;
+        }
      }
    } else if (b[0] == 0xFE && b[1] == 0xFF) {
+      *readSize = 0; /* We consumed the BOM: discard it */
      /* UTF-16 BE */
      return cmListFileLexer_BOM_UTF16BE;
    } else if (b[0] == 0 && b[1] == 0) {
-      if (fread(b, 1, 2, f) == 2 && b[0] == 0xFE && b[1] == 0xFF) {
-        return cmListFileLexer_BOM_UTF32BE;
+      n = fread(b + 2, 1, 2, f);
+      *readSize += n;
+
+      if (n == 2) {
+        if (b[2] == 0xFE && b[3] == 0xFF) {
+          *readSize = 0; /* We consumed the BOM: discard it */
+          return cmListFileLexer_BOM_UTF32BE;
+        }
      }
    } else if (b[0] == 0xFF && b[1] == 0xFE) {
-      fpos_t p;
-      fgetpos(f, &p);
-      n = fread(b, 1, 2, f);
-      if (n == 2 && b[0] == 0 && b[1] == 0) {
+      n = fread(b + 2, 1, 2, f);
+      *readSize += n;
+
+      if (n == 2 && b[2] == 0 && b[3] == 0) {
+        *readSize = 0; /* We consumed the BOM: discard it */
        return cmListFileLexer_BOM_UTF32LE;
      }
-      if (fsetpos(f, &p) != 0) {
-        return cmListFileLexer_BOM_Broken;
-      }
+
      /* In case we were able to subsequently read only a single byte out of two
         (i.e., three in total), the file must be corrupt and the BOM cannot
         represent a UTF-16-LE BOM since each code unit must consist of two
         bytes. This avoids incorrectly detecting an incomplete UTF-32-LE BOM as
         UTF-16-LE input. */
      if (n % 2 == 0) {
+        *readSize = n; /* Keep the bytes that followed the 2-byte BOM */
+        memmove(b, b + 2, n);
        return cmListFileLexer_BOM_UTF16LE;
      }
    }
  }
-  if (fseek(f, 0, SEEK_SET) != 0) {
-    return cmListFileLexer_BOM_Broken;
-  }
+
  return cmListFileLexer_BOM_None;
 }

@@ -497,7 +543,13 @@ int cmListFileLexer_SetFileName(cmListFileLexer* lexer, const char* name,
 #endif
    if (lexer->file) {
      if (bom) {
-        *bom = cmListFileLexer_ReadBOM(lexer->file);
+        *bom = cmListFileLexer_ReadBOM(
+          lexer->file, (unsigned char*)lexer->read_buffer, &lexer->read_size);
+        lexer->read_position = 0;
+      } else {
+        memset(lexer->read_buffer, 0, sizeof(lexer->read_buffer));
+        lexer->read_size = 0;
+        lexer->read_position = 0;
      }
    } else {
      result = 0;
@@ -516,10 +568,15 @@ int cmListFileLexer_SetString(cmListFileLexer* lexer, char const* text,
  /* text might be not NULL while length is 0. However, on some platforms
     malloc(0) will return NULL. To avoid signaling an error to the caller in
     such cases, ensure nonzero length. */
-  if (length > 0) {
-    lexer->string_buffer = (char*)malloc(length);
+  size_t read_size = lexer->read_size - lexer->read_position;
+  size_t string_size = read_size + length;
+  if (string_size > 0) {
+    lexer->string_buffer = (char*)malloc(string_size);
    if (lexer->string_buffer) {
-      memcpy(lexer->string_buffer, text, length);
+      memcpy(lexer->string_buffer, lexer->read_buffer + lexer->read_position,
+             read_size);
+      memcpy(lexer->string_buffer + read_size, text, length);
+      lexer->read_position += read_size;
      lexer->string_position = lexer->string_buffer;
      lexer->string_left = length;
    } else {
--- a/Source/cmListFileCache.cxx
+++ b/Source/cmListFileCache.cxx
@@ -126,13 +126,6 @@ bool cmListFileParser::ParseFile(char const* filename)
    return false;
  }

-  if (bom == cmListFileLexer_BOM_Broken) {
-    cmListFileLexer_SetFileName(this->Lexer.get(), nullptr, nullptr);
-    this->IssueFileOpenError("Error while reading Byte-Order-Mark. "
-                             "File not seekable?");
-    return false;
-  }
-
  // Verify the Byte-Order-Mark, if any.
  if (bom != cmListFileLexer_BOM_None && bom != cmListFileLexer_BOM_UTF8) {
    cmListFileLexer_SetFileName(this->Lexer.get(), nullptr, nullptr);
--- a/Source/cmListFileLexer.h
+++ b/Source/cmListFileLexer.h
@@ -40,7 +40,6 @@ struct cmListFileLexer_Token_s
 enum cmListFileLexer_BOM_e
 {
  cmListFileLexer_BOM_None,
-  cmListFileLexer_BOM_Broken,
  cmListFileLexer_BOM_UTF8,
  cmListFileLexer_BOM_UTF16BE,
  cmListFileLexer_BOM_UTF16LE,
--- a/Tests/RunCMake/CommandLine/RunCMakeTest.cmake
+++ b/Tests/RunCMake/CommandLine/RunCMakeTest.cmake
@@ -1100,15 +1100,19 @@ set(CMAKE_RELATIVE_PATH_TOP_BINARY \"${RunCMake_TEST_BINARY_DIR}\")
 endfunction()
 run_cmake_depends()

-function(reject_fifo)
+function(accept_fifo)
  find_program(BASH_EXECUTABLE bash)
  if(BASH_EXECUTABLE)
    set(BASH_COMMAND_ARGUMENT "'${CMAKE_COMMAND}' -P <(echo 'return()')")
-    run_cmake_command(reject_fifo ${BASH_EXECUTABLE} -c ${BASH_COMMAND_ARGUMENT})
+    run_cmake_command(accept_fifo ${BASH_EXECUTABLE} -c ${BASH_COMMAND_ARGUMENT})
+
+    set(source_dir ${RunCMake_SOURCE_DIR}/Toolchain)
+    run_cmake_command(fifo_empty_initial_cache_process_substitution ${BASH_EXECUTABLE}
+      -c "\"${CMAKE_COMMAND}\" -C <(echo) -S \"${source_dir}\" -B \"${RunCMake_BINARY_DIR}/fifo-empty-initial-cache\"")
  endif()
 endfunction()
 if(CMAKE_HOST_UNIX AND NOT CMAKE_SYSTEM_NAME STREQUAL "CYGWIN" AND NOT CMAKE_SYSTEM_NAME STREQUAL "MSYS")
-  reject_fifo()
+  accept_fifo()
  run_cmake_command(closed_stdin  sh -c "\"${CMAKE_COMMAND}\" --version <&-")
  run_cmake_command(closed_stdout sh -c "\"${CMAKE_COMMAND}\" --version >&-")
  run_cmake_command(closed_stderr sh -c "\"${CMAKE_COMMAND}\" --version 2>&-")
--- a/Tests/RunCMake/CommandLine/accept_fifo-result.txt
+++ b/Tests/RunCMake/CommandLine/accept_fifo-result.txt
@@ -0,0 +1 @@
+0
--- a/Tests/RunCMake/CommandLine/accept_fifo-stderr.txt
+++ b/Tests/RunCMake/CommandLine/accept_fifo-stderr.txt
@@ -0,0 +1 @@
+^$
--- a/Tests/RunCMake/CommandLine/reject_fifo-stderr.txt
+++ b/Tests/RunCMake/CommandLine/reject_fifo-stderr.txt
@@ -1,2 +0,0 @@
-CMake Error in .*
-  Error while reading Byte-Order-Mark\.  File not seekable\?
--- a/Tests/RunCMake/Syntax/.gitattributes
+++ b/Tests/RunCMake/Syntax/.gitattributes
@@ -1,3 +1,4 @@
 CommandTabs.cmake   whitespace=-tab-in-indent
 StringCRLF.cmake    eol=crlf
 BracketCRLF.cmake   eol=crlf
+OneCharacter.cmake  binary
--- a/Tests/RunCMake/CommandLine/reject_fifo-result.txt
+++ b/Tests/RunCMake/CommandLine/reject_fifo-result.txt
--- a/Tests/RunCMake/Syntax/OneCharacter-stderr.txt
+++ b/Tests/RunCMake/Syntax/OneCharacter-stderr.txt
@@ -0,0 +1,4 @@
+CMake Error at OneCharacter.cmake:1:
+  Unexpected end of file.
+
+  Parse error.  Function missing opening "\(".
--- a/Tests/RunCMake/Syntax/OneCharacter.cmake
+++ b/Tests/RunCMake/Syntax/OneCharacter.cmake
@@ -0,0 +1 @@
+a
--- a/Tests/RunCMake/Syntax/RunCMakeTest.cmake
+++ b/Tests/RunCMake/Syntax/RunCMakeTest.cmake
@@ -7,6 +7,7 @@ run_cmake(BOM-UTF-32-LE)
 run_cmake(BOM-UTF-32-BE)
 run_cmake(Broken-BOM-UTF-32-LE)
 run_cmake(Broken-BOM-UTF-32-BE)
+run_cmake(OneCharacter)
 run_cmake(CommandSpaces)
 run_cmake(CommandTabs)
 run_cmake(CommandNewlines)