Truncate garbage bytes, don't null pad. More testing

Neil Macneale IV
2025-11-25 13:21:43 -08:00
parent 108a2f1d8e
commit b065275a17
5 changed files with 94 additions and 24 deletions

View File

@@ -22,6 +22,7 @@ import (
"fmt"
"io"
"math"
"os"
"time"
"github.com/dolthub/dolt/go/store/d"
@@ -372,6 +373,17 @@ func processJournalRecords(ctx context.Context, r io.ReadSeeker, off int64, cb f
		return 0, err
	}
	if f, ok := r.(*os.File); ok {
		err = f.Truncate(off)
		if err != nil {
			return 0, err
		}
		err = f.Sync()
		if err != nil {
			return 0, err
		}
	}
	return off, nil
}
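For context on the new block above: `(*os.File).Truncate(off)` keeps only the first `off` bytes of the file, and the following `Sync` flushes that size change to disk. A minimal standalone sketch of the pattern (illustrative only, not Dolt code; the file contents and offset here are made up):

```go
package main

import (
	"fmt"
	"log"
	"os"
)

func main() {
	// Create a throwaway file standing in for a journal.
	f, err := os.CreateTemp("", "journal-demo-")
	if err != nil {
		log.Fatal(err)
	}
	defer os.Remove(f.Name())
	defer f.Close()

	// Pretend the first 10 bytes are valid records and the rest is garbage padding.
	if _, err := f.Write(append([]byte("0123456789"), make([]byte, 4096)...)); err != nil {
		log.Fatal(err)
	}

	// Truncate keeps only the first lastValidOffset bytes; Sync flushes the change.
	const lastValidOffset = 10
	if err := f.Truncate(lastValidOffset); err != nil {
		log.Fatal(err)
	}
	if err := f.Sync(); err != nil {
		log.Fatal(err)
	}

	info, err := f.Stat()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(info.Size()) // prints 10
}
```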

View File

@@ -34,12 +34,6 @@ import (
)

const (
	// chunkJournalFileSize is the size we initialize the journal file to when it is first created. We
	// create a 16KB block of zero-initialized data and then sync the file to the first byte. We do this
	// to ensure that we can write to the journal file and that we have some space for initial records.
	// This probably isn't strictly necessary, but it also doesn't hurt.
	chunkJournalFileSize = 16 * 1024

	// journalWriterBuffSize is the size of the statically allocated buffer where journal records are
	// built before being written to the journal file on disk. There is not a hard limit on the size
	// of records specifically, some newer data chunking formats (i.e. optimized JSON storage) can
@@ -121,27 +115,9 @@ func createJournalWriter(ctx context.Context, path string) (wr *journalWriter, e
		return nil, err
	}

	// Open the journal file and initialize it with 16KB of zero bytes. This is intended to
	// ensure that we can write to the journal and to allocate space for the first set of
	// records, but probably isn't strictly necessary.
	if f, err = os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666); err != nil {
		return nil, err
	}
	const batch = 1024 * 1024
	b := make([]byte, batch)
	for i := 0; i < chunkJournalFileSize; i += batch {
		if _, err = f.Write(b); err != nil { // zero fill |f|
			return nil, err
		}
	}
	if err = f.Sync(); err != nil {
		return nil, err
	}
	if o, err := f.Seek(0, io.SeekStart); err != nil {
		return nil, err
	} else if o != 0 {
		return nil, fmt.Errorf("expected file journalOffset 0, got %d", o)
	}

	return &journalWriter{
		buf: make([]byte, 0, journalWriterBuffSize),
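With the zero pre-fill removed, the journal file presumably starts empty and grows only as records are appended. A minimal sketch of that assumption, reusing the `O_RDWR|O_CREATE|O_EXCL` flags from the surviving `os.OpenFile` call above (illustrative only, not Dolt's actual creation path; the path is hypothetical):

```go
package main

import (
	"fmt"
	"log"
	"os"
	"path/filepath"
)

func main() {
	dir, err := os.MkdirTemp("", "journal-demo-")
	if err != nil {
		log.Fatal(err)
	}
	defer os.RemoveAll(dir)

	// Hypothetical journal path; O_EXCL mirrors the surviving OpenFile call above.
	path := filepath.Join(dir, "journal.bin")
	f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// With the zero pre-fill gone, the file starts empty and grows only as
	// records are appended, so there is no null padding for recovery code
	// to classify as garbage later.
	info, err := f.Stat()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(info.Size()) // prints 0
}
```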

View File

@@ -35,3 +35,78 @@ teardown() {
[ -s ".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv" ]
[ -s ".dolt/noms/journal.idx" ]
}

file_size() {
    # mac and linux differ on stat args.
    if stat -c%s "$1" >/dev/null 2>&1; then
        stat -c%s "$1"
    else
        stat -f%z "$1"
    fi
}
@test "chunk-journal: verify null padding on journal file is truncated" {
local journal=".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv"
[ -f "$journal" ]
# capture original size
local orig_size=$(file_size "$journal")
# append 4K of null bytes. Historically, null bytes were found in journal files all the time.
dd if=/dev/zero bs=4096 count=1 2>/dev/null >>"$journal"
# sanity check size actually grew
local grown_size=$(file_size "$journal")
[ "$grown_size" -gt "$orig_size" ]
# This should truncate the journal.
dolt status
local final_size=$(file_size "$journal")
[ "$final_size" -eq "$orig_size" ]
}
@test "chunk-journal: verify garbage padding on journal file is truncated, reported." {
local journal=".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv"
[ -f "$journal" ]
local orig_size=$(file_size "$journal")
# append some bytes which are going to be parsed as an invalid chunk size of 5242881 (one byte too large)
printf '\x00\x50\x00\x01' >>"$journal"
# append 4K of null bytes. This will be ignored, but ensures that we have more than a root hash's worth of garbage.
dd if=/dev/zero bs=4096 count=1 2>/dev/null >>"$journal"
# This should truncate the journal.
run dolt status
[ "$status" -eq 0 ]
[[ "$output" =~ "invalid journal record length: 5242881 exceeds max allowed size of 5242880" ]] || false
local final_size=$(file_size "$journal")
[ "$final_size" -eq "$orig_size" ]
}
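For reference, the four bytes appended in the test above decode as a big-endian unsigned 32-bit length of 5242881, one byte past the 5 MiB limit quoted in the error message (assuming, as the message suggests, that the record length is the leading big-endian uint32 of a journal record):

```go
package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	// The four bytes appended in the test above, read as a big-endian uint32.
	hdr := []byte{0x00, 0x50, 0x00, 0x01}
	fmt.Println(binary.BigEndian.Uint32(hdr)) // 5242881
	fmt.Println(5 * 1024 * 1024)              // 5242880, the 5 MiB limit in the error message
}
```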
@test "chunk-journal: verify dataloss detection does not truncate the file." {
local journal=".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv"
[ -f "$journal" ]
# append some bytes which are going to be parsed as an invalid chunk size of 5242881 (one byte too large)
printf '\x00\x50\x00\x01' >>"$journal"
cat $BATS_CWD/corrupt_dbs/journal_data.bin >>"$journal"
local grown_size=$(file_size "$journal")
run dolt status
[ "$status" -eq 1 ]
[[ "$output" =~ "invalid journal record length: 5242881 exceeds max allowed size of 5242880" ]] || false
local final_size=$(file_size "$journal")
# file should be unchanged after mangling it to have data loss.
[ "$final_size" -eq "$grown_size" ]
}

View File

@@ -12,3 +12,10 @@ and perhaps other tools in the future. Please catalog the contents of each database
This database contains a journal file which has been altered so that one object (7i48kt4h41hcjniri7scv5m8a69cdn13)
has a bad CRC. The object in question doesn't trip any problems until it is fully loaded, which indicates that the CRC for the journal record is correct, but the data is not.
## journal_data.bin
This is a binary file containing a valid journal root hash record followed by a
chunk record (100 bytes of random data). It can be appended to a journal file to
assist in testing for data loss. We parse journal files until we can no longer
parse them, so to get into that state you append null bytes or random data to the
end of a journal file. That alone does not constitute data loss, which is why you
then append this file after the garbage: its bytes are parsable, so truncating at
the first unparsable byte would discard real records.
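A hypothetical sketch of the distinction described above: trailing bytes that never parse again are safe to truncate, but a parsable record found after the garbage means truncation would discard real data, so data loss is reported instead. The toy parser and the byte-by-byte forward scan here are stand-ins for illustration, not Dolt's journal format or implementation:

```go
package main

import (
	"errors"
	"fmt"
)

// tryParseRecord is a placeholder: it reports whether a valid record starts at
// off and, if so, how long it is. Dolt's real parser is format-specific.
type tryParseRecord func(buf []byte, off int) (length int, ok bool)

// classifyTail decides what to do with the bytes after the last valid record.
// If anything later in the tail still parses as a record, truncating would
// throw away real data, so report data loss instead.
func classifyTail(buf []byte, lastValid int, parse tryParseRecord) error {
	for off := lastValid; off < len(buf); off++ {
		if _, ok := parse(buf, off); ok {
			return errors.New("data loss: parsable records found after unparsable bytes")
		}
	}
	return nil // tail is pure garbage; safe to truncate to lastValid
}

func main() {
	// Toy format: a record is the single byte 0xAB.
	parse := func(buf []byte, off int) (int, bool) { return 1, buf[off] == 0xAB }

	pureGarbage := []byte{0xAB, 0xAB, 0x00, 0x00, 0x00}
	fmt.Println(classifyTail(pureGarbage, 2, parse)) // <nil>: safe to truncate

	lostData := []byte{0xAB, 0xAB, 0x00, 0x00, 0xAB}
	fmt.Println(classifyTail(lostData, 2, parse)) // data loss reported
}
```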

Binary file not shown.