From b065275a17bcaecb28aababc82195f9ec1793069 Mon Sep 17 00:00:00 2001
From: Neil Macneale IV
Date: Tue, 25 Nov 2025 13:21:43 -0800
Subject: [PATCH] Truncate garbage bytes, don't null pad. More testing

---
 go/store/nbs/journal_record.go               | 12 +++
 go/store/nbs/journal_writer.go               | 24 ------
 integration-tests/bats/chunk-journal.bats    | 75 ++++++++++++++++++
 integration-tests/bats/corrupt_dbs/README.md |  7 ++
 .../bats/corrupt_dbs/journal_data.bin        | Bin 0 -> 179 bytes
 5 files changed, 94 insertions(+), 24 deletions(-)
 create mode 100644 integration-tests/bats/corrupt_dbs/journal_data.bin

diff --git a/go/store/nbs/journal_record.go b/go/store/nbs/journal_record.go
index 19bf7dcc1d..30de99c917 100644
--- a/go/store/nbs/journal_record.go
+++ b/go/store/nbs/journal_record.go
@@ -22,6 +22,7 @@ import (
 	"fmt"
 	"io"
 	"math"
+	"os"
 	"time"
 
 	"github.com/dolthub/dolt/go/store/d"
@@ -372,6 +373,17 @@ func processJournalRecords(ctx context.Context, r io.ReadSeeker, off int64, cb f
 		return 0, err
 	}
 
+	if f, ok := r.(*os.File); ok {
+		err = f.Truncate(off)
+		if err != nil {
+			return 0, err
+		}
+		err = f.Sync()
+		if err != nil {
+			return 0, err
+		}
+	}
+
 	return off, nil
 }
 
diff --git a/go/store/nbs/journal_writer.go b/go/store/nbs/journal_writer.go
index 7ada32c9a1..a462d54e4b 100644
--- a/go/store/nbs/journal_writer.go
+++ b/go/store/nbs/journal_writer.go
@@ -34,12 +34,6 @@ import (
 )
 
 const (
-	// chunkJournalFileSize is the size we initialize the journal file to when it is first created. We
-	// create a 16KB block of zero-initialized data and then sync the file to the first byte. We do this
-	// to ensure that we can write to the journal file and that we have some space for initial records.
-	// This probably isn't strictly necessary, but it also doesn't hurt.
-	chunkJournalFileSize = 16 * 1024
-
 	// journalWriterBuffSize is the size of the statically allocated buffer where journal records are
 	// built before being written to the journal file on disk. There is not a hard limit on the size
 	// of records – specifically, some newer data chunking formats (i.e. optimized JSON storage) can
@@ -121,27 +115,9 @@ func createJournalWriter(ctx context.Context, path string) (wr *journalWriter, e
 		return nil, err
 	}
 
-	// Open the journal file and initialize it with 16KB of zero bytes. This is intended to
-	// ensure that we can write to the journal and to allocate space for the first set of
-	// records, but probably isn't strictly necessary.
 	if f, err = os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666); err != nil {
 		return nil, err
 	}
-	const batch = 1024 * 1024
-	b := make([]byte, batch)
-	for i := 0; i < chunkJournalFileSize; i += batch {
-		if _, err = f.Write(b); err != nil { // zero fill |f|
-			return nil, err
-		}
-	}
-	if err = f.Sync(); err != nil {
-		return nil, err
-	}
-	if o, err := f.Seek(0, io.SeekStart); err != nil {
-		return nil, err
-	} else if o != 0 {
-		return nil, fmt.Errorf("expected file journalOffset 0, got %d", o)
-	}
 
 	return &journalWriter{
 		buf: make([]byte, 0, journalWriterBuffSize),
diff --git a/integration-tests/bats/chunk-journal.bats b/integration-tests/bats/chunk-journal.bats
index 4ab022a88d..a3210a67c1 100644
--- a/integration-tests/bats/chunk-journal.bats
+++ b/integration-tests/bats/chunk-journal.bats
@@ -35,3 +35,78 @@ teardown() {
     [ -s ".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv" ]
     [ -s ".dolt/noms/journal.idx" ]
 }
+
+file_size() {
+    # macOS and Linux differ on stat's size arguments.
+    if stat -c%s "$1" >/dev/null 2>&1; then
+        stat -c%s "$1"
+    else
+        stat -f%z "$1"
+    fi
+}
+
+
+@test "chunk-journal: verify null padding on journal file is truncated" {
+    local journal=".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv"
+    [ -f "$journal" ]
+
+    # capture original size
+    local orig_size=$(file_size "$journal")
+
+    # append 4K of null bytes. Historically, journals were null padded, so trailing null bytes were common.
+    dd if=/dev/zero bs=4096 count=1 2>/dev/null >>"$journal"
+
+    # sanity check that the size actually grew
+    local grown_size=$(file_size "$journal")
+    [ "$grown_size" -gt "$orig_size" ]
+
+    # This should truncate the journal.
+    dolt status
+
+    local final_size=$(file_size "$journal")
+
+    [ "$final_size" -eq "$orig_size" ]
+}
+
+@test "chunk-journal: verify garbage padding on journal file is truncated and reported" {
+    local journal=".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv"
+    [ -f "$journal" ]
+
+    local orig_size=$(file_size "$journal")
+
+    # append bytes which parse as an invalid record length of 5242881 (one byte over the max)
+    printf '\x00\x50\x00\x01' >>"$journal"
+
+    # append 4K of null bytes. These are ignored, but ensure there is more than a root hash's worth of garbage.
+    dd if=/dev/zero bs=4096 count=1 2>/dev/null >>"$journal"
+
+    # This should truncate the journal.
+    run dolt status
+    [ "$status" -eq 0 ]
+    [[ "$output" =~ "invalid journal record length: 5242881 exceeds max allowed size of 5242880" ]] || false
+
+    local final_size=$(file_size "$journal")
+
+    [ "$final_size" -eq "$orig_size" ]
+}
+
+@test "chunk-journal: verify data loss detection does not truncate the file" {
+    local journal=".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv"
+    [ -f "$journal" ]
+
+    # append bytes which parse as an invalid record length of 5242881 (one byte over the max)
+    printf '\x00\x50\x00\x01' >>"$journal"
+
+    cat "$BATS_CWD/corrupt_dbs/journal_data.bin" >>"$journal"
+
+    local grown_size=$(file_size "$journal")
+
+    run dolt status
+    [ "$status" -eq 1 ]
+    [[ "$output" =~ "invalid journal record length: 5242881 exceeds max allowed size of 5242880" ]] || false
+
+    local final_size=$(file_size "$journal")
+
+    # file should be unchanged after mangling it to simulate data loss.
+    [ "$final_size" -eq "$grown_size" ]
+}
diff --git a/integration-tests/bats/corrupt_dbs/README.md b/integration-tests/bats/corrupt_dbs/README.md
index 420a6d1e00..2528b10671 100644
--- a/integration-tests/bats/corrupt_dbs/README.md
+++ b/integration-tests/bats/corrupt_dbs/README.md
@@ -12,3 +12,10 @@ and perhaps other tools in the future. Please catalog the contents of each datab
 This database contains a journal file which has been altered to have an object (7i48kt4h41hcjniri7scv5m8a69cdn13)
 which has a bad CRC. The object in question doesn't trip any problems until it's fully loaded, which indicates
 that the CRC for the journal record is correct, but the data is not.
+
+## journal_data.bin
+This binary file contains a valid journal root hash record followed by a chunk record (100
+bytes of random data); append it to a journal file to test data loss detection. Journal files
+are parsed until they can no longer be parsed, so appending null bytes or random data puts a
+journal into that state, but trailing unparsable bytes alone do not constitute data loss.
+Appending this file after such garbage does, because its bytes are parsable.
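The distinction the README draws is the crux of the last bats test above: trailing unparsable bytes are truncatable garbage, but parsable records appearing *after* unparsable bytes mean real records were lost. A minimal sketch of that scenario, assuming a freshly initialized dolt database and the journal path used by the tests above:

```bash
# Corrupt a journal so that valid records follow unparsable garbage (data loss).
journal=".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv"

printf '\x00\x50\x00\x01' >> "$journal"         # garbage: record length one byte over the max
cat corrupt_dbs/journal_data.bin >> "$journal"  # parsable records after the garbage

dolt status  # should fail with the record-length error, leaving the file untouched
```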
diff --git a/integration-tests/bats/corrupt_dbs/journal_data.bin b/integration-tests/bats/corrupt_dbs/journal_data.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bc012137a830d23b4040e82c45965cd8c01047f6
GIT binary patch
literal 179
zcmV;k08IY?001Zf0R#X5003zw8q)$m!+rh2v)&(8VWN>ro5BKg3Yd9Xgnm8%004^t
z0s^c=Y*JDem2F>Z1>INalV6#1TCxLV@MEe{wf(^=dLs5-tP%q*7TG4Kv2J}>G$*F|
zq`vRg;Pkx01=y
h{bQFI5g{qCn@Fimc_a%LRqrkuqq)qfHT
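Why `\x00\x50\x00\x01` reads as 5242881: the error message the tests expect implies the record length prefix is decoded as a big-endian uint32, and 0x00500001 is exactly one past the 5 MiB cap. A quick check of that arithmetic, assuming the big-endian encoding:

```bash
# Decode the four garbage bytes as a big-endian uint32 and compare to the 5 MiB max.
printf 'length = %d, max = %d\n' $((0x00500001)) $((5 * 1024 * 1024))
# prints: length = 5242881, max = 5242880
```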