From b065275a17bcaecb28aababc82195f9ec1793069 Mon Sep 17 00:00:00 2001
From: Neil Macneale IV
Date: Tue, 25 Nov 2025 13:21:43 -0800
Subject: [PATCH] Truncate garbage bytes, don't null pad. More testing

---
 go/store/nbs/journal_record.go               | 12 +++
 go/store/nbs/journal_writer.go               | 24 ------
 integration-tests/bats/chunk-journal.bats    | 75 ++++++++++++++++++
 integration-tests/bats/corrupt_dbs/README.md |  7 ++
 .../bats/corrupt_dbs/journal_data.bin        | Bin 0 -> 179 bytes
 5 files changed, 94 insertions(+), 24 deletions(-)
 create mode 100644 integration-tests/bats/corrupt_dbs/journal_data.bin

diff --git a/go/store/nbs/journal_record.go b/go/store/nbs/journal_record.go
index 19bf7dcc1d..30de99c917 100644
--- a/go/store/nbs/journal_record.go
+++ b/go/store/nbs/journal_record.go
@@ -22,6 +22,7 @@ import (
 	"fmt"
 	"io"
 	"math"
+	"os"
 	"time"
 
 	"github.com/dolthub/dolt/go/store/d"
@@ -372,6 +373,17 @@ func processJournalRecords(ctx context.Context, r io.ReadSeeker, off int64, cb f
 		return 0, err
 	}
 
+	if f, ok := r.(*os.File); ok {
+		err = f.Truncate(off)
+		if err != nil {
+			return 0, err
+		}
+		err = f.Sync()
+		if err != nil {
+			return 0, err
+		}
+	}
+
 	return off, nil
 }
 
diff --git a/go/store/nbs/journal_writer.go b/go/store/nbs/journal_writer.go
index 7ada32c9a1..a462d54e4b 100644
--- a/go/store/nbs/journal_writer.go
+++ b/go/store/nbs/journal_writer.go
@@ -34,12 +34,6 @@ import (
 )
 
 const (
-	// chunkJournalFileSize is the size we initialize the journal file to when it is first created. We
-	// create a 16KB block of zero-initialized data and then sync the file to the first byte. We do this
-	// to ensure that we can write to the journal file and that we have some space for initial records.
-	// This probably isn't strictly necessary, but it also doesn't hurt.
-	chunkJournalFileSize = 16 * 1024
-
 	// journalWriterBuffSize is the size of the statically allocated buffer where journal records are
 	// built before being written to the journal file on disk. There is not a hard limit on the size
 	// of records – specifically, some newer data chunking formats (i.e. optimized JSON storage) can
@@ -121,27 +115,9 @@ func createJournalWriter(ctx context.Context, path string) (wr *journalWriter, e
 		return nil, err
 	}
 
-	// Open the journal file and initialize it with 16KB of zero bytes. This is intended to
-	// ensure that we can write to the journal and to allocate space for the first set of
-	// records, but probably isn't strictly necessary.
 	if f, err = os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666); err != nil {
 		return nil, err
 	}
-	const batch = 1024 * 1024
-	b := make([]byte, batch)
-	for i := 0; i < chunkJournalFileSize; i += batch {
-		if _, err = f.Write(b); err != nil { // zero fill |f|
-			return nil, err
-		}
-	}
-	if err = f.Sync(); err != nil {
-		return nil, err
-	}
-	if o, err := f.Seek(0, io.SeekStart); err != nil {
-		return nil, err
-	} else if o != 0 {
-		return nil, fmt.Errorf("expected file journalOffset 0, got %d", o)
-	}
 
 	return &journalWriter{
 		buf: make([]byte, 0, journalWriterBuffSize),
diff --git a/integration-tests/bats/chunk-journal.bats b/integration-tests/bats/chunk-journal.bats
index 4ab022a88d..a3210a67c1 100644
--- a/integration-tests/bats/chunk-journal.bats
+++ b/integration-tests/bats/chunk-journal.bats
@@ -35,3 +35,78 @@ teardown() {
     [ -s ".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv" ]
     [ -s ".dolt/noms/journal.idx" ]
 }
+
+file_size() {
+    # macOS and Linux differ on stat's size arguments.
+    if stat -c%s "$1" >/dev/null 2>&1; then
+        stat -c%s "$1"
+    else
+        stat -f%z "$1"
+    fi
+}
+
+
+@test "chunk-journal: verify null padding on journal file is truncated" {
+    local journal=".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv"
+    [ -f "$journal" ]
+
+    # capture original size
+    local orig_size=$(file_size "$journal")
+
+    # append 4K of null bytes. Historically, journals were null padded, so trailing null bytes were common.
+    dd if=/dev/zero bs=4096 count=1 2>/dev/null >>"$journal"
+
+    # sanity check that the size actually grew
+    local grown_size=$(file_size "$journal")
+    [ "$grown_size" -gt "$orig_size" ]
+
+    # This should truncate the journal.
+    dolt status
+
+    local final_size=$(file_size "$journal")
+
+    [ "$final_size" -eq "$orig_size" ]
+}
+
+@test "chunk-journal: verify garbage padding on journal file is truncated and reported" {
+    local journal=".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv"
+    [ -f "$journal" ]
+
+    local orig_size=$(file_size "$journal")
+
+    # append bytes which parse as an invalid record length of 5242881 (one byte over the max)
+    printf '\x00\x50\x00\x01' >>"$journal"
+
+    # append 4K of null bytes. These are ignored, but ensure there is more than a root hash's worth of garbage.
+    dd if=/dev/zero bs=4096 count=1 2>/dev/null >>"$journal"
+
+    # This should truncate the journal.
+    run dolt status
+    [ "$status" -eq 0 ]
+    [[ "$output" =~ "invalid journal record length: 5242881 exceeds max allowed size of 5242880" ]] || false
+
+    local final_size=$(file_size "$journal")
+
+    [ "$final_size" -eq "$orig_size" ]
+}
+
+@test "chunk-journal: verify data loss detection does not truncate the file" {
+    local journal=".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv"
+    [ -f "$journal" ]
+
+    # append bytes which parse as an invalid record length of 5242881 (one byte over the max)
+    printf '\x00\x50\x00\x01' >>"$journal"
+
+    cat "$BATS_CWD/corrupt_dbs/journal_data.bin" >>"$journal"
+
+    local grown_size=$(file_size "$journal")
+
+    run dolt status
+    [ "$status" -eq 1 ]
+    [[ "$output" =~ "invalid journal record length: 5242881 exceeds max allowed size of 5242880" ]] || false
+
+    local final_size=$(file_size "$journal")
+
+    # file should be unchanged after mangling it to simulate data loss.
+    [ "$final_size" -eq "$grown_size" ]
+}
diff --git a/integration-tests/bats/corrupt_dbs/README.md b/integration-tests/bats/corrupt_dbs/README.md
index 420a6d1e00..2528b10671 100644
--- a/integration-tests/bats/corrupt_dbs/README.md
+++ b/integration-tests/bats/corrupt_dbs/README.md
@@ -12,3 +12,10 @@ and perhaps other tools in the future. Please catalog the contents of each datab
 This database contains a journal file which has been altered to have an object (7i48kt4h41hcjniri7scv5m8a69cdn13)
 which has a bad CRC. The object in question doesn't trip any problems until it's fully loaded, which indicates
 that the CRC for the journal record is correct, but the data is not.
+
+## journal_data.bin
+This binary file contains a valid journal root hash record followed by a chunk record (100
+bytes of random data); append it to a journal file to test data loss detection. Journal files
+are parsed until they can no longer be parsed, so appending null bytes or random data puts a
+journal into that state, but trailing unparsable bytes alone do not constitute data loss.
+Appending this file after such garbage does, because its bytes are parsable.
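The distinction the README draws is the crux of the last bats test above: trailing unparsable bytes are truncatable garbage, but parsable records appearing *after* unparsable bytes mean real records were lost. A minimal sketch of that scenario, assuming a freshly initialized dolt database and the journal path used by the tests above:

```bash
# Corrupt a journal so that valid records follow unparsable garbage (data loss).
journal=".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv"

printf '\x00\x50\x00\x01' >> "$journal"         # garbage: record length one byte over the max
cat corrupt_dbs/journal_data.bin >> "$journal"  # parsable records after the garbage

dolt status  # should fail with the record-length error, leaving the file untouched
```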
diff --git a/integration-tests/bats/corrupt_dbs/journal_data.bin b/integration-tests/bats/corrupt_dbs/journal_data.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bc012137a830d23b4040e82c45965cd8c01047f6
GIT binary patch
literal 179
zcmV;k08IY?001Zf0R#X5003zw8q)$m!+rh2v)&(8VWN>ro5BKg3Yd9Xgnm8%004^t
z0s^c=Y*JDem2F>Z1>INalV6#1TCxLV@MEe{wf(^=dLs5-tP%q*7TG4Kv2J}>G$*F|
zq`vRg;Pkx01=y
h{bQFI5g{qCn@Fimc_a%LRqrkuqq)qfHT
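Why `\x00\x50\x00\x01` reads as 5242881: the error message the tests expect implies the record length prefix is decoded as a big-endian uint32, and 0x00500001 is exactly one past the 5 MiB cap. A quick check of that arithmetic, assuming the big-endian encoding:

```bash
# Decode the four garbage bytes as a big-endian uint32 and compare to the 5 MiB max.
printf 'length = %d, max = %d\n' $((0x00500001)) $((5 * 1024 * 1024))
# prints: length = 5242881, max = 5242880
```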