Truncate garbage bytes, don't null pad. More testing

Neil Macneale IV
2025-11-25 13:21:43 -08:00
parent 108a2f1d8e
commit b065275a17
5 changed files with 94 additions and 24 deletions

View File

@@ -22,6 +22,7 @@ import (
"fmt"
"io"
"math"
"os"
"time"
"github.com/dolthub/dolt/go/store/d"
@@ -372,6 +373,17 @@ func processJournalRecords(ctx context.Context, r io.ReadSeeker, off int64, cb f
		return 0, err
	}
	if f, ok := r.(*os.File); ok {
		err = f.Truncate(off)
		if err != nil {
			return 0, err
		}
		err = f.Sync()
		if err != nil {
			return 0, err
		}
	}
	return off, nil
}
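For context on the new block above: `(*os.File).Truncate(off)` keeps only the first `off` bytes of the file, and the following `Sync` flushes that size change to disk. A minimal standalone sketch of the pattern (illustrative only, not Dolt code; the file contents and offset here are made up):

```go
package main

import (
	"fmt"
	"log"
	"os"
)

func main() {
	// Create a throwaway file standing in for a journal.
	f, err := os.CreateTemp("", "journal-demo-")
	if err != nil {
		log.Fatal(err)
	}
	defer os.Remove(f.Name())
	defer f.Close()

	// Pretend the first 10 bytes are valid records and the rest is garbage padding.
	if _, err := f.Write(append([]byte("0123456789"), make([]byte, 4096)...)); err != nil {
		log.Fatal(err)
	}

	// Truncate keeps only the first lastValidOffset bytes; Sync flushes the change.
	const lastValidOffset = 10
	if err := f.Truncate(lastValidOffset); err != nil {
		log.Fatal(err)
	}
	if err := f.Sync(); err != nil {
		log.Fatal(err)
	}

	info, err := f.Stat()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(info.Size()) // prints 10
}
```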

View File

@@ -34,12 +34,6 @@ import (
)

const (
	// chunkJournalFileSize is the size we initialize the journal file to when it is first created. We
	// create a 16KB block of zero-initialized data and then sync the file to the first byte. We do this
	// to ensure that we can write to the journal file and that we have some space for initial records.
	// This probably isn't strictly necessary, but it also doesn't hurt.
	chunkJournalFileSize = 16 * 1024

	// journalWriterBuffSize is the size of the statically allocated buffer where journal records are
	// built before being written to the journal file on disk. There is not a hard limit on the size
	// of records specifically, some newer data chunking formats (i.e. optimized JSON storage) can
@@ -121,27 +115,9 @@ func createJournalWriter(ctx context.Context, path string) (wr *journalWriter, e
		return nil, err
	}

	// Open the journal file and initialize it with 16KB of zero bytes. This is intended to
	// ensure that we can write to the journal and to allocate space for the first set of
	// records, but probably isn't strictly necessary.
	if f, err = os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666); err != nil {
		return nil, err
	}
	const batch = 1024 * 1024
	b := make([]byte, batch)
	for i := 0; i < chunkJournalFileSize; i += batch {
		if _, err = f.Write(b); err != nil { // zero fill |f|
			return nil, err
		}
	}
	if err = f.Sync(); err != nil {
		return nil, err
	}
	if o, err := f.Seek(0, io.SeekStart); err != nil {
		return nil, err
	} else if o != 0 {
		return nil, fmt.Errorf("expected file journalOffset 0, got %d", o)
	}

	return &journalWriter{
		buf: make([]byte, 0, journalWriterBuffSize),
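With the zero pre-fill removed, the journal file presumably starts empty and grows only as records are appended. A minimal sketch of that assumption, reusing the `O_RDWR|O_CREATE|O_EXCL` flags from the surviving `os.OpenFile` call above (illustrative only, not Dolt's actual creation path; the path is hypothetical):

```go
package main

import (
	"fmt"
	"log"
	"os"
	"path/filepath"
)

func main() {
	dir, err := os.MkdirTemp("", "journal-demo-")
	if err != nil {
		log.Fatal(err)
	}
	defer os.RemoveAll(dir)

	// Hypothetical journal path; O_EXCL mirrors the surviving OpenFile call above.
	path := filepath.Join(dir, "journal.bin")
	f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// With the zero pre-fill gone, the file starts empty and grows only as
	// records are appended, so there is no null padding for recovery code
	// to classify as garbage later.
	info, err := f.Stat()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(info.Size()) // prints 0
}
```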

View File

@@ -35,3 +35,78 @@ teardown() {
[ -s ".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv" ]
[ -s ".dolt/noms/journal.idx" ]
}

file_size() {
    # mac and linux differ on stat args.
    if stat -c%s "$1" >/dev/null 2>&1; then
        stat -c%s "$1"
    else
        stat -f%z "$1"
    fi
}
@test "chunk-journal: verify null padding on journal file is truncated" {
local journal=".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv"
[ -f "$journal" ]
# capture original size
local orig_size=$(file_size "$journal")
# append 4K of null bytes. Historically, null bytes were found in journal files all the time.
dd if=/dev/zero bs=4096 count=1 2>/dev/null >>"$journal"
# sanity check size actually grew
local grown_size=$(file_size "$journal")
[ "$grown_size" -gt "$orig_size" ]
# This should truncate the journal.
dolt status
local final_size=$(file_size "$journal")
[ "$final_size" -eq "$orig_size" ]
}
@test "chunk-journal: verify garbage padding on journal file is truncated, reported." {
local journal=".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv"
[ -f "$journal" ]
local orig_size=$(file_size "$journal")
# append some bytes which are going to be parsed as an invalid chunk size of 5242881 (one byte too large)
printf '\x00\x50\x00\x01' >>"$journal"
# append 4K of null bytes. This will be ignored, but ensures that we have more than a root hash's worth of garbage.
dd if=/dev/zero bs=4096 count=1 2>/dev/null >>"$journal"
# This should truncate the journal.
run dolt status
[ "$status" -eq 0 ]
[[ "$output" =~ "invalid journal record length: 5242881 exceeds max allowed size of 5242880" ]] || false
local final_size=$(file_size "$journal")
[ "$final_size" -eq "$orig_size" ]
}
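For reference, the four bytes appended in the test above decode as a big-endian unsigned 32-bit length of 5242881, one byte past the 5 MiB limit quoted in the error message (assuming, as the message suggests, that the record length is the leading big-endian uint32 of a journal record):

```go
package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	// The four bytes appended in the test above, read as a big-endian uint32.
	hdr := []byte{0x00, 0x50, 0x00, 0x01}
	fmt.Println(binary.BigEndian.Uint32(hdr)) // 5242881
	fmt.Println(5 * 1024 * 1024)              // 5242880, the 5 MiB limit in the error message
}
```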
@test "chunk-journal: verify dataloss detection does not truncate the file." {
local journal=".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv"
[ -f "$journal" ]
# append some bytes which are going to be parsed as an invalid chunk size of 5242881 (one byte too large)
printf '\x00\x50\x00\x01' >>"$journal"
cat $BATS_CWD/corrupt_dbs/journal_data.bin >>"$journal"
local grown_size=$(file_size "$journal")
run dolt status
[ "$status" -eq 1 ]
[[ "$output" =~ "invalid journal record length: 5242881 exceeds max allowed size of 5242880" ]] || false
local final_size=$(file_size "$journal")
# file should be unchanged after mangling it to have data loss.
[ "$final_size" -eq "$grown_size" ]
}

View File

@@ -12,3 +12,10 @@ and perhaps other tools in the future. Please catalog the contents of each database
This database contains a journal file which has been altered so that one object (7i48kt4h41hcjniri7scv5m8a69cdn13)
has a bad CRC. The object in question doesn't trip any problems until it is fully loaded, which indicates that the CRC for the journal record is correct, but the data is not.
## journal_data.bin
This is a binary file containing a valid journal root hash record followed by a
chunk record (100 bytes of random data). It can be appended to a journal file to
assist in testing for data loss. We parse journal files until we can no longer
parse them, so to get into that state you append null bytes or random data to the
end of a journal file. That alone does not constitute data loss, which is why you
then append this file after the garbage: its bytes are parsable, so truncating at
the first unparsable byte would discard real records.
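A hypothetical sketch of the distinction described above: trailing bytes that never parse again are safe to truncate, but a parsable record found after the garbage means truncation would discard real data, so data loss is reported instead. The toy parser and the byte-by-byte forward scan here are stand-ins for illustration, not Dolt's journal format or implementation:

```go
package main

import (
	"errors"
	"fmt"
)

// tryParseRecord is a placeholder: it reports whether a valid record starts at
// off and, if so, how long it is. Dolt's real parser is format-specific.
type tryParseRecord func(buf []byte, off int) (length int, ok bool)

// classifyTail decides what to do with the bytes after the last valid record.
// If anything later in the tail still parses as a record, truncating would
// throw away real data, so report data loss instead.
func classifyTail(buf []byte, lastValid int, parse tryParseRecord) error {
	for off := lastValid; off < len(buf); off++ {
		if _, ok := parse(buf, off); ok {
			return errors.New("data loss: parsable records found after unparsable bytes")
		}
	}
	return nil // tail is pure garbage; safe to truncate to lastValid
}

func main() {
	// Toy format: a record is the single byte 0xAB.
	parse := func(buf []byte, off int) (int, bool) { return 1, buf[off] == 0xAB }

	pureGarbage := []byte{0xAB, 0xAB, 0x00, 0x00, 0x00}
	fmt.Println(classifyTail(pureGarbage, 2, parse)) // <nil>: safe to truncate

	lostData := []byte{0xAB, 0xAB, 0x00, 0x00, 0xAB}
	fmt.Println(classifyTail(lostData, 2, parse)) // data loss reported
}
```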

Binary file not shown.