Truncate garbage bytes, don't null pad. More testing
@@ -22,6 +22,7 @@ import (
 	"fmt"
 	"io"
 	"math"
+	"os"
 	"time"

 	"github.com/dolthub/dolt/go/store/d"
@@ -372,6 +373,17 @@ func processJournalRecords(ctx context.Context, r io.ReadSeeker, off int64, cb f
 		return 0, err
 	}
+
+	if f, ok := r.(*os.File); ok {
+		err = f.Truncate(off)
+		if err != nil {
+			return 0, err
+		}
+		err = f.Sync()
+		if err != nil {
+			return 0, err
+		}
+	}

 	return off, nil
 }
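For context, here is the recovery step above in minimal, self-contained form. The helper name and the standalone file open are hypothetical (the real code runs inline on the journal's already-open *os.File inside processJournalRecords), but the Truncate-then-Sync sequence is the same: garbage past the last valid record is cut off rather than zero-padded over, and the new length is made durable before the journal is reused.

package main

import (
	"log"
	"os"
)

// truncateTrailingGarbage cuts a journal file back to validOff, the
// offset just past the last record that parsed successfully, then
// syncs so the truncation survives a crash.
func truncateTrailingGarbage(path string, validOff int64) error {
	f, err := os.OpenFile(path, os.O_RDWR, 0666)
	if err != nil {
		return err
	}
	defer f.Close()
	// Drop the garbage bytes instead of overwriting them with zeros.
	if err = f.Truncate(validOff); err != nil {
		return err
	}
	// Persist the new file length before any further writes.
	return f.Sync()
}

func main() {
	// Hypothetical usage: keep the first 16KB, drop everything after.
	if err := truncateTrailingGarbage("journal", 16*1024); err != nil {
		log.Fatal(err)
	}
}
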
@@ -34,12 +34,6 @@ import (
 )

 const (
-	// chunkJournalFileSize is the size we initialize the journal file to when it is first created. We
-	// create a 16KB block of zero-initialized data and then sync the file to the first byte. We do this
-	// to ensure that we can write to the journal file and that we have some space for initial records.
-	// This probably isn't strictly necessary, but it also doesn't hurt.
-	chunkJournalFileSize = 16 * 1024
-
 	// journalWriterBuffSize is the size of the statically allocated buffer where journal records are
 	// built before being written to the journal file on disk. There is not a hard limit on the size
 	// of records – specifically, some newer data chunking formats (i.e. optimized JSON storage) can
@@ -121,27 +115,9 @@ func createJournalWriter(ctx context.Context, path string) (wr *journalWriter, e
 		return nil, err
 	}

-	// Open the journal file and initialize it with 16KB of zero bytes. This is intended to
-	// ensure that we can write to the journal and to allocate space for the first set of
-	// records, but probably isn't strictly necessary.
 	if f, err = os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666); err != nil {
 		return nil, err
 	}
-	const batch = 1024 * 1024
-	b := make([]byte, batch)
-	for i := 0; i < chunkJournalFileSize; i += batch {
-		if _, err = f.Write(b); err != nil { // zero fill |f|
-			return nil, err
-		}
-	}
-	if err = f.Sync(); err != nil {
-		return nil, err
-	}
-	if o, err := f.Seek(0, io.SeekStart); err != nil {
-		return nil, err
-	} else if o != 0 {
-		return nil, fmt.Errorf("expected file journalOffset 0, got %d", o)
-	}

 	return &journalWriter{
 		buf: make([]byte, 0, journalWriterBuffSize),
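Worth noting about the removed initialization: batch is 1024 * 1024 = 1 MiB while chunkJournalFileSize is only 16 * 1024 = 16 KiB, so the loop's single iteration zero-filled a full 1 MiB rather than the 16KB its comment describes. That over-padding squares with the observation in the tests below that null bytes were historically found in journal files all the time.
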
@@ -35,3 +35,78 @@ teardown() {
     [ -s ".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv" ]
     [ -s ".dolt/noms/journal.idx" ]
 }
+
+file_size() {
+    # mac and linux differ on stat args.
+    if stat -c%s "$1" >/dev/null 2>&1; then
+        stat -c%s "$1"
+    else
+        stat -f%z "$1"
+    fi
+}
+
+
@test "chunk-journal: verify null padding on journal file is truncated" {
|
||||
local journal=".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv"
|
||||
[ -f "$journal" ]
|
||||
|
||||
# capture original size
|
||||
local orig_size=$(file_size "$journal")
|
||||
|
||||
# append 4K of null bytes. Historically, null bytes were found in journal files all the time.
|
||||
dd if=/dev/zero bs=4096 count=1 2>/dev/null >>"$journal"
|
||||
|
||||
# sanity check size actually grew
|
||||
local grown_size=$(file_size "$journal")
|
||||
[ "$grown_size" -gt "$orig_size" ]
|
||||
|
||||
# This should truncate the journal.
|
||||
dolt status
|
||||
|
||||
local final_size=$(file_size "$journal")
|
||||
|
||||
[ "$final_size" -eq "$orig_size" ]
|
||||
}
+
+@test "chunk-journal: verify garbage padding on journal file is truncated, reported." {
+    local journal=".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv"
+    [ -f "$journal" ]
+
+    local orig_size=$(file_size "$journal")
+
+    # append some bytes which are going to be parsed as an invalid chunk size of 5242881 (one byte too large)
+    printf '\x00\x50\x00\x01' >>"$journal"
+
+    # append 4K of null bytes. This will be ignored, but ensures that we have more than a root hash's worth of garbage.
+    dd if=/dev/zero bs=4096 count=1 2>/dev/null >>"$journal"
+
+    # This should truncate the journal.
+    run dolt status
+    [ "$status" -eq 0 ]
+    [[ "$output" =~ "invalid journal record length: 5242881 exceeds max allowed size of 5242880" ]] || false
+
+    local final_size=$(file_size "$journal")
+
+    [ "$final_size" -eq "$orig_size" ]
+}
+
+@test "chunk-journal: verify dataloss detection does not truncate the file." {
+    local journal=".dolt/noms/vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv"
+    [ -f "$journal" ]
+
+    # append some bytes which are going to be parsed as an invalid chunk size of 5242881 (one byte too large)
+    printf '\x00\x50\x00\x01' >>"$journal"
+
+    cat $BATS_CWD/corrupt_dbs/journal_data.bin >>"$journal"
+
+    local grown_size=$(file_size "$journal")
+
+    run dolt status
+    [ "$status" -eq 1 ]
+    [[ "$output" =~ "invalid journal record length: 5242881 exceeds max allowed size of 5242880" ]] || false
+
+    local final_size=$(file_size "$journal")
+
+    # file should be unchanged after mangling it to have data loss.
+    [ "$final_size" -eq "$grown_size" ]
+}
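For reference, the four bytes appended by printf in the two tests above decode as a big-endian uint32 of 5242881, one more than the 5 MiB (5242880) cap quoted in the error message. A quick sketch of the arithmetic, assuming only that the record length prefix is read big-endian, which the error output implies:

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	// The prefix the tests append: \x00\x50\x00\x01
	n := binary.BigEndian.Uint32([]byte{0x00, 0x50, 0x00, 0x01})
	fmt.Println(n)               // 5242881
	fmt.Println(5 * 1024 * 1024) // 5242880, the max allowed record length
}
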
@@ -12,3 +12,10 @@ and perhaps other tools in the future. Please catalog the contents of each datab
 This database contains a journal file which has been altered to have an object (7i48kt4h41hcjniri7scv5m8a69cdn13)
 which has a bad CRC. The object in question doesn't trip any problems until it's fully loaded, which indicates that the CRC for the journal record is correct, but the data is not.

+## journal_data.bin
+This is a binary file which constitutes a valid journal root hash record followed by a
+chunk record (100 bytes of random data). This file can be appended to a journal to assist in
+testing for data loss. We parse journal files until we can't parse them any longer, so you need
+to append null bytes or random data to the end of a journal file to get into that state. That
+alone does not constitute data loss, though, which is why you then append this file: these
+bytes are parsable.
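The dataloss-detection test above shows the intended usage: append an invalid length prefix followed by this file, and since the trailing bytes parse as a real record, the scanner reports the failure instead of treating the tail as padding and truncating it.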
BIN  integration-tests/bats/corrupt_dbs/journal_data.bin (Normal file)
Binary file not shown.