mirror of
https://github.com/dolthub/dolt.git
synced 2026-05-13 03:10:03 -05:00
Archive Version 3. Now with more index.
This commit is contained in:
+18
-7
@@ -40,6 +40,8 @@ Format Version Differences:
|
||||
- Version 2: In addition to zStd compressed chunks, we also support Snappy compressed chunks, in the same format
|
||||
as Noms table files. Any Snappy compressed chunk will have a dictionary ID of 0, and the chunk data
|
||||
will be stored in the second Bytespan. It is stored with 32 bit CRC, just like Noms table files.
|
||||
- Version 3: In addition to the previous versions, we now support larger Indexes. The Index Length is now a Uint64,
|
||||
which expands the size of the footer by 4 bytes.
|
||||
|
||||
A Dolt Archive file follows the following format:
|
||||
+------------+------------+-----+------------+-------+----------+--------+
|
||||
@@ -50,7 +52,7 @@ In reverse order, since that's how we read it
|
||||
|
||||
Footer:
|
||||
+----------------------+-------------------------+----------------------+--------------------------+-----------------+------------------------+--------------------+
|
||||
| (Uint32) IndexLength | (Uint32) ByteSpan Count | (Uint32) Chunk Count | (Uint32) Metadata Length | (192) CheckSums | (Uint8) Format Version | (7) File Signature |
|
||||
| (Uint64) IndexLength | (Uint32) ByteSpan Count | (Uint32) Chunk Count | (Uint32) Metadata Length | (192) CheckSums | (Uint8) Format Version | (7) File Signature |
|
||||
+----------------------+-------------------------+----------------------+--------------------------+-----------------+------------------------+--------------------+
|
||||
- Index Length: The length of the Index in bytes.
|
||||
- ByteSpan Count: (N) The number of ByteSpans in the Archive. (does not include the null ByteSpan)
|
||||
@@ -62,6 +64,10 @@ Footer:
|
||||
- Format Version: Sequence starting at 1. Currently, 1, 2, and 3 are supported.
|
||||
- File Signature: Some would call this a magic number. Not on my watch. Dolt Archives have a 7 byte signature: "DOLTARC"
|
||||
|
||||
*** Note that the footer for versions 1 and 2 of archives is 4 bytes shorter. The IndexLength is a Uint32
|
||||
rather than a Uint64. This was expanded to support much larger Indexes in version 3. The way this is implemented
|
||||
is that we load the larger footer for all versions, but ignore the first 4 bytes for versions 1 and 2.
|
||||
|
||||
CheckSums:
|
||||
+----------------------------+-------------------+----------------------+
|
||||
| (64) Sha512 ByteSpan 1 - N | (64) Sha512 Index | (64) Sha512 Metadata |
|
||||
@@ -168,7 +174,7 @@ const (
|
||||
archiveFileSignature = "DOLTARC"
|
||||
archiveFileSigSize = uint64(len(archiveFileSignature))
|
||||
archiveCheckSumSize = sha512.Size * 3 // sha512 3 times.
|
||||
archiveFooterSize = uint32Size + // index length
|
||||
archiveFooterSize = uint64Size + // index length
|
||||
uint32Size + // byte span count
|
||||
uint32Size + // chunk count
|
||||
uint32Size + // metadataSpan length
|
||||
@@ -180,12 +186,16 @@ const (
|
||||
|
||||
/*
|
||||
+----------------------+-------------------------+----------------------+--------------------------+-----------------+------------------------+--------------------+
|
||||
| (Uint32) IndexLength | (Uint32) ByteSpan Count | (Uint32) Chunk Count | (Uint32) Metadata Length | (192) CheckSums | (Uint8) Format Version | (7) File Signature |
|
||||
| (Uint64) IndexLength | (Uint32) ByteSpan Count | (Uint32) Chunk Count | (Uint32) Metadata Length | (192) CheckSums | (Uint8) Format Version | (7) File Signature |
|
||||
+----------------------+-------------------------+----------------------+--------------------------+-----------------+------------------------+--------------------+
|
||||
|
||||
Note that all offsets are based on the footer total size determined by the version 3 format (archiveVersionGiantIndexSupport),
|
||||
which is the largest. Versions 1 and 2 have a smaller footer size, but the only special case offset is for the index
|
||||
length, which is at the start of the footer.
|
||||
*/
|
||||
const ( // afr = Archive FooteR
|
||||
afrIndexLenOffset = 0
|
||||
afrByteSpanOffset = afrIndexLenOffset + uint32Size
|
||||
afrByteSpanOffset = afrIndexLenOffset + uint64Size
|
||||
afrChunkCountOffset = afrByteSpanOffset + uint32Size
|
||||
afrMetaLenOffset = afrChunkCountOffset + uint32Size
|
||||
afrDataChkSumOffset = afrMetaLenOffset + uint32Size
|
||||
@@ -197,9 +207,10 @@ const ( // afr = Archive FooteR
|
||||
|
||||
// Archive Format Versions.
|
||||
const (
|
||||
archiveVersionInitial = uint8(1)
|
||||
archiveVersionSnappySupport = uint8(2)
|
||||
archiveFormatVersionMax = archiveVersionSnappySupport
|
||||
archiveVersionInitial = uint8(1)
|
||||
archiveVersionSnappySupport = uint8(2)
|
||||
archiveVersionGiantIndexSupport = uint8(3)
|
||||
archiveFormatVersionMax = archiveVersionGiantIndexSupport
|
||||
)
|
||||
|
||||
// Archive Metadata Data Keys are the fields in the archive metadata that are stored in the footer. These are used
|
||||
|
||||
@@ -46,7 +46,7 @@ type archiveReader struct {
|
||||
type suffix [hash.SuffixLen]byte
|
||||
|
||||
type archiveFooter struct {
|
||||
indexSize uint32
|
||||
indexSize uint64
|
||||
byteSpanCount uint32
|
||||
chunkCount uint32
|
||||
metadataSize uint32
|
||||
@@ -59,22 +59,32 @@ type archiveFooter struct {
|
||||
hash hash.Hash
|
||||
}
|
||||
|
||||
// actualFooterSize returns the footer size, in bytes for a specific archive. Due to the evolution of the archive format,
|
||||
// the footer size expanded in format version 3, so we need to calculate the footer size when calculating offsets
|
||||
// for this instance.
|
||||
func (f archiveFooter) actualFooterSize() uint64 {
|
||||
if f.formatVersion < archiveVersionGiantIndexSupport {
|
||||
// Version 1 and 2 archives have a smaller footer.
|
||||
return archiveFooterSize - 4
|
||||
}
|
||||
return archiveFooterSize
|
||||
}
|
||||
|
||||
// dataSpan returns the span of the data section of the archive. This is not generally used directly since we usually
|
||||
// read individual spans for each chunk.
|
||||
func (f archiveFooter) dataSpan() byteSpan {
|
||||
return byteSpan{offset: 0, length: f.fileSize - archiveFooterSize - uint64(f.metadataSize) - uint64(f.indexSize)}
|
||||
return byteSpan{offset: 0, length: f.fileSize - f.actualFooterSize() - uint64(f.metadataSize) - uint64(f.indexSize)}
|
||||
}
|
||||
|
||||
// totalIndexSpan returns the span of the entire index section of the archive. This span is not directly useful as
|
||||
// the index is broken into a compressed section and an uncompressed section. Use indexCompressedSpan and indexSuffixSpan
|
||||
// totalIndexSpan returns the span of the entire index section of the archive.
|
||||
func (f archiveFooter) totalIndexSpan() byteSpan {
|
||||
return byteSpan{offset: f.fileSize - archiveFooterSize - uint64(f.metadataSize) - uint64(f.indexSize), length: uint64(f.indexSize)}
|
||||
return byteSpan{offset: f.fileSize - f.actualFooterSize() - uint64(f.metadataSize) - uint64(f.indexSize), length: uint64(f.indexSize)}
|
||||
}
|
||||
|
||||
// indexByteOffsetSpan returns the span of the byte offsets section of the index. This is the first part of the index
|
||||
func (f archiveFooter) indexByteOffsetSpan() byteSpan {
|
||||
totalIdx := f.totalIndexSpan()
|
||||
return byteSpan{offset: totalIdx.offset, length: uint64(f.byteSpanCount * uint64Size)}
|
||||
return byteSpan{offset: totalIdx.offset, length: uint64(f.byteSpanCount) * uint64Size}
|
||||
}
|
||||
|
||||
// indexPrefixSpan returns the span of the prefix section of the index. This is the second part of the index.
|
||||
@@ -101,7 +111,7 @@ func (f archiveFooter) indexSuffixSpan() byteSpan {
|
||||
|
||||
// metadataSpan returns the span of the metadata section of the archive.
|
||||
func (f archiveFooter) metadataSpan() byteSpan {
|
||||
return byteSpan{offset: f.fileSize - archiveFooterSize - uint64(f.metadataSize), length: uint64(f.metadataSize)}
|
||||
return byteSpan{offset: f.fileSize - f.actualFooterSize() - uint64(f.metadataSize), length: uint64(f.metadataSize)}
|
||||
}
|
||||
|
||||
func newArchiveMetadata(ctx context.Context, reader tableReaderAt, fileSize uint64, stats *Stats) (*ArchiveMetadata, error) {
|
||||
@@ -187,9 +197,12 @@ func newArchiveReaderFromFooter(ctx context.Context, reader tableReaderAt, fileS
|
||||
func newArchiveReader(ctx context.Context, reader tableReaderAt, fileSize uint64, stats *Stats) (archiveReader, error) {
|
||||
footer, err := loadFooter(ctx, reader, fileSize, stats)
|
||||
if err != nil {
|
||||
return archiveReader{}, err
|
||||
return archiveReader{}, errors.New("Failed to loadFooter: " + err.Error())
|
||||
}
|
||||
|
||||
// s := fmt.Sprintf("Footer loaded: %v", footer)
|
||||
// return archiveReader{}, errors.New(s)
|
||||
|
||||
return buildArchiveReader(ctx, reader, footer, stats)
|
||||
}
|
||||
|
||||
@@ -200,7 +213,7 @@ func buildArchiveReader(ctx context.Context, reader tableReaderAt, footer archiv
|
||||
byteSpans[0] = 0 // Null byteSpan to simplify logic.
|
||||
err := binary.Read(secRdr, binary.BigEndian, byteSpans[1:])
|
||||
if err != nil {
|
||||
return archiveReader{}, err
|
||||
return archiveReader{}, errors.New("Failed to read byte spans: " + err.Error())
|
||||
}
|
||||
|
||||
prefixSpan := footer.indexPrefixSpan()
|
||||
@@ -208,15 +221,15 @@ func buildArchiveReader(ctx context.Context, reader tableReaderAt, footer archiv
|
||||
prefixes := make([]uint64, footer.chunkCount)
|
||||
err = binary.Read(prefixRdr, binary.BigEndian, prefixes[:])
|
||||
if err != nil {
|
||||
return archiveReader{}, err
|
||||
return archiveReader{}, errors.New("Failed to read prefixes: " + err.Error())
|
||||
}
|
||||
|
||||
chunkRefSpan := footer.indexChunkRefSpan()
|
||||
chunkRdr := newSectionReader(ctx, reader, int64(chunkRefSpan.offset), int64(chunkRefSpan.length), stats)
|
||||
chunks := make([]uint32, footer.chunkCount*2)
|
||||
err = binary.Read(chunkRdr, binary.BigEndian, chunks[:])
|
||||
chnks := make([]uint32, footer.chunkCount*2)
|
||||
err = binary.Read(chunkRdr, binary.BigEndian, chnks[:])
|
||||
if err != nil {
|
||||
return archiveReader{}, err
|
||||
return archiveReader{}, errors.New("Failed to read chunk references: " + err.Error())
|
||||
}
|
||||
|
||||
suffixSpan := footer.indexSuffixSpan()
|
||||
@@ -224,7 +237,7 @@ func buildArchiveReader(ctx context.Context, reader tableReaderAt, footer archiv
|
||||
suffixes := make([]byte, footer.chunkCount*hash.SuffixLen)
|
||||
_, err = io.ReadFull(sufRdr, suffixes)
|
||||
if err != nil {
|
||||
return archiveReader{}, err
|
||||
return archiveReader{}, errors.New("Failed to read suffixes: " + err.Error())
|
||||
}
|
||||
|
||||
dictCache, err := lru.New2Q[uint32, *DecompBundle](256)
|
||||
@@ -236,7 +249,7 @@ func buildArchiveReader(ctx context.Context, reader tableReaderAt, footer archiv
|
||||
reader: reader,
|
||||
prefixes: prefixes,
|
||||
spanIndex: byteSpans,
|
||||
chunkRefs: chunks,
|
||||
chunkRefs: chnks,
|
||||
suffixes: suffixes,
|
||||
footer: footer,
|
||||
dictCache: dictCache,
|
||||
@@ -289,17 +302,33 @@ func buildFooter(fileSize uint64, buf []byte) (f archiveFooter, err error) {
|
||||
f.formatVersion = buf[afrVersionOffset]
|
||||
f.fileSignature = string(buf[afrSigOffset:])
|
||||
// Verify File Signature
|
||||
if f.fileSignature != string(archiveFileSignature) {
|
||||
if f.fileSignature != archiveFileSignature {
|
||||
err = ErrInvalidFileSignature
|
||||
return
|
||||
}
|
||||
// Verify Format Version. 1 and 2 supported.
|
||||
// Verify Format Version. 1,2,3 supported.
|
||||
if f.formatVersion > archiveFormatVersionMax {
|
||||
err = ErrInvalidFormatVersion
|
||||
return
|
||||
}
|
||||
|
||||
f.indexSize = binary.BigEndian.Uint32(buf[afrIndexLenOffset : afrIndexChkSumOffset+uint32Size])
|
||||
smallFooter := false
|
||||
if f.formatVersion < archiveVersionGiantIndexSupport {
|
||||
smallFooter = true
|
||||
}
|
||||
|
||||
if smallFooter {
|
||||
// Version 1 and 2 archives have a smaller footer. Ignore the first 4 bytes.
|
||||
if afrIndexLenOffset != 0 {
|
||||
// Future proofing for the event where we need to extend the footer with additional fields. This is intended
|
||||
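// to blow up in development if we try to change it. NOTE(review): the check is on afrIndexLenOffset, but the panic message names afrIndexChkSumOffset - confirm which is intended.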
|
||||
panic("runtime error: afrIndexChkSumOffset must be 0.")
|
||||
}
|
||||
f.indexSize = uint64(binary.BigEndian.Uint32(buf[4 : 4+uint32Size]))
|
||||
} else {
|
||||
f.indexSize = binary.BigEndian.Uint64(buf[afrIndexLenOffset : afrIndexLenOffset+uint64Size])
|
||||
}
|
||||
|
||||
f.byteSpanCount = binary.BigEndian.Uint32(buf[afrByteSpanOffset : afrByteSpanOffset+uint32Size])
|
||||
f.chunkCount = binary.BigEndian.Uint32(buf[afrChunkCountOffset : afrChunkCountOffset+uint32Size])
|
||||
f.metadataSize = binary.BigEndian.Uint32(buf[afrMetaLenOffset : afrMetaLenOffset+uint32Size])
|
||||
@@ -310,6 +339,10 @@ func buildFooter(fileSize uint64, buf []byte) (f archiveFooter, err error) {
|
||||
|
||||
// calculate the hash of the footer. We don't currently verify that this is what was used to load the content.
|
||||
sha := sha512.New()
|
||||
if smallFooter {
|
||||
buf = buf[4:]
|
||||
}
|
||||
|
||||
sha.Write(buf)
|
||||
f.hash = hash.New(sha.Sum(nil)[:hash.ByteLen])
|
||||
|
||||
|
||||
@@ -60,7 +60,7 @@ type archiveWriter struct {
|
||||
stagedBytes stagedByteSpanSlice
|
||||
stagedChunks stagedChunkRefSlice
|
||||
seenChunks hash.HashSet
|
||||
indexLen uint32
|
||||
indexLen uint64
|
||||
metadataLen uint32
|
||||
dataCheckSum sha512Sum
|
||||
indexCheckSum sha512Sum
|
||||
@@ -289,7 +289,7 @@ func (aw *archiveWriter) writeIndex() error {
|
||||
aw.bytesWritten += hash.SuffixLen
|
||||
}
|
||||
|
||||
aw.indexLen = uint32(indexSize)
|
||||
aw.indexLen = indexSize
|
||||
aw.indexCheckSum = sha512Sum(aw.output.GetSum())
|
||||
aw.output.ResetHasher()
|
||||
aw.workflowStage = stageMetadata
|
||||
@@ -331,7 +331,7 @@ func (aw *archiveWriter) writeFooter() error {
|
||||
}
|
||||
|
||||
// Write out the index length
|
||||
err := aw.writeUint32(aw.indexLen)
|
||||
err := aw.writeUint64(aw.indexLen)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -409,10 +409,10 @@ mutations_and_gc_statement() {
|
||||
## This output indicates that the new content pushed to the remote all landed as zStd chunks
|
||||
## in an archive file. multiline regex - no quotes - to match this text:
|
||||
# Archive Metadata:
|
||||
# Format Version: 2
|
||||
# Format Version: 3
|
||||
# Snappy Chunk Count: 0 (bytes: 0)
|
||||
# ZStd Chunk Count: 1609
|
||||
[[ $output =~ Archive[[:space:]]Metadata:[[:space:]]*Format[[:space:]]Version:[[:space:]]2[[:space:]]*Snappy[[:space:]]Chunk[[:space:]]Count:[[:space:]]0.*ZStd[[:space:]]Chunk[[:space:]]Count:[[:space:]]1609 ]] || false
|
||||
[[ $output =~ Archive[[:space:]]Metadata:[[:space:]]*Format[[:space:]]Version:[[:space:]]3[[:space:]]*Snappy[[:space:]]Chunk[[:space:]]Count:[[:space:]]0.*ZStd[[:space:]]Chunk[[:space:]]Count:[[:space:]]1609 ]] || false
|
||||
}
|
||||
|
||||
@test "archive: small push remote with archive default produces archive with snappy chunks" {
|
||||
@@ -441,9 +441,9 @@ mutations_and_gc_statement() {
|
||||
## This output indicates that the new content pushed to the remote all landed as snappy chunks
|
||||
## in an archive file. multiline regex - no quotes - to match this text:
|
||||
# Archive Metadata:
|
||||
# Format Version: 2
|
||||
# Format Version: 3
|
||||
# Snappy Chunk Count: 9
|
||||
[[ $output =~ Archive[[:space:]]Metadata:[[:space:]]*Format[[:space:]]Version:[[:space:]]2[[:space:]]*Snappy[[:space:]]Chunk[[:space:]]Count:[[:space:]]9[[:space:]] ]] || false
|
||||
[[ $output =~ Archive[[:space:]]Metadata:[[:space:]]*Format[[:space:]]Version:[[:space:]]3[[:space:]]*Snappy[[:space:]]Chunk[[:space:]]Count:[[:space:]]9[[:space:]] ]] || false
|
||||
}
|
||||
|
||||
@test "archive: fetch into empty database with archive default" {
|
||||
@@ -467,10 +467,10 @@ mutations_and_gc_statement() {
|
||||
## the remote is all archive, the chunks end up as zStd as well.
|
||||
## multiline regex - no quotes - to match this text:
|
||||
# Archive Metadata:
|
||||
# Format Version: 2
|
||||
# Format Version: 3
|
||||
# Snappy Chunk Count: 0 (bytes: 0)
|
||||
# ZStd Chunk Count: 260
|
||||
[[ $output =~ Archive[[:space:]]Metadata:[[:space:]]*Format[[:space:]]Version:[[:space:]]2[[:space:]]*Snappy[[:space:]]Chunk[[:space:]]Count:[[:space:]]0.*ZStd[[:space:]]Chunk[[:space:]]Count:[[:space:]]260 ]] || false
|
||||
[[ $output =~ Archive[[:space:]]Metadata:[[:space:]]*Format[[:space:]]Version:[[:space:]]3[[:space:]]*Snappy[[:space:]]Chunk[[:space:]]Count:[[:space:]]0.*ZStd[[:space:]]Chunk[[:space:]]Count:[[:space:]]260 ]] || false
|
||||
|
||||
dolt fsck
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user