Make metadata code aware of newgen archives

This commit is contained in:
Neil Macneale IV
2025-02-10 11:28:46 -08:00
parent 1f498f832c
commit f8330ff852
2 changed files with 112 additions and 70 deletions

View File

@@ -15,6 +15,7 @@
package nbs
import (
"errors"
"os"
"path/filepath"
@@ -22,34 +23,38 @@ import (
"github.com/dolthub/dolt/go/store/hash"
)
type StorageType int
const (
Journal StorageType = iota
TableFileNewGen
TableFileOldGen
Archive
)
// ArchiveMetadata holds extra information about an archive file. A StorageArtifact carries it in its
// arcMetadata field when the artifact is an archive.
type ArchiveMetadata struct {
// originalTableFileId is a hash in string form. RevertMap parses it with hash.Parse and uses the result
// as the value that the archive artifact's id maps to.
originalTableFileId string
}
// TableFileFormat identifies the on-disk format of the file behind a StorageArtifact.
type TableFileFormat int
const (
// TypeNoms is the zero value. buildArtifact assigns it when the file exists under the table spec's bare name.
TypeNoms TableFileFormat = iota
// TypeArchive is assigned by buildArtifact when the file is found under the name plus ArchiveFileSuffix.
TypeArchive
)
type StorageArtifact struct {
id hash.Hash
path string
storageType StorageType
// ID of the storage artifact. This is used in the manifest to identify the artifact, but it is not the file name,
// as archives have a suffix.
id hash.Hash
// path to the storage artifact.
path string
// storageType is the type of the storage artifact.
storageType TableFileFormat
// arcMetadata is additional metadata for archive files. it is only set for storageType == TypeArchive.
arcMetadata *ArchiveMetadata
}
// StorageMetadata describes the storage artifacts found for a single storage root. GetStorageMetadata
// builds it from the table specs listed in the manifest files under that root.
type StorageMetadata struct {
// root is the path to storage. Specifically, it contains a .dolt directory.
root string
// artifacts holds one entry per table spec that GetStorageMetadata read from the manifests.
artifacts []StorageArtifact
}
func (sm *StorageMetadata) ArchiveFilesPresent() bool {
for _, artifact := range sm.artifacts {
if artifact.storageType == Archive {
if artifact.storageType == TypeArchive {
return true
}
}
@@ -60,7 +65,7 @@ func (sm *StorageMetadata) ArchiveFilesPresent() bool {
func (sm *StorageMetadata) RevertMap() map[hash.Hash]hash.Hash {
revertMap := make(map[hash.Hash]hash.Hash)
for _, artifact := range sm.artifacts {
if artifact.storageType == Archive {
if artifact.storageType == TypeArchive {
md := artifact.arcMetadata
revertMap[artifact.id] = hash.Parse(md.originalTableFileId)
}
@@ -68,6 +73,8 @@ func (sm *StorageMetadata) RevertMap() map[hash.Hash]hash.Hash {
return revertMap
}
// oldGenTableExists returns true if the table file exists in the oldgen directory. This is a file system check for
// a table file we have no record of, but may be useful in the process of reverting an archive operation.
func (sm *StorageMetadata) oldGenTableExists(id hash.Hash) (bool, error) {
path := filepath.Join(sm.root, ".dolt", "noms", "oldgen", id.String())
_, err := os.Stat(path)
@@ -88,20 +95,9 @@ func GetStorageMetadata(path string) (StorageMetadata, error) {
return StorageMetadata{}, err
}
// TODO: new gen and journal information in storage metadata will be useful in the future.
// newGen := filepath.Join(path, ".dolt", "noms")
// newgenManifest := filepath.Join(newGen, "manifest")
oldgen := filepath.Join(path, ".dolt", "noms", "oldgen")
oldgenManifest := filepath.Join(oldgen, "manifest")
// If there is not oldgen manifest, then GC has never been run. Which is fine. We just don't have any oldgen.
if _, err := os.Stat(oldgenManifest); err != nil {
return StorageMetadata{}, nil
}
// create a io.Reader for the manifest file
manifestReader, err := os.Open(oldgenManifest)
newGen := filepath.Join(path, ".dolt", "noms")
newgenManifest := filepath.Join(newGen, "manifest")
manifestReader, err := os.Open(newgenManifest)
if err != nil {
return StorageMetadata{}, err
}
@@ -116,53 +112,90 @@ func GetStorageMetadata(path string) (StorageMetadata, error) {
// for each table in the manifest, get the table spec
for i := 0; i < manifest.NumTableSpecs(); i++ {
tableSpecInfo := manifest.GetTableSpecInfo(i)
// If the oldgen/name exists, it's not an archive. If it exists with a .darc suffix, then it's an archive.
tfName := tableSpecInfo.GetName()
fullPath := filepath.Join(oldgen, tfName)
_, err := os.Stat(fullPath)
if err == nil {
// exists. Not an archive.
artifacts = append(artifacts, StorageArtifact{
id: hash.Parse(tfName),
path: fullPath,
storageType: TableFileOldGen,
})
} else if os.IsNotExist(err) {
arcName := tfName + ".darc"
arcPath := filepath.Join(oldgen, arcName)
_, err := os.Stat(arcPath)
if err == nil {
// reader for the path. State. call
reader, fileSize, err := openReader(arcPath)
if err != nil {
return StorageMetadata{}, err
}
arcMetadata, err := newArchiveMetadata(reader, fileSize)
if err != nil {
return StorageMetadata{}, err
}
artifacts = append(artifacts, StorageArtifact{
id: hash.Parse(tfName),
path: arcPath,
storageType: Archive,
arcMetadata: arcMetadata,
})
} else {
// any error is bad here. If the files don't exist, then the manifest is no good.
return StorageMetadata{}, err
}
} else {
// some other error.
artifact, err := buildArtifact(tableSpecInfo, newGen)
if err != nil {
return StorageMetadata{}, err
}
artifacts = append(artifacts, artifact)
}
oldgen := filepath.Join(newGen, "oldgen")
oldgenManifest := filepath.Join(oldgen, "manifest")
// If there is no oldgen manifest, then GC has never been run. Which is fine. We just don't have any oldgen.
if _, err := os.Stat(oldgenManifest); err != nil {
return StorageMetadata{path, artifacts}, nil
}
manifestReader, err = os.Open(oldgenManifest)
if err != nil {
return StorageMetadata{}, err
}
manifest, err = ParseManifest(manifestReader)
if err != nil {
return StorageMetadata{}, err
}
for i := 0; i < manifest.NumTableSpecs(); i++ {
tableSpecInfo := manifest.GetTableSpecInfo(i)
artifact, err := buildArtifact(tableSpecInfo, oldgen)
if err != nil {
return StorageMetadata{}, err
}
artifacts = append(artifacts, artifact)
}
return StorageMetadata{path, artifacts}, nil
}
// buildArtifact resolves the table spec described by info to a file inside genPath and returns the
// StorageArtifact describing it.
//
// The file is looked up first under the spec's bare name; if present, a TypeNoms artifact is returned.
// If that name does not exist, the name with ArchiveFileSuffix appended is tried instead, and on success
// the archive is opened so its metadata can be attached to a TypeArchive artifact. Any stat error other
// than "not exist" on the bare name, and any error at all on the archive path, is returned unchanged.
//
// Note carried over from the original author: this code is going to be removed as soon as backup
// supports archives.
func buildArtifact(info TableSpecInfo, genPath string) (StorageArtifact, error) {
	name := info.GetName()

	// Plain table file: present under the manifest name itself.
	nomsPath := filepath.Join(genPath, name)
	_, statErr := os.Stat(nomsPath)
	if statErr == nil {
		return StorageArtifact{
			id:          hash.Parse(name),
			path:        nomsPath,
			storageType: TypeNoms,
		}, nil
	}
	if !errors.Is(statErr, os.ErrNotExist) {
		return StorageArtifact{}, statErr
	}

	// Not found under the bare name, so the only remaining valid option is an archive file.
	arcPath := filepath.Join(genPath, name+ArchiveFileSuffix)
	if _, err := os.Stat(arcPath); err != nil {
		return StorageArtifact{}, err
	}

	rdr, size, err := openReader(arcPath)
	if err != nil {
		return StorageArtifact{}, err
	}

	md, err := newArchiveMetadata(rdr, size)
	if err != nil {
		return StorageArtifact{}, err
	}

	return StorageArtifact{
		id:          hash.Parse(name),
		path:        arcPath,
		storageType: TypeArchive,
		arcMetadata: md,
	}, nil
}
func validateDir(path string) error {
info, err := os.Stat(path)

View File

@@ -183,6 +183,16 @@ mutations_and_gc_statement() {
run dolt sql -q 'select sum(i) from tbl;'
[[ "$status" -eq 0 ]] || false
[[ "$output" =~ "138075" ]] || false # i = 1 - 525, sum is 138075
## Temporary check. We want to ensure that backup will give an error, even when
## there are archives in newgen.
mkdir ../backup
dolt backup add bac1 file://../backup
run dolt backup sync bac1
[ "$status" -eq 1 ]
[[ "$output" =~ "error: archive files present" ]] || false
}
@test "archive: can clone respiratory with mixed types" {
@@ -235,7 +245,7 @@ mutations_and_gc_statement() {
dolt fetch
## update the remote repo directly. Need to run the archive command when the server is stopped.
## This will result in achived files on the remote, which we will need to read chunks from when we fetch.
## This will result in archived files on the remote, which we will need to read chunks from when we fetch.
cd ../../remote
kill $remotesrv_pid
wait $remotesrv_pid || :
@@ -248,7 +258,6 @@ mutations_and_gc_statement() {
[[ "$remotesrv_pid" -gt 0 ]] || false
cd ../cloned/repo1
run dolt fetch
[ "$status" -eq 0 ]