From f7f74662571abe2e70355c0b6f5dd7f787be3c45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Mon, 9 Feb 2026 12:35:20 -0800 Subject: [PATCH 01/28] .gitignore: update --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 1e6b24365d..251f66cd75 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,5 @@ CLAUDE.md .beads .gitattributes +.de/ +AGENTS.md From 78d1391ef25ab465e220f7e0f8a26083306a4466 Mon Sep 17 00:00:00 2001 From: Aaron Son Date: Mon, 9 Feb 2026 12:44:06 -0800 Subject: [PATCH 02/28] go: sqle/dsess: transactions.go: When serializing transaction commits against a working set, form the key with the normalized db name. Previously, this lock would accidentally allow concurrent access to writing the database working set value because a non-normalized database name like `db/main\x00/refs/heads/main` would allow access along with a normalized database name like `db\x00/refs/heads/main`. This did not impact correctness, since the working sets are safe for concurrent modification at the storage layer, but it could cause transient failures for a client if the optimistic lock retries failed sequentially enough times. Here we fix the bug so that the txLocks serialize access to the ref heads as expected. --- .../concurrent_writes_test.go | 151 ++++++++++++++++++ .../go-sql-server-driver/repro_10331_test.go | 10 -- 2 files changed, 151 insertions(+), 10 deletions(-) create mode 100644 integration-tests/go-sql-server-driver/concurrent_writes_test.go diff --git a/integration-tests/go-sql-server-driver/concurrent_writes_test.go b/integration-tests/go-sql-server-driver/concurrent_writes_test.go new file mode 100644 index 0000000000..733635cc08 --- /dev/null +++ b/integration-tests/go-sql-server-driver/concurrent_writes_test.go @@ -0,0 +1,151 @@ +// Copyright 2026 Dolthub, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "fmt" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/require" + "golang.org/x/sync/errgroup" + + driver "github.com/dolthub/dolt/go/libraries/doltcore/dtestutils/sql_server_driver" +) + +// TestConcurrentWrites runs many concurrent writers that insert and DOLT_COMMIT against the same branch, exercising the txLocks in sqle/dsess transactions.go that serialize commits against a working set. +func TestConcurrentWrites(t *testing.T) { + t.Parallel() + var ports DynamicResources + ports.global = &GlobalPorts + ports.t = t + u, err := driver.NewDoltUser() + require.NoError(t, err) + t.Cleanup(func() { + u.Cleanup() + }) + + rs, err := u.MakeRepoStore() + require.NoError(t, err) + + repo, err := rs.MakeRepo("concurrent_writes_test") + require.NoError(t, err) + + srvSettings := &driver.Server{ + Args: []string{"-P", `{{get_port "server_port"}}`}, + DynamicPort: "server_port", + } + server := MakeServer(t, repo, srvSettings, &ports) + server.DBName = "concurrent_writes_test" + + db, err := server.DB(driver.Connection{User: "root"}) + require.NoError(t, err) + db.SetMaxIdleConns(0) + defer func() { + require.NoError(t, db.Close()) + }() + ctx := t.Context() + func() { + conn, err := db.Conn(ctx) + require.NoError(t, err) + defer conn.Close() + // Create table and initial data. 
+ _, err = conn.ExecContext(ctx, "CREATE TABLE data (id VARCHAR(64) PRIMARY KEY, worker INT, data TEXT, created_at TIMESTAMP)") + require.NoError(t, err) + _, err = conn.ExecContext(ctx, "CALL DOLT_COMMIT('-Am', 'init with table')") + require.NoError(t, err) + }() + + eg, ctx := errgroup.WithContext(ctx) + start := time.Now() + + nextInt := uint32(0) + const numWriters = 32 + const testDuration = 8 * time.Second + startCh := make(chan struct{}) + for i := range numWriters { + eg.Go(func() error { + select { + case <-startCh: + case <-ctx.Done(): + return nil + } + db, err := server.DB(driver.Connection{User: "root"}) + require.NoError(t, err) + db.SetMaxOpenConns(1) + conn, err := db.Conn(ctx) + if err != nil { + return err + } + defer conn.Close() + j := 0 + for { + if time.Since(start) > testDuration { + return nil + } + if ctx.Err() != nil { + return nil + } + key := fmt.Sprintf("main-%d-%d", i, j) + _, err := conn.ExecContext(ctx, "INSERT INTO data VALUES (?,?,?,?)", key, i, key, time.Now()) + if err != nil { + return err + } + atomic.AddUint32(&nextInt, 1) + _, err = conn.ExecContext(ctx, fmt.Sprintf("CALL DOLT_COMMIT('-Am', 'insert %s')", key)) + if err != nil { + return err + } + j += 1 + } + }) + } + time.Sleep(500 * time.Millisecond) + close(startCh) + require.NoError(t, eg.Wait()) + t.Logf("wrote %d", nextInt) + ctx = t.Context() + conn, err := db.Conn(ctx) + if err != nil { + require.NoError(t, err) + } + defer func () { + require.NoError(t, conn.Close()) + }() + rows, err := conn.QueryContext(ctx, "SELECT COUNT(*) FROM data") + if err != nil { + require.NoError(t, err) + } + var i int + for rows.Next() { + err = rows.Scan(&i) + require.NoError(t, err) + } + require.NoError(t, rows.Err()) + require.NoError(t, rows.Close()) + t.Logf("read %d", i) + rows, err = conn.QueryContext(ctx, "SELECT COUNT(*) FROM dolt_log") + if err != nil { + require.NoError(t, err) + } + for rows.Next() { + err = rows.Scan(&i) + require.NoError(t, err) + } + require.NoError(t, 
rows.Err()) + require.NoError(t, rows.Close()) + t.Logf("created %d commits", i) +} diff --git a/integration-tests/go-sql-server-driver/repro_10331_test.go b/integration-tests/go-sql-server-driver/repro_10331_test.go index 516095c96a..04809822ce 100644 --- a/integration-tests/go-sql-server-driver/repro_10331_test.go +++ b/integration-tests/go-sql-server-driver/repro_10331_test.go @@ -15,11 +15,6 @@ package main import ( - // "context" - // "database/sql" - // sqldriver "database/sql/driver" - // "fmt" - // "strings" "crypto/rand" "encoding/base64" "fmt" @@ -29,14 +24,9 @@ import ( "testing" "time" - // "time" - - // "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "golang.org/x/sync/errgroup" - // "golang.org/x/sync/errgroup" - driver "github.com/dolthub/dolt/go/libraries/doltcore/dtestutils/sql_server_driver" ) From 337aee528f2a4358bc11aabf75ce8f4c3ab26e36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Wed, 4 Feb 2026 16:11:25 -0800 Subject: [PATCH 03/28] /go/store/blobstore: wip implementing concatenate --- go/store/blobstore/git_blobstore.go | 162 ++++++++++++++++++++++- go/store/blobstore/git_blobstore_test.go | 157 ++++++++++++++++++++++ 2 files changed, 312 insertions(+), 7 deletions(-) diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index a7ba33f27f..0aef870b5d 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -15,6 +15,7 @@ package blobstore import ( + "bytes" "context" "errors" "fmt" @@ -25,6 +26,7 @@ import ( "time" "github.com/cenkalti/backoff/v4" + "golang.org/x/sync/errgroup" git "github.com/dolthub/dolt/go/store/blobstore/internal/git" ) @@ -33,9 +35,8 @@ import ( // database (bare repo or .git directory). It stores keys as paths within the tree // of the commit referenced by a git ref (e.g. refs/dolt/data). // -// This implementation is being developed in phases. 
Read paths are implemented first, -// then write paths are added incrementally. At the moment, Put is implemented, while -// CheckAndPut and Concatenate are still unimplemented. +// This implementation is being developed in phases. Read paths were implemented first, +// then write paths were added incrementally. type GitBlobstore struct { gitDir string ref string @@ -408,15 +409,162 @@ func (gbs *GitBlobstore) CheckAndPut(ctx context.Context, expectedVersion, key s } func (gbs *GitBlobstore) Concatenate(ctx context.Context, key string, sources []string) (string, error) { - if _, err := normalizeGitTreePath(key); err != nil { + key, err := normalizeGitTreePath(key) + if err != nil { return "", err } - for _, src := range sources { - if _, err := normalizeGitTreePath(src); err != nil { + + normSources := make([]string, len(sources)) + for i, src := range sources { + src, err := normalizeGitTreePath(src) + if err != nil { return "", err } + normSources[i] = src } - return "", fmt.Errorf("%w: GitBlobstore.Concatenate", git.ErrUnimplemented) + + // Snapshot the current head for reading sources so we don't depend on the ref staying + // stable while we stream the concatenated contents into a new blob object. + snapshot, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) + if err != nil { + return "", err + } + if !ok && len(normSources) > 0 { + // If the ref doesn't exist, the store is missing/corrupt (there is no commit to + // resolve source paths against). 
+ return "", &git.RefNotFoundError{Ref: gbs.ref} + } + + blobOID, err := gbs.hashConcatenation(ctx, snapshot, ok, normSources) + if err != nil { + return "", err + } + + const maxRetries = 31 // 32 total attempts (initial + retries) + bo := backoff.NewExponentialBackOff() + bo.InitialInterval = 5 * time.Millisecond + bo.Multiplier = 2 + bo.MaxInterval = 320 * time.Millisecond + bo.RandomizationFactor = 0 // deterministic; can add jitter later if needed + bo.Reset() + policy := backoff.WithContext(backoff.WithMaxRetries(bo, maxRetries), ctx) + + var ver string + op := func() error { + parent, hasParent, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) + if err != nil { + return backoff.Permanent(err) + } + + newCommit, msg, err := gbs.buildConcatenateCommit(ctx, parent, hasParent, key, blobOID, len(normSources)) + if err != nil { + return backoff.Permanent(err) + } + + if !hasParent { + // Best-effort ref creation. If a concurrent writer created the ref first, retry. + if err := gbs.api.UpdateRef(ctx, gbs.ref, newCommit, msg); err != nil { + if gbs.refAdvanced(ctx, parent) { + return err + } + return backoff.Permanent(err) + } + ver = newCommit.String() + return nil + } + + err = gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, parent, msg) + if err == nil { + ver = newCommit.String() + return nil + } + if gbs.refAdvanced(ctx, parent) { + return err + } + return backoff.Permanent(err) + } + + if err := backoff.Retry(op, policy); err != nil { + if ctx.Err() != nil { + return "", ctx.Err() + } + return "", err + } + + return ver, nil +} + +func (gbs *GitBlobstore) buildConcatenateCommit(ctx context.Context, parent git.OID, hasParent bool, key string, blobOID git.OID, nSources int) (git.OID, string, error) { + msg := fmt.Sprintf("gitblobstore: concatenate %s (%d sources)", key, nSources) + commitOID, err := gbs.buildCommitWithMessage(ctx, parent, hasParent, key, blobOID, msg) + if err != nil { + return "", "", err + } + return commitOID, msg, nil +} + +func (gbs 
*GitBlobstore) hashConcatenation(ctx context.Context, commit git.OID, hasCommit bool, sources []string) (git.OID, error) { + if len(sources) == 0 { + return gbs.api.HashObject(ctx, bytes.NewReader(nil)) + } + if !hasCommit { + return "", &git.RefNotFoundError{Ref: gbs.ref} + } + + pr, pw := io.Pipe() + eg, ectx := errgroup.WithContext(ctx) + eg.Go(func() error { + defer func() { + _ = pw.Close() + }() + + for _, src := range sources { + blobOID, err := gbs.api.ResolvePathBlob(ectx, commit, src) + if err != nil { + if git.IsPathNotFound(err) { + _ = pw.CloseWithError(NotFound{Key: src}) + return NotFound{Key: src} + } + _ = pw.CloseWithError(err) + return err + } + + rc, err := gbs.api.BlobReader(ectx, blobOID) + if err != nil { + _ = pw.CloseWithError(err) + return err + } + + _, err = io.Copy(pw, rc) + cerr := rc.Close() + if err == nil { + err = cerr + } + if err != nil { + _ = pw.CloseWithError(err) + return err + } + } + return nil + }) + + oid, err := gbs.api.HashObject(ectx, pr) + if err != nil { + _ = pr.CloseWithError(err) + if werr := eg.Wait(); werr != nil { + return "", werr + } + if ctx.Err() != nil { + return "", ctx.Err() + } + return "", err + } + + _ = pr.Close() + if err := eg.Wait(); err != nil { + return "", err + } + return oid, nil } // normalizeGitTreePath normalizes and validates a blobstore key for use as a git tree path. 
diff --git a/go/store/blobstore/git_blobstore_test.go b/go/store/blobstore/git_blobstore_test.go index a543808572..870238dbb1 100644 --- a/go/store/blobstore/git_blobstore_test.go +++ b/go/store/blobstore/git_blobstore_test.go @@ -215,6 +215,12 @@ func TestGitBlobstore_InvalidKeysError(t *testing.T) { _, err = bs.Put(ctx, k, 1, bytes.NewReader([]byte("x"))) require.Error(t, err, "expected error for key %q", k) + + _, err = bs.Concatenate(ctx, k, []string{"ok"}) + require.Error(t, err, "expected error for key %q", k) + + _, err = bs.Concatenate(ctx, "ok2", []string{k}) + require.Error(t, err, "expected error for source key %q", k) } } @@ -268,6 +274,157 @@ func TestGitBlobstore_Put_Overwrite(t *testing.T) { require.Equal(t, []byte("v2\n"), got) } +func TestGitBlobstore_Concatenate_RoundTripAndRanges(t *testing.T) { + requireGitOnPath(t) + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + bs, err := NewGitBlobstoreWithIdentity(repo.GitDir, DoltDataRef, testIdentity()) + require.NoError(t, err) + + a := []byte("aaaaa") + b := []byte("bbb") + c := []byte("cccccccc") + _, err = PutBytes(ctx, bs, "a", a) + require.NoError(t, err) + _, err = PutBytes(ctx, bs, "b", b) + require.NoError(t, err) + _, err = PutBytes(ctx, bs, "c", c) + require.NoError(t, err) + + ver, err := bs.Concatenate(ctx, "composite", []string{"a", "b", "c"}) + require.NoError(t, err) + require.NotEmpty(t, ver) + + // Full object. + got, ver2, err := GetBytes(ctx, bs, "composite", AllRange) + require.NoError(t, err) + require.Equal(t, ver, ver2) + require.Equal(t, append(append(append([]byte(nil), a...), b...), c...), got) + + // Range verification across boundaries. 
+ var off int64 + rc, sz, ver3, err := bs.Get(ctx, "composite", BlobRange{offset: off, length: int64(len(a))}) + require.NoError(t, err) + require.Equal(t, ver, ver3) + require.Equal(t, uint64(len(a)+len(b)+len(c)), sz) + buf, err := io.ReadAll(rc) + _ = rc.Close() + require.NoError(t, err) + require.Equal(t, a, buf) + off += int64(len(a)) + + rc, sz, ver3, err = bs.Get(ctx, "composite", BlobRange{offset: off, length: int64(len(b))}) + require.NoError(t, err) + require.Equal(t, ver, ver3) + require.Equal(t, uint64(len(a)+len(b)+len(c)), sz) + buf, err = io.ReadAll(rc) + _ = rc.Close() + require.NoError(t, err) + require.Equal(t, b, buf) + off += int64(len(b)) + + rc, sz, ver3, err = bs.Get(ctx, "composite", BlobRange{offset: off, length: int64(len(c))}) + require.NoError(t, err) + require.Equal(t, ver, ver3) + require.Equal(t, uint64(len(a)+len(b)+len(c)), sz) + buf, err = io.ReadAll(rc) + _ = rc.Close() + require.NoError(t, err) + require.Equal(t, c, buf) +} + +func TestGitBlobstore_Concatenate_EmptySourcesCreatesEmptyBlob(t *testing.T) { + requireGitOnPath(t) + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + bs, err := NewGitBlobstoreWithIdentity(repo.GitDir, DoltDataRef, testIdentity()) + require.NoError(t, err) + + ver, err := bs.Concatenate(ctx, "empty", nil) + require.NoError(t, err) + require.NotEmpty(t, ver) + + rc, sz, ver2, err := bs.Get(ctx, "empty", AllRange) + require.NoError(t, err) + require.Equal(t, ver, ver2) + require.Equal(t, uint64(0), sz) + data, err := io.ReadAll(rc) + _ = rc.Close() + require.NoError(t, err) + require.Empty(t, data) +} + +func TestGitBlobstore_Concatenate_MissingSourceIsNotFound(t *testing.T) { + requireGitOnPath(t) + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + bs, err := NewGitBlobstoreWithIdentity(repo.GitDir, DoltDataRef, testIdentity()) + require.NoError(t, err) + + _, err 
= PutBytes(ctx, bs, "present", []byte("x")) + require.NoError(t, err) + + _, err = bs.Concatenate(ctx, "composite", []string{"present", "missing"}) + require.Error(t, err) + require.True(t, IsNotFoundError(err)) + + ok, err := bs.Exists(ctx, "composite") + require.NoError(t, err) + require.False(t, ok) +} + +func TestGitBlobstore_Concatenate_ContentionRetryPreservesOtherKey(t *testing.T) { + requireGitOnPath(t) + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + // Seed the ref so Concatenate takes the CAS path. + _, err = repo.SetRefToTree(ctx, DoltDataRef, map[string][]byte{ + "a": []byte("A"), + "b": []byte("B"), + }, "seed") + require.NoError(t, err) + + bs, err := NewGitBlobstoreWithIdentity(repo.GitDir, DoltDataRef, testIdentity()) + require.NoError(t, err) + + origAPI := bs.api + h := &hookGitAPI{GitAPI: origAPI, ref: DoltDataRef} + h.onFirstCAS = func(ctx context.Context, old git.OID) { + // Advance the ref to simulate another writer committing concurrently. 
+ _, _ = writeKeyToRef(ctx, origAPI, DoltDataRef, "external", []byte("external\n"), testIdentity()) + } + bs.api = h + + ver, err := bs.Concatenate(ctx, "composite", []string{"a", "b"}) + require.NoError(t, err) + require.NotEmpty(t, ver) + + got, ver2, err := GetBytes(ctx, bs, "composite", AllRange) + require.NoError(t, err) + require.Equal(t, ver, ver2) + require.Equal(t, []byte("AB"), got) + + got, _, err = GetBytes(ctx, bs, "external", AllRange) + require.NoError(t, err) + require.Equal(t, []byte("external\n"), got) + + got, _, err = GetBytes(ctx, bs, "a", AllRange) + require.NoError(t, err) + require.Equal(t, []byte("A"), got) +} + type hookGitAPI struct { git.GitAPI From 2419da780311e363026b87c3156c3b39cd453fbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Thu, 5 Feb 2026 10:05:02 -0800 Subject: [PATCH 04/28] /go/store/blobstore/git_blobstore.go: fix lost concurrent writes on empty repo --- go/store/blobstore/git_blobstore.go | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index 0aef870b5d..c8e3020a48 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -212,9 +212,10 @@ func (gbs *GitBlobstore) Put(ctx context.Context, key string, totalSize int64, r } if !ok { - // Best-effort ref creation. If a concurrent writer created the ref first, retry - // and take the normal CAS path on the new head. - if err := gbs.api.UpdateRef(ctx, gbs.ref, newCommit, msg); err != nil { + // Create-only CAS: oldOID=all-zero requires the ref to not exist. This avoids + // losing concurrent writes when multiple goroutines create the ref at once. 
+ const zeroOID = git.OID("0000000000000000000000000000000000000000") + if err := gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, zeroOID, msg); err != nil { if gbs.refAdvanced(ctx, parent) { return err } @@ -462,8 +463,10 @@ func (gbs *GitBlobstore) Concatenate(ctx context.Context, key string, sources [] } if !hasParent { - // Best-effort ref creation. If a concurrent writer created the ref first, retry. - if err := gbs.api.UpdateRef(ctx, gbs.ref, newCommit, msg); err != nil { + // Create-only CAS: oldOID=all-zero requires the ref to not exist. This avoids + // losing concurrent writes when multiple goroutines create the ref at once. + const zeroOID = git.OID("0000000000000000000000000000000000000000") + if err := gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, zeroOID, msg); err != nil { if gbs.refAdvanced(ctx, parent) { return err } From 361a5ff747f6a85d60ca55308cf8149967e717e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Thu, 5 Feb 2026 11:53:10 -0800 Subject: [PATCH 05/28] /go/store/blobstore: maybe concatenate --- go/store/blobstore/git_blobstore.go | 559 ++++++++++++++++-- .../git_blobstore_chunked_checkandput_test.go | 82 +++ .../git_blobstore_chunked_concatenate_test.go | 88 +++ .../git_blobstore_chunked_get_test.go | 126 ++++ .../git_blobstore_chunked_put_test.go | 84 +++ go/store/blobstore/git_blobstore_parts.go | 40 ++ .../blobstore/git_blobstore_parts_test.go | 67 +++ .../blobstore/internal/gitbs/descriptor.go | 228 +++++++ .../internal/gitbs/descriptor_test.go | 91 +++ .../blobstore/internal/gitbs/parts_path.go | 40 ++ .../internal/gitbs/parts_path_test.go | 40 ++ go/store/blobstore/internal/gitbs/ranges.go | 131 ++++ .../blobstore/internal/gitbs/ranges_test.go | 82 +++ 13 files changed, 1614 insertions(+), 44 deletions(-) create mode 100644 go/store/blobstore/git_blobstore_chunked_checkandput_test.go create mode 100644 go/store/blobstore/git_blobstore_chunked_concatenate_test.go create mode 100644 
go/store/blobstore/git_blobstore_chunked_get_test.go create mode 100644 go/store/blobstore/git_blobstore_chunked_put_test.go create mode 100644 go/store/blobstore/git_blobstore_parts.go create mode 100644 go/store/blobstore/git_blobstore_parts_test.go create mode 100644 go/store/blobstore/internal/gitbs/descriptor.go create mode 100644 go/store/blobstore/internal/gitbs/descriptor_test.go create mode 100644 go/store/blobstore/internal/gitbs/parts_path.go create mode 100644 go/store/blobstore/internal/gitbs/parts_path_test.go create mode 100644 go/store/blobstore/internal/gitbs/ranges.go create mode 100644 go/store/blobstore/internal/gitbs/ranges_test.go diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index c8e3020a48..3f18d29920 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -22,6 +22,7 @@ import ( "io" "os" "path/filepath" + "sort" "strings" "time" @@ -29,6 +30,7 @@ import ( "golang.org/x/sync/errgroup" git "github.com/dolthub/dolt/go/store/blobstore/internal/git" + gitbs "github.com/dolthub/dolt/go/store/blobstore/internal/gitbs" ) // GitBlobstore is a Blobstore implementation backed by a git repository's object @@ -46,6 +48,12 @@ type GitBlobstore struct { // When nil, we prefer whatever identity git derives from env/config, falling back // to a deterministic default only if git reports the identity is missing. identity *git.Identity + // maxPartSize, when non-zero, enables the chunked-object representation for objects + // written by Put/CheckAndPut/Concatenate. When enabled, no single part blob created + // by this blobstore should exceed maxPartSize bytes. + // + // A zero value means "disabled" (store values inline as a single git blob). + maxPartSize uint64 } var _ Blobstore = (*GitBlobstore)(nil) @@ -54,17 +62,38 @@ var _ Blobstore = (*GitBlobstore)(nil) // |gitDir| should point at a bare repo directory or a .git directory. 
Put is implemented, // while CheckAndPut and Concatenate are still unimplemented (see type-level docs). func NewGitBlobstore(gitDir, ref string) (*GitBlobstore, error) { - return NewGitBlobstoreWithIdentity(gitDir, ref, nil) + return NewGitBlobstoreWithOptions(gitDir, ref, GitBlobstoreOptions{}) } // NewGitBlobstoreWithIdentity creates a GitBlobstore rooted at |gitDir| and |ref|, optionally // forcing an author/committer identity for write paths. func NewGitBlobstoreWithIdentity(gitDir, ref string, identity *git.Identity) (*GitBlobstore, error) { + return NewGitBlobstoreWithOptions(gitDir, ref, GitBlobstoreOptions{Identity: identity}) +} + +// GitBlobstoreOptions configures optional behaviors of GitBlobstore. +type GitBlobstoreOptions struct { + // Identity, when non-nil, forces the author/committer identity for commits created by write paths. + Identity *git.Identity + // MaxPartSize enables chunked-object writes when non-zero. + // Read paths always support chunked objects if encountered. + MaxPartSize uint64 +} + +// NewGitBlobstoreWithOptions creates a GitBlobstore rooted at |gitDir| and |ref|. +func NewGitBlobstoreWithOptions(gitDir, ref string, opts GitBlobstoreOptions) (*GitBlobstore, error) { r, err := git.NewRunner(gitDir) if err != nil { return nil, err } - return &GitBlobstore{gitDir: gitDir, ref: ref, runner: r, api: git.NewGitAPIImpl(r), identity: identity}, nil + return &GitBlobstore{ + gitDir: gitDir, + ref: ref, + runner: r, + api: git.NewGitAPIImpl(r), + identity: opts.Identity, + maxPartSize: opts.MaxPartSize, + }, nil } func (gbs *GitBlobstore) Path() string { @@ -124,27 +153,126 @@ func (gbs *GitBlobstore) Get(ctx context.Context, key string, br BlobRange) (io. return nil, 0, commit.String(), err } - // TODO(gitblobstore): This streaming implementation is correct but may be slow for workloads - // that do many small ranged reads (e.g. table index/footer reads). 
Consider caching/materializing - // blobs to a local file (or using a batched git cat-file mode) to serve ranges efficiently. + return gbs.openBlobOrDescriptorRange(ctx, commit, blobOID, sz, br) +} + +type limitReadCloser struct { + r io.Reader + c io.Closer +} + +func (l *limitReadCloser) Read(p []byte) (int, error) { return l.r.Read(p) } +func (l *limitReadCloser) Close() error { return l.c.Close() } + +func (gbs *GitBlobstore) openBlobOrDescriptorRange(ctx context.Context, commit git.OID, blobOID git.OID, blobSize int64, br BlobRange) (io.ReadCloser, uint64, string, error) { + ver := commit.String() + + // Read the blob contents. If it's a descriptor, we'll parse it and stream across parts. rc, err := gbs.api.BlobReader(ctx, blobOID) if err != nil { - return nil, 0, commit.String(), err + return nil, 0, ver, err + } + defer func() { + if rc != nil { + _ = rc.Close() + } + }() + + // Read up to a bounded prefix to determine if it's a descriptor. If it looks like one, + // read the full blob (descriptors are expected to be small). + const peekN = 64 * 1024 + peek := make([]byte, 0, 256) + buf := make([]byte, 256) + for len(peek) < cap(peek) { + n, rerr := rc.Read(buf[:min(cap(peek)-len(peek), len(buf))]) + if n > 0 { + peek = append(peek, buf[:n]...) + } + if rerr != nil { + if errors.Is(rerr, io.EOF) { + break + } + return nil, 0, ver, rerr + } } + // Not a descriptor: stream inline blob with BlobRange slicing. + if !gitbs.IsDescriptorPrefix(peek) { + // Re-open for streaming the full inline blob. (Simpler than splicing peek+rest.) + _ = rc.Close() + rc = nil + + inlineRC, err := gbs.api.BlobReader(ctx, blobOID) + if err != nil { + return nil, 0, ver, err + } + return sliceInlineBlob(inlineRC, blobSize, br, ver) + } + + // It's probably a descriptor. Read the full contents (bounded defensively). + // TODO(gitblobstore): add a MaxDescriptorSize config; for now cap at 64KiB. + descBytes := append([]byte(nil), peek...) 
+ for int64(len(descBytes)) < blobSize && len(descBytes) < peekN { + n, rerr := rc.Read(buf) + if n > 0 { + descBytes = append(descBytes, buf[:n]...) + } + if rerr != nil { + if errors.Is(rerr, io.EOF) { + break + } + return nil, 0, ver, rerr + } + } + if int64(len(descBytes)) < blobSize { + if blobSize > peekN { + return nil, 0, ver, fmt.Errorf("gitblobstore: descriptor too large (%d bytes, cap %d)", blobSize, peekN) + } + return nil, 0, ver, io.ErrUnexpectedEOF + } + + desc, err := gitbs.ParseDescriptor(descBytes) + if err != nil { + // Treat malformed descriptors as corruption (hard error). + return nil, 0, ver, err + } + + total := int64(desc.TotalSize) + start, end, err := gitbs.NormalizeRange(total, br.offset, br.length) + if err != nil { + return nil, uint64(desc.TotalSize), ver, err + } + slices, err := gitbs.SliceParts(desc.Parts, start, end) + if err != nil { + return nil, uint64(desc.TotalSize), ver, err + } + + // Stream across part blobs. + streamRC := &multiPartReadCloser{ + ctx: ctx, + api: gbs.api, + slices: slices, + } + // Close descriptor blob reader (not used past this point). + _ = rc.Close() + rc = nil + return streamRC, uint64(desc.TotalSize), ver, nil +} + +func sliceInlineBlob(rc io.ReadCloser, sz int64, br BlobRange, ver string) (io.ReadCloser, uint64, string, error) { // Implement BlobRange by slicing the streamed blob contents. 
if br.isAllRange() { - return rc, uint64(sz), commit.String(), nil + return rc, uint64(sz), ver, nil } pos := br.positiveRange(sz) if pos.offset < 0 || pos.offset > sz { _ = rc.Close() - return nil, uint64(sz), commit.String(), fmt.Errorf("invalid BlobRange offset %d for blob of size %d", pos.offset, sz) + return nil, uint64(sz), ver, fmt.Errorf("invalid BlobRange offset %d for blob of size %d", pos.offset, sz) } if pos.length < 0 { _ = rc.Close() - return nil, uint64(sz), commit.String(), fmt.Errorf("invalid BlobRange length %d", pos.length) + return nil, uint64(sz), ver, fmt.Errorf("invalid BlobRange length %d", pos.length) } if pos.length == 0 { // Read from offset to end. @@ -159,20 +287,93 @@ func (gbs *GitBlobstore) Get(ctx context.Context, key string, br BlobRange) (io. if pos.offset > 0 { if _, err := io.CopyN(io.Discard, rc, pos.offset); err != nil { _ = rc.Close() - return nil, uint64(sz), commit.String(), err + return nil, uint64(sz), ver, err } } - return &limitReadCloser{r: io.LimitReader(rc, pos.length), c: rc}, uint64(sz), commit.String(), nil + return &limitReadCloser{r: io.LimitReader(rc, pos.length), c: rc}, uint64(sz), ver, nil } -type limitReadCloser struct { - r io.Reader - c io.Closer +type multiPartReadCloser struct { + ctx context.Context + api git.GitAPI + + slices []gitbs.PartSlice + curIdx int + + curRC io.ReadCloser + rem int64 } -func (l *limitReadCloser) Read(p []byte) (int, error) { return l.r.Read(p) } -func (l *limitReadCloser) Close() error { return l.c.Close() } +func (m *multiPartReadCloser) Read(p []byte) (int, error) { + for { + if m.curRC == nil { + if m.curIdx >= len(m.slices) { + return 0, io.EOF + } + s := m.slices[m.curIdx] + rc, err := m.api.BlobReader(m.ctx, git.OID(s.OIDHex)) + if err != nil { + return 0, err + } + // Skip within part. 
+ if s.Offset > 0 { + if _, err := io.CopyN(io.Discard, rc, s.Offset); err != nil { + _ = rc.Close() + return 0, err + } + } + m.curRC = rc + m.rem = s.Length + } + + if m.rem == 0 { + _ = m.curRC.Close() + m.curRC = nil + m.curIdx++ + continue + } + + toRead := len(p) + if int64(toRead) > m.rem { + toRead = int(m.rem) + } + n, err := m.curRC.Read(p[:toRead]) + if n > 0 { + m.rem -= int64(n) + return n, nil + } + if err != nil { + if errors.Is(err, io.EOF) { + // End of underlying part blob; if we still expected bytes, that's corruption. + if m.rem > 0 { + return 0, io.ErrUnexpectedEOF + } + _ = m.curRC.Close() + m.curRC = nil + m.curIdx++ + continue + } + return 0, err + } + } +} + +func (m *multiPartReadCloser) Close() error { + if m.curRC != nil { + err := m.curRC.Close() + m.curRC = nil + return err + } + return nil +} + +func min(a, b int) int { + if a < b { + return a + } + return b +} func (gbs *GitBlobstore) Put(ctx context.Context, key string, totalSize int64, reader io.Reader) (string, error) { key, err := normalizeGitTreePath(key) @@ -180,9 +381,11 @@ func (gbs *GitBlobstore) Put(ctx context.Context, key string, totalSize int64, r return "", err } + msg := fmt.Sprintf("gitblobstore: put %s", key) + // Hash the contents once. If we need to retry due to concurrent updates to |gbs.ref|, - // we can reuse the blob OID without re-reading |reader|. - blobOID, err := gbs.api.HashObject(ctx, reader) + // we can reuse the resulting object OIDs without re-reading |reader|. 
+ writes, err := gbs.planPutWrites(ctx, key, totalSize, reader) if err != nil { return "", err } @@ -206,7 +409,7 @@ func (gbs *GitBlobstore) Put(ctx context.Context, key string, totalSize int64, r return backoff.Permanent(err) } - newCommit, msg, err := gbs.buildPutCommit(ctx, parent, ok, key, blobOID) + newCommit, err := gbs.buildCommitWithWrites(ctx, parent, ok, writes, msg) if err != nil { return backoff.Permanent(err) } @@ -248,16 +451,16 @@ func (gbs *GitBlobstore) Put(ctx context.Context, key string, totalSize int64, r return ver, nil } -func (gbs *GitBlobstore) buildPutCommit(ctx context.Context, parent git.OID, hasParent bool, key string, blobOID git.OID) (git.OID, string, error) { - msg := fmt.Sprintf("gitblobstore: put %s", key) - commitOID, err := gbs.buildCommitWithMessage(ctx, parent, hasParent, key, blobOID, msg) - if err != nil { - return "", "", err - } - return commitOID, msg, nil +type treeWrite struct { + path string + oid git.OID } func (gbs *GitBlobstore) buildCommitWithMessage(ctx context.Context, parent git.OID, hasParent bool, key string, blobOID git.OID, msg string) (git.OID, error) { + return gbs.buildCommitWithWrites(ctx, parent, hasParent, []treeWrite{{path: key, oid: blobOID}}, msg) +} + +func (gbs *GitBlobstore) buildCommitWithWrites(ctx context.Context, parent git.OID, hasParent bool, writes []treeWrite, msg string) (git.OID, error) { _, indexFile, cleanup, err := newTempIndex() if err != nil { return "", err @@ -280,8 +483,11 @@ func (gbs *GitBlobstore) buildCommitWithMessage(ctx context.Context, parent git. // flat (e.g. "manifest", "", ".records"), so this should not occur. If we ever // namespace keys into directories, consider proactively removing conflicting paths from the index // before UpdateIndexCacheInfo so Put/CheckAndPut remain robust. 
- if err := gbs.api.UpdateIndexCacheInfo(ctx, indexFile, "100644", blobOID, key); err != nil { - return "", err + sort.Slice(writes, func(i, j int) bool { return writes[i].path < writes[j].path }) + for _, w := range writes { + if err := gbs.api.UpdateIndexCacheInfo(ctx, indexFile, "100644", w.oid, w.path); err != nil { + return "", err + } } treeOID, err := gbs.api.WriteTree(ctx, indexFile) @@ -307,6 +513,91 @@ func (gbs *GitBlobstore) buildCommitWithMessage(ctx context.Context, parent git. return commitOID, nil } +func (gbs *GitBlobstore) planPutWrites(ctx context.Context, key string, totalSize int64, reader io.Reader) ([]treeWrite, error) { + // Minimal policy: chunk only when explicitly enabled and |totalSize| exceeds MaxPartSize. + if gbs.maxPartSize == 0 || totalSize <= 0 || uint64(totalSize) <= gbs.maxPartSize { + blobOID, err := gbs.api.HashObject(ctx, reader) + if err != nil { + return nil, err + } + return []treeWrite{{path: key, oid: blobOID}}, nil + } + + descOID, partOIDs, err := gbs.hashChunkedObject(ctx, reader) + if err != nil { + return nil, err + } + + writes := make([]treeWrite, 0, 1+len(partOIDs)) + writes = append(writes, treeWrite{path: key, oid: descOID}) + for _, p := range partOIDs { + ppath, err := gitbs.PartPath(p.String()) + if err != nil { + return nil, err + } + writes = append(writes, treeWrite{path: ppath, oid: p}) + } + return writes, nil +} + +func (gbs *GitBlobstore) hashChunkedObject(ctx context.Context, reader io.Reader) (descOID git.OID, partOIDs []git.OID, err error) { + max := int64(gbs.maxPartSize) + if max <= 0 { + return "", nil, fmt.Errorf("gitblobstore: invalid maxPartSize %d", gbs.maxPartSize) + } + + parts, partOIDs, total, err := gbs.hashParts(ctx, reader) + if err != nil { + return "", nil, err + } + + descBytes, err := gitbs.EncodeDescriptor(gitbs.Descriptor{TotalSize: total, Parts: parts}) + if err != nil { + return "", nil, err + } + descOID, err = gbs.api.HashObject(ctx, bytes.NewReader(descBytes)) + if err != 
nil { + return "", nil, err + } + return descOID, partOIDs, nil +} + +func (gbs *GitBlobstore) hashParts(ctx context.Context, reader io.Reader) (parts []gitbs.PartRef, partOIDs []git.OID, total uint64, err error) { + max := int64(gbs.maxPartSize) + if max <= 0 { + return nil, nil, 0, fmt.Errorf("gitblobstore: invalid maxPartSize %d", gbs.maxPartSize) + } + + buf := make([]byte, max) + for { + n, rerr := io.ReadFull(reader, buf) + if rerr != nil { + if errors.Is(rerr, io.EOF) { + break + } + if !errors.Is(rerr, io.ErrUnexpectedEOF) { + return nil, nil, 0, rerr + } + // ErrUnexpectedEOF: process final short chunk and stop. + } + if n == 0 { + break + } + partBytes := append([]byte(nil), buf[:n]...) + oid, err := gbs.api.HashObject(ctx, bytes.NewReader(partBytes)) + if err != nil { + return nil, nil, 0, err + } + partOIDs = append(partOIDs, oid) + parts = append(parts, gitbs.PartRef{OIDHex: oid.String(), Size: uint64(n)}) + total += uint64(n) + if errors.Is(rerr, io.ErrUnexpectedEOF) { + break + } + } + return parts, partOIDs, total, nil +} + func defaultGitBlobstoreIdentity() *git.Identity { // Deterministic fallback identity for environments without git identity configured. 
return &git.Identity{Name: "dolt gitblobstore", Email: "gitblobstore@dolt.invalid"} @@ -374,13 +665,13 @@ func (gbs *GitBlobstore) CheckAndPut(ctx context.Context, expectedVersion, key s return "", CheckAndPutError{Key: key, ExpectedVersion: expectedVersion, ActualVersion: actualVersion} } - blobOID, err := gbs.api.HashObject(ctx, reader) + msg := fmt.Sprintf("gitblobstore: checkandput %s", key) + writes, err := gbs.planPutWrites(ctx, key, totalSize, reader) if err != nil { return "", err } - msg := fmt.Sprintf("gitblobstore: checkandput %s", key) - newCommit, err := gbs.buildCommitWithMessage(ctx, parent, ok, key, blobOID, msg) + newCommit, err := gbs.buildCommitWithWrites(ctx, parent, ok, writes, msg) if err != nil { return "", err } @@ -436,9 +727,20 @@ func (gbs *GitBlobstore) Concatenate(ctx context.Context, key string, sources [] return "", &git.RefNotFoundError{Ref: gbs.ref} } - blobOID, err := gbs.hashConcatenation(ctx, snapshot, ok, normSources) - if err != nil { - return "", err + msg := fmt.Sprintf("gitblobstore: concatenate %s (%d sources)", key, len(normSources)) + + var writes []treeWrite + if gbs.maxPartSize == 0 { + blobOID, err := gbs.hashConcatenation(ctx, snapshot, ok, normSources) + if err != nil { + return "", err + } + writes = []treeWrite{{path: key, oid: blobOID}} + } else { + writes, err = gbs.planConcatenateWritesChunked(ctx, snapshot, ok, key, normSources) + if err != nil { + return "", err + } } const maxRetries = 31 // 32 total attempts (initial + retries) @@ -457,7 +759,7 @@ func (gbs *GitBlobstore) Concatenate(ctx context.Context, key string, sources [] return backoff.Permanent(err) } - newCommit, msg, err := gbs.buildConcatenateCommit(ctx, parent, hasParent, key, blobOID, len(normSources)) + newCommit, err := gbs.buildCommitWithWrites(ctx, parent, hasParent, writes, msg) if err != nil { return backoff.Permanent(err) } @@ -497,15 +799,6 @@ func (gbs *GitBlobstore) Concatenate(ctx context.Context, key string, sources [] return ver, nil 
} -func (gbs *GitBlobstore) buildConcatenateCommit(ctx context.Context, parent git.OID, hasParent bool, key string, blobOID git.OID, nSources int) (git.OID, string, error) { - msg := fmt.Sprintf("gitblobstore: concatenate %s (%d sources)", key, nSources) - commitOID, err := gbs.buildCommitWithMessage(ctx, parent, hasParent, key, blobOID, msg) - if err != nil { - return "", "", err - } - return commitOID, msg, nil -} - func (gbs *GitBlobstore) hashConcatenation(ctx context.Context, commit git.OID, hasCommit bool, sources []string) (git.OID, error) { if len(sources) == 0 { return gbs.api.HashObject(ctx, bytes.NewReader(nil)) @@ -570,6 +863,184 @@ func (gbs *GitBlobstore) hashConcatenation(ctx context.Context, commit git.OID, return oid, nil } +type resolvedConcatSource struct { + inlineOID git.OID + inlineSize int64 + desc *gitbs.Descriptor +} + +func (gbs *GitBlobstore) resolveConcatSource(ctx context.Context, commit git.OID, path string) (resolvedConcatSource, error) { + blobOID, err := gbs.api.ResolvePathBlob(ctx, commit, path) + if err != nil { + if git.IsPathNotFound(err) { + return resolvedConcatSource{}, NotFound{Key: path} + } + return resolvedConcatSource{}, err + } + sz, err := gbs.api.BlobSize(ctx, blobOID) + if err != nil { + return resolvedConcatSource{}, err + } + + // Peek enough bytes to detect descriptor prefix conservatively. + rc, err := gbs.api.BlobReader(ctx, blobOID) + if err != nil { + return resolvedConcatSource{}, err + } + defer rc.Close() + + peek := make([]byte, 0, 64) + buf := make([]byte, 64) + for len(peek) < cap(peek) { + n, rerr := rc.Read(buf[:min(cap(peek)-len(peek), len(buf))]) + if n > 0 { + peek = append(peek, buf[:n]...) + } + if rerr != nil { + if errors.Is(rerr, io.EOF) { + break + } + return resolvedConcatSource{}, rerr + } + } + + if !gitbs.IsDescriptorPrefix(peek) { + return resolvedConcatSource{inlineOID: blobOID, inlineSize: sz}, nil + } + + // Descriptor: re-read whole descriptor blob (bounded). 
+ // TODO(gitblobstore): configurable MaxDescriptorSize. + const maxDesc = int64(64 * 1024) + if sz > maxDesc { + return resolvedConcatSource{}, fmt.Errorf("gitblobstore: descriptor too large (%d bytes, cap %d)", sz, maxDesc) + } + _ = rc.Close() + + rc2, err := gbs.api.BlobReader(ctx, blobOID) + if err != nil { + return resolvedConcatSource{}, err + } + defer rc2.Close() + + descBytes, err := io.ReadAll(rc2) + if err != nil { + return resolvedConcatSource{}, err + } + desc, err := gitbs.ParseDescriptor(descBytes) + if err != nil { + return resolvedConcatSource{}, err + } + return resolvedConcatSource{desc: &desc}, nil +} + +func (gbs *GitBlobstore) planConcatenateWritesChunked(ctx context.Context, snapshot git.OID, hasSnapshot bool, key string, sources []string) ([]treeWrite, error) { + if len(sources) == 0 { + // Empty concatenation => empty object. Store inline. + oid, err := gbs.api.HashObject(ctx, bytes.NewReader(nil)) + if err != nil { + return nil, err + } + return []treeWrite{{path: key, oid: oid}}, nil + } + if !hasSnapshot { + return nil, &git.RefNotFoundError{Ref: gbs.ref} + } + + var ( + allParts []gitbs.PartRef + allPartOID = make(map[git.OID]struct{}) + total uint64 + ) + + for _, src := range sources { + rs, err := gbs.resolveConcatSource(ctx, snapshot, src) + if err != nil { + return nil, err + } + + var parts []gitbs.PartRef + var oids []git.OID + + if rs.desc != nil { + parts = rs.desc.Parts + for _, p := range parts { + oid := git.OID(p.OIDHex) + if p.Size > gbs.maxPartSize { + // Re-chunk oversized part. + rc, err := gbs.api.BlobReader(ctx, oid) + if err != nil { + return nil, err + } + newParts, newOIDs, _, err := gbs.hashParts(ctx, rc) + _ = rc.Close() + if err != nil { + return nil, err + } + allParts = append(allParts, newParts...) + for _, no := range newOIDs { + allPartOID[no] = struct{}{} + } + for _, np := range newParts { + total += np.Size + } + continue + } + oids = append(oids, oid) + } + } else { + // Inline. 
+ if rs.inlineSize < 0 { + return nil, fmt.Errorf("gitblobstore: invalid inline size %d", rs.inlineSize) + } + if uint64(rs.inlineSize) > gbs.maxPartSize { + // Re-chunk oversized inline blob. + rc, err := gbs.api.BlobReader(ctx, rs.inlineOID) + if err != nil { + return nil, err + } + newParts, newOIDs, _, err := gbs.hashParts(ctx, rc) + _ = rc.Close() + if err != nil { + return nil, err + } + parts = newParts + oids = newOIDs + } else { + parts = []gitbs.PartRef{{OIDHex: rs.inlineOID.String(), Size: uint64(rs.inlineSize)}} + oids = []git.OID{rs.inlineOID} + } + } + + allParts = append(allParts, parts...) + for _, o := range oids { + allPartOID[o] = struct{}{} + } + for _, p := range parts { + total += p.Size + } + } + + descBytes, err := gitbs.EncodeDescriptor(gitbs.Descriptor{TotalSize: total, Parts: allParts}) + if err != nil { + return nil, err + } + descOID, err := gbs.api.HashObject(ctx, bytes.NewReader(descBytes)) + if err != nil { + return nil, err + } + + writes := make([]treeWrite, 0, 1+len(allPartOID)) + writes = append(writes, treeWrite{path: key, oid: descOID}) + for oid := range allPartOID { + ppath, err := gitbs.PartPath(oid.String()) + if err != nil { + return nil, err + } + writes = append(writes, treeWrite{path: ppath, oid: oid}) + } + return writes, nil +} + // normalizeGitTreePath normalizes and validates a blobstore key for use as a git tree path. // // Rules: diff --git a/go/store/blobstore/git_blobstore_chunked_checkandput_test.go b/go/store/blobstore/git_blobstore_chunked_checkandput_test.go new file mode 100644 index 0000000000..316131fc9c --- /dev/null +++ b/go/store/blobstore/git_blobstore_chunked_checkandput_test.go @@ -0,0 +1,82 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package blobstore + +import ( + "bytes" + "context" + "errors" + "io" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/dolthub/dolt/go/store/testutils/gitrepo" +) + +func TestGitBlobstore_CheckAndPut_ChunkedRoundTrip_CreateOnly(t *testing.T) { + requireGitOnPath(t) + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + bs, err := NewGitBlobstoreWithOptions(repo.GitDir, DoltDataRef, GitBlobstoreOptions{ + Identity: testIdentity(), + MaxPartSize: 3, + }) + require.NoError(t, err) + + want := []byte("abcdefghij") // 10 bytes -> chunked + ver, err := bs.CheckAndPut(ctx, "", "big", int64(len(want)), bytes.NewReader(want)) + require.NoError(t, err) + require.NotEmpty(t, ver) + + got, ver2, err := GetBytes(ctx, bs, "big", AllRange) + require.NoError(t, err) + require.Equal(t, ver, ver2) + require.Equal(t, want, got) +} + +type failReadSeeker struct{} + +func (f failReadSeeker) Read(p []byte) (int, error) { + return 0, errors.New("read should not be called") +} + +func TestGitBlobstore_CheckAndPut_MismatchDoesNotConsumeReader(t *testing.T) { + requireGitOnPath(t) + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + // Seed any commit so actualVersion != "". 
+ bs0, err := NewGitBlobstoreWithOptions(repo.GitDir, DoltDataRef, GitBlobstoreOptions{Identity: testIdentity()}) + require.NoError(t, err) + _, err = bs0.Put(ctx, "x", 1, bytes.NewReader([]byte("x"))) + require.NoError(t, err) + + bs, err := NewGitBlobstoreWithOptions(repo.GitDir, DoltDataRef, GitBlobstoreOptions{ + Identity: testIdentity(), + MaxPartSize: 3, + }) + require.NoError(t, err) + + // Provide a wrong expectedVersion; should fail without reading. + _, err = bs.CheckAndPut(ctx, "definitely-wrong", "y", 1, io.Reader(failReadSeeker{})) + require.Error(t, err) + require.True(t, IsCheckAndPutError(err)) +} diff --git a/go/store/blobstore/git_blobstore_chunked_concatenate_test.go b/go/store/blobstore/git_blobstore_chunked_concatenate_test.go new file mode 100644 index 0000000000..ed989cbb4b --- /dev/null +++ b/go/store/blobstore/git_blobstore_chunked_concatenate_test.go @@ -0,0 +1,88 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package blobstore + +import ( + "bytes" + "context" + "io" + "testing" + + "github.com/stretchr/testify/require" + + git "github.com/dolthub/dolt/go/store/blobstore/internal/git" + gitbs "github.com/dolthub/dolt/go/store/blobstore/internal/gitbs" + "github.com/dolthub/dolt/go/store/testutils/gitrepo" +) + +func TestGitBlobstore_Concatenate_ChunkedStructuralAndRechunksOversizedInline(t *testing.T) { + requireGitOnPath(t) + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + // Seed an oversized inline blob (chunking disabled) so we exercise re-chunking during Concatenate. + seed, err := NewGitBlobstoreWithOptions(repo.GitDir, DoltDataRef, GitBlobstoreOptions{ + Identity: testIdentity(), + MaxPartSize: 0, + }) + require.NoError(t, err) + + inline := []byte("abcdefghij") // 10 bytes + _, err = seed.Put(ctx, "a", int64(len(inline)), bytes.NewReader(inline)) + require.NoError(t, err) + + // Now concatenate in chunked mode with a small max part size. + bs, err := NewGitBlobstoreWithOptions(repo.GitDir, DoltDataRef, GitBlobstoreOptions{ + Identity: testIdentity(), + MaxPartSize: 3, + }) + require.NoError(t, err) + + ver, err := bs.Concatenate(ctx, "out", []string{"a"}) + require.NoError(t, err) + require.NotEmpty(t, ver) + + got, ver2, err := GetBytes(ctx, bs, "out", AllRange) + require.NoError(t, err) + require.Equal(t, ver, ver2) + require.Equal(t, inline, got) + + // Verify "out" is a descriptor and all parts are <= 3 and reachable under parts namespace. 
+ runner, err := git.NewRunner(repo.GitDir) + require.NoError(t, err) + api := git.NewGitAPIImpl(runner) + commit := git.OID(ver) + + outOID, err := api.ResolvePathBlob(ctx, commit, "out") + require.NoError(t, err) + rc, err := api.BlobReader(ctx, outOID) + require.NoError(t, err) + descBytes, err := io.ReadAll(rc) + require.NoError(t, err) + require.NoError(t, rc.Close()) + + desc, err := gitbs.ParseDescriptor(descBytes) + require.NoError(t, err) + require.Equal(t, uint64(len(inline)), desc.TotalSize) + for _, p := range desc.Parts { + require.LessOrEqual(t, p.Size, uint64(3)) + ppath, err := gitbs.PartPath(p.OIDHex) + require.NoError(t, err) + _, err = api.ResolvePathBlob(ctx, commit, ppath) + require.NoError(t, err) + } +} diff --git a/go/store/blobstore/git_blobstore_chunked_get_test.go b/go/store/blobstore/git_blobstore_chunked_get_test.go new file mode 100644 index 0000000000..923f91071a --- /dev/null +++ b/go/store/blobstore/git_blobstore_chunked_get_test.go @@ -0,0 +1,126 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package blobstore + +import ( + "bytes" + "context" + "os/exec" + "testing" + + "github.com/stretchr/testify/require" + + git "github.com/dolthub/dolt/go/store/blobstore/internal/git" + gitbs "github.com/dolthub/dolt/go/store/blobstore/internal/gitbs" + "github.com/dolthub/dolt/go/store/testutils/gitrepo" +) + +func TestGitBlobstore_Get_ChunkedDescriptor_AllAndRanges(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not found on PATH") + } + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + runner, err := git.NewRunner(repo.GitDir) + require.NoError(t, err) + api := git.NewGitAPIImpl(runner) + + // Create two part blobs. + part1 := []byte("abc") + part2 := []byte("defgh") + oid1, err := api.HashObject(ctx, bytes.NewReader(part1)) + require.NoError(t, err) + oid2, err := api.HashObject(ctx, bytes.NewReader(part2)) + require.NoError(t, err) + + desc := gitbs.Descriptor{ + TotalSize: uint64(len(part1) + len(part2)), + Parts: []gitbs.PartRef{ + {OIDHex: oid1.String(), Size: uint64(len(part1))}, + {OIDHex: oid2.String(), Size: uint64(len(part2))}, + }, + } + descBytes, err := gitbs.EncodeDescriptor(desc) + require.NoError(t, err) + descOID, err := api.HashObject(ctx, bytes.NewReader(descBytes)) + require.NoError(t, err) + + // Build a commit whose tree contains: + // - key "chunked" -> descriptor blob + // - parts staged under reserved parts namespace (reachability) + _, indexFile, cleanup, err := newTempIndex() + require.NoError(t, err) + defer cleanup() + + require.NoError(t, api.ReadTreeEmpty(ctx, indexFile)) + require.NoError(t, api.UpdateIndexCacheInfo(ctx, indexFile, "100644", descOID, "chunked")) + _, err = stagePartReachable(ctx, api, indexFile, oid1) + require.NoError(t, err) + _, err = stagePartReachable(ctx, api, indexFile, oid2) + require.NoError(t, err) + + treeOID, err := api.WriteTree(ctx, indexFile) + require.NoError(t, err) + commitOID, err := 
api.CommitTree(ctx, treeOID, nil, "seed chunked descriptor", &git.Identity{Name: "t", Email: "t@t"}) + require.NoError(t, err) + require.NoError(t, api.UpdateRef(ctx, DoltDataRef, commitOID, "seed")) + + bs, err := NewGitBlobstore(repo.GitDir, DoltDataRef) + require.NoError(t, err) + + wantAll := append(append([]byte(nil), part1...), part2...) + + got, ver, err := GetBytes(ctx, bs, "chunked", AllRange) + require.NoError(t, err) + require.Equal(t, commitOID.String(), ver) + require.Equal(t, wantAll, got) + + // Range spanning boundary: offset 2 length 4 => "cdef" + got, ver, err = GetBytes(ctx, bs, "chunked", NewBlobRange(2, 4)) + require.NoError(t, err) + require.Equal(t, commitOID.String(), ver) + require.Equal(t, []byte("cdef"), got) + + // Tail read last 3 bytes => "fgh" + got, ver, err = GetBytes(ctx, bs, "chunked", NewBlobRange(-3, 0)) + require.NoError(t, err) + require.Equal(t, commitOID.String(), ver) + require.Equal(t, []byte("fgh"), got) + + // Validate size returned is logical size, not descriptor size. + rc, sz, ver2, err := bs.Get(ctx, "chunked", NewBlobRange(0, 1)) + require.NoError(t, err) + require.Equal(t, uint64(len(wantAll)), sz) + require.Equal(t, commitOID.String(), ver2) + _ = rc.Close() + + // Also verify "inline blob that happens to start with magic" is treated as inline + // if it doesn't match the descriptor prefix (magic + size line). 
+ inline := "DOLTBS1\nthis is not a descriptor\n" + inlineCommit, err := repo.SetRefToTree(ctx, DoltDataRef, map[string][]byte{ + "inline": []byte(inline), + }, "seed inline magic") + require.NoError(t, err) + + bs2, err := NewGitBlobstore(repo.GitDir, DoltDataRef) + require.NoError(t, err) + got2, ver3, err := GetBytes(ctx, bs2, "inline", AllRange) + require.NoError(t, err) + require.Equal(t, inlineCommit, ver3) + require.Equal(t, []byte(inline), got2) +} diff --git a/go/store/blobstore/git_blobstore_chunked_put_test.go b/go/store/blobstore/git_blobstore_chunked_put_test.go new file mode 100644 index 0000000000..5c16b9ec77 --- /dev/null +++ b/go/store/blobstore/git_blobstore_chunked_put_test.go @@ -0,0 +1,84 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package blobstore + +import ( + "bytes" + "context" + "io" + "testing" + + "github.com/stretchr/testify/require" + + git "github.com/dolthub/dolt/go/store/blobstore/internal/git" + gitbs "github.com/dolthub/dolt/go/store/blobstore/internal/gitbs" + "github.com/dolthub/dolt/go/store/testutils/gitrepo" +) + +func TestGitBlobstore_Put_ChunkedUnderMaxPartSize(t *testing.T) { + requireGitOnPath(t) + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + bs, err := NewGitBlobstoreWithOptions(repo.GitDir, DoltDataRef, GitBlobstoreOptions{ + Identity: testIdentity(), + MaxPartSize: 3, + }) + require.NoError(t, err) + + want := []byte("abcdefghij") // 10 bytes -> 3,3,3,1 + ver, err := bs.Put(ctx, "big", int64(len(want)), bytes.NewReader(want)) + require.NoError(t, err) + + got, ver2, err := GetBytes(ctx, bs, "big", AllRange) + require.NoError(t, err) + require.Equal(t, ver, ver2) + require.Equal(t, want, got) + + runner, err := git.NewRunner(repo.GitDir) + require.NoError(t, err) + api := git.NewGitAPIImpl(runner) + + commit := git.OID(ver) + keyOID, err := api.ResolvePathBlob(ctx, commit, "big") + require.NoError(t, err) + + rc, err := api.BlobReader(ctx, keyOID) + require.NoError(t, err) + descBytes, err := io.ReadAll(rc) + require.NoError(t, err) + require.NoError(t, rc.Close()) + + desc, err := gitbs.ParseDescriptor(descBytes) + require.NoError(t, err) + require.Equal(t, uint64(len(want)), desc.TotalSize) + require.GreaterOrEqual(t, len(desc.Parts), 2) + + for _, p := range desc.Parts { + require.LessOrEqual(t, p.Size, uint64(3)) + ppath, err := gitbs.PartPath(p.OIDHex) + require.NoError(t, err) + gotOID, err := api.ResolvePathBlob(ctx, commit, ppath) + require.NoError(t, err) + require.Equal(t, git.OID(p.OIDHex), gotOID) + } + + // Range spanning boundary (offset 2, length 4) => "cdef" + got, _, err = GetBytes(ctx, bs, "big", NewBlobRange(2, 4)) + require.NoError(t, err) + require.Equal(t, 
[]byte("cdef"), got) +} diff --git a/go/store/blobstore/git_blobstore_parts.go b/go/store/blobstore/git_blobstore_parts.go new file mode 100644 index 0000000000..dccd36bea0 --- /dev/null +++ b/go/store/blobstore/git_blobstore_parts.go @@ -0,0 +1,40 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package blobstore + +import ( + "context" + + git "github.com/dolthub/dolt/go/store/blobstore/internal/git" + gitbs "github.com/dolthub/dolt/go/store/blobstore/internal/gitbs" +) + +const ( + // gitblobstorePartFileMode is the canonical filemode used for part blobs staged into the tree. + gitblobstorePartFileMode = "100644" +) + +// stagePartReachable stages a tree entry for |partOID| into |indexFile| under the reserved +// parts namespace, ensuring the blob is reachable from the resulting tree/commit. +// +// This operation is idempotent: staging the same part OID at the same computed path twice +// should result in the same index state. 
+func stagePartReachable(ctx context.Context, api git.GitAPI, indexFile string, partOID git.OID) (path string, err error) { + path, err = gitbs.PartPath(partOID.String()) + if err != nil { + return "", err + } + return path, api.UpdateIndexCacheInfo(ctx, indexFile, gitblobstorePartFileMode, partOID, path) +} diff --git a/go/store/blobstore/git_blobstore_parts_test.go b/go/store/blobstore/git_blobstore_parts_test.go new file mode 100644 index 0000000000..2ae66b06a7 --- /dev/null +++ b/go/store/blobstore/git_blobstore_parts_test.go @@ -0,0 +1,67 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package blobstore + +import ( + "context" + "os/exec" + "strings" + "testing" + + "github.com/stretchr/testify/require" + + git "github.com/dolthub/dolt/go/store/blobstore/internal/git" + "github.com/dolthub/dolt/go/store/testutils/gitrepo" +) + +func TestStagePartReachable_Idempotent(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not found on PATH") + } + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + runner, err := git.NewRunner(repo.GitDir) + require.NoError(t, err) + api := git.NewGitAPIImpl(runner) + + partOID, err := api.HashObject(ctx, strings.NewReader("part-bytes")) + require.NoError(t, err) + + _, indexFile, cleanup, err := newTempIndex() + require.NoError(t, err) + defer cleanup() + + require.NoError(t, api.ReadTreeEmpty(ctx, indexFile)) + + path1, err := stagePartReachable(ctx, api, indexFile, partOID) + require.NoError(t, err) + path2, err := stagePartReachable(ctx, api, indexFile, partOID) + require.NoError(t, err) + require.Equal(t, path1, path2) + + treeOID, err := api.WriteTree(ctx, indexFile) + require.NoError(t, err) + + commitOID, err := api.CommitTree(ctx, treeOID, nil, "stage part reachable test", &git.Identity{Name: "t", Email: "t@t"}) + require.NoError(t, err) + + // Verify the staged path resolves to the part blob in the committed tree. + got, err := api.ResolvePathBlob(ctx, commitOID, path1) + require.NoError(t, err) + require.Equal(t, partOID, got) +} diff --git a/go/store/blobstore/internal/gitbs/descriptor.go b/go/store/blobstore/internal/gitbs/descriptor.go new file mode 100644 index 0000000000..ba502fba75 --- /dev/null +++ b/go/store/blobstore/internal/gitbs/descriptor.go @@ -0,0 +1,228 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

const (
	// DescriptorMagic is the first line of a chunked-object descriptor.
	DescriptorMagic = "DOLTBS1"
)

// Descriptor is the parsed form of a chunked-object descriptor: the declared
// total size together with the parts, in the order they were listed.
type Descriptor struct {
	// TotalSize is the value carried by the descriptor's "size" line.
	TotalSize uint64
	// Parts holds one entry per "part" line.
	Parts []PartRef
}

// PartRef identifies a single part by its hex object id and records its size.
type PartRef struct {
	OIDHex string
	Size   uint64
}

// IsDescriptorPrefix reports whether |b| starts the way a descriptor does.
// It is meant to be run on a short prefix so callers can decide whether the
// whole blob is worth reading and parsing.
func IsDescriptorPrefix(b []byte) bool {
	magic := []byte(DescriptorMagic)
	if !bytes.HasPrefix(b, magic) {
		return false
	}
	// Stay conservative: the magic must be followed directly by a line break
	// and the "size " keyword, so inline content that merely begins with the
	// magic string is not mistaken for a descriptor.
	tail := b[len(magic):]
	for _, marker := range [...]string{"\nsize ", "\r\nsize "} {
		if bytes.HasPrefix(tail, []byte(marker)) {
			return true
		}
	}
	return false
}

// ParseDescriptor decodes a descriptor blob and checks its invariants:
// the magic first line, exactly one size line, well-formed part lines, and
// part sizes that add up to the declared total. Whitespace-only lines are
// ignored. On any violation it returns a zero Descriptor and an error.
func ParseDescriptor(b []byte) (Descriptor, error) {
	lines := splitLines(string(b))
	if len(lines) == 0 {
		return Descriptor{}, fmt.Errorf("descriptor: empty")
	}
	if first := lines[0]; first != DescriptorMagic {
		return Descriptor{}, fmt.Errorf("descriptor: invalid magic %q", first)
	}

	var (
		parsed    Descriptor
		sizeSeen  bool
		partTotal uint64
	)

	for _, raw := range lines[1:] {
		tokens := strings.Fields(raw)
		if len(tokens) == 0 {
			// Nothing but whitespace on this line.
			continue
		}
		switch tokens[0] {
		case "size":
			if sizeSeen {
				return Descriptor{}, fmt.Errorf("descriptor: multiple size lines")
			}
			if len(tokens) != 2 {
				return Descriptor{}, fmt.Errorf("descriptor: malformed size line %q", raw)
			}
			total, err := parseUint(tokens[1])
			if err != nil {
				return Descriptor{}, fmt.Errorf("descriptor: invalid size %q: %w", tokens[1], err)
			}
			parsed.TotalSize = total
			sizeSeen = true

		case "part":
			if len(tokens) != 3 {
				return Descriptor{}, fmt.Errorf("descriptor: malformed part line %q", raw)
			}
			id := tokens[1]
			if err := validateOIDHex(id); err != nil {
				return Descriptor{}, fmt.Errorf("descriptor: invalid part oid %q: %w", id, err)
			}
			partSize, err := parseUint(tokens[2])
			if err != nil {
				return Descriptor{}, fmt.Errorf("descriptor: invalid part size %q: %w", tokens[2], err)
			}
			if partSize == 0 {
				return Descriptor{}, fmt.Errorf("descriptor: part size must be > 0")
			}
			// Unsigned addition wraps on overflow, which shows up as a sum
			// smaller than the value we started from.
			next := partTotal + partSize
			if next < partTotal {
				return Descriptor{}, fmt.Errorf("descriptor: part sizes overflow uint64")
			}
			partTotal = next
			parsed.Parts = append(parsed.Parts, PartRef{OIDHex: id, Size: partSize})

		default:
			return Descriptor{}, fmt.Errorf("descriptor: unknown line %q", raw)
		}
	}

	switch {
	case !sizeSeen:
		return Descriptor{}, fmt.Errorf("descriptor: missing size line")
	case parsed.TotalSize == 0 && len(parsed.Parts) != 0:
		return Descriptor{}, fmt.Errorf("descriptor: total size 0 requires zero parts")
	case parsed.TotalSize == 0:
		return parsed, nil
	case len(parsed.Parts) == 0:
		return Descriptor{}, fmt.Errorf("descriptor: non-zero total size requires at least one part")
	case partTotal != parsed.TotalSize:
		return Descriptor{}, fmt.Errorf("descriptor: part sizes sum to %d, expected %d", partTotal, parsed.TotalSize)
	}
	return parsed, nil
}

// EncodeDescriptor renders |d| in the line-oriented descriptor format: the
// magic line, a "size" line, then one "part" line per entry, each terminated
// by '\n'. The descriptor is validated first and rejected if inconsistent.
func EncodeDescriptor(d Descriptor) ([]byte, error) {
	// Reject descriptors that ParseDescriptor would refuse to read back.
	if _, err := validateDescriptorForEncode(d); err != nil {
		return nil, err
	}

	out := make([]byte, 0, 64+len(d.Parts)*64)
	out = append(out, DescriptorMagic...)
	out = append(out, "\nsize "...)
	out = strconv.AppendUint(out, d.TotalSize, 10)
	out = append(out, '\n')
	for i := range d.Parts {
		out = append(out, "part "...)
		out = append(out, d.Parts[i].OIDHex...)
		out = append(out, ' ')
		out = strconv.AppendUint(out, d.Parts[i].Size, 10)
		out = append(out, '\n')
	}
	return out, nil
}

// validateDescriptorForEncode checks the invariants EncodeDescriptor relies
// on and returns |d| unchanged when they hold. A zero total size requires
// zero parts; otherwise every part needs a valid oid and a non-zero size, and
// the sizes must add up to the total.
func validateDescriptorForEncode(d Descriptor) (Descriptor, error) {
	if d.TotalSize == 0 {
		if len(d.Parts) != 0 {
			return Descriptor{}, fmt.Errorf("descriptor: total size 0 requires zero parts")
		}
		return d, nil
	}
	if len(d.Parts) == 0 {
		return Descriptor{}, fmt.Errorf("descriptor: non-zero total size requires at least one part")
	}

	var running uint64
	for i := range d.Parts {
		ref := d.Parts[i]
		if err := validateOIDHex(ref.OIDHex); err != nil {
			return Descriptor{}, fmt.Errorf("descriptor: invalid part oid %q: %w", ref.OIDHex, err)
		}
		if ref.Size == 0 {
			return Descriptor{}, fmt.Errorf("descriptor: part size must be > 0")
		}
		next := running + ref.Size
		if next < running {
			return Descriptor{}, fmt.Errorf("descriptor: part sizes overflow uint64")
		}
		running = next
	}
	if running != d.TotalSize {
		return Descriptor{}, fmt.Errorf("descriptor: part sizes sum to %d, expected %d", running, d.TotalSize)
	}
	return d, nil
}

// splitLines breaks |s| into lines after converting CRLF to LF and dropping
// trailing newlines. It returns nil when nothing is left.
func splitLines(s string) []string {
	normalized := strings.TrimRight(strings.ReplaceAll(s, "\r\n", "\n"), "\n")
	if len(normalized) == 0 {
		return nil
	}
	return strings.Split(normalized, "\n")
}

// parseUint parses |s| as a base-10 unsigned 64-bit integer.
func parseUint(s string) (uint64, error) {
	return strconv.ParseUint(s, 10, 64)
}

// validateOIDHex returns an error unless |oid| is exactly 40 hexadecimal
// characters; both upper- and lower-case digits are accepted.
func validateOIDHex(oid string) error {
	if n := len(oid); n != 40 {
		return fmt.Errorf("expected 40 hex chars, got %d", n)
	}
	for i := 0; i < len(oid); i++ {
		c := oid[i]
		isHex := ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
		if !isHex {
			return fmt.Errorf("non-hex character %q", c)
		}
	}
	return nil
}
+ +package gitbs + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestEncodeParseDescriptor_RoundTrip(t *testing.T) { + d := Descriptor{ + TotalSize: 7, + Parts: []PartRef{ + {OIDHex: "0123456789abcdef0123456789abcdef01234567", Size: 3}, + {OIDHex: "89abcdef0123456789abcdef0123456789abcdef", Size: 4}, + }, + } + + b, err := EncodeDescriptor(d) + require.NoError(t, err) + + got, err := ParseDescriptor(b) + require.NoError(t, err) + require.Equal(t, d, got) +} + +func TestParseDescriptor_InvalidMagic(t *testing.T) { + _, err := ParseDescriptor([]byte("NOPE\nsize 0\n")) + require.Error(t, err) +} + +func TestParseDescriptor_MissingSizeLine(t *testing.T) { + _, err := ParseDescriptor([]byte("DOLTBS1\npart 0123456789abcdef0123456789abcdef01234567 1\n")) + require.Error(t, err) +} + +func TestParseDescriptor_MultipleSizeLines(t *testing.T) { + _, err := ParseDescriptor([]byte("DOLTBS1\nsize 1\nsize 2\n")) + require.Error(t, err) +} + +func TestParseDescriptor_UnknownLine(t *testing.T) { + _, err := ParseDescriptor([]byte("DOLTBS1\nsize 0\nwat 1\n")) + require.Error(t, err) +} + +func TestParseDescriptor_InvalidOID(t *testing.T) { + _, err := ParseDescriptor([]byte("DOLTBS1\nsize 1\npart not-an-oid 1\n")) + require.Error(t, err) +} + +func TestParseDescriptor_PartSizeZeroRejected(t *testing.T) { + _, err := ParseDescriptor([]byte("DOLTBS1\nsize 0\npart 0123456789abcdef0123456789abcdef01234567 0\n")) + require.Error(t, err) +} + +func TestParseDescriptor_SumMismatch(t *testing.T) { + _, err := ParseDescriptor([]byte("DOLTBS1\nsize 2\npart 0123456789abcdef0123456789abcdef01234567 1\n")) + require.Error(t, err) +} + +func TestParseDescriptor_TotalSizeZeroRequiresNoParts(t *testing.T) { + _, err := ParseDescriptor([]byte("DOLTBS1\nsize 0\npart 0123456789abcdef0123456789abcdef01234567 1\n")) + require.Error(t, err) +} + +func TestEncodeDescriptor_Validates(t *testing.T) { + _, err := EncodeDescriptor(Descriptor{TotalSize: 1}) + require.Error(t, err) 
+} + +func TestIsDescriptorPrefix(t *testing.T) { + require.True(t, IsDescriptorPrefix([]byte("DOLTBS1\nsize "))) + require.True(t, IsDescriptorPrefix([]byte("DOLTBS1\r\nsize "))) + require.False(t, IsDescriptorPrefix([]byte("DOLTBS"))) + require.False(t, IsDescriptorPrefix([]byte("xxxxDOLTBS1\n"))) + require.False(t, IsDescriptorPrefix([]byte("DOLTBS1\nthis is not a descriptor\n"))) +} diff --git a/go/store/blobstore/internal/gitbs/parts_path.go b/go/store/blobstore/internal/gitbs/parts_path.go new file mode 100644 index 0000000000..f14cfd8a27 --- /dev/null +++ b/go/store/blobstore/internal/gitbs/parts_path.go @@ -0,0 +1,40 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gitbs + +import ( + "path" + "strings" +) + +const ( + // PartsPrefix is a reserved tree prefix under which part blobs are staged to ensure + // reachability from the GitBlobstore ref snapshot. + PartsPrefix = "__dolt_blobstore_parts__" +) + +// PartPath returns the reserved tree path for a part blob with the given hex OID. +// The returned path uses forward slashes (git tree paths) and a 2-level fanout: +// +// __dolt_blobstore_parts__/aa/bb/ +// +// where aa/bb are the first 4 hex characters of the oid. 
+func PartPath(oidHex string) (string, error) { + if err := validateOIDHex(oidHex); err != nil { + return "", err + } + oidHex = strings.ToLower(oidHex) + return path.Join(PartsPrefix, oidHex[:2], oidHex[2:4], oidHex), nil +} diff --git a/go/store/blobstore/internal/gitbs/parts_path_test.go b/go/store/blobstore/internal/gitbs/parts_path_test.go new file mode 100644 index 0000000000..b809519e21 --- /dev/null +++ b/go/store/blobstore/internal/gitbs/parts_path_test.go @@ -0,0 +1,40 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package gitbs + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestPartPath_Deterministic(t *testing.T) { + oid := "0123456789abcdef0123456789abcdef01234567" + p, err := PartPath(oid) + require.NoError(t, err) + require.Equal(t, "__dolt_blobstore_parts__/01/23/"+oid, p) +} + +func TestPartPath_NormalizesToLower(t *testing.T) { + oidUpper := "0123456789ABCDEF0123456789ABCDEF01234567" + p, err := PartPath(oidUpper) + require.NoError(t, err) + require.Equal(t, "__dolt_blobstore_parts__/01/23/0123456789abcdef0123456789abcdef01234567", p) +} + +func TestPartPath_InvalidOID(t *testing.T) { + _, err := PartPath("nope") + require.Error(t, err) +} diff --git a/go/store/blobstore/internal/gitbs/ranges.go b/go/store/blobstore/internal/gitbs/ranges.go new file mode 100644 index 0000000000..2de6f30332 --- /dev/null +++ b/go/store/blobstore/internal/gitbs/ranges.go @@ -0,0 +1,131 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gitbs + +import "fmt" + +// PartSlice describes a contiguous slice to read from a particular part. +type PartSlice struct { + OIDHex string + // Offset is the byte offset into the part at which to begin reading. + Offset int64 + // Length is the number of bytes to read from the part slice. + Length int64 +} + +// NormalizeRange converts (offset,length) with possible negative offsets into a +// concrete half-open interval [start,end) over an object of total size |total|. 
+// +// Semantics match blobstore.BlobRange: +// - offset < 0 means relative to end (start = total + offset) +// - length == 0 means "to end" +// - length < 0 is invalid +func NormalizeRange(total int64, offset int64, length int64) (start, end int64, err error) { + if total < 0 { + return 0, 0, fmt.Errorf("invalid total size %d", total) + } + if length < 0 { + return 0, 0, fmt.Errorf("invalid length %d", length) + } + start = offset + if start < 0 { + start = total + start + } + if start < 0 || start > total { + return 0, 0, fmt.Errorf("invalid offset %d for total size %d", offset, total) + } + if length == 0 { + end = total + } else { + end = start + length + if end < start { + return 0, 0, fmt.Errorf("range overflow") + } + if end > total { + end = total + } + } + return start, end, nil +} + +// SliceParts maps a logical range [start,end) over the concatenation of |parts| +// into per-part slices. +// +// - start/end are byte offsets in the logical object (0 <= start <= end <= total) +// - parts must have Size > 0 +func SliceParts(parts []PartRef, start, end int64) ([]PartSlice, error) { + if start < 0 || end < 0 || end < start { + return nil, fmt.Errorf("invalid start/end: %d/%d", start, end) + } + if start == end { + return nil, nil + } + + var ( + out []PartSlice + pos int64 // start offset of current part in logical stream + ) + + for _, p := range parts { + if p.Size == 0 { + return nil, fmt.Errorf("invalid part size 0") + } + partStart := pos + partEnd := pos + int64(p.Size) + if partEnd < partStart { + return nil, fmt.Errorf("part size overflow") + } + + // Does this part overlap [start,end)? + if end <= partStart { + break + } + if start >= partEnd { + pos = partEnd + continue + } + + // Compute overlap. 
+ s := start + if s < partStart { + s = partStart + } + e := end + if e > partEnd { + e = partEnd + } + if e > s { + out = append(out, PartSlice{ + OIDHex: p.OIDHex, + Offset: s - partStart, + Length: e - s, + }) + } + pos = partEnd + } + + // Validate that the requested interval was fully covered by parts. + if len(out) == 0 { + return nil, fmt.Errorf("range [%d,%d) not covered by parts", start, end) + } + var covered int64 + for _, s := range out { + covered += s.Length + } + if covered != (end - start) { + return nil, fmt.Errorf("range [%d,%d) not fully covered by parts", start, end) + } + return out, nil +} diff --git a/go/store/blobstore/internal/gitbs/ranges_test.go b/go/store/blobstore/internal/gitbs/ranges_test.go new file mode 100644 index 0000000000..1b1b4ea768 --- /dev/null +++ b/go/store/blobstore/internal/gitbs/ranges_test.go @@ -0,0 +1,82 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package gitbs + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestNormalizeRange(t *testing.T) { + start, end, err := NormalizeRange(10, 0, 0) + require.NoError(t, err) + require.Equal(t, int64(0), start) + require.Equal(t, int64(10), end) + + start, end, err = NormalizeRange(10, 2, 3) + require.NoError(t, err) + require.Equal(t, int64(2), start) + require.Equal(t, int64(5), end) + + start, end, err = NormalizeRange(10, -3, 0) + require.NoError(t, err) + require.Equal(t, int64(7), start) + require.Equal(t, int64(10), end) + + start, end, err = NormalizeRange(10, -3, 2) + require.NoError(t, err) + require.Equal(t, int64(7), start) + require.Equal(t, int64(9), end) + + _, _, err = NormalizeRange(10, 11, 0) + require.Error(t, err) + + _, _, err = NormalizeRange(10, -11, 0) + require.Error(t, err) + + _, _, err = NormalizeRange(10, 0, -1) + require.Error(t, err) +} + +func TestSliceParts(t *testing.T) { + parts := []PartRef{ + {OIDHex: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", Size: 3}, + {OIDHex: "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", Size: 4}, + {OIDHex: "cccccccccccccccccccccccccccccccccccccccc", Size: 2}, + } + + slices, err := SliceParts(parts, 0, 9) + require.NoError(t, err) + require.Len(t, slices, 3) + require.Equal(t, int64(3), slices[0].Length) + require.Equal(t, int64(4), slices[1].Length) + require.Equal(t, int64(2), slices[2].Length) + + // Middle slice spanning two parts: [2,5) covers a[2:] + b[:2] + slices, err = SliceParts(parts, 2, 5) + require.NoError(t, err) + require.Equal(t, []PartSlice{ + {OIDHex: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", Offset: 2, Length: 1}, + {OIDHex: "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", Offset: 0, Length: 2}, + }, slices) + + // Single-part slice: [3,7) maps to b[0:4] + slices, err = SliceParts(parts, 3, 7) + require.NoError(t, err) + require.Equal(t, []PartSlice{ + {OIDHex: "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", Offset: 0, Length: 4}, + }, slices) +} From 
4a09c8ec5366a50b136975286fbf6a7dba556967 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Thu, 5 Feb 2026 12:04:07 -0800 Subject: [PATCH 06/28] /go/store/blobstore: disable Concatenate on chunked branch Create db/gitblobstore-next-2a by leaving GitBlobstore.Concatenate unimplemented and removing concatenate-focused tests, while keeping chunked Get/Put/CheckAndPut work intact. Co-authored-by: Cursor --- go/store/blobstore/git_blobstore.go | 341 +----------------- .../git_blobstore_chunked_concatenate_test.go | 88 ----- go/store/blobstore/git_blobstore_test.go | 157 -------- 3 files changed, 8 insertions(+), 578 deletions(-) delete mode 100644 go/store/blobstore/git_blobstore_chunked_concatenate_test.go diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index 3f18d29920..40e75ea4f3 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -27,7 +27,6 @@ import ( "time" "github.com/cenkalti/backoff/v4" - "golang.org/x/sync/errgroup" git "github.com/dolthub/dolt/go/store/blobstore/internal/git" gitbs "github.com/dolthub/dolt/go/store/blobstore/internal/gitbs" @@ -701,344 +700,20 @@ func (gbs *GitBlobstore) CheckAndPut(ctx context.Context, expectedVersion, key s } func (gbs *GitBlobstore) Concatenate(ctx context.Context, key string, sources []string) (string, error) { - key, err := normalizeGitTreePath(key) + // Chunked-object support is landing in phases. Concatenate is the final piece + // needed for NBS conjoin and is intentionally left unimplemented on this branch. + // + // Keep key validation for consistent error behavior. 
+ _, err := normalizeGitTreePath(key) if err != nil { return "", err } - - normSources := make([]string, len(sources)) - for i, src := range sources { - src, err := normalizeGitTreePath(src) - if err != nil { - return "", err - } - normSources[i] = src - } - - // Snapshot the current head for reading sources so we don't depend on the ref staying - // stable while we stream the concatenated contents into a new blob object. - snapshot, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) - if err != nil { - return "", err - } - if !ok && len(normSources) > 0 { - // If the ref doesn't exist, the store is missing/corrupt (there is no commit to - // resolve source paths against). - return "", &git.RefNotFoundError{Ref: gbs.ref} - } - - msg := fmt.Sprintf("gitblobstore: concatenate %s (%d sources)", key, len(normSources)) - - var writes []treeWrite - if gbs.maxPartSize == 0 { - blobOID, err := gbs.hashConcatenation(ctx, snapshot, ok, normSources) - if err != nil { - return "", err - } - writes = []treeWrite{{path: key, oid: blobOID}} - } else { - writes, err = gbs.planConcatenateWritesChunked(ctx, snapshot, ok, key, normSources) - if err != nil { - return "", err - } - } - - const maxRetries = 31 // 32 total attempts (initial + retries) - bo := backoff.NewExponentialBackOff() - bo.InitialInterval = 5 * time.Millisecond - bo.Multiplier = 2 - bo.MaxInterval = 320 * time.Millisecond - bo.RandomizationFactor = 0 // deterministic; can add jitter later if needed - bo.Reset() - policy := backoff.WithContext(backoff.WithMaxRetries(bo, maxRetries), ctx) - - var ver string - op := func() error { - parent, hasParent, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) - if err != nil { - return backoff.Permanent(err) - } - - newCommit, err := gbs.buildCommitWithWrites(ctx, parent, hasParent, writes, msg) - if err != nil { - return backoff.Permanent(err) - } - - if !hasParent { - // Create-only CAS: oldOID=all-zero requires the ref to not exist. 
This avoids - // losing concurrent writes when multiple goroutines create the ref at once. - const zeroOID = git.OID("0000000000000000000000000000000000000000") - if err := gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, zeroOID, msg); err != nil { - if gbs.refAdvanced(ctx, parent) { - return err - } - return backoff.Permanent(err) - } - ver = newCommit.String() - return nil - } - - err = gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, parent, msg) - if err == nil { - ver = newCommit.String() - return nil - } - if gbs.refAdvanced(ctx, parent) { - return err - } - return backoff.Permanent(err) - } - - if err := backoff.Retry(op, policy); err != nil { - if ctx.Err() != nil { - return "", ctx.Err() - } - return "", err - } - - return ver, nil -} - -func (gbs *GitBlobstore) hashConcatenation(ctx context.Context, commit git.OID, hasCommit bool, sources []string) (git.OID, error) { - if len(sources) == 0 { - return gbs.api.HashObject(ctx, bytes.NewReader(nil)) - } - if !hasCommit { - return "", &git.RefNotFoundError{Ref: gbs.ref} - } - - pr, pw := io.Pipe() - eg, ectx := errgroup.WithContext(ctx) - eg.Go(func() error { - defer func() { - _ = pw.Close() - }() - - for _, src := range sources { - blobOID, err := gbs.api.ResolvePathBlob(ectx, commit, src) - if err != nil { - if git.IsPathNotFound(err) { - _ = pw.CloseWithError(NotFound{Key: src}) - return NotFound{Key: src} - } - _ = pw.CloseWithError(err) - return err - } - - rc, err := gbs.api.BlobReader(ectx, blobOID) - if err != nil { - _ = pw.CloseWithError(err) - return err - } - - _, err = io.Copy(pw, rc) - cerr := rc.Close() - if err == nil { - err = cerr - } - if err != nil { - _ = pw.CloseWithError(err) - return err - } - } - return nil - }) - - oid, err := gbs.api.HashObject(ectx, pr) - if err != nil { - _ = pr.CloseWithError(err) - if werr := eg.Wait(); werr != nil { - return "", werr - } - if ctx.Err() != nil { - return "", ctx.Err() - } - return "", err - } - - _ = pr.Close() - if err := eg.Wait(); err != nil { - 
return "", err - } - return oid, nil -} - -type resolvedConcatSource struct { - inlineOID git.OID - inlineSize int64 - desc *gitbs.Descriptor -} - -func (gbs *GitBlobstore) resolveConcatSource(ctx context.Context, commit git.OID, path string) (resolvedConcatSource, error) { - blobOID, err := gbs.api.ResolvePathBlob(ctx, commit, path) - if err != nil { - if git.IsPathNotFound(err) { - return resolvedConcatSource{}, NotFound{Key: path} - } - return resolvedConcatSource{}, err - } - sz, err := gbs.api.BlobSize(ctx, blobOID) - if err != nil { - return resolvedConcatSource{}, err - } - - // Peek enough bytes to detect descriptor prefix conservatively. - rc, err := gbs.api.BlobReader(ctx, blobOID) - if err != nil { - return resolvedConcatSource{}, err - } - defer rc.Close() - - peek := make([]byte, 0, 64) - buf := make([]byte, 64) - for len(peek) < cap(peek) { - n, rerr := rc.Read(buf[:min(cap(peek)-len(peek), len(buf))]) - if n > 0 { - peek = append(peek, buf[:n]...) - } - if rerr != nil { - if errors.Is(rerr, io.EOF) { - break - } - return resolvedConcatSource{}, rerr - } - } - - if !gitbs.IsDescriptorPrefix(peek) { - return resolvedConcatSource{inlineOID: blobOID, inlineSize: sz}, nil - } - - // Descriptor: re-read whole descriptor blob (bounded). - // TODO(gitblobstore): configurable MaxDescriptorSize. 
- const maxDesc = int64(64 * 1024) - if sz > maxDesc { - return resolvedConcatSource{}, fmt.Errorf("gitblobstore: descriptor too large (%d bytes, cap %d)", sz, maxDesc) - } - _ = rc.Close() - - rc2, err := gbs.api.BlobReader(ctx, blobOID) - if err != nil { - return resolvedConcatSource{}, err - } - defer rc2.Close() - - descBytes, err := io.ReadAll(rc2) - if err != nil { - return resolvedConcatSource{}, err - } - desc, err := gitbs.ParseDescriptor(descBytes) - if err != nil { - return resolvedConcatSource{}, err - } - return resolvedConcatSource{desc: &desc}, nil -} - -func (gbs *GitBlobstore) planConcatenateWritesChunked(ctx context.Context, snapshot git.OID, hasSnapshot bool, key string, sources []string) ([]treeWrite, error) { - if len(sources) == 0 { - // Empty concatenation => empty object. Store inline. - oid, err := gbs.api.HashObject(ctx, bytes.NewReader(nil)) - if err != nil { - return nil, err - } - return []treeWrite{{path: key, oid: oid}}, nil - } - if !hasSnapshot { - return nil, &git.RefNotFoundError{Ref: gbs.ref} - } - - var ( - allParts []gitbs.PartRef - allPartOID = make(map[git.OID]struct{}) - total uint64 - ) - for _, src := range sources { - rs, err := gbs.resolveConcatSource(ctx, snapshot, src) - if err != nil { - return nil, err - } - - var parts []gitbs.PartRef - var oids []git.OID - - if rs.desc != nil { - parts = rs.desc.Parts - for _, p := range parts { - oid := git.OID(p.OIDHex) - if p.Size > gbs.maxPartSize { - // Re-chunk oversized part. - rc, err := gbs.api.BlobReader(ctx, oid) - if err != nil { - return nil, err - } - newParts, newOIDs, _, err := gbs.hashParts(ctx, rc) - _ = rc.Close() - if err != nil { - return nil, err - } - allParts = append(allParts, newParts...) - for _, no := range newOIDs { - allPartOID[no] = struct{}{} - } - for _, np := range newParts { - total += np.Size - } - continue - } - oids = append(oids, oid) - } - } else { - // Inline. 
- if rs.inlineSize < 0 { - return nil, fmt.Errorf("gitblobstore: invalid inline size %d", rs.inlineSize) - } - if uint64(rs.inlineSize) > gbs.maxPartSize { - // Re-chunk oversized inline blob. - rc, err := gbs.api.BlobReader(ctx, rs.inlineOID) - if err != nil { - return nil, err - } - newParts, newOIDs, _, err := gbs.hashParts(ctx, rc) - _ = rc.Close() - if err != nil { - return nil, err - } - parts = newParts - oids = newOIDs - } else { - parts = []gitbs.PartRef{{OIDHex: rs.inlineOID.String(), Size: uint64(rs.inlineSize)}} - oids = []git.OID{rs.inlineOID} - } - } - - allParts = append(allParts, parts...) - for _, o := range oids { - allPartOID[o] = struct{}{} - } - for _, p := range parts { - total += p.Size + if _, err := normalizeGitTreePath(src); err != nil { + return "", err } } - - descBytes, err := gitbs.EncodeDescriptor(gitbs.Descriptor{TotalSize: total, Parts: allParts}) - if err != nil { - return nil, err - } - descOID, err := gbs.api.HashObject(ctx, bytes.NewReader(descBytes)) - if err != nil { - return nil, err - } - - writes := make([]treeWrite, 0, 1+len(allPartOID)) - writes = append(writes, treeWrite{path: key, oid: descOID}) - for oid := range allPartOID { - ppath, err := gitbs.PartPath(oid.String()) - if err != nil { - return nil, err - } - writes = append(writes, treeWrite{path: ppath, oid: oid}) - } - return writes, nil + return "", git.ErrUnimplemented } // normalizeGitTreePath normalizes and validates a blobstore key for use as a git tree path. diff --git a/go/store/blobstore/git_blobstore_chunked_concatenate_test.go b/go/store/blobstore/git_blobstore_chunked_concatenate_test.go deleted file mode 100644 index ed989cbb4b..0000000000 --- a/go/store/blobstore/git_blobstore_chunked_concatenate_test.go +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright 2026 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package blobstore - -import ( - "bytes" - "context" - "io" - "testing" - - "github.com/stretchr/testify/require" - - git "github.com/dolthub/dolt/go/store/blobstore/internal/git" - gitbs "github.com/dolthub/dolt/go/store/blobstore/internal/gitbs" - "github.com/dolthub/dolt/go/store/testutils/gitrepo" -) - -func TestGitBlobstore_Concatenate_ChunkedStructuralAndRechunksOversizedInline(t *testing.T) { - requireGitOnPath(t) - - ctx := context.Background() - repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") - require.NoError(t, err) - - // Seed an oversized inline blob (chunking disabled) so we exercise re-chunking during Concatenate. - seed, err := NewGitBlobstoreWithOptions(repo.GitDir, DoltDataRef, GitBlobstoreOptions{ - Identity: testIdentity(), - MaxPartSize: 0, - }) - require.NoError(t, err) - - inline := []byte("abcdefghij") // 10 bytes - _, err = seed.Put(ctx, "a", int64(len(inline)), bytes.NewReader(inline)) - require.NoError(t, err) - - // Now concatenate in chunked mode with a small max part size. - bs, err := NewGitBlobstoreWithOptions(repo.GitDir, DoltDataRef, GitBlobstoreOptions{ - Identity: testIdentity(), - MaxPartSize: 3, - }) - require.NoError(t, err) - - ver, err := bs.Concatenate(ctx, "out", []string{"a"}) - require.NoError(t, err) - require.NotEmpty(t, ver) - - got, ver2, err := GetBytes(ctx, bs, "out", AllRange) - require.NoError(t, err) - require.Equal(t, ver, ver2) - require.Equal(t, inline, got) - - // Verify "out" is a descriptor and all parts are <= 3 and reachable under parts namespace. 
- runner, err := git.NewRunner(repo.GitDir) - require.NoError(t, err) - api := git.NewGitAPIImpl(runner) - commit := git.OID(ver) - - outOID, err := api.ResolvePathBlob(ctx, commit, "out") - require.NoError(t, err) - rc, err := api.BlobReader(ctx, outOID) - require.NoError(t, err) - descBytes, err := io.ReadAll(rc) - require.NoError(t, err) - require.NoError(t, rc.Close()) - - desc, err := gitbs.ParseDescriptor(descBytes) - require.NoError(t, err) - require.Equal(t, uint64(len(inline)), desc.TotalSize) - for _, p := range desc.Parts { - require.LessOrEqual(t, p.Size, uint64(3)) - ppath, err := gitbs.PartPath(p.OIDHex) - require.NoError(t, err) - _, err = api.ResolvePathBlob(ctx, commit, ppath) - require.NoError(t, err) - } -} diff --git a/go/store/blobstore/git_blobstore_test.go b/go/store/blobstore/git_blobstore_test.go index 870238dbb1..a543808572 100644 --- a/go/store/blobstore/git_blobstore_test.go +++ b/go/store/blobstore/git_blobstore_test.go @@ -215,12 +215,6 @@ func TestGitBlobstore_InvalidKeysError(t *testing.T) { _, err = bs.Put(ctx, k, 1, bytes.NewReader([]byte("x"))) require.Error(t, err, "expected error for key %q", k) - - _, err = bs.Concatenate(ctx, k, []string{"ok"}) - require.Error(t, err, "expected error for key %q", k) - - _, err = bs.Concatenate(ctx, "ok2", []string{k}) - require.Error(t, err, "expected error for source key %q", k) } } @@ -274,157 +268,6 @@ func TestGitBlobstore_Put_Overwrite(t *testing.T) { require.Equal(t, []byte("v2\n"), got) } -func TestGitBlobstore_Concatenate_RoundTripAndRanges(t *testing.T) { - requireGitOnPath(t) - - ctx := context.Background() - repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") - require.NoError(t, err) - - bs, err := NewGitBlobstoreWithIdentity(repo.GitDir, DoltDataRef, testIdentity()) - require.NoError(t, err) - - a := []byte("aaaaa") - b := []byte("bbb") - c := []byte("cccccccc") - _, err = PutBytes(ctx, bs, "a", a) - require.NoError(t, err) - _, err = PutBytes(ctx, bs, "b", b) - 
require.NoError(t, err) - _, err = PutBytes(ctx, bs, "c", c) - require.NoError(t, err) - - ver, err := bs.Concatenate(ctx, "composite", []string{"a", "b", "c"}) - require.NoError(t, err) - require.NotEmpty(t, ver) - - // Full object. - got, ver2, err := GetBytes(ctx, bs, "composite", AllRange) - require.NoError(t, err) - require.Equal(t, ver, ver2) - require.Equal(t, append(append(append([]byte(nil), a...), b...), c...), got) - - // Range verification across boundaries. - var off int64 - rc, sz, ver3, err := bs.Get(ctx, "composite", BlobRange{offset: off, length: int64(len(a))}) - require.NoError(t, err) - require.Equal(t, ver, ver3) - require.Equal(t, uint64(len(a)+len(b)+len(c)), sz) - buf, err := io.ReadAll(rc) - _ = rc.Close() - require.NoError(t, err) - require.Equal(t, a, buf) - off += int64(len(a)) - - rc, sz, ver3, err = bs.Get(ctx, "composite", BlobRange{offset: off, length: int64(len(b))}) - require.NoError(t, err) - require.Equal(t, ver, ver3) - require.Equal(t, uint64(len(a)+len(b)+len(c)), sz) - buf, err = io.ReadAll(rc) - _ = rc.Close() - require.NoError(t, err) - require.Equal(t, b, buf) - off += int64(len(b)) - - rc, sz, ver3, err = bs.Get(ctx, "composite", BlobRange{offset: off, length: int64(len(c))}) - require.NoError(t, err) - require.Equal(t, ver, ver3) - require.Equal(t, uint64(len(a)+len(b)+len(c)), sz) - buf, err = io.ReadAll(rc) - _ = rc.Close() - require.NoError(t, err) - require.Equal(t, c, buf) -} - -func TestGitBlobstore_Concatenate_EmptySourcesCreatesEmptyBlob(t *testing.T) { - requireGitOnPath(t) - - ctx := context.Background() - repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") - require.NoError(t, err) - - bs, err := NewGitBlobstoreWithIdentity(repo.GitDir, DoltDataRef, testIdentity()) - require.NoError(t, err) - - ver, err := bs.Concatenate(ctx, "empty", nil) - require.NoError(t, err) - require.NotEmpty(t, ver) - - rc, sz, ver2, err := bs.Get(ctx, "empty", AllRange) - require.NoError(t, err) - require.Equal(t, ver, ver2) 
- require.Equal(t, uint64(0), sz) - data, err := io.ReadAll(rc) - _ = rc.Close() - require.NoError(t, err) - require.Empty(t, data) -} - -func TestGitBlobstore_Concatenate_MissingSourceIsNotFound(t *testing.T) { - requireGitOnPath(t) - - ctx := context.Background() - repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") - require.NoError(t, err) - - bs, err := NewGitBlobstoreWithIdentity(repo.GitDir, DoltDataRef, testIdentity()) - require.NoError(t, err) - - _, err = PutBytes(ctx, bs, "present", []byte("x")) - require.NoError(t, err) - - _, err = bs.Concatenate(ctx, "composite", []string{"present", "missing"}) - require.Error(t, err) - require.True(t, IsNotFoundError(err)) - - ok, err := bs.Exists(ctx, "composite") - require.NoError(t, err) - require.False(t, ok) -} - -func TestGitBlobstore_Concatenate_ContentionRetryPreservesOtherKey(t *testing.T) { - requireGitOnPath(t) - - ctx := context.Background() - repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") - require.NoError(t, err) - - // Seed the ref so Concatenate takes the CAS path. - _, err = repo.SetRefToTree(ctx, DoltDataRef, map[string][]byte{ - "a": []byte("A"), - "b": []byte("B"), - }, "seed") - require.NoError(t, err) - - bs, err := NewGitBlobstoreWithIdentity(repo.GitDir, DoltDataRef, testIdentity()) - require.NoError(t, err) - - origAPI := bs.api - h := &hookGitAPI{GitAPI: origAPI, ref: DoltDataRef} - h.onFirstCAS = func(ctx context.Context, old git.OID) { - // Advance the ref to simulate another writer committing concurrently. 
- _, _ = writeKeyToRef(ctx, origAPI, DoltDataRef, "external", []byte("external\n"), testIdentity()) - } - bs.api = h - - ver, err := bs.Concatenate(ctx, "composite", []string{"a", "b"}) - require.NoError(t, err) - require.NotEmpty(t, ver) - - got, ver2, err := GetBytes(ctx, bs, "composite", AllRange) - require.NoError(t, err) - require.Equal(t, ver, ver2) - require.Equal(t, []byte("AB"), got) - - got, _, err = GetBytes(ctx, bs, "external", AllRange) - require.NoError(t, err) - require.Equal(t, []byte("external\n"), got) - - got, _, err = GetBytes(ctx, bs, "a", AllRange) - require.NoError(t, err) - require.Equal(t, []byte("A"), got) -} - type hookGitAPI struct { git.GitAPI From 71fee78f5b17d62541b9e7fc93ba0f3228527aa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Thu, 5 Feb 2026 12:39:10 -0800 Subject: [PATCH 07/28] /go/store/blobstore/internal/gitbs/descriptor.go: refactor --- .../blobstore/internal/gitbs/descriptor.go | 191 ++++++++++-------- 1 file changed, 109 insertions(+), 82 deletions(-) diff --git a/go/store/blobstore/internal/gitbs/descriptor.go b/go/store/blobstore/internal/gitbs/descriptor.go index ba502fba75..3da88b8827 100644 --- a/go/store/blobstore/internal/gitbs/descriptor.go +++ b/go/store/blobstore/internal/gitbs/descriptor.go @@ -41,6 +41,12 @@ type PartRef struct { Size uint64 } +type descriptorParseState struct { + d Descriptor + haveSz bool + sumPart uint64 +} + // IsDescriptorPrefix returns true if |b| looks like the beginning of a descriptor. // Callers can use this on a small prefix before deciding whether to read and parse // the full blob. 
@@ -73,74 +79,16 @@ func ParseDescriptor(b []byte) (Descriptor, error) { return Descriptor{}, fmt.Errorf("descriptor: invalid magic %q", lines[0]) } - var ( - d Descriptor - haveSz bool - sumPart uint64 - ) - + var st descriptorParseState for _, line := range lines[1:] { if strings.TrimSpace(line) == "" { continue } - fields := strings.Fields(line) - switch { - case len(fields) >= 1 && fields[0] == "size": - if haveSz { - return Descriptor{}, fmt.Errorf("descriptor: multiple size lines") - } - if len(fields) != 2 { - return Descriptor{}, fmt.Errorf("descriptor: malformed size line %q", line) - } - n, err := parseUint(fields[1]) - if err != nil { - return Descriptor{}, fmt.Errorf("descriptor: invalid size %q: %w", fields[1], err) - } - d.TotalSize = n - haveSz = true - - case len(fields) >= 1 && fields[0] == "part": - if len(fields) != 3 { - return Descriptor{}, fmt.Errorf("descriptor: malformed part line %q", line) - } - oid := fields[1] - if err := validateOIDHex(oid); err != nil { - return Descriptor{}, fmt.Errorf("descriptor: invalid part oid %q: %w", oid, err) - } - sz, err := parseUint(fields[2]) - if err != nil { - return Descriptor{}, fmt.Errorf("descriptor: invalid part size %q: %w", fields[2], err) - } - if sz == 0 { - return Descriptor{}, fmt.Errorf("descriptor: part size must be > 0") - } - if sumPart > ^uint64(0)-sz { - return Descriptor{}, fmt.Errorf("descriptor: part sizes overflow uint64") - } - sumPart += sz - d.Parts = append(d.Parts, PartRef{OIDHex: oid, Size: sz}) - - default: - return Descriptor{}, fmt.Errorf("descriptor: unknown line %q", line) + if err := parseDescriptorLine(&st, line); err != nil { + return Descriptor{}, err } } - - if !haveSz { - return Descriptor{}, fmt.Errorf("descriptor: missing size line") - } - if d.TotalSize == 0 { - if len(d.Parts) != 0 { - return Descriptor{}, fmt.Errorf("descriptor: total size 0 requires zero parts") - } - return d, nil - } - if len(d.Parts) == 0 { - return Descriptor{}, fmt.Errorf("descriptor: 
non-zero total size requires at least one part") - } - if sumPart != d.TotalSize { - return Descriptor{}, fmt.Errorf("descriptor: part sizes sum to %d, expected %d", sumPart, d.TotalSize) - } - return d, nil + return finalizeParsedDescriptor(st) } // EncodeDescriptor encodes a descriptor in the stable line-oriented format. @@ -158,42 +106,121 @@ func EncodeDescriptor(d Descriptor) ([]byte, error) { buf.WriteString(strconv.FormatUint(d.TotalSize, 10)) buf.WriteByte('\n') for _, p := range d.Parts { - buf.WriteString("part ") - buf.WriteString(p.OIDHex) - buf.WriteByte(' ') - buf.WriteString(strconv.FormatUint(p.Size, 10)) - buf.WriteByte('\n') + writePartLine(&buf, p) } return []byte(buf.String()), nil } func validateDescriptorForEncode(d Descriptor) (Descriptor, error) { - var sum uint64 - if d.TotalSize == 0 { - if len(d.Parts) != 0 { - return Descriptor{}, fmt.Errorf("descriptor: total size 0 requires zero parts") + sum, err := validateDescriptorParts(d.Parts) + if err != nil { + return Descriptor{}, err + } + if err := validateDescriptorSizeAndParts(d.TotalSize, len(d.Parts), sum); err != nil { + return Descriptor{}, err + } + return d, nil +} + +func parseDescriptorLine(st *descriptorParseState, line string) error { + fields := strings.Fields(line) + switch { + case len(fields) >= 1 && fields[0] == "size": + return parseSizeLine(st, line, fields) + case len(fields) >= 1 && fields[0] == "part": + return parsePartLine(st, line, fields) + default: + return fmt.Errorf("descriptor: unknown line %q", line) + } +} + +func parseSizeLine(st *descriptorParseState, line string, fields []string) error { + if st.haveSz { + return fmt.Errorf("descriptor: multiple size lines") + } + if len(fields) != 2 { + return fmt.Errorf("descriptor: malformed size line %q", line) + } + n, err := parseUint(fields[1]) + if err != nil { + return fmt.Errorf("descriptor: invalid size %q: %w", fields[1], err) + } + st.d.TotalSize = n + st.haveSz = true + return nil +} + +func parsePartLine(st 
*descriptorParseState, line string, fields []string) error { + if len(fields) != 3 { + return fmt.Errorf("descriptor: malformed part line %q", line) + } + oid := fields[1] + if err := validateOIDHex(oid); err != nil { + return fmt.Errorf("descriptor: invalid part oid %q: %w", oid, err) + } + sz, err := parseUint(fields[2]) + if err != nil { + return fmt.Errorf("descriptor: invalid part size %q: %w", fields[2], err) + } + if sz == 0 { + return fmt.Errorf("descriptor: part size must be > 0") + } + if st.sumPart > ^uint64(0)-sz { + return fmt.Errorf("descriptor: part sizes overflow uint64") + } + st.sumPart += sz + st.d.Parts = append(st.d.Parts, PartRef{OIDHex: oid, Size: sz}) + return nil +} + +func finalizeParsedDescriptor(st descriptorParseState) (Descriptor, error) { + if !st.haveSz { + return Descriptor{}, fmt.Errorf("descriptor: missing size line") + } + if err := validateDescriptorSizeAndParts(st.d.TotalSize, len(st.d.Parts), st.sumPart); err != nil { + return Descriptor{}, err + } + return st.d, nil +} + +func validateDescriptorSizeAndParts(totalSize uint64, partCount int, sumParts uint64) error { + if totalSize == 0 { + if partCount != 0 { + return fmt.Errorf("descriptor: total size 0 requires zero parts") } - return d, nil + return nil } - if len(d.Parts) == 0 { - return Descriptor{}, fmt.Errorf("descriptor: non-zero total size requires at least one part") + if partCount == 0 { + return fmt.Errorf("descriptor: non-zero total size requires at least one part") } - for _, p := range d.Parts { + if sumParts != totalSize { + return fmt.Errorf("descriptor: part sizes sum to %d, expected %d", sumParts, totalSize) + } + return nil +} + +func validateDescriptorParts(parts []PartRef) (sum uint64, err error) { + for _, p := range parts { if err := validateOIDHex(p.OIDHex); err != nil { - return Descriptor{}, fmt.Errorf("descriptor: invalid part oid %q: %w", p.OIDHex, err) + return 0, fmt.Errorf("descriptor: invalid part oid %q: %w", p.OIDHex, err) } if p.Size == 0 { - 
return Descriptor{}, fmt.Errorf("descriptor: part size must be > 0") + return 0, fmt.Errorf("descriptor: part size must be > 0") } if sum > ^uint64(0)-p.Size { - return Descriptor{}, fmt.Errorf("descriptor: part sizes overflow uint64") + return 0, fmt.Errorf("descriptor: part sizes overflow uint64") } sum += p.Size } - if sum != d.TotalSize { - return Descriptor{}, fmt.Errorf("descriptor: part sizes sum to %d, expected %d", sum, d.TotalSize) - } - return d, nil + return sum, nil +} + +func writePartLine(buf *strings.Builder, p PartRef) { + buf.WriteString("part ") + buf.WriteString(p.OIDHex) + buf.WriteByte(' ') + buf.WriteString(strconv.FormatUint(p.Size, 10)) + buf.WriteByte('\n') } func splitLines(s string) []string { From 18ba8292ea6bae4520825cff67d29319717ad994 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Thu, 5 Feb 2026 14:16:05 -0800 Subject: [PATCH 08/28] /go/store/blobstore/internal/gitbs/ranges.go: refactor --- go/store/blobstore/internal/gitbs/ranges.go | 159 ++++++++++++++------ 1 file changed, 109 insertions(+), 50 deletions(-) diff --git a/go/store/blobstore/internal/gitbs/ranges.go b/go/store/blobstore/internal/gitbs/ranges.go index 2de6f30332..4e7c8ba6e6 100644 --- a/go/store/blobstore/internal/gitbs/ranges.go +++ b/go/store/blobstore/internal/gitbs/ranges.go @@ -33,29 +33,16 @@ type PartSlice struct { // - length == 0 means "to end" // - length < 0 is invalid func NormalizeRange(total int64, offset int64, length int64) (start, end int64, err error) { - if total < 0 { - return 0, 0, fmt.Errorf("invalid total size %d", total) + if err := validateNormalizeRangeInputs(total, length); err != nil { + return 0, 0, err } - if length < 0 { - return 0, 0, fmt.Errorf("invalid length %d", length) + start, err = normalizeStart(total, offset) + if err != nil { + return 0, 0, err } - start = offset - if start < 0 { - start = total + start - } - if start < 0 || start > total { - return 0, 0, fmt.Errorf("invalid offset %d 
for total size %d", offset, total) - } - if length == 0 { - end = total - } else { - end = start + length - if end < start { - return 0, 0, fmt.Errorf("range overflow") - } - if end > total { - end = total - } + end, err = normalizeEnd(total, start, length) + if err != nil { + return 0, 0, err } return start, end, nil } @@ -66,10 +53,10 @@ func NormalizeRange(total int64, offset int64, length int64) (start, end int64, // - start/end are byte offsets in the logical object (0 <= start <= end <= total) // - parts must have Size > 0 func SliceParts(parts []PartRef, start, end int64) ([]PartSlice, error) { - if start < 0 || end < 0 || end < start { - return nil, fmt.Errorf("invalid start/end: %d/%d", start, end) + if err := validateStartEnd(start, end); err != nil { + return nil, err } - if start == end { + if isEmptyRange(start, end) { return nil, nil } @@ -79,13 +66,9 @@ func SliceParts(parts []PartRef, start, end int64) ([]PartSlice, error) { ) for _, p := range parts { - if p.Size == 0 { - return nil, fmt.Errorf("invalid part size 0") - } - partStart := pos - partEnd := pos + int64(p.Size) - if partEnd < partStart { - return nil, fmt.Errorf("part size overflow") + partStart, partEnd, err := partBounds(pos, p.Size) + if err != nil { + return nil, err } // Does this part overlap [start,end)? @@ -97,35 +80,111 @@ func SliceParts(parts []PartRef, start, end int64) ([]PartSlice, error) { continue } - // Compute overlap. 
- s := start - if s < partStart { - s = partStart - } - e := end - if e > partEnd { - e = partEnd - } - if e > s { - out = append(out, PartSlice{ - OIDHex: p.OIDHex, - Offset: s - partStart, - Length: e - s, - }) + if s, e, ok := overlap(partStart, partEnd, start, end); ok { + out = append(out, newPartSlice(p.OIDHex, partStart, s, e)) } pos = partEnd } + return validateCoverage(out, start, end) +} + +func validateNormalizeRangeInputs(total int64, length int64) error { + if total < 0 { + return fmt.Errorf("invalid total size %d", total) + } + if length < 0 { + return fmt.Errorf("invalid length %d", length) + } + return nil +} + +func normalizeStart(total int64, offset int64) (int64, error) { + start := offset + if start < 0 { + start = total + start + } + if start < 0 || start > total { + return 0, fmt.Errorf("invalid offset %d for total size %d", offset, total) + } + return start, nil +} + +func normalizeEnd(total int64, start int64, length int64) (int64, error) { + if length == 0 { + return total, nil + } + end := start + length + if end < start { + return 0, fmt.Errorf("range overflow") + } + if end > total { + end = total + } + return end, nil +} + +func validateStartEnd(start, end int64) error { + if start < 0 || end < 0 || end < start { + return fmt.Errorf("invalid start/end: %d/%d", start, end) + } + return nil +} + +func isEmptyRange(start, end int64) bool { + return start == end +} + +func partBounds(pos int64, size uint64) (start, end int64, err error) { + if size == 0 { + return 0, 0, fmt.Errorf("invalid part size 0") + } + start = pos + end = pos + int64(size) + if end < start { + return 0, 0, fmt.Errorf("part size overflow") + } + return start, end, nil +} + +func overlap(partStart, partEnd, start, end int64) (s, e int64, ok bool) { + s = start + if s < partStart { + s = partStart + } + e = end + if e > partEnd { + e = partEnd + } + if e <= s { + return 0, 0, false + } + return s, e, true +} + +func newPartSlice(oidHex string, partStart, s, e int64) 
PartSlice { + return PartSlice{ + OIDHex: oidHex, + Offset: s - partStart, + Length: e - s, + } +} + +func validateCoverage(out []PartSlice, start, end int64) ([]PartSlice, error) { // Validate that the requested interval was fully covered by parts. if len(out) == 0 { return nil, fmt.Errorf("range [%d,%d) not covered by parts", start, end) } - var covered int64 - for _, s := range out { - covered += s.Length - } + covered := coveredLength(out) if covered != (end - start) { return nil, fmt.Errorf("range [%d,%d) not fully covered by parts", start, end) } return out, nil } + +func coveredLength(slices []PartSlice) (covered int64) { + for _, s := range slices { + covered += s.Length + } + return covered +} From 9b7033875a608c50a386931aee6ed7227a08d8c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Thu, 5 Feb 2026 14:32:47 -0800 Subject: [PATCH 09/28] /go/store/blobstore: tests for helper methods --- go/store/blobstore/git_blobstore.go | 171 ++++++++---- .../blobstore/git_blobstore_helpers_test.go | 247 ++++++++++++++++++ .../internal/gitbs/descriptor_helpers_test.go | 70 +++++ .../internal/gitbs/ranges_helpers_test.go | 69 +++++ 4 files changed, 501 insertions(+), 56 deletions(-) create mode 100644 go/store/blobstore/git_blobstore_helpers_test.go create mode 100644 go/store/blobstore/internal/gitbs/descriptor_helpers_test.go create mode 100644 go/store/blobstore/internal/gitbs/ranges_helpers_test.go diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index 40e75ea4f3..4bd78951df 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -126,35 +126,60 @@ func (gbs *GitBlobstore) Get(ctx context.Context, key string, br BlobRange) (io. 
if err != nil { return nil, 0, "", err } - commit, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) + commit, ver, err := gbs.resolveCommitForGet(ctx, key) if err != nil { - return nil, 0, "", err - } - if !ok { - // If the ref doesn't exist, treat the manifest as missing (empty store), - // but surface a hard error for other keys: the store itself is missing. - if key == "manifest" { - return nil, 0, "", NotFound{Key: key} - } - return nil, 0, "", &git.RefNotFoundError{Ref: gbs.ref} + return nil, 0, ver, err } - blobOID, err := gbs.api.ResolvePathBlob(ctx, commit, key) + blobOID, ver, err := gbs.resolveBlobForGet(ctx, commit, key) if err != nil { - if git.IsPathNotFound(err) { - return nil, 0, commit.String(), NotFound{Key: key} - } - return nil, 0, commit.String(), err + return nil, 0, ver, err } - sz, err := gbs.api.BlobSize(ctx, blobOID) + sz, ver, err := gbs.resolveBlobSizeForGet(ctx, commit, blobOID) if err != nil { - return nil, 0, commit.String(), err + return nil, 0, ver, err } return gbs.openBlobOrDescriptorRange(ctx, commit, blobOID, sz, br) } +func (gbs *GitBlobstore) resolveCommitForGet(ctx context.Context, key string) (commit git.OID, ver string, err error) { + commit, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) + if err != nil { + return git.OID(""), "", err + } + if ok { + return commit, commit.String(), nil + } + + // If the ref doesn't exist, treat the manifest as missing (empty store), + // but surface a hard error for other keys: the store itself is missing. 
+ if key == "manifest" { + return git.OID(""), "", NotFound{Key: key} + } + return git.OID(""), "", &git.RefNotFoundError{Ref: gbs.ref} +} + +func (gbs *GitBlobstore) resolveBlobForGet(ctx context.Context, commit git.OID, key string) (oid git.OID, ver string, err error) { + oid, err = gbs.api.ResolvePathBlob(ctx, commit, key) + if err != nil { + if git.IsPathNotFound(err) { + return git.OID(""), commit.String(), NotFound{Key: key} + } + return git.OID(""), commit.String(), err + } + return oid, commit.String(), nil +} + +func (gbs *GitBlobstore) resolveBlobSizeForGet(ctx context.Context, commit git.OID, oid git.OID) (sz int64, ver string, err error) { + sz, err = gbs.api.BlobSize(ctx, oid) + if err != nil { + return 0, commit.String(), err + } + return sz, commit.String(), nil +} + type limitReadCloser struct { r io.Reader c io.Closer @@ -166,7 +191,6 @@ func (l *limitReadCloser) Close() error { return l.c.Close() } func (gbs *GitBlobstore) openBlobOrDescriptorRange(ctx context.Context, commit git.OID, blobOID git.OID, blobSize int64, br BlobRange) (io.ReadCloser, uint64, string, error) { ver := commit.String() - // Read the blob contents. If it's a descriptor, we'll parse it and stream across parts. rc, err := gbs.api.BlobReader(ctx, blobOID) if err != nil { return nil, 0, ver, err @@ -177,57 +201,27 @@ func (gbs *GitBlobstore) openBlobOrDescriptorRange(ctx context.Context, commit g } }() - // Read up to a bounded prefix to determine if it's a descriptor. If it looks like one, - // read the full blob (descriptors are expected to be small). const peekN = 64 * 1024 - peek := make([]byte, 0, 256) - buf := make([]byte, 256) - for len(peek) < cap(peek) { - n, rerr := rc.Read(buf[:min(cap(peek)-len(peek), len(buf))]) - if n > 0 { - peek = append(peek, buf[:n]...) 
- } - if rerr != nil { - if errors.Is(rerr, io.EOF) { - break - } - return nil, 0, ver, rerr - } + peek, err := readAtMost(rc, 256) + if err != nil { + return nil, 0, ver, err } // Not a descriptor: stream inline blob with BlobRange slicing. if !gitbs.IsDescriptorPrefix(peek) { - // Re-open for streaming the full inline blob. (Simpler than splicing peek+rest.) - _ = rc.Close() - rc = nil - - inlineRC, err := gbs.api.BlobReader(ctx, blobOID) + inlineRC, err := gbs.reopenInlineBlobReader(ctx, rc, blobOID) if err != nil { return nil, 0, ver, err } + rc = nil // ownership transferred / already closed return sliceInlineBlob(inlineRC, blobSize, br, ver) } // It's probably a descriptor. Read the full contents (bounded defensively). // TODO(gitblobstore): add a MaxDescriptorSize config; for now cap at 64KiB. - descBytes := append([]byte(nil), peek...) - for int64(len(descBytes)) < blobSize && len(descBytes) < peekN { - n, rerr := rc.Read(buf) - if n > 0 { - descBytes = append(descBytes, buf[:n]...) - } - if rerr != nil { - if errors.Is(rerr, io.EOF) { - break - } - return nil, 0, ver, rerr - } - } - if int64(len(descBytes)) < blobSize { - if blobSize > peekN { - return nil, 0, ver, fmt.Errorf("gitblobstore: descriptor too large (%d bytes, cap %d)", blobSize, peekN) - } - return nil, 0, ver, io.ErrUnexpectedEOF + descBytes, err := readFullBlobBounded(rc, peek, blobSize, peekN) + if err != nil { + return nil, 0, ver, err } desc, err := gitbs.ParseDescriptor(descBytes) @@ -258,8 +252,73 @@ func (gbs *GitBlobstore) openBlobOrDescriptorRange(ctx context.Context, commit g return streamRC, uint64(desc.TotalSize), ver, nil } +func (gbs *GitBlobstore) reopenInlineBlobReader(ctx context.Context, rc io.ReadCloser, blobOID git.OID) (io.ReadCloser, error) { + // Re-open for streaming the full inline blob. (Simpler than splicing peek+rest.) 
+ if rc != nil { + _ = rc.Close() + } + return gbs.api.BlobReader(ctx, blobOID) +} + +func readAtMost(r io.Reader, n int) ([]byte, error) { + if n <= 0 { + return nil, nil + } + out := make([]byte, 0, n) + buf := make([]byte, min(256, n)) + for len(out) < n { + toRead := min(n-len(out), len(buf)) + rd, err := r.Read(buf[:toRead]) + if rd > 0 { + out = append(out, buf[:rd]...) + } + if err != nil { + if errors.Is(err, io.EOF) { + break + } + return nil, err + } + } + return out, nil +} + +func readFullBlobBounded(r io.Reader, already []byte, blobSize int64, max int) ([]byte, error) { + if blobSize < 0 { + return nil, fmt.Errorf("gitblobstore: invalid blob size %d", blobSize) + } + if int64(len(already)) > blobSize { + // Defensive: callers should pass a prefix read from this same blob reader. + return nil, io.ErrUnexpectedEOF + } + + descBytes := append([]byte(nil), already...) + buf := make([]byte, 256) + for int64(len(descBytes)) < blobSize && len(descBytes) < max { + n, err := r.Read(buf) + if n > 0 { + descBytes = append(descBytes, buf[:n]...) + } + if err != nil { + if errors.Is(err, io.EOF) { + break + } + return nil, err + } + } + if int64(len(descBytes)) < blobSize { + if blobSize > int64(max) { + return nil, fmt.Errorf("gitblobstore: descriptor too large (%d bytes, cap %d)", blobSize, max) + } + return nil, io.ErrUnexpectedEOF + } + return descBytes, nil +} + func sliceInlineBlob(rc io.ReadCloser, sz int64, br BlobRange, ver string) (io.ReadCloser, uint64, string, error) { // Implement BlobRange by slicing the streamed blob contents. + // TODO(gitblobstore): This streaming implementation is correct but may be slow for workloads + // that do many small ranged reads (e.g. table index/footer reads). Consider caching/materializing + // blobs to a local file (or using a batched git cat-file mode) to serve ranges efficiently. 
if br.isAllRange() { return rc, uint64(sz), ver, nil } diff --git a/go/store/blobstore/git_blobstore_helpers_test.go b/go/store/blobstore/git_blobstore_helpers_test.go new file mode 100644 index 0000000000..fe8f225bda --- /dev/null +++ b/go/store/blobstore/git_blobstore_helpers_test.go @@ -0,0 +1,247 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package blobstore + +import ( + "bytes" + "context" + "errors" + "io" + "testing" + + "github.com/stretchr/testify/require" + + git "github.com/dolthub/dolt/go/store/blobstore/internal/git" +) + +type fakeGitAPI struct { + tryResolveRefCommit func(ctx context.Context, ref string) (git.OID, bool, error) + resolvePathBlob func(ctx context.Context, commit git.OID, path string) (git.OID, error) + blobSize func(ctx context.Context, oid git.OID) (int64, error) + blobReader func(ctx context.Context, oid git.OID) (io.ReadCloser, error) +} + +func (f fakeGitAPI) TryResolveRefCommit(ctx context.Context, ref string) (git.OID, bool, error) { + return f.tryResolveRefCommit(ctx, ref) +} +func (f fakeGitAPI) ResolveRefCommit(ctx context.Context, ref string) (git.OID, error) { + panic("unexpected call") +} +func (f fakeGitAPI) ResolvePathBlob(ctx context.Context, commit git.OID, path string) (git.OID, error) { + return f.resolvePathBlob(ctx, commit, path) +} +func (f fakeGitAPI) CatFileType(ctx context.Context, oid git.OID) (string, error) { panic("unexpected call") } +func (f fakeGitAPI) 
BlobSize(ctx context.Context, oid git.OID) (int64, error) { + return f.blobSize(ctx, oid) +} +func (f fakeGitAPI) BlobReader(ctx context.Context, oid git.OID) (io.ReadCloser, error) { + return f.blobReader(ctx, oid) +} +func (f fakeGitAPI) HashObject(ctx context.Context, contents io.Reader) (git.OID, error) { + panic("unexpected call") +} +func (f fakeGitAPI) ReadTree(ctx context.Context, commit git.OID, indexFile string) error { + panic("unexpected call") +} +func (f fakeGitAPI) ReadTreeEmpty(ctx context.Context, indexFile string) error { panic("unexpected call") } +func (f fakeGitAPI) UpdateIndexCacheInfo(ctx context.Context, indexFile string, mode string, oid git.OID, path string) error { + panic("unexpected call") +} +func (f fakeGitAPI) WriteTree(ctx context.Context, indexFile string) (git.OID, error) { + panic("unexpected call") +} +func (f fakeGitAPI) CommitTree(ctx context.Context, tree git.OID, parent *git.OID, message string, author *git.Identity) (git.OID, error) { + panic("unexpected call") +} +func (f fakeGitAPI) UpdateRefCAS(ctx context.Context, ref string, newOID git.OID, oldOID git.OID, msg string) error { + panic("unexpected call") +} +func (f fakeGitAPI) UpdateRef(ctx context.Context, ref string, newOID git.OID, msg string) error { + panic("unexpected call") +} + +type trackingReadCloser struct { + io.Reader + closed bool +} + +func (t *trackingReadCloser) Close() error { + t.closed = true + return nil +} + +func TestGitBlobstoreHelpers_resolveCommitForGet(t *testing.T) { + ctx := context.Background() + + t.Run("ok", func(t *testing.T) { + api := fakeGitAPI{ + tryResolveRefCommit: func(ctx context.Context, ref string) (git.OID, bool, error) { + require.Equal(t, DoltDataRef, ref) + return git.OID("0123456789abcdef0123456789abcdef01234567"), true, nil + }, + } + gbs := &GitBlobstore{ref: DoltDataRef, api: api} + + commit, ver, err := gbs.resolveCommitForGet(ctx, "k") + require.NoError(t, err) + require.Equal(t, 
git.OID("0123456789abcdef0123456789abcdef01234567"), commit) + require.Equal(t, "0123456789abcdef0123456789abcdef01234567", ver) + }) + + t.Run("missingRef_manifestIsNotFound", func(t *testing.T) { + api := fakeGitAPI{ + tryResolveRefCommit: func(ctx context.Context, ref string) (git.OID, bool, error) { + return git.OID(""), false, nil + }, + } + gbs := &GitBlobstore{ref: DoltDataRef, api: api} + + _, _, err := gbs.resolveCommitForGet(ctx, "manifest") + var nf NotFound + require.ErrorAs(t, err, &nf) + require.Equal(t, "manifest", nf.Key) + }) + + t.Run("missingRef_nonManifestIsRefNotFound", func(t *testing.T) { + api := fakeGitAPI{ + tryResolveRefCommit: func(ctx context.Context, ref string) (git.OID, bool, error) { + return git.OID(""), false, nil + }, + } + gbs := &GitBlobstore{ref: DoltDataRef, api: api} + + _, _, err := gbs.resolveCommitForGet(ctx, "somekey") + var rnf *git.RefNotFoundError + require.ErrorAs(t, err, &rnf) + require.Equal(t, DoltDataRef, rnf.Ref) + }) + + t.Run("propagatesError", func(t *testing.T) { + sentinel := errors.New("boom") + api := fakeGitAPI{ + tryResolveRefCommit: func(ctx context.Context, ref string) (git.OID, bool, error) { + return git.OID(""), false, sentinel + }, + } + gbs := &GitBlobstore{ref: DoltDataRef, api: api} + + _, _, err := gbs.resolveCommitForGet(ctx, "k") + require.ErrorIs(t, err, sentinel) + }) +} + +func TestGitBlobstoreHelpers_resolveBlobForGet(t *testing.T) { + ctx := context.Background() + commit := git.OID("0123456789abcdef0123456789abcdef01234567") + + t.Run("ok", func(t *testing.T) { + api := fakeGitAPI{ + resolvePathBlob: func(ctx context.Context, gotCommit git.OID, path string) (git.OID, error) { + require.Equal(t, commit, gotCommit) + require.Equal(t, "k", path) + return git.OID("89abcdef0123456789abcdef0123456789abcdef"), nil + }, + } + gbs := &GitBlobstore{api: api} + + oid, ver, err := gbs.resolveBlobForGet(ctx, commit, "k") + require.NoError(t, err) + require.Equal(t, 
"0123456789abcdef0123456789abcdef01234567", ver) + require.Equal(t, git.OID("89abcdef0123456789abcdef0123456789abcdef"), oid) + }) + + t.Run("pathNotFoundMapsToNotFound", func(t *testing.T) { + api := fakeGitAPI{ + resolvePathBlob: func(ctx context.Context, gotCommit git.OID, path string) (git.OID, error) { + return git.OID(""), &git.PathNotFoundError{Commit: gotCommit.String(), Path: path} + }, + } + gbs := &GitBlobstore{api: api} + + _, ver, err := gbs.resolveBlobForGet(ctx, commit, "k") + require.Equal(t, commit.String(), ver) + var nf NotFound + require.ErrorAs(t, err, &nf) + require.Equal(t, "k", nf.Key) + }) +} + +func TestGitBlobstoreHelpers_resolveBlobSizeForGet(t *testing.T) { + ctx := context.Background() + commit := git.OID("0123456789abcdef0123456789abcdef01234567") + oid := git.OID("89abcdef0123456789abcdef0123456789abcdef") + + t.Run("ok", func(t *testing.T) { + api := fakeGitAPI{ + blobSize: func(ctx context.Context, gotOID git.OID) (int64, error) { + require.Equal(t, oid, gotOID) + return 123, nil + }, + } + gbs := &GitBlobstore{api: api} + + sz, ver, err := gbs.resolveBlobSizeForGet(ctx, commit, oid) + require.NoError(t, err) + require.Equal(t, commit.String(), ver) + require.Equal(t, int64(123), sz) + }) +} + +func TestGitBlobstoreHelpers_reopenInlineBlobReaderClosesOriginal(t *testing.T) { + ctx := context.Background() + blobOID := git.OID("0123456789abcdef0123456789abcdef01234567") + + orig := &trackingReadCloser{Reader: bytes.NewReader([]byte("x"))} + api := fakeGitAPI{ + blobReader: func(ctx context.Context, gotOID git.OID) (io.ReadCloser, error) { + require.Equal(t, blobOID, gotOID) + return io.NopCloser(bytes.NewReader([]byte("y"))), nil + }, + } + gbs := &GitBlobstore{api: api} + + rc, err := gbs.reopenInlineBlobReader(ctx, orig, blobOID) + require.NoError(t, err) + require.True(t, orig.closed) + require.NotNil(t, rc) + _ = rc.Close() +} + +func TestReadAtMost(t *testing.T) { + out, err := readAtMost(bytes.NewReader([]byte("hello")), 3) + 
require.NoError(t, err) + require.Equal(t, []byte("hel"), out) + + out, err = readAtMost(bytes.NewReader([]byte("hi")), 3) + require.NoError(t, err) + require.Equal(t, []byte("hi"), out) +} + +func TestReadFullBlobBounded(t *testing.T) { + // Reads through to blobSize when within max. + // Note: |already| is expected to be a prefix read from |r|, so |r| must represent the + // remaining stream after the prefix has been consumed. + r := bytes.NewReader([]byte("cdef")) + got, err := readFullBlobBounded(r, []byte("ab"), 6, 64) + require.NoError(t, err) + require.Equal(t, []byte("abcdef"), got) + + // Errors if blobSize exceeds max and we hit the cap. + r = bytes.NewReader(bytes.Repeat([]byte("x"), 100)) + _, err = readFullBlobBounded(r, bytes.Repeat([]byte("x"), 10), 100000, 10) + require.Error(t, err) + require.Contains(t, err.Error(), "descriptor too large") +} + diff --git a/go/store/blobstore/internal/gitbs/descriptor_helpers_test.go b/go/store/blobstore/internal/gitbs/descriptor_helpers_test.go new file mode 100644 index 0000000000..444ef3c67e --- /dev/null +++ b/go/store/blobstore/internal/gitbs/descriptor_helpers_test.go @@ -0,0 +1,70 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package gitbs + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestDescriptorHelpers_validateDescriptorParts(t *testing.T) { + sum, err := validateDescriptorParts([]PartRef{ + {OIDHex: "0123456789abcdef0123456789abcdef01234567", Size: 3}, + {OIDHex: "89abcdef0123456789abcdef0123456789abcdef", Size: 4}, + }) + require.NoError(t, err) + require.Equal(t, uint64(7), sum) + + _, err = validateDescriptorParts([]PartRef{{OIDHex: "not-an-oid", Size: 1}}) + require.Error(t, err) + + _, err = validateDescriptorParts([]PartRef{{OIDHex: "0123456789abcdef0123456789abcdef01234567", Size: 0}}) + require.Error(t, err) +} + +func TestDescriptorHelpers_validateDescriptorSizeAndParts(t *testing.T) { + require.NoError(t, validateDescriptorSizeAndParts(0, 0, 0)) + require.Error(t, validateDescriptorSizeAndParts(0, 1, 1)) + require.Error(t, validateDescriptorSizeAndParts(1, 0, 0)) + require.Error(t, validateDescriptorSizeAndParts(3, 1, 2)) + require.NoError(t, validateDescriptorSizeAndParts(3, 1, 3)) +} + +func TestDescriptorHelpers_parseLines(t *testing.T) { + var st descriptorParseState + + err := parseDescriptorLine(&st, "size 3") + require.NoError(t, err) + require.True(t, st.haveSz) + require.Equal(t, uint64(3), st.d.TotalSize) + + err = parseDescriptorLine(&st, "part 0123456789abcdef0123456789abcdef01234567 3") + require.NoError(t, err) + require.Len(t, st.d.Parts, 1) + + d, err := finalizeParsedDescriptor(st) + require.NoError(t, err) + require.Equal(t, uint64(3), d.TotalSize) + require.Len(t, d.Parts, 1) +} + +func TestDescriptorHelpers_writePartLine(t *testing.T) { + var b strings.Builder + writePartLine(&b, PartRef{OIDHex: "0123456789abcdef0123456789abcdef01234567", Size: 9}) + require.Equal(t, "part 0123456789abcdef0123456789abcdef01234567 9\n", b.String()) +} + diff --git a/go/store/blobstore/internal/gitbs/ranges_helpers_test.go b/go/store/blobstore/internal/gitbs/ranges_helpers_test.go new file mode 100644 index 
0000000000..68e7f4c8cf --- /dev/null +++ b/go/store/blobstore/internal/gitbs/ranges_helpers_test.go @@ -0,0 +1,69 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gitbs + +import ( + "math" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestRangesHelpers_normalizeStartEnd(t *testing.T) { + start, err := normalizeStart(10, -2) + require.NoError(t, err) + require.Equal(t, int64(8), start) + + _, err = normalizeStart(10, 11) + require.Error(t, err) + + end, err := normalizeEnd(10, 2, 0) + require.NoError(t, err) + require.Equal(t, int64(10), end) + + end, err = normalizeEnd(10, 2, 100) + require.NoError(t, err) + require.Equal(t, int64(10), end) +} + +func TestRangesHelpers_partBoundsAndOverlap(t *testing.T) { + _, _, err := partBounds(0, 0) + require.Error(t, err) + + // Force int64 overflow path: end wraps negative, so end < start. 
+ _, _, err = partBounds(math.MaxInt64-1, 10) + require.Error(t, err) + + s, e, ok := overlap(0, 10, 2, 5) + require.True(t, ok) + require.Equal(t, int64(2), s) + require.Equal(t, int64(5), e) + + _, _, ok = overlap(0, 10, 10, 12) + require.False(t, ok) +} + +func TestRangesHelpers_validateCoverage(t *testing.T) { + _, err := validateCoverage(nil, 0, 1) + require.Error(t, err) + + _, err = validateCoverage([]PartSlice{{OIDHex: "a", Offset: 0, Length: 1}}, 0, 2) + require.Error(t, err) + + out, err := validateCoverage([]PartSlice{{OIDHex: "a", Offset: 0, Length: 2}}, 0, 2) + require.NoError(t, err) + require.Len(t, out, 1) +} + From 4186aa048b36ac814a1b483566ca265be84c3c16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Thu, 5 Feb 2026 14:48:51 -0800 Subject: [PATCH 10/28] /go/store/blobstore: refactor --- go/store/blobstore/git_blobstore.go | 123 ++++++++++++------ .../blobstore/git_blobstore_multipart_test.go | 116 +++++++++++++++++ .../blobstore/internal/gitbs/descriptor.go | 17 --- go/store/blobstore/internal/gitbs/oid.go | 40 ++++++ 4 files changed, 238 insertions(+), 58 deletions(-) create mode 100644 go/store/blobstore/git_blobstore_multipart_test.go create mode 100644 go/store/blobstore/internal/gitbs/oid.go diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index 4bd78951df..6291950f9c 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -365,58 +365,99 @@ type multiPartReadCloser struct { func (m *multiPartReadCloser) Read(p []byte) (int, error) { for { + if err := m.ensureCurrent(); err != nil { + return 0, err + } if m.curRC == nil { - if m.curIdx >= len(m.slices) { - return 0, io.EOF - } - s := m.slices[m.curIdx] - rc, err := m.api.BlobReader(m.ctx, git.OID(s.OIDHex)) - if err != nil { - return 0, err - } - // Skip within part. 
- if s.Offset > 0 { - if _, err := io.CopyN(io.Discard, rc, s.Offset); err != nil { - _ = rc.Close() - return 0, err - } - } - m.curRC = rc - m.rem = s.Length + return 0, io.EOF } if m.rem == 0 { - _ = m.curRC.Close() - m.curRC = nil - m.curIdx++ + _ = m.closeCurrentAndAdvance() continue } - toRead := len(p) - if int64(toRead) > m.rem { - toRead = int(m.rem) - } - n, err := m.curRC.Read(p[:toRead]) - if n > 0 { - m.rem -= int64(n) - return n, nil - } - if err != nil { - if errors.Is(err, io.EOF) { - // End of underlying part blob; if we still expected bytes, that's corruption. - if m.rem > 0 { - return 0, io.ErrUnexpectedEOF - } - _ = m.curRC.Close() - m.curRC = nil - m.curIdx++ - continue - } - return 0, err + n, err := m.readCurrent(p) + if n > 0 || err != nil { + return n, err } } } +func (m *multiPartReadCloser) ensureCurrent() error { + if m.curRC != nil { + return nil + } + if m.curIdx >= len(m.slices) { + return nil + } + s := m.slices[m.curIdx] + rc, err := m.openSliceReader(s) + if err != nil { + return err + } + m.curRC = rc + m.rem = s.Length + return nil +} + +func (m *multiPartReadCloser) openSliceReader(s gitbs.PartSlice) (io.ReadCloser, error) { + rc, err := m.api.BlobReader(m.ctx, git.OID(s.OIDHex)) + if err != nil { + return nil, err + } + if err := skipN(rc, s.Offset); err != nil { + _ = rc.Close() + return nil, err + } + return rc, nil +} + +func (m *multiPartReadCloser) closeCurrentAndAdvance() error { + if m.curRC != nil { + err := m.curRC.Close() + m.curRC = nil + m.rem = 0 + m.curIdx++ + return err + } + m.curIdx++ + return nil +} + +func (m *multiPartReadCloser) readCurrent(p []byte) (int, error) { + toRead := len(p) + if int64(toRead) > m.rem { + toRead = int(m.rem) + } + + n, err := m.curRC.Read(p[:toRead]) + if n > 0 { + m.rem -= int64(n) + return n, nil + } + if err == nil { + return 0, nil + } + if errors.Is(err, io.EOF) { + // End of underlying part blob; if we still expected bytes, that's corruption. 
+ if m.rem > 0 { + return 0, io.ErrUnexpectedEOF + } + _ = m.closeCurrentAndAdvance() + return 0, nil + } + return 0, err +} + +func skipN(r io.Reader, n int64) error { + if n <= 0 { + return nil + } + _, err := io.CopyN(io.Discard, r, n) + return err +} + func (m *multiPartReadCloser) Close() error { if m.curRC != nil { err := m.curRC.Close() diff --git a/go/store/blobstore/git_blobstore_multipart_test.go b/go/store/blobstore/git_blobstore_multipart_test.go new file mode 100644 index 0000000000..115529a37b --- /dev/null +++ b/go/store/blobstore/git_blobstore_multipart_test.go @@ -0,0 +1,116 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package blobstore + +import ( + "bytes" + "context" + "errors" + "io" + "testing" + + "github.com/stretchr/testify/require" + + git "github.com/dolthub/dolt/go/store/blobstore/internal/git" + gitbs "github.com/dolthub/dolt/go/store/blobstore/internal/gitbs" +) + +func TestMultiPartReadCloser_ReadConcatenatesAcrossPartsWithOffsets(t *testing.T) { + ctx := context.Background() + + oid1 := "0123456789abcdef0123456789abcdef01234567" + oid2 := "89abcdef0123456789abcdef0123456789abcdef" + + blobs := map[string][]byte{ + oid1: []byte("hello"), + oid2: []byte("world!"), + } + + api := fakeGitAPI{ + blobReader: func(ctx context.Context, oid git.OID) (io.ReadCloser, error) { + b, ok := blobs[oid.String()] + require.True(t, ok, "unexpected oid %s", oid.String()) + return io.NopCloser(bytes.NewReader(b)), nil + }, + } + + rc := &multiPartReadCloser{ + ctx: ctx, + api: api, + slices: []gitbs.PartSlice{ + {OIDHex: oid1, Offset: 1, Length: 3}, // "ell" + {OIDHex: oid2, Offset: 2, Length: 3}, // "rld" + }, + } + defer func() { _ = rc.Close() }() + + got, err := io.ReadAll(rc) + require.NoError(t, err) + require.Equal(t, []byte("ellrld"), got) +} + +func TestMultiPartReadCloser_ReadUnexpectedEOFWhenPartShorterThanDeclared(t *testing.T) { + ctx := context.Background() + + oid := "0123456789abcdef0123456789abcdef01234567" + api := fakeGitAPI{ + blobReader: func(ctx context.Context, oid git.OID) (io.ReadCloser, error) { + return io.NopCloser(bytes.NewReader([]byte("hi"))), nil // 2 bytes + }, + } + + rc := &multiPartReadCloser{ + ctx: ctx, + api: api, + slices: []gitbs.PartSlice{ + {OIDHex: oid, Offset: 0, Length: 3}, // expect 3 bytes, only 2 available + }, + } + defer func() { _ = rc.Close() }() + + _, err := io.ReadAll(rc) + require.Error(t, err) + require.True(t, errors.Is(err, io.ErrUnexpectedEOF)) +} + +func TestMultiPartReadCloser_CloseClosesUnderlyingPartReader(t *testing.T) { + ctx := context.Background() + + oid := "0123456789abcdef0123456789abcdef01234567" + underlying 
:= &trackingReadCloser{Reader: bytes.NewReader([]byte("hello"))} + + api := fakeGitAPI{ + blobReader: func(ctx context.Context, oid git.OID) (io.ReadCloser, error) { + return underlying, nil + }, + } + + rc := &multiPartReadCloser{ + ctx: ctx, + api: api, + slices: []gitbs.PartSlice{ + {OIDHex: oid, Offset: 0, Length: 1}, + }, + } + + // Force the underlying reader to be opened. + buf := make([]byte, 1) + _, err := rc.Read(buf) + require.NoError(t, err) + + require.NoError(t, rc.Close()) + require.True(t, underlying.closed) +} + diff --git a/go/store/blobstore/internal/gitbs/descriptor.go b/go/store/blobstore/internal/gitbs/descriptor.go index 3da88b8827..379766eacd 100644 --- a/go/store/blobstore/internal/gitbs/descriptor.go +++ b/go/store/blobstore/internal/gitbs/descriptor.go @@ -236,20 +236,3 @@ func splitLines(s string) []string { func parseUint(s string) (uint64, error) { return strconv.ParseUint(s, 10, 64) } - -func validateOIDHex(oid string) error { - if len(oid) != 40 { - return fmt.Errorf("expected 40 hex chars, got %d", len(oid)) - } - for i := 0; i < len(oid); i++ { - c := oid[i] - switch { - case c >= '0' && c <= '9': - case c >= 'a' && c <= 'f': - case c >= 'A' && c <= 'F': - default: - return fmt.Errorf("non-hex character %q", c) - } - } - return nil -} diff --git a/go/store/blobstore/internal/gitbs/oid.go b/go/store/blobstore/internal/gitbs/oid.go new file mode 100644 index 0000000000..3470e4936a --- /dev/null +++ b/go/store/blobstore/internal/gitbs/oid.go @@ -0,0 +1,40 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gitbs + +import "fmt" + +// validateOIDHex validates a 40-character hex object id. +// +// This is intentionally lenient about case (accepts A-F) since we may parse OIDs +// from sources that aren't normalized. Callers that require a canonical form should +// normalize separately (e.g. strings.ToLower). +func validateOIDHex(oid string) error { + if len(oid) != 40 { + return fmt.Errorf("expected 40 hex chars, got %d", len(oid)) + } + for i := 0; i < len(oid); i++ { + c := oid[i] + switch { + case c >= '0' && c <= '9': + case c >= 'a' && c <= 'f': + case c >= 'A' && c <= 'F': + default: + return fmt.Errorf("non-hex character %q", c) + } + } + return nil +} + From f95f4f0e30e6e1f489f5689d3e7a49898231c4c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Thu, 5 Feb 2026 15:04:40 -0800 Subject: [PATCH 11/28] /go/store/blobstore: format --- go/store/blobstore/git_blobstore_helpers_test.go | 9 ++++++--- go/store/blobstore/git_blobstore_multipart_test.go | 1 - .../blobstore/internal/gitbs/descriptor_helpers_test.go | 1 - go/store/blobstore/internal/gitbs/oid.go | 1 - go/store/blobstore/internal/gitbs/ranges_helpers_test.go | 1 - 5 files changed, 6 insertions(+), 7 deletions(-) diff --git a/go/store/blobstore/git_blobstore_helpers_test.go b/go/store/blobstore/git_blobstore_helpers_test.go index fe8f225bda..c590a0bcc0 100644 --- a/go/store/blobstore/git_blobstore_helpers_test.go +++ b/go/store/blobstore/git_blobstore_helpers_test.go @@ -42,7 +42,9 @@ func (f fakeGitAPI) ResolveRefCommit(ctx context.Context, 
ref string) (git.OID, func (f fakeGitAPI) ResolvePathBlob(ctx context.Context, commit git.OID, path string) (git.OID, error) { return f.resolvePathBlob(ctx, commit, path) } -func (f fakeGitAPI) CatFileType(ctx context.Context, oid git.OID) (string, error) { panic("unexpected call") } +func (f fakeGitAPI) CatFileType(ctx context.Context, oid git.OID) (string, error) { + panic("unexpected call") +} func (f fakeGitAPI) BlobSize(ctx context.Context, oid git.OID) (int64, error) { return f.blobSize(ctx, oid) } @@ -55,7 +57,9 @@ func (f fakeGitAPI) HashObject(ctx context.Context, contents io.Reader) (git.OID func (f fakeGitAPI) ReadTree(ctx context.Context, commit git.OID, indexFile string) error { panic("unexpected call") } -func (f fakeGitAPI) ReadTreeEmpty(ctx context.Context, indexFile string) error { panic("unexpected call") } +func (f fakeGitAPI) ReadTreeEmpty(ctx context.Context, indexFile string) error { + panic("unexpected call") +} func (f fakeGitAPI) UpdateIndexCacheInfo(ctx context.Context, indexFile string, mode string, oid git.OID, path string) error { panic("unexpected call") } @@ -244,4 +248,3 @@ func TestReadFullBlobBounded(t *testing.T) { require.Error(t, err) require.Contains(t, err.Error(), "descriptor too large") } - diff --git a/go/store/blobstore/git_blobstore_multipart_test.go b/go/store/blobstore/git_blobstore_multipart_test.go index 115529a37b..883c2b67bd 100644 --- a/go/store/blobstore/git_blobstore_multipart_test.go +++ b/go/store/blobstore/git_blobstore_multipart_test.go @@ -113,4 +113,3 @@ func TestMultiPartReadCloser_CloseClosesUnderlyingPartReader(t *testing.T) { require.NoError(t, rc.Close()) require.True(t, underlying.closed) } - diff --git a/go/store/blobstore/internal/gitbs/descriptor_helpers_test.go b/go/store/blobstore/internal/gitbs/descriptor_helpers_test.go index 444ef3c67e..7886695486 100644 --- a/go/store/blobstore/internal/gitbs/descriptor_helpers_test.go +++ b/go/store/blobstore/internal/gitbs/descriptor_helpers_test.go @@ 
-67,4 +67,3 @@ func TestDescriptorHelpers_writePartLine(t *testing.T) { writePartLine(&b, PartRef{OIDHex: "0123456789abcdef0123456789abcdef01234567", Size: 9}) require.Equal(t, "part 0123456789abcdef0123456789abcdef01234567 9\n", b.String()) } - diff --git a/go/store/blobstore/internal/gitbs/oid.go b/go/store/blobstore/internal/gitbs/oid.go index 3470e4936a..55b0e8b753 100644 --- a/go/store/blobstore/internal/gitbs/oid.go +++ b/go/store/blobstore/internal/gitbs/oid.go @@ -37,4 +37,3 @@ func validateOIDHex(oid string) error { } return nil } - diff --git a/go/store/blobstore/internal/gitbs/ranges_helpers_test.go b/go/store/blobstore/internal/gitbs/ranges_helpers_test.go index 68e7f4c8cf..d4f66b315f 100644 --- a/go/store/blobstore/internal/gitbs/ranges_helpers_test.go +++ b/go/store/blobstore/internal/gitbs/ranges_helpers_test.go @@ -66,4 +66,3 @@ func TestRangesHelpers_validateCoverage(t *testing.T) { require.NoError(t, err) require.Len(t, out, 1) } - From c2ea3de6f2e8599aaa95945abc8de0065ed2bca2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Thu, 5 Feb 2026 16:54:17 -0800 Subject: [PATCH 12/28] /go/store/blobstore: wip, simplifying --- go/store/blobstore/git_blobstore.go | 172 ++++++-------- .../git_blobstore_chunked_checkandput_test.go | 82 ------- .../git_blobstore_chunked_get_test.go | 96 +++----- .../git_blobstore_chunked_put_test.go | 84 ------- .../blobstore/git_blobstore_helpers_test.go | 89 +++----- .../blobstore/git_blobstore_multipart_test.go | 10 + go/store/blobstore/internal/git/api.go | 25 +++ go/store/blobstore/internal/git/impl.go | 97 ++++++++ go/store/blobstore/internal/git/impl_test.go | 210 ++++++++++++++++++ 9 files changed, 481 insertions(+), 384 deletions(-) delete mode 100644 go/store/blobstore/git_blobstore_chunked_checkandput_test.go delete mode 100644 go/store/blobstore/git_blobstore_chunked_put_test.go diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index 
6291950f9c..645d66db1a 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -20,9 +20,11 @@ import ( "errors" "fmt" "io" + "math" "os" "path/filepath" "sort" + "strconv" "strings" "time" @@ -111,7 +113,7 @@ func (gbs *GitBlobstore) Exists(ctx context.Context, key string) (bool, error) { if !ok { return false, nil } - _, err = gbs.api.ResolvePathBlob(ctx, commit, key) + _, _, err = gbs.api.ResolvePathObject(ctx, commit, key) if err != nil { if git.IsPathNotFound(err) { return false, nil @@ -131,17 +133,29 @@ func (gbs *GitBlobstore) Get(ctx context.Context, key string, br BlobRange) (io. return nil, 0, ver, err } - blobOID, ver, err := gbs.resolveBlobForGet(ctx, commit, key) + oid, typ, ver, err := gbs.resolveObjectForGet(ctx, commit, key) if err != nil { return nil, 0, ver, err } - sz, ver, err := gbs.resolveBlobSizeForGet(ctx, commit, blobOID) - if err != nil { - return nil, 0, ver, err - } + switch typ { + case "blob": + sz, ver, err := gbs.resolveBlobSizeForGet(ctx, commit, oid) + if err != nil { + return nil, 0, ver, err + } + rc, err := gbs.api.BlobReader(ctx, oid) + if err != nil { + return nil, 0, ver, err + } + return sliceInlineBlob(rc, sz, br, ver) - return gbs.openBlobOrDescriptorRange(ctx, commit, blobOID, sz, br) + case "tree": + return gbs.openChunkedTreeRange(ctx, commit, key, oid, br) + + default: + return nil, 0, ver, fmt.Errorf("gitblobstore: unsupported object type %q for key %q", typ, key) + } } func (gbs *GitBlobstore) resolveCommitForGet(ctx context.Context, key string) (commit git.OID, ver string, err error) { @@ -161,15 +175,15 @@ func (gbs *GitBlobstore) resolveCommitForGet(ctx context.Context, key string) (c return git.OID(""), "", &git.RefNotFoundError{Ref: gbs.ref} } -func (gbs *GitBlobstore) resolveBlobForGet(ctx context.Context, commit git.OID, key string) (oid git.OID, ver string, err error) { - oid, err = gbs.api.ResolvePathBlob(ctx, commit, key) +func (gbs *GitBlobstore) 
resolveObjectForGet(ctx context.Context, commit git.OID, key string) (oid git.OID, typ string, ver string, err error) { + oid, typ, err = gbs.api.ResolvePathObject(ctx, commit, key) if err != nil { if git.IsPathNotFound(err) { - return git.OID(""), commit.String(), NotFound{Key: key} + return git.OID(""), "", commit.String(), NotFound{Key: key} } - return git.OID(""), commit.String(), err + return git.OID(""), "", commit.String(), err } - return oid, commit.String(), nil + return oid, typ, commit.String(), nil } func (gbs *GitBlobstore) resolveBlobSizeForGet(ctx context.Context, commit git.OID, oid git.OID) (sz int64, ver string, err error) { @@ -188,56 +202,27 @@ type limitReadCloser struct { func (l *limitReadCloser) Read(p []byte) (int, error) { return l.r.Read(p) } func (l *limitReadCloser) Close() error { return l.c.Close() } -func (gbs *GitBlobstore) openBlobOrDescriptorRange(ctx context.Context, commit git.OID, blobOID git.OID, blobSize int64, br BlobRange) (io.ReadCloser, uint64, string, error) { +func (gbs *GitBlobstore) openChunkedTreeRange(ctx context.Context, commit git.OID, key string, treeOID git.OID, br BlobRange) (io.ReadCloser, uint64, string, error) { ver := commit.String() - rc, err := gbs.api.BlobReader(ctx, blobOID) + _ = treeOID // treeOID is informational; ListTree resolves by path. + entries, err := gbs.api.ListTree(ctx, commit, key) if err != nil { return nil, 0, ver, err } - defer func() { - if rc != nil { - _ = rc.Close() - } - }() - - const peekN = 64 * 1024 - peek, err := readAtMost(rc, 256) + parts, totalSize, err := gbs.validateAndSizeChunkedParts(ctx, entries) if err != nil { return nil, 0, ver, err } - // Not a descriptor: stream inline blob with BlobRange slicing. 
- if !gitbs.IsDescriptorPrefix(peek) { - inlineRC, err := gbs.reopenInlineBlobReader(ctx, rc, blobOID) - if err != nil { - return nil, 0, ver, err - } - rc = nil // ownership transferred / already closed - return sliceInlineBlob(inlineRC, blobSize, br, ver) - } - - // It's probably a descriptor. Read the full contents (bounded defensively). - // TODO(gitblobstore): add a MaxDescriptorSize config; for now cap at 64KiB. - descBytes, err := readFullBlobBounded(rc, peek, blobSize, peekN) - if err != nil { - return nil, 0, ver, err - } - - desc, err := gitbs.ParseDescriptor(descBytes) - if err != nil { - // Treat malformed descriptors as corruption (hard error). - return nil, 0, ver, err - } - - total := int64(desc.TotalSize) + total := int64(totalSize) start, end, err := gitbs.NormalizeRange(total, br.offset, br.length) if err != nil { - return nil, uint64(desc.TotalSize), ver, err + return nil, totalSize, ver, err } - slices, err := gitbs.SliceParts(desc.Parts, start, end) + slices, err := gitbs.SliceParts(parts, start, end) if err != nil { - return nil, uint64(desc.TotalSize), ver, err + return nil, totalSize, ver, err } // Stream across part blobs. @@ -246,72 +231,55 @@ func (gbs *GitBlobstore) openBlobOrDescriptorRange(ctx context.Context, commit g api: gbs.api, slices: slices, } - // Close descriptor blob reader (not used past this point). - _ = rc.Close() - rc = nil - return streamRC, uint64(desc.TotalSize), ver, nil + return streamRC, totalSize, ver, nil } -func (gbs *GitBlobstore) reopenInlineBlobReader(ctx context.Context, rc io.ReadCloser, blobOID git.OID) (io.ReadCloser, error) { - // Re-open for streaming the full inline blob. (Simpler than splicing peek+rest.) 
- if rc != nil { - _ = rc.Close() +func (gbs *GitBlobstore) validateAndSizeChunkedParts(ctx context.Context, entries []git.TreeEntry) ([]gitbs.PartRef, uint64, error) { + if len(entries) == 0 { + return nil, 0, fmt.Errorf("gitblobstore: chunked tree has no parts") } - return gbs.api.BlobReader(ctx, blobOID) -} -func readAtMost(r io.Reader, n int) ([]byte, error) { - if n <= 0 { - return nil, nil + width := len(entries[0].Name) + // First pass: validate names + types, and determine width. + if width < 4 { + return nil, 0, fmt.Errorf("gitblobstore: invalid part name %q (expected at least 4 digits)", entries[0].Name) } - out := make([]byte, 0, n) - buf := make([]byte, min(256, n)) - for len(out) < n { - toRead := min(n-len(out), len(buf)) - rd, err := r.Read(buf[:toRead]) - if rd > 0 { - out = append(out, buf[:rd]...) + + parts := make([]gitbs.PartRef, 0, len(entries)) + var total uint64 + for i, e := range entries { + if e.Type != "blob" { + return nil, 0, fmt.Errorf("gitblobstore: invalid part %q: expected blob, got %q", e.Name, e.Type) } + if len(e.Name) != width { + return nil, 0, fmt.Errorf("gitblobstore: invalid part name %q (expected width %d)", e.Name, width) + } + n, err := strconv.Atoi(e.Name) if err != nil { - if errors.Is(err, io.EOF) { - break - } - return nil, err + return nil, 0, fmt.Errorf("gitblobstore: invalid part name %q (expected digits): %w", e.Name, err) } - } - return out, nil -} - -func readFullBlobBounded(r io.Reader, already []byte, blobSize int64, max int) ([]byte, error) { - if blobSize < 0 { - return nil, fmt.Errorf("gitblobstore: invalid blob size %d", blobSize) - } - if int64(len(already)) > blobSize { - // Defensive: callers should pass a prefix read from this same blob reader. - return nil, io.ErrUnexpectedEOF - } - - descBytes := append([]byte(nil), already...) - buf := make([]byte, 256) - for int64(len(descBytes)) < blobSize && len(descBytes) < max { - n, err := r.Read(buf) - if n > 0 { - descBytes = append(descBytes, buf[:n]...) 
+ if n != i+1 { + want := fmt.Sprintf("%0*d", width, i+1) + return nil, 0, fmt.Errorf("gitblobstore: invalid part name %q (expected %q)", e.Name, want) } + if want := fmt.Sprintf("%0*d", width, n); want != e.Name { + return nil, 0, fmt.Errorf("gitblobstore: invalid part name %q (expected %q)", e.Name, want) + } + + sz, err := gbs.api.BlobSize(ctx, e.OID) if err != nil { - if errors.Is(err, io.EOF) { - break - } - return nil, err + return nil, 0, err } - } - if int64(len(descBytes)) < blobSize { - if blobSize > int64(max) { - return nil, fmt.Errorf("gitblobstore: descriptor too large (%d bytes, cap %d)", blobSize, max) + if sz < 0 { + return nil, 0, fmt.Errorf("gitblobstore: invalid part size %d for %q", sz, e.Name) } - return nil, io.ErrUnexpectedEOF + if uint64(sz) > math.MaxUint64-total { + return nil, 0, fmt.Errorf("gitblobstore: total size overflow") + } + total += uint64(sz) + parts = append(parts, gitbs.PartRef{OIDHex: e.OID.String(), Size: uint64(sz)}) } - return descBytes, nil + return parts, total, nil } func sliceInlineBlob(rc io.ReadCloser, sz int64, br BlobRange, ver string) (io.ReadCloser, uint64, string, error) { diff --git a/go/store/blobstore/git_blobstore_chunked_checkandput_test.go b/go/store/blobstore/git_blobstore_chunked_checkandput_test.go deleted file mode 100644 index 316131fc9c..0000000000 --- a/go/store/blobstore/git_blobstore_chunked_checkandput_test.go +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2026 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package blobstore - -import ( - "bytes" - "context" - "errors" - "io" - "testing" - - "github.com/stretchr/testify/require" - - "github.com/dolthub/dolt/go/store/testutils/gitrepo" -) - -func TestGitBlobstore_CheckAndPut_ChunkedRoundTrip_CreateOnly(t *testing.T) { - requireGitOnPath(t) - - ctx := context.Background() - repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") - require.NoError(t, err) - - bs, err := NewGitBlobstoreWithOptions(repo.GitDir, DoltDataRef, GitBlobstoreOptions{ - Identity: testIdentity(), - MaxPartSize: 3, - }) - require.NoError(t, err) - - want := []byte("abcdefghij") // 10 bytes -> chunked - ver, err := bs.CheckAndPut(ctx, "", "big", int64(len(want)), bytes.NewReader(want)) - require.NoError(t, err) - require.NotEmpty(t, ver) - - got, ver2, err := GetBytes(ctx, bs, "big", AllRange) - require.NoError(t, err) - require.Equal(t, ver, ver2) - require.Equal(t, want, got) -} - -type failReadSeeker struct{} - -func (f failReadSeeker) Read(p []byte) (int, error) { - return 0, errors.New("read should not be called") -} - -func TestGitBlobstore_CheckAndPut_MismatchDoesNotConsumeReader(t *testing.T) { - requireGitOnPath(t) - - ctx := context.Background() - repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") - require.NoError(t, err) - - // Seed any commit so actualVersion != "". - bs0, err := NewGitBlobstoreWithOptions(repo.GitDir, DoltDataRef, GitBlobstoreOptions{Identity: testIdentity()}) - require.NoError(t, err) - _, err = bs0.Put(ctx, "x", 1, bytes.NewReader([]byte("x"))) - require.NoError(t, err) - - bs, err := NewGitBlobstoreWithOptions(repo.GitDir, DoltDataRef, GitBlobstoreOptions{ - Identity: testIdentity(), - MaxPartSize: 3, - }) - require.NoError(t, err) - - // Provide a wrong expectedVersion; should fail without reading. 
- _, err = bs.CheckAndPut(ctx, "definitely-wrong", "y", 1, io.Reader(failReadSeeker{})) - require.Error(t, err) - require.True(t, IsCheckAndPutError(err)) -} diff --git a/go/store/blobstore/git_blobstore_chunked_get_test.go b/go/store/blobstore/git_blobstore_chunked_get_test.go index 923f91071a..a0efbaf0ec 100644 --- a/go/store/blobstore/git_blobstore_chunked_get_test.go +++ b/go/store/blobstore/git_blobstore_chunked_get_test.go @@ -15,19 +15,16 @@ package blobstore import ( - "bytes" "context" "os/exec" "testing" "github.com/stretchr/testify/require" - git "github.com/dolthub/dolt/go/store/blobstore/internal/git" - gitbs "github.com/dolthub/dolt/go/store/blobstore/internal/gitbs" "github.com/dolthub/dolt/go/store/testutils/gitrepo" ) -func TestGitBlobstore_Get_ChunkedDescriptor_AllAndRanges(t *testing.T) { +func TestGitBlobstore_Get_ChunkedTree_AllAndRanges(t *testing.T) { if _, err := exec.LookPath("git"); err != nil { t.Skip("git not found on PATH") } @@ -36,49 +33,13 @@ func TestGitBlobstore_Get_ChunkedDescriptor_AllAndRanges(t *testing.T) { repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") require.NoError(t, err) - runner, err := git.NewRunner(repo.GitDir) - require.NoError(t, err) - api := git.NewGitAPIImpl(runner) - - // Create two part blobs. 
part1 := []byte("abc") part2 := []byte("defgh") - oid1, err := api.HashObject(ctx, bytes.NewReader(part1)) + commitOID, err := repo.SetRefToTree(ctx, DoltDataRef, map[string][]byte{ + "chunked/0001": part1, + "chunked/0002": part2, + }, "seed chunked tree") require.NoError(t, err) - oid2, err := api.HashObject(ctx, bytes.NewReader(part2)) - require.NoError(t, err) - - desc := gitbs.Descriptor{ - TotalSize: uint64(len(part1) + len(part2)), - Parts: []gitbs.PartRef{ - {OIDHex: oid1.String(), Size: uint64(len(part1))}, - {OIDHex: oid2.String(), Size: uint64(len(part2))}, - }, - } - descBytes, err := gitbs.EncodeDescriptor(desc) - require.NoError(t, err) - descOID, err := api.HashObject(ctx, bytes.NewReader(descBytes)) - require.NoError(t, err) - - // Build a commit whose tree contains: - // - key "chunked" -> descriptor blob - // - parts staged under reserved parts namespace (reachability) - _, indexFile, cleanup, err := newTempIndex() - require.NoError(t, err) - defer cleanup() - - require.NoError(t, api.ReadTreeEmpty(ctx, indexFile)) - require.NoError(t, api.UpdateIndexCacheInfo(ctx, indexFile, "100644", descOID, "chunked")) - _, err = stagePartReachable(ctx, api, indexFile, oid1) - require.NoError(t, err) - _, err = stagePartReachable(ctx, api, indexFile, oid2) - require.NoError(t, err) - - treeOID, err := api.WriteTree(ctx, indexFile) - require.NoError(t, err) - commitOID, err := api.CommitTree(ctx, treeOID, nil, "seed chunked descriptor", &git.Identity{Name: "t", Email: "t@t"}) - require.NoError(t, err) - require.NoError(t, api.UpdateRef(ctx, DoltDataRef, commitOID, "seed")) bs, err := NewGitBlobstore(repo.GitDir, DoltDataRef) require.NoError(t, err) @@ -87,40 +48,47 @@ func TestGitBlobstore_Get_ChunkedDescriptor_AllAndRanges(t *testing.T) { got, ver, err := GetBytes(ctx, bs, "chunked", AllRange) require.NoError(t, err) - require.Equal(t, commitOID.String(), ver) + require.Equal(t, commitOID, ver) require.Equal(t, wantAll, got) // Range spanning boundary: offset 
2 length 4 => "cdef" got, ver, err = GetBytes(ctx, bs, "chunked", NewBlobRange(2, 4)) require.NoError(t, err) - require.Equal(t, commitOID.String(), ver) + require.Equal(t, commitOID, ver) require.Equal(t, []byte("cdef"), got) // Tail read last 3 bytes => "fgh" got, ver, err = GetBytes(ctx, bs, "chunked", NewBlobRange(-3, 0)) require.NoError(t, err) - require.Equal(t, commitOID.String(), ver) + require.Equal(t, commitOID, ver) require.Equal(t, []byte("fgh"), got) - // Validate size returned is logical size, not descriptor size. + // Validate size returned is logical size. rc, sz, ver2, err := bs.Get(ctx, "chunked", NewBlobRange(0, 1)) require.NoError(t, err) require.Equal(t, uint64(len(wantAll)), sz) - require.Equal(t, commitOID.String(), ver2) + require.Equal(t, commitOID, ver2) _ = rc.Close() - - // Also verify "inline blob that happens to start with magic" is treated as inline - // if it doesn't match the descriptor prefix (magic + size line). - inline := "DOLTBS1\nthis is not a descriptor\n" - inlineCommit, err := repo.SetRefToTree(ctx, DoltDataRef, map[string][]byte{ - "inline": []byte(inline), - }, "seed inline magic") - require.NoError(t, err) - - bs2, err := NewGitBlobstore(repo.GitDir, DoltDataRef) - require.NoError(t, err) - got2, ver3, err := GetBytes(ctx, bs2, "inline", AllRange) - require.NoError(t, err) - require.Equal(t, inlineCommit, ver3) - require.Equal(t, []byte(inline), got2) +} + +func TestGitBlobstore_Get_ChunkedTree_InvalidPartsError(t *testing.T) { + requireGitOnPath(t) + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + // Gap: 0001, 0003 + _, err = repo.SetRefToTree(ctx, DoltDataRef, map[string][]byte{ + "chunked/0001": []byte("a"), + "chunked/0003": []byte("b"), + }, "seed invalid chunked tree") + require.NoError(t, err) + + bs, err := NewGitBlobstore(repo.GitDir, DoltDataRef) + require.NoError(t, err) + + _, _, err = GetBytes(ctx, bs, "chunked", AllRange) + 
require.Error(t, err) + require.False(t, IsNotFoundError(err)) } diff --git a/go/store/blobstore/git_blobstore_chunked_put_test.go b/go/store/blobstore/git_blobstore_chunked_put_test.go deleted file mode 100644 index 5c16b9ec77..0000000000 --- a/go/store/blobstore/git_blobstore_chunked_put_test.go +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright 2026 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package blobstore - -import ( - "bytes" - "context" - "io" - "testing" - - "github.com/stretchr/testify/require" - - git "github.com/dolthub/dolt/go/store/blobstore/internal/git" - gitbs "github.com/dolthub/dolt/go/store/blobstore/internal/gitbs" - "github.com/dolthub/dolt/go/store/testutils/gitrepo" -) - -func TestGitBlobstore_Put_ChunkedUnderMaxPartSize(t *testing.T) { - requireGitOnPath(t) - - ctx := context.Background() - repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") - require.NoError(t, err) - - bs, err := NewGitBlobstoreWithOptions(repo.GitDir, DoltDataRef, GitBlobstoreOptions{ - Identity: testIdentity(), - MaxPartSize: 3, - }) - require.NoError(t, err) - - want := []byte("abcdefghij") // 10 bytes -> 3,3,3,1 - ver, err := bs.Put(ctx, "big", int64(len(want)), bytes.NewReader(want)) - require.NoError(t, err) - - got, ver2, err := GetBytes(ctx, bs, "big", AllRange) - require.NoError(t, err) - require.Equal(t, ver, ver2) - require.Equal(t, want, got) - - runner, err := git.NewRunner(repo.GitDir) - require.NoError(t, err) - api := 
git.NewGitAPIImpl(runner) - - commit := git.OID(ver) - keyOID, err := api.ResolvePathBlob(ctx, commit, "big") - require.NoError(t, err) - - rc, err := api.BlobReader(ctx, keyOID) - require.NoError(t, err) - descBytes, err := io.ReadAll(rc) - require.NoError(t, err) - require.NoError(t, rc.Close()) - - desc, err := gitbs.ParseDescriptor(descBytes) - require.NoError(t, err) - require.Equal(t, uint64(len(want)), desc.TotalSize) - require.GreaterOrEqual(t, len(desc.Parts), 2) - - for _, p := range desc.Parts { - require.LessOrEqual(t, p.Size, uint64(3)) - ppath, err := gitbs.PartPath(p.OIDHex) - require.NoError(t, err) - gotOID, err := api.ResolvePathBlob(ctx, commit, ppath) - require.NoError(t, err) - require.Equal(t, git.OID(p.OIDHex), gotOID) - } - - // Range spanning boundary (offset 2, length 4) => "cdef" - got, _, err = GetBytes(ctx, bs, "big", NewBlobRange(2, 4)) - require.NoError(t, err) - require.Equal(t, []byte("cdef"), got) -} diff --git a/go/store/blobstore/git_blobstore_helpers_test.go b/go/store/blobstore/git_blobstore_helpers_test.go index c590a0bcc0..c2c9f4d5dd 100644 --- a/go/store/blobstore/git_blobstore_helpers_test.go +++ b/go/store/blobstore/git_blobstore_helpers_test.go @@ -15,7 +15,6 @@ package blobstore import ( - "bytes" "context" "errors" "io" @@ -29,6 +28,7 @@ import ( type fakeGitAPI struct { tryResolveRefCommit func(ctx context.Context, ref string) (git.OID, bool, error) resolvePathBlob func(ctx context.Context, commit git.OID, path string) (git.OID, error) + resolvePathObject func(ctx context.Context, commit git.OID, path string) (git.OID, string, error) blobSize func(ctx context.Context, oid git.OID) (int64, error) blobReader func(ctx context.Context, oid git.OID) (io.ReadCloser, error) } @@ -42,6 +42,12 @@ func (f fakeGitAPI) ResolveRefCommit(ctx context.Context, ref string) (git.OID, func (f fakeGitAPI) ResolvePathBlob(ctx context.Context, commit git.OID, path string) (git.OID, error) { return f.resolvePathBlob(ctx, commit, path) } 
+func (f fakeGitAPI) ResolvePathObject(ctx context.Context, commit git.OID, path string) (git.OID, string, error) { + return f.resolvePathObject(ctx, commit, path) +} +func (f fakeGitAPI) ListTree(ctx context.Context, commit git.OID, treePath string) ([]git.TreeEntry, error) { + panic("unexpected call") +} func (f fakeGitAPI) CatFileType(ctx context.Context, oid git.OID) (string, error) { panic("unexpected call") } @@ -63,6 +69,9 @@ func (f fakeGitAPI) ReadTreeEmpty(ctx context.Context, indexFile string) error { func (f fakeGitAPI) UpdateIndexCacheInfo(ctx context.Context, indexFile string, mode string, oid git.OID, path string) error { panic("unexpected call") } +func (f fakeGitAPI) RemoveIndexPaths(ctx context.Context, indexFile string, paths []string) error { + panic("unexpected call") +} func (f fakeGitAPI) WriteTree(ctx context.Context, indexFile string) (git.OID, error) { panic("unexpected call") } @@ -76,16 +85,6 @@ func (f fakeGitAPI) UpdateRef(ctx context.Context, ref string, newOID git.OID, m panic("unexpected call") } -type trackingReadCloser struct { - io.Reader - closed bool -} - -func (t *trackingReadCloser) Close() error { - t.closed = true - return nil -} - func TestGitBlobstoreHelpers_resolveCommitForGet(t *testing.T) { ctx := context.Background() @@ -146,35 +145,36 @@ func TestGitBlobstoreHelpers_resolveCommitForGet(t *testing.T) { }) } -func TestGitBlobstoreHelpers_resolveBlobForGet(t *testing.T) { +func TestGitBlobstoreHelpers_resolveObjectForGet(t *testing.T) { ctx := context.Background() commit := git.OID("0123456789abcdef0123456789abcdef01234567") t.Run("ok", func(t *testing.T) { api := fakeGitAPI{ - resolvePathBlob: func(ctx context.Context, gotCommit git.OID, path string) (git.OID, error) { + resolvePathObject: func(ctx context.Context, gotCommit git.OID, path string) (git.OID, string, error) { require.Equal(t, commit, gotCommit) require.Equal(t, "k", path) - return git.OID("89abcdef0123456789abcdef0123456789abcdef"), nil + return 
git.OID("89abcdef0123456789abcdef0123456789abcdef"), "blob", nil }, } gbs := &GitBlobstore{api: api} - oid, ver, err := gbs.resolveBlobForGet(ctx, commit, "k") + oid, typ, ver, err := gbs.resolveObjectForGet(ctx, commit, "k") require.NoError(t, err) require.Equal(t, "0123456789abcdef0123456789abcdef01234567", ver) + require.Equal(t, "blob", typ) require.Equal(t, git.OID("89abcdef0123456789abcdef0123456789abcdef"), oid) }) t.Run("pathNotFoundMapsToNotFound", func(t *testing.T) { api := fakeGitAPI{ - resolvePathBlob: func(ctx context.Context, gotCommit git.OID, path string) (git.OID, error) { - return git.OID(""), &git.PathNotFoundError{Commit: gotCommit.String(), Path: path} + resolvePathObject: func(ctx context.Context, gotCommit git.OID, path string) (git.OID, string, error) { + return git.OID(""), "", &git.PathNotFoundError{Commit: gotCommit.String(), Path: path} }, } gbs := &GitBlobstore{api: api} - _, ver, err := gbs.resolveBlobForGet(ctx, commit, "k") + _, _, ver, err := gbs.resolveObjectForGet(ctx, commit, "k") require.Equal(t, commit.String(), ver) var nf NotFound require.ErrorAs(t, err, &nf) @@ -203,48 +203,33 @@ func TestGitBlobstoreHelpers_resolveBlobSizeForGet(t *testing.T) { }) } -func TestGitBlobstoreHelpers_reopenInlineBlobReaderClosesOriginal(t *testing.T) { +func TestGitBlobstoreHelpers_validateAndSizeChunkedParts(t *testing.T) { ctx := context.Background() - blobOID := git.OID("0123456789abcdef0123456789abcdef01234567") - orig := &trackingReadCloser{Reader: bytes.NewReader([]byte("x"))} api := fakeGitAPI{ - blobReader: func(ctx context.Context, gotOID git.OID) (io.ReadCloser, error) { - require.Equal(t, blobOID, gotOID) - return io.NopCloser(bytes.NewReader([]byte("y"))), nil + blobSize: func(ctx context.Context, oid git.OID) (int64, error) { + switch oid { + case "0123456789abcdef0123456789abcdef01234567": + return 3, nil + case "89abcdef0123456789abcdef0123456789abcdef": + return 5, nil + default: + return 0, errors.New("unexpected oid") + } }, } 
gbs := &GitBlobstore{api: api} - rc, err := gbs.reopenInlineBlobReader(ctx, orig, blobOID) + parts, total, err := gbs.validateAndSizeChunkedParts(ctx, []git.TreeEntry{ + {Name: "0001", Type: "blob", OID: "0123456789abcdef0123456789abcdef01234567"}, + {Name: "0002", Type: "blob", OID: "89abcdef0123456789abcdef0123456789abcdef"}, + }) require.NoError(t, err) - require.True(t, orig.closed) - require.NotNil(t, rc) - _ = rc.Close() -} + require.Equal(t, uint64(8), total) + require.Len(t, parts, 2) + require.Equal(t, "0123456789abcdef0123456789abcdef01234567", parts[0].OIDHex) + require.Equal(t, uint64(3), parts[0].Size) -func TestReadAtMost(t *testing.T) { - out, err := readAtMost(bytes.NewReader([]byte("hello")), 3) - require.NoError(t, err) - require.Equal(t, []byte("hel"), out) - - out, err = readAtMost(bytes.NewReader([]byte("hi")), 3) - require.NoError(t, err) - require.Equal(t, []byte("hi"), out) -} - -func TestReadFullBlobBounded(t *testing.T) { - // Reads through to blobSize when within max. - // Note: |already| is expected to be a prefix read from |r|, so |r| must represent the - // remaining stream after the prefix has been consumed. - r := bytes.NewReader([]byte("cdef")) - got, err := readFullBlobBounded(r, []byte("ab"), 6, 64) - require.NoError(t, err) - require.Equal(t, []byte("abcdef"), got) - - // Errors if blobSize exceeds max and we hit the cap. 
- r = bytes.NewReader(bytes.Repeat([]byte("x"), 100)) - _, err = readFullBlobBounded(r, bytes.Repeat([]byte("x"), 10), 100000, 10) + _, _, err = gbs.validateAndSizeChunkedParts(ctx, []git.TreeEntry{{Name: "1", Type: "blob", OID: "0123456789abcdef0123456789abcdef01234567"}}) require.Error(t, err) - require.Contains(t, err.Error(), "descriptor too large") } diff --git a/go/store/blobstore/git_blobstore_multipart_test.go b/go/store/blobstore/git_blobstore_multipart_test.go index 883c2b67bd..45b48c9ec5 100644 --- a/go/store/blobstore/git_blobstore_multipart_test.go +++ b/go/store/blobstore/git_blobstore_multipart_test.go @@ -27,6 +27,16 @@ import ( gitbs "github.com/dolthub/dolt/go/store/blobstore/internal/gitbs" ) +type trackingReadCloser struct { + io.Reader + closed bool +} + +func (t *trackingReadCloser) Close() error { + t.closed = true + return nil +} + func TestMultiPartReadCloser_ReadConcatenatesAcrossPartsWithOffsets(t *testing.T) { ctx := context.Background() diff --git a/go/store/blobstore/internal/git/api.go b/go/store/blobstore/internal/git/api.go index d767f5bc81..7e5851b06d 100644 --- a/go/store/blobstore/internal/git/api.go +++ b/go/store/blobstore/internal/git/api.go @@ -34,6 +34,18 @@ type GitAPI interface { // resolves to a non-blob object. ResolvePathBlob(ctx context.Context, commit OID, path string) (OID, error) + // ResolvePathObject resolves |path| within |commit| to an object OID and type. + // It returns PathNotFoundError if the path does not exist. + // + // Typical types are "blob" and "tree". + ResolvePathObject(ctx context.Context, commit OID, path string) (oid OID, typ string, err error) + + // ListTree lists the entries of the tree at |treePath| within |commit|. + // The listing is non-recursive: it returns only immediate children. + // + // It returns PathNotFoundError if |treePath| does not exist. 
+ ListTree(ctx context.Context, commit OID, treePath string) ([]TreeEntry, error) + // CatFileType returns the git object type for |oid| (e.g. "blob", "tree", "commit"). CatFileType(ctx context.Context, oid OID) (string, error) @@ -63,6 +75,11 @@ type GitAPI interface { // GIT_DIR=... GIT_INDEX_FILE= git update-index --add --cacheinfo UpdateIndexCacheInfo(ctx context.Context, indexFile string, mode string, oid OID, path string) error + // RemoveIndexPaths removes |paths| from |indexFile| if present. + // Equivalent plumbing: + // GIT_DIR=... GIT_INDEX_FILE= git update-index --remove -z --stdin + RemoveIndexPaths(ctx context.Context, indexFile string, paths []string) error + // WriteTree writes a tree object from the contents of |indexFile| and returns its oid. // Equivalent plumbing: // GIT_DIR=... GIT_INDEX_FILE= git write-tree @@ -84,6 +101,14 @@ type GitAPI interface { UpdateRef(ctx context.Context, ref string, newOID OID, msg string) error } +// TreeEntry describes one entry in a git tree listing. +type TreeEntry struct { + Mode string + Type string + OID OID + Name string +} + // Identity represents git author/committer metadata. A future implementation may set // this via environment variables (GIT_AUTHOR_NAME, etc.). 
type Identity struct { diff --git a/go/store/blobstore/internal/git/impl.go b/go/store/blobstore/internal/git/impl.go index 53d63edeb8..324aa97b92 100644 --- a/go/store/blobstore/internal/git/impl.go +++ b/go/store/blobstore/internal/git/impl.go @@ -88,6 +88,62 @@ func (a *GitAPIImpl) ResolvePathBlob(ctx context.Context, commit OID, path strin return OID(oid), nil } +func (a *GitAPIImpl) ResolvePathObject(ctx context.Context, commit OID, path string) (oid OID, typ string, err error) { + spec := commit.String() + ":" + path + out, err := a.r.Run(ctx, RunOptions{}, "rev-parse", "--verify", spec) + if err != nil { + if isPathNotFoundErr(err) { + return "", "", &PathNotFoundError{Commit: commit.String(), Path: path} + } + return "", "", err + } + oidStr := strings.TrimSpace(string(out)) + if oidStr == "" { + return "", "", fmt.Errorf("git rev-parse returned empty oid for %q", spec) + } + + typ, err = a.CatFileType(ctx, OID(oidStr)) + if err != nil { + return "", "", err + } + return OID(oidStr), typ, nil +} + +func (a *GitAPIImpl) ListTree(ctx context.Context, commit OID, treePath string) ([]TreeEntry, error) { + // Note: `git ls-tree ` accepts a tree-ish of the form ":". + // Use that to list children of a tree path without needing to pre-resolve the tree OID. 
+ spec := commit.String() + if treePath != "" { + spec = spec + ":" + treePath + } else { + spec = spec + "^{tree}" + } + + out, err := a.r.Run(ctx, RunOptions{}, "ls-tree", spec) + if err != nil { + if isPathNotFoundErr(err) && treePath != "" { + return nil, &PathNotFoundError{Commit: commit.String(), Path: treePath} + } + return nil, err + } + lines := strings.Split(strings.TrimRight(string(out), "\n"), "\n") + if len(lines) == 1 && strings.TrimSpace(lines[0]) == "" { + return nil, nil + } + entries := make([]TreeEntry, 0, len(lines)) + for _, line := range lines { + if strings.TrimSpace(line) == "" { + continue + } + e, err := parseLsTreeLine(line) + if err != nil { + return nil, err + } + entries = append(entries, e) + } + return entries, nil +} + func (a *GitAPIImpl) CatFileType(ctx context.Context, oid OID) (string, error) { out, err := a.r.Run(ctx, RunOptions{}, "cat-file", "-t", oid.String()) if err != nil { @@ -141,6 +197,26 @@ func (a *GitAPIImpl) UpdateIndexCacheInfo(ctx context.Context, indexFile string, return err } +func (a *GitAPIImpl) RemoveIndexPaths(ctx context.Context, indexFile string, paths []string) error { + if len(paths) == 0 { + return nil + } + var buf bytes.Buffer + // `git update-index --remove` is about removing *missing worktree files*, and requires a worktree. + // For bare repos / index-only workflows, use `--index-info` to remove paths by writing mode "0". 
+ // + // Format: + // \t\n + // To remove: + // 0 0000000000000000000000000000000000000000 0\t\n + const zeroOID = "0000000000000000000000000000000000000000" + for _, p := range paths { + fmt.Fprintf(&buf, "0 %s 0\t%s\n", zeroOID, p) + } + _, err := a.r.Run(ctx, RunOptions{IndexFile: indexFile, Stdin: &buf}, "update-index", "--index-info") + return err +} + func (a *GitAPIImpl) WriteTree(ctx context.Context, indexFile string) (OID, error) { out, err := a.r.Run(ctx, RunOptions{IndexFile: indexFile}, "write-tree") if err != nil { @@ -243,3 +319,24 @@ func isPathNotFoundErr(err error) bool { } return false } + +func parseLsTreeLine(line string) (TreeEntry, error) { + // Format (one entry): + // SP SP \t + // Example: + // 100644 blob e69de29bb2d1d6434b8b29ae775ad8c2e48c5391\tfile.txt + parts := strings.SplitN(line, "\t", 2) + if len(parts) != 2 { + return TreeEntry{}, fmt.Errorf("git ls-tree: malformed line %q", line) + } + left := strings.Fields(parts[0]) + if len(left) != 3 { + return TreeEntry{}, fmt.Errorf("git ls-tree: malformed line %q", line) + } + return TreeEntry{ + Mode: left[0], + Type: left[1], + OID: OID(left[2]), + Name: parts[1], + }, nil +} diff --git a/go/store/blobstore/internal/git/impl_test.go b/go/store/blobstore/internal/git/impl_test.go index 8219f27ce1..71589eb96c 100644 --- a/go/store/blobstore/internal/git/impl_test.go +++ b/go/store/blobstore/internal/git/impl_test.go @@ -366,6 +366,216 @@ func TestGitAPIImpl_UpdateIndexCacheInfo_FileDirectoryConflictErrors(t *testing. 
} } +func TestGitAPIImpl_ResolvePathObject_BlobAndTree(t *testing.T) { + t.Parallel() + + ctx := context.Background() + repo, err := gitrepo.InitBareTemp(ctx, "") + if err != nil { + t.Fatal(err) + } + + r, err := NewRunner(repo.GitDir) + if err != nil { + t.Fatal(err) + } + api := NewGitAPIImpl(r) + + indexFile := tempIndexFile(t) + if err := api.ReadTreeEmpty(ctx, indexFile); err != nil { + t.Fatal(err) + } + + blobOID, err := api.HashObject(ctx, bytes.NewReader([]byte("hi\n"))) + if err != nil { + t.Fatal(err) + } + if err := api.UpdateIndexCacheInfo(ctx, indexFile, "100644", blobOID, "dir/file.txt"); err != nil { + t.Fatal(err) + } + treeOID, err := api.WriteTree(ctx, indexFile) + if err != nil { + t.Fatal(err) + } + commitOID, err := api.CommitTree(ctx, treeOID, nil, "seed", testAuthor()) + if err != nil { + t.Fatal(err) + } + + gotOID, gotTyp, err := api.ResolvePathObject(ctx, commitOID, "dir/file.txt") + if err != nil { + t.Fatal(err) + } + if gotTyp != "blob" { + t.Fatalf("expected type blob, got %q", gotTyp) + } + if gotOID != blobOID { + t.Fatalf("expected oid %q, got %q", blobOID, gotOID) + } + + _, gotTyp, err = api.ResolvePathObject(ctx, commitOID, "dir") + if err != nil { + t.Fatal(err) + } + if gotTyp != "tree" { + t.Fatalf("expected type tree, got %q", gotTyp) + } +} + +func TestGitAPIImpl_ListTree_NonRecursive(t *testing.T) { + t.Parallel() + + ctx := context.Background() + repo, err := gitrepo.InitBareTemp(ctx, "") + if err != nil { + t.Fatal(err) + } + + r, err := NewRunner(repo.GitDir) + if err != nil { + t.Fatal(err) + } + api := NewGitAPIImpl(r) + + indexFile := tempIndexFile(t) + if err := api.ReadTreeEmpty(ctx, indexFile); err != nil { + t.Fatal(err) + } + + oidA, err := api.HashObject(ctx, bytes.NewReader([]byte("a\n"))) + if err != nil { + t.Fatal(err) + } + oidB, err := api.HashObject(ctx, bytes.NewReader([]byte("b\n"))) + if err != nil { + t.Fatal(err) + } + oidX, err := api.HashObject(ctx, bytes.NewReader([]byte("x\n"))) + if err != nil 
{ + t.Fatal(err) + } + if err := api.UpdateIndexCacheInfo(ctx, indexFile, "100644", oidA, "dir/a.txt"); err != nil { + t.Fatal(err) + } + if err := api.UpdateIndexCacheInfo(ctx, indexFile, "100644", oidB, "dir/b.txt"); err != nil { + t.Fatal(err) + } + if err := api.UpdateIndexCacheInfo(ctx, indexFile, "100644", oidX, "dir/sub/x.txt"); err != nil { + t.Fatal(err) + } + + treeOID, err := api.WriteTree(ctx, indexFile) + if err != nil { + t.Fatal(err) + } + commitOID, err := api.CommitTree(ctx, treeOID, nil, "seed", testAuthor()) + if err != nil { + t.Fatal(err) + } + + entries, err := api.ListTree(ctx, commitOID, "dir") + if err != nil { + t.Fatal(err) + } + // Expect: a.txt (blob), b.txt (blob), sub (tree) + if len(entries) != 3 { + t.Fatalf("expected 3 entries, got %d: %+v", len(entries), entries) + } + + var gotA, gotB, gotSub bool + for _, e := range entries { + switch e.Name { + case "a.txt": + gotA = true + if e.Type != "blob" || e.OID != oidA { + t.Fatalf("unexpected a.txt entry: %+v", e) + } + case "b.txt": + gotB = true + if e.Type != "blob" || e.OID != oidB { + t.Fatalf("unexpected b.txt entry: %+v", e) + } + case "sub": + gotSub = true + if e.Type != "tree" || e.OID == "" { + t.Fatalf("unexpected sub entry: %+v", e) + } + default: + t.Fatalf("unexpected entry: %+v", e) + } + } + if !gotA || !gotB || !gotSub { + t.Fatalf("missing expected entries: gotA=%v gotB=%v gotSub=%v", gotA, gotB, gotSub) + } +} + +func TestGitAPIImpl_RemoveIndexPaths_RemovesFromIndex(t *testing.T) { + t.Parallel() + + ctx := context.Background() + repo, err := gitrepo.InitBareTemp(ctx, "") + if err != nil { + t.Fatal(err) + } + + r, err := NewRunner(repo.GitDir) + if err != nil { + t.Fatal(err) + } + api := NewGitAPIImpl(r) + + indexFile := tempIndexFile(t) + if err := api.ReadTreeEmpty(ctx, indexFile); err != nil { + t.Fatal(err) + } + + oidA, err := api.HashObject(ctx, bytes.NewReader([]byte("a\n"))) + if err != nil { + t.Fatal(err) + } + oidB, err := api.HashObject(ctx, 
bytes.NewReader([]byte("b\n"))) + if err != nil { + t.Fatal(err) + } + if err := api.UpdateIndexCacheInfo(ctx, indexFile, "100644", oidA, "a.txt"); err != nil { + t.Fatal(err) + } + if err := api.UpdateIndexCacheInfo(ctx, indexFile, "100644", oidB, "b.txt"); err != nil { + t.Fatal(err) + } + + if err := api.RemoveIndexPaths(ctx, indexFile, []string{"a.txt"}); err != nil { + t.Fatal(err) + } + + treeOID, err := api.WriteTree(ctx, indexFile) + if err != nil { + t.Fatal(err) + } + commitOID, err := api.CommitTree(ctx, treeOID, nil, "seed", testAuthor()) + if err != nil { + t.Fatal(err) + } + + // a.txt removed, b.txt still present + _, err = api.ResolvePathBlob(ctx, commitOID, "a.txt") + if err == nil { + t.Fatalf("expected a.txt missing") + } + var pnf *PathNotFoundError + if !errors.As(err, &pnf) { + t.Fatalf("expected PathNotFoundError, got %T: %v", err, err) + } + + gotB, err := api.ResolvePathBlob(ctx, commitOID, "b.txt") + if err != nil { + t.Fatal(err) + } + if gotB != oidB { + t.Fatalf("expected b.txt oid %q, got %q", oidB, gotB) + } +} + func TestGitAPIImpl_ReadTree_PreservesExistingPaths(t *testing.T) { t.Parallel() From 222180155474439a223c4bd05c71746ed343a79e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Thu, 5 Feb 2026 16:59:29 -0800 Subject: [PATCH 13/28] /go/store/blobstore: cleanup --- go/store/blobstore/git_blobstore.go | 166 +++++++++--- .../git_blobstore_chunked_checkandput_test.go | 81 ++++++ .../git_blobstore_chunked_put_test.go | 102 ++++++++ go/store/blobstore/git_blobstore_parts.go | 40 --- .../blobstore/git_blobstore_parts_test.go | 67 ----- .../blobstore/internal/gitbs/descriptor.go | 238 ------------------ .../internal/gitbs/descriptor_helpers_test.go | 69 ----- .../internal/gitbs/descriptor_test.go | 91 ------- go/store/blobstore/internal/gitbs/oid.go | 39 --- .../blobstore/internal/gitbs/parts_path.go | 40 --- .../internal/gitbs/parts_path_test.go | 40 --- 
go/store/blobstore/internal/gitbs/ranges.go | 6 + 12 files changed, 314 insertions(+), 665 deletions(-) create mode 100644 go/store/blobstore/git_blobstore_chunked_checkandput_test.go create mode 100644 go/store/blobstore/git_blobstore_chunked_put_test.go delete mode 100644 go/store/blobstore/git_blobstore_parts.go delete mode 100644 go/store/blobstore/git_blobstore_parts_test.go delete mode 100644 go/store/blobstore/internal/gitbs/descriptor.go delete mode 100644 go/store/blobstore/internal/gitbs/descriptor_helpers_test.go delete mode 100644 go/store/blobstore/internal/gitbs/descriptor_test.go delete mode 100644 go/store/blobstore/internal/gitbs/oid.go delete mode 100644 go/store/blobstore/internal/gitbs/parts_path.go delete mode 100644 go/store/blobstore/internal/gitbs/parts_path_test.go diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index 645d66db1a..3ed803169d 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -34,6 +34,8 @@ import ( gitbs "github.com/dolthub/dolt/go/store/blobstore/internal/gitbs" ) +const gitblobstorePartNameWidth = 8 // "00000001" + // GitBlobstore is a Blobstore implementation backed by a git repository's object // database (bare repo or .git directory). It stores keys as paths within the tree // of the commit referenced by a git ref (e.g. refs/dolt/data). @@ -452,7 +454,7 @@ func (gbs *GitBlobstore) Put(ctx context.Context, key string, totalSize int64, r // Hash the contents once. If we need to retry due to concurrent updates to |gbs.ref|, // we can reuse the resulting object OIDs without re-reading |reader|. 
- writes, err := gbs.planPutWrites(ctx, key, totalSize, reader) + plan, err := gbs.planPutWrites(ctx, key, totalSize, reader) if err != nil { return "", err } @@ -476,7 +478,7 @@ func (gbs *GitBlobstore) Put(ctx context.Context, key string, totalSize int64, r return backoff.Permanent(err) } - newCommit, err := gbs.buildCommitWithWrites(ctx, parent, ok, writes, msg) + newCommit, err := gbs.buildCommitForKeyWrite(ctx, parent, ok, key, plan, msg) if err != nil { return backoff.Permanent(err) } @@ -580,53 +582,135 @@ func (gbs *GitBlobstore) buildCommitWithWrites(ctx context.Context, parent git.O return commitOID, nil } -func (gbs *GitBlobstore) planPutWrites(ctx context.Context, key string, totalSize int64, reader io.Reader) ([]treeWrite, error) { +func (gbs *GitBlobstore) buildCommitForKeyWrite(ctx context.Context, parent git.OID, hasParent bool, key string, plan putPlan, msg string) (git.OID, error) { + _, indexFile, cleanup, err := newTempIndex() + if err != nil { + return "", err + } + defer cleanup() + + if hasParent { + if err := gbs.api.ReadTree(ctx, parent, indexFile); err != nil { + return "", err + } + } else { + if err := gbs.api.ReadTreeEmpty(ctx, indexFile); err != nil { + return "", err + } + } + + if hasParent { + if err := gbs.removeKeyConflictsFromIndex(ctx, parent, indexFile, key, plan.chunked); err != nil { + return "", err + } + } + + sort.Slice(plan.writes, func(i, j int) bool { return plan.writes[i].path < plan.writes[j].path }) + for _, w := range plan.writes { + if err := gbs.api.UpdateIndexCacheInfo(ctx, indexFile, "100644", w.oid, w.path); err != nil { + return "", err + } + } + + treeOID, err := gbs.api.WriteTree(ctx, indexFile) + if err != nil { + return "", err + } + + var parentPtr *git.OID + if hasParent && parent != "" { + p := parent + parentPtr = &p + } + + commitOID, err := gbs.api.CommitTree(ctx, treeOID, parentPtr, msg, gbs.identity) + if err != nil && gbs.identity == nil && isMissingGitIdentityErr(err) { + commitOID, err = 
gbs.api.CommitTree(ctx, treeOID, parentPtr, msg, defaultGitBlobstoreIdentity()) + } + if err != nil { + return "", err + } + return commitOID, nil +} + +func (gbs *GitBlobstore) removeKeyConflictsFromIndex(ctx context.Context, parent git.OID, indexFile string, key string, newIsChunked bool) error { + oid, typ, err := gbs.api.ResolvePathObject(ctx, parent, key) + if err != nil { + if git.IsPathNotFound(err) { + return nil + } + return err + } + _ = oid + + switch typ { + case "blob": + if newIsChunked { + // blob -> tree: must remove the file entry at + return gbs.api.RemoveIndexPaths(ctx, indexFile, []string{key}) + } + return nil + + case "tree": + // tree -> blob OR tree overwrite: remove old child entries under /... + entries, err := gbs.api.ListTree(ctx, parent, key) + if err != nil { + return err + } + if len(entries) == 0 { + return nil + } + paths := make([]string, 0, len(entries)) + for _, e := range entries { + paths = append(paths, key+"/"+e.Name) + } + return gbs.api.RemoveIndexPaths(ctx, indexFile, paths) + + default: + return fmt.Errorf("gitblobstore: unsupported existing object type %q at key %q", typ, key) + } +} + +type putPlan struct { + writes []treeWrite + // If true, the key should be represented as a tree (chunked parts under key/NNNNNNNN). + chunked bool +} + +func (gbs *GitBlobstore) planPutWrites(ctx context.Context, key string, totalSize int64, reader io.Reader) (putPlan, error) { // Minimal policy: chunk only when explicitly enabled and |totalSize| exceeds MaxPartSize. 
if gbs.maxPartSize == 0 || totalSize <= 0 || uint64(totalSize) <= gbs.maxPartSize { blobOID, err := gbs.api.HashObject(ctx, reader) if err != nil { - return nil, err + return putPlan{}, err } - return []treeWrite{{path: key, oid: blobOID}}, nil + return putPlan{writes: []treeWrite{{path: key, oid: blobOID}}}, nil } - descOID, partOIDs, err := gbs.hashChunkedObject(ctx, reader) + partOIDs, err := gbs.hashChunkedParts(ctx, reader) + if err != nil { + return putPlan{}, err + } + + writes := make([]treeWrite, 0, len(partOIDs)) + for i, p := range partOIDs { + partName := fmt.Sprintf("%0*d", gitblobstorePartNameWidth, i+1) + writes = append(writes, treeWrite{path: key + "/" + partName, oid: p}) + } + return putPlan{writes: writes, chunked: true}, nil +} + +func (gbs *GitBlobstore) hashChunkedParts(ctx context.Context, reader io.Reader) (partOIDs []git.OID, err error) { + max := int64(gbs.maxPartSize) + if max <= 0 { + return nil, fmt.Errorf("gitblobstore: invalid maxPartSize %d", gbs.maxPartSize) + } + + _, partOIDs, _, err = gbs.hashParts(ctx, reader) if err != nil { return nil, err } - - writes := make([]treeWrite, 0, 1+len(partOIDs)) - writes = append(writes, treeWrite{path: key, oid: descOID}) - for _, p := range partOIDs { - ppath, err := gitbs.PartPath(p.String()) - if err != nil { - return nil, err - } - writes = append(writes, treeWrite{path: ppath, oid: p}) - } - return writes, nil -} - -func (gbs *GitBlobstore) hashChunkedObject(ctx context.Context, reader io.Reader) (descOID git.OID, partOIDs []git.OID, err error) { - max := int64(gbs.maxPartSize) - if max <= 0 { - return "", nil, fmt.Errorf("gitblobstore: invalid maxPartSize %d", gbs.maxPartSize) - } - - parts, partOIDs, total, err := gbs.hashParts(ctx, reader) - if err != nil { - return "", nil, err - } - - descBytes, err := gitbs.EncodeDescriptor(gitbs.Descriptor{TotalSize: total, Parts: parts}) - if err != nil { - return "", nil, err - } - descOID, err = gbs.api.HashObject(ctx, bytes.NewReader(descBytes)) 
- if err != nil { - return "", nil, err - } - return descOID, partOIDs, nil + return partOIDs, nil } func (gbs *GitBlobstore) hashParts(ctx context.Context, reader io.Reader) (parts []gitbs.PartRef, partOIDs []git.OID, total uint64, err error) { @@ -733,12 +817,12 @@ func (gbs *GitBlobstore) CheckAndPut(ctx context.Context, expectedVersion, key s } msg := fmt.Sprintf("gitblobstore: checkandput %s", key) - writes, err := gbs.planPutWrites(ctx, key, totalSize, reader) + plan, err := gbs.planPutWrites(ctx, key, totalSize, reader) if err != nil { return "", err } - newCommit, err := gbs.buildCommitWithWrites(ctx, parent, ok, writes, msg) + newCommit, err := gbs.buildCommitForKeyWrite(ctx, parent, ok, key, plan, msg) if err != nil { return "", err } diff --git a/go/store/blobstore/git_blobstore_chunked_checkandput_test.go b/go/store/blobstore/git_blobstore_chunked_checkandput_test.go new file mode 100644 index 0000000000..cc1e18ff9e --- /dev/null +++ b/go/store/blobstore/git_blobstore_chunked_checkandput_test.go @@ -0,0 +1,81 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package blobstore + +import ( + "bytes" + "context" + "errors" + "io" + "testing" + + "github.com/stretchr/testify/require" + + "github.com/dolthub/dolt/go/store/testutils/gitrepo" +) + +func TestGitBlobstore_CheckAndPut_ChunkedRoundTrip_CreateOnly(t *testing.T) { + requireGitOnPath(t) + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + bs, err := NewGitBlobstoreWithOptions(repo.GitDir, DoltDataRef, GitBlobstoreOptions{ + Identity: testIdentity(), + MaxPartSize: 3, + }) + require.NoError(t, err) + + want := []byte("abcdefghij") // 10 bytes -> chunked tree + ver, err := bs.CheckAndPut(ctx, "", "big", int64(len(want)), bytes.NewReader(want)) + require.NoError(t, err) + require.NotEmpty(t, ver) + + got, ver2, err := GetBytes(ctx, bs, "big", AllRange) + require.NoError(t, err) + require.Equal(t, ver, ver2) + require.Equal(t, want, got) +} + +type chunkedFailReader struct{} + +func (chunkedFailReader) Read(_ []byte) (int, error) { + return 0, errors.New("read should not be called") +} + +func TestGitBlobstore_CheckAndPut_MismatchDoesNotConsumeReader_WithChunkingEnabled(t *testing.T) { + requireGitOnPath(t) + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + // Seed any commit so actualVersion != "". 
+ bs0, err := NewGitBlobstoreWithOptions(repo.GitDir, DoltDataRef, GitBlobstoreOptions{Identity: testIdentity()}) + require.NoError(t, err) + _, err = bs0.Put(ctx, "x", 1, bytes.NewReader([]byte("x"))) + require.NoError(t, err) + + bs, err := NewGitBlobstoreWithOptions(repo.GitDir, DoltDataRef, GitBlobstoreOptions{ + Identity: testIdentity(), + MaxPartSize: 3, + }) + require.NoError(t, err) + + _, err = bs.CheckAndPut(ctx, "definitely-wrong", "y", 1, io.Reader(chunkedFailReader{})) + require.Error(t, err) + require.True(t, IsCheckAndPutError(err)) +} diff --git a/go/store/blobstore/git_blobstore_chunked_put_test.go b/go/store/blobstore/git_blobstore_chunked_put_test.go new file mode 100644 index 0000000000..b27cdd4cab --- /dev/null +++ b/go/store/blobstore/git_blobstore_chunked_put_test.go @@ -0,0 +1,102 @@ +// Copyright 2026 Dolthub, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package blobstore + +import ( + "bytes" + "context" + "testing" + + "github.com/stretchr/testify/require" + + git "github.com/dolthub/dolt/go/store/blobstore/internal/git" + "github.com/dolthub/dolt/go/store/testutils/gitrepo" +) + +func TestGitBlobstore_Put_ChunkedWritesTreeParts(t *testing.T) { + requireGitOnPath(t) + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + bs, err := NewGitBlobstoreWithOptions(repo.GitDir, DoltDataRef, GitBlobstoreOptions{ + Identity: testIdentity(), + MaxPartSize: 3, + }) + require.NoError(t, err) + + want := []byte("abcdefghij") // 10 bytes -> 3,3,3,1 + ver, err := bs.Put(ctx, "big", int64(len(want)), bytes.NewReader(want)) + require.NoError(t, err) + + got, ver2, err := GetBytes(ctx, bs, "big", AllRange) + require.NoError(t, err) + require.Equal(t, ver, ver2) + require.Equal(t, want, got) + + runner, err := git.NewRunner(repo.GitDir) + require.NoError(t, err) + api := git.NewGitAPIImpl(runner) + + commit := git.OID(ver) + _, typ, err := api.ResolvePathObject(ctx, commit, "big") + require.NoError(t, err) + require.Equal(t, "tree", typ) + + entries, err := api.ListTree(ctx, commit, "big") + require.NoError(t, err) + require.Len(t, entries, 4) + require.Equal(t, "00000001", entries[0].Name) + require.Equal(t, "00000004", entries[3].Name) +} + +func TestGitBlobstore_Put_TreeToBlobAndBlobToTreeTransitions(t *testing.T) { + requireGitOnPath(t) + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + bs, err := NewGitBlobstoreWithOptions(repo.GitDir, DoltDataRef, GitBlobstoreOptions{ + Identity: testIdentity(), + MaxPartSize: 3, + }) + require.NoError(t, err) + + runner, err := git.NewRunner(repo.GitDir) + require.NoError(t, err) + api := git.NewGitAPIImpl(runner) + + // blob -> tree + _, err = bs.Put(ctx, "k", 2, bytes.NewReader([]byte("hi"))) + require.NoError(t, err) + verTree, err := bs.Put(ctx, 
"k", 10, bytes.NewReader([]byte("abcdefghij"))) + require.NoError(t, err) + _, typ, err := api.ResolvePathObject(ctx, git.OID(verTree), "k") + require.NoError(t, err) + require.Equal(t, "tree", typ) + + // tree -> blob + verBlob, err := bs.Put(ctx, "k", 2, bytes.NewReader([]byte("ok"))) + require.NoError(t, err) + _, typ, err = api.ResolvePathObject(ctx, git.OID(verBlob), "k") + require.NoError(t, err) + require.Equal(t, "blob", typ) + + got, _, err := GetBytes(ctx, bs, "k", AllRange) + require.NoError(t, err) + require.Equal(t, []byte("ok"), got) +} diff --git a/go/store/blobstore/git_blobstore_parts.go b/go/store/blobstore/git_blobstore_parts.go deleted file mode 100644 index dccd36bea0..0000000000 --- a/go/store/blobstore/git_blobstore_parts.go +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2026 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package blobstore - -import ( - "context" - - git "github.com/dolthub/dolt/go/store/blobstore/internal/git" - gitbs "github.com/dolthub/dolt/go/store/blobstore/internal/gitbs" -) - -const ( - // gitblobstorePartFileMode is the canonical filemode used for part blobs staged into the tree. - gitblobstorePartFileMode = "100644" -) - -// stagePartReachable stages a tree entry for |partOID| into |indexFile| under the reserved -// parts namespace, ensuring the blob is reachable from the resulting tree/commit. 
-// -// This operation is idempotent: staging the same part OID at the same computed path twice -// should result in the same index state. -func stagePartReachable(ctx context.Context, api git.GitAPI, indexFile string, partOID git.OID) (path string, err error) { - path, err = gitbs.PartPath(partOID.String()) - if err != nil { - return "", err - } - return path, api.UpdateIndexCacheInfo(ctx, indexFile, gitblobstorePartFileMode, partOID, path) -} diff --git a/go/store/blobstore/git_blobstore_parts_test.go b/go/store/blobstore/git_blobstore_parts_test.go deleted file mode 100644 index 2ae66b06a7..0000000000 --- a/go/store/blobstore/git_blobstore_parts_test.go +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2026 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package blobstore - -import ( - "context" - "os/exec" - "strings" - "testing" - - "github.com/stretchr/testify/require" - - git "github.com/dolthub/dolt/go/store/blobstore/internal/git" - "github.com/dolthub/dolt/go/store/testutils/gitrepo" -) - -func TestStagePartReachable_Idempotent(t *testing.T) { - if _, err := exec.LookPath("git"); err != nil { - t.Skip("git not found on PATH") - } - - ctx := context.Background() - repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") - require.NoError(t, err) - - runner, err := git.NewRunner(repo.GitDir) - require.NoError(t, err) - api := git.NewGitAPIImpl(runner) - - partOID, err := api.HashObject(ctx, strings.NewReader("part-bytes")) - require.NoError(t, err) - - _, indexFile, cleanup, err := newTempIndex() - require.NoError(t, err) - defer cleanup() - - require.NoError(t, api.ReadTreeEmpty(ctx, indexFile)) - - path1, err := stagePartReachable(ctx, api, indexFile, partOID) - require.NoError(t, err) - path2, err := stagePartReachable(ctx, api, indexFile, partOID) - require.NoError(t, err) - require.Equal(t, path1, path2) - - treeOID, err := api.WriteTree(ctx, indexFile) - require.NoError(t, err) - - commitOID, err := api.CommitTree(ctx, treeOID, nil, "stage part reachable test", &git.Identity{Name: "t", Email: "t@t"}) - require.NoError(t, err) - - // Verify the staged path resolves to the part blob in the committed tree. - got, err := api.ResolvePathBlob(ctx, commitOID, path1) - require.NoError(t, err) - require.Equal(t, partOID, got) -} diff --git a/go/store/blobstore/internal/gitbs/descriptor.go b/go/store/blobstore/internal/gitbs/descriptor.go deleted file mode 100644 index 379766eacd..0000000000 --- a/go/store/blobstore/internal/gitbs/descriptor.go +++ /dev/null @@ -1,238 +0,0 @@ -// Copyright 2026 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package gitbs contains internal helpers for GitBlobstore representations. -// -// This package is intentionally Git-agnostic: it does not import the internal/git -// plumbing, and does not assume any ref/update strategy. It focuses on chunked -// object descriptor encoding/decoding and validation. -package gitbs - -import ( - "bytes" - "fmt" - "strconv" - "strings" -) - -const ( - // DescriptorMagic is the first line of a chunked-object descriptor. - DescriptorMagic = "DOLTBS1" -) - -type Descriptor struct { - TotalSize uint64 - Parts []PartRef -} - -type PartRef struct { - OIDHex string - Size uint64 -} - -type descriptorParseState struct { - d Descriptor - haveSz bool - sumPart uint64 -} - -// IsDescriptorPrefix returns true if |b| looks like the beginning of a descriptor. -// Callers can use this on a small prefix before deciding whether to read and parse -// the full blob. -func IsDescriptorPrefix(b []byte) bool { - // Be conservative: require the magic line break plus "size " prefix. - // This avoids mis-detecting arbitrary inline content that begins with "DOLTBS1". - if !bytes.HasPrefix(b, []byte(DescriptorMagic)) { - return false - } - if len(b) < len(DescriptorMagic)+1 { - return false - } - rest := b[len(DescriptorMagic):] - if bytes.HasPrefix(rest, []byte("\nsize ")) { - return true - } - if bytes.HasPrefix(rest, []byte("\r\nsize ")) { - return true - } - return false -} - -// ParseDescriptor parses and validates a descriptor blob. 
-func ParseDescriptor(b []byte) (Descriptor, error) { - lines := splitLines(string(b)) - if len(lines) == 0 { - return Descriptor{}, fmt.Errorf("descriptor: empty") - } - if lines[0] != DescriptorMagic { - return Descriptor{}, fmt.Errorf("descriptor: invalid magic %q", lines[0]) - } - - var st descriptorParseState - for _, line := range lines[1:] { - if strings.TrimSpace(line) == "" { - continue - } - if err := parseDescriptorLine(&st, line); err != nil { - return Descriptor{}, err - } - } - return finalizeParsedDescriptor(st) -} - -// EncodeDescriptor encodes a descriptor in the stable line-oriented format. -func EncodeDescriptor(d Descriptor) ([]byte, error) { - // Validate basic invariants so Encode+Parse is deterministic. - if _, err := validateDescriptorForEncode(d); err != nil { - return nil, err - } - - var buf strings.Builder - buf.Grow(64 + len(d.Parts)*64) - buf.WriteString(DescriptorMagic) - buf.WriteByte('\n') - buf.WriteString("size ") - buf.WriteString(strconv.FormatUint(d.TotalSize, 10)) - buf.WriteByte('\n') - for _, p := range d.Parts { - writePartLine(&buf, p) - } - return []byte(buf.String()), nil -} - -func validateDescriptorForEncode(d Descriptor) (Descriptor, error) { - sum, err := validateDescriptorParts(d.Parts) - if err != nil { - return Descriptor{}, err - } - if err := validateDescriptorSizeAndParts(d.TotalSize, len(d.Parts), sum); err != nil { - return Descriptor{}, err - } - return d, nil -} - -func parseDescriptorLine(st *descriptorParseState, line string) error { - fields := strings.Fields(line) - switch { - case len(fields) >= 1 && fields[0] == "size": - return parseSizeLine(st, line, fields) - case len(fields) >= 1 && fields[0] == "part": - return parsePartLine(st, line, fields) - default: - return fmt.Errorf("descriptor: unknown line %q", line) - } -} - -func parseSizeLine(st *descriptorParseState, line string, fields []string) error { - if st.haveSz { - return fmt.Errorf("descriptor: multiple size lines") - } - if len(fields) != 2 
{ - return fmt.Errorf("descriptor: malformed size line %q", line) - } - n, err := parseUint(fields[1]) - if err != nil { - return fmt.Errorf("descriptor: invalid size %q: %w", fields[1], err) - } - st.d.TotalSize = n - st.haveSz = true - return nil -} - -func parsePartLine(st *descriptorParseState, line string, fields []string) error { - if len(fields) != 3 { - return fmt.Errorf("descriptor: malformed part line %q", line) - } - oid := fields[1] - if err := validateOIDHex(oid); err != nil { - return fmt.Errorf("descriptor: invalid part oid %q: %w", oid, err) - } - sz, err := parseUint(fields[2]) - if err != nil { - return fmt.Errorf("descriptor: invalid part size %q: %w", fields[2], err) - } - if sz == 0 { - return fmt.Errorf("descriptor: part size must be > 0") - } - if st.sumPart > ^uint64(0)-sz { - return fmt.Errorf("descriptor: part sizes overflow uint64") - } - st.sumPart += sz - st.d.Parts = append(st.d.Parts, PartRef{OIDHex: oid, Size: sz}) - return nil -} - -func finalizeParsedDescriptor(st descriptorParseState) (Descriptor, error) { - if !st.haveSz { - return Descriptor{}, fmt.Errorf("descriptor: missing size line") - } - if err := validateDescriptorSizeAndParts(st.d.TotalSize, len(st.d.Parts), st.sumPart); err != nil { - return Descriptor{}, err - } - return st.d, nil -} - -func validateDescriptorSizeAndParts(totalSize uint64, partCount int, sumParts uint64) error { - if totalSize == 0 { - if partCount != 0 { - return fmt.Errorf("descriptor: total size 0 requires zero parts") - } - return nil - } - if partCount == 0 { - return fmt.Errorf("descriptor: non-zero total size requires at least one part") - } - if sumParts != totalSize { - return fmt.Errorf("descriptor: part sizes sum to %d, expected %d", sumParts, totalSize) - } - return nil -} - -func validateDescriptorParts(parts []PartRef) (sum uint64, err error) { - for _, p := range parts { - if err := validateOIDHex(p.OIDHex); err != nil { - return 0, fmt.Errorf("descriptor: invalid part oid %q: %w", 
p.OIDHex, err) - } - if p.Size == 0 { - return 0, fmt.Errorf("descriptor: part size must be > 0") - } - if sum > ^uint64(0)-p.Size { - return 0, fmt.Errorf("descriptor: part sizes overflow uint64") - } - sum += p.Size - } - return sum, nil -} - -func writePartLine(buf *strings.Builder, p PartRef) { - buf.WriteString("part ") - buf.WriteString(p.OIDHex) - buf.WriteByte(' ') - buf.WriteString(strconv.FormatUint(p.Size, 10)) - buf.WriteByte('\n') -} - -func splitLines(s string) []string { - // Normalize CRLF to LF, then split. - s = strings.ReplaceAll(s, "\r\n", "\n") - s = strings.TrimRight(s, "\n") - if s == "" { - return nil - } - return strings.Split(s, "\n") -} - -func parseUint(s string) (uint64, error) { - return strconv.ParseUint(s, 10, 64) -} diff --git a/go/store/blobstore/internal/gitbs/descriptor_helpers_test.go b/go/store/blobstore/internal/gitbs/descriptor_helpers_test.go deleted file mode 100644 index 7886695486..0000000000 --- a/go/store/blobstore/internal/gitbs/descriptor_helpers_test.go +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright 2026 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package gitbs - -import ( - "strings" - "testing" - - "github.com/stretchr/testify/require" -) - -func TestDescriptorHelpers_validateDescriptorParts(t *testing.T) { - sum, err := validateDescriptorParts([]PartRef{ - {OIDHex: "0123456789abcdef0123456789abcdef01234567", Size: 3}, - {OIDHex: "89abcdef0123456789abcdef0123456789abcdef", Size: 4}, - }) - require.NoError(t, err) - require.Equal(t, uint64(7), sum) - - _, err = validateDescriptorParts([]PartRef{{OIDHex: "not-an-oid", Size: 1}}) - require.Error(t, err) - - _, err = validateDescriptorParts([]PartRef{{OIDHex: "0123456789abcdef0123456789abcdef01234567", Size: 0}}) - require.Error(t, err) -} - -func TestDescriptorHelpers_validateDescriptorSizeAndParts(t *testing.T) { - require.NoError(t, validateDescriptorSizeAndParts(0, 0, 0)) - require.Error(t, validateDescriptorSizeAndParts(0, 1, 1)) - require.Error(t, validateDescriptorSizeAndParts(1, 0, 0)) - require.Error(t, validateDescriptorSizeAndParts(3, 1, 2)) - require.NoError(t, validateDescriptorSizeAndParts(3, 1, 3)) -} - -func TestDescriptorHelpers_parseLines(t *testing.T) { - var st descriptorParseState - - err := parseDescriptorLine(&st, "size 3") - require.NoError(t, err) - require.True(t, st.haveSz) - require.Equal(t, uint64(3), st.d.TotalSize) - - err = parseDescriptorLine(&st, "part 0123456789abcdef0123456789abcdef01234567 3") - require.NoError(t, err) - require.Len(t, st.d.Parts, 1) - - d, err := finalizeParsedDescriptor(st) - require.NoError(t, err) - require.Equal(t, uint64(3), d.TotalSize) - require.Len(t, d.Parts, 1) -} - -func TestDescriptorHelpers_writePartLine(t *testing.T) { - var b strings.Builder - writePartLine(&b, PartRef{OIDHex: "0123456789abcdef0123456789abcdef01234567", Size: 9}) - require.Equal(t, "part 0123456789abcdef0123456789abcdef01234567 9\n", b.String()) -} diff --git a/go/store/blobstore/internal/gitbs/descriptor_test.go b/go/store/blobstore/internal/gitbs/descriptor_test.go deleted file mode 100644 index 2d72d783b2..0000000000 
--- a/go/store/blobstore/internal/gitbs/descriptor_test.go +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright 2026 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gitbs - -import ( - "testing" - - "github.com/stretchr/testify/require" -) - -func TestEncodeParseDescriptor_RoundTrip(t *testing.T) { - d := Descriptor{ - TotalSize: 7, - Parts: []PartRef{ - {OIDHex: "0123456789abcdef0123456789abcdef01234567", Size: 3}, - {OIDHex: "89abcdef0123456789abcdef0123456789abcdef", Size: 4}, - }, - } - - b, err := EncodeDescriptor(d) - require.NoError(t, err) - - got, err := ParseDescriptor(b) - require.NoError(t, err) - require.Equal(t, d, got) -} - -func TestParseDescriptor_InvalidMagic(t *testing.T) { - _, err := ParseDescriptor([]byte("NOPE\nsize 0\n")) - require.Error(t, err) -} - -func TestParseDescriptor_MissingSizeLine(t *testing.T) { - _, err := ParseDescriptor([]byte("DOLTBS1\npart 0123456789abcdef0123456789abcdef01234567 1\n")) - require.Error(t, err) -} - -func TestParseDescriptor_MultipleSizeLines(t *testing.T) { - _, err := ParseDescriptor([]byte("DOLTBS1\nsize 1\nsize 2\n")) - require.Error(t, err) -} - -func TestParseDescriptor_UnknownLine(t *testing.T) { - _, err := ParseDescriptor([]byte("DOLTBS1\nsize 0\nwat 1\n")) - require.Error(t, err) -} - -func TestParseDescriptor_InvalidOID(t *testing.T) { - _, err := ParseDescriptor([]byte("DOLTBS1\nsize 1\npart not-an-oid 1\n")) - require.Error(t, err) -} - -func 
TestParseDescriptor_PartSizeZeroRejected(t *testing.T) { - _, err := ParseDescriptor([]byte("DOLTBS1\nsize 0\npart 0123456789abcdef0123456789abcdef01234567 0\n")) - require.Error(t, err) -} - -func TestParseDescriptor_SumMismatch(t *testing.T) { - _, err := ParseDescriptor([]byte("DOLTBS1\nsize 2\npart 0123456789abcdef0123456789abcdef01234567 1\n")) - require.Error(t, err) -} - -func TestParseDescriptor_TotalSizeZeroRequiresNoParts(t *testing.T) { - _, err := ParseDescriptor([]byte("DOLTBS1\nsize 0\npart 0123456789abcdef0123456789abcdef01234567 1\n")) - require.Error(t, err) -} - -func TestEncodeDescriptor_Validates(t *testing.T) { - _, err := EncodeDescriptor(Descriptor{TotalSize: 1}) - require.Error(t, err) -} - -func TestIsDescriptorPrefix(t *testing.T) { - require.True(t, IsDescriptorPrefix([]byte("DOLTBS1\nsize "))) - require.True(t, IsDescriptorPrefix([]byte("DOLTBS1\r\nsize "))) - require.False(t, IsDescriptorPrefix([]byte("DOLTBS"))) - require.False(t, IsDescriptorPrefix([]byte("xxxxDOLTBS1\n"))) - require.False(t, IsDescriptorPrefix([]byte("DOLTBS1\nthis is not a descriptor\n"))) -} diff --git a/go/store/blobstore/internal/gitbs/oid.go b/go/store/blobstore/internal/gitbs/oid.go deleted file mode 100644 index 55b0e8b753..0000000000 --- a/go/store/blobstore/internal/gitbs/oid.go +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2026 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package gitbs - -import "fmt" - -// validateOIDHex validates a 40-character hex object id. -// -// This is intentionally lenient about case (accepts A-F) since we may parse OIDs -// from sources that aren't normalized. Callers that require a canonical form should -// normalize separately (e.g. strings.ToLower). -func validateOIDHex(oid string) error { - if len(oid) != 40 { - return fmt.Errorf("expected 40 hex chars, got %d", len(oid)) - } - for i := 0; i < len(oid); i++ { - c := oid[i] - switch { - case c >= '0' && c <= '9': - case c >= 'a' && c <= 'f': - case c >= 'A' && c <= 'F': - default: - return fmt.Errorf("non-hex character %q", c) - } - } - return nil -} diff --git a/go/store/blobstore/internal/gitbs/parts_path.go b/go/store/blobstore/internal/gitbs/parts_path.go deleted file mode 100644 index f14cfd8a27..0000000000 --- a/go/store/blobstore/internal/gitbs/parts_path.go +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2026 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gitbs - -import ( - "path" - "strings" -) - -const ( - // PartsPrefix is a reserved tree prefix under which part blobs are staged to ensure - // reachability from the GitBlobstore ref snapshot. - PartsPrefix = "__dolt_blobstore_parts__" -) - -// PartPath returns the reserved tree path for a part blob with the given hex OID. 
-// The returned path uses forward slashes (git tree paths) and a 2-level fanout: -// -// __dolt_blobstore_parts__/aa/bb/ -// -// where aa/bb are the first 4 hex characters of the oid. -func PartPath(oidHex string) (string, error) { - if err := validateOIDHex(oidHex); err != nil { - return "", err - } - oidHex = strings.ToLower(oidHex) - return path.Join(PartsPrefix, oidHex[:2], oidHex[2:4], oidHex), nil -} diff --git a/go/store/blobstore/internal/gitbs/parts_path_test.go b/go/store/blobstore/internal/gitbs/parts_path_test.go deleted file mode 100644 index b809519e21..0000000000 --- a/go/store/blobstore/internal/gitbs/parts_path_test.go +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2026 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package gitbs - -import ( - "testing" - - "github.com/stretchr/testify/require" -) - -func TestPartPath_Deterministic(t *testing.T) { - oid := "0123456789abcdef0123456789abcdef01234567" - p, err := PartPath(oid) - require.NoError(t, err) - require.Equal(t, "__dolt_blobstore_parts__/01/23/"+oid, p) -} - -func TestPartPath_NormalizesToLower(t *testing.T) { - oidUpper := "0123456789ABCDEF0123456789ABCDEF01234567" - p, err := PartPath(oidUpper) - require.NoError(t, err) - require.Equal(t, "__dolt_blobstore_parts__/01/23/0123456789abcdef0123456789abcdef01234567", p) -} - -func TestPartPath_InvalidOID(t *testing.T) { - _, err := PartPath("nope") - require.Error(t, err) -} diff --git a/go/store/blobstore/internal/gitbs/ranges.go b/go/store/blobstore/internal/gitbs/ranges.go index 4e7c8ba6e6..e54ec0988c 100644 --- a/go/store/blobstore/internal/gitbs/ranges.go +++ b/go/store/blobstore/internal/gitbs/ranges.go @@ -16,6 +16,12 @@ package gitbs import "fmt" +// PartRef describes one part of a logically concatenated object. +type PartRef struct { + OIDHex string + Size uint64 +} + // PartSlice describes a contiguous slice to read from a particular part. 
type PartSlice struct { OIDHex string From ba7da4321af8126f4141352d4b3c2f74e0d9f0ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Thu, 5 Feb 2026 17:16:36 -0800 Subject: [PATCH 14/28] /go/store/{blobstore,nbs}: cas on oid --- go/store/blobstore/git_blobstore.go | 190 +++++++++++++----- .../git_blobstore_chunked_get_test.go | 15 +- .../git_blobstore_chunked_put_test.go | 47 +++-- .../blobstore/git_blobstore_helpers_test.go | 15 +- go/store/blobstore/git_blobstore_test.go | 88 +++++--- go/store/nbs/git_blobstore_read_smoke_test.go | 7 +- 6 files changed, 256 insertions(+), 106 deletions(-) diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index 3ed803169d..cb42b983bd 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -130,14 +130,14 @@ func (gbs *GitBlobstore) Get(ctx context.Context, key string, br BlobRange) (io. if err != nil { return nil, 0, "", err } - commit, ver, err := gbs.resolveCommitForGet(ctx, key) + commit, err := gbs.resolveCommitForGet(ctx, key) if err != nil { - return nil, 0, ver, err + return nil, 0, "", err } - oid, typ, ver, err := gbs.resolveObjectForGet(ctx, commit, key) + oid, typ, err := gbs.resolveObjectForGet(ctx, commit, key) if err != nil { - return nil, 0, ver, err + return nil, 0, "", err } switch typ { @@ -150,42 +150,45 @@ func (gbs *GitBlobstore) Get(ctx context.Context, key string, br BlobRange) (io. if err != nil { return nil, 0, ver, err } - return sliceInlineBlob(rc, sz, br, ver) + // Per-key version: blob object id. + return sliceInlineBlob(rc, sz, br, oid.String()) case "tree": - return gbs.openChunkedTreeRange(ctx, commit, key, oid, br) + // Per-key version: tree object id at this key. 
+ rc, sz, _, err := gbs.openChunkedTreeRange(ctx, commit, key, oid, br) + return rc, sz, oid.String(), err default: - return nil, 0, ver, fmt.Errorf("gitblobstore: unsupported object type %q for key %q", typ, key) + return nil, 0, "", fmt.Errorf("gitblobstore: unsupported object type %q for key %q", typ, key) } } -func (gbs *GitBlobstore) resolveCommitForGet(ctx context.Context, key string) (commit git.OID, ver string, err error) { +func (gbs *GitBlobstore) resolveCommitForGet(ctx context.Context, key string) (commit git.OID, err error) { commit, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) if err != nil { - return git.OID(""), "", err + return git.OID(""), err } if ok { - return commit, commit.String(), nil + return commit, nil } // If the ref doesn't exist, treat the manifest as missing (empty store), // but surface a hard error for other keys: the store itself is missing. if key == "manifest" { - return git.OID(""), "", NotFound{Key: key} + return git.OID(""), NotFound{Key: key} } - return git.OID(""), "", &git.RefNotFoundError{Ref: gbs.ref} + return git.OID(""), &git.RefNotFoundError{Ref: gbs.ref} } -func (gbs *GitBlobstore) resolveObjectForGet(ctx context.Context, commit git.OID, key string) (oid git.OID, typ string, ver string, err error) { +func (gbs *GitBlobstore) resolveObjectForGet(ctx context.Context, commit git.OID, key string) (oid git.OID, typ string, err error) { oid, typ, err = gbs.api.ResolvePathObject(ctx, commit, key) if err != nil { if git.IsPathNotFound(err) { - return git.OID(""), "", commit.String(), NotFound{Key: key} + return git.OID(""), "", NotFound{Key: key} } - return git.OID(""), "", commit.String(), err + return git.OID(""), "", err } - return oid, typ, commit.String(), nil + return oid, typ, nil } func (gbs *GitBlobstore) resolveBlobSizeForGet(ctx context.Context, commit git.OID, oid git.OID) (sz int64, ver string, err error) { @@ -450,6 +453,28 @@ func (gbs *GitBlobstore) Put(ctx context.Context, key string, totalSize int64, 
r return "", err } + // Many NBS/table-file writes are content-addressed: if the key already exists, callers + // assume it refers to the same bytes and treat the operation as idempotent. + // + // The manifest is the main exception (it is mutable and updated via CheckAndPut), so + // we only apply this fast-path for non-manifest keys. + if key != "manifest" { + commit, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) + if err != nil { + return "", err + } + if ok { + oid, _, err := gbs.api.ResolvePathObject(ctx, commit, key) + if err == nil { + // Per-key version: existing object id. + return oid.String(), nil + } + if !git.IsPathNotFound(err) { + return "", err + } + } + } + msg := fmt.Sprintf("gitblobstore: put %s", key) // Hash the contents once. If we need to retry due to concurrent updates to |gbs.ref|, @@ -493,13 +518,21 @@ func (gbs *GitBlobstore) Put(ctx context.Context, key string, totalSize int64, r } return backoff.Permanent(err) } - ver = newCommit.String() + oid, _, err := gbs.api.ResolvePathObject(ctx, newCommit, key) + if err != nil { + return backoff.Permanent(err) + } + ver = oid.String() return nil } err = gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, parent, msg) if err == nil { - ver = newCommit.String() + oid, _, err := gbs.api.ResolvePathObject(ctx, newCommit, key) + if err != nil { + return backoff.Permanent(err) + } + ver = oid.String() return nil } @@ -803,52 +836,101 @@ func (gbs *GitBlobstore) CheckAndPut(ctx context.Context, expectedVersion, key s return "", err } - // Resolve current head and validate expectedVersion before consuming |reader|. 
- parent, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) - if err != nil { - return "", err - } - actualVersion := "" - if ok { - actualVersion = parent.String() - } - if expectedVersion != actualVersion { - return "", CheckAndPutError{Key: key, ExpectedVersion: expectedVersion, ActualVersion: actualVersion} - } - msg := fmt.Sprintf("gitblobstore: checkandput %s", key) - plan, err := gbs.planPutWrites(ctx, key, totalSize, reader) - if err != nil { - return "", err - } - newCommit, err := gbs.buildCommitForKeyWrite(ctx, parent, ok, key, plan, msg) - if err != nil { - return "", err - } + // Implement per-key CAS by validating |expectedVersion| against the current key version + // at HEAD, then committing on that HEAD and CAS-updating the ref. If the ref advances, + // retry by re-checking the key version. + const maxRetries = 31 // 32 total attempts (initial + retries) + bo := backoff.NewExponentialBackOff() + bo.InitialInterval = 5 * time.Millisecond + bo.Multiplier = 2 + bo.MaxInterval = 320 * time.Millisecond + bo.RandomizationFactor = 0 // deterministic; can add jitter later if needed + bo.Reset() + policy := backoff.WithContext(backoff.WithMaxRetries(bo, maxRetries), ctx) - if ok { - if err := gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, parent, msg); err != nil { - // If the ref changed, surface as a standard mismatch error. 
- cur, ok2, err2 := gbs.api.TryResolveRefCommit(ctx, gbs.ref) - if err2 == nil && ok2 && cur != parent { - return "", CheckAndPutError{Key: key, ExpectedVersion: expectedVersion, ActualVersion: cur.String()} + var newKeyVersion string + var cachedPlan *putPlan + op := func() error { + parent, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) + if err != nil { + return backoff.Permanent(err) + } + + actualKeyVersion, err := gbs.currentKeyVersion(ctx, parent, ok, key) + if err != nil { + return backoff.Permanent(err) + } + if expectedVersion != actualKeyVersion { + return backoff.Permanent(CheckAndPutError{Key: key, ExpectedVersion: expectedVersion, ActualVersion: actualKeyVersion}) + } + + // Only hash/consume the reader once we know the expectedVersion matches. + // If we need to retry due to unrelated ref advances, reuse the cached plan so we + // don't re-read |reader| (which may not be rewindable). + if cachedPlan == nil { + plan, err := gbs.planPutWrites(ctx, key, totalSize, reader) + if err != nil { + return backoff.Permanent(err) } - return "", err + cachedPlan = &plan } - return newCommit.String(), nil + + newCommit, err := gbs.buildCommitForKeyWrite(ctx, parent, ok, key, *cachedPlan, msg) + if err != nil { + return backoff.Permanent(err) + } + + if !ok { + // Create-only CAS: oldOID=all-zero requires the ref to not exist. + const zeroOID = git.OID("0000000000000000000000000000000000000000") + if err := gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, zeroOID, msg); err != nil { + // If the ref now exists, retry; otherwise surface the error. 
+ if gbs.refAdvanced(ctx, parent) { + return err + } + return backoff.Permanent(err) + } + } else { + if err := gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, parent, msg); err != nil { + if gbs.refAdvanced(ctx, parent) { + return err + } + return backoff.Permanent(err) + } + } + + oid, _, err := gbs.api.ResolvePathObject(ctx, newCommit, key) + if err != nil { + return backoff.Permanent(err) + } + newKeyVersion = oid.String() + return nil } - // Create-only CAS: oldOID=all-zero requires the ref to not exist. - const zeroOID = git.OID("0000000000000000000000000000000000000000") - if err := gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, zeroOID, msg); err != nil { - cur, ok2, err2 := gbs.api.TryResolveRefCommit(ctx, gbs.ref) - if err2 == nil && ok2 { - return "", CheckAndPutError{Key: key, ExpectedVersion: expectedVersion, ActualVersion: cur.String()} + if err := backoff.Retry(op, policy); err != nil { + if ctx.Err() != nil { + return "", ctx.Err() } return "", err } - return newCommit.String(), nil + return newKeyVersion, nil +} + +func (gbs *GitBlobstore) currentKeyVersion(ctx context.Context, commit git.OID, haveCommit bool, key string) (string, error) { + if !haveCommit { + // Ref missing => empty store => key missing. 
+ return "", nil + } + oid, _, err := gbs.api.ResolvePathObject(ctx, commit, key) + if err != nil { + if git.IsPathNotFound(err) { + return "", nil + } + return "", err + } + return oid.String(), nil } func (gbs *GitBlobstore) Concatenate(ctx context.Context, key string, sources []string) (string, error) { diff --git a/go/store/blobstore/git_blobstore_chunked_get_test.go b/go/store/blobstore/git_blobstore_chunked_get_test.go index a0efbaf0ec..2f6054f632 100644 --- a/go/store/blobstore/git_blobstore_chunked_get_test.go +++ b/go/store/blobstore/git_blobstore_chunked_get_test.go @@ -21,6 +21,7 @@ import ( "github.com/stretchr/testify/require" + git "github.com/dolthub/dolt/go/store/blobstore/internal/git" "github.com/dolthub/dolt/go/store/testutils/gitrepo" ) @@ -41,6 +42,12 @@ func TestGitBlobstore_Get_ChunkedTree_AllAndRanges(t *testing.T) { }, "seed chunked tree") require.NoError(t, err) + runner, err := git.NewRunner(repo.GitDir) + require.NoError(t, err) + api := git.NewGitAPIImpl(runner) + treeOID, _, err := api.ResolvePathObject(ctx, git.OID(commitOID), "chunked") + require.NoError(t, err) + bs, err := NewGitBlobstore(repo.GitDir, DoltDataRef) require.NoError(t, err) @@ -48,26 +55,26 @@ func TestGitBlobstore_Get_ChunkedTree_AllAndRanges(t *testing.T) { got, ver, err := GetBytes(ctx, bs, "chunked", AllRange) require.NoError(t, err) - require.Equal(t, commitOID, ver) + require.Equal(t, treeOID.String(), ver) require.Equal(t, wantAll, got) // Range spanning boundary: offset 2 length 4 => "cdef" got, ver, err = GetBytes(ctx, bs, "chunked", NewBlobRange(2, 4)) require.NoError(t, err) - require.Equal(t, commitOID, ver) + require.Equal(t, treeOID.String(), ver) require.Equal(t, []byte("cdef"), got) // Tail read last 3 bytes => "fgh" got, ver, err = GetBytes(ctx, bs, "chunked", NewBlobRange(-3, 0)) require.NoError(t, err) - require.Equal(t, commitOID, ver) + require.Equal(t, treeOID.String(), ver) require.Equal(t, []byte("fgh"), got) // Validate size returned is 
logical size. rc, sz, ver2, err := bs.Get(ctx, "chunked", NewBlobRange(0, 1)) require.NoError(t, err) require.Equal(t, uint64(len(wantAll)), sz) - require.Equal(t, commitOID, ver2) + require.Equal(t, treeOID.String(), ver2) _ = rc.Close() } diff --git a/go/store/blobstore/git_blobstore_chunked_put_test.go b/go/store/blobstore/git_blobstore_chunked_put_test.go index b27cdd4cab..9f22d2a8a1 100644 --- a/go/store/blobstore/git_blobstore_chunked_put_test.go +++ b/go/store/blobstore/git_blobstore_chunked_put_test.go @@ -51,19 +51,22 @@ func TestGitBlobstore_Put_ChunkedWritesTreeParts(t *testing.T) { require.NoError(t, err) api := git.NewGitAPIImpl(runner) - commit := git.OID(ver) - _, typ, err := api.ResolvePathObject(ctx, commit, "big") + head, ok, err := api.TryResolveRefCommit(ctx, DoltDataRef) + require.NoError(t, err) + require.True(t, ok) + + _, typ, err := api.ResolvePathObject(ctx, head, "big") require.NoError(t, err) require.Equal(t, "tree", typ) - entries, err := api.ListTree(ctx, commit, "big") + entries, err := api.ListTree(ctx, head, "big") require.NoError(t, err) require.Len(t, entries, 4) require.Equal(t, "00000001", entries[0].Name) require.Equal(t, "00000004", entries[3].Name) } -func TestGitBlobstore_Put_TreeToBlobAndBlobToTreeTransitions(t *testing.T) { +func TestGitBlobstore_Put_IdempotentDoesNotChangeExistingRepresentation(t *testing.T) { requireGitOnPath(t) ctx := context.Background() @@ -80,23 +83,39 @@ func TestGitBlobstore_Put_TreeToBlobAndBlobToTreeTransitions(t *testing.T) { require.NoError(t, err) api := git.NewGitAPIImpl(runner) - // blob -> tree - _, err = bs.Put(ctx, "k", 2, bytes.NewReader([]byte("hi"))) + // blob stays blob (even if the caller would have triggered chunked mode) + verBlob, err := bs.Put(ctx, "k", 2, bytes.NewReader([]byte("hi"))) require.NoError(t, err) - verTree, err := bs.Put(ctx, "k", 10, bytes.NewReader([]byte("abcdefghij"))) + verNoop, err := bs.Put(ctx, "k", 10, putShouldNotRead{}) require.NoError(t, err) - _, typ, 
err := api.ResolvePathObject(ctx, git.OID(verTree), "k") - require.NoError(t, err) - require.Equal(t, "tree", typ) + require.Equal(t, verBlob, verNoop) - // tree -> blob - verBlob, err := bs.Put(ctx, "k", 2, bytes.NewReader([]byte("ok"))) + head1, ok, err := api.TryResolveRefCommit(ctx, DoltDataRef) require.NoError(t, err) - _, typ, err = api.ResolvePathObject(ctx, git.OID(verBlob), "k") + require.True(t, ok) + _, typ, err := api.ResolvePathObject(ctx, head1, "k") require.NoError(t, err) require.Equal(t, "blob", typ) got, _, err := GetBytes(ctx, bs, "k", AllRange) require.NoError(t, err) - require.Equal(t, []byte("ok"), got) + require.Equal(t, []byte("hi"), got) + + // tree stays tree + verTree, err := bs.Put(ctx, "ktree", 10, bytes.NewReader([]byte("abcdefghij"))) + require.NoError(t, err) + head2, ok, err := api.TryResolveRefCommit(ctx, DoltDataRef) + require.NoError(t, err) + require.True(t, ok) + _, typ, err = api.ResolvePathObject(ctx, head2, "ktree") + require.NoError(t, err) + require.Equal(t, "tree", typ) + + verTreeNoop, err := bs.Put(ctx, "ktree", 2, putShouldNotRead{}) + require.NoError(t, err) + require.Equal(t, verTree, verTreeNoop) + + got, _, err = GetBytes(ctx, bs, "ktree", AllRange) + require.NoError(t, err) + require.Equal(t, []byte("abcdefghij"), got) } diff --git a/go/store/blobstore/git_blobstore_helpers_test.go b/go/store/blobstore/git_blobstore_helpers_test.go index c2c9f4d5dd..0bd8d4eb72 100644 --- a/go/store/blobstore/git_blobstore_helpers_test.go +++ b/go/store/blobstore/git_blobstore_helpers_test.go @@ -97,10 +97,9 @@ func TestGitBlobstoreHelpers_resolveCommitForGet(t *testing.T) { } gbs := &GitBlobstore{ref: DoltDataRef, api: api} - commit, ver, err := gbs.resolveCommitForGet(ctx, "k") + commit, err := gbs.resolveCommitForGet(ctx, "k") require.NoError(t, err) require.Equal(t, git.OID("0123456789abcdef0123456789abcdef01234567"), commit) - require.Equal(t, "0123456789abcdef0123456789abcdef01234567", ver) }) 
t.Run("missingRef_manifestIsNotFound", func(t *testing.T) { @@ -111,7 +110,7 @@ func TestGitBlobstoreHelpers_resolveCommitForGet(t *testing.T) { } gbs := &GitBlobstore{ref: DoltDataRef, api: api} - _, _, err := gbs.resolveCommitForGet(ctx, "manifest") + _, err := gbs.resolveCommitForGet(ctx, "manifest") var nf NotFound require.ErrorAs(t, err, &nf) require.Equal(t, "manifest", nf.Key) @@ -125,7 +124,7 @@ func TestGitBlobstoreHelpers_resolveCommitForGet(t *testing.T) { } gbs := &GitBlobstore{ref: DoltDataRef, api: api} - _, _, err := gbs.resolveCommitForGet(ctx, "somekey") + _, err := gbs.resolveCommitForGet(ctx, "somekey") var rnf *git.RefNotFoundError require.ErrorAs(t, err, &rnf) require.Equal(t, DoltDataRef, rnf.Ref) @@ -140,7 +139,7 @@ func TestGitBlobstoreHelpers_resolveCommitForGet(t *testing.T) { } gbs := &GitBlobstore{ref: DoltDataRef, api: api} - _, _, err := gbs.resolveCommitForGet(ctx, "k") + _, err := gbs.resolveCommitForGet(ctx, "k") require.ErrorIs(t, err, sentinel) }) } @@ -159,9 +158,8 @@ func TestGitBlobstoreHelpers_resolveObjectForGet(t *testing.T) { } gbs := &GitBlobstore{api: api} - oid, typ, ver, err := gbs.resolveObjectForGet(ctx, commit, "k") + oid, typ, err := gbs.resolveObjectForGet(ctx, commit, "k") require.NoError(t, err) - require.Equal(t, "0123456789abcdef0123456789abcdef01234567", ver) require.Equal(t, "blob", typ) require.Equal(t, git.OID("89abcdef0123456789abcdef0123456789abcdef"), oid) }) @@ -174,8 +172,7 @@ func TestGitBlobstoreHelpers_resolveObjectForGet(t *testing.T) { } gbs := &GitBlobstore{api: api} - _, _, ver, err := gbs.resolveObjectForGet(ctx, commit, "k") - require.Equal(t, commit.String(), ver) + _, _, err := gbs.resolveObjectForGet(ctx, commit, "k") var nf NotFound require.ErrorAs(t, err, &nf) require.Equal(t, "k", nf.Key) diff --git a/go/store/blobstore/git_blobstore_test.go b/go/store/blobstore/git_blobstore_test.go index a543808572..1d41173eff 100644 --- a/go/store/blobstore/git_blobstore_test.go +++ 
b/go/store/blobstore/git_blobstore_test.go @@ -85,6 +85,12 @@ func TestGitBlobstore_ExistsAndGet_AllRange(t *testing.T) { bs, err := NewGitBlobstore(repo.GitDir, DoltDataRef) require.NoError(t, err) + runner, err := git.NewRunner(repo.GitDir) + require.NoError(t, err) + api := git.NewGitAPIImpl(runner) + manifestOID, _, err := api.ResolvePathObject(ctx, git.OID(commit), "manifest") + require.NoError(t, err) + ok, err := bs.Exists(ctx, "manifest") require.NoError(t, err) require.True(t, ok) @@ -100,14 +106,14 @@ func TestGitBlobstore_ExistsAndGet_AllRange(t *testing.T) { got, ver, err := GetBytes(ctx, bs, "manifest", AllRange) require.NoError(t, err) - require.Equal(t, commit, ver) + require.Equal(t, manifestOID.String(), ver) require.Equal(t, want, got) // Validate size + version on Get. rc, sz, ver2, err := bs.Get(ctx, "manifest", NewBlobRange(0, 5)) require.NoError(t, err) require.Equal(t, uint64(len(want)), sz) - require.Equal(t, commit, ver2) + require.Equal(t, manifestOID.String(), ver2) _ = rc.Close() } @@ -149,34 +155,40 @@ func TestGitBlobstore_BlobRangeSemantics(t *testing.T) { bs, err := NewGitBlobstore(repo.GitDir, DoltDataRef) require.NoError(t, err) + runner, err := git.NewRunner(repo.GitDir) + require.NoError(t, err) + api := git.NewGitAPIImpl(runner) + rangeOID, _, err := api.ResolvePathObject(ctx, git.OID(commit), "range") + require.NoError(t, err) + // full range got, ver, err := GetBytes(ctx, bs, "range", AllRange) require.NoError(t, err) - require.Equal(t, commit, ver) + require.Equal(t, rangeOID.String(), ver) require.Equal(t, rangeData(0, maxValue), got) // first 2048 bytes (1024 shorts) got, ver, err = GetBytes(ctx, bs, "range", NewBlobRange(0, 2048)) require.NoError(t, err) - require.Equal(t, commit, ver) + require.Equal(t, rangeOID.String(), ver) require.Equal(t, rangeData(0, 1024), got) // bytes 2048..4096 of original got, ver, err = GetBytes(ctx, bs, "range", NewBlobRange(2*1024, 2*1024)) require.NoError(t, err) - require.Equal(t, commit, 
ver) + require.Equal(t, rangeOID.String(), ver) require.Equal(t, rangeData(1024, 2048), got) // last 2048 bytes got, ver, err = GetBytes(ctx, bs, "range", NewBlobRange(-2*1024, 0)) require.NoError(t, err) - require.Equal(t, commit, ver) + require.Equal(t, rangeOID.String(), ver) require.Equal(t, rangeData(maxValue-1024, maxValue), got) // tail slice: beginning 2048 bytes from end, size 512 got, ver, err = GetBytes(ctx, bs, "range", NewBlobRange(-2*1024, 512)) require.NoError(t, err) - require.Equal(t, commit, ver) + require.Equal(t, rangeOID.String(), ver) require.Equal(t, rangeData(maxValue-1024, maxValue-768), got) } @@ -243,7 +255,13 @@ func TestGitBlobstore_Put_RoundTripAndVersion(t *testing.T) { require.Equal(t, want, got) } -func TestGitBlobstore_Put_Overwrite(t *testing.T) { +type putShouldNotRead struct{} + +func (putShouldNotRead) Read(_ []byte) (int, error) { + return 0, errors.New("read should not be called") +} + +func TestGitBlobstore_Put_IdempotentIfKeyExists(t *testing.T) { requireGitOnPath(t) ctx := context.Background() @@ -257,15 +275,14 @@ func TestGitBlobstore_Put_Overwrite(t *testing.T) { require.NoError(t, err) require.NotEmpty(t, ver1) - ver2, err := PutBytes(ctx, bs, "k", []byte("v2\n")) + ver2, err := bs.Put(ctx, "k", 3, putShouldNotRead{}) require.NoError(t, err) - require.NotEmpty(t, ver2) - require.NotEqual(t, ver1, ver2) + require.Equal(t, ver1, ver2) got, ver3, err := GetBytes(ctx, bs, "k", AllRange) require.NoError(t, err) - require.Equal(t, ver2, ver3) - require.Equal(t, []byte("v2\n"), got) + require.Equal(t, ver1, ver3) + require.Equal(t, []byte("v1\n"), got) } type hookGitAPI struct { @@ -431,8 +448,14 @@ func TestGitBlobstore_CheckAndPut_MismatchDoesNotRead(t *testing.T) { bs, err := NewGitBlobstoreWithIdentity(repo.GitDir, DoltDataRef, testIdentity()) require.NoError(t, err) + runner, err := git.NewRunner(repo.GitDir) + require.NoError(t, err) + api := git.NewGitAPIImpl(runner) + keyOID, _, err := api.ResolvePathObject(ctx, 
git.OID(commit), "k") + require.NoError(t, err) + r := &failReader{} - _, err = bs.CheckAndPut(ctx, commit+"-wrong", "k", 1, r) + _, err = bs.CheckAndPut(ctx, keyOID.String()+"-wrong", "k", 1, r) require.Error(t, err) require.True(t, IsCheckAndPutError(err)) require.False(t, r.called.Load(), "expected reader not to be consumed on version mismatch") @@ -454,11 +477,17 @@ func TestGitBlobstore_CheckAndPut_UpdateSuccess(t *testing.T) { bs, err := NewGitBlobstoreWithIdentity(repo.GitDir, DoltDataRef, testIdentity()) require.NoError(t, err) + runner, err := git.NewRunner(repo.GitDir) + require.NoError(t, err) + api := git.NewGitAPIImpl(runner) + keyOID, _, err := api.ResolvePathObject(ctx, git.OID(commit), "k") + require.NoError(t, err) + want := []byte("updated\n") - ver2, err := bs.CheckAndPut(ctx, commit, "k", int64(len(want)), bytes.NewReader(want)) + ver2, err := bs.CheckAndPut(ctx, keyOID.String(), "k", int64(len(want)), bytes.NewReader(want)) require.NoError(t, err) require.NotEmpty(t, ver2) - require.NotEqual(t, commit, ver2) + require.NotEqual(t, keyOID.String(), ver2) got, ver3, err := GetBytes(ctx, bs, "k", AllRange) require.NoError(t, err) @@ -470,7 +499,7 @@ func TestGitBlobstore_CheckAndPut_UpdateSuccess(t *testing.T) { require.Equal(t, []byte("keep\n"), got) } -func TestGitBlobstore_CheckAndPut_ConcurrentUpdateReturnsMismatch(t *testing.T) { +func TestGitBlobstore_CheckAndPut_ConcurrentUnrelatedUpdateStillSucceeds(t *testing.T) { requireGitOnPath(t) ctx := context.Background() @@ -485,6 +514,12 @@ func TestGitBlobstore_CheckAndPut_ConcurrentUpdateReturnsMismatch(t *testing.T) bs, err := NewGitBlobstoreWithIdentity(repo.GitDir, DoltDataRef, testIdentity()) require.NoError(t, err) + runner, err := git.NewRunner(repo.GitDir) + require.NoError(t, err) + api := git.NewGitAPIImpl(runner) + keyOID, _, err := api.ResolvePathObject(ctx, git.OID(commit), "k") + require.NoError(t, err) + origAPI := bs.api h := &hookGitAPI{GitAPI: origAPI, ref: DoltDataRef} 
h.onFirstCAS = func(ctx context.Context, old git.OID) { @@ -493,12 +528,17 @@ func TestGitBlobstore_CheckAndPut_ConcurrentUpdateReturnsMismatch(t *testing.T) } bs.api = h - _, err = bs.CheckAndPut(ctx, commit, "k", 0, bytes.NewReader([]byte("mine\n"))) - require.Error(t, err) - require.True(t, IsCheckAndPutError(err)) - - // Verify key did not change, since our CAS should have failed. - got, _, err := GetBytes(ctx, bs, "k", AllRange) + ver2, err := bs.CheckAndPut(ctx, keyOID.String(), "k", 0, bytes.NewReader([]byte("mine\n"))) require.NoError(t, err) - require.Equal(t, []byte("base\n"), got) + require.NotEmpty(t, ver2) + require.NotEqual(t, keyOID.String(), ver2) + + got, ver3, err := GetBytes(ctx, bs, "k", AllRange) + require.NoError(t, err) + require.Equal(t, ver2, ver3) + require.Equal(t, []byte("mine\n"), got) + + got, _, err = GetBytes(ctx, bs, "external", AllRange) + require.NoError(t, err) + require.Equal(t, []byte("external\n"), got) } diff --git a/go/store/nbs/git_blobstore_read_smoke_test.go b/go/store/nbs/git_blobstore_read_smoke_test.go index acae17284a..3570457166 100644 --- a/go/store/nbs/git_blobstore_read_smoke_test.go +++ b/go/store/nbs/git_blobstore_read_smoke_test.go @@ -85,13 +85,18 @@ func TestGitBlobstoreReadSmoke_ManifestAndTableAccessPatterns(t *testing.T) { rc, totalSz, ver, err := bs.Get(ctx, "table", blobstore.NewBlobRange(-tailN, 0)) require.NoError(t, err) require.Equal(t, uint64(len(table)), totalSz) - require.Equal(t, commit, ver) + require.NotEmpty(t, ver) tail := make([]byte, tailN) _, err = io.ReadFull(rc, tail) require.NoError(t, err) require.NoError(t, rc.Close()) require.Equal(t, table[len(table)-tailN:], tail) + // Per-key version should be stable across reads. + _, _, ver2, err := bs.Get(ctx, "table", blobstore.AllRange) + require.NoError(t, err) + require.Equal(t, ver, ver2) + // 3) ReadAt-style ranged reads used by table readers. 
tr := &bsTableReaderAt{bs: bs, key: "table"} out := make([]byte, 4096) From cf03ca22bd010e4405ff995326ae5b395115a811 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Fri, 6 Feb 2026 07:07:28 -0800 Subject: [PATCH 15/28] /go/store/blobstore: use real types --- go/store/blobstore/git_blobstore.go | 16 ++++++++-------- .../blobstore/git_blobstore_chunked_put_test.go | 6 +++--- .../blobstore/git_blobstore_helpers_test.go | 14 +++++++------- go/store/blobstore/internal/git/api.go | 17 +++++++++++++---- go/store/blobstore/internal/git/impl.go | 16 ++++++++-------- go/store/blobstore/internal/git/impl_test.go | 10 +++++----- 6 files changed, 44 insertions(+), 35 deletions(-) diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index cb42b983bd..e942cc883d 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -141,7 +141,7 @@ func (gbs *GitBlobstore) Get(ctx context.Context, key string, br BlobRange) (io. } switch typ { - case "blob": + case git.ObjectTypeBlob: sz, ver, err := gbs.resolveBlobSizeForGet(ctx, commit, oid) if err != nil { return nil, 0, ver, err @@ -153,7 +153,7 @@ func (gbs *GitBlobstore) Get(ctx context.Context, key string, br BlobRange) (io. // Per-key version: blob object id. return sliceInlineBlob(rc, sz, br, oid.String()) - case "tree": + case git.ObjectTypeTree: // Per-key version: tree object id at this key. 
rc, sz, _, err := gbs.openChunkedTreeRange(ctx, commit, key, oid, br) return rc, sz, oid.String(), err @@ -180,13 +180,13 @@ func (gbs *GitBlobstore) resolveCommitForGet(ctx context.Context, key string) (c return git.OID(""), &git.RefNotFoundError{Ref: gbs.ref} } -func (gbs *GitBlobstore) resolveObjectForGet(ctx context.Context, commit git.OID, key string) (oid git.OID, typ string, err error) { +func (gbs *GitBlobstore) resolveObjectForGet(ctx context.Context, commit git.OID, key string) (oid git.OID, typ git.ObjectType, err error) { oid, typ, err = gbs.api.ResolvePathObject(ctx, commit, key) if err != nil { if git.IsPathNotFound(err) { - return git.OID(""), "", NotFound{Key: key} + return git.OID(""), git.ObjectTypeUnknown, NotFound{Key: key} } - return git.OID(""), "", err + return git.OID(""), git.ObjectTypeUnknown, err } return oid, typ, nil } @@ -253,7 +253,7 @@ func (gbs *GitBlobstore) validateAndSizeChunkedParts(ctx context.Context, entrie parts := make([]gitbs.PartRef, 0, len(entries)) var total uint64 for i, e := range entries { - if e.Type != "blob" { + if e.Type != git.ObjectTypeBlob { return nil, 0, fmt.Errorf("gitblobstore: invalid part %q: expected blob, got %q", e.Name, e.Type) } if len(e.Name) != width { @@ -677,14 +677,14 @@ func (gbs *GitBlobstore) removeKeyConflictsFromIndex(ctx context.Context, parent _ = oid switch typ { - case "blob": + case git.ObjectTypeBlob: if newIsChunked { // blob -> tree: must remove the file entry at return gbs.api.RemoveIndexPaths(ctx, indexFile, []string{key}) } return nil - case "tree": + case git.ObjectTypeTree: // tree -> blob OR tree overwrite: remove old child entries under /... 
entries, err := gbs.api.ListTree(ctx, parent, key) if err != nil { diff --git a/go/store/blobstore/git_blobstore_chunked_put_test.go b/go/store/blobstore/git_blobstore_chunked_put_test.go index 9f22d2a8a1..3096310bc6 100644 --- a/go/store/blobstore/git_blobstore_chunked_put_test.go +++ b/go/store/blobstore/git_blobstore_chunked_put_test.go @@ -57,7 +57,7 @@ func TestGitBlobstore_Put_ChunkedWritesTreeParts(t *testing.T) { _, typ, err := api.ResolvePathObject(ctx, head, "big") require.NoError(t, err) - require.Equal(t, "tree", typ) + require.Equal(t, git.ObjectTypeTree, typ) entries, err := api.ListTree(ctx, head, "big") require.NoError(t, err) @@ -95,7 +95,7 @@ func TestGitBlobstore_Put_IdempotentDoesNotChangeExistingRepresentation(t *testi require.True(t, ok) _, typ, err := api.ResolvePathObject(ctx, head1, "k") require.NoError(t, err) - require.Equal(t, "blob", typ) + require.Equal(t, git.ObjectTypeBlob, typ) got, _, err := GetBytes(ctx, bs, "k", AllRange) require.NoError(t, err) @@ -109,7 +109,7 @@ func TestGitBlobstore_Put_IdempotentDoesNotChangeExistingRepresentation(t *testi require.True(t, ok) _, typ, err = api.ResolvePathObject(ctx, head2, "ktree") require.NoError(t, err) - require.Equal(t, "tree", typ) + require.Equal(t, git.ObjectTypeTree, typ) verTreeNoop, err := bs.Put(ctx, "ktree", 2, putShouldNotRead{}) require.NoError(t, err) diff --git a/go/store/blobstore/git_blobstore_helpers_test.go b/go/store/blobstore/git_blobstore_helpers_test.go index 0bd8d4eb72..3e1864b4c5 100644 --- a/go/store/blobstore/git_blobstore_helpers_test.go +++ b/go/store/blobstore/git_blobstore_helpers_test.go @@ -28,7 +28,7 @@ import ( type fakeGitAPI struct { tryResolveRefCommit func(ctx context.Context, ref string) (git.OID, bool, error) resolvePathBlob func(ctx context.Context, commit git.OID, path string) (git.OID, error) - resolvePathObject func(ctx context.Context, commit git.OID, path string) (git.OID, string, error) + resolvePathObject func(ctx context.Context, commit 
git.OID, path string) (git.OID, git.ObjectType, error) blobSize func(ctx context.Context, oid git.OID) (int64, error) blobReader func(ctx context.Context, oid git.OID) (io.ReadCloser, error) } @@ -42,7 +42,7 @@ func (f fakeGitAPI) ResolveRefCommit(ctx context.Context, ref string) (git.OID, func (f fakeGitAPI) ResolvePathBlob(ctx context.Context, commit git.OID, path string) (git.OID, error) { return f.resolvePathBlob(ctx, commit, path) } -func (f fakeGitAPI) ResolvePathObject(ctx context.Context, commit git.OID, path string) (git.OID, string, error) { +func (f fakeGitAPI) ResolvePathObject(ctx context.Context, commit git.OID, path string) (git.OID, git.ObjectType, error) { return f.resolvePathObject(ctx, commit, path) } func (f fakeGitAPI) ListTree(ctx context.Context, commit git.OID, treePath string) ([]git.TreeEntry, error) { @@ -150,24 +150,24 @@ func TestGitBlobstoreHelpers_resolveObjectForGet(t *testing.T) { t.Run("ok", func(t *testing.T) { api := fakeGitAPI{ - resolvePathObject: func(ctx context.Context, gotCommit git.OID, path string) (git.OID, string, error) { + resolvePathObject: func(ctx context.Context, gotCommit git.OID, path string) (git.OID, git.ObjectType, error) { require.Equal(t, commit, gotCommit) require.Equal(t, "k", path) - return git.OID("89abcdef0123456789abcdef0123456789abcdef"), "blob", nil + return git.OID("89abcdef0123456789abcdef0123456789abcdef"), git.ObjectTypeBlob, nil }, } gbs := &GitBlobstore{api: api} oid, typ, err := gbs.resolveObjectForGet(ctx, commit, "k") require.NoError(t, err) - require.Equal(t, "blob", typ) + require.Equal(t, git.ObjectTypeBlob, typ) require.Equal(t, git.OID("89abcdef0123456789abcdef0123456789abcdef"), oid) }) t.Run("pathNotFoundMapsToNotFound", func(t *testing.T) { api := fakeGitAPI{ - resolvePathObject: func(ctx context.Context, gotCommit git.OID, path string) (git.OID, string, error) { - return git.OID(""), "", &git.PathNotFoundError{Commit: gotCommit.String(), Path: path} + resolvePathObject: func(ctx 
context.Context, gotCommit git.OID, path string) (git.OID, git.ObjectType, error) { + return git.OID(""), git.ObjectTypeUnknown, &git.PathNotFoundError{Commit: gotCommit.String(), Path: path} }, } gbs := &GitBlobstore{api: api} diff --git a/go/store/blobstore/internal/git/api.go b/go/store/blobstore/internal/git/api.go index 7e5851b06d..cabda14d60 100644 --- a/go/store/blobstore/internal/git/api.go +++ b/go/store/blobstore/internal/git/api.go @@ -19,6 +19,17 @@ import ( "io" ) +// ObjectType is a git object type returned by plumbing (e.g. "blob", "tree"). +type ObjectType string + +const ( + ObjectTypeUnknown ObjectType = "" + ObjectTypeBlob ObjectType = "blob" + ObjectTypeTree ObjectType = "tree" + ObjectTypeCommit ObjectType = "commit" + ObjectTypeTag ObjectType = "tag" +) + // GitAPI defines the git plumbing operations needed by GitBlobstore. It includes both // read and write operations to allow swapping implementations (e.g. git CLI vs a Go git // library) while keeping callers stable. @@ -36,9 +47,7 @@ type GitAPI interface { // ResolvePathObject resolves |path| within |commit| to an object OID and type. // It returns PathNotFoundError if the path does not exist. - // - // Typical types are "blob" and "tree". - ResolvePathObject(ctx context.Context, commit OID, path string) (oid OID, typ string, err error) + ResolvePathObject(ctx context.Context, commit OID, path string) (oid OID, typ ObjectType, err error) // ListTree lists the entries of the tree at |treePath| within |commit|. // The listing is non-recursive: it returns only immediate children. @@ -104,7 +113,7 @@ type GitAPI interface { // TreeEntry describes one entry in a git tree listing. 
type TreeEntry struct { Mode string - Type string + Type ObjectType OID OID Name string } diff --git a/go/store/blobstore/internal/git/impl.go b/go/store/blobstore/internal/git/impl.go index 324aa97b92..8e78ccfaf6 100644 --- a/go/store/blobstore/internal/git/impl.go +++ b/go/store/blobstore/internal/git/impl.go @@ -88,25 +88,25 @@ func (a *GitAPIImpl) ResolvePathBlob(ctx context.Context, commit OID, path strin return OID(oid), nil } -func (a *GitAPIImpl) ResolvePathObject(ctx context.Context, commit OID, path string) (oid OID, typ string, err error) { +func (a *GitAPIImpl) ResolvePathObject(ctx context.Context, commit OID, path string) (oid OID, typ ObjectType, err error) { spec := commit.String() + ":" + path out, err := a.r.Run(ctx, RunOptions{}, "rev-parse", "--verify", spec) if err != nil { if isPathNotFoundErr(err) { - return "", "", &PathNotFoundError{Commit: commit.String(), Path: path} + return "", ObjectTypeUnknown, &PathNotFoundError{Commit: commit.String(), Path: path} } - return "", "", err + return "", ObjectTypeUnknown, err } oidStr := strings.TrimSpace(string(out)) if oidStr == "" { - return "", "", fmt.Errorf("git rev-parse returned empty oid for %q", spec) + return "", ObjectTypeUnknown, fmt.Errorf("git rev-parse returned empty oid for %q", spec) } - typ, err = a.CatFileType(ctx, OID(oidStr)) + typStr, err := a.CatFileType(ctx, OID(oidStr)) if err != nil { - return "", "", err + return "", ObjectTypeUnknown, err } - return OID(oidStr), typ, nil + return OID(oidStr), ObjectType(typStr), nil } func (a *GitAPIImpl) ListTree(ctx context.Context, commit OID, treePath string) ([]TreeEntry, error) { @@ -335,7 +335,7 @@ func parseLsTreeLine(line string) (TreeEntry, error) { } return TreeEntry{ Mode: left[0], - Type: left[1], + Type: ObjectType(left[1]), OID: OID(left[2]), Name: parts[1], }, nil diff --git a/go/store/blobstore/internal/git/impl_test.go b/go/store/blobstore/internal/git/impl_test.go index 71589eb96c..216cb880c7 100644 --- 
a/go/store/blobstore/internal/git/impl_test.go +++ b/go/store/blobstore/internal/git/impl_test.go @@ -406,7 +406,7 @@ func TestGitAPIImpl_ResolvePathObject_BlobAndTree(t *testing.T) { if err != nil { t.Fatal(err) } - if gotTyp != "blob" { + if gotTyp != ObjectTypeBlob { t.Fatalf("expected type blob, got %q", gotTyp) } if gotOID != blobOID { @@ -417,7 +417,7 @@ func TestGitAPIImpl_ResolvePathObject_BlobAndTree(t *testing.T) { if err != nil { t.Fatal(err) } - if gotTyp != "tree" { + if gotTyp != ObjectTypeTree { t.Fatalf("expected type tree, got %q", gotTyp) } } @@ -487,17 +487,17 @@ func TestGitAPIImpl_ListTree_NonRecursive(t *testing.T) { switch e.Name { case "a.txt": gotA = true - if e.Type != "blob" || e.OID != oidA { + if e.Type != ObjectTypeBlob || e.OID != oidA { t.Fatalf("unexpected a.txt entry: %+v", e) } case "b.txt": gotB = true - if e.Type != "blob" || e.OID != oidB { + if e.Type != ObjectTypeBlob || e.OID != oidB { t.Fatalf("unexpected b.txt entry: %+v", e) } case "sub": gotSub = true - if e.Type != "tree" || e.OID == "" { + if e.Type != ObjectTypeTree || e.OID == "" { t.Fatalf("unexpected sub entry: %+v", e) } default: From 296b607369ce85f41514ef7b43885487fb9ad51f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Fri, 6 Feb 2026 07:15:52 -0800 Subject: [PATCH 16/28] /go/store/blobstore: more refactor --- go/store/blobstore/git_blobstore.go | 126 +++++++++-- .../blobstore/git_blobstore_helpers_test.go | 10 +- .../blobstore/git_blobstore_multipart_test.go | 15 +- go/store/blobstore/internal/gitbs/ranges.go | 196 ------------------ .../internal/gitbs/ranges_helpers_test.go | 68 ------ .../blobstore/internal/gitbs/ranges_test.go | 82 -------- 6 files changed, 125 insertions(+), 372 deletions(-) delete mode 100644 go/store/blobstore/internal/gitbs/ranges.go delete mode 100644 go/store/blobstore/internal/gitbs/ranges_helpers_test.go delete mode 100644 go/store/blobstore/internal/gitbs/ranges_test.go diff --git 
a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index e942cc883d..f3053dfd49 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -31,11 +31,21 @@ import ( "github.com/cenkalti/backoff/v4" git "github.com/dolthub/dolt/go/store/blobstore/internal/git" - gitbs "github.com/dolthub/dolt/go/store/blobstore/internal/gitbs" ) const gitblobstorePartNameWidth = 8 // "00000001" +type chunkPartRef struct { + oidHex string + size uint64 +} + +type chunkPartSlice struct { + oidHex string + offset int64 + length int64 +} + // GitBlobstore is a Blobstore implementation backed by a git repository's object // database (bare repo or .git directory). It stores keys as paths within the tree // of the commit referenced by a git ref (e.g. refs/dolt/data). @@ -221,11 +231,11 @@ func (gbs *GitBlobstore) openChunkedTreeRange(ctx context.Context, commit git.OI } total := int64(totalSize) - start, end, err := gitbs.NormalizeRange(total, br.offset, br.length) + start, end, err := normalizeRange(total, br.offset, br.length) if err != nil { return nil, totalSize, ver, err } - slices, err := gitbs.SliceParts(parts, start, end) + slices, err := sliceChunkParts(parts, start, end) if err != nil { return nil, totalSize, ver, err } @@ -239,7 +249,7 @@ func (gbs *GitBlobstore) openChunkedTreeRange(ctx context.Context, commit git.OI return streamRC, totalSize, ver, nil } -func (gbs *GitBlobstore) validateAndSizeChunkedParts(ctx context.Context, entries []git.TreeEntry) ([]gitbs.PartRef, uint64, error) { +func (gbs *GitBlobstore) validateAndSizeChunkedParts(ctx context.Context, entries []git.TreeEntry) ([]chunkPartRef, uint64, error) { if len(entries) == 0 { return nil, 0, fmt.Errorf("gitblobstore: chunked tree has no parts") } @@ -250,7 +260,7 @@ func (gbs *GitBlobstore) validateAndSizeChunkedParts(ctx context.Context, entrie return nil, 0, fmt.Errorf("gitblobstore: invalid part name %q (expected at least 4 digits)", entries[0].Name) 
} - parts := make([]gitbs.PartRef, 0, len(entries)) + parts := make([]chunkPartRef, 0, len(entries)) var total uint64 for i, e := range entries { if e.Type != git.ObjectTypeBlob { @@ -282,7 +292,7 @@ func (gbs *GitBlobstore) validateAndSizeChunkedParts(ctx context.Context, entrie return nil, 0, fmt.Errorf("gitblobstore: total size overflow") } total += uint64(sz) - parts = append(parts, gitbs.PartRef{OIDHex: e.OID.String(), Size: uint64(sz)}) + parts = append(parts, chunkPartRef{oidHex: e.OID.String(), size: uint64(sz)}) } return parts, total, nil } @@ -329,7 +339,7 @@ type multiPartReadCloser struct { ctx context.Context api git.GitAPI - slices []gitbs.PartSlice + slices []chunkPartSlice curIdx int curRC io.ReadCloser @@ -370,16 +380,16 @@ func (m *multiPartReadCloser) ensureCurrent() error { return err } m.curRC = rc - m.rem = s.Length + m.rem = s.length return nil } -func (m *multiPartReadCloser) openSliceReader(s gitbs.PartSlice) (io.ReadCloser, error) { - rc, err := m.api.BlobReader(m.ctx, git.OID(s.OIDHex)) +func (m *multiPartReadCloser) openSliceReader(s chunkPartSlice) (io.ReadCloser, error) { + rc, err := m.api.BlobReader(m.ctx, git.OID(s.oidHex)) if err != nil { return nil, err } - if err := skipN(rc, s.Offset); err != nil { + if err := skipN(rc, s.offset); err != nil { _ = rc.Close() return nil, err } @@ -431,6 +441,96 @@ func skipN(r io.Reader, n int64) error { return err } +func normalizeRange(total int64, offset int64, length int64) (start, end int64, err error) { + if total < 0 { + return 0, 0, fmt.Errorf("invalid total size %d", total) + } + if length < 0 { + return 0, 0, fmt.Errorf("invalid length %d", length) + } + start = offset + if start < 0 { + start = total + start + } + if start < 0 || start > total { + return 0, 0, fmt.Errorf("invalid offset %d for total size %d", offset, total) + } + if length == 0 { + end = total + } else { + end = start + length + if end < start { + return 0, 0, fmt.Errorf("range overflow") + } + if end > total { + end = 
total + } + } + return start, end, nil +} + +func sliceChunkParts(parts []chunkPartRef, start, end int64) ([]chunkPartSlice, error) { + if start < 0 || end < 0 || end < start { + return nil, fmt.Errorf("invalid start/end: %d/%d", start, end) + } + if start == end { + return nil, nil + } + + var ( + out []chunkPartSlice + pos int64 + ) + + for _, p := range parts { + if p.size == 0 { + return nil, fmt.Errorf("invalid part size 0") + } + partStart := pos + partEnd := pos + int64(p.size) + if partEnd < partStart { + return nil, fmt.Errorf("part size overflow") + } + + if end <= partStart { + break + } + if start >= partEnd { + pos = partEnd + continue + } + + s := start + if s < partStart { + s = partStart + } + e := end + if e > partEnd { + e = partEnd + } + if e > s { + out = append(out, chunkPartSlice{ + oidHex: p.oidHex, + offset: s - partStart, + length: e - s, + }) + } + pos = partEnd + } + + if len(out) == 0 { + return nil, fmt.Errorf("range [%d,%d) not covered by parts", start, end) + } + var covered int64 + for _, s := range out { + covered += s.length + } + if covered != (end - start) { + return nil, fmt.Errorf("range [%d,%d) not fully covered by parts", start, end) + } + return out, nil +} + func (m *multiPartReadCloser) Close() error { if m.curRC != nil { err := m.curRC.Close() @@ -746,7 +846,7 @@ func (gbs *GitBlobstore) hashChunkedParts(ctx context.Context, reader io.Reader) return partOIDs, nil } -func (gbs *GitBlobstore) hashParts(ctx context.Context, reader io.Reader) (parts []gitbs.PartRef, partOIDs []git.OID, total uint64, err error) { +func (gbs *GitBlobstore) hashParts(ctx context.Context, reader io.Reader) (parts []chunkPartRef, partOIDs []git.OID, total uint64, err error) { max := int64(gbs.maxPartSize) if max <= 0 { return nil, nil, 0, fmt.Errorf("gitblobstore: invalid maxPartSize %d", gbs.maxPartSize) @@ -773,7 +873,7 @@ func (gbs *GitBlobstore) hashParts(ctx context.Context, reader io.Reader) (parts return nil, nil, 0, err } partOIDs = 
append(partOIDs, oid) - parts = append(parts, gitbs.PartRef{OIDHex: oid.String(), Size: uint64(n)}) + parts = append(parts, chunkPartRef{oidHex: oid.String(), size: uint64(n)}) total += uint64(n) if errors.Is(rerr, io.ErrUnexpectedEOF) { break diff --git a/go/store/blobstore/git_blobstore_helpers_test.go b/go/store/blobstore/git_blobstore_helpers_test.go index 3e1864b4c5..89be72406e 100644 --- a/go/store/blobstore/git_blobstore_helpers_test.go +++ b/go/store/blobstore/git_blobstore_helpers_test.go @@ -218,15 +218,15 @@ func TestGitBlobstoreHelpers_validateAndSizeChunkedParts(t *testing.T) { gbs := &GitBlobstore{api: api} parts, total, err := gbs.validateAndSizeChunkedParts(ctx, []git.TreeEntry{ - {Name: "0001", Type: "blob", OID: "0123456789abcdef0123456789abcdef01234567"}, - {Name: "0002", Type: "blob", OID: "89abcdef0123456789abcdef0123456789abcdef"}, + {Name: "0001", Type: git.ObjectTypeBlob, OID: "0123456789abcdef0123456789abcdef01234567"}, + {Name: "0002", Type: git.ObjectTypeBlob, OID: "89abcdef0123456789abcdef0123456789abcdef"}, }) require.NoError(t, err) require.Equal(t, uint64(8), total) require.Len(t, parts, 2) - require.Equal(t, "0123456789abcdef0123456789abcdef01234567", parts[0].OIDHex) - require.Equal(t, uint64(3), parts[0].Size) + require.Equal(t, "0123456789abcdef0123456789abcdef01234567", parts[0].oidHex) + require.Equal(t, uint64(3), parts[0].size) - _, _, err = gbs.validateAndSizeChunkedParts(ctx, []git.TreeEntry{{Name: "1", Type: "blob", OID: "0123456789abcdef0123456789abcdef01234567"}}) + _, _, err = gbs.validateAndSizeChunkedParts(ctx, []git.TreeEntry{{Name: "1", Type: git.ObjectTypeBlob, OID: "0123456789abcdef0123456789abcdef01234567"}}) require.Error(t, err) } diff --git a/go/store/blobstore/git_blobstore_multipart_test.go b/go/store/blobstore/git_blobstore_multipart_test.go index 45b48c9ec5..a28636e291 100644 --- a/go/store/blobstore/git_blobstore_multipart_test.go +++ b/go/store/blobstore/git_blobstore_multipart_test.go @@ -24,7 +24,6 @@ 
import ( "github.com/stretchr/testify/require" git "github.com/dolthub/dolt/go/store/blobstore/internal/git" - gitbs "github.com/dolthub/dolt/go/store/blobstore/internal/gitbs" ) type trackingReadCloser struct { @@ -59,9 +58,9 @@ func TestMultiPartReadCloser_ReadConcatenatesAcrossPartsWithOffsets(t *testing.T rc := &multiPartReadCloser{ ctx: ctx, api: api, - slices: []gitbs.PartSlice{ - {OIDHex: oid1, Offset: 1, Length: 3}, // "ell" - {OIDHex: oid2, Offset: 2, Length: 3}, // "rld" + slices: []chunkPartSlice{ + {oidHex: oid1, offset: 1, length: 3}, // "ell" + {oidHex: oid2, offset: 2, length: 3}, // "rld" }, } defer func() { _ = rc.Close() }() @@ -84,8 +83,8 @@ func TestMultiPartReadCloser_ReadUnexpectedEOFWhenPartShorterThanDeclared(t *tes rc := &multiPartReadCloser{ ctx: ctx, api: api, - slices: []gitbs.PartSlice{ - {OIDHex: oid, Offset: 0, Length: 3}, // expect 3 bytes, only 2 available + slices: []chunkPartSlice{ + {oidHex: oid, offset: 0, length: 3}, // expect 3 bytes, only 2 available }, } defer func() { _ = rc.Close() }() @@ -110,8 +109,8 @@ func TestMultiPartReadCloser_CloseClosesUnderlyingPartReader(t *testing.T) { rc := &multiPartReadCloser{ ctx: ctx, api: api, - slices: []gitbs.PartSlice{ - {OIDHex: oid, Offset: 0, Length: 1}, + slices: []chunkPartSlice{ + {oidHex: oid, offset: 0, length: 1}, }, } diff --git a/go/store/blobstore/internal/gitbs/ranges.go b/go/store/blobstore/internal/gitbs/ranges.go deleted file mode 100644 index e54ec0988c..0000000000 --- a/go/store/blobstore/internal/gitbs/ranges.go +++ /dev/null @@ -1,196 +0,0 @@ -// Copyright 2026 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gitbs - -import "fmt" - -// PartRef describes one part of a logically concatenated object. -type PartRef struct { - OIDHex string - Size uint64 -} - -// PartSlice describes a contiguous slice to read from a particular part. -type PartSlice struct { - OIDHex string - // Offset is the byte offset into the part at which to begin reading. - Offset int64 - // Length is the number of bytes to read from the part slice. - Length int64 -} - -// NormalizeRange converts (offset,length) with possible negative offsets into a -// concrete half-open interval [start,end) over an object of total size |total|. -// -// Semantics match blobstore.BlobRange: -// - offset < 0 means relative to end (start = total + offset) -// - length == 0 means "to end" -// - length < 0 is invalid -func NormalizeRange(total int64, offset int64, length int64) (start, end int64, err error) { - if err := validateNormalizeRangeInputs(total, length); err != nil { - return 0, 0, err - } - start, err = normalizeStart(total, offset) - if err != nil { - return 0, 0, err - } - end, err = normalizeEnd(total, start, length) - if err != nil { - return 0, 0, err - } - return start, end, nil -} - -// SliceParts maps a logical range [start,end) over the concatenation of |parts| -// into per-part slices. 
-// -// - start/end are byte offsets in the logical object (0 <= start <= end <= total) -// - parts must have Size > 0 -func SliceParts(parts []PartRef, start, end int64) ([]PartSlice, error) { - if err := validateStartEnd(start, end); err != nil { - return nil, err - } - if isEmptyRange(start, end) { - return nil, nil - } - - var ( - out []PartSlice - pos int64 // start offset of current part in logical stream - ) - - for _, p := range parts { - partStart, partEnd, err := partBounds(pos, p.Size) - if err != nil { - return nil, err - } - - // Does this part overlap [start,end)? - if end <= partStart { - break - } - if start >= partEnd { - pos = partEnd - continue - } - - if s, e, ok := overlap(partStart, partEnd, start, end); ok { - out = append(out, newPartSlice(p.OIDHex, partStart, s, e)) - } - pos = partEnd - } - - return validateCoverage(out, start, end) -} - -func validateNormalizeRangeInputs(total int64, length int64) error { - if total < 0 { - return fmt.Errorf("invalid total size %d", total) - } - if length < 0 { - return fmt.Errorf("invalid length %d", length) - } - return nil -} - -func normalizeStart(total int64, offset int64) (int64, error) { - start := offset - if start < 0 { - start = total + start - } - if start < 0 || start > total { - return 0, fmt.Errorf("invalid offset %d for total size %d", offset, total) - } - return start, nil -} - -func normalizeEnd(total int64, start int64, length int64) (int64, error) { - if length == 0 { - return total, nil - } - end := start + length - if end < start { - return 0, fmt.Errorf("range overflow") - } - if end > total { - end = total - } - return end, nil -} - -func validateStartEnd(start, end int64) error { - if start < 0 || end < 0 || end < start { - return fmt.Errorf("invalid start/end: %d/%d", start, end) - } - return nil -} - -func isEmptyRange(start, end int64) bool { - return start == end -} - -func partBounds(pos int64, size uint64) (start, end int64, err error) { - if size == 0 { - return 0, 0, 
fmt.Errorf("invalid part size 0") - } - start = pos - end = pos + int64(size) - if end < start { - return 0, 0, fmt.Errorf("part size overflow") - } - return start, end, nil -} - -func overlap(partStart, partEnd, start, end int64) (s, e int64, ok bool) { - s = start - if s < partStart { - s = partStart - } - e = end - if e > partEnd { - e = partEnd - } - if e <= s { - return 0, 0, false - } - return s, e, true -} - -func newPartSlice(oidHex string, partStart, s, e int64) PartSlice { - return PartSlice{ - OIDHex: oidHex, - Offset: s - partStart, - Length: e - s, - } -} - -func validateCoverage(out []PartSlice, start, end int64) ([]PartSlice, error) { - // Validate that the requested interval was fully covered by parts. - if len(out) == 0 { - return nil, fmt.Errorf("range [%d,%d) not covered by parts", start, end) - } - covered := coveredLength(out) - if covered != (end - start) { - return nil, fmt.Errorf("range [%d,%d) not fully covered by parts", start, end) - } - return out, nil -} - -func coveredLength(slices []PartSlice) (covered int64) { - for _, s := range slices { - covered += s.Length - } - return covered -} diff --git a/go/store/blobstore/internal/gitbs/ranges_helpers_test.go b/go/store/blobstore/internal/gitbs/ranges_helpers_test.go deleted file mode 100644 index d4f66b315f..0000000000 --- a/go/store/blobstore/internal/gitbs/ranges_helpers_test.go +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2026 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package gitbs - -import ( - "math" - "testing" - - "github.com/stretchr/testify/require" -) - -func TestRangesHelpers_normalizeStartEnd(t *testing.T) { - start, err := normalizeStart(10, -2) - require.NoError(t, err) - require.Equal(t, int64(8), start) - - _, err = normalizeStart(10, 11) - require.Error(t, err) - - end, err := normalizeEnd(10, 2, 0) - require.NoError(t, err) - require.Equal(t, int64(10), end) - - end, err = normalizeEnd(10, 2, 100) - require.NoError(t, err) - require.Equal(t, int64(10), end) -} - -func TestRangesHelpers_partBoundsAndOverlap(t *testing.T) { - _, _, err := partBounds(0, 0) - require.Error(t, err) - - // Force int64 overflow path: end wraps negative, so end < start. - _, _, err = partBounds(math.MaxInt64-1, 10) - require.Error(t, err) - - s, e, ok := overlap(0, 10, 2, 5) - require.True(t, ok) - require.Equal(t, int64(2), s) - require.Equal(t, int64(5), e) - - _, _, ok = overlap(0, 10, 10, 12) - require.False(t, ok) -} - -func TestRangesHelpers_validateCoverage(t *testing.T) { - _, err := validateCoverage(nil, 0, 1) - require.Error(t, err) - - _, err = validateCoverage([]PartSlice{{OIDHex: "a", Offset: 0, Length: 1}}, 0, 2) - require.Error(t, err) - - out, err := validateCoverage([]PartSlice{{OIDHex: "a", Offset: 0, Length: 2}}, 0, 2) - require.NoError(t, err) - require.Len(t, out, 1) -} diff --git a/go/store/blobstore/internal/gitbs/ranges_test.go b/go/store/blobstore/internal/gitbs/ranges_test.go deleted file mode 100644 index 1b1b4ea768..0000000000 --- a/go/store/blobstore/internal/gitbs/ranges_test.go +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2026 Dolthub, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gitbs - -import ( - "testing" - - "github.com/stretchr/testify/require" -) - -func TestNormalizeRange(t *testing.T) { - start, end, err := NormalizeRange(10, 0, 0) - require.NoError(t, err) - require.Equal(t, int64(0), start) - require.Equal(t, int64(10), end) - - start, end, err = NormalizeRange(10, 2, 3) - require.NoError(t, err) - require.Equal(t, int64(2), start) - require.Equal(t, int64(5), end) - - start, end, err = NormalizeRange(10, -3, 0) - require.NoError(t, err) - require.Equal(t, int64(7), start) - require.Equal(t, int64(10), end) - - start, end, err = NormalizeRange(10, -3, 2) - require.NoError(t, err) - require.Equal(t, int64(7), start) - require.Equal(t, int64(9), end) - - _, _, err = NormalizeRange(10, 11, 0) - require.Error(t, err) - - _, _, err = NormalizeRange(10, -11, 0) - require.Error(t, err) - - _, _, err = NormalizeRange(10, 0, -1) - require.Error(t, err) -} - -func TestSliceParts(t *testing.T) { - parts := []PartRef{ - {OIDHex: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", Size: 3}, - {OIDHex: "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", Size: 4}, - {OIDHex: "cccccccccccccccccccccccccccccccccccccccc", Size: 2}, - } - - slices, err := SliceParts(parts, 0, 9) - require.NoError(t, err) - require.Len(t, slices, 3) - require.Equal(t, int64(3), slices[0].Length) - require.Equal(t, int64(4), slices[1].Length) - require.Equal(t, int64(2), slices[2].Length) - - // Middle slice spanning two parts: [2,5) covers a[2:] + b[:2] - slices, err = SliceParts(parts, 2, 5) - require.NoError(t, err) - require.Equal(t, []PartSlice{ - 
{OIDHex: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", Offset: 2, Length: 1}, - {OIDHex: "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", Offset: 0, Length: 2}, - }, slices) - - // Single-part slice: [3,7) maps to b[0:4] - slices, err = SliceParts(parts, 3, 7) - require.NoError(t, err) - require.Equal(t, []PartSlice{ - {OIDHex: "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", Offset: 0, Length: 4}, - }, slices) -} From 333879d085ab69a87d4175791689209bb0c211e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Fri, 6 Feb 2026 07:26:48 -0800 Subject: [PATCH 17/28] /go/store/blobstore: fixes --- go/store/blobstore/git_blobstore.go | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index f3053dfd49..a7f771e4f8 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -165,7 +165,7 @@ func (gbs *GitBlobstore) Get(ctx context.Context, key string, br BlobRange) (io. case git.ObjectTypeTree: // Per-key version: tree object id at this key. - rc, sz, _, err := gbs.openChunkedTreeRange(ctx, commit, key, oid, br) + rc, sz, _, err := gbs.openChunkedTreeRange(ctx, commit, key, br) return rc, sz, oid.String(), err default: @@ -217,10 +217,9 @@ type limitReadCloser struct { func (l *limitReadCloser) Read(p []byte) (int, error) { return l.r.Read(p) } func (l *limitReadCloser) Close() error { return l.c.Close() } -func (gbs *GitBlobstore) openChunkedTreeRange(ctx context.Context, commit git.OID, key string, treeOID git.OID, br BlobRange) (io.ReadCloser, uint64, string, error) { +func (gbs *GitBlobstore) openChunkedTreeRange(ctx context.Context, commit git.OID, key string, br BlobRange) (io.ReadCloser, uint64, string, error) { ver := commit.String() - _ = treeOID // treeOID is informational; ListTree resolves by path. 
entries, err := gbs.api.ListTree(ctx, commit, key) if err != nil { return nil, 0, ver, err @@ -433,6 +432,15 @@ func (m *multiPartReadCloser) readCurrent(p []byte) (int, error) { return 0, err } +func (m *multiPartReadCloser) Close() error { + if m.curRC != nil { + err := m.curRC.Close() + m.curRC = nil + return err + } + return nil +} + func skipN(r io.Reader, n int64) error { if n <= 0 { return nil @@ -531,15 +539,6 @@ func sliceChunkParts(parts []chunkPartRef, start, end int64) ([]chunkPartSlice, return out, nil } -func (m *multiPartReadCloser) Close() error { - if m.curRC != nil { - err := m.curRC.Close() - m.curRC = nil - return err - } - return nil -} - func min(a, b int) int { if a < b { return a From ee528134ff085aa125149fd327991d187219d1bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Fri, 6 Feb 2026 07:48:26 -0800 Subject: [PATCH 18/28] /go/store/blobstore: reorg --- go/store/blobstore/git_blobstore.go | 949 ++++++++++++++-------------- 1 file changed, 471 insertions(+), 478 deletions(-) diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index a7f771e4f8..0a96b7cd9e 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -173,6 +173,232 @@ func (gbs *GitBlobstore) Get(ctx context.Context, key string, br BlobRange) (io. } } +func (gbs *GitBlobstore) Put(ctx context.Context, key string, totalSize int64, reader io.Reader) (string, error) { + key, err := normalizeGitTreePath(key) + if err != nil { + return "", err + } + + // Many NBS/table-file writes are content-addressed: if the key already exists, callers + // assume it refers to the same bytes and treat the operation as idempotent. + // + // The manifest is the main exception (it is mutable and updated via CheckAndPut), so + // we only apply this fast-path for non-manifest keys. 
+ if key != "manifest" { + commit, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) + if err != nil { + return "", err + } + if ok { + oid, _, err := gbs.api.ResolvePathObject(ctx, commit, key) + if err == nil { + // Per-key version: existing object id. + return oid.String(), nil + } + if !git.IsPathNotFound(err) { + return "", err + } + } + } + + msg := fmt.Sprintf("gitblobstore: put %s", key) + + // Hash the contents once. If we need to retry due to concurrent updates to |gbs.ref|, + // we can reuse the resulting object OIDs without re-reading |reader|. + plan, err := gbs.planPutWrites(ctx, key, totalSize, reader) + if err != nil { + return "", err + } + + // Make Put resilient to concurrent writers updating unrelated keys by using a CAS loop + // under the hood. This matches typical object-store semantics more closely than an + // unconditional ref update (which could clobber other keys). + const maxRetries = 31 // 32 total attempts (initial + retries) + bo := backoff.NewExponentialBackOff() + bo.InitialInterval = 5 * time.Millisecond + bo.Multiplier = 2 + bo.MaxInterval = 320 * time.Millisecond + bo.RandomizationFactor = 0 // deterministic; can add jitter later if needed + bo.Reset() + policy := backoff.WithContext(backoff.WithMaxRetries(bo, maxRetries), ctx) + + var ver string + op := func() error { + parent, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) + if err != nil { + return backoff.Permanent(err) + } + + newCommit, err := gbs.buildCommitForKeyWrite(ctx, parent, ok, key, plan, msg) + if err != nil { + return backoff.Permanent(err) + } + + if !ok { + // Create-only CAS: oldOID=all-zero requires the ref to not exist. This avoids + // losing concurrent writes when multiple goroutines create the ref at once. 
+ const zeroOID = git.OID("0000000000000000000000000000000000000000") + if err := gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, zeroOID, msg); err != nil { + if gbs.refAdvanced(ctx, parent) { + return err + } + return backoff.Permanent(err) + } + oid, _, err := gbs.api.ResolvePathObject(ctx, newCommit, key) + if err != nil { + return backoff.Permanent(err) + } + ver = oid.String() + return nil + } + + err = gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, parent, msg) + if err == nil { + oid, _, err := gbs.api.ResolvePathObject(ctx, newCommit, key) + if err != nil { + return backoff.Permanent(err) + } + ver = oid.String() + return nil + } + + // If the ref changed since we read |parent|, retry on the new head. Otherwise + // surface the error (e.g. permissions, corruption). + if gbs.refAdvanced(ctx, parent) { + return err + } + return backoff.Permanent(err) + } + + if err := backoff.Retry(op, policy); err != nil { + if ctx.Err() != nil { + return "", ctx.Err() + } + return "", err + } + return ver, nil +} + +func (gbs *GitBlobstore) CheckAndPut(ctx context.Context, expectedVersion, key string, totalSize int64, reader io.Reader) (string, error) { + key, err := normalizeGitTreePath(key) + if err != nil { + return "", err + } + + msg := fmt.Sprintf("gitblobstore: checkandput %s", key) + + // Implement per-key CAS by validating |expectedVersion| against the current key version + // at HEAD, then committing on that HEAD and CAS-updating the ref. If the ref advances, + // retry by re-checking the key version. 
+ const maxRetries = 31 // 32 total attempts (initial + retries) + bo := backoff.NewExponentialBackOff() + bo.InitialInterval = 5 * time.Millisecond + bo.Multiplier = 2 + bo.MaxInterval = 320 * time.Millisecond + bo.RandomizationFactor = 0 // deterministic; can add jitter later if needed + bo.Reset() + policy := backoff.WithContext(backoff.WithMaxRetries(bo, maxRetries), ctx) + + var newKeyVersion string + var cachedPlan *putPlan + op := func() error { + parent, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) + if err != nil { + return backoff.Permanent(err) + } + + actualKeyVersion, err := gbs.currentKeyVersion(ctx, parent, ok, key) + if err != nil { + return backoff.Permanent(err) + } + if expectedVersion != actualKeyVersion { + return backoff.Permanent(CheckAndPutError{Key: key, ExpectedVersion: expectedVersion, ActualVersion: actualKeyVersion}) + } + + // Only hash/consume the reader once we know the expectedVersion matches. + // If we need to retry due to unrelated ref advances, reuse the cached plan so we + // don't re-read |reader| (which may not be rewindable). + if cachedPlan == nil { + plan, err := gbs.planPutWrites(ctx, key, totalSize, reader) + if err != nil { + return backoff.Permanent(err) + } + cachedPlan = &plan + } + + newCommit, err := gbs.buildCommitForKeyWrite(ctx, parent, ok, key, *cachedPlan, msg) + if err != nil { + return backoff.Permanent(err) + } + + if !ok { + // Create-only CAS: oldOID=all-zero requires the ref to not exist. + const zeroOID = git.OID("0000000000000000000000000000000000000000") + if err := gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, zeroOID, msg); err != nil { + // If the ref now exists, retry; otherwise surface the error. 
+ if gbs.refAdvanced(ctx, parent) { + return err + } + return backoff.Permanent(err) + } + } else { + if err := gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, parent, msg); err != nil { + if gbs.refAdvanced(ctx, parent) { + return err + } + return backoff.Permanent(err) + } + } + + oid, _, err := gbs.api.ResolvePathObject(ctx, newCommit, key) + if err != nil { + return backoff.Permanent(err) + } + newKeyVersion = oid.String() + return nil + } + + if err := backoff.Retry(op, policy); err != nil { + if ctx.Err() != nil { + return "", ctx.Err() + } + return "", err + } + return newKeyVersion, nil +} + +func (gbs *GitBlobstore) Concatenate(ctx context.Context, key string, sources []string) (string, error) { + // Chunked-object support is landing in phases. Concatenate is the final piece + // needed for NBS conjoin and is intentionally left unimplemented on this branch. + // + // Keep key validation for consistent error behavior. + _, err := normalizeGitTreePath(key) + if err != nil { + return "", err + } + for _, src := range sources { + if _, err := normalizeGitTreePath(src); err != nil { + return "", err + } + } + return "", git.ErrUnimplemented +} + +func (gbs *GitBlobstore) currentKeyVersion(ctx context.Context, commit git.OID, haveCommit bool, key string) (string, error) { + if !haveCommit { + // Ref missing => empty store => key missing. 
+ return "", nil + } + oid, _, err := gbs.api.ResolvePathObject(ctx, commit, key) + if err != nil { + if git.IsPathNotFound(err) { + return "", nil + } + return "", err + } + return oid.String(), nil +} + func (gbs *GitBlobstore) resolveCommitForGet(ctx context.Context, key string) (commit git.OID, err error) { commit, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) if err != nil { @@ -209,14 +435,6 @@ func (gbs *GitBlobstore) resolveBlobSizeForGet(ctx context.Context, commit git.O return sz, commit.String(), nil } -type limitReadCloser struct { - r io.Reader - c io.Closer -} - -func (l *limitReadCloser) Read(p []byte) (int, error) { return l.r.Read(p) } -func (l *limitReadCloser) Close() error { return l.c.Close() } - func (gbs *GitBlobstore) openChunkedTreeRange(ctx context.Context, commit git.OID, key string, br BlobRange) (io.ReadCloser, uint64, string, error) { ver := commit.String() @@ -296,6 +514,251 @@ func (gbs *GitBlobstore) validateAndSizeChunkedParts(ctx context.Context, entrie return parts, total, nil } +func (gbs *GitBlobstore) buildCommitWithMessage(ctx context.Context, parent git.OID, hasParent bool, key string, blobOID git.OID, msg string) (git.OID, error) { + return gbs.buildCommitWithWrites(ctx, parent, hasParent, []treeWrite{{path: key, oid: blobOID}}, msg) +} + +func (gbs *GitBlobstore) buildCommitWithWrites(ctx context.Context, parent git.OID, hasParent bool, writes []treeWrite, msg string) (git.OID, error) { + _, indexFile, cleanup, err := newTempIndex() + if err != nil { + return "", err + } + defer cleanup() + + if hasParent { + if err := gbs.api.ReadTree(ctx, parent, indexFile); err != nil { + return "", err + } + } else { + if err := gbs.api.ReadTreeEmpty(ctx, indexFile); err != nil { + return "", err + } + } + + // TODO(gitblobstore): Decide on a policy for file-vs-directory prefix conflicts when staging keys. 
+ // For example, staging "a" when "a/b" already exists in the tree/index (or vice-versa) can fail + // with a git index error (path appears as both a file and directory). Today our NBS keyspace is + // flat (e.g. "manifest", "<tableid>", "<tableid>.records"), so this should not occur. If we ever + // namespace keys into directories, consider proactively removing conflicting paths from the index + // before UpdateIndexCacheInfo so Put/CheckAndPut remain robust. + sort.Slice(writes, func(i, j int) bool { return writes[i].path < writes[j].path }) + for _, w := range writes { + if err := gbs.api.UpdateIndexCacheInfo(ctx, indexFile, "100644", w.oid, w.path); err != nil { + return "", err + } + } + + treeOID, err := gbs.api.WriteTree(ctx, indexFile) + if err != nil { + return "", err + } + + var parentPtr *git.OID + if hasParent && parent != "" { + p := parent + parentPtr = &p + } + + // Prefer git's default identity from env/config when not explicitly configured. + commitOID, err := gbs.api.CommitTree(ctx, treeOID, parentPtr, msg, gbs.identity) + if err != nil && gbs.identity == nil && isMissingGitIdentityErr(err) { + commitOID, err = gbs.api.CommitTree(ctx, treeOID, parentPtr, msg, defaultGitBlobstoreIdentity()) + } + if err != nil { + return "", err + } + + return commitOID, nil +} + +func (gbs *GitBlobstore) buildCommitForKeyWrite(ctx context.Context, parent git.OID, hasParent bool, key string, plan putPlan, msg string) (git.OID, error) { + _, indexFile, cleanup, err := newTempIndex() + if err != nil { + return "", err + } + defer cleanup() + + if hasParent { + if err := gbs.api.ReadTree(ctx, parent, indexFile); err != nil { + return "", err + } + } else { + if err := gbs.api.ReadTreeEmpty(ctx, indexFile); err != nil { + return "", err + } + } + + if hasParent { + if err := gbs.removeKeyConflictsFromIndex(ctx, parent, indexFile, key, plan.chunked); err != nil { + return "", err + } + } + + sort.Slice(plan.writes, func(i, j int) bool { return plan.writes[i].path < plan.writes[j].path 
}) + for _, w := range plan.writes { + if err := gbs.api.UpdateIndexCacheInfo(ctx, indexFile, "100644", w.oid, w.path); err != nil { + return "", err + } + } + + treeOID, err := gbs.api.WriteTree(ctx, indexFile) + if err != nil { + return "", err + } + + var parentPtr *git.OID + if hasParent && parent != "" { + p := parent + parentPtr = &p + } + + commitOID, err := gbs.api.CommitTree(ctx, treeOID, parentPtr, msg, gbs.identity) + if err != nil && gbs.identity == nil && isMissingGitIdentityErr(err) { + commitOID, err = gbs.api.CommitTree(ctx, treeOID, parentPtr, msg, defaultGitBlobstoreIdentity()) + } + if err != nil { + return "", err + } + return commitOID, nil +} + +func (gbs *GitBlobstore) removeKeyConflictsFromIndex(ctx context.Context, parent git.OID, indexFile string, key string, newIsChunked bool) error { + oid, typ, err := gbs.api.ResolvePathObject(ctx, parent, key) + if err != nil { + if git.IsPathNotFound(err) { + return nil + } + return err + } + _ = oid + + switch typ { + case git.ObjectTypeBlob: + if newIsChunked { + // blob -> tree: must remove the file entry at <key> + return gbs.api.RemoveIndexPaths(ctx, indexFile, []string{key}) + } + return nil + + case git.ObjectTypeTree: + // tree -> blob OR tree overwrite: remove old child entries under <key>/... + entries, err := gbs.api.ListTree(ctx, parent, key) + if err != nil { + return err + } + if len(entries) == 0 { + return nil + } + paths := make([]string, 0, len(entries)) + for _, e := range entries { + paths = append(paths, key+"/"+e.Name) + } + return gbs.api.RemoveIndexPaths(ctx, indexFile, paths) + + default: + return fmt.Errorf("gitblobstore: unsupported existing object type %q at key %q", typ, key) + } +} + +func (gbs *GitBlobstore) planPutWrites(ctx context.Context, key string, totalSize int64, reader io.Reader) (putPlan, error) { + // Minimal policy: chunk only when explicitly enabled and |totalSize| exceeds MaxPartSize. 
+ if gbs.maxPartSize == 0 || totalSize <= 0 || uint64(totalSize) <= gbs.maxPartSize { + blobOID, err := gbs.api.HashObject(ctx, reader) + if err != nil { + return putPlan{}, err + } + return putPlan{writes: []treeWrite{{path: key, oid: blobOID}}}, nil + } + + partOIDs, err := gbs.hashChunkedParts(ctx, reader) + if err != nil { + return putPlan{}, err + } + + writes := make([]treeWrite, 0, len(partOIDs)) + for i, p := range partOIDs { + partName := fmt.Sprintf("%0*d", gitblobstorePartNameWidth, i+1) + writes = append(writes, treeWrite{path: key + "/" + partName, oid: p}) + } + return putPlan{writes: writes, chunked: true}, nil +} + +func (gbs *GitBlobstore) hashChunkedParts(ctx context.Context, reader io.Reader) (partOIDs []git.OID, err error) { + max := int64(gbs.maxPartSize) + if max <= 0 { + return nil, fmt.Errorf("gitblobstore: invalid maxPartSize %d", gbs.maxPartSize) + } + + _, partOIDs, _, err = gbs.hashParts(ctx, reader) + if err != nil { + return nil, err + } + return partOIDs, nil +} + +func (gbs *GitBlobstore) hashParts(ctx context.Context, reader io.Reader) (parts []chunkPartRef, partOIDs []git.OID, total uint64, err error) { + max := int64(gbs.maxPartSize) + if max <= 0 { + return nil, nil, 0, fmt.Errorf("gitblobstore: invalid maxPartSize %d", gbs.maxPartSize) + } + + buf := make([]byte, max) + for { + n, rerr := io.ReadFull(reader, buf) + if rerr != nil { + if errors.Is(rerr, io.EOF) { + break + } + if !errors.Is(rerr, io.ErrUnexpectedEOF) { + return nil, nil, 0, rerr + } + // ErrUnexpectedEOF: process final short chunk and stop. + } + if n == 0 { + break + } + partBytes := append([]byte(nil), buf[:n]...) 
+ oid, err := gbs.api.HashObject(ctx, bytes.NewReader(partBytes)) + if err != nil { + return nil, nil, 0, err + } + partOIDs = append(partOIDs, oid) + parts = append(parts, chunkPartRef{oidHex: oid.String(), size: uint64(n)}) + total += uint64(n) + if errors.Is(rerr, io.ErrUnexpectedEOF) { + break + } + } + return parts, partOIDs, total, nil +} + +func (gbs *GitBlobstore) refAdvanced(ctx context.Context, old git.OID) bool { + if ctx.Err() != nil { + return false + } + cur, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) + return err == nil && ok && cur != old +} + +type treeWrite struct { + path string + oid git.OID +} + +type putPlan struct { + writes []treeWrite + // If true, the key should be represented as a tree (chunked parts under key/NNNNNNNN). + chunked bool +} + +type limitReadCloser struct { + r io.Reader + c io.Closer +} + +func (l *limitReadCloser) Read(p []byte) (int, error) { return l.r.Read(p) } +func (l *limitReadCloser) Close() error { return l.c.Close() } + func sliceInlineBlob(rc io.ReadCloser, sz int64, br BlobRange, ver string) (io.ReadCloser, uint64, string, error) { // Implement BlobRange by slicing the streamed blob contents. // TODO(gitblobstore): This streaming implementation is correct but may be slow for workloads @@ -539,348 +1002,6 @@ func sliceChunkParts(parts []chunkPartRef, start, end int64) ([]chunkPartSlice, return out, nil } -func min(a, b int) int { - if a < b { - return a - } - return b -} - -func (gbs *GitBlobstore) Put(ctx context.Context, key string, totalSize int64, reader io.Reader) (string, error) { - key, err := normalizeGitTreePath(key) - if err != nil { - return "", err - } - - // Many NBS/table-file writes are content-addressed: if the key already exists, callers - // assume it refers to the same bytes and treat the operation as idempotent. - // - // The manifest is the main exception (it is mutable and updated via CheckAndPut), so - // we only apply this fast-path for non-manifest keys. 
- if key != "manifest" { - commit, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) - if err != nil { - return "", err - } - if ok { - oid, _, err := gbs.api.ResolvePathObject(ctx, commit, key) - if err == nil { - // Per-key version: existing object id. - return oid.String(), nil - } - if !git.IsPathNotFound(err) { - return "", err - } - } - } - - msg := fmt.Sprintf("gitblobstore: put %s", key) - - // Hash the contents once. If we need to retry due to concurrent updates to |gbs.ref|, - // we can reuse the resulting object OIDs without re-reading |reader|. - plan, err := gbs.planPutWrites(ctx, key, totalSize, reader) - if err != nil { - return "", err - } - - // Make Put resilient to concurrent writers updating unrelated keys by using a CAS loop - // under the hood. This matches typical object-store semantics more closely than an - // unconditional ref update (which could clobber other keys). - const maxRetries = 31 // 32 total attempts (initial + retries) - bo := backoff.NewExponentialBackOff() - bo.InitialInterval = 5 * time.Millisecond - bo.Multiplier = 2 - bo.MaxInterval = 320 * time.Millisecond - bo.RandomizationFactor = 0 // deterministic; can add jitter later if needed - bo.Reset() - policy := backoff.WithContext(backoff.WithMaxRetries(bo, maxRetries), ctx) - - var ver string - op := func() error { - parent, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) - if err != nil { - return backoff.Permanent(err) - } - - newCommit, err := gbs.buildCommitForKeyWrite(ctx, parent, ok, key, plan, msg) - if err != nil { - return backoff.Permanent(err) - } - - if !ok { - // Create-only CAS: oldOID=all-zero requires the ref to not exist. This avoids - // losing concurrent writes when multiple goroutines create the ref at once. 
- const zeroOID = git.OID("0000000000000000000000000000000000000000") - if err := gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, zeroOID, msg); err != nil { - if gbs.refAdvanced(ctx, parent) { - return err - } - return backoff.Permanent(err) - } - oid, _, err := gbs.api.ResolvePathObject(ctx, newCommit, key) - if err != nil { - return backoff.Permanent(err) - } - ver = oid.String() - return nil - } - - err = gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, parent, msg) - if err == nil { - oid, _, err := gbs.api.ResolvePathObject(ctx, newCommit, key) - if err != nil { - return backoff.Permanent(err) - } - ver = oid.String() - return nil - } - - // If the ref changed since we read |parent|, retry on the new head. Otherwise - // surface the error (e.g. permissions, corruption). - if gbs.refAdvanced(ctx, parent) { - return err - } - return backoff.Permanent(err) - } - - if err := backoff.Retry(op, policy); err != nil { - if ctx.Err() != nil { - return "", ctx.Err() - } - return "", err - } - return ver, nil -} - -type treeWrite struct { - path string - oid git.OID -} - -func (gbs *GitBlobstore) buildCommitWithMessage(ctx context.Context, parent git.OID, hasParent bool, key string, blobOID git.OID, msg string) (git.OID, error) { - return gbs.buildCommitWithWrites(ctx, parent, hasParent, []treeWrite{{path: key, oid: blobOID}}, msg) -} - -func (gbs *GitBlobstore) buildCommitWithWrites(ctx context.Context, parent git.OID, hasParent bool, writes []treeWrite, msg string) (git.OID, error) { - _, indexFile, cleanup, err := newTempIndex() - if err != nil { - return "", err - } - defer cleanup() - - if hasParent { - if err := gbs.api.ReadTree(ctx, parent, indexFile); err != nil { - return "", err - } - } else { - if err := gbs.api.ReadTreeEmpty(ctx, indexFile); err != nil { - return "", err - } - } - - // TODO(gitblobstore): Decide on a policy for file-vs-directory prefix conflicts when staging keys. 
- // For example, staging "a" when "a/b" already exists in the tree/index (or vice-versa) can fail - // with a git index error (path appears as both a file and directory). Today our NBS keyspace is - // flat (e.g. "manifest", "", ".records"), so this should not occur. If we ever - // namespace keys into directories, consider proactively removing conflicting paths from the index - // before UpdateIndexCacheInfo so Put/CheckAndPut remain robust. - sort.Slice(writes, func(i, j int) bool { return writes[i].path < writes[j].path }) - for _, w := range writes { - if err := gbs.api.UpdateIndexCacheInfo(ctx, indexFile, "100644", w.oid, w.path); err != nil { - return "", err - } - } - - treeOID, err := gbs.api.WriteTree(ctx, indexFile) - if err != nil { - return "", err - } - - var parentPtr *git.OID - if hasParent && parent != "" { - p := parent - parentPtr = &p - } - - // Prefer git's default identity from env/config when not explicitly configured. - commitOID, err := gbs.api.CommitTree(ctx, treeOID, parentPtr, msg, gbs.identity) - if err != nil && gbs.identity == nil && isMissingGitIdentityErr(err) { - commitOID, err = gbs.api.CommitTree(ctx, treeOID, parentPtr, msg, defaultGitBlobstoreIdentity()) - } - if err != nil { - return "", err - } - - return commitOID, nil -} - -func (gbs *GitBlobstore) buildCommitForKeyWrite(ctx context.Context, parent git.OID, hasParent bool, key string, plan putPlan, msg string) (git.OID, error) { - _, indexFile, cleanup, err := newTempIndex() - if err != nil { - return "", err - } - defer cleanup() - - if hasParent { - if err := gbs.api.ReadTree(ctx, parent, indexFile); err != nil { - return "", err - } - } else { - if err := gbs.api.ReadTreeEmpty(ctx, indexFile); err != nil { - return "", err - } - } - - if hasParent { - if err := gbs.removeKeyConflictsFromIndex(ctx, parent, indexFile, key, plan.chunked); err != nil { - return "", err - } - } - - sort.Slice(plan.writes, func(i, j int) bool { return plan.writes[i].path < plan.writes[j].path 
}) - for _, w := range plan.writes { - if err := gbs.api.UpdateIndexCacheInfo(ctx, indexFile, "100644", w.oid, w.path); err != nil { - return "", err - } - } - - treeOID, err := gbs.api.WriteTree(ctx, indexFile) - if err != nil { - return "", err - } - - var parentPtr *git.OID - if hasParent && parent != "" { - p := parent - parentPtr = &p - } - - commitOID, err := gbs.api.CommitTree(ctx, treeOID, parentPtr, msg, gbs.identity) - if err != nil && gbs.identity == nil && isMissingGitIdentityErr(err) { - commitOID, err = gbs.api.CommitTree(ctx, treeOID, parentPtr, msg, defaultGitBlobstoreIdentity()) - } - if err != nil { - return "", err - } - return commitOID, nil -} - -func (gbs *GitBlobstore) removeKeyConflictsFromIndex(ctx context.Context, parent git.OID, indexFile string, key string, newIsChunked bool) error { - oid, typ, err := gbs.api.ResolvePathObject(ctx, parent, key) - if err != nil { - if git.IsPathNotFound(err) { - return nil - } - return err - } - _ = oid - - switch typ { - case git.ObjectTypeBlob: - if newIsChunked { - // blob -> tree: must remove the file entry at - return gbs.api.RemoveIndexPaths(ctx, indexFile, []string{key}) - } - return nil - - case git.ObjectTypeTree: - // tree -> blob OR tree overwrite: remove old child entries under /... - entries, err := gbs.api.ListTree(ctx, parent, key) - if err != nil { - return err - } - if len(entries) == 0 { - return nil - } - paths := make([]string, 0, len(entries)) - for _, e := range entries { - paths = append(paths, key+"/"+e.Name) - } - return gbs.api.RemoveIndexPaths(ctx, indexFile, paths) - - default: - return fmt.Errorf("gitblobstore: unsupported existing object type %q at key %q", typ, key) - } -} - -type putPlan struct { - writes []treeWrite - // If true, the key should be represented as a tree (chunked parts under key/NNNNNNNN). 
- chunked bool -} - -func (gbs *GitBlobstore) planPutWrites(ctx context.Context, key string, totalSize int64, reader io.Reader) (putPlan, error) { - // Minimal policy: chunk only when explicitly enabled and |totalSize| exceeds MaxPartSize. - if gbs.maxPartSize == 0 || totalSize <= 0 || uint64(totalSize) <= gbs.maxPartSize { - blobOID, err := gbs.api.HashObject(ctx, reader) - if err != nil { - return putPlan{}, err - } - return putPlan{writes: []treeWrite{{path: key, oid: blobOID}}}, nil - } - - partOIDs, err := gbs.hashChunkedParts(ctx, reader) - if err != nil { - return putPlan{}, err - } - - writes := make([]treeWrite, 0, len(partOIDs)) - for i, p := range partOIDs { - partName := fmt.Sprintf("%0*d", gitblobstorePartNameWidth, i+1) - writes = append(writes, treeWrite{path: key + "/" + partName, oid: p}) - } - return putPlan{writes: writes, chunked: true}, nil -} - -func (gbs *GitBlobstore) hashChunkedParts(ctx context.Context, reader io.Reader) (partOIDs []git.OID, err error) { - max := int64(gbs.maxPartSize) - if max <= 0 { - return nil, fmt.Errorf("gitblobstore: invalid maxPartSize %d", gbs.maxPartSize) - } - - _, partOIDs, _, err = gbs.hashParts(ctx, reader) - if err != nil { - return nil, err - } - return partOIDs, nil -} - -func (gbs *GitBlobstore) hashParts(ctx context.Context, reader io.Reader) (parts []chunkPartRef, partOIDs []git.OID, total uint64, err error) { - max := int64(gbs.maxPartSize) - if max <= 0 { - return nil, nil, 0, fmt.Errorf("gitblobstore: invalid maxPartSize %d", gbs.maxPartSize) - } - - buf := make([]byte, max) - for { - n, rerr := io.ReadFull(reader, buf) - if rerr != nil { - if errors.Is(rerr, io.EOF) { - break - } - if !errors.Is(rerr, io.ErrUnexpectedEOF) { - return nil, nil, 0, rerr - } - // ErrUnexpectedEOF: process final short chunk and stop. - } - if n == 0 { - break - } - partBytes := append([]byte(nil), buf[:n]...) 
- oid, err := gbs.api.HashObject(ctx, bytes.NewReader(partBytes)) - if err != nil { - return nil, nil, 0, err - } - partOIDs = append(partOIDs, oid) - parts = append(parts, chunkPartRef{oidHex: oid.String(), size: uint64(n)}) - total += uint64(n) - if errors.Is(rerr, io.ErrUnexpectedEOF) { - break - } - } - return parts, partOIDs, total, nil -} - func defaultGitBlobstoreIdentity() *git.Identity { // Deterministic fallback identity for environments without git identity configured. return &git.Identity{Name: "dolt gitblobstore", Email: "gitblobstore@dolt.invalid"} @@ -921,134 +1042,6 @@ func newTempIndex() (dir, indexFile string, cleanup func(), err error) { return dir, indexFile, cleanup, nil } -func (gbs *GitBlobstore) refAdvanced(ctx context.Context, old git.OID) bool { - if ctx.Err() != nil { - return false - } - cur, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) - return err == nil && ok && cur != old -} - -func (gbs *GitBlobstore) CheckAndPut(ctx context.Context, expectedVersion, key string, totalSize int64, reader io.Reader) (string, error) { - key, err := normalizeGitTreePath(key) - if err != nil { - return "", err - } - - msg := fmt.Sprintf("gitblobstore: checkandput %s", key) - - // Implement per-key CAS by validating |expectedVersion| against the current key version - // at HEAD, then committing on that HEAD and CAS-updating the ref. If the ref advances, - // retry by re-checking the key version. 
- const maxRetries = 31 // 32 total attempts (initial + retries) - bo := backoff.NewExponentialBackOff() - bo.InitialInterval = 5 * time.Millisecond - bo.Multiplier = 2 - bo.MaxInterval = 320 * time.Millisecond - bo.RandomizationFactor = 0 // deterministic; can add jitter later if needed - bo.Reset() - policy := backoff.WithContext(backoff.WithMaxRetries(bo, maxRetries), ctx) - - var newKeyVersion string - var cachedPlan *putPlan - op := func() error { - parent, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) - if err != nil { - return backoff.Permanent(err) - } - - actualKeyVersion, err := gbs.currentKeyVersion(ctx, parent, ok, key) - if err != nil { - return backoff.Permanent(err) - } - if expectedVersion != actualKeyVersion { - return backoff.Permanent(CheckAndPutError{Key: key, ExpectedVersion: expectedVersion, ActualVersion: actualKeyVersion}) - } - - // Only hash/consume the reader once we know the expectedVersion matches. - // If we need to retry due to unrelated ref advances, reuse the cached plan so we - // don't re-read |reader| (which may not be rewindable). - if cachedPlan == nil { - plan, err := gbs.planPutWrites(ctx, key, totalSize, reader) - if err != nil { - return backoff.Permanent(err) - } - cachedPlan = &plan - } - - newCommit, err := gbs.buildCommitForKeyWrite(ctx, parent, ok, key, *cachedPlan, msg) - if err != nil { - return backoff.Permanent(err) - } - - if !ok { - // Create-only CAS: oldOID=all-zero requires the ref to not exist. - const zeroOID = git.OID("0000000000000000000000000000000000000000") - if err := gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, zeroOID, msg); err != nil { - // If the ref now exists, retry; otherwise surface the error. 
- if gbs.refAdvanced(ctx, parent) { - return err - } - return backoff.Permanent(err) - } - } else { - if err := gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, parent, msg); err != nil { - if gbs.refAdvanced(ctx, parent) { - return err - } - return backoff.Permanent(err) - } - } - - oid, _, err := gbs.api.ResolvePathObject(ctx, newCommit, key) - if err != nil { - return backoff.Permanent(err) - } - newKeyVersion = oid.String() - return nil - } - - if err := backoff.Retry(op, policy); err != nil { - if ctx.Err() != nil { - return "", ctx.Err() - } - return "", err - } - return newKeyVersion, nil -} - -func (gbs *GitBlobstore) currentKeyVersion(ctx context.Context, commit git.OID, haveCommit bool, key string) (string, error) { - if !haveCommit { - // Ref missing => empty store => key missing. - return "", nil - } - oid, _, err := gbs.api.ResolvePathObject(ctx, commit, key) - if err != nil { - if git.IsPathNotFound(err) { - return "", nil - } - return "", err - } - return oid.String(), nil -} - -func (gbs *GitBlobstore) Concatenate(ctx context.Context, key string, sources []string) (string, error) { - // Chunked-object support is landing in phases. Concatenate is the final piece - // needed for NBS conjoin and is intentionally left unimplemented on this branch. - // - // Keep key validation for consistent error behavior. - _, err := normalizeGitTreePath(key) - if err != nil { - return "", err - } - for _, src := range sources { - if _, err := normalizeGitTreePath(src); err != nil { - return "", err - } - } - return "", git.ErrUnimplemented -} - // normalizeGitTreePath normalizes and validates a blobstore key for use as a git tree path. 
// // Rules: From 05b083e63a298bafe50e1c7f148b81a2afe804b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Fri, 6 Feb 2026 08:24:29 -0800 Subject: [PATCH 19/28] /go/store/blobstore: more refactor --- go/store/blobstore/git_blobstore.go | 742 +++++++++++++--------------- 1 file changed, 344 insertions(+), 398 deletions(-) diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index 0a96b7cd9e..e27e13ccdf 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -173,268 +173,6 @@ func (gbs *GitBlobstore) Get(ctx context.Context, key string, br BlobRange) (io. } } -func (gbs *GitBlobstore) Put(ctx context.Context, key string, totalSize int64, reader io.Reader) (string, error) { - key, err := normalizeGitTreePath(key) - if err != nil { - return "", err - } - - // Many NBS/table-file writes are content-addressed: if the key already exists, callers - // assume it refers to the same bytes and treat the operation as idempotent. - // - // The manifest is the main exception (it is mutable and updated via CheckAndPut), so - // we only apply this fast-path for non-manifest keys. - if key != "manifest" { - commit, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) - if err != nil { - return "", err - } - if ok { - oid, _, err := gbs.api.ResolvePathObject(ctx, commit, key) - if err == nil { - // Per-key version: existing object id. - return oid.String(), nil - } - if !git.IsPathNotFound(err) { - return "", err - } - } - } - - msg := fmt.Sprintf("gitblobstore: put %s", key) - - // Hash the contents once. If we need to retry due to concurrent updates to |gbs.ref|, - // we can reuse the resulting object OIDs without re-reading |reader|. - plan, err := gbs.planPutWrites(ctx, key, totalSize, reader) - if err != nil { - return "", err - } - - // Make Put resilient to concurrent writers updating unrelated keys by using a CAS loop - // under the hood. 
This matches typical object-store semantics more closely than an - // unconditional ref update (which could clobber other keys). - const maxRetries = 31 // 32 total attempts (initial + retries) - bo := backoff.NewExponentialBackOff() - bo.InitialInterval = 5 * time.Millisecond - bo.Multiplier = 2 - bo.MaxInterval = 320 * time.Millisecond - bo.RandomizationFactor = 0 // deterministic; can add jitter later if needed - bo.Reset() - policy := backoff.WithContext(backoff.WithMaxRetries(bo, maxRetries), ctx) - - var ver string - op := func() error { - parent, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) - if err != nil { - return backoff.Permanent(err) - } - - newCommit, err := gbs.buildCommitForKeyWrite(ctx, parent, ok, key, plan, msg) - if err != nil { - return backoff.Permanent(err) - } - - if !ok { - // Create-only CAS: oldOID=all-zero requires the ref to not exist. This avoids - // losing concurrent writes when multiple goroutines create the ref at once. - const zeroOID = git.OID("0000000000000000000000000000000000000000") - if err := gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, zeroOID, msg); err != nil { - if gbs.refAdvanced(ctx, parent) { - return err - } - return backoff.Permanent(err) - } - oid, _, err := gbs.api.ResolvePathObject(ctx, newCommit, key) - if err != nil { - return backoff.Permanent(err) - } - ver = oid.String() - return nil - } - - err = gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, parent, msg) - if err == nil { - oid, _, err := gbs.api.ResolvePathObject(ctx, newCommit, key) - if err != nil { - return backoff.Permanent(err) - } - ver = oid.String() - return nil - } - - // If the ref changed since we read |parent|, retry on the new head. Otherwise - // surface the error (e.g. permissions, corruption). 
- if gbs.refAdvanced(ctx, parent) { - return err - } - return backoff.Permanent(err) - } - - if err := backoff.Retry(op, policy); err != nil { - if ctx.Err() != nil { - return "", ctx.Err() - } - return "", err - } - return ver, nil -} - -func (gbs *GitBlobstore) CheckAndPut(ctx context.Context, expectedVersion, key string, totalSize int64, reader io.Reader) (string, error) { - key, err := normalizeGitTreePath(key) - if err != nil { - return "", err - } - - msg := fmt.Sprintf("gitblobstore: checkandput %s", key) - - // Implement per-key CAS by validating |expectedVersion| against the current key version - // at HEAD, then committing on that HEAD and CAS-updating the ref. If the ref advances, - // retry by re-checking the key version. - const maxRetries = 31 // 32 total attempts (initial + retries) - bo := backoff.NewExponentialBackOff() - bo.InitialInterval = 5 * time.Millisecond - bo.Multiplier = 2 - bo.MaxInterval = 320 * time.Millisecond - bo.RandomizationFactor = 0 // deterministic; can add jitter later if needed - bo.Reset() - policy := backoff.WithContext(backoff.WithMaxRetries(bo, maxRetries), ctx) - - var newKeyVersion string - var cachedPlan *putPlan - op := func() error { - parent, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) - if err != nil { - return backoff.Permanent(err) - } - - actualKeyVersion, err := gbs.currentKeyVersion(ctx, parent, ok, key) - if err != nil { - return backoff.Permanent(err) - } - if expectedVersion != actualKeyVersion { - return backoff.Permanent(CheckAndPutError{Key: key, ExpectedVersion: expectedVersion, ActualVersion: actualKeyVersion}) - } - - // Only hash/consume the reader once we know the expectedVersion matches. - // If we need to retry due to unrelated ref advances, reuse the cached plan so we - // don't re-read |reader| (which may not be rewindable). 
- if cachedPlan == nil { - plan, err := gbs.planPutWrites(ctx, key, totalSize, reader) - if err != nil { - return backoff.Permanent(err) - } - cachedPlan = &plan - } - - newCommit, err := gbs.buildCommitForKeyWrite(ctx, parent, ok, key, *cachedPlan, msg) - if err != nil { - return backoff.Permanent(err) - } - - if !ok { - // Create-only CAS: oldOID=all-zero requires the ref to not exist. - const zeroOID = git.OID("0000000000000000000000000000000000000000") - if err := gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, zeroOID, msg); err != nil { - // If the ref now exists, retry; otherwise surface the error. - if gbs.refAdvanced(ctx, parent) { - return err - } - return backoff.Permanent(err) - } - } else { - if err := gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, parent, msg); err != nil { - if gbs.refAdvanced(ctx, parent) { - return err - } - return backoff.Permanent(err) - } - } - - oid, _, err := gbs.api.ResolvePathObject(ctx, newCommit, key) - if err != nil { - return backoff.Permanent(err) - } - newKeyVersion = oid.String() - return nil - } - - if err := backoff.Retry(op, policy); err != nil { - if ctx.Err() != nil { - return "", ctx.Err() - } - return "", err - } - return newKeyVersion, nil -} - -func (gbs *GitBlobstore) Concatenate(ctx context.Context, key string, sources []string) (string, error) { - // Chunked-object support is landing in phases. Concatenate is the final piece - // needed for NBS conjoin and is intentionally left unimplemented on this branch. - // - // Keep key validation for consistent error behavior. - _, err := normalizeGitTreePath(key) - if err != nil { - return "", err - } - for _, src := range sources { - if _, err := normalizeGitTreePath(src); err != nil { - return "", err - } - } - return "", git.ErrUnimplemented -} - -func (gbs *GitBlobstore) currentKeyVersion(ctx context.Context, commit git.OID, haveCommit bool, key string) (string, error) { - if !haveCommit { - // Ref missing => empty store => key missing. 
- return "", nil - } - oid, _, err := gbs.api.ResolvePathObject(ctx, commit, key) - if err != nil { - if git.IsPathNotFound(err) { - return "", nil - } - return "", err - } - return oid.String(), nil -} - -func (gbs *GitBlobstore) resolveCommitForGet(ctx context.Context, key string) (commit git.OID, err error) { - commit, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) - if err != nil { - return git.OID(""), err - } - if ok { - return commit, nil - } - - // If the ref doesn't exist, treat the manifest as missing (empty store), - // but surface a hard error for other keys: the store itself is missing. - if key == "manifest" { - return git.OID(""), NotFound{Key: key} - } - return git.OID(""), &git.RefNotFoundError{Ref: gbs.ref} -} - -func (gbs *GitBlobstore) resolveObjectForGet(ctx context.Context, commit git.OID, key string) (oid git.OID, typ git.ObjectType, err error) { - oid, typ, err = gbs.api.ResolvePathObject(ctx, commit, key) - if err != nil { - if git.IsPathNotFound(err) { - return git.OID(""), git.ObjectTypeUnknown, NotFound{Key: key} - } - return git.OID(""), git.ObjectTypeUnknown, err - } - return oid, typ, nil -} - -func (gbs *GitBlobstore) resolveBlobSizeForGet(ctx context.Context, commit git.OID, oid git.OID) (sz int64, ver string, err error) { - sz, err = gbs.api.BlobSize(ctx, oid) - if err != nil { - return 0, commit.String(), err - } - return sz, commit.String(), nil -} - func (gbs *GitBlobstore) openChunkedTreeRange(ctx context.Context, commit git.OID, key string, br BlobRange) (io.ReadCloser, uint64, string, error) { ver := commit.String() @@ -514,150 +252,68 @@ func (gbs *GitBlobstore) validateAndSizeChunkedParts(ctx context.Context, entrie return parts, total, nil } -func (gbs *GitBlobstore) buildCommitWithMessage(ctx context.Context, parent git.OID, hasParent bool, key string, blobOID git.OID, msg string) (git.OID, error) { - return gbs.buildCommitWithWrites(ctx, parent, hasParent, []treeWrite{{path: key, oid: blobOID}}, msg) +func (gbs 
*GitBlobstore) resolveCommitForGet(ctx context.Context, key string) (commit git.OID, err error) { + commit, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) + if err != nil { + return git.OID(""), err + } + if ok { + return commit, nil + } + + // If the ref doesn't exist, treat the manifest as missing (empty store), + // but surface a hard error for other keys: the store itself is missing. + if key == "manifest" { + return git.OID(""), NotFound{Key: key} + } + return git.OID(""), &git.RefNotFoundError{Ref: gbs.ref} } -func (gbs *GitBlobstore) buildCommitWithWrites(ctx context.Context, parent git.OID, hasParent bool, writes []treeWrite, msg string) (git.OID, error) { - _, indexFile, cleanup, err := newTempIndex() - if err != nil { - return "", err - } - defer cleanup() - - if hasParent { - if err := gbs.api.ReadTree(ctx, parent, indexFile); err != nil { - return "", err - } - } else { - if err := gbs.api.ReadTreeEmpty(ctx, indexFile); err != nil { - return "", err - } - } - - // TODO(gitblobstore): Decide on a policy for file-vs-directory prefix conflicts when staging keys. - // For example, staging "a" when "a/b" already exists in the tree/index (or vice-versa) can fail - // with a git index error (path appears as both a file and directory). Today our NBS keyspace is - // flat (e.g. "manifest", "", ".records"), so this should not occur. If we ever - // namespace keys into directories, consider proactively removing conflicting paths from the index - // before UpdateIndexCacheInfo so Put/CheckAndPut remain robust. 
- sort.Slice(writes, func(i, j int) bool { return writes[i].path < writes[j].path }) - for _, w := range writes { - if err := gbs.api.UpdateIndexCacheInfo(ctx, indexFile, "100644", w.oid, w.path); err != nil { - return "", err - } - } - - treeOID, err := gbs.api.WriteTree(ctx, indexFile) - if err != nil { - return "", err - } - - var parentPtr *git.OID - if hasParent && parent != "" { - p := parent - parentPtr = &p - } - - // Prefer git's default identity from env/config when not explicitly configured. - commitOID, err := gbs.api.CommitTree(ctx, treeOID, parentPtr, msg, gbs.identity) - if err != nil && gbs.identity == nil && isMissingGitIdentityErr(err) { - commitOID, err = gbs.api.CommitTree(ctx, treeOID, parentPtr, msg, defaultGitBlobstoreIdentity()) - } - if err != nil { - return "", err - } - - return commitOID, nil -} - -func (gbs *GitBlobstore) buildCommitForKeyWrite(ctx context.Context, parent git.OID, hasParent bool, key string, plan putPlan, msg string) (git.OID, error) { - _, indexFile, cleanup, err := newTempIndex() - if err != nil { - return "", err - } - defer cleanup() - - if hasParent { - if err := gbs.api.ReadTree(ctx, parent, indexFile); err != nil { - return "", err - } - } else { - if err := gbs.api.ReadTreeEmpty(ctx, indexFile); err != nil { - return "", err - } - } - - if hasParent { - if err := gbs.removeKeyConflictsFromIndex(ctx, parent, indexFile, key, plan.chunked); err != nil { - return "", err - } - } - - sort.Slice(plan.writes, func(i, j int) bool { return plan.writes[i].path < plan.writes[j].path }) - for _, w := range plan.writes { - if err := gbs.api.UpdateIndexCacheInfo(ctx, indexFile, "100644", w.oid, w.path); err != nil { - return "", err - } - } - - treeOID, err := gbs.api.WriteTree(ctx, indexFile) - if err != nil { - return "", err - } - - var parentPtr *git.OID - if hasParent && parent != "" { - p := parent - parentPtr = &p - } - - commitOID, err := gbs.api.CommitTree(ctx, treeOID, parentPtr, msg, gbs.identity) - if err != nil 
&& gbs.identity == nil && isMissingGitIdentityErr(err) { - commitOID, err = gbs.api.CommitTree(ctx, treeOID, parentPtr, msg, defaultGitBlobstoreIdentity()) - } - if err != nil { - return "", err - } - return commitOID, nil -} - -func (gbs *GitBlobstore) removeKeyConflictsFromIndex(ctx context.Context, parent git.OID, indexFile string, key string, newIsChunked bool) error { - oid, typ, err := gbs.api.ResolvePathObject(ctx, parent, key) +func (gbs *GitBlobstore) resolveObjectForGet(ctx context.Context, commit git.OID, key string) (oid git.OID, typ git.ObjectType, err error) { + oid, typ, err = gbs.api.ResolvePathObject(ctx, commit, key) if err != nil { if git.IsPathNotFound(err) { - return nil + return git.OID(""), git.ObjectTypeUnknown, NotFound{Key: key} } - return err + return git.OID(""), git.ObjectTypeUnknown, err } - _ = oid + return oid, typ, nil +} - switch typ { - case git.ObjectTypeBlob: - if newIsChunked { - // blob -> tree: must remove the file entry at - return gbs.api.RemoveIndexPaths(ctx, indexFile, []string{key}) - } - return nil - - case git.ObjectTypeTree: - // tree -> blob OR tree overwrite: remove old child entries under /... 
- entries, err := gbs.api.ListTree(ctx, parent, key) - if err != nil { - return err - } - if len(entries) == 0 { - return nil - } - paths := make([]string, 0, len(entries)) - for _, e := range entries { - paths = append(paths, key+"/"+e.Name) - } - return gbs.api.RemoveIndexPaths(ctx, indexFile, paths) - - default: - return fmt.Errorf("gitblobstore: unsupported existing object type %q at key %q", typ, key) +func (gbs *GitBlobstore) resolveBlobSizeForGet(ctx context.Context, commit git.OID, oid git.OID) (sz int64, ver string, err error) { + sz, err = gbs.api.BlobSize(ctx, oid) + if err != nil { + return 0, commit.String(), err } + return sz, commit.String(), nil +} + +func (gbs *GitBlobstore) Put(ctx context.Context, key string, totalSize int64, reader io.Reader) (string, error) { + key, err := normalizeGitTreePath(key) + if err != nil { + return "", err + } + + // Many NBS/table-file writes are content-addressed: if the key already exists, callers + // assume it refers to the same bytes and treat the operation as idempotent. + // + // The manifest is the main exception (it is mutable and updated via CheckAndPut), so + // we only apply this fast-path for non-manifest keys. + if ver, ok, err := gbs.tryFastSucceedPutIfKeyExists(ctx, key); err != nil { + return "", err + } else if ok { + return ver, nil + } + + msg := fmt.Sprintf("gitblobstore: put %s", key) + + // Hash the contents once. If we need to retry due to concurrent updates to |gbs.ref|, + // we can reuse the resulting object OIDs without re-reading |reader|. 
+ plan, err := gbs.planPutWrites(ctx, key, totalSize, reader) + if err != nil { + return "", err + } + return gbs.putWithCASRetries(ctx, key, plan, msg) } func (gbs *GitBlobstore) planPutWrites(ctx context.Context, key string, totalSize int64, reader io.Reader) (putPlan, error) { @@ -732,6 +388,168 @@ func (gbs *GitBlobstore) hashParts(ctx context.Context, reader io.Reader) (parts return parts, partOIDs, total, nil } +func (gbs *GitBlobstore) putWithCASRetries(ctx context.Context, key string, plan putPlan, msg string) (string, error) { + // Make Put resilient to concurrent writers updating unrelated keys by using a CAS loop + // under the hood. This matches typical object-store semantics more closely than an + // unconditional ref update (which could clobber other keys). + policy := gbs.casRetryPolicy(ctx) + + var ver string + op := func() error { + parent, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) + if err != nil { + return backoff.Permanent(err) + } + + newCommit, err := gbs.buildCommitForKeyWrite(ctx, parent, ok, key, plan, msg) + if err != nil { + return backoff.Permanent(err) + } + + if err := gbs.updateRefCASForWrite(ctx, parent, ok, newCommit, msg); err != nil { + return err + } + + ver, err = gbs.resolveKeyVersionAtCommit(ctx, newCommit, key) + if err != nil { + return backoff.Permanent(err) + } + return nil + } + + if err := backoff.Retry(op, policy); err != nil { + if ctx.Err() != nil { + return "", ctx.Err() + } + return "", err + } + return ver, nil +} + +func (gbs *GitBlobstore) casRetryPolicy(ctx context.Context) backoff.BackOff { + const maxRetries = 31 // 32 total attempts (initial + retries) + bo := backoff.NewExponentialBackOff() + bo.InitialInterval = 5 * time.Millisecond + bo.Multiplier = 2 + bo.MaxInterval = 320 * time.Millisecond + bo.RandomizationFactor = 0 // deterministic; can add jitter later if needed + bo.Reset() + return backoff.WithContext(backoff.WithMaxRetries(bo, maxRetries), ctx) +} + +func (gbs *GitBlobstore) 
buildCommitForKeyWrite(ctx context.Context, parent git.OID, hasParent bool, key string, plan putPlan, msg string) (git.OID, error) { + _, indexFile, cleanup, err := newTempIndex() + if err != nil { + return "", err + } + defer cleanup() + + if hasParent { + if err := gbs.api.ReadTree(ctx, parent, indexFile); err != nil { + return "", err + } + } else { + if err := gbs.api.ReadTreeEmpty(ctx, indexFile); err != nil { + return "", err + } + } + + if hasParent { + if err := gbs.removeKeyConflictsFromIndex(ctx, parent, indexFile, key, plan.chunked); err != nil { + return "", err + } + } + + sort.Slice(plan.writes, func(i, j int) bool { return plan.writes[i].path < plan.writes[j].path }) + for _, w := range plan.writes { + if err := gbs.api.UpdateIndexCacheInfo(ctx, indexFile, "100644", w.oid, w.path); err != nil { + return "", err + } + } + + treeOID, err := gbs.api.WriteTree(ctx, indexFile) + if err != nil { + return "", err + } + + var parentPtr *git.OID + if hasParent && parent != "" { + p := parent + parentPtr = &p + } + + commitOID, err := gbs.api.CommitTree(ctx, treeOID, parentPtr, msg, gbs.identity) + if err != nil && gbs.identity == nil && isMissingGitIdentityErr(err) { + commitOID, err = gbs.api.CommitTree(ctx, treeOID, parentPtr, msg, defaultGitBlobstoreIdentity()) + } + if err != nil { + return "", err + } + return commitOID, nil +} + +func (gbs *GitBlobstore) removeKeyConflictsFromIndex(ctx context.Context, parent git.OID, indexFile string, key string, newIsChunked bool) error { + _, typ, err := gbs.api.ResolvePathObject(ctx, parent, key) + if err != nil { + if git.IsPathNotFound(err) { + return nil + } + return err + } + + switch typ { + case git.ObjectTypeBlob: + if newIsChunked { + // blob -> tree: must remove the file entry at |key| + return gbs.api.RemoveIndexPaths(ctx, indexFile, []string{key}) + } + return nil + + case git.ObjectTypeTree: + // tree -> blob OR tree overwrite: remove old child entries under |key|/...
+ entries, err := gbs.api.ListTree(ctx, parent, key) + if err != nil { + return err + } + if len(entries) == 0 { + return nil + } + paths := make([]string, 0, len(entries)) + for _, e := range entries { + paths = append(paths, key+"/"+e.Name) + } + return gbs.api.RemoveIndexPaths(ctx, indexFile, paths) + + default: + return fmt.Errorf("gitblobstore: unsupported existing object type %q at key %q", typ, key) + } +} + +func (gbs *GitBlobstore) updateRefCASForWrite(ctx context.Context, parent git.OID, haveParent bool, newCommit git.OID, msg string) error { + if !haveParent { + // Create-only CAS: oldOID=all-zero requires the ref to not exist. This avoids + // losing concurrent writes when multiple goroutines create the ref at once. + const zeroOID = git.OID("0000000000000000000000000000000000000000") + if err := gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, zeroOID, msg); err != nil { + if gbs.refAdvanced(ctx, parent) { + return err + } + return backoff.Permanent(err) + } + return nil + } + + if err := gbs.api.UpdateRefCAS(ctx, gbs.ref, newCommit, parent, msg); err != nil { + // If the ref changed since we read |parent|, retry on the new head. Otherwise + // surface the error (e.g. permissions, corruption). 
+ if gbs.refAdvanced(ctx, parent) { + return err + } + return backoff.Permanent(err) + } + return nil +} + func (gbs *GitBlobstore) refAdvanced(ctx context.Context, old git.OID) bool { if ctx.Err() != nil { return false @@ -740,6 +558,134 @@ func (gbs *GitBlobstore) refAdvanced(ctx context.Context, old git.OID) bool { return err == nil && ok && cur != old } +func (gbs *GitBlobstore) resolveKeyVersionAtCommit(ctx context.Context, commit git.OID, key string) (string, error) { + oid, _, err := gbs.api.ResolvePathObject(ctx, commit, key) + if err != nil { + return "", err + } + return oid.String(), nil +} + +func (gbs *GitBlobstore) tryFastSucceedPutIfKeyExists(ctx context.Context, key string) (ver string, ok bool, err error) { + if key == "manifest" { + return "", false, nil + } + + commit, haveCommit, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) + if err != nil { + return "", false, err + } + if !haveCommit { + return "", false, nil + } + + oid, _, err := gbs.api.ResolvePathObject(ctx, commit, key) + if err == nil { + // Per-key version: existing object id. 
+ return oid.String(), true, nil + } + if git.IsPathNotFound(err) { + return "", false, nil + } + return "", false, err +} + +func (gbs *GitBlobstore) CheckAndPut(ctx context.Context, expectedVersion, key string, totalSize int64, reader io.Reader) (string, error) { + key, err := normalizeGitTreePath(key) + if err != nil { + return "", err + } + + msg := fmt.Sprintf("gitblobstore: checkandput %s", key) + + policy := gbs.casRetryPolicy(ctx) + + var newKeyVersion string + var cachedPlan *putPlan + op := func() error { + parent, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) + if err != nil { + return backoff.Permanent(err) + } + + actualKeyVersion, err := gbs.currentKeyVersion(ctx, parent, ok, key) + if err != nil { + return backoff.Permanent(err) + } + if expectedVersion != actualKeyVersion { + return backoff.Permanent(CheckAndPutError{Key: key, ExpectedVersion: expectedVersion, ActualVersion: actualKeyVersion}) + } + + // Only hash/consume the reader once we know the expectedVersion matches. + // If we need to retry due to unrelated ref advances, reuse the cached plan so we + // don't re-read |reader| (which may not be rewindable). 
+ if cachedPlan == nil { + plan, err := gbs.planPutWrites(ctx, key, totalSize, reader) + if err != nil { + return backoff.Permanent(err) + } + cachedPlan = &plan + } + + newCommit, err := gbs.buildCommitForKeyWrite(ctx, parent, ok, key, *cachedPlan, msg) + if err != nil { + return backoff.Permanent(err) + } + + if err := gbs.updateRefCASForWrite(ctx, parent, ok, newCommit, msg); err != nil { + return err + } + + ver, err := gbs.resolveKeyVersionAtCommit(ctx, newCommit, key) + if err != nil { + return backoff.Permanent(err) + } + newKeyVersion = ver + return nil + } + + if err := backoff.Retry(op, policy); err != nil { + if ctx.Err() != nil { + return "", ctx.Err() + } + return "", err + } + + return newKeyVersion, nil +} + +func (gbs *GitBlobstore) currentKeyVersion(ctx context.Context, commit git.OID, haveCommit bool, key string) (string, error) { + if !haveCommit { + // Ref missing => empty store => key missing. + return "", nil + } + oid, _, err := gbs.api.ResolvePathObject(ctx, commit, key) + if err != nil { + if git.IsPathNotFound(err) { + return "", nil + } + return "", err + } + return oid.String(), nil +} + +func (gbs *GitBlobstore) Concatenate(ctx context.Context, key string, sources []string) (string, error) { + // Chunked-object support is landing in phases. Concatenate is the final piece + // needed for NBS conjoin and is intentionally left unimplemented on this branch. + // + // Keep key validation for consistent error behavior. 
+ _, err := normalizeGitTreePath(key) + if err != nil { + return "", err + } + for _, src := range sources { + if _, err := normalizeGitTreePath(src); err != nil { + return "", err + } + } + return "", git.ErrUnimplemented +} + type treeWrite struct { path string oid git.OID From 330ad2a96324c18e8e6411ce88c2b4def0f7783c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Fri, 6 Feb 2026 08:29:06 -0800 Subject: [PATCH 20/28] /go/store/blobstore/git_blobstore.go: more refactor --- go/store/blobstore/git_blobstore.go | 252 ++++++++++++++-------------- 1 file changed, 126 insertions(+), 126 deletions(-) diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index e27e13ccdf..a9a1ee0073 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -46,6 +46,132 @@ type chunkPartSlice struct { length int64 } +type treeWrite struct { + path string + oid git.OID +} + +type putPlan struct { + writes []treeWrite + // If true, the key should be represented as a tree (chunked parts under key/NNNNNNNN). 
+ chunked bool +} + +type limitReadCloser struct { + r io.Reader + c io.Closer +} + +func (l *limitReadCloser) Read(p []byte) (int, error) { return l.r.Read(p) } +func (l *limitReadCloser) Close() error { return l.c.Close() } + +type multiPartReadCloser struct { + ctx context.Context + api git.GitAPI + + slices []chunkPartSlice + curIdx int + + curRC io.ReadCloser + rem int64 +} + +func (m *multiPartReadCloser) Read(p []byte) (int, error) { + for { + if err := m.ensureCurrent(); err != nil { + return 0, err + } + if m.curRC == nil { + return 0, io.EOF + } + + if m.rem == 0 { + _ = m.closeCurrentAndAdvance() + continue + } + + n, err := m.readCurrent(p) + if n > 0 || err != nil { + return n, err + } + } +} + +func (m *multiPartReadCloser) ensureCurrent() error { + if m.curRC != nil { + return nil + } + if m.curIdx >= len(m.slices) { + return nil + } + s := m.slices[m.curIdx] + rc, err := m.openSliceReader(s) + if err != nil { + return err + } + m.curRC = rc + m.rem = s.length + return nil +} + +func (m *multiPartReadCloser) openSliceReader(s chunkPartSlice) (io.ReadCloser, error) { + rc, err := m.api.BlobReader(m.ctx, git.OID(s.oidHex)) + if err != nil { + return nil, err + } + if err := skipN(rc, s.offset); err != nil { + _ = rc.Close() + return nil, err + } + return rc, nil +} + +func (m *multiPartReadCloser) closeCurrentAndAdvance() error { + if m.curRC != nil { + err := m.curRC.Close() + m.curRC = nil + m.rem = 0 + m.curIdx++ + return err + } + m.curIdx++ + return nil +} + +func (m *multiPartReadCloser) readCurrent(p []byte) (int, error) { + toRead := len(p) + if int64(toRead) > m.rem { + toRead = int(m.rem) + } + + n, err := m.curRC.Read(p[:toRead]) + if n > 0 { + m.rem -= int64(n) + return n, nil + } + if err == nil { + return 0, nil + } + if errors.Is(err, io.EOF) { + // End of underlying part blob; if we still expected bytes, that's corruption. 
+ if m.rem > 0 { + return 0, io.ErrUnexpectedEOF + } + _ = m.closeCurrentAndAdvance() + return 0, nil + } + return 0, err +} + +func (m *multiPartReadCloser) Close() error { + if m.curRC != nil { + err := m.curRC.Close() + m.curRC = nil + return err + } + return nil +} + // GitBlobstore is a Blobstore implementation backed by a git repository's object // database (bare repo or .git directory). It stores keys as paths within the tree // of the commit referenced by a git ref (e.g. refs/dolt/data). @@ -686,25 +812,6 @@ func (gbs *GitBlobstore) Concatenate(ctx context.Context, key string, sources [] return "", git.ErrUnimplemented } -type treeWrite struct { - path string - oid git.OID -} - -type putPlan struct { - writes []treeWrite - // If true, the key should be represented as a tree (chunked parts under key/NNNNNNNN). - chunked bool -} - -type limitReadCloser struct { - r io.Reader - c io.Closer -} - -func (l *limitReadCloser) Read(p []byte) (int, error) { return l.r.Read(p) } -func (l *limitReadCloser) Close() error { return l.c.Close() } - func sliceInlineBlob(rc io.ReadCloser, sz int64, br BlobRange, ver string) (io.ReadCloser, uint64, string, error) { // Implement BlobRange by slicing the streamed blob contents. 
// TODO(gitblobstore): This streaming implementation is correct but may be slow for workloads @@ -743,113 +850,6 @@ func sliceInlineBlob(rc io.ReadCloser, sz int64, br BlobRange, ver string) (io.R return &limitReadCloser{r: io.LimitReader(rc, pos.length), c: rc}, uint64(sz), ver, nil } -type multiPartReadCloser struct { - ctx context.Context - api git.GitAPI - - slices []chunkPartSlice - curIdx int - - curRC io.ReadCloser - rem int64 -} - -func (m *multiPartReadCloser) Read(p []byte) (int, error) { - for { - if err := m.ensureCurrent(); err != nil { - return 0, err - } - if m.curRC == nil { - return 0, io.EOF - } - - if m.rem == 0 { - _ = m.closeCurrentAndAdvance() - continue - } - - n, err := m.readCurrent(p) - if n > 0 || err != nil { - return n, err - } - } -} - -func (m *multiPartReadCloser) ensureCurrent() error { - if m.curRC != nil { - return nil - } - if m.curIdx >= len(m.slices) { - return nil - } - s := m.slices[m.curIdx] - rc, err := m.openSliceReader(s) - if err != nil { - return err - } - m.curRC = rc - m.rem = s.length - return nil -} - -func (m *multiPartReadCloser) openSliceReader(s chunkPartSlice) (io.ReadCloser, error) { - rc, err := m.api.BlobReader(m.ctx, git.OID(s.oidHex)) - if err != nil { - return nil, err - } - if err := skipN(rc, s.offset); err != nil { - _ = rc.Close() - return nil, err - } - return rc, nil -} - -func (m *multiPartReadCloser) closeCurrentAndAdvance() error { - if m.curRC != nil { - err := m.curRC.Close() - m.curRC = nil - m.rem = 0 - m.curIdx++ - return err - } - m.curIdx++ - return nil -} - -func (m *multiPartReadCloser) readCurrent(p []byte) (int, error) { - toRead := len(p) - if int64(toRead) > m.rem { - toRead = int(m.rem) - } - - n, err := m.curRC.Read(p[:toRead]) - if n > 0 { - m.rem -= int64(n) - return n, nil - } - if err == nil { - return 0, nil - } - if errors.Is(err, io.EOF) { - // End of underlying part blob; if we still expected bytes, that's corruption. 
- if m.rem > 0 { - return 0, io.ErrUnexpectedEOF - } - _ = m.closeCurrentAndAdvance() - return 0, nil - } - return 0, err -} - -func (m *multiPartReadCloser) Close() error { - if m.curRC != nil { - err := m.curRC.Close() - m.curRC = nil - return err - } - return nil -} - func skipN(r io.Reader, n int64) error { if n <= 0 { return nil From 7f6f4709ab4cc9f869aec3250038a60e16f18ea9 Mon Sep 17 00:00:00 2001 From: Dustin Brown Date: Fri, 6 Feb 2026 09:03:26 -0800 Subject: [PATCH 21/28] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- go/store/blobstore/git_blobstore.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index a9a1ee0073..478c9c5867 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -86,7 +86,9 @@ func (m *multiPartReadCloser) Read(p []byte) (int, error) { } if m.rem == 0 { - _ = m.closeCurrentAndAdvance() + if err := m.closeCurrentAndAdvance(); err != nil { + return 0, err + } continue } From 5419d5279c4cd94fbefa275abd0b7b1acf0d0719 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Fri, 6 Feb 2026 09:08:18 -0800 Subject: [PATCH 22/28] /go/store/blobstore: update comments --- go/store/blobstore/blobstore.go | 7 ++++++- go/store/blobstore/git_blobstore.go | 7 +++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/go/store/blobstore/blobstore.go b/go/store/blobstore/blobstore.go index 2ff5a97259..cfcfc0281c 100644 --- a/go/store/blobstore/blobstore.go +++ b/go/store/blobstore/blobstore.go @@ -33,7 +33,12 @@ type Blobstore interface { // Get returns a byte range of from the blob keyed by |key|, and the latest store version. 
Get(ctx context.Context, key string, br BlobRange) (rc io.ReadCloser, size uint64, version string, err error) - // Put creates a new blob from |reader| keyed by |key|, it returns the latest store version. + // Put stores a blob from |reader| keyed by |key|, returning the latest store version. + // + // If |key| already exists, behavior is implementation-defined: some Blobstore + // implementations overwrite, while others may treat Put as idempotent and fast-succeed + // without consuming |reader|. Callers that require an explicit check-and-set should use + // CheckAndPut. Put(ctx context.Context, key string, totalSize int64, reader io.Reader) (version string, err error) // CheckAndPut updates the blob keyed by |key| using a check-and-set on |expectedVersion|. diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index 478c9c5867..e688635863 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -425,8 +425,11 @@ func (gbs *GitBlobstore) Put(ctx context.Context, key string, totalSize int64, r // Many NBS/table-file writes are content-addressed: if the key already exists, callers // assume it refers to the same bytes and treat the operation as idempotent. // - // The manifest is the main exception (it is mutable and updated via CheckAndPut), so - // we only apply this fast-path for non-manifest keys. + // GitBlobstore enforces that assumption by fast-succeeding when a non-manifest key + // already exists: it returns the existing per-key version and does not overwrite the + // key (and does not consume |reader|). + // + // The manifest is the main exception (it is mutable and updated via CheckAndPut). 
if ver, ok, err := gbs.tryFastSucceedPutIfKeyExists(ctx, key); err != nil { return "", err } else if ok { From b7bb09dffef03ce956fe12597100030614b507e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Mon, 9 Feb 2026 12:45:32 -0800 Subject: [PATCH 23/28] /go/store/nbs/git_blobstore_read_smoke_test.go: fix windows test --- go/store/nbs/git_blobstore_read_smoke_test.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/go/store/nbs/git_blobstore_read_smoke_test.go b/go/store/nbs/git_blobstore_read_smoke_test.go index 3570457166..8be83c3e0c 100644 --- a/go/store/nbs/git_blobstore_read_smoke_test.go +++ b/go/store/nbs/git_blobstore_read_smoke_test.go @@ -93,8 +93,12 @@ func TestGitBlobstoreReadSmoke_ManifestAndTableAccessPatterns(t *testing.T) { require.Equal(t, table[len(table)-tailN:], tail) // Per-key version should be stable across reads. - _, _, ver2, err := bs.Get(ctx, "table", blobstore.AllRange) + rc2, _, ver2, err := bs.Get(ctx, "table", blobstore.AllRange) require.NoError(t, err) + // Drain before close to avoid broken-pipe errors from killing git early. + _, err = io.Copy(io.Discard, rc2) + require.NoError(t, err) + require.NoError(t, rc2.Close()) require.Equal(t, ver, ver2) // 3) ReadAt-style ranged reads used by table readers. 
From 32eca34587f311b74076490299277b72b1c2fe74 Mon Sep 17 00:00:00 2001 From: Aaron Son Date: Mon, 9 Feb 2026 13:01:07 -0800 Subject: [PATCH 24/28] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../concurrent_writes_test.go | 26 ++++--------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/integration-tests/go-sql-server-driver/concurrent_writes_test.go b/integration-tests/go-sql-server-driver/concurrent_writes_test.go index 733635cc08..6c4b340391 100644 --- a/integration-tests/go-sql-server-driver/concurrent_writes_test.go +++ b/integration-tests/go-sql-server-driver/concurrent_writes_test.go @@ -26,7 +26,7 @@ import ( driver "github.com/dolthub/dolt/go/libraries/doltcore/dtestutils/sql_server_driver" ) -// The txLocks in +// TestConcurrentWrites verifies concurrent write behavior and transaction locking in the SQL server driver. func TestConcurrentWrites(t *testing.T) { t.Parallel() var ports DynamicResources @@ -125,27 +125,11 @@ func TestConcurrentWrites(t *testing.T) { defer func () { require.NoError(t, conn.Close()) }() - rows, err := conn.QueryContext(ctx, "SELECT COUNT(*) FROM data") - if err != nil { - require.NoError(t, err) - } var i int - for rows.Next() { - err = rows.Scan(&i) - require.NoError(t, err) - } - require.NoError(t, rows.Err()) - require.NoError(t, rows.Close()) + err = conn.QueryRowContext(ctx, "SELECT COUNT(*) FROM data").Scan(&i) + require.NoError(t, err) t.Logf("read %d", i) - rows, err = conn.QueryContext(ctx, "SELECT COUNT(*) FROM dolt_log") - if err != nil { - require.NoError(t, err) - } - for rows.Next() { - err = rows.Scan(&i) - require.NoError(t, err) - } - require.NoError(t, rows.Err()) - require.NoError(t, rows.Close()) + err = conn.QueryRowContext(ctx, "SELECT COUNT(*) FROM dolt_log").Scan(&i) + require.NoError(t, err) t.Logf("created %d commits", i) } From f21a6bcd4dc73b393ab76457239fd7d95d80914a Mon Sep 17 00:00:00 2001 From: Aaron Son Date: Mon, 
9 Feb 2026 13:03:57 -0800 Subject: [PATCH 25/28] go: sqle/dsess: More PR feedback. --- go/libraries/doltcore/sqle/dsess/transactions.go | 6 +++--- .../go-sql-server-driver/concurrent_writes_test.go | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/go/libraries/doltcore/sqle/dsess/transactions.go b/go/libraries/doltcore/sqle/dsess/transactions.go index f9f40cae04..d5b3a0cf06 100644 --- a/go/libraries/doltcore/sqle/dsess/transactions.go +++ b/go/libraries/doltcore/sqle/dsess/transactions.go @@ -386,10 +386,11 @@ func (tx *DoltTransaction) doCommit( if !ok { return nil, nil, fmt.Errorf("database %s unknown to transaction, this is a bug", dbName) } + normalizedDbName := strings.ToLower(branchState.dbState.dbName) // Load the start state for this working set from the noms root at tx start // Get the base DB name from the db state, not the branch state - startPoint, ok := tx.dbStartPoints[strings.ToLower(branchState.dbState.dbName)] + startPoint, ok := tx.dbStartPoints[normalizedDbName] if !ok { return nil, nil, fmt.Errorf("database %s unknown to transaction, this is a bug", dbName) } @@ -403,7 +404,7 @@ func (tx *DoltTransaction) doCommit( mergeOpts := branchState.EditOpts() - lockID := dbName + "\u0000" + workingSet.Ref().String() + lockID := normalizedDbName + "\u0000" + workingSet.Ref().String() for i := 0; i < maxTxCommitRetries; i++ { updatedWs, newCommit, err := func() (*doltdb.WorkingSet, *doltdb.Commit, error) { @@ -501,7 +502,6 @@ func (tx *DoltTransaction) mergeRoots( workingSet *doltdb.WorkingSet, mergeOpts editor.Options, ) (*doltdb.WorkingSet, error) { - tableResolver, err := GetTableResolver(ctx, dbName) if err != nil { return nil, err diff --git a/integration-tests/go-sql-server-driver/concurrent_writes_test.go b/integration-tests/go-sql-server-driver/concurrent_writes_test.go index 6c4b340391..da6c203bfb 100644 --- a/integration-tests/go-sql-server-driver/concurrent_writes_test.go +++ 
b/integration-tests/go-sql-server-driver/concurrent_writes_test.go @@ -85,6 +85,7 @@ func TestConcurrentWrites(t *testing.T) { } db, err := server.DB(driver.Connection{User: "root"}) require.NoError(t, err) + defer db.Close() db.SetMaxOpenConns(1) conn, err := db.Conn(ctx) if err != nil { @@ -119,9 +120,7 @@ func TestConcurrentWrites(t *testing.T) { t.Logf("wrote %d", nextInt) ctx = t.Context() conn, err := db.Conn(ctx) - if err != nil { - require.NoError(t, err) - } + require.NoError(t, err) defer func () { require.NoError(t, conn.Close()) }() From 86226f720d9fd506796ad8a6395385f621e605e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Mon, 9 Feb 2026 13:43:50 -0800 Subject: [PATCH 26/28] /go/store/blobstore: implement concatenate --- go/store/blobstore/git_blobstore.go | 218 +++++++++++++++++- .../blobstore/git_blobstore_helpers_test.go | 182 ++++++++++++++- go/store/blobstore/git_blobstore_test.go | 135 +++++++++++ 3 files changed, 528 insertions(+), 7 deletions(-) diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index e688635863..08bcb6e9d4 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -801,20 +801,226 @@ func (gbs *GitBlobstore) currentKeyVersion(ctx context.Context, commit git.OID, } func (gbs *GitBlobstore) Concatenate(ctx context.Context, key string, sources []string) (string, error) { - // Chunked-object support is landing in phases. Concatenate is the final piece - // needed for NBS conjoin and is intentionally left unimplemented on this branch. - // // Keep key validation for consistent error behavior. 
- _, err := normalizeGitTreePath(key) + var err error + key, err = normalizeGitTreePath(key) if err != nil { return "", err } + if len(sources) == 0 { + return "", fmt.Errorf("gitblobstore: concatenate requires at least one source") + } + normSources := make([]string, 0, len(sources)) for _, src := range sources { - if _, err := normalizeGitTreePath(src); err != nil { + norm, err := normalizeGitTreePath(src) + if err != nil { return "", err } + normSources = append(normSources, norm) } - return "", git.ErrUnimplemented + sources = normSources + + // For non-manifest keys, match Put's behavior: if the key already exists, succeed without overwriting. + if ver, ok, err := gbs.tryFastSucceedPutIfKeyExists(ctx, key); err != nil { + return "", err + } else if ok { + return ver, nil + } + + // Resolve a snapshot commit for the sources. + commit, ok, err := gbs.api.TryResolveRefCommit(ctx, gbs.ref) + if err != nil { + return "", err + } + if !ok { + // Consistent with Get: empty store => manifest missing, other keys => ref missing. 
+ if key == "manifest" { + return "", NotFound{Key: key} + } + return "", &git.RefNotFoundError{Ref: gbs.ref} + } + + totalSz, err := gbs.totalSizeAtCommit(ctx, commit, sources) + if err != nil { + return "", err + } + + rc := &concatReadCloser{ + ctx: ctx, + keys: sources, + open: func(ctx context.Context, k string) (io.ReadCloser, error) { + return gbs.openReaderAtCommit(ctx, commit, k) + }, + } + defer rc.Close() + + plan, err := gbs.planPutWrites(ctx, key, totalSz, rc) + if err != nil { + return "", err + } + + msg := fmt.Sprintf("gitblobstore: concatenate %s", key) + return gbs.putWithCASRetries(ctx, key, plan, msg) +} + +func (gbs *GitBlobstore) openReaderAtCommit(ctx context.Context, commit git.OID, key string) (io.ReadCloser, error) { + oid, typ, err := gbs.resolveObjectForGet(ctx, commit, key) + if err != nil { + return nil, err + } + switch typ { + case git.ObjectTypeBlob: + return gbs.api.BlobReader(ctx, oid) + case git.ObjectTypeTree: + rc, _, _, err := gbs.openChunkedTreeRange(ctx, commit, key, AllRange) + if err != nil { + // Defensive: resolveObjectForGet succeeded, but keep NotFound mapping consistent. + var pnf *git.PathNotFoundError + if errors.As(err, &pnf) { + return nil, NotFound{Key: key} + } + return nil, err + } + return rc, nil + default: + return nil, fmt.Errorf("gitblobstore: unsupported object type %q for key %q", typ, key) + } +} + +// sizeAtCommit returns the byte size of |key| as of |commit|. +// It supports both inline blobs and the chunked-tree representation used by GitBlobstore. +// If |key| is missing at |commit|, it returns NotFound{Key: key}. 
+func (gbs *GitBlobstore) sizeAtCommit(ctx context.Context, commit git.OID, key string) (uint64, error) { + oid, typ, err := gbs.api.ResolvePathObject(ctx, commit, key) + if err != nil { + if git.IsPathNotFound(err) { + return 0, NotFound{Key: key} + } + return 0, err + } + + switch typ { + case git.ObjectTypeBlob: + sz, err := gbs.api.BlobSize(ctx, oid) + if err != nil { + return 0, err + } + if sz < 0 { + return 0, fmt.Errorf("gitblobstore: invalid blob size %d for key %q", sz, key) + } + return uint64(sz), nil + + case git.ObjectTypeTree: + entries, err := gbs.api.ListTree(ctx, commit, key) + if err != nil { + if git.IsPathNotFound(err) { + return 0, NotFound{Key: key} + } + return 0, err + } + _, total, err := gbs.validateAndSizeChunkedParts(ctx, entries) + return total, err + + default: + return 0, fmt.Errorf("gitblobstore: unsupported object type %q for key %q", typ, key) + } +} + +// totalSizeAtCommit sums the sizes of |sources| at |commit| and returns the total as int64. +// Returns an error on overflow or if any source is missing. 
+func (gbs *GitBlobstore) totalSizeAtCommit(ctx context.Context, commit git.OID, sources []string) (int64, error) { + var total uint64 + for _, src := range sources { + sz, err := gbs.sizeAtCommit(ctx, commit, src) + if err != nil { + return 0, err + } + if sz > math.MaxUint64-total { + return 0, fmt.Errorf("gitblobstore: concatenated size overflow") + } + total += sz + } + if total > uint64(math.MaxInt64) { + return 0, fmt.Errorf("gitblobstore: concatenated size %d overflows int64", total) + } + return int64(total), nil +} + +type concatReadCloser struct { + ctx context.Context + keys []string + open func(ctx context.Context, key string) (io.ReadCloser, error) + cur int + curRC io.ReadCloser + done bool +} + +func (c *concatReadCloser) ensureCurrent() error { + if c.done || c.curRC != nil { + return nil + } + if c.cur >= len(c.keys) { + c.done = true + return nil + } + rc, err := c.open(c.ctx, c.keys[c.cur]) + if err != nil { + return err + } + c.curRC = rc + return nil +} + +func (c *concatReadCloser) closeCurrentAndAdvance() error { + if c.curRC != nil { + err := c.curRC.Close() + c.curRC = nil + c.cur++ + return err + } + c.cur++ + return nil +} + +func (c *concatReadCloser) Read(p []byte) (int, error) { + for { + if err := c.ensureCurrent(); err != nil { + return 0, err + } + if c.curRC == nil { + return 0, io.EOF + } + + n, err := c.curRC.Read(p) + if n > 0 { + // Preserve data; defer advancement until next Read call. 
+ if err == io.EOF { + _ = c.closeCurrentAndAdvance() + return n, nil + } + return n, err + } + if err == nil { + continue + } + if err == io.EOF { + if cerr := c.closeCurrentAndAdvance(); cerr != nil { + return 0, cerr + } + continue + } + return 0, err + } +} + +func (c *concatReadCloser) Close() error { + c.done = true + if c.curRC != nil { + err := c.curRC.Close() + c.curRC = nil + return err + } + return nil } func sliceInlineBlob(rc io.ReadCloser, sz int64, br BlobRange, ver string) (io.ReadCloser, uint64, string, error) { diff --git a/go/store/blobstore/git_blobstore_helpers_test.go b/go/store/blobstore/git_blobstore_helpers_test.go index 89be72406e..8080f4e838 100644 --- a/go/store/blobstore/git_blobstore_helpers_test.go +++ b/go/store/blobstore/git_blobstore_helpers_test.go @@ -15,6 +15,7 @@ package blobstore import ( + "bytes" "context" "errors" "io" @@ -29,6 +30,7 @@ type fakeGitAPI struct { tryResolveRefCommit func(ctx context.Context, ref string) (git.OID, bool, error) resolvePathBlob func(ctx context.Context, commit git.OID, path string) (git.OID, error) resolvePathObject func(ctx context.Context, commit git.OID, path string) (git.OID, git.ObjectType, error) + listTree func(ctx context.Context, commit git.OID, treePath string) ([]git.TreeEntry, error) blobSize func(ctx context.Context, oid git.OID) (int64, error) blobReader func(ctx context.Context, oid git.OID) (io.ReadCloser, error) } @@ -46,7 +48,7 @@ func (f fakeGitAPI) ResolvePathObject(ctx context.Context, commit git.OID, path return f.resolvePathObject(ctx, commit, path) } func (f fakeGitAPI) ListTree(ctx context.Context, commit git.OID, treePath string) ([]git.TreeEntry, error) { - panic("unexpected call") + return f.listTree(ctx, commit, treePath) } func (f fakeGitAPI) CatFileType(ctx context.Context, oid git.OID) (string, error) { panic("unexpected call") @@ -230,3 +232,181 @@ func TestGitBlobstoreHelpers_validateAndSizeChunkedParts(t *testing.T) { _, _, err = 
gbs.validateAndSizeChunkedParts(ctx, []git.TreeEntry{{Name: "1", Type: git.ObjectTypeBlob, OID: "0123456789abcdef0123456789abcdef01234567"}}) require.Error(t, err) } + +func TestGitBlobstoreHelpers_sizeAtCommit(t *testing.T) { + ctx := context.Background() + commit := git.OID("0123456789abcdef0123456789abcdef01234567") + + t.Run("blob", func(t *testing.T) { + api := fakeGitAPI{ + resolvePathObject: func(ctx context.Context, gotCommit git.OID, path string) (git.OID, git.ObjectType, error) { + require.Equal(t, commit, gotCommit) + require.Equal(t, "k", path) + return git.OID("89abcdef0123456789abcdef0123456789abcdef"), git.ObjectTypeBlob, nil + }, + blobSize: func(ctx context.Context, gotOID git.OID) (int64, error) { + require.Equal(t, git.OID("89abcdef0123456789abcdef0123456789abcdef"), gotOID) + return 123, nil + }, + } + gbs := &GitBlobstore{api: api} + sz, err := gbs.sizeAtCommit(ctx, commit, "k") + require.NoError(t, err) + require.Equal(t, uint64(123), sz) + }) + + t.Run("chunkedTree", func(t *testing.T) { + api := fakeGitAPI{ + resolvePathObject: func(ctx context.Context, gotCommit git.OID, path string) (git.OID, git.ObjectType, error) { + require.Equal(t, commit, gotCommit) + require.Equal(t, "k", path) + return git.OID("treeoid"), git.ObjectTypeTree, nil + }, + listTree: func(ctx context.Context, gotCommit git.OID, treePath string) ([]git.TreeEntry, error) { + require.Equal(t, commit, gotCommit) + require.Equal(t, "k", treePath) + return []git.TreeEntry{ + {Name: "0001", Type: git.ObjectTypeBlob, OID: "0123456789abcdef0123456789abcdef01234567"}, + {Name: "0002", Type: git.ObjectTypeBlob, OID: "89abcdef0123456789abcdef0123456789abcdef"}, + }, nil + }, + blobSize: func(ctx context.Context, oid git.OID) (int64, error) { + switch oid { + case "0123456789abcdef0123456789abcdef01234567": + return 3, nil + case "89abcdef0123456789abcdef0123456789abcdef": + return 5, nil + default: + return 0, errors.New("unexpected oid") + } + }, + } + gbs := &GitBlobstore{api: 
api} + sz, err := gbs.sizeAtCommit(ctx, commit, "k") + require.NoError(t, err) + require.Equal(t, uint64(8), sz) + }) + + t.Run("notFound", func(t *testing.T) { + api := fakeGitAPI{ + resolvePathObject: func(ctx context.Context, gotCommit git.OID, path string) (git.OID, git.ObjectType, error) { + return git.OID(""), git.ObjectTypeUnknown, &git.PathNotFoundError{Commit: gotCommit.String(), Path: path} + }, + } + gbs := &GitBlobstore{api: api} + _, err := gbs.sizeAtCommit(ctx, commit, "missing") + var nf NotFound + require.ErrorAs(t, err, &nf) + require.Equal(t, "missing", nf.Key) + }) +} + +func TestGitBlobstoreHelpers_totalSizeAtCommit_overflowInt64(t *testing.T) { + ctx := context.Background() + commit := git.OID("0123456789abcdef0123456789abcdef01234567") + + api := fakeGitAPI{ + resolvePathObject: func(ctx context.Context, gotCommit git.OID, path string) (git.OID, git.ObjectType, error) { + return git.OID(path + "_oid"), git.ObjectTypeBlob, nil + }, + blobSize: func(ctx context.Context, gotOID git.OID) (int64, error) { + // Make the total exceed int64 max with two sources. 
+ if gotOID == "a_oid" { + return int64(^uint64(0) >> 1), nil // math.MaxInt64 without importing math + } + return 1, nil + }, + } + gbs := &GitBlobstore{api: api} + _, err := gbs.totalSizeAtCommit(ctx, commit, []string{"a", "b"}) + require.Error(t, err) +} + +func TestConcatReadCloser(t *testing.T) { + ctx := context.Background() + closed := map[string]int{} + opened := map[string]int{} + + mk := func(s string) io.ReadCloser { + r := bytes.NewReader([]byte(s)) + return &trackedReadCloser{ + r: r, + onClose: func() { + closed[s]++ + }, + } + } + + crc := &concatReadCloser{ + ctx: ctx, + keys: []string{"a", "b"}, + open: func(ctx context.Context, key string) (io.ReadCloser, error) { + opened[key]++ + if key == "a" { + return mk("hi"), nil + } + return mk("there"), nil + }, + } + + out, err := io.ReadAll(crc) + require.NoError(t, err) + require.Equal(t, "hithere", string(out)) + require.NoError(t, crc.Close()) + require.Equal(t, 1, opened["a"]) + require.Equal(t, 1, opened["b"]) + require.Equal(t, 1, closed["hi"]) + require.Equal(t, 1, closed["there"]) +} + +func TestConcatReadCloser_CloseEarlyClosesCurrent(t *testing.T) { + ctx := context.Background() + closed := map[string]int{} + opened := map[string]int{} + + mk := func(id string, s string) io.ReadCloser { + r := bytes.NewReader([]byte(s)) + return &trackedReadCloser{ + r: r, + onClose: func() { + closed[id]++ + }, + } + } + + crc := &concatReadCloser{ + ctx: ctx, + keys: []string{"a", "b"}, + open: func(ctx context.Context, key string) (io.ReadCloser, error) { + opened[key]++ + if key == "a" { + return mk("a", "hello"), nil + } + return mk("b", "world"), nil + }, + } + + buf := make([]byte, 1) + n, err := crc.Read(buf) + require.Equal(t, 1, n) + require.NoError(t, err) + + require.NoError(t, crc.Close()) + require.Equal(t, 1, opened["a"]) + require.Equal(t, 0, opened["b"], "expected not to open second reader when closing early") + require.Equal(t, 1, closed["a"]) + require.Equal(t, 0, closed["b"]) +} + +type 
trackedReadCloser struct { + r io.Reader + onClose func() +} + +func (t *trackedReadCloser) Read(p []byte) (int, error) { return t.r.Read(p) } +func (t *trackedReadCloser) Close() error { + if t.onClose != nil { + t.onClose() + } + return nil +} diff --git a/go/store/blobstore/git_blobstore_test.go b/go/store/blobstore/git_blobstore_test.go index 1d41173eff..18b7156ec5 100644 --- a/go/store/blobstore/git_blobstore_test.go +++ b/go/store/blobstore/git_blobstore_test.go @@ -255,6 +255,141 @@ func TestGitBlobstore_Put_RoundTripAndVersion(t *testing.T) { require.Equal(t, want, got) } +func TestGitBlobstore_Concatenate_Basic(t *testing.T) { + requireGitOnPath(t) + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + bs, err := NewGitBlobstoreWithIdentity(repo.GitDir, DoltDataRef, testIdentity()) + require.NoError(t, err) + + _, err = PutBytes(ctx, bs, "a", []byte("hi ")) + require.NoError(t, err) + _, err = PutBytes(ctx, bs, "b", []byte("there")) + require.NoError(t, err) + + ver, err := bs.Concatenate(ctx, "c", []string{"a", "b"}) + require.NoError(t, err) + require.NotEmpty(t, ver) + + got, ver2, err := GetBytes(ctx, bs, "c", AllRange) + require.NoError(t, err) + require.Equal(t, ver, ver2) + require.Equal(t, []byte("hi there"), got) +} + +func TestGitBlobstore_Concatenate_ChunkedResult(t *testing.T) { + requireGitOnPath(t) + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + bs, err := NewGitBlobstoreWithOptions(repo.GitDir, DoltDataRef, GitBlobstoreOptions{ + Identity: testIdentity(), + MaxPartSize: 1024, + }) + require.NoError(t, err) + + a := bytes.Repeat([]byte("a"), 700) + b := bytes.Repeat([]byte("b"), 700) + want := append(append([]byte(nil), a...), b...) 
+ + _, err = PutBytes(ctx, bs, "a", a) + require.NoError(t, err) + _, err = PutBytes(ctx, bs, "b", b) + require.NoError(t, err) + + ver, err := bs.Concatenate(ctx, "c", []string{"a", "b"}) + require.NoError(t, err) + require.NotEmpty(t, ver) + + // Verify the resulting key is stored as a chunked tree (not a single blob). + head, ok, err := bs.api.TryResolveRefCommit(ctx, DoltDataRef) + require.NoError(t, err) + require.True(t, ok) + oid, typ, err := bs.api.ResolvePathObject(ctx, head, "c") + require.NoError(t, err) + require.Equal(t, git.ObjectTypeTree, typ) + require.Equal(t, oid.String(), ver) + + parts, err := bs.api.ListTree(ctx, head, "c") + require.NoError(t, err) + require.GreaterOrEqual(t, len(parts), 2) + require.Equal(t, "00000001", parts[0].Name) + + got, ver2, err := GetBytes(ctx, bs, "c", AllRange) + require.NoError(t, err) + require.Equal(t, ver, ver2) + require.Equal(t, want, got) +} + +func TestGitBlobstore_Concatenate_KeyExistsFastSucceeds(t *testing.T) { + requireGitOnPath(t) + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + bs, err := NewGitBlobstoreWithIdentity(repo.GitDir, DoltDataRef, testIdentity()) + require.NoError(t, err) + + ver1, err := PutBytes(ctx, bs, "c", []byte("original")) + require.NoError(t, err) + require.NotEmpty(t, ver1) + + _, err = PutBytes(ctx, bs, "a", []byte("new ")) + require.NoError(t, err) + _, err = PutBytes(ctx, bs, "b", []byte("value")) + require.NoError(t, err) + + ver2, err := bs.Concatenate(ctx, "c", []string{"a", "b"}) + require.NoError(t, err) + require.Equal(t, ver1, ver2, "expected concatenate to fast-succeed without overwriting existing key") + + got, ver3, err := GetBytes(ctx, bs, "c", AllRange) + require.NoError(t, err) + require.Equal(t, ver1, ver3) + require.Equal(t, []byte("original"), got) +} + +func TestGitBlobstore_Concatenate_MissingSourceIsNotFound(t *testing.T) { + requireGitOnPath(t) + + ctx := context.Background() + repo, 
err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + bs, err := NewGitBlobstoreWithIdentity(repo.GitDir, DoltDataRef, testIdentity()) + require.NoError(t, err) + + _, err = PutBytes(ctx, bs, "present", []byte("x")) + require.NoError(t, err) + + _, err = bs.Concatenate(ctx, "c", []string{"present", "missing"}) + require.Error(t, err) + require.True(t, IsNotFoundError(err)) + var nf NotFound + require.ErrorAs(t, err, &nf) + require.Equal(t, "missing", nf.Key) +} + +func TestGitBlobstore_Concatenate_EmptySourcesErrors(t *testing.T) { + requireGitOnPath(t) + + ctx := context.Background() + repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git") + require.NoError(t, err) + + bs, err := NewGitBlobstoreWithIdentity(repo.GitDir, DoltDataRef, testIdentity()) + require.NoError(t, err) + + _, err = bs.Concatenate(ctx, "c", nil) + require.Error(t, err) +} + type putShouldNotRead struct{} func (putShouldNotRead) Read(_ []byte) (int, error) { From a2139d16a356d54a4759f50208c8f61466c81f7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?coffeegoddd=E2=98=95=EF=B8=8F=E2=9C=A8?= Date: Mon, 9 Feb 2026 13:52:21 -0800 Subject: [PATCH 27/28] /go/store/blobstore/git_blobstore.go: refactor --- go/store/blobstore/git_blobstore.go | 154 ++++++++++++++-------------- 1 file changed, 77 insertions(+), 77 deletions(-) diff --git a/go/store/blobstore/git_blobstore.go b/go/store/blobstore/git_blobstore.go index 08bcb6e9d4..c005dd00d6 100644 --- a/go/store/blobstore/git_blobstore.go +++ b/go/store/blobstore/git_blobstore.go @@ -174,6 +174,83 @@ func (m *multiPartReadCloser) Close() error { return nil } +type concatReadCloser struct { + ctx context.Context + keys []string + open func(ctx context.Context, key string) (io.ReadCloser, error) + cur int + curRC io.ReadCloser + done bool +} + +func (c *concatReadCloser) ensureCurrent() error { + if c.done || c.curRC != nil { + return nil + } + if c.cur >= len(c.keys) { + c.done = true + return nil + } + rc, err := 
c.open(c.ctx, c.keys[c.cur]) + if err != nil { + return err + } + c.curRC = rc + return nil +} + +func (c *concatReadCloser) closeCurrentAndAdvance() error { + if c.curRC != nil { + err := c.curRC.Close() + c.curRC = nil + c.cur++ + return err + } + c.cur++ + return nil +} + +func (c *concatReadCloser) Read(p []byte) (int, error) { + for { + if err := c.ensureCurrent(); err != nil { + return 0, err + } + if c.curRC == nil { + return 0, io.EOF + } + + n, err := c.curRC.Read(p) + if n > 0 { + // Preserve data; defer advancement until next Read call. + if err == io.EOF { + _ = c.closeCurrentAndAdvance() + return n, nil + } + return n, err + } + if err == nil { + continue + } + if err == io.EOF { + if cerr := c.closeCurrentAndAdvance(); cerr != nil { + return 0, cerr + } + continue + } + return 0, err + } +} + +func (c *concatReadCloser) Close() error { + c.done = true + if c.curRC != nil { + err := c.curRC.Close() + c.curRC = nil + return err + } + return nil +} + // GitBlobstore is a Blobstore implementation backed by a git repository's object // database (bare repo or .git directory). It stores keys as paths within the tree // of the commit referenced by a git ref (e.g. refs/dolt/data). 
@@ -946,83 +1023,6 @@ func (gbs *GitBlobstore) totalSizeAtCommit(ctx context.Context, commit git.OID, return int64(total), nil } -type concatReadCloser struct { - ctx context.Context - keys []string - open func(ctx context.Context, key string) (io.ReadCloser, error) - cur int - curRC io.ReadCloser - done bool -} - -func (c *concatReadCloser) ensureCurrent() error { - if c.done || c.curRC != nil { - return nil - } - if c.cur >= len(c.keys) { - c.done = true - return nil - } - rc, err := c.open(c.ctx, c.keys[c.cur]) - if err != nil { - return err - } - c.curRC = rc - return nil -} - -func (c *concatReadCloser) closeCurrentAndAdvance() error { - if c.curRC != nil { - err := c.curRC.Close() - c.curRC = nil - c.cur++ - return err - } - c.cur++ - return nil -} - -func (c *concatReadCloser) Read(p []byte) (int, error) { - for { - if err := c.ensureCurrent(); err != nil { - return 0, err - } - if c.curRC == nil { - return 0, io.EOF - } - - n, err := c.curRC.Read(p) - if n > 0 { - // Preserve data; defer advancement until next Read call. - if err == io.EOF { - _ = c.closeCurrentAndAdvance() - return n, nil - } - return n, err - } - if err == nil { - continue - } - if err == io.EOF { - if cerr := c.closeCurrentAndAdvance(); cerr != nil { - return 0, cerr - } - continue - } - return 0, err - } -} - -func (c *concatReadCloser) Close() error { - c.done = true - if c.curRC != nil { - err := c.curRC.Close() - c.curRC = nil - return err - } - return nil -} - func sliceInlineBlob(rc io.ReadCloser, sz int64, br BlobRange, ver string) (io.ReadCloser, uint64, string, error) { // Implement BlobRange by slicing the streamed blob contents. 
// TODO(gitblobstore): This streaming implementation is correct but may be slow for workloads From 76f2168f0cafbf08f983fbb7e9194d5d05e943f0 Mon Sep 17 00:00:00 2001 From: angelamayxie Date: Tue, 10 Feb 2026 00:55:28 +0000 Subject: [PATCH 28/28] [ga-bump-dep] Bump dependency in Dolt by angelamayxie --- go/go.mod | 2 +- go/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/go.mod b/go/go.mod index a7452145c4..fb5804552e 100644 --- a/go/go.mod +++ b/go/go.mod @@ -61,7 +61,7 @@ require ( github.com/dolthub/dolt-mcp v0.2.2 github.com/dolthub/eventsapi_schema v0.0.0-20260205214132-a7a3c84c84a1 github.com/dolthub/flatbuffers/v23 v23.3.3-dh.2 - github.com/dolthub/go-mysql-server v0.20.1-0.20260206233720-bbef18042f77 + github.com/dolthub/go-mysql-server v0.20.1-0.20260210005347-46fe127d0460 github.com/dolthub/gozstd v0.0.0-20240423170813-23a2903bca63 github.com/edsrzf/mmap-go v1.2.0 github.com/esote/minmaxheap v1.0.0 diff --git a/go/go.sum b/go/go.sum index 7a556cf1a8..4c96b0eee2 100644 --- a/go/go.sum +++ b/go/go.sum @@ -196,8 +196,8 @@ github.com/dolthub/fslock v0.0.0-20251215194149-ef20baba2318 h1:n+vdH5G5Db+1qnDC github.com/dolthub/fslock v0.0.0-20251215194149-ef20baba2318/go.mod h1:QWql+P17oAAMLnL4HGB5tiovtDuAjdDTPbuqx7bYfa0= github.com/dolthub/go-icu-regex v0.0.0-20250916051405-78a38d478790 h1:zxMsH7RLiG+dlZ/y0LgJHTV26XoiSJcuWq+em6t6VVc= github.com/dolthub/go-icu-regex v0.0.0-20250916051405-78a38d478790/go.mod h1:F3cnm+vMRK1HaU6+rNqQrOCyR03HHhR1GWG2gnPOqaE= -github.com/dolthub/go-mysql-server v0.20.1-0.20260206233720-bbef18042f77 h1:1b6Z3rm58d5LtLFQI2olPwnNTbwC1g7aTVRhrO6HJdc= -github.com/dolthub/go-mysql-server v0.20.1-0.20260206233720-bbef18042f77/go.mod h1:LEWdXw6LKjdonOv2X808RpUc8wZVtQx4ZEPvmDWkvY4= +github.com/dolthub/go-mysql-server v0.20.1-0.20260210005347-46fe127d0460 h1:ku4qVcwZUUImcaWOOrPWwhjD5BD34wS6LuENxU3XJUU= +github.com/dolthub/go-mysql-server v0.20.1-0.20260210005347-46fe127d0460/go.mod 
h1:LEWdXw6LKjdonOv2X808RpUc8wZVtQx4ZEPvmDWkvY4= github.com/dolthub/gozstd v0.0.0-20240423170813-23a2903bca63 h1:OAsXLAPL4du6tfbBgK0xXHZkOlos63RdKYS3Sgw/dfI= github.com/dolthub/gozstd v0.0.0-20240423170813-23a2903bca63/go.mod h1:lV7lUeuDhH5thVGDCKXbatwKy2KW80L4rMT46n+Y2/Q= github.com/dolthub/ishell v0.0.0-20240701202509-2b217167d718 h1:lT7hE5k+0nkBdj/1UOSFwjWpNxf+LCApbRHgnCA17XE=