Merge pull request #10409 from dolthub/db/gitblobstore

Add read-only GitBlobstore
This commit is contained in:
Dustin Brown
2026-02-04 10:10:30 -08:00
committed by GitHub
11 changed files with 1412 additions and 2 deletions
+4 -1
View File
@@ -21,4 +21,7 @@ integration-tests/bats/batsee_results
CLAUDE.md
*~
.dir-locals.el
.dir-locals.el
.beads
.gitattributes
+217
View File
@@ -0,0 +1,217 @@
// Copyright 2026 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package blobstore
import (
"context"
"fmt"
"io"
"strings"
git "github.com/dolthub/dolt/go/store/blobstore/internal/git"
)
// GitBlobstore is a Blobstore implementation backed by a git repository's object
// database (bare repo or .git directory). It stores keys as paths within the tree
// of the commit referenced by a git ref (e.g. refs/dolt/data).
//
// This initial implementation is intentionally READ-ONLY. Write-path methods
// (Put / CheckAndPut / Concatenate) return an explicit unimplemented error while
// we lock down read behavior for manifests and table files.
type GitBlobstore struct {
// gitDir is the bare repository or .git directory handed to the git runner.
gitDir string
// ref is the git ref that Exists and Get resolve to a commit on every call.
ref string
// runner executes the git plumbing commands used by the read path.
runner *git.Runner
}
// Compile-time check that *GitBlobstore satisfies the Blobstore interface.
var _ Blobstore = (*GitBlobstore)(nil)
// NewGitBlobstore builds a read-only GitBlobstore that serves keys from the tree
// of the commit referenced by |ref| inside |gitDir|. |gitDir| should point at a
// bare repo directory or a .git directory.
//
// An error is returned when a git runner cannot be created for |gitDir|.
func NewGitBlobstore(gitDir, ref string) (*GitBlobstore, error) {
	runner, err := git.NewRunner(gitDir)
	if err != nil {
		return nil, err
	}

	store := &GitBlobstore{
		gitDir: gitDir,
		ref:    ref,
		runner: runner,
	}
	return store, nil
}
// Path returns a human-readable identifier for this store in the form
// "<gitDir>@<ref>".
func (gbs *GitBlobstore) Path() string {
return fmt.Sprintf("%s@%s", gbs.gitDir, gbs.ref)
}
// Exists reports whether |key| resolves to a blob in the tree of the commit that
// the store's ref currently points at.
//
// A missing ref or a missing path yields (false, nil). An invalid key, or any
// other failure while resolving the ref or path, yields an error.
func (gbs *GitBlobstore) Exists(ctx context.Context, key string) (bool, error) {
	path, err := normalizeGitTreePath(key)
	if err != nil {
		return false, err
	}

	head, found, err := git.TryResolveRefCommit(ctx, gbs.runner, gbs.ref)
	switch {
	case err != nil:
		return false, err
	case !found:
		// Without the ref there is no tree to look in, so nothing exists.
		return false, nil
	}

	_, err = git.ResolvePathBlob(ctx, gbs.runner, head, path)
	switch {
	case err == nil:
		return true, nil
	case git.IsPathNotFound(err):
		return false, nil
	default:
		return false, err
	}
}
// Get opens a reader over the blob stored at |key| in the commit that the store's
// ref currently points at. It returns the reader, the total size of the blob (not
// of the requested range), and the commit id as the version string. The caller
// must Close the returned reader.
//
// Failure modes:
//   - invalid key: the key validation error
//   - missing ref: NotFound for the "manifest" key, *git.RefNotFoundError for any other key
//   - missing path: NotFound
//   - a range whose resolved offset lies outside the blob, or whose length is negative: an error
//
// Ranges are served by streaming the whole blob from git, discarding everything
// before the offset and limiting what follows.
func (gbs *GitBlobstore) Get(ctx context.Context, key string, br BlobRange) (io.ReadCloser, uint64, string, error) {
	path, err := normalizeGitTreePath(key)
	if err != nil {
		return nil, 0, "", err
	}

	head, found, err := git.TryResolveRefCommit(ctx, gbs.runner, gbs.ref)
	if err != nil {
		return nil, 0, "", err
	}
	if !found {
		// If the ref doesn't exist, treat the manifest as missing (empty store),
		// but surface a hard error for other keys: the store itself is missing.
		if path != "manifest" {
			return nil, 0, "", &git.RefNotFoundError{Ref: gbs.ref}
		}
		return nil, 0, "", NotFound{Key: path}
	}
	version := head.String()

	blobID, err := git.ResolvePathBlob(ctx, gbs.runner, head, path)
	if err != nil {
		if git.IsPathNotFound(err) {
			return nil, 0, version, NotFound{Key: path}
		}
		return nil, 0, version, err
	}

	size, err := git.BlobSize(ctx, gbs.runner, blobID)
	if err != nil {
		return nil, 0, version, err
	}

	// TODO(gitblobstore): This streaming implementation is correct but may be slow for workloads
	// that do many small ranged reads (e.g. table index/footer reads). Consider caching/materializing
	// blobs to a local file (or using a batched git cat-file mode) to serve ranges efficiently.
	stream, err := git.BlobReader(ctx, gbs.runner, blobID)
	if err != nil {
		return nil, 0, version, err
	}
	if br.isAllRange() {
		return stream, uint64(size), version, nil
	}

	// fail releases the already-started stream before reporting a problem.
	fail := func(cause error) (io.ReadCloser, uint64, string, error) {
		_ = stream.Close()
		return nil, uint64(size), version, cause
	}

	// Implement BlobRange by slicing the streamed blob contents.
	rng := br.positiveRange(size)
	switch {
	case rng.offset < 0 || rng.offset > size:
		return fail(fmt.Errorf("invalid BlobRange offset %d for blob of size %d", rng.offset, size))
	case rng.length < 0:
		return fail(fmt.Errorf("invalid BlobRange length %d", rng.length))
	}

	// A zero length means "read to the end"; a range that overshoots the blob
	// is clamped to the end (defensive; positiveRange should already do this).
	if rng.length == 0 || rng.offset+rng.length > size {
		rng.length = size - rng.offset
	}

	// Skip to offset.
	if rng.offset > 0 {
		if _, err := io.CopyN(io.Discard, stream, rng.offset); err != nil {
			return fail(err)
		}
	}

	limited := &limitReadCloser{r: io.LimitReader(stream, rng.length), c: stream}
	return limited, uint64(size), version, nil
}
// limitReadCloser pairs a reader with a separate closer. Get uses it to hand out
// a length-limited view of a stream while Close still closes the stream itself.
type limitReadCloser struct {
	r io.Reader // source of the bytes returned by Read
	c io.Closer // closed by Close
}

// Read forwards to the wrapped reader.
func (l *limitReadCloser) Read(p []byte) (int, error) {
	n, err := l.r.Read(p)
	return n, err
}

// Close forwards to the wrapped closer and returns its result.
func (l *limitReadCloser) Close() error {
	return l.c.Close()
}
// Put is not implemented in the read-only GitBlobstore. The key is still
// validated, so an invalid key reports its validation error; otherwise the
// returned error wraps git.ErrUnimplemented.
func (gbs *GitBlobstore) Put(ctx context.Context, key string, totalSize int64, reader io.Reader) (string, error) {
	_, err := normalizeGitTreePath(key)
	if err != nil {
		return "", err
	}
	return "", fmt.Errorf("%w: GitBlobstore.Put", git.ErrUnimplemented)
}
// CheckAndPut is not implemented in the read-only GitBlobstore. The key is still
// validated, so an invalid key reports its validation error; otherwise the
// returned error wraps git.ErrUnimplemented.
func (gbs *GitBlobstore) CheckAndPut(ctx context.Context, expectedVersion, key string, totalSize int64, reader io.Reader) (string, error) {
	_, err := normalizeGitTreePath(key)
	if err != nil {
		return "", err
	}
	return "", fmt.Errorf("%w: GitBlobstore.CheckAndPut", git.ErrUnimplemented)
}
// Concatenate is not implemented in the read-only GitBlobstore. The destination
// key and then each source key are validated in order, and the first validation
// error is returned; otherwise the returned error wraps git.ErrUnimplemented.
func (gbs *GitBlobstore) Concatenate(ctx context.Context, key string, sources []string) (string, error) {
	// Destination first, then the sources, in the order they were given.
	candidates := append([]string{key}, sources...)
	for _, candidate := range candidates {
		_, err := normalizeGitTreePath(candidate)
		if err != nil {
			return "", err
		}
	}
	return "", fmt.Errorf("%w: GitBlobstore.Concatenate", git.ErrUnimplemented)
}
// normalizeGitTreePath normalizes and validates a blobstore key for use as a git tree path.
// It returns the normalized key, or an error describing why the key was rejected.
//
// Rules:
//   - disallow NUL bytes
//   - convert Windows-style separators: "\" -> "/"
//   - disallow the empty key
//   - disallow absolute paths (leading "/")
//   - disallow empty segments and trailing "/"
//   - disallow "." and ".." segments
func normalizeGitTreePath(key string) (string, error) {
	// NUL bytes are rejected for the whole key up front, so the individual
	// segments never need to be re-checked for them.
	if strings.ContainsRune(key, '\x00') {
		return "", fmt.Errorf("invalid git blobstore key (NUL byte): %q", key)
	}

	key = strings.ReplaceAll(key, "\\", "/")
	if key == "" {
		return "", fmt.Errorf("invalid git blobstore key (empty)")
	}
	if strings.HasPrefix(key, "/") {
		return "", fmt.Errorf("invalid git blobstore key (absolute path): %q", key)
	}

	for _, segment := range strings.Split(key, "/") {
		switch segment {
		case "":
			// Covers both "a//b" and a trailing "/".
			return "", fmt.Errorf("invalid git blobstore key (empty path segment): %q", key)
		case ".", "..":
			return "", fmt.Errorf("invalid git blobstore key (path traversal): %q", key)
		}
	}
	return key, nil
}
+210
View File
@@ -0,0 +1,210 @@
// Copyright 2026 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package blobstore
import (
"context"
"errors"
"os/exec"
"testing"
"github.com/stretchr/testify/require"
git "github.com/dolthub/dolt/go/store/blobstore/internal/git"
"github.com/dolthub/dolt/go/store/testutils/gitrepo"
)
// TestGitBlobstore_RefMissingIsNotFound covers a freshly initialized bare repo in
// which the data ref does not exist: the manifest is reported as absent, while a
// Get of any other key fails with a *git.RefNotFoundError.
func TestGitBlobstore_RefMissingIsNotFound(t *testing.T) {
	if _, lookErr := exec.LookPath("git"); lookErr != nil {
		t.Skip("git not found on PATH")
	}

	ctx := context.Background()
	bare, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git")
	require.NoError(t, err)

	store, err := NewGitBlobstore(bare.GitDir, "refs/dolt/data")
	require.NoError(t, err)

	found, err := store.Exists(ctx, "manifest")
	require.NoError(t, err)
	require.False(t, found)

	_, _, err = GetBytes(ctx, store, "manifest", AllRange)
	require.Error(t, err)
	require.True(t, IsNotFoundError(err))

	// For non-manifest keys, missing the ref is a hard error.
	_, _, _, err = store.Get(ctx, "table", AllRange)
	require.Error(t, err)
	require.False(t, IsNotFoundError(err))

	var refErr *git.RefNotFoundError
	require.True(t, errors.As(err, &refErr))
}
// TestGitBlobstore_ExistsAndGet_AllRange seeds a ref with two blobs and checks
// Exists for present, missing and backslash-separated keys, a full read through
// GetBytes, and the size and version reported by a ranged Get.
func TestGitBlobstore_ExistsAndGet_AllRange(t *testing.T) {
	if _, lookErr := exec.LookPath("git"); lookErr != nil {
		t.Skip("git not found on PATH")
	}

	ctx := context.Background()
	bare, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git")
	require.NoError(t, err)

	manifest := []byte("hello manifest\n")
	seed := map[string][]byte{
		"manifest": manifest,
		"dir/file": []byte("abc"),
	}
	head, err := bare.SetRefToTree(ctx, "refs/dolt/data", seed, "seed")
	require.NoError(t, err)

	store, err := NewGitBlobstore(bare.GitDir, "refs/dolt/data")
	require.NoError(t, err)

	found, err := store.Exists(ctx, "manifest")
	require.NoError(t, err)
	require.True(t, found)

	found, err = store.Exists(ctx, "missing")
	require.NoError(t, err)
	require.False(t, found)

	// Validate key normalization: backslash -> slash.
	found, err = store.Exists(ctx, "dir\\file")
	require.NoError(t, err)
	require.True(t, found)

	data, version, err := GetBytes(ctx, store, "manifest", AllRange)
	require.NoError(t, err)
	require.Equal(t, head, version)
	require.Equal(t, manifest, data)

	// Validate size + version on Get. The reported size is that of the whole
	// blob even though only a 5-byte range was requested.
	reader, size, rangedVersion, err := store.Get(ctx, "manifest", NewBlobRange(0, 5))
	require.NoError(t, err)
	require.Equal(t, uint64(len(manifest)), size)
	require.Equal(t, head, rangedVersion)
	// The payload is not under test here, so the close result is ignored.
	_ = reader.Close()
}
// TestGitBlobstore_Get_NotFoundMissingKey checks that reading a key absent from
// an existing ref's tree is reported as a not-found error.
func TestGitBlobstore_Get_NotFoundMissingKey(t *testing.T) {
	if _, lookErr := exec.LookPath("git"); lookErr != nil {
		t.Skip("git not found on PATH")
	}

	ctx := context.Background()
	bare, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git")
	require.NoError(t, err)

	seed := map[string][]byte{
		"present": []byte("x"),
	}
	_, err = bare.SetRefToTree(ctx, "refs/dolt/data", seed, "seed")
	require.NoError(t, err)

	store, err := NewGitBlobstore(bare.GitDir, "refs/dolt/data")
	require.NoError(t, err)

	_, _, err = GetBytes(ctx, store, "missing", AllRange)
	require.Error(t, err)
	require.True(t, IsNotFoundError(err))
}
// TestGitBlobstore_BlobRangeSemantics reads one 16 KiB fixture blob through
// several BlobRange shapes (full, prefix, interior, tail-to-end, bounded tail)
// and checks the returned bytes and version for each.
func TestGitBlobstore_BlobRangeSemantics(t *testing.T) {
	if _, lookErr := exec.LookPath("git"); lookErr != nil {
		t.Skip("git not found on PATH")
	}

	ctx := context.Background()
	bare, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git")
	require.NoError(t, err)

	const maxValue = int64(16 * 1024)
	fixture := rangeData(0, maxValue)
	head, err := bare.SetRefToTree(ctx, "refs/dolt/data", map[string][]byte{
		"range": fixture,
	}, "range fixture")
	require.NoError(t, err)

	store, err := NewGitBlobstore(bare.GitDir, "refs/dolt/data")
	require.NoError(t, err)

	// check reads |br| of the fixture and compares the payload and version.
	check := func(br BlobRange, want []byte) {
		t.Helper()
		got, version, err := GetBytes(ctx, store, "range", br)
		require.NoError(t, err)
		require.Equal(t, head, version)
		require.Equal(t, want, got)
	}

	// full range
	check(AllRange, rangeData(0, maxValue))

	// first 2048 bytes (1024 shorts)
	check(NewBlobRange(0, 2048), rangeData(0, 1024))

	// bytes 2048..4096 of original
	check(NewBlobRange(2*1024, 2*1024), rangeData(1024, 2048))

	// last 2048 bytes
	check(NewBlobRange(-2*1024, 0), rangeData(maxValue-1024, maxValue))

	// tail slice: beginning 2048 bytes from end, size 512
	check(NewBlobRange(-2*1024, 512), rangeData(maxValue-1024, maxValue-768))
}
// TestGitBlobstore_InvalidKeysError checks that both Exists and Get reject keys
// that are empty, absolute, contain empty/"."/".." segments, or contain NUL.
func TestGitBlobstore_InvalidKeysError(t *testing.T) {
	if _, lookErr := exec.LookPath("git"); lookErr != nil {
		t.Skip("git not found on PATH")
	}

	ctx := context.Background()
	bare, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git")
	require.NoError(t, err)

	_, err = bare.SetRefToTree(ctx, "refs/dolt/data", map[string][]byte{"ok": []byte("x")}, "seed")
	require.NoError(t, err)

	store, err := NewGitBlobstore(bare.GitDir, "refs/dolt/data")
	require.NoError(t, err)

	badKeys := [...]string{
		"",
		"/abs",
		"../x",
		"a/../b",
		"a//b",
		"a/",
		".",
		"..",
		"a/./b",
		"a/\x00/b",
	}
	for i := range badKeys {
		key := badKeys[i]

		_, existsErr := store.Exists(ctx, key)
		require.Error(t, existsErr, "expected error for key %q", key)

		_, _, _, getErr := store.Get(ctx, key, AllRange)
		require.Error(t, getErr, "expected error for key %q", key)
	}
}
+67
View File
@@ -0,0 +1,67 @@
// Copyright 2026 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package git
import (
"errors"
"fmt"
)
// ErrUnimplemented is the sentinel returned by stubbed write-path APIs. It is
// intentionally exported so higher layers (e.g. GitBlobstore) can wrap it with
// %w or match it with errors.Is.
var ErrUnimplemented = errors.New("unimplemented")
// RefNotFoundError reports that a ref (e.g. refs/dolt/data) could not be resolved.
type RefNotFoundError struct {
	// Ref is the name of the ref that failed to resolve.
	Ref string
}

// Error implements the error interface.
func (e *RefNotFoundError) Error() string {
	msg := fmt.Sprintf("git ref not found: %s", e.Ref)
	return msg
}
// PathNotFoundError reports that a tree path could not be resolved within a commit.
type PathNotFoundError struct {
	// Commit is the commit whose tree was searched.
	Commit string
	// Path is the tree path that was not found.
	Path string
}

// Error implements the error interface, rendering the location as "<commit>:<path>".
func (e *PathNotFoundError) Error() string {
	msg := fmt.Sprintf("git path not found: %s:%s", e.Commit, e.Path)
	return msg
}
// NotBlobError reports that a resolved path did not refer to a blob object.
type NotBlobError struct {
	// Commit is the commit whose tree was searched.
	Commit string
	// Path is the tree path that was resolved.
	Path string
	// Type is the object type that was found instead of a blob; may be empty.
	Type string
}

// Error implements the error interface. The object type is included in the
// message only when it is known.
func (e *NotBlobError) Error() string {
	if e.Type != "" {
		return fmt.Sprintf("git path is not a blob (%s): %s:%s", e.Type, e.Commit, e.Path)
	}
	return fmt.Sprintf("git path is not a blob: %s:%s", e.Commit, e.Path)
}
// IsRefNotFound reports whether |err|, or any error it wraps, is a *RefNotFoundError.
func IsRefNotFound(err error) bool {
	target := new(*RefNotFoundError)
	return errors.As(err, target)
}
// IsPathNotFound reports whether |err|, or any error it wraps, is a *PathNotFoundError.
func IsPathNotFound(err error) bool {
	target := new(*PathNotFoundError)
	return errors.As(err, target)
}
+176
View File
@@ -0,0 +1,176 @@
// Copyright 2026 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package git
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"strconv"
"strings"
)
// OID is a git object id in hex (typically 40-char SHA1).
type OID string
// String returns the object id as a plain string.
func (o OID) String() string { return string(o) }
// TryResolveRefCommit resolves |ref| to a commit OID using
// `git rev-parse --verify --quiet <ref>^{commit}`.
//
// It returns ok=false with a nil error when the ref does not exist (or when git
// succeeds but prints nothing). Any other git failure is returned as err.
func TryResolveRefCommit(ctx context.Context, r *Runner, ref string) (oid OID, ok bool, err error) {
	out, runErr := r.Run(ctx, RunOptions{}, "rev-parse", "--verify", "--quiet", ref+"^{commit}")
	if runErr != nil {
		if isRefNotFoundErr(runErr) {
			return "", false, nil
		}
		return "", false, runErr
	}

	hex := strings.TrimSpace(string(out))
	if hex == "" {
		// Shouldn't happen, but treat as missing.
		return "", false, nil
	}
	return OID(hex), true, nil
}
// ResolveRefCommit resolves |ref| to a commit OID. Unlike TryResolveRefCommit, a
// missing ref is reported as a *RefNotFoundError.
func ResolveRefCommit(ctx context.Context, r *Runner, ref string) (OID, error) {
	commit, found, err := TryResolveRefCommit(ctx, r, ref)
	switch {
	case err != nil:
		return "", err
	case !found:
		return "", &RefNotFoundError{Ref: ref}
	default:
		return commit, nil
	}
}
// ResolvePathBlob resolves |path| within |commit| to a blob OID by running
// `git rev-parse --verify <commit>:<path>` and then checking the object type.
//
// It returns PathNotFoundError if the path does not exist, and NotBlobError if
// the path resolves to a non-blob object (e.g. a tree). Other git failures are
// returned unchanged.
func ResolvePathBlob(ctx context.Context, r *Runner, commit OID, path string) (OID, error) {
	commitHex := commit.String()
	spec := commitHex + ":" + path

	out, err := r.Run(ctx, RunOptions{}, "rev-parse", "--verify", spec)
	switch {
	case err == nil:
		// Resolved; continue below.
	case isPathNotFoundErr(err):
		return "", &PathNotFoundError{Commit: commitHex, Path: path}
	default:
		return "", err
	}

	resolved := strings.TrimSpace(string(out))
	if resolved == "" {
		return "", fmt.Errorf("git rev-parse returned empty oid for %q", spec)
	}

	kind, err := CatFileType(ctx, r, OID(resolved))
	if err != nil {
		return "", err
	}
	if kind != "blob" {
		return "", &NotBlobError{Commit: commitHex, Path: path, Type: kind}
	}
	return OID(resolved), nil
}
// CatFileType returns the git object type for |oid| (e.g. "blob", "tree",
// "commit"), as printed by `git cat-file -t` with surrounding whitespace trimmed.
func CatFileType(ctx context.Context, r *Runner, oid OID) (string, error) {
	out, err := r.Run(ctx, RunOptions{}, "cat-file", "-t", oid.String())
	if err == nil {
		return strings.TrimSpace(string(out)), nil
	}
	return "", err
}
// BlobSize returns the size in bytes of the blob object |oid|, parsed from the
// output of `git cat-file -s`. An error is returned when git fails or when its
// output is not a base-10 integer.
func BlobSize(ctx context.Context, r *Runner, oid OID) (int64, error) {
	raw, err := r.Run(ctx, RunOptions{}, "cat-file", "-s", oid.String())
	if err != nil {
		return 0, err
	}

	text := strings.TrimSpace(string(raw))
	size, parseErr := strconv.ParseInt(text, 10, 64)
	if parseErr != nil {
		return 0, fmt.Errorf("git cat-file -s parse error (%q): %w", text, parseErr)
	}
	return size, nil
}
// BlobReader returns a reader for blob contents, streamed from
// `git cat-file blob <oid>`. The returned ReadCloser will wait for the git
// process to exit when closed, returning a CmdError if the process fails.
// The *exec.Cmd handle returned by Runner.Start is discarded.
func BlobReader(ctx context.Context, r *Runner, oid OID) (io.ReadCloser, error) {
rc, _, err := r.Start(ctx, RunOptions{}, "cat-file", "blob", oid.String())
return rc, err
}
// isRefNotFoundErr reports whether |err| is (or wraps) a *CmdError whose exit
// code and output indicate that a ref could not be resolved.
//
// errors.As is used rather than a direct type assertion so that a CmdError
// wrapped by an intermediate layer is still recognized, matching the other
// error predicates in this package.
func isRefNotFoundErr(err error) bool {
	var ce *CmdError
	if !errors.As(err, &ce) {
		return false
	}
	// For `git rev-parse --verify --quiet <ref>^{commit}`, a missing ref typically yields exit 1 and no output.
	if ce.ExitCode == 1 && len(bytes.TrimSpace(ce.Output)) == 0 {
		return true
	}
	// Some git versions may still emit "fatal: Needed a single revision" without --quiet; keep a defensive check.
	msg := strings.ToLower(string(ce.Output))
	return strings.Contains(msg, "needed a single revision") ||
		strings.Contains(msg, "unknown revision") ||
		strings.Contains(msg, "not a valid object name")
}
// isPathNotFoundErr reports whether |err| is (or wraps) a *CmdError whose exit
// code (1 or 128) and output indicate that a "<commit>:<path>" spec could not
// be resolved.
//
// errors.As is used rather than a direct type assertion so that a CmdError
// wrapped by an intermediate layer is still recognized, matching the other
// error predicates in this package.
func isPathNotFoundErr(err error) bool {
	var ce *CmdError
	if !errors.As(err, &ce) {
		return false
	}
	if ce.ExitCode != 128 && ce.ExitCode != 1 {
		return false
	}

	msg := strings.ToLower(string(ce.Output))
	// Common patterns:
	// - "fatal: Path 'x' does not exist in 'HEAD'"
	// - "fatal: invalid object name 'HEAD:x'"
	// - "fatal: Needed a single revision"
	// - "fatal: ambiguous argument '...': unknown revision or path not in the working tree."
	return strings.Contains(msg, "does not exist in") ||
		strings.Contains(msg, "invalid object name") ||
		strings.Contains(msg, "needed a single revision") ||
		strings.Contains(msg, "unknown revision or path not in the working tree")
}
// ReadAllBytes is a small helper for read-path callers that want a whole object.
// This is not used by GitBlobstore.Get (which must support BlobRange), but it is useful in tests.
//
// The error from closing the reader is returned rather than dropped: Close is
// where a failed git process is reported, and a process that fails without
// writing anything would otherwise look like a successful read of an empty blob.
func ReadAllBytes(ctx context.Context, r *Runner, oid OID) ([]byte, error) {
	rc, err := BlobReader(ctx, r, oid)
	if err != nil {
		return nil, err
	}

	data, readErr := io.ReadAll(rc)
	// Always close, so the git process is waited on even when the read failed.
	closeErr := rc.Close()
	if readErr != nil {
		return data, readErr
	}
	if closeErr != nil {
		return nil, closeErr
	}
	return data, nil
}
// NormalizeGitPlumbingError strips a CmdError wrapper: when |err| is (or wraps)
// a *CmdError with a non-nil Cause, that Cause is returned; otherwise |err| is
// returned unchanged. Mostly useful for callers that want to compare against
// context cancellation.
func NormalizeGitPlumbingError(err error) error {
	var cmdErr *CmdError
	if !errors.As(err, &cmdErr) || cmdErr.Cause == nil {
		return err
	}
	return cmdErr.Cause
}
+255
View File
@@ -0,0 +1,255 @@
// Copyright 2026 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package git provides helpers for invoking git plumbing commands against a bare
// repository or .git directory without a working tree checkout.
package git
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"os"
"os/exec"
"strings"
)
// maxCapturedOutputBytes bounds how much command output formatOutput renders
// into an error message; longer output is cut down to its trailing bytes.
// It does not limit how much output is buffered while a command runs.
const maxCapturedOutputBytes = 64 * 1024
// Runner executes git commands with GIT_DIR set (and optionally GIT_INDEX_FILE).
// It is intended for git plumbing usage and should not require a working tree.
type Runner struct {
// gitPath is the git executable that is invoked.
gitPath string
// gitDir is exported to every command as GIT_DIR.
gitDir string
// extraEnv is appended to os.Environ() for every command.
extraEnv []string
}
// NewRunner creates a Runner for |gitDir| using the git binary found on PATH.
// It fails when no git binary can be located.
func NewRunner(gitDir string) (*Runner, error) {
	gitPath, err := exec.LookPath("git")
	if err == nil {
		return NewRunnerWithGitPath(gitDir, gitPath), nil
	}
	return nil, fmt.Errorf("git not found on PATH: %w", err)
}
// NewRunnerWithGitPath creates a Runner for |gitDir| that invokes the git binary
// at the explicit path |gitPath|. Neither argument is validated here.
func NewRunnerWithGitPath(gitDir, gitPath string) *Runner {
	runner := new(Runner)
	runner.gitPath = gitPath
	runner.gitDir = gitDir
	return runner
}
// WithExtraEnv returns a copy of r whose commands additionally receive the given
// env entries (e.g. "K=V"), appended after any entries r already carries. The
// receiver is left untouched and the two runners do not share a backing slice.
func (r *Runner) WithExtraEnv(env ...string) *Runner {
	var merged []string
	merged = append(merged, r.extraEnv...)
	merged = append(merged, env...)

	clone := *r
	clone.extraEnv = merged
	return &clone
}
// RunOptions control a single git invocation.
type RunOptions struct {
// Dir is the working directory for the git process. Optional.
Dir string
// IndexFile sets GIT_INDEX_FILE for the git process. Optional.
IndexFile string
// Stdin provides stdin to the git process. Optional.
Stdin io.Reader
// Stdout and Stderr override output destinations for Runner.Run. If both are
// nil, output is captured and returned. If only Stdout is set, stderr is sent
// there as well. If only Stderr is set, stdout is left unset and nothing is
// captured. Runner.Start does not consult these fields: it always pipes stdout
// to the returned reader and buffers stderr.
Stdout io.Writer
Stderr io.Writer
// Env is appended to the process environment, after GIT_DIR, GIT_INDEX_FILE
// and the Runner's own extra entries.
Env []string
}
// CmdError represents a failed git invocation with captured output.
type CmdError struct {
// Args are the arguments passed to git (the binary itself is not included).
Args []string
// Dir is the working directory the command was configured with; may be empty.
Dir string
// ExitCode is the process exit code, or 0 when the failure was not an
// *exec.ExitError.
ExitCode int
// Output is the captured output: combined stdout and stderr for Runner.Run
// when it captures, stderr only for a reader obtained from Runner.Start.
Output []byte
// Cause is the underlying error from running or waiting on the command; it
// is what Unwrap returns.
Cause error
}
// Error renders a multi-line description of the failure: a headline with the exit
// code (when non-zero), the command line, the working directory (when set), the
// captured output as formatted by formatOutput, and the underlying cause.
func (e *CmdError) Error() string {
	var msg strings.Builder

	msg.WriteString("git command failed")
	if e.ExitCode != 0 {
		fmt.Fprintf(&msg, " (exit %d)", e.ExitCode)
	}
	if len(e.Args) > 0 {
		msg.WriteString("\ncommand: git ")
		msg.WriteString(strings.Join(e.Args, " "))
	}
	if e.Dir != "" {
		msg.WriteString("\ndir: ")
		msg.WriteString(e.Dir)
	}

	msg.WriteString("\noutput:\n")
	msg.WriteString(formatOutput(e.Output))

	if cause := e.Cause; cause != nil {
		msg.WriteString("\nerror: ")
		msg.WriteString(cause.Error())
	}
	return msg.String()
}

// Unwrap exposes the underlying cause to errors.Is and errors.As.
func (e *CmdError) Unwrap() error {
	return e.Cause
}
// Run executes "git <args...>" with GIT_DIR set and waits for it to finish.
//
// Output handling:
//   - neither opts.Stdout nor opts.Stderr set: stdout and stderr are captured
//     into one buffer, which is returned (on success and on failure alike)
//   - only opts.Stdout set: both streams go to opts.Stdout
//   - opts.Stderr set: each stream goes to its own destination (stdout is left
//     unset when opts.Stdout is nil)
//
// A failed invocation is reported as a *CmdError carrying the arguments, the
// working directory, the exit code (0 when the failure is not an
// *exec.ExitError) and whatever output was captured.
func (r *Runner) Run(ctx context.Context, opts RunOptions, args ...string) ([]byte, error) {
	cmd := exec.CommandContext(ctx, r.gitPath, args...) //nolint:gosec // args are controlled by caller; used for internal plumbing.
	cmd.Dir = opts.Dir
	cmd.Env = r.env(opts)
	cmd.Stdin = opts.Stdin

	// Capture combined output unless caller provided destinations.
	var captured bytes.Buffer
	switch {
	case opts.Stdout == nil && opts.Stderr == nil:
		cmd.Stdout, cmd.Stderr = &captured, &captured
	case opts.Stderr != nil:
		cmd.Stdout, cmd.Stderr = opts.Stdout, opts.Stderr
	default:
		// Reasonable default: if only Stdout is set, send stderr there too.
		cmd.Stdout, cmd.Stderr = opts.Stdout, opts.Stdout
	}

	runErr := cmd.Run()
	out := captured.Bytes()
	if runErr == nil {
		return out, nil
	}

	failure := &CmdError{
		Args:   append([]string(nil), args...),
		Dir:    cmd.Dir,
		Output: out,
		Cause:  runErr,
	}
	var exitErr *exec.ExitError
	if errors.As(runErr, &exitErr) {
		failure.ExitCode = exitErr.ExitCode()
	}
	return out, failure
}
// Start starts "git <args...>" with GIT_DIR set and returns a ReadCloser for its
// stdout without waiting for the process to finish. Stderr is buffered so that a
// failure reported on Close carries git's diagnostics. opts.Stdout and
// opts.Stderr are not consulted.
//
// Resource management:
//   - Call Close() on the returned ReadCloser to ensure the underlying git process
//     is waited (cmd.Wait()) and resources are released.
//   - The returned *exec.Cmd is provided for advanced uses (e.g. signals), but most
//     callers should not call Wait() directly.
func (r *Runner) Start(ctx context.Context, opts RunOptions, args ...string) (io.ReadCloser, *exec.Cmd, error) {
	cmd := exec.CommandContext(ctx, r.gitPath, args...) //nolint:gosec // args are controlled by caller; used for internal plumbing.
	cmd.Dir = opts.Dir
	cmd.Env = r.env(opts)
	cmd.Stdin = opts.Stdin

	pipe, err := cmd.StdoutPipe()
	if err != nil {
		return nil, nil, err
	}

	// Capture stderr into a buffer so failures have actionable output.
	errBuf := new(bytes.Buffer)
	cmd.Stderr = errBuf

	if err = cmd.Start(); err != nil {
		_ = pipe.Close()
		return nil, nil, err
	}

	// Wrap stdout so that Close also waits to avoid zombies if callers bail early.
	reader := &cmdReadCloser{
		r:      pipe,
		cmd:    cmd,
		stderr: errBuf,
		args:   append([]string(nil), args...),
		dir:    cmd.Dir,
	}
	return reader, cmd, nil
}
// cmdReadCloser is the reader handed out by Runner.Start: it reads the command's
// stdout and, on Close, waits for the process and converts a failure into a
// *CmdError.
type cmdReadCloser struct {
	r      io.ReadCloser // stdout pipe of the running command
	cmd    *exec.Cmd     // the started command, waited on by Close
	stderr *bytes.Buffer // buffered stderr, reported as CmdError.Output
	args   []string      // git arguments, reported as CmdError.Args
	dir    string        // working directory, reported as CmdError.Dir
}

// Read reads from the command's stdout.
func (c *cmdReadCloser) Read(p []byte) (int, error) {
	return c.r.Read(p)
}

// Close closes the stdout pipe and then waits for the command to exit. A clean
// exit yields nil; anything else yields a *CmdError carrying the buffered stderr.
//
// NOTE(review): the pipe is closed before waiting, so a caller that closes before
// draining stdout may make git fail on its next write and have that failure
// reported here — confirm whether ranged readers need to tolerate this.
func (c *cmdReadCloser) Close() error {
	_ = c.r.Close()

	waitErr := c.cmd.Wait()
	if waitErr == nil {
		return nil
	}

	failure := &CmdError{
		Args:   c.args,
		Dir:    c.dir,
		Output: c.stderr.Bytes(),
		Cause:  waitErr,
	}
	var exitErr *exec.ExitError
	if errors.As(waitErr, &exitErr) {
		failure.ExitCode = exitErr.ExitCode()
	}
	return failure
}
// env builds the environment for one git invocation. Entries are appended in this
// order: the current process environment, GIT_DIR, GIT_INDEX_FILE (only when
// opts.IndexFile is set), the runner's extra entries, and finally opts.Env.
func (r *Runner) env(opts RunOptions) []string {
	inherited := os.Environ()

	vars := make([]string, 0, len(inherited)+2+len(r.extraEnv)+len(opts.Env))
	vars = append(vars, inherited...)
	vars = append(vars, "GIT_DIR="+r.gitDir)
	if opts.IndexFile != "" {
		vars = append(vars, "GIT_INDEX_FILE="+opts.IndexFile)
	}
	vars = append(vars, r.extraEnv...)
	return append(vars, opts.Env...)
}
// formatOutput prepares captured command output for inclusion in an error
// message. Empty output becomes "(no output)"; output longer than
// maxCapturedOutputBytes is reduced to its trailing bytes behind a truncation
// notice; trailing newlines are trimmed in both non-empty cases.
func formatOutput(out []byte) string {
	switch size := len(out); {
	case size == 0:
		return "(no output)"
	case size <= maxCapturedOutputBytes:
		return strings.TrimRight(string(out), "\n")
	default:
		tail := out[size-maxCapturedOutputBytes:]
		return fmt.Sprintf("... (truncated; showing last %d bytes)\n%s", maxCapturedOutputBytes, strings.TrimRight(string(tail), "\n"))
	}
}
+104
View File
@@ -0,0 +1,104 @@
// Copyright 2026 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package git
import (
"context"
"fmt"
)
// WriteAPI defines the git plumbing operations needed for Approach A (temporary index
// via GIT_INDEX_FILE) to perform updates without a working tree checkout.
//
// This file intentionally does not implement these operations yet; the current
// GitBlobstore milestone is read-only. All methods on the default implementation
// (UnimplementedWriteAPI) return an error wrapping ErrUnimplemented.
type WriteAPI interface {
// ReadTree populates |indexFile| with the entries from |commit|'s root tree.
// Equivalent plumbing:
//   GIT_DIR=... GIT_INDEX_FILE=<indexFile> git read-tree <commit>^{tree}
ReadTree(ctx context.Context, commit OID, indexFile string) error
// ReadTreeEmpty initializes |indexFile| to an empty index.
// Equivalent plumbing:
//   GIT_DIR=... GIT_INDEX_FILE=<indexFile> git read-tree --empty
ReadTreeEmpty(ctx context.Context, indexFile string) error
// UpdateIndexCacheInfo adds or replaces |path| in |indexFile| with the given blob |oid| and filemode |mode|.
// Equivalent plumbing:
//   GIT_DIR=... GIT_INDEX_FILE=<indexFile> git update-index --add --cacheinfo <mode> <oid> <path>
UpdateIndexCacheInfo(ctx context.Context, indexFile string, mode string, oid OID, path string) error
// WriteTree writes a tree object from the contents of |indexFile| and returns its oid.
// Equivalent plumbing:
//   GIT_DIR=... GIT_INDEX_FILE=<indexFile> git write-tree
WriteTree(ctx context.Context, indexFile string) (OID, error)
// CommitTree creates a commit object from |tree| with optional |parent| and returns its oid.
// Equivalent plumbing:
//   GIT_DIR=... git commit-tree <tree> [-p <parent>] -m <message>
CommitTree(ctx context.Context, tree OID, parent *OID, message string, author *Identity) (OID, error)
// UpdateRefCAS atomically updates |ref| from |oldOID| to |newOID|.
// Equivalent plumbing:
//   GIT_DIR=... git update-ref -m <msg> <ref> <newOID> <oldOID>
UpdateRefCAS(ctx context.Context, ref string, newOID OID, oldOID OID, msg string) error
// UpdateRef updates |ref| to |newOID| without a compare-and-swap.
// Equivalent plumbing:
//   GIT_DIR=... git update-ref -m <msg> <ref> <newOID>
UpdateRef(ctx context.Context, ref string, newOID OID, msg string) error
}
// Identity represents git author/committer metadata. A future implementation
// may set this via environment variables (GIT_AUTHOR_NAME, etc.).
type Identity struct {
// Name is the person's display name.
Name string
// Email is the person's email address.
Email string
}
// UnimplementedWriteAPI is the default write API for the read-only milestone.
// It can be embedded or returned by constructors to make write paths fail fast:
// every method returns an error wrapping ErrUnimplemented.
type UnimplementedWriteAPI struct{}
// Compile-time check that UnimplementedWriteAPI satisfies WriteAPI.
var _ WriteAPI = UnimplementedWriteAPI{}
// ReadTree is a stub; it always returns an error wrapping ErrUnimplemented.
func (UnimplementedWriteAPI) ReadTree(ctx context.Context, commit OID, indexFile string) error {
return fmt.Errorf("%w: ReadTree", ErrUnimplemented)
}
// ReadTreeEmpty is a stub; it always returns an error wrapping ErrUnimplemented.
func (UnimplementedWriteAPI) ReadTreeEmpty(ctx context.Context, indexFile string) error {
return fmt.Errorf("%w: ReadTreeEmpty", ErrUnimplemented)
}
// UpdateIndexCacheInfo is a stub; it always returns an error wrapping ErrUnimplemented.
func (UnimplementedWriteAPI) UpdateIndexCacheInfo(ctx context.Context, indexFile string, mode string, oid OID, path string) error {
return fmt.Errorf("%w: UpdateIndexCacheInfo", ErrUnimplemented)
}
// WriteTree is a stub; it always returns an empty OID and an error wrapping ErrUnimplemented.
func (UnimplementedWriteAPI) WriteTree(ctx context.Context, indexFile string) (OID, error) {
return "", fmt.Errorf("%w: WriteTree", ErrUnimplemented)
}
// CommitTree is a stub; it always returns an empty OID and an error wrapping ErrUnimplemented.
func (UnimplementedWriteAPI) CommitTree(ctx context.Context, tree OID, parent *OID, message string, author *Identity) (OID, error) {
return "", fmt.Errorf("%w: CommitTree", ErrUnimplemented)
}
// UpdateRefCAS is a stub; it always returns an error wrapping ErrUnimplemented.
func (UnimplementedWriteAPI) UpdateRefCAS(ctx context.Context, ref string, newOID OID, oldOID OID, msg string) error {
return fmt.Errorf("%w: UpdateRefCAS", ErrUnimplemented)
}
// UpdateRef is a stub; it always returns an error wrapping ErrUnimplemented.
func (UnimplementedWriteAPI) UpdateRef(ctx context.Context, ref string, newOID OID, msg string) error {
return fmt.Errorf("%w: UpdateRef", ErrUnimplemented)
}
@@ -0,0 +1,110 @@
// Copyright 2026 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package nbs
import (
"bytes"
"context"
"io"
"os/exec"
"testing"
"github.com/stretchr/testify/require"
"github.com/dolthub/dolt/go/store/blobstore"
"github.com/dolthub/dolt/go/store/hash"
"github.com/dolthub/dolt/go/store/testutils/gitrepo"
"github.com/dolthub/dolt/go/store/types"
)
// TestGitBlobstoreReadSmoke_ManifestAndTableAccessPatterns seeds a bare git
// repository with a manifest blob and a table-like blob under refs/dolt/data,
// then reads them back through a GitBlobstore three ways: manifest parsing, a
// negative-offset tail read, and ReadAtWithStats ranged reads.
func TestGitBlobstoreReadSmoke_ManifestAndTableAccessPatterns(t *testing.T) {
	if _, lookErr := exec.LookPath("git"); lookErr != nil {
		t.Skip("git not found on PATH")
	}

	const (
		dataRef  = "refs/dolt/data"
		tableKey = "table"
	)
	ctx := context.Background()

	repo, err := gitrepo.InitBare(ctx, t.TempDir()+"/repo.git")
	require.NoError(t, err)

	// Manifest with no table specs, serialized up front so it can be stored as
	// a blob in the seeded tree.
	want := manifestContents{
		nbfVers: types.Format_DOLT.VersionString(),
		lock:    hash.Of([]byte("lock")),
		root:    hash.Of([]byte("root")),
		gcGen:   hash.Of([]byte("gcgen")),
		specs:   nil,
	}
	var manifestBytes bytes.Buffer
	require.NoError(t, writeManifest(&manifestBytes, want))

	// 64 KiB of patterned bytes standing in for a table file. It is read below
	// both from the tail (negative BlobRange offset) and by absolute offset.
	tableData := make([]byte, 64*1024)
	for i := range tableData {
		tableData[i] = byte(i % 251)
	}

	seed := map[string][]byte{
		"manifest": manifestBytes.Bytes(),
		tableKey:   tableData,
	}
	commit, err := repo.SetRefToTree(ctx, dataRef, seed, "seed refs/dolt/data for smoke test")
	require.NoError(t, err)
	require.NotEmpty(t, commit)

	bs, err := blobstore.NewGitBlobstore(repo.GitDir, dataRef)
	require.NoError(t, err)

	// 1) The manifest must round-trip through blobstoreManifest.ParseIfExists.
	stats := NewStats()
	found, parsed, err := blobstoreManifest{bs: bs}.ParseIfExists(ctx, stats, nil)
	require.NoError(t, err)
	require.True(t, found)
	require.Equal(t, want.nbfVers, parsed.nbfVers)
	require.Equal(t, want.root, parsed.root)
	require.Equal(t, want.lock, parsed.lock)
	require.Equal(t, want.gcGen, parsed.gcGen)
	require.Len(t, parsed.specs, 0)

	// 2) Tail read: bs.Get with NewBlobRange(-N, 0) followed by io.ReadFull.
	const tailN = 1024
	rc, size, version, err := bs.Get(ctx, tableKey, blobstore.NewBlobRange(-tailN, 0))
	require.NoError(t, err)
	require.Equal(t, uint64(len(tableData)), size)
	require.Equal(t, commit, version)
	tailBuf := make([]byte, tailN)
	_, err = io.ReadFull(rc, tailBuf)
	require.NoError(t, err)
	require.NoError(t, rc.Close())
	require.Equal(t, tableData[len(tableData)-tailN:], tailBuf)

	// 3) Ranged reads through bsTableReaderAt.ReadAtWithStats.
	const readOff = 1234
	reader := &bsTableReaderAt{bs: bs, key: tableKey}
	page := make([]byte, 4096)
	n, err := reader.ReadAtWithStats(ctx, page, readOff, stats)
	require.NoError(t, err)
	require.Equal(t, len(page), n)
	require.Equal(t, tableData[readOff:readOff+len(page)], page)

	// A read that starts 100 bytes before the end must come back short (100
	// bytes) without reporting an error.
	shortPage := make([]byte, 4096)
	nearEnd := int64(len(tableData) - 100)
	n, err = reader.ReadAtWithStats(ctx, shortPage, nearEnd, stats)
	require.NoError(t, err)
	require.Equal(t, 100, n)
	require.Equal(t, tableData[nearEnd:], shortPage[:n])
}
+211
View File
@@ -0,0 +1,211 @@
// Copyright 2026 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package gitrepo contains test helpers for creating and manipulating git repositories
// using plumbing commands without requiring a working tree checkout.
//
// This package is intended for tests of GitBlobstore and related read paths. It
// deliberately uses the git CLI (not a Go git library) to keep the harness small
// and to match how the initial GitBlobstore implementation interacts with git.
package gitrepo
import (
"bytes"
"context"
"fmt"
"os"
"os/exec"
"path/filepath"
"sort"
"strings"
)
// Repo is a lightweight, test-only handle to a bare git repository. The
// repository directory itself serves as the GIT_DIR.
type Repo struct {
	// GitDir is the filesystem path of the bare repository directory.
	GitDir string
}
// InitBare runs `git init --bare` for |dir| and returns a Repo whose GitDir is
// |dir|. To stay portable across git versions, callers should generally pass a
// path that does not exist yet.
func InitBare(ctx context.Context, dir string) (*Repo, error) {
	err := runGit(ctx, "", "", "", "init", "--bare", dir)
	if err != nil {
		return nil, err
	}
	repo := &Repo{GitDir: dir}
	return repo, nil
}
// InitBareTemp creates a fresh temporary directory under |parentDir| (or
// os.TempDir if |parentDir| is empty) and initializes a bare git repository in
// a "repo.git" child of it.
//
// The returned Repo's GitDir is that child directory. If initialization fails,
// the temporary directory is removed again (best effort) so nothing is left
// behind, and the initialization error is returned.
func InitBareTemp(ctx context.Context, parentDir string) (*Repo, error) {
	if parentDir == "" {
		parentDir = os.TempDir()
	}
	dir, err := os.MkdirTemp(parentDir, "gitrepo-bare-")
	if err != nil {
		return nil, err
	}
	// git init --bare expects the target directory to not exist in some
	// versions; initializing a not-yet-existing child directory avoids that.
	repo, err := InitBare(ctx, filepath.Join(dir, "repo.git"))
	if err != nil {
		// Best-effort cleanup: the init error is what the caller needs to see,
		// so a failure to remove the scratch directory is deliberately ignored.
		_ = os.RemoveAll(dir)
		return nil, err
	}
	return repo, nil
}
// SetRefToTree builds a tree from |files|, wraps it in a commit, and points
// |ref| at that commit. No working tree is needed: blobs are written with
// hash-object and staged into a throwaway index (GIT_INDEX_FILE) that is
// deleted before returning.
//
//   - |ref| example: "refs/dolt/data"
//   - |files| maps tree paths (e.g. "manifest", "a/b/c") to blob contents
//   - |message| is the commit message; "test commit" is used when it is empty
//
// It returns the trimmed output of commit-tree, i.e. the new commit's object id.
func (r *Repo) SetRefToTree(ctx context.Context, ref string, files map[string][]byte, message string) (commitOID string, err error) {
	if len(message) == 0 {
		message = "test commit"
	}

	// The temporary index lives in its own scratch directory so that cleanup
	// is a single RemoveAll.
	scratch, err := os.MkdirTemp("", "gitrepo-index-")
	if err != nil {
		return "", err
	}
	defer func() { _ = os.RemoveAll(scratch) }()
	index := filepath.Join(scratch, "index")

	// Start from an empty index.
	if err = runGit(ctx, r.GitDir, index, "", "read-tree", "--empty"); err != nil {
		return "", err
	}

	// Stage every path in sorted order so the sequence of git invocations is
	// deterministic despite map iteration order.
	names := make([]string, 0, len(files))
	for name := range files {
		names = append(names, name)
	}
	sort.Strings(names)
	for _, name := range names {
		blobOID, hashErr := hashObject(ctx, r.GitDir, files[name])
		if hashErr != nil {
			return "", hashErr
		}
		addErr := runGit(ctx, r.GitDir, index, "", "update-index", "--add", "--cacheinfo", "100644", blobOID, name)
		if addErr != nil {
			return "", addErr
		}
	}

	rawTree, err := outputGit(ctx, r.GitDir, index, nil, "write-tree")
	if err != nil {
		return "", err
	}
	tree := strings.TrimSpace(rawTree)
	if tree == "" {
		return "", fmt.Errorf("write-tree returned empty oid")
	}

	rawCommit, err := outputGit(ctx, r.GitDir, "", commitEnv(), "commit-tree", tree, "-m", message)
	if err != nil {
		return "", err
	}
	commitOID = strings.TrimSpace(rawCommit)
	if commitOID == "" {
		return "", fmt.Errorf("commit-tree returned empty oid")
	}

	if err = runGit(ctx, r.GitDir, "", "", "update-ref", ref, commitOID); err != nil {
		return "", err
	}
	return commitOID, nil
}
// commitEnv returns the environment entries that pin the author and committer
// name and email for commits created by this test harness.
func commitEnv() []string {
	identity := [...]string{
		"GIT_AUTHOR_NAME=gitrepo test",
		"GIT_AUTHOR_EMAIL=gitrepo@test.invalid",
		"GIT_COMMITTER_NAME=gitrepo test",
		"GIT_COMMITTER_EMAIL=gitrepo@test.invalid",
	}
	return identity[:]
}
// hashObject feeds |data| to `git hash-object -w --stdin` with GIT_DIR set to
// |gitDir| and returns the trimmed output, which is the object id git printed.
// An empty result is reported as an error.
func hashObject(ctx context.Context, gitDir string, data []byte) (string, error) {
	stdin := bytes.NewReader(data)
	raw, err := outputGitWithStdin(ctx, gitDir, "", "", stdin, "hash-object", "-w", "--stdin")
	if err != nil {
		return "", err
	}
	if oid := strings.TrimSpace(raw); oid != "" {
		return oid, nil
	}
	return "", fmt.Errorf("hash-object returned empty oid")
}
func runGit(ctx context.Context, gitDir, indexFile string, extraEnv string, args ...string) error {
_, err := outputGit(ctx, gitDir, indexFile, splitEnv(extraEnv), args...)
return err
}
// outputGit runs `git <args...>` and returns what the command wrote to stdout.
//
// The child environment is produced by envForGit from |gitDir|, |indexFile|
// and |extraEnv|. Stdout and stderr are captured in separate buffers so that
// warnings or hints git prints on stderr during a successful run cannot leak
// into the returned value. On failure the returned error wraps the underlying
// exec error and includes the command line plus everything the command
// printed (stdout followed by stderr).
func outputGit(ctx context.Context, gitDir, indexFile string, extraEnv []string, args ...string) (string, error) {
	cmd := exec.CommandContext(ctx, "git", args...) //nolint:gosec // test harness invokes git with controlled args.
	cmd.Env = envForGit(gitDir, indexFile, extraEnv)
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		// Keep both streams in the error so failures stay debuggable.
		combined := stdout.String() + stderr.String()
		return "", fmt.Errorf("%w\ncommand: %s\noutput:\n%s", err, cmd.String(), strings.TrimRight(combined, "\n"))
	}
	return stdout.String(), nil
}
// outputGitWithStdin runs `git <args...>` with |stdin| connected to the
// command's standard input and returns what the command wrote to stdout.
//
// |extraEnv| is a newline-separated "K=V" list that is split by splitEnv; the
// child environment is then produced by envForGit. Stdout and stderr are
// captured in separate buffers so that warnings or hints git prints on stderr
// during a successful run cannot leak into the returned value. On failure the
// returned error wraps the underlying exec error and includes the command line
// plus everything the command printed (stdout followed by stderr).
func outputGitWithStdin(ctx context.Context, gitDir, indexFile string, extraEnv string, stdin *bytes.Reader, args ...string) (string, error) {
	cmd := exec.CommandContext(ctx, "git", args...) //nolint:gosec // test harness invokes git with controlled args.
	cmd.Env = envForGit(gitDir, indexFile, splitEnv(extraEnv))
	cmd.Stdin = stdin
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr
	if err := cmd.Run(); err != nil {
		// Keep both streams in the error so failures stay debuggable.
		combined := stdout.String() + stderr.String()
		return "", fmt.Errorf("%w\ncommand: %s\noutput:\n%s", err, cmd.String(), strings.TrimRight(combined, "\n"))
	}
	return stdout.String(), nil
}
// envForGit returns the environment for a git child process: a copy of the
// current process environment, followed by GIT_DIR and GIT_INDEX_FILE entries
// for whichever of |gitDir| and |indexFile| are non-empty, followed by |extra|.
func envForGit(gitDir, indexFile string, extra []string) []string {
	var env []string
	env = append(env, os.Environ()...)

	overrides := [...]struct{ prefix, value string }{
		{"GIT_DIR=", gitDir},
		{"GIT_INDEX_FILE=", indexFile},
	}
	for _, o := range overrides {
		if o.value == "" {
			continue
		}
		env = append(env, o.prefix+o.value)
	}
	return append(env, extra...)
}
// splitEnv converts a newline-separated "K=V\nK2=V2" string into a slice of
// entries. Each line is trimmed of surrounding whitespace and lines that end
// up empty are dropped. An empty |extraEnv| yields nil.
func splitEnv(extraEnv string) []string {
	if len(extraEnv) == 0 {
		return nil
	}
	rawLines := strings.Split(extraEnv, "\n")
	entries := make([]string, 0, len(rawLines))
	for i := range rawLines {
		entry := strings.TrimSpace(rawLines[i])
		if entry == "" {
			continue
		}
		entries = append(entries, entry)
	}
	return entries
}
@@ -0,0 +1,57 @@
// Copyright 2026 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package gitrepo
import (
"context"
"os/exec"
"path/filepath"
"strings"
"testing"
)
// TestInitBareAndSetRefToTree initializes a bare repository, seeds
// refs/dolt/data with a small tree that includes nested paths, and then uses
// `git cat-file -e` to confirm that "manifest" is reachable from the returned
// commit.
func TestInitBareAndSetRefToTree(t *testing.T) {
	if _, lookErr := exec.LookPath("git"); lookErr != nil {
		t.Skip("git not found on PATH")
	}

	ctx := context.Background()
	repo, err := InitBare(ctx, filepath.Join(t.TempDir(), "repo.git"))
	if err != nil {
		t.Fatalf("InitBare failed: %v", err)
	}

	seed := map[string][]byte{
		"manifest":   []byte("hello\n"),
		"dir/file":   []byte("abc"),
		"dir/file2":  []byte("def"),
		"dir2/x.txt": []byte("xyz"),
	}
	commit, err := repo.SetRefToTree(ctx, "refs/dolt/data", seed, "seed refs/dolt/data")
	if err != nil {
		t.Fatalf("SetRefToTree failed: %v", err)
	}
	if strings.TrimSpace(commit) == "" {
		t.Fatalf("expected non-empty commit oid")
	}

	// cat-file -e exits non-zero when the object named by <commit>:<path> is
	// missing, so a nil error means the path exists in the commit.
	out, err := exec.CommandContext(ctx, "git", "--git-dir", repo.GitDir, "cat-file", "-e", commit+":manifest").CombinedOutput() //nolint:gosec
	if err != nil {
		t.Fatalf("cat-file -e failed: %v\n%s", err, string(out))
	}
}
+1 -1
View File
@@ -10,7 +10,7 @@ paths=`find . -maxdepth 1 -mindepth 1 \( -type d -print -o -type f -name '*.go'
goimports -w -local github.com/dolthub/dolt,github.com/dolthub/eventsapi_schema $paths
bad_files=$(find $paths -name '*.go' | while read f; do
if [[ $(awk '/import \(/{flag=1;next}/\)/{flag=0}flag' < $f | egrep -c '$^') -gt 2 ]]; then
if [[ $(awk '/import \(/{flag=1;next}/\)/{flag=0}flag' < $f | grep -Ec '$^') -gt 2 ]]; then
echo $f
fi
done)