/go/{cmd,libraries}: dbfactory stuff and url stuff

This commit is contained in:
coffeegoddd☕️✨
2026-02-10 16:41:28 -08:00
parent 86ca603c47
commit a33f7cb26b
9 changed files with 620 additions and 13 deletions

View File

@@ -187,9 +187,12 @@ func parseArgs(apr *argparser.ArgParseResults) (string, string, errhand.VerboseE
urlStr := apr.Arg(0)
_, err := earl.Parse(urlStr)
if err != nil {
return "", "", errhand.BuildDError("error: invalid remote url: %s", urlStr).Build()
if normalized, ok, nerr := env.NormalizeGitRemoteUrl(urlStr); nerr == nil && ok {
urlStr = normalized
} else {
return "", "", errhand.BuildDError("error: invalid remote url: %s", urlStr).Build()
}
}
var dir string

View File

@@ -99,7 +99,11 @@ func (cmd ReadTablesCmd) Exec(ctx context.Context, commandStr string, args []str
_, err := earl.Parse(urlStr)
if err != nil {
return HandleVErrAndExitCode(errhand.BuildDError("Invalid remote url").AddCause(err).Build(), usage)
if normalized, ok, nerr := env.NormalizeGitRemoteUrl(urlStr); nerr == nil && ok {
urlStr = normalized
} else {
return HandleVErrAndExitCode(errhand.BuildDError("Invalid remote url").AddCause(err).Build(), usage)
}
}
dir := apr.GetValueOrDefault(dirParamName, path.Base(urlStr))

View File

@@ -53,6 +53,12 @@ const (
OSSScheme = "oss"
// Git remote dbfactory schemes (Git remotes as Dolt remotes)
GitFileScheme = "git+file"
GitHTTPScheme = "git+http"
GitHTTPSScheme = "git+https"
GitSSHScheme = "git+ssh"
defaultScheme = HTTPSScheme
defaultMemTableSize = 256 * 1024 * 1024
)
@@ -69,15 +75,19 @@ type DBFactory interface {
// DBFactories is a map from url scheme name to DBFactory. Additional factories can be added to the DBFactories map
// from external packages.
var DBFactories = map[string]DBFactory{
AWSScheme: AWSFactory{},
OSSScheme: OSSFactory{},
GSScheme: GSFactory{},
OCIScheme: OCIFactory{},
FileScheme: FileFactory{},
MemScheme: MemFactory{},
LocalBSScheme: LocalBSFactory{},
HTTPScheme: NewDoltRemoteFactory(true),
HTTPSScheme: NewDoltRemoteFactory(false),
AWSScheme: AWSFactory{},
OSSScheme: OSSFactory{},
GSScheme: GSFactory{},
OCIScheme: OCIFactory{},
FileScheme: FileFactory{},
MemScheme: MemFactory{},
LocalBSScheme: LocalBSFactory{},
HTTPScheme: NewDoltRemoteFactory(true),
HTTPSScheme: NewDoltRemoteFactory(false),
GitFileScheme: GitRemoteFactory{},
GitHTTPScheme: GitRemoteFactory{},
GitHTTPSScheme: GitRemoteFactory{},
GitSSHScheme: GitRemoteFactory{},
}
// CreateDB creates a database based on the supplied urlStr, and creation params. The DBFactory used for creation is

View File

@@ -0,0 +1,239 @@
// Copyright 2026 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package dbfactory
import (
"context"
"crypto/sha256"
"encoding/hex"
"errors"
"fmt"
"net/url"
"os"
"os/exec"
"path/filepath"
"strings"
"github.com/dolthub/dolt/go/store/blobstore"
"github.com/dolthub/dolt/go/store/datas"
"github.com/dolthub/dolt/go/store/nbs"
"github.com/dolthub/dolt/go/store/prolly/tree"
"github.com/dolthub/dolt/go/store/types"
)
const (
// gitCacheDirParam is the key looked up in the CreateDB params map to override the
// directory that holds the local cache repositories. Its value must be a non-blank string.
gitCacheDirParam = "git_cache_dir"
// gitCacheDirEnv is the environment variable consulted for the cache directory when
// gitCacheDirParam is not supplied.
gitCacheDirEnv = "DOLT_GIT_REMOTE_CACHE_DIR"
// defaultGitRef is the git ref used when the remote URL carries no "ref" query parameter.
defaultGitRef = "refs/dolt/data"
)
// GitRemoteFactory opens a Dolt database backed by a Git remote, using a local bare
// repository as an object cache and remote configuration store.
//
// Supported schemes (registered in factory.go):
// - git+file
// - git+http
// - git+https
// - git+ssh
type GitRemoteFactory struct{}
// Compile-time assertion that GitRemoteFactory implements DBFactory.
var _ DBFactory = GitRemoteFactory{}
// PrepareDB makes sure the target of a git+file URL exists on disk, running
// `git init --bare` at that location when nothing is there yet. Every other
// scheme is rejected because there is no local target to prepare.
func (fact GitRemoteFactory) PrepareDB(ctx context.Context, nbf *types.NomsBinFormat, urlObj *url.URL, params map[string]interface{}) error {
	if strings.ToLower(urlObj.Scheme) != GitFileScheme {
		return fmt.Errorf("prepare not supported for scheme %q", urlObj.Scheme)
	}

	underlying, _, parseErr := parseGitRemoteFactoryURL(urlObj)
	if parseErr != nil {
		return parseErr
	}
	if underlying.Scheme != "file" {
		return fmt.Errorf("git+file: expected underlying file URL, got %q", underlying.Scheme)
	}

	// The on-disk location is the URL host joined with the OS-native form of the URL path.
	repoDir := filepath.Join(underlying.Host, filepath.FromSlash(underlying.Path))
	if repoDir == "" {
		return fmt.Errorf("git+file: empty remote path")
	}

	_, statErr := os.Stat(repoDir)
	switch {
	case statErr == nil:
		// Something already exists at the path; leave it untouched.
		return nil
	case errors.Is(statErr, os.ErrNotExist):
		return runGitInitBare(ctx, repoDir)
	default:
		return statErr
	}
}
// CreateDB opens a database whose chunks live in a git remote. It derives the
// underlying remote URL and ref from urlObj, makes sure a bare cache repository
// exists for that (remote, ref) pair with an "origin" remote pointing at the
// underlying URL, and then layers a git-backed chunk store, value store and
// node store on top of the cache repository.
func (fact GitRemoteFactory) CreateDB(ctx context.Context, nbf *types.NomsBinFormat, urlObj *url.URL, params map[string]interface{}) (datas.Database, types.ValueReadWriter, tree.NodeStore, error) {
	const originName = "origin"

	underlying, gitRef, err := parseGitRemoteFactoryURL(urlObj)
	if err != nil {
		return nil, nil, nil, err
	}

	cacheRoot, err := resolveGitCacheBase(params)
	if err != nil {
		return nil, nil, nil, err
	}

	originURL := underlying.String()
	repoDir, err := cacheRepoPath(cacheRoot, originURL, gitRef)
	if err != nil {
		return nil, nil, nil, err
	}

	if err = ensureBareRepo(ctx, repoDir); err != nil {
		return nil, nil, nil, err
	}

	// The cache repository's "origin" must exist and point at the underlying git remote URL.
	if err = ensureGitRemoteURL(ctx, repoDir, originName, originURL); err != nil {
		return nil, nil, nil, err
	}

	quota := nbs.NewUnlimitedMemQuotaProvider()
	store, err := nbs.NewGitStore(ctx, nbf.VersionString(), repoDir, gitRef, blobstore.GitBlobstoreOptions{RemoteName: originName}, defaultMemTableSize, quota)
	if err != nil {
		return nil, nil, nil, err
	}

	valueStore := types.NewValueStore(store)
	nodeStore := tree.NewNodeStore(store)
	return datas.NewTypesDatabase(valueStore, nodeStore), valueStore, nodeStore, nil
}
// parseGitRemoteFactoryURL splits a git+<scheme> dbfactory URL into the URL of the
// underlying git remote and the git ref to use.
//
// The returned remoteURL is a copy of urlObj (urlObj itself is not modified) with the
// "git+" prefix removed from its lower-cased scheme and with the query and fragment
// removed. ref is the value of the "ref" query parameter, or defaultGitRef when that
// parameter is absent or empty.
//
// An error is returned when urlObj is nil, when its scheme does not start with "git+",
// or when nothing follows the "git+" prefix.
func parseGitRemoteFactoryURL(urlObj *url.URL) (remoteURL *url.URL, ref string, err error) {
	if urlObj == nil {
		return nil, "", fmt.Errorf("nil url")
	}
	scheme := strings.ToLower(urlObj.Scheme)
	if !strings.HasPrefix(scheme, "git+") {
		return nil, "", fmt.Errorf("expected git+ scheme, got %q", urlObj.Scheme)
	}
	underlyingScheme := strings.TrimPrefix(scheme, "git+")
	if underlyingScheme == "" {
		return nil, "", fmt.Errorf("invalid git+ scheme %q", urlObj.Scheme)
	}
	ref = urlObj.Query().Get("ref")
	if ref == "" {
		ref = defaultGitRef
	}
	cp := *urlObj
	cp.Scheme = underlyingScheme
	// Remove every trace of the query and fragment. ForceQuery must be cleared as well,
	// otherwise an input that ended in a bare "?" still renders with a trailing "?",
	// and RawFragment is cleared so no stale encoded fragment lingers on the copy.
	cp.RawQuery = ""
	cp.ForceQuery = false
	cp.Fragment = ""
	cp.RawFragment = ""
	return &cp, ref, nil
}
// resolveGitCacheBase picks the directory under which cache repositories are kept.
// Sources are tried in this order:
//  1. the gitCacheDirParam entry of params, which must be a non-blank string,
//  2. the gitCacheDirEnv environment variable (surrounding whitespace trimmed),
//  3. "dolt/git-remote-cache" beneath os.UserCacheDir().
func resolveGitCacheBase(params map[string]interface{}) (string, error) {
	// Reading from a nil map is safe and simply reports the key as absent.
	if raw, present := params[gitCacheDirParam]; present && raw != nil {
		dir, isString := raw.(string)
		switch {
		case !isString:
			return "", fmt.Errorf("%s must be a string", gitCacheDirParam)
		case strings.TrimSpace(dir) == "":
			return "", fmt.Errorf("%s cannot be empty", gitCacheDirParam)
		}
		return dir, nil
	}

	if fromEnv := strings.TrimSpace(os.Getenv(gitCacheDirEnv)); fromEnv != "" {
		return fromEnv, nil
	}

	userCache, err := os.UserCacheDir()
	if err != nil {
		return "", err
	}
	return filepath.Join(userCache, "dolt", "git-remote-cache"), nil
}
// cacheRepoPath returns where the cache repository for a (remoteURL, ref) pair lives:
// <cacheBase>/<hex sha256 of "remoteURL|ref">/repo.git. A blank cacheBase is an error.
func cacheRepoPath(cacheBase, remoteURL, ref string) (string, error) {
	if len(strings.TrimSpace(cacheBase)) == 0 {
		return "", fmt.Errorf("empty git cache base")
	}
	hasher := sha256.New()
	hasher.Write([]byte(remoteURL + "|" + ref))
	key := hex.EncodeToString(hasher.Sum(nil))
	return filepath.Join(cacheBase, key, "repo.git"), nil
}
// ensureBareRepo guarantees that gitDir exists. An existing directory is accepted
// as-is, an existing non-directory is an error, and a missing path is created by
// making its parent directories and running `git init --bare`.
func ensureBareRepo(ctx context.Context, gitDir string) error {
	if gitDir == "" {
		return fmt.Errorf("empty gitDir")
	}

	info, statErr := os.Stat(gitDir)
	switch {
	case statErr == nil:
		if info.IsDir() {
			return nil
		}
		return fmt.Errorf("git cache repo path is not a directory: %s", gitDir)
	case !errors.Is(statErr, os.ErrNotExist):
		return statErr
	}

	// Path does not exist yet: create the parent chain, then let git create the repo itself.
	if mkErr := os.MkdirAll(filepath.Dir(gitDir), 0o755); mkErr != nil {
		return mkErr
	}
	return runGitInitBare(ctx, gitDir)
}
// ensureGitRemoteURL makes the remote called remoteName in gitDir point at remoteURL.
// If `git remote get-url` fails the remote is assumed to be missing and is added;
// if it reports a different URL the remote is updated; otherwise nothing is changed.
func ensureGitRemoteURL(ctx context.Context, gitDir string, remoteName string, remoteURL string) error {
	switch {
	case strings.TrimSpace(remoteName) == "":
		return fmt.Errorf("empty remote name")
	case strings.TrimSpace(remoteURL) == "":
		return fmt.Errorf("empty remote url")
	}

	current, err := runGitInDir(ctx, gitDir, "remote", "get-url", remoteName)
	if err != nil {
		// The lookup failed, most likely because the remote is not configured yet; try adding it.
		return runGitInDirNoOutput(ctx, gitDir, "remote", "add", remoteName, remoteURL)
	}

	if strings.TrimSpace(current) != remoteURL {
		return runGitInDirNoOutput(ctx, gitDir, "remote", "set-url", remoteName, remoteURL)
	}
	return nil
}
// runGitInitBare runs `git init --bare <dir>`. It fails early when no git executable
// is on PATH, and otherwise wraps any command failure together with git's output.
func runGitInitBare(ctx context.Context, dir string) error {
	if _, lookErr := exec.LookPath("git"); lookErr != nil {
		return fmt.Errorf("git not found on PATH: %w", lookErr)
	}

	initCmd := exec.CommandContext(ctx, "git", "init", "--bare", dir) //nolint:gosec // controlled args
	if output, runErr := initCmd.CombinedOutput(); runErr != nil {
		return fmt.Errorf("git init --bare failed: %w\noutput:\n%s", runErr, strings.TrimSpace(string(output)))
	}
	return nil
}
// runGitInDir runs `git --git-dir <gitDir> <args...>` and returns what the command
// wrote to standard output.
//
// Only stdout is returned: callers compare the result against expected values (for
// example the URL printed by `git remote get-url`), so warnings or hints that git
// prints on stderr must not be mixed into it. When the command fails, the returned
// error wraps the underlying error and includes both the stdout and the captured
// stderr text for diagnosis. An error is also returned when git is not on PATH.
func runGitInDir(ctx context.Context, gitDir string, args ...string) (string, error) {
	if _, err := exec.LookPath("git"); err != nil {
		return "", fmt.Errorf("git not found on PATH: %w", err)
	}
	all := append([]string{"--git-dir", gitDir}, args...)
	cmd := exec.CommandContext(ctx, "git", all...) //nolint:gosec // controlled args
	stdout, err := cmd.Output()
	if err != nil {
		// Output leaves cmd.Stderr unset, so a non-zero exit carries the captured
		// stderr on the *exec.ExitError; append it to whatever reached stdout.
		diag := stdout
		var exitErr *exec.ExitError
		if errors.As(err, &exitErr) {
			diag = append(diag, exitErr.Stderr...)
		}
		return "", fmt.Errorf("git %s failed: %w\noutput:\n%s", strings.Join(args, " "), err, strings.TrimSpace(string(diag)))
	}
	return string(stdout), nil
}
// runGitInDirNoOutput runs a git command against gitDir via runGitInDir and reports
// only whether it succeeded, discarding the command's output.
func runGitInDirNoOutput(ctx context.Context, gitDir string, args ...string) error {
	if _, runErr := runGitInDir(ctx, gitDir, args...); runErr != nil {
		return runErr
	}
	return nil
}

View File

@@ -0,0 +1,89 @@
// Copyright 2026 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package dbfactory
import (
"context"
"crypto/sha256"
"encoding/hex"
"os"
"os/exec"
"path/filepath"
"strings"
"testing"
"github.com/stretchr/testify/require"
"github.com/dolthub/dolt/go/store/chunks"
"github.com/dolthub/dolt/go/store/hash"
"github.com/dolthub/dolt/go/store/testutils/gitrepo"
"github.com/dolthub/dolt/go/store/types"
)
// TestGitRemoteFactory_GitFile_UsesConfiguredCacheDirAndCanWrite opens a git+file
// database with an explicit cache directory, checks that the cache repository is
// created at the expected hashed location, writes and commits a single chunk, and
// finally verifies that the ref exists as a commit in the remote repository.
func TestGitRemoteFactory_GitFile_UsesConfiguredCacheDirAndCanWrite(t *testing.T) {
	if _, lookErr := exec.LookPath("git"); lookErr != nil {
		t.Skip("git not found on PATH")
	}

	const dataRef = "refs/dolt/data"
	ctx := context.Background()

	origin, err := gitrepo.InitBare(ctx, t.TempDir()+"/remote.git")
	require.NoError(t, err)

	cacheRoot := t.TempDir()
	originPath := filepath.ToSlash(origin.GitDir)

	db, vrw, _, err := CreateDB(
		ctx,
		types.Format_Default,
		"git+file://"+originPath+"?ref="+dataRef,
		map[string]interface{}{gitCacheDirParam: cacheRoot},
	)
	require.NoError(t, err)
	require.NotNil(t, db)
	require.NotNil(t, vrw)

	// The cache repository must sit under the configured directory, keyed by sha256("<remote>|<ref>").
	digest := sha256.Sum256([]byte("file://" + originPath + "|" + dataRef))
	cacheHead := filepath.Join(cacheRoot, hex.EncodeToString(digest[:]), "repo.git", "HEAD")
	_, err = os.Stat(cacheHead)
	require.NoError(t, err)

	valueStore, isValueStore := vrw.(*types.ValueStore)
	require.True(t, isValueStore, "expected ValueReadWriter to be *types.ValueStore, got %T", vrw)
	store := valueStore.ChunkStore()

	// Smallest possible write: store one chunk, then commit its hash as the new root.
	payload := chunks.NewChunk([]byte("hello\n"))
	err = store.Put(ctx, payload, func(chunks.Chunk) chunks.GetAddrsCb {
		return func(context.Context, hash.HashSet, chunks.PendingRefExists) error { return nil }
	})
	require.NoError(t, err)

	prevRoot, err := store.Root(ctx)
	require.NoError(t, err)
	committed, err := store.Commit(ctx, payload.Hash(), prevRoot)
	require.NoError(t, err)
	require.True(t, committed)

	require.NoError(t, db.Close())

	// After the commit the remote repository should resolve the data ref to a commit.
	verify := exec.CommandContext(ctx, "git", "--git-dir", origin.GitDir, "rev-parse", "--verify", "--quiet", dataRef+"^{commit}")
	output, err := verify.CombinedOutput()
	require.NoError(t, err, "git rev-parse failed: %s", strings.TrimSpace(string(output)))
}

View File

@@ -0,0 +1,185 @@
// Copyright 2026 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package env
import (
"fmt"
"net/url"
"path/filepath"
"strings"
)
// defaultGitRemoteRef is the value ensureDefaultRefQuery stores in the "ref" query
// parameter when a URL does not already carry a non-empty one.
const defaultGitRemoteRef = "refs/dolt/data"
// supportedGitPlusSchemes is the set of explicit git+* schemes that
// NormalizeGitRemoteUrl accepts without translation (matched case-insensitively).
var supportedGitPlusSchemes = map[string]struct{}{
"git+file": {},
"git+http": {},
"git+https": {},
"git+ssh": {},
}
// supportedUnderlyingGitSchemes is the set of plain URL schemes that
// NormalizeGitRemoteUrl rewrites to their git+ counterpart when the URL ends in .git.
var supportedUnderlyingGitSchemes = map[string]struct{}{
"file": {},
"http": {},
"https": {},
"ssh": {},
}
// NormalizeGitRemoteUrl translates user-provided git remote strings into a canonical dbfactory URL
// using git+* schemes.
//
// It accepts:
// - Explicit dbfactory URLs: git+file/http/https/ssh://...
// - URLs ending in .git: file/http/https/ssh URLs
// - scp-style ssh: [user@]host:path/repo.git
// - schemeless host/path: host/org/repo.git (defaults to git+https)
// - local paths ending in .git (absolute or relative) (translated to git+file)
//
// It returns ok=false when the input is not recognized as a git remote URL (so callers can fall back
// to existing remote handling).
func NormalizeGitRemoteUrl(urlArg string) (normalized string, ok bool, err error) {
urlArg = strings.TrimSpace(urlArg)
if urlArg == "" {
return "", false, fmt.Errorf("empty remote url")
}
// Fast-path: explicit git+* dbfactory URL.
if strings.HasPrefix(strings.ToLower(urlArg), "git+") {
u, err := url.Parse(urlArg)
if err != nil {
return "", false, err
}
if _, ok := supportedGitPlusSchemes[strings.ToLower(u.Scheme)]; !ok {
return "", false, fmt.Errorf("unsupported git dbfactory scheme %q", u.Scheme)
}
ensureDefaultRefQuery(u)
return u.String(), true, nil
}
// Only translate obvious git remote strings (must end in .git).
base := stripQueryAndFragment(urlArg)
if !strings.HasSuffix(base, ".git") {
return "", false, nil
}
// scp-like ssh: [user@]host:path/repo.git (no scheme, no ://)
if isScpLikeGitRemote(urlArg) {
host, p := splitScpLike(urlArg)
ssh := "git+ssh://" + host + "/" + strings.TrimPrefix(p, "/")
u, err := url.Parse(ssh)
if err != nil {
return "", false, err
}
ensureDefaultRefQuery(u)
return u.String(), true, nil
}
// file/http/https/ssh url with a scheme.
if strings.Contains(urlArg, "://") {
u, err := url.Parse(urlArg)
if err != nil {
return "", false, err
}
s := strings.ToLower(u.Scheme)
if _, ok := supportedUnderlyingGitSchemes[s]; !ok {
return "", false, nil
}
u.Scheme = "git+" + s
ensureDefaultRefQuery(u)
return u.String(), true, nil
}
// Local filesystem path (absolute or relative).
if looksLikeLocalPath(urlArg) {
abs, err := filepath.Abs(urlArg)
if err != nil {
return "", false, err
}
abs = filepath.ToSlash(abs)
u, err := url.Parse("git+file://" + abs)
if err != nil {
return "", false, err
}
ensureDefaultRefQuery(u)
return u.String(), true, nil
}
// Schemeless host/path.git defaults to https.
u, err := url.Parse("git+https://" + urlArg)
if err != nil {
return "", false, err
}
ensureDefaultRefQuery(u)
return u.String(), true, nil
}
// stripQueryAndFragment returns the part of s that precedes its first '?' or '#',
// or s unchanged when it contains neither.
func stripQueryAndFragment(s string) string {
	if cut := strings.IndexAny(s, "?#"); cut >= 0 {
		return s[:cut]
	}
	return s
}
// looksLikeLocalPath reports whether s begins like a filesystem path, that is with
// "/", "./" or "../".
func looksLikeLocalPath(s string) bool {
	for _, prefix := range [...]string{"/", "./", "../"} {
		if strings.HasPrefix(s, prefix) {
			return true
		}
	}
	return false
}
// isScpLikeGitRemote reports whether s has the scp-style shape [user@]host:path.
// The matcher is deliberately simple:
//   - s must not contain "://"
//   - s is split at its first ':' into a host part and a path part, both non-empty
//   - the host part contains no '/'
//   - the host part contains a '.' or an '@', which keeps Windows drive paths such
//     as C:\repo.git from being classified as remotes
//
// The caller is expected to have checked the ".git" suffix already.
func isScpLikeGitRemote(s string) bool {
	if strings.Contains(s, "://") {
		return false
	}
	host, rest, found := strings.Cut(s, ":")
	if !found || host == "" || rest == "" {
		return false
	}
	return !strings.Contains(host, "/") && strings.ContainsAny(host, ".@")
}
// splitScpLike splits s at its first ':' into the host part and the path part.
// When s has no ':' the host is empty and the whole string is returned as the path.
func splitScpLike(s string) (host string, path string) {
	before, after, found := strings.Cut(s, ":")
	if !found {
		return "", s
	}
	return before, after
}
// ensureDefaultRefQuery sets the "ref" query parameter of u to defaultGitRemoteRef
// when it is missing or empty, re-encoding the whole query in that case. A URL that
// already has a non-empty ref is left untouched.
func ensureDefaultRefQuery(u *url.URL) {
	values := u.Query()
	if values.Get("ref") != "" {
		return
	}
	values.Set("ref", defaultGitRemoteRef)
	u.RawQuery = values.Encode()
}

View File

@@ -0,0 +1,67 @@
// Copyright 2026 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package env
import (
"path/filepath"
"testing"
"github.com/stretchr/testify/require"
)
// TestNormalizeGitRemoteUrl covers each input shape NormalizeGitRemoteUrl recognizes
// (explicit git+ URL, plain https URL, scp-style ssh, schemeless host/path, local
// absolute path) as well as a URL that must be reported as not recognized.
func TestNormalizeGitRemoteUrl(t *testing.T) {
	// Query string appended whenever the input carries no ref of its own.
	const defaultRefQuery = "?ref=refs%2Fdolt%2Fdata"

	recognized := []struct {
		name  string
		input string
		want  string
	}{
		{
			name:  "explicit git+https keeps scheme and adds default ref",
			input: "git+https://example.com/org/repo.git",
			want:  "git+https://example.com/org/repo.git" + defaultRefQuery,
		},
		{
			name:  "https .git becomes git+https and adds default ref",
			input: "https://example.com/org/repo.git",
			want:  "git+https://example.com/org/repo.git" + defaultRefQuery,
		},
		{
			name:  "scp-style becomes git+ssh and adds default ref",
			input: "git@github.com:org/repo.git",
			want:  "git+ssh://git@github.com/org/repo.git" + defaultRefQuery,
		},
		{
			name:  "schemeless host/path defaults to git+https and adds default ref",
			input: "github.com/org/repo.git",
			want:  "git+https://github.com/org/repo.git" + defaultRefQuery,
		},
	}
	for _, tc := range recognized {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			got, ok, err := NormalizeGitRemoteUrl(tc.input)
			require.NoError(t, err)
			require.True(t, ok)
			require.Equal(t, tc.want, got)
		})
	}

	t.Run("local absolute path becomes git+file and adds default ref", func(t *testing.T) {
		repoPath := filepath.ToSlash(filepath.Join(t.TempDir(), "remote.git"))
		got, ok, err := NormalizeGitRemoteUrl(repoPath)
		require.NoError(t, err)
		require.True(t, ok)
		require.Equal(t, "git+file://"+repoPath+defaultRefQuery, got)
	})

	t.Run("non .git url not recognized", func(t *testing.T) {
		got, ok, err := NormalizeGitRemoteUrl("https://example.com/not-git")
		require.NoError(t, err)
		require.False(t, ok)
		require.Empty(t, got)
	})
}

View File

@@ -643,6 +643,12 @@ func NewPullSpec[C doltdb.Context](
}
func GetAbsRemoteUrl(fs filesys2.Filesys, cfg config.ReadableConfig, urlArg string) (string, string, error) {
if normalized, ok, nerr := NormalizeGitRemoteUrl(urlArg); nerr != nil {
return "", "", nerr
} else if ok {
urlArg = normalized
}
u, err := earl.Parse(urlArg)
if err != nil {
return "", "", err

View File

@@ -82,7 +82,11 @@ func getDirectoryAndUrlString(apr *argparser.ArgParseResults) (string, string, e
urlStr := apr.Arg(0)
_, err := earl.Parse(urlStr)
if err != nil {
return "", "", errhand.BuildDError("error: invalid remote url: %s", urlStr).Build()
if normalized, ok, nerr := env.NormalizeGitRemoteUrl(urlStr); nerr == nil && ok {
urlStr = normalized
} else {
return "", "", errhand.BuildDError("error: invalid remote url: %s", urlStr).Build()
}
}
var dir string