Merge pull request #5899 from dolthub/aaron/dolt_gc-unavailable-on-standby-replica

go: sqle: dolt_gc.go: Disable CALL dolt_gc() on a standby replica, where it is not currently safe. Shallow GC is still available.
This commit is contained in:
Aaron Son
2023-05-10 11:53:02 -07:00
committed by GitHub
4 changed files with 128 additions and 15 deletions
@@ -95,8 +95,6 @@ type procedurestore interface {
}
const (
DoltClusterRoleVariable = "dolt_cluster_role"
DoltClusterRoleEpochVariable = "dolt_cluster_role_epoch"
// Since we fetch the keys from the other replicas we're going to use a fixed string here.
DoltClusterRemoteApiAudience = "dolt-cluster-remote-api.dolthub.com"
)
@@ -285,17 +283,17 @@ func (c *Controller) refreshSystemVars() {
role, epoch := string(c.role), c.epoch
vars := []sql.SystemVariable{
{
Name: DoltClusterRoleVariable,
Name: dsess.DoltClusterRoleVariable,
Dynamic: false,
Scope: sql.SystemVariableScope_Persist,
Type: gmstypes.NewSystemStringType(DoltClusterRoleVariable),
Type: gmstypes.NewSystemStringType(dsess.DoltClusterRoleVariable),
Default: role,
},
{
Name: DoltClusterRoleEpochVariable,
Name: dsess.DoltClusterRoleEpochVariable,
Dynamic: false,
Scope: sql.SystemVariableScope_Persist,
Type: gmstypes.NewSystemIntType(DoltClusterRoleEpochVariable, 0, 9223372036854775807, false),
Type: gmstypes.NewSystemIntType(dsess.DoltClusterRoleEpochVariable, 0, 9223372036854775807, false),
Default: epoch,
},
}
@@ -304,16 +302,16 @@ func (c *Controller) refreshSystemVars() {
func (c *Controller) persistVariables() error {
toset := make(map[string]string)
toset[DoltClusterRoleVariable] = string(c.role)
toset[DoltClusterRoleEpochVariable] = strconv.Itoa(c.epoch)
toset[dsess.DoltClusterRoleVariable] = string(c.role)
toset[dsess.DoltClusterRoleEpochVariable] = strconv.Itoa(c.epoch)
return c.persistentCfg.SetStrings(toset)
}
func applyBootstrapClusterConfig(lgr *logrus.Logger, cfg Config, pCfg config.ReadWriteConfig) (Role, int, error) {
toset := make(map[string]string)
persistentRole := pCfg.GetStringOrDefault(DoltClusterRoleVariable, "")
persistentRole := pCfg.GetStringOrDefault(dsess.DoltClusterRoleVariable, "")
var roleFromPersistentConfig bool
persistentEpoch := pCfg.GetStringOrDefault(DoltClusterRoleEpochVariable, "")
persistentEpoch := pCfg.GetStringOrDefault(dsess.DoltClusterRoleEpochVariable, "")
if persistentRole == "" {
if cfg.BootstrapRole() != "" {
lgr.Tracef("cluster/controller: persisted cluster role was empty, apply bootstrap_role %s", cfg.BootstrapRole())
@@ -322,7 +320,7 @@ func applyBootstrapClusterConfig(lgr *logrus.Logger, cfg Config, pCfg config.Rea
lgr.Trace("cluster/controller: persisted cluster role was empty, bootstrap_role was empty: defaulted to primary")
persistentRole = "primary"
}
toset[DoltClusterRoleVariable] = persistentRole
toset[dsess.DoltClusterRoleVariable] = persistentRole
} else {
roleFromPersistentConfig = true
lgr.Tracef("cluster/controller: persisted cluster role is %s", persistentRole)
@@ -330,19 +328,19 @@ func applyBootstrapClusterConfig(lgr *logrus.Logger, cfg Config, pCfg config.Rea
if persistentEpoch == "" {
persistentEpoch = strconv.Itoa(cfg.BootstrapEpoch())
lgr.Tracef("cluster/controller: persisted cluster role epoch is empty, took boostrap_epoch: %s", persistentEpoch)
toset[DoltClusterRoleEpochVariable] = persistentEpoch
toset[dsess.DoltClusterRoleEpochVariable] = persistentEpoch
} else {
lgr.Tracef("cluster/controller: persisted cluster role epoch is %s", persistentEpoch)
}
if persistentRole != string(RolePrimary) && persistentRole != string(RoleStandby) {
isallowed := persistentRole == string(RoleDetectedBrokenConfig) && roleFromPersistentConfig
if !isallowed {
return "", 0, fmt.Errorf("persisted role %s.%s = %s must be \"primary\" or \"secondary\"", PersistentConfigPrefix, DoltClusterRoleVariable, persistentRole)
return "", 0, fmt.Errorf("persisted role %s.%s = %s must be \"primary\" or \"secondary\"", PersistentConfigPrefix, dsess.DoltClusterRoleVariable, persistentRole)
}
}
epochi, err := strconv.Atoi(persistentEpoch)
if err != nil {
return "", 0, fmt.Errorf("persisted role epoch %s.%s = %s must be an integer", PersistentConfigPrefix, DoltClusterRoleEpochVariable, persistentEpoch)
return "", 0, fmt.Errorf("persisted role epoch %s.%s = %s must be an integer", PersistentConfigPrefix, dsess.DoltClusterRoleEpochVariable, persistentEpoch)
}
if len(toset) > 0 {
err := pCfg.SetStrings(toset)
@@ -86,10 +86,47 @@ func doDoltGC(ctx *sql.Context, args []string) (int, error) {
return cmdFailure, err
}
} else {
// Currently, if this server is involved in cluster
// replication, a full GC is only safe to run on the primary.
// We assert that we are the primary here before we begin, and
// we assert again that we are the primary at the same epoch as
// we establish the safepoint.
origepoch := -1
if _, role, ok := sql.SystemVariables.GetGlobal(dsess.DoltClusterRoleVariable); ok {
// TODO: magic constant...
if role.(string) != "primary" {
return cmdFailure, fmt.Errorf("cannot run a full dolt_gc() while cluster replication is enabled and role is %s; must be the primary", role.(string))
}
_, epoch, ok := sql.SystemVariables.GetGlobal(dsess.DoltClusterRoleEpochVariable)
if !ok {
return cmdFailure, fmt.Errorf("internal error: cannot run a full dolt_gc(); cluster replication is enabled but could not read %s", dsess.DoltClusterRoleEpochVariable)
}
origepoch = epoch.(int)
}
// TODO: If we got a callback at the beginning and an
// (allowed-to-block) callback at the end, we could more
// gracefully tear things down.
err = ddb.GC(ctx, func() error {
if origepoch != -1 {
// Here we need to sanity check role and epoch.
if _, role, ok := sql.SystemVariables.GetGlobal(dsess.DoltClusterRoleVariable); ok {
if role.(string) != "primary" {
return fmt.Errorf("dolt_gc failed: when we began we were a primary in a cluster, but now our role is %s", role.(string))
}
_, epoch, ok := sql.SystemVariables.GetGlobal(dsess.DoltClusterRoleEpochVariable)
if !ok {
return fmt.Errorf("dolt_gc failed: when we began we were a primary in a cluster, but we can no longer read the cluster role epoch.")
}
if origepoch != epoch.(int) {
return fmt.Errorf("dolt_gc failed: when we began we were primary in the cluster at epoch %d, but now we are at epoch %d. for gc to safely finalize, our role and epoch must not change throughout the gc.", origepoch, epoch.(int))
}
} else {
return fmt.Errorf("dolt_gc failed: when we began we were a primary in a cluster, but we can no longer read the cluster role.")
}
}
killed := make(map[uint32]struct{})
processes := ctx.ProcessList.Processes()
for _, p := range processes {
@@ -50,6 +50,9 @@ const (
AwsCredsProfile = "aws_credentials_profile"
AwsCredsRegion = "aws_credentials_region"
ShowBranchDatabases = "dolt_show_branch_databases"
DoltClusterRoleVariable = "dolt_cluster_role"
DoltClusterRoleEpochVariable = "dolt_cluster_role_epoch"
)
const URLTemplateDatabasePlaceholder = "{database}"
@@ -1022,4 +1022,79 @@ tests:
- on: server1
queries:
- exec: 'use repo1'
- exec: 'call dolt_checkout("new_branch_name")'
- exec: 'call dolt_checkout("new_branch_name")'
- name: call dolt gc
multi_repos:
- name: server1
with_files:
- name: server.yaml
contents: |
log_level: trace
listener:
host: 0.0.0.0
port: 3309
cluster:
standby_remotes:
- name: standby
remote_url_template: http://localhost:3852/{database}
bootstrap_role: primary
bootstrap_epoch: 1
remotesapi:
port: 3851
server:
args: ["--config", "server.yaml"]
port: 3309
- name: server2
with_files:
- name: server.yaml
contents: |
log_level: trace
listener:
host: 0.0.0.0
port: 3310
cluster:
standby_remotes:
- name: standby
remote_url_template: http://localhost:3851/{database}
bootstrap_role: standby
bootstrap_epoch: 1
remotesapi:
port: 3852
server:
args: ["--config", "server.yaml"]
port: 3310
connections:
- on: server1
queries:
- exec: 'create database repo1'
- exec: 'use repo1'
- exec: 'create table vals (id int primary key, val int)'
- exec: 'insert into vals values (1,1)'
- exec: 'insert into vals values (2,2)'
- exec: 'insert into vals values (3,3)'
- exec: 'insert into vals values (4,4)'
- exec: 'call dolt_gc()'
- exec: 'select * from vals'
error_match: "this connection can no longer be used"
- on: server1
queries:
- query: "select `database`, standby_remote, role, epoch, replication_lag_millis, current_error from dolt_cluster.dolt_cluster_status order by `database` asc"
result:
columns: ["database","standby_remote","role","epoch","replication_lag_millis","current_error"]
rows:
- ["repo1","standby","primary","1","0","NULL"]
retry_attempts: 100
- on: server2
queries:
- exec: 'use repo1'
- query: "select * from vals order by id asc"
result:
columns: ["id","val"]
rows:
- [1,1]
- [2,2]
- [3,3]
- [4,4]
- exec: 'call dolt_gc()'
error_match: "must be the primary"
- exec: 'call dolt_gc("--shallow")'