From a4e28fa5b57407cbaf9d89b08f28a7dcae2c4260 Mon Sep 17 00:00:00 2001 From: Vinai Rachakonda Date: Tue, 14 Jun 2022 13:28:23 -0400 Subject: [PATCH] Support the bulk edit accumulator for `dolt table import` (#3591) --- go/cmd/dolt/commands/engine/sqlengine.go | 3 ++- go/cmd/dolt/commands/engine/utils.go | 12 ++++++++---- .../doltcore/mvdata/engine_table_writer.go | 8 ++------ go/performance/import_tester/run_importer.sh | 14 ++++++++------ 4 files changed, 20 insertions(+), 17 deletions(-) diff --git a/go/cmd/dolt/commands/engine/sqlengine.go b/go/cmd/dolt/commands/engine/sqlengine.go index ce0257583a..a4bd8a9dd9 100644 --- a/go/cmd/dolt/commands/engine/sqlengine.go +++ b/go/cmd/dolt/commands/engine/sqlengine.go @@ -54,6 +54,7 @@ type SqlEngineConfig struct { ServerUser string ServerPass string Autocommit bool + Bulk bool } // NewSqlEngine returns a SqlEngine @@ -66,7 +67,7 @@ func NewSqlEngine( parallelism := runtime.GOMAXPROCS(0) - dbs, err := CollectDBs(ctx, mrEnv) + dbs, err := CollectDBs(ctx, mrEnv, config.Bulk) if err != nil { return nil, err } diff --git a/go/cmd/dolt/commands/engine/utils.go b/go/cmd/dolt/commands/engine/utils.go index 23da7994a1..a5f08a52ad 100644 --- a/go/cmd/dolt/commands/engine/utils.go +++ b/go/cmd/dolt/commands/engine/utils.go @@ -30,7 +30,7 @@ import ( // CollectDBs takes a MultiRepoEnv and creates Database objects from each environment and returns a slice of these // objects. 
-func CollectDBs(ctx context.Context, mrEnv *env.MultiRepoEnv) ([]sqle.SqlDatabase, error) { +func CollectDBs(ctx context.Context, mrEnv *env.MultiRepoEnv, useBulkEditor bool) ([]sqle.SqlDatabase, error) { var dbs []sqle.SqlDatabase var db sqle.SqlDatabase @@ -41,7 +41,7 @@ func CollectDBs(ctx context.Context, mrEnv *env.MultiRepoEnv) ([]sqle.SqlDatabas } dEnv.DoltDB.SetCommitHooks(ctx, postCommitHooks) - db = newDatabase(name, dEnv) + db = newDatabase(name, dEnv, useBulkEditor) if _, remote, ok := sql.SystemVariables.GetGlobal(sqle.ReadReplicaRemoteKey); ok && remote != "" { remoteName, ok := remote.(string) @@ -84,9 +84,13 @@ func GetCommitHooks(ctx context.Context, dEnv *env.DoltEnv) ([]doltdb.CommitHook return postCommitHooks, nil } -func newDatabase(name string, dEnv *env.DoltEnv) sqle.Database { +func newDatabase(name string, dEnv *env.DoltEnv, useBulkEditor bool) sqle.Database { + deaf := dEnv.DbEaFactory() + if useBulkEditor { + deaf = dEnv.BulkDbEaFactory() + } opts := editor.Options{ - Deaf: dEnv.DbEaFactory(), + Deaf: deaf, Tempdir: dEnv.TempTableFilesDir(), } return sqle.NewDatabase(name, dEnv.DbData(), opts) diff --git a/go/libraries/doltcore/mvdata/engine_table_writer.go b/go/libraries/doltcore/mvdata/engine_table_writer.go index cba15172d3..d297aea9b3 100644 --- a/go/libraries/doltcore/mvdata/engine_table_writer.go +++ b/go/libraries/doltcore/mvdata/engine_table_writer.go @@ -84,7 +84,8 @@ func NewSqlEngineTableWriter(ctx context.Context, dEnv *env.DoltEnv, createTable PrivFilePath: "", ServerUser: "root", ServerPass: "", - Autocommit: true, + Autocommit: false, // We set autocommit == false to improve performance. Bulk import should not commit on each row. 
+ Bulk: true, } se, err := engine.NewSqlEngine( ctx, @@ -107,11 +108,6 @@ func NewSqlEngineTableWriter(ctx context.Context, dEnv *env.DoltEnv, createTable dsess.DSessFromSess(sqlCtx.Session).EnableBatchedMode() - err = sqlCtx.Session.SetSessionVariable(sqlCtx, sql.AutoCommitSessionVar, false) - if err != nil { - return nil, err - } - doltCreateTableSchema, err := sqlutil.FromDoltSchema(options.TableToWriteTo, createTableSchema) if err != nil { return nil, err diff --git a/go/performance/import_tester/run_importer.sh b/go/performance/import_tester/run_importer.sh index f82362279c..613eec5540 100755 --- a/go/performance/import_tester/run_importer.sh +++ b/go/performance/import_tester/run_importer.sh @@ -27,22 +27,24 @@ python3 csv_gen.py '{ {"name":"c3", "type":"float"}, {"name":"c4", "type":"int"} ], - "row_count": 1000000 + "row_count": 10000000 }' > benchmark.csv -# Run the current version of dolt TODO: Assumes no storage version changes... Change if there is +# Run the current version of dolt echo "Running the current version of import" rm -rf .dolt dolt init time dolt table import -c --pk=pk current_version benchmark.csv +# Run the current version of export +echo "Running the current version of export" +time dolt table export -f current_version export.csv + # Run the old version of dolt +rm -rf .dolt +./old-dolt init echo "Running version 0.34.5" time ./old-dolt table import -c --pk=pk old_version benchmark.csv -# Run the current version of export -echo "Running the current version of export" -time dolt table export current_version export.csv - # Run the old version of export time ./old-dolt table export -f old_version export.csv