diff --git a/.github/scripts/merge-perf/data.py b/.github/scripts/merge-perf/data.py index a44b60b75b..31ff7a83ac 100644 --- a/.github/scripts/merge-perf/data.py +++ b/.github/scripts/merge-perf/data.py @@ -3,8 +3,8 @@ import shutil import sys import random -if len(sys.argv) != 5: - print("usage: python3 data.py ") +if len(sys.argv) != 7: + print("usage: python3 data.py ") sys.exit(1) table_dir = sys.argv[1] @@ -12,13 +12,17 @@ tables = int(sys.argv[2]) rows = int(sys.argv[3]) adds = int(sys.argv[4]) +deletes = int(sys.argv[5]) +updates = int(sys.argv[6]) if __name__=="__main__": + if deletes + updates > rows: + raise ValueError(f"deletes({deletes}) + updates({updates}) = {updates+deletes} > total rows({rows})") if not os.path.exists(table_dir): shutil.rmtree(table_dir, ignore_errors=True) os.makedirs(table_dir) - ys = [i for i in range(rows+adds)] + ys = [i for i in range(rows+adds+deletes+updates)] random.shuffle(ys) with open(f"{table_dir}/create.sql", "+w") as f: @@ -32,30 +36,74 @@ if __name__=="__main__": for j in range(tables): with open(f"{table_dir}/table{j}.csv", "+w") as f: f.write("x,y,z\n") - for i in range(rows): + for i in range(rows+deletes+updates): f.write(f"{i},{ys[i]},{i}\n") with open(f"{table_dir}/branch.sql", "+w") as f: for i in range(tables): f.write(f"set foreign_key_checks = 0;\n") f.write(f"set unique_checks = 0;\n") - f.write(f"insert into table{i} values\n") - for j,k in enumerate(ys[rows:rows+adds]): - if j == 0: - f.write(f" ") - else: - f.write(f", ") - f.write(f"({rows+j},{k},{rows+j})") - f.write(f";\n") + if adds > 0: + f.write(f"insert into table{i} values\n") + for j,k in enumerate(ys[rows+deletes+updates:rows+deletes+updates+adds]): + if j == 0: + f.write(f" ") + else: + f.write(f", ") + f.write(f"({rows+deletes+updates+j},{k},{rows+deletes+updates+j})") + f.write(f";\n") + + if deletes > 0: + f.write(f"delete from table{i} where x in\n") + for j, y in enumerate(ys[:deletes]): + if j == 0: + f.write(f" (") + else: + 
f.write(f", ") + f.write(f"{y}") + f.write(f");\n") + + if updates > 0: + f.write(f"update table{i} set y=y+1 where x in\n") + for j, y in enumerate(ys[deletes:deletes+updates]): + if j == 0: + f.write(f" (") + else: + f.write(f", ") + f.write(f"{y}") + f.write(f");\n") + with open(f"{table_dir}/diverge_main.sql", "+w") as f: for i in range(tables): f.write(f"set foreign_key_checks = 0;\n") f.write(f"set unique_checks = 0;\n") - f.write(f"insert into table{i} values\n") - for j,k in enumerate(ys[rows:rows+adds]): - if j == 0: - f.write(f" ") - else: - f.write(f", ") - f.write(f"({rows+j},{k+1},{rows+j})") - f.write(f";\n") + if adds > 0: + # y value is one higher, conflict + f.write(f"insert into table{i} values\n") + for j,k in enumerate(ys[rows+deletes+updates:rows+deletes+updates+adds]): + if j == 0: + f.write(f" ") + else: + f.write(f", ") + f.write(f"({rows+deletes+updates+j},{k+1},{rows+deletes+updates+j})") + f.write(f";\n") + + if deletes > 0: + f.write(f"delete from table{i} where y in\n") + for j, y in enumerate(ys[:deletes]): + if j == 0: + f.write(f" (") + else: + f.write(f", ") + f.write(f"{y}") + f.write(f");\n") + + if updates > 0: + f.write(f"update table{i} set y=y+1 where y in\n") + for j, y in enumerate(ys[deletes:deletes+updates]): + if j == 0: + f.write(f" (") + else: + f.write(f", ") + f.write(f"{y}") + f.write(f");\n") diff --git a/.github/scripts/merge-perf/setup.sh b/.github/scripts/merge-perf/setup.sh index 518bbf26ae..943b3e3ffd 100755 --- a/.github/scripts/merge-perf/setup.sh +++ b/.github/scripts/merge-perf/setup.sh @@ -36,3 +36,4 @@ dolt sql < $DATA/branch.sql dolt commit -Am "new branch" +dolt merge --no-edit main diff --git a/.github/workflows/merge-perf.yaml b/.github/workflows/merge-perf.yaml index a1e4268dbe..f830b1e681 100644 --- a/.github/workflows/merge-perf.yaml +++ b/.github/workflows/merge-perf.yaml @@ -40,29 +40,41 @@ jobs: working-directory: ./go run: go install ./cmd/dolt - - name: Run bench - id: bench + - name: Config 
dolt + id: config run: | dolt config --global --add user.email "merge-perf@dolthub.com" dolt config --global --add user.name "merge-perf" + - name: Run bench + id: bench + run: | gw=$GITHUB_WORKSPACE DATADIR=$gw/data - TABLE_NUM=2 - ROW_NUM=1000000 - ADD_NUM=60000 - python ${{ env.SCRIPT_DIR }}/data.py $DATADIR $TABLE_NUM $ROW_NUM $ADD_NUM - - TMPDIR=$gw/tmp - ./${{ env.SCRIPT_DIR}}/setup.sh $TMPDIR $DATADIR - - TIMES=$gw/time.log - cd $TMPDIR - latency=$(python3 -c "import time, subprocess; start = time.time(); res=subprocess.run(['dolt', 'merge', 'main'], capture_output=True); output = res.stdout + res.stderr if res.returncode != 0 else time.time() -start; print(output); exit(res.returncode)") RESULTS=$gw/results.sql - echo "CREATE TABLE ${{env.RESULT_TABLE_NAME }} (name varchar(50) primary key, table_cnt int, run_cnt int, add_cnt int, conflict_cnt int, fks bool, latency float);" >> $RESULTS - echo "INSERT INTO ${{ env.RESULT_TABLE_NAME }} values ('1m rows, 100k conflicts', 2, $ROW_NUM, $ADD_NUM, $ADD_NUM, true, $latency);" >> $RESULTS + echo "CREATE TABLE ${{env.RESULT_TABLE_NAME }} (name varchar(50) primary key, table_cnt int, run_cnt int, add_cnt int, delete_cnt int, update_cnt int, conflict_cnt int, fks bool, latency float);" >> $RESULTS + + TABLE_NUM=2 + names=('adds_only' 'deletes_only' 'updates_only' 'adds_updates_deletes') + adds=(60000 0 0 60000) + deletes=(0 60000 0 60000) + updates=(0 0 60000 60000) + + for i in {0..3}; do + echo "${names[$i]}, ${adds[$i]}, ${deletes[$i]}, ${updates[$i]}" + python ${{ env.SCRIPT_DIR }}/data.py $DATADIR $TABLE_NUM $ROW_NUM ${adds[$i]} ${deletes[$i]} ${updates[$i]} + + TMPDIR=$gw/tmp + ./${{ env.SCRIPT_DIR}}/setup.sh $TMPDIR $DATADIR + + cd $TMPDIR + latency=$(python3 -c "import time, subprocess; start = time.time(); res=subprocess.run(['dolt', 'merge', '--no-edit', 'main'], capture_output=True); output = res.stdout + res.stderr if res.returncode != 0 else time.time() -start; print(output); exit(res.returncode)") + + 
conflicts=$(dolt sql -r csv -q "select count(*) from dolt_conflicts_table0;" | tail -1) + + echo "INSERT INTO ${{ env.RESULT_TABLE_NAME }} values ('${names[$i]}', $TABLE_NUM, $ROW_NUM, ${adds[$i]}, ${deletes[$i]}, ${updates[$i]}, $conflicts, true, $latency);" >> $RESULTS + done echo "::set-output name=result_path::$RESULTS" @@ -71,7 +83,7 @@ jobs: run: | gw=$GITHUB_WORKSPACE in="${{ steps.bench.outputs.result_path }}" - query="select name, round(latency, 2) as latency from ${{ env.RESULT_TABLE_NAME }}" + query="select name, add_cnt, delete_cnt, update_cnt, round(latency, 2) as latency from ${{ env.RESULT_TABLE_NAME }}" summaryq="select round(avg(latency), 2) as avg from ${{ env.RESULT_TABLE_NAME }}" out="$gw/results.csv"