#!/usr/bin/perl -w
###################################################################################
#
#
# benchmark.pl - Dolt benchmarking script
#
#
# Description: Benchmarks Dolt against Git by timing common operations on
#              generated CSV data.
# Author: Tim Sehn
# Date: March, 2019
#
###################################################################################
use strict;
use Data::Dumper;
use Getopt::Long;
use List::Util qw(shuffle);
use Pod::Usage;
# These are defaults and can be overridden with command line args.
use constant BENCHMARK_ROOT => '/var/tmp';
use constant DOLT_PATH => '~/go/bin/';
use constant LOG_LEVEL => 1; # 0 = quiet, 1 = status, 2 = verbose
use constant UNSAFE => 0;
use constant PRESERVE_INPUTS => 0;
###################################################################################
#
# Configuration
#
###################################################################################
# Ideally, this configuration would live in a Dolt repository. We would pull that
# repo down, read all of this information from it, and tag the benchmark output
# with the configuration version identifier.
# Define the benchmarks we will run.
my $benchmark_config = {
# Version the configuration to store with the output
version => '0.0.1',
# Define the schema and size of the test database.
# This creates a set of CSV files and a Dolt schema file that are used in the
# benchmark tests. The gen field is either 'increment' or 'rand'. Supported
# types are 'int' and 'string'.
seed => {
name => 'test.csv',
size => 1000000,
schema_file => 'test.schema',
schema => [
{
name => 'id',
type => 'int',
primary => 1,
gen => 'increment',
},
{
name => 'int1',
type => 'int',
primary => 0,
gen => 'rand',
size => 10,
},
{
name => 'int2',
type => 'int',
primary => 0,
gen => 'rand',
size => 100,
},
{
name => 'int3',
type => 'int',
primary => 0,
gen => 'rand',
size => 1000,
},
{
name => 'int4',
type => 'int',
primary => 0,
gen => 'rand',
size => 10000,
},
{
name => 'int5',
type => 'int',
primary => 0,
gen => 'rand',
size => 100000,
},
{
name => 'string1',
type => 'string',
primary => 0,
gen => 'rand',
size => 1,
},
{
name => 'string2',
type => 'string',
primary => 0,
gen => 'rand',
size => 2,
},
{
name => 'string3',
type => 'string',
primary => 0,
gen => 'rand',
size => 4,
},
{
name => 'string4',
type => 'string',
primary => 0,
gen => 'rand',
size => 8,
},
{
name => 'string5',
type => 'string',
primary => 0,
gen => 'rand',
size => 16,
},
],
},
# This configuration defines the CSV files we'll create to represent a small,
# medium, and large change. The pct value is the probability that any given
# column value is regenerated, so it roughly equals the fraction of cells that
# differ from the seed CSV.
changes => [
{
file => 'small-change.csv',
pct => 0.001,
},
{
file => 'medium-change.csv',
pct => 0.01,
},
{
file => 'large-change.csv',
pct => 0.05,
},
],
benchmarks => {
git => {
root => 'git-benchmark',
tests => [
{
name => 'raw',
command => 'git',
},
{
name => 'init',
command => 'git init',
},
{
prep => ['cp ../test.csv .'],
name => 'add',
command => 'git add test.csv',
},
{
name => 'commit',
command => 'git commit -m "first test commit"',
check_disk => 1,
},
{
prep => ['cp ../small-change.csv ./test.csv'],
name => 'small diff',
command => 'git diff test.csv',
post => ['git add test.csv', 'git commit -m "Small change"'],
check_disk => 1,
},
{
prep => ['cp ../medium-change.csv ./test.csv'],
name => 'medium diff',
command => 'git diff test.csv',
post => ['git add test.csv', 'git commit -m "Medium change"'],
check_disk => 1,
},
{
prep => ['cp ../large-change.csv ./test.csv'],
name => 'large diff',
command => 'git diff test.csv',
post => ['git add test.csv', 'git commit -m "Large change"'],
check_disk => 1,
}
]
},
dolt => {
root => 'dolt-benchmark',
tests => [
{
name => 'raw',
command => 'dolt',
},
{
name => 'init',
command => 'dolt init',
},
{
prep => [
'dolt table create -s ../test.schema test',
'dolt table import -u test ../test.csv',
],
name => 'add',
command => 'dolt add test',
},
{
name => 'commit',
command => 'dolt commit -m "first test commit"',
check_disk => 1,
},
{
prep => [
'dolt table rm test',
'dolt table import -c -s ../test.schema test ../small-change.csv'
],
name => 'small diff',
command => 'dolt diff test',
post => ['dolt add test', 'dolt commit -m "small change"'],
check_disk => 1,
},
{
prep => [
'dolt table rm test',
'dolt table import -c -s ../test.schema test ../medium-change.csv'
],
name => 'medium diff',
command => 'dolt diff test',
post => ['dolt add test', 'dolt commit -m "medium change"'],
check_disk => 1,
},
{
prep => [
'dolt table rm test',
'dolt table import -c -s ../test.schema test ../large-change.csv'
],
name => 'large diff',
command => 'dolt diff test',
post => ['dolt add test', 'dolt commit -m "large change"'],
check_disk => 1,
}
]
}
}
};
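# To time an additional command, append an entry to the relevant 'tests' list
# above, for example (illustrative only):
#
#   {
#       name => 'log',
#       command => 'dolt log',
#   },
#
# Entries run in order; 'prep' and 'post' commands are executed but not timed,
# and check_disk => 1 records the repository's disk usage after the test's
# post commands run.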
my $publish_config = {
repo_root => '/Users/timsehn/liquidata/dolt-repos/dolt-benchmark',
table => 'results'
};
###################################################################################
#
# Execute the Benchmark
#
###################################################################################
# Process command line arguments
my $root = BENCHMARK_ROOT;
my $log_level = LOG_LEVEL;
my $unsafe = UNSAFE;
my $preserve = PRESERVE_INPUTS;
my $dolt_path = DOLT_PATH;
my $publish = 0;
my $publish_repo = '';
my $help = 0;
my $man = 0;
GetOptions("root=s" => \$root,
"loglevel=i" => \$log_level,
"preserve" => \$preserve,
"unsafe" => \$unsafe,
"dolt-path=s" => \$dolt_path,
"publish" => \$publish,
"publish-repo=s" => \$publish_repo,
'help|?' => \$help,
'man' => \$man) or pod2usage(2);
pod2usage(1) if $help;
pod2usage(-exitval => 0, -verbose => 2) if $man;
if ( $publish_repo ) {
die("Cannot specify --results-repo unless --publish is specified")
unless $publish;
$publish_config->{'repo_root'} = $publish_repo;
}
# Set up the environment
$ENV{'PATH'} = "$ENV{PATH}:$dolt_path";
$ENV{'NOMS_VERSION_NEXT'} = 1;
# Make sure root exists
if ( -d $root ) {
output("Changing directory to $root", 2);
chdir($root) or die("Could not cd to $root\n");
} else {
die("Could not run benchmarks in $root. Directory does not exist.\n");
}
# Build input files
my $test_csv = $benchmark_config->{'seed'}{'name'};
my $schema_file = $benchmark_config->{'seed'}{'schema_file'};
my $schema = $benchmark_config->{'seed'}{'schema'};
my $rows = $benchmark_config->{'seed'}{'size'};
my $columns = scalar(@{$schema});
my $changes = $benchmark_config->{'changes'};
output("Building input files...$rows rows, $columns columns", 1);
generate_dolt_schema($schema_file, $schema);
create_test_input_csvs($test_csv, $rows, $schema, $changes);
# Gather system information to include in the output.
my $profile = {};
gather_profile_info($profile);
# Run the benchmarks
my %data;
foreach my $benchmark ( keys %{$benchmark_config->{'benchmarks'}} ) {
output("Executing $benchmark benchmark...", 1);
# Build the root directory for the repository
my $benchmarks = $benchmark_config->{'benchmarks'};
my $benchmark_root = $benchmarks->{$benchmark}{'root'};
if ( -d $benchmark_root ) {
if ( $unsafe ) {
output("Deleting $root/$benchmark_root because it alreadys exists", 2);
run_command("rm -rf $benchmark_root");
} else {
error_exit("$root/$benchmark_root must not exist to run benchmark");
}
}
output("Changing directory to $benchmark_root\n", 2);
mkdir($benchmark_root) or error_exit("Could not mkdir $benchmark_root");
chdir($benchmark_root) or error_exit("Could not cd to $benchmark_root");
# Run and time the commands in the root directory
foreach my $test ( @{$benchmarks->{$benchmark}{'tests'}} ) {
output("Running test: " . $test->{'name'}, 1);
foreach my $prep ( @{$test->{'prep'}} ) {
run_command($prep);
}
my ($real, $user, $system) = time_command($test->{'command'}, $log_level);
$data{$test->{'name'}}{$benchmark}{'real'} = $real;
$data{$test->{'name'}}{$benchmark}{'user'} = $user;
$data{$test->{'name'}}{$benchmark}{'system'} = $system;
foreach my $post ( @{$test->{'post'}} ) {
run_command($post);
}
if ( $test->{'check_disk'} ) {
$data{$test->{'name'}}{$benchmark}{'disk'} = disk_usage();
}
}
# Cleanup the repository
output("Changing directory to $root and removing $benchmark_root", 2);
chdir($root);
run_command("rm -rf $benchmark_root") unless $preserve;
}
# Cleanup the input files.
output("Cleaning up...", 1);
cleanup($root, $benchmark_config, $preserve, $unsafe);
# Output
publish($publish_config, \%data, $profile, $benchmark_config, $root) if $publish;
output_data(\%data, $benchmark_config->{'benchmarks'}, $log_level);
exit 0;
###################################################################################
#
# Functions
#
###################################################################################
# System utility functions
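# time_command: runs a command under the shell's time keyword and returns its
# (real, user, sys) times in milliseconds.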
sub time_command {
my $command = shift;
my $log_level = shift;
output("Running:\n\t$command", 2);
# The shell's time builtin writes its report to STDERR, so redirect STDERR to
# STDOUT (which backticks capture) and, unless we're verbose, discard the
# command's own STDOUT.
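# Note: this assumes the shell that backticks spawn supports the `time` keyword
# on a command group and prints "real"/"user"/"sys" lines (bash does; a strictly
# POSIX sh may not).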
my $piped_command;
if ( $log_level > 1 ) {
$piped_command = "{ time $command ;} 2>&1";
} else {
$piped_command = "{ time $command ;} 2>&1 1>/dev/null";
}
my $output = `$piped_command`;
# TO DO: Some of these commands are expected to exit non-zero (e.g. bare `git`
# and `dolt` with no arguments), so we cannot blindly die on $? here. Build an
# expected exit status into the benchmark definitions.
# if ($?) {
#     die "Error running: $piped_command\n";
# }
output("Output:\n\t$output", 2) if $output;
unless ( $output =~ /real\s+(.+)\nuser\s+(.+)\nsys\s+(.+)\n/ ) {
error_exit("Could not parse time output for: $command");
}
my $real = convert_time_output_to_ms($1);
my $user = convert_time_output_to_ms($2);
my $system = convert_time_output_to_ms($3);
return ($real, $user, $system);
}
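# disk_usage: returns the human-readable size of the current directory as
# reported by `du -h -d 0` (e.g. "12M").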
sub disk_usage {
output("Checking disk usage...", 2);
my $command = 'du -h -d 0';
output("Running $command", 2);
my $output = `$command`;
output("Output:\n\t$output", 2) if $output;
$output =~ /^\s*([\d\w\.]+)\s+\./;
return $1;
}
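# run_command: runs a shell command, logging its combined output, and aborts the
# benchmark (with cleanup) on a non-zero exit status.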
sub run_command {
my $command = shift;
output("Running:\n\t$command", 2);
my $output = `$command 2>&1`;
output("Output:\n\t$output", 2) if $output;
if ($?) {
error_exit("Error running: $command");
}
}
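# convert_time_output_to_ms: converts a time(1)-style value such as "1m2.345s"
# into integer milliseconds (e.g. "1m2.345s" -> 62345). Assumes exactly three
# fractional digits, which is what bash's time keyword prints.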
sub convert_time_output_to_ms {
my $time = shift;
$time =~ /(\d+)m(\d+)\.(\d+)s/;
my $minutes = $1;
my $seconds = $2;
my $ms = $3;
return $ms + ($seconds*1000) + ($minutes*60*1000);
}
# CSV Creation functions
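# create_test_input_csvs: writes the seed CSV plus one modified copy per entry
# in the changes config. Each modified copy is row-shuffled and has roughly pct
# of its cell values regenerated.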
sub create_test_input_csvs {
my $csv = shift;
my $size = shift;
my $schema = shift;
my $changes = shift;
my @all_filehandles;
open(CSV, ">", $csv) or error_exit("Could not open $csv: $!\n");
push @all_filehandles, *CSV;
foreach my $change ( @{$changes} ){
open($change->{'filehandle'}, '>', $change->{'file'})
or error_exit("Could not open ". $change->{'file'} . ": $!");
push @all_filehandles, $change->{'filehandle'};
}
# Create header row and write it to all csvs
my $first = 1;
foreach my $column ( @{$schema} ) {
write_to_files(',', @all_filehandles) unless $first;
write_to_files($column->{'name'}, @all_filehandles);
$first = 0;
}
write_to_files("\n", @all_filehandles);;
# Create mock data
# Create an array with the data and write the original CSV
my @values;
foreach ( my $i = 0; $i < $size; $i++ ) {
$first = 1;
$values[$i] = [];
foreach my $column ( @{$schema} ) {
print CSV ',' unless $first;
$first = 0;
my $value = generate_value($column->{'type'},
$column->{'gen'},
$column->{'size'},
$i);
print CSV $value;
push @{$values[$i]}, $value;
}
print CSV "\n";
}
# Shuffle the rows and change the values
foreach my $change ( @{$changes} ) {
my $fh = $change->{'filehandle'};
my @shuffle = shuffle(@values);
foreach my $row ( @shuffle ) {
my $first = 1;
my $i = 0;
foreach my $column ( @{$schema} ) {
my $value = $row->[$i];
print $fh ',' unless $first;
$first = 0;
if ( rand() < $change->{'pct'} ) {
$value = generate_value($column->{'type'},
$column->{'gen'},
$column->{'size'},
$row->[0]);
}
print $fh $value;
$i++;
}
print $fh "\n";
}
}
foreach my $fh (@all_filehandles) {
close $fh;
}
}
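# generate_value: produces a single cell value. For ints, 'increment' returns
# the row index and 'rand' returns an integer in [0, size]; for strings, 'rand'
# returns a random alphanumeric string of length size.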
sub generate_value {
my $type = shift;
my $gen = shift;
my $size = shift;
my $i = shift; # Used for increment
if ( $type eq 'int' ) {
return $i if ( $gen eq 'increment' );
if ( $gen eq 'rand' ) {
return int(rand($size+1));
} else {
error_exit("Do not understand generator: $gen");
}
} elsif ( $type eq 'string' ) {
if ( $gen eq 'rand' ) {
return rndStr($size, 'a'..'z', 0..9);
} else {
error_exit("Do not understand generator: $gen");
}
} else {
error_exit("Do not understand type: $type");
}
}
sub write_to_files {
my $string = shift;
my @filehandles = @_;
foreach my $filehandle ( @filehandles ) {
print $filehandle $string;
}
}
# Perl wizardry. Do not question.
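# rndStr(N, @chars) returns a string of N characters drawn at random (with
# replacement) from @chars, e.g. rndStr(4, 'a'..'z', 0..9).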
sub rndStr {
join('', @_[ map{ rand @_ } 1 .. shift ]);
}
# Gather Profile information
sub gather_profile_info {
my $profile = shift;
output('Gathering profile information...', 1);
my $uname_cmd = 'uname -a';
output("Running $uname_cmd", 2);
# TO DO: Turn this into structured data
$profile->{'uname'} = `$uname_cmd`;
$profile->{'uname'} =~ s/\n//g;
if ($?) {
error_exit("Error running: $uname_cmd");
}
output("uname is:\n\t$profile->{uname}", 2);
$profile->{'now'} = time();
$profile->{'git_version'} = `git version`;
$profile->{'git_version'} =~ s/\n//g;
$profile->{'dolt_version'} = `dolt version`;
$profile->{'dolt_version'} =~ s/\n//g;
}
# Generate schema
# TO DO: Change these schema generation functions to build the proper Perl
# data structure and use a JSON encoder to emit the schema (see the sketch below).
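# A minimal sketch of that approach, kept commented out. It assumes the core
# JSON::PP module is available and that the target schema shape matches what
# generate_column_schema emits today:
#
#   use JSON::PP;
#   sub generate_dolt_schema_json {
#       my ($schema_file, $schema) = @_;
#       my @columns;
#       my $tag = 0;
#       foreach my $column ( @{$schema} ) {
#           push @columns, {
#               tag             => $tag++,
#               name            => $column->{name},
#               kind            => $column->{type},
#               is_part_of_pk   => $column->{primary} ? JSON::PP::true : JSON::PP::false,
#               col_constraints => $column->{primary}
#                   ? [ { constraint_type => 'not_null', params => undef } ]
#                   : [],
#           };
#       }
#       open(my $fh, '>', $schema_file) or error_exit("Could not open $schema_file: $!");
#       print $fh JSON::PP->new->pretty->encode({ columns => \@columns });
#       close $fh;
#   }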
sub generate_dolt_schema {
my $schema_file = shift;
my $schema = shift;
my $filehandle;
open($filehandle, '>', $schema_file)
or error_exit("Could not open $schema_file");
print $filehandle "{\n\"columns\":[\n";
my $first = 1;
my $tag = 0;
foreach my $column ( @{$schema} ) {
print $filehandle ",\n" unless $first;
$first = 0;
generate_column_schema($column, $tag, $filehandle);
$tag++;
}
print $filehandle "]\n}\n";
}
sub generate_column_schema {
my $col_schema = shift;
my $tag = shift;
my $filehandle = shift;
print $filehandle "{\n\"tag\": $tag,\n";
print $filehandle "\"name\":\"$col_schema->{name}\",\n";
print $filehandle "\"kind\":\"$col_schema->{type}\",\n";
if ( $col_schema->{primary} ) {
print $filehandle "\"is_part_of_pk\": true,\n" .
"\"col_constraints\": [\n{\n\"constraint_type\": \"not_null\",\n" .
"\"params\": null\n}\n]\n";
} else {
print $filehandle "\"is_part_of_pk\": false,\n" .
"\"col_constraints\": []\n";
}
print $filehandle "}";
}
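# cleanup: unless --preserve was given, removes the generated CSVs, the schema
# file, and both benchmark repository roots.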
sub cleanup {
my $root = shift;
my $benchmark_config = shift;
my $preserve = shift;
my $unsafe = shift;
return if $preserve;
chdir($root) or error_exit("Could not cd to $root");
my $seed = $benchmark_config->{'seed'}{'name'};
my $schema = $benchmark_config->{'seed'}{'schema_file'};
my $changes = $benchmark_config->{'changes'};
output("Removing $seed and $schema files", 2);
unlink($seed) if ( -e $seed);
unlink($schema) if ( -e $schema );
foreach my $change ( @{$changes} ) {
output("Removing $change->{file}", 2);
unlink($change->{'file'}) if ( -e $change->{'file'} );
}
output("Removing repository roots",2);
foreach my $benchmark ( keys %{$benchmark_config->{'benchmarks'}} ) {
my $benchmark_root = $benchmark_config->{'benchmarks'}{$benchmark}{'root'};
run_command("rm -rf $benchmark_root");
}
}
# Data
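# output_data: prints per-test wall-clock times for Dolt and Git, plus disk
# usage for the tests that record it.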
sub output_data {
my $data = shift;
my $benchmarks = shift;
my $log_level = shift;
return if ( $log_level == 0 );
print Dumper $data if ( $log_level >= 2 );
print "\n--- Times ---\n";
foreach my $test ( @{$benchmarks->{'dolt'}{'tests'}} ) {
my $test_name = $test->{'name'};
print "$test_name:\n";
print "\tDolt: $data->{$test_name}{dolt}{real}ms\n";
print "\tGit: $data->{$test_name}{'git'}{'real'}ms\n";
}
print "\n--- Disk ---\n";
foreach my $test ( @{$benchmarks->{'dolt'}{'tests'}} ) {
my $test_name = $test->{'name'};
if ( $data->{$test_name}{'dolt'}{'disk'} ) {
print "$test_name:\n";
print "\tDolt: $data->{$test_name}{dolt}{disk}\n";
print "\tGit: $data->{$test_name}{'git'}{'disk'}\n";
}
}
}
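# publish: writes one row per test into the results table of the local Dolt
# results repository via `dolt table put-row`.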
sub publish {
my $publish_config = shift;
my $data = shift;
my $profile = shift;
my $benchmark_config = shift;
my $root = shift;
# Once we have remotes, we'll want to pull the repo down from DoltHub,
# Make our inserts on a new branch, and then push the branch back to DoltHub.
# Then, we can delete the repo or have a keep flag if users want to inspect
# the results.
# We'll assume the output repo is in a schema we understand
my $data_repo_root = $publish_config->{'repo_root'};
my $results_table = $publish_config->{'table'};
output('Publishing results to dolt...', 1);
output("Changing directory to $data_repo_root...", 2);
chdir($data_repo_root) or error_exit("Could not cd to $data_repo_root");
# Make sure this is a valid dolt repo and the results table exists
my $output = `dolt ls`;
error_exit("$data_repo_root does not contain a valid dolt repository") if ($?);
error_exit("$results_table not found in dolt repository in $data_repo_root")
unless ( $output =~ /$results_table/ );
# Insert data into dolt with the following schema:
# uname (pk), now (pk), benchmark version (pk), test name (pk),
# dolt time, git time, dolt disk, git disk
my $uname = $profile->{'uname'};
my $now = $profile->{'now'};
my $git_version = $profile->{'git_version'};
my $dolt_version = $profile->{'dolt_version'};
my $version = $benchmark_config->{version};
foreach my $test ( keys %{$data} ) {
my $dolt_time = $data->{$test}{'dolt'}{'real'};
my $git_time = $data->{$test}{'git'}{'real'};
my $dolt_disk = $data->{$test}{'dolt'}{'disk'} || "";
my $git_disk = $data->{$test}{'git'}{'disk'} || "";
my $dolt_insert = "dolt table put-row $results_table uname:\"$uname\" " .
"test_time:$now git_version:\"$git_version\" " .
"dolt_version:\"$dolt_version\" benchmark_version:\"$version\" " .
"test_name:\"$test\" dolt_time:$dolt_time git_time:$git_time " .
"dolt_disk:\"$dolt_disk\" git_disk:\"$git_disk\"";
run_command($dolt_insert);
}
output("Returning to $root directory...", 2);
chdir($root) or error_exit("Could not cd to $root");
}
# Logging
# 0 = quiet, 1 = status, 2 = verbose
sub output {
my $message = shift;
my $level = shift;
my $now = localtime();
# Take advantage of log level being global
print "$now: $message\n" if ( $level <= $log_level );
}
sub error_exit {
my $message = shift;
print STDERR "$message\n";
print "Exiting early...attempting to cleanup...\n";
# Take advantage that these are global so I don't have to pass them around.
cleanup($root, $benchmark_config, $preserve, $unsafe);
exit 1;
}
__END__
=head1 NAME

benchmark.pl - Performs a Dolt benchmark against Git

=head1 SYNOPSIS

benchmark.pl [options]
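
Example invocation (the options shown are illustrative):

    benchmark.pl -root /var/tmp -loglevel 2 -preserve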

=head1 OPTIONS

=over 8

=item B<-root>

Override the root directory to perform the benchmark in. Defaults to /var/tmp.

=item B<-loglevel>

The verbosity of the output. 0 is quiet. 1 is status. 2 is verbose. Defaults to 1.

=item B<-dolt-path>

Override where the dolt utility is located. Defaults to ~/go/bin/.

=item B<-preserve>

Do not delete the CSV inputs, the Dolt repo, or the Git repo. Useful for debugging.

=item B<-unsafe>

Delete files and directories that are in the way of the benchmark doing its job.

=item B<-publish>

Publish the results to the shared benchmark results Dolt repository.

=item B<-publish-repo>

Specify the directory containing the Dolt repository used to publish results.
B<-publish> must also be specified.

=item B<-help>

Print a brief help message and exit.

=item B<-man>

Print the manual page and exit.

=back

=head1 DESCRIPTION

B<benchmark.pl> runs a benchmark according to the benchmark configuration
specified in this script. It generates random CSV input files with a defined
schema, imports them into a Dolt repository and a Git repository, and times
various commands. Disk usage is also gathered at several points. The benchmark
output is printed to the screen.

=cut