diff --git a/go/cmd/dolt/commands/tblcmds/import.go b/go/cmd/dolt/commands/tblcmds/import.go index 321ef8d886..464d6af8ce 100644 --- a/go/cmd/dolt/commands/tblcmds/import.go +++ b/go/cmd/dolt/commands/tblcmds/import.go @@ -379,7 +379,8 @@ func validateImportArgs(apr *argparser.ArgParseResults) errhand.VerboseError { } // validatePrimaryKeysAgainstSchema checks if all provided primary keys exist in the reader's schema -func validatePrimaryKeysAgainstSchema(primaryKeys []string, rdSchema schema.Schema) error { +// considering any name mapping that may be applied +func validatePrimaryKeysAgainstSchema(primaryKeys []string, rdSchema schema.Schema, nameMapper rowconv.NameMapper) error { if len(primaryKeys) == 0 { return nil } @@ -388,7 +389,22 @@ func validatePrimaryKeysAgainstSchema(primaryKeys []string, rdSchema schema.Sche var missingKeys []string for _, pk := range primaryKeys { - if _, ok := cols.GetByName(pk); !ok { + // First check if the primary key exists directly in the schema + if _, ok := cols.GetByName(pk); ok { + continue + } + + // If not found directly, check if any column maps to this primary key name + found := false + cols.Iter(func(tag uint64, col schema.Column) (stop bool, err error) { + if nameMapper.Map(col.Name) == pk { + found = true + return true, nil + } + return false, nil + }) + + if !found { missingKeys = append(missingKeys, pk) } } @@ -566,7 +582,7 @@ func newImportDataReader(ctx context.Context, root doltdb.RootValue, dEnv *env.D // Validate primary keys early for create operations, so that we return validation errors early if impOpts.operation == mvdata.CreateOp && len(impOpts.primaryKeys) > 0 && impOpts.schFile == "" { rdSchema := rd.GetSchema() - if err := validatePrimaryKeysAgainstSchema(impOpts.primaryKeys, rdSchema); err != nil { + if err := validatePrimaryKeysAgainstSchema(impOpts.primaryKeys, rdSchema, impOpts.nameMapper); err != nil { rd.Close(ctx) return nil, &mvdata.DataMoverCreationError{ErrType: mvdata.SchemaErr, Cause: err} } diff --git a/go/cmd/dolt/commands/tblcmds/import_test.go b/go/cmd/dolt/commands/tblcmds/import_test.go index f5a42bf5fb..09ceaa64a1 100644 --- a/go/cmd/dolt/commands/tblcmds/import_test.go +++ b/go/cmd/dolt/commands/tblcmds/import_test.go @@ -15,12 +15,12 @@ package tblcmds import ( - "strings" "testing" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/dolthub/dolt/go/libraries/doltcore/rowconv" "github.com/dolthub/dolt/go/libraries/doltcore/schema" ) @@ -71,44 +71,33 @@ func TestValidatePrimaryKeysAgainstSchema(t *testing.T) { primaryKeys: []string{"invalid_col"}, schema: testSchema, expectError: true, - errorContains: "primary key 'invalid_col' not found", - }, - { - name: "invalid single primary key shows available columns", - primaryKeys: []string{"invalid_col"}, - schema: testSchema, - expectError: true, - errorContains: "Available columns: id, name, email, age", + errorContains: "primary key 'invalid_col' not found in import file. Available columns: id, name, email, age", }, { name: "mix of valid and invalid primary keys", primaryKeys: []string{"id", "invalid_col1", "name", "invalid_col2"}, schema: testSchema, expectError: true, - errorContains: "primary keys [invalid_col1 invalid_col2] not found", + errorContains: "primary keys [invalid_col1 invalid_col2] not found in import file. Available columns: id, name, email, age", }, { name: "all invalid primary keys", primaryKeys: []string{"col1", "col2", "col3"}, schema: testSchema, expectError: true, - errorContains: "primary keys [col1 col2 col3] not found", + errorContains: "primary keys [col1 col2 col3] not found in import file. Available columns: id, name, email, age", }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - err := validatePrimaryKeysAgainstSchema(tt.primaryKeys, tt.schema) + err := validatePrimaryKeysAgainstSchema(tt.primaryKeys, tt.schema, rowconv.NameMapper{}) if tt.expectError { assert.Error(t, err) if tt.errorContains != "" { assert.Contains(t, err.Error(), tt.errorContains) } - // Verify that available columns are shown in error message - if strings.Contains(tt.errorContains, "not found") { - assert.Contains(t, err.Error(), "Available columns:") - } } else { assert.NoError(t, err) } @@ -133,13 +122,70 @@ func TestValidatePrimaryKeysAgainstSchemaColumnOrder(t *testing.T) { ) require.NoError(t, err) - err = validatePrimaryKeysAgainstSchema([]string{"invalid"}, testSchema) + err = validatePrimaryKeysAgainstSchema([]string{"invalid"}, testSchema, rowconv.NameMapper{}) assert.Error(t, err) - - // The error message should contain all available columns - errMsg := err.Error() - assert.Contains(t, errMsg, "alpha") - assert.Contains(t, errMsg, "beta") - assert.Contains(t, errMsg, "gamma") - assert.Contains(t, errMsg, "zebra") + // The implementation returns a detailed error with available columns + assert.Contains(t, err.Error(), "primary key 'invalid' not found in import file. Available columns: zebra, alpha, beta, gamma") +} + +func TestValidatePrimaryKeysWithNameMapping(t *testing.T) { + // Create a test schema with columns: user_id, user_name, user_email + testCols := []schema.Column{ + {Name: "user_id", Tag: 0, IsPartOfPK: true}, + {Name: "user_name", Tag: 1, IsPartOfPK: false}, + {Name: "user_email", Tag: 2, IsPartOfPK: false}, + } + testSchema, err := schema.NewSchema( + schema.NewColCollection(testCols...), + nil, + schema.Collation_Default, + nil, + nil, + ) + require.NoError(t, err) + + // Create a name mapper that maps user_id -> id, user_name -> name, user_email -> email + nameMapper := rowconv.NameMapper{ + "user_id": "id", + "user_name": "name", + "user_email": "email", + } + + tests := []struct { + name string + primaryKeys []string + expectError bool + }{ + { + name: "primary key using mapped name", + primaryKeys: []string{"id"}, // mapped from user_id + expectError: false, + }, + { + name: "primary key using original name", + primaryKeys: []string{"user_id"}, // original column name + expectError: false, + }, + { + name: "multiple primary keys with mixed names", + primaryKeys: []string{"id", "name"}, // mapped names + expectError: false, + }, + { + name: "invalid primary key not in mapping", + primaryKeys: []string{"invalid_col"}, + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := validatePrimaryKeysAgainstSchema(tt.primaryKeys, testSchema, nameMapper) + if tt.expectError { + assert.Error(t, err) + } else { + assert.NoError(t, err) + } + }) + } } diff --git a/integration-tests/bats/import-create-tables.bats b/integration-tests/bats/import-create-tables.bats index 16c4c42baf..41ed689550 100755 --- a/integration-tests/bats/import-create-tables.bats +++ b/integration-tests/bats/import-create-tables.bats @@ -254,14 +254,16 @@ CSV run dolt table import -c -pk="batmansparents" test 1pk5col-ints.csv [ "$status" -eq 1 ] [[ "$output" =~ "Error determining the output schema." ]] || false - [[ "$output" =~ "column 'batmansparents' not found" ]] || false + [[ "$output" =~ "primary key 'batmansparents' not found in import file" ]] || false + [[ "$output" =~ "Available columns: pk, c1, c2, c3, c4, c5" ]] || false } @test "import-create-tables: try to table import with one valid and one nonexistent --pk arg" { run dolt table import -c -pk="pk,batmansparents" test 1pk5col-ints.csv [ "$status" -eq 1 ] [[ "$output" =~ "Error determining the output schema." ]] || false - [[ "$output" =~ "column 'batmansparents' not found" ]] || false + [[ "$output" =~ "primary key 'batmansparents' not found in import file" ]] || false + [[ "$output" =~ "Available columns: pk, c1, c2, c3, c4, c5" ]] || false } @test "import-create-tables: create a table with two primary keys from csv import" { @@ -947,4 +949,112 @@ DELIM [[ "$output" =~ '5,contains null,"[4,null]"' ]] || false [[ "$output" =~ '6,empty,[]' ]] || false +} + +@test "import-create-tables: validate primary keys exist in CSV file (issue #1083)" { + # Create a test CSV file + cat < test_pk_validation.csv +id,name,email,age +1,Alice,alice@example.com,30 +2,Bob,bob@example.com,25 +3,Charlie,charlie@example.com,35 +DELIM + + # Test 1: Invalid single primary key should fail immediately + run dolt table import -c --pk "invalid_column" test_table test_pk_validation.csv + [ "$status" -eq 1 ] + [[ "$output" =~ "primary key 'invalid_column' not found in import file" ]] || false + + # Test 2: Multiple invalid primary keys should fail immediately + run dolt table import -c --pk "invalid1,invalid2" test_table test_pk_validation.csv + [ "$status" -eq 1 ] + [[ "$output" =~ "primary keys" ]] || false + [[ "$output" =~ "invalid1 invalid2" ]] || false + [[ "$output" =~ "not found in import file" ]] || false + + # Test 3: Mix of valid and invalid primary keys should fail + run dolt table import -c --pk "id,invalid_col,name" test_table test_pk_validation.csv + [ "$status" -eq 1 ] + [[ "$output" =~ "primary key 'invalid_col' not found in import file" ]] || false + + # Test 4: Valid primary key should succeed + run dolt table import -c --pk "id" test_table test_pk_validation.csv + [ "$status" -eq 0 ] + + # Verify table was created correctly + run dolt sql -q "DESCRIBE test_table;" + [ "$status" -eq 0 ] + [[ "$output" =~ "id" ]] || false + [[ "$output" =~ "PRI" ]] || false + + # Test 5: Valid multiple primary keys should succeed + rm -f test_table + run dolt table import -c --pk "id,name" test_table2 test_pk_validation.csv + [ "$status" -eq 0 ] + + # Test 6: PSV file with invalid primary key should also fail immediately + cat < test_pk_validation.psv +id|name|email|age +1|Alice|alice@example.com|30 +2|Bob|bob@example.com|25 +3|Charlie|charlie@example.com|35 +DELIM + + run dolt table import -c --pk "nonexistent" test_table3 test_pk_validation.psv + [ "$status" -eq 1 ] + [[ "$output" =~ "primary key 'nonexistent' not found in import file" ]] || false + + # Test 7: Large CSV should fail quickly (not after reading entire file) + # Create a larger CSV to simulate the original issue + { + echo "year,state_fips,county_fips,precinct,candidate,votes" + for i in {1..1000}; do + echo "2020,$i,$i,precinct$i,candidate$i,$i" + done + } > large_test.csv + + # Time the command - it should fail immediately, not after processing all rows + start_time=$(date +%s) + run dolt table import -c --pk "year,state_fips,county_fips,precinct,invalid_column" precinct_results large_test.csv + end_time=$(date +%s) + duration=$((end_time - start_time)) + + [ "$status" -eq 1 ] + [[ "$output" =~ "primary key 'invalid_column' not found in import file" ]] || false + # Should fail in less than 2 seconds (immediate validation) + [ "$duration" -lt 2 ] || false + + # Cleanup + rm -f test_pk_validation.csv test_pk_validation.psv large_test.csv +} + +@test "import-create-tables: primary key validation with schema file should skip validation" { + # Create a test CSV file + cat < test_data.csv +id,name,email +1,Alice,alice@example.com +2,Bob,bob@example.com +DELIM + + # Create a schema file with different column as primary key + cat < test_schema.sql +CREATE TABLE test_with_schema ( + id INT, + name VARCHAR(100), + email VARCHAR(100), + PRIMARY KEY (name) +); +SQL + + # When schema file is provided, it should work without primary key validation + run dolt table import -c --schema test_schema.sql test_with_schema test_data.csv + [ "$status" -eq 0 ] + + # Verify that 'name' is the primary key (from schema file) + run dolt sql -q "SHOW CREATE TABLE test_with_schema;" + [ "$status" -eq 0 ] + [[ "$output" =~ "PRIMARY KEY (\`name\`)" ]] || false + + # Cleanup + rm -f test_data.csv test_schema.sql } \ No newline at end of file diff --git a/integration-tests/bats/import-tables.bats b/integration-tests/bats/import-tables.bats index ef7c2be512..92467cf86f 100644 --- a/integration-tests/bats/import-tables.bats +++ b/integration-tests/bats/import-tables.bats @@ -48,114 +48,3 @@ teardown() { dolt table import -r -pk "/Key5" t `batshelper escaped-characters.csv` } -@test "import-tables: validate primary keys exist in CSV file (issue #1083)" { - # Create a test CSV file - cat < test_pk_validation.csv -id,name,email,age -1,Alice,alice@example.com,30 -2,Bob,bob@example.com,25 -3,Charlie,charlie@example.com,35 -DELIM - - # Test 1: Invalid single primary key should fail immediately - run dolt table import -c --pk "invalid_column" test_table test_pk_validation.csv - [ "$status" -eq 1 ] - [[ "$output" =~ "primary key 'invalid_column' not found" ]] || false - [[ "$output" =~ "Available columns: id, name, email, age" ]] || false - - # Test 2: Multiple invalid primary keys should fail immediately - run dolt table import -c --pk "invalid1,invalid2" test_table test_pk_validation.csv - [ "$status" -eq 1 ] - [[ "$output" =~ "primary keys \[invalid1 invalid2\] not found" ]] || false - [[ "$output" =~ "Available columns: id, name, email, age" ]] || false - - # Test 3: Mix of valid and invalid primary keys should fail - run dolt table import -c --pk "id,invalid_col,name" test_table test_pk_validation.csv - [ "$status" -eq 1 ] - [[ "$output" =~ "primary key 'invalid_col' not found" ]] || false - [[ "$output" =~ "Available columns: id, name, email, age" ]] || false - - # Test 4: Valid primary key should succeed - run dolt table import -c --pk "id" test_table test_pk_validation.csv - [ "$status" -eq 0 ] - - # Verify table was created correctly - run dolt sql -q "DESCRIBE test_table;" - [ "$status" -eq 0 ] - [[ "$output" =~ "id" ]] || false - [[ "$output" =~ "PRI" ]] || false - - # Test 5: Valid multiple primary keys should succeed - rm -f test_table - run dolt table import -c --pk "id,name" test_table2 test_pk_validation.csv - [ "$status" -eq 0 ] - - # Test 6: PSV file with invalid primary key should also fail immediately - cat < test_pk_validation.psv -id|name|email|age -1|Alice|alice@example.com|30 -2|Bob|bob@example.com|25 -3|Charlie|charlie@example.com|35 -DELIM - - run dolt table import -c --pk "nonexistent" test_table3 test_pk_validation.psv - [ "$status" -eq 1 ] - [[ "$output" =~ "primary key 'nonexistent' not found" ]] || false - [[ "$output" =~ "Available columns: id, name, email, age" ]] || false - - # Test 7: Large CSV should fail quickly (not after reading entire file) - # Create a larger CSV to simulate the original issue - { - echo "year,state_fips,county_fips,precinct,candidate,votes" - for i in {1..1000}; do - echo "2020,$i,$i,precinct$i,candidate$i,$i" - done - } > large_test.csv - - # Time the command - it should fail immediately, not after processing all rows - start_time=$(date +%s) - run dolt table import -c --pk "year,state_fips,county_fips,precinct,invalid_column" precinct_results large_test.csv - end_time=$(date +%s) - duration=$((end_time - start_time)) - - [ "$status" -eq 1 ] - [[ "$output" =~ "primary key 'invalid_column' not found" ]] || false - [[ "$output" =~ "Available columns: year, state_fips, county_fips, precinct, candidate, votes" ]] || false - # Should fail in less than 2 seconds (immediate validation) - [ "$duration" -lt 2 ] || false - - # Cleanup - rm -f test_pk_validation.csv test_pk_validation.psv large_test.csv -} - -@test "import-tables: primary key validation with schema file should skip validation" { - # Create a test CSV file - cat < test_data.csv -id,name,email -1,Alice,alice@example.com -2,Bob,bob@example.com -DELIM - - # Create a schema file with different column as primary key - cat < test_schema.sql -CREATE TABLE test_with_schema ( - id INT, - name VARCHAR(100), - email VARCHAR(100), - PRIMARY KEY (name) -); -SQL - - # Even though 'name' is specified as PK in command line, schema file takes precedence - # and validation should be skipped - run dolt table import -c --pk "nonexistent_column" --schema test_schema.sql test_with_schema test_data.csv - [ "$status" -eq 0 ] - - # Verify that 'name' is the primary key (from schema file) - run dolt sql -q "SHOW CREATE TABLE test_with_schema;" - [ "$status" -eq 0 ] - [[ "$output" =~ "PRIMARY KEY (\`name\`)" ]] || false - - # Cleanup - rm -f test_data.csv test_schema.sql -}