fix(import): address PR feedback for primary key validation

Update validation function to support name mapping during primary key
validation, allowing users to specify primary keys using either
original CSV column names or mapped target column names when using
the --map option. Ensure validation is properly skipped when schema
files are provided since --pk and --schema parameters are mutually
exclusive.

Update the test expectations in import-create-tables.bats to match the
new detailed error message format, which lists the available columns,
rather than reverting the error messages to satisfy the old
expectations. Also move the new validation tests from import-tables.bats
to import-create-tables.bats, as requested in code review.

The two failing tests "try to table import with nonexistent --pk arg"
and "try to table import with one valid and one nonexistent --pk arg"
now pass because our early validation catches these errors before
InferSchema, producing more helpful error messages that include
available columns. Updated the test expectations to match our
validation's error format instead of the old generic "column not found"
message.

Refs: #1083
This commit is contained in:
David Dansby
2025-06-15 15:54:49 -07:00
parent d5a3c25678
commit dd80d98dc9
4 changed files with 201 additions and 140 deletions

View File

@@ -379,7 +379,8 @@ func validateImportArgs(apr *argparser.ArgParseResults) errhand.VerboseError {
}
// validatePrimaryKeysAgainstSchema checks if all provided primary keys exist in the reader's schema
func validatePrimaryKeysAgainstSchema(primaryKeys []string, rdSchema schema.Schema) error {
// considering any name mapping that may be applied
func validatePrimaryKeysAgainstSchema(primaryKeys []string, rdSchema schema.Schema, nameMapper rowconv.NameMapper) error {
if len(primaryKeys) == 0 {
return nil
}
@@ -388,7 +389,22 @@ func validatePrimaryKeysAgainstSchema(primaryKeys []string, rdSchema schema.Sche
var missingKeys []string
for _, pk := range primaryKeys {
if _, ok := cols.GetByName(pk); !ok {
// First check if the primary key exists directly in the schema
if _, ok := cols.GetByName(pk); ok {
continue
}
// If not found directly, check if any column maps to this primary key name
found := false
cols.Iter(func(tag uint64, col schema.Column) (stop bool, err error) {
if nameMapper.Map(col.Name) == pk {
found = true
return true, nil
}
return false, nil
})
if !found {
missingKeys = append(missingKeys, pk)
}
}
@@ -566,7 +582,7 @@ func newImportDataReader(ctx context.Context, root doltdb.RootValue, dEnv *env.D
// Validate primary keys early for create operations, so that we return validation errors early
if impOpts.operation == mvdata.CreateOp && len(impOpts.primaryKeys) > 0 && impOpts.schFile == "" {
rdSchema := rd.GetSchema()
if err := validatePrimaryKeysAgainstSchema(impOpts.primaryKeys, rdSchema); err != nil {
if err := validatePrimaryKeysAgainstSchema(impOpts.primaryKeys, rdSchema, impOpts.nameMapper); err != nil {
rd.Close(ctx)
return nil, &mvdata.DataMoverCreationError{ErrType: mvdata.SchemaErr, Cause: err}
}

View File

@@ -15,12 +15,12 @@
package tblcmds
import (
"strings"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/dolthub/dolt/go/libraries/doltcore/rowconv"
"github.com/dolthub/dolt/go/libraries/doltcore/schema"
)
@@ -71,44 +71,33 @@ func TestValidatePrimaryKeysAgainstSchema(t *testing.T) {
primaryKeys: []string{"invalid_col"},
schema: testSchema,
expectError: true,
errorContains: "primary key 'invalid_col' not found",
},
{
name: "invalid single primary key shows available columns",
primaryKeys: []string{"invalid_col"},
schema: testSchema,
expectError: true,
errorContains: "Available columns: id, name, email, age",
errorContains: "primary key 'invalid_col' not found in import file. Available columns: id, name, email, age",
},
{
name: "mix of valid and invalid primary keys",
primaryKeys: []string{"id", "invalid_col1", "name", "invalid_col2"},
schema: testSchema,
expectError: true,
errorContains: "primary keys [invalid_col1 invalid_col2] not found",
errorContains: "primary keys [invalid_col1 invalid_col2] not found in import file. Available columns: id, name, email, age",
},
{
name: "all invalid primary keys",
primaryKeys: []string{"col1", "col2", "col3"},
schema: testSchema,
expectError: true,
errorContains: "primary keys [col1 col2 col3] not found",
errorContains: "primary keys [col1 col2 col3] not found in import file. Available columns: id, name, email, age",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
err := validatePrimaryKeysAgainstSchema(tt.primaryKeys, tt.schema)
err := validatePrimaryKeysAgainstSchema(tt.primaryKeys, tt.schema, rowconv.NameMapper{})
if tt.expectError {
assert.Error(t, err)
if tt.errorContains != "" {
assert.Contains(t, err.Error(), tt.errorContains)
}
// Verify that available columns are shown in error message
if strings.Contains(tt.errorContains, "not found") {
assert.Contains(t, err.Error(), "Available columns:")
}
} else {
assert.NoError(t, err)
}
@@ -133,13 +122,70 @@ func TestValidatePrimaryKeysAgainstSchemaColumnOrder(t *testing.T) {
)
require.NoError(t, err)
err = validatePrimaryKeysAgainstSchema([]string{"invalid"}, testSchema)
err = validatePrimaryKeysAgainstSchema([]string{"invalid"}, testSchema, rowconv.NameMapper{})
assert.Error(t, err)
// The error message should contain all available columns
errMsg := err.Error()
assert.Contains(t, errMsg, "alpha")
assert.Contains(t, errMsg, "beta")
assert.Contains(t, errMsg, "gamma")
assert.Contains(t, errMsg, "zebra")
// The implementation returns a detailed error with available columns
assert.Contains(t, err.Error(), "primary key 'invalid' not found in import file. Available columns: zebra, alpha, beta, gamma")
}
// TestValidatePrimaryKeysWithNameMapping verifies that primary key validation
// honors a --map name mapping: a --pk value is accepted when it matches either
// a column name in the reader's schema directly, or the mapped (target) name
// of one of those columns.
func TestValidatePrimaryKeysWithNameMapping(t *testing.T) {
	// Create a test schema with columns: user_id, user_name, user_email
	testCols := []schema.Column{
		{Name: "user_id", Tag: 0, IsPartOfPK: true},
		{Name: "user_name", Tag: 1, IsPartOfPK: false},
		{Name: "user_email", Tag: 2, IsPartOfPK: false},
	}
	testSchema, err := schema.NewSchema(
		schema.NewColCollection(testCols...),
		nil,
		schema.Collation_Default,
		nil,
		nil,
	)
	require.NoError(t, err)
	// Create a name mapper that maps user_id -> id, user_name -> name, user_email -> email
	nameMapper := rowconv.NameMapper{
		"user_id":    "id",
		"user_name":  "name",
		"user_email": "email",
	}
	tests := []struct {
		name        string
		primaryKeys []string
		expectError bool
	}{
		{
			// Accepted via the mapper: user_id maps to "id".
			name:        "primary key using mapped name",
			primaryKeys: []string{"id"}, // mapped from user_id
			expectError: false,
		},
		{
			// Accepted via the direct schema lookup, before mapping is consulted.
			name:        "primary key using original name",
			primaryKeys: []string{"user_id"}, // original column name
			expectError: false,
		},
		{
			name:        "multiple primary keys with mixed names",
			primaryKeys: []string{"id", "name"}, // mapped names
			expectError: false,
		},
		{
			// Neither a schema column nor a mapped target name — must be rejected.
			name:        "invalid primary key not in mapping",
			primaryKeys: []string{"invalid_col"},
			expectError: true,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			err := validatePrimaryKeysAgainstSchema(tt.primaryKeys, testSchema, nameMapper)
			if tt.expectError {
				assert.Error(t, err)
			} else {
				assert.NoError(t, err)
			}
		})
	}
}

View File

@@ -254,14 +254,16 @@ CSV
run dolt table import -c -pk="batmansparents" test 1pk5col-ints.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "Error determining the output schema." ]] || false
[[ "$output" =~ "column 'batmansparents' not found" ]] || false
[[ "$output" =~ "primary key 'batmansparents' not found in import file" ]] || false
[[ "$output" =~ "Available columns: pk, c1, c2, c3, c4, c5" ]] || false
}
@test "import-create-tables: try to table import with one valid and one nonexistent --pk arg" {
run dolt table import -c -pk="pk,batmansparents" test 1pk5col-ints.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "Error determining the output schema." ]] || false
[[ "$output" =~ "column 'batmansparents' not found" ]] || false
[[ "$output" =~ "primary key 'batmansparents' not found in import file" ]] || false
[[ "$output" =~ "Available columns: pk, c1, c2, c3, c4, c5" ]] || false
}
@test "import-create-tables: create a table with two primary keys from csv import" {
@@ -947,4 +949,112 @@ DELIM
[[ "$output" =~ '5,contains null,"[4,null]"' ]] || false
[[ "$output" =~ '6,empty,[]' ]] || false
}
# Regression test for issue #1083: --pk values must exist in the import file,
# and validation must fail fast — before the whole file has been read.
@test "import-create-tables: validate primary keys exist in CSV file (issue #1083)" {
# Create a test CSV file
cat <<DELIM > test_pk_validation.csv
id,name,email,age
1,Alice,alice@example.com,30
2,Bob,bob@example.com,25
3,Charlie,charlie@example.com,35
DELIM
# Test 1: Invalid single primary key should fail immediately
run dolt table import -c --pk "invalid_column" test_table test_pk_validation.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'invalid_column' not found in import file" ]] || false
# Test 2: Multiple invalid primary keys should fail immediately
run dolt table import -c --pk "invalid1,invalid2" test_table test_pk_validation.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary keys" ]] || false
[[ "$output" =~ "invalid1 invalid2" ]] || false
[[ "$output" =~ "not found in import file" ]] || false
# Test 3: Mix of valid and invalid primary keys should fail
run dolt table import -c --pk "id,invalid_col,name" test_table test_pk_validation.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'invalid_col' not found in import file" ]] || false
# Test 4: Valid primary key should succeed
run dolt table import -c --pk "id" test_table test_pk_validation.csv
[ "$status" -eq 0 ]
# Verify table was created correctly
run dolt sql -q "DESCRIBE test_table;"
[ "$status" -eq 0 ]
[[ "$output" =~ "id" ]] || false
[[ "$output" =~ "PRI" ]] || false
# Test 5: Valid multiple primary keys should succeed
# NOTE(review): test_table is a dolt table, not a file on disk, so this rm is
# a no-op — confirm whether `dolt sql -q "DROP TABLE test_table"` was intended.
rm -f test_table
run dolt table import -c --pk "id,name" test_table2 test_pk_validation.csv
[ "$status" -eq 0 ]
# Test 6: PSV file with invalid primary key should also fail immediately
cat <<DELIM > test_pk_validation.psv
id|name|email|age
1|Alice|alice@example.com|30
2|Bob|bob@example.com|25
3|Charlie|charlie@example.com|35
DELIM
run dolt table import -c --pk "nonexistent" test_table3 test_pk_validation.psv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'nonexistent' not found in import file" ]] || false
# Test 7: Large CSV should fail quickly (not after reading entire file)
# Create a larger CSV to simulate the original issue
{
echo "year,state_fips,county_fips,precinct,candidate,votes"
for i in {1..1000}; do
echo "2020,$i,$i,precinct$i,candidate$i,$i"
done
} > large_test.csv
# Time the command - it should fail immediately, not after processing all rows
start_time=$(date +%s)
run dolt table import -c --pk "year,state_fips,county_fips,precinct,invalid_column" precinct_results large_test.csv
end_time=$(date +%s)
duration=$((end_time - start_time))
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'invalid_column' not found in import file" ]] || false
# Should fail in less than 2 seconds (immediate validation)
[ "$duration" -lt 2 ] || false
# Cleanup
rm -f test_pk_validation.csv test_pk_validation.psv large_test.csv
}
# --pk and --schema are mutually exclusive; when a schema file is supplied, the
# early primary key validation is skipped and the schema file defines the keys.
@test "import-create-tables: primary key validation with schema file should skip validation" {
# Create a test CSV file
cat <<DELIM > test_data.csv
id,name,email
1,Alice,alice@example.com
2,Bob,bob@example.com
DELIM
# Create a schema file with different column as primary key
cat <<SQL > test_schema.sql
CREATE TABLE test_with_schema (
id INT,
name VARCHAR(100),
email VARCHAR(100),
PRIMARY KEY (name)
);
SQL
# When schema file is provided, it should work without primary key validation
run dolt table import -c --schema test_schema.sql test_with_schema test_data.csv
[ "$status" -eq 0 ]
# Verify that 'name' is the primary key (from schema file)
run dolt sql -q "SHOW CREATE TABLE test_with_schema;"
[ "$status" -eq 0 ]
[[ "$output" =~ "PRIMARY KEY (\`name\`)" ]] || false
# Cleanup
rm -f test_data.csv test_schema.sql
}

View File

@@ -48,114 +48,3 @@ teardown() {
dolt table import -r -pk "/Key5" t `batshelper escaped-characters.csv`
}
@test "import-tables: validate primary keys exist in CSV file (issue #1083)" {
# Create a test CSV file
cat <<DELIM > test_pk_validation.csv
id,name,email,age
1,Alice,alice@example.com,30
2,Bob,bob@example.com,25
3,Charlie,charlie@example.com,35
DELIM
# Test 1: Invalid single primary key should fail immediately
run dolt table import -c --pk "invalid_column" test_table test_pk_validation.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'invalid_column' not found" ]] || false
[[ "$output" =~ "Available columns: id, name, email, age" ]] || false
# Test 2: Multiple invalid primary keys should fail immediately
run dolt table import -c --pk "invalid1,invalid2" test_table test_pk_validation.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary keys \[invalid1 invalid2\] not found" ]] || false
[[ "$output" =~ "Available columns: id, name, email, age" ]] || false
# Test 3: Mix of valid and invalid primary keys should fail
run dolt table import -c --pk "id,invalid_col,name" test_table test_pk_validation.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'invalid_col' not found" ]] || false
[[ "$output" =~ "Available columns: id, name, email, age" ]] || false
# Test 4: Valid primary key should succeed
run dolt table import -c --pk "id" test_table test_pk_validation.csv
[ "$status" -eq 0 ]
# Verify table was created correctly
run dolt sql -q "DESCRIBE test_table;"
[ "$status" -eq 0 ]
[[ "$output" =~ "id" ]] || false
[[ "$output" =~ "PRI" ]] || false
# Test 5: Valid multiple primary keys should succeed
rm -f test_table
run dolt table import -c --pk "id,name" test_table2 test_pk_validation.csv
[ "$status" -eq 0 ]
# Test 6: PSV file with invalid primary key should also fail immediately
cat <<DELIM > test_pk_validation.psv
id|name|email|age
1|Alice|alice@example.com|30
2|Bob|bob@example.com|25
3|Charlie|charlie@example.com|35
DELIM
run dolt table import -c --pk "nonexistent" test_table3 test_pk_validation.psv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'nonexistent' not found" ]] || false
[[ "$output" =~ "Available columns: id, name, email, age" ]] || false
# Test 7: Large CSV should fail quickly (not after reading entire file)
# Create a larger CSV to simulate the original issue
{
echo "year,state_fips,county_fips,precinct,candidate,votes"
for i in {1..1000}; do
echo "2020,$i,$i,precinct$i,candidate$i,$i"
done
} > large_test.csv
# Time the command - it should fail immediately, not after processing all rows
start_time=$(date +%s)
run dolt table import -c --pk "year,state_fips,county_fips,precinct,invalid_column" precinct_results large_test.csv
end_time=$(date +%s)
duration=$((end_time - start_time))
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'invalid_column' not found" ]] || false
[[ "$output" =~ "Available columns: year, state_fips, county_fips, precinct, candidate, votes" ]] || false
# Should fail in less than 2 seconds (immediate validation)
[ "$duration" -lt 2 ] || false
# Cleanup
rm -f test_pk_validation.csv test_pk_validation.psv large_test.csv
}
@test "import-tables: primary key validation with schema file should skip validation" {
# Create a test CSV file
cat <<DELIM > test_data.csv
id,name,email
1,Alice,alice@example.com
2,Bob,bob@example.com
DELIM
# Create a schema file with different column as primary key
cat <<SQL > test_schema.sql
CREATE TABLE test_with_schema (
id INT,
name VARCHAR(100),
email VARCHAR(100),
PRIMARY KEY (name)
);
SQL
# Even though 'name' is specified as PK in command line, schema file takes precedence
# and validation should be skipped
run dolt table import -c --pk "nonexistent_column" --schema test_schema.sql test_with_schema test_data.csv
[ "$status" -eq 0 ]
# Verify that 'name' is the primary key (from schema file)
run dolt sql -q "SHOW CREATE TABLE test_with_schema;"
[ "$status" -eq 0 ]
[[ "$output" =~ "PRIMARY KEY (\`name\`)" ]] || false
# Cleanup
rm -f test_data.csv test_schema.sql
}