fix(import): address PR feedback for primary key validation

Update validation function to support name mapping during primary key
validation, allowing users to specify primary keys using either
original CSV column names or mapped target column names when using
the --map option. Ensure validation is properly skipped when schema
files are provided since --pk and --schema parameters are mutually
exclusive.

Update the test expectations in import-create-tables.bats to match the
new detailed error message format, which lists the available columns,
rather than reverting the error messages to satisfy the old
expectations. Also move the new validation tests from import-tables.bats
to import-create-tables.bats, as requested in code review.

The two failing tests "try to table import with nonexistent --pk arg"
and "try to table import with one valid and one nonexistent --pk arg"
now pass because our early validation catches these errors before
InferSchema, producing more helpful error messages that include
available columns. Updated the test expectations to match our
validation's error format instead of the old generic "column not found"
message.

Refs: #1083
This commit is contained in:
David Dansby
2025-06-15 15:54:49 -07:00
parent d5a3c25678
commit dd80d98dc9
4 changed files with 201 additions and 140 deletions

View File

@@ -379,7 +379,8 @@ func validateImportArgs(apr *argparser.ArgParseResults) errhand.VerboseError {
}
// validatePrimaryKeysAgainstSchema checks if all provided primary keys exist in the reader's schema
func validatePrimaryKeysAgainstSchema(primaryKeys []string, rdSchema schema.Schema) error {
// considering any name mapping that may be applied
func validatePrimaryKeysAgainstSchema(primaryKeys []string, rdSchema schema.Schema, nameMapper rowconv.NameMapper) error {
if len(primaryKeys) == 0 {
return nil
}
@@ -388,7 +389,22 @@ func validatePrimaryKeysAgainstSchema(primaryKeys []string, rdSchema schema.Sche
var missingKeys []string
for _, pk := range primaryKeys {
if _, ok := cols.GetByName(pk); !ok {
// First check if the primary key exists directly in the schema
if _, ok := cols.GetByName(pk); ok {
continue
}
// If not found directly, check if any column maps to this primary key name
found := false
cols.Iter(func(tag uint64, col schema.Column) (stop bool, err error) {
if nameMapper.Map(col.Name) == pk {
found = true
return true, nil
}
return false, nil
})
if !found {
missingKeys = append(missingKeys, pk)
}
}
@@ -566,7 +582,7 @@ func newImportDataReader(ctx context.Context, root doltdb.RootValue, dEnv *env.D
// Validate primary keys early for create operations, so that we return validation errors early
if impOpts.operation == mvdata.CreateOp && len(impOpts.primaryKeys) > 0 && impOpts.schFile == "" {
rdSchema := rd.GetSchema()
if err := validatePrimaryKeysAgainstSchema(impOpts.primaryKeys, rdSchema); err != nil {
if err := validatePrimaryKeysAgainstSchema(impOpts.primaryKeys, rdSchema, impOpts.nameMapper); err != nil {
rd.Close(ctx)
return nil, &mvdata.DataMoverCreationError{ErrType: mvdata.SchemaErr, Cause: err}
}

View File

@@ -15,12 +15,12 @@
package tblcmds
import (
"strings"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/dolthub/dolt/go/libraries/doltcore/rowconv"
"github.com/dolthub/dolt/go/libraries/doltcore/schema"
)
@@ -71,44 +71,33 @@ func TestValidatePrimaryKeysAgainstSchema(t *testing.T) {
primaryKeys: []string{"invalid_col"},
schema: testSchema,
expectError: true,
errorContains: "primary key 'invalid_col' not found",
},
{
name: "invalid single primary key shows available columns",
primaryKeys: []string{"invalid_col"},
schema: testSchema,
expectError: true,
errorContains: "Available columns: id, name, email, age",
errorContains: "primary key 'invalid_col' not found in import file. Available columns: id, name, email, age",
},
{
name: "mix of valid and invalid primary keys",
primaryKeys: []string{"id", "invalid_col1", "name", "invalid_col2"},
schema: testSchema,
expectError: true,
errorContains: "primary keys [invalid_col1 invalid_col2] not found",
errorContains: "primary keys [invalid_col1 invalid_col2] not found in import file. Available columns: id, name, email, age",
},
{
name: "all invalid primary keys",
primaryKeys: []string{"col1", "col2", "col3"},
schema: testSchema,
expectError: true,
errorContains: "primary keys [col1 col2 col3] not found",
errorContains: "primary keys [col1 col2 col3] not found in import file. Available columns: id, name, email, age",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
err := validatePrimaryKeysAgainstSchema(tt.primaryKeys, tt.schema)
err := validatePrimaryKeysAgainstSchema(tt.primaryKeys, tt.schema, rowconv.NameMapper{})
if tt.expectError {
assert.Error(t, err)
if tt.errorContains != "" {
assert.Contains(t, err.Error(), tt.errorContains)
}
// Verify that available columns are shown in error message
if strings.Contains(tt.errorContains, "not found") {
assert.Contains(t, err.Error(), "Available columns:")
}
} else {
assert.NoError(t, err)
}
@@ -133,13 +122,70 @@ func TestValidatePrimaryKeysAgainstSchemaColumnOrder(t *testing.T) {
)
require.NoError(t, err)
err = validatePrimaryKeysAgainstSchema([]string{"invalid"}, testSchema)
err = validatePrimaryKeysAgainstSchema([]string{"invalid"}, testSchema, rowconv.NameMapper{})
assert.Error(t, err)
// The error message should contain all available columns
errMsg := err.Error()
assert.Contains(t, errMsg, "alpha")
assert.Contains(t, errMsg, "beta")
assert.Contains(t, errMsg, "gamma")
assert.Contains(t, errMsg, "zebra")
// The implementation returns a detailed error with available columns
assert.Contains(t, err.Error(), "primary key 'invalid' not found in import file. Available columns: zebra, alpha, beta, gamma")
}
// TestValidatePrimaryKeysWithNameMapping verifies that primary key validation
// honors a --map name mapping: a --pk value is accepted when it matches either
// a column name in the reader's schema directly, or the mapped (target) name
// of one of those columns.
func TestValidatePrimaryKeysWithNameMapping(t *testing.T) {
	// Create a test schema with columns: user_id, user_name, user_email
	testCols := []schema.Column{
		{Name: "user_id", Tag: 0, IsPartOfPK: true},
		{Name: "user_name", Tag: 1, IsPartOfPK: false},
		{Name: "user_email", Tag: 2, IsPartOfPK: false},
	}
	testSchema, err := schema.NewSchema(
		schema.NewColCollection(testCols...),
		nil,
		schema.Collation_Default,
		nil,
		nil,
	)
	require.NoError(t, err)
	// Create a name mapper that maps user_id -> id, user_name -> name, user_email -> email
	nameMapper := rowconv.NameMapper{
		"user_id":    "id",
		"user_name":  "name",
		"user_email": "email",
	}
	tests := []struct {
		name        string
		primaryKeys []string
		expectError bool
	}{
		{
			// Accepted via the mapper: user_id maps to "id".
			name:        "primary key using mapped name",
			primaryKeys: []string{"id"}, // mapped from user_id
			expectError: false,
		},
		{
			// Accepted via the direct schema lookup, before mapping is consulted.
			name:        "primary key using original name",
			primaryKeys: []string{"user_id"}, // original column name
			expectError: false,
		},
		{
			name:        "multiple primary keys with mixed names",
			primaryKeys: []string{"id", "name"}, // mapped names
			expectError: false,
		},
		{
			// Neither a schema column nor a mapped target name — must be rejected.
			name:        "invalid primary key not in mapping",
			primaryKeys: []string{"invalid_col"},
			expectError: true,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			err := validatePrimaryKeysAgainstSchema(tt.primaryKeys, testSchema, nameMapper)
			if tt.expectError {
				assert.Error(t, err)
			} else {
				assert.NoError(t, err)
			}
		})
	}
}

View File

@@ -254,14 +254,16 @@ CSV
run dolt table import -c -pk="batmansparents" test 1pk5col-ints.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "Error determining the output schema." ]] || false
[[ "$output" =~ "column 'batmansparents' not found" ]] || false
[[ "$output" =~ "primary key 'batmansparents' not found in import file" ]] || false
[[ "$output" =~ "Available columns: pk, c1, c2, c3, c4, c5" ]] || false
}
@test "import-create-tables: try to table import with one valid and one nonexistent --pk arg" {
run dolt table import -c -pk="pk,batmansparents" test 1pk5col-ints.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "Error determining the output schema." ]] || false
[[ "$output" =~ "column 'batmansparents' not found" ]] || false
[[ "$output" =~ "primary key 'batmansparents' not found in import file" ]] || false
[[ "$output" =~ "Available columns: pk, c1, c2, c3, c4, c5" ]] || false
}
@test "import-create-tables: create a table with two primary keys from csv import" {
@@ -947,4 +949,112 @@ DELIM
[[ "$output" =~ '5,contains null,"[4,null]"' ]] || false
[[ "$output" =~ '6,empty,[]' ]] || false
}
# Regression test for issue #1083: --pk values must exist in the import file,
# and validation must fail fast — before the whole file has been read.
@test "import-create-tables: validate primary keys exist in CSV file (issue #1083)" {
# Create a test CSV file
cat <<DELIM > test_pk_validation.csv
id,name,email,age
1,Alice,alice@example.com,30
2,Bob,bob@example.com,25
3,Charlie,charlie@example.com,35
DELIM
# Test 1: Invalid single primary key should fail immediately
run dolt table import -c --pk "invalid_column" test_table test_pk_validation.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'invalid_column' not found in import file" ]] || false
# Test 2: Multiple invalid primary keys should fail immediately
run dolt table import -c --pk "invalid1,invalid2" test_table test_pk_validation.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary keys" ]] || false
[[ "$output" =~ "invalid1 invalid2" ]] || false
[[ "$output" =~ "not found in import file" ]] || false
# Test 3: Mix of valid and invalid primary keys should fail
run dolt table import -c --pk "id,invalid_col,name" test_table test_pk_validation.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'invalid_col' not found in import file" ]] || false
# Test 4: Valid primary key should succeed
run dolt table import -c --pk "id" test_table test_pk_validation.csv
[ "$status" -eq 0 ]
# Verify table was created correctly
run dolt sql -q "DESCRIBE test_table;"
[ "$status" -eq 0 ]
[[ "$output" =~ "id" ]] || false
[[ "$output" =~ "PRI" ]] || false
# Test 5: Valid multiple primary keys should succeed
# NOTE(review): test_table is a dolt table, not a file on disk, so this rm is
# a no-op — confirm whether `dolt sql -q "DROP TABLE test_table"` was intended.
rm -f test_table
run dolt table import -c --pk "id,name" test_table2 test_pk_validation.csv
[ "$status" -eq 0 ]
# Test 6: PSV file with invalid primary key should also fail immediately
cat <<DELIM > test_pk_validation.psv
id|name|email|age
1|Alice|alice@example.com|30
2|Bob|bob@example.com|25
3|Charlie|charlie@example.com|35
DELIM
run dolt table import -c --pk "nonexistent" test_table3 test_pk_validation.psv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'nonexistent' not found in import file" ]] || false
# Test 7: Large CSV should fail quickly (not after reading entire file)
# Create a larger CSV to simulate the original issue
{
echo "year,state_fips,county_fips,precinct,candidate,votes"
for i in {1..1000}; do
echo "2020,$i,$i,precinct$i,candidate$i,$i"
done
} > large_test.csv
# Time the command - it should fail immediately, not after processing all rows
start_time=$(date +%s)
run dolt table import -c --pk "year,state_fips,county_fips,precinct,invalid_column" precinct_results large_test.csv
end_time=$(date +%s)
duration=$((end_time - start_time))
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'invalid_column' not found in import file" ]] || false
# Should fail in less than 2 seconds (immediate validation)
[ "$duration" -lt 2 ] || false
# Cleanup
rm -f test_pk_validation.csv test_pk_validation.psv large_test.csv
}
# --pk and --schema are mutually exclusive; when a schema file is supplied, the
# early primary key validation is skipped and the schema file defines the keys.
@test "import-create-tables: primary key validation with schema file should skip validation" {
# Create a test CSV file
cat <<DELIM > test_data.csv
id,name,email
1,Alice,alice@example.com
2,Bob,bob@example.com
DELIM
# Create a schema file with different column as primary key
cat <<SQL > test_schema.sql
CREATE TABLE test_with_schema (
id INT,
name VARCHAR(100),
email VARCHAR(100),
PRIMARY KEY (name)
);
SQL
# When schema file is provided, it should work without primary key validation
run dolt table import -c --schema test_schema.sql test_with_schema test_data.csv
[ "$status" -eq 0 ]
# Verify that 'name' is the primary key (from schema file)
run dolt sql -q "SHOW CREATE TABLE test_with_schema;"
[ "$status" -eq 0 ]
[[ "$output" =~ "PRIMARY KEY (\`name\`)" ]] || false
# Cleanup
rm -f test_data.csv test_schema.sql
}

View File

@@ -48,114 +48,3 @@ teardown() {
dolt table import -r -pk "/Key5" t `batshelper escaped-characters.csv`
}
@test "import-tables: validate primary keys exist in CSV file (issue #1083)" {
# Create a test CSV file
cat <<DELIM > test_pk_validation.csv
id,name,email,age
1,Alice,alice@example.com,30
2,Bob,bob@example.com,25
3,Charlie,charlie@example.com,35
DELIM
# Test 1: Invalid single primary key should fail immediately
run dolt table import -c --pk "invalid_column" test_table test_pk_validation.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'invalid_column' not found" ]] || false
[[ "$output" =~ "Available columns: id, name, email, age" ]] || false
# Test 2: Multiple invalid primary keys should fail immediately
run dolt table import -c --pk "invalid1,invalid2" test_table test_pk_validation.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary keys \[invalid1 invalid2\] not found" ]] || false
[[ "$output" =~ "Available columns: id, name, email, age" ]] || false
# Test 3: Mix of valid and invalid primary keys should fail
run dolt table import -c --pk "id,invalid_col,name" test_table test_pk_validation.csv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'invalid_col' not found" ]] || false
[[ "$output" =~ "Available columns: id, name, email, age" ]] || false
# Test 4: Valid primary key should succeed
run dolt table import -c --pk "id" test_table test_pk_validation.csv
[ "$status" -eq 0 ]
# Verify table was created correctly
run dolt sql -q "DESCRIBE test_table;"
[ "$status" -eq 0 ]
[[ "$output" =~ "id" ]] || false
[[ "$output" =~ "PRI" ]] || false
# Test 5: Valid multiple primary keys should succeed
rm -f test_table
run dolt table import -c --pk "id,name" test_table2 test_pk_validation.csv
[ "$status" -eq 0 ]
# Test 6: PSV file with invalid primary key should also fail immediately
cat <<DELIM > test_pk_validation.psv
id|name|email|age
1|Alice|alice@example.com|30
2|Bob|bob@example.com|25
3|Charlie|charlie@example.com|35
DELIM
run dolt table import -c --pk "nonexistent" test_table3 test_pk_validation.psv
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'nonexistent' not found" ]] || false
[[ "$output" =~ "Available columns: id, name, email, age" ]] || false
# Test 7: Large CSV should fail quickly (not after reading entire file)
# Create a larger CSV to simulate the original issue
{
echo "year,state_fips,county_fips,precinct,candidate,votes"
for i in {1..1000}; do
echo "2020,$i,$i,precinct$i,candidate$i,$i"
done
} > large_test.csv
# Time the command - it should fail immediately, not after processing all rows
start_time=$(date +%s)
run dolt table import -c --pk "year,state_fips,county_fips,precinct,invalid_column" precinct_results large_test.csv
end_time=$(date +%s)
duration=$((end_time - start_time))
[ "$status" -eq 1 ]
[[ "$output" =~ "primary key 'invalid_column' not found" ]] || false
[[ "$output" =~ "Available columns: year, state_fips, county_fips, precinct, candidate, votes" ]] || false
# Should fail in less than 2 seconds (immediate validation)
[ "$duration" -lt 2 ] || false
# Cleanup
rm -f test_pk_validation.csv test_pk_validation.psv large_test.csv
}
@test "import-tables: primary key validation with schema file should skip validation" {
# Create a test CSV file
cat <<DELIM > test_data.csv
id,name,email
1,Alice,alice@example.com
2,Bob,bob@example.com
DELIM
# Create a schema file with different column as primary key
cat <<SQL > test_schema.sql
CREATE TABLE test_with_schema (
id INT,
name VARCHAR(100),
email VARCHAR(100),
PRIMARY KEY (name)
);
SQL
# Even though 'name' is specified as PK in command line, schema file takes precedence
# and validation should be skipped
run dolt table import -c --pk "nonexistent_column" --schema test_schema.sql test_with_schema test_data.csv
[ "$status" -eq 0 ]
# Verify that 'name' is the primary key (from schema file)
run dolt sql -q "SHOW CREATE TABLE test_with_schema;"
[ "$status" -eq 0 ]
[[ "$output" =~ "PRIMARY KEY (\`name\`)" ]] || false
# Cleanup
rm -f test_data.csv test_schema.sql
}