Add special encoding to csv imported struct fields (#2441)

CSV importing now strips characters that are invalid in struct field names from CSV header fields and camel-cases the words separated by spaces, e.g. "ca-mel case" is translated to "camelCase".
2026-01-24 03:09:22 -06:00 · 2016-08-30 14:59:10 -07:00
parent cba7a4f118
commit aeb5c42bcc
4 changed files with 128 additions and 60 deletions
--- a/go/types/struct.go
+++ b/go/types/struct.go
@@ -9,6 +9,7 @@ import (
 	"fmt"
 	"regexp"
 	"sort"
+	"strings"

 	"github.com/attic-labs/noms/go/d"
 	"github.com/attic-labs/noms/go/hash"
@@ -169,18 +170,65 @@ func (s1 Struct) Diff(s2 Struct, changes chan<- ValueChanged, closeChan <-chan s
 }

 var escapeChar = "Q"
-var headPattern = regexp.MustCompile("[a-zA-PR-Z]")
-var tailPattern = regexp.MustCompile("[a-zA-PR-Z0-9_]")
-var completePattern = regexp.MustCompile("^" + headPattern.String() + tailPattern.String() + "*$")
+var headFieldNamePattern = regexp.MustCompile("[a-zA-Z]")
+var tailFieldNamePattern = regexp.MustCompile("[a-zA-Z0-9_]")
+var spaceRegex = regexp.MustCompile("[ ]")
+var escapeRegex = regexp.MustCompile(escapeChar)

-// Escapes names for use as noms structs. Disallowed characters are encoded as
-// 'Q<hex-encoded-utf8-bytes>'. Note that Q itself is also escaped since it is
-// the escape character.
-func EscapeStructField(input string) string {
-	if completePattern.MatchString(input) {
-		return input
+var fieldNameComponentRe = regexp.MustCompile("^" + headFieldNamePattern.String() + tailFieldNamePattern.String() + "*")
+var fieldNameRe = regexp.MustCompile(fieldNameComponentRe.String() + "$")
+
+type encodingFunc func(string, *regexp.Regexp) string
+
+func CamelCaseFieldName(input string) string {
+	//strip invalid struct characters and leave spaces
+	encode := func(s1 string, p *regexp.Regexp) string {
+		if p.MatchString(s1) || spaceRegex.MatchString(s1) {
+			return s1
+		}
+		return ""
 	}

+	strippedField := escapeField(input, encode)
+	splitField := strings.Fields(strippedField)
+
+	if len(splitField) == 0 {
+		return ""
+	}
+
+	//Camelcase field
+	output := strings.ToLower(splitField[0])
+	if len(splitField) > 1 {
+		for _, field := range splitField[1:] {
+			output += strings.Title(strings.ToLower(field))
+		}
+	}
+	// Because characters are removed, the result may still be an invalid field name.
+	// For example, "-- 1A B" is stripped and camel-cased to "1aB", which starts with a digit.
+	// "1aB" is not a valid struct field name, so "" is returned in that case.
+	if !IsValidStructFieldName(output) {
+		return ""
+	}
+	return output
+}
+
+func escapeField(input string, encode encodingFunc) string {
+	output := ""
+	pattern := headFieldNamePattern
+	for _, ch := range input {
+		output += encode(string([]rune{ch}), pattern)
+		pattern = tailFieldNamePattern
+	}
+	return output
+}
+
+// EscapeStructField escapes names for use as noms struct field names, for data that is not imported from CSV.
+// Disallowed characters are encoded as 'Q<hex-encoded-utf8-bytes>'.
+// Note that Q itself is also escaped since it is the escape character.
+func EscapeStructField(input string) string {
+	if !escapeRegex.MatchString(input) && IsValidStructFieldName(input) {
+		return input
+	}
 	encode := func(s1 string, p *regexp.Regexp) string {
 		if p.MatchString(s1) && s1 != escapeChar {
 			return s1
@@ -195,13 +243,42 @@ func EscapeStructField(input string) string {
 		buf.WriteString(hs)
 		return buf.String()
 	}
+	return escapeField(input, encode)
+}

-	output := ""
-	pattern := headPattern
-	for _, ch := range input {
-		output += encode(string([]rune{ch}), pattern)
-		pattern = tailPattern
+// IsValidStructFieldName returns whether the name is valid as a field name in a struct.
+// Valid names must start with `a-zA-Z` and after that `a-zA-Z0-9_`.
+func IsValidStructFieldName(name string) bool {
+	return fieldNameRe.MatchString(name)
+}
+
+func verifyFieldNames(names []string) {
+	if len(names) == 0 {
+		return
 	}

-	return output
+	last := names[0]
+	verifyFieldName(last)
+
+	for i := 1; i < len(names); i++ {
+		verifyFieldName(names[i])
+		if strings.Compare(names[i], last) <= 0 {
+			d.Chk.Fail("Field names must be unique and ordered alphabetically")
+		}
+		last = names[i]
+	}
+}
+
+func verifyName(name, kind string) {
+	d.PanicIfTrue(!IsValidStructFieldName(name), `Invalid struct%s name: "%s"`, kind, name)
+}
+
+func verifyFieldName(name string) {
+	verifyName(name, " field")
+}
+
+func verifyStructName(name string) {
+	if name != "" {
+		verifyName(name, "")
+	}
 }
--- a/go/types/type.go
+++ b/go/types/type.go
@@ -6,8 +6,6 @@
 package types

 import (
-	"regexp"
-
 	"github.com/attic-labs/noms/go/d"
 	"github.com/attic-labs/noms/go/hash"
 )
@@ -148,43 +146,3 @@ func MakePrimitiveTypeByString(p string) *Type {
 	d.Chk.Fail("invalid type string: %s", p)
 	return nil
 }
-
-var fieldNameComponentRe = regexp.MustCompile(`^[a-zA-Z][a-zA-Z0-9_]*`)
-var fieldNameRe = regexp.MustCompile(fieldNameComponentRe.String() + "$")
-
-func verifyFieldNames(names []string) {
-	if len(names) == 0 {
-		return
-	}
-
-	last := names[0]
-	verifyFieldName(last)
-
-	for i := 1; i < len(names); i++ {
-		verifyFieldName(names[i])
-		if names[i] <= last {
-			d.Chk.Fail("Field names must be unique and ordered alphabetically")
-		}
-		last = names[i]
-	}
-}
-
-// IsValidStructFieldName returns whether the name is valid without as a field name in a struct.
-// Valid names must start with `a-zA-Z` and after that `a-zA-Z0-9_`.
-func IsValidStructFieldName(name string) bool {
-	return fieldNameRe.MatchString(name)
-}
-
-func verifyName(name, kind string) {
-	d.PanicIfTrue(!IsValidStructFieldName(name), `Invalid struct%s name: "%s"`, kind, name)
-}
-
-func verifyFieldName(name string) {
-	verifyName(name, " field")
-}
-
-func verifyStructName(name string) {
-	if name != "" {
-		verifyName(name, "")
-	}
-}
--- a/samples/go/csv/read.go
+++ b/samples/go/csv/read.go
@@ -44,6 +44,14 @@ func KindsToStrings(kinds KindSlice) []string {
 	return strs
 }

+// EscapeStructFieldFromCSV returns already-valid struct field names unchanged; otherwise it removes invalid characters and camel-cases the words separated by spaces (e.g. "camel case" becomes "camelCase").
+func EscapeStructFieldFromCSV(input string) string {
+	if types.IsValidStructFieldName(input) {
+		return input
+	}
+	return types.CamelCaseFieldName(input)
+}
+
 // MakeStructTypeFromHeaders creates a struct type from the headers using |kinds| as the type of each field. If |kinds| is empty, default to strings.
 func MakeStructTypeFromHeaders(headers []string, structName string, kinds KindSlice) (typ *types.Type, fieldOrder []int, kindMap []types.NomsKind) {
 	useStringType := len(kinds) == 0
@@ -54,7 +62,7 @@ func MakeStructTypeFromHeaders(headers []string, structName string, kinds KindSl
 	fieldNames := make(sort.StringSlice, len(headers))

 	for i, key := range headers {
-		fn := types.EscapeStructField(key)
+		fn := EscapeStructFieldFromCSV(key)
 		origOrder[fn] = i
 		kind := types.StringKind
 		if !useStringType {
--- a/samples/go/csv/read_test.go
+++ b/samples/go/csv/read_test.go
@@ -134,6 +134,31 @@ g,h,i,j
 	testTrailingHelper(t, dataString)
 }

+func TestEscapeStructFieldFromCSV(t *testing.T) {
+	assert := assert.New(t)
+	cases := []string{
+		"a", "a",
+		"1a", "a",
+		"AaZz19_", "AaZz19_",
+		"Q", "Q",
+		"AQ", "AQ",
+		"_content", "content",
+		"Few ¢ents Short", "fewEntsShort",
+		"CAMEL💩case letTerS", "camelcaseLetters",
+		"https://picasaweb.google.com/data", "httpspicasawebgooglecomdata",
+		"💩", "",
+		"11 1💩", "",
+		"-- A B", "aB",
+		"-- A --", "a",
+		"-- A -- B", "aB",
+	}
+
+	for i := 0; i < len(cases); i += 2 {
+		orig, expected := cases[i], cases[i+1]
+		assert.Equal(expected, EscapeStructFieldFromCSV(orig))
+	}
+}
+
 func TestReadParseError(t *testing.T) {
 	assert := assert.New(t)
 	ds := datas.NewDatabase(chunks.NewMemoryStore())
@@ -174,12 +199,12 @@ func TestEscapeFieldNames(t *testing.T) {

 	l, _ := ReadToList(r, "test", headers, kinds, ds)
 	assert.Equal(uint64(1), l.Len())
-	assert.Equal(types.Number(1), l.Get(0).(types.Struct).Get(types.EscapeStructField("A A")))
+	assert.Equal(types.Number(1), l.Get(0).(types.Struct).Get(EscapeStructFieldFromCSV("A A")))

 	r = NewCSVReader(bytes.NewBufferString(dataString), ',')
 	m := ReadToMap(r, "test", headers, []string{"1"}, kinds, ds)
 	assert.Equal(uint64(1), l.Len())
-	assert.Equal(types.Number(1), m.Get(types.Number(2)).(types.Struct).Get(types.EscapeStructField("A A")))
+	assert.Equal(types.Number(1), m.Get(types.Number(2)).(types.Struct).Get(EscapeStructFieldFromCSV("A A")))
 }

 func TestDefaults(t *testing.T) {