mirror of
https://github.com/dolthub/dolt.git
synced 2026-01-24 03:09:22 -06:00
Add special encoding to csv imported struct fields (#2441)
CSV importing now strips characters that are invalid in struct field names from CSV field names and camel-cases space-separated words, e.g. "ca-mel case" is translated to "camelCase".
This commit is contained in:
@@ -9,6 +9,7 @@ import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/attic-labs/noms/go/d"
|
||||
"github.com/attic-labs/noms/go/hash"
|
||||
@@ -169,18 +170,65 @@ func (s1 Struct) Diff(s2 Struct, changes chan<- ValueChanged, closeChan <-chan s
|
||||
}
|
||||
|
||||
var escapeChar = "Q"
|
||||
var headPattern = regexp.MustCompile("[a-zA-PR-Z]")
|
||||
var tailPattern = regexp.MustCompile("[a-zA-PR-Z0-9_]")
|
||||
var completePattern = regexp.MustCompile("^" + headPattern.String() + tailPattern.String() + "*$")
|
||||
var headFieldNamePattern = regexp.MustCompile("[a-zA-Z]")
|
||||
var tailFieldNamePattern = regexp.MustCompile("[a-zA-Z0-9_]")
|
||||
var spaceRegex = regexp.MustCompile("[ ]")
|
||||
var escapeRegex = regexp.MustCompile(escapeChar)
|
||||
|
||||
// Escapes names for use as noms structs. Disallowed characters are encoded as
|
||||
// 'Q<hex-encoded-utf8-bytes>'. Note that Q itself is also escaped since it is
|
||||
// the escape character.
|
||||
func EscapeStructField(input string) string {
|
||||
if completePattern.MatchString(input) {
|
||||
return input
|
||||
var fieldNameComponentRe = regexp.MustCompile("^" + headFieldNamePattern.String() + tailFieldNamePattern.String() + "*")
|
||||
var fieldNameRe = regexp.MustCompile(fieldNameComponentRe.String() + "$")
|
||||
|
||||
type encodingFunc func(string, *regexp.Regexp) string
|
||||
|
||||
func CamelCaseFieldName(input string) string {
|
||||
//strip invalid struct characters and leave spaces
|
||||
encode := func(s1 string, p *regexp.Regexp) string {
|
||||
if p.MatchString(s1) || spaceRegex.MatchString(s1) {
|
||||
return s1
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
strippedField := escapeField(input, encode)
|
||||
splitField := strings.Fields(strippedField)
|
||||
|
||||
if len(splitField) == 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
//Camelcase field
|
||||
output := strings.ToLower(splitField[0])
|
||||
if len(splitField) > 1 {
|
||||
for _, field := range splitField[1:] {
|
||||
output += strings.Title(strings.ToLower(field))
|
||||
}
|
||||
}
|
||||
//Because we are removing characters, we may generate an invalid field name
|
||||
//i.e. -- 1A B, we will remove the first bad chars and process until 1aB
|
||||
//1aB is invalid struct field name so we will return ""
|
||||
if !IsValidStructFieldName(output) {
|
||||
return ""
|
||||
}
|
||||
return output
|
||||
}
|
||||
|
||||
func escapeField(input string, encode encodingFunc) string {
|
||||
output := ""
|
||||
pattern := headFieldNamePattern
|
||||
for _, ch := range input {
|
||||
output += encode(string([]rune{ch}), pattern)
|
||||
pattern = tailFieldNamePattern
|
||||
}
|
||||
return output
|
||||
}
|
||||
|
||||
// EscapeStructField escapes names for use as noms structs with regards to non CSV imported data.
|
||||
// Disallowed characters are encoded as 'Q<hex-encoded-utf8-bytes>'.
|
||||
// Note that Q itself is also escaped since it is the escape character.
|
||||
func EscapeStructField(input string) string {
|
||||
if !escapeRegex.MatchString(input) && IsValidStructFieldName(input) {
|
||||
return input
|
||||
}
|
||||
encode := func(s1 string, p *regexp.Regexp) string {
|
||||
if p.MatchString(s1) && s1 != escapeChar {
|
||||
return s1
|
||||
@@ -195,13 +243,42 @@ func EscapeStructField(input string) string {
|
||||
buf.WriteString(hs)
|
||||
return buf.String()
|
||||
}
|
||||
return escapeField(input, encode)
|
||||
}
|
||||
|
||||
output := ""
|
||||
pattern := headPattern
|
||||
for _, ch := range input {
|
||||
output += encode(string([]rune{ch}), pattern)
|
||||
pattern = tailPattern
|
||||
// IsValidStructFieldName returns whether the name is valid as a field name in a struct.
|
||||
// Valid names must start with `a-zA-Z` and after that `a-zA-Z0-9_`.
|
||||
func IsValidStructFieldName(name string) bool {
|
||||
return fieldNameRe.MatchString(name)
|
||||
}
|
||||
|
||||
func verifyFieldNames(names []string) {
|
||||
if len(names) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
return output
|
||||
last := names[0]
|
||||
verifyFieldName(last)
|
||||
|
||||
for i := 1; i < len(names); i++ {
|
||||
verifyFieldName(names[i])
|
||||
if strings.Compare(names[i], last) <= 0 {
|
||||
d.Chk.Fail("Field names must be unique and ordered alphabetically")
|
||||
}
|
||||
last = names[i]
|
||||
}
|
||||
}
|
||||
|
||||
func verifyName(name, kind string) {
|
||||
d.PanicIfTrue(!IsValidStructFieldName(name), `Invalid struct%s name: "%s"`, kind, name)
|
||||
}
|
||||
|
||||
func verifyFieldName(name string) {
|
||||
verifyName(name, " field")
|
||||
}
|
||||
|
||||
func verifyStructName(name string) {
|
||||
if name != "" {
|
||||
verifyName(name, "")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,8 +6,6 @@
|
||||
package types
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
|
||||
"github.com/attic-labs/noms/go/d"
|
||||
"github.com/attic-labs/noms/go/hash"
|
||||
)
|
||||
@@ -148,43 +146,3 @@ func MakePrimitiveTypeByString(p string) *Type {
|
||||
d.Chk.Fail("invalid type string: %s", p)
|
||||
return nil
|
||||
}
|
||||
|
||||
var fieldNameComponentRe = regexp.MustCompile(`^[a-zA-Z][a-zA-Z0-9_]*`)
|
||||
var fieldNameRe = regexp.MustCompile(fieldNameComponentRe.String() + "$")
|
||||
|
||||
func verifyFieldNames(names []string) {
|
||||
if len(names) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
last := names[0]
|
||||
verifyFieldName(last)
|
||||
|
||||
for i := 1; i < len(names); i++ {
|
||||
verifyFieldName(names[i])
|
||||
if names[i] <= last {
|
||||
d.Chk.Fail("Field names must be unique and ordered alphabetically")
|
||||
}
|
||||
last = names[i]
|
||||
}
|
||||
}
|
||||
|
||||
// IsValidStructFieldName returns whether the name is valid without as a field name in a struct.
|
||||
// Valid names must start with `a-zA-Z` and after that `a-zA-Z0-9_`.
|
||||
func IsValidStructFieldName(name string) bool {
|
||||
return fieldNameRe.MatchString(name)
|
||||
}
|
||||
|
||||
func verifyName(name, kind string) {
|
||||
d.PanicIfTrue(!IsValidStructFieldName(name), `Invalid struct%s name: "%s"`, kind, name)
|
||||
}
|
||||
|
||||
func verifyFieldName(name string) {
|
||||
verifyName(name, " field")
|
||||
}
|
||||
|
||||
func verifyStructName(name string) {
|
||||
if name != "" {
|
||||
verifyName(name, "")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,6 +44,14 @@ func KindsToStrings(kinds KindSlice) []string {
|
||||
return strs
|
||||
}
|
||||
|
||||
//EscapeStructFieldFromCSV removes special characters and replaces spaces with camelCasing (camel case turns to camelCase)
|
||||
func EscapeStructFieldFromCSV(input string) string {
|
||||
if types.IsValidStructFieldName(input) {
|
||||
return input
|
||||
}
|
||||
return types.CamelCaseFieldName(input)
|
||||
}
|
||||
|
||||
// MakeStructTypeFromHeaders creates a struct type from the headers using |kinds| as the type of each field. If |kinds| is empty, default to strings.
|
||||
func MakeStructTypeFromHeaders(headers []string, structName string, kinds KindSlice) (typ *types.Type, fieldOrder []int, kindMap []types.NomsKind) {
|
||||
useStringType := len(kinds) == 0
|
||||
@@ -54,7 +62,7 @@ func MakeStructTypeFromHeaders(headers []string, structName string, kinds KindSl
|
||||
fieldNames := make(sort.StringSlice, len(headers))
|
||||
|
||||
for i, key := range headers {
|
||||
fn := types.EscapeStructField(key)
|
||||
fn := EscapeStructFieldFromCSV(key)
|
||||
origOrder[fn] = i
|
||||
kind := types.StringKind
|
||||
if !useStringType {
|
||||
|
||||
@@ -134,6 +134,31 @@ g,h,i,j
|
||||
testTrailingHelper(t, dataString)
|
||||
}
|
||||
|
||||
func TestEscapeStructFieldFromCSV(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
cases := []string{
|
||||
"a", "a",
|
||||
"1a", "a",
|
||||
"AaZz19_", "AaZz19_",
|
||||
"Q", "Q",
|
||||
"AQ", "AQ",
|
||||
"_content", "content",
|
||||
"Few ¢ents Short", "fewEntsShort",
|
||||
"CAMEL💩case letTerS", "camelcaseLetters",
|
||||
"https://picasaweb.google.com/data", "httpspicasawebgooglecomdata",
|
||||
"💩", "",
|
||||
"11 1💩", "",
|
||||
"-- A B", "aB",
|
||||
"-- A --", "a",
|
||||
"-- A -- B", "aB",
|
||||
}
|
||||
|
||||
for i := 0; i < len(cases); i += 2 {
|
||||
orig, expected := cases[i], cases[i+1]
|
||||
assert.Equal(expected, EscapeStructFieldFromCSV(orig))
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadParseError(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
ds := datas.NewDatabase(chunks.NewMemoryStore())
|
||||
@@ -174,12 +199,12 @@ func TestEscapeFieldNames(t *testing.T) {
|
||||
|
||||
l, _ := ReadToList(r, "test", headers, kinds, ds)
|
||||
assert.Equal(uint64(1), l.Len())
|
||||
assert.Equal(types.Number(1), l.Get(0).(types.Struct).Get(types.EscapeStructField("A A")))
|
||||
assert.Equal(types.Number(1), l.Get(0).(types.Struct).Get(EscapeStructFieldFromCSV("A A")))
|
||||
|
||||
r = NewCSVReader(bytes.NewBufferString(dataString), ',')
|
||||
m := ReadToMap(r, "test", headers, []string{"1"}, kinds, ds)
|
||||
assert.Equal(uint64(1), l.Len())
|
||||
assert.Equal(types.Number(1), m.Get(types.Number(2)).(types.Struct).Get(types.EscapeStructField("A A")))
|
||||
assert.Equal(types.Number(1), m.Get(types.Number(2)).(types.Struct).Get(EscapeStructFieldFromCSV("A A")))
|
||||
}
|
||||
|
||||
func TestDefaults(t *testing.T) {
|
||||
|
||||
Reference in New Issue
Block a user