Add special encoding to csv imported struct fields (#2441)

CSV importing is changed to strip invalid characters from csv fields 
and camel case spaces. i.e. ca-mel case is translated to camelCase.
This commit is contained in:
zcstarr
2016-08-30 14:59:10 -07:00
committed by GitHub
parent cba7a4f118
commit aeb5c42bcc
4 changed files with 128 additions and 60 deletions

View File

@@ -9,6 +9,7 @@ import (
"fmt"
"regexp"
"sort"
"strings"
"github.com/attic-labs/noms/go/d"
"github.com/attic-labs/noms/go/hash"
@@ -169,18 +170,65 @@ func (s1 Struct) Diff(s2 Struct, changes chan<- ValueChanged, closeChan <-chan s
}
var escapeChar = "Q"
var headPattern = regexp.MustCompile("[a-zA-PR-Z]")
var tailPattern = regexp.MustCompile("[a-zA-PR-Z0-9_]")
var completePattern = regexp.MustCompile("^" + headPattern.String() + tailPattern.String() + "*$")
var headFieldNamePattern = regexp.MustCompile("[a-zA-Z]")
var tailFieldNamePattern = regexp.MustCompile("[a-zA-Z0-9_]")
var spaceRegex = regexp.MustCompile("[ ]")
var escapeRegex = regexp.MustCompile(escapeChar)
// Escapes names for use as noms structs. Disallowed characters are encoded as
// 'Q<hex-encoded-utf8-bytes>'. Note that Q itself is also escaped since it is
// the escape character.
func EscapeStructField(input string) string {
if completePattern.MatchString(input) {
return input
var fieldNameComponentRe = regexp.MustCompile("^" + headFieldNamePattern.String() + tailFieldNamePattern.String() + "*")
var fieldNameRe = regexp.MustCompile(fieldNameComponentRe.String() + "$")
type encodingFunc func(string, *regexp.Regexp) string
func CamelCaseFieldName(input string) string {
//strip invalid struct characters and leave spaces
encode := func(s1 string, p *regexp.Regexp) string {
if p.MatchString(s1) || spaceRegex.MatchString(s1) {
return s1
}
return ""
}
strippedField := escapeField(input, encode)
splitField := strings.Fields(strippedField)
if len(splitField) == 0 {
return ""
}
//Camelcase field
output := strings.ToLower(splitField[0])
if len(splitField) > 1 {
for _, field := range splitField[1:] {
output += strings.Title(strings.ToLower(field))
}
}
//Because we are removing characters, we may generate an invalid field name
//i.e. -- 1A B, we will remove the first bad chars and process until 1aB
//1aB is invalid struct field name so we will return ""
if !IsValidStructFieldName(output) {
return ""
}
return output
}
func escapeField(input string, encode encodingFunc) string {
output := ""
pattern := headFieldNamePattern
for _, ch := range input {
output += encode(string([]rune{ch}), pattern)
pattern = tailFieldNamePattern
}
return output
}
// EscapeStructField escapes names for use as noms structs with regards to non CSV imported data.
// Disallowed characters are encoded as 'Q<hex-encoded-utf8-bytes>'.
// Note that Q itself is also escaped since it is the escape character.
func EscapeStructField(input string) string {
if !escapeRegex.MatchString(input) && IsValidStructFieldName(input) {
return input
}
encode := func(s1 string, p *regexp.Regexp) string {
if p.MatchString(s1) && s1 != escapeChar {
return s1
@@ -195,13 +243,42 @@ func EscapeStructField(input string) string {
buf.WriteString(hs)
return buf.String()
}
return escapeField(input, encode)
}
output := ""
pattern := headPattern
for _, ch := range input {
output += encode(string([]rune{ch}), pattern)
pattern = tailPattern
// IsValidStructFieldName returns whether the name is valid as a field name in a struct.
// Valid names must start with `a-zA-Z` and after that `a-zA-Z0-9_`.
func IsValidStructFieldName(name string) bool {
return fieldNameRe.MatchString(name)
}
func verifyFieldNames(names []string) {
if len(names) == 0 {
return
}
return output
last := names[0]
verifyFieldName(last)
for i := 1; i < len(names); i++ {
verifyFieldName(names[i])
if strings.Compare(names[i], last) <= 0 {
d.Chk.Fail("Field names must be unique and ordered alphabetically")
}
last = names[i]
}
}
func verifyName(name, kind string) {
d.PanicIfTrue(!IsValidStructFieldName(name), `Invalid struct%s name: "%s"`, kind, name)
}
func verifyFieldName(name string) {
verifyName(name, " field")
}
func verifyStructName(name string) {
if name != "" {
verifyName(name, "")
}
}

View File

@@ -6,8 +6,6 @@
package types
import (
"regexp"
"github.com/attic-labs/noms/go/d"
"github.com/attic-labs/noms/go/hash"
)
@@ -148,43 +146,3 @@ func MakePrimitiveTypeByString(p string) *Type {
d.Chk.Fail("invalid type string: %s", p)
return nil
}
var fieldNameComponentRe = regexp.MustCompile(`^[a-zA-Z][a-zA-Z0-9_]*`)
var fieldNameRe = regexp.MustCompile(fieldNameComponentRe.String() + "$")
func verifyFieldNames(names []string) {
if len(names) == 0 {
return
}
last := names[0]
verifyFieldName(last)
for i := 1; i < len(names); i++ {
verifyFieldName(names[i])
if names[i] <= last {
d.Chk.Fail("Field names must be unique and ordered alphabetically")
}
last = names[i]
}
}
// IsValidStructFieldName returns whether the name is valid without as a field name in a struct.
// Valid names must start with `a-zA-Z` and after that `a-zA-Z0-9_`.
func IsValidStructFieldName(name string) bool {
return fieldNameRe.MatchString(name)
}
func verifyName(name, kind string) {
d.PanicIfTrue(!IsValidStructFieldName(name), `Invalid struct%s name: "%s"`, kind, name)
}
func verifyFieldName(name string) {
verifyName(name, " field")
}
func verifyStructName(name string) {
if name != "" {
verifyName(name, "")
}
}

View File

@@ -44,6 +44,14 @@ func KindsToStrings(kinds KindSlice) []string {
return strs
}
//EscapeStructFieldFromCSV removes special characters and replaces spaces with camelCasing (camel case turns to camelCase)
func EscapeStructFieldFromCSV(input string) string {
if types.IsValidStructFieldName(input) {
return input
}
return types.CamelCaseFieldName(input)
}
// MakeStructTypeFromHeaders creates a struct type from the headers using |kinds| as the type of each field. If |kinds| is empty, default to strings.
func MakeStructTypeFromHeaders(headers []string, structName string, kinds KindSlice) (typ *types.Type, fieldOrder []int, kindMap []types.NomsKind) {
useStringType := len(kinds) == 0
@@ -54,7 +62,7 @@ func MakeStructTypeFromHeaders(headers []string, structName string, kinds KindSl
fieldNames := make(sort.StringSlice, len(headers))
for i, key := range headers {
fn := types.EscapeStructField(key)
fn := EscapeStructFieldFromCSV(key)
origOrder[fn] = i
kind := types.StringKind
if !useStringType {

View File

@@ -134,6 +134,31 @@ g,h,i,j
testTrailingHelper(t, dataString)
}
func TestEscapeStructFieldFromCSV(t *testing.T) {
assert := assert.New(t)
cases := []string{
"a", "a",
"1a", "a",
"AaZz19_", "AaZz19_",
"Q", "Q",
"AQ", "AQ",
"_content", "content",
"Few ¢ents Short", "fewEntsShort",
"CAMEL💩case letTerS", "camelcaseLetters",
"https://picasaweb.google.com/data", "httpspicasawebgooglecomdata",
"💩", "",
"11 1💩", "",
"-- A B", "aB",
"-- A --", "a",
"-- A -- B", "aB",
}
for i := 0; i < len(cases); i += 2 {
orig, expected := cases[i], cases[i+1]
assert.Equal(expected, EscapeStructFieldFromCSV(orig))
}
}
func TestReadParseError(t *testing.T) {
assert := assert.New(t)
ds := datas.NewDatabase(chunks.NewMemoryStore())
@@ -174,12 +199,12 @@ func TestEscapeFieldNames(t *testing.T) {
l, _ := ReadToList(r, "test", headers, kinds, ds)
assert.Equal(uint64(1), l.Len())
assert.Equal(types.Number(1), l.Get(0).(types.Struct).Get(types.EscapeStructField("A A")))
assert.Equal(types.Number(1), l.Get(0).(types.Struct).Get(EscapeStructFieldFromCSV("A A")))
r = NewCSVReader(bytes.NewBufferString(dataString), ',')
m := ReadToMap(r, "test", headers, []string{"1"}, kinds, ds)
assert.Equal(uint64(1), l.Len())
assert.Equal(types.Number(1), m.Get(types.Number(2)).(types.Struct).Get(types.EscapeStructField("A A")))
assert.Equal(types.Number(1), m.Get(types.Number(2)).(types.Struct).Get(EscapeStructFieldFromCSV("A A")))
}
func TestDefaults(t *testing.T) {