string delimiters for csv files

This commit is contained in:
Brian Hendriks
2019-07-18 17:54:28 -07:00
parent a68636c40b
commit a4bbac200b
6 changed files with 87 additions and 53 deletions

View File

@@ -141,7 +141,7 @@ func (dl *DataLocation) CreateReader(ctx context.Context, root *doltdb.RootValue
return rd, false, err
case PsvFile:
rd, err := csv.OpenCSVReader(root.VRW().Format(), dl.Path, fs, csv.NewCSVInfo().SetDelim('|'))
rd, err := csv.OpenCSVReader(root.VRW().Format(), dl.Path, fs, csv.NewCSVInfo().SetDelim("|"))
return rd, false, err
case XlsxFile:
@@ -189,7 +189,7 @@ func (dl *DataLocation) CreateOverwritingDataWriter(ctx context.Context, mvOpts
case CsvFile:
return csv.OpenCSVWriter(dl.Path, fs, outSch, csv.NewCSVInfo())
case PsvFile:
return csv.OpenCSVWriter(dl.Path, fs, outSch, csv.NewCSVInfo().SetDelim('|'))
return csv.OpenCSVWriter(dl.Path, fs, outSch, csv.NewCSVInfo().SetDelim("|"))
case XlsxFile:
return xlsx.OpenXLSXWriter(dl.Path, fs, outSch, xlsx.NewXLSXInfo())
case JsonFile:

View File

@@ -68,7 +68,7 @@ func TestPipeline(t *testing.T) {
}
func() {
csvInfo := &csv.CSVFileInfo{Delim: ',', HasHeaderLine: true, Columns: nil, EscapeQuotes: true}
csvInfo := &csv.CSVFileInfo{Delim: ",", HasHeaderLine: true, Columns: nil, EscapeQuotes: true}
rd, _ := csv.NewCSVReader(types.Format_7_18, ioutil.NopCloser(buf), csvInfo)
wr, _ := csv.NewCSVWriter(iohelp.NopWrCloser(outBuf), schOut, csvInfo)
@@ -106,7 +106,7 @@ func TestAddingStages(t *testing.T) {
}
func() {
csvInfo := &csv.CSVFileInfo{Delim: ',', HasHeaderLine: true, Columns: nil, EscapeQuotes: true}
csvInfo := &csv.CSVFileInfo{Delim: ",", HasHeaderLine: true, Columns: nil, EscapeQuotes: true}
rd, _ := csv.NewCSVReader(types.Format_7_18, ioutil.NopCloser(buf), csvInfo)
wr, _ := csv.NewCSVWriter(iohelp.NopWrCloser(outBuf), schOut, csvInfo)
@@ -176,11 +176,11 @@ Don,Beddoe,Bewitched (episode Humbug Not to Be Spoken Here - Season 4),1967,true
Don,Beddoe,Bewitched (episode Humbug Not to Be Spoken Here - Season 4),1967,true,1`
func() {
csvInfo := &csv.CSVFileInfo{Delim: ',', HasHeaderLine: true, Columns: nil, EscapeQuotes: true}
csvInfo := &csv.CSVFileInfo{Delim: ",", HasHeaderLine: true, Columns: nil, EscapeQuotes: true}
rd, _ := csv.NewCSVReader(types.Format_7_18, ioutil.NopCloser(buf), csvInfo)
wr, _ := csv.NewCSVWriter(iohelp.NopWrCloser(outBuf), schOut, csvInfo)
addedStages := []NamedTransform {
addedStages := []NamedTransform{
NewNamedTransform("identity", identityTransFunc),
NewNamedTransform("label", labelTransFunc),
NewNamedTransform("dupe", dupeTransFunc),
@@ -262,7 +262,7 @@ func TestAbort(t *testing.T) {
}
func() {
csvInfo := &csv.CSVFileInfo{Delim: ',', HasHeaderLine: true, Columns: nil, EscapeQuotes: true}
csvInfo := &csv.CSVFileInfo{Delim: ",", HasHeaderLine: true, Columns: nil, EscapeQuotes: true}
rd, _ := csv.NewCSVReader(types.Format_7_18, ioutil.NopCloser(buf), csvInfo)
wr, _ := csv.NewCSVWriter(iohelp.NopWrCloser(outBuf), schOut, csvInfo)

View File

@@ -3,7 +3,7 @@ package csv
// CSVFileInfo describes a csv file
type CSVFileInfo struct {
// Delim says which character is used as a field delimiter
Delim rune
Delim string
// HasHeaderLine says if the csv has a header line which contains the names of the columns
HasHeaderLine bool
// Columns can be provided if you know the columns and their order in the csv
@@ -14,11 +14,11 @@ type CSVFileInfo struct {
// NewCSVInfo creates a new CSVInfo struct with default values
func NewCSVInfo() *CSVFileInfo {
return &CSVFileInfo{',', true, nil, true}
return &CSVFileInfo{",", true, nil, true}
}
// SetDelim sets the Delim member and returns the CSVFileInfo
func (info *CSVFileInfo) SetDelim(delim rune) *CSVFileInfo {
func (info *CSVFileInfo) SetDelim(delim string) *CSVFileInfo {
info.Delim = delim
return info
}

View File

@@ -8,18 +8,18 @@ import (
func TestCSVFileInfo(t *testing.T) {
nfo := NewCSVInfo()
if nfo.Delim != ',' || nfo.HasHeaderLine != true || nfo.Columns != nil || !nfo.EscapeQuotes {
if nfo.Delim != "," || nfo.HasHeaderLine != true || nfo.Columns != nil || !nfo.EscapeQuotes {
t.Error("Unexpected values")
}
testCols := []string{"c1,c2"}
nfo = NewCSVInfo().
SetColumns(testCols).
SetDelim('|').
SetDelim("|").
SetEscapeQuotes(false).
SetHasHeaderLine(false)
if nfo.Delim != '|' || nfo.HasHeaderLine != false || !reflect.DeepEqual(nfo.Columns, testCols) || nfo.EscapeQuotes {
if nfo.Delim != "|" || nfo.HasHeaderLine != false || !reflect.DeepEqual(nfo.Columns, testCols) || nfo.EscapeQuotes {
t.Error("Unexpected values")
}
}

View File

@@ -1,34 +1,66 @@
package csv
func csvSplitLine(str string, delim rune, escapedQuotes bool) []string {
var tokens []string
import (
"math"
"strings"
)
quotations := 0
escaped := false
start := 0
for pos, c := range str {
if c == delim && !escaped {
tokens = appendToken(tokens, str, start, pos, quotations)
start = pos + 1
quotations = 0
func csvSplitLineRuneDelim(str string, delim rune, escapedQuotes bool) []string {
return csvSplitLine(str, string(delim), escapedQuotes)
}
if pos == len(str)-1 {
tokens = appendToken(tokens, "", 0, 0, 0)
}
} else if escapedQuotes && c == '"' {
escaped = !escaped
quotations++
}
func csvSplitLine(str string, delim string, escapedQuotes bool) []string {
if strings.IndexRune(delim, '"') != -1 {
panic("delims cannot contain quotes")
}
if start != len(str) {
tokens = appendToken(tokens, str, start, len(str), quotations)
var tokens []string
delimLen := len(delim)
done := false
escaped := false
currPos := 0
cellStart := 0
for !done {
remainingStr := str[currPos:]
nextQuote := strings.Index(remainingStr, "\"")
nextDelim := strings.Index(remainingStr, delim)
if nextQuote == -1 || !escapedQuotes {
nextQuote = math.MaxInt32
}
if !escaped && nextDelim < nextQuote {
if nextDelim == -1 {
nextDelim = len(remainingStr)
done = true
}
tokens = appendToken(tokens, str, cellStart, currPos+nextDelim, escapedQuotes)
cellStart = currPos + nextDelim + delimLen
currPos = cellStart
} else if escapedQuotes && nextQuote != -1 {
escaped = !escaped
currPos += nextQuote + 1
} else {
break
}
}
return tokens
}
func appendToken(tokens []string, line string, start, pos, quotations int) []string {
func appendToken(tokens []string, line string, start, pos int, escapedQuotes bool) []string {
quotations := 0
if escapedQuotes {
for _, c := range line {
if c == '"' {
quotations++
}
}
}
if start == pos {
return append(tokens, "")
}

View File

@@ -5,34 +5,36 @@ import "testing"
func TestCSVSplitLine(t *testing.T) {
splitTests := []struct {
ToSplit string
Delim rune
Delim string
expectedTokens []string
escapeQuotes bool
}{
{``, ',', []string{}, true},
{`one`, ',', []string{"one"}, true},
{`one,`, ',', []string{"one", ""}, true},
{`one,two, three`, ',', []string{"one", "two", "three"}, true},
{`one,"two", three`, ',', []string{"one", "two", "three"}, true},
{`one," two", three`, ',', []string{"one", " two", "three"}, true},
{`one," two", three`, ',', []string{"one", `" two"`, "three"}, false},
{`one,"two, three"`, ',', []string{"one", "two, three"}, true},
{`one,""two three""`, ',', []string{"one", `"two three"`}, true},
{`one,"two, ""three""`, ',', []string{"one", `two, "three"`}, true},
{`brian ""the great"" hendriks,mr.,1.7526`, ',', []string{`brian "the great" hendriks`, "mr.", "1.7526"}, true},
{`col1,"Industriepark ""De Bruwaan""",col3`, ',', []string{"col1", `Industriepark "De Bruwaan"`, "col3"}, true},
{`|a|`, '|', []string{"", "a", ""}, true},
{`72470|30|0|40|0||||`, '|', []string{"72470", "30", "0", "40", "0", "", "", "", ""}, true},
{`"one","two"`, ',', []string{`"one"`, `"two"`}, false},
{`"one","two"`, ',', []string{`one`, `two`}, true},
{`one, two`, ',', []string{`one`, `two`}, true},
{`one," two"`, ',', []string{`one`, ` two`}, true},
{``, ",", []string{""}, true},
{`one`, ",", []string{"one"}, true},
{`one,`, ",", []string{"one", ""}, true},
{`one,two, three`, ",", []string{"one", "two", "three"}, true},
{`one,"two", three`, ",", []string{"one", "two", "three"}, true},
{`one," two", three`, ",", []string{"one", " two", "three"}, true},
{`one," two", three`, ",", []string{"one", `" two"`, "three"}, false},
{`one,"two, three"`, ",", []string{"one", "two, three"}, true},
{`one,""two three""`, ",", []string{"one", `"two three"`}, true},
{`one,"two, ""three""`, ",", []string{"one", `two, "three"`}, true},
{`brian ""the great"" hendriks,mr.,1.7526`, ",", []string{`brian "the great" hendriks`, "mr.", "1.7526"}, true},
{`col1,"Industriepark ""De Bruwaan""",col3`, ",", []string{"col1", `Industriepark "De Bruwaan"`, "col3"}, true},
{`|a|`, "|", []string{"", "a", ""}, true},
{`72470|30|0|40|0||||`, "|", []string{"72470", "30", "0", "40", "0", "", "", "", ""}, true},
{`"one","two"`, ",", []string{`"one"`, `"two"`}, false},
{`"one","two"`, ",", []string{`one`, `two`}, true},
{`one, two`, ",", []string{`one`, `two`}, true},
{`one," two"`, ",", []string{`one`, ` two`}, true},
{
`23660|1300|"Beef, brisket, flat half, separable lean and fat, trimmed to 1/8"""`,
'|',
"|",
[]string{"23660", "1300", `Beef, brisket, flat half, separable lean and fat, trimmed to 1/8"`},
true,
},
{`72470<delim>30<delim>0<delim>40<delim>0<delim>"<delim>"<delim><delim><delim>`, "<delim>", []string{"72470", "30", "0", "40", "0", "<delim>", "", "", ""}, true},
{`72470<delim>30<delim>0<delim>40<delim>0<delim>"""<delim>"""<delim><delim><delim>`, "<delim>", []string{"72470", "30", "0", "40", "0", `"<delim>"`, "", "", ""}, true},
}
for _, test := range splitTests {