diff --git a/go/libraries/doltcore/mvdata/data_loc.go b/go/libraries/doltcore/mvdata/data_loc.go index 8109b8a9f1..ebe8617936 100644 --- a/go/libraries/doltcore/mvdata/data_loc.go +++ b/go/libraries/doltcore/mvdata/data_loc.go @@ -141,7 +141,7 @@ func (dl *DataLocation) CreateReader(ctx context.Context, root *doltdb.RootValue return rd, false, err case PsvFile: - rd, err := csv.OpenCSVReader(root.VRW().Format(), dl.Path, fs, csv.NewCSVInfo().SetDelim('|')) + rd, err := csv.OpenCSVReader(root.VRW().Format(), dl.Path, fs, csv.NewCSVInfo().SetDelim("|")) return rd, false, err case XlsxFile: @@ -189,7 +189,7 @@ func (dl *DataLocation) CreateOverwritingDataWriter(ctx context.Context, mvOpts case CsvFile: return csv.OpenCSVWriter(dl.Path, fs, outSch, csv.NewCSVInfo()) case PsvFile: - return csv.OpenCSVWriter(dl.Path, fs, outSch, csv.NewCSVInfo().SetDelim('|')) + return csv.OpenCSVWriter(dl.Path, fs, outSch, csv.NewCSVInfo().SetDelim("|")) case XlsxFile: return xlsx.OpenXLSXWriter(dl.Path, fs, outSch, xlsx.NewXLSXInfo()) case JsonFile: diff --git a/go/libraries/doltcore/table/pipeline/transform_test.go b/go/libraries/doltcore/table/pipeline/transform_test.go index 32eae9f089..43ff5953f5 100644 --- a/go/libraries/doltcore/table/pipeline/transform_test.go +++ b/go/libraries/doltcore/table/pipeline/transform_test.go @@ -68,7 +68,7 @@ func TestPipeline(t *testing.T) { } func() { - csvInfo := &csv.CSVFileInfo{Delim: ',', HasHeaderLine: true, Columns: nil, EscapeQuotes: true} + csvInfo := &csv.CSVFileInfo{Delim: ",", HasHeaderLine: true, Columns: nil, EscapeQuotes: true} rd, _ := csv.NewCSVReader(types.Format_7_18, ioutil.NopCloser(buf), csvInfo) wr, _ := csv.NewCSVWriter(iohelp.NopWrCloser(outBuf), schOut, csvInfo) @@ -106,7 +106,7 @@ func TestAddingStages(t *testing.T) { } func() { - csvInfo := &csv.CSVFileInfo{Delim: ',', HasHeaderLine: true, Columns: nil, EscapeQuotes: true} + csvInfo := &csv.CSVFileInfo{Delim: ",", HasHeaderLine: true, Columns: nil, EscapeQuotes: true} rd, _ := csv.NewCSVReader(types.Format_7_18, ioutil.NopCloser(buf), csvInfo) wr, _ := csv.NewCSVWriter(iohelp.NopWrCloser(outBuf), schOut, csvInfo) @@ -176,11 +176,11 @@ Don,Beddoe,Bewitched (episode Humbug Not to Be Spoken Here - Season 4),1967,true Don,Beddoe,Bewitched (episode Humbug Not to Be Spoken Here - Season 4),1967,true,1` func() { - csvInfo := &csv.CSVFileInfo{Delim: ',', HasHeaderLine: true, Columns: nil, EscapeQuotes: true} + csvInfo := &csv.CSVFileInfo{Delim: ",", HasHeaderLine: true, Columns: nil, EscapeQuotes: true} rd, _ := csv.NewCSVReader(types.Format_7_18, ioutil.NopCloser(buf), csvInfo) wr, _ := csv.NewCSVWriter(iohelp.NopWrCloser(outBuf), schOut, csvInfo) - addedStages := []NamedTransform { + addedStages := []NamedTransform{ NewNamedTransform("identity", identityTransFunc), NewNamedTransform("label", labelTransFunc), NewNamedTransform("dupe", dupeTransFunc), @@ -262,7 +262,7 @@ func TestAbort(t *testing.T) { } func() { - csvInfo := &csv.CSVFileInfo{Delim: ',', HasHeaderLine: true, Columns: nil, EscapeQuotes: true} + csvInfo := &csv.CSVFileInfo{Delim: ",", HasHeaderLine: true, Columns: nil, EscapeQuotes: true} rd, _ := csv.NewCSVReader(types.Format_7_18, ioutil.NopCloser(buf), csvInfo) wr, _ := csv.NewCSVWriter(iohelp.NopWrCloser(outBuf), schOut, csvInfo) diff --git a/go/libraries/doltcore/table/untyped/csv/file_info.go b/go/libraries/doltcore/table/untyped/csv/file_info.go index cb98d02252..2c8ba7265a 100644 --- a/go/libraries/doltcore/table/untyped/csv/file_info.go +++ b/go/libraries/doltcore/table/untyped/csv/file_info.go @@ -3,7 +3,7 @@ package csv // CSVFileInfo describes a csv file type CSVFileInfo struct { // Delim says which character is used as a field delimiter - Delim rune + Delim string // HasHeaderLine says if the csv has a header line which contains the names of the columns HasHeaderLine bool // Columns can be provided if you no the columns and their order in the csv @@ -14,11 +14,11 @@ type CSVFileInfo struct { // NewCSVInfo creates a new CSVInfo struct with default values func NewCSVInfo() *CSVFileInfo { - return &CSVFileInfo{',', true, nil, true} + return &CSVFileInfo{",", true, nil, true} } // SetDelim sets the Delim member and returns the CSVFileInfo -func (info *CSVFileInfo) SetDelim(delim rune) *CSVFileInfo { +func (info *CSVFileInfo) SetDelim(delim string) *CSVFileInfo { info.Delim = delim return info } diff --git a/go/libraries/doltcore/table/untyped/csv/file_info_test.go b/go/libraries/doltcore/table/untyped/csv/file_info_test.go index 59f860a447..c752972fe8 100644 --- a/go/libraries/doltcore/table/untyped/csv/file_info_test.go +++ b/go/libraries/doltcore/table/untyped/csv/file_info_test.go @@ -8,18 +8,18 @@ import ( func TestCSVFileInfo(t *testing.T) { nfo := NewCSVInfo() - if nfo.Delim != ',' || nfo.HasHeaderLine != true || nfo.Columns != nil || !nfo.EscapeQuotes { + if nfo.Delim != "," || nfo.HasHeaderLine != true || nfo.Columns != nil || !nfo.EscapeQuotes { t.Error("Unexpected values") } testCols := []string{"c1,c2"} nfo = NewCSVInfo(). SetColumns(testCols). - SetDelim('|'). + SetDelim("|"). SetEscapeQuotes(false). SetHasHeaderLine(false) - if nfo.Delim != '|' || nfo.HasHeaderLine != false || !reflect.DeepEqual(nfo.Columns, testCols) || nfo.EscapeQuotes { + if nfo.Delim != "|" || nfo.HasHeaderLine != false || !reflect.DeepEqual(nfo.Columns, testCols) || nfo.EscapeQuotes { t.Error("Unexpected values") } } diff --git a/go/libraries/doltcore/table/untyped/csv/line.go b/go/libraries/doltcore/table/untyped/csv/line.go index a242680b1a..46187f4190 100644 --- a/go/libraries/doltcore/table/untyped/csv/line.go +++ b/go/libraries/doltcore/table/untyped/csv/line.go @@ -1,34 +1,66 @@ package csv -func csvSplitLine(str string, delim rune, escapedQuotes bool) []string { - var tokens []string +import ( + "math" + "strings" +) - quotations := 0 - escaped := false - start := 0 - for pos, c := range str { - if c == delim && !escaped { - tokens = appendToken(tokens, str, start, pos, quotations) - start = pos + 1 - quotations = 0 +func csvSplitLineRuneDelim(str string, delim rune, escapedQuotes bool) []string { + return csvSplitLine(str, string(delim), escapedQuotes) +} - if pos == len(str)-1 { - tokens = appendToken(tokens, "", 0, 0, 0) - } - } else if escapedQuotes && c == '"' { - escaped = !escaped - quotations++ - } +func csvSplitLine(str string, delim string, escapedQuotes bool) []string { + if strings.IndexRune(delim, '"') != -1 { + panic("delims cannot contain quotes") } - if start != len(str) { - tokens = appendToken(tokens, str, start, len(str), quotations) + var tokens []string + delimLen := len(delim) + + done := false + escaped := false + currPos := 0 + cellStart := 0 + for !done { + remainingStr := str[currPos:] + nextQuote := strings.Index(remainingStr, "\"") + nextDelim := strings.Index(remainingStr, delim) + + if nextQuote == -1 || !escapedQuotes { + nextQuote = math.MaxInt32 + } + + if !escaped && nextDelim < nextQuote { + if nextDelim == -1 { + nextDelim = len(remainingStr) + done = true + } + + tokens = appendToken(tokens, str, cellStart, currPos+nextDelim, escapedQuotes) + cellStart = currPos + nextDelim + delimLen + currPos = cellStart + } else if escapedQuotes && nextQuote != -1 { + escaped = !escaped + currPos += nextQuote + 1 + } else { + break + } } return tokens } -func appendToken(tokens []string, line string, start, pos, quotations int) []string { +func appendToken(tokens []string, line string, start, pos int, escapedQuotes bool) []string { + quotations := 0 + + if escapedQuotes { + for _, c := range line { + if c == '"' { + quotations++ + } + } + } + if start == pos { return append(tokens, "") } diff --git a/go/libraries/doltcore/table/untyped/csv/line_test.go b/go/libraries/doltcore/table/untyped/csv/line_test.go index d12318f22d..758c8af0fd 100644 --- a/go/libraries/doltcore/table/untyped/csv/line_test.go +++ b/go/libraries/doltcore/table/untyped/csv/line_test.go @@ -5,34 +5,36 @@ import "testing" func TestCSVSplitLine(t *testing.T) { splitTests := []struct { ToSplit string - Delim rune + Delim string expectedTokens []string escapeQuotes bool }{ - {``, ',', []string{}, true}, - {`one`, ',', []string{"one"}, true}, - {`one,`, ',', []string{"one", ""}, true}, - {`one,two, three`, ',', []string{"one", "two", "three"}, true}, - {`one,"two", three`, ',', []string{"one", "two", "three"}, true}, - {`one," two", three`, ',', []string{"one", " two", "three"}, true}, - {`one," two", three`, ',', []string{"one", `" two"`, "three"}, false}, - {`one,"two, three"`, ',', []string{"one", "two, three"}, true}, - {`one,""two three""`, ',', []string{"one", `"two three"`}, true}, - {`one,"two, ""three""`, ',', []string{"one", `two, "three"`}, true}, - {`brian ""the great"" hendriks,mr.,1.7526`, ',', []string{`brian "the great" hendriks`, "mr.", "1.7526"}, true}, - {`col1,"Industriepark ""De Bruwaan""",col3`, ',', []string{"col1", `Industriepark "De Bruwaan"`, "col3"}, true}, - {`|a|`, '|', []string{"", "a", ""}, true}, - {`72470|30|0|40|0||||`, '|', []string{"72470", "30", "0", "40", "0", "", "", "", ""}, true}, - {`"one","two"`, ',', []string{`"one"`, `"two"`}, false}, - {`"one","two"`, ',', []string{`one`, `two`}, true}, - {`one, two`, ',', []string{`one`, `two`}, true}, - {`one," two"`, ',', []string{`one`, ` two`}, true}, + {``, ",", []string{""}, true}, + {`one`, ",", []string{"one"}, true}, + {`one,`, ",", []string{"one", ""}, true}, + {`one,two, three`, ",", []string{"one", "two", "three"}, true}, + {`one,"two", three`, ",", []string{"one", "two", "three"}, true}, + {`one," two", three`, ",", []string{"one", " two", "three"}, true}, + {`one," two", three`, ",", []string{"one", `" two"`, "three"}, false}, + {`one,"two, three"`, ",", []string{"one", "two, three"}, true}, + {`one,""two three""`, ",", []string{"one", `"two three"`}, true}, + {`one,"two, ""three""`, ",", []string{"one", `two, "three"`}, true}, + {`brian ""the great"" hendriks,mr.,1.7526`, ",", []string{`brian "the great" hendriks`, "mr.", "1.7526"}, true}, + {`col1,"Industriepark ""De Bruwaan""",col3`, ",", []string{"col1", `Industriepark "De Bruwaan"`, "col3"}, true}, + {`|a|`, "|", []string{"", "a", ""}, true}, + {`72470|30|0|40|0||||`, "|", []string{"72470", "30", "0", "40", "0", "", "", "", ""}, true}, + {`"one","two"`, ",", []string{`"one"`, `"two"`}, false}, + {`"one","two"`, ",", []string{`one`, `two`}, true}, + {`one, two`, ",", []string{`one`, `two`}, true}, + {`one," two"`, ",", []string{`one`, ` two`}, true}, { `23660|1300|"Beef, brisket, flat half, separable lean and fat, trimmed to 1/8"""`, - '|', + "|", []string{"23660", "1300", `Beef, brisket, flat half, separable lean and fat, trimmed to 1/8"`}, true, }, + {`72470300400""`, "", []string{"72470", "30", "0", "40", "0", "", "", "", ""}, true}, + {`72470300400""""""`, "", []string{"72470", "30", "0", "40", "0", `""`, "", "", ""}, true}, } for _, test := range splitTests {