mirror of
https://github.com/dolthub/dolt.git
synced 2026-01-07 16:17:37 -06:00
string delimiters for csv files
This commit is contained in:
@@ -141,7 +141,7 @@ func (dl *DataLocation) CreateReader(ctx context.Context, root *doltdb.RootValue
|
||||
return rd, false, err
|
||||
|
||||
case PsvFile:
|
||||
rd, err := csv.OpenCSVReader(root.VRW().Format(), dl.Path, fs, csv.NewCSVInfo().SetDelim('|'))
|
||||
rd, err := csv.OpenCSVReader(root.VRW().Format(), dl.Path, fs, csv.NewCSVInfo().SetDelim("|"))
|
||||
return rd, false, err
|
||||
|
||||
case XlsxFile:
|
||||
@@ -189,7 +189,7 @@ func (dl *DataLocation) CreateOverwritingDataWriter(ctx context.Context, mvOpts
|
||||
case CsvFile:
|
||||
return csv.OpenCSVWriter(dl.Path, fs, outSch, csv.NewCSVInfo())
|
||||
case PsvFile:
|
||||
return csv.OpenCSVWriter(dl.Path, fs, outSch, csv.NewCSVInfo().SetDelim('|'))
|
||||
return csv.OpenCSVWriter(dl.Path, fs, outSch, csv.NewCSVInfo().SetDelim("|"))
|
||||
case XlsxFile:
|
||||
return xlsx.OpenXLSXWriter(dl.Path, fs, outSch, xlsx.NewXLSXInfo())
|
||||
case JsonFile:
|
||||
|
||||
@@ -68,7 +68,7 @@ func TestPipeline(t *testing.T) {
|
||||
}
|
||||
|
||||
func() {
|
||||
csvInfo := &csv.CSVFileInfo{Delim: ',', HasHeaderLine: true, Columns: nil, EscapeQuotes: true}
|
||||
csvInfo := &csv.CSVFileInfo{Delim: ",", HasHeaderLine: true, Columns: nil, EscapeQuotes: true}
|
||||
rd, _ := csv.NewCSVReader(types.Format_7_18, ioutil.NopCloser(buf), csvInfo)
|
||||
wr, _ := csv.NewCSVWriter(iohelp.NopWrCloser(outBuf), schOut, csvInfo)
|
||||
|
||||
@@ -106,7 +106,7 @@ func TestAddingStages(t *testing.T) {
|
||||
}
|
||||
|
||||
func() {
|
||||
csvInfo := &csv.CSVFileInfo{Delim: ',', HasHeaderLine: true, Columns: nil, EscapeQuotes: true}
|
||||
csvInfo := &csv.CSVFileInfo{Delim: ",", HasHeaderLine: true, Columns: nil, EscapeQuotes: true}
|
||||
rd, _ := csv.NewCSVReader(types.Format_7_18, ioutil.NopCloser(buf), csvInfo)
|
||||
wr, _ := csv.NewCSVWriter(iohelp.NopWrCloser(outBuf), schOut, csvInfo)
|
||||
|
||||
@@ -176,11 +176,11 @@ Don,Beddoe,Bewitched (episode Humbug Not to Be Spoken Here - Season 4),1967,true
|
||||
Don,Beddoe,Bewitched (episode Humbug Not to Be Spoken Here - Season 4),1967,true,1`
|
||||
|
||||
func() {
|
||||
csvInfo := &csv.CSVFileInfo{Delim: ',', HasHeaderLine: true, Columns: nil, EscapeQuotes: true}
|
||||
csvInfo := &csv.CSVFileInfo{Delim: ",", HasHeaderLine: true, Columns: nil, EscapeQuotes: true}
|
||||
rd, _ := csv.NewCSVReader(types.Format_7_18, ioutil.NopCloser(buf), csvInfo)
|
||||
wr, _ := csv.NewCSVWriter(iohelp.NopWrCloser(outBuf), schOut, csvInfo)
|
||||
|
||||
addedStages := []NamedTransform {
|
||||
addedStages := []NamedTransform{
|
||||
NewNamedTransform("identity", identityTransFunc),
|
||||
NewNamedTransform("label", labelTransFunc),
|
||||
NewNamedTransform("dupe", dupeTransFunc),
|
||||
@@ -262,7 +262,7 @@ func TestAbort(t *testing.T) {
|
||||
}
|
||||
|
||||
func() {
|
||||
csvInfo := &csv.CSVFileInfo{Delim: ',', HasHeaderLine: true, Columns: nil, EscapeQuotes: true}
|
||||
csvInfo := &csv.CSVFileInfo{Delim: ",", HasHeaderLine: true, Columns: nil, EscapeQuotes: true}
|
||||
rd, _ := csv.NewCSVReader(types.Format_7_18, ioutil.NopCloser(buf), csvInfo)
|
||||
wr, _ := csv.NewCSVWriter(iohelp.NopWrCloser(outBuf), schOut, csvInfo)
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ package csv
|
||||
// CSVFileInfo describes a csv file
|
||||
type CSVFileInfo struct {
|
||||
// Delim says which string is used as a field delimiter
|
||||
Delim rune
|
||||
Delim string
|
||||
// HasHeaderLine says if the csv has a header line which contains the names of the columns
|
||||
HasHeaderLine bool
|
||||
// Columns can be provided if you know the columns and their order in the csv
|
||||
@@ -14,11 +14,11 @@ type CSVFileInfo struct {
|
||||
|
||||
// NewCSVInfo creates a new CSVFileInfo struct with default values
|
||||
func NewCSVInfo() *CSVFileInfo {
|
||||
return &CSVFileInfo{',', true, nil, true}
|
||||
return &CSVFileInfo{",", true, nil, true}
|
||||
}
|
||||
|
||||
// SetDelim sets the Delim member and returns the CSVFileInfo
|
||||
func (info *CSVFileInfo) SetDelim(delim rune) *CSVFileInfo {
|
||||
func (info *CSVFileInfo) SetDelim(delim string) *CSVFileInfo {
|
||||
info.Delim = delim
|
||||
return info
|
||||
}
|
||||
|
||||
@@ -8,18 +8,18 @@ import (
|
||||
func TestCSVFileInfo(t *testing.T) {
|
||||
nfo := NewCSVInfo()
|
||||
|
||||
if nfo.Delim != ',' || nfo.HasHeaderLine != true || nfo.Columns != nil || !nfo.EscapeQuotes {
|
||||
if nfo.Delim != "," || nfo.HasHeaderLine != true || nfo.Columns != nil || !nfo.EscapeQuotes {
|
||||
t.Error("Unexpected values")
|
||||
}
|
||||
|
||||
testCols := []string{"c1,c2"}
|
||||
nfo = NewCSVInfo().
|
||||
SetColumns(testCols).
|
||||
SetDelim('|').
|
||||
SetDelim("|").
|
||||
SetEscapeQuotes(false).
|
||||
SetHasHeaderLine(false)
|
||||
|
||||
if nfo.Delim != '|' || nfo.HasHeaderLine != false || !reflect.DeepEqual(nfo.Columns, testCols) || nfo.EscapeQuotes {
|
||||
if nfo.Delim != "|" || nfo.HasHeaderLine != false || !reflect.DeepEqual(nfo.Columns, testCols) || nfo.EscapeQuotes {
|
||||
t.Error("Unexpected values")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,34 +1,66 @@
|
||||
package csv
|
||||
|
||||
func csvSplitLine(str string, delim rune, escapedQuotes bool) []string {
|
||||
var tokens []string
|
||||
import (
|
||||
"math"
|
||||
"strings"
|
||||
)
|
||||
|
||||
quotations := 0
|
||||
escaped := false
|
||||
start := 0
|
||||
for pos, c := range str {
|
||||
if c == delim && !escaped {
|
||||
tokens = appendToken(tokens, str, start, pos, quotations)
|
||||
start = pos + 1
|
||||
quotations = 0
|
||||
func csvSplitLineRuneDelim(str string, delim rune, escapedQuotes bool) []string {
|
||||
return csvSplitLine(str, string(delim), escapedQuotes)
|
||||
}
|
||||
|
||||
if pos == len(str)-1 {
|
||||
tokens = appendToken(tokens, "", 0, 0, 0)
|
||||
}
|
||||
} else if escapedQuotes && c == '"' {
|
||||
escaped = !escaped
|
||||
quotations++
|
||||
}
|
||||
func csvSplitLine(str string, delim string, escapedQuotes bool) []string {
|
||||
if strings.IndexRune(delim, '"') != -1 {
|
||||
panic("delims cannot contain quotes")
|
||||
}
|
||||
|
||||
if start != len(str) {
|
||||
tokens = appendToken(tokens, str, start, len(str), quotations)
|
||||
var tokens []string
|
||||
delimLen := len(delim)
|
||||
|
||||
done := false
|
||||
escaped := false
|
||||
currPos := 0
|
||||
cellStart := 0
|
||||
for !done {
|
||||
remainingStr := str[currPos:]
|
||||
nextQuote := strings.Index(remainingStr, "\"")
|
||||
nextDelim := strings.Index(remainingStr, delim)
|
||||
|
||||
if nextQuote == -1 || !escapedQuotes {
|
||||
nextQuote = math.MaxInt32
|
||||
}
|
||||
|
||||
if !escaped && nextDelim < nextQuote {
|
||||
if nextDelim == -1 {
|
||||
nextDelim = len(remainingStr)
|
||||
done = true
|
||||
}
|
||||
|
||||
tokens = appendToken(tokens, str, cellStart, currPos+nextDelim, escapedQuotes)
|
||||
cellStart = currPos + nextDelim + delimLen
|
||||
currPos = cellStart
|
||||
} else if escapedQuotes && nextQuote != -1 {
|
||||
escaped = !escaped
|
||||
currPos += nextQuote + 1
|
||||
} else {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return tokens
|
||||
}
|
||||
|
||||
func appendToken(tokens []string, line string, start, pos, quotations int) []string {
|
||||
func appendToken(tokens []string, line string, start, pos int, escapedQuotes bool) []string {
|
||||
quotations := 0
|
||||
|
||||
if escapedQuotes {
|
||||
for _, c := range line {
|
||||
if c == '"' {
|
||||
quotations++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if start == pos {
|
||||
return append(tokens, "")
|
||||
}
|
||||
|
||||
@@ -5,34 +5,36 @@ import "testing"
|
||||
func TestCSVSplitLine(t *testing.T) {
|
||||
splitTests := []struct {
|
||||
ToSplit string
|
||||
Delim rune
|
||||
Delim string
|
||||
expectedTokens []string
|
||||
escapeQuotes bool
|
||||
}{
|
||||
{``, ',', []string{}, true},
|
||||
{`one`, ',', []string{"one"}, true},
|
||||
{`one,`, ',', []string{"one", ""}, true},
|
||||
{`one,two, three`, ',', []string{"one", "two", "three"}, true},
|
||||
{`one,"two", three`, ',', []string{"one", "two", "three"}, true},
|
||||
{`one," two", three`, ',', []string{"one", " two", "three"}, true},
|
||||
{`one," two", three`, ',', []string{"one", `" two"`, "three"}, false},
|
||||
{`one,"two, three"`, ',', []string{"one", "two, three"}, true},
|
||||
{`one,""two three""`, ',', []string{"one", `"two three"`}, true},
|
||||
{`one,"two, ""three""`, ',', []string{"one", `two, "three"`}, true},
|
||||
{`brian ""the great"" hendriks,mr.,1.7526`, ',', []string{`brian "the great" hendriks`, "mr.", "1.7526"}, true},
|
||||
{`col1,"Industriepark ""De Bruwaan""",col3`, ',', []string{"col1", `Industriepark "De Bruwaan"`, "col3"}, true},
|
||||
{`|a|`, '|', []string{"", "a", ""}, true},
|
||||
{`72470|30|0|40|0||||`, '|', []string{"72470", "30", "0", "40", "0", "", "", "", ""}, true},
|
||||
{`"one","two"`, ',', []string{`"one"`, `"two"`}, false},
|
||||
{`"one","two"`, ',', []string{`one`, `two`}, true},
|
||||
{`one, two`, ',', []string{`one`, `two`}, true},
|
||||
{`one," two"`, ',', []string{`one`, ` two`}, true},
|
||||
{``, ",", []string{""}, true},
|
||||
{`one`, ",", []string{"one"}, true},
|
||||
{`one,`, ",", []string{"one", ""}, true},
|
||||
{`one,two, three`, ",", []string{"one", "two", "three"}, true},
|
||||
{`one,"two", three`, ",", []string{"one", "two", "three"}, true},
|
||||
{`one," two", three`, ",", []string{"one", " two", "three"}, true},
|
||||
{`one," two", three`, ",", []string{"one", `" two"`, "three"}, false},
|
||||
{`one,"two, three"`, ",", []string{"one", "two, three"}, true},
|
||||
{`one,""two three""`, ",", []string{"one", `"two three"`}, true},
|
||||
{`one,"two, ""three""`, ",", []string{"one", `two, "three"`}, true},
|
||||
{`brian ""the great"" hendriks,mr.,1.7526`, ",", []string{`brian "the great" hendriks`, "mr.", "1.7526"}, true},
|
||||
{`col1,"Industriepark ""De Bruwaan""",col3`, ",", []string{"col1", `Industriepark "De Bruwaan"`, "col3"}, true},
|
||||
{`|a|`, "|", []string{"", "a", ""}, true},
|
||||
{`72470|30|0|40|0||||`, "|", []string{"72470", "30", "0", "40", "0", "", "", "", ""}, true},
|
||||
{`"one","two"`, ",", []string{`"one"`, `"two"`}, false},
|
||||
{`"one","two"`, ",", []string{`one`, `two`}, true},
|
||||
{`one, two`, ",", []string{`one`, `two`}, true},
|
||||
{`one," two"`, ",", []string{`one`, ` two`}, true},
|
||||
{
|
||||
`23660|1300|"Beef, brisket, flat half, separable lean and fat, trimmed to 1/8"""`,
|
||||
'|',
|
||||
"|",
|
||||
[]string{"23660", "1300", `Beef, brisket, flat half, separable lean and fat, trimmed to 1/8"`},
|
||||
true,
|
||||
},
|
||||
{`72470<delim>30<delim>0<delim>40<delim>0<delim>"<delim>"<delim><delim><delim>`, "<delim>", []string{"72470", "30", "0", "40", "0", "<delim>", "", "", ""}, true},
|
||||
{`72470<delim>30<delim>0<delim>40<delim>0<delim>"""<delim>"""<delim><delim><delim>`, "<delim>", []string{"72470", "30", "0", "40", "0", `"<delim>"`, "", "", ""}, true},
|
||||
}
|
||||
|
||||
for _, test := range splitTests {
|
||||
|
||||
Reference in New Issue
Block a user