mirror of
https://github.com/dolthub/dolt.git
synced 2026-02-11 02:59:34 -06:00
Fix: Safely handle carriage returns in CSVs
Carriage returns (CR) are not fully supported by Go's CSV reader. When a CR is encountered by the reader, the reader silently ignores the CR and continues processing the line. This is ok when the CR is part of a CRLF, but when the CR is independent then the line isn't properly terminated. Github issue #1049
This commit is contained in:
50
clients/csv/csv_reader.go
Normal file
50
clients/csv/csv_reader.go
Normal file
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
Package macreader changes Classic Mac (CR) line endings to Linux (LF) line
|
||||
endings in files. This is useful when handling CSV files generated by the Mac
|
||||
version of Microsoft Excel as documented in this issue:
|
||||
https://github.com/golang/go/issues/7802.
|
||||
*/
|
||||
package csv
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
||||
"io"
|
||||
"bufio"
|
||||
)
|
||||
|
||||
var (
|
||||
rByte byte = 13 // the byte that corresponds to the '\r' rune.
|
||||
nByte byte = 10 // the byte that corresponds to the '\n' rune.
|
||||
)
|
||||
|
||||
type reader struct {
|
||||
r *bufio.Reader
|
||||
}
|
||||
|
||||
// New creates a new io.Reader that wraps r to convert Classic Mac (CR)
|
||||
// line endings to Linux (LF) line endings.
|
||||
func SafeCSVReader(r *bufio.Reader) *reader {
|
||||
return &reader{r: r}
|
||||
}
|
||||
|
||||
// Read replaces CR line endings in the source reader with LF line endings.
|
||||
func (r reader) Read(p []byte) (n int, err error) {
|
||||
n, err = r.r.Read(p)
|
||||
bn, err := r.r.Peek(1)
|
||||
for i, b := range p {
|
||||
// if the current byte is a CR and the next byte is NOT a LF then replace the current byte with a LF
|
||||
if j := i + 1; b == rByte && ((j < len(p) && p[j] != nByte) || (len(bn) > 0 && bn[0] != nByte)) {
|
||||
p[i] = nByte
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// NewCSVReader returns a new csv.Reader that splits on comma and asserts that all rows contain the same number of fields as the first.
|
||||
func NewCSVReader(res io.Reader, comma rune) *csv.Reader {
|
||||
bufRes := bufio.NewReader(res)
|
||||
r := csv.NewReader(SafeCSVReader(bufRes))
|
||||
r.Comma = comma
|
||||
r.FieldsPerRecord = -1 // Let first row determine the number of fields.
|
||||
return r
|
||||
}
|
||||
86
clients/csv/csv_reader_test.go
Normal file
86
clients/csv/csv_reader_test.go
Normal file
@@ -0,0 +1,86 @@
|
||||
package csv
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"bytes"
|
||||
"encoding/csv"
|
||||
// "fmt"
|
||||
"strings"
|
||||
"bufio"
|
||||
)
|
||||
|
||||
|
||||
func TestCR(t *testing.T) {
|
||||
testFile := bytes.NewBufferString("a,b,c\r1,2,3\r").Bytes()
|
||||
|
||||
r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile))))
|
||||
lines, err := r.ReadAll()
|
||||
|
||||
if err != nil {
|
||||
t.Errorf("An error occurred while reading the data: %v", err)
|
||||
}
|
||||
if len(lines) != 2 {
|
||||
t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines))
|
||||
}
|
||||
}
|
||||
|
||||
func TestLF(t *testing.T) {
|
||||
testFile := bytes.NewBufferString("a,b,c\n1,2,3\n").Bytes()
|
||||
|
||||
r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile))))
|
||||
lines, err := r.ReadAll()
|
||||
|
||||
if err != nil {
|
||||
t.Errorf("An error occurred while reading the data: %v", err)
|
||||
}
|
||||
if len(lines) != 2 {
|
||||
t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines))
|
||||
}
|
||||
}
|
||||
|
||||
func TestCRLF(t *testing.T) {
|
||||
testFile := bytes.NewBufferString("a,b,c\r\n1,2,3\r\n").Bytes()
|
||||
|
||||
r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile))))
|
||||
lines, err := r.ReadAll()
|
||||
|
||||
if err != nil {
|
||||
t.Errorf("An error occurred while reading the data: %v", err)
|
||||
}
|
||||
if len(lines) != 2 {
|
||||
t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines))
|
||||
}
|
||||
}
|
||||
|
||||
func TestCRInQuote(t *testing.T) {
|
||||
testFile := bytes.NewBufferString("a,\"foo,\rbar\",c\r1,\"2\r\n2\",3\r").Bytes()
|
||||
|
||||
r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile))))
|
||||
lines, err := r.ReadAll()
|
||||
|
||||
if err != nil {
|
||||
t.Errorf("An error occurred while reading the data: %v", err)
|
||||
}
|
||||
if len(lines) != 2 {
|
||||
t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines))
|
||||
}
|
||||
if strings.Contains(lines[1][1], "\n\n") {
|
||||
t.Error("The CRLF was converted to a LFLF")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCRLFEndOfBufferLength(t *testing.T) {
|
||||
testFile := bytes.NewBuffer(make([]byte, 4096 * 2, 4096 * 2)).Bytes()
|
||||
testFile[4095] = 13 // \r byte
|
||||
testFile[4096] = 10 // \n byte
|
||||
|
||||
r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile))))
|
||||
lines, err := r.ReadAll()
|
||||
|
||||
if err != nil {
|
||||
t.Errorf("An error occurred while reading the data: %v", err)
|
||||
}
|
||||
if len(lines) != 2 {
|
||||
t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines))
|
||||
}
|
||||
}
|
||||
@@ -39,14 +39,6 @@ func KindsToStrings(kinds KindSlice) []string {
|
||||
return strs
|
||||
}
|
||||
|
||||
// NewCSVReader returns a new csv.Reader that splits on comma and asserts that all rows contain the same number of fields as the first.
|
||||
func NewCSVReader(res io.Reader, comma rune) *csv.Reader {
|
||||
r := csv.NewReader(res)
|
||||
r.Comma = comma
|
||||
r.FieldsPerRecord = -1 // Don't enforce number of fields.
|
||||
return r
|
||||
}
|
||||
|
||||
// ReportValidFieldTypes takes a CSV reader and the headers. It returns a slice of types.NomsKind for each column in the data indicating what Noms types could be used to represent that row.
|
||||
// For example, if all values in a row are negative integers between -127 and 0, the slice for that row would be [types.Int8Kind, types.Int16Kind, types.Int32Kind, types.Int64Kind, types.Float32Kind, types.Float64Kind, types.StringKind]. If even one value in the row is a floating point number, however, all the integer kinds would be dropped. All values can be represented as a string, so that option is always provided.
|
||||
func ReportValidFieldTypes(r *csv.Reader, headers []string) []KindSlice {
|
||||
|
||||
Reference in New Issue
Block a user