Fix: Safely handle carriage returns in CSVs

Carriage returns (CR) are not fully supported by Go's CSV reader. When
a CR is encountered by the reader, the reader silently ignores the CR
and continues processing the line. This is ok when the CR is part of a
CRLF, but when the CR is independent then the line isn't properly
terminated.

Github issue #1049
This commit is contained in:
Kyle Derkacz
2016-03-10 18:07:13 -08:00
parent 12fde71912
commit 506d0013ea
3 changed files with 136 additions and 8 deletions

50
clients/csv/csv_reader.go Normal file
View File

@@ -0,0 +1,50 @@
/*
Package macreader changes Classic Mac (CR) line endings to Linux (LF) line
endings in files. This is useful when handling CSV files generated by the Mac
version of Microsoft Excel as documented in this issue:
https://github.com/golang/go/issues/7802.
*/
package csv
import (
"encoding/csv"
"io"
"bufio"
)
var (
rByte byte = 13 // the byte that corresponds to the '\r' rune.
nByte byte = 10 // the byte that corresponds to the '\n' rune.
)
type reader struct {
r *bufio.Reader
}
// New creates a new io.Reader that wraps r to convert Classic Mac (CR)
// line endings to Linux (LF) line endings.
func SafeCSVReader(r *bufio.Reader) *reader {
return &reader{r: r}
}
// Read replaces CR line endings in the source reader with LF line endings.
func (r reader) Read(p []byte) (n int, err error) {
n, err = r.r.Read(p)
bn, err := r.r.Peek(1)
for i, b := range p {
// if the current byte is a CR and the next byte is NOT a LF then replace the current byte with a LF
if j := i + 1; b == rByte && ((j < len(p) && p[j] != nByte) || (len(bn) > 0 && bn[0] != nByte)) {
p[i] = nByte
}
}
return
}
// NewCSVReader returns a new csv.Reader that splits on comma and asserts that all rows contain the same number of fields as the first.
func NewCSVReader(res io.Reader, comma rune) *csv.Reader {
bufRes := bufio.NewReader(res)
r := csv.NewReader(SafeCSVReader(bufRes))
r.Comma = comma
r.FieldsPerRecord = -1 // Let first row determine the number of fields.
return r
}

View File

@@ -0,0 +1,86 @@
package csv
import (
"testing"
"bytes"
"encoding/csv"
// "fmt"
"strings"
"bufio"
)
func TestCR(t *testing.T) {
testFile := bytes.NewBufferString("a,b,c\r1,2,3\r").Bytes()
r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile))))
lines, err := r.ReadAll()
if err != nil {
t.Errorf("An error occurred while reading the data: %v", err)
}
if len(lines) != 2 {
t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines))
}
}
func TestLF(t *testing.T) {
testFile := bytes.NewBufferString("a,b,c\n1,2,3\n").Bytes()
r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile))))
lines, err := r.ReadAll()
if err != nil {
t.Errorf("An error occurred while reading the data: %v", err)
}
if len(lines) != 2 {
t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines))
}
}
func TestCRLF(t *testing.T) {
testFile := bytes.NewBufferString("a,b,c\r\n1,2,3\r\n").Bytes()
r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile))))
lines, err := r.ReadAll()
if err != nil {
t.Errorf("An error occurred while reading the data: %v", err)
}
if len(lines) != 2 {
t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines))
}
}
func TestCRInQuote(t *testing.T) {
testFile := bytes.NewBufferString("a,\"foo,\rbar\",c\r1,\"2\r\n2\",3\r").Bytes()
r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile))))
lines, err := r.ReadAll()
if err != nil {
t.Errorf("An error occurred while reading the data: %v", err)
}
if len(lines) != 2 {
t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines))
}
if strings.Contains(lines[1][1], "\n\n") {
t.Error("The CRLF was converted to a LFLF")
}
}
func TestCRLFEndOfBufferLength(t *testing.T) {
testFile := bytes.NewBuffer(make([]byte, 4096 * 2, 4096 * 2)).Bytes()
testFile[4095] = 13 // \r byte
testFile[4096] = 10 // \n byte
r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile))))
lines, err := r.ReadAll()
if err != nil {
t.Errorf("An error occurred while reading the data: %v", err)
}
if len(lines) != 2 {
t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines))
}
}

View File

@@ -39,14 +39,6 @@ func KindsToStrings(kinds KindSlice) []string {
return strs
}
// NewCSVReader returns a new csv.Reader that splits on comma and asserts that all rows contain the same number of fields as the first.
func NewCSVReader(res io.Reader, comma rune) *csv.Reader {
r := csv.NewReader(res)
r.Comma = comma
r.FieldsPerRecord = -1 // Don't enforce number of fields.
return r
}
// ReportValidFieldTypes takes a CSV reader and the headers. It returns a slice of types.NomsKind for each column in the data indicating what Noms types could be used to represent that row.
// For example, if all values in a row are negative integers between -127 and 0, the slice for that row would be [types.Int8Kind, types.Int16Kind, types.Int32Kind, types.Int64Kind, types.Float32Kind, types.Float64Kind, types.StringKind]. If even one value in the row is a floating point number, however, all the integer kinds would be dropped. All values can be represented as a string, so that option is always provided.
func ReportValidFieldTypes(r *csv.Reader, headers []string) []KindSlice {