From 506d0013ea0e7e39893cf449a02b110490ab5a84 Mon Sep 17 00:00:00 2001 From: Kyle Derkacz Date: Thu, 10 Mar 2016 18:07:13 -0800 Subject: [PATCH 1/2] Fix: Safely handle carriage returns in CSVs Carriage returns (CR) are not fully supported by Go's CSV reader. When a CR is encountered by the reader, the reader silently ignores the CR and continues processing the line. This is ok when the CR is part of a CRLF, but when the CR is independent then the line isn't properly terminated. Github issue #1049 --- clients/csv/csv_reader.go | 50 ++++++++++++++++++++ clients/csv/csv_reader_test.go | 86 ++++++++++++++++++++++++++++++++++ clients/csv/read.go | 8 ---- 3 files changed, 136 insertions(+), 8 deletions(-) create mode 100644 clients/csv/csv_reader.go create mode 100644 clients/csv/csv_reader_test.go diff --git a/clients/csv/csv_reader.go b/clients/csv/csv_reader.go new file mode 100644 index 0000000000..6f76947aef --- /dev/null +++ b/clients/csv/csv_reader.go @@ -0,0 +1,50 @@ +/* +Package macreader changes Classic Mac (CR) line endings to Linux (LF) line +endings in files. This is useful when handling CSV files generated by the Mac +version of Microsoft Excel as documented in this issue: +https://github.com/golang/go/issues/7802. +*/ +package csv + +import ( + "encoding/csv" + "io" + "bufio" +) + +var ( + rByte byte = 13 // the byte that corresponds to the '\r' rune. + nByte byte = 10 // the byte that corresponds to the '\n' rune. +) + +type reader struct { + r *bufio.Reader +} + +// New creates a new io.Reader that wraps r to convert Classic Mac (CR) +// line endings to Linux (LF) line endings. +func SafeCSVReader(r *bufio.Reader) *reader { + return &reader{r: r} +} + +// Read replaces CR line endings in the source reader with LF line endings. +func (r reader) Read(p []byte) (n int, err error) { + n, err = r.r.Read(p) + bn, err := r.r.Peek(1) + for i, b := range p { + // if the current byte is a CR and the next byte is NOT a LF then replace the current byte with a LF + if j := i + 1; b == rByte && ((j < len(p) && p[j] != nByte) || (len(bn) > 0 && bn[0] != nByte)) { + p[i] = nByte + } + } + return +} + +// NewCSVReader returns a new csv.Reader that splits on comma and asserts that all rows contain the same number of fields as the first. +func NewCSVReader(res io.Reader, comma rune) *csv.Reader { + bufRes := bufio.NewReader(res) + r := csv.NewReader(SafeCSVReader(bufRes)) + r.Comma = comma + r.FieldsPerRecord = -1 // Let first row determine the number of fields. + return r +} \ No newline at end of file diff --git a/clients/csv/csv_reader_test.go b/clients/csv/csv_reader_test.go new file mode 100644 index 0000000000..7b6fd19ae7 --- /dev/null +++ b/clients/csv/csv_reader_test.go @@ -0,0 +1,86 @@ +package csv + +import ( + "testing" + "bytes" + "encoding/csv" + // "fmt" + "strings" + "bufio" +) + + +func TestCR(t *testing.T) { + testFile := bytes.NewBufferString("a,b,c\r1,2,3\r").Bytes() + + r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile)))) + lines, err := r.ReadAll() + + if err != nil { + t.Errorf("An error occurred while reading the data: %v", err) + } + if len(lines) != 2 { + t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines)) + } +} + +func TestLF(t *testing.T) { + testFile := bytes.NewBufferString("a,b,c\n1,2,3\n").Bytes() + + r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile)))) + lines, err := r.ReadAll() + + if err != nil { + t.Errorf("An error occurred while reading the data: %v", err) + } + if len(lines) != 2 { + t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines)) + } +} + +func TestCRLF(t *testing.T) { + testFile := bytes.NewBufferString("a,b,c\r\n1,2,3\r\n").Bytes() + + r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile)))) + lines, err := r.ReadAll() + + if err != nil { + t.Errorf("An error occurred while reading the data: %v", err) + } + if len(lines) != 2 { + t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines)) + } +} + +func TestCRInQuote(t *testing.T) { + testFile := bytes.NewBufferString("a,\"foo,\rbar\",c\r1,\"2\r\n2\",3\r").Bytes() + + r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile)))) + lines, err := r.ReadAll() + + if err != nil { + t.Errorf("An error occurred while reading the data: %v", err) + } + if len(lines) != 2 { + t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines)) + } + if strings.Contains(lines[1][1], "\n\n") { + t.Error("The CRLF was converted to a LFLF") + } +} + +func TestCRLFEndOfBufferLength(t *testing.T) { + testFile := bytes.NewBuffer(make([]byte, 4096 * 2, 4096 * 2)).Bytes() + testFile[4095] = 13 // \r byte + testFile[4096] = 10 // \n byte + + r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile)))) + lines, err := r.ReadAll() + + if err != nil { + t.Errorf("An error occurred while reading the data: %v", err) + } + if len(lines) != 2 { + t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines)) + } +} diff --git a/clients/csv/read.go b/clients/csv/read.go index 9a824e44ef..c6284c4d61 100644 --- a/clients/csv/read.go +++ b/clients/csv/read.go @@ -39,14 +39,6 @@ func KindsToStrings(kinds KindSlice) []string { return strs } -// NewCSVReader returns a new csv.Reader that splits on comma and asserts that all rows contain the same number of fields as the first. -func NewCSVReader(res io.Reader, comma rune) *csv.Reader { - r := csv.NewReader(res) - r.Comma = comma - r.FieldsPerRecord = -1 // Don't enforce number of fields. - return r -} - // ReportValidFieldTypes takes a CSV reader and the headers. It returns a slice of types.NomsKind for each column in the data indicating what Noms types could be used to represent that row. // For example, if all values in a row are negative integers between -127 and 0, the slice for that row would be [types.Int8Kind, types.Int16Kind, types.Int32Kind, types.Int64Kind, types.Float32Kind, types.Float64Kind, types.StringKind]. If even one value in the row is a floating point number, however, all the integer kinds would be dropped. All values can be represented as a string, so that option is always provided. func ReportValidFieldTypes(r *csv.Reader, headers []string) []KindSlice { From d7b55cfbc5f1794a5cc16af18c710451eec6c8e8 Mon Sep 17 00:00:00 2001 From: Kyle Derkacz Date: Fri, 11 Mar 2016 18:32:13 -0800 Subject: [PATCH 2/2] Fixing code formatting errors in the csv reader --- clients/csv/csv_reader.go | 56 +++++++---------- clients/csv/csv_reader_test.go | 107 +++++++++++++++------------------ 2 files changed, 72 insertions(+), 91 deletions(-) diff --git a/clients/csv/csv_reader.go b/clients/csv/csv_reader.go index 6f76947aef..9714988195 100644 --- a/clients/csv/csv_reader.go +++ b/clients/csv/csv_reader.go @@ -1,50 +1,38 @@ -/* -Package macreader changes Classic Mac (CR) line endings to Linux (LF) line -endings in files. This is useful when handling CSV files generated by the Mac -version of Microsoft Excel as documented in this issue: -https://github.com/golang/go/issues/7802. -*/ package csv import ( - "encoding/csv" - "io" - "bufio" + "bufio" + "encoding/csv" + "io" ) var ( - rByte byte = 13 // the byte that corresponds to the '\r' rune. - nByte byte = 10 // the byte that corresponds to the '\n' rune. + rByte byte = 13 // the byte that corresponds to the '\r' rune. + nByte byte = 10 // the byte that corresponds to the '\n' rune. ) type reader struct { - r *bufio.Reader + r *bufio.Reader } -// New creates a new io.Reader that wraps r to convert Classic Mac (CR) -// line endings to Linux (LF) line endings. -func SafeCSVReader(r *bufio.Reader) *reader { - return &reader{r: r} -} - -// Read replaces CR line endings in the source reader with LF line endings. +// Read replaces CR line endings in the source reader with LF line endings if the CR is not followed by a LF. func (r reader) Read(p []byte) (n int, err error) { - n, err = r.r.Read(p) - bn, err := r.r.Peek(1) - for i, b := range p { - // if the current byte is a CR and the next byte is NOT a LF then replace the current byte with a LF - if j := i + 1; b == rByte && ((j < len(p) && p[j] != nByte) || (len(bn) > 0 && bn[0] != nByte)) { - p[i] = nByte - } - } - return + n, err = r.r.Read(p) + bn, err := r.r.Peek(1) + for i, b := range p { + // if the current byte is a CR and the next byte is NOT a LF then replace the current byte with a LF + if j := i + 1; b == rByte && ((j < len(p) && p[j] != nByte) || (len(bn) > 0 && bn[0] != nByte)) { + p[i] = nByte + } + } + return } // NewCSVReader returns a new csv.Reader that splits on comma and asserts that all rows contain the same number of fields as the first. func NewCSVReader(res io.Reader, comma rune) *csv.Reader { - bufRes := bufio.NewReader(res) - r := csv.NewReader(SafeCSVReader(bufRes)) - r.Comma = comma - r.FieldsPerRecord = -1 // Let first row determine the number of fields. - return r -} \ No newline at end of file + bufRes := bufio.NewReader(res) + r := csv.NewReader(reader{r: bufRes}) + r.Comma = comma + r.FieldsPerRecord = -1 // Don't enforce number of fields. + return r +} diff --git a/clients/csv/csv_reader_test.go b/clients/csv/csv_reader_test.go index 7b6fd19ae7..d3b9054941 100644 --- a/clients/csv/csv_reader_test.go +++ b/clients/csv/csv_reader_test.go @@ -1,86 +1,79 @@ package csv import ( - "testing" - "bytes" - "encoding/csv" - // "fmt" - "strings" - "bufio" + "bytes" + "strings" + "testing" + + "github.com/stretchr/testify/assert" ) - func TestCR(t *testing.T) { - testFile := bytes.NewBufferString("a,b,c\r1,2,3\r").Bytes() + testFile := []byte("a,b,c\r1,2,3\r") + delimiter, err := StringToRune(",") - r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile)))) - lines, err := r.ReadAll() + r := NewCSVReader(bytes.NewReader(testFile), delimiter) + lines, err := r.ReadAll() - if err != nil { - t.Errorf("An error occurred while reading the data: %v", err) - } - if len(lines) != 2 { - t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines)) - } + assert.NoError(t, err, "An error occurred while reading the data: %v", err) + if len(lines) != 2 { + t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines)) + } } func TestLF(t *testing.T) { - testFile := bytes.NewBufferString("a,b,c\n1,2,3\n").Bytes() + testFile := []byte("a,b,c\n1,2,3\n") + delimiter, err := StringToRune(",") - r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile)))) - lines, err := r.ReadAll() + r := NewCSVReader(bytes.NewReader(testFile), delimiter) + lines, err := r.ReadAll() - if err != nil { - t.Errorf("An error occurred while reading the data: %v", err) - } - if len(lines) != 2 { - t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines)) - } + assert.NoError(t, err, "An error occurred while reading the data: %v", err) + if len(lines) != 2 { + t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines)) + } } func TestCRLF(t *testing.T) { - testFile := bytes.NewBufferString("a,b,c\r\n1,2,3\r\n").Bytes() + testFile := []byte("a,b,c\r\n1,2,3\r\n") + delimiter, err := StringToRune(",") - r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile)))) - lines, err := r.ReadAll() + r := NewCSVReader(bytes.NewReader(testFile), delimiter) + lines, err := r.ReadAll() - if err != nil { - t.Errorf("An error occurred while reading the data: %v", err) - } - if len(lines) != 2 { - t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines)) - } + assert.NoError(t, err, "An error occurred while reading the data: %v", err) + if len(lines) != 2 { + t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines)) + } } func TestCRInQuote(t *testing.T) { - testFile := bytes.NewBufferString("a,\"foo,\rbar\",c\r1,\"2\r\n2\",3\r").Bytes() + testFile := []byte("a,\"foo,\rbar\",c\r1,\"2\r\n2\",3\r") + delimiter, err := StringToRune(",") - r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile)))) - lines, err := r.ReadAll() + r := NewCSVReader(bytes.NewReader(testFile), delimiter) + lines, err := r.ReadAll() - if err != nil { - t.Errorf("An error occurred while reading the data: %v", err) - } - if len(lines) != 2 { - t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines)) - } - if strings.Contains(lines[1][1], "\n\n") { - t.Error("The CRLF was converted to a LFLF") - } + assert.NoError(t, err, "An error occurred while reading the data: %v", err) + if len(lines) != 2 { + t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines)) + } + if strings.Contains(lines[1][1], "\n\n") { + t.Error("The CRLF was converted to a LFLF") + } } func TestCRLFEndOfBufferLength(t *testing.T) { - testFile := bytes.NewBuffer(make([]byte, 4096 * 2, 4096 * 2)).Bytes() - testFile[4095] = 13 // \r byte - testFile[4096] = 10 // \n byte + testFile := make([]byte, 4096*2, 4096*2) + testFile[4095] = 13 // \r byte + testFile[4096] = 10 // \n byte + delimiter, err := StringToRune(",") - r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile)))) - lines, err := r.ReadAll() + r := NewCSVReader(bytes.NewReader(testFile), delimiter) + lines, err := r.ReadAll() - if err != nil { - t.Errorf("An error occurred while reading the data: %v", err) - } - if len(lines) != 2 { - t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines)) - } + assert.NoError(t, err, "An error occurred while reading the data: %v", err) + if len(lines) != 2 { + t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines)) + } }