Fix: Safely handle carriage returns in CSVs

Carriage returns (CR) are not fully supported by Go's CSV reader. When a CR is encountered by the reader, the reader silently ignores the CR and continues processing the line. This is ok when the CR is part of a CRLF, but when the CR is independent then the line isn't properly terminated. Github issue #1049
2026-02-11 02:59:34 -06:00 · 2016-03-10 18:07:13 -08:00
parent 12fde71912
commit 506d0013ea
3 changed files with 136 additions and 8 deletions
--- a/clients/csv/csv_reader.go
+++ b/clients/csv/csv_reader.go
@@ -0,0 +1,50 @@
+/*
+Package macreader changes Classic Mac (CR) line endings to Linux (LF) line
+endings in files. This is useful when handling CSV files generated by the Mac
+version of Microsoft Excel as documented in this issue:
+https://github.com/golang/go/issues/7802.
+*/
+package csv
+
+import (
+    "encoding/csv"
+    "io"
+    "bufio"
+)
+
+var (
+    rByte byte = 13 // the byte that corresponds to the '\r' rune.
+    nByte byte = 10 // the byte that corresponds to the '\n' rune.
+)
+
+type reader struct {
+    r *bufio.Reader
+}
+
+// New creates a new io.Reader that wraps r to convert Classic Mac (CR)
+// line endings to Linux (LF) line endings.
+func SafeCSVReader(r *bufio.Reader) *reader {
+    return &reader{r: r}
+}
+
+// Read replaces CR line endings in the source reader with LF line endings.
+func (r reader) Read(p []byte) (n int, err error) {
+    n, err = r.r.Read(p)
+    bn, err := r.r.Peek(1)
+    for i, b := range p {
+        // if the current byte is a CR and the next byte is NOT a LF then replace the current byte with a LF
+        if j := i + 1; b == rByte && ((j < len(p) && p[j] != nByte) || (len(bn) > 0 && bn[0] != nByte)) {
+            p[i] = nByte
+        }
+    }
+    return
+}
+
+// NewCSVReader returns a new csv.Reader that splits on comma and asserts that all rows contain the same number of fields as the first.
+func NewCSVReader(res io.Reader, comma rune) *csv.Reader {
+    bufRes := bufio.NewReader(res)
+    r := csv.NewReader(SafeCSVReader(bufRes))
+    r.Comma = comma
+    r.FieldsPerRecord = -1 // Let first row determine the number of fields.
+    return r
+}
--- a/clients/csv/csv_reader_test.go
+++ b/clients/csv/csv_reader_test.go
@@ -0,0 +1,86 @@
+package csv
+
+import (
+    "testing"
+    "bytes"
+    "encoding/csv"
+    // "fmt"
+    "strings"
+    "bufio"
+)
+
+
+func TestCR(t *testing.T) {
+    testFile := bytes.NewBufferString("a,b,c\r1,2,3\r").Bytes()
+
+    r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile))))
+    lines, err := r.ReadAll()
+
+    if err != nil {
+        t.Errorf("An error occurred while reading the data: %v", err)
+    }
+    if len(lines) != 2 {
+        t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines))
+    }
+}
+
+func TestLF(t *testing.T) {
+    testFile := bytes.NewBufferString("a,b,c\n1,2,3\n").Bytes()
+
+    r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile))))
+    lines, err := r.ReadAll()
+
+    if err != nil {
+        t.Errorf("An error occurred while reading the data: %v", err)
+    }
+    if len(lines) != 2 {
+        t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines))
+    }
+}
+
+func TestCRLF(t *testing.T) {
+    testFile := bytes.NewBufferString("a,b,c\r\n1,2,3\r\n").Bytes()
+
+    r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile))))
+    lines, err := r.ReadAll()
+
+    if err != nil {
+        t.Errorf("An error occurred while reading the data: %v", err)
+    }
+    if len(lines) != 2 {
+        t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines))
+    }
+}
+
+func TestCRInQuote(t *testing.T) {
+    testFile := bytes.NewBufferString("a,\"foo,\rbar\",c\r1,\"2\r\n2\",3\r").Bytes()
+
+    r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile))))
+    lines, err := r.ReadAll()
+
+    if err != nil {
+        t.Errorf("An error occurred while reading the data: %v", err)
+    }
+    if len(lines) != 2 {
+        t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines))
+    }
+    if strings.Contains(lines[1][1], "\n\n") {
+        t.Error("The CRLF was converted to a LFLF")
+    }
+}
+
+func TestCRLFEndOfBufferLength(t *testing.T) {
+    testFile := bytes.NewBuffer(make([]byte, 4096 * 2, 4096 * 2)).Bytes()
+    testFile[4095] = 13 // \r byte
+    testFile[4096] = 10 // \n byte
+
+    r := csv.NewReader(SafeCSVReader(bufio.NewReader(bytes.NewReader(testFile))))
+    lines, err := r.ReadAll()
+
+    if err != nil {
+        t.Errorf("An error occurred while reading the data: %v", err)
+    }
+    if len(lines) != 2 {
+        t.Errorf("Wrong number of lines. Expected 2, got %d", len(lines))
+    }
+}
--- a/clients/csv/read.go
+++ b/clients/csv/read.go
@@ -39,14 +39,6 @@ func KindsToStrings(kinds KindSlice) []string {
 	return strs
 }

-// NewCSVReader returns a new csv.Reader that splits on comma and asserts that all rows contain the same number of fields as the first.
-func NewCSVReader(res io.Reader, comma rune) *csv.Reader {
-	r := csv.NewReader(res)
-	r.Comma = comma
-	r.FieldsPerRecord = -1 // Don't enforce number of fields.
-	return r
-}
-
 // ReportValidFieldTypes takes a CSV reader and the headers. It returns a slice of types.NomsKind for each column in the data indicating what Noms types could be used to represent that row.
 // For example, if all values in a row are negative integers between -127 and 0, the slice for that row would be [types.Int8Kind, types.Int16Kind, types.Int32Kind, types.Int64Kind, types.Float32Kind, types.Float64Kind, types.StringKind]. If even one value in the row is a floating point number, however, all the integer kinds would be dropped. All values can be represented as a string, so that option is always provided.
 func ReportValidFieldTypes(r *csv.Reader, headers []string) []KindSlice {