From 04bd70ad2e2932193d51b78f5d87432279dd56de Mon Sep 17 00:00:00 2001 From: Aaron Son Date: Thu, 7 Dec 2023 14:59:33 -0800 Subject: [PATCH] dolt table import: json,csv: Support BOM file headers. The semantics are as follows: For CSV files, the default import is an uninterpreted character encoding where newline has to match 0xa and the delimiters have to match. In general Dolt expects UTF8, but non-UTF8 characters in string fields can make it through to the imported table for encodings which are close enough to ASCII, for example. If there is a UTF8, UTF16LE or UTF16BE BOM header, then character decoding of the input stream switches to the indicated encoding. For JSON files, the default import is UTF8 character encoding. If there is a UTF8, UTF16LE or UTF16BE BOM header, then character decoding of the input stream switches to the indicated encoding. --- .../doltcore/table/typed/json/reader.go | 9 +- .../doltcore/table/typed/json/reader_test.go | 94 ++++++++++++++---- .../doltcore/table/untyped/csv/reader.go | 27 +++-- .../doltcore/table/untyped/csv/reader_test.go | 45 ++++++--- go/libraries/events/emitter.go | 2 +- ...loyees-tbl-schema-unordered.utf16bebom.csv | Bin 0 -> 344 bytes ...loyees-tbl-schema-unordered.utf16lebom.csv | Bin 0 -> 344 bytes ...employees-tbl-schema-unordered.utf8bom.csv | 4 + .../bats/helper/employees-tbl.utf16bebom.json | Bin 0 -> 1018 bytes .../bats/helper/employees-tbl.utf16lebom.json | Bin 0 -> 1018 bytes .../bats/helper/employees-tbl.utf8bom.json | 28 ++++++ .../bats/import-create-tables.bats | 57 +++++++---- .../bats/import-update-tables.bats | 78 +++++++++++++++ 13 files changed, 279 insertions(+), 65 deletions(-) create mode 100644 integration-tests/bats/helper/employees-tbl-schema-unordered.utf16bebom.csv create mode 100644 integration-tests/bats/helper/employees-tbl-schema-unordered.utf16lebom.csv create mode 100644 integration-tests/bats/helper/employees-tbl-schema-unordered.utf8bom.csv create mode 100644 
integration-tests/bats/helper/employees-tbl.utf16bebom.json create mode 100644 integration-tests/bats/helper/employees-tbl.utf16lebom.json create mode 100644 integration-tests/bats/helper/employees-tbl.utf8bom.json diff --git a/go/libraries/doltcore/table/typed/json/reader.go b/go/libraries/doltcore/table/typed/json/reader.go index b5b912d4e6..c938ae008d 100644 --- a/go/libraries/doltcore/table/typed/json/reader.go +++ b/go/libraries/doltcore/table/typed/json/reader.go @@ -22,6 +22,8 @@ import ( "github.com/bcicen/jstream" "github.com/dolthub/go-mysql-server/sql" + "golang.org/x/text/encoding/unicode" + "golang.org/x/text/transform" "github.com/dolthub/dolt/go/libraries/doltcore/row" "github.com/dolthub/dolt/go/libraries/doltcore/schema" @@ -52,12 +54,17 @@ func OpenJSONReader(vrw types.ValueReadWriter, path string, fs filesys.ReadableF return NewJSONReader(vrw, r, sch) } +// The bytes of the supplied reader are treated as UTF-8. If there is a UTF8, +// UTF16LE or UTF16BE BOM at the first bytes read, then it is stripped and the +// remaining contents of the reader are treated as that encoding. 
func NewJSONReader(vrw types.ValueReadWriter, r io.ReadCloser, sch schema.Schema) (*JSONReader, error) { if sch == nil { return nil, errors.New("schema must be provided to JsonReader") } - decoder := jstream.NewDecoder(r, 2) // extract JSON values at a depth level of 1 + textReader := transform.NewReader(r, unicode.BOMOverride(unicode.UTF8.NewDecoder())) + + decoder := jstream.NewDecoder(textReader, 2) // extract JSON values at a depth level of 1 return &JSONReader{vrw: vrw, closer: r, sch: sch, jsonStream: decoder}, nil } diff --git a/go/libraries/doltcore/table/typed/json/reader_test.go b/go/libraries/doltcore/table/typed/json/reader_test.go index 504ef50866..4fc9cd01f4 100644 --- a/go/libraries/doltcore/table/typed/json/reader_test.go +++ b/go/libraries/doltcore/table/typed/json/reader_test.go @@ -15,6 +15,7 @@ package json import ( + "bytes" "context" "io" "os" @@ -24,6 +25,8 @@ import ( "github.com/dolthub/go-mysql-server/sql" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "golang.org/x/text/encoding/unicode" + "golang.org/x/text/transform" "github.com/dolthub/dolt/go/libraries/doltcore/row" "github.com/dolthub/dolt/go/libraries/doltcore/schema" @@ -33,25 +36,7 @@ import ( "github.com/dolthub/dolt/go/store/types" ) -func TestReader(t *testing.T) { - testJSON := `{ - "rows": [ - { - "id": 0, - "first name": "tim", - "last name": "sehn" - }, - { - "id": 1, - "first name": "brian", - "last name": "hendriks" - } - ] - }` - - fs := filesys.EmptyInMemFS("/") - require.NoError(t, fs.WriteFile("file.json", []byte(testJSON), os.ModePerm)) - +func testGoodJSON(t *testing.T, getReader func(types.ValueReadWriter, schema.Schema) (*JSONReader, error)) { colColl := schema.NewColCollection( schema.Column{ Name: "id", @@ -83,7 +68,7 @@ func TestReader(t *testing.T) { require.NoError(t, err) vrw := types.NewMemoryValueStore() - reader, err := OpenJSONReader(vrw, "file.json", fs, sch) + reader, err := getReader(vrw, sch) require.NoError(t, err) 
verifySchema, err := reader.VerifySchema(sch) @@ -109,6 +94,75 @@ func TestReader(t *testing.T) { assert.Equal(t, enginetest.WidenRows(sqlSch.Schema, expectedRows), rows) } +func TestReader(t *testing.T) { + testJSON := `{ + "rows": [ + { + "id": 0, + "first name": "tim", + "last name": "sehn" + }, + { + "id": 1, + "first name": "brian", + "last name": "hendriks" + } + ] + }` + + fs := filesys.EmptyInMemFS("/") + require.NoError(t, fs.WriteFile("file.json", []byte(testJSON), os.ModePerm)) + + testGoodJSON(t, func(vrw types.ValueReadWriter, sch schema.Schema) (*JSONReader, error) { + return OpenJSONReader(vrw, "file.json", fs, sch) + }) +} + +func TestReaderBOMHandling(t *testing.T) { + testJSON := `{ + "rows": [ + { + "id": 0, + "first name": "tim", + "last name": "sehn" + }, + { + "id": 1, + "first name": "brian", + "last name": "hendriks" + } + ] + }` + t.Run("UTF-8", func(t *testing.T) { + bs := bytes.NewBuffer([]byte(testJSON)) + reader := transform.NewReader(bs, unicode.UTF8.NewEncoder()) + testGoodJSON(t, func(vrw types.ValueReadWriter, sch schema.Schema) (*JSONReader, error) { + return NewJSONReader(vrw, io.NopCloser(reader), sch) + }) + }) + t.Run("UTF-8 BOM", func(t *testing.T) { + bs := bytes.NewBuffer([]byte(testJSON)) + reader := transform.NewReader(bs, unicode.UTF8BOM.NewEncoder()) + testGoodJSON(t, func(vrw types.ValueReadWriter, sch schema.Schema) (*JSONReader, error) { + return NewJSONReader(vrw, io.NopCloser(reader), sch) + }) + }) + t.Run("UTF-16 LE BOM", func(t *testing.T) { + bs := bytes.NewBuffer([]byte(testJSON)) + reader := transform.NewReader(bs, unicode.UTF16(unicode.LittleEndian, unicode.UseBOM).NewEncoder()) + testGoodJSON(t, func(vrw types.ValueReadWriter, sch schema.Schema) (*JSONReader, error) { + return NewJSONReader(vrw, io.NopCloser(reader), sch) + }) + }) + t.Run("UTF-16 BE BOM", func(t *testing.T) { + bs := bytes.NewBuffer([]byte(testJSON)) + reader := transform.NewReader(bs, unicode.UTF16(unicode.BigEndian, 
unicode.UseBOM).NewEncoder()) + testGoodJSON(t, func(vrw types.ValueReadWriter, sch schema.Schema) (*JSONReader, error) { + return NewJSONReader(vrw, io.NopCloser(reader), sch) + }) + }) +} + func TestReaderBadJson(t *testing.T) { testJSON := ` { "rows": [ diff --git a/go/libraries/doltcore/table/untyped/csv/reader.go b/go/libraries/doltcore/table/untyped/csv/reader.go index 74545e1fe2..7ae7f2519d 100644 --- a/go/libraries/doltcore/table/untyped/csv/reader.go +++ b/go/libraries/doltcore/table/untyped/csv/reader.go @@ -27,6 +27,8 @@ import ( "unicode/utf8" "github.com/dolthub/go-mysql-server/sql" + textunicode "golang.org/x/text/encoding/unicode" + "golang.org/x/text/transform" "github.com/dolthub/dolt/go/libraries/doltcore/row" "github.com/dolthub/dolt/go/libraries/doltcore/schema" @@ -73,6 +75,14 @@ func OpenCSVReader(nbf *types.NomsBinFormat, path string, fs filesys.ReadableFS, } // NewCSVReader creates a CSVReader from a given ReadCloser. The CSVFileInfo should describe the csv file being read. +// +// The interpretation of the bytes of the supplied reader is a little murky. If +// there is a UTF8, UTF16LE or UTF16BE BOM as the first bytes read, then the +// BOM is stripped and the remaining contents of the reader are treated as that +// encoding. If we are not in any of those marked encodings, then some of the +// bytes go uninterpreted until we get to the SQL layer. It is currently the +// case that newlines must be encoded as a '0xa' byte and the delimiter must +// match |info.Delim|. 
func NewCSVReader(nbf *types.NomsBinFormat, r io.ReadCloser, info *CSVFileInfo) (*CSVReader, error) { if len(info.Delim) < 1 { return nil, fmt.Errorf("delimiter '%s' has invalid length", info.Delim) @@ -81,7 +91,9 @@ func NewCSVReader(nbf *types.NomsBinFormat, r io.ReadCloser, info *CSVFileInfo) return nil, fmt.Errorf("invalid delimiter: %s", string(info.Delim)) } - br := bufio.NewReaderSize(r, ReadBufSize) + textReader := transform.NewReader(r, textunicode.BOMOverride(transform.Nop)) + + br := bufio.NewReaderSize(textReader, ReadBufSize) colStrs, err := getColHeaders(br, info) if err != nil { @@ -102,18 +114,6 @@ func NewCSVReader(nbf *types.NomsBinFormat, r io.ReadCloser, info *CSVFileInfo) }, nil } -// trimBOM checks if the given string has the Byte Order Mark, and removes it if it is -// the BOM is there if the first 3 bytes are xEF\xBB\xBF and indicates that a file is in UTF-8 encoding -func trimBOM(s string) string { - if len(s) < 3 { - return s - } - if s[0] == '\xEF' && s[1] == '\xBB' && s[2] == '\xBF' { - return s[3:] - } - return s -} - func getColHeaders(br *bufio.Reader, info *CSVFileInfo) ([]string, error) { colStrs := info.Columns if info.HasHeaderLine { @@ -124,7 +124,6 @@ func getColHeaders(br *bufio.Reader, info *CSVFileInfo) ([]string, error) { } else if strings.TrimSpace(line) == "" { return nil, errors.New("Header line is empty") } - line = trimBOM(line) colStrsFromFile, err := csvSplitLine(line, info.Delim, info.EscapeQuotes) if err != nil { diff --git a/go/libraries/doltcore/table/untyped/csv/reader_test.go b/go/libraries/doltcore/table/untyped/csv/reader_test.go index 01abc4ae20..a821fa03eb 100644 --- a/go/libraries/doltcore/table/untyped/csv/reader_test.go +++ b/go/libraries/doltcore/table/untyped/csv/reader_test.go @@ -20,6 +20,11 @@ import ( "strings" "testing" + "github.com/stretchr/testify/require" + "golang.org/x/text/encoding" + "golang.org/x/text/encoding/unicode" + "golang.org/x/text/transform" + 
"github.com/dolthub/dolt/go/libraries/doltcore/row" "github.com/dolthub/dolt/go/libraries/doltcore/table" "github.com/dolthub/dolt/go/libraries/doltcore/table/untyped" @@ -67,6 +72,13 @@ func mustRow(r row.Row, err error) row.Row { return r } +func mustEncodeBytes(t *testing.T, bs []byte, enc encoding.Encoding) []byte { + ret, n, err := transform.Bytes(enc.NewEncoder(), bs) + require.NoError(t, err) + require.Equal(t, n, len(bs)) + return ret +} + func TestReader(t *testing.T) { colNames := []string{"name", "age", "title"} _, sch := untyped.NewUntypedSchema(colNames...) @@ -82,33 +94,42 @@ func TestReader(t *testing.T) { mustRow(untyped.NewRowFromStrings(types.Format_Default, sch, []string{"Jack Jackson", "27"})), } + utf8bomBytes := mustEncodeBytes(t, []byte(PersonDB1), unicode.UTF8BOM) + require.Equal(t, utf8bomBytes[0:3], []byte{0xEF, 0xBB, 0xBF}) + utf16leBytes := mustEncodeBytes(t, []byte(PersonDB1), unicode.UTF16(unicode.LittleEndian, unicode.UseBOM)) + utf16beBytes := mustEncodeBytes(t, []byte(PersonDB1), unicode.UTF16(unicode.BigEndian, unicode.UseBOM)) + tests := []struct { - inputStr string + input []byte expectedRows []row.Row info *CSVFileInfo }{ - {PersonDB1, goodExpectedRows, NewCSVInfo()}, - {PersonDB2, goodExpectedRows, NewCSVInfo()}, - {PersonDB3, goodExpectedRows, NewCSVInfo()}, + {[]byte(PersonDB1), goodExpectedRows, NewCSVInfo()}, + {[]byte(PersonDB2), goodExpectedRows, NewCSVInfo()}, + {[]byte(PersonDB3), goodExpectedRows, NewCSVInfo()}, - {PersonDBWithBadRow, badExpectedRows, NewCSVInfo()}, - {PersonDBWithBadRow2, badExpectedRows, NewCSVInfo()}, - {PersonDBWithBadRow3, badExpectedRows, NewCSVInfo()}, + {utf8bomBytes, goodExpectedRows, NewCSVInfo()}, + {utf16leBytes, goodExpectedRows, NewCSVInfo()}, + {utf16beBytes, goodExpectedRows, NewCSVInfo()}, + + {[]byte(PersonDBWithBadRow), badExpectedRows, NewCSVInfo()}, + {[]byte(PersonDBWithBadRow2), badExpectedRows, NewCSVInfo()}, + {[]byte(PersonDBWithBadRow3), badExpectedRows, NewCSVInfo()}, { - 
PersonDBWithoutHeaders, + []byte(PersonDBWithoutHeaders), goodExpectedRows, NewCSVInfo().SetHasHeaderLine(false).SetColumns(colNames), }, { - PersonDBDifferentHeaders, + []byte(PersonDBDifferentHeaders), goodExpectedRows, NewCSVInfo().SetHasHeaderLine(true).SetColumns(colNames), }, } for _, test := range tests { - rows, numBad, err := readTestRows(t, test.inputStr, test.info) + rows, numBad, err := readTestRows(t, test.input, test.info) if err != nil { t.Fatal("Unexpected Error:", err) @@ -136,11 +157,11 @@ func TestReader(t *testing.T) { } } -func readTestRows(t *testing.T, inputStr string, info *CSVFileInfo) ([]row.Row, int, error) { +func readTestRows(t *testing.T, input []byte, info *CSVFileInfo) ([]row.Row, int, error) { const root = "/" const path = "/file.csv" - fs := filesys.NewInMemFS(nil, map[string][]byte{path: []byte(inputStr)}, root) + fs := filesys.NewInMemFS(nil, map[string][]byte{path: input}, root) csvR, err := OpenCSVReader(types.Format_Default, path, fs, info) defer csvR.Close(context.Background()) diff --git a/go/libraries/events/emitter.go b/go/libraries/events/emitter.go index d8df06e976..1b78613fb8 100644 --- a/go/libraries/events/emitter.go +++ b/go/libraries/events/emitter.go @@ -32,7 +32,7 @@ import ( "github.com/dolthub/dolt/go/libraries/utils/iohelp" ) -// Application is the application ID used for all events emitted by this application. Other applications (not dolt) +// Application is the application ID used for all events emitted by this application. Other applications (not dolt) // should set this once at initialization. 
var Application = eventsapi.AppID_APP_DOLT diff --git a/integration-tests/bats/helper/employees-tbl-schema-unordered.utf16bebom.csv b/integration-tests/bats/helper/employees-tbl-schema-unordered.utf16bebom.csv new file mode 100644 index 0000000000000000000000000000000000000000..84a09ac7c1df2c42dc2661a8e5747a7e1a4ea6b4 GIT binary patch literal 344 zcmb7=u@1s83`B26;vb^S48R`+NY#p#N^SXl;MoaC?Zy)OY~T5u*IS)#Iw)u%ThuE% zn#q$U-atPQOFioWUe_9!lZ+AQCJlAACtp{C_S)&_8=|hvjfxd*qxl(r-I>`I_2>$x u%4y&~-BHHDR^f}2Uotw|E{(V<$lRR&(z>vxL(iNPtrlnYCsx0s><&NHNihKc literal 0 HcmV?d00001 diff --git a/integration-tests/bats/helper/employees-tbl-schema-unordered.utf16lebom.csv b/integration-tests/bats/helper/employees-tbl-schema-unordered.utf16lebom.csv new file mode 100644 index 0000000000000000000000000000000000000000..f7aaf9b5a8dfd7e47865bd7a5b0342d820f85454 GIT binary patch literal 344 zcmb7=u@1s83`B26;vb^S48R`+Xw`_8N-g|8@a&{W?Z%MAXZz0QeBN0%9TfCb=JaYD z&BRF)cc7klOTN(qyw0^SCmAD9O&V%%PrS|s?X}aG3M4vf4}=N`3m2sxij7t<w)?=xzJ6Wdnt_3 z^JjgtOGxhOHa4WB6PX6X&EE_U+{lun?~?osQ(lQ&f#gRxy4RxJ{?vGO7rJ-vw89g% CYjw&1 literal 0 HcmV?d00001 diff --git a/integration-tests/bats/helper/employees-tbl.utf8bom.json b/integration-tests/bats/helper/employees-tbl.utf8bom.json new file mode 100644 index 0000000000..a9fe2cd6cb --- /dev/null +++ b/integration-tests/bats/helper/employees-tbl.utf8bom.json @@ -0,0 +1,28 @@ +{ + "rows": [ + { + "id": 0, + "first name": "tim", + "last name": "sehn", + "title": "ceo", + "start date": "", + "end date": "" + }, + { + "id": 1, + "first name": "aaron", + "last name": "son", + "title": "founder", + "start date": "", + "end date": "" + }, + { + "id": 2, + "first name": "brian", + "last name": "hendricks", + "title": "founder", + "start date": "", + "end date": "" + } + ] +} \ No newline at end of file diff --git a/integration-tests/bats/import-create-tables.bats b/integration-tests/bats/import-create-tables.bats index 65fce7575d..5f885d7cfa 100755 --- 
a/integration-tests/bats/import-create-tables.bats +++ b/integration-tests/bats/import-create-tables.bats @@ -60,23 +60,6 @@ teardown() { teardown_common } -@test "import-create-tables: correctly ignores byte order mark (BOM)" { - printf '\xEF\xBB\xBF' > bom.csv - cat <> bom.csv -c1,c2 -1,2 -DELIM - - run dolt table import -c bom bom.csv - [ "$status" -eq 0 ] - [[ "$output" =~ "Rows Processed: 1, Additions: 1, Modifications: 0, Had No Effect: 0" ]] || false - [[ "$output" =~ "Import completed successfully." ]] || false - - run dolt sql -q "select c1 from bom" - [ "$status" -eq 0 ] - [[ "$output" =~ "1" ]] || false -} - @test "import-create-tables: create a table with json import" { run dolt table import -c -s `batshelper employees-sch.sql` employees `batshelper employees-tbl.json` [ "$status" -eq 0 ] @@ -90,6 +73,46 @@ DELIM [ "${#lines[@]}" -eq 7 ] } +@test "import-create-tables: create a table with json import, utf8 with bom" { + run dolt table import -c -s `batshelper employees-sch.sql` employees `batshelper employees-tbl.utf8bom.json` + echo "$output" + [ "$status" -eq 0 ] + [[ "$output" =~ "Import completed successfully." ]] || false + run dolt ls + [ "$status" -eq 0 ] + [[ "$output" =~ "employees" ]] || false + run dolt sql -q "select * from employees" + [ "$status" -eq 0 ] + [[ "$output" =~ "tim" ]] || false + [ "${#lines[@]}" -eq 7 ] +} + +@test "import-create-tables: create a table with json import, utf16le with bom" { + run dolt table import -c -s `batshelper employees-sch.sql` employees `batshelper employees-tbl.utf16lebom.json` + [ "$status" -eq 0 ] + [[ "$output" =~ "Import completed successfully." 
]] || false + run dolt ls + [ "$status" -eq 0 ] + [[ "$output" =~ "employees" ]] || false + run dolt sql -q "select * from employees" + [ "$status" -eq 0 ] + [[ "$output" =~ "tim" ]] || false + [ "${#lines[@]}" -eq 7 ] +} + +@test "import-create-tables: create a table with json import, utf16be with bom" { + run dolt table import -c -s `batshelper employees-sch.sql` employees `batshelper employees-tbl.utf16bebom.json` + [ "$status" -eq 0 ] + [[ "$output" =~ "Import completed successfully." ]] || false + run dolt ls + [ "$status" -eq 0 ] + [[ "$output" =~ "employees" ]] || false + run dolt sql -q "select * from employees" + [ "$status" -eq 0 ] + [[ "$output" =~ "tim" ]] || false + [ "${#lines[@]}" -eq 7 ] +} + @test "import-create-tables: create a table with json import. no schema." { run dolt table import -c employees `batshelper employees-tbl.json` [ "$status" -ne 0 ] diff --git a/integration-tests/bats/import-update-tables.bats b/integration-tests/bats/import-update-tables.bats index 8b4c4ce967..8888c3ceb6 100644 --- a/integration-tests/bats/import-update-tables.bats +++ b/integration-tests/bats/import-update-tables.bats @@ -270,6 +270,84 @@ SQL [[ "${lines[6]}" =~ "end date" ]] || false } +@test "import-update-tables: update table with a csv with columns in different order, utf8 with bom" { + dolt sql < 1pk1col-rpt-chars.csv pk,c