dolt table import: json,csv: Support BOM file headers.

The semantics are as follows:

For CSV files, the default import is an uninterpreted character encoding where
newline has to match 0xa and the delimiters have to match. In general Dolt
expects UTF8, but non-UTF8 characters in string fields can make it through to
the imported table for encodings which are close enough to ASCII, for example.
If there is a UTF8, UTF16LE or UTF16BE BOM header, then character decoding of
the input stream switches to the indicated encoding.

For JSON files, the default import is UTF8 character encoding. If there is a
UTF8, UTF16LE or UTF16BE BOM header, then character decoding of the input
stream switches to the indicated encoding.
This commit is contained in:
Aaron Son
2023-12-07 14:59:33 -08:00
parent db3472ab54
commit 04bd70ad2e
13 changed files with 279 additions and 65 deletions

View File

@@ -22,6 +22,8 @@ import (
"github.com/bcicen/jstream"
"github.com/dolthub/go-mysql-server/sql"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
"github.com/dolthub/dolt/go/libraries/doltcore/row"
"github.com/dolthub/dolt/go/libraries/doltcore/schema"
@@ -52,12 +54,17 @@ func OpenJSONReader(vrw types.ValueReadWriter, path string, fs filesys.ReadableF
return NewJSONReader(vrw, r, sch)
}
// The bytes of the supplied reader are treated as UTF-8. If there is a UTF8,
// UTF16LE or UTF16BE BOM at the first bytes read, then it is stripped and the
// remaining contents of the reader are treated as that encoding.
func NewJSONReader(vrw types.ValueReadWriter, r io.ReadCloser, sch schema.Schema) (*JSONReader, error) {
if sch == nil {
return nil, errors.New("schema must be provided to JsonReader")
}
decoder := jstream.NewDecoder(r, 2) // extract JSON values at a depth level of 1
textReader := transform.NewReader(r, unicode.BOMOverride(unicode.UTF8.NewDecoder()))
decoder := jstream.NewDecoder(textReader, 2) // extract JSON values at a depth level of 1
return &JSONReader{vrw: vrw, closer: r, sch: sch, jsonStream: decoder}, nil
}

View File

@@ -15,6 +15,7 @@
package json
import (
"bytes"
"context"
"io"
"os"
@@ -24,6 +25,8 @@ import (
"github.com/dolthub/go-mysql-server/sql"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
"github.com/dolthub/dolt/go/libraries/doltcore/row"
"github.com/dolthub/dolt/go/libraries/doltcore/schema"
@@ -33,25 +36,7 @@ import (
"github.com/dolthub/dolt/go/store/types"
)
func TestReader(t *testing.T) {
testJSON := `{
"rows": [
{
"id": 0,
"first name": "tim",
"last name": "sehn"
},
{
"id": 1,
"first name": "brian",
"last name": "hendriks"
}
]
}`
fs := filesys.EmptyInMemFS("/")
require.NoError(t, fs.WriteFile("file.json", []byte(testJSON), os.ModePerm))
func testGoodJSON(t *testing.T, getReader func(types.ValueReadWriter, schema.Schema) (*JSONReader, error)) {
colColl := schema.NewColCollection(
schema.Column{
Name: "id",
@@ -83,7 +68,7 @@ func TestReader(t *testing.T) {
require.NoError(t, err)
vrw := types.NewMemoryValueStore()
reader, err := OpenJSONReader(vrw, "file.json", fs, sch)
reader, err := getReader(vrw, sch)
require.NoError(t, err)
verifySchema, err := reader.VerifySchema(sch)
@@ -109,6 +94,75 @@ func TestReader(t *testing.T) {
assert.Equal(t, enginetest.WidenRows(sqlSch.Schema, expectedRows), rows)
}
// TestReader verifies the happy path: a plain (BOM-less) UTF-8 JSON document
// written to an in-memory filesystem and opened through OpenJSONReader yields
// the expected rows (delegated to the shared testGoodJSON helper).
func TestReader(t *testing.T) {
testJSON := `{
	"rows": [
		{
			"id": 0,
			"first name": "tim",
			"last name": "sehn"
		},
		{
			"id": 1,
			"first name": "brian",
			"last name": "hendriks"
		}
	]
}`
fs := filesys.EmptyInMemFS("/")
require.NoError(t, fs.WriteFile("file.json", []byte(testJSON), os.ModePerm))
// testGoodJSON performs the schema/row assertions against whatever reader the
// callback constructs; here that is a file-backed reader.
testGoodJSON(t, func(vrw types.ValueReadWriter, sch schema.Schema) (*JSONReader, error) {
return OpenJSONReader(vrw, "file.json", fs, sch)
})
}
// TestReaderBOMHandling verifies that NewJSONReader decodes input correctly
// for each supported encoding: plain UTF-8, UTF-8 with a BOM, and UTF-16 in
// both little- and big-endian byte order with a BOM. Each subtest re-encodes
// the same UTF-8 source document with the encoder under test and expects the
// reader to strip the BOM (if any) and produce identical rows.
func TestReaderBOMHandling(t *testing.T) {
testJSON := `{
	"rows": [
		{
			"id": 0,
			"first name": "tim",
			"last name": "sehn"
		},
		{
			"id": 1,
			"first name": "brian",
			"last name": "hendriks"
		}
	]
}`
// No BOM: bytes pass through the UTF-8 encoder unchanged.
t.Run("UTF-8", func(t *testing.T) {
bs := bytes.NewBuffer([]byte(testJSON))
reader := transform.NewReader(bs, unicode.UTF8.NewEncoder())
testGoodJSON(t, func(vrw types.ValueReadWriter, sch schema.Schema) (*JSONReader, error) {
return NewJSONReader(vrw, io.NopCloser(reader), sch)
})
})
// UTF8BOM prepends EF BB BF; the reader must strip it before parsing.
t.Run("UTF-8 BOM", func(t *testing.T) {
bs := bytes.NewBuffer([]byte(testJSON))
reader := transform.NewReader(bs, unicode.UTF8BOM.NewEncoder())
testGoodJSON(t, func(vrw types.ValueReadWriter, sch schema.Schema) (*JSONReader, error) {
return NewJSONReader(vrw, io.NopCloser(reader), sch)
})
})
// UseBOM makes the UTF-16 encoders emit a BOM, which is what lets the
// reader detect the byte order of the stream.
t.Run("UTF-16 LE BOM", func(t *testing.T) {
bs := bytes.NewBuffer([]byte(testJSON))
reader := transform.NewReader(bs, unicode.UTF16(unicode.LittleEndian, unicode.UseBOM).NewEncoder())
testGoodJSON(t, func(vrw types.ValueReadWriter, sch schema.Schema) (*JSONReader, error) {
return NewJSONReader(vrw, io.NopCloser(reader), sch)
})
})
t.Run("UTF-16 BE BOM", func(t *testing.T) {
bs := bytes.NewBuffer([]byte(testJSON))
reader := transform.NewReader(bs, unicode.UTF16(unicode.BigEndian, unicode.UseBOM).NewEncoder())
testGoodJSON(t, func(vrw types.ValueReadWriter, sch schema.Schema) (*JSONReader, error) {
return NewJSONReader(vrw, io.NopCloser(reader), sch)
})
})
}
func TestReaderBadJson(t *testing.T) {
testJSON := ` {
"rows": [

View File

@@ -27,6 +27,8 @@ import (
"unicode/utf8"
"github.com/dolthub/go-mysql-server/sql"
textunicode "golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
"github.com/dolthub/dolt/go/libraries/doltcore/row"
"github.com/dolthub/dolt/go/libraries/doltcore/schema"
@@ -73,6 +75,14 @@ func OpenCSVReader(nbf *types.NomsBinFormat, path string, fs filesys.ReadableFS,
}
// NewCSVReader creates a CSVReader from a given ReadCloser. The CSVFileInfo should describe the csv file being read.
//
// The interpretation of the bytes of the supplied reader is a little murky. If
// there is a UTF8, UTF16LE or UTF16BE BOM as the first bytes read, then the
// BOM is stripped and the remaining contents of the reader are treated as that
// encoding. If we are not in any of those marked encodings, then some of the
// bytes go uninterpreted until we get to the SQL layer. It is currently the
// case that newlines must be encoded as a '0xa' byte and the delimiter must
// match |info.Delim|.
func NewCSVReader(nbf *types.NomsBinFormat, r io.ReadCloser, info *CSVFileInfo) (*CSVReader, error) {
if len(info.Delim) < 1 {
return nil, fmt.Errorf("delimiter '%s' has invalid length", info.Delim)
@@ -81,7 +91,9 @@ func NewCSVReader(nbf *types.NomsBinFormat, r io.ReadCloser, info *CSVFileInfo)
return nil, fmt.Errorf("invalid delimiter: %s", string(info.Delim))
}
br := bufio.NewReaderSize(r, ReadBufSize)
textReader := transform.NewReader(r, textunicode.BOMOverride(transform.Nop))
br := bufio.NewReaderSize(textReader, ReadBufSize)
colStrs, err := getColHeaders(br, info)
if err != nil {
@@ -102,18 +114,6 @@ func NewCSVReader(nbf *types.NomsBinFormat, r io.ReadCloser, info *CSVFileInfo)
}, nil
}
// trimBOM strips a leading UTF-8 byte order mark (the three bytes
// 0xEF 0xBB 0xBF) from s. Strings without that exact prefix — including
// strings shorter than three bytes — are returned unchanged.
func trimBOM(s string) string {
	const utf8BOM = "\xEF\xBB\xBF"
	if strings.HasPrefix(s, utf8BOM) {
		return s[len(utf8BOM):]
	}
	return s
}
func getColHeaders(br *bufio.Reader, info *CSVFileInfo) ([]string, error) {
colStrs := info.Columns
if info.HasHeaderLine {
@@ -124,7 +124,6 @@ func getColHeaders(br *bufio.Reader, info *CSVFileInfo) ([]string, error) {
} else if strings.TrimSpace(line) == "" {
return nil, errors.New("Header line is empty")
}
line = trimBOM(line)
colStrsFromFile, err := csvSplitLine(line, info.Delim, info.EscapeQuotes)
if err != nil {

View File

@@ -20,6 +20,11 @@ import (
"strings"
"testing"
"github.com/stretchr/testify/require"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
"github.com/dolthub/dolt/go/libraries/doltcore/row"
"github.com/dolthub/dolt/go/libraries/doltcore/table"
"github.com/dolthub/dolt/go/libraries/doltcore/table/untyped"
@@ -67,6 +72,13 @@ func mustRow(r row.Row, err error) row.Row {
return r
}
// mustEncodeBytes transcodes bs (UTF-8 test fixture text) into the byte
// representation of enc, failing the test on any transcoding error or if the
// encoder did not consume the entire input.
func mustEncodeBytes(t *testing.T, bs []byte, enc encoding.Encoding) []byte {
// transform.Bytes returns (dst, nSrc, err); n is the count of source bytes consumed.
ret, n, err := transform.Bytes(enc.NewEncoder(), bs)
require.NoError(t, err)
// NOTE(review): require.Equal's signature is (t, expected, actual); the
// arguments here read as (actual, expected). Equality is symmetric so the
// check is still correct, but a failure message would label the values
// backwards.
require.Equal(t, n, len(bs))
return ret
}
func TestReader(t *testing.T) {
colNames := []string{"name", "age", "title"}
_, sch := untyped.NewUntypedSchema(colNames...)
@@ -82,33 +94,42 @@ func TestReader(t *testing.T) {
mustRow(untyped.NewRowFromStrings(types.Format_Default, sch, []string{"Jack Jackson", "27"})),
}
utf8bomBytes := mustEncodeBytes(t, []byte(PersonDB1), unicode.UTF8BOM)
require.Equal(t, utf8bomBytes[0:3], []byte{0xEF, 0xBB, 0xBF})
utf16leBytes := mustEncodeBytes(t, []byte(PersonDB1), unicode.UTF16(unicode.LittleEndian, unicode.UseBOM))
utf16beBytes := mustEncodeBytes(t, []byte(PersonDB1), unicode.UTF16(unicode.BigEndian, unicode.UseBOM))
tests := []struct {
inputStr string
input []byte
expectedRows []row.Row
info *CSVFileInfo
}{
{PersonDB1, goodExpectedRows, NewCSVInfo()},
{PersonDB2, goodExpectedRows, NewCSVInfo()},
{PersonDB3, goodExpectedRows, NewCSVInfo()},
{[]byte(PersonDB1), goodExpectedRows, NewCSVInfo()},
{[]byte(PersonDB2), goodExpectedRows, NewCSVInfo()},
{[]byte(PersonDB3), goodExpectedRows, NewCSVInfo()},
{PersonDBWithBadRow, badExpectedRows, NewCSVInfo()},
{PersonDBWithBadRow2, badExpectedRows, NewCSVInfo()},
{PersonDBWithBadRow3, badExpectedRows, NewCSVInfo()},
{utf8bomBytes, goodExpectedRows, NewCSVInfo()},
{utf16leBytes, goodExpectedRows, NewCSVInfo()},
{utf16beBytes, goodExpectedRows, NewCSVInfo()},
{[]byte(PersonDBWithBadRow), badExpectedRows, NewCSVInfo()},
{[]byte(PersonDBWithBadRow2), badExpectedRows, NewCSVInfo()},
{[]byte(PersonDBWithBadRow3), badExpectedRows, NewCSVInfo()},
{
PersonDBWithoutHeaders,
[]byte(PersonDBWithoutHeaders),
goodExpectedRows,
NewCSVInfo().SetHasHeaderLine(false).SetColumns(colNames),
},
{
PersonDBDifferentHeaders,
[]byte(PersonDBDifferentHeaders),
goodExpectedRows,
NewCSVInfo().SetHasHeaderLine(true).SetColumns(colNames),
},
}
for _, test := range tests {
rows, numBad, err := readTestRows(t, test.inputStr, test.info)
rows, numBad, err := readTestRows(t, test.input, test.info)
if err != nil {
t.Fatal("Unexpected Error:", err)
@@ -136,11 +157,11 @@ func TestReader(t *testing.T) {
}
}
func readTestRows(t *testing.T, inputStr string, info *CSVFileInfo) ([]row.Row, int, error) {
func readTestRows(t *testing.T, input []byte, info *CSVFileInfo) ([]row.Row, int, error) {
const root = "/"
const path = "/file.csv"
fs := filesys.NewInMemFS(nil, map[string][]byte{path: []byte(inputStr)}, root)
fs := filesys.NewInMemFS(nil, map[string][]byte{path: input}, root)
csvR, err := OpenCSVReader(types.Format_Default, path, fs, info)
defer csvR.Close(context.Background())

View File

@@ -32,7 +32,7 @@ import (
"github.com/dolthub/dolt/go/libraries/utils/iohelp"
)
// Application is the application ID used for all events emitted by this application. Other applications (not dolt)
// Application is the application ID used for all events emitted by this application. Other applications (not dolt)
// should set this once at initialization.
var Application = eventsapi.AppID_APP_DOLT

Binary file not shown.
1 id title start date end date first name last name
2 0 ceo tim sehn
3 1 founder aaron son
4 2 founder brian hendriks

Binary file not shown.
1 id title start date end date first name last name
2 0 ceo tim sehn
3 1 founder aaron son
4 2 founder brian hendriks

View File

@@ -0,0 +1,4 @@
id, title, start date, end date, first name, last name
0, "ceo", "", "", "tim", "sehn"
1, "founder", "", "", "aaron", "son"
2, "founder", "", "", "brian", "hendriks"
1 id title start date end date first name last name
2 0 ceo tim sehn
3 1 founder aaron son
4 2 founder brian hendriks

View File

@@ -0,0 +1,28 @@
{
"rows": [
{
"id": 0,
"first name": "tim",
"last name": "sehn",
"title": "ceo",
"start date": "",
"end date": ""
},
{
"id": 1,
"first name": "aaron",
"last name": "son",
"title": "founder",
"start date": "",
"end date": ""
},
{
"id": 2,
"first name": "brian",
"last name": "hendricks",
"title": "founder",
"start date": "",
"end date": ""
}
]
}

View File

@@ -60,23 +60,6 @@ teardown() {
teardown_common
}
@test "import-create-tables: correctly ignores byte order mark (BOM)" {
printf '\xEF\xBB\xBF' > bom.csv
cat <<DELIM >> bom.csv
c1,c2
1,2
DELIM
run dolt table import -c bom bom.csv
[ "$status" -eq 0 ]
[[ "$output" =~ "Rows Processed: 1, Additions: 1, Modifications: 0, Had No Effect: 0" ]] || false
[[ "$output" =~ "Import completed successfully." ]] || false
run dolt sql -q "select c1 from bom"
[ "$status" -eq 0 ]
[[ "$output" =~ "1" ]] || false
}
@test "import-create-tables: create a table with json import" {
run dolt table import -c -s `batshelper employees-sch.sql` employees `batshelper employees-tbl.json`
[ "$status" -eq 0 ]
@@ -90,6 +73,46 @@ DELIM
[ "${#lines[@]}" -eq 7 ]
}
@test "import-create-tables: create a table with json import, utf8 with bom" {
run dolt table import -c -s `batshelper employees-sch.sql` employees `batshelper employees-tbl.utf8bom.json`
echo "$output"
[ "$status" -eq 0 ]
[[ "$output" =~ "Import completed successfully." ]] || false
run dolt ls
[ "$status" -eq 0 ]
[[ "$output" =~ "employees" ]] || false
run dolt sql -q "select * from employees"
[ "$status" -eq 0 ]
[[ "$output" =~ "tim" ]] || false
[ "${#lines[@]}" -eq 7 ]
}
@test "import-create-tables: create a table with json import, utf16le with bom" {
run dolt table import -c -s `batshelper employees-sch.sql` employees `batshelper employees-tbl.utf16lebom.json`
[ "$status" -eq 0 ]
[[ "$output" =~ "Import completed successfully." ]] || false
run dolt ls
[ "$status" -eq 0 ]
[[ "$output" =~ "employees" ]] || false
run dolt sql -q "select * from employees"
[ "$status" -eq 0 ]
[[ "$output" =~ "tim" ]] || false
[ "${#lines[@]}" -eq 7 ]
}
@test "import-create-tables: create a table with json import, utf16be with bom" {
run dolt table import -c -s `batshelper employees-sch.sql` employees `batshelper employees-tbl.utf16bebom.json`
[ "$status" -eq 0 ]
[[ "$output" =~ "Import completed successfully." ]] || false
run dolt ls
[ "$status" -eq 0 ]
[[ "$output" =~ "employees" ]] || false
run dolt sql -q "select * from employees"
[ "$status" -eq 0 ]
[[ "$output" =~ "tim" ]] || false
[ "${#lines[@]}" -eq 7 ]
}
@test "import-create-tables: create a table with json import. no schema." {
run dolt table import -c employees `batshelper employees-tbl.json`
[ "$status" -ne 0 ]

View File

@@ -270,6 +270,84 @@ SQL
[[ "${lines[6]}" =~ "end date" ]] || false
}
@test "import-update-tables: update table with a csv with columns in different order, utf8 with bom" {
dolt sql <<SQL
CREATE TABLE employees (
\`id\` varchar(20) NOT NULL COMMENT 'tag:0',
\`first name\` LONGTEXT COMMENT 'tag:1',
\`last name\` LONGTEXT COMMENT 'tag:2',
\`title\` LONGTEXT COMMENT 'tag:3',
\`start date\` LONGTEXT COMMENT 'tag:4',
\`end date\` LONGTEXT COMMENT 'tag:5',
PRIMARY KEY (id)
);
SQL
run dolt table import -u employees `batshelper employees-tbl-schema-unordered.utf8bom.csv`
[ "$status" -eq 0 ]
[[ "$output" =~ "Rows Processed: 3, Additions: 3, Modifications: 0, Had No Effect: 0" ]] || false
[[ "$output" =~ "Import completed successfully." ]] || false
run dolt schema export employees
[[ "$status" -eq 0 ]] || false
[[ "${lines[1]}" =~ "id" ]] || false
[[ "${lines[2]}" =~ "first name" ]] || false
[[ "${lines[3]}" =~ "last name" ]] || false
[[ "${lines[4]}" =~ "title" ]] || false
[[ "${lines[5]}" =~ "start date" ]] || false
[[ "${lines[6]}" =~ "end date" ]] || false
}
@test "import-update-tables: update table with a csv with columns in different order, utf16le with bom" {
dolt sql <<SQL
CREATE TABLE employees (
\`id\` varchar(20) NOT NULL COMMENT 'tag:0',
\`first name\` LONGTEXT COMMENT 'tag:1',
\`last name\` LONGTEXT COMMENT 'tag:2',
\`title\` LONGTEXT COMMENT 'tag:3',
\`start date\` LONGTEXT COMMENT 'tag:4',
\`end date\` LONGTEXT COMMENT 'tag:5',
PRIMARY KEY (id)
);
SQL
run dolt table import -u employees `batshelper employees-tbl-schema-unordered.utf16lebom.csv`
[ "$status" -eq 0 ]
[[ "$output" =~ "Rows Processed: 3, Additions: 3, Modifications: 0, Had No Effect: 0" ]] || false
[[ "$output" =~ "Import completed successfully." ]] || false
run dolt schema export employees
[[ "$status" -eq 0 ]] || false
[[ "${lines[1]}" =~ "id" ]] || false
[[ "${lines[2]}" =~ "first name" ]] || false
[[ "${lines[3]}" =~ "last name" ]] || false
[[ "${lines[4]}" =~ "title" ]] || false
[[ "${lines[5]}" =~ "start date" ]] || false
[[ "${lines[6]}" =~ "end date" ]] || false
}
@test "import-update-tables: update table with a csv with columns in different order, utf16be with bom" {
dolt sql <<SQL
CREATE TABLE employees (
\`id\` varchar(20) NOT NULL COMMENT 'tag:0',
\`first name\` LONGTEXT COMMENT 'tag:1',
\`last name\` LONGTEXT COMMENT 'tag:2',
\`title\` LONGTEXT COMMENT 'tag:3',
\`start date\` LONGTEXT COMMENT 'tag:4',
\`end date\` LONGTEXT COMMENT 'tag:5',
PRIMARY KEY (id)
);
SQL
run dolt table import -u employees `batshelper employees-tbl-schema-unordered.utf16bebom.csv`
[ "$status" -eq 0 ]
[[ "$output" =~ "Rows Processed: 3, Additions: 3, Modifications: 0, Had No Effect: 0" ]] || false
[[ "$output" =~ "Import completed successfully." ]] || false
run dolt schema export employees
[[ "$status" -eq 0 ]] || false
[[ "${lines[1]}" =~ "id" ]] || false
[[ "${lines[2]}" =~ "first name" ]] || false
[[ "${lines[3]}" =~ "last name" ]] || false
[[ "${lines[4]}" =~ "title" ]] || false
[[ "${lines[5]}" =~ "start date" ]] || false
[[ "${lines[6]}" =~ "end date" ]] || false
}
@test "import-update-tables: updating table by inputting string longer than char column throws an error" {
cat <<DELIM > 1pk1col-rpt-chars.csv
pk,c