From 04bd70ad2e2932193d51b78f5d87432279dd56de Mon Sep 17 00:00:00 2001
From: Aaron Son <aaron@dolthub.com>
Date: Thu, 7 Dec 2023 14:59:33 -0800
Subject: [PATCH] dolt table import: json,csv: Support BOM file headers.

The semantics are as follows:

For CSV files, the default import is an uninterpreted character encoding where
newline has to match 0xa and the delimiters have to match. In general Dolt
expects UTF8, but non-UTF8 characters in string fields can make it through to
the imported table for encodings which are close enough to ASCII, for example.
If there is a UTF8, UTF16LE or UTF16BE BOM header, then character decoding of
the input stream switches to the indicated encoding.

For JSON files, the default import is UTF8 character encoding. If there is a
UTF8, UTF16LE or UTF16BE BOM header, then character decoding of the input
stream switches to the indicated encoding.
---
 .../doltcore/table/typed/json/reader.go       |   9 +-
 .../doltcore/table/typed/json/reader_test.go  |  94 ++++++++++++++----
 .../doltcore/table/untyped/csv/reader.go      |  27 +++--
 .../doltcore/table/untyped/csv/reader_test.go |  45 ++++++---
 go/libraries/events/emitter.go                |   2 +-
 ...loyees-tbl-schema-unordered.utf16bebom.csv | Bin 0 -> 344 bytes
 ...loyees-tbl-schema-unordered.utf16lebom.csv | Bin 0 -> 344 bytes
 ...employees-tbl-schema-unordered.utf8bom.csv |   4 +
 .../bats/helper/employees-tbl.utf16bebom.json | Bin 0 -> 1018 bytes
 .../bats/helper/employees-tbl.utf16lebom.json | Bin 0 -> 1018 bytes
 .../bats/helper/employees-tbl.utf8bom.json    |  28 ++++++
 .../bats/import-create-tables.bats            |  57 +++++++----
 .../bats/import-update-tables.bats            |  78 +++++++++++++++
 13 files changed, 279 insertions(+), 65 deletions(-)
 create mode 100644 integration-tests/bats/helper/employees-tbl-schema-unordered.utf16bebom.csv
 create mode 100644 integration-tests/bats/helper/employees-tbl-schema-unordered.utf16lebom.csv
 create mode 100644 integration-tests/bats/helper/employees-tbl-schema-unordered.utf8bom.csv
 create mode 100644 integration-tests/bats/helper/employees-tbl.utf16bebom.json
 create mode 100644 integration-tests/bats/helper/employees-tbl.utf16lebom.json
 create mode 100644 integration-tests/bats/helper/employees-tbl.utf8bom.json

diff --git a/go/libraries/doltcore/table/typed/json/reader.go b/go/libraries/doltcore/table/typed/json/reader.go
index b5b912d4e6..c938ae008d 100644
--- a/go/libraries/doltcore/table/typed/json/reader.go
+++ b/go/libraries/doltcore/table/typed/json/reader.go
@@ -22,6 +22,8 @@ import (
 
 	"github.com/bcicen/jstream"
 	"github.com/dolthub/go-mysql-server/sql"
+	"golang.org/x/text/encoding/unicode"
+	"golang.org/x/text/transform"
 
 	"github.com/dolthub/dolt/go/libraries/doltcore/row"
 	"github.com/dolthub/dolt/go/libraries/doltcore/schema"
@@ -52,12 +54,17 @@ func OpenJSONReader(vrw types.ValueReadWriter, path string, fs filesys.ReadableF
 	return NewJSONReader(vrw, r, sch)
 }
 
+// NewJSONReader creates a JSONReader from the supplied reader, whose bytes are
+// treated as UTF-8. If the first bytes read are a UTF8, UTF16LE or UTF16BE BOM,
+// the BOM is stripped and the remaining contents are treated as that encoding.
 func NewJSONReader(vrw types.ValueReadWriter, r io.ReadCloser, sch schema.Schema) (*JSONReader, error) {
 	if sch == nil {
 		return nil, errors.New("schema must be provided to JsonReader")
 	}
 
-	decoder := jstream.NewDecoder(r, 2) // extract JSON values at a depth level of 1
+	textReader := transform.NewReader(r, unicode.BOMOverride(unicode.UTF8.NewDecoder()))
+
+	decoder := jstream.NewDecoder(textReader, 2) // extract JSON values at a depth level of 1
 
 	return &JSONReader{vrw: vrw, closer: r, sch: sch, jsonStream: decoder}, nil
 }
diff --git a/go/libraries/doltcore/table/typed/json/reader_test.go b/go/libraries/doltcore/table/typed/json/reader_test.go
index 504ef50866..4fc9cd01f4 100644
--- a/go/libraries/doltcore/table/typed/json/reader_test.go
+++ b/go/libraries/doltcore/table/typed/json/reader_test.go
@@ -15,6 +15,7 @@
 package json
 
 import (
+	"bytes"
 	"context"
 	"io"
 	"os"
@@ -24,6 +25,8 @@ import (
 	"github.com/dolthub/go-mysql-server/sql"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
+	"golang.org/x/text/encoding/unicode"
+	"golang.org/x/text/transform"
 
 	"github.com/dolthub/dolt/go/libraries/doltcore/row"
 	"github.com/dolthub/dolt/go/libraries/doltcore/schema"
@@ -33,25 +36,7 @@ import (
 	"github.com/dolthub/dolt/go/store/types"
 )
 
-func TestReader(t *testing.T) {
-	testJSON := `{
-		"rows": [
-			 {
-			   "id": 0,
-			   "first name": "tim",
-			   "last name": "sehn"
-			},
-			{
-			   "id": 1,
-			   "first name": "brian",
-			   "last name": "hendriks"
-			}
-		]
-	}`
-
-	fs := filesys.EmptyInMemFS("/")
-	require.NoError(t, fs.WriteFile("file.json", []byte(testJSON), os.ModePerm))
-
+func testGoodJSON(t *testing.T, getReader func(types.ValueReadWriter, schema.Schema) (*JSONReader, error)) {
 	colColl := schema.NewColCollection(
 		schema.Column{
 			Name:       "id",
@@ -83,7 +68,7 @@ func TestReader(t *testing.T) {
 	require.NoError(t, err)
 
 	vrw := types.NewMemoryValueStore()
-	reader, err := OpenJSONReader(vrw, "file.json", fs, sch)
+	reader, err := getReader(vrw, sch)
 	require.NoError(t, err)
 
 	verifySchema, err := reader.VerifySchema(sch)
@@ -109,6 +94,75 @@ func TestReader(t *testing.T) {
 	assert.Equal(t, enginetest.WidenRows(sqlSch.Schema, expectedRows), rows)
 }
 
+func TestReader(t *testing.T) {
+	testJSON := `{
+		"rows": [
+			 {
+			   "id": 0,
+			   "first name": "tim",
+			   "last name": "sehn"
+			},
+			{
+			   "id": 1,
+			   "first name": "brian",
+			   "last name": "hendriks"
+			}
+		]
+	}`
+
+	fs := filesys.EmptyInMemFS("/")
+	require.NoError(t, fs.WriteFile("file.json", []byte(testJSON), os.ModePerm))
+
+	testGoodJSON(t, func(vrw types.ValueReadWriter, sch schema.Schema) (*JSONReader, error) {
+		return OpenJSONReader(vrw, "file.json", fs, sch)
+	})
+}
+
+func TestReaderBOMHandling(t *testing.T) {
+	testJSON := `{
+		"rows": [
+			 {
+			   "id": 0,
+			   "first name": "tim",
+			   "last name": "sehn"
+			},
+			{
+			   "id": 1,
+			   "first name": "brian",
+			   "last name": "hendriks"
+			}
+		]
+	}`
+	t.Run("UTF-8", func(t *testing.T) {
+		bs := bytes.NewBuffer([]byte(testJSON))
+		reader := transform.NewReader(bs, unicode.UTF8.NewEncoder())
+		testGoodJSON(t, func(vrw types.ValueReadWriter, sch schema.Schema) (*JSONReader, error) {
+			return NewJSONReader(vrw, io.NopCloser(reader), sch)
+		})
+	})
+	t.Run("UTF-8 BOM", func(t *testing.T) {
+		bs := bytes.NewBuffer([]byte(testJSON))
+		reader := transform.NewReader(bs, unicode.UTF8BOM.NewEncoder())
+		testGoodJSON(t, func(vrw types.ValueReadWriter, sch schema.Schema) (*JSONReader, error) {
+			return NewJSONReader(vrw, io.NopCloser(reader), sch)
+		})
+	})
+	t.Run("UTF-16 LE BOM", func(t *testing.T) {
+		bs := bytes.NewBuffer([]byte(testJSON))
+		reader := transform.NewReader(bs, unicode.UTF16(unicode.LittleEndian, unicode.UseBOM).NewEncoder())
+		testGoodJSON(t, func(vrw types.ValueReadWriter, sch schema.Schema) (*JSONReader, error) {
+			return NewJSONReader(vrw, io.NopCloser(reader), sch)
+		})
+	})
+	t.Run("UTF-16 BE BOM", func(t *testing.T) {
+		bs := bytes.NewBuffer([]byte(testJSON))
+		reader := transform.NewReader(bs, unicode.UTF16(unicode.BigEndian, unicode.UseBOM).NewEncoder())
+		testGoodJSON(t, func(vrw types.ValueReadWriter, sch schema.Schema) (*JSONReader, error) {
+			return NewJSONReader(vrw, io.NopCloser(reader), sch)
+		})
+	})
+}
+
 func TestReaderBadJson(t *testing.T) {
 	testJSON := ` {
    "rows": [
diff --git a/go/libraries/doltcore/table/untyped/csv/reader.go b/go/libraries/doltcore/table/untyped/csv/reader.go
index 74545e1fe2..7ae7f2519d 100644
--- a/go/libraries/doltcore/table/untyped/csv/reader.go
+++ b/go/libraries/doltcore/table/untyped/csv/reader.go
@@ -27,6 +27,8 @@ import (
 	"unicode/utf8"
 
 	"github.com/dolthub/go-mysql-server/sql"
+	textunicode "golang.org/x/text/encoding/unicode"
+	"golang.org/x/text/transform"
 
 	"github.com/dolthub/dolt/go/libraries/doltcore/row"
 	"github.com/dolthub/dolt/go/libraries/doltcore/schema"
@@ -73,6 +75,14 @@ func OpenCSVReader(nbf *types.NomsBinFormat, path string, fs filesys.ReadableFS,
 }
 
 // NewCSVReader creates a CSVReader from a given ReadCloser.  The CSVFileInfo should describe the csv file being read.
+//
+// The interpretation of the bytes of the supplied reader is a little murky. If
+// there is a UTF8, UTF16LE or UTF16BE BOM as the first bytes read, then the
+// BOM is stripped and the remaining contents of the reader are treated as that
+// encoding. If we are not in any of those marked encodings, then some of the
+// bytes go uninterpreted until we get to the SQL layer. It is currently the
+// case that newlines must be encoded as a '0xa' byte and the delimiter must
+// match |info.Delim|.
 func NewCSVReader(nbf *types.NomsBinFormat, r io.ReadCloser, info *CSVFileInfo) (*CSVReader, error) {
 	if len(info.Delim) < 1 {
 		return nil, fmt.Errorf("delimiter '%s' has invalid length", info.Delim)
@@ -81,7 +91,9 @@ func NewCSVReader(nbf *types.NomsBinFormat, r io.ReadCloser, info *CSVFileInfo)
 		return nil, fmt.Errorf("invalid delimiter: %s", string(info.Delim))
 	}
 
-	br := bufio.NewReaderSize(r, ReadBufSize)
+	textReader := transform.NewReader(r, textunicode.BOMOverride(transform.Nop))
+
+	br := bufio.NewReaderSize(textReader, ReadBufSize)
 	colStrs, err := getColHeaders(br, info)
 
 	if err != nil {
@@ -102,18 +114,6 @@ func NewCSVReader(nbf *types.NomsBinFormat, r io.ReadCloser, info *CSVFileInfo)
 	}, nil
 }
 
-// trimBOM checks if the given string has the Byte Order Mark, and removes it if it is
-// the BOM is there if the first 3 bytes are xEF\xBB\xBF and indicates that a file is in UTF-8 encoding
-func trimBOM(s string) string {
-	if len(s) < 3 {
-		return s
-	}
-	if s[0] == '\xEF' && s[1] == '\xBB' && s[2] == '\xBF' {
-		return s[3:]
-	}
-	return s
-}
-
 func getColHeaders(br *bufio.Reader, info *CSVFileInfo) ([]string, error) {
 	colStrs := info.Columns
 	if info.HasHeaderLine {
@@ -124,7 +124,6 @@ func getColHeaders(br *bufio.Reader, info *CSVFileInfo) ([]string, error) {
 		} else if strings.TrimSpace(line) == "" {
 			return nil, errors.New("Header line is empty")
 		}
-		line = trimBOM(line)
 		colStrsFromFile, err := csvSplitLine(line, info.Delim, info.EscapeQuotes)
 
 		if err != nil {
diff --git a/go/libraries/doltcore/table/untyped/csv/reader_test.go b/go/libraries/doltcore/table/untyped/csv/reader_test.go
index 01abc4ae20..a821fa03eb 100644
--- a/go/libraries/doltcore/table/untyped/csv/reader_test.go
+++ b/go/libraries/doltcore/table/untyped/csv/reader_test.go
@@ -20,6 +20,11 @@ import (
 	"strings"
 	"testing"
 
+	"github.com/stretchr/testify/require"
+	"golang.org/x/text/encoding"
+	"golang.org/x/text/encoding/unicode"
+	"golang.org/x/text/transform"
+
 	"github.com/dolthub/dolt/go/libraries/doltcore/row"
 	"github.com/dolthub/dolt/go/libraries/doltcore/table"
 	"github.com/dolthub/dolt/go/libraries/doltcore/table/untyped"
@@ -67,6 +72,13 @@ func mustRow(r row.Row, err error) row.Row {
 	return r
 }
 
+func mustEncodeBytes(t *testing.T, bs []byte, enc encoding.Encoding) []byte {
+	ret, n, err := transform.Bytes(enc.NewEncoder(), bs)
+	require.NoError(t, err)
+	require.Equal(t, n, len(bs))
+	return ret
+}
+
 func TestReader(t *testing.T) {
 	colNames := []string{"name", "age", "title"}
 	_, sch := untyped.NewUntypedSchema(colNames...)
@@ -82,33 +94,42 @@ func TestReader(t *testing.T) {
 		mustRow(untyped.NewRowFromStrings(types.Format_Default, sch, []string{"Jack Jackson", "27"})),
 	}
 
+	utf8bomBytes := mustEncodeBytes(t, []byte(PersonDB1), unicode.UTF8BOM)
+	require.Equal(t, utf8bomBytes[0:3], []byte{0xEF, 0xBB, 0xBF})
+	utf16leBytes := mustEncodeBytes(t, []byte(PersonDB1), unicode.UTF16(unicode.LittleEndian, unicode.UseBOM))
+	utf16beBytes := mustEncodeBytes(t, []byte(PersonDB1), unicode.UTF16(unicode.BigEndian, unicode.UseBOM))
+
 	tests := []struct {
-		inputStr     string
+		input        []byte
 		expectedRows []row.Row
 		info         *CSVFileInfo
 	}{
-		{PersonDB1, goodExpectedRows, NewCSVInfo()},
-		{PersonDB2, goodExpectedRows, NewCSVInfo()},
-		{PersonDB3, goodExpectedRows, NewCSVInfo()},
+		{[]byte(PersonDB1), goodExpectedRows, NewCSVInfo()},
+		{[]byte(PersonDB2), goodExpectedRows, NewCSVInfo()},
+		{[]byte(PersonDB3), goodExpectedRows, NewCSVInfo()},
 
-		{PersonDBWithBadRow, badExpectedRows, NewCSVInfo()},
-		{PersonDBWithBadRow2, badExpectedRows, NewCSVInfo()},
-		{PersonDBWithBadRow3, badExpectedRows, NewCSVInfo()},
+		{utf8bomBytes, goodExpectedRows, NewCSVInfo()},
+		{utf16leBytes, goodExpectedRows, NewCSVInfo()},
+		{utf16beBytes, goodExpectedRows, NewCSVInfo()},
+
+		{[]byte(PersonDBWithBadRow), badExpectedRows, NewCSVInfo()},
+		{[]byte(PersonDBWithBadRow2), badExpectedRows, NewCSVInfo()},
+		{[]byte(PersonDBWithBadRow3), badExpectedRows, NewCSVInfo()},
 
 		{
-			PersonDBWithoutHeaders,
+			[]byte(PersonDBWithoutHeaders),
 			goodExpectedRows,
 			NewCSVInfo().SetHasHeaderLine(false).SetColumns(colNames),
 		},
 		{
-			PersonDBDifferentHeaders,
+			[]byte(PersonDBDifferentHeaders),
 			goodExpectedRows,
 			NewCSVInfo().SetHasHeaderLine(true).SetColumns(colNames),
 		},
 	}
 
 	for _, test := range tests {
-		rows, numBad, err := readTestRows(t, test.inputStr, test.info)
+		rows, numBad, err := readTestRows(t, test.input, test.info)
 
 		if err != nil {
 			t.Fatal("Unexpected Error:", err)
@@ -136,11 +157,11 @@ func TestReader(t *testing.T) {
 	}
 }
 
-func readTestRows(t *testing.T, inputStr string, info *CSVFileInfo) ([]row.Row, int, error) {
+func readTestRows(t *testing.T, input []byte, info *CSVFileInfo) ([]row.Row, int, error) {
 	const root = "/"
 	const path = "/file.csv"
 
-	fs := filesys.NewInMemFS(nil, map[string][]byte{path: []byte(inputStr)}, root)
+	fs := filesys.NewInMemFS(nil, map[string][]byte{path: input}, root)
 	csvR, err := OpenCSVReader(types.Format_Default, path, fs, info)
 	defer csvR.Close(context.Background())
 
diff --git a/go/libraries/events/emitter.go b/go/libraries/events/emitter.go
index d8df06e976..1b78613fb8 100644
--- a/go/libraries/events/emitter.go
+++ b/go/libraries/events/emitter.go
@@ -32,7 +32,7 @@ import (
 	"github.com/dolthub/dolt/go/libraries/utils/iohelp"
 )
 
-// Application is the application ID used for all events emitted by this application. Other applications (not dolt) 
+// Application is the application ID used for all events emitted by this application. Other applications (not dolt)
 // should set this once at initialization.
 var Application = eventsapi.AppID_APP_DOLT
 
diff --git a/integration-tests/bats/helper/employees-tbl-schema-unordered.utf16bebom.csv b/integration-tests/bats/helper/employees-tbl-schema-unordered.utf16bebom.csv
new file mode 100644
index 0000000000000000000000000000000000000000..84a09ac7c1df2c42dc2661a8e5747a7e1a4ea6b4
GIT binary patch
literal 344
zcmb7=u@1s83`B26;vb^S48R`+NY#p#N^SXl;MoaC?Zy)OY~T5u*IS)#Iw)u%ThuE%
zn#q$U-atPQOFioWUe_9!lZ+AQCJlAACtp{C_S)&_8=|hvjfxd*qxl(r-I>`I_2>$x
u%4y&~-BHHDR^f}2Uotw|E{(V<$lRR&(z>vxL(iNPtrlnYCsx0s><&NHNihKc

literal 0
HcmV?d00001

diff --git a/integration-tests/bats/helper/employees-tbl-schema-unordered.utf16lebom.csv b/integration-tests/bats/helper/employees-tbl-schema-unordered.utf16lebom.csv
new file mode 100644
index 0000000000000000000000000000000000000000..f7aaf9b5a8dfd7e47865bd7a5b0342d820f85454
GIT binary patch
literal 344
zcmb7=u@1s83`B26;vb^S48R`+Xw`_8N-g|8@a&{W?Z%MAXZz0QeBN0%9TfCb=JaYD
z&BRF)cc7klOTN(qyw0^SCmAD9O&V%%PrS|s?X}a<Cq!N88x<?szUF86RcCrT)T1k$
vDyN12az_mgwh9YQzRBop+ce^;Aais6OY6d%jGj3uS{=^pPb_~&xj6g+1za%!

literal 0
HcmV?d00001

diff --git a/integration-tests/bats/helper/employees-tbl-schema-unordered.utf8bom.csv b/integration-tests/bats/helper/employees-tbl-schema-unordered.utf8bom.csv
new file mode 100644
index 0000000000..5f841761ff
--- /dev/null
+++ b/integration-tests/bats/helper/employees-tbl-schema-unordered.utf8bom.csv
@@ -0,0 +1,4 @@
+﻿id, title, start date, end date, first name, last name 
+0, "ceo", "", "", "tim", "sehn"
+1, "founder", "", "", "aaron", "son"
+2, "founder", "", "", "brian", "hendriks"
diff --git a/integration-tests/bats/helper/employees-tbl.utf16bebom.json b/integration-tests/bats/helper/employees-tbl.utf16bebom.json
new file mode 100644
index 0000000000000000000000000000000000000000..a05747183716ad0e7e78fe18ca74093b6aeab8f7
GIT binary patch
literal 1018
zcmd6mJr06E5QSfD;vEdNg<5(QV}Y=QjVOs=EDTpi-z=brvlu%<GsC?3`!ZWUaK{`o
zEcgYuAjb^_tu6iCD<aXiPP|5M5x3eEmLKXSp1`tZfs*<&LY!b)eZYT~exT-M)aqFU
zE2ONY=Sj^HS+7e(JNKxiL&*7Lx|mkk%hT%{c8*nb$58gJ20M+DTMyK~$%Ss}+)H7M
zo<HlGT|#nKx3M85oyas8ZvJL?;6|1leV62CnDR>G3M4<m(Y+S!_NT_PyU@LRr_~ce
Co^{Fq

literal 0
HcmV?d00001

diff --git a/integration-tests/bats/helper/employees-tbl.utf16lebom.json b/integration-tests/bats/helper/employees-tbl.utf16lebom.json
new file mode 100644
index 0000000000000000000000000000000000000000..8ffe50b963e6ef596377c23a09a7882a1ed8e985
GIT binary patch
literal 1018
zcmd6mJr06E5QSfD;vEdNg<5(QV}Y=QjVOs=EDTpi-z=brvlu%<GsC?3`!f5e@0eqT
z1-}3n<hY@rwWYs%MI`#xiPz{Y;#RxD@<ZLk6Ij+PP*Q(Jh!ae!5BSg057fMjT0N^^
zg_O1QJgHeC>vf4}=N`3m2sxij7t<<xd3t@r&atZQ7|Pz&V5f0%>w)?=xzJ6Wdnt_3
z^JjgtOGxhOHa4WB6PX6X&EE_U+{lun?~?osQ(lQ&f#gRxy4RxJ{?vGO7rJ-vw89g%
CYjw&1

literal 0
HcmV?d00001

diff --git a/integration-tests/bats/helper/employees-tbl.utf8bom.json b/integration-tests/bats/helper/employees-tbl.utf8bom.json
new file mode 100644
index 0000000000..a9fe2cd6cb
--- /dev/null
+++ b/integration-tests/bats/helper/employees-tbl.utf8bom.json
@@ -0,0 +1,28 @@
+﻿{
+  "rows": [
+    {
+      "id": 0,
+      "first name": "tim",
+      "last name": "sehn",
+      "title": "ceo",
+      "start date": "",
+      "end date": ""
+    },
+    {
+      "id": 1,
+      "first name": "aaron",
+      "last name": "son",
+      "title": "founder",
+      "start date": "",
+      "end date": ""
+    },
+    {
+      "id": 2,
+      "first name": "brian",
+      "last name": "hendricks",
+      "title": "founder",
+      "start date": "",
+      "end date": ""
+    }
+  ]
+}
\ No newline at end of file
diff --git a/integration-tests/bats/import-create-tables.bats b/integration-tests/bats/import-create-tables.bats
index 65fce7575d..5f885d7cfa 100755
--- a/integration-tests/bats/import-create-tables.bats
+++ b/integration-tests/bats/import-create-tables.bats
@@ -60,23 +60,6 @@ teardown() {
     teardown_common
 }
 
-@test "import-create-tables: correctly ignores byte order mark (BOM)" {
-    printf '\xEF\xBB\xBF' > bom.csv
-    cat <<DELIM >> bom.csv
-c1,c2
-1,2
-DELIM
-
-    run dolt table import -c bom bom.csv
-    [ "$status" -eq 0 ]
-    [[ "$output" =~ "Rows Processed: 1, Additions: 1, Modifications: 0, Had No Effect: 0" ]] || false
-    [[ "$output" =~ "Import completed successfully." ]] || false
-
-    run dolt sql -q "select c1 from bom"
-    [ "$status" -eq 0 ]
-    [[ "$output" =~ "1" ]] || false
-}
-
 @test "import-create-tables: create a table with json import" {
     run dolt table import -c -s `batshelper employees-sch.sql` employees `batshelper employees-tbl.json`
     [ "$status" -eq 0 ]
@@ -90,6 +73,46 @@ DELIM
     [ "${#lines[@]}" -eq 7 ]
 }
 
+@test "import-create-tables: create a table with json import, utf8 with bom" {
+    run dolt table import -c -s `batshelper employees-sch.sql` employees `batshelper employees-tbl.utf8bom.json`
+    echo "$output"
+    [ "$status" -eq 0 ]
+    [[ "$output" =~ "Import completed successfully." ]] || false
+    run dolt ls
+    [ "$status" -eq 0 ]
+    [[ "$output" =~ "employees" ]] || false
+    run dolt sql -q "select * from employees"
+    [ "$status" -eq 0 ]
+    [[ "$output" =~ "tim" ]] || false
+    [ "${#lines[@]}" -eq 7 ]
+}
+
+@test "import-create-tables: create a table with json import, utf16le with bom" {
+    run dolt table import -c -s `batshelper employees-sch.sql` employees `batshelper employees-tbl.utf16lebom.json`
+    [ "$status" -eq 0 ]
+    [[ "$output" =~ "Import completed successfully." ]] || false
+    run dolt ls
+    [ "$status" -eq 0 ]
+    [[ "$output" =~ "employees" ]] || false
+    run dolt sql -q "select * from employees"
+    [ "$status" -eq 0 ]
+    [[ "$output" =~ "tim" ]] || false
+    [ "${#lines[@]}" -eq 7 ]
+}
+
+@test "import-create-tables: create a table with json import, utf16be with bom" {
+    run dolt table import -c -s `batshelper employees-sch.sql` employees `batshelper employees-tbl.utf16bebom.json`
+    [ "$status" -eq 0 ]
+    [[ "$output" =~ "Import completed successfully." ]] || false
+    run dolt ls
+    [ "$status" -eq 0 ]
+    [[ "$output" =~ "employees" ]] || false
+    run dolt sql -q "select * from employees"
+    [ "$status" -eq 0 ]
+    [[ "$output" =~ "tim" ]] || false
+    [ "${#lines[@]}" -eq 7 ]
+}
+
 @test "import-create-tables: create a table with json import. no schema." {
     run dolt table import -c employees `batshelper employees-tbl.json`
     [ "$status" -ne 0 ]
diff --git a/integration-tests/bats/import-update-tables.bats b/integration-tests/bats/import-update-tables.bats
index 8b4c4ce967..8888c3ceb6 100644
--- a/integration-tests/bats/import-update-tables.bats
+++ b/integration-tests/bats/import-update-tables.bats
@@ -270,6 +270,84 @@ SQL
     [[ "${lines[6]}" =~ "end date" ]]   || false
 }
 
+@test "import-update-tables: update table with a csv with columns in different order, utf8 with bom" {
+    dolt sql <<SQL
+CREATE TABLE employees (
+  \`id\` varchar(20) NOT NULL COMMENT 'tag:0',
+  \`first name\` LONGTEXT COMMENT 'tag:1',
+  \`last name\` LONGTEXT COMMENT 'tag:2',
+  \`title\` LONGTEXT COMMENT 'tag:3',
+  \`start date\` LONGTEXT COMMENT 'tag:4',
+  \`end date\` LONGTEXT COMMENT 'tag:5',
+  PRIMARY KEY (id)
+);
+SQL
+    run dolt table import -u employees `batshelper employees-tbl-schema-unordered.utf8bom.csv`
+    [ "$status" -eq 0 ]
+    [[ "$output" =~ "Rows Processed: 3, Additions: 3, Modifications: 0, Had No Effect: 0" ]] || false
+    [[ "$output" =~ "Import completed successfully." ]] || false
+    run dolt schema export employees
+    [[ "$status" -eq 0 ]] || false
+    [[ "${lines[1]}" =~ "id" ]]         || false
+    [[ "${lines[2]}" =~ "first name" ]] || false
+    [[ "${lines[3]}" =~ "last name" ]]  || false
+    [[ "${lines[4]}" =~ "title" ]]      || false
+    [[ "${lines[5]}" =~ "start date" ]] || false
+    [[ "${lines[6]}" =~ "end date" ]]   || false
+}
+
+@test "import-update-tables: update table with a csv with columns in different order, utf16le with bom" {
+    dolt sql <<SQL
+CREATE TABLE employees (
+  \`id\` varchar(20) NOT NULL COMMENT 'tag:0',
+  \`first name\` LONGTEXT COMMENT 'tag:1',
+  \`last name\` LONGTEXT COMMENT 'tag:2',
+  \`title\` LONGTEXT COMMENT 'tag:3',
+  \`start date\` LONGTEXT COMMENT 'tag:4',
+  \`end date\` LONGTEXT COMMENT 'tag:5',
+  PRIMARY KEY (id)
+);
+SQL
+    run dolt table import -u employees `batshelper employees-tbl-schema-unordered.utf16lebom.csv`
+    [ "$status" -eq 0 ]
+    [[ "$output" =~ "Rows Processed: 3, Additions: 3, Modifications: 0, Had No Effect: 0" ]] || false
+    [[ "$output" =~ "Import completed successfully." ]] || false
+    run dolt schema export employees
+    [[ "$status" -eq 0 ]] || false
+    [[ "${lines[1]}" =~ "id" ]]         || false
+    [[ "${lines[2]}" =~ "first name" ]] || false
+    [[ "${lines[3]}" =~ "last name" ]]  || false
+    [[ "${lines[4]}" =~ "title" ]]      || false
+    [[ "${lines[5]}" =~ "start date" ]] || false
+    [[ "${lines[6]}" =~ "end date" ]]   || false
+}
+
+@test "import-update-tables: update table with a csv with columns in different order, utf16be with bom" {
+    dolt sql <<SQL
+CREATE TABLE employees (
+  \`id\` varchar(20) NOT NULL COMMENT 'tag:0',
+  \`first name\` LONGTEXT COMMENT 'tag:1',
+  \`last name\` LONGTEXT COMMENT 'tag:2',
+  \`title\` LONGTEXT COMMENT 'tag:3',
+  \`start date\` LONGTEXT COMMENT 'tag:4',
+  \`end date\` LONGTEXT COMMENT 'tag:5',
+  PRIMARY KEY (id)
+);
+SQL
+    run dolt table import -u employees `batshelper employees-tbl-schema-unordered.utf16bebom.csv`
+    [ "$status" -eq 0 ]
+    [[ "$output" =~ "Rows Processed: 3, Additions: 3, Modifications: 0, Had No Effect: 0" ]] || false
+    [[ "$output" =~ "Import completed successfully." ]] || false
+    run dolt schema export employees
+    [[ "$status" -eq 0 ]] || false
+    [[ "${lines[1]}" =~ "id" ]]         || false
+    [[ "${lines[2]}" =~ "first name" ]] || false
+    [[ "${lines[3]}" =~ "last name" ]]  || false
+    [[ "${lines[4]}" =~ "title" ]]      || false
+    [[ "${lines[5]}" =~ "start date" ]] || false
+    [[ "${lines[6]}" =~ "end date" ]]   || false
+}
+
 @test "import-update-tables: updating table by inputting string longer than char column throws an error" {
     cat <<DELIM > 1pk1col-rpt-chars.csv
 pk,c