Support more field separators and quote chars in the CSV import

This adds support for more characters as field separators and quote characters in the CSV import. Previously only ASCII characters were supported; with this change, all characters whose UTF-8 encoding requires at most 16 bits (two bytes) are supported. See issue #1860.
2026-05-18 03:29:25 -05:00 · 2019-05-07 23:13:03 +02:00
parent cec6b82561
commit afee3ca79e
4 changed files with 124 additions and 73 deletions
@@ -58,8 +58,8 @@ ImportCsvDialog::ImportCsvDialog(const QStringList &filenames, DBBrowserDB* db,
    ui->checkboxHeader->setChecked(Settings::getValue("importcsv", "firstrowheader").toBool());
    ui->checkBoxTrimFields->setChecked(Settings::getValue("importcsv", "trimfields").toBool());
    ui->checkBoxSeparateTables->setChecked(Settings::getValue("importcsv", "separatetables").toBool());
-    setSeparatorChar(Settings::getValue("importcsv", "separator").toInt());
-    setQuoteChar(Settings::getValue("importcsv", "quotecharacter").toInt());
+    setSeparatorChar(static_cast<char32_t>(Settings::getValue("importcsv", "separator").toInt()));
+    setQuoteChar(static_cast<char32_t>(Settings::getValue("importcsv", "quotecharacter").toInt()));
    setEncoding(Settings::getValue("importcsv", "encoding").toString());

    ui->checkboxHeader->blockSignals(false);
@@ -704,14 +704,14 @@ bool ImportCsvDialog::importCsv(const QString& fileName, const QString& name)
    return true;
 }

-void ImportCsvDialog::setQuoteChar(const QChar& c)
+void ImportCsvDialog::setQuoteChar(char32_t c)
 {
    QComboBox* combo = ui->comboQuote;
-    int index = combo->findText(c);
+    int index = combo->findText(QString(c));
    if(index == -1)
    {
        combo->setCurrentIndex(combo->count() - 1);
-        ui->editCustomQuote->setText(c);
+        ui->editCustomQuote->setText(QString(c));
    }
    else
    {
@@ -719,19 +719,21 @@ void ImportCsvDialog::setQuoteChar(const QChar& c)
    }
 }

-char ImportCsvDialog::currentQuoteChar() const
+char32_t ImportCsvDialog::currentQuoteChar() const
 {
+    QString value;
+
    // The last item in the combobox is the 'Other' item; if it is selected return the text of the line edit field instead
    if(ui->comboQuote->currentIndex() == ui->comboQuote->count()-1)
-        return ui->editCustomQuote->text().length() ? ui->editCustomQuote->text().at(0).toLatin1() : 0;
+        value = ui->editCustomQuote->text().length() ? ui->editCustomQuote->text() : "";

    if(ui->comboQuote->currentText().length())
-        return ui->comboQuote->currentText().at(0).toLatin1();
-    else
-        return 0;
+        value = ui->comboQuote->currentText();
+
+    return toUtf8(value);
 }

-void ImportCsvDialog::setSeparatorChar(const QChar& c)
+void ImportCsvDialog::setSeparatorChar(char32_t c)
 {
    QComboBox* combo = ui->comboSeparator;
    QString sText = c == '\t' ? QString("Tab") : QString(c);
@@ -739,7 +741,7 @@ void ImportCsvDialog::setSeparatorChar(const QChar& c)
    if(index == -1)
    {
        combo->setCurrentIndex(combo->count() - 1);
-        ui->editCustomSeparator->setText(c);
+        ui->editCustomSeparator->setText(QString(c));
    }
    else
    {
@@ -747,13 +749,17 @@ void ImportCsvDialog::setSeparatorChar(const QChar& c)
    }
 }

-char ImportCsvDialog::currentSeparatorChar() const
+char32_t ImportCsvDialog::currentSeparatorChar() const
 {
-    // The last item in the combobox is the 'Other' item; if it is selected return the text of the line edit field instead
-    if(ui->comboSeparator->currentIndex() == ui->comboSeparator->count()-1)
-        return ui->editCustomSeparator->text().length() ? ui->editCustomSeparator->text().at(0).toLatin1() : 0;
+    QString value;

-    return ui->comboSeparator->currentText() == tr("Tab") ? '\t' : ui->comboSeparator->currentText().at(0).toLatin1();
+    // The last item in the combobox is the 'Other' item; if it is selected return the text of the line edit field instead
+    if(ui->comboSeparator->currentIndex() == ui->comboSeparator->count()-1 || ui->comboSeparator->currentText().isEmpty())
+        value = ui->editCustomSeparator->text().length() ? ui->editCustomSeparator->text() : "";
+    else
+        value = ui->comboSeparator->currentText() == tr("Tab") ? "\t" : ui->comboSeparator->currentText();
+
+    return toUtf8(value);
 }

 void ImportCsvDialog::setEncoding(const QString& sEnc)
@@ -804,3 +810,17 @@ void ImportCsvDialog::toggleAdvancedSection(bool show)
    ui->labelOnConflictStrategy->setVisible(show);
    ui->comboOnConflictStrategy->setVisible(show);
 }
+
+char32_t ImportCsvDialog::toUtf8(const QString& s) const
+{
+    if(s.isEmpty())
+        return 0;
+
+    QByteArray ba = s.toUtf8();
+
+    char32_t result = 0;
+    for(int i=std::min(ba.size()-1,3);i>=0;i--)
+        result = (result << 8) + static_cast<unsigned char>(ba.at(i));
+
+    return result;
+}
@@ -45,16 +45,18 @@ private:

    bool importCsv(const QString& f, const QString& n = QString());

-    void setQuoteChar(const QChar& c);
-    char currentQuoteChar() const;
+    void setQuoteChar(char32_t c);
+    char32_t currentQuoteChar() const;

-    void setSeparatorChar(const QChar& c);
-    char currentSeparatorChar() const;
+    void setSeparatorChar(char32_t c);
+    char32_t currentSeparatorChar() const;

    void setEncoding(const QString& sEnc);
    QString currentEncoding() const;

    QString currentOnConflictStrategy() const;
+
+    char32_t toUtf8(const QString& s) const;
 };

 #endif
@@ -2,13 +2,23 @@

 #include <QTextStream>

-CSVParser::CSVParser(bool trimfields, char fieldseparator, char quotechar)
+CSVParser::CSVParser(bool trimfields, char32_t fieldseparator, char32_t quotechar)
    : m_bTrimFields(trimfields)
-    , m_cFieldSeparator(fieldseparator)
-    , m_cQuoteChar(quotechar)
+    , m_iNumExtraBytesFieldSeparator(0)
+    , m_iNumExtraBytesQuoteChar(0)
    , m_pCSVProgress(nullptr)
    , m_nBufferSize(4096)
 {
+    for(int i=0;i<4;i++)
+    {
+        m_cFieldSeparator[i] = static_cast<char>((fieldseparator >> i*8) & 0xFF);
+        m_cQuoteChar[i] = static_cast<char>((quotechar >> i*8) & 0xFF);
+
+        if(i && m_cFieldSeparator[i])
+            m_iNumExtraBytesFieldSeparator = i;
+        if(i && m_cQuoteChar[i])
+            m_iNumExtraBytesQuoteChar = i;
+    }
 }

 CSVParser::~CSVParser()
@@ -156,38 +166,26 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
            {
            case StateNormal:
            {
-                if(c == m_cFieldSeparator)
+                if(c == m_cFieldSeparator[0])
                {
-                    field = addColumn(record, field, m_bTrimFields);
+                    if(!m_iNumExtraBytesFieldSeparator || look_ahead(stream, sBuffer, &it, &sBufferEnd, m_cFieldSeparator[1]))
+                    {
+                        field = addColumn(record, field, m_bTrimFields);
+                        it += m_iNumExtraBytesFieldSeparator;
+                    }
                }
-                else if(c == m_cQuoteChar)
+                else if(c == m_cQuoteChar[0])
                {
-                    state = StateInQuote;
+                    if(!m_iNumExtraBytesQuoteChar || look_ahead(stream, sBuffer, &it, &sBufferEnd, m_cQuoteChar[1]))
+                    {
+                        state = StateInQuote;
+                        it += m_iNumExtraBytesQuoteChar;
+                    }
                }
                else if(c == '\r')
                {
                    // look ahead to check for linefeed
-                    auto nit = it + 1;
-
-                    // In order to check what the next byte is we must make sure that that byte is already loaded. Assume we're at an m_nBufferSize
-                    // boundary but not at the end of the file when we hit a \r character. Now we're going to be at the end of the sBuffer string
-                    // because of the m_nBufferSize boundary. But this means that the following check won't work properly because we can't check the
-                    // next byte when we really should be able to do so because there's more data coming. To fix this we'll check for this particular
-                    // case and, if this is what's happening, we'll just load an extra byte.
-                    if(nit == sBufferEnd && !stream.atEnd())
-                    {
-                        // Load one more byte
-                        sBuffer.append(stream.read(1));
-                        sBufferEnd = sBuffer.constEnd();
-
-                        // Restore both iterators. sBufferEnd points to the imagined char after the last one in the string. So the extra byte we've
-                        // just loaded is the one before that, i.e. the actual last one, and the original last char is the one before that.
-                        it = sBufferEnd - 2;
-                        nit = sBufferEnd - 1;
-                    }
-
-                    // no linefeed, so assume that CR represents a newline
-                    if(nit != sBufferEnd && *nit != '\n')
+                    if(!look_ahead(stream, sBuffer, &it, &sBufferEnd, '\n'))
                    {
                        addColumn(record, field, m_bTrimFields);

@@ -210,9 +208,13 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
            break;
            case StateInQuote:
            {
-                if(c == m_cQuoteChar)
+                if(c == m_cQuoteChar[0])
                {
-                    state = StateEndQuote;
+                    if(!m_iNumExtraBytesQuoteChar || look_ahead(stream, sBuffer, &it, &sBufferEnd, m_cQuoteChar[1]))
+                    {
+                        state = StateEndQuote;
+                        it += m_iNumExtraBytesQuoteChar;
+                    }
                }
                else
                {
@@ -222,15 +224,23 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
            break;
            case StateEndQuote:
            {
-                if(c == m_cQuoteChar)
+                if(c == m_cQuoteChar[0])
                {
-                    state = StateInQuote;
-                    addChar(field, c);
+                    if(!m_iNumExtraBytesQuoteChar || look_ahead(stream, sBuffer, &it, &sBufferEnd, m_cQuoteChar[1]))
+                    {
+                        state = StateInQuote;
+                        addChar(field, c);
+                        it += m_iNumExtraBytesQuoteChar;
+                    }
                }
-                else if(c == m_cFieldSeparator)
+                else if(c == m_cFieldSeparator[0])
                {
-                    state = StateNormal;
-                    field = addColumn(record, field, m_bTrimFields);
+                    if(!m_iNumExtraBytesFieldSeparator || look_ahead(stream, sBuffer, &it, &sBufferEnd, m_cFieldSeparator[1]))
+                    {
+                        state = StateNormal;
+                        field = addColumn(record, field, m_bTrimFields);
+                        it += m_iNumExtraBytesFieldSeparator;
+                    }
                }
                else if(c == '\n')
                {
@@ -243,19 +253,7 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
                else if(c == '\r')
                {
                    // look ahead to check for linefeed
-                    auto nit = it + 1;
-
-                    // See above for details on this.
-                    if(nit == sBufferEnd && !stream.atEnd())
-                    {
-                        sBuffer.append(stream.read(1));
-                        sBufferEnd = sBuffer.constEnd();
-                        it = sBufferEnd - 2;
-                        nit = sBufferEnd - 1;
-                    }
-
-                    // no linefeed, so assume that CR represents a newline
-                    if(nit != sBufferEnd && *nit != '\n')
+                    if(!look_ahead(stream, sBuffer, &it, &sBufferEnd, '\n'))
                    {
                        addColumn(record, field, m_bTrimFields);

@@ -296,3 +294,29 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr

    return (state == StateNormal) ? ParserResult::ParserResultSuccess : ParserResult::ParserResultError;
 }
+
+bool CSVParser::look_ahead(QTextStream& stream, QByteArray& sBuffer, const char** it, const char** sBufferEnd, char expected)
+{
+    // Returns true if there is a byte directly after *it and it equals 'expected'. nit points at that following byte.
+    auto nit = *it + 1;
+
+    // In order to check what the next byte is we must make sure that that byte is already loaded. Assume *it is the last byte of
+    // sBuffer (for example because we are at an m_nBufferSize boundary) but the stream is not at its end yet. Then nit equals
+    // *sBufferEnd and the check at the bottom would report that there is no next byte, even though more data is coming. This applies
+    // to every caller, whether it looks for the '\n' after a '\r' or for the second byte of a multi-byte field separator or quote
+    // character. To fix this we'll check for this particular case and, if this is what's happening, we'll just load an extra byte.
+    if(nit == *sBufferEnd && !stream.atEnd())
+    {
+        // Load one more byte and update the caller's end pointer so that it refers to the end of the grown buffer
+        sBuffer.append(stream.read(1));
+        *sBufferEnd = sBuffer.constEnd();
+
+        // Recompute both iterators relative to the new end. *sBufferEnd points to the imagined char after the last one in the string, so the
+        // extra byte is the one before that and the original last char the one before that. NOTE(review): assumes append added exactly one byte - confirm.
+        *it = *sBufferEnd - 2;
+        nit = *sBufferEnd - 1;
+    }
+
+    // Report a match only if there actually is one more byte and it is the expected one
+    return nit != *sBufferEnd && *nit == expected;
+}
@@ -5,6 +5,7 @@
 #include <cstdint>
 #include <cstddef>

+class QByteArray;
 class QTextStream;

 /*!
@@ -52,7 +53,7 @@ class CSVParser
 public:
    using csvRowFunction = std::function<bool(size_t, CSVRow)>;

-    CSVParser(bool trimfields = true, char fieldseparator = ',', char quotechar = '"');
+    CSVParser(bool trimfields = true, char32_t fieldseparator = ',', char32_t quotechar = '"');
    ~CSVParser();

    enum ParserResult
@@ -84,11 +85,15 @@ private:

 private:
    bool m_bTrimFields;
-    char m_cFieldSeparator;
-    char m_cQuoteChar;
+    char m_cFieldSeparator[4];
+    char m_cQuoteChar[4];
+    int m_iNumExtraBytesFieldSeparator;
+    int m_iNumExtraBytesQuoteChar;
    CSVProgress* m_pCSVProgress;

    int64_t m_nBufferSize;        //! internal buffer read size
+
+    bool look_ahead(QTextStream& stream, QByteArray& sBuffer, const char** it, const char** sBufferEnd, char expected);
 };

 #endif