Support more field separators and quote chars in the CSV import

This adds support for more characters as field separators and quote
characters in the CSV import. Before this change we only supported ASCII
characters; with it we support all characters whose UTF-8 encoding
requires up to 16 bits (i.e. at most two bytes).

See issue #1860.
This commit is contained in:
Martin Kleusberg
2019-05-07 23:13:03 +02:00
parent cec6b82561
commit afee3ca79e
4 changed files with 124 additions and 73 deletions
+37 -17
View File
@@ -58,8 +58,8 @@ ImportCsvDialog::ImportCsvDialog(const QStringList &filenames, DBBrowserDB* db,
ui->checkboxHeader->setChecked(Settings::getValue("importcsv", "firstrowheader").toBool());
ui->checkBoxTrimFields->setChecked(Settings::getValue("importcsv", "trimfields").toBool());
ui->checkBoxSeparateTables->setChecked(Settings::getValue("importcsv", "separatetables").toBool());
setSeparatorChar(Settings::getValue("importcsv", "separator").toInt());
setQuoteChar(Settings::getValue("importcsv", "quotecharacter").toInt());
setSeparatorChar(static_cast<char32_t>(Settings::getValue("importcsv", "separator").toInt()));
setQuoteChar(static_cast<char32_t>(Settings::getValue("importcsv", "quotecharacter").toInt()));
setEncoding(Settings::getValue("importcsv", "encoding").toString());
ui->checkboxHeader->blockSignals(false);
@@ -704,14 +704,14 @@ bool ImportCsvDialog::importCsv(const QString& fileName, const QString& name)
return true;
}
void ImportCsvDialog::setQuoteChar(const QChar& c)
void ImportCsvDialog::setQuoteChar(char32_t c)
{
QComboBox* combo = ui->comboQuote;
int index = combo->findText(c);
int index = combo->findText(QString(c));
if(index == -1)
{
combo->setCurrentIndex(combo->count() - 1);
ui->editCustomQuote->setText(c);
ui->editCustomQuote->setText(QString(c));
}
else
{
@@ -719,19 +719,21 @@ void ImportCsvDialog::setQuoteChar(const QChar& c)
}
}
char ImportCsvDialog::currentQuoteChar() const
char32_t ImportCsvDialog::currentQuoteChar() const
{
QString value;
// The last item in the combobox is the 'Other' item; if it is selected return the text of the line edit field instead
if(ui->comboQuote->currentIndex() == ui->comboQuote->count()-1)
return ui->editCustomQuote->text().length() ? ui->editCustomQuote->text().at(0).toLatin1() : 0;
value = ui->editCustomQuote->text().length() ? ui->editCustomQuote->text() : "";
if(ui->comboQuote->currentText().length())
return ui->comboQuote->currentText().at(0).toLatin1();
else
return 0;
value = ui->comboQuote->currentText();
return toUtf8(value);
}
void ImportCsvDialog::setSeparatorChar(const QChar& c)
void ImportCsvDialog::setSeparatorChar(char32_t c)
{
QComboBox* combo = ui->comboSeparator;
QString sText = c == '\t' ? QString("Tab") : QString(c);
@@ -739,7 +741,7 @@ void ImportCsvDialog::setSeparatorChar(const QChar& c)
if(index == -1)
{
combo->setCurrentIndex(combo->count() - 1);
ui->editCustomSeparator->setText(c);
ui->editCustomSeparator->setText(QString(c));
}
else
{
@@ -747,13 +749,17 @@ void ImportCsvDialog::setSeparatorChar(const QChar& c)
}
}
char ImportCsvDialog::currentSeparatorChar() const
char32_t ImportCsvDialog::currentSeparatorChar() const
{
// The last item in the combobox is the 'Other' item; if it is selected return the text of the line edit field instead
if(ui->comboSeparator->currentIndex() == ui->comboSeparator->count()-1)
return ui->editCustomSeparator->text().length() ? ui->editCustomSeparator->text().at(0).toLatin1() : 0;
QString value;
return ui->comboSeparator->currentText() == tr("Tab") ? '\t' : ui->comboSeparator->currentText().at(0).toLatin1();
// The last item in the combobox is the 'Other' item; if it is selected return the text of the line edit field instead
if(ui->comboSeparator->currentIndex() == ui->comboSeparator->count()-1 || ui->comboSeparator->currentText().isEmpty())
value = ui->editCustomSeparator->text().length() ? ui->editCustomSeparator->text() : "";
else
value = ui->comboSeparator->currentText() == tr("Tab") ? "\t" : ui->comboSeparator->currentText();
return toUtf8(value);
}
void ImportCsvDialog::setEncoding(const QString& sEnc)
@@ -804,3 +810,17 @@ void ImportCsvDialog::toggleAdvancedSection(bool show)
ui->labelOnConflictStrategy->setVisible(show);
ui->comboOnConflictStrategy->setVisible(show);
}
char32_t ImportCsvDialog::toUtf8(const QString& s) const
{
if(s.isEmpty())
return 0;
QByteArray ba = s.toUtf8();
char32_t result = 0;
for(int i=std::min(ba.size()-1,3);i>=0;i--)
result = (result << 8) + static_cast<unsigned char>(ba.at(i));
return result;
}
+6 -4
View File
@@ -45,16 +45,18 @@ private:
bool importCsv(const QString& f, const QString& n = QString());
void setQuoteChar(const QChar& c);
char currentQuoteChar() const;
void setQuoteChar(char32_t c);
char32_t currentQuoteChar() const;
void setSeparatorChar(const QChar& c);
char currentSeparatorChar() const;
void setSeparatorChar(char32_t c);
char32_t currentSeparatorChar() const;
void setEncoding(const QString& sEnc);
QString currentEncoding() const;
QString currentOnConflictStrategy() const;
char32_t toUtf8(const QString& s) const;
};
#endif
+73 -49
View File
@@ -2,13 +2,23 @@
#include <QTextStream>
CSVParser::CSVParser(bool trimfields, char fieldseparator, char quotechar)
CSVParser::CSVParser(bool trimfields, char32_t fieldseparator, char32_t quotechar)
: m_bTrimFields(trimfields)
, m_cFieldSeparator(fieldseparator)
, m_cQuoteChar(quotechar)
, m_iNumExtraBytesFieldSeparator(0)
, m_iNumExtraBytesQuoteChar(0)
, m_pCSVProgress(nullptr)
, m_nBufferSize(4096)
{
for(int i=0;i<4;i++)
{
m_cFieldSeparator[i] = static_cast<char>((fieldseparator >> i*8) & 0xFF);
m_cQuoteChar[i] = static_cast<char>((quotechar >> i*8) & 0xFF);
if(i && m_cFieldSeparator[i])
m_iNumExtraBytesFieldSeparator = i;
if(i && m_cQuoteChar[i])
m_iNumExtraBytesQuoteChar = i;
}
}
CSVParser::~CSVParser()
@@ -156,38 +166,26 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
{
case StateNormal:
{
if(c == m_cFieldSeparator)
if(c == m_cFieldSeparator[0])
{
field = addColumn(record, field, m_bTrimFields);
if(!m_iNumExtraBytesFieldSeparator || look_ahead(stream, sBuffer, &it, &sBufferEnd, m_cFieldSeparator[1]))
{
field = addColumn(record, field, m_bTrimFields);
it += m_iNumExtraBytesFieldSeparator;
}
}
else if(c == m_cQuoteChar)
else if(c == m_cQuoteChar[0])
{
state = StateInQuote;
if(!m_iNumExtraBytesQuoteChar || look_ahead(stream, sBuffer, &it, &sBufferEnd, m_cQuoteChar[1]))
{
state = StateInQuote;
it += m_iNumExtraBytesQuoteChar;
}
}
else if(c == '\r')
{
// look ahead to check for linefeed
auto nit = it + 1;
// In order to check what the next byte is we must make sure that that byte is already loaded. Assume we're at an m_nBufferSize
// boundary but not at the end of the file when we hit a \r character. Now we're going to be at the end of the sBuffer string
// because of the m_nBufferSize boundary. But this means that the following check won't work properly because we can't check the
// next byte when we really should be able to do so because there's more data coming. To fix this we'll check for this particular
// case and, if this is what's happening, we'll just load an extra byte.
if(nit == sBufferEnd && !stream.atEnd())
{
// Load one more byte
sBuffer.append(stream.read(1));
sBufferEnd = sBuffer.constEnd();
// Restore both iterators. sBufferEnd points to the imagined char after the last one in the string. So the extra byte we've
// just loaded is the one before that, i.e. the actual last one, and the original last char is the one before that.
it = sBufferEnd - 2;
nit = sBufferEnd - 1;
}
// no linefeed, so assume that CR represents a newline
if(nit != sBufferEnd && *nit != '\n')
if(!look_ahead(stream, sBuffer, &it, &sBufferEnd, '\n'))
{
addColumn(record, field, m_bTrimFields);
@@ -210,9 +208,13 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
break;
case StateInQuote:
{
if(c == m_cQuoteChar)
if(c == m_cQuoteChar[0])
{
state = StateEndQuote;
if(!m_iNumExtraBytesQuoteChar || look_ahead(stream, sBuffer, &it, &sBufferEnd, m_cQuoteChar[1]))
{
state = StateEndQuote;
it += m_iNumExtraBytesQuoteChar;
}
}
else
{
@@ -222,15 +224,23 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
break;
case StateEndQuote:
{
if(c == m_cQuoteChar)
if(c == m_cQuoteChar[0])
{
state = StateInQuote;
addChar(field, c);
if(!m_iNumExtraBytesQuoteChar || look_ahead(stream, sBuffer, &it, &sBufferEnd, m_cQuoteChar[1]))
{
state = StateInQuote;
addChar(field, c);
it += m_iNumExtraBytesQuoteChar;
}
}
else if(c == m_cFieldSeparator)
else if(c == m_cFieldSeparator[0])
{
state = StateNormal;
field = addColumn(record, field, m_bTrimFields);
if(!m_iNumExtraBytesFieldSeparator || look_ahead(stream, sBuffer, &it, &sBufferEnd, m_cFieldSeparator[1]))
{
state = StateNormal;
field = addColumn(record, field, m_bTrimFields);
it += m_iNumExtraBytesFieldSeparator;
}
}
else if(c == '\n')
{
@@ -243,19 +253,7 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
else if(c == '\r')
{
// look ahead to check for linefeed
auto nit = it + 1;
// See above for details on this.
if(nit == sBufferEnd && !stream.atEnd())
{
sBuffer.append(stream.read(1));
sBufferEnd = sBuffer.constEnd();
it = sBufferEnd - 2;
nit = sBufferEnd - 1;
}
// no linefeed, so assume that CR represents a newline
if(nit != sBufferEnd && *nit != '\n')
if(!look_ahead(stream, sBuffer, &it, &sBufferEnd, '\n'))
{
addColumn(record, field, m_bTrimFields);
@@ -296,3 +294,29 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
return (state == StateNormal) ? ParserResult::ParserResultSuccess : ParserResult::ParserResultError;
}
/*!
 * Checks whether the byte following the current position is \a expected.
 *
 * If the current position is the last byte of \a sBuffer and \a stream has
 * more data, one more character is read from the stream and appended to the
 * buffer first. Appending may change where the buffer's data lives, so both
 * \a it and \a sBufferEnd are passed by pointer and are updated to point into
 * the buffer again.
 *
 * \param stream      Stream to read further data from if needed.
 * \param sBuffer     Buffer that is currently being parsed; may be extended.
 * \param it          In/out: pointer to the current byte in \a sBuffer.
 * \param sBufferEnd  In/out: pointer one past the last byte of \a sBuffer.
 * \param expected    The byte value to compare the next byte against.
 * \return true if there is a next byte and it equals \a expected, false otherwise.
 */
bool CSVParser::look_ahead(QTextStream& stream, QByteArray& sBuffer, const char** it, const char** sBufferEnd, char expected)
{
    // look ahead for next byte
    auto nit = *it + 1;

    // In order to check what the next byte is we must make sure that that byte is already loaded. Assume we're at an m_nBufferSize
    // boundary but not at the end of the file when we hit the character we're looking at. Now we're going to be at the end of the
    // sBuffer string because of the m_nBufferSize boundary. But this means that the following check won't work properly because we
    // can't check the next byte when we really should be able to do so because there's more data coming. To fix this we'll check for
    // this particular case and, if this is what's happening, we'll just load some extra data.
    if(nit == *sBufferEnd && !stream.atEnd())
    {
        // Load one more character. The stream hands out characters, not bytes, so the number of bytes which end up in the
        // buffer is not necessarily one: a character can be appended as several bytes, and a failed read appends none.
        const auto oldSize = sBuffer.size();
        sBuffer.append(stream.read(1));
        const auto numAdded = sBuffer.size() - oldSize;

        // Restore both iterators. sBufferEnd points to the imagined byte after the last one in the buffer. The byte we were
        // looking at was the last one before appending, so it now sits right in front of the newly added bytes; the byte to
        // inspect is the first of the newly added ones (or the end of the buffer if nothing was added).
        *sBufferEnd = sBuffer.constEnd();
        *it = *sBufferEnd - numAdded - 1;
        nit = *it + 1;
    }

    // Check whether there actually is one more byte and it is the expected one
    return nit != *sBufferEnd && *nit == expected;
}
+8 -3
View File
@@ -5,6 +5,7 @@
#include <cstdint>
#include <cstddef>
class QByteArray;
class QTextStream;
/*!
@@ -52,7 +53,7 @@ class CSVParser
public:
using csvRowFunction = std::function<bool(size_t, CSVRow)>;
CSVParser(bool trimfields = true, char fieldseparator = ',', char quotechar = '"');
CSVParser(bool trimfields = true, char32_t fieldseparator = ',', char32_t quotechar = '"');
~CSVParser();
enum ParserResult
@@ -84,11 +85,15 @@ private:
private:
bool m_bTrimFields;
char m_cFieldSeparator;
char m_cQuoteChar;
char m_cFieldSeparator[4];
char m_cQuoteChar[4];
int m_iNumExtraBytesFieldSeparator;
int m_iNumExtraBytesQuoteChar;
CSVProgress* m_pCSVProgress;
int64_t m_nBufferSize; //! internal buffer read size
bool look_ahead(QTextStream& stream, QByteArray& sBuffer, const char** it, const char** sBufferEnd, char expected);
};
#endif