Optimise the CSV import performance

This commit bundles a number of smaller optimisations in the CSV parser
and import code. They do add up to a noticeable speed gain though (at
least on some systems and configurations).
This commit is contained in:
Martin Kleusberg
2017-09-13 15:03:13 +02:00
parent 6ed8080fdb
commit 0eb1f65798
5 changed files with 93 additions and 90 deletions
+17 -17
View File
@@ -104,7 +104,7 @@ void rollback(
class CSVImportProgress : public CSVProgress
{
public:
explicit CSVImportProgress(size_t filesize)
explicit CSVImportProgress(qint64 filesize)
{
m_pProgressDlg = new QProgressDialog(
QObject::tr("Importing CSV file..."),
@@ -124,7 +124,7 @@ public:
m_pProgressDlg->show();
}
bool update(size_t pos)
bool update(qint64 pos)
{
m_pProgressDlg->setValue(pos);
qApp->processEvents();
@@ -203,7 +203,7 @@ void ImportCsvDialog::updatePreview()
ui->tablePreview->setHorizontalHeaderLabels(horizontalHeader);
// Parse file
parseCSV(selectedFile, [this](size_t rowNum, const QStringList& data) -> bool {
parseCSV(selectedFile, [this](size_t rowNum, const QVector<QByteArray>& data) -> bool {
// Skip first row if it is to be used as header
if(rowNum == 0 && ui->checkboxHeader->isChecked())
return true;
@@ -215,7 +215,7 @@ void ImportCsvDialog::updatePreview()
// Fill data section
ui->tablePreview->setRowCount(ui->tablePreview->rowCount() + 1);
for(QStringList::const_iterator it=data.begin();it!=data.end();++it)
for(auto it=data.constBegin();it!=data.constEnd();++it)
{
// Generate vertical header items
if(it == data.begin())
@@ -225,7 +225,7 @@ void ImportCsvDialog::updatePreview()
ui->tablePreview->setItem(
rowNum,
std::distance(data.begin(), it),
new QTableWidgetItem(*it));
new QTableWidgetItem(QString(*it)));
}
return true;
@@ -320,7 +320,7 @@ void ImportCsvDialog::matchSimilar()
checkInput();
}
CSVParser::ParserResult ImportCsvDialog::parseCSV(const QString &fileName, std::function<bool(size_t, QStringList)> rowFunction, qint64 count)
CSVParser::ParserResult ImportCsvDialog::parseCSV(const QString &fileName, std::function<bool(size_t, QVector<QByteArray>)> rowFunction, size_t count)
{
// Parse all csv data
QFile file(fileName);
@@ -329,7 +329,7 @@ CSVParser::ParserResult ImportCsvDialog::parseCSV(const QString &fileName, std::
CSVParser csv(ui->checkBoxTrimFields->isChecked(), currentSeparatorChar(), currentQuoteChar());
// Only show progress dialog if we parse all rows. The assumption here is that if a row count limit has been set, it won't be a very high one.
if(count == -1)
if(count == 0)
csv.setCSVProgress(new CSVImportProgress(file.size()));
QTextStream tstream(&file);
@@ -343,7 +343,7 @@ sqlb::FieldVector ImportCsvDialog::generateFieldList(const QString& filename)
sqlb::FieldVector fieldList; // List of fields in the file
// Parse the first couple of records of the CSV file and only analyse them
parseCSV(filename, [this, &fieldList](size_t rowNum, const QStringList& data) -> bool {
parseCSV(filename, [this, &fieldList](size_t rowNum, const QVector<QByteArray>& data) -> bool {
// Has this row more columns than the previous one? Then add more fields to the field list as necessary.
for(int i=fieldList.size();i<data.size();i++)
{
@@ -436,7 +436,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)
return rollback(this, pdb, restorepointName, 0, tr("Creating restore point failed: %1").arg(pdb->lastError()));
// Create table
QStringList nullValues;
QVector<QByteArray> nullValues;
if(!importToExistingTable)
{
if(!pdb->createTable(sqlb::ObjectIdentifier("main", tableName), fieldList))
@@ -454,7 +454,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)
if(f->isInteger() && f->notnull()) // If this is an integer column but NULL isn't allowed, insert 0
nullValues << "0";
else if(f->isInteger() && !f->notnull()) // If this is an integer column and NULL is allowed, insert NULL
nullValues << QString();
nullValues << QByteArray();
else // Otherwise (i.e. if this isn't an integer column), insert an empty string
nullValues << "";
}
@@ -472,7 +472,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)
// Parse entire file
size_t lastRowNum = 0;
CSVParser::ParserResult result = parseCSV(fileName, [&](size_t rowNum, const QStringList& data) -> bool {
CSVParser::ParserResult result = parseCSV(fileName, [&](size_t rowNum, const QVector<QByteArray>& data) -> bool {
// Process the parser results row by row
#ifdef CSV_BENCHMARK
@@ -487,20 +487,20 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name)
return true;
// Bind all values
unsigned int bound_fields = 0;
for(int i=0;i<data.size();i++,bound_fields++)
int bound_fields = 0;
for(auto it=data.constBegin();it!=data.constEnd();++it,bound_fields++)
{
// Empty values need special treatment, but only when importing into an existing table where we could find out something about
// its table definition
if(importToExistingTable && data.at(i).isEmpty() && nullValues.size() > i)
if(importToExistingTable && it->isEmpty() && nullValues.size() > bound_fields)
{
// This is an empty value. We'll need to look up how to handle it depending on the field to be inserted into.
QString val = nullValues.at(i);
const QByteArray& val = nullValues.at(bound_fields);
if(!val.isNull()) // No need to bind NULL values here as that is the default bound value in SQLite
sqlite3_bind_text(stmt, i+1, val.toUtf8(), val.toUtf8().size(), SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, bound_fields+1, val, val.size(), SQLITE_STATIC);
} else {
// This is a non-empty value. Just add it to the statement
sqlite3_bind_text(stmt, i+1, static_cast<const char*>(data.at(i).toUtf8()), data.at(i).toUtf8().size(), SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, bound_fields+1, *it, it->size(), SQLITE_STATIC);
}
}
+1 -1
View File
@@ -39,7 +39,7 @@ private:
DBBrowserDB* pdb;
QCompleter* encodingCompleter;
CSVParser::ParserResult parseCSV(const QString& fileName, std::function<bool(size_t, QStringList)> rowFunction, qint64 count = -1);
CSVParser::ParserResult parseCSV(const QString& fileName, std::function<bool(size_t, QVector<QByteArray>)> rowFunction, size_t count = 0);
sqlb::FieldVector generateFieldList(const QString& filename);
void importCsv(const QString& f, const QString &n = QString());
+48 -31
View File
@@ -3,7 +3,7 @@
#include <QTextStream>
#include <algorithm>
CSVParser::CSVParser(bool trimfields, const QChar& fieldseparator, const QChar& quotechar)
CSVParser::CSVParser(bool trimfields, char16_t fieldseparator, char16_t quotechar)
: m_bTrimFields(trimfields)
, m_cFieldSeparator(fieldseparator)
, m_cQuoteChar(quotechar)
@@ -18,34 +18,49 @@ CSVParser::~CSVParser()
}
namespace {
inline void addColumn(QStringList& r, QString& field, bool trim)
inline void addColumn(QVector<QByteArray>& r, QString& field, bool trim)
{
if(trim)
r << field.trimmed();
r.push_back(field.trimmed().toUtf8());
else
r << field;
r.push_back(field.toUtf8());
field.clear();
field.reserve(128);
}
// Hands the finished row to the row callback. If the callback accepts it, the
// row buffer is emptied for reuse and the running row counter is advanced.
// Returns the callback's verdict: false means the caller should stop parsing,
// in which case neither the buffer nor the counter is touched.
inline bool addRow(CSVParser::csvRowFunction& f, QVector<QByteArray>& r, size_t& rowCount)
{
const bool accepted = f(rowCount, r);
if(accepted)
{
r.clear();
rowCount++;
}
return accepted;
}
}
CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStream& stream, qint64 nMaxRecords)
CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStream& stream, size_t nMaxRecords)
{
m_iParsedRows = 0;
m_insertFunction = insertFunction;
ParseStates state = StateNormal;
QString fieldbuf;
QStringList record;
ParseStates state = StateNormal; // State of the parser
QString sBuffer; // Buffer for reading in the file
QString fieldbuf; // Buffer for parsing the current field
QVector<QByteArray> record; // Buffer for parsing the current row
size_t parsedRows = 0; // Number of rows parsed so far
if(m_pCSVProgress)
m_pCSVProgress->start();
while(!stream.atEnd())
{
QString sBuffer = stream.read(m_nBufferSize);
sBuffer = stream.read(m_nBufferSize);
auto sBufferEnd = sBuffer.constEnd();
for(QString::iterator it = sBuffer.begin(); it != sBuffer.end(); ++it)
for(auto it = sBuffer.constBegin(); it != sBufferEnd; ++it)
{
QChar c = *it;
// Get next char
char16_t c = it->unicode();
switch(state)
{
case StateNormal:
@@ -61,30 +76,31 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
else if(c == '\r')
{
// look ahead to check for linefeed
QString::iterator nit = it + 1;
auto nit = it + 1;
// In order to check what the next byte is we must make sure that that byte is already loaded. Assume we're at an m_nBufferSize
// boundary but not at the end of the file when we hit a \r character. Now we're going to be at the end of the sBuffer string
// because of the m_nBufferSize boundary. But this means that the following check won't work properly because we can't check the
// next byte when we really should be able to do so because there's more data coming. To fix this we'll check for this particular
// case and, if this is what's happening, we'll just load an extra byte.
if(nit == sBuffer.end() && !stream.atEnd())
if(nit == sBufferEnd && !stream.atEnd())
{
// Load one more byte
sBuffer.append(stream.read(1));
sBufferEnd = sBuffer.constEnd();
// Restore both iterators. sBuffer.end() points to the imagined char after the last one in the string. So the extra byte we've
// Restore both iterators. sBufferEnd points to the imagined char after the last one in the string. So the extra byte we've
// just loaded is the one before that, i.e. the actual last one, and the original last char is the one before that.
it = sBuffer.end() - 2;
nit = sBuffer.end() - 1;
it = sBufferEnd - 2;
nit = sBufferEnd - 1;
}
// no linefeed, so assume that CR represents a newline
if(nit != sBuffer.end() && *nit != '\n')
if(nit != sBufferEnd && *nit != '\n')
{
addColumn(record, fieldbuf, m_bTrimFields);
if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}
}
@@ -92,7 +108,7 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
{
addColumn(record, fieldbuf, m_bTrimFields);
if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}
else
@@ -130,28 +146,29 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
state = StateNormal;
addColumn(record, fieldbuf, m_bTrimFields);
if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}
else if(c == '\r')
{
// look ahead to check for linefeed
QString::iterator nit = it + 1;
auto nit = it + 1;
// See above for details on this.
if(nit == sBuffer.end() && !stream.atEnd())
if(nit == sBufferEnd && !stream.atEnd())
{
sBuffer.append(stream.read(1));
it = sBuffer.end() - 2;
nit = sBuffer.end() - 1;
sBufferEnd = sBuffer.constEnd();
it = sBufferEnd - 2;
nit = sBufferEnd - 1;
}
// no linefeed, so assume that CR represents a newline
if(nit != sBuffer.end() && *nit != '\n')
if(nit != sBufferEnd && *nit != '\n')
{
addColumn(record, fieldbuf, m_bTrimFields);
if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}
}
@@ -164,11 +181,11 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
break;
}
if(nMaxRecords != -1 && m_iParsedRows >= nMaxRecords)
if(nMaxRecords > 0 && parsedRows >= nMaxRecords)
return ParserResult::ParserResultSuccess;
}
if(m_pCSVProgress && m_iParsedRows % 100 == 0)
if(m_pCSVProgress && parsedRows % 100 == 0)
{
if(!m_pCSVProgress->update(stream.pos()))
return ParserResult::ParserResultCancelled;
@@ -179,7 +196,7 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr
{
addColumn(record, fieldbuf, m_bTrimFields);
if(!addRow(record))
if(!addRow(insertFunction, record, parsedRows))
return ParserResult::ParserResultError;
}
+9 -23
View File
@@ -1,8 +1,7 @@
#ifndef CSVPARSER_H
#define CSVPARSER_H
#include <QChar>
#include <QStringList>
#include <QVector>
#include <functional>
class QTextStream;
@@ -18,16 +17,16 @@ public:
virtual ~CSVProgress() { }
virtual void start() = 0;
virtual bool update(size_t pos) = 0;
virtual bool update(qint64 pos) = 0;
virtual void end() = 0;
};
class CSVParser
{
public:
typedef std::function<bool(size_t, QStringList)> csvRowFunction;
typedef std::function<bool(size_t, QVector<QByteArray>)> csvRowFunction;
CSVParser(bool trimfields = true, const QChar& fieldseparator = ',', const QChar& quotechar = '"');
CSVParser(bool trimfields = true, char16_t fieldseparator = ',', char16_t quotechar = '"');
~CSVParser();
enum ParserResult
@@ -42,10 +41,10 @@ public:
* @param insertFunction A function pointer that is called for each parsed row. It is passed two parameters, the row number and a list of all parsed columns
* in the row. The called function may return false if an error occurred to stop the import process. Otherwise it should return true.
* \param stream Stream with the CSV parser
* \param nMaxRecords Max records too read, -1 if unlimited
* \param nMaxRecords Max records to read, 0 if unlimited
* \return ParserResult value that indicated whether action finished normally, was cancelled or errored.
*/
ParserResult parse(csvRowFunction insertFunction, QTextStream& stream, qint64 nMaxRecords = -1);
ParserResult parse(csvRowFunction insertFunction, QTextStream& stream, size_t nMaxRecords = 0);
void setCSVProgress(CSVProgress* csvp) { m_pCSVProgress = csvp; }
@@ -57,26 +56,13 @@ private:
StateEndQuote
};
inline bool addRow(QStringList& r)
{
if(!m_insertFunction(m_iParsedRows, r))
return false;
r.clear();
m_iParsedRows++;
return true;
}
private:
bool m_bTrimFields;
QChar m_cFieldSeparator;
QChar m_cQuoteChar;
char16_t m_cFieldSeparator;
char16_t m_cQuoteChar;
CSVProgress* m_pCSVProgress;
csvRowFunction m_insertFunction;
qint64 m_iParsedRows; // Number of rows parsed so far
size_t m_nBufferSize; //! internal buffer read size
qint64 m_nBufferSize; //! internal buffer read size
};
#endif
+18 -18
View File
@@ -27,7 +27,7 @@ void TestImport::csvImport()
QFETCH(char, quote);
QFETCH(QString, encoding);
QFETCH(int, numfields);
QFETCH(QVector<QStringList>, result);
QFETCH(QVector<QVector<QByteArray>>, result);
// Create temporary CSV file
QTemporaryFile file;
@@ -44,9 +44,9 @@ void TestImport::csvImport()
QTextStream tstream(&file);
tstream.setCodec(encoding.toUtf8());
QVector<QStringList> parsedCsv;
QVector<QVector<QByteArray>> parsedCsv;
int parsedCsvColumns = 0;
csvparser.parse([&parsedCsv, &parsedCsvColumns](size_t /*rowNum*/, const QStringList& data) -> bool {
csvparser.parse([&parsedCsv, &parsedCsvColumns](size_t /*rowNum*/, const QVector<QByteArray>& data) -> bool {
parsedCsv.push_back(data);
if(data.size() > parsedCsvColumns)
parsedCsvColumns = data.size();
@@ -65,12 +65,12 @@ void TestImport::csvImport_data()
QTest::addColumn<char>("quote");
QTest::addColumn<QString>("encoding");
QTest::addColumn<int>("numfields");
QTest::addColumn<QVector<QStringList>>("result");
QTest::addColumn<QVector<QVector<QByteArray>>>("result");
QVector<QStringList> result;
result.append(QStringList() << "a" << "b" << "c");
result.append(QStringList() << "d" << "e" << "f");
result.append(QStringList() << "g" << "h" << "i");
QVector<QVector<QByteArray>> result;
result.append(QVector<QByteArray>() << "a" << "b" << "c");
result.append(QVector<QByteArray>() << "d" << "e" << "f");
result.append(QVector<QByteArray>() << "g" << "h" << "i");
QTest::newRow("commas_noquotes") << "a,b,c\nd,e,f\ng,h,i\n"
<< ','
<< (char)0
@@ -109,11 +109,11 @@ void TestImport::csvImport_data()
<< result;
result.clear();
result.append(QStringList() << "a" << "b" << "");
result.append(QStringList() << "c" << "");
result.append(QStringList() << "d" << "" << "e");
result.append(QStringList() << "");
result.append(QStringList() << "" << "" << "f");
result.append(QVector<QByteArray>() << "a" << "b" << "");
result.append(QVector<QByteArray>() << "c" << "");
result.append(QVector<QByteArray>() << "d" << "" << "e");
result.append(QVector<QByteArray>() << "");
result.append(QVector<QByteArray>() << "" << "" << "f");
QTest::newRow("emptyvalues") << "a,b,\nc,\nd,,e\n\n,,f"
<< ','
<< (char)0
@@ -122,7 +122,7 @@ void TestImport::csvImport_data()
<< result;
result.clear();
result.append(QStringList() << "a" << "b" << "c");
result.append(QVector<QByteArray>() << "a" << "b" << "c");
QTest::newRow("oneline") << "a,b,c"
<< ','
<< (char)0
@@ -131,8 +131,8 @@ void TestImport::csvImport_data()
<< result;
result.clear();
result.append(QStringList() << "a,a\"" << "b" << "c");
result.append(QStringList() << "d" << "e" << "\"\"f,f");
result.append(QVector<QByteArray>() << "a,a\"" << "b" << "c");
result.append(QVector<QByteArray>() << "d" << "e" << "\"\"f,f");
QTest::newRow("manyquotes") << "\"a,a\"\"\",\"b\",\"c\"\n\"d\",\"e\",\"\"\"\"\"f,f\"\n"
<< ','
<< '"'
@@ -141,7 +141,7 @@ void TestImport::csvImport_data()
<< result;
result.clear();
result.append(QStringList() << QString::fromUtf8("\xC2\xAE") << QString::fromUtf8("\xC9\x85") << QString::fromUtf8("\xC6\x89"));
result.append(QVector<QByteArray>() << QByteArray("\xC2\xAE") << QByteArray("\xC9\x85") << QByteArray("\xC6\x89"));
QString csv = QString::fromUtf8("\xC2\xAE") + "," + QString::fromUtf8("\xC9\x85") + "," + QString::fromUtf8("\xC6\x89") + "\n";
QTest::newRow("utf8chars") << csv
<< ','
@@ -151,7 +151,7 @@ void TestImport::csvImport_data()
<< result;
result.clear();
result.append(QStringList() << QString::fromUtf8("\u4E18") << QString::fromUtf8("\u4E26") << QString::fromUtf8("\u4E4B"));
result.append(QVector<QByteArray>() << QByteArray("\u4E18") << QByteArray("\u4E26") << QByteArray("\u4E4B"));
QString csv2 = QString::fromUtf8("\u4E18") + "," + QString::fromUtf8("\u4E26") + "," + QString::fromUtf8("\u4E4B") + "\n";
QTest::newRow("utf16chars") << csv2
<< ','