From 0eb1f6579815ee1148323ce928d24eab9f0f8002 Mon Sep 17 00:00:00 2001 From: Martin Kleusberg Date: Wed, 13 Sep 2017 15:03:13 +0200 Subject: [PATCH] Optimise the CSV import performance This commit bundles a number of smaller optimisations in the CSV parser and import code. They do add up to a noticeable speed gain though (at least on some systems and configurations). --- src/ImportCsvDialog.cpp | 34 ++++++++--------- src/ImportCsvDialog.h | 2 +- src/csvparser.cpp | 79 ++++++++++++++++++++++++---------------- src/csvparser.h | 32 +++++----------- src/tests/TestImport.cpp | 36 +++++++++--------- 5 files changed, 93 insertions(+), 90 deletions(-) diff --git a/src/ImportCsvDialog.cpp b/src/ImportCsvDialog.cpp index 4c47545a..d918472d 100644 --- a/src/ImportCsvDialog.cpp +++ b/src/ImportCsvDialog.cpp @@ -104,7 +104,7 @@ void rollback( class CSVImportProgress : public CSVProgress { public: - explicit CSVImportProgress(size_t filesize) + explicit CSVImportProgress(qint64 filesize) { m_pProgressDlg = new QProgressDialog( QObject::tr("Importing CSV file..."), @@ -124,7 +124,7 @@ public: m_pProgressDlg->show(); } - bool update(size_t pos) + bool update(qint64 pos) { m_pProgressDlg->setValue(pos); qApp->processEvents(); @@ -203,7 +203,7 @@ void ImportCsvDialog::updatePreview() ui->tablePreview->setHorizontalHeaderLabels(horizontalHeader); // Parse file - parseCSV(selectedFile, [this](size_t rowNum, const QStringList& data) -> bool { + parseCSV(selectedFile, [this](size_t rowNum, const QVector& data) -> bool { // Skip first row if it is to be used as header if(rowNum == 0 && ui->checkboxHeader->isChecked()) return true; @@ -215,7 +215,7 @@ void ImportCsvDialog::updatePreview() // Fill data section ui->tablePreview->setRowCount(ui->tablePreview->rowCount() + 1); - for(QStringList::const_iterator it=data.begin();it!=data.end();++it) + for(auto it=data.constBegin();it!=data.constEnd();++it) { // Generate vertical header items if(it == data.begin()) @@ -225,7 +225,7 @@ void 
ImportCsvDialog::updatePreview() ui->tablePreview->setItem( rowNum, std::distance(data.begin(), it), - new QTableWidgetItem(*it)); + new QTableWidgetItem(QString(*it))); } return true; @@ -320,7 +320,7 @@ void ImportCsvDialog::matchSimilar() checkInput(); } -CSVParser::ParserResult ImportCsvDialog::parseCSV(const QString &fileName, std::function rowFunction, qint64 count) +CSVParser::ParserResult ImportCsvDialog::parseCSV(const QString &fileName, std::function)> rowFunction, size_t count) { // Parse all csv data QFile file(fileName); @@ -329,7 +329,7 @@ CSVParser::ParserResult ImportCsvDialog::parseCSV(const QString &fileName, std:: CSVParser csv(ui->checkBoxTrimFields->isChecked(), currentSeparatorChar(), currentQuoteChar()); // Only show progress dialog if we parse all rows. The assumption here is that if a row count limit has been set, it won't be a very high one. - if(count == -1) + if(count == 0) csv.setCSVProgress(new CSVImportProgress(file.size())); QTextStream tstream(&file); @@ -343,7 +343,7 @@ sqlb::FieldVector ImportCsvDialog::generateFieldList(const QString& filename) sqlb::FieldVector fieldList; // List of fields in the file // Parse the first couple of records of the CSV file and only analyse them - parseCSV(filename, [this, &fieldList](size_t rowNum, const QStringList& data) -> bool { + parseCSV(filename, [this, &fieldList](size_t rowNum, const QVector& data) -> bool { // Has this row more columns than the previous one? Then add more fields to the field list as necessary. 
for(int i=fieldList.size();ilastError())); // Create table - QStringList nullValues; + QVector nullValues; if(!importToExistingTable) { if(!pdb->createTable(sqlb::ObjectIdentifier("main", tableName), fieldList)) @@ -454,7 +454,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name) if(f->isInteger() && f->notnull()) // If this is an integer column but NULL isn't allowed, insert 0 nullValues << "0"; else if(f->isInteger() && !f->notnull()) // If this is an integer column and NULL is allowed, insert NULL - nullValues << QString(); + nullValues << QByteArray(); else // Otherwise (i.e. if this isn't an integer column), insert an empty string nullValues << ""; } @@ -472,7 +472,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name) // Parse entire file size_t lastRowNum = 0; - CSVParser::ParserResult result = parseCSV(fileName, [&](size_t rowNum, const QStringList& data) -> bool { + CSVParser::ParserResult result = parseCSV(fileName, [&](size_t rowNum, const QVector& data) -> bool { // Process the parser results row by row #ifdef CSV_BENCHMARK @@ -487,20 +487,20 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name) return true; // Bind all values - unsigned int bound_fields = 0; - for(int i=0;i i) + if(importToExistingTable && it->isEmpty() && nullValues.size() > bound_fields) { // This is an empty value. We'll need to look up how to handle it depending on the field to be inserted into. - QString val = nullValues.at(i); + const QByteArray& val = nullValues.at(bound_fields); if(!val.isNull()) // No need to bind NULL values here as that is the default bound value in SQLite - sqlite3_bind_text(stmt, i+1, val.toUtf8(), val.toUtf8().size(), SQLITE_TRANSIENT); + sqlite3_bind_text(stmt, bound_fields+1, val, val.size(), SQLITE_STATIC); } else { // This is a non-empty value. 
Just add it to the statement - sqlite3_bind_text(stmt, i+1, static_cast(data.at(i).toUtf8()), data.at(i).toUtf8().size(), SQLITE_TRANSIENT); + sqlite3_bind_text(stmt, bound_fields+1, *it, it->size(), SQLITE_STATIC); } } diff --git a/src/ImportCsvDialog.h b/src/ImportCsvDialog.h index 3eec6cb2..9821e48c 100644 --- a/src/ImportCsvDialog.h +++ b/src/ImportCsvDialog.h @@ -39,7 +39,7 @@ private: DBBrowserDB* pdb; QCompleter* encodingCompleter; - CSVParser::ParserResult parseCSV(const QString& fileName, std::function rowFunction, qint64 count = -1); + CSVParser::ParserResult parseCSV(const QString& fileName, std::function)> rowFunction, size_t count = 0); sqlb::FieldVector generateFieldList(const QString& filename); void importCsv(const QString& f, const QString &n = QString()); diff --git a/src/csvparser.cpp b/src/csvparser.cpp index cdd6e0c1..dfc72a11 100644 --- a/src/csvparser.cpp +++ b/src/csvparser.cpp @@ -3,7 +3,7 @@ #include #include -CSVParser::CSVParser(bool trimfields, const QChar& fieldseparator, const QChar& quotechar) +CSVParser::CSVParser(bool trimfields, char16_t fieldseparator, char16_t quotechar) : m_bTrimFields(trimfields) , m_cFieldSeparator(fieldseparator) , m_cQuoteChar(quotechar) @@ -18,34 +18,49 @@ CSVParser::~CSVParser() } namespace { -inline void addColumn(QStringList& r, QString& field, bool trim) +inline void addColumn(QVector& r, QString& field, bool trim) { if(trim) - r << field.trimmed(); + r.push_back(field.trimmed().toUtf8()); else - r << field; + r.push_back(field.toUtf8()); + field.clear(); + field.reserve(128); +} + +inline bool addRow(CSVParser::csvRowFunction& f, QVector& r, size_t& rowCount) +{ + if(!f(rowCount, r)) + return false; + + r.clear(); + rowCount++; + return true; } } -CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStream& stream, qint64 nMaxRecords) +CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStream& stream, size_t nMaxRecords) { - m_iParsedRows = 0; - 
m_insertFunction = insertFunction; - ParseStates state = StateNormal; - QString fieldbuf; - QStringList record; + ParseStates state = StateNormal; // State of the parser + QString sBuffer; // Buffer for reading in the file + QString fieldbuf; // Buffer for parsing the current field + QVector record; // Buffer for parsing the current row + size_t parsedRows = 0; // Number of rows parsed so far if(m_pCSVProgress) m_pCSVProgress->start(); while(!stream.atEnd()) { - QString sBuffer = stream.read(m_nBufferSize); + sBuffer = stream.read(m_nBufferSize); + auto sBufferEnd = sBuffer.constEnd(); - for(QString::iterator it = sBuffer.begin(); it != sBuffer.end(); ++it) + for(auto it = sBuffer.constBegin(); it != sBufferEnd; ++it) { - QChar c = *it; + // Get next char + char16_t c = it->unicode(); + switch(state) { case StateNormal: @@ -61,30 +76,31 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr else if(c == '\r') { // look ahead to check for linefeed - QString::iterator nit = it + 1; + auto nit = it + 1; // In order to check what the next byte is we must make sure that that byte is already loaded. Assume we're at an m_nBufferSize // boundary but not at the end of the file when we hit a \r character. Now we're going to be at the end of the sBuffer string // because of the m_nBufferSize boundary. But this means that the following check won't work properly because we can't check the // next byte when we really should be able to do so because there's more data coming. To fix this we'll check for this particular // case and, if this is what's happening, we'll just load an extra byte. - if(nit == sBuffer.end() && !stream.atEnd()) + if(nit == sBufferEnd && !stream.atEnd()) { // Load one more byte sBuffer.append(stream.read(1)); + sBufferEnd = sBuffer.constEnd(); - // Restore both iterators. sBuffer.end() points to the imagined char after the last one in the string. So the extra byte we've + // Restore both iterators. 
sBufferEnd points to the imagined char after the last one in the string. So the extra byte we've // just loaded is the one before that, i.e. the actual last one, and the original last char is the one before that. - it = sBuffer.end() - 2; - nit = sBuffer.end() - 1; + it = sBufferEnd - 2; + nit = sBufferEnd - 1; } // no linefeed, so assume that CR represents a newline - if(nit != sBuffer.end() && *nit != '\n') + if(nit != sBufferEnd && *nit != '\n') { addColumn(record, fieldbuf, m_bTrimFields); - if(!addRow(record)) + if(!addRow(insertFunction, record, parsedRows)) return ParserResult::ParserResultError; } } @@ -92,7 +108,7 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr { addColumn(record, fieldbuf, m_bTrimFields); - if(!addRow(record)) + if(!addRow(insertFunction, record, parsedRows)) return ParserResult::ParserResultError; } else @@ -130,28 +146,29 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr state = StateNormal; addColumn(record, fieldbuf, m_bTrimFields); - if(!addRow(record)) + if(!addRow(insertFunction, record, parsedRows)) return ParserResult::ParserResultError; } else if(c == '\r') { // look ahead to check for linefeed - QString::iterator nit = it + 1; + auto nit = it + 1; // See above for details on this. 
- if(nit == sBuffer.end() && !stream.atEnd()) + if(nit == sBufferEnd && !stream.atEnd()) { sBuffer.append(stream.read(1)); - it = sBuffer.end() - 2; - nit = sBuffer.end() - 1; + sBufferEnd = sBuffer.constEnd(); + it = sBufferEnd - 2; + nit = sBufferEnd - 1; } // no linefeed, so assume that CR represents a newline - if(nit != sBuffer.end() && *nit != '\n') + if(nit != sBufferEnd && *nit != '\n') { addColumn(record, fieldbuf, m_bTrimFields); - if(!addRow(record)) + if(!addRow(insertFunction, record, parsedRows)) return ParserResult::ParserResultError; } } @@ -164,11 +181,11 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr break; } - if(nMaxRecords != -1 && m_iParsedRows >= nMaxRecords) + if(nMaxRecords > 0 && parsedRows >= nMaxRecords) return ParserResult::ParserResultSuccess; } - if(m_pCSVProgress && m_iParsedRows % 100 == 0) + if(m_pCSVProgress && parsedRows % 100 == 0) { if(!m_pCSVProgress->update(stream.pos())) return ParserResult::ParserResultCancelled; @@ -179,7 +196,7 @@ CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStr { addColumn(record, fieldbuf, m_bTrimFields); - if(!addRow(record)) + if(!addRow(insertFunction, record, parsedRows)) return ParserResult::ParserResultError; } diff --git a/src/csvparser.h b/src/csvparser.h index 4f8fad57..f6720210 100644 --- a/src/csvparser.h +++ b/src/csvparser.h @@ -1,8 +1,7 @@ #ifndef CSVPARSER_H #define CSVPARSER_H -#include -#include +#include #include class QTextStream; @@ -18,16 +17,16 @@ public: virtual ~CSVProgress() { } virtual void start() = 0; - virtual bool update(size_t pos) = 0; + virtual bool update(qint64 pos) = 0; virtual void end() = 0; }; class CSVParser { public: - typedef std::function csvRowFunction; + typedef std::function)> csvRowFunction; - CSVParser(bool trimfields = true, const QChar& fieldseparator = ',', const QChar& quotechar = '"'); + CSVParser(bool trimfields = true, char16_t fieldseparator = ',', char16_t quotechar = '"'); 
~CSVParser(); enum ParserResult @@ -42,10 +41,10 @@ public: * @param insertFunction A function pointer that is called for each parsed row. It is passed two parameters, the row number and a list of all parsed columns * in the row. The called function may return false if an error ocurred to stop the import process. Otherwise it should return true. * \param stream Stream with the CSV parser - * \param nMaxRecords Max records too read, -1 if unlimited + * \param nMaxRecords Max records too read, 0 if unlimited * \return ParserResult value that indicated whether action finished normally, was cancelled or errored. */ - ParserResult parse(csvRowFunction insertFunction, QTextStream& stream, qint64 nMaxRecords = -1); + ParserResult parse(csvRowFunction insertFunction, QTextStream& stream, size_t nMaxRecords = 0); void setCSVProgress(CSVProgress* csvp) { m_pCSVProgress = csvp; } @@ -57,26 +56,13 @@ private: StateEndQuote }; - inline bool addRow(QStringList& r) - { - if(!m_insertFunction(m_iParsedRows, r)) - return false; - - r.clear(); - m_iParsedRows++; - return true; - } - private: bool m_bTrimFields; - QChar m_cFieldSeparator; - QChar m_cQuoteChar; + char16_t m_cFieldSeparator; + char16_t m_cQuoteChar; CSVProgress* m_pCSVProgress; - csvRowFunction m_insertFunction; - qint64 m_iParsedRows; // Number of rows parsed so far - - size_t m_nBufferSize; //! internal buffer read size + qint64 m_nBufferSize; //! 
internal buffer read size }; #endif diff --git a/src/tests/TestImport.cpp b/src/tests/TestImport.cpp index 001af450..8933dd7e 100644 --- a/src/tests/TestImport.cpp +++ b/src/tests/TestImport.cpp @@ -27,7 +27,7 @@ void TestImport::csvImport() QFETCH(char, quote); QFETCH(QString, encoding); QFETCH(int, numfields); - QFETCH(QVector, result); + QFETCH(QVector>, result); // Create temporary CSV file QTemporaryFile file; @@ -44,9 +44,9 @@ void TestImport::csvImport() QTextStream tstream(&file); tstream.setCodec(encoding.toUtf8()); - QVector parsedCsv; + QVector> parsedCsv; int parsedCsvColumns = 0; - csvparser.parse([&parsedCsv, &parsedCsvColumns](size_t /*rowNum*/, const QStringList& data) -> bool { + csvparser.parse([&parsedCsv, &parsedCsvColumns](size_t /*rowNum*/, const QVector& data) -> bool { parsedCsv.push_back(data); if(data.size() > parsedCsvColumns) parsedCsvColumns = data.size(); @@ -65,12 +65,12 @@ void TestImport::csvImport_data() QTest::addColumn("quote"); QTest::addColumn("encoding"); QTest::addColumn("numfields"); - QTest::addColumn>("result"); + QTest::addColumn>>("result"); - QVector result; - result.append(QStringList() << "a" << "b" << "c"); - result.append(QStringList() << "d" << "e" << "f"); - result.append(QStringList() << "g" << "h" << "i"); + QVector> result; + result.append(QVector() << "a" << "b" << "c"); + result.append(QVector() << "d" << "e" << "f"); + result.append(QVector() << "g" << "h" << "i"); QTest::newRow("commas_noquotes") << "a,b,c\nd,e,f\ng,h,i\n" << ',' << (char)0 @@ -109,11 +109,11 @@ void TestImport::csvImport_data() << result; result.clear(); - result.append(QStringList() << "a" << "b" << ""); - result.append(QStringList() << "c" << ""); - result.append(QStringList() << "d" << "" << "e"); - result.append(QStringList() << ""); - result.append(QStringList() << "" << "" << "f"); + result.append(QVector() << "a" << "b" << ""); + result.append(QVector() << "c" << ""); + result.append(QVector() << "d" << "" << "e"); + 
result.append(QVector() << ""); + result.append(QVector() << "" << "" << "f"); QTest::newRow("emptyvalues") << "a,b,\nc,\nd,,e\n\n,,f" << ',' << (char)0 @@ -122,7 +122,7 @@ void TestImport::csvImport_data() << result; result.clear(); - result.append(QStringList() << "a" << "b" << "c"); + result.append(QVector() << "a" << "b" << "c"); QTest::newRow("oneline") << "a,b,c" << ',' << (char)0 @@ -131,8 +131,8 @@ void TestImport::csvImport_data() << result; result.clear(); - result.append(QStringList() << "a,a\"" << "b" << "c"); - result.append(QStringList() << "d" << "e" << "\"\"f,f"); + result.append(QVector() << "a,a\"" << "b" << "c"); + result.append(QVector() << "d" << "e" << "\"\"f,f"); QTest::newRow("manyquotes") << "\"a,a\"\"\",\"b\",\"c\"\n\"d\",\"e\",\"\"\"\"\"f,f\"\n" << ',' << '"' @@ -141,7 +141,7 @@ void TestImport::csvImport_data() << result; result.clear(); - result.append(QStringList() << QString::fromUtf8("\xC2\xAE") << QString::fromUtf8("\xC9\x85") << QString::fromUtf8("\xC6\x89")); + result.append(QVector() << QByteArray("\xC2\xAE") << QByteArray("\xC9\x85") << QByteArray("\xC6\x89")); QString csv = QString::fromUtf8("\xC2\xAE") + "," + QString::fromUtf8("\xC9\x85") + "," + QString::fromUtf8("\xC6\x89") + "\n"; QTest::newRow("utf8chars") << csv << ',' @@ -151,7 +151,7 @@ void TestImport::csvImport_data() << result; result.clear(); - result.append(QStringList() << QString::fromUtf8("\u4E18") << QString::fromUtf8("\u4E26") << QString::fromUtf8("\u4E4B")); + result.append(QVector() << QByteArray("\u4E18") << QByteArray("\u4E26") << QByteArray("\u4E4B")); QString csv2 = QString::fromUtf8("\u4E18") + "," + QString::fromUtf8("\u4E26") + "," + QString::fromUtf8("\u4E4B") + "\n"; QTest::newRow("utf16chars") << csv2 << ','