From 97e2025cc9127a4e29bb30a8d419bcfc638ec7f0 Mon Sep 17 00:00:00 2001 From: Peinthor Rene Date: Tue, 2 Sep 2014 18:05:04 +0200 Subject: [PATCH] cvsparser: Newly implemented CSV Parser Moved parser into its own class This parser now properly supports new lines in quoted text and returns a QVector result. --- CMakeLists.txt | 2 + src/ImportCsvDialog.cpp | 171 ++++++++++++++++++++++++++------------- src/csvparser.cpp | 142 ++++++++++++++++++++++++++++++++ src/csvparser.h | 81 +++++++++++++++++++ src/sqlitedb.cpp | 107 ------------------------ src/sqlitedb.h | 2 - src/tests/TestImport.cpp | 52 ++++++++---- src/tests/TestImport.h | 8 ++ tests/CMakeLists.txt | 2 + 9 files changed, 387 insertions(+), 180 deletions(-) create mode 100644 src/csvparser.cpp create mode 100644 src/csvparser.h diff --git a/CMakeLists.txt b/CMakeLists.txt index c986af6d..844fffdd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,7 @@ endif() set(SQLB_HDR src/gen_version.h src/sqlitetypes.h + src/csvparser.h src/grammar/sqlite3TokenTypes.hpp src/grammar/Sqlite3Lexer.hpp src/grammar/Sqlite3Parser.hpp @@ -73,6 +74,7 @@ set(SQLB_SRC src/sqlitetablemodel.cpp src/sqlitetypes.cpp src/sqltextedit.cpp + src/csvparser.cpp src/DbStructureModel.cpp src/grammar/Sqlite3Lexer.cpp src/grammar/Sqlite3Parser.cpp diff --git a/src/ImportCsvDialog.cpp b/src/ImportCsvDialog.cpp index 37249eda..9564d3ad 100644 --- a/src/ImportCsvDialog.cpp +++ b/src/ImportCsvDialog.cpp @@ -1,6 +1,7 @@ #include "ImportCsvDialog.h" #include "ui_ImportCsvDialog.h" #include "sqlitedb.h" +#include "csvparser.h" #include #include @@ -9,6 +10,9 @@ #include #include #include +#include +#include +#include ImportCsvDialog::ImportCsvDialog(const QString& filename, DBBrowserDB* db, QWidget* parent) : QDialog(parent), @@ -45,27 +49,77 @@ void rollback(ImportCsvDialog* dialog, DBBrowserDB* pdb, QProgressDialog& progre } } +class CSVImportProgress : public CSVProgress +{ +public: + CSVImportProgress(size_t filesize) + { + m_pProgressDlg 
= new QProgressDialog( + QObject::tr("Decoding CSV file..."), + QObject::tr("Cancel"), + 0, + filesize); + m_pProgressDlg->setWindowModality(Qt::ApplicationModal); + } + + ~CSVImportProgress() + { + delete m_pProgressDlg; + } + + void start() + { + m_pProgressDlg->show(); + } + + bool update(size_t pos) + { + m_pProgressDlg->setValue(pos); + qApp->processEvents(); + + return !m_pProgressDlg->wasCanceled(); + } + + void end() + { + m_pProgressDlg->hide(); + } + +private: + QProgressDialog* m_pProgressDlg; +}; + void ImportCsvDialog::accept() { QString sql; // Parse all csv data - int numfields; - QStringList curList = pdb->decodeCSV(csvFilename, currentSeparatorChar(), currentQuoteChar(), currentEncoding(), -1, &numfields); + QFile file(csvFilename); + file.open(QIODevice::ReadOnly | QIODevice::Text); - // Can not operate on an empty result - if(numfields == 0) + CSVParser csv(true, currentSeparatorChar(), currentQuoteChar()); + csv.setCSVProgress(new CSVImportProgress(file.size())); + + QTextStream tstream(&file); + tstream.setCodec(currentEncoding().toUtf8()); + csv.parse(tstream); + file.close(); + + if(csv.csv().size() == 0) return; // Generate field names. 
These are either taken from the first CSV row or are generated in the format of "fieldXY" depending on the user input sqlb::FieldVector fieldList; + CSVParser::TCSVResult::const_iterator itBegin = csv.csv().begin(); if(ui->checkboxHeader->isChecked()) { - int cfieldnum = 0; - while(!curList.empty() && cfieldnum != numfields) + ++itBegin; + for(QStringList::const_iterator it = csv.csv().at(0).begin(); + it != csv.csv().at(0).end(); + ++it) { // Remove invalid characters - QString thisfield = curList.front(); + QString thisfield = *it; thisfield.replace("`", ""); thisfield.replace(" ", ""); thisfield.replace('"', ""); @@ -75,24 +129,19 @@ void ImportCsvDialog::accept() // Avoid empty field names if(thisfield.isEmpty()) - thisfield = QString("field%1").arg(cfieldnum+1); + thisfield = QString("field%1").arg(std::distance(csv.csv().at(0).begin(), it) + 1); fieldList.push_back(sqlb::FieldPtr(new sqlb::Field(thisfield, ""))); - cfieldnum++; - curList.pop_front(); } } else { - for(int i=0; i < numfields; ++i) + for(int i=0; i < csv.columns(); ++i) fieldList.push_back(sqlb::FieldPtr(new sqlb::Field(QString("field%1").arg(i+1), ""))); } // Show progress dialog - QProgressDialog progress(tr("Inserting data..."), tr("Cancel"), 0, curList.size()); + QProgressDialog progress(tr("Inserting data..."), tr("Cancel"), 0, csv.csv().size()); progress.setWindowModality(Qt::ApplicationModal); - // declare local variables we will need before the rollback jump - int colNum = 0; - // Are we importing into an existing table? 
bool importToExistingTable = false; objectMap objects = pdb->getBrowsableObjects(); @@ -100,7 +149,7 @@ void ImportCsvDialog::accept() { if(i.value().gettype() == "table" && i.value().getname() == ui->editName->text()) { - if(i.value().table.fields().size() != numfields) + if(i.value().table.fields().size() != csv.columns()) { QMessageBox::warning(this, QApplication::applicationName(), tr("There is already a table of that name and an import into an existing table is only possible if the number of columns match.")); @@ -131,28 +180,30 @@ void ImportCsvDialog::accept() } // now lets import all data, one row at a time - for(int i=0;ieditName->text()); + sql = QString("INSERT INTO `%1` VALUES(").arg(ui->editName->text()); - // need to mprintf here - char* formSQL = sqlite3_mprintf("%Q", (const char*)curList[i].toUtf8()); - sql.append(formSQL); - if(formSQL) - sqlite3_free(formSQL); - - colNum++; - if(colNum < numfields) + for(QStringList::const_iterator jt = it->begin(); jt != it->end(); ++jt) { - sql.append(","); - } else { - colNum = 0; - sql.append(");"); - if(!pdb->executeSQL(sql, false, false)) - return rollback(this, pdb, progress, restorepointName); + // need to mprintf here + char* formSQL = sqlite3_mprintf("%Q", (const char*)jt->toUtf8()); + sql.append(formSQL); + if(formSQL) + sqlite3_free(formSQL); + + if(jt != (it->end() - 1)) + sql.append((',')); } - progress.setValue(i); + + sql.append(");"); + + if(!pdb->executeSQL(sql, false, false)) + return rollback(this, pdb, progress, restorepointName); + + progress.setValue(std::distance(csv.csv().begin(), it)); if(progress.wasCanceled()) return rollback(this, pdb, progress, restorepointName); } @@ -169,42 +220,52 @@ void ImportCsvDialog::updatePreview() ui->editCustomEncoding->setVisible(ui->comboEncoding->currentIndex() == ui->comboEncoding->count()-1); // Get preview data - int numfields; - int maxrecs = 20; - QStringList curList = pdb->decodeCSV(csvFilename, currentSeparatorChar(), currentQuoteChar(), 
currentEncoding(), maxrecs, &numfields); + QFile file(csvFilename); + file.open(QIODevice::ReadOnly | QIODevice::Text); + + CSVParser csv(true, currentSeparatorChar(), currentQuoteChar()); + + QTextStream tstream(&file); + tstream.setCodec(currentEncoding().toUtf8()); + csv.parse(tstream, 20); + file.close(); // Reset preview widget ui->tablePreview->clear(); - ui->tablePreview->setColumnCount(numfields); + ui->tablePreview->setColumnCount(csv.columns()); // Exit if there are no lines to preview at all - if(numfields == 0) + if(csv.columns() == 0) return; // Use first row as header if necessary + CSVParser::TCSVResult::const_iterator itBegin = csv.csv().begin(); if(ui->checkboxHeader->isChecked()) { - ui->tablePreview->setHorizontalHeaderLabels(curList); - - // Remove this row to not show it in the data section - for(int e=0;e < numfields; ++e) - curList.pop_front(); + ui->tablePreview->setHorizontalHeaderLabels(*itBegin); + ++itBegin; } // Fill data section - ui->tablePreview->setRowCount(curList.count() / numfields); - int rowNum = 0; - int colNum = 0; - for(QStringList::Iterator ct=curList.begin();ct!=curList.end();++ct) + ui->tablePreview->setRowCount(std::distance(itBegin, csv.csv().end())); + + for(CSVParser::TCSVResult::const_iterator ct = itBegin; + ct != csv.csv().end(); + ++ct) { - if(colNum == 0) - ui->tablePreview->setVerticalHeaderItem(rowNum, new QTableWidgetItem(QString::number(rowNum + 1))); - ui->tablePreview->setItem(rowNum, colNum, new QTableWidgetItem(*ct)); - colNum++; - if(colNum == numfields) + for(QStringList::const_iterator it = ct->begin(); it != ct->end(); ++it) { - colNum = 0; - rowNum++; + int rowNum = std::distance(itBegin, ct); + if(it == ct->begin()) + { + ui->tablePreview->setVerticalHeaderItem( + rowNum, + new QTableWidgetItem(QString::number(rowNum + 1))); + } + ui->tablePreview->setItem( + rowNum, + std::distance(ct->begin(), it), + new QTableWidgetItem(*it)); } } } diff --git a/src/csvparser.cpp b/src/csvparser.cpp new file mode 
100644 index 00000000..db0268ff --- /dev/null +++ b/src/csvparser.cpp @@ -0,0 +1,142 @@ +#include "csvparser.h" + +#include +#include + +CSVParser::CSVParser(bool trimfields, const QChar& fieldseparator, const QChar& quotechar) + : m_bTrimFields(trimfields) + , m_cFieldSeparator(fieldseparator) + , m_cQuoteChar(quotechar) + , m_pCSVProgress(0) + , m_nBufferSize(4096) +{ +} + +CSVParser::~CSVParser() +{ + delete m_pCSVProgress; +} + +namespace { +inline void addColumn(QStringList& r, QString& field, bool trim) +{ + if(trim) + r << field.trimmed(); + else + r << field; + field.clear(); +} +} + +bool CSVParser::parse(QTextStream& stream, int64_t nMaxRecords) +{ + m_vCSVData.clear(); + m_nColumns = 0; + ParseStates state = StateNormal; + QString fieldbuf; + QStringList record; + + if(m_pCSVProgress) + m_pCSVProgress->start(); + + while(!stream.atEnd()) + { + QString sBuffer = stream.read(m_nBufferSize); + + for(QString::iterator it = sBuffer.begin(); it != sBuffer.end(); ++it) + { + QChar c = *it; + switch(state) + { + case StateNormal: + { + if(c == m_cFieldSeparator) + { + addColumn(record, fieldbuf, m_bTrimFields); + } + else if(c == m_cQuoteChar) + { + state = StateInQuote; + } + else if(c == '\r') + { + // look ahead to check for newline + QString::iterator nit = it + 1; + if(nit != sBuffer.end() && *nit != '\n') + fieldbuf.append(c); + } + else if(c == '\n') + { + addColumn(record, fieldbuf, m_bTrimFields); + + addRow(record); + } + else + { + fieldbuf.append(c); + } + } + break; + case StateInQuote: + { + if(c == m_cQuoteChar) + { + state = StateEndQuote; + } + else + { + fieldbuf.append(c); + } + } + break; + case StateEndQuote: + { + if(c == m_cQuoteChar) + { + state = StateInQuote; + fieldbuf.append(c); + } + else if(c == m_cFieldSeparator) + { + state = StateNormal; + addColumn(record, fieldbuf, m_bTrimFields); + } + else if(c == '\n') + { + state = StateNormal; + addColumn(record, fieldbuf, m_bTrimFields); + + addRow(record); + } + else + { + state = 
StateNormal; + fieldbuf.append(c); + } + } + break; + } + + if(nMaxRecords != -1 && m_vCSVData.size() >= nMaxRecords) + return true; + } + + if(m_pCSVProgress && m_vCSVData.size() % 100 == 0) + { + if(!m_pCSVProgress->update(stream.pos())) + return false; + } + } + + if(!fieldbuf.isEmpty()) + { + addColumn(record, fieldbuf, m_bTrimFields); + + addRow(record); + } + + if(m_pCSVProgress) + m_pCSVProgress->end(); + + return state == StateNormal; +} diff --git a/src/csvparser.h b/src/csvparser.h new file mode 100644 index 00000000..beaaa38c --- /dev/null +++ b/src/csvparser.h @@ -0,0 +1,81 @@ +#ifndef CSVPARSER_H +#define CSVPARSER_H + +#include +#include +#include + +class QTextStream; + +/*! + * \brief The CSVProgress class + * + * This is an abstract class; provide an overridden implementation + * to the CSVParser to get progress updates. + */ +class CSVProgress +{ +public: + virtual void start() = 0; + virtual bool update(size_t pos) = 0; + virtual void end() = 0; +}; + +class CSVParser +{ +public: + typedef QVector TCSVResult; + + CSVParser(bool trimfields = true, const QChar& fieldseparator = ',', const QChar& quotechar = '"'); + ~CSVParser(); + + /*! + * \brief parse the given stream + * \param stream Stream with the CSV data to parse + * \param nMaxRecords Max records to read, -1 if unlimited + * \return True if parsing worked. + */ + bool parse(QTextStream& stream, int64_t nMaxRecords = -1); + + /*! + * \brief csv + * \return The parse result + */ + const TCSVResult& csv() const { return m_vCSVData; } + + /*! 
+ * \brief columns + * \return Number of columns parsed + */ + size_t columns() const { return m_nColumns; } + + void setCSVProgress(CSVProgress* csvp) { m_pCSVProgress = csvp; } + +private: + enum ParseStates + { + StateNormal, + StateInQuote, + StateEndQuote + }; + + inline void addRow(QStringList& r) + { + m_vCSVData.append(r); + m_nColumns = std::max(r.size(), m_nColumns); + r.clear(); + } + +private: + bool m_bTrimFields; + QChar m_cFieldSeparator; + QChar m_cQuoteChar; + CSVProgress* m_pCSVProgress; + + TCSVResult m_vCSVData; + size_t m_nColumns; + + size_t m_nBufferSize; //! internal buffer read size +}; + +#endif // CSVPARSER_H diff --git a/src/sqlitedb.cpp b/src/sqlitedb.cpp index a1cf397f..63c47e74 100644 --- a/src/sqlitedb.cpp +++ b/src/sqlitedb.cpp @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include @@ -854,112 +853,6 @@ void DBBrowserDB::updateSchema( ) } } -QStringList DBBrowserDB::decodeCSV(const QString & csvfilename, char sep, char quote, const QString& encoding, int maxrecords, int * numfields) -{ - QFile file(csvfilename); - QStringList result; - *numfields = 0; - int recs = 0; - - if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) { - return result; - } - - //Other than QFile, the QTextStream-class properly detects 2-Byte QChars and converts them accordingly (UTF-8) - QTextStream inStream(&file); - inStream.setCodec(encoding.toUtf8()); - - QProgressDialog progress(QObject::tr("Decoding CSV file..."), QObject::tr("Cancel"), 0, file.size()); - progress.setWindowModality(Qt::ApplicationModal); - - - - while (!inStream.atEnd()) { - - bool inquotemode = false; - bool inescapemode = false; - QString line = ""; - QString current = ""; - - line = inStream.readLine(); - - //For every Line, we iterate over the single QChars - QString::ConstIterator i = line.begin(); - - while (i != line.end()) { - - QChar c = *i; - - if (c==quote){ - if (inquotemode){ - if (inescapemode){ - inescapemode = false; - //add the escaped char 
here - current.append(c); - } else { - //are we escaping, or just finishing the quote? - i++; //Performing lookahead using the iterator - QChar d = *i; - - if (d==quote) { - inescapemode = true; - } else { - inquotemode = false; - } - i--; - } - } else { - inquotemode = true; - } - } else if (c==sep) { - if (inquotemode){ - //add the sep here - current.append(c); - } else { - //not quoting, start new record - result << current; - current = ""; - } - } else if (c==10 || c==13) { - if (inquotemode){ - //add the newline/carrier return - current.append(c); - } - } else if (c==32) { - - // Only append blanks if we are inside of quotes - if (inquotemode || quote == 0) { - current.append(c); - } - - } else {//another character type - current.append(c); - } - - i++; - } - - //Moved this block from (c==10), as line-separation is now handeled by the outer-loop - result << current; - - if (*numfields == 0){ - *numfields = result.count(); - } - recs++; - progress.setValue(file.pos()); - qApp->processEvents(); - - if ( (progress.wasCanceled() || recs>maxrecords) && maxrecords!=-1) { - break; - } - } - - file.close(); - - return result; - -} - QString DBBrowserDB::getPragma(const QString& pragma) { if(!isOpen()) diff --git a/src/sqlitedb.h b/src/sqlitedb.h index 38cf147a..fd86b200 100644 --- a/src/sqlitedb.h +++ b/src/sqlitedb.h @@ -101,8 +101,6 @@ public: sqlite3 * _db; - QStringList decodeCSV(const QString & csvfilename, char sep, char quote, const QString& encoding, int maxrecords, int * numfields); - objectMap objMap; QString lastErrorMessage; diff --git a/src/tests/TestImport.cpp b/src/tests/TestImport.cpp index e3d3c9ad..439f4232 100644 --- a/src/tests/TestImport.cpp +++ b/src/tests/TestImport.cpp @@ -1,10 +1,29 @@ #include #include #include +#include +#include "csvparser.h" #include "TestImport.h" #include "../sqlitedb.h" +Q_DECLARE_METATYPE(CSVParser::TCSVResult) + +TestImport::TestImport() +{ + // Init basic application + // The app needs to be initialized for the utf8 
test + // to work + int argcount = 1; + const char* appname = "sqlb-unittests"; + app = new QApplication(argcount, const_cast(&appname)); +} + +TestImport::~TestImport() +{ + delete app; +} + void TestImport::csvImport() { // Fetch data @@ -13,12 +32,7 @@ void TestImport::csvImport() QFETCH(char, quote); QFETCH(QString, encoding); QFETCH(int, numfields); - QFETCH(QStringList, result); - - // Init basic application - int argcount = 1; - const char* appname = "sqlb-unittests"; - QApplication app(argcount, const_cast(&appname)); + QFETCH(QVector, result); // Create temporary CSV file QTemporaryFile file; @@ -28,12 +42,15 @@ void TestImport::csvImport() // Call decodeCSV function DBBrowserDB db; - int numfields_read; - QStringList retval = db.decodeCSV(file.fileName(), separator, quote, encoding, -1, &numfields_read); + + CSVParser csvparser(true, separator, quote); + file.seek(0); + QTextStream tstream(&file); + csvparser.parse(tstream); // Check return values - QCOMPARE(retval, result); - QCOMPARE(numfields_read, numfields); + QCOMPARE(csvparser.csv(), result); + QCOMPARE((int)csvparser.columns(), numfields); } void TestImport::csvImport_data() @@ -43,10 +60,12 @@ void TestImport::csvImport_data() QTest::addColumn("quote"); QTest::addColumn("encoding"); QTest::addColumn("numfields"); - QTest::addColumn("result"); + QTest::addColumn("result"); - QStringList result; - result << "a" << "b" << "c" << "d" << "e" << "f" << "g" << "h" << "i"; + CSVParser::TCSVResult result; + result.append(QStringList() << "a" << "b" << "c"); + result.append(QStringList() << "d" << "e" << "f"); + result.append(QStringList() << "g" << "h" << "i"); QTest::newRow("commas_noquotes") << "a,b,c\nd,e,f\ng,h,i\n" << ',' << (char)0 @@ -79,7 +98,7 @@ void TestImport::csvImport_data() << result; result.clear(); - result << "a" << "b" << "c"; + result.append(QStringList() << "a" << "b" << "c"); QTest::newRow("oneline") << "a,b,c" << ',' << (char)0 @@ -88,7 +107,8 @@ void TestImport::csvImport_data() << 
result; result.clear(); - result << "a,a\"" << "b" << "c" << "d" << "e" << "\"\"f,f"; + result.append(QStringList() << "a,a\"" << "b" << "c"); + result.append(QStringList() << "d" << "e" << "\"\"f,f"); QTest::newRow("manyquotes") << "\"a,a\"\"\",\"b\",\"c\"\n\"d\",\"e\",\"\"\"\"\"f,f\"\n" << ',' << '"' @@ -97,7 +117,7 @@ void TestImport::csvImport_data() << result; result.clear(); - result << QString::fromUtf8("\u4E18") << QString::fromUtf8("\u4E26") << QString::fromUtf8("\u4E4B"); + result.append(QStringList() << QString::fromUtf8("\u4E18") << QString::fromUtf8("\u4E26") << QString::fromUtf8("\u4E4B")); QString csv = QString::fromUtf8("\u4E18") + "," + QString::fromUtf8("\u4E26") + "," + QString::fromUtf8("\u4E4B") + "\n"; QTest::newRow("utf8chars") << csv << ',' diff --git a/src/tests/TestImport.h b/src/tests/TestImport.h index c2b252bd..669ae0a9 100644 --- a/src/tests/TestImport.h +++ b/src/tests/TestImport.h @@ -2,11 +2,19 @@ #define TESTIMPORT_H #include +#include class TestImport : public QObject { Q_OBJECT +public: + TestImport(); + ~TestImport(); + +private: + QApplication* app; + private slots: void csvImport(); void csvImport_data(); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index f6722d5f..5b627cd1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -32,6 +32,7 @@ set(SQLB_SRC ../src/sqlitedb.cpp ../src/sqlitetablemodel.cpp ../src/sqlitetypes.cpp + ../src/csvparser.cpp ../src/grammar/Sqlite3Lexer.cpp ../src/grammar/Sqlite3Parser.cpp ../src/tests/TestImport.cpp @@ -42,6 +43,7 @@ set(SQLB_HDR ../src/grammar/sqlite3TokenTypes.hpp ../src/grammar/Sqlite3Lexer.hpp ../src/grammar/Sqlite3Parser.hpp + ../src/csvparser.h ../src/sqlitetypes.h) set(SQLB_MOC_HDR