From b7a00d301a2a469ba4a4b430e7b3f1b13fdc2842 Mon Sep 17 00:00:00 2001 From: Martin Kleusberg Date: Sun, 10 Sep 2017 11:07:02 +0200 Subject: [PATCH] Don't track column count when parsing CSV files When parsing a CSV file we used to check the column count for each row and track the highest number of columns that we found. This information could then be used to create an INSERT statement large enough for all the data. This column number tracking code is removed by this commit. Instead, the column count is now determined by analysing only the first 20 rows, which happens while generating the field list. Performance-wise this should take a (very) little longer, but it makes it easier to improve the performance in other ways later, which should more than compensate for the cost of this commit. Feature-wise this should fix some (technically invalid) corner-case CSV files with fewer fields in the title row than in the other rows. It should also break some other (technically invalid) corner-case CSV files if they are imported into an existing table and have fewer columns than the existing table in their first 20 rows but exactly the same number of columns later on. Both cases, I think, don't matter too much.
--- src/ImportCsvDialog.cpp | 81 +++++++++++++++++++++++++--------------- src/ImportCsvDialog.h | 2 +- src/csvparser.cpp | 2 - src/csvparser.h | 10 +---- src/tests/TestImport.cpp | 1 - 5 files changed, 52 insertions(+), 44 deletions(-) diff --git a/src/ImportCsvDialog.cpp b/src/ImportCsvDialog.cpp index 19ed122d..f2e37047 100644 --- a/src/ImportCsvDialog.cpp +++ b/src/ImportCsvDialog.cpp @@ -194,12 +194,15 @@ void ImportCsvDialog::updatePreview() csv.parse(tstream, 20); file.close(); + // Analyse CSV file + sqlb::FieldVector fieldList = generateFieldList(selectedFile); + // Reset preview widget ui->tablePreview->clear(); - ui->tablePreview->setColumnCount(csv.columns()); + ui->tablePreview->setColumnCount(fieldList.size()); // Exit if there are no lines to preview at all - if(csv.columns() == 0) + if(fieldList.size() == 0) return; // Use first row as header if necessary @@ -293,12 +296,12 @@ void ImportCsvDialog::updateSelection(bool selected) void ImportCsvDialog::matchSimilar() { auto item = ui->filePicker->currentItem(); - auto selectedHeader = generateFieldList(parseCSV(item->data(Qt::DisplayRole).toString(), 1)); + auto selectedHeader = generateFieldList(item->data(Qt::DisplayRole).toString()); for (int i = 0; i < ui->filePicker->count(); i++) { auto item = ui->filePicker->item(i); - auto header = generateFieldList(parseCSV(item->data(Qt::DisplayRole).toString(), 1)); + auto header = generateFieldList(item->data(Qt::DisplayRole).toString()); bool matchingHeader = false; if (selectedHeader.count() == header.count()) @@ -340,36 +343,50 @@ CSVParser ImportCsvDialog::parseCSV(const QString &fileName, qint64 count) return csv; } -sqlb::FieldVector ImportCsvDialog::generateFieldList(const CSVParser &parser) +sqlb::FieldVector ImportCsvDialog::generateFieldList(const QString& filename) { - if (parser.csv().size() == 0) return sqlb::FieldVector(); + // Parse the first couple of records of the CSV file and only analyse them + CSVParser parser = parseCSV(filename, 20); 
+ + // If there is no data, we don't return any fields + if(parser.csv().size() == 0) + return sqlb::FieldVector(); + + // How many columns are there in the CSV file? + int columns = 0; + for(int i=0;i columns) + columns = parser.csv().at(i).size(); + } // Generate field names. These are either taken from the first CSV row or are generated in the format of "fieldXY" depending on the user input sqlb::FieldVector fieldList; - if(ui->checkboxHeader->isChecked()) + for(int i=0;icheckboxHeader->isChecked() && i < parser.csv().at(0).size()) { - // Remove invalid characters - QString thisfield = *it; - thisfield.replace("`", ""); - thisfield.replace(" ", ""); - thisfield.replace('"', ""); - thisfield.replace("'",""); - thisfield.replace(",",""); - thisfield.replace(";",""); - - // Avoid empty field names - if(thisfield.isEmpty()) - thisfield = QString("field%1").arg(std::distance(parser.csv().at(0).begin(), it) + 1); - - fieldList.push_back(sqlb::FieldPtr(new sqlb::Field(thisfield, ""))); + // Take field name from CSV and remove invalid characters + fieldname = parser.csv().at(0).at(i); + fieldname.replace("`", ""); + fieldname.replace(" ", ""); + fieldname.replace('"', ""); + fieldname.replace("'",""); + fieldname.replace(",",""); + fieldname.replace(";",""); } - } else { - for(size_t i=0; i < parser.columns(); ++i) - fieldList.push_back(sqlb::FieldPtr(new sqlb::Field(QString("field%1").arg(i+1), ""))); + + // If we don't have a field name by now, generate one + if(fieldname.isEmpty()) + fieldname = QString("field%1").arg(i+1); + + // TODO Here's also the place to do some sort of data type analysation of the CSV data + + // Add field to the column list + fieldList.push_back(sqlb::FieldPtr(new sqlb::Field(fieldname, ""))); } return fieldList; @@ -396,11 +413,13 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name) tableName = ui->editName->text(); } + // Analyse CSV file + sqlb::FieldVector fieldList = generateFieldList(fileName); + + // Parse 
entire file CSVParser csv = parseCSV(fileName); if (csv.csv().size() == 0) return; - sqlb::FieldVector fieldList = generateFieldList(csv); - #ifdef CSV_BENCHMARK qint64 timer_after_parsing = timer.elapsed(); #endif @@ -415,7 +434,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name) const sqlb::ObjectPtr obj = pdb->getObjectByName(sqlb::ObjectIdentifier("main", tableName)); if(obj && obj->type() == sqlb::Object::Types::Table) { - if((size_t)obj.dynamicCast()->fields().size() != csv.columns()) + if(obj.dynamicCast()->fields().size() != fieldList.size()) { QMessageBox::warning(this, QApplication::applicationName(), tr("There is already a table of that name and an import into an existing table is only possible if the number of columns match.")); @@ -471,7 +490,7 @@ void ImportCsvDialog::importCsv(const QString& fileName, const QString &name) // Prepare the INSERT statement. The prepared statement can then be reused for each row to insert QString sQuery = QString("INSERT INTO %1 VALUES(").arg(sqlb::escapeIdentifier(tableName)); - for(size_t i=1;i<=csv.columns();i++) + for(int i=1;i<=fieldList.size();i++) sQuery.append(QString("?%1,").arg(i)); sQuery.chop(1); // Remove last comma sQuery.append(")"); diff --git a/src/ImportCsvDialog.h b/src/ImportCsvDialog.h index b4e355fe..c6780753 100644 --- a/src/ImportCsvDialog.h +++ b/src/ImportCsvDialog.h @@ -38,7 +38,7 @@ private: QCompleter* encodingCompleter; CSVParser parseCSV(const QString &f, qint64 count = -1); - sqlb::FieldVector generateFieldList(const CSVParser& parser); + sqlb::FieldVector generateFieldList(const QString& filename); void importCsv(const QString& f, const QString &n = QString()); diff --git a/src/csvparser.cpp b/src/csvparser.cpp index 69907a63..55e1825b 100644 --- a/src/csvparser.cpp +++ b/src/csvparser.cpp @@ -8,7 +8,6 @@ CSVParser::CSVParser(bool trimfields, const QChar& fieldseparator, const QChar& , m_cFieldSeparator(fieldseparator) , m_cQuoteChar(quotechar) , 
m_pCSVProgress(0) - , m_nColumns(0) , m_nBufferSize(4096) { } @@ -32,7 +31,6 @@ inline void addColumn(QStringList& r, QString& field, bool trim) bool CSVParser::parse(QTextStream& stream, qint64 nMaxRecords) { m_vCSVData.clear(); - m_nColumns = 0; ParseStates state = StateNormal; QString fieldbuf; QStringList record; diff --git a/src/csvparser.h b/src/csvparser.h index d886ae23..6a6c9ca0 100644 --- a/src/csvparser.h +++ b/src/csvparser.h @@ -44,12 +44,6 @@ public: */ const TCSVResult& csv() const { return m_vCSVData; } - /*! - * \brief columns - * \return Number of columns parsed - */ - size_t columns() const { return m_nColumns; } - void setCSVProgress(CSVProgress* csvp) { m_pCSVProgress = csvp; } private: @@ -63,7 +57,6 @@ private: inline void addRow(QStringList& r) { m_vCSVData.append(r); - m_nColumns = std::max(r.size(), m_nColumns); r.clear(); } @@ -74,9 +67,8 @@ private: CSVProgress* m_pCSVProgress; TCSVResult m_vCSVData; - size_t m_nColumns; size_t m_nBufferSize; //! internal buffer read size }; -#endif // CSVPARSER_H +#endif diff --git a/src/tests/TestImport.cpp b/src/tests/TestImport.cpp index 6e53bc19..d7624a17 100644 --- a/src/tests/TestImport.cpp +++ b/src/tests/TestImport.cpp @@ -48,7 +48,6 @@ void TestImport::csvImport() // Check return values QCOMPARE(csvparser.csv(), result); - QCOMPARE((int)csvparser.columns(), numfields); } void TestImport::csvImport_data()