csvparser: Newly implemented CSV Parser

Moved the parser into its own class.
This parser now properly supports newlines in quoted text
and returns a QVector<QStringList> result.
This commit is contained in:
Peinthor Rene
2014-09-02 18:05:04 +02:00
parent 1430033767
commit 97e2025cc9
9 changed files with 387 additions and 180 deletions

View File

@@ -29,6 +29,7 @@ endif()
set(SQLB_HDR
src/gen_version.h
src/sqlitetypes.h
src/csvparser.h
src/grammar/sqlite3TokenTypes.hpp
src/grammar/Sqlite3Lexer.hpp
src/grammar/Sqlite3Parser.hpp
@@ -73,6 +74,7 @@ set(SQLB_SRC
src/sqlitetablemodel.cpp
src/sqlitetypes.cpp
src/sqltextedit.cpp
src/csvparser.cpp
src/DbStructureModel.cpp
src/grammar/Sqlite3Lexer.cpp
src/grammar/Sqlite3Parser.cpp

View File

@@ -1,6 +1,7 @@
#include "ImportCsvDialog.h"
#include "ui_ImportCsvDialog.h"
#include "sqlitedb.h"
#include "csvparser.h"
#include <QMessageBox>
#include <QProgressDialog>
@@ -9,6 +10,9 @@
#include <QTextCodec>
#include <QCompleter>
#include <sqlite3.h>
#include <QFile>
#include <QTextStream>
#include <memory>
ImportCsvDialog::ImportCsvDialog(const QString& filename, DBBrowserDB* db, QWidget* parent)
: QDialog(parent),
@@ -45,27 +49,77 @@ void rollback(ImportCsvDialog* dialog, DBBrowserDB* pdb, QProgressDialog& progre
}
}
class CSVImportProgress : public CSVProgress
{
public:
CSVImportProgress(size_t filesize)
{
m_pProgressDlg = new QProgressDialog(
QObject::tr("Decoding CSV file..."),
QObject::tr("Cancel"),
0,
filesize);
m_pProgressDlg->setWindowModality(Qt::ApplicationModal);
}
~CSVImportProgress()
{
delete m_pProgressDlg;
}
void start()
{
m_pProgressDlg->show();
}
bool update(size_t pos)
{
m_pProgressDlg->setValue(pos);
qApp->processEvents();
return !m_pProgressDlg->wasCanceled();
}
void end()
{
m_pProgressDlg->hide();
}
private:
QProgressDialog* m_pProgressDlg;
};
void ImportCsvDialog::accept()
{
QString sql;
// Parse all csv data
int numfields;
QStringList curList = pdb->decodeCSV(csvFilename, currentSeparatorChar(), currentQuoteChar(), currentEncoding(), -1, &numfields);
QFile file(csvFilename);
file.open(QIODevice::ReadOnly | QIODevice::Text);
// Can not operate on an empty result
if(numfields == 0)
CSVParser csv(true, currentSeparatorChar(), currentQuoteChar());
csv.setCSVProgress(new CSVImportProgress(file.size()));
QTextStream tstream(&file);
tstream.setCodec(currentEncoding().toUtf8());
csv.parse(tstream);
file.close();
if(csv.csv().size() == 0)
return;
// Generate field names. These are either taken from the first CSV row or are generated in the format of "fieldXY" depending on the user input
sqlb::FieldVector fieldList;
CSVParser::TCSVResult::const_iterator itBegin = csv.csv().begin();
if(ui->checkboxHeader->isChecked())
{
int cfieldnum = 0;
while(!curList.empty() && cfieldnum != numfields)
++itBegin;
for(QStringList::const_iterator it = csv.csv().at(0).begin();
it != csv.csv().at(0).end();
++it)
{
// Remove invalid characters
QString thisfield = curList.front();
QString thisfield = *it;
thisfield.replace("`", "");
thisfield.replace(" ", "");
thisfield.replace('"', "");
@@ -75,24 +129,19 @@ void ImportCsvDialog::accept()
// Avoid empty field names
if(thisfield.isEmpty())
thisfield = QString("field%1").arg(cfieldnum+1);
thisfield = QString("field%1").arg(std::distance(csv.csv().at(0).begin(), it) + 1);
fieldList.push_back(sqlb::FieldPtr(new sqlb::Field(thisfield, "")));
cfieldnum++;
curList.pop_front();
}
} else {
for(int i=0; i < numfields; ++i)
for(int i=0; i < csv.columns(); ++i)
fieldList.push_back(sqlb::FieldPtr(new sqlb::Field(QString("field%1").arg(i+1), "")));
}
// Show progress dialog
QProgressDialog progress(tr("Inserting data..."), tr("Cancel"), 0, curList.size());
QProgressDialog progress(tr("Inserting data..."), tr("Cancel"), 0, csv.csv().size());
progress.setWindowModality(Qt::ApplicationModal);
// declare local variables we will need before the rollback jump
int colNum = 0;
// Are we importing into an existing table?
bool importToExistingTable = false;
objectMap objects = pdb->getBrowsableObjects();
@@ -100,7 +149,7 @@ void ImportCsvDialog::accept()
{
if(i.value().gettype() == "table" && i.value().getname() == ui->editName->text())
{
if(i.value().table.fields().size() != numfields)
if(i.value().table.fields().size() != csv.columns())
{
QMessageBox::warning(this, QApplication::applicationName(),
tr("There is already a table of that name and an import into an existing table is only possible if the number of columns match."));
@@ -131,28 +180,30 @@ void ImportCsvDialog::accept()
}
// now lets import all data, one row at a time
for(int i=0;i<curList.size();++i)
for(CSVParser::TCSVResult::const_iterator it = itBegin;
it != csv.csv().end();
++it)
{
if(colNum == 0)
sql = QString("INSERT INTO `%1` VALUES(").arg(ui->editName->text());
sql = QString("INSERT INTO `%1` VALUES(").arg(ui->editName->text());
// need to mprintf here
char* formSQL = sqlite3_mprintf("%Q", (const char*)curList[i].toUtf8());
sql.append(formSQL);
if(formSQL)
sqlite3_free(formSQL);
colNum++;
if(colNum < numfields)
for(QStringList::const_iterator jt = it->begin(); jt != it->end(); ++jt)
{
sql.append(",");
} else {
colNum = 0;
sql.append(");");
if(!pdb->executeSQL(sql, false, false))
return rollback(this, pdb, progress, restorepointName);
// need to mprintf here
char* formSQL = sqlite3_mprintf("%Q", (const char*)jt->toUtf8());
sql.append(formSQL);
if(formSQL)
sqlite3_free(formSQL);
if(jt != (it->end() - 1))
sql.append((','));
}
progress.setValue(i);
sql.append(");");
if(!pdb->executeSQL(sql, false, false))
return rollback(this, pdb, progress, restorepointName);
progress.setValue(std::distance(csv.csv().begin(), it));
if(progress.wasCanceled())
return rollback(this, pdb, progress, restorepointName);
}
@@ -169,42 +220,52 @@ void ImportCsvDialog::updatePreview()
ui->editCustomEncoding->setVisible(ui->comboEncoding->currentIndex() == ui->comboEncoding->count()-1);
// Get preview data
int numfields;
int maxrecs = 20;
QStringList curList = pdb->decodeCSV(csvFilename, currentSeparatorChar(), currentQuoteChar(), currentEncoding(), maxrecs, &numfields);
QFile file(csvFilename);
file.open(QIODevice::ReadOnly | QIODevice::Text);
CSVParser csv(true, currentSeparatorChar(), currentQuoteChar());
QTextStream tstream(&file);
tstream.setCodec(currentEncoding().toUtf8());
csv.parse(tstream, 20);
file.close();
// Reset preview widget
ui->tablePreview->clear();
ui->tablePreview->setColumnCount(numfields);
ui->tablePreview->setColumnCount(csv.columns());
// Exit if there are no lines to preview at all
if(numfields == 0)
if(csv.columns() == 0)
return;
// Use first row as header if necessary
CSVParser::TCSVResult::const_iterator itBegin = csv.csv().begin();
if(ui->checkboxHeader->isChecked())
{
ui->tablePreview->setHorizontalHeaderLabels(curList);
// Remove this row to not show it in the data section
for(int e=0;e < numfields; ++e)
curList.pop_front();
ui->tablePreview->setHorizontalHeaderLabels(*itBegin);
++itBegin;
}
// Fill data section
ui->tablePreview->setRowCount(curList.count() / numfields);
int rowNum = 0;
int colNum = 0;
for(QStringList::Iterator ct=curList.begin();ct!=curList.end();++ct)
ui->tablePreview->setRowCount(std::distance(itBegin, csv.csv().end()));
for(CSVParser::TCSVResult::const_iterator ct = itBegin;
ct != csv.csv().end();
++ct)
{
if(colNum == 0)
ui->tablePreview->setVerticalHeaderItem(rowNum, new QTableWidgetItem(QString::number(rowNum + 1)));
ui->tablePreview->setItem(rowNum, colNum, new QTableWidgetItem(*ct));
colNum++;
if(colNum == numfields)
for(QStringList::const_iterator it = ct->begin(); it != ct->end(); ++it)
{
colNum = 0;
rowNum++;
int rowNum = std::distance(itBegin, ct);
if(it == ct->begin())
{
ui->tablePreview->setVerticalHeaderItem(
rowNum,
new QTableWidgetItem(QString::number(rowNum + 1)));
}
ui->tablePreview->setItem(
rowNum,
std::distance(ct->begin(), it),
new QTableWidgetItem(*it));
}
}
}

142
src/csvparser.cpp Normal file
View File

@@ -0,0 +1,142 @@
#include "csvparser.h"
#include <QTextStream>
#include <algorithm>
/*!
 * \brief Creates a parser for the given CSV dialect.
 * \param trimfields     When true, every field is trimmed before it is stored.
 * \param fieldseparator Character that separates two fields.
 * \param quotechar      Character that opens and closes a quoted field.
 *
 * No progress handler is installed initially, and the column count starts at
 * zero so that columns() is well defined before the first call to parse().
 */
CSVParser::CSVParser(bool trimfields, const QChar& fieldseparator, const QChar& quotechar)
    : m_bTrimFields(trimfields)
    , m_cFieldSeparator(fieldseparator)
    , m_cQuoteChar(quotechar)
    , m_pCSVProgress(0)
    , m_nColumns(0)
    , m_nBufferSize(4096)
{
}

/*!
 * \brief Destroys the parser together with the progress handler it holds.
 */
CSVParser::~CSVParser()
{
    // Deleting a null pointer is a no-op, so no check is required here.
    delete m_pCSVProgress;
}
namespace {

/*!
 * Appends the pending field text to the record - trimmed when \a trim is
 * set - and empties the field buffer so the next field starts from scratch.
 */
inline void addColumn(QStringList& r, QString& field, bool trim)
{
    r.append(trim ? field.trimmed() : field);
    field.clear();
}

}
/*!
 * \brief Parses CSV text from \a stream into rows of fields.
 *
 * The stream is read in chunks of m_nBufferSize characters and fed through a
 * three-state machine:
 *  - StateNormal:   outside of quotes; separators and newlines end fields/rows.
 *  - StateInQuote:  inside a quoted field; everything but the quote character
 *                   is taken literally, including separators and newlines.
 *  - StateEndQuote: a quote character was seen inside a quoted field; a second
 *                   quote character means an escaped quote, anything else
 *                   closes the quoted section.
 *
 * Previous results are discarded at the start of every call. If a progress
 * handler is installed, start() is called once and end() is called on every
 * exit path, so the two calls always come in pairs.
 *
 * \param stream      Stream to read the CSV text from.
 * \param nMaxRecords Maximum number of rows to read, -1 for no limit.
 * \return True when the input was consumed (or the row limit was reached).
 *         False when the progress handler asked to cancel or when the input
 *         ended inside an unterminated quoted field.
 */
bool CSVParser::parse(QTextStream& stream, int64_t nMaxRecords)
{
    m_vCSVData.clear();
    m_nColumns = 0;
    ParseStates state = StateNormal;
    QString fieldbuf;
    QStringList record;

    if(m_pCSVProgress)
        m_pCSVProgress->start();

    while(!stream.atEnd())
    {
        QString sBuffer = stream.read(m_nBufferSize);

        for(QString::iterator it = sBuffer.begin(); it != sBuffer.end(); ++it)
        {
            QChar c = *it;
            switch(state)
            {
            case StateNormal:
            {
                if(c == m_cFieldSeparator)
                {
                    addColumn(record, fieldbuf, m_bTrimFields);
                }
                else if(c == m_cQuoteChar)
                {
                    state = StateInQuote;
                }
                else if(c == '\r')
                {
                    // Look ahead: a CR directly in front of a LF is part of the
                    // line ending and is skipped, any other CR is field data.
                    // Known limitation: a CR that is the last character of a
                    // chunk is dropped without inspecting the next chunk.
                    QString::iterator nit = it + 1;
                    if(nit != sBuffer.end() && *nit != '\n')
                        fieldbuf.append(c);
                }
                else if(c == '\n')
                {
                    addColumn(record, fieldbuf, m_bTrimFields);
                    addRow(record);
                }
                else
                {
                    fieldbuf.append(c);
                }
            }
            break;
            case StateInQuote:
            {
                if(c == m_cQuoteChar)
                {
                    state = StateEndQuote;
                }
                else
                {
                    fieldbuf.append(c);
                }
            }
            break;
            case StateEndQuote:
            {
                if(c == m_cQuoteChar)
                {
                    // Doubled quote character: an escaped quote inside the field.
                    state = StateInQuote;
                    fieldbuf.append(c);
                }
                else if(c == m_cFieldSeparator)
                {
                    state = StateNormal;
                    addColumn(record, fieldbuf, m_bTrimFields);
                }
                else if(c == '\n')
                {
                    state = StateNormal;
                    addColumn(record, fieldbuf, m_bTrimFields);
                    addRow(record);
                }
                else if(c == '\r')
                {
                    // CRLF right after a closing quote: skip the CR so it does
                    // not end up in the field when trimming is switched off.
                    // The LF that follows terminates the row in StateNormal.
                    state = StateNormal;
                    QString::iterator nit = it + 1;
                    if(nit == sBuffer.end() || *nit != '\n')
                        fieldbuf.append(c);
                }
                else
                {
                    state = StateNormal;
                    fieldbuf.append(c);
                }
            }
            break;
            }

            if(nMaxRecords != -1 && m_vCSVData.size() >= nMaxRecords)
            {
                if(m_pCSVProgress)
                    m_pCSVProgress->end();
                return true;
            }
        }

        // Progress is reported once per chunk, and only while the number of
        // rows collected so far is a multiple of 100.
        if(m_pCSVProgress && m_vCSVData.size() % 100 == 0)
        {
            if(!m_pCSVProgress->update(stream.pos()))
            {
                m_pCSVProgress->end();
                return false;
            }
        }
    }

    // Flush the last row when the input does not end with a newline. Checking
    // the field buffer alone would lose rows whose final field is empty
    // ("a,b," or a trailing ""), so pending fields and a just-closed quoted
    // field count as well.
    if(!fieldbuf.isEmpty() || !record.isEmpty() || state == StateEndQuote)
    {
        addColumn(record, fieldbuf, m_bTrimFields);
        addRow(record);
    }

    if(m_pCSVProgress)
        m_pCSVProgress->end();

    // Ending right after a closing quote is well-formed input; only a quoted
    // field that was never closed is reported as a failure.
    return state != StateInQuote;
}

81
src/csvparser.h Normal file
View File

@@ -0,0 +1,81 @@
#ifndef CSVPARSER_H
#define CSVPARSER_H
#include <QChar>
#include <QVector>
#include <QStringList>
class QTextStream;
/*!
 * \brief The CSVProgress class
 *
 * Abstract interface for progress reporting. Implement it and hand an
 * instance to the CSVParser to receive progress updates while parsing.
 *
 * Implementations are destroyed through a pointer to this base class, hence
 * the virtual destructor.
 */
class CSVProgress
{
public:
    virtual ~CSVProgress() {}

    //! Called once before parsing starts.
    virtual void start() = 0;

    /*!
     * \brief Called repeatedly while parsing.
     * \param pos Current position in the input.
     * \return False to request that parsing is aborted, true to continue.
     */
    virtual bool update(size_t pos) = 0;

    //! Called when parsing is over.
    virtual void end() = 0;
};
class CSVParser
{
public:
typedef QVector<QStringList> TCSVResult;
CSVParser(bool trimfields = true, const QChar& fieldseparator = ',', const QChar& quotechar = '"');
~CSVParser();
/*!
* \brief parse the given stream
* \param stream Stream with the CSV parser
* \param nMaxRecords Max records too read, -1 if unlimited
* \return True if parsing worked.
*/
bool parse(QTextStream& stream, int64_t nMaxRecords = -1);
/*!
* \brief csv
* \return The parse result
*/
const TCSVResult& csv() const { return m_vCSVData; }
/*!
* \brief columns
* \return Number of columns parsed
*/
size_t columns() const { return m_nColumns; }
void setCSVProgress(CSVProgress* csvp) { m_pCSVProgress = csvp; }
private:
enum ParseStates
{
StateNormal,
StateInQuote,
StateEndQuote
};
inline void addRow(QStringList& r)
{
m_vCSVData.append(r);
m_nColumns = std::max<size_t>(r.size(), m_nColumns);
r.clear();
}
private:
bool m_bTrimFields;
QChar m_cFieldSeparator;
QChar m_cQuoteChar;
CSVProgress* m_pCSVProgress;
TCSVResult m_vCSVData;
size_t m_nColumns;
size_t m_nBufferSize; //! internal buffer read size
};
#endif // CSVPARSER_H

View File

@@ -5,7 +5,6 @@
#include <QMessageBox>
#include <QProgressDialog>
#include <QApplication>
#include <QTextStream>
#include <QSettings>
#include <QDebug>
#include <sqlite3.h>
@@ -854,112 +853,6 @@ void DBBrowserDB::updateSchema( )
}
}
/*!
 * Decodes a CSV file into one flat list of field values.
 *
 * The file is read line by line; the fields of all lines are appended to the
 * same list, and *numfields receives the number of entries the list holds
 * after the first line. A progress dialog is shown while decoding.
 *
 * \param csvfilename Path of the file to read.
 * \param sep         Field separator character.
 * \param quote       Quote character; a doubled quote inside a quoted field
 *                    stands for one literal quote.
 * \param encoding    Name of the text codec used to decode the file.
 * \param maxrecords  Record limit, or -1 to read the whole file.
 * \param numfields   Out parameter, set to 0 when the file cannot be opened
 *                    or contains no lines.
 * \return The decoded fields; empty when the file cannot be opened.
 */
QStringList DBBrowserDB::decodeCSV(const QString & csvfilename, char sep, char quote, const QString& encoding, int maxrecords, int * numfields)
{
    QFile file(csvfilename);
    QStringList result;
    *numfields = 0;
    int recs = 0;

    if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) {
        return result;
    }

    //Other than QFile, the QTextStream-class properly detects 2-Byte QChars and converts them accordingly (UTF-8)
    QTextStream inStream(&file);
    inStream.setCodec(encoding.toUtf8());

    QProgressDialog progress(QObject::tr("Decoding CSV file..."), QObject::tr("Cancel"), 0, file.size());
    progress.setWindowModality(Qt::ApplicationModal);

    while (!inStream.atEnd()) {
        // The quoting state starts fresh on every line.
        bool inquotemode = false;
        bool inescapemode = false;
        QString current = "";
        QString line = inStream.readLine();

        //For every Line, we iterate over the single QChars
        QString::ConstIterator i = line.begin();
        while (i != line.end()) {
            QChar c = *i;
            if (c==quote){
                if (inquotemode){
                    if (inescapemode){
                        inescapemode = false;
                        //add the escaped char here
                        current.append(c);
                    } else {
                        //are we escaping, or just finishing the quote?
                        //Look ahead without ever dereferencing line.end()
                        QString::ConstIterator next = i + 1;
                        if (next != line.end() && *next==quote) {
                            inescapemode = true;
                        } else {
                            inquotemode = false;
                        }
                    }
                } else {
                    inquotemode = true;
                }
            } else if (c==sep) {
                if (inquotemode){
                    //add the sep here
                    current.append(c);
                } else {
                    //not quoting, start new record
                    result << current;
                    current = "";
                }
            } else if (c==10 || c==13) {
                if (inquotemode){
                    //add the newline/carrier return
                    current.append(c);
                }
            } else if (c==32) {
                // Only append blanks if we are inside of quotes
                if (inquotemode || quote == 0) {
                    current.append(c);
                }
            } else {//another character type
                current.append(c);
            }
            i++;
        }

        //Moved this block from (c==10), as line-separation is now handled by the outer-loop
        result << current;
        if (*numfields == 0){
            *numfields = result.count();
        }
        recs++;
        progress.setValue(file.pos());
        qApp->processEvents();

        // Cancel always stops decoding; the record limit only applies when
        // one was requested (maxrecords != -1).
        if (progress.wasCanceled() || (maxrecords != -1 && recs > maxrecords)) {
            break;
        }
    }

    file.close();
    return result;
}
QString DBBrowserDB::getPragma(const QString& pragma)
{
if(!isOpen())

View File

@@ -101,8 +101,6 @@ public:
sqlite3 * _db;
QStringList decodeCSV(const QString & csvfilename, char sep, char quote, const QString& encoding, int maxrecords, int * numfields);
objectMap objMap;
QString lastErrorMessage;

View File

@@ -1,10 +1,29 @@
#include <QTemporaryFile>
#include <QtTest/QTest>
#include <QApplication>
#include <QTextStream>
#include "csvparser.h"
#include "TestImport.h"
#include "../sqlitedb.h"
Q_DECLARE_METATYPE(CSVParser::TCSVResult)
/*!
 * Sets up the application object that the tests rely on.
 */
TestImport::TestImport()
{
    // Init basic application
    // The app needs to be initialized for the utf8 test
    // to work.
    //
    // QApplication takes argc by reference and keeps using argc/argv for its
    // entire lifetime, so both need static storage here: locals would dangle
    // as soon as this constructor returns.
    static int argcount = 1;
    static char appname[] = "sqlb-unittests";
    static char* arguments[] = { appname, 0 };
    app = new QApplication(argcount, arguments);
}

/*!
 * Tears down the application object created in the constructor.
 */
TestImport::~TestImport()
{
    delete app;
}
void TestImport::csvImport()
{
// Fetch data
@@ -13,12 +32,7 @@ void TestImport::csvImport()
QFETCH(char, quote);
QFETCH(QString, encoding);
QFETCH(int, numfields);
QFETCH(QStringList, result);
// Init basic application
int argcount = 1;
const char* appname = "sqlb-unittests";
QApplication app(argcount, const_cast<char**>(&appname));
QFETCH(QVector<QStringList>, result);
// Create temporary CSV file
QTemporaryFile file;
@@ -28,12 +42,15 @@ void TestImport::csvImport()
// Call decodeCSV function
DBBrowserDB db;
int numfields_read;
QStringList retval = db.decodeCSV(file.fileName(), separator, quote, encoding, -1, &numfields_read);
CSVParser csvparser(true, separator, quote);
file.seek(0);
QTextStream tstream(&file);
csvparser.parse(tstream);
// Check return values
QCOMPARE(retval, result);
QCOMPARE(numfields_read, numfields);
QCOMPARE(csvparser.csv(), result);
QCOMPARE((int)csvparser.columns(), numfields);
}
void TestImport::csvImport_data()
@@ -43,10 +60,12 @@ void TestImport::csvImport_data()
QTest::addColumn<char>("quote");
QTest::addColumn<QString>("encoding");
QTest::addColumn<int>("numfields");
QTest::addColumn<QStringList>("result");
QTest::addColumn<CSVParser::TCSVResult>("result");
QStringList result;
result << "a" << "b" << "c" << "d" << "e" << "f" << "g" << "h" << "i";
CSVParser::TCSVResult result;
result.append(QStringList() << "a" << "b" << "c");
result.append(QStringList() << "d" << "e" << "f");
result.append(QStringList() << "g" << "h" << "i");
QTest::newRow("commas_noquotes") << "a,b,c\nd,e,f\ng,h,i\n"
<< ','
<< (char)0
@@ -79,7 +98,7 @@ void TestImport::csvImport_data()
<< result;
result.clear();
result << "a" << "b" << "c";
result.append(QStringList() << "a" << "b" << "c");
QTest::newRow("oneline") << "a,b,c"
<< ','
<< (char)0
@@ -88,7 +107,8 @@ void TestImport::csvImport_data()
<< result;
result.clear();
result << "a,a\"" << "b" << "c" << "d" << "e" << "\"\"f,f";
result.append(QStringList() << "a,a\"" << "b" << "c");
result.append(QStringList() << "d" << "e" << "\"\"f,f");
QTest::newRow("manyquotes") << "\"a,a\"\"\",\"b\",\"c\"\n\"d\",\"e\",\"\"\"\"\"f,f\"\n"
<< ','
<< '"'
@@ -97,7 +117,7 @@ void TestImport::csvImport_data()
<< result;
result.clear();
result << QString::fromUtf8("\u4E18") << QString::fromUtf8("\u4E26") << QString::fromUtf8("\u4E4B");
result.append(QStringList() << QString::fromUtf8("\u4E18") << QString::fromUtf8("\u4E26") << QString::fromUtf8("\u4E4B"));
QString csv = QString::fromUtf8("\u4E18") + "," + QString::fromUtf8("\u4E26") + "," + QString::fromUtf8("\u4E4B") + "\n";
QTest::newRow("utf8chars") << csv
<< ','

View File

@@ -2,11 +2,19 @@
#define TESTIMPORT_H
#include <QObject>
#include <QApplication>
class TestImport : public QObject
{
Q_OBJECT
public:
TestImport();
~TestImport();
private:
QApplication* app;
private slots:
void csvImport();
void csvImport_data();

View File

@@ -32,6 +32,7 @@ set(SQLB_SRC
../src/sqlitedb.cpp
../src/sqlitetablemodel.cpp
../src/sqlitetypes.cpp
../src/csvparser.cpp
../src/grammar/Sqlite3Lexer.cpp
../src/grammar/Sqlite3Parser.cpp
../src/tests/TestImport.cpp
@@ -42,6 +43,7 @@ set(SQLB_HDR
../src/grammar/sqlite3TokenTypes.hpp
../src/grammar/Sqlite3Lexer.hpp
../src/grammar/Sqlite3Parser.hpp
../src/csvparser.h
../src/sqlitetypes.h)
set(SQLB_MOC_HDR