Don't parse entire CSV file before inserting the first row

We were separating the CSV import into two steps: parsing the CSV file
and inserting the parsed data. This had the advantages of keeping the
parsing code and the database code nicely separated and of giving us
full knowledge of the CSV file before we start inserting the data into
the database. However, it also made it necessary to keep the entire
parser result in RAM, which uses enormous amounts of memory for large
CSV files.

This commit changes the import to parse the first 20 lines and analyse
them. This should give us a good impression of what to expect from the
rest of the file. Based on that information we then parse the file row
by row and insert each row into the database as soon as it is parsed.
This means we only have to keep one row at a time in memory, while more
or less retaining the ability to analyse the file before inserting
data.
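
In rough pseudo-Qt terms, the new flow looks something like the sketch
below. This is a minimal illustration only: analyseColumns and
insertRowIntoDb are hypothetical stand-ins for the real analysis and
database code, and the real parser handles quoting and embedded
newlines instead of naively splitting on commas.

#include <QtGlobal>
#include <QList>
#include <QStringList>
#include <QTextStream>

// Hypothetical helpers standing in for the real analysis and DB code:
static void analyseColumns(const QList<QStringList>& sampleRows)
{
    Q_UNUSED(sampleRows);   // e.g. derive column count and types from the sample
}

static bool insertRowIntoDb(const QStringList& row)
{
    return !row.isEmpty();  // bind `row` to a prepared INSERT statement here
}

static void streamingImport(QTextStream& stream)
{
    // Phase 1: parse only the first 20 rows and analyse them.
    QList<QStringList> sample;
    for(int i = 0; i < 20 && !stream.atEnd(); ++i)
        sample.append(stream.readLine().split(','));
    analyseColumns(sample);

    // Phase 2: rewind and re-parse row by row, inserting each row
    // immediately, so at most one row is held in memory at a time.
    stream.seek(0);
    while(!stream.atEnd())
    {
        if(!insertRowIntoDb(stream.readLine().split(',')))
            break;   // abort the import on a database error
    }
}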

On my system this does seem to increase the runtime for small files,
which take a little longer now (<5%), though these measurements aren't
conclusive. For large files, however, it changes memory consumption
from using all available memory and starting to swap within seconds to
almost no memory consumption at all. And not having to swap speeds
things up a lot.
commit 6ed8080fdb
parent e0ced4a0fa
Author: Martin Kleusberg
Date:   2017-09-12 10:27:07 +02:00

5 changed files with 175 additions and 168 deletions


@@ -28,9 +28,10 @@ inline void addColumn(QStringList& r, QString& field, bool trim)
     }
 }
 
-bool CSVParser::parse(QTextStream& stream, qint64 nMaxRecords)
+CSVParser::ParserResult CSVParser::parse(csvRowFunction insertFunction, QTextStream& stream, qint64 nMaxRecords)
 {
-    m_vCSVData.clear();
+    m_iParsedRows = 0;
+    m_insertFunction = insertFunction;
     ParseStates state = StateNormal;
     QString fieldbuf;
     QStringList record;
@@ -83,14 +84,16 @@ bool CSVParser::parse(QTextStream& stream, qint64 nMaxRecords)
                 {
                     addColumn(record, fieldbuf, m_bTrimFields);
-                    addRow(record);
+                    if(!addRow(record))
+                        return ParserResult::ParserResultError;
                 }
             }
             else if(c == '\n')
             {
                 addColumn(record, fieldbuf, m_bTrimFields);
-                addRow(record);
+                if(!addRow(record))
+                    return ParserResult::ParserResultError;
             }
             else
             {
@@ -127,7 +130,8 @@ bool CSVParser::parse(QTextStream& stream, qint64 nMaxRecords)
                 state = StateNormal;
                 addColumn(record, fieldbuf, m_bTrimFields);
-                addRow(record);
+                if(!addRow(record))
+                    return ParserResult::ParserResultError;
             }
             else if(c == '\r')
             {
@@ -147,7 +151,8 @@ bool CSVParser::parse(QTextStream& stream, qint64 nMaxRecords)
                 {
                     addColumn(record, fieldbuf, m_bTrimFields);
-                    addRow(record);
+                    if(!addRow(record))
+                        return ParserResult::ParserResultError;
                 }
             }
             else
@@ -159,14 +164,14 @@ bool CSVParser::parse(QTextStream& stream, qint64 nMaxRecords)
                 break;
             }
-            if(nMaxRecords != -1 && m_vCSVData.size() >= nMaxRecords)
-                return true;
+            if(nMaxRecords != -1 && m_iParsedRows >= nMaxRecords)
+                return ParserResult::ParserResultSuccess;
         }
 
-        if(m_pCSVProgress && m_vCSVData.size() % 100 == 0)
+        if(m_pCSVProgress && m_iParsedRows % 100 == 0)
         {
             if(!m_pCSVProgress->update(stream.pos()))
-                return false;
+                return ParserResult::ParserResultCancelled;
         }
     }
@@ -174,11 +179,12 @@ bool CSVParser::parse(QTextStream& stream, qint64 nMaxRecords)
     {
         addColumn(record, fieldbuf, m_bTrimFields);
-        addRow(record);
+        if(!addRow(record))
+            return ParserResult::ParserResultError;
     }
 
     if(m_pCSVProgress)
         m_pCSVProgress->end();
 
-    return state == StateNormal;
+    return (state == StateNormal) ? ParserResult::ParserResultSuccess : ParserResult::ParserResultError;
 }
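
For reference, a caller might wire up the new callback roughly as
follows. This is a hedged sketch, not the actual import dialog code:
the exact csvRowFunction signature, the header name and the
bindAndExecuteInsert helper are assumptions; only CSVParser::parse, its
parameters and the ParserResult values appear in the diff above.
Passing -1 as nMaxRecords means "no row limit", per the check in the
diff.

#include <QFile>
#include <QStringList>
#include <QTextStream>
#include <QtGlobal>
#include <functional>
#include "csvparser.h"   // project header (assumed name)

// Assumed callback type; the real typedef in the project may differ.
using csvRowFunction = std::function<bool(size_t rowNum, const QStringList& row)>;

// Hypothetical stand-in for the real prepared-statement INSERT:
static bool bindAndExecuteInsert(size_t rowNum, const QStringList& row)
{
    Q_UNUSED(rowNum);
    return !row.isEmpty();   // return false to abort the parse on a DB error
}

static bool importFile(CSVParser& parser, const QString& fileName)
{
    QFile file(fileName);
    if(!file.open(QIODevice::ReadOnly | QIODevice::Text))
        return false;
    QTextStream stream(&file);

    // Each row is handed to the callback and inserted as soon as it is
    // parsed; a false return surfaces as ParserResultError.
    return parser.parse(bindAndExecuteInsert, stream, -1)
            == CSVParser::ParserResult::ParserResultSuccess;
}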