From c2eedbc20dea4368d96ce7a8d1d6149011cd42c3 Mon Sep 17 00:00:00 2001
From: mgrojo <mgrojo@gmail.com>
Date: Fri, 12 Apr 2019 20:10:03 +0200
Subject: [PATCH] Restore full conversion check for non truncated strings

The new text detection check implemented in
e2443d685fae620f6e5ce21eb25c3356006f8527 was giving problems with some
short byte combinations that could look like a truncated valid UTF-8
sequence, so it is better to only do this test when the string has been
truncated. In this way, short byte sequences should be correctly checked.

See issue #1846
---
 src/Data.cpp | 35 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/src/Data.cpp b/src/Data.cpp
index 8c1067d0..f123cdda 100644
--- a/src/Data.cpp
+++ b/src/Data.cpp
@@ -17,19 +17,30 @@ bool isTextOnly(QByteArray data, const QString& encoding, bool quickTest)
     if(startsWithBom(data))
         return true;
 
-    // We can assume that the default encoding (UTF-8) and all the ISO-8859
-    // cannot contain character zero.
-    // This has to be checked explicitly because toUnicode() is using zero as
-    // a terminator for these encodings.
-    if((encoding.isEmpty() || encoding.startsWith("ISO-8859")) && data.contains('\0'))
-        return false;
-
-    // Truncate to the first couple of bytes for quick testing
+    // Truncate to the first few bytes for quick testing
     int testSize = quickTest? std::min(512, data.size()) : data.size();
-    QTextCodec::ConverterState state;
-    QTextCodec *codec = encoding.isEmpty()? QTextCodec::codecForName("UTF-8") : QTextCodec::codecForName(encoding.toUtf8());
-    const QString text = codec->toUnicode(data.constData(), testSize, &state);
-    return state.invalidChars <= 0;
+
+    // If the quick test has been requested and we have to truncate the string, we have to use
+    // an approach where truncated multibyte characters are not interpreted as invalid characters.
+    if(quickTest && data.size() > testSize) {
+
+        // We can assume that the default encoding (UTF-8) and all the ISO-8859
+        // cannot contain character zero.
+        // This has to be checked explicitly because toUnicode() is using zero as
+        // a terminator for these encodings.
+        if((encoding.isEmpty() || encoding.startsWith("ISO-8859")) && data.contains('\0'))
+            return false;
+
+        QTextCodec::ConverterState state;
+        QTextCodec *codec = encoding.isEmpty()? QTextCodec::codecForName("UTF-8") : QTextCodec::codecForName(encoding.toUtf8());
+        const QString text = codec->toUnicode(data.constData(), testSize, &state);
+        return state.invalidChars == 0;
+    } else {
+        // Convert to Unicode if necessary
+        data = decodeString(data, encoding);
+        // Perform check
+        return QString(data).toUtf8() == data;
+    }
 }
 
 bool startsWithBom(const QByteArray& data)