From e2443d685fae620f6e5ce21eb25c3356006f8527 Mon Sep 17 00:00:00 2001
From: mgrojo <mgrojo@gmail.com>
Date: Sat, 9 Feb 2019 19:03:02 +0100
Subject: [PATCH] Fix text detection check

Truncating the text at byte boundaries for the quick test was breaking
the text detection for Russian and probably any script encoded in more than
one byte per character. The problem probably occurred when a multibyte
character was truncated at the 512-byte boundary. This is somewhat unlikely
in Latin-based languages like German or Spanish, most of whose characters
take a single byte, but very likely in other scripts, like Cyrillic, whose
characters are encoded in more than one.

The new approach is based on QTextCodec finding invalid characters using
the current encoding, which seems immune to the truncation problem.
According to callgrind, it also has better performance, probably because it
does not involve memory comparison.

See issue #1731
---
 src/Data.cpp | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)
diff --git a/src/Data.cpp b/src/Data.cpp
index fd4d2af2..72ac8e52 100644
--- a/src/Data.cpp
+++ b/src/Data.cpp
@@ -1,6 +1,7 @@
 #include "Data.h"
 
 #include <QTextCodec>
+#include <algorithm>
 
 // Note that these aren't all possible BOMs. But they are probably the most common ones.
 // The size is needed at least for the ones with character zero in them.
@@ -17,14 +18,11 @@ bool isTextOnly(QByteArray data, const QString& encoding, bool quickTest)
         return true;
 
     // Truncate to the first couple of bytes for quick testing
-    if(quickTest)
-        data = data.left(512);
-
-    // Convert to Unicode if necessary
-    data = decodeString(data, encoding);
-
-    // Perform check
-    return QString(data).toUtf8() == data;
+    int testSize = quickTest? std::min(512, data.size()) : data.size();
+    QTextCodec::ConverterState state;
+    QTextCodec *codec = encoding.isEmpty()? QTextCodec::codecForName("UTF-8") : QTextCodec::codecForName(encoding.toUtf8());
+    const QString text = codec->toUnicode(data.constData(), testSize, &state);
+    return state.invalidChars <= 0;
 }
 
 bool startsWithBom(const QByteArray& data)