From e2443d685fae620f6e5ce21eb25c3356006f8527 Mon Sep 17 00:00:00 2001 From: mgrojo Date: Sat, 9 Feb 2019 19:03:02 +0100 Subject: [PATCH] Fix text detection check Truncating the text at byte boundaries for the quick test was breaking the text detection for Russian and probably for any script whose characters are encoded in more than one byte. The problem probably occurred when a multibyte character was truncated at the 512-byte boundary. This is fairly unlikely in Latin-based languages like German or Spanish, most of whose characters take a single byte, but very likely in other scripts, like Cyrillic, whose characters are encoded in more than one byte. The new approach is based on QTextCodec detecting invalid characters using the current encoding, which seems immune to the truncation problem. According to callgrind, it also performs better, probably because it does not involve a memory comparison. See issue #1731 --- src/Data.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/Data.cpp b/src/Data.cpp index fd4d2af2..72ac8e52 100644 --- a/src/Data.cpp +++ b/src/Data.cpp @@ -1,6 +1,7 @@ #include "Data.h" #include +#include // Note that these aren't all possible BOMs. But they are probably the most common ones. // The size is needed at least for the ones with character zero in them. @@ -17,14 +18,11 @@ bool isTextOnly(QByteArray data, const QString& encoding, bool quickTest) return true; // Truncate to the first couple of bytes for quick testing - if(quickTest) - data = data.left(512); - - // Convert to Unicode if necessary - data = decodeString(data, encoding); - - // Perform check - return QString(data).toUtf8() == data; + int testSize = quickTest? std::min(512, data.size()) : data.size(); + QTextCodec::ConverterState state; + QTextCodec *codec = encoding.isEmpty()? 
QTextCodec::codecForName("UTF-8") : QTextCodec::codecForName(encoding.toUtf8()); + const QString text = codec->toUnicode(data.constData(), testSize, &state); + return state.invalidChars <= 0; } bool startsWithBom(const QByteArray& data)