From c2eedbc20dea4368d96ce7a8d1d6149011cd42c3 Mon Sep 17 00:00:00 2001 From: mgrojo Date: Fri, 12 Apr 2019 20:10:03 +0200 Subject: [PATCH] Restore full conversion check for non truncated strings The new text detection check implemented in e2443d685fae620f6e5ce21eb25c3356006f8527 was giving problems with some short byte combinations that could look like a truncated valid UTF-8 sequence, so it is better to only do this test when the string has been truncated. In this way, short byte sequences should be correctly checked. See issue #1846 --- src/Data.cpp | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/src/Data.cpp b/src/Data.cpp index 8c1067d0..f123cdda 100644 --- a/src/Data.cpp +++ b/src/Data.cpp @@ -17,19 +17,30 @@ bool isTextOnly(QByteArray data, const QString& encoding, bool quickTest) if(startsWithBom(data)) return true; - // We can assume that the default encoding (UTF-8) and all the ISO-8859 - // cannot contain character zero. - // This has to be checked explicitly because toUnicode() is using zero as - // a terminator for these encodings. - if((encoding.isEmpty() || encoding.startsWith("ISO-8859")) && data.contains('\0')) - return false; - - // Truncate to the first couple of bytes for quick testing + // Truncate to the first few bytes for quick testing int testSize = quickTest? std::min(512, data.size()) : data.size(); - QTextCodec::ConverterState state; - QTextCodec *codec = encoding.isEmpty()? QTextCodec::codecForName("UTF-8") : QTextCodec::codecForName(encoding.toUtf8()); - const QString text = codec->toUnicode(data.constData(), testSize, &state); - return state.invalidChars <= 0; + + // If the quick test has been requested and we have to truncate the string, we have to use + // an approach where truncated multibyte characters are not interpreted as invalid characters.
+ if(quickTest && data.size() > testSize) { + + // We can assume that the default encoding (UTF-8) and all the ISO-8859 + // cannot contain character zero. + // This has to be checked explicitly because toUnicode() is using zero as + // a terminator for these encodings. + if((encoding.isEmpty() || encoding.startsWith("ISO-8859")) && data.contains('\0')) + return false; + + QTextCodec::ConverterState state; + QTextCodec *codec = encoding.isEmpty()? QTextCodec::codecForName("UTF-8") : QTextCodec::codecForName(encoding.toUtf8()); + const QString text = codec->toUnicode(data.constData(), testSize, &state); + return state.invalidChars == 0; + } else { + // Convert to Unicode if necessary + data = decodeString(data, encoding); + // Perform check + return QString(data).toUtf8() == data; + } } bool startsWithBom(const QByteArray& data)