From f3f70c2f90ea4a44b8601b3563a9d6389692bae4 Mon Sep 17 00:00:00 2001 From: Alex Turbov Date: Fri, 25 Oct 2024 03:10:14 +0400 Subject: [PATCH] StringAlgorithms: Refactor `cmTokenize()` function - Refactor and optimize the loop to make it shorter and faster - Make it push elements into an arbitrary (templated) output iterator - Make it a template on a separator type with the most used defaults - Add a backward compatible signature to return `std::vector` - Add an alternative function `cmTokenizedView()` to return a vector of string views --- Source/CPack/WiX/cmWIXAccessControlList.cxx | 5 +- Source/CPack/cmCPackInnoSetupGenerator.cxx | 2 +- Source/cmComputeLinkDepends.cxx | 16 ++-- Source/cmGeneratorTarget_Options.cxx | 6 +- Source/cmGlobalVisualStudio10Generator.cxx | 7 +- Source/cmGlobalVisualStudio8Generator.cxx | 7 +- ...cmGlobalVisualStudioVersionedGenerator.cxx | 7 +- Source/cmGlobalXCodeGenerator.cxx | 11 ++- Source/cmMakefile.cxx | 10 +- Source/cmSourceGroupCommand.cxx | 42 ++++----- Source/cmStringAlgorithms.cxx | 24 ----- Source/cmStringAlgorithms.h | 91 +++++++++++++++++-- Source/cmake.cxx | 4 +- Tests/CMakeLib/testStringAlgorithms.cxx | 21 ++++- 14 files changed, 157 insertions(+), 96 deletions(-) diff --git a/Source/CPack/WiX/cmWIXAccessControlList.cxx b/Source/CPack/WiX/cmWIXAccessControlList.cxx index 0ebe2f4677..5d0df0641a 100644 --- a/Source/CPack/WiX/cmWIXAccessControlList.cxx +++ b/Source/CPack/WiX/cmWIXAccessControlList.cxx @@ -50,14 +50,13 @@ void cmWIXAccessControlList::CreatePermissionElement(std::string const& entry) user = user_and_domain; } - std::vector permissions = cmTokenize(permission_string, ","); - this->SourceWriter.BeginElement("Permission"); this->SourceWriter.AddAttribute("User", std::string(user)); if (!domain.empty()) { this->SourceWriter.AddAttribute("Domain", std::string(domain)); } - for (std::string const& permission : permissions) { + for (auto permission : + cmTokenizedView(permission_string, ',', cmTokenizerMode::New)) { this->EmitBooleanAttribute(entry, cmTrimWhitespace(permission)); } this->SourceWriter.EndElement("Permission"); diff --git a/Source/CPack/cmCPackInnoSetupGenerator.cxx b/Source/CPack/cmCPackInnoSetupGenerator.cxx index ecf3950838..df99e15da1 100644 --- a/Source/CPack/cmCPackInnoSetupGenerator.cxx +++ b/Source/CPack/cmCPackInnoSetupGenerator.cxx @@ -980,7 +980,7 @@ bool cmCPackInnoSetupGenerator::BuildDownloadedComponentArchive( "Problem running certutil command: " << hashCmd << std::endl); } - *hash = cmTrimWhitespace(cmTokenize(hashOutput, "\n").at(1)); + *hash = cmTrimWhitespace(cmTokenizedView(hashOutput, '\n').at(1)); if (hash->length() != 64) { cmCPackLogger(cmCPackLog::LOG_WARNING, diff --git a/Source/cmComputeLinkDepends.cxx b/Source/cmComputeLinkDepends.cxx index 551a45b885..fde673a99f 100644 --- a/Source/cmComputeLinkDepends.cxx +++ b/Source/cmComputeLinkDepends.cxx @@ -257,7 +257,7 @@ const LinkLibraryFeatureAttributeSet& GetLinkLibraryFeatureAttributes( if (processingOption.match(1) == "LIBRARY_TYPE") { featureAttributes.LibraryTypes.clear(); for (auto const& value : - cmTokenize(processingOption.match(2), ","_s)) { + cmTokenize(processingOption.match(2), ',')) { if (value == "STATIC") { featureAttributes.LibraryTypes.emplace( cmStateEnums::STATIC_LIBRARY); @@ -292,7 +292,8 @@ const LinkLibraryFeatureAttributeSet& GetLinkLibraryFeatureAttributes( } } else if (processingOption.match(1) == "OVERRIDE") { featureAttributes.Override.clear(); - auto values = cmTokenize(processingOption.match(2), ","_s); + std::vector values = + cmTokenize(processingOption.match(2), ','); featureAttributes.Override.insert(values.begin(), values.end()); } } else { @@ -668,13 +669,14 @@ cmComputeLinkDepends::cmComputeLinkDepends(const cmGeneratorTarget* target, *linkLibraryOverride, target->GetLocalGenerator(), config, target, &dag, target, linkLanguage); - auto overrideList = cmTokenize(overrideValue, ","_s); + std::vector overrideList = + cmTokenize(overrideValue, ',', cmTokenizerMode::New); if (overrideList.size() >= 2) { auto const& feature = overrideList.front(); - for_each(overrideList.cbegin() + 1, overrideList.cend(), - [this, &feature](std::string const& item) { - this->LinkLibraryOverride.emplace(item, feature); - }); + std::for_each(overrideList.cbegin() + 1, overrideList.cend(), + [this, &feature](std::string const& item) { + this->LinkLibraryOverride.emplace(item, feature); + }); } } } diff --git a/Source/cmGeneratorTarget_Options.cxx b/Source/cmGeneratorTarget_Options.cxx index d8b3eb32de..8b978c5244 100644 --- a/Source/cmGeneratorTarget_Options.cxx +++ b/Source/cmGeneratorTarget_Options.cxx @@ -565,11 +565,11 @@ std::vector>& cmGeneratorTarget::ResolveLinkerWrapper( cmSystemTools::ParseUnixCommandLine( value.c_str() + LINKER_SHELL.length(), linkerOptions); } else { - linkerOptions = cmTokenize(value.substr(LINKER.length()), ","); + linkerOptions = + cmTokenize(value.substr(LINKER.length()), ',', cmTokenizerMode::New); } - if (linkerOptions.empty() || - (linkerOptions.size() == 1 && linkerOptions.front().empty())) { + if (linkerOptions.empty()) { continue; } diff --git a/Source/cmGlobalVisualStudio10Generator.cxx b/Source/cmGlobalVisualStudio10Generator.cxx index f9abe29c12..3d911bdee7 100644 --- a/Source/cmGlobalVisualStudio10Generator.cxx +++ b/Source/cmGlobalVisualStudio10Generator.cxx @@ -339,12 +339,13 @@ bool cmGlobalVisualStudio10Generator::SetGeneratorToolset( bool cmGlobalVisualStudio10Generator::ParseGeneratorToolset( std::string const& ts, cmMakefile* mf) { - std::vector const fields = cmTokenize(ts, ","); - auto fi = fields.begin(); - if (fi == fields.end()) { + std::vector const fields = + cmTokenize(ts, ',', cmTokenizerMode::New); + if (fields.empty()) { return true; } + auto fi = fields.begin(); // The first field may be the VS platform toolset. if (fi->find('=') == fi->npos) { this->GeneratorToolset = *fi; diff --git a/Source/cmGlobalVisualStudio8Generator.cxx b/Source/cmGlobalVisualStudio8Generator.cxx index b1fba8fe14..d9accad173 100644 --- a/Source/cmGlobalVisualStudio8Generator.cxx +++ b/Source/cmGlobalVisualStudio8Generator.cxx @@ -128,12 +128,13 @@ bool cmGlobalVisualStudio8Generator::ParseGeneratorPlatform( { this->GeneratorPlatform.clear(); - std::vector const fields = cmTokenize(p, ","); - auto fi = fields.begin(); - if (fi == fields.end()) { + std::vector const fields = + cmTokenize(p, ',', cmTokenizerMode::New); + if (fields.empty()) { return true; } + auto fi = fields.begin(); // The first field may be the VS platform. if (fi->find('=') == fi->npos) { this->GeneratorPlatform = *fi; diff --git a/Source/cmGlobalVisualStudioVersionedGenerator.cxx b/Source/cmGlobalVisualStudioVersionedGenerator.cxx index 14460fd765..ba51e3661b 100644 --- a/Source/cmGlobalVisualStudioVersionedGenerator.cxx +++ b/Source/cmGlobalVisualStudioVersionedGenerator.cxx @@ -584,12 +584,13 @@ bool cmGlobalVisualStudioVersionedGenerator::ParseGeneratorInstance( this->GeneratorInstance.clear(); this->GeneratorInstanceVersion.clear(); - std::vector const fields = cmTokenize(is, ","); - auto fi = fields.begin(); - if (fi == fields.end()) { + std::vector const fields = + cmTokenize(is, ',', cmTokenizerMode::New); + if (fields.empty()) { return true; } + auto fi = fields.begin(); // The first field may be the VS instance. if (fi->find('=') == fi->npos) { this->GeneratorInstance = *fi; diff --git a/Source/cmGlobalXCodeGenerator.cxx b/Source/cmGlobalXCodeGenerator.cxx index b38778a012..a45e23f330 100644 --- a/Source/cmGlobalXCodeGenerator.cxx +++ b/Source/cmGlobalXCodeGenerator.cxx @@ -355,12 +355,13 @@ bool cmGlobalXCodeGenerator::SetGeneratorToolset(std::string const& ts, bool cmGlobalXCodeGenerator::ParseGeneratorToolset(std::string const& ts, cmMakefile* mf) { - std::vector const fields = cmTokenize(ts, ","); - auto fi = fields.cbegin(); - if (fi == fields.cend()) { + std::vector const fields = + cmTokenize(ts, ',', cmTokenizerMode::New); + if (fields.empty()) { return true; } + auto fi = fields.cbegin(); // The first field may be the Xcode GCC_VERSION. if (fi->find('=') == fi->npos) { this->GeneratorToolset = *fi; @@ -4457,7 +4458,7 @@ cmXCodeObject* cmGlobalXCodeGenerator::CreateOrGetPBXGroup( if (it != this->TargetGroup.end()) { tgroup = it->second; } else { - std::vector tgt_folders = cmTokenize(target, "/"); + std::vector const tgt_folders = cmTokenize(target, '/'); std::string curr_tgt_folder; for (std::vector::size_type i = 0; i < tgt_folders.size(); i++) { @@ -4490,7 +4491,7 @@ cmXCodeObject* cmGlobalXCodeGenerator::CreateOrGetPBXGroup( // It's a recursive folder structure, let's find the real parent group if (sg->GetFullName() != sg->GetName()) { std::string curr_folder = cmStrCat(target, '/'); - for (auto const& folder : cmTokenize(sg->GetFullName(), "\\")) { + for (auto const& folder : cmTokenize(sg->GetFullName(), '\\')) { curr_folder += folder; auto const i_folder = this->GroupNameMap.find(curr_folder); // Create new folder diff --git a/Source/cmMakefile.cxx b/Source/cmMakefile.cxx index e73a49bb13..c6a831b29f 100644 --- a/Source/cmMakefile.cxx +++ b/Source/cmMakefile.cxx @@ -2387,13 +2387,9 @@ cmSourceGroup* cmMakefile::GetOrCreateSourceGroup( cmSourceGroup* cmMakefile::GetOrCreateSourceGroup(const std::string& name) { - std::string delimiters; - if (cmValue p = this->GetDefinition("SOURCE_GROUP_DELIMITER")) { - delimiters = *p; - } else { - delimiters = "/\\"; - } - return this->GetOrCreateSourceGroup(cmTokenize(name, delimiters)); + auto p = this->GetDefinition("SOURCE_GROUP_DELIMITER"); + return this->GetOrCreateSourceGroup( + cmTokenize(name, p ? cm::string_view(*p) : R"(\/)"_s)); } /** diff --git a/Source/cmSourceGroupCommand.cxx b/Source/cmSourceGroupCommand.cxx index bb928564c9..31f541e8e6 100644 --- a/Source/cmSourceGroupCommand.cxx +++ b/Source/cmSourceGroupCommand.cxx @@ -29,11 +29,6 @@ const std::string kFilesOptionName = "FILES"; const std::string kRegexOptionName = "REGULAR_EXPRESSION"; const std::string kSourceGroupOptionName = ""; -std::vector tokenizePath(const std::string& path) -{ - return cmTokenize(path, "\\/"); -} - std::set getSourceGroupFilesPaths( const std::string& root, const std::vector& files) { @@ -95,31 +90,28 @@ bool addFilesToItsSourceGroups(const std::string& root, cmSourceGroup* sg; for (std::string const& sgFilesPath : sgFilesPaths) { + std::vector tokenizedPath = cmTokenize( + prefix.empty() ? sgFilesPath : cmStrCat(prefix, '/', sgFilesPath), + R"(\/)", cmTokenizerMode::New); - std::vector tokenizedPath; - if (!prefix.empty()) { - tokenizedPath = tokenizePath(cmStrCat(prefix, '/', sgFilesPath)); - } else { - tokenizedPath = tokenizePath(sgFilesPath); + if (tokenizedPath.empty()) { + continue; + } + tokenizedPath.pop_back(); + + if (tokenizedPath.empty()) { + tokenizedPath.emplace_back(); } - if (!tokenizedPath.empty()) { - tokenizedPath.pop_back(); + sg = makefile.GetOrCreateSourceGroup(tokenizedPath); - if (tokenizedPath.empty()) { - tokenizedPath.emplace_back(); - } - - sg = makefile.GetOrCreateSourceGroup(tokenizedPath); - - if (!sg) { - errorMsg = "Could not create source group for file: " + sgFilesPath; - return false; - } - const std::string fullPath = - cmSystemTools::CollapseFullPath(sgFilesPath, root); - sg->AddGroupFile(fullPath); + if (!sg) { + errorMsg = "Could not create source group for file: " + sgFilesPath; + return false; } + const std::string fullPath = + cmSystemTools::CollapseFullPath(sgFilesPath, root); + sg->AddGroupFile(fullPath); } return true; diff --git a/Source/cmStringAlgorithms.cxx b/Source/cmStringAlgorithms.cxx index 332bd8d5ae..724ca75330 100644 --- a/Source/cmStringAlgorithms.cxx +++ b/Source/cmStringAlgorithms.cxx @@ -55,30 +55,6 @@ std::string cmEscapeQuotes(cm::string_view str) return result; } -std::vector cmTokenize(cm::string_view str, cm::string_view sep) -{ - std::vector tokens; - cm::string_view::size_type tokend = 0; - - do { - cm::string_view::size_type tokstart = str.find_first_not_of(sep, tokend); - if (tokstart == cm::string_view::npos) { - break; // no more tokens - } - tokend = str.find_first_of(sep, tokstart); - if (tokend == cm::string_view::npos) { - tokens.emplace_back(str.substr(tokstart)); - } else { - tokens.emplace_back(str.substr(tokstart, tokend - tokstart)); - } - } while (tokend != cm::string_view::npos); - - if (tokens.empty()) { - tokens.emplace_back(); - } - return tokens; -} - namespace { template inline void MakeDigits(cm::string_view& view, char (&digits)[N], diff --git a/Source/cmStringAlgorithms.h b/Source/cmStringAlgorithms.h index 2bd615a65d..72557d52f0 100644 --- a/Source/cmStringAlgorithms.h +++ b/Source/cmStringAlgorithms.h @@ -24,7 +24,7 @@ using cmStringRange = cmRange::const_iterator>; /** Returns length of a literal string. */ template -constexpr size_t cmStrLen(const char (&/*str*/)[N]) +constexpr size_t cmStrLen(const char (&)[N]) { return N - 1; } @@ -91,12 +91,12 @@ std::string cmJoinStrings(Range const& rng, cm::string_view separator, } std::string result; - result.reserve( - std::accumulate(std::begin(rng), std::end(rng), - initial.size() + (rng.size() - 1) * separator.size(), - [](std::size_t sum, const std::string& item) { - return sum + item.size(); - })); + result.reserve(std::accumulate( + std::begin(rng), std::end(rng), + initial.size() + (rng.size() - 1) * separator.size(), + [](std::size_t sum, typename Range::value_type const& item) { + return sum + item.size(); + })); result.append(std::begin(initial), std::end(initial)); auto begin = std::begin(rng); @@ -122,8 +122,81 @@ std::string cmJoin(std::vector const& rng, std::string cmJoin(cmStringRange const& rng, cm::string_view separator, cm::string_view initial = {}); -/** Extract tokens that are separated by any of the characters in @a sep. */ -std::vector cmTokenize(cm::string_view str, cm::string_view sep); +enum class cmTokenizerMode +{ + /// A backward-compatible behavior when in the case of no + /// tokens have found in an input text it'll return one empty + /// token in the result container (vector). + Legacy, + /// The new behavior is to return an empty vector. + New +}; + +/** + * \brief A generic version of a tokenizer. + * + * Extract tokens from the input string separated by any + * of the characters in `sep` and assign them to the + * given output iterator. + * + * The `mode` parameter defines the behavior in the case when + * no tokens have found in the input text. + * + */ +template +void cmTokenize(OutIt outIt, cm::string_view str, Sep sep, + cmTokenizerMode mode) +{ + auto hasTokens = false; + // clang-format off + for (auto start = str.find_first_not_of(sep) + , end = str.find_first_of(sep, start) + ; start != cm::string_view::npos + ; start = str.find_first_not_of(sep, end) + , end = str.find_first_of(sep, start) + , hasTokens = true + ) { + *outIt++ = StringT{ str.substr(start, end - start) }; + } + // clang-format on + if (!hasTokens && mode == cmTokenizerMode::Legacy) { + *outIt = {}; + } +} + +/** + * \brief Extract tokens that are separated by any of the + * characters in `sep`. + * + * Backward compatible signature. + * + * \return A vector of strings. + */ +template +std::vector cmTokenize( + cm::string_view str, Sep sep, cmTokenizerMode mode = cmTokenizerMode::Legacy) +{ + using StringType = std::string; + std::vector tokens; + cmTokenize(std::back_inserter(tokens), str, sep, mode); + return tokens; +} + +/** + * \brief Extract tokens that are separated by any of the + * characters in `sep`. + * + * \return A vector of string views. + */ +template +std::vector cmTokenizedView( + cm::string_view str, Sep sep, cmTokenizerMode mode = cmTokenizerMode::Legacy) +{ + using StringType = cm::string_view; + std::vector tokens; + cmTokenize(std::back_inserter(tokens), str, sep, mode); + return tokens; +} /** Concatenate string pieces into a single string. */ std::string cmCatViews( diff --git a/Source/cmake.cxx b/Source/cmake.cxx index 9acc26bb9b..2817819d57 100644 --- a/Source/cmake.cxx +++ b/Source/cmake.cxx @@ -1131,7 +1131,7 @@ void cmake::SetArgs(const std::vector& args) "--debug-find-pkg", "Provide a package argument for --debug-find-pkg", CommandArgument::Values::One, CommandArgument::RequiresSeparator::Yes, [](std::string const& value, cmake* state) -> bool { - std::vector find_pkgs(cmTokenize(value, ",")); + std::vector find_pkgs(cmTokenize(value, ',')); std::cout << "Running with debug output on for the 'find' commands " "for package(s)"; for (auto const& v : find_pkgs) { @@ -1145,7 +1145,7 @@ void cmake::SetArgs(const std::vector& args) "--debug-find-var", CommandArgument::Values::One, CommandArgument::RequiresSeparator::Yes, [](std::string const& value, cmake* state) -> bool { - std::vector find_vars(cmTokenize(value, ",")); + std::vector find_vars(cmTokenize(value, ',')); std::cout << "Running with debug output on for the variable(s)"; for (auto const& v : find_vars) { std::cout << ' ' << v; diff --git a/Tests/CMakeLib/testStringAlgorithms.cxx b/Tests/CMakeLib/testStringAlgorithms.cxx index 78442ba3fe..4d418b44fc 100644 --- a/Tests/CMakeLib/testStringAlgorithms.cxx +++ b/Tests/CMakeLib/testStringAlgorithms.cxx @@ -10,6 +10,7 @@ #include #include +#include #include "cmStringAlgorithms.h" @@ -103,7 +104,9 @@ int testStringAlgorithms(int /*unused*/, char* /*unused*/[]) { typedef std::vector VT; assert_ok(cmTokenize("", ";") == VT{ "" }, "cmTokenize empty"); - assert_ok(cmTokenize(";", ";") == VT{ "" }, "cmTokenize sep"); + assert_ok(cmTokenize(";", ";") == VT{ "" }, "cmTokenize sep (char*)"); + assert_ok(cmTokenize(";", ";"_s) == VT{ "" }, + "cmTokenize sep (string_view)"); assert_ok(cmTokenize("abc", ";") == VT{ "abc" }, "cmTokenize item"); assert_ok(cmTokenize("abc;", ";") == VT{ "abc" }, "cmTokenize item sep"); assert_ok(cmTokenize(";abc", ";") == VT{ "abc" }, "cmTokenize sep item"); @@ -112,6 +115,22 @@ int testStringAlgorithms(int /*unused*/, char* /*unused*/[]) assert_ok(cmTokenize("a1;a2;a3;a4", ";") == VT{ "a1", "a2", "a3", "a4" }, "cmTokenize multiple items"); } + { + typedef std::vector VT; + assert_ok(cmTokenizedView("", ';') == VT{ "" }, "cmTokenizedView empty"); + assert_ok(cmTokenizedView(";", ';') == VT{ "" }, "cmTokenizedView sep"); + assert_ok(cmTokenizedView("abc", ';') == VT{ "abc" }, + "cmTokenizedView item"); + assert_ok(cmTokenizedView("abc;", ';') == VT{ "abc" }, + "cmTokenizedView item sep"); + assert_ok(cmTokenizedView(";abc", ';') == VT{ "abc" }, + "cmTokenizedView sep item"); + assert_ok(cmTokenizedView("abc;;efg", ';') == VT{ "abc", "efg" }, + "cmTokenizedView item sep sep item"); + assert_ok(cmTokenizedView("a1;a2;a3;a4", ';') == + VT{ "a1", "a2", "a3", "a4" }, + "cmTokenizedView multiple items"); + } // ---------------------------------------------------------------------- // Test cmStrCat