string: Allow zero-length matches in all REGEX subcommands

The semantics mimic other languages like Python, Java, JS, etc.
To advance past a zero-length match, the search algorithm first
tries to find a non-empty match at the same position through an
alternative branch. If that fails, it force-advances by 1 character.

Fixes: #13790, #13792, #18690, #26629
This commit is contained in:
Nikita Nemkin
2025-02-14 22:13:19 +05:00
parent ca65fa9a7f
commit 8d455809b0
8 changed files with 181 additions and 46 deletions

View File

@@ -122,6 +122,9 @@ Search and Replace With Regular Expressions
string instead of the beginning of each repeated search.
See policy :policy:`CMP0186`.
Zero-length matches are allowed in ``MATCH``, ``MATCHALL`` and ``REPLACE``.
Previously, they triggered an error.
The replacement expression may contain references to subexpressions that
didn't match anything. Previously, such references triggered an error.

View File

@@ -6,3 +6,5 @@ regex-fixes
* References to unmatched groups are allowed; they are replaced with empty
strings.
* Zero-length matches are always allowed.

View File

@@ -251,15 +251,7 @@ bool RegexMatch(std::vector<std::string> const& args,
std::string output;
if (re.find(input)) {
status.GetMakefile().StoreMatches(re);
std::string::size_type l = re.start();
std::string::size_type r = re.end();
if (r - l == 0) {
std::string e = "sub-command REGEX, mode MATCH regex \"" + regex +
"\" matched an empty string.";
status.SetError(e);
return false;
}
output = input.substr(l, r - l);
output = re.match();
}
// Store the output in the provided variable.
@@ -298,22 +290,24 @@ bool RegexMatchAll(std::vector<std::string> const& args,
// Scan through the input for all matches.
std::string output;
std::string::size_type base = 0;
while (re.find(input, base, optAnchor)) {
unsigned optNonEmpty = 0;
while (re.find(input, base, optAnchor | optNonEmpty)) {
status.GetMakefile().ClearMatches();
status.GetMakefile().StoreMatches(re);
std::string::size_type l = re.start();
std::string::size_type r = re.end();
if (r - l == 0) {
std::string e = "sub-command REGEX, mode MATCHALL regex \"" + regex +
"\" matched an empty string.";
status.SetError(e);
return false;
}
if (!output.empty()) {
if (!output.empty() || optNonEmpty) {
output += ";";
}
output += re.match();
base = r;
base = re.end();
if (re.start() == input.length()) {
break;
}
if (re.start() == re.end()) {
optNonEmpty = cmsys::RegularExpression::NONEMPTY_AT_OFFSET;
} else {
optNonEmpty = 0;
}
}
// Store the output in the provided variable.

View File

@@ -33,25 +33,17 @@ bool cmStringReplaceHelper::Replace(std::string const& input,
}
// Scan through the input for all matches.
auto& re = this->RegularExpression;
std::string::size_type base = 0;
while (this->RegularExpression.find(input, base, optAnchor)) {
unsigned optNonEmpty = 0;
while (re.find(input, base, optAnchor | optNonEmpty)) {
if (this->Makefile) {
this->Makefile->ClearMatches();
this->Makefile->StoreMatches(this->RegularExpression);
this->Makefile->StoreMatches(re);
}
auto l2 = this->RegularExpression.start();
auto r = this->RegularExpression.end();
// Concatenate the part of the input that was not matched.
output += input.substr(base, l2 - base);
// Make sure the match had some text.
if (r - l2 == 0) {
std::ostringstream error;
error << "regex \"" << this->RegExString << "\" matched an empty string";
this->ErrorString = error.str();
return false;
}
output += input.substr(base, re.start() - base);
// Concatenate the replacement for the match.
for (auto const& replacement : this->Replacements) {
@@ -61,7 +53,7 @@ bool cmStringReplaceHelper::Replace(std::string const& input,
} else {
// Replace with part of the match.
auto n = replacement.Number;
if (n > this->RegularExpression.num_groups()) {
if (n > re.num_groups()) {
std::ostringstream error;
error << "replace expression \"" << this->ReplaceExpression
<< "\" contains an out-of-range escape for regex \""
@@ -69,12 +61,21 @@ bool cmStringReplaceHelper::Replace(std::string const& input,
this->ErrorString = error.str();
return false;
}
output += this->RegularExpression.match(n);
output += re.match(n);
}
}
// Move past the match.
base = r;
base = re.end();
if (re.start() == input.length()) {
break;
}
if (re.start() == re.end()) {
optNonEmpty = cmsys::RegularExpression::NONEMPTY_AT_OFFSET;
} else {
optNonEmpty = 0;
}
}
// Concatenate the text after the last match.

View File

@@ -84,7 +84,7 @@ check_cmake_test(String
# Execute each test listed in StringTestScript.cmake:
#
set(scriptname "@CMAKE_CURRENT_SOURCE_DIR@/StringTestScript.cmake")
set(number_of_tests_expected 73)
set(number_of_tests_expected 70)
include("@CMAKE_CURRENT_SOURCE_DIR@/ExecuteScriptTests.cmake")
execute_all_script_tests(${scriptname} number_of_tests_executed)

View File

@@ -73,9 +73,6 @@ elseif(testname STREQUAL regex_match_multiple_inputs) # pass
elseif(testname STREQUAL regex_match_bad_regex) # fail
string(REGEX MATCH "(.*" v input)
elseif(testname STREQUAL regex_match_empty_string) # fail
string(REGEX MATCH "x*" v "")
elseif(testname STREQUAL regex_match_no_match) # pass
string(REGEX MATCH "xyz" v "abc")
message(STATUS "v='${v}'")
@@ -87,9 +84,6 @@ elseif(testname STREQUAL regex_matchall_multiple_inputs) # pass
elseif(testname STREQUAL regex_matchall_bad_regex) # fail
string(REGEX MATCHALL "(.*" v input)
elseif(testname STREQUAL regex_matchall_empty_string) # fail
string(REGEX MATCHALL "x*" v "")
elseif(testname STREQUAL regex_replace_ends_with_backslash) # fail
string(REGEX REPLACE "input" "output\\" v input1 input2 input3 input4)
@@ -107,9 +101,6 @@ elseif(testname STREQUAL regex_replace_has_bogus_escape) # fail
elseif(testname STREQUAL regex_replace_bad_regex) # fail
string(REGEX REPLACE "this (.*" "with that" v input)
elseif(testname STREQUAL regex_replace_empty_string) # fail
string(REGEX REPLACE "x*" "that" v "")
elseif(testname STREQUAL regex_replace_index_too_small) # fail
string(REGEX REPLACE "^this (.*)$" "with \\1 \\-1" v "this input")

View File

@@ -0,0 +1,143 @@
# Opt in to the NEW behavior of CMP0186 for this script. The expectations for
# patterns containing "^" (a single match at the start of the input) appear to
# depend on it -- NOTE(review): confirm against the CMP0186 policy text.
cmake_policy(SET CMP0186 NEW)
# check_output(<name> <expected>)
#
# Assert that the variable called <name> holds exactly the string <expected>.
#   name     - the NAME of the variable to inspect (not its value).
#   expected - the string the variable must be equal to.
# On mismatch the script aborts with a FATAL_ERROR that reports the variable
# name, its actual value and the expected value.
function(check_output name expected)
# Double dereference: ${name} yields the variable name, ${...} its value.
set(output "${${name}}")
# The operands are deliberately unquoted so that if() resolves ``output`` and
# ``expected`` as variable names and compares their values.
if(NOT output STREQUAL expected)
message(FATAL_ERROR "\"string(REGEX)\" set ${name} to \"${output}\", expected \"${expected}\"")
endif()
endfunction()
# Zero-length matches in REGEX MATCH
# Each case runs one string(REGEX MATCH) and checks the stored result; an
# empty result is the expected outcome for the zero-length matches here.
string(REGEX MATCH "" out "")
check_output(out "")
string(REGEX MATCH "" out "a")
check_output(out "")
string(REGEX MATCH "a*" out "")
check_output(out "")
string(REGEX MATCH "a*" out "a")
check_output(out "a")
string(REGEX MATCH "a*" out "b")
check_output(out "")
# The expected result is empty even though a later "a" exists in the input.
string(REGEX MATCH "a*" out "ba")
check_output(out "")
# Zero-length matches in REGEX MATCHALL
# Matches are joined with ";", so N matches produce N-1 separators: ";;" stands
# for three empty matches, while a single empty match yields "".
string(REGEX MATCHALL "" out "")
check_output(out "")
string(REGEX MATCHALL "" out "ab")
check_output(out ";;")
# Anchors: "^" is expected to match once, at the start of the input.
string(REGEX MATCHALL "^" out "ab")
check_output(out "")
string(REGEX MATCHALL "(^|,)" out "a,b")
check_output(out ";,")
string(REGEX MATCHALL "(,|^)" out "a,b")
check_output(out ";,")
string(REGEX MATCHALL "(^|)" out "")
check_output(out "")
string(REGEX MATCHALL "(^|)" out "ab")
check_output(out ";;")
string(REGEX MATCHALL "a|^" out "ab")
check_output(out "a")
# Anchors: "$" is expected to match once, at the end of the input.
string(REGEX MATCHALL "$" out "ab")
check_output(out "")
string(REGEX MATCHALL "($|,)" out "a,b")
check_output(out ",;")
string(REGEX MATCHALL "(,|$)" out "a,b")
check_output(out ",;")
string(REGEX MATCHALL "(|$)" out "")
check_output(out "")
string(REGEX MATCHALL "(|$)" out "ab")
check_output(out ";;")
# Alternation order matters: with the empty branch listed first, one extra
# empty match is reported before "b".
string(REGEX MATCHALL "(b|)" out "abc")
check_output(out ";b;;")
string(REGEX MATCHALL "(|b)" out "abc")
check_output(out ";;b;;")
# A non-empty match that consumes the whole input is followed by one more
# empty match at the end, hence the trailing ";".
string(REGEX MATCHALL "a*" out "aaa")
check_output(out "aaa;")
string(REGEX MATCHALL "(a)?(b)?" out "")
check_output(out "")
string(REGEX MATCHALL "(a)?(b)?" out "abba")
check_output(out "ab;b;a;")
# Zero-length matches in REGEX REPLACE
# Every match, including an empty one, is replaced with the replacement text;
# unmatched input is copied through unchanged.
string(REGEX REPLACE "" "" out "")
check_output(out "")
string(REGEX REPLACE "" "x" out "")
check_output(out "x")
# An empty pattern matches before each character and once at the end.
string(REGEX REPLACE "" "x" out "ab")
check_output(out "xaxbx")
# Anchors: "^" is expected to match once, at the start of the input.
string(REGEX REPLACE "^" "x" out "ab")
check_output(out "xab")
string(REGEX REPLACE "(^|,)" "x" out "a,b")
check_output(out "xaxb")
string(REGEX REPLACE "(,|^)" "x" out "a,b")
check_output(out "xaxb")
string(REGEX REPLACE "(^|)" "x" out "")
check_output(out "x")
string(REGEX REPLACE "(^|)" "x" out "ab")
check_output(out "xaxbx")
string(REGEX REPLACE "a|^" "x" out "ab")
check_output(out "xb")
# Anchors: "$" is expected to match once, at the end of the input.
string(REGEX REPLACE "$" "x" out "ab")
check_output(out "abx")
string(REGEX REPLACE "($|,)" "x" out "a,b")
check_output(out "axbx")
string(REGEX REPLACE "(,|$)" "x" out "a,b")
check_output(out "axbx")
string(REGEX REPLACE "(|$)" "x" out "")
check_output(out "x")
string(REGEX REPLACE "(|$)" "x" out "ab")
check_output(out "xaxbx")
# Alternation order matters: with the empty branch listed first, one extra
# replacement is emitted before the one for "b".
string(REGEX REPLACE "(b|)" "x" out "abc")
check_output(out "xaxxcx")
string(REGEX REPLACE "(|b)" "x" out "abc")
check_output(out "xaxxxcx")
# The whole-input match is replaced once, then the empty match at the end
# of the input is replaced once more.
string(REGEX REPLACE "a*" "x" out "aaa")
check_output(out "xx")
string(REGEX REPLACE "(a)?(b)?" "x" out "")
check_output(out "x")
string(REGEX REPLACE "(a)?(b)?" "x" out "abba")
check_output(out "xxxx")

View File

@@ -35,6 +35,7 @@ run_cmake(UuidBadType)
run_cmake(RegexClear)
run_cmake(RegexMultiMatchClear)
run_cmake(RegexEmptyMatch)
run_cmake(CMP0186)
run_cmake(UTF-16BE)