From ca65fa9a7f3d72d62ce980ee16b3b20ed7bdedef Mon Sep 17 00:00:00 2001 From: Nikita Nemkin Date: Fri, 14 Feb 2025 22:13:18 +0500 Subject: string: Allow references to unmatched groups in REGEX REPLACE References to unmatched groups will be replaced with empty strings. Issue: #26629 Fixes: #19012 --- Help/command/string.rst | 3 +++ Help/release/dev/regex-fixes.rst | 3 +++ Source/cmStringReplaceHelper.cxx | 6 ++---- Tests/CMakeTests/StringTest.cmake.in | 2 +- Tests/CMakeTests/StringTestScript.cmake | 4 ++++ 5 files changed, 13 insertions(+), 5 deletions(-) diff --git a/Help/command/string.rst b/Help/command/string.rst index c510ff4..b125e4b 100644 --- a/Help/command/string.rst +++ b/Help/command/string.rst @@ -122,6 +122,9 @@ Search and Replace With Regular Expressions string instead of the beginning of each repeated search. See policy :policy:`CMP0186`. + The replacement expression may contain references to subexpressions that + didn't match anything. Previously, such references triggered an error. + .. _`Regex Specification`: Regex Specification diff --git a/Help/release/dev/regex-fixes.rst b/Help/release/dev/regex-fixes.rst index e979c03..67f80ed 100644 --- a/Help/release/dev/regex-fixes.rst +++ b/Help/release/dev/regex-fixes.rst @@ -3,3 +3,6 @@ regex-fixes * Regular expressions match the ``^`` anchor at most once in repeated searches, at the start of the input. See policy :policy:`CMP0186`. + +* References to unmatched groups are allowed, they are replaced with empty + strings. diff --git a/Source/cmStringReplaceHelper.cxx b/Source/cmStringReplaceHelper.cxx index 5cd159e..e909b05 100644 --- a/Source/cmStringReplaceHelper.cxx +++ b/Source/cmStringReplaceHelper.cxx @@ -61,10 +61,7 @@ bool cmStringReplaceHelper::Replace(std::string const& input, } else { // Replace with part of the match. 
auto n = replacement.Number; - auto start = this->RegularExpression.start(n); - if (start != std::string::npos) { - output += this->RegularExpression.match(n); - } else { + if (n > this->RegularExpression.num_groups()) { std::ostringstream error; error << "replace expression \"" << this->ReplaceExpression << "\" contains an out-of-range escape for regex \"" @@ -72,6 +69,7 @@ bool cmStringReplaceHelper::Replace(std::string const& input, this->ErrorString = error.str(); return false; } + output += this->RegularExpression.match(n); } } diff --git a/Tests/CMakeTests/StringTest.cmake.in b/Tests/CMakeTests/StringTest.cmake.in index 6a94cc5..8746551 100644 --- a/Tests/CMakeTests/StringTest.cmake.in +++ b/Tests/CMakeTests/StringTest.cmake.in @@ -84,7 +84,7 @@ check_cmake_test(String # Execute each test listed in StringTestScript.cmake: # set(scriptname "@CMAKE_CURRENT_SOURCE_DIR@/StringTestScript.cmake") -set(number_of_tests_expected 72) +set(number_of_tests_expected 73) include("@CMAKE_CURRENT_SOURCE_DIR@/ExecuteScriptTests.cmake") execute_all_script_tests(${scriptname} number_of_tests_executed) diff --git a/Tests/CMakeTests/StringTestScript.cmake b/Tests/CMakeTests/StringTestScript.cmake index 7c45857..dc7b8ae 100644 --- a/Tests/CMakeTests/StringTestScript.cmake +++ b/Tests/CMakeTests/StringTestScript.cmake @@ -116,6 +116,10 @@ elseif(testname STREQUAL regex_replace_index_too_small) # fail elseif(testname STREQUAL regex_replace_index_too_large) # fail string(REGEX REPLACE "^this (.*)$" "with \\1 \\2" v "this input") +elseif(testname STREQUAL regex_replace_index_no_match) # pass + string(REGEX REPLACE "^(this (.*)|(that .*))$" "with \\1 \\2 \\3" v "this input") + message(STATUS "v='${v}'") + elseif(testname STREQUAL compare_no_mode) # fail string(COMPARE) -- cgit v0.12 From 8d455809b0836683466f4a7bad3579574b7217b3 Mon Sep 17 00:00:00 2001 From: Nikita Nemkin Date: Fri, 14 Feb 2025 22:13:19 +0500 Subject: string: Allow zero-length matches in all REGEX subcommands The 
semantics mimic other languages like Python, Java, JS, etc. To advance past a zero-length match, the search algorithm first tries to find a non-zero alternative branch. If that fails, it force-advances by 1 character. Fixes: #13790, #13792, #18690, #26629 --- Help/command/string.rst | 3 + Help/release/dev/regex-fixes.rst | 2 + Source/cmStringCommand.cxx | 34 +++---- Source/cmStringReplaceHelper.cxx | 33 +++---- Tests/CMakeTests/StringTest.cmake.in | 2 +- Tests/CMakeTests/StringTestScript.cmake | 9 -- Tests/RunCMake/string/RegexEmptyMatch.cmake | 143 ++++++++++++++++++++++++++++ Tests/RunCMake/string/RunCMakeTest.cmake | 1 + 8 files changed, 181 insertions(+), 46 deletions(-) create mode 100644 Tests/RunCMake/string/RegexEmptyMatch.cmake diff --git a/Help/command/string.rst b/Help/command/string.rst index b125e4b..d86efc5 100644 --- a/Help/command/string.rst +++ b/Help/command/string.rst @@ -122,6 +122,9 @@ Search and Replace With Regular Expressions string instead of the beginning of each repeated search. See policy :policy:`CMP0186`. + Zero-length matches are allowed in ``MATCHALL`` and ``REPLACE``. + Previously, they triggered an error. + The replacement expression may contain references to subexpressions that didn't match anything. Previously, such references triggered an error. diff --git a/Help/release/dev/regex-fixes.rst b/Help/release/dev/regex-fixes.rst index 67f80ed..82d1fad 100644 --- a/Help/release/dev/regex-fixes.rst +++ b/Help/release/dev/regex-fixes.rst @@ -6,3 +6,5 @@ regex-fixes * References to unmatched groups are allowed, they are replaced with empty strings. + +* Zero-length matches are always allowed. 
diff --git a/Source/cmStringCommand.cxx b/Source/cmStringCommand.cxx index f923ba6..19ce9f6 100644 --- a/Source/cmStringCommand.cxx +++ b/Source/cmStringCommand.cxx @@ -251,15 +251,7 @@ bool RegexMatch(std::vector<std::string> const& args, std::string output; if (re.find(input)) { status.GetMakefile().StoreMatches(re); - std::string::size_type l = re.start(); - std::string::size_type r = re.end(); - if (r - l == 0) { - std::string e = "sub-command REGEX, mode MATCH regex \"" + regex + - "\" matched an empty string."; - status.SetError(e); - return false; - } - output = input.substr(l, r - l); + output = re.match(); } // Store the output in the provided variable. @@ -298,22 +290,24 @@ bool RegexMatchAll(std::vector<std::string> const& args, // Scan through the input for all matches. std::string output; std::string::size_type base = 0; - while (re.find(input, base, optAnchor)) { + unsigned optNonEmpty = 0; + while (re.find(input, base, optAnchor | optNonEmpty)) { status.GetMakefile().ClearMatches(); status.GetMakefile().StoreMatches(re); - std::string::size_type l = re.start(); - std::string::size_type r = re.end(); - if (r - l == 0) { - std::string e = "sub-command REGEX, mode MATCHALL regex \"" + regex + - "\" matched an empty string."; - status.SetError(e); - return false; - } - if (!output.empty()) { + if (!output.empty() || optNonEmpty) { output += ";"; } output += re.match(); - base = r; + base = re.end(); + + if (re.start() == input.length()) { + break; + } + if (re.start() == re.end()) { + optNonEmpty = cmsys::RegularExpression::NONEMPTY_AT_OFFSET; + } else { + optNonEmpty = 0; + } } // Store the output in the provided variable. diff --git a/Source/cmStringReplaceHelper.cxx b/Source/cmStringReplaceHelper.cxx index e909b05..025dfc7 100644 --- a/Source/cmStringReplaceHelper.cxx +++ b/Source/cmStringReplaceHelper.cxx @@ -33,25 +33,17 @@ bool cmStringReplaceHelper::Replace(std::string const& input, } // Scan through the input for all matches. 
+ auto& re = this->RegularExpression; std::string::size_type base = 0; - while (this->RegularExpression.find(input, base, optAnchor)) { + unsigned optNonEmpty = 0; + while (re.find(input, base, optAnchor | optNonEmpty)) { if (this->Makefile) { this->Makefile->ClearMatches(); - this->Makefile->StoreMatches(this->RegularExpression); + this->Makefile->StoreMatches(re); } - auto l2 = this->RegularExpression.start(); - auto r = this->RegularExpression.end(); // Concatenate the part of the input that was not matched. - output += input.substr(base, l2 - base); - - // Make sure the match had some text. - if (r - l2 == 0) { - std::ostringstream error; - error << "regex \"" << this->RegExString << "\" matched an empty string"; - this->ErrorString = error.str(); - return false; - } + output += input.substr(base, re.start() - base); // Concatenate the replacement for the match. for (auto const& replacement : this->Replacements) { @@ -61,7 +53,7 @@ bool cmStringReplaceHelper::Replace(std::string const& input, } else { // Replace with part of the match. auto n = replacement.Number; - if (n > this->RegularExpression.num_groups()) { + if (n > re.num_groups()) { std::ostringstream error; error << "replace expression \"" << this->ReplaceExpression << "\" contains an out-of-range escape for regex \"" @@ -69,12 +61,21 @@ bool cmStringReplaceHelper::Replace(std::string const& input, this->ErrorString = error.str(); return false; } - output += this->RegularExpression.match(n); + output += re.match(n); } } // Move past the match. - base = r; + base = re.end(); + + if (re.start() == input.length()) { + break; + } + if (re.start() == re.end()) { + optNonEmpty = cmsys::RegularExpression::NONEMPTY_AT_OFFSET; + } else { + optNonEmpty = 0; + } } // Concatenate the text after the last match. 
diff --git a/Tests/CMakeTests/StringTest.cmake.in b/Tests/CMakeTests/StringTest.cmake.in index 8746551..ca2ee02 100644 --- a/Tests/CMakeTests/StringTest.cmake.in +++ b/Tests/CMakeTests/StringTest.cmake.in @@ -84,7 +84,7 @@ check_cmake_test(String # Execute each test listed in StringTestScript.cmake: # set(scriptname "@CMAKE_CURRENT_SOURCE_DIR@/StringTestScript.cmake") -set(number_of_tests_expected 73) +set(number_of_tests_expected 70) include("@CMAKE_CURRENT_SOURCE_DIR@/ExecuteScriptTests.cmake") execute_all_script_tests(${scriptname} number_of_tests_executed) diff --git a/Tests/CMakeTests/StringTestScript.cmake b/Tests/CMakeTests/StringTestScript.cmake index dc7b8ae..84b404c 100644 --- a/Tests/CMakeTests/StringTestScript.cmake +++ b/Tests/CMakeTests/StringTestScript.cmake @@ -73,9 +73,6 @@ elseif(testname STREQUAL regex_match_multiple_inputs) # pass elseif(testname STREQUAL regex_match_bad_regex) # fail string(REGEX MATCH "(.*" v input) -elseif(testname STREQUAL regex_match_empty_string) # fail - string(REGEX MATCH "x*" v "") - elseif(testname STREQUAL regex_match_no_match) # pass string(REGEX MATCH "xyz" v "abc") message(STATUS "v='${v}'") @@ -87,9 +84,6 @@ elseif(testname STREQUAL regex_matchall_multiple_inputs) # pass elseif(testname STREQUAL regex_matchall_bad_regex) # fail string(REGEX MATCHALL "(.*" v input) -elseif(testname STREQUAL regex_matchall_empty_string) # fail - string(REGEX MATCHALL "x*" v "") - elseif(testname STREQUAL regex_replace_ends_with_backslash) # fail string(REGEX REPLACE "input" "output\\" v input1 input2 input3 input4) @@ -107,9 +101,6 @@ elseif(testname STREQUAL regex_replace_has_bogus_escape) # fail elseif(testname STREQUAL regex_replace_bad_regex) # fail string(REGEX REPLACE "this (.*" "with that" v input) -elseif(testname STREQUAL regex_replace_empty_string) # fail - string(REGEX REPLACE "x*" "that" v "") - elseif(testname STREQUAL regex_replace_index_too_small) # fail string(REGEX REPLACE "^this (.*)$" "with \\1 \\-1" v "this 
input") diff --git a/Tests/RunCMake/string/RegexEmptyMatch.cmake b/Tests/RunCMake/string/RegexEmptyMatch.cmake new file mode 100644 index 0000000..1510137 --- /dev/null +++ b/Tests/RunCMake/string/RegexEmptyMatch.cmake @@ -0,0 +1,143 @@ +cmake_policy(SET CMP0186 NEW) + +function(check_output name expected) + set(output "${${name}}") + if(NOT output STREQUAL expected) + message(FATAL_ERROR "\"string(REGEX)\" set ${name} to \"${output}\", expected \"${expected}\"") + endif() +endfunction() + +# Zero-length matches in REGEX MATCH + +string(REGEX MATCH "" out "") +check_output(out "") + +string(REGEX MATCH "" out "a") +check_output(out "") + +string(REGEX MATCH "a*" out "") +check_output(out "") + +string(REGEX MATCH "a*" out "a") +check_output(out "a") + +string(REGEX MATCH "a*" out "b") +check_output(out "") + +string(REGEX MATCH "a*" out "ba") +check_output(out "") + +# Zero-length matches in REGEX MATCHALL + +string(REGEX MATCHALL "" out "") +check_output(out "") + +string(REGEX MATCHALL "" out "ab") +check_output(out ";;") + +string(REGEX MATCHALL "^" out "ab") +check_output(out "") + +string(REGEX MATCHALL "(^|,)" out "a,b") +check_output(out ";,") + +string(REGEX MATCHALL "(,|^)" out "a,b") +check_output(out ";,") + +string(REGEX MATCHALL "(^|)" out "") +check_output(out "") + +string(REGEX MATCHALL "(^|)" out "ab") +check_output(out ";;") + +string(REGEX MATCHALL "a|^" out "ab") +check_output(out "a") + +string(REGEX MATCHALL "$" out "ab") +check_output(out "") + +string(REGEX MATCHALL "($|,)" out "a,b") +check_output(out ",;") + +string(REGEX MATCHALL "(,|$)" out "a,b") +check_output(out ",;") + +string(REGEX MATCHALL "(|$)" out "") +check_output(out "") + +string(REGEX MATCHALL "(|$)" out "ab") +check_output(out ";;") + +string(REGEX MATCHALL "(b|)" out "abc") +check_output(out ";b;;") + +string(REGEX MATCHALL "(|b)" out "abc") +check_output(out ";;b;;") + +string(REGEX MATCHALL "a*" out "aaa") +check_output(out "aaa;") + +string(REGEX MATCHALL "(a)?(b)?" 
out "") +check_output(out "") + +string(REGEX MATCHALL "(a)?(b)?" out "abba") +check_output(out "ab;b;a;") + +# Zero-length matches in REGEX REPLACE + +string(REGEX REPLACE "" "" out "") +check_output(out "") + +string(REGEX REPLACE "" "x" out "") +check_output(out "x") + +string(REGEX REPLACE "" "x" out "ab") +check_output(out "xaxbx") + +string(REGEX REPLACE "^" "x" out "ab") +check_output(out "xab") + +string(REGEX REPLACE "(^|,)" "x" out "a,b") +check_output(out "xaxb") + +string(REGEX REPLACE "(,|^)" "x" out "a,b") +check_output(out "xaxb") + +string(REGEX REPLACE "(^|)" "x" out "") +check_output(out "x") + +string(REGEX REPLACE "(^|)" "x" out "ab") +check_output(out "xaxbx") + +string(REGEX REPLACE "a|^" "x" out "ab") +check_output(out "xb") + +string(REGEX REPLACE "$" "x" out "ab") +check_output(out "abx") + +string(REGEX REPLACE "($|,)" "x" out "a,b") +check_output(out "axbx") + +string(REGEX REPLACE "(,|$)" "x" out "a,b") +check_output(out "axbx") + +string(REGEX REPLACE "(|$)" "x" out "") +check_output(out "x") + +string(REGEX REPLACE "(|$)" "x" out "ab") +check_output(out "xaxbx") + +string(REGEX REPLACE "(b|)" "x" out "abc") +check_output(out "xaxxcx") + +string(REGEX REPLACE "(|b)" "x" out "abc") +check_output(out "xaxxxcx") + +string(REGEX REPLACE "a*" "x" out "aaa") +check_output(out "xx") + +string(REGEX REPLACE "(a)?(b)?" "x" out "") +check_output(out "x") + +string(REGEX REPLACE "(a)?(b)?" "x" out "abba") +check_output(out "xxxx") diff --git a/Tests/RunCMake/string/RunCMakeTest.cmake b/Tests/RunCMake/string/RunCMakeTest.cmake index e352fcb..91a03da 100644 --- a/Tests/RunCMake/string/RunCMakeTest.cmake +++ b/Tests/RunCMake/string/RunCMakeTest.cmake @@ -35,6 +35,7 @@ run_cmake(UuidBadType) run_cmake(RegexClear) run_cmake(RegexMultiMatchClear) +run_cmake(RegexEmptyMatch) run_cmake(CMP0186) run_cmake(UTF-16BE) -- cgit v0.12