Merge bitcoin/bitcoin#22953: refactor: introduce single-separator split helper (boost::split replacement)

a62e84438d27ee6213219fe2c233e58814fcbb5d fuzz: add `SplitString` fuzz target (MarcoFalke) 4fad7e46d94a0fdee4ff917e81360d7ae6bd8110 test: add unit tests for `SplitString` helper (Kiminuo) 9cc8e876e412056ed22d364538f0da3d5d71946d refactor: introduce single-separator split helper `SplitString` (Sebastian Falbesoner) Pull request description: This PR adds a simple string split helper `SplitString` that takes use of the spanparsing `Split` function that was first introduced in #13697 (commit fe8a7dcd78cfeedc9a7c705e91384f793822912b). This enables to replace most calls to `boost::split`, in the cases where only a single separator character is used. Note that while previous attempts to replace `boost::split` were controversial (e.g. #13751), this one has a trivial implementation: it merely uses an internal helper (that is unit tested and in regular use with output descriptiors) and converts its result from spans to strings. As a drawback though, not all `boost::split` instances can be tackled. As a possible optimization, one could return a vector of `std::string_view`s (available since C++17) instead of strings, to avoid copies. This would need more carefulness on the caller sites though, to avoid potential lifetime issues, and it's probably not worth it, considering that none of the places where strings are split are really performance-critical. ACKs for top commit: martinus: Code review ACK a62e84438d27ee6213219fe2c233e58814fcbb5d. Ran all tests. I also like that with `boost::split` it was not obvious that the resulting container was cleared, and with `SplitString` API that's obvious. Tree-SHA512: 10cb22619ebe46831b1f8e83584a89381a036b54c88701484ac00743e2a62cfe52c9f3ecdbb2d0815e536c99034558277cc263600ec3f3588b291c07eef8ed24
author: fanquake <fanquake@gmail.com> 2022-04-26 09:30:34 +0100
committer: fanquake <fanquake@gmail.com> 2022-04-26 09:54:49 +0100
commit: f436bfd126adaa64daba2e78016eb707e37eabce (patch)
tree: 1bb4dfcaf306fee1f5ee66e07e3c1aa54bec6675
parent: a19f641a80cb2841ca9f593e9be589dbc4b4ae7c (diff)
parent: a62e84438d27ee6213219fe2c233e58814fcbb5d (diff)
download: bitcoin-f436bfd126adaa64daba2e78016eb707e37eabce.tar.xz
14 files changed, 97 insertions, 71 deletions
diff --git a/src/bitcoin-tx.cpp b/src/bitcoin-tx.cpp
index 0b40626595..05f910e9cb 100644
--- a/src/bitcoin-tx.cpp
+++ b/src/bitcoin-tx.cpp
@@ -31,8 +31,6 @@
 #include <memory>
 #include <stdio.h>
 
-#include <boost/algorithm/string.hpp>
-
 static bool fCreateBlank;
 static std::map<std::string,UniValue> registers;
 static const int CONTINUE_EXECUTION=-1;
@@ -251,8 +249,7 @@ static T TrimAndParse(const std::string& int_str, const std::string& err)
 
 static void MutateTxAddInput(CMutableTransaction& tx, const std::string& strInput)
 {
-    std::vector<std::string> vStrInputParts;
-    boost::split(vStrInputParts, strInput, boost::is_any_of(":"));
+    std::vector<std::string> vStrInputParts = SplitString(strInput, ':');
 
     // separate TXID:VOUT in string
     if (vStrInputParts.size()<2)
@@ -287,8 +284,7 @@ static void MutateTxAddInput(CMutableTransaction& tx, const std::string& strInpu
 static void MutateTxAddOutAddr(CMutableTransaction& tx, const std::string& strInput)
 {
     // Separate into VALUE:ADDRESS
-    std::vector<std::string> vStrInputParts;
-    boost::split(vStrInputParts, strInput, boost::is_any_of(":"));
+    std::vector<std::string> vStrInputParts = SplitString(strInput, ':');
 
     if (vStrInputParts.size() != 2)
         throw std::runtime_error("TX output missing or too many separators");
@@ -312,8 +308,7 @@ static void MutateTxAddOutAddr(CMutableTransaction& tx, const std::string& strIn
 static void MutateTxAddOutPubKey(CMutableTransaction& tx, const std::string& strInput)
 {
     // Separate into VALUE:PUBKEY[:FLAGS]
-    std::vector<std::string> vStrInputParts;
-    boost::split(vStrInputParts, strInput, boost::is_any_of(":"));
+    std::vector<std::string> vStrInputParts = SplitString(strInput, ':');
 
     if (vStrInputParts.size() < 2 || vStrInputParts.size() > 3)
         throw std::runtime_error("TX output missing or too many separators");
@@ -356,8 +351,7 @@ static void MutateTxAddOutPubKey(CMutableTransaction& tx, const std::string& str
 static void MutateTxAddOutMultiSig(CMutableTransaction& tx, const std::string& strInput)
 {
     // Separate into VALUE:REQUIRED:NUMKEYS:PUBKEY1:PUBKEY2:....[:FLAGS]
-    std::vector<std::string> vStrInputParts;
-    boost::split(vStrInputParts, strInput, boost::is_any_of(":"));
+    std::vector<std::string> vStrInputParts = SplitString(strInput, ':');
 
     // Check that there are enough parameters
     if (vStrInputParts.size()<3)
@@ -460,8 +454,7 @@ static void MutateTxAddOutData(CMutableTransaction& tx, const std::string& strIn
 static void MutateTxAddOutScript(CMutableTransaction& tx, const std::string& strInput)
 {
     // separate VALUE:SCRIPT[:FLAGS]
-    std::vector<std::string> vStrInputParts;
-    boost::split(vStrInputParts, strInput, boost::is_any_of(":"));
+    std::vector<std::string> vStrInputParts = SplitString(strInput, ':');
     if (vStrInputParts.size() < 2)
         throw std::runtime_error("TX output missing separator");
 
diff --git a/src/chainparams.cpp b/src/chainparams.cpp
index c65a816a5f..7a7c72ea25 100644
--- a/src/chainparams.cpp
+++ b/src/chainparams.cpp
@@ -10,13 +10,11 @@
 #include <deploymentinfo.h>
 #include <hash.h> // for signet block challenge hash
 #include <script/interpreter.h>
+#include <util/string.h>
 #include <util/system.h>
 
 #include <assert.h>
 
-#include <boost/algorithm/string/classification.hpp>
-#include <boost/algorithm/string/split.hpp>
-
 static CBlock CreateGenesisBlock(const char* pszTimestamp, const CScript& genesisOutputScript, uint32_t nTime, uint32_t nNonce, uint32_t nBits, int32_t nVersion, const CAmount& genesisReward)
 {
     CMutableTransaction txNew;
@@ -528,8 +526,7 @@ void CRegTestParams::UpdateActivationParametersFromArgs(const ArgsManager& args)
     if (!args.IsArgSet("-vbparams")) return;
 
     for (const std::string& strDeployment : args.GetArgs("-vbparams")) {
-        std::vector<std::string> vDeploymentParams;
-        boost::split(vDeploymentParams, strDeployment, boost::is_any_of(":"));
+        std::vector<std::string> vDeploymentParams = SplitString(strDeployment, ':');
         if (vDeploymentParams.size() < 3 || 4 < vDeploymentParams.size()) {
             throw std::runtime_error("Version bits parameters malformed, expecting deployment:start:end[:min_activation_height]");
         }
diff --git a/src/rest.cpp b/src/rest.cpp
index 2aeb9c68c3..22b5d2e074 100644
--- a/src/rest.cpp
+++ b/src/rest.cpp
@@ -32,8 +32,6 @@
 #include <any>
 #include <string>
 
-#include <boost/algorithm/string.hpp>
-
 #include <univalue.h>
 
 using node::GetTransaction;
@@ -191,8 +189,7 @@ static bool rest_headers(const std::any& context,
         return false;
     std::string param;
     const RESTResponseFormat rf = ParseDataFormat(param, strURIPart);
-    std::vector<std::string> path;
-    boost::split(path, param, boost::is_any_of("/"));
+    std::vector<std::string> path = SplitString(param, '/');
 
     std::string raw_count;
     std::string hashStr;
@@ -362,8 +359,7 @@ static bool rest_filter_header(const std::any& context, HTTPRequest* req, const
     std::string param;
     const RESTResponseFormat rf = ParseDataFormat(param, strURIPart);
 
-    std::vector<std::string> uri_parts;
-    boost::split(uri_parts, param, boost::is_any_of("/"));
+    std::vector<std::string> uri_parts = SplitString(param, '/');
     std::string raw_count;
     std::string raw_blockhash;
     if (uri_parts.size() == 3) {
@@ -483,8 +479,7 @@ static bool rest_block_filter(const std::any& context, HTTPRequest* req, const s
     const RESTResponseFormat rf = ParseDataFormat(param, strURIPart);
 
     // request is sent over URI scheme /rest/blockfilter/filtertype/blockhash
-    std::vector<std::string> uri_parts;
-    boost::split(uri_parts, param, boost::is_any_of("/"));
+    std::vector<std::string> uri_parts = SplitString(param, '/');
     if (uri_parts.size() != 2) {
         return RESTERR(req, HTTP_BAD_REQUEST, "Invalid URI format. Expected /rest/blockfilter/<filtertype>/<blockhash>");
     }
@@ -712,7 +707,7 @@ static bool rest_getutxos(const std::any& context, HTTPRequest* req, const std::
     if (param.length() > 1)
     {
         std::string strUriParams = param.substr(1);
-        boost::split(uriParts, strUriParams, boost::is_any_of("/"));
+        uriParts = SplitString(strUriParams, '/');
     }
 
     // throw exception in case of an empty request
diff --git a/src/rpc/server.cpp b/src/rpc/server.cpp
index 333ed6f5da..6083c23d3b 100644
--- a/src/rpc/server.cpp
+++ b/src/rpc/server.cpp
@@ -9,10 +9,9 @@
 #include <shutdown.h>
 #include <sync.h>
 #include <util/strencodings.h>
+#include <util/string.h>
 #include <util/system.h>
 
-#include <boost/algorithm/string/classification.hpp>
-#include <boost/algorithm/string/split.hpp>
 #include <boost/signals2/signal.hpp>
 
 #include <cassert>
@@ -407,8 +406,7 @@ static inline JSONRPCRequest transformNamedArguments(const JSONRPCRequest& in, c
     // Process expected parameters.
     int hole = 0;
     for (const std::string &argNamePattern: argNames) {
-        std::vector<std::string> vargNames;
-        boost::algorithm::split(vargNames, argNamePattern, boost::algorithm::is_any_of("|"));
+        std::vector<std::string> vargNames = SplitString(argNamePattern, '|');
         auto fr = argsIn.end();
         for (const std::string & argName : vargNames) {
             fr = argsIn.find(argName);
diff --git a/src/rpc/util.cpp b/src/rpc/util.cpp
index b5e167a2a8..ef3e58494e 100644
--- a/src/rpc/util.cpp
+++ b/src/rpc/util.cpp
@@ -16,9 +16,6 @@
 
 #include <tuple>
 
-#include <boost/algorithm/string/classification.hpp>
-#include <boost/algorithm/string/split.hpp>
-
 const std::string UNIX_EPOCH_TIME = "UNIX epoch time";
 const std::string EXAMPLE_ADDRESS[2] = {"bc1q09vm5lfy0j5reeulh4x5752q25uqqvz34hufdl", "bc1q02ad21edsxd23d32dfgqqsz4vv4nmtfzuklhy3"};
 
@@ -514,8 +511,7 @@ RPCHelpMan::RPCHelpMan(std::string name, std::string description, std::vector<RP
 {
     std::set<std::string> named_args;
     for (const auto& arg : m_args) {
-        std::vector<std::string> names;
-        boost::split(names, arg.m_names, boost::is_any_of("|"));
+        std::vector<std::string> names = SplitString(arg.m_names, '|');
         // Should have unique named arguments
         for (const std::string& name : names) {
             CHECK_NONFATAL(named_args.insert(name).second);
@@ -666,8 +662,7 @@ UniValue RPCHelpMan::GetArgMap() const
     UniValue arr{UniValue::VARR};
     for (int i{0}; i < int(m_args.size()); ++i) {
         const auto& arg = m_args.at(i);
-        std::vector<std::string> arg_names;
-        boost::split(arg_names, arg.m_names, boost::is_any_of("|"));
+        std::vector<std::string> arg_names = SplitString(arg.m_names, '|');
         for (const auto& arg_name : arg_names) {
             UniValue map{UniValue::VARR};
             map.push_back(m_name);
diff --git a/src/test/fuzz/script_assets_test_minimizer.cpp b/src/test/fuzz/script_assets_test_minimizer.cpp
index 00a3bed12f..a18a0de47e 100644
--- a/src/test/fuzz/script_assets_test_minimizer.cpp
+++ b/src/test/fuzz/script_assets_test_minimizer.cpp
@@ -11,8 +11,8 @@
 #include <streams.h>
 #include <univalue.h>
 #include <util/strencodings.h>
+#include <util/string.h>
 
-#include <boost/algorithm/string.hpp>
 #include <cstdint>
 #include <string>
 #include <vector>
@@ -130,8 +130,7 @@ unsigned int ParseScriptFlags(const std::string& str)
     if (str.empty()) return 0;
 
     unsigned int flags = 0;
-    std::vector<std::string> words;
-    boost::algorithm::split(words, str, boost::algorithm::is_any_of(","));
+    std::vector<std::string> words = SplitString(str, ',');
 
     for (const std::string& word : words) {
         auto it = FLAG_NAMES.find(word);
diff --git a/src/test/fuzz/string.cpp b/src/test/fuzz/string.cpp
index ca57af25c4..b4876d427f 100644
--- a/src/test/fuzz/string.cpp
+++ b/src/test/fuzz/string.cpp
@@ -224,6 +224,7 @@ FUZZ_TARGET(string)
         int64_t amount_out;
         (void)ParseFixedPoint(random_string_1, fuzzed_data_provider.ConsumeIntegralInRange<int>(0, 1024), &amount_out);
     }
+    (void)SplitString(random_string_1, fuzzed_data_provider.ConsumeIntegral<char>());
     {
         (void)Untranslated(random_string_1);
         const bilingual_str bs1{random_string_1, random_string_2};
diff --git a/src/test/transaction_tests.cpp b/src/test/transaction_tests.cpp
index 4fb7f9c405..df67841b66 100644
--- a/src/test/transaction_tests.cpp
+++ b/src/test/transaction_tests.cpp
@@ -31,8 +31,6 @@
 #include <map>
 #include <string>
 
-#include <boost/algorithm/string/classification.hpp>
-#include <boost/algorithm/string/split.hpp>
 #include <boost/test/unit_test.hpp>
 
 #include <univalue.h>
@@ -70,8 +68,7 @@ unsigned int ParseScriptFlags(std::string strFlags)
 {
     if (strFlags.empty() || strFlags == "NONE") return 0;
     unsigned int flags = 0;
-    std::vector<std::string> words;
-    boost::algorithm::split(words, strFlags, boost::algorithm::is_any_of(","));
+    std::vector<std::string> words = SplitString(strFlags, ',');
 
     for (const std::string& word : words)
     {
diff --git a/src/test/util_tests.cpp b/src/test/util_tests.cpp
index b5d8411e1d..779ed20032 100644
--- a/src/test/util_tests.cpp
+++ b/src/test/util_tests.cpp
@@ -2349,6 +2349,55 @@ BOOST_AUTO_TEST_CASE(test_spanparsing)
     BOOST_CHECK_EQUAL(SpanToStr(results[3]), "");
 }
 
+BOOST_AUTO_TEST_CASE(test_SplitString)
+{
+    // Empty string.
+    {
+        std::vector<std::string> result = SplitString("", '-');
+        BOOST_CHECK_EQUAL(result.size(), 1);
+        BOOST_CHECK_EQUAL(result[0], "");
+    }
+
+    // Empty items.
+    {
+        std::vector<std::string> result = SplitString("-", '-');
+        BOOST_CHECK_EQUAL(result.size(), 2);
+        BOOST_CHECK_EQUAL(result[0], "");
+        BOOST_CHECK_EQUAL(result[1], "");
+    }
+
+    // More empty items.
+    {
+        std::vector<std::string> result = SplitString("--", '-');
+        BOOST_CHECK_EQUAL(result.size(), 3);
+        BOOST_CHECK_EQUAL(result[0], "");
+        BOOST_CHECK_EQUAL(result[1], "");
+        BOOST_CHECK_EQUAL(result[2], "");
+    }
+
+    // Separator is not present.
+    {
+        std::vector<std::string> result = SplitString("abc", '-');
+        BOOST_CHECK_EQUAL(result.size(), 1);
+        BOOST_CHECK_EQUAL(result[0], "abc");
+    }
+
+    // Basic behavior.
+    {
+        std::vector<std::string> result = SplitString("a-b", '-');
+        BOOST_CHECK_EQUAL(result.size(), 2);
+        BOOST_CHECK_EQUAL(result[0], "a");
+        BOOST_CHECK_EQUAL(result[1], "b");
+    }
+
+    // Case-sensitivity of the separator.
+    {
+        std::vector<std::string> result = SplitString("AAA", 'a');
+        BOOST_CHECK_EQUAL(result.size(), 1);
+        BOOST_CHECK_EQUAL(result[0], "AAA");
+    }
+}
+
 BOOST_AUTO_TEST_CASE(test_LogEscapeMessage)
 {
     // ASCII and UTF-8 must pass through unaltered.
diff --git a/src/torcontrol.cpp b/src/torcontrol.cpp
index 74450f591d..287de9c794 100644
--- a/src/torcontrol.cpp
+++ b/src/torcontrol.cpp
@@ -24,9 +24,7 @@
 #include <set>
 #include <vector>
 
-#include <boost/algorithm/string/classification.hpp>
 #include <boost/algorithm/string/replace.hpp>
-#include <boost/algorithm/string/split.hpp>
 
 #include <event2/buffer.h>
 #include <event2/bufferevent.h>
@@ -347,8 +345,8 @@ void TorController::get_socks_cb(TorControlConnection& _conn, const TorControlRe
         for (const auto& line : reply.lines) {
             if (0 == line.compare(0, 20, "net/listeners/socks=")) {
                 const std::string port_list_str = line.substr(20);
-                std::vector<std::string> port_list;
-                boost::split(port_list, port_list_str, boost::is_any_of(" "));
+                std::vector<std::string> port_list = SplitString(port_list_str, ' ');
+
                 for (auto& portstr : port_list) {
                     if (portstr.empty()) continue;
                     if ((portstr[0] == '"' || portstr[0] == '\'') && portstr.size() >= 2 && (*portstr.rbegin() == portstr[0])) {
@@ -542,8 +540,10 @@ void TorController::protocolinfo_cb(TorControlConnection& _conn, const TorContro
             if (l.first == "AUTH") {
                 std::map<std::string,std::string> m = ParseTorReplyMapping(l.second);
                 std::map<std::string,std::string>::iterator i;
-                if ((i = m.find("METHODS")) != m.end())
-                    boost::split(methods, i->second, boost::is_any_of(","));
+                if ((i = m.find("METHODS")) != m.end()) {
+                    std::vector<std::string> m_vec = SplitString(i->second, ',');
+                    methods = std::set<std::string>(m_vec.begin(), m_vec.end());
+                }
                 if ((i = m.find("COOKIEFILE")) != m.end())
                     cookiefile = i->second;
             } else if (l.first == "VERSION") {
diff --git a/src/util/spanparsing.cpp b/src/util/spanparsing.cpp
index 50f6aee419..8614bd1176 100644
--- a/src/util/spanparsing.cpp
+++ b/src/util/spanparsing.cpp
@@ -48,20 +48,4 @@ Span<const char> Expr(Span<const char>& sp)
     return ret;
 }
 
-std::vector<Span<const char>> Split(const Span<const char>& sp, char sep)
-{
-    std::vector<Span<const char>> ret;
-    auto it = sp.begin();
-    auto start = it;
-    while (it != sp.end()) {
-        if (*it == sep) {
-            ret.emplace_back(start, it);
-            start = it + 1;
-        }
-        ++it;
-    }
-    ret.emplace_back(start, it);
-    return ret;
-}
-
 } // namespace spanparsing
diff --git a/src/util/spanparsing.h b/src/util/spanparsing.h
index fa2e698e6d..ebec8714a7 100644
--- a/src/util/spanparsing.h
+++ b/src/util/spanparsing.h
@@ -43,7 +43,22 @@ Span<const char> Expr(Span<const char>& sp);
  * Note that this function does not care about braces, so splitting
  * "foo(bar(1),2),3) on ',' will return {"foo(bar(1)", "2)", "3)"}.
  */
-std::vector<Span<const char>> Split(const Span<const char>& sp, char sep);
+template <typename T = Span<const char>>
+std::vector<T> Split(const Span<const char>& sp, char sep)
+{
+    std::vector<T> ret;
+    auto it = sp.begin();
+    auto start = it;
+    while (it != sp.end()) {
+        if (*it == sep) {
+            ret.emplace_back(start, it);
+            start = it + 1;
+        }
+        ++it;
+    }
+    ret.emplace_back(start, it);
+    return ret;
+}
 
 } // namespace spanparsing
 
diff --git a/src/util/string.h b/src/util/string.h
index a3b8df8d78..bcd6905fd5 100644
--- a/src/util/string.h
+++ b/src/util/string.h
@@ -6,6 +6,7 @@
 #define BITCOIN_UTIL_STRING_H
 
 #include <attributes.h>
+#include <util/spanparsing.h>
 
 #include <algorithm>
 #include <array>
@@ -15,6 +16,11 @@
 #include <string>
 #include <vector>
 
+[[nodiscard]] inline std::vector<std::string> SplitString(std::string_view str, char sep)
+{
+    return spanparsing::Split<std::string>(str, sep);
+}
+
 [[nodiscard]] inline std::string TrimString(const std::string& str, const std::string& pattern = " \f\n\r\t\v")
 {
     std::string::size_type front = str.find_first_not_of(pattern);
diff --git a/src/wallet/rpc/backup.cpp b/src/wallet/rpc/backup.cpp
index 7481072815..c888d66b43 100644
--- a/src/wallet/rpc/backup.cpp
+++ b/src/wallet/rpc/backup.cpp
@@ -26,8 +26,6 @@
 #include <tuple>
 #include <string>
 
-#include <boost/algorithm/string.hpp>
-
 #include <univalue.h>
 
 
@@ -546,8 +544,7 @@ RPCHelpMan importwallet()
             if (line.empty() || line[0] == '#')
                 continue;
 
-            std::vector<std::string> vstr;
-            boost::split(vstr, line, boost::is_any_of(" "));
+            std::vector<std::string> vstr = SplitString(line, ' ');
             if (vstr.size() < 2)
                 continue;
             CKey key = DecodeSecret(vstr[0]);
author	fanquake <fanquake@gmail.com>	2022-04-26 09:30:34 +0100
committer	fanquake <fanquake@gmail.com>	2022-04-26 09:54:49 +0100
commit	f436bfd126adaa64daba2e78016eb707e37eabce (patch)
tree	1bb4dfcaf306fee1f5ee66e07e3c1aa54bec6675
parent	a19f641a80cb2841ca9f593e9be589dbc4b4ae7c (diff)
parent	a62e84438d27ee6213219fe2c233e58814fcbb5d (diff)
download	bitcoin-f436bfd126adaa64daba2e78016eb707e37eabce.tar.xz