Merge bitcoin/bitcoin#20457: util: Make Parse{Int,UInt}{32,64} use locale independent std::from_chars(…) (C++17) instead of locale dependent strto{l,ll,ul,ull}

4747db876154ddd828c03d9eda10ecf8b25d8dc8 util: Introduce ToIntegral<T>(const std::string&) for locale independent parsing using std::from_chars(…) (C++17) (practicalswift)

Pull request description:

Make `Parse{Int,UInt}{32,64}` use locale independent `std::from_chars(…)` (C++17) instead of locale dependent `strto{l,ll,ul,ull}`.

[About `std::from_chars`](https://en.cppreference.com/w/cpp/utility/from_chars): _"Unlike other parsing functions in C++ and C libraries, `std::from_chars` is locale-independent, non-allocating, and non-throwing."_

ACKs for top commit:

laanwj: Code review ACK 4747db876154ddd828c03d9eda10ecf8b25d8dc8

Tree-SHA512: 40f2cd582bc19ddcf2c498eca3379167619eff6aa047bbac2f73b8fd8ecaefe5947c66700a189f83848751f9f8c05645e83afd4a44a1679062aee5440dba880a
author: W. J. van der Laan <laanwj@protonmail.com> 2021-09-30 14:47:03 +0200
committer: W. J. van der Laan <laanwj@protonmail.com> 2021-09-30 15:14:58 +0200
commit: 2d8e0c0c3c0d3c4cee7bb52d1edf501f40c53463 (patch)
tree: 8266dac1d118b5985f5f84b1550752198bc21f38
parent: 1cf7fb9fd65820b298a0eb28f6e2c4c7409c78d1 (diff)
parent: 4747db876154ddd828c03d9eda10ecf8b25d8dc8 (diff)
5 files changed, 270 insertions, 74 deletions
diff --git a/src/test/fuzz/string.cpp b/src/test/fuzz/string.cpp
index 0c1b45b86c..dc2bf7c860 100644
--- a/src/test/fuzz/string.cpp
+++ b/src/test/fuzz/string.cpp
@@ -31,9 +31,99 @@
 #include <version.h>
 
 #include <cstdint>
+#include <cstdlib>
 #include <string>
 #include <vector>
 
+namespace {
+bool LegacyParsePrechecks(const std::string& str) // Input validation shared by the LegacyParse* helpers in this file
+{
+    if (str.empty()) // No empty string allowed
+        return false;
+    if (str.size() >= 1 && (IsSpace(str[0]) || IsSpace(str[str.size() - 1]))) // No padding allowed: first and last character must not be whitespace
+        return false;
+    if (!ValidAsCString(str)) // No embedded NUL characters allowed
+        return false;
+    return true;
+}
+
+bool LegacyParseInt32(const std::string& str, int32_t* out) // strtol-based parser; the fuzz target compares ParseInt32 against it
+{
+    if (!LegacyParsePrechecks(str))
+        return false;
+    char* endp = nullptr;
+    errno = 0; // strtol will not set errno if valid, so clear any stale value first
+    long int n = strtol(str.c_str(), &endp, 10);
+    if (out) *out = (int32_t)n; // stored before validation: *out is written even when false is returned below
+    // Note that strtol returns a *long int*, so even if strtol doesn't report an over/underflow
+    // we still have to check that the returned value is within the range of an *int32_t*. On 64-bit
+    // platforms the size of these types may be different.
+    return endp && *endp == 0 && !errno &&
+           n >= std::numeric_limits<int32_t>::min() &&
+           n <= std::numeric_limits<int32_t>::max();
+}
+
+bool LegacyParseInt64(const std::string& str, int64_t* out) // strtoll-based parser; the fuzz target compares ParseInt64 against it
+{
+    if (!LegacyParsePrechecks(str))
+        return false;
+    char* endp = nullptr;
+    errno = 0; // strtoll will not set errno if valid, so clear any stale value first
+    long long int n = strtoll(str.c_str(), &endp, 10);
+    if (out) *out = (int64_t)n; // stored before validation: *out is written even when false is returned below
+    // Note that strtoll returns a *long long int*, so even if strtoll doesn't report an over/underflow
+    // we still have to check that the returned value is within the range of an *int64_t*.
+    return endp && *endp == 0 && !errno &&
+           n >= std::numeric_limits<int64_t>::min() &&
+           n <= std::numeric_limits<int64_t>::max();
+}
+
+bool LegacyParseUInt32(const std::string& str, uint32_t* out) // strtoul-based parser; the fuzz target compares ParseUInt32 against it
+{
+    if (!LegacyParsePrechecks(str))
+        return false;
+    if (str.size() >= 1 && str[0] == '-') // Reject negative values, unfortunately strtoul accepts these by default if they fit in the range
+        return false;
+    char* endp = nullptr;
+    errno = 0; // strtoul will not set errno if valid, so clear any stale value first
+    unsigned long int n = strtoul(str.c_str(), &endp, 10);
+    if (out) *out = (uint32_t)n; // stored before validation: *out is written even when false is returned below
+    // Note that strtoul returns an *unsigned long int*, so even if it doesn't report an over/underflow
+    // we still have to check that the returned value is within the range of a *uint32_t*. On 64-bit
+    // platforms the size of these types may be different.
+    return endp && *endp == 0 && !errno &&
+           n <= std::numeric_limits<uint32_t>::max();
+}
+
+bool LegacyParseUInt8(const std::string& str, uint8_t* out) // Parses through LegacyParseUInt32, then range-checks against uint8_t
+{
+    uint32_t u32;
+    if (!LegacyParseUInt32(str, &u32) || u32 > std::numeric_limits<uint8_t>::max()) {
+        return false;
+    }
+    if (out != nullptr) { // reached only on success, so *out is left untouched when parsing fails
+        *out = static_cast<uint8_t>(u32);
+    }
+    return true;
+}
+
+bool LegacyParseUInt64(const std::string& str, uint64_t* out) // strtoull-based parser; the fuzz target compares ParseUInt64 against it
+{
+    if (!LegacyParsePrechecks(str))
+        return false;
+    if (str.size() >= 1 && str[0] == '-') // Reject negative values, unfortunately strtoull accepts these by default if they fit in the range
+        return false;
+    char* endp = nullptr;
+    errno = 0; // strtoull will not set errno if valid, so clear any stale value first
+    unsigned long long int n = strtoull(str.c_str(), &endp, 10);
+    if (out) *out = (uint64_t)n; // stored before validation: *out is written even when false is returned below
+    // Note that strtoull returns an *unsigned long long int*, so even if it doesn't report an over/underflow
+    // we still have to check that the returned value is within the range of a *uint64_t*.
+    return endp && *endp == 0 && !errno &&
+           n <= std::numeric_limits<uint64_t>::max();
+}
+}; // namespace
+
 FUZZ_TARGET(string)
 {
     FuzzedDataProvider fuzzed_data_provider(buffer.data(), buffer.size());
@@ -133,4 +223,49 @@ FUZZ_TARGET(string)
         const bilingual_str bs2{random_string_2, random_string_1};
         (void)(bs1 + bs2);
     }
+    { // Differential check: each Parse* function must agree with its strto*-based LegacyParse* counterpart
+        int32_t i32;
+        int64_t i64;
+        uint32_t u32;
+        uint64_t u64;
+        uint8_t u8;
+        const bool ok_i32 = ParseInt32(random_string_1, &i32);
+        const bool ok_i64 = ParseInt64(random_string_1, &i64);
+        const bool ok_u32 = ParseUInt32(random_string_1, &u32);
+        const bool ok_u64 = ParseUInt64(random_string_1, &u64);
+        const bool ok_u8 = ParseUInt8(random_string_1, &u8);
+        // Same input string through the legacy implementations.
+        int32_t i32_legacy;
+        int64_t i64_legacy;
+        uint32_t u32_legacy;
+        uint64_t u64_legacy;
+        uint8_t u8_legacy;
+        const bool ok_i32_legacy = LegacyParseInt32(random_string_1, &i32_legacy);
+        const bool ok_i64_legacy = LegacyParseInt64(random_string_1, &i64_legacy);
+        const bool ok_u32_legacy = LegacyParseUInt32(random_string_1, &u32_legacy);
+        const bool ok_u64_legacy = LegacyParseUInt64(random_string_1, &u64_legacy);
+        const bool ok_u8_legacy = LegacyParseUInt8(random_string_1, &u8_legacy);
+        // Accept/reject decisions must match for every width.
+        assert(ok_i32 == ok_i32_legacy);
+        assert(ok_i64 == ok_i64_legacy);
+        assert(ok_u32 == ok_u32_legacy);
+        assert(ok_u64 == ok_u64_legacy);
+        assert(ok_u8 == ok_u8_legacy);
+        // Parsed values are compared only when parsing succeeded.
+        if (ok_i32) {
+            assert(i32 == i32_legacy);
+        }
+        if (ok_i64) {
+            assert(i64 == i64_legacy);
+        }
+        if (ok_u32) {
+            assert(u32 == u32_legacy);
+        }
+        if (ok_u64) {
+            assert(u64 == u64_legacy);
+        }
+        if (ok_u8) {
+            assert(u8 == u8_legacy);
+        }
+    }
 }
diff --git a/src/test/util_tests.cpp b/src/test/util_tests.cpp
index a5c9d2ef6f..a13700d733 100644
--- a/src/test/util_tests.cpp
+++ b/src/test/util_tests.cpp
@@ -1474,6 +1474,81 @@ BOOST_AUTO_TEST_CASE(test_ParseInt32)
     BOOST_CHECK(!ParseInt32("32482348723847471234", nullptr));
 }
 
+BOOST_AUTO_TEST_CASE(test_ToIntegral)
+{ // Accepted int32_t inputs: plain digits, leading zeros, and a single leading '-'
+    BOOST_CHECK_EQUAL(ToIntegral<int32_t>("1234").value(), 1'234);
+    BOOST_CHECK_EQUAL(ToIntegral<int32_t>("0").value(), 0);
+    BOOST_CHECK_EQUAL(ToIntegral<int32_t>("01234").value(), 1'234);
+    BOOST_CHECK_EQUAL(ToIntegral<int32_t>("00000000000000001234").value(), 1'234);
+    BOOST_CHECK_EQUAL(ToIntegral<int32_t>("-00000000000000001234").value(), -1'234);
+    BOOST_CHECK_EQUAL(ToIntegral<int32_t>("00000000000000000000").value(), 0);
+    BOOST_CHECK_EQUAL(ToIntegral<int32_t>("-00000000000000000000").value(), 0);
+    BOOST_CHECK_EQUAL(ToIntegral<int32_t>("-1234").value(), -1'234);
+    BOOST_CHECK_EQUAL(ToIntegral<int32_t>("-1").value(), -1);
+    // Rejected: whitespace padding, trailing characters, any leading '+', doubled signs, empty, non-numeric, hex prefix, out of range.
+    BOOST_CHECK(!ToIntegral<int32_t>(" 1"));
+    BOOST_CHECK(!ToIntegral<int32_t>("1 "));
+    BOOST_CHECK(!ToIntegral<int32_t>("1a"));
+    BOOST_CHECK(!ToIntegral<int32_t>("1.1"));
+    BOOST_CHECK(!ToIntegral<int32_t>("1.9"));
+    BOOST_CHECK(!ToIntegral<int32_t>("+01.9"));
+    BOOST_CHECK(!ToIntegral<int32_t>(" -1"));
+    BOOST_CHECK(!ToIntegral<int32_t>("-1 "));
+    BOOST_CHECK(!ToIntegral<int32_t>(" -1 "));
+    BOOST_CHECK(!ToIntegral<int32_t>("+1"));
+    BOOST_CHECK(!ToIntegral<int32_t>(" +1"));
+    BOOST_CHECK(!ToIntegral<int32_t>(" +1 "));
+    BOOST_CHECK(!ToIntegral<int32_t>("+-1"));
+    BOOST_CHECK(!ToIntegral<int32_t>("-+1"));
+    BOOST_CHECK(!ToIntegral<int32_t>("++1"));
+    BOOST_CHECK(!ToIntegral<int32_t>("--1"));
+    BOOST_CHECK(!ToIntegral<int32_t>(""));
+    BOOST_CHECK(!ToIntegral<int32_t>("aap"));
+    BOOST_CHECK(!ToIntegral<int32_t>("0x1"));
+    BOOST_CHECK(!ToIntegral<int32_t>("-32482348723847471234"));
+    BOOST_CHECK(!ToIntegral<int32_t>("32482348723847471234"));
+    // int64_t: min and max parse; one past either end is rejected.
+    BOOST_CHECK(!ToIntegral<int64_t>("-9223372036854775809"));
+    BOOST_CHECK_EQUAL(ToIntegral<int64_t>("-9223372036854775808").value(), -9'223'372'036'854'775'807LL - 1LL);
+    BOOST_CHECK_EQUAL(ToIntegral<int64_t>("9223372036854775807").value(), 9'223'372'036'854'775'807);
+    BOOST_CHECK(!ToIntegral<int64_t>("9223372036854775808"));
+    // uint64_t: "-1" and max + 1 are rejected; 0 and max parse.
+    BOOST_CHECK(!ToIntegral<uint64_t>("-1"));
+    BOOST_CHECK_EQUAL(ToIntegral<uint64_t>("0").value(), 0U);
+    BOOST_CHECK_EQUAL(ToIntegral<uint64_t>("18446744073709551615").value(), 18'446'744'073'709'551'615ULL);
+    BOOST_CHECK(!ToIntegral<uint64_t>("18446744073709551616"));
+    // int32_t boundaries.
+    BOOST_CHECK(!ToIntegral<int32_t>("-2147483649"));
+    BOOST_CHECK_EQUAL(ToIntegral<int32_t>("-2147483648").value(), -2'147'483'648LL);
+    BOOST_CHECK_EQUAL(ToIntegral<int32_t>("2147483647").value(), 2'147'483'647);
+    BOOST_CHECK(!ToIntegral<int32_t>("2147483648"));
+    // uint32_t boundaries.
+    BOOST_CHECK(!ToIntegral<uint32_t>("-1"));
+    BOOST_CHECK_EQUAL(ToIntegral<uint32_t>("0").value(), 0U);
+    BOOST_CHECK_EQUAL(ToIntegral<uint32_t>("4294967295").value(), 4'294'967'295U);
+    BOOST_CHECK(!ToIntegral<uint32_t>("4294967296"));
+    // int16_t boundaries.
+    BOOST_CHECK(!ToIntegral<int16_t>("-32769"));
+    BOOST_CHECK_EQUAL(ToIntegral<int16_t>("-32768").value(), -32'768);
+    BOOST_CHECK_EQUAL(ToIntegral<int16_t>("32767").value(), 32'767);
+    BOOST_CHECK(!ToIntegral<int16_t>("32768"));
+    // uint16_t boundaries.
+    BOOST_CHECK(!ToIntegral<uint16_t>("-1"));
+    BOOST_CHECK_EQUAL(ToIntegral<uint16_t>("0").value(), 0U);
+    BOOST_CHECK_EQUAL(ToIntegral<uint16_t>("65535").value(), 65'535U);
+    BOOST_CHECK(!ToIntegral<uint16_t>("65536"));
+    // int8_t boundaries.
+    BOOST_CHECK(!ToIntegral<int8_t>("-129"));
+    BOOST_CHECK_EQUAL(ToIntegral<int8_t>("-128").value(), -128);
+    BOOST_CHECK_EQUAL(ToIntegral<int8_t>("127").value(), 127);
+    BOOST_CHECK(!ToIntegral<int8_t>("128"));
+    // uint8_t boundaries.
+    BOOST_CHECK(!ToIntegral<uint8_t>("-1"));
+    BOOST_CHECK_EQUAL(ToIntegral<uint8_t>("0").value(), 0U);
+    BOOST_CHECK_EQUAL(ToIntegral<uint8_t>("255").value(), 255U);
+    BOOST_CHECK(!ToIntegral<uint8_t>("256"));
+}
+
 BOOST_AUTO_TEST_CASE(test_ParseInt64)
 {
     int64_t n;
diff --git a/src/util/strencodings.cpp b/src/util/strencodings.cpp
index f514613f0d..0aa80ea0ae 100644
--- a/src/util/strencodings.cpp
+++ b/src/util/strencodings.cpp
@@ -11,8 +11,7 @@
 #include <algorithm>
 #include <cstdlib>
 #include <cstring>
-#include <errno.h>
-#include <limits>
+#include <optional>
 
 static const std::string CHARS_ALPHA_NUM = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
 
@@ -282,6 +281,32 @@ std::string DecodeBase32(const std::string& str, bool* pf_invalid)
     return std::string((const char*)vchRet.data(), vchRet.size());
 }
 
+[[nodiscard]] static bool ParsePrechecks(const std::string&);
+
+namespace {
+template <typename T>
+bool ParseIntegral(const std::string& str, T* out) // Shared implementation behind ParseInt32/ParseInt64/ParseUInt8/ParseUInt16/ParseUInt32/ParseUInt64
+{
+    static_assert(std::is_integral<T>::value);
+    if (!ParsePrechecks(str)) {
+        return false;
+    }
+    // Replicate the exact behavior of strtol/strtoll/strtoul/strtoull when
+    // handling leading +/- for backwards compatibility.
+    if (str.length() >= 2 && str[0] == '+' && str[1] == '-') { // "+-N" would turn into "-N" once the '+' is stripped below, so reject it here
+        return false;
+    }
+    const std::optional<T> opt_int = ToIntegral<T>((!str.empty() && str[0] == '+') ? str.substr(1) : str); // strip a single leading '+' before handing the rest to ToIntegral
+    if (!opt_int) {
+        return false;
+    }
+    if (out != nullptr) { // reached only on success, so *out is left untouched when parsing fails
+        *out = *opt_int;
+    }
+    return true;
+}
+}; // namespace
+
 [[nodiscard]] static bool ParsePrechecks(const std::string& str)
 {
     if (str.empty()) // No empty string allowed
@@ -293,95 +318,36 @@ std::string DecodeBase32(const std::string& str, bool* pf_invalid)
     return true;
 }
 
-bool ParseInt32(const std::string& str, int32_t *out)
+bool ParseInt32(const std::string& str, int32_t* out)
 {
-    if (!ParsePrechecks(str))
-        return false;
-    char *endp = nullptr;
-    errno = 0; // strtol will not set errno if valid
-    long int n = strtol(str.c_str(), &endp, 10);
-    if(out) *out = (int32_t)n;
-    // Note that strtol returns a *long int*, so even if strtol doesn't report an over/underflow
-    // we still have to check that the returned value is within the range of an *int32_t*. On 64-bit
-    // platforms the size of these types may be different.
-    return endp && *endp == 0 && !errno &&
-        n >= std::numeric_limits<int32_t>::min() &&
-        n <= std::numeric_limits<int32_t>::max();
+    return ParseIntegral<int32_t>(str, out);
 }
 
-bool ParseInt64(const std::string& str, int64_t *out)
+bool ParseInt64(const std::string& str, int64_t* out)
 {
-    if (!ParsePrechecks(str))
-        return false;
-    char *endp = nullptr;
-    errno = 0; // strtoll will not set errno if valid
-    long long int n = strtoll(str.c_str(), &endp, 10);
-    if(out) *out = (int64_t)n;
-    // Note that strtoll returns a *long long int*, so even if strtol doesn't report an over/underflow
-    // we still have to check that the returned value is within the range of an *int64_t*.
-    return endp && *endp == 0 && !errno &&
-        n >= std::numeric_limits<int64_t>::min() &&
-        n <= std::numeric_limits<int64_t>::max();
+    return ParseIntegral<int64_t>(str, out);
 }
 
-bool ParseUInt8(const std::string& str, uint8_t *out)
+bool ParseUInt8(const std::string& str, uint8_t* out)
 {
-    uint32_t u32;
-    if (!ParseUInt32(str, &u32) || u32 > std::numeric_limits<uint8_t>::max()) {
-        return false;
-    }
-    if (out != nullptr) {
-        *out = static_cast<uint8_t>(u32);
-    }
-    return true;
+    return ParseIntegral<uint8_t>(str, out);
 }
 
 bool ParseUInt16(const std::string& str, uint16_t* out)
 {
-    uint32_t u32;
-    if (!ParseUInt32(str, &u32) || u32 > std::numeric_limits<uint16_t>::max()) {
-        return false;
-    }
-    if (out != nullptr) {
-        *out = static_cast<uint16_t>(u32);
-    }
-    return true;
+    return ParseIntegral<uint16_t>(str, out);
 }
 
-bool ParseUInt32(const std::string& str, uint32_t *out)
+bool ParseUInt32(const std::string& str, uint32_t* out)
 {
-    if (!ParsePrechecks(str))
-        return false;
-    if (str.size() >= 1 && str[0] == '-') // Reject negative values, unfortunately strtoul accepts these by default if they fit in the range
-        return false;
-    char *endp = nullptr;
-    errno = 0; // strtoul will not set errno if valid
-    unsigned long int n = strtoul(str.c_str(), &endp, 10);
-    if(out) *out = (uint32_t)n;
-    // Note that strtoul returns a *unsigned long int*, so even if it doesn't report an over/underflow
-    // we still have to check that the returned value is within the range of an *uint32_t*. On 64-bit
-    // platforms the size of these types may be different.
-    return endp && *endp == 0 && !errno &&
-        n <= std::numeric_limits<uint32_t>::max();
+    return ParseIntegral<uint32_t>(str, out);
 }
 
-bool ParseUInt64(const std::string& str, uint64_t *out)
+bool ParseUInt64(const std::string& str, uint64_t* out)
 {
-    if (!ParsePrechecks(str))
-        return false;
-    if (str.size() >= 1 && str[0] == '-') // Reject negative values, unfortunately strtoull accepts these by default if they fit in the range
-        return false;
-    char *endp = nullptr;
-    errno = 0; // strtoull will not set errno if valid
-    unsigned long long int n = strtoull(str.c_str(), &endp, 10);
-    if(out) *out = (uint64_t)n;
-    // Note that strtoull returns a *unsigned long long int*, so even if it doesn't report an over/underflow
-    // we still have to check that the returned value is within the range of an *uint64_t*.
-    return endp && *endp == 0 && !errno &&
-        n <= std::numeric_limits<uint64_t>::max();
+    return ParseIntegral<uint64_t>(str, out);
 }
 
-
 bool ParseDouble(const std::string& str, double *out)
 {
     if (!ParsePrechecks(str))
diff --git a/src/util/strencodings.h b/src/util/strencodings.h
index 26dc0a0ce3..1217572c45 100644
--- a/src/util/strencodings.h
+++ b/src/util/strencodings.h
@@ -12,8 +12,10 @@
 #include <attributes.h>
 #include <span.h>
 
+#include <charconv>
 #include <cstdint>
 #include <iterator>
+#include <optional>
 #include <string>
 #include <vector>
 
@@ -95,6 +97,24 @@ constexpr inline bool IsSpace(char c) noexcept {
 }
 
 /**
+ * Convert string to integral type T using std::from_chars.
+ * The whole of str must be consumed; nothing is skipped or trimmed beforehand.
+ * @returns std::nullopt if the entire string could not be parsed, or if the
+ *   parsed value is not in the range representable by the type T.
+ */
+template <typename T>
+std::optional<T> ToIntegral(const std::string& str)
+{
+    static_assert(std::is_integral<T>::value);
+    T result; // only read after from_chars has reported success
+    const auto [first_nonmatching, error_condition] = std::from_chars(str.data(), str.data() + str.size(), result);
+    if (first_nonmatching != str.data() + str.size() || error_condition != std::errc{}) { // unparsed tail left over, or from_chars reported an error
+        return std::nullopt;
+    }
+    return {result};
+}
+
+/**
  * Convert string to signed 32-bit integer with strict parse error feedback.
  * @returns true if the entire string could be parsed as valid integer,
  *   false if not the entire string could be parsed or when overflow or underflow occurred.
diff --git a/test/lint/lint-locale-dependence.sh b/test/lint/lint-locale-dependence.sh
index d6312270e7..fcc4883d0b 100755
--- a/test/lint/lint-locale-dependence.sh
+++ b/test/lint/lint-locale-dependence.sh
@@ -47,11 +47,11 @@ KNOWN_VIOLATIONS=(
     "src/test/dbwrapper_tests.cpp:.*snprintf"
     "src/test/fuzz/locale.cpp"
     "src/test/fuzz/parse_numbers.cpp:.*atoi"
+    "src/test/fuzz/string.cpp"
     "src/torcontrol.cpp:.*atoi"
     "src/torcontrol.cpp:.*strtol"
     "src/util/strencodings.cpp:.*atoi"
-    "src/util/strencodings.cpp:.*strtol"
-    "src/util/strencodings.cpp:.*strtoul"
+    "src/util/strencodings.cpp:.*strtoll"
     "src/util/strencodings.h:.*atoi"
     "src/util/system.cpp:.*atoi"
 )
author	W. J. van der Laan <laanwj@protonmail.com>	2021-09-30 14:47:03 +0200
committer	W. J. van der Laan <laanwj@protonmail.com>	2021-09-30 15:14:58 +0200
commit	2d8e0c0c3c0d3c4cee7bb52d1edf501f40c53463 (patch)
tree	8266dac1d118b5985f5f84b1550752198bc21f38
parent	1cf7fb9fd65820b298a0eb28f6e2c4c7409c78d1 (diff)
parent	4747db876154ddd828c03d9eda10ecf8b25d8dc8 (diff)