aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRussell Yanofsky <russ@yanofsky.org>2021-11-11 09:54:21 -0500
committerRussell Yanofsky <russ@yanofsky.org>2021-11-15 12:08:49 -0500
commit9b575f1c734c052b695ce921fb6412b22c18fdb4 (patch)
treeaa4c80b07b92e0385ae0c45c62b44fcb86045310
parent7f0f853373703a020529dd9394fca525475086b7 (diff)
Improve fs::PathToString documentation
-rw-r--r--doc/developer-notes.md6
-rw-r--r--src/dbwrapper.cpp4
-rw-r--r--src/fs.h45
3 files changed, 34 insertions, 21 deletions
diff --git a/doc/developer-notes.md b/doc/developer-notes.md
index 7ff1d36442..1888897856 100644
--- a/doc/developer-notes.md
+++ b/doc/developer-notes.md
@@ -1254,6 +1254,12 @@ A few guidelines for introducing and reviewing new RPC interfaces:
- *Rationale*: User-facing consistency.
+- Use `fs::path::u8string()` and `fs::u8path()` functions when converting paths
+ to JSON strings, not `fs::PathToString` and `fs::PathFromString`.
+
+ - *Rationale*: JSON strings are Unicode strings, not byte strings, and
+ RFC8259 requires JSON to be encoded as UTF-8.
+
Internal interface guidelines
-----------------------------
diff --git a/src/dbwrapper.cpp b/src/dbwrapper.cpp
index 2fdc54464a..dbae2c45f2 100644
--- a/src/dbwrapper.cpp
+++ b/src/dbwrapper.cpp
@@ -136,6 +136,10 @@ CDBWrapper::CDBWrapper(const fs::path& path, size_t nCacheSize, bool fMemory, bo
TryCreateDirectories(path);
LogPrintf("Opening LevelDB in %s\n", fs::PathToString(path));
}
+ // PathToString() return value is safe to pass to leveldb open function,
+ // because on POSIX leveldb passes the byte string directly to ::open(), and
+ // on Windows it converts from UTF-8 to UTF-16 before calling ::CreateFileW
+ // (see env_posix.cc and env_windows.cc).
leveldb::Status status = leveldb::DB::Open(options, fs::PathToString(path), &pdb);
dbwrapper_private::HandleError(status);
LogPrintf("Opened LevelDB successfully\n");
diff --git a/src/fs.h b/src/fs.h
index 4a0bf39e95..3cf4371fb4 100644
--- a/src/fs.h
+++ b/src/fs.h
@@ -94,31 +94,34 @@ static inline path operator+(path p1, path p2)
/**
* Convert path object to byte string. On POSIX, paths natively are byte
- * strings so this is trivial. On Windows, paths natively are Unicode, so an
- * encoding step is necessary.
+ * strings, so this is trivial. On Windows, paths natively are Unicode, so an
+ * encoding step is necessary. The inverse of \ref PathToString is \ref
+ * PathFromString. The strings returned and parsed by these functions can be
+ * used to call POSIX APIs, and for roundtrip conversion, logging, and
+ * debugging.
*
- * The inverse of \ref PathToString is \ref PathFromString. The strings
- * returned and parsed by these functions can be used to call POSIX APIs, and
- * for roundtrip conversion, logging, and debugging. But they are not
- * guaranteed to be valid UTF-8, and are generally meant to be used internally,
- * not externally. When communicating with external programs and libraries that
- * require UTF-8, fs::path::u8string() and fs::u8path() methods can be used.
- * For other applications, if support for non UTF-8 paths is required, or if
- * higher-level JSON or XML or URI or C-style escapes are preferred, it may be
- * also be appropriate to use different path encoding functions.
- *
- * Implementation note: On Windows, the std::filesystem::path(string)
- * constructor and std::filesystem::path::string() method are not safe to use
- * here, because these methods encode the path using C++'s narrow multibyte
- * encoding, which on Windows corresponds to the current "code page", which is
- * unpredictable and typically not able to represent all valid paths. So
- * std::filesystem::path::u8string() and std::filesystem::u8path() functions
- * are used instead on Windows. On POSIX, u8string/u8path functions are not
- * safe to use because paths are not always valid UTF-8, so plain string
- * methods which do not transform the path there are used.
+ * Because \ref PathToString and \ref PathFromString functions don't specify an
+ * encoding, they are meant to be used internally, not externally. They are not
+ * appropriate to use in applications requiring UTF-8, where
+ * fs::path::u8string() and fs::u8path() methods should be used instead. Other
+ * applications could require still different encodings. For example, JSON, XML,
+ * or URI applications might prefer to use higher level escapes (\uXXXX or
+ * &XXXX; or %XX) instead of multibyte encoding. Rust, Python, Java applications
+ * may require encoding paths with their respective UTF-8 derivatives WTF-8,
+ * PEP-383, and CESU-8 (see https://en.wikipedia.org/wiki/UTF-8#Derivatives).
*/
static inline std::string PathToString(const path& path)
{
+ // Implementation note: On Windows, the std::filesystem::path(string)
+ // constructor and std::filesystem::path::string() method are not safe to
+ // use here, because these methods encode the path using C++'s narrow
+ // multibyte encoding, which on Windows corresponds to the current "code
+ // page", which is unpredictable and typically not able to represent all
+ // valid paths. So std::filesystem::path::u8string() and
+ // std::filesystem::u8path() functions are used instead on Windows. On
+ // POSIX, u8string/u8path functions are not safe to use because paths are
+ // not always valid UTF-8, so plain string methods, which do not transform
+ // the path, are used there.
#ifdef WIN32
return path.u8string();
#else