aboutsummaryrefslogtreecommitdiff
path: root/src/leveldb/db
diff options
context:
space:
mode:
Diffstat (limited to 'src/leveldb/db')
-rw-r--r--src/leveldb/db/autocompact_test.cc112
-rw-r--r--src/leveldb/db/builder.cc79
-rw-r--r--src/leveldb/db/builder.h30
-rw-r--r--src/leveldb/db/c.cc562
-rw-r--r--src/leveldb/db/c_test.c384
-rw-r--r--src/leveldb/db/corruption_test.cc362
-rw-r--r--src/leveldb/db/db_impl.cc1555
-rw-r--r--src/leveldb/db/db_impl.h217
-rw-r--r--src/leveldb/db/db_iter.cc318
-rw-r--r--src/leveldb/db/db_iter.h26
-rw-r--r--src/leveldb/db/db_test.cc2302
-rw-r--r--src/leveldb/db/dbformat.cc137
-rw-r--r--src/leveldb/db/dbformat.h224
-rw-r--r--src/leveldb/db/dbformat_test.cc131
-rw-r--r--src/leveldb/db/dumpfile.cc232
-rw-r--r--src/leveldb/db/fault_injection_test.cc552
-rw-r--r--src/leveldb/db/filename.cc141
-rw-r--r--src/leveldb/db/filename.h84
-rw-r--r--src/leveldb/db/filename_test.cc131
-rw-r--r--src/leveldb/db/leveldbutil.cc64
-rw-r--r--src/leveldb/db/log_format.h35
-rw-r--r--src/leveldb/db/log_reader.cc274
-rw-r--r--src/leveldb/db/log_reader.h112
-rw-r--r--src/leveldb/db/log_test.cc562
-rw-r--r--src/leveldb/db/log_writer.cc111
-rw-r--r--src/leveldb/db/log_writer.h54
-rw-r--r--src/leveldb/db/memtable.cc137
-rw-r--r--src/leveldb/db/memtable.h87
-rw-r--r--src/leveldb/db/recovery_test.cc330
-rw-r--r--src/leveldb/db/repair.cc450
-rw-r--r--src/leveldb/db/skiplist.h382
-rw-r--r--src/leveldb/db/skiplist_test.cc369
-rw-r--r--src/leveldb/db/snapshot.h95
-rw-r--r--src/leveldb/db/table_cache.cc120
-rw-r--r--src/leveldb/db/table_cache.h58
-rw-r--r--src/leveldb/db/version_edit.cc257
-rw-r--r--src/leveldb/db/version_edit.h106
-rw-r--r--src/leveldb/db/version_edit_test.cc44
-rw-r--r--src/leveldb/db/version_set.cc1562
-rw-r--r--src/leveldb/db/version_set.h393
-rw-r--r--src/leveldb/db/version_set_test.cc332
-rw-r--r--src/leveldb/db/write_batch.cc150
-rw-r--r--src/leveldb/db/write_batch_internal.h45
-rw-r--r--src/leveldb/db/write_batch_test.cc137
44 files changed, 13845 insertions, 0 deletions
diff --git a/src/leveldb/db/autocompact_test.cc b/src/leveldb/db/autocompact_test.cc
new file mode 100644
index 0000000000..e6c97a05a6
--- /dev/null
+++ b/src/leveldb/db/autocompact_test.cc
@@ -0,0 +1,112 @@
+// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl.h"
+#include "leveldb/cache.h"
+#include "leveldb/db.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+class AutoCompactTest {
+ public:
+ AutoCompactTest() {
+ dbname_ = test::TmpDir() + "/autocompact_test";
+ tiny_cache_ = NewLRUCache(100);
+ options_.block_cache = tiny_cache_;
+ DestroyDB(dbname_, options_);
+ options_.create_if_missing = true;
+ options_.compression = kNoCompression;
+ ASSERT_OK(DB::Open(options_, dbname_, &db_));
+ }
+
+ ~AutoCompactTest() {
+ delete db_;
+ DestroyDB(dbname_, Options());
+ delete tiny_cache_;
+ }
+
+ std::string Key(int i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key%06d", i);
+ return std::string(buf);
+ }
+
+ uint64_t Size(const Slice& start, const Slice& limit) {
+ Range r(start, limit);
+ uint64_t size;
+ db_->GetApproximateSizes(&r, 1, &size);
+ return size;
+ }
+
+ void DoReads(int n);
+
+ private:
+ std::string dbname_;
+ Cache* tiny_cache_;
+ Options options_;
+ DB* db_;
+};
+
+static const int kValueSize = 200 * 1024;
+static const int kTotalSize = 100 * 1024 * 1024;
+static const int kCount = kTotalSize / kValueSize;
+
+// Read through the first n keys repeatedly and check that they get
+// compacted (verified by checking the size of the key space).
+void AutoCompactTest::DoReads(int n) {
+ std::string value(kValueSize, 'x');
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+
+ // Fill database
+ for (int i = 0; i < kCount; i++) {
+ ASSERT_OK(db_->Put(WriteOptions(), Key(i), value));
+ }
+ ASSERT_OK(dbi->TEST_CompactMemTable());
+
+ // Delete everything
+ for (int i = 0; i < kCount; i++) {
+ ASSERT_OK(db_->Delete(WriteOptions(), Key(i)));
+ }
+ ASSERT_OK(dbi->TEST_CompactMemTable());
+
+ // Get initial measurement of the space we will be reading.
+ const int64_t initial_size = Size(Key(0), Key(n));
+ const int64_t initial_other_size = Size(Key(n), Key(kCount));
+
+ // Read until size drops significantly.
+ std::string limit_key = Key(n);
+ for (int read = 0; true; read++) {
+ ASSERT_LT(read, 100) << "Taking too long to compact";
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst();
+ iter->Valid() && iter->key().ToString() < limit_key; iter->Next()) {
+ // Drop data
+ }
+ delete iter;
+ // Wait a little bit to allow any triggered compactions to complete.
+ Env::Default()->SleepForMicroseconds(1000000);
+ uint64_t size = Size(Key(0), Key(n));
+ fprintf(stderr, "iter %3d => %7.3f MB [other %7.3f MB]\n", read + 1,
+ size / 1048576.0, Size(Key(n), Key(kCount)) / 1048576.0);
+ if (size <= initial_size / 10) {
+ break;
+ }
+ }
+
+ // Verify that the size of the key space not touched by the reads
+ // is pretty much unchanged.
+ const int64_t final_other_size = Size(Key(n), Key(kCount));
+ ASSERT_LE(final_other_size, initial_other_size + 1048576);
+ ASSERT_GE(final_other_size, initial_other_size / 5 - 1048576);
+}
+
+TEST(AutoCompactTest, ReadAll) { DoReads(kCount); }
+
+TEST(AutoCompactTest, ReadHalf) { DoReads(kCount / 2); }
+
+} // namespace leveldb
+
+int main(int argc, char** argv) { return leveldb::test::RunAllTests(); }
diff --git a/src/leveldb/db/builder.cc b/src/leveldb/db/builder.cc
new file mode 100644
index 0000000000..9520ee4535
--- /dev/null
+++ b/src/leveldb/db/builder.cc
@@ -0,0 +1,79 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/builder.h"
+
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/iterator.h"
+
+namespace leveldb {
+
+Status BuildTable(const std::string& dbname, Env* env, const Options& options,
+ TableCache* table_cache, Iterator* iter, FileMetaData* meta) {
+ Status s;
+ meta->file_size = 0;
+ iter->SeekToFirst();
+
+ std::string fname = TableFileName(dbname, meta->number);
+ if (iter->Valid()) {
+ WritableFile* file;
+ s = env->NewWritableFile(fname, &file);
+ if (!s.ok()) {
+ return s;
+ }
+
+ TableBuilder* builder = new TableBuilder(options, file);
+ meta->smallest.DecodeFrom(iter->key());
+ for (; iter->Valid(); iter->Next()) {
+ Slice key = iter->key();
+ meta->largest.DecodeFrom(key);
+ builder->Add(key, iter->value());
+ }
+
+ // Finish and check for builder errors
+ s = builder->Finish();
+ if (s.ok()) {
+ meta->file_size = builder->FileSize();
+ assert(meta->file_size > 0);
+ }
+ delete builder;
+
+ // Finish and check for file errors
+ if (s.ok()) {
+ s = file->Sync();
+ }
+ if (s.ok()) {
+ s = file->Close();
+ }
+ delete file;
+ file = nullptr;
+
+ if (s.ok()) {
+ // Verify that the table is usable
+ Iterator* it = table_cache->NewIterator(ReadOptions(), meta->number,
+ meta->file_size);
+ s = it->status();
+ delete it;
+ }
+ }
+
+ // Check for input iterator errors
+ if (!iter->status().ok()) {
+ s = iter->status();
+ }
+
+ if (s.ok() && meta->file_size > 0) {
+ // Keep it
+ } else {
+ env->DeleteFile(fname);
+ }
+ return s;
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/builder.h b/src/leveldb/db/builder.h
new file mode 100644
index 0000000000..7bd0b8049b
--- /dev/null
+++ b/src/leveldb/db/builder.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_BUILDER_H_
+#define STORAGE_LEVELDB_DB_BUILDER_H_
+
+#include "leveldb/status.h"
+
+namespace leveldb {
+
+struct Options;
+struct FileMetaData;
+
+class Env;
+class Iterator;
+class TableCache;
+class VersionEdit;
+
+// Build a Table file from the contents of *iter. The generated file
+// will be named according to meta->number. On success, the rest of
+// *meta will be filled with metadata about the generated table.
+// If no data is present in *iter, meta->file_size will be set to
+// zero, and no Table file will be produced.
+Status BuildTable(const std::string& dbname, Env* env, const Options& options,
+ TableCache* table_cache, Iterator* iter, FileMetaData* meta);
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_BUILDER_H_
diff --git a/src/leveldb/db/c.cc b/src/leveldb/db/c.cc
new file mode 100644
index 0000000000..3a492f9ac5
--- /dev/null
+++ b/src/leveldb/db/c.cc
@@ -0,0 +1,562 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/c.h"
+
+#include <cstdint>
+#include <cstdlib>
+
+#include "leveldb/cache.h"
+#include "leveldb/comparator.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/filter_policy.h"
+#include "leveldb/iterator.h"
+#include "leveldb/options.h"
+#include "leveldb/status.h"
+#include "leveldb/write_batch.h"
+
+using leveldb::Cache;
+using leveldb::Comparator;
+using leveldb::CompressionType;
+using leveldb::DB;
+using leveldb::Env;
+using leveldb::FileLock;
+using leveldb::FilterPolicy;
+using leveldb::Iterator;
+using leveldb::kMajorVersion;
+using leveldb::kMinorVersion;
+using leveldb::Logger;
+using leveldb::NewBloomFilterPolicy;
+using leveldb::NewLRUCache;
+using leveldb::Options;
+using leveldb::RandomAccessFile;
+using leveldb::Range;
+using leveldb::ReadOptions;
+using leveldb::SequentialFile;
+using leveldb::Slice;
+using leveldb::Snapshot;
+using leveldb::Status;
+using leveldb::WritableFile;
+using leveldb::WriteBatch;
+using leveldb::WriteOptions;
+
+extern "C" {
+
+struct leveldb_t {
+ DB* rep;
+};
+struct leveldb_iterator_t {
+ Iterator* rep;
+};
+struct leveldb_writebatch_t {
+ WriteBatch rep;
+};
+struct leveldb_snapshot_t {
+ const Snapshot* rep;
+};
+struct leveldb_readoptions_t {
+ ReadOptions rep;
+};
+struct leveldb_writeoptions_t {
+ WriteOptions rep;
+};
+struct leveldb_options_t {
+ Options rep;
+};
+struct leveldb_cache_t {
+ Cache* rep;
+};
+struct leveldb_seqfile_t {
+ SequentialFile* rep;
+};
+struct leveldb_randomfile_t {
+ RandomAccessFile* rep;
+};
+struct leveldb_writablefile_t {
+ WritableFile* rep;
+};
+struct leveldb_logger_t {
+ Logger* rep;
+};
+struct leveldb_filelock_t {
+ FileLock* rep;
+};
+
+struct leveldb_comparator_t : public Comparator {
+ ~leveldb_comparator_t() override { (*destructor_)(state_); }
+
+ int Compare(const Slice& a, const Slice& b) const override {
+ return (*compare_)(state_, a.data(), a.size(), b.data(), b.size());
+ }
+
+ const char* Name() const override { return (*name_)(state_); }
+
+ // No-ops since the C binding does not support key shortening methods.
+ void FindShortestSeparator(std::string*, const Slice&) const override {}
+ void FindShortSuccessor(std::string* key) const override {}
+
+ void* state_;
+ void (*destructor_)(void*);
+ int (*compare_)(void*, const char* a, size_t alen, const char* b,
+ size_t blen);
+ const char* (*name_)(void*);
+};
+
+struct leveldb_filterpolicy_t : public FilterPolicy {
+ ~leveldb_filterpolicy_t() override { (*destructor_)(state_); }
+
+ const char* Name() const override { return (*name_)(state_); }
+
+ void CreateFilter(const Slice* keys, int n, std::string* dst) const override {
+ std::vector<const char*> key_pointers(n);
+ std::vector<size_t> key_sizes(n);
+ for (int i = 0; i < n; i++) {
+ key_pointers[i] = keys[i].data();
+ key_sizes[i] = keys[i].size();
+ }
+ size_t len;
+ char* filter = (*create_)(state_, &key_pointers[0], &key_sizes[0], n, &len);
+ dst->append(filter, len);
+ free(filter);
+ }
+
+ bool KeyMayMatch(const Slice& key, const Slice& filter) const override {
+ return (*key_match_)(state_, key.data(), key.size(), filter.data(),
+ filter.size());
+ }
+
+ void* state_;
+ void (*destructor_)(void*);
+ const char* (*name_)(void*);
+ char* (*create_)(void*, const char* const* key_array,
+ const size_t* key_length_array, int num_keys,
+ size_t* filter_length);
+ uint8_t (*key_match_)(void*, const char* key, size_t length,
+ const char* filter, size_t filter_length);
+};
+
+struct leveldb_env_t {
+ Env* rep;
+ bool is_default;
+};
+
+static bool SaveError(char** errptr, const Status& s) {
+ assert(errptr != nullptr);
+ if (s.ok()) {
+ return false;
+ } else if (*errptr == nullptr) {
+ *errptr = strdup(s.ToString().c_str());
+ } else {
+ // TODO(sanjay): Merge with existing error?
+ free(*errptr);
+ *errptr = strdup(s.ToString().c_str());
+ }
+ return true;
+}
+
+static char* CopyString(const std::string& str) {
+ char* result = reinterpret_cast<char*>(malloc(sizeof(char) * str.size()));
+ memcpy(result, str.data(), sizeof(char) * str.size());
+ return result;
+}
+
+leveldb_t* leveldb_open(const leveldb_options_t* options, const char* name,
+ char** errptr) {
+ DB* db;
+ if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) {
+ return nullptr;
+ }
+ leveldb_t* result = new leveldb_t;
+ result->rep = db;
+ return result;
+}
+
+void leveldb_close(leveldb_t* db) {
+ delete db->rep;
+ delete db;
+}
+
+void leveldb_put(leveldb_t* db, const leveldb_writeoptions_t* options,
+ const char* key, size_t keylen, const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr,
+ db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
+}
+
+void leveldb_delete(leveldb_t* db, const leveldb_writeoptions_t* options,
+ const char* key, size_t keylen, char** errptr) {
+ SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen)));
+}
+
+void leveldb_write(leveldb_t* db, const leveldb_writeoptions_t* options,
+ leveldb_writebatch_t* batch, char** errptr) {
+ SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
+}
+
+char* leveldb_get(leveldb_t* db, const leveldb_readoptions_t* options,
+ const char* key, size_t keylen, size_t* vallen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+leveldb_iterator_t* leveldb_create_iterator(
+ leveldb_t* db, const leveldb_readoptions_t* options) {
+ leveldb_iterator_t* result = new leveldb_iterator_t;
+ result->rep = db->rep->NewIterator(options->rep);
+ return result;
+}
+
+const leveldb_snapshot_t* leveldb_create_snapshot(leveldb_t* db) {
+ leveldb_snapshot_t* result = new leveldb_snapshot_t;
+ result->rep = db->rep->GetSnapshot();
+ return result;
+}
+
+void leveldb_release_snapshot(leveldb_t* db,
+ const leveldb_snapshot_t* snapshot) {
+ db->rep->ReleaseSnapshot(snapshot->rep);
+ delete snapshot;
+}
+
+char* leveldb_property_value(leveldb_t* db, const char* propname) {
+ std::string tmp;
+ if (db->rep->GetProperty(Slice(propname), &tmp)) {
+ // We use strdup() since we expect human readable output.
+ return strdup(tmp.c_str());
+ } else {
+ return nullptr;
+ }
+}
+
+void leveldb_approximate_sizes(leveldb_t* db, int num_ranges,
+ const char* const* range_start_key,
+ const size_t* range_start_key_len,
+ const char* const* range_limit_key,
+ const size_t* range_limit_key_len,
+ uint64_t* sizes) {
+ Range* ranges = new Range[num_ranges];
+ for (int i = 0; i < num_ranges; i++) {
+ ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+ ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+ }
+ db->rep->GetApproximateSizes(ranges, num_ranges, sizes);
+ delete[] ranges;
+}
+
+void leveldb_compact_range(leveldb_t* db, const char* start_key,
+ size_t start_key_len, const char* limit_key,
+ size_t limit_key_len) {
+ Slice a, b;
+ db->rep->CompactRange(
+ // Pass null Slice if corresponding "const char*" is null
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void leveldb_destroy_db(const leveldb_options_t* options, const char* name,
+ char** errptr) {
+ SaveError(errptr, DestroyDB(name, options->rep));
+}
+
+void leveldb_repair_db(const leveldb_options_t* options, const char* name,
+ char** errptr) {
+ SaveError(errptr, RepairDB(name, options->rep));
+}
+
+void leveldb_iter_destroy(leveldb_iterator_t* iter) {
+ delete iter->rep;
+ delete iter;
+}
+
+uint8_t leveldb_iter_valid(const leveldb_iterator_t* iter) {
+ return iter->rep->Valid();
+}
+
+void leveldb_iter_seek_to_first(leveldb_iterator_t* iter) {
+ iter->rep->SeekToFirst();
+}
+
+void leveldb_iter_seek_to_last(leveldb_iterator_t* iter) {
+ iter->rep->SeekToLast();
+}
+
+void leveldb_iter_seek(leveldb_iterator_t* iter, const char* k, size_t klen) {
+ iter->rep->Seek(Slice(k, klen));
+}
+
+void leveldb_iter_next(leveldb_iterator_t* iter) { iter->rep->Next(); }
+
+void leveldb_iter_prev(leveldb_iterator_t* iter) { iter->rep->Prev(); }
+
+const char* leveldb_iter_key(const leveldb_iterator_t* iter, size_t* klen) {
+ Slice s = iter->rep->key();
+ *klen = s.size();
+ return s.data();
+}
+
+const char* leveldb_iter_value(const leveldb_iterator_t* iter, size_t* vlen) {
+ Slice s = iter->rep->value();
+ *vlen = s.size();
+ return s.data();
+}
+
+void leveldb_iter_get_error(const leveldb_iterator_t* iter, char** errptr) {
+ SaveError(errptr, iter->rep->status());
+}
+
+leveldb_writebatch_t* leveldb_writebatch_create() {
+ return new leveldb_writebatch_t;
+}
+
+void leveldb_writebatch_destroy(leveldb_writebatch_t* b) { delete b; }
+
+void leveldb_writebatch_clear(leveldb_writebatch_t* b) { b->rep.Clear(); }
+
+void leveldb_writebatch_put(leveldb_writebatch_t* b, const char* key,
+ size_t klen, const char* val, size_t vlen) {
+ b->rep.Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void leveldb_writebatch_delete(leveldb_writebatch_t* b, const char* key,
+ size_t klen) {
+ b->rep.Delete(Slice(key, klen));
+}
+
+void leveldb_writebatch_iterate(const leveldb_writebatch_t* b, void* state,
+ void (*put)(void*, const char* k, size_t klen,
+ const char* v, size_t vlen),
+ void (*deleted)(void*, const char* k,
+ size_t klen)) {
+ class H : public WriteBatch::Handler {
+ public:
+ void* state_;
+ void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
+ void (*deleted_)(void*, const char* k, size_t klen);
+ void Put(const Slice& key, const Slice& value) override {
+ (*put_)(state_, key.data(), key.size(), value.data(), value.size());
+ }
+ void Delete(const Slice& key) override {
+ (*deleted_)(state_, key.data(), key.size());
+ }
+ };
+ H handler;
+ handler.state_ = state;
+ handler.put_ = put;
+ handler.deleted_ = deleted;
+ b->rep.Iterate(&handler);
+}
+
+void leveldb_writebatch_append(leveldb_writebatch_t* destination,
+ const leveldb_writebatch_t* source) {
+ destination->rep.Append(source->rep);
+}
+
+leveldb_options_t* leveldb_options_create() { return new leveldb_options_t; }
+
+void leveldb_options_destroy(leveldb_options_t* options) { delete options; }
+
+void leveldb_options_set_comparator(leveldb_options_t* opt,
+ leveldb_comparator_t* cmp) {
+ opt->rep.comparator = cmp;
+}
+
+void leveldb_options_set_filter_policy(leveldb_options_t* opt,
+ leveldb_filterpolicy_t* policy) {
+ opt->rep.filter_policy = policy;
+}
+
+void leveldb_options_set_create_if_missing(leveldb_options_t* opt, uint8_t v) {
+ opt->rep.create_if_missing = v;
+}
+
+void leveldb_options_set_error_if_exists(leveldb_options_t* opt, uint8_t v) {
+ opt->rep.error_if_exists = v;
+}
+
+void leveldb_options_set_paranoid_checks(leveldb_options_t* opt, uint8_t v) {
+ opt->rep.paranoid_checks = v;
+}
+
+void leveldb_options_set_env(leveldb_options_t* opt, leveldb_env_t* env) {
+ opt->rep.env = (env ? env->rep : nullptr);
+}
+
+void leveldb_options_set_info_log(leveldb_options_t* opt, leveldb_logger_t* l) {
+ opt->rep.info_log = (l ? l->rep : nullptr);
+}
+
+void leveldb_options_set_write_buffer_size(leveldb_options_t* opt, size_t s) {
+ opt->rep.write_buffer_size = s;
+}
+
+void leveldb_options_set_max_open_files(leveldb_options_t* opt, int n) {
+ opt->rep.max_open_files = n;
+}
+
+void leveldb_options_set_cache(leveldb_options_t* opt, leveldb_cache_t* c) {
+ opt->rep.block_cache = c->rep;
+}
+
+void leveldb_options_set_block_size(leveldb_options_t* opt, size_t s) {
+ opt->rep.block_size = s;
+}
+
+void leveldb_options_set_block_restart_interval(leveldb_options_t* opt, int n) {
+ opt->rep.block_restart_interval = n;
+}
+
+void leveldb_options_set_max_file_size(leveldb_options_t* opt, size_t s) {
+ opt->rep.max_file_size = s;
+}
+
+void leveldb_options_set_compression(leveldb_options_t* opt, int t) {
+ opt->rep.compression = static_cast<CompressionType>(t);
+}
+
+leveldb_comparator_t* leveldb_comparator_create(
+ void* state, void (*destructor)(void*),
+ int (*compare)(void*, const char* a, size_t alen, const char* b,
+ size_t blen),
+ const char* (*name)(void*)) {
+ leveldb_comparator_t* result = new leveldb_comparator_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->compare_ = compare;
+ result->name_ = name;
+ return result;
+}
+
+void leveldb_comparator_destroy(leveldb_comparator_t* cmp) { delete cmp; }
+
+leveldb_filterpolicy_t* leveldb_filterpolicy_create(
+ void* state, void (*destructor)(void*),
+ char* (*create_filter)(void*, const char* const* key_array,
+ const size_t* key_length_array, int num_keys,
+ size_t* filter_length),
+ uint8_t (*key_may_match)(void*, const char* key, size_t length,
+ const char* filter, size_t filter_length),
+ const char* (*name)(void*)) {
+ leveldb_filterpolicy_t* result = new leveldb_filterpolicy_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->create_ = create_filter;
+ result->key_match_ = key_may_match;
+ result->name_ = name;
+ return result;
+}
+
+void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t* filter) {
+ delete filter;
+}
+
+leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(int bits_per_key) {
+ // Make a leveldb_filterpolicy_t, but override all of its methods so
+ // they delegate to a NewBloomFilterPolicy() instead of user
+ // supplied C functions.
+ struct Wrapper : public leveldb_filterpolicy_t {
+ static void DoNothing(void*) {}
+
+ ~Wrapper() { delete rep_; }
+ const char* Name() const { return rep_->Name(); }
+ void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+ return rep_->CreateFilter(keys, n, dst);
+ }
+ bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+ return rep_->KeyMayMatch(key, filter);
+ }
+
+ const FilterPolicy* rep_;
+ };
+ Wrapper* wrapper = new Wrapper;
+ wrapper->rep_ = NewBloomFilterPolicy(bits_per_key);
+ wrapper->state_ = nullptr;
+ wrapper->destructor_ = &Wrapper::DoNothing;
+ return wrapper;
+}
+
+leveldb_readoptions_t* leveldb_readoptions_create() {
+ return new leveldb_readoptions_t;
+}
+
+void leveldb_readoptions_destroy(leveldb_readoptions_t* opt) { delete opt; }
+
+void leveldb_readoptions_set_verify_checksums(leveldb_readoptions_t* opt,
+ uint8_t v) {
+ opt->rep.verify_checksums = v;
+}
+
+void leveldb_readoptions_set_fill_cache(leveldb_readoptions_t* opt, uint8_t v) {
+ opt->rep.fill_cache = v;
+}
+
+void leveldb_readoptions_set_snapshot(leveldb_readoptions_t* opt,
+ const leveldb_snapshot_t* snap) {
+ opt->rep.snapshot = (snap ? snap->rep : nullptr);
+}
+
+leveldb_writeoptions_t* leveldb_writeoptions_create() {
+ return new leveldb_writeoptions_t;
+}
+
+void leveldb_writeoptions_destroy(leveldb_writeoptions_t* opt) { delete opt; }
+
+void leveldb_writeoptions_set_sync(leveldb_writeoptions_t* opt, uint8_t v) {
+ opt->rep.sync = v;
+}
+
+leveldb_cache_t* leveldb_cache_create_lru(size_t capacity) {
+ leveldb_cache_t* c = new leveldb_cache_t;
+ c->rep = NewLRUCache(capacity);
+ return c;
+}
+
+void leveldb_cache_destroy(leveldb_cache_t* cache) {
+ delete cache->rep;
+ delete cache;
+}
+
+leveldb_env_t* leveldb_create_default_env() {
+ leveldb_env_t* result = new leveldb_env_t;
+ result->rep = Env::Default();
+ result->is_default = true;
+ return result;
+}
+
+void leveldb_env_destroy(leveldb_env_t* env) {
+ if (!env->is_default) delete env->rep;
+ delete env;
+}
+
+char* leveldb_env_get_test_directory(leveldb_env_t* env) {
+ std::string result;
+ if (!env->rep->GetTestDirectory(&result).ok()) {
+ return nullptr;
+ }
+
+ char* buffer = static_cast<char*>(malloc(result.size() + 1));
+ memcpy(buffer, result.data(), result.size());
+ buffer[result.size()] = '\0';
+ return buffer;
+}
+
+void leveldb_free(void* ptr) { free(ptr); }
+
+int leveldb_major_version() { return kMajorVersion; }
+
+int leveldb_minor_version() { return kMinorVersion; }
+
+} // end extern "C"
diff --git a/src/leveldb/db/c_test.c b/src/leveldb/db/c_test.c
new file mode 100644
index 0000000000..16c77eed6a
--- /dev/null
+++ b/src/leveldb/db/c_test.c
@@ -0,0 +1,384 @@
+/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+ Use of this source code is governed by a BSD-style license that can be
+ found in the LICENSE file. See the AUTHORS file for names of contributors. */
+
+#include "leveldb/c.h"
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+const char* phase = "";
+
+static void StartPhase(const char* name) {
+ fprintf(stderr, "=== Test %s\n", name);
+ phase = name;
+}
+
+#define CheckNoError(err) \
+ if ((err) != NULL) { \
+ fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \
+ abort(); \
+ }
+
+#define CheckCondition(cond) \
+ if (!(cond)) { \
+ fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
+ abort(); \
+ }
+
+static void CheckEqual(const char* expected, const char* v, size_t n) {
+ if (expected == NULL && v == NULL) {
+ // ok
+ } else if (expected != NULL && v != NULL && n == strlen(expected) &&
+ memcmp(expected, v, n) == 0) {
+ // ok
+ return;
+ } else {
+ fprintf(stderr, "%s: expected '%s', got '%s'\n",
+ phase,
+ (expected ? expected : "(null)"),
+ (v ? v : "(null"));
+ abort();
+ }
+}
+
+static void Free(char** ptr) {
+ if (*ptr) {
+ free(*ptr);
+ *ptr = NULL;
+ }
+}
+
+static void CheckGet(
+ leveldb_t* db,
+ const leveldb_readoptions_t* options,
+ const char* key,
+ const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = leveldb_get(db, options, key, strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckIter(leveldb_iterator_t* iter,
+ const char* key, const char* val) {
+ size_t len;
+ const char* str;
+ str = leveldb_iter_key(iter, &len);
+ CheckEqual(key, str, len);
+ str = leveldb_iter_value(iter, &len);
+ CheckEqual(val, str, len);
+}
+
+// Callback from leveldb_writebatch_iterate()
+static void CheckPut(void* ptr,
+ const char* k, size_t klen,
+ const char* v, size_t vlen) {
+ int* state = (int*) ptr;
+ CheckCondition(*state < 2);
+ switch (*state) {
+ case 0:
+ CheckEqual("bar", k, klen);
+ CheckEqual("b", v, vlen);
+ break;
+ case 1:
+ CheckEqual("box", k, klen);
+ CheckEqual("c", v, vlen);
+ break;
+ }
+ (*state)++;
+}
+
+// Callback from leveldb_writebatch_iterate()
+static void CheckDel(void* ptr, const char* k, size_t klen) {
+ int* state = (int*) ptr;
+ CheckCondition(*state == 2);
+ CheckEqual("bar", k, klen);
+ (*state)++;
+}
+
+static void CmpDestroy(void* arg) { }
+
+static int CmpCompare(void* arg, const char* a, size_t alen,
+ const char* b, size_t blen) {
+ int n = (alen < blen) ? alen : blen;
+ int r = memcmp(a, b, n);
+ if (r == 0) {
+ if (alen < blen) r = -1;
+ else if (alen > blen) r = +1;
+ }
+ return r;
+}
+
+static const char* CmpName(void* arg) {
+ return "foo";
+}
+
+// Custom filter policy
+static uint8_t fake_filter_result = 1;
+static void FilterDestroy(void* arg) { }
+static const char* FilterName(void* arg) {
+ return "TestFilter";
+}
+static char* FilterCreate(
+ void* arg,
+ const char* const* key_array, const size_t* key_length_array,
+ int num_keys,
+ size_t* filter_length) {
+ *filter_length = 4;
+ char* result = malloc(4);
+ memcpy(result, "fake", 4);
+ return result;
+}
+uint8_t FilterKeyMatch(void* arg, const char* key, size_t length,
+ const char* filter, size_t filter_length) {
+ CheckCondition(filter_length == 4);
+ CheckCondition(memcmp(filter, "fake", 4) == 0);
+ return fake_filter_result;
+}
+
+int main(int argc, char** argv) {
+ leveldb_t* db;
+ leveldb_comparator_t* cmp;
+ leveldb_cache_t* cache;
+ leveldb_env_t* env;
+ leveldb_options_t* options;
+ leveldb_readoptions_t* roptions;
+ leveldb_writeoptions_t* woptions;
+ char* dbname;
+ char* err = NULL;
+ int run = -1;
+
+ CheckCondition(leveldb_major_version() >= 1);
+ CheckCondition(leveldb_minor_version() >= 1);
+
+ StartPhase("create_objects");
+ cmp = leveldb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
+ env = leveldb_create_default_env();
+ cache = leveldb_cache_create_lru(100000);
+ dbname = leveldb_env_get_test_directory(env);
+ CheckCondition(dbname != NULL);
+
+ options = leveldb_options_create();
+ leveldb_options_set_comparator(options, cmp);
+ leveldb_options_set_error_if_exists(options, 1);
+ leveldb_options_set_cache(options, cache);
+ leveldb_options_set_env(options, env);
+ leveldb_options_set_info_log(options, NULL);
+ leveldb_options_set_write_buffer_size(options, 100000);
+ leveldb_options_set_paranoid_checks(options, 1);
+ leveldb_options_set_max_open_files(options, 10);
+ leveldb_options_set_block_size(options, 1024);
+ leveldb_options_set_block_restart_interval(options, 8);
+ leveldb_options_set_max_file_size(options, 3 << 20);
+ leveldb_options_set_compression(options, leveldb_no_compression);
+
+ roptions = leveldb_readoptions_create();
+ leveldb_readoptions_set_verify_checksums(roptions, 1);
+ leveldb_readoptions_set_fill_cache(roptions, 0);
+
+ woptions = leveldb_writeoptions_create();
+ leveldb_writeoptions_set_sync(woptions, 1);
+
+ StartPhase("destroy");
+ leveldb_destroy_db(options, dbname, &err);
+ Free(&err);
+
+ StartPhase("open_error");
+ db = leveldb_open(options, dbname, &err);
+ CheckCondition(err != NULL);
+ Free(&err);
+
+ StartPhase("leveldb_free");
+ db = leveldb_open(options, dbname, &err);
+ CheckCondition(err != NULL);
+ leveldb_free(err);
+ err = NULL;
+
+ StartPhase("open");
+ leveldb_options_set_create_if_missing(options, 1);
+ db = leveldb_open(options, dbname, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", NULL);
+
+ StartPhase("put");
+ leveldb_put(db, woptions, "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("compactall");
+ leveldb_compact_range(db, NULL, 0, NULL, 0);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("compactrange");
+ leveldb_compact_range(db, "a", 1, "z", 1);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("writebatch");
+ {
+ leveldb_writebatch_t* wb = leveldb_writebatch_create();
+ leveldb_writebatch_put(wb, "foo", 3, "a", 1);
+ leveldb_writebatch_clear(wb);
+ leveldb_writebatch_put(wb, "bar", 3, "b", 1);
+ leveldb_writebatch_put(wb, "box", 3, "c", 1);
+
+ leveldb_writebatch_t* wb2 = leveldb_writebatch_create();
+ leveldb_writebatch_delete(wb2, "bar", 3);
+ leveldb_writebatch_append(wb, wb2);
+ leveldb_writebatch_destroy(wb2);
+
+ leveldb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "hello");
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "box", "c");
+
+ int pos = 0;
+ leveldb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
+ CheckCondition(pos == 3);
+ leveldb_writebatch_destroy(wb);
+ }
+
+ StartPhase("iter");
+ {
+ leveldb_iterator_t* iter = leveldb_create_iterator(db, roptions);
+ CheckCondition(!leveldb_iter_valid(iter));
+ leveldb_iter_seek_to_first(iter);
+ CheckCondition(leveldb_iter_valid(iter));
+ CheckIter(iter, "box", "c");
+ leveldb_iter_next(iter);
+ CheckIter(iter, "foo", "hello");
+ leveldb_iter_prev(iter);
+ CheckIter(iter, "box", "c");
+ leveldb_iter_prev(iter);
+ CheckCondition(!leveldb_iter_valid(iter));
+ leveldb_iter_seek_to_last(iter);
+ CheckIter(iter, "foo", "hello");
+ leveldb_iter_seek(iter, "b", 1);
+ CheckIter(iter, "box", "c");
+ leveldb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ leveldb_iter_destroy(iter);
+ }
+
+ StartPhase("approximate_sizes");
+ {
+ int i;
+ int n = 20000;
+ char keybuf[100];
+ char valbuf[100];
+ uint64_t sizes[2];
+ const char* start[2] = { "a", "k00000000000000010000" };
+ size_t start_len[2] = { 1, 21 };
+ const char* limit[2] = { "k00000000000000010000", "z" };
+ size_t limit_len[2] = { 21, 1 };
+ leveldb_writeoptions_set_sync(woptions, 0);
+ for (i = 0; i < n; i++) {
+ snprintf(keybuf, sizeof(keybuf), "k%020d", i);
+ snprintf(valbuf, sizeof(valbuf), "v%020d", i);
+ leveldb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf),
+ &err);
+ CheckNoError(err);
+ }
+ leveldb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes);
+ CheckCondition(sizes[0] > 0);
+ CheckCondition(sizes[1] > 0);
+ }
+
+ StartPhase("property");
+ {
+ char* prop = leveldb_property_value(db, "nosuchprop");
+ CheckCondition(prop == NULL);
+ prop = leveldb_property_value(db, "leveldb.stats");
+ CheckCondition(prop != NULL);
+ Free(&prop);
+ }
+
+ StartPhase("snapshot");
+ {
+ const leveldb_snapshot_t* snap;
+ snap = leveldb_create_snapshot(db);
+ leveldb_delete(db, woptions, "foo", 3, &err);
+ CheckNoError(err);
+ leveldb_readoptions_set_snapshot(roptions, snap);
+ CheckGet(db, roptions, "foo", "hello");
+ leveldb_readoptions_set_snapshot(roptions, NULL);
+ CheckGet(db, roptions, "foo", NULL);
+ leveldb_release_snapshot(db, snap);
+ }
+
+ StartPhase("repair");
+ {
+ leveldb_close(db);
+ leveldb_options_set_create_if_missing(options, 0);
+ leveldb_options_set_error_if_exists(options, 0);
+ leveldb_repair_db(options, dbname, &err);
+ CheckNoError(err);
+ db = leveldb_open(options, dbname, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", NULL);
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "box", "c");
+ leveldb_options_set_create_if_missing(options, 1);
+ leveldb_options_set_error_if_exists(options, 1);
+ }
+
+ StartPhase("filter");
+ for (run = 0; run < 2; run++) {
+ // First run uses custom filter, second run uses bloom filter
+ CheckNoError(err);
+ leveldb_filterpolicy_t* policy;
+ if (run == 0) {
+ policy = leveldb_filterpolicy_create(
+ NULL, FilterDestroy, FilterCreate, FilterKeyMatch, FilterName);
+ } else {
+ policy = leveldb_filterpolicy_create_bloom(10);
+ }
+
+ // Create new database
+ leveldb_close(db);
+ leveldb_destroy_db(options, dbname, &err);
+ leveldb_options_set_filter_policy(options, policy);
+ db = leveldb_open(options, dbname, &err);
+ CheckNoError(err);
+ leveldb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+ CheckNoError(err);
+ leveldb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
+ CheckNoError(err);
+ leveldb_compact_range(db, NULL, 0, NULL, 0);
+
+ fake_filter_result = 1;
+ CheckGet(db, roptions, "foo", "foovalue");
+ CheckGet(db, roptions, "bar", "barvalue");
+ if (phase == 0) {
+ // Must not find value when custom filter returns false
+ fake_filter_result = 0;
+ CheckGet(db, roptions, "foo", NULL);
+ CheckGet(db, roptions, "bar", NULL);
+ fake_filter_result = 1;
+
+ CheckGet(db, roptions, "foo", "foovalue");
+ CheckGet(db, roptions, "bar", "barvalue");
+ }
+ leveldb_options_set_filter_policy(options, NULL);
+ leveldb_filterpolicy_destroy(policy);
+ }
+
+ StartPhase("cleanup");
+ leveldb_close(db);
+ leveldb_options_destroy(options);
+ leveldb_readoptions_destroy(roptions);
+ leveldb_writeoptions_destroy(woptions);
+ leveldb_free(dbname);
+ leveldb_cache_destroy(cache);
+ leveldb_comparator_destroy(cmp);
+ leveldb_env_destroy(env);
+
+ fprintf(stderr, "PASS\n");
+ return 0;
+}
diff --git a/src/leveldb/db/corruption_test.cc b/src/leveldb/db/corruption_test.cc
new file mode 100644
index 0000000000..42f5237c65
--- /dev/null
+++ b/src/leveldb/db/corruption_test.cc
@@ -0,0 +1,362 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <sys/types.h>
+
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "leveldb/cache.h"
+#include "leveldb/db.h"
+#include "leveldb/table.h"
+#include "leveldb/write_batch.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+static const int kValueSize = 1000;
+
+class CorruptionTest {
+ public:
+ CorruptionTest()
+ : db_(nullptr),
+ dbname_("/memenv/corruption_test"),
+ tiny_cache_(NewLRUCache(100)) {
+ options_.env = &env_;
+ options_.block_cache = tiny_cache_;
+ DestroyDB(dbname_, options_);
+
+ options_.create_if_missing = true;
+ Reopen();
+ options_.create_if_missing = false;
+ }
+
+ ~CorruptionTest() {
+ delete db_;
+ delete tiny_cache_;
+ }
+
+ Status TryReopen() {
+ delete db_;
+ db_ = nullptr;
+ return DB::Open(options_, dbname_, &db_);
+ }
+
+ void Reopen() { ASSERT_OK(TryReopen()); }
+
+ void RepairDB() {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(::leveldb::RepairDB(dbname_, options_));
+ }
+
+ void Build(int n) {
+ std::string key_space, value_space;
+ WriteBatch batch;
+ for (int i = 0; i < n; i++) {
+ // if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
+ Slice key = Key(i, &key_space);
+ batch.Clear();
+ batch.Put(key, Value(i, &value_space));
+ WriteOptions options;
+ // Corrupt() doesn't work without this sync on windows; stat reports 0 for
+ // the file size.
+ if (i == n - 1) {
+ options.sync = true;
+ }
+ ASSERT_OK(db_->Write(options, &batch));
+ }
+ }
+
+ void Check(int min_expected, int max_expected) {
+ int next_expected = 0;
+ int missed = 0;
+ int bad_keys = 0;
+ int bad_values = 0;
+ int correct = 0;
+ std::string value_space;
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ uint64_t key;
+ Slice in(iter->key());
+ if (in == "" || in == "~") {
+ // Ignore boundary keys.
+ continue;
+ }
+ if (!ConsumeDecimalNumber(&in, &key) || !in.empty() ||
+ key < next_expected) {
+ bad_keys++;
+ continue;
+ }
+ missed += (key - next_expected);
+ next_expected = key + 1;
+ if (iter->value() != Value(key, &value_space)) {
+ bad_values++;
+ } else {
+ correct++;
+ }
+ }
+ delete iter;
+
+ fprintf(stderr,
+ "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n",
+ min_expected, max_expected, correct, bad_keys, bad_values, missed);
+ ASSERT_LE(min_expected, correct);
+ ASSERT_GE(max_expected, correct);
+ }
+
+ void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
+ // Pick file to corrupt
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_.target()->GetChildren(dbname_, &filenames));
+ uint64_t number;
+ FileType type;
+ std::string fname;
+ int picked_number = -1;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type) && type == filetype &&
+ int(number) > picked_number) { // Pick latest file
+ fname = dbname_ + "/" + filenames[i];
+ picked_number = number;
+ }
+ }
+ ASSERT_TRUE(!fname.empty()) << filetype;
+
+ uint64_t file_size;
+ ASSERT_OK(env_.target()->GetFileSize(fname, &file_size));
+
+ if (offset < 0) {
+ // Relative to end of file; make it absolute
+ if (-offset > file_size) {
+ offset = 0;
+ } else {
+ offset = file_size + offset;
+ }
+ }
+ if (offset > file_size) {
+ offset = file_size;
+ }
+ if (offset + bytes_to_corrupt > file_size) {
+ bytes_to_corrupt = file_size - offset;
+ }
+
+ // Do it
+ std::string contents;
+ Status s = ReadFileToString(env_.target(), fname, &contents);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ for (int i = 0; i < bytes_to_corrupt; i++) {
+ contents[i + offset] ^= 0x80;
+ }
+ s = WriteStringToFile(env_.target(), contents, fname);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ }
+
+ int Property(const std::string& name) {
+ std::string property;
+ int result;
+ if (db_->GetProperty(name, &property) &&
+ sscanf(property.c_str(), "%d", &result) == 1) {
+ return result;
+ } else {
+ return -1;
+ }
+ }
+
+ // Return the ith key
+ Slice Key(int i, std::string* storage) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%016d", i);
+ storage->assign(buf, strlen(buf));
+ return Slice(*storage);
+ }
+
+ // Return the value to associate with the specified key
+ Slice Value(int k, std::string* storage) {
+ Random r(k);
+ return test::RandomString(&r, kValueSize, storage);
+ }
+
+ test::ErrorEnv env_;
+ Options options_;
+ DB* db_;
+
+ private:
+ std::string dbname_;
+ Cache* tiny_cache_;
+};
+
+TEST(CorruptionTest, Recovery) {
+ Build(100);
+ Check(100, 100);
+ Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record
+ Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block
+ Reopen();
+
+ // The 64 records in the first two log blocks are completely lost.
+ Check(36, 36);
+}
+
+TEST(CorruptionTest, RecoverWriteError) {
+ env_.writable_file_error_ = true;
+ Status s = TryReopen();
+ ASSERT_TRUE(!s.ok());
+}
+
+TEST(CorruptionTest, NewFileErrorDuringWrite) {
+ // Do enough writing to force minor compaction
+ env_.writable_file_error_ = true;
+ const int num = 3 + (Options().write_buffer_size / kValueSize);
+ std::string value_storage;
+ Status s;
+ for (int i = 0; s.ok() && i < num; i++) {
+ WriteBatch batch;
+ batch.Put("a", Value(100, &value_storage));
+ s = db_->Write(WriteOptions(), &batch);
+ }
+ ASSERT_TRUE(!s.ok());
+ ASSERT_GE(env_.num_writable_file_errors_, 1);
+ env_.writable_file_error_ = false;
+ Reopen();
+}
+
+TEST(CorruptionTest, TableFile) {
+ Build(100);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_CompactMemTable();
+ dbi->TEST_CompactRange(0, nullptr, nullptr);
+ dbi->TEST_CompactRange(1, nullptr, nullptr);
+
+ Corrupt(kTableFile, 100, 1);
+ Check(90, 99);
+}
+
+TEST(CorruptionTest, TableFileRepair) {
+ options_.block_size = 2 * kValueSize; // Limit scope of corruption
+ options_.paranoid_checks = true;
+ Reopen();
+ Build(100);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_CompactMemTable();
+ dbi->TEST_CompactRange(0, nullptr, nullptr);
+ dbi->TEST_CompactRange(1, nullptr, nullptr);
+
+ Corrupt(kTableFile, 100, 1);
+ RepairDB();
+ Reopen();
+ Check(95, 99);
+}
+
+TEST(CorruptionTest, TableFileIndexData) {
+ Build(10000); // Enough to build multiple Tables
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_CompactMemTable();
+
+ Corrupt(kTableFile, -2000, 500);
+ Reopen();
+ Check(5000, 9999);
+}
+
+TEST(CorruptionTest, MissingDescriptor) {
+ Build(1000);
+ RepairDB();
+ Reopen();
+ Check(1000, 1000);
+}
+
+TEST(CorruptionTest, SequenceNumberRecovery) {
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
+ RepairDB();
+ Reopen();
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("v5", v);
+ // Write something. If sequence number was not recovered properly,
+ // it will be hidden by an earlier write.
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("v6", v);
+ Reopen();
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("v6", v);
+}
+
+TEST(CorruptionTest, CorruptedDescriptor) {
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_CompactMemTable();
+ dbi->TEST_CompactRange(0, nullptr, nullptr);
+
+ Corrupt(kDescriptorFile, 0, 1000);
+ Status s = TryReopen();
+ ASSERT_TRUE(!s.ok());
+
+ RepairDB();
+ Reopen();
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("hello", v);
+}
+
+TEST(CorruptionTest, CompactionInputError) {
+ Build(10);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_CompactMemTable();
+ const int last = config::kMaxMemCompactLevel;
+ ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last)));
+
+ Corrupt(kTableFile, 100, 1);
+ Check(5, 9);
+
+ // Force compactions by writing lots of values
+ Build(10000);
+ Check(10000, 10000);
+}
+
+TEST(CorruptionTest, CompactionInputErrorParanoid) {
+ options_.paranoid_checks = true;
+ options_.write_buffer_size = 512 << 10;
+ Reopen();
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+
+ // Make multiple inputs so we need to compact.
+ for (int i = 0; i < 2; i++) {
+ Build(10);
+ dbi->TEST_CompactMemTable();
+ Corrupt(kTableFile, 100, 1);
+ env_.SleepForMicroseconds(100000);
+ }
+ dbi->CompactRange(nullptr, nullptr);
+
+ // Write must fail because of corrupted table
+ std::string tmp1, tmp2;
+ Status s = db_->Put(WriteOptions(), Key(5, &tmp1), Value(5, &tmp2));
+ ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
+}
+
+TEST(CorruptionTest, UnrelatedKeys) {
+ Build(10);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_CompactMemTable();
+ Corrupt(kTableFile, 100, 1);
+
+ std::string tmp1, tmp2;
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+ ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+ dbi->TEST_CompactMemTable();
+ ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+ ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+}
+
+} // namespace leveldb
+
+int main(int argc, char** argv) { return leveldb::test::RunAllTests(); }
diff --git a/src/leveldb/db/db_impl.cc b/src/leveldb/db/db_impl.cc
new file mode 100644
index 0000000000..65e31724bc
--- /dev/null
+++ b/src/leveldb/db/db_impl.cc
@@ -0,0 +1,1555 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl.h"
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <atomic>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/builder.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/status.h"
+#include "leveldb/table.h"
+#include "leveldb/table_builder.h"
+#include "port/port.h"
+#include "table/block.h"
+#include "table/merger.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+
+namespace leveldb {
+
+const int kNumNonTableCacheFiles = 10;
+
+// Information kept for every waiting writer
+struct DBImpl::Writer {
+ explicit Writer(port::Mutex* mu)
+ : batch(nullptr), sync(false), done(false), cv(mu) {}
+
+ Status status;
+ WriteBatch* batch;
+ bool sync;
+ bool done;
+ port::CondVar cv;
+};
+
+struct DBImpl::CompactionState {
+ // Files produced by compaction
+ struct Output {
+ uint64_t number;
+ uint64_t file_size;
+ InternalKey smallest, largest;
+ };
+
+ Output* current_output() { return &outputs[outputs.size() - 1]; }
+
+ explicit CompactionState(Compaction* c)
+ : compaction(c),
+ smallest_snapshot(0),
+ outfile(nullptr),
+ builder(nullptr),
+ total_bytes(0) {}
+
+ Compaction* const compaction;
+
+ // Sequence numbers < smallest_snapshot are not significant since we
+ // will never have to service a snapshot below smallest_snapshot.
+ // Therefore if we have seen a sequence number S <= smallest_snapshot,
+ // we can drop all entries for the same key with sequence numbers < S.
+ SequenceNumber smallest_snapshot;
+
+ std::vector<Output> outputs;
+
+ // State kept for output being generated
+ WritableFile* outfile;
+ TableBuilder* builder;
+
+ uint64_t total_bytes;
+};
+
+// Fix user-supplied options to be reasonable
+template <class T, class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+ if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
+ if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
+}
+Options SanitizeOptions(const std::string& dbname,
+ const InternalKeyComparator* icmp,
+ const InternalFilterPolicy* ipolicy,
+ const Options& src) {
+ Options result = src;
+ result.comparator = icmp;
+ result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr;
+ ClipToRange(&result.max_open_files, 64 + kNumNonTableCacheFiles, 50000);
+ ClipToRange(&result.write_buffer_size, 64 << 10, 1 << 30);
+ ClipToRange(&result.max_file_size, 1 << 20, 1 << 30);
+ ClipToRange(&result.block_size, 1 << 10, 4 << 20);
+ if (result.info_log == nullptr) {
+ // Open a log file in the same directory as the db
+ src.env->CreateDir(dbname); // In case it does not exist
+ src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname));
+ Status s = src.env->NewLogger(InfoLogFileName(dbname), &result.info_log);
+ if (!s.ok()) {
+ // No place suitable for logging
+ result.info_log = nullptr;
+ }
+ }
+ if (result.block_cache == nullptr) {
+ result.block_cache = NewLRUCache(8 << 20);
+ }
+ return result;
+}
+
+static int TableCacheSize(const Options& sanitized_options) {
+ // Reserve ten files or so for other uses and give the rest to TableCache.
+ return sanitized_options.max_open_files - kNumNonTableCacheFiles;
+}
+
+DBImpl::DBImpl(const Options& raw_options, const std::string& dbname)
+ : env_(raw_options.env),
+ internal_comparator_(raw_options.comparator),
+ internal_filter_policy_(raw_options.filter_policy),
+ options_(SanitizeOptions(dbname, &internal_comparator_,
+ &internal_filter_policy_, raw_options)),
+ owns_info_log_(options_.info_log != raw_options.info_log),
+ owns_cache_(options_.block_cache != raw_options.block_cache),
+ dbname_(dbname),
+ table_cache_(new TableCache(dbname_, options_, TableCacheSize(options_))),
+ db_lock_(nullptr),
+ shutting_down_(false),
+ background_work_finished_signal_(&mutex_),
+ mem_(nullptr),
+ imm_(nullptr),
+ has_imm_(false),
+ logfile_(nullptr),
+ logfile_number_(0),
+ log_(nullptr),
+ seed_(0),
+ tmp_batch_(new WriteBatch),
+ background_compaction_scheduled_(false),
+ manual_compaction_(nullptr),
+ versions_(new VersionSet(dbname_, &options_, table_cache_,
+ &internal_comparator_)) {}
+
+DBImpl::~DBImpl() {
+ // Wait for background work to finish.
+ mutex_.Lock();
+ shutting_down_.store(true, std::memory_order_release);
+ while (background_compaction_scheduled_) {
+ background_work_finished_signal_.Wait();
+ }
+ mutex_.Unlock();
+
+ if (db_lock_ != nullptr) {
+ env_->UnlockFile(db_lock_);
+ }
+
+ delete versions_;
+ if (mem_ != nullptr) mem_->Unref();
+ if (imm_ != nullptr) imm_->Unref();
+ delete tmp_batch_;
+ delete log_;
+ delete logfile_;
+ delete table_cache_;
+
+ if (owns_info_log_) {
+ delete options_.info_log;
+ }
+ if (owns_cache_) {
+ delete options_.block_cache;
+ }
+}
+
+Status DBImpl::NewDB() {
+ VersionEdit new_db;
+ new_db.SetComparatorName(user_comparator()->Name());
+ new_db.SetLogNumber(0);
+ new_db.SetNextFile(2);
+ new_db.SetLastSequence(0);
+
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ WritableFile* file;
+ Status s = env_->NewWritableFile(manifest, &file);
+ if (!s.ok()) {
+ return s;
+ }
+ {
+ log::Writer log(file);
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = log.AddRecord(record);
+ if (s.ok()) {
+ s = file->Close();
+ }
+ }
+ delete file;
+ if (s.ok()) {
+ // Make "CURRENT" file that points to the new manifest file.
+ s = SetCurrentFile(env_, dbname_, 1);
+ } else {
+ env_->DeleteFile(manifest);
+ }
+ return s;
+}
+
+void DBImpl::MaybeIgnoreError(Status* s) const {
+ if (s->ok() || options_.paranoid_checks) {
+ // No change needed
+ } else {
+ Log(options_.info_log, "Ignoring error %s", s->ToString().c_str());
+ *s = Status::OK();
+ }
+}
+
+void DBImpl::DeleteObsoleteFiles() {
+ mutex_.AssertHeld();
+
+ if (!bg_error_.ok()) {
+ // After a background error, we don't know whether a new version may
+ // or may not have been committed, so we cannot safely garbage collect.
+ return;
+ }
+
+ // Make a set of all of the live files
+ std::set<uint64_t> live = pending_outputs_;
+ versions_->AddLiveFiles(&live);
+
+ std::vector<std::string> filenames;
+ env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose
+ uint64_t number;
+ FileType type;
+ std::vector<std::string> files_to_delete;
+ for (std::string& filename : filenames) {
+ if (ParseFileName(filename, &number, &type)) {
+ bool keep = true;
+ switch (type) {
+ case kLogFile:
+ keep = ((number >= versions_->LogNumber()) ||
+ (number == versions_->PrevLogNumber()));
+ break;
+ case kDescriptorFile:
+ // Keep my manifest file, and any newer incarnations'
+ // (in case there is a race that allows other incarnations)
+ keep = (number >= versions_->ManifestFileNumber());
+ break;
+ case kTableFile:
+ keep = (live.find(number) != live.end());
+ break;
+ case kTempFile:
+ // Any temp files that are currently being written to must
+ // be recorded in pending_outputs_, which is inserted into "live"
+ keep = (live.find(number) != live.end());
+ break;
+ case kCurrentFile:
+ case kDBLockFile:
+ case kInfoLogFile:
+ keep = true;
+ break;
+ }
+
+ if (!keep) {
+ files_to_delete.push_back(std::move(filename));
+ if (type == kTableFile) {
+ table_cache_->Evict(number);
+ }
+ Log(options_.info_log, "Delete type=%d #%lld\n", static_cast<int>(type),
+ static_cast<unsigned long long>(number));
+ }
+ }
+ }
+
+ // While deleting all files unblock other threads. All files being deleted
+ // have unique names which will not collide with newly created files and
+ // are therefore safe to delete while allowing other threads to proceed.
+ mutex_.Unlock();
+ for (const std::string& filename : files_to_delete) {
+ env_->DeleteFile(dbname_ + "/" + filename);
+ }
+ mutex_.Lock();
+}
+
+Status DBImpl::Recover(VersionEdit* edit, bool* save_manifest) {
+ mutex_.AssertHeld();
+
+ // Ignore error from CreateDir since the creation of the DB is
+ // committed only when the descriptor is created, and this directory
+ // may already exist from a previous failed creation attempt.
+ env_->CreateDir(dbname_);
+ assert(db_lock_ == nullptr);
+ Status s = env_->LockFile(LockFileName(dbname_), &db_lock_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (!env_->FileExists(CurrentFileName(dbname_))) {
+ if (options_.create_if_missing) {
+ s = NewDB();
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ return Status::InvalidArgument(
+ dbname_, "does not exist (create_if_missing is false)");
+ }
+ } else {
+ if (options_.error_if_exists) {
+ return Status::InvalidArgument(dbname_,
+ "exists (error_if_exists is true)");
+ }
+ }
+
+ s = versions_->Recover(save_manifest);
+ if (!s.ok()) {
+ return s;
+ }
+ SequenceNumber max_sequence(0);
+
+ // Recover from all newer log files than the ones named in the
+ // descriptor (new log files may have been added by the previous
+ // incarnation without registering them in the descriptor).
+ //
+ // Note that PrevLogNumber() is no longer used, but we pay
+ // attention to it in case we are recovering a database
+ // produced by an older version of leveldb.
+ const uint64_t min_log = versions_->LogNumber();
+ const uint64_t prev_log = versions_->PrevLogNumber();
+ std::vector<std::string> filenames;
+ s = env_->GetChildren(dbname_, &filenames);
+ if (!s.ok()) {
+ return s;
+ }
+ std::set<uint64_t> expected;
+ versions_->AddLiveFiles(&expected);
+ uint64_t number;
+ FileType type;
+ std::vector<uint64_t> logs;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type)) {
+ expected.erase(number);
+ if (type == kLogFile && ((number >= min_log) || (number == prev_log)))
+ logs.push_back(number);
+ }
+ }
+ if (!expected.empty()) {
+ char buf[50];
+ snprintf(buf, sizeof(buf), "%d missing files; e.g.",
+ static_cast<int>(expected.size()));
+ return Status::Corruption(buf, TableFileName(dbname_, *(expected.begin())));
+ }
+
+ // Recover in the order in which the logs were generated
+ std::sort(logs.begin(), logs.end());
+ for (size_t i = 0; i < logs.size(); i++) {
+ s = RecoverLogFile(logs[i], (i == logs.size() - 1), save_manifest, edit,
+ &max_sequence);
+ if (!s.ok()) {
+ return s;
+ }
+
+ // The previous incarnation may not have written any MANIFEST
+ // records after allocating this log number. So we manually
+ // update the file number allocation counter in VersionSet.
+ versions_->MarkFileNumberUsed(logs[i]);
+ }
+
+ if (versions_->LastSequence() < max_sequence) {
+ versions_->SetLastSequence(max_sequence);
+ }
+
+ return Status::OK();
+}
+
+Status DBImpl::RecoverLogFile(uint64_t log_number, bool last_log,
+ bool* save_manifest, VersionEdit* edit,
+ SequenceNumber* max_sequence) {
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ Logger* info_log;
+ const char* fname;
+ Status* status; // null if options_.paranoid_checks==false
+ void Corruption(size_t bytes, const Status& s) override {
+ Log(info_log, "%s%s: dropping %d bytes; %s",
+ (this->status == nullptr ? "(ignoring error) " : ""), fname,
+ static_cast<int>(bytes), s.ToString().c_str());
+ if (this->status != nullptr && this->status->ok()) *this->status = s;
+ }
+ };
+
+ mutex_.AssertHeld();
+
+ // Open the log file
+ std::string fname = LogFileName(dbname_, log_number);
+ SequentialFile* file;
+ Status status = env_->NewSequentialFile(fname, &file);
+ if (!status.ok()) {
+ MaybeIgnoreError(&status);
+ return status;
+ }
+
+ // Create the log reader.
+ LogReporter reporter;
+ reporter.env = env_;
+ reporter.info_log = options_.info_log;
+ reporter.fname = fname.c_str();
+ reporter.status = (options_.paranoid_checks ? &status : nullptr);
+ // We intentionally make log::Reader do checksumming even if
+ // paranoid_checks==false so that corruptions cause entire commits
+ // to be skipped instead of propagating bad information (like overly
+ // large sequence numbers).
+ log::Reader reader(file, &reporter, true /*checksum*/, 0 /*initial_offset*/);
+ Log(options_.info_log, "Recovering log #%llu",
+ (unsigned long long)log_number);
+
+ // Read all the records and add to a memtable
+ std::string scratch;
+ Slice record;
+ WriteBatch batch;
+ int compactions = 0;
+ MemTable* mem = nullptr;
+ while (reader.ReadRecord(&record, &scratch) && status.ok()) {
+ if (record.size() < 12) {
+ reporter.Corruption(record.size(),
+ Status::Corruption("log record too small", fname));
+ continue;
+ }
+ WriteBatchInternal::SetContents(&batch, record);
+
+ if (mem == nullptr) {
+ mem = new MemTable(internal_comparator_);
+ mem->Ref();
+ }
+ status = WriteBatchInternal::InsertInto(&batch, mem);
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ break;
+ }
+ const SequenceNumber last_seq = WriteBatchInternal::Sequence(&batch) +
+ WriteBatchInternal::Count(&batch) - 1;
+ if (last_seq > *max_sequence) {
+ *max_sequence = last_seq;
+ }
+
+ if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) {
+ compactions++;
+ *save_manifest = true;
+ status = WriteLevel0Table(mem, edit, nullptr);
+ mem->Unref();
+ mem = nullptr;
+ if (!status.ok()) {
+ // Reflect errors immediately so that conditions like full
+ // file-systems cause the DB::Open() to fail.
+ break;
+ }
+ }
+ }
+
+ delete file;
+
+ // See if we should keep reusing the last log file.
+ if (status.ok() && options_.reuse_logs && last_log && compactions == 0) {
+ assert(logfile_ == nullptr);
+ assert(log_ == nullptr);
+ assert(mem_ == nullptr);
+ uint64_t lfile_size;
+ if (env_->GetFileSize(fname, &lfile_size).ok() &&
+ env_->NewAppendableFile(fname, &logfile_).ok()) {
+ Log(options_.info_log, "Reusing old log %s \n", fname.c_str());
+ log_ = new log::Writer(logfile_, lfile_size);
+ logfile_number_ = log_number;
+ if (mem != nullptr) {
+ mem_ = mem;
+ mem = nullptr;
+ } else {
+ // mem can be nullptr if lognum exists but was empty.
+ mem_ = new MemTable(internal_comparator_);
+ mem_->Ref();
+ }
+ }
+ }
+
+ if (mem != nullptr) {
+ // mem did not get reused; compact it.
+ if (status.ok()) {
+ *save_manifest = true;
+ status = WriteLevel0Table(mem, edit, nullptr);
+ }
+ mem->Unref();
+ }
+
+ return status;
+}
+
+Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit,
+ Version* base) {
+ mutex_.AssertHeld();
+ const uint64_t start_micros = env_->NowMicros();
+ FileMetaData meta;
+ meta.number = versions_->NewFileNumber();
+ pending_outputs_.insert(meta.number);
+ Iterator* iter = mem->NewIterator();
+ Log(options_.info_log, "Level-0 table #%llu: started",
+ (unsigned long long)meta.number);
+
+ Status s;
+ {
+ mutex_.Unlock();
+ s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
+ mutex_.Lock();
+ }
+
+ Log(options_.info_log, "Level-0 table #%llu: %lld bytes %s",
+ (unsigned long long)meta.number, (unsigned long long)meta.file_size,
+ s.ToString().c_str());
+ delete iter;
+ pending_outputs_.erase(meta.number);
+
+ // Note that if file_size is zero, the file has been deleted and
+ // should not be added to the manifest.
+ int level = 0;
+ if (s.ok() && meta.file_size > 0) {
+ const Slice min_user_key = meta.smallest.user_key();
+ const Slice max_user_key = meta.largest.user_key();
+ if (base != nullptr) {
+ level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
+ }
+ edit->AddFile(level, meta.number, meta.file_size, meta.smallest,
+ meta.largest);
+ }
+
+ CompactionStats stats;
+ stats.micros = env_->NowMicros() - start_micros;
+ stats.bytes_written = meta.file_size;
+ stats_[level].Add(stats);
+ return s;
+}
+
+void DBImpl::CompactMemTable() {
+ mutex_.AssertHeld();
+ assert(imm_ != nullptr);
+
+ // Save the contents of the memtable as a new Table
+ VersionEdit edit;
+ Version* base = versions_->current();
+ base->Ref();
+ Status s = WriteLevel0Table(imm_, &edit, base);
+ base->Unref();
+
+ if (s.ok() && shutting_down_.load(std::memory_order_acquire)) {
+ s = Status::IOError("Deleting DB during memtable compaction");
+ }
+
+ // Replace immutable memtable with the generated Table
+ if (s.ok()) {
+ edit.SetPrevLogNumber(0);
+ edit.SetLogNumber(logfile_number_); // Earlier logs no longer needed
+ s = versions_->LogAndApply(&edit, &mutex_);
+ }
+
+ if (s.ok()) {
+ // Commit to the new state
+ imm_->Unref();
+ imm_ = nullptr;
+ has_imm_.store(false, std::memory_order_release);
+ DeleteObsoleteFiles();
+ } else {
+ RecordBackgroundError(s);
+ }
+}
+
+void DBImpl::CompactRange(const Slice* begin, const Slice* end) {
+ int max_level_with_files = 1;
+ {
+ MutexLock l(&mutex_);
+ Version* base = versions_->current();
+ for (int level = 1; level < config::kNumLevels; level++) {
+ if (base->OverlapInLevel(level, begin, end)) {
+ max_level_with_files = level;
+ }
+ }
+ }
+ TEST_CompactMemTable(); // TODO(sanjay): Skip if memtable does not overlap
+ for (int level = 0; level < max_level_with_files; level++) {
+ TEST_CompactRange(level, begin, end);
+ }
+}
+
+void DBImpl::TEST_CompactRange(int level, const Slice* begin,
+ const Slice* end) {
+ assert(level >= 0);
+ assert(level + 1 < config::kNumLevels);
+
+ InternalKey begin_storage, end_storage;
+
+ ManualCompaction manual;
+ manual.level = level;
+ manual.done = false;
+ if (begin == nullptr) {
+ manual.begin = nullptr;
+ } else {
+ begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek);
+ manual.begin = &begin_storage;
+ }
+ if (end == nullptr) {
+ manual.end = nullptr;
+ } else {
+ end_storage = InternalKey(*end, 0, static_cast<ValueType>(0));
+ manual.end = &end_storage;
+ }
+
+ MutexLock l(&mutex_);
+ while (!manual.done && !shutting_down_.load(std::memory_order_acquire) &&
+ bg_error_.ok()) {
+ if (manual_compaction_ == nullptr) { // Idle
+ manual_compaction_ = &manual;
+ MaybeScheduleCompaction();
+ } else { // Running either my compaction or another compaction.
+ background_work_finished_signal_.Wait();
+ }
+ }
+ if (manual_compaction_ == &manual) {
+ // Cancel my manual compaction since we aborted early for some reason.
+ manual_compaction_ = nullptr;
+ }
+}
+
+Status DBImpl::TEST_CompactMemTable() {
+ // nullptr batch means just wait for earlier writes to be done
+ Status s = Write(WriteOptions(), nullptr);
+ if (s.ok()) {
+ // Wait until the compaction completes
+ MutexLock l(&mutex_);
+ while (imm_ != nullptr && bg_error_.ok()) {
+ background_work_finished_signal_.Wait();
+ }
+ if (imm_ != nullptr) {
+ s = bg_error_;
+ }
+ }
+ return s;
+}
+
+void DBImpl::RecordBackgroundError(const Status& s) {
+ mutex_.AssertHeld();
+ if (bg_error_.ok()) {
+ bg_error_ = s;
+ background_work_finished_signal_.SignalAll();
+ }
+}
+
+void DBImpl::MaybeScheduleCompaction() {
+ mutex_.AssertHeld();
+ if (background_compaction_scheduled_) {
+ // Already scheduled
+ } else if (shutting_down_.load(std::memory_order_acquire)) {
+ // DB is being deleted; no more background compactions
+ } else if (!bg_error_.ok()) {
+ // Already got an error; no more changes
+ } else if (imm_ == nullptr && manual_compaction_ == nullptr &&
+ !versions_->NeedsCompaction()) {
+ // No work to be done
+ } else {
+ background_compaction_scheduled_ = true;
+ env_->Schedule(&DBImpl::BGWork, this);
+ }
+}
+
+void DBImpl::BGWork(void* db) {
+ reinterpret_cast<DBImpl*>(db)->BackgroundCall();
+}
+
+void DBImpl::BackgroundCall() {
+ MutexLock l(&mutex_);
+ assert(background_compaction_scheduled_);
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ // No more background work when shutting down.
+ } else if (!bg_error_.ok()) {
+ // No more background work after a background error.
+ } else {
+ BackgroundCompaction();
+ }
+
+ background_compaction_scheduled_ = false;
+
+ // Previous compaction may have produced too many files in a level,
+ // so reschedule another compaction if needed.
+ MaybeScheduleCompaction();
+ background_work_finished_signal_.SignalAll();
+}
+
+void DBImpl::BackgroundCompaction() {
+ mutex_.AssertHeld();
+
+ if (imm_ != nullptr) {
+ CompactMemTable();
+ return;
+ }
+
+ Compaction* c;
+ bool is_manual = (manual_compaction_ != nullptr);
+ InternalKey manual_end;
+ if (is_manual) {
+ ManualCompaction* m = manual_compaction_;
+ c = versions_->CompactRange(m->level, m->begin, m->end);
+ m->done = (c == nullptr);
+ if (c != nullptr) {
+ manual_end = c->input(0, c->num_input_files(0) - 1)->largest;
+ }
+ Log(options_.info_log,
+ "Manual compaction at level-%d from %s .. %s; will stop at %s\n",
+ m->level, (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
+ (m->end ? m->end->DebugString().c_str() : "(end)"),
+ (m->done ? "(end)" : manual_end.DebugString().c_str()));
+ } else {
+ c = versions_->PickCompaction();
+ }
+
+ Status status;
+ if (c == nullptr) {
+ // Nothing to do
+ } else if (!is_manual && c->IsTrivialMove()) {
+ // Move file to next level
+ assert(c->num_input_files(0) == 1);
+ FileMetaData* f = c->input(0, 0);
+ c->edit()->DeleteFile(c->level(), f->number);
+ c->edit()->AddFile(c->level() + 1, f->number, f->file_size, f->smallest,
+ f->largest);
+ status = versions_->LogAndApply(c->edit(), &mutex_);
+ if (!status.ok()) {
+ RecordBackgroundError(status);
+ }
+ VersionSet::LevelSummaryStorage tmp;
+ Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
+ static_cast<unsigned long long>(f->number), c->level() + 1,
+ static_cast<unsigned long long>(f->file_size),
+ status.ToString().c_str(), versions_->LevelSummary(&tmp));
+ } else {
+ CompactionState* compact = new CompactionState(c);
+ status = DoCompactionWork(compact);
+ if (!status.ok()) {
+ RecordBackgroundError(status);
+ }
+ CleanupCompaction(compact);
+ c->ReleaseInputs();
+ DeleteObsoleteFiles();
+ }
+ delete c;
+
+ if (status.ok()) {
+ // Done
+ } else if (shutting_down_.load(std::memory_order_acquire)) {
+ // Ignore compaction errors found during shutting down
+ } else {
+ Log(options_.info_log, "Compaction error: %s", status.ToString().c_str());
+ }
+
+ if (is_manual) {
+ ManualCompaction* m = manual_compaction_;
+ if (!status.ok()) {
+ m->done = true;
+ }
+ if (!m->done) {
+ // We only compacted part of the requested range. Update *m
+ // to the range that is left to be compacted.
+ m->tmp_storage = manual_end;
+ m->begin = &m->tmp_storage;
+ }
+ manual_compaction_ = nullptr;
+ }
+}
+
+void DBImpl::CleanupCompaction(CompactionState* compact) {
+ mutex_.AssertHeld();
+ if (compact->builder != nullptr) {
+ // May happen if we get a shutdown call in the middle of compaction
+ compact->builder->Abandon();
+ delete compact->builder;
+ } else {
+ assert(compact->outfile == nullptr);
+ }
+ delete compact->outfile;
+ for (size_t i = 0; i < compact->outputs.size(); i++) {
+ const CompactionState::Output& out = compact->outputs[i];
+ pending_outputs_.erase(out.number);
+ }
+ delete compact;
+}
+
+Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) {
+ assert(compact != nullptr);
+ assert(compact->builder == nullptr);
+ uint64_t file_number;
+ {
+ mutex_.Lock();
+ file_number = versions_->NewFileNumber();
+ pending_outputs_.insert(file_number);
+ CompactionState::Output out;
+ out.number = file_number;
+ out.smallest.Clear();
+ out.largest.Clear();
+ compact->outputs.push_back(out);
+ mutex_.Unlock();
+ }
+
+ // Make the output file
+ std::string fname = TableFileName(dbname_, file_number);
+ Status s = env_->NewWritableFile(fname, &compact->outfile);
+ if (s.ok()) {
+ compact->builder = new TableBuilder(options_, compact->outfile);
+ }
+ return s;
+}
+
+Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
+ Iterator* input) {
+ assert(compact != nullptr);
+ assert(compact->outfile != nullptr);
+ assert(compact->builder != nullptr);
+
+ const uint64_t output_number = compact->current_output()->number;
+ assert(output_number != 0);
+
+ // Check for iterator errors
+ Status s = input->status();
+ const uint64_t current_entries = compact->builder->NumEntries();
+ if (s.ok()) {
+ s = compact->builder->Finish();
+ } else {
+ compact->builder->Abandon();
+ }
+ const uint64_t current_bytes = compact->builder->FileSize();
+ compact->current_output()->file_size = current_bytes;
+ compact->total_bytes += current_bytes;
+ delete compact->builder;
+ compact->builder = nullptr;
+
+ // Finish and check for file errors
+ if (s.ok()) {
+ s = compact->outfile->Sync();
+ }
+ if (s.ok()) {
+ s = compact->outfile->Close();
+ }
+ delete compact->outfile;
+ compact->outfile = nullptr;
+
+ if (s.ok() && current_entries > 0) {
+ // Verify that the table is usable
+ Iterator* iter =
+ table_cache_->NewIterator(ReadOptions(), output_number, current_bytes);
+ s = iter->status();
+ delete iter;
+ if (s.ok()) {
+ Log(options_.info_log, "Generated table #%llu@%d: %lld keys, %lld bytes",
+ (unsigned long long)output_number, compact->compaction->level(),
+ (unsigned long long)current_entries,
+ (unsigned long long)current_bytes);
+ }
+ }
+ return s;
+}
+
+Status DBImpl::InstallCompactionResults(CompactionState* compact) {
+ mutex_.AssertHeld();
+ Log(options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes",
+ compact->compaction->num_input_files(0), compact->compaction->level(),
+ compact->compaction->num_input_files(1), compact->compaction->level() + 1,
+ static_cast<long long>(compact->total_bytes));
+
+ // Add compaction outputs
+ compact->compaction->AddInputDeletions(compact->compaction->edit());
+ const int level = compact->compaction->level();
+ for (size_t i = 0; i < compact->outputs.size(); i++) {
+ const CompactionState::Output& out = compact->outputs[i];
+ compact->compaction->edit()->AddFile(level + 1, out.number, out.file_size,
+ out.smallest, out.largest);
+ }
+ return versions_->LogAndApply(compact->compaction->edit(), &mutex_);
+}
+
+Status DBImpl::DoCompactionWork(CompactionState* compact) {
+ const uint64_t start_micros = env_->NowMicros();
+ int64_t imm_micros = 0; // Micros spent doing imm_ compactions
+
+ Log(options_.info_log, "Compacting %d@%d + %d@%d files",
+ compact->compaction->num_input_files(0), compact->compaction->level(),
+ compact->compaction->num_input_files(1),
+ compact->compaction->level() + 1);
+
+ assert(versions_->NumLevelFiles(compact->compaction->level()) > 0);
+ assert(compact->builder == nullptr);
+ assert(compact->outfile == nullptr);
+ if (snapshots_.empty()) {
+ compact->smallest_snapshot = versions_->LastSequence();
+ } else {
+ compact->smallest_snapshot = snapshots_.oldest()->sequence_number();
+ }
+
+ Iterator* input = versions_->MakeInputIterator(compact->compaction);
+
+ // Release mutex while we're actually doing the compaction work
+ mutex_.Unlock();
+
+ input->SeekToFirst();
+ Status status;
+ ParsedInternalKey ikey;
+ std::string current_user_key;
+ bool has_current_user_key = false;
+ SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
+ while (input->Valid() && !shutting_down_.load(std::memory_order_acquire)) {
+ // Prioritize immutable compaction work
+ if (has_imm_.load(std::memory_order_relaxed)) {
+ const uint64_t imm_start = env_->NowMicros();
+ mutex_.Lock();
+ if (imm_ != nullptr) {
+ CompactMemTable();
+ // Wake up MakeRoomForWrite() if necessary.
+ background_work_finished_signal_.SignalAll();
+ }
+ mutex_.Unlock();
+ imm_micros += (env_->NowMicros() - imm_start);
+ }
+
+ Slice key = input->key();
+ if (compact->compaction->ShouldStopBefore(key) &&
+ compact->builder != nullptr) {
+ status = FinishCompactionOutputFile(compact, input);
+ if (!status.ok()) {
+ break;
+ }
+ }
+
+ // Handle key/value, add to state, etc.
+ bool drop = false;
+ if (!ParseInternalKey(key, &ikey)) {
+ // Do not hide error keys
+ current_user_key.clear();
+ has_current_user_key = false;
+ last_sequence_for_key = kMaxSequenceNumber;
+ } else {
+ if (!has_current_user_key ||
+ user_comparator()->Compare(ikey.user_key, Slice(current_user_key)) !=
+ 0) {
+ // First occurrence of this user key
+ current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
+ has_current_user_key = true;
+ last_sequence_for_key = kMaxSequenceNumber;
+ }
+
+ if (last_sequence_for_key <= compact->smallest_snapshot) {
+ // Hidden by an newer entry for same user key
+ drop = true; // (A)
+ } else if (ikey.type == kTypeDeletion &&
+ ikey.sequence <= compact->smallest_snapshot &&
+ compact->compaction->IsBaseLevelForKey(ikey.user_key)) {
+ // For this user key:
+ // (1) there is no data in higher levels
+ // (2) data in lower levels will have larger sequence numbers
+ // (3) data in layers that are being compacted here and have
+ // smaller sequence numbers will be dropped in the next
+ // few iterations of this loop (by rule (A) above).
+ // Therefore this deletion marker is obsolete and can be dropped.
+ drop = true;
+ }
+
+ last_sequence_for_key = ikey.sequence;
+ }
+#if 0
+ Log(options_.info_log,
+ " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
+ "%d smallest_snapshot: %d",
+ ikey.user_key.ToString().c_str(),
+ (int)ikey.sequence, ikey.type, kTypeValue, drop,
+ compact->compaction->IsBaseLevelForKey(ikey.user_key),
+ (int)last_sequence_for_key, (int)compact->smallest_snapshot);
+#endif
+
+ if (!drop) {
+ // Open output file if necessary
+ if (compact->builder == nullptr) {
+ status = OpenCompactionOutputFile(compact);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ if (compact->builder->NumEntries() == 0) {
+ compact->current_output()->smallest.DecodeFrom(key);
+ }
+ compact->current_output()->largest.DecodeFrom(key);
+ compact->builder->Add(key, input->value());
+
+ // Close output file if it is big enough
+ if (compact->builder->FileSize() >=
+ compact->compaction->MaxOutputFileSize()) {
+ status = FinishCompactionOutputFile(compact, input);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ }
+
+ input->Next();
+ }
+
+ if (status.ok() && shutting_down_.load(std::memory_order_acquire)) {
+ status = Status::IOError("Deleting DB during compaction");
+ }
+ if (status.ok() && compact->builder != nullptr) {
+ status = FinishCompactionOutputFile(compact, input);
+ }
+ if (status.ok()) {
+ status = input->status();
+ }
+ delete input;
+ input = nullptr;
+
+ CompactionStats stats;
+ stats.micros = env_->NowMicros() - start_micros - imm_micros;
+ for (int which = 0; which < 2; which++) {
+ for (int i = 0; i < compact->compaction->num_input_files(which); i++) {
+ stats.bytes_read += compact->compaction->input(which, i)->file_size;
+ }
+ }
+ for (size_t i = 0; i < compact->outputs.size(); i++) {
+ stats.bytes_written += compact->outputs[i].file_size;
+ }
+
+ mutex_.Lock();
+ stats_[compact->compaction->level() + 1].Add(stats);
+
+ if (status.ok()) {
+ status = InstallCompactionResults(compact);
+ }
+ if (!status.ok()) {
+ RecordBackgroundError(status);
+ }
+ VersionSet::LevelSummaryStorage tmp;
+ Log(options_.info_log, "compacted to: %s", versions_->LevelSummary(&tmp));
+ return status;
+}
+
+namespace {
+
+struct IterState {
+ port::Mutex* const mu;
+ Version* const version GUARDED_BY(mu);
+ MemTable* const mem GUARDED_BY(mu);
+ MemTable* const imm GUARDED_BY(mu);
+
+ IterState(port::Mutex* mutex, MemTable* mem, MemTable* imm, Version* version)
+ : mu(mutex), version(version), mem(mem), imm(imm) {}
+};
+
+static void CleanupIteratorState(void* arg1, void* arg2) {
+ IterState* state = reinterpret_cast<IterState*>(arg1);
+ state->mu->Lock();
+ state->mem->Unref();
+ if (state->imm != nullptr) state->imm->Unref();
+ state->version->Unref();
+ state->mu->Unlock();
+ delete state;
+}
+
+} // anonymous namespace
+
+Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
+ SequenceNumber* latest_snapshot,
+ uint32_t* seed) {
+ mutex_.Lock();
+ *latest_snapshot = versions_->LastSequence();
+
+ // Collect together all needed child iterators
+ std::vector<Iterator*> list;
+ list.push_back(mem_->NewIterator());
+ mem_->Ref();
+ if (imm_ != nullptr) {
+ list.push_back(imm_->NewIterator());
+ imm_->Ref();
+ }
+ versions_->current()->AddIterators(options, &list);
+ Iterator* internal_iter =
+ NewMergingIterator(&internal_comparator_, &list[0], list.size());
+ versions_->current()->Ref();
+
+ IterState* cleanup = new IterState(&mutex_, mem_, imm_, versions_->current());
+ internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr);
+
+ *seed = ++seed_;
+ mutex_.Unlock();
+ return internal_iter;
+}
+
+Iterator* DBImpl::TEST_NewInternalIterator() {
+ SequenceNumber ignored;
+ uint32_t ignored_seed;
+ return NewInternalIterator(ReadOptions(), &ignored, &ignored_seed);
+}
+
+int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() {
+ MutexLock l(&mutex_);
+ return versions_->MaxNextLevelOverlappingBytes();
+}
+
+Status DBImpl::Get(const ReadOptions& options, const Slice& key,
+ std::string* value) {
+ Status s;
+ MutexLock l(&mutex_);
+ SequenceNumber snapshot;
+ if (options.snapshot != nullptr) {
+ snapshot =
+ static_cast<const SnapshotImpl*>(options.snapshot)->sequence_number();
+ } else {
+ snapshot = versions_->LastSequence();
+ }
+
+ MemTable* mem = mem_;
+ MemTable* imm = imm_;
+ Version* current = versions_->current();
+ mem->Ref();
+ if (imm != nullptr) imm->Ref();
+ current->Ref();
+
+ bool have_stat_update = false;
+ Version::GetStats stats;
+
+ // Unlock while reading from files and memtables
+ {
+ mutex_.Unlock();
+ // First look in the memtable, then in the immutable memtable (if any).
+ LookupKey lkey(key, snapshot);
+ if (mem->Get(lkey, value, &s)) {
+ // Done
+ } else if (imm != nullptr && imm->Get(lkey, value, &s)) {
+ // Done
+ } else {
+ s = current->Get(options, lkey, value, &stats);
+ have_stat_update = true;
+ }
+ mutex_.Lock();
+ }
+
+ if (have_stat_update && current->UpdateStats(stats)) {
+ MaybeScheduleCompaction();
+ }
+ mem->Unref();
+ if (imm != nullptr) imm->Unref();
+ current->Unref();
+ return s;
+}
+
+Iterator* DBImpl::NewIterator(const ReadOptions& options) {
+ SequenceNumber latest_snapshot;
+ uint32_t seed;
+ Iterator* iter = NewInternalIterator(options, &latest_snapshot, &seed);
+ return NewDBIterator(this, user_comparator(), iter,
+ (options.snapshot != nullptr
+ ? static_cast<const SnapshotImpl*>(options.snapshot)
+ ->sequence_number()
+ : latest_snapshot),
+ seed);
+}
+
+void DBImpl::RecordReadSample(Slice key) {
+ MutexLock l(&mutex_);
+ if (versions_->current()->RecordReadSample(key)) {
+ MaybeScheduleCompaction();
+ }
+}
+
+const Snapshot* DBImpl::GetSnapshot() {
+ MutexLock l(&mutex_);
+ return snapshots_.New(versions_->LastSequence());
+}
+
+void DBImpl::ReleaseSnapshot(const Snapshot* snapshot) {
+ MutexLock l(&mutex_);
+ snapshots_.Delete(static_cast<const SnapshotImpl*>(snapshot));
+}
+
+// Convenience methods
+Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) {
+ return DB::Put(o, key, val);
+}
+
+Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
+ return DB::Delete(options, key);
+}
+
+Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
+ Writer w(&mutex_);
+ w.batch = updates;
+ w.sync = options.sync;
+ w.done = false;
+
+ MutexLock l(&mutex_);
+ writers_.push_back(&w);
+ while (!w.done && &w != writers_.front()) {
+ w.cv.Wait();
+ }
+ if (w.done) {
+ return w.status;
+ }
+
+ // May temporarily unlock and wait.
+ Status status = MakeRoomForWrite(updates == nullptr);
+ uint64_t last_sequence = versions_->LastSequence();
+ Writer* last_writer = &w;
+ if (status.ok() && updates != nullptr) { // nullptr batch is for compactions
+ WriteBatch* write_batch = BuildBatchGroup(&last_writer);
+ WriteBatchInternal::SetSequence(write_batch, last_sequence + 1);
+ last_sequence += WriteBatchInternal::Count(write_batch);
+
+ // Add to log and apply to memtable. We can release the lock
+ // during this phase since &w is currently responsible for logging
+ // and protects against concurrent loggers and concurrent writes
+ // into mem_.
+ {
+ mutex_.Unlock();
+ status = log_->AddRecord(WriteBatchInternal::Contents(write_batch));
+ bool sync_error = false;
+ if (status.ok() && options.sync) {
+ status = logfile_->Sync();
+ if (!status.ok()) {
+ sync_error = true;
+ }
+ }
+ if (status.ok()) {
+ status = WriteBatchInternal::InsertInto(write_batch, mem_);
+ }
+ mutex_.Lock();
+ if (sync_error) {
+ // The state of the log file is indeterminate: the log record we
+ // just added may or may not show up when the DB is re-opened.
+ // So we force the DB into a mode where all future writes fail.
+ RecordBackgroundError(status);
+ }
+ }
+ if (write_batch == tmp_batch_) tmp_batch_->Clear();
+
+ versions_->SetLastSequence(last_sequence);
+ }
+
+ while (true) {
+ Writer* ready = writers_.front();
+ writers_.pop_front();
+ if (ready != &w) {
+ ready->status = status;
+ ready->done = true;
+ ready->cv.Signal();
+ }
+ if (ready == last_writer) break;
+ }
+
+ // Notify new head of write queue
+ if (!writers_.empty()) {
+ writers_.front()->cv.Signal();
+ }
+
+ return status;
+}
+
+// REQUIRES: Writer list must be non-empty
+// REQUIRES: First writer must have a non-null batch
+WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
+ mutex_.AssertHeld();
+ assert(!writers_.empty());
+ Writer* first = writers_.front();
+ WriteBatch* result = first->batch;
+ assert(result != nullptr);
+
+ size_t size = WriteBatchInternal::ByteSize(first->batch);
+
+ // Allow the group to grow up to a maximum size, but if the
+ // original write is small, limit the growth so we do not slow
+ // down the small write too much.
+ size_t max_size = 1 << 20;
+ if (size <= (128 << 10)) {
+ max_size = size + (128 << 10);
+ }
+
+ *last_writer = first;
+ std::deque<Writer*>::iterator iter = writers_.begin();
+ ++iter; // Advance past "first"
+ for (; iter != writers_.end(); ++iter) {
+ Writer* w = *iter;
+ if (w->sync && !first->sync) {
+ // Do not include a sync write into a batch handled by a non-sync write.
+ break;
+ }
+
+ if (w->batch != nullptr) {
+ size += WriteBatchInternal::ByteSize(w->batch);
+ if (size > max_size) {
+ // Do not make batch too big
+ break;
+ }
+
+ // Append to *result
+ if (result == first->batch) {
+ // Switch to temporary batch instead of disturbing caller's batch
+ result = tmp_batch_;
+ assert(WriteBatchInternal::Count(result) == 0);
+ WriteBatchInternal::Append(result, first->batch);
+ }
+ WriteBatchInternal::Append(result, w->batch);
+ }
+ *last_writer = w;
+ }
+ return result;
+}
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+Status DBImpl::MakeRoomForWrite(bool force) {
+ mutex_.AssertHeld();
+ assert(!writers_.empty());
+ bool allow_delay = !force;
+ Status s;
+ while (true) {
+ if (!bg_error_.ok()) {
+ // Yield previous error
+ s = bg_error_;
+ break;
+ } else if (allow_delay && versions_->NumLevelFiles(0) >=
+ config::kL0_SlowdownWritesTrigger) {
+ // We are getting close to hitting a hard limit on the number of
+ // L0 files. Rather than delaying a single write by several
+ // seconds when we hit the hard limit, start delaying each
+ // individual write by 1ms to reduce latency variance. Also,
+ // this delay hands over some CPU to the compaction thread in
+ // case it is sharing the same core as the writer.
+ mutex_.Unlock();
+ env_->SleepForMicroseconds(1000);
+ allow_delay = false; // Do not delay a single write more than once
+ mutex_.Lock();
+ } else if (!force &&
+ (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
+ // There is room in current memtable
+ break;
+ } else if (imm_ != nullptr) {
+ // We have filled up the current memtable, but the previous
+ // one is still being compacted, so we wait.
+ Log(options_.info_log, "Current memtable full; waiting...\n");
+ background_work_finished_signal_.Wait();
+ } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) {
+ // There are too many level-0 files.
+ Log(options_.info_log, "Too many L0 files; waiting...\n");
+ background_work_finished_signal_.Wait();
+ } else {
+ // Attempt to switch to a new memtable and trigger compaction of old
+ assert(versions_->PrevLogNumber() == 0);
+ uint64_t new_log_number = versions_->NewFileNumber();
+ WritableFile* lfile = nullptr;
+ s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile);
+ if (!s.ok()) {
+ // Avoid chewing through file number space in a tight loop.
+ versions_->ReuseFileNumber(new_log_number);
+ break;
+ }
+ delete log_;
+ delete logfile_;
+ logfile_ = lfile;
+ logfile_number_ = new_log_number;
+ log_ = new log::Writer(lfile);
+ imm_ = mem_;
+ has_imm_.store(true, std::memory_order_release);
+ mem_ = new MemTable(internal_comparator_);
+ mem_->Ref();
+ force = false; // Do not force another compaction if have room
+ MaybeScheduleCompaction();
+ }
+ }
+ return s;
+}
+
+bool DBImpl::GetProperty(const Slice& property, std::string* value) {
+ value->clear();
+
+ MutexLock l(&mutex_);
+ Slice in = property;
+ Slice prefix("leveldb.");
+ if (!in.starts_with(prefix)) return false;
+ in.remove_prefix(prefix.size());
+
+ if (in.starts_with("num-files-at-level")) {
+ in.remove_prefix(strlen("num-files-at-level"));
+ uint64_t level;
+ bool ok = ConsumeDecimalNumber(&in, &level) && in.empty();
+ if (!ok || level >= config::kNumLevels) {
+ return false;
+ } else {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%d",
+ versions_->NumLevelFiles(static_cast<int>(level)));
+ *value = buf;
+ return true;
+ }
+ } else if (in == "stats") {
+ char buf[200];
+ snprintf(buf, sizeof(buf),
+ " Compactions\n"
+ "Level Files Size(MB) Time(sec) Read(MB) Write(MB)\n"
+ "--------------------------------------------------\n");
+ value->append(buf);
+ for (int level = 0; level < config::kNumLevels; level++) {
+ int files = versions_->NumLevelFiles(level);
+ if (stats_[level].micros > 0 || files > 0) {
+ snprintf(buf, sizeof(buf), "%3d %8d %8.0f %9.0f %8.0f %9.0f\n", level,
+ files, versions_->NumLevelBytes(level) / 1048576.0,
+ stats_[level].micros / 1e6,
+ stats_[level].bytes_read / 1048576.0,
+ stats_[level].bytes_written / 1048576.0);
+ value->append(buf);
+ }
+ }
+ return true;
+ } else if (in == "sstables") {
+ *value = versions_->current()->DebugString();
+ return true;
+ } else if (in == "approximate-memory-usage") {
+ size_t total_usage = options_.block_cache->TotalCharge();
+ if (mem_) {
+ total_usage += mem_->ApproximateMemoryUsage();
+ }
+ if (imm_) {
+ total_usage += imm_->ApproximateMemoryUsage();
+ }
+ char buf[50];
+ snprintf(buf, sizeof(buf), "%llu",
+ static_cast<unsigned long long>(total_usage));
+ value->append(buf);
+ return true;
+ }
+
+ return false;
+}
+
+void DBImpl::GetApproximateSizes(const Range* range, int n, uint64_t* sizes) {
+ // TODO(opt): better implementation
+ MutexLock l(&mutex_);
+ Version* v = versions_->current();
+ v->Ref();
+
+ for (int i = 0; i < n; i++) {
+ // Convert user_key into a corresponding internal key.
+ InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
+ InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
+ uint64_t start = versions_->ApproximateOffsetOf(v, k1);
+ uint64_t limit = versions_->ApproximateOffsetOf(v, k2);
+ sizes[i] = (limit >= start ? limit - start : 0);
+ }
+
+ v->Unref();
+}
+
+// Default implementations of convenience methods that subclasses of DB
+// can call if they wish
+Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
+ WriteBatch batch;
+ batch.Put(key, value);
+ return Write(opt, &batch);
+}
+
+Status DB::Delete(const WriteOptions& opt, const Slice& key) {
+ WriteBatch batch;
+ batch.Delete(key);
+ return Write(opt, &batch);
+}
+
+DB::~DB() = default;
+
+Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
+ *dbptr = nullptr;
+
+ DBImpl* impl = new DBImpl(options, dbname);
+ impl->mutex_.Lock();
+ VersionEdit edit;
+ // Recover handles create_if_missing, error_if_exists
+ bool save_manifest = false;
+ Status s = impl->Recover(&edit, &save_manifest);
+ if (s.ok() && impl->mem_ == nullptr) {
+ // Create new log and a corresponding memtable.
+ uint64_t new_log_number = impl->versions_->NewFileNumber();
+ WritableFile* lfile;
+ s = options.env->NewWritableFile(LogFileName(dbname, new_log_number),
+ &lfile);
+ if (s.ok()) {
+ edit.SetLogNumber(new_log_number);
+ impl->logfile_ = lfile;
+ impl->logfile_number_ = new_log_number;
+ impl->log_ = new log::Writer(lfile);
+ impl->mem_ = new MemTable(impl->internal_comparator_);
+ impl->mem_->Ref();
+ }
+ }
+ if (s.ok() && save_manifest) {
+ edit.SetPrevLogNumber(0); // No older logs needed after recovery.
+ edit.SetLogNumber(impl->logfile_number_);
+ s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
+ }
+ if (s.ok()) {
+ impl->DeleteObsoleteFiles();
+ impl->MaybeScheduleCompaction();
+ }
+ impl->mutex_.Unlock();
+ if (s.ok()) {
+ assert(impl->mem_ != nullptr);
+ *dbptr = impl;
+ } else {
+ delete impl;
+ }
+ return s;
+}
+
+Snapshot::~Snapshot() = default;
+
+Status DestroyDB(const std::string& dbname, const Options& options) {
+ Env* env = options.env;
+ std::vector<std::string> filenames;
+ Status result = env->GetChildren(dbname, &filenames);
+ if (!result.ok()) {
+ // Ignore error in case directory does not exist
+ return Status::OK();
+ }
+
+ FileLock* lock;
+ const std::string lockname = LockFileName(dbname);
+ result = env->LockFile(lockname, &lock);
+ if (result.ok()) {
+ uint64_t number;
+ FileType type;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type) &&
+ type != kDBLockFile) { // Lock file will be deleted at end
+ Status del = env->DeleteFile(dbname + "/" + filenames[i]);
+ if (result.ok() && !del.ok()) {
+ result = del;
+ }
+ }
+ }
+ env->UnlockFile(lock); // Ignore error since state is already gone
+ env->DeleteFile(lockname);
+ env->DeleteDir(dbname); // Ignore error in case dir contains other files
+ }
+ return result;
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/db_impl.h b/src/leveldb/db/db_impl.h
new file mode 100644
index 0000000000..685735c733
--- /dev/null
+++ b/src/leveldb/db/db_impl.h
@@ -0,0 +1,217 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_DB_IMPL_H_
+#define STORAGE_LEVELDB_DB_DB_IMPL_H_
+
+#include <atomic>
+#include <deque>
+#include <set>
+#include <string>
+
+#include "db/dbformat.h"
+#include "db/log_writer.h"
+#include "db/snapshot.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "port/port.h"
+#include "port/thread_annotations.h"
+
+namespace leveldb {
+
+class MemTable;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+
+class DBImpl : public DB {
+ public:
+ DBImpl(const Options& options, const std::string& dbname);
+
+ DBImpl(const DBImpl&) = delete;
+ DBImpl& operator=(const DBImpl&) = delete;
+
+ ~DBImpl() override;
+
+ // Implementations of the DB interface
+ Status Put(const WriteOptions&, const Slice& key,
+ const Slice& value) override;
+ Status Delete(const WriteOptions&, const Slice& key) override;
+ Status Write(const WriteOptions& options, WriteBatch* updates) override;
+ Status Get(const ReadOptions& options, const Slice& key,
+ std::string* value) override;
+ Iterator* NewIterator(const ReadOptions&) override;
+ const Snapshot* GetSnapshot() override;
+ void ReleaseSnapshot(const Snapshot* snapshot) override;
+ bool GetProperty(const Slice& property, std::string* value) override;
+ void GetApproximateSizes(const Range* range, int n, uint64_t* sizes) override;
+ void CompactRange(const Slice* begin, const Slice* end) override;
+
+ // Extra methods (for testing) that are not in the public DB interface
+
+ // Compact any files in the named level that overlap [*begin,*end]
+ void TEST_CompactRange(int level, const Slice* begin, const Slice* end);
+
+ // Force current memtable contents to be compacted.
+ Status TEST_CompactMemTable();
+
+ // Return an internal iterator over the current state of the database.
+ // The keys of this iterator are internal keys (see format.h).
+ // The returned iterator should be deleted when no longer needed.
+ Iterator* TEST_NewInternalIterator();
+
+ // Return the maximum overlapping data (in bytes) at next level for any
+ // file at a level >= 1.
+ int64_t TEST_MaxNextLevelOverlappingBytes();
+
+ // Record a sample of bytes read at the specified internal key.
+ // Samples are taken approximately once every config::kReadBytesPeriod
+ // bytes.
+ void RecordReadSample(Slice key);
+
+ private:
+ friend class DB;
+ struct CompactionState;
+ struct Writer;
+
+ // Information for a manual compaction
+ struct ManualCompaction {
+ int level;
+ bool done;
+ const InternalKey* begin; // null means beginning of key range
+ const InternalKey* end; // null means end of key range
+ InternalKey tmp_storage; // Used to keep track of compaction progress
+ };
+
+ // Per level compaction stats. stats_[level] stores the stats for
+ // compactions that produced data for the specified "level".
+ struct CompactionStats {
+ CompactionStats() : micros(0), bytes_read(0), bytes_written(0) {}
+
+ void Add(const CompactionStats& c) {
+ this->micros += c.micros;
+ this->bytes_read += c.bytes_read;
+ this->bytes_written += c.bytes_written;
+ }
+
+ int64_t micros;
+ int64_t bytes_read;
+ int64_t bytes_written;
+ };
+
+ Iterator* NewInternalIterator(const ReadOptions&,
+ SequenceNumber* latest_snapshot,
+ uint32_t* seed);
+
+ Status NewDB();
+
+ // Recover the descriptor from persistent storage. May do a significant
+ // amount of work to recover recently logged updates. Any changes to
+ // be made to the descriptor are added to *edit.
+ Status Recover(VersionEdit* edit, bool* save_manifest)
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ void MaybeIgnoreError(Status* s) const;
+
+ // Delete any unneeded files and stale in-memory entries.
+ void DeleteObsoleteFiles() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ // Compact the in-memory write buffer to disk. Switches to a new
+ // log-file/memtable and writes a new descriptor iff successful.
+ // Errors are recorded in bg_error_.
+ void CompactMemTable() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ Status RecoverLogFile(uint64_t log_number, bool last_log, bool* save_manifest,
+ VersionEdit* edit, SequenceNumber* max_sequence)
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base)
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ Status MakeRoomForWrite(bool force /* compact even if there is room? */)
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ WriteBatch* BuildBatchGroup(Writer** last_writer)
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ void RecordBackgroundError(const Status& s);
+
+ void MaybeScheduleCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ static void BGWork(void* db);
+ void BackgroundCall();
+ void BackgroundCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ void CleanupCompaction(CompactionState* compact)
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ Status DoCompactionWork(CompactionState* compact)
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ Status OpenCompactionOutputFile(CompactionState* compact);
+ Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
+ Status InstallCompactionResults(CompactionState* compact)
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ const Comparator* user_comparator() const {
+ return internal_comparator_.user_comparator();
+ }
+
+ // Constant after construction
+ Env* const env_;
+ const InternalKeyComparator internal_comparator_;
+ const InternalFilterPolicy internal_filter_policy_;
+ const Options options_; // options_.comparator == &internal_comparator_
+ const bool owns_info_log_;
+ const bool owns_cache_;
+ const std::string dbname_;
+
+ // table_cache_ provides its own synchronization
+ TableCache* const table_cache_;
+
+ // Lock over the persistent DB state. Non-null iff successfully acquired.
+ FileLock* db_lock_;
+
+ // State below is protected by mutex_
+ port::Mutex mutex_;
+ std::atomic<bool> shutting_down_;
+ port::CondVar background_work_finished_signal_ GUARDED_BY(mutex_);
+ MemTable* mem_;
+ MemTable* imm_ GUARDED_BY(mutex_); // Memtable being compacted
+ std::atomic<bool> has_imm_; // So bg thread can detect non-null imm_
+ WritableFile* logfile_;
+ uint64_t logfile_number_ GUARDED_BY(mutex_);
+ log::Writer* log_;
+ uint32_t seed_ GUARDED_BY(mutex_); // For sampling.
+
+ // Queue of writers.
+ std::deque<Writer*> writers_ GUARDED_BY(mutex_);
+ WriteBatch* tmp_batch_ GUARDED_BY(mutex_);
+
+ SnapshotList snapshots_ GUARDED_BY(mutex_);
+
+ // Set of table files to protect from deletion because they are
+ // part of ongoing compactions.
+ std::set<uint64_t> pending_outputs_ GUARDED_BY(mutex_);
+
+ // Has a background compaction been scheduled or is running?
+ bool background_compaction_scheduled_ GUARDED_BY(mutex_);
+
+ ManualCompaction* manual_compaction_ GUARDED_BY(mutex_);
+
+ VersionSet* const versions_ GUARDED_BY(mutex_);
+
+ // Have we encountered a background error in paranoid mode?
+ Status bg_error_ GUARDED_BY(mutex_);
+
+ CompactionStats stats_[config::kNumLevels] GUARDED_BY(mutex_);
+};
+
+// Sanitize db options. The caller should delete result.info_log if
+// it is not equal to src.info_log.
+Options SanitizeOptions(const std::string& db,
+ const InternalKeyComparator* icmp,
+ const InternalFilterPolicy* ipolicy,
+ const Options& src);
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_DB_IMPL_H_
diff --git a/src/leveldb/db/db_iter.cc b/src/leveldb/db/db_iter.cc
new file mode 100644
index 0000000000..98715a9502
--- /dev/null
+++ b/src/leveldb/db/db_iter.cc
@@ -0,0 +1,318 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_iter.h"
+
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "leveldb/env.h"
+#include "leveldb/iterator.h"
+#include "port/port.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+
+namespace leveldb {
+
+#if 0
+static void DumpInternalIter(Iterator* iter) {
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ParsedInternalKey k;
+ if (!ParseInternalKey(iter->key(), &k)) {
+ fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str());
+ } else {
+ fprintf(stderr, "@ '%s'\n", k.DebugString().c_str());
+ }
+ }
+}
+#endif
+
+namespace {
+
+// Memtables and sstables that make the DB representation contain
+// (userkey,seq,type) => uservalue entries. DBIter
+// combines multiple entries for the same userkey found in the DB
+// representation into a single entry while accounting for sequence
+// numbers, deletion markers, overwrites, etc.
+class DBIter : public Iterator {
+ public:
+ // Which direction is the iterator currently moving?
+ // (1) When moving forward, the internal iterator is positioned at
+ // the exact entry that yields this->key(), this->value()
+ // (2) When moving backwards, the internal iterator is positioned
+ // just before all entries whose user key == this->key().
+ enum Direction { kForward, kReverse };
+
+ DBIter(DBImpl* db, const Comparator* cmp, Iterator* iter, SequenceNumber s,
+ uint32_t seed)
+ : db_(db),
+ user_comparator_(cmp),
+ iter_(iter),
+ sequence_(s),
+ direction_(kForward),
+ valid_(false),
+ rnd_(seed),
+ bytes_until_read_sampling_(RandomCompactionPeriod()) {}
+
+ DBIter(const DBIter&) = delete;
+ DBIter& operator=(const DBIter&) = delete;
+
+ ~DBIter() override { delete iter_; }
+ bool Valid() const override { return valid_; }
+ Slice key() const override {
+ assert(valid_);
+ return (direction_ == kForward) ? ExtractUserKey(iter_->key()) : saved_key_;
+ }
+ Slice value() const override {
+ assert(valid_);
+ return (direction_ == kForward) ? iter_->value() : saved_value_;
+ }
+ Status status() const override {
+ if (status_.ok()) {
+ return iter_->status();
+ } else {
+ return status_;
+ }
+ }
+
+ void Next() override;
+ void Prev() override;
+ void Seek(const Slice& target) override;
+ void SeekToFirst() override;
+ void SeekToLast() override;
+
+ private:
+ void FindNextUserEntry(bool skipping, std::string* skip);
+ void FindPrevUserEntry();
+ bool ParseKey(ParsedInternalKey* key);
+
+ inline void SaveKey(const Slice& k, std::string* dst) {
+ dst->assign(k.data(), k.size());
+ }
+
+ inline void ClearSavedValue() {
+ if (saved_value_.capacity() > 1048576) {
+ std::string empty;
+ swap(empty, saved_value_);
+ } else {
+ saved_value_.clear();
+ }
+ }
+
+ // Picks the number of bytes that can be read until a compaction is scheduled.
+ size_t RandomCompactionPeriod() {
+ return rnd_.Uniform(2 * config::kReadBytesPeriod);
+ }
+
+ DBImpl* db_;
+ const Comparator* const user_comparator_;
+ Iterator* const iter_;
+ SequenceNumber const sequence_;
+ Status status_;
+ std::string saved_key_; // == current key when direction_==kReverse
+ std::string saved_value_; // == current raw value when direction_==kReverse
+ Direction direction_;
+ bool valid_;
+ Random rnd_;
+ size_t bytes_until_read_sampling_;
+};
+
+inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
+ Slice k = iter_->key();
+
+ size_t bytes_read = k.size() + iter_->value().size();
+ while (bytes_until_read_sampling_ < bytes_read) {
+ bytes_until_read_sampling_ += RandomCompactionPeriod();
+ db_->RecordReadSample(k);
+ }
+ assert(bytes_until_read_sampling_ >= bytes_read);
+ bytes_until_read_sampling_ -= bytes_read;
+
+ if (!ParseInternalKey(k, ikey)) {
+ status_ = Status::Corruption("corrupted internal key in DBIter");
+ return false;
+ } else {
+ return true;
+ }
+}
+
+void DBIter::Next() {
+ assert(valid_);
+
+ if (direction_ == kReverse) { // Switch directions?
+ direction_ = kForward;
+ // iter_ is pointing just before the entries for this->key(),
+ // so advance into the range of entries for this->key() and then
+ // use the normal skipping code below.
+ if (!iter_->Valid()) {
+ iter_->SeekToFirst();
+ } else {
+ iter_->Next();
+ }
+ if (!iter_->Valid()) {
+ valid_ = false;
+ saved_key_.clear();
+ return;
+ }
+ // saved_key_ already contains the key to skip past.
+ } else {
+ // Store in saved_key_ the current key so we skip it below.
+ SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
+
+ // iter_ is pointing to current key. We can now safely move to the next to
+ // avoid checking current key.
+ iter_->Next();
+ if (!iter_->Valid()) {
+ valid_ = false;
+ saved_key_.clear();
+ return;
+ }
+ }
+
+ FindNextUserEntry(true, &saved_key_);
+}
+
+void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
+ // Loop until we hit an acceptable entry to yield
+ assert(iter_->Valid());
+ assert(direction_ == kForward);
+ do {
+ ParsedInternalKey ikey;
+ if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
+ switch (ikey.type) {
+ case kTypeDeletion:
+ // Arrange to skip all upcoming entries for this key since
+ // they are hidden by this deletion.
+ SaveKey(ikey.user_key, skip);
+ skipping = true;
+ break;
+ case kTypeValue:
+ if (skipping &&
+ user_comparator_->Compare(ikey.user_key, *skip) <= 0) {
+ // Entry hidden
+ } else {
+ valid_ = true;
+ saved_key_.clear();
+ return;
+ }
+ break;
+ }
+ }
+ iter_->Next();
+ } while (iter_->Valid());
+ saved_key_.clear();
+ valid_ = false;
+}
+
+void DBIter::Prev() {
+ assert(valid_);
+
+ if (direction_ == kForward) { // Switch directions?
+ // iter_ is pointing at the current entry. Scan backwards until
+ // the key changes so we can use the normal reverse scanning code.
+ assert(iter_->Valid()); // Otherwise valid_ would have been false
+ SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
+ while (true) {
+ iter_->Prev();
+ if (!iter_->Valid()) {
+ valid_ = false;
+ saved_key_.clear();
+ ClearSavedValue();
+ return;
+ }
+ if (user_comparator_->Compare(ExtractUserKey(iter_->key()), saved_key_) <
+ 0) {
+ break;
+ }
+ }
+ direction_ = kReverse;
+ }
+
+ FindPrevUserEntry();
+}
+
+void DBIter::FindPrevUserEntry() {
+ assert(direction_ == kReverse);
+
+ ValueType value_type = kTypeDeletion;
+ if (iter_->Valid()) {
+ do {
+ ParsedInternalKey ikey;
+ if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
+ if ((value_type != kTypeDeletion) &&
+ user_comparator_->Compare(ikey.user_key, saved_key_) < 0) {
+ // We encountered a non-deleted value in entries for previous keys,
+ break;
+ }
+ value_type = ikey.type;
+ if (value_type == kTypeDeletion) {
+ saved_key_.clear();
+ ClearSavedValue();
+ } else {
+ Slice raw_value = iter_->value();
+ if (saved_value_.capacity() > raw_value.size() + 1048576) {
+ std::string empty;
+ swap(empty, saved_value_);
+ }
+ SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
+ saved_value_.assign(raw_value.data(), raw_value.size());
+ }
+ }
+ iter_->Prev();
+ } while (iter_->Valid());
+ }
+
+ if (value_type == kTypeDeletion) {
+ // End
+ valid_ = false;
+ saved_key_.clear();
+ ClearSavedValue();
+ direction_ = kForward;
+ } else {
+ valid_ = true;
+ }
+}
+
+void DBIter::Seek(const Slice& target) {
+ direction_ = kForward;
+ ClearSavedValue();
+ saved_key_.clear();
+ AppendInternalKey(&saved_key_,
+ ParsedInternalKey(target, sequence_, kValueTypeForSeek));
+ iter_->Seek(saved_key_);
+ if (iter_->Valid()) {
+ FindNextUserEntry(false, &saved_key_ /* temporary storage */);
+ } else {
+ valid_ = false;
+ }
+}
+
+void DBIter::SeekToFirst() {
+ direction_ = kForward;
+ ClearSavedValue();
+ iter_->SeekToFirst();
+ if (iter_->Valid()) {
+ FindNextUserEntry(false, &saved_key_ /* temporary storage */);
+ } else {
+ valid_ = false;
+ }
+}
+
+void DBIter::SeekToLast() {
+ direction_ = kReverse;
+ ClearSavedValue();
+ iter_->SeekToLast();
+ FindPrevUserEntry();
+}
+
+} // anonymous namespace
+
+Iterator* NewDBIterator(DBImpl* db, const Comparator* user_key_comparator,
+ Iterator* internal_iter, SequenceNumber sequence,
+ uint32_t seed) {
+ return new DBIter(db, user_key_comparator, internal_iter, sequence, seed);
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/db_iter.h b/src/leveldb/db/db_iter.h
new file mode 100644
index 0000000000..fd93e912a0
--- /dev/null
+++ b/src/leveldb/db/db_iter.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_DB_ITER_H_
+#define STORAGE_LEVELDB_DB_DB_ITER_H_
+
+#include <stdint.h>
+
+#include "db/dbformat.h"
+#include "leveldb/db.h"
+
+namespace leveldb {
+
+class DBImpl;
+
+// Return a new iterator that converts internal keys (yielded by
+// "*internal_iter") that were live at the specified "sequence" number
+// into appropriate user keys.
+Iterator* NewDBIterator(DBImpl* db, const Comparator* user_key_comparator,
+ Iterator* internal_iter, SequenceNumber sequence,
+ uint32_t seed);
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_DB_ITER_H_
diff --git a/src/leveldb/db/db_test.cc b/src/leveldb/db/db_test.cc
new file mode 100644
index 0000000000..beb1d3bdef
--- /dev/null
+++ b/src/leveldb/db/db_test.cc
@@ -0,0 +1,2302 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/db.h"
+
+#include <atomic>
+#include <string>
+
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/cache.h"
+#include "leveldb/env.h"
+#include "leveldb/filter_policy.h"
+#include "leveldb/table.h"
+#include "port/port.h"
+#include "port/thread_annotations.h"
+#include "util/hash.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+static std::string RandomString(Random* rnd, int len) {
+ std::string r;
+ test::RandomString(rnd, len, &r);
+ return r;
+}
+
+static std::string RandomKey(Random* rnd) {
+ int len =
+ (rnd->OneIn(3) ? 1 // Short sometimes to encourage collisions
+ : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10)));
+ return test::RandomKey(rnd, len);
+}
+
+namespace {
+class AtomicCounter {
+ public:
+ AtomicCounter() : count_(0) {}
+ void Increment() { IncrementBy(1); }
+ void IncrementBy(int count) LOCKS_EXCLUDED(mu_) {
+ MutexLock l(&mu_);
+ count_ += count;
+ }
+ int Read() LOCKS_EXCLUDED(mu_) {
+ MutexLock l(&mu_);
+ return count_;
+ }
+ void Reset() LOCKS_EXCLUDED(mu_) {
+ MutexLock l(&mu_);
+ count_ = 0;
+ }
+
+ private:
+ port::Mutex mu_;
+ int count_ GUARDED_BY(mu_);
+};
+
+void DelayMilliseconds(int millis) {
+ Env::Default()->SleepForMicroseconds(millis * 1000);
+}
+} // namespace
+
+// Test Env to override default Env behavior for testing.
+class TestEnv : public EnvWrapper {
+ public:
+ explicit TestEnv(Env* base) : EnvWrapper(base), ignore_dot_files_(false) {}
+
+ void SetIgnoreDotFiles(bool ignored) { ignore_dot_files_ = ignored; }
+
+ Status GetChildren(const std::string& dir,
+ std::vector<std::string>* result) override {
+ Status s = target()->GetChildren(dir, result);
+ if (!s.ok() || !ignore_dot_files_) {
+ return s;
+ }
+
+ std::vector<std::string>::iterator it = result->begin();
+ while (it != result->end()) {
+ if ((*it == ".") || (*it == "..")) {
+ it = result->erase(it);
+ } else {
+ ++it;
+ }
+ }
+
+ return s;
+ }
+
+ private:
+ bool ignore_dot_files_;
+};
+
+// Special Env used to delay background operations.
+class SpecialEnv : public EnvWrapper {
+ public:
+ // sstable/log Sync() calls are blocked while this pointer is non-null.
+ std::atomic<bool> delay_data_sync_;
+
+ // sstable/log Sync() calls return an error.
+ std::atomic<bool> data_sync_error_;
+
+ // Simulate no-space errors while this pointer is non-null.
+ std::atomic<bool> no_space_;
+
+ // Simulate non-writable file system while this pointer is non-null.
+ std::atomic<bool> non_writable_;
+
+ // Force sync of manifest files to fail while this pointer is non-null.
+ std::atomic<bool> manifest_sync_error_;
+
+ // Force write to manifest files to fail while this pointer is non-null.
+ std::atomic<bool> manifest_write_error_;
+
+ bool count_random_reads_;
+ AtomicCounter random_read_counter_;
+
+ explicit SpecialEnv(Env* base)
+ : EnvWrapper(base),
+ delay_data_sync_(false),
+ data_sync_error_(false),
+ no_space_(false),
+ non_writable_(false),
+ manifest_sync_error_(false),
+ manifest_write_error_(false),
+ count_random_reads_(false) {}
+
+ Status NewWritableFile(const std::string& f, WritableFile** r) {
+ class DataFile : public WritableFile {
+ private:
+ SpecialEnv* const env_;
+ WritableFile* const base_;
+
+ public:
+ DataFile(SpecialEnv* env, WritableFile* base) : env_(env), base_(base) {}
+ ~DataFile() { delete base_; }
+ Status Append(const Slice& data) {
+ if (env_->no_space_.load(std::memory_order_acquire)) {
+ // Drop writes on the floor
+ return Status::OK();
+ } else {
+ return base_->Append(data);
+ }
+ }
+ Status Close() { return base_->Close(); }
+ Status Flush() { return base_->Flush(); }
+ Status Sync() {
+ if (env_->data_sync_error_.load(std::memory_order_acquire)) {
+ return Status::IOError("simulated data sync error");
+ }
+ while (env_->delay_data_sync_.load(std::memory_order_acquire)) {
+ DelayMilliseconds(100);
+ }
+ return base_->Sync();
+ }
+ std::string GetName() const override { return ""; }
+ };
+ class ManifestFile : public WritableFile {
+ private:
+ SpecialEnv* env_;
+ WritableFile* base_;
+
+ public:
+ ManifestFile(SpecialEnv* env, WritableFile* b) : env_(env), base_(b) {}
+ ~ManifestFile() { delete base_; }
+ Status Append(const Slice& data) {
+ if (env_->manifest_write_error_.load(std::memory_order_acquire)) {
+ return Status::IOError("simulated writer error");
+ } else {
+ return base_->Append(data);
+ }
+ }
+ Status Close() { return base_->Close(); }
+ Status Flush() { return base_->Flush(); }
+ Status Sync() {
+ if (env_->manifest_sync_error_.load(std::memory_order_acquire)) {
+ return Status::IOError("simulated sync error");
+ } else {
+ return base_->Sync();
+ }
+ }
+ std::string GetName() const override { return ""; }
+ };
+
+ if (non_writable_.load(std::memory_order_acquire)) {
+ return Status::IOError("simulated write error");
+ }
+
+ Status s = target()->NewWritableFile(f, r);
+ if (s.ok()) {
+ if (strstr(f.c_str(), ".ldb") != nullptr ||
+ strstr(f.c_str(), ".log") != nullptr) {
+ *r = new DataFile(this, *r);
+ } else if (strstr(f.c_str(), "MANIFEST") != nullptr) {
+ *r = new ManifestFile(this, *r);
+ }
+ }
+ return s;
+ }
+
+ Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
+ class CountingFile : public RandomAccessFile {
+ private:
+ RandomAccessFile* target_;
+ AtomicCounter* counter_;
+
+ public:
+ CountingFile(RandomAccessFile* target, AtomicCounter* counter)
+ : target_(target), counter_(counter) {}
+ ~CountingFile() override { delete target_; }
+ Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override {
+ counter_->Increment();
+ return target_->Read(offset, n, result, scratch);
+ }
+ std::string GetName() const override { return ""; }
+ };
+
+ Status s = target()->NewRandomAccessFile(f, r);
+ if (s.ok() && count_random_reads_) {
+ *r = new CountingFile(*r, &random_read_counter_);
+ }
+ return s;
+ }
+};
+
+class DBTest {
+ public:
+ std::string dbname_;
+ SpecialEnv* env_;
+ DB* db_;
+
+ Options last_options_;
+
+ DBTest() : env_(new SpecialEnv(Env::Default())), option_config_(kDefault) {
+ filter_policy_ = NewBloomFilterPolicy(10);
+ dbname_ = test::TmpDir() + "/db_test";
+ DestroyDB(dbname_, Options());
+ db_ = nullptr;
+ Reopen();
+ }
+
+ ~DBTest() {
+ delete db_;
+ DestroyDB(dbname_, Options());
+ delete env_;
+ delete filter_policy_;
+ }
+
+ // Switch to a fresh database with the next option configuration to
+ // test. Return false if there are no more configurations to test.
+ bool ChangeOptions() {
+ option_config_++;
+ if (option_config_ >= kEnd) {
+ return false;
+ } else {
+ DestroyAndReopen();
+ return true;
+ }
+ }
+
+ // Return the current option configuration.
+ Options CurrentOptions() {
+ Options options;
+ options.reuse_logs = false;
+ switch (option_config_) {
+ case kReuse:
+ options.reuse_logs = true;
+ break;
+ case kFilter:
+ options.filter_policy = filter_policy_;
+ break;
+ case kUncompressed:
+ options.compression = kNoCompression;
+ break;
+ default:
+ break;
+ }
+ return options;
+ }
+
+ DBImpl* dbfull() { return reinterpret_cast<DBImpl*>(db_); }
+
+ void Reopen(Options* options = nullptr) { ASSERT_OK(TryReopen(options)); }
+
+ void Close() {
+ delete db_;
+ db_ = nullptr;
+ }
+
+ void DestroyAndReopen(Options* options = nullptr) {
+ delete db_;
+ db_ = nullptr;
+ DestroyDB(dbname_, Options());
+ ASSERT_OK(TryReopen(options));
+ }
+
+ Status TryReopen(Options* options) {
+ delete db_;
+ db_ = nullptr;
+ Options opts;
+ if (options != nullptr) {
+ opts = *options;
+ } else {
+ opts = CurrentOptions();
+ opts.create_if_missing = true;
+ }
+ last_options_ = opts;
+
+ return DB::Open(opts, dbname_, &db_);
+ }
+
+ Status Put(const std::string& k, const std::string& v) {
+ return db_->Put(WriteOptions(), k, v);
+ }
+
+ Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); }
+
+ std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+ ReadOptions options;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ // Return a string that contains all key,value pairs in order,
+ // formatted like "(k1->v1)(k2->v2)".
+ std::string Contents() {
+ std::vector<std::string> forward;
+ std::string result;
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string s = IterStatus(iter);
+ result.push_back('(');
+ result.append(s);
+ result.push_back(')');
+ forward.push_back(s);
+ }
+
+ // Check reverse iteration results are the reverse of forward results
+ size_t matched = 0;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ ASSERT_LT(matched, forward.size());
+ ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
+ matched++;
+ }
+ ASSERT_EQ(matched, forward.size());
+
+ delete iter;
+ return result;
+ }
+
+ std::string AllEntriesFor(const Slice& user_key) {
+ Iterator* iter = dbfull()->TEST_NewInternalIterator();
+ InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
+ iter->Seek(target.Encode());
+ std::string result;
+ if (!iter->status().ok()) {
+ result = iter->status().ToString();
+ } else {
+ result = "[ ";
+ bool first = true;
+ while (iter->Valid()) {
+ ParsedInternalKey ikey;
+ if (!ParseInternalKey(iter->key(), &ikey)) {
+ result += "CORRUPTED";
+ } else {
+ if (last_options_.comparator->Compare(ikey.user_key, user_key) != 0) {
+ break;
+ }
+ if (!first) {
+ result += ", ";
+ }
+ first = false;
+ switch (ikey.type) {
+ case kTypeValue:
+ result += iter->value().ToString();
+ break;
+ case kTypeDeletion:
+ result += "DEL";
+ break;
+ }
+ }
+ iter->Next();
+ }
+ if (!first) {
+ result += " ";
+ }
+ result += "]";
+ }
+ delete iter;
+ return result;
+ }
+
+ int NumTableFilesAtLevel(int level) {
+ std::string property;
+ ASSERT_TRUE(db_->GetProperty(
+ "leveldb.num-files-at-level" + NumberToString(level), &property));
+ return std::stoi(property);
+ }
+
+ int TotalTableFiles() {
+ int result = 0;
+ for (int level = 0; level < config::kNumLevels; level++) {
+ result += NumTableFilesAtLevel(level);
+ }
+ return result;
+ }
+
+ // Return spread of files per level
+ std::string FilesPerLevel() {
+ std::string result;
+ int last_non_zero_offset = 0;
+ for (int level = 0; level < config::kNumLevels; level++) {
+ int f = NumTableFilesAtLevel(level);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = result.size();
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+ }
+
+ int CountFiles() {
+ std::vector<std::string> files;
+ env_->GetChildren(dbname_, &files);
+ return static_cast<int>(files.size());
+ }
+
+ uint64_t Size(const Slice& start, const Slice& limit) {
+ Range r(start, limit);
+ uint64_t size;
+ db_->GetApproximateSizes(&r, 1, &size);
+ return size;
+ }
+
+ void Compact(const Slice& start, const Slice& limit) {
+ db_->CompactRange(&start, &limit);
+ }
+
+ // Do n memtable compactions, each of which produces an sstable
+ // covering the range [small_key,large_key].
+ void MakeTables(int n, const std::string& small_key,
+ const std::string& large_key) {
+ for (int i = 0; i < n; i++) {
+ Put(small_key, "begin");
+ Put(large_key, "end");
+ dbfull()->TEST_CompactMemTable();
+ }
+ }
+
+ // Prevent pushing of new sstables into deeper levels by adding
+ // tables that cover a specified range to all levels.
+ void FillLevels(const std::string& smallest, const std::string& largest) {
+ MakeTables(config::kNumLevels, smallest, largest);
+ }
+
+ void DumpFileCounts(const char* label) {
+ fprintf(stderr, "---\n%s:\n", label);
+ fprintf(
+ stderr, "maxoverlap: %lld\n",
+ static_cast<long long>(dbfull()->TEST_MaxNextLevelOverlappingBytes()));
+ for (int level = 0; level < config::kNumLevels; level++) {
+ int num = NumTableFilesAtLevel(level);
+ if (num > 0) {
+ fprintf(stderr, " level %3d : %d files\n", level, num);
+ }
+ }
+ }
+
+ std::string DumpSSTableList() {
+ std::string property;
+ db_->GetProperty("leveldb.sstables", &property);
+ return property;
+ }
+
+ std::string IterStatus(Iterator* iter) {
+ std::string result;
+ if (iter->Valid()) {
+ result = iter->key().ToString() + "->" + iter->value().ToString();
+ } else {
+ result = "(invalid)";
+ }
+ return result;
+ }
+
+ bool DeleteAnSSTFile() {
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+ uint64_t number;
+ FileType type;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) {
+ ASSERT_OK(env_->DeleteFile(TableFileName(dbname_, number)));
+ return true;
+ }
+ }
+ return false;
+ }
+
+ // Returns number of files renamed.
+ int RenameLDBToSST() {
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+ uint64_t number;
+ FileType type;
+ int files_renamed = 0;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) {
+ const std::string from = TableFileName(dbname_, number);
+ const std::string to = SSTTableFileName(dbname_, number);
+ ASSERT_OK(env_->RenameFile(from, to));
+ files_renamed++;
+ }
+ }
+ return files_renamed;
+ }
+
+ private:
+ // Sequence of option configurations to try
+ enum OptionConfig { kDefault, kReuse, kFilter, kUncompressed, kEnd };
+
+ const FilterPolicy* filter_policy_;
+ int option_config_;
+};
+
+TEST(DBTest, Empty) {
+ do {
+ ASSERT_TRUE(db_ != nullptr);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, EmptyKey) {
+ do {
+ ASSERT_OK(Put("", "v1"));
+ ASSERT_EQ("v1", Get(""));
+ ASSERT_OK(Put("", "v2"));
+ ASSERT_EQ("v2", Get(""));
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, EmptyValue) {
+ do {
+ ASSERT_OK(Put("key", "v1"));
+ ASSERT_EQ("v1", Get("key"));
+ ASSERT_OK(Put("key", ""));
+ ASSERT_EQ("", Get("key"));
+ ASSERT_OK(Put("key", "v2"));
+ ASSERT_EQ("v2", Get("key"));
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, ReadWrite) {
+ do {
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_EQ("v1", Get("foo"));
+ ASSERT_OK(Put("bar", "v2"));
+ ASSERT_OK(Put("foo", "v3"));
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, PutDeleteGet) {
+ do {
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
+ ASSERT_EQ("v1", Get("foo"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
+ ASSERT_EQ("v2", Get("foo"));
+ ASSERT_OK(db_->Delete(WriteOptions(), "foo"));
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, GetFromImmutableLayer) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ Reopen(&options);
+
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_EQ("v1", Get("foo"));
+
+ // Block sync calls.
+ env_->delay_data_sync_.store(true, std::memory_order_release);
+ Put("k1", std::string(100000, 'x')); // Fill memtable.
+ Put("k2", std::string(100000, 'y')); // Trigger compaction.
+ ASSERT_EQ("v1", Get("foo"));
+ // Release sync calls.
+ env_->delay_data_sync_.store(false, std::memory_order_release);
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, GetFromVersions) {
+ do {
+ ASSERT_OK(Put("foo", "v1"));
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ("v1", Get("foo"));
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, GetMemUsage) {
+ do {
+ ASSERT_OK(Put("foo", "v1"));
+ std::string val;
+ ASSERT_TRUE(db_->GetProperty("leveldb.approximate-memory-usage", &val));
+ int mem_usage = std::stoi(val);
+ ASSERT_GT(mem_usage, 0);
+ ASSERT_LT(mem_usage, 5 * 1024 * 1024);
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, GetSnapshot) {
+ do {
+ // Try with both a short key and a long key
+ for (int i = 0; i < 2; i++) {
+ std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x');
+ ASSERT_OK(Put(key, "v1"));
+ const Snapshot* s1 = db_->GetSnapshot();
+ ASSERT_OK(Put(key, "v2"));
+ ASSERT_EQ("v2", Get(key));
+ ASSERT_EQ("v1", Get(key, s1));
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ("v2", Get(key));
+ ASSERT_EQ("v1", Get(key, s1));
+ db_->ReleaseSnapshot(s1);
+ }
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, GetIdenticalSnapshots) {
+ do {
+ // Try with both a short key and a long key
+ for (int i = 0; i < 2; i++) {
+ std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x');
+ ASSERT_OK(Put(key, "v1"));
+ const Snapshot* s1 = db_->GetSnapshot();
+ const Snapshot* s2 = db_->GetSnapshot();
+ const Snapshot* s3 = db_->GetSnapshot();
+ ASSERT_OK(Put(key, "v2"));
+ ASSERT_EQ("v2", Get(key));
+ ASSERT_EQ("v1", Get(key, s1));
+ ASSERT_EQ("v1", Get(key, s2));
+ ASSERT_EQ("v1", Get(key, s3));
+ db_->ReleaseSnapshot(s1);
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ("v2", Get(key));
+ ASSERT_EQ("v1", Get(key, s2));
+ db_->ReleaseSnapshot(s2);
+ ASSERT_EQ("v1", Get(key, s3));
+ db_->ReleaseSnapshot(s3);
+ }
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, IterateOverEmptySnapshot) {
+ do {
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ReadOptions read_options;
+ read_options.snapshot = snapshot;
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("foo", "v2"));
+
+ Iterator* iterator1 = db_->NewIterator(read_options);
+ iterator1->SeekToFirst();
+ ASSERT_TRUE(!iterator1->Valid());
+ delete iterator1;
+
+ dbfull()->TEST_CompactMemTable();
+
+ Iterator* iterator2 = db_->NewIterator(read_options);
+ iterator2->SeekToFirst();
+ ASSERT_TRUE(!iterator2->Valid());
+ delete iterator2;
+
+ db_->ReleaseSnapshot(snapshot);
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, GetLevel0Ordering) {
+ do {
+ // Check that we process level-0 files in correct order. The code
+ // below generates two level-0 files where the earlier one comes
+ // before the later one in the level-0 file list since the earlier
+ // one has a smaller "smallest" key.
+ ASSERT_OK(Put("bar", "b"));
+ ASSERT_OK(Put("foo", "v1"));
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_OK(Put("foo", "v2"));
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ("v2", Get("foo"));
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, GetOrderedByLevels) {
+ do {
+ ASSERT_OK(Put("foo", "v1"));
+ Compact("a", "z");
+ ASSERT_EQ("v1", Get("foo"));
+ ASSERT_OK(Put("foo", "v2"));
+ ASSERT_EQ("v2", Get("foo"));
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ("v2", Get("foo"));
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, GetPicksCorrectFile) {
+ do {
+ // Arrange to have multiple files in a non-level-0 level.
+ ASSERT_OK(Put("a", "va"));
+ Compact("a", "b");
+ ASSERT_OK(Put("x", "vx"));
+ Compact("x", "y");
+ ASSERT_OK(Put("f", "vf"));
+ Compact("f", "g");
+ ASSERT_EQ("va", Get("a"));
+ ASSERT_EQ("vf", Get("f"));
+ ASSERT_EQ("vx", Get("x"));
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, GetEncountersEmptyLevel) {
+ do {
+ // Arrange for the following to happen:
+ // * sstable A in level 0
+ // * nothing in level 1
+ // * sstable B in level 2
+ // Then do enough Get() calls to arrange for an automatic compaction
+ // of sstable A. A bug would cause the compaction to be marked as
+ // occurring at level 1 (instead of the correct level 0).
+
+ // Step 1: First place sstables in levels 0 and 2
+ int compaction_count = 0;
+ while (NumTableFilesAtLevel(0) == 0 || NumTableFilesAtLevel(2) == 0) {
+ ASSERT_LE(compaction_count, 100) << "could not fill levels 0 and 2";
+ compaction_count++;
+ Put("a", "begin");
+ Put("z", "end");
+ dbfull()->TEST_CompactMemTable();
+ }
+
+ // Step 2: clear level 1 if necessary.
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 1);
+
+ // Step 3: read a bunch of times
+ for (int i = 0; i < 1000; i++) {
+ ASSERT_EQ("NOT_FOUND", Get("missing"));
+ }
+
+ // Step 4: Wait for compaction to finish
+ DelayMilliseconds(1000);
+
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, IterEmpty) {
+ Iterator* iter = db_->NewIterator(ReadOptions());
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("foo");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+}
+
+TEST(DBTest, IterSingle) {
+ ASSERT_OK(Put("a", "va"));
+ Iterator* iter = db_->NewIterator(ReadOptions());
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("a");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("b");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+}
+
+TEST(DBTest, IterMulti) {
+ ASSERT_OK(Put("a", "va"));
+ ASSERT_OK(Put("b", "vb"));
+ ASSERT_OK(Put("c", "vc"));
+ Iterator* iter = db_->NewIterator(ReadOptions());
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Seek("a");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Seek("ax");
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Seek("b");
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Seek("z");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ // Switch from reverse to forward
+ iter->SeekToLast();
+ iter->Prev();
+ iter->Prev();
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+
+ // Switch from forward to reverse
+ iter->SeekToFirst();
+ iter->Next();
+ iter->Next();
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+
+ // Make sure iter stays at snapshot
+ ASSERT_OK(Put("a", "va2"));
+ ASSERT_OK(Put("a2", "va3"));
+ ASSERT_OK(Put("b", "vb2"));
+ ASSERT_OK(Put("c", "vc2"));
+ ASSERT_OK(Delete("b"));
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+}
+
+TEST(DBTest, IterSmallAndLargeMix) {
+ ASSERT_OK(Put("a", "va"));
+ ASSERT_OK(Put("b", std::string(100000, 'b')));
+ ASSERT_OK(Put("c", "vc"));
+ ASSERT_OK(Put("d", std::string(100000, 'd')));
+ ASSERT_OK(Put("e", std::string(100000, 'e')));
+
+ Iterator* iter = db_->NewIterator(ReadOptions());
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+}
+
+TEST(DBTest, IterMultiWithDelete) {
+ do {
+ ASSERT_OK(Put("a", "va"));
+ ASSERT_OK(Put("b", "vb"));
+ ASSERT_OK(Put("c", "vc"));
+ ASSERT_OK(Delete("b"));
+ ASSERT_EQ("NOT_FOUND", Get("b"));
+
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ iter->Seek("c");
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ delete iter;
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, Recover) {
+ do {
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("baz", "v5"));
+
+ Reopen();
+ ASSERT_EQ("v1", Get("foo"));
+
+ ASSERT_EQ("v1", Get("foo"));
+ ASSERT_EQ("v5", Get("baz"));
+ ASSERT_OK(Put("bar", "v2"));
+ ASSERT_OK(Put("foo", "v3"));
+
+ Reopen();
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_OK(Put("foo", "v4"));
+ ASSERT_EQ("v4", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ ASSERT_EQ("v5", Get("baz"));
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, RecoveryWithEmptyLog) {
+ do {
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("foo", "v2"));
+ Reopen();
+ Reopen();
+ ASSERT_OK(Put("foo", "v3"));
+ Reopen();
+ ASSERT_EQ("v3", Get("foo"));
+ } while (ChangeOptions());
+}
+
+// Check that writes done during a memtable compaction are recovered
+// if the database is shutdown during the memtable compaction.
+TEST(DBTest, RecoverDuringMemtableCompaction) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 1000000;
+ Reopen(&options);
+
+ // Trigger a long memtable compaction and reopen the database during it
+ ASSERT_OK(Put("foo", "v1")); // Goes to 1st log file
+ ASSERT_OK(Put("big1", std::string(10000000, 'x'))); // Fills memtable
+ ASSERT_OK(Put("big2", std::string(1000, 'y'))); // Triggers compaction
+ ASSERT_OK(Put("bar", "v2")); // Goes to new log file
+
+ Reopen(&options);
+ ASSERT_EQ("v1", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ ASSERT_EQ(std::string(10000000, 'x'), Get("big1"));
+ ASSERT_EQ(std::string(1000, 'y'), Get("big2"));
+ } while (ChangeOptions());
+}
+
+static std::string Key(int i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key%06d", i);
+ return std::string(buf);
+}
+
+TEST(DBTest, MinorCompactionsHappen) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10000;
+ Reopen(&options);
+
+ const int N = 500;
+
+ int starting_num_tables = TotalTableFiles();
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v')));
+ }
+ int ending_num_tables = TotalTableFiles();
+ ASSERT_GT(ending_num_tables, starting_num_tables);
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i)));
+ }
+
+ Reopen();
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i)));
+ }
+}
+
+TEST(DBTest, RecoverWithLargeLog) {
+ {
+ Options options = CurrentOptions();
+ Reopen(&options);
+ ASSERT_OK(Put("big1", std::string(200000, '1')));
+ ASSERT_OK(Put("big2", std::string(200000, '2')));
+ ASSERT_OK(Put("small3", std::string(10, '3')));
+ ASSERT_OK(Put("small4", std::string(10, '4')));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ }
+
+ // Make sure that if we re-open with a small write buffer size that
+ // we flush table files in the middle of a large log file.
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000;
+ Reopen(&options);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+ ASSERT_EQ(std::string(200000, '1'), Get("big1"));
+ ASSERT_EQ(std::string(200000, '2'), Get("big2"));
+ ASSERT_EQ(std::string(10, '3'), Get("small3"));
+ ASSERT_EQ(std::string(10, '4'), Get("small4"));
+ ASSERT_GT(NumTableFilesAtLevel(0), 1);
+}
+
+TEST(DBTest, CompactionsGenerateMultipleFiles) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ Reopen(&options);
+
+ Random rnd(301);
+
+ // Write 8MB (80 values, each 100K)
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ std::vector<std::string> values;
+ for (int i = 0; i < 80; i++) {
+ values.push_back(RandomString(&rnd, 100000));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+
+ // Reopening moves updates to level-0
+ Reopen(&options);
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1), 1);
+ for (int i = 0; i < 80; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+}
+
+TEST(DBTest, RepeatedWritesToSameKey) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ Reopen(&options);
+
+ // We must have at most one file per level except for level-0,
+ // which may have up to kL0_StopWritesTrigger files.
+ const int kMaxFiles = config::kNumLevels + config::kL0_StopWritesTrigger;
+
+ Random rnd(301);
+ std::string value = RandomString(&rnd, 2 * options.write_buffer_size);
+ for (int i = 0; i < 5 * kMaxFiles; i++) {
+ Put("key", value);
+ ASSERT_LE(TotalTableFiles(), kMaxFiles);
+ fprintf(stderr, "after %d: %d files\n", i + 1, TotalTableFiles());
+ }
+}
+
+TEST(DBTest, SparseMerge) {
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ Reopen(&options);
+
+ FillLevels("A", "Z");
+
+ // Suppose there is:
+ // small amount of data with prefix A
+ // large amount of data with prefix B
+ // small amount of data with prefix C
+ // and that recent updates have made small changes to all three prefixes.
+ // Check that we do not do a compaction that merges all of B in one shot.
+ const std::string value(1000, 'x');
+ Put("A", "va");
+ // Write approximately 100MB of "B" values
+ for (int i = 0; i < 100000; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ Put(key, value);
+ }
+ Put("C", "vc");
+ dbfull()->TEST_CompactMemTable();
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+
+ // Make sparse update
+ Put("A", "va2");
+ Put("B100", "bvalue2");
+ Put("C", "vc2");
+ dbfull()->TEST_CompactMemTable();
+
+ // Compactions should not cause us to create a situation where
+ // a file overlaps too much data at the next level.
+ ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20 * 1048576);
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+ ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20 * 1048576);
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+ ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20 * 1048576);
+}
+
+static bool Between(uint64_t val, uint64_t low, uint64_t high) {
+ bool result = (val >= low) && (val <= high);
+ if (!result) {
+ fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
+ (unsigned long long)(val), (unsigned long long)(low),
+ (unsigned long long)(high));
+ }
+ return result;
+}
+
+TEST(DBTest, ApproximateSizes) {
+ do {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ options.compression = kNoCompression;
+ DestroyAndReopen();
+
+ ASSERT_TRUE(Between(Size("", "xyz"), 0, 0));
+ Reopen(&options);
+ ASSERT_TRUE(Between(Size("", "xyz"), 0, 0));
+
+ // Write 8MB (80 values, each 100K)
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ const int N = 80;
+ static const int S1 = 100000;
+ static const int S2 = 105000; // Allow some expansion from metadata
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, S1)));
+ }
+
+ // 0 because GetApproximateSizes() does not account for memtable space
+ ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
+
+ if (options.reuse_logs) {
+ // Recovery will reuse memtable, and GetApproximateSizes() does not
+ // account for memtable usage;
+ Reopen(&options);
+ ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
+ continue;
+ }
+
+ // Check sizes across recovery by reopening a few times
+ for (int run = 0; run < 3; run++) {
+ Reopen(&options);
+
+ for (int compact_start = 0; compact_start < N; compact_start += 10) {
+ for (int i = 0; i < N; i += 10) {
+ ASSERT_TRUE(Between(Size("", Key(i)), S1 * i, S2 * i));
+ ASSERT_TRUE(Between(Size("", Key(i) + ".suffix"), S1 * (i + 1),
+ S2 * (i + 1)));
+ ASSERT_TRUE(Between(Size(Key(i), Key(i + 10)), S1 * 10, S2 * 10));
+ }
+ ASSERT_TRUE(Between(Size("", Key(50)), S1 * 50, S2 * 50));
+ ASSERT_TRUE(Between(Size("", Key(50) + ".suffix"), S1 * 50, S2 * 50));
+
+ std::string cstart_str = Key(compact_start);
+ std::string cend_str = Key(compact_start + 9);
+ Slice cstart = cstart_str;
+ Slice cend = cend_str;
+ dbfull()->TEST_CompactRange(0, &cstart, &cend);
+ }
+
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ }
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
+ do {
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ Reopen();
+
+ Random rnd(301);
+ std::string big1 = RandomString(&rnd, 100000);
+ ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000)));
+ ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000)));
+ ASSERT_OK(Put(Key(2), big1));
+ ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000)));
+ ASSERT_OK(Put(Key(4), big1));
+ ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000)));
+ ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000)));
+ ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000)));
+
+ if (options.reuse_logs) {
+ // Need to force a memtable compaction since recovery does not do so.
+ ASSERT_OK(dbfull()->TEST_CompactMemTable());
+ }
+
+ // Check sizes across recovery by reopening a few times
+ for (int run = 0; run < 3; run++) {
+ Reopen(&options);
+
+ ASSERT_TRUE(Between(Size("", Key(0)), 0, 0));
+ ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000));
+ ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000));
+ ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000));
+ ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000));
+ ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000));
+ ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000));
+ ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000));
+ ASSERT_TRUE(Between(Size("", Key(8)), 550000, 560000));
+
+ ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000));
+
+ dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+ }
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, IteratorPinsRef) {
+ Put("foo", "hello");
+
+ // Get iterator that will yield the current contents of the DB.
+ Iterator* iter = db_->NewIterator(ReadOptions());
+
+ // Write to force compactions
+ Put("foo", "newvalue1");
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values
+ }
+ Put("foo", "newvalue2");
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("hello", iter->value().ToString());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ delete iter;
+}
+
+TEST(DBTest, Snapshot) {
+ do {
+ Put("foo", "v1");
+ const Snapshot* s1 = db_->GetSnapshot();
+ Put("foo", "v2");
+ const Snapshot* s2 = db_->GetSnapshot();
+ Put("foo", "v3");
+ const Snapshot* s3 = db_->GetSnapshot();
+
+ Put("foo", "v4");
+ ASSERT_EQ("v1", Get("foo", s1));
+ ASSERT_EQ("v2", Get("foo", s2));
+ ASSERT_EQ("v3", Get("foo", s3));
+ ASSERT_EQ("v4", Get("foo"));
+
+ db_->ReleaseSnapshot(s3);
+ ASSERT_EQ("v1", Get("foo", s1));
+ ASSERT_EQ("v2", Get("foo", s2));
+ ASSERT_EQ("v4", Get("foo"));
+
+ db_->ReleaseSnapshot(s1);
+ ASSERT_EQ("v2", Get("foo", s2));
+ ASSERT_EQ("v4", Get("foo"));
+
+ db_->ReleaseSnapshot(s2);
+ ASSERT_EQ("v4", Get("foo"));
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, HiddenValuesAreRemoved) {
+ do {
+ Random rnd(301);
+ FillLevels("a", "z");
+
+ std::string big = RandomString(&rnd, 50000);
+ Put("foo", big);
+ Put("pastfoo", "v");
+ const Snapshot* snapshot = db_->GetSnapshot();
+ Put("foo", "tiny");
+ Put("pastfoo2", "v2"); // Advance sequence number one more
+
+ ASSERT_OK(dbfull()->TEST_CompactMemTable());
+ ASSERT_GT(NumTableFilesAtLevel(0), 0);
+
+ ASSERT_EQ(big, Get("foo", snapshot));
+ ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000));
+ db_->ReleaseSnapshot(snapshot);
+ ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]");
+ Slice x("x");
+ dbfull()->TEST_CompactRange(0, nullptr, &x);
+ ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_GE(NumTableFilesAtLevel(1), 1);
+ dbfull()->TEST_CompactRange(1, nullptr, &x);
+ ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
+
+ ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, DeletionMarkers1) {
+ Put("foo", "v1");
+ ASSERT_OK(dbfull()->TEST_CompactMemTable());
+ const int last = config::kMaxMemCompactLevel;
+ ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level
+
+ // Place a table at level last-1 to prevent merging with preceding mutation
+ Put("a", "begin");
+ Put("z", "end");
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ(NumTableFilesAtLevel(last), 1);
+ ASSERT_EQ(NumTableFilesAtLevel(last - 1), 1);
+
+ Delete("foo");
+ Put("foo", "v2");
+ ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
+ ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2
+ ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
+ Slice z("z");
+ dbfull()->TEST_CompactRange(last - 2, nullptr, &z);
+ // DEL eliminated, but v1 remains because we aren't compacting that level
+ // (DEL can be eliminated because v2 hides v1).
+ ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]");
+ dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr);
+ // Merging last-1 w/ last, so we are the base level for "foo", so
+ // DEL is removed. (as is v1).
+ ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]");
+}
+
+TEST(DBTest, DeletionMarkers2) {
+ Put("foo", "v1");
+ ASSERT_OK(dbfull()->TEST_CompactMemTable());
+ const int last = config::kMaxMemCompactLevel;
+ ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level
+
+ // Place a table at level last-1 to prevent merging with preceding mutation
+ Put("a", "begin");
+ Put("z", "end");
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ(NumTableFilesAtLevel(last), 1);
+ ASSERT_EQ(NumTableFilesAtLevel(last - 1), 1);
+
+ Delete("foo");
+ ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+ ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2
+ ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+ dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr);
+ // DEL kept: "last" file overlaps
+ ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+ dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr);
+ // Merging last-1 w/ last, so we are the base level for "foo", so
+ // DEL is removed. (as is v1).
+ ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
+}
+
+TEST(DBTest, OverlapInLevel0) {
+ do {
+ ASSERT_EQ(config::kMaxMemCompactLevel, 2) << "Fix test to match config";
+
+ // Fill levels 1 and 2 to disable the pushing of new memtables to levels >
+ // 0.
+ ASSERT_OK(Put("100", "v100"));
+ ASSERT_OK(Put("999", "v999"));
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_OK(Delete("100"));
+ ASSERT_OK(Delete("999"));
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ("0,1,1", FilesPerLevel());
+
+ // Make files spanning the following ranges in level-0:
+ // files[0] 200 .. 900
+ // files[1] 300 .. 500
+ // Note that files are sorted by smallest key.
+ ASSERT_OK(Put("300", "v300"));
+ ASSERT_OK(Put("500", "v500"));
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_OK(Put("200", "v200"));
+ ASSERT_OK(Put("600", "v600"));
+ ASSERT_OK(Put("900", "v900"));
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ("2,1,1", FilesPerLevel());
+
+ // Compact away the placeholder files we created initially
+ dbfull()->TEST_CompactRange(1, nullptr, nullptr);
+ dbfull()->TEST_CompactRange(2, nullptr, nullptr);
+ ASSERT_EQ("2", FilesPerLevel());
+
+ // Do a memtable compaction. Before bug-fix, the compaction would
+ // not detect the overlap with level-0 files and would incorrectly place
+ // the deletion in a deeper level.
+ ASSERT_OK(Delete("600"));
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ("3", FilesPerLevel());
+ ASSERT_EQ("NOT_FOUND", Get("600"));
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, L0_CompactionBug_Issue44_a) {
+ Reopen();
+ ASSERT_OK(Put("b", "v"));
+ Reopen();
+ ASSERT_OK(Delete("b"));
+ ASSERT_OK(Delete("a"));
+ Reopen();
+ ASSERT_OK(Delete("a"));
+ Reopen();
+ ASSERT_OK(Put("a", "v"));
+ Reopen();
+ Reopen();
+ ASSERT_EQ("(a->v)", Contents());
+ DelayMilliseconds(1000); // Wait for compaction to finish
+ ASSERT_EQ("(a->v)", Contents());
+}
+
+TEST(DBTest, L0_CompactionBug_Issue44_b) {
+ Reopen();
+ Put("", "");
+ Reopen();
+ Delete("e");
+ Put("", "");
+ Reopen();
+ Put("c", "cv");
+ Reopen();
+ Put("", "");
+ Reopen();
+ Put("", "");
+ DelayMilliseconds(1000); // Wait for compaction to finish
+ Reopen();
+ Put("d", "dv");
+ Reopen();
+ Put("", "");
+ Reopen();
+ Delete("d");
+ Delete("b");
+ Reopen();
+ ASSERT_EQ("(->)(c->cv)", Contents());
+ DelayMilliseconds(1000); // Wait for compaction to finish
+ ASSERT_EQ("(->)(c->cv)", Contents());
+}
+
+TEST(DBTest, Fflush_Issue474) {
+ static const int kNum = 100000;
+ Random rnd(test::RandomSeed());
+ for (int i = 0; i < kNum; i++) {
+ fflush(nullptr);
+ ASSERT_OK(Put(RandomKey(&rnd), RandomString(&rnd, 100)));
+ }
+}
+
+TEST(DBTest, ComparatorCheck) {
+ class NewComparator : public Comparator {
+ public:
+ const char* Name() const override { return "leveldb.NewComparator"; }
+ int Compare(const Slice& a, const Slice& b) const override {
+ return BytewiseComparator()->Compare(a, b);
+ }
+ void FindShortestSeparator(std::string* s, const Slice& l) const override {
+ BytewiseComparator()->FindShortestSeparator(s, l);
+ }
+ void FindShortSuccessor(std::string* key) const override {
+ BytewiseComparator()->FindShortSuccessor(key);
+ }
+ };
+ NewComparator cmp;
+ Options new_options = CurrentOptions();
+ new_options.comparator = &cmp;
+ Status s = TryReopen(&new_options);
+ ASSERT_TRUE(!s.ok());
+ ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
+ << s.ToString();
+}
+
+TEST(DBTest, CustomComparator) {
+ class NumberComparator : public Comparator {
+ public:
+ const char* Name() const override { return "test.NumberComparator"; }
+ int Compare(const Slice& a, const Slice& b) const override {
+ return ToNumber(a) - ToNumber(b);
+ }
+ void FindShortestSeparator(std::string* s, const Slice& l) const override {
+ ToNumber(*s); // Check format
+ ToNumber(l); // Check format
+ }
+ void FindShortSuccessor(std::string* key) const override {
+ ToNumber(*key); // Check format
+ }
+
+ private:
+ static int ToNumber(const Slice& x) {
+ // Check that there are no extra characters.
+ ASSERT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size() - 1] == ']')
+ << EscapeString(x);
+ int val;
+ char ignored;
+ ASSERT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1)
+ << EscapeString(x);
+ return val;
+ }
+ };
+ NumberComparator cmp;
+ Options new_options = CurrentOptions();
+ new_options.create_if_missing = true;
+ new_options.comparator = &cmp;
+ new_options.filter_policy = nullptr; // Cannot use bloom filters
+ new_options.write_buffer_size = 1000; // Compact more often
+ DestroyAndReopen(&new_options);
+ ASSERT_OK(Put("[10]", "ten"));
+ ASSERT_OK(Put("[0x14]", "twenty"));
+ for (int i = 0; i < 2; i++) {
+ ASSERT_EQ("ten", Get("[10]"));
+ ASSERT_EQ("ten", Get("[0xa]"));
+ ASSERT_EQ("twenty", Get("[20]"));
+ ASSERT_EQ("twenty", Get("[0x14]"));
+ ASSERT_EQ("NOT_FOUND", Get("[15]"));
+ ASSERT_EQ("NOT_FOUND", Get("[0xf]"));
+ Compact("[0]", "[9999]");
+ }
+
+ for (int run = 0; run < 2; run++) {
+ for (int i = 0; i < 1000; i++) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "[%d]", i * 10);
+ ASSERT_OK(Put(buf, buf));
+ }
+ Compact("[0]", "[1000000]");
+ }
+}
+
+TEST(DBTest, ManualCompaction) {
+ ASSERT_EQ(config::kMaxMemCompactLevel, 2)
+ << "Need to update this test to match kMaxMemCompactLevel";
+
+ MakeTables(3, "p", "q");
+ ASSERT_EQ("1,1,1", FilesPerLevel());
+
+ // Compaction range falls before files
+ Compact("", "c");
+ ASSERT_EQ("1,1,1", FilesPerLevel());
+
+ // Compaction range falls after files
+ Compact("r", "z");
+ ASSERT_EQ("1,1,1", FilesPerLevel());
+
+ // Compaction range overlaps files
+ Compact("p1", "p9");
+ ASSERT_EQ("0,0,1", FilesPerLevel());
+
+ // Populate a different range
+ MakeTables(3, "c", "e");
+ ASSERT_EQ("1,1,2", FilesPerLevel());
+
+ // Compact just the new range
+ Compact("b", "f");
+ ASSERT_EQ("0,0,2", FilesPerLevel());
+
+ // Compact all
+ MakeTables(1, "a", "z");
+ ASSERT_EQ("0,1,2", FilesPerLevel());
+ db_->CompactRange(nullptr, nullptr);
+ ASSERT_EQ("0,0,1", FilesPerLevel());
+}
+
+TEST(DBTest, DBOpen_Options) {
+ std::string dbname = test::TmpDir() + "/db_options_test";
+ DestroyDB(dbname, Options());
+
+ // Does not exist, and create_if_missing == false: error
+ DB* db = nullptr;
+ Options opts;
+ opts.create_if_missing = false;
+ Status s = DB::Open(opts, dbname, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
+ ASSERT_TRUE(db == nullptr);
+
+ // Does not exist, and create_if_missing == true: OK
+ opts.create_if_missing = true;
+ s = DB::Open(opts, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ delete db;
+ db = nullptr;
+
+ // Does exist, and error_if_exists == true: error
+ opts.create_if_missing = false;
+ opts.error_if_exists = true;
+ s = DB::Open(opts, dbname, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
+ ASSERT_TRUE(db == nullptr);
+
+ // Does exist, and error_if_exists == false: OK
+ opts.create_if_missing = true;
+ opts.error_if_exists = false;
+ s = DB::Open(opts, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ delete db;
+ db = nullptr;
+}
+
+TEST(DBTest, DestroyEmptyDir) {
+ std::string dbname = test::TmpDir() + "/db_empty_dir";
+ TestEnv env(Env::Default());
+ env.DeleteDir(dbname);
+ ASSERT_TRUE(!env.FileExists(dbname));
+
+ Options opts;
+ opts.env = &env;
+
+ ASSERT_OK(env.CreateDir(dbname));
+ ASSERT_TRUE(env.FileExists(dbname));
+ std::vector<std::string> children;
+ ASSERT_OK(env.GetChildren(dbname, &children));
+ // The stock Env's do not filter out '.' and '..' special files.
+ ASSERT_EQ(2, children.size());
+ ASSERT_OK(DestroyDB(dbname, opts));
+ ASSERT_TRUE(!env.FileExists(dbname));
+
+ // Should also be destroyed if Env is filtering out dot files.
+ env.SetIgnoreDotFiles(true);
+ ASSERT_OK(env.CreateDir(dbname));
+ ASSERT_TRUE(env.FileExists(dbname));
+ ASSERT_OK(env.GetChildren(dbname, &children));
+ ASSERT_EQ(0, children.size());
+ ASSERT_OK(DestroyDB(dbname, opts));
+ ASSERT_TRUE(!env.FileExists(dbname));
+}
+
+TEST(DBTest, DestroyOpenDB) {
+ std::string dbname = test::TmpDir() + "/open_db_dir";
+ env_->DeleteDir(dbname);
+ ASSERT_TRUE(!env_->FileExists(dbname));
+
+ Options opts;
+ opts.create_if_missing = true;
+ DB* db = nullptr;
+ ASSERT_OK(DB::Open(opts, dbname, &db));
+ ASSERT_TRUE(db != nullptr);
+
+ // Must fail to destroy an open db.
+ ASSERT_TRUE(env_->FileExists(dbname));
+ ASSERT_TRUE(!DestroyDB(dbname, Options()).ok());
+ ASSERT_TRUE(env_->FileExists(dbname));
+
+ delete db;
+ db = nullptr;
+
+ // Should succeed destroying a closed db.
+ ASSERT_OK(DestroyDB(dbname, Options()));
+ ASSERT_TRUE(!env_->FileExists(dbname));
+}
+
+TEST(DBTest, Locking) {
+ DB* db2 = nullptr;
+ Status s = DB::Open(CurrentOptions(), dbname_, &db2);
+ ASSERT_TRUE(!s.ok()) << "Locking did not prevent re-opening db";
+}
+
+// Check that number of files does not grow when we are out of space
+TEST(DBTest, NoSpace) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ Reopen(&options);
+
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_EQ("v1", Get("foo"));
+ Compact("a", "z");
+ const int num_files = CountFiles();
+ // Force out-of-space errors.
+ env_->no_space_.store(true, std::memory_order_release);
+ for (int i = 0; i < 10; i++) {
+ for (int level = 0; level < config::kNumLevels - 1; level++) {
+ dbfull()->TEST_CompactRange(level, nullptr, nullptr);
+ }
+ }
+ env_->no_space_.store(false, std::memory_order_release);
+ ASSERT_LT(CountFiles(), num_files + 3);
+}
+
+TEST(DBTest, NonWritableFileSystem) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1000;
+ options.env = env_;
+ Reopen(&options);
+ ASSERT_OK(Put("foo", "v1"));
+ // Force errors for new files.
+ env_->non_writable_.store(true, std::memory_order_release);
+ std::string big(100000, 'x');
+ int errors = 0;
+ for (int i = 0; i < 20; i++) {
+ fprintf(stderr, "iter %d; errors %d\n", i, errors);
+ if (!Put("foo", big).ok()) {
+ errors++;
+ DelayMilliseconds(100);
+ }
+ }
+ ASSERT_GT(errors, 0);
+ env_->non_writable_.store(false, std::memory_order_release);
+}
+
+TEST(DBTest, WriteSyncError) {
+ // Check that log sync errors cause the DB to disallow future writes.
+
+ // (a) Cause log sync calls to fail
+ Options options = CurrentOptions();
+ options.env = env_;
+ Reopen(&options);
+ env_->data_sync_error_.store(true, std::memory_order_release);
+
+ // (b) Normal write should succeed
+ WriteOptions w;
+ ASSERT_OK(db_->Put(w, "k1", "v1"));
+ ASSERT_EQ("v1", Get("k1"));
+
+ // (c) Do a sync write; should fail
+ w.sync = true;
+ ASSERT_TRUE(!db_->Put(w, "k2", "v2").ok());
+ ASSERT_EQ("v1", Get("k1"));
+ ASSERT_EQ("NOT_FOUND", Get("k2"));
+
+ // (d) make sync behave normally
+ env_->data_sync_error_.store(false, std::memory_order_release);
+
+ // (e) Do a non-sync write; should fail
+ w.sync = false;
+ ASSERT_TRUE(!db_->Put(w, "k3", "v3").ok());
+ ASSERT_EQ("v1", Get("k1"));
+ ASSERT_EQ("NOT_FOUND", Get("k2"));
+ ASSERT_EQ("NOT_FOUND", Get("k3"));
+}
+
+TEST(DBTest, ManifestWriteError) {
+ // Test for the following problem:
+ // (a) Compaction produces file F
+ // (b) Log record containing F is written to MANIFEST file, but Sync() fails
+ // (c) GC deletes F
+ // (d) After reopening DB, reads fail since deleted F is named in log record
+
+ // We iterate twice. In the second iteration, everything is the
+ // same except the log record never makes it to the MANIFEST file.
+ for (int iter = 0; iter < 2; iter++) {
+ std::atomic<bool>* error_type = (iter == 0) ? &env_->manifest_sync_error_
+ : &env_->manifest_write_error_;
+
+ // Insert foo=>bar mapping
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Memtable compaction (will succeed)
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ("bar", Get("foo"));
+ const int last = config::kMaxMemCompactLevel;
+ ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo=>bar is now in last level
+
+ // Merging compaction (will fail)
+ error_type->store(true, std::memory_order_release);
+ dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Recovery: should not lose data
+ error_type->store(false, std::memory_order_release);
+ Reopen(&options);
+ ASSERT_EQ("bar", Get("foo"));
+ }
+}
+
+TEST(DBTest, MissingSSTFile) {
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Dump the memtable to disk.
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ("bar", Get("foo"));
+
+ Close();
+ ASSERT_TRUE(DeleteAnSSTFile());
+ Options options = CurrentOptions();
+ options.paranoid_checks = true;
+ Status s = TryReopen(&options);
+ ASSERT_TRUE(!s.ok());
+ ASSERT_TRUE(s.ToString().find("issing") != std::string::npos) << s.ToString();
+}
+
+TEST(DBTest, StillReadSST) {
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Dump the memtable to disk.
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ("bar", Get("foo"));
+ Close();
+ ASSERT_GT(RenameLDBToSST(), 0);
+ Options options = CurrentOptions();
+ options.paranoid_checks = true;
+ Status s = TryReopen(&options);
+ ASSERT_TRUE(s.ok());
+ ASSERT_EQ("bar", Get("foo"));
+}
+
+TEST(DBTest, FilesDeletedAfterCompaction) {
+ ASSERT_OK(Put("foo", "v2"));
+ Compact("a", "z");
+ const int num_files = CountFiles();
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put("foo", "v2"));
+ Compact("a", "z");
+ }
+ ASSERT_EQ(CountFiles(), num_files);
+}
+
+TEST(DBTest, BloomFilter) {
+ env_->count_random_reads_ = true;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.block_cache = NewLRUCache(0); // Prevent cache hits
+ options.filter_policy = NewBloomFilterPolicy(10);
+ Reopen(&options);
+
+ // Populate multiple layers
+ const int N = 10000;
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i), Key(i)));
+ }
+ Compact("a", "z");
+ for (int i = 0; i < N; i += 100) {
+ ASSERT_OK(Put(Key(i), Key(i)));
+ }
+ dbfull()->TEST_CompactMemTable();
+
+ // Prevent auto compactions triggered by seeks
+ env_->delay_data_sync_.store(true, std::memory_order_release);
+
+ // Lookup present keys. Should rarely read from small sstable.
+ env_->random_read_counter_.Reset();
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i), Get(Key(i)));
+ }
+ int reads = env_->random_read_counter_.Read();
+ fprintf(stderr, "%d present => %d reads\n", N, reads);
+ ASSERT_GE(reads, N);
+ ASSERT_LE(reads, N + 2 * N / 100);
+
+ // Lookup present keys. Should rarely read from either sstable.
+ env_->random_read_counter_.Reset();
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(Key(i) + ".missing"));
+ }
+ reads = env_->random_read_counter_.Read();
+ fprintf(stderr, "%d missing => %d reads\n", N, reads);
+ ASSERT_LE(reads, 3 * N / 100);
+
+ env_->delay_data_sync_.store(false, std::memory_order_release);
+ Close();
+ delete options.block_cache;
+ delete options.filter_policy;
+}
+
+// Multi-threaded test:
+namespace {
+
+static const int kNumThreads = 4;
+static const int kTestSeconds = 10;
+static const int kNumKeys = 1000;
+
+struct MTState {
+ DBTest* test;
+ std::atomic<bool> stop;
+ std::atomic<int> counter[kNumThreads];
+ std::atomic<bool> thread_done[kNumThreads];
+};
+
+struct MTThread {
+ MTState* state;
+ int id;
+};
+
+static void MTThreadBody(void* arg) {
+ MTThread* t = reinterpret_cast<MTThread*>(arg);
+ int id = t->id;
+ DB* db = t->state->test->db_;
+ int counter = 0;
+ fprintf(stderr, "... starting thread %d\n", id);
+ Random rnd(1000 + id);
+ std::string value;
+ char valbuf[1500];
+ while (!t->state->stop.load(std::memory_order_acquire)) {
+ t->state->counter[id].store(counter, std::memory_order_release);
+
+ int key = rnd.Uniform(kNumKeys);
+ char keybuf[20];
+ snprintf(keybuf, sizeof(keybuf), "%016d", key);
+
+ if (rnd.OneIn(2)) {
+ // Write values of the form <key, my id, counter>.
+ // We add some padding for force compactions.
+ snprintf(valbuf, sizeof(valbuf), "%d.%d.%-1000d", key, id,
+ static_cast<int>(counter));
+ ASSERT_OK(db->Put(WriteOptions(), Slice(keybuf), Slice(valbuf)));
+ } else {
+ // Read a value and verify that it matches the pattern written above.
+ Status s = db->Get(ReadOptions(), Slice(keybuf), &value);
+ if (s.IsNotFound()) {
+ // Key has not yet been written
+ } else {
+ // Check that the writer thread counter is >= the counter in the value
+ ASSERT_OK(s);
+ int k, w, c;
+ ASSERT_EQ(3, sscanf(value.c_str(), "%d.%d.%d", &k, &w, &c)) << value;
+ ASSERT_EQ(k, key);
+ ASSERT_GE(w, 0);
+ ASSERT_LT(w, kNumThreads);
+ ASSERT_LE(c, t->state->counter[w].load(std::memory_order_acquire));
+ }
+ }
+ counter++;
+ }
+ t->state->thread_done[id].store(true, std::memory_order_release);
+ fprintf(stderr, "... stopping thread %d after %d ops\n", id, counter);
+}
+
+} // namespace
+
+TEST(DBTest, MultiThreaded) {
+ do {
+ // Initialize state
+ MTState mt;
+ mt.test = this;
+ mt.stop.store(false, std::memory_order_release);
+ for (int id = 0; id < kNumThreads; id++) {
+ mt.counter[id].store(false, std::memory_order_release);
+ mt.thread_done[id].store(false, std::memory_order_release);
+ }
+
+ // Start threads
+ MTThread thread[kNumThreads];
+ for (int id = 0; id < kNumThreads; id++) {
+ thread[id].state = &mt;
+ thread[id].id = id;
+ env_->StartThread(MTThreadBody, &thread[id]);
+ }
+
+ // Let them run for a while
+ DelayMilliseconds(kTestSeconds * 1000);
+
+ // Stop the threads and wait for them to finish
+ mt.stop.store(true, std::memory_order_release);
+ for (int id = 0; id < kNumThreads; id++) {
+ while (!mt.thread_done[id].load(std::memory_order_acquire)) {
+ DelayMilliseconds(100);
+ }
+ }
+ } while (ChangeOptions());
+}
+
+namespace {
+typedef std::map<std::string, std::string> KVMap;
+}
+
+class ModelDB : public DB {
+ public:
+ class ModelSnapshot : public Snapshot {
+ public:
+ KVMap map_;
+ };
+
+ explicit ModelDB(const Options& options) : options_(options) {}
+ ~ModelDB() override = default;
+ Status Put(const WriteOptions& o, const Slice& k, const Slice& v) override {
+ return DB::Put(o, k, v);
+ }
+ Status Delete(const WriteOptions& o, const Slice& key) override {
+ return DB::Delete(o, key);
+ }
+ Status Get(const ReadOptions& options, const Slice& key,
+ std::string* value) override {
+ assert(false); // Not implemented
+ return Status::NotFound(key);
+ }
+ Iterator* NewIterator(const ReadOptions& options) override {
+ if (options.snapshot == nullptr) {
+ KVMap* saved = new KVMap;
+ *saved = map_;
+ return new ModelIter(saved, true);
+ } else {
+ const KVMap* snapshot_state =
+ &(reinterpret_cast<const ModelSnapshot*>(options.snapshot)->map_);
+ return new ModelIter(snapshot_state, false);
+ }
+ }
+ const Snapshot* GetSnapshot() override {
+ ModelSnapshot* snapshot = new ModelSnapshot;
+ snapshot->map_ = map_;
+ return snapshot;
+ }
+
+ void ReleaseSnapshot(const Snapshot* snapshot) override {
+ delete reinterpret_cast<const ModelSnapshot*>(snapshot);
+ }
+ Status Write(const WriteOptions& options, WriteBatch* batch) override {
+ class Handler : public WriteBatch::Handler {
+ public:
+ KVMap* map_;
+ void Put(const Slice& key, const Slice& value) override {
+ (*map_)[key.ToString()] = value.ToString();
+ }
+ void Delete(const Slice& key) override { map_->erase(key.ToString()); }
+ };
+ Handler handler;
+ handler.map_ = &map_;
+ return batch->Iterate(&handler);
+ }
+
+ bool GetProperty(const Slice& property, std::string* value) override {
+ return false;
+ }
+ void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) override {
+ for (int i = 0; i < n; i++) {
+ sizes[i] = 0;
+ }
+ }
+ void CompactRange(const Slice* start, const Slice* end) override {}
+
+ private:
+ class ModelIter : public Iterator {
+ public:
+ ModelIter(const KVMap* map, bool owned)
+ : map_(map), owned_(owned), iter_(map_->end()) {}
+ ~ModelIter() override {
+ if (owned_) delete map_;
+ }
+ bool Valid() const override { return iter_ != map_->end(); }
+ void SeekToFirst() override { iter_ = map_->begin(); }
+ void SeekToLast() override {
+ if (map_->empty()) {
+ iter_ = map_->end();
+ } else {
+ iter_ = map_->find(map_->rbegin()->first);
+ }
+ }
+ void Seek(const Slice& k) override {
+ iter_ = map_->lower_bound(k.ToString());
+ }
+ void Next() override { ++iter_; }
+ void Prev() override { --iter_; }
+ Slice key() const override { return iter_->first; }
+ Slice value() const override { return iter_->second; }
+ Status status() const override { return Status::OK(); }
+
+ private:
+ const KVMap* const map_;
+ const bool owned_; // Do we own map_
+ KVMap::const_iterator iter_;
+ };
+ const Options options_;
+ KVMap map_;
+};
+
+static bool CompareIterators(int step, DB* model, DB* db,
+ const Snapshot* model_snap,
+ const Snapshot* db_snap) {
+ ReadOptions options;
+ options.snapshot = model_snap;
+ Iterator* miter = model->NewIterator(options);
+ options.snapshot = db_snap;
+ Iterator* dbiter = db->NewIterator(options);
+ bool ok = true;
+ int count = 0;
+ for (miter->SeekToFirst(), dbiter->SeekToFirst();
+ ok && miter->Valid() && dbiter->Valid(); miter->Next(), dbiter->Next()) {
+ count++;
+ if (miter->key().compare(dbiter->key()) != 0) {
+ fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n", step,
+ EscapeString(miter->key()).c_str(),
+ EscapeString(dbiter->key()).c_str());
+ ok = false;
+ break;
+ }
+
+ if (miter->value().compare(dbiter->value()) != 0) {
+ fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n",
+ step, EscapeString(miter->key()).c_str(),
+ EscapeString(miter->value()).c_str(),
+ EscapeString(miter->value()).c_str());
+ ok = false;
+ }
+ }
+
+ if (ok) {
+ if (miter->Valid() != dbiter->Valid()) {
+ fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n",
+ step, miter->Valid(), dbiter->Valid());
+ ok = false;
+ }
+ }
+ fprintf(stderr, "%d entries compared: ok=%d\n", count, ok);
+ delete miter;
+ delete dbiter;
+ return ok;
+}
+
+TEST(DBTest, Randomized) {
+ Random rnd(test::RandomSeed());
+ do {
+ ModelDB model(CurrentOptions());
+ const int N = 10000;
+ const Snapshot* model_snap = nullptr;
+ const Snapshot* db_snap = nullptr;
+ std::string k, v;
+ for (int step = 0; step < N; step++) {
+ if (step % 100 == 0) {
+ fprintf(stderr, "Step %d of %d\n", step, N);
+ }
+ // TODO(sanjay): Test Get() works
+ int p = rnd.Uniform(100);
+ if (p < 45) { // Put
+ k = RandomKey(&rnd);
+ v = RandomString(
+ &rnd, rnd.OneIn(20) ? 100 + rnd.Uniform(100) : rnd.Uniform(8));
+ ASSERT_OK(model.Put(WriteOptions(), k, v));
+ ASSERT_OK(db_->Put(WriteOptions(), k, v));
+
+ } else if (p < 90) { // Delete
+ k = RandomKey(&rnd);
+ ASSERT_OK(model.Delete(WriteOptions(), k));
+ ASSERT_OK(db_->Delete(WriteOptions(), k));
+
+ } else { // Multi-element batch
+ WriteBatch b;
+ const int num = rnd.Uniform(8);
+ for (int i = 0; i < num; i++) {
+ if (i == 0 || !rnd.OneIn(10)) {
+ k = RandomKey(&rnd);
+ } else {
+ // Periodically re-use the same key from the previous iter, so
+ // we have multiple entries in the write batch for the same key
+ }
+ if (rnd.OneIn(2)) {
+ v = RandomString(&rnd, rnd.Uniform(10));
+ b.Put(k, v);
+ } else {
+ b.Delete(k);
+ }
+ }
+ ASSERT_OK(model.Write(WriteOptions(), &b));
+ ASSERT_OK(db_->Write(WriteOptions(), &b));
+ }
+
+ if ((step % 100) == 0) {
+ ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+ ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+ // Save a snapshot from each DB this time that we'll use next
+ // time we compare things, to make sure the current state is
+ // preserved with the snapshot
+ if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+ if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+
+ Reopen();
+ ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+
+ model_snap = model.GetSnapshot();
+ db_snap = db_->GetSnapshot();
+ }
+ }
+ if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+ if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+ } while (ChangeOptions());
+}
+
+std::string MakeKey(unsigned int num) {
+ char buf[30];
+ snprintf(buf, sizeof(buf), "%016u", num);
+ return std::string(buf);
+}
+
+void BM_LogAndApply(int iters, int num_base_files) {
+ std::string dbname = test::TmpDir() + "/leveldb_test_benchmark";
+ DestroyDB(dbname, Options());
+
+ DB* db = nullptr;
+ Options opts;
+ opts.create_if_missing = true;
+ Status s = DB::Open(opts, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ delete db;
+ db = nullptr;
+
+ Env* env = Env::Default();
+
+ port::Mutex mu;
+ MutexLock l(&mu);
+
+ InternalKeyComparator cmp(BytewiseComparator());
+ Options options;
+ VersionSet vset(dbname, &options, nullptr, &cmp);
+ bool save_manifest;
+ ASSERT_OK(vset.Recover(&save_manifest));
+ VersionEdit vbase;
+ uint64_t fnum = 1;
+ for (int i = 0; i < num_base_files; i++) {
+ InternalKey start(MakeKey(2 * fnum), 1, kTypeValue);
+ InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion);
+ vbase.AddFile(2, fnum++, 1 /* file size */, start, limit);
+ }
+ ASSERT_OK(vset.LogAndApply(&vbase, &mu));
+
+ uint64_t start_micros = env->NowMicros();
+
+ for (int i = 0; i < iters; i++) {
+ VersionEdit vedit;
+ vedit.DeleteFile(2, fnum);
+ InternalKey start(MakeKey(2 * fnum), 1, kTypeValue);
+ InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion);
+ vedit.AddFile(2, fnum++, 1 /* file size */, start, limit);
+ vset.LogAndApply(&vedit, &mu);
+ }
+ uint64_t stop_micros = env->NowMicros();
+ unsigned int us = stop_micros - start_micros;
+ char buf[16];
+ snprintf(buf, sizeof(buf), "%d", num_base_files);
+ fprintf(stderr,
+ "BM_LogAndApply/%-6s %8d iters : %9u us (%7.0f us / iter)\n", buf,
+ iters, us, ((float)us) / iters);
+}
+
+} // namespace leveldb
+
+int main(int argc, char** argv) {
+ if (argc > 1 && std::string(argv[1]) == "--benchmark") {
+ leveldb::BM_LogAndApply(1000, 1);
+ leveldb::BM_LogAndApply(1000, 100);
+ leveldb::BM_LogAndApply(1000, 10000);
+ leveldb::BM_LogAndApply(100, 100000);
+ return 0;
+ }
+
+ return leveldb::test::RunAllTests();
+}
diff --git a/src/leveldb/db/dbformat.cc b/src/leveldb/db/dbformat.cc
new file mode 100644
index 0000000000..459eddf5b1
--- /dev/null
+++ b/src/leveldb/db/dbformat.cc
@@ -0,0 +1,137 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/dbformat.h"
+
+#include <stdio.h>
+
+#include <sstream>
+
+#include "port/port.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
+ assert(seq <= kMaxSequenceNumber);
+ assert(t <= kValueTypeForSeek);
+ return (seq << 8) | t;
+}
+
+void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
+ result->append(key.user_key.data(), key.user_key.size());
+ PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
+}
+
+std::string ParsedInternalKey::DebugString() const {
+ std::ostringstream ss;
+ ss << '\'' << EscapeString(user_key.ToString()) << "' @ " << sequence << " : "
+ << static_cast<int>(type);
+ return ss.str();
+}
+
+std::string InternalKey::DebugString() const {
+ ParsedInternalKey parsed;
+ if (ParseInternalKey(rep_, &parsed)) {
+ return parsed.DebugString();
+ }
+ std::ostringstream ss;
+ ss << "(bad)" << EscapeString(rep_);
+ return ss.str();
+}
+
+const char* InternalKeyComparator::Name() const {
+ return "leveldb.InternalKeyComparator";
+}
+
+int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
+ // Order by:
+ // increasing user key (according to user-supplied comparator)
+ // decreasing sequence number
+ // decreasing type (though sequence# should be enough to disambiguate)
+ int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
+ if (r == 0) {
+ const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
+ const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
+ if (anum > bnum) {
+ r = -1;
+ } else if (anum < bnum) {
+ r = +1;
+ }
+ }
+ return r;
+}
+
+void InternalKeyComparator::FindShortestSeparator(std::string* start,
+ const Slice& limit) const {
+ // Attempt to shorten the user portion of the key
+ Slice user_start = ExtractUserKey(*start);
+ Slice user_limit = ExtractUserKey(limit);
+ std::string tmp(user_start.data(), user_start.size());
+ user_comparator_->FindShortestSeparator(&tmp, user_limit);
+ if (tmp.size() < user_start.size() &&
+ user_comparator_->Compare(user_start, tmp) < 0) {
+ // User key has become shorter physically, but larger logically.
+ // Tack on the earliest possible number to the shortened user key.
+ PutFixed64(&tmp,
+ PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek));
+ assert(this->Compare(*start, tmp) < 0);
+ assert(this->Compare(tmp, limit) < 0);
+ start->swap(tmp);
+ }
+}
+
+void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
+ Slice user_key = ExtractUserKey(*key);
+ std::string tmp(user_key.data(), user_key.size());
+ user_comparator_->FindShortSuccessor(&tmp);
+ if (tmp.size() < user_key.size() &&
+ user_comparator_->Compare(user_key, tmp) < 0) {
+ // User key has become shorter physically, but larger logically.
+ // Tack on the earliest possible number to the shortened user key.
+ PutFixed64(&tmp,
+ PackSequenceAndType(kMaxSequenceNumber, kValueTypeForSeek));
+ assert(this->Compare(*key, tmp) < 0);
+ key->swap(tmp);
+ }
+}
+
+const char* InternalFilterPolicy::Name() const { return user_policy_->Name(); }
+
+void InternalFilterPolicy::CreateFilter(const Slice* keys, int n,
+ std::string* dst) const {
+ // We rely on the fact that the code in table.cc does not mind us
+ // adjusting keys[].
+ Slice* mkey = const_cast<Slice*>(keys);
+ for (int i = 0; i < n; i++) {
+ mkey[i] = ExtractUserKey(keys[i]);
+ // TODO(sanjay): Suppress dups?
+ }
+ user_policy_->CreateFilter(keys, n, dst);
+}
+
+bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
+ return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
+}
+
+LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
+ size_t usize = user_key.size();
+ size_t needed = usize + 13; // A conservative estimate
+ char* dst;
+ if (needed <= sizeof(space_)) {
+ dst = space_;
+ } else {
+ dst = new char[needed];
+ }
+ start_ = dst;
+ dst = EncodeVarint32(dst, usize + 8);
+ kstart_ = dst;
+ memcpy(dst, user_key.data(), usize);
+ dst += usize;
+ EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
+ dst += 8;
+ end_ = dst;
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/dbformat.h b/src/leveldb/db/dbformat.h
new file mode 100644
index 0000000000..a1c30ed88c
--- /dev/null
+++ b/src/leveldb/db/dbformat.h
@@ -0,0 +1,224 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_DBFORMAT_H_
+#define STORAGE_LEVELDB_DB_DBFORMAT_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+#include "leveldb/comparator.h"
+#include "leveldb/db.h"
+#include "leveldb/filter_policy.h"
+#include "leveldb/slice.h"
+#include "leveldb/table_builder.h"
+#include "util/coding.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+// Grouping of constants. We may want to make some of these
+// parameters set via options.
+namespace config {
+static const int kNumLevels = 7;
+
+// Level-0 compaction is started when we hit this many files.
+static const int kL0_CompactionTrigger = 4;
+
+// Soft limit on number of level-0 files. We slow down writes at this point.
+static const int kL0_SlowdownWritesTrigger = 8;
+
+// Maximum number of level-0 files. We stop writes at this point.
+static const int kL0_StopWritesTrigger = 12;
+
+// Maximum level to which a new compacted memtable is pushed if it
+// does not create overlap. We try to push to level 2 to avoid the
+// relatively expensive level 0=>1 compactions and to avoid some
+// expensive manifest file operations. We do not push all the way to
+// the largest level since that can generate a lot of wasted disk
+// space if the same key space is being repeatedly overwritten.
+static const int kMaxMemCompactLevel = 2;
+
+// Approximate gap in bytes between samples of data read during iteration.
+static const int kReadBytesPeriod = 1048576;
+
+} // namespace config
+
+class InternalKey;
+
+// Value types encoded as the last component of internal keys.
+// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
+// data structures.
+enum ValueType { kTypeDeletion = 0x0, kTypeValue = 0x1 };
+// kValueTypeForSeek defines the ValueType that should be passed when
+// constructing a ParsedInternalKey object for seeking to a particular
+// sequence number (since we sort sequence numbers in decreasing order
+// and the value type is embedded as the low 8 bits in the sequence
+// number in internal keys, we need to use the highest-numbered
+// ValueType, not the lowest).
+static const ValueType kValueTypeForSeek = kTypeValue;
+
+typedef uint64_t SequenceNumber;
+
+// We leave eight bits empty at the bottom so a type and sequence#
+// can be packed together into 64-bits.
+static const SequenceNumber kMaxSequenceNumber = ((0x1ull << 56) - 1);
+
+struct ParsedInternalKey {
+ Slice user_key;
+ SequenceNumber sequence;
+ ValueType type;
+
+ ParsedInternalKey() {} // Intentionally left uninitialized (for speed)
+ ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
+ : user_key(u), sequence(seq), type(t) {}
+ std::string DebugString() const;
+};
+
+// Return the length of the encoding of "key".
+inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
+ return key.user_key.size() + 8;
+}
+
+// Append the serialization of "key" to *result.
+void AppendInternalKey(std::string* result, const ParsedInternalKey& key);
+
+// Attempt to parse an internal key from "internal_key". On success,
+// stores the parsed data in "*result", and returns true.
+//
+// On error, returns false, leaves "*result" in an undefined state.
+bool ParseInternalKey(const Slice& internal_key, ParsedInternalKey* result);
+
+// Returns the user key portion of an internal key.
+inline Slice ExtractUserKey(const Slice& internal_key) {
+ assert(internal_key.size() >= 8);
+ return Slice(internal_key.data(), internal_key.size() - 8);
+}
+
+// A comparator for internal keys that uses a specified comparator for
+// the user key portion and breaks ties by decreasing sequence number.
+class InternalKeyComparator : public Comparator {
+ private:
+ const Comparator* user_comparator_;
+
+ public:
+ explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) {}
+ const char* Name() const override;
+ int Compare(const Slice& a, const Slice& b) const override;
+ void FindShortestSeparator(std::string* start,
+ const Slice& limit) const override;
+ void FindShortSuccessor(std::string* key) const override;
+
+ const Comparator* user_comparator() const { return user_comparator_; }
+
+ int Compare(const InternalKey& a, const InternalKey& b) const;
+};
+
+// Filter policy wrapper that converts from internal keys to user keys
+class InternalFilterPolicy : public FilterPolicy {
+ private:
+ const FilterPolicy* const user_policy_;
+
+ public:
+ explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) {}
+ const char* Name() const override;
+ void CreateFilter(const Slice* keys, int n, std::string* dst) const override;
+ bool KeyMayMatch(const Slice& key, const Slice& filter) const override;
+};
+
+// Modules in this directory should keep internal keys wrapped inside
+// the following class instead of plain strings so that we do not
+// incorrectly use string comparisons instead of an InternalKeyComparator.
+class InternalKey {
+ private:
+ std::string rep_;
+
+ public:
+ InternalKey() {} // Leave rep_ as empty to indicate it is invalid
+ InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
+ AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
+ }
+
+ bool DecodeFrom(const Slice& s) {
+ rep_.assign(s.data(), s.size());
+ return !rep_.empty();
+ }
+
+ Slice Encode() const {
+ assert(!rep_.empty());
+ return rep_;
+ }
+
+ Slice user_key() const { return ExtractUserKey(rep_); }
+
+ void SetFrom(const ParsedInternalKey& p) {
+ rep_.clear();
+ AppendInternalKey(&rep_, p);
+ }
+
+ void Clear() { rep_.clear(); }
+
+ std::string DebugString() const;
+};
+
+inline int InternalKeyComparator::Compare(const InternalKey& a,
+ const InternalKey& b) const {
+ return Compare(a.Encode(), b.Encode());
+}
+
+inline bool ParseInternalKey(const Slice& internal_key,
+ ParsedInternalKey* result) {
+ const size_t n = internal_key.size();
+ if (n < 8) return false;
+ uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+ uint8_t c = num & 0xff;
+ result->sequence = num >> 8;
+ result->type = static_cast<ValueType>(c);
+ result->user_key = Slice(internal_key.data(), n - 8);
+ return (c <= static_cast<uint8_t>(kTypeValue));
+}
+
+// A helper class useful for DBImpl::Get()
+class LookupKey {
+ public:
+ // Initialize *this for looking up user_key at a snapshot with
+ // the specified sequence number.
+ LookupKey(const Slice& user_key, SequenceNumber sequence);
+
+ LookupKey(const LookupKey&) = delete;
+ LookupKey& operator=(const LookupKey&) = delete;
+
+ ~LookupKey();
+
+ // Return a key suitable for lookup in a MemTable.
+ Slice memtable_key() const { return Slice(start_, end_ - start_); }
+
+ // Return an internal key (suitable for passing to an internal iterator)
+ Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }
+
+ // Return the user key
+ Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }
+
+ private:
+ // We construct a char array of the form:
+ // klength varint32 <-- start_
+ // userkey char[klength] <-- kstart_
+ // tag uint64
+ // <-- end_
+ // The array is a suitable MemTable key.
+ // The suffix starting with "userkey" can be used as an InternalKey.
+ const char* start_;
+ const char* kstart_;
+ const char* end_;
+ char space_[200]; // Avoid allocation for short keys
+};
+
+inline LookupKey::~LookupKey() {
+ if (start_ != space_) delete[] start_;
+}
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_DBFORMAT_H_
diff --git a/src/leveldb/db/dbformat_test.cc b/src/leveldb/db/dbformat_test.cc
new file mode 100644
index 0000000000..1209369c31
--- /dev/null
+++ b/src/leveldb/db/dbformat_test.cc
@@ -0,0 +1,131 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/dbformat.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+static std::string IKey(const std::string& user_key, uint64_t seq,
+ ValueType vt) {
+ std::string encoded;
+ AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
+ return encoded;
+}
+
+static std::string Shorten(const std::string& s, const std::string& l) {
+ std::string result = s;
+ InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l);
+ return result;
+}
+
+static std::string ShortSuccessor(const std::string& s) {
+ std::string result = s;
+ InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result);
+ return result;
+}
+
+static void TestKey(const std::string& key, uint64_t seq, ValueType vt) {
+ std::string encoded = IKey(key, seq, vt);
+
+ Slice in(encoded);
+ ParsedInternalKey decoded("", 0, kTypeValue);
+
+ ASSERT_TRUE(ParseInternalKey(in, &decoded));
+ ASSERT_EQ(key, decoded.user_key.ToString());
+ ASSERT_EQ(seq, decoded.sequence);
+ ASSERT_EQ(vt, decoded.type);
+
+ ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded));
+}
+
+class FormatTest {};
+
+TEST(FormatTest, InternalKey_EncodeDecode) {
+ const char* keys[] = {"", "k", "hello", "longggggggggggggggggggggg"};
+ const uint64_t seq[] = {1,
+ 2,
+ 3,
+ (1ull << 8) - 1,
+ 1ull << 8,
+ (1ull << 8) + 1,
+ (1ull << 16) - 1,
+ 1ull << 16,
+ (1ull << 16) + 1,
+ (1ull << 32) - 1,
+ 1ull << 32,
+ (1ull << 32) + 1};
+ for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
+ for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
+ TestKey(keys[k], seq[s], kTypeValue);
+ TestKey("hello", 1, kTypeDeletion);
+ }
+ }
+}
+
+TEST(FormatTest, InternalKey_DecodeFromEmpty) {
+ InternalKey internal_key;
+
+ ASSERT_TRUE(!internal_key.DecodeFrom(""));
+}
+
+TEST(FormatTest, InternalKeyShortSeparator) {
+ // When user keys are same
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 99, kTypeValue)));
+ ASSERT_EQ(
+ IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 101, kTypeValue)));
+ ASSERT_EQ(
+ IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 100, kTypeValue)));
+ ASSERT_EQ(
+ IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 100, kTypeDeletion)));
+
+ // When user keys are misordered
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue), IKey("bar", 99, kTypeValue)));
+
+ // When user keys are different, but correctly ordered
+ ASSERT_EQ(
+ IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("foo", 100, kTypeValue), IKey("hello", 200, kTypeValue)));
+
+ // When start user key is prefix of limit user key
+ ASSERT_EQ(
+ IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue), IKey("foobar", 200, kTypeValue)));
+
+ // When limit user key is prefix of start user key
+ ASSERT_EQ(
+ IKey("foobar", 100, kTypeValue),
+ Shorten(IKey("foobar", 100, kTypeValue), IKey("foo", 200, kTypeValue)));
+}
+
+TEST(FormatTest, InternalKeyShortestSuccessor) {
+ ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+ ShortSuccessor(IKey("foo", 100, kTypeValue)));
+ ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
+ ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
+}
+
+TEST(FormatTest, ParsedInternalKeyDebugString) {
+ ParsedInternalKey key("The \"key\" in 'single quotes'", 42, kTypeValue);
+
+ ASSERT_EQ("'The \"key\" in 'single quotes'' @ 42 : 1", key.DebugString());
+}
+
+TEST(FormatTest, InternalKeyDebugString) {
+ InternalKey key("The \"key\" in 'single quotes'", 42, kTypeValue);
+ ASSERT_EQ("'The \"key\" in 'single quotes'' @ 42 : 1", key.DebugString());
+
+ InternalKey invalid_key;
+ ASSERT_EQ("(bad)", invalid_key.DebugString());
+}
+
+} // namespace leveldb
+
+int main(int argc, char** argv) { return leveldb::test::RunAllTests(); }
diff --git a/src/leveldb/db/dumpfile.cc b/src/leveldb/db/dumpfile.cc
new file mode 100644
index 0000000000..77d59003cf
--- /dev/null
+++ b/src/leveldb/db/dumpfile.cc
@@ -0,0 +1,232 @@
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/dumpfile.h"
+
+#include <stdio.h>
+
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/version_edit.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/env.h"
+#include "leveldb/iterator.h"
+#include "leveldb/options.h"
+#include "leveldb/status.h"
+#include "leveldb/table.h"
+#include "leveldb/write_batch.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+namespace {
+
+bool GuessType(const std::string& fname, FileType* type) {
+ size_t pos = fname.rfind('/');
+ std::string basename;
+ if (pos == std::string::npos) {
+ basename = fname;
+ } else {
+ basename = std::string(fname.data() + pos + 1, fname.size() - pos - 1);
+ }
+ uint64_t ignored;
+ return ParseFileName(basename, &ignored, type);
+}
+
+// Notified when log reader encounters corruption.
+class CorruptionReporter : public log::Reader::Reporter {
+ public:
+ void Corruption(size_t bytes, const Status& status) override {
+ std::string r = "corruption: ";
+ AppendNumberTo(&r, bytes);
+ r += " bytes; ";
+ r += status.ToString();
+ r.push_back('\n');
+ dst_->Append(r);
+ }
+
+ WritableFile* dst_;
+};
+
+// Print contents of a log file. (*func)() is called on every record.
+Status PrintLogContents(Env* env, const std::string& fname,
+ void (*func)(uint64_t, Slice, WritableFile*),
+ WritableFile* dst) {
+ SequentialFile* file;
+ Status s = env->NewSequentialFile(fname, &file);
+ if (!s.ok()) {
+ return s;
+ }
+ CorruptionReporter reporter;
+ reporter.dst_ = dst;
+ log::Reader reader(file, &reporter, true, 0);
+ Slice record;
+ std::string scratch;
+ while (reader.ReadRecord(&record, &scratch)) {
+ (*func)(reader.LastRecordOffset(), record, dst);
+ }
+ delete file;
+ return Status::OK();
+}
+
+// Called on every item found in a WriteBatch.
+class WriteBatchItemPrinter : public WriteBatch::Handler {
+ public:
+ void Put(const Slice& key, const Slice& value) override {
+ std::string r = " put '";
+ AppendEscapedStringTo(&r, key);
+ r += "' '";
+ AppendEscapedStringTo(&r, value);
+ r += "'\n";
+ dst_->Append(r);
+ }
+ void Delete(const Slice& key) override {
+ std::string r = " del '";
+ AppendEscapedStringTo(&r, key);
+ r += "'\n";
+ dst_->Append(r);
+ }
+
+ WritableFile* dst_;
+};
+
+// Called on every log record (each one of which is a WriteBatch)
+// found in a kLogFile.
+static void WriteBatchPrinter(uint64_t pos, Slice record, WritableFile* dst) {
+ std::string r = "--- offset ";
+ AppendNumberTo(&r, pos);
+ r += "; ";
+ if (record.size() < 12) {
+ r += "log record length ";
+ AppendNumberTo(&r, record.size());
+ r += " is too small\n";
+ dst->Append(r);
+ return;
+ }
+ WriteBatch batch;
+ WriteBatchInternal::SetContents(&batch, record);
+ r += "sequence ";
+ AppendNumberTo(&r, WriteBatchInternal::Sequence(&batch));
+ r.push_back('\n');
+ dst->Append(r);
+ WriteBatchItemPrinter batch_item_printer;
+ batch_item_printer.dst_ = dst;
+ Status s = batch.Iterate(&batch_item_printer);
+ if (!s.ok()) {
+ dst->Append(" error: " + s.ToString() + "\n");
+ }
+}
+
+Status DumpLog(Env* env, const std::string& fname, WritableFile* dst) {
+ return PrintLogContents(env, fname, WriteBatchPrinter, dst);
+}
+
+// Called on every log record (each one of which is a WriteBatch)
+// found in a kDescriptorFile.
+static void VersionEditPrinter(uint64_t pos, Slice record, WritableFile* dst) {
+ std::string r = "--- offset ";
+ AppendNumberTo(&r, pos);
+ r += "; ";
+ VersionEdit edit;
+ Status s = edit.DecodeFrom(record);
+ if (!s.ok()) {
+ r += s.ToString();
+ r.push_back('\n');
+ } else {
+ r += edit.DebugString();
+ }
+ dst->Append(r);
+}
+
+Status DumpDescriptor(Env* env, const std::string& fname, WritableFile* dst) {
+ return PrintLogContents(env, fname, VersionEditPrinter, dst);
+}
+
+Status DumpTable(Env* env, const std::string& fname, WritableFile* dst) {
+ uint64_t file_size;
+ RandomAccessFile* file = nullptr;
+ Table* table = nullptr;
+ Status s = env->GetFileSize(fname, &file_size);
+ if (s.ok()) {
+ s = env->NewRandomAccessFile(fname, &file);
+ }
+ if (s.ok()) {
+ // We use the default comparator, which may or may not match the
+ // comparator used in this database. However this should not cause
+ // problems since we only use Table operations that do not require
+ // any comparisons. In particular, we do not call Seek or Prev.
+ s = Table::Open(Options(), file, file_size, &table);
+ }
+ if (!s.ok()) {
+ delete table;
+ delete file;
+ return s;
+ }
+
+ ReadOptions ro;
+ ro.fill_cache = false;
+ Iterator* iter = table->NewIterator(ro);
+ std::string r;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ r.clear();
+ ParsedInternalKey key;
+ if (!ParseInternalKey(iter->key(), &key)) {
+ r = "badkey '";
+ AppendEscapedStringTo(&r, iter->key());
+ r += "' => '";
+ AppendEscapedStringTo(&r, iter->value());
+ r += "'\n";
+ dst->Append(r);
+ } else {
+ r = "'";
+ AppendEscapedStringTo(&r, key.user_key);
+ r += "' @ ";
+ AppendNumberTo(&r, key.sequence);
+ r += " : ";
+ if (key.type == kTypeDeletion) {
+ r += "del";
+ } else if (key.type == kTypeValue) {
+ r += "val";
+ } else {
+ AppendNumberTo(&r, key.type);
+ }
+ r += " => '";
+ AppendEscapedStringTo(&r, iter->value());
+ r += "'\n";
+ dst->Append(r);
+ }
+ }
+ s = iter->status();
+ if (!s.ok()) {
+ dst->Append("iterator error: " + s.ToString() + "\n");
+ }
+
+ delete iter;
+ delete table;
+ delete file;
+ return Status::OK();
+}
+
+} // namespace
+
+Status DumpFile(Env* env, const std::string& fname, WritableFile* dst) {
+ FileType ftype;
+ if (!GuessType(fname, &ftype)) {
+ return Status::InvalidArgument(fname + ": unknown file type");
+ }
+ switch (ftype) {
+ case kLogFile:
+ return DumpLog(env, fname, dst);
+ case kDescriptorFile:
+ return DumpDescriptor(env, fname, dst);
+ case kTableFile:
+ return DumpTable(env, fname, dst);
+ default:
+ break;
+ }
+ return Status::InvalidArgument(fname + ": not a dump-able file type");
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/fault_injection_test.cc b/src/leveldb/db/fault_injection_test.cc
new file mode 100644
index 0000000000..bf705cb60f
--- /dev/null
+++ b/src/leveldb/db/fault_injection_test.cc
@@ -0,0 +1,552 @@
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom Env to keep track of the state of a filesystem as of
+// the last "sync". It then checks for data loss errors by purposely dropping
+// file data (or entire files) not protected by a "sync".
+
+#include <map>
+#include <set>
+
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "leveldb/cache.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/table.h"
+#include "leveldb/write_batch.h"
+#include "port/port.h"
+#include "port/thread_annotations.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+static const int kValueSize = 1000;
+static const int kMaxNumValues = 2000;
+static const size_t kNumIterations = 3;
+
+class FaultInjectionTestEnv;
+
+namespace {
+
+// Assume a filename, and not a directory name like "/foo/bar/"
+static std::string GetDirName(const std::string& filename) {
+ size_t found = filename.find_last_of("/\\");
+ if (found == std::string::npos) {
+ return "";
+ } else {
+ return filename.substr(0, found);
+ }
+}
+
+Status SyncDir(const std::string& dir) {
+ // As this is a test it isn't required to *actually* sync this directory.
+ return Status::OK();
+}
+
+// A basic file truncation function suitable for this test.
+Status Truncate(const std::string& filename, uint64_t length) {
+ leveldb::Env* env = leveldb::Env::Default();
+
+ SequentialFile* orig_file;
+ Status s = env->NewSequentialFile(filename, &orig_file);
+ if (!s.ok()) return s;
+
+ char* scratch = new char[length];
+ leveldb::Slice result;
+ s = orig_file->Read(length, &result, scratch);
+ delete orig_file;
+ if (s.ok()) {
+ std::string tmp_name = GetDirName(filename) + "/truncate.tmp";
+ WritableFile* tmp_file;
+ s = env->NewWritableFile(tmp_name, &tmp_file);
+ if (s.ok()) {
+ s = tmp_file->Append(result);
+ delete tmp_file;
+ if (s.ok()) {
+ s = env->RenameFile(tmp_name, filename);
+ } else {
+ env->DeleteFile(tmp_name);
+ }
+ }
+ }
+
+ delete[] scratch;
+
+ return s;
+}
+
+struct FileState {
+ std::string filename_;
+ int64_t pos_;
+ int64_t pos_at_last_sync_;
+ int64_t pos_at_last_flush_;
+
+ FileState(const std::string& filename)
+ : filename_(filename),
+ pos_(-1),
+ pos_at_last_sync_(-1),
+ pos_at_last_flush_(-1) {}
+
+ FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {}
+
+ bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; }
+
+ Status DropUnsyncedData() const;
+};
+
+} // anonymous namespace
+
+// A wrapper around WritableFile which informs another Env whenever this file
+// is written to or sync'ed.
+class TestWritableFile : public WritableFile {
+ public:
+ TestWritableFile(const FileState& state, WritableFile* f,
+ FaultInjectionTestEnv* env);
+ ~TestWritableFile() override;
+ Status Append(const Slice& data) override;
+ Status Close() override;
+ Status Flush() override;
+ Status Sync() override;
+ std::string GetName() const override { return ""; }
+
+ private:
+ FileState state_;
+ WritableFile* target_;
+ bool writable_file_opened_;
+ FaultInjectionTestEnv* env_;
+
+ Status SyncParent();
+};
+
+class FaultInjectionTestEnv : public EnvWrapper {
+ public:
+ FaultInjectionTestEnv()
+ : EnvWrapper(Env::Default()), filesystem_active_(true) {}
+ ~FaultInjectionTestEnv() override = default;
+ Status NewWritableFile(const std::string& fname,
+ WritableFile** result) override;
+ Status NewAppendableFile(const std::string& fname,
+ WritableFile** result) override;
+ Status DeleteFile(const std::string& f) override;
+ Status RenameFile(const std::string& s, const std::string& t) override;
+
+ void WritableFileClosed(const FileState& state);
+ Status DropUnsyncedFileData();
+ Status DeleteFilesCreatedAfterLastDirSync();
+ void DirWasSynced();
+ bool IsFileCreatedSinceLastDirSync(const std::string& filename);
+ void ResetState();
+ void UntrackFile(const std::string& f);
+ // Setting the filesystem to inactive is the test equivalent to simulating a
+ // system reset. Setting to inactive will freeze our saved filesystem state so
+ // that it will stop being recorded. It can then be reset back to the state at
+ // the time of the reset.
+ bool IsFilesystemActive() LOCKS_EXCLUDED(mutex_) {
+ MutexLock l(&mutex_);
+ return filesystem_active_;
+ }
+ void SetFilesystemActive(bool active) LOCKS_EXCLUDED(mutex_) {
+ MutexLock l(&mutex_);
+ filesystem_active_ = active;
+ }
+
+ private:
+ port::Mutex mutex_;
+ std::map<std::string, FileState> db_file_state_ GUARDED_BY(mutex_);
+ std::set<std::string> new_files_since_last_dir_sync_ GUARDED_BY(mutex_);
+ bool filesystem_active_ GUARDED_BY(mutex_); // Record flushes, syncs, writes
+};
+
+TestWritableFile::TestWritableFile(const FileState& state, WritableFile* f,
+ FaultInjectionTestEnv* env)
+ : state_(state), target_(f), writable_file_opened_(true), env_(env) {
+ assert(f != nullptr);
+}
+
+TestWritableFile::~TestWritableFile() {
+ if (writable_file_opened_) {
+ Close();
+ }
+ delete target_;
+}
+
+Status TestWritableFile::Append(const Slice& data) {
+ Status s = target_->Append(data);
+ if (s.ok() && env_->IsFilesystemActive()) {
+ state_.pos_ += data.size();
+ }
+ return s;
+}
+
+Status TestWritableFile::Close() {
+ writable_file_opened_ = false;
+ Status s = target_->Close();
+ if (s.ok()) {
+ env_->WritableFileClosed(state_);
+ }
+ return s;
+}
+
+Status TestWritableFile::Flush() {
+ Status s = target_->Flush();
+ if (s.ok() && env_->IsFilesystemActive()) {
+ state_.pos_at_last_flush_ = state_.pos_;
+ }
+ return s;
+}
+
+Status TestWritableFile::SyncParent() {
+ Status s = SyncDir(GetDirName(state_.filename_));
+ if (s.ok()) {
+ env_->DirWasSynced();
+ }
+ return s;
+}
+
+Status TestWritableFile::Sync() {
+ if (!env_->IsFilesystemActive()) {
+ return Status::OK();
+ }
+ // Ensure new files referred to by the manifest are in the filesystem.
+ Status s = target_->Sync();
+ if (s.ok()) {
+ state_.pos_at_last_sync_ = state_.pos_;
+ }
+ if (env_->IsFileCreatedSinceLastDirSync(state_.filename_)) {
+ Status ps = SyncParent();
+ if (s.ok() && !ps.ok()) {
+ s = ps;
+ }
+ }
+ return s;
+}
+
+Status FaultInjectionTestEnv::NewWritableFile(const std::string& fname,
+ WritableFile** result) {
+ WritableFile* actual_writable_file;
+ Status s = target()->NewWritableFile(fname, &actual_writable_file);
+ if (s.ok()) {
+ FileState state(fname);
+ state.pos_ = 0;
+ *result = new TestWritableFile(state, actual_writable_file, this);
+ // NewWritableFile doesn't append to files, so if the same file is
+ // opened again then it will be truncated - so forget our saved
+ // state.
+ UntrackFile(fname);
+ MutexLock l(&mutex_);
+ new_files_since_last_dir_sync_.insert(fname);
+ }
+ return s;
+}
+
+Status FaultInjectionTestEnv::NewAppendableFile(const std::string& fname,
+ WritableFile** result) {
+ WritableFile* actual_writable_file;
+ Status s = target()->NewAppendableFile(fname, &actual_writable_file);
+ if (s.ok()) {
+ FileState state(fname);
+ state.pos_ = 0;
+ {
+ MutexLock l(&mutex_);
+ if (db_file_state_.count(fname) == 0) {
+ new_files_since_last_dir_sync_.insert(fname);
+ } else {
+ state = db_file_state_[fname];
+ }
+ }
+ *result = new TestWritableFile(state, actual_writable_file, this);
+ }
+ return s;
+}
+
+Status FaultInjectionTestEnv::DropUnsyncedFileData() {
+ Status s;
+ MutexLock l(&mutex_);
+ for (const auto& kvp : db_file_state_) {
+ if (!s.ok()) {
+ break;
+ }
+ const FileState& state = kvp.second;
+ if (!state.IsFullySynced()) {
+ s = state.DropUnsyncedData();
+ }
+ }
+ return s;
+}
+
+void FaultInjectionTestEnv::DirWasSynced() {
+ MutexLock l(&mutex_);
+ new_files_since_last_dir_sync_.clear();
+}
+
+bool FaultInjectionTestEnv::IsFileCreatedSinceLastDirSync(
+ const std::string& filename) {
+ MutexLock l(&mutex_);
+ return new_files_since_last_dir_sync_.find(filename) !=
+ new_files_since_last_dir_sync_.end();
+}
+
+void FaultInjectionTestEnv::UntrackFile(const std::string& f) {
+ MutexLock l(&mutex_);
+ db_file_state_.erase(f);
+ new_files_since_last_dir_sync_.erase(f);
+}
+
+Status FaultInjectionTestEnv::DeleteFile(const std::string& f) {
+ Status s = EnvWrapper::DeleteFile(f);
+ ASSERT_OK(s);
+ if (s.ok()) {
+ UntrackFile(f);
+ }
+ return s;
+}
+
+Status FaultInjectionTestEnv::RenameFile(const std::string& s,
+ const std::string& t) {
+ Status ret = EnvWrapper::RenameFile(s, t);
+
+ if (ret.ok()) {
+ MutexLock l(&mutex_);
+ if (db_file_state_.find(s) != db_file_state_.end()) {
+ db_file_state_[t] = db_file_state_[s];
+ db_file_state_.erase(s);
+ }
+
+ if (new_files_since_last_dir_sync_.erase(s) != 0) {
+ assert(new_files_since_last_dir_sync_.find(t) ==
+ new_files_since_last_dir_sync_.end());
+ new_files_since_last_dir_sync_.insert(t);
+ }
+ }
+
+ return ret;
+}
+
+void FaultInjectionTestEnv::ResetState() {
+ // Since we are not destroying the database, the existing files
+ // should keep their recorded synced/flushed state. Therefore
+ // we do not reset db_file_state_ and new_files_since_last_dir_sync_.
+ SetFilesystemActive(true);
+}
+
+Status FaultInjectionTestEnv::DeleteFilesCreatedAfterLastDirSync() {
+ // Because DeleteFile access this container make a copy to avoid deadlock
+ mutex_.Lock();
+ std::set<std::string> new_files(new_files_since_last_dir_sync_.begin(),
+ new_files_since_last_dir_sync_.end());
+ mutex_.Unlock();
+ Status status;
+ for (const auto& new_file : new_files) {
+ Status delete_status = DeleteFile(new_file);
+ if (!delete_status.ok() && status.ok()) {
+ status = std::move(delete_status);
+ }
+ }
+ return status;
+}
+
+void FaultInjectionTestEnv::WritableFileClosed(const FileState& state) {
+ MutexLock l(&mutex_);
+ db_file_state_[state.filename_] = state;
+}
+
+Status FileState::DropUnsyncedData() const {
+ int64_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_;
+ return Truncate(filename_, sync_pos);
+}
+
+class FaultInjectionTest {
+ public:
+ enum ExpectedVerifResult { VAL_EXPECT_NO_ERROR, VAL_EXPECT_ERROR };
+ enum ResetMethod { RESET_DROP_UNSYNCED_DATA, RESET_DELETE_UNSYNCED_FILES };
+
+ FaultInjectionTestEnv* env_;
+ std::string dbname_;
+ Cache* tiny_cache_;
+ Options options_;
+ DB* db_;
+
+ FaultInjectionTest()
+ : env_(new FaultInjectionTestEnv),
+ tiny_cache_(NewLRUCache(100)),
+ db_(nullptr) {
+ dbname_ = test::TmpDir() + "/fault_test";
+ DestroyDB(dbname_, Options()); // Destroy any db from earlier run
+ options_.reuse_logs = true;
+ options_.env = env_;
+ options_.paranoid_checks = true;
+ options_.block_cache = tiny_cache_;
+ options_.create_if_missing = true;
+ }
+
+ ~FaultInjectionTest() {
+ CloseDB();
+ DestroyDB(dbname_, Options());
+ delete tiny_cache_;
+ delete env_;
+ }
+
+ void ReuseLogs(bool reuse) { options_.reuse_logs = reuse; }
+
+ void Build(int start_idx, int num_vals) {
+ std::string key_space, value_space;
+ WriteBatch batch;
+ for (int i = start_idx; i < start_idx + num_vals; i++) {
+ Slice key = Key(i, &key_space);
+ batch.Clear();
+ batch.Put(key, Value(i, &value_space));
+ WriteOptions options;
+ ASSERT_OK(db_->Write(options, &batch));
+ }
+ }
+
+ Status ReadValue(int i, std::string* val) const {
+ std::string key_space, value_space;
+ Slice key = Key(i, &key_space);
+ Value(i, &value_space);
+ ReadOptions options;
+ return db_->Get(options, key, val);
+ }
+
+ Status Verify(int start_idx, int num_vals,
+ ExpectedVerifResult expected) const {
+ std::string val;
+ std::string value_space;
+ Status s;
+ for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) {
+ Value(i, &value_space);
+ s = ReadValue(i, &val);
+ if (expected == VAL_EXPECT_NO_ERROR) {
+ if (s.ok()) {
+ ASSERT_EQ(value_space, val);
+ }
+ } else if (s.ok()) {
+ fprintf(stderr, "Expected an error at %d, but was OK\n", i);
+ s = Status::IOError(dbname_, "Expected value error:");
+ } else {
+ s = Status::OK(); // An expected error
+ }
+ }
+ return s;
+ }
+
+ // Return the ith key
+ Slice Key(int i, std::string* storage) const {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%016d", i);
+ storage->assign(buf, strlen(buf));
+ return Slice(*storage);
+ }
+
+ // Return the value to associate with the specified key
+ Slice Value(int k, std::string* storage) const {
+ Random r(k);
+ return test::RandomString(&r, kValueSize, storage);
+ }
+
+ Status OpenDB() {
+ delete db_;
+ db_ = nullptr;
+ env_->ResetState();
+ return DB::Open(options_, dbname_, &db_);
+ }
+
+ void CloseDB() {
+ delete db_;
+ db_ = nullptr;
+ }
+
+ void DeleteAllData() {
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(db_->Delete(WriteOptions(), iter->key()));
+ }
+
+ delete iter;
+ }
+
+ void ResetDBState(ResetMethod reset_method) {
+ switch (reset_method) {
+ case RESET_DROP_UNSYNCED_DATA:
+ ASSERT_OK(env_->DropUnsyncedFileData());
+ break;
+ case RESET_DELETE_UNSYNCED_FILES:
+ ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
+ break;
+ default:
+ assert(false);
+ }
+ }
+
+ void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) {
+ DeleteAllData();
+ Build(0, num_pre_sync);
+ db_->CompactRange(nullptr, nullptr);
+ Build(num_pre_sync, num_post_sync);
+ }
+
+ void PartialCompactTestReopenWithFault(ResetMethod reset_method,
+ int num_pre_sync, int num_post_sync) {
+ env_->SetFilesystemActive(false);
+ CloseDB();
+ ResetDBState(reset_method);
+ ASSERT_OK(OpenDB());
+ ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::VAL_EXPECT_NO_ERROR));
+ ASSERT_OK(Verify(num_pre_sync, num_post_sync,
+ FaultInjectionTest::VAL_EXPECT_ERROR));
+ }
+
+ void NoWriteTestPreFault() {}
+
+ void NoWriteTestReopenWithFault(ResetMethod reset_method) {
+ CloseDB();
+ ResetDBState(reset_method);
+ ASSERT_OK(OpenDB());
+ }
+
+ void DoTest() {
+ Random rnd(0);
+ ASSERT_OK(OpenDB());
+ for (size_t idx = 0; idx < kNumIterations; idx++) {
+ int num_pre_sync = rnd.Uniform(kMaxNumValues);
+ int num_post_sync = rnd.Uniform(kMaxNumValues);
+
+ PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+ PartialCompactTestReopenWithFault(RESET_DROP_UNSYNCED_DATA, num_pre_sync,
+ num_post_sync);
+
+ NoWriteTestPreFault();
+ NoWriteTestReopenWithFault(RESET_DROP_UNSYNCED_DATA);
+
+ PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+ // No new files created so we expect all values since no files will be
+ // dropped.
+ PartialCompactTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES,
+ num_pre_sync + num_post_sync, 0);
+
+ NoWriteTestPreFault();
+ NoWriteTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES);
+ }
+ }
+};
+
+TEST(FaultInjectionTest, FaultTestNoLogReuse) {
+ ReuseLogs(false);
+ DoTest();
+}
+
+TEST(FaultInjectionTest, FaultTestWithLogReuse) {
+ ReuseLogs(true);
+ DoTest();
+}
+
+} // namespace leveldb
+
+int main(int argc, char** argv) { return leveldb::test::RunAllTests(); }
diff --git a/src/leveldb/db/filename.cc b/src/leveldb/db/filename.cc
new file mode 100644
index 0000000000..85de45c507
--- /dev/null
+++ b/src/leveldb/db/filename.cc
@@ -0,0 +1,141 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/filename.h"
+
+#include <ctype.h>
+#include <stdio.h>
+
+#include "db/dbformat.h"
+#include "leveldb/env.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+// A utility routine: write "data" to the named file and Sync() it.
+Status WriteStringToFileSync(Env* env, const Slice& data,
+ const std::string& fname);
+
+static std::string MakeFileName(const std::string& dbname, uint64_t number,
+ const char* suffix) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "/%06llu.%s",
+ static_cast<unsigned long long>(number), suffix);
+ return dbname + buf;
+}
+
+std::string LogFileName(const std::string& dbname, uint64_t number) {
+ assert(number > 0);
+ return MakeFileName(dbname, number, "log");
+}
+
+std::string TableFileName(const std::string& dbname, uint64_t number) {
+ assert(number > 0);
+ return MakeFileName(dbname, number, "ldb");
+}
+
+std::string SSTTableFileName(const std::string& dbname, uint64_t number) {
+ assert(number > 0);
+ return MakeFileName(dbname, number, "sst");
+}
+
+std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
+ assert(number > 0);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "/MANIFEST-%06llu",
+ static_cast<unsigned long long>(number));
+ return dbname + buf;
+}
+
+std::string CurrentFileName(const std::string& dbname) {
+ return dbname + "/CURRENT";
+}
+
+std::string LockFileName(const std::string& dbname) { return dbname + "/LOCK"; }
+
+std::string TempFileName(const std::string& dbname, uint64_t number) {
+ assert(number > 0);
+ return MakeFileName(dbname, number, "dbtmp");
+}
+
+std::string InfoLogFileName(const std::string& dbname) {
+ return dbname + "/LOG";
+}
+
+// Return the name of the old info log file for "dbname".
+std::string OldInfoLogFileName(const std::string& dbname) {
+ return dbname + "/LOG.old";
+}
+
+// Owned filenames have the form:
+// dbname/CURRENT
+// dbname/LOCK
+// dbname/LOG
+// dbname/LOG.old
+// dbname/MANIFEST-[0-9]+
+// dbname/[0-9]+.(log|sst|ldb)
+bool ParseFileName(const std::string& filename, uint64_t* number,
+ FileType* type) {
+ Slice rest(filename);
+ if (rest == "CURRENT") {
+ *number = 0;
+ *type = kCurrentFile;
+ } else if (rest == "LOCK") {
+ *number = 0;
+ *type = kDBLockFile;
+ } else if (rest == "LOG" || rest == "LOG.old") {
+ *number = 0;
+ *type = kInfoLogFile;
+ } else if (rest.starts_with("MANIFEST-")) {
+ rest.remove_prefix(strlen("MANIFEST-"));
+ uint64_t num;
+ if (!ConsumeDecimalNumber(&rest, &num)) {
+ return false;
+ }
+ if (!rest.empty()) {
+ return false;
+ }
+ *type = kDescriptorFile;
+ *number = num;
+ } else {
+ // Avoid strtoull() to keep filename format independent of the
+ // current locale
+ uint64_t num;
+ if (!ConsumeDecimalNumber(&rest, &num)) {
+ return false;
+ }
+ Slice suffix = rest;
+ if (suffix == Slice(".log")) {
+ *type = kLogFile;
+ } else if (suffix == Slice(".sst") || suffix == Slice(".ldb")) {
+ *type = kTableFile;
+ } else if (suffix == Slice(".dbtmp")) {
+ *type = kTempFile;
+ } else {
+ return false;
+ }
+ *number = num;
+ }
+ return true;
+}
+
+Status SetCurrentFile(Env* env, const std::string& dbname,
+ uint64_t descriptor_number) {
+ // Remove leading "dbname/" and add newline to manifest file name
+ std::string manifest = DescriptorFileName(dbname, descriptor_number);
+ Slice contents = manifest;
+ assert(contents.starts_with(dbname + "/"));
+ contents.remove_prefix(dbname.size() + 1);
+ std::string tmp = TempFileName(dbname, descriptor_number);
+ Status s = WriteStringToFileSync(env, contents.ToString() + "\n", tmp);
+ if (s.ok()) {
+ s = env->RenameFile(tmp, CurrentFileName(dbname));
+ }
+ if (!s.ok()) {
+ env->DeleteFile(tmp);
+ }
+ return s;
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/filename.h b/src/leveldb/db/filename.h
new file mode 100644
index 0000000000..524e813c06
--- /dev/null
+++ b/src/leveldb/db/filename.h
@@ -0,0 +1,84 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// File names used by DB code
+
+#ifndef STORAGE_LEVELDB_DB_FILENAME_H_
+#define STORAGE_LEVELDB_DB_FILENAME_H_
+
+#include <stdint.h>
+
+#include <string>
+
+#include "leveldb/slice.h"
+#include "leveldb/status.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+class Env;
+
+enum FileType {
+ kLogFile,
+ kDBLockFile,
+ kTableFile,
+ kDescriptorFile,
+ kCurrentFile,
+ kTempFile,
+ kInfoLogFile // Either the current one, or an old one
+};
+
+// Return the name of the log file with the specified number
+// in the db named by "dbname". The result will be prefixed with
+// "dbname".
+std::string LogFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the sstable with the specified number
+// in the db named by "dbname". The result will be prefixed with
+// "dbname".
+std::string TableFileName(const std::string& dbname, uint64_t number);
+
+// Return the legacy file name for an sstable with the specified number
+// in the db named by "dbname". The result will be prefixed with
+// "dbname".
+std::string SSTTableFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the descriptor file for the db named by
+// "dbname" and the specified incarnation number. The result will be
+// prefixed with "dbname".
+std::string DescriptorFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the current file. This file contains the name
+// of the current manifest file. The result will be prefixed with
+// "dbname".
+std::string CurrentFileName(const std::string& dbname);
+
+// Return the name of the lock file for the db named by
+// "dbname". The result will be prefixed with "dbname".
+std::string LockFileName(const std::string& dbname);
+
+// Return the name of a temporary file owned by the db named "dbname".
+// The result will be prefixed with "dbname".
+std::string TempFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the info log file for "dbname".
+std::string InfoLogFileName(const std::string& dbname);
+
+// Return the name of the old info log file for "dbname".
+std::string OldInfoLogFileName(const std::string& dbname);
+
+// If filename is a leveldb file, store the type of the file in *type.
+// The number encoded in the filename is stored in *number. If the
+// filename was successfully parsed, returns true. Else return false.
+bool ParseFileName(const std::string& filename, uint64_t* number,
+ FileType* type);
+
+// Make the CURRENT file point to the descriptor file with the
+// specified number.
+Status SetCurrentFile(Env* env, const std::string& dbname,
+ uint64_t descriptor_number);
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_FILENAME_H_
diff --git a/src/leveldb/db/filename_test.cc b/src/leveldb/db/filename_test.cc
new file mode 100644
index 0000000000..952f32008e
--- /dev/null
+++ b/src/leveldb/db/filename_test.cc
@@ -0,0 +1,131 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/filename.h"
+
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+class FileNameTest {};
+
+TEST(FileNameTest, Parse) {
+ Slice db;
+ FileType type;
+ uint64_t number;
+
+ // Successful parses
+ static struct {
+ const char* fname;
+ uint64_t number;
+ FileType type;
+ } cases[] = {
+ {"100.log", 100, kLogFile},
+ {"0.log", 0, kLogFile},
+ {"0.sst", 0, kTableFile},
+ {"0.ldb", 0, kTableFile},
+ {"CURRENT", 0, kCurrentFile},
+ {"LOCK", 0, kDBLockFile},
+ {"MANIFEST-2", 2, kDescriptorFile},
+ {"MANIFEST-7", 7, kDescriptorFile},
+ {"LOG", 0, kInfoLogFile},
+ {"LOG.old", 0, kInfoLogFile},
+ {"18446744073709551615.log", 18446744073709551615ull, kLogFile},
+ };
+ for (int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+ std::string f = cases[i].fname;
+ ASSERT_TRUE(ParseFileName(f, &number, &type)) << f;
+ ASSERT_EQ(cases[i].type, type) << f;
+ ASSERT_EQ(cases[i].number, number) << f;
+ }
+
+ // Errors
+ static const char* errors[] = {"",
+ "foo",
+ "foo-dx-100.log",
+ ".log",
+ "",
+ "manifest",
+ "CURREN",
+ "CURRENTX",
+ "MANIFES",
+ "MANIFEST",
+ "MANIFEST-",
+ "XMANIFEST-3",
+ "MANIFEST-3x",
+ "LOC",
+ "LOCKx",
+ "LO",
+ "LOGx",
+ "18446744073709551616.log",
+ "184467440737095516150.log",
+ "100",
+ "100.",
+ "100.lop"};
+ for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
+ std::string f = errors[i];
+ ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
+ }
+}
+
+TEST(FileNameTest, Construction) {
+ uint64_t number;
+ FileType type;
+ std::string fname;
+
+ fname = CurrentFileName("foo");
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(0, number);
+ ASSERT_EQ(kCurrentFile, type);
+
+ fname = LockFileName("foo");
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(0, number);
+ ASSERT_EQ(kDBLockFile, type);
+
+ fname = LogFileName("foo", 192);
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(192, number);
+ ASSERT_EQ(kLogFile, type);
+
+ fname = TableFileName("bar", 200);
+ ASSERT_EQ("bar/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(200, number);
+ ASSERT_EQ(kTableFile, type);
+
+ fname = DescriptorFileName("bar", 100);
+ ASSERT_EQ("bar/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(100, number);
+ ASSERT_EQ(kDescriptorFile, type);
+
+ fname = TempFileName("tmp", 999);
+ ASSERT_EQ("tmp/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(999, number);
+ ASSERT_EQ(kTempFile, type);
+
+ fname = InfoLogFileName("foo");
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(0, number);
+ ASSERT_EQ(kInfoLogFile, type);
+
+ fname = OldInfoLogFileName("foo");
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(0, number);
+ ASSERT_EQ(kInfoLogFile, type);
+}
+
+} // namespace leveldb
+
+int main(int argc, char** argv) { return leveldb::test::RunAllTests(); }
diff --git a/src/leveldb/db/leveldbutil.cc b/src/leveldb/db/leveldbutil.cc
new file mode 100644
index 0000000000..9ed9667d37
--- /dev/null
+++ b/src/leveldb/db/leveldbutil.cc
@@ -0,0 +1,64 @@
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+
+#include "leveldb/dumpfile.h"
+#include "leveldb/env.h"
+#include "leveldb/status.h"
+
+namespace leveldb {
+namespace {
+
+class StdoutPrinter : public WritableFile {
+ public:
+ Status Append(const Slice& data) override {
+ fwrite(data.data(), 1, data.size(), stdout);
+ return Status::OK();
+ }
+ Status Close() override { return Status::OK(); }
+ Status Flush() override { return Status::OK(); }
+ Status Sync() override { return Status::OK(); }
+ std::string GetName() const override { return "[stdout]"; }
+};
+
+bool HandleDumpCommand(Env* env, char** files, int num) {
+ StdoutPrinter printer;
+ bool ok = true;
+ for (int i = 0; i < num; i++) {
+ Status s = DumpFile(env, files[i], &printer);
+ if (!s.ok()) {
+ fprintf(stderr, "%s\n", s.ToString().c_str());
+ ok = false;
+ }
+ }
+ return ok;
+}
+
+} // namespace
+} // namespace leveldb
+
+static void Usage() {
+ fprintf(stderr,
+ "Usage: leveldbutil command...\n"
+ " dump files... -- dump contents of specified files\n");
+}
+
+int main(int argc, char** argv) {
+ leveldb::Env* env = leveldb::Env::Default();
+ bool ok = true;
+ if (argc < 2) {
+ Usage();
+ ok = false;
+ } else {
+ std::string command = argv[1];
+ if (command == "dump") {
+ ok = leveldb::HandleDumpCommand(env, argv + 2, argc - 2);
+ } else {
+ Usage();
+ ok = false;
+ }
+ }
+ return (ok ? 0 : 1);
+}
diff --git a/src/leveldb/db/log_format.h b/src/leveldb/db/log_format.h
new file mode 100644
index 0000000000..356e69fca2
--- /dev/null
+++ b/src/leveldb/db/log_format.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Log format information shared by reader and writer.
+// See ../doc/log_format.md for more detail.
+
+#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_
+#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_
+
+namespace leveldb {
+namespace log {
+
+enum RecordType {
+ // Zero is reserved for preallocated files
+ kZeroType = 0,
+
+ kFullType = 1,
+
+ // For fragments
+ kFirstType = 2,
+ kMiddleType = 3,
+ kLastType = 4
+};
+static const int kMaxRecordType = kLastType;
+
+static const int kBlockSize = 32768;
+
+// Header is checksum (4 bytes), length (2 bytes), type (1 byte).
+static const int kHeaderSize = 4 + 2 + 1;
+
+} // namespace log
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_LOG_FORMAT_H_
diff --git a/src/leveldb/db/log_reader.cc b/src/leveldb/db/log_reader.cc
new file mode 100644
index 0000000000..1ccfb7b34a
--- /dev/null
+++ b/src/leveldb/db/log_reader.cc
@@ -0,0 +1,274 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+
+#include <stdio.h>
+
+#include "leveldb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace leveldb {
+namespace log {
+
+Reader::Reporter::~Reporter() = default;
+
+Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum,
+ uint64_t initial_offset)
+ : file_(file),
+ reporter_(reporter),
+ checksum_(checksum),
+ backing_store_(new char[kBlockSize]),
+ buffer_(),
+ eof_(false),
+ last_record_offset_(0),
+ end_of_buffer_offset_(0),
+ initial_offset_(initial_offset),
+ resyncing_(initial_offset > 0) {}
+
+Reader::~Reader() { delete[] backing_store_; }
+
+bool Reader::SkipToInitialBlock() {
+ const size_t offset_in_block = initial_offset_ % kBlockSize;
+ uint64_t block_start_location = initial_offset_ - offset_in_block;
+
+ // Don't search a block if we'd be in the trailer
+ if (offset_in_block > kBlockSize - 6) {
+ block_start_location += kBlockSize;
+ }
+
+ end_of_buffer_offset_ = block_start_location;
+
+ // Skip to start of first block that can contain the initial record
+ if (block_start_location > 0) {
+ Status skip_status = file_->Skip(block_start_location);
+ if (!skip_status.ok()) {
+ ReportDrop(block_start_location, skip_status);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool Reader::ReadRecord(Slice* record, std::string* scratch) {
+ if (last_record_offset_ < initial_offset_) {
+ if (!SkipToInitialBlock()) {
+ return false;
+ }
+ }
+
+ scratch->clear();
+ record->clear();
+ bool in_fragmented_record = false;
+ // Record offset of the logical record that we're reading
+ // 0 is a dummy value to make compilers happy
+ uint64_t prospective_record_offset = 0;
+
+ Slice fragment;
+ while (true) {
+ const unsigned int record_type = ReadPhysicalRecord(&fragment);
+
+ // ReadPhysicalRecord may have only had an empty trailer remaining in its
+ // internal buffer. Calculate the offset of the next physical record now
+ // that it has returned, properly accounting for its header size.
+ uint64_t physical_record_offset =
+ end_of_buffer_offset_ - buffer_.size() - kHeaderSize - fragment.size();
+
+ if (resyncing_) {
+ if (record_type == kMiddleType) {
+ continue;
+ } else if (record_type == kLastType) {
+ resyncing_ = false;
+ continue;
+ } else {
+ resyncing_ = false;
+ }
+ }
+
+ switch (record_type) {
+ case kFullType:
+ if (in_fragmented_record) {
+ // Handle bug in earlier versions of log::Writer where
+ // it could emit an empty kFirstType record at the tail end
+ // of a block followed by a kFullType or kFirstType record
+ // at the beginning of the next block.
+ if (!scratch->empty()) {
+ ReportCorruption(scratch->size(), "partial record without end(1)");
+ }
+ }
+ prospective_record_offset = physical_record_offset;
+ scratch->clear();
+ *record = fragment;
+ last_record_offset_ = prospective_record_offset;
+ return true;
+
+ case kFirstType:
+ if (in_fragmented_record) {
+ // Handle bug in earlier versions of log::Writer where
+ // it could emit an empty kFirstType record at the tail end
+ // of a block followed by a kFullType or kFirstType record
+ // at the beginning of the next block.
+ if (!scratch->empty()) {
+ ReportCorruption(scratch->size(), "partial record without end(2)");
+ }
+ }
+ prospective_record_offset = physical_record_offset;
+ scratch->assign(fragment.data(), fragment.size());
+ in_fragmented_record = true;
+ break;
+
+ case kMiddleType:
+ if (!in_fragmented_record) {
+ ReportCorruption(fragment.size(),
+ "missing start of fragmented record(1)");
+ } else {
+ scratch->append(fragment.data(), fragment.size());
+ }
+ break;
+
+ case kLastType:
+ if (!in_fragmented_record) {
+ ReportCorruption(fragment.size(),
+ "missing start of fragmented record(2)");
+ } else {
+ scratch->append(fragment.data(), fragment.size());
+ *record = Slice(*scratch);
+ last_record_offset_ = prospective_record_offset;
+ return true;
+ }
+ break;
+
+ case kEof:
+ if (in_fragmented_record) {
+ // This can be caused by the writer dying immediately after
+ // writing a physical record but before completing the next; don't
+ // treat it as a corruption, just ignore the entire logical record.
+ scratch->clear();
+ }
+ return false;
+
+ case kBadRecord:
+ if (in_fragmented_record) {
+ ReportCorruption(scratch->size(), "error in middle of record");
+ in_fragmented_record = false;
+ scratch->clear();
+ }
+ break;
+
+ default: {
+ char buf[40];
+ snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
+ ReportCorruption(
+ (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
+ buf);
+ in_fragmented_record = false;
+ scratch->clear();
+ break;
+ }
+ }
+ }
+ return false;
+}
+
+uint64_t Reader::LastRecordOffset() { return last_record_offset_; }
+
+void Reader::ReportCorruption(uint64_t bytes, const char* reason) {
+ ReportDrop(bytes, Status::Corruption(reason, file_->GetName()));
+}
+
+void Reader::ReportDrop(uint64_t bytes, const Status& reason) {
+ if (reporter_ != nullptr &&
+ end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
+ reporter_->Corruption(static_cast<size_t>(bytes), reason);
+ }
+}
+
+unsigned int Reader::ReadPhysicalRecord(Slice* result) {
+ while (true) {
+ if (buffer_.size() < kHeaderSize) {
+ if (!eof_) {
+ // Last read was a full read, so this is a trailer to skip
+ buffer_.clear();
+ Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
+ end_of_buffer_offset_ += buffer_.size();
+ if (!status.ok()) {
+ buffer_.clear();
+ ReportDrop(kBlockSize, status);
+ eof_ = true;
+ return kEof;
+ } else if (buffer_.size() < kBlockSize) {
+ eof_ = true;
+ }
+ continue;
+ } else {
+ // Note that if buffer_ is non-empty, we have a truncated header at the
+ // end of the file, which can be caused by the writer crashing in the
+ // middle of writing the header. Instead of considering this an error,
+ // just report EOF.
+ buffer_.clear();
+ return kEof;
+ }
+ }
+
+ // Parse the header
+ const char* header = buffer_.data();
+ const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+ const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+ const unsigned int type = header[6];
+ const uint32_t length = a | (b << 8);
+ if (kHeaderSize + length > buffer_.size()) {
+ size_t drop_size = buffer_.size();
+ buffer_.clear();
+ if (!eof_) {
+ ReportCorruption(drop_size, "bad record length");
+ return kBadRecord;
+ }
+ // If the end of the file has been reached without reading |length| bytes
+ // of payload, assume the writer died in the middle of writing the record.
+ // Don't report a corruption.
+ return kEof;
+ }
+
+ if (type == kZeroType && length == 0) {
+ // Skip zero length record without reporting any drops since
+ // such records are produced by the mmap based writing code in
+ // env_posix.cc that preallocates file regions.
+ buffer_.clear();
+ return kBadRecord;
+ }
+
+ // Check crc
+ if (checksum_) {
+ uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+ uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
+ if (actual_crc != expected_crc) {
+ // Drop the rest of the buffer since "length" itself may have
+ // been corrupted and if we trust it, we could find some
+ // fragment of a real log record that just happens to look
+ // like a valid log record.
+ size_t drop_size = buffer_.size();
+ buffer_.clear();
+ ReportCorruption(drop_size, "checksum mismatch");
+ return kBadRecord;
+ }
+ }
+
+ buffer_.remove_prefix(kHeaderSize + length);
+
+ // Skip physical record that started before initial_offset_
+ if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
+ initial_offset_) {
+ result->clear();
+ return kBadRecord;
+ }
+
+ *result = Slice(header + kHeaderSize, length);
+ return type;
+ }
+}
+
+} // namespace log
+} // namespace leveldb
diff --git a/src/leveldb/db/log_reader.h b/src/leveldb/db/log_reader.h
new file mode 100644
index 0000000000..001da8948a
--- /dev/null
+++ b/src/leveldb/db/log_reader.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_LOG_READER_H_
+#define STORAGE_LEVELDB_DB_LOG_READER_H_
+
+#include <stdint.h>
+
+#include "db/log_format.h"
+#include "leveldb/slice.h"
+#include "leveldb/status.h"
+
+namespace leveldb {
+
+class SequentialFile;
+
+namespace log {
+
+class Reader {
+ public:
+ // Interface for reporting errors.
+ class Reporter {
+ public:
+ virtual ~Reporter();
+
+ // Some corruption was detected. "size" is the approximate number
+ // of bytes dropped due to the corruption.
+ virtual void Corruption(size_t bytes, const Status& status) = 0;
+ };
+
+ // Create a reader that will return log records from "*file".
+ // "*file" must remain live while this Reader is in use.
+ //
+ // If "reporter" is non-null, it is notified whenever some data is
+ // dropped due to a detected corruption. "*reporter" must remain
+ // live while this Reader is in use.
+ //
+ // If "checksum" is true, verify checksums if available.
+ //
+ // The Reader will start reading at the first record located at physical
+ // position >= initial_offset within the file.
+ Reader(SequentialFile* file, Reporter* reporter, bool checksum,
+ uint64_t initial_offset);
+
+ Reader(const Reader&) = delete;
+ Reader& operator=(const Reader&) = delete;
+
+ ~Reader();
+
+ // Read the next record into *record. Returns true if read
+ // successfully, false if we hit end of the input. May use
+ // "*scratch" as temporary storage. The contents filled in *record
+ // will only be valid until the next mutating operation on this
+ // reader or the next mutation to *scratch.
+ bool ReadRecord(Slice* record, std::string* scratch);
+
+ // Returns the physical offset of the last record returned by ReadRecord.
+ //
+ // Undefined before the first call to ReadRecord.
+ uint64_t LastRecordOffset();
+
+ private:
+ // Extend record types with the following special values
+ enum {
+ kEof = kMaxRecordType + 1,
+ // Returned whenever we find an invalid physical record.
+ // Currently there are three situations in which this happens:
+ // * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
+ // * The record is a 0-length record (No drop is reported)
+ // * The record is below constructor's initial_offset (No drop is reported)
+ kBadRecord = kMaxRecordType + 2
+ };
+
+ // Skips all blocks that are completely before "initial_offset_".
+ //
+ // Returns true on success. Handles reporting.
+ bool SkipToInitialBlock();
+
+ // Return type, or one of the preceding special values
+ unsigned int ReadPhysicalRecord(Slice* result);
+
+ // Reports dropped bytes to the reporter.
+ // buffer_ must be updated to remove the dropped bytes prior to invocation.
+ void ReportCorruption(uint64_t bytes, const char* reason);
+ void ReportDrop(uint64_t bytes, const Status& reason);
+
+ SequentialFile* const file_;
+ Reporter* const reporter_;
+ bool const checksum_;
+ char* const backing_store_;
+ Slice buffer_;
+ bool eof_; // Last Read() indicated EOF by returning < kBlockSize
+
+ // Offset of the last record returned by ReadRecord.
+ uint64_t last_record_offset_;
+ // Offset of the first location past the end of buffer_.
+ uint64_t end_of_buffer_offset_;
+
+ // Offset at which to start looking for the first record to return
+ uint64_t const initial_offset_;
+
+ // True if we are resynchronizing after a seek (initial_offset_ > 0). In
+ // particular, a run of kMiddleType and kLastType records can be silently
+ // skipped in this mode
+ bool resyncing_;
+};
+
+} // namespace log
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_LOG_READER_H_
diff --git a/src/leveldb/db/log_test.cc b/src/leveldb/db/log_test.cc
new file mode 100644
index 0000000000..41fc043068
--- /dev/null
+++ b/src/leveldb/db/log_test.cc
@@ -0,0 +1,562 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "leveldb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+namespace log {
+
+// Construct a string of the specified length made out of the supplied
+// partial string.
+static std::string BigString(const std::string& partial_string, size_t n) {
+ std::string result;
+ while (result.size() < n) {
+ result.append(partial_string);
+ }
+ result.resize(n);
+ return result;
+}
+
+// Construct a string from a number
+static std::string NumberString(int n) {
+ char buf[50];
+ snprintf(buf, sizeof(buf), "%d.", n);
+ return std::string(buf);
+}
+
+// Return a skewed potentially long string
+static std::string RandomSkewedString(int i, Random* rnd) {
+ return BigString(NumberString(i), rnd->Skewed(17));
+}
+
+class LogTest {
+ public:
+ LogTest()
+ : reading_(false),
+ writer_(new Writer(&dest_)),
+ reader_(new Reader(&source_, &report_, true /*checksum*/,
+ 0 /*initial_offset*/)) {}
+
+ ~LogTest() {
+ delete writer_;
+ delete reader_;
+ }
+
+ void ReopenForAppend() {
+ delete writer_;
+ writer_ = new Writer(&dest_, dest_.contents_.size());
+ }
+
+ void Write(const std::string& msg) {
+ ASSERT_TRUE(!reading_) << "Write() after starting to read";
+ writer_->AddRecord(Slice(msg));
+ }
+
+ size_t WrittenBytes() const { return dest_.contents_.size(); }
+
+ std::string Read() {
+ if (!reading_) {
+ reading_ = true;
+ source_.contents_ = Slice(dest_.contents_);
+ }
+ std::string scratch;
+ Slice record;
+ if (reader_->ReadRecord(&record, &scratch)) {
+ return record.ToString();
+ } else {
+ return "EOF";
+ }
+ }
+
+ void IncrementByte(int offset, int delta) {
+ dest_.contents_[offset] += delta;
+ }
+
+ void SetByte(int offset, char new_byte) {
+ dest_.contents_[offset] = new_byte;
+ }
+
+ void ShrinkSize(int bytes) {
+ dest_.contents_.resize(dest_.contents_.size() - bytes);
+ }
+
+ void FixChecksum(int header_offset, int len) {
+ // Compute crc of type/len/data
+ uint32_t crc = crc32c::Value(&dest_.contents_[header_offset + 6], 1 + len);
+ crc = crc32c::Mask(crc);
+ EncodeFixed32(&dest_.contents_[header_offset], crc);
+ }
+
+ void ForceError() { source_.force_error_ = true; }
+
+ size_t DroppedBytes() const { return report_.dropped_bytes_; }
+
+ std::string ReportMessage() const { return report_.message_; }
+
+ // Returns OK iff recorded error message contains "msg"
+ std::string MatchError(const std::string& msg) const {
+ if (report_.message_.find(msg) == std::string::npos) {
+ return report_.message_;
+ } else {
+ return "OK";
+ }
+ }
+
+ void WriteInitialOffsetLog() {
+ for (int i = 0; i < num_initial_offset_records_; i++) {
+ std::string record(initial_offset_record_sizes_[i],
+ static_cast<char>('a' + i));
+ Write(record);
+ }
+ }
+
+ void StartReadingAt(uint64_t initial_offset) {
+ delete reader_;
+ reader_ = new Reader(&source_, &report_, true /*checksum*/, initial_offset);
+ }
+
+ void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
+ WriteInitialOffsetLog();
+ reading_ = true;
+ source_.contents_ = Slice(dest_.contents_);
+ Reader* offset_reader = new Reader(&source_, &report_, true /*checksum*/,
+ WrittenBytes() + offset_past_end);
+ Slice record;
+ std::string scratch;
+ ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch));
+ delete offset_reader;
+ }
+
+ void CheckInitialOffsetRecord(uint64_t initial_offset,
+ int expected_record_offset) {
+ WriteInitialOffsetLog();
+ reading_ = true;
+ source_.contents_ = Slice(dest_.contents_);
+ Reader* offset_reader =
+ new Reader(&source_, &report_, true /*checksum*/, initial_offset);
+
+ // Read all records from expected_record_offset through the last one.
+ ASSERT_LT(expected_record_offset, num_initial_offset_records_);
+ for (; expected_record_offset < num_initial_offset_records_;
+ ++expected_record_offset) {
+ Slice record;
+ std::string scratch;
+ ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
+ ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset],
+ record.size());
+ ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
+ offset_reader->LastRecordOffset());
+ ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
+ }
+ delete offset_reader;
+ }
+
+ private:
+ class StringDest : public WritableFile {
+ public:
+ Status Close() override { return Status::OK(); }
+ Status Flush() override { return Status::OK(); }
+ Status Sync() override { return Status::OK(); }
+ Status Append(const Slice& slice) override {
+ contents_.append(slice.data(), slice.size());
+ return Status::OK();
+ }
+ std::string GetName() const override { return ""; }
+
+ std::string contents_;
+ };
+
+ class StringSource : public SequentialFile {
+ public:
+ StringSource() : force_error_(false), returned_partial_(false) {}
+
+ Status Read(size_t n, Slice* result, char* scratch) override {
+ ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
+
+ if (force_error_) {
+ force_error_ = false;
+ returned_partial_ = true;
+ return Status::Corruption("read error");
+ }
+
+ if (contents_.size() < n) {
+ n = contents_.size();
+ returned_partial_ = true;
+ }
+ *result = Slice(contents_.data(), n);
+ contents_.remove_prefix(n);
+ return Status::OK();
+ }
+
+ Status Skip(uint64_t n) override {
+ if (n > contents_.size()) {
+ contents_.clear();
+ return Status::NotFound("in-memory file skipped past end");
+ }
+
+ contents_.remove_prefix(n);
+
+ return Status::OK();
+ }
+ std::string GetName() const { return ""; }
+
+ Slice contents_;
+ bool force_error_;
+ bool returned_partial_;
+ };
+
+ class ReportCollector : public Reader::Reporter {
+ public:
+ ReportCollector() : dropped_bytes_(0) {}
+ void Corruption(size_t bytes, const Status& status) override {
+ dropped_bytes_ += bytes;
+ message_.append(status.ToString());
+ }
+
+ size_t dropped_bytes_;
+ std::string message_;
+ };
+
+ // Record metadata for testing initial offset functionality
+ static size_t initial_offset_record_sizes_[];
+ static uint64_t initial_offset_last_record_offsets_[];
+ static int num_initial_offset_records_;
+
+ StringDest dest_;
+ StringSource source_;
+ ReportCollector report_;
+ bool reading_;
+ Writer* writer_;
+ Reader* reader_;
+};
+
+size_t LogTest::initial_offset_record_sizes_[] = {
+ 10000, // Two sizable records in first block
+ 10000,
+ 2 * log::kBlockSize - 1000, // Span three blocks
+ 1,
+ 13716, // Consume all but two bytes of block 3.
+ log::kBlockSize - kHeaderSize, // Consume the entirety of block 4.
+};
+
+uint64_t LogTest::initial_offset_last_record_offsets_[] = {
+ 0,
+ kHeaderSize + 10000,
+ 2 * (kHeaderSize + 10000),
+ 2 * (kHeaderSize + 10000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
+ 2 * (kHeaderSize + 10000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize +
+ kHeaderSize + 1,
+ 3 * log::kBlockSize,
+};
+
+// LogTest::initial_offset_last_record_offsets_ must be defined before this.
+int LogTest::num_initial_offset_records_ =
+ sizeof(LogTest::initial_offset_last_record_offsets_) / sizeof(uint64_t);
+
+TEST(LogTest, Empty) { ASSERT_EQ("EOF", Read()); }
+
+TEST(LogTest, ReadWrite) {
+ Write("foo");
+ Write("bar");
+ Write("");
+ Write("xxxx");
+ ASSERT_EQ("foo", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("xxxx", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ("EOF", Read()); // Make sure reads at eof work
+}
+
+TEST(LogTest, ManyBlocks) {
+ for (int i = 0; i < 100000; i++) {
+ Write(NumberString(i));
+ }
+ for (int i = 0; i < 100000; i++) {
+ ASSERT_EQ(NumberString(i), Read());
+ }
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, Fragmentation) {
+ Write("small");
+ Write(BigString("medium", 50000));
+ Write(BigString("large", 100000));
+ ASSERT_EQ("small", Read());
+ ASSERT_EQ(BigString("medium", 50000), Read());
+ ASSERT_EQ(BigString("large", 100000), Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, MarginalTrailer) {
+ // Make a trailer that is exactly the same length as an empty record.
+ const int n = kBlockSize - 2 * kHeaderSize;
+ Write(BigString("foo", n));
+ ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes());
+ Write("");
+ Write("bar");
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, MarginalTrailer2) {
+ // Make a trailer that is exactly the same length as an empty record.
+ const int n = kBlockSize - 2 * kHeaderSize;
+ Write(BigString("foo", n));
+ ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes());
+ Write("bar");
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(0, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
+}
+
+TEST(LogTest, ShortTrailer) {
+ const int n = kBlockSize - 2 * kHeaderSize + 4;
+ Write(BigString("foo", n));
+ ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());
+ Write("");
+ Write("bar");
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, AlignedEof) {
+ const int n = kBlockSize - 2 * kHeaderSize + 4;
+ Write(BigString("foo", n));
+ ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, OpenForAppend) {
+ Write("hello");
+ ReopenForAppend();
+ Write("world");
+ ASSERT_EQ("hello", Read());
+ ASSERT_EQ("world", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, RandomRead) {
+ const int N = 500;
+ Random write_rnd(301);
+ for (int i = 0; i < N; i++) {
+ Write(RandomSkewedString(i, &write_rnd));
+ }
+ Random read_rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
+ }
+ ASSERT_EQ("EOF", Read());
+}
+
+// Tests of all the error paths in log_reader.cc follow:
+
+TEST(LogTest, ReadError) {
+ Write("foo");
+ ForceError();
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(kBlockSize, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("read error"));
+}
+
+TEST(LogTest, BadRecordType) {
+ Write("foo");
+ // Type is stored in header[6]
+ IncrementByte(6, 100);
+ FixChecksum(0, 3);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("unknown record type"));
+}
+
+TEST(LogTest, TruncatedTrailingRecordIsIgnored) {
+ Write("foo");
+ ShrinkSize(4); // Drop all payload as well as a header byte
+ ASSERT_EQ("EOF", Read());
+ // Truncated last record is ignored, not treated as an error.
+ ASSERT_EQ(0, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
+}
+
+TEST(LogTest, BadLength) {
+ const int kPayloadSize = kBlockSize - kHeaderSize;
+ Write(BigString("bar", kPayloadSize));
+ Write("foo");
+ // Least significant size byte is stored in header[4].
+ IncrementByte(4, 1);
+ ASSERT_EQ("foo", Read());
+ ASSERT_EQ(kBlockSize, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("bad record length"));
+}
+
+TEST(LogTest, BadLengthAtEndIsIgnored) {
+ Write("foo");
+ ShrinkSize(1);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(0, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
+}
+
+TEST(LogTest, ChecksumMismatch) {
+ Write("foo");
+ IncrementByte(0, 10);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(10, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("checksum mismatch"));
+}
+
+TEST(LogTest, UnexpectedMiddleType) {
+ Write("foo");
+ SetByte(6, kMiddleType);
+ FixChecksum(0, 3);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST(LogTest, UnexpectedLastType) {
+ Write("foo");
+ SetByte(6, kLastType);
+ FixChecksum(0, 3);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST(LogTest, UnexpectedFullType) {
+ Write("foo");
+ Write("bar");
+ SetByte(6, kFirstType);
+ FixChecksum(0, 3);
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST(LogTest, UnexpectedFirstType) {
+ Write("foo");
+ Write(BigString("bar", 100000));
+ SetByte(6, kFirstType);
+ FixChecksum(0, 3);
+ ASSERT_EQ(BigString("bar", 100000), Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST(LogTest, MissingLastIsIgnored) {
+ Write(BigString("bar", kBlockSize));
+ // Remove the LAST block, including header.
+ ShrinkSize(14);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ("", ReportMessage());
+ ASSERT_EQ(0, DroppedBytes());
+}
+
+TEST(LogTest, PartialLastIsIgnored) {
+ Write(BigString("bar", kBlockSize));
+ // Cause a bad record length in the LAST block.
+ ShrinkSize(1);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ("", ReportMessage());
+ ASSERT_EQ(0, DroppedBytes());
+}
+
+TEST(LogTest, SkipIntoMultiRecord) {
+ // Consider a fragmented record:
+ // first(R1), middle(R1), last(R1), first(R2)
+ // If initial_offset points to a record after first(R1) but before first(R2)
+ // incomplete fragment errors are not actual errors, and must be suppressed
+ // until a new first or full record is encountered.
+ Write(BigString("foo", 3 * kBlockSize));
+ Write("correct");
+ StartReadingAt(kBlockSize);
+
+ ASSERT_EQ("correct", Read());
+ ASSERT_EQ("", ReportMessage());
+ ASSERT_EQ(0, DroppedBytes());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, ErrorJoinsRecords) {
+ // Consider two fragmented records:
+ // first(R1) last(R1) first(R2) last(R2)
+ // where the middle two fragments disappear. We do not want
+ // first(R1),last(R2) to get joined and returned as a valid record.
+
+ // Write records that span two blocks
+ Write(BigString("foo", kBlockSize));
+ Write(BigString("bar", kBlockSize));
+ Write("correct");
+
+ // Wipe the middle block
+ for (int offset = kBlockSize; offset < 2 * kBlockSize; offset++) {
+ SetByte(offset, 'x');
+ }
+
+ ASSERT_EQ("correct", Read());
+ ASSERT_EQ("EOF", Read());
+ const size_t dropped = DroppedBytes();
+ ASSERT_LE(dropped, 2 * kBlockSize + 100);
+ ASSERT_GE(dropped, 2 * kBlockSize);
+}
+
+TEST(LogTest, ReadStart) { CheckInitialOffsetRecord(0, 0); }
+
+TEST(LogTest, ReadSecondOneOff) { CheckInitialOffsetRecord(1, 1); }
+
+TEST(LogTest, ReadSecondTenThousand) { CheckInitialOffsetRecord(10000, 1); }
+
+TEST(LogTest, ReadSecondStart) { CheckInitialOffsetRecord(10007, 1); }
+
+TEST(LogTest, ReadThirdOneOff) { CheckInitialOffsetRecord(10008, 2); }
+
+TEST(LogTest, ReadThirdStart) { CheckInitialOffsetRecord(20014, 2); }
+
+TEST(LogTest, ReadFourthOneOff) { CheckInitialOffsetRecord(20015, 3); }
+
+TEST(LogTest, ReadFourthFirstBlockTrailer) {
+ CheckInitialOffsetRecord(log::kBlockSize - 4, 3);
+}
+
+TEST(LogTest, ReadFourthMiddleBlock) {
+ CheckInitialOffsetRecord(log::kBlockSize + 1, 3);
+}
+
+TEST(LogTest, ReadFourthLastBlock) {
+ CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3);
+}
+
+TEST(LogTest, ReadFourthStart) {
+ CheckInitialOffsetRecord(
+ 2 * (kHeaderSize + 1000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
+ 3);
+}
+
+TEST(LogTest, ReadInitialOffsetIntoBlockPadding) {
+ CheckInitialOffsetRecord(3 * log::kBlockSize - 3, 5);
+}
+
+TEST(LogTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); }
+
+TEST(LogTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); }
+
+} // namespace log
+} // namespace leveldb
+
+int main(int argc, char** argv) { return leveldb::test::RunAllTests(); }
diff --git a/src/leveldb/db/log_writer.cc b/src/leveldb/db/log_writer.cc
new file mode 100644
index 0000000000..bfb16fb486
--- /dev/null
+++ b/src/leveldb/db/log_writer.cc
@@ -0,0 +1,111 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_writer.h"
+
+#include <stdint.h>
+
+#include "leveldb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace leveldb {
+namespace log {
+
+static void InitTypeCrc(uint32_t* type_crc) {
+ for (int i = 0; i <= kMaxRecordType; i++) {
+ char t = static_cast<char>(i);
+ type_crc[i] = crc32c::Value(&t, 1);
+ }
+}
+
+Writer::Writer(WritableFile* dest) : dest_(dest), block_offset_(0) {
+ InitTypeCrc(type_crc_);
+}
+
+Writer::Writer(WritableFile* dest, uint64_t dest_length)
+ : dest_(dest), block_offset_(dest_length % kBlockSize) {
+ InitTypeCrc(type_crc_);
+}
+
+Writer::~Writer() = default;
+
+Status Writer::AddRecord(const Slice& slice) {
+ const char* ptr = slice.data();
+ size_t left = slice.size();
+
+ // Fragment the record if necessary and emit it. Note that if slice
+ // is empty, we still want to iterate once to emit a single
+ // zero-length record
+ Status s;
+ bool begin = true;
+ do {
+ const int leftover = kBlockSize - block_offset_;
+ assert(leftover >= 0);
+ if (leftover < kHeaderSize) {
+ // Switch to a new block
+ if (leftover > 0) {
+ // Fill the trailer (literal below relies on kHeaderSize being 7)
+ static_assert(kHeaderSize == 7, "");
+ dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover));
+ }
+ block_offset_ = 0;
+ }
+
+ // Invariant: we never leave < kHeaderSize bytes in a block.
+ assert(kBlockSize - block_offset_ - kHeaderSize >= 0);
+
+ const size_t avail = kBlockSize - block_offset_ - kHeaderSize;
+ const size_t fragment_length = (left < avail) ? left : avail;
+
+ RecordType type;
+ const bool end = (left == fragment_length);
+ if (begin && end) {
+ type = kFullType;
+ } else if (begin) {
+ type = kFirstType;
+ } else if (end) {
+ type = kLastType;
+ } else {
+ type = kMiddleType;
+ }
+
+ s = EmitPhysicalRecord(type, ptr, fragment_length);
+ ptr += fragment_length;
+ left -= fragment_length;
+ begin = false;
+ } while (s.ok() && left > 0);
+ return s;
+}
+
+Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr,
+ size_t length) {
+ assert(length <= 0xffff); // Must fit in two bytes
+ assert(block_offset_ + kHeaderSize + length <= kBlockSize);
+
+ // Format the header
+ char buf[kHeaderSize];
+ buf[4] = static_cast<char>(length & 0xff);
+ buf[5] = static_cast<char>(length >> 8);
+ buf[6] = static_cast<char>(t);
+
+ // Compute the crc of the record type and the payload.
+ uint32_t crc = crc32c::Extend(type_crc_[t], ptr, length);
+ crc = crc32c::Mask(crc); // Adjust for storage
+ EncodeFixed32(buf, crc);
+
+ // Write the header and the payload
+ Status s = dest_->Append(Slice(buf, kHeaderSize));
+ if (s.ok()) {
+ s = dest_->Append(Slice(ptr, length));
+ if (s.ok()) {
+ s = dest_->Flush();
+ }
+ }
+ block_offset_ += kHeaderSize + length;
+ return s;
+}
+
+} // namespace log
+} // namespace leveldb
diff --git a/src/leveldb/db/log_writer.h b/src/leveldb/db/log_writer.h
new file mode 100644
index 0000000000..c0a21147ee
--- /dev/null
+++ b/src/leveldb/db/log_writer.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_LOG_WRITER_H_
+#define STORAGE_LEVELDB_DB_LOG_WRITER_H_
+
+#include <stdint.h>
+
+#include "db/log_format.h"
+#include "leveldb/slice.h"
+#include "leveldb/status.h"
+
+namespace leveldb {
+
+class WritableFile;
+
+namespace log {
+
+class Writer {
+ public:
+ // Create a writer that will append data to "*dest".
+ // "*dest" must be initially empty.
+ // "*dest" must remain live while this Writer is in use.
+ explicit Writer(WritableFile* dest);
+
+ // Create a writer that will append data to "*dest".
+ // "*dest" must have initial length "dest_length".
+ // "*dest" must remain live while this Writer is in use.
+ Writer(WritableFile* dest, uint64_t dest_length);
+
+ Writer(const Writer&) = delete;
+ Writer& operator=(const Writer&) = delete;
+
+ ~Writer();
+
+ Status AddRecord(const Slice& slice);
+
+ private:
+ Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);
+
+ WritableFile* dest_;
+ int block_offset_; // Current offset in block
+
+ // crc32c values for all supported record types. These are
+ // pre-computed to reduce the overhead of computing the crc of the
+ // record type stored in the header.
+ uint32_t type_crc_[kMaxRecordType + 1];
+};
+
+} // namespace log
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_LOG_WRITER_H_
diff --git a/src/leveldb/db/memtable.cc b/src/leveldb/db/memtable.cc
new file mode 100644
index 0000000000..00931d4671
--- /dev/null
+++ b/src/leveldb/db/memtable.cc
@@ -0,0 +1,137 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/memtable.h"
+#include "db/dbformat.h"
+#include "leveldb/comparator.h"
+#include "leveldb/env.h"
+#include "leveldb/iterator.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+static Slice GetLengthPrefixedSlice(const char* data) {
+ uint32_t len;
+ const char* p = data;
+ p = GetVarint32Ptr(p, p + 5, &len); // +5: we assume "p" is not corrupted
+ return Slice(p, len);
+}
+
+MemTable::MemTable(const InternalKeyComparator& comparator)
+ : comparator_(comparator), refs_(0), table_(comparator_, &arena_) {}
+
+MemTable::~MemTable() { assert(refs_ == 0); }
+
+size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); }
+
+int MemTable::KeyComparator::operator()(const char* aptr,
+ const char* bptr) const {
+ // Internal keys are encoded as length-prefixed strings.
+ Slice a = GetLengthPrefixedSlice(aptr);
+ Slice b = GetLengthPrefixedSlice(bptr);
+ return comparator.Compare(a, b);
+}
+
+// Encode a suitable internal key target for "target" and return it.
+// Uses *scratch as scratch space, and the returned pointer will point
+// into this scratch space.
+static const char* EncodeKey(std::string* scratch, const Slice& target) {
+ scratch->clear();
+ PutVarint32(scratch, target.size());
+ scratch->append(target.data(), target.size());
+ return scratch->data();
+}
+
+class MemTableIterator : public Iterator {
+ public:
+ explicit MemTableIterator(MemTable::Table* table) : iter_(table) {}
+
+ MemTableIterator(const MemTableIterator&) = delete;
+ MemTableIterator& operator=(const MemTableIterator&) = delete;
+
+ ~MemTableIterator() override = default;
+
+ bool Valid() const override { return iter_.Valid(); }
+ void Seek(const Slice& k) override { iter_.Seek(EncodeKey(&tmp_, k)); }
+ void SeekToFirst() override { iter_.SeekToFirst(); }
+ void SeekToLast() override { iter_.SeekToLast(); }
+ void Next() override { iter_.Next(); }
+ void Prev() override { iter_.Prev(); }
+ Slice key() const override { return GetLengthPrefixedSlice(iter_.key()); }
+ Slice value() const override {
+ Slice key_slice = GetLengthPrefixedSlice(iter_.key());
+ return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
+ }
+
+ Status status() const override { return Status::OK(); }
+
+ private:
+ MemTable::Table::Iterator iter_;
+ std::string tmp_; // For passing to EncodeKey
+};
+
+Iterator* MemTable::NewIterator() { return new MemTableIterator(&table_); }
+
+void MemTable::Add(SequenceNumber s, ValueType type, const Slice& key,
+ const Slice& value) {
+ // Format of an entry is concatenation of:
+ // key_size : varint32 of internal_key.size()
+ // key bytes : char[internal_key.size()]
+ // value_size : varint32 of value.size()
+ // value bytes : char[value.size()]
+ size_t key_size = key.size();
+ size_t val_size = value.size();
+ size_t internal_key_size = key_size + 8;
+ const size_t encoded_len = VarintLength(internal_key_size) +
+ internal_key_size + VarintLength(val_size) +
+ val_size;
+ char* buf = arena_.Allocate(encoded_len);
+ char* p = EncodeVarint32(buf, internal_key_size);
+ memcpy(p, key.data(), key_size);
+ p += key_size;
+ EncodeFixed64(p, (s << 8) | type);
+ p += 8;
+ p = EncodeVarint32(p, val_size);
+ memcpy(p, value.data(), val_size);
+ assert(p + val_size == buf + encoded_len);
+ table_.Insert(buf);
+}
+
+bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
+ Slice memkey = key.memtable_key();
+ Table::Iterator iter(&table_);
+ iter.Seek(memkey.data());
+ if (iter.Valid()) {
+ // entry format is:
+ // klength varint32
+ // userkey char[klength]
+ // tag uint64
+ // vlength varint32
+ // value char[vlength]
+ // Check that it belongs to same user key. We do not check the
+ // sequence number since the Seek() call above should have skipped
+ // all entries with overly large sequence numbers.
+ const char* entry = iter.key();
+ uint32_t key_length;
+ const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+ if (comparator_.comparator.user_comparator()->Compare(
+ Slice(key_ptr, key_length - 8), key.user_key()) == 0) {
+ // Correct user key
+ const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+ switch (static_cast<ValueType>(tag & 0xff)) {
+ case kTypeValue: {
+ Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+ value->assign(v.data(), v.size());
+ return true;
+ }
+ case kTypeDeletion:
+ *s = Status::NotFound(Slice());
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/memtable.h b/src/leveldb/db/memtable.h
new file mode 100644
index 0000000000..9d986b1070
--- /dev/null
+++ b/src/leveldb/db/memtable.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_MEMTABLE_H_
+#define STORAGE_LEVELDB_DB_MEMTABLE_H_
+
+#include <string>
+
+#include "db/dbformat.h"
+#include "db/skiplist.h"
+#include "leveldb/db.h"
+#include "util/arena.h"
+
+namespace leveldb {
+
+class InternalKeyComparator;
+class MemTableIterator;
+
+class MemTable {
+ public:
+ // MemTables are reference counted. The initial reference count
+ // is zero and the caller must call Ref() at least once.
+ explicit MemTable(const InternalKeyComparator& comparator);
+
+ MemTable(const MemTable&) = delete;
+ MemTable& operator=(const MemTable&) = delete;
+
+ // Increase reference count.
+ void Ref() { ++refs_; }
+
+ // Drop reference count. Delete if no more references exist.
+ void Unref() {
+ --refs_;
+ assert(refs_ >= 0);
+ if (refs_ <= 0) {
+ delete this;
+ }
+ }
+
+ // Returns an estimate of the number of bytes of data in use by this
+ // data structure. It is safe to call when MemTable is being modified.
+ size_t ApproximateMemoryUsage();
+
+ // Return an iterator that yields the contents of the memtable.
+ //
+ // The caller must ensure that the underlying MemTable remains live
+ // while the returned iterator is live. The keys returned by this
+ // iterator are internal keys encoded by AppendInternalKey in the
+ // db/format.{h,cc} module.
+ Iterator* NewIterator();
+
+ // Add an entry into memtable that maps key to value at the
+ // specified sequence number and with the specified type.
+ // Typically value will be empty if type==kTypeDeletion.
+ void Add(SequenceNumber seq, ValueType type, const Slice& key,
+ const Slice& value);
+
+ // If memtable contains a value for key, store it in *value and return true.
+ // If memtable contains a deletion for key, store a NotFound() error
+ // in *status and return true.
+ // Else, return false.
+ bool Get(const LookupKey& key, std::string* value, Status* s);
+
+ private:
+ friend class MemTableIterator;
+ friend class MemTableBackwardIterator;
+
+ struct KeyComparator {
+ const InternalKeyComparator comparator;
+ explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) {}
+ int operator()(const char* a, const char* b) const;
+ };
+
+ typedef SkipList<const char*, KeyComparator> Table;
+
+ ~MemTable(); // Private since only Unref() should be used to delete it
+
+ KeyComparator comparator_;
+ int refs_;
+ Arena arena_;
+ Table table_;
+};
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_MEMTABLE_H_
diff --git a/src/leveldb/db/recovery_test.cc b/src/leveldb/db/recovery_test.cc
new file mode 100644
index 0000000000..547a9591ea
--- /dev/null
+++ b/src/leveldb/db/recovery_test.cc
@@ -0,0 +1,330 @@
+// Copyright (c) 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/write_batch.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+class RecoveryTest {
+ public:
+ RecoveryTest() : env_(Env::Default()), db_(nullptr) {
+ dbname_ = test::TmpDir() + "/recovery_test";
+ DestroyDB(dbname_, Options());
+ Open();
+ }
+
+ ~RecoveryTest() {
+ Close();
+ DestroyDB(dbname_, Options());
+ }
+
+ DBImpl* dbfull() const { return reinterpret_cast<DBImpl*>(db_); }
+ Env* env() const { return env_; }
+
+ bool CanAppend() {
+ WritableFile* tmp;
+ Status s = env_->NewAppendableFile(CurrentFileName(dbname_), &tmp);
+ delete tmp;
+ if (s.IsNotSupportedError()) {
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+ void Close() {
+ delete db_;
+ db_ = nullptr;
+ }
+
+ Status OpenWithStatus(Options* options = nullptr) {
+ Close();
+ Options opts;
+ if (options != nullptr) {
+ opts = *options;
+ } else {
+ opts.reuse_logs = true; // TODO(sanjay): test both ways
+ opts.create_if_missing = true;
+ }
+ if (opts.env == nullptr) {
+ opts.env = env_;
+ }
+ return DB::Open(opts, dbname_, &db_);
+ }
+
+ void Open(Options* options = nullptr) {
+ ASSERT_OK(OpenWithStatus(options));
+ ASSERT_EQ(1, NumLogs());
+ }
+
+ Status Put(const std::string& k, const std::string& v) {
+ return db_->Put(WriteOptions(), k, v);
+ }
+
+ std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+ std::string result;
+ Status s = db_->Get(ReadOptions(), k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ std::string ManifestFileName() {
+ std::string current;
+ ASSERT_OK(ReadFileToString(env_, CurrentFileName(dbname_), &current));
+ size_t len = current.size();
+ if (len > 0 && current[len - 1] == '\n') {
+ current.resize(len - 1);
+ }
+ return dbname_ + "/" + current;
+ }
+
+ std::string LogName(uint64_t number) { return LogFileName(dbname_, number); }
+
+ size_t DeleteLogFiles() {
+ // Linux allows unlinking open files, but Windows does not.
+ // Closing the db allows for file deletion.
+ Close();
+ std::vector<uint64_t> logs = GetFiles(kLogFile);
+ for (size_t i = 0; i < logs.size(); i++) {
+ ASSERT_OK(env_->DeleteFile(LogName(logs[i]))) << LogName(logs[i]);
+ }
+ return logs.size();
+ }
+
+ void DeleteManifestFile() { ASSERT_OK(env_->DeleteFile(ManifestFileName())); }
+
+ uint64_t FirstLogFile() { return GetFiles(kLogFile)[0]; }
+
+ std::vector<uint64_t> GetFiles(FileType t) {
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+ std::vector<uint64_t> result;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(filenames[i], &number, &type) && type == t) {
+ result.push_back(number);
+ }
+ }
+ return result;
+ }
+
+ int NumLogs() { return GetFiles(kLogFile).size(); }
+
+ int NumTables() { return GetFiles(kTableFile).size(); }
+
+ uint64_t FileSize(const std::string& fname) {
+ uint64_t result;
+ ASSERT_OK(env_->GetFileSize(fname, &result)) << fname;
+ return result;
+ }
+
+ void CompactMemTable() { dbfull()->TEST_CompactMemTable(); }
+
+ // Directly construct a log file that sets key to val.
+ void MakeLogFile(uint64_t lognum, SequenceNumber seq, Slice key, Slice val) {
+ std::string fname = LogFileName(dbname_, lognum);
+ WritableFile* file;
+ ASSERT_OK(env_->NewWritableFile(fname, &file));
+ log::Writer writer(file);
+ WriteBatch batch;
+ batch.Put(key, val);
+ WriteBatchInternal::SetSequence(&batch, seq);
+ ASSERT_OK(writer.AddRecord(WriteBatchInternal::Contents(&batch)));
+ ASSERT_OK(file->Flush());
+ delete file;
+ }
+
+ private:
+ std::string dbname_;
+ Env* env_;
+ DB* db_;
+};
+
+TEST(RecoveryTest, ManifestReused) {
+ if (!CanAppend()) {
+ fprintf(stderr, "skipping test because env does not support appending\n");
+ return;
+ }
+ ASSERT_OK(Put("foo", "bar"));
+ Close();
+ std::string old_manifest = ManifestFileName();
+ Open();
+ ASSERT_EQ(old_manifest, ManifestFileName());
+ ASSERT_EQ("bar", Get("foo"));
+ Open();
+ ASSERT_EQ(old_manifest, ManifestFileName());
+ ASSERT_EQ("bar", Get("foo"));
+}
+
+TEST(RecoveryTest, LargeManifestCompacted) {
+ if (!CanAppend()) {
+ fprintf(stderr, "skipping test because env does not support appending\n");
+ return;
+ }
+ ASSERT_OK(Put("foo", "bar"));
+ Close();
+ std::string old_manifest = ManifestFileName();
+
+ // Pad with zeroes to make manifest file very big.
+ {
+ uint64_t len = FileSize(old_manifest);
+ WritableFile* file;
+ ASSERT_OK(env()->NewAppendableFile(old_manifest, &file));
+ std::string zeroes(3 * 1048576 - static_cast<size_t>(len), 0);
+ ASSERT_OK(file->Append(zeroes));
+ ASSERT_OK(file->Flush());
+ delete file;
+ }
+
+ Open();
+ std::string new_manifest = ManifestFileName();
+ ASSERT_NE(old_manifest, new_manifest);
+ ASSERT_GT(10000, FileSize(new_manifest));
+ ASSERT_EQ("bar", Get("foo"));
+
+ Open();
+ ASSERT_EQ(new_manifest, ManifestFileName());
+ ASSERT_EQ("bar", Get("foo"));
+}
+
+TEST(RecoveryTest, NoLogFiles) {
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_EQ(1, DeleteLogFiles());
+ Open();
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ Open();
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+}
+
+TEST(RecoveryTest, LogFileReuse) {
+ if (!CanAppend()) {
+ fprintf(stderr, "skipping test because env does not support appending\n");
+ return;
+ }
+ for (int i = 0; i < 2; i++) {
+ ASSERT_OK(Put("foo", "bar"));
+ if (i == 0) {
+ // Compact to ensure current log is empty
+ CompactMemTable();
+ }
+ Close();
+ ASSERT_EQ(1, NumLogs());
+ uint64_t number = FirstLogFile();
+ if (i == 0) {
+ ASSERT_EQ(0, FileSize(LogName(number)));
+ } else {
+ ASSERT_LT(0, FileSize(LogName(number)));
+ }
+ Open();
+ ASSERT_EQ(1, NumLogs());
+ ASSERT_EQ(number, FirstLogFile()) << "did not reuse log file";
+ ASSERT_EQ("bar", Get("foo"));
+ Open();
+ ASSERT_EQ(1, NumLogs());
+ ASSERT_EQ(number, FirstLogFile()) << "did not reuse log file";
+ ASSERT_EQ("bar", Get("foo"));
+ }
+}
+
+TEST(RecoveryTest, MultipleMemTables) {
+ // Make a large log.
+ const int kNum = 1000;
+ for (int i = 0; i < kNum; i++) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%050d", i);
+ ASSERT_OK(Put(buf, buf));
+ }
+ ASSERT_EQ(0, NumTables());
+ Close();
+ ASSERT_EQ(0, NumTables());
+ ASSERT_EQ(1, NumLogs());
+ uint64_t old_log_file = FirstLogFile();
+
+ // Force creation of multiple memtables by reducing the write buffer size.
+ Options opt;
+ opt.reuse_logs = true;
+ opt.write_buffer_size = (kNum * 100) / 2;
+ Open(&opt);
+ ASSERT_LE(2, NumTables());
+ ASSERT_EQ(1, NumLogs());
+ ASSERT_NE(old_log_file, FirstLogFile()) << "must not reuse log";
+ for (int i = 0; i < kNum; i++) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%050d", i);
+ ASSERT_EQ(buf, Get(buf));
+ }
+}
+
+TEST(RecoveryTest, MultipleLogFiles) {
+ ASSERT_OK(Put("foo", "bar"));
+ Close();
+ ASSERT_EQ(1, NumLogs());
+
+ // Make a bunch of uncompacted log files.
+ uint64_t old_log = FirstLogFile();
+ MakeLogFile(old_log + 1, 1000, "hello", "world");
+ MakeLogFile(old_log + 2, 1001, "hi", "there");
+ MakeLogFile(old_log + 3, 1002, "foo", "bar2");
+
+ // Recover and check that all log files were processed.
+ Open();
+ ASSERT_LE(1, NumTables());
+ ASSERT_EQ(1, NumLogs());
+ uint64_t new_log = FirstLogFile();
+ ASSERT_LE(old_log + 3, new_log);
+ ASSERT_EQ("bar2", Get("foo"));
+ ASSERT_EQ("world", Get("hello"));
+ ASSERT_EQ("there", Get("hi"));
+
+ // Test that previous recovery produced recoverable state.
+ Open();
+ ASSERT_LE(1, NumTables());
+ ASSERT_EQ(1, NumLogs());
+ if (CanAppend()) {
+ ASSERT_EQ(new_log, FirstLogFile());
+ }
+ ASSERT_EQ("bar2", Get("foo"));
+ ASSERT_EQ("world", Get("hello"));
+ ASSERT_EQ("there", Get("hi"));
+
+ // Check that introducing an older log file does not cause it to be re-read.
+ Close();
+ MakeLogFile(old_log + 1, 2000, "hello", "stale write");
+ Open();
+ ASSERT_LE(1, NumTables());
+ ASSERT_EQ(1, NumLogs());
+ if (CanAppend()) {
+ ASSERT_EQ(new_log, FirstLogFile());
+ }
+ ASSERT_EQ("bar2", Get("foo"));
+ ASSERT_EQ("world", Get("hello"));
+ ASSERT_EQ("there", Get("hi"));
+}
+
+TEST(RecoveryTest, ManifestMissing) {
+ ASSERT_OK(Put("foo", "bar"));
+ Close();
+ DeleteManifestFile();
+
+ Status status = OpenWithStatus();
+ ASSERT_TRUE(status.IsCorruption());
+}
+
+} // namespace leveldb
+
+int main(int argc, char** argv) { return leveldb::test::RunAllTests(); }
diff --git a/src/leveldb/db/repair.cc b/src/leveldb/db/repair.cc
new file mode 100644
index 0000000000..04847c3bbf
--- /dev/null
+++ b/src/leveldb/db/repair.cc
@@ -0,0 +1,450 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// We recover the contents of the descriptor from the other files we find.
+// (1) Any log files are first converted to tables
+// (2) We scan every table to compute
+// (a) smallest/largest for the table
+// (b) largest sequence number in the table
+// (3) We generate descriptor contents:
+// - log number is set to zero
+// - next-file-number is set to 1 + largest file number we found
+// - last-sequence-number is set to largest sequence# found across
+// all tables (see 2c)
+// - compaction pointers are cleared
+// - every table file is added at level 0
+//
+// Possible optimization 1:
+// (a) Compute total size and use to pick appropriate max-level M
+// (b) Sort tables by largest sequence# in the table
+// (c) For each table: if it overlaps earlier table, place in level-0,
+// else place in level-M.
+// Possible optimization 2:
+// Store per-table metadata (smallest, largest, largest-seq#, ...)
+// in the table's meta section to speed up ScanTable.
+
+#include "db/builder.h"
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/comparator.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+
+namespace leveldb {
+
+namespace {
+
+class Repairer {
+ public:
+ Repairer(const std::string& dbname, const Options& options)
+ : dbname_(dbname),
+ env_(options.env),
+ icmp_(options.comparator),
+ ipolicy_(options.filter_policy),
+ options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
+ owns_info_log_(options_.info_log != options.info_log),
+ owns_cache_(options_.block_cache != options.block_cache),
+ next_file_number_(1) {
+ // TableCache can be small since we expect each table to be opened once.
+ table_cache_ = new TableCache(dbname_, options_, 10);
+ }
+
+ ~Repairer() {
+ delete table_cache_;
+ if (owns_info_log_) {
+ delete options_.info_log;
+ }
+ if (owns_cache_) {
+ delete options_.block_cache;
+ }
+ }
+
+ Status Run() {
+ Status status = FindFiles();
+ if (status.ok()) {
+ ConvertLogFilesToTables();
+ ExtractMetaData();
+ status = WriteDescriptor();
+ }
+ if (status.ok()) {
+ unsigned long long bytes = 0;
+ for (size_t i = 0; i < tables_.size(); i++) {
+ bytes += tables_[i].meta.file_size;
+ }
+ Log(options_.info_log,
+ "**** Repaired leveldb %s; "
+ "recovered %d files; %llu bytes. "
+ "Some data may have been lost. "
+ "****",
+ dbname_.c_str(), static_cast<int>(tables_.size()), bytes);
+ }
+ return status;
+ }
+
+ private:
+ struct TableInfo {
+ FileMetaData meta;
+ SequenceNumber max_sequence;
+ };
+
+ Status FindFiles() {
+ std::vector<std::string> filenames;
+ Status status = env_->GetChildren(dbname_, &filenames);
+ if (!status.ok()) {
+ return status;
+ }
+ if (filenames.empty()) {
+ return Status::IOError(dbname_, "repair found no files");
+ }
+
+ uint64_t number;
+ FileType type;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type)) {
+ if (type == kDescriptorFile) {
+ manifests_.push_back(filenames[i]);
+ } else {
+ if (number + 1 > next_file_number_) {
+ next_file_number_ = number + 1;
+ }
+ if (type == kLogFile) {
+ logs_.push_back(number);
+ } else if (type == kTableFile) {
+ table_numbers_.push_back(number);
+ } else {
+ // Ignore other files
+ }
+ }
+ }
+ }
+ return status;
+ }
+
+ void ConvertLogFilesToTables() {
+ for (size_t i = 0; i < logs_.size(); i++) {
+ std::string logname = LogFileName(dbname_, logs_[i]);
+ Status status = ConvertLogToTable(logs_[i]);
+ if (!status.ok()) {
+ Log(options_.info_log, "Log #%llu: ignoring conversion error: %s",
+ (unsigned long long)logs_[i], status.ToString().c_str());
+ }
+ ArchiveFile(logname);
+ }
+ }
+
+ Status ConvertLogToTable(uint64_t log) {
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ Logger* info_log;
+ uint64_t lognum;
+ void Corruption(size_t bytes, const Status& s) override {
+ // We print error messages for corruption, but continue repairing.
+ Log(info_log, "Log #%llu: dropping %d bytes; %s",
+ (unsigned long long)lognum, static_cast<int>(bytes),
+ s.ToString().c_str());
+ }
+ };
+
+ // Open the log file
+ std::string logname = LogFileName(dbname_, log);
+ SequentialFile* lfile;
+ Status status = env_->NewSequentialFile(logname, &lfile);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Create the log reader.
+ LogReporter reporter;
+ reporter.env = env_;
+ reporter.info_log = options_.info_log;
+ reporter.lognum = log;
+ // We intentionally make log::Reader do checksumming so that
+ // corruptions cause entire commits to be skipped instead of
+ // propagating bad information (like overly large sequence
+ // numbers).
+ log::Reader reader(lfile, &reporter, false /*do not checksum*/,
+ 0 /*initial_offset*/);
+
+ // Read all the records and add to a memtable
+ std::string scratch;
+ Slice record;
+ WriteBatch batch;
+ MemTable* mem = new MemTable(icmp_);
+ mem->Ref();
+ int counter = 0;
+ while (reader.ReadRecord(&record, &scratch)) {
+ if (record.size() < 12) {
+ reporter.Corruption(record.size(),
+ Status::Corruption("log record too small", logname));
+ continue;
+ }
+ WriteBatchInternal::SetContents(&batch, record);
+ status = WriteBatchInternal::InsertInto(&batch, mem);
+ if (status.ok()) {
+ counter += WriteBatchInternal::Count(&batch);
+ } else {
+ Log(options_.info_log, "Log #%llu: ignoring %s",
+ (unsigned long long)log, status.ToString().c_str());
+ status = Status::OK(); // Keep going with rest of file
+ }
+ }
+ delete lfile;
+
+ // Do not record a version edit for this conversion to a Table
+ // since ExtractMetaData() will also generate edits.
+ FileMetaData meta;
+ meta.number = next_file_number_++;
+ Iterator* iter = mem->NewIterator();
+ status = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
+ delete iter;
+ mem->Unref();
+ mem = nullptr;
+ if (status.ok()) {
+ if (meta.file_size > 0) {
+ table_numbers_.push_back(meta.number);
+ }
+ }
+ Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
+ (unsigned long long)log, counter, (unsigned long long)meta.number,
+ status.ToString().c_str());
+ return status;
+ }
+
+ void ExtractMetaData() {
+ for (size_t i = 0; i < table_numbers_.size(); i++) {
+ ScanTable(table_numbers_[i]);
+ }
+ }
+
+ Iterator* NewTableIterator(const FileMetaData& meta) {
+ // Same as compaction iterators: if paranoid_checks are on, turn
+ // on checksum verification.
+ ReadOptions r;
+ r.verify_checksums = options_.paranoid_checks;
+ return table_cache_->NewIterator(r, meta.number, meta.file_size);
+ }
+
+ void ScanTable(uint64_t number) {
+ TableInfo t;
+ t.meta.number = number;
+ std::string fname = TableFileName(dbname_, number);
+ Status status = env_->GetFileSize(fname, &t.meta.file_size);
+ if (!status.ok()) {
+ // Try alternate file name.
+ fname = SSTTableFileName(dbname_, number);
+ Status s2 = env_->GetFileSize(fname, &t.meta.file_size);
+ if (s2.ok()) {
+ status = Status::OK();
+ }
+ }
+ if (!status.ok()) {
+ ArchiveFile(TableFileName(dbname_, number));
+ ArchiveFile(SSTTableFileName(dbname_, number));
+ Log(options_.info_log, "Table #%llu: dropped: %s",
+ (unsigned long long)t.meta.number, status.ToString().c_str());
+ return;
+ }
+
+ // Extract metadata by scanning through table.
+ int counter = 0;
+ Iterator* iter = NewTableIterator(t.meta);
+ bool empty = true;
+ ParsedInternalKey parsed;
+ t.max_sequence = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ Slice key = iter->key();
+ if (!ParseInternalKey(key, &parsed)) {
+ Log(options_.info_log, "Table #%llu: unparsable key %s",
+ (unsigned long long)t.meta.number, EscapeString(key).c_str());
+ continue;
+ }
+
+ counter++;
+ if (empty) {
+ empty = false;
+ t.meta.smallest.DecodeFrom(key);
+ }
+ t.meta.largest.DecodeFrom(key);
+ if (parsed.sequence > t.max_sequence) {
+ t.max_sequence = parsed.sequence;
+ }
+ }
+ if (!iter->status().ok()) {
+ status = iter->status();
+ }
+ delete iter;
+ Log(options_.info_log, "Table #%llu: %d entries %s",
+ (unsigned long long)t.meta.number, counter, status.ToString().c_str());
+
+ if (status.ok()) {
+ tables_.push_back(t);
+ } else {
+ RepairTable(fname, t); // RepairTable archives input file.
+ }
+ }
+
+ void RepairTable(const std::string& src, TableInfo t) {
+ // We will copy src contents to a new table and then rename the
+ // new table over the source.
+
+ // Create builder.
+ std::string copy = TableFileName(dbname_, next_file_number_++);
+ WritableFile* file;
+ Status s = env_->NewWritableFile(copy, &file);
+ if (!s.ok()) {
+ return;
+ }
+ TableBuilder* builder = new TableBuilder(options_, file);
+
+ // Copy data.
+ Iterator* iter = NewTableIterator(t.meta);
+ int counter = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ builder->Add(iter->key(), iter->value());
+ counter++;
+ }
+ delete iter;
+
+ ArchiveFile(src);
+ if (counter == 0) {
+ builder->Abandon(); // Nothing to save
+ } else {
+ s = builder->Finish();
+ if (s.ok()) {
+ t.meta.file_size = builder->FileSize();
+ }
+ }
+ delete builder;
+ builder = nullptr;
+
+ if (s.ok()) {
+ s = file->Close();
+ }
+ delete file;
+ file = nullptr;
+
+ if (counter > 0 && s.ok()) {
+ std::string orig = TableFileName(dbname_, t.meta.number);
+ s = env_->RenameFile(copy, orig);
+ if (s.ok()) {
+ Log(options_.info_log, "Table #%llu: %d entries repaired",
+ (unsigned long long)t.meta.number, counter);
+ tables_.push_back(t);
+ }
+ }
+ if (!s.ok()) {
+ env_->DeleteFile(copy);
+ }
+ }
+
+ Status WriteDescriptor() {
+ std::string tmp = TempFileName(dbname_, 1);
+ WritableFile* file;
+ Status status = env_->NewWritableFile(tmp, &file);
+ if (!status.ok()) {
+ return status;
+ }
+
+ SequenceNumber max_sequence = 0;
+ for (size_t i = 0; i < tables_.size(); i++) {
+ if (max_sequence < tables_[i].max_sequence) {
+ max_sequence = tables_[i].max_sequence;
+ }
+ }
+
+ edit_.SetComparatorName(icmp_.user_comparator()->Name());
+ edit_.SetLogNumber(0);
+ edit_.SetNextFile(next_file_number_);
+ edit_.SetLastSequence(max_sequence);
+
+ for (size_t i = 0; i < tables_.size(); i++) {
+ // TODO(opt): separate out into multiple levels
+ const TableInfo& t = tables_[i];
+ edit_.AddFile(0, t.meta.number, t.meta.file_size, t.meta.smallest,
+ t.meta.largest);
+ }
+
+ // fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
+ {
+ log::Writer log(file);
+ std::string record;
+ edit_.EncodeTo(&record);
+ status = log.AddRecord(record);
+ }
+ if (status.ok()) {
+ status = file->Close();
+ }
+ delete file;
+ file = nullptr;
+
+ if (!status.ok()) {
+ env_->DeleteFile(tmp);
+ } else {
+ // Discard older manifests
+ for (size_t i = 0; i < manifests_.size(); i++) {
+ ArchiveFile(dbname_ + "/" + manifests_[i]);
+ }
+
+ // Install new manifest
+ status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
+ if (status.ok()) {
+ status = SetCurrentFile(env_, dbname_, 1);
+ } else {
+ env_->DeleteFile(tmp);
+ }
+ }
+ return status;
+ }
+
+ void ArchiveFile(const std::string& fname) {
+ // Move into another directory. E.g., for
+ // dir/foo
+ // rename to
+ // dir/lost/foo
+ const char* slash = strrchr(fname.c_str(), '/');
+ std::string new_dir;
+ if (slash != nullptr) {
+ new_dir.assign(fname.data(), slash - fname.data());
+ }
+ new_dir.append("/lost");
+ env_->CreateDir(new_dir); // Ignore error
+ std::string new_file = new_dir;
+ new_file.append("/");
+ new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
+ Status s = env_->RenameFile(fname, new_file);
+ Log(options_.info_log, "Archiving %s: %s\n", fname.c_str(),
+ s.ToString().c_str());
+ }
+
+ const std::string dbname_;
+ Env* const env_;
+ InternalKeyComparator const icmp_;
+ InternalFilterPolicy const ipolicy_;
+ const Options options_;
+ bool owns_info_log_;
+ bool owns_cache_;
+ TableCache* table_cache_;
+ VersionEdit edit_;
+
+ std::vector<std::string> manifests_;
+ std::vector<uint64_t> table_numbers_;
+ std::vector<uint64_t> logs_;
+ std::vector<TableInfo> tables_;
+ uint64_t next_file_number_;
+};
+} // namespace
+
+Status RepairDB(const std::string& dbname, const Options& options) {
+ Repairer repairer(dbname, options);
+ return repairer.Run();
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/skiplist.h b/src/leveldb/db/skiplist.h
new file mode 100644
index 0000000000..a59b45b380
--- /dev/null
+++ b/src/leveldb/db/skiplist.h
@@ -0,0 +1,382 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_SKIPLIST_H_
+#define STORAGE_LEVELDB_DB_SKIPLIST_H_
+
+// Thread safety
+// -------------
+//
+// Writes require external synchronization, most likely a mutex.
+// Reads require a guarantee that the SkipList will not be destroyed
+// while the read is in progress. Apart from that, reads progress
+// without any internal locking or synchronization.
+//
+// Invariants:
+//
+// (1) Allocated nodes are never deleted until the SkipList is
+// destroyed. This is trivially guaranteed by the code since we
+// never delete any skip list nodes.
+//
+// (2) The contents of a Node except for the next/prev pointers are
+// immutable after the Node has been linked into the SkipList.
+// Only Insert() modifies the list, and it is careful to initialize
+// a node and use release-stores to publish the nodes in one or
+// more lists.
+//
+// ... prev vs. next pointer ordering ...
+
+#include <atomic>
+#include <cassert>
+#include <cstdlib>
+
+#include "util/arena.h"
+#include "util/random.h"
+
+namespace leveldb {
+
+class Arena;
+
+template <typename Key, class Comparator>
+class SkipList {
+ private:
+ struct Node;
+
+ public:
+ // Create a new SkipList object that will use "cmp" for comparing keys,
+ // and will allocate memory using "*arena". Objects allocated in the arena
+ // must remain allocated for the lifetime of the skiplist object.
+ explicit SkipList(Comparator cmp, Arena* arena);
+
+ SkipList(const SkipList&) = delete;
+ SkipList& operator=(const SkipList&) = delete;
+
+ // Insert key into the list.
+ // REQUIRES: nothing that compares equal to key is currently in the list.
+ void Insert(const Key& key);
+
+ // Returns true iff an entry that compares equal to key is in the list.
+ bool Contains(const Key& key) const;
+
+ // Iteration over the contents of a skip list
+ class Iterator {
+ public:
+ // Initialize an iterator over the specified list.
+ // The returned iterator is not valid.
+ explicit Iterator(const SkipList* list);
+
+ // Returns true iff the iterator is positioned at a valid node.
+ bool Valid() const;
+
+ // Returns the key at the current position.
+ // REQUIRES: Valid()
+ const Key& key() const;
+
+ // Advances to the next position.
+ // REQUIRES: Valid()
+ void Next();
+
+ // Advances to the previous position.
+ // REQUIRES: Valid()
+ void Prev();
+
+ // Advance to the first entry with a key >= target
+ void Seek(const Key& target);
+
+ // Position at the first entry in list.
+ // Final state of iterator is Valid() iff list is not empty.
+ void SeekToFirst();
+
+ // Position at the last entry in list.
+ // Final state of iterator is Valid() iff list is not empty.
+ void SeekToLast();
+
+ private:
+ const SkipList* list_;
+ Node* node_;
+ // Intentionally copyable
+ };
+
+ private:
+ enum { kMaxHeight = 12 };
+
+ inline int GetMaxHeight() const {
+ return max_height_.load(std::memory_order_relaxed);
+ }
+
+ Node* NewNode(const Key& key, int height);
+ int RandomHeight();
+ bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
+
+ // Return true if key is greater than the data stored in "n"
+ bool KeyIsAfterNode(const Key& key, Node* n) const;
+
+ // Return the earliest node that comes at or after key.
+ // Return nullptr if there is no such node.
+ //
+ // If prev is non-null, fills prev[level] with pointer to previous
+ // node at "level" for every level in [0..max_height_-1].
+ Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
+
+ // Return the latest node with a key < key.
+ // Return head_ if there is no such node.
+ Node* FindLessThan(const Key& key) const;
+
+ // Return the last node in the list.
+ // Return head_ if list is empty.
+ Node* FindLast() const;
+
+ // Immutable after construction
+ Comparator const compare_;
+ Arena* const arena_; // Arena used for allocations of nodes
+
+ Node* const head_;
+
+ // Modified only by Insert(). Read racily by readers, but stale
+ // values are ok.
+ std::atomic<int> max_height_; // Height of the entire list
+
+ // Read/written only by Insert().
+ Random rnd_;
+};
+
+// Implementation details follow
+template <typename Key, class Comparator>
+struct SkipList<Key, Comparator>::Node {
+ explicit Node(const Key& k) : key(k) {}
+
+ Key const key;
+
+ // Accessors/mutators for links. Wrapped in methods so we can
+ // add the appropriate barriers as necessary.
+ Node* Next(int n) {
+ assert(n >= 0);
+ // Use an 'acquire load' so that we observe a fully initialized
+ // version of the returned Node.
+ return next_[n].load(std::memory_order_acquire);
+ }
+ void SetNext(int n, Node* x) {
+ assert(n >= 0);
+ // Use a 'release store' so that anybody who reads through this
+ // pointer observes a fully initialized version of the inserted node.
+ next_[n].store(x, std::memory_order_release);
+ }
+
+ // No-barrier variants that can be safely used in a few locations.
+ Node* NoBarrier_Next(int n) {
+ assert(n >= 0);
+ return next_[n].load(std::memory_order_relaxed);
+ }
+ void NoBarrier_SetNext(int n, Node* x) {
+ assert(n >= 0);
+ next_[n].store(x, std::memory_order_relaxed);
+ }
+
+ private:
+ // Array of length equal to the node height. next_[0] is lowest level link.
+ std::atomic<Node*> next_[1];
+};
+
+template <typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::NewNode(
+ const Key& key, int height) {
+ char* const node_memory = arena_->AllocateAligned(
+ sizeof(Node) + sizeof(std::atomic<Node*>) * (height - 1));
+ return new (node_memory) Node(key);
+}
+
+template <typename Key, class Comparator>
+inline SkipList<Key, Comparator>::Iterator::Iterator(const SkipList* list) {
+ list_ = list;
+ node_ = nullptr;
+}
+
+template <typename Key, class Comparator>
+inline bool SkipList<Key, Comparator>::Iterator::Valid() const {
+ return node_ != nullptr;
+}
+
+template <typename Key, class Comparator>
+inline const Key& SkipList<Key, Comparator>::Iterator::key() const {
+ assert(Valid());
+ return node_->key;
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::Next() {
+ assert(Valid());
+ node_ = node_->Next(0);
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::Prev() {
+ // Instead of using explicit "prev" links, we just search for the
+ // last node that falls before key.
+ assert(Valid());
+ node_ = list_->FindLessThan(node_->key);
+ if (node_ == list_->head_) {
+ node_ = nullptr;
+ }
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::Seek(const Key& target) {
+ node_ = list_->FindGreaterOrEqual(target, nullptr);
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::SeekToFirst() {
+ node_ = list_->head_->Next(0);
+}
+
+template <typename Key, class Comparator>
+inline void SkipList<Key, Comparator>::Iterator::SeekToLast() {
+ node_ = list_->FindLast();
+ if (node_ == list_->head_) {
+ node_ = nullptr;
+ }
+}
+
+template <typename Key, class Comparator>
+int SkipList<Key, Comparator>::RandomHeight() {
+ // Increase height with probability 1 in kBranching
+ static const unsigned int kBranching = 4;
+ int height = 1;
+ while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) {
+ height++;
+ }
+ assert(height > 0);
+ assert(height <= kMaxHeight);
+ return height;
+}
+
+template <typename Key, class Comparator>
+bool SkipList<Key, Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
+ // null n is considered infinite
+ return (n != nullptr) && (compare_(n->key, key) < 0);
+}
+
+template <typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node*
+SkipList<Key, Comparator>::FindGreaterOrEqual(const Key& key,
+ Node** prev) const {
+ Node* x = head_;
+ int level = GetMaxHeight() - 1;
+ while (true) {
+ Node* next = x->Next(level);
+ if (KeyIsAfterNode(key, next)) {
+ // Keep searching in this list
+ x = next;
+ } else {
+ if (prev != nullptr) prev[level] = x;
+ if (level == 0) {
+ return next;
+ } else {
+ // Switch to next list
+ level--;
+ }
+ }
+ }
+}
+
+template <typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node*
+SkipList<Key, Comparator>::FindLessThan(const Key& key) const {
+ Node* x = head_;
+ int level = GetMaxHeight() - 1;
+ while (true) {
+ assert(x == head_ || compare_(x->key, key) < 0);
+ Node* next = x->Next(level);
+ if (next == nullptr || compare_(next->key, key) >= 0) {
+ if (level == 0) {
+ return x;
+ } else {
+ // Switch to next list
+ level--;
+ }
+ } else {
+ x = next;
+ }
+ }
+}
+
+template <typename Key, class Comparator>
+typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::FindLast()
+ const {
+ Node* x = head_;
+ int level = GetMaxHeight() - 1;
+ while (true) {
+ Node* next = x->Next(level);
+ if (next == nullptr) {
+ if (level == 0) {
+ return x;
+ } else {
+ // Switch to next list
+ level--;
+ }
+ } else {
+ x = next;
+ }
+ }
+}
+
+template <typename Key, class Comparator>
+SkipList<Key, Comparator>::SkipList(Comparator cmp, Arena* arena)
+ : compare_(cmp),
+ arena_(arena),
+ head_(NewNode(0 /* any key will do */, kMaxHeight)),
+ max_height_(1),
+ rnd_(0xdeadbeef) {
+ for (int i = 0; i < kMaxHeight; i++) {
+ head_->SetNext(i, nullptr);
+ }
+}
+
+template <typename Key, class Comparator>
+void SkipList<Key, Comparator>::Insert(const Key& key) {
+ // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
+ // here since Insert() is externally synchronized.
+ Node* prev[kMaxHeight];
+ Node* x = FindGreaterOrEqual(key, prev);
+
+ // Our data structure does not allow duplicate insertion
+ assert(x == nullptr || !Equal(key, x->key));
+
+ int height = RandomHeight();
+ if (height > GetMaxHeight()) {
+ for (int i = GetMaxHeight(); i < height; i++) {
+ prev[i] = head_;
+ }
+ // It is ok to mutate max_height_ without any synchronization
+ // with concurrent readers. A concurrent reader that observes
+ // the new value of max_height_ will see either the old value of
+ // new level pointers from head_ (nullptr), or a new value set in
+ // the loop below. In the former case the reader will
+ // immediately drop to the next level since nullptr sorts after all
+ // keys. In the latter case the reader will use the new node.
+ max_height_.store(height, std::memory_order_relaxed);
+ }
+
+ x = NewNode(key, height);
+ for (int i = 0; i < height; i++) {
+ // NoBarrier_SetNext() suffices since we will add a barrier when
+ // we publish a pointer to "x" in prev[i].
+ x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
+ prev[i]->SetNext(i, x);
+ }
+}
+
+template <typename Key, class Comparator>
+bool SkipList<Key, Comparator>::Contains(const Key& key) const {
+ Node* x = FindGreaterOrEqual(key, nullptr);
+ if (x != nullptr && Equal(key, x->key)) {
+ return true;
+ } else {
+ return false;
+ }
+}
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_SKIPLIST_H_
diff --git a/src/leveldb/db/skiplist_test.cc b/src/leveldb/db/skiplist_test.cc
new file mode 100644
index 0000000000..9fa2d96829
--- /dev/null
+++ b/src/leveldb/db/skiplist_test.cc
@@ -0,0 +1,369 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/skiplist.h"
+
+#include <atomic>
+#include <set>
+
+#include "leveldb/env.h"
+#include "port/port.h"
+#include "port/thread_annotations.h"
+#include "util/arena.h"
+#include "util/hash.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+typedef uint64_t Key;
+
+struct Comparator {
+ int operator()(const Key& a, const Key& b) const {
+ if (a < b) {
+ return -1;
+ } else if (a > b) {
+ return +1;
+ } else {
+ return 0;
+ }
+ }
+};
+
+class SkipTest {};
+
+TEST(SkipTest, Empty) {
+ Arena arena;
+ Comparator cmp;
+ SkipList<Key, Comparator> list(cmp, &arena);
+ ASSERT_TRUE(!list.Contains(10));
+
+ SkipList<Key, Comparator>::Iterator iter(&list);
+ ASSERT_TRUE(!iter.Valid());
+ iter.SeekToFirst();
+ ASSERT_TRUE(!iter.Valid());
+ iter.Seek(100);
+ ASSERT_TRUE(!iter.Valid());
+ iter.SeekToLast();
+ ASSERT_TRUE(!iter.Valid());
+}
+
+TEST(SkipTest, InsertAndLookup) {
+ const int N = 2000;
+ const int R = 5000;
+ Random rnd(1000);
+ std::set<Key> keys;
+ Arena arena;
+ Comparator cmp;
+ SkipList<Key, Comparator> list(cmp, &arena);
+ for (int i = 0; i < N; i++) {
+ Key key = rnd.Next() % R;
+ if (keys.insert(key).second) {
+ list.Insert(key);
+ }
+ }
+
+ for (int i = 0; i < R; i++) {
+ if (list.Contains(i)) {
+ ASSERT_EQ(keys.count(i), 1);
+ } else {
+ ASSERT_EQ(keys.count(i), 0);
+ }
+ }
+
+ // Simple iterator tests
+ {
+ SkipList<Key, Comparator>::Iterator iter(&list);
+ ASSERT_TRUE(!iter.Valid());
+
+ iter.Seek(0);
+ ASSERT_TRUE(iter.Valid());
+ ASSERT_EQ(*(keys.begin()), iter.key());
+
+ iter.SeekToFirst();
+ ASSERT_TRUE(iter.Valid());
+ ASSERT_EQ(*(keys.begin()), iter.key());
+
+ iter.SeekToLast();
+ ASSERT_TRUE(iter.Valid());
+ ASSERT_EQ(*(keys.rbegin()), iter.key());
+ }
+
+ // Forward iteration test
+ for (int i = 0; i < R; i++) {
+ SkipList<Key, Comparator>::Iterator iter(&list);
+ iter.Seek(i);
+
+ // Compare against model iterator
+ std::set<Key>::iterator model_iter = keys.lower_bound(i);
+ for (int j = 0; j < 3; j++) {
+ if (model_iter == keys.end()) {
+ ASSERT_TRUE(!iter.Valid());
+ break;
+ } else {
+ ASSERT_TRUE(iter.Valid());
+ ASSERT_EQ(*model_iter, iter.key());
+ ++model_iter;
+ iter.Next();
+ }
+ }
+ }
+
+ // Backward iteration test
+ {
+ SkipList<Key, Comparator>::Iterator iter(&list);
+ iter.SeekToLast();
+
+ // Compare against model iterator
+ for (std::set<Key>::reverse_iterator model_iter = keys.rbegin();
+ model_iter != keys.rend(); ++model_iter) {
+ ASSERT_TRUE(iter.Valid());
+ ASSERT_EQ(*model_iter, iter.key());
+ iter.Prev();
+ }
+ ASSERT_TRUE(!iter.Valid());
+ }
+}
+
+// We want to make sure that with a single writer and multiple
+// concurrent readers (with no synchronization other than when a
+// reader's iterator is created), the reader always observes all the
+// data that was present in the skip list when the iterator was
+// constructed. Because insertions are happening concurrently, we may
+// also observe new values that were inserted since the iterator was
+// constructed, but we should never miss any values that were present
+// at iterator construction time.
+//
+// We generate multi-part keys:
+// <key,gen,hash>
+// where:
+// key is in range [0..K-1]
+// gen is a generation number for key
+// hash is hash(key,gen)
+//
+// The insertion code picks a random key, sets gen to be 1 + the last
+// generation number inserted for that key, and sets hash to Hash(key,gen).
+//
+// At the beginning of a read, we snapshot the last inserted
+// generation number for each key. We then iterate, including random
+// calls to Next() and Seek(). For every key we encounter, we
+// check that it is either expected given the initial snapshot or has
+// been concurrently added since the iterator started.
+class ConcurrentTest {
+ private:
+ static const uint32_t K = 4;
+
+ static uint64_t key(Key key) { return (key >> 40); }
+ static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
+ static uint64_t hash(Key key) { return key & 0xff; }
+
+ static uint64_t HashNumbers(uint64_t k, uint64_t g) {
+ uint64_t data[2] = {k, g};
+ return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
+ }
+
+ static Key MakeKey(uint64_t k, uint64_t g) {
+ static_assert(sizeof(Key) == sizeof(uint64_t), "");
+ assert(k <= K); // We sometimes pass K to seek to the end of the skiplist
+ assert(g <= 0xffffffffu);
+ return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
+ }
+
+ static bool IsValidKey(Key k) {
+ return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
+ }
+
+ static Key RandomTarget(Random* rnd) {
+ switch (rnd->Next() % 10) {
+ case 0:
+ // Seek to beginning
+ return MakeKey(0, 0);
+ case 1:
+ // Seek to end
+ return MakeKey(K, 0);
+ default:
+ // Seek to middle
+ return MakeKey(rnd->Next() % K, 0);
+ }
+ }
+
+ // Per-key generation
+ struct State {
+ std::atomic<int> generation[K];
+ void Set(int k, int v) {
+ generation[k].store(v, std::memory_order_release);
+ }
+ int Get(int k) { return generation[k].load(std::memory_order_acquire); }
+
+ State() {
+ for (int k = 0; k < K; k++) {
+ Set(k, 0);
+ }
+ }
+ };
+
+ // Current state of the test
+ State current_;
+
+ Arena arena_;
+
+ // SkipList is not protected by mu_. We just use a single writer
+ // thread to modify it.
+ SkipList<Key, Comparator> list_;
+
+ public:
+ ConcurrentTest() : list_(Comparator(), &arena_) {}
+
+ // REQUIRES: External synchronization
+ void WriteStep(Random* rnd) {
+ const uint32_t k = rnd->Next() % K;
+ const intptr_t g = current_.Get(k) + 1;
+ const Key key = MakeKey(k, g);
+ list_.Insert(key);
+ current_.Set(k, g);
+ }
+
+ void ReadStep(Random* rnd) {
+ // Remember the initial committed state of the skiplist.
+ State initial_state;
+ for (int k = 0; k < K; k++) {
+ initial_state.Set(k, current_.Get(k));
+ }
+
+ Key pos = RandomTarget(rnd);
+ SkipList<Key, Comparator>::Iterator iter(&list_);
+ iter.Seek(pos);
+ while (true) {
+ Key current;
+ if (!iter.Valid()) {
+ current = MakeKey(K, 0);
+ } else {
+ current = iter.key();
+ ASSERT_TRUE(IsValidKey(current)) << current;
+ }
+ ASSERT_LE(pos, current) << "should not go backwards";
+
+ // Verify that everything in [pos,current) was not present in
+ // initial_state.
+ while (pos < current) {
+ ASSERT_LT(key(pos), K) << pos;
+
+ // Note that generation 0 is never inserted, so it is ok if
+ // <*,0,*> is missing.
+ ASSERT_TRUE((gen(pos) == 0) ||
+ (gen(pos) > static_cast<Key>(initial_state.Get(key(pos)))))
+ << "key: " << key(pos) << "; gen: " << gen(pos)
+ << "; initgen: " << initial_state.Get(key(pos));
+
+ // Advance to next key in the valid key space
+ if (key(pos) < key(current)) {
+ pos = MakeKey(key(pos) + 1, 0);
+ } else {
+ pos = MakeKey(key(pos), gen(pos) + 1);
+ }
+ }
+
+ if (!iter.Valid()) {
+ break;
+ }
+
+ if (rnd->Next() % 2) {
+ iter.Next();
+ pos = MakeKey(key(pos), gen(pos) + 1);
+ } else {
+ Key new_target = RandomTarget(rnd);
+ if (new_target > pos) {
+ pos = new_target;
+ iter.Seek(new_target);
+ }
+ }
+ }
+ }
+};
+const uint32_t ConcurrentTest::K;
+
+// Simple test that does single-threaded testing of the ConcurrentTest
+// scaffolding.
+TEST(SkipTest, ConcurrentWithoutThreads) {
+ ConcurrentTest test;
+ Random rnd(test::RandomSeed());
+ for (int i = 0; i < 10000; i++) {
+ test.ReadStep(&rnd);
+ test.WriteStep(&rnd);
+ }
+}
+
+class TestState {
+ public:
+ ConcurrentTest t_;
+ int seed_;
+ std::atomic<bool> quit_flag_;
+
+ enum ReaderState { STARTING, RUNNING, DONE };
+
+ explicit TestState(int s)
+ : seed_(s), quit_flag_(false), state_(STARTING), state_cv_(&mu_) {}
+
+ void Wait(ReaderState s) LOCKS_EXCLUDED(mu_) {
+ mu_.Lock();
+ while (state_ != s) {
+ state_cv_.Wait();
+ }
+ mu_.Unlock();
+ }
+
+ void Change(ReaderState s) LOCKS_EXCLUDED(mu_) {
+ mu_.Lock();
+ state_ = s;
+ state_cv_.Signal();
+ mu_.Unlock();
+ }
+
+ private:
+ port::Mutex mu_;
+ ReaderState state_ GUARDED_BY(mu_);
+ port::CondVar state_cv_ GUARDED_BY(mu_);
+};
+
+static void ConcurrentReader(void* arg) {
+ TestState* state = reinterpret_cast<TestState*>(arg);
+ Random rnd(state->seed_);
+ int64_t reads = 0;
+ state->Change(TestState::RUNNING);
+ while (!state->quit_flag_.load(std::memory_order_acquire)) {
+ state->t_.ReadStep(&rnd);
+ ++reads;
+ }
+ state->Change(TestState::DONE);
+}
+
+static void RunConcurrent(int run) {
+ const int seed = test::RandomSeed() + (run * 100);
+ Random rnd(seed);
+ const int N = 1000;
+ const int kSize = 1000;
+ for (int i = 0; i < N; i++) {
+ if ((i % 100) == 0) {
+ fprintf(stderr, "Run %d of %d\n", i, N);
+ }
+ TestState state(seed + 1);
+ Env::Default()->Schedule(ConcurrentReader, &state);
+ state.Wait(TestState::RUNNING);
+ for (int i = 0; i < kSize; i++) {
+ state.t_.WriteStep(&rnd);
+ }
+ state.quit_flag_.store(true, std::memory_order_release);
+ state.Wait(TestState::DONE);
+ }
+}
+
+TEST(SkipTest, Concurrent1) { RunConcurrent(1); }
+TEST(SkipTest, Concurrent2) { RunConcurrent(2); }
+TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
+TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
+TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
+
+} // namespace leveldb
+
+int main(int argc, char** argv) { return leveldb::test::RunAllTests(); }
diff --git a/src/leveldb/db/snapshot.h b/src/leveldb/db/snapshot.h
new file mode 100644
index 0000000000..9f1d66491d
--- /dev/null
+++ b/src/leveldb/db/snapshot.h
@@ -0,0 +1,95 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_
+#define STORAGE_LEVELDB_DB_SNAPSHOT_H_
+
+#include "db/dbformat.h"
+#include "leveldb/db.h"
+
+namespace leveldb {
+
+class SnapshotList;
+
+// Snapshots are kept in a doubly-linked list in the DB.
+// Each SnapshotImpl corresponds to a particular sequence number.
+class SnapshotImpl : public Snapshot {
+ public:
+ SnapshotImpl(SequenceNumber sequence_number)
+ : sequence_number_(sequence_number) {}
+
+ SequenceNumber sequence_number() const { return sequence_number_; }
+
+ private:
+ friend class SnapshotList;
+
+ // SnapshotImpl is kept in a doubly-linked circular list. The SnapshotList
+ // implementation operates on the next/previous fields direcly.
+ SnapshotImpl* prev_;
+ SnapshotImpl* next_;
+
+ const SequenceNumber sequence_number_;
+
+#if !defined(NDEBUG)
+ SnapshotList* list_ = nullptr;
+#endif // !defined(NDEBUG)
+};
+
+class SnapshotList {
+ public:
+ SnapshotList() : head_(0) {
+ head_.prev_ = &head_;
+ head_.next_ = &head_;
+ }
+
+ bool empty() const { return head_.next_ == &head_; }
+ SnapshotImpl* oldest() const {
+ assert(!empty());
+ return head_.next_;
+ }
+ SnapshotImpl* newest() const {
+ assert(!empty());
+ return head_.prev_;
+ }
+
+ // Creates a SnapshotImpl and appends it to the end of the list.
+ SnapshotImpl* New(SequenceNumber sequence_number) {
+ assert(empty() || newest()->sequence_number_ <= sequence_number);
+
+ SnapshotImpl* snapshot = new SnapshotImpl(sequence_number);
+
+#if !defined(NDEBUG)
+ snapshot->list_ = this;
+#endif // !defined(NDEBUG)
+ snapshot->next_ = &head_;
+ snapshot->prev_ = head_.prev_;
+ snapshot->prev_->next_ = snapshot;
+ snapshot->next_->prev_ = snapshot;
+ return snapshot;
+ }
+
+ // Removes a SnapshotImpl from this list.
+ //
+ // The snapshot must have been created by calling New() on this list.
+ //
+ // The snapshot pointer should not be const, because its memory is
+ // deallocated. However, that would force us to change DB::ReleaseSnapshot(),
+ // which is in the API, and currently takes a const Snapshot.
+ void Delete(const SnapshotImpl* snapshot) {
+#if !defined(NDEBUG)
+ assert(snapshot->list_ == this);
+#endif // !defined(NDEBUG)
+ snapshot->prev_->next_ = snapshot->next_;
+ snapshot->next_->prev_ = snapshot->prev_;
+ delete snapshot;
+ }
+
+ private:
+ // Dummy head of doubly-linked list of snapshots
+ SnapshotImpl head_;
+};
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_SNAPSHOT_H_
diff --git a/src/leveldb/db/table_cache.cc b/src/leveldb/db/table_cache.cc
new file mode 100644
index 0000000000..73f05fd7b1
--- /dev/null
+++ b/src/leveldb/db/table_cache.cc
@@ -0,0 +1,120 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/table_cache.h"
+
+#include "db/filename.h"
+#include "leveldb/env.h"
+#include "leveldb/table.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+struct TableAndFile {
+ RandomAccessFile* file;
+ Table* table;
+};
+
+static void DeleteEntry(const Slice& key, void* value) {
+ TableAndFile* tf = reinterpret_cast<TableAndFile*>(value);
+ delete tf->table;
+ delete tf->file;
+ delete tf;
+}
+
+static void UnrefEntry(void* arg1, void* arg2) {
+ Cache* cache = reinterpret_cast<Cache*>(arg1);
+ Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
+ cache->Release(h);
+}
+
+TableCache::TableCache(const std::string& dbname, const Options& options,
+ int entries)
+ : env_(options.env),
+ dbname_(dbname),
+ options_(options),
+ cache_(NewLRUCache(entries)) {}
+
+TableCache::~TableCache() { delete cache_; }
+
+Status TableCache::FindTable(uint64_t file_number, uint64_t file_size,
+ Cache::Handle** handle) {
+ Status s;
+ char buf[sizeof(file_number)];
+ EncodeFixed64(buf, file_number);
+ Slice key(buf, sizeof(buf));
+ *handle = cache_->Lookup(key);
+ if (*handle == nullptr) {
+ std::string fname = TableFileName(dbname_, file_number);
+ RandomAccessFile* file = nullptr;
+ Table* table = nullptr;
+ s = env_->NewRandomAccessFile(fname, &file);
+ if (!s.ok()) {
+ std::string old_fname = SSTTableFileName(dbname_, file_number);
+ if (env_->NewRandomAccessFile(old_fname, &file).ok()) {
+ s = Status::OK();
+ }
+ }
+ if (s.ok()) {
+ s = Table::Open(options_, file, file_size, &table);
+ }
+
+ if (!s.ok()) {
+ assert(table == nullptr);
+ delete file;
+ // We do not cache error results so that if the error is transient,
+ // or somebody repairs the file, we recover automatically.
+ } else {
+ TableAndFile* tf = new TableAndFile;
+ tf->file = file;
+ tf->table = table;
+ *handle = cache_->Insert(key, tf, 1, &DeleteEntry);
+ }
+ }
+ return s;
+}
+
+Iterator* TableCache::NewIterator(const ReadOptions& options,
+ uint64_t file_number, uint64_t file_size,
+ Table** tableptr) {
+ if (tableptr != nullptr) {
+ *tableptr = nullptr;
+ }
+
+ Cache::Handle* handle = nullptr;
+ Status s = FindTable(file_number, file_size, &handle);
+ if (!s.ok()) {
+ return NewErrorIterator(s);
+ }
+
+ Table* table = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
+ Iterator* result = table->NewIterator(options);
+ result->RegisterCleanup(&UnrefEntry, cache_, handle);
+ if (tableptr != nullptr) {
+ *tableptr = table;
+ }
+ return result;
+}
+
+Status TableCache::Get(const ReadOptions& options, uint64_t file_number,
+ uint64_t file_size, const Slice& k, void* arg,
+ void (*handle_result)(void*, const Slice&,
+ const Slice&)) {
+ Cache::Handle* handle = nullptr;
+ Status s = FindTable(file_number, file_size, &handle);
+ if (s.ok()) {
+ Table* t = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
+ s = t->InternalGet(options, k, arg, handle_result);
+ cache_->Release(handle);
+ }
+ return s;
+}
+
+void TableCache::Evict(uint64_t file_number) {
+ char buf[sizeof(file_number)];
+ EncodeFixed64(buf, file_number);
+ cache_->Erase(Slice(buf, sizeof(buf)));
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/table_cache.h b/src/leveldb/db/table_cache.h
new file mode 100644
index 0000000000..93069c8844
--- /dev/null
+++ b/src/leveldb/db/table_cache.h
@@ -0,0 +1,58 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread-safe (provides internal synchronization)
+
+#ifndef STORAGE_LEVELDB_DB_TABLE_CACHE_H_
+#define STORAGE_LEVELDB_DB_TABLE_CACHE_H_
+
+#include <stdint.h>
+
+#include <string>
+
+#include "db/dbformat.h"
+#include "leveldb/cache.h"
+#include "leveldb/table.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+class Env;
+
+class TableCache {
+ public:
+ TableCache(const std::string& dbname, const Options& options, int entries);
+ ~TableCache();
+
+ // Return an iterator for the specified file number (the corresponding
+ // file length must be exactly "file_size" bytes). If "tableptr" is
+ // non-null, also sets "*tableptr" to point to the Table object
+ // underlying the returned iterator, or to nullptr if no Table object
+ // underlies the returned iterator. The returned "*tableptr" object is owned
+ // by the cache and should not be deleted, and is valid for as long as the
+ // returned iterator is live.
+ Iterator* NewIterator(const ReadOptions& options, uint64_t file_number,
+ uint64_t file_size, Table** tableptr = nullptr);
+
+ // If a seek to internal key "k" in specified file finds an entry,
+ // call (*handle_result)(arg, found_key, found_value).
+ Status Get(const ReadOptions& options, uint64_t file_number,
+ uint64_t file_size, const Slice& k, void* arg,
+ void (*handle_result)(void*, const Slice&, const Slice&));
+
+ // Evict any entry for the specified file number
+ void Evict(uint64_t file_number);
+
+ private:
+ Status FindTable(uint64_t file_number, uint64_t file_size, Cache::Handle**);
+
+ Env* const env_;
+ const std::string dbname_;
+ const Options& options_;
+ Cache* cache_;
+};
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_TABLE_CACHE_H_
diff --git a/src/leveldb/db/version_edit.cc b/src/leveldb/db/version_edit.cc
new file mode 100644
index 0000000000..cd770ef12d
--- /dev/null
+++ b/src/leveldb/db/version_edit.cc
@@ -0,0 +1,257 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+
+#include "db/version_set.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+// Tag numbers for serialized VersionEdit. These numbers are written to
+// disk and should not be changed.
+enum Tag {
+ kComparator = 1,
+ kLogNumber = 2,
+ kNextFileNumber = 3,
+ kLastSequence = 4,
+ kCompactPointer = 5,
+ kDeletedFile = 6,
+ kNewFile = 7,
+ // 8 was used for large value refs
+ kPrevLogNumber = 9
+};
+
+void VersionEdit::Clear() {
+ comparator_.clear();
+ log_number_ = 0;
+ prev_log_number_ = 0;
+ last_sequence_ = 0;
+ next_file_number_ = 0;
+ has_comparator_ = false;
+ has_log_number_ = false;
+ has_prev_log_number_ = false;
+ has_next_file_number_ = false;
+ has_last_sequence_ = false;
+ deleted_files_.clear();
+ new_files_.clear();
+}
+
+void VersionEdit::EncodeTo(std::string* dst) const {
+ if (has_comparator_) {
+ PutVarint32(dst, kComparator);
+ PutLengthPrefixedSlice(dst, comparator_);
+ }
+ if (has_log_number_) {
+ PutVarint32(dst, kLogNumber);
+ PutVarint64(dst, log_number_);
+ }
+ if (has_prev_log_number_) {
+ PutVarint32(dst, kPrevLogNumber);
+ PutVarint64(dst, prev_log_number_);
+ }
+ if (has_next_file_number_) {
+ PutVarint32(dst, kNextFileNumber);
+ PutVarint64(dst, next_file_number_);
+ }
+ if (has_last_sequence_) {
+ PutVarint32(dst, kLastSequence);
+ PutVarint64(dst, last_sequence_);
+ }
+
+ for (size_t i = 0; i < compact_pointers_.size(); i++) {
+ PutVarint32(dst, kCompactPointer);
+ PutVarint32(dst, compact_pointers_[i].first); // level
+ PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode());
+ }
+
+ for (const auto& deleted_file_kvp : deleted_files_) {
+ PutVarint32(dst, kDeletedFile);
+ PutVarint32(dst, deleted_file_kvp.first); // level
+ PutVarint64(dst, deleted_file_kvp.second); // file number
+ }
+
+ for (size_t i = 0; i < new_files_.size(); i++) {
+ const FileMetaData& f = new_files_[i].second;
+ PutVarint32(dst, kNewFile);
+ PutVarint32(dst, new_files_[i].first); // level
+ PutVarint64(dst, f.number);
+ PutVarint64(dst, f.file_size);
+ PutLengthPrefixedSlice(dst, f.smallest.Encode());
+ PutLengthPrefixedSlice(dst, f.largest.Encode());
+ }
+}
+
+static bool GetInternalKey(Slice* input, InternalKey* dst) {
+ Slice str;
+ if (GetLengthPrefixedSlice(input, &str)) {
+ return dst->DecodeFrom(str);
+ } else {
+ return false;
+ }
+}
+
+static bool GetLevel(Slice* input, int* level) {
+ uint32_t v;
+ if (GetVarint32(input, &v) && v < config::kNumLevels) {
+ *level = v;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+Status VersionEdit::DecodeFrom(const Slice& src) {
+ Clear();
+ Slice input = src;
+ const char* msg = nullptr;
+ uint32_t tag;
+
+ // Temporary storage for parsing
+ int level;
+ uint64_t number;
+ FileMetaData f;
+ Slice str;
+ InternalKey key;
+
+ while (msg == nullptr && GetVarint32(&input, &tag)) {
+ switch (tag) {
+ case kComparator:
+ if (GetLengthPrefixedSlice(&input, &str)) {
+ comparator_ = str.ToString();
+ has_comparator_ = true;
+ } else {
+ msg = "comparator name";
+ }
+ break;
+
+ case kLogNumber:
+ if (GetVarint64(&input, &log_number_)) {
+ has_log_number_ = true;
+ } else {
+ msg = "log number";
+ }
+ break;
+
+ case kPrevLogNumber:
+ if (GetVarint64(&input, &prev_log_number_)) {
+ has_prev_log_number_ = true;
+ } else {
+ msg = "previous log number";
+ }
+ break;
+
+ case kNextFileNumber:
+ if (GetVarint64(&input, &next_file_number_)) {
+ has_next_file_number_ = true;
+ } else {
+ msg = "next file number";
+ }
+ break;
+
+ case kLastSequence:
+ if (GetVarint64(&input, &last_sequence_)) {
+ has_last_sequence_ = true;
+ } else {
+ msg = "last sequence number";
+ }
+ break;
+
+ case kCompactPointer:
+ if (GetLevel(&input, &level) && GetInternalKey(&input, &key)) {
+ compact_pointers_.push_back(std::make_pair(level, key));
+ } else {
+ msg = "compaction pointer";
+ }
+ break;
+
+ case kDeletedFile:
+ if (GetLevel(&input, &level) && GetVarint64(&input, &number)) {
+ deleted_files_.insert(std::make_pair(level, number));
+ } else {
+ msg = "deleted file";
+ }
+ break;
+
+ case kNewFile:
+ if (GetLevel(&input, &level) && GetVarint64(&input, &f.number) &&
+ GetVarint64(&input, &f.file_size) &&
+ GetInternalKey(&input, &f.smallest) &&
+ GetInternalKey(&input, &f.largest)) {
+ new_files_.push_back(std::make_pair(level, f));
+ } else {
+ msg = "new-file entry";
+ }
+ break;
+
+ default:
+ msg = "unknown tag";
+ break;
+ }
+ }
+
+ if (msg == nullptr && !input.empty()) {
+ msg = "invalid tag";
+ }
+
+ Status result;
+ if (msg != nullptr) {
+ result = Status::Corruption("VersionEdit", msg);
+ }
+ return result;
+}
+
+std::string VersionEdit::DebugString() const {
+ std::string r;
+ r.append("VersionEdit {");
+ if (has_comparator_) {
+ r.append("\n Comparator: ");
+ r.append(comparator_);
+ }
+ if (has_log_number_) {
+ r.append("\n LogNumber: ");
+ AppendNumberTo(&r, log_number_);
+ }
+ if (has_prev_log_number_) {
+ r.append("\n PrevLogNumber: ");
+ AppendNumberTo(&r, prev_log_number_);
+ }
+ if (has_next_file_number_) {
+ r.append("\n NextFile: ");
+ AppendNumberTo(&r, next_file_number_);
+ }
+ if (has_last_sequence_) {
+ r.append("\n LastSeq: ");
+ AppendNumberTo(&r, last_sequence_);
+ }
+ for (size_t i = 0; i < compact_pointers_.size(); i++) {
+ r.append("\n CompactPointer: ");
+ AppendNumberTo(&r, compact_pointers_[i].first);
+ r.append(" ");
+ r.append(compact_pointers_[i].second.DebugString());
+ }
+ for (const auto& deleted_files_kvp : deleted_files_) {
+ r.append("\n DeleteFile: ");
+ AppendNumberTo(&r, deleted_files_kvp.first);
+ r.append(" ");
+ AppendNumberTo(&r, deleted_files_kvp.second);
+ }
+ for (size_t i = 0; i < new_files_.size(); i++) {
+ const FileMetaData& f = new_files_[i].second;
+ r.append("\n AddFile: ");
+ AppendNumberTo(&r, new_files_[i].first);
+ r.append(" ");
+ AppendNumberTo(&r, f.number);
+ r.append(" ");
+ AppendNumberTo(&r, f.file_size);
+ r.append(" ");
+ r.append(f.smallest.DebugString());
+ r.append(" .. ");
+ r.append(f.largest.DebugString());
+ }
+ r.append("\n}\n");
+ return r;
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/version_edit.h b/src/leveldb/db/version_edit.h
new file mode 100644
index 0000000000..0de4531773
--- /dev/null
+++ b/src/leveldb/db/version_edit.h
@@ -0,0 +1,106 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_VERSION_EDIT_H_
+#define STORAGE_LEVELDB_DB_VERSION_EDIT_H_
+
+#include <set>
+#include <utility>
+#include <vector>
+
+#include "db/dbformat.h"
+
+namespace leveldb {
+
+class VersionSet;
+
+struct FileMetaData {
+ FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0) {}
+
+ int refs;
+ int allowed_seeks; // Seeks allowed until compaction
+ uint64_t number;
+ uint64_t file_size; // File size in bytes
+ InternalKey smallest; // Smallest internal key served by table
+ InternalKey largest; // Largest internal key served by table
+};
+
+class VersionEdit {
+ public:
+ VersionEdit() { Clear(); }
+ ~VersionEdit() = default;
+
+ void Clear();
+
+ void SetComparatorName(const Slice& name) {
+ has_comparator_ = true;
+ comparator_ = name.ToString();
+ }
+ void SetLogNumber(uint64_t num) {
+ has_log_number_ = true;
+ log_number_ = num;
+ }
+ void SetPrevLogNumber(uint64_t num) {
+ has_prev_log_number_ = true;
+ prev_log_number_ = num;
+ }
+ void SetNextFile(uint64_t num) {
+ has_next_file_number_ = true;
+ next_file_number_ = num;
+ }
+ void SetLastSequence(SequenceNumber seq) {
+ has_last_sequence_ = true;
+ last_sequence_ = seq;
+ }
+ void SetCompactPointer(int level, const InternalKey& key) {
+ compact_pointers_.push_back(std::make_pair(level, key));
+ }
+
+ // Add the specified file at the specified number.
+ // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
+ // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
+ void AddFile(int level, uint64_t file, uint64_t file_size,
+ const InternalKey& smallest, const InternalKey& largest) {
+ FileMetaData f;
+ f.number = file;
+ f.file_size = file_size;
+ f.smallest = smallest;
+ f.largest = largest;
+ new_files_.push_back(std::make_pair(level, f));
+ }
+
+ // Delete the specified "file" from the specified "level".
+ void DeleteFile(int level, uint64_t file) {
+ deleted_files_.insert(std::make_pair(level, file));
+ }
+
+ void EncodeTo(std::string* dst) const;
+ Status DecodeFrom(const Slice& src);
+
+ std::string DebugString() const;
+
+ private:
+ friend class VersionSet;
+
+ typedef std::set<std::pair<int, uint64_t>> DeletedFileSet;
+
+ std::string comparator_;
+ uint64_t log_number_;
+ uint64_t prev_log_number_;
+ uint64_t next_file_number_;
+ SequenceNumber last_sequence_;
+ bool has_comparator_;
+ bool has_log_number_;
+ bool has_prev_log_number_;
+ bool has_next_file_number_;
+ bool has_last_sequence_;
+
+ std::vector<std::pair<int, InternalKey>> compact_pointers_;
+ DeletedFileSet deleted_files_;
+ std::vector<std::pair<int, FileMetaData>> new_files_;
+};
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_VERSION_EDIT_H_
diff --git a/src/leveldb/db/version_edit_test.cc b/src/leveldb/db/version_edit_test.cc
new file mode 100644
index 0000000000..0b7cda8854
--- /dev/null
+++ b/src/leveldb/db/version_edit_test.cc
@@ -0,0 +1,44 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+static void TestEncodeDecode(const VersionEdit& edit) {
+ std::string encoded, encoded2;
+ edit.EncodeTo(&encoded);
+ VersionEdit parsed;
+ Status s = parsed.DecodeFrom(encoded);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ parsed.EncodeTo(&encoded2);
+ ASSERT_EQ(encoded, encoded2);
+}
+
+class VersionEditTest {};
+
+TEST(VersionEditTest, EncodeDecode) {
+ static const uint64_t kBig = 1ull << 50;
+
+ VersionEdit edit;
+ for (int i = 0; i < 4; i++) {
+ TestEncodeDecode(edit);
+ edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
+ InternalKey("foo", kBig + 500 + i, kTypeValue),
+ InternalKey("zoo", kBig + 600 + i, kTypeDeletion));
+ edit.DeleteFile(4, kBig + 700 + i);
+ edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue));
+ }
+
+ edit.SetComparatorName("foo");
+ edit.SetLogNumber(kBig + 100);
+ edit.SetNextFile(kBig + 200);
+ edit.SetLastSequence(kBig + 1000);
+ TestEncodeDecode(edit);
+}
+
+} // namespace leveldb
+
+int main(int argc, char** argv) { return leveldb::test::RunAllTests(); }
diff --git a/src/leveldb/db/version_set.cc b/src/leveldb/db/version_set.cc
new file mode 100644
index 0000000000..cd07346ea8
--- /dev/null
+++ b/src/leveldb/db/version_set.cc
@@ -0,0 +1,1562 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+
+#include <stdio.h>
+
+#include <algorithm>
+
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "leveldb/env.h"
+#include "leveldb/table_builder.h"
+#include "table/merger.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+static size_t TargetFileSize(const Options* options) {
+ return options->max_file_size;
+}
+
+// Maximum bytes of overlaps in grandparent (i.e., level+2) before we
+// stop building a single file in a level->level+1 compaction.
+static int64_t MaxGrandParentOverlapBytes(const Options* options) {
+ return 10 * TargetFileSize(options);
+}
+
+// Maximum number of bytes in all compacted files. We avoid expanding
+// the lower level file set of a compaction if it would make the
+// total compaction cover more than this many bytes.
+static int64_t ExpandedCompactionByteSizeLimit(const Options* options) {
+ return 25 * TargetFileSize(options);
+}
+
+static double MaxBytesForLevel(const Options* options, int level) {
+ // Note: the result for level zero is not really used since we set
+ // the level-0 compaction threshold based on number of files.
+
+ // Result for both level-0 and level-1
+ double result = 10. * 1048576.0;
+ while (level > 1) {
+ result *= 10;
+ level--;
+ }
+ return result;
+}
+
+static uint64_t MaxFileSizeForLevel(const Options* options, int level) {
+ // We could vary per level to reduce number of files?
+ return TargetFileSize(options);
+}
+
+static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+ int64_t sum = 0;
+ for (size_t i = 0; i < files.size(); i++) {
+ sum += files[i]->file_size;
+ }
+ return sum;
+}
+
+Version::~Version() {
+ assert(refs_ == 0);
+
+ // Remove from linked list
+ prev_->next_ = next_;
+ next_->prev_ = prev_;
+
+ // Drop references to files
+ for (int level = 0; level < config::kNumLevels; level++) {
+ for (size_t i = 0; i < files_[level].size(); i++) {
+ FileMetaData* f = files_[level][i];
+ assert(f->refs > 0);
+ f->refs--;
+ if (f->refs <= 0) {
+ delete f;
+ }
+ }
+ }
+}
+
+int FindFile(const InternalKeyComparator& icmp,
+ const std::vector<FileMetaData*>& files, const Slice& key) {
+ uint32_t left = 0;
+ uint32_t right = files.size();
+ while (left < right) {
+ uint32_t mid = (left + right) / 2;
+ const FileMetaData* f = files[mid];
+ if (icmp.InternalKeyComparator::Compare(f->largest.Encode(), key) < 0) {
+ // Key at "mid.largest" is < "target". Therefore all
+ // files at or before "mid" are uninteresting.
+ left = mid + 1;
+ } else {
+ // Key at "mid.largest" is >= "target". Therefore all files
+ // after "mid" are uninteresting.
+ right = mid;
+ }
+ }
+ return right;
+}
+
+static bool AfterFile(const Comparator* ucmp, const Slice* user_key,
+ const FileMetaData* f) {
+ // null user_key occurs before all keys and is therefore never after *f
+ return (user_key != nullptr &&
+ ucmp->Compare(*user_key, f->largest.user_key()) > 0);
+}
+
+static bool BeforeFile(const Comparator* ucmp, const Slice* user_key,
+ const FileMetaData* f) {
+ // null user_key occurs after all keys and is therefore never before *f
+ return (user_key != nullptr &&
+ ucmp->Compare(*user_key, f->smallest.user_key()) < 0);
+}
+
+bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
+ bool disjoint_sorted_files,
+ const std::vector<FileMetaData*>& files,
+ const Slice* smallest_user_key,
+ const Slice* largest_user_key) {
+ const Comparator* ucmp = icmp.user_comparator();
+ if (!disjoint_sorted_files) {
+ // Need to check against all files
+ for (size_t i = 0; i < files.size(); i++) {
+ const FileMetaData* f = files[i];
+ if (AfterFile(ucmp, smallest_user_key, f) ||
+ BeforeFile(ucmp, largest_user_key, f)) {
+ // No overlap
+ } else {
+ return true; // Overlap
+ }
+ }
+ return false;
+ }
+
+ // Binary search over file list
+ uint32_t index = 0;
+ if (smallest_user_key != nullptr) {
+ // Find the earliest possible internal key for smallest_user_key
+ InternalKey small_key(*smallest_user_key, kMaxSequenceNumber,
+ kValueTypeForSeek);
+ index = FindFile(icmp, files, small_key.Encode());
+ }
+
+ if (index >= files.size()) {
+ // beginning of range is after all files, so no overlap.
+ return false;
+ }
+
+ return !BeforeFile(ucmp, largest_user_key, files[index]);
+}
+
+// An internal iterator. For a given version/level pair, yields
+// information about the files in the level. For a given entry, key()
+// is the largest key that occurs in the file, and value() is an
+// 16-byte value containing the file number and file size, both
+// encoded using EncodeFixed64.
+class Version::LevelFileNumIterator : public Iterator {
+ public:
+ LevelFileNumIterator(const InternalKeyComparator& icmp,
+ const std::vector<FileMetaData*>* flist)
+ : icmp_(icmp), flist_(flist), index_(flist->size()) { // Marks as invalid
+ }
+ bool Valid() const override { return index_ < flist_->size(); }
+ void Seek(const Slice& target) override {
+ index_ = FindFile(icmp_, *flist_, target);
+ }
+ void SeekToFirst() override { index_ = 0; }
+ void SeekToLast() override {
+ index_ = flist_->empty() ? 0 : flist_->size() - 1;
+ }
+ void Next() override {
+ assert(Valid());
+ index_++;
+ }
+ void Prev() override {
+ assert(Valid());
+ if (index_ == 0) {
+ index_ = flist_->size(); // Marks as invalid
+ } else {
+ index_--;
+ }
+ }
+ Slice key() const override {
+ assert(Valid());
+ return (*flist_)[index_]->largest.Encode();
+ }
+ Slice value() const override {
+ assert(Valid());
+ EncodeFixed64(value_buf_, (*flist_)[index_]->number);
+ EncodeFixed64(value_buf_ + 8, (*flist_)[index_]->file_size);
+ return Slice(value_buf_, sizeof(value_buf_));
+ }
+ Status status() const override { return Status::OK(); }
+
+ private:
+ const InternalKeyComparator icmp_;
+ const std::vector<FileMetaData*>* const flist_;
+ uint32_t index_;
+
+ // Backing store for value(). Holds the file number and size.
+ mutable char value_buf_[16];
+};
+
+static Iterator* GetFileIterator(void* arg, const ReadOptions& options,
+ const Slice& file_value) {
+ TableCache* cache = reinterpret_cast<TableCache*>(arg);
+ if (file_value.size() != 16) {
+ return NewErrorIterator(
+ Status::Corruption("FileReader invoked with unexpected value"));
+ } else {
+ return cache->NewIterator(options, DecodeFixed64(file_value.data()),
+ DecodeFixed64(file_value.data() + 8));
+ }
+}
+
+Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
+ int level) const {
+ return NewTwoLevelIterator(
+ new LevelFileNumIterator(vset_->icmp_, &files_[level]), &GetFileIterator,
+ vset_->table_cache_, options);
+}
+
+void Version::AddIterators(const ReadOptions& options,
+ std::vector<Iterator*>* iters) {
+ // Merge all level zero files together since they may overlap
+ for (size_t i = 0; i < files_[0].size(); i++) {
+ iters->push_back(vset_->table_cache_->NewIterator(
+ options, files_[0][i]->number, files_[0][i]->file_size));
+ }
+
+ // For levels > 0, we can use a concatenating iterator that sequentially
+ // walks through the non-overlapping files in the level, opening them
+ // lazily.
+ for (int level = 1; level < config::kNumLevels; level++) {
+ if (!files_[level].empty()) {
+ iters->push_back(NewConcatenatingIterator(options, level));
+ }
+ }
+}
+
+// Callback from TableCache::Get()
+namespace {
+enum SaverState {
+ kNotFound,
+ kFound,
+ kDeleted,
+ kCorrupt,
+};
+struct Saver {
+ SaverState state;
+ const Comparator* ucmp;
+ Slice user_key;
+ std::string* value;
+};
+} // namespace
+static void SaveValue(void* arg, const Slice& ikey, const Slice& v) {
+ Saver* s = reinterpret_cast<Saver*>(arg);
+ ParsedInternalKey parsed_key;
+ if (!ParseInternalKey(ikey, &parsed_key)) {
+ s->state = kCorrupt;
+ } else {
+ if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) {
+ s->state = (parsed_key.type == kTypeValue) ? kFound : kDeleted;
+ if (s->state == kFound) {
+ s->value->assign(v.data(), v.size());
+ }
+ }
+ }
+}
+
+static bool NewestFirst(FileMetaData* a, FileMetaData* b) {
+ return a->number > b->number;
+}
+
+void Version::ForEachOverlapping(Slice user_key, Slice internal_key, void* arg,
+ bool (*func)(void*, int, FileMetaData*)) {
+ const Comparator* ucmp = vset_->icmp_.user_comparator();
+
+ // Search level-0 in order from newest to oldest.
+ std::vector<FileMetaData*> tmp;
+ tmp.reserve(files_[0].size());
+ for (uint32_t i = 0; i < files_[0].size(); i++) {
+ FileMetaData* f = files_[0][i];
+ if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 &&
+ ucmp->Compare(user_key, f->largest.user_key()) <= 0) {
+ tmp.push_back(f);
+ }
+ }
+ if (!tmp.empty()) {
+ std::sort(tmp.begin(), tmp.end(), NewestFirst);
+ for (uint32_t i = 0; i < tmp.size(); i++) {
+ if (!(*func)(arg, 0, tmp[i])) {
+ return;
+ }
+ }
+ }
+
+ // Search other levels.
+ for (int level = 1; level < config::kNumLevels; level++) {
+ size_t num_files = files_[level].size();
+ if (num_files == 0) continue;
+
+ // Binary search to find earliest index whose largest key >= internal_key.
+ uint32_t index = FindFile(vset_->icmp_, files_[level], internal_key);
+ if (index < num_files) {
+ FileMetaData* f = files_[level][index];
+ if (ucmp->Compare(user_key, f->smallest.user_key()) < 0) {
+ // All of "f" is past any data for user_key
+ } else {
+ if (!(*func)(arg, level, f)) {
+ return;
+ }
+ }
+ }
+ }
+}
+
+Status Version::Get(const ReadOptions& options, const LookupKey& k,
+ std::string* value, GetStats* stats) {
+ stats->seek_file = nullptr;
+ stats->seek_file_level = -1;
+
+ struct State {
+ Saver saver;
+ GetStats* stats;
+ const ReadOptions* options;
+ Slice ikey;
+ FileMetaData* last_file_read;
+ int last_file_read_level;
+
+ VersionSet* vset;
+ Status s;
+ bool found;
+
+ static bool Match(void* arg, int level, FileMetaData* f) {
+ State* state = reinterpret_cast<State*>(arg);
+
+ if (state->stats->seek_file == nullptr &&
+ state->last_file_read != nullptr) {
+ // We have had more than one seek for this read. Charge the 1st file.
+ state->stats->seek_file = state->last_file_read;
+ state->stats->seek_file_level = state->last_file_read_level;
+ }
+
+ state->last_file_read = f;
+ state->last_file_read_level = level;
+
+ state->s = state->vset->table_cache_->Get(*state->options, f->number,
+ f->file_size, state->ikey,
+ &state->saver, SaveValue);
+ if (!state->s.ok()) {
+ state->found = true;
+ return false;
+ }
+ switch (state->saver.state) {
+ case kNotFound:
+ return true; // Keep searching in other files
+ case kFound:
+ state->found = true;
+ return false;
+ case kDeleted:
+ return false;
+ case kCorrupt:
+ state->s =
+ Status::Corruption("corrupted key for ", state->saver.user_key);
+ state->found = true;
+ return false;
+ }
+
+ // Not reached. Added to avoid false compilation warnings of
+ // "control reaches end of non-void function".
+ return false;
+ }
+ };
+
+ State state;
+ state.found = false;
+ state.stats = stats;
+ state.last_file_read = nullptr;
+ state.last_file_read_level = -1;
+
+ state.options = &options;
+ state.ikey = k.internal_key();
+ state.vset = vset_;
+
+ state.saver.state = kNotFound;
+ state.saver.ucmp = vset_->icmp_.user_comparator();
+ state.saver.user_key = k.user_key();
+ state.saver.value = value;
+
+ ForEachOverlapping(state.saver.user_key, state.ikey, &state, &State::Match);
+
+ return state.found ? state.s : Status::NotFound(Slice());
+}
+
+bool Version::UpdateStats(const GetStats& stats) {
+ FileMetaData* f = stats.seek_file;
+ if (f != nullptr) {
+ f->allowed_seeks--;
+ if (f->allowed_seeks <= 0 && file_to_compact_ == nullptr) {
+ file_to_compact_ = f;
+ file_to_compact_level_ = stats.seek_file_level;
+ return true;
+ }
+ }
+ return false;
+}
+
+bool Version::RecordReadSample(Slice internal_key) {
+ ParsedInternalKey ikey;
+ if (!ParseInternalKey(internal_key, &ikey)) {
+ return false;
+ }
+
+ struct State {
+ GetStats stats; // Holds first matching file
+ int matches;
+
+ static bool Match(void* arg, int level, FileMetaData* f) {
+ State* state = reinterpret_cast<State*>(arg);
+ state->matches++;
+ if (state->matches == 1) {
+ // Remember first match.
+ state->stats.seek_file = f;
+ state->stats.seek_file_level = level;
+ }
+ // We can stop iterating once we have a second match.
+ return state->matches < 2;
+ }
+ };
+
+ State state;
+ state.matches = 0;
+ ForEachOverlapping(ikey.user_key, internal_key, &state, &State::Match);
+
+ // Must have at least two matches since we want to merge across
+ // files. But what if we have a single file that contains many
+ // overwrites and deletions? Should we have another mechanism for
+ // finding such files?
+ if (state.matches >= 2) {
+ // 1MB cost is about 1 seek (see comment in Builder::Apply).
+ return UpdateStats(state.stats);
+ }
+ return false;
+}
+
+void Version::Ref() { ++refs_; }
+
+void Version::Unref() {
+ assert(this != &vset_->dummy_versions_);
+ assert(refs_ >= 1);
+ --refs_;
+ if (refs_ == 0) {
+ delete this;
+ }
+}
+
+bool Version::OverlapInLevel(int level, const Slice* smallest_user_key,
+ const Slice* largest_user_key) {
+ return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level],
+ smallest_user_key, largest_user_key);
+}
+
+int Version::PickLevelForMemTableOutput(const Slice& smallest_user_key,
+ const Slice& largest_user_key) {
+ int level = 0;
+ if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) {
+ // Push to next level if there is no overlap in next level,
+ // and the #bytes overlapping in the level after that are limited.
+ InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek);
+ InternalKey limit(largest_user_key, 0, static_cast<ValueType>(0));
+ std::vector<FileMetaData*> overlaps;
+ while (level < config::kMaxMemCompactLevel) {
+ if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
+ break;
+ }
+ if (level + 2 < config::kNumLevels) {
+ // Check that file does not overlap too many grandparent bytes.
+ GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
+ const int64_t sum = TotalFileSize(overlaps);
+ if (sum > MaxGrandParentOverlapBytes(vset_->options_)) {
+ break;
+ }
+ }
+ level++;
+ }
+ }
+ return level;
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end]
+void Version::GetOverlappingInputs(int level, const InternalKey* begin,
+ const InternalKey* end,
+ std::vector<FileMetaData*>* inputs) {
+ assert(level >= 0);
+ assert(level < config::kNumLevels);
+ inputs->clear();
+ Slice user_begin, user_end;
+ if (begin != nullptr) {
+ user_begin = begin->user_key();
+ }
+ if (end != nullptr) {
+ user_end = end->user_key();
+ }
+ const Comparator* user_cmp = vset_->icmp_.user_comparator();
+ for (size_t i = 0; i < files_[level].size();) {
+ FileMetaData* f = files_[level][i++];
+ const Slice file_start = f->smallest.user_key();
+ const Slice file_limit = f->largest.user_key();
+ if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) {
+ // "f" is completely before specified range; skip it
+ } else if (end != nullptr && user_cmp->Compare(file_start, user_end) > 0) {
+ // "f" is completely after specified range; skip it
+ } else {
+ inputs->push_back(f);
+ if (level == 0) {
+ // Level-0 files may overlap each other. So check if the newly
+ // added file has expanded the range. If so, restart search.
+ if (begin != nullptr && user_cmp->Compare(file_start, user_begin) < 0) {
+ user_begin = file_start;
+ inputs->clear();
+ i = 0;
+ } else if (end != nullptr &&
+ user_cmp->Compare(file_limit, user_end) > 0) {
+ user_end = file_limit;
+ inputs->clear();
+ i = 0;
+ }
+ }
+ }
+ }
+}
+
+std::string Version::DebugString() const {
+ std::string r;
+ for (int level = 0; level < config::kNumLevels; level++) {
+ // E.g.,
+ // --- level 1 ---
+ // 17:123['a' .. 'd']
+ // 20:43['e' .. 'g']
+ r.append("--- level ");
+ AppendNumberTo(&r, level);
+ r.append(" ---\n");
+ const std::vector<FileMetaData*>& files = files_[level];
+ for (size_t i = 0; i < files.size(); i++) {
+ r.push_back(' ');
+ AppendNumberTo(&r, files[i]->number);
+ r.push_back(':');
+ AppendNumberTo(&r, files[i]->file_size);
+ r.append("[");
+ r.append(files[i]->smallest.DebugString());
+ r.append(" .. ");
+ r.append(files[i]->largest.DebugString());
+ r.append("]\n");
+ }
+ }
+ return r;
+}
+
+// A helper class so we can efficiently apply a whole sequence
+// of edits to a particular state without creating intermediate
+// Versions that contain full copies of the intermediate state.
+class VersionSet::Builder {
+ private:
+ // Helper to sort by v->files_[file_number].smallest
+ struct BySmallestKey {
+ const InternalKeyComparator* internal_comparator;
+
+ bool operator()(FileMetaData* f1, FileMetaData* f2) const {
+ int r = internal_comparator->Compare(f1->smallest, f2->smallest);
+ if (r != 0) {
+ return (r < 0);
+ } else {
+ // Break ties by file number
+ return (f1->number < f2->number);
+ }
+ }
+ };
+
+ typedef std::set<FileMetaData*, BySmallestKey> FileSet;
+ struct LevelState {
+ std::set<uint64_t> deleted_files;
+ FileSet* added_files;
+ };
+
+ VersionSet* vset_;
+ Version* base_;
+ LevelState levels_[config::kNumLevels];
+
+ public:
+ // Initialize a builder with the files from *base and other info from *vset
+ Builder(VersionSet* vset, Version* base) : vset_(vset), base_(base) {
+ base_->Ref();
+ BySmallestKey cmp;
+ cmp.internal_comparator = &vset_->icmp_;
+ for (int level = 0; level < config::kNumLevels; level++) {
+ levels_[level].added_files = new FileSet(cmp);
+ }
+ }
+
+ ~Builder() {
+ for (int level = 0; level < config::kNumLevels; level++) {
+ const FileSet* added = levels_[level].added_files;
+ std::vector<FileMetaData*> to_unref;
+ to_unref.reserve(added->size());
+ for (FileSet::const_iterator it = added->begin(); it != added->end();
+ ++it) {
+ to_unref.push_back(*it);
+ }
+ delete added;
+ for (uint32_t i = 0; i < to_unref.size(); i++) {
+ FileMetaData* f = to_unref[i];
+ f->refs--;
+ if (f->refs <= 0) {
+ delete f;
+ }
+ }
+ }
+ base_->Unref();
+ }
+
+ // Apply all of the edits in *edit to the current state.
+ void Apply(VersionEdit* edit) {
+ // Update compaction pointers
+ for (size_t i = 0; i < edit->compact_pointers_.size(); i++) {
+ const int level = edit->compact_pointers_[i].first;
+ vset_->compact_pointer_[level] =
+ edit->compact_pointers_[i].second.Encode().ToString();
+ }
+
+ // Delete files
+ for (const auto& deleted_file_set_kvp : edit->deleted_files_) {
+ const int level = deleted_file_set_kvp.first;
+ const uint64_t number = deleted_file_set_kvp.second;
+ levels_[level].deleted_files.insert(number);
+ }
+
+ // Add new files
+ for (size_t i = 0; i < edit->new_files_.size(); i++) {
+ const int level = edit->new_files_[i].first;
+ FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
+ f->refs = 1;
+
+ // We arrange to automatically compact this file after
+ // a certain number of seeks. Let's assume:
+ // (1) One seek costs 10ms
+ // (2) Writing or reading 1MB costs 10ms (100MB/s)
+ // (3) A compaction of 1MB does 25MB of IO:
+ // 1MB read from this level
+ // 10-12MB read from next level (boundaries may be misaligned)
+ // 10-12MB written to next level
+ // This implies that 25 seeks cost the same as the compaction
+ // of 1MB of data. I.e., one seek costs approximately the
+ // same as the compaction of 40KB of data. We are a little
+ // conservative and allow approximately one seek for every 16KB
+ // of data before triggering a compaction.
+ f->allowed_seeks = static_cast<int>((f->file_size / 16384U));
+ if (f->allowed_seeks < 100) f->allowed_seeks = 100;
+
+ levels_[level].deleted_files.erase(f->number);
+ levels_[level].added_files->insert(f);
+ }
+ }
+
+ // Save the current state in *v.
+ void SaveTo(Version* v) {
+ BySmallestKey cmp;
+ cmp.internal_comparator = &vset_->icmp_;
+ for (int level = 0; level < config::kNumLevels; level++) {
+ // Merge the set of added files with the set of pre-existing files.
+ // Drop any deleted files. Store the result in *v.
+ const std::vector<FileMetaData*>& base_files = base_->files_[level];
+ std::vector<FileMetaData*>::const_iterator base_iter = base_files.begin();
+ std::vector<FileMetaData*>::const_iterator base_end = base_files.end();
+ const FileSet* added_files = levels_[level].added_files;
+ v->files_[level].reserve(base_files.size() + added_files->size());
+ for (const auto& added_file : *added_files) {
+ // Add all smaller files listed in base_
+ for (std::vector<FileMetaData*>::const_iterator bpos =
+ std::upper_bound(base_iter, base_end, added_file, cmp);
+ base_iter != bpos; ++base_iter) {
+ MaybeAddFile(v, level, *base_iter);
+ }
+
+ MaybeAddFile(v, level, added_file);
+ }
+
+ // Add remaining base files
+ for (; base_iter != base_end; ++base_iter) {
+ MaybeAddFile(v, level, *base_iter);
+ }
+
+#ifndef NDEBUG
+ // Make sure there is no overlap in levels > 0
+ if (level > 0) {
+ for (uint32_t i = 1; i < v->files_[level].size(); i++) {
+ const InternalKey& prev_end = v->files_[level][i - 1]->largest;
+ const InternalKey& this_begin = v->files_[level][i]->smallest;
+ if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) {
+ fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
+ prev_end.DebugString().c_str(),
+ this_begin.DebugString().c_str());
+ abort();
+ }
+ }
+ }
+#endif
+ }
+ }
+
+ void MaybeAddFile(Version* v, int level, FileMetaData* f) {
+ if (levels_[level].deleted_files.count(f->number) > 0) {
+ // File is deleted: do nothing
+ } else {
+ std::vector<FileMetaData*>* files = &v->files_[level];
+ if (level > 0 && !files->empty()) {
+ // Must not overlap
+ assert(vset_->icmp_.Compare((*files)[files->size() - 1]->largest,
+ f->smallest) < 0);
+ }
+ f->refs++;
+ files->push_back(f);
+ }
+ }
+};
+
+VersionSet::VersionSet(const std::string& dbname, const Options* options,
+ TableCache* table_cache,
+ const InternalKeyComparator* cmp)
+ : env_(options->env),
+ dbname_(dbname),
+ options_(options),
+ table_cache_(table_cache),
+ icmp_(*cmp),
+ next_file_number_(2),
+ manifest_file_number_(0), // Filled by Recover()
+ last_sequence_(0),
+ log_number_(0),
+ prev_log_number_(0),
+ descriptor_file_(nullptr),
+ descriptor_log_(nullptr),
+ dummy_versions_(this),
+ current_(nullptr) {
+ AppendVersion(new Version(this));
+}
+
+VersionSet::~VersionSet() {
+ current_->Unref();
+ assert(dummy_versions_.next_ == &dummy_versions_); // List must be empty
+ delete descriptor_log_;
+ delete descriptor_file_;
+}
+
+void VersionSet::AppendVersion(Version* v) {
+ // Make "v" current
+ assert(v->refs_ == 0);
+ assert(v != current_);
+ if (current_ != nullptr) {
+ current_->Unref();
+ }
+ current_ = v;
+ v->Ref();
+
+ // Append to linked list
+ v->prev_ = dummy_versions_.prev_;
+ v->next_ = &dummy_versions_;
+ v->prev_->next_ = v;
+ v->next_->prev_ = v;
+}
+
+Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
+ if (edit->has_log_number_) {
+ assert(edit->log_number_ >= log_number_);
+ assert(edit->log_number_ < next_file_number_);
+ } else {
+ edit->SetLogNumber(log_number_);
+ }
+
+ if (!edit->has_prev_log_number_) {
+ edit->SetPrevLogNumber(prev_log_number_);
+ }
+
+ edit->SetNextFile(next_file_number_);
+ edit->SetLastSequence(last_sequence_);
+
+ Version* v = new Version(this);
+ {
+ Builder builder(this, current_);
+ builder.Apply(edit);
+ builder.SaveTo(v);
+ }
+ Finalize(v);
+
+ // Initialize new descriptor log file if necessary by creating
+ // a temporary file that contains a snapshot of the current version.
+ std::string new_manifest_file;
+ Status s;
+ if (descriptor_log_ == nullptr) {
+ // No reason to unlock *mu here since we only hit this path in the
+ // first call to LogAndApply (when opening the database).
+ assert(descriptor_file_ == nullptr);
+ new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
+ edit->SetNextFile(next_file_number_);
+ s = env_->NewWritableFile(new_manifest_file, &descriptor_file_);
+ if (s.ok()) {
+ descriptor_log_ = new log::Writer(descriptor_file_);
+ s = WriteSnapshot(descriptor_log_);
+ }
+ }
+
+ // Unlock during expensive MANIFEST log write
+ {
+ mu->Unlock();
+
+ // Write new record to MANIFEST log
+ if (s.ok()) {
+ std::string record;
+ edit->EncodeTo(&record);
+ s = descriptor_log_->AddRecord(record);
+ if (s.ok()) {
+ s = descriptor_file_->Sync();
+ }
+ if (!s.ok()) {
+ Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str());
+ }
+ }
+
+ // If we just created a new descriptor file, install it by writing a
+ // new CURRENT file that points to it.
+ if (s.ok() && !new_manifest_file.empty()) {
+ s = SetCurrentFile(env_, dbname_, manifest_file_number_);
+ }
+
+ mu->Lock();
+ }
+
+ // Install the new version
+ if (s.ok()) {
+ AppendVersion(v);
+ log_number_ = edit->log_number_;
+ prev_log_number_ = edit->prev_log_number_;
+ } else {
+ delete v;
+ if (!new_manifest_file.empty()) {
+ delete descriptor_log_;
+ delete descriptor_file_;
+ descriptor_log_ = nullptr;
+ descriptor_file_ = nullptr;
+ env_->DeleteFile(new_manifest_file);
+ }
+ }
+
+ return s;
+}
+
+Status VersionSet::Recover(bool* save_manifest) {
+ struct LogReporter : public log::Reader::Reporter {
+ Status* status;
+ void Corruption(size_t bytes, const Status& s) override {
+ if (this->status->ok()) *this->status = s;
+ }
+ };
+
+ // Read "CURRENT" file, which contains a pointer to the current manifest file
+ std::string current;
+ Status s = ReadFileToString(env_, CurrentFileName(dbname_), &current);
+ if (!s.ok()) {
+ return s;
+ }
+ if (current.empty() || current[current.size() - 1] != '\n') {
+ return Status::Corruption("CURRENT file does not end with newline");
+ }
+ current.resize(current.size() - 1);
+
+ std::string dscname = dbname_ + "/" + current;
+ SequentialFile* file;
+ s = env_->NewSequentialFile(dscname, &file);
+ if (!s.ok()) {
+ if (s.IsNotFound()) {
+ return Status::Corruption("CURRENT points to a non-existent file",
+ s.ToString());
+ }
+ return s;
+ }
+
+ bool have_log_number = false;
+ bool have_prev_log_number = false;
+ bool have_next_file = false;
+ bool have_last_sequence = false;
+ uint64_t next_file = 0;
+ uint64_t last_sequence = 0;
+ uint64_t log_number = 0;
+ uint64_t prev_log_number = 0;
+ Builder builder(this, current_);
+
+ {
+ LogReporter reporter;
+ reporter.status = &s;
+ log::Reader reader(file, &reporter, true /*checksum*/,
+ 0 /*initial_offset*/);
+ Slice record;
+ std::string scratch;
+ while (reader.ReadRecord(&record, &scratch) && s.ok()) {
+ VersionEdit edit;
+ s = edit.DecodeFrom(record);
+ if (s.ok()) {
+ if (edit.has_comparator_ &&
+ edit.comparator_ != icmp_.user_comparator()->Name()) {
+ s = Status::InvalidArgument(
+ edit.comparator_ + " does not match existing comparator ",
+ icmp_.user_comparator()->Name());
+ }
+ }
+
+ if (s.ok()) {
+ builder.Apply(&edit);
+ }
+
+ if (edit.has_log_number_) {
+ log_number = edit.log_number_;
+ have_log_number = true;
+ }
+
+ if (edit.has_prev_log_number_) {
+ prev_log_number = edit.prev_log_number_;
+ have_prev_log_number = true;
+ }
+
+ if (edit.has_next_file_number_) {
+ next_file = edit.next_file_number_;
+ have_next_file = true;
+ }
+
+ if (edit.has_last_sequence_) {
+ last_sequence = edit.last_sequence_;
+ have_last_sequence = true;
+ }
+ }
+ }
+ delete file;
+ file = nullptr;
+
+ if (s.ok()) {
+ if (!have_next_file) {
+ s = Status::Corruption("no meta-nextfile entry in descriptor");
+ } else if (!have_log_number) {
+ s = Status::Corruption("no meta-lognumber entry in descriptor");
+ } else if (!have_last_sequence) {
+ s = Status::Corruption("no last-sequence-number entry in descriptor");
+ }
+
+ if (!have_prev_log_number) {
+ prev_log_number = 0;
+ }
+
+ MarkFileNumberUsed(prev_log_number);
+ MarkFileNumberUsed(log_number);
+ }
+
+ if (s.ok()) {
+ Version* v = new Version(this);
+ builder.SaveTo(v);
+ // Install recovered version
+ Finalize(v);
+ AppendVersion(v);
+ manifest_file_number_ = next_file;
+ next_file_number_ = next_file + 1;
+ last_sequence_ = last_sequence;
+ log_number_ = log_number;
+ prev_log_number_ = prev_log_number;
+
+ // See if we can reuse the existing MANIFEST file.
+ if (ReuseManifest(dscname, current)) {
+ // No need to save new manifest
+ } else {
+ *save_manifest = true;
+ }
+ }
+
+ return s;
+}
+
+bool VersionSet::ReuseManifest(const std::string& dscname,
+ const std::string& dscbase) {
+ if (!options_->reuse_logs) {
+ return false;
+ }
+ FileType manifest_type;
+ uint64_t manifest_number;
+ uint64_t manifest_size;
+ if (!ParseFileName(dscbase, &manifest_number, &manifest_type) ||
+ manifest_type != kDescriptorFile ||
+ !env_->GetFileSize(dscname, &manifest_size).ok() ||
+ // Make new compacted MANIFEST if old one is too big
+ manifest_size >= TargetFileSize(options_)) {
+ return false;
+ }
+
+ assert(descriptor_file_ == nullptr);
+ assert(descriptor_log_ == nullptr);
+ Status r = env_->NewAppendableFile(dscname, &descriptor_file_);
+ if (!r.ok()) {
+ Log(options_->info_log, "Reuse MANIFEST: %s\n", r.ToString().c_str());
+ assert(descriptor_file_ == nullptr);
+ return false;
+ }
+
+ Log(options_->info_log, "Reusing MANIFEST %s\n", dscname.c_str());
+ descriptor_log_ = new log::Writer(descriptor_file_, manifest_size);
+ manifest_file_number_ = manifest_number;
+ return true;
+}
+
+void VersionSet::MarkFileNumberUsed(uint64_t number) {
+ if (next_file_number_ <= number) {
+ next_file_number_ = number + 1;
+ }
+}
+
+void VersionSet::Finalize(Version* v) {
+ // Precomputed best level for next compaction
+ int best_level = -1;
+ double best_score = -1;
+
+ for (int level = 0; level < config::kNumLevels - 1; level++) {
+ double score;
+ if (level == 0) {
+ // We treat level-0 specially by bounding the number of files
+ // instead of number of bytes for two reasons:
+ //
+ // (1) With larger write-buffer sizes, it is nice not to do too
+ // many level-0 compactions.
+ //
+ // (2) The files in level-0 are merged on every read and
+ // therefore we wish to avoid too many files when the individual
+ // file size is small (perhaps because of a small write-buffer
+ // setting, or very high compression ratios, or lots of
+ // overwrites/deletions).
+ score = v->files_[level].size() /
+ static_cast<double>(config::kL0_CompactionTrigger);
+ } else {
+ // Compute the ratio of current size to size limit.
+ const uint64_t level_bytes = TotalFileSize(v->files_[level]);
+ score =
+ static_cast<double>(level_bytes) / MaxBytesForLevel(options_, level);
+ }
+
+ if (score > best_score) {
+ best_level = level;
+ best_score = score;
+ }
+ }
+
+ v->compaction_level_ = best_level;
+ v->compaction_score_ = best_score;
+}
+
+Status VersionSet::WriteSnapshot(log::Writer* log) {
+ // TODO: Break up into multiple records to reduce memory usage on recovery?
+
+ // Save metadata
+ VersionEdit edit;
+ edit.SetComparatorName(icmp_.user_comparator()->Name());
+
+ // Save compaction pointers
+ for (int level = 0; level < config::kNumLevels; level++) {
+ if (!compact_pointer_[level].empty()) {
+ InternalKey key;
+ key.DecodeFrom(compact_pointer_[level]);
+ edit.SetCompactPointer(level, key);
+ }
+ }
+
+ // Save files
+ for (int level = 0; level < config::kNumLevels; level++) {
+ const std::vector<FileMetaData*>& files = current_->files_[level];
+ for (size_t i = 0; i < files.size(); i++) {
+ const FileMetaData* f = files[i];
+ edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest);
+ }
+ }
+
+ std::string record;
+ edit.EncodeTo(&record);
+ return log->AddRecord(record);
+}
+
+int VersionSet::NumLevelFiles(int level) const {
+ assert(level >= 0);
+ assert(level < config::kNumLevels);
+ return current_->files_[level].size();
+}
+
+const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const {
+ // Update code if kNumLevels changes
+ static_assert(config::kNumLevels == 7, "");
+ snprintf(scratch->buffer, sizeof(scratch->buffer),
+ "files[ %d %d %d %d %d %d %d ]", int(current_->files_[0].size()),
+ int(current_->files_[1].size()), int(current_->files_[2].size()),
+ int(current_->files_[3].size()), int(current_->files_[4].size()),
+ int(current_->files_[5].size()), int(current_->files_[6].size()));
+ return scratch->buffer;
+}
+
+uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
+ uint64_t result = 0;
+ for (int level = 0; level < config::kNumLevels; level++) {
+ const std::vector<FileMetaData*>& files = v->files_[level];
+ for (size_t i = 0; i < files.size(); i++) {
+ if (icmp_.Compare(files[i]->largest, ikey) <= 0) {
+ // Entire file is before "ikey", so just add the file size
+ result += files[i]->file_size;
+ } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) {
+ // Entire file is after "ikey", so ignore
+ if (level > 0) {
+ // Files other than level 0 are sorted by meta->smallest, so
+ // no further files in this level will contain data for
+ // "ikey".
+ break;
+ }
+ } else {
+ // "ikey" falls in the range for this table. Add the
+ // approximate offset of "ikey" within the table.
+ Table* tableptr;
+ Iterator* iter = table_cache_->NewIterator(
+ ReadOptions(), files[i]->number, files[i]->file_size, &tableptr);
+ if (tableptr != nullptr) {
+ result += tableptr->ApproximateOffsetOf(ikey.Encode());
+ }
+ delete iter;
+ }
+ }
+ }
+ return result;
+}
+
+void VersionSet::AddLiveFiles(std::set<uint64_t>* live) {
+ for (Version* v = dummy_versions_.next_; v != &dummy_versions_;
+ v = v->next_) {
+ for (int level = 0; level < config::kNumLevels; level++) {
+ const std::vector<FileMetaData*>& files = v->files_[level];
+ for (size_t i = 0; i < files.size(); i++) {
+ live->insert(files[i]->number);
+ }
+ }
+ }
+}
+
+int64_t VersionSet::NumLevelBytes(int level) const {
+ assert(level >= 0);
+ assert(level < config::kNumLevels);
+ return TotalFileSize(current_->files_[level]);
+}
+
+int64_t VersionSet::MaxNextLevelOverlappingBytes() {
+ int64_t result = 0;
+ std::vector<FileMetaData*> overlaps;
+ for (int level = 1; level < config::kNumLevels - 1; level++) {
+ for (size_t i = 0; i < current_->files_[level].size(); i++) {
+ const FileMetaData* f = current_->files_[level][i];
+ current_->GetOverlappingInputs(level + 1, &f->smallest, &f->largest,
+ &overlaps);
+ const int64_t sum = TotalFileSize(overlaps);
+ if (sum > result) {
+ result = sum;
+ }
+ }
+ }
+ return result;
+}
+
+// Stores the minimal range that covers all entries in inputs in
+// *smallest, *largest.
+// REQUIRES: inputs is not empty
+void VersionSet::GetRange(const std::vector<FileMetaData*>& inputs,
+ InternalKey* smallest, InternalKey* largest) {
+ assert(!inputs.empty());
+ smallest->Clear();
+ largest->Clear();
+ for (size_t i = 0; i < inputs.size(); i++) {
+ FileMetaData* f = inputs[i];
+ if (i == 0) {
+ *smallest = f->smallest;
+ *largest = f->largest;
+ } else {
+ if (icmp_.Compare(f->smallest, *smallest) < 0) {
+ *smallest = f->smallest;
+ }
+ if (icmp_.Compare(f->largest, *largest) > 0) {
+ *largest = f->largest;
+ }
+ }
+ }
+}
+
+// Stores the minimal range that covers all entries in inputs1 and inputs2
+// in *smallest, *largest.
+// REQUIRES: inputs is not empty
+void VersionSet::GetRange2(const std::vector<FileMetaData*>& inputs1,
+ const std::vector<FileMetaData*>& inputs2,
+ InternalKey* smallest, InternalKey* largest) {
+ std::vector<FileMetaData*> all = inputs1;
+ all.insert(all.end(), inputs2.begin(), inputs2.end());
+ GetRange(all, smallest, largest);
+}
+
+Iterator* VersionSet::MakeInputIterator(Compaction* c) {
+ ReadOptions options;
+ options.verify_checksums = options_->paranoid_checks;
+ options.fill_cache = false;
+
+ // Level-0 files have to be merged together. For other levels,
+ // we will make a concatenating iterator per level.
+ // TODO(opt): use concatenating iterator for level-0 if there is no overlap
+ const int space = (c->level() == 0 ? c->inputs_[0].size() + 1 : 2);
+ Iterator** list = new Iterator*[space];
+ int num = 0;
+ for (int which = 0; which < 2; which++) {
+ if (!c->inputs_[which].empty()) {
+ if (c->level() + which == 0) {
+ const std::vector<FileMetaData*>& files = c->inputs_[which];
+ for (size_t i = 0; i < files.size(); i++) {
+ list[num++] = table_cache_->NewIterator(options, files[i]->number,
+ files[i]->file_size);
+ }
+ } else {
+ // Create concatenating iterator for the files from this level
+ list[num++] = NewTwoLevelIterator(
+ new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]),
+ &GetFileIterator, table_cache_, options);
+ }
+ }
+ }
+ assert(num <= space);
+ Iterator* result = NewMergingIterator(&icmp_, list, num);
+ delete[] list;
+ return result;
+}
+
+Compaction* VersionSet::PickCompaction() {
+ Compaction* c;
+ int level;
+
+ // We prefer compactions triggered by too much data in a level over
+ // the compactions triggered by seeks.
+ const bool size_compaction = (current_->compaction_score_ >= 1);
+ const bool seek_compaction = (current_->file_to_compact_ != nullptr);
+ if (size_compaction) {
+ level = current_->compaction_level_;
+ assert(level >= 0);
+ assert(level + 1 < config::kNumLevels);
+ c = new Compaction(options_, level);
+
+ // Pick the first file that comes after compact_pointer_[level]
+ for (size_t i = 0; i < current_->files_[level].size(); i++) {
+ FileMetaData* f = current_->files_[level][i];
+ if (compact_pointer_[level].empty() ||
+ icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) {
+ c->inputs_[0].push_back(f);
+ break;
+ }
+ }
+ if (c->inputs_[0].empty()) {
+ // Wrap-around to the beginning of the key space
+ c->inputs_[0].push_back(current_->files_[level][0]);
+ }
+ } else if (seek_compaction) {
+ level = current_->file_to_compact_level_;
+ c = new Compaction(options_, level);
+ c->inputs_[0].push_back(current_->file_to_compact_);
+ } else {
+ return nullptr;
+ }
+
+ c->input_version_ = current_;
+ c->input_version_->Ref();
+
+ // Files in level 0 may overlap each other, so pick up all overlapping ones
+ if (level == 0) {
+ InternalKey smallest, largest;
+ GetRange(c->inputs_[0], &smallest, &largest);
+ // Note that the next call will discard the file we placed in
+ // c->inputs_[0] earlier and replace it with an overlapping set
+ // which will include the picked file.
+ current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]);
+ assert(!c->inputs_[0].empty());
+ }
+
+ SetupOtherInputs(c);
+
+ return c;
+}
+
+// Finds the largest key in a vector of files. Returns true if files it not
+// empty.
+bool FindLargestKey(const InternalKeyComparator& icmp,
+ const std::vector<FileMetaData*>& files,
+ InternalKey* largest_key) {
+ if (files.empty()) {
+ return false;
+ }
+ *largest_key = files[0]->largest;
+ for (size_t i = 1; i < files.size(); ++i) {
+ FileMetaData* f = files[i];
+ if (icmp.Compare(f->largest, *largest_key) > 0) {
+ *largest_key = f->largest;
+ }
+ }
+ return true;
+}
+
+// Finds minimum file b2=(l2, u2) in level file for which l2 > u1 and
+// user_key(l2) = user_key(u1)
+FileMetaData* FindSmallestBoundaryFile(
+ const InternalKeyComparator& icmp,
+ const std::vector<FileMetaData*>& level_files,
+ const InternalKey& largest_key) {
+ const Comparator* user_cmp = icmp.user_comparator();
+ FileMetaData* smallest_boundary_file = nullptr;
+ for (size_t i = 0; i < level_files.size(); ++i) {
+ FileMetaData* f = level_files[i];
+ if (icmp.Compare(f->smallest, largest_key) > 0 &&
+ user_cmp->Compare(f->smallest.user_key(), largest_key.user_key()) ==
+ 0) {
+ if (smallest_boundary_file == nullptr ||
+ icmp.Compare(f->smallest, smallest_boundary_file->smallest) < 0) {
+ smallest_boundary_file = f;
+ }
+ }
+ }
+ return smallest_boundary_file;
+}
+
+// Extracts the largest file b1 from |compaction_files| and then searches for a
+// b2 in |level_files| for which user_key(u1) = user_key(l2). If it finds such a
+// file b2 (known as a boundary file) it adds it to |compaction_files| and then
+// searches again using this new upper bound.
+//
+// If there are two blocks, b1=(l1, u1) and b2=(l2, u2) and
+// user_key(u1) = user_key(l2), and if we compact b1 but not b2 then a
+// subsequent get operation will yield an incorrect result because it will
+// return the record from b2 in level i rather than from b1 because it searches
+// level by level for records matching the supplied user key.
+//
+// parameters:
+// in level_files: List of files to search for boundary files.
+// in/out compaction_files: List of files to extend by adding boundary files.
+void AddBoundaryInputs(const InternalKeyComparator& icmp,
+ const std::vector<FileMetaData*>& level_files,
+ std::vector<FileMetaData*>* compaction_files) {
+ InternalKey largest_key;
+
+ // Quick return if compaction_files is empty.
+ if (!FindLargestKey(icmp, *compaction_files, &largest_key)) {
+ return;
+ }
+
+ bool continue_searching = true;
+ while (continue_searching) {
+ FileMetaData* smallest_boundary_file =
+ FindSmallestBoundaryFile(icmp, level_files, largest_key);
+
+ // If a boundary file was found advance largest_key, otherwise we're done.
+ if (smallest_boundary_file != NULL) {
+ compaction_files->push_back(smallest_boundary_file);
+ largest_key = smallest_boundary_file->largest;
+ } else {
+ continue_searching = false;
+ }
+ }
+}
+
+void VersionSet::SetupOtherInputs(Compaction* c) {
+ const int level = c->level();
+ InternalKey smallest, largest;
+
+ AddBoundaryInputs(icmp_, current_->files_[level], &c->inputs_[0]);
+ GetRange(c->inputs_[0], &smallest, &largest);
+
+ current_->GetOverlappingInputs(level + 1, &smallest, &largest,
+ &c->inputs_[1]);
+
+ // Get entire range covered by compaction
+ InternalKey all_start, all_limit;
+ GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+
+ // See if we can grow the number of inputs in "level" without
+ // changing the number of "level+1" files we pick up.
+ if (!c->inputs_[1].empty()) {
+ std::vector<FileMetaData*> expanded0;
+ current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0);
+ AddBoundaryInputs(icmp_, current_->files_[level], &expanded0);
+ const int64_t inputs0_size = TotalFileSize(c->inputs_[0]);
+ const int64_t inputs1_size = TotalFileSize(c->inputs_[1]);
+ const int64_t expanded0_size = TotalFileSize(expanded0);
+ if (expanded0.size() > c->inputs_[0].size() &&
+ inputs1_size + expanded0_size <
+ ExpandedCompactionByteSizeLimit(options_)) {
+ InternalKey new_start, new_limit;
+ GetRange(expanded0, &new_start, &new_limit);
+ std::vector<FileMetaData*> expanded1;
+ current_->GetOverlappingInputs(level + 1, &new_start, &new_limit,
+ &expanded1);
+ if (expanded1.size() == c->inputs_[1].size()) {
+ Log(options_->info_log,
+ "Expanding@%d %d+%d (%ld+%ld bytes) to %d+%d (%ld+%ld bytes)\n",
+ level, int(c->inputs_[0].size()), int(c->inputs_[1].size()),
+ long(inputs0_size), long(inputs1_size), int(expanded0.size()),
+ int(expanded1.size()), long(expanded0_size), long(inputs1_size));
+ smallest = new_start;
+ largest = new_limit;
+ c->inputs_[0] = expanded0;
+ c->inputs_[1] = expanded1;
+ GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+ }
+ }
+ }
+
+ // Compute the set of grandparent files that overlap this compaction
+ // (parent == level+1; grandparent == level+2)
+ if (level + 2 < config::kNumLevels) {
+ current_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
+ &c->grandparents_);
+ }
+
+ // Update the place where we will do the next compaction for this level.
+ // We update this immediately instead of waiting for the VersionEdit
+ // to be applied so that if the compaction fails, we will try a different
+ // key range next time.
+ compact_pointer_[level] = largest.Encode().ToString();
+ c->edit_.SetCompactPointer(level, largest);
+}
+
+Compaction* VersionSet::CompactRange(int level, const InternalKey* begin,
+ const InternalKey* end) {
+ std::vector<FileMetaData*> inputs;
+ current_->GetOverlappingInputs(level, begin, end, &inputs);
+ if (inputs.empty()) {
+ return nullptr;
+ }
+
+ // Avoid compacting too much in one shot in case the range is large.
+ // But we cannot do this for level-0 since level-0 files can overlap
+ // and we must not pick one file and drop another older file if the
+ // two files overlap.
+ if (level > 0) {
+ const uint64_t limit = MaxFileSizeForLevel(options_, level);
+ uint64_t total = 0;
+ for (size_t i = 0; i < inputs.size(); i++) {
+ uint64_t s = inputs[i]->file_size;
+ total += s;
+ if (total >= limit) {
+ inputs.resize(i + 1);
+ break;
+ }
+ }
+ }
+
+ Compaction* c = new Compaction(options_, level);
+ c->input_version_ = current_;
+ c->input_version_->Ref();
+ c->inputs_[0] = inputs;
+ SetupOtherInputs(c);
+ return c;
+}
+
+Compaction::Compaction(const Options* options, int level)
+ : level_(level),
+ max_output_file_size_(MaxFileSizeForLevel(options, level)),
+ input_version_(nullptr),
+ grandparent_index_(0),
+ seen_key_(false),
+ overlapped_bytes_(0) {
+ for (int i = 0; i < config::kNumLevels; i++) {
+ level_ptrs_[i] = 0;
+ }
+}
+
+Compaction::~Compaction() {
+ if (input_version_ != nullptr) {
+ input_version_->Unref();
+ }
+}
+
+bool Compaction::IsTrivialMove() const {
+ const VersionSet* vset = input_version_->vset_;
+ // Avoid a move if there is lots of overlapping grandparent data.
+ // Otherwise, the move could create a parent file that will require
+ // a very expensive merge later on.
+ return (num_input_files(0) == 1 && num_input_files(1) == 0 &&
+ TotalFileSize(grandparents_) <=
+ MaxGrandParentOverlapBytes(vset->options_));
+}
+
+void Compaction::AddInputDeletions(VersionEdit* edit) {
+ for (int which = 0; which < 2; which++) {
+ for (size_t i = 0; i < inputs_[which].size(); i++) {
+ edit->DeleteFile(level_ + which, inputs_[which][i]->number);
+ }
+ }
+}
+
+bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
+ // Maybe use binary search to find right entry instead of linear search?
+ const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator();
+ for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) {
+ const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
+ while (level_ptrs_[lvl] < files.size()) {
+ FileMetaData* f = files[level_ptrs_[lvl]];
+ if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
+ // We've advanced far enough
+ if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
+ // Key falls in this file's range, so definitely not base level
+ return false;
+ }
+ break;
+ }
+ level_ptrs_[lvl]++;
+ }
+ }
+ return true;
+}
+
+bool Compaction::ShouldStopBefore(const Slice& internal_key) {
+ const VersionSet* vset = input_version_->vset_;
+ // Scan to find earliest grandparent file that contains key.
+ const InternalKeyComparator* icmp = &vset->icmp_;
+ while (grandparent_index_ < grandparents_.size() &&
+ icmp->Compare(internal_key,
+ grandparents_[grandparent_index_]->largest.Encode()) >
+ 0) {
+ if (seen_key_) {
+ overlapped_bytes_ += grandparents_[grandparent_index_]->file_size;
+ }
+ grandparent_index_++;
+ }
+ seen_key_ = true;
+
+ if (overlapped_bytes_ > MaxGrandParentOverlapBytes(vset->options_)) {
+ // Too much overlap for current output; start new output
+ overlapped_bytes_ = 0;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+void Compaction::ReleaseInputs() {
+ if (input_version_ != nullptr) {
+ input_version_->Unref();
+ input_version_ = nullptr;
+ }
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/version_set.h b/src/leveldb/db/version_set.h
new file mode 100644
index 0000000000..69f3d70133
--- /dev/null
+++ b/src/leveldb/db/version_set.h
@@ -0,0 +1,393 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The representation of a DBImpl consists of a set of Versions. The
+// newest version is called "current". Older versions may be kept
+// around to provide a consistent view to live iterators.
+//
+// Each Version keeps track of a set of Table files per level. The
+// entire set of versions is maintained in a VersionSet.
+//
+// Version,VersionSet are thread-compatible, but require external
+// synchronization on all accesses.
+
+#ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_
+#define STORAGE_LEVELDB_DB_VERSION_SET_H_
+
+#include <map>
+#include <set>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/version_edit.h"
+#include "port/port.h"
+#include "port/thread_annotations.h"
+
+namespace leveldb {
+
+namespace log {
+class Writer;
+}
+
+class Compaction;
+class Iterator;
+class MemTable;
+class TableBuilder;
+class TableCache;
+class Version;
+class VersionSet;
+class WritableFile;
+
+// Return the smallest index i such that files[i]->largest >= key.
+// Return files.size() if there is no such file.
+// REQUIRES: "files" contains a sorted list of non-overlapping files.
+int FindFile(const InternalKeyComparator& icmp,
+ const std::vector<FileMetaData*>& files, const Slice& key);
+
+// Returns true iff some file in "files" overlaps the user key range
+// [*smallest,*largest].
+// smallest==nullptr represents a key smaller than all keys in the DB.
+// largest==nullptr represents a key largest than all keys in the DB.
+// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges
+// in sorted order.
+bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
+ bool disjoint_sorted_files,
+ const std::vector<FileMetaData*>& files,
+ const Slice* smallest_user_key,
+ const Slice* largest_user_key);
+
+class Version {
+ public:
+ // Lookup the value for key. If found, store it in *val and
+ // return OK. Else return a non-OK status. Fills *stats.
+ // REQUIRES: lock is not held
+ struct GetStats {
+ FileMetaData* seek_file;
+ int seek_file_level;
+ };
+
+ // Append to *iters a sequence of iterators that will
+ // yield the contents of this Version when merged together.
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ void AddIterators(const ReadOptions&, std::vector<Iterator*>* iters);
+
+ Status Get(const ReadOptions&, const LookupKey& key, std::string* val,
+ GetStats* stats);
+
+ // Adds "stats" into the current state. Returns true if a new
+ // compaction may need to be triggered, false otherwise.
+ // REQUIRES: lock is held
+ bool UpdateStats(const GetStats& stats);
+
+ // Record a sample of bytes read at the specified internal key.
+ // Samples are taken approximately once every config::kReadBytesPeriod
+ // bytes. Returns true if a new compaction may need to be triggered.
+ // REQUIRES: lock is held
+ bool RecordReadSample(Slice key);
+
+ // Reference count management (so Versions do not disappear out from
+ // under live iterators)
+ void Ref();
+ void Unref();
+
+ void GetOverlappingInputs(
+ int level,
+ const InternalKey* begin, // nullptr means before all keys
+ const InternalKey* end, // nullptr means after all keys
+ std::vector<FileMetaData*>* inputs);
+
+ // Returns true iff some file in the specified level overlaps
+ // some part of [*smallest_user_key,*largest_user_key].
+ // smallest_user_key==nullptr represents a key smaller than all the DB's keys.
+ // largest_user_key==nullptr represents a key largest than all the DB's keys.
+ bool OverlapInLevel(int level, const Slice* smallest_user_key,
+ const Slice* largest_user_key);
+
+ // Return the level at which we should place a new memtable compaction
+ // result that covers the range [smallest_user_key,largest_user_key].
+ int PickLevelForMemTableOutput(const Slice& smallest_user_key,
+ const Slice& largest_user_key);
+
+ int NumFiles(int level) const { return files_[level].size(); }
+
+ // Return a human readable string that describes this version's contents.
+ std::string DebugString() const;
+
+ private:
+ friend class Compaction;
+ friend class VersionSet;
+
+ class LevelFileNumIterator;
+
+ explicit Version(VersionSet* vset)
+ : vset_(vset),
+ next_(this),
+ prev_(this),
+ refs_(0),
+ file_to_compact_(nullptr),
+ file_to_compact_level_(-1),
+ compaction_score_(-1),
+ compaction_level_(-1) {}
+
+ Version(const Version&) = delete;
+ Version& operator=(const Version&) = delete;
+
+ ~Version();
+
+ Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;
+
+ // Call func(arg, level, f) for every file that overlaps user_key in
+ // order from newest to oldest. If an invocation of func returns
+ // false, makes no more calls.
+ //
+ // REQUIRES: user portion of internal_key == user_key.
+ void ForEachOverlapping(Slice user_key, Slice internal_key, void* arg,
+ bool (*func)(void*, int, FileMetaData*));
+
+ VersionSet* vset_; // VersionSet to which this Version belongs
+ Version* next_; // Next version in linked list
+ Version* prev_; // Previous version in linked list
+ int refs_; // Number of live refs to this version
+
+ // List of files per level
+ std::vector<FileMetaData*> files_[config::kNumLevels];
+
+ // Next file to compact based on seek stats.
+ FileMetaData* file_to_compact_;
+ int file_to_compact_level_;
+
+ // Level that should be compacted next and its compaction score.
+ // Score < 1 means compaction is not strictly needed. These fields
+ // are initialized by Finalize().
+ double compaction_score_;
+ int compaction_level_;
+};
+
+class VersionSet {
+ public:
+ VersionSet(const std::string& dbname, const Options* options,
+ TableCache* table_cache, const InternalKeyComparator*);
+ VersionSet(const VersionSet&) = delete;
+ VersionSet& operator=(const VersionSet&) = delete;
+
+ ~VersionSet();
+
+ // Apply *edit to the current version to form a new descriptor that
+ // is both saved to persistent state and installed as the new
+ // current version. Will release *mu while actually writing to the file.
+ // REQUIRES: *mu is held on entry.
+ // REQUIRES: no other thread concurrently calls LogAndApply()
+ Status LogAndApply(VersionEdit* edit, port::Mutex* mu)
+ EXCLUSIVE_LOCKS_REQUIRED(mu);
+
+ // Recover the last saved descriptor from persistent storage.
+ Status Recover(bool* save_manifest);
+
+ // Return the current version.
+ Version* current() const { return current_; }
+
+ // Return the current manifest file number
+ uint64_t ManifestFileNumber() const { return manifest_file_number_; }
+
+ // Allocate and return a new file number
+ uint64_t NewFileNumber() { return next_file_number_++; }
+
+ // Arrange to reuse "file_number" unless a newer file number has
+ // already been allocated.
+ // REQUIRES: "file_number" was returned by a call to NewFileNumber().
+ void ReuseFileNumber(uint64_t file_number) {
+ if (next_file_number_ == file_number + 1) {
+ next_file_number_ = file_number;
+ }
+ }
+
+ // Return the number of Table files at the specified level.
+ int NumLevelFiles(int level) const;
+
+ // Return the combined file size of all files at the specified level.
+ int64_t NumLevelBytes(int level) const;
+
+ // Return the last sequence number.
+ uint64_t LastSequence() const { return last_sequence_; }
+
+ // Set the last sequence number to s.
+ void SetLastSequence(uint64_t s) {
+ assert(s >= last_sequence_);
+ last_sequence_ = s;
+ }
+
+ // Mark the specified file number as used.
+ void MarkFileNumberUsed(uint64_t number);
+
+ // Return the current log file number.
+ uint64_t LogNumber() const { return log_number_; }
+
+ // Return the log file number for the log file that is currently
+ // being compacted, or zero if there is no such log file.
+ uint64_t PrevLogNumber() const { return prev_log_number_; }
+
+ // Pick level and inputs for a new compaction.
+ // Returns nullptr if there is no compaction to be done.
+ // Otherwise returns a pointer to a heap-allocated object that
+ // describes the compaction. Caller should delete the result.
+ Compaction* PickCompaction();
+
+ // Return a compaction object for compacting the range [begin,end] in
+ // the specified level. Returns nullptr if there is nothing in that
+ // level that overlaps the specified range. Caller should delete
+ // the result.
+ Compaction* CompactRange(int level, const InternalKey* begin,
+ const InternalKey* end);
+
+ // Return the maximum overlapping data (in bytes) at next level for any
+ // file at a level >= 1.
+ int64_t MaxNextLevelOverlappingBytes();
+
+ // Create an iterator that reads over the compaction inputs for "*c".
+ // The caller should delete the iterator when no longer needed.
+ Iterator* MakeInputIterator(Compaction* c);
+
+ // Returns true iff some level needs a compaction.
+ bool NeedsCompaction() const {
+ Version* v = current_;
+ return (v->compaction_score_ >= 1) || (v->file_to_compact_ != nullptr);
+ }
+
+ // Add all files listed in any live version to *live.
+ // May also mutate some internal state.
+ void AddLiveFiles(std::set<uint64_t>* live);
+
+ // Return the approximate offset in the database of the data for
+ // "key" as of version "v".
+ uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
+
+ // Return a human-readable short (single-line) summary of the number
+ // of files per level. Uses *scratch as backing store.
+ struct LevelSummaryStorage {
+ char buffer[100];
+ };
+ const char* LevelSummary(LevelSummaryStorage* scratch) const;
+
+ private:
+ class Builder;
+
+ friend class Compaction;
+ friend class Version;
+
+ bool ReuseManifest(const std::string& dscname, const std::string& dscbase);
+
+ void Finalize(Version* v);
+
+ void GetRange(const std::vector<FileMetaData*>& inputs, InternalKey* smallest,
+ InternalKey* largest);
+
+ void GetRange2(const std::vector<FileMetaData*>& inputs1,
+ const std::vector<FileMetaData*>& inputs2,
+ InternalKey* smallest, InternalKey* largest);
+
+ void SetupOtherInputs(Compaction* c);
+
+ // Save current contents to *log
+ Status WriteSnapshot(log::Writer* log);
+
+ void AppendVersion(Version* v);
+
+ Env* const env_;
+ const std::string dbname_;
+ const Options* const options_;
+ TableCache* const table_cache_;
+ const InternalKeyComparator icmp_;
+ uint64_t next_file_number_;
+ uint64_t manifest_file_number_;
+ uint64_t last_sequence_;
+ uint64_t log_number_;
+ uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
+
+ // Opened lazily
+ WritableFile* descriptor_file_;
+ log::Writer* descriptor_log_;
+ Version dummy_versions_; // Head of circular doubly-linked list of versions.
+ Version* current_; // == dummy_versions_.prev_
+
+ // Per-level key at which the next compaction at that level should start.
+ // Either an empty string, or a valid InternalKey.
+ std::string compact_pointer_[config::kNumLevels];
+};
+
+// A Compaction encapsulates information about a compaction.
+class Compaction {
+ public:
+ ~Compaction();
+
+ // Return the level that is being compacted. Inputs from "level"
+ // and "level+1" will be merged to produce a set of "level+1" files.
+ int level() const { return level_; }
+
+ // Return the object that holds the edits to the descriptor done
+ // by this compaction.
+ VersionEdit* edit() { return &edit_; }
+
+ // "which" must be either 0 or 1
+ int num_input_files(int which) const { return inputs_[which].size(); }
+
+ // Return the ith input file at "level()+which" ("which" must be 0 or 1).
+ FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
+
+ // Maximum size of files to build during this compaction.
+ uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
+
+ // Is this a trivial compaction that can be implemented by just
+ // moving a single input file to the next level (no merging or splitting)
+ bool IsTrivialMove() const;
+
+ // Add all inputs to this compaction as delete operations to *edit.
+ void AddInputDeletions(VersionEdit* edit);
+
+ // Returns true if the information we have available guarantees that
+ // the compaction is producing data in "level+1" for which no data exists
+ // in levels greater than "level+1".
+ bool IsBaseLevelForKey(const Slice& user_key);
+
+ // Returns true iff we should stop building the current output
+ // before processing "internal_key".
+ bool ShouldStopBefore(const Slice& internal_key);
+
+ // Release the input version for the compaction, once the compaction
+ // is successful.
+ void ReleaseInputs();
+
+ private:
+ friend class Version;
+ friend class VersionSet;
+
+ Compaction(const Options* options, int level);
+
+ int level_;
+ uint64_t max_output_file_size_;
+ Version* input_version_;
+ VersionEdit edit_;
+
+ // Each compaction reads inputs from "level_" and "level_+1"
+ std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs
+
+ // State used to check for number of overlapping grandparent files
+ // (parent == level_ + 1, grandparent == level_ + 2)
+ std::vector<FileMetaData*> grandparents_;
+ size_t grandparent_index_; // Index in grandparent_starts_
+ bool seen_key_; // Some output key has been seen
+ int64_t overlapped_bytes_; // Bytes of overlap between current output
+ // and grandparent files
+
+ // State for implementing IsBaseLevelForKey
+
+ // level_ptrs_ holds indices into input_version_->levels_: our state
+ // is that we are positioned at one of the file ranges for each
+ // higher level than the ones involved in this compaction (i.e. for
+ // all L >= level_ + 2).
+ size_t level_ptrs_[config::kNumLevels];
+};
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_VERSION_SET_H_
diff --git a/src/leveldb/db/version_set_test.cc b/src/leveldb/db/version_set_test.cc
new file mode 100644
index 0000000000..c1056a1e7d
--- /dev/null
+++ b/src/leveldb/db/version_set_test.cc
@@ -0,0 +1,332 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+class FindFileTest {
+ public:
+ FindFileTest() : disjoint_sorted_files_(true) {}
+
+ ~FindFileTest() {
+ for (int i = 0; i < files_.size(); i++) {
+ delete files_[i];
+ }
+ }
+
+ void Add(const char* smallest, const char* largest,
+ SequenceNumber smallest_seq = 100,
+ SequenceNumber largest_seq = 100) {
+ FileMetaData* f = new FileMetaData;
+ f->number = files_.size() + 1;
+ f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
+ f->largest = InternalKey(largest, largest_seq, kTypeValue);
+ files_.push_back(f);
+ }
+
+ int Find(const char* key) {
+ InternalKey target(key, 100, kTypeValue);
+ InternalKeyComparator cmp(BytewiseComparator());
+ return FindFile(cmp, files_, target.Encode());
+ }
+
+ bool Overlaps(const char* smallest, const char* largest) {
+ InternalKeyComparator cmp(BytewiseComparator());
+ Slice s(smallest != nullptr ? smallest : "");
+ Slice l(largest != nullptr ? largest : "");
+ return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_,
+ (smallest != nullptr ? &s : nullptr),
+ (largest != nullptr ? &l : nullptr));
+ }
+
+ bool disjoint_sorted_files_;
+
+ private:
+ std::vector<FileMetaData*> files_;
+};
+
+TEST(FindFileTest, Empty) {
+ ASSERT_EQ(0, Find("foo"));
+ ASSERT_TRUE(!Overlaps("a", "z"));
+ ASSERT_TRUE(!Overlaps(nullptr, "z"));
+ ASSERT_TRUE(!Overlaps("a", nullptr));
+ ASSERT_TRUE(!Overlaps(nullptr, nullptr));
+}
+
+TEST(FindFileTest, Single) {
+ Add("p", "q");
+ ASSERT_EQ(0, Find("a"));
+ ASSERT_EQ(0, Find("p"));
+ ASSERT_EQ(0, Find("p1"));
+ ASSERT_EQ(0, Find("q"));
+ ASSERT_EQ(1, Find("q1"));
+ ASSERT_EQ(1, Find("z"));
+
+ ASSERT_TRUE(!Overlaps("a", "b"));
+ ASSERT_TRUE(!Overlaps("z1", "z2"));
+ ASSERT_TRUE(Overlaps("a", "p"));
+ ASSERT_TRUE(Overlaps("a", "q"));
+ ASSERT_TRUE(Overlaps("a", "z"));
+ ASSERT_TRUE(Overlaps("p", "p1"));
+ ASSERT_TRUE(Overlaps("p", "q"));
+ ASSERT_TRUE(Overlaps("p", "z"));
+ ASSERT_TRUE(Overlaps("p1", "p2"));
+ ASSERT_TRUE(Overlaps("p1", "z"));
+ ASSERT_TRUE(Overlaps("q", "q"));
+ ASSERT_TRUE(Overlaps("q", "q1"));
+
+ ASSERT_TRUE(!Overlaps(nullptr, "j"));
+ ASSERT_TRUE(!Overlaps("r", nullptr));
+ ASSERT_TRUE(Overlaps(nullptr, "p"));
+ ASSERT_TRUE(Overlaps(nullptr, "p1"));
+ ASSERT_TRUE(Overlaps("q", nullptr));
+ ASSERT_TRUE(Overlaps(nullptr, nullptr));
+}
+
+TEST(FindFileTest, Multiple) {
+ Add("150", "200");
+ Add("200", "250");
+ Add("300", "350");
+ Add("400", "450");
+ ASSERT_EQ(0, Find("100"));
+ ASSERT_EQ(0, Find("150"));
+ ASSERT_EQ(0, Find("151"));
+ ASSERT_EQ(0, Find("199"));
+ ASSERT_EQ(0, Find("200"));
+ ASSERT_EQ(1, Find("201"));
+ ASSERT_EQ(1, Find("249"));
+ ASSERT_EQ(1, Find("250"));
+ ASSERT_EQ(2, Find("251"));
+ ASSERT_EQ(2, Find("299"));
+ ASSERT_EQ(2, Find("300"));
+ ASSERT_EQ(2, Find("349"));
+ ASSERT_EQ(2, Find("350"));
+ ASSERT_EQ(3, Find("351"));
+ ASSERT_EQ(3, Find("400"));
+ ASSERT_EQ(3, Find("450"));
+ ASSERT_EQ(4, Find("451"));
+
+ ASSERT_TRUE(!Overlaps("100", "149"));
+ ASSERT_TRUE(!Overlaps("251", "299"));
+ ASSERT_TRUE(!Overlaps("451", "500"));
+ ASSERT_TRUE(!Overlaps("351", "399"));
+
+ ASSERT_TRUE(Overlaps("100", "150"));
+ ASSERT_TRUE(Overlaps("100", "200"));
+ ASSERT_TRUE(Overlaps("100", "300"));
+ ASSERT_TRUE(Overlaps("100", "400"));
+ ASSERT_TRUE(Overlaps("100", "500"));
+ ASSERT_TRUE(Overlaps("375", "400"));
+ ASSERT_TRUE(Overlaps("450", "450"));
+ ASSERT_TRUE(Overlaps("450", "500"));
+}
+
+TEST(FindFileTest, MultipleNullBoundaries) {
+ Add("150", "200");
+ Add("200", "250");
+ Add("300", "350");
+ Add("400", "450");
+ ASSERT_TRUE(!Overlaps(nullptr, "149"));
+ ASSERT_TRUE(!Overlaps("451", nullptr));
+ ASSERT_TRUE(Overlaps(nullptr, nullptr));
+ ASSERT_TRUE(Overlaps(nullptr, "150"));
+ ASSERT_TRUE(Overlaps(nullptr, "199"));
+ ASSERT_TRUE(Overlaps(nullptr, "200"));
+ ASSERT_TRUE(Overlaps(nullptr, "201"));
+ ASSERT_TRUE(Overlaps(nullptr, "400"));
+ ASSERT_TRUE(Overlaps(nullptr, "800"));
+ ASSERT_TRUE(Overlaps("100", nullptr));
+ ASSERT_TRUE(Overlaps("200", nullptr));
+ ASSERT_TRUE(Overlaps("449", nullptr));
+ ASSERT_TRUE(Overlaps("450", nullptr));
+}
+
+TEST(FindFileTest, OverlapSequenceChecks) {
+ Add("200", "200", 5000, 3000);
+ ASSERT_TRUE(!Overlaps("199", "199"));
+ ASSERT_TRUE(!Overlaps("201", "300"));
+ ASSERT_TRUE(Overlaps("200", "200"));
+ ASSERT_TRUE(Overlaps("190", "200"));
+ ASSERT_TRUE(Overlaps("200", "210"));
+}
+
+TEST(FindFileTest, OverlappingFiles) {
+ Add("150", "600");
+ Add("400", "500");
+ disjoint_sorted_files_ = false;
+ ASSERT_TRUE(!Overlaps("100", "149"));
+ ASSERT_TRUE(!Overlaps("601", "700"));
+ ASSERT_TRUE(Overlaps("100", "150"));
+ ASSERT_TRUE(Overlaps("100", "200"));
+ ASSERT_TRUE(Overlaps("100", "300"));
+ ASSERT_TRUE(Overlaps("100", "400"));
+ ASSERT_TRUE(Overlaps("100", "500"));
+ ASSERT_TRUE(Overlaps("375", "400"));
+ ASSERT_TRUE(Overlaps("450", "450"));
+ ASSERT_TRUE(Overlaps("450", "500"));
+ ASSERT_TRUE(Overlaps("450", "700"));
+ ASSERT_TRUE(Overlaps("600", "700"));
+}
+
+void AddBoundaryInputs(const InternalKeyComparator& icmp,
+ const std::vector<FileMetaData*>& level_files,
+ std::vector<FileMetaData*>* compaction_files);
+
+class AddBoundaryInputsTest {
+ public:
+ std::vector<FileMetaData*> level_files_;
+ std::vector<FileMetaData*> compaction_files_;
+ std::vector<FileMetaData*> all_files_;
+ InternalKeyComparator icmp_;
+
+ AddBoundaryInputsTest() : icmp_(BytewiseComparator()) {}
+
+ ~AddBoundaryInputsTest() {
+ for (size_t i = 0; i < all_files_.size(); ++i) {
+ delete all_files_[i];
+ }
+ all_files_.clear();
+ }
+
+ FileMetaData* CreateFileMetaData(uint64_t number, InternalKey smallest,
+ InternalKey largest) {
+ FileMetaData* f = new FileMetaData();
+ f->number = number;
+ f->smallest = smallest;
+ f->largest = largest;
+ all_files_.push_back(f);
+ return f;
+ }
+};
+
+TEST(AddBoundaryInputsTest, TestEmptyFileSets) {
+ AddBoundaryInputs(icmp_, level_files_, &compaction_files_);
+ ASSERT_TRUE(compaction_files_.empty());
+ ASSERT_TRUE(level_files_.empty());
+}
+
+TEST(AddBoundaryInputsTest, TestEmptyLevelFiles) {
+ FileMetaData* f1 =
+ CreateFileMetaData(1, InternalKey("100", 2, kTypeValue),
+ InternalKey(InternalKey("100", 1, kTypeValue)));
+ compaction_files_.push_back(f1);
+
+ AddBoundaryInputs(icmp_, level_files_, &compaction_files_);
+ ASSERT_EQ(1, compaction_files_.size());
+ ASSERT_EQ(f1, compaction_files_[0]);
+ ASSERT_TRUE(level_files_.empty());
+}
+
+TEST(AddBoundaryInputsTest, TestEmptyCompactionFiles) {
+ FileMetaData* f1 =
+ CreateFileMetaData(1, InternalKey("100", 2, kTypeValue),
+ InternalKey(InternalKey("100", 1, kTypeValue)));
+ level_files_.push_back(f1);
+
+ AddBoundaryInputs(icmp_, level_files_, &compaction_files_);
+ ASSERT_TRUE(compaction_files_.empty());
+ ASSERT_EQ(1, level_files_.size());
+ ASSERT_EQ(f1, level_files_[0]);
+}
+
+TEST(AddBoundaryInputsTest, TestNoBoundaryFiles) {
+ FileMetaData* f1 =
+ CreateFileMetaData(1, InternalKey("100", 2, kTypeValue),
+ InternalKey(InternalKey("100", 1, kTypeValue)));
+ FileMetaData* f2 =
+ CreateFileMetaData(1, InternalKey("200", 2, kTypeValue),
+ InternalKey(InternalKey("200", 1, kTypeValue)));
+ FileMetaData* f3 =
+ CreateFileMetaData(1, InternalKey("300", 2, kTypeValue),
+ InternalKey(InternalKey("300", 1, kTypeValue)));
+
+ level_files_.push_back(f3);
+ level_files_.push_back(f2);
+ level_files_.push_back(f1);
+ compaction_files_.push_back(f2);
+ compaction_files_.push_back(f3);
+
+ AddBoundaryInputs(icmp_, level_files_, &compaction_files_);
+ ASSERT_EQ(2, compaction_files_.size());
+}
+
+TEST(AddBoundaryInputsTest, TestOneBoundaryFiles) {
+ FileMetaData* f1 =
+ CreateFileMetaData(1, InternalKey("100", 3, kTypeValue),
+ InternalKey(InternalKey("100", 2, kTypeValue)));
+ FileMetaData* f2 =
+ CreateFileMetaData(1, InternalKey("100", 1, kTypeValue),
+ InternalKey(InternalKey("200", 3, kTypeValue)));
+ FileMetaData* f3 =
+ CreateFileMetaData(1, InternalKey("300", 2, kTypeValue),
+ InternalKey(InternalKey("300", 1, kTypeValue)));
+
+ level_files_.push_back(f3);
+ level_files_.push_back(f2);
+ level_files_.push_back(f1);
+ compaction_files_.push_back(f1);
+
+ AddBoundaryInputs(icmp_, level_files_, &compaction_files_);
+ ASSERT_EQ(2, compaction_files_.size());
+ ASSERT_EQ(f1, compaction_files_[0]);
+ ASSERT_EQ(f2, compaction_files_[1]);
+}
+
+TEST(AddBoundaryInputsTest, TestTwoBoundaryFiles) {
+ FileMetaData* f1 =
+ CreateFileMetaData(1, InternalKey("100", 6, kTypeValue),
+ InternalKey(InternalKey("100", 5, kTypeValue)));
+ FileMetaData* f2 =
+ CreateFileMetaData(1, InternalKey("100", 2, kTypeValue),
+ InternalKey(InternalKey("300", 1, kTypeValue)));
+ FileMetaData* f3 =
+ CreateFileMetaData(1, InternalKey("100", 4, kTypeValue),
+ InternalKey(InternalKey("100", 3, kTypeValue)));
+
+ level_files_.push_back(f2);
+ level_files_.push_back(f3);
+ level_files_.push_back(f1);
+ compaction_files_.push_back(f1);
+
+ AddBoundaryInputs(icmp_, level_files_, &compaction_files_);
+ ASSERT_EQ(3, compaction_files_.size());
+ ASSERT_EQ(f1, compaction_files_[0]);
+ ASSERT_EQ(f3, compaction_files_[1]);
+ ASSERT_EQ(f2, compaction_files_[2]);
+}
+
+TEST(AddBoundaryInputsTest, TestDisjoinFilePointers) {
+ FileMetaData* f1 =
+ CreateFileMetaData(1, InternalKey("100", 6, kTypeValue),
+ InternalKey(InternalKey("100", 5, kTypeValue)));
+ FileMetaData* f2 =
+ CreateFileMetaData(1, InternalKey("100", 6, kTypeValue),
+ InternalKey(InternalKey("100", 5, kTypeValue)));
+ FileMetaData* f3 =
+ CreateFileMetaData(1, InternalKey("100", 2, kTypeValue),
+ InternalKey(InternalKey("300", 1, kTypeValue)));
+ FileMetaData* f4 =
+ CreateFileMetaData(1, InternalKey("100", 4, kTypeValue),
+ InternalKey(InternalKey("100", 3, kTypeValue)));
+
+ level_files_.push_back(f2);
+ level_files_.push_back(f3);
+ level_files_.push_back(f4);
+
+ compaction_files_.push_back(f1);
+
+ AddBoundaryInputs(icmp_, level_files_, &compaction_files_);
+ ASSERT_EQ(3, compaction_files_.size());
+ ASSERT_EQ(f1, compaction_files_[0]);
+ ASSERT_EQ(f4, compaction_files_[1]);
+ ASSERT_EQ(f3, compaction_files_[2]);
+}
+
+} // namespace leveldb
+
+int main(int argc, char** argv) { return leveldb::test::RunAllTests(); }
diff --git a/src/leveldb/db/write_batch.cc b/src/leveldb/db/write_batch.cc
new file mode 100644
index 0000000000..b54313c35e
--- /dev/null
+++ b/src/leveldb/db/write_batch.cc
@@ -0,0 +1,150 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch::rep_ :=
+// sequence: fixed64
+// count: fixed32
+// data: record[count]
+// record :=
+// kTypeValue varstring varstring |
+// kTypeDeletion varstring
+// varstring :=
+// len: varint32
+// data: uint8[len]
+
+#include "leveldb/write_batch.h"
+
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/db.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+// WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
+static const size_t kHeader = 12;
+
+WriteBatch::WriteBatch() { Clear(); }
+
+WriteBatch::~WriteBatch() = default;
+
+WriteBatch::Handler::~Handler() = default;
+
+void WriteBatch::Clear() {
+ rep_.clear();
+ rep_.resize(kHeader);
+}
+
+size_t WriteBatch::ApproximateSize() const { return rep_.size(); }
+
+Status WriteBatch::Iterate(Handler* handler) const {
+ Slice input(rep_);
+ if (input.size() < kHeader) {
+ return Status::Corruption("malformed WriteBatch (too small)");
+ }
+
+ input.remove_prefix(kHeader);
+ Slice key, value;
+ int found = 0;
+ while (!input.empty()) {
+ found++;
+ char tag = input[0];
+ input.remove_prefix(1);
+ switch (tag) {
+ case kTypeValue:
+ if (GetLengthPrefixedSlice(&input, &key) &&
+ GetLengthPrefixedSlice(&input, &value)) {
+ handler->Put(key, value);
+ } else {
+ return Status::Corruption("bad WriteBatch Put");
+ }
+ break;
+ case kTypeDeletion:
+ if (GetLengthPrefixedSlice(&input, &key)) {
+ handler->Delete(key);
+ } else {
+ return Status::Corruption("bad WriteBatch Delete");
+ }
+ break;
+ default:
+ return Status::Corruption("unknown WriteBatch tag");
+ }
+ }
+ if (found != WriteBatchInternal::Count(this)) {
+ return Status::Corruption("WriteBatch has wrong count");
+ } else {
+ return Status::OK();
+ }
+}
+
+int WriteBatchInternal::Count(const WriteBatch* b) {
+ return DecodeFixed32(b->rep_.data() + 8);
+}
+
+void WriteBatchInternal::SetCount(WriteBatch* b, int n) {
+ EncodeFixed32(&b->rep_[8], n);
+}
+
+SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
+ return SequenceNumber(DecodeFixed64(b->rep_.data()));
+}
+
+void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
+ EncodeFixed64(&b->rep_[0], seq);
+}
+
+void WriteBatch::Put(const Slice& key, const Slice& value) {
+ WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
+ rep_.push_back(static_cast<char>(kTypeValue));
+ PutLengthPrefixedSlice(&rep_, key);
+ PutLengthPrefixedSlice(&rep_, value);
+}
+
+void WriteBatch::Delete(const Slice& key) {
+ WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
+ rep_.push_back(static_cast<char>(kTypeDeletion));
+ PutLengthPrefixedSlice(&rep_, key);
+}
+
+void WriteBatch::Append(const WriteBatch& source) {
+ WriteBatchInternal::Append(this, &source);
+}
+
+namespace {
+class MemTableInserter : public WriteBatch::Handler {
+ public:
+ SequenceNumber sequence_;
+ MemTable* mem_;
+
+ void Put(const Slice& key, const Slice& value) override {
+ mem_->Add(sequence_, kTypeValue, key, value);
+ sequence_++;
+ }
+ void Delete(const Slice& key) override {
+ mem_->Add(sequence_, kTypeDeletion, key, Slice());
+ sequence_++;
+ }
+};
+} // namespace
+
+Status WriteBatchInternal::InsertInto(const WriteBatch* b, MemTable* memtable) {
+ MemTableInserter inserter;
+ inserter.sequence_ = WriteBatchInternal::Sequence(b);
+ inserter.mem_ = memtable;
+ return b->Iterate(&inserter);
+}
+
+void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
+ assert(contents.size() >= kHeader);
+ b->rep_.assign(contents.data(), contents.size());
+}
+
+void WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src) {
+ SetCount(dst, Count(dst) + Count(src));
+ assert(src->rep_.size() >= kHeader);
+ dst->rep_.append(src->rep_.data() + kHeader, src->rep_.size() - kHeader);
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/write_batch_internal.h b/src/leveldb/db/write_batch_internal.h
new file mode 100644
index 0000000000..fce86e3f1f
--- /dev/null
+++ b/src/leveldb/db/write_batch_internal.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
+#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
+
+#include "db/dbformat.h"
+#include "leveldb/write_batch.h"
+
+namespace leveldb {
+
+class MemTable;
+
+// WriteBatchInternal provides static methods for manipulating a
+// WriteBatch that we don't want in the public WriteBatch interface.
+class WriteBatchInternal {
+ public:
+ // Return the number of entries in the batch.
+ static int Count(const WriteBatch* batch);
+
+ // Set the count for the number of entries in the batch.
+ static void SetCount(WriteBatch* batch, int n);
+
+ // Return the sequence number for the start of this batch.
+ static SequenceNumber Sequence(const WriteBatch* batch);
+
+ // Store the specified number as the sequence number for the start of
+ // this batch.
+ static void SetSequence(WriteBatch* batch, SequenceNumber seq);
+
+ static Slice Contents(const WriteBatch* batch) { return Slice(batch->rep_); }
+
+ static size_t ByteSize(const WriteBatch* batch) { return batch->rep_.size(); }
+
+ static void SetContents(WriteBatch* batch, const Slice& contents);
+
+ static Status InsertInto(const WriteBatch* batch, MemTable* memtable);
+
+ static void Append(WriteBatch* dst, const WriteBatch* src);
+};
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
diff --git a/src/leveldb/db/write_batch_test.cc b/src/leveldb/db/write_batch_test.cc
new file mode 100644
index 0000000000..c32317fb5e
--- /dev/null
+++ b/src/leveldb/db/write_batch_test.cc
@@ -0,0 +1,137 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/db.h"
+
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/env.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+static std::string PrintContents(WriteBatch* b) {
+ InternalKeyComparator cmp(BytewiseComparator());
+ MemTable* mem = new MemTable(cmp);
+ mem->Ref();
+ std::string state;
+ Status s = WriteBatchInternal::InsertInto(b, mem);
+ int count = 0;
+ Iterator* iter = mem->NewIterator();
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ParsedInternalKey ikey;
+ ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
+ switch (ikey.type) {
+ case kTypeValue:
+ state.append("Put(");
+ state.append(ikey.user_key.ToString());
+ state.append(", ");
+ state.append(iter->value().ToString());
+ state.append(")");
+ count++;
+ break;
+ case kTypeDeletion:
+ state.append("Delete(");
+ state.append(ikey.user_key.ToString());
+ state.append(")");
+ count++;
+ break;
+ }
+ state.append("@");
+ state.append(NumberToString(ikey.sequence));
+ }
+ delete iter;
+ if (!s.ok()) {
+ state.append("ParseError()");
+ } else if (count != WriteBatchInternal::Count(b)) {
+ state.append("CountMismatch()");
+ }
+ mem->Unref();
+ return state;
+}
+
+class WriteBatchTest {};
+
+TEST(WriteBatchTest, Empty) {
+ WriteBatch batch;
+ ASSERT_EQ("", PrintContents(&batch));
+ ASSERT_EQ(0, WriteBatchInternal::Count(&batch));
+}
+
+TEST(WriteBatchTest, Multiple) {
+ WriteBatch batch;
+ batch.Put(Slice("foo"), Slice("bar"));
+ batch.Delete(Slice("box"));
+ batch.Put(Slice("baz"), Slice("boo"));
+ WriteBatchInternal::SetSequence(&batch, 100);
+ ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch));
+ ASSERT_EQ(3, WriteBatchInternal::Count(&batch));
+ ASSERT_EQ(
+ "Put(baz, boo)@102"
+ "Delete(box)@101"
+ "Put(foo, bar)@100",
+ PrintContents(&batch));
+}
+
+TEST(WriteBatchTest, Corruption) {
+ WriteBatch batch;
+ batch.Put(Slice("foo"), Slice("bar"));
+ batch.Delete(Slice("box"));
+ WriteBatchInternal::SetSequence(&batch, 200);
+ Slice contents = WriteBatchInternal::Contents(&batch);
+ WriteBatchInternal::SetContents(&batch,
+ Slice(contents.data(), contents.size() - 1));
+ ASSERT_EQ(
+ "Put(foo, bar)@200"
+ "ParseError()",
+ PrintContents(&batch));
+}
+
+TEST(WriteBatchTest, Append) {
+ WriteBatch b1, b2;
+ WriteBatchInternal::SetSequence(&b1, 200);
+ WriteBatchInternal::SetSequence(&b2, 300);
+ b1.Append(b2);
+ ASSERT_EQ("", PrintContents(&b1));
+ b2.Put("a", "va");
+ b1.Append(b2);
+ ASSERT_EQ("Put(a, va)@200", PrintContents(&b1));
+ b2.Clear();
+ b2.Put("b", "vb");
+ b1.Append(b2);
+ ASSERT_EQ(
+ "Put(a, va)@200"
+ "Put(b, vb)@201",
+ PrintContents(&b1));
+ b2.Delete("foo");
+ b1.Append(b2);
+ ASSERT_EQ(
+ "Put(a, va)@200"
+ "Put(b, vb)@202"
+ "Put(b, vb)@201"
+ "Delete(foo)@203",
+ PrintContents(&b1));
+}
+
+TEST(WriteBatchTest, ApproximateSize) {
+ WriteBatch batch;
+ size_t empty_size = batch.ApproximateSize();
+
+ batch.Put(Slice("foo"), Slice("bar"));
+ size_t one_key_size = batch.ApproximateSize();
+ ASSERT_LT(empty_size, one_key_size);
+
+ batch.Put(Slice("baz"), Slice("boo"));
+ size_t two_keys_size = batch.ApproximateSize();
+ ASSERT_LT(one_key_size, two_keys_size);
+
+ batch.Delete(Slice("box"));
+ size_t post_delete_size = batch.ApproximateSize();
+ ASSERT_LT(two_keys_size, post_delete_size);
+}
+
+} // namespace leveldb
+
+int main(int argc, char** argv) { return leveldb::test::RunAllTests(); }