about | summary | refs | log | tree | commit | diff
path: root/src/leveldb/db
diff options
context:
space:
mode:
Diffstat (limited to 'src/leveldb/db')
-rw-r--r--src/leveldb/db/autocompact_test.cc118
-rw-r--r--src/leveldb/db/builder.cc88
-rw-r--r--src/leveldb/db/builder.h34
-rw-r--r--src/leveldb/db/c.cc595
-rw-r--r--src/leveldb/db/c_test.c390
-rw-r--r--src/leveldb/db/corruption_test.cc374
-rw-r--r--src/leveldb/db/db_bench.cc979
-rw-r--r--src/leveldb/db/db_impl.cc1513
-rw-r--r--src/leveldb/db/db_impl.h211
-rw-r--r--src/leveldb/db/db_iter.cc317
-rw-r--r--src/leveldb/db/db_iter.h28
-rw-r--r--src/leveldb/db/db_test.cc2128
-rw-r--r--src/leveldb/db/dbformat.cc140
-rw-r--r--src/leveldb/db/dbformat.h230
-rw-r--r--src/leveldb/db/dbformat_test.cc112
-rw-r--r--src/leveldb/db/filename.cc149
-rw-r--r--src/leveldb/db/filename.h85
-rw-r--r--src/leveldb/db/filename_test.cc123
-rw-r--r--src/leveldb/db/leveldb_main.cc238
-rw-r--r--src/leveldb/db/log_format.h35
-rw-r--r--src/leveldb/db/log_reader.cc259
-rw-r--r--src/leveldb/db/log_reader.h108
-rw-r--r--src/leveldb/db/log_test.cc500
-rw-r--r--src/leveldb/db/log_writer.cc103
-rw-r--r--src/leveldb/db/log_writer.h48
-rw-r--r--src/leveldb/db/memtable.cc145
-rw-r--r--src/leveldb/db/memtable.h91
-rw-r--r--src/leveldb/db/repair.cc462
-rw-r--r--src/leveldb/db/skiplist.h379
-rw-r--r--src/leveldb/db/skiplist_test.cc378
-rw-r--r--src/leveldb/db/snapshot.h66
-rw-r--r--src/leveldb/db/table_cache.cc127
-rw-r--r--src/leveldb/db/table_cache.h61
-rw-r--r--src/leveldb/db/version_edit.cc266
-rw-r--r--src/leveldb/db/version_edit.h107
-rw-r--r--src/leveldb/db/version_edit_test.cc46
-rw-r--r--src/leveldb/db/version_set.cc1498
-rw-r--r--src/leveldb/db/version_set.h396
-rw-r--r--src/leveldb/db/version_set_test.cc179
-rw-r--r--src/leveldb/db/write_batch.cc147
-rw-r--r--src/leveldb/db/write_batch_internal.h49
-rw-r--r--src/leveldb/db/write_batch_test.cc120
42 files changed, 13422 insertions, 0 deletions
diff --git a/src/leveldb/db/autocompact_test.cc b/src/leveldb/db/autocompact_test.cc
new file mode 100644
index 0000000000..d20a2362c3
--- /dev/null
+++ b/src/leveldb/db/autocompact_test.cc
@@ -0,0 +1,118 @@
+// Copyright (c) 2013 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/db.h"
+#include "db/db_impl.h"
+#include "leveldb/cache.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+class AutoCompactTest {  // Test fixture: owns a freshly created DB plus the cache it uses.
+ public:
+ std::string dbname_;  // test::TmpDir() + "/autocompact_test"
+ Cache* tiny_cache_;  // NewLRUCache(100); installed as options_.block_cache
+ Options options_;
+ DB* db_;
+
+ AutoCompactTest() {
+ dbname_ = test::TmpDir() + "/autocompact_test";
+ tiny_cache_ = NewLRUCache(100);
+ options_.block_cache = tiny_cache_;
+ DestroyDB(dbname_, options_);  // Remove any previous database; the returned Status is ignored.
+ options_.create_if_missing = true;
+ options_.compression = kNoCompression;
+ ASSERT_OK(DB::Open(options_, dbname_, &db_));
+ }
+
+ ~AutoCompactTest() {
+ delete db_;  // The DB is deleted first, then its files are destroyed, then the cache is deleted.
+ DestroyDB(dbname_, Options());
+ delete tiny_cache_;
+ }
+
+ std::string Key(int i) {  // Formats i as "key" followed by i zero-padded to at least 6 digits.
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key%06d", i);
+ return std::string(buf);
+ }
+
+ uint64_t Size(const Slice& start, const Slice& limit) {  // Size the DB reports for Range(start, limit).
+ Range r(start, limit);
+ uint64_t size;
+ db_->GetApproximateSizes(&r, 1, &size);
+ return size;
+ }
+
+ void DoReads(int n);
+};
+
+static const int kValueSize = 200 * 1024;
+static const int kTotalSize = 100 * 1024 * 1024;
+static const int kCount = kTotalSize / kValueSize;
+
+// Read through the first n keys repeatedly and check that they get
+// compacted (verified by checking the size of the key space).
+void AutoCompactTest::DoReads(int n) {
+ std::string value(kValueSize, 'x');
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);  // needed to call TEST_CompactMemTable()
+
+ // Fill database
+ for (int i = 0; i < kCount; i++) {
+ ASSERT_OK(db_->Put(WriteOptions(), Key(i), value));
+ }
+ ASSERT_OK(dbi->TEST_CompactMemTable());
+
+ // Delete everything
+ for (int i = 0; i < kCount; i++) {
+ ASSERT_OK(db_->Delete(WriteOptions(), Key(i)));
+ }
+ ASSERT_OK(dbi->TEST_CompactMemTable());
+
+ // Get initial measurement of the space we will be reading.
+ const int64_t initial_size = Size(Key(0), Key(n));
+ const int64_t initial_other_size = Size(Key(n), Key(kCount));
+
+ // Read until size drops significantly.
+ std::string limit_key = Key(n);
+ for (int read = 0; true; read++) {
+ ASSERT_LT(read, 100) << "Taking too long to compact";  // fail after 100 passes
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst();
+ iter->Valid() && iter->key().ToString() < limit_key;
+ iter->Next()) {
+ // Drop data
+ }
+ delete iter;
+ // Wait a little bit to allow any triggered compactions to complete.
+ Env::Default()->SleepForMicroseconds(1000000);
+ uint64_t size = Size(Key(0), Key(n));
+ fprintf(stderr, "iter %3d => %7.3f MB [other %7.3f MB]\n",
+ read+1, size/1048576.0, Size(Key(n), Key(kCount))/1048576.0);
+ if (size <= initial_size/10) {  // stop once the range is at most a tenth of its initial size
+ break;
+ }
+ }
+
+ // Verify that the size of the key space not touched by the reads
+ // is pretty much unchanged.
+ const int64_t final_other_size = Size(Key(n), Key(kCount));
+ ASSERT_LE(final_other_size, initial_other_size + 1048576);
+ ASSERT_GE(final_other_size, initial_other_size/5 - 1048576);  // int64_t keeps this bound valid when it is negative
+}
+
+TEST(AutoCompactTest, ReadAll) {
+ DoReads(kCount);  // read range covers every key written by DoReads
+}
+
+TEST(AutoCompactTest, ReadHalf) {
+ DoReads(kCount/2);  // read range covers the first half of the keys; the rest is the "other" range
+}
+
+} // namespace leveldb
+
+int main(int argc, char** argv) {  // argc and argv are not used
+ return leveldb::test::RunAllTests();  // exit status is whatever RunAllTests() returns
+}
diff --git a/src/leveldb/db/builder.cc b/src/leveldb/db/builder.cc
new file mode 100644
index 0000000000..f419882197
--- /dev/null
+++ b/src/leveldb/db/builder.cc
@@ -0,0 +1,88 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/builder.h"
+
+#include "db/filename.h"
+#include "db/dbformat.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/iterator.h"
+
+namespace leveldb {
+
+Status BuildTable(const std::string& dbname,  // used with meta->number to derive the table file name
+ Env* env,  // creates and, on failure or empty input, deletes the file
+ const Options& options,  // passed to the TableBuilder
+ TableCache* table_cache,  // used to re-open the finished table as a sanity check
+ Iterator* iter,  // source of entries; read from SeekToFirst() to the end
+ FileMetaData* meta) {  // in: number; out: file_size, smallest, largest
+ Status s;
+ meta->file_size = 0;
+ iter->SeekToFirst();
+
+ std::string fname = TableFileName(dbname, meta->number);
+ if (iter->Valid()) {
+ WritableFile* file;
+ s = env->NewWritableFile(fname, &file);
+ if (!s.ok()) {
+ return s;
+ }
+
+ TableBuilder* builder = new TableBuilder(options, file);
+ meta->smallest.DecodeFrom(iter->key());
+ for (; iter->Valid(); iter->Next()) {
+ Slice key = iter->key();
+ meta->largest.DecodeFrom(key);
+ builder->Add(key, iter->value());
+ }
+ s = iter->status();  // an input error must abandon the table instead of finishing it
+ // Finish and check for builder errors
+ if (s.ok()) {
+ s = builder->Finish();
+ if (s.ok()) {
+ meta->file_size = builder->FileSize();
+ assert(meta->file_size > 0);
+ }
+ } else {
+ builder->Abandon();
+ }
+ delete builder;
+
+ // Finish and check for file errors
+ if (s.ok()) {
+ s = file->Sync();
+ }
+ if (s.ok()) {
+ s = file->Close();
+ }
+ delete file;
+ file = NULL;
+
+ if (s.ok()) {
+ // Verify that the table is usable
+ Iterator* it = table_cache->NewIterator(ReadOptions(),
+ meta->number,
+ meta->file_size);
+ s = it->status();
+ delete it;
+ }
+ }
+
+ // Check for input iterator errors
+ if (!iter->status().ok()) {
+ s = iter->status();
+ }
+
+ if (s.ok() && meta->file_size > 0) {
+ // Keep it
+ } else {
+ env->DeleteFile(fname);  // empty input or any failure: do not leave a file behind
+ }
+ return s;
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/builder.h b/src/leveldb/db/builder.h
new file mode 100644
index 0000000000..62431fcf44
--- /dev/null
+++ b/src/leveldb/db/builder.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_BUILDER_H_
+#define STORAGE_LEVELDB_DB_BUILDER_H_
+
+#include "leveldb/status.h"
+
+namespace leveldb {
+
+struct Options;
+struct FileMetaData;
+
+class Env;
+class Iterator;
+class TableCache;
+class VersionEdit;
+
+// Build a Table file from the contents of *iter. The generated file
+// is named according to meta->number. On success, the rest of *meta
+// is filled with metadata about the generated table. If *iter yields
+// no entries, meta->file_size is set to zero and no Table file is
+// produced. On failure any file that was written is deleted.
+extern Status BuildTable(const std::string& dbname,
+ Env* env,
+ const Options& options,
+ TableCache* table_cache,
+ Iterator* iter,
+ FileMetaData* meta);
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_BUILDER_H_
diff --git a/src/leveldb/db/c.cc b/src/leveldb/db/c.cc
new file mode 100644
index 0000000000..08ff0ad90a
--- /dev/null
+++ b/src/leveldb/db/c.cc
@@ -0,0 +1,595 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/c.h"
+
+#include <stdlib.h>
+#include <unistd.h>
+#include "leveldb/cache.h"
+#include "leveldb/comparator.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/filter_policy.h"
+#include "leveldb/iterator.h"
+#include "leveldb/options.h"
+#include "leveldb/status.h"
+#include "leveldb/write_batch.h"
+
+using leveldb::Cache;
+using leveldb::Comparator;
+using leveldb::CompressionType;
+using leveldb::DB;
+using leveldb::Env;
+using leveldb::FileLock;
+using leveldb::FilterPolicy;
+using leveldb::Iterator;
+using leveldb::kMajorVersion;
+using leveldb::kMinorVersion;
+using leveldb::Logger;
+using leveldb::NewBloomFilterPolicy;
+using leveldb::NewLRUCache;
+using leveldb::Options;
+using leveldb::RandomAccessFile;
+using leveldb::Range;
+using leveldb::ReadOptions;
+using leveldb::SequentialFile;
+using leveldb::Slice;
+using leveldb::Snapshot;
+using leveldb::Status;
+using leveldb::WritableFile;
+using leveldb::WriteBatch;
+using leveldb::WriteOptions;
+
+extern "C" {
+
+struct leveldb_t { DB* rep; };  // C handle types: each wraps its C++ counterpart in a member named "rep"
+struct leveldb_iterator_t { Iterator* rep; };
+struct leveldb_writebatch_t { WriteBatch rep; };  // holds the batch by value
+struct leveldb_snapshot_t { const Snapshot* rep; };
+struct leveldb_readoptions_t { ReadOptions rep; };  // holds the options by value
+struct leveldb_writeoptions_t { WriteOptions rep; };  // holds the options by value
+struct leveldb_options_t { Options rep; };  // holds the options by value
+struct leveldb_cache_t { Cache* rep; };
+struct leveldb_seqfile_t { SequentialFile* rep; };
+struct leveldb_randomfile_t { RandomAccessFile* rep; };
+struct leveldb_writablefile_t { WritableFile* rep; };
+struct leveldb_logger_t { Logger* rep; };
+struct leveldb_filelock_t { FileLock* rep; };
+
+struct leveldb_comparator_t : public Comparator {  // Forwards Comparator calls to user-supplied C callbacks.
+ void* state_;  // passed as the first argument to every callback
+ void (*destructor_)(void*);  // invoked on state_ from the destructor
+ int (*compare_)(
+ void*,
+ const char* a, size_t alen,
+ const char* b, size_t blen);
+ const char* (*name_)(void*);
+
+ virtual ~leveldb_comparator_t() {
+ (*destructor_)(state_);
+ }
+
+ virtual int Compare(const Slice& a, const Slice& b) const {
+ return (*compare_)(state_, a.data(), a.size(), b.data(), b.size());
+ }
+
+ virtual const char* Name() const {
+ return (*name_)(state_);
+ }
+
+ // No-ops since the C binding does not support key shortening methods.
+ virtual void FindShortestSeparator(std::string*, const Slice&) const { }
+ virtual void FindShortSuccessor(std::string* key) const { }
+};
+
+struct leveldb_filterpolicy_t : public FilterPolicy {  // Forwards FilterPolicy calls to user-supplied C callbacks.
+ void* state_;  // passed as the first argument to every callback
+ void (*destructor_)(void*);  // invoked on state_ from the destructor
+ const char* (*name_)(void*);
+ char* (*create_)(
+ void*,
+ const char* const* key_array, const size_t* key_length_array,
+ int num_keys,
+ size_t* filter_length);
+ unsigned char (*key_match_)(
+ void*,
+ const char* key, size_t length,
+ const char* filter, size_t filter_length);
+
+ virtual ~leveldb_filterpolicy_t() {
+ (*destructor_)(state_);
+ }
+
+ virtual const char* Name() const {
+ return (*name_)(state_);
+ }
+
+ virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+ std::vector<const char*> key_pointers(n);  // parallel arrays handed to the C callback
+ std::vector<size_t> key_sizes(n);
+ for (int i = 0; i < n; i++) {
+ key_pointers[i] = keys[i].data();
+ key_sizes[i] = keys[i].size();
+ }
+ size_t len;  // written by the callback through its filter_length argument
+ char* filter = (*create_)(state_, &key_pointers[0], &key_sizes[0], n, &len);  // NOTE(review): indexes [0] even when n == 0 - confirm callers never pass n == 0
+ dst->append(filter, len);
+ free(filter);  // the callback's buffer is released with free() after it is appended to *dst
+ }
+
+ virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+ return (*key_match_)(state_, key.data(), key.size(),
+ filter.data(), filter.size());
+ }
+};
+
+struct leveldb_env_t {
+ Env* rep;
+ bool is_default;  // when true, leveldb_env_destroy() leaves rep alone and deletes only the wrapper
+};
+
+static bool SaveError(char** errptr, const Status& s) {
+ assert(errptr != NULL);
+ if (s.ok()) {
+ return false;
+ }
+ if (*errptr != NULL) {
+ // TODO(sanjay): Merge with existing error?
+ free(*errptr);
+ }
+ const std::string message = s.ToString();
+ *errptr = strdup(message.c_str());
+ return true;
+}
+
+static char* CopyString(const std::string& str) {  // malloc()ed copy of str; caller releases it with free()
+ char* result = reinterpret_cast<char*>(malloc(str.size() + 1));  // +1: never request 0 bytes, room for a NUL
+ if (result != NULL) memcpy(result, str.c_str(), str.size() + 1);  // copies the bytes plus the terminating NUL
+ return result;
+}
+
+leveldb_t* leveldb_open(
+ const leveldb_options_t* options,
+ const char* name,
+ char** errptr) {
+ DB* db;
+ if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) {
+ return NULL;  // open failed; the message was stored in *errptr
+ }
+ leveldb_t* result = new leveldb_t;
+ result->rep = db;
+ return result;
+}
+
+void leveldb_close(leveldb_t* db) {
+ delete db->rep;  // the wrapped DB
+ delete db;  // the handle itself
+}
+
+void leveldb_put(
+ leveldb_t* db,
+ const leveldb_writeoptions_t* options,
+ const char* key, size_t keylen,
+ const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr,  // a non-OK status is reported through *errptr
+ db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
+}
+
+void leveldb_delete(
+ leveldb_t* db,
+ const leveldb_writeoptions_t* options,
+ const char* key, size_t keylen,
+ char** errptr) {
+ SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen)));  // failure is reported through *errptr
+}
+
+
+void leveldb_write(
+ leveldb_t* db,
+ const leveldb_writeoptions_t* options,
+ leveldb_writebatch_t* batch,
+ char** errptr) {
+ SaveError(errptr, db->rep->Write(options->rep, &batch->rep));  // failure is reported through *errptr
+}
+
+char* leveldb_get(
+ leveldb_t* db,
+ const leveldb_readoptions_t* options,
+ const char* key, size_t keylen,
+ size_t* vallen,
+ char** errptr) {
+ // Returns a copy of the value with its length in *vallen, or NULL with
+ // *vallen == 0; only failures other than "not found" reach SaveError().
+ std::string value;
+ Status status = db->rep->Get(options->rep, Slice(key, keylen), &value);
+ if (!status.ok()) {
+ *vallen = 0;
+ if (!status.IsNotFound()) {
+ SaveError(errptr, status);
+ }
+ return NULL;
+ }
+ *vallen = value.size();
+ return CopyString(value);
+}
+
+leveldb_iterator_t* leveldb_create_iterator(
+ leveldb_t* db,
+ const leveldb_readoptions_t* options) {
+ leveldb_iterator_t* result = new leveldb_iterator_t;  // released by leveldb_iter_destroy()
+ result->rep = db->rep->NewIterator(options->rep);
+ return result;
+}
+
+const leveldb_snapshot_t* leveldb_create_snapshot(
+ leveldb_t* db) {
+ leveldb_snapshot_t* result = new leveldb_snapshot_t;  // released by leveldb_release_snapshot()
+ result->rep = db->rep->GetSnapshot();
+ return result;
+}
+
+void leveldb_release_snapshot(
+ leveldb_t* db,
+ const leveldb_snapshot_t* snapshot) {
+ db->rep->ReleaseSnapshot(snapshot->rep);
+ delete snapshot;  // the handle is invalid after this call
+}
+
+char* leveldb_property_value(
+ leveldb_t* db,
+ const char* propname) {
+ // NULL when GetProperty() reports failure, otherwise a heap copy.
+ std::string text;
+ if (!db->rep->GetProperty(Slice(propname), &text)) {
+ return NULL;
+ }
+ // We use strdup() since we expect human readable output.
+ return strdup(text.c_str());
+}
+
+void leveldb_approximate_sizes(
+ leveldb_t* db,
+ int num_ranges,
+ const char* const* range_start_key, const size_t* range_start_key_len,
+ const char* const* range_limit_key, const size_t* range_limit_key_len,
+ uint64_t* sizes) {
+ Range* ranges = new Range[num_ranges];  // temporary array built from the parallel key/length arrays
+ for (int i = 0; i < num_ranges; i++) {
+ ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+ ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+ }
+ db->rep->GetApproximateSizes(ranges, num_ranges, sizes);  // results are written into the caller's sizes array
+ delete[] ranges;
+}
+
+void leveldb_compact_range(
+ leveldb_t* db,
+ const char* start_key, size_t start_key_len,
+ const char* limit_key, size_t limit_key_len) {
+ Slice a, b;  // storage for the optional bounds; each is assigned only when its key pointer is non-NULL
+ db->rep->CompactRange(
+ // Pass NULL Slice if corresponding "const char*" is NULL
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : NULL),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : NULL));
+}
+
+void leveldb_destroy_db(
+ const leveldb_options_t* options,
+ const char* name,
+ char** errptr) {
+ SaveError(errptr, DestroyDB(name, options->rep));  // failure is reported through *errptr
+}
+
+void leveldb_repair_db(
+ const leveldb_options_t* options,
+ const char* name,
+ char** errptr) {
+ SaveError(errptr, RepairDB(name, options->rep));  // failure is reported through *errptr
+}
+
+void leveldb_iter_destroy(leveldb_iterator_t* iter) {
+ delete iter->rep;  // the wrapped Iterator
+ delete iter;  // the handle itself
+}
+
+unsigned char leveldb_iter_valid(const leveldb_iterator_t* iter) {
+ return iter->rep->Valid();  // bool converted to unsigned char for the C interface
+}
+
+void leveldb_iter_seek_to_first(leveldb_iterator_t* iter) {
+ iter->rep->SeekToFirst();
+}
+
+void leveldb_iter_seek_to_last(leveldb_iterator_t* iter) {
+ iter->rep->SeekToLast();
+}
+
+void leveldb_iter_seek(leveldb_iterator_t* iter, const char* k, size_t klen) {
+ iter->rep->Seek(Slice(k, klen));
+}
+
+void leveldb_iter_next(leveldb_iterator_t* iter) {
+ iter->rep->Next();
+}
+
+void leveldb_iter_prev(leveldb_iterator_t* iter) {
+ iter->rep->Prev();
+}
+
+const char* leveldb_iter_key(const leveldb_iterator_t* iter, size_t* klen) {
+ Slice s = iter->rep->key();
+ *klen = s.size();
+ return s.data();  // pointer into the iterator's data, not a copy; length is in *klen
+}
+
+const char* leveldb_iter_value(const leveldb_iterator_t* iter, size_t* vlen) {
+ Slice s = iter->rep->value();
+ *vlen = s.size();
+ return s.data();  // pointer into the iterator's data, not a copy; length is in *vlen
+}
+
+void leveldb_iter_get_error(const leveldb_iterator_t* iter, char** errptr) {
+ SaveError(errptr, iter->rep->status());  // *errptr is left untouched when the status is OK
+}
+
+leveldb_writebatch_t* leveldb_writebatch_create() {
+ return new leveldb_writebatch_t;  // released by leveldb_writebatch_destroy()
+}
+
+void leveldb_writebatch_destroy(leveldb_writebatch_t* b) {
+ delete b;
+}
+
+void leveldb_writebatch_clear(leveldb_writebatch_t* b) {
+ b->rep.Clear();
+}
+
+void leveldb_writebatch_put(
+ leveldb_writebatch_t* b,
+ const char* key, size_t klen,
+ const char* val, size_t vlen) {
+ b->rep.Put(Slice(key, klen), Slice(val, vlen));  // recorded in the batch only
+}
+
+void leveldb_writebatch_delete(
+ leveldb_writebatch_t* b,
+ const char* key, size_t klen) {
+ b->rep.Delete(Slice(key, klen));  // recorded in the batch only
+}
+
+void leveldb_writebatch_iterate(
+ leveldb_writebatch_t* b,
+ void* state,
+ void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+ void (*deleted)(void*, const char* k, size_t klen)) {
+ class H : public WriteBatch::Handler {  // local adapter: forwards Handler callbacks to the C function pointers
+ public:
+ void* state_;  // passed as the first argument to both callbacks
+ void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
+ void (*deleted_)(void*, const char* k, size_t klen);
+ virtual void Put(const Slice& key, const Slice& value) {
+ (*put_)(state_, key.data(), key.size(), value.data(), value.size());
+ }
+ virtual void Delete(const Slice& key) {
+ (*deleted_)(state_, key.data(), key.size());
+ }
+ };
+ H handler;
+ handler.state_ = state;
+ handler.put_ = put;
+ handler.deleted_ = deleted;
+ b->rep.Iterate(&handler);  // the Status returned by Iterate() is discarded
+}
+
+leveldb_options_t* leveldb_options_create() {
+ return new leveldb_options_t;  // released by leveldb_options_destroy()
+}
+
+void leveldb_options_destroy(leveldb_options_t* options) {
+ delete options;
+}
+
+void leveldb_options_set_comparator(
+ leveldb_options_t* opt,
+ leveldb_comparator_t* cmp) {
+ opt->rep.comparator = cmp;  // stores the pointer only
+}
+
+void leveldb_options_set_filter_policy(
+ leveldb_options_t* opt,
+ leveldb_filterpolicy_t* policy) {
+ opt->rep.filter_policy = policy;  // stores the pointer only
+}
+
+void leveldb_options_set_create_if_missing(
+ leveldb_options_t* opt, unsigned char v) {
+ opt->rep.create_if_missing = v;
+}
+
+void leveldb_options_set_error_if_exists(
+ leveldb_options_t* opt, unsigned char v) {
+ opt->rep.error_if_exists = v;
+}
+
+void leveldb_options_set_paranoid_checks(
+ leveldb_options_t* opt, unsigned char v) {
+ opt->rep.paranoid_checks = v;
+}
+
+void leveldb_options_set_env(leveldb_options_t* opt, leveldb_env_t* env) {
+ opt->rep.env = (env ? env->rep : NULL);  // a NULL handle stores a NULL Env pointer
+}
+
+void leveldb_options_set_info_log(leveldb_options_t* opt, leveldb_logger_t* l) {
+ opt->rep.info_log = (l ? l->rep : NULL);  // a NULL handle stores a NULL Logger pointer
+}
+
+void leveldb_options_set_write_buffer_size(leveldb_options_t* opt, size_t s) {
+ opt->rep.write_buffer_size = s;
+}
+
+void leveldb_options_set_max_open_files(leveldb_options_t* opt, int n) {
+ opt->rep.max_open_files = n;
+}
+
+void leveldb_options_set_cache(leveldb_options_t* opt, leveldb_cache_t* c) {
+ opt->rep.block_cache = (c ? c->rep : NULL);  // accept a NULL handle like the env and info_log setters do
+}
+
+void leveldb_options_set_block_size(leveldb_options_t* opt, size_t s) {
+ opt->rep.block_size = s;
+}
+
+void leveldb_options_set_block_restart_interval(leveldb_options_t* opt, int n) {
+ opt->rep.block_restart_interval = n;
+}
+
+void leveldb_options_set_compression(leveldb_options_t* opt, int t) {
+ opt->rep.compression = static_cast<CompressionType>(t);  // t is cast without range checking
+}
+
+leveldb_comparator_t* leveldb_comparator_create(
+ void* state,
+ void (*destructor)(void*),
+ int (*compare)(
+ void*,
+ const char* a, size_t alen,
+ const char* b, size_t blen),
+ const char* (*name)(void*)) {
+ leveldb_comparator_t* result = new leveldb_comparator_t;  // released by leveldb_comparator_destroy()
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->compare_ = compare;
+ result->name_ = name;
+ return result;
+}
+
+void leveldb_comparator_destroy(leveldb_comparator_t* cmp) {
+ delete cmp;  // the destructor calls the user's destructor callback on state_
+}
+
+leveldb_filterpolicy_t* leveldb_filterpolicy_create(
+ void* state,
+ void (*destructor)(void*),
+ char* (*create_filter)(
+ void*,
+ const char* const* key_array, const size_t* key_length_array,
+ int num_keys,
+ size_t* filter_length),
+ unsigned char (*key_may_match)(
+ void*,
+ const char* key, size_t length,
+ const char* filter, size_t filter_length),
+ const char* (*name)(void*)) {
+ leveldb_filterpolicy_t* result = new leveldb_filterpolicy_t;  // released by leveldb_filterpolicy_destroy()
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->create_ = create_filter;
+ result->key_match_ = key_may_match;
+ result->name_ = name;
+ return result;
+}
+
+void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t* filter) {
+ delete filter;  // the destructor calls the user's destructor callback on state_
+}
+
+leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(int bits_per_key) {
+ // Make a leveldb_filterpolicy_t, but override all of its methods so
+ // they delegate to a NewBloomFilterPolicy() instead of user
+ // supplied C functions.
+ struct Wrapper : public leveldb_filterpolicy_t {
+ const FilterPolicy* rep_;  // the policy returned by NewBloomFilterPolicy(); deleted in ~Wrapper()
+ ~Wrapper() { delete rep_; }
+ const char* Name() const { return rep_->Name(); }
+ void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+ return rep_->CreateFilter(keys, n, dst);
+ }
+ bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+ return rep_->KeyMayMatch(key, filter);
+ }
+ static void DoNothing(void*) { }  // stand-in for destructor_, which the base destructor always calls
+ };
+ Wrapper* wrapper = new Wrapper;
+ wrapper->rep_ = NewBloomFilterPolicy(bits_per_key);
+ wrapper->state_ = NULL;
+ wrapper->destructor_ = &Wrapper::DoNothing;
+ return wrapper;
+}
+
+leveldb_readoptions_t* leveldb_readoptions_create() {
+ return new leveldb_readoptions_t;  // released by leveldb_readoptions_destroy()
+}
+
+void leveldb_readoptions_destroy(leveldb_readoptions_t* opt) {
+ delete opt;
+}
+
+void leveldb_readoptions_set_verify_checksums(
+ leveldb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.verify_checksums = v;
+}
+
+void leveldb_readoptions_set_fill_cache(
+ leveldb_readoptions_t* opt, unsigned char v) {
+ opt->rep.fill_cache = v;
+}
+
+void leveldb_readoptions_set_snapshot(
+ leveldb_readoptions_t* opt,
+ const leveldb_snapshot_t* snap) {
+ opt->rep.snapshot = (snap ? snap->rep : NULL);  // a NULL handle clears the snapshot pointer
+}
+
+leveldb_writeoptions_t* leveldb_writeoptions_create() {
+ return new leveldb_writeoptions_t;  // released by leveldb_writeoptions_destroy()
+}
+
+void leveldb_writeoptions_destroy(leveldb_writeoptions_t* opt) {
+ delete opt;
+}
+
+void leveldb_writeoptions_set_sync(
+ leveldb_writeoptions_t* opt, unsigned char v) {
+ opt->rep.sync = v;
+}
+
+leveldb_cache_t* leveldb_cache_create_lru(size_t capacity) {
+ leveldb_cache_t* c = new leveldb_cache_t;  // released by leveldb_cache_destroy()
+ c->rep = NewLRUCache(capacity);
+ return c;
+}
+
+void leveldb_cache_destroy(leveldb_cache_t* cache) {
+ delete cache->rep;  // the wrapped Cache
+ delete cache;  // the handle itself
+}
+
+leveldb_env_t* leveldb_create_default_env() {
+ leveldb_env_t* result = new leveldb_env_t;  // released by leveldb_env_destroy()
+ result->rep = Env::Default();
+ result->is_default = true;  // marks rep as not owned by this handle
+ return result;
+}
+
+void leveldb_env_destroy(leveldb_env_t* env) {
+ if (!env->is_default) delete env->rep;  // the Env::Default() object is never deleted here
+ delete env;
+}
+
+void leveldb_free(void* ptr) {
+ free(ptr);  // plain free(); matches the malloc()/strdup() used for buffers returned by this file
+}
+
+int leveldb_major_version() {
+ return kMajorVersion;
+}
+
+int leveldb_minor_version() {
+ return kMinorVersion;
+}
+
+} // end extern "C"
diff --git a/src/leveldb/db/c_test.c b/src/leveldb/db/c_test.c
new file mode 100644
index 0000000000..7cd5ee0207
--- /dev/null
+++ b/src/leveldb/db/c_test.c
@@ -0,0 +1,390 @@
+/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+ Use of this source code is governed by a BSD-style license that can be
+ found in the LICENSE file. See the AUTHORS file for names of contributors. */
+
+#include "leveldb/c.h"
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+const char* phase = "";
+static char dbname[200];
+
+static void StartPhase(const char* name) {
+ fprintf(stderr, "=== Test %s\n", name);
+ phase = name;  // remembered so the Check* failure messages can name the current phase
+}
+
+static const char* GetTempDir(void) {
+ // Use TEST_TMPDIR when it is set to a non-empty value, else "/tmp".
+ const char* dir = getenv("TEST_TMPDIR");
+ const int usable = (dir != NULL && dir[0] != '\0');
+ return usable ? dir : "/tmp";
+}
+
+#define CheckNoError(err) /* abort with file, line and phase when err is non-NULL */ \
+ do { if ((err) != NULL) { \
+ fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \
+ abort(); \
+ } } while (0)
+
+#define CheckCondition(cond) /* abort with file, line, phase and the condition text when cond is false */ \
+ do { if (!(cond)) { \
+ fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
+ abort(); \
+ } } while (0)
+
+static void CheckEqual(const char* expected, const char* v, size_t n) {  // expected is NUL-terminated; v is n bytes
+ if (expected == NULL && v == NULL) {
+ // ok
+ } else if (expected != NULL && v != NULL && n == strlen(expected) &&
+ memcmp(expected, v, n) == 0) {
+ // ok
+ return;
+ } else {
+ fprintf(stderr, "%s: expected '%s', got '%.*s'\n",  // %.*s: v is not NUL-terminated, print exactly n bytes
+ phase,
+ (expected ? expected : "(null)"),
+ (v ? (int) n : 6), (v ? v : "(null)"));
+ abort();
+ }
+}
+
+static void Free(char** ptr) {
+ // Release *ptr when it is set, then reset it to NULL.
+ char* owned = *ptr;
+ if (owned == NULL) return;
+ free(owned);
+ *ptr = NULL;
+}
+
+static void CheckGet(
+ leveldb_t* db,
+ const leveldb_readoptions_t* options,
+ const char* key,
+ const char* expected) {  // NULL means the lookup is expected to return no value
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = leveldb_get(db, options, key, strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckIter(leveldb_iterator_t* iter,
+ const char* key, const char* val) {  // expected key and value at the iterator's current position
+ size_t len;
+ const char* str;
+ str = leveldb_iter_key(iter, &len);
+ CheckEqual(key, str, len);
+ str = leveldb_iter_value(iter, &len);
+ CheckEqual(val, str, len);
+}
+
+// Callback from leveldb_writebatch_iterate()
+static void CheckPut(void* ptr,  // points at an int counting the callbacks seen so far
+ const char* k, size_t klen,
+ const char* v, size_t vlen) {
+ int* state = (int*) ptr;
+ CheckCondition(*state < 2);  // only the first two callbacks may be puts
+ switch (*state) {
+ case 0:
+ CheckEqual("bar", k, klen);
+ CheckEqual("b", v, vlen);
+ break;
+ case 1:
+ CheckEqual("box", k, klen);
+ CheckEqual("c", v, vlen);
+ break;
+ }
+ (*state)++;
+}
+
+// Callback from leveldb_writebatch_iterate()
+static void CheckDel(void* ptr, const char* k, size_t klen) {
+ int* state = (int*) ptr;  // the same callback counter that CheckPut advances
+ CheckCondition(*state == 2);  // the delete must be the third callback
+ CheckEqual("bar", k, klen);
+ (*state)++;
+}
+
+static void CmpDestroy(void* arg) { }  // nothing to release: the comparator is created with NULL state
+
+static int CmpCompare(void* arg, const char* a, size_t alen,  // arg is unused
+ const char* b, size_t blen) {
+ size_t n = (alen < blen) ? alen : blen;  // stays size_t: an int would truncate long lengths
+ int r = memcmp(a, b, n);
+ if (r == 0) {  // common prefix is equal: the shorter key sorts first
+ if (alen < blen) r = -1;
+ else if (alen > blen) r = +1;
+ }
+ return r;
+}
+
+static const char* CmpName(void* arg) {  // arg is unused
+ return "foo";
+}
+
+// Custom filter policy
+static unsigned char fake_filter_result = 1;  // value returned by FilterKeyMatch(); toggled by main()
+static void FilterDestroy(void* arg) { }  // nothing to release: the policy is created with NULL state
+static const char* FilterName(void* arg) {
+ return "TestFilter";
+}
+static char* FilterCreate(
+ void* arg,
+ const char* const* key_array, const size_t* key_length_array,
+ int num_keys,
+ size_t* filter_length) {
+ *filter_length = 4;  // the filter is always the 4 bytes "fake", whatever the keys are
+ char* result = malloc(4);
+ memcpy(result, "fake", 4);
+ return result;
+}
+static unsigned char FilterKeyMatch(  /* file-local like the other filter callbacks */
+ void* arg,
+ const char* key, size_t length,
+ const char* filter, size_t filter_length) {
+ CheckCondition(filter_length == 4);  // must be the filter FilterCreate() produced
+ CheckCondition(memcmp(filter, "fake", 4) == 0);
+ return fake_filter_result;  // the answer is controlled by main(), not by the key
+}
+
+int main(int argc, char** argv) {  // argc and argv are not used
+ leveldb_t* db;
+ leveldb_comparator_t* cmp;
+ leveldb_cache_t* cache;
+ leveldb_env_t* env;
+ leveldb_options_t* options;
+ leveldb_readoptions_t* roptions;
+ leveldb_writeoptions_t* woptions;
+ char* err = NULL;
+ int run = -1;
+
+ CheckCondition(leveldb_major_version() >= 1);
+ CheckCondition(leveldb_minor_version() >= 1);
+
+ snprintf(dbname, sizeof(dbname),
+ "%s/leveldb_c_test-%d",
+ GetTempDir(),
+ ((int) geteuid()));
+
+ StartPhase("create_objects");
+ cmp = leveldb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
+ env = leveldb_create_default_env();
+ cache = leveldb_cache_create_lru(100000);
+
+ options = leveldb_options_create();
+ leveldb_options_set_comparator(options, cmp);
+ leveldb_options_set_error_if_exists(options, 1);
+ leveldb_options_set_cache(options, cache);
+ leveldb_options_set_env(options, env);
+ leveldb_options_set_info_log(options, NULL);
+ leveldb_options_set_write_buffer_size(options, 100000);
+ leveldb_options_set_paranoid_checks(options, 1);
+ leveldb_options_set_max_open_files(options, 10);
+ leveldb_options_set_block_size(options, 1024);
+ leveldb_options_set_block_restart_interval(options, 8);
+ leveldb_options_set_compression(options, leveldb_no_compression);
+
+ roptions = leveldb_readoptions_create();
+ leveldb_readoptions_set_verify_checksums(roptions, 1);
+ leveldb_readoptions_set_fill_cache(roptions, 0);
+
+ woptions = leveldb_writeoptions_create();
+ leveldb_writeoptions_set_sync(woptions, 1);
+
+ StartPhase("destroy");
+ leveldb_destroy_db(options, dbname, &err);
+ Free(&err);
+
+ StartPhase("open_error");
+ db = leveldb_open(options, dbname, &err);  // create_if_missing is not set yet, so this must fail
+ CheckCondition(err != NULL);
+ Free(&err);
+
+ StartPhase("leveldb_free");
+ db = leveldb_open(options, dbname, &err);
+ CheckCondition(err != NULL);
+ leveldb_free(err);
+ err = NULL;
+
+ StartPhase("open");
+ leveldb_options_set_create_if_missing(options, 1);
+ db = leveldb_open(options, dbname, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", NULL);
+
+ StartPhase("put");
+ leveldb_put(db, woptions, "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("compactall");
+ leveldb_compact_range(db, NULL, 0, NULL, 0);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("compactrange");
+ leveldb_compact_range(db, "a", 1, "z", 1);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("writebatch");
+ {
+ leveldb_writebatch_t* wb = leveldb_writebatch_create();
+ leveldb_writebatch_put(wb, "foo", 3, "a", 1);
+ leveldb_writebatch_clear(wb);
+ leveldb_writebatch_put(wb, "bar", 3, "b", 1);
+ leveldb_writebatch_put(wb, "box", 3, "c", 1);
+ leveldb_writebatch_delete(wb, "bar", 3);
+ leveldb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "hello");
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "box", "c");
+ int pos = 0;
+ leveldb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
+ CheckCondition(pos == 3);  // two puts and one delete were replayed
+ leveldb_writebatch_destroy(wb);
+ }
+
+ StartPhase("iter");
+ {
+ leveldb_iterator_t* iter = leveldb_create_iterator(db, roptions);
+ CheckCondition(!leveldb_iter_valid(iter));
+ leveldb_iter_seek_to_first(iter);
+ CheckCondition(leveldb_iter_valid(iter));
+ CheckIter(iter, "box", "c");
+ leveldb_iter_next(iter);
+ CheckIter(iter, "foo", "hello");
+ leveldb_iter_prev(iter);
+ CheckIter(iter, "box", "c");
+ leveldb_iter_prev(iter);
+ CheckCondition(!leveldb_iter_valid(iter));
+ leveldb_iter_seek_to_last(iter);
+ CheckIter(iter, "foo", "hello");
+ leveldb_iter_seek(iter, "b", 1);
+ CheckIter(iter, "box", "c");
+ leveldb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ leveldb_iter_destroy(iter);
+ }
+
+ StartPhase("approximate_sizes");
+ {
+ int i;
+ int n = 20000;
+ char keybuf[100];
+ char valbuf[100];
+ uint64_t sizes[2];
+ const char* start[2] = { "a", "k00000000000000010000" };
+ size_t start_len[2] = { 1, 21 };
+ const char* limit[2] = { "k00000000000000010000", "z" };
+ size_t limit_len[2] = { 21, 1 };
+ leveldb_writeoptions_set_sync(woptions, 0);
+ for (i = 0; i < n; i++) {
+ snprintf(keybuf, sizeof(keybuf), "k%020d", i);
+ snprintf(valbuf, sizeof(valbuf), "v%020d", i);
+ leveldb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf),
+ &err);
+ CheckNoError(err);
+ }
+ leveldb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes);
+ CheckCondition(sizes[0] > 0);
+ CheckCondition(sizes[1] > 0);
+ }
+
+ StartPhase("property");
+ {
+ char* prop = leveldb_property_value(db, "nosuchprop");
+ CheckCondition(prop == NULL);
+ prop = leveldb_property_value(db, "leveldb.stats");
+ CheckCondition(prop != NULL);
+ Free(&prop);
+ }
+
+ StartPhase("snapshot");
+ {
+ const leveldb_snapshot_t* snap;
+ snap = leveldb_create_snapshot(db);
+ leveldb_delete(db, woptions, "foo", 3, &err);
+ CheckNoError(err);
+ leveldb_readoptions_set_snapshot(roptions, snap);
+ CheckGet(db, roptions, "foo", "hello");
+ leveldb_readoptions_set_snapshot(roptions, NULL);
+ CheckGet(db, roptions, "foo", NULL);
+ leveldb_release_snapshot(db, snap);
+ }
+
+ StartPhase("repair");
+ {
+ leveldb_close(db);
+ leveldb_options_set_create_if_missing(options, 0);
+ leveldb_options_set_error_if_exists(options, 0);
+ leveldb_repair_db(options, dbname, &err);
+ CheckNoError(err);
+ db = leveldb_open(options, dbname, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", NULL);
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "box", "c");
+ leveldb_options_set_create_if_missing(options, 1);
+ leveldb_options_set_error_if_exists(options, 1);
+ }
+
+ StartPhase("filter");
+ for (run = 0; run < 2; run++) {
+ // First run uses custom filter, second run uses bloom filter
+ CheckNoError(err);
+ leveldb_filterpolicy_t* policy;
+ if (run == 0) {
+ policy = leveldb_filterpolicy_create(
+ NULL, FilterDestroy, FilterCreate, FilterKeyMatch, FilterName);
+ } else {
+ policy = leveldb_filterpolicy_create_bloom(10);
+ }
+
+ // Create new database
+ leveldb_close(db);
+ leveldb_destroy_db(options, dbname, &err);
+ leveldb_options_set_filter_policy(options, policy);
+ db = leveldb_open(options, dbname, &err);
+ CheckNoError(err);
+ leveldb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+ CheckNoError(err);
+ leveldb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
+ CheckNoError(err);
+ leveldb_compact_range(db, NULL, 0, NULL, 0);
+
+ fake_filter_result = 1;
+ CheckGet(db, roptions, "foo", "foovalue");
+ CheckGet(db, roptions, "bar", "barvalue");
+ if (run == 0) {  // custom-filter run only; testing "phase" here compared a string pointer with NULL
+ // Must not find value when custom filter returns false
+ fake_filter_result = 0;
+ CheckGet(db, roptions, "foo", NULL);
+ CheckGet(db, roptions, "bar", NULL);
+ fake_filter_result = 1;
+
+ CheckGet(db, roptions, "foo", "foovalue");
+ CheckGet(db, roptions, "bar", "barvalue");
+ }
+ leveldb_options_set_filter_policy(options, NULL);
+ leveldb_filterpolicy_destroy(policy);
+ }
+
+ StartPhase("cleanup");
+ leveldb_close(db);
+ leveldb_options_destroy(options);
+ leveldb_readoptions_destroy(roptions);
+ leveldb_writeoptions_destroy(woptions);
+ leveldb_cache_destroy(cache);
+ leveldb_comparator_destroy(cmp);
+ leveldb_env_destroy(env);
+
+ fprintf(stderr, "PASS\n");
+ return 0;
+}
diff --git a/src/leveldb/db/corruption_test.cc b/src/leveldb/db/corruption_test.cc
new file mode 100644
index 0000000000..96afc68913
--- /dev/null
+++ b/src/leveldb/db/corruption_test.cc
@@ -0,0 +1,374 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/db.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include "leveldb/cache.h"
+#include "leveldb/env.h"
+#include "leveldb/table.h"
+#include "leveldb/write_batch.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+ // Length passed to test::RandomString() for every value produced by
+ // CorruptionTest::Value().
+ static const int kValueSize = 1000;
+
+ // Test fixture: owns a scratch database under test::TmpDir() and provides
+ // helpers to fill it with records, damage its files on disk, and count how
+ // many records are still readable afterwards.
+ class CorruptionTest {
+ public:
+ // Env handed to the DB through options_. Tests set its
+ // writable_file_error_ flag and then expect DB operations to fail.
+ test::ErrorEnv env_;
+ std::string dbname_;
+ // Block cache created with NewLRUCache(100); deleted in the destructor.
+ Cache* tiny_cache_;
+ Options options_;
+ // Currently open database, or NULL while it is closed.
+ DB* db_;
+
+ // Destroys any database left at dbname_, then creates and opens a fresh
+ // one. create_if_missing is switched back off afterwards, so later
+ // Reopen() calls run with create_if_missing == false.
+ CorruptionTest() {
+ tiny_cache_ = NewLRUCache(100);
+ options_.env = &env_;
+ options_.block_cache = tiny_cache_;
+ dbname_ = test::TmpDir() + "/db_test";
+ DestroyDB(dbname_, options_);
+
+ db_ = NULL;
+ options_.create_if_missing = true;
+ Reopen();
+ options_.create_if_missing = false;
+ }
+
+ // Closes the database, destroys it on disk, and releases the cache.
+ ~CorruptionTest() {
+ delete db_;
+ DestroyDB(dbname_, Options());
+ delete tiny_cache_;
+ }
+
+ // Closes the current handle (if any) and opens the database again with
+ // options_. Returns the status of DB::Open.
+ Status TryReopen() {
+ delete db_;
+ db_ = NULL;
+ return DB::Open(options_, dbname_, &db_);
+ }
+
+ // Like TryReopen(), but asserts that the open succeeded.
+ void Reopen() {
+ ASSERT_OK(TryReopen());
+ }
+
+ // Closes the database and runs ::leveldb::RepairDB on it, asserting
+ // success. The database is left closed; callers reopen it themselves.
+ void RepairDB() {
+ delete db_;
+ db_ = NULL;
+ ASSERT_OK(::leveldb::RepairDB(dbname_, options_));
+ }
+
+ // Writes records Key(0)..Key(n-1) with values Value(0)..Value(n-1), one
+ // WriteBatch per record. Only the final write is synced.
+ void Build(int n) {
+ std::string key_space, value_space;
+ WriteBatch batch;
+ for (int i = 0; i < n; i++) {
+ //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
+ Slice key = Key(i, &key_space);
+ batch.Clear();
+ batch.Put(key, Value(i, &value_space));
+ WriteOptions options;
+ // Corrupt() doesn't work without this sync on windows; stat reports 0 for
+ // the file size.
+ if (i == n - 1) {
+ options.sync = true;
+ }
+ ASSERT_OK(db_->Write(options, &batch));
+ }
+ }
+
+ // Iterates over the whole database and classifies each entry:
+ // bad_keys - key is not a pure decimal number, or is lower than the
+ // next key expected
+ // bad_values - key parsed, but the value differs from Value(key)
+ // correct - key and value both match
+ // missed - total size of the gaps between consecutive parsed keys
+ // Prints the tallies to stderr and asserts
+ // min_expected <= correct <= max_expected.
+ void Check(int min_expected, int max_expected) {
+ int next_expected = 0;
+ int missed = 0;
+ int bad_keys = 0;
+ int bad_values = 0;
+ int correct = 0;
+ std::string value_space;
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ uint64_t key;
+ Slice in(iter->key());
+ if (in == "" || in == "~") {
+ // Ignore boundary keys.
+ continue;
+ }
+ // A valid key is entirely consumed by the decimal parse and is not
+ // lower than the next key we expect to see.
+ if (!ConsumeDecimalNumber(&in, &key) ||
+ !in.empty() ||
+ key < next_expected) {
+ bad_keys++;
+ continue;
+ }
+ missed += (key - next_expected);
+ next_expected = key + 1;
+ if (iter->value() != Value(key, &value_space)) {
+ bad_values++;
+ } else {
+ correct++;
+ }
+ }
+ delete iter;
+
+ fprintf(stderr,
+ "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n",
+ min_expected, max_expected, correct, bad_keys, bad_values, missed);
+ ASSERT_LE(min_expected, correct);
+ ASSERT_GE(max_expected, correct);
+ }
+
+ // Flips the top bit (XOR 0x80) of bytes_to_corrupt bytes, starting at
+ // offset, in the highest-numbered file of the given type inside dbname_.
+ // A negative offset is taken relative to the end of the file. Both offset
+ // and length are clamped to the file size reported by stat().
+ void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
+ // Pick file to corrupt
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_.GetChildren(dbname_, &filenames));
+ uint64_t number;
+ FileType type;
+ std::string fname;
+ int picked_number = -1;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type) &&
+ type == filetype &&
+ int(number) > picked_number) { // Pick latest file
+ fname = dbname_ + "/" + filenames[i];
+ picked_number = number;
+ }
+ }
+ ASSERT_TRUE(!fname.empty()) << filetype;
+
+ struct stat sbuf;
+ if (stat(fname.c_str(), &sbuf) != 0) {
+ const char* msg = strerror(errno);
+ ASSERT_TRUE(false) << fname << ": " << msg;
+ }
+
+ if (offset < 0) {
+ // Relative to end of file; make it absolute
+ if (-offset > sbuf.st_size) {
+ offset = 0;
+ } else {
+ offset = sbuf.st_size + offset;
+ }
+ }
+ if (offset > sbuf.st_size) {
+ offset = sbuf.st_size;
+ }
+ if (offset + bytes_to_corrupt > sbuf.st_size) {
+ bytes_to_corrupt = sbuf.st_size - offset;
+ }
+
+ // Do it
+ // The file is read and rewritten through Env::Default(), not env_.
+ std::string contents;
+ Status s = ReadFileToString(Env::Default(), fname, &contents);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ for (int i = 0; i < bytes_to_corrupt; i++) {
+ contents[i + offset] ^= 0x80;
+ }
+ s = WriteStringToFile(Env::Default(), contents, fname);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ }
+
+ // Returns the named DB property parsed as a decimal int, or -1 if the
+ // property is unavailable or does not start with a number.
+ int Property(const std::string& name) {
+ std::string property;
+ int result;
+ if (db_->GetProperty(name, &property) &&
+ sscanf(property.c_str(), "%d", &result) == 1) {
+ return result;
+ } else {
+ return -1;
+ }
+ }
+
+ // Return the ith key
+ // The key is i formatted as a zero-padded 16-digit decimal. The returned
+ // Slice refers to *storage, which must outlive it.
+ Slice Key(int i, std::string* storage) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%016d", i);
+ storage->assign(buf, strlen(buf));
+ return Slice(*storage);
+ }
+
+ // Return the value to associate with the specified key
+ // The value comes from test::RandomString() driven by a Random seeded
+ // with k; Check() relies on the same k producing the same value again.
+ Slice Value(int k, std::string* storage) {
+ Random r(k);
+ return test::RandomString(&r, kValueSize, storage);
+ }
+};
+
+ // Writes 100 records, flips one byte at log offset 19 and one byte at
+ // log::kBlockSize + 1000, then reopens and expects exactly 36 records to
+ // be recovered.
+ TEST(CorruptionTest, Recovery) {
+ Build(100);
+ Check(100, 100);
+ Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record
+ Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block
+ Reopen();
+
+ // The 64 records in the first two log blocks are completely lost.
+ Check(36, 36);
+}
+
+ // With writable_file_error_ set on the env, reopening the database must
+ // return a non-OK status.
+ TEST(CorruptionTest, RecoverWriteError) {
+ env_.writable_file_error_ = true;
+ Status s = TryReopen();
+ ASSERT_TRUE(!s.ok());
+}
+
+ // With writable_file_error_ set, keeps writing until a write fails, checks
+ // that the env counted at least one writable-file error, then clears the
+ // flag and verifies the database can be reopened.
+ TEST(CorruptionTest, NewFileErrorDuringWrite) {
+ // Do enough writing to force minor compaction
+ env_.writable_file_error_ = true;
+ const int num = 3 + (Options().write_buffer_size / kValueSize);
+ std::string value_storage;
+ Status s;
+ // Stops at the first failed write, or after num writes.
+ for (int i = 0; s.ok() && i < num; i++) {
+ WriteBatch batch;
+ batch.Put("a", Value(100, &value_storage));
+ s = db_->Write(WriteOptions(), &batch);
+ }
+ ASSERT_TRUE(!s.ok());
+ ASSERT_GE(env_.num_writable_file_errors_, 1);
+ env_.writable_file_error_ = false;
+ Reopen();
+}
+
+ // Writes 100 records, compacts the memtable and levels 0 and 1, flips one
+ // byte at offset 100 of the highest-numbered table file, and expects
+ // between 90 and 99 records to still read back correctly.
+ TEST(CorruptionTest, TableFile) {
+ Build(100);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_CompactMemTable();
+ dbi->TEST_CompactRange(0, NULL, NULL);
+ dbi->TEST_CompactRange(1, NULL, NULL);
+
+ Corrupt(kTableFile, 100, 1);
+ Check(90, 99);
+}
+
+ // Same scenario as TableFile, but with a block size of 2 * kValueSize and
+ // paranoid checks enabled, and with RepairDB() run after the corruption.
+ // Expects between 95 and 99 correct records after repair and reopen.
+ TEST(CorruptionTest, TableFileRepair) {
+ options_.block_size = 2 * kValueSize; // Limit scope of corruption
+ options_.paranoid_checks = true;
+ Reopen();
+ Build(100);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_CompactMemTable();
+ dbi->TEST_CompactRange(0, NULL, NULL);
+ dbi->TEST_CompactRange(1, NULL, NULL);
+
+ Corrupt(kTableFile, 100, 1);
+ RepairDB();
+ Reopen();
+ Check(95, 99);
+}
+
+ // Writes 10000 records, compacts the memtable, flips 500 bytes starting
+ // 2000 bytes before the end of the highest-numbered table file, reopens,
+ // and expects between 5000 and 9999 correct records.
+ TEST(CorruptionTest, TableFileIndexData) {
+ Build(10000); // Enough to build multiple Tables
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_CompactMemTable();
+
+ Corrupt(kTableFile, -2000, 500);
+ Reopen();
+ Check(5000, 9999);
+}
+
+ // Writes 1000 records, runs RepairDB(), reopens, and expects all 1000
+ // records to be present.
+ TEST(CorruptionTest, MissingDescriptor) {
+ Build(1000);
+ RepairDB();
+ Reopen();
+ Check(1000, 1000);
+}
+
+ // Overwrites "foo" five times, repairs and reopens, then checks that the
+ // newest value is visible and that a further write after the repair is
+ // visible both immediately and after another reopen.
+ TEST(CorruptionTest, SequenceNumberRecovery) {
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
+ RepairDB();
+ Reopen();
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("v5", v);
+ // Write something. If sequence number was not recovered properly,
+ // it will be hidden by an earlier write.
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("v6", v);
+ Reopen();
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("v6", v);
+}
+
+ // Writes one key, compacts it into a table, flips up to the first 1000
+ // bytes of the descriptor file, and checks that a plain reopen fails while
+ // a repair followed by a reopen brings the key back.
+ TEST(CorruptionTest, CorruptedDescriptor) {
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_CompactMemTable();
+ dbi->TEST_CompactRange(0, NULL, NULL);
+
+ Corrupt(kDescriptorFile, 0, 1000);
+ Status s = TryReopen();
+ ASSERT_TRUE(!s.ok());
+
+ RepairDB();
+ Reopen();
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("hello", v);
+}
+
+ // Writes 10 records, compacts the memtable, and checks that exactly one
+ // file sits at level config::kMaxMemCompactLevel. After flipping one byte
+ // of the table file, 5 to 9 records must remain correct; after writing
+ // 10000 further records, all 10000 must be correct.
+ TEST(CorruptionTest, CompactionInputError) {
+ Build(10);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_CompactMemTable();
+ const int last = config::kMaxMemCompactLevel;
+ ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last)));
+
+ Corrupt(kTableFile, 100, 1);
+ Check(5, 9);
+
+ // Force compactions by writing lots of values
+ Build(10000);
+ Check(10000, 10000);
+}
+
+ // With paranoid checks enabled, creates two table files that are each
+ // corrupted right after being written, requests a full compaction, and
+ // expects the next Put() to fail.
+ TEST(CorruptionTest, CompactionInputErrorParanoid) {
+ options_.paranoid_checks = true;
+ options_.write_buffer_size = 512 << 10;
+ Reopen();
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+
+ // Make multiple inputs so we need to compact.
+ for (int i = 0; i < 2; i++) {
+ Build(10);
+ dbi->TEST_CompactMemTable();
+ Corrupt(kTableFile, 100, 1);
+ env_.SleepForMicroseconds(100000);
+ }
+ dbi->CompactRange(NULL, NULL);
+
+ // Write must fail because of corrupted table
+ std::string tmp1, tmp2;
+ Status s = db_->Put(WriteOptions(), Key(5, &tmp1), Value(5, &tmp2));
+ ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
+}
+
+ // After corrupting the table that holds keys 0..9, writes key 1000 and
+ // checks that it reads back correctly both before and after the memtable
+ // is compacted.
+ TEST(CorruptionTest, UnrelatedKeys) {
+ Build(10);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_CompactMemTable();
+ Corrupt(kTableFile, 100, 1);
+
+ std::string tmp1, tmp2;
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+ ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+ dbi->TEST_CompactMemTable();
+ ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+ ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+}
+
+} // namespace leveldb
+
+ // Entry point: runs every registered test and returns the harness result
+ // as the process exit code. The command-line arguments are not used.
+ int main(int argc, char** argv) {
+ return leveldb::test::RunAllTests();
+}
diff --git a/src/leveldb/db/db_bench.cc b/src/leveldb/db/db_bench.cc
new file mode 100644
index 0000000000..fc46d89693
--- /dev/null
+++ b/src/leveldb/db/db_bench.cc
@@ -0,0 +1,979 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "leveldb/cache.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/write_batch.h"
+#include "port/port.h"
+#include "util/crc32c.h"
+#include "util/histogram.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/testutil.h"
+
+ // Comma-separated list of operations to run in the specified order
+ // Actual benchmarks:
+ // fillseq -- write N values in sequential key order in async mode
+ // fillbatch -- like fillseq, but with 1000 entries per write batch
+ // fillrandom -- write N values in random key order in async mode
+ // overwrite -- overwrite N values in random key order in async mode
+ // fillsync -- write N/1000 values in random key order in sync mode
+ // fill100K -- write N/1000 100K values in random order in async mode
+ // deleteseq -- delete N keys in sequential order
+ // deleterandom -- delete N keys in random order
+ // readseq -- read N times sequentially
+ // readreverse -- read N times in reverse order
+ // readrandom -- read N times in random order
+ // readrandomsmall -- like readrandom, with 1/1000th as many reads
+ // readmissing -- read N missing keys in random order
+ // readhot -- read N times in random order from 1% section of DB
+ // readwhilewriting -- readrandom with one extra thread writing random keys
+ // seekrandom -- N random seeks
+ // crc32c -- repeated crc32c of 4K of data
+ // snappycomp -- repeated Snappy compression of one block of data
+ // snappyuncomp -- repeated Snappy uncompression of one block of data
+ // acquireload -- 100000 ops of 1000 acquire-loads each
+ // Meta operations:
+ // compact -- Compact the entire DB
+ // stats -- Print DB stats
+ // sstables -- Print sstable info
+ // heapprofile -- Dump a heap profile (if supported by this port)
+ //
+ // The default list below ends with a comma; the resulting empty name is
+ // ignored without an error message by Benchmark::Run().
+ static const char* FLAGS_benchmarks =
+ "fillseq,"
+ "fillsync,"
+ "fillrandom,"
+ "overwrite,"
+ "readrandom,"
+ "readrandom," // Extra run to allow previous compactions to quiesce
+ "readseq,"
+ "readreverse,"
+ "compact,"
+ "readrandom,"
+ "readseq,"
+ "readreverse,"
+ "fill100K,"
+ "crc32c,"
+ "snappycomp,"
+ "snappyuncomp,"
+ "acquireload,"
+ ;
+
+ // Number of key/values to place in database
+ static int FLAGS_num = 1000000;
+
+ // Number of read operations to do. If negative, do FLAGS_num reads.
+ static int FLAGS_reads = -1;
+
+ // Number of concurrent threads to run.
+ static int FLAGS_threads = 1;
+
+ // Size of each value
+ static int FLAGS_value_size = 100;
+
+ // Arrange to generate values that shrink to this fraction of
+ // their original size after compression
+ static double FLAGS_compression_ratio = 0.5;
+
+ // Print histogram of operation timings
+ static bool FLAGS_histogram = false;
+
+ // Number of bytes to buffer in memtable before compacting
+ // (initialized to default value by "main")
+ static int FLAGS_write_buffer_size = 0;
+
+ // Number of bytes to use as a cache of uncompressed data.
+ // Negative means use default settings.
+ static int FLAGS_cache_size = -1;
+
+ // Maximum number of files to keep open at the same time (use default if == 0)
+ static int FLAGS_open_files = 0;
+
+ // Bloom filter bits per key.
+ // Negative means use default settings.
+ static int FLAGS_bloom_bits = -1;
+
+ // If true, do not destroy the existing database. If you set this
+ // flag and also specify a benchmark that wants a fresh database, that
+ // benchmark will fail.
+ static bool FLAGS_use_existing_db = false;
+
+ // Use the db with the following name.
+ static const char* FLAGS_db = NULL;
+
+namespace leveldb {
+
+namespace {
+
+// Helper for quickly generating random data.
+ // Hands out slices of a pre-built pool of pseudo-random bytes, so that
+ // producing a value during a benchmark costs no more than pointer math.
+ class RandomGenerator {
+  private:
+   std::string data_;  // the byte pool
+   int pos_;           // offset in data_ of the next slice to hand out
+
+  public:
+   // Fills the pool with at least 1 MiB of data. The pool is reused over
+   // and over, so it is kept larger than the compression window (32KB) and
+   // large enough for all typical value sizes.
+   RandomGenerator() : pos_(0) {
+     const size_t kPoolBytes = 1048576;
+     Random rnd(301);
+     std::string fragment;
+     while (data_.size() < kPoolBytes) {
+       // Each short fragment is as compressible as requested by
+       // FLAGS_compression_ratio.
+       test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &fragment);
+       data_.append(fragment);
+     }
+   }
+
+   // Returns the next len bytes of the pool, wrapping to the start when
+   // fewer than len bytes remain. The slice points into this object.
+   Slice Generate(size_t len) {
+     const bool exhausted = pos_ + len > data_.size();
+     if (exhausted) {
+       pos_ = 0;
+       assert(len < data_.size());
+     }
+     const char* chunk = data_.data() + pos_;
+     pos_ += len;
+     return Slice(chunk, len);
+   }
+ };
+
+ // Returns the part of s that remains after removing leading and trailing
+ // whitespace (as classified by isspace()). The result points into the same
+ // storage as s; no bytes are copied.
+ static Slice TrimSpace(Slice s) {
+   // isspace() is only defined for arguments representable as unsigned char
+   // (or EOF), so each byte is converted before classification. Passing a
+   // plain char would be undefined for bytes >= 0x80 where char is signed.
+   size_t start = 0;
+   while (start < s.size() &&
+          isspace(static_cast<unsigned char>(s[start]))) {
+     start++;
+   }
+   size_t limit = s.size();
+   while (limit > start &&
+          isspace(static_cast<unsigned char>(s[limit-1]))) {
+     limit--;
+   }
+   return Slice(s.data() + start, limit - start);
+ }
+
+ // Appends msg to *str, inserting one space first when *str already holds
+ // text. An empty msg leaves *str unchanged.
+ static void AppendWithSpace(std::string* str, Slice msg) {
+   if (!msg.empty()) {
+     const bool needs_separator = !str->empty();
+     if (needs_separator) {
+       str->push_back(' ');
+     }
+     str->append(msg.data(), msg.size());
+   }
+ }
+
+ // Collects timing, operation counts and byte counts for one benchmark
+ // thread. Per-thread instances are combined with Merge() and printed with
+ // Report().
+ class Stats {
+  private:
+   double start_;           // NowMicros() at the last Start()
+   double finish_;          // NowMicros() at the last Stop()
+   double seconds_;         // elapsed seconds; summed across Merge()d stats
+   int done_;               // number of operations finished
+   int next_report_;        // value of done_ that triggers the next progress line
+   int64_t bytes_;          // bytes accounted for through AddBytes()
+   double last_op_finish_;  // NowMicros() when the previous op finished
+   Histogram hist_;         // per-op latencies (filled only if FLAGS_histogram)
+   std::string message_;    // free-form text appended to the report line
+
+  public:
+   Stats() { Start(); }
+
+   // Clears all counters and restarts the clock.
+   void Start() {
+     // Take the timestamp first: finish_ and last_op_finish_ are both
+     // seeded from start_, so it has to hold the new value before they are
+     // assigned. Otherwise the first histogram sample would be measured
+     // from an uninitialized or stale start time.
+     start_ = Env::Default()->NowMicros();
+     finish_ = start_;
+     last_op_finish_ = start_;
+     next_report_ = 100;
+     hist_.Clear();
+     done_ = 0;
+     bytes_ = 0;
+     seconds_ = 0;
+     message_.clear();
+   }
+
+   // Folds another thread's statistics into this one.
+   void Merge(const Stats& other) {
+     hist_.Merge(other.hist_);
+     done_ += other.done_;
+     bytes_ += other.bytes_;
+     seconds_ += other.seconds_;
+     // Keep the widest [start_, finish_] window seen across threads.
+     if (other.start_ < start_) start_ = other.start_;
+     if (other.finish_ > finish_) finish_ = other.finish_;
+
+     // Just keep the messages from one thread
+     if (message_.empty()) message_ = other.message_;
+   }
+
+   // Records the finish time and the elapsed seconds since Start().
+   void Stop() {
+     finish_ = Env::Default()->NowMicros();
+     seconds_ = (finish_ - start_) * 1e-6;
+   }
+
+   // Appends msg to the report message, separated by a space.
+   void AddMessage(Slice msg) {
+     AppendWithSpace(&message_, msg);
+   }
+
+   // Counts one finished operation. With FLAGS_histogram set, also records
+   // its latency and flags operations slower than 20000 micros. Prints a
+   // progress line to stderr at steadily growing intervals.
+   void FinishedSingleOp() {
+     if (FLAGS_histogram) {
+       double now = Env::Default()->NowMicros();
+       double micros = now - last_op_finish_;
+       hist_.Add(micros);
+       if (micros > 20000) {
+         fprintf(stderr, "long op: %.1f micros%30s\r", micros, "");
+         fflush(stderr);
+       }
+       last_op_finish_ = now;
+     }
+
+     done_++;
+     if (done_ >= next_report_) {
+       // The reporting step widens as the count grows, so the progress
+       // output stays sparse on long runs.
+       if (next_report_ < 1000) next_report_ += 100;
+       else if (next_report_ < 5000) next_report_ += 500;
+       else if (next_report_ < 10000) next_report_ += 1000;
+       else if (next_report_ < 50000) next_report_ += 5000;
+       else if (next_report_ < 100000) next_report_ += 10000;
+       else if (next_report_ < 500000) next_report_ += 50000;
+       else next_report_ += 100000;
+       fprintf(stderr, "... finished %d ops%30s\r", done_, "");
+       fflush(stderr);
+     }
+   }
+
+   // Adds n to the byte count used for the throughput figure.
+   void AddBytes(int64_t n) {
+     bytes_ += n;
+   }
+
+   // Prints one result line for benchmark `name` to stdout: micros per op,
+   // MB/s when bytes were recorded, and any message. With FLAGS_histogram
+   // set, the latency histogram follows.
+   void Report(const Slice& name) {
+     // Pretend at least one op was done in case we are running a benchmark
+     // that does not call FinishedSingleOp().
+     if (done_ < 1) done_ = 1;
+
+     std::string extra;
+     if (bytes_ > 0) {
+       // Rate is computed on actual elapsed time, not the sum of per-thread
+       // elapsed times.
+       double elapsed = (finish_ - start_) * 1e-6;
+       char rate[100];
+       snprintf(rate, sizeof(rate), "%6.1f MB/s",
+                (bytes_ / 1048576.0) / elapsed);
+       extra = rate;
+     }
+     AppendWithSpace(&extra, message_);
+
+     fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n",
+             name.ToString().c_str(),
+             seconds_ * 1e6 / done_,
+             (extra.empty() ? "" : " "),
+             extra.c_str());
+     if (FLAGS_histogram) {
+       fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
+     }
+     fflush(stdout);
+   }
+ };
+
+// State shared by all concurrent executions of the same benchmark.
+ // Rendezvous state shared between RunBenchmark() and the worker threads it
+ // starts. The counters and the start flag are accessed with mu held.
+ struct SharedState {
+ port::Mutex mu;
+ // Condition variable bound to mu (see constructor).
+ port::CondVar cv;
+ // Number of worker threads taking part in this run.
+ int total;
+
+ // Each thread goes through the following states:
+ // (1) initializing
+ // (2) waiting for others to be initialized
+ // (3) running
+ // (4) done
+
+ int num_initialized;
+ int num_done;
+ // Set to true by RunBenchmark() to release the waiting workers.
+ bool start;
+
+ // Only cv is initialized here; RunBenchmark() assigns the counters and
+ // the flag before starting any thread.
+ SharedState() : cv(&mu) { }
+};
+
+// Per-thread state for concurrent executions of the same benchmark.
+ // Everything one worker thread needs: its index, its own random number
+ // generator, its own statistics, and a pointer to the shared run state.
+ struct ThreadState {
+ int tid; // 0..n-1 when running in n threads
+ Random rand; // Has different seeds for different threads
+ Stats stats;
+ // Not set by the constructor; RunBenchmark() assigns it after creation.
+ SharedState* shared;
+
+ // Seeds the generator with 1000 + index so each thread draws a different
+ // sequence.
+ ThreadState(int index)
+ : tid(index),
+ rand(1000 + index) {
+ }
+};
+
+} // namespace
+
+class Benchmark {
+ private:
+ Cache* cache_;
+ const FilterPolicy* filter_policy_;
+ DB* db_;
+ int num_;
+ int value_size_;
+ int entries_per_batch_;
+ WriteOptions write_options_;
+ int reads_;
+ int heap_counter_;
+
+ // Prints the run configuration to stdout: key and value sizes, entry
+ // count, and estimated raw and on-disk sizes, followed by any warnings
+ // and a separator line. Environment details come from PrintEnvironment().
+ void PrintHeader() {
+ const int kKeySize = 16;
+ PrintEnvironment();
+ fprintf(stdout, "Keys: %d bytes each\n", kKeySize);
+ fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n",
+ FLAGS_value_size,
+ static_cast<int>(FLAGS_value_size * FLAGS_compression_ratio + 0.5));
+ fprintf(stdout, "Entries: %d\n", num_);
+ // The sum is widened to int64_t before multiplying by the entry count.
+ fprintf(stdout, "RawSize: %.1f MB (estimated)\n",
+ ((static_cast<int64_t>(kKeySize + FLAGS_value_size) * num_)
+ / 1048576.0));
+ // The file-size estimate assumes keys do not compress and values shrink
+ // by FLAGS_compression_ratio.
+ fprintf(stdout, "FileSize: %.1f MB (estimated)\n",
+ (((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_)
+ / 1048576.0));
+ PrintWarnings();
+ fprintf(stdout, "------------------------------------------------\n");
+ }
+
+ // Prints warnings to stdout about build or runtime conditions that skew
+ // results: a GCC-compatible build without optimization, enabled
+ // assertions, and Snappy compression that is unavailable or ineffective.
+ void PrintWarnings() {
+#if defined(__GNUC__) && !defined(__OPTIMIZE__)
+ fprintf(stdout,
+ "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"
+ );
+#endif
+#ifndef NDEBUG
+ fprintf(stdout,
+ "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
+#endif
+
+ // See if snappy is working by attempting to compress a compressible string
+ const char text[] = "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy";
+ std::string compressed;
+ if (!port::Snappy_Compress(text, sizeof(text), &compressed)) {
+ fprintf(stdout, "WARNING: Snappy compression is not enabled\n");
+ } else if (compressed.size() >= sizeof(text)) {
+ // Compression ran but the output is no smaller than the input.
+ fprintf(stdout, "WARNING: Snappy compression is not effective\n");
+ }
+ }
+
+ // Prints the LevelDB version to stderr and, on Linux, the current date
+ // plus the CPU count, CPU model and cache size read from /proc/cpuinfo.
+ void PrintEnvironment() {
+   fprintf(stderr, "LevelDB: version %d.%d\n",
+           kMajorVersion, kMinorVersion);
+
+#if defined(__linux)
+   time_t now = time(NULL);
+   fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline
+
+   FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
+   if (cpuinfo != NULL) {
+     char line[1000];
+     int num_cpus = 0;
+     std::string cpu_type;
+     std::string cache_size;
+     while (fgets(line, sizeof(line), cpuinfo) != NULL) {
+       const char* sep = strchr(line, ':');
+       if (sep == NULL) {
+         continue;
+       }
+       // The key is everything before the ':'; TrimSpace() strips the
+       // padding in front of the separator. Using the length (sep - line)
+       // rather than (sep - 1 - line) keeps it from wrapping around to a
+       // huge size_t when a line starts with ':'.
+       Slice key = TrimSpace(Slice(line, sep - line));
+       Slice val = TrimSpace(Slice(sep + 1));
+       if (key == "model name") {
+         // One "model name" entry per logical CPU; the last one is reported.
+         ++num_cpus;
+         cpu_type = val.ToString();
+       } else if (key == "cache size") {
+         cache_size = val.ToString();
+       }
+     }
+     fclose(cpuinfo);
+     fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str());
+     fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
+   }
+#endif
+ }
+
+ public:
+ // Creates the block cache and filter policy when the matching flag is
+ // non-negative (NULL otherwise), removes files named "heap-*" from
+ // FLAGS_db, and destroys the existing database unless
+ // FLAGS_use_existing_db is set. The database itself is opened in Run().
+ Benchmark()
+ : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
+ filter_policy_(FLAGS_bloom_bits >= 0
+ ? NewBloomFilterPolicy(FLAGS_bloom_bits)
+ : NULL),
+ db_(NULL),
+ num_(FLAGS_num),
+ value_size_(FLAGS_value_size),
+ entries_per_batch_(1),
+ reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
+ heap_counter_(0) {
+ std::vector<std::string> files;
+ // The status of GetChildren() is not checked; on failure `files` is
+ // simply left as it was.
+ Env::Default()->GetChildren(FLAGS_db, &files);
+ for (size_t i = 0; i < files.size(); i++) {
+ if (Slice(files[i]).starts_with("heap-")) {
+ Env::Default()->DeleteFile(std::string(FLAGS_db) + "/" + files[i]);
+ }
+ }
+ if (!FLAGS_use_existing_db) {
+ DestroyDB(FLAGS_db, Options());
+ }
+ }
+
+ // Closes the database first, then releases the cache and filter policy
+ // that were passed to it through Options in Open().
+ ~Benchmark() {
+ delete db_;
+ delete cache_;
+ delete filter_policy_;
+ }
+
+ // Prints the header, opens the database, and then runs each benchmark
+ // named in the comma-separated FLAGS_benchmarks list in order. Benchmarks
+ // that need a fresh database get the database destroyed and reopened
+ // first, or are skipped when FLAGS_use_existing_db is set.
+ void Run() {
+ PrintHeader();
+ Open();
+
+ const char* benchmarks = FLAGS_benchmarks;
+ while (benchmarks != NULL) {
+ // Split off the next name; `benchmarks` becomes NULL after the last one.
+ const char* sep = strchr(benchmarks, ',');
+ Slice name;
+ if (sep == NULL) {
+ name = benchmarks;
+ benchmarks = NULL;
+ } else {
+ name = Slice(benchmarks, sep - benchmarks);
+ benchmarks = sep + 1;
+ }
+
+ // Reset parameters that may be overridden below
+ num_ = FLAGS_num;
+ reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
+ value_size_ = FLAGS_value_size;
+ entries_per_batch_ = 1;
+ write_options_ = WriteOptions();
+
+ // `method` stays NULL for meta operations, which run inline below
+ // instead of going through RunBenchmark().
+ void (Benchmark::*method)(ThreadState*) = NULL;
+ bool fresh_db = false;
+ int num_threads = FLAGS_threads;
+
+ if (name == Slice("fillseq")) {
+ fresh_db = true;
+ method = &Benchmark::WriteSeq;
+ } else if (name == Slice("fillbatch")) {
+ fresh_db = true;
+ entries_per_batch_ = 1000;
+ method = &Benchmark::WriteSeq;
+ } else if (name == Slice("fillrandom")) {
+ fresh_db = true;
+ method = &Benchmark::WriteRandom;
+ } else if (name == Slice("overwrite")) {
+ fresh_db = false;
+ method = &Benchmark::WriteRandom;
+ } else if (name == Slice("fillsync")) {
+ fresh_db = true;
+ num_ /= 1000;
+ write_options_.sync = true;
+ method = &Benchmark::WriteRandom;
+ } else if (name == Slice("fill100K")) {
+ fresh_db = true;
+ num_ /= 1000;
+ value_size_ = 100 * 1000;
+ method = &Benchmark::WriteRandom;
+ } else if (name == Slice("readseq")) {
+ method = &Benchmark::ReadSequential;
+ } else if (name == Slice("readreverse")) {
+ method = &Benchmark::ReadReverse;
+ } else if (name == Slice("readrandom")) {
+ method = &Benchmark::ReadRandom;
+ } else if (name == Slice("readmissing")) {
+ method = &Benchmark::ReadMissing;
+ } else if (name == Slice("seekrandom")) {
+ method = &Benchmark::SeekRandom;
+ } else if (name == Slice("readhot")) {
+ method = &Benchmark::ReadHot;
+ } else if (name == Slice("readrandomsmall")) {
+ reads_ /= 1000;
+ method = &Benchmark::ReadRandom;
+ } else if (name == Slice("deleteseq")) {
+ method = &Benchmark::DeleteSeq;
+ } else if (name == Slice("deleterandom")) {
+ method = &Benchmark::DeleteRandom;
+ } else if (name == Slice("readwhilewriting")) {
+ num_threads++; // Add extra thread for writing
+ method = &Benchmark::ReadWhileWriting;
+ } else if (name == Slice("compact")) {
+ method = &Benchmark::Compact;
+ } else if (name == Slice("crc32c")) {
+ method = &Benchmark::Crc32c;
+ } else if (name == Slice("acquireload")) {
+ method = &Benchmark::AcquireLoad;
+ } else if (name == Slice("snappycomp")) {
+ method = &Benchmark::SnappyCompress;
+ } else if (name == Slice("snappyuncomp")) {
+ method = &Benchmark::SnappyUncompress;
+ } else if (name == Slice("heapprofile")) {
+ HeapProfile();
+ } else if (name == Slice("stats")) {
+ PrintStats("leveldb.stats");
+ } else if (name == Slice("sstables")) {
+ PrintStats("leveldb.sstables");
+ } else {
+ if (name != Slice()) { // No error message for empty name
+ fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
+ }
+ }
+
+ if (fresh_db) {
+ if (FLAGS_use_existing_db) {
+ fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n",
+ name.ToString().c_str());
+ method = NULL;
+ } else {
+ // Start from an empty database: close, destroy on disk, reopen.
+ delete db_;
+ db_ = NULL;
+ DestroyDB(FLAGS_db, Options());
+ Open();
+ }
+ }
+
+ if (method != NULL) {
+ RunBenchmark(num_threads, name, method);
+ }
+ }
+ }
+
+ private:
+ // Argument bundle passed to ThreadBody() for one worker thread.
+ struct ThreadArg {
+ Benchmark* bm; // object on which `method` is invoked
+ SharedState* shared; // rendezvous state for the whole run
+ ThreadState* thread; // this worker's private state
+ void (Benchmark::*method)(ThreadState*); // benchmark routine to run
+ };
+
+ // Entry point of each worker thread; v points to a ThreadArg. Registers
+ // as initialized, waits for the shared start flag, runs the benchmark
+ // method with timing around it, then registers as done.
+ static void ThreadBody(void* v) {
+ ThreadArg* arg = reinterpret_cast<ThreadArg*>(v);
+ SharedState* shared = arg->shared;
+ ThreadState* thread = arg->thread;
+ {
+ MutexLock l(&shared->mu);
+ shared->num_initialized++;
+ // The last thread to initialize wakes RunBenchmark(), which is waiting
+ // for num_initialized to reach the thread count.
+ if (shared->num_initialized >= shared->total) {
+ shared->cv.SignalAll();
+ }
+ while (!shared->start) {
+ shared->cv.Wait();
+ }
+ }
+
+ // Restart the clock so the time spent waiting above is not measured.
+ thread->stats.Start();
+ (arg->bm->*(arg->method))(thread);
+ thread->stats.Stop();
+
+ {
+ MutexLock l(&shared->mu);
+ shared->num_done++;
+ if (shared->num_done >= shared->total) {
+ shared->cv.SignalAll();
+ }
+ }
+ }
+
+ // Runs `method` on n threads at once. Starts the threads, waits until all
+ // have initialized, releases them together, waits until all are done,
+ // then merges the per-thread statistics and reports them under `name`.
+ void RunBenchmark(int n, Slice name,
+ void (Benchmark::*method)(ThreadState*)) {
+ SharedState shared;
+ shared.total = n;
+ shared.num_initialized = 0;
+ shared.num_done = 0;
+ shared.start = false;
+
+ ThreadArg* arg = new ThreadArg[n];
+ for (int i = 0; i < n; i++) {
+ arg[i].bm = this;
+ arg[i].method = method;
+ arg[i].shared = &shared;
+ arg[i].thread = new ThreadState(i);
+ arg[i].thread->shared = &shared;
+ Env::Default()->StartThread(ThreadBody, &arg[i]);
+ }
+
+ shared.mu.Lock();
+ while (shared.num_initialized < n) {
+ shared.cv.Wait();
+ }
+
+ // All workers are parked on the condition variable; release them.
+ shared.start = true;
+ shared.cv.SignalAll();
+ while (shared.num_done < n) {
+ shared.cv.Wait();
+ }
+ shared.mu.Unlock();
+
+ // Thread 0's stats serve as the accumulator for the whole run.
+ for (int i = 1; i < n; i++) {
+ arg[0].thread->stats.Merge(arg[i].thread->stats);
+ }
+ arg[0].thread->stats.Report(name);
+
+ for (int i = 0; i < n; i++) {
+ delete arg[i].thread;
+ }
+ delete[] arg;
+ }
+
+ // Checksums the same 4096-byte buffer over and over until about 500MB of
+ // data have been processed, counting one op per checksum.
+ void Crc32c(ThreadState* thread) {
+   const int kBlockBytes = 4096;
+   const int64_t kTotalBytes = 500 * 1048576;
+   std::string block(kBlockBytes, 'x');
+   uint32_t checksum = 0;
+   int64_t processed = 0;
+   for (; processed < kTotalBytes; processed += kBlockBytes) {
+     checksum = crc32c::Value(block.data(), kBlockBytes);
+     thread->stats.FinishedSingleOp();
+   }
+   // Print so result is not dead
+   fprintf(stderr, "... crc=0x%x\r", static_cast<unsigned int>(checksum));
+
+   thread->stats.AddBytes(processed);
+   thread->stats.AddMessage("(4K per op)");
+ }
+
+ // Measures port::AtomicPointer::Acquire_Load(): runs 100000 ops, each
+ // made of 1000 loads of the same pointer.
+ void AcquireLoad(ThreadState* thread) {
+ int dummy;
+ port::AtomicPointer ap(&dummy);
+ int count = 0;
+ void *ptr = NULL;
+ thread->stats.AddMessage("(each op is 1000 loads)");
+ while (count < 100000) {
+ for (int i = 0; i < 1000; i++) {
+ ptr = ap.Acquire_Load();
+ }
+ count++;
+ thread->stats.FinishedSingleOp();
+ }
+ if (ptr == NULL) exit(1); // Disable unused variable warning.
+ }
+
+ // Compresses one block-sized buffer of generated data with Snappy again
+ // and again until 1G of input has been consumed or a call fails. On
+ // success, reports the output size as a percentage of the input.
+ void SnappyCompress(ThreadState* thread) {
+   RandomGenerator generator;
+   Slice block = generator.Generate(Options().block_size);
+   const int64_t kInputBudget = 1024 * 1048576;  // Compress 1G
+   std::string output;
+   int64_t consumed = 0;
+   int64_t emitted = 0;
+   bool succeeded = true;
+   for (; succeeded && consumed < kInputBudget; consumed += block.size()) {
+     succeeded = port::Snappy_Compress(block.data(), block.size(), &output);
+     emitted += output.size();
+     thread->stats.FinishedSingleOp();
+   }
+
+   if (succeeded) {
+     char summary[100];
+     snprintf(summary, sizeof(summary), "(output: %.1f%%)",
+              (emitted * 100.0) / consumed);
+     thread->stats.AddMessage(summary);
+     thread->stats.AddBytes(consumed);
+   } else {
+     thread->stats.AddMessage("(snappy failure)");
+   }
+ }
+
+ // Compresses one block-sized buffer once, then uncompresses the result
+ // again and again until 1G of uncompressed data has been produced or a
+ // call fails. If the initial compression fails, no uncompression is run.
+ void SnappyUncompress(ThreadState* thread) {
+   RandomGenerator generator;
+   Slice block = generator.Generate(Options().block_size);
+   std::string packed;
+   bool succeeded =
+       port::Snappy_Compress(block.data(), block.size(), &packed);
+   const int64_t kOutputBudget = 1024 * 1048576;  // Uncompress 1G
+   int64_t restored = 0;
+   char* scratch = new char[block.size()];
+   for (; succeeded && restored < kOutputBudget; restored += block.size()) {
+     succeeded =
+         port::Snappy_Uncompress(packed.data(), packed.size(), scratch);
+     thread->stats.FinishedSingleOp();
+   }
+   delete[] scratch;
+
+   if (succeeded) {
+     thread->stats.AddBytes(restored);
+   } else {
+     thread->stats.AddMessage("(snappy failure)");
+   }
+ }
+
+ // Opens FLAGS_db into db_ using the cache, filter policy and sizes taken
+ // from the flags. Exits the process with status 1 if the open fails.
+ // db_ must be NULL on entry.
+ void Open() {
+   assert(db_ == NULL);
+
+   Options opts;
+   opts.create_if_missing = !FLAGS_use_existing_db;
+   opts.block_cache = cache_;
+   opts.write_buffer_size = FLAGS_write_buffer_size;
+   opts.max_open_files = FLAGS_open_files;
+   opts.filter_policy = filter_policy_;
+
+   Status status = DB::Open(opts, FLAGS_db, &db_);
+   if (status.ok()) {
+     return;
+   }
+   fprintf(stderr, "open error: %s\n", status.ToString().c_str());
+   exit(1);
+ }
+
+ // Benchmark body that writes keys in sequential order (DoWrite, seq=true).
+ void WriteSeq(ThreadState* thread) {
+ DoWrite(thread, true);
+ }
+
+ // Benchmark body that writes randomly chosen keys (DoWrite, seq=false).
+ void WriteRandom(ThreadState* thread) {
+ DoWrite(thread, false);
+ }
+
+ // Writes num_ entries in batches of entries_per_batch_. Keys are 16-digit
+ // zero-padded decimals, either consecutive (seq) or drawn at random from
+ // [0, FLAGS_num). Values are value_size_ bytes from a RandomGenerator.
+ // Exits the process with status 1 on the first failed write.
+ void DoWrite(ThreadState* thread, bool seq) {
+ // Mention the op count in the report when it differs from the default.
+ if (num_ != FLAGS_num) {
+ char msg[100];
+ snprintf(msg, sizeof(msg), "(%d ops)", num_);
+ thread->stats.AddMessage(msg);
+ }
+
+ RandomGenerator gen;
+ WriteBatch batch;
+ Status s;
+ int64_t bytes = 0;
+ for (int i = 0; i < num_; i += entries_per_batch_) {
+ batch.Clear();
+ for (int j = 0; j < entries_per_batch_; j++) {
+ const int k = seq ? i+j : (thread->rand.Next() % FLAGS_num);
+ char key[100];
+ snprintf(key, sizeof(key), "%016d", k);
+ batch.Put(key, gen.Generate(value_size_));
+ bytes += value_size_ + strlen(key);
+ // Each entry counts as one op, although the write happens per batch.
+ thread->stats.FinishedSingleOp();
+ }
+ s = db_->Write(write_options_, &batch);
+ if (!s.ok()) {
+ fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+ exit(1);
+ }
+ }
+ thread->stats.AddBytes(bytes);
+ }
+
+ // Scans forward from the first entry, visiting at most reads_ entries and
+ // accounting for the key and value bytes of each one.
+ void ReadSequential(ThreadState* thread) {
+   Iterator* cursor = db_->NewIterator(ReadOptions());
+   int64_t total_bytes = 0;
+   int visited = 0;
+   cursor->SeekToFirst();
+   while (visited < reads_ && cursor->Valid()) {
+     total_bytes += cursor->key().size() + cursor->value().size();
+     thread->stats.FinishedSingleOp();
+     ++visited;
+     cursor->Next();
+   }
+   delete cursor;
+   thread->stats.AddBytes(total_bytes);
+ }
+
+ // Scans backward from the last entry, visiting at most reads_ entries and
+ // accounting for the key and value bytes of each one.
+ void ReadReverse(ThreadState* thread) {
+   Iterator* cursor = db_->NewIterator(ReadOptions());
+   int64_t total_bytes = 0;
+   int visited = 0;
+   cursor->SeekToLast();
+   while (visited < reads_ && cursor->Valid()) {
+     total_bytes += cursor->key().size() + cursor->value().size();
+     thread->stats.FinishedSingleOp();
+     ++visited;
+     cursor->Prev();
+   }
+   delete cursor;
+   thread->stats.AddBytes(total_bytes);
+ }
+
+ // Performs reads_ point lookups of keys drawn at random from
+ // [0, FLAGS_num) and reports how many of them were found.
+ void ReadRandom(ThreadState* thread) {
+   ReadOptions read_opts;
+   std::string fetched;
+   int hits = 0;
+   for (int remaining = reads_; remaining > 0; --remaining) {
+     const int k = thread->rand.Next() % FLAGS_num;
+     char key[100];
+     snprintf(key, sizeof(key), "%016d", k);
+     Status status = db_->Get(read_opts, key, &fetched);
+     if (status.ok()) {
+       ++hits;
+     }
+     thread->stats.FinishedSingleOp();
+   }
+   // NOTE(review): the denominator printed is num_, while the loop above
+   // runs reads_ times -- confirm which one is intended.
+   char summary[100];
+   snprintf(summary, sizeof(summary), "(%d of %d found)", hits, num_);
+   thread->stats.AddMessage(summary);
+ }
+
+ // Performs reads_ point lookups of keys built like the written ones but
+ // with a trailing '.', so they differ from every key the write benchmarks
+ // generate. The lookup result is ignored.
+ void ReadMissing(ThreadState* thread) {
+   ReadOptions read_opts;
+   std::string fetched;
+   for (int remaining = reads_; remaining > 0; --remaining) {
+     const int k = thread->rand.Next() % FLAGS_num;
+     char key[100];
+     snprintf(key, sizeof(key), "%016d.", k);
+     db_->Get(read_opts, key, &fetched);
+     thread->stats.FinishedSingleOp();
+   }
+ }
+
+ // Performs reads_ point lookups restricted to the lowest 1% of the key
+ // space (rounded up). The lookup result is ignored.
+ void ReadHot(ThreadState* thread) {
+   ReadOptions read_opts;
+   std::string fetched;
+   const int hot_keys = (FLAGS_num + 99) / 100;
+   for (int remaining = reads_; remaining > 0; --remaining) {
+     const int k = thread->rand.Next() % hot_keys;
+     char key[100];
+     snprintf(key, sizeof(key), "%016d", k);
+     db_->Get(read_opts, key, &fetched);
+     thread->stats.FinishedSingleOp();
+   }
+ }
+
+ // Performs reads_ seeks to keys drawn at random from [0, FLAGS_num), each
+ // on a freshly created iterator, and reports how many landed exactly on
+ // the requested key.
+ void SeekRandom(ThreadState* thread) {
+   ReadOptions options;
+   int found = 0;
+   for (int i = 0; i < reads_; i++) {
+     // Creating and destroying the iterator is part of the measured op.
+     Iterator* iter = db_->NewIterator(options);
+     char key[100];
+     const int k = thread->rand.Next() % FLAGS_num;
+     snprintf(key, sizeof(key), "%016d", k);
+     iter->Seek(key);
+     if (iter->Valid() && iter->key() == key) found++;
+     delete iter;
+     thread->stats.FinishedSingleOp();
+   }
+   char msg[100];
+   snprintf(msg, sizeof(msg), "(%d of %d found)", found, num_);
+   thread->stats.AddMessage(msg);
+ }
+
+ // Issues deletes in batches of entries_per_batch_ keys, starting a new
+ // batch for as long as the running key offset is below num_. When seq is
+ // true the keys are consecutive integers; otherwise each key is drawn at
+ // random from [0, FLAGS_num). Exits the process if a batch write fails.
+ void DoDelete(ThreadState* thread, bool seq) {
+   RandomGenerator gen;  // Not referenced below.
+   WriteBatch batch;
+   Status s;
+   int batch_start = 0;
+   while (batch_start < num_) {
+     batch.Clear();
+     for (int offset = 0; offset < entries_per_batch_; offset++) {
+       int k;
+       if (seq) {
+         k = batch_start + offset;
+       } else {
+         k = thread->rand.Next() % FLAGS_num;
+       }
+       char key[100];
+       snprintf(key, sizeof(key), "%016d", k);
+       batch.Delete(key);
+       thread->stats.FinishedSingleOp();
+     }
+     s = db_->Write(write_options_, &batch);
+     if (!s.ok()) {
+       fprintf(stderr, "del error: %s\n", s.ToString().c_str());
+       exit(1);
+     }
+     batch_start += entries_per_batch_;
+   }
+ }
+
+ // Benchmark entry point: deletes keys in sequential order via DoDelete.
+ void DeleteSeq(ThreadState* thread) {
+   const bool sequential = true;
+   DoDelete(thread, sequential);
+ }
+
+ // Benchmark entry point: deletes randomly chosen keys via DoDelete.
+ void DeleteRandom(ThreadState* thread) {
+   const bool sequential = false;
+   DoDelete(thread, sequential);
+ }
+
+ // Threads with tid > 0 run ReadRandom. Thread 0 instead keeps writing
+ // random keys until every other initialized thread is done, then restarts
+ // its own stats so that the writing is not counted.
+ void ReadWhileWriting(ThreadState* thread) {
+   if (thread->tid > 0) {
+     ReadRandom(thread);
+     return;
+   }
+
+   // Special thread that keeps writing until other threads are done.
+   RandomGenerator gen;
+   for (;;) {
+     bool others_finished;
+     {
+       MutexLock l(&thread->shared->mu);
+       others_finished =
+           (thread->shared->num_done + 1 >= thread->shared->num_initialized);
+     }
+     if (others_finished) {
+       break;
+     }
+
+     char key[100];
+     const int k = thread->rand.Next() % FLAGS_num;
+     snprintf(key, sizeof(key), "%016d", k);
+     Status s = db_->Put(write_options_, key, gen.Generate(value_size_));
+     if (!s.ok()) {
+       fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+       exit(1);
+     }
+   }
+
+   // Do not count any of the preceding work/delay in stats.
+   thread->stats.Start();
+ }
+
+ // Requests a compaction with both range bounds NULL (no begin/end limit).
+ void Compact(ThreadState* thread) {
+   (void)thread;  // The per-thread state is not used by this benchmark.
+   db_->CompactRange(NULL, NULL);
+ }
+
+ // Prints the database property named by key to stdout, or "(failed)" when
+ // the property cannot be retrieved.
+ void PrintStats(const char* key) {
+   std::string stats;
+   const bool have_property = db_->GetProperty(key, &stats);
+   if (!have_property) {
+     stats = "(failed)";
+   }
+   fprintf(stdout, "\n%s\n", stats.c_str());
+ }
+
+ // Callback handed to port::GetHeapProfile: appends the n bytes at buf to
+ // the WritableFile passed through arg. The Append status is not checked.
+ static void WriteToFile(void* arg, const char* buf, int n) {
+   WritableFile* const out = reinterpret_cast<WritableFile*>(arg);
+   out->Append(Slice(buf, n));
+ }
+
+ // Writes a heap profile to "<FLAGS_db>/heap-NNNN", where NNNN is the
+ // incremented heap_counter_. If the file cannot be created the error is
+ // printed; if the port reports no heap-profiling support the file is
+ // removed again.
+ void HeapProfile() {
+   char fname[100];
+   snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db, ++heap_counter_);
+   WritableFile* file;
+   Status s = Env::Default()->NewWritableFile(fname, &file);
+   if (s.ok()) {
+     const bool supported = port::GetHeapProfile(WriteToFile, file);
+     delete file;
+     if (!supported) {
+       fprintf(stderr, "heap profiling not supported\n");
+       Env::Default()->DeleteFile(fname);
+     }
+   } else {
+     fprintf(stderr, "%s\n", s.ToString().c_str());
+   }
+ }
+};
+
+} // namespace leveldb
+
+int main(int argc, char** argv) { // Parses --name=value flags into the FLAGS_* globals, picks a db path, then runs the benchmark.
+ FLAGS_write_buffer_size = leveldb::Options().write_buffer_size; // Seed both flags from a default-constructed Options.
+ FLAGS_open_files = leveldb::Options().max_open_files;
+ std::string default_db_path; // Backing storage for FLAGS_db when no --db= flag is given; must outlive benchmark.Run().
+
+ for (int i = 1; i < argc; i++) {
+ double d;
+ int n;
+ char junk; // Catches any character after the value: a match count of exactly 1 means nothing trailed it.
+ if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) {
+ FLAGS_benchmarks = argv[i] + strlen("--benchmarks="); // Points into argv; no copy is made.
+ } else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) {
+ FLAGS_compression_ratio = d;
+ } else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 &&
+ (n == 0 || n == 1)) { // Boolean flag: only 0 or 1 is accepted.
+ FLAGS_histogram = n;
+ } else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 &&
+ (n == 0 || n == 1)) { // Boolean flag: only 0 or 1 is accepted.
+ FLAGS_use_existing_db = n;
+ } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
+ FLAGS_num = n;
+ } else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) {
+ FLAGS_reads = n;
+ } else if (sscanf(argv[i], "--threads=%d%c", &n, &junk) == 1) {
+ FLAGS_threads = n;
+ } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) {
+ FLAGS_value_size = n;
+ } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
+ FLAGS_write_buffer_size = n;
+ } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) {
+ FLAGS_cache_size = n;
+ } else if (sscanf(argv[i], "--bloom_bits=%d%c", &n, &junk) == 1) {
+ FLAGS_bloom_bits = n;
+ } else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) {
+ FLAGS_open_files = n;
+ } else if (strncmp(argv[i], "--db=", 5) == 0) {
+ FLAGS_db = argv[i] + 5; // Points into argv just past the "--db=" prefix.
+ } else {
+ fprintf(stderr, "Invalid flag '%s'\n", argv[i]); // Unknown or malformed flags abort the run.
+ exit(1);
+ }
+ }
+
+ // Choose a location for the test database if none given with --db=<path>
+ if (FLAGS_db == NULL) {
+ leveldb::Env::Default()->GetTestDirectory(&default_db_path);
+ default_db_path += "/dbbench";
+ FLAGS_db = default_db_path.c_str(); // Valid only while default_db_path is alive and unmodified.
+ }
+
+ leveldb::Benchmark benchmark;
+ benchmark.Run();
+ return 0;
+}
diff --git a/src/leveldb/db/db_impl.cc b/src/leveldb/db/db_impl.cc
new file mode 100644
index 0000000000..faf5e7d7ba
--- /dev/null
+++ b/src/leveldb/db/db_impl.cc
@@ -0,0 +1,1513 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl.h"
+
+#include <algorithm>
+#include <set>
+#include <string>
+#include <stdint.h>
+#include <stdio.h>
+#include <vector>
+#include "db/builder.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/status.h"
+#include "leveldb/table.h"
+#include "leveldb/table_builder.h"
+#include "port/port.h"
+#include "table/block.h"
+#include "table/merger.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+
+namespace leveldb {
+
+const int kNumNonTableCacheFiles = 10;
+
+// Information kept for every waiting writer
+struct DBImpl::Writer { // Per-writer bookkeeping record; see the fields below.
+ Status status; // Status associated with this writer's request.
+ WriteBatch* batch; // Batch to be written (pointer only; the struct never deletes it).
+ bool sync; // Sync setting requested for this write.
+ bool done; // Completion flag for this writer.
+ port::CondVar cv; // Condition variable bound to the mutex passed to the constructor.
+
+ explicit Writer(port::Mutex* mu) : cv(mu) { } // Only cv is initialized here; the other fields must be set by the caller.
+};
+
+struct DBImpl::CompactionState { // Working state for one compaction run: its inputs, generated outputs and the open output file.
+ Compaction* const compaction; // The compaction being executed; this struct never deletes it.
+
+ // Sequence numbers < smallest_snapshot are not significant since we
+ // will never have to service a snapshot below smallest_snapshot.
+ // Therefore if we have seen a sequence number S <= smallest_snapshot,
+ // we can drop all entries for the same key with sequence numbers < S.
+ SequenceNumber smallest_snapshot; // Not set by the constructor; assigned in DoCompactionWork.
+
+ // Files produced by compaction
+ struct Output {
+ uint64_t number; // File number of the output table.
+ uint64_t file_size; // Size recorded when the output file is finished.
+ InternalKey smallest, largest; // Key range covered by this output.
+ };
+ std::vector<Output> outputs;
+
+ // State kept for output being generated
+ WritableFile* outfile; // NULL when no output file is open.
+ TableBuilder* builder; // NULL when no output table is being built.
+
+ uint64_t total_bytes; // Running sum of finished output file sizes.
+
+ Output* current_output() { return &outputs[outputs.size()-1]; } // Most recently added output; outputs must be non-empty.
+
+ explicit CompactionState(Compaction* c)
+ : compaction(c),
+ outfile(NULL),
+ builder(NULL),
+ total_bytes(0) {
+ }
+};
+
+// Fix user-supplied options to be reasonable
+// Clamps *ptr into [minvalue, maxvalue]. The upper bound is applied first
+// and the value is then re-read for the lower-bound check; comparisons are
+// made after casting *ptr to V.
+template <class T, class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+  const V before_upper_clip = static_cast<V>(*ptr);
+  if (before_upper_clip > maxvalue) {
+    *ptr = maxvalue;
+  }
+  const V before_lower_clip = static_cast<V>(*ptr);
+  if (before_lower_clip < minvalue) {
+    *ptr = minvalue;
+  }
+}
+// Returns a copy of src adjusted for internal use: the comparator is
+// replaced by icmp, the filter policy by ipolicy (only if src had one),
+// numeric limits are clipped to fixed ranges, and a default info log and
+// block cache are created when src supplies none.
+Options SanitizeOptions(const std::string& dbname,
+                        const InternalKeyComparator* icmp,
+                        const InternalFilterPolicy* ipolicy,
+                        const Options& src) {
+  Options sanitized = src;
+  sanitized.comparator = icmp;
+  if (src.filter_policy != NULL) {
+    sanitized.filter_policy = ipolicy;
+  } else {
+    sanitized.filter_policy = NULL;
+  }
+
+  ClipToRange(&sanitized.max_open_files, 64 + kNumNonTableCacheFiles, 50000);
+  ClipToRange(&sanitized.write_buffer_size, 64<<10, 1<<30);
+  ClipToRange(&sanitized.block_size, 1<<10, 4<<20);
+
+  if (sanitized.info_log == NULL) {
+    // Open a log file in the same directory as the db, rotating any
+    // existing info log to the "old" name first.
+    src.env->CreateDir(dbname);  // In case it does not exist
+    src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname));
+    Status log_status =
+        src.env->NewLogger(InfoLogFileName(dbname), &sanitized.info_log);
+    if (!log_status.ok()) {
+      // No place suitable for logging
+      sanitized.info_log = NULL;
+    }
+  }
+
+  if (sanitized.block_cache == NULL) {
+    sanitized.block_cache = NewLRUCache(8 << 20);
+  }
+  return sanitized;
+}
+
+DBImpl::DBImpl(const Options& raw_options, const std::string& dbname) // Builds in-memory state only; nothing here locks or recovers the database.
+ : env_(raw_options.env),
+ internal_comparator_(raw_options.comparator), // Wraps the user comparator.
+ internal_filter_policy_(raw_options.filter_policy), // Wraps the user filter policy.
+ options_(SanitizeOptions(dbname, &internal_comparator_,
+ &internal_filter_policy_, raw_options)),
+ owns_info_log_(options_.info_log != raw_options.info_log), // True when SanitizeOptions substituted its own logger.
+ owns_cache_(options_.block_cache != raw_options.block_cache), // True when SanitizeOptions substituted its own cache.
+ dbname_(dbname),
+ db_lock_(NULL), // Acquired later in Recover().
+ shutting_down_(NULL),
+ bg_cv_(&mutex_),
+ mem_(new MemTable(internal_comparator_)),
+ imm_(NULL),
+ logfile_(NULL),
+ logfile_number_(0),
+ log_(NULL),
+ seed_(0),
+ tmp_batch_(new WriteBatch),
+ bg_compaction_scheduled_(false),
+ manual_compaction_(NULL) {
+ mem_->Ref(); // Reference released by the destructor's mem_->Unref().
+ has_imm_.Release_Store(NULL); // Matches imm_ == NULL: no immutable memtable yet.
+
+ // Reserve ten files or so for other uses and give the rest to TableCache.
+ const int table_cache_size = options_.max_open_files - kNumNonTableCacheFiles;
+ table_cache_ = new TableCache(dbname_, &options_, table_cache_size);
+
+ versions_ = new VersionSet(dbname_, &options_, table_cache_,
+ &internal_comparator_);
+}
+
+DBImpl::~DBImpl() { // Waits for background work, releases the db lock, then frees owned objects.
+ // Wait for background work to finish
+ mutex_.Lock();
+ shutting_down_.Release_Store(this); // Any non-NULL value is ok
+ while (bg_compaction_scheduled_) { // Cleared by BackgroundCall(), which then signals bg_cv_.
+ bg_cv_.Wait();
+ }
+ mutex_.Unlock();
+
+ if (db_lock_ != NULL) { // Still NULL if Recover() never obtained the lock.
+ env_->UnlockFile(db_lock_);
+ }
+
+ delete versions_;
+ if (mem_ != NULL) mem_->Unref();
+ if (imm_ != NULL) imm_->Unref();
+ delete tmp_batch_;
+ delete log_;
+ delete logfile_;
+ delete table_cache_;
+
+ if (owns_info_log_) { // Only delete objects that SanitizeOptions created on our behalf.
+ delete options_.info_log;
+ }
+ if (owns_cache_) {
+ delete options_.block_cache;
+ }
+}
+
+// Creates the initial descriptor (manifest number 1) for an empty database
+// and points CURRENT at it. If writing the manifest fails, the partially
+// written manifest file is deleted and the error is returned.
+Status DBImpl::NewDB() {
+  VersionEdit initial_edit;
+  initial_edit.SetComparatorName(user_comparator()->Name());
+  initial_edit.SetLogNumber(0);
+  initial_edit.SetNextFile(2);
+  initial_edit.SetLastSequence(0);
+
+  const std::string manifest = DescriptorFileName(dbname_, 1);
+  WritableFile* manifest_file;
+  Status s = env_->NewWritableFile(manifest, &manifest_file);
+  if (!s.ok()) {
+    return s;
+  }
+  {
+    // Scoped so the log writer is destroyed before the file is deleted.
+    log::Writer writer(manifest_file);
+    std::string encoded;
+    initial_edit.EncodeTo(&encoded);
+    s = writer.AddRecord(encoded);
+    if (s.ok()) {
+      s = manifest_file->Close();
+    }
+  }
+  delete manifest_file;
+
+  if (!s.ok()) {
+    env_->DeleteFile(manifest);
+    return s;
+  }
+  // Make "CURRENT" file that points to the new manifest file.
+  s = SetCurrentFile(env_, dbname_, 1);
+  return s;
+}
+
+// When paranoid_checks is off, logs a non-OK *s and resets it to OK.
+// An OK status, or any status under paranoid_checks, is left untouched.
+void DBImpl::MaybeIgnoreError(Status* s) const {
+  const bool keep_as_is = s->ok() || options_.paranoid_checks;
+  if (keep_as_is) {
+    return;
+  }
+  Log(options_.info_log, "Ignoring error %s", s->ToString().c_str());
+  *s = Status::OK();
+}
+
+// Scans the database directory and deletes every recognised file that is
+// no longer needed: old logs, superseded manifests, and table/temp files
+// that are neither live in any version nor listed in pending_outputs_.
+// Does nothing once a background error has been recorded.
+void DBImpl::DeleteObsoleteFiles() {
+  if (!bg_error_.ok()) {
+    // After a background error, we don't know whether a new version may
+    // or may not have been committed, so we cannot safely garbage collect.
+    return;
+  }
+
+  // Make a set of all of the live files
+  std::set<uint64_t> live = pending_outputs_;
+  versions_->AddLiveFiles(&live);
+
+  std::vector<std::string> filenames;
+  env_->GetChildren(dbname_, &filenames);  // Ignoring errors on purpose
+  uint64_t number;
+  FileType type;
+  for (size_t idx = 0; idx < filenames.size(); idx++) {
+    if (!ParseFileName(filenames[idx], &number, &type)) {
+      continue;  // Not a file name this database recognises.
+    }
+
+    bool keep = true;
+    switch (type) {
+      case kLogFile:
+        keep = ((number >= versions_->LogNumber()) ||
+                (number == versions_->PrevLogNumber()));
+        break;
+      case kDescriptorFile:
+        // Keep my manifest file, and any newer incarnations'
+        // (in case there is a race that allows other incarnations)
+        keep = (number >= versions_->ManifestFileNumber());
+        break;
+      case kTableFile:
+        keep = (live.find(number) != live.end());
+        break;
+      case kTempFile:
+        // Any temp files that are currently being written to must
+        // be recorded in pending_outputs_, which is inserted into "live"
+        keep = (live.find(number) != live.end());
+        break;
+      case kCurrentFile:
+      case kDBLockFile:
+      case kInfoLogFile:
+        keep = true;
+        break;
+    }
+    if (keep) {
+      continue;
+    }
+
+    if (type == kTableFile) {
+      table_cache_->Evict(number);
+    }
+    Log(options_.info_log, "Delete type=%d #%lld\n",
+        int(type),
+        static_cast<unsigned long long>(number));
+    env_->DeleteFile(dbname_ + "/" + filenames[idx]);
+  }
+}
+
+// Locks the database directory, creates the database if requested and
+// missing, restores the version state from the descriptor, and then replays
+// every log file that is at least as new as the one named in the
+// descriptor. Table files produced during replay are recorded in *edit.
+// Requires: mutex_ is held.
+// Returns a non-OK status if locking fails, the existence checks fail,
+// expected table files are missing, or any log file fails to replay.
+Status DBImpl::Recover(VersionEdit* edit) {
+  mutex_.AssertHeld();
+
+  // Ignore error from CreateDir since the creation of the DB is
+  // committed only when the descriptor is created, and this directory
+  // may already exist from a previous failed creation attempt.
+  env_->CreateDir(dbname_);
+  assert(db_lock_ == NULL);
+  Status s = env_->LockFile(LockFileName(dbname_), &db_lock_);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (!env_->FileExists(CurrentFileName(dbname_))) {
+    if (options_.create_if_missing) {
+      s = NewDB();
+      if (!s.ok()) {
+        return s;
+      }
+    } else {
+      return Status::InvalidArgument(
+          dbname_, "does not exist (create_if_missing is false)");
+    }
+  } else {
+    if (options_.error_if_exists) {
+      return Status::InvalidArgument(
+          dbname_, "exists (error_if_exists is true)");
+    }
+  }
+
+  s = versions_->Recover();
+  if (s.ok()) {
+    SequenceNumber max_sequence(0);
+
+    // Recover from all newer log files than the ones named in the
+    // descriptor (new log files may have been added by the previous
+    // incarnation without registering them in the descriptor).
+    //
+    // Note that PrevLogNumber() is no longer used, but we pay
+    // attention to it in case we are recovering a database
+    // produced by an older version of leveldb.
+    const uint64_t min_log = versions_->LogNumber();
+    const uint64_t prev_log = versions_->PrevLogNumber();
+    std::vector<std::string> filenames;
+    s = env_->GetChildren(dbname_, &filenames);
+    if (!s.ok()) {
+      return s;
+    }
+    // Start from every file the versions reference and cross off each one
+    // found on disk; whatever remains is missing.
+    std::set<uint64_t> expected;
+    versions_->AddLiveFiles(&expected);
+    uint64_t number;
+    FileType type;
+    std::vector<uint64_t> logs;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type)) {
+        expected.erase(number);
+        if (type == kLogFile && ((number >= min_log) || (number == prev_log)))
+          logs.push_back(number);
+      }
+    }
+    if (!expected.empty()) {
+      char buf[50];
+      snprintf(buf, sizeof(buf), "%d missing files; e.g.",
+               static_cast<int>(expected.size()));
+      return Status::Corruption(buf, TableFileName(dbname_, *(expected.begin())));
+    }
+
+    // Recover in the order in which the logs were generated
+    std::sort(logs.begin(), logs.end());
+    for (size_t i = 0; i < logs.size(); i++) {
+      s = RecoverLogFile(logs[i], edit, &max_sequence);
+      if (!s.ok()) {
+        // Stop at the first log that fails to replay. Continuing would let
+        // a later, successfully replayed log overwrite this error and make
+        // the recovery look complete even though writes were skipped.
+        return s;
+      }
+
+      // The previous incarnation may not have written any MANIFEST
+      // records after allocating this log number.  So we manually
+      // update the file number allocation counter in VersionSet.
+      versions_->MarkFileNumberUsed(logs[i]);
+    }
+
+    if (s.ok()) {
+      if (versions_->LastSequence() < max_sequence) {
+        versions_->SetLastSequence(max_sequence);
+      }
+    }
+  }
+
+  return s;
+}
+
+Status DBImpl::RecoverLogFile(uint64_t log_number, // Replays one log file into memtables, flushing them to tables recorded in *edit.
+ VersionEdit* edit,
+ SequenceNumber* max_sequence) { // In/out: raised to the largest sequence number seen in this log.
+ struct LogReporter : public log::Reader::Reporter { // Receives corruption reports from the log reader.
+ Env* env;
+ Logger* info_log;
+ const char* fname;
+ Status* status; // NULL if options_.paranoid_checks==false
+ virtual void Corruption(size_t bytes, const Status& s) {
+ Log(info_log, "%s%s: dropping %d bytes; %s",
+ (this->status == NULL ? "(ignoring error) " : ""),
+ fname, static_cast<int>(bytes), s.ToString().c_str());
+ if (this->status != NULL && this->status->ok()) *this->status = s; // Only the first error is kept.
+ }
+ };
+
+ mutex_.AssertHeld();
+
+ // Open the log file
+ std::string fname = LogFileName(dbname_, log_number);
+ SequentialFile* file;
+ Status status = env_->NewSequentialFile(fname, &file);
+ if (!status.ok()) {
+ MaybeIgnoreError(&status); // May turn the failure into OK when paranoid_checks is off.
+ return status;
+ }
+
+ // Create the log reader.
+ LogReporter reporter;
+ reporter.env = env_;
+ reporter.info_log = options_.info_log;
+ reporter.fname = fname.c_str();
+ reporter.status = (options_.paranoid_checks ? &status : NULL);
+ // We intentionally make log::Reader do checksumming even if
+ // paranoid_checks==false so that corruptions cause entire commits
+ // to be skipped instead of propagating bad information (like overly
+ // large sequence numbers).
+ log::Reader reader(file, &reporter, true/*checksum*/,
+ 0/*initial_offset*/);
+ Log(options_.info_log, "Recovering log #%llu",
+ (unsigned long long) log_number);
+
+ // Read all the records and add to a memtable
+ std::string scratch;
+ Slice record;
+ WriteBatch batch;
+ MemTable* mem = NULL; // Created lazily on the first usable record.
+ while (reader.ReadRecord(&record, &scratch) &&
+ status.ok()) { // status can be set by the reporter during ReadRecord when paranoid_checks is on.
+ if (record.size() < 12) { // Records shorter than 12 bytes are reported as corruption and skipped.
+ reporter.Corruption(
+ record.size(), Status::Corruption("log record too small"));
+ continue;
+ }
+ WriteBatchInternal::SetContents(&batch, record);
+
+ if (mem == NULL) {
+ mem = new MemTable(internal_comparator_);
+ mem->Ref();
+ }
+ status = WriteBatchInternal::InsertInto(&batch, mem);
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ break;
+ }
+ const SequenceNumber last_seq =
+ WriteBatchInternal::Sequence(&batch) +
+ WriteBatchInternal::Count(&batch) - 1; // Sequence of the last entry in this batch.
+ if (last_seq > *max_sequence) {
+ *max_sequence = last_seq;
+ }
+
+ if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) { // Memtable is full: flush it to a table file.
+ status = WriteLevel0Table(mem, edit, NULL);
+ if (!status.ok()) {
+ // Reflect errors immediately so that conditions like full
+ // file-systems cause the DB::Open() to fail.
+ break;
+ }
+ mem->Unref();
+ mem = NULL;
+ }
+ }
+
+ if (status.ok() && mem != NULL) { // Flush whatever is left in the final memtable.
+ status = WriteLevel0Table(mem, edit, NULL);
+ // Reflect errors immediately so that conditions like full
+ // file-systems cause the DB::Open() to fail.
+ }
+
+ if (mem != NULL) mem->Unref();
+ delete file;
+ return status;
+}
+
+Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit, // Builds a table file from mem and records it in *edit.
+ Version* base) { // May be NULL; when non-NULL it is asked to choose the output level.
+ mutex_.AssertHeld();
+ const uint64_t start_micros = env_->NowMicros();
+ FileMetaData meta;
+ meta.number = versions_->NewFileNumber();
+ pending_outputs_.insert(meta.number); // Protects the file from DeleteObsoleteFiles() while it is being written.
+ Iterator* iter = mem->NewIterator();
+ Log(options_.info_log, "Level-0 table #%llu: started",
+ (unsigned long long) meta.number);
+
+ Status s;
+ {
+ mutex_.Unlock(); // Build the table without holding the mutex.
+ s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
+ mutex_.Lock();
+ }
+
+ Log(options_.info_log, "Level-0 table #%llu: %lld bytes %s",
+ (unsigned long long) meta.number,
+ (unsigned long long) meta.file_size,
+ s.ToString().c_str());
+ delete iter;
+ pending_outputs_.erase(meta.number);
+
+
+ // Note that if file_size is zero, the file has been deleted and
+ // should not be added to the manifest.
+ int level = 0; // Stays 0 when base is NULL or nothing was written.
+ if (s.ok() && meta.file_size > 0) {
+ const Slice min_user_key = meta.smallest.user_key();
+ const Slice max_user_key = meta.largest.user_key();
+ if (base != NULL) {
+ level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
+ }
+ edit->AddFile(level, meta.number, meta.file_size,
+ meta.smallest, meta.largest);
+ }
+
+ CompactionStats stats; // Recorded against the chosen level even when the build failed.
+ stats.micros = env_->NowMicros() - start_micros;
+ stats.bytes_written = meta.file_size;
+ stats_[level].Add(stats);
+ return s;
+}
+
+// Writes the immutable memtable imm_ out as a table file and installs it
+// in a new version. On success imm_ is released and obsolete files are
+// removed; on failure the error is recorded as a background error.
+// Requires: mutex_ is held and imm_ is non-NULL.
+void DBImpl::CompactMemTable() {
+  mutex_.AssertHeld();
+  assert(imm_ != NULL);
+
+  // Save the contents of the memtable as a new Table
+  VersionEdit edit;
+  Version* current = versions_->current();
+  current->Ref();
+  Status s = WriteLevel0Table(imm_, &edit, current);
+  current->Unref();
+
+  if (s.ok() && shutting_down_.Acquire_Load()) {
+    s = Status::IOError("Deleting DB during memtable compaction");
+  }
+
+  // Replace immutable memtable with the generated Table
+  if (s.ok()) {
+    edit.SetPrevLogNumber(0);
+    edit.SetLogNumber(logfile_number_);  // Earlier logs no longer needed
+    s = versions_->LogAndApply(&edit, &mutex_);
+  }
+
+  if (!s.ok()) {
+    RecordBackgroundError(s);
+    return;
+  }
+
+  // Commit to the new state
+  imm_->Unref();
+  imm_ = NULL;
+  has_imm_.Release_Store(NULL);
+  DeleteObsoleteFiles();
+}
+
+// Compacts the key range [*begin, *end]: first flushes the memtable, then
+// runs a manual compaction on every level from 0 up to (but excluding) the
+// deepest level that currently has files overlapping the range.
+void DBImpl::CompactRange(const Slice* begin, const Slice* end) {
+  int deepest_overlap = 1;
+  {
+    MutexLock l(&mutex_);
+    Version* current = versions_->current();
+    int level = 1;
+    while (level < config::kNumLevels) {
+      if (current->OverlapInLevel(level, begin, end)) {
+        deepest_overlap = level;
+      }
+      level++;
+    }
+  }
+  TEST_CompactMemTable(); // TODO(sanjay): Skip if memtable does not overlap
+  int level = 0;
+  while (level < deepest_overlap) {
+    TEST_CompactRange(level, begin, end);
+    level++;
+  }
+}
+
+// Runs a manual compaction of [*begin, *end] at the given level and blocks
+// until it is done, the DB starts shutting down, or a background error is
+// recorded. A NULL bound leaves that side of the range open.
+void DBImpl::TEST_CompactRange(int level, const Slice* begin, const Slice* end) {
+  assert(level >= 0);
+  assert(level + 1 < config::kNumLevels);
+
+  InternalKey begin_storage, end_storage;
+
+  ManualCompaction manual;
+  manual.level = level;
+  manual.done = false;
+  manual.begin = NULL;
+  manual.end = NULL;
+  if (begin != NULL) {
+    begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek);
+    manual.begin = &begin_storage;
+  }
+  if (end != NULL) {
+    end_storage = InternalKey(*end, 0, static_cast<ValueType>(0));
+    manual.end = &end_storage;
+  }
+
+  MutexLock l(&mutex_);
+  for (;;) {
+    if (manual.done || shutting_down_.Acquire_Load() || !bg_error_.ok()) {
+      break;
+    }
+    if (manual_compaction_ != NULL) {
+      // Running either my compaction or another compaction.
+      bg_cv_.Wait();
+      continue;
+    }
+    // Idle: register this request and kick the background thread.
+    manual_compaction_ = &manual;
+    MaybeScheduleCompaction();
+  }
+  if (manual_compaction_ == &manual) {
+    // Cancel my manual compaction since we aborted early for some reason.
+    manual_compaction_ = NULL;
+  }
+}
+
+// Waits for earlier writes to finish (via a NULL-batch Write) and then for
+// the immutable memtable to be compacted. Returns the Write error, or the
+// background error if imm_ is still present when waiting stops.
+Status DBImpl::TEST_CompactMemTable() {
+  // NULL batch means just wait for earlier writes to be done
+  Status result = Write(WriteOptions(), NULL);
+  if (!result.ok()) {
+    return result;
+  }
+
+  // Wait until the compaction completes
+  MutexLock l(&mutex_);
+  while (imm_ != NULL && bg_error_.ok()) {
+    bg_cv_.Wait();
+  }
+  if (imm_ != NULL) {
+    result = bg_error_;
+  }
+  return result;
+}
+
+void DBImpl::RecordBackgroundError(const Status& s) {
+ mutex_.AssertHeld();
+ if (bg_error_.ok()) {
+ bg_error_ = s;
+ bg_cv_.SignalAll();
+ }
+}
+
+// Schedules BGWork on the env's background thread unless a compaction is
+// already scheduled, the DB is shutting down, a background error exists,
+// or there is nothing to compact. Requires: mutex_ is held.
+void DBImpl::MaybeScheduleCompaction() {
+  mutex_.AssertHeld();
+  if (bg_compaction_scheduled_) {
+    return;  // Already scheduled
+  }
+  if (shutting_down_.Acquire_Load()) {
+    return;  // DB is being deleted; no more background compactions
+  }
+  if (!bg_error_.ok()) {
+    return;  // Already got an error; no more changes
+  }
+  const bool nothing_to_do = imm_ == NULL &&
+                             manual_compaction_ == NULL &&
+                             !versions_->NeedsCompaction();
+  if (nothing_to_do) {
+    return;  // No work to be done
+  }
+  bg_compaction_scheduled_ = true;
+  env_->Schedule(&DBImpl::BGWork, this);
+}
+
+// Trampoline passed to Env::Schedule: db is the DBImpl that scheduled it.
+void DBImpl::BGWork(void* db) {
+  DBImpl* const self = reinterpret_cast<DBImpl*>(db);
+  self->BackgroundCall();
+}
+
+// Body of one scheduled background task: runs a compaction unless the DB
+// is shutting down or has a background error, clears the scheduled flag,
+// reschedules if more work is pending, and wakes all waiters on bg_cv_.
+void DBImpl::BackgroundCall() {
+  MutexLock l(&mutex_);
+  assert(bg_compaction_scheduled_);
+
+  // No more background work when shutting down or after a background error.
+  const bool may_compact = !shutting_down_.Acquire_Load() && bg_error_.ok();
+  if (may_compact) {
+    BackgroundCompaction();
+  }
+
+  bg_compaction_scheduled_ = false;
+
+  // Previous compaction may have produced too many files in a level,
+  // so reschedule another compaction if needed.
+  MaybeScheduleCompaction();
+  bg_cv_.SignalAll();
+}
+
+void DBImpl::BackgroundCompaction() { // Performs one unit of background work: memtable flush, manual compaction step, or picked compaction.
+ mutex_.AssertHeld();
+
+ if (imm_ != NULL) { // A pending immutable memtable takes priority over table compactions.
+ CompactMemTable();
+ return;
+ }
+
+ Compaction* c;
+ bool is_manual = (manual_compaction_ != NULL);
+ InternalKey manual_end; // Largest key of the level inputs chosen for this manual step.
+ if (is_manual) {
+ ManualCompaction* m = manual_compaction_;
+ c = versions_->CompactRange(m->level, m->begin, m->end);
+ m->done = (c == NULL); // No compaction returned means nothing is left in the requested range.
+ if (c != NULL) {
+ manual_end = c->input(0, c->num_input_files(0) - 1)->largest;
+ }
+ Log(options_.info_log,
+ "Manual compaction at level-%d from %s .. %s; will stop at %s\n",
+ m->level,
+ (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
+ (m->end ? m->end->DebugString().c_str() : "(end)"),
+ (m->done ? "(end)" : manual_end.DebugString().c_str()));
+ } else {
+ c = versions_->PickCompaction();
+ }
+
+ Status status;
+ if (c == NULL) {
+ // Nothing to do
+ } else if (!is_manual && c->IsTrivialMove()) { // Manual compactions never take the trivial-move shortcut.
+ // Move file to next level
+ assert(c->num_input_files(0) == 1);
+ FileMetaData* f = c->input(0, 0);
+ c->edit()->DeleteFile(c->level(), f->number);
+ c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
+ f->smallest, f->largest);
+ status = versions_->LogAndApply(c->edit(), &mutex_);
+ if (!status.ok()) {
+ RecordBackgroundError(status);
+ }
+ VersionSet::LevelSummaryStorage tmp;
+ Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
+ static_cast<unsigned long long>(f->number),
+ c->level() + 1,
+ static_cast<unsigned long long>(f->file_size),
+ status.ToString().c_str(),
+ versions_->LevelSummary(&tmp));
+ } else {
+ CompactionState* compact = new CompactionState(c); // Deleted by CleanupCompaction() below.
+ status = DoCompactionWork(compact);
+ if (!status.ok()) {
+ RecordBackgroundError(status);
+ }
+ CleanupCompaction(compact);
+ c->ReleaseInputs();
+ DeleteObsoleteFiles();
+ }
+ delete c;
+
+ if (status.ok()) {
+ // Done
+ } else if (shutting_down_.Acquire_Load()) {
+ // Ignore compaction errors found during shutting down
+ } else {
+ Log(options_.info_log,
+ "Compaction error: %s", status.ToString().c_str());
+ }
+
+ if (is_manual) {
+ ManualCompaction* m = manual_compaction_;
+ if (!status.ok()) {
+ m->done = true; // Give up on the manual request after an error.
+ }
+ if (!m->done) {
+ // We only compacted part of the requested range. Update *m
+ // to the range that is left to be compacted.
+ m->tmp_storage = manual_end;
+ m->begin = &m->tmp_storage;
+ }
+ manual_compaction_ = NULL; // Lets the waiter in TEST_CompactRange() re-register if it is not done.
+ }
+}
+
+// Releases everything owned by *compact: abandons and deletes an unfinished
+// builder, deletes the output file object, removes all of its output
+// numbers from pending_outputs_, and finally deletes compact itself.
+// Requires: mutex_ is held.
+void DBImpl::CleanupCompaction(CompactionState* compact) {
+  mutex_.AssertHeld();
+  if (compact->builder == NULL) {
+    assert(compact->outfile == NULL);
+  } else {
+    // May happen if we get a shutdown call in the middle of compaction
+    compact->builder->Abandon();
+    delete compact->builder;
+  }
+  delete compact->outfile;
+
+  typedef std::vector<CompactionState::Output>::const_iterator OutputIter;
+  for (OutputIter it = compact->outputs.begin();
+       it != compact->outputs.end(); ++it) {
+    pending_outputs_.erase(it->number);
+  }
+  delete compact;
+}
+
+// Allocates a new file number, registers it as a pending output and as a
+// new entry in compact->outputs, then creates the table file and a
+// TableBuilder for it. Acquires mutex_ internally for the bookkeeping.
+Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) {
+  assert(compact != NULL);
+  assert(compact->builder == NULL);
+  uint64_t file_number;
+  {
+    MutexLock l(&mutex_);
+    file_number = versions_->NewFileNumber();
+    pending_outputs_.insert(file_number);
+    CompactionState::Output fresh;
+    fresh.number = file_number;
+    fresh.smallest.Clear();
+    fresh.largest.Clear();
+    compact->outputs.push_back(fresh);
+  }
+
+  // Make the output file
+  const std::string fname = TableFileName(dbname_, file_number);
+  Status s = env_->NewWritableFile(fname, &compact->outfile);
+  if (!s.ok()) {
+    return s;
+  }
+  compact->builder = new TableBuilder(options_, compact->outfile);
+  return s;
+}
+
+Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, // Finalizes the current output table, syncs/closes its file and verifies it can be opened.
+ Iterator* input) { // Only consulted for its error status.
+ assert(compact != NULL);
+ assert(compact->outfile != NULL);
+ assert(compact->builder != NULL);
+
+ const uint64_t output_number = compact->current_output()->number;
+ assert(output_number != 0);
+
+ // Check for iterator errors
+ Status s = input->status();
+ const uint64_t current_entries = compact->builder->NumEntries(); // Captured before the builder is deleted below.
+ if (s.ok()) {
+ s = compact->builder->Finish();
+ } else {
+ compact->builder->Abandon(); // Input failed: discard the partially built table.
+ }
+ const uint64_t current_bytes = compact->builder->FileSize();
+ compact->current_output()->file_size = current_bytes;
+ compact->total_bytes += current_bytes;
+ delete compact->builder;
+ compact->builder = NULL;
+
+ // Finish and check for file errors
+ if (s.ok()) {
+ s = compact->outfile->Sync();
+ }
+ if (s.ok()) {
+ s = compact->outfile->Close();
+ }
+ delete compact->outfile; // Deleted regardless of the status.
+ compact->outfile = NULL;
+
+ if (s.ok() && current_entries > 0) {
+ // Verify that the table is usable
+ Iterator* iter = table_cache_->NewIterator(ReadOptions(),
+ output_number,
+ current_bytes);
+ s = iter->status();
+ delete iter;
+ if (s.ok()) {
+ Log(options_.info_log,
+ "Generated table #%llu: %lld keys, %lld bytes",
+ (unsigned long long) output_number,
+ (unsigned long long) current_entries,
+ (unsigned long long) current_bytes);
+ }
+ }
+ return s;
+}
+
+
+// Records the outcome of a finished compaction in its VersionEdit (input
+// files deleted, every output added at level + 1) and applies the edit.
+// Requires: mutex_ is held.
+Status DBImpl::InstallCompactionResults(CompactionState* compact) {
+  mutex_.AssertHeld();
+  Compaction* const c = compact->compaction;
+  Log(options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes",
+      c->num_input_files(0),
+      c->level(),
+      c->num_input_files(1),
+      c->level() + 1,
+      static_cast<long long>(compact->total_bytes));
+
+  // Add compaction outputs
+  c->AddInputDeletions(c->edit());
+  const int output_level = c->level() + 1;
+  for (size_t idx = 0; idx < compact->outputs.size(); idx++) {
+    const CompactionState::Output& produced = compact->outputs[idx];
+    c->edit()->AddFile(output_level, produced.number, produced.file_size,
+                       produced.smallest, produced.largest);
+  }
+  return versions_->LogAndApply(c->edit(), &mutex_);
+}
+
+Status DBImpl::DoCompactionWork(CompactionState* compact) { // Merges the compaction inputs into new output tables and installs them; mutex_ is released during the merge and held again on return.
+ const uint64_t start_micros = env_->NowMicros();
+ int64_t imm_micros = 0; // Micros spent doing imm_ compactions
+
+ Log(options_.info_log, "Compacting %d@%d + %d@%d files",
+ compact->compaction->num_input_files(0),
+ compact->compaction->level(),
+ compact->compaction->num_input_files(1),
+ compact->compaction->level() + 1);
+
+ assert(versions_->NumLevelFiles(compact->compaction->level()) > 0);
+ assert(compact->builder == NULL);
+ assert(compact->outfile == NULL);
+ if (snapshots_.empty()) { // With no snapshots, everything up to the last sequence is droppable.
+ compact->smallest_snapshot = versions_->LastSequence();
+ } else {
+ compact->smallest_snapshot = snapshots_.oldest()->number_;
+ }
+
+ // Release mutex while we're actually doing the compaction work
+ mutex_.Unlock();
+
+ Iterator* input = versions_->MakeInputIterator(compact->compaction);
+ input->SeekToFirst();
+ Status status;
+ ParsedInternalKey ikey;
+ std::string current_user_key;
+ bool has_current_user_key = false;
+ SequenceNumber last_sequence_for_key = kMaxSequenceNumber; // Sequence of the previous entry seen for current_user_key.
+ for (; input->Valid() && !shutting_down_.Acquire_Load(); ) {
+ // Prioritize immutable compaction work
+ if (has_imm_.NoBarrier_Load() != NULL) { // Cheap unlocked check; confirmed under the mutex below.
+ const uint64_t imm_start = env_->NowMicros();
+ mutex_.Lock();
+ if (imm_ != NULL) {
+ CompactMemTable();
+ bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary
+ }
+ mutex_.Unlock();
+ imm_micros += (env_->NowMicros() - imm_start);
+ }
+
+ Slice key = input->key();
+ if (compact->compaction->ShouldStopBefore(key) &&
+ compact->builder != NULL) { // Close the current output before this key if the compaction asks for it.
+ status = FinishCompactionOutputFile(compact, input);
+ if (!status.ok()) {
+ break;
+ }
+ }
+
+ // Handle key/value, add to state, etc.
+ bool drop = false;
+ if (!ParseInternalKey(key, &ikey)) {
+ // Do not hide error keys
+ current_user_key.clear();
+ has_current_user_key = false;
+ last_sequence_for_key = kMaxSequenceNumber;
+ } else {
+ if (!has_current_user_key ||
+ user_comparator()->Compare(ikey.user_key,
+ Slice(current_user_key)) != 0) {
+ // First occurrence of this user key
+ current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
+ has_current_user_key = true;
+ last_sequence_for_key = kMaxSequenceNumber; // Guarantees the first entry for a key is not dropped by rule (A).
+ }
+
+ if (last_sequence_for_key <= compact->smallest_snapshot) {
+ // Hidden by a newer entry for same user key
+ drop = true; // (A)
+ } else if (ikey.type == kTypeDeletion &&
+ ikey.sequence <= compact->smallest_snapshot &&
+ compact->compaction->IsBaseLevelForKey(ikey.user_key)) {
+ // For this user key:
+ // (1) there is no data in higher levels
+ // (2) data in lower levels will have larger sequence numbers
+ // (3) data in layers that are being compacted here and have
+ // smaller sequence numbers will be dropped in the next
+ // few iterations of this loop (by rule (A) above).
+ // Therefore this deletion marker is obsolete and can be dropped.
+ drop = true;
+ }
+
+ last_sequence_for_key = ikey.sequence;
+ }
+#if 0
+ Log(options_.info_log,
+ " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
+ "%d smallest_snapshot: %d",
+ ikey.user_key.ToString().c_str(),
+ (int)ikey.sequence, ikey.type, kTypeValue, drop,
+ compact->compaction->IsBaseLevelForKey(ikey.user_key),
+ (int)last_sequence_for_key, (int)compact->smallest_snapshot);
+#endif
+
+ if (!drop) {
+ // Open output file if necessary
+ if (compact->builder == NULL) {
+ status = OpenCompactionOutputFile(compact);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ if (compact->builder->NumEntries() == 0) { // First key written to this output.
+ compact->current_output()->smallest.DecodeFrom(key);
+ }
+ compact->current_output()->largest.DecodeFrom(key); // Updated on every added key.
+ compact->builder->Add(key, input->value());
+
+ // Close output file if it is big enough
+ if (compact->builder->FileSize() >=
+ compact->compaction->MaxOutputFileSize()) {
+ status = FinishCompactionOutputFile(compact, input);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ }
+
+ input->Next();
+ }
+
+ if (status.ok() && shutting_down_.Acquire_Load()) {
+ status = Status::IOError("Deleting DB during compaction");
+ }
+ if (status.ok() && compact->builder != NULL) { // Flush the last, partially filled output.
+ status = FinishCompactionOutputFile(compact, input);
+ }
+ if (status.ok()) {
+ status = input->status();
+ }
+ delete input;
+ input = NULL;
+
+ CompactionStats stats;
+ stats.micros = env_->NowMicros() - start_micros - imm_micros; // Excludes time spent flushing imm_.
+ for (int which = 0; which < 2; which++) {
+ for (int i = 0; i < compact->compaction->num_input_files(which); i++) {
+ stats.bytes_read += compact->compaction->input(which, i)->file_size;
+ }
+ }
+ for (size_t i = 0; i < compact->outputs.size(); i++) {
+ stats.bytes_written += compact->outputs[i].file_size;
+ }
+
+ mutex_.Lock(); // Re-acquired here; the function returns with mutex_ held.
+ stats_[compact->compaction->level() + 1].Add(stats);
+
+ if (status.ok()) {
+ status = InstallCompactionResults(compact);
+ }
+ if (!status.ok()) {
+ RecordBackgroundError(status);
+ }
+ VersionSet::LevelSummaryStorage tmp;
+ Log(options_.info_log,
+ "compacted to: %s", versions_->LevelSummary(&tmp));
+ return status;
+}
+
+namespace {
+// References held on behalf of an internal iterator.  An instance is
+// handed to CleanupIteratorState() as its first argument.
+struct IterState {
+  port::Mutex* mu;
+  Version* version;
+  MemTable* mem;
+  MemTable* imm;
+};
+
+// Iterator cleanup callback.  "arg1" must point at a heap-allocated
+// IterState; "arg2" is ignored.  With the state's mutex held, drops the
+// references on the memtable(s) and the version, then frees the state.
+static void CleanupIteratorState(void* arg1, void* arg2) {
+  IterState* const pinned = reinterpret_cast<IterState*>(arg1);
+  port::Mutex* const mu = pinned->mu;
+  mu->Lock();
+  pinned->mem->Unref();
+  if (pinned->imm != NULL) {
+    pinned->imm->Unref();
+  }
+  pinned->version->Unref();
+  mu->Unlock();
+  delete pinned;
+}
+}  // namespace
+
+// Builds a merging iterator over the mutable memtable, the immutable
+// memtable (if any) and the current version.  Stores the last sequence
+// number in *latest_snapshot and a freshly incremented seed in *seed.
+// Everything after the allocation runs with mutex_ held.
+Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
+                                      SequenceNumber* latest_snapshot,
+                                      uint32_t* seed) {
+  IterState* const pinned = new IterState;
+  mutex_.Lock();
+  *latest_snapshot = versions_->LastSequence();
+
+  Version* const current = versions_->current();
+
+  // One child iterator per source; each source is Ref()'d here and
+  // Unref()'d later by CleanupIteratorState().
+  std::vector<Iterator*> children;
+  children.push_back(mem_->NewIterator());
+  mem_->Ref();
+  if (imm_ != NULL) {
+    children.push_back(imm_->NewIterator());
+    imm_->Ref();
+  }
+  current->AddIterators(options, &children);
+  Iterator* const merged =
+      NewMergingIterator(&internal_comparator_, &children[0], children.size());
+  current->Ref();
+
+  pinned->mu = &mutex_;
+  pinned->mem = mem_;
+  pinned->imm = imm_;
+  pinned->version = current;
+  merged->RegisterCleanup(CleanupIteratorState, pinned, NULL);
+
+  *seed = ++seed_;
+  mutex_.Unlock();
+  return merged;
+}
+
+// Test hook: returns an internal iterator built with default read options.
+// The snapshot and seed outputs of NewInternalIterator() are discarded.
+Iterator* DBImpl::TEST_NewInternalIterator() {
+  SequenceNumber unused_snapshot;
+  uint32_t unused_seed;
+  ReadOptions default_options = ReadOptions();
+  return NewInternalIterator(default_options, &unused_snapshot, &unused_seed);
+}
+
+// Test hook: forwards to VersionSet::MaxNextLevelOverlappingBytes() while
+// holding mutex_.
+int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() {
+  MutexLock lock(&mutex_);
+  const int64_t overlap_bytes = versions_->MaxNextLevelOverlappingBytes();
+  return overlap_bytes;
+}
+
+// Looks up "key" as of options.snapshot (or the latest sequence number when
+// no snapshot is given).  Consults mem_, then imm_, then the current
+// version, and returns the status produced by whichever source answered.
+Status DBImpl::Get(const ReadOptions& options,
+                   const Slice& key,
+                   std::string* value) {
+  Status result;
+  MutexLock lock(&mutex_);
+
+  const SequenceNumber snapshot =
+      (options.snapshot != NULL)
+          ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
+          : versions_->LastSequence();
+
+  // Take a reference on everything that is read after the mutex is dropped.
+  MemTable* const mem = mem_;
+  MemTable* const imm = imm_;
+  Version* const current = versions_->current();
+  mem->Ref();
+  if (imm != NULL) {
+    imm->Ref();
+  }
+  current->Ref();
+
+  bool searched_version = false;
+  Version::GetStats stats;
+
+  // Unlock while reading from files and memtables
+  {
+    mutex_.Unlock();
+    LookupKey lkey(key, snapshot);
+    // Short-circuit order: memtable first, then the immutable memtable.
+    const bool found_in_memory =
+        mem->Get(lkey, value, &result) ||
+        (imm != NULL && imm->Get(lkey, value, &result));
+    if (!found_in_memory) {
+      result = current->Get(options, lkey, value, &stats);
+      searched_version = true;
+    }
+    mutex_.Lock();
+  }
+
+  // Only a lookup that reached the version produces stats to apply.
+  if (searched_version && current->UpdateStats(stats)) {
+    MaybeScheduleCompaction();
+  }
+  mem->Unref();
+  if (imm != NULL) {
+    imm->Unref();
+  }
+  current->Unref();
+  return result;
+}
+
+// Wraps a fresh internal iterator in a DB iterator that reads as of
+// options.snapshot when one is supplied, and otherwise as of the sequence
+// number reported by NewInternalIterator().
+Iterator* DBImpl::NewIterator(const ReadOptions& options) {
+  SequenceNumber latest_snapshot;
+  uint32_t seed;
+  Iterator* const internal_iter =
+      NewInternalIterator(options, &latest_snapshot, &seed);
+
+  SequenceNumber read_sequence = latest_snapshot;
+  if (options.snapshot != NULL) {
+    read_sequence =
+        reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_;
+  }
+  return NewDBIterator(this, user_comparator(), internal_iter, read_sequence,
+                       seed);
+}
+
+// Passes a read sample for "key" to the current version (with mutex_ held)
+// and calls MaybeScheduleCompaction() when the version returns true.
+void DBImpl::RecordReadSample(Slice key) {
+  MutexLock lock(&mutex_);
+  const bool triggered = versions_->current()->RecordReadSample(key);
+  if (!triggered) {
+    return;
+  }
+  MaybeScheduleCompaction();
+}
+
+// Registers a new snapshot at the last sequence number, under mutex_.
+const Snapshot* DBImpl::GetSnapshot() {
+  MutexLock lock(&mutex_);
+  const SequenceNumber sequence = versions_->LastSequence();
+  return snapshots_.New(sequence);
+}
+
+// Removes snapshot "s" from snapshots_, under mutex_.
+void DBImpl::ReleaseSnapshot(const Snapshot* s) {
+  const SnapshotImpl* const impl = reinterpret_cast<const SnapshotImpl*>(s);
+  MutexLock lock(&mutex_);
+  snapshots_.Delete(impl);
+}
+
+// Convenience methods
+// Put() simply forwards to the default implementation in DB.
+Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) {
+  Status result = DB::Put(o, key, val);
+  return result;
+}
+
+// Delete() simply forwards to the default implementation in DB.
+Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
+  Status result = DB::Delete(options, key);
+  return result;
+}
+
+// Applies "my_batch" to the database. A NULL "my_batch" skips the write
+// itself and only calls MakeRoomForWrite() in forced mode.
+//
+// The calling thread enqueues a Writer on writers_ and waits until it is
+// either at the front of the queue or has been marked done by another
+// writer. The front writer builds a batch group, appends it to the log
+// (syncing when options.sync is set), inserts it into mem_, and then
+// completes every writer covered by the group.
+//
+// Returns the status of the operation that covered this writer.
+Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
+ Writer w(&mutex_);
+ w.batch = my_batch;
+ w.sync = options.sync;
+ w.done = false;
+
+ MutexLock l(&mutex_);
+ writers_.push_back(&w);
+ // Wait until this writer heads the queue or someone else finished it.
+ while (!w.done && &w != writers_.front()) {
+ w.cv.Wait();
+ }
+ if (w.done) {
+ // Another writer already set w.status and w.done on our behalf.
+ return w.status;
+ }
+
+ // May temporarily unlock and wait.
+ Status status = MakeRoomForWrite(my_batch == NULL);
+ uint64_t last_sequence = versions_->LastSequence();
+ Writer* last_writer = &w;
+ if (status.ok() && my_batch != NULL) { // NULL batch is for compactions
+ WriteBatch* updates = BuildBatchGroup(&last_writer);
+ // The group starts at last_sequence + 1 and advances last_sequence by
+ // its entry count.
+ WriteBatchInternal::SetSequence(updates, last_sequence + 1);
+ last_sequence += WriteBatchInternal::Count(updates);
+
+ // Add to log and apply to memtable. We can release the lock
+ // during this phase since &w is currently responsible for logging
+ // and protects against concurrent loggers and concurrent writes
+ // into mem_.
+ {
+ mutex_.Unlock();
+ status = log_->AddRecord(WriteBatchInternal::Contents(updates));
+ bool sync_error = false;
+ if (status.ok() && options.sync) {
+ status = logfile_->Sync();
+ if (!status.ok()) {
+ sync_error = true;
+ }
+ }
+ if (status.ok()) {
+ status = WriteBatchInternal::InsertInto(updates, mem_);
+ }
+ mutex_.Lock();
+ if (sync_error) {
+ // The state of the log file is indeterminate: the log record we
+ // just added may or may not show up when the DB is re-opened.
+ // So we force the DB into a mode where all future writes fail.
+ RecordBackgroundError(status);
+ }
+ }
+ // BuildBatchGroup() returns tmp_batch_ when it merged several batches;
+ // empty it so it can be reused.
+ if (updates == tmp_batch_) tmp_batch_->Clear();
+
+ versions_->SetLastSequence(last_sequence);
+ }
+
+ // Dequeue every writer up to and including last_writer. Writers other
+ // than this thread receive the shared status and are woken up.
+ while (true) {
+ Writer* ready = writers_.front();
+ writers_.pop_front();
+ if (ready != &w) {
+ ready->status = status;
+ ready->done = true;
+ ready->cv.Signal();
+ }
+ if (ready == last_writer) break;
+ }
+
+ // Notify new head of write queue
+ if (!writers_.empty()) {
+ writers_.front()->cv.Signal();
+ }
+
+ return status;
+}
+
+// Starting from the writer at the front of writers_, collects the batches
+// of the writers queued behind it into a single group and returns it.
+// The result is the first writer's own batch when nothing was merged, and
+// tmp_batch_ (holding the first batch followed by the appended ones)
+// otherwise. *last_writer is set to the last writer covered by the group;
+// writers with a NULL batch are covered without contributing any data.
+//
+// REQUIRES: Writer list must be non-empty
+// REQUIRES: First writer must have a non-NULL batch
+WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
+ assert(!writers_.empty());
+ Writer* first = writers_.front();
+ WriteBatch* result = first->batch;
+ assert(result != NULL);
+
+ size_t size = WriteBatchInternal::ByteSize(first->batch);
+
+ // Allow the group to grow up to a maximum size, but if the
+ // original write is small, limit the growth so we do not slow
+ // down the small write too much.
+ size_t max_size = 1 << 20;
+ if (size <= (128<<10)) {
+ max_size = size + (128<<10);
+ }
+
+ *last_writer = first;
+ std::deque<Writer*>::iterator iter = writers_.begin();
+ ++iter; // Advance past "first"
+ for (; iter != writers_.end(); ++iter) {
+ Writer* w = *iter;
+ if (w->sync && !first->sync) {
+ // Do not include a sync write into a batch handled by a non-sync write.
+ break;
+ }
+
+ if (w->batch != NULL) {
+ size += WriteBatchInternal::ByteSize(w->batch);
+ if (size > max_size) {
+ // Do not make batch too big
+ break;
+ }
+
+ // Append to *result
+ if (result == first->batch) {
+ // Switch to temporary batch instead of disturbing caller's batch
+ result = tmp_batch_;
+ assert(WriteBatchInternal::Count(result) == 0);
+ WriteBatchInternal::Append(result, first->batch);
+ }
+ WriteBatchInternal::Append(result, w->batch);
+ }
+ *last_writer = w;
+ }
+ return result;
+}
+
+// Loops until the current memtable can accept a write, a background error
+// is pending, or creating a new log file fails. Depending on state, each
+// pass may sleep for 1ms (at most once per call), wait on bg_cv_, or switch
+// to a new log file and memtable and schedule a compaction.
+// "force" requests a memtable switch even when the current one has room.
+// Returns bg_error_ if it is set, the file-creation error if the switch
+// fails, and an OK status otherwise.
+//
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+Status DBImpl::MakeRoomForWrite(bool force) {
+ mutex_.AssertHeld();
+ assert(!writers_.empty());
+ bool allow_delay = !force;
+ Status s;
+ while (true) {
+ if (!bg_error_.ok()) {
+ // Yield previous error
+ s = bg_error_;
+ break;
+ } else if (
+ allow_delay &&
+ versions_->NumLevelFiles(0) >= config::kL0_SlowdownWritesTrigger) {
+ // We are getting close to hitting a hard limit on the number of
+ // L0 files. Rather than delaying a single write by several
+ // seconds when we hit the hard limit, start delaying each
+ // individual write by 1ms to reduce latency variance. Also,
+ // this delay hands over some CPU to the compaction thread in
+ // case it is sharing the same core as the writer.
+ mutex_.Unlock();
+ env_->SleepForMicroseconds(1000);
+ allow_delay = false; // Do not delay a single write more than once
+ mutex_.Lock();
+ } else if (!force &&
+ (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
+ // There is room in current memtable
+ break;
+ } else if (imm_ != NULL) {
+ // We have filled up the current memtable, but the previous
+ // one is still being compacted, so we wait.
+ Log(options_.info_log, "Current memtable full; waiting...\n");
+ bg_cv_.Wait();
+ } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) {
+ // There are too many level-0 files.
+ Log(options_.info_log, "Too many L0 files; waiting...\n");
+ bg_cv_.Wait();
+ } else {
+ // Attempt to switch to a new memtable and trigger compaction of old
+ assert(versions_->PrevLogNumber() == 0);
+ uint64_t new_log_number = versions_->NewFileNumber();
+ WritableFile* lfile = NULL;
+ s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile);
+ if (!s.ok()) {
+ // Avoid chewing through file number space in a tight loop.
+ versions_->ReuseFileNumber(new_log_number);
+ break;
+ }
+ // Replace the log writer and file with the new ones, then move the
+ // current memtable to imm_ and start a fresh mem_.
+ delete log_;
+ delete logfile_;
+ logfile_ = lfile;
+ logfile_number_ = new_log_number;
+ log_ = new log::Writer(lfile);
+ imm_ = mem_;
+ has_imm_.Release_Store(imm_);
+ mem_ = new MemTable(internal_comparator_);
+ mem_->Ref();
+ force = false; // Do not force another compaction if have room
+ MaybeScheduleCompaction();
+ }
+ }
+ return s;
+}
+
+// Answers queries for properties named "leveldb.<name>".  Supported names:
+//   num-files-at-level<N>  -- file count at level N
+//   stats                  -- per-level compaction statistics table
+//   sstables               -- DebugString() of the current version
+// *value is always cleared first.  Returns false, leaving *value empty, for
+// any other property or for a malformed/out-of-range level.
+bool DBImpl::GetProperty(const Slice& property, std::string* value) {
+  value->clear();
+
+  MutexLock lock(&mutex_);
+  Slice rest = property;
+  Slice prefix("leveldb.");
+  if (!rest.starts_with(prefix)) {
+    return false;
+  }
+  rest.remove_prefix(prefix.size());
+
+  if (rest.starts_with("num-files-at-level")) {
+    rest.remove_prefix(strlen("num-files-at-level"));
+    uint64_t level;
+    // The remainder must be exactly one decimal number.
+    const bool parsed = ConsumeDecimalNumber(&rest, &level) && rest.empty();
+    if (!parsed || level >= config::kNumLevels) {
+      return false;
+    }
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%d",
+             versions_->NumLevelFiles(static_cast<int>(level)));
+    *value = buf;
+    return true;
+  }
+
+  if (rest == "stats") {
+    char buf[200];
+    snprintf(buf, sizeof(buf),
+             " Compactions\n"
+             "Level Files Size(MB) Time(sec) Read(MB) Write(MB)\n"
+             "--------------------------------------------------\n"
+             );
+    value->append(buf);
+    for (int level = 0; level < config::kNumLevels; level++) {
+      const int files = versions_->NumLevelFiles(level);
+      // Levels with no files and no recorded compaction time are omitted.
+      if (stats_[level].micros <= 0 && files <= 0) {
+        continue;
+      }
+      snprintf(
+          buf, sizeof(buf),
+          "%3d %8d %8.0f %9.0f %8.0f %9.0f\n",
+          level,
+          files,
+          versions_->NumLevelBytes(level) / 1048576.0,
+          stats_[level].micros / 1e6,
+          stats_[level].bytes_read / 1048576.0,
+          stats_[level].bytes_written / 1048576.0);
+      value->append(buf);
+    }
+    return true;
+  }
+
+  if (rest == "sstables") {
+    *value = versions_->current()->DebugString();
+    return true;
+  }
+
+  return false;
+}
+
+// For each of the "n" ranges, stores in sizes[i] the difference between the
+// approximate offsets of range[i].limit and range[i].start within the
+// current version (0 if the limit offset is below the start offset).
+void DBImpl::GetApproximateSizes(
+    const Range* range, int n,
+    uint64_t* sizes) {
+  // TODO(opt): better implementation
+  Version* pinned;
+  {
+    MutexLock lock(&mutex_);
+    pinned = versions_->current();
+    pinned->Ref();
+  }
+
+  for (int idx = 0; idx < n; idx++) {
+    const Range& r = range[idx];
+    // Convert each user key into a corresponding internal key.
+    InternalKey start_key(r.start, kMaxSequenceNumber, kValueTypeForSeek);
+    InternalKey limit_key(r.limit, kMaxSequenceNumber, kValueTypeForSeek);
+    const uint64_t start = versions_->ApproximateOffsetOf(pinned, start_key);
+    const uint64_t limit = versions_->ApproximateOffsetOf(pinned, limit_key);
+    if (limit >= start) {
+      sizes[idx] = limit - start;
+    } else {
+      sizes[idx] = 0;
+    }
+  }
+
+  {
+    MutexLock lock(&mutex_);
+    pinned->Unref();
+  }
+}
+
+// Default implementations of convenience methods that subclasses of DB
+// can call if they wish
+
+// Wraps a single Put in a WriteBatch and submits it through Write().
+Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
+  WriteBatch single_put;
+  single_put.Put(key, value);
+  return Write(opt, &single_put);
+}
+
+// Wraps a single Delete in a WriteBatch and submits it through Write().
+Status DB::Delete(const WriteOptions& opt, const Slice& key) {
+  WriteBatch single_delete;
+  single_delete.Delete(key);
+  return Write(opt, &single_delete);
+}
+
+// Out-of-line definition of the (empty) DB destructor.
+DB::~DB() {
+}
+
+// Opens the database "dbname".  On success stores a heap-allocated DBImpl
+// in *dbptr; on failure *dbptr stays NULL and the partially constructed
+// DBImpl is destroyed.  Recovery, creation of the new log file and
+// installation of the recovered edit all happen with the DBImpl's mutex
+// held.
+Status DB::Open(const Options& options, const std::string& dbname,
+                DB** dbptr) {
+  *dbptr = NULL;
+
+  DBImpl* const db = new DBImpl(options, dbname);
+  db->mutex_.Lock();
+  VersionEdit edit;
+  // Recover() handles create_if_missing, error_if_exists
+  Status status = db->Recover(&edit);
+  if (status.ok()) {
+    const uint64_t log_number = db->versions_->NewFileNumber();
+    WritableFile* lfile;
+    status = options.env->NewWritableFile(LogFileName(dbname, log_number),
+                                          &lfile);
+    if (status.ok()) {
+      edit.SetLogNumber(log_number);
+      db->logfile_ = lfile;
+      db->logfile_number_ = log_number;
+      db->log_ = new log::Writer(lfile);
+      status = db->versions_->LogAndApply(&edit, &db->mutex_);
+    }
+    if (status.ok()) {
+      db->DeleteObsoleteFiles();
+      db->MaybeScheduleCompaction();
+    }
+  }
+  db->mutex_.Unlock();
+
+  if (!status.ok()) {
+    delete db;
+    return status;
+  }
+  *dbptr = db;
+  return status;
+}
+
+// Out-of-line definition of the (empty) Snapshot destructor.
+Snapshot::~Snapshot() { }
+
+// Deletes the files in directory "dbname" whose names ParseFileName()
+// recognizes, then the lock file, then attempts to remove the directory.
+// Returns OK immediately if the directory listing is empty, the LockFile()
+// error if the lock cannot be taken, and otherwise the first DeleteFile()
+// failure (or OK if there was none).
+Status DestroyDB(const std::string& dbname, const Options& options) {
+  Env* const env = options.env;
+  std::vector<std::string> filenames;
+  // Ignore error in case directory does not exist
+  env->GetChildren(dbname, &filenames);
+  if (filenames.empty()) {
+    return Status::OK();
+  }
+
+  FileLock* lock;
+  const std::string lockname = LockFileName(dbname);
+  Status result = env->LockFile(lockname, &lock);
+  if (!result.ok()) {
+    return result;
+  }
+
+  uint64_t number;
+  FileType type;
+  for (std::vector<std::string>::const_iterator it = filenames.begin();
+       it != filenames.end(); ++it) {
+    if (!ParseFileName(*it, &number, &type) || type == kDBLockFile) {
+      continue;  // Unrecognized name, or the lock file (deleted at end)
+    }
+    const Status del = env->DeleteFile(dbname + "/" + *it);
+    if (result.ok() && !del.ok()) {
+      result = del;
+    }
+  }
+  env->UnlockFile(lock);  // Ignore error since state is already gone
+  env->DeleteFile(lockname);
+  env->DeleteDir(dbname);  // Ignore error in case dir contains other files
+  return result;
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/db_impl.h b/src/leveldb/db/db_impl.h
new file mode 100644
index 0000000000..cfc998164a
--- /dev/null
+++ b/src/leveldb/db/db_impl.h
@@ -0,0 +1,211 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_DB_IMPL_H_
+#define STORAGE_LEVELDB_DB_DB_IMPL_H_
+
+#include <deque>
+#include <set>
+#include "db/dbformat.h"
+#include "db/log_writer.h"
+#include "db/snapshot.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "port/port.h"
+#include "port/thread_annotations.h"
+
+namespace leveldb {
+
+class MemTable;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+
+// Concrete implementation of the DB interface. Besides the public DB
+// methods it exposes a few TEST_ hooks, and it keeps the memtables, log
+// writer, writer queue, snapshots, version set and compaction bookkeeping
+// as private state (see the member comments for what mutex_ protects).
+class DBImpl : public DB {
+ public:
+ DBImpl(const Options& options, const std::string& dbname);
+ virtual ~DBImpl();
+
+ // Implementations of the DB interface
+ virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
+ virtual Status Delete(const WriteOptions&, const Slice& key);
+ virtual Status Write(const WriteOptions& options, WriteBatch* updates);
+ virtual Status Get(const ReadOptions& options,
+ const Slice& key,
+ std::string* value);
+ virtual Iterator* NewIterator(const ReadOptions&);
+ virtual const Snapshot* GetSnapshot();
+ virtual void ReleaseSnapshot(const Snapshot* snapshot);
+ virtual bool GetProperty(const Slice& property, std::string* value);
+ virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
+ virtual void CompactRange(const Slice* begin, const Slice* end);
+
+ // Extra methods (for testing) that are not in the public DB interface
+
+ // Compact any files in the named level that overlap [*begin,*end]
+ void TEST_CompactRange(int level, const Slice* begin, const Slice* end);
+
+ // Force current memtable contents to be compacted.
+ Status TEST_CompactMemTable();
+
+ // Return an internal iterator over the current state of the database.
+ // The keys of this iterator are internal keys (see dbformat.h).
+ // The returned iterator should be deleted when no longer needed.
+ Iterator* TEST_NewInternalIterator();
+
+ // Return the maximum overlapping data (in bytes) at next level for any
+ // file at a level >= 1.
+ int64_t TEST_MaxNextLevelOverlappingBytes();
+
+ // Record a sample of bytes read at the specified internal key.
+ // Samples are taken approximately once every config::kReadBytesPeriod
+ // bytes.
+ void RecordReadSample(Slice key);
+
+ private:
+ friend class DB;
+ struct CompactionState;
+ struct Writer;
+
+ // Return a merging iterator over mem_, imm_ (if any) and the current
+ // version. Stores the last sequence number in *latest_snapshot and a
+ // newly incremented seed_ value in *seed.
+ Iterator* NewInternalIterator(const ReadOptions&,
+ SequenceNumber* latest_snapshot,
+ uint32_t* seed);
+
+ Status NewDB();
+
+ // Recover the descriptor from persistent storage. May do a significant
+ // amount of work to recover recently logged updates. Any changes to
+ // be made to the descriptor are added to *edit.
+ Status Recover(VersionEdit* edit) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ void MaybeIgnoreError(Status* s) const;
+
+ // Delete any unneeded files and stale in-memory entries.
+ void DeleteObsoleteFiles();
+
+ // Compact the in-memory write buffer to disk. Switches to a new
+ // log-file/memtable and writes a new descriptor iff successful.
+ // Errors are recorded in bg_error_.
+ void CompactMemTable() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ Status RecoverLogFile(uint64_t log_number,
+ VersionEdit* edit,
+ SequenceNumber* max_sequence)
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base)
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ Status MakeRoomForWrite(bool force /* compact even if there is room? */)
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ // Merge the batches of queued writers into one group; *last_writer is set
+ // to the last writer covered by the returned batch.
+ WriteBatch* BuildBatchGroup(Writer** last_writer);
+
+ void RecordBackgroundError(const Status& s);
+
+ void MaybeScheduleCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ static void BGWork(void* db);
+ void BackgroundCall();
+ void BackgroundCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ void CleanupCompaction(CompactionState* compact)
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ Status DoCompactionWork(CompactionState* compact)
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ Status OpenCompactionOutputFile(CompactionState* compact);
+ Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
+ Status InstallCompactionResults(CompactionState* compact)
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ // Constant after construction
+ Env* const env_;
+ const InternalKeyComparator internal_comparator_;
+ const InternalFilterPolicy internal_filter_policy_;
+ const Options options_; // options_.comparator == &internal_comparator_
+ bool owns_info_log_;
+ bool owns_cache_;
+ const std::string dbname_;
+
+ // table_cache_ provides its own synchronization
+ TableCache* table_cache_;
+
+ // Lock over the persistent DB state. Non-NULL iff successfully acquired.
+ FileLock* db_lock_;
+
+ // State below is protected by mutex_
+ port::Mutex mutex_;
+ port::AtomicPointer shutting_down_;
+ port::CondVar bg_cv_; // Signalled when background work finishes
+ MemTable* mem_;
+ MemTable* imm_; // Memtable being compacted
+ port::AtomicPointer has_imm_; // So bg thread can detect non-NULL imm_
+ WritableFile* logfile_;
+ uint64_t logfile_number_;
+ log::Writer* log_;
+ uint32_t seed_; // For sampling.
+
+ // Queue of writers.
+ std::deque<Writer*> writers_;
+ // Scratch batch that BuildBatchGroup() fills when it merges several
+ // writers' batches.
+ WriteBatch* tmp_batch_;
+
+ SnapshotList snapshots_;
+
+ // Set of table files to protect from deletion because they are
+ // part of ongoing compactions.
+ std::set<uint64_t> pending_outputs_;
+
+ // Has a background compaction been scheduled or is running?
+ bool bg_compaction_scheduled_;
+
+ // Information for a manual compaction
+ struct ManualCompaction {
+ int level;
+ bool done;
+ const InternalKey* begin; // NULL means beginning of key range
+ const InternalKey* end; // NULL means end of key range
+ InternalKey tmp_storage; // Used to keep track of compaction progress
+ };
+ ManualCompaction* manual_compaction_;
+
+ VersionSet* versions_;
+
+ // Have we encountered a background error in paranoid mode?
+ Status bg_error_;
+
+ // Per level compaction stats. stats_[level] stores the stats for
+ // compactions that produced data for the specified "level".
+ struct CompactionStats {
+ int64_t micros;
+ int64_t bytes_read;
+ int64_t bytes_written;
+
+ CompactionStats() : micros(0), bytes_read(0), bytes_written(0) { }
+
+ // Accumulate the counters of "c" into this object.
+ void Add(const CompactionStats& c) {
+ this->micros += c.micros;
+ this->bytes_read += c.bytes_read;
+ this->bytes_written += c.bytes_written;
+ }
+ };
+ CompactionStats stats_[config::kNumLevels];
+
+ // No copying allowed
+ DBImpl(const DBImpl&);
+ void operator=(const DBImpl&);
+
+ // Shorthand for the user comparator wrapped by internal_comparator_.
+ const Comparator* user_comparator() const {
+ return internal_comparator_.user_comparator();
+ }
+};
+
+// Returns a sanitized copy of the db options in "src".  If the returned
+// options' info_log differs from src.info_log, the caller is responsible
+// for deleting result.info_log.
+extern Options SanitizeOptions(
+    const std::string& db,
+    const InternalKeyComparator* icmp,
+    const InternalFilterPolicy* ipolicy,
+    const Options& src);
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_DB_IMPL_H_
diff --git a/src/leveldb/db/db_iter.cc b/src/leveldb/db/db_iter.cc
new file mode 100644
index 0000000000..3b2035e9e3
--- /dev/null
+++ b/src/leveldb/db/db_iter.cc
@@ -0,0 +1,317 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_iter.h"
+
+#include "db/filename.h"
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "leveldb/env.h"
+#include "leveldb/iterator.h"
+#include "port/port.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+
+namespace leveldb {
+
+#if 0
+// Debugging aid, compiled out: walks "iter" from its first entry and prints
+// each internal key to stderr, flagging the ones that fail to parse.
+static void DumpInternalIter(Iterator* iter) {
+  iter->SeekToFirst();
+  while (iter->Valid()) {
+    ParsedInternalKey parsed;
+    if (ParseInternalKey(iter->key(), &parsed)) {
+      fprintf(stderr, "@ '%s'\n", parsed.DebugString().c_str());
+    } else {
+      fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str());
+    }
+    iter->Next();
+  }
+}
+#endif
+
+namespace {
+
+// Memtables and sstables that make the DB representation contain
+// (userkey,seq,type) => uservalue entries. DBIter
+// combines multiple entries for the same userkey found in the DB
+// representation into a single entry while accounting for sequence
+// numbers, deletion markers, overwrites, etc.
+class DBIter: public Iterator {
+ public:
+ // Which direction is the iterator currently moving?
+ // (1) When moving forward, the internal iterator is positioned at
+ // the exact entry that yields this->key(), this->value()
+ // (2) When moving backwards, the internal iterator is positioned
+ // just before all entries whose user key == this->key().
+ enum Direction {
+ kForward,
+ kReverse
+ };
+
+ // "iter" is the internal iterator to wrap; it is deleted by ~DBIter().
+ // Entries with a sequence number greater than "s" are not yielded.
+ // "seed" seeds the random generator behind RandomPeriod().
+ DBIter(DBImpl* db, const Comparator* cmp, Iterator* iter, SequenceNumber s,
+ uint32_t seed)
+ : db_(db),
+ user_comparator_(cmp),
+ iter_(iter),
+ sequence_(s),
+ direction_(kForward),
+ valid_(false),
+ rnd_(seed),
+ bytes_counter_(RandomPeriod()) {
+ }
+ virtual ~DBIter() {
+ delete iter_;
+ }
+ virtual bool Valid() const { return valid_; }
+ // Forward: the user-key portion of the internal iterator's key.
+ // Reverse: the copy held in saved_key_.
+ virtual Slice key() const {
+ assert(valid_);
+ return (direction_ == kForward) ? ExtractUserKey(iter_->key()) : saved_key_;
+ }
+ // Forward: the internal iterator's value. Reverse: saved_value_.
+ virtual Slice value() const {
+ assert(valid_);
+ return (direction_ == kForward) ? iter_->value() : saved_value_;
+ }
+ // An error recorded by this iterator takes precedence over the internal
+ // iterator's status.
+ virtual Status status() const {
+ if (status_.ok()) {
+ return iter_->status();
+ } else {
+ return status_;
+ }
+ }
+
+ virtual void Next();
+ virtual void Prev();
+ virtual void Seek(const Slice& target);
+ virtual void SeekToFirst();
+ virtual void SeekToLast();
+
+ private:
+ void FindNextUserEntry(bool skipping, std::string* skip);
+ void FindPrevUserEntry();
+ bool ParseKey(ParsedInternalKey* key);
+
+ // Overwrite *dst with the bytes of "k".
+ inline void SaveKey(const Slice& k, std::string* dst) {
+ dst->assign(k.data(), k.size());
+ }
+
+ // Empty saved_value_. If its capacity exceeds 1MB (1048576 bytes) it is
+ // swapped with a fresh string instead of being cleared in place.
+ inline void ClearSavedValue() {
+ if (saved_value_.capacity() > 1048576) {
+ std::string empty;
+ swap(empty, saved_value_);
+ } else {
+ saved_value_.clear();
+ }
+ }
+
+ // Pick next gap with average value of config::kReadBytesPeriod.
+ ssize_t RandomPeriod() {
+ return rnd_.Uniform(2*config::kReadBytesPeriod);
+ }
+
+ DBImpl* db_; // Receives RecordReadSample() calls from ParseKey()
+ const Comparator* const user_comparator_;
+ Iterator* const iter_; // Wrapped internal iterator; deleted in ~DBIter()
+ SequenceNumber const sequence_; // Largest sequence number that is yielded
+
+ Status status_;
+ std::string saved_key_; // == current key when direction_==kReverse
+ std::string saved_value_; // == current raw value when direction_==kReverse
+ Direction direction_;
+ bool valid_;
+
+ // rnd_ is declared before bytes_counter_ because the constructor
+ // initializes bytes_counter_ from RandomPeriod(), which uses rnd_.
+ Random rnd_;
+ ssize_t bytes_counter_; // Bytes left before the next read sample
+
+ // No copying allowed
+ DBIter(const DBIter&);
+ void operator=(const DBIter&);
+};
+
+// Parses the internal iterator's current key into *ikey.  Before parsing,
+// charges the entry's key+value size against bytes_counter_ and reports a
+// read sample to the DB each time the counter goes negative.  On a parse
+// failure records a Corruption status and returns false.
+inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
+  Slice raw_key = iter_->key();
+  const ssize_t entry_bytes = raw_key.size() + iter_->value().size();
+  bytes_counter_ -= entry_bytes;
+  while (bytes_counter_ < 0) {
+    bytes_counter_ += RandomPeriod();
+    db_->RecordReadSample(raw_key);
+  }
+
+  if (ParseInternalKey(raw_key, ikey)) {
+    return true;
+  }
+  status_ = Status::Corruption("corrupted internal key in DBIter");
+  return false;
+}
+
+// Moves to the next user key.  When the iterator was last moving in
+// reverse it first steps iter_ back into the entries for the current key;
+// in both cases saved_key_ names the key that FindNextUserEntry() skips.
+void DBIter::Next() {
+  assert(valid_);
+
+  if (direction_ != kReverse) {
+    // Store in saved_key_ the current key so we skip it below.
+    SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
+  } else {
+    // Switching directions.
+    direction_ = kForward;
+    // iter_ is pointing just before the entries for this->key(),
+    // so advance into the range of entries for this->key() and then
+    // use the normal skipping code below.
+    if (iter_->Valid()) {
+      iter_->Next();
+    } else {
+      iter_->SeekToFirst();
+    }
+    if (!iter_->Valid()) {
+      valid_ = false;
+      saved_key_.clear();
+      return;
+    }
+    // saved_key_ already contains the key to skip past.
+  }
+
+  FindNextUserEntry(true, &saved_key_);
+}
+
+// Advances iter_ forward until it rests on a kTypeValue entry that is
+// visible at sequence_ and not hidden; then sets valid_ = true, clears
+// saved_key_ and returns. If iter_ is exhausted first, clears saved_key_
+// and sets valid_ = false.
+//
+// While "skipping" is true, value entries whose user key compares <= *skip
+// are treated as hidden. A visible deletion marker stores its user key in
+// *skip and turns skipping on. Entries that fail ParseKey() or whose
+// sequence number exceeds sequence_ are stepped over.
+//
+// REQUIRES: iter_ is valid and direction_ == kForward (asserted below).
+void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
+ // Loop until we hit an acceptable entry to yield
+ assert(iter_->Valid());
+ assert(direction_ == kForward);
+ do {
+ ParsedInternalKey ikey;
+ if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
+ switch (ikey.type) {
+ case kTypeDeletion:
+ // Arrange to skip all upcoming entries for this key since
+ // they are hidden by this deletion.
+ SaveKey(ikey.user_key, skip);
+ skipping = true;
+ break;
+ case kTypeValue:
+ if (skipping &&
+ user_comparator_->Compare(ikey.user_key, *skip) <= 0) {
+ // Entry hidden
+ } else {
+ // Found the entry to yield; iter_ stays positioned on it.
+ valid_ = true;
+ saved_key_.clear();
+ return;
+ }
+ break;
+ }
+ }
+ iter_->Next();
+ } while (iter_->Valid());
+ saved_key_.clear();
+ valid_ = false;
+}
+
+// Moves to the previous user key.  When the iterator was last moving
+// forward it first walks iter_ backwards past every entry whose user key
+// is >= the current one, then hands over to FindPrevUserEntry().
+void DBIter::Prev() {
+  assert(valid_);
+
+  if (direction_ == kForward) {  // Switch directions?
+    // iter_ is pointing at the current entry.  Scan backwards until
+    // the key changes so we can use the normal reverse scanning code.
+    assert(iter_->Valid());  // Otherwise valid_ would have been false
+    SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
+    do {
+      iter_->Prev();
+      if (!iter_->Valid()) {
+        // Ran off the front: there is no earlier key.
+        valid_ = false;
+        saved_key_.clear();
+        ClearSavedValue();
+        return;
+      }
+    } while (user_comparator_->Compare(ExtractUserKey(iter_->key()),
+                                       saved_key_) >= 0);
+    direction_ = kReverse;
+  }
+
+  FindPrevUserEntry();
+}
+
+// Scans iter_ backwards to find the entry to yield when moving in reverse.
+// Each entry that parses and is visible at sequence_ overwrites the saved
+// state: a deletion clears saved_key_/saved_value_, a value is copied into
+// them. The scan stops once a non-deletion is saved and a visible entry
+// with a smaller user key is reached, or when iter_ is exhausted.
+//
+// On exit, if the last saved entry was a deletion (or none was seen), the
+// iterator becomes invalid and direction_ is reset to kForward; otherwise
+// valid_ is true and saved_key_/saved_value_ hold the result.
+//
+// REQUIRES: direction_ == kReverse (asserted below).
+void DBIter::FindPrevUserEntry() {
+ assert(direction_ == kReverse);
+
+ // Starting from kTypeDeletion means "nothing to yield yet".
+ ValueType value_type = kTypeDeletion;
+ if (iter_->Valid()) {
+ do {
+ ParsedInternalKey ikey;
+ if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
+ if ((value_type != kTypeDeletion) &&
+ user_comparator_->Compare(ikey.user_key, saved_key_) < 0) {
+ // We encountered a non-deleted value in entries for previous keys,
+ break;
+ }
+ value_type = ikey.type;
+ if (value_type == kTypeDeletion) {
+ saved_key_.clear();
+ ClearSavedValue();
+ } else {
+ Slice raw_value = iter_->value();
+ // Drop the old buffer first if it is more than 1MB larger than
+ // the value about to be stored.
+ if (saved_value_.capacity() > raw_value.size() + 1048576) {
+ std::string empty;
+ swap(empty, saved_value_);
+ }
+ SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
+ saved_value_.assign(raw_value.data(), raw_value.size());
+ }
+ }
+ iter_->Prev();
+ } while (iter_->Valid());
+ }
+
+ if (value_type == kTypeDeletion) {
+ // End
+ valid_ = false;
+ saved_key_.clear();
+ ClearSavedValue();
+ direction_ = kForward;
+ } else {
+ valid_ = true;
+ }
+}
+
+// Positions the iterator at the first yieldable entry at or after "target".
+// saved_key_ is used as scratch space to build the internal key
+// (target, sequence_, kValueTypeForSeek) handed to iter_->Seek().
+void DBIter::Seek(const Slice& target) {
+  direction_ = kForward;
+  ClearSavedValue();
+  saved_key_.clear();
+  AppendInternalKey(
+      &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek));
+  iter_->Seek(saved_key_);
+  if (!iter_->Valid()) {
+    valid_ = false;
+    return;
+  }
+  FindNextUserEntry(false, &saved_key_ /* temporary storage */);
+}
+
+// Positions the iterator at the first yieldable entry, or marks it invalid
+// when the internal iterator has no entries.
+void DBIter::SeekToFirst() {
+  direction_ = kForward;
+  ClearSavedValue();
+  iter_->SeekToFirst();
+  if (!iter_->Valid()) {
+    valid_ = false;
+    return;
+  }
+  FindNextUserEntry(false, &saved_key_ /* temporary storage */);
+}
+
+// Positions the iterator at the last yieldable entry by moving iter_ to its
+// final entry and running the reverse scan.
+void DBIter::SeekToLast() {
+  ClearSavedValue();
+  direction_ = kReverse;
+  iter_->SeekToLast();
+  FindPrevUserEntry();
+}
+
+} // anonymous namespace
+
+// Factory for DBIter, which is private to this file.  The arguments are
+// passed straight through to the DBIter constructor.
+Iterator* NewDBIterator(
+    DBImpl* db,
+    const Comparator* user_key_comparator,
+    Iterator* internal_iter,
+    SequenceNumber sequence,
+    uint32_t seed) {
+  Iterator* const db_iter =
+      new DBIter(db, user_key_comparator, internal_iter, sequence, seed);
+  return db_iter;
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/db_iter.h b/src/leveldb/db/db_iter.h
new file mode 100644
index 0000000000..04927e937b
--- /dev/null
+++ b/src/leveldb/db/db_iter.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_DB_ITER_H_
+#define STORAGE_LEVELDB_DB_DB_ITER_H_
+
+#include <stdint.h>
+#include "leveldb/db.h"
+#include "db/dbformat.h"
+
+namespace leveldb {
+
+class DBImpl;
+
+// Returns a new iterator that turns the internal keys yielded by
+// "*internal_iter" into user keys, exposing only those that were live at
+// the given "sequence" number.
+extern Iterator* NewDBIterator(DBImpl* db,
+                               const Comparator* user_key_comparator,
+                               Iterator* internal_iter,
+                               SequenceNumber sequence,
+                               uint32_t seed);
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_DB_ITER_H_
diff --git a/src/leveldb/db/db_test.cc b/src/leveldb/db/db_test.cc
new file mode 100644
index 0000000000..280b01c14b
--- /dev/null
+++ b/src/leveldb/db/db_test.cc
@@ -0,0 +1,2128 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/db.h"
+#include "leveldb/filter_policy.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/cache.h"
+#include "leveldb/env.h"
+#include "leveldb/table.h"
+#include "util/hash.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+// Value-returning convenience wrapper around test::RandomString(), which
+// writes its result through an output parameter.
+static std::string RandomString(Random* rnd, int len) {
+  std::string generated;
+  test::RandomString(rnd, len, &generated);
+  return generated;
+}
+
+namespace {
+// Integer counter whose every access is made while holding a MutexLock
+// on an internal port::Mutex.
+class AtomicCounter {
+ public:
+  AtomicCounter() : count_(0) { }
+
+  // Adds one to the counter.
+  void Increment() {
+    MutexLock guard(&mu_);
+    count_ += 1;
+  }
+
+  // Adds "count" to the counter.
+  void IncrementBy(int count) {
+    MutexLock guard(&mu_);
+    count_ += count;
+  }
+
+  // Returns the current value.
+  int Read() {
+    MutexLock guard(&mu_);
+    const int current = count_;
+    return current;
+  }
+
+  // Sets the counter back to zero.
+  void Reset() {
+    MutexLock guard(&mu_);
+    count_ = 0;
+  }
+
+ private:
+  port::Mutex mu_;
+  int count_;
+};
+
+// Asks the default Env to sleep for "millis" milliseconds (converted to
+// microseconds for SleepForMicroseconds()).
+void DelayMilliseconds(int millis) {
+  const int micros = millis * 1000;
+  Env* const default_env = Env::Default();
+  default_env->SleepForMicroseconds(micros);
+}
+}
+
+// Special Env used to delay background operations and to inject simulated I/O failures; each fault is switched by a flag member below.
+class SpecialEnv : public EnvWrapper {
+ public:
+ // sstable/log Sync() calls are blocked while this pointer is non-NULL (Sync() re-checks it every 100 ms).
+ port::AtomicPointer delay_data_sync_;
+
+ // sstable/log Sync() calls return an error while this pointer is non-NULL.
+ port::AtomicPointer data_sync_error_;
+
+ // Simulate no-space errors while this pointer is non-NULL: sstable/log Append() drops the data but still returns OK.
+ port::AtomicPointer no_space_;
+
+ // Simulate non-writable file system while this pointer is non-NULL (NewWritableFile() fails for every file)
+ port::AtomicPointer non_writable_;
+
+ // Force sync of manifest files to fail while this pointer is non-NULL
+ port::AtomicPointer manifest_sync_error_;
+
+ // Force write to manifest files to fail while this pointer is non-NULL
+ port::AtomicPointer manifest_write_error_;
+
+ bool count_random_reads_;  // Checked when a random-access file is opened: if true the file is wrapped so its Read() calls are counted.
+ AtomicCounter random_read_counter_;  // Incremented once per Read() on a wrapped random-access file.
+
+ explicit SpecialEnv(Env* base) : EnvWrapper(base) {  // Starts with every fault switched off and read counting disabled.
+ delay_data_sync_.Release_Store(NULL);
+ data_sync_error_.Release_Store(NULL);
+ no_space_.Release_Store(NULL);
+ non_writable_.Release_Store(NULL);
+ count_random_reads_ = false;
+ manifest_sync_error_.Release_Store(NULL);
+ manifest_write_error_.Release_Store(NULL);
+ }
+
+ Status NewWritableFile(const std::string& f, WritableFile** r) {  // Opens f via target() and wraps the result according to the file name.
+ class DataFile : public WritableFile {  // Wrapper for ".ldb"/".log" files; honours no_space_, data_sync_error_ and delay_data_sync_.
+ private:
+ SpecialEnv* env_;
+ WritableFile* base_;  // Deleted in the destructor.
+
+ public:
+ DataFile(SpecialEnv* env, WritableFile* base)
+ : env_(env),
+ base_(base) {
+ }
+ ~DataFile() { delete base_; }
+ Status Append(const Slice& data) {
+ if (env_->no_space_.Acquire_Load() != NULL) {
+ // Drop writes on the floor
+ return Status::OK();
+ } else {
+ return base_->Append(data);
+ }
+ }
+ Status Close() { return base_->Close(); }
+ Status Flush() { return base_->Flush(); }
+ Status Sync() {
+ if (env_->data_sync_error_.Acquire_Load() != NULL) {  // The error flag is checked before the delay flag.
+ return Status::IOError("simulated data sync error");
+ }
+ while (env_->delay_data_sync_.Acquire_Load() != NULL) {  // Poll until the test clears the flag.
+ DelayMilliseconds(100);
+ }
+ return base_->Sync();
+ }
+ };
+ class ManifestFile : public WritableFile {  // Wrapper for "MANIFEST" files; honours manifest_write_error_ and manifest_sync_error_.
+ private:
+ SpecialEnv* env_;
+ WritableFile* base_;  // Deleted in the destructor.
+ public:
+ ManifestFile(SpecialEnv* env, WritableFile* b) : env_(env), base_(b) { }
+ ~ManifestFile() { delete base_; }
+ Status Append(const Slice& data) {
+ if (env_->manifest_write_error_.Acquire_Load() != NULL) {
+ return Status::IOError("simulated writer error");
+ } else {
+ return base_->Append(data);
+ }
+ }
+ Status Close() { return base_->Close(); }
+ Status Flush() { return base_->Flush(); }
+ Status Sync() {
+ if (env_->manifest_sync_error_.Acquire_Load() != NULL) {
+ return Status::IOError("simulated sync error");
+ } else {
+ return base_->Sync();
+ }
+ }
+ };
+
+ if (non_writable_.Acquire_Load() != NULL) {  // Fail before touching the underlying Env.
+ return Status::IOError("simulated write error");
+ }
+
+ Status s = target()->NewWritableFile(f, r);
+ if (s.ok()) {
+ if (strstr(f.c_str(), ".ldb") != NULL ||  // Substring match anywhere in the path, not only the extension.
+ strstr(f.c_str(), ".log") != NULL) {
+ *r = new DataFile(this, *r);
+ } else if (strstr(f.c_str(), "MANIFEST") != NULL) {
+ *r = new ManifestFile(this, *r);
+ }  // Any other file name is returned unwrapped.
+ }
+ return s;
+ }
+
+ Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {  // Opens f via target(); wraps it in a read counter when count_random_reads_ is set.
+ class CountingFile : public RandomAccessFile {  // Forwards Read() to target_ after bumping the shared counter.
+ private:
+ RandomAccessFile* target_;  // Deleted in the destructor.
+ AtomicCounter* counter_;  // Not deleted here; points at SpecialEnv::random_read_counter_.
+ public:
+ CountingFile(RandomAccessFile* target, AtomicCounter* counter)
+ : target_(target), counter_(counter) {
+ }
+ virtual ~CountingFile() { delete target_; }
+ virtual Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const {
+ counter_->Increment();  // Counted whether or not the underlying read succeeds.
+ return target_->Read(offset, n, result, scratch);
+ }
+ };
+
+ Status s = target()->NewRandomAccessFile(f, r);
+ if (s.ok() && count_random_reads_) {
+ *r = new CountingFile(*r, &random_read_counter_);
+ }
+ return s;
+ }
+};
+
+class DBTest {  // Test fixture: owns a SpecialEnv and a DB at test::TmpDir() + "/db_test", plus helpers used by the TEST bodies.
+ private:
+ const FilterPolicy* filter_policy_;  // Created in the constructor; installed into Options only for the kFilter configuration.
+
+ // Sequence of option configurations to try
+ enum OptionConfig {
+ kDefault,
+ kFilter,
+ kUncompressed,
+ kEnd
+ };
+ int option_config_;  // Current OptionConfig value; advanced by ChangeOptions().
+
+ public:
+ std::string dbname_;
+ SpecialEnv* env_;  // Used by the DB only when a test sets options.env = env_; CurrentOptions() does not set it.
+ DB* db_;
+
+ Options last_options_;  // Options passed to DB::Open() by the most recent TryReopen().
+
+ DBTest() : option_config_(kDefault),
+ env_(new SpecialEnv(Env::Default())) {
+ filter_policy_ = NewBloomFilterPolicy(10);
+ dbname_ = test::TmpDir() + "/db_test";
+ DestroyDB(dbname_, Options());  // Remove whatever an earlier run left at dbname_.
+ db_ = NULL;
+ Reopen();
+ }
+
+ ~DBTest() {
+ delete db_;  // Close the DB before destroying its files and the objects it may reference.
+ DestroyDB(dbname_, Options());
+ delete env_;
+ delete filter_policy_;
+ }
+
+ // Switch to a fresh database with the next option configuration to
+ // test. Return false if there are no more configurations to test.
+ bool ChangeOptions() {
+ option_config_++;
+ if (option_config_ >= kEnd) {
+ return false;
+ } else {
+ DestroyAndReopen();
+ return true;
+ }
+ }
+
+ // Return the current option configuration.
+ Options CurrentOptions() {
+ Options options;
+ switch (option_config_) {
+ case kFilter:
+ options.filter_policy = filter_policy_;
+ break;
+ case kUncompressed:
+ options.compression = kNoCompression;
+ break;
+ default:
+ break;
+ }
+ return options;
+ }
+
+ DBImpl* dbfull() {  // db_ viewed as the concrete DBImpl, for the TEST_* hooks.
+ return reinterpret_cast<DBImpl*>(db_);
+ }
+
+ void Reopen(Options* options = NULL) {  // Close and reopen the DB; asserts that the open succeeds.
+ ASSERT_OK(TryReopen(options));
+ }
+
+ void Close() {
+ delete db_;
+ db_ = NULL;
+ }
+
+ void DestroyAndReopen(Options* options = NULL) {  // Like Reopen(), but wipes the on-disk database first.
+ delete db_;
+ db_ = NULL;
+ DestroyDB(dbname_, Options());
+ ASSERT_OK(TryReopen(options));
+ }
+
+ Status TryReopen(Options* options) {  // Close the DB and open it again, returning the status of DB::Open().
+ delete db_;
+ db_ = NULL;
+ Options opts;
+ if (options != NULL) {
+ opts = *options;  // Caller-supplied options are used verbatim; create_if_missing is not forced on.
+ } else {
+ opts = CurrentOptions();
+ opts.create_if_missing = true;
+ }
+ last_options_ = opts;
+
+ return DB::Open(opts, dbname_, &db_);
+ }
+
+ Status Put(const std::string& k, const std::string& v) {
+ return db_->Put(WriteOptions(), k, v);
+ }
+
+ Status Delete(const std::string& k) {
+ return db_->Delete(WriteOptions(), k);
+ }
+
+ std::string Get(const std::string& k, const Snapshot* snapshot = NULL) {  // Returns the value, "NOT_FOUND", or the error status rendered as a string.
+ ReadOptions options;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ // Return a string that contains all key,value pairs in order,
+ // formatted like "(k1->v1)(k2->v2)".
+ std::string Contents() {
+ std::vector<std::string> forward;
+ std::string result;
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string s = IterStatus(iter);
+ result.push_back('(');
+ result.append(s);
+ result.push_back(')');
+ forward.push_back(s);
+ }
+
+ // Check reverse iteration results are the reverse of forward results
+ size_t matched = 0;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ ASSERT_LT(matched, forward.size());
+ ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
+ matched++;
+ }
+ ASSERT_EQ(matched, forward.size());
+
+ delete iter;
+ return result;
+ }
+
+ std::string AllEntriesFor(const Slice& user_key) {  // Lists every internal entry for user_key in iteration order, e.g. "[ v2, DEL, v1 ]".
+ Iterator* iter = dbfull()->TEST_NewInternalIterator();
+ InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
+ iter->Seek(target.Encode());
+ std::string result;
+ if (!iter->status().ok()) {
+ result = iter->status().ToString();
+ } else {
+ result = "[ ";
+ bool first = true;
+ while (iter->Valid()) {
+ ParsedInternalKey ikey;
+ if (!ParseInternalKey(iter->key(), &ikey)) {
+ result += "CORRUPTED";  // Appended without a separator; the scan continues with the next entry.
+ } else {
+ if (last_options_.comparator->Compare(ikey.user_key, user_key) != 0) {
+ break;  // Reached an entry for a different user key.
+ }
+ if (!first) {
+ result += ", ";
+ }
+ first = false;
+ switch (ikey.type) {
+ case kTypeValue:
+ result += iter->value().ToString();
+ break;
+ case kTypeDeletion:
+ result += "DEL";
+ break;
+ }
+ }
+ iter->Next();
+ }
+ if (!first) {
+ result += " ";
+ }
+ result += "]";
+ }
+ delete iter;
+ return result;
+ }
+
+ int NumTableFilesAtLevel(int level) {  // Reads the "leveldb.num-files-at-level<N>" property and parses it as an int.
+ std::string property;
+ ASSERT_TRUE(
+ db_->GetProperty("leveldb.num-files-at-level" + NumberToString(level),
+ &property));
+ return atoi(property.c_str());
+ }
+
+ int TotalTableFiles() {  // Sum of NumTableFilesAtLevel() over all config::kNumLevels levels.
+ int result = 0;
+ for (int level = 0; level < config::kNumLevels; level++) {
+ result += NumTableFilesAtLevel(level);
+ }
+ return result;
+ }
+
+ // Return spread of files per level
+ std::string FilesPerLevel() {  // Comma-separated counts from level 0 up, cut off after the last non-zero count.
+ std::string result;
+ int last_non_zero_offset = 0;
+ for (int level = 0; level < config::kNumLevels; level++) {
+ int f = NumTableFilesAtLevel(level);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = result.size();
+ }
+ }
+ result.resize(last_non_zero_offset);  // Drops trailing ",0" entries; yields "" when every level is empty.
+ return result;
+ }
+
+ int CountFiles() {  // Number of directory entries GetChildren() reports for dbname_; its Status is not checked.
+ std::vector<std::string> files;
+ env_->GetChildren(dbname_, &files);
+ return static_cast<int>(files.size());
+ }
+
+ uint64_t Size(const Slice& start, const Slice& limit) {  // Approximate size reported by the DB for the range (start, limit).
+ Range r(start, limit);
+ uint64_t size;
+ db_->GetApproximateSizes(&r, 1, &size);
+ return size;
+ }
+
+ void Compact(const Slice& start, const Slice& limit) {
+ db_->CompactRange(&start, &limit);
+ }
+
+ // Do n memtable compactions, each of which produces an sstable
+ // covering the range [small,large].
+ void MakeTables(int n, const std::string& small, const std::string& large) {
+ for (int i = 0; i < n; i++) {
+ Put(small, "begin");
+ Put(large, "end");
+ dbfull()->TEST_CompactMemTable();
+ }
+ }
+
+ // Prevent pushing of new sstables into deeper levels by adding
+ // tables that cover a specified range to all levels.
+ void FillLevels(const std::string& smallest, const std::string& largest) {
+ MakeTables(config::kNumLevels, smallest, largest);
+ }
+
+ void DumpFileCounts(const char* label) {  // Debug aid: prints the max next-level overlap and per-level file counts to stderr.
+ fprintf(stderr, "---\n%s:\n", label);
+ fprintf(stderr, "maxoverlap: %lld\n",
+ static_cast<long long>(
+ dbfull()->TEST_MaxNextLevelOverlappingBytes()));
+ for (int level = 0; level < config::kNumLevels; level++) {
+ int num = NumTableFilesAtLevel(level);
+ if (num > 0) {
+ fprintf(stderr, " level %3d : %d files\n", level, num);
+ }
+ }
+ }
+
+ std::string DumpSSTableList() {  // Returns the "leveldb.sstables" property; the bool result of GetProperty() is ignored.
+ std::string property;
+ db_->GetProperty("leveldb.sstables", &property);
+ return property;
+ }
+
+ std::string IterStatus(Iterator* iter) {  // "key->value" for a valid iterator, "(invalid)" otherwise.
+ std::string result;
+ if (iter->Valid()) {
+ result = iter->key().ToString() + "->" + iter->value().ToString();
+ } else {
+ result = "(invalid)";
+ }
+ return result;
+ }
+
+ bool DeleteAnSSTFile() {  // Deletes the first table file found in dbname_; returns false if there is none.
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+ uint64_t number;
+ FileType type;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) {
+ ASSERT_OK(env_->DeleteFile(TableFileName(dbname_, number)));
+ return true;
+ }
+ }
+ return false;
+ }
+
+ // Returns number of files renamed.
+ int RenameLDBToSST() {  // Renames each table file from its TableFileName() to its SSTTableFileName().
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+ uint64_t number;
+ FileType type;
+ int files_renamed = 0;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) {
+ const std::string from = TableFileName(dbname_, number);
+ const std::string to = SSTTableFileName(dbname_, number);
+ ASSERT_OK(env_->RenameFile(from, to));
+ files_renamed++;
+ }
+ }
+ return files_renamed;
+ }
+};
+
+TEST(DBTest, Empty) {  // A freshly opened DB handle is non-NULL and has no entry for "foo".
+ do {
+ ASSERT_TRUE(db_ != NULL);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ } while (ChangeOptions());  // Repeat for each option configuration.
+}
+
+TEST(DBTest, ReadWrite) {  // Get() returns the most recent Put() for a key, and writes to other keys do not disturb it.
+ do {
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_EQ("v1", Get("foo"));
+ ASSERT_OK(Put("bar", "v2"));
+ ASSERT_OK(Put("foo", "v3"));  // Overwrite "foo".
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ } while (ChangeOptions());  // Repeat for each option configuration.
+}
+
+TEST(DBTest, PutDeleteGet) {  // Put, overwrite, then Delete: the key reads back as v1, v2 and finally NOT_FOUND.
+ do {
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
+ ASSERT_EQ("v1", Get("foo"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
+ ASSERT_EQ("v2", Get("foo"));
+ ASSERT_OK(db_->Delete(WriteOptions(), "foo"));
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ } while (ChangeOptions());  // Repeat for each option configuration.
+}
+
+TEST(DBTest, GetFromImmutableLayer) {  // "foo" stays readable while data-file Sync() calls are held up by the SpecialEnv.
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;  // Route file I/O through the SpecialEnv so Sync() can be blocked.
+ options.write_buffer_size = 100000; // Small write buffer
+ Reopen(&options);
+
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_EQ("v1", Get("foo"));
+
+ env_->delay_data_sync_.Release_Store(env_); // Block sync calls (any non-NULL pointer works as the flag)
+ Put("k1", std::string(100000, 'x')); // Fill memtable
+ Put("k2", std::string(100000, 'y')); // Trigger compaction
+ ASSERT_EQ("v1", Get("foo"));
+ env_->delay_data_sync_.Release_Store(NULL); // Release sync calls
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, GetFromVersions) {  // A value is still readable after TEST_CompactMemTable() has compacted the memtable.
+ do {
+ ASSERT_OK(Put("foo", "v1"));
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ("v1", Get("foo"));
+ } while (ChangeOptions());  // Repeat for each option configuration.
+}
+
+TEST(DBTest, GetSnapshot) {  // A snapshot keeps returning the value it saw, before and after a memtable compaction.
+ do {
+ // Try with both a short key and a long key
+ for (int i = 0; i < 2; i++) {
+ std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x');
+ ASSERT_OK(Put(key, "v1"));
+ const Snapshot* s1 = db_->GetSnapshot();  // Taken while the key still maps to "v1".
+ ASSERT_OK(Put(key, "v2"));
+ ASSERT_EQ("v2", Get(key));
+ ASSERT_EQ("v1", Get(key, s1));
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ("v2", Get(key));
+ ASSERT_EQ("v1", Get(key, s1));
+ db_->ReleaseSnapshot(s1);
+ }
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, GetLevel0Ordering) {  // With the same key in two successive memtable compactions, Get() returns the newer value.
+ do {
+ // Check that we process level-0 files in correct order. The code
+ // below generates two level-0 files where the earlier one comes
+ // before the later one in the level-0 file list since the earlier
+ // one has a smaller "smallest" key.
+ ASSERT_OK(Put("bar", "b"));
+ ASSERT_OK(Put("foo", "v1"));
+ dbfull()->TEST_CompactMemTable();  // First file: holds "bar" and "foo"=v1.
+ ASSERT_OK(Put("foo", "v2"));
+ dbfull()->TEST_CompactMemTable();  // Second file: holds only "foo"=v2.
+ ASSERT_EQ("v2", Get("foo"));
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, GetOrderedByLevels) {  // A newer value written after a range compaction wins over the compacted older one.
+ do {
+ ASSERT_OK(Put("foo", "v1"));
+ Compact("a", "z");  // Range covers "foo".
+ ASSERT_EQ("v1", Get("foo"));
+ ASSERT_OK(Put("foo", "v2"));
+ ASSERT_EQ("v2", Get("foo"));
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ("v2", Get("foo"));  // Still the newer value once it has left the memtable.
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, GetPicksCorrectFile) {  // Three keys written and compacted separately are each still found by Get().
+ do {
+ // Arrange to have multiple files in a non-level-0 level.
+ ASSERT_OK(Put("a", "va"));
+ Compact("a", "b");
+ ASSERT_OK(Put("x", "vx"));
+ Compact("x", "y");
+ ASSERT_OK(Put("f", "vf"));  // Written last although it sorts between "a" and "x".
+ Compact("f", "g");
+ ASSERT_EQ("va", Get("a"));
+ ASSERT_EQ("vf", Get("f"));
+ ASSERT_EQ("vx", Get("x"));
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, GetEncountersEmptyLevel) {  // Repeated Get() misses across an empty level 1 must end with level 0 emptied.
+ do {
+ // Arrange for the following to happen:
+ // * sstable A in level 0
+ // * nothing in level 1
+ // * sstable B in level 2
+ // Then do enough Get() calls to arrange for an automatic compaction
+ // of sstable A. A bug would cause the compaction to be marked as
+ // occurring at level 1 (instead of the correct level 0).
+
+ // Step 1: First place sstables in levels 0 and 2
+ int compaction_count = 0;
+ while (NumTableFilesAtLevel(0) == 0 ||
+ NumTableFilesAtLevel(2) == 0) {
+ ASSERT_LE(compaction_count, 100) << "could not fill levels 0 and 2";  // Bail out rather than loop forever.
+ compaction_count++;
+ Put("a", "begin");
+ Put("z", "end");
+ dbfull()->TEST_CompactMemTable();
+ }
+
+ // Step 2: clear level 1 if necessary.
+ dbfull()->TEST_CompactRange(1, NULL, NULL);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 1);
+
+ // Step 3: read a bunch of times
+ for (int i = 0; i < 1000; i++) {
+ ASSERT_EQ("NOT_FOUND", Get("missing"));
+ }
+
+ // Step 4: Wait for compaction to finish
+ DelayMilliseconds(1000);  // Fixed one-second wait; there is no explicit synchronisation with the compaction.
+
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, IterEmpty) {  // On an empty DB every positioning call leaves the iterator invalid.
+ Iterator* iter = db_->NewIterator(ReadOptions());
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("foo");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+}
+
+TEST(DBTest, IterSingle) {  // With one entry "a", checks every seek variant and stepping off either end.
+ ASSERT_OK(Put("a", "va"));
+ Iterator* iter = db_->NewIterator(ReadOptions());
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("");  // The empty target sorts before "a", so the seek lands on "a".
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("a");  // Exact match.
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("b");  // Target is past the only key.
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+}
+
+TEST(DBTest, IterMulti) {  // With keys a, b, c: forward/backward walks, seeks, direction switches, and isolation from later writes.
+ ASSERT_OK(Put("a", "va"));
+ ASSERT_OK(Put("b", "vb"));
+ ASSERT_OK(Put("c", "vc"));
+ Iterator* iter = db_->NewIterator(ReadOptions());
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Seek("a");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Seek("ax");  // Falls between "a" and "b", so the seek lands on "b".
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Seek("b");
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Seek("z");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ // Switch from reverse to forward
+ iter->SeekToLast();
+ iter->Prev();
+ iter->Prev();
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+
+ // Switch from forward to reverse
+ iter->SeekToFirst();
+ iter->Next();
+ iter->Next();
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+
+ // Make sure iter stays at snapshot
+ ASSERT_OK(Put("a", "va2"));
+ ASSERT_OK(Put("a2", "va3"));
+ ASSERT_OK(Put("b", "vb2"));
+ ASSERT_OK(Put("c", "vc2"));
+ ASSERT_OK(Delete("b"));
+ iter->SeekToFirst();  // None of the writes above may show up through the existing iterator.
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+}
+
+TEST(DBTest, IterSmallAndLargeMix) {  // Forward and reverse iteration over a mix of 2-byte and 100000-byte values.
+ ASSERT_OK(Put("a", "va"));
+ ASSERT_OK(Put("b", std::string(100000, 'b')));
+ ASSERT_OK(Put("c", "vc"));
+ ASSERT_OK(Put("d", std::string(100000, 'd')));
+ ASSERT_OK(Put("e", std::string(100000, 'e')));
+
+ Iterator* iter = db_->NewIterator(ReadOptions());
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();  // Same entries, walked in reverse.
+ ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+}
+
+TEST(DBTest, IterMultiWithDelete) {  // Prev() from "c" skips the deleted key "b" and lands on "a".
+ do {
+ ASSERT_OK(Put("a", "va"));
+ ASSERT_OK(Put("b", "vb"));
+ ASSERT_OK(Put("c", "vc"));
+ ASSERT_OK(Delete("b"));
+ ASSERT_EQ("NOT_FOUND", Get("b"));
+
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ iter->Seek("c");
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ delete iter;
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, Recover) {  // Values written before a Reopen() are still readable afterwards, across two reopen cycles.
+ do {
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("baz", "v5"));
+
+ Reopen();
+ ASSERT_EQ("v1", Get("foo"));
+
+ ASSERT_EQ("v1", Get("foo"));  // Same check as the line above, repeated.
+ ASSERT_EQ("v5", Get("baz"));
+ ASSERT_OK(Put("bar", "v2"));
+ ASSERT_OK(Put("foo", "v3"));
+
+ Reopen();
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_OK(Put("foo", "v4"));
+ ASSERT_EQ("v4", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ ASSERT_EQ("v5", Get("baz"));  // Written before the first reopen and never touched since.
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, RecoveryWithEmptyLog) {  // Two back-to-back reopens with no writes in between must not lose later writes.
+ do {
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("foo", "v2"));
+ Reopen();
+ Reopen();  // Nothing was written since the previous reopen.
+ ASSERT_OK(Put("foo", "v3"));
+ Reopen();
+ ASSERT_EQ("v3", Get("foo"));
+ } while (ChangeOptions());
+}
+
+// Check that writes done during a memtable compaction are recovered
+// if the database is shutdown during the memtable compaction.
+TEST(DBTest, RecoverDuringMemtableCompaction) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 1000000;
+ Reopen(&options);
+
+ // Trigger a long memtable compaction and reopen the database during it
+ ASSERT_OK(Put("foo", "v1")); // Goes to 1st log file
+ ASSERT_OK(Put("big1", std::string(10000000, 'x'))); // Fills memtable (value is 10x write_buffer_size)
+ ASSERT_OK(Put("big2", std::string(1000, 'y'))); // Triggers compaction
+ ASSERT_OK(Put("bar", "v2")); // Goes to new log file
+
+ Reopen(&options);
+ ASSERT_EQ("v1", Get("foo"));  // All four writes must be readable after the reopen.
+ ASSERT_EQ("v2", Get("bar"));
+ ASSERT_EQ(std::string(10000000, 'x'), Get("big1"));
+ ASSERT_EQ(std::string(1000, 'y'), Get("big2"));
+ } while (ChangeOptions());
+}
+
+// Builds the test key for index i: "key" followed by i zero-padded to at
+// least six digits (e.g. 42 -> "key000042").
+static std::string Key(int i) {
+  char formatted[100];
+  snprintf(formatted, sizeof(formatted), "key%06d", i);
+  std::string key(formatted);
+  return key;
+}
+
+TEST(DBTest, MinorCompactionsHappen) {  // With a 10000-byte write buffer, 500 writes of ~1KB must create table files without losing data.
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10000;
+ Reopen(&options);
+
+ const int N = 500;
+
+ int starting_num_tables = TotalTableFiles();
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v')));
+ }
+ int ending_num_tables = TotalTableFiles();
+ ASSERT_GT(ending_num_tables, starting_num_tables);  // At least one new table file appeared.
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i)));
+ }
+
+ Reopen();  // Reopens with the default options for the current configuration, not "options".
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i)));
+ }
+}
+
+TEST(DBTest, RecoverWithLargeLog) {  // Recovering a ~400KB log with a 100000-byte write buffer must produce several level-0 files.
+ {
+ Options options = CurrentOptions();
+ Reopen(&options);
+ ASSERT_OK(Put("big1", std::string(200000, '1')));
+ ASSERT_OK(Put("big2", std::string(200000, '2')));
+ ASSERT_OK(Put("small3", std::string(10, '3')));
+ ASSERT_OK(Put("small4", std::string(10, '4')));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);  // Nothing has been written to a table file yet.
+ }
+
+ // Make sure that if we re-open with a small write buffer size that
+ // we flush table files in the middle of a large log file.
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000;
+ Reopen(&options);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+ ASSERT_EQ(std::string(200000, '1'), Get("big1"));
+ ASSERT_EQ(std::string(200000, '2'), Get("big2"));
+ ASSERT_EQ(std::string(10, '3'), Get("small3"));
+ ASSERT_EQ(std::string(10, '4'), Get("small4"));
+ ASSERT_GT(NumTableFilesAtLevel(0), 1);  // Weaker than the == 3 check above; here it confirms the Get() calls left several level-0 files.
+}
+
+TEST(DBTest, CompactionsGenerateMultipleFiles) {  // Compacting 8MB of random data out of level 0 must yield more than one level-1 file.
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ Reopen(&options);
+
+ Random rnd(301);
+
+ // Write 8MB (80 values, each 100K)
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ std::vector<std::string> values;
+ for (int i = 0; i < 80; i++) {
+ values.push_back(RandomString(&rnd, 100000));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+
+ // Reopening moves updates to level-0
+ Reopen(&options);
+ dbfull()->TEST_CompactRange(0, NULL, NULL);  // Compact the whole key range of level 0.
+
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1), 1);
+ for (int i = 0; i < 80; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+}
+
+TEST(DBTest, RepeatedWritesToSameKey) {  // Overwriting one key with large values must keep the total table-file count bounded.
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ Reopen(&options);
+
+ // We must have at most one file per level except for level-0,
+ // which may have up to kL0_StopWritesTrigger files.
+ const int kMaxFiles = config::kNumLevels + config::kL0_StopWritesTrigger;
+
+ Random rnd(301);
+ std::string value = RandomString(&rnd, 2 * options.write_buffer_size);  // Each value is twice the write buffer size.
+ for (int i = 0; i < 5 * kMaxFiles; i++) {
+ Put("key", value);
+ ASSERT_LE(TotalTableFiles(), kMaxFiles);
+ fprintf(stderr, "after %d: %d files\n", int(i+1), TotalTableFiles());  // Progress output only.
+ }
+}
+
+TEST(DBTest, SparseMerge) {  // Small updates on top of a large "B" range must not leave a file overlapping more than 20MB at the next level.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ Reopen(&options);
+
+ FillLevels("A", "Z");
+
+ // Suppose there is:
+ // small amount of data with prefix A
+ // large amount of data with prefix B
+ // small amount of data with prefix C
+ // and that recent updates have made small changes to all three prefixes.
+ // Check that we do not do a compaction that merges all of B in one shot.
+ const std::string value(1000, 'x');
+ Put("A", "va");
+ // Write approximately 100MB of "B" values
+ for (int i = 0; i < 100000; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ Put(key, value);
+ }
+ Put("C", "vc");
+ dbfull()->TEST_CompactMemTable();
+ dbfull()->TEST_CompactRange(0, NULL, NULL);
+
+ // Make sparse update
+ Put("A", "va2");
+ Put("B100", "bvalue2");
+ Put("C", "vc2");
+ dbfull()->TEST_CompactMemTable();
+
+ // Compactions should not cause us to create a situation where
+ // a file overlaps too much data at the next level.
+ ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);  // 20*1048576 bytes = 20MB.
+ dbfull()->TEST_CompactRange(0, NULL, NULL);
+ ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
+ dbfull()->TEST_CompactRange(1, NULL, NULL);
+ ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
+}
+
+// Returns true iff low <= val <= high. On failure the offending value and
+// the expected range are printed to stderr so a failed assertion is easy
+// to diagnose.
+static bool Between(uint64_t val, uint64_t low, uint64_t high) {
+  if (val >= low && val <= high) {
+    return true;
+  }
+  fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
+          (unsigned long long)(val),
+          (unsigned long long)(low),
+          (unsigned long long)(high));
+  return false;
+}
+
+TEST(DBTest, ApproximateSizes) {  // Size() estimates for 80 uncompressed 100K values stay within bounds across reopens and compactions.
+ do {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ options.compression = kNoCompression;
+ DestroyAndReopen();  // Start from an empty DB (opened with the default options, not "options").
+
+ ASSERT_TRUE(Between(Size("", "xyz"), 0, 0));
+ Reopen(&options);
+ ASSERT_TRUE(Between(Size("", "xyz"), 0, 0));
+
+ // Write 8MB (80 values, each 100K)
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ const int N = 80;
+ static const int S1 = 100000;
+ static const int S2 = 105000; // Allow some expansion from metadata
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, S1)));
+ }
+
+ // 0 because GetApproximateSizes() does not account for memtable space
+ ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
+
+ // Check sizes across recovery by reopening a few times
+ for (int run = 0; run < 3; run++) {
+ Reopen(&options);
+
+ for (int compact_start = 0; compact_start < N; compact_start += 10) {
+ for (int i = 0; i < N; i += 10) {
+ ASSERT_TRUE(Between(Size("", Key(i)), S1*i, S2*i));  // i entries sort before Key(i).
+ ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), S1*(i+1), S2*(i+1)));  // The suffixed limit sorts after Key(i), so i+1 entries are counted.
+ ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), S1*10, S2*10));
+ }
+ ASSERT_TRUE(Between(Size("", Key(50)), S1*50, S2*50));
+ ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), S1*50, S2*50));  // Bounds are for 50 entries although 51 sort below the limit; 51*S1 still fits under 50*S2.
+
+ std::string cstart_str = Key(compact_start);
+ std::string cend_str = Key(compact_start + 9);
+ Slice cstart = cstart_str;
+ Slice cend = cend_str;
+ dbfull()->TEST_CompactRange(0, &cstart, &cend);  // Compact ten keys of level 0 per pass.
+ }
+
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ }
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {  // Size() estimates for a mix of 10K, 100K and 300K values, checked over three reopens.
+ do {
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ Reopen();  // Uses the default options; "options" only takes effect at the Reopen(&options) calls below.
+
+ Random rnd(301);
+ std::string big1 = RandomString(&rnd, 100000);
+ ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000)));
+ ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000)));
+ ASSERT_OK(Put(Key(2), big1));
+ ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000)));
+ ASSERT_OK(Put(Key(4), big1));
+ ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000)));
+ ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000)));
+ ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000)));
+
+ // Check sizes across recovery by reopening a few times
+ for (int run = 0; run < 3; run++) {
+ Reopen(&options);
+
+ ASSERT_TRUE(Between(Size("", Key(0)), 0, 0));  // Each lower bound below is the running total of the value sizes written above.
+ ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000));
+ ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000));
+ ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000));
+ ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000));
+ ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000));
+ ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000));
+ ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000));
+ ASSERT_TRUE(Between(Size("", Key(8)), 550000, 560000));
+
+ ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000));  // Key(3) (10K) plus Key(4) (100K).
+
+ dbfull()->TEST_CompactRange(0, NULL, NULL);
+ }
+ } while (ChangeOptions());
+}
+
+TEST(DBTest, IteratorPinsRef) {  // An iterator created before heavy writes still sees only the contents from its creation time.
+ Put("foo", "hello");
+
+ // Get iterator that will yield the current contents of the DB.
+ Iterator* iter = db_->NewIterator(ReadOptions());
+
+ // Write to force compactions
+ Put("foo", "newvalue1");
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values
+ }
+ Put("foo", "newvalue2");
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("hello", iter->value().ToString());  // Neither "newvalue1" nor "newvalue2".
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());  // None of the Key(i) entries written later are visible.
+ delete iter;
+}
+
+TEST(DBTest, Snapshot) {  // Three snapshots each keep their own view of "foo"; releasing one leaves the others intact.
+ do {
+ Put("foo", "v1");
+ const Snapshot* s1 = db_->GetSnapshot();
+ Put("foo", "v2");
+ const Snapshot* s2 = db_->GetSnapshot();
+ Put("foo", "v3");
+ const Snapshot* s3 = db_->GetSnapshot();
+
+ Put("foo", "v4");
+ ASSERT_EQ("v1", Get("foo", s1));
+ ASSERT_EQ("v2", Get("foo", s2));
+ ASSERT_EQ("v3", Get("foo", s3));
+ ASSERT_EQ("v4", Get("foo"));
+
+ db_->ReleaseSnapshot(s3);  // Snapshots are released in the order s3, s1, s2.
+ ASSERT_EQ("v1", Get("foo", s1));
+ ASSERT_EQ("v2", Get("foo", s2));
+ ASSERT_EQ("v4", Get("foo"));
+
+ db_->ReleaseSnapshot(s1);
+ ASSERT_EQ("v2", Get("foo", s2));
+ ASSERT_EQ("v4", Get("foo"));
+
+ db_->ReleaseSnapshot(s2);
+ ASSERT_EQ("v4", Get("foo"));
+ } while (ChangeOptions());
+}
+
+// Checks that a 50000-byte value of "foo" that has been overwritten by a tiny
+// one stays readable through a snapshot, and that once the snapshot is
+// released, compacting levels 0 and 1 up to key "x" leaves only the tiny
+// value and shrinks the approximate size of the range before "pastfoo".
+TEST(DBTest, HiddenValuesAreRemoved) {
+ do {
+ Random rnd(301);
+ FillLevels("a", "z");
+
+ std::string big = RandomString(&rnd, 50000);
+ Put("foo", big);
+ Put("pastfoo", "v");
+ const Snapshot* snapshot = db_->GetSnapshot();
+ Put("foo", "tiny");
+ Put("pastfoo2", "v2"); // Advance sequence number one more
+
+ ASSERT_OK(dbfull()->TEST_CompactMemTable());
+ ASSERT_GT(NumTableFilesAtLevel(0), 0);
+
+ // While the snapshot is held the old, big value must still be reachable.
+ ASSERT_EQ(big, Get("foo", snapshot));
+ ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000));
+ db_->ReleaseSnapshot(snapshot);
+ // Releasing the snapshot alone does not drop the old entry; compaction does.
+ ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]");
+ Slice x("x");
+ dbfull()->TEST_CompactRange(0, NULL, &x);
+ ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_GE(NumTableFilesAtLevel(1), 1);
+ dbfull()->TEST_CompactRange(1, NULL, &x);
+ ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
+
+ ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
+ } while (ChangeOptions());
+}
+
+// Checks at which compaction step the deletion marker for "foo" and the old
+// value v1 beneath it are dropped, in the case where a newer value v2 has
+// been written on top of the marker.
+TEST(DBTest, DeletionMarkers1) {
+ Put("foo", "v1");
+ ASSERT_OK(dbfull()->TEST_CompactMemTable());
+ const int last = config::kMaxMemCompactLevel;
+ ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level
+
+ // Place a table at level last-1 to prevent merging with preceding mutation
+ Put("a", "begin");
+ Put("z", "end");
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ(NumTableFilesAtLevel(last), 1);
+ ASSERT_EQ(NumTableFilesAtLevel(last-1), 1);
+
+ Delete("foo");
+ Put("foo", "v2");
+ ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
+ ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2
+ ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
+ Slice z("z");
+ dbfull()->TEST_CompactRange(last-2, NULL, &z);
+ // DEL eliminated, but v1 remains because we aren't compacting that level
+ // (DEL can be eliminated because v2 hides v1).
+ ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]");
+ dbfull()->TEST_CompactRange(last-1, NULL, NULL);
+ // Merging last-1 w/ last, so we are the base level for "foo", so
+ // DEL is removed. (as is v1).
+ ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]");
+}
+
+// Checks at which compaction step the deletion marker for "foo" is dropped
+// when nothing newer sits on top of it: the marker must survive the
+// compaction of level last-2 and disappear, together with v1, only when
+// level last-1 is merged into the last level.
+TEST(DBTest, DeletionMarkers2) {
+ Put("foo", "v1");
+ ASSERT_OK(dbfull()->TEST_CompactMemTable());
+ const int last = config::kMaxMemCompactLevel;
+ ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level
+
+ // Place a table at level last-1 to prevent merging with preceding mutation
+ Put("a", "begin");
+ Put("z", "end");
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ(NumTableFilesAtLevel(last), 1);
+ ASSERT_EQ(NumTableFilesAtLevel(last-1), 1);
+
+ Delete("foo");
+ ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+ ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2
+ ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+ dbfull()->TEST_CompactRange(last-2, NULL, NULL);
+ // DEL kept: "last" file overlaps
+ ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+ dbfull()->TEST_CompactRange(last-1, NULL, NULL);
+ // Merging last-1 w/ last, so we are the base level for "foo", so
+ // DEL is removed. (as is v1).
+ ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
+}
+
+// Regression test: a memtable compaction must notice that its keys overlap
+// existing level-0 files. After two overlapping level-0 files are built and
+// the deeper levels emptied, a deletion of "600" has to land in level 0 so
+// that the older value of "600" stays hidden.
+TEST(DBTest, OverlapInLevel0) {
+ do {
+ ASSERT_EQ(config::kMaxMemCompactLevel, 2) << "Fix test to match config";
+
+ // Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0.
+ ASSERT_OK(Put("100", "v100"));
+ ASSERT_OK(Put("999", "v999"));
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_OK(Delete("100"));
+ ASSERT_OK(Delete("999"));
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ("0,1,1", FilesPerLevel());
+
+ // Make files spanning the following ranges in level-0:
+ // files[0] 200 .. 900
+ // files[1] 300 .. 500
+ // Note that files are sorted by smallest key.
+ ASSERT_OK(Put("300", "v300"));
+ ASSERT_OK(Put("500", "v500"));
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_OK(Put("200", "v200"));
+ ASSERT_OK(Put("600", "v600"));
+ ASSERT_OK(Put("900", "v900"));
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ("2,1,1", FilesPerLevel());
+
+ // Compact away the placeholder files we created initially
+ dbfull()->TEST_CompactRange(1, NULL, NULL);
+ dbfull()->TEST_CompactRange(2, NULL, NULL);
+ ASSERT_EQ("2", FilesPerLevel());
+
+ // Do a memtable compaction. Before bug-fix, the compaction would
+ // not detect the overlap with level-0 files and would incorrectly place
+ // the deletion in a deeper level.
+ ASSERT_OK(Delete("600"));
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ("3", FilesPerLevel());
+ ASSERT_EQ("NOT_FOUND", Get("600"));
+ } while (ChangeOptions());
+}
+
+// Regression test (named after issue 44): interleaves single writes and
+// deletes with reopens, then checks that the visible contents are the same
+// immediately after the last reopen and again one second later, once a
+// background compaction has had time to run.
+TEST(DBTest, L0_CompactionBug_Issue44_a) {
+ Reopen();
+ ASSERT_OK(Put("b", "v"));
+ Reopen();
+ ASSERT_OK(Delete("b"));
+ ASSERT_OK(Delete("a"));
+ Reopen();
+ ASSERT_OK(Delete("a"));
+ Reopen();
+ ASSERT_OK(Put("a", "v"));
+ Reopen();
+ Reopen();
+ ASSERT_EQ("(a->v)", Contents());
+ DelayMilliseconds(1000); // Wait for compaction to finish
+ ASSERT_EQ("(a->v)", Contents());
+}
+
+// Second regression test named after issue 44: a longer sequence of writes
+// (including repeated writes of the empty key) and deletes interleaved with
+// reopens. The visible contents must be identical right after the final
+// reopen and after waiting one second for compaction.
+TEST(DBTest, L0_CompactionBug_Issue44_b) {
+ Reopen();
+ Put("","");
+ Reopen();
+ Delete("e");
+ Put("","");
+ Reopen();
+ Put("c", "cv");
+ Reopen();
+ Put("","");
+ Reopen();
+ Put("","");
+ DelayMilliseconds(1000); // Wait for compaction to finish
+ Reopen();
+ Put("d","dv");
+ Reopen();
+ Put("","");
+ Reopen();
+ Delete("d");
+ Delete("b");
+ Reopen();
+ ASSERT_EQ("(->)(c->cv)", Contents());
+ DelayMilliseconds(1000); // Wait for compaction to finish
+ ASSERT_EQ("(->)(c->cv)", Contents());
+}
+
+// Checks that reopening the database with a comparator reporting a different
+// Name() is rejected, and that the error text mentions "comparator".
+// NOTE(review): presumably the fixture created the DB with the default
+// bytewise comparator -- confirm against the DBTest constructor.
+TEST(DBTest, ComparatorCheck) {
+ // Orders keys exactly like BytewiseComparator(); only the name differs.
+ class NewComparator : public Comparator {
+ public:
+ virtual const char* Name() const { return "leveldb.NewComparator"; }
+ virtual int Compare(const Slice& a, const Slice& b) const {
+ return BytewiseComparator()->Compare(a, b);
+ }
+ virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
+ BytewiseComparator()->FindShortestSeparator(s, l);
+ }
+ virtual void FindShortSuccessor(std::string* key) const {
+ BytewiseComparator()->FindShortSuccessor(key);
+ }
+ };
+ NewComparator cmp;
+ Options new_options = CurrentOptions();
+ new_options.comparator = &cmp;
+ Status s = TryReopen(&new_options);
+ ASSERT_TRUE(!s.ok());
+ ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
+ << s.ToString();
+}
+
+// Exercises a database opened with a user-supplied comparator. Keys have the
+// form "[<number>]" and are ordered by numeric value, so different spellings
+// of the same number (e.g. "[10]" and "[0xa]") name the same key.
+TEST(DBTest, CustomComparator) {
+ class NumberComparator : public Comparator {
+ public:
+ virtual const char* Name() const { return "test.NumberComparator"; }
+ virtual int Compare(const Slice& a, const Slice& b) const {
+ return ToNumber(a) - ToNumber(b);
+ }
+ // The two Find* hooks never shorten anything; they only validate that the
+ // keys they are handed are well formed.
+ virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
+ ToNumber(*s); // Check format
+ ToNumber(l); // Check format
+ }
+ virtual void FindShortSuccessor(std::string* key) const {
+ ToNumber(*key); // Check format
+ }
+ private:
+ // Parses "[<number>]"; "%i" accepts decimal, 0x-prefixed hex and
+ // 0-prefixed octal. Asserts if the key is malformed.
+ static int ToNumber(const Slice& x) {
+ // Check that there are no extra characters.
+ ASSERT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size()-1] == ']')
+ << EscapeString(x);
+ int val;
+ char ignored;
+ // Exactly one conversion must succeed: a second one would mean there is
+ // trailing data after the closing bracket.
+ ASSERT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1)
+ << EscapeString(x);
+ return val;
+ }
+ };
+ NumberComparator cmp;
+ Options new_options = CurrentOptions();
+ new_options.create_if_missing = true;
+ new_options.comparator = &cmp;
+ new_options.filter_policy = NULL; // Cannot use bloom filters
+ new_options.write_buffer_size = 1000; // Compact more often
+ DestroyAndReopen(&new_options);
+ ASSERT_OK(Put("[10]", "ten"));
+ ASSERT_OK(Put("[0x14]", "twenty"));
+ // Run the lookups twice: once before and once after the Compact() call.
+ for (int i = 0; i < 2; i++) {
+ ASSERT_EQ("ten", Get("[10]"));
+ ASSERT_EQ("ten", Get("[0xa]"));
+ ASSERT_EQ("twenty", Get("[20]"));
+ ASSERT_EQ("twenty", Get("[0x14]"));
+ ASSERT_EQ("NOT_FOUND", Get("[15]"));
+ ASSERT_EQ("NOT_FOUND", Get("[0xf]"));
+ Compact("[0]", "[9999]");
+ }
+
+ // Write 1000 keys twice, compacting after each pass.
+ for (int run = 0; run < 2; run++) {
+ for (int i = 0; i < 1000; i++) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "[%d]", i*10);
+ ASSERT_OK(Put(buf, buf));
+ }
+ Compact("[0]", "[1000000]");
+ }
+}
+
+// Exercises manual compaction with ranges that lie before, after and across
+// the existing table files, checking the per-level file counts reported by
+// FilesPerLevel() after each step.
+TEST(DBTest, ManualCompaction) {
+ ASSERT_EQ(config::kMaxMemCompactLevel, 2)
+ << "Need to update this test to match kMaxMemCompactLevel";
+
+ MakeTables(3, "p", "q");
+ ASSERT_EQ("1,1,1", FilesPerLevel());
+
+ // Compaction range falls before files
+ Compact("", "c");
+ ASSERT_EQ("1,1,1", FilesPerLevel());
+
+ // Compaction range falls after files
+ Compact("r", "z");
+ ASSERT_EQ("1,1,1", FilesPerLevel());
+
+ // Compaction range overlaps files
+ Compact("p1", "p9");
+ ASSERT_EQ("0,0,1", FilesPerLevel());
+
+ // Populate a different range
+ MakeTables(3, "c", "e");
+ ASSERT_EQ("1,1,2", FilesPerLevel());
+
+ // Compact just the new range
+ Compact("b", "f");
+ ASSERT_EQ("0,0,2", FilesPerLevel());
+
+ // Compact all
+ MakeTables(1, "a", "z");
+ ASSERT_EQ("0,1,2", FilesPerLevel());
+ db_->CompactRange(NULL, NULL);
+ ASSERT_EQ("0,0,1", FilesPerLevel());
+}
+
+// Checks how DB::Open reacts to create_if_missing and error_if_exists, first
+// against a database that does not exist and then against one that does.
+// Uses its own database directory rather than the fixture's.
+TEST(DBTest, DBOpen_Options) {
+ std::string dbname = test::TmpDir() + "/db_options_test";
+ DestroyDB(dbname, Options());
+
+ // Does not exist, and create_if_missing == false: error
+ DB* db = NULL;
+ Options opts;
+ opts.create_if_missing = false;
+ Status s = DB::Open(opts, dbname, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != NULL);
+ ASSERT_TRUE(db == NULL);
+
+ // Does not exist, and create_if_missing == true: OK
+ opts.create_if_missing = true;
+ s = DB::Open(opts, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != NULL);
+
+ delete db;
+ db = NULL;
+
+ // Does exist, and error_if_exists == true: error
+ opts.create_if_missing = false;
+ opts.error_if_exists = true;
+ s = DB::Open(opts, dbname, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != NULL);
+ ASSERT_TRUE(db == NULL);
+
+ // Does exist, and error_if_exists == false: OK
+ opts.create_if_missing = true;
+ opts.error_if_exists = false;
+ s = DB::Open(opts, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != NULL);
+
+ delete db;
+ db = NULL;
+}
+
+// Checks that a second DB::Open on dbname_ fails.
+// NOTE(review): presumably the fixture already holds dbname_ open, which is
+// what makes this a locking test -- confirm against the DBTest constructor.
+TEST(DBTest, Locking) {
+ DB* db2 = NULL;
+ Status s = DB::Open(CurrentOptions(), dbname_, &db2);
+ ASSERT_TRUE(!s.ok()) << "Locking did not prevent re-opening db";
+}
+
+// Check that number of files does not grow when we are out of space
+TEST(DBTest, NoSpace) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ Reopen(&options);
+
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_EQ("v1", Get("foo"));
+ Compact("a", "z");
+ const int num_files = CountFiles();
+ env_->no_space_.Release_Store(env_); // Force out-of-space errors
+ // Attempt 10 rounds of compaction over every level but the last while the
+ // out-of-space flag is set.
+ for (int i = 0; i < 10; i++) {
+ for (int level = 0; level < config::kNumLevels-1; level++) {
+ dbfull()->TEST_CompactRange(level, NULL, NULL);
+ }
+ }
+ env_->no_space_.Release_Store(NULL);
+ // At most two extra files are tolerated.
+ ASSERT_LT(CountFiles(), num_files + 3);
+}
+
+// Checks that writes start failing once the test environment's
+// non_writable_ flag is set: with a 1000-byte write buffer, at least one of
+// twenty 100000-byte writes must report an error.
+TEST(DBTest, NonWritableFileSystem) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1000;
+ options.env = env_;
+ Reopen(&options);
+ ASSERT_OK(Put("foo", "v1"));
+ env_->non_writable_.Release_Store(env_); // Force errors for new files
+ std::string big(100000, 'x');
+ int errors = 0;
+ for (int i = 0; i < 20; i++) {
+ fprintf(stderr, "iter %d; errors %d\n", i, errors);
+ if (!Put("foo", big).ok()) {
+ errors++;
+ DelayMilliseconds(100);
+ }
+ }
+ ASSERT_GT(errors, 0);
+ env_->non_writable_.Release_Store(NULL);
+}
+
+// Checks that a failed synchronous write poisons the DB: the failed write is
+// not readable, and later writes keep failing even after sync calls work
+// again and even when they do not request a sync.
+TEST(DBTest, WriteSyncError) {
+ // Check that log sync errors cause the DB to disallow future writes.
+
+ // (a) Cause log sync calls to fail
+ Options options = CurrentOptions();
+ options.env = env_;
+ Reopen(&options);
+ env_->data_sync_error_.Release_Store(env_);
+
+ // (b) Normal write should succeed
+ WriteOptions w;
+ ASSERT_OK(db_->Put(w, "k1", "v1"));
+ ASSERT_EQ("v1", Get("k1"));
+
+ // (c) Do a sync write; should fail
+ w.sync = true;
+ ASSERT_TRUE(!db_->Put(w, "k2", "v2").ok());
+ ASSERT_EQ("v1", Get("k1"));
+ ASSERT_EQ("NOT_FOUND", Get("k2"));
+
+ // (d) make sync behave normally
+ env_->data_sync_error_.Release_Store(NULL);
+
+ // (e) Do a non-sync write; should fail
+ w.sync = false;
+ ASSERT_TRUE(!db_->Put(w, "k3", "v3").ok());
+ ASSERT_EQ("v1", Get("k1"));
+ ASSERT_EQ("NOT_FOUND", Get("k2"));
+ ASSERT_EQ("NOT_FOUND", Get("k3"));
+}
+
+// Checks that no data is lost when a compaction fails while recording its
+// result in the MANIFEST: foo => bar must remain readable right after the
+// failed compaction and again after reopening the database.
+TEST(DBTest, ManifestWriteError) {
+ // Test for the following problem:
+ // (a) Compaction produces file F
+ // (b) Log record containing F is written to MANIFEST file, but Sync() fails
+ // (c) GC deletes F
+ // (d) After reopening DB, reads fail since deleted F is named in log record
+
+ // We iterate twice. In the second iteration, everything is the
+ // same except the log record never makes it to the MANIFEST file.
+ for (int iter = 0; iter < 2; iter++) {
+ // Pick which failure flag of the test environment this pass injects.
+ port::AtomicPointer* error_type = (iter == 0)
+ ? &env_->manifest_sync_error_
+ : &env_->manifest_write_error_;
+
+ // Insert foo=>bar mapping
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Memtable compaction (will succeed)
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ("bar", Get("foo"));
+ const int last = config::kMaxMemCompactLevel;
+ ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo=>bar is now in last level
+
+ // Merging compaction (will fail)
+ error_type->Release_Store(env_);
+ dbfull()->TEST_CompactRange(last, NULL, NULL); // Should fail
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Recovery: should not lose data
+ error_type->Release_Store(NULL);
+ Reopen(&options);
+ ASSERT_EQ("bar", Get("foo"));
+ }
+}
+
+// Checks that, with paranoid_checks enabled, reopening a database after one
+// of its table files has been deleted fails with an error whose text
+// contains "issing" (which matches both "missing" and "Missing").
+TEST(DBTest, MissingSSTFile) {
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Dump the memtable to disk.
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ("bar", Get("foo"));
+
+ Close();
+ ASSERT_TRUE(DeleteAnSSTFile());
+ Options options = CurrentOptions();
+ options.paranoid_checks = true;
+ Status s = TryReopen(&options);
+ ASSERT_TRUE(!s.ok());
+ ASSERT_TRUE(s.ToString().find("issing") != std::string::npos)
+ << s.ToString();
+}
+
+// Checks that the database still opens (with paranoid_checks) and returns
+// its data after RenameLDBToSST() has renamed at least one file.
+// NOTE(review): presumably that helper renames ".ldb" table files to the
+// ".sst" suffix -- confirm against its definition.
+TEST(DBTest, StillReadSST) {
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Dump the memtable to disk.
+ dbfull()->TEST_CompactMemTable();
+ ASSERT_EQ("bar", Get("foo"));
+ Close();
+ ASSERT_GT(RenameLDBToSST(), 0);
+ Options options = CurrentOptions();
+ options.paranoid_checks = true;
+ Status s = TryReopen(&options);
+ ASSERT_TRUE(s.ok());
+ ASSERT_EQ("bar", Get("foo"));
+}
+
+// Checks that repeatedly overwriting a key and compacting does not change the
+// number of files reported by CountFiles(), i.e. files made obsolete by a
+// compaction do not accumulate.
+TEST(DBTest, FilesDeletedAfterCompaction) {
+ ASSERT_OK(Put("foo", "v2"));
+ Compact("a", "z");
+ const int num_files = CountFiles();
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put("foo", "v2"));
+ Compact("a", "z");
+ }
+ ASSERT_EQ(CountFiles(), num_files);
+}
+
+// Checks that a bloom filter policy keeps the number of random reads low.
+// With the block cache disabled, looking up N present keys may cost at most
+// N + 2*N/100 reads, and looking up N missing keys at most 3*N/100 reads.
+TEST(DBTest, BloomFilter) {
+ env_->count_random_reads_ = true;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.block_cache = NewLRUCache(0); // Prevent cache hits
+ options.filter_policy = NewBloomFilterPolicy(10);
+ Reopen(&options);
+
+ // Populate multiple layers
+ const int N = 10000;
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i), Key(i)));
+ }
+ Compact("a", "z");
+ for (int i = 0; i < N; i += 100) {
+ ASSERT_OK(Put(Key(i), Key(i)));
+ }
+ dbfull()->TEST_CompactMemTable();
+
+ // Prevent auto compactions triggered by seeks
+ env_->delay_data_sync_.Release_Store(env_);
+
+ // Lookup present keys. Should rarely read from small sstable.
+ env_->random_read_counter_.Reset();
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i), Get(Key(i)));
+ }
+ int reads = env_->random_read_counter_.Read();
+ fprintf(stderr, "%d present => %d reads\n", N, reads);
+ ASSERT_GE(reads, N);
+ ASSERT_LE(reads, N + 2*N/100);
+
+ // Lookup missing keys. Should rarely read from either sstable.
+ env_->random_read_counter_.Reset();
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(Key(i) + ".missing"));
+ }
+ reads = env_->random_read_counter_.Read();
+ fprintf(stderr, "%d missing => %d reads\n", N, reads);
+ ASSERT_LE(reads, 3*N/100);
+
+ env_->delay_data_sync_.Release_Store(NULL);
+ Close();
+ // The cache and the filter policy were allocated by this test, so it
+ // frees them itself once the DB is closed.
+ delete options.block_cache;
+ delete options.filter_policy;
+}
+
+// Multi-threaded test:
+namespace {
+
+static const int kNumThreads = 4;
+static const int kTestSeconds = 10;
+static const int kNumKeys = 1000;
+
+// State shared between the test body and all worker threads.
+struct MTState {
+ DBTest* test;
+ // Non-NULL once the workers have been asked to stop.
+ port::AtomicPointer stop;
+ // Per-thread operation counter, stored as an integer cast to a pointer.
+ port::AtomicPointer counter[kNumThreads];
+ // Per-thread flag; non-NULL once that thread has left its loop.
+ port::AtomicPointer thread_done[kNumThreads];
+};
+
+// Argument handed to each worker thread.
+struct MTThread {
+ MTState* state;
+ int id;
+};
+
+// Worker loop: until asked to stop, picks a random key and with equal
+// probability either writes "<key>.<thread id>.<counter>" (padded) to it or
+// reads it back and validates the stored triple.
+static void MTThreadBody(void* arg) {
+ MTThread* t = reinterpret_cast<MTThread*>(arg);
+ int id = t->id;
+ DB* db = t->state->test->db_;
+ uintptr_t counter = 0;
+ fprintf(stderr, "... starting thread %d\n", id);
+ Random rnd(1000 + id);
+ std::string value;
+ char valbuf[1500];
+ while (t->state->stop.Acquire_Load() == NULL) {
+ // Publish the counter before using it in a write, so a reader that sees
+ // the written value can rely on the published counter being >= it.
+ t->state->counter[id].Release_Store(reinterpret_cast<void*>(counter));
+
+ int key = rnd.Uniform(kNumKeys);
+ char keybuf[20];
+ snprintf(keybuf, sizeof(keybuf), "%016d", key);
+
+ if (rnd.OneIn(2)) {
+ // Write values of the form <key, my id, counter>.
+ // We add some padding to force compactions.
+ snprintf(valbuf, sizeof(valbuf), "%d.%d.%-1000d",
+ key, id, static_cast<int>(counter));
+ ASSERT_OK(db->Put(WriteOptions(), Slice(keybuf), Slice(valbuf)));
+ } else {
+ // Read a value and verify that it matches the pattern written above.
+ Status s = db->Get(ReadOptions(), Slice(keybuf), &value);
+ if (s.IsNotFound()) {
+ // Key has not yet been written
+ } else {
+ // Check that the writer thread counter is >= the counter in the value
+ ASSERT_OK(s);
+ int k, w, c;
+ ASSERT_EQ(3, sscanf(value.c_str(), "%d.%d.%d", &k, &w, &c)) << value;
+ ASSERT_EQ(k, key);
+ ASSERT_GE(w, 0);
+ ASSERT_LT(w, kNumThreads);
+ ASSERT_LE(static_cast<uintptr_t>(c), reinterpret_cast<uintptr_t>(
+ t->state->counter[w].Acquire_Load()));
+ }
+ }
+ counter++;
+ }
+ // Signal completion; any non-NULL pointer serves as the "done" marker.
+ t->state->thread_done[id].Release_Store(t);
+ fprintf(stderr, "... stopping thread %d after %d ops\n", id, int(counter));
+}
+
+} // namespace
+
+// Runs kNumThreads worker threads executing MTThreadBody against the shared
+// DB for kTestSeconds seconds, then asks them to stop and waits until every
+// one of them has reported completion.
+TEST(DBTest, MultiThreaded) {
+ do {
+ // Initialize state
+ MTState mt;
+ mt.test = this;
+ mt.stop.Release_Store(0);
+ for (int id = 0; id < kNumThreads; id++) {
+ mt.counter[id].Release_Store(0);
+ mt.thread_done[id].Release_Store(0);
+ }
+
+ // Start threads
+ MTThread thread[kNumThreads];
+ for (int id = 0; id < kNumThreads; id++) {
+ thread[id].state = &mt;
+ thread[id].id = id;
+ env_->StartThread(MTThreadBody, &thread[id]);
+ }
+
+ // Let them run for a while
+ DelayMilliseconds(kTestSeconds * 1000);
+
+ // Stop the threads and wait for them to finish
+ mt.stop.Release_Store(&mt);
+ for (int id = 0; id < kNumThreads; id++) {
+ // Poll: mt and thread[] live on this stack frame, so the loop must not
+ // exit before every worker has stopped touching them.
+ while (mt.thread_done[id].Acquire_Load() == NULL) {
+ DelayMilliseconds(100);
+ }
+ }
+ } while (ChangeOptions());
+}
+
+namespace {
+// Ordered key => value map used as the backing store of ModelDB below.
+typedef std::map<std::string, std::string> KVMap;
+}
+
+// Minimal in-memory DB implementation backed by a KVMap. The Randomized test
+// applies the same operations to a ModelDB and to the real DB and compares
+// what their iterators yield. Get() is intentionally not implemented.
+class ModelDB: public DB {
+ public:
+ // A snapshot is simply a full copy of the map at the time it was taken.
+ class ModelSnapshot : public Snapshot {
+ public:
+ KVMap map_;
+ };
+
+ explicit ModelDB(const Options& options): options_(options) { }
+ ~ModelDB() { }
+ // Put and Delete defer to the DB base-class implementations.
+ // NOTE(review): presumably those wrap the operation in a WriteBatch and
+ // call Write() below -- confirm against DB::Put / DB::Delete.
+ virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) {
+ return DB::Put(o, k, v);
+ }
+ virtual Status Delete(const WriteOptions& o, const Slice& key) {
+ return DB::Delete(o, key);
+ }
+ virtual Status Get(const ReadOptions& options,
+ const Slice& key, std::string* value) {
+ assert(false); // Not implemented
+ return Status::NotFound(key);
+ }
+ // Without a snapshot the iterator walks (and owns) a private copy of the
+ // current map; with a snapshot it walks the snapshot's map, which it does
+ // not own.
+ virtual Iterator* NewIterator(const ReadOptions& options) {
+ if (options.snapshot == NULL) {
+ KVMap* saved = new KVMap;
+ *saved = map_;
+ return new ModelIter(saved, true);
+ } else {
+ const KVMap* snapshot_state =
+ &(reinterpret_cast<const ModelSnapshot*>(options.snapshot)->map_);
+ return new ModelIter(snapshot_state, false);
+ }
+ }
+ // Returns a heap-allocated copy of the current state; the caller hands it
+ // back through ReleaseSnapshot(), which deletes it.
+ virtual const Snapshot* GetSnapshot() {
+ ModelSnapshot* snapshot = new ModelSnapshot;
+ snapshot->map_ = map_;
+ return snapshot;
+ }
+
+ virtual void ReleaseSnapshot(const Snapshot* snapshot) {
+ delete reinterpret_cast<const ModelSnapshot*>(snapshot);
+ }
+ // Applies every operation in the batch, in order, directly to map_.
+ virtual Status Write(const WriteOptions& options, WriteBatch* batch) {
+ class Handler : public WriteBatch::Handler {
+ public:
+ KVMap* map_;
+ virtual void Put(const Slice& key, const Slice& value) {
+ (*map_)[key.ToString()] = value.ToString();
+ }
+ virtual void Delete(const Slice& key) {
+ map_->erase(key.ToString());
+ }
+ };
+ Handler handler;
+ handler.map_ = &map_;
+ return batch->Iterate(&handler);
+ }
+
+ // The remaining DB operations are stubs: no properties, all approximate
+ // sizes reported as zero, compaction is a no-op.
+ virtual bool GetProperty(const Slice& property, std::string* value) {
+ return false;
+ }
+ virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) {
+ for (int i = 0; i < n; i++) {
+ sizes[i] = 0;
+ }
+ }
+ virtual void CompactRange(const Slice* start, const Slice* end) {
+ }
+
+ private:
+ // Iterator over a KVMap; deletes the map on destruction when "owned".
+ class ModelIter: public Iterator {
+ public:
+ // Starts out invalid (positioned at end()) until one of the Seek* calls.
+ ModelIter(const KVMap* map, bool owned)
+ : map_(map), owned_(owned), iter_(map_->end()) {
+ }
+ ~ModelIter() {
+ if (owned_) delete map_;
+ }
+ virtual bool Valid() const { return iter_ != map_->end(); }
+ virtual void SeekToFirst() { iter_ = map_->begin(); }
+ virtual void SeekToLast() {
+ if (map_->empty()) {
+ iter_ = map_->end();
+ } else {
+ // Look the largest key up again to obtain a forward iterator to it.
+ iter_ = map_->find(map_->rbegin()->first);
+ }
+ }
+ virtual void Seek(const Slice& k) {
+ iter_ = map_->lower_bound(k.ToString());
+ }
+ virtual void Next() { ++iter_; }
+ virtual void Prev() { --iter_; }
+ virtual Slice key() const { return iter_->first; }
+ virtual Slice value() const { return iter_->second; }
+ virtual Status status() const { return Status::OK(); }
+ private:
+ const KVMap* const map_;
+ const bool owned_; // Do we own map_
+ KVMap::const_iterator iter_;
+ };
+ const Options options_;
+ KVMap map_;
+};
+
+// Produces a random key for the randomized test. One call in three yields a
+// one-character key; otherwise the length is drawn from rnd->Skewed(10)
+// (one time in a hundred) or rnd->Uniform(10).
+static std::string RandomKey(Random* rnd) {
+  int key_length;
+  if (rnd->OneIn(3)) {
+    key_length = 1;  // Short sometimes to encourage collisions
+  } else if (rnd->OneIn(100)) {
+    key_length = rnd->Skewed(10);
+  } else {
+    key_length = rnd->Uniform(10);
+  }
+  return test::RandomKey(rnd, key_length);
+}
+
+// Walks "model" and "db" in lock step, starting from their first entries, and
+// checks that both yield the same sequence of key/value pairs.
+//
+// step        only labels the diagnostics written to stderr
+// model, db   the two databases to compare
+// model_snap  snapshot to read "model" through (NULL: current state)
+// db_snap     snapshot to read "db" through (NULL: current state)
+//
+// Returns true iff no key mismatch, value mismatch or difference in the
+// number of entries was found. The first problem found is reported on
+// stderr, together with the number of entries that were compared.
+static bool CompareIterators(int step,
+                             DB* model,
+                             DB* db,
+                             const Snapshot* model_snap,
+                             const Snapshot* db_snap) {
+  ReadOptions options;
+  options.snapshot = model_snap;
+  Iterator* miter = model->NewIterator(options);
+  options.snapshot = db_snap;
+  Iterator* dbiter = db->NewIterator(options);
+  bool ok = true;
+  int count = 0;
+  for (miter->SeekToFirst(), dbiter->SeekToFirst();
+       ok && miter->Valid() && dbiter->Valid();
+       miter->Next(), dbiter->Next()) {
+    count++;
+    if (miter->key().compare(dbiter->key()) != 0) {
+      fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n",
+              step,
+              EscapeString(miter->key()).c_str(),
+              EscapeString(dbiter->key()).c_str());
+      ok = false;
+      break;
+    }
+
+    if (miter->value().compare(dbiter->value()) != 0) {
+      // Report the model's value against the DB's value. (The model's value
+      // used to be printed twice, which hid the DB's actual value.)
+      fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n",
+              step,
+              EscapeString(miter->key()).c_str(),
+              EscapeString(miter->value()).c_str(),
+              EscapeString(dbiter->value()).c_str());
+      ok = false;
+    }
+  }
+
+  if (ok) {
+    // Both sequences agreed so far; they must also end at the same point.
+    if (miter->Valid() != dbiter->Valid()) {
+      fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n",
+              step, miter->Valid(), dbiter->Valid());
+      ok = false;
+    }
+  }
+  fprintf(stderr, "%d entries compared: ok=%d\n", count, ok);
+  delete miter;
+  delete dbiter;
+  return ok;
+}
+
+// Randomized consistency test: applies N random puts, deletes and write
+// batches to both the real DB and a ModelDB, and every 100 steps checks that
+// their iterators agree -- for the current state, for the snapshots taken at
+// the previous check, and for the current state again after a reopen.
+TEST(DBTest, Randomized) {
+ Random rnd(test::RandomSeed());
+ do {
+ ModelDB model(CurrentOptions());
+ const int N = 10000;
+ const Snapshot* model_snap = NULL;
+ const Snapshot* db_snap = NULL;
+ std::string k, v;
+ for (int step = 0; step < N; step++) {
+ if (step % 100 == 0) {
+ fprintf(stderr, "Step %d of %d\n", step, N);
+ }
+ // TODO(sanjay): Test Get() works
+ // Operation mix: 45% put, 45% delete, 10% multi-element batch.
+ int p = rnd.Uniform(100);
+ if (p < 45) { // Put
+ k = RandomKey(&rnd);
+ v = RandomString(&rnd,
+ rnd.OneIn(20)
+ ? 100 + rnd.Uniform(100)
+ : rnd.Uniform(8));
+ ASSERT_OK(model.Put(WriteOptions(), k, v));
+ ASSERT_OK(db_->Put(WriteOptions(), k, v));
+
+ } else if (p < 90) { // Delete
+ k = RandomKey(&rnd);
+ ASSERT_OK(model.Delete(WriteOptions(), k));
+ ASSERT_OK(db_->Delete(WriteOptions(), k));
+
+
+ } else { // Multi-element batch
+ WriteBatch b;
+ const int num = rnd.Uniform(8);
+ for (int i = 0; i < num; i++) {
+ if (i == 0 || !rnd.OneIn(10)) {
+ k = RandomKey(&rnd);
+ } else {
+ // Periodically re-use the same key from the previous iter, so
+ // we have multiple entries in the write batch for the same key
+ }
+ if (rnd.OneIn(2)) {
+ v = RandomString(&rnd, rnd.Uniform(10));
+ b.Put(k, v);
+ } else {
+ b.Delete(k);
+ }
+ }
+ ASSERT_OK(model.Write(WriteOptions(), &b));
+ ASSERT_OK(db_->Write(WriteOptions(), &b));
+ }
+
+ if ((step % 100) == 0) {
+ ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL));
+ ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+ // Save a snapshot from each DB this time that we'll use next
+ // time we compare things, to make sure the current state is
+ // preserved with the snapshot
+ if (model_snap != NULL) model.ReleaseSnapshot(model_snap);
+ if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
+
+ // The old snapshots are released before the reopen; new ones are
+ // taken from the reopened DB afterwards.
+ Reopen();
+ ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL));
+
+ model_snap = model.GetSnapshot();
+ db_snap = db_->GetSnapshot();
+ }
+ }
+ if (model_snap != NULL) model.ReleaseSnapshot(model_snap);
+ if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
+ } while (ChangeOptions());
+}
+
+// Formats "num" as a decimal string zero-padded to at least 16 digits.
+std::string MakeKey(unsigned int num) {
+  char formatted[30];
+  snprintf(formatted, sizeof(formatted), "%016u", num);
+  std::string key(formatted);
+  return key;
+}
+
+// Benchmark for VersionSet::LogAndApply.
+//
+// Creates a fresh database, seeds a VersionSet with "num_base_files" files at
+// level 2, then times "iters" edits that each delete one file number and add
+// one file at level 2. The total and per-iteration times are printed to
+// stderr.
+void BM_LogAndApply(int iters, int num_base_files) {
+  std::string dbname = test::TmpDir() + "/leveldb_test_benchmark";
+  DestroyDB(dbname, Options());
+
+  // Open and immediately close a DB so that the directory holds a valid,
+  // empty database for the VersionSet to recover from.
+  DB* db = NULL;
+  Options opts;
+  opts.create_if_missing = true;
+  Status s = DB::Open(opts, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != NULL);
+
+  delete db;
+  db = NULL;
+
+  Env* env = Env::Default();
+
+  // LogAndApply takes the mutex as an argument; hold it for the whole run.
+  port::Mutex mu;
+  MutexLock l(&mu);
+
+  InternalKeyComparator cmp(BytewiseComparator());
+  Options options;
+  VersionSet vset(dbname, &options, NULL, &cmp);
+  ASSERT_OK(vset.Recover());
+  VersionEdit vbase;
+  uint64_t fnum = 1;
+  for (int i = 0; i < num_base_files; i++) {
+    InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
+    InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
+    vbase.AddFile(2, fnum++, 1 /* file size */, start, limit);
+  }
+  ASSERT_OK(vset.LogAndApply(&vbase, &mu));
+
+  uint64_t start_micros = env->NowMicros();
+
+  for (int i = 0; i < iters; i++) {
+    VersionEdit vedit;
+    vedit.DeleteFile(2, fnum);
+    InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
+    InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
+    vedit.AddFile(2, fnum++, 1 /* file size */, start, limit);
+    // A failed edit would make the reported timing meaningless, so do not
+    // silently discard the status.
+    ASSERT_OK(vset.LogAndApply(&vedit, &mu));
+  }
+  uint64_t stop_micros = env->NowMicros();
+  unsigned int us = static_cast<unsigned int>(stop_micros - start_micros);
+  char buf[16];
+  snprintf(buf, sizeof(buf), "%d", num_base_files);
+  fprintf(stderr,
+          "BM_LogAndApply/%-6s %8d iters : %9u us (%7.0f us / iter)\n",
+          buf, iters, us, static_cast<float>(us) / iters);
+}
+
+} // namespace leveldb
+
+// Entry point. With "--benchmark" as the first argument the LogAndApply
+// benchmark is run for four base-file counts; otherwise all registered
+// tests are run and their result becomes the exit code.
+int main(int argc, char** argv) {
+  const bool run_benchmark =
+      (argc > 1) && (std::string(argv[1]) == "--benchmark");
+  if (!run_benchmark) {
+    return leveldb::test::RunAllTests();
+  }
+
+  leveldb::BM_LogAndApply(1000, 1);
+  leveldb::BM_LogAndApply(1000, 100);
+  leveldb::BM_LogAndApply(1000, 10000);
+  leveldb::BM_LogAndApply(100, 100000);
+  return 0;
+}
diff --git a/src/leveldb/db/dbformat.cc b/src/leveldb/db/dbformat.cc
new file mode 100644
index 0000000000..20a7ca4462
--- /dev/null
+++ b/src/leveldb/db/dbformat.cc
@@ -0,0 +1,140 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+// Combines a sequence number and a value type into a single 64-bit tag: the
+// sequence number is shifted left by eight bits and the type is stored in
+// the low byte.
+static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
+  assert(seq <= kMaxSequenceNumber);
+  assert(t <= kValueTypeForSeek);
+  const uint64_t shifted_sequence = seq << 8;
+  return shifted_sequence | t;
+}
+
+// Appends the serialized form of "key" to *result: the user key bytes
+// followed by the fixed 64-bit tag built from its sequence number and type.
+void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
+  const char* user_data = key.user_key.data();
+  result->append(user_data, key.user_key.size());
+  const uint64_t tag = PackSequenceAndType(key.sequence, key.type);
+  PutFixed64(result, tag);
+}
+
+// Returns a human-readable rendering of the key in the form
+//   '<escaped user key>' @ <sequence> : <type as int>
+std::string ParsedInternalKey::DebugString() const {
+  char suffix[50];
+  snprintf(suffix, sizeof(suffix), "' @ %llu : %d",
+           static_cast<unsigned long long>(sequence),
+           static_cast<int>(type));
+  std::string text("'");
+  text.append(EscapeString(user_key.ToString()));
+  text.append(suffix);
+  return text;
+}
+
+// Returns the debug rendering of the parsed key, or "(bad)" followed by the
+// escaped raw bytes when the stored representation cannot be parsed.
+std::string InternalKey::DebugString() const {
+  ParsedInternalKey parsed;
+  if (!ParseInternalKey(rep_, &parsed)) {
+    std::string bad("(bad)");
+    bad.append(EscapeString(rep_));
+    return bad;
+  }
+  return parsed.DebugString();
+}
+
+// Returns the fixed name identifying this comparator.
+const char* InternalKeyComparator::Name() const {
+ return "leveldb.InternalKeyComparator";
+}
+
+// Three-way comparison of two internal keys. Order by:
+//    increasing user key (according to user-supplied comparator)
+//    decreasing sequence number
+//    decreasing type (though sequence# should be enough to disambiguate)
+// Sequence and type are compared together through the trailing 8-byte tag.
+int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
+  const int user_order =
+      user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
+  if (user_order != 0) {
+    return user_order;
+  }
+
+  // Same user key: the larger tag sorts first.
+  const uint64_t a_tag = DecodeFixed64(akey.data() + akey.size() - 8);
+  const uint64_t b_tag = DecodeFixed64(bkey.data() + bkey.size() - 8);
+  if (a_tag > b_tag) {
+    return -1;
+  }
+  if (a_tag < b_tag) {
+    return +1;
+  }
+  return 0;
+}
+
+// Lets the user comparator try to shorten the user-key portion of *start
+// relative to "limit". *start is replaced only when the result is physically
+// shorter yet sorts after the original user key; otherwise it is untouched.
+void InternalKeyComparator::FindShortestSeparator(
+    std::string* start,
+    const Slice& limit) const {
+  // Attempt to shorten the user portion of the key
+  Slice original_user_key = ExtractUserKey(*start);
+  Slice limit_user_key = ExtractUserKey(limit);
+  std::string candidate(original_user_key.data(), original_user_key.size());
+  user_comparator_->FindShortestSeparator(&candidate, limit_user_key);
+
+  const bool physically_shorter = candidate.size() < original_user_key.size();
+  if (!physically_shorter ||
+      user_comparator_->Compare(original_user_key, candidate) >= 0) {
+    return;
+  }
+
+  // User key has become shorter physically, but larger logically.
+  // Tack on the earliest possible number to the shortened user key.
+  PutFixed64(&candidate,
+             PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
+  assert(this->Compare(*start, candidate) < 0);
+  assert(this->Compare(candidate, limit) < 0);
+  start->swap(candidate);
+}
+
+// Lets the user comparator try to replace the user-key portion of *key with
+// a short successor. *key is replaced only when the result is physically
+// shorter yet sorts after the original user key; otherwise it is untouched.
+void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
+  Slice original_user_key = ExtractUserKey(*key);
+  std::string candidate(original_user_key.data(), original_user_key.size());
+  user_comparator_->FindShortSuccessor(&candidate);
+
+  const bool physically_shorter = candidate.size() < original_user_key.size();
+  if (!physically_shorter ||
+      user_comparator_->Compare(original_user_key, candidate) >= 0) {
+    return;
+  }
+
+  // User key has become shorter physically, but larger logically.
+  // Tack on the earliest possible number to the shortened user key.
+  PutFixed64(&candidate,
+             PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
+  assert(this->Compare(*key, candidate) < 0);
+  key->swap(candidate);
+}
+
+// Reports the name of the wrapped user policy unchanged.
+const char* InternalFilterPolicy::Name() const {
+ return user_policy_->Name();
+}
+
+// Builds a filter over the user-key portions of the "n" internal keys in
+// keys[] by delegating to the user policy. Note that keys[] is rewritten in
+// place (through a const_cast) to hold the user keys.
+void InternalFilterPolicy::CreateFilter(const Slice* keys, int n,
+                                        std::string* dst) const {
+  // We rely on the fact that the code in table.cc does not mind us
+  // adjusting keys[].
+  Slice* writable_keys = const_cast<Slice*>(keys);
+  int index = 0;
+  while (index < n) {
+    writable_keys[index] = ExtractUserKey(keys[index]);
+    // TODO(sanjay): Suppress dups?
+    ++index;
+  }
+  user_policy_->CreateFilter(keys, n, dst);
+}
+
+// Asks the user policy whether the user-key portion of the internal key
+// "key" may be present in filter "f".
+bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
+  const Slice user_key = ExtractUserKey(key);
+  return user_policy_->KeyMayMatch(user_key, f);
+}
+
+// Builds the lookup key for "user_key" at sequence number "s". The encoded
+// layout is:
+//    varint32 of (user key size + 8)   <-- start_
+//    user key bytes                    <-- kstart_
+//    fixed 64-bit tag (sequence, kValueTypeForSeek)
+//                                      <-- end_
+// The inline buffer space_ is used when it is large enough; otherwise a heap
+// buffer is allocated with new[].
+LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
+  const size_t user_size = user_key.size();
+  const size_t needed = user_size + 13;  // A conservative estimate
+  char* cursor = (needed <= sizeof(space_)) ? space_ : new char[needed];
+
+  start_ = cursor;
+  cursor = EncodeVarint32(cursor, user_size + 8);
+
+  kstart_ = cursor;
+  memcpy(cursor, user_key.data(), user_size);
+  cursor += user_size;
+
+  EncodeFixed64(cursor, PackSequenceAndType(s, kValueTypeForSeek));
+  end_ = cursor + 8;
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/dbformat.h b/src/leveldb/db/dbformat.h
new file mode 100644
index 0000000000..5d8a032bd3
--- /dev/null
+++ b/src/leveldb/db/dbformat.h
@@ -0,0 +1,230 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_FORMAT_H_
+#define STORAGE_LEVELDB_DB_FORMAT_H_
+
+#include <stdio.h>
+#include "leveldb/comparator.h"
+#include "leveldb/db.h"
+#include "leveldb/filter_policy.h"
+#include "leveldb/slice.h"
+#include "leveldb/table_builder.h"
+#include "util/coding.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+// Grouping of constants. We may want to make some of these
+// parameters set via options.
+namespace config {
+static const int kNumLevels = 7;
+
+// Level-0 compaction is started when we hit this many files.
+static const int kL0_CompactionTrigger = 4;
+
+// Soft limit on number of level-0 files. We slow down writes at this point.
+static const int kL0_SlowdownWritesTrigger = 8;
+
+// Maximum number of level-0 files. We stop writes at this point.
+static const int kL0_StopWritesTrigger = 12;
+
+// Maximum level to which a new compacted memtable is pushed if it
+// does not create overlap. We try to push to level 2 to avoid the
+// relatively expensive level 0=>1 compactions and to avoid some
+// expensive manifest file operations. We do not push all the way to
+// the largest level since that can generate a lot of wasted disk
+// space if the same key space is being repeatedly overwritten.
+static const int kMaxMemCompactLevel = 2;
+
+// Approximate gap in bytes between samples of data read during iteration.
+static const int kReadBytesPeriod = 1048576;
+
+} // namespace config
+
+class InternalKey;
+
+// Value types encoded as the last component of internal keys.
+// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
+// data structures.
+enum ValueType {
+ kTypeDeletion = 0x0,
+ kTypeValue = 0x1
+};
+// kValueTypeForSeek defines the ValueType that should be passed when
+// constructing a ParsedInternalKey object for seeking to a particular
+// sequence number (since we sort sequence numbers in decreasing order
+// and the value type is embedded as the low 8 bits in the sequence
+// number in internal keys, we need to use the highest-numbered
+// ValueType, not the lowest).
+static const ValueType kValueTypeForSeek = kTypeValue;
+
+typedef uint64_t SequenceNumber;
+
+// We leave eight bits empty at the bottom so a type and sequence#
+// can be packed together into 64-bits.
+static const SequenceNumber kMaxSequenceNumber =
+ ((0x1ull << 56) - 1);
+
+// The decoded components of an internal key: the user key, the sequence
+// number and the value type.
+struct ParsedInternalKey {
+ Slice user_key;
+ SequenceNumber sequence;
+ ValueType type;
+
+ ParsedInternalKey() { } // Intentionally left uninitialized (for speed)
+ // Initializes all three components from the given values.
+ ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
+ : user_key(u), sequence(seq), type(t) { }
+ std::string DebugString() const;
+};
+
+// Return the length of the encoding of "key".
+inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
+ // User key bytes followed by the 8-byte packed sequence/type tag.
+ return key.user_key.size() + 8;
+}
+
+// Append the serialization of "key" to *result.
+extern void AppendInternalKey(std::string* result,
+ const ParsedInternalKey& key);
+
+// Attempt to parse an internal key from "internal_key". On success,
+// stores the parsed data in "*result", and returns true.
+//
+// On error, returns false, leaves "*result" in an undefined state.
+extern bool ParseInternalKey(const Slice& internal_key,
+ ParsedInternalKey* result);
+
+// Returns the user key portion of an internal key.
+// The result refers to the same bytes as "internal_key" minus its final
+// 8 bytes; "internal_key" must be at least 8 bytes long (asserted).
+inline Slice ExtractUserKey(const Slice& internal_key) {
+ assert(internal_key.size() >= 8);
+ return Slice(internal_key.data(), internal_key.size() - 8);
+}
+
+// Returns the value type stored in the low 8 bits of the fixed64 tag that
+// occupies the last 8 bytes of "internal_key". The byte is cast to
+// ValueType without being range-checked. "internal_key" must be at least
+// 8 bytes long (asserted).
+inline ValueType ExtractValueType(const Slice& internal_key) {
+ assert(internal_key.size() >= 8);
+ const size_t n = internal_key.size();
+ uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+ unsigned char c = num & 0xff;
+ return static_cast<ValueType>(c);
+}
+
+// A comparator for internal keys that uses a specified comparator for
+// the user key portion and breaks ties by decreasing sequence number.
+class InternalKeyComparator : public Comparator {
+ private:
+ // Comparator applied to the user-key portion of internal keys.
+ const Comparator* user_comparator_;
+ public:
+ // Stores "c" as the user comparator; the pointer is kept, not copied.
+ explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) { }
+ virtual const char* Name() const;
+ virtual int Compare(const Slice& a, const Slice& b) const;
+ virtual void FindShortestSeparator(
+ std::string* start,
+ const Slice& limit) const;
+ virtual void FindShortSuccessor(std::string* key) const;
+
+ // Returns the user comparator passed to the constructor.
+ const Comparator* user_comparator() const { return user_comparator_; }
+
+ // Convenience overload: compares the encoded forms of "a" and "b".
+ int Compare(const InternalKey& a, const InternalKey& b) const;
+};
+
+// Filter policy wrapper that converts from internal keys to user keys
+class InternalFilterPolicy : public FilterPolicy {
+ private:
+ // The wrapped policy; it is always called with user keys.
+ const FilterPolicy* const user_policy_;
+ public:
+ // Stores "p" as the wrapped policy; the pointer is kept, not copied.
+ explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
+ virtual const char* Name() const;
+ virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const;
+ virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
+};
+
+// Modules in this directory should keep internal keys wrapped inside
+// the following class instead of plain strings so that we do not
+// incorrectly use string comparisons instead of an InternalKeyComparator.
+class InternalKey {
+ private:
+ // Encoded internal key bytes; empty means "invalid / not set".
+ std::string rep_;
+ public:
+ InternalKey() { } // Leave rep_ as empty to indicate it is invalid
+ // Builds the encoded key from its three components.
+ InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
+ AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
+ }
+
+ // Replaces the contents with a copy of the bytes in "s"; the bytes are
+ // not validated here.
+ void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
+ // Returns the encoded key. Must not be called on an empty key (asserted).
+ Slice Encode() const {
+ assert(!rep_.empty());
+ return rep_;
+ }
+
+ // Returns the user-key portion of the encoded key.
+ Slice user_key() const { return ExtractUserKey(rep_); }
+
+ // Replaces the contents with the encoding of "p".
+ void SetFrom(const ParsedInternalKey& p) {
+ rep_.clear();
+ AppendInternalKey(&rep_, p);
+ }
+
+ // Resets the key to the empty (invalid) state.
+ void Clear() { rep_.clear(); }
+
+ std::string DebugString() const;
+};
+
+// Compares two InternalKey objects by forwarding their encoded forms to the
+// Slice overload of Compare(). Both keys must be non-empty, since
+// InternalKey::Encode() asserts that.
+inline int InternalKeyComparator::Compare(
+ const InternalKey& a, const InternalKey& b) const {
+ return Compare(a.Encode(), b.Encode());
+}
+
+// Splits "internal_key" into user key, sequence number and type.
+// Returns false without touching *result when the input is shorter than
+// 8 bytes. Returns false after having filled in *result when the type byte
+// is larger than kTypeValue.
+inline bool ParseInternalKey(const Slice& internal_key,
+ ParsedInternalKey* result) {
+ const size_t n = internal_key.size();
+ if (n < 8) return false;
+ // The last 8 bytes hold (sequence << 8) | type as a fixed64.
+ uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+ unsigned char c = num & 0xff;
+ result->sequence = num >> 8;
+ result->type = static_cast<ValueType>(c);
+ result->user_key = Slice(internal_key.data(), n - 8);
+ return (c <= static_cast<unsigned char>(kTypeValue));
+}
+
+// A helper class useful for DBImpl::Get()
+class LookupKey {
+ public:
+ // Initialize *this for looking up user_key at a snapshot with
+ // the specified sequence number.
+ LookupKey(const Slice& user_key, SequenceNumber sequence);
+
+ ~LookupKey();
+
+ // Return a key suitable for lookup in a MemTable.
+ // This is the whole buffer, including the varint32 length prefix.
+ Slice memtable_key() const { return Slice(start_, end_ - start_); }
+
+ // Return an internal key (suitable for passing to an internal iterator)
+ // This is the buffer without the length prefix.
+ Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }
+
+ // Return the user key
+ // This is the internal key without its trailing 8-byte tag.
+ Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }
+
+ private:
+ // We construct a char array of the form:
+ // klength varint32 <-- start_
+ // userkey char[klength] <-- kstart_
+ // tag uint64
+ // <-- end_
+ // The array is a suitable MemTable key.
+ // The suffix starting with "userkey" can be used as an InternalKey.
+ const char* start_;
+ const char* kstart_;
+ const char* end_;
+ char space_[200]; // Avoid allocation for short keys
+
+ // No copying allowed
+ LookupKey(const LookupKey&);
+ void operator=(const LookupKey&);
+};
+
+// Frees the buffer only when it is not the inline array space_.
+inline LookupKey::~LookupKey() {
+ if (start_ != space_) delete[] start_;
+}
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_FORMAT_H_
diff --git a/src/leveldb/db/dbformat_test.cc b/src/leveldb/db/dbformat_test.cc
new file mode 100644
index 0000000000..5d82f5d313
--- /dev/null
+++ b/src/leveldb/db/dbformat_test.cc
@@ -0,0 +1,112 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/dbformat.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+// Test helper: returns the encoded internal key for (user_key, seq, vt).
+static std::string IKey(const std::string& user_key,
+ uint64_t seq,
+ ValueType vt) {
+ std::string encoded;
+ AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
+ return encoded;
+}
+
+// Test helper: returns the result of running FindShortestSeparator(s, l)
+// on an InternalKeyComparator that wraps BytewiseComparator(). "s" and "l"
+// are encoded internal keys.
+static std::string Shorten(const std::string& s, const std::string& l) {
+ std::string result = s;
+ InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l);
+ return result;
+}
+
+// Test helper: returns the result of running FindShortSuccessor(s) on an
+// InternalKeyComparator that wraps BytewiseComparator(). "s" is an encoded
+// internal key.
+static std::string ShortSuccessor(const std::string& s) {
+ std::string result = s;
+ InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result);
+ return result;
+}
+
+// Encodes (key, seq, vt) as an internal key, parses it back and checks that
+// every component round-trips. Also checks that a 3-byte input is rejected
+// by ParseInternalKey().
+static void TestKey(const std::string& key,
+ uint64_t seq,
+ ValueType vt) {
+ std::string encoded = IKey(key, seq, vt);
+
+ Slice in(encoded);
+ ParsedInternalKey decoded("", 0, kTypeValue);
+
+ ASSERT_TRUE(ParseInternalKey(in, &decoded));
+ ASSERT_EQ(key, decoded.user_key.ToString());
+ ASSERT_EQ(seq, decoded.sequence);
+ ASSERT_EQ(vt, decoded.type);
+
+ // Too short to contain the 8-byte tag.
+ ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded));
+}
+
+class FormatTest { };
+
+// Round-trips every combination of the sample user keys and sequence
+// numbers (chosen around the 8-, 16- and 32-bit boundaries) as kTypeValue
+// keys, plus one kTypeDeletion key.
+TEST(FormatTest, InternalKey_EncodeDecode) {
+  const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" };
+  const uint64_t seq[] = {
+    1, 2, 3,
+    (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1,
+    (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1,
+    (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1
+  };
+  // size_t indices avoid comparing a signed int against a sizeof() result.
+  const size_t num_keys = sizeof(keys) / sizeof(keys[0]);
+  const size_t num_seqs = sizeof(seq) / sizeof(seq[0]);
+  for (size_t k = 0; k < num_keys; k++) {
+    for (size_t s = 0; s < num_seqs; s++) {
+      TestKey(keys[k], seq[s], kTypeValue);
+    }
+  }
+  // The deletion case does not depend on k or s, so it is checked once
+  // instead of on every inner iteration.
+  TestKey("hello", 1, kTypeDeletion);
+}
+
+// Checks InternalKeyComparator::FindShortestSeparator() through Shorten():
+// the start key is expected back unchanged in every case except the one
+// where the user keys differ and are correctly ordered.
+TEST(FormatTest, InternalKeyShortSeparator) {
+ // When user keys are same
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("foo", 99, kTypeValue)));
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("foo", 101, kTypeValue)));
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("foo", 100, kTypeValue)));
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("foo", 100, kTypeDeletion)));
+
+ // When user keys are misordered
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("bar", 99, kTypeValue)));
+
+ // When user keys are different, but correctly ordered
+ ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("hello", 200, kTypeValue)));
+
+ // When start user key is prefix of limit user key
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("foobar", 200, kTypeValue)));
+
+ // When limit user key is prefix of start user key
+ ASSERT_EQ(IKey("foobar", 100, kTypeValue),
+ Shorten(IKey("foobar", 100, kTypeValue),
+ IKey("foo", 200, kTypeValue)));
+}
+
+// Checks InternalKeyComparator::FindShortSuccessor() through
+// ShortSuccessor().
+TEST(FormatTest, InternalKeyShortestSuccessor) {
+ // "foo" is expected to shorten to "g" with the maximum sequence number.
+ ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+ ShortSuccessor(IKey("foo", 100, kTypeValue)));
+ // A user key of all 0xff bytes is expected back unchanged.
+ ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
+ ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
+}
+
+} // namespace leveldb
+
+// Test entry point: the exit status is whatever RunAllTests() returns.
+// argc and argv are not used.
+int main(int argc, char** argv) {
+ return leveldb::test::RunAllTests();
+}
diff --git a/src/leveldb/db/filename.cc b/src/leveldb/db/filename.cc
new file mode 100644
index 0000000000..27d750697b
--- /dev/null
+++ b/src/leveldb/db/filename.cc
@@ -0,0 +1,149 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <ctype.h>
+#include <stdio.h>
+#include "db/filename.h"
+#include "db/dbformat.h"
+#include "leveldb/env.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+// A utility routine: write "data" to the named file and Sync() it.
+extern Status WriteStringToFileSync(Env* env, const Slice& data,
+ const std::string& fname);
+
+// Returns "<name>/<number>.<suffix>", with the number printed in decimal
+// and zero-padded to at least six digits.
+static std::string MakeFileName(const std::string& name, uint64_t number,
+ const char* suffix) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "/%06llu.%s",
+ static_cast<unsigned long long>(number),
+ suffix);
+ return name + buf;
+}
+
+// Returns "<name>/<number>.log". "number" must be positive (asserted).
+std::string LogFileName(const std::string& name, uint64_t number) {
+ assert(number > 0);
+ return MakeFileName(name, number, "log");
+}
+
+// TableFileName returns the filenames we usually write to, while
+// SSTTableFileName returns the alternative filenames we also try to read from
+// for backward compatibility. For now, swap them around.
+// TODO: when compatibility is no longer necessary, swap them back
+// (TableFileName to use "ldb" and SSTTableFileName to use "sst").
+// Currently returns "<name>/<number>.sst". "number" must be positive
+// (asserted).
+std::string TableFileName(const std::string& name, uint64_t number) {
+ assert(number > 0);
+ return MakeFileName(name, number, "sst");
+}
+
+// Returns "<name>/<number>.ldb" -- note that, despite the function name,
+// the suffix used here is "ldb". "number" must be positive (asserted).
+std::string SSTTableFileName(const std::string& name, uint64_t number) {
+ assert(number > 0);
+ return MakeFileName(name, number, "ldb");
+}
+
+// Returns "<dbname>/MANIFEST-<number>", with the number zero-padded to at
+// least six digits. "number" must be positive (asserted).
+std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
+ assert(number > 0);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "/MANIFEST-%06llu",
+ static_cast<unsigned long long>(number));
+ return dbname + buf;
+}
+
+// Returns "<dbname>/CURRENT".
+std::string CurrentFileName(const std::string& dbname) {
+ return dbname + "/CURRENT";
+}
+
+// Returns "<dbname>/LOCK".
+std::string LockFileName(const std::string& dbname) {
+ return dbname + "/LOCK";
+}
+
+// Returns "<dbname>/<number>.dbtmp". "number" must be positive (asserted).
+std::string TempFileName(const std::string& dbname, uint64_t number) {
+ assert(number > 0);
+ return MakeFileName(dbname, number, "dbtmp");
+}
+
+// Returns "<dbname>/LOG".
+std::string InfoLogFileName(const std::string& dbname) {
+ return dbname + "/LOG";
+}
+
+// Return the name of the old info log file for "dbname".
+// The result is "<dbname>/LOG.old".
+std::string OldInfoLogFileName(const std::string& dbname) {
+ return dbname + "/LOG.old";
+}
+
+
+// Owned filenames have the form:
+// dbname/CURRENT
+// dbname/LOCK
+// dbname/LOG
+// dbname/LOG.old
+// dbname/MANIFEST-[0-9]+
+// dbname/[0-9]+.(log|sst|ldb|dbtmp)
+//
+// "fname" is matched without any "dbname/" prefix. On success, stores the
+// file type in *type and the embedded number in *number (0 for CURRENT,
+// LOCK, LOG and LOG.old) and returns true. On failure, returns false and
+// leaves *number and *type untouched.
+bool ParseFileName(const std::string& fname,
+ uint64_t* number,
+ FileType* type) {
+ Slice rest(fname);
+ if (rest == "CURRENT") {
+ *number = 0;
+ *type = kCurrentFile;
+ } else if (rest == "LOCK") {
+ *number = 0;
+ *type = kDBLockFile;
+ } else if (rest == "LOG" || rest == "LOG.old") {
+ *number = 0;
+ *type = kInfoLogFile;
+ } else if (rest.starts_with("MANIFEST-")) {
+ rest.remove_prefix(strlen("MANIFEST-"));
+ uint64_t num;
+ if (!ConsumeDecimalNumber(&rest, &num)) {
+ return false;
+ }
+ // Nothing may follow the number.
+ if (!rest.empty()) {
+ return false;
+ }
+ *type = kDescriptorFile;
+ *number = num;
+ } else {
+ // Avoid strtoull() to keep filename format independent of the
+ // current locale
+ uint64_t num;
+ if (!ConsumeDecimalNumber(&rest, &num)) {
+ return false;
+ }
+ // Whatever follows the number must be exactly one known suffix.
+ Slice suffix = rest;
+ if (suffix == Slice(".log")) {
+ *type = kLogFile;
+ } else if (suffix == Slice(".sst") || suffix == Slice(".ldb")) {
+ *type = kTableFile;
+ } else if (suffix == Slice(".dbtmp")) {
+ *type = kTempFile;
+ } else {
+ return false;
+ }
+ *number = num;
+ }
+ return true;
+}
+
+// Points CURRENT at the manifest with number "descriptor_number".
+// The manifest's base name plus a newline is written (and synced) to a
+// temporary file, which is then renamed over CURRENT. If either step fails,
+// the temporary file is deleted and the failing status is returned.
+Status SetCurrentFile(Env* env, const std::string& dbname,
+ uint64_t descriptor_number) {
+ // Remove leading "dbname/" and add newline to manifest file name
+ std::string manifest = DescriptorFileName(dbname, descriptor_number);
+ Slice contents = manifest;
+ assert(contents.starts_with(dbname + "/"));
+ contents.remove_prefix(dbname.size() + 1);
+ std::string tmp = TempFileName(dbname, descriptor_number);
+ Status s = WriteStringToFileSync(env, contents.ToString() + "\n", tmp);
+ if (s.ok()) {
+ s = env->RenameFile(tmp, CurrentFileName(dbname));
+ }
+ if (!s.ok()) {
+ // Best-effort cleanup; the result of DeleteFile() is ignored.
+ env->DeleteFile(tmp);
+ }
+ return s;
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/filename.h b/src/leveldb/db/filename.h
new file mode 100644
index 0000000000..87a752605d
--- /dev/null
+++ b/src/leveldb/db/filename.h
@@ -0,0 +1,85 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// File names used by DB code
+
+#ifndef STORAGE_LEVELDB_DB_FILENAME_H_
+#define STORAGE_LEVELDB_DB_FILENAME_H_
+
+#include <stdint.h>
+#include <string>
+#include "leveldb/slice.h"
+#include "leveldb/status.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+class Env;
+
+enum FileType {
+ kLogFile,
+ kDBLockFile,
+ kTableFile,
+ kDescriptorFile,
+ kCurrentFile,
+ kTempFile,
+ kInfoLogFile // Either the current one, or an old one
+};
+
+// Return the name of the log file with the specified number
+// in the db named by "dbname". The result will be prefixed with
+// "dbname".
+extern std::string LogFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the sstable with the specified number
+// in the db named by "dbname". The result will be prefixed with
+// "dbname".
+extern std::string TableFileName(const std::string& dbname, uint64_t number);
+
+// Return the legacy file name for an sstable with the specified number
+// in the db named by "dbname". The result will be prefixed with
+// "dbname".
+extern std::string SSTTableFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the descriptor file for the db named by
+// "dbname" and the specified incarnation number. The result will be
+// prefixed with "dbname".
+extern std::string DescriptorFileName(const std::string& dbname,
+ uint64_t number);
+
+// Return the name of the current file. This file contains the name
+// of the current manifest file. The result will be prefixed with
+// "dbname".
+extern std::string CurrentFileName(const std::string& dbname);
+
+// Return the name of the lock file for the db named by
+// "dbname". The result will be prefixed with "dbname".
+extern std::string LockFileName(const std::string& dbname);
+
+// Return the name of a temporary file owned by the db named "dbname".
+// The result will be prefixed with "dbname".
+extern std::string TempFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the info log file for "dbname".
+extern std::string InfoLogFileName(const std::string& dbname);
+
+// Return the name of the old info log file for "dbname".
+extern std::string OldInfoLogFileName(const std::string& dbname);
+
+// If filename is a leveldb file, store the type of the file in *type.
+// The number encoded in the filename is stored in *number. If the
+// filename was successfully parsed, returns true. Else return false.
+extern bool ParseFileName(const std::string& filename,
+ uint64_t* number,
+ FileType* type);
+
+// Make the CURRENT file point to the descriptor file with the
+// specified number.
+extern Status SetCurrentFile(Env* env, const std::string& dbname,
+ uint64_t descriptor_number);
+
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_FILENAME_H_
diff --git a/src/leveldb/db/filename_test.cc b/src/leveldb/db/filename_test.cc
new file mode 100644
index 0000000000..a32556deaf
--- /dev/null
+++ b/src/leveldb/db/filename_test.cc
@@ -0,0 +1,123 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/filename.h"
+
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+class FileNameTest { };
+
+// Checks ParseFileName() against a table of names that must parse to a
+// given (number, type) and a table of names that must be rejected.
+TEST(FileNameTest, Parse) {
+  FileType type;
+  uint64_t number;
+
+  // Successful parses
+  static struct {
+    const char* fname;
+    uint64_t number;
+    FileType type;
+  } cases[] = {
+    { "100.log", 100, kLogFile },
+    { "0.log", 0, kLogFile },
+    { "0.sst", 0, kTableFile },
+    { "0.ldb", 0, kTableFile },
+    { "CURRENT", 0, kCurrentFile },
+    { "LOCK", 0, kDBLockFile },
+    { "MANIFEST-2", 2, kDescriptorFile },
+    { "MANIFEST-7", 7, kDescriptorFile },
+    { "LOG", 0, kInfoLogFile },
+    { "LOG.old", 0, kInfoLogFile },
+    { "18446744073709551615.log", 18446744073709551615ull, kLogFile },
+  };
+  // size_t indices avoid comparing a signed int against a sizeof() result.
+  for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+    std::string f = cases[i].fname;
+    ASSERT_TRUE(ParseFileName(f, &number, &type)) << f;
+    ASSERT_EQ(cases[i].type, type) << f;
+    ASSERT_EQ(cases[i].number, number) << f;
+  }
+
+  // Errors
+  static const char* errors[] = {
+    "",
+    "foo",
+    "foo-dx-100.log",
+    ".log",
+    "",
+    "manifest",
+    "CURREN",
+    "CURRENTX",
+    "MANIFES",
+    "MANIFEST",
+    "MANIFEST-",
+    "XMANIFEST-3",
+    "MANIFEST-3x",
+    "LOC",
+    "LOCKx",
+    "LO",
+    "LOGx",
+    "18446744073709551616.log",
+    "184467440737095516150.log",
+    "100",
+    "100.",
+    "100.lop"
+  };
+  for (size_t i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
+    std::string f = errors[i];
+    ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
+  }
+}
+
+// For each file-name builder, checks that the result starts with
+// "<dbname>/" and that the remainder parses back to the expected number
+// and file type. Every dbname used here is three characters long, hence
+// the fixed prefix length of 4.
+TEST(FileNameTest, Construction) {
+ uint64_t number;
+ FileType type;
+ std::string fname;
+
+ fname = CurrentFileName("foo");
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(0, number);
+ ASSERT_EQ(kCurrentFile, type);
+
+ fname = LockFileName("foo");
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(0, number);
+ ASSERT_EQ(kDBLockFile, type);
+
+ fname = LogFileName("foo", 192);
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(192, number);
+ ASSERT_EQ(kLogFile, type);
+
+ fname = TableFileName("bar", 200);
+ ASSERT_EQ("bar/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(200, number);
+ ASSERT_EQ(kTableFile, type);
+
+ fname = DescriptorFileName("bar", 100);
+ ASSERT_EQ("bar/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(100, number);
+ ASSERT_EQ(kDescriptorFile, type);
+
+ fname = TempFileName("tmp", 999);
+ ASSERT_EQ("tmp/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(999, number);
+ ASSERT_EQ(kTempFile, type);
+}
+
+} // namespace leveldb
+
+// Test entry point: the exit status is whatever RunAllTests() returns.
+// argc and argv are not used.
+int main(int argc, char** argv) {
+ return leveldb::test::RunAllTests();
+}
diff --git a/src/leveldb/db/leveldb_main.cc b/src/leveldb/db/leveldb_main.cc
new file mode 100644
index 0000000000..995d76107a
--- /dev/null
+++ b/src/leveldb/db/leveldb_main.cc
@@ -0,0 +1,238 @@
+// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/version_edit.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/env.h"
+#include "leveldb/iterator.h"
+#include "leveldb/options.h"
+#include "leveldb/status.h"
+#include "leveldb/table.h"
+#include "leveldb/write_batch.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+namespace {
+
+// Determines the leveldb file type of "fname" from its last path component
+// (everything after the final '/', or the whole string when there is none).
+// Stores the type in *type and returns true when ParseFileName() accepts
+// that component; the parsed file number is discarded.
+bool GuessType(const std::string& fname, FileType* type) {
+  const size_t slash = fname.rfind('/');
+  const std::string leaf =
+      (slash == std::string::npos) ? fname : fname.substr(slash + 1);
+  uint64_t unused_number;
+  return ParseFileName(leaf, &unused_number, type);
+}
+
+// Notified when log reader encounters corruption.
+class CorruptionReporter : public log::Reader::Reporter {
+ public:
+  // Prints the number of dropped bytes and the reason to stdout.
+  virtual void Corruption(size_t bytes, const Status& status) {
+    // Print the count at full width: casting it to int for "%d" would
+    // truncate (or turn negative) a drop of 2GB or more.
+    printf("corruption: %llu bytes; %s\n",
+           static_cast<unsigned long long>(bytes),
+           status.ToString().c_str());
+  }
+};
+
+// Print contents of a log file. (*func)() is called on every record.
+// Returns false (after printing the error to stderr) if the file cannot be
+// opened; otherwise returns true once ReadRecord() stops yielding records.
+// Corruption found while reading is reported through CorruptionReporter.
+bool PrintLogContents(Env* env, const std::string& fname,
+ void (*func)(Slice)) {
+ SequentialFile* file;
+ Status s = env->NewSequentialFile(fname, &file);
+ if (!s.ok()) {
+ fprintf(stderr, "%s\n", s.ToString().c_str());
+ return false;
+ }
+ CorruptionReporter reporter;
+ // Read with checksums enabled, starting at offset 0.
+ log::Reader reader(file, &reporter, true, 0);
+ Slice record;
+ std::string scratch;
+ while (reader.ReadRecord(&record, &scratch)) {
+ printf("--- offset %llu; ",
+ static_cast<unsigned long long>(reader.LastRecordOffset()));
+ (*func)(record);
+ }
+ delete file;
+ return true;
+}
+
+// Called on every item found in a WriteBatch.
+class WriteBatchItemPrinter : public WriteBatch::Handler {
+ public:
+ // NOTE(review): these two members are neither initialized nor read by
+ // this class -- confirm whether they are still needed.
+ uint64_t offset_;
+ uint64_t sequence_;
+
+ // Prints the escaped key and value of a put to stdout.
+ virtual void Put(const Slice& key, const Slice& value) {
+ printf(" put '%s' '%s'\n",
+ EscapeString(key).c_str(),
+ EscapeString(value).c_str());
+ }
+ // Prints the escaped key of a deletion to stdout.
+ virtual void Delete(const Slice& key) {
+ printf(" del '%s'\n",
+ EscapeString(key).c_str());
+ }
+};
+
+
+// Called on every log record (each one of which is a WriteBatch)
+// found in a kLogFile.
+// Prints the batch's sequence number followed by one line per put/delete.
+// Records shorter than 12 bytes are reported as too small and skipped.
+static void WriteBatchPrinter(Slice record) {
+ if (record.size() < 12) {
+ printf("log record length %d is too small\n",
+ static_cast<int>(record.size()));
+ return;
+ }
+ WriteBatch batch;
+ WriteBatchInternal::SetContents(&batch, record);
+ printf("sequence %llu\n",
+ static_cast<unsigned long long>(WriteBatchInternal::Sequence(&batch)));
+ WriteBatchItemPrinter batch_item_printer;
+ Status s = batch.Iterate(&batch_item_printer);
+ if (!s.ok()) {
+ printf(" error: %s\n", s.ToString().c_str());
+ }
+}
+
+// Dumps "fname" as a write-ahead log: every record is printed by
+// WriteBatchPrinter. Returns the result of PrintLogContents().
+bool DumpLog(Env* env, const std::string& fname) {
+ return PrintLogContents(env, fname, WriteBatchPrinter);
+}
+
+// Called on every log record (each one of which is decoded as a
+// VersionEdit) found in a kDescriptorFile. Prints the edit's debug string,
+// or the decode error if the record cannot be decoded.
+static void VersionEditPrinter(Slice record) {
+ VersionEdit edit;
+ Status s = edit.DecodeFrom(record);
+ if (!s.ok()) {
+ printf("%s\n", s.ToString().c_str());
+ return;
+ }
+ printf("%s", edit.DebugString().c_str());
+}
+
+// Dumps "fname" as a descriptor (MANIFEST) file: every record is printed by
+// VersionEditPrinter. Returns the result of PrintLogContents().
+bool DumpDescriptor(Env* env, const std::string& fname) {
+ return PrintLogContents(env, fname, VersionEditPrinter);
+}
+
+// Prints every entry of the table file "fname" to stdout, one per line.
+// Entries whose key parses as an internal key are shown as
+// "'user_key' @ sequence : type => 'value'"; others are shown as "badkey".
+// Returns false (after printing the error to stderr) if the file cannot be
+// sized, opened or loaded as a table; returns true otherwise.
+// NOTE(review): an iterator error is printed to stdout but the function
+// still returns true -- confirm this is intended.
+bool DumpTable(Env* env, const std::string& fname) {
+ uint64_t file_size;
+ RandomAccessFile* file = NULL;
+ Table* table = NULL;
+ Status s = env->GetFileSize(fname, &file_size);
+ if (s.ok()) {
+ s = env->NewRandomAccessFile(fname, &file);
+ }
+ if (s.ok()) {
+ // We use the default comparator, which may or may not match the
+ // comparator used in this database. However this should not cause
+ // problems since we only use Table operations that do not require
+ // any comparisons. In particular, we do not call Seek or Prev.
+ s = Table::Open(Options(), file, file_size, &table);
+ }
+ if (!s.ok()) {
+ fprintf(stderr, "%s\n", s.ToString().c_str());
+ // Either pointer may still be NULL here; deleting NULL is a no-op.
+ delete table;
+ delete file;
+ return false;
+ }
+
+ ReadOptions ro;
+ ro.fill_cache = false;
+ Iterator* iter = table->NewIterator(ro);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ParsedInternalKey key;
+ if (!ParseInternalKey(iter->key(), &key)) {
+ printf("badkey '%s' => '%s'\n",
+ EscapeString(iter->key()).c_str(),
+ EscapeString(iter->value()).c_str());
+ } else {
+ char kbuf[20];
+ const char* type;
+ if (key.type == kTypeDeletion) {
+ type = "del";
+ } else if (key.type == kTypeValue) {
+ type = "val";
+ } else {
+ // Unknown type: print its numeric value instead of a name.
+ snprintf(kbuf, sizeof(kbuf), "%d", static_cast<int>(key.type));
+ type = kbuf;
+ }
+ printf("'%s' @ %8llu : %s => '%s'\n",
+ EscapeString(key.user_key).c_str(),
+ static_cast<unsigned long long>(key.sequence),
+ type,
+ EscapeString(iter->value()).c_str());
+ }
+ }
+ s = iter->status();
+ if (!s.ok()) {
+ printf("iterator error: %s\n", s.ToString().c_str());
+ }
+
+ delete iter;
+ delete table;
+ delete file;
+ return true;
+}
+
+// Dumps a single file, choosing the dumper from the file type implied by
+// its name. Returns false (after printing a message to stderr) when the
+// type cannot be determined or is not one of log, descriptor or table;
+// otherwise returns the result of the chosen dumper.
+bool DumpFile(Env* env, const std::string& fname) {
+  FileType ftype;
+  if (!GuessType(fname, &ftype)) {
+    fprintf(stderr, "%s: unknown file type\n", fname.c_str());
+    return false;
+  }
+  if (ftype == kLogFile) {
+    return DumpLog(env, fname);
+  }
+  if (ftype == kDescriptorFile) {
+    return DumpDescriptor(env, fname);
+  }
+  if (ftype == kTableFile) {
+    return DumpTable(env, fname);
+  }
+  fprintf(stderr, "%s: not a dump-able file type\n", fname.c_str());
+  return false;
+}
+
+// Dumps each of the "num" files named in "files". Every file is attempted
+// even after an earlier one fails; returns true only if all of them were
+// dumped successfully.
+bool HandleDumpCommand(Env* env, char** files, int num) {
+  bool all_ok = true;
+  for (int idx = 0; idx < num; ++idx) {
+    if (!DumpFile(env, files[idx])) {
+      all_ok = false;
+    }
+  }
+  return all_ok;
+}
+
+}
+} // namespace leveldb
+
+// Prints the command-line usage text to stderr.
+static void Usage() {
+ fprintf(
+ stderr,
+ "Usage: leveldbutil command...\n"
+ " dump files... -- dump contents of specified files\n"
+ );
+}
+
+// Entry point. The only supported command is "dump", which dumps every
+// remaining argument as a file. Exits with 0 when the command succeeds and
+// 1 otherwise; a missing or unknown command prints the usage text.
+int main(int argc, char** argv) {
+  leveldb::Env* env = leveldb::Env::Default();
+  if (argc >= 2 && std::string(argv[1]) == "dump") {
+    const bool dumped = leveldb::HandleDumpCommand(env, argv + 2, argc - 2);
+    return dumped ? 0 : 1;
+  }
+  Usage();
+  return 1;
+}
diff --git a/src/leveldb/db/log_format.h b/src/leveldb/db/log_format.h
new file mode 100644
index 0000000000..2690cb9789
--- /dev/null
+++ b/src/leveldb/db/log_format.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Log format information shared by reader and writer.
+// See ../doc/log_format.txt for more detail.
+
+#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_
+#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_
+
+namespace leveldb {
+namespace log {
+
+enum RecordType {
+ // Zero is reserved for preallocated files
+ kZeroType = 0,
+
+ kFullType = 1,
+
+ // For fragments
+ kFirstType = 2,
+ kMiddleType = 3,
+ kLastType = 4
+};
+static const int kMaxRecordType = kLastType;
+
+static const int kBlockSize = 32768;
+
+// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
+static const int kHeaderSize = 4 + 1 + 2;
+
+} // namespace log
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_LOG_FORMAT_H_
diff --git a/src/leveldb/db/log_reader.cc b/src/leveldb/db/log_reader.cc
new file mode 100644
index 0000000000..b35f115aad
--- /dev/null
+++ b/src/leveldb/db/log_reader.cc
@@ -0,0 +1,259 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+
+#include <stdio.h>
+#include "leveldb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace leveldb {
+namespace log {
+
+Reader::Reporter::~Reporter() {  // Out-of-line definition of the virtual destructor; nothing to release.
+}
+
+Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum,  // Stores the arguments and allocates the block-sized read buffer; no I/O happens here.
+ uint64_t initial_offset)
+ : file_(file),
+ reporter_(reporter),
+ checksum_(checksum),
+ backing_store_(new char[kBlockSize]),  // scratch passed to file_->Read(); released in ~Reader()
+ buffer_(),  // empty until the first block has been read
+ eof_(false),
+ last_record_offset_(0),
+ end_of_buffer_offset_(0),
+ initial_offset_(initial_offset) {
+}
+
+Reader::~Reader() {  // Frees the read buffer allocated by the constructor; file_ and reporter_ are not deleted.
+ delete[] backing_store_;
+}
+
+// Advances file_ to the start of the first block that can contain a
+// record beginning at or after initial_offset_, and records that position
+// in end_of_buffer_offset_.  Returns true on success; if the underlying
+// Skip() fails the drop is reported and false is returned.
+bool Reader::SkipToInitialBlock() {
+  const size_t within_block = initial_offset_ % kBlockSize;
+  uint64_t target = initial_offset_ - within_block;
+
+  // An offset this close to the end of a block lies in the trailer, so
+  // there is nothing to search for there; begin with the next block.
+  if (within_block > kBlockSize - 6) {
+    target += kBlockSize;
+  }
+
+  end_of_buffer_offset_ = target;
+
+  // Already at the right place when the target is the start of the file.
+  if (target == 0) {
+    return true;
+  }
+
+  Status skip_status = file_->Skip(target);
+  if (skip_status.ok()) {
+    return true;
+  }
+  ReportDrop(target, skip_status);
+  return false;
+}
+
+bool Reader::ReadRecord(Slice* record, std::string* scratch) {  // Assembles the next logical record from its physical fragment(s); returns false at end of input or if the initial skip fails.
+ if (last_record_offset_ < initial_offset_) {  // NOTE(review): last_record_offset_ only changes when a record is returned, so this stays true across calls that return false -- confirm the repeated skip is intended
+ if (!SkipToInitialBlock()) {
+ return false;
+ }
+ }
+
+ scratch->clear();
+ record->clear();
+ bool in_fragmented_record = false;  // true between a kFirstType fragment and its kLastType
+ // Record offset of the logical record that we're reading
+ // 0 is a dummy value to make compilers happy
+ uint64_t prospective_record_offset = 0;
+
+ Slice fragment;
+ while (true) {
+ uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();  // file offset of the first unconsumed byte of buffer_ before this read
+ const unsigned int record_type = ReadPhysicalRecord(&fragment);
+ switch (record_type) {
+ case kFullType:
+ if (in_fragmented_record) {
+ // Handle bug in earlier versions of log::Writer where
+ // it could emit an empty kFirstType record at the tail end
+ // of a block followed by a kFullType or kFirstType record
+ // at the beginning of the next block.
+ if (scratch->empty()) {
+ in_fragmented_record = false;
+ } else {
+ ReportCorruption(scratch->size(), "partial record without end(1)");
+ }
+ }
+ prospective_record_offset = physical_record_offset;
+ scratch->clear();
+ *record = fragment;  // the record is the fragment itself; *scratch is left empty
+ last_record_offset_ = prospective_record_offset;
+ return true;
+
+ case kFirstType:
+ if (in_fragmented_record) {
+ // Handle bug in earlier versions of log::Writer where
+ // it could emit an empty kFirstType record at the tail end
+ // of a block followed by a kFullType or kFirstType record
+ // at the beginning of the next block.
+ if (scratch->empty()) {
+ in_fragmented_record = false;
+ } else {
+ ReportCorruption(scratch->size(), "partial record without end(2)");
+ }
+ }
+ prospective_record_offset = physical_record_offset;
+ scratch->assign(fragment.data(), fragment.size());  // start accumulating; replaces any unfinished fragments
+ in_fragmented_record = true;
+ break;
+
+ case kMiddleType:
+ if (!in_fragmented_record) {
+ ReportCorruption(fragment.size(),
+ "missing start of fragmented record(1)");
+ } else {
+ scratch->append(fragment.data(), fragment.size());
+ }
+ break;
+
+ case kLastType:
+ if (!in_fragmented_record) {
+ ReportCorruption(fragment.size(),
+ "missing start of fragmented record(2)");
+ } else {
+ scratch->append(fragment.data(), fragment.size());
+ *record = Slice(*scratch);  // the assembled record lives in *scratch
+ last_record_offset_ = prospective_record_offset;
+ return true;
+ }
+ break;
+
+ case kEof:
+ if (in_fragmented_record) {
+ ReportCorruption(scratch->size(), "partial record without end(3)");
+ scratch->clear();
+ }
+ return false;
+
+ case kBadRecord:
+ if (in_fragmented_record) {
+ ReportCorruption(scratch->size(), "error in middle of record");
+ in_fragmented_record = false;
+ scratch->clear();
+ }
+ break;  // keep scanning for the next physical record
+
+ default: {
+ char buf[40];
+ snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
+ ReportCorruption(
+ (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
+ buf);
+ in_fragmented_record = false;
+ scratch->clear();
+ break;
+ }
+ }
+ }
+ return false;  // not reached: the loop above only exits through a return
+}
+
+uint64_t Reader::LastRecordOffset() {  // Offset recorded by the most recent successful ReadRecord(); 0 until one succeeds.
+ return last_record_offset_;
+}
+
+void Reader::ReportCorruption(size_t bytes, const char* reason) {  // Convenience wrapper: reports the drop with a Corruption status built from "reason".
+ ReportDrop(bytes, Status::Corruption(reason));
+}
+
+void Reader::ReportDrop(size_t bytes, const Status& reason) {  // Forwards the drop to reporter_ (if any) unless it lies before initial_offset_.
+ if (reporter_ != NULL &&
+ end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {  // NOTE(review): unsigned arithmetic; wraps to a huge value when "bytes" exceeds the current offset, which makes the test pass -- confirm this is relied upon
+ reporter_->Corruption(bytes, reason);
+ }
+}
+
+unsigned int Reader::ReadPhysicalRecord(Slice* result) {  // Returns the type of the next physical record with its payload in *result, or kEof / kBadRecord.
+ while (true) {
+ if (buffer_.size() < kHeaderSize) {  // not enough buffered bytes for a header: refill or finish
+ if (!eof_) {
+ // Last read was a full read, so this is a trailer to skip
+ buffer_.clear();
+ Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
+ end_of_buffer_offset_ += buffer_.size();  // advance by however many bytes the read produced
+ if (!status.ok()) {
+ buffer_.clear();
+ ReportDrop(kBlockSize, status);
+ eof_ = true;  // a read error ends the log: no further reads are attempted
+ return kEof;
+ } else if (buffer_.size() < kBlockSize) {  // short read: this was the last block
+ eof_ = true;
+ }
+ continue;
+ } else if (buffer_.size() == 0) {
+ // End of file
+ return kEof;
+ } else {
+ size_t drop_size = buffer_.size();  // leftover bytes too short to hold a header
+ buffer_.clear();
+ ReportCorruption(drop_size, "truncated record at end of file");
+ return kEof;
+ }
+ }
+
+ // Parse the header
+ const char* header = buffer_.data();
+ const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;  // low byte of the payload length
+ const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;  // high byte of the payload length
+ const unsigned int type = header[6];  // NOTE(review): unlike a/b this is not masked with 0xff, so a byte >= 0x80 may sign-extend where char is signed -- confirm
+ const uint32_t length = a | (b << 8);
+ if (kHeaderSize + length > buffer_.size()) {  // the claimed payload runs past the buffered data
+ size_t drop_size = buffer_.size();
+ buffer_.clear();
+ ReportCorruption(drop_size, "bad record length");
+ return kBadRecord;
+ }
+
+ if (type == kZeroType && length == 0) {
+ // Skip zero length record without reporting any drops since
+ // such records are produced by the mmap based writing code in
+ // env_posix.cc that preallocates file regions.
+ buffer_.clear();
+ return kBadRecord;
+ }
+
+ // Check crc
+ if (checksum_) {
+ uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+ uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);  // covers the type byte followed by the payload
+ if (actual_crc != expected_crc) {
+ // Drop the rest of the buffer since "length" itself may have
+ // been corrupted and if we trust it, we could find some
+ // fragment of a real log record that just happens to look
+ // like a valid log record.
+ size_t drop_size = buffer_.size();
+ buffer_.clear();
+ ReportCorruption(drop_size, "checksum mismatch");
+ return kBadRecord;
+ }
+ }
+
+ buffer_.remove_prefix(kHeaderSize + length);  // consume this record; "header" still points at its bytes
+
+ // Skip physical record that started before initial_offset_
+ if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
+ initial_offset_) {
+ result->clear();
+ return kBadRecord;
+ }
+
+ *result = Slice(header + kHeaderSize, length);
+ return type;
+ }
+}
+
+} // namespace log
+} // namespace leveldb
diff --git a/src/leveldb/db/log_reader.h b/src/leveldb/db/log_reader.h
new file mode 100644
index 0000000000..82d4bee68d
--- /dev/null
+++ b/src/leveldb/db/log_reader.h
@@ -0,0 +1,108 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_LOG_READER_H_
+#define STORAGE_LEVELDB_DB_LOG_READER_H_
+
+#include <stdint.h>
+
+#include "db/log_format.h"
+#include "leveldb/slice.h"
+#include "leveldb/status.h"
+
+namespace leveldb {
+
+class SequentialFile;
+
+namespace log {
+
+class Reader {  // Reads logical records back out of a SequentialFile holding a log.
+ public:
+ // Interface for reporting errors.
+ class Reporter {
+ public:
+ virtual ~Reporter();
+
+ // Some corruption was detected. "bytes" is the approximate number
+ // of bytes dropped due to the corruption.
+ virtual void Corruption(size_t bytes, const Status& status) = 0;
+ };
+
+ // Create a reader that will return log records from "*file".
+ // "*file" must remain live while this Reader is in use.
+ //
+ // If "reporter" is non-NULL, it is notified whenever some data is
+ // dropped due to a detected corruption. "*reporter" must remain
+ // live while this Reader is in use.
+ //
+ // If "checksum" is true, verify checksums if available.
+ //
+ // The Reader will start reading at the first record located at physical
+ // position >= initial_offset within the file.
+ Reader(SequentialFile* file, Reporter* reporter, bool checksum,
+ uint64_t initial_offset);
+
+ ~Reader();
+
+ // Read the next record into *record. Returns true if read
+ // successfully, false if we hit end of the input. May use
+ // "*scratch" as temporary storage. The contents filled in *record
+ // will only be valid until the next mutating operation on this
+ // reader or the next mutation to *scratch.
+ bool ReadRecord(Slice* record, std::string* scratch);
+
+ // Returns the physical offset of the last record returned by ReadRecord.
+ //
+ // Undefined before the first call to ReadRecord.
+ uint64_t LastRecordOffset();
+
+ private:
+ SequentialFile* const file_;  // not owned
+ Reporter* const reporter_;  // not owned; may be NULL
+ bool const checksum_;  // whether record checksums are verified
+ char* const backing_store_;  // kBlockSize-byte scratch buffer owned by this Reader
+ Slice buffer_;  // bytes read from file_ that have not been consumed yet
+ bool eof_; // Last Read() indicated EOF by returning < kBlockSize
+
+ // Offset of the last record returned by ReadRecord.
+ uint64_t last_record_offset_;
+ // Offset of the first location past the end of buffer_.
+ uint64_t end_of_buffer_offset_;
+
+ // Offset at which to start looking for the first record to return
+ uint64_t const initial_offset_;
+
+ // Extend record types with the following special values
+ enum {
+ kEof = kMaxRecordType + 1,
+ // Returned whenever we find an invalid physical record.
+ // Currently there are three situations in which this happens:
+ // * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
+ // * The record is a 0-length record (No drop is reported)
+ // * The record is below constructor's initial_offset (No drop is reported)
+ kBadRecord = kMaxRecordType + 2
+ };
+
+ // Skips all blocks that are completely before "initial_offset_".
+ //
+ // Returns true on success. Handles reporting.
+ bool SkipToInitialBlock();
+
+ // Return type, or one of the preceding special values
+ unsigned int ReadPhysicalRecord(Slice* result);
+
+ // Reports dropped bytes to the reporter.
+ // buffer_ must be updated to remove the dropped bytes prior to invocation.
+ void ReportCorruption(size_t bytes, const char* reason);
+ void ReportDrop(size_t bytes, const Status& reason);
+
+ // No copying allowed
+ Reader(const Reader&);
+ void operator=(const Reader&);
+};
+
+} // namespace log
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_LOG_READER_H_
diff --git a/src/leveldb/db/log_test.cc b/src/leveldb/db/log_test.cc
new file mode 100644
index 0000000000..4c5cf87573
--- /dev/null
+++ b/src/leveldb/db/log_test.cc
@@ -0,0 +1,500 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "leveldb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+namespace log {
+
+// Construct a string of the specified length made out of the supplied
+// partial string.
+// Builds a string of exactly "n" bytes by repeating "partial_string" and
+// truncating the final repetition.  When "partial_string" is empty there
+// is nothing to repeat (looping on it would never terminate), so the
+// result is "n" NUL bytes produced by the final resize.
+static std::string BigString(const std::string& partial_string, size_t n) {
+  std::string result;
+  if (!partial_string.empty()) {
+    while (result.size() < n) {
+      result.append(partial_string);
+    }
+  }
+  result.resize(n);
+  return result;
+}
+
+// Construct a string from a number
+// Renders "n" in decimal followed by a trailing '.', e.g. 7 -> "7.".
+static std::string NumberString(int n) {
+  char text[50];
+  snprintf(text, sizeof(text), "%d.", n);
+  std::string rendered(text);
+  return rendered;
+}
+
+// Return a skewed potentially long string
+// Returns NumberString(i) repeated out to a length drawn from
+// rnd->Skewed(17).
+static std::string RandomSkewedString(int i, Random* rnd) {
+  const std::string unit = NumberString(i);
+  const size_t length = rnd->Skewed(17);
+  return BigString(unit, length);
+}
+
+// Test fixture.  Records are written through writer_ into an in-memory
+// destination (dest_) and read back through reader_ from source_, which
+// is pointed at the written bytes on the first Read().  Corruption
+// reports are accumulated in report_.
+class LogTest {
+ private:
+  // WritableFile that appends everything it is given to a string.
+  class StringDest : public WritableFile {
+   public:
+    std::string contents_;
+
+    virtual Status Close() { return Status::OK(); }
+    virtual Status Flush() { return Status::OK(); }
+    virtual Status Sync() { return Status::OK(); }
+    virtual Status Append(const Slice& slice) {
+      contents_.append(slice.data(), slice.size());
+      return Status::OK();
+    }
+  };
+
+  // SequentialFile backed by a Slice, with an optional injected read error.
+  class StringSource : public SequentialFile {
+   public:
+    Slice contents_;
+    bool force_error_;       // when set, the next Read() fails once
+    bool returned_partial_;  // set once a short read or an error was returned
+    StringSource() : force_error_(false), returned_partial_(false) { }
+
+    virtual Status Read(size_t n, Slice* result, char* scratch) {
+      ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
+
+      if (force_error_) {
+        force_error_ = false;
+        returned_partial_ = true;
+        return Status::Corruption("read error");
+      }
+
+      if (contents_.size() < n) {
+        n = contents_.size();
+        returned_partial_ = true;
+      }
+      // The result points straight into contents_; "scratch" is unused.
+      *result = Slice(contents_.data(), n);
+      contents_.remove_prefix(n);
+      return Status::OK();
+    }
+
+    virtual Status Skip(uint64_t n) {
+      if (n > contents_.size()) {
+        contents_.clear();
+        return Status::NotFound("in-memory file skipped past end");
+      }
+
+      contents_.remove_prefix(n);
+
+      return Status::OK();
+    }
+  };
+
+  // Reporter that sums the dropped byte counts and concatenates messages.
+  class ReportCollector : public Reader::Reporter {
+   public:
+    size_t dropped_bytes_;
+    std::string message_;
+
+    ReportCollector() : dropped_bytes_(0) { }
+    virtual void Corruption(size_t bytes, const Status& status) {
+      dropped_bytes_ += bytes;
+      message_.append(status.ToString());
+    }
+  };
+
+  StringDest dest_;
+  StringSource source_;
+  ReportCollector report_;
+  bool reading_;  // true once reading has begun; Write() is rejected after that
+  Writer writer_;
+  Reader reader_;
+
+  // Record metadata for testing initial offset functionality
+  static size_t initial_offset_record_sizes_[];
+  static uint64_t initial_offset_last_record_offsets_[];
+
+ public:
+  LogTest() : reading_(false),
+              writer_(&dest_),
+              reader_(&source_, &report_, true/*checksum*/,
+                      0/*initial_offset*/) {
+  }
+
+  // Appends "msg" to the log as one record.
+  void Write(const std::string& msg) {
+    ASSERT_TRUE(!reading_) << "Write() after starting to read";
+    writer_.AddRecord(Slice(msg));
+  }
+
+  // Number of bytes the writer has produced so far.
+  size_t WrittenBytes() const {
+    return dest_.contents_.size();
+  }
+
+  // Returns the next record, or the literal "EOF" when ReadRecord() fails.
+  // The first call snapshots the written bytes into source_.
+  std::string Read() {
+    if (!reading_) {
+      reading_ = true;
+      source_.contents_ = Slice(dest_.contents_);
+    }
+    std::string scratch;
+    Slice record;
+    if (reader_.ReadRecord(&record, &scratch)) {
+      return record.ToString();
+    } else {
+      return "EOF";
+    }
+  }
+
+  // Helpers that corrupt the written bytes in place.
+  void IncrementByte(int offset, int delta) {
+    dest_.contents_[offset] += delta;
+  }
+
+  void SetByte(int offset, char new_byte) {
+    dest_.contents_[offset] = new_byte;
+  }
+
+  void ShrinkSize(int bytes) {
+    dest_.contents_.resize(dest_.contents_.size() - bytes);
+  }
+
+  // Recomputes the stored checksum of the record whose header starts at
+  // "header_offset" and whose payload is "len" bytes long.
+  void FixChecksum(int header_offset, int len) {
+    // Compute crc of type/len/data
+    uint32_t crc = crc32c::Value(&dest_.contents_[header_offset+6], 1 + len);
+    crc = crc32c::Mask(crc);
+    EncodeFixed32(&dest_.contents_[header_offset], crc);
+  }
+
+  // Makes the next read from source_ fail.
+  void ForceError() {
+    source_.force_error_ = true;
+  }
+
+  size_t DroppedBytes() const {
+    return report_.dropped_bytes_;
+  }
+
+  std::string ReportMessage() const {
+    return report_.message_;
+  }
+
+  // Returns OK iff recorded error message contains "msg"
+  std::string MatchError(const std::string& msg) const {
+    if (report_.message_.find(msg) == std::string::npos) {
+      return report_.message_;
+    } else {
+      return "OK";
+    }
+  }
+
+  // Writes the four records described by initial_offset_record_sizes_;
+  // record i is filled with the character 'a' + i.
+  void WriteInitialOffsetLog() {
+    for (int i = 0; i < 4; i++) {
+      std::string record(initial_offset_record_sizes_[i],
+                         static_cast<char>('a' + i));
+      Write(record);
+    }
+  }
+
+  // A reader started "offset_past_end" bytes beyond the end of the written
+  // log must not return any record.
+  void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
+    WriteInitialOffsetLog();
+    reading_ = true;
+    source_.contents_ = Slice(dest_.contents_);
+    Reader offset_reader(&source_, &report_, true/*checksum*/,
+                         WrittenBytes() + offset_past_end);
+    Slice record;
+    std::string scratch;
+    ASSERT_TRUE(!offset_reader.ReadRecord(&record, &scratch));
+  }
+
+  // A reader started at "initial_offset" must return record number
+  // "expected_record_offset" first, with the expected size, offset and
+  // fill character.
+  void CheckInitialOffsetRecord(uint64_t initial_offset,
+                                int expected_record_offset) {
+    WriteInitialOffsetLog();
+    reading_ = true;
+    source_.contents_ = Slice(dest_.contents_);
+    Reader offset_reader(&source_, &report_, true/*checksum*/,
+                         initial_offset);
+    Slice record;
+    std::string scratch;
+    ASSERT_TRUE(offset_reader.ReadRecord(&record, &scratch));
+    ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset],
+              record.size());
+    ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
+              offset_reader.LastRecordOffset());
+    ASSERT_EQ(static_cast<char>('a' + expected_record_offset),
+              record.data()[0]);
+  }
+
+};
+
+size_t LogTest::initial_offset_record_sizes_[] =  // Payload sizes of the four records written by WriteInitialOffsetLog().
+ {10000, // Two sizable records in first block
+ 10000,
+ 2 * log::kBlockSize - 1000, // Span three blocks
+ 1};
+
+uint64_t LogTest::initial_offset_last_record_offsets_[] =  // Expected LastRecordOffset() for each of the four records.
+ {0,
+ kHeaderSize + 10000,  // after record 0: one header plus its payload
+ 2 * (kHeaderSize + 10000),  // after records 0 and 1
+ 2 * (kHeaderSize + 10000) +
+ (2 * log::kBlockSize - 1000) + 3 * kHeaderSize};  // plus record 2's payload and its three fragment headers
+
+
+TEST(LogTest, Empty) {  // With nothing written, the first Read() reports EOF.
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, ReadWrite) {  // Small records, including an empty one, come back in write order.
+ Write("foo");
+ Write("bar");
+ Write("");
+ Write("xxxx");
+ ASSERT_EQ("foo", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("xxxx", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ("EOF", Read()); // Make sure reads at eof work
+}
+
+TEST(LogTest, ManyBlocks) {  // 100000 short records round-trip in order.
+ for (int i = 0; i < 100000; i++) {
+ Write(NumberString(i));
+ }
+ for (int i = 0; i < 100000; i++) {
+ ASSERT_EQ(NumberString(i), Read());
+ }
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, Fragmentation) {  // Records larger than kBlockSize (50000 and 100000 bytes) round-trip intact.
+ Write("small");
+ Write(BigString("medium", 50000));
+ Write(BigString("large", 100000));
+ ASSERT_EQ("small", Read());
+ ASSERT_EQ(BigString("medium", 50000), Read());
+ ASSERT_EQ(BigString("large", 100000), Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, MarginalTrailer) {  // After the first record exactly kHeaderSize bytes remain in the block.
+ // Make a trailer that is exactly the same length as an empty record.
+ const int n = kBlockSize - 2*kHeaderSize;
+ Write(BigString("foo", n));
+ ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes());
+ Write("");
+ Write("bar");
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, MarginalTrailer2) {  // Same layout as MarginalTrailer without the empty record; nothing may be reported as dropped.
+ // Make a trailer that is exactly the same length as an empty record.
+ const int n = kBlockSize - 2*kHeaderSize;
+ Write(BigString("foo", n));
+ ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes());
+ Write("bar");
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(0, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
+}
+
+TEST(LogTest, ShortTrailer) {  // The first record leaves kHeaderSize - 4 bytes in the block: too few for another header.
+ const int n = kBlockSize - 2*kHeaderSize + 4;
+ Write(BigString("foo", n));
+ ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());
+ Write("");
+ Write("bar");
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, AlignedEof) {  // Same first record as ShortTrailer, but the log ends right after it.
+ const int n = kBlockSize - 2*kHeaderSize + 4;
+ Write(BigString("foo", n));
+ ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, RandomRead) {  // 500 skewed-length records; the expected values are regenerated from the same seed (301).
+ const int N = 500;
+ Random write_rnd(301);
+ for (int i = 0; i < N; i++) {
+ Write(RandomSkewedString(i, &write_rnd));
+ }
+ Random read_rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
+ }
+ ASSERT_EQ("EOF", Read());
+}
+
+// Tests of all the error paths in log_reader.cc follow:
+
+TEST(LogTest, ReadError) {  // A forced read error yields EOF and is reported as kBlockSize dropped bytes.
+ Write("foo");
+ ForceError();
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(kBlockSize, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("read error"));
+}
+
+TEST(LogTest, BadRecordType) {  // An out-of-range type byte (with the checksum repaired) drops the 3-byte payload.
+ Write("foo");
+ // Type is stored in header[6]
+ IncrementByte(6, 100);
+ FixChecksum(0, 3);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("unknown record type"));
+}
+
+TEST(LogTest, TruncatedTrailingRecord) {  // A log cut off inside a record header reports the leftover bytes as dropped.
+ Write("foo");
+ ShrinkSize(4); // Drop all payload as well as a header byte
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(kHeaderSize - 1, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("truncated record at end of file"));
+}
+
+TEST(LogTest, BadLength) {  // The payload is one byte shorter than the header's length field claims.
+ Write("foo");
+ ShrinkSize(1);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(kHeaderSize + 2, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("bad record length"));
+}
+
+TEST(LogTest, ChecksumMismatch) {  // Corrupting the first checksum byte drops the whole 10-byte record.
+ Write("foo");
+ IncrementByte(0, 10);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(10, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("checksum mismatch"));
+}
+
+TEST(LogTest, UnexpectedMiddleType) {  // A kMiddleType fragment with no preceding kFirstType is dropped.
+ Write("foo");
+ SetByte(6, kMiddleType);
+ FixChecksum(0, 3);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST(LogTest, UnexpectedLastType) {  // A kLastType fragment with no preceding kFirstType is dropped.
+ Write("foo");
+ SetByte(6, kLastType);
+ FixChecksum(0, 3);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST(LogTest, UnexpectedFullType) {  // A kFirstType fragment followed directly by a full record: the fragment is dropped.
+ Write("foo");
+ Write("bar");
+ SetByte(6, kFirstType);
+ FixChecksum(0, 3);
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST(LogTest, UnexpectedFirstType) {  // A kFirstType fragment followed by another multi-fragment record: the first fragment is dropped.
+ Write("foo");
+ Write(BigString("bar", 100000));
+ SetByte(6, kFirstType);
+ FixChecksum(0, 3);
+ ASSERT_EQ(BigString("bar", 100000), Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST(LogTest, ErrorJoinsRecords) {  // Fragments on either side of a corrupted block must not be stitched together.
+ // Consider two fragmented records:
+ // first(R1) last(R1) first(R2) last(R2)
+ // where the middle two fragments disappear. We do not want
+ // first(R1),last(R2) to get joined and returned as a valid record.
+
+ // Write records that span two blocks
+ Write(BigString("foo", kBlockSize));
+ Write(BigString("bar", kBlockSize));
+ Write("correct");
+
+ // Wipe the middle block
+ for (int offset = kBlockSize; offset < 2*kBlockSize; offset++) {
+ SetByte(offset, 'x');
+ }
+
+ ASSERT_EQ("correct", Read());
+ ASSERT_EQ("EOF", Read());
+ const int dropped = DroppedBytes();
+ ASSERT_LE(dropped, 2*kBlockSize + 100);  // both big records are lost, plus a little header overhead
+ ASSERT_GE(dropped, 2*kBlockSize);
+}
+
+TEST(LogTest, ReadStart) {  // Offset 0 returns record 0.
+ CheckInitialOffsetRecord(0, 0);
+}
+
+TEST(LogTest, ReadSecondOneOff) {  // Offset 1 is inside record 0, so record 1 is returned first.
+ CheckInitialOffsetRecord(1, 1);
+}
+
+TEST(LogTest, ReadSecondTenThousand) {  // Offset 10000 is still inside record 0 (it occupies kHeaderSize + 10000 bytes).
+ CheckInitialOffsetRecord(10000, 1);
+}
+
+TEST(LogTest, ReadSecondStart) {  // 10007 == kHeaderSize + 10000: exactly the start of record 1.
+ CheckInitialOffsetRecord(10007, 1);
+}
+
+TEST(LogTest, ReadThirdOneOff) {  // One byte past the start of record 1, so record 2 is returned first.
+ CheckInitialOffsetRecord(10008, 2);
+}
+
+TEST(LogTest, ReadThirdStart) {  // 20014 == 2 * (kHeaderSize + 10000): exactly the start of record 2.
+ CheckInitialOffsetRecord(20014, 2);
+}
+
+TEST(LogTest, ReadFourthOneOff) {  // One byte past the start of record 2, so record 3 is returned first.
+ CheckInitialOffsetRecord(20015, 3);
+}
+
+TEST(LogTest, ReadFourthFirstBlockTrailer) {  // kBlockSize - 4 falls in the range SkipToInitialBlock() treats as a block trailer.
+ CheckInitialOffsetRecord(log::kBlockSize - 4, 3);
+}
+
+TEST(LogTest, ReadFourthMiddleBlock) {  // An offset in the second block, which lies wholly inside record 2.
+ CheckInitialOffsetRecord(log::kBlockSize + 1, 3);
+}
+
+TEST(LogTest, ReadFourthLastBlock) {  // An offset in the third block, still inside the final fragment of record 2.
+ CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3);
+}
+
+// Start reading exactly at the first byte of record 3: two 10000-byte
+// records (one header each), then record 2's payload and its three
+// fragment headers.  The record size must match
+// initial_offset_record_sizes_ (10000); with 1000 the offset merely landed
+// somewhere inside record 2 and the "start" boundary was never exercised.
+TEST(LogTest, ReadFourthStart) {
+  CheckInitialOffsetRecord(
+      2 * (kHeaderSize + 10000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
+      3);
+}
+
+TEST(LogTest, ReadEnd) {  // An initial offset equal to the number of written bytes yields no records.
+ CheckOffsetPastEndReturnsNoRecords(0);
+}
+
+TEST(LogTest, ReadPastEnd) {  // An initial offset 5 bytes beyond the written data yields no records.
+ CheckOffsetPastEndReturnsNoRecords(5);
+}
+
+} // namespace log
+} // namespace leveldb
+
+int main(int argc, char** argv) {  // Test binary entry point; the arguments are not used.
+ return leveldb::test::RunAllTests();
+}
diff --git a/src/leveldb/db/log_writer.cc b/src/leveldb/db/log_writer.cc
new file mode 100644
index 0000000000..2da99ac088
--- /dev/null
+++ b/src/leveldb/db/log_writer.cc
@@ -0,0 +1,103 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_writer.h"
+
+#include <stdint.h>
+#include "leveldb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace leveldb {
+namespace log {
+
+// Binds the writer to "dest", starting at offset 0 within a block, and
+// precomputes the crc32c of each record-type byte so EmitPhysicalRecord()
+// only has to extend it with the payload.
+Writer::Writer(WritableFile* dest)
+    : dest_(dest),
+      block_offset_(0) {
+  int type = 0;
+  while (type <= kMaxRecordType) {
+    const char type_byte = static_cast<char>(type);
+    type_crc_[type] = crc32c::Value(&type_byte, 1);
+    ++type;
+  }
+}
+
+Writer::~Writer() {  // Nothing to release here: dest_ is not deleted.
+}
+
+// Appends "slice" to the log as one logical record, splitting it into
+// kFirstType/kMiddleType/kLastType fragments when it does not fit in the
+// remainder of the current block (kFullType when it does).
+//
+// Returns the status of the first failing write.  This includes the
+// zero padding written when fewer than kHeaderSize bytes remain in a
+// block: a failure there used to be discarded; it is now returned to the
+// caller without emitting the record.
+Status Writer::AddRecord(const Slice& slice) {
+  const char* ptr = slice.data();
+  size_t left = slice.size();
+
+  // Fragment the record if necessary and emit it.  Note that if slice
+  // is empty, we still want to iterate once to emit a single
+  // zero-length record
+  Status s;
+  bool begin = true;
+  do {
+    const int leftover = kBlockSize - block_offset_;
+    assert(leftover >= 0);
+    if (leftover < kHeaderSize) {
+      // Switch to a new block
+      if (leftover > 0) {
+        // Fill the trailer (literal below relies on kHeaderSize being 7)
+        assert(kHeaderSize == 7);
+        s = dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover));
+        if (!s.ok()) {
+          // block_offset_ is left untouched so that the writer does not
+          // claim to be at a block boundary it never reached.
+          return s;
+        }
+      }
+      block_offset_ = 0;
+    }
+
+    // Invariant: we never leave < kHeaderSize bytes in a block.
+    assert(kBlockSize - block_offset_ - kHeaderSize >= 0);
+
+    const size_t avail = kBlockSize - block_offset_ - kHeaderSize;
+    const size_t fragment_length = (left < avail) ? left : avail;
+
+    // The fragment type depends on whether this is the first and/or the
+    // last piece of the record.
+    RecordType type;
+    const bool end = (left == fragment_length);
+    if (begin && end) {
+      type = kFullType;
+    } else if (begin) {
+      type = kFirstType;
+    } else if (end) {
+      type = kLastType;
+    } else {
+      type = kMiddleType;
+    }
+
+    s = EmitPhysicalRecord(type, ptr, fragment_length);
+    ptr += fragment_length;
+    left -= fragment_length;
+    begin = false;
+  } while (s.ok() && left > 0);
+  return s;
+}
+
+// Writes one physical record -- a kHeaderSize-byte header followed by the
+// "n" payload bytes at "ptr" -- and flushes the destination.  The block
+// offset is advanced even if a write fails; the first failure is returned.
+Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
+  assert(n <= 0xffff);  // Must fit in two bytes
+  assert(block_offset_ + kHeaderSize + n <= kBlockSize);
+
+  char header[kHeaderSize];
+
+  // Bytes 0-3: masked crc of the record type and the payload.
+  const uint32_t stored_crc =
+      crc32c::Mask(crc32c::Extend(type_crc_[t], ptr, n));
+  EncodeFixed32(header, stored_crc);
+
+  // Bytes 4-5: payload length, low byte first.  Byte 6: record type.
+  header[4] = static_cast<char>(n & 0xff);
+  header[5] = static_cast<char>(n >> 8);
+  header[6] = static_cast<char>(t);
+
+  // Header, then payload, then flush; stop at the first failure.
+  Status s = dest_->Append(Slice(header, kHeaderSize));
+  if (s.ok()) {
+    s = dest_->Append(Slice(ptr, n));
+  }
+  if (s.ok()) {
+    s = dest_->Flush();
+  }
+  block_offset_ += kHeaderSize + n;
+  return s;
+}
+
+} // namespace log
+} // namespace leveldb
diff --git a/src/leveldb/db/log_writer.h b/src/leveldb/db/log_writer.h
new file mode 100644
index 0000000000..a3a954d967
--- /dev/null
+++ b/src/leveldb/db/log_writer.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_LOG_WRITER_H_
+#define STORAGE_LEVELDB_DB_LOG_WRITER_H_
+
+#include <stdint.h>
+#include "db/log_format.h"
+#include "leveldb/slice.h"
+#include "leveldb/status.h"
+
+namespace leveldb {
+
+class WritableFile;
+
+namespace log {
+
+class Writer {  // Appends records to a WritableFile, fragmenting them across fixed-size blocks.
+ public:
+ // Create a writer that will append data to "*dest".
+ // "*dest" must be initially empty.
+ // "*dest" must remain live while this Writer is in use.
+ explicit Writer(WritableFile* dest);
+ ~Writer();
+
+ Status AddRecord(const Slice& slice);  // Appends "slice" as one logical record; returns the first write error, if any.
+
+ private:
+ WritableFile* dest_;  // not owned
+ int block_offset_; // Current offset in block
+
+ // crc32c values for all supported record types. These are
+ // pre-computed to reduce the overhead of computing the crc of the
+ // record type stored in the header.
+ uint32_t type_crc_[kMaxRecordType + 1];
+
+ Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);  // Writes a single header + payload fragment.
+
+ // No copying allowed
+ Writer(const Writer&);
+ void operator=(const Writer&);
+};
+
+} // namespace log
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_LOG_WRITER_H_
diff --git a/src/leveldb/db/memtable.cc b/src/leveldb/db/memtable.cc
new file mode 100644
index 0000000000..bfec0a7e7a
--- /dev/null
+++ b/src/leveldb/db/memtable.cc
@@ -0,0 +1,145 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/memtable.h"
+#include "db/dbformat.h"
+#include "leveldb/comparator.h"
+#include "leveldb/env.h"
+#include "leveldb/iterator.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+// Decodes the varint32 length stored at "data" and returns a Slice over
+// the bytes that follow it.
+static Slice GetLengthPrefixedSlice(const char* data) {
+  uint32_t length;
+  // +5: we assume "data" is not corrupted, so the varint32 ends within
+  // five bytes.
+  const char* payload = GetVarint32Ptr(data, data + 5, &length);
+  return Slice(payload, length);
+}
+
+MemTable::MemTable(const InternalKeyComparator& cmp)  // Starts with a reference count of zero; table_ allocates from arena_.
+ : comparator_(cmp),
+ refs_(0),
+ table_(comparator_, &arena_) {
+}
+
+MemTable::~MemTable() {  // Must only run once every reference has been dropped.
+ assert(refs_ == 0);
+}
+
+size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); }  // Reports whatever the arena says it is using.
+
+// Orders two memtable entries by their internal keys.  Each pointer
+// refers to a length-prefixed encoding of an internal key, which is
+// decoded before being handed to the wrapped comparator.
+int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr)
+    const {
+  return comparator.Compare(GetLengthPrefixedSlice(aptr),
+                            GetLengthPrefixedSlice(bptr));
+}
+
+// Writes "target" into *scratch as a varint32 length followed by the
+// target's bytes, replacing any previous contents, and returns a pointer
+// to the start of that encoding (which lives inside *scratch).
+static const char* EncodeKey(std::string* scratch, const Slice& target) {
+  const size_t target_len = target.size();
+  scratch->clear();
+  PutVarint32(scratch, target_len);
+  scratch->append(target.data(), target_len);
+  return scratch->data();
+}
+
+class MemTableIterator: public Iterator {  // Adapts a MemTable::Table::Iterator to the Iterator interface.
+ public:
+ explicit MemTableIterator(MemTable::Table* table) : iter_(table) { }
+
+ virtual bool Valid() const { return iter_.Valid(); }
+ virtual void Seek(const Slice& k) { iter_.Seek(EncodeKey(&tmp_, k)); }  // k is re-encoded with a length prefix before seeking
+ virtual void SeekToFirst() { iter_.SeekToFirst(); }
+ virtual void SeekToLast() { iter_.SeekToLast(); }
+ virtual void Next() { iter_.Next(); }
+ virtual void Prev() { iter_.Prev(); }
+ virtual Slice key() const { return GetLengthPrefixedSlice(iter_.key()); }  // strips the length prefix from the stored entry
+ virtual Slice value() const {
+ Slice key_slice = GetLengthPrefixedSlice(iter_.key());
+ return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());  // the value is the length-prefixed slice right after the key bytes
+ }
+
+ virtual Status status() const { return Status::OK(); }  // this iterator never reports an error
+
+ private:
+ MemTable::Table::Iterator iter_;
+ std::string tmp_; // For passing to EncodeKey
+
+ // No copying allowed
+ MemTableIterator(const MemTableIterator&);
+ void operator=(const MemTableIterator&);
+};
+
+Iterator* MemTable::NewIterator() {  // Returns a heap-allocated iterator over table_; the caller receives the raw pointer.
+ return new MemTableIterator(&table_);
+}
+
+// Encodes (key, sequence number, type, value) into one arena-allocated
+// entry and inserts it into the table.
+void MemTable::Add(SequenceNumber s, ValueType type,
+                   const Slice& key,
+                   const Slice& value) {
+  // An entry is the concatenation of:
+  //  key_size     : varint32 of internal_key.size()
+  //  key bytes    : char[internal_key.size()]
+  //  value_size   : varint32 of value.size()
+  //  value bytes  : char[value.size()]
+  // where the internal key is the user key followed by an 8-byte tag.
+  size_t user_key_len = key.size();
+  size_t value_len = value.size();
+  size_t ikey_len = user_key_len + 8;
+  const size_t entry_len =
+      VarintLength(ikey_len) + ikey_len +
+      VarintLength(value_len) + value_len;
+
+  char* entry = arena_.Allocate(entry_len);
+
+  // Internal key: length prefix, user key, then (sequence << 8 | type).
+  char* cursor = EncodeVarint32(entry, ikey_len);
+  memcpy(cursor, key.data(), user_key_len);
+  cursor += user_key_len;
+  EncodeFixed64(cursor, (s << 8) | type);
+  cursor += 8;
+
+  // Value: length prefix, then the bytes.
+  cursor = EncodeVarint32(cursor, value_len);
+  memcpy(cursor, value.data(), value_len);
+  assert((cursor + value_len) - entry == entry_len);
+
+  table_.Insert(entry);
+}
+
+// Looks up "key" in the memtable.  Returns true with *value filled in when
+// the entry found is a value, true with *s set to NotFound when it is a
+// deletion, and false when the table has no entry for the user key.
+bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
+  Slice memkey = key.memtable_key();
+  Table::Iterator iter(&table_);
+  iter.Seek(memkey.data());
+  if (!iter.Valid()) {
+    return false;
+  }
+
+  // entry format is:
+  //    klength  varint32
+  //    userkey  char[klength]
+  //    tag      uint64
+  //    vlength  varint32
+  //    value    char[vlength]
+  // Check that it belongs to same user key.  We do not check the
+  // sequence number since the Seek() call above should have skipped
+  // all entries with overly large sequence numbers.
+  const char* entry = iter.key();
+  uint32_t key_length;
+  const char* key_ptr = GetVarint32Ptr(entry, entry+5, &key_length);
+  const Slice found_user_key(key_ptr, key_length - 8);
+  if (comparator_.comparator.user_comparator()->Compare(
+          found_user_key, key.user_key()) != 0) {
+    // The seek landed on an entry for a different user key.
+    return false;
+  }
+
+  // Correct user key: the low byte of the tag holds the value type.
+  const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+  switch (static_cast<ValueType>(tag & 0xff)) {
+    case kTypeValue: {
+      Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+      value->assign(v.data(), v.size());
+      return true;
+    }
+    case kTypeDeletion:
+      *s = Status::NotFound(Slice());
+      return true;
+  }
+  return false;
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/memtable.h b/src/leveldb/db/memtable.h
new file mode 100644
index 0000000000..92e90bb099
--- /dev/null
+++ b/src/leveldb/db/memtable.h
@@ -0,0 +1,91 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_MEMTABLE_H_
+#define STORAGE_LEVELDB_DB_MEMTABLE_H_
+
+#include <string>
+#include "leveldb/db.h"
+#include "db/dbformat.h"
+#include "db/skiplist.h"
+#include "util/arena.h"
+
+namespace leveldb {
+
+class InternalKeyComparator;
+class Mutex;
+class MemTableIterator;
+
+class MemTable {
+ public:
+ // MemTables are reference counted. The initial reference count
+ // is zero and the caller must call Ref() at least once.
+ explicit MemTable(const InternalKeyComparator& comparator);
+
+ // Increase reference count.
+ void Ref() { ++refs_; }
+
+ // Drop reference count. Delete if no more references exist.
+ void Unref() {
+ --refs_;
+ assert(refs_ >= 0);  // catches an Unref() without a matching Ref()
+ if (refs_ <= 0) {
+ delete this;  // the object must not be touched after this point
+ }
+ }
+
+ // Returns an estimate of the number of bytes of data in use by this
+ // data structure.
+ //
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ size_t ApproximateMemoryUsage();
+
+ // Return an iterator that yields the contents of the memtable.
+ //
+ // The caller must ensure that the underlying MemTable remains live
+ // while the returned iterator is live. The keys returned by this
+ // iterator are internal keys encoded by AppendInternalKey in the
+ // db/dbformat.{h,cc} module.
+ Iterator* NewIterator();
+
+ // Add an entry into memtable that maps key to value at the
+ // specified sequence number and with the specified type.
+ // Typically value will be empty if type==kTypeDeletion.
+ void Add(SequenceNumber seq, ValueType type,
+ const Slice& key,
+ const Slice& value);
+
+ // If memtable contains a value for key, store it in *value and return true.
+ // If memtable contains a deletion for key, store a NotFound() error
+ // in *s and return true.
+ // Else, return false.
+ bool Get(const LookupKey& key, std::string* value, Status* s);
+
+ private:
+ ~MemTable(); // Private since only Unref() should be used to delete it
+
+ struct KeyComparator {
+ const InternalKeyComparator comparator;
+ explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
+ int operator()(const char* a, const char* b) const;  // orders two encoded entries
+ };
+ friend class MemTableIterator;
+ friend class MemTableBackwardIterator;
+
+ typedef SkipList<const char*, KeyComparator> Table;  // keys are pointers to encoded entries
+
+ KeyComparator comparator_;
+ int refs_;  // changed only through Ref()/Unref()
+ Arena arena_;  // Add() allocates each encoded entry from here
+ Table table_;  // Add() inserts the entry pointers; Get() seeks in it
+
+ // No copying allowed
+ MemTable(const MemTable&);
+ void operator=(const MemTable&);
+};
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_MEMTABLE_H_
diff --git a/src/leveldb/db/repair.cc b/src/leveldb/db/repair.cc
new file mode 100644
index 0000000000..96c9b37af1
--- /dev/null
+++ b/src/leveldb/db/repair.cc
@@ -0,0 +1,462 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// We recover the contents of the descriptor from the other files we find.
+// (1) Any log files are first converted to tables
+// (2) We scan every table to compute
+// (a) smallest/largest for the table
+// (b) largest sequence number in the table
+// (3) We generate descriptor contents:
+// - log number is set to zero
+// - next-file-number is set to 1 + largest file number we found
+// - last-sequence-number is set to largest sequence# found across
+// all tables (see 2b)
+// - compaction pointers are cleared
+// - every table file is added at level 0
+//
+// Possible optimization 1:
+// (a) Compute total size and use to pick appropriate max-level M
+// (b) Sort tables by largest sequence# in the table
+// (c) For each table: if it overlaps earlier table, place in level-0,
+// else place in level-M.
+// Possible optimization 2:
+// Store per-table metadata (smallest, largest, largest-seq#, ...)
+// in the table's meta section to speed up ScanTable.
+
+#include "db/builder.h"
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/comparator.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+
+namespace leveldb {
+
+namespace {
+
+class Repairer {
+ public:
+ Repairer(const std::string& dbname, const Options& options)  // sets up state for repairing the database at dbname
+ : dbname_(dbname),
+ env_(options.env),
+ icmp_(options.comparator),
+ ipolicy_(options.filter_policy),
+ options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),  // takes the addresses of icmp_ and ipolicy_ initialized above
+ owns_info_log_(options_.info_log != options.info_log),  // differs from the caller's logger => deleted in ~Repairer()
+ owns_cache_(options_.block_cache != options.block_cache),  // differs from the caller's cache => deleted in ~Repairer()
+ next_file_number_(1) {  // raised by FindFiles() past every file number found
+ // TableCache can be small since we expect each table to be opened once.
+ table_cache_ = new TableCache(dbname_, &options_, 10);
+ }
+
+ ~Repairer() {  // releases the table cache and any objects flagged as owned by the constructor
+ delete table_cache_;
+ if (owns_info_log_) {
+ delete options_.info_log;  // only when it is not the caller-supplied logger
+ }
+ if (owns_cache_) {
+ delete options_.block_cache;  // only when it is not the caller-supplied cache
+ }
+ }
+
+ Status Run() {  // drives the repair: find files, convert logs, scan tables, write a new descriptor
+ Status status = FindFiles();
+ if (status.ok()) {
+ ConvertLogFilesToTables();  // per-log failures are logged inside and do not stop the repair
+ ExtractMetaData();  // fills tables_ with every table that could be scanned or repaired
+ status = WriteDescriptor();
+ }
+ if (status.ok()) {
+ unsigned long long bytes = 0;  // total size of the recovered tables, for the summary line only
+ for (size_t i = 0; i < tables_.size(); i++) {
+ bytes += tables_[i].meta.file_size;
+ }
+ Log(options_.info_log,
+ "**** Repaired leveldb %s; "
+ "recovered %d files; %llu bytes. "
+ "Some data may have been lost. "
+ "****",
+ dbname_.c_str(),
+ static_cast<int>(tables_.size()),
+ bytes);
+ }
+ return status;  // error from FindFiles() or WriteDescriptor(), otherwise OK
+ }
+
+ private:
+ struct TableInfo {  // what ScanTable() learns about one table file
+ FileMetaData meta;  // number, file_size, smallest and largest key
+ SequenceNumber max_sequence;  // largest sequence number among the parsable keys
+ };
+
+ std::string const dbname_;  // database directory being repaired
+ Env* const env_;  // taken from the caller's options.env
+ InternalKeyComparator const icmp_;
+ InternalFilterPolicy const ipolicy_;
+ Options const options_;  // result of SanitizeOptions() in the constructor
+ bool owns_info_log_;  // true => destructor deletes options_.info_log
+ bool owns_cache_;  // true => destructor deletes options_.block_cache
+ TableCache* table_cache_;  // allocated in the constructor, deleted in the destructor
+ VersionEdit edit_;  // populated and encoded by WriteDescriptor()
+
+ std::vector<std::string> manifests_;  // descriptor file names found by FindFiles()
+ std::vector<uint64_t> table_numbers_;  // table files found, plus tables built from logs
+ std::vector<uint64_t> logs_;  // log file numbers found by FindFiles()
+ std::vector<TableInfo> tables_;  // tables that will be listed in the new descriptor
+ uint64_t next_file_number_;  // next unused file number
+
+ Status FindFiles() {  // classifies the directory contents into manifests_, logs_ and table_numbers_
+ std::vector<std::string> filenames;
+ Status status = env_->GetChildren(dbname_, &filenames);
+ if (!status.ok()) {
+ return status;
+ }
+ if (filenames.empty()) {
+ return Status::IOError(dbname_, "repair found no files");
+ }
+
+ uint64_t number;
+ FileType type;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type)) {  // names that do not parse are skipped
+ if (type == kDescriptorFile) {
+ manifests_.push_back(filenames[i]);  // kept by name; archived later by WriteDescriptor()
+ } else {
+ if (number + 1 > next_file_number_) {  // keep next_file_number_ above every non-descriptor file number
+ next_file_number_ = number + 1;
+ }
+ if (type == kLogFile) {
+ logs_.push_back(number);
+ } else if (type == kTableFile) {
+ table_numbers_.push_back(number);
+ } else {
+ // Ignore other files
+ }
+ }
+ }
+ }
+ return status;  // OK at this point: GetChildren() succeeded
+ }
+
+ void ConvertLogFilesToTables() {  // converts every log in logs_ to a table, then archives the log
+ for (size_t i = 0; i < logs_.size(); i++) {
+ std::string logname = LogFileName(dbname_, logs_[i]);
+ Status status = ConvertLogToTable(logs_[i]);
+ if (!status.ok()) {
+ Log(options_.info_log, "Log #%llu: ignoring conversion error: %s",
+ (unsigned long long) logs_[i],
+ status.ToString().c_str());
+ }
+ ArchiveFile(logname);  // archived whether or not the conversion succeeded
+ }
+ }
+
+ Status ConvertLogToTable(uint64_t log) {  // replays log file #log into a memtable and writes it out as a new table
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;  // assigned below but not read by Corruption()
+ Logger* info_log;
+ uint64_t lognum;
+ virtual void Corruption(size_t bytes, const Status& s) {
+ // We print error messages for corruption, but continue repairing.
+ Log(info_log, "Log #%llu: dropping %d bytes; %s",
+ (unsigned long long) lognum,
+ static_cast<int>(bytes),
+ s.ToString().c_str());
+ }
+ };
+
+ // Open the log file
+ std::string logname = LogFileName(dbname_, log);
+ SequentialFile* lfile;
+ Status status = env_->NewSequentialFile(logname, &lfile);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Create the log reader.
+ LogReporter reporter;
+ reporter.env = env_;
+ reporter.info_log = options_.info_log;
+ reporter.lognum = log;
+ // NOTE(review): the original comment here says we intentionally make
+ // log::Reader do checksumming so that corruptions cause entire commits
+ // to be skipped instead of propagating bad information (like overly large
+ // sequence numbers), but the call below passes false -- confirm intent.
+ log::Reader reader(lfile, &reporter, false/*do not checksum*/,
+ 0/*initial_offset*/);
+
+ // Read all the records and add to a memtable
+ std::string scratch;
+ Slice record;
+ WriteBatch batch;
+ MemTable* mem = new MemTable(icmp_);
+ mem->Ref();  // released by the Unref() after BuildTable() below
+ int counter = 0;  // number of operations applied to the memtable
+ while (reader.ReadRecord(&record, &scratch)) {
+ if (record.size() < 12) {  // presumably the minimum WriteBatch header size -- TODO confirm
+ reporter.Corruption(
+ record.size(), Status::Corruption("log record too small"));
+ continue;
+ }
+ WriteBatchInternal::SetContents(&batch, record);
+ status = WriteBatchInternal::InsertInto(&batch, mem);
+ if (status.ok()) {
+ counter += WriteBatchInternal::Count(&batch);
+ } else {
+ Log(options_.info_log, "Log #%llu: ignoring %s",
+ (unsigned long long) log,
+ status.ToString().c_str());
+ status = Status::OK(); // Keep going with rest of file
+ }
+ }
+ delete lfile;
+
+ // Do not record a version edit for this conversion to a Table
+ // since ExtractMetaData() will also generate edits.
+ FileMetaData meta;
+ meta.number = next_file_number_++;  // the number is consumed even if no table ends up being kept
+ Iterator* iter = mem->NewIterator();
+ status = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
+ delete iter;
+ mem->Unref();
+ mem = NULL;
+ if (status.ok()) {
+ if (meta.file_size > 0) {  // a zero-size result is not queued for scanning
+ table_numbers_.push_back(meta.number);
+ }
+ }
+ Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
+ (unsigned long long) log,
+ counter,
+ (unsigned long long) meta.number,
+ status.ToString().c_str());
+ return status;  // status of BuildTable(); per-record errors were only logged
+ }
+
+ void ExtractMetaData() {  // scans every known table; ScanTable() appends the usable ones to tables_
+ std::vector<TableInfo> kept;  // NOTE(review): never used in this function
+ for (size_t i = 0; i < table_numbers_.size(); i++) {
+ ScanTable(table_numbers_[i]);
+ }
+ }
+
+ Iterator* NewTableIterator(const FileMetaData& meta) {  // callers in this class delete the returned iterator
+ // Same as compaction iterators: if paranoid_checks are on, turn
+ // on checksum verification.
+ ReadOptions r;
+ r.verify_checksums = options_.paranoid_checks;
+ return table_cache_->NewIterator(r, meta.number, meta.file_size);  // table identified by number and size
+ }
+
+ void ScanTable(uint64_t number) {  // collects size, key range and max sequence for table #number
+ TableInfo t;
+ t.meta.number = number;
+ std::string fname = TableFileName(dbname_, number);
+ Status status = env_->GetFileSize(fname, &t.meta.file_size);
+ if (!status.ok()) {
+ // Try alternate file name.
+ fname = SSTTableFileName(dbname_, number);
+ Status s2 = env_->GetFileSize(fname, &t.meta.file_size);
+ if (s2.ok()) {
+ status = Status::OK();  // fname now refers to the alternate name
+ }
+ }
+ if (!status.ok()) {
+ ArchiveFile(TableFileName(dbname_, number));  // neither name could be sized: archive both candidates
+ ArchiveFile(SSTTableFileName(dbname_, number));
+ Log(options_.info_log, "Table #%llu: dropped: %s",
+ (unsigned long long) t.meta.number,
+ status.ToString().c_str());  // reports the error from the first (primary) name
+ return;
+ }
+
+ // Extract metadata by scanning through table.
+ int counter = 0;  // number of keys that parsed successfully
+ Iterator* iter = NewTableIterator(t.meta);
+ bool empty = true;  // true until the first parsable key is seen
+ ParsedInternalKey parsed;
+ t.max_sequence = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ Slice key = iter->key();
+ if (!ParseInternalKey(key, &parsed)) {
+ Log(options_.info_log, "Table #%llu: unparsable key %s",
+ (unsigned long long) t.meta.number,
+ EscapeString(key).c_str());
+ continue;  // unparsable keys are logged and left out of the metadata
+ }
+
+ counter++;
+ if (empty) {
+ empty = false;
+ t.meta.smallest.DecodeFrom(key);  // first parsable key in iteration order
+ }
+ t.meta.largest.DecodeFrom(key);  // overwritten each time, so ends as the last parsable key
+ if (parsed.sequence > t.max_sequence) {
+ t.max_sequence = parsed.sequence;
+ }
+ }
+ if (!iter->status().ok()) {
+ status = iter->status();  // an iteration error sends the table to RepairTable() below
+ }
+ delete iter;
+ Log(options_.info_log, "Table #%llu: %d entries %s",
+ (unsigned long long) t.meta.number,
+ counter,
+ status.ToString().c_str());
+
+ if (status.ok()) {
+ tables_.push_back(t);  // NOTE(review): also reached when counter == 0, with smallest/largest never set
+ } else {
+ RepairTable(fname, t); // RepairTable archives input file.
+ }
+ }
+
+ void RepairTable(const std::string& src, TableInfo t) {  // t is a copy carrying the metadata gathered by ScanTable()
+ // We will copy src contents to a new table and then rename the
+ // new table over the source.
+
+ // Create builder.
+ std::string copy = TableFileName(dbname_, next_file_number_++);
+ WritableFile* file;
+ Status s = env_->NewWritableFile(copy, &file);
+ if (!s.ok()) {
+ return;  // nothing is logged or archived on this path; the table is simply not recorded
+ }
+ TableBuilder* builder = new TableBuilder(options_, file);
+
+ // Copy data.
+ Iterator* iter = NewTableIterator(t.meta);
+ int counter = 0;  // number of entries copied into the new table
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ builder->Add(iter->key(), iter->value());
+ counter++;
+ }
+ delete iter;  // NOTE(review): iter->status() is not consulted; whatever could be read is what gets kept
+
+ ArchiveFile(src);
+ if (counter == 0) {
+ builder->Abandon(); // Nothing to save
+ } else {
+ s = builder->Finish();
+ if (s.ok()) {
+ t.meta.file_size = builder->FileSize();  // size of the rewritten table replaces the scanned size
+ }
+ }
+ delete builder;
+ builder = NULL;
+
+ if (s.ok()) {
+ s = file->Close();
+ }
+ delete file;
+ file = NULL;
+
+ if (counter > 0 && s.ok()) {
+ std::string orig = TableFileName(dbname_, t.meta.number);
+ s = env_->RenameFile(copy, orig);  // the copy takes over the original table number
+ if (s.ok()) {
+ Log(options_.info_log, "Table #%llu: %d entries repaired",
+ (unsigned long long) t.meta.number, counter);
+ tables_.push_back(t);
+ }
+ }
+ if (!s.ok()) {
+ env_->DeleteFile(copy);  // NOTE(review): not reached when counter == 0 and s is OK, so that copy is left on disk
+ }
+ }
+
+ Status WriteDescriptor() {  // writes a fresh descriptor listing every entry of tables_ and installs it
+ std::string tmp = TempFileName(dbname_, 1);
+ WritableFile* file;
+ Status status = env_->NewWritableFile(tmp, &file);
+ if (!status.ok()) {
+ return status;
+ }
+
+ SequenceNumber max_sequence = 0;  // largest max_sequence over all recovered tables
+ for (size_t i = 0; i < tables_.size(); i++) {
+ if (max_sequence < tables_[i].max_sequence) {
+ max_sequence = tables_[i].max_sequence;
+ }
+ }
+
+ edit_.SetComparatorName(icmp_.user_comparator()->Name());
+ edit_.SetLogNumber(0);
+ edit_.SetNextFile(next_file_number_);
+ edit_.SetLastSequence(max_sequence);
+
+ for (size_t i = 0; i < tables_.size(); i++) {
+ // TODO(opt): separate out into multiple levels
+ const TableInfo& t = tables_[i];
+ edit_.AddFile(0, t.meta.number, t.meta.file_size,
+ t.meta.smallest, t.meta.largest);  // every table goes to level 0
+ }
+
+ //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
+ {
+ log::Writer log(file);  // scoped so the writer is gone before the file is closed
+ std::string record;
+ edit_.EncodeTo(&record);
+ status = log.AddRecord(record);  // the whole edit is written as a single record
+ }
+ if (status.ok()) {
+ status = file->Close();
+ }
+ delete file;
+ file = NULL;
+
+ if (!status.ok()) {
+ env_->DeleteFile(tmp);
+ } else {
+ // Discard older manifests
+ for (size_t i = 0; i < manifests_.size(); i++) {
+ ArchiveFile(dbname_ + "/" + manifests_[i]);  // manifests_ holds bare names, so the directory is prepended
+ }
+
+ // Install new manifest
+ status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
+ if (status.ok()) {
+ status = SetCurrentFile(env_, dbname_, 1);  // same number 1 as the descriptor just renamed into place
+ } else {
+ env_->DeleteFile(tmp);
+ }
+ }
+ return status;
+ }
+
+ void ArchiveFile(const std::string& fname) {  // best effort: the rename result is logged, never returned
+ // Move into another directory. E.g., for
+ // dir/foo
+ // rename to
+ // dir/lost/foo
+ const char* slash = strrchr(fname.c_str(), '/');  // last '/' splits directory from base name
+ std::string new_dir;
+ if (slash != NULL) {
+ new_dir.assign(fname.data(), slash - fname.data());  // directory part, without the trailing '/'
+ }
+ new_dir.append("/lost");  // NOTE(review): with no '/' in fname this yields the absolute path "/lost"
+ env_->CreateDir(new_dir); // Ignore error
+ std::string new_file = new_dir;
+ new_file.append("/");
+ new_file.append((slash == NULL) ? fname.c_str() : slash + 1);  // base name of fname
+ Status s = env_->RenameFile(fname, new_file);
+ Log(options_.info_log, "Archiving %s: %s\n",
+ fname.c_str(), s.ToString().c_str());
+ }
+};
+} // namespace
+
+Status RepairDB(const std::string& dbname, const Options& options) {  // entry point: runs one Repairer over dbname
+ Repairer repairer(dbname, options);
+ return repairer.Run();  // status of the repair as reported by Repairer::Run()
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/skiplist.h b/src/leveldb/db/skiplist.h
new file mode 100644
index 0000000000..af85be6d01
--- /dev/null
+++ b/src/leveldb/db/skiplist.h
@@ -0,0 +1,379 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread safety
+// -------------
+//
+// Writes require external synchronization, most likely a mutex.
+// Reads require a guarantee that the SkipList will not be destroyed
+// while the read is in progress. Apart from that, reads progress
+// without any internal locking or synchronization.
+//
+// Invariants:
+//
+// (1) Allocated nodes are never deleted until the SkipList is
+// destroyed. This is trivially guaranteed by the code since we
+// never delete any skip list nodes.
+//
+// (2) The contents of a Node except for the next/prev pointers are
+// immutable after the Node has been linked into the SkipList.
+// Only Insert() modifies the list, and it is careful to initialize
+// a node and use release-stores to publish the nodes in one or
+// more lists.
+//
+// ... prev vs. next pointer ordering ...
+
+#include <assert.h>
+#include <stdlib.h>
+#include "port/port.h"
+#include "util/arena.h"
+#include "util/random.h"
+
+namespace leveldb {
+
+class Arena;
+
+template<typename Key, class Comparator>
+class SkipList {
+ private:
+ struct Node;
+
+ public:
+ // Create a new SkipList object that will use "cmp" for comparing keys,
+ // and will allocate memory using "*arena". Objects allocated in the arena
+ // must remain allocated for the lifetime of the skiplist object.
+ explicit SkipList(Comparator cmp, Arena* arena);
+
+ // Insert key into the list.
+ // REQUIRES: nothing that compares equal to key is currently in the list.
+ void Insert(const Key& key);
+
+ // Returns true iff an entry that compares equal to key is in the list.
+ bool Contains(const Key& key) const;
+
+ // Iteration over the contents of a skip list
+ class Iterator {
+ public:
+ // Initialize an iterator over the specified list.
+ // The returned iterator is not valid.
+ explicit Iterator(const SkipList* list);
+
+ // Returns true iff the iterator is positioned at a valid node.
+ bool Valid() const;
+
+ // Returns the key at the current position.
+ // REQUIRES: Valid()
+ const Key& key() const;
+
+ // Advances to the next position.
+ // REQUIRES: Valid()
+ void Next();
+
+ // Moves to the previous position by searching the list (no back links).
+ // REQUIRES: Valid()
+ void Prev();
+
+ // Advance to the first entry with a key >= target
+ void Seek(const Key& target);
+
+ // Position at the first entry in list.
+ // Final state of iterator is Valid() iff list is not empty.
+ void SeekToFirst();
+
+ // Position at the last entry in list.
+ // Final state of iterator is Valid() iff list is not empty.
+ void SeekToLast();
+
+ private:
+ const SkipList* list_;
+ Node* node_;  // NULL whenever the iterator is not valid
+ // Intentionally copyable
+ };
+
+ private:
+ enum { kMaxHeight = 12 };  // upper bound on the height of any node
+
+ // Immutable after construction
+ Comparator const compare_;
+ Arena* const arena_; // Arena used for allocations of nodes
+
+ Node* const head_;  // head node, created with kMaxHeight links by the constructor
+
+ // Modified only by Insert(). Read racily by readers, but stale
+ // values are ok.
+ port::AtomicPointer max_height_; // Height of the entire list
+
+ inline int GetMaxHeight() const {  // the height is kept as an integer stored in the pointer slot
+ return static_cast<int>(
+ reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load()));
+ }
+
+ // Read/written only by Insert().
+ Random rnd_;
+
+ Node* NewNode(const Key& key, int height);
+ int RandomHeight();
+ bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
+
+ // Return true if key is greater than the data stored in "n"
+ bool KeyIsAfterNode(const Key& key, Node* n) const;
+
+ // Return the earliest node that comes at or after key.
+ // Return NULL if there is no such node.
+ //
+ // If prev is non-NULL, fills prev[level] with pointer to previous
+ // node at "level" for every level in [0..max_height_-1].
+ Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
+
+ // Return the latest node with a key < key.
+ // Return head_ if there is no such node.
+ Node* FindLessThan(const Key& key) const;
+
+ // Return the last node in the list.
+ // Return head_ if list is empty.
+ Node* FindLast() const;
+
+ // No copying allowed
+ SkipList(const SkipList&);
+ void operator=(const SkipList&);
+};
+
+// Implementation details follow
+template<typename Key, class Comparator>
+struct SkipList<Key,Comparator>::Node {
+ explicit Node(const Key& k) : key(k) { }  // the links are not initialized here; the creator sets them
+
+ Key const key;
+
+ // Accessors/mutators for links. Wrapped in methods so we can
+ // add the appropriate barriers as necessary.
+ Node* Next(int n) {
+ assert(n >= 0);
+ // Use an 'acquire load' so that we observe a fully initialized
+ // version of the returned Node.
+ return reinterpret_cast<Node*>(next_[n].Acquire_Load());
+ }
+ void SetNext(int n, Node* x) {
+ assert(n >= 0);
+ // Use a 'release store' so that anybody who reads through this
+ // pointer observes a fully initialized version of the inserted node.
+ next_[n].Release_Store(x);
+ }
+
+ // No-barrier variants that can be safely used in a few locations.
+ Node* NoBarrier_Next(int n) {
+ assert(n >= 0);
+ return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
+ }
+ void NoBarrier_SetNext(int n, Node* x) {
+ assert(n >= 0);
+ next_[n].NoBarrier_Store(x);
+ }
+
+ private:
+ // Array of length equal to the node height. next_[0] is lowest level link.
+ port::AtomicPointer next_[1];  // declared with one slot; NewNode() allocates room for height - 1 more
+};
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node*
+SkipList<Key,Comparator>::NewNode(const Key& key, int height) {  // builds a node with room for `height` links in arena memory
+ char* mem = arena_->AllocateAligned(
+ sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));  // Node already contains one link slot
+ return new (mem) Node(key);  // placement new: the storage belongs to the arena
+}
+
+template<typename Key, class Comparator>
+inline SkipList<Key,Comparator>::Iterator::Iterator(const SkipList* list) {  // binds to list without positioning
+ list_ = list;
+ node_ = NULL;  // Valid() is false until a Seek*() call positions the iterator
+}
+
+template<typename Key, class Comparator>
+inline bool SkipList<Key,Comparator>::Iterator::Valid() const {
+ return node_ != NULL;  // NULL is the "not positioned / past the end" state
+}
+
+template<typename Key, class Comparator>
+inline const Key& SkipList<Key,Comparator>::Iterator::key() const {
+ assert(Valid());
+ return node_->key;  // reference to the key stored inside the current node
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Next() {
+ assert(Valid());
+ node_ = node_->Next(0);  // follow the lowest-level link; NULL here makes the iterator invalid
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Prev() {
+ // Instead of using explicit "prev" links, we just search for the
+ // last node that falls before key.
+ assert(Valid());
+ node_ = list_->FindLessThan(node_->key);
+ if (node_ == list_->head_) {  // FindLessThan() returns head_ when nothing precedes the key
+ node_ = NULL;
+ }
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Seek(const Key& target) {
+ node_ = list_->FindGreaterOrEqual(target, NULL);  // NULL prev: predecessors are not needed; NULL result => invalid
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::SeekToFirst() {
+ node_ = list_->head_->Next(0);  // first node after the head on the lowest level, or NULL if there is none
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::SeekToLast() {
+ node_ = list_->FindLast();
+ if (node_ == list_->head_) {  // FindLast() returns head_ for an empty list
+ node_ = NULL;
+ }
+}
+
+template<typename Key, class Comparator>
+int SkipList<Key,Comparator>::RandomHeight() {  // picks the height for a new node, in [1, kMaxHeight]
+ // Increase height with probability 1 in kBranching
+ static const unsigned int kBranching = 4;
+ int height = 1;
+ while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) {  // stops at the first draw that is not a multiple of kBranching
+ height++;
+ }
+ assert(height > 0);
+ assert(height <= kMaxHeight);
+ return height;
+}
+
+template<typename Key, class Comparator>
+bool SkipList<Key,Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
+ // NULL n is considered infinite
+ return (n != NULL) && (compare_(n->key, key) < 0);  // true only when n's key sorts strictly before key
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOrEqual(const Key& key, Node** prev)
+ const {
+ Node* x = head_;
+ int level = GetMaxHeight() - 1;  // start on the highest level currently in use
+ while (true) {
+ Node* next = x->Next(level);
+ if (KeyIsAfterNode(key, next)) {
+ // Keep searching in this list
+ x = next;
+ } else {
+ if (prev != NULL) prev[level] = x;  // x is the last node before key on this level
+ if (level == 0) {
+ return next;  // first node at or after key, or NULL when there is none
+ } else {
+ // Switch to next list
+ level--;
+ }
+ }
+ }
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node*
+SkipList<Key,Comparator>::FindLessThan(const Key& key) const {
+ Node* x = head_;
+ int level = GetMaxHeight() - 1;  // start on the highest level currently in use
+ while (true) {
+ assert(x == head_ || compare_(x->key, key) < 0);  // x always sorts before key (head_ is exempt)
+ Node* next = x->Next(level);
+ if (next == NULL || compare_(next->key, key) >= 0) {
+ if (level == 0) {
+ return x;  // last node before key, or head_ when no node qualifies
+ } else {
+ // Switch to next list
+ level--;
+ }
+ } else {
+ x = next;  // next still sorts before key, so keep moving right
+ }
+ }
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindLast()
+ const {
+ Node* x = head_;
+ int level = GetMaxHeight() - 1;  // start on the highest level currently in use
+ while (true) {
+ Node* next = x->Next(level);
+ if (next == NULL) {
+ if (level == 0) {
+ return x;  // end of the lowest level: x is the last node, or head_ if the list is empty
+ } else {
+ // Switch to next list
+ level--;
+ }
+ } else {
+ x = next;  // keep moving right on this level
+ }
+ }
+}
+
+template<typename Key, class Comparator>
+SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)  // builds an empty list whose nodes come from *arena
+ : compare_(cmp),
+ arena_(arena),
+ head_(NewNode(0 /* any key will do */, kMaxHeight)),
+ max_height_(reinterpret_cast<void*>(1)),  // initial height 1, stored as an integer in the pointer slot
+ rnd_(0xdeadbeef) {  // fixed seed for the height generator
+ for (int i = 0; i < kMaxHeight; i++) {
+ head_->SetNext(i, NULL);  // every level starts out empty
+ }
+}
+
+template<typename Key, class Comparator>
+void SkipList<Key,Comparator>::Insert(const Key& key) {
+ // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
+ // here since Insert() is externally synchronized.
+ Node* prev[kMaxHeight];  // the search fills only levels [0, GetMaxHeight())
+ Node* x = FindGreaterOrEqual(key, prev);
+
+ // Our data structure does not allow duplicate insertion
+ assert(x == NULL || !Equal(key, x->key));
+
+ int height = RandomHeight();
+ if (height > GetMaxHeight()) {
+ for (int i = GetMaxHeight(); i < height; i++) {
+ prev[i] = head_;  // levels the search did not visit are linked directly after the head
+ }
+ //fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
+
+ // It is ok to mutate max_height_ without any synchronization
+ // with concurrent readers. A concurrent reader that observes
+ // the new value of max_height_ will see either the old value of
+ // new level pointers from head_ (NULL), or a new value set in
+ // the loop below. In the former case the reader will
+ // immediately drop to the next level since NULL sorts after all
+ // keys. In the latter case the reader will use the new node.
+ max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
+ }
+
+ x = NewNode(key, height);
+ for (int i = 0; i < height; i++) {
+ // NoBarrier_SetNext() suffices since we will add a barrier when
+ // we publish a pointer to "x" in prev[i].
+ x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
+ prev[i]->SetNext(i, x);  // release store: this is what makes x reachable on level i
+ }
+}
+
+template<typename Key, class Comparator>
+bool SkipList<Key,Comparator>::Contains(const Key& key) const {
+ Node* x = FindGreaterOrEqual(key, NULL);  // first node at or after key; predecessors not needed
+ if (x != NULL && Equal(key, x->key)) {
+ return true;
+ } else {
+ return false;  // nothing at or after key, or that node holds a different key
+ }
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/skiplist_test.cc b/src/leveldb/db/skiplist_test.cc
new file mode 100644
index 0000000000..c78f4b4fb1
--- /dev/null
+++ b/src/leveldb/db/skiplist_test.cc
@@ -0,0 +1,378 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/skiplist.h"
+#include <set>
+#include "leveldb/env.h"
+#include "util/arena.h"
+#include "util/hash.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+typedef uint64_t Key;
+
+struct Comparator {  // three-way comparison of Key values for the SkipList under test
+ int operator()(const Key& a, const Key& b) const {
+ if (a < b) {
+ return -1;
+ } else if (a > b) {
+ return +1;
+ } else {
+ return 0;  // a == b
+ }
+ }
+};
+
+class SkipTest { };  // empty type named as the first argument of the TEST(SkipTest, ...) cases
+
+TEST(SkipTest, Empty) {  // an empty list contains nothing and every seek leaves the iterator invalid
+ Arena arena;
+ Comparator cmp;
+ SkipList<Key, Comparator> list(cmp, &arena);
+ ASSERT_TRUE(!list.Contains(10));
+
+ SkipList<Key, Comparator>::Iterator iter(&list);
+ ASSERT_TRUE(!iter.Valid());  // a freshly constructed iterator is not positioned
+ iter.SeekToFirst();
+ ASSERT_TRUE(!iter.Valid());
+ iter.Seek(100);
+ ASSERT_TRUE(!iter.Valid());
+ iter.SeekToLast();
+ ASSERT_TRUE(!iter.Valid());
+}
+
+TEST(SkipTest, InsertAndLookup) {  // checks the skip list against a std::set model
+ const int N = 2000;  // number of insertion attempts
+ const int R = 5000;  // keys are drawn from [0, R)
+ Random rnd(1000);
+ std::set<Key> keys;  // model: the set of keys that were actually inserted
+ Arena arena;
+ Comparator cmp;
+ SkipList<Key, Comparator> list(cmp, &arena);
+ for (int i = 0; i < N; i++) {
+ Key key = rnd.Next() % R;
+ if (keys.insert(key).second) {  // only first occurrences: Insert() requires keys not already present
+ list.Insert(key);
+ }
+ }
+
+ for (int i = 0; i < R; i++) {
+ if (list.Contains(i)) {
+ ASSERT_EQ(keys.count(i), 1);
+ } else {
+ ASSERT_EQ(keys.count(i), 0);
+ }
+ }
+
+ // Simple iterator tests
+ {
+ SkipList<Key, Comparator>::Iterator iter(&list);
+ ASSERT_TRUE(!iter.Valid());
+
+ iter.Seek(0);
+ ASSERT_TRUE(iter.Valid());
+ ASSERT_EQ(*(keys.begin()), iter.key());
+
+ iter.SeekToFirst();
+ ASSERT_TRUE(iter.Valid());
+ ASSERT_EQ(*(keys.begin()), iter.key());
+
+ iter.SeekToLast();
+ ASSERT_TRUE(iter.Valid());
+ ASSERT_EQ(*(keys.rbegin()), iter.key());
+ }
+
+ // Forward iteration test
+ for (int i = 0; i < R; i++) {
+ SkipList<Key, Comparator>::Iterator iter(&list);
+ iter.Seek(i);
+
+ // Compare against model iterator
+ std::set<Key>::iterator model_iter = keys.lower_bound(i);
+ for (int j = 0; j < 3; j++) {  // compare up to three consecutive entries from each start point
+ if (model_iter == keys.end()) {
+ ASSERT_TRUE(!iter.Valid());
+ break;
+ } else {
+ ASSERT_TRUE(iter.Valid());
+ ASSERT_EQ(*model_iter, iter.key());
+ ++model_iter;
+ iter.Next();
+ }
+ }
+ }
+
+ // Backward iteration test
+ {
+ SkipList<Key, Comparator>::Iterator iter(&list);
+ iter.SeekToLast();
+
+ // Compare against model iterator
+ for (std::set<Key>::reverse_iterator model_iter = keys.rbegin();
+ model_iter != keys.rend();
+ ++model_iter) {
+ ASSERT_TRUE(iter.Valid());
+ ASSERT_EQ(*model_iter, iter.key());
+ iter.Prev();
+ }
+ ASSERT_TRUE(!iter.Valid());  // stepping back from the first entry invalidates the iterator
+ }
+}
+
+// We want to make sure that with a single writer and multiple
+// concurrent readers (with no synchronization other than when a
+// reader's iterator is created), the reader always observes all the
+// data that was present in the skip list when the iterator was
+// constructed. Because insertions are happening concurrently, we may
+// also observe new values that were inserted since the iterator was
+// constructed, but we should never miss any values that were present
+// at iterator construction time.
+//
+// We generate multi-part keys:
+// <key,gen,hash>
+// where:
+// key is in range [0..K-1]
+// gen is a generation number for key
+// hash is hash(key,gen)
+//
+// The insertion code picks a random key, sets gen to be 1 + the last
+// generation number inserted for that key, and sets hash to Hash(key,gen).
+//
+// At the beginning of a read, we snapshot the last inserted
+// generation number for each key. We then iterate, including random
+// calls to Next() and Seek(). For every key we encounter, we
+// check that it is either expected given the initial snapshot or has
+// been concurrently added since the iterator started.
+class ConcurrentTest {
+ private:
+ static const uint32_t K = 4;  // number of distinct key values; valid keys are [0, K)
+
+ static uint64_t key(Key key) { return (key >> 40); }  // bits 40 and up: the key part
+ static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }  // bits 8..39: the generation
+ static uint64_t hash(Key key) { return key & 0xff; }  // bits 0..7: the check hash
+
+ static uint64_t HashNumbers(uint64_t k, uint64_t g) {
+ uint64_t data[2] = { k, g };
+ return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
+ }
+
+ static Key MakeKey(uint64_t k, uint64_t g) {  // packs <k, g, low byte of HashNumbers(k, g)> into one Key
+ assert(sizeof(Key) == sizeof(uint64_t));
+ assert(k <= K); // We sometimes pass K to seek to the end of the skiplist
+ assert(g <= 0xffffffffu);
+ return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
+ }
+
+ static bool IsValidKey(Key k) {  // true when the stored hash byte matches the key and gen parts
+ return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
+ }
+
+ static Key RandomTarget(Random* rnd) {
+ switch (rnd->Next() % 10) {
+ case 0:
+ // Seek to beginning
+ return MakeKey(0, 0);
+ case 1:
+ // Seek to end
+ return MakeKey(K, 0);
+ default:
+ // Seek to middle
+ return MakeKey(rnd->Next() % K, 0);
+ }
+ }
+
+ // Per-key generation
+ struct State {
+ port::AtomicPointer generation[K];  // each generation is an integer stored in a pointer slot
+ void Set(int k, intptr_t v) {
+ generation[k].Release_Store(reinterpret_cast<void*>(v));
+ }
+ intptr_t Get(int k) {
+ return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
+ }
+
+ State() {
+ for (int k = 0; k < K; k++) {
+ Set(k, 0);  // all generations start at zero
+ }
+ }
+ };
+
+ // Current state of the test
+ State current_;
+
+ Arena arena_;
+
+ // SkipList is not protected by mu_. We just use a single writer
+ // thread to modify it.
+ SkipList<Key, Comparator> list_;
+
+ public:
+ ConcurrentTest() : list_(Comparator(), &arena_) { }
+
+ // REQUIRES: External synchronization
+ void WriteStep(Random* rnd) {
+ const uint32_t k = rnd->Next() % K;
+ const intptr_t g = current_.Get(k) + 1;
+ const Key key = MakeKey(k, g);
+ list_.Insert(key);
+ current_.Set(k, g);  // published only after the insert, so current_ never runs ahead of the list
+ }
+
+ void ReadStep(Random* rnd) {
+ // Remember the initial committed state of the skiplist.
+ State initial_state;
+ for (int k = 0; k < K; k++) {
+ initial_state.Set(k, current_.Get(k));
+ }
+
+ Key pos = RandomTarget(rnd);  // smallest key the iterator may legitimately return next
+ SkipList<Key, Comparator>::Iterator iter(&list_);
+ iter.Seek(pos);
+ while (true) {
+ Key current;
+ if (!iter.Valid()) {
+ current = MakeKey(K, 0);  // end of list is treated as a key past every valid key
+ } else {
+ current = iter.key();
+ ASSERT_TRUE(IsValidKey(current)) << current;
+ }
+ ASSERT_LE(pos, current) << "should not go backwards";
+
+ // Verify that everything in [pos,current) was not present in
+ // initial_state.
+ while (pos < current) {
+ ASSERT_LT(key(pos), K) << pos;
+
+ // Note that generation 0 is never inserted, so it is ok if
+ // <*,0,*> is missing.
+ ASSERT_TRUE((gen(pos) == 0) ||
+ (gen(pos) > initial_state.Get(key(pos)))
+ ) << "key: " << key(pos)
+ << "; gen: " << gen(pos)
+ << "; initgen: "
+ << initial_state.Get(key(pos));
+
+ // Advance to next key in the valid key space
+ if (key(pos) < key(current)) {
+ pos = MakeKey(key(pos) + 1, 0);
+ } else {
+ pos = MakeKey(key(pos), gen(pos) + 1);
+ }
+ }
+
+ if (!iter.Valid()) {
+ break;
+ }
+
+ if (rnd->Next() % 2) {
+ iter.Next();
+ pos = MakeKey(key(pos), gen(pos) + 1);  // pos == current here, so move just past the entry returned
+ } else {
+ Key new_target = RandomTarget(rnd);
+ if (new_target > pos) {  // only ever seek forward so pos stays monotonic
+ pos = new_target;
+ iter.Seek(new_target);
+ }
+ }
+ }
+ }
+};
+const uint32_t ConcurrentTest::K;
+
+// Single-threaded smoke test of the ConcurrentTest scaffolding: alternates
+// one read pass with one write, 10000 times.
+TEST(SkipTest, ConcurrentWithoutThreads) {
+  ConcurrentTest test;
+  Random rnd(test::RandomSeed());
+  for (int remaining = 10000; remaining > 0; --remaining) {
+    test.ReadStep(&rnd);
+    test.WriteStep(&rnd);
+  }
+}
+
+// State shared between the writing test body and one background reader:
+// the structure under test, the reader's seed, a quit flag, and a small
+// mutex/condvar-guarded state variable used for handshaking.
+class TestState {
+ public:
+  ConcurrentTest t_;
+  int seed_;
+  port::AtomicPointer quit_flag_;
+
+  enum ReaderState { STARTING, RUNNING, DONE };
+
+  explicit TestState(int s)
+      : seed_(s), quit_flag_(NULL), state_(STARTING), state_cv_(&mu_) {}
+
+  // Blocks until the reader state equals "s".
+  void Wait(ReaderState s) {
+    mu_.Lock();
+    for (;;) {
+      if (state_ == s) {
+        break;
+      }
+      state_cv_.Wait();
+    }
+    mu_.Unlock();
+  }
+
+  // Records the new reader state and signals the condition variable.
+  void Change(ReaderState s) {
+    mu_.Lock();
+    state_ = s;
+    state_cv_.Signal();
+    mu_.Unlock();
+  }
+
+ private:
+  // mu_ is declared before state_cv_ because state_cv_ is constructed
+  // from &mu_.
+  port::Mutex mu_;
+  ReaderState state_;
+  port::CondVar state_cv_;
+};
+
+// Entry point for the background reader. "arg" must point to a TestState.
+// Announces RUNNING, repeats ReadStep() until the quit flag becomes
+// non-NULL, then announces DONE.
+static void ConcurrentReader(void* arg) {
+  TestState* state = reinterpret_cast<TestState*>(arg);
+  Random rnd(state->seed_);
+  state->Change(TestState::RUNNING);
+  while (!state->quit_flag_.Acquire_Load()) {
+    state->t_.ReadStep(&rnd);
+  }
+  state->Change(TestState::DONE);
+}
+
+// Runs N rounds of one background reader against kSize writes. "run" only
+// perturbs the random seed so the numbered tests use different sequences.
+static void RunConcurrent(int run) {
+  const int seed = test::RandomSeed() + (run * 100);
+  Random rnd(seed);
+  const int N = 1000;
+  const int kSize = 1000;
+  for (int i = 0; i < N; i++) {
+    if ((i % 100) == 0) {
+      fprintf(stderr, "Run %d of %d\n", i, N);
+    }
+    TestState state(seed + 1);
+    Env::Default()->Schedule(ConcurrentReader, &state);
+    // Do not start writing until the reader is actually running.
+    state.Wait(TestState::RUNNING);
+    // Distinct index name: the original reused "i" and shadowed the outer
+    // loop counter.
+    for (int j = 0; j < kSize; j++) {
+      state.t_.WriteStep(&rnd);
+    }
+    state.quit_flag_.Release_Store(&state);  // Any non-NULL arg will do
+    // "state" lives on this stack frame, so wait for the reader to finish
+    // before it goes out of scope.
+    state.Wait(TestState::DONE);
+  }
+}
+
+// Five concurrent runs; the argument only changes the seed offset used by
+// RunConcurrent().
+TEST(SkipTest, Concurrent1) {
+  RunConcurrent(1);
+}
+TEST(SkipTest, Concurrent2) {
+  RunConcurrent(2);
+}
+TEST(SkipTest, Concurrent3) {
+  RunConcurrent(3);
+}
+TEST(SkipTest, Concurrent4) {
+  RunConcurrent(4);
+}
+TEST(SkipTest, Concurrent5) {
+  RunConcurrent(5);
+}
+
+} // namespace leveldb
+
+// Runs every registered test; the command-line arguments are not used.
+int main(int argc, char** argv) {
+  const int exit_code = leveldb::test::RunAllTests();
+  return exit_code;
+}
diff --git a/src/leveldb/db/snapshot.h b/src/leveldb/db/snapshot.h
new file mode 100644
index 0000000000..e7f8fd2c37
--- /dev/null
+++ b/src/leveldb/db/snapshot.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_
+#define STORAGE_LEVELDB_DB_SNAPSHOT_H_
+
+#include "leveldb/db.h"
+
+namespace leveldb {
+
+class SnapshotList;
+
+// Snapshots are kept in a doubly-linked list in the DB.
+// Each SnapshotImpl corresponds to a particular sequence number.
+class SnapshotImpl : public Snapshot {
+ public:
+  // Gives every field a defined value. SnapshotList embeds a SnapshotImpl
+  // as its list head and only assigns prev_/next_, so without this the
+  // head's number_ and list_ would be indeterminate.
+  SnapshotImpl() : number_(0), prev_(0), next_(0), list_(0) { }
+
+  SequenceNumber number_;  // const after creation
+
+ private:
+  friend class SnapshotList;
+
+  // SnapshotImpl is kept in a doubly-linked circular list
+  SnapshotImpl* prev_;
+  SnapshotImpl* next_;
+
+  SnapshotList* list_;  // just for sanity checks
+};
+
+// Circular doubly-linked list of snapshots with an embedded dummy head.
+// New snapshots are linked in just before the head, so head.next_ is the
+// oldest entry and head.prev_ the newest.
+class SnapshotList {
+ public:
+  // An empty list is a head that points at itself in both directions.
+  SnapshotList() {
+    list_.next_ = &list_;
+    list_.prev_ = &list_;
+  }
+
+  bool empty() const { return list_.next_ == &list_; }
+
+  // REQUIRES: !empty()
+  SnapshotImpl* oldest() const {
+    assert(!empty());
+    return list_.next_;
+  }
+
+  // REQUIRES: !empty()
+  SnapshotImpl* newest() const {
+    assert(!empty());
+    return list_.prev_;
+  }
+
+  // Allocates a snapshot for "seq" and appends it as the newest entry.
+  // The returned object must be released with Delete().
+  const SnapshotImpl* New(SequenceNumber seq) {
+    SnapshotImpl* snap = new SnapshotImpl;
+    SnapshotImpl* const tail = list_.prev_;
+    snap->number_ = seq;
+    snap->list_ = this;
+    snap->prev_ = tail;
+    snap->next_ = &list_;
+    tail->next_ = snap;
+    list_.prev_ = snap;
+    return snap;
+  }
+
+  // Unlinks "s" from this list and frees it.
+  // REQUIRES: "s" was returned by New() on this list.
+  void Delete(const SnapshotImpl* s) {
+    assert(s->list_ == this);
+    SnapshotImpl* const before = s->prev_;
+    SnapshotImpl* const after = s->next_;
+    before->next_ = after;
+    after->prev_ = before;
+    delete s;
+  }
+
+ private:
+  // Dummy head of doubly-linked list of snapshots
+  SnapshotImpl list_;
+};
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_SNAPSHOT_H_
diff --git a/src/leveldb/db/table_cache.cc b/src/leveldb/db/table_cache.cc
new file mode 100644
index 0000000000..e3d82cd3ea
--- /dev/null
+++ b/src/leveldb/db/table_cache.cc
@@ -0,0 +1,127 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/table_cache.h"
+
+#include "db/filename.h"
+#include "leveldb/env.h"
+#include "leveldb/table.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+// Value stored in the table cache: an opened Table together with the file
+// it was opened on. Both are deleted by DeleteEntry().
+struct TableAndFile {
+ RandomAccessFile* file;
+ Table* table;
+};
+
+// Deleter passed to Cache::Insert(). "value" is the TableAndFile stored for
+// the entry; the table is destroyed before the file, then the holder.
+// "key" is unused.
+static void DeleteEntry(const Slice& key, void* value) {
+  TableAndFile* entry = reinterpret_cast<TableAndFile*>(value);
+  delete entry->table;
+  delete entry->file;
+  delete entry;
+}
+
+// Iterator cleanup callback: arg1 is the Cache and arg2 the Cache::Handle
+// to release on it.
+static void UnrefEntry(void* arg1, void* arg2) {
+  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(arg2);
+  reinterpret_cast<Cache*>(arg1)->Release(handle);
+}
+
+// Builds a table cache for database "dbname". The Env is taken from
+// "options", the Options pointer itself is retained, and "entries" is
+// passed to NewLRUCache().
+TableCache::TableCache(const std::string& dbname, const Options* options,
+                       int entries)
+    : env_(options->env),
+      dbname_(dbname),
+      options_(options),
+      cache_(NewLRUCache(entries)) {}
+
+// Destroys the cache created in the constructor.
+TableCache::~TableCache() { delete cache_; }
+
+// Looks up the cache entry for "file_number", opening the table file and
+// inserting it on a miss. The cache key is the fixed 64-bit encoding of the
+// file number. On success "*handle" refers to the entry; on failure it is
+// left NULL and the error from opening the file or table is returned.
+Status TableCache::FindTable(uint64_t file_number, uint64_t file_size,
+                             Cache::Handle** handle) {
+  char buf[sizeof(file_number)];
+  EncodeFixed64(buf, file_number);
+  Slice key(buf, sizeof(buf));
+
+  *handle = cache_->Lookup(key);
+  if (*handle != NULL) {
+    return Status::OK();
+  }
+
+  RandomAccessFile* file = NULL;
+  Table* table = NULL;
+  std::string fname = TableFileName(dbname_, file_number);
+  Status s = env_->NewRandomAccessFile(fname, &file);
+  if (!s.ok()) {
+    // Fall back to the file name produced by SSTTableFileName(). If that
+    // fails too, the error from the first attempt is the one reported.
+    std::string old_fname = SSTTableFileName(dbname_, file_number);
+    if (env_->NewRandomAccessFile(old_fname, &file).ok()) {
+      s = Status::OK();
+    }
+  }
+  if (s.ok()) {
+    s = Table::Open(*options_, file, file_size, &table);
+  }
+
+  if (s.ok()) {
+    TableAndFile* tf = new TableAndFile;
+    tf->file = file;
+    tf->table = table;
+    *handle = cache_->Insert(key, tf, 1, &DeleteEntry);
+  } else {
+    assert(table == NULL);
+    delete file;
+    // We do not cache error results so that if the error is transient,
+    // or somebody repairs the file, we recover automatically.
+  }
+  return s;
+}
+
+// Returns an iterator over the table for "file_number". If "tableptr" is
+// non-NULL it receives the underlying Table, or NULL when an error
+// iterator is returned. The cache handle is released through a cleanup
+// registered on the returned iterator.
+Iterator* TableCache::NewIterator(const ReadOptions& options,
+                                  uint64_t file_number,
+                                  uint64_t file_size,
+                                  Table** tableptr) {
+  const bool want_table = (tableptr != NULL);
+  if (want_table) {
+    *tableptr = NULL;
+  }
+
+  Cache::Handle* handle = NULL;
+  Status s = FindTable(file_number, file_size, &handle);
+  if (!s.ok()) {
+    return NewErrorIterator(s);
+  }
+
+  TableAndFile* entry = reinterpret_cast<TableAndFile*>(cache_->Value(handle));
+  Table* table = entry->table;
+  Iterator* result = table->NewIterator(options);
+  result->RegisterCleanup(&UnrefEntry, cache_, handle);
+  if (want_table) {
+    *tableptr = table;
+  }
+  return result;
+}
+
+// Finds the table for "file_number" and forwards the lookup of internal
+// key "k" to Table::InternalGet(), passing "arg" and "saver" through.
+// The cache handle is released before returning.
+Status TableCache::Get(const ReadOptions& options,
+                       uint64_t file_number,
+                       uint64_t file_size,
+                       const Slice& k,
+                       void* arg,
+                       void (*saver)(void*, const Slice&, const Slice&)) {
+  Cache::Handle* handle = NULL;
+  Status s = FindTable(file_number, file_size, &handle);
+  if (!s.ok()) {
+    return s;
+  }
+
+  TableAndFile* entry = reinterpret_cast<TableAndFile*>(cache_->Value(handle));
+  s = entry->table->InternalGet(options, k, arg, saver);
+  cache_->Release(handle);
+  return s;
+}
+
+// Erases the cache entry keyed by the fixed 64-bit encoding of
+// "file_number" (the same key FindTable() uses).
+void TableCache::Evict(uint64_t file_number) {
+  char buf[sizeof(file_number)];
+  EncodeFixed64(buf, file_number);
+  const Slice key(buf, sizeof(buf));
+  cache_->Erase(key);
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/table_cache.h b/src/leveldb/db/table_cache.h
new file mode 100644
index 0000000000..8cf4aaf12d
--- /dev/null
+++ b/src/leveldb/db/table_cache.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread-safe (provides internal synchronization)
+
+#ifndef STORAGE_LEVELDB_DB_TABLE_CACHE_H_
+#define STORAGE_LEVELDB_DB_TABLE_CACHE_H_
+
+#include <string>
+#include <stdint.h>
+#include "db/dbformat.h"
+#include "leveldb/cache.h"
+#include "leveldb/table.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+class Env;
+
+// Cache of opened table files, keyed by file number.
+class TableCache {
+ public:
+ // "entries" is the capacity handed to NewLRUCache(). "options" is retained
+ // by pointer; its env member is used to open files.
+ TableCache(const std::string& dbname, const Options* options, int entries);
+ ~TableCache();
+
+ // Return an iterator for the specified file number (the corresponding
+ // file length must be exactly "file_size" bytes). If "tableptr" is
+ // non-NULL, also sets "*tableptr" to point to the Table object
+ // underlying the returned iterator, or NULL if no Table object underlies
+ // the returned iterator. The returned "*tableptr" object is owned by
+ // the cache and should not be deleted, and is valid for as long as the
+ // returned iterator is live.
+ Iterator* NewIterator(const ReadOptions& options,
+ uint64_t file_number,
+ uint64_t file_size,
+ Table** tableptr = NULL);
+
+ // If a seek to internal key "k" in specified file finds an entry,
+ // call (*handle_result)(arg, found_key, found_value).
+ Status Get(const ReadOptions& options,
+ uint64_t file_number,
+ uint64_t file_size,
+ const Slice& k,
+ void* arg,
+ void (*handle_result)(void*, const Slice&, const Slice&));
+
+ // Evict any entry for the specified file number
+ void Evict(uint64_t file_number);
+
+ private:
+ Env* const env_;
+ const std::string dbname_;
+ const Options* options_;
+ // Created in the constructor and deleted in the destructor.
+ Cache* cache_;
+
+ // Looks up (opening and inserting on a miss) the cache entry for the file
+ // and stores its handle through the last argument.
+ Status FindTable(uint64_t file_number, uint64_t file_size, Cache::Handle**);
+};
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_TABLE_CACHE_H_
diff --git a/src/leveldb/db/version_edit.cc b/src/leveldb/db/version_edit.cc
new file mode 100644
index 0000000000..f10a2d58b2
--- /dev/null
+++ b/src/leveldb/db/version_edit.cc
@@ -0,0 +1,266 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+
+#include "db/version_set.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+// Tag numbers for serialized VersionEdit. These numbers are written to
+// disk and should not be changed.
+// Each record in an encoded VersionEdit starts with one of these tags as a
+// varint32, followed by the tag-specific payload.
+enum Tag {
+ kComparator = 1,
+ kLogNumber = 2,
+ kNextFileNumber = 3,
+ kLastSequence = 4,
+ kCompactPointer = 5,
+ kDeletedFile = 6,
+ kNewFile = 7,
+ // 8 was used for large value refs
+ kPrevLogNumber = 9
+};
+
+// Resets the edit to the empty state: all scalar fields zeroed, all
+// "has_*" flags false, and every collection emptied.
+void VersionEdit::Clear() {
+  comparator_.clear();
+  log_number_ = 0;
+  prev_log_number_ = 0;
+  last_sequence_ = 0;
+  next_file_number_ = 0;
+  has_comparator_ = false;
+  has_log_number_ = false;
+  has_prev_log_number_ = false;
+  has_next_file_number_ = false;
+  has_last_sequence_ = false;
+  // Compaction pointers must be dropped too; otherwise a reused edit (or a
+  // second DecodeFrom(), which starts with Clear()) would keep stale
+  // pointers and append new ones after them.
+  compact_pointers_.clear();
+  deleted_files_.clear();
+  new_files_.clear();
+}
+
+// Appends the serialized form of this edit to "*dst". Each record is a
+// varint32 tag followed by its payload. Records are emitted in a fixed
+// order: scalar fields that are set, then compaction pointers, deleted
+// files, and new files.
+void VersionEdit::EncodeTo(std::string* dst) const {
+  if (has_comparator_) {
+    PutVarint32(dst, kComparator);
+    PutLengthPrefixedSlice(dst, comparator_);
+  }
+  if (has_log_number_) {
+    PutVarint32(dst, kLogNumber);
+    PutVarint64(dst, log_number_);
+  }
+  if (has_prev_log_number_) {
+    PutVarint32(dst, kPrevLogNumber);
+    PutVarint64(dst, prev_log_number_);
+  }
+  if (has_next_file_number_) {
+    PutVarint32(dst, kNextFileNumber);
+    PutVarint64(dst, next_file_number_);
+  }
+  if (has_last_sequence_) {
+    PutVarint32(dst, kLastSequence);
+    PutVarint64(dst, last_sequence_);
+  }
+
+  // Compaction pointers: level, then the encoded internal key.
+  for (size_t i = 0; i < compact_pointers_.size(); i++) {
+    const std::pair<int, InternalKey>& pointer = compact_pointers_[i];
+    PutVarint32(dst, kCompactPointer);
+    PutVarint32(dst, pointer.first);
+    PutLengthPrefixedSlice(dst, pointer.second.Encode());
+  }
+
+  // Deleted files: level, then file number.
+  DeletedFileSet::const_iterator del = deleted_files_.begin();
+  while (del != deleted_files_.end()) {
+    PutVarint32(dst, kDeletedFile);
+    PutVarint32(dst, del->first);
+    PutVarint64(dst, del->second);
+    ++del;
+  }
+
+  // New files: level, file number, file size, smallest and largest keys.
+  for (size_t i = 0; i < new_files_.size(); i++) {
+    const std::pair<int, FileMetaData>& added = new_files_[i];
+    const FileMetaData& meta = added.second;
+    PutVarint32(dst, kNewFile);
+    PutVarint32(dst, added.first);
+    PutVarint64(dst, meta.number);
+    PutVarint64(dst, meta.file_size);
+    PutLengthPrefixedSlice(dst, meta.smallest.Encode());
+    PutLengthPrefixedSlice(dst, meta.largest.Encode());
+  }
+}
+
+// Reads a length-prefixed slice from "*input" and decodes it into "*dst".
+// Returns false, leaving "*dst" untouched, if no such slice can be read.
+static bool GetInternalKey(Slice* input, InternalKey* dst) {
+  Slice encoded;
+  if (!GetLengthPrefixedSlice(input, &encoded)) {
+    return false;
+  }
+  dst->DecodeFrom(encoded);
+  return true;
+}
+
+// Reads a varint32 level from "*input". Succeeds only when the value is
+// below config::kNumLevels; "*level" is written only on success.
+static bool GetLevel(Slice* input, int* level) {
+  uint32_t v;
+  if (!GetVarint32(input, &v) || v >= config::kNumLevels) {
+    return false;
+  }
+  *level = v;
+  return true;
+}
+
+// Replaces the contents of this edit with the records parsed from "src".
+// Returns OK on success, or Status::Corruption("VersionEdit", what) naming
+// the first record that could not be parsed. Parsing stops at the first
+// error; fields decoded before it remain set.
+Status VersionEdit::DecodeFrom(const Slice& src) {
+  Clear();
+  Slice input = src;
+  const char* msg = NULL;
+  uint32_t tag;
+
+  // Scratch space reused across records.
+  int level;
+  uint64_t number;
+  FileMetaData f;
+  Slice str;
+  InternalKey key;
+
+  while (msg == NULL && GetVarint32(&input, &tag)) {
+    switch (tag) {
+      case kComparator:
+        if (!GetLengthPrefixedSlice(&input, &str)) {
+          msg = "comparator name";
+        } else {
+          comparator_ = str.ToString();
+          has_comparator_ = true;
+        }
+        break;
+
+      case kLogNumber:
+        if (!GetVarint64(&input, &log_number_)) {
+          msg = "log number";
+        } else {
+          has_log_number_ = true;
+        }
+        break;
+
+      case kPrevLogNumber:
+        if (!GetVarint64(&input, &prev_log_number_)) {
+          msg = "previous log number";
+        } else {
+          has_prev_log_number_ = true;
+        }
+        break;
+
+      case kNextFileNumber:
+        if (!GetVarint64(&input, &next_file_number_)) {
+          msg = "next file number";
+        } else {
+          has_next_file_number_ = true;
+        }
+        break;
+
+      case kLastSequence:
+        if (!GetVarint64(&input, &last_sequence_)) {
+          msg = "last sequence number";
+        } else {
+          has_last_sequence_ = true;
+        }
+        break;
+
+      case kCompactPointer:
+        if (!(GetLevel(&input, &level) &&
+              GetInternalKey(&input, &key))) {
+          msg = "compaction pointer";
+        } else {
+          compact_pointers_.push_back(std::make_pair(level, key));
+        }
+        break;
+
+      case kDeletedFile:
+        if (!(GetLevel(&input, &level) &&
+              GetVarint64(&input, &number))) {
+          msg = "deleted file";
+        } else {
+          deleted_files_.insert(std::make_pair(level, number));
+        }
+        break;
+
+      case kNewFile:
+        if (!(GetLevel(&input, &level) &&
+              GetVarint64(&input, &f.number) &&
+              GetVarint64(&input, &f.file_size) &&
+              GetInternalKey(&input, &f.smallest) &&
+              GetInternalKey(&input, &f.largest))) {
+          msg = "new-file entry";
+        } else {
+          new_files_.push_back(std::make_pair(level, f));
+        }
+        break;
+
+      default:
+        msg = "unknown tag";
+        break;
+    }
+  }
+
+  // The loop also ends when a tag cannot be read; leftover bytes at that
+  // point mean the tag itself was malformed.
+  if (msg == NULL && !input.empty()) {
+    msg = "invalid tag";
+  }
+
+  if (msg == NULL) {
+    return Status();
+  }
+  return Status::Corruption("VersionEdit", msg);
+}
+
+// Returns a multi-line, human-readable dump of this edit. Only fields that
+// are set are printed, one per line, followed by every compaction pointer,
+// deleted file and added file.
+std::string VersionEdit::DebugString() const {
+  std::string r("VersionEdit {");
+  if (has_comparator_) {
+    r.append("\n Comparator: ");
+    r.append(comparator_);
+  }
+  if (has_log_number_) {
+    r.append("\n LogNumber: ");
+    AppendNumberTo(&r, log_number_);
+  }
+  if (has_prev_log_number_) {
+    r.append("\n PrevLogNumber: ");
+    AppendNumberTo(&r, prev_log_number_);
+  }
+  if (has_next_file_number_) {
+    r.append("\n NextFile: ");
+    AppendNumberTo(&r, next_file_number_);
+  }
+  if (has_last_sequence_) {
+    r.append("\n LastSeq: ");
+    AppendNumberTo(&r, last_sequence_);
+  }
+
+  // <level> <key>
+  for (size_t i = 0; i < compact_pointers_.size(); i++) {
+    const std::pair<int, InternalKey>& pointer = compact_pointers_[i];
+    r.append("\n CompactPointer: ");
+    AppendNumberTo(&r, pointer.first);
+    r.append(" ");
+    r.append(pointer.second.DebugString());
+  }
+
+  // <level> <file number>
+  DeletedFileSet::const_iterator del = deleted_files_.begin();
+  while (del != deleted_files_.end()) {
+    r.append("\n DeleteFile: ");
+    AppendNumberTo(&r, del->first);
+    r.append(" ");
+    AppendNumberTo(&r, del->second);
+    ++del;
+  }
+
+  // <level> <file number> <file size> <smallest> .. <largest>
+  for (size_t i = 0; i < new_files_.size(); i++) {
+    const std::pair<int, FileMetaData>& added = new_files_[i];
+    const FileMetaData& meta = added.second;
+    r.append("\n AddFile: ");
+    AppendNumberTo(&r, added.first);
+    r.append(" ");
+    AppendNumberTo(&r, meta.number);
+    r.append(" ");
+    AppendNumberTo(&r, meta.file_size);
+    r.append(" ");
+    r.append(meta.smallest.DebugString());
+    r.append(" .. ");
+    r.append(meta.largest.DebugString());
+  }
+  r.append("\n}\n");
+  return r;
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/version_edit.h b/src/leveldb/db/version_edit.h
new file mode 100644
index 0000000000..eaef77b327
--- /dev/null
+++ b/src/leveldb/db/version_edit.h
@@ -0,0 +1,107 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_VERSION_EDIT_H_
+#define STORAGE_LEVELDB_DB_VERSION_EDIT_H_
+
+#include <set>
+#include <utility>
+#include <vector>
+#include "db/dbformat.h"
+
+namespace leveldb {
+
+class VersionSet;
+
+// Metadata describing one table file.
+struct FileMetaData {
+  int refs;
+  int allowed_seeks;          // Seeks allowed until compaction
+  uint64_t number;
+  uint64_t file_size;         // File size in bytes
+  InternalKey smallest;       // Smallest internal key served by table
+  InternalKey largest;        // Largest internal key served by table
+
+  // "number" is zero-initialized as well, so a default-constructed object
+  // never carries an indeterminate file number.
+  FileMetaData() : refs(0), allowed_seeks(1 << 30), number(0), file_size(0) { }
+};
+
+// A set of changes to version state: optional scalar fields (each guarded
+// by a "has_*" flag), compaction pointers, deleted files and added files.
+// Can be serialized with EncodeTo() and parsed back with DecodeFrom().
+class VersionEdit {
+ public:
+ VersionEdit() { Clear(); }
+ ~VersionEdit() { }
+
+ // Resets this edit to the empty state.
+ void Clear();
+
+ // Each setter below stores the value and marks the field as present.
+ void SetComparatorName(const Slice& name) {
+ has_comparator_ = true;
+ comparator_ = name.ToString();
+ }
+ void SetLogNumber(uint64_t num) {
+ has_log_number_ = true;
+ log_number_ = num;
+ }
+ void SetPrevLogNumber(uint64_t num) {
+ has_prev_log_number_ = true;
+ prev_log_number_ = num;
+ }
+ void SetNextFile(uint64_t num) {
+ has_next_file_number_ = true;
+ next_file_number_ = num;
+ }
+ void SetLastSequence(SequenceNumber seq) {
+ has_last_sequence_ = true;
+ last_sequence_ = seq;
+ }
+ // Appends a (level, key) compaction pointer; earlier entries are kept.
+ void SetCompactPointer(int level, const InternalKey& key) {
+ compact_pointers_.push_back(std::make_pair(level, key));
+ }
+
+ // Add the specified file at the specified level.
+ // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
+ // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
+ void AddFile(int level, uint64_t file,
+ uint64_t file_size,
+ const InternalKey& smallest,
+ const InternalKey& largest) {
+ FileMetaData f;
+ f.number = file;
+ f.file_size = file_size;
+ f.smallest = smallest;
+ f.largest = largest;
+ new_files_.push_back(std::make_pair(level, f));
+ }
+
+ // Delete the specified "file" from the specified "level".
+ void DeleteFile(int level, uint64_t file) {
+ deleted_files_.insert(std::make_pair(level, file));
+ }
+
+ // Appends the serialized form of this edit to "*dst".
+ void EncodeTo(std::string* dst) const;
+ // Replaces the contents of this edit with the records parsed from "src".
+ Status DecodeFrom(const Slice& src);
+
+ // Returns a human-readable dump of the edit.
+ std::string DebugString() const;
+
+ private:
+ friend class VersionSet;
+
+ // Set of (level, file number) pairs.
+ typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;
+
+ std::string comparator_;
+ uint64_t log_number_;
+ uint64_t prev_log_number_;
+ uint64_t next_file_number_;
+ SequenceNumber last_sequence_;
+ bool has_comparator_;
+ bool has_log_number_;
+ bool has_prev_log_number_;
+ bool has_next_file_number_;
+ bool has_last_sequence_;
+
+ // Each entry pairs a level with its payload.
+ std::vector< std::pair<int, InternalKey> > compact_pointers_;
+ DeletedFileSet deleted_files_;
+ std::vector< std::pair<int, FileMetaData> > new_files_;
+};
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_VERSION_EDIT_H_
diff --git a/src/leveldb/db/version_edit_test.cc b/src/leveldb/db/version_edit_test.cc
new file mode 100644
index 0000000000..280310b49d
--- /dev/null
+++ b/src/leveldb/db/version_edit_test.cc
@@ -0,0 +1,46 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+// Round-trip check: encode "edit", decode the bytes into a fresh
+// VersionEdit, encode that, and require both encodings to be identical.
+static void TestEncodeDecode(const VersionEdit& edit) {
+  std::string encoded;
+  edit.EncodeTo(&encoded);
+
+  VersionEdit parsed;
+  Status s = parsed.DecodeFrom(encoded);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+
+  std::string encoded2;
+  parsed.EncodeTo(&encoded2);
+  ASSERT_EQ(encoded, encoded2);
+}
+
+// Empty fixture required by the TEST macro.
+class VersionEditTest { };
+
+// Round-trips an edit as it grows: first empty, then with one to three
+// files / deletions / compaction pointers, and finally with all scalar
+// fields set. kBig pushes every number past 32 bits.
+TEST(VersionEditTest, EncodeDecode) {
+  static const uint64_t kBig = 1ull << 50;
+
+  VersionEdit edit;
+  for (int round = 0; round < 4; round++) {
+    // Check the edit as built so far before adding this round's entries.
+    TestEncodeDecode(edit);
+    edit.AddFile(3, kBig + 300 + round, kBig + 400 + round,
+                 InternalKey("foo", kBig + 500 + round, kTypeValue),
+                 InternalKey("zoo", kBig + 600 + round, kTypeDeletion));
+    edit.DeleteFile(4, kBig + 700 + round);
+    edit.SetCompactPointer(round,
+                           InternalKey("x", kBig + 900 + round, kTypeValue));
+  }
+
+  edit.SetComparatorName("foo");
+  edit.SetLogNumber(kBig + 100);
+  edit.SetNextFile(kBig + 200);
+  edit.SetLastSequence(kBig + 1000);
+  TestEncodeDecode(edit);
+}
+
+} // namespace leveldb
+
+// Runs every registered test; the command-line arguments are not used.
+int main(int argc, char** argv) {
+  const int exit_code = leveldb::test::RunAllTests();
+  return exit_code;
+}
diff --git a/src/leveldb/db/version_set.cc b/src/leveldb/db/version_set.cc
new file mode 100644
index 0000000000..517edd3b18
--- /dev/null
+++ b/src/leveldb/db/version_set.cc
@@ -0,0 +1,1498 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+
+#include <algorithm>
+#include <stdio.h>
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "leveldb/env.h"
+#include "leveldb/table_builder.h"
+#include "table/merger.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+// Size in bytes (2 * 2^20) that MaxFileSizeForLevel() returns for every
+// level; the two limits below are expressed as multiples of it.
+static const int kTargetFileSize = 2 * 1048576;
+
+// Maximum bytes of overlaps in grandparent (i.e., level+2) before we
+// stop building a single file in a level->level+1 compaction.
+static const int64_t kMaxGrandParentOverlapBytes = 10 * kTargetFileSize;
+
+// Maximum number of bytes in all compacted files. We avoid expanding
+// the lower level file set of a compaction if it would make the
+// total compaction cover more than this many bytes.
+static const int64_t kExpandedCompactionByteSizeLimit = 25 * kTargetFileSize;
+
+// Returns the byte budget for "level": 10 * 2^20 bytes for levels 0 and 1,
+// multiplied by 10 for each level above 1.
+static double MaxBytesForLevel(int level) {
+  // Note: the result for level zero is not really used since we set
+  // the level-0 compaction threshold based on number of files.
+  double budget = 10 * 1048576.0;
+  for (int remaining = level; remaining > 1; remaining--) {
+    budget *= 10;
+  }
+  return budget;
+}
+
+// Returns the target size of a table file. "level" is currently ignored:
+// every level gets kTargetFileSize.
+static uint64_t MaxFileSizeForLevel(int level) {
+  // We could vary per level to reduce number of files?
+  return kTargetFileSize;
+}
+
+// Returns the sum of file_size over all entries of "files".
+static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+  int64_t total = 0;
+  for (std::vector<FileMetaData*>::const_iterator it = files.begin();
+       it != files.end();
+       ++it) {
+    total += (*it)->file_size;
+  }
+  return total;
+}
+
+namespace {
+// Formats "s" as "{a,b,c}" in the set's iteration order; an empty set
+// yields "{}".
+std::string IntSetToString(const std::set<uint64_t>& s) {
+  std::string result = "{";
+  std::set<uint64_t>::const_iterator it = s.begin();
+  while (it != s.end()) {
+    // Anything beyond the opening brace means an element was already
+    // written, so a separator is needed.
+    if (result.size() > 1) {
+      result += ",";
+    }
+    result += NumberToString(*it);
+    ++it;
+  }
+  result += "}";
+  return result;
+}
+}  // namespace
+
+// Unlinks this version from the doubly-linked list it belongs to and drops
+// one reference on every file it holds, deleting files whose count reaches
+// zero.
+// REQUIRES: refs_ == 0
+Version::~Version() {
+  assert(refs_ == 0);
+
+  // Remove from linked list
+  next_->prev_ = prev_;
+  prev_->next_ = next_;
+
+  // Drop references to files
+  for (int level = 0; level < config::kNumLevels; level++) {
+    const std::vector<FileMetaData*>& level_files = files_[level];
+    for (size_t i = 0; i < level_files.size(); i++) {
+      FileMetaData* meta = level_files[i];
+      assert(meta->refs > 0);
+      meta->refs--;
+      if (meta->refs <= 0) {
+        delete meta;
+      }
+    }
+  }
+}
+
+// Binary search over "files": returns the smallest index i such that
+// files[i]->largest >= key under "icmp", or files.size() if there is none.
+int FindFile(const InternalKeyComparator& icmp,
+             const std::vector<FileMetaData*>& files,
+             const Slice& key) {
+  uint32_t lo = 0;
+  uint32_t hi = files.size();
+  while (lo < hi) {
+    const uint32_t mid = (lo + hi) / 2;
+    const FileMetaData* f = files[mid];
+    const bool mid_before_key =
+        icmp.InternalKeyComparator::Compare(f->largest.Encode(), key) < 0;
+    if (mid_before_key) {
+      // Key at "mid.largest" is < "target".  Therefore all
+      // files at or before "mid" are uninteresting.
+      lo = mid + 1;
+    } else {
+      // Key at "mid.largest" is >= "target".  Therefore all files
+      // after "mid" are uninteresting.
+      hi = mid;
+    }
+  }
+  return hi;
+}
+
+// Returns true iff "*user_key" sorts strictly after the largest user key
+// of "f" under "ucmp".
+static bool AfterFile(const Comparator* ucmp,
+                      const Slice* user_key, const FileMetaData* f) {
+  // NULL user_key occurs before all keys and is therefore never after *f
+  if (user_key == NULL) {
+    return false;
+  }
+  return ucmp->Compare(*user_key, f->largest.user_key()) > 0;
+}
+
+// Returns true iff "*user_key" sorts strictly before the smallest user key
+// of "f" under "ucmp".
+static bool BeforeFile(const Comparator* ucmp,
+                       const Slice* user_key, const FileMetaData* f) {
+  // NULL user_key occurs after all keys and is therefore never before *f
+  if (user_key == NULL) {
+    return false;
+  }
+  return ucmp->Compare(*user_key, f->smallest.user_key()) < 0;
+}
+
+// Returns true iff some file in "files" overlaps the user-key range
+// [*smallest_user_key, *largest_user_key]. A NULL smallest_user_key is
+// treated as a key before all keys and a NULL largest_user_key as a key
+// after all keys. When "disjoint_sorted_files" is true a binary search is
+// used; otherwise every file is checked.
+bool SomeFileOverlapsRange(
+    const InternalKeyComparator& icmp,
+    bool disjoint_sorted_files,
+    const std::vector<FileMetaData*>& files,
+    const Slice* smallest_user_key,
+    const Slice* largest_user_key) {
+  const Comparator* ucmp = icmp.user_comparator();
+
+  if (disjoint_sorted_files) {
+    // Binary search over file list
+    uint32_t index = 0;
+    if (smallest_user_key != NULL) {
+      // Find the earliest possible internal key for smallest_user_key
+      InternalKey small(*smallest_user_key, kMaxSequenceNumber,
+                        kValueTypeForSeek);
+      index = FindFile(icmp, files, small.Encode());
+    }
+
+    if (index >= files.size()) {
+      // beginning of range is after all files, so no overlap.
+      return false;
+    }
+    return !BeforeFile(ucmp, largest_user_key, files[index]);
+  }
+
+  // Need to check against all files
+  for (size_t i = 0; i < files.size(); i++) {
+    const FileMetaData* f = files[i];
+    // Overlap unless the range starts after the file or ends before it.
+    if (!AfterFile(ucmp, smallest_user_key, f) &&
+        !BeforeFile(ucmp, largest_user_key, f)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// An internal iterator.  For a given version/level pair, yields
+// information about the files in the level.  For a given entry, key()
+// is the largest key that occurs in the file, and value() is a
+// 16-byte value containing the file number and file size, both
+// encoded using EncodeFixed64.
+class Version::LevelFileNumIterator : public Iterator {
+ public:
+  // Starts out invalid (positioned one past the last file).
+  LevelFileNumIterator(const InternalKeyComparator& icmp,
+                       const std::vector<FileMetaData*>* flist)
+      : icmp_(icmp), flist_(flist), index_(flist->size()) {}
+
+  virtual bool Valid() const { return index_ < flist_->size(); }
+
+  virtual void Seek(const Slice& target) {
+    index_ = FindFile(icmp_, *flist_, target);
+  }
+
+  virtual void SeekToFirst() { index_ = 0; }
+
+  virtual void SeekToLast() {
+    // For an empty list index 0 equals size(), so the iterator is invalid.
+    if (flist_->empty()) {
+      index_ = 0;
+    } else {
+      index_ = flist_->size() - 1;
+    }
+  }
+
+  virtual void Next() {
+    assert(Valid());
+    index_++;
+  }
+
+  virtual void Prev() {
+    assert(Valid());
+    if (index_ > 0) {
+      index_--;
+    } else {
+      index_ = flist_->size();  // Marks as invalid
+    }
+  }
+
+  virtual Slice key() const {
+    assert(Valid());
+    return (*flist_)[index_]->largest.Encode();
+  }
+
+  // The returned slice points into value_buf_, which is overwritten by the
+  // next call to value().
+  virtual Slice value() const {
+    assert(Valid());
+    const FileMetaData* meta = (*flist_)[index_];
+    EncodeFixed64(value_buf_, meta->number);
+    EncodeFixed64(value_buf_ + 8, meta->file_size);
+    return Slice(value_buf_, sizeof(value_buf_));
+  }
+
+  virtual Status status() const { return Status::OK(); }
+
+ private:
+  const InternalKeyComparator icmp_;
+  const std::vector<FileMetaData*>* const flist_;
+  uint32_t index_;
+
+  // Backing store for value().  Holds the file number and size.
+  mutable char value_buf_[16];
+};
+
+// Block function for the two-level iterator: "arg" is the TableCache and
+// "file_value" the 16-byte (file number, file size) value produced by
+// LevelFileNumIterator. Returns an error iterator when the value has any
+// other size.
+static Iterator* GetFileIterator(void* arg,
+                                 const ReadOptions& options,
+                                 const Slice& file_value) {
+  TableCache* cache = reinterpret_cast<TableCache*>(arg);
+  if (file_value.size() == 16) {
+    const char* encoded = file_value.data();
+    return cache->NewIterator(options,
+                              DecodeFixed64(encoded),
+                              DecodeFixed64(encoded + 8));
+  }
+  return NewErrorIterator(
+      Status::Corruption("FileReader invoked with unexpected value"));
+}
+
+// Returns a two-level iterator over "level": the first level walks the
+// level's file list, the second opens each file through the table cache.
+Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
+                                            int level) const {
+  Iterator* file_index =
+      new LevelFileNumIterator(vset_->icmp_, &files_[level]);
+  return NewTwoLevelIterator(file_index, &GetFileIterator,
+                             vset_->table_cache_, options);
+}
+
+// Appends to "*iters" one iterator per level-0 file plus one concatenating
+// iterator for each non-empty level above 0.
+void Version::AddIterators(const ReadOptions& options,
+                           std::vector<Iterator*>* iters) {
+  // Merge all level zero files together since they may overlap
+  const std::vector<FileMetaData*>& level0 = files_[0];
+  for (size_t i = 0; i < level0.size(); i++) {
+    const FileMetaData* f = level0[i];
+    iters->push_back(
+        vset_->table_cache_->NewIterator(options, f->number, f->file_size));
+  }
+
+  // For levels > 0, we can use a concatenating iterator that sequentially
+  // walks through the non-overlapping files in the level, opening them
+  // lazily.
+  for (int level = 1; level < config::kNumLevels; level++) {
+    if (files_[level].empty()) {
+      continue;
+    }
+    iters->push_back(NewConcatenatingIterator(options, level));
+  }
+}
+
+// Callback from TableCache::Get()
+namespace {
+// Outcome recorded by SaveValue() in Saver::state.
+enum SaverState {
+ kNotFound,
+ // Entry for the user key has type kTypeValue; the value was copied out.
+ kFound,
+ // Entry for the user key has some other type.
+ kDeleted,
+ // The internal key handed to SaveValue() could not be parsed.
+ kCorrupt,
+};
+// Argument block passed through TableCache::Get() to SaveValue().
+struct Saver {
+ SaverState state;
+ const Comparator* ucmp;
+ Slice user_key;
+ std::string* value;
+};
+}
+// "arg" is a Saver. Marks it kCorrupt if "ikey" cannot be parsed. If the
+// parsed user key equals the saver's user key, records kFound (copying "v"
+// into *value) for kTypeValue entries and kDeleted otherwise. Any other
+// key leaves the saver unchanged.
+static void SaveValue(void* arg, const Slice& ikey, const Slice& v) {
+  Saver* s = reinterpret_cast<Saver*>(arg);
+  ParsedInternalKey parsed_key;
+  if (!ParseInternalKey(ikey, &parsed_key)) {
+    s->state = kCorrupt;
+    return;
+  }
+  if (s->ucmp->Compare(parsed_key.user_key, s->user_key) != 0) {
+    return;
+  }
+  if (parsed_key.type == kTypeValue) {
+    s->state = kFound;
+    s->value->assign(v.data(), v.size());
+  } else {
+    s->state = kDeleted;
+  }
+}
+
+// Sort predicate that orders files by descending file number.
+static bool NewestFirst(FileMetaData* a, FileMetaData* b) {
+  return b->number < a->number;
+}
+
+// Calls (*func)(arg, level, f) for every file that may contain "user_key":
+// first the overlapping level-0 files from highest to lowest file number,
+// then at most one file per higher level in increasing level order.
+// Stops as soon as "func" returns false.
+void Version::ForEachOverlapping(Slice user_key, Slice internal_key,
+                                 void* arg,
+                                 bool (*func)(void*, int, FileMetaData*)) {
+  // TODO(sanjay): Change Version::Get() to use this function.
+  const Comparator* ucmp = vset_->icmp_.user_comparator();
+
+  // Search level-0 in order from newest to oldest.
+  const std::vector<FileMetaData*>& level0 = files_[0];
+  std::vector<FileMetaData*> candidates;
+  candidates.reserve(level0.size());
+  for (uint32_t i = 0; i < level0.size(); i++) {
+    FileMetaData* f = level0[i];
+    if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 &&
+        ucmp->Compare(user_key, f->largest.user_key()) <= 0) {
+      candidates.push_back(f);
+    }
+  }
+  std::sort(candidates.begin(), candidates.end(), NewestFirst);
+  for (uint32_t i = 0; i < candidates.size(); i++) {
+    if (!(*func)(arg, 0, candidates[i])) {
+      return;
+    }
+  }
+
+  // Search other levels.
+  for (int level = 1; level < config::kNumLevels; level++) {
+    const std::vector<FileMetaData*>& level_files = files_[level];
+    const size_t num_files = level_files.size();
+    if (num_files == 0) continue;
+
+    // Binary search to find earliest index whose largest key >= internal_key.
+    const uint32_t index = FindFile(vset_->icmp_, level_files, internal_key);
+    if (index >= num_files) continue;
+
+    FileMetaData* f = level_files[index];
+    if (ucmp->Compare(user_key, f->smallest.user_key()) < 0) {
+      // All of "f" is past any data for user_key
+      continue;
+    }
+    if (!(*func)(arg, level, f)) {
+      return;
+    }
+  }
+}
+
+Status Version::Get(const ReadOptions& options,
+ const LookupKey& k,
+ std::string* value,
+ GetStats* stats) {  // Looks k up level by level; on a hit *value is filled. *stats names the first file probed when more than one file had to be read.
+ Slice ikey = k.internal_key();
+ Slice user_key = k.user_key();
+ const Comparator* ucmp = vset_->icmp_.user_comparator();
+ Status s;
+
+ stats->seek_file = NULL;
+ stats->seek_file_level = -1;
+ FileMetaData* last_file_read = NULL;
+ int last_file_read_level = -1;
+
+ // We can search level-by-level since entries never hop across
+ // levels. Therefore we are guaranteed that if we find data
+ // in a smaller level, later levels are irrelevant.
+ std::vector<FileMetaData*> tmp;
+ FileMetaData* tmp2;
+ for (int level = 0; level < config::kNumLevels; level++) {
+ size_t num_files = files_[level].size();
+ if (num_files == 0) continue;
+
+ // Get the list of files to search in this level
+ FileMetaData* const* files = &files_[level][0];
+ if (level == 0) {
+ // Level-0 files may overlap each other. Find all files that
+ // overlap user_key and process them in order from newest to oldest.
+ tmp.reserve(num_files);
+ for (uint32_t i = 0; i < num_files; i++) {
+ FileMetaData* f = files[i];
+ if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 &&
+ ucmp->Compare(user_key, f->largest.user_key()) <= 0) {
+ tmp.push_back(f);
+ }
+ }
+ if (tmp.empty()) continue;
+
+ std::sort(tmp.begin(), tmp.end(), NewestFirst);
+ files = &tmp[0];  // From here on "files" aliases tmp's storage
+ num_files = tmp.size();
+ } else {
+ // Binary search to find earliest index whose largest key >= ikey.
+ uint32_t index = FindFile(vset_->icmp_, files_[level], ikey);
+ if (index >= num_files) {
+ files = NULL;
+ num_files = 0;
+ } else {
+ tmp2 = files[index];
+ if (ucmp->Compare(user_key, tmp2->smallest.user_key()) < 0) {
+ // All of "tmp2" is past any data for user_key
+ files = NULL;
+ num_files = 0;
+ } else {
+ files = &tmp2;  // Single-element "array" holding the one candidate file
+ num_files = 1;
+ }
+ }
+ }
+
+ for (uint32_t i = 0; i < num_files; ++i) {
+ if (last_file_read != NULL && stats->seek_file == NULL) {
+ // We have had more than one seek for this read. Charge the 1st file.
+ stats->seek_file = last_file_read;
+ stats->seek_file_level = last_file_read_level;
+ }
+
+ FileMetaData* f = files[i];
+ last_file_read = f;
+ last_file_read_level = level;
+
+ Saver saver;
+ saver.state = kNotFound;
+ saver.ucmp = ucmp;
+ saver.user_key = user_key;
+ saver.value = value;
+ s = vset_->table_cache_->Get(options, f->number, f->file_size,
+ ikey, &saver, SaveValue);
+ if (!s.ok()) {
+ return s;  // Table read failed; propagate the error unchanged
+ }
+ switch (saver.state) {
+ case kNotFound:
+ break; // Keep searching in other files
+ case kFound:
+ return s;  // s is OK here and *value was filled by SaveValue
+ case kDeleted:
+ s = Status::NotFound(Slice()); // Use empty error message for speed
+ return s;
+ case kCorrupt:
+ s = Status::Corruption("corrupted key for ", user_key);
+ return s;
+ }
+ }
+ }
+
+ return Status::NotFound(Slice()); // Use an empty error message for speed
+}
+
+// Charges one seek to stats.seek_file, if any. Returns true when that file
+// has used up its allowed seeks and was just recorded as the file to compact
+// (only done while no other file is already recorded).
+bool Version::UpdateStats(const GetStats& stats) {
+  FileMetaData* const charged = stats.seek_file;
+  if (charged == NULL) {
+    return false;
+  }
+
+  charged->allowed_seeks--;
+  if (charged->allowed_seeks > 0 || file_to_compact_ != NULL) {
+    return false;
+  }
+
+  file_to_compact_ = charged;
+  file_to_compact_level_ = stats.seek_file_level;
+  return true;
+}
+
+// Records a read of internal_key. When at least two files overlap the key,
+// one seek is charged to the first of them via UpdateStats() and its result
+// is returned. Returns false if the key cannot be parsed or fewer than two
+// files overlap it.
+bool Version::RecordReadSample(Slice internal_key) {
+  ParsedInternalKey parsed;
+  if (!ParseInternalKey(internal_key, &parsed)) {
+    return false;
+  }
+
+  // Accumulator passed through ForEachOverlapping() as the opaque argument.
+  struct State {
+    GetStats stats;  // Holds first matching file
+    int matches;
+
+    static bool Match(void* arg, int level, FileMetaData* f) {
+      State* const self = reinterpret_cast<State*>(arg);
+      if (++self->matches == 1) {
+        // Remember first match.
+        self->stats.seek_file = f;
+        self->stats.seek_file_level = level;
+      }
+      // A second match is all we need; stop iterating after it.
+      return self->matches < 2;
+    }
+  };
+
+  State state;
+  state.matches = 0;
+  ForEachOverlapping(parsed.user_key, internal_key, &state, &State::Match);
+
+  // Must have at least two matches since we want to merge across
+  // files. But what if we have a single file that contains many
+  // overwrites and deletions?  Should we have another mechanism for
+  // finding such files?
+  //
+  // 1MB cost is about 1 seek (see comment in Builder::Apply).
+  return state.matches >= 2 && UpdateStats(state.stats);
+}
+
+// Takes one more reference on this version.
+void Version::Ref() {
+  refs_++;
+}
+
+// Drops one reference and deletes this version when the count reaches zero.
+// Must never be called on the version set's list head.
+void Version::Unref() {
+  assert(this != &vset_->dummy_versions_);
+  assert(refs_ >= 1);
+  if (--refs_ == 0) {
+    delete this;
+  }
+}
+
+// Returns whether some file in "level" overlaps the given user-key range.
+// Levels above 0 are passed to SomeFileOverlapsRange() as disjoint and sorted.
+bool Version::OverlapInLevel(int level,
+                             const Slice* smallest_user_key,
+                             const Slice* largest_user_key) {
+  const bool disjoint_sorted_files = (level > 0);
+  return SomeFileOverlapsRange(vset_->icmp_, disjoint_sorted_files,
+                               files_[level],
+                               smallest_user_key, largest_user_key);
+}
+
+// Chooses the level at which a new table covering
+// [smallest_user_key, largest_user_key] should be placed. Returns 0 when
+// level 0 already overlaps the range; otherwise moves down, up to
+// config::kMaxMemCompactLevel, while the next level has no overlap and the
+// level after that does not overlap too many bytes.
+int Version::PickLevelForMemTableOutput(
+    const Slice& smallest_user_key,
+    const Slice& largest_user_key) {
+  if (OverlapInLevel(0, &smallest_user_key, &largest_user_key)) {
+    return 0;
+  }
+
+  // Push to next level if there is no overlap in next level,
+  // and the #bytes overlapping in the level after that are limited.
+  InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek);
+  InternalKey limit(largest_user_key, 0, static_cast<ValueType>(0));
+  std::vector<FileMetaData*> overlaps;
+
+  int level = 0;
+  for (; level < config::kMaxMemCompactLevel; level++) {
+    if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
+      break;
+    }
+    if (level + 2 < config::kNumLevels) {
+      // Check that file does not overlap too many grandparent bytes.
+      GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
+      const int64_t grandparent_bytes = TotalFileSize(overlaps);
+      if (grandparent_bytes > kMaxGrandParentOverlapBytes) {
+        break;
+      }
+    }
+  }
+  return level;
+}
+
+// Replaces "*inputs" with all files in "level" that overlap [begin,end].
+// A NULL begin (or end) leaves that side of the range unbounded. For level 0
+// the range grows to cover each overlapping file, and the scan restarts
+// whenever it grows.
+void Version::GetOverlappingInputs(
+    int level,
+    const InternalKey* begin,
+    const InternalKey* end,
+    std::vector<FileMetaData*>* inputs) {
+  assert(level >= 0);
+  assert(level < config::kNumLevels);
+  inputs->clear();
+
+  Slice user_begin, user_end;
+  if (begin != NULL) {
+    user_begin = begin->user_key();
+  }
+  if (end != NULL) {
+    user_end = end->user_key();
+  }
+
+  const Comparator* user_cmp = vset_->icmp_.user_comparator();
+  size_t next = 0;
+  while (next < files_[level].size()) {
+    FileMetaData* f = files_[level][next];
+    ++next;
+    const Slice file_start = f->smallest.user_key();
+    const Slice file_limit = f->largest.user_key();
+
+    if (begin != NULL && user_cmp->Compare(file_limit, user_begin) < 0) {
+      // "f" is completely before specified range; skip it
+      continue;
+    }
+    if (end != NULL && user_cmp->Compare(file_start, user_end) > 0) {
+      // "f" is completely after specified range; skip it
+      continue;
+    }
+
+    inputs->push_back(f);
+    if (level != 0) {
+      continue;
+    }
+
+    // Level-0 files may overlap each other.  So check if the newly
+    // added file has expanded the range.  If so, restart search.
+    if (begin != NULL && user_cmp->Compare(file_start, user_begin) < 0) {
+      user_begin = file_start;
+      inputs->clear();
+      next = 0;
+    } else if (end != NULL && user_cmp->Compare(file_limit, user_end) > 0) {
+      user_end = file_limit;
+      inputs->clear();
+      next = 0;
+    }
+  }
+}
+
+// Returns a multi-line dump of this version: one header line per level
+// followed by one line per file giving its number, size and key range.
+std::string Version::DebugString() const {
+  std::string out;
+  for (int level = 0; level < config::kNumLevels; level++) {
+    // E.g.,
+    //   --- level 1 ---
+    //   17:123['a' .. 'd']
+    //   20:43['e' .. 'g']
+    out.append("--- level ");
+    AppendNumberTo(&out, level);
+    out.append(" ---\n");
+
+    const std::vector<FileMetaData*>& level_files = files_[level];
+    for (std::vector<FileMetaData*>::const_iterator it = level_files.begin();
+         it != level_files.end(); ++it) {
+      FileMetaData* const f = *it;
+      out.push_back(' ');
+      AppendNumberTo(&out, f->number);
+      out.push_back(':');
+      AppendNumberTo(&out, f->file_size);
+      out.append("[");
+      out.append(f->smallest.DebugString());
+      out.append(" .. ");
+      out.append(f->largest.DebugString());
+      out.append("]\n");
+    }
+  }
+  return out;
+}
+
+// A helper class so we can efficiently apply a whole sequence
+// of edits to a particular state without creating intermediate
+// Versions that contain full copies of the intermediate state.
+class VersionSet::Builder {
+ private:
+ // Helper to sort by v->files_[file_number].smallest
+ struct BySmallestKey {
+ const InternalKeyComparator* internal_comparator;
+
+ bool operator()(FileMetaData* f1, FileMetaData* f2) const {
+ int r = internal_comparator->Compare(f1->smallest, f2->smallest);
+ if (r != 0) {
+ return (r < 0);
+ } else {
+ // Break ties by file number
+ return (f1->number < f2->number);
+ }
+ }
+ };
+
+ typedef std::set<FileMetaData*, BySmallestKey> FileSet;
+ struct LevelState {
+ std::set<uint64_t> deleted_files;  // Numbers of files deleted at this level by applied edits
+ FileSet* added_files;  // Files added at this level by applied edits; allocated in the constructor, freed in the destructor
+ };
+
+ VersionSet* vset_;
+ Version* base_;  // Version the edits are applied on top of; referenced for the builder's lifetime
+ LevelState levels_[config::kNumLevels];
+
+ public:
+ // Initialize a builder with the files from *base and other info from *vset
+ Builder(VersionSet* vset, Version* base)
+ : vset_(vset),
+ base_(base) {
+ base_->Ref();  // Released in ~Builder()
+ BySmallestKey cmp;
+ cmp.internal_comparator = &vset_->icmp_;
+ for (int level = 0; level < config::kNumLevels; level++) {
+ levels_[level].added_files = new FileSet(cmp);
+ }
+ }
+
+ ~Builder() {
+ for (int level = 0; level < config::kNumLevels; level++) {
+ const FileSet* added = levels_[level].added_files;
+ std::vector<FileMetaData*> to_unref;
+ to_unref.reserve(added->size());
+ for (FileSet::const_iterator it = added->begin();
+ it != added->end(); ++it) {
+ to_unref.push_back(*it);
+ }
+ delete added;  // NOTE(review): presumably destroyed first so the set never holds pointers to freed files - confirm
+ for (uint32_t i = 0; i < to_unref.size(); i++) {
+ FileMetaData* f = to_unref[i];
+ f->refs--;  // Drop the reference taken in Apply()
+ if (f->refs <= 0) {
+ delete f;
+ }
+ }
+ }
+ base_->Unref();
+ }
+
+ // Apply all of the edits in *edit to the current state.
+ void Apply(VersionEdit* edit) {
+ // Update compaction pointers
+ for (size_t i = 0; i < edit->compact_pointers_.size(); i++) {
+ const int level = edit->compact_pointers_[i].first;
+ vset_->compact_pointer_[level] =
+ edit->compact_pointers_[i].second.Encode().ToString();
+ }
+
+ // Delete files
+ const VersionEdit::DeletedFileSet& del = edit->deleted_files_;
+ for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin();
+ iter != del.end();
+ ++iter) {
+ const int level = iter->first;
+ const uint64_t number = iter->second;
+ levels_[level].deleted_files.insert(number);
+ }
+
+ // Add new files
+ for (size_t i = 0; i < edit->new_files_.size(); i++) {
+ const int level = edit->new_files_[i].first;
+ FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
+ f->refs = 1;  // Reference held by this builder; dropped in ~Builder()
+
+ // We arrange to automatically compact this file after
+ // a certain number of seeks. Let's assume:
+ // (1) One seek costs 10ms
+ // (2) Writing or reading 1MB costs 10ms (100MB/s)
+ // (3) A compaction of 1MB does 25MB of IO:
+ // 1MB read from this level
+ // 10-12MB read from next level (boundaries may be misaligned)
+ // 10-12MB written to next level
+ // This implies that 25 seeks cost the same as the compaction
+ // of 1MB of data. I.e., one seek costs approximately the
+ // same as the compaction of 40KB of data. We are a little
+ // conservative and allow approximately one seek for every 16KB
+ // of data before triggering a compaction.
+ f->allowed_seeks = (f->file_size / 16384);
+ if (f->allowed_seeks < 100) f->allowed_seeks = 100;  // Never allow fewer than 100 seeks
+
+ levels_[level].deleted_files.erase(f->number);  // A file added again must not stay marked as deleted
+ levels_[level].added_files->insert(f);
+ }
+ }
+
+ // Save the current state in *v.
+ void SaveTo(Version* v) {
+ BySmallestKey cmp;
+ cmp.internal_comparator = &vset_->icmp_;
+ for (int level = 0; level < config::kNumLevels; level++) {
+ // Merge the set of added files with the set of pre-existing files.
+ // Drop any deleted files. Store the result in *v.
+ const std::vector<FileMetaData*>& base_files = base_->files_[level];
+ std::vector<FileMetaData*>::const_iterator base_iter = base_files.begin();
+ std::vector<FileMetaData*>::const_iterator base_end = base_files.end();
+ const FileSet* added = levels_[level].added_files;
+ v->files_[level].reserve(base_files.size() + added->size());
+ for (FileSet::const_iterator added_iter = added->begin();
+ added_iter != added->end();
+ ++added_iter) {
+ // Add all smaller files listed in base_
+ for (std::vector<FileMetaData*>::const_iterator bpos
+ = std::upper_bound(base_iter, base_end, *added_iter, cmp);
+ base_iter != bpos;
+ ++base_iter) {
+ MaybeAddFile(v, level, *base_iter);
+ }
+
+ MaybeAddFile(v, level, *added_iter);
+ }
+
+ // Add remaining base files
+ for (; base_iter != base_end; ++base_iter) {
+ MaybeAddFile(v, level, *base_iter);
+ }
+
+#ifndef NDEBUG
+ // Make sure there is no overlap in levels > 0
+ if (level > 0) {
+ for (uint32_t i = 1; i < v->files_[level].size(); i++) {
+ const InternalKey& prev_end = v->files_[level][i-1]->largest;
+ const InternalKey& this_begin = v->files_[level][i]->smallest;
+ if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) {
+ fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
+ prev_end.DebugString().c_str(),
+ this_begin.DebugString().c_str());
+ abort();
+ }
+ }
+ }
+#endif
+ }
+ }
+
+ void MaybeAddFile(Version* v, int level, FileMetaData* f) {  // Appends f to v->files_[level] unless f was deleted at this level
+ if (levels_[level].deleted_files.count(f->number) > 0) {
+ // File is deleted: do nothing
+ } else {
+ std::vector<FileMetaData*>* files = &v->files_[level];
+ if (level > 0 && !files->empty()) {
+ // Must not overlap
+ assert(vset_->icmp_.Compare((*files)[files->size()-1]->largest,
+ f->smallest) < 0);
+ }
+ f->refs++;  // v now holds its own reference to f
+ files->push_back(f);
+ }
+ }
+};
+
+VersionSet::VersionSet(const std::string& dbname,
+ const Options* options,
+ TableCache* table_cache,
+ const InternalKeyComparator* cmp)
+ : env_(options->env),
+ dbname_(dbname),
+ options_(options),
+ table_cache_(table_cache),
+ icmp_(*cmp),  // The comparator is copied; *cmp itself is not retained
+ next_file_number_(2),
+ manifest_file_number_(0), // Filled by Recover()
+ last_sequence_(0),
+ log_number_(0),
+ prev_log_number_(0),
+ descriptor_file_(NULL),  // The MANIFEST file and its log writer are created by the first LogAndApply()
+ descriptor_log_(NULL),
+ dummy_versions_(this),  // Head of the list of versions
+ current_(NULL) {
+ AppendVersion(new Version(this));  // Install a freshly constructed version as current
+}
+
+VersionSet::~VersionSet() {
+ current_->Unref();  // Drop the reference taken by AppendVersion()
+ assert(dummy_versions_.next_ == &dummy_versions_); // List must be empty
+ delete descriptor_log_;
+ delete descriptor_file_;
+}
+
+// Makes "v" the current version and links it in at the tail of the circular
+// version list. "v" must be unreferenced and must not already be current.
+void VersionSet::AppendVersion(Version* v) {
+  assert(v->refs_ == 0);
+  assert(v != current_);
+
+  // Move the "current" reference from the old version over to v.
+  if (current_ != NULL) {
+    current_->Unref();
+  }
+  current_ = v;
+  v->Ref();
+
+  // Splice v in between the present tail and the list head.
+  Version* const tail = dummy_versions_.prev_;
+  v->prev_ = tail;
+  v->next_ = &dummy_versions_;
+  tail->next_ = v;
+  dummy_versions_.prev_ = v;
+}
+
+Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {  // Applies *edit on top of the current version, appends it to the MANIFEST and, on success, installs the result as current. *mu is released around the file writes and re-acquired before returning.
+ if (edit->has_log_number_) {
+ assert(edit->log_number_ >= log_number_);
+ assert(edit->log_number_ < next_file_number_);
+ } else {
+ edit->SetLogNumber(log_number_);
+ }
+
+ if (!edit->has_prev_log_number_) {
+ edit->SetPrevLogNumber(prev_log_number_);
+ }
+
+ edit->SetNextFile(next_file_number_);
+ edit->SetLastSequence(last_sequence_);
+
+ Version* v = new Version(this);
+ {
+ Builder builder(this, current_);
+ builder.Apply(edit);
+ builder.SaveTo(v);
+ }
+ Finalize(v);
+
+ // Initialize new descriptor log file if necessary by creating
+ // a temporary file that contains a snapshot of the current version.
+ std::string new_manifest_file;
+ Status s;
+ if (descriptor_log_ == NULL) {
+ // No reason to unlock *mu here since we only hit this path in the
+ // first call to LogAndApply (when opening the database).
+ assert(descriptor_file_ == NULL);
+ new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
+ edit->SetNextFile(next_file_number_);  // NOTE(review): the same value was already set above; looks redundant - confirm
+ s = env_->NewWritableFile(new_manifest_file, &descriptor_file_);
+ if (s.ok()) {
+ descriptor_log_ = new log::Writer(descriptor_file_);
+ s = WriteSnapshot(descriptor_log_);
+ }
+ }
+
+ // Unlock during expensive MANIFEST log write
+ {
+ mu->Unlock();
+
+ // Write new record to MANIFEST log
+ if (s.ok()) {
+ std::string record;
+ edit->EncodeTo(&record);
+ s = descriptor_log_->AddRecord(record);
+ if (s.ok()) {
+ s = descriptor_file_->Sync();
+ }
+ if (!s.ok()) {
+ Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str());
+ }
+ }
+
+ // If we just created a new descriptor file, install it by writing a
+ // new CURRENT file that points to it.
+ if (s.ok() && !new_manifest_file.empty()) {
+ s = SetCurrentFile(env_, dbname_, manifest_file_number_);
+ }
+
+ mu->Lock();
+ }
+
+ // Install the new version
+ if (s.ok()) {
+ AppendVersion(v);
+ log_number_ = edit->log_number_;
+ prev_log_number_ = edit->prev_log_number_;
+ } else {
+ delete v;  // v was never installed, so it is deleted directly
+ if (!new_manifest_file.empty()) {
+ delete descriptor_log_;  // Discard the half-created MANIFEST so a later call starts over
+ delete descriptor_file_;
+ descriptor_log_ = NULL;
+ descriptor_file_ = NULL;
+ env_->DeleteFile(new_manifest_file);
+ }
+ }
+
+ return s;
+}
+
+Status VersionSet::Recover() {  // Replays the MANIFEST named by the CURRENT file and, on success, installs the resulting version as current
+ struct LogReporter : public log::Reader::Reporter {
+ Status* status;
+ virtual void Corruption(size_t bytes, const Status& s) {
+ if (this->status->ok()) *this->status = s;  // Keep only the first error reported
+ }
+ };
+
+ // Read "CURRENT" file, which contains a pointer to the current manifest file
+ std::string current;
+ Status s = ReadFileToString(env_, CurrentFileName(dbname_), &current);
+ if (!s.ok()) {
+ return s;
+ }
+ if (current.empty() || current[current.size()-1] != '\n') {
+ return Status::Corruption("CURRENT file does not end with newline");
+ }
+ current.resize(current.size() - 1);  // Strip the trailing newline
+
+ std::string dscname = dbname_ + "/" + current;
+ SequentialFile* file;
+ s = env_->NewSequentialFile(dscname, &file);
+ if (!s.ok()) {
+ return s;
+ }
+
+ bool have_log_number = false;
+ bool have_prev_log_number = false;
+ bool have_next_file = false;
+ bool have_last_sequence = false;
+ uint64_t next_file = 0;
+ uint64_t last_sequence = 0;
+ uint64_t log_number = 0;
+ uint64_t prev_log_number = 0;
+ Builder builder(this, current_);
+
+ {
+ LogReporter reporter;
+ reporter.status = &s;
+ log::Reader reader(file, &reporter, true/*checksum*/, 0/*initial_offset*/);
+ Slice record;
+ std::string scratch;
+ while (reader.ReadRecord(&record, &scratch) && s.ok()) {
+ VersionEdit edit;
+ s = edit.DecodeFrom(record);
+ if (s.ok()) {
+ if (edit.has_comparator_ &&
+ edit.comparator_ != icmp_.user_comparator()->Name()) {
+ s = Status::InvalidArgument(
+ edit.comparator_ + " does not match existing comparator ",
+ icmp_.user_comparator()->Name());
+ }
+ }
+
+ if (s.ok()) {
+ builder.Apply(&edit);
+ }
+
+ if (edit.has_log_number_) {  // The last record that sets a field wins
+ log_number = edit.log_number_;
+ have_log_number = true;
+ }
+
+ if (edit.has_prev_log_number_) {
+ prev_log_number = edit.prev_log_number_;
+ have_prev_log_number = true;
+ }
+
+ if (edit.has_next_file_number_) {
+ next_file = edit.next_file_number_;
+ have_next_file = true;
+ }
+
+ if (edit.has_last_sequence_) {
+ last_sequence = edit.last_sequence_;
+ have_last_sequence = true;
+ }
+ }
+ }
+ delete file;
+ file = NULL;
+
+ if (s.ok()) {
+ if (!have_next_file) {
+ s = Status::Corruption("no meta-nextfile entry in descriptor");
+ } else if (!have_log_number) {
+ s = Status::Corruption("no meta-lognumber entry in descriptor");
+ } else if (!have_last_sequence) {
+ s = Status::Corruption("no last-sequence-number entry in descriptor");
+ }
+
+ if (!have_prev_log_number) {
+ prev_log_number = 0;
+ }
+
+ MarkFileNumberUsed(prev_log_number);
+ MarkFileNumberUsed(log_number);
+ }
+
+ if (s.ok()) {
+ Version* v = new Version(this);
+ builder.SaveTo(v);
+ // Install recovered version
+ Finalize(v);
+ AppendVersion(v);
+ manifest_file_number_ = next_file;  // The recovered next-file number is taken for the next MANIFEST
+ next_file_number_ = next_file + 1;
+ last_sequence_ = last_sequence;
+ log_number_ = log_number;
+ prev_log_number_ = prev_log_number;
+ }
+
+ return s;
+}
+
+// Ensures the next file number handed out is greater than "number".
+void VersionSet::MarkFileNumberUsed(uint64_t number) {
+  if (number >= next_file_number_) {
+    next_file_number_ = number + 1;
+  }
+}
+
+// Precomputes, for version "v", the level with the highest compaction score
+// and stores both the level and the score in "v". The last level is never
+// considered.
+void VersionSet::Finalize(Version* v) {
+  int winner_level = -1;
+  double winner_score = -1;
+
+  for (int level = 0; level < config::kNumLevels-1; level++) {
+    double score;
+    if (level > 0) {
+      // Compute the ratio of current size to size limit.
+      const uint64_t level_bytes = TotalFileSize(v->files_[level]);
+      score = static_cast<double>(level_bytes) / MaxBytesForLevel(level);
+    } else {
+      // We treat level-0 specially by bounding the number of files
+      // instead of number of bytes for two reasons:
+      //
+      // (1) With larger write-buffer sizes, it is nice not to do too
+      // many level-0 compactions.
+      //
+      // (2) The files in level-0 are merged on every read and
+      // therefore we wish to avoid too many files when the individual
+      // file size is small (perhaps because of a small write-buffer
+      // setting, or very high compression ratios, or lots of
+      // overwrites/deletions).
+      score = v->files_[level].size() /
+          static_cast<double>(config::kL0_CompactionTrigger);
+    }
+
+    if (score > winner_score) {
+      winner_level = level;
+      winner_score = score;
+    }
+  }
+
+  v->compaction_level_ = winner_level;
+  v->compaction_score_ = winner_score;
+}
+
+// Writes a single record to "log" describing the comparator name, the
+// non-empty compaction pointers and every file of the current version.
+// Returns the status of the AddRecord() call.
+Status VersionSet::WriteSnapshot(log::Writer* log) {
+  // TODO: Break up into multiple records to reduce memory usage on recovery?
+
+  VersionEdit snapshot;
+
+  // Metadata.
+  snapshot.SetComparatorName(icmp_.user_comparator()->Name());
+
+  // Compaction pointers.
+  for (int level = 0; level < config::kNumLevels; level++) {
+    if (compact_pointer_[level].empty()) continue;
+    InternalKey key;
+    key.DecodeFrom(compact_pointer_[level]);
+    snapshot.SetCompactPointer(level, key);
+  }
+
+  // Files.
+  for (int level = 0; level < config::kNumLevels; level++) {
+    const std::vector<FileMetaData*>& level_files = current_->files_[level];
+    for (std::vector<FileMetaData*>::const_iterator it = level_files.begin();
+         it != level_files.end(); ++it) {
+      const FileMetaData* f = *it;
+      snapshot.AddFile(level, f->number, f->file_size, f->smallest, f->largest);
+    }
+  }
+
+  std::string encoded;
+  snapshot.EncodeTo(&encoded);
+  return log->AddRecord(encoded);
+}
+
+// Returns the number of files at "level" in the current version.
+int VersionSet::NumLevelFiles(int level) const {
+  assert(level >= 0);
+  assert(level < config::kNumLevels);
+  const std::vector<FileMetaData*>& level_files = current_->files_[level];
+  return static_cast<int>(level_files.size());
+}
+
+// Formats the per-level file counts of the current version into
+// scratch->buffer and returns a pointer to that buffer.
+const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const {
+  // The format string lists exactly seven levels; update it if kNumLevels
+  // ever changes.
+  assert(config::kNumLevels == 7);
+  const Version* const cur = current_;
+  snprintf(scratch->buffer, sizeof(scratch->buffer),
+           "files[ %d %d %d %d %d %d %d ]",
+           static_cast<int>(cur->files_[0].size()),
+           static_cast<int>(cur->files_[1].size()),
+           static_cast<int>(cur->files_[2].size()),
+           static_cast<int>(cur->files_[3].size()),
+           static_cast<int>(cur->files_[4].size()),
+           static_cast<int>(cur->files_[5].size()),
+           static_cast<int>(cur->files_[6].size()));
+  return scratch->buffer;
+}
+
+// Returns the approximate number of bytes stored in "v" before "ikey":
+// the full size of every file lying entirely before the key, plus the
+// approximate offset of the key inside each file whose range contains it.
+uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
+  uint64_t result = 0;
+  for (int level = 0; level < config::kNumLevels; level++) {
+    const std::vector<FileMetaData*>& files = v->files_[level];
+    for (size_t i = 0; i < files.size(); i++) {
+      if (icmp_.Compare(files[i]->largest, ikey) <= 0) {
+        // Entire file is before "ikey", so just add the file size
+        result += files[i]->file_size;
+      } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) {
+        // Entire file is after "ikey", so ignore
+        if (level > 0) {
+          // Files other than level 0 are sorted by meta->smallest, so
+          // no further files in this level will contain data for
+          // "ikey".
+          break;
+        }
+      } else {
+        // "ikey" falls in the range for this table.  Add the
+        // approximate offset of "ikey" within the table.
+        //
+        // tableptr starts out NULL so the check below never reads an
+        // indeterminate pointer, even if NewIterator() returns without
+        // writing to it.
+        Table* tableptr = NULL;
+        Iterator* iter = table_cache_->NewIterator(
+            ReadOptions(), files[i]->number, files[i]->file_size, &tableptr);
+        if (tableptr != NULL) {
+          result += tableptr->ApproximateOffsetOf(ikey.Encode());
+        }
+        delete iter;
+      }
+    }
+  }
+  return result;
+}
+
+// Inserts into "*live" the number of every file referenced by any version
+// still linked into the version list.
+void VersionSet::AddLiveFiles(std::set<uint64_t>* live) {
+  Version* v = dummy_versions_.next_;
+  while (v != &dummy_versions_) {
+    for (int level = 0; level < config::kNumLevels; level++) {
+      const std::vector<FileMetaData*>& level_files = v->files_[level];
+      for (std::vector<FileMetaData*>::const_iterator it = level_files.begin();
+           it != level_files.end(); ++it) {
+        live->insert((*it)->number);
+      }
+    }
+    v = v->next_;
+  }
+}
+
+// Returns the combined size of all files at "level" in the current version.
+int64_t VersionSet::NumLevelBytes(int level) const {
+  assert(level >= 0);
+  assert(level < config::kNumLevels);
+  const std::vector<FileMetaData*>& level_files = current_->files_[level];
+  return TotalFileSize(level_files);
+}
+
+// Returns the largest total size of next-level files overlapped by any
+// single file in levels 1 through kNumLevels-2 of the current version.
+int64_t VersionSet::MaxNextLevelOverlappingBytes() {
+  int64_t largest_overlap = 0;
+  std::vector<FileMetaData*> overlaps;  // Reused for every query
+  for (int level = 1; level < config::kNumLevels - 1; level++) {
+    const std::vector<FileMetaData*>& level_files = current_->files_[level];
+    for (size_t i = 0; i < level_files.size(); i++) {
+      const FileMetaData* f = level_files[i];
+      current_->GetOverlappingInputs(level+1, &f->smallest, &f->largest,
+                                     &overlaps);
+      const int64_t overlap_bytes = TotalFileSize(overlaps);
+      if (largest_overlap < overlap_bytes) {
+        largest_overlap = overlap_bytes;
+      }
+    }
+  }
+  return largest_overlap;
+}
+
+// Computes the smallest key range covering every file in "inputs" and
+// stores its bounds in *smallest and *largest.
+// REQUIRES: inputs is not empty
+void VersionSet::GetRange(const std::vector<FileMetaData*>& inputs,
+                          InternalKey* smallest,
+                          InternalKey* largest) {
+  assert(!inputs.empty());
+  smallest->Clear();
+  largest->Clear();
+
+  std::vector<FileMetaData*>::const_iterator it = inputs.begin();
+  if (it == inputs.end()) {
+    return;
+  }
+
+  // Seed the range with the first file, then widen it with the others.
+  *smallest = (*it)->smallest;
+  *largest = (*it)->largest;
+  for (++it; it != inputs.end(); ++it) {
+    FileMetaData* f = *it;
+    if (icmp_.Compare(f->smallest, *smallest) < 0) {
+      *smallest = f->smallest;
+    }
+    if (icmp_.Compare(f->largest, *largest) > 0) {
+      *largest = f->largest;
+    }
+  }
+}
+
+// Computes the smallest key range covering every file in inputs1 and
+// inputs2 and stores its bounds in *smallest and *largest.
+// REQUIRES: inputs1 and inputs2 are not both empty
+void VersionSet::GetRange2(const std::vector<FileMetaData*>& inputs1,
+                           const std::vector<FileMetaData*>& inputs2,
+                           InternalKey* smallest,
+                           InternalKey* largest) {
+  std::vector<FileMetaData*> combined;
+  combined.reserve(inputs1.size() + inputs2.size());
+  combined.insert(combined.end(), inputs1.begin(), inputs1.end());
+  combined.insert(combined.end(), inputs2.begin(), inputs2.end());
+  GetRange(combined, smallest, largest);
+}
+
+// Builds a merging iterator over all input files of compaction "c".
+// Level-0 inputs get one table iterator per file; every other non-empty
+// input set gets a single concatenating two-level iterator.
+Iterator* VersionSet::MakeInputIterator(Compaction* c) {
+  ReadOptions read_options;
+  read_options.verify_checksums = options_->paranoid_checks;
+  read_options.fill_cache = false;
+
+  // Level-0 files have to be merged together.  For other levels,
+  // we will make a concatenating iterator per level.
+  // TODO(opt): use concatenating iterator for level-0 if there is no overlap
+  const int capacity = (c->level() == 0 ? c->inputs_[0].size() + 1 : 2);
+  Iterator** children = new Iterator*[capacity];
+  int count = 0;
+  for (int which = 0; which < 2; which++) {
+    if (c->inputs_[which].empty()) {
+      continue;
+    }
+    if (c->level() + which != 0) {
+      // Create concatenating iterator for the files from this level
+      children[count++] = NewTwoLevelIterator(
+          new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]),
+          &GetFileIterator, table_cache_, read_options);
+      continue;
+    }
+    const std::vector<FileMetaData*>& level0_files = c->inputs_[which];
+    for (size_t i = 0; i < level0_files.size(); i++) {
+      children[count++] = table_cache_->NewIterator(
+          read_options, level0_files[i]->number, level0_files[i]->file_size);
+    }
+  }
+  assert(count <= capacity);
+
+  Iterator* merged = NewMergingIterator(&icmp_, children, count);
+  delete[] children;
+  return merged;
+}
+
+// Picks the next compaction to run on the current version. Returns NULL when
+// neither a size-triggered nor a seek-triggered compaction is pending;
+// otherwise returns a heap-allocated Compaction that holds a reference on
+// the current version.
+Compaction* VersionSet::PickCompaction() {
+  // We prefer compactions triggered by too much data in a level over
+  // the compactions triggered by seeks.
+  const bool size_compaction = (current_->compaction_score_ >= 1);
+  const bool seek_compaction = (current_->file_to_compact_ != NULL);
+  if (!size_compaction && !seek_compaction) {
+    return NULL;
+  }
+
+  Compaction* c;
+  int level;
+  if (size_compaction) {
+    level = current_->compaction_level_;
+    assert(level >= 0);
+    assert(level+1 < config::kNumLevels);
+    c = new Compaction(level);
+
+    // Pick the first file that comes after compact_pointer_[level]
+    const std::vector<FileMetaData*>& level_files = current_->files_[level];
+    for (size_t i = 0; i < level_files.size(); i++) {
+      FileMetaData* f = level_files[i];
+      if (compact_pointer_[level].empty() ||
+          icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) {
+        c->inputs_[0].push_back(f);
+        break;
+      }
+    }
+    if (c->inputs_[0].empty()) {
+      // Wrap-around to the beginning of the key space
+      c->inputs_[0].push_back(level_files[0]);
+    }
+  } else {
+    level = current_->file_to_compact_level_;
+    c = new Compaction(level);
+    c->inputs_[0].push_back(current_->file_to_compact_);
+  }
+
+  c->input_version_ = current_;
+  c->input_version_->Ref();
+
+  // Files in level 0 may overlap each other, so pick up all overlapping ones
+  if (level == 0) {
+    InternalKey smallest, largest;
+    GetRange(c->inputs_[0], &smallest, &largest);
+    // Note that the next call will discard the file we placed in
+    // c->inputs_[0] earlier and replace it with an overlapping set
+    // which will include the picked file.
+    current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]);
+    assert(!c->inputs_[0].empty());
+  }
+
+  SetupOtherInputs(c);
+
+  return c;
+}
+
+void VersionSet::SetupOtherInputs(Compaction* c) {  // Given c->inputs_[0], fills c->inputs_[1] and c->grandparents_, possibly widens inputs_[0], and advances compact_pointer_[level]
+ const int level = c->level();
+ InternalKey smallest, largest;
+ GetRange(c->inputs_[0], &smallest, &largest);
+
+ current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1]);
+
+ // Get entire range covered by compaction
+ InternalKey all_start, all_limit;
+ GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+
+ // See if we can grow the number of inputs in "level" without
+ // changing the number of "level+1" files we pick up.
+ if (!c->inputs_[1].empty()) {
+ std::vector<FileMetaData*> expanded0;
+ current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0);
+ const int64_t inputs0_size = TotalFileSize(c->inputs_[0]);
+ const int64_t inputs1_size = TotalFileSize(c->inputs_[1]);
+ const int64_t expanded0_size = TotalFileSize(expanded0);
+ if (expanded0.size() > c->inputs_[0].size() &&
+ inputs1_size + expanded0_size < kExpandedCompactionByteSizeLimit) {
+ InternalKey new_start, new_limit;
+ GetRange(expanded0, &new_start, &new_limit);
+ std::vector<FileMetaData*> expanded1;
+ current_->GetOverlappingInputs(level+1, &new_start, &new_limit,
+ &expanded1);
+ if (expanded1.size() == c->inputs_[1].size()) {  // Accept the expansion only if it pulls in no extra level+1 files
+ Log(options_->info_log,
+ "Expanding@%d %d+%d (%ld+%ld bytes) to %d+%d (%ld+%ld bytes)\n",
+ level,
+ int(c->inputs_[0].size()),
+ int(c->inputs_[1].size()),
+ long(inputs0_size), long(inputs1_size),
+ int(expanded0.size()),
+ int(expanded1.size()),
+ long(expanded0_size), long(inputs1_size));
+ smallest = new_start;
+ largest = new_limit;
+ c->inputs_[0] = expanded0;
+ c->inputs_[1] = expanded1;
+ GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+ }
+ }
+ }
+
+ // Compute the set of grandparent files that overlap this compaction
+ // (parent == level+1; grandparent == level+2)
+ if (level + 2 < config::kNumLevels) {
+ current_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
+ &c->grandparents_);
+ }
+
+ if (false) {  // Debug logging, compiled in but never executed
+ Log(options_->info_log, "Compacting %d '%s' .. '%s'",
+ level,
+ smallest.DebugString().c_str(),
+ largest.DebugString().c_str());
+ }
+
+ // Update the place where we will do the next compaction for this level.
+ // We update this immediately instead of waiting for the VersionEdit
+ // to be applied so that if the compaction fails, we will try a different
+ // key range next time.
+ compact_pointer_[level] = largest.Encode().ToString();
+ c->edit_.SetCompactPointer(level, largest);
+}
+
+// Creates a compaction of the files in "level" that overlap [begin,end].
+// Returns NULL when no file overlaps the range; otherwise returns a
+// heap-allocated Compaction that holds a reference on the current version.
+Compaction* VersionSet::CompactRange(
+    int level,
+    const InternalKey* begin,
+    const InternalKey* end) {
+  std::vector<FileMetaData*> inputs;
+  current_->GetOverlappingInputs(level, begin, end, &inputs);
+  if (inputs.empty()) {
+    return NULL;
+  }
+
+  // Avoid compacting too much in one shot in case the range is large.
+  // But we cannot do this for level-0 since level-0 files can overlap
+  // and we must not pick one file and drop another older file if the
+  // two files overlap.
+  if (level > 0) {
+    const uint64_t limit = MaxFileSizeForLevel(level);
+    uint64_t running_total = 0;
+    for (size_t i = 0; i < inputs.size(); i++) {
+      running_total += inputs[i]->file_size;
+      if (running_total >= limit) {
+        // Keep files [0, i] and drop the rest.
+        inputs.resize(i + 1);
+        break;
+      }
+    }
+  }
+
+  Compaction* compaction = new Compaction(level);
+  compaction->input_version_ = current_;
+  compaction->input_version_->Ref();
+  compaction->inputs_[0] = inputs;
+  SetupOtherInputs(compaction);
+  return compaction;
+}
+
+// Creates an empty compaction for "level": no input version attached yet,
+// all grandparent bookkeeping zeroed and every per-level cursor at 0.
+Compaction::Compaction(int level)
+    : level_(level),
+      max_output_file_size_(MaxFileSizeForLevel(level)),
+      input_version_(NULL),
+      grandparent_index_(0),
+      seen_key_(false),
+      overlapped_bytes_(0) {
+  for (int lvl = 0; lvl < config::kNumLevels; ++lvl) {
+    level_ptrs_[lvl] = 0;
+  }
+}
+
+// Releases the reference on the input version, if one is still held.
+Compaction::~Compaction() {
+  if (input_version_ == NULL) {
+    return;
+  }
+  input_version_->Unref();
+}
+
+// Returns true when the compaction consists of exactly one input file at
+// its level, no file at the next level, and a limited amount of
+// overlapping grandparent data.
+bool Compaction::IsTrivialMove() const {
+  if (num_input_files(0) != 1) {
+    return false;
+  }
+  if (num_input_files(1) != 0) {
+    return false;
+  }
+  // Avoid a move if there is lots of overlapping grandparent data.
+  // Otherwise, the move could create a parent file that will require
+  // a very expensive merge later on.
+  return TotalFileSize(grandparents_) <= kMaxGrandParentOverlapBytes;
+}
+
+// Records in "*edit" the deletion of every input file of this compaction:
+// inputs_[0] at level_ and inputs_[1] at level_ + 1.
+void Compaction::AddInputDeletions(VersionEdit* edit) {
+  for (int which = 0; which < 2; which++) {
+    const int file_level = level_ + which;
+    const std::vector<FileMetaData*>& files = inputs_[which];
+    for (std::vector<FileMetaData*>::const_iterator it = files.begin();
+         it != files.end(); ++it) {
+      edit->DeleteFile(file_level, (*it)->number);
+    }
+  }
+}
+
+// Returns true if no file in levels level_ + 2 and deeper of the input
+// version has a key range containing user_key. The per-level cursors in
+// level_ptrs_ only ever move forward across calls.
+bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
+  // Maybe use binary search to find right entry instead of linear search?
+  const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator();
+  for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) {
+    const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
+    while (level_ptrs_[lvl] < files.size()) {
+      FileMetaData* f = files[level_ptrs_[lvl]];
+      if (user_cmp->Compare(user_key, f->largest.user_key()) > 0) {
+        // File ends before user_key; move on to the next one.
+        level_ptrs_[lvl]++;
+        continue;
+      }
+      // We've advanced far enough
+      if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
+        // Key falls in this file's range, so definitely not base level
+        return false;
+      }
+      break;
+    }
+  }
+  return true;
+}
+
+// Returns true when the current output should be finished before adding
+// internal_key, because the grandparent files passed since the first key
+// add up to more than kMaxGrandParentOverlapBytes. The byte counter is reset
+// whenever true is returned.
+bool Compaction::ShouldStopBefore(const Slice& internal_key) {
+  // Scan to find earliest grandparent file that contains key.
+  const InternalKeyComparator* icmp = &input_version_->vset_->icmp_;
+  while (grandparent_index_ < grandparents_.size()) {
+    FileMetaData* const grandparent = grandparents_[grandparent_index_];
+    if (icmp->Compare(internal_key, grandparent->largest.Encode()) <= 0) {
+      break;
+    }
+    if (seen_key_) {
+      overlapped_bytes_ += grandparent->file_size;
+    }
+    grandparent_index_++;
+  }
+  seen_key_ = true;
+
+  if (overlapped_bytes_ <= kMaxGrandParentOverlapBytes) {
+    return false;
+  }
+  // Too much overlap for current output; start new output
+  overlapped_bytes_ = 0;
+  return true;
+}
+
+// Drops the reference on the input version, if any, and forgets it so the
+// destructor does not release it a second time.
+void Compaction::ReleaseInputs() {
+  if (input_version_ == NULL) {
+    return;
+  }
+  input_version_->Unref();
+  input_version_ = NULL;
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/version_set.h b/src/leveldb/db/version_set.h
new file mode 100644
index 0000000000..8dc14b8e01
--- /dev/null
+++ b/src/leveldb/db/version_set.h
@@ -0,0 +1,396 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The representation of a DBImpl consists of a set of Versions. The
+// newest version is called "current". Older versions may be kept
+// around to provide a consistent view to live iterators.
+//
+// Each Version keeps track of a set of Table files per level. The
+// entire set of versions is maintained in a VersionSet.
+//
+// Version,VersionSet are thread-compatible, but require external
+// synchronization on all accesses.
+
+#ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_
+#define STORAGE_LEVELDB_DB_VERSION_SET_H_
+
+#include <map>
+#include <set>
+#include <vector>
+#include "db/dbformat.h"
+#include "db/version_edit.h"
+#include "port/port.h"
+#include "port/thread_annotations.h"
+
+namespace leveldb {
+
+namespace log { class Writer; }
+
+class Compaction;
+class Iterator;
+class MemTable;
+class TableBuilder;
+class TableCache;
+class Version;
+class VersionSet;
+class WritableFile;
+
+// Return the smallest index i such that files[i]->largest >= key (an encoded
+// internal key), or files.size() if there is no such file.
+// REQUIRES: "files" contains a sorted list of non-overlapping files.
+extern int FindFile(const InternalKeyComparator& icmp,
+ const std::vector<FileMetaData*>& files,
+ const Slice& key);
+
+// Returns true iff some file in "files" overlaps the user key range
+// [*smallest_user_key,*largest_user_key].
+// smallest_user_key==NULL represents a key smaller than all keys in the DB.
+// largest_user_key==NULL represents a key larger than all keys in the DB.
+// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges
+// in sorted order.
+extern bool SomeFileOverlapsRange(
+ const InternalKeyComparator& icmp,
+ bool disjoint_sorted_files,
+ const std::vector<FileMetaData*>& files,
+ const Slice* smallest_user_key,
+ const Slice* largest_user_key);
+
+class Version {  // Tracks a set of Table files per level (see the file header comment).
+ public:
+ // Append to *iters a sequence of iterators that will
+ // yield the contents of this Version when merged together.
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ void AddIterators(const ReadOptions&, std::vector<Iterator*>* iters);
+
+ // Lookup the value for key. If found, store it in *val and
+ // return OK. Else return a non-OK status. Fills *stats.
+ // REQUIRES: lock is not held
+ struct GetStats {
+ FileMetaData* seek_file;
+ int seek_file_level;
+ };
+ Status Get(const ReadOptions&, const LookupKey& key, std::string* val,
+ GetStats* stats);
+
+ // Adds "stats" into the current state. Returns true if a new
+ // compaction may need to be triggered, false otherwise.
+ // REQUIRES: lock is held
+ bool UpdateStats(const GetStats& stats);
+
+ // Record a sample of bytes read at the specified internal key.
+ // Samples are taken approximately once every config::kReadBytesPeriod
+ // bytes. Returns true if a new compaction may need to be triggered.
+ // REQUIRES: lock is held
+ bool RecordReadSample(Slice key);
+
+ // Reference count management (so Versions do not disappear out from
+ // under live iterators)
+ void Ref();
+ void Unref();
+
+ void GetOverlappingInputs(
+ int level,
+ const InternalKey* begin, // NULL means before all keys
+ const InternalKey* end, // NULL means after all keys
+ std::vector<FileMetaData*>* inputs);
+
+ // Returns true iff some file in the specified level overlaps
+ // some part of [*smallest_user_key,*largest_user_key].
+ // smallest_user_key==NULL represents a key smaller than all keys in the DB.
+ // largest_user_key==NULL represents a key larger than all keys in the DB.
+ bool OverlapInLevel(int level,
+ const Slice* smallest_user_key,
+ const Slice* largest_user_key);
+
+ // Return the level at which we should place a new memtable compaction
+ // result that covers the range [smallest_user_key,largest_user_key].
+ int PickLevelForMemTableOutput(const Slice& smallest_user_key,
+ const Slice& largest_user_key);
+ // Return the number of files listed for the specified level.
+ int NumFiles(int level) const { return files_[level].size(); }
+
+ // Return a human readable string that describes this version's contents.
+ std::string DebugString() const;
+
+ private:
+ friend class Compaction;
+ friend class VersionSet;
+
+ class LevelFileNumIterator;
+ Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;
+
+ // Call func(arg, level, f) for every file that overlaps user_key in
+ // order from newest to oldest. If an invocation of func returns
+ // false, makes no more calls.
+ //
+ // REQUIRES: user portion of internal_key == user_key.
+ void ForEachOverlapping(Slice user_key, Slice internal_key,
+ void* arg,
+ bool (*func)(void*, int, FileMetaData*));
+
+ VersionSet* vset_; // VersionSet to which this Version belongs
+ Version* next_; // Next version in linked list
+ Version* prev_; // Previous version in linked list
+ int refs_; // Number of live refs to this version
+
+ // List of files per level
+ std::vector<FileMetaData*> files_[config::kNumLevels];
+
+ // Next file to compact based on seek stats.
+ FileMetaData* file_to_compact_;
+ int file_to_compact_level_;
+
+ // Level that should be compacted next and its compaction score.
+ // Score < 1 means compaction is not strictly needed. These fields
+ // are initialized by Finalize().
+ double compaction_score_;
+ int compaction_level_;
+
+ explicit Version(VersionSet* vset)
+ : vset_(vset), next_(this), prev_(this), refs_(0),
+ file_to_compact_(NULL),
+ file_to_compact_level_(-1),
+ compaction_score_(-1),
+ compaction_level_(-1) {
+ }
+
+ ~Version();
+
+ // No copying allowed
+ Version(const Version&);
+ void operator=(const Version&);
+};
+
+class VersionSet {  // Maintains the entire set of Versions (see the file header comment).
+ public:
+ VersionSet(const std::string& dbname,
+ const Options* options,
+ TableCache* table_cache,
+ const InternalKeyComparator*);
+ ~VersionSet();
+
+ // Apply *edit to the current version to form a new descriptor that
+ // is both saved to persistent state and installed as the new
+ // current version. Will release *mu while actually writing to the file.
+ // REQUIRES: *mu is held on entry.
+ // REQUIRES: no other thread concurrently calls LogAndApply()
+ Status LogAndApply(VersionEdit* edit, port::Mutex* mu)
+ EXCLUSIVE_LOCKS_REQUIRED(mu);
+
+ // Recover the last saved descriptor from persistent storage.
+ Status Recover();
+
+ // Return the current version.
+ Version* current() const { return current_; }
+
+ // Return the current manifest file number
+ uint64_t ManifestFileNumber() const { return manifest_file_number_; }
+
+ // Allocate and return a new file number
+ uint64_t NewFileNumber() { return next_file_number_++; }
+
+ // Arrange to reuse "file_number" unless a newer file number has
+ // already been allocated.
+ // REQUIRES: "file_number" was returned by a call to NewFileNumber().
+ void ReuseFileNumber(uint64_t file_number) {
+ if (next_file_number_ == file_number + 1) {
+ next_file_number_ = file_number;
+ }
+ }
+
+ // Return the number of Table files at the specified level.
+ int NumLevelFiles(int level) const;
+
+ // Return the combined file size of all files at the specified level.
+ int64_t NumLevelBytes(int level) const;
+
+ // Return the last sequence number.
+ uint64_t LastSequence() const { return last_sequence_; }
+
+ // Set the last sequence number to s.
+ void SetLastSequence(uint64_t s) {
+ assert(s >= last_sequence_);
+ last_sequence_ = s;
+ }
+
+ // Mark the specified file number as used.
+ void MarkFileNumberUsed(uint64_t number);
+
+ // Return the current log file number.
+ uint64_t LogNumber() const { return log_number_; }
+
+ // Return the log file number for the log file that is currently
+ // being compacted, or zero if there is no such log file.
+ uint64_t PrevLogNumber() const { return prev_log_number_; }
+
+ // Pick level and inputs for a new compaction.
+ // Returns NULL if there is no compaction to be done.
+ // Otherwise returns a pointer to a heap-allocated object that
+ // describes the compaction. Caller should delete the result.
+ Compaction* PickCompaction();
+
+ // Return a compaction object for compacting the range [begin,end] in
+ // the specified level. Returns NULL if there is nothing in that
+ // level that overlaps the specified range. Caller should delete
+ // the result.
+ Compaction* CompactRange(
+ int level,
+ const InternalKey* begin,
+ const InternalKey* end);
+
+ // Return the maximum overlapping data (in bytes) at next level for any
+ // file at a level >= 1.
+ int64_t MaxNextLevelOverlappingBytes();
+
+ // Create an iterator that reads over the compaction inputs for "*c".
+ // The caller should delete the iterator when no longer needed.
+ Iterator* MakeInputIterator(Compaction* c);
+
+ // Returns true iff some level needs a compaction.
+ bool NeedsCompaction() const {
+ Version* v = current_;
+ return (v->compaction_score_ >= 1) || (v->file_to_compact_ != NULL);
+ }
+
+ // Add all files listed in any live version to *live.
+ // May also mutate some internal state.
+ void AddLiveFiles(std::set<uint64_t>* live);
+
+ // Return the approximate offset in the database of the data for
+ // "key" as of version "v".
+ uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
+
+ // Return a human-readable short (single-line) summary of the number
+ // of files per level. Uses *scratch as backing store.
+ struct LevelSummaryStorage {
+ char buffer[100];
+ };
+ const char* LevelSummary(LevelSummaryStorage* scratch) const;
+
+ private:
+ class Builder;
+
+ friend class Compaction;
+ friend class Version;
+ // Per the Version field comments, initializes v's compaction score and level.
+ void Finalize(Version* v);
+
+ void GetRange(const std::vector<FileMetaData*>& inputs,
+ InternalKey* smallest,
+ InternalKey* largest);
+
+ void GetRange2(const std::vector<FileMetaData*>& inputs1,
+ const std::vector<FileMetaData*>& inputs2,
+ InternalKey* smallest,
+ InternalKey* largest);
+
+ void SetupOtherInputs(Compaction* c);
+
+ // Save current contents to *log
+ Status WriteSnapshot(log::Writer* log);
+
+ void AppendVersion(Version* v);
+
+ Env* const env_;
+ const std::string dbname_;
+ const Options* const options_;
+ TableCache* const table_cache_;
+ const InternalKeyComparator icmp_;
+ uint64_t next_file_number_;
+ uint64_t manifest_file_number_;
+ uint64_t last_sequence_;
+ uint64_t log_number_;
+ uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
+
+ // Opened lazily
+ WritableFile* descriptor_file_;
+ log::Writer* descriptor_log_;
+ Version dummy_versions_; // Head of circular doubly-linked list of versions.
+ Version* current_; // == dummy_versions_.prev_
+
+ // Per-level key at which the next compaction at that level should start.
+ // Either an empty string, or a valid InternalKey.
+ std::string compact_pointer_[config::kNumLevels];
+
+ // No copying allowed
+ VersionSet(const VersionSet&);
+ void operator=(const VersionSet&);
+};
+
+// A Compaction encapsulates information about a compaction.
+class Compaction {
+ public:
+ ~Compaction();
+
+ // Return the level that is being compacted. Inputs from "level"
+ // and "level+1" will be merged to produce a set of "level+1" files.
+ int level() const { return level_; }
+
+ // Return the object that holds the edits to the descriptor done
+ // by this compaction.
+ VersionEdit* edit() { return &edit_; }
+
+ // "which" must be either 0 or 1
+ int num_input_files(int which) const { return inputs_[which].size(); }
+
+ // Return the ith input file at "level()+which" ("which" must be 0 or 1).
+ FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
+
+ // Maximum size of files to build during this compaction.
+ uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
+
+ // Is this a trivial compaction that can be implemented by just
+ // moving a single input file to the next level (no merging or splitting)
+ bool IsTrivialMove() const;
+
+ // Add all inputs to this compaction as delete operations to *edit.
+ void AddInputDeletions(VersionEdit* edit);
+
+ // Returns true if the information we have available guarantees that
+ // the compaction is producing data in "level+1" for which no data exists
+ // in levels greater than "level+1".
+ bool IsBaseLevelForKey(const Slice& user_key);
+
+ // Returns true iff we should stop building the current output
+ // before processing "internal_key".
+ bool ShouldStopBefore(const Slice& internal_key);
+
+ // Release the input version for the compaction, once the compaction
+ // is successful.
+ void ReleaseInputs();
+
+ private:
+ friend class Version;
+ friend class VersionSet;
+
+ explicit Compaction(int level);
+
+ int level_;
+ uint64_t max_output_file_size_;
+ Version* input_version_;
+ VersionEdit edit_;
+
+ // Each compaction reads inputs from "level_" and "level_+1"
+ std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs
+
+ // State used to check for number of overlapping grandparent files
+ // (parent == level_ + 1, grandparent == level_ + 2)
+ std::vector<FileMetaData*> grandparents_;
+ size_t grandparent_index_; // Index in grandparents_
+ bool seen_key_; // Some output key has been seen
+ int64_t overlapped_bytes_; // Bytes of overlap between current output
+ // and grandparent files
+
+ // State for implementing IsBaseLevelForKey
+
+ // level_ptrs_ holds indices into input_version_->files_: our state
+ // is that we are positioned at one of the file ranges for each
+ // higher level than the ones involved in this compaction (i.e. for
+ // all L >= level_ + 2).
+ size_t level_ptrs_[config::kNumLevels];
+};
+
+} // namespace leveldb
+
+#endif // STORAGE_LEVELDB_DB_VERSION_SET_H_
diff --git a/src/leveldb/db/version_set_test.cc b/src/leveldb/db/version_set_test.cc
new file mode 100644
index 0000000000..501e34d133
--- /dev/null
+++ b/src/leveldb/db/version_set_test.cc
@@ -0,0 +1,179 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+class FindFileTest {  // Fixture shared by the FindFile / SomeFileOverlapsRange tests.
+ public:
+  std::vector<FileMetaData*> files_;  // Owned: allocated in Add(), freed in the destructor.
+  bool disjoint_sorted_files_;        // Passed through to SomeFileOverlapsRange().
+
+  FindFileTest() : disjoint_sorted_files_(true) { }
+
+  ~FindFileTest() {
+    for (size_t i = 0; i < files_.size(); i++) {  // size_t matches size(); no sign mismatch.
+      delete files_[i];
+    }
+  }
+  // Append a file whose key range is [smallest@smallest_seq, largest@largest_seq].
+  void Add(const char* smallest, const char* largest,
+           SequenceNumber smallest_seq = 100,
+           SequenceNumber largest_seq = 100) {
+    FileMetaData* f = new FileMetaData;
+    f->number = files_.size() + 1;
+    f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
+    f->largest = InternalKey(largest, largest_seq, kTypeValue);
+    files_.push_back(f);
+  }
+  // Return FindFile()'s answer for the internal key (key, seq 100, kTypeValue).
+  int Find(const char* key) {
+    InternalKey target(key, 100, kTypeValue);
+    InternalKeyComparator cmp(BytewiseComparator());
+    return FindFile(cmp, files_, target.Encode());
+  }
+  // Return whether any file overlaps [smallest, largest]; NULL leaves that side open.
+  bool Overlaps(const char* smallest, const char* largest) {
+    InternalKeyComparator cmp(BytewiseComparator());
+    Slice s(smallest != NULL ? smallest : "");
+    Slice l(largest != NULL ? largest : "");
+    return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_,
+                                 (smallest != NULL ? &s : NULL),
+                                 (largest != NULL ? &l : NULL));
+  }
+};
+
+TEST(FindFileTest, Empty) {  // No files were added.
+ ASSERT_EQ(0, Find("foo"));  // 0 == files_.size(): no file qualifies
+ ASSERT_TRUE(! Overlaps("a", "z"));
+ ASSERT_TRUE(! Overlaps(NULL, "z"));
+ ASSERT_TRUE(! Overlaps("a", NULL));
+ ASSERT_TRUE(! Overlaps(NULL, NULL));
+}
+
+TEST(FindFileTest, Single) {  // One file spanning user keys "p".."q".
+ Add("p", "q");
+ ASSERT_EQ(0, Find("a"));
+ ASSERT_EQ(0, Find("p"));
+ ASSERT_EQ(0, Find("p1"));
+ ASSERT_EQ(0, Find("q"));
+ ASSERT_EQ(1, Find("q1"));  // past the only file => files_.size()
+ ASSERT_EQ(1, Find("z"));
+
+ ASSERT_TRUE(! Overlaps("a", "b"));
+ ASSERT_TRUE(! Overlaps("z1", "z2"));
+ ASSERT_TRUE(Overlaps("a", "p"));
+ ASSERT_TRUE(Overlaps("a", "q"));
+ ASSERT_TRUE(Overlaps("a", "z"));
+ ASSERT_TRUE(Overlaps("p", "p1"));
+ ASSERT_TRUE(Overlaps("p", "q"));
+ ASSERT_TRUE(Overlaps("p", "z"));
+ ASSERT_TRUE(Overlaps("p1", "p2"));
+ ASSERT_TRUE(Overlaps("p1", "z"));
+ ASSERT_TRUE(Overlaps("q", "q"));
+ ASSERT_TRUE(Overlaps("q", "q1"));
+
+ ASSERT_TRUE(! Overlaps(NULL, "j"));  // NULL leaves that side of the range open
+ ASSERT_TRUE(! Overlaps("r", NULL));
+ ASSERT_TRUE(Overlaps(NULL, "p"));
+ ASSERT_TRUE(Overlaps(NULL, "p1"));
+ ASSERT_TRUE(Overlaps("q", NULL));
+ ASSERT_TRUE(Overlaps(NULL, NULL));
+}
+
+
+TEST(FindFileTest, Multiple) {  // Four files; "200" ends file 0 and starts file 1.
+ Add("150", "200");
+ Add("200", "250");
+ Add("300", "350");
+ Add("400", "450");
+ ASSERT_EQ(0, Find("100"));
+ ASSERT_EQ(0, Find("150"));
+ ASSERT_EQ(0, Find("151"));
+ ASSERT_EQ(0, Find("199"));
+ ASSERT_EQ(0, Find("200"));  // shared boundary key resolves to the earlier file
+ ASSERT_EQ(1, Find("201"));
+ ASSERT_EQ(1, Find("249"));
+ ASSERT_EQ(1, Find("250"));
+ ASSERT_EQ(2, Find("251"));  // key in a gap => next file
+ ASSERT_EQ(2, Find("299"));
+ ASSERT_EQ(2, Find("300"));
+ ASSERT_EQ(2, Find("349"));
+ ASSERT_EQ(2, Find("350"));
+ ASSERT_EQ(3, Find("351"));
+ ASSERT_EQ(3, Find("400"));
+ ASSERT_EQ(3, Find("450"));
+ ASSERT_EQ(4, Find("451"));  // past every file => files_.size()
+
+ ASSERT_TRUE(! Overlaps("100", "149"));
+ ASSERT_TRUE(! Overlaps("251", "299"));
+ ASSERT_TRUE(! Overlaps("451", "500"));
+ ASSERT_TRUE(! Overlaps("351", "399"));
+
+ ASSERT_TRUE(Overlaps("100", "150"));
+ ASSERT_TRUE(Overlaps("100", "200"));
+ ASSERT_TRUE(Overlaps("100", "300"));
+ ASSERT_TRUE(Overlaps("100", "400"));
+ ASSERT_TRUE(Overlaps("100", "500"));
+ ASSERT_TRUE(Overlaps("375", "400"));
+ ASSERT_TRUE(Overlaps("450", "450"));
+ ASSERT_TRUE(Overlaps("450", "500"));
+}
+
+TEST(FindFileTest, MultipleNullBoundaries) {  // Same files as Multiple, open-ended probes.
+ Add("150", "200");
+ Add("200", "250");
+ Add("300", "350");
+ Add("400", "450");
+ ASSERT_TRUE(! Overlaps(NULL, "149"));  // everything up to "149": before all files
+ ASSERT_TRUE(! Overlaps("451", NULL));  // everything from "451": after all files
+ ASSERT_TRUE(Overlaps(NULL, NULL));
+ ASSERT_TRUE(Overlaps(NULL, "150"));
+ ASSERT_TRUE(Overlaps(NULL, "199"));
+ ASSERT_TRUE(Overlaps(NULL, "200"));
+ ASSERT_TRUE(Overlaps(NULL, "201"));
+ ASSERT_TRUE(Overlaps(NULL, "400"));
+ ASSERT_TRUE(Overlaps(NULL, "800"));
+ ASSERT_TRUE(Overlaps("100", NULL));
+ ASSERT_TRUE(Overlaps("200", NULL));
+ ASSERT_TRUE(Overlaps("449", NULL));
+ ASSERT_TRUE(Overlaps("450", NULL));
+}
+
+TEST(FindFileTest, OverlapSequenceChecks) {
+ Add("200", "200", 5000, 3000);  // one user key; bounds carry sequences 5000 and 3000
+ ASSERT_TRUE(! Overlaps("199", "199"));
+ ASSERT_TRUE(! Overlaps("201", "300"));
+ ASSERT_TRUE(Overlaps("200", "200"));  // probes are plain user keys (no sequence)
+ ASSERT_TRUE(Overlaps("190", "200"));
+ ASSERT_TRUE(Overlaps("200", "210"));
+}
+
+TEST(FindFileTest, OverlappingFiles) {
+ Add("150", "600");
+ Add("400", "500");  // lies inside the first file's range
+ disjoint_sorted_files_ = false;  // so the disjoint/sorted precondition is not claimed
+ ASSERT_TRUE(! Overlaps("100", "149"));
+ ASSERT_TRUE(! Overlaps("601", "700"));
+ ASSERT_TRUE(Overlaps("100", "150"));
+ ASSERT_TRUE(Overlaps("100", "200"));
+ ASSERT_TRUE(Overlaps("100", "300"));
+ ASSERT_TRUE(Overlaps("100", "400"));
+ ASSERT_TRUE(Overlaps("100", "500"));
+ ASSERT_TRUE(Overlaps("375", "400"));
+ ASSERT_TRUE(Overlaps("450", "450"));
+ ASSERT_TRUE(Overlaps("450", "500"));
+ ASSERT_TRUE(Overlaps("450", "700"));
+ ASSERT_TRUE(Overlaps("600", "700"));
+}
+
+} // namespace leveldb
+
+int main(int argc, char** argv) {  // argc/argv are not used
+ return leveldb::test::RunAllTests();  // exit status is the harness's return value
+}
diff --git a/src/leveldb/db/write_batch.cc b/src/leveldb/db/write_batch.cc
new file mode 100644
index 0000000000..33f4a4257e
--- /dev/null
+++ b/src/leveldb/db/write_batch.cc
@@ -0,0 +1,147 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch::rep_ :=
+// sequence: fixed64
+// count: fixed32
+// data: record[count]
+// record :=
+// kTypeValue varstring varstring |
+// kTypeDeletion varstring
+// varstring :=
+// len: varint32
+// data: uint8[len]
+
+#include "leveldb/write_batch.h"
+
+#include "leveldb/db.h"
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+// WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
+static const size_t kHeader = 12;  // 8 (sequence) + 4 (count)
+
+WriteBatch::WriteBatch() {
+ Clear();  // start from a header-only representation
+}
+
+WriteBatch::~WriteBatch() { }
+
+WriteBatch::Handler::~Handler() { }
+
+void WriteBatch::Clear() {
+  // Reset the representation to exactly kHeader zero bytes (an empty batch).
+  rep_.assign(kHeader, '\0');
+}
+
+Status WriteBatch::Iterate(Handler* handler) const {
+  // Dispatch each record after the header to "handler", stopping at the
+  // first malformed record (earlier records have already been delivered).
+  Slice rest(rep_);
+  if (rest.size() < kHeader) {
+    return Status::Corruption("malformed WriteBatch (too small)");
+  }
+  rest.remove_prefix(kHeader);
+
+  Slice key, value;
+  int seen = 0;
+  while (!rest.empty()) {
+    const char tag = rest[0];
+    rest.remove_prefix(1);
+    seen++;
+    switch (tag) {
+      case kTypeValue:
+        if (!GetLengthPrefixedSlice(&rest, &key) ||
+            !GetLengthPrefixedSlice(&rest, &value)) {
+          return Status::Corruption("bad WriteBatch Put");
+        }
+        handler->Put(key, value);
+        break;
+      case kTypeDeletion:
+        if (!GetLengthPrefixedSlice(&rest, &key)) {
+          return Status::Corruption("bad WriteBatch Delete");
+        }
+        handler->Delete(key);
+        break;
+      default:
+        return Status::Corruption("unknown WriteBatch tag");
+    }
+  }
+  // The count stored in the header must match the number of records parsed.
+  if (seen == WriteBatchInternal::Count(this)) {
+    return Status::OK();
+  }
+  return Status::Corruption("WriteBatch has wrong count");
+}
+
+int WriteBatchInternal::Count(const WriteBatch* b) {
+ return DecodeFixed32(b->rep_.data() + 8);  // count follows the 8-byte sequence
+}
+
+void WriteBatchInternal::SetCount(WriteBatch* b, int n) {
+ EncodeFixed32(&b->rep_[8], n);  // overwrite the count field in place
+}
+
+SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
+ return SequenceNumber(DecodeFixed64(b->rep_.data()));  // first 8 header bytes
+}
+
+void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
+ EncodeFixed64(&b->rep_[0], seq);  // overwrite the sequence field in place
+}
+
+void WriteBatch::Put(const Slice& key, const Slice& value) {
+ WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
+ rep_.push_back(static_cast<char>(kTypeValue));  // record tag
+ PutLengthPrefixedSlice(&rep_, key);  // varstring key
+ PutLengthPrefixedSlice(&rep_, value);  // varstring value
+}
+
+void WriteBatch::Delete(const Slice& key) {
+ WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
+ rep_.push_back(static_cast<char>(kTypeDeletion));  // record tag
+ PutLengthPrefixedSlice(&rep_, key);  // varstring key; deletions carry no value
+}
+
+namespace {
+class MemTableInserter : public WriteBatch::Handler {  // adds each record to a MemTable
+ public:
+ SequenceNumber sequence_;  // sequence number given to the next record
+ MemTable* mem_;  // destination of every Add() call
+
+ virtual void Put(const Slice& key, const Slice& value) {
+ mem_->Add(sequence_, kTypeValue, key, value);
+ sequence_++;  // each record consumes one sequence number
+ }
+ virtual void Delete(const Slice& key) {
+ mem_->Add(sequence_, kTypeDeletion, key, Slice());  // empty value for deletions
+ sequence_++;
+ }
+};
+} // namespace
+
+Status WriteBatchInternal::InsertInto(const WriteBatch* b,
+ MemTable* memtable) {
+ MemTableInserter inserter;
+ inserter.sequence_ = WriteBatchInternal::Sequence(b);  // numbering starts at the batch's sequence
+ inserter.mem_ = memtable;
+ return b->Iterate(&inserter);  // status of parsing the batch
+}
+
+void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
+ assert(contents.size() >= kHeader);  // must hold at least a full header
+ b->rep_.assign(contents.data(), contents.size());  // copies the bytes into rep_
+}
+
+void WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src) {
+  assert(src->rep_.size() >= kHeader);  // check before Count(src) reads src's header
+  SetCount(dst, Count(dst) + Count(src));
+  dst->rep_.append(src->rep_.data() + kHeader, src->rep_.size() - kHeader);  // records only; src's header is not copied
+}
+
+} // namespace leveldb
diff --git a/src/leveldb/db/write_batch_internal.h b/src/leveldb/db/write_batch_internal.h
new file mode 100644
index 0000000000..4423a7f318
--- /dev/null
+++ b/src/leveldb/db/write_batch_internal.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
+#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
+
+#include "leveldb/write_batch.h"
+
+namespace leveldb {
+
+class MemTable;
+
+// WriteBatchInternal provides static methods for manipulating a
+// WriteBatch that we don't want in the public WriteBatch interface.
+class WriteBatchInternal {
+ public:
+ // Return the number of entries in the batch.
+ static int Count(const WriteBatch* batch);
+
+ // Set the count for the number of entries in the batch.
+ static void SetCount(WriteBatch* batch, int n);
+
+ // Return the sequence number for the start of this batch.
+ static SequenceNumber Sequence(const WriteBatch* batch);
+
+ // Store the specified number as the sequence number for the start of
+ // this batch.
+ static void SetSequence(WriteBatch* batch, SequenceNumber seq);
+ // Return the batch's serialized representation (header included) as a Slice.
+ static Slice Contents(const WriteBatch* batch) {
+ return Slice(batch->rep_);
+ }
+ // Return the size in bytes of the batch's serialized representation.
+ static size_t ByteSize(const WriteBatch* batch) {
+ return batch->rep_.size();
+ }
+ // Replace the batch's representation with "contents" (at least a full header).
+ static void SetContents(WriteBatch* batch, const Slice& contents);
+ // Add the batch's entries to "memtable", numbered from Sequence(batch).
+ static Status InsertInto(const WriteBatch* batch, MemTable* memtable);
+ // Append the entries of "src" to "dst"; dst's sequence number is unchanged.
+ static void Append(WriteBatch* dst, const WriteBatch* src);
+};
+
+} // namespace leveldb
+
+
+#endif // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
diff --git a/src/leveldb/db/write_batch_test.cc b/src/leveldb/db/write_batch_test.cc
new file mode 100644
index 0000000000..9064e3d85e
--- /dev/null
+++ b/src/leveldb/db/write_batch_test.cc
@@ -0,0 +1,120 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/db.h"
+
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/env.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+static std::string PrintContents(WriteBatch* b) {  // renders b's entries as a string
+ InternalKeyComparator cmp(BytewiseComparator());
+ MemTable* mem = new MemTable(cmp);
+ mem->Ref();  // balanced by the Unref() below
+ std::string state;
+ Status s = WriteBatchInternal::InsertInto(b, mem);
+ int count = 0;
+ Iterator* iter = mem->NewIterator();
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {  // memtable iteration order
+ ParsedInternalKey ikey;
+ ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
+ switch (ikey.type) {
+ case kTypeValue:
+ state.append("Put(");
+ state.append(ikey.user_key.ToString());
+ state.append(", ");
+ state.append(iter->value().ToString());
+ state.append(")");
+ count++;
+ break;
+ case kTypeDeletion:
+ state.append("Delete(");
+ state.append(ikey.user_key.ToString());
+ state.append(")");
+ count++;
+ break;
+ }
+ state.append("@");  // every entry is suffixed with its sequence number
+ state.append(NumberToString(ikey.sequence));
+ }
+ delete iter;
+ if (!s.ok()) {  // entries inserted before the failure are still listed above
+ state.append("ParseError()");
+ } else if (count != WriteBatchInternal::Count(b)) {
+ state.append("CountMismatch()");
+ }
+ mem->Unref();
+ return state;
+}
+
+class WriteBatchTest { };  // empty class named by the TEST(WriteBatchTest, ...) cases
+
+TEST(WriteBatchTest, Empty) {  // a fresh batch has no entries
+ WriteBatch batch;
+ ASSERT_EQ("", PrintContents(&batch));
+ ASSERT_EQ(0, WriteBatchInternal::Count(&batch));
+}
+
+TEST(WriteBatchTest, Multiple) {
+ WriteBatch batch;
+ batch.Put(Slice("foo"), Slice("bar"));
+ batch.Delete(Slice("box"));
+ batch.Put(Slice("baz"), Slice("boo"));
+ WriteBatchInternal::SetSequence(&batch, 100);  // entries get 100, 101, 102 in insertion order
+ ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch));
+ ASSERT_EQ(3, WriteBatchInternal::Count(&batch));
+ ASSERT_EQ("Put(baz, boo)@102"  // printed in memtable order, not insertion order
+ "Delete(box)@101"
+ "Put(foo, bar)@100",
+ PrintContents(&batch));
+}
+
+TEST(WriteBatchTest, Corruption) {
+ WriteBatch batch;
+ batch.Put(Slice("foo"), Slice("bar"));
+ batch.Delete(Slice("box"));
+ WriteBatchInternal::SetSequence(&batch, 200);
+ Slice contents = WriteBatchInternal::Contents(&batch);
+ WriteBatchInternal::SetContents(&batch,
+ Slice(contents.data(),contents.size()-1));  // drop the last byte
+ ASSERT_EQ("Put(foo, bar)@200"  // the intact first record is still applied
+ "ParseError()",
+ PrintContents(&batch));
+}
+
+TEST(WriteBatchTest, Append) {
+ WriteBatch b1, b2;
+ WriteBatchInternal::SetSequence(&b1, 200);
+ WriteBatchInternal::SetSequence(&b2, 300);  // never appears below: b1 keeps its own sequence
+ WriteBatchInternal::Append(&b1, &b2);  // appending an empty batch adds nothing
+ ASSERT_EQ("",
+ PrintContents(&b1));
+ b2.Put("a", "va");
+ WriteBatchInternal::Append(&b1, &b2);
+ ASSERT_EQ("Put(a, va)@200",
+ PrintContents(&b1));
+ b2.Clear();
+ b2.Put("b", "vb");
+ WriteBatchInternal::Append(&b1, &b2);
+ ASSERT_EQ("Put(a, va)@200"
+ "Put(b, vb)@201",
+ PrintContents(&b1));
+ b2.Delete("foo");  // b2 was not cleared, so it still holds Put(b, vb)
+ WriteBatchInternal::Append(&b1, &b2);  // hence Put(b, vb) is appended a second time
+ ASSERT_EQ("Put(a, va)@200"
+ "Put(b, vb)@202"
+ "Put(b, vb)@201"
+ "Delete(foo)@203",
+ PrintContents(&b1));
+}
+
+} // namespace leveldb
+
+int main(int argc, char** argv) {  // argc/argv are not used
+ return leveldb::test::RunAllTests();  // exit status is the harness's return value
+}