diff options
Diffstat (limited to 'src/leveldb/db')
41 files changed, 0 insertions, 12942 deletions
diff --git a/src/leveldb/db/builder.cc b/src/leveldb/db/builder.cc deleted file mode 100644 index f419882197..0000000000 --- a/src/leveldb/db/builder.cc +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/builder.h" - -#include "db/filename.h" -#include "db/dbformat.h" -#include "db/table_cache.h" -#include "db/version_edit.h" -#include "leveldb/db.h" -#include "leveldb/env.h" -#include "leveldb/iterator.h" - -namespace leveldb { - -Status BuildTable(const std::string& dbname, - Env* env, - const Options& options, - TableCache* table_cache, - Iterator* iter, - FileMetaData* meta) { - Status s; - meta->file_size = 0; - iter->SeekToFirst(); - - std::string fname = TableFileName(dbname, meta->number); - if (iter->Valid()) { - WritableFile* file; - s = env->NewWritableFile(fname, &file); - if (!s.ok()) { - return s; - } - - TableBuilder* builder = new TableBuilder(options, file); - meta->smallest.DecodeFrom(iter->key()); - for (; iter->Valid(); iter->Next()) { - Slice key = iter->key(); - meta->largest.DecodeFrom(key); - builder->Add(key, iter->value()); - } - - // Finish and check for builder errors - if (s.ok()) { - s = builder->Finish(); - if (s.ok()) { - meta->file_size = builder->FileSize(); - assert(meta->file_size > 0); - } - } else { - builder->Abandon(); - } - delete builder; - - // Finish and check for file errors - if (s.ok()) { - s = file->Sync(); - } - if (s.ok()) { - s = file->Close(); - } - delete file; - file = NULL; - - if (s.ok()) { - // Verify that the table is usable - Iterator* it = table_cache->NewIterator(ReadOptions(), - meta->number, - meta->file_size); - s = it->status(); - delete it; - } - } - - // Check for input iterator errors - if (!iter->status().ok()) { - s = iter->status(); - } - - if (s.ok() && meta->file_size > 0) { - // Keep it - } else { - env->DeleteFile(fname); - } - return s; -} - -} // namespace leveldb diff --git a/src/leveldb/db/builder.h b/src/leveldb/db/builder.h deleted file mode 100644 index 62431fcf44..0000000000 --- a/src/leveldb/db/builder.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_BUILDER_H_ -#define STORAGE_LEVELDB_DB_BUILDER_H_ - -#include "leveldb/status.h" - -namespace leveldb { - -struct Options; -struct FileMetaData; - -class Env; -class Iterator; -class TableCache; -class VersionEdit; - -// Build a Table file from the contents of *iter. The generated file -// will be named according to meta->number. On success, the rest of -// *meta will be filled with metadata about the generated table. -// If no data is present in *iter, meta->file_size will be set to -// zero, and no Table file will be produced. -extern Status BuildTable(const std::string& dbname, - Env* env, - const Options& options, - TableCache* table_cache, - Iterator* iter, - FileMetaData* meta); - -} // namespace leveldb - -#endif // STORAGE_LEVELDB_DB_BUILDER_H_ diff --git a/src/leveldb/db/c.cc b/src/leveldb/db/c.cc deleted file mode 100644 index 08ff0ad90a..0000000000 --- a/src/leveldb/db/c.cc +++ /dev/null @@ -1,595 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/c.h" - -#include <stdlib.h> -#include <unistd.h> -#include "leveldb/cache.h" -#include "leveldb/comparator.h" -#include "leveldb/db.h" -#include "leveldb/env.h" -#include "leveldb/filter_policy.h" -#include "leveldb/iterator.h" -#include "leveldb/options.h" -#include "leveldb/status.h" -#include "leveldb/write_batch.h" - -using leveldb::Cache; -using leveldb::Comparator; -using leveldb::CompressionType; -using leveldb::DB; -using leveldb::Env; -using leveldb::FileLock; -using leveldb::FilterPolicy; -using leveldb::Iterator; -using leveldb::kMajorVersion; -using leveldb::kMinorVersion; -using leveldb::Logger; -using leveldb::NewBloomFilterPolicy; -using leveldb::NewLRUCache; -using leveldb::Options; -using leveldb::RandomAccessFile; -using leveldb::Range; -using leveldb::ReadOptions; -using leveldb::SequentialFile; -using leveldb::Slice; -using leveldb::Snapshot; -using leveldb::Status; -using leveldb::WritableFile; -using leveldb::WriteBatch; -using leveldb::WriteOptions; - -extern "C" { - -struct leveldb_t { DB* rep; }; -struct leveldb_iterator_t { Iterator* rep; }; -struct leveldb_writebatch_t { WriteBatch rep; }; -struct leveldb_snapshot_t { const Snapshot* rep; }; -struct leveldb_readoptions_t { ReadOptions rep; }; -struct leveldb_writeoptions_t { WriteOptions rep; }; -struct leveldb_options_t { Options rep; }; -struct leveldb_cache_t { Cache* rep; }; -struct leveldb_seqfile_t { SequentialFile* rep; }; -struct leveldb_randomfile_t { RandomAccessFile* rep; }; -struct leveldb_writablefile_t { WritableFile* rep; }; -struct leveldb_logger_t { Logger* rep; }; -struct leveldb_filelock_t { FileLock* rep; }; - -struct leveldb_comparator_t : public Comparator { - void* state_; - void (*destructor_)(void*); - int (*compare_)( - void*, - const char* a, size_t alen, - const char* b, size_t blen); - const char* (*name_)(void*); - - virtual ~leveldb_comparator_t() { - (*destructor_)(state_); - } - - virtual int Compare(const Slice& a, const Slice& b) const { - return (*compare_)(state_, a.data(), a.size(), b.data(), b.size()); - } - - virtual const char* Name() const { - return (*name_)(state_); - } - - // No-ops since the C binding does not support key shortening methods. - virtual void FindShortestSeparator(std::string*, const Slice&) const { } - virtual void FindShortSuccessor(std::string* key) const { } -}; - -struct leveldb_filterpolicy_t : public FilterPolicy { - void* state_; - void (*destructor_)(void*); - const char* (*name_)(void*); - char* (*create_)( - void*, - const char* const* key_array, const size_t* key_length_array, - int num_keys, - size_t* filter_length); - unsigned char (*key_match_)( - void*, - const char* key, size_t length, - const char* filter, size_t filter_length); - - virtual ~leveldb_filterpolicy_t() { - (*destructor_)(state_); - } - - virtual const char* Name() const { - return (*name_)(state_); - } - - virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { - std::vector<const char*> key_pointers(n); - std::vector<size_t> key_sizes(n); - for (int i = 0; i < n; i++) { - key_pointers[i] = keys[i].data(); - key_sizes[i] = keys[i].size(); - } - size_t len; - char* filter = (*create_)(state_, &key_pointers[0], &key_sizes[0], n, &len); - dst->append(filter, len); - free(filter); - } - - virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { - return (*key_match_)(state_, key.data(), key.size(), - filter.data(), filter.size()); - } -}; - -struct leveldb_env_t { - Env* rep; - bool is_default; -}; - -static bool SaveError(char** errptr, const Status& s) { - assert(errptr != NULL); - if (s.ok()) { - return false; - } else if (*errptr == NULL) { - *errptr = strdup(s.ToString().c_str()); - } else { - // TODO(sanjay): Merge with existing error? - free(*errptr); - *errptr = strdup(s.ToString().c_str()); - } - return true; -} - -static char* CopyString(const std::string& str) { - char* result = reinterpret_cast<char*>(malloc(sizeof(char) * str.size())); - memcpy(result, str.data(), sizeof(char) * str.size()); - return result; -} - -leveldb_t* leveldb_open( - const leveldb_options_t* options, - const char* name, - char** errptr) { - DB* db; - if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) { - return NULL; - } - leveldb_t* result = new leveldb_t; - result->rep = db; - return result; -} - -void leveldb_close(leveldb_t* db) { - delete db->rep; - delete db; -} - -void leveldb_put( - leveldb_t* db, - const leveldb_writeoptions_t* options, - const char* key, size_t keylen, - const char* val, size_t vallen, - char** errptr) { - SaveError(errptr, - db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen))); -} - -void leveldb_delete( - leveldb_t* db, - const leveldb_writeoptions_t* options, - const char* key, size_t keylen, - char** errptr) { - SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen))); -} - - -void leveldb_write( - leveldb_t* db, - const leveldb_writeoptions_t* options, - leveldb_writebatch_t* batch, - char** errptr) { - SaveError(errptr, db->rep->Write(options->rep, &batch->rep)); -} - -char* leveldb_get( - leveldb_t* db, - const leveldb_readoptions_t* options, - const char* key, size_t keylen, - size_t* vallen, - char** errptr) { - char* result = NULL; - std::string tmp; - Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp); - if (s.ok()) { - *vallen = tmp.size(); - result = CopyString(tmp); - } else { - *vallen = 0; - if (!s.IsNotFound()) { - SaveError(errptr, s); - } - } - return result; -} - -leveldb_iterator_t* leveldb_create_iterator( - leveldb_t* db, - const leveldb_readoptions_t* options) { - leveldb_iterator_t* result = new leveldb_iterator_t; - result->rep = db->rep->NewIterator(options->rep); - return result; -} - -const leveldb_snapshot_t* leveldb_create_snapshot( - leveldb_t* db) { - leveldb_snapshot_t* result = new leveldb_snapshot_t; - result->rep = db->rep->GetSnapshot(); - return result; -} - -void leveldb_release_snapshot( - leveldb_t* db, - const leveldb_snapshot_t* snapshot) { - db->rep->ReleaseSnapshot(snapshot->rep); - delete snapshot; -} - -char* leveldb_property_value( - leveldb_t* db, - const char* propname) { - std::string tmp; - if (db->rep->GetProperty(Slice(propname), &tmp)) { - // We use strdup() since we expect human readable output. - return strdup(tmp.c_str()); - } else { - return NULL; - } -} - -void leveldb_approximate_sizes( - leveldb_t* db, - int num_ranges, - const char* const* range_start_key, const size_t* range_start_key_len, - const char* const* range_limit_key, const size_t* range_limit_key_len, - uint64_t* sizes) { - Range* ranges = new Range[num_ranges]; - for (int i = 0; i < num_ranges; i++) { - ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]); - ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]); - } - db->rep->GetApproximateSizes(ranges, num_ranges, sizes); - delete[] ranges; -} - -void leveldb_compact_range( - leveldb_t* db, - const char* start_key, size_t start_key_len, - const char* limit_key, size_t limit_key_len) { - Slice a, b; - db->rep->CompactRange( - // Pass NULL Slice if corresponding "const char*" is NULL - (start_key ? (a = Slice(start_key, start_key_len), &a) : NULL), - (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : NULL)); -} - -void leveldb_destroy_db( - const leveldb_options_t* options, - const char* name, - char** errptr) { - SaveError(errptr, DestroyDB(name, options->rep)); -} - -void leveldb_repair_db( - const leveldb_options_t* options, - const char* name, - char** errptr) { - SaveError(errptr, RepairDB(name, options->rep)); -} - -void leveldb_iter_destroy(leveldb_iterator_t* iter) { - delete iter->rep; - delete iter; -} - -unsigned char leveldb_iter_valid(const leveldb_iterator_t* iter) { - return iter->rep->Valid(); -} - -void leveldb_iter_seek_to_first(leveldb_iterator_t* iter) { - iter->rep->SeekToFirst(); -} - -void leveldb_iter_seek_to_last(leveldb_iterator_t* iter) { - iter->rep->SeekToLast(); -} - -void leveldb_iter_seek(leveldb_iterator_t* iter, const char* k, size_t klen) { - iter->rep->Seek(Slice(k, klen)); -} - -void leveldb_iter_next(leveldb_iterator_t* iter) { - iter->rep->Next(); -} - -void leveldb_iter_prev(leveldb_iterator_t* iter) { - iter->rep->Prev(); -} - -const char* leveldb_iter_key(const leveldb_iterator_t* iter, size_t* klen) { - Slice s = iter->rep->key(); - *klen = s.size(); - return s.data(); -} - -const char* leveldb_iter_value(const leveldb_iterator_t* iter, size_t* vlen) { - Slice s = iter->rep->value(); - *vlen = s.size(); - return s.data(); -} - -void leveldb_iter_get_error(const leveldb_iterator_t* iter, char** errptr) { - SaveError(errptr, iter->rep->status()); -} - -leveldb_writebatch_t* leveldb_writebatch_create() { - return new leveldb_writebatch_t; -} - -void leveldb_writebatch_destroy(leveldb_writebatch_t* b) { - delete b; -} - -void leveldb_writebatch_clear(leveldb_writebatch_t* b) { - b->rep.Clear(); -} - -void leveldb_writebatch_put( - leveldb_writebatch_t* b, - const char* key, size_t klen, - const char* val, size_t vlen) { - b->rep.Put(Slice(key, klen), Slice(val, vlen)); -} - -void leveldb_writebatch_delete( - leveldb_writebatch_t* b, - const char* key, size_t klen) { - b->rep.Delete(Slice(key, klen)); -} - -void leveldb_writebatch_iterate( - leveldb_writebatch_t* b, - void* state, - void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen), - void (*deleted)(void*, const char* k, size_t klen)) { - class H : public WriteBatch::Handler { - public: - void* state_; - void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen); - void (*deleted_)(void*, const char* k, size_t klen); - virtual void Put(const Slice& key, const Slice& value) { - (*put_)(state_, key.data(), key.size(), value.data(), value.size()); - } - virtual void Delete(const Slice& key) { - (*deleted_)(state_, key.data(), key.size()); - } - }; - H handler; - handler.state_ = state; - handler.put_ = put; - handler.deleted_ = deleted; - b->rep.Iterate(&handler); -} - -leveldb_options_t* leveldb_options_create() { - return new leveldb_options_t; -} - -void leveldb_options_destroy(leveldb_options_t* options) { - delete options; -} - -void leveldb_options_set_comparator( - leveldb_options_t* opt, - leveldb_comparator_t* cmp) { - opt->rep.comparator = cmp; -} - -void leveldb_options_set_filter_policy( - leveldb_options_t* opt, - leveldb_filterpolicy_t* policy) { - opt->rep.filter_policy = policy; -} - -void leveldb_options_set_create_if_missing( - leveldb_options_t* opt, unsigned char v) { - opt->rep.create_if_missing = v; -} - -void leveldb_options_set_error_if_exists( - leveldb_options_t* opt, unsigned char v) { - opt->rep.error_if_exists = v; -} - -void leveldb_options_set_paranoid_checks( - leveldb_options_t* opt, unsigned char v) { - opt->rep.paranoid_checks = v; -} - -void leveldb_options_set_env(leveldb_options_t* opt, leveldb_env_t* env) { - opt->rep.env = (env ? env->rep : NULL); -} - -void leveldb_options_set_info_log(leveldb_options_t* opt, leveldb_logger_t* l) { - opt->rep.info_log = (l ? l->rep : NULL); -} - -void leveldb_options_set_write_buffer_size(leveldb_options_t* opt, size_t s) { - opt->rep.write_buffer_size = s; -} - -void leveldb_options_set_max_open_files(leveldb_options_t* opt, int n) { - opt->rep.max_open_files = n; -} - -void leveldb_options_set_cache(leveldb_options_t* opt, leveldb_cache_t* c) { - opt->rep.block_cache = c->rep; -} - -void leveldb_options_set_block_size(leveldb_options_t* opt, size_t s) { - opt->rep.block_size = s; -} - -void leveldb_options_set_block_restart_interval(leveldb_options_t* opt, int n) { - opt->rep.block_restart_interval = n; -} - -void leveldb_options_set_compression(leveldb_options_t* opt, int t) { - opt->rep.compression = static_cast<CompressionType>(t); -} - -leveldb_comparator_t* leveldb_comparator_create( - void* state, - void (*destructor)(void*), - int (*compare)( - void*, - const char* a, size_t alen, - const char* b, size_t blen), - const char* (*name)(void*)) { - leveldb_comparator_t* result = new leveldb_comparator_t; - result->state_ = state; - result->destructor_ = destructor; - result->compare_ = compare; - result->name_ = name; - return result; -} - -void leveldb_comparator_destroy(leveldb_comparator_t* cmp) { - delete cmp; -} - -leveldb_filterpolicy_t* leveldb_filterpolicy_create( - void* state, - void (*destructor)(void*), - char* (*create_filter)( - void*, - const char* const* key_array, const size_t* key_length_array, - int num_keys, - size_t* filter_length), - unsigned char (*key_may_match)( - void*, - const char* key, size_t length, - const char* filter, size_t filter_length), - const char* (*name)(void*)) { - leveldb_filterpolicy_t* result = new leveldb_filterpolicy_t; - result->state_ = state; - result->destructor_ = destructor; - result->create_ = create_filter; - result->key_match_ = key_may_match; - result->name_ = name; - return result; -} - -void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t* filter) { - delete filter; -} - -leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(int bits_per_key) { - // Make a leveldb_filterpolicy_t, but override all of its methods so - // they delegate to a NewBloomFilterPolicy() instead of user - // supplied C functions. - struct Wrapper : public leveldb_filterpolicy_t { - const FilterPolicy* rep_; - ~Wrapper() { delete rep_; } - const char* Name() const { return rep_->Name(); } - void CreateFilter(const Slice* keys, int n, std::string* dst) const { - return rep_->CreateFilter(keys, n, dst); - } - bool KeyMayMatch(const Slice& key, const Slice& filter) const { - return rep_->KeyMayMatch(key, filter); - } - static void DoNothing(void*) { } - }; - Wrapper* wrapper = new Wrapper; - wrapper->rep_ = NewBloomFilterPolicy(bits_per_key); - wrapper->state_ = NULL; - wrapper->destructor_ = &Wrapper::DoNothing; - return wrapper; -} - -leveldb_readoptions_t* leveldb_readoptions_create() { - return new leveldb_readoptions_t; -} - -void leveldb_readoptions_destroy(leveldb_readoptions_t* opt) { - delete opt; -} - -void leveldb_readoptions_set_verify_checksums( - leveldb_readoptions_t* opt, - unsigned char v) { - opt->rep.verify_checksums = v; -} - -void leveldb_readoptions_set_fill_cache( - leveldb_readoptions_t* opt, unsigned char v) { - opt->rep.fill_cache = v; -} - -void leveldb_readoptions_set_snapshot( - leveldb_readoptions_t* opt, - const leveldb_snapshot_t* snap) { - opt->rep.snapshot = (snap ? snap->rep : NULL); -} - -leveldb_writeoptions_t* leveldb_writeoptions_create() { - return new leveldb_writeoptions_t; -} - -void leveldb_writeoptions_destroy(leveldb_writeoptions_t* opt) { - delete opt; -} - -void leveldb_writeoptions_set_sync( - leveldb_writeoptions_t* opt, unsigned char v) { - opt->rep.sync = v; -} - -leveldb_cache_t* leveldb_cache_create_lru(size_t capacity) { - leveldb_cache_t* c = new leveldb_cache_t; - c->rep = NewLRUCache(capacity); - return c; -} - -void leveldb_cache_destroy(leveldb_cache_t* cache) { - delete cache->rep; - delete cache; -} - -leveldb_env_t* leveldb_create_default_env() { - leveldb_env_t* result = new leveldb_env_t; - result->rep = Env::Default(); - result->is_default = true; - return result; -} - -void leveldb_env_destroy(leveldb_env_t* env) { - if (!env->is_default) delete env->rep; - delete env; -} - -void leveldb_free(void* ptr) { - free(ptr); -} - -int leveldb_major_version() { - return kMajorVersion; -} - -int leveldb_minor_version() { - return kMinorVersion; -} - -} // end extern "C" diff --git a/src/leveldb/db/c_test.c b/src/leveldb/db/c_test.c deleted file mode 100644 index 7cd5ee0207..0000000000 --- a/src/leveldb/db/c_test.c +++ /dev/null @@ -1,390 +0,0 @@ -/* Copyright (c) 2011 The LevelDB Authors. All rights reserved. - Use of this source code is governed by a BSD-style license that can be - found in the LICENSE file. See the AUTHORS file for names of contributors. */ - -#include "leveldb/c.h" - -#include <stddef.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/types.h> -#include <unistd.h> - -const char* phase = ""; -static char dbname[200]; - -static void StartPhase(const char* name) { - fprintf(stderr, "=== Test %s\n", name); - phase = name; -} - -static const char* GetTempDir(void) { - const char* ret = getenv("TEST_TMPDIR"); - if (ret == NULL || ret[0] == '\0') - ret = "/tmp"; - return ret; -} - -#define CheckNoError(err) \ - if ((err) != NULL) { \ - fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \ - abort(); \ - } - -#define CheckCondition(cond) \ - if (!(cond)) { \ - fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \ - abort(); \ - } - -static void CheckEqual(const char* expected, const char* v, size_t n) { - if (expected == NULL && v == NULL) { - // ok - } else if (expected != NULL && v != NULL && n == strlen(expected) && - memcmp(expected, v, n) == 0) { - // ok - return; - } else { - fprintf(stderr, "%s: expected '%s', got '%s'\n", - phase, - (expected ? expected : "(null)"), - (v ? v : "(null")); - abort(); - } -} - -static void Free(char** ptr) { - if (*ptr) { - free(*ptr); - *ptr = NULL; - } -} - -static void CheckGet( - leveldb_t* db, - const leveldb_readoptions_t* options, - const char* key, - const char* expected) { - char* err = NULL; - size_t val_len; - char* val; - val = leveldb_get(db, options, key, strlen(key), &val_len, &err); - CheckNoError(err); - CheckEqual(expected, val, val_len); - Free(&val); -} - -static void CheckIter(leveldb_iterator_t* iter, - const char* key, const char* val) { - size_t len; - const char* str; - str = leveldb_iter_key(iter, &len); - CheckEqual(key, str, len); - str = leveldb_iter_value(iter, &len); - CheckEqual(val, str, len); -} - -// Callback from leveldb_writebatch_iterate() -static void CheckPut(void* ptr, - const char* k, size_t klen, - const char* v, size_t vlen) { - int* state = (int*) ptr; - CheckCondition(*state < 2); - switch (*state) { - case 0: - CheckEqual("bar", k, klen); - CheckEqual("b", v, vlen); - break; - case 1: - CheckEqual("box", k, klen); - CheckEqual("c", v, vlen); - break; - } - (*state)++; -} - -// Callback from leveldb_writebatch_iterate() -static void CheckDel(void* ptr, const char* k, size_t klen) { - int* state = (int*) ptr; - CheckCondition(*state == 2); - CheckEqual("bar", k, klen); - (*state)++; -} - -static void CmpDestroy(void* arg) { } - -static int CmpCompare(void* arg, const char* a, size_t alen, - const char* b, size_t blen) { - int n = (alen < blen) ? alen : blen; - int r = memcmp(a, b, n); - if (r == 0) { - if (alen < blen) r = -1; - else if (alen > blen) r = +1; - } - return r; -} - -static const char* CmpName(void* arg) { - return "foo"; -} - -// Custom filter policy -static unsigned char fake_filter_result = 1; -static void FilterDestroy(void* arg) { } -static const char* FilterName(void* arg) { - return "TestFilter"; -} -static char* FilterCreate( - void* arg, - const char* const* key_array, const size_t* key_length_array, - int num_keys, - size_t* filter_length) { - *filter_length = 4; - char* result = malloc(4); - memcpy(result, "fake", 4); - return result; -} -unsigned char FilterKeyMatch( - void* arg, - const char* key, size_t length, - const char* filter, size_t filter_length) { - CheckCondition(filter_length == 4); - CheckCondition(memcmp(filter, "fake", 4) == 0); - return fake_filter_result; -} - -int main(int argc, char** argv) { - leveldb_t* db; - leveldb_comparator_t* cmp; - leveldb_cache_t* cache; - leveldb_env_t* env; - leveldb_options_t* options; - leveldb_readoptions_t* roptions; - leveldb_writeoptions_t* woptions; - char* err = NULL; - int run = -1; - - CheckCondition(leveldb_major_version() >= 1); - CheckCondition(leveldb_minor_version() >= 1); - - snprintf(dbname, sizeof(dbname), - "%s/leveldb_c_test-%d", - GetTempDir(), - ((int) geteuid())); - - StartPhase("create_objects"); - cmp = leveldb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName); - env = leveldb_create_default_env(); - cache = leveldb_cache_create_lru(100000); - - options = leveldb_options_create(); - leveldb_options_set_comparator(options, cmp); - leveldb_options_set_error_if_exists(options, 1); - leveldb_options_set_cache(options, cache); - leveldb_options_set_env(options, env); - leveldb_options_set_info_log(options, NULL); - leveldb_options_set_write_buffer_size(options, 100000); - leveldb_options_set_paranoid_checks(options, 1); - leveldb_options_set_max_open_files(options, 10); - leveldb_options_set_block_size(options, 1024); - leveldb_options_set_block_restart_interval(options, 8); - leveldb_options_set_compression(options, leveldb_no_compression); - - roptions = leveldb_readoptions_create(); - leveldb_readoptions_set_verify_checksums(roptions, 1); - leveldb_readoptions_set_fill_cache(roptions, 0); - - woptions = leveldb_writeoptions_create(); - leveldb_writeoptions_set_sync(woptions, 1); - - StartPhase("destroy"); - leveldb_destroy_db(options, dbname, &err); - Free(&err); - - StartPhase("open_error"); - db = leveldb_open(options, dbname, &err); - CheckCondition(err != NULL); - Free(&err); - - StartPhase("leveldb_free"); - db = leveldb_open(options, dbname, &err); - CheckCondition(err != NULL); - leveldb_free(err); - err = NULL; - - StartPhase("open"); - leveldb_options_set_create_if_missing(options, 1); - db = leveldb_open(options, dbname, &err); - CheckNoError(err); - CheckGet(db, roptions, "foo", NULL); - - StartPhase("put"); - leveldb_put(db, woptions, "foo", 3, "hello", 5, &err); - CheckNoError(err); - CheckGet(db, roptions, "foo", "hello"); - - StartPhase("compactall"); - leveldb_compact_range(db, NULL, 0, NULL, 0); - CheckGet(db, roptions, "foo", "hello"); - - StartPhase("compactrange"); - leveldb_compact_range(db, "a", 1, "z", 1); - CheckGet(db, roptions, "foo", "hello"); - - StartPhase("writebatch"); - { - leveldb_writebatch_t* wb = leveldb_writebatch_create(); - leveldb_writebatch_put(wb, "foo", 3, "a", 1); - leveldb_writebatch_clear(wb); - leveldb_writebatch_put(wb, "bar", 3, "b", 1); - leveldb_writebatch_put(wb, "box", 3, "c", 1); - leveldb_writebatch_delete(wb, "bar", 3); - leveldb_write(db, woptions, wb, &err); - CheckNoError(err); - CheckGet(db, roptions, "foo", "hello"); - CheckGet(db, roptions, "bar", NULL); - CheckGet(db, roptions, "box", "c"); - int pos = 0; - leveldb_writebatch_iterate(wb, &pos, CheckPut, CheckDel); - CheckCondition(pos == 3); - leveldb_writebatch_destroy(wb); - } - - StartPhase("iter"); - { - leveldb_iterator_t* iter = leveldb_create_iterator(db, roptions); - CheckCondition(!leveldb_iter_valid(iter)); - leveldb_iter_seek_to_first(iter); - CheckCondition(leveldb_iter_valid(iter)); - CheckIter(iter, "box", "c"); - leveldb_iter_next(iter); - CheckIter(iter, "foo", "hello"); - leveldb_iter_prev(iter); - CheckIter(iter, "box", "c"); - leveldb_iter_prev(iter); - CheckCondition(!leveldb_iter_valid(iter)); - leveldb_iter_seek_to_last(iter); - CheckIter(iter, "foo", "hello"); - leveldb_iter_seek(iter, "b", 1); - CheckIter(iter, "box", "c"); - leveldb_iter_get_error(iter, &err); - CheckNoError(err); - leveldb_iter_destroy(iter); - } - - StartPhase("approximate_sizes"); - { - int i; - int n = 20000; - char keybuf[100]; - char valbuf[100]; - uint64_t sizes[2]; - const char* start[2] = { "a", "k00000000000000010000" }; - size_t start_len[2] = { 1, 21 }; - const char* limit[2] = { "k00000000000000010000", "z" }; - size_t limit_len[2] = { 21, 1 }; - leveldb_writeoptions_set_sync(woptions, 0); - for (i = 0; i < n; i++) { - snprintf(keybuf, sizeof(keybuf), "k%020d", i); - snprintf(valbuf, sizeof(valbuf), "v%020d", i); - leveldb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf), - &err); - CheckNoError(err); - } - leveldb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes); - CheckCondition(sizes[0] > 0); - CheckCondition(sizes[1] > 0); - } - - StartPhase("property"); - { - char* prop = leveldb_property_value(db, "nosuchprop"); - CheckCondition(prop == NULL); - prop = leveldb_property_value(db, "leveldb.stats"); - CheckCondition(prop != NULL); - Free(&prop); - } - - StartPhase("snapshot"); - { - const leveldb_snapshot_t* snap; - snap = leveldb_create_snapshot(db); - leveldb_delete(db, woptions, "foo", 3, &err); - CheckNoError(err); - leveldb_readoptions_set_snapshot(roptions, snap); - CheckGet(db, roptions, "foo", "hello"); - leveldb_readoptions_set_snapshot(roptions, NULL); - CheckGet(db, roptions, "foo", NULL); - leveldb_release_snapshot(db, snap); - } - - StartPhase("repair"); - { - leveldb_close(db); - leveldb_options_set_create_if_missing(options, 0); - leveldb_options_set_error_if_exists(options, 0); - leveldb_repair_db(options, dbname, &err); - CheckNoError(err); - db = leveldb_open(options, dbname, &err); - CheckNoError(err); - CheckGet(db, roptions, "foo", NULL); - CheckGet(db, roptions, "bar", NULL); - CheckGet(db, roptions, "box", "c"); - leveldb_options_set_create_if_missing(options, 1); - leveldb_options_set_error_if_exists(options, 1); - } - - StartPhase("filter"); - for (run = 0; run < 2; run++) { - // First run uses custom filter, second run uses bloom filter - CheckNoError(err); - leveldb_filterpolicy_t* policy; - if (run == 0) { - policy = leveldb_filterpolicy_create( - NULL, FilterDestroy, FilterCreate, FilterKeyMatch, FilterName); - } else { - policy = leveldb_filterpolicy_create_bloom(10); - } - - // Create new database - leveldb_close(db); - leveldb_destroy_db(options, dbname, &err); - leveldb_options_set_filter_policy(options, policy); - db = leveldb_open(options, dbname, &err); - CheckNoError(err); - leveldb_put(db, woptions, "foo", 3, "foovalue", 8, &err); - CheckNoError(err); - leveldb_put(db, woptions, "bar", 3, "barvalue", 8, &err); - CheckNoError(err); - leveldb_compact_range(db, NULL, 0, NULL, 0); - - fake_filter_result = 1; - CheckGet(db, roptions, "foo", "foovalue"); - CheckGet(db, roptions, "bar", "barvalue"); - if (phase == 0) { - // Must not find value when custom filter returns false - fake_filter_result = 0; - CheckGet(db, roptions, "foo", NULL); - CheckGet(db, roptions, "bar", NULL); - fake_filter_result = 1; - - CheckGet(db, roptions, "foo", "foovalue"); - CheckGet(db, roptions, "bar", "barvalue"); - } - leveldb_options_set_filter_policy(options, NULL); - leveldb_filterpolicy_destroy(policy); - } - - StartPhase("cleanup"); - leveldb_close(db); - leveldb_options_destroy(options); - leveldb_readoptions_destroy(roptions); - leveldb_writeoptions_destroy(woptions); - leveldb_cache_destroy(cache); - leveldb_comparator_destroy(cmp); - leveldb_env_destroy(env); - - fprintf(stderr, "PASS\n"); - return 0; -} diff --git a/src/leveldb/db/corruption_test.cc b/src/leveldb/db/corruption_test.cc deleted file mode 100644 index 31b2d5f416..0000000000 --- a/src/leveldb/db/corruption_test.cc +++ /dev/null @@ -1,359 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/db.h" - -#include <errno.h> -#include <fcntl.h> -#include <sys/stat.h> -#include <sys/types.h> -#include "leveldb/cache.h" -#include "leveldb/env.h" -#include "leveldb/table.h" -#include "leveldb/write_batch.h" -#include "db/db_impl.h" -#include "db/filename.h" -#include "db/log_format.h" -#include "db/version_set.h" -#include "util/logging.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace leveldb { - -static const int kValueSize = 1000; - -class CorruptionTest { - public: - test::ErrorEnv env_; - std::string dbname_; - Cache* tiny_cache_; - Options options_; - DB* db_; - - CorruptionTest() { - tiny_cache_ = NewLRUCache(100); - options_.env = &env_; - dbname_ = test::TmpDir() + "/db_test"; - DestroyDB(dbname_, options_); - - db_ = NULL; - options_.create_if_missing = true; - Reopen(); - options_.create_if_missing = false; - } - - ~CorruptionTest() { - delete db_; - DestroyDB(dbname_, Options()); - delete tiny_cache_; - } - - Status TryReopen(Options* options = NULL) { - delete db_; - db_ = NULL; - Options opt = (options ? *options : options_); - opt.env = &env_; - opt.block_cache = tiny_cache_; - return DB::Open(opt, dbname_, &db_); - } - - void Reopen(Options* options = NULL) { - ASSERT_OK(TryReopen(options)); - } - - void RepairDB() { - delete db_; - db_ = NULL; - ASSERT_OK(::leveldb::RepairDB(dbname_, options_)); - } - - void Build(int n) { - std::string key_space, value_space; - WriteBatch batch; - for (int i = 0; i < n; i++) { - //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n); - Slice key = Key(i, &key_space); - batch.Clear(); - batch.Put(key, Value(i, &value_space)); - ASSERT_OK(db_->Write(WriteOptions(), &batch)); - } - } - - void Check(int min_expected, int max_expected) { - int next_expected = 0; - int missed = 0; - int bad_keys = 0; - int bad_values = 0; - int correct = 0; - std::string value_space; - Iterator* iter = db_->NewIterator(ReadOptions()); - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - uint64_t key; - Slice in(iter->key()); - if (!ConsumeDecimalNumber(&in, &key) || - !in.empty() || - key < next_expected) { - bad_keys++; - continue; - } - missed += (key - next_expected); - next_expected = key + 1; - if (iter->value() != Value(key, &value_space)) { - bad_values++; - } else { - correct++; - } - } - delete iter; - - fprintf(stderr, - "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n", - min_expected, max_expected, correct, bad_keys, bad_values, missed); - ASSERT_LE(min_expected, correct); - ASSERT_GE(max_expected, correct); - } - - void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { - // Pick file to corrupt - std::vector<std::string> filenames; - ASSERT_OK(env_.GetChildren(dbname_, &filenames)); - uint64_t number; - FileType type; - std::string fname; - int picked_number = -1; - for (int i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type) && - type == filetype && - int(number) > picked_number) { // Pick latest file - fname = dbname_ + "/" + filenames[i]; - picked_number = number; - } - } - ASSERT_TRUE(!fname.empty()) << filetype; - - struct stat sbuf; - if (stat(fname.c_str(), &sbuf) != 0) { - const char* msg = strerror(errno); - ASSERT_TRUE(false) << fname << ": " << msg; - } - - if (offset < 0) { - // Relative to end of file; make it absolute - if (-offset > sbuf.st_size) { - offset = 0; - } else { - offset = sbuf.st_size + offset; - } - } - if (offset > sbuf.st_size) { - offset = sbuf.st_size; - } - if (offset + bytes_to_corrupt > sbuf.st_size) { - bytes_to_corrupt = sbuf.st_size - offset; - } - - // Do it - std::string contents; - Status s = ReadFileToString(Env::Default(), fname, &contents); - ASSERT_TRUE(s.ok()) << s.ToString(); - for (int i = 0; i < bytes_to_corrupt; i++) { - contents[i + offset] ^= 0x80; - } - s = WriteStringToFile(Env::Default(), contents, fname); - ASSERT_TRUE(s.ok()) << s.ToString(); - } - - int Property(const std::string& name) { - std::string property; - int result; - if (db_->GetProperty(name, &property) && - sscanf(property.c_str(), "%d", &result) == 1) { - return result; - } else { - return -1; - } - } - - // Return the ith key - Slice Key(int i, std::string* storage) { - char buf[100]; - snprintf(buf, sizeof(buf), "%016d", i); - storage->assign(buf, strlen(buf)); - return Slice(*storage); - } - - // Return the value to associate with the specified key - Slice Value(int k, std::string* storage) { - Random r(k); - return test::RandomString(&r, kValueSize, storage); - } -}; - -TEST(CorruptionTest, Recovery) { - Build(100); - Check(100, 100); - Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record - Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block - Reopen(); - - // The 64 records in the first two log blocks are completely lost. - Check(36, 36); -} - -TEST(CorruptionTest, RecoverWriteError) { - env_.writable_file_error_ = true; - Status s = TryReopen(); - ASSERT_TRUE(!s.ok()); -} - -TEST(CorruptionTest, NewFileErrorDuringWrite) { - // Do enough writing to force minor compaction - env_.writable_file_error_ = true; - const int num = 3 + (Options().write_buffer_size / kValueSize); - std::string value_storage; - Status s; - for (int i = 0; s.ok() && i < num; i++) { - WriteBatch batch; - batch.Put("a", Value(100, &value_storage)); - s = db_->Write(WriteOptions(), &batch); - } - ASSERT_TRUE(!s.ok()); - ASSERT_GE(env_.num_writable_file_errors_, 1); - env_.writable_file_error_ = false; - Reopen(); -} - -TEST(CorruptionTest, TableFile) { - Build(100); - DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); - dbi->TEST_CompactMemTable(); - dbi->TEST_CompactRange(0, NULL, NULL); - dbi->TEST_CompactRange(1, NULL, NULL); - - Corrupt(kTableFile, 100, 1); - Check(99, 99); -} - -TEST(CorruptionTest, TableFileIndexData) { - Build(10000); // Enough to build multiple Tables - DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); - dbi->TEST_CompactMemTable(); - - Corrupt(kTableFile, -2000, 500); - Reopen(); - Check(5000, 9999); -} - -TEST(CorruptionTest, MissingDescriptor) { - Build(1000); - RepairDB(); - Reopen(); - Check(1000, 1000); -} - -TEST(CorruptionTest, SequenceNumberRecovery) { - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5")); - RepairDB(); - Reopen(); - std::string v; - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ("v5", v); - // Write something. If sequence number was not recovered properly, - // it will be hidden by an earlier write. - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6")); - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ("v6", v); - Reopen(); - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ("v6", v); -} - -TEST(CorruptionTest, CorruptedDescriptor) { - ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello")); - DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); - dbi->TEST_CompactMemTable(); - dbi->TEST_CompactRange(0, NULL, NULL); - - Corrupt(kDescriptorFile, 0, 1000); - Status s = TryReopen(); - ASSERT_TRUE(!s.ok()); - - RepairDB(); - Reopen(); - std::string v; - ASSERT_OK(db_->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ("hello", v); -} - -TEST(CorruptionTest, CompactionInputError) { - Build(10); - DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); - dbi->TEST_CompactMemTable(); - const int last = config::kMaxMemCompactLevel; - ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last))); - - Corrupt(kTableFile, 100, 1); - Check(9, 9); - - // Force compactions by writing lots of values - Build(10000); - Check(10000, 10000); -} - -TEST(CorruptionTest, CompactionInputErrorParanoid) { - Options options; - options.paranoid_checks = true; - options.write_buffer_size = 1048576; - Reopen(&options); - DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); - - // Fill levels >= 1 so memtable compaction outputs to level 1 - for (int level = 1; level < config::kNumLevels; level++) { - dbi->Put(WriteOptions(), "", "begin"); - dbi->Put(WriteOptions(), "~", "end"); - dbi->TEST_CompactMemTable(); - } - - Build(10); - dbi->TEST_CompactMemTable(); - ASSERT_EQ(1, Property("leveldb.num-files-at-level0")); - - Corrupt(kTableFile, 100, 1); - Check(9, 9); - - // Write must eventually fail because of corrupted table - Status s; - std::string tmp1, tmp2; - for (int i = 0; i < 10000 && s.ok(); i++) { - s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2)); - } - ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db"; -} - -TEST(CorruptionTest, UnrelatedKeys) { - Build(10); - DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); - dbi->TEST_CompactMemTable(); - Corrupt(kTableFile, 100, 1); - - std::string tmp1, tmp2; - ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2))); - std::string v; - ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); - ASSERT_EQ(Value(1000, &tmp2).ToString(), v); - dbi->TEST_CompactMemTable(); - ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v)); - ASSERT_EQ(Value(1000, &tmp2).ToString(), v); -} - -} // namespace leveldb - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/src/leveldb/db/db_bench.cc b/src/leveldb/db/db_bench.cc deleted file mode 100644 index 7abdf87587..0000000000 --- a/src/leveldb/db/db_bench.cc +++ /dev/null @@ -1,979 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include <sys/types.h> -#include <stdio.h> -#include <stdlib.h> -#include "db/db_impl.h" -#include "db/version_set.h" -#include "leveldb/cache.h" -#include "leveldb/db.h" -#include "leveldb/env.h" -#include "leveldb/write_batch.h" -#include "port/port.h" -#include "util/crc32c.h" -#include "util/histogram.h" -#include "util/mutexlock.h" -#include "util/random.h" -#include "util/testutil.h" - -// Comma-separated list of operations to run in the specified order -// Actual benchmarks: -// fillseq -- write N values in sequential key order in async mode -// fillrandom -- write N values in random key order in async mode -// overwrite -- overwrite N values in random key order in async mode -// fillsync -- write N/100 values in random key order in sync mode -// fill100K -- write N/1000 100K values in random order in async mode -// deleteseq -- delete N keys in sequential order -// deleterandom -- delete N keys in random order -// readseq -- read N times sequentially -// readreverse -- read N times in reverse order -// readrandom -- read N times in random order -// readmissing -- read N missing keys in random order -// readhot -- read N times in random order from 1% section of DB -// seekrandom -- N random seeks -// crc32c -- repeated crc32c of 4K of data -// acquireload -- load N*1000 times -// Meta operations: -// compact -- Compact the entire DB -// stats -- Print DB stats -// sstables -- Print sstable info -// heapprofile -- Dump a heap profile (if supported by this port) -static const char* FLAGS_benchmarks = - "fillseq," - "fillsync," - "fillrandom," - "overwrite," - "readrandom," - "readrandom," // Extra run to allow previous compactions to quiesce - "readseq," - "readreverse," - "compact," - "readrandom," - "readseq," - "readreverse," - "fill100K," - "crc32c," - "snappycomp," - "snappyuncomp," - "acquireload," - ; - -// Number of key/values to place in database -static int FLAGS_num = 1000000; - -// Number of read operations to do. If negative, do FLAGS_num reads. -static int FLAGS_reads = -1; - -// Number of concurrent threads to run. -static int FLAGS_threads = 1; - -// Size of each value -static int FLAGS_value_size = 100; - -// Arrange to generate values that shrink to this fraction of -// their original size after compression -static double FLAGS_compression_ratio = 0.5; - -// Print histogram of operation timings -static bool FLAGS_histogram = false; - -// Number of bytes to buffer in memtable before compacting -// (initialized to default value by "main") -static int FLAGS_write_buffer_size = 0; - -// Number of bytes to use as a cache of uncompressed data. -// Negative means use default settings. -static int FLAGS_cache_size = -1; - -// Maximum number of files to keep open at the same time (use default if == 0) -static int FLAGS_open_files = 0; - -// Bloom filter bits per key. -// Negative means use default settings. -static int FLAGS_bloom_bits = -1; - -// If true, do not destroy the existing database. If you set this -// flag and also specify a benchmark that wants a fresh database, that -// benchmark will fail. -static bool FLAGS_use_existing_db = false; - -// Use the db with the following name. -static const char* FLAGS_db = NULL; - -namespace leveldb { - -namespace { - -// Helper for quickly generating random data. -class RandomGenerator { - private: - std::string data_; - int pos_; - - public: - RandomGenerator() { - // We use a limited amount of data over and over again and ensure - // that it is larger than the compression window (32KB), and also - // large enough to serve all typical value sizes we want to write. - Random rnd(301); - std::string piece; - while (data_.size() < 1048576) { - // Add a short fragment that is as compressible as specified - // by FLAGS_compression_ratio. - test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece); - data_.append(piece); - } - pos_ = 0; - } - - Slice Generate(int len) { - if (pos_ + len > data_.size()) { - pos_ = 0; - assert(len < data_.size()); - } - pos_ += len; - return Slice(data_.data() + pos_ - len, len); - } -}; - -static Slice TrimSpace(Slice s) { - int start = 0; - while (start < s.size() && isspace(s[start])) { - start++; - } - int limit = s.size(); - while (limit > start && isspace(s[limit-1])) { - limit--; - } - return Slice(s.data() + start, limit - start); -} - -static void AppendWithSpace(std::string* str, Slice msg) { - if (msg.empty()) return; - if (!str->empty()) { - str->push_back(' '); - } - str->append(msg.data(), msg.size()); -} - -class Stats { - private: - double start_; - double finish_; - double seconds_; - int done_; - int next_report_; - int64_t bytes_; - double last_op_finish_; - Histogram hist_; - std::string message_; - - public: - Stats() { Start(); } - - void Start() { - next_report_ = 100; - last_op_finish_ = start_; - hist_.Clear(); - done_ = 0; - bytes_ = 0; - seconds_ = 0; - start_ = Env::Default()->NowMicros(); - finish_ = start_; - message_.clear(); - } - - void Merge(const Stats& other) { - hist_.Merge(other.hist_); - done_ += other.done_; - bytes_ += other.bytes_; - seconds_ += other.seconds_; - if (other.start_ < start_) start_ = other.start_; - if (other.finish_ > finish_) finish_ = other.finish_; - - // Just keep the messages from one thread - if (message_.empty()) message_ = other.message_; - } - - void Stop() { - finish_ = Env::Default()->NowMicros(); - seconds_ = (finish_ - start_) * 1e-6; - } - - void AddMessage(Slice msg) { - AppendWithSpace(&message_, msg); - } - - void FinishedSingleOp() { - if (FLAGS_histogram) { - double now = Env::Default()->NowMicros(); - double micros = now - last_op_finish_; - hist_.Add(micros); - if (micros > 20000) { - fprintf(stderr, "long op: %.1f micros%30s\r", micros, ""); - fflush(stderr); - } - last_op_finish_ = now; - } - - done_++; - if (done_ >= next_report_) { - if (next_report_ < 1000) next_report_ += 100; - else if (next_report_ < 5000) next_report_ += 500; - else if (next_report_ < 10000) next_report_ += 1000; - else if (next_report_ < 50000) next_report_ += 5000; - else if (next_report_ < 100000) next_report_ += 10000; - else if (next_report_ < 500000) next_report_ += 50000; - else next_report_ += 100000; - fprintf(stderr, "... finished %d ops%30s\r", done_, ""); - fflush(stderr); - } - } - - void AddBytes(int64_t n) { - bytes_ += n; - } - - void Report(const Slice& name) { - // Pretend at least one op was done in case we are running a benchmark - // that does not call FinishedSingleOp(). - if (done_ < 1) done_ = 1; - - std::string extra; - if (bytes_ > 0) { - // Rate is computed on actual elapsed time, not the sum of per-thread - // elapsed times. - double elapsed = (finish_ - start_) * 1e-6; - char rate[100]; - snprintf(rate, sizeof(rate), "%6.1f MB/s", - (bytes_ / 1048576.0) / elapsed); - extra = rate; - } - AppendWithSpace(&extra, message_); - - fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n", - name.ToString().c_str(), - seconds_ * 1e6 / done_, - (extra.empty() ? "" : " "), - extra.c_str()); - if (FLAGS_histogram) { - fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str()); - } - fflush(stdout); - } -}; - -// State shared by all concurrent executions of the same benchmark. -struct SharedState { - port::Mutex mu; - port::CondVar cv; - int total; - - // Each thread goes through the following states: - // (1) initializing - // (2) waiting for others to be initialized - // (3) running - // (4) done - - int num_initialized; - int num_done; - bool start; - - SharedState() : cv(&mu) { } -}; - -// Per-thread state for concurrent executions of the same benchmark. -struct ThreadState { - int tid; // 0..n-1 when running in n threads - Random rand; // Has different seeds for different threads - Stats stats; - SharedState* shared; - - ThreadState(int index) - : tid(index), - rand(1000 + index) { - } -}; - -} // namespace - -class Benchmark { - private: - Cache* cache_; - const FilterPolicy* filter_policy_; - DB* db_; - int num_; - int value_size_; - int entries_per_batch_; - WriteOptions write_options_; - int reads_; - int heap_counter_; - - void PrintHeader() { - const int kKeySize = 16; - PrintEnvironment(); - fprintf(stdout, "Keys: %d bytes each\n", kKeySize); - fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n", - FLAGS_value_size, - static_cast<int>(FLAGS_value_size * FLAGS_compression_ratio + 0.5)); - fprintf(stdout, "Entries: %d\n", num_); - fprintf(stdout, "RawSize: %.1f MB (estimated)\n", - ((static_cast<int64_t>(kKeySize + FLAGS_value_size) * num_) - / 1048576.0)); - fprintf(stdout, "FileSize: %.1f MB (estimated)\n", - (((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_) - / 1048576.0)); - PrintWarnings(); - fprintf(stdout, "------------------------------------------------\n"); - } - - void PrintWarnings() { -#if defined(__GNUC__) && !defined(__OPTIMIZE__) - fprintf(stdout, - "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n" - ); -#endif -#ifndef NDEBUG - fprintf(stdout, - "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); -#endif - - // See if snappy is working by attempting to compress a compressible string - const char text[] = "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"; - std::string compressed; - if (!port::Snappy_Compress(text, sizeof(text), &compressed)) { - fprintf(stdout, "WARNING: Snappy compression is not enabled\n"); - } else if (compressed.size() >= sizeof(text)) { - fprintf(stdout, "WARNING: Snappy compression is not effective\n"); - } - } - - void PrintEnvironment() { - fprintf(stderr, "LevelDB: version %d.%d\n", - kMajorVersion, kMinorVersion); - -#if defined(__linux) - time_t now = time(NULL); - fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline - - FILE* cpuinfo = fopen("/proc/cpuinfo", "r"); - if (cpuinfo != NULL) { - char line[1000]; - int num_cpus = 0; - std::string cpu_type; - std::string cache_size; - while (fgets(line, sizeof(line), cpuinfo) != NULL) { - const char* sep = strchr(line, ':'); - if (sep == NULL) { - continue; - } - Slice key = TrimSpace(Slice(line, sep - 1 - line)); - Slice val = TrimSpace(Slice(sep + 1)); - if (key == "model name") { - ++num_cpus; - cpu_type = val.ToString(); - } else if (key == "cache size") { - cache_size = val.ToString(); - } - } - fclose(cpuinfo); - fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str()); - fprintf(stderr, "CPUCache: %s\n", cache_size.c_str()); - } -#endif - } - - public: - Benchmark() - : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL), - filter_policy_(FLAGS_bloom_bits >= 0 - ? NewBloomFilterPolicy(FLAGS_bloom_bits) - : NULL), - db_(NULL), - num_(FLAGS_num), - value_size_(FLAGS_value_size), - entries_per_batch_(1), - reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads), - heap_counter_(0) { - std::vector<std::string> files; - Env::Default()->GetChildren(FLAGS_db, &files); - for (int i = 0; i < files.size(); i++) { - if (Slice(files[i]).starts_with("heap-")) { - Env::Default()->DeleteFile(std::string(FLAGS_db) + "/" + files[i]); - } - } - if (!FLAGS_use_existing_db) { - DestroyDB(FLAGS_db, Options()); - } - } - - ~Benchmark() { - delete db_; - delete cache_; - delete filter_policy_; - } - - void Run() { - PrintHeader(); - Open(); - - const char* benchmarks = FLAGS_benchmarks; - while (benchmarks != NULL) { - const char* sep = strchr(benchmarks, ','); - Slice name; - if (sep == NULL) { - name = benchmarks; - benchmarks = NULL; - } else { - name = Slice(benchmarks, sep - benchmarks); - benchmarks = sep + 1; - } - - // Reset parameters that may be overriddden bwlow - num_ = FLAGS_num; - reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads); - value_size_ = FLAGS_value_size; - entries_per_batch_ = 1; - write_options_ = WriteOptions(); - - void (Benchmark::*method)(ThreadState*) = NULL; - bool fresh_db = false; - int num_threads = FLAGS_threads; - - if (name == Slice("fillseq")) { - fresh_db = true; - method = &Benchmark::WriteSeq; - } else if (name == Slice("fillbatch")) { - fresh_db = true; - entries_per_batch_ = 1000; - method = &Benchmark::WriteSeq; - } else if (name == Slice("fillrandom")) { - fresh_db = true; - method = &Benchmark::WriteRandom; - } else if (name == Slice("overwrite")) { - fresh_db = false; - method = &Benchmark::WriteRandom; - } else if (name == Slice("fillsync")) { - fresh_db = true; - num_ /= 1000; - write_options_.sync = true; - method = &Benchmark::WriteRandom; - } else if (name == Slice("fill100K")) { - fresh_db = true; - num_ /= 1000; - value_size_ = 100 * 1000; - method = &Benchmark::WriteRandom; - } else if (name == Slice("readseq")) { - method = &Benchmark::ReadSequential; - } else if (name == Slice("readreverse")) { - method = &Benchmark::ReadReverse; - } else if (name == Slice("readrandom")) { - method = &Benchmark::ReadRandom; - } else if (name == Slice("readmissing")) { - method = &Benchmark::ReadMissing; - } else if (name == Slice("seekrandom")) { - method = &Benchmark::SeekRandom; - } else if (name == Slice("readhot")) { - method = &Benchmark::ReadHot; - } else if (name == Slice("readrandomsmall")) { - reads_ /= 1000; - method = &Benchmark::ReadRandom; - } else if (name == Slice("deleteseq")) { - method = &Benchmark::DeleteSeq; - } else if (name == Slice("deleterandom")) { - method = &Benchmark::DeleteRandom; - } else if (name == Slice("readwhilewriting")) { - num_threads++; // Add extra thread for writing - method = &Benchmark::ReadWhileWriting; - } else if (name == Slice("compact")) { - method = &Benchmark::Compact; - } else if (name == Slice("crc32c")) { - method = &Benchmark::Crc32c; - } else if (name == Slice("acquireload")) { - method = &Benchmark::AcquireLoad; - } else if (name == Slice("snappycomp")) { - method = &Benchmark::SnappyCompress; - } else if (name == Slice("snappyuncomp")) { - method = &Benchmark::SnappyUncompress; - } else if (name == Slice("heapprofile")) { - HeapProfile(); - } else if (name == Slice("stats")) { - PrintStats("leveldb.stats"); - } else if (name == Slice("sstables")) { - PrintStats("leveldb.sstables"); - } else { - if (name != Slice()) { // No error message for empty name - fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); - } - } - - if (fresh_db) { - if (FLAGS_use_existing_db) { - fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n", - name.ToString().c_str()); - method = NULL; - } else { - delete db_; - db_ = NULL; - DestroyDB(FLAGS_db, Options()); - Open(); - } - } - - if (method != NULL) { - RunBenchmark(num_threads, name, method); - } - } - } - - private: - struct ThreadArg { - Benchmark* bm; - SharedState* shared; - ThreadState* thread; - void (Benchmark::*method)(ThreadState*); - }; - - static void ThreadBody(void* v) { - ThreadArg* arg = reinterpret_cast<ThreadArg*>(v); - SharedState* shared = arg->shared; - ThreadState* thread = arg->thread; - { - MutexLock l(&shared->mu); - shared->num_initialized++; - if (shared->num_initialized >= shared->total) { - shared->cv.SignalAll(); - } - while (!shared->start) { - shared->cv.Wait(); - } - } - - thread->stats.Start(); - (arg->bm->*(arg->method))(thread); - thread->stats.Stop(); - - { - MutexLock l(&shared->mu); - shared->num_done++; - if (shared->num_done >= shared->total) { - shared->cv.SignalAll(); - } - } - } - - void RunBenchmark(int n, Slice name, - void (Benchmark::*method)(ThreadState*)) { - SharedState shared; - shared.total = n; - shared.num_initialized = 0; - shared.num_done = 0; - shared.start = false; - - ThreadArg* arg = new ThreadArg[n]; - for (int i = 0; i < n; i++) { - arg[i].bm = this; - arg[i].method = method; - arg[i].shared = &shared; - arg[i].thread = new ThreadState(i); - arg[i].thread->shared = &shared; - Env::Default()->StartThread(ThreadBody, &arg[i]); - } - - shared.mu.Lock(); - while (shared.num_initialized < n) { - shared.cv.Wait(); - } - - shared.start = true; - shared.cv.SignalAll(); - while (shared.num_done < n) { - shared.cv.Wait(); - } - shared.mu.Unlock(); - - for (int i = 1; i < n; i++) { - arg[0].thread->stats.Merge(arg[i].thread->stats); - } - arg[0].thread->stats.Report(name); - - for (int i = 0; i < n; i++) { - delete arg[i].thread; - } - delete[] arg; - } - - void Crc32c(ThreadState* thread) { - // Checksum about 500MB of data total - const int size = 4096; - const char* label = "(4K per op)"; - std::string data(size, 'x'); - int64_t bytes = 0; - uint32_t crc = 0; - while (bytes < 500 * 1048576) { - crc = crc32c::Value(data.data(), size); - thread->stats.FinishedSingleOp(); - bytes += size; - } - // Print so result is not dead - fprintf(stderr, "... crc=0x%x\r", static_cast<unsigned int>(crc)); - - thread->stats.AddBytes(bytes); - thread->stats.AddMessage(label); - } - - void AcquireLoad(ThreadState* thread) { - int dummy; - port::AtomicPointer ap(&dummy); - int count = 0; - void *ptr = NULL; - thread->stats.AddMessage("(each op is 1000 loads)"); - while (count < 100000) { - for (int i = 0; i < 1000; i++) { - ptr = ap.Acquire_Load(); - } - count++; - thread->stats.FinishedSingleOp(); - } - if (ptr == NULL) exit(1); // Disable unused variable warning. - } - - void SnappyCompress(ThreadState* thread) { - RandomGenerator gen; - Slice input = gen.Generate(Options().block_size); - int64_t bytes = 0; - int64_t produced = 0; - bool ok = true; - std::string compressed; - while (ok && bytes < 1024 * 1048576) { // Compress 1G - ok = port::Snappy_Compress(input.data(), input.size(), &compressed); - produced += compressed.size(); - bytes += input.size(); - thread->stats.FinishedSingleOp(); - } - - if (!ok) { - thread->stats.AddMessage("(snappy failure)"); - } else { - char buf[100]; - snprintf(buf, sizeof(buf), "(output: %.1f%%)", - (produced * 100.0) / bytes); - thread->stats.AddMessage(buf); - thread->stats.AddBytes(bytes); - } - } - - void SnappyUncompress(ThreadState* thread) { - RandomGenerator gen; - Slice input = gen.Generate(Options().block_size); - std::string compressed; - bool ok = port::Snappy_Compress(input.data(), input.size(), &compressed); - int64_t bytes = 0; - char* uncompressed = new char[input.size()]; - while (ok && bytes < 1024 * 1048576) { // Compress 1G - ok = port::Snappy_Uncompress(compressed.data(), compressed.size(), - uncompressed); - bytes += input.size(); - thread->stats.FinishedSingleOp(); - } - delete[] uncompressed; - - if (!ok) { - thread->stats.AddMessage("(snappy failure)"); - } else { - thread->stats.AddBytes(bytes); - } - } - - void Open() { - assert(db_ == NULL); - Options options; - options.create_if_missing = !FLAGS_use_existing_db; - options.block_cache = cache_; - options.write_buffer_size = FLAGS_write_buffer_size; - options.max_open_files = FLAGS_open_files; - options.filter_policy = filter_policy_; - Status s = DB::Open(options, FLAGS_db, &db_); - if (!s.ok()) { - fprintf(stderr, "open error: %s\n", s.ToString().c_str()); - exit(1); - } - } - - void WriteSeq(ThreadState* thread) { - DoWrite(thread, true); - } - - void WriteRandom(ThreadState* thread) { - DoWrite(thread, false); - } - - void DoWrite(ThreadState* thread, bool seq) { - if (num_ != FLAGS_num) { - char msg[100]; - snprintf(msg, sizeof(msg), "(%d ops)", num_); - thread->stats.AddMessage(msg); - } - - RandomGenerator gen; - WriteBatch batch; - Status s; - int64_t bytes = 0; - for (int i = 0; i < num_; i += entries_per_batch_) { - batch.Clear(); - for (int j = 0; j < entries_per_batch_; j++) { - const int k = seq ? i+j : (thread->rand.Next() % FLAGS_num); - char key[100]; - snprintf(key, sizeof(key), "%016d", k); - batch.Put(key, gen.Generate(value_size_)); - bytes += value_size_ + strlen(key); - thread->stats.FinishedSingleOp(); - } - s = db_->Write(write_options_, &batch); - if (!s.ok()) { - fprintf(stderr, "put error: %s\n", s.ToString().c_str()); - exit(1); - } - } - thread->stats.AddBytes(bytes); - } - - void ReadSequential(ThreadState* thread) { - Iterator* iter = db_->NewIterator(ReadOptions()); - int i = 0; - int64_t bytes = 0; - for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) { - bytes += iter->key().size() + iter->value().size(); - thread->stats.FinishedSingleOp(); - ++i; - } - delete iter; - thread->stats.AddBytes(bytes); - } - - void ReadReverse(ThreadState* thread) { - Iterator* iter = db_->NewIterator(ReadOptions()); - int i = 0; - int64_t bytes = 0; - for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) { - bytes += iter->key().size() + iter->value().size(); - thread->stats.FinishedSingleOp(); - ++i; - } - delete iter; - thread->stats.AddBytes(bytes); - } - - void ReadRandom(ThreadState* thread) { - ReadOptions options; - std::string value; - int found = 0; - for (int i = 0; i < reads_; i++) { - char key[100]; - const int k = thread->rand.Next() % FLAGS_num; - snprintf(key, sizeof(key), "%016d", k); - if (db_->Get(options, key, &value).ok()) { - found++; - } - thread->stats.FinishedSingleOp(); - } - char msg[100]; - snprintf(msg, sizeof(msg), "(%d of %d found)", found, num_); - thread->stats.AddMessage(msg); - } - - void ReadMissing(ThreadState* thread) { - ReadOptions options; - std::string value; - for (int i = 0; i < reads_; i++) { - char key[100]; - const int k = thread->rand.Next() % FLAGS_num; - snprintf(key, sizeof(key), "%016d.", k); - db_->Get(options, key, &value); - thread->stats.FinishedSingleOp(); - } - } - - void ReadHot(ThreadState* thread) { - ReadOptions options; - std::string value; - const int range = (FLAGS_num + 99) / 100; - for (int i = 0; i < reads_; i++) { - char key[100]; - const int k = thread->rand.Next() % range; - snprintf(key, sizeof(key), "%016d", k); - db_->Get(options, key, &value); - thread->stats.FinishedSingleOp(); - } - } - - void SeekRandom(ThreadState* thread) { - ReadOptions options; - std::string value; - int found = 0; - for (int i = 0; i < reads_; i++) { - Iterator* iter = db_->NewIterator(options); - char key[100]; - const int k = thread->rand.Next() % FLAGS_num; - snprintf(key, sizeof(key), "%016d", k); - iter->Seek(key); - if (iter->Valid() && iter->key() == key) found++; - delete iter; - thread->stats.FinishedSingleOp(); - } - char msg[100]; - snprintf(msg, sizeof(msg), "(%d of %d found)", found, num_); - thread->stats.AddMessage(msg); - } - - void DoDelete(ThreadState* thread, bool seq) { - RandomGenerator gen; - WriteBatch batch; - Status s; - for (int i = 0; i < num_; i += entries_per_batch_) { - batch.Clear(); - for (int j = 0; j < entries_per_batch_; j++) { - const int k = seq ? i+j : (thread->rand.Next() % FLAGS_num); - char key[100]; - snprintf(key, sizeof(key), "%016d", k); - batch.Delete(key); - thread->stats.FinishedSingleOp(); - } - s = db_->Write(write_options_, &batch); - if (!s.ok()) { - fprintf(stderr, "del error: %s\n", s.ToString().c_str()); - exit(1); - } - } - } - - void DeleteSeq(ThreadState* thread) { - DoDelete(thread, true); - } - - void DeleteRandom(ThreadState* thread) { - DoDelete(thread, false); - } - - void ReadWhileWriting(ThreadState* thread) { - if (thread->tid > 0) { - ReadRandom(thread); - } else { - // Special thread that keeps writing until other threads are done. - RandomGenerator gen; - while (true) { - { - MutexLock l(&thread->shared->mu); - if (thread->shared->num_done + 1 >= thread->shared->num_initialized) { - // Other threads have finished - break; - } - } - - const int k = thread->rand.Next() % FLAGS_num; - char key[100]; - snprintf(key, sizeof(key), "%016d", k); - Status s = db_->Put(write_options_, key, gen.Generate(value_size_)); - if (!s.ok()) { - fprintf(stderr, "put error: %s\n", s.ToString().c_str()); - exit(1); - } - } - - // Do not count any of the preceding work/delay in stats. - thread->stats.Start(); - } - } - - void Compact(ThreadState* thread) { - db_->CompactRange(NULL, NULL); - } - - void PrintStats(const char* key) { - std::string stats; - if (!db_->GetProperty(key, &stats)) { - stats = "(failed)"; - } - fprintf(stdout, "\n%s\n", stats.c_str()); - } - - static void WriteToFile(void* arg, const char* buf, int n) { - reinterpret_cast<WritableFile*>(arg)->Append(Slice(buf, n)); - } - - void HeapProfile() { - char fname[100]; - snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db, ++heap_counter_); - WritableFile* file; - Status s = Env::Default()->NewWritableFile(fname, &file); - if (!s.ok()) { - fprintf(stderr, "%s\n", s.ToString().c_str()); - return; - } - bool ok = port::GetHeapProfile(WriteToFile, file); - delete file; - if (!ok) { - fprintf(stderr, "heap profiling not supported\n"); - Env::Default()->DeleteFile(fname); - } - } -}; - -} // namespace leveldb - -int main(int argc, char** argv) { - FLAGS_write_buffer_size = leveldb::Options().write_buffer_size; - FLAGS_open_files = leveldb::Options().max_open_files; - std::string default_db_path; - - for (int i = 1; i < argc; i++) { - double d; - int n; - char junk; - if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) { - FLAGS_benchmarks = argv[i] + strlen("--benchmarks="); - } else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) { - FLAGS_compression_ratio = d; - } else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 && - (n == 0 || n == 1)) { - FLAGS_histogram = n; - } else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 && - (n == 0 || n == 1)) { - FLAGS_use_existing_db = n; - } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) { - FLAGS_num = n; - } else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) { - FLAGS_reads = n; - } else if (sscanf(argv[i], "--threads=%d%c", &n, &junk) == 1) { - FLAGS_threads = n; - } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) { - FLAGS_value_size = n; - } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) { - FLAGS_write_buffer_size = n; - } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) { - FLAGS_cache_size = n; - } else if (sscanf(argv[i], "--bloom_bits=%d%c", &n, &junk) == 1) { - FLAGS_bloom_bits = n; - } else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) { - FLAGS_open_files = n; - } else if (strncmp(argv[i], "--db=", 5) == 0) { - FLAGS_db = argv[i] + 5; - } else { - fprintf(stderr, "Invalid flag '%s'\n", argv[i]); - exit(1); - } - } - - // Choose a location for the test database if none given with --db=<path> - if (FLAGS_db == NULL) { - leveldb::Env::Default()->GetTestDirectory(&default_db_path); - default_db_path += "/dbbench"; - FLAGS_db = default_db_path.c_str(); - } - - leveldb::Benchmark benchmark; - benchmark.Run(); - return 0; -} diff --git a/src/leveldb/db/db_impl.cc b/src/leveldb/db/db_impl.cc deleted file mode 100644 index c9de169f29..0000000000 --- a/src/leveldb/db/db_impl.cc +++ /dev/null @@ -1,1467 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/db_impl.h" - -#include <algorithm> -#include <set> -#include <string> -#include <stdint.h> -#include <stdio.h> -#include <vector> -#include "db/builder.h" -#include "db/db_iter.h" -#include "db/dbformat.h" -#include "db/filename.h" -#include "db/log_reader.h" -#include "db/log_writer.h" -#include "db/memtable.h" -#include "db/table_cache.h" -#include "db/version_set.h" -#include "db/write_batch_internal.h" -#include "leveldb/db.h" -#include "leveldb/env.h" -#include "leveldb/status.h" -#include "leveldb/table.h" -#include "leveldb/table_builder.h" -#include "port/port.h" -#include "table/block.h" -#include "table/merger.h" -#include "table/two_level_iterator.h" -#include "util/coding.h" -#include "util/logging.h" -#include "util/mutexlock.h" - -namespace leveldb { - -// Information kept for every waiting writer -struct DBImpl::Writer { - Status status; - WriteBatch* batch; - bool sync; - bool done; - port::CondVar cv; - - explicit Writer(port::Mutex* mu) : cv(mu) { } -}; - -struct DBImpl::CompactionState { - Compaction* const compaction; - - // Sequence numbers < smallest_snapshot are not significant since we - // will never have to service a snapshot below smallest_snapshot. - // Therefore if we have seen a sequence number S <= smallest_snapshot, - // we can drop all entries for the same key with sequence numbers < S. - SequenceNumber smallest_snapshot; - - // Files produced by compaction - struct Output { - uint64_t number; - uint64_t file_size; - InternalKey smallest, largest; - }; - std::vector<Output> outputs; - - // State kept for output being generated - WritableFile* outfile; - TableBuilder* builder; - - uint64_t total_bytes; - - Output* current_output() { return &outputs[outputs.size()-1]; } - - explicit CompactionState(Compaction* c) - : compaction(c), - outfile(NULL), - builder(NULL), - total_bytes(0) { - } -}; - -// Fix user-supplied options to be reasonable -template <class T,class V> -static void ClipToRange(T* ptr, V minvalue, V maxvalue) { - if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue; - if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue; -} -Options SanitizeOptions(const std::string& dbname, - const InternalKeyComparator* icmp, - const InternalFilterPolicy* ipolicy, - const Options& src) { - Options result = src; - result.comparator = icmp; - result.filter_policy = (src.filter_policy != NULL) ? ipolicy : NULL; - ClipToRange(&result.max_open_files, 20, 50000); - ClipToRange(&result.write_buffer_size, 64<<10, 1<<30); - ClipToRange(&result.block_size, 1<<10, 4<<20); - if (result.info_log == NULL) { - // Open a log file in the same directory as the db - src.env->CreateDir(dbname); // In case it does not exist - src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname)); - Status s = src.env->NewLogger(InfoLogFileName(dbname), &result.info_log); - if (!s.ok()) { - // No place suitable for logging - result.info_log = NULL; - } - } - if (result.block_cache == NULL) { - result.block_cache = NewLRUCache(8 << 20); - } - return result; -} - -DBImpl::DBImpl(const Options& options, const std::string& dbname) - : env_(options.env), - internal_comparator_(options.comparator), - internal_filter_policy_(options.filter_policy), - options_(SanitizeOptions( - dbname, &internal_comparator_, &internal_filter_policy_, options)), - owns_info_log_(options_.info_log != options.info_log), - owns_cache_(options_.block_cache != options.block_cache), - dbname_(dbname), - db_lock_(NULL), - shutting_down_(NULL), - bg_cv_(&mutex_), - mem_(new MemTable(internal_comparator_)), - imm_(NULL), - logfile_(NULL), - logfile_number_(0), - log_(NULL), - tmp_batch_(new WriteBatch), - bg_compaction_scheduled_(false), - manual_compaction_(NULL) { - mem_->Ref(); - has_imm_.Release_Store(NULL); - - // Reserve ten files or so for other uses and give the rest to TableCache. - const int table_cache_size = options.max_open_files - 10; - table_cache_ = new TableCache(dbname_, &options_, table_cache_size); - - versions_ = new VersionSet(dbname_, &options_, table_cache_, - &internal_comparator_); -} - -DBImpl::~DBImpl() { - // Wait for background work to finish - mutex_.Lock(); - shutting_down_.Release_Store(this); // Any non-NULL value is ok - while (bg_compaction_scheduled_) { - bg_cv_.Wait(); - } - mutex_.Unlock(); - - if (db_lock_ != NULL) { - env_->UnlockFile(db_lock_); - } - - delete versions_; - if (mem_ != NULL) mem_->Unref(); - if (imm_ != NULL) imm_->Unref(); - delete tmp_batch_; - delete log_; - delete logfile_; - delete table_cache_; - - if (owns_info_log_) { - delete options_.info_log; - } - if (owns_cache_) { - delete options_.block_cache; - } -} - -Status DBImpl::NewDB() { - VersionEdit new_db; - new_db.SetComparatorName(user_comparator()->Name()); - new_db.SetLogNumber(0); - new_db.SetNextFile(2); - new_db.SetLastSequence(0); - - const std::string manifest = DescriptorFileName(dbname_, 1); - WritableFile* file; - Status s = env_->NewWritableFile(manifest, &file); - if (!s.ok()) { - return s; - } - { - log::Writer log(file); - std::string record; - new_db.EncodeTo(&record); - s = log.AddRecord(record); - if (s.ok()) { - s = file->Close(); - } - } - delete file; - if (s.ok()) { - // Make "CURRENT" file that points to the new manifest file. - s = SetCurrentFile(env_, dbname_, 1); - } else { - env_->DeleteFile(manifest); - } - return s; -} - -void DBImpl::MaybeIgnoreError(Status* s) const { - if (s->ok() || options_.paranoid_checks) { - // No change needed - } else { - Log(options_.info_log, "Ignoring error %s", s->ToString().c_str()); - *s = Status::OK(); - } -} - -void DBImpl::DeleteObsoleteFiles() { - // Make a set of all of the live files - std::set<uint64_t> live = pending_outputs_; - versions_->AddLiveFiles(&live); - - std::vector<std::string> filenames; - env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose - uint64_t number; - FileType type; - for (size_t i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type)) { - bool keep = true; - switch (type) { - case kLogFile: - keep = ((number >= versions_->LogNumber()) || - (number == versions_->PrevLogNumber())); - break; - case kDescriptorFile: - // Keep my manifest file, and any newer incarnations' - // (in case there is a race that allows other incarnations) - keep = (number >= versions_->ManifestFileNumber()); - break; - case kTableFile: - keep = (live.find(number) != live.end()); - break; - case kTempFile: - // Any temp files that are currently being written to must - // be recorded in pending_outputs_, which is inserted into "live" - keep = (live.find(number) != live.end()); - break; - case kCurrentFile: - case kDBLockFile: - case kInfoLogFile: - keep = true; - break; - } - - if (!keep) { - if (type == kTableFile) { - table_cache_->Evict(number); - } - Log(options_.info_log, "Delete type=%d #%lld\n", - int(type), - static_cast<unsigned long long>(number)); - env_->DeleteFile(dbname_ + "/" + filenames[i]); - } - } - } -} - -Status DBImpl::Recover(VersionEdit* edit) { - mutex_.AssertHeld(); - - // Ignore error from CreateDir since the creation of the DB is - // committed only when the descriptor is created, and this directory - // may already exist from a previous failed creation attempt. - env_->CreateDir(dbname_); - assert(db_lock_ == NULL); - Status s = env_->LockFile(LockFileName(dbname_), &db_lock_); - if (!s.ok()) { - return s; - } - - if (!env_->FileExists(CurrentFileName(dbname_))) { - if (options_.create_if_missing) { - s = NewDB(); - if (!s.ok()) { - return s; - } - } else { - return Status::InvalidArgument( - dbname_, "does not exist (create_if_missing is false)"); - } - } else { - if (options_.error_if_exists) { - return Status::InvalidArgument( - dbname_, "exists (error_if_exists is true)"); - } - } - - s = versions_->Recover(); - if (s.ok()) { - SequenceNumber max_sequence(0); - - // Recover from all newer log files than the ones named in the - // descriptor (new log files may have been added by the previous - // incarnation without registering them in the descriptor). - // - // Note that PrevLogNumber() is no longer used, but we pay - // attention to it in case we are recovering a database - // produced by an older version of leveldb. - const uint64_t min_log = versions_->LogNumber(); - const uint64_t prev_log = versions_->PrevLogNumber(); - std::vector<std::string> filenames; - s = env_->GetChildren(dbname_, &filenames); - if (!s.ok()) { - return s; - } - uint64_t number; - FileType type; - std::vector<uint64_t> logs; - for (size_t i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type) - && type == kLogFile - && ((number >= min_log) || (number == prev_log))) { - logs.push_back(number); - } - } - - // Recover in the order in which the logs were generated - std::sort(logs.begin(), logs.end()); - for (size_t i = 0; i < logs.size(); i++) { - s = RecoverLogFile(logs[i], edit, &max_sequence); - - // The previous incarnation may not have written any MANIFEST - // records after allocating this log number. So we manually - // update the file number allocation counter in VersionSet. - versions_->MarkFileNumberUsed(logs[i]); - } - - if (s.ok()) { - if (versions_->LastSequence() < max_sequence) { - versions_->SetLastSequence(max_sequence); - } - } - } - - return s; -} - -Status DBImpl::RecoverLogFile(uint64_t log_number, - VersionEdit* edit, - SequenceNumber* max_sequence) { - struct LogReporter : public log::Reader::Reporter { - Env* env; - Logger* info_log; - const char* fname; - Status* status; // NULL if options_.paranoid_checks==false - virtual void Corruption(size_t bytes, const Status& s) { - Log(info_log, "%s%s: dropping %d bytes; %s", - (this->status == NULL ? "(ignoring error) " : ""), - fname, static_cast<int>(bytes), s.ToString().c_str()); - if (this->status != NULL && this->status->ok()) *this->status = s; - } - }; - - mutex_.AssertHeld(); - - // Open the log file - std::string fname = LogFileName(dbname_, log_number); - SequentialFile* file; - Status status = env_->NewSequentialFile(fname, &file); - if (!status.ok()) { - MaybeIgnoreError(&status); - return status; - } - - // Create the log reader. - LogReporter reporter; - reporter.env = env_; - reporter.info_log = options_.info_log; - reporter.fname = fname.c_str(); - reporter.status = (options_.paranoid_checks ? &status : NULL); - // We intentially make log::Reader do checksumming even if - // paranoid_checks==false so that corruptions cause entire commits - // to be skipped instead of propagating bad information (like overly - // large sequence numbers). - log::Reader reader(file, &reporter, true/*checksum*/, - 0/*initial_offset*/); - Log(options_.info_log, "Recovering log #%llu", - (unsigned long long) log_number); - - // Read all the records and add to a memtable - std::string scratch; - Slice record; - WriteBatch batch; - MemTable* mem = NULL; - while (reader.ReadRecord(&record, &scratch) && - status.ok()) { - if (record.size() < 12) { - reporter.Corruption( - record.size(), Status::Corruption("log record too small")); - continue; - } - WriteBatchInternal::SetContents(&batch, record); - - if (mem == NULL) { - mem = new MemTable(internal_comparator_); - mem->Ref(); - } - status = WriteBatchInternal::InsertInto(&batch, mem); - MaybeIgnoreError(&status); - if (!status.ok()) { - break; - } - const SequenceNumber last_seq = - WriteBatchInternal::Sequence(&batch) + - WriteBatchInternal::Count(&batch) - 1; - if (last_seq > *max_sequence) { - *max_sequence = last_seq; - } - - if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) { - status = WriteLevel0Table(mem, edit, NULL); - if (!status.ok()) { - // Reflect errors immediately so that conditions like full - // file-systems cause the DB::Open() to fail. - break; - } - mem->Unref(); - mem = NULL; - } - } - - if (status.ok() && mem != NULL) { - status = WriteLevel0Table(mem, edit, NULL); - // Reflect errors immediately so that conditions like full - // file-systems cause the DB::Open() to fail. - } - - if (mem != NULL) mem->Unref(); - delete file; - return status; -} - -Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit, - Version* base) { - mutex_.AssertHeld(); - const uint64_t start_micros = env_->NowMicros(); - FileMetaData meta; - meta.number = versions_->NewFileNumber(); - pending_outputs_.insert(meta.number); - Iterator* iter = mem->NewIterator(); - Log(options_.info_log, "Level-0 table #%llu: started", - (unsigned long long) meta.number); - - Status s; - { - mutex_.Unlock(); - s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta); - mutex_.Lock(); - } - - Log(options_.info_log, "Level-0 table #%llu: %lld bytes %s", - (unsigned long long) meta.number, - (unsigned long long) meta.file_size, - s.ToString().c_str()); - delete iter; - pending_outputs_.erase(meta.number); - - - // Note that if file_size is zero, the file has been deleted and - // should not be added to the manifest. - int level = 0; - if (s.ok() && meta.file_size > 0) { - const Slice min_user_key = meta.smallest.user_key(); - const Slice max_user_key = meta.largest.user_key(); - if (base != NULL) { - level = base->PickLevelForMemTableOutput(min_user_key, max_user_key); - } - edit->AddFile(level, meta.number, meta.file_size, - meta.smallest, meta.largest); - } - - CompactionStats stats; - stats.micros = env_->NowMicros() - start_micros; - stats.bytes_written = meta.file_size; - stats_[level].Add(stats); - return s; -} - -Status DBImpl::CompactMemTable() { - mutex_.AssertHeld(); - assert(imm_ != NULL); - - // Save the contents of the memtable as a new Table - VersionEdit edit; - Version* base = versions_->current(); - base->Ref(); - Status s = WriteLevel0Table(imm_, &edit, base); - base->Unref(); - - if (s.ok() && shutting_down_.Acquire_Load()) { - s = Status::IOError("Deleting DB during memtable compaction"); - } - - // Replace immutable memtable with the generated Table - if (s.ok()) { - edit.SetPrevLogNumber(0); - edit.SetLogNumber(logfile_number_); // Earlier logs no longer needed - s = versions_->LogAndApply(&edit, &mutex_); - } - - if (s.ok()) { - // Commit to the new state - imm_->Unref(); - imm_ = NULL; - has_imm_.Release_Store(NULL); - DeleteObsoleteFiles(); - } - - return s; -} - -void DBImpl::CompactRange(const Slice* begin, const Slice* end) { - int max_level_with_files = 1; - { - MutexLock l(&mutex_); - Version* base = versions_->current(); - for (int level = 1; level < config::kNumLevels; level++) { - if (base->OverlapInLevel(level, begin, end)) { - max_level_with_files = level; - } - } - } - TEST_CompactMemTable(); // TODO(sanjay): Skip if memtable does not overlap - for (int level = 0; level < max_level_with_files; level++) { - TEST_CompactRange(level, begin, end); - } -} - -void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { - assert(level >= 0); - assert(level + 1 < config::kNumLevels); - - InternalKey begin_storage, end_storage; - - ManualCompaction manual; - manual.level = level; - manual.done = false; - if (begin == NULL) { - manual.begin = NULL; - } else { - begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek); - manual.begin = &begin_storage; - } - if (end == NULL) { - manual.end = NULL; - } else { - end_storage = InternalKey(*end, 0, static_cast<ValueType>(0)); - manual.end = &end_storage; - } - - MutexLock l(&mutex_); - while (!manual.done) { - while (manual_compaction_ != NULL) { - bg_cv_.Wait(); - } - manual_compaction_ = &manual; - MaybeScheduleCompaction(); - while (manual_compaction_ == &manual) { - bg_cv_.Wait(); - } - } -} - -Status DBImpl::TEST_CompactMemTable() { - // NULL batch means just wait for earlier writes to be done - Status s = Write(WriteOptions(), NULL); - if (s.ok()) { - // Wait until the compaction completes - MutexLock l(&mutex_); - while (imm_ != NULL && bg_error_.ok()) { - bg_cv_.Wait(); - } - if (imm_ != NULL) { - s = bg_error_; - } - } - return s; -} - -void DBImpl::MaybeScheduleCompaction() { - mutex_.AssertHeld(); - if (bg_compaction_scheduled_) { - // Already scheduled - } else if (shutting_down_.Acquire_Load()) { - // DB is being deleted; no more background compactions - } else if (imm_ == NULL && - manual_compaction_ == NULL && - !versions_->NeedsCompaction()) { - // No work to be done - } else { - bg_compaction_scheduled_ = true; - env_->Schedule(&DBImpl::BGWork, this); - } -} - -void DBImpl::BGWork(void* db) { - reinterpret_cast<DBImpl*>(db)->BackgroundCall(); -} - -void DBImpl::BackgroundCall() { - MutexLock l(&mutex_); - assert(bg_compaction_scheduled_); - if (!shutting_down_.Acquire_Load()) { - Status s = BackgroundCompaction(); - if (s.ok()) { - // Success - } else if (shutting_down_.Acquire_Load()) { - // Error most likely due to shutdown; do not wait - } else { - // Wait a little bit before retrying background compaction in - // case this is an environmental problem and we do not want to - // chew up resources for failed compactions for the duration of - // the problem. - bg_cv_.SignalAll(); // In case a waiter can proceed despite the error - Log(options_.info_log, "Waiting after background compaction error: %s", - s.ToString().c_str()); - mutex_.Unlock(); - env_->SleepForMicroseconds(1000000); - mutex_.Lock(); - } - } - - bg_compaction_scheduled_ = false; - - // Previous compaction may have produced too many files in a level, - // so reschedule another compaction if needed. - MaybeScheduleCompaction(); - bg_cv_.SignalAll(); -} - -Status DBImpl::BackgroundCompaction() { - mutex_.AssertHeld(); - - if (imm_ != NULL) { - return CompactMemTable(); - } - - Compaction* c; - bool is_manual = (manual_compaction_ != NULL); - InternalKey manual_end; - if (is_manual) { - ManualCompaction* m = manual_compaction_; - c = versions_->CompactRange(m->level, m->begin, m->end); - m->done = (c == NULL); - if (c != NULL) { - manual_end = c->input(0, c->num_input_files(0) - 1)->largest; - } - Log(options_.info_log, - "Manual compaction at level-%d from %s .. %s; will stop at %s\n", - m->level, - (m->begin ? m->begin->DebugString().c_str() : "(begin)"), - (m->end ? m->end->DebugString().c_str() : "(end)"), - (m->done ? "(end)" : manual_end.DebugString().c_str())); - } else { - c = versions_->PickCompaction(); - } - - Status status; - if (c == NULL) { - // Nothing to do - } else if (!is_manual && c->IsTrivialMove()) { - // Move file to next level - assert(c->num_input_files(0) == 1); - FileMetaData* f = c->input(0, 0); - c->edit()->DeleteFile(c->level(), f->number); - c->edit()->AddFile(c->level() + 1, f->number, f->file_size, - f->smallest, f->largest); - status = versions_->LogAndApply(c->edit(), &mutex_); - VersionSet::LevelSummaryStorage tmp; - Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n", - static_cast<unsigned long long>(f->number), - c->level() + 1, - static_cast<unsigned long long>(f->file_size), - status.ToString().c_str(), - versions_->LevelSummary(&tmp)); - } else { - CompactionState* compact = new CompactionState(c); - status = DoCompactionWork(compact); - CleanupCompaction(compact); - c->ReleaseInputs(); - DeleteObsoleteFiles(); - } - delete c; - - if (status.ok()) { - // Done - } else if (shutting_down_.Acquire_Load()) { - // Ignore compaction errors found during shutting down - } else { - Log(options_.info_log, - "Compaction error: %s", status.ToString().c_str()); - if (options_.paranoid_checks && bg_error_.ok()) { - bg_error_ = status; - } - } - - if (is_manual) { - ManualCompaction* m = manual_compaction_; - if (!status.ok()) { - m->done = true; - } - if (!m->done) { - // We only compacted part of the requested range. Update *m - // to the range that is left to be compacted. - m->tmp_storage = manual_end; - m->begin = &m->tmp_storage; - } - manual_compaction_ = NULL; - } - return status; -} - -void DBImpl::CleanupCompaction(CompactionState* compact) { - mutex_.AssertHeld(); - if (compact->builder != NULL) { - // May happen if we get a shutdown call in the middle of compaction - compact->builder->Abandon(); - delete compact->builder; - } else { - assert(compact->outfile == NULL); - } - delete compact->outfile; - for (size_t i = 0; i < compact->outputs.size(); i++) { - const CompactionState::Output& out = compact->outputs[i]; - pending_outputs_.erase(out.number); - } - delete compact; -} - -Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { - assert(compact != NULL); - assert(compact->builder == NULL); - uint64_t file_number; - { - mutex_.Lock(); - file_number = versions_->NewFileNumber(); - pending_outputs_.insert(file_number); - CompactionState::Output out; - out.number = file_number; - out.smallest.Clear(); - out.largest.Clear(); - compact->outputs.push_back(out); - mutex_.Unlock(); - } - - // Make the output file - std::string fname = TableFileName(dbname_, file_number); - Status s = env_->NewWritableFile(fname, &compact->outfile); - if (s.ok()) { - compact->builder = new TableBuilder(options_, compact->outfile); - } - return s; -} - -Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, - Iterator* input) { - assert(compact != NULL); - assert(compact->outfile != NULL); - assert(compact->builder != NULL); - - const uint64_t output_number = compact->current_output()->number; - assert(output_number != 0); - - // Check for iterator errors - Status s = input->status(); - const uint64_t current_entries = compact->builder->NumEntries(); - if (s.ok()) { - s = compact->builder->Finish(); - } else { - compact->builder->Abandon(); - } - const uint64_t current_bytes = compact->builder->FileSize(); - compact->current_output()->file_size = current_bytes; - compact->total_bytes += current_bytes; - delete compact->builder; - compact->builder = NULL; - - // Finish and check for file errors - if (s.ok()) { - s = compact->outfile->Sync(); - } - if (s.ok()) { - s = compact->outfile->Close(); - } - delete compact->outfile; - compact->outfile = NULL; - - if (s.ok() && current_entries > 0) { - // Verify that the table is usable - Iterator* iter = table_cache_->NewIterator(ReadOptions(), - output_number, - current_bytes); - s = iter->status(); - delete iter; - if (s.ok()) { - Log(options_.info_log, - "Generated table #%llu: %lld keys, %lld bytes", - (unsigned long long) output_number, - (unsigned long long) current_entries, - (unsigned long long) current_bytes); - } - } - return s; -} - - -Status DBImpl::InstallCompactionResults(CompactionState* compact) { - mutex_.AssertHeld(); - Log(options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes", - compact->compaction->num_input_files(0), - compact->compaction->level(), - compact->compaction->num_input_files(1), - compact->compaction->level() + 1, - static_cast<long long>(compact->total_bytes)); - - // Add compaction outputs - compact->compaction->AddInputDeletions(compact->compaction->edit()); - const int level = compact->compaction->level(); - for (size_t i = 0; i < compact->outputs.size(); i++) { - const CompactionState::Output& out = compact->outputs[i]; - compact->compaction->edit()->AddFile( - level + 1, - out.number, out.file_size, out.smallest, out.largest); - } - return versions_->LogAndApply(compact->compaction->edit(), &mutex_); -} - -Status DBImpl::DoCompactionWork(CompactionState* compact) { - const uint64_t start_micros = env_->NowMicros(); - int64_t imm_micros = 0; // Micros spent doing imm_ compactions - - Log(options_.info_log, "Compacting %d@%d + %d@%d files", - compact->compaction->num_input_files(0), - compact->compaction->level(), - compact->compaction->num_input_files(1), - compact->compaction->level() + 1); - - assert(versions_->NumLevelFiles(compact->compaction->level()) > 0); - assert(compact->builder == NULL); - assert(compact->outfile == NULL); - if (snapshots_.empty()) { - compact->smallest_snapshot = versions_->LastSequence(); - } else { - compact->smallest_snapshot = snapshots_.oldest()->number_; - } - - // Release mutex while we're actually doing the compaction work - mutex_.Unlock(); - - Iterator* input = versions_->MakeInputIterator(compact->compaction); - input->SeekToFirst(); - Status status; - ParsedInternalKey ikey; - std::string current_user_key; - bool has_current_user_key = false; - SequenceNumber last_sequence_for_key = kMaxSequenceNumber; - for (; input->Valid() && !shutting_down_.Acquire_Load(); ) { - // Prioritize immutable compaction work - if (has_imm_.NoBarrier_Load() != NULL) { - const uint64_t imm_start = env_->NowMicros(); - mutex_.Lock(); - if (imm_ != NULL) { - CompactMemTable(); - bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary - } - mutex_.Unlock(); - imm_micros += (env_->NowMicros() - imm_start); - } - - Slice key = input->key(); - if (compact->compaction->ShouldStopBefore(key) && - compact->builder != NULL) { - status = FinishCompactionOutputFile(compact, input); - if (!status.ok()) { - break; - } - } - - // Handle key/value, add to state, etc. - bool drop = false; - if (!ParseInternalKey(key, &ikey)) { - // Do not hide error keys - current_user_key.clear(); - has_current_user_key = false; - last_sequence_for_key = kMaxSequenceNumber; - } else { - if (!has_current_user_key || - user_comparator()->Compare(ikey.user_key, - Slice(current_user_key)) != 0) { - // First occurrence of this user key - current_user_key.assign(ikey.user_key.data(), ikey.user_key.size()); - has_current_user_key = true; - last_sequence_for_key = kMaxSequenceNumber; - } - - if (last_sequence_for_key <= compact->smallest_snapshot) { - // Hidden by an newer entry for same user key - drop = true; // (A) - } else if (ikey.type == kTypeDeletion && - ikey.sequence <= compact->smallest_snapshot && - compact->compaction->IsBaseLevelForKey(ikey.user_key)) { - // For this user key: - // (1) there is no data in higher levels - // (2) data in lower levels will have larger sequence numbers - // (3) data in layers that are being compacted here and have - // smaller sequence numbers will be dropped in the next - // few iterations of this loop (by rule (A) above). - // Therefore this deletion marker is obsolete and can be dropped. - drop = true; - } - - last_sequence_for_key = ikey.sequence; - } -#if 0 - Log(options_.info_log, - " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, " - "%d smallest_snapshot: %d", - ikey.user_key.ToString().c_str(), - (int)ikey.sequence, ikey.type, kTypeValue, drop, - compact->compaction->IsBaseLevelForKey(ikey.user_key), - (int)last_sequence_for_key, (int)compact->smallest_snapshot); -#endif - - if (!drop) { - // Open output file if necessary - if (compact->builder == NULL) { - status = OpenCompactionOutputFile(compact); - if (!status.ok()) { - break; - } - } - if (compact->builder->NumEntries() == 0) { - compact->current_output()->smallest.DecodeFrom(key); - } - compact->current_output()->largest.DecodeFrom(key); - compact->builder->Add(key, input->value()); - - // Close output file if it is big enough - if (compact->builder->FileSize() >= - compact->compaction->MaxOutputFileSize()) { - status = FinishCompactionOutputFile(compact, input); - if (!status.ok()) { - break; - } - } - } - - input->Next(); - } - - if (status.ok() && shutting_down_.Acquire_Load()) { - status = Status::IOError("Deleting DB during compaction"); - } - if (status.ok() && compact->builder != NULL) { - status = FinishCompactionOutputFile(compact, input); - } - if (status.ok()) { - status = input->status(); - } - delete input; - input = NULL; - - CompactionStats stats; - stats.micros = env_->NowMicros() - start_micros - imm_micros; - for (int which = 0; which < 2; which++) { - for (int i = 0; i < compact->compaction->num_input_files(which); i++) { - stats.bytes_read += compact->compaction->input(which, i)->file_size; - } - } - for (size_t i = 0; i < compact->outputs.size(); i++) { - stats.bytes_written += compact->outputs[i].file_size; - } - - mutex_.Lock(); - stats_[compact->compaction->level() + 1].Add(stats); - - if (status.ok()) { - status = InstallCompactionResults(compact); - } - VersionSet::LevelSummaryStorage tmp; - Log(options_.info_log, - "compacted to: %s", versions_->LevelSummary(&tmp)); - return status; -} - -namespace { -struct IterState { - port::Mutex* mu; - Version* version; - MemTable* mem; - MemTable* imm; -}; - -static void CleanupIteratorState(void* arg1, void* arg2) { - IterState* state = reinterpret_cast<IterState*>(arg1); - state->mu->Lock(); - state->mem->Unref(); - if (state->imm != NULL) state->imm->Unref(); - state->version->Unref(); - state->mu->Unlock(); - delete state; -} -} // namespace - -Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, - SequenceNumber* latest_snapshot) { - IterState* cleanup = new IterState; - mutex_.Lock(); - *latest_snapshot = versions_->LastSequence(); - - // Collect together all needed child iterators - std::vector<Iterator*> list; - list.push_back(mem_->NewIterator()); - mem_->Ref(); - if (imm_ != NULL) { - list.push_back(imm_->NewIterator()); - imm_->Ref(); - } - versions_->current()->AddIterators(options, &list); - Iterator* internal_iter = - NewMergingIterator(&internal_comparator_, &list[0], list.size()); - versions_->current()->Ref(); - - cleanup->mu = &mutex_; - cleanup->mem = mem_; - cleanup->imm = imm_; - cleanup->version = versions_->current(); - internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, NULL); - - mutex_.Unlock(); - return internal_iter; -} - -Iterator* DBImpl::TEST_NewInternalIterator() { - SequenceNumber ignored; - return NewInternalIterator(ReadOptions(), &ignored); -} - -int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { - MutexLock l(&mutex_); - return versions_->MaxNextLevelOverlappingBytes(); -} - -Status DBImpl::Get(const ReadOptions& options, - const Slice& key, - std::string* value) { - Status s; - MutexLock l(&mutex_); - SequenceNumber snapshot; - if (options.snapshot != NULL) { - snapshot = reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_; - } else { - snapshot = versions_->LastSequence(); - } - - MemTable* mem = mem_; - MemTable* imm = imm_; - Version* current = versions_->current(); - mem->Ref(); - if (imm != NULL) imm->Ref(); - current->Ref(); - - bool have_stat_update = false; - Version::GetStats stats; - - // Unlock while reading from files and memtables - { - mutex_.Unlock(); - // First look in the memtable, then in the immutable memtable (if any). - LookupKey lkey(key, snapshot); - if (mem->Get(lkey, value, &s)) { - // Done - } else if (imm != NULL && imm->Get(lkey, value, &s)) { - // Done - } else { - s = current->Get(options, lkey, value, &stats); - have_stat_update = true; - } - mutex_.Lock(); - } - - if (have_stat_update && current->UpdateStats(stats)) { - MaybeScheduleCompaction(); - } - mem->Unref(); - if (imm != NULL) imm->Unref(); - current->Unref(); - return s; -} - -Iterator* DBImpl::NewIterator(const ReadOptions& options) { - SequenceNumber latest_snapshot; - Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot); - return NewDBIterator( - &dbname_, env_, user_comparator(), internal_iter, - (options.snapshot != NULL - ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_ - : latest_snapshot)); -} - -const Snapshot* DBImpl::GetSnapshot() { - MutexLock l(&mutex_); - return snapshots_.New(versions_->LastSequence()); -} - -void DBImpl::ReleaseSnapshot(const Snapshot* s) { - MutexLock l(&mutex_); - snapshots_.Delete(reinterpret_cast<const SnapshotImpl*>(s)); -} - -// Convenience methods -Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) { - return DB::Put(o, key, val); -} - -Status DBImpl::Delete(const WriteOptions& options, const Slice& key) { - return DB::Delete(options, key); -} - -Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { - Writer w(&mutex_); - w.batch = my_batch; - w.sync = options.sync; - w.done = false; - - MutexLock l(&mutex_); - writers_.push_back(&w); - while (!w.done && &w != writers_.front()) { - w.cv.Wait(); - } - if (w.done) { - return w.status; - } - - // May temporarily unlock and wait. - Status status = MakeRoomForWrite(my_batch == NULL); - uint64_t last_sequence = versions_->LastSequence(); - Writer* last_writer = &w; - if (status.ok() && my_batch != NULL) { // NULL batch is for compactions - WriteBatch* updates = BuildBatchGroup(&last_writer); - WriteBatchInternal::SetSequence(updates, last_sequence + 1); - last_sequence += WriteBatchInternal::Count(updates); - - // Add to log and apply to memtable. We can release the lock - // during this phase since &w is currently responsible for logging - // and protects against concurrent loggers and concurrent writes - // into mem_. - { - mutex_.Unlock(); - status = log_->AddRecord(WriteBatchInternal::Contents(updates)); - if (status.ok() && options.sync) { - status = logfile_->Sync(); - } - if (status.ok()) { - status = WriteBatchInternal::InsertInto(updates, mem_); - } - mutex_.Lock(); - } - if (updates == tmp_batch_) tmp_batch_->Clear(); - - versions_->SetLastSequence(last_sequence); - } - - while (true) { - Writer* ready = writers_.front(); - writers_.pop_front(); - if (ready != &w) { - ready->status = status; - ready->done = true; - ready->cv.Signal(); - } - if (ready == last_writer) break; - } - - // Notify new head of write queue - if (!writers_.empty()) { - writers_.front()->cv.Signal(); - } - - return status; -} - -// REQUIRES: Writer list must be non-empty -// REQUIRES: First writer must have a non-NULL batch -WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) { - assert(!writers_.empty()); - Writer* first = writers_.front(); - WriteBatch* result = first->batch; - assert(result != NULL); - - size_t size = WriteBatchInternal::ByteSize(first->batch); - - // Allow the group to grow up to a maximum size, but if the - // original write is small, limit the growth so we do not slow - // down the small write too much. - size_t max_size = 1 << 20; - if (size <= (128<<10)) { - max_size = size + (128<<10); - } - - *last_writer = first; - std::deque<Writer*>::iterator iter = writers_.begin(); - ++iter; // Advance past "first" - for (; iter != writers_.end(); ++iter) { - Writer* w = *iter; - if (w->sync && !first->sync) { - // Do not include a sync write into a batch handled by a non-sync write. - break; - } - - if (w->batch != NULL) { - size += WriteBatchInternal::ByteSize(w->batch); - if (size > max_size) { - // Do not make batch too big - break; - } - - // Append to *reuslt - if (result == first->batch) { - // Switch to temporary batch instead of disturbing caller's batch - result = tmp_batch_; - assert(WriteBatchInternal::Count(result) == 0); - WriteBatchInternal::Append(result, first->batch); - } - WriteBatchInternal::Append(result, w->batch); - } - *last_writer = w; - } - return result; -} - -// REQUIRES: mutex_ is held -// REQUIRES: this thread is currently at the front of the writer queue -Status DBImpl::MakeRoomForWrite(bool force) { - mutex_.AssertHeld(); - assert(!writers_.empty()); - bool allow_delay = !force; - Status s; - while (true) { - if (!bg_error_.ok()) { - // Yield previous error - s = bg_error_; - break; - } else if ( - allow_delay && - versions_->NumLevelFiles(0) >= config::kL0_SlowdownWritesTrigger) { - // We are getting close to hitting a hard limit on the number of - // L0 files. Rather than delaying a single write by several - // seconds when we hit the hard limit, start delaying each - // individual write by 1ms to reduce latency variance. Also, - // this delay hands over some CPU to the compaction thread in - // case it is sharing the same core as the writer. - mutex_.Unlock(); - env_->SleepForMicroseconds(1000); - allow_delay = false; // Do not delay a single write more than once - mutex_.Lock(); - } else if (!force && - (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) { - // There is room in current memtable - break; - } else if (imm_ != NULL) { - // We have filled up the current memtable, but the previous - // one is still being compacted, so we wait. - bg_cv_.Wait(); - } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) { - // There are too many level-0 files. - Log(options_.info_log, "waiting...\n"); - bg_cv_.Wait(); - } else { - // Attempt to switch to a new memtable and trigger compaction of old - assert(versions_->PrevLogNumber() == 0); - uint64_t new_log_number = versions_->NewFileNumber(); - WritableFile* lfile = NULL; - s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile); - if (!s.ok()) { - // Avoid chewing through file number space in a tight loop. - versions_->ReuseFileNumber(new_log_number); - break; - } - delete log_; - delete logfile_; - logfile_ = lfile; - logfile_number_ = new_log_number; - log_ = new log::Writer(lfile); - imm_ = mem_; - has_imm_.Release_Store(imm_); - mem_ = new MemTable(internal_comparator_); - mem_->Ref(); - force = false; // Do not force another compaction if have room - MaybeScheduleCompaction(); - } - } - return s; -} - -bool DBImpl::GetProperty(const Slice& property, std::string* value) { - value->clear(); - - MutexLock l(&mutex_); - Slice in = property; - Slice prefix("leveldb."); - if (!in.starts_with(prefix)) return false; - in.remove_prefix(prefix.size()); - - if (in.starts_with("num-files-at-level")) { - in.remove_prefix(strlen("num-files-at-level")); - uint64_t level; - bool ok = ConsumeDecimalNumber(&in, &level) && in.empty(); - if (!ok || level >= config::kNumLevels) { - return false; - } else { - char buf[100]; - snprintf(buf, sizeof(buf), "%d", - versions_->NumLevelFiles(static_cast<int>(level))); - *value = buf; - return true; - } - } else if (in == "stats") { - char buf[200]; - snprintf(buf, sizeof(buf), - " Compactions\n" - "Level Files Size(MB) Time(sec) Read(MB) Write(MB)\n" - "--------------------------------------------------\n" - ); - value->append(buf); - for (int level = 0; level < config::kNumLevels; level++) { - int files = versions_->NumLevelFiles(level); - if (stats_[level].micros > 0 || files > 0) { - snprintf( - buf, sizeof(buf), - "%3d %8d %8.0f %9.0f %8.0f %9.0f\n", - level, - files, - versions_->NumLevelBytes(level) / 1048576.0, - stats_[level].micros / 1e6, - stats_[level].bytes_read / 1048576.0, - stats_[level].bytes_written / 1048576.0); - value->append(buf); - } - } - return true; - } else if (in == "sstables") { - *value = versions_->current()->DebugString(); - return true; - } - - return false; -} - -void DBImpl::GetApproximateSizes( - const Range* range, int n, - uint64_t* sizes) { - // TODO(opt): better implementation - Version* v; - { - MutexLock l(&mutex_); - versions_->current()->Ref(); - v = versions_->current(); - } - - for (int i = 0; i < n; i++) { - // Convert user_key into a corresponding internal key. - InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek); - InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek); - uint64_t start = versions_->ApproximateOffsetOf(v, k1); - uint64_t limit = versions_->ApproximateOffsetOf(v, k2); - sizes[i] = (limit >= start ? limit - start : 0); - } - - { - MutexLock l(&mutex_); - v->Unref(); - } -} - -// Default implementations of convenience methods that subclasses of DB -// can call if they wish -Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) { - WriteBatch batch; - batch.Put(key, value); - return Write(opt, &batch); -} - -Status DB::Delete(const WriteOptions& opt, const Slice& key) { - WriteBatch batch; - batch.Delete(key); - return Write(opt, &batch); -} - -DB::~DB() { } - -Status DB::Open(const Options& options, const std::string& dbname, - DB** dbptr) { - *dbptr = NULL; - - DBImpl* impl = new DBImpl(options, dbname); - impl->mutex_.Lock(); - VersionEdit edit; - Status s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists - if (s.ok()) { - uint64_t new_log_number = impl->versions_->NewFileNumber(); - WritableFile* lfile; - s = options.env->NewWritableFile(LogFileName(dbname, new_log_number), - &lfile); - if (s.ok()) { - edit.SetLogNumber(new_log_number); - impl->logfile_ = lfile; - impl->logfile_number_ = new_log_number; - impl->log_ = new log::Writer(lfile); - s = impl->versions_->LogAndApply(&edit, &impl->mutex_); - } - if (s.ok()) { - impl->DeleteObsoleteFiles(); - impl->MaybeScheduleCompaction(); - } - } - impl->mutex_.Unlock(); - if (s.ok()) { - *dbptr = impl; - } else { - delete impl; - } - return s; -} - -Snapshot::~Snapshot() { -} - -Status DestroyDB(const std::string& dbname, const Options& options) { - Env* env = options.env; - std::vector<std::string> filenames; - // Ignore error in case directory does not exist - env->GetChildren(dbname, &filenames); - if (filenames.empty()) { - return Status::OK(); - } - - FileLock* lock; - const std::string lockname = LockFileName(dbname); - Status result = env->LockFile(lockname, &lock); - if (result.ok()) { - uint64_t number; - FileType type; - for (size_t i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type) && - type != kDBLockFile) { // Lock file will be deleted at end - Status del = env->DeleteFile(dbname + "/" + filenames[i]); - if (result.ok() && !del.ok()) { - result = del; - } - } - } - env->UnlockFile(lock); // Ignore error since state is already gone - env->DeleteFile(lockname); - env->DeleteDir(dbname); // Ignore error in case dir contains other files - } - return result; -} - -} // namespace leveldb diff --git a/src/leveldb/db/db_impl.h b/src/leveldb/db/db_impl.h deleted file mode 100644 index bd29dd8055..0000000000 --- a/src/leveldb/db/db_impl.h +++ /dev/null @@ -1,202 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_DB_IMPL_H_ -#define STORAGE_LEVELDB_DB_DB_IMPL_H_ - -#include <deque> -#include <set> -#include "db/dbformat.h" -#include "db/log_writer.h" -#include "db/snapshot.h" -#include "leveldb/db.h" -#include "leveldb/env.h" -#include "port/port.h" -#include "port/thread_annotations.h" - -namespace leveldb { - -class MemTable; -class TableCache; -class Version; -class VersionEdit; -class VersionSet; - -class DBImpl : public DB { - public: - DBImpl(const Options& options, const std::string& dbname); - virtual ~DBImpl(); - - // Implementations of the DB interface - virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value); - virtual Status Delete(const WriteOptions&, const Slice& key); - virtual Status Write(const WriteOptions& options, WriteBatch* updates); - virtual Status Get(const ReadOptions& options, - const Slice& key, - std::string* value); - virtual Iterator* NewIterator(const ReadOptions&); - virtual const Snapshot* GetSnapshot(); - virtual void ReleaseSnapshot(const Snapshot* snapshot); - virtual bool GetProperty(const Slice& property, std::string* value); - virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes); - virtual void CompactRange(const Slice* begin, const Slice* end); - - // Extra methods (for testing) that are not in the public DB interface - - // Compact any files in the named level that overlap [*begin,*end] - void TEST_CompactRange(int level, const Slice* begin, const Slice* end); - - // Force current memtable contents to be compacted. - Status TEST_CompactMemTable(); - - // Return an internal iterator over the current state of the database. - // The keys of this iterator are internal keys (see format.h). - // The returned iterator should be deleted when no longer needed. - Iterator* TEST_NewInternalIterator(); - - // Return the maximum overlapping data (in bytes) at next level for any - // file at a level >= 1. - int64_t TEST_MaxNextLevelOverlappingBytes(); - - private: - friend class DB; - struct CompactionState; - struct Writer; - - Iterator* NewInternalIterator(const ReadOptions&, - SequenceNumber* latest_snapshot); - - Status NewDB(); - - // Recover the descriptor from persistent storage. May do a significant - // amount of work to recover recently logged updates. Any changes to - // be made to the descriptor are added to *edit. - Status Recover(VersionEdit* edit) EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - void MaybeIgnoreError(Status* s) const; - - // Delete any unneeded files and stale in-memory entries. - void DeleteObsoleteFiles(); - - // Compact the in-memory write buffer to disk. Switches to a new - // log-file/memtable and writes a new descriptor iff successful. - Status CompactMemTable() - EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - Status RecoverLogFile(uint64_t log_number, - VersionEdit* edit, - SequenceNumber* max_sequence) - EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base) - EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - Status MakeRoomForWrite(bool force /* compact even if there is room? */) - EXCLUSIVE_LOCKS_REQUIRED(mutex_); - WriteBatch* BuildBatchGroup(Writer** last_writer); - - void MaybeScheduleCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_); - static void BGWork(void* db); - void BackgroundCall(); - Status BackgroundCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_); - void CleanupCompaction(CompactionState* compact) - EXCLUSIVE_LOCKS_REQUIRED(mutex_); - Status DoCompactionWork(CompactionState* compact) - EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - Status OpenCompactionOutputFile(CompactionState* compact); - Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input); - Status InstallCompactionResults(CompactionState* compact) - EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - // Constant after construction - Env* const env_; - const InternalKeyComparator internal_comparator_; - const InternalFilterPolicy internal_filter_policy_; - const Options options_; // options_.comparator == &internal_comparator_ - bool owns_info_log_; - bool owns_cache_; - const std::string dbname_; - - // table_cache_ provides its own synchronization - TableCache* table_cache_; - - // Lock over the persistent DB state. Non-NULL iff successfully acquired. - FileLock* db_lock_; - - // State below is protected by mutex_ - port::Mutex mutex_; - port::AtomicPointer shutting_down_; - port::CondVar bg_cv_; // Signalled when background work finishes - MemTable* mem_; - MemTable* imm_; // Memtable being compacted - port::AtomicPointer has_imm_; // So bg thread can detect non-NULL imm_ - WritableFile* logfile_; - uint64_t logfile_number_; - log::Writer* log_; - - // Queue of writers. - std::deque<Writer*> writers_; - WriteBatch* tmp_batch_; - - SnapshotList snapshots_; - - // Set of table files to protect from deletion because they are - // part of ongoing compactions. - std::set<uint64_t> pending_outputs_; - - // Has a background compaction been scheduled or is running? - bool bg_compaction_scheduled_; - - // Information for a manual compaction - struct ManualCompaction { - int level; - bool done; - const InternalKey* begin; // NULL means beginning of key range - const InternalKey* end; // NULL means end of key range - InternalKey tmp_storage; // Used to keep track of compaction progress - }; - ManualCompaction* manual_compaction_; - - VersionSet* versions_; - - // Have we encountered a background error in paranoid mode? - Status bg_error_; - - // Per level compaction stats. stats_[level] stores the stats for - // compactions that produced data for the specified "level". - struct CompactionStats { - int64_t micros; - int64_t bytes_read; - int64_t bytes_written; - - CompactionStats() : micros(0), bytes_read(0), bytes_written(0) { } - - void Add(const CompactionStats& c) { - this->micros += c.micros; - this->bytes_read += c.bytes_read; - this->bytes_written += c.bytes_written; - } - }; - CompactionStats stats_[config::kNumLevels]; - - // No copying allowed - DBImpl(const DBImpl&); - void operator=(const DBImpl&); - - const Comparator* user_comparator() const { - return internal_comparator_.user_comparator(); - } -}; - -// Sanitize db options. The caller should delete result.info_log if -// it is not equal to src.info_log. -extern Options SanitizeOptions(const std::string& db, - const InternalKeyComparator* icmp, - const InternalFilterPolicy* ipolicy, - const Options& src); - -} // namespace leveldb - -#endif // STORAGE_LEVELDB_DB_DB_IMPL_H_ diff --git a/src/leveldb/db/db_iter.cc b/src/leveldb/db/db_iter.cc deleted file mode 100644 index 87dca2ded4..0000000000 --- a/src/leveldb/db/db_iter.cc +++ /dev/null @@ -1,299 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/db_iter.h" - -#include "db/filename.h" -#include "db/dbformat.h" -#include "leveldb/env.h" -#include "leveldb/iterator.h" -#include "port/port.h" -#include "util/logging.h" -#include "util/mutexlock.h" - -namespace leveldb { - -#if 0 -static void DumpInternalIter(Iterator* iter) { - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ParsedInternalKey k; - if (!ParseInternalKey(iter->key(), &k)) { - fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str()); - } else { - fprintf(stderr, "@ '%s'\n", k.DebugString().c_str()); - } - } -} -#endif - -namespace { - -// Memtables and sstables that make the DB representation contain -// (userkey,seq,type) => uservalue entries. DBIter -// combines multiple entries for the same userkey found in the DB -// representation into a single entry while accounting for sequence -// numbers, deletion markers, overwrites, etc. -class DBIter: public Iterator { - public: - // Which direction is the iterator currently moving? - // (1) When moving forward, the internal iterator is positioned at - // the exact entry that yields this->key(), this->value() - // (2) When moving backwards, the internal iterator is positioned - // just before all entries whose user key == this->key(). - enum Direction { - kForward, - kReverse - }; - - DBIter(const std::string* dbname, Env* env, - const Comparator* cmp, Iterator* iter, SequenceNumber s) - : dbname_(dbname), - env_(env), - user_comparator_(cmp), - iter_(iter), - sequence_(s), - direction_(kForward), - valid_(false) { - } - virtual ~DBIter() { - delete iter_; - } - virtual bool Valid() const { return valid_; } - virtual Slice key() const { - assert(valid_); - return (direction_ == kForward) ? ExtractUserKey(iter_->key()) : saved_key_; - } - virtual Slice value() const { - assert(valid_); - return (direction_ == kForward) ? iter_->value() : saved_value_; - } - virtual Status status() const { - if (status_.ok()) { - return iter_->status(); - } else { - return status_; - } - } - - virtual void Next(); - virtual void Prev(); - virtual void Seek(const Slice& target); - virtual void SeekToFirst(); - virtual void SeekToLast(); - - private: - void FindNextUserEntry(bool skipping, std::string* skip); - void FindPrevUserEntry(); - bool ParseKey(ParsedInternalKey* key); - - inline void SaveKey(const Slice& k, std::string* dst) { - dst->assign(k.data(), k.size()); - } - - inline void ClearSavedValue() { - if (saved_value_.capacity() > 1048576) { - std::string empty; - swap(empty, saved_value_); - } else { - saved_value_.clear(); - } - } - - const std::string* const dbname_; - Env* const env_; - const Comparator* const user_comparator_; - Iterator* const iter_; - SequenceNumber const sequence_; - - Status status_; - std::string saved_key_; // == current key when direction_==kReverse - std::string saved_value_; // == current raw value when direction_==kReverse - Direction direction_; - bool valid_; - - // No copying allowed - DBIter(const DBIter&); - void operator=(const DBIter&); -}; - -inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { - if (!ParseInternalKey(iter_->key(), ikey)) { - status_ = Status::Corruption("corrupted internal key in DBIter"); - return false; - } else { - return true; - } -} - -void DBIter::Next() { - assert(valid_); - - if (direction_ == kReverse) { // Switch directions? - direction_ = kForward; - // iter_ is pointing just before the entries for this->key(), - // so advance into the range of entries for this->key() and then - // use the normal skipping code below. - if (!iter_->Valid()) { - iter_->SeekToFirst(); - } else { - iter_->Next(); - } - if (!iter_->Valid()) { - valid_ = false; - saved_key_.clear(); - return; - } - } - - // Temporarily use saved_key_ as storage for key to skip. - std::string* skip = &saved_key_; - SaveKey(ExtractUserKey(iter_->key()), skip); - FindNextUserEntry(true, skip); -} - -void DBIter::FindNextUserEntry(bool skipping, std::string* skip) { - // Loop until we hit an acceptable entry to yield - assert(iter_->Valid()); - assert(direction_ == kForward); - do { - ParsedInternalKey ikey; - if (ParseKey(&ikey) && ikey.sequence <= sequence_) { - switch (ikey.type) { - case kTypeDeletion: - // Arrange to skip all upcoming entries for this key since - // they are hidden by this deletion. - SaveKey(ikey.user_key, skip); - skipping = true; - break; - case kTypeValue: - if (skipping && - user_comparator_->Compare(ikey.user_key, *skip) <= 0) { - // Entry hidden - } else { - valid_ = true; - saved_key_.clear(); - return; - } - break; - } - } - iter_->Next(); - } while (iter_->Valid()); - saved_key_.clear(); - valid_ = false; -} - -void DBIter::Prev() { - assert(valid_); - - if (direction_ == kForward) { // Switch directions? - // iter_ is pointing at the current entry. Scan backwards until - // the key changes so we can use the normal reverse scanning code. - assert(iter_->Valid()); // Otherwise valid_ would have been false - SaveKey(ExtractUserKey(iter_->key()), &saved_key_); - while (true) { - iter_->Prev(); - if (!iter_->Valid()) { - valid_ = false; - saved_key_.clear(); - ClearSavedValue(); - return; - } - if (user_comparator_->Compare(ExtractUserKey(iter_->key()), - saved_key_) < 0) { - break; - } - } - direction_ = kReverse; - } - - FindPrevUserEntry(); -} - -void DBIter::FindPrevUserEntry() { - assert(direction_ == kReverse); - - ValueType value_type = kTypeDeletion; - if (iter_->Valid()) { - do { - ParsedInternalKey ikey; - if (ParseKey(&ikey) && ikey.sequence <= sequence_) { - if ((value_type != kTypeDeletion) && - user_comparator_->Compare(ikey.user_key, saved_key_) < 0) { - // We encountered a non-deleted value in entries for previous keys, - break; - } - value_type = ikey.type; - if (value_type == kTypeDeletion) { - saved_key_.clear(); - ClearSavedValue(); - } else { - Slice raw_value = iter_->value(); - if (saved_value_.capacity() > raw_value.size() + 1048576) { - std::string empty; - swap(empty, saved_value_); - } - SaveKey(ExtractUserKey(iter_->key()), &saved_key_); - saved_value_.assign(raw_value.data(), raw_value.size()); - } - } - iter_->Prev(); - } while (iter_->Valid()); - } - - if (value_type == kTypeDeletion) { - // End - valid_ = false; - saved_key_.clear(); - ClearSavedValue(); - direction_ = kForward; - } else { - valid_ = true; - } -} - -void DBIter::Seek(const Slice& target) { - direction_ = kForward; - ClearSavedValue(); - saved_key_.clear(); - AppendInternalKey( - &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek)); - iter_->Seek(saved_key_); - if (iter_->Valid()) { - FindNextUserEntry(false, &saved_key_ /* temporary storage */); - } else { - valid_ = false; - } -} - -void DBIter::SeekToFirst() { - direction_ = kForward; - ClearSavedValue(); - iter_->SeekToFirst(); - if (iter_->Valid()) { - FindNextUserEntry(false, &saved_key_ /* temporary storage */); - } else { - valid_ = false; - } -} - -void DBIter::SeekToLast() { - direction_ = kReverse; - ClearSavedValue(); - iter_->SeekToLast(); - FindPrevUserEntry(); -} - -} // anonymous namespace - -Iterator* NewDBIterator( - const std::string* dbname, - Env* env, - const Comparator* user_key_comparator, - Iterator* internal_iter, - const SequenceNumber& sequence) { - return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence); -} - -} // namespace leveldb diff --git a/src/leveldb/db/db_iter.h b/src/leveldb/db/db_iter.h deleted file mode 100644 index d9e1b174ab..0000000000 --- a/src/leveldb/db/db_iter.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_DB_ITER_H_ -#define STORAGE_LEVELDB_DB_DB_ITER_H_ - -#include <stdint.h> -#include "leveldb/db.h" -#include "db/dbformat.h" - -namespace leveldb { - -// Return a new iterator that converts internal keys (yielded by -// "*internal_iter") that were live at the specified "sequence" number -// into appropriate user keys. -extern Iterator* NewDBIterator( - const std::string* dbname, - Env* env, - const Comparator* user_key_comparator, - Iterator* internal_iter, - const SequenceNumber& sequence); - -} // namespace leveldb - -#endif // STORAGE_LEVELDB_DB_DB_ITER_H_ diff --git a/src/leveldb/db/db_test.cc b/src/leveldb/db/db_test.cc deleted file mode 100644 index 684ea3bdbc..0000000000 --- a/src/leveldb/db/db_test.cc +++ /dev/null @@ -1,2027 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/db.h" -#include "leveldb/filter_policy.h" -#include "db/db_impl.h" -#include "db/filename.h" -#include "db/version_set.h" -#include "db/write_batch_internal.h" -#include "leveldb/cache.h" -#include "leveldb/env.h" -#include "leveldb/table.h" -#include "util/hash.h" -#include "util/logging.h" -#include "util/mutexlock.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace leveldb { - -static std::string RandomString(Random* rnd, int len) { - std::string r; - test::RandomString(rnd, len, &r); - return r; -} - -namespace { -class AtomicCounter { - private: - port::Mutex mu_; - int count_; - public: - AtomicCounter() : count_(0) { } - void Increment() { - MutexLock l(&mu_); - count_++; - } - int Read() { - MutexLock l(&mu_); - return count_; - } - void Reset() { - MutexLock l(&mu_); - count_ = 0; - } -}; -} - -// Special Env used to delay background operations -class SpecialEnv : public EnvWrapper { - public: - // sstable Sync() calls are blocked while this pointer is non-NULL. - port::AtomicPointer delay_sstable_sync_; - - // Simulate no-space errors while this pointer is non-NULL. - port::AtomicPointer no_space_; - - // Simulate non-writable file system while this pointer is non-NULL - port::AtomicPointer non_writable_; - - // Force sync of manifest files to fail while this pointer is non-NULL - port::AtomicPointer manifest_sync_error_; - - // Force write to manifest files to fail while this pointer is non-NULL - port::AtomicPointer manifest_write_error_; - - bool count_random_reads_; - AtomicCounter random_read_counter_; - - AtomicCounter sleep_counter_; - - explicit SpecialEnv(Env* base) : EnvWrapper(base) { - delay_sstable_sync_.Release_Store(NULL); - no_space_.Release_Store(NULL); - non_writable_.Release_Store(NULL); - count_random_reads_ = false; - manifest_sync_error_.Release_Store(NULL); - manifest_write_error_.Release_Store(NULL); - } - - Status NewWritableFile(const std::string& f, WritableFile** r) { - class SSTableFile : public WritableFile { - private: - SpecialEnv* env_; - WritableFile* base_; - - public: - SSTableFile(SpecialEnv* env, WritableFile* base) - : env_(env), - base_(base) { - } - ~SSTableFile() { delete base_; } - Status Append(const Slice& data) { - if (env_->no_space_.Acquire_Load() != NULL) { - // Drop writes on the floor - return Status::OK(); - } else { - return base_->Append(data); - } - } - Status Close() { return base_->Close(); } - Status Flush() { return base_->Flush(); } - Status Sync() { - while (env_->delay_sstable_sync_.Acquire_Load() != NULL) { - env_->SleepForMicroseconds(100000); - } - return base_->Sync(); - } - }; - class ManifestFile : public WritableFile { - private: - SpecialEnv* env_; - WritableFile* base_; - public: - ManifestFile(SpecialEnv* env, WritableFile* b) : env_(env), base_(b) { } - ~ManifestFile() { delete base_; } - Status Append(const Slice& data) { - if (env_->manifest_write_error_.Acquire_Load() != NULL) { - return Status::IOError("simulated writer error"); - } else { - return base_->Append(data); - } - } - Status Close() { return base_->Close(); } - Status Flush() { return base_->Flush(); } - Status Sync() { - if (env_->manifest_sync_error_.Acquire_Load() != NULL) { - return Status::IOError("simulated sync error"); - } else { - return base_->Sync(); - } - } - }; - - if (non_writable_.Acquire_Load() != NULL) { - return Status::IOError("simulated write error"); - } - - Status s = target()->NewWritableFile(f, r); - if (s.ok()) { - if (strstr(f.c_str(), ".sst") != NULL) { - *r = new SSTableFile(this, *r); - } else if (strstr(f.c_str(), "MANIFEST") != NULL) { - *r = new ManifestFile(this, *r); - } - } - return s; - } - - Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) { - class CountingFile : public RandomAccessFile { - private: - RandomAccessFile* target_; - AtomicCounter* counter_; - public: - CountingFile(RandomAccessFile* target, AtomicCounter* counter) - : target_(target), counter_(counter) { - } - virtual ~CountingFile() { delete target_; } - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const { - counter_->Increment(); - return target_->Read(offset, n, result, scratch); - } - }; - - Status s = target()->NewRandomAccessFile(f, r); - if (s.ok() && count_random_reads_) { - *r = new CountingFile(*r, &random_read_counter_); - } - return s; - } - - virtual void SleepForMicroseconds(int micros) { - sleep_counter_.Increment(); - target()->SleepForMicroseconds(micros); - } -}; - -class DBTest { - private: - const FilterPolicy* filter_policy_; - - // Sequence of option configurations to try - enum OptionConfig { - kDefault, - kFilter, - kUncompressed, - kEnd - }; - int option_config_; - - public: - std::string dbname_; - SpecialEnv* env_; - DB* db_; - - Options last_options_; - - DBTest() : option_config_(kDefault), - env_(new SpecialEnv(Env::Default())) { - filter_policy_ = NewBloomFilterPolicy(10); - dbname_ = test::TmpDir() + "/db_test"; - DestroyDB(dbname_, Options()); - db_ = NULL; - Reopen(); - } - - ~DBTest() { - delete db_; - DestroyDB(dbname_, Options()); - delete env_; - delete filter_policy_; - } - - // Switch to a fresh database with the next option configuration to - // test. Return false if there are no more configurations to test. - bool ChangeOptions() { - option_config_++; - if (option_config_ >= kEnd) { - return false; - } else { - DestroyAndReopen(); - return true; - } - } - - // Return the current option configuration. - Options CurrentOptions() { - Options options; - switch (option_config_) { - case kFilter: - options.filter_policy = filter_policy_; - break; - case kUncompressed: - options.compression = kNoCompression; - break; - default: - break; - } - return options; - } - - DBImpl* dbfull() { - return reinterpret_cast<DBImpl*>(db_); - } - - void Reopen(Options* options = NULL) { - ASSERT_OK(TryReopen(options)); - } - - void Close() { - delete db_; - db_ = NULL; - } - - void DestroyAndReopen(Options* options = NULL) { - delete db_; - db_ = NULL; - DestroyDB(dbname_, Options()); - ASSERT_OK(TryReopen(options)); - } - - Status TryReopen(Options* options) { - delete db_; - db_ = NULL; - Options opts; - if (options != NULL) { - opts = *options; - } else { - opts = CurrentOptions(); - opts.create_if_missing = true; - } - last_options_ = opts; - - return DB::Open(opts, dbname_, &db_); - } - - Status Put(const std::string& k, const std::string& v) { - return db_->Put(WriteOptions(), k, v); - } - - Status Delete(const std::string& k) { - return db_->Delete(WriteOptions(), k); - } - - std::string Get(const std::string& k, const Snapshot* snapshot = NULL) { - ReadOptions options; - options.snapshot = snapshot; - std::string result; - Status s = db_->Get(options, k, &result); - if (s.IsNotFound()) { - result = "NOT_FOUND"; - } else if (!s.ok()) { - result = s.ToString(); - } - return result; - } - - // Return a string that contains all key,value pairs in order, - // formatted like "(k1->v1)(k2->v2)". - std::string Contents() { - std::vector<std::string> forward; - std::string result; - Iterator* iter = db_->NewIterator(ReadOptions()); - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - std::string s = IterStatus(iter); - result.push_back('('); - result.append(s); - result.push_back(')'); - forward.push_back(s); - } - - // Check reverse iteration results are the reverse of forward results - int matched = 0; - for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { - ASSERT_LT(matched, forward.size()); - ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]); - matched++; - } - ASSERT_EQ(matched, forward.size()); - - delete iter; - return result; - } - - std::string AllEntriesFor(const Slice& user_key) { - Iterator* iter = dbfull()->TEST_NewInternalIterator(); - InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); - iter->Seek(target.Encode()); - std::string result; - if (!iter->status().ok()) { - result = iter->status().ToString(); - } else { - result = "[ "; - bool first = true; - while (iter->Valid()) { - ParsedInternalKey ikey; - if (!ParseInternalKey(iter->key(), &ikey)) { - result += "CORRUPTED"; - } else { - if (last_options_.comparator->Compare(ikey.user_key, user_key) != 0) { - break; - } - if (!first) { - result += ", "; - } - first = false; - switch (ikey.type) { - case kTypeValue: - result += iter->value().ToString(); - break; - case kTypeDeletion: - result += "DEL"; - break; - } - } - iter->Next(); - } - if (!first) { - result += " "; - } - result += "]"; - } - delete iter; - return result; - } - - int NumTableFilesAtLevel(int level) { - std::string property; - ASSERT_TRUE( - db_->GetProperty("leveldb.num-files-at-level" + NumberToString(level), - &property)); - return atoi(property.c_str()); - } - - int TotalTableFiles() { - int result = 0; - for (int level = 0; level < config::kNumLevels; level++) { - result += NumTableFilesAtLevel(level); - } - return result; - } - - // Return spread of files per level - std::string FilesPerLevel() { - std::string result; - int last_non_zero_offset = 0; - for (int level = 0; level < config::kNumLevels; level++) { - int f = NumTableFilesAtLevel(level); - char buf[100]; - snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f); - result += buf; - if (f > 0) { - last_non_zero_offset = result.size(); - } - } - result.resize(last_non_zero_offset); - return result; - } - - int CountFiles() { - std::vector<std::string> files; - env_->GetChildren(dbname_, &files); - return static_cast<int>(files.size()); - } - - uint64_t Size(const Slice& start, const Slice& limit) { - Range r(start, limit); - uint64_t size; - db_->GetApproximateSizes(&r, 1, &size); - return size; - } - - void Compact(const Slice& start, const Slice& limit) { - db_->CompactRange(&start, &limit); - } - - // Do n memtable compactions, each of which produces an sstable - // covering the range [small,large]. - void MakeTables(int n, const std::string& small, const std::string& large) { - for (int i = 0; i < n; i++) { - Put(small, "begin"); - Put(large, "end"); - dbfull()->TEST_CompactMemTable(); - } - } - - // Prevent pushing of new sstables into deeper levels by adding - // tables that cover a specified range to all levels. - void FillLevels(const std::string& smallest, const std::string& largest) { - MakeTables(config::kNumLevels, smallest, largest); - } - - void DumpFileCounts(const char* label) { - fprintf(stderr, "---\n%s:\n", label); - fprintf(stderr, "maxoverlap: %lld\n", - static_cast<long long>( - dbfull()->TEST_MaxNextLevelOverlappingBytes())); - for (int level = 0; level < config::kNumLevels; level++) { - int num = NumTableFilesAtLevel(level); - if (num > 0) { - fprintf(stderr, " level %3d : %d files\n", level, num); - } - } - } - - std::string DumpSSTableList() { - std::string property; - db_->GetProperty("leveldb.sstables", &property); - return property; - } - - std::string IterStatus(Iterator* iter) { - std::string result; - if (iter->Valid()) { - result = iter->key().ToString() + "->" + iter->value().ToString(); - } else { - result = "(invalid)"; - } - return result; - } -}; - -TEST(DBTest, Empty) { - do { - ASSERT_TRUE(db_ != NULL); - ASSERT_EQ("NOT_FOUND", Get("foo")); - } while (ChangeOptions()); -} - -TEST(DBTest, ReadWrite) { - do { - ASSERT_OK(Put("foo", "v1")); - ASSERT_EQ("v1", Get("foo")); - ASSERT_OK(Put("bar", "v2")); - ASSERT_OK(Put("foo", "v3")); - ASSERT_EQ("v3", Get("foo")); - ASSERT_EQ("v2", Get("bar")); - } while (ChangeOptions()); -} - -TEST(DBTest, PutDeleteGet) { - do { - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1")); - ASSERT_EQ("v1", Get("foo")); - ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2")); - ASSERT_EQ("v2", Get("foo")); - ASSERT_OK(db_->Delete(WriteOptions(), "foo")); - ASSERT_EQ("NOT_FOUND", Get("foo")); - } while (ChangeOptions()); -} - -TEST(DBTest, GetFromImmutableLayer) { - do { - Options options = CurrentOptions(); - options.env = env_; - options.write_buffer_size = 100000; // Small write buffer - Reopen(&options); - - ASSERT_OK(Put("foo", "v1")); - ASSERT_EQ("v1", Get("foo")); - - env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls - Put("k1", std::string(100000, 'x')); // Fill memtable - Put("k2", std::string(100000, 'y')); // Trigger compaction - ASSERT_EQ("v1", Get("foo")); - env_->delay_sstable_sync_.Release_Store(NULL); // Release sync calls - } while (ChangeOptions()); -} - -TEST(DBTest, GetFromVersions) { - do { - ASSERT_OK(Put("foo", "v1")); - dbfull()->TEST_CompactMemTable(); - ASSERT_EQ("v1", Get("foo")); - } while (ChangeOptions()); -} - -TEST(DBTest, GetSnapshot) { - do { - // Try with both a short key and a long key - for (int i = 0; i < 2; i++) { - std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x'); - ASSERT_OK(Put(key, "v1")); - const Snapshot* s1 = db_->GetSnapshot(); - ASSERT_OK(Put(key, "v2")); - ASSERT_EQ("v2", Get(key)); - ASSERT_EQ("v1", Get(key, s1)); - dbfull()->TEST_CompactMemTable(); - ASSERT_EQ("v2", Get(key)); - ASSERT_EQ("v1", Get(key, s1)); - db_->ReleaseSnapshot(s1); - } - } while (ChangeOptions()); -} - -TEST(DBTest, GetLevel0Ordering) { - do { - // Check that we process level-0 files in correct order. The code - // below generates two level-0 files where the earlier one comes - // before the later one in the level-0 file list since the earlier - // one has a smaller "smallest" key. - ASSERT_OK(Put("bar", "b")); - ASSERT_OK(Put("foo", "v1")); - dbfull()->TEST_CompactMemTable(); - ASSERT_OK(Put("foo", "v2")); - dbfull()->TEST_CompactMemTable(); - ASSERT_EQ("v2", Get("foo")); - } while (ChangeOptions()); -} - -TEST(DBTest, GetOrderedByLevels) { - do { - ASSERT_OK(Put("foo", "v1")); - Compact("a", "z"); - ASSERT_EQ("v1", Get("foo")); - ASSERT_OK(Put("foo", "v2")); - ASSERT_EQ("v2", Get("foo")); - dbfull()->TEST_CompactMemTable(); - ASSERT_EQ("v2", Get("foo")); - } while (ChangeOptions()); -} - -TEST(DBTest, GetPicksCorrectFile) { - do { - // Arrange to have multiple files in a non-level-0 level. - ASSERT_OK(Put("a", "va")); - Compact("a", "b"); - ASSERT_OK(Put("x", "vx")); - Compact("x", "y"); - ASSERT_OK(Put("f", "vf")); - Compact("f", "g"); - ASSERT_EQ("va", Get("a")); - ASSERT_EQ("vf", Get("f")); - ASSERT_EQ("vx", Get("x")); - } while (ChangeOptions()); -} - -TEST(DBTest, GetEncountersEmptyLevel) { - do { - // Arrange for the following to happen: - // * sstable A in level 0 - // * nothing in level 1 - // * sstable B in level 2 - // Then do enough Get() calls to arrange for an automatic compaction - // of sstable A. A bug would cause the compaction to be marked as - // occuring at level 1 (instead of the correct level 0). - - // Step 1: First place sstables in levels 0 and 2 - int compaction_count = 0; - while (NumTableFilesAtLevel(0) == 0 || - NumTableFilesAtLevel(2) == 0) { - ASSERT_LE(compaction_count, 100) << "could not fill levels 0 and 2"; - compaction_count++; - Put("a", "begin"); - Put("z", "end"); - dbfull()->TEST_CompactMemTable(); - } - - // Step 2: clear level 1 if necessary. - dbfull()->TEST_CompactRange(1, NULL, NULL); - ASSERT_EQ(NumTableFilesAtLevel(0), 1); - ASSERT_EQ(NumTableFilesAtLevel(1), 0); - ASSERT_EQ(NumTableFilesAtLevel(2), 1); - - // Step 3: read a bunch of times - for (int i = 0; i < 1000; i++) { - ASSERT_EQ("NOT_FOUND", Get("missing")); - } - - // Step 4: Wait for compaction to finish - env_->SleepForMicroseconds(1000000); - - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - } while (ChangeOptions()); -} - -TEST(DBTest, IterEmpty) { - Iterator* iter = db_->NewIterator(ReadOptions()); - - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek("foo"); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; -} - -TEST(DBTest, IterSingle) { - ASSERT_OK(Put("a", "va")); - Iterator* iter = db_->NewIterator(ReadOptions()); - - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek(""); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek("a"); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek("b"); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; -} - -TEST(DBTest, IterMulti) { - ASSERT_OK(Put("a", "va")); - ASSERT_OK(Put("b", "vb")); - ASSERT_OK(Put("c", "vc")); - Iterator* iter = db_->NewIterator(ReadOptions()); - - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->Seek(""); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Seek("a"); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Seek("ax"); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Seek("b"); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Seek("z"); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - // Switch from reverse to forward - iter->SeekToLast(); - iter->Prev(); - iter->Prev(); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - - // Switch from forward to reverse - iter->SeekToFirst(); - iter->Next(); - iter->Next(); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - - // Make sure iter stays at snapshot - ASSERT_OK(Put("a", "va2")); - ASSERT_OK(Put("a2", "va3")); - ASSERT_OK(Put("b", "vb2")); - ASSERT_OK(Put("c", "vc2")); - ASSERT_OK(Delete("b")); - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->vb"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; -} - -TEST(DBTest, IterSmallAndLargeMix) { - ASSERT_OK(Put("a", "va")); - ASSERT_OK(Put("b", std::string(100000, 'b'))); - ASSERT_OK(Put("c", "vc")); - ASSERT_OK(Put("d", std::string(100000, 'd'))); - ASSERT_OK(Put("e", std::string(100000, 'e'))); - - Iterator* iter = db_->NewIterator(ReadOptions()); - - iter->SeekToFirst(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); - iter->Next(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - iter->SeekToLast(); - ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e')); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd')); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b')); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->va"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "(invalid)"); - - delete iter; -} - -TEST(DBTest, IterMultiWithDelete) { - do { - ASSERT_OK(Put("a", "va")); - ASSERT_OK(Put("b", "vb")); - ASSERT_OK(Put("c", "vc")); - ASSERT_OK(Delete("b")); - ASSERT_EQ("NOT_FOUND", Get("b")); - - Iterator* iter = db_->NewIterator(ReadOptions()); - iter->Seek("c"); - ASSERT_EQ(IterStatus(iter), "c->vc"); - iter->Prev(); - ASSERT_EQ(IterStatus(iter), "a->va"); - delete iter; - } while (ChangeOptions()); -} - -TEST(DBTest, Recover) { - do { - ASSERT_OK(Put("foo", "v1")); - ASSERT_OK(Put("baz", "v5")); - - Reopen(); - ASSERT_EQ("v1", Get("foo")); - - ASSERT_EQ("v1", Get("foo")); - ASSERT_EQ("v5", Get("baz")); - ASSERT_OK(Put("bar", "v2")); - ASSERT_OK(Put("foo", "v3")); - - Reopen(); - ASSERT_EQ("v3", Get("foo")); - ASSERT_OK(Put("foo", "v4")); - ASSERT_EQ("v4", Get("foo")); - ASSERT_EQ("v2", Get("bar")); - ASSERT_EQ("v5", Get("baz")); - } while (ChangeOptions()); -} - -TEST(DBTest, RecoveryWithEmptyLog) { - do { - ASSERT_OK(Put("foo", "v1")); - ASSERT_OK(Put("foo", "v2")); - Reopen(); - Reopen(); - ASSERT_OK(Put("foo", "v3")); - Reopen(); - ASSERT_EQ("v3", Get("foo")); - } while (ChangeOptions()); -} - -// Check that writes done during a memtable compaction are recovered -// if the database is shutdown during the memtable compaction. -TEST(DBTest, RecoverDuringMemtableCompaction) { - do { - Options options = CurrentOptions(); - options.env = env_; - options.write_buffer_size = 1000000; - Reopen(&options); - - // Trigger a long memtable compaction and reopen the database during it - ASSERT_OK(Put("foo", "v1")); // Goes to 1st log file - ASSERT_OK(Put("big1", std::string(10000000, 'x'))); // Fills memtable - ASSERT_OK(Put("big2", std::string(1000, 'y'))); // Triggers compaction - ASSERT_OK(Put("bar", "v2")); // Goes to new log file - - Reopen(&options); - ASSERT_EQ("v1", Get("foo")); - ASSERT_EQ("v2", Get("bar")); - ASSERT_EQ(std::string(10000000, 'x'), Get("big1")); - ASSERT_EQ(std::string(1000, 'y'), Get("big2")); - } while (ChangeOptions()); -} - -static std::string Key(int i) { - char buf[100]; - snprintf(buf, sizeof(buf), "key%06d", i); - return std::string(buf); -} - -TEST(DBTest, MinorCompactionsHappen) { - Options options = CurrentOptions(); - options.write_buffer_size = 10000; - Reopen(&options); - - const int N = 500; - - int starting_num_tables = TotalTableFiles(); - for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v'))); - } - int ending_num_tables = TotalTableFiles(); - ASSERT_GT(ending_num_tables, starting_num_tables); - - for (int i = 0; i < N; i++) { - ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); - } - - Reopen(); - - for (int i = 0; i < N; i++) { - ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i))); - } -} - -TEST(DBTest, RecoverWithLargeLog) { - { - Options options = CurrentOptions(); - Reopen(&options); - ASSERT_OK(Put("big1", std::string(200000, '1'))); - ASSERT_OK(Put("big2", std::string(200000, '2'))); - ASSERT_OK(Put("small3", std::string(10, '3'))); - ASSERT_OK(Put("small4", std::string(10, '4'))); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - } - - // Make sure that if we re-open with a small write buffer size that - // we flush table files in the middle of a large log file. - Options options = CurrentOptions(); - options.write_buffer_size = 100000; - Reopen(&options); - ASSERT_EQ(NumTableFilesAtLevel(0), 3); - ASSERT_EQ(std::string(200000, '1'), Get("big1")); - ASSERT_EQ(std::string(200000, '2'), Get("big2")); - ASSERT_EQ(std::string(10, '3'), Get("small3")); - ASSERT_EQ(std::string(10, '4'), Get("small4")); - ASSERT_GT(NumTableFilesAtLevel(0), 1); -} - -TEST(DBTest, CompactionsGenerateMultipleFiles) { - Options options = CurrentOptions(); - options.write_buffer_size = 100000000; // Large write buffer - Reopen(&options); - - Random rnd(301); - - // Write 8MB (80 values, each 100K) - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - std::vector<std::string> values; - for (int i = 0; i < 80; i++) { - values.push_back(RandomString(&rnd, 100000)); - ASSERT_OK(Put(Key(i), values[i])); - } - - // Reopening moves updates to level-0 - Reopen(&options); - dbfull()->TEST_CompactRange(0, NULL, NULL); - - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_GT(NumTableFilesAtLevel(1), 1); - for (int i = 0; i < 80; i++) { - ASSERT_EQ(Get(Key(i)), values[i]); - } -} - -TEST(DBTest, RepeatedWritesToSameKey) { - Options options = CurrentOptions(); - options.env = env_; - options.write_buffer_size = 100000; // Small write buffer - Reopen(&options); - - // We must have at most one file per level except for level-0, - // which may have up to kL0_StopWritesTrigger files. - const int kMaxFiles = config::kNumLevels + config::kL0_StopWritesTrigger; - - Random rnd(301); - std::string value = RandomString(&rnd, 2 * options.write_buffer_size); - for (int i = 0; i < 5 * kMaxFiles; i++) { - Put("key", value); - ASSERT_LE(TotalTableFiles(), kMaxFiles); - fprintf(stderr, "after %d: %d files\n", int(i+1), TotalTableFiles()); - } -} - -TEST(DBTest, SparseMerge) { - Options options = CurrentOptions(); - options.compression = kNoCompression; - Reopen(&options); - - FillLevels("A", "Z"); - - // Suppose there is: - // small amount of data with prefix A - // large amount of data with prefix B - // small amount of data with prefix C - // and that recent updates have made small changes to all three prefixes. - // Check that we do not do a compaction that merges all of B in one shot. - const std::string value(1000, 'x'); - Put("A", "va"); - // Write approximately 100MB of "B" values - for (int i = 0; i < 100000; i++) { - char key[100]; - snprintf(key, sizeof(key), "B%010d", i); - Put(key, value); - } - Put("C", "vc"); - dbfull()->TEST_CompactMemTable(); - dbfull()->TEST_CompactRange(0, NULL, NULL); - - // Make sparse update - Put("A", "va2"); - Put("B100", "bvalue2"); - Put("C", "vc2"); - dbfull()->TEST_CompactMemTable(); - - // Compactions should not cause us to create a situation where - // a file overlaps too much data at the next level. - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); - dbfull()->TEST_CompactRange(0, NULL, NULL); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); - dbfull()->TEST_CompactRange(1, NULL, NULL); - ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576); -} - -static bool Between(uint64_t val, uint64_t low, uint64_t high) { - bool result = (val >= low) && (val <= high); - if (!result) { - fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n", - (unsigned long long)(val), - (unsigned long long)(low), - (unsigned long long)(high)); - } - return result; -} - -TEST(DBTest, ApproximateSizes) { - do { - Options options = CurrentOptions(); - options.write_buffer_size = 100000000; // Large write buffer - options.compression = kNoCompression; - DestroyAndReopen(); - - ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); - Reopen(&options); - ASSERT_TRUE(Between(Size("", "xyz"), 0, 0)); - - // Write 8MB (80 values, each 100K) - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - const int N = 80; - static const int S1 = 100000; - static const int S2 = 105000; // Allow some expansion from metadata - Random rnd(301); - for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, S1))); - } - - // 0 because GetApproximateSizes() does not account for memtable space - ASSERT_TRUE(Between(Size("", Key(50)), 0, 0)); - - // Check sizes across recovery by reopening a few times - for (int run = 0; run < 3; run++) { - Reopen(&options); - - for (int compact_start = 0; compact_start < N; compact_start += 10) { - for (int i = 0; i < N; i += 10) { - ASSERT_TRUE(Between(Size("", Key(i)), S1*i, S2*i)); - ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), S1*(i+1), S2*(i+1))); - ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), S1*10, S2*10)); - } - ASSERT_TRUE(Between(Size("", Key(50)), S1*50, S2*50)); - ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), S1*50, S2*50)); - - std::string cstart_str = Key(compact_start); - std::string cend_str = Key(compact_start + 9); - Slice cstart = cstart_str; - Slice cend = cend_str; - dbfull()->TEST_CompactRange(0, &cstart, &cend); - } - - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_GT(NumTableFilesAtLevel(1), 0); - } - } while (ChangeOptions()); -} - -TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { - do { - Options options = CurrentOptions(); - options.compression = kNoCompression; - Reopen(); - - Random rnd(301); - std::string big1 = RandomString(&rnd, 100000); - ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(2), big1)); - ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(4), big1)); - ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000))); - ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000))); - ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000))); - - // Check sizes across recovery by reopening a few times - for (int run = 0; run < 3; run++) { - Reopen(&options); - - ASSERT_TRUE(Between(Size("", Key(0)), 0, 0)); - ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000)); - ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000)); - ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000)); - ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000)); - ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000)); - ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000)); - ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000)); - ASSERT_TRUE(Between(Size("", Key(8)), 550000, 560000)); - - ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000)); - - dbfull()->TEST_CompactRange(0, NULL, NULL); - } - } while (ChangeOptions()); -} - -TEST(DBTest, IteratorPinsRef) { - Put("foo", "hello"); - - // Get iterator that will yield the current contents of the DB. - Iterator* iter = db_->NewIterator(ReadOptions()); - - // Write to force compactions - Put("foo", "newvalue1"); - for (int i = 0; i < 100; i++) { - ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values - } - Put("foo", "newvalue2"); - - iter->SeekToFirst(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ("foo", iter->key().ToString()); - ASSERT_EQ("hello", iter->value().ToString()); - iter->Next(); - ASSERT_TRUE(!iter->Valid()); - delete iter; -} - -TEST(DBTest, Snapshot) { - do { - Put("foo", "v1"); - const Snapshot* s1 = db_->GetSnapshot(); - Put("foo", "v2"); - const Snapshot* s2 = db_->GetSnapshot(); - Put("foo", "v3"); - const Snapshot* s3 = db_->GetSnapshot(); - - Put("foo", "v4"); - ASSERT_EQ("v1", Get("foo", s1)); - ASSERT_EQ("v2", Get("foo", s2)); - ASSERT_EQ("v3", Get("foo", s3)); - ASSERT_EQ("v4", Get("foo")); - - db_->ReleaseSnapshot(s3); - ASSERT_EQ("v1", Get("foo", s1)); - ASSERT_EQ("v2", Get("foo", s2)); - ASSERT_EQ("v4", Get("foo")); - - db_->ReleaseSnapshot(s1); - ASSERT_EQ("v2", Get("foo", s2)); - ASSERT_EQ("v4", Get("foo")); - - db_->ReleaseSnapshot(s2); - ASSERT_EQ("v4", Get("foo")); - } while (ChangeOptions()); -} - -TEST(DBTest, HiddenValuesAreRemoved) { - do { - Random rnd(301); - FillLevels("a", "z"); - - std::string big = RandomString(&rnd, 50000); - Put("foo", big); - Put("pastfoo", "v"); - const Snapshot* snapshot = db_->GetSnapshot(); - Put("foo", "tiny"); - Put("pastfoo2", "v2"); // Advance sequence number one more - - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - ASSERT_GT(NumTableFilesAtLevel(0), 0); - - ASSERT_EQ(big, Get("foo", snapshot)); - ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000)); - db_->ReleaseSnapshot(snapshot); - ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]"); - Slice x("x"); - dbfull()->TEST_CompactRange(0, NULL, &x); - ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_GE(NumTableFilesAtLevel(1), 1); - dbfull()->TEST_CompactRange(1, NULL, &x); - ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]"); - - ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000)); - } while (ChangeOptions()); -} - -TEST(DBTest, DeletionMarkers1) { - Put("foo", "v1"); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - const int last = config::kMaxMemCompactLevel; - ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level - - // Place a table at level last-1 to prevent merging with preceding mutation - Put("a", "begin"); - Put("z", "end"); - dbfull()->TEST_CompactMemTable(); - ASSERT_EQ(NumTableFilesAtLevel(last), 1); - ASSERT_EQ(NumTableFilesAtLevel(last-1), 1); - - Delete("foo"); - Put("foo", "v2"); - ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2 - ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]"); - Slice z("z"); - dbfull()->TEST_CompactRange(last-2, NULL, &z); - // DEL eliminated, but v1 remains because we aren't compacting that level - // (DEL can be eliminated because v2 hides v1). - ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]"); - dbfull()->TEST_CompactRange(last-1, NULL, NULL); - // Merging last-1 w/ last, so we are the base level for "foo", so - // DEL is removed. (as is v1). - ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]"); -} - -TEST(DBTest, DeletionMarkers2) { - Put("foo", "v1"); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); - const int last = config::kMaxMemCompactLevel; - ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo => v1 is now in last level - - // Place a table at level last-1 to prevent merging with preceding mutation - Put("a", "begin"); - Put("z", "end"); - dbfull()->TEST_CompactMemTable(); - ASSERT_EQ(NumTableFilesAtLevel(last), 1); - ASSERT_EQ(NumTableFilesAtLevel(last-1), 1); - - Delete("foo"); - ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - ASSERT_OK(dbfull()->TEST_CompactMemTable()); // Moves to level last-2 - ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(last-2, NULL, NULL); - // DEL kept: "last" file overlaps - ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]"); - dbfull()->TEST_CompactRange(last-1, NULL, NULL); - // Merging last-1 w/ last, so we are the base level for "foo", so - // DEL is removed. (as is v1). - ASSERT_EQ(AllEntriesFor("foo"), "[ ]"); -} - -TEST(DBTest, OverlapInLevel0) { - do { - ASSERT_EQ(config::kMaxMemCompactLevel, 2) << "Fix test to match config"; - - // Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0. - ASSERT_OK(Put("100", "v100")); - ASSERT_OK(Put("999", "v999")); - dbfull()->TEST_CompactMemTable(); - ASSERT_OK(Delete("100")); - ASSERT_OK(Delete("999")); - dbfull()->TEST_CompactMemTable(); - ASSERT_EQ("0,1,1", FilesPerLevel()); - - // Make files spanning the following ranges in level-0: - // files[0] 200 .. 900 - // files[1] 300 .. 500 - // Note that files are sorted by smallest key. - ASSERT_OK(Put("300", "v300")); - ASSERT_OK(Put("500", "v500")); - dbfull()->TEST_CompactMemTable(); - ASSERT_OK(Put("200", "v200")); - ASSERT_OK(Put("600", "v600")); - ASSERT_OK(Put("900", "v900")); - dbfull()->TEST_CompactMemTable(); - ASSERT_EQ("2,1,1", FilesPerLevel()); - - // Compact away the placeholder files we created initially - dbfull()->TEST_CompactRange(1, NULL, NULL); - dbfull()->TEST_CompactRange(2, NULL, NULL); - ASSERT_EQ("2", FilesPerLevel()); - - // Do a memtable compaction. Before bug-fix, the compaction would - // not detect the overlap with level-0 files and would incorrectly place - // the deletion in a deeper level. - ASSERT_OK(Delete("600")); - dbfull()->TEST_CompactMemTable(); - ASSERT_EQ("3", FilesPerLevel()); - ASSERT_EQ("NOT_FOUND", Get("600")); - } while (ChangeOptions()); -} - -TEST(DBTest, L0_CompactionBug_Issue44_a) { - Reopen(); - ASSERT_OK(Put("b", "v")); - Reopen(); - ASSERT_OK(Delete("b")); - ASSERT_OK(Delete("a")); - Reopen(); - ASSERT_OK(Delete("a")); - Reopen(); - ASSERT_OK(Put("a", "v")); - Reopen(); - Reopen(); - ASSERT_EQ("(a->v)", Contents()); - env_->SleepForMicroseconds(1000000); // Wait for compaction to finish - ASSERT_EQ("(a->v)", Contents()); -} - -TEST(DBTest, L0_CompactionBug_Issue44_b) { - Reopen(); - Put("",""); - Reopen(); - Delete("e"); - Put("",""); - Reopen(); - Put("c", "cv"); - Reopen(); - Put("",""); - Reopen(); - Put("",""); - env_->SleepForMicroseconds(1000000); // Wait for compaction to finish - Reopen(); - Put("d","dv"); - Reopen(); - Put("",""); - Reopen(); - Delete("d"); - Delete("b"); - Reopen(); - ASSERT_EQ("(->)(c->cv)", Contents()); - env_->SleepForMicroseconds(1000000); // Wait for compaction to finish - ASSERT_EQ("(->)(c->cv)", Contents()); -} - -TEST(DBTest, ComparatorCheck) { - class NewComparator : public Comparator { - public: - virtual const char* Name() const { return "leveldb.NewComparator"; } - virtual int Compare(const Slice& a, const Slice& b) const { - return BytewiseComparator()->Compare(a, b); - } - virtual void FindShortestSeparator(std::string* s, const Slice& l) const { - BytewiseComparator()->FindShortestSeparator(s, l); - } - virtual void FindShortSuccessor(std::string* key) const { - BytewiseComparator()->FindShortSuccessor(key); - } - }; - NewComparator cmp; - Options new_options = CurrentOptions(); - new_options.comparator = &cmp; - Status s = TryReopen(&new_options); - ASSERT_TRUE(!s.ok()); - ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos) - << s.ToString(); -} - -TEST(DBTest, CustomComparator) { - class NumberComparator : public Comparator { - public: - virtual const char* Name() const { return "test.NumberComparator"; } - virtual int Compare(const Slice& a, const Slice& b) const { - return ToNumber(a) - ToNumber(b); - } - virtual void FindShortestSeparator(std::string* s, const Slice& l) const { - ToNumber(*s); // Check format - ToNumber(l); // Check format - } - virtual void FindShortSuccessor(std::string* key) const { - ToNumber(*key); // Check format - } - private: - static int ToNumber(const Slice& x) { - // Check that there are no extra characters. - ASSERT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size()-1] == ']') - << EscapeString(x); - int val; - char ignored; - ASSERT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1) - << EscapeString(x); - return val; - } - }; - NumberComparator cmp; - Options new_options = CurrentOptions(); - new_options.create_if_missing = true; - new_options.comparator = &cmp; - new_options.filter_policy = NULL; // Cannot use bloom filters - new_options.write_buffer_size = 1000; // Compact more often - DestroyAndReopen(&new_options); - ASSERT_OK(Put("[10]", "ten")); - ASSERT_OK(Put("[0x14]", "twenty")); - for (int i = 0; i < 2; i++) { - ASSERT_EQ("ten", Get("[10]")); - ASSERT_EQ("ten", Get("[0xa]")); - ASSERT_EQ("twenty", Get("[20]")); - ASSERT_EQ("twenty", Get("[0x14]")); - ASSERT_EQ("NOT_FOUND", Get("[15]")); - ASSERT_EQ("NOT_FOUND", Get("[0xf]")); - Compact("[0]", "[9999]"); - } - - for (int run = 0; run < 2; run++) { - for (int i = 0; i < 1000; i++) { - char buf[100]; - snprintf(buf, sizeof(buf), "[%d]", i*10); - ASSERT_OK(Put(buf, buf)); - } - Compact("[0]", "[1000000]"); - } -} - -TEST(DBTest, ManualCompaction) { - ASSERT_EQ(config::kMaxMemCompactLevel, 2) - << "Need to update this test to match kMaxMemCompactLevel"; - - MakeTables(3, "p", "q"); - ASSERT_EQ("1,1,1", FilesPerLevel()); - - // Compaction range falls before files - Compact("", "c"); - ASSERT_EQ("1,1,1", FilesPerLevel()); - - // Compaction range falls after files - Compact("r", "z"); - ASSERT_EQ("1,1,1", FilesPerLevel()); - - // Compaction range overlaps files - Compact("p1", "p9"); - ASSERT_EQ("0,0,1", FilesPerLevel()); - - // Populate a different range - MakeTables(3, "c", "e"); - ASSERT_EQ("1,1,2", FilesPerLevel()); - - // Compact just the new range - Compact("b", "f"); - ASSERT_EQ("0,0,2", FilesPerLevel()); - - // Compact all - MakeTables(1, "a", "z"); - ASSERT_EQ("0,1,2", FilesPerLevel()); - db_->CompactRange(NULL, NULL); - ASSERT_EQ("0,0,1", FilesPerLevel()); -} - -TEST(DBTest, DBOpen_Options) { - std::string dbname = test::TmpDir() + "/db_options_test"; - DestroyDB(dbname, Options()); - - // Does not exist, and create_if_missing == false: error - DB* db = NULL; - Options opts; - opts.create_if_missing = false; - Status s = DB::Open(opts, dbname, &db); - ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != NULL); - ASSERT_TRUE(db == NULL); - - // Does not exist, and create_if_missing == true: OK - opts.create_if_missing = true; - s = DB::Open(opts, dbname, &db); - ASSERT_OK(s); - ASSERT_TRUE(db != NULL); - - delete db; - db = NULL; - - // Does exist, and error_if_exists == true: error - opts.create_if_missing = false; - opts.error_if_exists = true; - s = DB::Open(opts, dbname, &db); - ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != NULL); - ASSERT_TRUE(db == NULL); - - // Does exist, and error_if_exists == false: OK - opts.create_if_missing = true; - opts.error_if_exists = false; - s = DB::Open(opts, dbname, &db); - ASSERT_OK(s); - ASSERT_TRUE(db != NULL); - - delete db; - db = NULL; -} - -TEST(DBTest, Locking) { - DB* db2 = NULL; - Status s = DB::Open(CurrentOptions(), dbname_, &db2); - ASSERT_TRUE(!s.ok()) << "Locking did not prevent re-opening db"; -} - -// Check that number of files does not grow when we are out of space -TEST(DBTest, NoSpace) { - Options options = CurrentOptions(); - options.env = env_; - Reopen(&options); - - ASSERT_OK(Put("foo", "v1")); - ASSERT_EQ("v1", Get("foo")); - Compact("a", "z"); - const int num_files = CountFiles(); - env_->no_space_.Release_Store(env_); // Force out-of-space errors - env_->sleep_counter_.Reset(); - for (int i = 0; i < 5; i++) { - for (int level = 0; level < config::kNumLevels-1; level++) { - dbfull()->TEST_CompactRange(level, NULL, NULL); - } - } - env_->no_space_.Release_Store(NULL); - ASSERT_LT(CountFiles(), num_files + 3); - - // Check that compaction attempts slept after errors - ASSERT_GE(env_->sleep_counter_.Read(), 5); -} - -TEST(DBTest, NonWritableFileSystem) { - Options options = CurrentOptions(); - options.write_buffer_size = 1000; - options.env = env_; - Reopen(&options); - ASSERT_OK(Put("foo", "v1")); - env_->non_writable_.Release_Store(env_); // Force errors for new files - std::string big(100000, 'x'); - int errors = 0; - for (int i = 0; i < 20; i++) { - fprintf(stderr, "iter %d; errors %d\n", i, errors); - if (!Put("foo", big).ok()) { - errors++; - env_->SleepForMicroseconds(100000); - } - } - ASSERT_GT(errors, 0); - env_->non_writable_.Release_Store(NULL); -} - -TEST(DBTest, ManifestWriteError) { - // Test for the following problem: - // (a) Compaction produces file F - // (b) Log record containing F is written to MANIFEST file, but Sync() fails - // (c) GC deletes F - // (d) After reopening DB, reads fail since deleted F is named in log record - - // We iterate twice. In the second iteration, everything is the - // same except the log record never makes it to the MANIFEST file. - for (int iter = 0; iter < 2; iter++) { - port::AtomicPointer* error_type = (iter == 0) - ? &env_->manifest_sync_error_ - : &env_->manifest_write_error_; - - // Insert foo=>bar mapping - Options options = CurrentOptions(); - options.env = env_; - options.create_if_missing = true; - options.error_if_exists = false; - DestroyAndReopen(&options); - ASSERT_OK(Put("foo", "bar")); - ASSERT_EQ("bar", Get("foo")); - - // Memtable compaction (will succeed) - dbfull()->TEST_CompactMemTable(); - ASSERT_EQ("bar", Get("foo")); - const int last = config::kMaxMemCompactLevel; - ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo=>bar is now in last level - - // Merging compaction (will fail) - error_type->Release_Store(env_); - dbfull()->TEST_CompactRange(last, NULL, NULL); // Should fail - ASSERT_EQ("bar", Get("foo")); - - // Recovery: should not lose data - error_type->Release_Store(NULL); - Reopen(&options); - ASSERT_EQ("bar", Get("foo")); - } -} - -TEST(DBTest, FilesDeletedAfterCompaction) { - ASSERT_OK(Put("foo", "v2")); - Compact("a", "z"); - const int num_files = CountFiles(); - for (int i = 0; i < 10; i++) { - ASSERT_OK(Put("foo", "v2")); - Compact("a", "z"); - } - ASSERT_EQ(CountFiles(), num_files); -} - -TEST(DBTest, BloomFilter) { - env_->count_random_reads_ = true; - Options options = CurrentOptions(); - options.env = env_; - options.block_cache = NewLRUCache(0); // Prevent cache hits - options.filter_policy = NewBloomFilterPolicy(10); - Reopen(&options); - - // Populate multiple layers - const int N = 10000; - for (int i = 0; i < N; i++) { - ASSERT_OK(Put(Key(i), Key(i))); - } - Compact("a", "z"); - for (int i = 0; i < N; i += 100) { - ASSERT_OK(Put(Key(i), Key(i))); - } - dbfull()->TEST_CompactMemTable(); - - // Prevent auto compactions triggered by seeks - env_->delay_sstable_sync_.Release_Store(env_); - - // Lookup present keys. Should rarely read from small sstable. - env_->random_read_counter_.Reset(); - for (int i = 0; i < N; i++) { - ASSERT_EQ(Key(i), Get(Key(i))); - } - int reads = env_->random_read_counter_.Read(); - fprintf(stderr, "%d present => %d reads\n", N, reads); - ASSERT_GE(reads, N); - ASSERT_LE(reads, N + 2*N/100); - - // Lookup present keys. Should rarely read from either sstable. - env_->random_read_counter_.Reset(); - for (int i = 0; i < N; i++) { - ASSERT_EQ("NOT_FOUND", Get(Key(i) + ".missing")); - } - reads = env_->random_read_counter_.Read(); - fprintf(stderr, "%d missing => %d reads\n", N, reads); - ASSERT_LE(reads, 3*N/100); - - env_->delay_sstable_sync_.Release_Store(NULL); - Close(); - delete options.block_cache; - delete options.filter_policy; -} - -// Multi-threaded test: -namespace { - -static const int kNumThreads = 4; -static const int kTestSeconds = 10; -static const int kNumKeys = 1000; - -struct MTState { - DBTest* test; - port::AtomicPointer stop; - port::AtomicPointer counter[kNumThreads]; - port::AtomicPointer thread_done[kNumThreads]; -}; - -struct MTThread { - MTState* state; - int id; -}; - -static void MTThreadBody(void* arg) { - MTThread* t = reinterpret_cast<MTThread*>(arg); - int id = t->id; - DB* db = t->state->test->db_; - uintptr_t counter = 0; - fprintf(stderr, "... starting thread %d\n", id); - Random rnd(1000 + id); - std::string value; - char valbuf[1500]; - while (t->state->stop.Acquire_Load() == NULL) { - t->state->counter[id].Release_Store(reinterpret_cast<void*>(counter)); - - int key = rnd.Uniform(kNumKeys); - char keybuf[20]; - snprintf(keybuf, sizeof(keybuf), "%016d", key); - - if (rnd.OneIn(2)) { - // Write values of the form <key, my id, counter>. - // We add some padding for force compactions. - snprintf(valbuf, sizeof(valbuf), "%d.%d.%-1000d", - key, id, static_cast<int>(counter)); - ASSERT_OK(db->Put(WriteOptions(), Slice(keybuf), Slice(valbuf))); - } else { - // Read a value and verify that it matches the pattern written above. - Status s = db->Get(ReadOptions(), Slice(keybuf), &value); - if (s.IsNotFound()) { - // Key has not yet been written - } else { - // Check that the writer thread counter is >= the counter in the value - ASSERT_OK(s); - int k, w, c; - ASSERT_EQ(3, sscanf(value.c_str(), "%d.%d.%d", &k, &w, &c)) << value; - ASSERT_EQ(k, key); - ASSERT_GE(w, 0); - ASSERT_LT(w, kNumThreads); - ASSERT_LE(c, reinterpret_cast<uintptr_t>( - t->state->counter[w].Acquire_Load())); - } - } - counter++; - } - t->state->thread_done[id].Release_Store(t); - fprintf(stderr, "... stopping thread %d after %d ops\n", id, int(counter)); -} - -} // namespace - -TEST(DBTest, MultiThreaded) { - do { - // Initialize state - MTState mt; - mt.test = this; - mt.stop.Release_Store(0); - for (int id = 0; id < kNumThreads; id++) { - mt.counter[id].Release_Store(0); - mt.thread_done[id].Release_Store(0); - } - - // Start threads - MTThread thread[kNumThreads]; - for (int id = 0; id < kNumThreads; id++) { - thread[id].state = &mt; - thread[id].id = id; - env_->StartThread(MTThreadBody, &thread[id]); - } - - // Let them run for a while - env_->SleepForMicroseconds(kTestSeconds * 1000000); - - // Stop the threads and wait for them to finish - mt.stop.Release_Store(&mt); - for (int id = 0; id < kNumThreads; id++) { - while (mt.thread_done[id].Acquire_Load() == NULL) { - env_->SleepForMicroseconds(100000); - } - } - } while (ChangeOptions()); -} - -namespace { -typedef std::map<std::string, std::string> KVMap; -} - -class ModelDB: public DB { - public: - class ModelSnapshot : public Snapshot { - public: - KVMap map_; - }; - - explicit ModelDB(const Options& options): options_(options) { } - ~ModelDB() { } - virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) { - return DB::Put(o, k, v); - } - virtual Status Delete(const WriteOptions& o, const Slice& key) { - return DB::Delete(o, key); - } - virtual Status Get(const ReadOptions& options, - const Slice& key, std::string* value) { - assert(false); // Not implemented - return Status::NotFound(key); - } - virtual Iterator* NewIterator(const ReadOptions& options) { - if (options.snapshot == NULL) { - KVMap* saved = new KVMap; - *saved = map_; - return new ModelIter(saved, true); - } else { - const KVMap* snapshot_state = - &(reinterpret_cast<const ModelSnapshot*>(options.snapshot)->map_); - return new ModelIter(snapshot_state, false); - } - } - virtual const Snapshot* GetSnapshot() { - ModelSnapshot* snapshot = new ModelSnapshot; - snapshot->map_ = map_; - return snapshot; - } - - virtual void ReleaseSnapshot(const Snapshot* snapshot) { - delete reinterpret_cast<const ModelSnapshot*>(snapshot); - } - virtual Status Write(const WriteOptions& options, WriteBatch* batch) { - class Handler : public WriteBatch::Handler { - public: - KVMap* map_; - virtual void Put(const Slice& key, const Slice& value) { - (*map_)[key.ToString()] = value.ToString(); - } - virtual void Delete(const Slice& key) { - map_->erase(key.ToString()); - } - }; - Handler handler; - handler.map_ = &map_; - return batch->Iterate(&handler); - } - - virtual bool GetProperty(const Slice& property, std::string* value) { - return false; - } - virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) { - for (int i = 0; i < n; i++) { - sizes[i] = 0; - } - } - virtual void CompactRange(const Slice* start, const Slice* end) { - } - - private: - class ModelIter: public Iterator { - public: - ModelIter(const KVMap* map, bool owned) - : map_(map), owned_(owned), iter_(map_->end()) { - } - ~ModelIter() { - if (owned_) delete map_; - } - virtual bool Valid() const { return iter_ != map_->end(); } - virtual void SeekToFirst() { iter_ = map_->begin(); } - virtual void SeekToLast() { - if (map_->empty()) { - iter_ = map_->end(); - } else { - iter_ = map_->find(map_->rbegin()->first); - } - } - virtual void Seek(const Slice& k) { - iter_ = map_->lower_bound(k.ToString()); - } - virtual void Next() { ++iter_; } - virtual void Prev() { --iter_; } - virtual Slice key() const { return iter_->first; } - virtual Slice value() const { return iter_->second; } - virtual Status status() const { return Status::OK(); } - private: - const KVMap* const map_; - const bool owned_; // Do we own map_ - KVMap::const_iterator iter_; - }; - const Options options_; - KVMap map_; -}; - -static std::string RandomKey(Random* rnd) { - int len = (rnd->OneIn(3) - ? 1 // Short sometimes to encourage collisions - : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10))); - return test::RandomKey(rnd, len); -} - -static bool CompareIterators(int step, - DB* model, - DB* db, - const Snapshot* model_snap, - const Snapshot* db_snap) { - ReadOptions options; - options.snapshot = model_snap; - Iterator* miter = model->NewIterator(options); - options.snapshot = db_snap; - Iterator* dbiter = db->NewIterator(options); - bool ok = true; - int count = 0; - for (miter->SeekToFirst(), dbiter->SeekToFirst(); - ok && miter->Valid() && dbiter->Valid(); - miter->Next(), dbiter->Next()) { - count++; - if (miter->key().compare(dbiter->key()) != 0) { - fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n", - step, - EscapeString(miter->key()).c_str(), - EscapeString(dbiter->key()).c_str()); - ok = false; - break; - } - - if (miter->value().compare(dbiter->value()) != 0) { - fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n", - step, - EscapeString(miter->key()).c_str(), - EscapeString(miter->value()).c_str(), - EscapeString(miter->value()).c_str()); - ok = false; - } - } - - if (ok) { - if (miter->Valid() != dbiter->Valid()) { - fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n", - step, miter->Valid(), dbiter->Valid()); - ok = false; - } - } - fprintf(stderr, "%d entries compared: ok=%d\n", count, ok); - delete miter; - delete dbiter; - return ok; -} - -TEST(DBTest, Randomized) { - Random rnd(test::RandomSeed()); - do { - ModelDB model(CurrentOptions()); - const int N = 10000; - const Snapshot* model_snap = NULL; - const Snapshot* db_snap = NULL; - std::string k, v; - for (int step = 0; step < N; step++) { - if (step % 100 == 0) { - fprintf(stderr, "Step %d of %d\n", step, N); - } - // TODO(sanjay): Test Get() works - int p = rnd.Uniform(100); - if (p < 45) { // Put - k = RandomKey(&rnd); - v = RandomString(&rnd, - rnd.OneIn(20) - ? 100 + rnd.Uniform(100) - : rnd.Uniform(8)); - ASSERT_OK(model.Put(WriteOptions(), k, v)); - ASSERT_OK(db_->Put(WriteOptions(), k, v)); - - } else if (p < 90) { // Delete - k = RandomKey(&rnd); - ASSERT_OK(model.Delete(WriteOptions(), k)); - ASSERT_OK(db_->Delete(WriteOptions(), k)); - - - } else { // Multi-element batch - WriteBatch b; - const int num = rnd.Uniform(8); - for (int i = 0; i < num; i++) { - if (i == 0 || !rnd.OneIn(10)) { - k = RandomKey(&rnd); - } else { - // Periodically re-use the same key from the previous iter, so - // we have multiple entries in the write batch for the same key - } - if (rnd.OneIn(2)) { - v = RandomString(&rnd, rnd.Uniform(10)); - b.Put(k, v); - } else { - b.Delete(k); - } - } - ASSERT_OK(model.Write(WriteOptions(), &b)); - ASSERT_OK(db_->Write(WriteOptions(), &b)); - } - - if ((step % 100) == 0) { - ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); - ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap)); - // Save a snapshot from each DB this time that we'll use next - // time we compare things, to make sure the current state is - // preserved with the snapshot - if (model_snap != NULL) model.ReleaseSnapshot(model_snap); - if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); - - Reopen(); - ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL)); - - model_snap = model.GetSnapshot(); - db_snap = db_->GetSnapshot(); - } - } - if (model_snap != NULL) model.ReleaseSnapshot(model_snap); - if (db_snap != NULL) db_->ReleaseSnapshot(db_snap); - } while (ChangeOptions()); -} - -std::string MakeKey(unsigned int num) { - char buf[30]; - snprintf(buf, sizeof(buf), "%016u", num); - return std::string(buf); -} - -void BM_LogAndApply(int iters, int num_base_files) { - std::string dbname = test::TmpDir() + "/leveldb_test_benchmark"; - DestroyDB(dbname, Options()); - - DB* db = NULL; - Options opts; - opts.create_if_missing = true; - Status s = DB::Open(opts, dbname, &db); - ASSERT_OK(s); - ASSERT_TRUE(db != NULL); - - delete db; - db = NULL; - - Env* env = Env::Default(); - - port::Mutex mu; - MutexLock l(&mu); - - InternalKeyComparator cmp(BytewiseComparator()); - Options options; - VersionSet vset(dbname, &options, NULL, &cmp); - ASSERT_OK(vset.Recover()); - VersionEdit vbase; - uint64_t fnum = 1; - for (int i = 0; i < num_base_files; i++) { - InternalKey start(MakeKey(2*fnum), 1, kTypeValue); - InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); - vbase.AddFile(2, fnum++, 1 /* file size */, start, limit); - } - ASSERT_OK(vset.LogAndApply(&vbase, &mu)); - - uint64_t start_micros = env->NowMicros(); - - for (int i = 0; i < iters; i++) { - VersionEdit vedit; - vedit.DeleteFile(2, fnum); - InternalKey start(MakeKey(2*fnum), 1, kTypeValue); - InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); - vedit.AddFile(2, fnum++, 1 /* file size */, start, limit); - vset.LogAndApply(&vedit, &mu); - } - uint64_t stop_micros = env->NowMicros(); - unsigned int us = stop_micros - start_micros; - char buf[16]; - snprintf(buf, sizeof(buf), "%d", num_base_files); - fprintf(stderr, - "BM_LogAndApply/%-6s %8d iters : %9u us (%7.0f us / iter)\n", - buf, iters, us, ((float)us) / iters); -} - -} // namespace leveldb - -int main(int argc, char** argv) { - if (argc > 1 && std::string(argv[1]) == "--benchmark") { - leveldb::BM_LogAndApply(1000, 1); - leveldb::BM_LogAndApply(1000, 100); - leveldb::BM_LogAndApply(1000, 10000); - leveldb::BM_LogAndApply(100, 100000); - return 0; - } - - return leveldb::test::RunAllTests(); -} diff --git a/src/leveldb/db/dbformat.cc b/src/leveldb/db/dbformat.cc deleted file mode 100644 index 28e11b398d..0000000000 --- a/src/leveldb/db/dbformat.cc +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include <stdio.h> -#include "db/dbformat.h" -#include "port/port.h" -#include "util/coding.h" - -namespace leveldb { - -static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) { - assert(seq <= kMaxSequenceNumber); - assert(t <= kValueTypeForSeek); - return (seq << 8) | t; -} - -void AppendInternalKey(std::string* result, const ParsedInternalKey& key) { - result->append(key.user_key.data(), key.user_key.size()); - PutFixed64(result, PackSequenceAndType(key.sequence, key.type)); -} - -std::string ParsedInternalKey::DebugString() const { - char buf[50]; - snprintf(buf, sizeof(buf), "' @ %llu : %d", - (unsigned long long) sequence, - int(type)); - std::string result = "'"; - result += user_key.ToString(); - result += buf; - return result; -} - -std::string InternalKey::DebugString() const { - std::string result; - ParsedInternalKey parsed; - if (ParseInternalKey(rep_, &parsed)) { - result = parsed.DebugString(); - } else { - result = "(bad)"; - result.append(EscapeString(rep_)); - } - return result; -} - -const char* InternalKeyComparator::Name() const { - return "leveldb.InternalKeyComparator"; -} - -int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const { - // Order by: - // increasing user key (according to user-supplied comparator) - // decreasing sequence number - // decreasing type (though sequence# should be enough to disambiguate) - int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey)); - if (r == 0) { - const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8); - const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8); - if (anum > bnum) { - r = -1; - } else if (anum < bnum) { - r = +1; - } - } - return r; -} - -void InternalKeyComparator::FindShortestSeparator( - std::string* start, - const Slice& limit) const { - // Attempt to shorten the user portion of the key - Slice user_start = ExtractUserKey(*start); - Slice user_limit = ExtractUserKey(limit); - std::string tmp(user_start.data(), user_start.size()); - user_comparator_->FindShortestSeparator(&tmp, user_limit); - if (tmp.size() < user_start.size() && - user_comparator_->Compare(user_start, tmp) < 0) { - // User key has become shorter physically, but larger logically. - // Tack on the earliest possible number to the shortened user key. - PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); - assert(this->Compare(*start, tmp) < 0); - assert(this->Compare(tmp, limit) < 0); - start->swap(tmp); - } -} - -void InternalKeyComparator::FindShortSuccessor(std::string* key) const { - Slice user_key = ExtractUserKey(*key); - std::string tmp(user_key.data(), user_key.size()); - user_comparator_->FindShortSuccessor(&tmp); - if (tmp.size() < user_key.size() && - user_comparator_->Compare(user_key, tmp) < 0) { - // User key has become shorter physically, but larger logically. - // Tack on the earliest possible number to the shortened user key. - PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek)); - assert(this->Compare(*key, tmp) < 0); - key->swap(tmp); - } -} - -const char* InternalFilterPolicy::Name() const { - return user_policy_->Name(); -} - -void InternalFilterPolicy::CreateFilter(const Slice* keys, int n, - std::string* dst) const { - // We rely on the fact that the code in table.cc does not mind us - // adjusting keys[]. - Slice* mkey = const_cast<Slice*>(keys); - for (int i = 0; i < n; i++) { - mkey[i] = ExtractUserKey(keys[i]); - // TODO(sanjay): Suppress dups? - } - user_policy_->CreateFilter(keys, n, dst); -} - -bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const { - return user_policy_->KeyMayMatch(ExtractUserKey(key), f); -} - -LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) { - size_t usize = user_key.size(); - size_t needed = usize + 13; // A conservative estimate - char* dst; - if (needed <= sizeof(space_)) { - dst = space_; - } else { - dst = new char[needed]; - } - start_ = dst; - dst = EncodeVarint32(dst, usize + 8); - kstart_ = dst; - memcpy(dst, user_key.data(), usize); - dst += usize; - EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek)); - dst += 8; - end_ = dst; -} - -} // namespace leveldb diff --git a/src/leveldb/db/dbformat.h b/src/leveldb/db/dbformat.h deleted file mode 100644 index f7f64dafb6..0000000000 --- a/src/leveldb/db/dbformat.h +++ /dev/null @@ -1,227 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_FORMAT_H_ -#define STORAGE_LEVELDB_DB_FORMAT_H_ - -#include <stdio.h> -#include "leveldb/comparator.h" -#include "leveldb/db.h" -#include "leveldb/filter_policy.h" -#include "leveldb/slice.h" -#include "leveldb/table_builder.h" -#include "util/coding.h" -#include "util/logging.h" - -namespace leveldb { - -// Grouping of constants. We may want to make some of these -// parameters set via options. -namespace config { -static const int kNumLevels = 7; - -// Level-0 compaction is started when we hit this many files. -static const int kL0_CompactionTrigger = 4; - -// Soft limit on number of level-0 files. We slow down writes at this point. -static const int kL0_SlowdownWritesTrigger = 8; - -// Maximum number of level-0 files. We stop writes at this point. -static const int kL0_StopWritesTrigger = 12; - -// Maximum level to which a new compacted memtable is pushed if it -// does not create overlap. We try to push to level 2 to avoid the -// relatively expensive level 0=>1 compactions and to avoid some -// expensive manifest file operations. We do not push all the way to -// the largest level since that can generate a lot of wasted disk -// space if the same key space is being repeatedly overwritten. -static const int kMaxMemCompactLevel = 2; - -} // namespace config - -class InternalKey; - -// Value types encoded as the last component of internal keys. -// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk -// data structures. -enum ValueType { - kTypeDeletion = 0x0, - kTypeValue = 0x1 -}; -// kValueTypeForSeek defines the ValueType that should be passed when -// constructing a ParsedInternalKey object for seeking to a particular -// sequence number (since we sort sequence numbers in decreasing order -// and the value type is embedded as the low 8 bits in the sequence -// number in internal keys, we need to use the highest-numbered -// ValueType, not the lowest). -static const ValueType kValueTypeForSeek = kTypeValue; - -typedef uint64_t SequenceNumber; - -// We leave eight bits empty at the bottom so a type and sequence# -// can be packed together into 64-bits. -static const SequenceNumber kMaxSequenceNumber = - ((0x1ull << 56) - 1); - -struct ParsedInternalKey { - Slice user_key; - SequenceNumber sequence; - ValueType type; - - ParsedInternalKey() { } // Intentionally left uninitialized (for speed) - ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t) - : user_key(u), sequence(seq), type(t) { } - std::string DebugString() const; -}; - -// Return the length of the encoding of "key". -inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) { - return key.user_key.size() + 8; -} - -// Append the serialization of "key" to *result. -extern void AppendInternalKey(std::string* result, - const ParsedInternalKey& key); - -// Attempt to parse an internal key from "internal_key". On success, -// stores the parsed data in "*result", and returns true. -// -// On error, returns false, leaves "*result" in an undefined state. -extern bool ParseInternalKey(const Slice& internal_key, - ParsedInternalKey* result); - -// Returns the user key portion of an internal key. -inline Slice ExtractUserKey(const Slice& internal_key) { - assert(internal_key.size() >= 8); - return Slice(internal_key.data(), internal_key.size() - 8); -} - -inline ValueType ExtractValueType(const Slice& internal_key) { - assert(internal_key.size() >= 8); - const size_t n = internal_key.size(); - uint64_t num = DecodeFixed64(internal_key.data() + n - 8); - unsigned char c = num & 0xff; - return static_cast<ValueType>(c); -} - -// A comparator for internal keys that uses a specified comparator for -// the user key portion and breaks ties by decreasing sequence number. -class InternalKeyComparator : public Comparator { - private: - const Comparator* user_comparator_; - public: - explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) { } - virtual const char* Name() const; - virtual int Compare(const Slice& a, const Slice& b) const; - virtual void FindShortestSeparator( - std::string* start, - const Slice& limit) const; - virtual void FindShortSuccessor(std::string* key) const; - - const Comparator* user_comparator() const { return user_comparator_; } - - int Compare(const InternalKey& a, const InternalKey& b) const; -}; - -// Filter policy wrapper that converts from internal keys to user keys -class InternalFilterPolicy : public FilterPolicy { - private: - const FilterPolicy* const user_policy_; - public: - explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { } - virtual const char* Name() const; - virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const; - virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const; -}; - -// Modules in this directory should keep internal keys wrapped inside -// the following class instead of plain strings so that we do not -// incorrectly use string comparisons instead of an InternalKeyComparator. -class InternalKey { - private: - std::string rep_; - public: - InternalKey() { } // Leave rep_ as empty to indicate it is invalid - InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) { - AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t)); - } - - void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); } - Slice Encode() const { - assert(!rep_.empty()); - return rep_; - } - - Slice user_key() const { return ExtractUserKey(rep_); } - - void SetFrom(const ParsedInternalKey& p) { - rep_.clear(); - AppendInternalKey(&rep_, p); - } - - void Clear() { rep_.clear(); } - - std::string DebugString() const; -}; - -inline int InternalKeyComparator::Compare( - const InternalKey& a, const InternalKey& b) const { - return Compare(a.Encode(), b.Encode()); -} - -inline bool ParseInternalKey(const Slice& internal_key, - ParsedInternalKey* result) { - const size_t n = internal_key.size(); - if (n < 8) return false; - uint64_t num = DecodeFixed64(internal_key.data() + n - 8); - unsigned char c = num & 0xff; - result->sequence = num >> 8; - result->type = static_cast<ValueType>(c); - result->user_key = Slice(internal_key.data(), n - 8); - return (c <= static_cast<unsigned char>(kTypeValue)); -} - -// A helper class useful for DBImpl::Get() -class LookupKey { - public: - // Initialize *this for looking up user_key at a snapshot with - // the specified sequence number. - LookupKey(const Slice& user_key, SequenceNumber sequence); - - ~LookupKey(); - - // Return a key suitable for lookup in a MemTable. - Slice memtable_key() const { return Slice(start_, end_ - start_); } - - // Return an internal key (suitable for passing to an internal iterator) - Slice internal_key() const { return Slice(kstart_, end_ - kstart_); } - - // Return the user key - Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); } - - private: - // We construct a char array of the form: - // klength varint32 <-- start_ - // userkey char[klength] <-- kstart_ - // tag uint64 - // <-- end_ - // The array is a suitable MemTable key. - // The suffix starting with "userkey" can be used as an InternalKey. - const char* start_; - const char* kstart_; - const char* end_; - char space_[200]; // Avoid allocation for short keys - - // No copying allowed - LookupKey(const LookupKey&); - void operator=(const LookupKey&); -}; - -inline LookupKey::~LookupKey() { - if (start_ != space_) delete[] start_; -} - -} // namespace leveldb - -#endif // STORAGE_LEVELDB_DB_FORMAT_H_ diff --git a/src/leveldb/db/dbformat_test.cc b/src/leveldb/db/dbformat_test.cc deleted file mode 100644 index 5d82f5d313..0000000000 --- a/src/leveldb/db/dbformat_test.cc +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/dbformat.h" -#include "util/logging.h" -#include "util/testharness.h" - -namespace leveldb { - -static std::string IKey(const std::string& user_key, - uint64_t seq, - ValueType vt) { - std::string encoded; - AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt)); - return encoded; -} - -static std::string Shorten(const std::string& s, const std::string& l) { - std::string result = s; - InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l); - return result; -} - -static std::string ShortSuccessor(const std::string& s) { - std::string result = s; - InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result); - return result; -} - -static void TestKey(const std::string& key, - uint64_t seq, - ValueType vt) { - std::string encoded = IKey(key, seq, vt); - - Slice in(encoded); - ParsedInternalKey decoded("", 0, kTypeValue); - - ASSERT_TRUE(ParseInternalKey(in, &decoded)); - ASSERT_EQ(key, decoded.user_key.ToString()); - ASSERT_EQ(seq, decoded.sequence); - ASSERT_EQ(vt, decoded.type); - - ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded)); -} - -class FormatTest { }; - -TEST(FormatTest, InternalKey_EncodeDecode) { - const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" }; - const uint64_t seq[] = { - 1, 2, 3, - (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1, - (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1, - (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1 - }; - for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) { - for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) { - TestKey(keys[k], seq[s], kTypeValue); - TestKey("hello", 1, kTypeDeletion); - } - } -} - -TEST(FormatTest, InternalKeyShortSeparator) { - // When user keys are same - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 99, kTypeValue))); - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 101, kTypeValue))); - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 100, kTypeValue))); - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foo", 100, kTypeDeletion))); - - // When user keys are misordered - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("bar", 99, kTypeValue))); - - // When user keys are different, but correctly ordered - ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), - Shorten(IKey("foo", 100, kTypeValue), - IKey("hello", 200, kTypeValue))); - - // When start user key is prefix of limit user key - ASSERT_EQ(IKey("foo", 100, kTypeValue), - Shorten(IKey("foo", 100, kTypeValue), - IKey("foobar", 200, kTypeValue))); - - // When limit user key is prefix of start user key - ASSERT_EQ(IKey("foobar", 100, kTypeValue), - Shorten(IKey("foobar", 100, kTypeValue), - IKey("foo", 200, kTypeValue))); -} - -TEST(FormatTest, InternalKeyShortestSuccessor) { - ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek), - ShortSuccessor(IKey("foo", 100, kTypeValue))); - ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue), - ShortSuccessor(IKey("\xff\xff", 100, kTypeValue))); -} - -} // namespace leveldb - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/src/leveldb/db/filename.cc b/src/leveldb/db/filename.cc deleted file mode 100644 index 3c4d49f64e..0000000000 --- a/src/leveldb/db/filename.cc +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include <ctype.h> -#include <stdio.h> -#include "db/filename.h" -#include "db/dbformat.h" -#include "leveldb/env.h" -#include "util/logging.h" - -namespace leveldb { - -// A utility routine: write "data" to the named file and Sync() it. -extern Status WriteStringToFileSync(Env* env, const Slice& data, - const std::string& fname); - -static std::string MakeFileName(const std::string& name, uint64_t number, - const char* suffix) { - char buf[100]; - snprintf(buf, sizeof(buf), "/%06llu.%s", - static_cast<unsigned long long>(number), - suffix); - return name + buf; -} - -std::string LogFileName(const std::string& name, uint64_t number) { - assert(number > 0); - return MakeFileName(name, number, "log"); -} - -std::string TableFileName(const std::string& name, uint64_t number) { - assert(number > 0); - return MakeFileName(name, number, "sst"); -} - -std::string DescriptorFileName(const std::string& dbname, uint64_t number) { - assert(number > 0); - char buf[100]; - snprintf(buf, sizeof(buf), "/MANIFEST-%06llu", - static_cast<unsigned long long>(number)); - return dbname + buf; -} - -std::string CurrentFileName(const std::string& dbname) { - return dbname + "/CURRENT"; -} - -std::string LockFileName(const std::string& dbname) { - return dbname + "/LOCK"; -} - -std::string TempFileName(const std::string& dbname, uint64_t number) { - assert(number > 0); - return MakeFileName(dbname, number, "dbtmp"); -} - -std::string InfoLogFileName(const std::string& dbname) { - return dbname + "/LOG"; -} - -// Return the name of the old info log file for "dbname". -std::string OldInfoLogFileName(const std::string& dbname) { - return dbname + "/LOG.old"; -} - - -// Owned filenames have the form: -// dbname/CURRENT -// dbname/LOCK -// dbname/LOG -// dbname/LOG.old -// dbname/MANIFEST-[0-9]+ -// dbname/[0-9]+.(log|sst) -bool ParseFileName(const std::string& fname, - uint64_t* number, - FileType* type) { - Slice rest(fname); - if (rest == "CURRENT") { - *number = 0; - *type = kCurrentFile; - } else if (rest == "LOCK") { - *number = 0; - *type = kDBLockFile; - } else if (rest == "LOG" || rest == "LOG.old") { - *number = 0; - *type = kInfoLogFile; - } else if (rest.starts_with("MANIFEST-")) { - rest.remove_prefix(strlen("MANIFEST-")); - uint64_t num; - if (!ConsumeDecimalNumber(&rest, &num)) { - return false; - } - if (!rest.empty()) { - return false; - } - *type = kDescriptorFile; - *number = num; - } else { - // Avoid strtoull() to keep filename format independent of the - // current locale - uint64_t num; - if (!ConsumeDecimalNumber(&rest, &num)) { - return false; - } - Slice suffix = rest; - if (suffix == Slice(".log")) { - *type = kLogFile; - } else if (suffix == Slice(".sst")) { - *type = kTableFile; - } else if (suffix == Slice(".dbtmp")) { - *type = kTempFile; - } else { - return false; - } - *number = num; - } - return true; -} - -Status SetCurrentFile(Env* env, const std::string& dbname, - uint64_t descriptor_number) { - // Remove leading "dbname/" and add newline to manifest file name - std::string manifest = DescriptorFileName(dbname, descriptor_number); - Slice contents = manifest; - assert(contents.starts_with(dbname + "/")); - contents.remove_prefix(dbname.size() + 1); - std::string tmp = TempFileName(dbname, descriptor_number); - Status s = WriteStringToFileSync(env, contents.ToString() + "\n", tmp); - if (s.ok()) { - s = env->RenameFile(tmp, CurrentFileName(dbname)); - } - if (!s.ok()) { - env->DeleteFile(tmp); - } - return s; -} - -} // namespace leveldb diff --git a/src/leveldb/db/filename.h b/src/leveldb/db/filename.h deleted file mode 100644 index d5d09b1146..0000000000 --- a/src/leveldb/db/filename.h +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// File names used by DB code - -#ifndef STORAGE_LEVELDB_DB_FILENAME_H_ -#define STORAGE_LEVELDB_DB_FILENAME_H_ - -#include <stdint.h> -#include <string> -#include "leveldb/slice.h" -#include "leveldb/status.h" -#include "port/port.h" - -namespace leveldb { - -class Env; - -enum FileType { - kLogFile, - kDBLockFile, - kTableFile, - kDescriptorFile, - kCurrentFile, - kTempFile, - kInfoLogFile // Either the current one, or an old one -}; - -// Return the name of the log file with the specified number -// in the db named by "dbname". The result will be prefixed with -// "dbname". -extern std::string LogFileName(const std::string& dbname, uint64_t number); - -// Return the name of the sstable with the specified number -// in the db named by "dbname". The result will be prefixed with -// "dbname". -extern std::string TableFileName(const std::string& dbname, uint64_t number); - -// Return the name of the descriptor file for the db named by -// "dbname" and the specified incarnation number. The result will be -// prefixed with "dbname". -extern std::string DescriptorFileName(const std::string& dbname, - uint64_t number); - -// Return the name of the current file. This file contains the name -// of the current manifest file. The result will be prefixed with -// "dbname". -extern std::string CurrentFileName(const std::string& dbname); - -// Return the name of the lock file for the db named by -// "dbname". The result will be prefixed with "dbname". -extern std::string LockFileName(const std::string& dbname); - -// Return the name of a temporary file owned by the db named "dbname". -// The result will be prefixed with "dbname". -extern std::string TempFileName(const std::string& dbname, uint64_t number); - -// Return the name of the info log file for "dbname". -extern std::string InfoLogFileName(const std::string& dbname); - -// Return the name of the old info log file for "dbname". -extern std::string OldInfoLogFileName(const std::string& dbname); - -// If filename is a leveldb file, store the type of the file in *type. -// The number encoded in the filename is stored in *number. If the -// filename was successfully parsed, returns true. Else return false. -extern bool ParseFileName(const std::string& filename, - uint64_t* number, - FileType* type); - -// Make the CURRENT file point to the descriptor file with the -// specified number. -extern Status SetCurrentFile(Env* env, const std::string& dbname, - uint64_t descriptor_number); - - -} // namespace leveldb - -#endif // STORAGE_LEVELDB_DB_FILENAME_H_ diff --git a/src/leveldb/db/filename_test.cc b/src/leveldb/db/filename_test.cc deleted file mode 100644 index 47353d6c9a..0000000000 --- a/src/leveldb/db/filename_test.cc +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/filename.h" - -#include "db/dbformat.h" -#include "port/port.h" -#include "util/logging.h" -#include "util/testharness.h" - -namespace leveldb { - -class FileNameTest { }; - -TEST(FileNameTest, Parse) { - Slice db; - FileType type; - uint64_t number; - - // Successful parses - static struct { - const char* fname; - uint64_t number; - FileType type; - } cases[] = { - { "100.log", 100, kLogFile }, - { "0.log", 0, kLogFile }, - { "0.sst", 0, kTableFile }, - { "CURRENT", 0, kCurrentFile }, - { "LOCK", 0, kDBLockFile }, - { "MANIFEST-2", 2, kDescriptorFile }, - { "MANIFEST-7", 7, kDescriptorFile }, - { "LOG", 0, kInfoLogFile }, - { "LOG.old", 0, kInfoLogFile }, - { "18446744073709551615.log", 18446744073709551615ull, kLogFile }, - }; - for (int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) { - std::string f = cases[i].fname; - ASSERT_TRUE(ParseFileName(f, &number, &type)) << f; - ASSERT_EQ(cases[i].type, type) << f; - ASSERT_EQ(cases[i].number, number) << f; - } - - // Errors - static const char* errors[] = { - "", - "foo", - "foo-dx-100.log", - ".log", - "", - "manifest", - "CURREN", - "CURRENTX", - "MANIFES", - "MANIFEST", - "MANIFEST-", - "XMANIFEST-3", - "MANIFEST-3x", - "LOC", - "LOCKx", - "LO", - "LOGx", - "18446744073709551616.log", - "184467440737095516150.log", - "100", - "100.", - "100.lop" - }; - for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) { - std::string f = errors[i]; - ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f; - }; -} - -TEST(FileNameTest, Construction) { - uint64_t number; - FileType type; - std::string fname; - - fname = CurrentFileName("foo"); - ASSERT_EQ("foo/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); - ASSERT_EQ(0, number); - ASSERT_EQ(kCurrentFile, type); - - fname = LockFileName("foo"); - ASSERT_EQ("foo/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); - ASSERT_EQ(0, number); - ASSERT_EQ(kDBLockFile, type); - - fname = LogFileName("foo", 192); - ASSERT_EQ("foo/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); - ASSERT_EQ(192, number); - ASSERT_EQ(kLogFile, type); - - fname = TableFileName("bar", 200); - ASSERT_EQ("bar/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); - ASSERT_EQ(200, number); - ASSERT_EQ(kTableFile, type); - - fname = DescriptorFileName("bar", 100); - ASSERT_EQ("bar/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); - ASSERT_EQ(100, number); - ASSERT_EQ(kDescriptorFile, type); - - fname = TempFileName("tmp", 999); - ASSERT_EQ("tmp/", std::string(fname.data(), 4)); - ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type)); - ASSERT_EQ(999, number); - ASSERT_EQ(kTempFile, type); -} - -} // namespace leveldb - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/src/leveldb/db/leveldb_main.cc b/src/leveldb/db/leveldb_main.cc deleted file mode 100644 index 995d76107a..0000000000 --- a/src/leveldb/db/leveldb_main.cc +++ /dev/null @@ -1,238 +0,0 @@ -// Copyright (c) 2012 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include <stdio.h> -#include "db/dbformat.h" -#include "db/filename.h" -#include "db/log_reader.h" -#include "db/version_edit.h" -#include "db/write_batch_internal.h" -#include "leveldb/env.h" -#include "leveldb/iterator.h" -#include "leveldb/options.h" -#include "leveldb/status.h" -#include "leveldb/table.h" -#include "leveldb/write_batch.h" -#include "util/logging.h" - -namespace leveldb { - -namespace { - -bool GuessType(const std::string& fname, FileType* type) { - size_t pos = fname.rfind('/'); - std::string basename; - if (pos == std::string::npos) { - basename = fname; - } else { - basename = std::string(fname.data() + pos + 1, fname.size() - pos - 1); - } - uint64_t ignored; - return ParseFileName(basename, &ignored, type); -} - -// Notified when log reader encounters corruption. -class CorruptionReporter : public log::Reader::Reporter { - public: - virtual void Corruption(size_t bytes, const Status& status) { - printf("corruption: %d bytes; %s\n", - static_cast<int>(bytes), - status.ToString().c_str()); - } -}; - -// Print contents of a log file. (*func)() is called on every record. -bool PrintLogContents(Env* env, const std::string& fname, - void (*func)(Slice)) { - SequentialFile* file; - Status s = env->NewSequentialFile(fname, &file); - if (!s.ok()) { - fprintf(stderr, "%s\n", s.ToString().c_str()); - return false; - } - CorruptionReporter reporter; - log::Reader reader(file, &reporter, true, 0); - Slice record; - std::string scratch; - while (reader.ReadRecord(&record, &scratch)) { - printf("--- offset %llu; ", - static_cast<unsigned long long>(reader.LastRecordOffset())); - (*func)(record); - } - delete file; - return true; -} - -// Called on every item found in a WriteBatch. -class WriteBatchItemPrinter : public WriteBatch::Handler { - public: - uint64_t offset_; - uint64_t sequence_; - - virtual void Put(const Slice& key, const Slice& value) { - printf(" put '%s' '%s'\n", - EscapeString(key).c_str(), - EscapeString(value).c_str()); - } - virtual void Delete(const Slice& key) { - printf(" del '%s'\n", - EscapeString(key).c_str()); - } -}; - - -// Called on every log record (each one of which is a WriteBatch) -// found in a kLogFile. -static void WriteBatchPrinter(Slice record) { - if (record.size() < 12) { - printf("log record length %d is too small\n", - static_cast<int>(record.size())); - return; - } - WriteBatch batch; - WriteBatchInternal::SetContents(&batch, record); - printf("sequence %llu\n", - static_cast<unsigned long long>(WriteBatchInternal::Sequence(&batch))); - WriteBatchItemPrinter batch_item_printer; - Status s = batch.Iterate(&batch_item_printer); - if (!s.ok()) { - printf(" error: %s\n", s.ToString().c_str()); - } -} - -bool DumpLog(Env* env, const std::string& fname) { - return PrintLogContents(env, fname, WriteBatchPrinter); -} - -// Called on every log record (each one of which is a WriteBatch) -// found in a kDescriptorFile. -static void VersionEditPrinter(Slice record) { - VersionEdit edit; - Status s = edit.DecodeFrom(record); - if (!s.ok()) { - printf("%s\n", s.ToString().c_str()); - return; - } - printf("%s", edit.DebugString().c_str()); -} - -bool DumpDescriptor(Env* env, const std::string& fname) { - return PrintLogContents(env, fname, VersionEditPrinter); -} - -bool DumpTable(Env* env, const std::string& fname) { - uint64_t file_size; - RandomAccessFile* file = NULL; - Table* table = NULL; - Status s = env->GetFileSize(fname, &file_size); - if (s.ok()) { - s = env->NewRandomAccessFile(fname, &file); - } - if (s.ok()) { - // We use the default comparator, which may or may not match the - // comparator used in this database. However this should not cause - // problems since we only use Table operations that do not require - // any comparisons. In particular, we do not call Seek or Prev. - s = Table::Open(Options(), file, file_size, &table); - } - if (!s.ok()) { - fprintf(stderr, "%s\n", s.ToString().c_str()); - delete table; - delete file; - return false; - } - - ReadOptions ro; - ro.fill_cache = false; - Iterator* iter = table->NewIterator(ro); - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ParsedInternalKey key; - if (!ParseInternalKey(iter->key(), &key)) { - printf("badkey '%s' => '%s'\n", - EscapeString(iter->key()).c_str(), - EscapeString(iter->value()).c_str()); - } else { - char kbuf[20]; - const char* type; - if (key.type == kTypeDeletion) { - type = "del"; - } else if (key.type == kTypeValue) { - type = "val"; - } else { - snprintf(kbuf, sizeof(kbuf), "%d", static_cast<int>(key.type)); - type = kbuf; - } - printf("'%s' @ %8llu : %s => '%s'\n", - EscapeString(key.user_key).c_str(), - static_cast<unsigned long long>(key.sequence), - type, - EscapeString(iter->value()).c_str()); - } - } - s = iter->status(); - if (!s.ok()) { - printf("iterator error: %s\n", s.ToString().c_str()); - } - - delete iter; - delete table; - delete file; - return true; -} - -bool DumpFile(Env* env, const std::string& fname) { - FileType ftype; - if (!GuessType(fname, &ftype)) { - fprintf(stderr, "%s: unknown file type\n", fname.c_str()); - return false; - } - switch (ftype) { - case kLogFile: return DumpLog(env, fname); - case kDescriptorFile: return DumpDescriptor(env, fname); - case kTableFile: return DumpTable(env, fname); - - default: { - fprintf(stderr, "%s: not a dump-able file type\n", fname.c_str()); - break; - } - } - return false; -} - -bool HandleDumpCommand(Env* env, char** files, int num) { - bool ok = true; - for (int i = 0; i < num; i++) { - ok &= DumpFile(env, files[i]); - } - return ok; -} - -} -} // namespace leveldb - -static void Usage() { - fprintf( - stderr, - "Usage: leveldbutil command...\n" - " dump files... -- dump contents of specified files\n" - ); -} - -int main(int argc, char** argv) { - leveldb::Env* env = leveldb::Env::Default(); - bool ok = true; - if (argc < 2) { - Usage(); - ok = false; - } else { - std::string command = argv[1]; - if (command == "dump") { - ok = leveldb::HandleDumpCommand(env, argv+2, argc-2); - } else { - Usage(); - ok = false; - } - } - return (ok ? 0 : 1); -} diff --git a/src/leveldb/db/log_format.h b/src/leveldb/db/log_format.h deleted file mode 100644 index 2690cb9789..0000000000 --- a/src/leveldb/db/log_format.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Log format information shared by reader and writer. -// See ../doc/log_format.txt for more detail. - -#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_ -#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_ - -namespace leveldb { -namespace log { - -enum RecordType { - // Zero is reserved for preallocated files - kZeroType = 0, - - kFullType = 1, - - // For fragments - kFirstType = 2, - kMiddleType = 3, - kLastType = 4 -}; -static const int kMaxRecordType = kLastType; - -static const int kBlockSize = 32768; - -// Header is checksum (4 bytes), type (1 byte), length (2 bytes). -static const int kHeaderSize = 4 + 1 + 2; - -} // namespace log -} // namespace leveldb - -#endif // STORAGE_LEVELDB_DB_LOG_FORMAT_H_ diff --git a/src/leveldb/db/log_reader.cc b/src/leveldb/db/log_reader.cc deleted file mode 100644 index b35f115aad..0000000000 --- a/src/leveldb/db/log_reader.cc +++ /dev/null @@ -1,259 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/log_reader.h" - -#include <stdio.h> -#include "leveldb/env.h" -#include "util/coding.h" -#include "util/crc32c.h" - -namespace leveldb { -namespace log { - -Reader::Reporter::~Reporter() { -} - -Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum, - uint64_t initial_offset) - : file_(file), - reporter_(reporter), - checksum_(checksum), - backing_store_(new char[kBlockSize]), - buffer_(), - eof_(false), - last_record_offset_(0), - end_of_buffer_offset_(0), - initial_offset_(initial_offset) { -} - -Reader::~Reader() { - delete[] backing_store_; -} - -bool Reader::SkipToInitialBlock() { - size_t offset_in_block = initial_offset_ % kBlockSize; - uint64_t block_start_location = initial_offset_ - offset_in_block; - - // Don't search a block if we'd be in the trailer - if (offset_in_block > kBlockSize - 6) { - offset_in_block = 0; - block_start_location += kBlockSize; - } - - end_of_buffer_offset_ = block_start_location; - - // Skip to start of first block that can contain the initial record - if (block_start_location > 0) { - Status skip_status = file_->Skip(block_start_location); - if (!skip_status.ok()) { - ReportDrop(block_start_location, skip_status); - return false; - } - } - - return true; -} - -bool Reader::ReadRecord(Slice* record, std::string* scratch) { - if (last_record_offset_ < initial_offset_) { - if (!SkipToInitialBlock()) { - return false; - } - } - - scratch->clear(); - record->clear(); - bool in_fragmented_record = false; - // Record offset of the logical record that we're reading - // 0 is a dummy value to make compilers happy - uint64_t prospective_record_offset = 0; - - Slice fragment; - while (true) { - uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size(); - const unsigned int record_type = ReadPhysicalRecord(&fragment); - switch (record_type) { - case kFullType: - if (in_fragmented_record) { - // Handle bug in earlier versions of log::Writer where - // it could emit an empty kFirstType record at the tail end - // of a block followed by a kFullType or kFirstType record - // at the beginning of the next block. - if (scratch->empty()) { - in_fragmented_record = false; - } else { - ReportCorruption(scratch->size(), "partial record without end(1)"); - } - } - prospective_record_offset = physical_record_offset; - scratch->clear(); - *record = fragment; - last_record_offset_ = prospective_record_offset; - return true; - - case kFirstType: - if (in_fragmented_record) { - // Handle bug in earlier versions of log::Writer where - // it could emit an empty kFirstType record at the tail end - // of a block followed by a kFullType or kFirstType record - // at the beginning of the next block. - if (scratch->empty()) { - in_fragmented_record = false; - } else { - ReportCorruption(scratch->size(), "partial record without end(2)"); - } - } - prospective_record_offset = physical_record_offset; - scratch->assign(fragment.data(), fragment.size()); - in_fragmented_record = true; - break; - - case kMiddleType: - if (!in_fragmented_record) { - ReportCorruption(fragment.size(), - "missing start of fragmented record(1)"); - } else { - scratch->append(fragment.data(), fragment.size()); - } - break; - - case kLastType: - if (!in_fragmented_record) { - ReportCorruption(fragment.size(), - "missing start of fragmented record(2)"); - } else { - scratch->append(fragment.data(), fragment.size()); - *record = Slice(*scratch); - last_record_offset_ = prospective_record_offset; - return true; - } - break; - - case kEof: - if (in_fragmented_record) { - ReportCorruption(scratch->size(), "partial record without end(3)"); - scratch->clear(); - } - return false; - - case kBadRecord: - if (in_fragmented_record) { - ReportCorruption(scratch->size(), "error in middle of record"); - in_fragmented_record = false; - scratch->clear(); - } - break; - - default: { - char buf[40]; - snprintf(buf, sizeof(buf), "unknown record type %u", record_type); - ReportCorruption( - (fragment.size() + (in_fragmented_record ? scratch->size() : 0)), - buf); - in_fragmented_record = false; - scratch->clear(); - break; - } - } - } - return false; -} - -uint64_t Reader::LastRecordOffset() { - return last_record_offset_; -} - -void Reader::ReportCorruption(size_t bytes, const char* reason) { - ReportDrop(bytes, Status::Corruption(reason)); -} - -void Reader::ReportDrop(size_t bytes, const Status& reason) { - if (reporter_ != NULL && - end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) { - reporter_->Corruption(bytes, reason); - } -} - -unsigned int Reader::ReadPhysicalRecord(Slice* result) { - while (true) { - if (buffer_.size() < kHeaderSize) { - if (!eof_) { - // Last read was a full read, so this is a trailer to skip - buffer_.clear(); - Status status = file_->Read(kBlockSize, &buffer_, backing_store_); - end_of_buffer_offset_ += buffer_.size(); - if (!status.ok()) { - buffer_.clear(); - ReportDrop(kBlockSize, status); - eof_ = true; - return kEof; - } else if (buffer_.size() < kBlockSize) { - eof_ = true; - } - continue; - } else if (buffer_.size() == 0) { - // End of file - return kEof; - } else { - size_t drop_size = buffer_.size(); - buffer_.clear(); - ReportCorruption(drop_size, "truncated record at end of file"); - return kEof; - } - } - - // Parse the header - const char* header = buffer_.data(); - const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff; - const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff; - const unsigned int type = header[6]; - const uint32_t length = a | (b << 8); - if (kHeaderSize + length > buffer_.size()) { - size_t drop_size = buffer_.size(); - buffer_.clear(); - ReportCorruption(drop_size, "bad record length"); - return kBadRecord; - } - - if (type == kZeroType && length == 0) { - // Skip zero length record without reporting any drops since - // such records are produced by the mmap based writing code in - // env_posix.cc that preallocates file regions. - buffer_.clear(); - return kBadRecord; - } - - // Check crc - if (checksum_) { - uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header)); - uint32_t actual_crc = crc32c::Value(header + 6, 1 + length); - if (actual_crc != expected_crc) { - // Drop the rest of the buffer since "length" itself may have - // been corrupted and if we trust it, we could find some - // fragment of a real log record that just happens to look - // like a valid log record. - size_t drop_size = buffer_.size(); - buffer_.clear(); - ReportCorruption(drop_size, "checksum mismatch"); - return kBadRecord; - } - } - - buffer_.remove_prefix(kHeaderSize + length); - - // Skip physical record that started before initial_offset_ - if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length < - initial_offset_) { - result->clear(); - return kBadRecord; - } - - *result = Slice(header + kHeaderSize, length); - return type; - } -} - -} // namespace log -} // namespace leveldb diff --git a/src/leveldb/db/log_reader.h b/src/leveldb/db/log_reader.h deleted file mode 100644 index 82d4bee68d..0000000000 --- a/src/leveldb/db/log_reader.h +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_LOG_READER_H_ -#define STORAGE_LEVELDB_DB_LOG_READER_H_ - -#include <stdint.h> - -#include "db/log_format.h" -#include "leveldb/slice.h" -#include "leveldb/status.h" - -namespace leveldb { - -class SequentialFile; - -namespace log { - -class Reader { - public: - // Interface for reporting errors. - class Reporter { - public: - virtual ~Reporter(); - - // Some corruption was detected. "size" is the approximate number - // of bytes dropped due to the corruption. - virtual void Corruption(size_t bytes, const Status& status) = 0; - }; - - // Create a reader that will return log records from "*file". - // "*file" must remain live while this Reader is in use. - // - // If "reporter" is non-NULL, it is notified whenever some data is - // dropped due to a detected corruption. "*reporter" must remain - // live while this Reader is in use. - // - // If "checksum" is true, verify checksums if available. - // - // The Reader will start reading at the first record located at physical - // position >= initial_offset within the file. - Reader(SequentialFile* file, Reporter* reporter, bool checksum, - uint64_t initial_offset); - - ~Reader(); - - // Read the next record into *record. Returns true if read - // successfully, false if we hit end of the input. May use - // "*scratch" as temporary storage. The contents filled in *record - // will only be valid until the next mutating operation on this - // reader or the next mutation to *scratch. - bool ReadRecord(Slice* record, std::string* scratch); - - // Returns the physical offset of the last record returned by ReadRecord. - // - // Undefined before the first call to ReadRecord. - uint64_t LastRecordOffset(); - - private: - SequentialFile* const file_; - Reporter* const reporter_; - bool const checksum_; - char* const backing_store_; - Slice buffer_; - bool eof_; // Last Read() indicated EOF by returning < kBlockSize - - // Offset of the last record returned by ReadRecord. - uint64_t last_record_offset_; - // Offset of the first location past the end of buffer_. - uint64_t end_of_buffer_offset_; - - // Offset at which to start looking for the first record to return - uint64_t const initial_offset_; - - // Extend record types with the following special values - enum { - kEof = kMaxRecordType + 1, - // Returned whenever we find an invalid physical record. - // Currently there are three situations in which this happens: - // * The record has an invalid CRC (ReadPhysicalRecord reports a drop) - // * The record is a 0-length record (No drop is reported) - // * The record is below constructor's initial_offset (No drop is reported) - kBadRecord = kMaxRecordType + 2 - }; - - // Skips all blocks that are completely before "initial_offset_". - // - // Returns true on success. Handles reporting. - bool SkipToInitialBlock(); - - // Return type, or one of the preceding special values - unsigned int ReadPhysicalRecord(Slice* result); - - // Reports dropped bytes to the reporter. - // buffer_ must be updated to remove the dropped bytes prior to invocation. - void ReportCorruption(size_t bytes, const char* reason); - void ReportDrop(size_t bytes, const Status& reason); - - // No copying allowed - Reader(const Reader&); - void operator=(const Reader&); -}; - -} // namespace log -} // namespace leveldb - -#endif // STORAGE_LEVELDB_DB_LOG_READER_H_ diff --git a/src/leveldb/db/log_test.cc b/src/leveldb/db/log_test.cc deleted file mode 100644 index 4c5cf87573..0000000000 --- a/src/leveldb/db/log_test.cc +++ /dev/null @@ -1,500 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/log_reader.h" -#include "db/log_writer.h" -#include "leveldb/env.h" -#include "util/coding.h" -#include "util/crc32c.h" -#include "util/random.h" -#include "util/testharness.h" - -namespace leveldb { -namespace log { - -// Construct a string of the specified length made out of the supplied -// partial string. -static std::string BigString(const std::string& partial_string, size_t n) { - std::string result; - while (result.size() < n) { - result.append(partial_string); - } - result.resize(n); - return result; -} - -// Construct a string from a number -static std::string NumberString(int n) { - char buf[50]; - snprintf(buf, sizeof(buf), "%d.", n); - return std::string(buf); -} - -// Return a skewed potentially long string -static std::string RandomSkewedString(int i, Random* rnd) { - return BigString(NumberString(i), rnd->Skewed(17)); -} - -class LogTest { - private: - class StringDest : public WritableFile { - public: - std::string contents_; - - virtual Status Close() { return Status::OK(); } - virtual Status Flush() { return Status::OK(); } - virtual Status Sync() { return Status::OK(); } - virtual Status Append(const Slice& slice) { - contents_.append(slice.data(), slice.size()); - return Status::OK(); - } - }; - - class StringSource : public SequentialFile { - public: - Slice contents_; - bool force_error_; - bool returned_partial_; - StringSource() : force_error_(false), returned_partial_(false) { } - - virtual Status Read(size_t n, Slice* result, char* scratch) { - ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error"; - - if (force_error_) { - force_error_ = false; - returned_partial_ = true; - return Status::Corruption("read error"); - } - - if (contents_.size() < n) { - n = contents_.size(); - returned_partial_ = true; - } - *result = Slice(contents_.data(), n); - contents_.remove_prefix(n); - return Status::OK(); - } - - virtual Status Skip(uint64_t n) { - if (n > contents_.size()) { - contents_.clear(); - return Status::NotFound("in-memory file skipepd past end"); - } - - contents_.remove_prefix(n); - - return Status::OK(); - } - }; - - class ReportCollector : public Reader::Reporter { - public: - size_t dropped_bytes_; - std::string message_; - - ReportCollector() : dropped_bytes_(0) { } - virtual void Corruption(size_t bytes, const Status& status) { - dropped_bytes_ += bytes; - message_.append(status.ToString()); - } - }; - - StringDest dest_; - StringSource source_; - ReportCollector report_; - bool reading_; - Writer writer_; - Reader reader_; - - // Record metadata for testing initial offset functionality - static size_t initial_offset_record_sizes_[]; - static uint64_t initial_offset_last_record_offsets_[]; - - public: - LogTest() : reading_(false), - writer_(&dest_), - reader_(&source_, &report_, true/*checksum*/, - 0/*initial_offset*/) { - } - - void Write(const std::string& msg) { - ASSERT_TRUE(!reading_) << "Write() after starting to read"; - writer_.AddRecord(Slice(msg)); - } - - size_t WrittenBytes() const { - return dest_.contents_.size(); - } - - std::string Read() { - if (!reading_) { - reading_ = true; - source_.contents_ = Slice(dest_.contents_); - } - std::string scratch; - Slice record; - if (reader_.ReadRecord(&record, &scratch)) { - return record.ToString(); - } else { - return "EOF"; - } - } - - void IncrementByte(int offset, int delta) { - dest_.contents_[offset] += delta; - } - - void SetByte(int offset, char new_byte) { - dest_.contents_[offset] = new_byte; - } - - void ShrinkSize(int bytes) { - dest_.contents_.resize(dest_.contents_.size() - bytes); - } - - void FixChecksum(int header_offset, int len) { - // Compute crc of type/len/data - uint32_t crc = crc32c::Value(&dest_.contents_[header_offset+6], 1 + len); - crc = crc32c::Mask(crc); - EncodeFixed32(&dest_.contents_[header_offset], crc); - } - - void ForceError() { - source_.force_error_ = true; - } - - size_t DroppedBytes() const { - return report_.dropped_bytes_; - } - - std::string ReportMessage() const { - return report_.message_; - } - - // Returns OK iff recorded error message contains "msg" - std::string MatchError(const std::string& msg) const { - if (report_.message_.find(msg) == std::string::npos) { - return report_.message_; - } else { - return "OK"; - } - } - - void WriteInitialOffsetLog() { - for (int i = 0; i < 4; i++) { - std::string record(initial_offset_record_sizes_[i], - static_cast<char>('a' + i)); - Write(record); - } - } - - void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) { - WriteInitialOffsetLog(); - reading_ = true; - source_.contents_ = Slice(dest_.contents_); - Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/, - WrittenBytes() + offset_past_end); - Slice record; - std::string scratch; - ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch)); - delete offset_reader; - } - - void CheckInitialOffsetRecord(uint64_t initial_offset, - int expected_record_offset) { - WriteInitialOffsetLog(); - reading_ = true; - source_.contents_ = Slice(dest_.contents_); - Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/, - initial_offset); - Slice record; - std::string scratch; - ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch)); - ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset], - record.size()); - ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset], - offset_reader->LastRecordOffset()); - ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]); - delete offset_reader; - } - -}; - -size_t LogTest::initial_offset_record_sizes_[] = - {10000, // Two sizable records in first block - 10000, - 2 * log::kBlockSize - 1000, // Span three blocks - 1}; - -uint64_t LogTest::initial_offset_last_record_offsets_[] = - {0, - kHeaderSize + 10000, - 2 * (kHeaderSize + 10000), - 2 * (kHeaderSize + 10000) + - (2 * log::kBlockSize - 1000) + 3 * kHeaderSize}; - - -TEST(LogTest, Empty) { - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, ReadWrite) { - Write("foo"); - Write("bar"); - Write(""); - Write("xxxx"); - ASSERT_EQ("foo", Read()); - ASSERT_EQ("bar", Read()); - ASSERT_EQ("", Read()); - ASSERT_EQ("xxxx", Read()); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ("EOF", Read()); // Make sure reads at eof work -} - -TEST(LogTest, ManyBlocks) { - for (int i = 0; i < 100000; i++) { - Write(NumberString(i)); - } - for (int i = 0; i < 100000; i++) { - ASSERT_EQ(NumberString(i), Read()); - } - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, Fragmentation) { - Write("small"); - Write(BigString("medium", 50000)); - Write(BigString("large", 100000)); - ASSERT_EQ("small", Read()); - ASSERT_EQ(BigString("medium", 50000), Read()); - ASSERT_EQ(BigString("large", 100000), Read()); - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, MarginalTrailer) { - // Make a trailer that is exactly the same length as an empty record. - const int n = kBlockSize - 2*kHeaderSize; - Write(BigString("foo", n)); - ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes()); - Write(""); - Write("bar"); - ASSERT_EQ(BigString("foo", n), Read()); - ASSERT_EQ("", Read()); - ASSERT_EQ("bar", Read()); - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, MarginalTrailer2) { - // Make a trailer that is exactly the same length as an empty record. - const int n = kBlockSize - 2*kHeaderSize; - Write(BigString("foo", n)); - ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes()); - Write("bar"); - ASSERT_EQ(BigString("foo", n), Read()); - ASSERT_EQ("bar", Read()); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(0, DroppedBytes()); - ASSERT_EQ("", ReportMessage()); -} - -TEST(LogTest, ShortTrailer) { - const int n = kBlockSize - 2*kHeaderSize + 4; - Write(BigString("foo", n)); - ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes()); - Write(""); - Write("bar"); - ASSERT_EQ(BigString("foo", n), Read()); - ASSERT_EQ("", Read()); - ASSERT_EQ("bar", Read()); - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, AlignedEof) { - const int n = kBlockSize - 2*kHeaderSize + 4; - Write(BigString("foo", n)); - ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes()); - ASSERT_EQ(BigString("foo", n), Read()); - ASSERT_EQ("EOF", Read()); -} - -TEST(LogTest, RandomRead) { - const int N = 500; - Random write_rnd(301); - for (int i = 0; i < N; i++) { - Write(RandomSkewedString(i, &write_rnd)); - } - Random read_rnd(301); - for (int i = 0; i < N; i++) { - ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read()); - } - ASSERT_EQ("EOF", Read()); -} - -// Tests of all the error paths in log_reader.cc follow: - -TEST(LogTest, ReadError) { - Write("foo"); - ForceError(); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(kBlockSize, DroppedBytes()); - ASSERT_EQ("OK", MatchError("read error")); -} - -TEST(LogTest, BadRecordType) { - Write("foo"); - // Type is stored in header[6] - IncrementByte(6, 100); - FixChecksum(0, 3); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(3, DroppedBytes()); - ASSERT_EQ("OK", MatchError("unknown record type")); -} - -TEST(LogTest, TruncatedTrailingRecord) { - Write("foo"); - ShrinkSize(4); // Drop all payload as well as a header byte - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(kHeaderSize - 1, DroppedBytes()); - ASSERT_EQ("OK", MatchError("truncated record at end of file")); -} - -TEST(LogTest, BadLength) { - Write("foo"); - ShrinkSize(1); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(kHeaderSize + 2, DroppedBytes()); - ASSERT_EQ("OK", MatchError("bad record length")); -} - -TEST(LogTest, ChecksumMismatch) { - Write("foo"); - IncrementByte(0, 10); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(10, DroppedBytes()); - ASSERT_EQ("OK", MatchError("checksum mismatch")); -} - -TEST(LogTest, UnexpectedMiddleType) { - Write("foo"); - SetByte(6, kMiddleType); - FixChecksum(0, 3); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(3, DroppedBytes()); - ASSERT_EQ("OK", MatchError("missing start")); -} - -TEST(LogTest, UnexpectedLastType) { - Write("foo"); - SetByte(6, kLastType); - FixChecksum(0, 3); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(3, DroppedBytes()); - ASSERT_EQ("OK", MatchError("missing start")); -} - -TEST(LogTest, UnexpectedFullType) { - Write("foo"); - Write("bar"); - SetByte(6, kFirstType); - FixChecksum(0, 3); - ASSERT_EQ("bar", Read()); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(3, DroppedBytes()); - ASSERT_EQ("OK", MatchError("partial record without end")); -} - -TEST(LogTest, UnexpectedFirstType) { - Write("foo"); - Write(BigString("bar", 100000)); - SetByte(6, kFirstType); - FixChecksum(0, 3); - ASSERT_EQ(BigString("bar", 100000), Read()); - ASSERT_EQ("EOF", Read()); - ASSERT_EQ(3, DroppedBytes()); - ASSERT_EQ("OK", MatchError("partial record without end")); -} - -TEST(LogTest, ErrorJoinsRecords) { - // Consider two fragmented records: - // first(R1) last(R1) first(R2) last(R2) - // where the middle two fragments disappear. We do not want - // first(R1),last(R2) to get joined and returned as a valid record. - - // Write records that span two blocks - Write(BigString("foo", kBlockSize)); - Write(BigString("bar", kBlockSize)); - Write("correct"); - - // Wipe the middle block - for (int offset = kBlockSize; offset < 2*kBlockSize; offset++) { - SetByte(offset, 'x'); - } - - ASSERT_EQ("correct", Read()); - ASSERT_EQ("EOF", Read()); - const int dropped = DroppedBytes(); - ASSERT_LE(dropped, 2*kBlockSize + 100); - ASSERT_GE(dropped, 2*kBlockSize); -} - -TEST(LogTest, ReadStart) { - CheckInitialOffsetRecord(0, 0); -} - -TEST(LogTest, ReadSecondOneOff) { - CheckInitialOffsetRecord(1, 1); -} - -TEST(LogTest, ReadSecondTenThousand) { - CheckInitialOffsetRecord(10000, 1); -} - -TEST(LogTest, ReadSecondStart) { - CheckInitialOffsetRecord(10007, 1); -} - -TEST(LogTest, ReadThirdOneOff) { - CheckInitialOffsetRecord(10008, 2); -} - -TEST(LogTest, ReadThirdStart) { - CheckInitialOffsetRecord(20014, 2); -} - -TEST(LogTest, ReadFourthOneOff) { - CheckInitialOffsetRecord(20015, 3); -} - -TEST(LogTest, ReadFourthFirstBlockTrailer) { - CheckInitialOffsetRecord(log::kBlockSize - 4, 3); -} - -TEST(LogTest, ReadFourthMiddleBlock) { - CheckInitialOffsetRecord(log::kBlockSize + 1, 3); -} - -TEST(LogTest, ReadFourthLastBlock) { - CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3); -} - -TEST(LogTest, ReadFourthStart) { - CheckInitialOffsetRecord( - 2 * (kHeaderSize + 1000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize, - 3); -} - -TEST(LogTest, ReadEnd) { - CheckOffsetPastEndReturnsNoRecords(0); -} - -TEST(LogTest, ReadPastEnd) { - CheckOffsetPastEndReturnsNoRecords(5); -} - -} // namespace log -} // namespace leveldb - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/src/leveldb/db/log_writer.cc b/src/leveldb/db/log_writer.cc deleted file mode 100644 index 2da99ac088..0000000000 --- a/src/leveldb/db/log_writer.cc +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/log_writer.h" - -#include <stdint.h> -#include "leveldb/env.h" -#include "util/coding.h" -#include "util/crc32c.h" - -namespace leveldb { -namespace log { - -Writer::Writer(WritableFile* dest) - : dest_(dest), - block_offset_(0) { - for (int i = 0; i <= kMaxRecordType; i++) { - char t = static_cast<char>(i); - type_crc_[i] = crc32c::Value(&t, 1); - } -} - -Writer::~Writer() { -} - -Status Writer::AddRecord(const Slice& slice) { - const char* ptr = slice.data(); - size_t left = slice.size(); - - // Fragment the record if necessary and emit it. Note that if slice - // is empty, we still want to iterate once to emit a single - // zero-length record - Status s; - bool begin = true; - do { - const int leftover = kBlockSize - block_offset_; - assert(leftover >= 0); - if (leftover < kHeaderSize) { - // Switch to a new block - if (leftover > 0) { - // Fill the trailer (literal below relies on kHeaderSize being 7) - assert(kHeaderSize == 7); - dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover)); - } - block_offset_ = 0; - } - - // Invariant: we never leave < kHeaderSize bytes in a block. - assert(kBlockSize - block_offset_ - kHeaderSize >= 0); - - const size_t avail = kBlockSize - block_offset_ - kHeaderSize; - const size_t fragment_length = (left < avail) ? left : avail; - - RecordType type; - const bool end = (left == fragment_length); - if (begin && end) { - type = kFullType; - } else if (begin) { - type = kFirstType; - } else if (end) { - type = kLastType; - } else { - type = kMiddleType; - } - - s = EmitPhysicalRecord(type, ptr, fragment_length); - ptr += fragment_length; - left -= fragment_length; - begin = false; - } while (s.ok() && left > 0); - return s; -} - -Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { - assert(n <= 0xffff); // Must fit in two bytes - assert(block_offset_ + kHeaderSize + n <= kBlockSize); - - // Format the header - char buf[kHeaderSize]; - buf[4] = static_cast<char>(n & 0xff); - buf[5] = static_cast<char>(n >> 8); - buf[6] = static_cast<char>(t); - - // Compute the crc of the record type and the payload. - uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n); - crc = crc32c::Mask(crc); // Adjust for storage - EncodeFixed32(buf, crc); - - // Write the header and the payload - Status s = dest_->Append(Slice(buf, kHeaderSize)); - if (s.ok()) { - s = dest_->Append(Slice(ptr, n)); - if (s.ok()) { - s = dest_->Flush(); - } - } - block_offset_ += kHeaderSize + n; - return s; -} - -} // namespace log -} // namespace leveldb diff --git a/src/leveldb/db/log_writer.h b/src/leveldb/db/log_writer.h deleted file mode 100644 index a3a954d967..0000000000 --- a/src/leveldb/db/log_writer.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_LOG_WRITER_H_ -#define STORAGE_LEVELDB_DB_LOG_WRITER_H_ - -#include <stdint.h> -#include "db/log_format.h" -#include "leveldb/slice.h" -#include "leveldb/status.h" - -namespace leveldb { - -class WritableFile; - -namespace log { - -class Writer { - public: - // Create a writer that will append data to "*dest". - // "*dest" must be initially empty. - // "*dest" must remain live while this Writer is in use. - explicit Writer(WritableFile* dest); - ~Writer(); - - Status AddRecord(const Slice& slice); - - private: - WritableFile* dest_; - int block_offset_; // Current offset in block - - // crc32c values for all supported record types. These are - // pre-computed to reduce the overhead of computing the crc of the - // record type stored in the header. - uint32_t type_crc_[kMaxRecordType + 1]; - - Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length); - - // No copying allowed - Writer(const Writer&); - void operator=(const Writer&); -}; - -} // namespace log -} // namespace leveldb - -#endif // STORAGE_LEVELDB_DB_LOG_WRITER_H_ diff --git a/src/leveldb/db/memtable.cc b/src/leveldb/db/memtable.cc deleted file mode 100644 index bfec0a7e7a..0000000000 --- a/src/leveldb/db/memtable.cc +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/memtable.h" -#include "db/dbformat.h" -#include "leveldb/comparator.h" -#include "leveldb/env.h" -#include "leveldb/iterator.h" -#include "util/coding.h" - -namespace leveldb { - -static Slice GetLengthPrefixedSlice(const char* data) { - uint32_t len; - const char* p = data; - p = GetVarint32Ptr(p, p + 5, &len); // +5: we assume "p" is not corrupted - return Slice(p, len); -} - -MemTable::MemTable(const InternalKeyComparator& cmp) - : comparator_(cmp), - refs_(0), - table_(comparator_, &arena_) { -} - -MemTable::~MemTable() { - assert(refs_ == 0); -} - -size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); } - -int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr) - const { - // Internal keys are encoded as length-prefixed strings. - Slice a = GetLengthPrefixedSlice(aptr); - Slice b = GetLengthPrefixedSlice(bptr); - return comparator.Compare(a, b); -} - -// Encode a suitable internal key target for "target" and return it. -// Uses *scratch as scratch space, and the returned pointer will point -// into this scratch space. -static const char* EncodeKey(std::string* scratch, const Slice& target) { - scratch->clear(); - PutVarint32(scratch, target.size()); - scratch->append(target.data(), target.size()); - return scratch->data(); -} - -class MemTableIterator: public Iterator { - public: - explicit MemTableIterator(MemTable::Table* table) : iter_(table) { } - - virtual bool Valid() const { return iter_.Valid(); } - virtual void Seek(const Slice& k) { iter_.Seek(EncodeKey(&tmp_, k)); } - virtual void SeekToFirst() { iter_.SeekToFirst(); } - virtual void SeekToLast() { iter_.SeekToLast(); } - virtual void Next() { iter_.Next(); } - virtual void Prev() { iter_.Prev(); } - virtual Slice key() const { return GetLengthPrefixedSlice(iter_.key()); } - virtual Slice value() const { - Slice key_slice = GetLengthPrefixedSlice(iter_.key()); - return GetLengthPrefixedSlice(key_slice.data() + key_slice.size()); - } - - virtual Status status() const { return Status::OK(); } - - private: - MemTable::Table::Iterator iter_; - std::string tmp_; // For passing to EncodeKey - - // No copying allowed - MemTableIterator(const MemTableIterator&); - void operator=(const MemTableIterator&); -}; - -Iterator* MemTable::NewIterator() { - return new MemTableIterator(&table_); -} - -void MemTable::Add(SequenceNumber s, ValueType type, - const Slice& key, - const Slice& value) { - // Format of an entry is concatenation of: - // key_size : varint32 of internal_key.size() - // key bytes : char[internal_key.size()] - // value_size : varint32 of value.size() - // value bytes : char[value.size()] - size_t key_size = key.size(); - size_t val_size = value.size(); - size_t internal_key_size = key_size + 8; - const size_t encoded_len = - VarintLength(internal_key_size) + internal_key_size + - VarintLength(val_size) + val_size; - char* buf = arena_.Allocate(encoded_len); - char* p = EncodeVarint32(buf, internal_key_size); - memcpy(p, key.data(), key_size); - p += key_size; - EncodeFixed64(p, (s << 8) | type); - p += 8; - p = EncodeVarint32(p, val_size); - memcpy(p, value.data(), val_size); - assert((p + val_size) - buf == encoded_len); - table_.Insert(buf); -} - -bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) { - Slice memkey = key.memtable_key(); - Table::Iterator iter(&table_); - iter.Seek(memkey.data()); - if (iter.Valid()) { - // entry format is: - // klength varint32 - // userkey char[klength] - // tag uint64 - // vlength varint32 - // value char[vlength] - // Check that it belongs to same user key. We do not check the - // sequence number since the Seek() call above should have skipped - // all entries with overly large sequence numbers. - const char* entry = iter.key(); - uint32_t key_length; - const char* key_ptr = GetVarint32Ptr(entry, entry+5, &key_length); - if (comparator_.comparator.user_comparator()->Compare( - Slice(key_ptr, key_length - 8), - key.user_key()) == 0) { - // Correct user key - const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); - switch (static_cast<ValueType>(tag & 0xff)) { - case kTypeValue: { - Slice v = GetLengthPrefixedSlice(key_ptr + key_length); - value->assign(v.data(), v.size()); - return true; - } - case kTypeDeletion: - *s = Status::NotFound(Slice()); - return true; - } - } - } - return false; -} - -} // namespace leveldb diff --git a/src/leveldb/db/memtable.h b/src/leveldb/db/memtable.h deleted file mode 100644 index 92e90bb099..0000000000 --- a/src/leveldb/db/memtable.h +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_MEMTABLE_H_ -#define STORAGE_LEVELDB_DB_MEMTABLE_H_ - -#include <string> -#include "leveldb/db.h" -#include "db/dbformat.h" -#include "db/skiplist.h" -#include "util/arena.h" - -namespace leveldb { - -class InternalKeyComparator; -class Mutex; -class MemTableIterator; - -class MemTable { - public: - // MemTables are reference counted. The initial reference count - // is zero and the caller must call Ref() at least once. - explicit MemTable(const InternalKeyComparator& comparator); - - // Increase reference count. - void Ref() { ++refs_; } - - // Drop reference count. Delete if no more references exist. - void Unref() { - --refs_; - assert(refs_ >= 0); - if (refs_ <= 0) { - delete this; - } - } - - // Returns an estimate of the number of bytes of data in use by this - // data structure. - // - // REQUIRES: external synchronization to prevent simultaneous - // operations on the same MemTable. - size_t ApproximateMemoryUsage(); - - // Return an iterator that yields the contents of the memtable. - // - // The caller must ensure that the underlying MemTable remains live - // while the returned iterator is live. The keys returned by this - // iterator are internal keys encoded by AppendInternalKey in the - // db/format.{h,cc} module. - Iterator* NewIterator(); - - // Add an entry into memtable that maps key to value at the - // specified sequence number and with the specified type. - // Typically value will be empty if type==kTypeDeletion. - void Add(SequenceNumber seq, ValueType type, - const Slice& key, - const Slice& value); - - // If memtable contains a value for key, store it in *value and return true. - // If memtable contains a deletion for key, store a NotFound() error - // in *status and return true. - // Else, return false. - bool Get(const LookupKey& key, std::string* value, Status* s); - - private: - ~MemTable(); // Private since only Unref() should be used to delete it - - struct KeyComparator { - const InternalKeyComparator comparator; - explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { } - int operator()(const char* a, const char* b) const; - }; - friend class MemTableIterator; - friend class MemTableBackwardIterator; - - typedef SkipList<const char*, KeyComparator> Table; - - KeyComparator comparator_; - int refs_; - Arena arena_; - Table table_; - - // No copying allowed - MemTable(const MemTable&); - void operator=(const MemTable&); -}; - -} // namespace leveldb - -#endif // STORAGE_LEVELDB_DB_MEMTABLE_H_ diff --git a/src/leveldb/db/repair.cc b/src/leveldb/db/repair.cc deleted file mode 100644 index 022d52f3de..0000000000 --- a/src/leveldb/db/repair.cc +++ /dev/null @@ -1,389 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// We recover the contents of the descriptor from the other files we find. -// (1) Any log files are first converted to tables -// (2) We scan every table to compute -// (a) smallest/largest for the table -// (b) largest sequence number in the table -// (3) We generate descriptor contents: -// - log number is set to zero -// - next-file-number is set to 1 + largest file number we found -// - last-sequence-number is set to largest sequence# found across -// all tables (see 2c) -// - compaction pointers are cleared -// - every table file is added at level 0 -// -// Possible optimization 1: -// (a) Compute total size and use to pick appropriate max-level M -// (b) Sort tables by largest sequence# in the table -// (c) For each table: if it overlaps earlier table, place in level-0, -// else place in level-M. -// Possible optimization 2: -// Store per-table metadata (smallest, largest, largest-seq#, ...) -// in the table's meta section to speed up ScanTable. - -#include "db/builder.h" -#include "db/db_impl.h" -#include "db/dbformat.h" -#include "db/filename.h" -#include "db/log_reader.h" -#include "db/log_writer.h" -#include "db/memtable.h" -#include "db/table_cache.h" -#include "db/version_edit.h" -#include "db/write_batch_internal.h" -#include "leveldb/comparator.h" -#include "leveldb/db.h" -#include "leveldb/env.h" - -namespace leveldb { - -namespace { - -class Repairer { - public: - Repairer(const std::string& dbname, const Options& options) - : dbname_(dbname), - env_(options.env), - icmp_(options.comparator), - ipolicy_(options.filter_policy), - options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)), - owns_info_log_(options_.info_log != options.info_log), - owns_cache_(options_.block_cache != options.block_cache), - next_file_number_(1) { - // TableCache can be small since we expect each table to be opened once. - table_cache_ = new TableCache(dbname_, &options_, 10); - } - - ~Repairer() { - delete table_cache_; - if (owns_info_log_) { - delete options_.info_log; - } - if (owns_cache_) { - delete options_.block_cache; - } - } - - Status Run() { - Status status = FindFiles(); - if (status.ok()) { - ConvertLogFilesToTables(); - ExtractMetaData(); - status = WriteDescriptor(); - } - if (status.ok()) { - unsigned long long bytes = 0; - for (size_t i = 0; i < tables_.size(); i++) { - bytes += tables_[i].meta.file_size; - } - Log(options_.info_log, - "**** Repaired leveldb %s; " - "recovered %d files; %llu bytes. " - "Some data may have been lost. " - "****", - dbname_.c_str(), - static_cast<int>(tables_.size()), - bytes); - } - return status; - } - - private: - struct TableInfo { - FileMetaData meta; - SequenceNumber max_sequence; - }; - - std::string const dbname_; - Env* const env_; - InternalKeyComparator const icmp_; - InternalFilterPolicy const ipolicy_; - Options const options_; - bool owns_info_log_; - bool owns_cache_; - TableCache* table_cache_; - VersionEdit edit_; - - std::vector<std::string> manifests_; - std::vector<uint64_t> table_numbers_; - std::vector<uint64_t> logs_; - std::vector<TableInfo> tables_; - uint64_t next_file_number_; - - Status FindFiles() { - std::vector<std::string> filenames; - Status status = env_->GetChildren(dbname_, &filenames); - if (!status.ok()) { - return status; - } - if (filenames.empty()) { - return Status::IOError(dbname_, "repair found no files"); - } - - uint64_t number; - FileType type; - for (size_t i = 0; i < filenames.size(); i++) { - if (ParseFileName(filenames[i], &number, &type)) { - if (type == kDescriptorFile) { - manifests_.push_back(filenames[i]); - } else { - if (number + 1 > next_file_number_) { - next_file_number_ = number + 1; - } - if (type == kLogFile) { - logs_.push_back(number); - } else if (type == kTableFile) { - table_numbers_.push_back(number); - } else { - // Ignore other files - } - } - } - } - return status; - } - - void ConvertLogFilesToTables() { - for (size_t i = 0; i < logs_.size(); i++) { - std::string logname = LogFileName(dbname_, logs_[i]); - Status status = ConvertLogToTable(logs_[i]); - if (!status.ok()) { - Log(options_.info_log, "Log #%llu: ignoring conversion error: %s", - (unsigned long long) logs_[i], - status.ToString().c_str()); - } - ArchiveFile(logname); - } - } - - Status ConvertLogToTable(uint64_t log) { - struct LogReporter : public log::Reader::Reporter { - Env* env; - Logger* info_log; - uint64_t lognum; - virtual void Corruption(size_t bytes, const Status& s) { - // We print error messages for corruption, but continue repairing. - Log(info_log, "Log #%llu: dropping %d bytes; %s", - (unsigned long long) lognum, - static_cast<int>(bytes), - s.ToString().c_str()); - } - }; - - // Open the log file - std::string logname = LogFileName(dbname_, log); - SequentialFile* lfile; - Status status = env_->NewSequentialFile(logname, &lfile); - if (!status.ok()) { - return status; - } - - // Create the log reader. - LogReporter reporter; - reporter.env = env_; - reporter.info_log = options_.info_log; - reporter.lognum = log; - // We intentially make log::Reader do checksumming so that - // corruptions cause entire commits to be skipped instead of - // propagating bad information (like overly large sequence - // numbers). - log::Reader reader(lfile, &reporter, false/*do not checksum*/, - 0/*initial_offset*/); - - // Read all the records and add to a memtable - std::string scratch; - Slice record; - WriteBatch batch; - MemTable* mem = new MemTable(icmp_); - mem->Ref(); - int counter = 0; - while (reader.ReadRecord(&record, &scratch)) { - if (record.size() < 12) { - reporter.Corruption( - record.size(), Status::Corruption("log record too small")); - continue; - } - WriteBatchInternal::SetContents(&batch, record); - status = WriteBatchInternal::InsertInto(&batch, mem); - if (status.ok()) { - counter += WriteBatchInternal::Count(&batch); - } else { - Log(options_.info_log, "Log #%llu: ignoring %s", - (unsigned long long) log, - status.ToString().c_str()); - status = Status::OK(); // Keep going with rest of file - } - } - delete lfile; - - // Do not record a version edit for this conversion to a Table - // since ExtractMetaData() will also generate edits. - FileMetaData meta; - meta.number = next_file_number_++; - Iterator* iter = mem->NewIterator(); - status = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta); - delete iter; - mem->Unref(); - mem = NULL; - if (status.ok()) { - if (meta.file_size > 0) { - table_numbers_.push_back(meta.number); - } - } - Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s", - (unsigned long long) log, - counter, - (unsigned long long) meta.number, - status.ToString().c_str()); - return status; - } - - void ExtractMetaData() { - std::vector<TableInfo> kept; - for (size_t i = 0; i < table_numbers_.size(); i++) { - TableInfo t; - t.meta.number = table_numbers_[i]; - Status status = ScanTable(&t); - if (!status.ok()) { - std::string fname = TableFileName(dbname_, table_numbers_[i]); - Log(options_.info_log, "Table #%llu: ignoring %s", - (unsigned long long) table_numbers_[i], - status.ToString().c_str()); - ArchiveFile(fname); - } else { - tables_.push_back(t); - } - } - } - - Status ScanTable(TableInfo* t) { - std::string fname = TableFileName(dbname_, t->meta.number); - int counter = 0; - Status status = env_->GetFileSize(fname, &t->meta.file_size); - if (status.ok()) { - Iterator* iter = table_cache_->NewIterator( - ReadOptions(), t->meta.number, t->meta.file_size); - bool empty = true; - ParsedInternalKey parsed; - t->max_sequence = 0; - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - Slice key = iter->key(); - if (!ParseInternalKey(key, &parsed)) { - Log(options_.info_log, "Table #%llu: unparsable key %s", - (unsigned long long) t->meta.number, - EscapeString(key).c_str()); - continue; - } - - counter++; - if (empty) { - empty = false; - t->meta.smallest.DecodeFrom(key); - } - t->meta.largest.DecodeFrom(key); - if (parsed.sequence > t->max_sequence) { - t->max_sequence = parsed.sequence; - } - } - if (!iter->status().ok()) { - status = iter->status(); - } - delete iter; - } - Log(options_.info_log, "Table #%llu: %d entries %s", - (unsigned long long) t->meta.number, - counter, - status.ToString().c_str()); - return status; - } - - Status WriteDescriptor() { - std::string tmp = TempFileName(dbname_, 1); - WritableFile* file; - Status status = env_->NewWritableFile(tmp, &file); - if (!status.ok()) { - return status; - } - - SequenceNumber max_sequence = 0; - for (size_t i = 0; i < tables_.size(); i++) { - if (max_sequence < tables_[i].max_sequence) { - max_sequence = tables_[i].max_sequence; - } - } - - edit_.SetComparatorName(icmp_.user_comparator()->Name()); - edit_.SetLogNumber(0); - edit_.SetNextFile(next_file_number_); - edit_.SetLastSequence(max_sequence); - - for (size_t i = 0; i < tables_.size(); i++) { - // TODO(opt): separate out into multiple levels - const TableInfo& t = tables_[i]; - edit_.AddFile(0, t.meta.number, t.meta.file_size, - t.meta.smallest, t.meta.largest); - } - - //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str()); - { - log::Writer log(file); - std::string record; - edit_.EncodeTo(&record); - status = log.AddRecord(record); - } - if (status.ok()) { - status = file->Close(); - } - delete file; - file = NULL; - - if (!status.ok()) { - env_->DeleteFile(tmp); - } else { - // Discard older manifests - for (size_t i = 0; i < manifests_.size(); i++) { - ArchiveFile(dbname_ + "/" + manifests_[i]); - } - - // Install new manifest - status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1)); - if (status.ok()) { - status = SetCurrentFile(env_, dbname_, 1); - } else { - env_->DeleteFile(tmp); - } - } - return status; - } - - void ArchiveFile(const std::string& fname) { - // Move into another directory. E.g., for - // dir/foo - // rename to - // dir/lost/foo - const char* slash = strrchr(fname.c_str(), '/'); - std::string new_dir; - if (slash != NULL) { - new_dir.assign(fname.data(), slash - fname.data()); - } - new_dir.append("/lost"); - env_->CreateDir(new_dir); // Ignore error - std::string new_file = new_dir; - new_file.append("/"); - new_file.append((slash == NULL) ? fname.c_str() : slash + 1); - Status s = env_->RenameFile(fname, new_file); - Log(options_.info_log, "Archiving %s: %s\n", - fname.c_str(), s.ToString().c_str()); - } -}; -} // namespace - -Status RepairDB(const std::string& dbname, const Options& options) { - Repairer repairer(dbname, options); - return repairer.Run(); -} - -} // namespace leveldb diff --git a/src/leveldb/db/skiplist.h b/src/leveldb/db/skiplist.h deleted file mode 100644 index af85be6d01..0000000000 --- a/src/leveldb/db/skiplist.h +++ /dev/null @@ -1,379 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Thread safety -// ------------- -// -// Writes require external synchronization, most likely a mutex. -// Reads require a guarantee that the SkipList will not be destroyed -// while the read is in progress. Apart from that, reads progress -// without any internal locking or synchronization. -// -// Invariants: -// -// (1) Allocated nodes are never deleted until the SkipList is -// destroyed. This is trivially guaranteed by the code since we -// never delete any skip list nodes. -// -// (2) The contents of a Node except for the next/prev pointers are -// immutable after the Node has been linked into the SkipList. -// Only Insert() modifies the list, and it is careful to initialize -// a node and use release-stores to publish the nodes in one or -// more lists. -// -// ... prev vs. next pointer ordering ... - -#include <assert.h> -#include <stdlib.h> -#include "port/port.h" -#include "util/arena.h" -#include "util/random.h" - -namespace leveldb { - -class Arena; - -template<typename Key, class Comparator> -class SkipList { - private: - struct Node; - - public: - // Create a new SkipList object that will use "cmp" for comparing keys, - // and will allocate memory using "*arena". Objects allocated in the arena - // must remain allocated for the lifetime of the skiplist object. - explicit SkipList(Comparator cmp, Arena* arena); - - // Insert key into the list. - // REQUIRES: nothing that compares equal to key is currently in the list. - void Insert(const Key& key); - - // Returns true iff an entry that compares equal to key is in the list. - bool Contains(const Key& key) const; - - // Iteration over the contents of a skip list - class Iterator { - public: - // Initialize an iterator over the specified list. - // The returned iterator is not valid. - explicit Iterator(const SkipList* list); - - // Returns true iff the iterator is positioned at a valid node. - bool Valid() const; - - // Returns the key at the current position. - // REQUIRES: Valid() - const Key& key() const; - - // Advances to the next position. - // REQUIRES: Valid() - void Next(); - - // Advances to the previous position. - // REQUIRES: Valid() - void Prev(); - - // Advance to the first entry with a key >= target - void Seek(const Key& target); - - // Position at the first entry in list. - // Final state of iterator is Valid() iff list is not empty. - void SeekToFirst(); - - // Position at the last entry in list. - // Final state of iterator is Valid() iff list is not empty. - void SeekToLast(); - - private: - const SkipList* list_; - Node* node_; - // Intentionally copyable - }; - - private: - enum { kMaxHeight = 12 }; - - // Immutable after construction - Comparator const compare_; - Arena* const arena_; // Arena used for allocations of nodes - - Node* const head_; - - // Modified only by Insert(). Read racily by readers, but stale - // values are ok. - port::AtomicPointer max_height_; // Height of the entire list - - inline int GetMaxHeight() const { - return static_cast<int>( - reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load())); - } - - // Read/written only by Insert(). - Random rnd_; - - Node* NewNode(const Key& key, int height); - int RandomHeight(); - bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); } - - // Return true if key is greater than the data stored in "n" - bool KeyIsAfterNode(const Key& key, Node* n) const; - - // Return the earliest node that comes at or after key. - // Return NULL if there is no such node. - // - // If prev is non-NULL, fills prev[level] with pointer to previous - // node at "level" for every level in [0..max_height_-1]. - Node* FindGreaterOrEqual(const Key& key, Node** prev) const; - - // Return the latest node with a key < key. - // Return head_ if there is no such node. - Node* FindLessThan(const Key& key) const; - - // Return the last node in the list. - // Return head_ if list is empty. - Node* FindLast() const; - - // No copying allowed - SkipList(const SkipList&); - void operator=(const SkipList&); -}; - -// Implementation details follow -template<typename Key, class Comparator> -struct SkipList<Key,Comparator>::Node { - explicit Node(const Key& k) : key(k) { } - - Key const key; - - // Accessors/mutators for links. Wrapped in methods so we can - // add the appropriate barriers as necessary. - Node* Next(int n) { - assert(n >= 0); - // Use an 'acquire load' so that we observe a fully initialized - // version of the returned Node. - return reinterpret_cast<Node*>(next_[n].Acquire_Load()); - } - void SetNext(int n, Node* x) { - assert(n >= 0); - // Use a 'release store' so that anybody who reads through this - // pointer observes a fully initialized version of the inserted node. - next_[n].Release_Store(x); - } - - // No-barrier variants that can be safely used in a few locations. - Node* NoBarrier_Next(int n) { - assert(n >= 0); - return reinterpret_cast<Node*>(next_[n].NoBarrier_Load()); - } - void NoBarrier_SetNext(int n, Node* x) { - assert(n >= 0); - next_[n].NoBarrier_Store(x); - } - - private: - // Array of length equal to the node height. next_[0] is lowest level link. - port::AtomicPointer next_[1]; -}; - -template<typename Key, class Comparator> -typename SkipList<Key,Comparator>::Node* -SkipList<Key,Comparator>::NewNode(const Key& key, int height) { - char* mem = arena_->AllocateAligned( - sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1)); - return new (mem) Node(key); -} - -template<typename Key, class Comparator> -inline SkipList<Key,Comparator>::Iterator::Iterator(const SkipList* list) { - list_ = list; - node_ = NULL; -} - -template<typename Key, class Comparator> -inline bool SkipList<Key,Comparator>::Iterator::Valid() const { - return node_ != NULL; -} - -template<typename Key, class Comparator> -inline const Key& SkipList<Key,Comparator>::Iterator::key() const { - assert(Valid()); - return node_->key; -} - -template<typename Key, class Comparator> -inline void SkipList<Key,Comparator>::Iterator::Next() { - assert(Valid()); - node_ = node_->Next(0); -} - -template<typename Key, class Comparator> -inline void SkipList<Key,Comparator>::Iterator::Prev() { - // Instead of using explicit "prev" links, we just search for the - // last node that falls before key. - assert(Valid()); - node_ = list_->FindLessThan(node_->key); - if (node_ == list_->head_) { - node_ = NULL; - } -} - -template<typename Key, class Comparator> -inline void SkipList<Key,Comparator>::Iterator::Seek(const Key& target) { - node_ = list_->FindGreaterOrEqual(target, NULL); -} - -template<typename Key, class Comparator> -inline void SkipList<Key,Comparator>::Iterator::SeekToFirst() { - node_ = list_->head_->Next(0); -} - -template<typename Key, class Comparator> -inline void SkipList<Key,Comparator>::Iterator::SeekToLast() { - node_ = list_->FindLast(); - if (node_ == list_->head_) { - node_ = NULL; - } -} - -template<typename Key, class Comparator> -int SkipList<Key,Comparator>::RandomHeight() { - // Increase height with probability 1 in kBranching - static const unsigned int kBranching = 4; - int height = 1; - while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) { - height++; - } - assert(height > 0); - assert(height <= kMaxHeight); - return height; -} - -template<typename Key, class Comparator> -bool SkipList<Key,Comparator>::KeyIsAfterNode(const Key& key, Node* n) const { - // NULL n is considered infinite - return (n != NULL) && (compare_(n->key, key) < 0); -} - -template<typename Key, class Comparator> -typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOrEqual(const Key& key, Node** prev) - const { - Node* x = head_; - int level = GetMaxHeight() - 1; - while (true) { - Node* next = x->Next(level); - if (KeyIsAfterNode(key, next)) { - // Keep searching in this list - x = next; - } else { - if (prev != NULL) prev[level] = x; - if (level == 0) { - return next; - } else { - // Switch to next list - level--; - } - } - } -} - -template<typename Key, class Comparator> -typename SkipList<Key,Comparator>::Node* -SkipList<Key,Comparator>::FindLessThan(const Key& key) const { - Node* x = head_; - int level = GetMaxHeight() - 1; - while (true) { - assert(x == head_ || compare_(x->key, key) < 0); - Node* next = x->Next(level); - if (next == NULL || compare_(next->key, key) >= 0) { - if (level == 0) { - return x; - } else { - // Switch to next list - level--; - } - } else { - x = next; - } - } -} - -template<typename Key, class Comparator> -typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindLast() - const { - Node* x = head_; - int level = GetMaxHeight() - 1; - while (true) { - Node* next = x->Next(level); - if (next == NULL) { - if (level == 0) { - return x; - } else { - // Switch to next list - level--; - } - } else { - x = next; - } - } -} - -template<typename Key, class Comparator> -SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena) - : compare_(cmp), - arena_(arena), - head_(NewNode(0 /* any key will do */, kMaxHeight)), - max_height_(reinterpret_cast<void*>(1)), - rnd_(0xdeadbeef) { - for (int i = 0; i < kMaxHeight; i++) { - head_->SetNext(i, NULL); - } -} - -template<typename Key, class Comparator> -void SkipList<Key,Comparator>::Insert(const Key& key) { - // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual() - // here since Insert() is externally synchronized. - Node* prev[kMaxHeight]; - Node* x = FindGreaterOrEqual(key, prev); - - // Our data structure does not allow duplicate insertion - assert(x == NULL || !Equal(key, x->key)); - - int height = RandomHeight(); - if (height > GetMaxHeight()) { - for (int i = GetMaxHeight(); i < height; i++) { - prev[i] = head_; - } - //fprintf(stderr, "Change height from %d to %d\n", max_height_, height); - - // It is ok to mutate max_height_ without any synchronization - // with concurrent readers. A concurrent reader that observes - // the new value of max_height_ will see either the old value of - // new level pointers from head_ (NULL), or a new value set in - // the loop below. In the former case the reader will - // immediately drop to the next level since NULL sorts after all - // keys. In the latter case the reader will use the new node. - max_height_.NoBarrier_Store(reinterpret_cast<void*>(height)); - } - - x = NewNode(key, height); - for (int i = 0; i < height; i++) { - // NoBarrier_SetNext() suffices since we will add a barrier when - // we publish a pointer to "x" in prev[i]. - x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i)); - prev[i]->SetNext(i, x); - } -} - -template<typename Key, class Comparator> -bool SkipList<Key,Comparator>::Contains(const Key& key) const { - Node* x = FindGreaterOrEqual(key, NULL); - if (x != NULL && Equal(key, x->key)) { - return true; - } else { - return false; - } -} - -} // namespace leveldb diff --git a/src/leveldb/db/skiplist_test.cc b/src/leveldb/db/skiplist_test.cc deleted file mode 100644 index c78f4b4fb1..0000000000 --- a/src/leveldb/db/skiplist_test.cc +++ /dev/null @@ -1,378 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/skiplist.h" -#include <set> -#include "leveldb/env.h" -#include "util/arena.h" -#include "util/hash.h" -#include "util/random.h" -#include "util/testharness.h" - -namespace leveldb { - -typedef uint64_t Key; - -struct Comparator { - int operator()(const Key& a, const Key& b) const { - if (a < b) { - return -1; - } else if (a > b) { - return +1; - } else { - return 0; - } - } -}; - -class SkipTest { }; - -TEST(SkipTest, Empty) { - Arena arena; - Comparator cmp; - SkipList<Key, Comparator> list(cmp, &arena); - ASSERT_TRUE(!list.Contains(10)); - - SkipList<Key, Comparator>::Iterator iter(&list); - ASSERT_TRUE(!iter.Valid()); - iter.SeekToFirst(); - ASSERT_TRUE(!iter.Valid()); - iter.Seek(100); - ASSERT_TRUE(!iter.Valid()); - iter.SeekToLast(); - ASSERT_TRUE(!iter.Valid()); -} - -TEST(SkipTest, InsertAndLookup) { - const int N = 2000; - const int R = 5000; - Random rnd(1000); - std::set<Key> keys; - Arena arena; - Comparator cmp; - SkipList<Key, Comparator> list(cmp, &arena); - for (int i = 0; i < N; i++) { - Key key = rnd.Next() % R; - if (keys.insert(key).second) { - list.Insert(key); - } - } - - for (int i = 0; i < R; i++) { - if (list.Contains(i)) { - ASSERT_EQ(keys.count(i), 1); - } else { - ASSERT_EQ(keys.count(i), 0); - } - } - - // Simple iterator tests - { - SkipList<Key, Comparator>::Iterator iter(&list); - ASSERT_TRUE(!iter.Valid()); - - iter.Seek(0); - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*(keys.begin()), iter.key()); - - iter.SeekToFirst(); - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*(keys.begin()), iter.key()); - - iter.SeekToLast(); - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*(keys.rbegin()), iter.key()); - } - - // Forward iteration test - for (int i = 0; i < R; i++) { - SkipList<Key, Comparator>::Iterator iter(&list); - iter.Seek(i); - - // Compare against model iterator - std::set<Key>::iterator model_iter = keys.lower_bound(i); - for (int j = 0; j < 3; j++) { - if (model_iter == keys.end()) { - ASSERT_TRUE(!iter.Valid()); - break; - } else { - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*model_iter, iter.key()); - ++model_iter; - iter.Next(); - } - } - } - - // Backward iteration test - { - SkipList<Key, Comparator>::Iterator iter(&list); - iter.SeekToLast(); - - // Compare against model iterator - for (std::set<Key>::reverse_iterator model_iter = keys.rbegin(); - model_iter != keys.rend(); - ++model_iter) { - ASSERT_TRUE(iter.Valid()); - ASSERT_EQ(*model_iter, iter.key()); - iter.Prev(); - } - ASSERT_TRUE(!iter.Valid()); - } -} - -// We want to make sure that with a single writer and multiple -// concurrent readers (with no synchronization other than when a -// reader's iterator is created), the reader always observes all the -// data that was present in the skip list when the iterator was -// constructor. Because insertions are happening concurrently, we may -// also observe new values that were inserted since the iterator was -// constructed, but we should never miss any values that were present -// at iterator construction time. -// -// We generate multi-part keys: -// <key,gen,hash> -// where: -// key is in range [0..K-1] -// gen is a generation number for key -// hash is hash(key,gen) -// -// The insertion code picks a random key, sets gen to be 1 + the last -// generation number inserted for that key, and sets hash to Hash(key,gen). -// -// At the beginning of a read, we snapshot the last inserted -// generation number for each key. We then iterate, including random -// calls to Next() and Seek(). For every key we encounter, we -// check that it is either expected given the initial snapshot or has -// been concurrently added since the iterator started. -class ConcurrentTest { - private: - static const uint32_t K = 4; - - static uint64_t key(Key key) { return (key >> 40); } - static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; } - static uint64_t hash(Key key) { return key & 0xff; } - - static uint64_t HashNumbers(uint64_t k, uint64_t g) { - uint64_t data[2] = { k, g }; - return Hash(reinterpret_cast<char*>(data), sizeof(data), 0); - } - - static Key MakeKey(uint64_t k, uint64_t g) { - assert(sizeof(Key) == sizeof(uint64_t)); - assert(k <= K); // We sometimes pass K to seek to the end of the skiplist - assert(g <= 0xffffffffu); - return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff)); - } - - static bool IsValidKey(Key k) { - return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff); - } - - static Key RandomTarget(Random* rnd) { - switch (rnd->Next() % 10) { - case 0: - // Seek to beginning - return MakeKey(0, 0); - case 1: - // Seek to end - return MakeKey(K, 0); - default: - // Seek to middle - return MakeKey(rnd->Next() % K, 0); - } - } - - // Per-key generation - struct State { - port::AtomicPointer generation[K]; - void Set(int k, intptr_t v) { - generation[k].Release_Store(reinterpret_cast<void*>(v)); - } - intptr_t Get(int k) { - return reinterpret_cast<intptr_t>(generation[k].Acquire_Load()); - } - - State() { - for (int k = 0; k < K; k++) { - Set(k, 0); - } - } - }; - - // Current state of the test - State current_; - - Arena arena_; - - // SkipList is not protected by mu_. We just use a single writer - // thread to modify it. - SkipList<Key, Comparator> list_; - - public: - ConcurrentTest() : list_(Comparator(), &arena_) { } - - // REQUIRES: External synchronization - void WriteStep(Random* rnd) { - const uint32_t k = rnd->Next() % K; - const intptr_t g = current_.Get(k) + 1; - const Key key = MakeKey(k, g); - list_.Insert(key); - current_.Set(k, g); - } - - void ReadStep(Random* rnd) { - // Remember the initial committed state of the skiplist. - State initial_state; - for (int k = 0; k < K; k++) { - initial_state.Set(k, current_.Get(k)); - } - - Key pos = RandomTarget(rnd); - SkipList<Key, Comparator>::Iterator iter(&list_); - iter.Seek(pos); - while (true) { - Key current; - if (!iter.Valid()) { - current = MakeKey(K, 0); - } else { - current = iter.key(); - ASSERT_TRUE(IsValidKey(current)) << current; - } - ASSERT_LE(pos, current) << "should not go backwards"; - - // Verify that everything in [pos,current) was not present in - // initial_state. - while (pos < current) { - ASSERT_LT(key(pos), K) << pos; - - // Note that generation 0 is never inserted, so it is ok if - // <*,0,*> is missing. - ASSERT_TRUE((gen(pos) == 0) || - (gen(pos) > initial_state.Get(key(pos))) - ) << "key: " << key(pos) - << "; gen: " << gen(pos) - << "; initgen: " - << initial_state.Get(key(pos)); - - // Advance to next key in the valid key space - if (key(pos) < key(current)) { - pos = MakeKey(key(pos) + 1, 0); - } else { - pos = MakeKey(key(pos), gen(pos) + 1); - } - } - - if (!iter.Valid()) { - break; - } - - if (rnd->Next() % 2) { - iter.Next(); - pos = MakeKey(key(pos), gen(pos) + 1); - } else { - Key new_target = RandomTarget(rnd); - if (new_target > pos) { - pos = new_target; - iter.Seek(new_target); - } - } - } - } -}; -const uint32_t ConcurrentTest::K; - -// Simple test that does single-threaded testing of the ConcurrentTest -// scaffolding. -TEST(SkipTest, ConcurrentWithoutThreads) { - ConcurrentTest test; - Random rnd(test::RandomSeed()); - for (int i = 0; i < 10000; i++) { - test.ReadStep(&rnd); - test.WriteStep(&rnd); - } -} - -class TestState { - public: - ConcurrentTest t_; - int seed_; - port::AtomicPointer quit_flag_; - - enum ReaderState { - STARTING, - RUNNING, - DONE - }; - - explicit TestState(int s) - : seed_(s), - quit_flag_(NULL), - state_(STARTING), - state_cv_(&mu_) {} - - void Wait(ReaderState s) { - mu_.Lock(); - while (state_ != s) { - state_cv_.Wait(); - } - mu_.Unlock(); - } - - void Change(ReaderState s) { - mu_.Lock(); - state_ = s; - state_cv_.Signal(); - mu_.Unlock(); - } - - private: - port::Mutex mu_; - ReaderState state_; - port::CondVar state_cv_; -}; - -static void ConcurrentReader(void* arg) { - TestState* state = reinterpret_cast<TestState*>(arg); - Random rnd(state->seed_); - int64_t reads = 0; - state->Change(TestState::RUNNING); - while (!state->quit_flag_.Acquire_Load()) { - state->t_.ReadStep(&rnd); - ++reads; - } - state->Change(TestState::DONE); -} - -static void RunConcurrent(int run) { - const int seed = test::RandomSeed() + (run * 100); - Random rnd(seed); - const int N = 1000; - const int kSize = 1000; - for (int i = 0; i < N; i++) { - if ((i % 100) == 0) { - fprintf(stderr, "Run %d of %d\n", i, N); - } - TestState state(seed + 1); - Env::Default()->Schedule(ConcurrentReader, &state); - state.Wait(TestState::RUNNING); - for (int i = 0; i < kSize; i++) { - state.t_.WriteStep(&rnd); - } - state.quit_flag_.Release_Store(&state); // Any non-NULL arg will do - state.Wait(TestState::DONE); - } -} - -TEST(SkipTest, Concurrent1) { RunConcurrent(1); } -TEST(SkipTest, Concurrent2) { RunConcurrent(2); } -TEST(SkipTest, Concurrent3) { RunConcurrent(3); } -TEST(SkipTest, Concurrent4) { RunConcurrent(4); } -TEST(SkipTest, Concurrent5) { RunConcurrent(5); } - -} // namespace leveldb - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/src/leveldb/db/snapshot.h b/src/leveldb/db/snapshot.h deleted file mode 100644 index e7f8fd2c37..0000000000 --- a/src/leveldb/db/snapshot.h +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_ -#define STORAGE_LEVELDB_DB_SNAPSHOT_H_ - -#include "leveldb/db.h" - -namespace leveldb { - -class SnapshotList; - -// Snapshots are kept in a doubly-linked list in the DB. -// Each SnapshotImpl corresponds to a particular sequence number. -class SnapshotImpl : public Snapshot { - public: - SequenceNumber number_; // const after creation - - private: - friend class SnapshotList; - - // SnapshotImpl is kept in a doubly-linked circular list - SnapshotImpl* prev_; - SnapshotImpl* next_; - - SnapshotList* list_; // just for sanity checks -}; - -class SnapshotList { - public: - SnapshotList() { - list_.prev_ = &list_; - list_.next_ = &list_; - } - - bool empty() const { return list_.next_ == &list_; } - SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; } - SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; } - - const SnapshotImpl* New(SequenceNumber seq) { - SnapshotImpl* s = new SnapshotImpl; - s->number_ = seq; - s->list_ = this; - s->next_ = &list_; - s->prev_ = list_.prev_; - s->prev_->next_ = s; - s->next_->prev_ = s; - return s; - } - - void Delete(const SnapshotImpl* s) { - assert(s->list_ == this); - s->prev_->next_ = s->next_; - s->next_->prev_ = s->prev_; - delete s; - } - - private: - // Dummy head of doubly-linked list of snapshots - SnapshotImpl list_; -}; - -} // namespace leveldb - -#endif // STORAGE_LEVELDB_DB_SNAPSHOT_H_ diff --git a/src/leveldb/db/table_cache.cc b/src/leveldb/db/table_cache.cc deleted file mode 100644 index 497db27076..0000000000 --- a/src/leveldb/db/table_cache.cc +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/table_cache.h" - -#include "db/filename.h" -#include "leveldb/env.h" -#include "leveldb/table.h" -#include "util/coding.h" - -namespace leveldb { - -struct TableAndFile { - RandomAccessFile* file; - Table* table; -}; - -static void DeleteEntry(const Slice& key, void* value) { - TableAndFile* tf = reinterpret_cast<TableAndFile*>(value); - delete tf->table; - delete tf->file; - delete tf; -} - -static void UnrefEntry(void* arg1, void* arg2) { - Cache* cache = reinterpret_cast<Cache*>(arg1); - Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2); - cache->Release(h); -} - -TableCache::TableCache(const std::string& dbname, - const Options* options, - int entries) - : env_(options->env), - dbname_(dbname), - options_(options), - cache_(NewLRUCache(entries)) { -} - -TableCache::~TableCache() { - delete cache_; -} - -Status TableCache::FindTable(uint64_t file_number, uint64_t file_size, - Cache::Handle** handle) { - Status s; - char buf[sizeof(file_number)]; - EncodeFixed64(buf, file_number); - Slice key(buf, sizeof(buf)); - *handle = cache_->Lookup(key); - if (*handle == NULL) { - std::string fname = TableFileName(dbname_, file_number); - RandomAccessFile* file = NULL; - Table* table = NULL; - s = env_->NewRandomAccessFile(fname, &file); - if (s.ok()) { - s = Table::Open(*options_, file, file_size, &table); - } - - if (!s.ok()) { - assert(table == NULL); - delete file; - // We do not cache error results so that if the error is transient, - // or somebody repairs the file, we recover automatically. - } else { - TableAndFile* tf = new TableAndFile; - tf->file = file; - tf->table = table; - *handle = cache_->Insert(key, tf, 1, &DeleteEntry); - } - } - return s; -} - -Iterator* TableCache::NewIterator(const ReadOptions& options, - uint64_t file_number, - uint64_t file_size, - Table** tableptr) { - if (tableptr != NULL) { - *tableptr = NULL; - } - - Cache::Handle* handle = NULL; - Status s = FindTable(file_number, file_size, &handle); - if (!s.ok()) { - return NewErrorIterator(s); - } - - Table* table = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table; - Iterator* result = table->NewIterator(options); - result->RegisterCleanup(&UnrefEntry, cache_, handle); - if (tableptr != NULL) { - *tableptr = table; - } - return result; -} - -Status TableCache::Get(const ReadOptions& options, - uint64_t file_number, - uint64_t file_size, - const Slice& k, - void* arg, - void (*saver)(void*, const Slice&, const Slice&)) { - Cache::Handle* handle = NULL; - Status s = FindTable(file_number, file_size, &handle); - if (s.ok()) { - Table* t = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table; - s = t->InternalGet(options, k, arg, saver); - cache_->Release(handle); - } - return s; -} - -void TableCache::Evict(uint64_t file_number) { - char buf[sizeof(file_number)]; - EncodeFixed64(buf, file_number); - cache_->Erase(Slice(buf, sizeof(buf))); -} - -} // namespace leveldb diff --git a/src/leveldb/db/table_cache.h b/src/leveldb/db/table_cache.h deleted file mode 100644 index 8cf4aaf12d..0000000000 --- a/src/leveldb/db/table_cache.h +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// Thread-safe (provides internal synchronization) - -#ifndef STORAGE_LEVELDB_DB_TABLE_CACHE_H_ -#define STORAGE_LEVELDB_DB_TABLE_CACHE_H_ - -#include <string> -#include <stdint.h> -#include "db/dbformat.h" -#include "leveldb/cache.h" -#include "leveldb/table.h" -#include "port/port.h" - -namespace leveldb { - -class Env; - -class TableCache { - public: - TableCache(const std::string& dbname, const Options* options, int entries); - ~TableCache(); - - // Return an iterator for the specified file number (the corresponding - // file length must be exactly "file_size" bytes). If "tableptr" is - // non-NULL, also sets "*tableptr" to point to the Table object - // underlying the returned iterator, or NULL if no Table object underlies - // the returned iterator. The returned "*tableptr" object is owned by - // the cache and should not be deleted, and is valid for as long as the - // returned iterator is live. - Iterator* NewIterator(const ReadOptions& options, - uint64_t file_number, - uint64_t file_size, - Table** tableptr = NULL); - - // If a seek to internal key "k" in specified file finds an entry, - // call (*handle_result)(arg, found_key, found_value). - Status Get(const ReadOptions& options, - uint64_t file_number, - uint64_t file_size, - const Slice& k, - void* arg, - void (*handle_result)(void*, const Slice&, const Slice&)); - - // Evict any entry for the specified file number - void Evict(uint64_t file_number); - - private: - Env* const env_; - const std::string dbname_; - const Options* options_; - Cache* cache_; - - Status FindTable(uint64_t file_number, uint64_t file_size, Cache::Handle**); -}; - -} // namespace leveldb - -#endif // STORAGE_LEVELDB_DB_TABLE_CACHE_H_ diff --git a/src/leveldb/db/version_edit.cc b/src/leveldb/db/version_edit.cc deleted file mode 100644 index f10a2d58b2..0000000000 --- a/src/leveldb/db/version_edit.cc +++ /dev/null @@ -1,266 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/version_edit.h" - -#include "db/version_set.h" -#include "util/coding.h" - -namespace leveldb { - -// Tag numbers for serialized VersionEdit. These numbers are written to -// disk and should not be changed. -enum Tag { - kComparator = 1, - kLogNumber = 2, - kNextFileNumber = 3, - kLastSequence = 4, - kCompactPointer = 5, - kDeletedFile = 6, - kNewFile = 7, - // 8 was used for large value refs - kPrevLogNumber = 9 -}; - -void VersionEdit::Clear() { - comparator_.clear(); - log_number_ = 0; - prev_log_number_ = 0; - last_sequence_ = 0; - next_file_number_ = 0; - has_comparator_ = false; - has_log_number_ = false; - has_prev_log_number_ = false; - has_next_file_number_ = false; - has_last_sequence_ = false; - deleted_files_.clear(); - new_files_.clear(); -} - -void VersionEdit::EncodeTo(std::string* dst) const { - if (has_comparator_) { - PutVarint32(dst, kComparator); - PutLengthPrefixedSlice(dst, comparator_); - } - if (has_log_number_) { - PutVarint32(dst, kLogNumber); - PutVarint64(dst, log_number_); - } - if (has_prev_log_number_) { - PutVarint32(dst, kPrevLogNumber); - PutVarint64(dst, prev_log_number_); - } - if (has_next_file_number_) { - PutVarint32(dst, kNextFileNumber); - PutVarint64(dst, next_file_number_); - } - if (has_last_sequence_) { - PutVarint32(dst, kLastSequence); - PutVarint64(dst, last_sequence_); - } - - for (size_t i = 0; i < compact_pointers_.size(); i++) { - PutVarint32(dst, kCompactPointer); - PutVarint32(dst, compact_pointers_[i].first); // level - PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode()); - } - - for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); - iter != deleted_files_.end(); - ++iter) { - PutVarint32(dst, kDeletedFile); - PutVarint32(dst, iter->first); // level - PutVarint64(dst, iter->second); // file number - } - - for (size_t i = 0; i < new_files_.size(); i++) { - const FileMetaData& f = new_files_[i].second; - PutVarint32(dst, kNewFile); - PutVarint32(dst, new_files_[i].first); // level - PutVarint64(dst, f.number); - PutVarint64(dst, f.file_size); - PutLengthPrefixedSlice(dst, f.smallest.Encode()); - PutLengthPrefixedSlice(dst, f.largest.Encode()); - } -} - -static bool GetInternalKey(Slice* input, InternalKey* dst) { - Slice str; - if (GetLengthPrefixedSlice(input, &str)) { - dst->DecodeFrom(str); - return true; - } else { - return false; - } -} - -static bool GetLevel(Slice* input, int* level) { - uint32_t v; - if (GetVarint32(input, &v) && - v < config::kNumLevels) { - *level = v; - return true; - } else { - return false; - } -} - -Status VersionEdit::DecodeFrom(const Slice& src) { - Clear(); - Slice input = src; - const char* msg = NULL; - uint32_t tag; - - // Temporary storage for parsing - int level; - uint64_t number; - FileMetaData f; - Slice str; - InternalKey key; - - while (msg == NULL && GetVarint32(&input, &tag)) { - switch (tag) { - case kComparator: - if (GetLengthPrefixedSlice(&input, &str)) { - comparator_ = str.ToString(); - has_comparator_ = true; - } else { - msg = "comparator name"; - } - break; - - case kLogNumber: - if (GetVarint64(&input, &log_number_)) { - has_log_number_ = true; - } else { - msg = "log number"; - } - break; - - case kPrevLogNumber: - if (GetVarint64(&input, &prev_log_number_)) { - has_prev_log_number_ = true; - } else { - msg = "previous log number"; - } - break; - - case kNextFileNumber: - if (GetVarint64(&input, &next_file_number_)) { - has_next_file_number_ = true; - } else { - msg = "next file number"; - } - break; - - case kLastSequence: - if (GetVarint64(&input, &last_sequence_)) { - has_last_sequence_ = true; - } else { - msg = "last sequence number"; - } - break; - - case kCompactPointer: - if (GetLevel(&input, &level) && - GetInternalKey(&input, &key)) { - compact_pointers_.push_back(std::make_pair(level, key)); - } else { - msg = "compaction pointer"; - } - break; - - case kDeletedFile: - if (GetLevel(&input, &level) && - GetVarint64(&input, &number)) { - deleted_files_.insert(std::make_pair(level, number)); - } else { - msg = "deleted file"; - } - break; - - case kNewFile: - if (GetLevel(&input, &level) && - GetVarint64(&input, &f.number) && - GetVarint64(&input, &f.file_size) && - GetInternalKey(&input, &f.smallest) && - GetInternalKey(&input, &f.largest)) { - new_files_.push_back(std::make_pair(level, f)); - } else { - msg = "new-file entry"; - } - break; - - default: - msg = "unknown tag"; - break; - } - } - - if (msg == NULL && !input.empty()) { - msg = "invalid tag"; - } - - Status result; - if (msg != NULL) { - result = Status::Corruption("VersionEdit", msg); - } - return result; -} - -std::string VersionEdit::DebugString() const { - std::string r; - r.append("VersionEdit {"); - if (has_comparator_) { - r.append("\n Comparator: "); - r.append(comparator_); - } - if (has_log_number_) { - r.append("\n LogNumber: "); - AppendNumberTo(&r, log_number_); - } - if (has_prev_log_number_) { - r.append("\n PrevLogNumber: "); - AppendNumberTo(&r, prev_log_number_); - } - if (has_next_file_number_) { - r.append("\n NextFile: "); - AppendNumberTo(&r, next_file_number_); - } - if (has_last_sequence_) { - r.append("\n LastSeq: "); - AppendNumberTo(&r, last_sequence_); - } - for (size_t i = 0; i < compact_pointers_.size(); i++) { - r.append("\n CompactPointer: "); - AppendNumberTo(&r, compact_pointers_[i].first); - r.append(" "); - r.append(compact_pointers_[i].second.DebugString()); - } - for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); - iter != deleted_files_.end(); - ++iter) { - r.append("\n DeleteFile: "); - AppendNumberTo(&r, iter->first); - r.append(" "); - AppendNumberTo(&r, iter->second); - } - for (size_t i = 0; i < new_files_.size(); i++) { - const FileMetaData& f = new_files_[i].second; - r.append("\n AddFile: "); - AppendNumberTo(&r, new_files_[i].first); - r.append(" "); - AppendNumberTo(&r, f.number); - r.append(" "); - AppendNumberTo(&r, f.file_size); - r.append(" "); - r.append(f.smallest.DebugString()); - r.append(" .. "); - r.append(f.largest.DebugString()); - } - r.append("\n}\n"); - return r; -} - -} // namespace leveldb diff --git a/src/leveldb/db/version_edit.h b/src/leveldb/db/version_edit.h deleted file mode 100644 index eaef77b327..0000000000 --- a/src/leveldb/db/version_edit.h +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_VERSION_EDIT_H_ -#define STORAGE_LEVELDB_DB_VERSION_EDIT_H_ - -#include <set> -#include <utility> -#include <vector> -#include "db/dbformat.h" - -namespace leveldb { - -class VersionSet; - -struct FileMetaData { - int refs; - int allowed_seeks; // Seeks allowed until compaction - uint64_t number; - uint64_t file_size; // File size in bytes - InternalKey smallest; // Smallest internal key served by table - InternalKey largest; // Largest internal key served by table - - FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0) { } -}; - -class VersionEdit { - public: - VersionEdit() { Clear(); } - ~VersionEdit() { } - - void Clear(); - - void SetComparatorName(const Slice& name) { - has_comparator_ = true; - comparator_ = name.ToString(); - } - void SetLogNumber(uint64_t num) { - has_log_number_ = true; - log_number_ = num; - } - void SetPrevLogNumber(uint64_t num) { - has_prev_log_number_ = true; - prev_log_number_ = num; - } - void SetNextFile(uint64_t num) { - has_next_file_number_ = true; - next_file_number_ = num; - } - void SetLastSequence(SequenceNumber seq) { - has_last_sequence_ = true; - last_sequence_ = seq; - } - void SetCompactPointer(int level, const InternalKey& key) { - compact_pointers_.push_back(std::make_pair(level, key)); - } - - // Add the specified file at the specified number. - // REQUIRES: This version has not been saved (see VersionSet::SaveTo) - // REQUIRES: "smallest" and "largest" are smallest and largest keys in file - void AddFile(int level, uint64_t file, - uint64_t file_size, - const InternalKey& smallest, - const InternalKey& largest) { - FileMetaData f; - f.number = file; - f.file_size = file_size; - f.smallest = smallest; - f.largest = largest; - new_files_.push_back(std::make_pair(level, f)); - } - - // Delete the specified "file" from the specified "level". - void DeleteFile(int level, uint64_t file) { - deleted_files_.insert(std::make_pair(level, file)); - } - - void EncodeTo(std::string* dst) const; - Status DecodeFrom(const Slice& src); - - std::string DebugString() const; - - private: - friend class VersionSet; - - typedef std::set< std::pair<int, uint64_t> > DeletedFileSet; - - std::string comparator_; - uint64_t log_number_; - uint64_t prev_log_number_; - uint64_t next_file_number_; - SequenceNumber last_sequence_; - bool has_comparator_; - bool has_log_number_; - bool has_prev_log_number_; - bool has_next_file_number_; - bool has_last_sequence_; - - std::vector< std::pair<int, InternalKey> > compact_pointers_; - DeletedFileSet deleted_files_; - std::vector< std::pair<int, FileMetaData> > new_files_; -}; - -} // namespace leveldb - -#endif // STORAGE_LEVELDB_DB_VERSION_EDIT_H_ diff --git a/src/leveldb/db/version_edit_test.cc b/src/leveldb/db/version_edit_test.cc deleted file mode 100644 index 280310b49d..0000000000 --- a/src/leveldb/db/version_edit_test.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/version_edit.h" -#include "util/testharness.h" - -namespace leveldb { - -static void TestEncodeDecode(const VersionEdit& edit) { - std::string encoded, encoded2; - edit.EncodeTo(&encoded); - VersionEdit parsed; - Status s = parsed.DecodeFrom(encoded); - ASSERT_TRUE(s.ok()) << s.ToString(); - parsed.EncodeTo(&encoded2); - ASSERT_EQ(encoded, encoded2); -} - -class VersionEditTest { }; - -TEST(VersionEditTest, EncodeDecode) { - static const uint64_t kBig = 1ull << 50; - - VersionEdit edit; - for (int i = 0; i < 4; i++) { - TestEncodeDecode(edit); - edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, - InternalKey("foo", kBig + 500 + i, kTypeValue), - InternalKey("zoo", kBig + 600 + i, kTypeDeletion)); - edit.DeleteFile(4, kBig + 700 + i); - edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue)); - } - - edit.SetComparatorName("foo"); - edit.SetLogNumber(kBig + 100); - edit.SetNextFile(kBig + 200); - edit.SetLastSequence(kBig + 1000); - TestEncodeDecode(edit); -} - -} // namespace leveldb - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/src/leveldb/db/version_set.cc b/src/leveldb/db/version_set.cc deleted file mode 100644 index 7d0a5de2b9..0000000000 --- a/src/leveldb/db/version_set.cc +++ /dev/null @@ -1,1438 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/version_set.h" - -#include <algorithm> -#include <stdio.h> -#include "db/filename.h" -#include "db/log_reader.h" -#include "db/log_writer.h" -#include "db/memtable.h" -#include "db/table_cache.h" -#include "leveldb/env.h" -#include "leveldb/table_builder.h" -#include "table/merger.h" -#include "table/two_level_iterator.h" -#include "util/coding.h" -#include "util/logging.h" - -namespace leveldb { - -static const int kTargetFileSize = 2 * 1048576; - -// Maximum bytes of overlaps in grandparent (i.e., level+2) before we -// stop building a single file in a level->level+1 compaction. -static const int64_t kMaxGrandParentOverlapBytes = 10 * kTargetFileSize; - -// Maximum number of bytes in all compacted files. We avoid expanding -// the lower level file set of a compaction if it would make the -// total compaction cover more than this many bytes. -static const int64_t kExpandedCompactionByteSizeLimit = 25 * kTargetFileSize; - -static double MaxBytesForLevel(int level) { - // Note: the result for level zero is not really used since we set - // the level-0 compaction threshold based on number of files. - double result = 10 * 1048576.0; // Result for both level-0 and level-1 - while (level > 1) { - result *= 10; - level--; - } - return result; -} - -static uint64_t MaxFileSizeForLevel(int level) { - return kTargetFileSize; // We could vary per level to reduce number of files? -} - -static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) { - int64_t sum = 0; - for (size_t i = 0; i < files.size(); i++) { - sum += files[i]->file_size; - } - return sum; -} - -namespace { -std::string IntSetToString(const std::set<uint64_t>& s) { - std::string result = "{"; - for (std::set<uint64_t>::const_iterator it = s.begin(); - it != s.end(); - ++it) { - result += (result.size() > 1) ? "," : ""; - result += NumberToString(*it); - } - result += "}"; - return result; -} -} // namespace - -Version::~Version() { - assert(refs_ == 0); - - // Remove from linked list - prev_->next_ = next_; - next_->prev_ = prev_; - - // Drop references to files - for (int level = 0; level < config::kNumLevels; level++) { - for (size_t i = 0; i < files_[level].size(); i++) { - FileMetaData* f = files_[level][i]; - assert(f->refs > 0); - f->refs--; - if (f->refs <= 0) { - delete f; - } - } - } -} - -int FindFile(const InternalKeyComparator& icmp, - const std::vector<FileMetaData*>& files, - const Slice& key) { - uint32_t left = 0; - uint32_t right = files.size(); - while (left < right) { - uint32_t mid = (left + right) / 2; - const FileMetaData* f = files[mid]; - if (icmp.InternalKeyComparator::Compare(f->largest.Encode(), key) < 0) { - // Key at "mid.largest" is < "target". Therefore all - // files at or before "mid" are uninteresting. - left = mid + 1; - } else { - // Key at "mid.largest" is >= "target". Therefore all files - // after "mid" are uninteresting. - right = mid; - } - } - return right; -} - -static bool AfterFile(const Comparator* ucmp, - const Slice* user_key, const FileMetaData* f) { - // NULL user_key occurs before all keys and is therefore never after *f - return (user_key != NULL && - ucmp->Compare(*user_key, f->largest.user_key()) > 0); -} - -static bool BeforeFile(const Comparator* ucmp, - const Slice* user_key, const FileMetaData* f) { - // NULL user_key occurs after all keys and is therefore never before *f - return (user_key != NULL && - ucmp->Compare(*user_key, f->smallest.user_key()) < 0); -} - -bool SomeFileOverlapsRange( - const InternalKeyComparator& icmp, - bool disjoint_sorted_files, - const std::vector<FileMetaData*>& files, - const Slice* smallest_user_key, - const Slice* largest_user_key) { - const Comparator* ucmp = icmp.user_comparator(); - if (!disjoint_sorted_files) { - // Need to check against all files - for (size_t i = 0; i < files.size(); i++) { - const FileMetaData* f = files[i]; - if (AfterFile(ucmp, smallest_user_key, f) || - BeforeFile(ucmp, largest_user_key, f)) { - // No overlap - } else { - return true; // Overlap - } - } - return false; - } - - // Binary search over file list - uint32_t index = 0; - if (smallest_user_key != NULL) { - // Find the earliest possible internal key for smallest_user_key - InternalKey small(*smallest_user_key, kMaxSequenceNumber,kValueTypeForSeek); - index = FindFile(icmp, files, small.Encode()); - } - - if (index >= files.size()) { - // beginning of range is after all files, so no overlap. - return false; - } - - return !BeforeFile(ucmp, largest_user_key, files[index]); -} - -// An internal iterator. For a given version/level pair, yields -// information about the files in the level. For a given entry, key() -// is the largest key that occurs in the file, and value() is an -// 16-byte value containing the file number and file size, both -// encoded using EncodeFixed64. -class Version::LevelFileNumIterator : public Iterator { - public: - LevelFileNumIterator(const InternalKeyComparator& icmp, - const std::vector<FileMetaData*>* flist) - : icmp_(icmp), - flist_(flist), - index_(flist->size()) { // Marks as invalid - } - virtual bool Valid() const { - return index_ < flist_->size(); - } - virtual void Seek(const Slice& target) { - index_ = FindFile(icmp_, *flist_, target); - } - virtual void SeekToFirst() { index_ = 0; } - virtual void SeekToLast() { - index_ = flist_->empty() ? 0 : flist_->size() - 1; - } - virtual void Next() { - assert(Valid()); - index_++; - } - virtual void Prev() { - assert(Valid()); - if (index_ == 0) { - index_ = flist_->size(); // Marks as invalid - } else { - index_--; - } - } - Slice key() const { - assert(Valid()); - return (*flist_)[index_]->largest.Encode(); - } - Slice value() const { - assert(Valid()); - EncodeFixed64(value_buf_, (*flist_)[index_]->number); - EncodeFixed64(value_buf_+8, (*flist_)[index_]->file_size); - return Slice(value_buf_, sizeof(value_buf_)); - } - virtual Status status() const { return Status::OK(); } - private: - const InternalKeyComparator icmp_; - const std::vector<FileMetaData*>* const flist_; - uint32_t index_; - - // Backing store for value(). Holds the file number and size. - mutable char value_buf_[16]; -}; - -static Iterator* GetFileIterator(void* arg, - const ReadOptions& options, - const Slice& file_value) { - TableCache* cache = reinterpret_cast<TableCache*>(arg); - if (file_value.size() != 16) { - return NewErrorIterator( - Status::Corruption("FileReader invoked with unexpected value")); - } else { - return cache->NewIterator(options, - DecodeFixed64(file_value.data()), - DecodeFixed64(file_value.data() + 8)); - } -} - -Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, - int level) const { - return NewTwoLevelIterator( - new LevelFileNumIterator(vset_->icmp_, &files_[level]), - &GetFileIterator, vset_->table_cache_, options); -} - -void Version::AddIterators(const ReadOptions& options, - std::vector<Iterator*>* iters) { - // Merge all level zero files together since they may overlap - for (size_t i = 0; i < files_[0].size(); i++) { - iters->push_back( - vset_->table_cache_->NewIterator( - options, files_[0][i]->number, files_[0][i]->file_size)); - } - - // For levels > 0, we can use a concatenating iterator that sequentially - // walks through the non-overlapping files in the level, opening them - // lazily. - for (int level = 1; level < config::kNumLevels; level++) { - if (!files_[level].empty()) { - iters->push_back(NewConcatenatingIterator(options, level)); - } - } -} - -// Callback from TableCache::Get() -namespace { -enum SaverState { - kNotFound, - kFound, - kDeleted, - kCorrupt, -}; -struct Saver { - SaverState state; - const Comparator* ucmp; - Slice user_key; - std::string* value; -}; -} -static void SaveValue(void* arg, const Slice& ikey, const Slice& v) { - Saver* s = reinterpret_cast<Saver*>(arg); - ParsedInternalKey parsed_key; - if (!ParseInternalKey(ikey, &parsed_key)) { - s->state = kCorrupt; - } else { - if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) { - s->state = (parsed_key.type == kTypeValue) ? kFound : kDeleted; - if (s->state == kFound) { - s->value->assign(v.data(), v.size()); - } - } - } -} - -static bool NewestFirst(FileMetaData* a, FileMetaData* b) { - return a->number > b->number; -} - -Status Version::Get(const ReadOptions& options, - const LookupKey& k, - std::string* value, - GetStats* stats) { - Slice ikey = k.internal_key(); - Slice user_key = k.user_key(); - const Comparator* ucmp = vset_->icmp_.user_comparator(); - Status s; - - stats->seek_file = NULL; - stats->seek_file_level = -1; - FileMetaData* last_file_read = NULL; - int last_file_read_level = -1; - - // We can search level-by-level since entries never hop across - // levels. Therefore we are guaranteed that if we find data - // in an smaller level, later levels are irrelevant. - std::vector<FileMetaData*> tmp; - FileMetaData* tmp2; - for (int level = 0; level < config::kNumLevels; level++) { - size_t num_files = files_[level].size(); - if (num_files == 0) continue; - - // Get the list of files to search in this level - FileMetaData* const* files = &files_[level][0]; - if (level == 0) { - // Level-0 files may overlap each other. Find all files that - // overlap user_key and process them in order from newest to oldest. - tmp.reserve(num_files); - for (uint32_t i = 0; i < num_files; i++) { - FileMetaData* f = files[i]; - if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 && - ucmp->Compare(user_key, f->largest.user_key()) <= 0) { - tmp.push_back(f); - } - } - if (tmp.empty()) continue; - - std::sort(tmp.begin(), tmp.end(), NewestFirst); - files = &tmp[0]; - num_files = tmp.size(); - } else { - // Binary search to find earliest index whose largest key >= ikey. - uint32_t index = FindFile(vset_->icmp_, files_[level], ikey); - if (index >= num_files) { - files = NULL; - num_files = 0; - } else { - tmp2 = files[index]; - if (ucmp->Compare(user_key, tmp2->smallest.user_key()) < 0) { - // All of "tmp2" is past any data for user_key - files = NULL; - num_files = 0; - } else { - files = &tmp2; - num_files = 1; - } - } - } - - for (uint32_t i = 0; i < num_files; ++i) { - if (last_file_read != NULL && stats->seek_file == NULL) { - // We have had more than one seek for this read. Charge the 1st file. - stats->seek_file = last_file_read; - stats->seek_file_level = last_file_read_level; - } - - FileMetaData* f = files[i]; - last_file_read = f; - last_file_read_level = level; - - Saver saver; - saver.state = kNotFound; - saver.ucmp = ucmp; - saver.user_key = user_key; - saver.value = value; - s = vset_->table_cache_->Get(options, f->number, f->file_size, - ikey, &saver, SaveValue); - if (!s.ok()) { - return s; - } - switch (saver.state) { - case kNotFound: - break; // Keep searching in other files - case kFound: - return s; - case kDeleted: - s = Status::NotFound(Slice()); // Use empty error message for speed - return s; - case kCorrupt: - s = Status::Corruption("corrupted key for ", user_key); - return s; - } - } - } - - return Status::NotFound(Slice()); // Use an empty error message for speed -} - -bool Version::UpdateStats(const GetStats& stats) { - FileMetaData* f = stats.seek_file; - if (f != NULL) { - f->allowed_seeks--; - if (f->allowed_seeks <= 0 && file_to_compact_ == NULL) { - file_to_compact_ = f; - file_to_compact_level_ = stats.seek_file_level; - return true; - } - } - return false; -} - -void Version::Ref() { - ++refs_; -} - -void Version::Unref() { - assert(this != &vset_->dummy_versions_); - assert(refs_ >= 1); - --refs_; - if (refs_ == 0) { - delete this; - } -} - -bool Version::OverlapInLevel(int level, - const Slice* smallest_user_key, - const Slice* largest_user_key) { - return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level], - smallest_user_key, largest_user_key); -} - -int Version::PickLevelForMemTableOutput( - const Slice& smallest_user_key, - const Slice& largest_user_key) { - int level = 0; - if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) { - // Push to next level if there is no overlap in next level, - // and the #bytes overlapping in the level after that are limited. - InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek); - InternalKey limit(largest_user_key, 0, static_cast<ValueType>(0)); - std::vector<FileMetaData*> overlaps; - while (level < config::kMaxMemCompactLevel) { - if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) { - break; - } - GetOverlappingInputs(level + 2, &start, &limit, &overlaps); - const int64_t sum = TotalFileSize(overlaps); - if (sum > kMaxGrandParentOverlapBytes) { - break; - } - level++; - } - } - return level; -} - -// Store in "*inputs" all files in "level" that overlap [begin,end] -void Version::GetOverlappingInputs( - int level, - const InternalKey* begin, - const InternalKey* end, - std::vector<FileMetaData*>* inputs) { - inputs->clear(); - Slice user_begin, user_end; - if (begin != NULL) { - user_begin = begin->user_key(); - } - if (end != NULL) { - user_end = end->user_key(); - } - const Comparator* user_cmp = vset_->icmp_.user_comparator(); - for (size_t i = 0; i < files_[level].size(); ) { - FileMetaData* f = files_[level][i++]; - const Slice file_start = f->smallest.user_key(); - const Slice file_limit = f->largest.user_key(); - if (begin != NULL && user_cmp->Compare(file_limit, user_begin) < 0) { - // "f" is completely before specified range; skip it - } else if (end != NULL && user_cmp->Compare(file_start, user_end) > 0) { - // "f" is completely after specified range; skip it - } else { - inputs->push_back(f); - if (level == 0) { - // Level-0 files may overlap each other. So check if the newly - // added file has expanded the range. If so, restart search. - if (begin != NULL && user_cmp->Compare(file_start, user_begin) < 0) { - user_begin = file_start; - inputs->clear(); - i = 0; - } else if (end != NULL && user_cmp->Compare(file_limit, user_end) > 0) { - user_end = file_limit; - inputs->clear(); - i = 0; - } - } - } - } -} - -std::string Version::DebugString() const { - std::string r; - for (int level = 0; level < config::kNumLevels; level++) { - // E.g., - // --- level 1 --- - // 17:123['a' .. 'd'] - // 20:43['e' .. 'g'] - r.append("--- level "); - AppendNumberTo(&r, level); - r.append(" ---\n"); - const std::vector<FileMetaData*>& files = files_[level]; - for (size_t i = 0; i < files.size(); i++) { - r.push_back(' '); - AppendNumberTo(&r, files[i]->number); - r.push_back(':'); - AppendNumberTo(&r, files[i]->file_size); - r.append("["); - r.append(files[i]->smallest.DebugString()); - r.append(" .. "); - r.append(files[i]->largest.DebugString()); - r.append("]\n"); - } - } - return r; -} - -// A helper class so we can efficiently apply a whole sequence -// of edits to a particular state without creating intermediate -// Versions that contain full copies of the intermediate state. -class VersionSet::Builder { - private: - // Helper to sort by v->files_[file_number].smallest - struct BySmallestKey { - const InternalKeyComparator* internal_comparator; - - bool operator()(FileMetaData* f1, FileMetaData* f2) const { - int r = internal_comparator->Compare(f1->smallest, f2->smallest); - if (r != 0) { - return (r < 0); - } else { - // Break ties by file number - return (f1->number < f2->number); - } - } - }; - - typedef std::set<FileMetaData*, BySmallestKey> FileSet; - struct LevelState { - std::set<uint64_t> deleted_files; - FileSet* added_files; - }; - - VersionSet* vset_; - Version* base_; - LevelState levels_[config::kNumLevels]; - - public: - // Initialize a builder with the files from *base and other info from *vset - Builder(VersionSet* vset, Version* base) - : vset_(vset), - base_(base) { - base_->Ref(); - BySmallestKey cmp; - cmp.internal_comparator = &vset_->icmp_; - for (int level = 0; level < config::kNumLevels; level++) { - levels_[level].added_files = new FileSet(cmp); - } - } - - ~Builder() { - for (int level = 0; level < config::kNumLevels; level++) { - const FileSet* added = levels_[level].added_files; - std::vector<FileMetaData*> to_unref; - to_unref.reserve(added->size()); - for (FileSet::const_iterator it = added->begin(); - it != added->end(); ++it) { - to_unref.push_back(*it); - } - delete added; - for (uint32_t i = 0; i < to_unref.size(); i++) { - FileMetaData* f = to_unref[i]; - f->refs--; - if (f->refs <= 0) { - delete f; - } - } - } - base_->Unref(); - } - - // Apply all of the edits in *edit to the current state. - void Apply(VersionEdit* edit) { - // Update compaction pointers - for (size_t i = 0; i < edit->compact_pointers_.size(); i++) { - const int level = edit->compact_pointers_[i].first; - vset_->compact_pointer_[level] = - edit->compact_pointers_[i].second.Encode().ToString(); - } - - // Delete files - const VersionEdit::DeletedFileSet& del = edit->deleted_files_; - for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin(); - iter != del.end(); - ++iter) { - const int level = iter->first; - const uint64_t number = iter->second; - levels_[level].deleted_files.insert(number); - } - - // Add new files - for (size_t i = 0; i < edit->new_files_.size(); i++) { - const int level = edit->new_files_[i].first; - FileMetaData* f = new FileMetaData(edit->new_files_[i].second); - f->refs = 1; - - // We arrange to automatically compact this file after - // a certain number of seeks. Let's assume: - // (1) One seek costs 10ms - // (2) Writing or reading 1MB costs 10ms (100MB/s) - // (3) A compaction of 1MB does 25MB of IO: - // 1MB read from this level - // 10-12MB read from next level (boundaries may be misaligned) - // 10-12MB written to next level - // This implies that 25 seeks cost the same as the compaction - // of 1MB of data. I.e., one seek costs approximately the - // same as the compaction of 40KB of data. We are a little - // conservative and allow approximately one seek for every 16KB - // of data before triggering a compaction. - f->allowed_seeks = (f->file_size / 16384); - if (f->allowed_seeks < 100) f->allowed_seeks = 100; - - levels_[level].deleted_files.erase(f->number); - levels_[level].added_files->insert(f); - } - } - - // Save the current state in *v. - void SaveTo(Version* v) { - BySmallestKey cmp; - cmp.internal_comparator = &vset_->icmp_; - for (int level = 0; level < config::kNumLevels; level++) { - // Merge the set of added files with the set of pre-existing files. - // Drop any deleted files. Store the result in *v. - const std::vector<FileMetaData*>& base_files = base_->files_[level]; - std::vector<FileMetaData*>::const_iterator base_iter = base_files.begin(); - std::vector<FileMetaData*>::const_iterator base_end = base_files.end(); - const FileSet* added = levels_[level].added_files; - v->files_[level].reserve(base_files.size() + added->size()); - for (FileSet::const_iterator added_iter = added->begin(); - added_iter != added->end(); - ++added_iter) { - // Add all smaller files listed in base_ - for (std::vector<FileMetaData*>::const_iterator bpos - = std::upper_bound(base_iter, base_end, *added_iter, cmp); - base_iter != bpos; - ++base_iter) { - MaybeAddFile(v, level, *base_iter); - } - - MaybeAddFile(v, level, *added_iter); - } - - // Add remaining base files - for (; base_iter != base_end; ++base_iter) { - MaybeAddFile(v, level, *base_iter); - } - -#ifndef NDEBUG - // Make sure there is no overlap in levels > 0 - if (level > 0) { - for (uint32_t i = 1; i < v->files_[level].size(); i++) { - const InternalKey& prev_end = v->files_[level][i-1]->largest; - const InternalKey& this_begin = v->files_[level][i]->smallest; - if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) { - fprintf(stderr, "overlapping ranges in same level %s vs. %s\n", - prev_end.DebugString().c_str(), - this_begin.DebugString().c_str()); - abort(); - } - } - } -#endif - } - } - - void MaybeAddFile(Version* v, int level, FileMetaData* f) { - if (levels_[level].deleted_files.count(f->number) > 0) { - // File is deleted: do nothing - } else { - std::vector<FileMetaData*>* files = &v->files_[level]; - if (level > 0 && !files->empty()) { - // Must not overlap - assert(vset_->icmp_.Compare((*files)[files->size()-1]->largest, - f->smallest) < 0); - } - f->refs++; - files->push_back(f); - } - } -}; - -VersionSet::VersionSet(const std::string& dbname, - const Options* options, - TableCache* table_cache, - const InternalKeyComparator* cmp) - : env_(options->env), - dbname_(dbname), - options_(options), - table_cache_(table_cache), - icmp_(*cmp), - next_file_number_(2), - manifest_file_number_(0), // Filled by Recover() - last_sequence_(0), - log_number_(0), - prev_log_number_(0), - descriptor_file_(NULL), - descriptor_log_(NULL), - dummy_versions_(this), - current_(NULL) { - AppendVersion(new Version(this)); -} - -VersionSet::~VersionSet() { - current_->Unref(); - assert(dummy_versions_.next_ == &dummy_versions_); // List must be empty - delete descriptor_log_; - delete descriptor_file_; -} - -void VersionSet::AppendVersion(Version* v) { - // Make "v" current - assert(v->refs_ == 0); - assert(v != current_); - if (current_ != NULL) { - current_->Unref(); - } - current_ = v; - v->Ref(); - - // Append to linked list - v->prev_ = dummy_versions_.prev_; - v->next_ = &dummy_versions_; - v->prev_->next_ = v; - v->next_->prev_ = v; -} - -Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) { - if (edit->has_log_number_) { - assert(edit->log_number_ >= log_number_); - assert(edit->log_number_ < next_file_number_); - } else { - edit->SetLogNumber(log_number_); - } - - if (!edit->has_prev_log_number_) { - edit->SetPrevLogNumber(prev_log_number_); - } - - edit->SetNextFile(next_file_number_); - edit->SetLastSequence(last_sequence_); - - Version* v = new Version(this); - { - Builder builder(this, current_); - builder.Apply(edit); - builder.SaveTo(v); - } - Finalize(v); - - // Initialize new descriptor log file if necessary by creating - // a temporary file that contains a snapshot of the current version. - std::string new_manifest_file; - Status s; - if (descriptor_log_ == NULL) { - // No reason to unlock *mu here since we only hit this path in the - // first call to LogAndApply (when opening the database). - assert(descriptor_file_ == NULL); - new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_); - edit->SetNextFile(next_file_number_); - s = env_->NewWritableFile(new_manifest_file, &descriptor_file_); - if (s.ok()) { - descriptor_log_ = new log::Writer(descriptor_file_); - s = WriteSnapshot(descriptor_log_); - } - } - - // Unlock during expensive MANIFEST log write - { - mu->Unlock(); - - // Write new record to MANIFEST log - if (s.ok()) { - std::string record; - edit->EncodeTo(&record); - s = descriptor_log_->AddRecord(record); - if (s.ok()) { - s = descriptor_file_->Sync(); - } - if (!s.ok()) { - Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str()); - if (ManifestContains(record)) { - Log(options_->info_log, - "MANIFEST contains log record despite error; advancing to new " - "version to prevent mismatch between in-memory and logged state"); - s = Status::OK(); - } - } - } - - // If we just created a new descriptor file, install it by writing a - // new CURRENT file that points to it. - if (s.ok() && !new_manifest_file.empty()) { - s = SetCurrentFile(env_, dbname_, manifest_file_number_); - // No need to double-check MANIFEST in case of error since it - // will be discarded below. - } - - mu->Lock(); - } - - // Install the new version - if (s.ok()) { - AppendVersion(v); - log_number_ = edit->log_number_; - prev_log_number_ = edit->prev_log_number_; - } else { - delete v; - if (!new_manifest_file.empty()) { - delete descriptor_log_; - delete descriptor_file_; - descriptor_log_ = NULL; - descriptor_file_ = NULL; - env_->DeleteFile(new_manifest_file); - } - } - - return s; -} - -Status VersionSet::Recover() { - struct LogReporter : public log::Reader::Reporter { - Status* status; - virtual void Corruption(size_t bytes, const Status& s) { - if (this->status->ok()) *this->status = s; - } - }; - - // Read "CURRENT" file, which contains a pointer to the current manifest file - std::string current; - Status s = ReadFileToString(env_, CurrentFileName(dbname_), ¤t); - if (!s.ok()) { - return s; - } - if (current.empty() || current[current.size()-1] != '\n') { - return Status::Corruption("CURRENT file does not end with newline"); - } - current.resize(current.size() - 1); - - std::string dscname = dbname_ + "/" + current; - SequentialFile* file; - s = env_->NewSequentialFile(dscname, &file); - if (!s.ok()) { - return s; - } - - bool have_log_number = false; - bool have_prev_log_number = false; - bool have_next_file = false; - bool have_last_sequence = false; - uint64_t next_file = 0; - uint64_t last_sequence = 0; - uint64_t log_number = 0; - uint64_t prev_log_number = 0; - Builder builder(this, current_); - - { - LogReporter reporter; - reporter.status = &s; - log::Reader reader(file, &reporter, true/*checksum*/, 0/*initial_offset*/); - Slice record; - std::string scratch; - while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit; - s = edit.DecodeFrom(record); - if (s.ok()) { - if (edit.has_comparator_ && - edit.comparator_ != icmp_.user_comparator()->Name()) { - s = Status::InvalidArgument( - edit.comparator_ + " does not match existing comparator ", - icmp_.user_comparator()->Name()); - } - } - - if (s.ok()) { - builder.Apply(&edit); - } - - if (edit.has_log_number_) { - log_number = edit.log_number_; - have_log_number = true; - } - - if (edit.has_prev_log_number_) { - prev_log_number = edit.prev_log_number_; - have_prev_log_number = true; - } - - if (edit.has_next_file_number_) { - next_file = edit.next_file_number_; - have_next_file = true; - } - - if (edit.has_last_sequence_) { - last_sequence = edit.last_sequence_; - have_last_sequence = true; - } - } - } - delete file; - file = NULL; - - if (s.ok()) { - if (!have_next_file) { - s = Status::Corruption("no meta-nextfile entry in descriptor"); - } else if (!have_log_number) { - s = Status::Corruption("no meta-lognumber entry in descriptor"); - } else if (!have_last_sequence) { - s = Status::Corruption("no last-sequence-number entry in descriptor"); - } - - if (!have_prev_log_number) { - prev_log_number = 0; - } - - MarkFileNumberUsed(prev_log_number); - MarkFileNumberUsed(log_number); - } - - if (s.ok()) { - Version* v = new Version(this); - builder.SaveTo(v); - // Install recovered version - Finalize(v); - AppendVersion(v); - manifest_file_number_ = next_file; - next_file_number_ = next_file + 1; - last_sequence_ = last_sequence; - log_number_ = log_number; - prev_log_number_ = prev_log_number; - } - - return s; -} - -void VersionSet::MarkFileNumberUsed(uint64_t number) { - if (next_file_number_ <= number) { - next_file_number_ = number + 1; - } -} - -void VersionSet::Finalize(Version* v) { - // Precomputed best level for next compaction - int best_level = -1; - double best_score = -1; - - for (int level = 0; level < config::kNumLevels-1; level++) { - double score; - if (level == 0) { - // We treat level-0 specially by bounding the number of files - // instead of number of bytes for two reasons: - // - // (1) With larger write-buffer sizes, it is nice not to do too - // many level-0 compactions. - // - // (2) The files in level-0 are merged on every read and - // therefore we wish to avoid too many files when the individual - // file size is small (perhaps because of a small write-buffer - // setting, or very high compression ratios, or lots of - // overwrites/deletions). - score = v->files_[level].size() / - static_cast<double>(config::kL0_CompactionTrigger); - } else { - // Compute the ratio of current size to size limit. - const uint64_t level_bytes = TotalFileSize(v->files_[level]); - score = static_cast<double>(level_bytes) / MaxBytesForLevel(level); - } - - if (score > best_score) { - best_level = level; - best_score = score; - } - } - - v->compaction_level_ = best_level; - v->compaction_score_ = best_score; -} - -Status VersionSet::WriteSnapshot(log::Writer* log) { - // TODO: Break up into multiple records to reduce memory usage on recovery? - - // Save metadata - VersionEdit edit; - edit.SetComparatorName(icmp_.user_comparator()->Name()); - - // Save compaction pointers - for (int level = 0; level < config::kNumLevels; level++) { - if (!compact_pointer_[level].empty()) { - InternalKey key; - key.DecodeFrom(compact_pointer_[level]); - edit.SetCompactPointer(level, key); - } - } - - // Save files - for (int level = 0; level < config::kNumLevels; level++) { - const std::vector<FileMetaData*>& files = current_->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - const FileMetaData* f = files[i]; - edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest); - } - } - - std::string record; - edit.EncodeTo(&record); - return log->AddRecord(record); -} - -int VersionSet::NumLevelFiles(int level) const { - assert(level >= 0); - assert(level < config::kNumLevels); - return current_->files_[level].size(); -} - -const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const { - // Update code if kNumLevels changes - assert(config::kNumLevels == 7); - snprintf(scratch->buffer, sizeof(scratch->buffer), - "files[ %d %d %d %d %d %d %d ]", - int(current_->files_[0].size()), - int(current_->files_[1].size()), - int(current_->files_[2].size()), - int(current_->files_[3].size()), - int(current_->files_[4].size()), - int(current_->files_[5].size()), - int(current_->files_[6].size())); - return scratch->buffer; -} - -// Return true iff the manifest contains the specified record. -bool VersionSet::ManifestContains(const std::string& record) const { - std::string fname = DescriptorFileName(dbname_, manifest_file_number_); - Log(options_->info_log, "ManifestContains: checking %s\n", fname.c_str()); - SequentialFile* file = NULL; - Status s = env_->NewSequentialFile(fname, &file); - if (!s.ok()) { - Log(options_->info_log, "ManifestContains: %s\n", s.ToString().c_str()); - return false; - } - log::Reader reader(file, NULL, true/*checksum*/, 0); - Slice r; - std::string scratch; - bool result = false; - while (reader.ReadRecord(&r, &scratch)) { - if (r == Slice(record)) { - result = true; - break; - } - } - delete file; - Log(options_->info_log, "ManifestContains: result = %d\n", result ? 1 : 0); - return result; -} - -uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { - uint64_t result = 0; - for (int level = 0; level < config::kNumLevels; level++) { - const std::vector<FileMetaData*>& files = v->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - if (icmp_.Compare(files[i]->largest, ikey) <= 0) { - // Entire file is before "ikey", so just add the file size - result += files[i]->file_size; - } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) { - // Entire file is after "ikey", so ignore - if (level > 0) { - // Files other than level 0 are sorted by meta->smallest, so - // no further files in this level will contain data for - // "ikey". - break; - } - } else { - // "ikey" falls in the range for this table. Add the - // approximate offset of "ikey" within the table. - Table* tableptr; - Iterator* iter = table_cache_->NewIterator( - ReadOptions(), files[i]->number, files[i]->file_size, &tableptr); - if (tableptr != NULL) { - result += tableptr->ApproximateOffsetOf(ikey.Encode()); - } - delete iter; - } - } - } - return result; -} - -void VersionSet::AddLiveFiles(std::set<uint64_t>* live) { - for (Version* v = dummy_versions_.next_; - v != &dummy_versions_; - v = v->next_) { - for (int level = 0; level < config::kNumLevels; level++) { - const std::vector<FileMetaData*>& files = v->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - live->insert(files[i]->number); - } - } - } -} - -int64_t VersionSet::NumLevelBytes(int level) const { - assert(level >= 0); - assert(level < config::kNumLevels); - return TotalFileSize(current_->files_[level]); -} - -int64_t VersionSet::MaxNextLevelOverlappingBytes() { - int64_t result = 0; - std::vector<FileMetaData*> overlaps; - for (int level = 1; level < config::kNumLevels - 1; level++) { - for (size_t i = 0; i < current_->files_[level].size(); i++) { - const FileMetaData* f = current_->files_[level][i]; - current_->GetOverlappingInputs(level+1, &f->smallest, &f->largest, - &overlaps); - const int64_t sum = TotalFileSize(overlaps); - if (sum > result) { - result = sum; - } - } - } - return result; -} - -// Stores the minimal range that covers all entries in inputs in -// *smallest, *largest. -// REQUIRES: inputs is not empty -void VersionSet::GetRange(const std::vector<FileMetaData*>& inputs, - InternalKey* smallest, - InternalKey* largest) { - assert(!inputs.empty()); - smallest->Clear(); - largest->Clear(); - for (size_t i = 0; i < inputs.size(); i++) { - FileMetaData* f = inputs[i]; - if (i == 0) { - *smallest = f->smallest; - *largest = f->largest; - } else { - if (icmp_.Compare(f->smallest, *smallest) < 0) { - *smallest = f->smallest; - } - if (icmp_.Compare(f->largest, *largest) > 0) { - *largest = f->largest; - } - } - } -} - -// Stores the minimal range that covers all entries in inputs1 and inputs2 -// in *smallest, *largest. -// REQUIRES: inputs is not empty -void VersionSet::GetRange2(const std::vector<FileMetaData*>& inputs1, - const std::vector<FileMetaData*>& inputs2, - InternalKey* smallest, - InternalKey* largest) { - std::vector<FileMetaData*> all = inputs1; - all.insert(all.end(), inputs2.begin(), inputs2.end()); - GetRange(all, smallest, largest); -} - -Iterator* VersionSet::MakeInputIterator(Compaction* c) { - ReadOptions options; - options.verify_checksums = options_->paranoid_checks; - options.fill_cache = false; - - // Level-0 files have to be merged together. For other levels, - // we will make a concatenating iterator per level. - // TODO(opt): use concatenating iterator for level-0 if there is no overlap - const int space = (c->level() == 0 ? c->inputs_[0].size() + 1 : 2); - Iterator** list = new Iterator*[space]; - int num = 0; - for (int which = 0; which < 2; which++) { - if (!c->inputs_[which].empty()) { - if (c->level() + which == 0) { - const std::vector<FileMetaData*>& files = c->inputs_[which]; - for (size_t i = 0; i < files.size(); i++) { - list[num++] = table_cache_->NewIterator( - options, files[i]->number, files[i]->file_size); - } - } else { - // Create concatenating iterator for the files from this level - list[num++] = NewTwoLevelIterator( - new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]), - &GetFileIterator, table_cache_, options); - } - } - } - assert(num <= space); - Iterator* result = NewMergingIterator(&icmp_, list, num); - delete[] list; - return result; -} - -Compaction* VersionSet::PickCompaction() { - Compaction* c; - int level; - - // We prefer compactions triggered by too much data in a level over - // the compactions triggered by seeks. - const bool size_compaction = (current_->compaction_score_ >= 1); - const bool seek_compaction = (current_->file_to_compact_ != NULL); - if (size_compaction) { - level = current_->compaction_level_; - assert(level >= 0); - assert(level+1 < config::kNumLevels); - c = new Compaction(level); - - // Pick the first file that comes after compact_pointer_[level] - for (size_t i = 0; i < current_->files_[level].size(); i++) { - FileMetaData* f = current_->files_[level][i]; - if (compact_pointer_[level].empty() || - icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) { - c->inputs_[0].push_back(f); - break; - } - } - if (c->inputs_[0].empty()) { - // Wrap-around to the beginning of the key space - c->inputs_[0].push_back(current_->files_[level][0]); - } - } else if (seek_compaction) { - level = current_->file_to_compact_level_; - c = new Compaction(level); - c->inputs_[0].push_back(current_->file_to_compact_); - } else { - return NULL; - } - - c->input_version_ = current_; - c->input_version_->Ref(); - - // Files in level 0 may overlap each other, so pick up all overlapping ones - if (level == 0) { - InternalKey smallest, largest; - GetRange(c->inputs_[0], &smallest, &largest); - // Note that the next call will discard the file we placed in - // c->inputs_[0] earlier and replace it with an overlapping set - // which will include the picked file. - current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]); - assert(!c->inputs_[0].empty()); - } - - SetupOtherInputs(c); - - return c; -} - -void VersionSet::SetupOtherInputs(Compaction* c) { - const int level = c->level(); - InternalKey smallest, largest; - GetRange(c->inputs_[0], &smallest, &largest); - - current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1]); - - // Get entire range covered by compaction - InternalKey all_start, all_limit; - GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); - - // See if we can grow the number of inputs in "level" without - // changing the number of "level+1" files we pick up. - if (!c->inputs_[1].empty()) { - std::vector<FileMetaData*> expanded0; - current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0); - const int64_t inputs0_size = TotalFileSize(c->inputs_[0]); - const int64_t inputs1_size = TotalFileSize(c->inputs_[1]); - const int64_t expanded0_size = TotalFileSize(expanded0); - if (expanded0.size() > c->inputs_[0].size() && - inputs1_size + expanded0_size < kExpandedCompactionByteSizeLimit) { - InternalKey new_start, new_limit; - GetRange(expanded0, &new_start, &new_limit); - std::vector<FileMetaData*> expanded1; - current_->GetOverlappingInputs(level+1, &new_start, &new_limit, - &expanded1); - if (expanded1.size() == c->inputs_[1].size()) { - Log(options_->info_log, - "Expanding@%d %d+%d (%ld+%ld bytes) to %d+%d (%ld+%ld bytes)\n", - level, - int(c->inputs_[0].size()), - int(c->inputs_[1].size()), - long(inputs0_size), long(inputs1_size), - int(expanded0.size()), - int(expanded1.size()), - long(expanded0_size), long(inputs1_size)); - smallest = new_start; - largest = new_limit; - c->inputs_[0] = expanded0; - c->inputs_[1] = expanded1; - GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); - } - } - } - - // Compute the set of grandparent files that overlap this compaction - // (parent == level+1; grandparent == level+2) - if (level + 2 < config::kNumLevels) { - current_->GetOverlappingInputs(level + 2, &all_start, &all_limit, - &c->grandparents_); - } - - if (false) { - Log(options_->info_log, "Compacting %d '%s' .. '%s'", - level, - smallest.DebugString().c_str(), - largest.DebugString().c_str()); - } - - // Update the place where we will do the next compaction for this level. - // We update this immediately instead of waiting for the VersionEdit - // to be applied so that if the compaction fails, we will try a different - // key range next time. - compact_pointer_[level] = largest.Encode().ToString(); - c->edit_.SetCompactPointer(level, largest); -} - -Compaction* VersionSet::CompactRange( - int level, - const InternalKey* begin, - const InternalKey* end) { - std::vector<FileMetaData*> inputs; - current_->GetOverlappingInputs(level, begin, end, &inputs); - if (inputs.empty()) { - return NULL; - } - - // Avoid compacting too much in one shot in case the range is large. - const uint64_t limit = MaxFileSizeForLevel(level); - uint64_t total = 0; - for (size_t i = 0; i < inputs.size(); i++) { - uint64_t s = inputs[i]->file_size; - total += s; - if (total >= limit) { - inputs.resize(i + 1); - break; - } - } - - Compaction* c = new Compaction(level); - c->input_version_ = current_; - c->input_version_->Ref(); - c->inputs_[0] = inputs; - SetupOtherInputs(c); - return c; -} - -Compaction::Compaction(int level) - : level_(level), - max_output_file_size_(MaxFileSizeForLevel(level)), - input_version_(NULL), - grandparent_index_(0), - seen_key_(false), - overlapped_bytes_(0) { - for (int i = 0; i < config::kNumLevels; i++) { - level_ptrs_[i] = 0; - } -} - -Compaction::~Compaction() { - if (input_version_ != NULL) { - input_version_->Unref(); - } -} - -bool Compaction::IsTrivialMove() const { - // Avoid a move if there is lots of overlapping grandparent data. - // Otherwise, the move could create a parent file that will require - // a very expensive merge later on. - return (num_input_files(0) == 1 && - num_input_files(1) == 0 && - TotalFileSize(grandparents_) <= kMaxGrandParentOverlapBytes); -} - -void Compaction::AddInputDeletions(VersionEdit* edit) { - for (int which = 0; which < 2; which++) { - for (size_t i = 0; i < inputs_[which].size(); i++) { - edit->DeleteFile(level_ + which, inputs_[which][i]->number); - } - } -} - -bool Compaction::IsBaseLevelForKey(const Slice& user_key) { - // Maybe use binary search to find right entry instead of linear search? - const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); - for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) { - const std::vector<FileMetaData*>& files = input_version_->files_[lvl]; - for (; level_ptrs_[lvl] < files.size(); ) { - FileMetaData* f = files[level_ptrs_[lvl]]; - if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { - // We've advanced far enough - if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { - // Key falls in this file's range, so definitely not base level - return false; - } - break; - } - level_ptrs_[lvl]++; - } - } - return true; -} - -bool Compaction::ShouldStopBefore(const Slice& internal_key) { - // Scan to find earliest grandparent file that contains key. - const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; - while (grandparent_index_ < grandparents_.size() && - icmp->Compare(internal_key, - grandparents_[grandparent_index_]->largest.Encode()) > 0) { - if (seen_key_) { - overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; - } - grandparent_index_++; - } - seen_key_ = true; - - if (overlapped_bytes_ > kMaxGrandParentOverlapBytes) { - // Too much overlap for current output; start new output - overlapped_bytes_ = 0; - return true; - } else { - return false; - } -} - -void Compaction::ReleaseInputs() { - if (input_version_ != NULL) { - input_version_->Unref(); - input_version_ = NULL; - } -} - -} // namespace leveldb diff --git a/src/leveldb/db/version_set.h b/src/leveldb/db/version_set.h deleted file mode 100644 index 9d084fdb7d..0000000000 --- a/src/leveldb/db/version_set.h +++ /dev/null @@ -1,383 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// The representation of a DBImpl consists of a set of Versions. The -// newest version is called "current". Older versions may be kept -// around to provide a consistent view to live iterators. -// -// Each Version keeps track of a set of Table files per level. The -// entire set of versions is maintained in a VersionSet. -// -// Version,VersionSet are thread-compatible, but require external -// synchronization on all accesses. - -#ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_ -#define STORAGE_LEVELDB_DB_VERSION_SET_H_ - -#include <map> -#include <set> -#include <vector> -#include "db/dbformat.h" -#include "db/version_edit.h" -#include "port/port.h" -#include "port/thread_annotations.h" - -namespace leveldb { - -namespace log { class Writer; } - -class Compaction; -class Iterator; -class MemTable; -class TableBuilder; -class TableCache; -class Version; -class VersionSet; -class WritableFile; - -// Return the smallest index i such that files[i]->largest >= key. -// Return files.size() if there is no such file. -// REQUIRES: "files" contains a sorted list of non-overlapping files. -extern int FindFile(const InternalKeyComparator& icmp, - const std::vector<FileMetaData*>& files, - const Slice& key); - -// Returns true iff some file in "files" overlaps the user key range -// [*smallest,*largest]. -// smallest==NULL represents a key smaller than all keys in the DB. -// largest==NULL represents a key largest than all keys in the DB. -// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges -// in sorted order. -extern bool SomeFileOverlapsRange( - const InternalKeyComparator& icmp, - bool disjoint_sorted_files, - const std::vector<FileMetaData*>& files, - const Slice* smallest_user_key, - const Slice* largest_user_key); - -class Version { - public: - // Append to *iters a sequence of iterators that will - // yield the contents of this Version when merged together. - // REQUIRES: This version has been saved (see VersionSet::SaveTo) - void AddIterators(const ReadOptions&, std::vector<Iterator*>* iters); - - // Lookup the value for key. If found, store it in *val and - // return OK. Else return a non-OK status. Fills *stats. - // REQUIRES: lock is not held - struct GetStats { - FileMetaData* seek_file; - int seek_file_level; - }; - Status Get(const ReadOptions&, const LookupKey& key, std::string* val, - GetStats* stats); - - // Adds "stats" into the current state. Returns true if a new - // compaction may need to be triggered, false otherwise. - // REQUIRES: lock is held - bool UpdateStats(const GetStats& stats); - - // Reference count management (so Versions do not disappear out from - // under live iterators) - void Ref(); - void Unref(); - - void GetOverlappingInputs( - int level, - const InternalKey* begin, // NULL means before all keys - const InternalKey* end, // NULL means after all keys - std::vector<FileMetaData*>* inputs); - - // Returns true iff some file in the specified level overlaps - // some part of [*smallest_user_key,*largest_user_key]. - // smallest_user_key==NULL represents a key smaller than all keys in the DB. - // largest_user_key==NULL represents a key largest than all keys in the DB. - bool OverlapInLevel(int level, - const Slice* smallest_user_key, - const Slice* largest_user_key); - - // Return the level at which we should place a new memtable compaction - // result that covers the range [smallest_user_key,largest_user_key]. - int PickLevelForMemTableOutput(const Slice& smallest_user_key, - const Slice& largest_user_key); - - int NumFiles(int level) const { return files_[level].size(); } - - // Return a human readable string that describes this version's contents. - std::string DebugString() const; - - private: - friend class Compaction; - friend class VersionSet; - - class LevelFileNumIterator; - Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const; - - VersionSet* vset_; // VersionSet to which this Version belongs - Version* next_; // Next version in linked list - Version* prev_; // Previous version in linked list - int refs_; // Number of live refs to this version - - // List of files per level - std::vector<FileMetaData*> files_[config::kNumLevels]; - - // Next file to compact based on seek stats. - FileMetaData* file_to_compact_; - int file_to_compact_level_; - - // Level that should be compacted next and its compaction score. - // Score < 1 means compaction is not strictly needed. These fields - // are initialized by Finalize(). - double compaction_score_; - int compaction_level_; - - explicit Version(VersionSet* vset) - : vset_(vset), next_(this), prev_(this), refs_(0), - file_to_compact_(NULL), - file_to_compact_level_(-1), - compaction_score_(-1), - compaction_level_(-1) { - } - - ~Version(); - - // No copying allowed - Version(const Version&); - void operator=(const Version&); -}; - -class VersionSet { - public: - VersionSet(const std::string& dbname, - const Options* options, - TableCache* table_cache, - const InternalKeyComparator*); - ~VersionSet(); - - // Apply *edit to the current version to form a new descriptor that - // is both saved to persistent state and installed as the new - // current version. Will release *mu while actually writing to the file. - // REQUIRES: *mu is held on entry. - // REQUIRES: no other thread concurrently calls LogAndApply() - Status LogAndApply(VersionEdit* edit, port::Mutex* mu) - EXCLUSIVE_LOCKS_REQUIRED(mu); - - // Recover the last saved descriptor from persistent storage. - Status Recover(); - - // Return the current version. - Version* current() const { return current_; } - - // Return the current manifest file number - uint64_t ManifestFileNumber() const { return manifest_file_number_; } - - // Allocate and return a new file number - uint64_t NewFileNumber() { return next_file_number_++; } - - // Arrange to reuse "file_number" unless a newer file number has - // already been allocated. - // REQUIRES: "file_number" was returned by a call to NewFileNumber(). - void ReuseFileNumber(uint64_t file_number) { - if (next_file_number_ == file_number + 1) { - next_file_number_ = file_number; - } - } - - // Return the number of Table files at the specified level. - int NumLevelFiles(int level) const; - - // Return the combined file size of all files at the specified level. - int64_t NumLevelBytes(int level) const; - - // Return the last sequence number. - uint64_t LastSequence() const { return last_sequence_; } - - // Set the last sequence number to s. - void SetLastSequence(uint64_t s) { - assert(s >= last_sequence_); - last_sequence_ = s; - } - - // Mark the specified file number as used. - void MarkFileNumberUsed(uint64_t number); - - // Return the current log file number. - uint64_t LogNumber() const { return log_number_; } - - // Return the log file number for the log file that is currently - // being compacted, or zero if there is no such log file. - uint64_t PrevLogNumber() const { return prev_log_number_; } - - // Pick level and inputs for a new compaction. - // Returns NULL if there is no compaction to be done. - // Otherwise returns a pointer to a heap-allocated object that - // describes the compaction. Caller should delete the result. - Compaction* PickCompaction(); - - // Return a compaction object for compacting the range [begin,end] in - // the specified level. Returns NULL if there is nothing in that - // level that overlaps the specified range. Caller should delete - // the result. - Compaction* CompactRange( - int level, - const InternalKey* begin, - const InternalKey* end); - - // Return the maximum overlapping data (in bytes) at next level for any - // file at a level >= 1. - int64_t MaxNextLevelOverlappingBytes(); - - // Create an iterator that reads over the compaction inputs for "*c". - // The caller should delete the iterator when no longer needed. - Iterator* MakeInputIterator(Compaction* c); - - // Returns true iff some level needs a compaction. - bool NeedsCompaction() const { - Version* v = current_; - return (v->compaction_score_ >= 1) || (v->file_to_compact_ != NULL); - } - - // Add all files listed in any live version to *live. - // May also mutate some internal state. - void AddLiveFiles(std::set<uint64_t>* live); - - // Return the approximate offset in the database of the data for - // "key" as of version "v". - uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); - - // Return a human-readable short (single-line) summary of the number - // of files per level. Uses *scratch as backing store. - struct LevelSummaryStorage { - char buffer[100]; - }; - const char* LevelSummary(LevelSummaryStorage* scratch) const; - - private: - class Builder; - - friend class Compaction; - friend class Version; - - void Finalize(Version* v); - - void GetRange(const std::vector<FileMetaData*>& inputs, - InternalKey* smallest, - InternalKey* largest); - - void GetRange2(const std::vector<FileMetaData*>& inputs1, - const std::vector<FileMetaData*>& inputs2, - InternalKey* smallest, - InternalKey* largest); - - void SetupOtherInputs(Compaction* c); - - // Save current contents to *log - Status WriteSnapshot(log::Writer* log); - - void AppendVersion(Version* v); - - bool ManifestContains(const std::string& record) const; - - Env* const env_; - const std::string dbname_; - const Options* const options_; - TableCache* const table_cache_; - const InternalKeyComparator icmp_; - uint64_t next_file_number_; - uint64_t manifest_file_number_; - uint64_t last_sequence_; - uint64_t log_number_; - uint64_t prev_log_number_; // 0 or backing store for memtable being compacted - - // Opened lazily - WritableFile* descriptor_file_; - log::Writer* descriptor_log_; - Version dummy_versions_; // Head of circular doubly-linked list of versions. - Version* current_; // == dummy_versions_.prev_ - - // Per-level key at which the next compaction at that level should start. - // Either an empty string, or a valid InternalKey. - std::string compact_pointer_[config::kNumLevels]; - - // No copying allowed - VersionSet(const VersionSet&); - void operator=(const VersionSet&); -}; - -// A Compaction encapsulates information about a compaction. -class Compaction { - public: - ~Compaction(); - - // Return the level that is being compacted. Inputs from "level" - // and "level+1" will be merged to produce a set of "level+1" files. - int level() const { return level_; } - - // Return the object that holds the edits to the descriptor done - // by this compaction. - VersionEdit* edit() { return &edit_; } - - // "which" must be either 0 or 1 - int num_input_files(int which) const { return inputs_[which].size(); } - - // Return the ith input file at "level()+which" ("which" must be 0 or 1). - FileMetaData* input(int which, int i) const { return inputs_[which][i]; } - - // Maximum size of files to build during this compaction. - uint64_t MaxOutputFileSize() const { return max_output_file_size_; } - - // Is this a trivial compaction that can be implemented by just - // moving a single input file to the next level (no merging or splitting) - bool IsTrivialMove() const; - - // Add all inputs to this compaction as delete operations to *edit. - void AddInputDeletions(VersionEdit* edit); - - // Returns true if the information we have available guarantees that - // the compaction is producing data in "level+1" for which no data exists - // in levels greater than "level+1". - bool IsBaseLevelForKey(const Slice& user_key); - - // Returns true iff we should stop building the current output - // before processing "internal_key". - bool ShouldStopBefore(const Slice& internal_key); - - // Release the input version for the compaction, once the compaction - // is successful. - void ReleaseInputs(); - - private: - friend class Version; - friend class VersionSet; - - explicit Compaction(int level); - - int level_; - uint64_t max_output_file_size_; - Version* input_version_; - VersionEdit edit_; - - // Each compaction reads inputs from "level_" and "level_+1" - std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs - - // State used to check for number of of overlapping grandparent files - // (parent == level_ + 1, grandparent == level_ + 2) - std::vector<FileMetaData*> grandparents_; - size_t grandparent_index_; // Index in grandparent_starts_ - bool seen_key_; // Some output key has been seen - int64_t overlapped_bytes_; // Bytes of overlap between current output - // and grandparent files - - // State for implementing IsBaseLevelForKey - - // level_ptrs_ holds indices into input_version_->levels_: our state - // is that we are positioned at one of the file ranges for each - // higher level than the ones involved in this compaction (i.e. for - // all L >= level_ + 2). - size_t level_ptrs_[config::kNumLevels]; -}; - -} // namespace leveldb - -#endif // STORAGE_LEVELDB_DB_VERSION_SET_H_ diff --git a/src/leveldb/db/version_set_test.cc b/src/leveldb/db/version_set_test.cc deleted file mode 100644 index 501e34d133..0000000000 --- a/src/leveldb/db/version_set_test.cc +++ /dev/null @@ -1,179 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "db/version_set.h" -#include "util/logging.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace leveldb { - -class FindFileTest { - public: - std::vector<FileMetaData*> files_; - bool disjoint_sorted_files_; - - FindFileTest() : disjoint_sorted_files_(true) { } - - ~FindFileTest() { - for (int i = 0; i < files_.size(); i++) { - delete files_[i]; - } - } - - void Add(const char* smallest, const char* largest, - SequenceNumber smallest_seq = 100, - SequenceNumber largest_seq = 100) { - FileMetaData* f = new FileMetaData; - f->number = files_.size() + 1; - f->smallest = InternalKey(smallest, smallest_seq, kTypeValue); - f->largest = InternalKey(largest, largest_seq, kTypeValue); - files_.push_back(f); - } - - int Find(const char* key) { - InternalKey target(key, 100, kTypeValue); - InternalKeyComparator cmp(BytewiseComparator()); - return FindFile(cmp, files_, target.Encode()); - } - - bool Overlaps(const char* smallest, const char* largest) { - InternalKeyComparator cmp(BytewiseComparator()); - Slice s(smallest != NULL ? smallest : ""); - Slice l(largest != NULL ? largest : ""); - return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_, - (smallest != NULL ? &s : NULL), - (largest != NULL ? &l : NULL)); - } -}; - -TEST(FindFileTest, Empty) { - ASSERT_EQ(0, Find("foo")); - ASSERT_TRUE(! Overlaps("a", "z")); - ASSERT_TRUE(! Overlaps(NULL, "z")); - ASSERT_TRUE(! Overlaps("a", NULL)); - ASSERT_TRUE(! Overlaps(NULL, NULL)); -} - -TEST(FindFileTest, Single) { - Add("p", "q"); - ASSERT_EQ(0, Find("a")); - ASSERT_EQ(0, Find("p")); - ASSERT_EQ(0, Find("p1")); - ASSERT_EQ(0, Find("q")); - ASSERT_EQ(1, Find("q1")); - ASSERT_EQ(1, Find("z")); - - ASSERT_TRUE(! Overlaps("a", "b")); - ASSERT_TRUE(! Overlaps("z1", "z2")); - ASSERT_TRUE(Overlaps("a", "p")); - ASSERT_TRUE(Overlaps("a", "q")); - ASSERT_TRUE(Overlaps("a", "z")); - ASSERT_TRUE(Overlaps("p", "p1")); - ASSERT_TRUE(Overlaps("p", "q")); - ASSERT_TRUE(Overlaps("p", "z")); - ASSERT_TRUE(Overlaps("p1", "p2")); - ASSERT_TRUE(Overlaps("p1", "z")); - ASSERT_TRUE(Overlaps("q", "q")); - ASSERT_TRUE(Overlaps("q", "q1")); - - ASSERT_TRUE(! Overlaps(NULL, "j")); - ASSERT_TRUE(! Overlaps("r", NULL)); - ASSERT_TRUE(Overlaps(NULL, "p")); - ASSERT_TRUE(Overlaps(NULL, "p1")); - ASSERT_TRUE(Overlaps("q", NULL)); - ASSERT_TRUE(Overlaps(NULL, NULL)); -} - - -TEST(FindFileTest, Multiple) { - Add("150", "200"); - Add("200", "250"); - Add("300", "350"); - Add("400", "450"); - ASSERT_EQ(0, Find("100")); - ASSERT_EQ(0, Find("150")); - ASSERT_EQ(0, Find("151")); - ASSERT_EQ(0, Find("199")); - ASSERT_EQ(0, Find("200")); - ASSERT_EQ(1, Find("201")); - ASSERT_EQ(1, Find("249")); - ASSERT_EQ(1, Find("250")); - ASSERT_EQ(2, Find("251")); - ASSERT_EQ(2, Find("299")); - ASSERT_EQ(2, Find("300")); - ASSERT_EQ(2, Find("349")); - ASSERT_EQ(2, Find("350")); - ASSERT_EQ(3, Find("351")); - ASSERT_EQ(3, Find("400")); - ASSERT_EQ(3, Find("450")); - ASSERT_EQ(4, Find("451")); - - ASSERT_TRUE(! Overlaps("100", "149")); - ASSERT_TRUE(! Overlaps("251", "299")); - ASSERT_TRUE(! Overlaps("451", "500")); - ASSERT_TRUE(! Overlaps("351", "399")); - - ASSERT_TRUE(Overlaps("100", "150")); - ASSERT_TRUE(Overlaps("100", "200")); - ASSERT_TRUE(Overlaps("100", "300")); - ASSERT_TRUE(Overlaps("100", "400")); - ASSERT_TRUE(Overlaps("100", "500")); - ASSERT_TRUE(Overlaps("375", "400")); - ASSERT_TRUE(Overlaps("450", "450")); - ASSERT_TRUE(Overlaps("450", "500")); -} - -TEST(FindFileTest, MultipleNullBoundaries) { - Add("150", "200"); - Add("200", "250"); - Add("300", "350"); - Add("400", "450"); - ASSERT_TRUE(! Overlaps(NULL, "149")); - ASSERT_TRUE(! Overlaps("451", NULL)); - ASSERT_TRUE(Overlaps(NULL, NULL)); - ASSERT_TRUE(Overlaps(NULL, "150")); - ASSERT_TRUE(Overlaps(NULL, "199")); - ASSERT_TRUE(Overlaps(NULL, "200")); - ASSERT_TRUE(Overlaps(NULL, "201")); - ASSERT_TRUE(Overlaps(NULL, "400")); - ASSERT_TRUE(Overlaps(NULL, "800")); - ASSERT_TRUE(Overlaps("100", NULL)); - ASSERT_TRUE(Overlaps("200", NULL)); - ASSERT_TRUE(Overlaps("449", NULL)); - ASSERT_TRUE(Overlaps("450", NULL)); -} - -TEST(FindFileTest, OverlapSequenceChecks) { - Add("200", "200", 5000, 3000); - ASSERT_TRUE(! Overlaps("199", "199")); - ASSERT_TRUE(! Overlaps("201", "300")); - ASSERT_TRUE(Overlaps("200", "200")); - ASSERT_TRUE(Overlaps("190", "200")); - ASSERT_TRUE(Overlaps("200", "210")); -} - -TEST(FindFileTest, OverlappingFiles) { - Add("150", "600"); - Add("400", "500"); - disjoint_sorted_files_ = false; - ASSERT_TRUE(! Overlaps("100", "149")); - ASSERT_TRUE(! Overlaps("601", "700")); - ASSERT_TRUE(Overlaps("100", "150")); - ASSERT_TRUE(Overlaps("100", "200")); - ASSERT_TRUE(Overlaps("100", "300")); - ASSERT_TRUE(Overlaps("100", "400")); - ASSERT_TRUE(Overlaps("100", "500")); - ASSERT_TRUE(Overlaps("375", "400")); - ASSERT_TRUE(Overlaps("450", "450")); - ASSERT_TRUE(Overlaps("450", "500")); - ASSERT_TRUE(Overlaps("450", "700")); - ASSERT_TRUE(Overlaps("600", "700")); -} - -} // namespace leveldb - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} diff --git a/src/leveldb/db/write_batch.cc b/src/leveldb/db/write_batch.cc deleted file mode 100644 index 33f4a4257e..0000000000 --- a/src/leveldb/db/write_batch.cc +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// WriteBatch::rep_ := -// sequence: fixed64 -// count: fixed32 -// data: record[count] -// record := -// kTypeValue varstring varstring | -// kTypeDeletion varstring -// varstring := -// len: varint32 -// data: uint8[len] - -#include "leveldb/write_batch.h" - -#include "leveldb/db.h" -#include "db/dbformat.h" -#include "db/memtable.h" -#include "db/write_batch_internal.h" -#include "util/coding.h" - -namespace leveldb { - -// WriteBatch header has an 8-byte sequence number followed by a 4-byte count. -static const size_t kHeader = 12; - -WriteBatch::WriteBatch() { - Clear(); -} - -WriteBatch::~WriteBatch() { } - -WriteBatch::Handler::~Handler() { } - -void WriteBatch::Clear() { - rep_.clear(); - rep_.resize(kHeader); -} - -Status WriteBatch::Iterate(Handler* handler) const { - Slice input(rep_); - if (input.size() < kHeader) { - return Status::Corruption("malformed WriteBatch (too small)"); - } - - input.remove_prefix(kHeader); - Slice key, value; - int found = 0; - while (!input.empty()) { - found++; - char tag = input[0]; - input.remove_prefix(1); - switch (tag) { - case kTypeValue: - if (GetLengthPrefixedSlice(&input, &key) && - GetLengthPrefixedSlice(&input, &value)) { - handler->Put(key, value); - } else { - return Status::Corruption("bad WriteBatch Put"); - } - break; - case kTypeDeletion: - if (GetLengthPrefixedSlice(&input, &key)) { - handler->Delete(key); - } else { - return Status::Corruption("bad WriteBatch Delete"); - } - break; - default: - return Status::Corruption("unknown WriteBatch tag"); - } - } - if (found != WriteBatchInternal::Count(this)) { - return Status::Corruption("WriteBatch has wrong count"); - } else { - return Status::OK(); - } -} - -int WriteBatchInternal::Count(const WriteBatch* b) { - return DecodeFixed32(b->rep_.data() + 8); -} - -void WriteBatchInternal::SetCount(WriteBatch* b, int n) { - EncodeFixed32(&b->rep_[8], n); -} - -SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) { - return SequenceNumber(DecodeFixed64(b->rep_.data())); -} - -void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) { - EncodeFixed64(&b->rep_[0], seq); -} - -void WriteBatch::Put(const Slice& key, const Slice& value) { - WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); - rep_.push_back(static_cast<char>(kTypeValue)); - PutLengthPrefixedSlice(&rep_, key); - PutLengthPrefixedSlice(&rep_, value); -} - -void WriteBatch::Delete(const Slice& key) { - WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1); - rep_.push_back(static_cast<char>(kTypeDeletion)); - PutLengthPrefixedSlice(&rep_, key); -} - -namespace { -class MemTableInserter : public WriteBatch::Handler { - public: - SequenceNumber sequence_; - MemTable* mem_; - - virtual void Put(const Slice& key, const Slice& value) { - mem_->Add(sequence_, kTypeValue, key, value); - sequence_++; - } - virtual void Delete(const Slice& key) { - mem_->Add(sequence_, kTypeDeletion, key, Slice()); - sequence_++; - } -}; -} // namespace - -Status WriteBatchInternal::InsertInto(const WriteBatch* b, - MemTable* memtable) { - MemTableInserter inserter; - inserter.sequence_ = WriteBatchInternal::Sequence(b); - inserter.mem_ = memtable; - return b->Iterate(&inserter); -} - -void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) { - assert(contents.size() >= kHeader); - b->rep_.assign(contents.data(), contents.size()); -} - -void WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src) { - SetCount(dst, Count(dst) + Count(src)); - assert(src->rep_.size() >= kHeader); - dst->rep_.append(src->rep_.data() + kHeader, src->rep_.size() - kHeader); -} - -} // namespace leveldb diff --git a/src/leveldb/db/write_batch_internal.h b/src/leveldb/db/write_batch_internal.h deleted file mode 100644 index 4423a7f318..0000000000 --- a/src/leveldb/db/write_batch_internal.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ -#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ - -#include "leveldb/write_batch.h" - -namespace leveldb { - -class MemTable; - -// WriteBatchInternal provides static methods for manipulating a -// WriteBatch that we don't want in the public WriteBatch interface. -class WriteBatchInternal { - public: - // Return the number of entries in the batch. - static int Count(const WriteBatch* batch); - - // Set the count for the number of entries in the batch. - static void SetCount(WriteBatch* batch, int n); - - // Return the seqeunce number for the start of this batch. - static SequenceNumber Sequence(const WriteBatch* batch); - - // Store the specified number as the seqeunce number for the start of - // this batch. - static void SetSequence(WriteBatch* batch, SequenceNumber seq); - - static Slice Contents(const WriteBatch* batch) { - return Slice(batch->rep_); - } - - static size_t ByteSize(const WriteBatch* batch) { - return batch->rep_.size(); - } - - static void SetContents(WriteBatch* batch, const Slice& contents); - - static Status InsertInto(const WriteBatch* batch, MemTable* memtable); - - static void Append(WriteBatch* dst, const WriteBatch* src); -}; - -} // namespace leveldb - - -#endif // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_ diff --git a/src/leveldb/db/write_batch_test.cc b/src/leveldb/db/write_batch_test.cc deleted file mode 100644 index 9064e3d85e..0000000000 --- a/src/leveldb/db/write_batch_test.cc +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "leveldb/db.h" - -#include "db/memtable.h" -#include "db/write_batch_internal.h" -#include "leveldb/env.h" -#include "util/logging.h" -#include "util/testharness.h" - -namespace leveldb { - -static std::string PrintContents(WriteBatch* b) { - InternalKeyComparator cmp(BytewiseComparator()); - MemTable* mem = new MemTable(cmp); - mem->Ref(); - std::string state; - Status s = WriteBatchInternal::InsertInto(b, mem); - int count = 0; - Iterator* iter = mem->NewIterator(); - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ParsedInternalKey ikey; - ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey)); - switch (ikey.type) { - case kTypeValue: - state.append("Put("); - state.append(ikey.user_key.ToString()); - state.append(", "); - state.append(iter->value().ToString()); - state.append(")"); - count++; - break; - case kTypeDeletion: - state.append("Delete("); - state.append(ikey.user_key.ToString()); - state.append(")"); - count++; - break; - } - state.append("@"); - state.append(NumberToString(ikey.sequence)); - } - delete iter; - if (!s.ok()) { - state.append("ParseError()"); - } else if (count != WriteBatchInternal::Count(b)) { - state.append("CountMismatch()"); - } - mem->Unref(); - return state; -} - -class WriteBatchTest { }; - -TEST(WriteBatchTest, Empty) { - WriteBatch batch; - ASSERT_EQ("", PrintContents(&batch)); - ASSERT_EQ(0, WriteBatchInternal::Count(&batch)); -} - -TEST(WriteBatchTest, Multiple) { - WriteBatch batch; - batch.Put(Slice("foo"), Slice("bar")); - batch.Delete(Slice("box")); - batch.Put(Slice("baz"), Slice("boo")); - WriteBatchInternal::SetSequence(&batch, 100); - ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch)); - ASSERT_EQ(3, WriteBatchInternal::Count(&batch)); - ASSERT_EQ("Put(baz, boo)@102" - "Delete(box)@101" - "Put(foo, bar)@100", - PrintContents(&batch)); -} - -TEST(WriteBatchTest, Corruption) { - WriteBatch batch; - batch.Put(Slice("foo"), Slice("bar")); - batch.Delete(Slice("box")); - WriteBatchInternal::SetSequence(&batch, 200); - Slice contents = WriteBatchInternal::Contents(&batch); - WriteBatchInternal::SetContents(&batch, - Slice(contents.data(),contents.size()-1)); - ASSERT_EQ("Put(foo, bar)@200" - "ParseError()", - PrintContents(&batch)); -} - -TEST(WriteBatchTest, Append) { - WriteBatch b1, b2; - WriteBatchInternal::SetSequence(&b1, 200); - WriteBatchInternal::SetSequence(&b2, 300); - WriteBatchInternal::Append(&b1, &b2); - ASSERT_EQ("", - PrintContents(&b1)); - b2.Put("a", "va"); - WriteBatchInternal::Append(&b1, &b2); - ASSERT_EQ("Put(a, va)@200", - PrintContents(&b1)); - b2.Clear(); - b2.Put("b", "vb"); - WriteBatchInternal::Append(&b1, &b2); - ASSERT_EQ("Put(a, va)@200" - "Put(b, vb)@201", - PrintContents(&b1)); - b2.Delete("foo"); - WriteBatchInternal::Append(&b1, &b2); - ASSERT_EQ("Put(a, va)@200" - "Put(b, vb)@202" - "Put(b, vb)@201" - "Delete(foo)@203", - PrintContents(&b1)); -} - -} // namespace leveldb - -int main(int argc, char** argv) { - return leveldb::test::RunAllTests(); -} |