aboutsummaryrefslogtreecommitdiff
path: root/src/leveldb/db
diff options
context:
space:
mode:
Diffstat (limited to 'src/leveldb/db')
-rw-r--r--src/leveldb/db/corruption_test.cc2
-rw-r--r--src/leveldb/db/db_bench.cc24
-rw-r--r--src/leveldb/db/db_impl.cc194
-rw-r--r--src/leveldb/db/db_impl.h8
-rw-r--r--src/leveldb/db/db_test.cc32
-rw-r--r--src/leveldb/db/fault_injection_test.cc554
-rw-r--r--src/leveldb/db/leveldbutil.cc (renamed from src/leveldb/db/leveldb_main.cc)0
-rw-r--r--src/leveldb/db/log_reader.cc22
-rw-r--r--src/leveldb/db/log_reader.h5
-rw-r--r--src/leveldb/db/log_test.cc101
-rw-r--r--src/leveldb/db/log_writer.cc17
-rw-r--r--src/leveldb/db/log_writer.h6
-rw-r--r--src/leveldb/db/memtable.h5
-rw-r--r--src/leveldb/db/recovery_test.cc324
-rw-r--r--src/leveldb/db/skiplist.h8
-rw-r--r--src/leveldb/db/skiplist_test.cc2
-rw-r--r--src/leveldb/db/snapshot.h1
-rw-r--r--src/leveldb/db/version_set.cc40
-rw-r--r--src/leveldb/db/version_set.h4
-rw-r--r--src/leveldb/db/write_batch_internal.h1
20 files changed, 1236 insertions, 114 deletions
diff --git a/src/leveldb/db/corruption_test.cc b/src/leveldb/db/corruption_test.cc
index 96afc68913..37a484d25f 100644
--- a/src/leveldb/db/corruption_test.cc
+++ b/src/leveldb/db/corruption_test.cc
@@ -36,7 +36,7 @@ class CorruptionTest {
tiny_cache_ = NewLRUCache(100);
options_.env = &env_;
options_.block_cache = tiny_cache_;
- dbname_ = test::TmpDir() + "/db_test";
+ dbname_ = test::TmpDir() + "/corruption_test";
DestroyDB(dbname_, options_);
db_ = NULL;
diff --git a/src/leveldb/db/db_bench.cc b/src/leveldb/db/db_bench.cc
index 705a170aae..7a0f5e08cd 100644
--- a/src/leveldb/db/db_bench.cc
+++ b/src/leveldb/db/db_bench.cc
@@ -33,6 +33,7 @@
// readmissing -- read N missing keys in random order
// readhot -- read N times in random order from 1% section of DB
// seekrandom -- N random seeks
+// open -- cost of opening a DB
// crc32c -- repeated crc32c of 4K of data
// acquireload -- load N*1000 times
// Meta operations:
@@ -99,6 +100,9 @@ static int FLAGS_bloom_bits = -1;
// benchmark will fail.
static bool FLAGS_use_existing_db = false;
+// If true, reuse existing log/MANIFEST files when re-opening a database.
+static bool FLAGS_reuse_logs = false;
+
// Use the db with the following name.
static const char* FLAGS_db = NULL;
@@ -138,6 +142,7 @@ class RandomGenerator {
}
};
+#if defined(__linux)
static Slice TrimSpace(Slice s) {
size_t start = 0;
while (start < s.size() && isspace(s[start])) {
@@ -149,6 +154,7 @@ static Slice TrimSpace(Slice s) {
}
return Slice(s.data() + start, limit - start);
}
+#endif
static void AppendWithSpace(std::string* str, Slice msg) {
if (msg.empty()) return;
@@ -442,7 +448,11 @@ class Benchmark {
bool fresh_db = false;
int num_threads = FLAGS_threads;
- if (name == Slice("fillseq")) {
+ if (name == Slice("open")) {
+ method = &Benchmark::OpenBench;
+ num_ /= 10000;
+ if (num_ < 1) num_ = 1;
+ } else if (name == Slice("fillseq")) {
fresh_db = true;
method = &Benchmark::WriteSeq;
} else if (name == Slice("fillbatch")) {
@@ -695,6 +705,7 @@ class Benchmark {
options.write_buffer_size = FLAGS_write_buffer_size;
options.max_open_files = FLAGS_open_files;
options.filter_policy = filter_policy_;
+ options.reuse_logs = FLAGS_reuse_logs;
Status s = DB::Open(options, FLAGS_db, &db_);
if (!s.ok()) {
fprintf(stderr, "open error: %s\n", s.ToString().c_str());
@@ -702,6 +713,14 @@ class Benchmark {
}
}
+ // Benchmark body for the "open" workload: closes and reopens the database
+ // num_ times, recording one finished operation per reopen.
+ void OpenBench(ThreadState* thread) {
+ for (int i = 0; i < num_; i++) {
+ // Release the current handle; Open() stores the new one in db_.
+ delete db_;
+ Open();
+ thread->stats.FinishedSingleOp();
+ }
+ }
+
void WriteSeq(ThreadState* thread) {
DoWrite(thread, true);
}
@@ -941,6 +960,9 @@ int main(int argc, char** argv) {
} else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 &&
(n == 0 || n == 1)) {
FLAGS_use_existing_db = n;
+ } else if (sscanf(argv[i], "--reuse_logs=%d%c", &n, &junk) == 1 &&
+ (n == 0 || n == 1)) {
+ FLAGS_reuse_logs = n;
} else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
FLAGS_num = n;
} else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) {
diff --git a/src/leveldb/db/db_impl.cc b/src/leveldb/db/db_impl.cc
index 49b95953b4..60f4e66e55 100644
--- a/src/leveldb/db/db_impl.cc
+++ b/src/leveldb/db/db_impl.cc
@@ -125,7 +125,7 @@ DBImpl::DBImpl(const Options& raw_options, const std::string& dbname)
db_lock_(NULL),
shutting_down_(NULL),
bg_cv_(&mutex_),
- mem_(new MemTable(internal_comparator_)),
+ mem_(NULL),
imm_(NULL),
logfile_(NULL),
logfile_number_(0),
@@ -134,7 +134,6 @@ DBImpl::DBImpl(const Options& raw_options, const std::string& dbname)
tmp_batch_(new WriteBatch),
bg_compaction_scheduled_(false),
manual_compaction_(NULL) {
- mem_->Ref();
has_imm_.Release_Store(NULL);
// Reserve ten files or so for other uses and give the rest to TableCache.
@@ -271,7 +270,7 @@ void DBImpl::DeleteObsoleteFiles() {
}
}
-Status DBImpl::Recover(VersionEdit* edit) {
+Status DBImpl::Recover(VersionEdit* edit, bool *save_manifest) {
mutex_.AssertHeld();
// Ignore error from CreateDir since the creation of the DB is
@@ -301,66 +300,69 @@ Status DBImpl::Recover(VersionEdit* edit) {
}
}
- s = versions_->Recover();
- if (s.ok()) {
- SequenceNumber max_sequence(0);
-
- // Recover from all newer log files than the ones named in the
- // descriptor (new log files may have been added by the previous
- // incarnation without registering them in the descriptor).
- //
- // Note that PrevLogNumber() is no longer used, but we pay
- // attention to it in case we are recovering a database
- // produced by an older version of leveldb.
- const uint64_t min_log = versions_->LogNumber();
- const uint64_t prev_log = versions_->PrevLogNumber();
- std::vector<std::string> filenames;
- s = env_->GetChildren(dbname_, &filenames);
+ s = versions_->Recover(save_manifest);
+ if (!s.ok()) {
+ return s;
+ }
+ SequenceNumber max_sequence(0);
+
+ // Recover from all newer log files than the ones named in the
+ // descriptor (new log files may have been added by the previous
+ // incarnation without registering them in the descriptor).
+ //
+ // Note that PrevLogNumber() is no longer used, but we pay
+ // attention to it in case we are recovering a database
+ // produced by an older version of leveldb.
+ const uint64_t min_log = versions_->LogNumber();
+ const uint64_t prev_log = versions_->PrevLogNumber();
+ std::vector<std::string> filenames;
+ s = env_->GetChildren(dbname_, &filenames);
+ if (!s.ok()) {
+ return s;
+ }
+ std::set<uint64_t> expected;
+ versions_->AddLiveFiles(&expected);
+ uint64_t number;
+ FileType type;
+ std::vector<uint64_t> logs;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type)) {
+ expected.erase(number);
+ if (type == kLogFile && ((number >= min_log) || (number == prev_log)))
+ logs.push_back(number);
+ }
+ }
+ if (!expected.empty()) {
+ char buf[50];
+ snprintf(buf, sizeof(buf), "%d missing files; e.g.",
+ static_cast<int>(expected.size()));
+ return Status::Corruption(buf, TableFileName(dbname_, *(expected.begin())));
+ }
+
+ // Recover in the order in which the logs were generated
+ std::sort(logs.begin(), logs.end());
+ for (size_t i = 0; i < logs.size(); i++) {
+ s = RecoverLogFile(logs[i], (i == logs.size() - 1), save_manifest, edit,
+ &max_sequence);
if (!s.ok()) {
return s;
}
- std::set<uint64_t> expected;
- versions_->AddLiveFiles(&expected);
- uint64_t number;
- FileType type;
- std::vector<uint64_t> logs;
- for (size_t i = 0; i < filenames.size(); i++) {
- if (ParseFileName(filenames[i], &number, &type)) {
- expected.erase(number);
- if (type == kLogFile && ((number >= min_log) || (number == prev_log)))
- logs.push_back(number);
- }
- }
- if (!expected.empty()) {
- char buf[50];
- snprintf(buf, sizeof(buf), "%d missing files; e.g.",
- static_cast<int>(expected.size()));
- return Status::Corruption(buf, TableFileName(dbname_, *(expected.begin())));
- }
-
- // Recover in the order in which the logs were generated
- std::sort(logs.begin(), logs.end());
- for (size_t i = 0; i < logs.size(); i++) {
- s = RecoverLogFile(logs[i], edit, &max_sequence);
- // The previous incarnation may not have written any MANIFEST
- // records after allocating this log number. So we manually
- // update the file number allocation counter in VersionSet.
- versions_->MarkFileNumberUsed(logs[i]);
- }
+ // The previous incarnation may not have written any MANIFEST
+ // records after allocating this log number. So we manually
+ // update the file number allocation counter in VersionSet.
+ versions_->MarkFileNumberUsed(logs[i]);
+ }
- if (s.ok()) {
- if (versions_->LastSequence() < max_sequence) {
- versions_->SetLastSequence(max_sequence);
- }
- }
+ if (versions_->LastSequence() < max_sequence) {
+ versions_->SetLastSequence(max_sequence);
}
- return s;
+ return Status::OK();
}
-Status DBImpl::RecoverLogFile(uint64_t log_number,
- VersionEdit* edit,
+Status DBImpl::RecoverLogFile(uint64_t log_number, bool last_log,
+ bool* save_manifest, VersionEdit* edit,
SequenceNumber* max_sequence) {
struct LogReporter : public log::Reader::Reporter {
Env* env;
@@ -405,6 +407,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
std::string scratch;
Slice record;
WriteBatch batch;
+ int compactions = 0;
MemTable* mem = NULL;
while (reader.ReadRecord(&record, &scratch) &&
status.ok()) {
@@ -432,25 +435,52 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
}
if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) {
+ compactions++;
+ *save_manifest = true;
status = WriteLevel0Table(mem, edit, NULL);
+ mem->Unref();
+ mem = NULL;
if (!status.ok()) {
// Reflect errors immediately so that conditions like full
// file-systems cause the DB::Open() to fail.
break;
}
- mem->Unref();
- mem = NULL;
}
}
- if (status.ok() && mem != NULL) {
- status = WriteLevel0Table(mem, edit, NULL);
- // Reflect errors immediately so that conditions like full
- // file-systems cause the DB::Open() to fail.
+ delete file;
+
+ // See if we should keep reusing the last log file.
+ if (status.ok() && options_.reuse_logs && last_log && compactions == 0) {
+ assert(logfile_ == NULL);
+ assert(log_ == NULL);
+ assert(mem_ == NULL);
+ uint64_t lfile_size;
+ if (env_->GetFileSize(fname, &lfile_size).ok() &&
+ env_->NewAppendableFile(fname, &logfile_).ok()) {
+ Log(options_.info_log, "Reusing old log %s \n", fname.c_str());
+ log_ = new log::Writer(logfile_, lfile_size);
+ logfile_number_ = log_number;
+ if (mem != NULL) {
+ mem_ = mem;
+ mem = NULL;
+ } else {
+ // mem can be NULL if lognum exists but was empty.
+ mem_ = new MemTable(internal_comparator_);
+ mem_->Ref();
+ }
+ }
+ }
+
+ if (mem != NULL) {
+ // mem did not get reused; compact it.
+ if (status.ok()) {
+ *save_manifest = true;
+ status = WriteLevel0Table(mem, edit, NULL);
+ }
+ mem->Unref();
}
- if (mem != NULL) mem->Unref();
- delete file;
return status;
}
@@ -821,8 +851,9 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
delete iter;
if (s.ok()) {
Log(options_.info_log,
- "Generated table #%llu: %lld keys, %lld bytes",
+ "Generated table #%llu@%d: %lld keys, %lld bytes",
(unsigned long long) output_number,
+ compact->compaction->level(),
(unsigned long long) current_entries,
(unsigned long long) current_bytes);
}
@@ -1395,6 +1426,19 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) {
} else if (in == "sstables") {
*value = versions_->current()->DebugString();
return true;
+ } else if (in == "approximate-memory-usage") {
+ size_t total_usage = options_.block_cache->TotalCharge();
+ if (mem_) {
+ total_usage += mem_->ApproximateMemoryUsage();
+ }
+ if (imm_) {
+ total_usage += imm_->ApproximateMemoryUsage();
+ }
+ char buf[50];
+ snprintf(buf, sizeof(buf), "%llu",
+ static_cast<unsigned long long>(total_usage));
+ value->append(buf);
+ return true;
}
return false;
@@ -1449,8 +1493,11 @@ Status DB::Open(const Options& options, const std::string& dbname,
DBImpl* impl = new DBImpl(options, dbname);
impl->mutex_.Lock();
VersionEdit edit;
- Status s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists
- if (s.ok()) {
+ // Recover handles create_if_missing, error_if_exists
+ bool save_manifest = false;
+ Status s = impl->Recover(&edit, &save_manifest);
+ if (s.ok() && impl->mem_ == NULL) {
+ // Create new log and a corresponding memtable.
uint64_t new_log_number = impl->versions_->NewFileNumber();
WritableFile* lfile;
s = options.env->NewWritableFile(LogFileName(dbname, new_log_number),
@@ -1460,15 +1507,22 @@ Status DB::Open(const Options& options, const std::string& dbname,
impl->logfile_ = lfile;
impl->logfile_number_ = new_log_number;
impl->log_ = new log::Writer(lfile);
- s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
- }
- if (s.ok()) {
- impl->DeleteObsoleteFiles();
- impl->MaybeScheduleCompaction();
+ impl->mem_ = new MemTable(impl->internal_comparator_);
+ impl->mem_->Ref();
}
}
+ if (s.ok() && save_manifest) {
+ edit.SetPrevLogNumber(0); // No older logs needed after recovery.
+ edit.SetLogNumber(impl->logfile_number_);
+ s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
+ }
+ if (s.ok()) {
+ impl->DeleteObsoleteFiles();
+ impl->MaybeScheduleCompaction();
+ }
impl->mutex_.Unlock();
if (s.ok()) {
+ assert(impl->mem_ != NULL);
*dbptr = impl;
} else {
delete impl;
diff --git a/src/leveldb/db/db_impl.h b/src/leveldb/db/db_impl.h
index cfc998164a..8ff323e728 100644
--- a/src/leveldb/db/db_impl.h
+++ b/src/leveldb/db/db_impl.h
@@ -78,7 +78,8 @@ class DBImpl : public DB {
// Recover the descriptor from persistent storage. May do a significant
// amount of work to recover recently logged updates. Any changes to
// be made to the descriptor are added to *edit.
- Status Recover(VersionEdit* edit) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ Status Recover(VersionEdit* edit, bool* save_manifest)
+ EXCLUSIVE_LOCKS_REQUIRED(mutex_);
void MaybeIgnoreError(Status* s) const;
@@ -90,9 +91,8 @@ class DBImpl : public DB {
// Errors are recorded in bg_error_.
void CompactMemTable() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
- Status RecoverLogFile(uint64_t log_number,
- VersionEdit* edit,
- SequenceNumber* max_sequence)
+ Status RecoverLogFile(uint64_t log_number, bool last_log, bool* save_manifest,
+ VersionEdit* edit, SequenceNumber* max_sequence)
EXCLUSIVE_LOCKS_REQUIRED(mutex_);
Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base)
diff --git a/src/leveldb/db/db_test.cc b/src/leveldb/db/db_test.cc
index 0fed9137d5..a0b08bc19c 100644
--- a/src/leveldb/db/db_test.cc
+++ b/src/leveldb/db/db_test.cc
@@ -193,6 +193,7 @@ class DBTest {
// Sequence of option configurations to try
enum OptionConfig {
kDefault,
+ kReuse,
kFilter,
kUncompressed,
kEnd
@@ -237,7 +238,11 @@ class DBTest {
// Return the current option configuration.
Options CurrentOptions() {
Options options;
+ options.reuse_logs = false;
switch (option_config_) {
+ case kReuse:
+ options.reuse_logs = true;
+ break;
case kFilter:
options.filter_policy = filter_policy_;
break;
@@ -558,6 +563,17 @@ TEST(DBTest, GetFromVersions) {
} while (ChangeOptions());
}
+// Checks that the "leveldb.approximate-memory-usage" property is reported
+// and, after a single small Put, parses to a value strictly between 0 and
+// 5 MiB. Repeated for every option configuration ChangeOptions() cycles to.
+TEST(DBTest, GetMemUsage) {
+ do {
+ ASSERT_OK(Put("foo", "v1"));
+ std::string val;
+ ASSERT_TRUE(db_->GetProperty("leveldb.approximate-memory-usage", &val));
+ // The property is a decimal string; the bounds below keep it well within int.
+ int mem_usage = atoi(val.c_str());
+ ASSERT_GT(mem_usage, 0);
+ ASSERT_LT(mem_usage, 5*1024*1024);
+ } while (ChangeOptions());
+}
+
TEST(DBTest, GetSnapshot) {
do {
// Try with both a short key and a long key
@@ -1080,6 +1096,14 @@ TEST(DBTest, ApproximateSizes) {
// 0 because GetApproximateSizes() does not account for memtable space
ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
+ if (options.reuse_logs) {
+ // Recovery will reuse memtable, and GetApproximateSizes() does not
+ // account for memtable usage;
+ Reopen(&options);
+ ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
+ continue;
+ }
+
// Check sizes across recovery by reopening a few times
for (int run = 0; run < 3; run++) {
Reopen(&options);
@@ -1123,6 +1147,11 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000)));
ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000)));
+ if (options.reuse_logs) {
+ // Need to force a memtable compaction since recovery does not do so.
+ ASSERT_OK(dbfull()->TEST_CompactMemTable());
+ }
+
// Check sizes across recovery by reopening a few times
for (int run = 0; run < 3; run++) {
Reopen(&options);
@@ -2084,7 +2113,8 @@ void BM_LogAndApply(int iters, int num_base_files) {
InternalKeyComparator cmp(BytewiseComparator());
Options options;
VersionSet vset(dbname, &options, NULL, &cmp);
- ASSERT_OK(vset.Recover());
+ bool save_manifest;
+ ASSERT_OK(vset.Recover(&save_manifest));
VersionEdit vbase;
uint64_t fnum = 1;
for (int i = 0; i < num_base_files; i++) {
diff --git a/src/leveldb/db/fault_injection_test.cc b/src/leveldb/db/fault_injection_test.cc
new file mode 100644
index 0000000000..875dfe81ee
--- /dev/null
+++ b/src/leveldb/db/fault_injection_test.cc
@@ -0,0 +1,554 @@
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom Env to keep track of the state of a filesystem as of
+// the last "sync". It then checks for data loss errors by purposely dropping
+// file data (or entire files) not protected by a "sync".
+
+#include "leveldb/db.h"
+
+#include <map>
+#include <set>
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "leveldb/cache.h"
+#include "leveldb/env.h"
+#include "leveldb/table.h"
+#include "leveldb/write_batch.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+// Length, in bytes, of each random value generated by FaultInjectionTest::Value().
+static const int kValueSize = 1000;
+// Exclusive upper bound passed to Random::Uniform() when choosing how many
+// values to write before and after the compaction in each iteration.
+static const int kMaxNumValues = 2000;
+// Number of write/fault/reopen rounds FaultInjectionTest::DoTest() performs.
+static const size_t kNumIterations = 3;
+
+class FaultInjectionTestEnv;
+
+namespace {
+
+// Returns the directory portion of `filename`: everything before the last
+// '/' or '\\'. Returns the empty string when `filename` contains no
+// separator. Assumes a filename, and not a directory name like "/foo/bar/".
+//
+// The argument is taken by const reference so that callers do not pay for a
+// string copy on every call.
+static std::string GetDirName(const std::string& filename) {
+  const size_t found = filename.find_last_of("/\\");
+  if (found == std::string::npos) {
+    return "";
+  }
+  return filename.substr(0, found);
+}
+
+// Stand-in for a directory sync: performs no I/O and always reports success.
+// `dir` is accepted only so call sites read like a real sync; it is unused.
+Status SyncDir(const std::string& dir) {
+ // As this is a test it isn't required to *actually* sync this directory.
+ return Status::OK();
+}
+
+// A basic file truncation function suitable for this test.
+//
+// Reads up to `length` bytes from the start of `filename` with a single
+// Read() call, writes them to "<dir>/truncate.tmp" and renames that file over
+// `filename`. Uses Env::Default() directly rather than the fault-injecting
+// wrapper.
+//
+// Returns the first error encountered. If writing or closing the temporary
+// file fails, the temporary file is removed and `filename` is left untouched.
+Status Truncate(const std::string& filename, uint64_t length) {
+  leveldb::Env* env = leveldb::Env::Default();
+
+  SequentialFile* orig_file;
+  Status s = env->NewSequentialFile(filename, &orig_file);
+  if (!s.ok())
+    return s;
+
+  char* scratch = new char[length];
+  leveldb::Slice result;
+  s = orig_file->Read(length, &result, scratch);
+  delete orig_file;
+  if (s.ok()) {
+    std::string tmp_name = GetDirName(filename) + "/truncate.tmp";
+    WritableFile* tmp_file;
+    s = env->NewWritableFile(tmp_name, &tmp_file);
+    if (s.ok()) {
+      s = tmp_file->Append(result);
+      if (s.ok()) {
+        // Close explicitly so that a write failure which only surfaces when
+        // the file is closed is reported instead of being discarded by the
+        // destructor.
+        s = tmp_file->Close();
+      }
+      delete tmp_file;
+      if (s.ok()) {
+        s = env->RenameFile(tmp_name, filename);
+      } else {
+        // Best-effort cleanup; the original error in `s` is what we report.
+        env->DeleteFile(tmp_name);
+      }
+    }
+  }
+
+  delete[] scratch;
+
+  return s;
+}
+
+// Bookkeeping for one file written through the fault-injecting Env: how many
+// bytes have been appended and how many of those were covered by the last
+// Flush()/Sync(). A position of -1 means "not recorded yet".
+struct FileState {
+ std::string filename_;
+ // Bytes appended so far (advanced by TestWritableFile::Append).
+ ssize_t pos_;
+ // Value of pos_ at the most recent successful Sync().
+ ssize_t pos_at_last_sync_;
+ // Value of pos_ at the most recent successful Flush().
+ ssize_t pos_at_last_flush_;
+
+ FileState(const std::string& filename)
+ : filename_(filename),
+ pos_(-1),
+ pos_at_last_sync_(-1),
+ pos_at_last_flush_(-1) { }
+
+ // Default constructor; needed because FileState is used as a std::map value.
+ FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {}
+
+ // True when nothing has been written, or everything written has been synced.
+ bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; }
+
+ // Truncates the file on disk to the last synced position (0 if never synced).
+ Status DropUnsyncedData() const;
+};
+
+} // anonymous namespace
+
+// A wrapper around WritableFile which informs another Env whenever this file
+// is written to or sync'ed.
+//
+// Every operation is forwarded to the wrapped file; the wrapper additionally
+// keeps a FileState with the append/flush/sync positions and hands it to the
+// FaultInjectionTestEnv when the file is closed.
+class TestWritableFile : public WritableFile {
+ public:
+ // `f` must be non-NULL; it is deleted by the destructor.
+ TestWritableFile(const FileState& state,
+ WritableFile* f,
+ FaultInjectionTestEnv* env);
+ // Closes the file first if Close() has not been called yet.
+ virtual ~TestWritableFile();
+ virtual Status Append(const Slice& data);
+ virtual Status Close();
+ virtual Status Flush();
+ virtual Status Sync();
+
+ private:
+ FileState state_; // Positions tracked for this file
+ WritableFile* target_; // Wrapped file that performs the real I/O
+ bool writable_file_opened_; // Cleared once Close() has been called
+ FaultInjectionTestEnv* env_; // Env notified about closes and dir syncs
+
+ // Runs the (no-op) directory sync and tells env_ that the directory was synced.
+ Status SyncParent();
+};
+
+// Env wrapper around Env::Default() that remembers, per file, how much data
+// had been synced, and which files were created since the last directory
+// sync. Tests use it to throw away unsynced data or unsynced files and then
+// reopen the database.
+class FaultInjectionTestEnv : public EnvWrapper {
+ public:
+ FaultInjectionTestEnv() : EnvWrapper(Env::Default()), filesystem_active_(true) {}
+ virtual ~FaultInjectionTestEnv() { }
+ // Both file factories hand back a TestWritableFile wrapping the real file.
+ virtual Status NewWritableFile(const std::string& fname,
+ WritableFile** result);
+ virtual Status NewAppendableFile(const std::string& fname,
+ WritableFile** result);
+ // Forwarded to the wrapped Env; tracking is updated when the call succeeds.
+ virtual Status DeleteFile(const std::string& f);
+ virtual Status RenameFile(const std::string& s, const std::string& t);
+
+ // Records the final FileState of a file that has just been closed.
+ void WritableFileClosed(const FileState& state);
+ // Truncates every tracked file that is not fully synced to its last synced
+ // position.
+ Status DropUnsyncedFileData();
+ // Deletes every file created since the last directory sync.
+ Status DeleteFilesCreatedAfterLastDirSync();
+ // Forgets the set of files created since the last directory sync.
+ void DirWasSynced();
+ bool IsFileCreatedSinceLastDirSync(const std::string& filename);
+ // Re-activates the filesystem; recorded file state is kept.
+ void ResetState();
+ // Drops all tracking information for `f`.
+ void UntrackFile(const std::string& f);
+ // Setting the filesystem to inactive is the test equivalent to simulating a
+ // system reset. Setting to inactive will freeze our saved filesystem state so
+ // that it will stop being recorded. It can then be reset back to the state at
+ // the time of the reset.
+ bool IsFilesystemActive() const { return filesystem_active_; }
+ void SetFilesystemActive(bool active) { filesystem_active_ = active; }
+
+ private:
+ port::Mutex mutex_; // Taken around accesses to the two containers below
+ std::map<std::string, FileState> db_file_state_; // State saved at Close(), by filename
+ std::set<std::string> new_files_since_last_dir_sync_;
+ bool filesystem_active_; // Record flushes, syncs, writes
+};
+
+// Wraps `f` (which must be non-NULL and is deleted by the destructor),
+// starting from the tracked positions in `state`. The file is considered open
+// until Close() is called.
+TestWritableFile::TestWritableFile(const FileState& state,
+ WritableFile* f,
+ FaultInjectionTestEnv* env)
+ : state_(state),
+ target_(f),
+ writable_file_opened_(true),
+ env_(env) {
+ assert(f != NULL);
+}
+
+// Closes the file if the owner never did (so its state still reaches the
+// Env), then destroys the wrapped file. The status of that implicit Close()
+// is discarded.
+TestWritableFile::~TestWritableFile() {
+ if (writable_file_opened_) {
+ Close();
+ }
+ delete target_;
+}
+
+// Appends `data` to the wrapped file. The tracked write position only moves
+// forward when the append succeeded and the filesystem is being recorded.
+Status TestWritableFile::Append(const Slice& data) {
+  const Status append_status = target_->Append(data);
+  if (!append_status.ok() || !env_->IsFilesystemActive()) {
+    return append_status;
+  }
+  state_.pos_ += data.size();
+  return append_status;
+}
+
+// Closes the wrapped file. The file counts as closed from here on regardless
+// of the outcome; its state is reported to the Env only on success.
+Status TestWritableFile::Close() {
+  writable_file_opened_ = false;
+  const Status close_status = target_->Close();
+  if (!close_status.ok()) {
+    return close_status;
+  }
+  env_->WritableFileClosed(state_);
+  return close_status;
+}
+
+// Flushes the wrapped file and, when that succeeds while the filesystem is
+// being recorded, remembers the current position as the last flushed one.
+Status TestWritableFile::Flush() {
+  const Status flush_status = target_->Flush();
+  if (!flush_status.ok() || !env_->IsFilesystemActive()) {
+    return flush_status;
+  }
+  state_.pos_at_last_flush_ = state_.pos_;
+  return flush_status;
+}
+
+// Syncs the directory containing this file and, on success, tells the Env so
+// that it can forget which files were new since the previous directory sync.
+Status TestWritableFile::SyncParent() {
+  const std::string parent_dir = GetDirName(state_.filename_);
+  const Status dir_status = SyncDir(parent_dir);
+  if (dir_status.ok()) {
+    env_->DirWasSynced();
+  }
+  return dir_status;
+}
+
+// Syncs the wrapped file and records the synced position. While the
+// filesystem is inactive this does nothing and reports success.
+//
+// If the file was created since the last directory sync, the parent directory
+// is synced as well, so that new files referred to by the manifest are in the
+// filesystem. An error from the file sync takes precedence over one from the
+// directory sync.
+Status TestWritableFile::Sync() {
+  if (!env_->IsFilesystemActive()) {
+    return Status::OK();
+  }
+
+  Status result = target_->Sync();
+  if (result.ok()) {
+    state_.pos_at_last_sync_ = state_.pos_;
+  }
+
+  if (!env_->IsFileCreatedSinceLastDirSync(state_.filename_)) {
+    return result;
+  }
+
+  const Status parent_status = SyncParent();
+  if (result.ok() && !parent_status.ok()) {
+    result = parent_status;
+  }
+  return result;
+}
+
+// Creates `fname` through the wrapped Env and returns it wrapped in a
+// TestWritableFile whose tracked position starts at 0. On failure `*result`
+// is left untouched.
+Status FaultInjectionTestEnv::NewWritableFile(const std::string& fname,
+                                              WritableFile** result) {
+  WritableFile* underlying;
+  Status s = target()->NewWritableFile(fname, &underlying);
+  if (!s.ok()) {
+    return s;
+  }
+
+  FileState fresh_state(fname);
+  fresh_state.pos_ = 0;
+  *result = new TestWritableFile(fresh_state, underlying, this);
+
+  // NewWritableFile does not append: reopening an existing file truncates it,
+  // so whatever state was saved for that name is stale. UntrackFile() takes
+  // mutex_ itself, hence it runs before the lock below.
+  UntrackFile(fname);
+  {
+    MutexLock l(&mutex_);
+    new_files_since_last_dir_sync_.insert(fname);
+  }
+  return s;
+}
+
+// Opens `fname` for appending through the wrapped Env and returns it wrapped
+// in a TestWritableFile. If state was saved for this name earlier, tracking
+// resumes from it; otherwise the file is treated as newly created with its
+// position at 0. On failure `*result` is left untouched.
+Status FaultInjectionTestEnv::NewAppendableFile(const std::string& fname,
+                                                WritableFile** result) {
+  WritableFile* underlying;
+  Status s = target()->NewAppendableFile(fname, &underlying);
+  if (!s.ok()) {
+    return s;
+  }
+
+  FileState resumed_state(fname);
+  resumed_state.pos_ = 0;
+  {
+    MutexLock l(&mutex_);
+    std::map<std::string, FileState>::const_iterator known =
+        db_file_state_.find(fname);
+    if (known == db_file_state_.end()) {
+      new_files_since_last_dir_sync_.insert(fname);
+    } else {
+      resumed_state = known->second;
+    }
+  }
+  *result = new TestWritableFile(resumed_state, underlying, this);
+  return s;
+}
+
+// Walks every tracked file and truncates those with unsynced data back to
+// their last synced position. Stops at, and returns, the first failure.
+Status FaultInjectionTestEnv::DropUnsyncedFileData() {
+  Status s;
+  MutexLock l(&mutex_);
+  std::map<std::string, FileState>::const_iterator entry =
+      db_file_state_.begin();
+  while (s.ok() && entry != db_file_state_.end()) {
+    const FileState& tracked = entry->second;
+    if (!tracked.IsFullySynced()) {
+      s = tracked.DropUnsyncedData();
+    }
+    ++entry;
+  }
+  return s;
+}
+
+// Called after a directory sync: no file counts as "created since the last
+// directory sync" any more, so the set is emptied under the lock.
+void FaultInjectionTestEnv::DirWasSynced() {
+ MutexLock l(&mutex_);
+ new_files_since_last_dir_sync_.clear();
+}
+
+// Reports whether `filename` was created after the most recent directory
+// sync, i.e. whether it is still in the pending set.
+bool FaultInjectionTestEnv::IsFileCreatedSinceLastDirSync(
+    const std::string& filename) {
+  MutexLock l(&mutex_);
+  const bool is_pending = new_files_since_last_dir_sync_.count(filename) != 0;
+  return is_pending;
+}
+
+// Forgets everything recorded about `f`: both its saved FileState and its
+// membership in the set of files created since the last directory sync.
+// Takes mutex_, so it must not be called with mutex_ already held.
+void FaultInjectionTestEnv::UntrackFile(const std::string& f) {
+ MutexLock l(&mutex_);
+ db_file_state_.erase(f);
+ new_files_since_last_dir_sync_.erase(f);
+}
+
+// Deletes `f` through the wrapped Env and stops tracking it once the delete
+// has succeeded. The delete is also checked with ASSERT_OK, so a failure is
+// reported through the test harness as well as through the returned status.
+Status FaultInjectionTestEnv::DeleteFile(const std::string& f) {
+  Status delete_status = EnvWrapper::DeleteFile(f);
+  ASSERT_OK(delete_status);
+  if (!delete_status.ok()) {
+    return delete_status;
+  }
+  UntrackFile(f);
+  return delete_status;
+}
+
+// Renames `s` to `t` through the wrapped Env. When the rename succeeds, any
+// bookkeeping held for `s` (saved FileState, "new since last directory sync"
+// membership) is moved over to `t`.
+Status FaultInjectionTestEnv::RenameFile(const std::string& s,
+                                         const std::string& t) {
+  Status ret = EnvWrapper::RenameFile(s, t);
+  if (!ret.ok()) {
+    return ret;
+  }
+
+  {
+    MutexLock l(&mutex_);
+
+    // Carry the saved state across to the new name.
+    std::map<std::string, FileState>::iterator source =
+        db_file_state_.find(s);
+    if (source != db_file_state_.end()) {
+      db_file_state_[t] = source->second;
+      db_file_state_.erase(source);
+    }
+
+    // A file that was pending a directory sync stays pending under its new
+    // name; the target is not expected to be pending already.
+    const bool was_pending = new_files_since_last_dir_sync_.erase(s) != 0;
+    if (was_pending) {
+      assert(new_files_since_last_dir_sync_.find(t) ==
+             new_files_since_last_dir_sync_.end());
+      new_files_since_last_dir_sync_.insert(t);
+    }
+  }
+
+  return ret;
+}
+
+// Turns recording back on after a simulated reset. Only the active flag
+// changes; the per-file bookkeeping is deliberately kept (see below).
+void FaultInjectionTestEnv::ResetState() {
+ // Since we are not destroying the database, the existing files
+ // should keep their recorded synced/flushed state. Therefore
+ // we do not reset db_file_state_ and new_files_since_last_dir_sync_.
+ MutexLock l(&mutex_);
+ SetFilesystemActive(true);
+}
+
+// Deletes every file created since the last directory sync. Stops at, and
+// returns, the first failure.
+Status FaultInjectionTestEnv::DeleteFilesCreatedAfterLastDirSync() {
+  // DeleteFile() ends up taking mutex_ and modifying the pending set, so work
+  // from a snapshot taken under the lock and release the lock before deleting.
+  std::set<std::string> pending;
+  {
+    MutexLock l(&mutex_);
+    pending = new_files_since_last_dir_sync_;
+  }
+
+  Status s;
+  std::set<std::string>::const_iterator file = pending.begin();
+  while (s.ok() && file != pending.end()) {
+    s = DeleteFile(*file);
+    ++file;
+  }
+  return s;
+}
+
+// Saves the final tracked positions of a file that has just been closed,
+// keyed by its filename and replacing any earlier entry for that name.
+void FaultInjectionTestEnv::WritableFileClosed(const FileState& state) {
+ MutexLock l(&mutex_);
+ db_file_state_[state.filename_] = state;
+}
+
+// Truncates the file to the position of its last sync. A file that was never
+// synced (position still -1) is truncated to zero length.
+Status FileState::DropUnsyncedData() const {
+  ssize_t keep_bytes = pos_at_last_sync_;
+  if (keep_bytes == -1) {
+    keep_bytes = 0;
+  }
+  return Truncate(filename_, keep_bytes);
+}
+
+// Test fixture: owns a database opened on top of a FaultInjectionTestEnv and
+// provides helpers to write data, simulate a crash that loses unsynced data
+// or unsynced files, reopen the database and check what survived.
+class FaultInjectionTest {
+ public:
+  // Whether Verify() expects the reads in its range to succeed or to fail.
+  enum ExpectedVerifResult { VAL_EXPECT_NO_ERROR, VAL_EXPECT_ERROR };
+  // How the simulated crash damages the files on disk.
+  enum ResetMethod { RESET_DROP_UNSYNCED_DATA, RESET_DELETE_UNSYNCED_FILES };
+
+  FaultInjectionTestEnv* env_;
+  std::string dbname_;
+  Cache* tiny_cache_;
+  Options options_;
+  DB* db_;
+
+  FaultInjectionTest()
+      : env_(new FaultInjectionTestEnv),
+        tiny_cache_(NewLRUCache(100)),
+        db_(NULL) {
+    dbname_ = test::TmpDir() + "/fault_test";
+    DestroyDB(dbname_, Options());  // Destroy any db from earlier run
+    options_.reuse_logs = true;
+    options_.env = env_;
+    options_.paranoid_checks = true;
+    options_.block_cache = tiny_cache_;
+    options_.create_if_missing = true;
+  }
+
+  ~FaultInjectionTest() {
+    CloseDB();
+    DestroyDB(dbname_, Options());
+    delete tiny_cache_;
+    delete env_;
+  }
+
+  // Selects whether subsequent OpenDB() calls reuse existing log files.
+  void ReuseLogs(bool reuse) {
+    options_.reuse_logs = reuse;
+  }
+
+  // Writes keys [start_idx, start_idx + num_vals), one WriteBatch per key,
+  // using default WriteOptions.
+  void Build(int start_idx, int num_vals) {
+    std::string key_space, value_space;
+    WriteBatch batch;
+    for (int i = start_idx; i < start_idx + num_vals; i++) {
+      Slice key = Key(i, &key_space);
+      batch.Clear();
+      batch.Put(key, Value(i, &value_space));
+      WriteOptions options;
+      ASSERT_OK(db_->Write(options, &batch));
+    }
+  }
+
+  // Looks up the ith key and stores its value in *val.
+  Status ReadValue(int i, std::string* val) const {
+    std::string key_space;
+    Slice key = Key(i, &key_space);
+    ReadOptions options;
+    return db_->Get(options, key, val);
+  }
+
+  // Reads keys [start_idx, start_idx + num_vals) and compares the outcome
+  // with `expected`:
+  //  - VAL_EXPECT_NO_ERROR: a successful read must return the generated
+  //    value; a failed read ends the loop and its status is returned.
+  //  - VAL_EXPECT_ERROR: a failed read is fine; a successful read ends the
+  //    loop with an IOError.
+  Status Verify(int start_idx, int num_vals,
+                ExpectedVerifResult expected) const {
+    std::string val;
+    std::string value_space;
+    Status s;
+    for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) {
+      Value(i, &value_space);
+      s = ReadValue(i, &val);
+      if (expected == VAL_EXPECT_NO_ERROR) {
+        if (s.ok()) {
+          ASSERT_EQ(value_space, val);
+        }
+      } else if (s.ok()) {
+        fprintf(stderr, "Expected an error at %d, but was OK\n", i);
+        s = Status::IOError(dbname_, "Expected value error:");
+      } else {
+        s = Status::OK();  // An expected error
+      }
+    }
+    return s;
+  }
+
+  // Return the ith key
+  Slice Key(int i, std::string* storage) const {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%016d", i);
+    storage->assign(buf, strlen(buf));
+    return Slice(*storage);
+  }
+
+  // Return the value to associate with the specified key
+  Slice Value(int k, std::string* storage) const {
+    // Seeding with the key index makes the value reproducible in Verify().
+    Random r(k);
+    return test::RandomString(&r, kValueSize, storage);
+  }
+
+  // Closes any open database, re-activates the Env and opens the database.
+  Status OpenDB() {
+    delete db_;
+    db_ = NULL;
+    env_->ResetState();
+    return DB::Open(options_, dbname_, &db_);
+  }
+
+  void CloseDB() {
+    delete db_;
+    db_ = NULL;
+  }
+
+  // Issues a Delete for every key currently visible through an iterator.
+  void DeleteAllData() {
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      ASSERT_OK(db_->Delete(WriteOptions(), iter->key()));
+    }
+
+    delete iter;
+  }
+
+  // Applies the chosen kind of damage to the files on disk.
+  void ResetDBState(ResetMethod reset_method) {
+    switch (reset_method) {
+      case RESET_DROP_UNSYNCED_DATA:
+        ASSERT_OK(env_->DropUnsyncedFileData());
+        break;
+      case RESET_DELETE_UNSYNCED_FILES:
+        ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
+        break;
+      default:
+        assert(false);
+    }
+  }
+
+  // Clears the database, writes num_pre_sync values, compacts the whole key
+  // range, then writes num_post_sync further values after the compaction.
+  void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) {
+    DeleteAllData();
+    Build(0, num_pre_sync);
+    db_->CompactRange(NULL, NULL);
+    Build(num_pre_sync, num_post_sync);
+  }
+
+  // Freezes the recorded filesystem state, closes the database, applies the
+  // fault and reopens. The first num_pre_sync values must then be readable
+  // and the following num_post_sync values must not be.
+  void PartialCompactTestReopenWithFault(ResetMethod reset_method,
+                                         int num_pre_sync,
+                                         int num_post_sync) {
+    env_->SetFilesystemActive(false);
+    CloseDB();
+    ResetDBState(reset_method);
+    ASSERT_OK(OpenDB());
+    ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::VAL_EXPECT_NO_ERROR));
+    ASSERT_OK(Verify(num_pre_sync, num_post_sync,
+                     FaultInjectionTest::VAL_EXPECT_ERROR));
+  }
+
+  // Intentionally empty: the "no write" scenario writes nothing before the
+  // fault.
+  void NoWriteTestPreFault() {
+  }
+
+  // Closes the database, applies the fault and checks that it reopens.
+  void NoWriteTestReopenWithFault(ResetMethod reset_method) {
+    CloseDB();
+    ResetDBState(reset_method);
+    ASSERT_OK(OpenDB());
+  }
+
+  // Runs kNumIterations rounds; each round exercises both reset methods, with
+  // and without writes preceding the fault. The fixed seed keeps the value
+  // counts identical from run to run.
+  void DoTest() {
+    Random rnd(0);
+    ASSERT_OK(OpenDB());
+    for (size_t idx = 0; idx < kNumIterations; idx++) {
+      int num_pre_sync = rnd.Uniform(kMaxNumValues);
+      int num_post_sync = rnd.Uniform(kMaxNumValues);
+
+      PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+      PartialCompactTestReopenWithFault(RESET_DROP_UNSYNCED_DATA,
+                                        num_pre_sync,
+                                        num_post_sync);
+
+      NoWriteTestPreFault();
+      NoWriteTestReopenWithFault(RESET_DROP_UNSYNCED_DATA);
+
+      PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+      // No new files created so we expect all values since no files will be
+      // dropped.
+      PartialCompactTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES,
+                                        num_pre_sync + num_post_sync,
+                                        0);
+
+      NoWriteTestPreFault();
+      NoWriteTestReopenWithFault(RESET_DELETE_UNSYNCED_FILES);
+    }
+  }
+};
+
+// Runs the full fault-injection scenario with log reuse turned off.
+TEST(FaultInjectionTest, FaultTestNoLogReuse) {
+ ReuseLogs(false);
+ DoTest();
+}
+
+// Runs the full fault-injection scenario with log reuse turned on.
+TEST(FaultInjectionTest, FaultTestWithLogReuse) {
+ ReuseLogs(true);
+ DoTest();
+}
+
+} // namespace leveldb
+
+// Test binary entry point: hands control to the test harness and returns its
+// result. The command-line arguments are not used here.
+int main(int argc, char** argv) {
+ return leveldb::test::RunAllTests();
+}
diff --git a/src/leveldb/db/leveldb_main.cc b/src/leveldb/db/leveldbutil.cc
index 9f4b7dd70c..9f4b7dd70c 100644
--- a/src/leveldb/db/leveldb_main.cc
+++ b/src/leveldb/db/leveldbutil.cc
diff --git a/src/leveldb/db/log_reader.cc b/src/leveldb/db/log_reader.cc
index e44b66c85b..a6d304545d 100644
--- a/src/leveldb/db/log_reader.cc
+++ b/src/leveldb/db/log_reader.cc
@@ -25,7 +25,8 @@ Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum,
eof_(false),
last_record_offset_(0),
end_of_buffer_offset_(0),
- initial_offset_(initial_offset) {
+ initial_offset_(initial_offset),
+ resyncing_(initial_offset > 0) {
}
Reader::~Reader() {
@@ -72,8 +73,25 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch) {
Slice fragment;
while (true) {
- uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
const unsigned int record_type = ReadPhysicalRecord(&fragment);
+
+ // ReadPhysicalRecord may have only had an empty trailer remaining in its
+ // internal buffer. Calculate the offset of the next physical record now
+ // that it has returned, properly accounting for its header size.
+ uint64_t physical_record_offset =
+ end_of_buffer_offset_ - buffer_.size() - kHeaderSize - fragment.size();
+
+ if (resyncing_) {
+ if (record_type == kMiddleType) {
+ continue;
+ } else if (record_type == kLastType) {
+ resyncing_ = false;
+ continue;
+ } else {
+ resyncing_ = false;
+ }
+ }
+
switch (record_type) {
case kFullType:
if (in_fragmented_record) {
diff --git a/src/leveldb/db/log_reader.h b/src/leveldb/db/log_reader.h
index 6aff791716..8389d61f8f 100644
--- a/src/leveldb/db/log_reader.h
+++ b/src/leveldb/db/log_reader.h
@@ -73,6 +73,11 @@ class Reader {
// Offset at which to start looking for the first record to return
uint64_t const initial_offset_;
+ // True if we are resynchronizing after a seek (initial_offset_ > 0). In
+ // particular, a run of kMiddleType and kLastType records can be silently
+ // skipped in this mode
+ bool resyncing_;
+
// Extend record types with the following special values
enum {
kEof = kMaxRecordType + 1,
diff --git a/src/leveldb/db/log_test.cc b/src/leveldb/db/log_test.cc
index dcf0562652..48a5928657 100644
--- a/src/leveldb/db/log_test.cc
+++ b/src/leveldb/db/log_test.cc
@@ -79,7 +79,7 @@ class LogTest {
virtual Status Skip(uint64_t n) {
if (n > contents_.size()) {
contents_.clear();
- return Status::NotFound("in-memory file skipepd past end");
+ return Status::NotFound("in-memory file skipped past end");
}
contents_.remove_prefix(n);
@@ -104,23 +104,34 @@ class LogTest {
StringSource source_;
ReportCollector report_;
bool reading_;
- Writer writer_;
- Reader reader_;
+ Writer* writer_;
+ Reader* reader_;
// Record metadata for testing initial offset functionality
static size_t initial_offset_record_sizes_[];
static uint64_t initial_offset_last_record_offsets_[];
+ static int num_initial_offset_records_;
public:
LogTest() : reading_(false),
- writer_(&dest_),
- reader_(&source_, &report_, true/*checksum*/,
- 0/*initial_offset*/) {
+ writer_(new Writer(&dest_)),
+ reader_(new Reader(&source_, &report_, true/*checksum*/,
+ 0/*initial_offset*/)) {
+ }
+
+ // writer_ and reader_ are allocated with new (in the constructor and in the
+ // reopen helpers), so the fixture must release them itself.
+ ~LogTest() {
+ delete writer_;
+ delete reader_;
+ }
+
+ // Replaces the current writer with one constructed through the appending
+ // constructor, passing the number of bytes already present in dest_ as the
+ // initial length.
+ void ReopenForAppend() {
+ delete writer_;
+ writer_ = new Writer(&dest_, dest_.contents_.size());
}
void Write(const std::string& msg) {
ASSERT_TRUE(!reading_) << "Write() after starting to read";
- writer_.AddRecord(Slice(msg));
+ writer_->AddRecord(Slice(msg));
}
size_t WrittenBytes() const {
@@ -134,7 +145,7 @@ class LogTest {
}
std::string scratch;
Slice record;
- if (reader_.ReadRecord(&record, &scratch)) {
+ if (reader_->ReadRecord(&record, &scratch)) {
return record.ToString();
} else {
return "EOF";
@@ -182,13 +193,18 @@ class LogTest {
}
void WriteInitialOffsetLog() {
- for (int i = 0; i < 4; i++) {
+ for (int i = 0; i < num_initial_offset_records_; i++) {
std::string record(initial_offset_record_sizes_[i],
static_cast<char>('a' + i));
Write(record);
}
}
+ // Replaces the current reader with one (checksumming enabled) that starts
+ // looking for records at initial_offset instead of 0.
+ void StartReadingAt(uint64_t initial_offset) {
+ delete reader_;
+ reader_ = new Reader(&source_, &report_, true/*checksum*/, initial_offset);
+ }
+
void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
WriteInitialOffsetLog();
reading_ = true;
@@ -208,32 +224,48 @@ class LogTest {
source_.contents_ = Slice(dest_.contents_);
Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/,
initial_offset);
- Slice record;
- std::string scratch;
- ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
- ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset],
- record.size());
- ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
- offset_reader->LastRecordOffset());
- ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
+
+ // Read all records from expected_record_offset through the last one.
+ ASSERT_LT(expected_record_offset, num_initial_offset_records_);
+ for (; expected_record_offset < num_initial_offset_records_;
+ ++expected_record_offset) {
+ Slice record;
+ std::string scratch;
+ ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
+ ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset],
+ record.size());
+ ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
+ offset_reader->LastRecordOffset());
+ ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
+ }
delete offset_reader;
}
-
};
size_t LogTest::initial_offset_record_sizes_[] =
{10000, // Two sizable records in first block
10000,
2 * log::kBlockSize - 1000, // Span three blocks
- 1};
+ 1,
+ 13716, // Consume all but two bytes of block 3.
+ log::kBlockSize - kHeaderSize, // Consume the entirety of block 4.
+ };
uint64_t LogTest::initial_offset_last_record_offsets_[] =
{0,
kHeaderSize + 10000,
2 * (kHeaderSize + 10000),
2 * (kHeaderSize + 10000) +
- (2 * log::kBlockSize - 1000) + 3 * kHeaderSize};
+ (2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
+ 2 * (kHeaderSize + 10000) +
+ (2 * log::kBlockSize - 1000) + 3 * kHeaderSize
+ + kHeaderSize + 1,
+ 3 * log::kBlockSize,
+ };
+// LogTest::initial_offset_last_record_offsets_ must be defined before this.
+int LogTest::num_initial_offset_records_ =
+ sizeof(LogTest::initial_offset_last_record_offsets_)/sizeof(uint64_t);
TEST(LogTest, Empty) {
ASSERT_EQ("EOF", Read());
@@ -318,6 +350,15 @@ TEST(LogTest, AlignedEof) {
ASSERT_EQ("EOF", Read());
}
+// A record written before and a record written after swapping in an appending
+// writer must both be readable, in order, followed by EOF.
+TEST(LogTest, OpenForAppend) {
+ Write("hello");
+ ReopenForAppend();
+ Write("world");
+ ASSERT_EQ("hello", Read());
+ ASSERT_EQ("world", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
TEST(LogTest, RandomRead) {
const int N = 500;
Random write_rnd(301);
@@ -445,6 +486,22 @@ TEST(LogTest, PartialLastIsIgnored) {
ASSERT_EQ(0, DroppedBytes());
}
+TEST(LogTest, SkipIntoMultiRecord) {
+ // Consider a fragmented record:
+ // first(R1), middle(R1), last(R1), first(R2)
+ // If initial_offset points to a record after first(R1) but before first(R2),
+ // incomplete fragment errors are not actual errors, and must be suppressed
+ // until a new first or full record is encountered.
+ // NOTE(review): BigString("foo", 3*kBlockSize) presumably yields a payload
+ // of about three blocks, so that starting at kBlockSize lands inside R1;
+ // confirm against BigString's definition.
+ Write(BigString("foo", 3*kBlockSize));
+ Write("correct");
+ StartReadingAt(kBlockSize);
+
+ // Only the second record may surface, with nothing reported or dropped.
+ ASSERT_EQ("correct", Read());
+ ASSERT_EQ("", ReportMessage());
+ ASSERT_EQ(0, DroppedBytes());
+ ASSERT_EQ("EOF", Read());
+}
+
TEST(LogTest, ErrorJoinsRecords) {
// Consider two fragmented records:
// first(R1) last(R1) first(R2) last(R2)
@@ -514,6 +571,10 @@ TEST(LogTest, ReadFourthStart) {
3);
}
+// Starts reading three bytes before the 3 * log::kBlockSize boundary and
+// expects the first record returned to be index 5 of the initial-offset
+// tables, whose recorded offset is exactly 3 * log::kBlockSize.
+TEST(LogTest, ReadInitialOffsetIntoBlockPadding) {
+ CheckInitialOffsetRecord(3 * log::kBlockSize - 3, 5);
+}
+
TEST(LogTest, ReadEnd) {
CheckOffsetPastEndReturnsNoRecords(0);
}
diff --git a/src/leveldb/db/log_writer.cc b/src/leveldb/db/log_writer.cc
index 2da99ac088..74a03270da 100644
--- a/src/leveldb/db/log_writer.cc
+++ b/src/leveldb/db/log_writer.cc
@@ -12,15 +12,24 @@
namespace leveldb {
namespace log {
-Writer::Writer(WritableFile* dest)
- : dest_(dest),
- block_offset_(0) {
+static void InitTypeCrc(uint32_t* type_crc) {
for (int i = 0; i <= kMaxRecordType; i++) {
char t = static_cast<char>(i);
- type_crc_[i] = crc32c::Value(&t, 1);
+ type_crc[i] = crc32c::Value(&t, 1);
}
}
+// Creates a writer positioned at offset 0 of the first block and precomputes
+// the per-record-type CRC table.
+Writer::Writer(WritableFile* dest)
+ : dest_(dest),
+ block_offset_(0) {
+ InitTypeCrc(type_crc_);
+}
+
+// Creates a writer that continues an existing log of dest_length bytes. The
+// starting position inside the current block is dest_length modulo
+// kBlockSize. The per-record-type CRC table is precomputed as in the other
+// constructor.
+Writer::Writer(WritableFile* dest, uint64_t dest_length)
+ : dest_(dest), block_offset_(dest_length % kBlockSize) {
+ InitTypeCrc(type_crc_);
+}
+
Writer::~Writer() {
}
diff --git a/src/leveldb/db/log_writer.h b/src/leveldb/db/log_writer.h
index a3a954d967..9e7cc4705b 100644
--- a/src/leveldb/db/log_writer.h
+++ b/src/leveldb/db/log_writer.h
@@ -22,6 +22,12 @@ class Writer {
// "*dest" must be initially empty.
// "*dest" must remain live while this Writer is in use.
explicit Writer(WritableFile* dest);
+
+ // Create a writer that will append data to "*dest".
+ // "*dest" must have initial length "dest_length".
+ // "*dest" must remain live while this Writer is in use.
+ Writer(WritableFile* dest, uint64_t dest_length);
+
~Writer();
Status AddRecord(const Slice& slice);
diff --git a/src/leveldb/db/memtable.h b/src/leveldb/db/memtable.h
index 92e90bb099..9f41567cde 100644
--- a/src/leveldb/db/memtable.h
+++ b/src/leveldb/db/memtable.h
@@ -36,10 +36,7 @@ class MemTable {
}
// Returns an estimate of the number of bytes of data in use by this
- // data structure.
- //
- // REQUIRES: external synchronization to prevent simultaneous
- // operations on the same MemTable.
+ // data structure. It is safe to call when MemTable is being modified.
size_t ApproximateMemoryUsage();
// Return an iterator that yields the contents of the memtable.
diff --git a/src/leveldb/db/recovery_test.cc b/src/leveldb/db/recovery_test.cc
new file mode 100644
index 0000000000..9596f4288a
--- /dev/null
+++ b/src/leveldb/db/recovery_test.cc
@@ -0,0 +1,324 @@
+// Copyright (c) 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/write_batch.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+// Test fixture that owns a scratch database under test::TmpDir() and offers
+// helpers for reopening it and for inspecting or manipulating its on-disk
+// files (MANIFEST, log files, table files) between reopen cycles.
+class RecoveryTest {
+ public:
+  // Wipes any database left over from a previous run and opens a fresh one.
+  RecoveryTest() : env_(Env::Default()), db_(NULL) {
+    dbname_ = test::TmpDir() + "/recovery_test";
+    DestroyDB(dbname_, Options());
+    Open();
+  }
+
+  // Closes the database and removes its files.
+  ~RecoveryTest() {
+    Close();
+    DestroyDB(dbname_, Options());
+  }
+
+  // Returns the open database as its concrete implementation type so tests
+  // can reach TEST_* hooks. static_cast is the correct base-to-derived
+  // conversion here; reinterpret_cast would not adjust the pointer.
+  DBImpl* dbfull() const { return static_cast<DBImpl*>(db_); }
+  Env* env() const { return env_; }
+
+  // Returns false only when the env reports that appending to an existing
+  // file is not supported; any other outcome counts as "can append".
+  bool CanAppend() {
+    // Must start out NULL: an env that rejects the request may return without
+    // assigning the out-parameter, and deleting an indeterminate pointer is
+    // undefined behavior.
+    WritableFile* tmp = NULL;
+    Status s = env_->NewAppendableFile(CurrentFileName(dbname_), &tmp);
+    delete tmp;
+    if (s.IsNotSupportedError()) {
+      return false;
+    } else {
+      return true;
+    }
+  }
+
+  // Closes the database if it is open. Safe to call repeatedly.
+  void Close() {
+    delete db_;
+    db_ = NULL;
+  }
+
+  // Closes the database (if open) and opens it again.
+  //
+  // With options == NULL the DB is opened with reuse_logs and
+  // create_if_missing enabled. Otherwise *options is used as given, with env
+  // defaulted to this fixture's env when unset. Exactly one log file must
+  // exist after a successful open.
+  void Open(Options* options = NULL) {
+    Close();
+    Options opts;
+    if (options != NULL) {
+      opts = *options;
+    } else {
+      opts.reuse_logs = true;  // TODO(sanjay): test both ways
+      opts.create_if_missing = true;
+    }
+    if (opts.env == NULL) {
+      opts.env = env_;
+    }
+    ASSERT_OK(DB::Open(opts, dbname_, &db_));
+    ASSERT_EQ(1, NumLogs());
+  }
+
+  Status Put(const std::string& k, const std::string& v) {
+    return db_->Put(WriteOptions(), k, v);
+  }
+
+  // Looks up k and returns its value, "NOT_FOUND" when the key is absent, or
+  // the status string for any other error. When snapshot is non-NULL the read
+  // is performed against that snapshot; previously the argument was accepted
+  // but silently ignored.
+  std::string Get(const std::string& k, const Snapshot* snapshot = NULL) {
+    ReadOptions read_options;
+    read_options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(read_options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  // Returns the full path of the MANIFEST named by the CURRENT file, with the
+  // trailing newline (if any) stripped from the name.
+  std::string ManifestFileName() {
+    std::string current;
+    ASSERT_OK(ReadFileToString(env_, CurrentFileName(dbname_), &current));
+    size_t len = current.size();
+    if (len > 0 && current[len-1] == '\n') {
+      current.resize(len - 1);
+    }
+    return dbname_ + "/" + current;
+  }
+
+  std::string LogName(uint64_t number) {
+    return LogFileName(dbname_, number);
+  }
+
+  // Deletes every log file in the database directory and returns how many
+  // were found.
+  size_t DeleteLogFiles() {
+    std::vector<uint64_t> logs = GetFiles(kLogFile);
+    for (size_t i = 0; i < logs.size(); i++) {
+      ASSERT_OK(env_->DeleteFile(LogName(logs[i]))) << LogName(logs[i]);
+    }
+    return logs.size();
+  }
+
+  // Returns the number of the first log file in directory-listing order.
+  // Fails the test instead of indexing an empty vector when no log exists.
+  uint64_t FirstLogFile() {
+    std::vector<uint64_t> logs = GetFiles(kLogFile);
+    ASSERT_TRUE(!logs.empty()) << "no log files in " << dbname_;
+    return logs[0];
+  }
+
+  // Returns the file numbers of all files of type t in the database
+  // directory, in the order the env lists them.
+  std::vector<uint64_t> GetFiles(FileType t) {
+    std::vector<std::string> filenames;
+    ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+    std::vector<uint64_t> result;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      uint64_t number;
+      FileType type;
+      if (ParseFileName(filenames[i], &number, &type) && type == t) {
+        result.push_back(number);
+      }
+    }
+    return result;
+  }
+
+  int NumLogs() {
+    return static_cast<int>(GetFiles(kLogFile).size());
+  }
+
+  int NumTables() {
+    return static_cast<int>(GetFiles(kTableFile).size());
+  }
+
+  // Returns the size of fname in bytes; fails the test if it cannot be read.
+  uint64_t FileSize(const std::string& fname) {
+    uint64_t result = 0;
+    ASSERT_OK(env_->GetFileSize(fname, &result)) << fname;
+    return result;
+  }
+
+  void CompactMemTable() {
+    dbfull()->TEST_CompactMemTable();
+  }
+
+  // Directly construct a log file that sets key to val.
+  // The file is named after lognum and holds a single batch whose sequence
+  // number is seq.
+  void MakeLogFile(uint64_t lognum, SequenceNumber seq, Slice key, Slice val) {
+    std::string fname = LogFileName(dbname_, lognum);
+    WritableFile* file = NULL;
+    ASSERT_OK(env_->NewWritableFile(fname, &file));
+    log::Writer writer(file);
+    WriteBatch batch;
+    batch.Put(key, val);
+    WriteBatchInternal::SetSequence(&batch, seq);
+    ASSERT_OK(writer.AddRecord(WriteBatchInternal::Contents(&batch)));
+    ASSERT_OK(file->Flush());
+    delete file;
+  }
+
+ private:
+  std::string dbname_;
+  Env* env_;
+  DB* db_;
+};
+
+// With log reuse enabled (the fixture's default Open), reopening the database
+// must keep using the same MANIFEST file and must not lose data. Skipped when
+// the env cannot append to existing files.
+TEST(RecoveryTest, ManifestReused) {
+ if (!CanAppend()) {
+ fprintf(stderr, "skipping test because env does not support appending\n");
+ return;
+ }
+ ASSERT_OK(Put("foo", "bar"));
+ Close();
+ std::string old_manifest = ManifestFileName();
+ // Check twice: the first reopen and a subsequent one must both reuse it.
+ Open();
+ ASSERT_EQ(old_manifest, ManifestFileName());
+ ASSERT_EQ("bar", Get("foo"));
+ Open();
+ ASSERT_EQ(old_manifest, ManifestFileName());
+ ASSERT_EQ("bar", Get("foo"));
+}
+
+// A MANIFEST that has grown large must not be reused: the next open has to
+// switch to a new, small MANIFEST while preserving data, and later opens
+// should then reuse that new file. Skipped when the env cannot append to
+// existing files.
+TEST(RecoveryTest, LargeManifestCompacted) {
+ if (!CanAppend()) {
+ fprintf(stderr, "skipping test because env does not support appending\n");
+ return;
+ }
+ ASSERT_OK(Put("foo", "bar"));
+ Close();
+ std::string old_manifest = ManifestFileName();
+
+ // Pad with zeroes to make manifest file very big.
+ // The padding brings the file to exactly 3*1048576 bytes in total.
+ {
+ uint64_t len = FileSize(old_manifest);
+ WritableFile* file;
+ ASSERT_OK(env()->NewAppendableFile(old_manifest, &file));
+ std::string zeroes(3*1048576 - static_cast<size_t>(len), 0);
+ ASSERT_OK(file->Append(zeroes));
+ ASSERT_OK(file->Flush());
+ delete file;
+ }
+
+ // The first reopen must switch to a different MANIFEST under 10000 bytes.
+ Open();
+ std::string new_manifest = ManifestFileName();
+ ASSERT_NE(old_manifest, new_manifest);
+ ASSERT_GT(10000, FileSize(new_manifest));
+ ASSERT_EQ("bar", Get("foo"));
+
+ // The second reopen must keep the new MANIFEST.
+ Open();
+ ASSERT_EQ(new_manifest, ManifestFileName());
+ ASSERT_EQ("bar", Get("foo"));
+}
+
+// If the only log file is deleted before the database is reopened, the open
+// must still succeed and the value that lived only in that log is gone.
+TEST(RecoveryTest, NoLogFiles) {
+ ASSERT_OK(Put("foo", "bar"));
+ // The log is removed while the DB is still open; Open() closes it first.
+ ASSERT_EQ(1, DeleteLogFiles());
+ Open();
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ Open();
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+}
+
+// Reopening must keep using the same log file, both when that log is empty
+// (iteration 0, after compacting the memtable) and when it still holds data
+// (iteration 1). Skipped when the env cannot append to existing files.
+TEST(RecoveryTest, LogFileReuse) {
+ if (!CanAppend()) {
+ fprintf(stderr, "skipping test because env does not support appending\n");
+ return;
+ }
+ for (int i = 0; i < 2; i++) {
+ ASSERT_OK(Put("foo", "bar"));
+ if (i == 0) {
+ // Compact to ensure current log is empty
+ CompactMemTable();
+ }
+ Close();
+ ASSERT_EQ(1, NumLogs());
+ uint64_t number = FirstLogFile();
+ if (i == 0) {
+ ASSERT_EQ(0, FileSize(LogName(number)));
+ } else {
+ ASSERT_LT(0, FileSize(LogName(number)));
+ }
+ // Two consecutive reopens must both keep the same log number and data.
+ Open();
+ ASSERT_EQ(1, NumLogs());
+ ASSERT_EQ(number, FirstLogFile()) << "did not reuse log file";
+ ASSERT_EQ("bar", Get("foo"));
+ Open();
+ ASSERT_EQ(1, NumLogs());
+ ASSERT_EQ(number, FirstLogFile()) << "did not reuse log file";
+ ASSERT_EQ("bar", Get("foo"));
+ }
+}
+
+// Recovering a log that does not fit into a single write buffer must produce
+// at least two table files, must switch to a new log file rather than reuse
+// the old one, and must preserve every key.
+TEST(RecoveryTest, MultipleMemTables) {
+ // Make a large log.
+ // Each key and value is the same 50-character, zero-padded decimal string.
+ const int kNum = 1000;
+ for (int i = 0; i < kNum; i++) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%050d", i);
+ ASSERT_OK(Put(buf, buf));
+ }
+ ASSERT_EQ(0, NumTables());
+ Close();
+ ASSERT_EQ(0, NumTables());
+ ASSERT_EQ(1, NumLogs());
+ uint64_t old_log_file = FirstLogFile();
+
+ // Force creation of multiple memtables by reducing the write buffer size.
+ Options opt;
+ opt.reuse_logs = true;
+ opt.write_buffer_size = (kNum*100) / 2;
+ Open(&opt);
+ ASSERT_LE(2, NumTables());
+ ASSERT_EQ(1, NumLogs());
+ ASSERT_NE(old_log_file, FirstLogFile()) << "must not reuse log";
+ for (int i = 0; i < kNum; i++) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%050d", i);
+ ASSERT_EQ(buf, Get(buf));
+ }
+}
+
+// Recovery with several log files present must replay all of them, must
+// leave a single log behind, and must ignore a log file with an old number
+// that appears after recovery has already moved past it.
+TEST(RecoveryTest, MultipleLogFiles) {
+ ASSERT_OK(Put("foo", "bar"));
+ Close();
+ ASSERT_EQ(1, NumLogs());
+
+ // Make a bunch of uncompacted log files.
+ // The last one overwrites "foo" with a higher sequence number.
+ uint64_t old_log = FirstLogFile();
+ MakeLogFile(old_log+1, 1000, "hello", "world");
+ MakeLogFile(old_log+2, 1001, "hi", "there");
+ MakeLogFile(old_log+3, 1002, "foo", "bar2");
+
+ // Recover and check that all log files were processed.
+ Open();
+ ASSERT_LE(1, NumTables());
+ ASSERT_EQ(1, NumLogs());
+ uint64_t new_log = FirstLogFile();
+ ASSERT_LE(old_log+3, new_log);
+ ASSERT_EQ("bar2", Get("foo"));
+ ASSERT_EQ("world", Get("hello"));
+ ASSERT_EQ("there", Get("hi"));
+
+ // Test that previous recovery produced recoverable state.
+ // The log number is only required to stay the same when the env can append.
+ Open();
+ ASSERT_LE(1, NumTables());
+ ASSERT_EQ(1, NumLogs());
+ if (CanAppend()) {
+ ASSERT_EQ(new_log, FirstLogFile());
+ }
+ ASSERT_EQ("bar2", Get("foo"));
+ ASSERT_EQ("world", Get("hello"));
+ ASSERT_EQ("there", Get("hi"));
+
+ // Check that introducing an older log file does not cause it to be re-read.
+ Close();
+ MakeLogFile(old_log+1, 2000, "hello", "stale write");
+ Open();
+ ASSERT_LE(1, NumTables());
+ ASSERT_EQ(1, NumLogs());
+ if (CanAppend()) {
+ ASSERT_EQ(new_log, FirstLogFile());
+ }
+ ASSERT_EQ("bar2", Get("foo"));
+ ASSERT_EQ("world", Get("hello"));
+ ASSERT_EQ("there", Get("hi"));
+}
+
+} // namespace leveldb
+
+// Test binary entry point: hands control to the leveldb test harness and
+// returns its result as the process exit code. argc and argv are not used.
+int main(int argc, char** argv) {
+ return leveldb::test::RunAllTests();
+}
diff --git a/src/leveldb/db/skiplist.h b/src/leveldb/db/skiplist.h
index ed8b092203..8bd77764d8 100644
--- a/src/leveldb/db/skiplist.h
+++ b/src/leveldb/db/skiplist.h
@@ -1,10 +1,10 @@
-#ifndef STORAGE_LEVELDB_DB_SKIPLIST_H_
-#define STORAGE_LEVELDB_DB_SKIPLIST_H_
-
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
-//
+
+#ifndef STORAGE_LEVELDB_DB_SKIPLIST_H_
+#define STORAGE_LEVELDB_DB_SKIPLIST_H_
+
// Thread safety
// -------------
//
diff --git a/src/leveldb/db/skiplist_test.cc b/src/leveldb/db/skiplist_test.cc
index c78f4b4fb1..aee1461e1b 100644
--- a/src/leveldb/db/skiplist_test.cc
+++ b/src/leveldb/db/skiplist_test.cc
@@ -250,7 +250,7 @@ class ConcurrentTest {
// Note that generation 0 is never inserted, so it is ok if
// <*,0,*> is missing.
ASSERT_TRUE((gen(pos) == 0) ||
- (gen(pos) > initial_state.Get(key(pos)))
+ (gen(pos) > static_cast<Key>(initial_state.Get(key(pos))))
) << "key: " << key(pos)
<< "; gen: " << gen(pos)
<< "; initgen: "
diff --git a/src/leveldb/db/snapshot.h b/src/leveldb/db/snapshot.h
index e7f8fd2c37..6ed413c42d 100644
--- a/src/leveldb/db/snapshot.h
+++ b/src/leveldb/db/snapshot.h
@@ -5,6 +5,7 @@
#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_
#define STORAGE_LEVELDB_DB_SNAPSHOT_H_
+#include "db/dbformat.h"
#include "leveldb/db.h"
namespace leveldb {
diff --git a/src/leveldb/db/version_set.cc b/src/leveldb/db/version_set.cc
index aa83df55e4..a5e0f77a6a 100644
--- a/src/leveldb/db/version_set.cc
+++ b/src/leveldb/db/version_set.cc
@@ -893,7 +893,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
return s;
}
-Status VersionSet::Recover() {
+Status VersionSet::Recover(bool *save_manifest) {
struct LogReporter : public log::Reader::Reporter {
Status* status;
virtual void Corruption(size_t bytes, const Status& s) {
@@ -1003,11 +1003,49 @@ Status VersionSet::Recover() {
last_sequence_ = last_sequence;
log_number_ = log_number;
prev_log_number_ = prev_log_number;
+
+ // See if we can reuse the existing MANIFEST file.
+ if (ReuseManifest(dscname, current)) {
+ // No need to save new manifest
+ } else {
+ *save_manifest = true;
+ }
}
return s;
}
+// Tries to keep appending to the existing MANIFEST instead of writing a new
+// one.
+//
+// dscname is the full path of the current MANIFEST; dscbase is its file name
+// as used for parsing out the file number. Returns true when the file was
+// opened for appending, in which case descriptor_file_, descriptor_log_ and
+// manifest_file_number_ are set. Returns false, leaving descriptor_log_
+// untouched, when reuse is disabled, the name is not a descriptor file, the
+// size cannot be read, the file has reached kTargetFileSize, or the env
+// cannot open it for appending.
+bool VersionSet::ReuseManifest(const std::string& dscname,
+                               const std::string& dscbase) {
+  // Reuse is opt-in.
+  if (!options_->reuse_logs) return false;
+
+  uint64_t number;
+  FileType type;
+  if (!ParseFileName(dscbase, &number, &type)) return false;
+  if (type != kDescriptorFile) return false;
+
+  uint64_t size;
+  if (!env_->GetFileSize(dscname, &size).ok()) return false;
+
+  // Make new compacted MANIFEST if old one is too big
+  if (size >= kTargetFileSize) return false;
+
+  assert(descriptor_file_ == NULL);
+  assert(descriptor_log_ == NULL);
+  Status open_status = env_->NewAppendableFile(dscname, &descriptor_file_);
+  if (open_status.ok()) {
+    Log(options_->info_log, "Reusing MANIFEST %s\n", dscname.c_str());
+    // Continue the log at the existing file length.
+    descriptor_log_ = new log::Writer(descriptor_file_, size);
+    manifest_file_number_ = number;
+    return true;
+  }
+
+  Log(options_->info_log, "Reuse MANIFEST: %s\n",
+      open_status.ToString().c_str());
+  assert(descriptor_file_ == NULL);
+  return false;
+}
+
void VersionSet::MarkFileNumberUsed(uint64_t number) {
if (next_file_number_ <= number) {
next_file_number_ = number + 1;
diff --git a/src/leveldb/db/version_set.h b/src/leveldb/db/version_set.h
index 8dc14b8e01..1dec745673 100644
--- a/src/leveldb/db/version_set.h
+++ b/src/leveldb/db/version_set.h
@@ -179,7 +179,7 @@ class VersionSet {
EXCLUSIVE_LOCKS_REQUIRED(mu);
// Recover the last saved descriptor from persistent storage.
- Status Recover();
+ Status Recover(bool *save_manifest);
// Return the current version.
Version* current() const { return current_; }
@@ -274,6 +274,8 @@ class VersionSet {
friend class Compaction;
friend class Version;
+ bool ReuseManifest(const std::string& dscname, const std::string& dscbase);
+
void Finalize(Version* v);
void GetRange(const std::vector<FileMetaData*>& inputs,
diff --git a/src/leveldb/db/write_batch_internal.h b/src/leveldb/db/write_batch_internal.h
index 310a3c8912..9448ef7b21 100644
--- a/src/leveldb/db/write_batch_internal.h
+++ b/src/leveldb/db/write_batch_internal.h
@@ -5,6 +5,7 @@
#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
+#include "db/dbformat.h"
#include "leveldb/write_batch.h"
namespace leveldb {