40 files changed, 12547 insertions, 0 deletions
diff --git a/src/leveldb/db/builder.cc b/src/leveldb/db/builder.cc
new file mode 100644
index 0000000000..f419882197
--- /dev/null
+++ b/src/leveldb/db/builder.cc
@@ -0,0 +1,88 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/builder.h"
+
+#include "db/filename.h"
+#include "db/dbformat.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/iterator.h"
+
+namespace leveldb {
+
+Status BuildTable(const std::string& dbname,
+                  Env* env,
+                  const Options& options,
+                  TableCache* table_cache,
+                  Iterator* iter,
+                  FileMetaData* meta) {
+  Status s;
+  meta->file_size = 0;
+  iter->SeekToFirst();
+
+  std::string fname = TableFileName(dbname, meta->number);
+  if (iter->Valid()) {
+    WritableFile* file;
+    s = env->NewWritableFile(fname, &file);
+    if (!s.ok()) {
+      return s;
+    }
+
+    TableBuilder* builder = new TableBuilder(options, file);
+    meta->smallest.DecodeFrom(iter->key());
+    for (; iter->Valid(); iter->Next()) {
+      Slice key = iter->key();
+      meta->largest.DecodeFrom(key);
+      builder->Add(key, iter->value());
+    }
+
+    // Finish and check for builder errors
+    if (s.ok()) {
+      s = builder->Finish();
+      if (s.ok()) {
+        meta->file_size = builder->FileSize();
+        assert(meta->file_size > 0);
+      }
+    } else {
+      builder->Abandon();
+    }
+    delete builder;
+
+    // Finish and check for file errors
+    if (s.ok()) {
+      s = file->Sync();
+    }
+    if (s.ok()) {
+      s = file->Close();
+    }
+    delete file;
+    file = NULL;
+
+    if (s.ok()) {
+      // Verify that the table is usable
+      Iterator* it = table_cache->NewIterator(ReadOptions(),
+                                              meta->number,
+                                              meta->file_size);
+      s = it->status();
+      delete it;
+    }
+  }
+
+  // Check for input iterator errors
+  if (!iter->status().ok()) {
+    s = iter->status();
+  }
+
+  if (s.ok() && meta->file_size > 0) {
+    // Keep it
+  } else {
+    env->DeleteFile(fname);
+  }
+  return s;
+}
+
+}  // namespace leveldb
diff --git a/src/leveldb/db/builder.h b/src/leveldb/db/builder.h
new file mode 100644
index 0000000000..62431fcf44
--- /dev/null
+++ b/src/leveldb/db/builder.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_BUILDER_H_
+#define STORAGE_LEVELDB_DB_BUILDER_H_
+
+#include "leveldb/status.h"
+
+namespace leveldb {
+
+struct Options;
+struct FileMetaData;
+
+class Env;
+class Iterator;
+class TableCache;
+class VersionEdit;
+
+// Build a Table file from the contents of *iter.  The generated file
+// will be named according to meta->number.  On success, the rest of
+// *meta will be filled with metadata about the generated table.
+// If no data is present in *iter, meta->file_size will be set to
+// zero, and no Table file will be produced.
+extern Status BuildTable(const std::string& dbname,
+                         Env* env,
+                         const Options& options,
+                         TableCache* table_cache,
+                         Iterator* iter,
+                         FileMetaData* meta);
+
+}  // namespace leveldb
+
+#endif  // STORAGE_LEVELDB_DB_BUILDER_H_
diff --git a/src/leveldb/db/c.cc b/src/leveldb/db/c.cc
new file mode 100644
index 0000000000..2dde400e77
--- /dev/null
+++ b/src/leveldb/db/c.cc
@@ -0,0 +1,581 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/c.h"
+
+#include <stdlib.h>
+#include <unistd.h>
+#include "leveldb/cache.h"
+#include "leveldb/comparator.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/filter_policy.h"
+#include "leveldb/iterator.h"
+#include "leveldb/options.h"
+#include "leveldb/status.h"
+#include "leveldb/write_batch.h"
+
+using leveldb::Cache;
+using leveldb::Comparator;
+using leveldb::CompressionType;
+using leveldb::DB;
+using leveldb::Env;
+using leveldb::FileLock;
+using leveldb::FilterPolicy;
+using leveldb::Iterator;
+using leveldb::Logger;
+using leveldb::NewBloomFilterPolicy;
+using leveldb::NewLRUCache;
+using leveldb::Options;
+using leveldb::RandomAccessFile;
+using leveldb::Range;
+using leveldb::ReadOptions;
+using leveldb::SequentialFile;
+using leveldb::Slice;
+using leveldb::Snapshot;
+using leveldb::Status;
+using leveldb::WritableFile;
+using leveldb::WriteBatch;
+using leveldb::WriteOptions;
+
+extern "C" {
+
+struct leveldb_t              { DB*               rep; };
+struct leveldb_iterator_t     { Iterator*         rep; };
+struct leveldb_writebatch_t   { WriteBatch        rep; };
+struct leveldb_snapshot_t     { const Snapshot*   rep; };
+struct leveldb_readoptions_t  { ReadOptions       rep; };
+struct leveldb_writeoptions_t { WriteOptions      rep; };
+struct leveldb_options_t      { Options           rep; };
+struct leveldb_cache_t        { Cache*            rep; };
+struct leveldb_seqfile_t      { SequentialFile*   rep; };
+struct leveldb_randomfile_t   { RandomAccessFile* rep; };
+struct leveldb_writablefile_t { WritableFile*     rep; };
+struct leveldb_logger_t       { Logger*           rep; };
+struct leveldb_filelock_t     { FileLock*         rep; };
+
+struct leveldb_comparator_t : public Comparator {
+  void* state_;
+  void (*destructor_)(void*);
+  int (*compare_)(
+      void*,
+      const char* a, size_t alen,
+      const char* b, size_t blen);
+  const char* (*name_)(void*);
+
+  virtual ~leveldb_comparator_t() {
+    (*destructor_)(state_);
+  }
+
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    return (*compare_)(state_, a.data(), a.size(), b.data(), b.size());
+  }
+
+  virtual const char* Name() const {
+    return (*name_)(state_);
+  }
+
+  // No-ops since the C binding does not support key shortening methods.
+  virtual void FindShortestSeparator(std::string*, const Slice&) const { }
+  virtual void FindShortSuccessor(std::string* key) const { }
+};
+
+struct leveldb_filterpolicy_t : public FilterPolicy {
+  void* state_;
+  void (*destructor_)(void*);
+  const char* (*name_)(void*);
+  char* (*create_)(
+      void*,
+      const char* const* key_array, const size_t* key_length_array,
+      int num_keys,
+      size_t* filter_length);
+  unsigned char (*key_match_)(
+      void*,
+      const char* key, size_t length,
+      const char* filter, size_t filter_length);
+
+  virtual ~leveldb_filterpolicy_t() {
+    (*destructor_)(state_);
+  }
+
+  virtual const char* Name() const {
+    return (*name_)(state_);
+  }
+
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+    std::vector<const char*> key_pointers(n);
+    std::vector<size_t> key_sizes(n);
+    for (int i = 0; i < n; i++) {
+      key_pointers[i] = keys[i].data();
+      key_sizes[i] = keys[i].size();
+    }
+    size_t len;
+    char* filter = (*create_)(state_, &key_pointers[0], &key_sizes[0], n, &len);
+    dst->append(filter, len);
+    free(filter);
+  }
+
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+    return (*key_match_)(state_, key.data(), key.size(),
+                         filter.data(), filter.size());
+  }
+};
+
+struct leveldb_env_t {
+  Env* rep;
+  bool is_default;
+};
+
+static bool SaveError(char** errptr, const Status& s) {
+  assert(errptr != NULL);
+  if (s.ok()) {
+    return false;
+  } else if (*errptr == NULL) {
+    *errptr = strdup(s.ToString().c_str());
+  } else {
+    // TODO(sanjay): Merge with existing error?
+    free(*errptr);
+    *errptr = strdup(s.ToString().c_str());
+  }
+  return true;
+}
+
+static char* CopyString(const std::string& str) {
+  char* result = reinterpret_cast<char*>(malloc(sizeof(char) * str.size()));
+  memcpy(result, str.data(), sizeof(char) * str.size());
+  return result;
+}
+
+leveldb_t* leveldb_open(
+    const leveldb_options_t* options,
+    const char* name,
+    char** errptr) {
+  DB* db;
+  if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) {
+    return NULL;
+  }
+  leveldb_t* result = new leveldb_t;
+  result->rep = db;
+  return result;
+}
+
+void leveldb_close(leveldb_t* db) {
+  delete db->rep;
+  delete db;
+}
+
+void leveldb_put(
+    leveldb_t* db,
+    const leveldb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    const char* val, size_t vallen,
+    char** errptr) {
+  SaveError(errptr,
+            db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
+}
+
+void leveldb_delete(
+    leveldb_t* db,
+    const leveldb_writeoptions_t* options,
+    const char* key, size_t keylen,
+    char** errptr) {
+  SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen)));
+}
+
+
+void leveldb_write(
+    leveldb_t* db,
+    const leveldb_writeoptions_t* options,
+    leveldb_writebatch_t* batch,
+    char** errptr) {
+  SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
+}
+
+char* leveldb_get(
+    leveldb_t* db,
+    const leveldb_readoptions_t* options,
+    const char* key, size_t keylen,
+    size_t* vallen,
+    char** errptr) {
+  char* result = NULL;
+  std::string tmp;
+  Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
+  if (s.ok()) {
+    *vallen = tmp.size();
+    result = CopyString(tmp);
+  } else {
+    *vallen = 0;
+    if (!s.IsNotFound()) {
+      SaveError(errptr, s);
+    }
+  }
+  return result;
+}
+
+leveldb_iterator_t* leveldb_create_iterator(
+    leveldb_t* db,
+    const leveldb_readoptions_t* options) {
+  leveldb_iterator_t* result = new leveldb_iterator_t;
+  result->rep = db->rep->NewIterator(options->rep);
+  return result;
+}
+
+const leveldb_snapshot_t* leveldb_create_snapshot(
+    leveldb_t* db) {
+  leveldb_snapshot_t* result = new leveldb_snapshot_t;
+  result->rep = db->rep->GetSnapshot();
+  return result;
+}
+
+void leveldb_release_snapshot(
+    leveldb_t* db,
+    const leveldb_snapshot_t* snapshot) {
+  db->rep->ReleaseSnapshot(snapshot->rep);
+  delete snapshot;
+}
+
+char* leveldb_property_value(
+    leveldb_t* db,
+    const char* propname) {
+  std::string tmp;
+  if (db->rep->GetProperty(Slice(propname), &tmp)) {
+    // We use strdup() since we expect human readable output.
+    return strdup(tmp.c_str());
+  } else {
+    return NULL;
+  }
+}
+
+void leveldb_approximate_sizes(
+    leveldb_t* db,
+    int num_ranges,
+    const char* const* range_start_key, const size_t* range_start_key_len,
+    const char* const* range_limit_key, const size_t* range_limit_key_len,
+    uint64_t* sizes) {
+  Range* ranges = new Range[num_ranges];
+  for (int i = 0; i < num_ranges; i++) {
+    ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+    ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+  }
+  db->rep->GetApproximateSizes(ranges, num_ranges, sizes);
+  delete[] ranges;
+}
+
+void leveldb_compact_range(
+    leveldb_t* db,
+    const char* start_key, size_t start_key_len,
+    const char* limit_key, size_t limit_key_len) {
+  Slice a, b;
+  db->rep->CompactRange(
+      // Pass NULL Slice if corresponding "const char*" is NULL
+      (start_key ? (a = Slice(start_key, start_key_len), &a) : NULL),
+      (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : NULL));
+}
+
+void leveldb_destroy_db(
+    const leveldb_options_t* options,
+    const char* name,
+    char** errptr) {
+  SaveError(errptr, DestroyDB(name, options->rep));
+}
+
+void leveldb_repair_db(
+    const leveldb_options_t* options,
+    const char* name,
+    char** errptr) {
+  SaveError(errptr, RepairDB(name, options->rep));
+}
+
+void leveldb_iter_destroy(leveldb_iterator_t* iter) {
+  delete iter->rep;
+  delete iter;
+}
+
+unsigned char leveldb_iter_valid(const leveldb_iterator_t* iter) {
+  return iter->rep->Valid();
+}
+
+void leveldb_iter_seek_to_first(leveldb_iterator_t* iter) {
+  iter->rep->SeekToFirst();
+}
+
+void leveldb_iter_seek_to_last(leveldb_iterator_t* iter) {
+  iter->rep->SeekToLast();
+}
+
+void leveldb_iter_seek(leveldb_iterator_t* iter, const char* k, size_t klen) {
+  iter->rep->Seek(Slice(k, klen));
+}
+
+void leveldb_iter_next(leveldb_iterator_t* iter) {
+  iter->rep->Next();
+}
+
+void leveldb_iter_prev(leveldb_iterator_t* iter) {
+  iter->rep->Prev();
+}
+
+const char* leveldb_iter_key(const leveldb_iterator_t* iter, size_t* klen) {
+  Slice s = iter->rep->key();
+  *klen = s.size();
+  return s.data();
+}
+
+const char* leveldb_iter_value(const leveldb_iterator_t* iter, size_t* vlen) {
+  Slice s = iter->rep->value();
+  *vlen = s.size();
+  return s.data();
+}
+
+void leveldb_iter_get_error(const leveldb_iterator_t* iter, char** errptr) {
+  SaveError(errptr, iter->rep->status());
+}
+
+leveldb_writebatch_t* leveldb_writebatch_create() {
+  return new leveldb_writebatch_t;
+}
+
+void leveldb_writebatch_destroy(leveldb_writebatch_t* b) {
+  delete b;
+}
+
+void leveldb_writebatch_clear(leveldb_writebatch_t* b) {
+  b->rep.Clear();
+}
+
+void leveldb_writebatch_put(
+    leveldb_writebatch_t* b,
+    const char* key, size_t klen,
+    const char* val, size_t vlen) {
+  b->rep.Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void leveldb_writebatch_delete(
+    leveldb_writebatch_t* b,
+    const char* key, size_t klen) {
+  b->rep.Delete(Slice(key, klen));
+}
+
+void leveldb_writebatch_iterate(
+    leveldb_writebatch_t* b,
+    void* state,
+    void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+    void (*deleted)(void*, const char* k, size_t klen)) {
+  class H : public WriteBatch::Handler {
+   public:
+    void* state_;
+    void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
+    void (*deleted_)(void*, const char* k, size_t klen);
+    virtual void Put(const Slice& key, const Slice& value) {
+      (*put_)(state_, key.data(), key.size(), value.data(), value.size());
+    }
+    virtual void Delete(const Slice& key) {
+      (*deleted_)(state_, key.data(), key.size());
+    }
+  };
+  H handler;
+  handler.state_ = state;
+  handler.put_ = put;
+  handler.deleted_ = deleted;
+  b->rep.Iterate(&handler);
+}
+
+leveldb_options_t* leveldb_options_create() {
+  return new leveldb_options_t;
+}
+
+void leveldb_options_destroy(leveldb_options_t* options) {
+  delete options;
+}
+
+void leveldb_options_set_comparator(
+    leveldb_options_t* opt,
+    leveldb_comparator_t* cmp) {
+  opt->rep.comparator = cmp;
+}
+
+void leveldb_options_set_filter_policy(
+    leveldb_options_t* opt,
+    leveldb_filterpolicy_t* policy) {
+  opt->rep.filter_policy = policy;
+}
+
+void leveldb_options_set_create_if_missing(
+    leveldb_options_t* opt, unsigned char v) {
+  opt->rep.create_if_missing = v;
+}
+
+void leveldb_options_set_error_if_exists(
+    leveldb_options_t* opt, unsigned char v) {
+  opt->rep.error_if_exists = v;
+}
+
+void leveldb_options_set_paranoid_checks(
+    leveldb_options_t* opt, unsigned char v) {
+  opt->rep.paranoid_checks = v;
+}
+
+void leveldb_options_set_env(leveldb_options_t* opt, leveldb_env_t* env) {
+  opt->rep.env = (env ? env->rep : NULL);
+}
+
+void leveldb_options_set_info_log(leveldb_options_t* opt, leveldb_logger_t* l) {
+  opt->rep.info_log = (l ? l->rep : NULL);
+}
+
+void leveldb_options_set_write_buffer_size(leveldb_options_t* opt, size_t s) {
+  opt->rep.write_buffer_size = s;
+}
+
+void leveldb_options_set_max_open_files(leveldb_options_t* opt, int n) {
+  opt->rep.max_open_files = n;
+}
+
+void leveldb_options_set_cache(leveldb_options_t* opt, leveldb_cache_t* c) {
+  opt->rep.block_cache = c->rep;
+}
+
+void leveldb_options_set_block_size(leveldb_options_t* opt, size_t s) {
+  opt->rep.block_size = s;
+}
+
+void leveldb_options_set_block_restart_interval(leveldb_options_t* opt, int n) {
+  opt->rep.block_restart_interval = n;
+}
+
+void leveldb_options_set_compression(leveldb_options_t* opt, int t) {
+  opt->rep.compression = static_cast<CompressionType>(t);
+}
+
+leveldb_comparator_t* leveldb_comparator_create(
+    void* state,
+    void (*destructor)(void*),
+    int (*compare)(
+        void*,
+        const char* a, size_t alen,
+        const char* b, size_t blen),
+    const char* (*name)(void*)) {
+  leveldb_comparator_t* result = new leveldb_comparator_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->compare_ = compare;
+  result->name_ = name;
+  return result;
+}
+
+void leveldb_comparator_destroy(leveldb_comparator_t* cmp) {
+  delete cmp;
+}
+
+leveldb_filterpolicy_t* leveldb_filterpolicy_create(
+    void* state,
+    void (*destructor)(void*),
+    char* (*create_filter)(
+        void*,
+        const char* const* key_array, const size_t* key_length_array,
+        int num_keys,
+        size_t* filter_length),
+    unsigned char (*key_may_match)(
+        void*,
+        const char* key, size_t length,
+        const char* filter, size_t filter_length),
+    const char* (*name)(void*)) {
+  leveldb_filterpolicy_t* result = new leveldb_filterpolicy_t;
+  result->state_ = state;
+  result->destructor_ = destructor;
+  result->create_ = create_filter;
+  result->key_match_ = key_may_match;
+  result->name_ = name;
+  return result;
+}
+
+void leveldb_filterpolicy_destroy(leveldb_filterpolicy_t* filter) {
+  delete filter;
+}
+
+leveldb_filterpolicy_t* leveldb_filterpolicy_create_bloom(int bits_per_key) {
+  // Make a leveldb_filterpolicy_t, but override all of its methods so
+  // they delegate to a NewBloomFilterPolicy() instead of user
+  // supplied C functions.
+  struct Wrapper : public leveldb_filterpolicy_t {
+    const FilterPolicy* rep_;
+    ~Wrapper() { delete rep_; }
+    const char* Name() const { return rep_->Name(); }
+    void CreateFilter(const Slice* keys, int n, std::string* dst) const {
+      return rep_->CreateFilter(keys, n, dst);
+    }
+    bool KeyMayMatch(const Slice& key, const Slice& filter) const {
+      return rep_->KeyMayMatch(key, filter);
+    }
+    static void DoNothing(void*) { }
+  };
+  Wrapper* wrapper = new Wrapper;
+  wrapper->rep_ = NewBloomFilterPolicy(bits_per_key);
+  wrapper->state_ = NULL;
+  wrapper->destructor_ = &Wrapper::DoNothing;
+  return wrapper;
+}
+
+leveldb_readoptions_t* leveldb_readoptions_create() {
+  return new leveldb_readoptions_t;
+}
+
+void leveldb_readoptions_destroy(leveldb_readoptions_t* opt) {
+  delete opt;
+}
+
+void leveldb_readoptions_set_verify_checksums(
+    leveldb_readoptions_t* opt,
+    unsigned char v) {
+  opt->rep.verify_checksums = v;
+}
+
+void leveldb_readoptions_set_fill_cache(
+    leveldb_readoptions_t* opt, unsigned char v) {
+  opt->rep.fill_cache = v;
+}
+
+void leveldb_readoptions_set_snapshot(
+    leveldb_readoptions_t* opt,
+    const leveldb_snapshot_t* snap) {
+  opt->rep.snapshot = (snap ? snap->rep : NULL);
+}
+
+leveldb_writeoptions_t* leveldb_writeoptions_create() {
+  return new leveldb_writeoptions_t;
+}
+
+void leveldb_writeoptions_destroy(leveldb_writeoptions_t* opt) {
+  delete opt;
+}
+
+void leveldb_writeoptions_set_sync(
+    leveldb_writeoptions_t* opt, unsigned char v) {
+  opt->rep.sync = v;
+}
+
+leveldb_cache_t* leveldb_cache_create_lru(size_t capacity) {
+  leveldb_cache_t* c = new leveldb_cache_t;
+  c->rep = NewLRUCache(capacity);
+  return c;
+}
+
+void leveldb_cache_destroy(leveldb_cache_t* cache) {
+  delete cache->rep;
+  delete cache;
+}
+
+leveldb_env_t* leveldb_create_default_env() {
+  leveldb_env_t* result = new leveldb_env_t;
+  result->rep = Env::Default();
+  result->is_default = true;
+  return result;
+}
+
+void leveldb_env_destroy(leveldb_env_t* env) {
+  if (!env->is_default) delete env->rep;
+  delete env;
+}
+
+}  // end extern "C"
diff --git a/src/leveldb/db/c_test.c b/src/leveldb/db/c_test.c
new file mode 100644
index 0000000000..9792447157
--- /dev/null
+++ b/src/leveldb/db/c_test.c
@@ -0,0 +1,381 @@
+/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+   Use of this source code is governed by a BSD-style license that can be
+   found in the LICENSE file. See the AUTHORS file for names of contributors. */
+
+#include "leveldb/c.h"
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+const char* phase = "";
+static char dbname[200];
+
+static void StartPhase(const char* name) {
+  fprintf(stderr, "=== Test %s\n", name);
+  phase = name;
+}
+
+static const char* GetTempDir(void) {
+    const char* ret = getenv("TEST_TMPDIR");
+    if (ret == NULL || ret[0] == '\0')
+        ret = "/tmp";
+    return ret;
+}
+
+#define CheckNoError(err)                                               \
+  if ((err) != NULL) {                                                  \
+    fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \
+    abort();                                                            \
+  }
+
+#define CheckCondition(cond)                                            \
+  if (!(cond)) {                                                        \
+    fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
+    abort();                                                            \
+  }
+
+static void CheckEqual(const char* expected, const char* v, size_t n) {
+  if (expected == NULL && v == NULL) {
+    // ok
+  } else if (expected != NULL && v != NULL && n == strlen(expected) &&
+             memcmp(expected, v, n) == 0) {
+    // ok
+    return;
+  } else {
+    fprintf(stderr, "%s: expected '%s', got '%s'\n",
+            phase,
+            (expected ? expected : "(null)"),
+            (v ? v : "(null"));
+    abort();
+  }
+}
+
+static void Free(char** ptr) {
+  if (*ptr) {
+    free(*ptr);
+    *ptr = NULL;
+  }
+}
+
+static void CheckGet(
+    leveldb_t* db,
+    const leveldb_readoptions_t* options,
+    const char* key,
+    const char* expected) {
+  char* err = NULL;
+  size_t val_len;
+  char* val;
+  val = leveldb_get(db, options, key, strlen(key), &val_len, &err);
+  CheckNoError(err);
+  CheckEqual(expected, val, val_len);
+  Free(&val);
+}
+
+static void CheckIter(leveldb_iterator_t* iter,
+                      const char* key, const char* val) {
+  size_t len;
+  const char* str;
+  str = leveldb_iter_key(iter, &len);
+  CheckEqual(key, str, len);
+  str = leveldb_iter_value(iter, &len);
+  CheckEqual(val, str, len);
+}
+
+// Callback from leveldb_writebatch_iterate()
+static void CheckPut(void* ptr,
+                     const char* k, size_t klen,
+                     const char* v, size_t vlen) {
+  int* state = (int*) ptr;
+  CheckCondition(*state < 2);
+  switch (*state) {
+    case 0:
+      CheckEqual("bar", k, klen);
+      CheckEqual("b", v, vlen);
+      break;
+    case 1:
+      CheckEqual("box", k, klen);
+      CheckEqual("c", v, vlen);
+      break;
+  }
+  (*state)++;
+}
+
+// Callback from leveldb_writebatch_iterate()
+static void CheckDel(void* ptr, const char* k, size_t klen) {
+  int* state = (int*) ptr;
+  CheckCondition(*state == 2);
+  CheckEqual("bar", k, klen);
+  (*state)++;
+}
+
+static void CmpDestroy(void* arg) { }
+
+static int CmpCompare(void* arg, const char* a, size_t alen,
+                      const char* b, size_t blen) {
+  int n = (alen < blen) ? alen : blen;
+  int r = memcmp(a, b, n);
+  if (r == 0) {
+    if (alen < blen) r = -1;
+    else if (alen > blen) r = +1;
+  }
+  return r;
+}
+
+static const char* CmpName(void* arg) {
+  return "foo";
+}
+
+// Custom filter policy
+static unsigned char fake_filter_result = 1;
+static void FilterDestroy(void* arg) { }
+static const char* FilterName(void* arg) {
+  return "TestFilter";
+}
+static char* FilterCreate(
+    void* arg,
+    const char* const* key_array, const size_t* key_length_array,
+    int num_keys,
+    size_t* filter_length) {
+  *filter_length = 4;
+  char* result = malloc(4);
+  memcpy(result, "fake", 4);
+  return result;
+}
+unsigned char FilterKeyMatch(
+    void* arg,
+    const char* key, size_t length,
+    const char* filter, size_t filter_length) {
+  CheckCondition(filter_length == 4);
+  CheckCondition(memcmp(filter, "fake", 4) == 0);
+  return fake_filter_result;
+}
+
+int main(int argc, char** argv) {
+  leveldb_t* db;
+  leveldb_comparator_t* cmp;
+  leveldb_cache_t* cache;
+  leveldb_env_t* env;
+  leveldb_options_t* options;
+  leveldb_readoptions_t* roptions;
+  leveldb_writeoptions_t* woptions;
+  char* err = NULL;
+  int run = -1;
+
+  snprintf(dbname, sizeof(dbname),
+           "%s/leveldb_c_test-%d",
+           GetTempDir(),
+           ((int) geteuid()));
+
+  StartPhase("create_objects");
+  cmp = leveldb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
+  env = leveldb_create_default_env();
+  cache = leveldb_cache_create_lru(100000);
+
+  options = leveldb_options_create();
+  leveldb_options_set_comparator(options, cmp);
+  leveldb_options_set_error_if_exists(options, 1);
+  leveldb_options_set_cache(options, cache);
+  leveldb_options_set_env(options, env);
+  leveldb_options_set_info_log(options, NULL);
+  leveldb_options_set_write_buffer_size(options, 100000);
+  leveldb_options_set_paranoid_checks(options, 1);
+  leveldb_options_set_max_open_files(options, 10);
+  leveldb_options_set_block_size(options, 1024);
+  leveldb_options_set_block_restart_interval(options, 8);
+  leveldb_options_set_compression(options, leveldb_no_compression);
+
+  roptions = leveldb_readoptions_create();
+  leveldb_readoptions_set_verify_checksums(roptions, 1);
+  leveldb_readoptions_set_fill_cache(roptions, 0);
+
+  woptions = leveldb_writeoptions_create();
+  leveldb_writeoptions_set_sync(woptions, 1);
+
+  StartPhase("destroy");
+  leveldb_destroy_db(options, dbname, &err);
+  Free(&err);
+
+  StartPhase("open_error");
+  db = leveldb_open(options, dbname, &err);
+  CheckCondition(err != NULL);
+  Free(&err);
+
+  StartPhase("open");
+  leveldb_options_set_create_if_missing(options, 1);
+  db = leveldb_open(options, dbname, &err);
+  CheckNoError(err);
+  CheckGet(db, roptions, "foo", NULL);
+
+  StartPhase("put");
+  leveldb_put(db, woptions, "foo", 3, "hello", 5, &err);
+  CheckNoError(err);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("compactall");
+  leveldb_compact_range(db, NULL, 0, NULL, 0);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("compactrange");
+  leveldb_compact_range(db, "a", 1, "z", 1);
+  CheckGet(db, roptions, "foo", "hello");
+
+  StartPhase("writebatch");
+  {
+    leveldb_writebatch_t* wb = leveldb_writebatch_create();
+    leveldb_writebatch_put(wb, "foo", 3, "a", 1);
+    leveldb_writebatch_clear(wb);
+    leveldb_writebatch_put(wb, "bar", 3, "b", 1);
+    leveldb_writebatch_put(wb, "box", 3, "c", 1);
+    leveldb_writebatch_delete(wb, "bar", 3);
+    leveldb_write(db, woptions, wb, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", "hello");
+    CheckGet(db, roptions, "bar", NULL);
+    CheckGet(db, roptions, "box", "c");
+    int pos = 0;
+    leveldb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
+    CheckCondition(pos == 3);
+    leveldb_writebatch_destroy(wb);
+  }
+
+  StartPhase("iter");
+  {
+    leveldb_iterator_t* iter = leveldb_create_iterator(db, roptions);
+    CheckCondition(!leveldb_iter_valid(iter));
+    leveldb_iter_seek_to_first(iter);
+    CheckCondition(leveldb_iter_valid(iter));
+    CheckIter(iter, "box", "c");
+    leveldb_iter_next(iter);
+    CheckIter(iter, "foo", "hello");
+    leveldb_iter_prev(iter);
+    CheckIter(iter, "box", "c");
+    leveldb_iter_prev(iter);
+    CheckCondition(!leveldb_iter_valid(iter));
+    leveldb_iter_seek_to_last(iter);
+    CheckIter(iter, "foo", "hello");
+    leveldb_iter_seek(iter, "b", 1);
+    CheckIter(iter, "box", "c");
+    leveldb_iter_get_error(iter, &err);
+    CheckNoError(err);
+    leveldb_iter_destroy(iter);
+  }
+
+  StartPhase("approximate_sizes");
+  {
+    int i;
+    int n = 20000;
+    char keybuf[100];
+    char valbuf[100];
+    uint64_t sizes[2];
+    const char* start[2] = { "a", "k00000000000000010000" };
+    size_t start_len[2] = { 1, 21 };
+    const char* limit[2] = { "k00000000000000010000", "z" };
+    size_t limit_len[2] = { 21, 1 };
+    leveldb_writeoptions_set_sync(woptions, 0);
+    for (i = 0; i < n; i++) {
+      snprintf(keybuf, sizeof(keybuf), "k%020d", i);
+      snprintf(valbuf, sizeof(valbuf), "v%020d", i);
+      leveldb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf),
+                  &err);
+      CheckNoError(err);
+    }
+    leveldb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes);
+    CheckCondition(sizes[0] > 0);
+    CheckCondition(sizes[1] > 0);
+  }
+
+  StartPhase("property");
+  {
+    char* prop = leveldb_property_value(db, "nosuchprop");
+    CheckCondition(prop == NULL);
+    prop = leveldb_property_value(db, "leveldb.stats");
+    CheckCondition(prop != NULL);
+    Free(&prop);
+  }
+
+  StartPhase("snapshot");
+  {
+    const leveldb_snapshot_t* snap;
+    snap = leveldb_create_snapshot(db);
+    leveldb_delete(db, woptions, "foo", 3, &err);
+    CheckNoError(err);
+    leveldb_readoptions_set_snapshot(roptions, snap);
+    CheckGet(db, roptions, "foo", "hello");
+    leveldb_readoptions_set_snapshot(roptions, NULL);
+    CheckGet(db, roptions, "foo", NULL);
+    leveldb_release_snapshot(db, snap);
+  }
+
+  StartPhase("repair");
+  {
+    leveldb_close(db);
+    leveldb_options_set_create_if_missing(options, 0);
+    leveldb_options_set_error_if_exists(options, 0);
+    leveldb_repair_db(options, dbname, &err);
+    CheckNoError(err);
+    db = leveldb_open(options, dbname, &err);
+    CheckNoError(err);
+    CheckGet(db, roptions, "foo", NULL);
+    CheckGet(db, roptions, "bar", NULL);
+    CheckGet(db, roptions, "box", "c");
+    leveldb_options_set_create_if_missing(options, 1);
+    leveldb_options_set_error_if_exists(options, 1);
+  }
+
+  StartPhase("filter");
+  for (run = 0; run < 2; run++) {
+    // First run uses custom filter, second run uses bloom filter
+    CheckNoError(err);
+    leveldb_filterpolicy_t* policy;
+    if (run == 0) {
+      policy = leveldb_filterpolicy_create(
+          NULL, FilterDestroy, FilterCreate, FilterKeyMatch, FilterName);
+    } else {
+      policy = leveldb_filterpolicy_create_bloom(10);
+    }
+
+    // Create new database
+    leveldb_close(db);
+    leveldb_destroy_db(options, dbname, &err);
+    leveldb_options_set_filter_policy(options, policy);
+    db = leveldb_open(options, dbname, &err);
+    CheckNoError(err);
+    leveldb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+    CheckNoError(err);
+    leveldb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
+    CheckNoError(err);
+    leveldb_compact_range(db, NULL, 0, NULL, 0);
+
+    fake_filter_result = 1;
+    CheckGet(db, roptions, "foo", "foovalue");
+    CheckGet(db, roptions, "bar", "barvalue");
+    if (phase == 0) {
+      // Must not find value when custom filter returns false
+      fake_filter_result = 0;
+      CheckGet(db, roptions, "foo", NULL);
+      CheckGet(db, roptions, "bar", NULL);
+      fake_filter_result = 1;
+
+      CheckGet(db, roptions, "foo", "foovalue");
+      CheckGet(db, roptions, "bar", "barvalue");
+    }
+    leveldb_options_set_filter_policy(options, NULL);
+    leveldb_filterpolicy_destroy(policy);
+  }
+
+  StartPhase("cleanup");
+  leveldb_close(db);
+  leveldb_options_destroy(options);
+  leveldb_readoptions_destroy(roptions);
+  leveldb_writeoptions_destroy(woptions);
+  leveldb_cache_destroy(cache);
+  leveldb_comparator_destroy(cmp);
+  leveldb_env_destroy(env);
+
+  fprintf(stderr, "PASS\n");
+  return 0;
+}
diff --git a/src/leveldb/db/corruption_test.cc b/src/leveldb/db/corruption_test.cc
new file mode 100644
index 0000000000..31b2d5f416
--- /dev/null
+++ b/src/leveldb/db/corruption_test.cc
@@ -0,0 +1,359 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/db.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include "leveldb/cache.h"
+#include "leveldb/env.h"
+#include "leveldb/table.h"
+#include "leveldb/write_batch.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+static const int kValueSize = 1000;
+
+class CorruptionTest {
+ public:
+  test::ErrorEnv env_;
+  std::string dbname_;
+  Cache* tiny_cache_;
+  Options options_;
+  DB* db_;
+
+  CorruptionTest() {
+    tiny_cache_ = NewLRUCache(100);
+    options_.env = &env_;
+    dbname_ = test::TmpDir() + "/db_test";
+    DestroyDB(dbname_, options_);
+
+    db_ = NULL;
+    options_.create_if_missing = true;
+    Reopen();
+    options_.create_if_missing = false;
+  }
+
+  ~CorruptionTest() {
+     delete db_;
+     DestroyDB(dbname_, Options());
+     delete tiny_cache_;
+  }
+
+  Status TryReopen(Options* options = NULL) {
+    delete db_;
+    db_ = NULL;
+    Options opt = (options ? *options : options_);
+    opt.env = &env_;
+    opt.block_cache = tiny_cache_;
+    return DB::Open(opt, dbname_, &db_);
+  }
+
+  void Reopen(Options* options = NULL) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void RepairDB() {
+    delete db_;
+    db_ = NULL;
+    ASSERT_OK(::leveldb::RepairDB(dbname_, options_));
+  }
+
+  void Build(int n) {
+    std::string key_space, value_space;
+    WriteBatch batch;
+    for (int i = 0; i < n; i++) {
+      //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
+      Slice key = Key(i, &key_space);
+      batch.Clear();
+      batch.Put(key, Value(i, &value_space));
+      ASSERT_OK(db_->Write(WriteOptions(), &batch));
+    }
+  }
+
+  void Check(int min_expected, int max_expected) {
+    int next_expected = 0;
+    int missed = 0;
+    int bad_keys = 0;
+    int bad_values = 0;
+    int correct = 0;
+    std::string value_space;
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      uint64_t key;
+      Slice in(iter->key());
+      if (!ConsumeDecimalNumber(&in, &key) ||
+          !in.empty() ||
+          key < next_expected) {
+        bad_keys++;
+        continue;
+      }
+      missed += (key - next_expected);
+      next_expected = key + 1;
+      if (iter->value() != Value(key, &value_space)) {
+        bad_values++;
+      } else {
+        correct++;
+      }
+    }
+    delete iter;
+
+    fprintf(stderr,
+            "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n",
+            min_expected, max_expected, correct, bad_keys, bad_values, missed);
+    ASSERT_LE(min_expected, correct);
+    ASSERT_GE(max_expected, correct);
+  }
+
+  void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
+    // Pick file to corrupt
+    std::vector<std::string> filenames;
+    ASSERT_OK(env_.GetChildren(dbname_, &filenames));
+    uint64_t number;
+    FileType type;
+    std::string fname;
+    int picked_number = -1;
+    for (int i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type) &&
+          type == filetype &&
+          int(number) > picked_number) {  // Pick latest file
+        fname = dbname_ + "/" + filenames[i];
+        picked_number = number;
+      }
+    }
+    ASSERT_TRUE(!fname.empty()) << filetype;
+
+    struct stat sbuf;
+    if (stat(fname.c_str(), &sbuf) != 0) {
+      const char* msg = strerror(errno);
+      ASSERT_TRUE(false) << fname << ": " << msg;
+    }
+
+    if (offset < 0) {
+      // Relative to end of file; make it absolute
+      if (-offset > sbuf.st_size) {
+        offset = 0;
+      } else {
+        offset = sbuf.st_size + offset;
+      }
+    }
+    if (offset > sbuf.st_size) {
+      offset = sbuf.st_size;
+    }
+    if (offset + bytes_to_corrupt > sbuf.st_size) {
+      bytes_to_corrupt = sbuf.st_size - offset;
+    }
+
+    // Do it
+    std::string contents;
+    Status s = ReadFileToString(Env::Default(), fname, &contents);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+    for (int i = 0; i < bytes_to_corrupt; i++) {
+      contents[i + offset] ^= 0x80;
+    }
+    s = WriteStringToFile(Env::Default(), contents, fname);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+  }
+
+  int Property(const std::string& name) {
+    std::string property;
+    int result;
+    if (db_->GetProperty(name, &property) &&
+        sscanf(property.c_str(), "%d", &result) == 1) {
+      return result;
+    } else {
+      return -1;
+    }
+  }
+
+  // Return the ith key
+  Slice Key(int i, std::string* storage) {
+    char buf[100];
+    snprintf(buf, sizeof(buf), "%016d", i);
+    storage->assign(buf, strlen(buf));
+    return Slice(*storage);
+  }
+
+  // Return the value to associate with the specified key
+  Slice Value(int k, std::string* storage) {
+    Random r(k);
+    return test::RandomString(&r, kValueSize, storage);
+  }
+};
+
+TEST(CorruptionTest, Recovery) {
+  Build(100);
+  Check(100, 100);
+  Corrupt(kLogFile, 19, 1);      // WriteBatch tag for first record
+  Corrupt(kLogFile, log::kBlockSize + 1000, 1);  // Somewhere in second block
+  Reopen();
+
+  // The 64 records in the first two log blocks are completely lost.
+  Check(36, 36);
+}
+
+TEST(CorruptionTest, RecoverWriteError) {
+  env_.writable_file_error_ = true;
+  Status s = TryReopen();
+  ASSERT_TRUE(!s.ok());
+}
+
+TEST(CorruptionTest, NewFileErrorDuringWrite) {
+  // Do enough writing to force minor compaction
+  env_.writable_file_error_ = true;
+  const int num = 3 + (Options().write_buffer_size / kValueSize);
+  std::string value_storage;
+  Status s;
+  for (int i = 0; s.ok() && i < num; i++) {
+    WriteBatch batch;
+    batch.Put("a", Value(100, &value_storage));
+    s = db_->Write(WriteOptions(), &batch);
+  }
+  ASSERT_TRUE(!s.ok());
+  ASSERT_GE(env_.num_writable_file_errors_, 1);
+  env_.writable_file_error_ = false;
+  Reopen();
+}
+
+TEST(CorruptionTest, TableFile) {
+  Build(100);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_CompactMemTable();
+  dbi->TEST_CompactRange(0, NULL, NULL);
+  dbi->TEST_CompactRange(1, NULL, NULL);
+
+  Corrupt(kTableFile, 100, 1);
+  Check(99, 99);
+}
+
+TEST(CorruptionTest, TableFileIndexData) {
+  Build(10000);  // Enough to build multiple Tables
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_CompactMemTable();
+
+  Corrupt(kTableFile, -2000, 500);
+  Reopen();
+  Check(5000, 9999);
+}
+
+TEST(CorruptionTest, MissingDescriptor) {
+  Build(1000);
+  RepairDB();
+  Reopen();
+  Check(1000, 1000);
+}
+
+TEST(CorruptionTest, SequenceNumberRecovery) {
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
+  RepairDB();
+  Reopen();
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("v5", v);
+  // Write something.  If sequence number was not recovered properly,
+  // it will be hidden by an earlier write.
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("v6", v);
+  Reopen();
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("v6", v);
+}
+
+TEST(CorruptionTest, CorruptedDescriptor) {
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_CompactMemTable();
+  dbi->TEST_CompactRange(0, NULL, NULL);
+
+  Corrupt(kDescriptorFile, 0, 1000);
+  Status s = TryReopen();
+  ASSERT_TRUE(!s.ok());
+
+  RepairDB();
+  Reopen();
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+  ASSERT_EQ("hello", v);
+}
+
+TEST(CorruptionTest, CompactionInputError) {
+  Build(10);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_CompactMemTable();
+  const int last = config::kMaxMemCompactLevel;
+  ASSERT_EQ(1, Property("leveldb.num-files-at-level" + NumberToString(last)));
+
+  Corrupt(kTableFile, 100, 1);
+  Check(9, 9);
+
+  // Force compactions by writing lots of values
+  Build(10000);
+  Check(10000, 10000);
+}
+
+TEST(CorruptionTest, CompactionInputErrorParanoid) {
+  Options options;
+  options.paranoid_checks = true;
+  options.write_buffer_size = 1048576;
+  Reopen(&options);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+
+  // Fill levels >= 1 so memtable compaction outputs to level 1
+  for (int level = 1; level < config::kNumLevels; level++) {
+    dbi->Put(WriteOptions(), "", "begin");
+    dbi->Put(WriteOptions(), "~", "end");
+    dbi->TEST_CompactMemTable();
+  }
+
+  Build(10);
+  dbi->TEST_CompactMemTable();
+  ASSERT_EQ(1, Property("leveldb.num-files-at-level0"));
+
+  Corrupt(kTableFile, 100, 1);
+  Check(9, 9);
+
+  // Write must eventually fail because of corrupted table
+  Status s;
+  std::string tmp1, tmp2;
+  for (int i = 0; i < 10000 && s.ok(); i++) {
+    s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
+  }
+  ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
+}
+
+TEST(CorruptionTest, UnrelatedKeys) {
+  Build(10);
+  DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+  dbi->TEST_CompactMemTable();
+  Corrupt(kTableFile, 100, 1);
+
+  std::string tmp1, tmp2;
+  ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
+  std::string v;
+  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+  ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+  dbi->TEST_CompactMemTable();
+  ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+  ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+}
+
+}  // namespace leveldb
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/src/leveldb/db/db_bench.cc b/src/leveldb/db/db_bench.cc
new file mode 100644
index 0000000000..21d3e25f31
--- /dev/null
+++ b/src/leveldb/db/db_bench.cc
@@ -0,0 +1,978 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "leveldb/cache.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/write_batch.h"
+#include "port/port.h"
+#include "util/crc32c.h"
+#include "util/histogram.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/testutil.h"
+
+// Comma-separated list of operations to run in the specified order
+//   Actual benchmarks:
+//      fillseq       -- write N values in sequential key order in async mode
+//      fillrandom    -- write N values in random key order in async mode
+//      overwrite     -- overwrite N values in random key order in async mode
+//      fillsync      -- write N/100 values in random key order in sync mode
+//      fill100K      -- write N/1000 100K values in random order in async mode
+//      deleteseq     -- delete N keys in sequential order
+//      deleterandom  -- delete N keys in random order
+//      readseq       -- read N times sequentially
+//      readreverse   -- read N times in reverse order
+//      readrandom    -- read N times in random order
+//      readmissing   -- read N missing keys in random order
+//      readhot       -- read N times in random order from 1% section of DB
+//      seekrandom    -- N random seeks
+//      crc32c        -- repeated crc32c of 4K of data
+//      acquireload   -- load N*1000 times
+//   Meta operations:
+//      compact     -- Compact the entire DB
+//      stats       -- Print DB stats
+//      sstables    -- Print sstable info
+//      heapprofile -- Dump a heap profile (if supported by this port)
+static const char* FLAGS_benchmarks =
+    "fillseq,"
+    "fillsync,"
+    "fillrandom,"
+    "overwrite,"
+    "readrandom,"
+    "readrandom,"  // Extra run to allow previous compactions to quiesce
+    "readseq,"
+    "readreverse,"
+    "compact,"
+    "readrandom,"
+    "readseq,"
+    "readreverse,"
+    "fill100K,"
+    "crc32c,"
+    "snappycomp,"
+    "snappyuncomp,"
+    "acquireload,"
+    ;
+
+// Number of key/values to place in database
+static int FLAGS_num = 1000000;
+
+// Number of read operations to do.  If negative, do FLAGS_num reads.
+static int FLAGS_reads = -1;
+
+// Number of concurrent threads to run.
+static int FLAGS_threads = 1;
+
+// Size of each value
+static int FLAGS_value_size = 100;
+
+// Arrange to generate values that shrink to this fraction of
+// their original size after compression
+static double FLAGS_compression_ratio = 0.5;
+
+// Print histogram of operation timings
+static bool FLAGS_histogram = false;
+
+// Number of bytes to buffer in memtable before compacting
+// (initialized to default value by "main")
+static int FLAGS_write_buffer_size = 0;
+
+// Number of bytes to use as a cache of uncompressed data.
+// Negative means use default settings.
+static int FLAGS_cache_size = -1;
+
+// Maximum number of files to keep open at the same time (use default if == 0)
+static int FLAGS_open_files = 0;
+
+// Bloom filter bits per key.
+// Negative means use default settings.
+static int FLAGS_bloom_bits = -1;
+
+// If true, do not destroy the existing database.  If you set this
+// flag and also specify a benchmark that wants a fresh database, that
+// benchmark will fail.
+static bool FLAGS_use_existing_db = false;
+
+// Use the db with the following name.
+static const char* FLAGS_db = NULL;
+
+namespace leveldb {
+
+namespace {
+
+// Helper for quickly generating random data.
+class RandomGenerator {
+ private:
+  std::string data_;
+  int pos_;
+
+ public:
+  RandomGenerator() {
+    // We use a limited amount of data over and over again and ensure
+    // that it is larger than the compression window (32KB), and also
+    // large enough to serve all typical value sizes we want to write.
+    Random rnd(301);
+    std::string piece;
+    while (data_.size() < 1048576) {
+      // Add a short fragment that is as compressible as specified
+      // by FLAGS_compression_ratio.
+      test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece);
+      data_.append(piece);
+    }
+    pos_ = 0;
+  }
+
+  Slice Generate(int len) {
+    if (pos_ + len > data_.size()) {
+      pos_ = 0;
+      assert(len < data_.size());
+    }
+    pos_ += len;
+    return Slice(data_.data() + pos_ - len, len);
+  }
+};
+
+static Slice TrimSpace(Slice s) {
+  int start = 0;
+  while (start < s.size() && isspace(s[start])) {
+    start++;
+  }
+  int limit = s.size();
+  while (limit > start && isspace(s[limit-1])) {
+    limit--;
+  }
+  return Slice(s.data() + start, limit - start);
+}
+
+static void AppendWithSpace(std::string* str, Slice msg) {
+  if (msg.empty()) return;
+  if (!str->empty()) {
+    str->push_back(' ');
+  }
+  str->append(msg.data(), msg.size());
+}
+
+class Stats {
+ private:
+  double start_;
+  double finish_;
+  double seconds_;
+  int done_;
+  int next_report_;
+  int64_t bytes_;
+  double last_op_finish_;
+  Histogram hist_;
+  std::string message_;
+
+ public:
+  Stats() { Start(); }
+
+  void Start() {
+    next_report_ = 100;
+    last_op_finish_ = start_;
+    hist_.Clear();
+    done_ = 0;
+    bytes_ = 0;
+    seconds_ = 0;
+    start_ = Env::Default()->NowMicros();
+    finish_ = start_;
+    message_.clear();
+  }
+
+  void Merge(const Stats& other) {
+    hist_.Merge(other.hist_);
+    done_ += other.done_;
+    bytes_ += other.bytes_;
+    seconds_ += other.seconds_;
+    if (other.start_ < start_) start_ = other.start_;
+    if (other.finish_ > finish_) finish_ = other.finish_;
+
+    // Just keep the messages from one thread
+    if (message_.empty()) message_ = other.message_;
+  }
+
+  void Stop() {
+    finish_ = Env::Default()->NowMicros();
+    seconds_ = (finish_ - start_) * 1e-6;
+  }
+
+  void AddMessage(Slice msg) {
+    AppendWithSpace(&message_, msg);
+  }
+
+  void FinishedSingleOp() {
+    if (FLAGS_histogram) {
+      double now = Env::Default()->NowMicros();
+      double micros = now - last_op_finish_;
+      hist_.Add(micros);
+      if (micros > 20000) {
+        fprintf(stderr, "long op: %.1f micros%30s\r", micros, "");
+        fflush(stderr);
+      }
+      last_op_finish_ = now;
+    }
+
+    done_++;
+    if (done_ >= next_report_) {
+      if      (next_report_ < 1000)   next_report_ += 100;
+      else if (next_report_ < 5000)   next_report_ += 500;
+      else if (next_report_ < 10000)  next_report_ += 1000;
+      else if (next_report_ < 50000)  next_report_ += 5000;
+      else if (next_report_ < 100000) next_report_ += 10000;
+      else if (next_report_ < 500000) next_report_ += 50000;
+      else                            next_report_ += 100000;
+      fprintf(stderr, "... finished %d ops%30s\r", done_, "");
+      fflush(stderr);
+    }
+  }
+
+  void AddBytes(int64_t n) {
+    bytes_ += n;
+  }
+
+  void Report(const Slice& name) {
+    // Pretend at least one op was done in case we are running a benchmark
+    // that does not call FinishedSingleOp().
+    if (done_ < 1) done_ = 1;
+
+    std::string extra;
+    if (bytes_ > 0) {
+      // Rate is computed on actual elapsed time, not the sum of per-thread
+      // elapsed times.
+      double elapsed = (finish_ - start_) * 1e-6;
+      char rate[100];
+      snprintf(rate, sizeof(rate), "%6.1f MB/s",
+               (bytes_ / 1048576.0) / elapsed);
+      extra = rate;
+    }
+    AppendWithSpace(&extra, message_);
+
+    fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n",
+            name.ToString().c_str(),
+            seconds_ * 1e6 / done_,
+            (extra.empty() ? "" : " "),
+            extra.c_str());
+    if (FLAGS_histogram) {
+      fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
+    }
+    fflush(stdout);
+  }
+};
+
+// State shared by all concurrent executions of the same benchmark.
+struct SharedState {
+  port::Mutex mu;
+  port::CondVar cv;
+  int total;
+
+  // Each thread goes through the following states:
+  //    (1) initializing
+  //    (2) waiting for others to be initialized
+  //    (3) running
+  //    (4) done
+
+  int num_initialized;
+  int num_done;
+  bool start;
+
+  SharedState() : cv(&mu) { }
+};
+
+// Per-thread state for concurrent executions of the same benchmark.
+struct ThreadState {
+  int tid;             // 0..n-1 when running in n threads
+  Random rand;         // Has different seeds for different threads
+  Stats stats;
+  SharedState* shared;
+
+  ThreadState(int index)
+      : tid(index),
+        rand(1000 + index) {
+  }
+};
+
+}  // namespace
+
+class Benchmark {
+ private:
+  Cache* cache_;
+  const FilterPolicy* filter_policy_;
+  DB* db_;
+  int num_;
+  int value_size_;
+  int entries_per_batch_;
+  WriteOptions write_options_;
+  int reads_;
+  int heap_counter_;
+
+  void PrintHeader() {
+    const int kKeySize = 16;
+    PrintEnvironment();
+    fprintf(stdout, "Keys:       %d bytes each\n", kKeySize);
+    fprintf(stdout, "Values:     %d bytes each (%d bytes after compression)\n",
+            FLAGS_value_size,
+            static_cast<int>(FLAGS_value_size * FLAGS_compression_ratio + 0.5));
+    fprintf(stdout, "Entries:    %d\n", num_);
+    fprintf(stdout, "RawSize:    %.1f MB (estimated)\n",
+            ((static_cast<int64_t>(kKeySize + FLAGS_value_size) * num_)
+             / 1048576.0));
+    fprintf(stdout, "FileSize:   %.1f MB (estimated)\n",
+            (((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_)
+             / 1048576.0));
+    PrintWarnings();
+    fprintf(stdout, "------------------------------------------------\n");
+  }
+
+  void PrintWarnings() {
+#if defined(__GNUC__) && !defined(__OPTIMIZE__)
+    fprintf(stdout,
+            "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"
+            );
+#endif
+#ifndef NDEBUG
+    fprintf(stdout,
+            "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
+#endif
+
+    // See if snappy is working by attempting to compress a compressible string
+    const char text[] = "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy";
+    std::string compressed;
+    if (!port::Snappy_Compress(text, sizeof(text), &compressed)) {
+      fprintf(stdout, "WARNING: Snappy compression is not enabled\n");
+    } else if (compressed.size() >= sizeof(text)) {
+      fprintf(stdout, "WARNING: Snappy compression is not effective\n");
+    }
+  }
+
+  void PrintEnvironment() {
+    fprintf(stderr, "LevelDB:    version %d.%d\n",
+            kMajorVersion, kMinorVersion);
+
+#if defined(__linux)
+    time_t now = time(NULL);
+    fprintf(stderr, "Date:       %s", ctime(&now));  // ctime() adds newline
+
+    FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
+    if (cpuinfo != NULL) {
+      char line[1000];
+      int num_cpus = 0;
+      std::string cpu_type;
+      std::string cache_size;
+      while (fgets(line, sizeof(line), cpuinfo) != NULL) {
+        const char* sep = strchr(line, ':');
+        if (sep == NULL) {
+          continue;
+        }
+        Slice key = TrimSpace(Slice(line, sep - 1 - line));
+        Slice val = TrimSpace(Slice(sep + 1));
+        if (key == "model name") {
+          ++num_cpus;
+          cpu_type = val.ToString();
+        } else if (key == "cache size") {
+          cache_size = val.ToString();
+        }
+      }
+      fclose(cpuinfo);
+      fprintf(stderr, "CPU:        %d * %s\n", num_cpus, cpu_type.c_str());
+      fprintf(stderr, "CPUCache:   %s\n", cache_size.c_str());
+    }
+#endif
+  }
+
+ public:
+  Benchmark()
+  : cache_(FLAGS_cache_size >= 0 ? NewLRUCache(FLAGS_cache_size) : NULL),
+    filter_policy_(FLAGS_bloom_bits >= 0
+                   ? NewBloomFilterPolicy(FLAGS_bloom_bits)
+                   : NULL),
+    db_(NULL),
+    num_(FLAGS_num),
+    value_size_(FLAGS_value_size),
+    entries_per_batch_(1),
+    reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads),
+    heap_counter_(0) {
+    std::vector<std::string> files;
+    Env::Default()->GetChildren(FLAGS_db, &files);
+    for (int i = 0; i < files.size(); i++) {
+      if (Slice(files[i]).starts_with("heap-")) {
+        Env::Default()->DeleteFile(std::string(FLAGS_db) + "/" + files[i]);
+      }
+    }
+    if (!FLAGS_use_existing_db) {
+      DestroyDB(FLAGS_db, Options());
+    }
+  }
+
+  ~Benchmark() {
+    delete db_;
+    delete cache_;
+    delete filter_policy_;
+  }
+
+  void Run() {
+    PrintHeader();
+    Open();
+
+    const char* benchmarks = FLAGS_benchmarks;
+    while (benchmarks != NULL) {
+      const char* sep = strchr(benchmarks, ',');
+      Slice name;
+      if (sep == NULL) {
+        name = benchmarks;
+        benchmarks = NULL;
+      } else {
+        name = Slice(benchmarks, sep - benchmarks);
+        benchmarks = sep + 1;
+      }
+
+      // Reset parameters that may be overriddden bwlow
+      num_ = FLAGS_num;
+      reads_ = (FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads);
+      value_size_ = FLAGS_value_size;
+      entries_per_batch_ = 1;
+      write_options_ = WriteOptions();
+
+      void (Benchmark::*method)(ThreadState*) = NULL;
+      bool fresh_db = false;
+      int num_threads = FLAGS_threads;
+
+      if (name == Slice("fillseq")) {
+        fresh_db = true;
+        method = &Benchmark::WriteSeq;
+      } else if (name == Slice("fillbatch")) {
+        fresh_db = true;
+        entries_per_batch_ = 1000;
+        method = &Benchmark::WriteSeq;
+      } else if (name == Slice("fillrandom")) {
+        fresh_db = true;
+        method = &Benchmark::WriteRandom;
+      } else if (name == Slice("overwrite")) {
+        fresh_db = false;
+        method = &Benchmark::WriteRandom;
+      } else if (name == Slice("fillsync")) {
+        fresh_db = true;
+        num_ /= 1000;
+        write_options_.sync = true;
+        method = &Benchmark::WriteRandom;
+      } else if (name == Slice("fill100K")) {
+        fresh_db = true;
+        num_ /= 1000;
+        value_size_ = 100 * 1000;
+        method = &Benchmark::WriteRandom;
+      } else if (name == Slice("readseq")) {
+        method = &Benchmark::ReadSequential;
+      } else if (name == Slice("readreverse")) {
+        method = &Benchmark::ReadReverse;
+      } else if (name == Slice("readrandom")) {
+        method = &Benchmark::ReadRandom;
+      } else if (name == Slice("readmissing")) {
+        method = &Benchmark::ReadMissing;
+      } else if (name == Slice("seekrandom")) {
+        method = &Benchmark::SeekRandom;
+      } else if (name == Slice("readhot")) {
+        method = &Benchmark::ReadHot;
+      } else if (name == Slice("readrandomsmall")) {
+        reads_ /= 1000;
+        method = &Benchmark::ReadRandom;
+      } else if (name == Slice("deleteseq")) {
+        method = &Benchmark::DeleteSeq;
+      } else if (name == Slice("deleterandom")) {
+        method = &Benchmark::DeleteRandom;
+      } else if (name == Slice("readwhilewriting")) {
+        num_threads++;  // Add extra thread for writing
+        method = &Benchmark::ReadWhileWriting;
+      } else if (name == Slice("compact")) {
+        method = &Benchmark::Compact;
+      } else if (name == Slice("crc32c")) {
+        method = &Benchmark::Crc32c;
+      } else if (name == Slice("acquireload")) {
+        method = &Benchmark::AcquireLoad;
+      } else if (name == Slice("snappycomp")) {
+        method = &Benchmark::SnappyCompress;
+      } else if (name == Slice("snappyuncomp")) {
+        method = &Benchmark::SnappyUncompress;
+      } else if (name == Slice("heapprofile")) {
+        HeapProfile();
+      } else if (name == Slice("stats")) {
+        PrintStats("leveldb.stats");
+      } else if (name == Slice("sstables")) {
+        PrintStats("leveldb.sstables");
+      } else {
+        if (name != Slice()) {  // No error message for empty name
+          fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
+        }
+      }
+
+      if (fresh_db) {
+        if (FLAGS_use_existing_db) {
+          fprintf(stdout, "%-12s : skipped (--use_existing_db is true)\n",
+                  name.ToString().c_str());
+          method = NULL;
+        } else {
+          delete db_;
+          db_ = NULL;
+          DestroyDB(FLAGS_db, Options());
+          Open();
+        }
+      }
+
+      if (method != NULL) {
+        RunBenchmark(num_threads, name, method);
+      }
+    }
+  }
+
+ private:
+  struct ThreadArg {
+    Benchmark* bm;
+    SharedState* shared;
+    ThreadState* thread;
+    void (Benchmark::*method)(ThreadState*);
+  };
+
+  static void ThreadBody(void* v) {
+    ThreadArg* arg = reinterpret_cast<ThreadArg*>(v);
+    SharedState* shared = arg->shared;
+    ThreadState* thread = arg->thread;
+    {
+      MutexLock l(&shared->mu);
+      shared->num_initialized++;
+      if (shared->num_initialized >= shared->total) {
+        shared->cv.SignalAll();
+      }
+      while (!shared->start) {
+        shared->cv.Wait();
+      }
+    }
+
+    thread->stats.Start();
+    (arg->bm->*(arg->method))(thread);
+    thread->stats.Stop();
+
+    {
+      MutexLock l(&shared->mu);
+      shared->num_done++;
+      if (shared->num_done >= shared->total) {
+        shared->cv.SignalAll();
+      }
+    }
+  }
+
+  void RunBenchmark(int n, Slice name,
+                    void (Benchmark::*method)(ThreadState*)) {
+    SharedState shared;
+    shared.total = n;
+    shared.num_initialized = 0;
+    shared.num_done = 0;
+    shared.start = false;
+
+    ThreadArg* arg = new ThreadArg[n];
+    for (int i = 0; i < n; i++) {
+      arg[i].bm = this;
+      arg[i].method = method;
+      arg[i].shared = &shared;
+      arg[i].thread = new ThreadState(i);
+      arg[i].thread->shared = &shared;
+      Env::Default()->StartThread(ThreadBody, &arg[i]);
+    }
+
+    shared.mu.Lock();
+    while (shared.num_initialized < n) {
+      shared.cv.Wait();
+    }
+
+    shared.start = true;
+    shared.cv.SignalAll();
+    while (shared.num_done < n) {
+      shared.cv.Wait();
+    }
+    shared.mu.Unlock();
+
+    for (int i = 1; i < n; i++) {
+      arg[0].thread->stats.Merge(arg[i].thread->stats);
+    }
+    arg[0].thread->stats.Report(name);
+
+    for (int i = 0; i < n; i++) {
+      delete arg[i].thread;
+    }
+    delete[] arg;
+  }
+
+  void Crc32c(ThreadState* thread) {
+    // Checksum about 500MB of data total
+    const int size = 4096;
+    const char* label = "(4K per op)";
+    std::string data(size, 'x');
+    int64_t bytes = 0;
+    uint32_t crc = 0;
+    while (bytes < 500 * 1048576) {
+      crc = crc32c::Value(data.data(), size);
+      thread->stats.FinishedSingleOp();
+      bytes += size;
+    }
+    // Print so result is not dead
+    fprintf(stderr, "... crc=0x%x\r", static_cast<unsigned int>(crc));
+
+    thread->stats.AddBytes(bytes);
+    thread->stats.AddMessage(label);
+  }
+
+  void AcquireLoad(ThreadState* thread) {
+    int dummy;
+    port::AtomicPointer ap(&dummy);
+    int count = 0;
+    void *ptr = NULL;
+    thread->stats.AddMessage("(each op is 1000 loads)");
+    while (count < 100000) {
+      for (int i = 0; i < 1000; i++) {
+        ptr = ap.Acquire_Load();
+      }
+      count++;
+      thread->stats.FinishedSingleOp();
+    }
+    if (ptr == NULL) exit(1); // Disable unused variable warning.
+  }
+
+  void SnappyCompress(ThreadState* thread) {
+    RandomGenerator gen;
+    Slice input = gen.Generate(Options().block_size);
+    int64_t bytes = 0;
+    int64_t produced = 0;
+    bool ok = true;
+    std::string compressed;
+    while (ok && bytes < 1024 * 1048576) {  // Compress 1G
+      ok = port::Snappy_Compress(input.data(), input.size(), &compressed);
+      produced += compressed.size();
+      bytes += input.size();
+      thread->stats.FinishedSingleOp();
+    }
+
+    if (!ok) {
+      thread->stats.AddMessage("(snappy failure)");
+    } else {
+      char buf[100];
+      snprintf(buf, sizeof(buf), "(output: %.1f%%)",
+               (produced * 100.0) / bytes);
+      thread->stats.AddMessage(buf);
+      thread->stats.AddBytes(bytes);
+    }
+  }
+
+  void SnappyUncompress(ThreadState* thread) {
+    RandomGenerator gen;
+    Slice input = gen.Generate(Options().block_size);
+    std::string compressed;
+    bool ok = port::Snappy_Compress(input.data(), input.size(), &compressed);
+    int64_t bytes = 0;
+    char* uncompressed = new char[input.size()];
+    while (ok && bytes < 1024 * 1048576) {  // Compress 1G
+      ok =  port::Snappy_Uncompress(compressed.data(), compressed.size(),
+                                    uncompressed);
+      bytes += input.size();
+      thread->stats.FinishedSingleOp();
+    }
+    delete[] uncompressed;
+
+    if (!ok) {
+      thread->stats.AddMessage("(snappy failure)");
+    } else {
+      thread->stats.AddBytes(bytes);
+    }
+  }
+
+  void Open() {
+    assert(db_ == NULL);
+    Options options;
+    options.create_if_missing = !FLAGS_use_existing_db;
+    options.block_cache = cache_;
+    options.write_buffer_size = FLAGS_write_buffer_size;
+    options.filter_policy = filter_policy_;
+    Status s = DB::Open(options, FLAGS_db, &db_);
+    if (!s.ok()) {
+      fprintf(stderr, "open error: %s\n", s.ToString().c_str());
+      exit(1);
+    }
+  }
+
+  void WriteSeq(ThreadState* thread) {
+    DoWrite(thread, true);
+  }
+
+  void WriteRandom(ThreadState* thread) {
+    DoWrite(thread, false);
+  }
+
+  void DoWrite(ThreadState* thread, bool seq) {
+    if (num_ != FLAGS_num) {
+      char msg[100];
+      snprintf(msg, sizeof(msg), "(%d ops)", num_);
+      thread->stats.AddMessage(msg);
+    }
+
+    RandomGenerator gen;
+    WriteBatch batch;
+    Status s;
+    int64_t bytes = 0;
+    for (int i = 0; i < num_; i += entries_per_batch_) {
+      batch.Clear();
+      for (int j = 0; j < entries_per_batch_; j++) {
+        const int k = seq ? i+j : (thread->rand.Next() % FLAGS_num);
+        char key[100];
+        snprintf(key, sizeof(key), "%016d", k);
+        batch.Put(key, gen.Generate(value_size_));
+        bytes += value_size_ + strlen(key);
+        thread->stats.FinishedSingleOp();
+      }
+      s = db_->Write(write_options_, &batch);
+      if (!s.ok()) {
+        fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+    }
+    thread->stats.AddBytes(bytes);
+  }
+
+  void ReadSequential(ThreadState* thread) {
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    int i = 0;
+    int64_t bytes = 0;
+    for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) {
+      bytes += iter->key().size() + iter->value().size();
+      thread->stats.FinishedSingleOp();
+      ++i;
+    }
+    delete iter;
+    thread->stats.AddBytes(bytes);
+  }
+
+  void ReadReverse(ThreadState* thread) {
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    int i = 0;
+    int64_t bytes = 0;
+    for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
+      bytes += iter->key().size() + iter->value().size();
+      thread->stats.FinishedSingleOp();
+      ++i;
+    }
+    delete iter;
+    thread->stats.AddBytes(bytes);
+  }
+
+  void ReadRandom(ThreadState* thread) {
+    ReadOptions options;
+    std::string value;
+    int found = 0;
+    for (int i = 0; i < reads_; i++) {
+      char key[100];
+      const int k = thread->rand.Next() % FLAGS_num;
+      snprintf(key, sizeof(key), "%016d", k);
+      if (db_->Get(options, key, &value).ok()) {
+        found++;
+      }
+      thread->stats.FinishedSingleOp();
+    }
+    char msg[100];
+    snprintf(msg, sizeof(msg), "(%d of %d found)", found, num_);
+    thread->stats.AddMessage(msg);
+  }
+
+  void ReadMissing(ThreadState* thread) {
+    ReadOptions options;
+    std::string value;
+    for (int i = 0; i < reads_; i++) {
+      char key[100];
+      const int k = thread->rand.Next() % FLAGS_num;
+      snprintf(key, sizeof(key), "%016d.", k);
+      db_->Get(options, key, &value);
+      thread->stats.FinishedSingleOp();
+    }
+  }
+
+  void ReadHot(ThreadState* thread) {
+    ReadOptions options;
+    std::string value;
+    const int range = (FLAGS_num + 99) / 100;
+    for (int i = 0; i < reads_; i++) {
+      char key[100];
+      const int k = thread->rand.Next() % range;
+      snprintf(key, sizeof(key), "%016d", k);
+      db_->Get(options, key, &value);
+      thread->stats.FinishedSingleOp();
+    }
+  }
+
+  void SeekRandom(ThreadState* thread) {
+    ReadOptions options;
+    std::string value;
+    int found = 0;
+    for (int i = 0; i < reads_; i++) {
+      Iterator* iter = db_->NewIterator(options);
+      char key[100];
+      const int k = thread->rand.Next() % FLAGS_num;
+      snprintf(key, sizeof(key), "%016d", k);
+      iter->Seek(key);
+      if (iter->Valid() && iter->key() == key) found++;
+      delete iter;
+      thread->stats.FinishedSingleOp();
+    }
+    char msg[100];
+    snprintf(msg, sizeof(msg), "(%d of %d found)", found, num_);
+    thread->stats.AddMessage(msg);
+  }
+
+  void DoDelete(ThreadState* thread, bool seq) {
+    RandomGenerator gen;
+    WriteBatch batch;
+    Status s;
+    for (int i = 0; i < num_; i += entries_per_batch_) {
+      batch.Clear();
+      for (int j = 0; j < entries_per_batch_; j++) {
+        const int k = seq ? i+j : (thread->rand.Next() % FLAGS_num);
+        char key[100];
+        snprintf(key, sizeof(key), "%016d", k);
+        batch.Delete(key);
+        thread->stats.FinishedSingleOp();
+      }
+      s = db_->Write(write_options_, &batch);
+      if (!s.ok()) {
+        fprintf(stderr, "del error: %s\n", s.ToString().c_str());
+        exit(1);
+      }
+    }
+  }
+
+  void DeleteSeq(ThreadState* thread) {
+    DoDelete(thread, true);
+  }
+
+  void DeleteRandom(ThreadState* thread) {
+    DoDelete(thread, false);
+  }
+
+  void ReadWhileWriting(ThreadState* thread) {
+    if (thread->tid > 0) {
+      ReadRandom(thread);
+    } else {
+      // Special thread that keeps writing until other threads are done.
+      RandomGenerator gen;
+      while (true) {
+        {
+          MutexLock l(&thread->shared->mu);
+          if (thread->shared->num_done + 1 >= thread->shared->num_initialized) {
+            // Other threads have finished
+            break;
+          }
+        }
+
+        const int k = thread->rand.Next() % FLAGS_num;
+        char key[100];
+        snprintf(key, sizeof(key), "%016d", k);
+        Status s = db_->Put(write_options_, key, gen.Generate(value_size_));
+        if (!s.ok()) {
+          fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+          exit(1);
+        }
+      }
+
+      // Do not count any of the preceding work/delay in stats.
+      thread->stats.Start();
+    }
+  }
+
+  void Compact(ThreadState* thread) {
+    db_->CompactRange(NULL, NULL);
+  }
+
+  void PrintStats(const char* key) {
+    std::string stats;
+    if (!db_->GetProperty(key, &stats)) {
+      stats = "(failed)";
+    }
+    fprintf(stdout, "\n%s\n", stats.c_str());
+  }
+
+  static void WriteToFile(void* arg, const char* buf, int n) {
+    reinterpret_cast<WritableFile*>(arg)->Append(Slice(buf, n));
+  }
+
+  void HeapProfile() {
+    char fname[100];
+    snprintf(fname, sizeof(fname), "%s/heap-%04d", FLAGS_db, ++heap_counter_);
+    WritableFile* file;
+    Status s = Env::Default()->NewWritableFile(fname, &file);
+    if (!s.ok()) {
+      fprintf(stderr, "%s\n", s.ToString().c_str());
+      return;
+    }
+    bool ok = port::GetHeapProfile(WriteToFile, file);
+    delete file;
+    if (!ok) {
+      fprintf(stderr, "heap profiling not supported\n");
+      Env::Default()->DeleteFile(fname);
+    }
+  }
+};
+
+}  // namespace leveldb
+
+int main(int argc, char** argv) {
+  FLAGS_write_buffer_size = leveldb::Options().write_buffer_size;
+  FLAGS_open_files = leveldb::Options().max_open_files;
+  std::string default_db_path;
+
+  for (int i = 1; i < argc; i++) {
+    double d;
+    int n;
+    char junk;
+    if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) {
+      FLAGS_benchmarks = argv[i] + strlen("--benchmarks=");
+    } else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) {
+      FLAGS_compression_ratio = d;
+    } else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 &&
+               (n == 0 || n == 1)) {
+      FLAGS_histogram = n;
+    } else if (sscanf(argv[i], "--use_existing_db=%d%c", &n, &junk) == 1 &&
+               (n == 0 || n == 1)) {
+      FLAGS_use_existing_db = n;
+    } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
+      FLAGS_num = n;
+    } else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) {
+      FLAGS_reads = n;
+    } else if (sscanf(argv[i], "--threads=%d%c", &n, &junk) == 1) {
+      FLAGS_threads = n;
+    } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) {
+      FLAGS_value_size = n;
+    } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
+      FLAGS_write_buffer_size = n;
+    } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) {
+      FLAGS_cache_size = n;
+    } else if (sscanf(argv[i], "--bloom_bits=%d%c", &n, &junk) == 1) {
+      FLAGS_bloom_bits = n;
+    } else if (sscanf(argv[i], "--open_files=%d%c", &n, &junk) == 1) {
+      FLAGS_open_files = n;
+    } else if (strncmp(argv[i], "--db=", 5) == 0) {
+      FLAGS_db = argv[i] + 5;
+    } else {
+      fprintf(stderr, "Invalid flag '%s'\n", argv[i]);
+      exit(1);
+    }
+  }
+
+  // Choose a location for the test database if none given with --db=<path>
+  if (FLAGS_db == NULL) {
+      leveldb::Env::Default()->GetTestDirectory(&default_db_path);
+      default_db_path += "/dbbench";
+      FLAGS_db = default_db_path.c_str();
+  }
+
+  leveldb::Benchmark benchmark;
+  benchmark.Run();
+  return 0;
+}
diff --git a/src/leveldb/db/db_impl.cc b/src/leveldb/db/db_impl.cc
new file mode 100644
index 0000000000..90c1c811d8
--- /dev/null
+++ b/src/leveldb/db/db_impl.cc
@@ -0,0 +1,1463 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl.h"
+
+#include <algorithm>
+#include <set>
+#include <string>
+#include <stdint.h>
+#include <stdio.h>
+#include <vector>
+#include "db/builder.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "leveldb/status.h"
+#include "leveldb/table.h"
+#include "leveldb/table_builder.h"
+#include "port/port.h"
+#include "table/block.h"
+#include "table/merger.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+
+namespace leveldb {
+
+// Information kept for every waiting writer
+struct DBImpl::Writer {
+  Status status;
+  WriteBatch* batch;
+  bool sync;
+  bool done;
+  port::CondVar cv;
+
+  explicit Writer(port::Mutex* mu) : cv(mu) { }
+};
+
+struct DBImpl::CompactionState {
+  Compaction* const compaction;
+
+  // Sequence numbers < smallest_snapshot are not significant since we
+  // will never have to service a snapshot below smallest_snapshot.
+  // Therefore if we have seen a sequence number S <= smallest_snapshot,
+  // we can drop all entries for the same key with sequence numbers < S.
+  SequenceNumber smallest_snapshot;
+
+  // Files produced by compaction
+  struct Output {
+    uint64_t number;
+    uint64_t file_size;
+    InternalKey smallest, largest;
+  };
+  std::vector<Output> outputs;
+
+  // State kept for output being generated
+  WritableFile* outfile;
+  TableBuilder* builder;
+
+  uint64_t total_bytes;
+
+  Output* current_output() { return &outputs[outputs.size()-1]; }
+
+  explicit CompactionState(Compaction* c)
+      : compaction(c),
+        outfile(NULL),
+        builder(NULL),
+        total_bytes(0) {
+  }
+};
+
+// Fix user-supplied options to be reasonable
+template <class T,class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+  if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
+  if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
+}
+Options SanitizeOptions(const std::string& dbname,
+                        const InternalKeyComparator* icmp,
+                        const InternalFilterPolicy* ipolicy,
+                        const Options& src) {
+  Options result = src;
+  result.comparator = icmp;
+  result.filter_policy = (src.filter_policy != NULL) ? ipolicy : NULL;
+  ClipToRange(&result.max_open_files,            20,     50000);
+  ClipToRange(&result.write_buffer_size,         64<<10, 1<<30);
+  ClipToRange(&result.block_size,                1<<10,  4<<20);
+  if (result.info_log == NULL) {
+    // Open a log file in the same directory as the db
+    src.env->CreateDir(dbname);  // In case it does not exist
+    src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname));
+    Status s = src.env->NewLogger(InfoLogFileName(dbname), &result.info_log);
+    if (!s.ok()) {
+      // No place suitable for logging
+      result.info_log = NULL;
+    }
+  }
+  if (result.block_cache == NULL) {
+    result.block_cache = NewLRUCache(8 << 20);
+  }
+  return result;
+}
+
+DBImpl::DBImpl(const Options& options, const std::string& dbname)
+    : env_(options.env),
+      internal_comparator_(options.comparator),
+      internal_filter_policy_(options.filter_policy),
+      options_(SanitizeOptions(
+          dbname, &internal_comparator_, &internal_filter_policy_, options)),
+      owns_info_log_(options_.info_log != options.info_log),
+      owns_cache_(options_.block_cache != options.block_cache),
+      dbname_(dbname),
+      db_lock_(NULL),
+      shutting_down_(NULL),
+      bg_cv_(&mutex_),
+      mem_(new MemTable(internal_comparator_)),
+      imm_(NULL),
+      logfile_(NULL),
+      logfile_number_(0),
+      log_(NULL),
+      tmp_batch_(new WriteBatch),
+      bg_compaction_scheduled_(false),
+      manual_compaction_(NULL) {
+  mem_->Ref();
+  has_imm_.Release_Store(NULL);
+
+  // Reserve ten files or so for other uses and give the rest to TableCache.
+  const int table_cache_size = options.max_open_files - 10;
+  table_cache_ = new TableCache(dbname_, &options_, table_cache_size);
+
+  versions_ = new VersionSet(dbname_, &options_, table_cache_,
+                             &internal_comparator_);
+}
+
+DBImpl::~DBImpl() {
+  // Wait for background work to finish
+  mutex_.Lock();
+  shutting_down_.Release_Store(this);  // Any non-NULL value is ok
+  while (bg_compaction_scheduled_) {
+    bg_cv_.Wait();
+  }
+  mutex_.Unlock();
+
+  if (db_lock_ != NULL) {
+    env_->UnlockFile(db_lock_);
+  }
+
+  delete versions_;
+  if (mem_ != NULL) mem_->Unref();
+  if (imm_ != NULL) imm_->Unref();
+  delete tmp_batch_;
+  delete log_;
+  delete logfile_;
+  delete table_cache_;
+
+  if (owns_info_log_) {
+    delete options_.info_log;
+  }
+  if (owns_cache_) {
+    delete options_.block_cache;
+  }
+}
+
+Status DBImpl::NewDB() {
+  VersionEdit new_db;
+  new_db.SetComparatorName(user_comparator()->Name());
+  new_db.SetLogNumber(0);
+  new_db.SetNextFile(2);
+  new_db.SetLastSequence(0);
+
+  const std::string manifest = DescriptorFileName(dbname_, 1);
+  WritableFile* file;
+  Status s = env_->NewWritableFile(manifest, &file);
+  if (!s.ok()) {
+    return s;
+  }
+  {
+    log::Writer log(file);
+    std::string record;
+    new_db.EncodeTo(&record);
+    s = log.AddRecord(record);
+    if (s.ok()) {
+      s = file->Close();
+    }
+  }
+  delete file;
+  if (s.ok()) {
+    // Make "CURRENT" file that points to the new manifest file.
+    s = SetCurrentFile(env_, dbname_, 1);
+  } else {
+    env_->DeleteFile(manifest);
+  }
+  return s;
+}
+
+void DBImpl::MaybeIgnoreError(Status* s) const {
+  if (s->ok() || options_.paranoid_checks) {
+    // No change needed
+  } else {
+    Log(options_.info_log, "Ignoring error %s", s->ToString().c_str());
+    *s = Status::OK();
+  }
+}
+
+void DBImpl::DeleteObsoleteFiles() {
+  // Make a set of all of the live files
+  std::set<uint64_t> live = pending_outputs_;
+  versions_->AddLiveFiles(&live);
+
+  std::vector<std::string> filenames;
+  env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose
+  uint64_t number;
+  FileType type;
+  for (size_t i = 0; i < filenames.size(); i++) {
+    if (ParseFileName(filenames[i], &number, &type)) {
+      bool keep = true;
+      switch (type) {
+        case kLogFile:
+          keep = ((number >= versions_->LogNumber()) ||
+                  (number == versions_->PrevLogNumber()));
+          break;
+        case kDescriptorFile:
+          // Keep my manifest file, and any newer incarnations'
+          // (in case there is a race that allows other incarnations)
+          keep = (number >= versions_->ManifestFileNumber());
+          break;
+        case kTableFile:
+          keep = (live.find(number) != live.end());
+          break;
+        case kTempFile:
+          // Any temp files that are currently being written to must
+          // be recorded in pending_outputs_, which is inserted into "live"
+          keep = (live.find(number) != live.end());
+          break;
+        case kCurrentFile:
+        case kDBLockFile:
+        case kInfoLogFile:
+          keep = true;
+          break;
+      }
+
+      if (!keep) {
+        if (type == kTableFile) {
+          table_cache_->Evict(number);
+        }
+        Log(options_.info_log, "Delete type=%d #%lld\n",
+            int(type),
+            static_cast<unsigned long long>(number));
+        env_->DeleteFile(dbname_ + "/" + filenames[i]);
+      }
+    }
+  }
+}
+
+Status DBImpl::Recover(VersionEdit* edit) {
+  mutex_.AssertHeld();
+
+  // Ignore error from CreateDir since the creation of the DB is
+  // committed only when the descriptor is created, and this directory
+  // may already exist from a previous failed creation attempt.
+  env_->CreateDir(dbname_);
+  assert(db_lock_ == NULL);
+  Status s = env_->LockFile(LockFileName(dbname_), &db_lock_);
+  if (!s.ok()) {
+    return s;
+  }
+
+  if (!env_->FileExists(CurrentFileName(dbname_))) {
+    if (options_.create_if_missing) {
+      s = NewDB();
+      if (!s.ok()) {
+        return s;
+      }
+    } else {
+      return Status::InvalidArgument(
+          dbname_, "does not exist (create_if_missing is false)");
+    }
+  } else {
+    if (options_.error_if_exists) {
+      return Status::InvalidArgument(
+          dbname_, "exists (error_if_exists is true)");
+    }
+  }
+
+  s = versions_->Recover();
+  if (s.ok()) {
+    SequenceNumber max_sequence(0);
+
+    // Recover from all newer log files than the ones named in the
+    // descriptor (new log files may have been added by the previous
+    // incarnation without registering them in the descriptor).
+    //
+    // Note that PrevLogNumber() is no longer used, but we pay
+    // attention to it in case we are recovering a database
+    // produced by an older version of leveldb.
+    const uint64_t min_log = versions_->LogNumber();
+    const uint64_t prev_log = versions_->PrevLogNumber();
+    std::vector<std::string> filenames;
+    s = env_->GetChildren(dbname_, &filenames);
+    if (!s.ok()) {
+      return s;
+    }
+    uint64_t number;
+    FileType type;
+    std::vector<uint64_t> logs;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type)
+          && type == kLogFile
+          && ((number >= min_log) || (number == prev_log))) {
+        logs.push_back(number);
+      }
+    }
+
+    // Recover in the order in which the logs were generated
+    std::sort(logs.begin(), logs.end());
+    for (size_t i = 0; i < logs.size(); i++) {
+      s = RecoverLogFile(logs[i], edit, &max_sequence);
+
+      // The previous incarnation may not have written any MANIFEST
+      // records after allocating this log number.  So we manually
+      // update the file number allocation counter in VersionSet.
+      versions_->MarkFileNumberUsed(logs[i]);
+    }
+
+    if (s.ok()) {
+      if (versions_->LastSequence() < max_sequence) {
+        versions_->SetLastSequence(max_sequence);
+      }
+    }
+  }
+
+  return s;
+}
+
+Status DBImpl::RecoverLogFile(uint64_t log_number,
+                              VersionEdit* edit,
+                              SequenceNumber* max_sequence) {
+  struct LogReporter : public log::Reader::Reporter {
+    Env* env;
+    Logger* info_log;
+    const char* fname;
+    Status* status;  // NULL if options_.paranoid_checks==false
+    virtual void Corruption(size_t bytes, const Status& s) {
+      Log(info_log, "%s%s: dropping %d bytes; %s",
+          (this->status == NULL ? "(ignoring error) " : ""),
+          fname, static_cast<int>(bytes), s.ToString().c_str());
+      if (this->status != NULL && this->status->ok()) *this->status = s;
+    }
+  };
+
+  mutex_.AssertHeld();
+
+  // Open the log file
+  std::string fname = LogFileName(dbname_, log_number);
+  SequentialFile* file;
+  Status status = env_->NewSequentialFile(fname, &file);
+  if (!status.ok()) {
+    MaybeIgnoreError(&status);
+    return status;
+  }
+
+  // Create the log reader.
+  LogReporter reporter;
+  reporter.env = env_;
+  reporter.info_log = options_.info_log;
+  reporter.fname = fname.c_str();
+  reporter.status = (options_.paranoid_checks ? &status : NULL);
+  // We intentially make log::Reader do checksumming even if
+  // paranoid_checks==false so that corruptions cause entire commits
+  // to be skipped instead of propagating bad information (like overly
+  // large sequence numbers).
+  log::Reader reader(file, &reporter, true/*checksum*/,
+                     0/*initial_offset*/);
+  Log(options_.info_log, "Recovering log #%llu",
+      (unsigned long long) log_number);
+
+  // Read all the records and add to a memtable
+  std::string scratch;
+  Slice record;
+  WriteBatch batch;
+  MemTable* mem = NULL;
+  while (reader.ReadRecord(&record, &scratch) &&
+         status.ok()) {
+    if (record.size() < 12) {
+      reporter.Corruption(
+          record.size(), Status::Corruption("log record too small"));
+      continue;
+    }
+    WriteBatchInternal::SetContents(&batch, record);
+
+    if (mem == NULL) {
+      mem = new MemTable(internal_comparator_);
+      mem->Ref();
+    }
+    status = WriteBatchInternal::InsertInto(&batch, mem);
+    MaybeIgnoreError(&status);
+    if (!status.ok()) {
+      break;
+    }
+    const SequenceNumber last_seq =
+        WriteBatchInternal::Sequence(&batch) +
+        WriteBatchInternal::Count(&batch) - 1;
+    if (last_seq > *max_sequence) {
+      *max_sequence = last_seq;
+    }
+
+    if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) {
+      status = WriteLevel0Table(mem, edit, NULL);
+      if (!status.ok()) {
+        // Reflect errors immediately so that conditions like full
+        // file-systems cause the DB::Open() to fail.
+        break;
+      }
+      mem->Unref();
+      mem = NULL;
+    }
+  }
+
+  if (status.ok() && mem != NULL) {
+    status = WriteLevel0Table(mem, edit, NULL);
+    // Reflect errors immediately so that conditions like full
+    // file-systems cause the DB::Open() to fail.
+  }
+
+  if (mem != NULL) mem->Unref();
+  delete file;
+  return status;
+}
+
+Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit,
+                                Version* base) {
+  mutex_.AssertHeld();
+  const uint64_t start_micros = env_->NowMicros();
+  FileMetaData meta;
+  meta.number = versions_->NewFileNumber();
+  pending_outputs_.insert(meta.number);
+  Iterator* iter = mem->NewIterator();
+  Log(options_.info_log, "Level-0 table #%llu: started",
+      (unsigned long long) meta.number);
+
+  Status s;
+  {
+    mutex_.Unlock();
+    s = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
+    mutex_.Lock();
+  }
+
+  Log(options_.info_log, "Level-0 table #%llu: %lld bytes %s",
+      (unsigned long long) meta.number,
+      (unsigned long long) meta.file_size,
+      s.ToString().c_str());
+  delete iter;
+  pending_outputs_.erase(meta.number);
+
+
+  // Note that if file_size is zero, the file has been deleted and
+  // should not be added to the manifest.
+  int level = 0;
+  if (s.ok() && meta.file_size > 0) {
+    const Slice min_user_key = meta.smallest.user_key();
+    const Slice max_user_key = meta.largest.user_key();
+    if (base != NULL) {
+      level = base->PickLevelForMemTableOutput(min_user_key, max_user_key);
+    }
+    edit->AddFile(level, meta.number, meta.file_size,
+                  meta.smallest, meta.largest);
+  }
+
+  CompactionStats stats;
+  stats.micros = env_->NowMicros() - start_micros;
+  stats.bytes_written = meta.file_size;
+  stats_[level].Add(stats);
+  return s;
+}
+
+Status DBImpl::CompactMemTable() {
+  mutex_.AssertHeld();
+  assert(imm_ != NULL);
+
+  // Save the contents of the memtable as a new Table
+  VersionEdit edit;
+  Version* base = versions_->current();
+  base->Ref();
+  Status s = WriteLevel0Table(imm_, &edit, base);
+  base->Unref();
+
+  if (s.ok() && shutting_down_.Acquire_Load()) {
+    s = Status::IOError("Deleting DB during memtable compaction");
+  }
+
+  // Replace immutable memtable with the generated Table
+  if (s.ok()) {
+    edit.SetPrevLogNumber(0);
+    edit.SetLogNumber(logfile_number_);  // Earlier logs no longer needed
+    s = versions_->LogAndApply(&edit, &mutex_);
+  }
+
+  if (s.ok()) {
+    // Commit to the new state
+    imm_->Unref();
+    imm_ = NULL;
+    has_imm_.Release_Store(NULL);
+    DeleteObsoleteFiles();
+  }
+
+  return s;
+}
+
+void DBImpl::CompactRange(const Slice* begin, const Slice* end) {
+  int max_level_with_files = 1;
+  {
+    MutexLock l(&mutex_);
+    Version* base = versions_->current();
+    for (int level = 1; level < config::kNumLevels; level++) {
+      if (base->OverlapInLevel(level, begin, end)) {
+        max_level_with_files = level;
+      }
+    }
+  }
+  TEST_CompactMemTable(); // TODO(sanjay): Skip if memtable does not overlap
+  for (int level = 0; level < max_level_with_files; level++) {
+    TEST_CompactRange(level, begin, end);
+  }
+}
+
+void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) {
+  assert(level >= 0);
+  assert(level + 1 < config::kNumLevels);
+
+  InternalKey begin_storage, end_storage;
+
+  ManualCompaction manual;
+  manual.level = level;
+  manual.done = false;
+  if (begin == NULL) {
+    manual.begin = NULL;
+  } else {
+    begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek);
+    manual.begin = &begin_storage;
+  }
+  if (end == NULL) {
+    manual.end = NULL;
+  } else {
+    end_storage = InternalKey(*end, 0, static_cast<ValueType>(0));
+    manual.end = &end_storage;
+  }
+
+  MutexLock l(&mutex_);
+  while (!manual.done) {
+    while (manual_compaction_ != NULL) {
+      bg_cv_.Wait();
+    }
+    manual_compaction_ = &manual;
+    MaybeScheduleCompaction();
+    while (manual_compaction_ == &manual) {
+      bg_cv_.Wait();
+    }
+  }
+}
+
+Status DBImpl::TEST_CompactMemTable() {
+  // NULL batch means just wait for earlier writes to be done
+  Status s = Write(WriteOptions(), NULL);
+  if (s.ok()) {
+    // Wait until the compaction completes
+    MutexLock l(&mutex_);
+    while (imm_ != NULL && bg_error_.ok()) {
+      bg_cv_.Wait();
+    }
+    if (imm_ != NULL) {
+      s = bg_error_;
+    }
+  }
+  return s;
+}
+
+void DBImpl::MaybeScheduleCompaction() {
+  mutex_.AssertHeld();
+  if (bg_compaction_scheduled_) {
+    // Already scheduled
+  } else if (shutting_down_.Acquire_Load()) {
+    // DB is being deleted; no more background compactions
+  } else if (imm_ == NULL &&
+             manual_compaction_ == NULL &&
+             !versions_->NeedsCompaction()) {
+    // No work to be done
+  } else {
+    bg_compaction_scheduled_ = true;
+    env_->Schedule(&DBImpl::BGWork, this);
+  }
+}
+
+void DBImpl::BGWork(void* db) {
+  reinterpret_cast<DBImpl*>(db)->BackgroundCall();
+}
+
+void DBImpl::BackgroundCall() {
+  MutexLock l(&mutex_);
+  assert(bg_compaction_scheduled_);
+  if (!shutting_down_.Acquire_Load()) {
+    Status s = BackgroundCompaction();
+    if (!s.ok()) {
+      // Wait a little bit before retrying background compaction in
+      // case this is an environmental problem and we do not want to
+      // chew up resources for failed compactions for the duration of
+      // the problem.
+      bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
+      Log(options_.info_log, "Waiting after background compaction error: %s",
+          s.ToString().c_str());
+      mutex_.Unlock();
+      env_->SleepForMicroseconds(1000000);
+      mutex_.Lock();
+    }
+  }
+
+  bg_compaction_scheduled_ = false;
+
+  // Previous compaction may have produced too many files in a level,
+  // so reschedule another compaction if needed.
+  MaybeScheduleCompaction();
+  bg_cv_.SignalAll();
+}
+
+Status DBImpl::BackgroundCompaction() {
+  mutex_.AssertHeld();
+
+  if (imm_ != NULL) {
+    return CompactMemTable();
+  }
+
+  Compaction* c;
+  bool is_manual = (manual_compaction_ != NULL);
+  InternalKey manual_end;
+  if (is_manual) {
+    ManualCompaction* m = manual_compaction_;
+    c = versions_->CompactRange(m->level, m->begin, m->end);
+    m->done = (c == NULL);
+    if (c != NULL) {
+      manual_end = c->input(0, c->num_input_files(0) - 1)->largest;
+    }
+    Log(options_.info_log,
+        "Manual compaction at level-%d from %s .. %s; will stop at %s\n",
+        m->level,
+        (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
+        (m->end ? m->end->DebugString().c_str() : "(end)"),
+        (m->done ? "(end)" : manual_end.DebugString().c_str()));
+  } else {
+    c = versions_->PickCompaction();
+  }
+
+  Status status;
+  if (c == NULL) {
+    // Nothing to do
+  } else if (!is_manual && c->IsTrivialMove()) {
+    // Move file to next level
+    assert(c->num_input_files(0) == 1);
+    FileMetaData* f = c->input(0, 0);
+    c->edit()->DeleteFile(c->level(), f->number);
+    c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
+                       f->smallest, f->largest);
+    status = versions_->LogAndApply(c->edit(), &mutex_);
+    VersionSet::LevelSummaryStorage tmp;
+    Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
+        static_cast<unsigned long long>(f->number),
+        c->level() + 1,
+        static_cast<unsigned long long>(f->file_size),
+        status.ToString().c_str(),
+        versions_->LevelSummary(&tmp));
+  } else {
+    CompactionState* compact = new CompactionState(c);
+    status = DoCompactionWork(compact);
+    CleanupCompaction(compact);
+    c->ReleaseInputs();
+    DeleteObsoleteFiles();
+  }
+  delete c;
+
+  if (status.ok()) {
+    // Done
+  } else if (shutting_down_.Acquire_Load()) {
+    // Ignore compaction errors found during shutting down
+  } else {
+    Log(options_.info_log,
+        "Compaction error: %s", status.ToString().c_str());
+    if (options_.paranoid_checks && bg_error_.ok()) {
+      bg_error_ = status;
+    }
+  }
+
+  if (is_manual) {
+    ManualCompaction* m = manual_compaction_;
+    if (!status.ok()) {
+      m->done = true;
+    }
+    if (!m->done) {
+      // We only compacted part of the requested range.  Update *m
+      // to the range that is left to be compacted.
+      m->tmp_storage = manual_end;
+      m->begin = &m->tmp_storage;
+    }
+    manual_compaction_ = NULL;
+  }
+  return status;
+}
+
+void DBImpl::CleanupCompaction(CompactionState* compact) {
+  mutex_.AssertHeld();
+  if (compact->builder != NULL) {
+    // May happen if we get a shutdown call in the middle of compaction
+    compact->builder->Abandon();
+    delete compact->builder;
+  } else {
+    assert(compact->outfile == NULL);
+  }
+  delete compact->outfile;
+  for (size_t i = 0; i < compact->outputs.size(); i++) {
+    const CompactionState::Output& out = compact->outputs[i];
+    pending_outputs_.erase(out.number);
+  }
+  delete compact;
+}
+
+Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) {
+  assert(compact != NULL);
+  assert(compact->builder == NULL);
+  uint64_t file_number;
+  {
+    mutex_.Lock();
+    file_number = versions_->NewFileNumber();
+    pending_outputs_.insert(file_number);
+    CompactionState::Output out;
+    out.number = file_number;
+    out.smallest.Clear();
+    out.largest.Clear();
+    compact->outputs.push_back(out);
+    mutex_.Unlock();
+  }
+
+  // Make the output file
+  std::string fname = TableFileName(dbname_, file_number);
+  Status s = env_->NewWritableFile(fname, &compact->outfile);
+  if (s.ok()) {
+    compact->builder = new TableBuilder(options_, compact->outfile);
+  }
+  return s;
+}
+
+Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
+                                          Iterator* input) {
+  assert(compact != NULL);
+  assert(compact->outfile != NULL);
+  assert(compact->builder != NULL);
+
+  const uint64_t output_number = compact->current_output()->number;
+  assert(output_number != 0);
+
+  // Check for iterator errors
+  Status s = input->status();
+  const uint64_t current_entries = compact->builder->NumEntries();
+  if (s.ok()) {
+    s = compact->builder->Finish();
+  } else {
+    compact->builder->Abandon();
+  }
+  const uint64_t current_bytes = compact->builder->FileSize();
+  compact->current_output()->file_size = current_bytes;
+  compact->total_bytes += current_bytes;
+  delete compact->builder;
+  compact->builder = NULL;
+
+  // Finish and check for file errors
+  if (s.ok()) {
+    s = compact->outfile->Sync();
+  }
+  if (s.ok()) {
+    s = compact->outfile->Close();
+  }
+  delete compact->outfile;
+  compact->outfile = NULL;
+
+  if (s.ok() && current_entries > 0) {
+    // Verify that the table is usable
+    Iterator* iter = table_cache_->NewIterator(ReadOptions(),
+                                               output_number,
+                                               current_bytes);
+    s = iter->status();
+    delete iter;
+    if (s.ok()) {
+      Log(options_.info_log,
+          "Generated table #%llu: %lld keys, %lld bytes",
+          (unsigned long long) output_number,
+          (unsigned long long) current_entries,
+          (unsigned long long) current_bytes);
+    }
+  }
+  return s;
+}
+
+
+Status DBImpl::InstallCompactionResults(CompactionState* compact) {
+  mutex_.AssertHeld();
+  Log(options_.info_log,  "Compacted %d@%d + %d@%d files => %lld bytes",
+      compact->compaction->num_input_files(0),
+      compact->compaction->level(),
+      compact->compaction->num_input_files(1),
+      compact->compaction->level() + 1,
+      static_cast<long long>(compact->total_bytes));
+
+  // Add compaction outputs
+  compact->compaction->AddInputDeletions(compact->compaction->edit());
+  const int level = compact->compaction->level();
+  for (size_t i = 0; i < compact->outputs.size(); i++) {
+    const CompactionState::Output& out = compact->outputs[i];
+    compact->compaction->edit()->AddFile(
+        level + 1,
+        out.number, out.file_size, out.smallest, out.largest);
+  }
+  return versions_->LogAndApply(compact->compaction->edit(), &mutex_);
+}
+
+Status DBImpl::DoCompactionWork(CompactionState* compact) {
+  const uint64_t start_micros = env_->NowMicros();
+  int64_t imm_micros = 0;  // Micros spent doing imm_ compactions
+
+  Log(options_.info_log,  "Compacting %d@%d + %d@%d files",
+      compact->compaction->num_input_files(0),
+      compact->compaction->level(),
+      compact->compaction->num_input_files(1),
+      compact->compaction->level() + 1);
+
+  assert(versions_->NumLevelFiles(compact->compaction->level()) > 0);
+  assert(compact->builder == NULL);
+  assert(compact->outfile == NULL);
+  if (snapshots_.empty()) {
+    compact->smallest_snapshot = versions_->LastSequence();
+  } else {
+    compact->smallest_snapshot = snapshots_.oldest()->number_;
+  }
+
+  // Release mutex while we're actually doing the compaction work
+  mutex_.Unlock();
+
+  Iterator* input = versions_->MakeInputIterator(compact->compaction);
+  input->SeekToFirst();
+  Status status;
+  ParsedInternalKey ikey;
+  std::string current_user_key;
+  bool has_current_user_key = false;
+  SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
+  for (; input->Valid() && !shutting_down_.Acquire_Load(); ) {
+    // Prioritize immutable compaction work
+    if (has_imm_.NoBarrier_Load() != NULL) {
+      const uint64_t imm_start = env_->NowMicros();
+      mutex_.Lock();
+      if (imm_ != NULL) {
+        CompactMemTable();
+        bg_cv_.SignalAll();  // Wakeup MakeRoomForWrite() if necessary
+      }
+      mutex_.Unlock();
+      imm_micros += (env_->NowMicros() - imm_start);
+    }
+
+    Slice key = input->key();
+    if (compact->compaction->ShouldStopBefore(key) &&
+        compact->builder != NULL) {
+      status = FinishCompactionOutputFile(compact, input);
+      if (!status.ok()) {
+        break;
+      }
+    }
+
+    // Handle key/value, add to state, etc.
+    bool drop = false;
+    if (!ParseInternalKey(key, &ikey)) {
+      // Do not hide error keys
+      current_user_key.clear();
+      has_current_user_key = false;
+      last_sequence_for_key = kMaxSequenceNumber;
+    } else {
+      if (!has_current_user_key ||
+          user_comparator()->Compare(ikey.user_key,
+                                     Slice(current_user_key)) != 0) {
+        // First occurrence of this user key
+        current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
+        has_current_user_key = true;
+        last_sequence_for_key = kMaxSequenceNumber;
+      }
+
+      if (last_sequence_for_key <= compact->smallest_snapshot) {
+        // Hidden by an newer entry for same user key
+        drop = true;    // (A)
+      } else if (ikey.type == kTypeDeletion &&
+                 ikey.sequence <= compact->smallest_snapshot &&
+                 compact->compaction->IsBaseLevelForKey(ikey.user_key)) {
+        // For this user key:
+        // (1) there is no data in higher levels
+        // (2) data in lower levels will have larger sequence numbers
+        // (3) data in layers that are being compacted here and have
+        //     smaller sequence numbers will be dropped in the next
+        //     few iterations of this loop (by rule (A) above).
+        // Therefore this deletion marker is obsolete and can be dropped.
+        drop = true;
+      }
+
+      last_sequence_for_key = ikey.sequence;
+    }
+#if 0
+    Log(options_.info_log,
+        "  Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
+        "%d smallest_snapshot: %d",
+        ikey.user_key.ToString().c_str(),
+        (int)ikey.sequence, ikey.type, kTypeValue, drop,
+        compact->compaction->IsBaseLevelForKey(ikey.user_key),
+        (int)last_sequence_for_key, (int)compact->smallest_snapshot);
+#endif
+
+    if (!drop) {
+      // Open output file if necessary
+      if (compact->builder == NULL) {
+        status = OpenCompactionOutputFile(compact);
+        if (!status.ok()) {
+          break;
+        }
+      }
+      if (compact->builder->NumEntries() == 0) {
+        compact->current_output()->smallest.DecodeFrom(key);
+      }
+      compact->current_output()->largest.DecodeFrom(key);
+      compact->builder->Add(key, input->value());
+
+      // Close output file if it is big enough
+      if (compact->builder->FileSize() >=
+          compact->compaction->MaxOutputFileSize()) {
+        status = FinishCompactionOutputFile(compact, input);
+        if (!status.ok()) {
+          break;
+        }
+      }
+    }
+
+    input->Next();
+  }
+
+  if (status.ok() && shutting_down_.Acquire_Load()) {
+    status = Status::IOError("Deleting DB during compaction");
+  }
+  if (status.ok() && compact->builder != NULL) {
+    status = FinishCompactionOutputFile(compact, input);
+  }
+  if (status.ok()) {
+    status = input->status();
+  }
+  delete input;
+  input = NULL;
+
+  CompactionStats stats;
+  stats.micros = env_->NowMicros() - start_micros - imm_micros;
+  for (int which = 0; which < 2; which++) {
+    for (int i = 0; i < compact->compaction->num_input_files(which); i++) {
+      stats.bytes_read += compact->compaction->input(which, i)->file_size;
+    }
+  }
+  for (size_t i = 0; i < compact->outputs.size(); i++) {
+    stats.bytes_written += compact->outputs[i].file_size;
+  }
+
+  mutex_.Lock();
+  stats_[compact->compaction->level() + 1].Add(stats);
+
+  if (status.ok()) {
+    status = InstallCompactionResults(compact);
+  }
+  VersionSet::LevelSummaryStorage tmp;
+  Log(options_.info_log,
+      "compacted to: %s", versions_->LevelSummary(&tmp));
+  return status;
+}
+
+namespace {
+struct IterState {
+  port::Mutex* mu;
+  Version* version;
+  MemTable* mem;
+  MemTable* imm;
+};
+
+static void CleanupIteratorState(void* arg1, void* arg2) {
+  IterState* state = reinterpret_cast<IterState*>(arg1);
+  state->mu->Lock();
+  state->mem->Unref();
+  if (state->imm != NULL) state->imm->Unref();
+  state->version->Unref();
+  state->mu->Unlock();
+  delete state;
+}
+}  // namespace
+
+Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
+                                      SequenceNumber* latest_snapshot) {
+  IterState* cleanup = new IterState;
+  mutex_.Lock();
+  *latest_snapshot = versions_->LastSequence();
+
+  // Collect together all needed child iterators
+  std::vector<Iterator*> list;
+  list.push_back(mem_->NewIterator());
+  mem_->Ref();
+  if (imm_ != NULL) {
+    list.push_back(imm_->NewIterator());
+    imm_->Ref();
+  }
+  versions_->current()->AddIterators(options, &list);
+  Iterator* internal_iter =
+      NewMergingIterator(&internal_comparator_, &list[0], list.size());
+  versions_->current()->Ref();
+
+  cleanup->mu = &mutex_;
+  cleanup->mem = mem_;
+  cleanup->imm = imm_;
+  cleanup->version = versions_->current();
+  internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, NULL);
+
+  mutex_.Unlock();
+  return internal_iter;
+}
+
+Iterator* DBImpl::TEST_NewInternalIterator() {
+  SequenceNumber ignored;
+  return NewInternalIterator(ReadOptions(), &ignored);
+}
+
+int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() {
+  MutexLock l(&mutex_);
+  return versions_->MaxNextLevelOverlappingBytes();
+}
+
+Status DBImpl::Get(const ReadOptions& options,
+                   const Slice& key,
+                   std::string* value) {
+  Status s;
+  MutexLock l(&mutex_);
+  SequenceNumber snapshot;
+  if (options.snapshot != NULL) {
+    snapshot = reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_;
+  } else {
+    snapshot = versions_->LastSequence();
+  }
+
+  MemTable* mem = mem_;
+  MemTable* imm = imm_;
+  Version* current = versions_->current();
+  mem->Ref();
+  if (imm != NULL) imm->Ref();
+  current->Ref();
+
+  bool have_stat_update = false;
+  Version::GetStats stats;
+
+  // Unlock while reading from files and memtables
+  {
+    mutex_.Unlock();
+    // First look in the memtable, then in the immutable memtable (if any).
+    LookupKey lkey(key, snapshot);
+    if (mem->Get(lkey, value, &s)) {
+      // Done
+    } else if (imm != NULL && imm->Get(lkey, value, &s)) {
+      // Done
+    } else {
+      s = current->Get(options, lkey, value, &stats);
+      have_stat_update = true;
+    }
+    mutex_.Lock();
+  }
+
+  if (have_stat_update && current->UpdateStats(stats)) {
+    MaybeScheduleCompaction();
+  }
+  mem->Unref();
+  if (imm != NULL) imm->Unref();
+  current->Unref();
+  return s;
+}
+
+Iterator* DBImpl::NewIterator(const ReadOptions& options) {
+  SequenceNumber latest_snapshot;
+  Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot);
+  return NewDBIterator(
+      &dbname_, env_, user_comparator(), internal_iter,
+      (options.snapshot != NULL
+       ? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_
+       : latest_snapshot));
+}
+
+const Snapshot* DBImpl::GetSnapshot() {
+  MutexLock l(&mutex_);
+  return snapshots_.New(versions_->LastSequence());
+}
+
+void DBImpl::ReleaseSnapshot(const Snapshot* s) {
+  MutexLock l(&mutex_);
+  snapshots_.Delete(reinterpret_cast<const SnapshotImpl*>(s));
+}
+
+// Convenience methods
+Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) {
+  return DB::Put(o, key, val);
+}
+
+Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
+  return DB::Delete(options, key);
+}
+
+Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
+  Writer w(&mutex_);
+  w.batch = my_batch;
+  w.sync = options.sync;
+  w.done = false;
+
+  MutexLock l(&mutex_);
+  writers_.push_back(&w);
+  while (!w.done && &w != writers_.front()) {
+    w.cv.Wait();
+  }
+  if (w.done) {
+    return w.status;
+  }
+
+  // May temporarily unlock and wait.
+  Status status = MakeRoomForWrite(my_batch == NULL);
+  uint64_t last_sequence = versions_->LastSequence();
+  Writer* last_writer = &w;
+  if (status.ok() && my_batch != NULL) {  // NULL batch is for compactions
+    WriteBatch* updates = BuildBatchGroup(&last_writer);
+    WriteBatchInternal::SetSequence(updates, last_sequence + 1);
+    last_sequence += WriteBatchInternal::Count(updates);
+
+    // Add to log and apply to memtable.  We can release the lock
+    // during this phase since &w is currently responsible for logging
+    // and protects against concurrent loggers and concurrent writes
+    // into mem_.
+    {
+      mutex_.Unlock();
+      status = log_->AddRecord(WriteBatchInternal::Contents(updates));
+      if (status.ok() && options.sync) {
+        status = logfile_->Sync();
+      }
+      if (status.ok()) {
+        status = WriteBatchInternal::InsertInto(updates, mem_);
+      }
+      mutex_.Lock();
+    }
+    if (updates == tmp_batch_) tmp_batch_->Clear();
+
+    versions_->SetLastSequence(last_sequence);
+  }
+
+  while (true) {
+    Writer* ready = writers_.front();
+    writers_.pop_front();
+    if (ready != &w) {
+      ready->status = status;
+      ready->done = true;
+      ready->cv.Signal();
+    }
+    if (ready == last_writer) break;
+  }
+
+  // Notify new head of write queue
+  if (!writers_.empty()) {
+    writers_.front()->cv.Signal();
+  }
+
+  return status;
+}
+
+// REQUIRES: Writer list must be non-empty
+// REQUIRES: First writer must have a non-NULL batch
+WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
+  assert(!writers_.empty());
+  Writer* first = writers_.front();
+  WriteBatch* result = first->batch;
+  assert(result != NULL);
+
+  size_t size = WriteBatchInternal::ByteSize(first->batch);
+
+  // Allow the group to grow up to a maximum size, but if the
+  // original write is small, limit the growth so we do not slow
+  // down the small write too much.
+  size_t max_size = 1 << 20;
+  if (size <= (128<<10)) {
+    max_size = size + (128<<10);
+  }
+
+  *last_writer = first;
+  std::deque<Writer*>::iterator iter = writers_.begin();
+  ++iter;  // Advance past "first"
+  for (; iter != writers_.end(); ++iter) {
+    Writer* w = *iter;
+    if (w->sync && !first->sync) {
+      // Do not include a sync write into a batch handled by a non-sync write.
+      break;
+    }
+
+    if (w->batch != NULL) {
+      size += WriteBatchInternal::ByteSize(w->batch);
+      if (size > max_size) {
+        // Do not make batch too big
+        break;
+      }
+
+      // Append to *reuslt
+      if (result == first->batch) {
+        // Switch to temporary batch instead of disturbing caller's batch
+        result = tmp_batch_;
+        assert(WriteBatchInternal::Count(result) == 0);
+        WriteBatchInternal::Append(result, first->batch);
+      }
+      WriteBatchInternal::Append(result, w->batch);
+    }
+    *last_writer = w;
+  }
+  return result;
+}
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+Status DBImpl::MakeRoomForWrite(bool force) {
+  mutex_.AssertHeld();
+  assert(!writers_.empty());
+  bool allow_delay = !force;
+  Status s;
+  while (true) {
+    if (!bg_error_.ok()) {
+      // Yield previous error
+      s = bg_error_;
+      break;
+    } else if (
+        allow_delay &&
+        versions_->NumLevelFiles(0) >= config::kL0_SlowdownWritesTrigger) {
+      // We are getting close to hitting a hard limit on the number of
+      // L0 files.  Rather than delaying a single write by several
+      // seconds when we hit the hard limit, start delaying each
+      // individual write by 1ms to reduce latency variance.  Also,
+      // this delay hands over some CPU to the compaction thread in
+      // case it is sharing the same core as the writer.
+      mutex_.Unlock();
+      env_->SleepForMicroseconds(1000);
+      allow_delay = false;  // Do not delay a single write more than once
+      mutex_.Lock();
+    } else if (!force &&
+               (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
+      // There is room in current memtable
+      break;
+    } else if (imm_ != NULL) {
+      // We have filled up the current memtable, but the previous
+      // one is still being compacted, so we wait.
+      bg_cv_.Wait();
+    } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) {
+      // There are too many level-0 files.
+      Log(options_.info_log, "waiting...\n");
+      bg_cv_.Wait();
+    } else {
+      // Attempt to switch to a new memtable and trigger compaction of old
+      assert(versions_->PrevLogNumber() == 0);
+      uint64_t new_log_number = versions_->NewFileNumber();
+      WritableFile* lfile = NULL;
+      s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile);
+      if (!s.ok()) {
+        // Avoid chewing through file number space in a tight loop.
+        versions_->ReuseFileNumber(new_log_number);
+        break;
+      }
+      delete log_;
+      delete logfile_;
+      logfile_ = lfile;
+      logfile_number_ = new_log_number;
+      log_ = new log::Writer(lfile);
+      imm_ = mem_;
+      has_imm_.Release_Store(imm_);
+      mem_ = new MemTable(internal_comparator_);
+      mem_->Ref();
+      force = false;   // Do not force another compaction if have room
+      MaybeScheduleCompaction();
+    }
+  }
+  return s;
+}
+
+bool DBImpl::GetProperty(const Slice& property, std::string* value) {
+  value->clear();
+
+  MutexLock l(&mutex_);
+  Slice in = property;
+  Slice prefix("leveldb.");
+  if (!in.starts_with(prefix)) return false;
+  in.remove_prefix(prefix.size());
+
+  if (in.starts_with("num-files-at-level")) {
+    in.remove_prefix(strlen("num-files-at-level"));
+    uint64_t level;
+    bool ok = ConsumeDecimalNumber(&in, &level) && in.empty();
+    if (!ok || level >= config::kNumLevels) {
+      return false;
+    } else {
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%d",
+               versions_->NumLevelFiles(static_cast<int>(level)));
+      *value = buf;
+      return true;
+    }
+  } else if (in == "stats") {
+    char buf[200];
+    snprintf(buf, sizeof(buf),
+             "                               Compactions\n"
+             "Level  Files Size(MB) Time(sec) Read(MB) Write(MB)\n"
+             "--------------------------------------------------\n"
+             );
+    value->append(buf);
+    for (int level = 0; level < config::kNumLevels; level++) {
+      int files = versions_->NumLevelFiles(level);
+      if (stats_[level].micros > 0 || files > 0) {
+        snprintf(
+            buf, sizeof(buf),
+            "%3d %8d %8.0f %9.0f %8.0f %9.0f\n",
+            level,
+            files,
+            versions_->NumLevelBytes(level) / 1048576.0,
+            stats_[level].micros / 1e6,
+            stats_[level].bytes_read / 1048576.0,
+            stats_[level].bytes_written / 1048576.0);
+        value->append(buf);
+      }
+    }
+    return true;
+  } else if (in == "sstables") {
+    *value = versions_->current()->DebugString();
+    return true;
+  }
+
+  return false;
+}
+
+void DBImpl::GetApproximateSizes(
+    const Range* range, int n,
+    uint64_t* sizes) {
+  // TODO(opt): better implementation
+  Version* v;
+  {
+    MutexLock l(&mutex_);
+    versions_->current()->Ref();
+    v = versions_->current();
+  }
+
+  for (int i = 0; i < n; i++) {
+    // Convert user_key into a corresponding internal key.
+    InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
+    InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
+    uint64_t start = versions_->ApproximateOffsetOf(v, k1);
+    uint64_t limit = versions_->ApproximateOffsetOf(v, k2);
+    sizes[i] = (limit >= start ? limit - start : 0);
+  }
+
+  {
+    MutexLock l(&mutex_);
+    v->Unref();
+  }
+}
+
+// Default implementations of convenience methods that subclasses of DB
+// can call if they wish
+Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
+  WriteBatch batch;
+  batch.Put(key, value);
+  return Write(opt, &batch);
+}
+
+Status DB::Delete(const WriteOptions& opt, const Slice& key) {
+  WriteBatch batch;
+  batch.Delete(key);
+  return Write(opt, &batch);
+}
+
+DB::~DB() { }
+
+Status DB::Open(const Options& options, const std::string& dbname,
+                DB** dbptr) {
+  *dbptr = NULL;
+
+  DBImpl* impl = new DBImpl(options, dbname);
+  impl->mutex_.Lock();
+  VersionEdit edit;
+  Status s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists
+  if (s.ok()) {
+    uint64_t new_log_number = impl->versions_->NewFileNumber();
+    WritableFile* lfile;
+    s = options.env->NewWritableFile(LogFileName(dbname, new_log_number),
+                                     &lfile);
+    if (s.ok()) {
+      edit.SetLogNumber(new_log_number);
+      impl->logfile_ = lfile;
+      impl->logfile_number_ = new_log_number;
+      impl->log_ = new log::Writer(lfile);
+      s = impl->versions_->LogAndApply(&edit, &impl->mutex_);
+    }
+    if (s.ok()) {
+      impl->DeleteObsoleteFiles();
+      impl->MaybeScheduleCompaction();
+    }
+  }
+  impl->mutex_.Unlock();
+  if (s.ok()) {
+    *dbptr = impl;
+  } else {
+    delete impl;
+  }
+  return s;
+}
+
+Snapshot::~Snapshot() {
+}
+
+Status DestroyDB(const std::string& dbname, const Options& options) {
+  Env* env = options.env;
+  std::vector<std::string> filenames;
+  // Ignore error in case directory does not exist
+  env->GetChildren(dbname, &filenames);
+  if (filenames.empty()) {
+    return Status::OK();
+  }
+
+  FileLock* lock;
+  const std::string lockname = LockFileName(dbname);
+  Status result = env->LockFile(lockname, &lock);
+  if (result.ok()) {
+    uint64_t number;
+    FileType type;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type) &&
+          type != kDBLockFile) {  // Lock file will be deleted at end
+        Status del = env->DeleteFile(dbname + "/" + filenames[i]);
+        if (result.ok() && !del.ok()) {
+          result = del;
+        }
+      }
+    }
+    env->UnlockFile(lock);  // Ignore error since state is already gone
+    env->DeleteFile(lockname);
+    env->DeleteDir(dbname);  // Ignore error in case dir contains other files
+  }
+  return result;
+}
+
+}  // namespace leveldb
diff --git a/src/leveldb/db/db_impl.h b/src/leveldb/db/db_impl.h
new file mode 100644
index 0000000000..8d2bb3405f
--- /dev/null
+++ b/src/leveldb/db/db_impl.h
@@ -0,0 +1,194 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_DB_IMPL_H_
+#define STORAGE_LEVELDB_DB_DB_IMPL_H_
+
+#include <deque>
+#include <set>
+#include "db/dbformat.h"
+#include "db/log_writer.h"
+#include "db/snapshot.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+class MemTable;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+
+class DBImpl : public DB {
+ public:
+  DBImpl(const Options& options, const std::string& dbname);
+  virtual ~DBImpl();
+
+  // Implementations of the DB interface
+  virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
+  virtual Status Delete(const WriteOptions&, const Slice& key);
+  virtual Status Write(const WriteOptions& options, WriteBatch* updates);
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key,
+                     std::string* value);
+  virtual Iterator* NewIterator(const ReadOptions&);
+  virtual const Snapshot* GetSnapshot();
+  virtual void ReleaseSnapshot(const Snapshot* snapshot);
+  virtual bool GetProperty(const Slice& property, std::string* value);
+  virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
+  virtual void CompactRange(const Slice* begin, const Slice* end);
+
+  // Extra methods (for testing) that are not in the public DB interface
+
+  // Compact any files in the named level that overlap [*begin,*end]
+  void TEST_CompactRange(int level, const Slice* begin, const Slice* end);
+
+  // Force current memtable contents to be compacted.
+  Status TEST_CompactMemTable();
+
+  // Return an internal iterator over the current state of the database.
+  // The keys of this iterator are internal keys (see format.h).
+  // The returned iterator should be deleted when no longer needed.
+  Iterator* TEST_NewInternalIterator();
+
+  // Return the maximum overlapping data (in bytes) at next level for any
+  // file at a level >= 1.
+  int64_t TEST_MaxNextLevelOverlappingBytes();
+
+ private:
+  friend class DB;
+  struct CompactionState;
+  struct Writer;
+
+  Iterator* NewInternalIterator(const ReadOptions&,
+                                SequenceNumber* latest_snapshot);
+
+  Status NewDB();
+
+  // Recover the descriptor from persistent storage.  May do a significant
+  // amount of work to recover recently logged updates.  Any changes to
+  // be made to the descriptor are added to *edit.
+  Status Recover(VersionEdit* edit);
+
+  void MaybeIgnoreError(Status* s) const;
+
+  // Delete any unneeded files and stale in-memory entries.
+  void DeleteObsoleteFiles();
+
+  // Compact the in-memory write buffer to disk.  Switches to a new
+  // log-file/memtable and writes a new descriptor iff successful.
+  Status CompactMemTable();
+
+  Status RecoverLogFile(uint64_t log_number,
+                        VersionEdit* edit,
+                        SequenceNumber* max_sequence);
+
+  Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base);
+
+  Status MakeRoomForWrite(bool force /* compact even if there is room? */);
+  WriteBatch* BuildBatchGroup(Writer** last_writer);
+
+  void MaybeScheduleCompaction();
+  static void BGWork(void* db);
+  void BackgroundCall();
+  Status BackgroundCompaction();
+  void CleanupCompaction(CompactionState* compact);
+  Status DoCompactionWork(CompactionState* compact);
+
+  Status OpenCompactionOutputFile(CompactionState* compact);
+  Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
+  Status InstallCompactionResults(CompactionState* compact);
+
+  // Constant after construction
+  Env* const env_;
+  const InternalKeyComparator internal_comparator_;
+  const InternalFilterPolicy internal_filter_policy_;
+  const Options options_;  // options_.comparator == &internal_comparator_
+  bool owns_info_log_;
+  bool owns_cache_;
+  const std::string dbname_;
+
+  // table_cache_ provides its own synchronization
+  TableCache* table_cache_;
+
+  // Lock over the persistent DB state.  Non-NULL iff successfully acquired.
+  FileLock* db_lock_;
+
+  // State below is protected by mutex_
+  port::Mutex mutex_;
+  port::AtomicPointer shutting_down_;
+  port::CondVar bg_cv_;          // Signalled when background work finishes
+  MemTable* mem_;
+  MemTable* imm_;                // Memtable being compacted
+  port::AtomicPointer has_imm_;  // So bg thread can detect non-NULL imm_
+  WritableFile* logfile_;
+  uint64_t logfile_number_;
+  log::Writer* log_;
+
+  // Queue of writers.
+  std::deque<Writer*> writers_;
+  WriteBatch* tmp_batch_;
+
+  SnapshotList snapshots_;
+
+  // Set of table files to protect from deletion because they are
+  // part of ongoing compactions.
+  std::set<uint64_t> pending_outputs_;
+
+  // Has a background compaction been scheduled or is running?
+  bool bg_compaction_scheduled_;
+
+  // Information for a manual compaction
+  struct ManualCompaction {
+    int level;
+    bool done;
+    const InternalKey* begin;   // NULL means beginning of key range
+    const InternalKey* end;     // NULL means end of key range
+    InternalKey tmp_storage;    // Used to keep track of compaction progress
+  };
+  ManualCompaction* manual_compaction_;
+
+  VersionSet* versions_;
+
+  // Have we encountered a background error in paranoid mode?
+  Status bg_error_;
+
+  // Per level compaction stats.  stats_[level] stores the stats for
+  // compactions that produced data for the specified "level".
+  struct CompactionStats {
+    int64_t micros;
+    int64_t bytes_read;
+    int64_t bytes_written;
+
+    CompactionStats() : micros(0), bytes_read(0), bytes_written(0) { }
+
+    void Add(const CompactionStats& c) {
+      this->micros += c.micros;
+      this->bytes_read += c.bytes_read;
+      this->bytes_written += c.bytes_written;
+    }
+  };
+  CompactionStats stats_[config::kNumLevels];
+
+  // No copying allowed
+  DBImpl(const DBImpl&);
+  void operator=(const DBImpl&);
+
+  const Comparator* user_comparator() const {
+    return internal_comparator_.user_comparator();
+  }
+};
+
+// Sanitize db options.  The caller should delete result.info_log if
+// it is not equal to src.info_log.
+extern Options SanitizeOptions(const std::string& db,
+                               const InternalKeyComparator* icmp,
+                               const InternalFilterPolicy* ipolicy,
+                               const Options& src);
+
+}  // namespace leveldb
+
+#endif  // STORAGE_LEVELDB_DB_DB_IMPL_H_
diff --git a/src/leveldb/db/db_iter.cc b/src/leveldb/db/db_iter.cc
new file mode 100644
index 0000000000..87dca2ded4
--- /dev/null
+++ b/src/leveldb/db/db_iter.cc
@@ -0,0 +1,299 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_iter.h"
+
+#include "db/filename.h"
+#include "db/dbformat.h"
+#include "leveldb/env.h"
+#include "leveldb/iterator.h"
+#include "port/port.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+
+namespace leveldb {
+
+#if 0
+static void DumpInternalIter(Iterator* iter) {
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ParsedInternalKey k;
+    if (!ParseInternalKey(iter->key(), &k)) {
+      fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str());
+    } else {
+      fprintf(stderr, "@ '%s'\n", k.DebugString().c_str());
+    }
+  }
+}
+#endif
+
+namespace {
+
+// Memtables and sstables that make the DB representation contain
+// (userkey,seq,type) => uservalue entries.  DBIter
+// combines multiple entries for the same userkey found in the DB
+// representation into a single entry while accounting for sequence
+// numbers, deletion markers, overwrites, etc.
+class DBIter: public Iterator {
+ public:
+  // Which direction is the iterator currently moving?
+  // (1) When moving forward, the internal iterator is positioned at
+  //     the exact entry that yields this->key(), this->value()
+  // (2) When moving backwards, the internal iterator is positioned
+  //     just before all entries whose user key == this->key().
+  enum Direction {
+    kForward,
+    kReverse
+  };
+
+  DBIter(const std::string* dbname, Env* env,
+         const Comparator* cmp, Iterator* iter, SequenceNumber s)
+      : dbname_(dbname),
+        env_(env),
+        user_comparator_(cmp),
+        iter_(iter),
+        sequence_(s),
+        direction_(kForward),
+        valid_(false) {
+  }
+  virtual ~DBIter() {
+    delete iter_;
+  }
+  virtual bool Valid() const { return valid_; }
+  virtual Slice key() const {
+    assert(valid_);
+    return (direction_ == kForward) ? ExtractUserKey(iter_->key()) : saved_key_;
+  }
+  virtual Slice value() const {
+    assert(valid_);
+    return (direction_ == kForward) ? iter_->value() : saved_value_;
+  }
+  virtual Status status() const {
+    if (status_.ok()) {
+      return iter_->status();
+    } else {
+      return status_;
+    }
+  }
+
+  virtual void Next();
+  virtual void Prev();
+  virtual void Seek(const Slice& target);
+  virtual void SeekToFirst();
+  virtual void SeekToLast();
+
+ private:
+  void FindNextUserEntry(bool skipping, std::string* skip);
+  void FindPrevUserEntry();
+  bool ParseKey(ParsedInternalKey* key);
+
+  inline void SaveKey(const Slice& k, std::string* dst) {
+    dst->assign(k.data(), k.size());
+  }
+
+  inline void ClearSavedValue() {
+    if (saved_value_.capacity() > 1048576) {
+      std::string empty;
+      swap(empty, saved_value_);
+    } else {
+      saved_value_.clear();
+    }
+  }
+
+  const std::string* const dbname_;
+  Env* const env_;
+  const Comparator* const user_comparator_;
+  Iterator* const iter_;
+  SequenceNumber const sequence_;
+
+  Status status_;
+  std::string saved_key_;     // == current key when direction_==kReverse
+  std::string saved_value_;   // == current raw value when direction_==kReverse
+  Direction direction_;
+  bool valid_;
+
+  // No copying allowed
+  DBIter(const DBIter&);
+  void operator=(const DBIter&);
+};
+
+inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
+  if (!ParseInternalKey(iter_->key(), ikey)) {
+    status_ = Status::Corruption("corrupted internal key in DBIter");
+    return false;
+  } else {
+    return true;
+  }
+}
+
+void DBIter::Next() {
+  assert(valid_);
+
+  if (direction_ == kReverse) {  // Switch directions?
+    direction_ = kForward;
+    // iter_ is pointing just before the entries for this->key(),
+    // so advance into the range of entries for this->key() and then
+    // use the normal skipping code below.
+    if (!iter_->Valid()) {
+      iter_->SeekToFirst();
+    } else {
+      iter_->Next();
+    }
+    if (!iter_->Valid()) {
+      valid_ = false;
+      saved_key_.clear();
+      return;
+    }
+  }
+
+  // Temporarily use saved_key_ as storage for key to skip.
+  std::string* skip = &saved_key_;
+  SaveKey(ExtractUserKey(iter_->key()), skip);
+  FindNextUserEntry(true, skip);
+}
+
+void DBIter::FindNextUserEntry(bool skipping, std::string* skip) {
+  // Loop until we hit an acceptable entry to yield
+  assert(iter_->Valid());
+  assert(direction_ == kForward);
+  do {
+    ParsedInternalKey ikey;
+    if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
+      switch (ikey.type) {
+        case kTypeDeletion:
+          // Arrange to skip all upcoming entries for this key since
+          // they are hidden by this deletion.
+          SaveKey(ikey.user_key, skip);
+          skipping = true;
+          break;
+        case kTypeValue:
+          if (skipping &&
+              user_comparator_->Compare(ikey.user_key, *skip) <= 0) {
+            // Entry hidden
+          } else {
+            valid_ = true;
+            saved_key_.clear();
+            return;
+          }
+          break;
+      }
+    }
+    iter_->Next();
+  } while (iter_->Valid());
+  saved_key_.clear();
+  valid_ = false;
+}
+
+void DBIter::Prev() {
+  assert(valid_);
+
+  if (direction_ == kForward) {  // Switch directions?
+    // iter_ is pointing at the current entry.  Scan backwards until
+    // the key changes so we can use the normal reverse scanning code.
+    assert(iter_->Valid());  // Otherwise valid_ would have been false
+    SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
+    while (true) {
+      iter_->Prev();
+      if (!iter_->Valid()) {
+        valid_ = false;
+        saved_key_.clear();
+        ClearSavedValue();
+        return;
+      }
+      if (user_comparator_->Compare(ExtractUserKey(iter_->key()),
+                                    saved_key_) < 0) {
+        break;
+      }
+    }
+    direction_ = kReverse;
+  }
+
+  FindPrevUserEntry();
+}
+
+void DBIter::FindPrevUserEntry() {
+  assert(direction_ == kReverse);
+
+  ValueType value_type = kTypeDeletion;
+  if (iter_->Valid()) {
+    do {
+      ParsedInternalKey ikey;
+      if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
+        if ((value_type != kTypeDeletion) &&
+            user_comparator_->Compare(ikey.user_key, saved_key_) < 0) {
+          // We encountered a non-deleted value in entries for previous keys,
+          break;
+        }
+        value_type = ikey.type;
+        if (value_type == kTypeDeletion) {
+          saved_key_.clear();
+          ClearSavedValue();
+        } else {
+          Slice raw_value = iter_->value();
+          if (saved_value_.capacity() > raw_value.size() + 1048576) {
+            std::string empty;
+            swap(empty, saved_value_);
+          }
+          SaveKey(ExtractUserKey(iter_->key()), &saved_key_);
+          saved_value_.assign(raw_value.data(), raw_value.size());
+        }
+      }
+      iter_->Prev();
+    } while (iter_->Valid());
+  }
+
+  if (value_type == kTypeDeletion) {
+    // End
+    valid_ = false;
+    saved_key_.clear();
+    ClearSavedValue();
+    direction_ = kForward;
+  } else {
+    valid_ = true;
+  }
+}
+
+void DBIter::Seek(const Slice& target) {
+  direction_ = kForward;
+  ClearSavedValue();
+  saved_key_.clear();
+  AppendInternalKey(
+      &saved_key_, ParsedInternalKey(target, sequence_, kValueTypeForSeek));
+  iter_->Seek(saved_key_);
+  if (iter_->Valid()) {
+    FindNextUserEntry(false, &saved_key_ /* temporary storage */);
+  } else {
+    valid_ = false;
+  }
+}
+
+void DBIter::SeekToFirst() {
+  direction_ = kForward;
+  ClearSavedValue();
+  iter_->SeekToFirst();
+  if (iter_->Valid()) {
+    FindNextUserEntry(false, &saved_key_ /* temporary storage */);
+  } else {
+    valid_ = false;
+  }
+}
+
+void DBIter::SeekToLast() {
+  direction_ = kReverse;
+  ClearSavedValue();
+  iter_->SeekToLast();
+  FindPrevUserEntry();
+}
+
+}  // anonymous namespace
+
+Iterator* NewDBIterator(
+    const std::string* dbname,
+    Env* env,
+    const Comparator* user_key_comparator,
+    Iterator* internal_iter,
+    const SequenceNumber& sequence) {
+  return new DBIter(dbname, env, user_key_comparator, internal_iter, sequence);
+}
+
+}  // namespace leveldb
diff --git a/src/leveldb/db/db_iter.h b/src/leveldb/db/db_iter.h
new file mode 100644
index 0000000000..d9e1b174ab
--- /dev/null
+++ b/src/leveldb/db/db_iter.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_DB_ITER_H_
+#define STORAGE_LEVELDB_DB_DB_ITER_H_
+
+#include <stdint.h>
+#include "leveldb/db.h"
+#include "db/dbformat.h"
+
+namespace leveldb {
+
+// Return a new iterator that converts internal keys (yielded by
+// "*internal_iter") that were live at the specified "sequence" number
+// into appropriate user keys.
+extern Iterator* NewDBIterator(
+    const std::string* dbname,
+    Env* env,
+    const Comparator* user_key_comparator,
+    Iterator* internal_iter,
+    const SequenceNumber& sequence);
+
+}  // namespace leveldb
+
+#endif  // STORAGE_LEVELDB_DB_DB_ITER_H_
diff --git a/src/leveldb/db/db_test.cc b/src/leveldb/db/db_test.cc
new file mode 100644
index 0000000000..3744d0e117
--- /dev/null
+++ b/src/leveldb/db/db_test.cc
@@ -0,0 +1,1946 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/db.h"
+#include "leveldb/filter_policy.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/cache.h"
+#include "leveldb/env.h"
+#include "leveldb/table.h"
+#include "util/hash.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+static std::string RandomString(Random* rnd, int len) {
+  std::string r;
+  test::RandomString(rnd, len, &r);
+  return r;
+}
+
+namespace {
+class AtomicCounter {
+ private:
+  port::Mutex mu_;
+  int count_;
+ public:
+  AtomicCounter() : count_(0) { }
+  void Increment() {
+    MutexLock l(&mu_);
+    count_++;
+  }
+  int Read() {
+    MutexLock l(&mu_);
+    return count_;
+  }
+  void Reset() {
+    MutexLock l(&mu_);
+    count_ = 0;
+  }
+};
+}
+
+// Special Env used to delay background operations
+class SpecialEnv : public EnvWrapper {
+ public:
+  // sstable Sync() calls are blocked while this pointer is non-NULL.
+  port::AtomicPointer delay_sstable_sync_;
+
+  // Simulate no-space errors while this pointer is non-NULL.
+  port::AtomicPointer no_space_;
+
+  // Simulate non-writable file system while this pointer is non-NULL
+  port::AtomicPointer non_writable_;
+
+  bool count_random_reads_;
+  AtomicCounter random_read_counter_;
+
+  AtomicCounter sleep_counter_;
+
+  explicit SpecialEnv(Env* base) : EnvWrapper(base) {
+    delay_sstable_sync_.Release_Store(NULL);
+    no_space_.Release_Store(NULL);
+    non_writable_.Release_Store(NULL);
+    count_random_reads_ = false;
+  }
+
+  Status NewWritableFile(const std::string& f, WritableFile** r) {
+    class SSTableFile : public WritableFile {
+     private:
+      SpecialEnv* env_;
+      WritableFile* base_;
+
+     public:
+      SSTableFile(SpecialEnv* env, WritableFile* base)
+          : env_(env),
+            base_(base) {
+      }
+      ~SSTableFile() { delete base_; }
+      Status Append(const Slice& data) {
+        if (env_->no_space_.Acquire_Load() != NULL) {
+          // Drop writes on the floor
+          return Status::OK();
+        } else {
+          return base_->Append(data);
+        }
+      }
+      Status Close() { return base_->Close(); }
+      Status Flush() { return base_->Flush(); }
+      Status Sync() {
+        while (env_->delay_sstable_sync_.Acquire_Load() != NULL) {
+          env_->SleepForMicroseconds(100000);
+        }
+        return base_->Sync();
+      }
+    };
+
+    if (non_writable_.Acquire_Load() != NULL) {
+      return Status::IOError("simulated write error");
+    }
+
+    Status s = target()->NewWritableFile(f, r);
+    if (s.ok()) {
+      if (strstr(f.c_str(), ".sst") != NULL) {
+        *r = new SSTableFile(this, *r);
+      }
+    }
+    return s;
+  }
+
+  Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
+    class CountingFile : public RandomAccessFile {
+     private:
+      RandomAccessFile* target_;
+      AtomicCounter* counter_;
+     public:
+      CountingFile(RandomAccessFile* target, AtomicCounter* counter)
+          : target_(target), counter_(counter) {
+      }
+      virtual ~CountingFile() { delete target_; }
+      virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                          char* scratch) const {
+        counter_->Increment();
+        return target_->Read(offset, n, result, scratch);
+      }
+    };
+
+    Status s = target()->NewRandomAccessFile(f, r);
+    if (s.ok() && count_random_reads_) {
+      *r = new CountingFile(*r, &random_read_counter_);
+    }
+    return s;
+  }
+
+  virtual void SleepForMicroseconds(int micros) {
+    sleep_counter_.Increment();
+    target()->SleepForMicroseconds(micros);
+  }
+};
+
+class DBTest {
+ private:
+  const FilterPolicy* filter_policy_;
+
+  // Sequence of option configurations to try
+  enum OptionConfig {
+    kDefault,
+    kFilter,
+    kUncompressed,
+    kEnd
+  };
+  int option_config_;
+
+ public:
+  std::string dbname_;
+  SpecialEnv* env_;
+  DB* db_;
+
+  Options last_options_;
+
+  DBTest() : option_config_(kDefault),
+             env_(new SpecialEnv(Env::Default())) {
+    filter_policy_ = NewBloomFilterPolicy(10);
+    dbname_ = test::TmpDir() + "/db_test";
+    DestroyDB(dbname_, Options());
+    db_ = NULL;
+    Reopen();
+  }
+
+  ~DBTest() {
+    delete db_;
+    DestroyDB(dbname_, Options());
+    delete env_;
+    delete filter_policy_;
+  }
+
+  // Switch to a fresh database with the next option configuration to
+  // test.  Return false if there are no more configurations to test.
+  bool ChangeOptions() {
+    option_config_++;
+    if (option_config_ >= kEnd) {
+      return false;
+    } else {
+      DestroyAndReopen();
+      return true;
+    }
+  }
+
+  // Return the current option configuration.
+  Options CurrentOptions() {
+    Options options;
+    switch (option_config_) {
+      case kFilter:
+        options.filter_policy = filter_policy_;
+        break;
+      case kUncompressed:
+        options.compression = kNoCompression;
+        break;
+      default:
+        break;
+    }
+    return options;
+  }
+
+  DBImpl* dbfull() {
+    return reinterpret_cast<DBImpl*>(db_);
+  }
+
+  void Reopen(Options* options = NULL) {
+    ASSERT_OK(TryReopen(options));
+  }
+
+  void Close() {
+    delete db_;
+    db_ = NULL;
+  }
+
+  void DestroyAndReopen(Options* options = NULL) {
+    delete db_;
+    db_ = NULL;
+    DestroyDB(dbname_, Options());
+    ASSERT_OK(TryReopen(options));
+  }
+
+  Status TryReopen(Options* options) {
+    delete db_;
+    db_ = NULL;
+    Options opts;
+    if (options != NULL) {
+      opts = *options;
+    } else {
+      opts = CurrentOptions();
+      opts.create_if_missing = true;
+    }
+    last_options_ = opts;
+
+    return DB::Open(opts, dbname_, &db_);
+  }
+
+  Status Put(const std::string& k, const std::string& v) {
+    return db_->Put(WriteOptions(), k, v);
+  }
+
+  Status Delete(const std::string& k) {
+    return db_->Delete(WriteOptions(), k);
+  }
+
+  std::string Get(const std::string& k, const Snapshot* snapshot = NULL) {
+    ReadOptions options;
+    options.snapshot = snapshot;
+    std::string result;
+    Status s = db_->Get(options, k, &result);
+    if (s.IsNotFound()) {
+      result = "NOT_FOUND";
+    } else if (!s.ok()) {
+      result = s.ToString();
+    }
+    return result;
+  }
+
+  // Return a string that contains all key,value pairs in order,
+  // formatted like "(k1->v1)(k2->v2)".
+  std::string Contents() {
+    std::vector<std::string> forward;
+    std::string result;
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+      std::string s = IterStatus(iter);
+      result.push_back('(');
+      result.append(s);
+      result.push_back(')');
+      forward.push_back(s);
+    }
+
+    // Check reverse iteration results are the reverse of forward results
+    int matched = 0;
+    for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+      ASSERT_LT(matched, forward.size());
+      ASSERT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
+      matched++;
+    }
+    ASSERT_EQ(matched, forward.size());
+
+    delete iter;
+    return result;
+  }
+
+  std::string AllEntriesFor(const Slice& user_key) {
+    Iterator* iter = dbfull()->TEST_NewInternalIterator();
+    InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
+    iter->Seek(target.Encode());
+    std::string result;
+    if (!iter->status().ok()) {
+      result = iter->status().ToString();
+    } else {
+      result = "[ ";
+      bool first = true;
+      while (iter->Valid()) {
+        ParsedInternalKey ikey;
+        if (!ParseInternalKey(iter->key(), &ikey)) {
+          result += "CORRUPTED";
+        } else {
+          if (last_options_.comparator->Compare(ikey.user_key, user_key) != 0) {
+            break;
+          }
+          if (!first) {
+            result += ", ";
+          }
+          first = false;
+          switch (ikey.type) {
+            case kTypeValue:
+              result += iter->value().ToString();
+              break;
+            case kTypeDeletion:
+              result += "DEL";
+              break;
+          }
+        }
+        iter->Next();
+      }
+      if (!first) {
+        result += " ";
+      }
+      result += "]";
+    }
+    delete iter;
+    return result;
+  }
+
+  int NumTableFilesAtLevel(int level) {
+    std::string property;
+    ASSERT_TRUE(
+        db_->GetProperty("leveldb.num-files-at-level" + NumberToString(level),
+                         &property));
+    return atoi(property.c_str());
+  }
+
+  int TotalTableFiles() {
+    int result = 0;
+    for (int level = 0; level < config::kNumLevels; level++) {
+      result += NumTableFilesAtLevel(level);
+    }
+    return result;
+  }
+
+  // Return spread of files per level
+  std::string FilesPerLevel() {
+    std::string result;
+    int last_non_zero_offset = 0;
+    for (int level = 0; level < config::kNumLevels; level++) {
+      int f = NumTableFilesAtLevel(level);
+      char buf[100];
+      snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+      result += buf;
+      if (f > 0) {
+        last_non_zero_offset = result.size();
+      }
+    }
+    result.resize(last_non_zero_offset);
+    return result;
+  }
+
+  int CountFiles() {
+    std::vector<std::string> files;
+    env_->GetChildren(dbname_, &files);
+    return static_cast<int>(files.size());
+  }
+
+  uint64_t Size(const Slice& start, const Slice& limit) {
+    Range r(start, limit);
+    uint64_t size;
+    db_->GetApproximateSizes(&r, 1, &size);
+    return size;
+  }
+
+  void Compact(const Slice& start, const Slice& limit) {
+    db_->CompactRange(&start, &limit);
+  }
+
+  // Do n memtable compactions, each of which produces an sstable
+  // covering the range [small,large].
+  void MakeTables(int n, const std::string& small, const std::string& large) {
+    for (int i = 0; i < n; i++) {
+      Put(small, "begin");
+      Put(large, "end");
+      dbfull()->TEST_CompactMemTable();
+    }
+  }
+
+  // Prevent pushing of new sstables into deeper levels by adding
+  // tables that cover a specified range to all levels.
+  void FillLevels(const std::string& smallest, const std::string& largest) {
+    MakeTables(config::kNumLevels, smallest, largest);
+  }
+
+  void DumpFileCounts(const char* label) {
+    fprintf(stderr, "---\n%s:\n", label);
+    fprintf(stderr, "maxoverlap: %lld\n",
+            static_cast<long long>(
+                dbfull()->TEST_MaxNextLevelOverlappingBytes()));
+    for (int level = 0; level < config::kNumLevels; level++) {
+      int num = NumTableFilesAtLevel(level);
+      if (num > 0) {
+        fprintf(stderr, "  level %3d : %d files\n", level, num);
+      }
+    }
+  }
+
+  std::string DumpSSTableList() {
+    std::string property;
+    db_->GetProperty("leveldb.sstables", &property);
+    return property;
+  }
+
+  std::string IterStatus(Iterator* iter) {
+    std::string result;
+    if (iter->Valid()) {
+      result = iter->key().ToString() + "->" + iter->value().ToString();
+    } else {
+      result = "(invalid)";
+    }
+    return result;
+  }
+};
+
+TEST(DBTest, Empty) {
+  do {
+    ASSERT_TRUE(db_ != NULL);
+    ASSERT_EQ("NOT_FOUND", Get("foo"));
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, ReadWrite) {
+  do {
+    ASSERT_OK(Put("foo", "v1"));
+    ASSERT_EQ("v1", Get("foo"));
+    ASSERT_OK(Put("bar", "v2"));
+    ASSERT_OK(Put("foo", "v3"));
+    ASSERT_EQ("v3", Get("foo"));
+    ASSERT_EQ("v2", Get("bar"));
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, PutDeleteGet) {
+  do {
+    ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
+    ASSERT_EQ("v1", Get("foo"));
+    ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
+    ASSERT_EQ("v2", Get("foo"));
+    ASSERT_OK(db_->Delete(WriteOptions(), "foo"));
+    ASSERT_EQ("NOT_FOUND", Get("foo"));
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, GetFromImmutableLayer) {
+  do {
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.write_buffer_size = 100000;  // Small write buffer
+    Reopen(&options);
+
+    ASSERT_OK(Put("foo", "v1"));
+    ASSERT_EQ("v1", Get("foo"));
+
+    env_->delay_sstable_sync_.Release_Store(env_);   // Block sync calls
+    Put("k1", std::string(100000, 'x'));             // Fill memtable
+    Put("k2", std::string(100000, 'y'));             // Trigger compaction
+    ASSERT_EQ("v1", Get("foo"));
+    env_->delay_sstable_sync_.Release_Store(NULL);   // Release sync calls
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, GetFromVersions) {
+  do {
+    ASSERT_OK(Put("foo", "v1"));
+    dbfull()->TEST_CompactMemTable();
+    ASSERT_EQ("v1", Get("foo"));
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, GetSnapshot) {
+  do {
+    // Try with both a short key and a long key
+    for (int i = 0; i < 2; i++) {
+      std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x');
+      ASSERT_OK(Put(key, "v1"));
+      const Snapshot* s1 = db_->GetSnapshot();
+      ASSERT_OK(Put(key, "v2"));
+      ASSERT_EQ("v2", Get(key));
+      ASSERT_EQ("v1", Get(key, s1));
+      dbfull()->TEST_CompactMemTable();
+      ASSERT_EQ("v2", Get(key));
+      ASSERT_EQ("v1", Get(key, s1));
+      db_->ReleaseSnapshot(s1);
+    }
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, GetLevel0Ordering) {
+  do {
+    // Check that we process level-0 files in correct order.  The code
+    // below generates two level-0 files where the earlier one comes
+    // before the later one in the level-0 file list since the earlier
+    // one has a smaller "smallest" key.
+    ASSERT_OK(Put("bar", "b"));
+    ASSERT_OK(Put("foo", "v1"));
+    dbfull()->TEST_CompactMemTable();
+    ASSERT_OK(Put("foo", "v2"));
+    dbfull()->TEST_CompactMemTable();
+    ASSERT_EQ("v2", Get("foo"));
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, GetOrderedByLevels) {
+  do {
+    ASSERT_OK(Put("foo", "v1"));
+    Compact("a", "z");
+    ASSERT_EQ("v1", Get("foo"));
+    ASSERT_OK(Put("foo", "v2"));
+    ASSERT_EQ("v2", Get("foo"));
+    dbfull()->TEST_CompactMemTable();
+    ASSERT_EQ("v2", Get("foo"));
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, GetPicksCorrectFile) {
+  do {
+    // Arrange to have multiple files in a non-level-0 level.
+    ASSERT_OK(Put("a", "va"));
+    Compact("a", "b");
+    ASSERT_OK(Put("x", "vx"));
+    Compact("x", "y");
+    ASSERT_OK(Put("f", "vf"));
+    Compact("f", "g");
+    ASSERT_EQ("va", Get("a"));
+    ASSERT_EQ("vf", Get("f"));
+    ASSERT_EQ("vx", Get("x"));
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, GetEncountersEmptyLevel) {
+  do {
+    // Arrange for the following to happen:
+    //   * sstable A in level 0
+    //   * nothing in level 1
+    //   * sstable B in level 2
+    // Then do enough Get() calls to arrange for an automatic compaction
+    // of sstable A.  A bug would cause the compaction to be marked as
+    // occuring at level 1 (instead of the correct level 0).
+
+    // Step 1: First place sstables in levels 0 and 2
+    int compaction_count = 0;
+    while (NumTableFilesAtLevel(0) == 0 ||
+           NumTableFilesAtLevel(2) == 0) {
+      ASSERT_LE(compaction_count, 100) << "could not fill levels 0 and 2";
+      compaction_count++;
+      Put("a", "begin");
+      Put("z", "end");
+      dbfull()->TEST_CompactMemTable();
+    }
+
+    // Step 2: clear level 1 if necessary.
+    dbfull()->TEST_CompactRange(1, NULL, NULL);
+    ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+    ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+    ASSERT_EQ(NumTableFilesAtLevel(2), 1);
+
+    // Step 3: read a bunch of times
+    for (int i = 0; i < 1000; i++) {
+      ASSERT_EQ("NOT_FOUND", Get("missing"));
+    }
+
+    // Step 4: Wait for compaction to finish
+    env_->SleepForMicroseconds(1000000);
+
+    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, IterEmpty) {
+  Iterator* iter = db_->NewIterator(ReadOptions());
+
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+  iter->Seek("foo");
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+  delete iter;
+}
+
+TEST(DBTest, IterSingle) {
+  ASSERT_OK(Put("a", "va"));
+  Iterator* iter = db_->NewIterator(ReadOptions());
+
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "a->va");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "a->va");
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), "a->va");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), "a->va");
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+  iter->Seek("");
+  ASSERT_EQ(IterStatus(iter), "a->va");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+  iter->Seek("a");
+  ASSERT_EQ(IterStatus(iter), "a->va");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+  iter->Seek("b");
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+  delete iter;
+}
+
+TEST(DBTest, IterMulti) {
+  ASSERT_OK(Put("a", "va"));
+  ASSERT_OK(Put("b", "vb"));
+  ASSERT_OK(Put("c", "vc"));
+  Iterator* iter = db_->NewIterator(ReadOptions());
+
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "a->va");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "b->vb");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "c->vc");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "a->va");
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), "c->vc");
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "b->vb");
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "a->va");
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), "c->vc");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+  iter->Seek("");
+  ASSERT_EQ(IterStatus(iter), "a->va");
+  iter->Seek("a");
+  ASSERT_EQ(IterStatus(iter), "a->va");
+  iter->Seek("ax");
+  ASSERT_EQ(IterStatus(iter), "b->vb");
+  iter->Seek("b");
+  ASSERT_EQ(IterStatus(iter), "b->vb");
+  iter->Seek("z");
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+  // Switch from reverse to forward
+  iter->SeekToLast();
+  iter->Prev();
+  iter->Prev();
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "b->vb");
+
+  // Switch from forward to reverse
+  iter->SeekToFirst();
+  iter->Next();
+  iter->Next();
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "b->vb");
+
+  // Make sure iter stays at snapshot
+  ASSERT_OK(Put("a",  "va2"));
+  ASSERT_OK(Put("a2", "va3"));
+  ASSERT_OK(Put("b",  "vb2"));
+  ASSERT_OK(Put("c",  "vc2"));
+  ASSERT_OK(Delete("b"));
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "a->va");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "b->vb");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "c->vc");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), "c->vc");
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "b->vb");
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "a->va");
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+  delete iter;
+}
+
+TEST(DBTest, IterSmallAndLargeMix) {
+  ASSERT_OK(Put("a", "va"));
+  ASSERT_OK(Put("b", std::string(100000, 'b')));
+  ASSERT_OK(Put("c", "vc"));
+  ASSERT_OK(Put("d", std::string(100000, 'd')));
+  ASSERT_OK(Put("e", std::string(100000, 'e')));
+
+  Iterator* iter = db_->NewIterator(ReadOptions());
+
+  iter->SeekToFirst();
+  ASSERT_EQ(IterStatus(iter), "a->va");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "c->vc");
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+  iter->Next();
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+  iter->SeekToLast();
+  ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "c->vc");
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "a->va");
+  iter->Prev();
+  ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+  delete iter;
+}
+
+TEST(DBTest, IterMultiWithDelete) {
+  do {
+    ASSERT_OK(Put("a", "va"));
+    ASSERT_OK(Put("b", "vb"));
+    ASSERT_OK(Put("c", "vc"));
+    ASSERT_OK(Delete("b"));
+    ASSERT_EQ("NOT_FOUND", Get("b"));
+
+    Iterator* iter = db_->NewIterator(ReadOptions());
+    iter->Seek("c");
+    ASSERT_EQ(IterStatus(iter), "c->vc");
+    iter->Prev();
+    ASSERT_EQ(IterStatus(iter), "a->va");
+    delete iter;
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, Recover) {
+  do {
+    ASSERT_OK(Put("foo", "v1"));
+    ASSERT_OK(Put("baz", "v5"));
+
+    Reopen();
+    ASSERT_EQ("v1", Get("foo"));
+
+    ASSERT_EQ("v1", Get("foo"));
+    ASSERT_EQ("v5", Get("baz"));
+    ASSERT_OK(Put("bar", "v2"));
+    ASSERT_OK(Put("foo", "v3"));
+
+    Reopen();
+    ASSERT_EQ("v3", Get("foo"));
+    ASSERT_OK(Put("foo", "v4"));
+    ASSERT_EQ("v4", Get("foo"));
+    ASSERT_EQ("v2", Get("bar"));
+    ASSERT_EQ("v5", Get("baz"));
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, RecoveryWithEmptyLog) {
+  do {
+    ASSERT_OK(Put("foo", "v1"));
+    ASSERT_OK(Put("foo", "v2"));
+    Reopen();
+    Reopen();
+    ASSERT_OK(Put("foo", "v3"));
+    Reopen();
+    ASSERT_EQ("v3", Get("foo"));
+  } while (ChangeOptions());
+}
+
+// Check that writes done during a memtable compaction are recovered
+// if the database is shutdown during the memtable compaction.
+TEST(DBTest, RecoverDuringMemtableCompaction) {
+  do {
+    Options options = CurrentOptions();
+    options.env = env_;
+    options.write_buffer_size = 1000000;
+    Reopen(&options);
+
+    // Trigger a long memtable compaction and reopen the database during it
+    ASSERT_OK(Put("foo", "v1"));                         // Goes to 1st log file
+    ASSERT_OK(Put("big1", std::string(10000000, 'x')));  // Fills memtable
+    ASSERT_OK(Put("big2", std::string(1000, 'y')));      // Triggers compaction
+    ASSERT_OK(Put("bar", "v2"));                         // Goes to new log file
+
+    Reopen(&options);
+    ASSERT_EQ("v1", Get("foo"));
+    ASSERT_EQ("v2", Get("bar"));
+    ASSERT_EQ(std::string(10000000, 'x'), Get("big1"));
+    ASSERT_EQ(std::string(1000, 'y'), Get("big2"));
+  } while (ChangeOptions());
+}
+
+static std::string Key(int i) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "key%06d", i);
+  return std::string(buf);
+}
+
+TEST(DBTest, MinorCompactionsHappen) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 10000;
+  Reopen(&options);
+
+  const int N = 500;
+
+  int starting_num_tables = TotalTableFiles();
+  for (int i = 0; i < N; i++) {
+    ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v')));
+  }
+  int ending_num_tables = TotalTableFiles();
+  ASSERT_GT(ending_num_tables, starting_num_tables);
+
+  for (int i = 0; i < N; i++) {
+    ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i)));
+  }
+
+  Reopen();
+
+  for (int i = 0; i < N; i++) {
+    ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i)));
+  }
+}
+
+TEST(DBTest, RecoverWithLargeLog) {
+  {
+    Options options = CurrentOptions();
+    Reopen(&options);
+    ASSERT_OK(Put("big1", std::string(200000, '1')));
+    ASSERT_OK(Put("big2", std::string(200000, '2')));
+    ASSERT_OK(Put("small3", std::string(10, '3')));
+    ASSERT_OK(Put("small4", std::string(10, '4')));
+    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  }
+
+  // Make sure that if we re-open with a small write buffer size that
+  // we flush table files in the middle of a large log file.
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100000;
+  Reopen(&options);
+  ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+  ASSERT_EQ(std::string(200000, '1'), Get("big1"));
+  ASSERT_EQ(std::string(200000, '2'), Get("big2"));
+  ASSERT_EQ(std::string(10, '3'), Get("small3"));
+  ASSERT_EQ(std::string(10, '4'), Get("small4"));
+  ASSERT_GT(NumTableFilesAtLevel(0), 1);
+}
+
+TEST(DBTest, CompactionsGenerateMultipleFiles) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 100000000;        // Large write buffer
+  Reopen(&options);
+
+  Random rnd(301);
+
+  // Write 8MB (80 values, each 100K)
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  std::vector<std::string> values;
+  for (int i = 0; i < 80; i++) {
+    values.push_back(RandomString(&rnd, 100000));
+    ASSERT_OK(Put(Key(i), values[i]));
+  }
+
+  // Reopening moves updates to level-0
+  Reopen(&options);
+  dbfull()->TEST_CompactRange(0, NULL, NULL);
+
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  ASSERT_GT(NumTableFilesAtLevel(1), 1);
+  for (int i = 0; i < 80; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+}
+
+TEST(DBTest, RepeatedWritesToSameKey) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.write_buffer_size = 100000;  // Small write buffer
+  Reopen(&options);
+
+  // We must have at most one file per level except for level-0,
+  // which may have up to kL0_StopWritesTrigger files.
+  const int kMaxFiles = config::kNumLevels + config::kL0_StopWritesTrigger;
+
+  Random rnd(301);
+  std::string value = RandomString(&rnd, 2 * options.write_buffer_size);
+  for (int i = 0; i < 5 * kMaxFiles; i++) {
+    Put("key", value);
+    ASSERT_LE(TotalTableFiles(), kMaxFiles);
+    fprintf(stderr, "after %d: %d files\n", int(i+1), TotalTableFiles());
+  }
+}
+
+TEST(DBTest, SparseMerge) {
+  Options options = CurrentOptions();
+  options.compression = kNoCompression;
+  Reopen(&options);
+
+  FillLevels("A", "Z");
+
+  // Suppose there is:
+  //    small amount of data with prefix A
+  //    large amount of data with prefix B
+  //    small amount of data with prefix C
+  // and that recent updates have made small changes to all three prefixes.
+  // Check that we do not do a compaction that merges all of B in one shot.
+  const std::string value(1000, 'x');
+  Put("A", "va");
+  // Write approximately 100MB of "B" values
+  for (int i = 0; i < 100000; i++) {
+    char key[100];
+    snprintf(key, sizeof(key), "B%010d", i);
+    Put(key, value);
+  }
+  Put("C", "vc");
+  dbfull()->TEST_CompactMemTable();
+  dbfull()->TEST_CompactRange(0, NULL, NULL);
+
+  // Make sparse update
+  Put("A",    "va2");
+  Put("B100", "bvalue2");
+  Put("C",    "vc2");
+  dbfull()->TEST_CompactMemTable();
+
+  // Compactions should not cause us to create a situation where
+  // a file overlaps too much data at the next level.
+  ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
+  dbfull()->TEST_CompactRange(0, NULL, NULL);
+  ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
+  dbfull()->TEST_CompactRange(1, NULL, NULL);
+  ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
+}
+
+static bool Between(uint64_t val, uint64_t low, uint64_t high) {
+  bool result = (val >= low) && (val <= high);
+  if (!result) {
+    fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
+            (unsigned long long)(val),
+            (unsigned long long)(low),
+            (unsigned long long)(high));
+  }
+  return result;
+}
+
+TEST(DBTest, ApproximateSizes) {
+  do {
+    Options options = CurrentOptions();
+    options.write_buffer_size = 100000000;        // Large write buffer
+    options.compression = kNoCompression;
+    DestroyAndReopen();
+
+    ASSERT_TRUE(Between(Size("", "xyz"), 0, 0));
+    Reopen(&options);
+    ASSERT_TRUE(Between(Size("", "xyz"), 0, 0));
+
+    // Write 8MB (80 values, each 100K)
+    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+    const int N = 80;
+    static const int S1 = 100000;
+    static const int S2 = 105000;  // Allow some expansion from metadata
+    Random rnd(301);
+    for (int i = 0; i < N; i++) {
+      ASSERT_OK(Put(Key(i), RandomString(&rnd, S1)));
+    }
+
+    // 0 because GetApproximateSizes() does not account for memtable space
+    ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
+
+    // Check sizes across recovery by reopening a few times
+    for (int run = 0; run < 3; run++) {
+      Reopen(&options);
+
+      for (int compact_start = 0; compact_start < N; compact_start += 10) {
+        for (int i = 0; i < N; i += 10) {
+          ASSERT_TRUE(Between(Size("", Key(i)), S1*i, S2*i));
+          ASSERT_TRUE(Between(Size("", Key(i)+".suffix"), S1*(i+1), S2*(i+1)));
+          ASSERT_TRUE(Between(Size(Key(i), Key(i+10)), S1*10, S2*10));
+        }
+        ASSERT_TRUE(Between(Size("", Key(50)), S1*50, S2*50));
+        ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), S1*50, S2*50));
+
+        std::string cstart_str = Key(compact_start);
+        std::string cend_str = Key(compact_start + 9);
+        Slice cstart = cstart_str;
+        Slice cend = cend_str;
+        dbfull()->TEST_CompactRange(0, &cstart, &cend);
+      }
+
+      ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+      ASSERT_GT(NumTableFilesAtLevel(1), 0);
+    }
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
+  do {
+    Options options = CurrentOptions();
+    options.compression = kNoCompression;
+    Reopen();
+
+    Random rnd(301);
+    std::string big1 = RandomString(&rnd, 100000);
+    ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000)));
+    ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000)));
+    ASSERT_OK(Put(Key(2), big1));
+    ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000)));
+    ASSERT_OK(Put(Key(4), big1));
+    ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000)));
+    ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000)));
+    ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000)));
+
+    // Check sizes across recovery by reopening a few times
+    for (int run = 0; run < 3; run++) {
+      Reopen(&options);
+
+      ASSERT_TRUE(Between(Size("", Key(0)), 0, 0));
+      ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000));
+      ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000));
+      ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000));
+      ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000));
+      ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000));
+      ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000));
+      ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000));
+      ASSERT_TRUE(Between(Size("", Key(8)), 550000, 560000));
+
+      ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000));
+
+      dbfull()->TEST_CompactRange(0, NULL, NULL);
+    }
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, IteratorPinsRef) {
+  Put("foo", "hello");
+
+  // Get iterator that will yield the current contents of the DB.
+  Iterator* iter = db_->NewIterator(ReadOptions());
+
+  // Write to force compactions
+  Put("foo", "newvalue1");
+  for (int i = 0; i < 100; i++) {
+    ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values
+  }
+  Put("foo", "newvalue2");
+
+  iter->SeekToFirst();
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ("foo", iter->key().ToString());
+  ASSERT_EQ("hello", iter->value().ToString());
+  iter->Next();
+  ASSERT_TRUE(!iter->Valid());
+  delete iter;
+}
+
+TEST(DBTest, Snapshot) {
+  do {
+    Put("foo", "v1");
+    const Snapshot* s1 = db_->GetSnapshot();
+    Put("foo", "v2");
+    const Snapshot* s2 = db_->GetSnapshot();
+    Put("foo", "v3");
+    const Snapshot* s3 = db_->GetSnapshot();
+
+    Put("foo", "v4");
+    ASSERT_EQ("v1", Get("foo", s1));
+    ASSERT_EQ("v2", Get("foo", s2));
+    ASSERT_EQ("v3", Get("foo", s3));
+    ASSERT_EQ("v4", Get("foo"));
+
+    db_->ReleaseSnapshot(s3);
+    ASSERT_EQ("v1", Get("foo", s1));
+    ASSERT_EQ("v2", Get("foo", s2));
+    ASSERT_EQ("v4", Get("foo"));
+
+    db_->ReleaseSnapshot(s1);
+    ASSERT_EQ("v2", Get("foo", s2));
+    ASSERT_EQ("v4", Get("foo"));
+
+    db_->ReleaseSnapshot(s2);
+    ASSERT_EQ("v4", Get("foo"));
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, HiddenValuesAreRemoved) {
+  do {
+    Random rnd(301);
+    FillLevels("a", "z");
+
+    std::string big = RandomString(&rnd, 50000);
+    Put("foo", big);
+    Put("pastfoo", "v");
+    const Snapshot* snapshot = db_->GetSnapshot();
+    Put("foo", "tiny");
+    Put("pastfoo2", "v2");        // Advance sequence number one more
+
+    ASSERT_OK(dbfull()->TEST_CompactMemTable());
+    ASSERT_GT(NumTableFilesAtLevel(0), 0);
+
+    ASSERT_EQ(big, Get("foo", snapshot));
+    ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000));
+    db_->ReleaseSnapshot(snapshot);
+    ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]");
+    Slice x("x");
+    dbfull()->TEST_CompactRange(0, NULL, &x);
+    ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
+    ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+    ASSERT_GE(NumTableFilesAtLevel(1), 1);
+    dbfull()->TEST_CompactRange(1, NULL, &x);
+    ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
+
+    ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, DeletionMarkers1) {
+  Put("foo", "v1");
+  ASSERT_OK(dbfull()->TEST_CompactMemTable());
+  const int last = config::kMaxMemCompactLevel;
+  ASSERT_EQ(NumTableFilesAtLevel(last), 1);   // foo => v1 is now in last level
+
+  // Place a table at level last-1 to prevent merging with preceding mutation
+  Put("a", "begin");
+  Put("z", "end");
+  dbfull()->TEST_CompactMemTable();
+  ASSERT_EQ(NumTableFilesAtLevel(last), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(last-1), 1);
+
+  Delete("foo");
+  Put("foo", "v2");
+  ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
+  ASSERT_OK(dbfull()->TEST_CompactMemTable());  // Moves to level last-2
+  ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
+  Slice z("z");
+  dbfull()->TEST_CompactRange(last-2, NULL, &z);
+  // DEL eliminated, but v1 remains because we aren't compacting that level
+  // (DEL can be eliminated because v2 hides v1).
+  ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]");
+  dbfull()->TEST_CompactRange(last-1, NULL, NULL);
+  // Merging last-1 w/ last, so we are the base level for "foo", so
+  // DEL is removed.  (as is v1).
+  ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]");
+}
+
+TEST(DBTest, DeletionMarkers2) {
+  Put("foo", "v1");
+  ASSERT_OK(dbfull()->TEST_CompactMemTable());
+  const int last = config::kMaxMemCompactLevel;
+  ASSERT_EQ(NumTableFilesAtLevel(last), 1);   // foo => v1 is now in last level
+
+  // Place a table at level last-1 to prevent merging with preceding mutation
+  Put("a", "begin");
+  Put("z", "end");
+  dbfull()->TEST_CompactMemTable();
+  ASSERT_EQ(NumTableFilesAtLevel(last), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(last-1), 1);
+
+  Delete("foo");
+  ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+  ASSERT_OK(dbfull()->TEST_CompactMemTable());  // Moves to level last-2
+  ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+  dbfull()->TEST_CompactRange(last-2, NULL, NULL);
+  // DEL kept: "last" file overlaps
+  ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+  dbfull()->TEST_CompactRange(last-1, NULL, NULL);
+  // Merging last-1 w/ last, so we are the base level for "foo", so
+  // DEL is removed.  (as is v1).
+  ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
+}
+
+TEST(DBTest, OverlapInLevel0) {
+  do {
+    ASSERT_EQ(config::kMaxMemCompactLevel, 2) << "Fix test to match config";
+
+    // Fill levels 1 and 2 to disable the pushing of new memtables to levels > 0.
+    ASSERT_OK(Put("100", "v100"));
+    ASSERT_OK(Put("999", "v999"));
+    dbfull()->TEST_CompactMemTable();
+    ASSERT_OK(Delete("100"));
+    ASSERT_OK(Delete("999"));
+    dbfull()->TEST_CompactMemTable();
+    ASSERT_EQ("0,1,1", FilesPerLevel());
+
+    // Make files spanning the following ranges in level-0:
+    //  files[0]  200 .. 900
+    //  files[1]  300 .. 500
+    // Note that files are sorted by smallest key.
+    ASSERT_OK(Put("300", "v300"));
+    ASSERT_OK(Put("500", "v500"));
+    dbfull()->TEST_CompactMemTable();
+    ASSERT_OK(Put("200", "v200"));
+    ASSERT_OK(Put("600", "v600"));
+    ASSERT_OK(Put("900", "v900"));
+    dbfull()->TEST_CompactMemTable();
+    ASSERT_EQ("2,1,1", FilesPerLevel());
+
+    // Compact away the placeholder files we created initially
+    dbfull()->TEST_CompactRange(1, NULL, NULL);
+    dbfull()->TEST_CompactRange(2, NULL, NULL);
+    ASSERT_EQ("2", FilesPerLevel());
+
+    // Do a memtable compaction.  Before bug-fix, the compaction would
+    // not detect the overlap with level-0 files and would incorrectly place
+    // the deletion in a deeper level.
+    ASSERT_OK(Delete("600"));
+    dbfull()->TEST_CompactMemTable();
+    ASSERT_EQ("3", FilesPerLevel());
+    ASSERT_EQ("NOT_FOUND", Get("600"));
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, L0_CompactionBug_Issue44_a) {
+  Reopen();
+  ASSERT_OK(Put("b", "v"));
+  Reopen();
+  ASSERT_OK(Delete("b"));
+  ASSERT_OK(Delete("a"));
+  Reopen();
+  ASSERT_OK(Delete("a"));
+  Reopen();
+  ASSERT_OK(Put("a", "v"));
+  Reopen();
+  Reopen();
+  ASSERT_EQ("(a->v)", Contents());
+  env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+  ASSERT_EQ("(a->v)", Contents());
+}
+
+TEST(DBTest, L0_CompactionBug_Issue44_b) {
+  Reopen();
+  Put("","");
+  Reopen();
+  Delete("e");
+  Put("","");
+  Reopen();
+  Put("c", "cv");
+  Reopen();
+  Put("","");
+  Reopen();
+  Put("","");
+  env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+  Reopen();
+  Put("d","dv");
+  Reopen();
+  Put("","");
+  Reopen();
+  Delete("d");
+  Delete("b");
+  Reopen();
+  ASSERT_EQ("(->)(c->cv)", Contents());
+  env_->SleepForMicroseconds(1000000);  // Wait for compaction to finish
+  ASSERT_EQ("(->)(c->cv)", Contents());
+}
+
+TEST(DBTest, ComparatorCheck) {
+  class NewComparator : public Comparator {
+   public:
+    virtual const char* Name() const { return "leveldb.NewComparator"; }
+    virtual int Compare(const Slice& a, const Slice& b) const {
+      return BytewiseComparator()->Compare(a, b);
+    }
+    virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
+      BytewiseComparator()->FindShortestSeparator(s, l);
+    }
+    virtual void FindShortSuccessor(std::string* key) const {
+      BytewiseComparator()->FindShortSuccessor(key);
+    }
+  };
+  NewComparator cmp;
+  Options new_options = CurrentOptions();
+  new_options.comparator = &cmp;
+  Status s = TryReopen(&new_options);
+  ASSERT_TRUE(!s.ok());
+  ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
+      << s.ToString();
+}
+
+TEST(DBTest, CustomComparator) {
+  class NumberComparator : public Comparator {
+   public:
+    virtual const char* Name() const { return "test.NumberComparator"; }
+    virtual int Compare(const Slice& a, const Slice& b) const {
+      return ToNumber(a) - ToNumber(b);
+    }
+    virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
+      ToNumber(*s);     // Check format
+      ToNumber(l);      // Check format
+    }
+    virtual void FindShortSuccessor(std::string* key) const {
+      ToNumber(*key);   // Check format
+    }
+   private:
+    static int ToNumber(const Slice& x) {
+      // Check that there are no extra characters.
+      ASSERT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size()-1] == ']')
+          << EscapeString(x);
+      int val;
+      char ignored;
+      ASSERT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1)
+          << EscapeString(x);
+      return val;
+    }
+  };
+  NumberComparator cmp;
+  Options new_options = CurrentOptions();
+  new_options.create_if_missing = true;
+  new_options.comparator = &cmp;
+  new_options.filter_policy = NULL;     // Cannot use bloom filters
+  new_options.write_buffer_size = 1000;  // Compact more often
+  DestroyAndReopen(&new_options);
+  ASSERT_OK(Put("[10]", "ten"));
+  ASSERT_OK(Put("[0x14]", "twenty"));
+  for (int i = 0; i < 2; i++) {
+    ASSERT_EQ("ten", Get("[10]"));
+    ASSERT_EQ("ten", Get("[0xa]"));
+    ASSERT_EQ("twenty", Get("[20]"));
+    ASSERT_EQ("twenty", Get("[0x14]"));
+    ASSERT_EQ("NOT_FOUND", Get("[15]"));
+    ASSERT_EQ("NOT_FOUND", Get("[0xf]"));
+    Compact("[0]", "[9999]");
+  }
+
+  for (int run = 0; run < 2; run++) {
+    for (int i = 0; i < 1000; i++) {
+      char buf[100];
+      snprintf(buf, sizeof(buf), "[%d]", i*10);
+      ASSERT_OK(Put(buf, buf));
+    }
+    Compact("[0]", "[1000000]");
+  }
+}
+
+TEST(DBTest, ManualCompaction) {
+  ASSERT_EQ(config::kMaxMemCompactLevel, 2)
+      << "Need to update this test to match kMaxMemCompactLevel";
+
+  MakeTables(3, "p", "q");
+  ASSERT_EQ("1,1,1", FilesPerLevel());
+
+  // Compaction range falls before files
+  Compact("", "c");
+  ASSERT_EQ("1,1,1", FilesPerLevel());
+
+  // Compaction range falls after files
+  Compact("r", "z");
+  ASSERT_EQ("1,1,1", FilesPerLevel());
+
+  // Compaction range overlaps files
+  Compact("p1", "p9");
+  ASSERT_EQ("0,0,1", FilesPerLevel());
+
+  // Populate a different range
+  MakeTables(3, "c", "e");
+  ASSERT_EQ("1,1,2", FilesPerLevel());
+
+  // Compact just the new range
+  Compact("b", "f");
+  ASSERT_EQ("0,0,2", FilesPerLevel());
+
+  // Compact all
+  MakeTables(1, "a", "z");
+  ASSERT_EQ("0,1,2", FilesPerLevel());
+  db_->CompactRange(NULL, NULL);
+  ASSERT_EQ("0,0,1", FilesPerLevel());
+}
+
+TEST(DBTest, DBOpen_Options) {
+  std::string dbname = test::TmpDir() + "/db_options_test";
+  DestroyDB(dbname, Options());
+
+  // Does not exist, and create_if_missing == false: error
+  DB* db = NULL;
+  Options opts;
+  opts.create_if_missing = false;
+  Status s = DB::Open(opts, dbname, &db);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != NULL);
+  ASSERT_TRUE(db == NULL);
+
+  // Does not exist, and create_if_missing == true: OK
+  opts.create_if_missing = true;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != NULL);
+
+  delete db;
+  db = NULL;
+
+  // Does exist, and error_if_exists == true: error
+  opts.create_if_missing = false;
+  opts.error_if_exists = true;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != NULL);
+  ASSERT_TRUE(db == NULL);
+
+  // Does exist, and error_if_exists == false: OK
+  opts.create_if_missing = true;
+  opts.error_if_exists = false;
+  s = DB::Open(opts, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != NULL);
+
+  delete db;
+  db = NULL;
+}
+
+// Check that number of files does not grow when we are out of space
+TEST(DBTest, NoSpace) {
+  Options options = CurrentOptions();
+  options.env = env_;
+  Reopen(&options);
+
+  ASSERT_OK(Put("foo", "v1"));
+  ASSERT_EQ("v1", Get("foo"));
+  Compact("a", "z");
+  const int num_files = CountFiles();
+  env_->no_space_.Release_Store(env_);   // Force out-of-space errors
+  env_->sleep_counter_.Reset();
+  for (int i = 0; i < 5; i++) {
+    for (int level = 0; level < config::kNumLevels-1; level++) {
+      dbfull()->TEST_CompactRange(level, NULL, NULL);
+    }
+  }
+  env_->no_space_.Release_Store(NULL);
+  ASSERT_LT(CountFiles(), num_files + 3);
+
+  // Check that compaction attempts slept after errors
+  ASSERT_GE(env_->sleep_counter_.Read(), 5);
+}
+
+TEST(DBTest, NonWritableFileSystem) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1000;
+  options.env = env_;
+  Reopen(&options);
+  ASSERT_OK(Put("foo", "v1"));
+  env_->non_writable_.Release_Store(env_);  // Force errors for new files
+  std::string big(100000, 'x');
+  int errors = 0;
+  for (int i = 0; i < 20; i++) {
+    fprintf(stderr, "iter %d; errors %d\n", i, errors);
+    if (!Put("foo", big).ok()) {
+      errors++;
+      env_->SleepForMicroseconds(100000);
+    }
+  }
+  ASSERT_GT(errors, 0);
+  env_->non_writable_.Release_Store(NULL);
+}
+
+TEST(DBTest, FilesDeletedAfterCompaction) {
+  ASSERT_OK(Put("foo", "v2"));
+  Compact("a", "z");
+  const int num_files = CountFiles();
+  for (int i = 0; i < 10; i++) {
+    ASSERT_OK(Put("foo", "v2"));
+    Compact("a", "z");
+  }
+  ASSERT_EQ(CountFiles(), num_files);
+}
+
+TEST(DBTest, BloomFilter) {
+  env_->count_random_reads_ = true;
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.block_cache = NewLRUCache(0);  // Prevent cache hits
+  options.filter_policy = NewBloomFilterPolicy(10);
+  Reopen(&options);
+
+  // Populate multiple layers
+  const int N = 10000;
+  for (int i = 0; i < N; i++) {
+    ASSERT_OK(Put(Key(i), Key(i)));
+  }
+  Compact("a", "z");
+  for (int i = 0; i < N; i += 100) {
+    ASSERT_OK(Put(Key(i), Key(i)));
+  }
+  dbfull()->TEST_CompactMemTable();
+
+  // Prevent auto compactions triggered by seeks
+  env_->delay_sstable_sync_.Release_Store(env_);
+
+  // Lookup present keys.  Should rarely read from small sstable.
+  env_->random_read_counter_.Reset();
+  for (int i = 0; i < N; i++) {
+    ASSERT_EQ(Key(i), Get(Key(i)));
+  }
+  int reads = env_->random_read_counter_.Read();
+  fprintf(stderr, "%d present => %d reads\n", N, reads);
+  ASSERT_GE(reads, N);
+  ASSERT_LE(reads, N + 2*N/100);
+
+  // Lookup present keys.  Should rarely read from either sstable.
+  env_->random_read_counter_.Reset();
+  for (int i = 0; i < N; i++) {
+    ASSERT_EQ("NOT_FOUND", Get(Key(i) + ".missing"));
+  }
+  reads = env_->random_read_counter_.Read();
+  fprintf(stderr, "%d missing => %d reads\n", N, reads);
+  ASSERT_LE(reads, 3*N/100);
+
+  env_->delay_sstable_sync_.Release_Store(NULL);
+  Close();
+  delete options.block_cache;
+  delete options.filter_policy;
+}
+
+// Multi-threaded test:
+namespace {
+
+static const int kNumThreads = 4;
+static const int kTestSeconds = 10;
+static const int kNumKeys = 1000;
+
+struct MTState {
+  DBTest* test;
+  port::AtomicPointer stop;
+  port::AtomicPointer counter[kNumThreads];
+  port::AtomicPointer thread_done[kNumThreads];
+};
+
+struct MTThread {
+  MTState* state;
+  int id;
+};
+
+static void MTThreadBody(void* arg) {
+  MTThread* t = reinterpret_cast<MTThread*>(arg);
+  int id = t->id;
+  DB* db = t->state->test->db_;
+  uintptr_t counter = 0;
+  fprintf(stderr, "... starting thread %d\n", id);
+  Random rnd(1000 + id);
+  std::string value;
+  char valbuf[1500];
+  while (t->state->stop.Acquire_Load() == NULL) {
+    t->state->counter[id].Release_Store(reinterpret_cast<void*>(counter));
+
+    int key = rnd.Uniform(kNumKeys);
+    char keybuf[20];
+    snprintf(keybuf, sizeof(keybuf), "%016d", key);
+
+    if (rnd.OneIn(2)) {
+      // Write values of the form <key, my id, counter>.
+      // We add some padding for force compactions.
+      snprintf(valbuf, sizeof(valbuf), "%d.%d.%-1000d",
+               key, id, static_cast<int>(counter));
+      ASSERT_OK(db->Put(WriteOptions(), Slice(keybuf), Slice(valbuf)));
+    } else {
+      // Read a value and verify that it matches the pattern written above.
+      Status s = db->Get(ReadOptions(), Slice(keybuf), &value);
+      if (s.IsNotFound()) {
+        // Key has not yet been written
+      } else {
+        // Check that the writer thread counter is >= the counter in the value
+        ASSERT_OK(s);
+        int k, w, c;
+        ASSERT_EQ(3, sscanf(value.c_str(), "%d.%d.%d", &k, &w, &c)) << value;
+        ASSERT_EQ(k, key);
+        ASSERT_GE(w, 0);
+        ASSERT_LT(w, kNumThreads);
+        ASSERT_LE(c, reinterpret_cast<uintptr_t>(
+            t->state->counter[w].Acquire_Load()));
+      }
+    }
+    counter++;
+  }
+  t->state->thread_done[id].Release_Store(t);
+  fprintf(stderr, "... stopping thread %d after %d ops\n", id, int(counter));
+}
+
+}  // namespace
+
+TEST(DBTest, MultiThreaded) {
+  do {
+    // Initialize state
+    MTState mt;
+    mt.test = this;
+    mt.stop.Release_Store(0);
+    for (int id = 0; id < kNumThreads; id++) {
+      mt.counter[id].Release_Store(0);
+      mt.thread_done[id].Release_Store(0);
+    }
+
+    // Start threads
+    MTThread thread[kNumThreads];
+    for (int id = 0; id < kNumThreads; id++) {
+      thread[id].state = &mt;
+      thread[id].id = id;
+      env_->StartThread(MTThreadBody, &thread[id]);
+    }
+
+    // Let them run for a while
+    env_->SleepForMicroseconds(kTestSeconds * 1000000);
+
+    // Stop the threads and wait for them to finish
+    mt.stop.Release_Store(&mt);
+    for (int id = 0; id < kNumThreads; id++) {
+      while (mt.thread_done[id].Acquire_Load() == NULL) {
+        env_->SleepForMicroseconds(100000);
+      }
+    }
+  } while (ChangeOptions());
+}
+
+namespace {
+typedef std::map<std::string, std::string> KVMap;
+}
+
+class ModelDB: public DB {
+ public:
+  class ModelSnapshot : public Snapshot {
+   public:
+    KVMap map_;
+  };
+
+  explicit ModelDB(const Options& options): options_(options) { }
+  ~ModelDB() { }
+  virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) {
+    return DB::Put(o, k, v);
+  }
+  virtual Status Delete(const WriteOptions& o, const Slice& key) {
+    return DB::Delete(o, key);
+  }
+  virtual Status Get(const ReadOptions& options,
+                     const Slice& key, std::string* value) {
+    assert(false);      // Not implemented
+    return Status::NotFound(key);
+  }
+  virtual Iterator* NewIterator(const ReadOptions& options) {
+    if (options.snapshot == NULL) {
+      KVMap* saved = new KVMap;
+      *saved = map_;
+      return new ModelIter(saved, true);
+    } else {
+      const KVMap* snapshot_state =
+          &(reinterpret_cast<const ModelSnapshot*>(options.snapshot)->map_);
+      return new ModelIter(snapshot_state, false);
+    }
+  }
+  virtual const Snapshot* GetSnapshot() {
+    ModelSnapshot* snapshot = new ModelSnapshot;
+    snapshot->map_ = map_;
+    return snapshot;
+  }
+
+  virtual void ReleaseSnapshot(const Snapshot* snapshot) {
+    delete reinterpret_cast<const ModelSnapshot*>(snapshot);
+  }
+  virtual Status Write(const WriteOptions& options, WriteBatch* batch) {
+    class Handler : public WriteBatch::Handler {
+     public:
+      KVMap* map_;
+      virtual void Put(const Slice& key, const Slice& value) {
+        (*map_)[key.ToString()] = value.ToString();
+      }
+      virtual void Delete(const Slice& key) {
+        map_->erase(key.ToString());
+      }
+    };
+    Handler handler;
+    handler.map_ = &map_;
+    return batch->Iterate(&handler);
+  }
+
+  virtual bool GetProperty(const Slice& property, std::string* value) {
+    return false;
+  }
+  virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) {
+    for (int i = 0; i < n; i++) {
+      sizes[i] = 0;
+    }
+  }
+  virtual void CompactRange(const Slice* start, const Slice* end) {
+  }
+
+ private:
+  class ModelIter: public Iterator {
+   public:
+    ModelIter(const KVMap* map, bool owned)
+        : map_(map), owned_(owned), iter_(map_->end()) {
+    }
+    ~ModelIter() {
+      if (owned_) delete map_;
+    }
+    virtual bool Valid() const { return iter_ != map_->end(); }
+    virtual void SeekToFirst() { iter_ = map_->begin(); }
+    virtual void SeekToLast() {
+      if (map_->empty()) {
+        iter_ = map_->end();
+      } else {
+        iter_ = map_->find(map_->rbegin()->first);
+      }
+    }
+    virtual void Seek(const Slice& k) {
+      iter_ = map_->lower_bound(k.ToString());
+    }
+    virtual void Next() { ++iter_; }
+    virtual void Prev() { --iter_; }
+    virtual Slice key() const { return iter_->first; }
+    virtual Slice value() const { return iter_->second; }
+    virtual Status status() const { return Status::OK(); }
+   private:
+    const KVMap* const map_;
+    const bool owned_;  // Do we own map_
+    KVMap::const_iterator iter_;
+  };
+  const Options options_;
+  KVMap map_;
+};
+
+static std::string RandomKey(Random* rnd) {
+  int len = (rnd->OneIn(3)
+             ? 1                // Short sometimes to encourage collisions
+             : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10)));
+  return test::RandomKey(rnd, len);
+}
+
+static bool CompareIterators(int step,
+                             DB* model,
+                             DB* db,
+                             const Snapshot* model_snap,
+                             const Snapshot* db_snap) {
+  ReadOptions options;
+  options.snapshot = model_snap;
+  Iterator* miter = model->NewIterator(options);
+  options.snapshot = db_snap;
+  Iterator* dbiter = db->NewIterator(options);
+  bool ok = true;
+  int count = 0;
+  for (miter->SeekToFirst(), dbiter->SeekToFirst();
+       ok && miter->Valid() && dbiter->Valid();
+       miter->Next(), dbiter->Next()) {
+    count++;
+    if (miter->key().compare(dbiter->key()) != 0) {
+      fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n",
+              step,
+              EscapeString(miter->key()).c_str(),
+              EscapeString(dbiter->key()).c_str());
+      ok = false;
+      break;
+    }
+
+    if (miter->value().compare(dbiter->value()) != 0) {
+      fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n",
+              step,
+              EscapeString(miter->key()).c_str(),
+              EscapeString(miter->value()).c_str(),
+              EscapeString(miter->value()).c_str());
+      ok = false;
+    }
+  }
+
+  if (ok) {
+    if (miter->Valid() != dbiter->Valid()) {
+      fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n",
+              step, miter->Valid(), dbiter->Valid());
+      ok = false;
+    }
+  }
+  fprintf(stderr, "%d entries compared: ok=%d\n", count, ok);
+  delete miter;
+  delete dbiter;
+  return ok;
+}
+
+TEST(DBTest, Randomized) {
+  Random rnd(test::RandomSeed());
+  do {
+    ModelDB model(CurrentOptions());
+    const int N = 10000;
+    const Snapshot* model_snap = NULL;
+    const Snapshot* db_snap = NULL;
+    std::string k, v;
+    for (int step = 0; step < N; step++) {
+      if (step % 100 == 0) {
+        fprintf(stderr, "Step %d of %d\n", step, N);
+      }
+      // TODO(sanjay): Test Get() works
+      int p = rnd.Uniform(100);
+      if (p < 45) {                               // Put
+        k = RandomKey(&rnd);
+        v = RandomString(&rnd,
+                         rnd.OneIn(20)
+                         ? 100 + rnd.Uniform(100)
+                         : rnd.Uniform(8));
+        ASSERT_OK(model.Put(WriteOptions(), k, v));
+        ASSERT_OK(db_->Put(WriteOptions(), k, v));
+
+      } else if (p < 90) {                        // Delete
+        k = RandomKey(&rnd);
+        ASSERT_OK(model.Delete(WriteOptions(), k));
+        ASSERT_OK(db_->Delete(WriteOptions(), k));
+
+
+      } else {                                    // Multi-element batch
+        WriteBatch b;
+        const int num = rnd.Uniform(8);
+        for (int i = 0; i < num; i++) {
+          if (i == 0 || !rnd.OneIn(10)) {
+            k = RandomKey(&rnd);
+          } else {
+            // Periodically re-use the same key from the previous iter, so
+            // we have multiple entries in the write batch for the same key
+          }
+          if (rnd.OneIn(2)) {
+            v = RandomString(&rnd, rnd.Uniform(10));
+            b.Put(k, v);
+          } else {
+            b.Delete(k);
+          }
+        }
+        ASSERT_OK(model.Write(WriteOptions(), &b));
+        ASSERT_OK(db_->Write(WriteOptions(), &b));
+      }
+
+      if ((step % 100) == 0) {
+        ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL));
+        ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+        // Save a snapshot from each DB this time that we'll use next
+        // time we compare things, to make sure the current state is
+        // preserved with the snapshot
+        if (model_snap != NULL) model.ReleaseSnapshot(model_snap);
+        if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
+
+        Reopen();
+        ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL));
+
+        model_snap = model.GetSnapshot();
+        db_snap = db_->GetSnapshot();
+      }
+    }
+    if (model_snap != NULL) model.ReleaseSnapshot(model_snap);
+    if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
+  } while (ChangeOptions());
+}
+
+std::string MakeKey(unsigned int num) {
+  char buf[30];
+  snprintf(buf, sizeof(buf), "%016u", num);
+  return std::string(buf);
+}
+
+void BM_LogAndApply(int iters, int num_base_files) {
+  std::string dbname = test::TmpDir() + "/leveldb_test_benchmark";
+  DestroyDB(dbname, Options());
+
+  DB* db = NULL;
+  Options opts;
+  opts.create_if_missing = true;
+  Status s = DB::Open(opts, dbname, &db);
+  ASSERT_OK(s);
+  ASSERT_TRUE(db != NULL);
+
+  delete db;
+  db = NULL;
+
+  Env* env = Env::Default();
+
+  port::Mutex mu;
+  MutexLock l(&mu);
+
+  InternalKeyComparator cmp(BytewiseComparator());
+  Options options;
+  VersionSet vset(dbname, &options, NULL, &cmp);
+  ASSERT_OK(vset.Recover());
+  VersionEdit vbase;
+  uint64_t fnum = 1;
+  for (int i = 0; i < num_base_files; i++) {
+    InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
+    InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
+    vbase.AddFile(2, fnum++, 1 /* file size */, start, limit);
+  }
+  ASSERT_OK(vset.LogAndApply(&vbase, &mu));
+
+  uint64_t start_micros = env->NowMicros();
+
+  for (int i = 0; i < iters; i++) {
+    VersionEdit vedit;
+    vedit.DeleteFile(2, fnum);
+    InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
+    InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);
+    vedit.AddFile(2, fnum++, 1 /* file size */, start, limit);
+    vset.LogAndApply(&vedit, &mu);
+  }
+  uint64_t stop_micros = env->NowMicros();
+  unsigned int us = stop_micros - start_micros;
+  char buf[16];
+  snprintf(buf, sizeof(buf), "%d", num_base_files);
+  fprintf(stderr,
+          "BM_LogAndApply/%-6s   %8d iters : %9u us (%7.0f us / iter)\n",
+          buf, iters, us, ((float)us) / iters);
+}
+
+}  // namespace leveldb
+
+int main(int argc, char** argv) {
+  if (argc > 1 && std::string(argv[1]) == "--benchmark") {
+    leveldb::BM_LogAndApply(1000, 1);
+    leveldb::BM_LogAndApply(1000, 100);
+    leveldb::BM_LogAndApply(1000, 10000);
+    leveldb::BM_LogAndApply(100, 100000);
+    return 0;
+  }
+
+  return leveldb::test::RunAllTests();
+}
diff --git a/src/leveldb/db/dbformat.cc b/src/leveldb/db/dbformat.cc
new file mode 100644
index 0000000000..28e11b398d
--- /dev/null
+++ b/src/leveldb/db/dbformat.cc
@@ -0,0 +1,140 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
+  assert(seq <= kMaxSequenceNumber);
+  assert(t <= kValueTypeForSeek);
+  return (seq << 8) | t;
+}
+
+void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
+  result->append(key.user_key.data(), key.user_key.size());
+  PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
+}
+
+std::string ParsedInternalKey::DebugString() const {
+  char buf[50];
+  snprintf(buf, sizeof(buf), "' @ %llu : %d",
+           (unsigned long long) sequence,
+           int(type));
+  std::string result = "'";
+  result += user_key.ToString();
+  result += buf;
+  return result;
+}
+
+std::string InternalKey::DebugString() const {
+  std::string result;
+  ParsedInternalKey parsed;
+  if (ParseInternalKey(rep_, &parsed)) {
+    result = parsed.DebugString();
+  } else {
+    result = "(bad)";
+    result.append(EscapeString(rep_));
+  }
+  return result;
+}
+
+const char* InternalKeyComparator::Name() const {
+  return "leveldb.InternalKeyComparator";
+}
+
+int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
+  // Order by:
+  //    increasing user key (according to user-supplied comparator)
+  //    decreasing sequence number
+  //    decreasing type (though sequence# should be enough to disambiguate)
+  int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
+  if (r == 0) {
+    const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
+    const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
+    if (anum > bnum) {
+      r = -1;
+    } else if (anum < bnum) {
+      r = +1;
+    }
+  }
+  return r;
+}
+
+void InternalKeyComparator::FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+  // Attempt to shorten the user portion of the key
+  Slice user_start = ExtractUserKey(*start);
+  Slice user_limit = ExtractUserKey(limit);
+  std::string tmp(user_start.data(), user_start.size());
+  user_comparator_->FindShortestSeparator(&tmp, user_limit);
+  if (tmp.size() < user_start.size() &&
+      user_comparator_->Compare(user_start, tmp) < 0) {
+    // User key has become shorter physically, but larger logically.
+    // Tack on the earliest possible number to the shortened user key.
+    PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
+    assert(this->Compare(*start, tmp) < 0);
+    assert(this->Compare(tmp, limit) < 0);
+    start->swap(tmp);
+  }
+}
+
+void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
+  Slice user_key = ExtractUserKey(*key);
+  std::string tmp(user_key.data(), user_key.size());
+  user_comparator_->FindShortSuccessor(&tmp);
+  if (tmp.size() < user_key.size() &&
+      user_comparator_->Compare(user_key, tmp) < 0) {
+    // User key has become shorter physically, but larger logically.
+    // Tack on the earliest possible number to the shortened user key.
+    PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
+    assert(this->Compare(*key, tmp) < 0);
+    key->swap(tmp);
+  }
+}
+
+const char* InternalFilterPolicy::Name() const {
+  return user_policy_->Name();
+}
+
+void InternalFilterPolicy::CreateFilter(const Slice* keys, int n,
+                                        std::string* dst) const {
+  // We rely on the fact that the code in table.cc does not mind us
+  // adjusting keys[].
+  Slice* mkey = const_cast<Slice*>(keys);
+  for (int i = 0; i < n; i++) {
+    mkey[i] = ExtractUserKey(keys[i]);
+    // TODO(sanjay): Suppress dups?
+  }
+  user_policy_->CreateFilter(keys, n, dst);
+}
+
+bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
+  return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
+}
+
+LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
+  size_t usize = user_key.size();
+  size_t needed = usize + 13;  // A conservative estimate
+  char* dst;
+  if (needed <= sizeof(space_)) {
+    dst = space_;
+  } else {
+    dst = new char[needed];
+  }
+  start_ = dst;
+  dst = EncodeVarint32(dst, usize + 8);
+  kstart_ = dst;
+  memcpy(dst, user_key.data(), usize);
+  dst += usize;
+  EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
+  dst += 8;
+  end_ = dst;
+}
+
+}  // namespace leveldb
diff --git a/src/leveldb/db/dbformat.h b/src/leveldb/db/dbformat.h
new file mode 100644
index 0000000000..f7f64dafb6
--- /dev/null
+++ b/src/leveldb/db/dbformat.h
@@ -0,0 +1,227 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_FORMAT_H_
+#define STORAGE_LEVELDB_DB_FORMAT_H_
+
+#include <stdio.h>
+#include "leveldb/comparator.h"
+#include "leveldb/db.h"
+#include "leveldb/filter_policy.h"
+#include "leveldb/slice.h"
+#include "leveldb/table_builder.h"
+#include "util/coding.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+// Grouping of constants.  We may want to make some of these
+// parameters set via options.
+namespace config {
+static const int kNumLevels = 7;
+
+// Level-0 compaction is started when we hit this many files.
+static const int kL0_CompactionTrigger = 4;
+
+// Soft limit on number of level-0 files.  We slow down writes at this point.
+static const int kL0_SlowdownWritesTrigger = 8;
+
+// Maximum number of level-0 files.  We stop writes at this point.
+static const int kL0_StopWritesTrigger = 12;
+
+// Maximum level to which a new compacted memtable is pushed if it
+// does not create overlap.  We try to push to level 2 to avoid the
+// relatively expensive level 0=>1 compactions and to avoid some
+// expensive manifest file operations.  We do not push all the way to
+// the largest level since that can generate a lot of wasted disk
+// space if the same key space is being repeatedly overwritten.
+static const int kMaxMemCompactLevel = 2;
+
+}  // namespace config
+
+class InternalKey;
+
+// Value types encoded as the last component of internal keys.
+// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
+// data structures.
+enum ValueType {
+  kTypeDeletion = 0x0,
+  kTypeValue = 0x1
+};
+// kValueTypeForSeek defines the ValueType that should be passed when
+// constructing a ParsedInternalKey object for seeking to a particular
+// sequence number (since we sort sequence numbers in decreasing order
+// and the value type is embedded as the low 8 bits in the sequence
+// number in internal keys, we need to use the highest-numbered
+// ValueType, not the lowest).
+static const ValueType kValueTypeForSeek = kTypeValue;
+
+typedef uint64_t SequenceNumber;
+
+// We leave eight bits empty at the bottom so a type and sequence#
+// can be packed together into 64-bits.
+static const SequenceNumber kMaxSequenceNumber =
+    ((0x1ull << 56) - 1);
+
+struct ParsedInternalKey {
+  Slice user_key;
+  SequenceNumber sequence;
+  ValueType type;
+
+  ParsedInternalKey() { }  // Intentionally left uninitialized (for speed)
+  ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
+      : user_key(u), sequence(seq), type(t) { }
+  std::string DebugString() const;
+};
+
+// Return the length of the encoding of "key".
+inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
+  return key.user_key.size() + 8;
+}
+
+// Append the serialization of "key" to *result.
+extern void AppendInternalKey(std::string* result,
+                              const ParsedInternalKey& key);
+
+// Attempt to parse an internal key from "internal_key".  On success,
+// stores the parsed data in "*result", and returns true.
+//
+// On error, returns false, leaves "*result" in an undefined state.
+extern bool ParseInternalKey(const Slice& internal_key,
+                             ParsedInternalKey* result);
+
+// Returns the user key portion of an internal key.
+inline Slice ExtractUserKey(const Slice& internal_key) {
+  assert(internal_key.size() >= 8);
+  return Slice(internal_key.data(), internal_key.size() - 8);
+}
+
+inline ValueType ExtractValueType(const Slice& internal_key) {
+  assert(internal_key.size() >= 8);
+  const size_t n = internal_key.size();
+  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  unsigned char c = num & 0xff;
+  return static_cast<ValueType>(c);
+}
+
+// A comparator for internal keys that uses a specified comparator for
+// the user key portion and breaks ties by decreasing sequence number.
+class InternalKeyComparator : public Comparator {
+ private:
+  const Comparator* user_comparator_;
+ public:
+  explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) { }
+  virtual const char* Name() const;
+  virtual int Compare(const Slice& a, const Slice& b) const;
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const;
+  virtual void FindShortSuccessor(std::string* key) const;
+
+  const Comparator* user_comparator() const { return user_comparator_; }
+
+  int Compare(const InternalKey& a, const InternalKey& b) const;
+};
+
+// Filter policy wrapper that converts from internal keys to user keys
+class InternalFilterPolicy : public FilterPolicy {
+ private:
+  const FilterPolicy* const user_policy_;
+ public:
+  explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
+  virtual const char* Name() const;
+  virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const;
+  virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
+};
+
+// Modules in this directory should keep internal keys wrapped inside
+// the following class instead of plain strings so that we do not
+// incorrectly use string comparisons instead of an InternalKeyComparator.
+class InternalKey {
+ private:
+  std::string rep_;
+ public:
+  InternalKey() { }   // Leave rep_ as empty to indicate it is invalid
+  InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
+    AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t));
+  }
+
+  void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
+  Slice Encode() const {
+    assert(!rep_.empty());
+    return rep_;
+  }
+
+  Slice user_key() const { return ExtractUserKey(rep_); }
+
+  void SetFrom(const ParsedInternalKey& p) {
+    rep_.clear();
+    AppendInternalKey(&rep_, p);
+  }
+
+  void Clear() { rep_.clear(); }
+
+  std::string DebugString() const;
+};
+
+inline int InternalKeyComparator::Compare(
+    const InternalKey& a, const InternalKey& b) const {
+  return Compare(a.Encode(), b.Encode());
+}
+
+inline bool ParseInternalKey(const Slice& internal_key,
+                             ParsedInternalKey* result) {
+  const size_t n = internal_key.size();
+  if (n < 8) return false;
+  uint64_t num = DecodeFixed64(internal_key.data() + n - 8);
+  unsigned char c = num & 0xff;
+  result->sequence = num >> 8;
+  result->type = static_cast<ValueType>(c);
+  result->user_key = Slice(internal_key.data(), n - 8);
+  return (c <= static_cast<unsigned char>(kTypeValue));
+}
+
+// A helper class useful for DBImpl::Get()
+class LookupKey {
+ public:
+  // Initialize *this for looking up user_key at a snapshot with
+  // the specified sequence number.
+  LookupKey(const Slice& user_key, SequenceNumber sequence);
+
+  ~LookupKey();
+
+  // Return a key suitable for lookup in a MemTable.
+  Slice memtable_key() const { return Slice(start_, end_ - start_); }
+
+  // Return an internal key (suitable for passing to an internal iterator)
+  Slice internal_key() const { return Slice(kstart_, end_ - kstart_); }
+
+  // Return the user key
+  Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); }
+
+ private:
+  // We construct a char array of the form:
+  //    klength  varint32               <-- start_
+  //    userkey  char[klength]          <-- kstart_
+  //    tag      uint64
+  //                                    <-- end_
+  // The array is a suitable MemTable key.
+  // The suffix starting with "userkey" can be used as an InternalKey.
+  const char* start_;
+  const char* kstart_;
+  const char* end_;
+  char space_[200];      // Avoid allocation for short keys
+
+  // No copying allowed
+  LookupKey(const LookupKey&);
+  void operator=(const LookupKey&);
+};
+
+inline LookupKey::~LookupKey() {
+  if (start_ != space_) delete[] start_;
+}
+
+}  // namespace leveldb
+
+#endif  // STORAGE_LEVELDB_DB_FORMAT_H_
diff --git a/src/leveldb/db/dbformat_test.cc b/src/leveldb/db/dbformat_test.cc
new file mode 100644
index 0000000000..5d82f5d313
--- /dev/null
+++ b/src/leveldb/db/dbformat_test.cc
@@ -0,0 +1,112 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/dbformat.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+static std::string IKey(const std::string& user_key,
+                        uint64_t seq,
+                        ValueType vt) {
+  std::string encoded;
+  AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
+  return encoded;
+}
+
+static std::string Shorten(const std::string& s, const std::string& l) {
+  std::string result = s;
+  InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(&result, l);
+  return result;
+}
+
+static std::string ShortSuccessor(const std::string& s) {
+  std::string result = s;
+  InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&result);
+  return result;
+}
+
+static void TestKey(const std::string& key,
+                    uint64_t seq,
+                    ValueType vt) {
+  std::string encoded = IKey(key, seq, vt);
+
+  Slice in(encoded);
+  ParsedInternalKey decoded("", 0, kTypeValue);
+
+  ASSERT_TRUE(ParseInternalKey(in, &decoded));
+  ASSERT_EQ(key, decoded.user_key.ToString());
+  ASSERT_EQ(seq, decoded.sequence);
+  ASSERT_EQ(vt, decoded.type);
+
+  ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &decoded));
+}
+
+class FormatTest { };
+
+TEST(FormatTest, InternalKey_EncodeDecode) {
+  const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" };
+  const uint64_t seq[] = {
+    1, 2, 3,
+    (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1,
+    (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1,
+    (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1
+  };
+  for (int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
+    for (int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
+      TestKey(keys[k], seq[s], kTypeValue);
+      TestKey("hello", 1, kTypeDeletion);
+    }
+  }
+}
+
+TEST(FormatTest, InternalKeyShortSeparator) {
+  // When user keys are same
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 99, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 101, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 100, kTypeValue)));
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foo", 100, kTypeDeletion)));
+
+  // When user keys are misordered
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("bar", 99, kTypeValue)));
+
+  // When user keys are different, but correctly ordered
+  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("hello", 200, kTypeValue)));
+
+  // When start user key is prefix of limit user key
+  ASSERT_EQ(IKey("foo", 100, kTypeValue),
+            Shorten(IKey("foo", 100, kTypeValue),
+                    IKey("foobar", 200, kTypeValue)));
+
+  // When limit user key is prefix of start user key
+  ASSERT_EQ(IKey("foobar", 100, kTypeValue),
+            Shorten(IKey("foobar", 100, kTypeValue),
+                    IKey("foo", 200, kTypeValue)));
+}
+
+TEST(FormatTest, InternalKeyShortestSuccessor) {
+  ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+            ShortSuccessor(IKey("foo", 100, kTypeValue)));
+  ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
+            ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
+}
+
+}  // namespace leveldb
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/src/leveldb/db/filename.cc b/src/leveldb/db/filename.cc
new file mode 100644
index 0000000000..3c4d49f64e
--- /dev/null
+++ b/src/leveldb/db/filename.cc
@@ -0,0 +1,139 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <ctype.h>
+#include <stdio.h>
+#include "db/filename.h"
+#include "db/dbformat.h"
+#include "leveldb/env.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+// A utility routine: write "data" to the named file and Sync() it.
+extern Status WriteStringToFileSync(Env* env, const Slice& data,
+                                    const std::string& fname);
+
+static std::string MakeFileName(const std::string& name, uint64_t number,
+                                const char* suffix) {
+  char buf[100];
+  snprintf(buf, sizeof(buf), "/%06llu.%s",
+           static_cast<unsigned long long>(number),
+           suffix);
+  return name + buf;
+}
+
+std::string LogFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(name, number, "log");
+}
+
+std::string TableFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(name, number, "sst");
+}
+
+std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
+  assert(number > 0);
+  char buf[100];
+  snprintf(buf, sizeof(buf), "/MANIFEST-%06llu",
+           static_cast<unsigned long long>(number));
+  return dbname + buf;
+}
+
+std::string CurrentFileName(const std::string& dbname) {
+  return dbname + "/CURRENT";
+}
+
+std::string LockFileName(const std::string& dbname) {
+  return dbname + "/LOCK";
+}
+
+std::string TempFileName(const std::string& dbname, uint64_t number) {
+  assert(number > 0);
+  return MakeFileName(dbname, number, "dbtmp");
+}
+
+std::string InfoLogFileName(const std::string& dbname) {
+  return dbname + "/LOG";
+}
+
+// Return the name of the old info log file for "dbname".
+std::string OldInfoLogFileName(const std::string& dbname) {
+  return dbname + "/LOG.old";
+}
+
+
+// Owned filenames have the form:
+//    dbname/CURRENT
+//    dbname/LOCK
+//    dbname/LOG
+//    dbname/LOG.old
+//    dbname/MANIFEST-[0-9]+
+//    dbname/[0-9]+.(log|sst)
+bool ParseFileName(const std::string& fname,
+                   uint64_t* number,
+                   FileType* type) {
+  Slice rest(fname);
+  if (rest == "CURRENT") {
+    *number = 0;
+    *type = kCurrentFile;
+  } else if (rest == "LOCK") {
+    *number = 0;
+    *type = kDBLockFile;
+  } else if (rest == "LOG" || rest == "LOG.old") {
+    *number = 0;
+    *type = kInfoLogFile;
+  } else if (rest.starts_with("MANIFEST-")) {
+    rest.remove_prefix(strlen("MANIFEST-"));
+    uint64_t num;
+    if (!ConsumeDecimalNumber(&rest, &num)) {
+      return false;
+    }
+    if (!rest.empty()) {
+      return false;
+    }
+    *type = kDescriptorFile;
+    *number = num;
+  } else {
+    // Avoid strtoull() to keep filename format independent of the
+    // current locale
+    uint64_t num;
+    if (!ConsumeDecimalNumber(&rest, &num)) {
+      return false;
+    }
+    Slice suffix = rest;
+    if (suffix == Slice(".log")) {
+      *type = kLogFile;
+    } else if (suffix == Slice(".sst")) {
+      *type = kTableFile;
+    } else if (suffix == Slice(".dbtmp")) {
+      *type = kTempFile;
+    } else {
+      return false;
+    }
+    *number = num;
+  }
+  return true;
+}
+
+Status SetCurrentFile(Env* env, const std::string& dbname,
+                      uint64_t descriptor_number) {
+  // Remove leading "dbname/" and add newline to manifest file name
+  std::string manifest = DescriptorFileName(dbname, descriptor_number);
+  Slice contents = manifest;
+  assert(contents.starts_with(dbname + "/"));
+  contents.remove_prefix(dbname.size() + 1);
+  std::string tmp = TempFileName(dbname, descriptor_number);
+  Status s = WriteStringToFileSync(env, contents.ToString() + "\n", tmp);
+  if (s.ok()) {
+    s = env->RenameFile(tmp, CurrentFileName(dbname));
+  }
+  if (!s.ok()) {
+    env->DeleteFile(tmp);
+  }
+  return s;
+}
+
+}  // namespace leveldb
diff --git a/src/leveldb/db/filename.h b/src/leveldb/db/filename.h
new file mode 100644
index 0000000000..d5d09b1146
--- /dev/null
+++ b/src/leveldb/db/filename.h
@@ -0,0 +1,80 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// File names used by DB code
+
+#ifndef STORAGE_LEVELDB_DB_FILENAME_H_
+#define STORAGE_LEVELDB_DB_FILENAME_H_
+
+#include <stdint.h>
+#include <string>
+#include "leveldb/slice.h"
+#include "leveldb/status.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+class Env;
+
+enum FileType {
+  kLogFile,
+  kDBLockFile,
+  kTableFile,
+  kDescriptorFile,
+  kCurrentFile,
+  kTempFile,
+  kInfoLogFile  // Either the current one, or an old one
+};
+
+// Return the name of the log file with the specified number
+// in the db named by "dbname".  The result will be prefixed with
+// "dbname".
+extern std::string LogFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the sstable with the specified number
+// in the db named by "dbname".  The result will be prefixed with
+// "dbname".
+extern std::string TableFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the descriptor file for the db named by
+// "dbname" and the specified incarnation number.  The result will be
+// prefixed with "dbname".
+extern std::string DescriptorFileName(const std::string& dbname,
+                                      uint64_t number);
+
+// Return the name of the current file.  This file contains the name
+// of the current manifest file.  The result will be prefixed with
+// "dbname".
+extern std::string CurrentFileName(const std::string& dbname);
+
+// Return the name of the lock file for the db named by
+// "dbname".  The result will be prefixed with "dbname".
+extern std::string LockFileName(const std::string& dbname);
+
+// Return the name of a temporary file owned by the db named "dbname".
+// The result will be prefixed with "dbname".
+extern std::string TempFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the info log file for "dbname".
+extern std::string InfoLogFileName(const std::string& dbname);
+
+// Return the name of the old info log file for "dbname".
+extern std::string OldInfoLogFileName(const std::string& dbname);
+
+// If filename is a leveldb file, store the type of the file in *type.
+// The number encoded in the filename is stored in *number.  If the
+// filename was successfully parsed, returns true.  Else return false.
+extern bool ParseFileName(const std::string& filename,
+                          uint64_t* number,
+                          FileType* type);
+
+// Make the CURRENT file point to the descriptor file with the
+// specified number.
+extern Status SetCurrentFile(Env* env, const std::string& dbname,
+                             uint64_t descriptor_number);
+
+
+}  // namespace leveldb
+
+#endif  // STORAGE_LEVELDB_DB_FILENAME_H_
diff --git a/src/leveldb/db/filename_test.cc b/src/leveldb/db/filename_test.cc
new file mode 100644
index 0000000000..47353d6c9a
--- /dev/null
+++ b/src/leveldb/db/filename_test.cc
@@ -0,0 +1,122 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/filename.h"
+
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+class FileNameTest { };
+
+TEST(FileNameTest, Parse) {
+  Slice db;
+  FileType type;
+  uint64_t number;
+
+  // Successful parses
+  static struct {
+    const char* fname;
+    uint64_t number;
+    FileType type;
+  } cases[] = {
+    { "100.log",            100,   kLogFile },
+    { "0.log",              0,     kLogFile },
+    { "0.sst",              0,     kTableFile },
+    { "CURRENT",            0,     kCurrentFile },
+    { "LOCK",               0,     kDBLockFile },
+    { "MANIFEST-2",         2,     kDescriptorFile },
+    { "MANIFEST-7",         7,     kDescriptorFile },
+    { "LOG",                0,     kInfoLogFile },
+    { "LOG.old",            0,     kInfoLogFile },
+    { "18446744073709551615.log", 18446744073709551615ull, kLogFile },
+  };
+  for (int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+    std::string f = cases[i].fname;
+    ASSERT_TRUE(ParseFileName(f, &number, &type)) << f;
+    ASSERT_EQ(cases[i].type, type) << f;
+    ASSERT_EQ(cases[i].number, number) << f;
+  }
+
+  // Errors
+  static const char* errors[] = {
+    "",
+    "foo",
+    "foo-dx-100.log",
+    ".log",
+    "",
+    "manifest",
+    "CURREN",
+    "CURRENTX",
+    "MANIFES",
+    "MANIFEST",
+    "MANIFEST-",
+    "XMANIFEST-3",
+    "MANIFEST-3x",
+    "LOC",
+    "LOCKx",
+    "LO",
+    "LOGx",
+    "18446744073709551616.log",
+    "184467440737095516150.log",
+    "100",
+    "100.",
+    "100.lop"
+  };
+  for (int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
+    std::string f = errors[i];
+    ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
+  };
+}
+
+TEST(FileNameTest, Construction) {
+  uint64_t number;
+  FileType type;
+  std::string fname;
+
+  fname = CurrentFileName("foo");
+  ASSERT_EQ("foo/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(0, number);
+  ASSERT_EQ(kCurrentFile, type);
+
+  fname = LockFileName("foo");
+  ASSERT_EQ("foo/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(0, number);
+  ASSERT_EQ(kDBLockFile, type);
+
+  fname = LogFileName("foo", 192);
+  ASSERT_EQ("foo/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(192, number);
+  ASSERT_EQ(kLogFile, type);
+
+  fname = TableFileName("bar", 200);
+  ASSERT_EQ("bar/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(200, number);
+  ASSERT_EQ(kTableFile, type);
+
+  fname = DescriptorFileName("bar", 100);
+  ASSERT_EQ("bar/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(100, number);
+  ASSERT_EQ(kDescriptorFile, type);
+
+  fname = TempFileName("tmp", 999);
+  ASSERT_EQ("tmp/", std::string(fname.data(), 4));
+  ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+  ASSERT_EQ(999, number);
+  ASSERT_EQ(kTempFile, type);
+}
+
+}  // namespace leveldb
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/src/leveldb/db/log_format.h b/src/leveldb/db/log_format.h
new file mode 100644
index 0000000000..2690cb9789
--- /dev/null
+++ b/src/leveldb/db/log_format.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Log format information shared by reader and writer.
+// See ../doc/log_format.txt for more detail.
+
+#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_
+#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_
+
+namespace leveldb {
+namespace log {
+
+enum RecordType {
+  // Zero is reserved for preallocated files
+  kZeroType = 0,
+
+  kFullType = 1,
+
+  // For fragments
+  kFirstType = 2,
+  kMiddleType = 3,
+  kLastType = 4
+};
+static const int kMaxRecordType = kLastType;
+
+static const int kBlockSize = 32768;
+
+// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
+static const int kHeaderSize = 4 + 1 + 2;
+
+}  // namespace log
+}  // namespace leveldb
+
+#endif  // STORAGE_LEVELDB_DB_LOG_FORMAT_H_
diff --git a/src/leveldb/db/log_reader.cc b/src/leveldb/db/log_reader.cc
new file mode 100644
index 0000000000..b35f115aad
--- /dev/null
+++ b/src/leveldb/db/log_reader.cc
@@ -0,0 +1,259 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+
+#include <stdio.h>
+#include "leveldb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace leveldb {
+namespace log {
+
+Reader::Reporter::~Reporter() {
+}
+
+Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum,
+               uint64_t initial_offset)
+    : file_(file),
+      reporter_(reporter),
+      checksum_(checksum),
+      backing_store_(new char[kBlockSize]),
+      buffer_(),
+      eof_(false),
+      last_record_offset_(0),
+      end_of_buffer_offset_(0),
+      initial_offset_(initial_offset) {
+}
+
+Reader::~Reader() {
+  delete[] backing_store_;
+}
+
+bool Reader::SkipToInitialBlock() {
+  size_t offset_in_block = initial_offset_ % kBlockSize;
+  uint64_t block_start_location = initial_offset_ - offset_in_block;
+
+  // Don't search a block if we'd be in the trailer
+  if (offset_in_block > kBlockSize - 6) {
+    offset_in_block = 0;
+    block_start_location += kBlockSize;
+  }
+
+  end_of_buffer_offset_ = block_start_location;
+
+  // Skip to start of first block that can contain the initial record
+  if (block_start_location > 0) {
+    Status skip_status = file_->Skip(block_start_location);
+    if (!skip_status.ok()) {
+      ReportDrop(block_start_location, skip_status);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool Reader::ReadRecord(Slice* record, std::string* scratch) {
+  if (last_record_offset_ < initial_offset_) {
+    if (!SkipToInitialBlock()) {
+      return false;
+    }
+  }
+
+  scratch->clear();
+  record->clear();
+  bool in_fragmented_record = false;
+  // Record offset of the logical record that we're reading
+  // 0 is a dummy value to make compilers happy
+  uint64_t prospective_record_offset = 0;
+
+  Slice fragment;
+  while (true) {
+    uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
+    const unsigned int record_type = ReadPhysicalRecord(&fragment);
+    switch (record_type) {
+      case kFullType:
+        if (in_fragmented_record) {
+          // Handle bug in earlier versions of log::Writer where
+          // it could emit an empty kFirstType record at the tail end
+          // of a block followed by a kFullType or kFirstType record
+          // at the beginning of the next block.
+          if (scratch->empty()) {
+            in_fragmented_record = false;
+          } else {
+            ReportCorruption(scratch->size(), "partial record without end(1)");
+          }
+        }
+        prospective_record_offset = physical_record_offset;
+        scratch->clear();
+        *record = fragment;
+        last_record_offset_ = prospective_record_offset;
+        return true;
+
+      case kFirstType:
+        if (in_fragmented_record) {
+          // Handle bug in earlier versions of log::Writer where
+          // it could emit an empty kFirstType record at the tail end
+          // of a block followed by a kFullType or kFirstType record
+          // at the beginning of the next block.
+          if (scratch->empty()) {
+            in_fragmented_record = false;
+          } else {
+            ReportCorruption(scratch->size(), "partial record without end(2)");
+          }
+        }
+        prospective_record_offset = physical_record_offset;
+        scratch->assign(fragment.data(), fragment.size());
+        in_fragmented_record = true;
+        break;
+
+      case kMiddleType:
+        if (!in_fragmented_record) {
+          ReportCorruption(fragment.size(),
+                           "missing start of fragmented record(1)");
+        } else {
+          scratch->append(fragment.data(), fragment.size());
+        }
+        break;
+
+      case kLastType:
+        if (!in_fragmented_record) {
+          ReportCorruption(fragment.size(),
+                           "missing start of fragmented record(2)");
+        } else {
+          scratch->append(fragment.data(), fragment.size());
+          *record = Slice(*scratch);
+          last_record_offset_ = prospective_record_offset;
+          return true;
+        }
+        break;
+
+      case kEof:
+        if (in_fragmented_record) {
+          ReportCorruption(scratch->size(), "partial record without end(3)");
+          scratch->clear();
+        }
+        return false;
+
+      case kBadRecord:
+        if (in_fragmented_record) {
+          ReportCorruption(scratch->size(), "error in middle of record");
+          in_fragmented_record = false;
+          scratch->clear();
+        }
+        break;
+
+      default: {
+        char buf[40];
+        snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
+        ReportCorruption(
+            (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
+            buf);
+        in_fragmented_record = false;
+        scratch->clear();
+        break;
+      }
+    }
+  }
+  return false;
+}
+
+uint64_t Reader::LastRecordOffset() {
+  return last_record_offset_;
+}
+
+void Reader::ReportCorruption(size_t bytes, const char* reason) {
+  ReportDrop(bytes, Status::Corruption(reason));
+}
+
+void Reader::ReportDrop(size_t bytes, const Status& reason) {
+  if (reporter_ != NULL &&
+      end_of_buffer_offset_ - buffer_.size() - bytes >= initial_offset_) {
+    reporter_->Corruption(bytes, reason);
+  }
+}
+
+unsigned int Reader::ReadPhysicalRecord(Slice* result) {
+  while (true) {
+    if (buffer_.size() < kHeaderSize) {
+      if (!eof_) {
+        // Last read was a full read, so this is a trailer to skip
+        buffer_.clear();
+        Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
+        end_of_buffer_offset_ += buffer_.size();
+        if (!status.ok()) {
+          buffer_.clear();
+          ReportDrop(kBlockSize, status);
+          eof_ = true;
+          return kEof;
+        } else if (buffer_.size() < kBlockSize) {
+          eof_ = true;
+        }
+        continue;
+      } else if (buffer_.size() == 0) {
+        // End of file
+        return kEof;
+      } else {
+        size_t drop_size = buffer_.size();
+        buffer_.clear();
+        ReportCorruption(drop_size, "truncated record at end of file");
+        return kEof;
+      }
+    }
+
+    // Parse the header
+    const char* header = buffer_.data();
+    const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+    const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+    const unsigned int type = header[6];
+    const uint32_t length = a | (b << 8);
+    if (kHeaderSize + length > buffer_.size()) {
+      size_t drop_size = buffer_.size();
+      buffer_.clear();
+      ReportCorruption(drop_size, "bad record length");
+      return kBadRecord;
+    }
+
+    if (type == kZeroType && length == 0) {
+      // Skip zero length record without reporting any drops since
+      // such records are produced by the mmap based writing code in
+      // env_posix.cc that preallocates file regions.
+      buffer_.clear();
+      return kBadRecord;
+    }
+
+    // Check crc
+    if (checksum_) {
+      uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+      uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
+      if (actual_crc != expected_crc) {
+        // Drop the rest of the buffer since "length" itself may have
+        // been corrupted and if we trust it, we could find some
+        // fragment of a real log record that just happens to look
+        // like a valid log record.
+        size_t drop_size = buffer_.size();
+        buffer_.clear();
+        ReportCorruption(drop_size, "checksum mismatch");
+        return kBadRecord;
+      }
+    }
+
+    buffer_.remove_prefix(kHeaderSize + length);
+
+    // Skip physical record that started before initial_offset_
+    if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
+        initial_offset_) {
+      result->clear();
+      return kBadRecord;
+    }
+
+    *result = Slice(header + kHeaderSize, length);
+    return type;
+  }
+}
+
+}  // namespace log
+}  // namespace leveldb
diff --git a/src/leveldb/db/log_reader.h b/src/leveldb/db/log_reader.h
new file mode 100644
index 0000000000..82d4bee68d
--- /dev/null
+++ b/src/leveldb/db/log_reader.h
@@ -0,0 +1,108 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_LOG_READER_H_
+#define STORAGE_LEVELDB_DB_LOG_READER_H_
+
+#include <stdint.h>
+
+#include "db/log_format.h"
+#include "leveldb/slice.h"
+#include "leveldb/status.h"
+
+namespace leveldb {
+
+class SequentialFile;
+
+namespace log {
+
+class Reader {
+ public:
+  // Interface for reporting errors.
+  class Reporter {
+   public:
+    virtual ~Reporter();
+
+    // Some corruption was detected.  "size" is the approximate number
+    // of bytes dropped due to the corruption.
+    virtual void Corruption(size_t bytes, const Status& status) = 0;
+  };
+
+  // Create a reader that will return log records from "*file".
+  // "*file" must remain live while this Reader is in use.
+  //
+  // If "reporter" is non-NULL, it is notified whenever some data is
+  // dropped due to a detected corruption.  "*reporter" must remain
+  // live while this Reader is in use.
+  //
+  // If "checksum" is true, verify checksums if available.
+  //
+  // The Reader will start reading at the first record located at physical
+  // position >= initial_offset within the file.
+  Reader(SequentialFile* file, Reporter* reporter, bool checksum,
+         uint64_t initial_offset);
+
+  ~Reader();
+
+  // Read the next record into *record.  Returns true if read
+  // successfully, false if we hit end of the input.  May use
+  // "*scratch" as temporary storage.  The contents filled in *record
+  // will only be valid until the next mutating operation on this
+  // reader or the next mutation to *scratch.
+  bool ReadRecord(Slice* record, std::string* scratch);
+
+  // Returns the physical offset of the last record returned by ReadRecord.
+  //
+  // Undefined before the first call to ReadRecord.
+  uint64_t LastRecordOffset();
+
+ private:
+  SequentialFile* const file_;
+  Reporter* const reporter_;
+  bool const checksum_;
+  char* const backing_store_;
+  Slice buffer_;
+  bool eof_;   // Last Read() indicated EOF by returning < kBlockSize
+
+  // Offset of the last record returned by ReadRecord.
+  uint64_t last_record_offset_;
+  // Offset of the first location past the end of buffer_.
+  uint64_t end_of_buffer_offset_;
+
+  // Offset at which to start looking for the first record to return
+  uint64_t const initial_offset_;
+
+  // Extend record types with the following special values
+  enum {
+    kEof = kMaxRecordType + 1,
+    // Returned whenever we find an invalid physical record.
+    // Currently there are three situations in which this happens:
+    // * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
+    // * The record is a 0-length record (No drop is reported)
+    // * The record is below constructor's initial_offset (No drop is reported)
+    kBadRecord = kMaxRecordType + 2
+  };
+
+  // Skips all blocks that are completely before "initial_offset_".
+  //
+  // Returns true on success. Handles reporting.
+  bool SkipToInitialBlock();
+
+  // Return type, or one of the preceding special values
+  unsigned int ReadPhysicalRecord(Slice* result);
+
+  // Reports dropped bytes to the reporter.
+  // buffer_ must be updated to remove the dropped bytes prior to invocation.
+  void ReportCorruption(size_t bytes, const char* reason);
+  void ReportDrop(size_t bytes, const Status& reason);
+
+  // No copying allowed
+  Reader(const Reader&);
+  void operator=(const Reader&);
+};
+
+}  // namespace log
+}  // namespace leveldb
+
+#endif  // STORAGE_LEVELDB_DB_LOG_READER_H_
diff --git a/src/leveldb/db/log_test.cc b/src/leveldb/db/log_test.cc
new file mode 100644
index 0000000000..4c5cf87573
--- /dev/null
+++ b/src/leveldb/db/log_test.cc
@@ -0,0 +1,500 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "leveldb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+namespace log {
+
+// Construct a string of the specified length made out of the supplied
+// partial string.
+static std::string BigString(const std::string& partial_string, size_t n) {
+  std::string result;
+  while (result.size() < n) {
+    result.append(partial_string);
+  }
+  result.resize(n);
+  return result;
+}
+
+// Construct a string from a number
+static std::string NumberString(int n) {
+  char buf[50];
+  snprintf(buf, sizeof(buf), "%d.", n);
+  return std::string(buf);
+}
+
+// Return a skewed potentially long string
+static std::string RandomSkewedString(int i, Random* rnd) {
+  return BigString(NumberString(i), rnd->Skewed(17));
+}
+
+class LogTest {
+ private:
+  class StringDest : public WritableFile {
+   public:
+    std::string contents_;
+
+    virtual Status Close() { return Status::OK(); }
+    virtual Status Flush() { return Status::OK(); }
+    virtual Status Sync() { return Status::OK(); }
+    virtual Status Append(const Slice& slice) {
+      contents_.append(slice.data(), slice.size());
+      return Status::OK();
+    }
+  };
+
+  class StringSource : public SequentialFile {
+   public:
+    Slice contents_;
+    bool force_error_;
+    bool returned_partial_;
+    StringSource() : force_error_(false), returned_partial_(false) { }
+
+    virtual Status Read(size_t n, Slice* result, char* scratch) {
+      ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
+
+      if (force_error_) {
+        force_error_ = false;
+        returned_partial_ = true;
+        return Status::Corruption("read error");
+      }
+
+      if (contents_.size() < n) {
+        n = contents_.size();
+        returned_partial_ = true;
+      }
+      *result = Slice(contents_.data(), n);
+      contents_.remove_prefix(n);
+      return Status::OK();
+    }
+
+    virtual Status Skip(uint64_t n) {
+      if (n > contents_.size()) {
+        contents_.clear();
+        return Status::NotFound("in-memory file skipepd past end");
+      }
+
+      contents_.remove_prefix(n);
+
+      return Status::OK();
+    }
+  };
+
+  class ReportCollector : public Reader::Reporter {
+   public:
+    size_t dropped_bytes_;
+    std::string message_;
+
+    ReportCollector() : dropped_bytes_(0) { }
+    virtual void Corruption(size_t bytes, const Status& status) {
+      dropped_bytes_ += bytes;
+      message_.append(status.ToString());
+    }
+  };
+
+  StringDest dest_;
+  StringSource source_;
+  ReportCollector report_;
+  bool reading_;
+  Writer writer_;
+  Reader reader_;
+
+  // Record metadata for testing initial offset functionality
+  static size_t initial_offset_record_sizes_[];
+  static uint64_t initial_offset_last_record_offsets_[];
+
+ public:
+  LogTest() : reading_(false),
+              writer_(&dest_),
+              reader_(&source_, &report_, true/*checksum*/,
+                      0/*initial_offset*/) {
+  }
+
+  void Write(const std::string& msg) {
+    ASSERT_TRUE(!reading_) << "Write() after starting to read";
+    writer_.AddRecord(Slice(msg));
+  }
+
+  size_t WrittenBytes() const {
+    return dest_.contents_.size();
+  }
+
+  std::string Read() {
+    if (!reading_) {
+      reading_ = true;
+      source_.contents_ = Slice(dest_.contents_);
+    }
+    std::string scratch;
+    Slice record;
+    if (reader_.ReadRecord(&record, &scratch)) {
+      return record.ToString();
+    } else {
+      return "EOF";
+    }
+  }
+
+  void IncrementByte(int offset, int delta) {
+    dest_.contents_[offset] += delta;
+  }
+
+  void SetByte(int offset, char new_byte) {
+    dest_.contents_[offset] = new_byte;
+  }
+
+  void ShrinkSize(int bytes) {
+    dest_.contents_.resize(dest_.contents_.size() - bytes);
+  }
+
+  void FixChecksum(int header_offset, int len) {
+    // Compute crc of type/len/data
+    uint32_t crc = crc32c::Value(&dest_.contents_[header_offset+6], 1 + len);
+    crc = crc32c::Mask(crc);
+    EncodeFixed32(&dest_.contents_[header_offset], crc);
+  }
+
+  void ForceError() {
+    source_.force_error_ = true;
+  }
+
+  size_t DroppedBytes() const {
+    return report_.dropped_bytes_;
+  }
+
+  std::string ReportMessage() const {
+    return report_.message_;
+  }
+
+  // Returns OK iff recorded error message contains "msg"
+  std::string MatchError(const std::string& msg) const {
+    if (report_.message_.find(msg) == std::string::npos) {
+      return report_.message_;
+    } else {
+      return "OK";
+    }
+  }
+
+  void WriteInitialOffsetLog() {
+    for (int i = 0; i < 4; i++) {
+      std::string record(initial_offset_record_sizes_[i],
+                         static_cast<char>('a' + i));
+      Write(record);
+    }
+  }
+
+  void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) {
+    WriteInitialOffsetLog();
+    reading_ = true;
+    source_.contents_ = Slice(dest_.contents_);
+    Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/,
+                                       WrittenBytes() + offset_past_end);
+    Slice record;
+    std::string scratch;
+    ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch));
+    delete offset_reader;
+  }
+
+  void CheckInitialOffsetRecord(uint64_t initial_offset,
+                                int expected_record_offset) {
+    WriteInitialOffsetLog();
+    reading_ = true;
+    source_.contents_ = Slice(dest_.contents_);
+    Reader* offset_reader = new Reader(&source_, &report_, true/*checksum*/,
+                                       initial_offset);
+    Slice record;
+    std::string scratch;
+    ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
+    ASSERT_EQ(initial_offset_record_sizes_[expected_record_offset],
+              record.size());
+    ASSERT_EQ(initial_offset_last_record_offsets_[expected_record_offset],
+              offset_reader->LastRecordOffset());
+    ASSERT_EQ((char)('a' + expected_record_offset), record.data()[0]);
+    delete offset_reader;
+  }
+
+};
+
+size_t LogTest::initial_offset_record_sizes_[] =
+    {10000,  // Two sizable records in first block
+     10000,
+     2 * log::kBlockSize - 1000,  // Span three blocks
+     1};
+
+uint64_t LogTest::initial_offset_last_record_offsets_[] =
+    {0,
+     kHeaderSize + 10000,
+     2 * (kHeaderSize + 10000),
+     2 * (kHeaderSize + 10000) +
+         (2 * log::kBlockSize - 1000) + 3 * kHeaderSize};
+
+
+TEST(LogTest, Empty) {
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, ReadWrite) {
+  Write("foo");
+  Write("bar");
+  Write("");
+  Write("xxxx");
+  ASSERT_EQ("foo", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("xxxx", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ("EOF", Read());  // Make sure reads at eof work
+}
+
+TEST(LogTest, ManyBlocks) {
+  for (int i = 0; i < 100000; i++) {
+    Write(NumberString(i));
+  }
+  for (int i = 0; i < 100000; i++) {
+    ASSERT_EQ(NumberString(i), Read());
+  }
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, Fragmentation) {
+  Write("small");
+  Write(BigString("medium", 50000));
+  Write(BigString("large", 100000));
+  ASSERT_EQ("small", Read());
+  ASSERT_EQ(BigString("medium", 50000), Read());
+  ASSERT_EQ(BigString("large", 100000), Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, MarginalTrailer) {
+  // Make a trailer that is exactly the same length as an empty record.
+  const int n = kBlockSize - 2*kHeaderSize;
+  Write(BigString("foo", n));
+  ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes());
+  Write("");
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, MarginalTrailer2) {
+  // Make a trailer that is exactly the same length as an empty record.
+  const int n = kBlockSize - 2*kHeaderSize;
+  Write(BigString("foo", n));
+  ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes());
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(0, DroppedBytes());
+  ASSERT_EQ("", ReportMessage());
+}
+
+TEST(LogTest, ShortTrailer) {
+  const int n = kBlockSize - 2*kHeaderSize + 4;
+  Write(BigString("foo", n));
+  ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());
+  Write("");
+  Write("bar");
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("", Read());
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, AlignedEof) {
+  const int n = kBlockSize - 2*kHeaderSize + 4;
+  Write(BigString("foo", n));
+  ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());
+  ASSERT_EQ(BigString("foo", n), Read());
+  ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, RandomRead) {
+  const int N = 500;
+  Random write_rnd(301);
+  for (int i = 0; i < N; i++) {
+    Write(RandomSkewedString(i, &write_rnd));
+  }
+  Random read_rnd(301);
+  for (int i = 0; i < N; i++) {
+    ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
+  }
+  ASSERT_EQ("EOF", Read());
+}
+
+// Tests of all the error paths in log_reader.cc follow:
+
+TEST(LogTest, ReadError) {
+  Write("foo");
+  ForceError();
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(kBlockSize, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("read error"));
+}
+
+TEST(LogTest, BadRecordType) {
+  Write("foo");
+  // Type is stored in header[6]
+  IncrementByte(6, 100);
+  FixChecksum(0, 3);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("unknown record type"));
+}
+
+TEST(LogTest, TruncatedTrailingRecord) {
+  Write("foo");
+  ShrinkSize(4);   // Drop all payload as well as a header byte
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(kHeaderSize - 1, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("truncated record at end of file"));
+}
+
+TEST(LogTest, BadLength) {
+  Write("foo");
+  ShrinkSize(1);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(kHeaderSize + 2, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("bad record length"));
+}
+
+TEST(LogTest, ChecksumMismatch) {
+  Write("foo");
+  IncrementByte(0, 10);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(10, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("checksum mismatch"));
+}
+
+TEST(LogTest, UnexpectedMiddleType) {
+  Write("foo");
+  SetByte(6, kMiddleType);
+  FixChecksum(0, 3);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST(LogTest, UnexpectedLastType) {
+  Write("foo");
+  SetByte(6, kLastType);
+  FixChecksum(0, 3);
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST(LogTest, UnexpectedFullType) {
+  Write("foo");
+  Write("bar");
+  SetByte(6, kFirstType);
+  FixChecksum(0, 3);
+  ASSERT_EQ("bar", Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST(LogTest, UnexpectedFirstType) {
+  Write("foo");
+  Write(BigString("bar", 100000));
+  SetByte(6, kFirstType);
+  FixChecksum(0, 3);
+  ASSERT_EQ(BigString("bar", 100000), Read());
+  ASSERT_EQ("EOF", Read());
+  ASSERT_EQ(3, DroppedBytes());
+  ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST(LogTest, ErrorJoinsRecords) {
+  // Consider two fragmented records:
+  //    first(R1) last(R1) first(R2) last(R2)
+  // where the middle two fragments disappear.  We do not want
+  // first(R1),last(R2) to get joined and returned as a valid record.
+
+  // Write records that span two blocks
+  Write(BigString("foo", kBlockSize));
+  Write(BigString("bar", kBlockSize));
+  Write("correct");
+
+  // Wipe the middle block
+  for (int offset = kBlockSize; offset < 2*kBlockSize; offset++) {
+    SetByte(offset, 'x');
+  }
+
+  ASSERT_EQ("correct", Read());
+  ASSERT_EQ("EOF", Read());
+  const int dropped = DroppedBytes();
+  ASSERT_LE(dropped, 2*kBlockSize + 100);
+  ASSERT_GE(dropped, 2*kBlockSize);
+}
+
+TEST(LogTest, ReadStart) {
+  CheckInitialOffsetRecord(0, 0);
+}
+
+TEST(LogTest, ReadSecondOneOff) {
+  CheckInitialOffsetRecord(1, 1);
+}
+
+TEST(LogTest, ReadSecondTenThousand) {
+  CheckInitialOffsetRecord(10000, 1);
+}
+
+TEST(LogTest, ReadSecondStart) {
+  CheckInitialOffsetRecord(10007, 1);
+}
+
+TEST(LogTest, ReadThirdOneOff) {
+  CheckInitialOffsetRecord(10008, 2);
+}
+
+TEST(LogTest, ReadThirdStart) {
+  CheckInitialOffsetRecord(20014, 2);
+}
+
+TEST(LogTest, ReadFourthOneOff) {
+  CheckInitialOffsetRecord(20015, 3);
+}
+
+TEST(LogTest, ReadFourthFirstBlockTrailer) {
+  CheckInitialOffsetRecord(log::kBlockSize - 4, 3);
+}
+
+TEST(LogTest, ReadFourthMiddleBlock) {
+  CheckInitialOffsetRecord(log::kBlockSize + 1, 3);
+}
+
+TEST(LogTest, ReadFourthLastBlock) {
+  CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3);
+}
+
+TEST(LogTest, ReadFourthStart) {
+  CheckInitialOffsetRecord(
+      2 * (kHeaderSize + 1000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
+      3);
+}
+
+TEST(LogTest, ReadEnd) {
+  CheckOffsetPastEndReturnsNoRecords(0);
+}
+
+TEST(LogTest, ReadPastEnd) {
+  CheckOffsetPastEndReturnsNoRecords(5);
+}
+
+}  // namespace log
+}  // namespace leveldb
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/src/leveldb/db/log_writer.cc b/src/leveldb/db/log_writer.cc
new file mode 100644
index 0000000000..2da99ac088
--- /dev/null
+++ b/src/leveldb/db/log_writer.cc
@@ -0,0 +1,103 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_writer.h"
+
+#include <stdint.h>
+#include "leveldb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace leveldb {
+namespace log {
+
+Writer::Writer(WritableFile* dest)
+    : dest_(dest),
+      block_offset_(0) {
+  for (int i = 0; i <= kMaxRecordType; i++) {
+    char t = static_cast<char>(i);
+    type_crc_[i] = crc32c::Value(&t, 1);
+  }
+}
+
+Writer::~Writer() {
+}
+
+Status Writer::AddRecord(const Slice& slice) {
+  const char* ptr = slice.data();
+  size_t left = slice.size();
+
+  // Fragment the record if necessary and emit it.  Note that if slice
+  // is empty, we still want to iterate once to emit a single
+  // zero-length record
+  Status s;
+  bool begin = true;
+  do {
+    const int leftover = kBlockSize - block_offset_;
+    assert(leftover >= 0);
+    if (leftover < kHeaderSize) {
+      // Switch to a new block
+      if (leftover > 0) {
+        // Fill the trailer (literal below relies on kHeaderSize being 7)
+        assert(kHeaderSize == 7);
+        dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover));
+      }
+      block_offset_ = 0;
+    }
+
+    // Invariant: we never leave < kHeaderSize bytes in a block.
+    assert(kBlockSize - block_offset_ - kHeaderSize >= 0);
+
+    const size_t avail = kBlockSize - block_offset_ - kHeaderSize;
+    const size_t fragment_length = (left < avail) ? left : avail;
+
+    RecordType type;
+    const bool end = (left == fragment_length);
+    if (begin && end) {
+      type = kFullType;
+    } else if (begin) {
+      type = kFirstType;
+    } else if (end) {
+      type = kLastType;
+    } else {
+      type = kMiddleType;
+    }
+
+    s = EmitPhysicalRecord(type, ptr, fragment_length);
+    ptr += fragment_length;
+    left -= fragment_length;
+    begin = false;
+  } while (s.ok() && left > 0);
+  return s;
+}
+
+Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
+  assert(n <= 0xffff);  // Must fit in two bytes
+  assert(block_offset_ + kHeaderSize + n <= kBlockSize);
+
+  // Format the header
+  char buf[kHeaderSize];
+  buf[4] = static_cast<char>(n & 0xff);
+  buf[5] = static_cast<char>(n >> 8);
+  buf[6] = static_cast<char>(t);
+
+  // Compute the crc of the record type and the payload.
+  uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);
+  crc = crc32c::Mask(crc);                 // Adjust for storage
+  EncodeFixed32(buf, crc);
+
+  // Write the header and the payload
+  Status s = dest_->Append(Slice(buf, kHeaderSize));
+  if (s.ok()) {
+    s = dest_->Append(Slice(ptr, n));
+    if (s.ok()) {
+      s = dest_->Flush();
+    }
+  }
+  block_offset_ += kHeaderSize + n;
+  return s;
+}
+
+}  // namespace log
+}  // namespace leveldb
diff --git a/src/leveldb/db/log_writer.h b/src/leveldb/db/log_writer.h
new file mode 100644
index 0000000000..a3a954d967
--- /dev/null
+++ b/src/leveldb/db/log_writer.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_LOG_WRITER_H_
+#define STORAGE_LEVELDB_DB_LOG_WRITER_H_
+
+#include <stdint.h>
+#include "db/log_format.h"
+#include "leveldb/slice.h"
+#include "leveldb/status.h"
+
+namespace leveldb {
+
+class WritableFile;
+
+namespace log {
+
+class Writer {
+ public:
+  // Create a writer that will append data to "*dest".
+  // "*dest" must be initially empty.
+  // "*dest" must remain live while this Writer is in use.
+  explicit Writer(WritableFile* dest);
+  ~Writer();
+
+  Status AddRecord(const Slice& slice);
+
+ private:
+  WritableFile* dest_;
+  int block_offset_;       // Current offset in block
+
+  // crc32c values for all supported record types.  These are
+  // pre-computed to reduce the overhead of computing the crc of the
+  // record type stored in the header.
+  uint32_t type_crc_[kMaxRecordType + 1];
+
+  Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);
+
+  // No copying allowed
+  Writer(const Writer&);
+  void operator=(const Writer&);
+};
+
+}  // namespace log
+}  // namespace leveldb
+
+#endif  // STORAGE_LEVELDB_DB_LOG_WRITER_H_
diff --git a/src/leveldb/db/memtable.cc b/src/leveldb/db/memtable.cc
new file mode 100644
index 0000000000..bfec0a7e7a
--- /dev/null
+++ b/src/leveldb/db/memtable.cc
@@ -0,0 +1,145 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/memtable.h"
+#include "db/dbformat.h"
+#include "leveldb/comparator.h"
+#include "leveldb/env.h"
+#include "leveldb/iterator.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+static Slice GetLengthPrefixedSlice(const char* data) {
+  uint32_t len;
+  const char* p = data;
+  p = GetVarint32Ptr(p, p + 5, &len);  // +5: we assume "p" is not corrupted
+  return Slice(p, len);
+}
+
+MemTable::MemTable(const InternalKeyComparator& cmp)
+    : comparator_(cmp),
+      refs_(0),
+      table_(comparator_, &arena_) {
+}
+
+MemTable::~MemTable() {
+  assert(refs_ == 0);
+}
+
+size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); }
+
+int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr)
+    const {
+  // Internal keys are encoded as length-prefixed strings.
+  Slice a = GetLengthPrefixedSlice(aptr);
+  Slice b = GetLengthPrefixedSlice(bptr);
+  return comparator.Compare(a, b);
+}
+
+// Encode a suitable internal key target for "target" and return it.
+// Uses *scratch as scratch space, and the returned pointer will point
+// into this scratch space.
+static const char* EncodeKey(std::string* scratch, const Slice& target) {
+  scratch->clear();
+  PutVarint32(scratch, target.size());
+  scratch->append(target.data(), target.size());
+  return scratch->data();
+}
+
+class MemTableIterator: public Iterator {
+ public:
+  explicit MemTableIterator(MemTable::Table* table) : iter_(table) { }
+
+  virtual bool Valid() const { return iter_.Valid(); }
+  virtual void Seek(const Slice& k) { iter_.Seek(EncodeKey(&tmp_, k)); }
+  virtual void SeekToFirst() { iter_.SeekToFirst(); }
+  virtual void SeekToLast() { iter_.SeekToLast(); }
+  virtual void Next() { iter_.Next(); }
+  virtual void Prev() { iter_.Prev(); }
+  virtual Slice key() const { return GetLengthPrefixedSlice(iter_.key()); }
+  virtual Slice value() const {
+    Slice key_slice = GetLengthPrefixedSlice(iter_.key());
+    return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
+  }
+
+  virtual Status status() const { return Status::OK(); }
+
+ private:
+  MemTable::Table::Iterator iter_;
+  std::string tmp_;       // For passing to EncodeKey
+
+  // No copying allowed
+  MemTableIterator(const MemTableIterator&);
+  void operator=(const MemTableIterator&);
+};
+
+Iterator* MemTable::NewIterator() {
+  return new MemTableIterator(&table_);
+}
+
+void MemTable::Add(SequenceNumber s, ValueType type,
+                   const Slice& key,
+                   const Slice& value) {
+  // Format of an entry is concatenation of:
+  //  key_size     : varint32 of internal_key.size()
+  //  key bytes    : char[internal_key.size()]
+  //  value_size   : varint32 of value.size()
+  //  value bytes  : char[value.size()]
+  size_t key_size = key.size();
+  size_t val_size = value.size();
+  size_t internal_key_size = key_size + 8;
+  const size_t encoded_len =
+      VarintLength(internal_key_size) + internal_key_size +
+      VarintLength(val_size) + val_size;
+  char* buf = arena_.Allocate(encoded_len);
+  char* p = EncodeVarint32(buf, internal_key_size);
+  memcpy(p, key.data(), key_size);
+  p += key_size;
+  EncodeFixed64(p, (s << 8) | type);
+  p += 8;
+  p = EncodeVarint32(p, val_size);
+  memcpy(p, value.data(), val_size);
+  assert((p + val_size) - buf == encoded_len);
+  table_.Insert(buf);
+}
+
+bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) {
+  Slice memkey = key.memtable_key();
+  Table::Iterator iter(&table_);
+  iter.Seek(memkey.data());
+  if (iter.Valid()) {
+    // entry format is:
+    //    klength  varint32
+    //    userkey  char[klength]
+    //    tag      uint64
+    //    vlength  varint32
+    //    value    char[vlength]
+    // Check that it belongs to same user key.  We do not check the
+    // sequence number since the Seek() call above should have skipped
+    // all entries with overly large sequence numbers.
+    const char* entry = iter.key();
+    uint32_t key_length;
+    const char* key_ptr = GetVarint32Ptr(entry, entry+5, &key_length);
+    if (comparator_.comparator.user_comparator()->Compare(
+            Slice(key_ptr, key_length - 8),
+            key.user_key()) == 0) {
+      // Correct user key
+      const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+      switch (static_cast<ValueType>(tag & 0xff)) {
+        case kTypeValue: {
+          Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+          value->assign(v.data(), v.size());
+          return true;
+        }
+        case kTypeDeletion:
+          *s = Status::NotFound(Slice());
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
+}  // namespace leveldb
diff --git a/src/leveldb/db/memtable.h b/src/leveldb/db/memtable.h
new file mode 100644
index 0000000000..92e90bb099
--- /dev/null
+++ b/src/leveldb/db/memtable.h
@@ -0,0 +1,91 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_MEMTABLE_H_
+#define STORAGE_LEVELDB_DB_MEMTABLE_H_
+
+#include <string>
+#include "leveldb/db.h"
+#include "db/dbformat.h"
+#include "db/skiplist.h"
+#include "util/arena.h"
+
+namespace leveldb {
+
+class InternalKeyComparator;
+class Mutex;
+class MemTableIterator;
+
+class MemTable {
+ public:
+  // MemTables are reference counted.  The initial reference count
+  // is zero and the caller must call Ref() at least once.
+  explicit MemTable(const InternalKeyComparator& comparator);
+
+  // Increase reference count.
+  void Ref() { ++refs_; }
+
+  // Drop reference count.  Delete if no more references exist.
+  void Unref() {
+    --refs_;
+    assert(refs_ >= 0);
+    if (refs_ <= 0) {
+      delete this;
+    }
+  }
+
+  // Returns an estimate of the number of bytes of data in use by this
+  // data structure.
+  //
+  // REQUIRES: external synchronization to prevent simultaneous
+  // operations on the same MemTable.
+  size_t ApproximateMemoryUsage();
+
+  // Return an iterator that yields the contents of the memtable.
+  //
+  // The caller must ensure that the underlying MemTable remains live
+  // while the returned iterator is live.  The keys returned by this
+  // iterator are internal keys encoded by AppendInternalKey in the
+  // db/format.{h,cc} module.
+  Iterator* NewIterator();
+
+  // Add an entry into memtable that maps key to value at the
+  // specified sequence number and with the specified type.
+  // Typically value will be empty if type==kTypeDeletion.
+  void Add(SequenceNumber seq, ValueType type,
+           const Slice& key,
+           const Slice& value);
+
+  // If memtable contains a value for key, store it in *value and return true.
+  // If memtable contains a deletion for key, store a NotFound() error
+  // in *status and return true.
+  // Else, return false.
+  bool Get(const LookupKey& key, std::string* value, Status* s);
+
+ private:
+  ~MemTable();  // Private since only Unref() should be used to delete it
+
+  struct KeyComparator {
+    const InternalKeyComparator comparator;
+    explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
+    int operator()(const char* a, const char* b) const;
+  };
+  friend class MemTableIterator;
+  friend class MemTableBackwardIterator;
+
+  typedef SkipList<const char*, KeyComparator> Table;
+
+  KeyComparator comparator_;
+  int refs_;
+  Arena arena_;
+  Table table_;
+
+  // No copying allowed
+  MemTable(const MemTable&);
+  void operator=(const MemTable&);
+};
+
+}  // namespace leveldb
+
+#endif  // STORAGE_LEVELDB_DB_MEMTABLE_H_
diff --git a/src/leveldb/db/repair.cc b/src/leveldb/db/repair.cc
new file mode 100644
index 0000000000..022d52f3de
--- /dev/null
+++ b/src/leveldb/db/repair.cc
@@ -0,0 +1,389 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// We recover the contents of the descriptor from the other files we find.
+// (1) Any log files are first converted to tables
+// (2) We scan every table to compute
+//     (a) smallest/largest for the table
+//     (b) largest sequence number in the table
+// (3) We generate descriptor contents:
+//      - log number is set to zero
+//      - next-file-number is set to 1 + largest file number we found
+//      - last-sequence-number is set to largest sequence# found across
+//        all tables (see 2c)
+//      - compaction pointers are cleared
+//      - every table file is added at level 0
+//
+// Possible optimization 1:
+//   (a) Compute total size and use to pick appropriate max-level M
+//   (b) Sort tables by largest sequence# in the table
+//   (c) For each table: if it overlaps earlier table, place in level-0,
+//       else place in level-M.
+// Possible optimization 2:
+//   Store per-table metadata (smallest, largest, largest-seq#, ...)
+//   in the table's meta section to speed up ScanTable.
+
+#include "db/builder.h"
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/comparator.h"
+#include "leveldb/db.h"
+#include "leveldb/env.h"
+
+namespace leveldb {
+
+namespace {
+
+class Repairer {
+ public:
+  Repairer(const std::string& dbname, const Options& options)
+      : dbname_(dbname),
+        env_(options.env),
+        icmp_(options.comparator),
+        ipolicy_(options.filter_policy),
+        options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)),
+        owns_info_log_(options_.info_log != options.info_log),
+        owns_cache_(options_.block_cache != options.block_cache),
+        next_file_number_(1) {
+    // TableCache can be small since we expect each table to be opened once.
+    table_cache_ = new TableCache(dbname_, &options_, 10);
+  }
+
+  ~Repairer() {
+    delete table_cache_;
+    if (owns_info_log_) {
+      delete options_.info_log;
+    }
+    if (owns_cache_) {
+      delete options_.block_cache;
+    }
+  }
+
+  Status Run() {
+    Status status = FindFiles();
+    if (status.ok()) {
+      ConvertLogFilesToTables();
+      ExtractMetaData();
+      status = WriteDescriptor();
+    }
+    if (status.ok()) {
+      unsigned long long bytes = 0;
+      for (size_t i = 0; i < tables_.size(); i++) {
+        bytes += tables_[i].meta.file_size;
+      }
+      Log(options_.info_log,
+          "**** Repaired leveldb %s; "
+          "recovered %d files; %llu bytes. "
+          "Some data may have been lost. "
+          "****",
+          dbname_.c_str(),
+          static_cast<int>(tables_.size()),
+          bytes);
+    }
+    return status;
+  }
+
+ private:
+  struct TableInfo {
+    FileMetaData meta;
+    SequenceNumber max_sequence;
+  };
+
+  std::string const dbname_;
+  Env* const env_;
+  InternalKeyComparator const icmp_;
+  InternalFilterPolicy const ipolicy_;
+  Options const options_;
+  bool owns_info_log_;
+  bool owns_cache_;
+  TableCache* table_cache_;
+  VersionEdit edit_;
+
+  std::vector<std::string> manifests_;
+  std::vector<uint64_t> table_numbers_;
+  std::vector<uint64_t> logs_;
+  std::vector<TableInfo> tables_;
+  uint64_t next_file_number_;
+
+  Status FindFiles() {
+    std::vector<std::string> filenames;
+    Status status = env_->GetChildren(dbname_, &filenames);
+    if (!status.ok()) {
+      return status;
+    }
+    if (filenames.empty()) {
+      return Status::IOError(dbname_, "repair found no files");
+    }
+
+    uint64_t number;
+    FileType type;
+    for (size_t i = 0; i < filenames.size(); i++) {
+      if (ParseFileName(filenames[i], &number, &type)) {
+        if (type == kDescriptorFile) {
+          manifests_.push_back(filenames[i]);
+        } else {
+          if (number + 1 > next_file_number_) {
+            next_file_number_ = number + 1;
+          }
+          if (type == kLogFile) {
+            logs_.push_back(number);
+          } else if (type == kTableFile) {
+            table_numbers_.push_back(number);
+          } else {
+            // Ignore other files
+          }
+        }
+      }
+    }
+    return status;
+  }
+
+  void ConvertLogFilesToTables() {
+    for (size_t i = 0; i < logs_.size(); i++) {
+      std::string logname = LogFileName(dbname_, logs_[i]);
+      Status status = ConvertLogToTable(logs_[i]);
+      if (!status.ok()) {
+        Log(options_.info_log, "Log #%llu: ignoring conversion error: %s",
+            (unsigned long long) logs_[i],
+            status.ToString().c_str());
+      }
+      ArchiveFile(logname);
+    }
+  }
+
+  Status ConvertLogToTable(uint64_t log) {
+    struct LogReporter : public log::Reader::Reporter {
+      Env* env;
+      Logger* info_log;
+      uint64_t lognum;
+      virtual void Corruption(size_t bytes, const Status& s) {
+        // We print error messages for corruption, but continue repairing.
+        Log(info_log, "Log #%llu: dropping %d bytes; %s",
+            (unsigned long long) lognum,
+            static_cast<int>(bytes),
+            s.ToString().c_str());
+      }
+    };
+
+    // Open the log file
+    std::string logname = LogFileName(dbname_, log);
+    SequentialFile* lfile;
+    Status status = env_->NewSequentialFile(logname, &lfile);
+    if (!status.ok()) {
+      return status;
+    }
+
+    // Create the log reader.
+    LogReporter reporter;
+    reporter.env = env_;
+    reporter.info_log = options_.info_log;
+    reporter.lognum = log;
+    // We intentially make log::Reader do checksumming so that
+    // corruptions cause entire commits to be skipped instead of
+    // propagating bad information (like overly large sequence
+    // numbers).
+    log::Reader reader(lfile, &reporter, false/*do not checksum*/,
+                       0/*initial_offset*/);
+
+    // Read all the records and add to a memtable
+    std::string scratch;
+    Slice record;
+    WriteBatch batch;
+    MemTable* mem = new MemTable(icmp_);
+    mem->Ref();
+    int counter = 0;
+    while (reader.ReadRecord(&record, &scratch)) {
+      if (record.size() < 12) {
+        reporter.Corruption(
+            record.size(), Status::Corruption("log record too small"));
+        continue;
+      }
+      WriteBatchInternal::SetContents(&batch, record);
+      status = WriteBatchInternal::InsertInto(&batch, mem);
+      if (status.ok()) {
+        counter += WriteBatchInternal::Count(&batch);
+      } else {
+        Log(options_.info_log, "Log #%llu: ignoring %s",
+            (unsigned long long) log,
+            status.ToString().c_str());
+        status = Status::OK();  // Keep going with rest of file
+      }
+    }
+    delete lfile;
+
+    // Do not record a version edit for this conversion to a Table
+    // since ExtractMetaData() will also generate edits.
+    FileMetaData meta;
+    meta.number = next_file_number_++;
+    Iterator* iter = mem->NewIterator();
+    status = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta);
+    delete iter;
+    mem->Unref();
+    mem = NULL;
+    if (status.ok()) {
+      if (meta.file_size > 0) {
+        table_numbers_.push_back(meta.number);
+      }
+    }
+    Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
+        (unsigned long long) log,
+        counter,
+        (unsigned long long) meta.number,
+        status.ToString().c_str());
+    return status;
+  }
+
+  void ExtractMetaData() {
+    std::vector<TableInfo> kept;
+    for (size_t i = 0; i < table_numbers_.size(); i++) {
+      TableInfo t;
+      t.meta.number = table_numbers_[i];
+      Status status = ScanTable(&t);
+      if (!status.ok()) {
+        std::string fname = TableFileName(dbname_, table_numbers_[i]);
+        Log(options_.info_log, "Table #%llu: ignoring %s",
+            (unsigned long long) table_numbers_[i],
+            status.ToString().c_str());
+        ArchiveFile(fname);
+      } else {
+        tables_.push_back(t);
+      }
+    }
+  }
+
+  Status ScanTable(TableInfo* t) {
+    std::string fname = TableFileName(dbname_, t->meta.number);
+    int counter = 0;
+    Status status = env_->GetFileSize(fname, &t->meta.file_size);
+    if (status.ok()) {
+      Iterator* iter = table_cache_->NewIterator(
+          ReadOptions(), t->meta.number, t->meta.file_size);
+      bool empty = true;
+      ParsedInternalKey parsed;
+      t->max_sequence = 0;
+      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+        Slice key = iter->key();
+        if (!ParseInternalKey(key, &parsed)) {
+          Log(options_.info_log, "Table #%llu: unparsable key %s",
+              (unsigned long long) t->meta.number,
+              EscapeString(key).c_str());
+          continue;
+        }
+
+        counter++;
+        if (empty) {
+          empty = false;
+          t->meta.smallest.DecodeFrom(key);
+        }
+        t->meta.largest.DecodeFrom(key);
+        if (parsed.sequence > t->max_sequence) {
+          t->max_sequence = parsed.sequence;
+        }
+      }
+      if (!iter->status().ok()) {
+        status = iter->status();
+      }
+      delete iter;
+    }
+    Log(options_.info_log, "Table #%llu: %d entries %s",
+        (unsigned long long) t->meta.number,
+        counter,
+        status.ToString().c_str());
+    return status;
+  }
+
+  Status WriteDescriptor() {
+    std::string tmp = TempFileName(dbname_, 1);
+    WritableFile* file;
+    Status status = env_->NewWritableFile(tmp, &file);
+    if (!status.ok()) {
+      return status;
+    }
+
+    SequenceNumber max_sequence = 0;
+    for (size_t i = 0; i < tables_.size(); i++) {
+      if (max_sequence < tables_[i].max_sequence) {
+        max_sequence = tables_[i].max_sequence;
+      }
+    }
+
+    edit_.SetComparatorName(icmp_.user_comparator()->Name());
+    edit_.SetLogNumber(0);
+    edit_.SetNextFile(next_file_number_);
+    edit_.SetLastSequence(max_sequence);
+
+    for (size_t i = 0; i < tables_.size(); i++) {
+      // TODO(opt): separate out into multiple levels
+      const TableInfo& t = tables_[i];
+      edit_.AddFile(0, t.meta.number, t.meta.file_size,
+                    t.meta.smallest, t.meta.largest);
+    }
+
+    //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
+    {
+      log::Writer log(file);
+      std::string record;
+      edit_.EncodeTo(&record);
+      status = log.AddRecord(record);
+    }
+    if (status.ok()) {
+      status = file->Close();
+    }
+    delete file;
+    file = NULL;
+
+    if (!status.ok()) {
+      env_->DeleteFile(tmp);
+    } else {
+      // Discard older manifests
+      for (size_t i = 0; i < manifests_.size(); i++) {
+        ArchiveFile(dbname_ + "/" + manifests_[i]);
+      }
+
+      // Install new manifest
+      status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
+      if (status.ok()) {
+        status = SetCurrentFile(env_, dbname_, 1);
+      } else {
+        env_->DeleteFile(tmp);
+      }
+    }
+    return status;
+  }
+
+  void ArchiveFile(const std::string& fname) {
+    // Move into another directory.  E.g., for
+    //    dir/foo
+    // rename to
+    //    dir/lost/foo
+    const char* slash = strrchr(fname.c_str(), '/');
+    std::string new_dir;
+    if (slash != NULL) {
+      new_dir.assign(fname.data(), slash - fname.data());
+    }
+    new_dir.append("/lost");
+    env_->CreateDir(new_dir);  // Ignore error
+    std::string new_file = new_dir;
+    new_file.append("/");
+    new_file.append((slash == NULL) ? fname.c_str() : slash + 1);
+    Status s = env_->RenameFile(fname, new_file);
+    Log(options_.info_log, "Archiving %s: %s\n",
+        fname.c_str(), s.ToString().c_str());
+  }
+};
+}  // namespace
+
+Status RepairDB(const std::string& dbname, const Options& options) {
+  Repairer repairer(dbname, options);
+  return repairer.Run();
+}
+
+}  // namespace leveldb
diff --git a/src/leveldb/db/skiplist.h b/src/leveldb/db/skiplist.h
new file mode 100644
index 0000000000..af85be6d01
--- /dev/null
+++ b/src/leveldb/db/skiplist.h
@@ -0,0 +1,379 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread safety
+// -------------
+//
+// Writes require external synchronization, most likely a mutex.
+// Reads require a guarantee that the SkipList will not be destroyed
+// while the read is in progress.  Apart from that, reads progress
+// without any internal locking or synchronization.
+//
+// Invariants:
+//
+// (1) Allocated nodes are never deleted until the SkipList is
+// destroyed.  This is trivially guaranteed by the code since we
+// never delete any skip list nodes.
+//
+// (2) The contents of a Node except for the next/prev pointers are
+// immutable after the Node has been linked into the SkipList.
+// Only Insert() modifies the list, and it is careful to initialize
+// a node and use release-stores to publish the nodes in one or
+// more lists.
+//
+// ... prev vs. next pointer ordering ...
+
+#include <assert.h>
+#include <stdlib.h>
+#include "port/port.h"
+#include "util/arena.h"
+#include "util/random.h"
+
+namespace leveldb {
+
+class Arena;
+
+template<typename Key, class Comparator>
+class SkipList {
+ private:
+  struct Node;
+
+ public:
+  // Create a new SkipList object that will use "cmp" for comparing keys,
+  // and will allocate memory using "*arena".  Objects allocated in the arena
+  // must remain allocated for the lifetime of the skiplist object.
+  explicit SkipList(Comparator cmp, Arena* arena);
+
+  // Insert key into the list.
+  // REQUIRES: nothing that compares equal to key is currently in the list.
+  void Insert(const Key& key);
+
+  // Returns true iff an entry that compares equal to key is in the list.
+  bool Contains(const Key& key) const;
+
+  // Iteration over the contents of a skip list
+  class Iterator {
+   public:
+    // Initialize an iterator over the specified list.
+    // The returned iterator is not valid.
+    explicit Iterator(const SkipList* list);
+
+    // Returns true iff the iterator is positioned at a valid node.
+    bool Valid() const;
+
+    // Returns the key at the current position.
+    // REQUIRES: Valid()
+    const Key& key() const;
+
+    // Advances to the next position.
+    // REQUIRES: Valid()
+    void Next();
+
+    // Advances to the previous position.
+    // REQUIRES: Valid()
+    void Prev();
+
+    // Advance to the first entry with a key >= target
+    void Seek(const Key& target);
+
+    // Position at the first entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToFirst();
+
+    // Position at the last entry in list.
+    // Final state of iterator is Valid() iff list is not empty.
+    void SeekToLast();
+
+   private:
+    const SkipList* list_;
+    Node* node_;
+    // Intentionally copyable
+  };
+
+ private:
+  enum { kMaxHeight = 12 };
+
+  // Immutable after construction
+  Comparator const compare_;
+  Arena* const arena_;    // Arena used for allocations of nodes
+
+  Node* const head_;
+
+  // Modified only by Insert().  Read racily by readers, but stale
+  // values are ok.
+  port::AtomicPointer max_height_;   // Height of the entire list
+
+  inline int GetMaxHeight() const {
+    return static_cast<int>(
+        reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load()));
+  }
+
+  // Read/written only by Insert().
+  Random rnd_;
+
+  Node* NewNode(const Key& key, int height);
+  int RandomHeight();
+  bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
+
+  // Return true if key is greater than the data stored in "n"
+  bool KeyIsAfterNode(const Key& key, Node* n) const;
+
+  // Return the earliest node that comes at or after key.
+  // Return NULL if there is no such node.
+  //
+  // If prev is non-NULL, fills prev[level] with pointer to previous
+  // node at "level" for every level in [0..max_height_-1].
+  Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
+
+  // Return the latest node with a key < key.
+  // Return head_ if there is no such node.
+  Node* FindLessThan(const Key& key) const;
+
+  // Return the last node in the list.
+  // Return head_ if list is empty.
+  Node* FindLast() const;
+
+  // No copying allowed
+  SkipList(const SkipList&);
+  void operator=(const SkipList&);
+};
+
+// Implementation details follow
+template<typename Key, class Comparator>
+struct SkipList<Key,Comparator>::Node {
+  explicit Node(const Key& k) : key(k) { }
+
+  Key const key;
+
+  // Accessors/mutators for links.  Wrapped in methods so we can
+  // add the appropriate barriers as necessary.
+  Node* Next(int n) {
+    assert(n >= 0);
+    // Use an 'acquire load' so that we observe a fully initialized
+    // version of the returned Node.
+    return reinterpret_cast<Node*>(next_[n].Acquire_Load());
+  }
+  void SetNext(int n, Node* x) {
+    assert(n >= 0);
+    // Use a 'release store' so that anybody who reads through this
+    // pointer observes a fully initialized version of the inserted node.
+    next_[n].Release_Store(x);
+  }
+
+  // No-barrier variants that can be safely used in a few locations.
+  Node* NoBarrier_Next(int n) {
+    assert(n >= 0);
+    return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
+  }
+  void NoBarrier_SetNext(int n, Node* x) {
+    assert(n >= 0);
+    next_[n].NoBarrier_Store(x);
+  }
+
+ private:
+  // Array of length equal to the node height.  next_[0] is lowest level link.
+  port::AtomicPointer next_[1];
+};
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node*
+SkipList<Key,Comparator>::NewNode(const Key& key, int height) {
+  char* mem = arena_->AllocateAligned(
+      sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1));
+  return new (mem) Node(key);
+}
+
+template<typename Key, class Comparator>
+inline SkipList<Key,Comparator>::Iterator::Iterator(const SkipList* list) {
+  list_ = list;
+  node_ = NULL;
+}
+
+template<typename Key, class Comparator>
+inline bool SkipList<Key,Comparator>::Iterator::Valid() const {
+  return node_ != NULL;
+}
+
+template<typename Key, class Comparator>
+inline const Key& SkipList<Key,Comparator>::Iterator::key() const {
+  assert(Valid());
+  return node_->key;
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Next() {
+  assert(Valid());
+  node_ = node_->Next(0);
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Prev() {
+  // Instead of using explicit "prev" links, we just search for the
+  // last node that falls before key.
+  assert(Valid());
+  node_ = list_->FindLessThan(node_->key);
+  if (node_ == list_->head_) {
+    node_ = NULL;
+  }
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Seek(const Key& target) {
+  node_ = list_->FindGreaterOrEqual(target, NULL);
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::SeekToFirst() {
+  node_ = list_->head_->Next(0);
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::SeekToLast() {
+  node_ = list_->FindLast();
+  if (node_ == list_->head_) {
+    node_ = NULL;
+  }
+}
+
+template<typename Key, class Comparator>
+int SkipList<Key,Comparator>::RandomHeight() {
+  // Increase height with probability 1 in kBranching
+  static const unsigned int kBranching = 4;
+  int height = 1;
+  while (height < kMaxHeight && ((rnd_.Next() % kBranching) == 0)) {
+    height++;
+  }
+  assert(height > 0);
+  assert(height <= kMaxHeight);
+  return height;
+}
+
+template<typename Key, class Comparator>
+bool SkipList<Key,Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
+  // NULL n is considered infinite
+  return (n != NULL) && (compare_(n->key, key) < 0);
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOrEqual(const Key& key, Node** prev)
+    const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    Node* next = x->Next(level);
+    if (KeyIsAfterNode(key, next)) {
+      // Keep searching in this list
+      x = next;
+    } else {
+      if (prev != NULL) prev[level] = x;
+      if (level == 0) {
+        return next;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    }
+  }
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node*
+SkipList<Key,Comparator>::FindLessThan(const Key& key) const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    assert(x == head_ || compare_(x->key, key) < 0);
+    Node* next = x->Next(level);
+    if (next == NULL || compare_(next->key, key) >= 0) {
+      if (level == 0) {
+        return x;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    } else {
+      x = next;
+    }
+  }
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindLast()
+    const {
+  Node* x = head_;
+  int level = GetMaxHeight() - 1;
+  while (true) {
+    Node* next = x->Next(level);
+    if (next == NULL) {
+      if (level == 0) {
+        return x;
+      } else {
+        // Switch to next list
+        level--;
+      }
+    } else {
+      x = next;
+    }
+  }
+}
+
+template<typename Key, class Comparator>
+SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
+    : compare_(cmp),
+      arena_(arena),
+      head_(NewNode(0 /* any key will do */, kMaxHeight)),
+      max_height_(reinterpret_cast<void*>(1)),
+      rnd_(0xdeadbeef) {
+  for (int i = 0; i < kMaxHeight; i++) {
+    head_->SetNext(i, NULL);
+  }
+}
+
+template<typename Key, class Comparator>
+void SkipList<Key,Comparator>::Insert(const Key& key) {
+  // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
+  // here since Insert() is externally synchronized.
+  Node* prev[kMaxHeight];
+  Node* x = FindGreaterOrEqual(key, prev);
+
+  // Our data structure does not allow duplicate insertion
+  assert(x == NULL || !Equal(key, x->key));
+
+  int height = RandomHeight();
+  if (height > GetMaxHeight()) {
+    for (int i = GetMaxHeight(); i < height; i++) {
+      prev[i] = head_;
+    }
+    //fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
+
+    // It is ok to mutate max_height_ without any synchronization
+    // with concurrent readers.  A concurrent reader that observes
+    // the new value of max_height_ will see either the old value of
+    // new level pointers from head_ (NULL), or a new value set in
+    // the loop below.  In the former case the reader will
+    // immediately drop to the next level since NULL sorts after all
+    // keys.  In the latter case the reader will use the new node.
+    max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
+  }
+
+  x = NewNode(key, height);
+  for (int i = 0; i < height; i++) {
+    // NoBarrier_SetNext() suffices since we will add a barrier when
+    // we publish a pointer to "x" in prev[i].
+    x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
+    prev[i]->SetNext(i, x);
+  }
+}
+
+template<typename Key, class Comparator>
+bool SkipList<Key,Comparator>::Contains(const Key& key) const {
+  Node* x = FindGreaterOrEqual(key, NULL);
+  if (x != NULL && Equal(key, x->key)) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+}  // namespace leveldb
diff --git a/src/leveldb/db/skiplist_test.cc b/src/leveldb/db/skiplist_test.cc
new file mode 100644
index 0000000000..c78f4b4fb1
--- /dev/null
+++ b/src/leveldb/db/skiplist_test.cc
@@ -0,0 +1,378 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/skiplist.h"
+#include <set>
+#include "leveldb/env.h"
+#include "util/arena.h"
+#include "util/hash.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+typedef uint64_t Key;
+
+struct Comparator {
+  int operator()(const Key& a, const Key& b) const {
+    if (a < b) {
+      return -1;
+    } else if (a > b) {
+      return +1;
+    } else {
+      return 0;
+    }
+  }
+};
+
+class SkipTest { };
+
+TEST(SkipTest, Empty) {
+  Arena arena;
+  Comparator cmp;
+  SkipList<Key, Comparator> list(cmp, &arena);
+  ASSERT_TRUE(!list.Contains(10));
+
+  SkipList<Key, Comparator>::Iterator iter(&list);
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekToFirst();
+  ASSERT_TRUE(!iter.Valid());
+  iter.Seek(100);
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekToLast();
+  ASSERT_TRUE(!iter.Valid());
+}
+
+TEST(SkipTest, InsertAndLookup) {
+  const int N = 2000;
+  const int R = 5000;
+  Random rnd(1000);
+  std::set<Key> keys;
+  Arena arena;
+  Comparator cmp;
+  SkipList<Key, Comparator> list(cmp, &arena);
+  for (int i = 0; i < N; i++) {
+    Key key = rnd.Next() % R;
+    if (keys.insert(key).second) {
+      list.Insert(key);
+    }
+  }
+
+  for (int i = 0; i < R; i++) {
+    if (list.Contains(i)) {
+      ASSERT_EQ(keys.count(i), 1);
+    } else {
+      ASSERT_EQ(keys.count(i), 0);
+    }
+  }
+
+  // Simple iterator tests
+  {
+    SkipList<Key, Comparator>::Iterator iter(&list);
+    ASSERT_TRUE(!iter.Valid());
+
+    iter.Seek(0);
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.begin()), iter.key());
+
+    iter.SeekToFirst();
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.begin()), iter.key());
+
+    iter.SeekToLast();
+    ASSERT_TRUE(iter.Valid());
+    ASSERT_EQ(*(keys.rbegin()), iter.key());
+  }
+
+  // Forward iteration test
+  for (int i = 0; i < R; i++) {
+    SkipList<Key, Comparator>::Iterator iter(&list);
+    iter.Seek(i);
+
+    // Compare against model iterator
+    std::set<Key>::iterator model_iter = keys.lower_bound(i);
+    for (int j = 0; j < 3; j++) {
+      if (model_iter == keys.end()) {
+        ASSERT_TRUE(!iter.Valid());
+        break;
+      } else {
+        ASSERT_TRUE(iter.Valid());
+        ASSERT_EQ(*model_iter, iter.key());
+        ++model_iter;
+        iter.Next();
+      }
+    }
+  }
+
+  // Backward iteration test
+  {
+    SkipList<Key, Comparator>::Iterator iter(&list);
+    iter.SeekToLast();
+
+    // Compare against model iterator
+    for (std::set<Key>::reverse_iterator model_iter = keys.rbegin();
+         model_iter != keys.rend();
+         ++model_iter) {
+      ASSERT_TRUE(iter.Valid());
+      ASSERT_EQ(*model_iter, iter.key());
+      iter.Prev();
+    }
+    ASSERT_TRUE(!iter.Valid());
+  }
+}
+
+// We want to make sure that with a single writer and multiple
+// concurrent readers (with no synchronization other than when a
+// reader's iterator is created), the reader always observes all the
+// data that was present in the skip list when the iterator was
+// constructor.  Because insertions are happening concurrently, we may
+// also observe new values that were inserted since the iterator was
+// constructed, but we should never miss any values that were present
+// at iterator construction time.
+//
+// We generate multi-part keys:
+//     <key,gen,hash>
+// where:
+//     key is in range [0..K-1]
+//     gen is a generation number for key
+//     hash is hash(key,gen)
+//
+// The insertion code picks a random key, sets gen to be 1 + the last
+// generation number inserted for that key, and sets hash to Hash(key,gen).
+//
+// At the beginning of a read, we snapshot the last inserted
+// generation number for each key.  We then iterate, including random
+// calls to Next() and Seek().  For every key we encounter, we
+// check that it is either expected given the initial snapshot or has
+// been concurrently added since the iterator started.
+class ConcurrentTest {
+ private:
+  static const uint32_t K = 4;
+
+  static uint64_t key(Key key) { return (key >> 40); }
+  static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; }
+  static uint64_t hash(Key key) { return key & 0xff; }
+
+  static uint64_t HashNumbers(uint64_t k, uint64_t g) {
+    uint64_t data[2] = { k, g };
+    return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
+  }
+
+  static Key MakeKey(uint64_t k, uint64_t g) {
+    assert(sizeof(Key) == sizeof(uint64_t));
+    assert(k <= K);  // We sometimes pass K to seek to the end of the skiplist
+    assert(g <= 0xffffffffu);
+    return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
+  }
+
+  static bool IsValidKey(Key k) {
+    return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
+  }
+
+  static Key RandomTarget(Random* rnd) {
+    switch (rnd->Next() % 10) {
+      case 0:
+        // Seek to beginning
+        return MakeKey(0, 0);
+      case 1:
+        // Seek to end
+        return MakeKey(K, 0);
+      default:
+        // Seek to middle
+        return MakeKey(rnd->Next() % K, 0);
+    }
+  }
+
+  // Per-key generation
+  struct State {
+    port::AtomicPointer generation[K];
+    void Set(int k, intptr_t v) {
+      generation[k].Release_Store(reinterpret_cast<void*>(v));
+    }
+    intptr_t Get(int k) {
+      return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
+    }
+
+    State() {
+      for (int k = 0; k < K; k++) {
+        Set(k, 0);
+      }
+    }
+  };
+
+  // Current state of the test
+  State current_;
+
+  Arena arena_;
+
+  // SkipList is not protected by mu_.  We just use a single writer
+  // thread to modify it.
+  SkipList<Key, Comparator> list_;
+
+ public:
+  ConcurrentTest() : list_(Comparator(), &arena_) { }
+
+  // REQUIRES: External synchronization
+  void WriteStep(Random* rnd) {
+    const uint32_t k = rnd->Next() % K;
+    const intptr_t g = current_.Get(k) + 1;
+    const Key key = MakeKey(k, g);
+    list_.Insert(key);
+    current_.Set(k, g);
+  }
+
+  void ReadStep(Random* rnd) {
+    // Remember the initial committed state of the skiplist.
+    State initial_state;
+    for (int k = 0; k < K; k++) {
+      initial_state.Set(k, current_.Get(k));
+    }
+
+    Key pos = RandomTarget(rnd);
+    SkipList<Key, Comparator>::Iterator iter(&list_);
+    iter.Seek(pos);
+    while (true) {
+      Key current;
+      if (!iter.Valid()) {
+        current = MakeKey(K, 0);
+      } else {
+        current = iter.key();
+        ASSERT_TRUE(IsValidKey(current)) << current;
+      }
+      ASSERT_LE(pos, current) << "should not go backwards";
+
+      // Verify that everything in [pos,current) was not present in
+      // initial_state.
+      while (pos < current) {
+        ASSERT_LT(key(pos), K) << pos;
+
+        // Note that generation 0 is never inserted, so it is ok if
+        // <*,0,*> is missing.
+        ASSERT_TRUE((gen(pos) == 0) ||
+                    (gen(pos) > initial_state.Get(key(pos)))
+                    ) << "key: " << key(pos)
+                      << "; gen: " << gen(pos)
+                      << "; initgen: "
+                      << initial_state.Get(key(pos));
+
+        // Advance to next key in the valid key space
+        if (key(pos) < key(current)) {
+          pos = MakeKey(key(pos) + 1, 0);
+        } else {
+          pos = MakeKey(key(pos), gen(pos) + 1);
+        }
+      }
+
+      if (!iter.Valid()) {
+        break;
+      }
+
+      if (rnd->Next() % 2) {
+        iter.Next();
+        pos = MakeKey(key(pos), gen(pos) + 1);
+      } else {
+        Key new_target = RandomTarget(rnd);
+        if (new_target > pos) {
+          pos = new_target;
+          iter.Seek(new_target);
+        }
+      }
+    }
+  }
+};
+const uint32_t ConcurrentTest::K;
+
+// Simple test that does single-threaded testing of the ConcurrentTest
+// scaffolding.
+TEST(SkipTest, ConcurrentWithoutThreads) {
+  ConcurrentTest test;
+  Random rnd(test::RandomSeed());
+  for (int i = 0; i < 10000; i++) {
+    test.ReadStep(&rnd);
+    test.WriteStep(&rnd);
+  }
+}
+
+class TestState {
+ public:
+  ConcurrentTest t_;
+  int seed_;
+  port::AtomicPointer quit_flag_;
+
+  enum ReaderState {
+    STARTING,
+    RUNNING,
+    DONE
+  };
+
+  explicit TestState(int s)
+      : seed_(s),
+        quit_flag_(NULL),
+        state_(STARTING),
+        state_cv_(&mu_) {}
+
+  void Wait(ReaderState s) {
+    mu_.Lock();
+    while (state_ != s) {
+      state_cv_.Wait();
+    }
+    mu_.Unlock();
+  }
+
+  void Change(ReaderState s) {
+    mu_.Lock();
+    state_ = s;
+    state_cv_.Signal();
+    mu_.Unlock();
+  }
+
+ private:
+  port::Mutex mu_;
+  ReaderState state_;
+  port::CondVar state_cv_;
+};
+
+static void ConcurrentReader(void* arg) {
+  TestState* state = reinterpret_cast<TestState*>(arg);
+  Random rnd(state->seed_);
+  int64_t reads = 0;
+  state->Change(TestState::RUNNING);
+  while (!state->quit_flag_.Acquire_Load()) {
+    state->t_.ReadStep(&rnd);
+    ++reads;
+  }
+  state->Change(TestState::DONE);
+}
+
+static void RunConcurrent(int run) {
+  const int seed = test::RandomSeed() + (run * 100);
+  Random rnd(seed);
+  const int N = 1000;
+  const int kSize = 1000;
+  for (int i = 0; i < N; i++) {
+    if ((i % 100) == 0) {
+      fprintf(stderr, "Run %d of %d\n", i, N);
+    }
+    TestState state(seed + 1);
+    Env::Default()->Schedule(ConcurrentReader, &state);
+    state.Wait(TestState::RUNNING);
+    for (int i = 0; i < kSize; i++) {
+      state.t_.WriteStep(&rnd);
+    }
+    state.quit_flag_.Release_Store(&state);  // Any non-NULL arg will do
+    state.Wait(TestState::DONE);
+  }
+}
+
+TEST(SkipTest, Concurrent1) { RunConcurrent(1); }
+TEST(SkipTest, Concurrent2) { RunConcurrent(2); }
+TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
+TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
+TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
+
+}  // namespace leveldb
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/src/leveldb/db/snapshot.h b/src/leveldb/db/snapshot.h
new file mode 100644
index 0000000000..e7f8fd2c37
--- /dev/null
+++ b/src/leveldb/db/snapshot.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_
+#define STORAGE_LEVELDB_DB_SNAPSHOT_H_
+
+#include "leveldb/db.h"
+
+namespace leveldb {
+
+class SnapshotList;
+
+// Snapshots are kept in a doubly-linked list in the DB.
+// Each SnapshotImpl corresponds to a particular sequence number.
+class SnapshotImpl : public Snapshot {
+ public:
+  SequenceNumber number_;  // const after creation
+
+ private:
+  friend class SnapshotList;
+
+  // SnapshotImpl is kept in a doubly-linked circular list
+  SnapshotImpl* prev_;
+  SnapshotImpl* next_;
+
+  SnapshotList* list_;                 // just for sanity checks
+};
+
+class SnapshotList {
+ public:
+  SnapshotList() {
+    list_.prev_ = &list_;
+    list_.next_ = &list_;
+  }
+
+  bool empty() const { return list_.next_ == &list_; }
+  SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; }
+  SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; }
+
+  const SnapshotImpl* New(SequenceNumber seq) {
+    SnapshotImpl* s = new SnapshotImpl;
+    s->number_ = seq;
+    s->list_ = this;
+    s->next_ = &list_;
+    s->prev_ = list_.prev_;
+    s->prev_->next_ = s;
+    s->next_->prev_ = s;
+    return s;
+  }
+
+  void Delete(const SnapshotImpl* s) {
+    assert(s->list_ == this);
+    s->prev_->next_ = s->next_;
+    s->next_->prev_ = s->prev_;
+    delete s;
+  }
+
+ private:
+  // Dummy head of doubly-linked list of snapshots
+  SnapshotImpl list_;
+};
+
+}  // namespace leveldb
+
+#endif  // STORAGE_LEVELDB_DB_SNAPSHOT_H_
diff --git a/src/leveldb/db/table_cache.cc b/src/leveldb/db/table_cache.cc
new file mode 100644
index 0000000000..497db27076
--- /dev/null
+++ b/src/leveldb/db/table_cache.cc
@@ -0,0 +1,121 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/table_cache.h"
+
+#include "db/filename.h"
+#include "leveldb/env.h"
+#include "leveldb/table.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+struct TableAndFile {
+  RandomAccessFile* file;
+  Table* table;
+};
+
+static void DeleteEntry(const Slice& key, void* value) {
+  TableAndFile* tf = reinterpret_cast<TableAndFile*>(value);
+  delete tf->table;
+  delete tf->file;
+  delete tf;
+}
+
+static void UnrefEntry(void* arg1, void* arg2) {
+  Cache* cache = reinterpret_cast<Cache*>(arg1);
+  Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
+  cache->Release(h);
+}
+
+TableCache::TableCache(const std::string& dbname,
+                       const Options* options,
+                       int entries)
+    : env_(options->env),
+      dbname_(dbname),
+      options_(options),
+      cache_(NewLRUCache(entries)) {
+}
+
+TableCache::~TableCache() {
+  delete cache_;
+}
+
+Status TableCache::FindTable(uint64_t file_number, uint64_t file_size,
+                             Cache::Handle** handle) {
+  Status s;
+  char buf[sizeof(file_number)];
+  EncodeFixed64(buf, file_number);
+  Slice key(buf, sizeof(buf));
+  *handle = cache_->Lookup(key);
+  if (*handle == NULL) {
+    std::string fname = TableFileName(dbname_, file_number);
+    RandomAccessFile* file = NULL;
+    Table* table = NULL;
+    s = env_->NewRandomAccessFile(fname, &file);
+    if (s.ok()) {
+      s = Table::Open(*options_, file, file_size, &table);
+    }
+
+    if (!s.ok()) {
+      assert(table == NULL);
+      delete file;
+      // We do not cache error results so that if the error is transient,
+      // or somebody repairs the file, we recover automatically.
+    } else {
+      TableAndFile* tf = new TableAndFile;
+      tf->file = file;
+      tf->table = table;
+      *handle = cache_->Insert(key, tf, 1, &DeleteEntry);
+    }
+  }
+  return s;
+}
+
+Iterator* TableCache::NewIterator(const ReadOptions& options,
+                                  uint64_t file_number,
+                                  uint64_t file_size,
+                                  Table** tableptr) {
+  if (tableptr != NULL) {
+    *tableptr = NULL;
+  }
+
+  Cache::Handle* handle = NULL;
+  Status s = FindTable(file_number, file_size, &handle);
+  if (!s.ok()) {
+    return NewErrorIterator(s);
+  }
+
+  Table* table = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
+  Iterator* result = table->NewIterator(options);
+  result->RegisterCleanup(&UnrefEntry, cache_, handle);
+  if (tableptr != NULL) {
+    *tableptr = table;
+  }
+  return result;
+}
+
+Status TableCache::Get(const ReadOptions& options,
+                       uint64_t file_number,
+                       uint64_t file_size,
+                       const Slice& k,
+                       void* arg,
+                       void (*saver)(void*, const Slice&, const Slice&)) {
+  Cache::Handle* handle = NULL;
+  Status s = FindTable(file_number, file_size, &handle);
+  if (s.ok()) {
+    Table* t = reinterpret_cast<TableAndFile*>(cache_->Value(handle))->table;
+    s = t->InternalGet(options, k, arg, saver);
+    cache_->Release(handle);
+  }
+  return s;
+}
+
+void TableCache::Evict(uint64_t file_number) {
+  char buf[sizeof(file_number)];
+  EncodeFixed64(buf, file_number);
+  cache_->Erase(Slice(buf, sizeof(buf)));
+}
+
+}  // namespace leveldb
diff --git a/src/leveldb/db/table_cache.h b/src/leveldb/db/table_cache.h
new file mode 100644
index 0000000000..8cf4aaf12d
--- /dev/null
+++ b/src/leveldb/db/table_cache.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread-safe (provides internal synchronization)
+
+#ifndef STORAGE_LEVELDB_DB_TABLE_CACHE_H_
+#define STORAGE_LEVELDB_DB_TABLE_CACHE_H_
+
+#include <string>
+#include <stdint.h>
+#include "db/dbformat.h"
+#include "leveldb/cache.h"
+#include "leveldb/table.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+class Env;
+
+class TableCache {
+ public:
+  TableCache(const std::string& dbname, const Options* options, int entries);
+  ~TableCache();
+
+  // Return an iterator for the specified file number (the corresponding
+  // file length must be exactly "file_size" bytes).  If "tableptr" is
+  // non-NULL, also sets "*tableptr" to point to the Table object
+  // underlying the returned iterator, or NULL if no Table object underlies
+  // the returned iterator.  The returned "*tableptr" object is owned by
+  // the cache and should not be deleted, and is valid for as long as the
+  // returned iterator is live.
+  Iterator* NewIterator(const ReadOptions& options,
+                        uint64_t file_number,
+                        uint64_t file_size,
+                        Table** tableptr = NULL);
+
+  // If a seek to internal key "k" in specified file finds an entry,
+  // call (*handle_result)(arg, found_key, found_value).
+  Status Get(const ReadOptions& options,
+             uint64_t file_number,
+             uint64_t file_size,
+             const Slice& k,
+             void* arg,
+             void (*handle_result)(void*, const Slice&, const Slice&));
+
+  // Evict any entry for the specified file number
+  void Evict(uint64_t file_number);
+
+ private:
+  Env* const env_;
+  const std::string dbname_;
+  const Options* options_;
+  Cache* cache_;
+
+  Status FindTable(uint64_t file_number, uint64_t file_size, Cache::Handle**);
+};
+
+}  // namespace leveldb
+
+#endif  // STORAGE_LEVELDB_DB_TABLE_CACHE_H_
diff --git a/src/leveldb/db/version_edit.cc b/src/leveldb/db/version_edit.cc
new file mode 100644
index 0000000000..f10a2d58b2
--- /dev/null
+++ b/src/leveldb/db/version_edit.cc
@@ -0,0 +1,266 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+
+#include "db/version_set.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+// Tag numbers for serialized VersionEdit.  These numbers are written to
+// disk and should not be changed.
+enum Tag {
+  kComparator           = 1,
+  kLogNumber            = 2,
+  kNextFileNumber       = 3,
+  kLastSequence         = 4,
+  kCompactPointer       = 5,
+  kDeletedFile          = 6,
+  kNewFile              = 7,
+  // 8 was used for large value refs
+  kPrevLogNumber        = 9
+};
+
+void VersionEdit::Clear() {
+  comparator_.clear();
+  log_number_ = 0;
+  prev_log_number_ = 0;
+  last_sequence_ = 0;
+  next_file_number_ = 0;
+  has_comparator_ = false;
+  has_log_number_ = false;
+  has_prev_log_number_ = false;
+  has_next_file_number_ = false;
+  has_last_sequence_ = false;
+  deleted_files_.clear();
+  new_files_.clear();
+}
+
+void VersionEdit::EncodeTo(std::string* dst) const {
+  if (has_comparator_) {
+    PutVarint32(dst, kComparator);
+    PutLengthPrefixedSlice(dst, comparator_);
+  }
+  if (has_log_number_) {
+    PutVarint32(dst, kLogNumber);
+    PutVarint64(dst, log_number_);
+  }
+  if (has_prev_log_number_) {
+    PutVarint32(dst, kPrevLogNumber);
+    PutVarint64(dst, prev_log_number_);
+  }
+  if (has_next_file_number_) {
+    PutVarint32(dst, kNextFileNumber);
+    PutVarint64(dst, next_file_number_);
+  }
+  if (has_last_sequence_) {
+    PutVarint32(dst, kLastSequence);
+    PutVarint64(dst, last_sequence_);
+  }
+
+  for (size_t i = 0; i < compact_pointers_.size(); i++) {
+    PutVarint32(dst, kCompactPointer);
+    PutVarint32(dst, compact_pointers_[i].first);  // level
+    PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode());
+  }
+
+  for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
+       iter != deleted_files_.end();
+       ++iter) {
+    PutVarint32(dst, kDeletedFile);
+    PutVarint32(dst, iter->first);   // level
+    PutVarint64(dst, iter->second);  // file number
+  }
+
+  for (size_t i = 0; i < new_files_.size(); i++) {
+    const FileMetaData& f = new_files_[i].second;
+    PutVarint32(dst, kNewFile);
+    PutVarint32(dst, new_files_[i].first);  // level
+    PutVarint64(dst, f.number);
+    PutVarint64(dst, f.file_size);
+    PutLengthPrefixedSlice(dst, f.smallest.Encode());
+    PutLengthPrefixedSlice(dst, f.largest.Encode());
+  }
+}
+
+static bool GetInternalKey(Slice* input, InternalKey* dst) {
+  Slice str;
+  if (GetLengthPrefixedSlice(input, &str)) {
+    dst->DecodeFrom(str);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+static bool GetLevel(Slice* input, int* level) {
+  uint32_t v;
+  if (GetVarint32(input, &v) &&
+      v < config::kNumLevels) {
+    *level = v;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+Status VersionEdit::DecodeFrom(const Slice& src) {
+  Clear();
+  Slice input = src;
+  const char* msg = NULL;
+  uint32_t tag;
+
+  // Temporary storage for parsing
+  int level;
+  uint64_t number;
+  FileMetaData f;
+  Slice str;
+  InternalKey key;
+
+  while (msg == NULL && GetVarint32(&input, &tag)) {
+    switch (tag) {
+      case kComparator:
+        if (GetLengthPrefixedSlice(&input, &str)) {
+          comparator_ = str.ToString();
+          has_comparator_ = true;
+        } else {
+          msg = "comparator name";
+        }
+        break;
+
+      case kLogNumber:
+        if (GetVarint64(&input, &log_number_)) {
+          has_log_number_ = true;
+        } else {
+          msg = "log number";
+        }
+        break;
+
+      case kPrevLogNumber:
+        if (GetVarint64(&input, &prev_log_number_)) {
+          has_prev_log_number_ = true;
+        } else {
+          msg = "previous log number";
+        }
+        break;
+
+      case kNextFileNumber:
+        if (GetVarint64(&input, &next_file_number_)) {
+          has_next_file_number_ = true;
+        } else {
+          msg = "next file number";
+        }
+        break;
+
+      case kLastSequence:
+        if (GetVarint64(&input, &last_sequence_)) {
+          has_last_sequence_ = true;
+        } else {
+          msg = "last sequence number";
+        }
+        break;
+
+      case kCompactPointer:
+        if (GetLevel(&input, &level) &&
+            GetInternalKey(&input, &key)) {
+          compact_pointers_.push_back(std::make_pair(level, key));
+        } else {
+          msg = "compaction pointer";
+        }
+        break;
+
+      case kDeletedFile:
+        if (GetLevel(&input, &level) &&
+            GetVarint64(&input, &number)) {
+          deleted_files_.insert(std::make_pair(level, number));
+        } else {
+          msg = "deleted file";
+        }
+        break;
+
+      case kNewFile:
+        if (GetLevel(&input, &level) &&
+            GetVarint64(&input, &f.number) &&
+            GetVarint64(&input, &f.file_size) &&
+            GetInternalKey(&input, &f.smallest) &&
+            GetInternalKey(&input, &f.largest)) {
+          new_files_.push_back(std::make_pair(level, f));
+        } else {
+          msg = "new-file entry";
+        }
+        break;
+
+      default:
+        msg = "unknown tag";
+        break;
+    }
+  }
+
+  if (msg == NULL && !input.empty()) {
+    msg = "invalid tag";
+  }
+
+  Status result;
+  if (msg != NULL) {
+    result = Status::Corruption("VersionEdit", msg);
+  }
+  return result;
+}
+
+std::string VersionEdit::DebugString() const {
+  std::string r;
+  r.append("VersionEdit {");
+  if (has_comparator_) {
+    r.append("\n  Comparator: ");
+    r.append(comparator_);
+  }
+  if (has_log_number_) {
+    r.append("\n  LogNumber: ");
+    AppendNumberTo(&r, log_number_);
+  }
+  if (has_prev_log_number_) {
+    r.append("\n  PrevLogNumber: ");
+    AppendNumberTo(&r, prev_log_number_);
+  }
+  if (has_next_file_number_) {
+    r.append("\n  NextFile: ");
+    AppendNumberTo(&r, next_file_number_);
+  }
+  if (has_last_sequence_) {
+    r.append("\n  LastSeq: ");
+    AppendNumberTo(&r, last_sequence_);
+  }
+  for (size_t i = 0; i < compact_pointers_.size(); i++) {
+    r.append("\n  CompactPointer: ");
+    AppendNumberTo(&r, compact_pointers_[i].first);
+    r.append(" ");
+    r.append(compact_pointers_[i].second.DebugString());
+  }
+  for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
+       iter != deleted_files_.end();
+       ++iter) {
+    r.append("\n  DeleteFile: ");
+    AppendNumberTo(&r, iter->first);
+    r.append(" ");
+    AppendNumberTo(&r, iter->second);
+  }
+  for (size_t i = 0; i < new_files_.size(); i++) {
+    const FileMetaData& f = new_files_[i].second;
+    r.append("\n  AddFile: ");
+    AppendNumberTo(&r, new_files_[i].first);
+    r.append(" ");
+    AppendNumberTo(&r, f.number);
+    r.append(" ");
+    AppendNumberTo(&r, f.file_size);
+    r.append(" ");
+    r.append(f.smallest.DebugString());
+    r.append(" .. ");
+    r.append(f.largest.DebugString());
+  }
+  r.append("\n}\n");
+  return r;
+}
+
+}  // namespace leveldb
diff --git a/src/leveldb/db/version_edit.h b/src/leveldb/db/version_edit.h
new file mode 100644
index 0000000000..eaef77b327
--- /dev/null
+++ b/src/leveldb/db/version_edit.h
@@ -0,0 +1,107 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_VERSION_EDIT_H_
+#define STORAGE_LEVELDB_DB_VERSION_EDIT_H_
+
+#include <set>
+#include <utility>
+#include <vector>
+#include "db/dbformat.h"
+
+namespace leveldb {
+
+class VersionSet;
+
+struct FileMetaData {
+  int refs;
+  int allowed_seeks;          // Seeks allowed until compaction
+  uint64_t number;
+  uint64_t file_size;         // File size in bytes
+  InternalKey smallest;       // Smallest internal key served by table
+  InternalKey largest;        // Largest internal key served by table
+
+  FileMetaData() : refs(0), allowed_seeks(1 << 30), file_size(0) { }
+};
+
+class VersionEdit {
+ public:
+  VersionEdit() { Clear(); }
+  ~VersionEdit() { }
+
+  void Clear();
+
+  void SetComparatorName(const Slice& name) {
+    has_comparator_ = true;
+    comparator_ = name.ToString();
+  }
+  void SetLogNumber(uint64_t num) {
+    has_log_number_ = true;
+    log_number_ = num;
+  }
+  void SetPrevLogNumber(uint64_t num) {
+    has_prev_log_number_ = true;
+    prev_log_number_ = num;
+  }
+  void SetNextFile(uint64_t num) {
+    has_next_file_number_ = true;
+    next_file_number_ = num;
+  }
+  void SetLastSequence(SequenceNumber seq) {
+    has_last_sequence_ = true;
+    last_sequence_ = seq;
+  }
+  void SetCompactPointer(int level, const InternalKey& key) {
+    compact_pointers_.push_back(std::make_pair(level, key));
+  }
+
+  // Add the specified file at the specified number.
+  // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
+  // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
+  void AddFile(int level, uint64_t file,
+               uint64_t file_size,
+               const InternalKey& smallest,
+               const InternalKey& largest) {
+    FileMetaData f;
+    f.number = file;
+    f.file_size = file_size;
+    f.smallest = smallest;
+    f.largest = largest;
+    new_files_.push_back(std::make_pair(level, f));
+  }
+
+  // Delete the specified "file" from the specified "level".
+  void DeleteFile(int level, uint64_t file) {
+    deleted_files_.insert(std::make_pair(level, file));
+  }
+
+  void EncodeTo(std::string* dst) const;
+  Status DecodeFrom(const Slice& src);
+
+  std::string DebugString() const;
+
+ private:
+  friend class VersionSet;
+
+  typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;
+
+  std::string comparator_;
+  uint64_t log_number_;
+  uint64_t prev_log_number_;
+  uint64_t next_file_number_;
+  SequenceNumber last_sequence_;
+  bool has_comparator_;
+  bool has_log_number_;
+  bool has_prev_log_number_;
+  bool has_next_file_number_;
+  bool has_last_sequence_;
+
+  std::vector< std::pair<int, InternalKey> > compact_pointers_;
+  DeletedFileSet deleted_files_;
+  std::vector< std::pair<int, FileMetaData> > new_files_;
+};
+
+}  // namespace leveldb
+
+#endif  // STORAGE_LEVELDB_DB_VERSION_EDIT_H_
diff --git a/src/leveldb/db/version_edit_test.cc b/src/leveldb/db/version_edit_test.cc
new file mode 100644
index 0000000000..280310b49d
--- /dev/null
+++ b/src/leveldb/db/version_edit_test.cc
@@ -0,0 +1,46 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+static void TestEncodeDecode(const VersionEdit& edit) {
+  std::string encoded, encoded2;
+  edit.EncodeTo(&encoded);
+  VersionEdit parsed;
+  Status s = parsed.DecodeFrom(encoded);
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  parsed.EncodeTo(&encoded2);
+  ASSERT_EQ(encoded, encoded2);
+}
+
+class VersionEditTest { };
+
+TEST(VersionEditTest, EncodeDecode) {
+  static const uint64_t kBig = 1ull << 50;
+
+  VersionEdit edit;
+  for (int i = 0; i < 4; i++) {
+    TestEncodeDecode(edit);
+    edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
+                 InternalKey("foo", kBig + 500 + i, kTypeValue),
+                 InternalKey("zoo", kBig + 600 + i, kTypeDeletion));
+    edit.DeleteFile(4, kBig + 700 + i);
+    edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue));
+  }
+
+  edit.SetComparatorName("foo");
+  edit.SetLogNumber(kBig + 100);
+  edit.SetNextFile(kBig + 200);
+  edit.SetLastSequence(kBig + 1000);
+  TestEncodeDecode(edit);
+}
+
+}  // namespace leveldb
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/src/leveldb/db/version_set.cc b/src/leveldb/db/version_set.cc
new file mode 100644
index 0000000000..cf976b437e
--- /dev/null
+++ b/src/leveldb/db/version_set.cc
@@ -0,0 +1,1402 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+
+#include <algorithm>
+#include <stdio.h>
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "leveldb/env.h"
+#include "leveldb/table_builder.h"
+#include "table/merger.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+static const int kTargetFileSize = 2 * 1048576;
+
+// Maximum bytes of overlaps in grandparent (i.e., level+2) before we
+// stop building a single file in a level->level+1 compaction.
+static const int64_t kMaxGrandParentOverlapBytes = 10 * kTargetFileSize;
+
+// Maximum number of bytes in all compacted files.  We avoid expanding
+// the lower level file set of a compaction if it would make the
+// total compaction cover more than this many bytes.
+static const int64_t kExpandedCompactionByteSizeLimit = 25 * kTargetFileSize;
+
+static double MaxBytesForLevel(int level) {
+  // Note: the result for level zero is not really used since we set
+  // the level-0 compaction threshold based on number of files.
+  double result = 10 * 1048576.0;  // Result for both level-0 and level-1
+  while (level > 1) {
+    result *= 10;
+    level--;
+  }
+  return result;
+}
+
+static uint64_t MaxFileSizeForLevel(int level) {
+  return kTargetFileSize;  // We could vary per level to reduce number of files?
+}
+
+static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+  int64_t sum = 0;
+  for (size_t i = 0; i < files.size(); i++) {
+    sum += files[i]->file_size;
+  }
+  return sum;
+}
+
+namespace {
+std::string IntSetToString(const std::set<uint64_t>& s) {
+  std::string result = "{";
+  for (std::set<uint64_t>::const_iterator it = s.begin();
+       it != s.end();
+       ++it) {
+    result += (result.size() > 1) ? "," : "";
+    result += NumberToString(*it);
+  }
+  result += "}";
+  return result;
+}
+}  // namespace
+
+Version::~Version() {
+  assert(refs_ == 0);
+
+  // Remove from linked list
+  prev_->next_ = next_;
+  next_->prev_ = prev_;
+
+  // Drop references to files
+  for (int level = 0; level < config::kNumLevels; level++) {
+    for (size_t i = 0; i < files_[level].size(); i++) {
+      FileMetaData* f = files_[level][i];
+      assert(f->refs > 0);
+      f->refs--;
+      if (f->refs <= 0) {
+        delete f;
+      }
+    }
+  }
+}
+
+int FindFile(const InternalKeyComparator& icmp,
+             const std::vector<FileMetaData*>& files,
+             const Slice& key) {
+  uint32_t left = 0;
+  uint32_t right = files.size();
+  while (left < right) {
+    uint32_t mid = (left + right) / 2;
+    const FileMetaData* f = files[mid];
+    if (icmp.InternalKeyComparator::Compare(f->largest.Encode(), key) < 0) {
+      // Key at "mid.largest" is < "target".  Therefore all
+      // files at or before "mid" are uninteresting.
+      left = mid + 1;
+    } else {
+      // Key at "mid.largest" is >= "target".  Therefore all files
+      // after "mid" are uninteresting.
+      right = mid;
+    }
+  }
+  return right;
+}
+
+static bool AfterFile(const Comparator* ucmp,
+                      const Slice* user_key, const FileMetaData* f) {
+  // NULL user_key occurs before all keys and is therefore never after *f
+  return (user_key != NULL &&
+          ucmp->Compare(*user_key, f->largest.user_key()) > 0);
+}
+
+static bool BeforeFile(const Comparator* ucmp,
+                       const Slice* user_key, const FileMetaData* f) {
+  // NULL user_key occurs after all keys and is therefore never before *f
+  return (user_key != NULL &&
+          ucmp->Compare(*user_key, f->smallest.user_key()) < 0);
+}
+
+bool SomeFileOverlapsRange(
+    const InternalKeyComparator& icmp,
+    bool disjoint_sorted_files,
+    const std::vector<FileMetaData*>& files,
+    const Slice* smallest_user_key,
+    const Slice* largest_user_key) {
+  const Comparator* ucmp = icmp.user_comparator();
+  if (!disjoint_sorted_files) {
+    // Need to check against all files
+    for (size_t i = 0; i < files.size(); i++) {
+      const FileMetaData* f = files[i];
+      if (AfterFile(ucmp, smallest_user_key, f) ||
+          BeforeFile(ucmp, largest_user_key, f)) {
+        // No overlap
+      } else {
+        return true;  // Overlap
+      }
+    }
+    return false;
+  }
+
+  // Binary search over file list
+  uint32_t index = 0;
+  if (smallest_user_key != NULL) {
+    // Find the earliest possible internal key for smallest_user_key
+    InternalKey small(*smallest_user_key, kMaxSequenceNumber,kValueTypeForSeek);
+    index = FindFile(icmp, files, small.Encode());
+  }
+
+  if (index >= files.size()) {
+    // beginning of range is after all files, so no overlap.
+    return false;
+  }
+
+  return !BeforeFile(ucmp, largest_user_key, files[index]);
+}
+
+// An internal iterator.  For a given version/level pair, yields
+// information about the files in the level.  For a given entry, key()
+// is the largest key that occurs in the file, and value() is an
+// 16-byte value containing the file number and file size, both
+// encoded using EncodeFixed64.
+class Version::LevelFileNumIterator : public Iterator {
+ public:
+  LevelFileNumIterator(const InternalKeyComparator& icmp,
+                       const std::vector<FileMetaData*>* flist)
+      : icmp_(icmp),
+        flist_(flist),
+        index_(flist->size()) {        // Marks as invalid
+  }
+  virtual bool Valid() const {
+    return index_ < flist_->size();
+  }
+  virtual void Seek(const Slice& target) {
+    index_ = FindFile(icmp_, *flist_, target);
+  }
+  virtual void SeekToFirst() { index_ = 0; }
+  virtual void SeekToLast() {
+    index_ = flist_->empty() ? 0 : flist_->size() - 1;
+  }
+  virtual void Next() {
+    assert(Valid());
+    index_++;
+  }
+  virtual void Prev() {
+    assert(Valid());
+    if (index_ == 0) {
+      index_ = flist_->size();  // Marks as invalid
+    } else {
+      index_--;
+    }
+  }
+  Slice key() const {
+    assert(Valid());
+    return (*flist_)[index_]->largest.Encode();
+  }
+  Slice value() const {
+    assert(Valid());
+    EncodeFixed64(value_buf_, (*flist_)[index_]->number);
+    EncodeFixed64(value_buf_+8, (*flist_)[index_]->file_size);
+    return Slice(value_buf_, sizeof(value_buf_));
+  }
+  virtual Status status() const { return Status::OK(); }
+ private:
+  const InternalKeyComparator icmp_;
+  const std::vector<FileMetaData*>* const flist_;
+  uint32_t index_;
+
+  // Backing store for value().  Holds the file number and size.
+  mutable char value_buf_[16];
+};
+
+static Iterator* GetFileIterator(void* arg,
+                                 const ReadOptions& options,
+                                 const Slice& file_value) {
+  TableCache* cache = reinterpret_cast<TableCache*>(arg);
+  if (file_value.size() != 16) {
+    return NewErrorIterator(
+        Status::Corruption("FileReader invoked with unexpected value"));
+  } else {
+    return cache->NewIterator(options,
+                              DecodeFixed64(file_value.data()),
+                              DecodeFixed64(file_value.data() + 8));
+  }
+}
+
+Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
+                                            int level) const {
+  return NewTwoLevelIterator(
+      new LevelFileNumIterator(vset_->icmp_, &files_[level]),
+      &GetFileIterator, vset_->table_cache_, options);
+}
+
+void Version::AddIterators(const ReadOptions& options,
+                           std::vector<Iterator*>* iters) {
+  // Merge all level zero files together since they may overlap
+  for (size_t i = 0; i < files_[0].size(); i++) {
+    iters->push_back(
+        vset_->table_cache_->NewIterator(
+            options, files_[0][i]->number, files_[0][i]->file_size));
+  }
+
+  // For levels > 0, we can use a concatenating iterator that sequentially
+  // walks through the non-overlapping files in the level, opening them
+  // lazily.
+  for (int level = 1; level < config::kNumLevels; level++) {
+    if (!files_[level].empty()) {
+      iters->push_back(NewConcatenatingIterator(options, level));
+    }
+  }
+}
+
+// Callback from TableCache::Get()
+namespace {
+enum SaverState {
+  kNotFound,
+  kFound,
+  kDeleted,
+  kCorrupt,
+};
+struct Saver {
+  SaverState state;
+  const Comparator* ucmp;
+  Slice user_key;
+  std::string* value;
+};
+}
+static void SaveValue(void* arg, const Slice& ikey, const Slice& v) {
+  Saver* s = reinterpret_cast<Saver*>(arg);
+  ParsedInternalKey parsed_key;
+  if (!ParseInternalKey(ikey, &parsed_key)) {
+    s->state = kCorrupt;
+  } else {
+    if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) {
+      s->state = (parsed_key.type == kTypeValue) ? kFound : kDeleted;
+      if (s->state == kFound) {
+        s->value->assign(v.data(), v.size());
+      }
+    }
+  }
+}
+
+static bool NewestFirst(FileMetaData* a, FileMetaData* b) {
+  return a->number > b->number;
+}
+
+Status Version::Get(const ReadOptions& options,
+                    const LookupKey& k,
+                    std::string* value,
+                    GetStats* stats) {
+  Slice ikey = k.internal_key();
+  Slice user_key = k.user_key();
+  const Comparator* ucmp = vset_->icmp_.user_comparator();
+  Status s;
+
+  stats->seek_file = NULL;
+  stats->seek_file_level = -1;
+  FileMetaData* last_file_read = NULL;
+  int last_file_read_level = -1;
+
+  // We can search level-by-level since entries never hop across
+  // levels.  Therefore we are guaranteed that if we find data
+  // in an smaller level, later levels are irrelevant.
+  std::vector<FileMetaData*> tmp;
+  FileMetaData* tmp2;
+  for (int level = 0; level < config::kNumLevels; level++) {
+    size_t num_files = files_[level].size();
+    if (num_files == 0) continue;
+
+    // Get the list of files to search in this level
+    FileMetaData* const* files = &files_[level][0];
+    if (level == 0) {
+      // Level-0 files may overlap each other.  Find all files that
+      // overlap user_key and process them in order from newest to oldest.
+      tmp.reserve(num_files);
+      for (uint32_t i = 0; i < num_files; i++) {
+        FileMetaData* f = files[i];
+        if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 &&
+            ucmp->Compare(user_key, f->largest.user_key()) <= 0) {
+          tmp.push_back(f);
+        }
+      }
+      if (tmp.empty()) continue;
+
+      std::sort(tmp.begin(), tmp.end(), NewestFirst);
+      files = &tmp[0];
+      num_files = tmp.size();
+    } else {
+      // Binary search to find earliest index whose largest key >= ikey.
+      uint32_t index = FindFile(vset_->icmp_, files_[level], ikey);
+      if (index >= num_files) {
+        files = NULL;
+        num_files = 0;
+      } else {
+        tmp2 = files[index];
+        if (ucmp->Compare(user_key, tmp2->smallest.user_key()) < 0) {
+          // All of "tmp2" is past any data for user_key
+          files = NULL;
+          num_files = 0;
+        } else {
+          files = &tmp2;
+          num_files = 1;
+        }
+      }
+    }
+
+    for (uint32_t i = 0; i < num_files; ++i) {
+      if (last_file_read != NULL && stats->seek_file == NULL) {
+        // We have had more than one seek for this read.  Charge the 1st file.
+        stats->seek_file = last_file_read;
+        stats->seek_file_level = last_file_read_level;
+      }
+
+      FileMetaData* f = files[i];
+      last_file_read = f;
+      last_file_read_level = level;
+
+      Saver saver;
+      saver.state = kNotFound;
+      saver.ucmp = ucmp;
+      saver.user_key = user_key;
+      saver.value = value;
+      s = vset_->table_cache_->Get(options, f->number, f->file_size,
+                                   ikey, &saver, SaveValue);
+      if (!s.ok()) {
+        return s;
+      }
+      switch (saver.state) {
+        case kNotFound:
+          break;      // Keep searching in other files
+        case kFound:
+          return s;
+        case kDeleted:
+          s = Status::NotFound(Slice());  // Use empty error message for speed
+          return s;
+        case kCorrupt:
+          s = Status::Corruption("corrupted key for ", user_key);
+          return s;
+      }
+    }
+  }
+
+  return Status::NotFound(Slice());  // Use an empty error message for speed
+}
+
+bool Version::UpdateStats(const GetStats& stats) {
+  FileMetaData* f = stats.seek_file;
+  if (f != NULL) {
+    f->allowed_seeks--;
+    if (f->allowed_seeks <= 0 && file_to_compact_ == NULL) {
+      file_to_compact_ = f;
+      file_to_compact_level_ = stats.seek_file_level;
+      return true;
+    }
+  }
+  return false;
+}
+
+void Version::Ref() {
+  ++refs_;
+}
+
+void Version::Unref() {
+  assert(this != &vset_->dummy_versions_);
+  assert(refs_ >= 1);
+  --refs_;
+  if (refs_ == 0) {
+    delete this;
+  }
+}
+
+bool Version::OverlapInLevel(int level,
+                             const Slice* smallest_user_key,
+                             const Slice* largest_user_key) {
+  return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level],
+                               smallest_user_key, largest_user_key);
+}
+
+int Version::PickLevelForMemTableOutput(
+    const Slice& smallest_user_key,
+    const Slice& largest_user_key) {
+  int level = 0;
+  if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) {
+    // Push to next level if there is no overlap in next level,
+    // and the #bytes overlapping in the level after that are limited.
+    InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek);
+    InternalKey limit(largest_user_key, 0, static_cast<ValueType>(0));
+    std::vector<FileMetaData*> overlaps;
+    while (level < config::kMaxMemCompactLevel) {
+      if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) {
+        break;
+      }
+      GetOverlappingInputs(level + 2, &start, &limit, &overlaps);
+      const int64_t sum = TotalFileSize(overlaps);
+      if (sum > kMaxGrandParentOverlapBytes) {
+        break;
+      }
+      level++;
+    }
+  }
+  return level;
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end]
+void Version::GetOverlappingInputs(
+    int level,
+    const InternalKey* begin,
+    const InternalKey* end,
+    std::vector<FileMetaData*>* inputs) {
+  inputs->clear();
+  Slice user_begin, user_end;
+  if (begin != NULL) {
+    user_begin = begin->user_key();
+  }
+  if (end != NULL) {
+    user_end = end->user_key();
+  }
+  const Comparator* user_cmp = vset_->icmp_.user_comparator();
+  for (size_t i = 0; i < files_[level].size(); ) {
+    FileMetaData* f = files_[level][i++];
+    const Slice file_start = f->smallest.user_key();
+    const Slice file_limit = f->largest.user_key();
+    if (begin != NULL && user_cmp->Compare(file_limit, user_begin) < 0) {
+      // "f" is completely before specified range; skip it
+    } else if (end != NULL && user_cmp->Compare(file_start, user_end) > 0) {
+      // "f" is completely after specified range; skip it
+    } else {
+      inputs->push_back(f);
+      if (level == 0) {
+        // Level-0 files may overlap each other.  So check if the newly
+        // added file has expanded the range.  If so, restart search.
+        if (begin != NULL && user_cmp->Compare(file_start, user_begin) < 0) {
+          user_begin = file_start;
+          inputs->clear();
+          i = 0;
+        } else if (end != NULL && user_cmp->Compare(file_limit, user_end) > 0) {
+          user_end = file_limit;
+          inputs->clear();
+          i = 0;
+        }
+      }
+    }
+  }
+}
+
+std::string Version::DebugString() const {
+  std::string r;
+  for (int level = 0; level < config::kNumLevels; level++) {
+    // E.g.,
+    //   --- level 1 ---
+    //   17:123['a' .. 'd']
+    //   20:43['e' .. 'g']
+    r.append("--- level ");
+    AppendNumberTo(&r, level);
+    r.append(" ---\n");
+    const std::vector<FileMetaData*>& files = files_[level];
+    for (size_t i = 0; i < files.size(); i++) {
+      r.push_back(' ');
+      AppendNumberTo(&r, files[i]->number);
+      r.push_back(':');
+      AppendNumberTo(&r, files[i]->file_size);
+      r.append("[");
+      r.append(files[i]->smallest.DebugString());
+      r.append(" .. ");
+      r.append(files[i]->largest.DebugString());
+      r.append("]\n");
+    }
+  }
+  return r;
+}
+
+// A helper class so we can efficiently apply a whole sequence
+// of edits to a particular state without creating intermediate
+// Versions that contain full copies of the intermediate state.
+class VersionSet::Builder {
+ private:
+  // Helper to sort by v->files_[file_number].smallest
+  struct BySmallestKey {
+    const InternalKeyComparator* internal_comparator;
+
+    bool operator()(FileMetaData* f1, FileMetaData* f2) const {
+      int r = internal_comparator->Compare(f1->smallest, f2->smallest);
+      if (r != 0) {
+        return (r < 0);
+      } else {
+        // Break ties by file number
+        return (f1->number < f2->number);
+      }
+    }
+  };
+
+  typedef std::set<FileMetaData*, BySmallestKey> FileSet;
+  struct LevelState {
+    std::set<uint64_t> deleted_files;
+    FileSet* added_files;
+  };
+
+  VersionSet* vset_;
+  Version* base_;
+  LevelState levels_[config::kNumLevels];
+
+ public:
+  // Initialize a builder with the files from *base and other info from *vset
+  Builder(VersionSet* vset, Version* base)
+      : vset_(vset),
+        base_(base) {
+    base_->Ref();
+    BySmallestKey cmp;
+    cmp.internal_comparator = &vset_->icmp_;
+    for (int level = 0; level < config::kNumLevels; level++) {
+      levels_[level].added_files = new FileSet(cmp);
+    }
+  }
+
+  ~Builder() {
+    for (int level = 0; level < config::kNumLevels; level++) {
+      const FileSet* added = levels_[level].added_files;
+      std::vector<FileMetaData*> to_unref;
+      to_unref.reserve(added->size());
+      for (FileSet::const_iterator it = added->begin();
+          it != added->end(); ++it) {
+        to_unref.push_back(*it);
+      }
+      delete added;
+      for (uint32_t i = 0; i < to_unref.size(); i++) {
+        FileMetaData* f = to_unref[i];
+        f->refs--;
+        if (f->refs <= 0) {
+          delete f;
+        }
+      }
+    }
+    base_->Unref();
+  }
+
+  // Apply all of the edits in *edit to the current state.
+  void Apply(VersionEdit* edit) {
+    // Update compaction pointers
+    for (size_t i = 0; i < edit->compact_pointers_.size(); i++) {
+      const int level = edit->compact_pointers_[i].first;
+      vset_->compact_pointer_[level] =
+          edit->compact_pointers_[i].second.Encode().ToString();
+    }
+
+    // Delete files
+    const VersionEdit::DeletedFileSet& del = edit->deleted_files_;
+    for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin();
+         iter != del.end();
+         ++iter) {
+      const int level = iter->first;
+      const uint64_t number = iter->second;
+      levels_[level].deleted_files.insert(number);
+    }
+
+    // Add new files
+    for (size_t i = 0; i < edit->new_files_.size(); i++) {
+      const int level = edit->new_files_[i].first;
+      FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
+      f->refs = 1;
+
+      // We arrange to automatically compact this file after
+      // a certain number of seeks.  Let's assume:
+      //   (1) One seek costs 10ms
+      //   (2) Writing or reading 1MB costs 10ms (100MB/s)
+      //   (3) A compaction of 1MB does 25MB of IO:
+      //         1MB read from this level
+      //         10-12MB read from next level (boundaries may be misaligned)
+      //         10-12MB written to next level
+      // This implies that 25 seeks cost the same as the compaction
+      // of 1MB of data.  I.e., one seek costs approximately the
+      // same as the compaction of 40KB of data.  We are a little
+      // conservative and allow approximately one seek for every 16KB
+      // of data before triggering a compaction.
+      f->allowed_seeks = (f->file_size / 16384);
+      if (f->allowed_seeks < 100) f->allowed_seeks = 100;
+
+      levels_[level].deleted_files.erase(f->number);
+      levels_[level].added_files->insert(f);
+    }
+  }
+
+  // Save the current state in *v.
+  void SaveTo(Version* v) {
+    BySmallestKey cmp;
+    cmp.internal_comparator = &vset_->icmp_;
+    for (int level = 0; level < config::kNumLevels; level++) {
+      // Merge the set of added files with the set of pre-existing files.
+      // Drop any deleted files.  Store the result in *v.
+      const std::vector<FileMetaData*>& base_files = base_->files_[level];
+      std::vector<FileMetaData*>::const_iterator base_iter = base_files.begin();
+      std::vector<FileMetaData*>::const_iterator base_end = base_files.end();
+      const FileSet* added = levels_[level].added_files;
+      v->files_[level].reserve(base_files.size() + added->size());
+      for (FileSet::const_iterator added_iter = added->begin();
+           added_iter != added->end();
+           ++added_iter) {
+        // Add all smaller files listed in base_
+        for (std::vector<FileMetaData*>::const_iterator bpos
+                 = std::upper_bound(base_iter, base_end, *added_iter, cmp);
+             base_iter != bpos;
+             ++base_iter) {
+          MaybeAddFile(v, level, *base_iter);
+        }
+
+        MaybeAddFile(v, level, *added_iter);
+      }
+
+      // Add remaining base files
+      for (; base_iter != base_end; ++base_iter) {
+        MaybeAddFile(v, level, *base_iter);
+      }
+
+#ifndef NDEBUG
+      // Make sure there is no overlap in levels > 0
+      if (level > 0) {
+        for (uint32_t i = 1; i < v->files_[level].size(); i++) {
+          const InternalKey& prev_end = v->files_[level][i-1]->largest;
+          const InternalKey& this_begin = v->files_[level][i]->smallest;
+          if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) {
+            fprintf(stderr, "overlapping ranges in same level %s vs. %s\n",
+                    prev_end.DebugString().c_str(),
+                    this_begin.DebugString().c_str());
+            abort();
+          }
+        }
+      }
+#endif
+    }
+  }
+
+  void MaybeAddFile(Version* v, int level, FileMetaData* f) {
+    if (levels_[level].deleted_files.count(f->number) > 0) {
+      // File is deleted: do nothing
+    } else {
+      std::vector<FileMetaData*>* files = &v->files_[level];
+      if (level > 0 && !files->empty()) {
+        // Must not overlap
+        assert(vset_->icmp_.Compare((*files)[files->size()-1]->largest,
+                                    f->smallest) < 0);
+      }
+      f->refs++;
+      files->push_back(f);
+    }
+  }
+};
+
+VersionSet::VersionSet(const std::string& dbname,
+                       const Options* options,
+                       TableCache* table_cache,
+                       const InternalKeyComparator* cmp)
+    : env_(options->env),
+      dbname_(dbname),
+      options_(options),
+      table_cache_(table_cache),
+      icmp_(*cmp),
+      next_file_number_(2),
+      manifest_file_number_(0),  // Filled by Recover()
+      last_sequence_(0),
+      log_number_(0),
+      prev_log_number_(0),
+      descriptor_file_(NULL),
+      descriptor_log_(NULL),
+      dummy_versions_(this),
+      current_(NULL) {
+  AppendVersion(new Version(this));
+}
+
+VersionSet::~VersionSet() {
+  current_->Unref();
+  assert(dummy_versions_.next_ == &dummy_versions_);  // List must be empty
+  delete descriptor_log_;
+  delete descriptor_file_;
+}
+
+void VersionSet::AppendVersion(Version* v) {
+  // Make "v" current
+  assert(v->refs_ == 0);
+  assert(v != current_);
+  if (current_ != NULL) {
+    current_->Unref();
+  }
+  current_ = v;
+  v->Ref();
+
+  // Append to linked list
+  v->prev_ = dummy_versions_.prev_;
+  v->next_ = &dummy_versions_;
+  v->prev_->next_ = v;
+  v->next_->prev_ = v;
+}
+
+Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
+  if (edit->has_log_number_) {
+    assert(edit->log_number_ >= log_number_);
+    assert(edit->log_number_ < next_file_number_);
+  } else {
+    edit->SetLogNumber(log_number_);
+  }
+
+  if (!edit->has_prev_log_number_) {
+    edit->SetPrevLogNumber(prev_log_number_);
+  }
+
+  edit->SetNextFile(next_file_number_);
+  edit->SetLastSequence(last_sequence_);
+
+  Version* v = new Version(this);
+  {
+    Builder builder(this, current_);
+    builder.Apply(edit);
+    builder.SaveTo(v);
+  }
+  Finalize(v);
+
+  // Initialize new descriptor log file if necessary by creating
+  // a temporary file that contains a snapshot of the current version.
+  std::string new_manifest_file;
+  Status s;
+  if (descriptor_log_ == NULL) {
+    // No reason to unlock *mu here since we only hit this path in the
+    // first call to LogAndApply (when opening the database).
+    assert(descriptor_file_ == NULL);
+    new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
+    edit->SetNextFile(next_file_number_);
+    s = env_->NewWritableFile(new_manifest_file, &descriptor_file_);
+    if (s.ok()) {
+      descriptor_log_ = new log::Writer(descriptor_file_);
+      s = WriteSnapshot(descriptor_log_);
+    }
+  }
+
+  // Unlock during expensive MANIFEST log write
+  {
+    mu->Unlock();
+
+    // Write new record to MANIFEST log
+    if (s.ok()) {
+      std::string record;
+      edit->EncodeTo(&record);
+      s = descriptor_log_->AddRecord(record);
+      if (s.ok()) {
+        s = descriptor_file_->Sync();
+      }
+    }
+
+    // If we just created a new descriptor file, install it by writing a
+    // new CURRENT file that points to it.
+    if (s.ok() && !new_manifest_file.empty()) {
+      s = SetCurrentFile(env_, dbname_, manifest_file_number_);
+    }
+
+    mu->Lock();
+  }
+
+  // Install the new version
+  if (s.ok()) {
+    AppendVersion(v);
+    log_number_ = edit->log_number_;
+    prev_log_number_ = edit->prev_log_number_;
+  } else {
+    delete v;
+    if (!new_manifest_file.empty()) {
+      delete descriptor_log_;
+      delete descriptor_file_;
+      descriptor_log_ = NULL;
+      descriptor_file_ = NULL;
+      env_->DeleteFile(new_manifest_file);
+    }
+  }
+
+  return s;
+}
+
+Status VersionSet::Recover() {
+  struct LogReporter : public log::Reader::Reporter {
+    Status* status;
+    virtual void Corruption(size_t bytes, const Status& s) {
+      if (this->status->ok()) *this->status = s;
+    }
+  };
+
+  // Read "CURRENT" file, which contains a pointer to the current manifest file
+  std::string current;
+  Status s = ReadFileToString(env_, CurrentFileName(dbname_), &current);
+  if (!s.ok()) {
+    return s;
+  }
+  if (current.empty() || current[current.size()-1] != '\n') {
+    return Status::Corruption("CURRENT file does not end with newline");
+  }
+  current.resize(current.size() - 1);
+
+  std::string dscname = dbname_ + "/" + current;
+  SequentialFile* file;
+  s = env_->NewSequentialFile(dscname, &file);
+  if (!s.ok()) {
+    return s;
+  }
+
+  bool have_log_number = false;
+  bool have_prev_log_number = false;
+  bool have_next_file = false;
+  bool have_last_sequence = false;
+  uint64_t next_file = 0;
+  uint64_t last_sequence = 0;
+  uint64_t log_number = 0;
+  uint64_t prev_log_number = 0;
+  Builder builder(this, current_);
+
+  {
+    LogReporter reporter;
+    reporter.status = &s;
+    log::Reader reader(file, &reporter, true/*checksum*/, 0/*initial_offset*/);
+    Slice record;
+    std::string scratch;
+    while (reader.ReadRecord(&record, &scratch) && s.ok()) {
+      VersionEdit edit;
+      s = edit.DecodeFrom(record);
+      if (s.ok()) {
+        if (edit.has_comparator_ &&
+            edit.comparator_ != icmp_.user_comparator()->Name()) {
+          s = Status::InvalidArgument(
+              edit.comparator_ + "does not match existing comparator ",
+              icmp_.user_comparator()->Name());
+        }
+      }
+
+      if (s.ok()) {
+        builder.Apply(&edit);
+      }
+
+      if (edit.has_log_number_) {
+        log_number = edit.log_number_;
+        have_log_number = true;
+      }
+
+      if (edit.has_prev_log_number_) {
+        prev_log_number = edit.prev_log_number_;
+        have_prev_log_number = true;
+      }
+
+      if (edit.has_next_file_number_) {
+        next_file = edit.next_file_number_;
+        have_next_file = true;
+      }
+
+      if (edit.has_last_sequence_) {
+        last_sequence = edit.last_sequence_;
+        have_last_sequence = true;
+      }
+    }
+  }
+  delete file;
+  file = NULL;
+
+  if (s.ok()) {
+    if (!have_next_file) {
+      s = Status::Corruption("no meta-nextfile entry in descriptor");
+    } else if (!have_log_number) {
+      s = Status::Corruption("no meta-lognumber entry in descriptor");
+    } else if (!have_last_sequence) {
+      s = Status::Corruption("no last-sequence-number entry in descriptor");
+    }
+
+    if (!have_prev_log_number) {
+      prev_log_number = 0;
+    }
+
+    MarkFileNumberUsed(prev_log_number);
+    MarkFileNumberUsed(log_number);
+  }
+
+  if (s.ok()) {
+    Version* v = new Version(this);
+    builder.SaveTo(v);
+    // Install recovered version
+    Finalize(v);
+    AppendVersion(v);
+    manifest_file_number_ = next_file;
+    next_file_number_ = next_file + 1;
+    last_sequence_ = last_sequence;
+    log_number_ = log_number;
+    prev_log_number_ = prev_log_number;
+  }
+
+  return s;
+}
+
+void VersionSet::MarkFileNumberUsed(uint64_t number) {
+  if (next_file_number_ <= number) {
+    next_file_number_ = number + 1;
+  }
+}
+
+void VersionSet::Finalize(Version* v) {
+  // Precomputed best level for next compaction
+  int best_level = -1;
+  double best_score = -1;
+
+  for (int level = 0; level < config::kNumLevels-1; level++) {
+    double score;
+    if (level == 0) {
+      // We treat level-0 specially by bounding the number of files
+      // instead of number of bytes for two reasons:
+      //
+      // (1) With larger write-buffer sizes, it is nice not to do too
+      // many level-0 compactions.
+      //
+      // (2) The files in level-0 are merged on every read and
+      // therefore we wish to avoid too many files when the individual
+      // file size is small (perhaps because of a small write-buffer
+      // setting, or very high compression ratios, or lots of
+      // overwrites/deletions).
+      score = v->files_[level].size() /
+          static_cast<double>(config::kL0_CompactionTrigger);
+    } else {
+      // Compute the ratio of current size to size limit.
+      const uint64_t level_bytes = TotalFileSize(v->files_[level]);
+      score = static_cast<double>(level_bytes) / MaxBytesForLevel(level);
+    }
+
+    if (score > best_score) {
+      best_level = level;
+      best_score = score;
+    }
+  }
+
+  v->compaction_level_ = best_level;
+  v->compaction_score_ = best_score;
+}
+
+Status VersionSet::WriteSnapshot(log::Writer* log) {
+  // TODO: Break up into multiple records to reduce memory usage on recovery?
+
+  // Save metadata
+  VersionEdit edit;
+  edit.SetComparatorName(icmp_.user_comparator()->Name());
+
+  // Save compaction pointers
+  for (int level = 0; level < config::kNumLevels; level++) {
+    if (!compact_pointer_[level].empty()) {
+      InternalKey key;
+      key.DecodeFrom(compact_pointer_[level]);
+      edit.SetCompactPointer(level, key);
+    }
+  }
+
+  // Save files
+  for (int level = 0; level < config::kNumLevels; level++) {
+    const std::vector<FileMetaData*>& files = current_->files_[level];
+    for (size_t i = 0; i < files.size(); i++) {
+      const FileMetaData* f = files[i];
+      edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest);
+    }
+  }
+
+  std::string record;
+  edit.EncodeTo(&record);
+  return log->AddRecord(record);
+}
+
+int VersionSet::NumLevelFiles(int level) const {
+  assert(level >= 0);
+  assert(level < config::kNumLevels);
+  return current_->files_[level].size();
+}
+
+const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const {
+  // Update code if kNumLevels changes
+  assert(config::kNumLevels == 7);
+  snprintf(scratch->buffer, sizeof(scratch->buffer),
+           "files[ %d %d %d %d %d %d %d ]",
+           int(current_->files_[0].size()),
+           int(current_->files_[1].size()),
+           int(current_->files_[2].size()),
+           int(current_->files_[3].size()),
+           int(current_->files_[4].size()),
+           int(current_->files_[5].size()),
+           int(current_->files_[6].size()));
+  return scratch->buffer;
+}
+
+uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
+  uint64_t result = 0;
+  for (int level = 0; level < config::kNumLevels; level++) {
+    const std::vector<FileMetaData*>& files = v->files_[level];
+    for (size_t i = 0; i < files.size(); i++) {
+      if (icmp_.Compare(files[i]->largest, ikey) <= 0) {
+        // Entire file is before "ikey", so just add the file size
+        result += files[i]->file_size;
+      } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) {
+        // Entire file is after "ikey", so ignore
+        if (level > 0) {
+          // Files other than level 0 are sorted by meta->smallest, so
+          // no further files in this level will contain data for
+          // "ikey".
+          break;
+        }
+      } else {
+        // "ikey" falls in the range for this table.  Add the
+        // approximate offset of "ikey" within the table.
+        Table* tableptr;
+        Iterator* iter = table_cache_->NewIterator(
+            ReadOptions(), files[i]->number, files[i]->file_size, &tableptr);
+        if (tableptr != NULL) {
+          result += tableptr->ApproximateOffsetOf(ikey.Encode());
+        }
+        delete iter;
+      }
+    }
+  }
+  return result;
+}
+
+void VersionSet::AddLiveFiles(std::set<uint64_t>* live) {
+  for (Version* v = dummy_versions_.next_;
+       v != &dummy_versions_;
+       v = v->next_) {
+    for (int level = 0; level < config::kNumLevels; level++) {
+      const std::vector<FileMetaData*>& files = v->files_[level];
+      for (size_t i = 0; i < files.size(); i++) {
+        live->insert(files[i]->number);
+      }
+    }
+  }
+}
+
+int64_t VersionSet::NumLevelBytes(int level) const {
+  assert(level >= 0);
+  assert(level < config::kNumLevels);
+  return TotalFileSize(current_->files_[level]);
+}
+
+int64_t VersionSet::MaxNextLevelOverlappingBytes() {
+  int64_t result = 0;
+  std::vector<FileMetaData*> overlaps;
+  for (int level = 1; level < config::kNumLevels - 1; level++) {
+    for (size_t i = 0; i < current_->files_[level].size(); i++) {
+      const FileMetaData* f = current_->files_[level][i];
+      current_->GetOverlappingInputs(level+1, &f->smallest, &f->largest,
+                                     &overlaps);
+      const int64_t sum = TotalFileSize(overlaps);
+      if (sum > result) {
+        result = sum;
+      }
+    }
+  }
+  return result;
+}
+
+// Stores the minimal range that covers all entries in inputs in
+// *smallest, *largest.
+// REQUIRES: inputs is not empty
+void VersionSet::GetRange(const std::vector<FileMetaData*>& inputs,
+                          InternalKey* smallest,
+                          InternalKey* largest) {
+  assert(!inputs.empty());
+  smallest->Clear();
+  largest->Clear();
+  for (size_t i = 0; i < inputs.size(); i++) {
+    FileMetaData* f = inputs[i];
+    if (i == 0) {
+      *smallest = f->smallest;
+      *largest = f->largest;
+    } else {
+      if (icmp_.Compare(f->smallest, *smallest) < 0) {
+        *smallest = f->smallest;
+      }
+      if (icmp_.Compare(f->largest, *largest) > 0) {
+        *largest = f->largest;
+      }
+    }
+  }
+}
+
+// Stores the minimal range that covers all entries in inputs1 and inputs2
+// in *smallest, *largest.
+// REQUIRES: inputs is not empty
+void VersionSet::GetRange2(const std::vector<FileMetaData*>& inputs1,
+                           const std::vector<FileMetaData*>& inputs2,
+                           InternalKey* smallest,
+                           InternalKey* largest) {
+  std::vector<FileMetaData*> all = inputs1;
+  all.insert(all.end(), inputs2.begin(), inputs2.end());
+  GetRange(all, smallest, largest);
+}
+
+Iterator* VersionSet::MakeInputIterator(Compaction* c) {
+  ReadOptions options;
+  options.verify_checksums = options_->paranoid_checks;
+  options.fill_cache = false;
+
+  // Level-0 files have to be merged together.  For other levels,
+  // we will make a concatenating iterator per level.
+  // TODO(opt): use concatenating iterator for level-0 if there is no overlap
+  const int space = (c->level() == 0 ? c->inputs_[0].size() + 1 : 2);
+  Iterator** list = new Iterator*[space];
+  int num = 0;
+  for (int which = 0; which < 2; which++) {
+    if (!c->inputs_[which].empty()) {
+      if (c->level() + which == 0) {
+        const std::vector<FileMetaData*>& files = c->inputs_[which];
+        for (size_t i = 0; i < files.size(); i++) {
+          list[num++] = table_cache_->NewIterator(
+              options, files[i]->number, files[i]->file_size);
+        }
+      } else {
+        // Create concatenating iterator for the files from this level
+        list[num++] = NewTwoLevelIterator(
+            new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]),
+            &GetFileIterator, table_cache_, options);
+      }
+    }
+  }
+  assert(num <= space);
+  Iterator* result = NewMergingIterator(&icmp_, list, num);
+  delete[] list;
+  return result;
+}
+
+Compaction* VersionSet::PickCompaction() {
+  Compaction* c;
+  int level;
+
+  // We prefer compactions triggered by too much data in a level over
+  // the compactions triggered by seeks.
+  const bool size_compaction = (current_->compaction_score_ >= 1);
+  const bool seek_compaction = (current_->file_to_compact_ != NULL);
+  if (size_compaction) {
+    level = current_->compaction_level_;
+    assert(level >= 0);
+    assert(level+1 < config::kNumLevels);
+    c = new Compaction(level);
+
+    // Pick the first file that comes after compact_pointer_[level]
+    for (size_t i = 0; i < current_->files_[level].size(); i++) {
+      FileMetaData* f = current_->files_[level][i];
+      if (compact_pointer_[level].empty() ||
+          icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) {
+        c->inputs_[0].push_back(f);
+        break;
+      }
+    }
+    if (c->inputs_[0].empty()) {
+      // Wrap-around to the beginning of the key space
+      c->inputs_[0].push_back(current_->files_[level][0]);
+    }
+  } else if (seek_compaction) {
+    level = current_->file_to_compact_level_;
+    c = new Compaction(level);
+    c->inputs_[0].push_back(current_->file_to_compact_);
+  } else {
+    return NULL;
+  }
+
+  c->input_version_ = current_;
+  c->input_version_->Ref();
+
+  // Files in level 0 may overlap each other, so pick up all overlapping ones
+  if (level == 0) {
+    InternalKey smallest, largest;
+    GetRange(c->inputs_[0], &smallest, &largest);
+    // Note that the next call will discard the file we placed in
+    // c->inputs_[0] earlier and replace it with an overlapping set
+    // which will include the picked file.
+    current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]);
+    assert(!c->inputs_[0].empty());
+  }
+
+  SetupOtherInputs(c);
+
+  return c;
+}
+
+void VersionSet::SetupOtherInputs(Compaction* c) {
+  const int level = c->level();
+  InternalKey smallest, largest;
+  GetRange(c->inputs_[0], &smallest, &largest);
+
+  current_->GetOverlappingInputs(level+1, &smallest, &largest, &c->inputs_[1]);
+
+  // Get entire range covered by compaction
+  InternalKey all_start, all_limit;
+  GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+
+  // See if we can grow the number of inputs in "level" without
+  // changing the number of "level+1" files we pick up.
+  if (!c->inputs_[1].empty()) {
+    std::vector<FileMetaData*> expanded0;
+    current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0);
+    const int64_t inputs0_size = TotalFileSize(c->inputs_[0]);
+    const int64_t inputs1_size = TotalFileSize(c->inputs_[1]);
+    const int64_t expanded0_size = TotalFileSize(expanded0);
+    if (expanded0.size() > c->inputs_[0].size() &&
+        inputs1_size + expanded0_size < kExpandedCompactionByteSizeLimit) {
+      InternalKey new_start, new_limit;
+      GetRange(expanded0, &new_start, &new_limit);
+      std::vector<FileMetaData*> expanded1;
+      current_->GetOverlappingInputs(level+1, &new_start, &new_limit,
+                                     &expanded1);
+      if (expanded1.size() == c->inputs_[1].size()) {
+        Log(options_->info_log,
+            "Expanding@%d %d+%d (%ld+%ld bytes) to %d+%d (%ld+%ld bytes)\n",
+            level,
+            int(c->inputs_[0].size()),
+            int(c->inputs_[1].size()),
+            long(inputs0_size), long(inputs1_size),
+            int(expanded0.size()),
+            int(expanded1.size()),
+            long(expanded0_size), long(inputs1_size));
+        smallest = new_start;
+        largest = new_limit;
+        c->inputs_[0] = expanded0;
+        c->inputs_[1] = expanded1;
+        GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+      }
+    }
+  }
+
+  // Compute the set of grandparent files that overlap this compaction
+  // (parent == level+1; grandparent == level+2)
+  if (level + 2 < config::kNumLevels) {
+    current_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
+                                   &c->grandparents_);
+  }
+
+  if (false) {
+    Log(options_->info_log, "Compacting %d '%s' .. '%s'",
+        level,
+        smallest.DebugString().c_str(),
+        largest.DebugString().c_str());
+  }
+
+  // Update the place where we will do the next compaction for this level.
+  // We update this immediately instead of waiting for the VersionEdit
+  // to be applied so that if the compaction fails, we will try a different
+  // key range next time.
+  compact_pointer_[level] = largest.Encode().ToString();
+  c->edit_.SetCompactPointer(level, largest);
+}
+
+Compaction* VersionSet::CompactRange(
+    int level,
+    const InternalKey* begin,
+    const InternalKey* end) {
+  std::vector<FileMetaData*> inputs;
+  current_->GetOverlappingInputs(level, begin, end, &inputs);
+  if (inputs.empty()) {
+    return NULL;
+  }
+
+  // Avoid compacting too much in one shot in case the range is large.
+  const uint64_t limit = MaxFileSizeForLevel(level);
+  uint64_t total = 0;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    uint64_t s = inputs[i]->file_size;
+    total += s;
+    if (total >= limit) {
+      inputs.resize(i + 1);
+      break;
+    }
+  }
+
+  Compaction* c = new Compaction(level);
+  c->input_version_ = current_;
+  c->input_version_->Ref();
+  c->inputs_[0] = inputs;
+  SetupOtherInputs(c);
+  return c;
+}
+
+Compaction::Compaction(int level)
+    : level_(level),
+      max_output_file_size_(MaxFileSizeForLevel(level)),
+      input_version_(NULL),
+      grandparent_index_(0),
+      seen_key_(false),
+      overlapped_bytes_(0) {
+  for (int i = 0; i < config::kNumLevels; i++) {
+    level_ptrs_[i] = 0;
+  }
+}
+
+Compaction::~Compaction() {
+  if (input_version_ != NULL) {
+    input_version_->Unref();
+  }
+}
+
+bool Compaction::IsTrivialMove() const {
+  // Avoid a move if there is lots of overlapping grandparent data.
+  // Otherwise, the move could create a parent file that will require
+  // a very expensive merge later on.
+  return (num_input_files(0) == 1 &&
+          num_input_files(1) == 0 &&
+          TotalFileSize(grandparents_) <= kMaxGrandParentOverlapBytes);
+}
+
+void Compaction::AddInputDeletions(VersionEdit* edit) {
+  for (int which = 0; which < 2; which++) {
+    for (size_t i = 0; i < inputs_[which].size(); i++) {
+      edit->DeleteFile(level_ + which, inputs_[which][i]->number);
+    }
+  }
+}
+
+bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
+  // Maybe use binary search to find right entry instead of linear search?
+  const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator();
+  for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) {
+    const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
+    for (; level_ptrs_[lvl] < files.size(); ) {
+      FileMetaData* f = files[level_ptrs_[lvl]];
+      if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
+        // We've advanced far enough
+        if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
+          // Key falls in this file's range, so definitely not base level
+          return false;
+        }
+        break;
+      }
+      level_ptrs_[lvl]++;
+    }
+  }
+  return true;
+}
+
+bool Compaction::ShouldStopBefore(const Slice& internal_key) {
+  // Scan to find earliest grandparent file that contains key.
+  const InternalKeyComparator* icmp = &input_version_->vset_->icmp_;
+  while (grandparent_index_ < grandparents_.size() &&
+      icmp->Compare(internal_key,
+                    grandparents_[grandparent_index_]->largest.Encode()) > 0) {
+    if (seen_key_) {
+      overlapped_bytes_ += grandparents_[grandparent_index_]->file_size;
+    }
+    grandparent_index_++;
+  }
+  seen_key_ = true;
+
+  if (overlapped_bytes_ > kMaxGrandParentOverlapBytes) {
+    // Too much overlap for current output; start new output
+    overlapped_bytes_ = 0;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+void Compaction::ReleaseInputs() {
+  if (input_version_ != NULL) {
+    input_version_->Unref();
+    input_version_ = NULL;
+  }
+}
+
+}  // namespace leveldb
diff --git a/src/leveldb/db/version_set.h b/src/leveldb/db/version_set.h
new file mode 100644
index 0000000000..61c4c99a08
--- /dev/null
+++ b/src/leveldb/db/version_set.h
@@ -0,0 +1,379 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The representation of a DBImpl consists of a set of Versions.  The
+// newest version is called "current".  Older versions may be kept
+// around to provide a consistent view to live iterators.
+//
+// Each Version keeps track of a set of Table files per level.  The
+// entire set of versions is maintained in a VersionSet.
+//
+// Version,VersionSet are thread-compatible, but require external
+// synchronization on all accesses.
+
+#ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_
+#define STORAGE_LEVELDB_DB_VERSION_SET_H_
+
+#include <map>
+#include <set>
+#include <vector>
+#include "db/dbformat.h"
+#include "db/version_edit.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+namespace log { class Writer; }
+
+class Compaction;
+class Iterator;
+class MemTable;
+class TableBuilder;
+class TableCache;
+class Version;
+class VersionSet;
+class WritableFile;
+
+// Return the smallest index i such that files[i]->largest >= key.
+// Return files.size() if there is no such file.
+// REQUIRES: "files" contains a sorted list of non-overlapping files.
+extern int FindFile(const InternalKeyComparator& icmp,
+                    const std::vector<FileMetaData*>& files,
+                    const Slice& key);
+
+// Returns true iff some file in "files" overlaps the user key range
+// [*smallest,*largest].
+// smallest==NULL represents a key smaller than all keys in the DB.
+// largest==NULL represents a key largest than all keys in the DB.
+// REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges
+//           in sorted order.
+extern bool SomeFileOverlapsRange(
+    const InternalKeyComparator& icmp,
+    bool disjoint_sorted_files,
+    const std::vector<FileMetaData*>& files,
+    const Slice* smallest_user_key,
+    const Slice* largest_user_key);
+
+class Version {
+ public:
+  // Append to *iters a sequence of iterators that will
+  // yield the contents of this Version when merged together.
+  // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+  void AddIterators(const ReadOptions&, std::vector<Iterator*>* iters);
+
+  // Lookup the value for key.  If found, store it in *val and
+  // return OK.  Else return a non-OK status.  Fills *stats.
+  // REQUIRES: lock is not held
+  struct GetStats {
+    FileMetaData* seek_file;
+    int seek_file_level;
+  };
+  Status Get(const ReadOptions&, const LookupKey& key, std::string* val,
+             GetStats* stats);
+
+  // Adds "stats" into the current state.  Returns true if a new
+  // compaction may need to be triggered, false otherwise.
+  // REQUIRES: lock is held
+  bool UpdateStats(const GetStats& stats);
+
+  // Reference count management (so Versions do not disappear out from
+  // under live iterators)
+  void Ref();
+  void Unref();
+
+  void GetOverlappingInputs(
+      int level,
+      const InternalKey* begin,         // NULL means before all keys
+      const InternalKey* end,           // NULL means after all keys
+      std::vector<FileMetaData*>* inputs);
+
+  // Returns true iff some file in the specified level overlaps
+  // some part of [*smallest_user_key,*largest_user_key].
+  // smallest_user_key==NULL represents a key smaller than all keys in the DB.
+  // largest_user_key==NULL represents a key largest than all keys in the DB.
+  bool OverlapInLevel(int level,
+                      const Slice* smallest_user_key,
+                      const Slice* largest_user_key);
+
+  // Return the level at which we should place a new memtable compaction
+  // result that covers the range [smallest_user_key,largest_user_key].
+  int PickLevelForMemTableOutput(const Slice& smallest_user_key,
+                                 const Slice& largest_user_key);
+
+  int NumFiles(int level) const { return files_[level].size(); }
+
+  // Return a human readable string that describes this version's contents.
+  std::string DebugString() const;
+
+ private:
+  friend class Compaction;
+  friend class VersionSet;
+
+  class LevelFileNumIterator;
+  Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;
+
+  VersionSet* vset_;            // VersionSet to which this Version belongs
+  Version* next_;               // Next version in linked list
+  Version* prev_;               // Previous version in linked list
+  int refs_;                    // Number of live refs to this version
+
+  // List of files per level
+  std::vector<FileMetaData*> files_[config::kNumLevels];
+
+  // Next file to compact based on seek stats.
+  FileMetaData* file_to_compact_;
+  int file_to_compact_level_;
+
+  // Level that should be compacted next and its compaction score.
+  // Score < 1 means compaction is not strictly needed.  These fields
+  // are initialized by Finalize().
+  double compaction_score_;
+  int compaction_level_;
+
+  explicit Version(VersionSet* vset)
+      : vset_(vset), next_(this), prev_(this), refs_(0),
+        file_to_compact_(NULL),
+        file_to_compact_level_(-1),
+        compaction_score_(-1),
+        compaction_level_(-1) {
+  }
+
+  ~Version();
+
+  // No copying allowed
+  Version(const Version&);
+  void operator=(const Version&);
+};
+
+class VersionSet {
+ public:
+  VersionSet(const std::string& dbname,
+             const Options* options,
+             TableCache* table_cache,
+             const InternalKeyComparator*);
+  ~VersionSet();
+
+  // Apply *edit to the current version to form a new descriptor that
+  // is both saved to persistent state and installed as the new
+  // current version.  Will release *mu while actually writing to the file.
+  // REQUIRES: *mu is held on entry.
+  // REQUIRES: no other thread concurrently calls LogAndApply()
+  Status LogAndApply(VersionEdit* edit, port::Mutex* mu);
+
+  // Recover the last saved descriptor from persistent storage.
+  Status Recover();
+
+  // Return the current version.
+  Version* current() const { return current_; }
+
+  // Return the current manifest file number
+  uint64_t ManifestFileNumber() const { return manifest_file_number_; }
+
+  // Allocate and return a new file number
+  uint64_t NewFileNumber() { return next_file_number_++; }
+
+  // Arrange to reuse "file_number" unless a newer file number has
+  // already been allocated.
+  // REQUIRES: "file_number" was returned by a call to NewFileNumber().
+  void ReuseFileNumber(uint64_t file_number) {
+    if (next_file_number_ == file_number + 1) {
+      next_file_number_ = file_number;
+    }
+  }
+
+  // Return the number of Table files at the specified level.
+  int NumLevelFiles(int level) const;
+
+  // Return the combined file size of all files at the specified level.
+  int64_t NumLevelBytes(int level) const;
+
+  // Return the last sequence number.
+  uint64_t LastSequence() const { return last_sequence_; }
+
+  // Set the last sequence number to s.
+  void SetLastSequence(uint64_t s) {
+    assert(s >= last_sequence_);
+    last_sequence_ = s;
+  }
+
+  // Mark the specified file number as used.
+  void MarkFileNumberUsed(uint64_t number);
+
+  // Return the current log file number.
+  uint64_t LogNumber() const { return log_number_; }
+
+  // Return the log file number for the log file that is currently
+  // being compacted, or zero if there is no such log file.
+  uint64_t PrevLogNumber() const { return prev_log_number_; }
+
+  // Pick level and inputs for a new compaction.
+  // Returns NULL if there is no compaction to be done.
+  // Otherwise returns a pointer to a heap-allocated object that
+  // describes the compaction.  Caller should delete the result.
+  Compaction* PickCompaction();
+
+  // Return a compaction object for compacting the range [begin,end] in
+  // the specified level.  Returns NULL if there is nothing in that
+  // level that overlaps the specified range.  Caller should delete
+  // the result.
+  Compaction* CompactRange(
+      int level,
+      const InternalKey* begin,
+      const InternalKey* end);
+
+  // Return the maximum overlapping data (in bytes) at next level for any
+  // file at a level >= 1.
+  int64_t MaxNextLevelOverlappingBytes();
+
+  // Create an iterator that reads over the compaction inputs for "*c".
+  // The caller should delete the iterator when no longer needed.
+  Iterator* MakeInputIterator(Compaction* c);
+
+  // Returns true iff some level needs a compaction.
+  bool NeedsCompaction() const {
+    Version* v = current_;
+    return (v->compaction_score_ >= 1) || (v->file_to_compact_ != NULL);
+  }
+
+  // Add all files listed in any live version to *live.
+  // May also mutate some internal state.
+  void AddLiveFiles(std::set<uint64_t>* live);
+
+  // Return the approximate offset in the database of the data for
+  // "key" as of version "v".
+  uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
+
+  // Return a human-readable short (single-line) summary of the number
+  // of files per level.  Uses *scratch as backing store.
+  struct LevelSummaryStorage {
+    char buffer[100];
+  };
+  const char* LevelSummary(LevelSummaryStorage* scratch) const;
+
+ private:
+  class Builder;
+
+  friend class Compaction;
+  friend class Version;
+
+  void Finalize(Version* v);
+
+  void GetRange(const std::vector<FileMetaData*>& inputs,
+                InternalKey* smallest,
+                InternalKey* largest);
+
+  void GetRange2(const std::vector<FileMetaData*>& inputs1,
+                 const std::vector<FileMetaData*>& inputs2,
+                 InternalKey* smallest,
+                 InternalKey* largest);
+
+  void SetupOtherInputs(Compaction* c);
+
+  // Save current contents to *log
+  Status WriteSnapshot(log::Writer* log);
+
+  void AppendVersion(Version* v);
+
+  Env* const env_;
+  const std::string dbname_;
+  const Options* const options_;
+  TableCache* const table_cache_;
+  const InternalKeyComparator icmp_;
+  uint64_t next_file_number_;
+  uint64_t manifest_file_number_;
+  uint64_t last_sequence_;
+  uint64_t log_number_;
+  uint64_t prev_log_number_;  // 0 or backing store for memtable being compacted
+
+  // Opened lazily
+  WritableFile* descriptor_file_;
+  log::Writer* descriptor_log_;
+  Version dummy_versions_;  // Head of circular doubly-linked list of versions.
+  Version* current_;        // == dummy_versions_.prev_
+
+  // Per-level key at which the next compaction at that level should start.
+  // Either an empty string, or a valid InternalKey.
+  std::string compact_pointer_[config::kNumLevels];
+
+  // No copying allowed
+  VersionSet(const VersionSet&);
+  void operator=(const VersionSet&);
+};
+
+// A Compaction encapsulates information about a compaction.
+class Compaction {
+ public:
+  ~Compaction();
+
+  // Return the level that is being compacted.  Inputs from "level"
+  // and "level+1" will be merged to produce a set of "level+1" files.
+  int level() const { return level_; }
+
+  // Return the object that holds the edits to the descriptor done
+  // by this compaction.
+  VersionEdit* edit() { return &edit_; }
+
+  // "which" must be either 0 or 1
+  int num_input_files(int which) const { return inputs_[which].size(); }
+
+  // Return the ith input file at "level()+which" ("which" must be 0 or 1).
+  FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
+
+  // Maximum size of files to build during this compaction.
+  uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
+
+  // Is this a trivial compaction that can be implemented by just
+  // moving a single input file to the next level (no merging or splitting)
+  bool IsTrivialMove() const;
+
+  // Add all inputs to this compaction as delete operations to *edit.
+  void AddInputDeletions(VersionEdit* edit);
+
+  // Returns true if the information we have available guarantees that
+  // the compaction is producing data in "level+1" for which no data exists
+  // in levels greater than "level+1".
+  bool IsBaseLevelForKey(const Slice& user_key);
+
+  // Returns true iff we should stop building the current output
+  // before processing "internal_key".
+  bool ShouldStopBefore(const Slice& internal_key);
+
+  // Release the input version for the compaction, once the compaction
+  // is successful.
+  void ReleaseInputs();
+
+ private:
+  friend class Version;
+  friend class VersionSet;
+
+  explicit Compaction(int level);
+
+  int level_;
+  uint64_t max_output_file_size_;
+  Version* input_version_;
+  VersionEdit edit_;
+
+  // Each compaction reads inputs from "level_" and "level_+1"
+  std::vector<FileMetaData*> inputs_[2];      // The two sets of inputs
+
+  // State used to check for number of of overlapping grandparent files
+  // (parent == level_ + 1, grandparent == level_ + 2)
+  std::vector<FileMetaData*> grandparents_;
+  size_t grandparent_index_;  // Index in grandparent_starts_
+  bool seen_key_;             // Some output key has been seen
+  int64_t overlapped_bytes_;  // Bytes of overlap between current output
+                              // and grandparent files
+
+  // State for implementing IsBaseLevelForKey
+
+  // level_ptrs_ holds indices into input_version_->levels_: our state
+  // is that we are positioned at one of the file ranges for each
+  // higher level than the ones involved in this compaction (i.e. for
+  // all L >= level_ + 2).
+  size_t level_ptrs_[config::kNumLevels];
+};
+
+}  // namespace leveldb
+
+#endif  // STORAGE_LEVELDB_DB_VERSION_SET_H_
diff --git a/src/leveldb/db/version_set_test.cc b/src/leveldb/db/version_set_test.cc
new file mode 100644
index 0000000000..501e34d133
--- /dev/null
+++ b/src/leveldb/db/version_set_test.cc
@@ -0,0 +1,179 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+class FindFileTest {
+ public:
+  std::vector<FileMetaData*> files_;
+  bool disjoint_sorted_files_;
+
+  FindFileTest() : disjoint_sorted_files_(true) { }
+
+  ~FindFileTest() {
+    for (int i = 0; i < files_.size(); i++) {
+      delete files_[i];
+    }
+  }
+
+  void Add(const char* smallest, const char* largest,
+           SequenceNumber smallest_seq = 100,
+           SequenceNumber largest_seq = 100) {
+    FileMetaData* f = new FileMetaData;
+    f->number = files_.size() + 1;
+    f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
+    f->largest = InternalKey(largest, largest_seq, kTypeValue);
+    files_.push_back(f);
+  }
+
+  int Find(const char* key) {
+    InternalKey target(key, 100, kTypeValue);
+    InternalKeyComparator cmp(BytewiseComparator());
+    return FindFile(cmp, files_, target.Encode());
+  }
+
+  bool Overlaps(const char* smallest, const char* largest) {
+    InternalKeyComparator cmp(BytewiseComparator());
+    Slice s(smallest != NULL ? smallest : "");
+    Slice l(largest != NULL ? largest : "");
+    return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_,
+                                 (smallest != NULL ? &s : NULL),
+                                 (largest != NULL ? &l : NULL));
+  }
+};
+
+TEST(FindFileTest, Empty) {
+  ASSERT_EQ(0, Find("foo"));
+  ASSERT_TRUE(! Overlaps("a", "z"));
+  ASSERT_TRUE(! Overlaps(NULL, "z"));
+  ASSERT_TRUE(! Overlaps("a", NULL));
+  ASSERT_TRUE(! Overlaps(NULL, NULL));
+}
+
+TEST(FindFileTest, Single) {
+  Add("p", "q");
+  ASSERT_EQ(0, Find("a"));
+  ASSERT_EQ(0, Find("p"));
+  ASSERT_EQ(0, Find("p1"));
+  ASSERT_EQ(0, Find("q"));
+  ASSERT_EQ(1, Find("q1"));
+  ASSERT_EQ(1, Find("z"));
+
+  ASSERT_TRUE(! Overlaps("a", "b"));
+  ASSERT_TRUE(! Overlaps("z1", "z2"));
+  ASSERT_TRUE(Overlaps("a", "p"));
+  ASSERT_TRUE(Overlaps("a", "q"));
+  ASSERT_TRUE(Overlaps("a", "z"));
+  ASSERT_TRUE(Overlaps("p", "p1"));
+  ASSERT_TRUE(Overlaps("p", "q"));
+  ASSERT_TRUE(Overlaps("p", "z"));
+  ASSERT_TRUE(Overlaps("p1", "p2"));
+  ASSERT_TRUE(Overlaps("p1", "z"));
+  ASSERT_TRUE(Overlaps("q", "q"));
+  ASSERT_TRUE(Overlaps("q", "q1"));
+
+  ASSERT_TRUE(! Overlaps(NULL, "j"));
+  ASSERT_TRUE(! Overlaps("r", NULL));
+  ASSERT_TRUE(Overlaps(NULL, "p"));
+  ASSERT_TRUE(Overlaps(NULL, "p1"));
+  ASSERT_TRUE(Overlaps("q", NULL));
+  ASSERT_TRUE(Overlaps(NULL, NULL));
+}
+
+
+TEST(FindFileTest, Multiple) {
+  Add("150", "200");
+  Add("200", "250");
+  Add("300", "350");
+  Add("400", "450");
+  ASSERT_EQ(0, Find("100"));
+  ASSERT_EQ(0, Find("150"));
+  ASSERT_EQ(0, Find("151"));
+  ASSERT_EQ(0, Find("199"));
+  ASSERT_EQ(0, Find("200"));
+  ASSERT_EQ(1, Find("201"));
+  ASSERT_EQ(1, Find("249"));
+  ASSERT_EQ(1, Find("250"));
+  ASSERT_EQ(2, Find("251"));
+  ASSERT_EQ(2, Find("299"));
+  ASSERT_EQ(2, Find("300"));
+  ASSERT_EQ(2, Find("349"));
+  ASSERT_EQ(2, Find("350"));
+  ASSERT_EQ(3, Find("351"));
+  ASSERT_EQ(3, Find("400"));
+  ASSERT_EQ(3, Find("450"));
+  ASSERT_EQ(4, Find("451"));
+
+  ASSERT_TRUE(! Overlaps("100", "149"));
+  ASSERT_TRUE(! Overlaps("251", "299"));
+  ASSERT_TRUE(! Overlaps("451", "500"));
+  ASSERT_TRUE(! Overlaps("351", "399"));
+
+  ASSERT_TRUE(Overlaps("100", "150"));
+  ASSERT_TRUE(Overlaps("100", "200"));
+  ASSERT_TRUE(Overlaps("100", "300"));
+  ASSERT_TRUE(Overlaps("100", "400"));
+  ASSERT_TRUE(Overlaps("100", "500"));
+  ASSERT_TRUE(Overlaps("375", "400"));
+  ASSERT_TRUE(Overlaps("450", "450"));
+  ASSERT_TRUE(Overlaps("450", "500"));
+}
+
+TEST(FindFileTest, MultipleNullBoundaries) {
+  Add("150", "200");
+  Add("200", "250");
+  Add("300", "350");
+  Add("400", "450");
+  ASSERT_TRUE(! Overlaps(NULL, "149"));
+  ASSERT_TRUE(! Overlaps("451", NULL));
+  ASSERT_TRUE(Overlaps(NULL, NULL));
+  ASSERT_TRUE(Overlaps(NULL, "150"));
+  ASSERT_TRUE(Overlaps(NULL, "199"));
+  ASSERT_TRUE(Overlaps(NULL, "200"));
+  ASSERT_TRUE(Overlaps(NULL, "201"));
+  ASSERT_TRUE(Overlaps(NULL, "400"));
+  ASSERT_TRUE(Overlaps(NULL, "800"));
+  ASSERT_TRUE(Overlaps("100", NULL));
+  ASSERT_TRUE(Overlaps("200", NULL));
+  ASSERT_TRUE(Overlaps("449", NULL));
+  ASSERT_TRUE(Overlaps("450", NULL));
+}
+
+TEST(FindFileTest, OverlapSequenceChecks) {
+  Add("200", "200", 5000, 3000);
+  ASSERT_TRUE(! Overlaps("199", "199"));
+  ASSERT_TRUE(! Overlaps("201", "300"));
+  ASSERT_TRUE(Overlaps("200", "200"));
+  ASSERT_TRUE(Overlaps("190", "200"));
+  ASSERT_TRUE(Overlaps("200", "210"));
+}
+
+TEST(FindFileTest, OverlappingFiles) {
+  Add("150", "600");
+  Add("400", "500");
+  disjoint_sorted_files_ = false;
+  ASSERT_TRUE(! Overlaps("100", "149"));
+  ASSERT_TRUE(! Overlaps("601", "700"));
+  ASSERT_TRUE(Overlaps("100", "150"));
+  ASSERT_TRUE(Overlaps("100", "200"));
+  ASSERT_TRUE(Overlaps("100", "300"));
+  ASSERT_TRUE(Overlaps("100", "400"));
+  ASSERT_TRUE(Overlaps("100", "500"));
+  ASSERT_TRUE(Overlaps("375", "400"));
+  ASSERT_TRUE(Overlaps("450", "450"));
+  ASSERT_TRUE(Overlaps("450", "500"));
+  ASSERT_TRUE(Overlaps("450", "700"));
+  ASSERT_TRUE(Overlaps("600", "700"));
+}
+
+}  // namespace leveldb
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}
diff --git a/src/leveldb/db/write_batch.cc b/src/leveldb/db/write_batch.cc
new file mode 100644
index 0000000000..33f4a4257e
--- /dev/null
+++ b/src/leveldb/db/write_batch.cc
@@ -0,0 +1,147 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch::rep_ :=
+//    sequence: fixed64
+//    count: fixed32
+//    data: record[count]
+// record :=
+//    kTypeValue varstring varstring         |
+//    kTypeDeletion varstring
+// varstring :=
+//    len: varint32
+//    data: uint8[len]
+
+#include "leveldb/write_batch.h"
+
+#include "leveldb/db.h"
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+// WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
+static const size_t kHeader = 12;
+
+WriteBatch::WriteBatch() {
+  Clear();
+}
+
+WriteBatch::~WriteBatch() { }
+
+WriteBatch::Handler::~Handler() { }
+
+void WriteBatch::Clear() {
+  rep_.clear();
+  rep_.resize(kHeader);
+}
+
+Status WriteBatch::Iterate(Handler* handler) const {
+  Slice input(rep_);
+  if (input.size() < kHeader) {
+    return Status::Corruption("malformed WriteBatch (too small)");
+  }
+
+  input.remove_prefix(kHeader);
+  Slice key, value;
+  int found = 0;
+  while (!input.empty()) {
+    found++;
+    char tag = input[0];
+    input.remove_prefix(1);
+    switch (tag) {
+      case kTypeValue:
+        if (GetLengthPrefixedSlice(&input, &key) &&
+            GetLengthPrefixedSlice(&input, &value)) {
+          handler->Put(key, value);
+        } else {
+          return Status::Corruption("bad WriteBatch Put");
+        }
+        break;
+      case kTypeDeletion:
+        if (GetLengthPrefixedSlice(&input, &key)) {
+          handler->Delete(key);
+        } else {
+          return Status::Corruption("bad WriteBatch Delete");
+        }
+        break;
+      default:
+        return Status::Corruption("unknown WriteBatch tag");
+    }
+  }
+  if (found != WriteBatchInternal::Count(this)) {
+    return Status::Corruption("WriteBatch has wrong count");
+  } else {
+    return Status::OK();
+  }
+}
+
+int WriteBatchInternal::Count(const WriteBatch* b) {
+  return DecodeFixed32(b->rep_.data() + 8);
+}
+
+void WriteBatchInternal::SetCount(WriteBatch* b, int n) {
+  EncodeFixed32(&b->rep_[8], n);
+}
+
+SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
+  return SequenceNumber(DecodeFixed64(b->rep_.data()));
+}
+
+void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
+  EncodeFixed64(&b->rep_[0], seq);
+}
+
+void WriteBatch::Put(const Slice& key, const Slice& value) {
+  WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
+  rep_.push_back(static_cast<char>(kTypeValue));
+  PutLengthPrefixedSlice(&rep_, key);
+  PutLengthPrefixedSlice(&rep_, value);
+}
+
+void WriteBatch::Delete(const Slice& key) {
+  WriteBatchInternal::SetCount(this, WriteBatchInternal::Count(this) + 1);
+  rep_.push_back(static_cast<char>(kTypeDeletion));
+  PutLengthPrefixedSlice(&rep_, key);
+}
+
+namespace {
+class MemTableInserter : public WriteBatch::Handler {
+ public:
+  SequenceNumber sequence_;
+  MemTable* mem_;
+
+  virtual void Put(const Slice& key, const Slice& value) {
+    mem_->Add(sequence_, kTypeValue, key, value);
+    sequence_++;
+  }
+  virtual void Delete(const Slice& key) {
+    mem_->Add(sequence_, kTypeDeletion, key, Slice());
+    sequence_++;
+  }
+};
+}  // namespace
+
+Status WriteBatchInternal::InsertInto(const WriteBatch* b,
+                                      MemTable* memtable) {
+  MemTableInserter inserter;
+  inserter.sequence_ = WriteBatchInternal::Sequence(b);
+  inserter.mem_ = memtable;
+  return b->Iterate(&inserter);
+}
+
+void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
+  assert(contents.size() >= kHeader);
+  b->rep_.assign(contents.data(), contents.size());
+}
+
+void WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src) {
+  SetCount(dst, Count(dst) + Count(src));
+  assert(src->rep_.size() >= kHeader);
+  dst->rep_.append(src->rep_.data() + kHeader, src->rep_.size() - kHeader);
+}
+
+}  // namespace leveldb
diff --git a/src/leveldb/db/write_batch_internal.h b/src/leveldb/db/write_batch_internal.h
new file mode 100644
index 0000000000..4423a7f318
--- /dev/null
+++ b/src/leveldb/db/write_batch_internal.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
+#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
+
+#include "leveldb/write_batch.h"
+
+namespace leveldb {
+
+class MemTable;
+
+// WriteBatchInternal provides static methods for manipulating a
+// WriteBatch that we don't want in the public WriteBatch interface.
+class WriteBatchInternal {
+ public:
+  // Return the number of entries in the batch.
+  static int Count(const WriteBatch* batch);
+
+  // Set the count for the number of entries in the batch.
+  static void SetCount(WriteBatch* batch, int n);
+
+  // Return the seqeunce number for the start of this batch.
+  static SequenceNumber Sequence(const WriteBatch* batch);
+
+  // Store the specified number as the seqeunce number for the start of
+  // this batch.
+  static void SetSequence(WriteBatch* batch, SequenceNumber seq);
+
+  static Slice Contents(const WriteBatch* batch) {
+    return Slice(batch->rep_);
+  }
+
+  static size_t ByteSize(const WriteBatch* batch) {
+    return batch->rep_.size();
+  }
+
+  static void SetContents(WriteBatch* batch, const Slice& contents);
+
+  static Status InsertInto(const WriteBatch* batch, MemTable* memtable);
+
+  static void Append(WriteBatch* dst, const WriteBatch* src);
+};
+
+}  // namespace leveldb
+
+
+#endif  // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
diff --git a/src/leveldb/db/write_batch_test.cc b/src/leveldb/db/write_batch_test.cc
new file mode 100644
index 0000000000..9064e3d85e
--- /dev/null
+++ b/src/leveldb/db/write_batch_test.cc
@@ -0,0 +1,120 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "leveldb/db.h"
+
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "leveldb/env.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+static std::string PrintContents(WriteBatch* b) {
+  InternalKeyComparator cmp(BytewiseComparator());
+  MemTable* mem = new MemTable(cmp);
+  mem->Ref();
+  std::string state;
+  Status s = WriteBatchInternal::InsertInto(b, mem);
+  int count = 0;
+  Iterator* iter = mem->NewIterator();
+  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+    ParsedInternalKey ikey;
+    ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
+    switch (ikey.type) {
+      case kTypeValue:
+        state.append("Put(");
+        state.append(ikey.user_key.ToString());
+        state.append(", ");
+        state.append(iter->value().ToString());
+        state.append(")");
+        count++;
+        break;
+      case kTypeDeletion:
+        state.append("Delete(");
+        state.append(ikey.user_key.ToString());
+        state.append(")");
+        count++;
+        break;
+    }
+    state.append("@");
+    state.append(NumberToString(ikey.sequence));
+  }
+  delete iter;
+  if (!s.ok()) {
+    state.append("ParseError()");
+  } else if (count != WriteBatchInternal::Count(b)) {
+    state.append("CountMismatch()");
+  }
+  mem->Unref();
+  return state;
+}
+
+class WriteBatchTest { };
+
+TEST(WriteBatchTest, Empty) {
+  WriteBatch batch;
+  ASSERT_EQ("", PrintContents(&batch));
+  ASSERT_EQ(0, WriteBatchInternal::Count(&batch));
+}
+
+TEST(WriteBatchTest, Multiple) {
+  WriteBatch batch;
+  batch.Put(Slice("foo"), Slice("bar"));
+  batch.Delete(Slice("box"));
+  batch.Put(Slice("baz"), Slice("boo"));
+  WriteBatchInternal::SetSequence(&batch, 100);
+  ASSERT_EQ(100, WriteBatchInternal::Sequence(&batch));
+  ASSERT_EQ(3, WriteBatchInternal::Count(&batch));
+  ASSERT_EQ("Put(baz, boo)@102"
+            "Delete(box)@101"
+            "Put(foo, bar)@100",
+            PrintContents(&batch));
+}
+
+TEST(WriteBatchTest, Corruption) {
+  WriteBatch batch;
+  batch.Put(Slice("foo"), Slice("bar"));
+  batch.Delete(Slice("box"));
+  WriteBatchInternal::SetSequence(&batch, 200);
+  Slice contents = WriteBatchInternal::Contents(&batch);
+  WriteBatchInternal::SetContents(&batch,
+                                  Slice(contents.data(),contents.size()-1));
+  ASSERT_EQ("Put(foo, bar)@200"
+            "ParseError()",
+            PrintContents(&batch));
+}
+
+TEST(WriteBatchTest, Append) {
+  WriteBatch b1, b2;
+  WriteBatchInternal::SetSequence(&b1, 200);
+  WriteBatchInternal::SetSequence(&b2, 300);
+  WriteBatchInternal::Append(&b1, &b2);
+  ASSERT_EQ("",
+            PrintContents(&b1));
+  b2.Put("a", "va");
+  WriteBatchInternal::Append(&b1, &b2);
+  ASSERT_EQ("Put(a, va)@200",
+            PrintContents(&b1));
+  b2.Clear();
+  b2.Put("b", "vb");
+  WriteBatchInternal::Append(&b1, &b2);
+  ASSERT_EQ("Put(a, va)@200"
+            "Put(b, vb)@201",
+            PrintContents(&b1));
+  b2.Delete("foo");
+  WriteBatchInternal::Append(&b1, &b2);
+  ASSERT_EQ("Put(a, va)@200"
+            "Put(b, vb)@202"
+            "Put(b, vb)@201"
+            "Delete(foo)@203",
+            PrintContents(&b1));
+}
+
+}  // namespace leveldb
+
+int main(int argc, char** argv) {
+  return leveldb::test::RunAllTests();
+}