summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjorlow@chromium.org <jorlow@chromium.org@62dab493-f737-651d-591e-8d6aee1b9529>2011-03-18 22:37:00 +0000
committerjorlow@chromium.org <jorlow@chromium.org@62dab493-f737-651d-591e-8d6aee1b9529>2011-03-18 22:37:00 +0000
commitf67e15e50f392625b4097caf22e8be1b0fe96013 (patch)
tree1cb1764c7627f9bac27ed0e0abf27010156e5007
parent54f1fd7eef101db1dfb2bb66a59083c45a38aa4a (diff)
downloadleveldb-f67e15e50f392625b4097caf22e8be1b0fe96013.tar.gz
Initial checkin.
git-svn-id: https://leveldb.googlecode.com/svn/trunk@2 62dab493-f737-651d-591e-8d6aee1b9529
-rw-r--r--AUTHORS8
-rw-r--r--Android.mk64
-rw-r--r--Application.mk6
-rw-r--r--LICENSE27
-rw-r--r--Makefile134
-rw-r--r--README51
-rw-r--r--TODO23
-rw-r--r--db/builder.cc97
-rw-r--r--db/builder.h36
-rw-r--r--db/corruption_test.cc366
-rw-r--r--db/db_bench.cc376
-rw-r--r--db/db_impl.cc1195
-rw-r--r--db/db_impl.h192
-rw-r--r--db/db_iter.cc412
-rw-r--r--db/db_iter.h26
-rw-r--r--db/db_test.cc963
-rw-r--r--db/dbformat.cc152
-rw-r--r--db/dbformat.h198
-rw-r--r--db/dbformat_test.cc127
-rw-r--r--db/filename.cc154
-rw-r--r--db/filename.h92
-rw-r--r--db/filename_test.cc156
-rw-r--r--db/log_format.h35
-rw-r--r--db/log_reader.cc172
-rw-r--r--db/log_reader.h75
-rw-r--r--db/log_test.cc361
-rw-r--r--db/log_writer.cc101
-rw-r--r--db/log_writer.h48
-rw-r--r--db/memtable.cc109
-rw-r--r--db/memtable.h69
-rw-r--r--db/repair.cc396
-rw-r--r--db/skiplist.h378
-rw-r--r--db/skiplist_test.cc378
-rw-r--r--db/snapshot.h66
-rw-r--r--db/table_cache.cc94
-rw-r--r--db/table_cache.h49
-rw-r--r--db/version_edit.cc282
-rw-r--r--db/version_edit.h118
-rw-r--r--db/version_edit_test.cc50
-rw-r--r--db/version_set.cc1003
-rw-r--r--db/version_set.h290
-rw-r--r--db/write_batch.cc164
-rw-r--r--db/write_batch_internal.h73
-rw-r--r--db/write_batch_test.cc110
-rw-r--r--doc/doc.css89
-rw-r--r--doc/impl.html222
-rw-r--r--doc/index.html508
-rw-r--r--doc/log_format.txt72
-rw-r--r--doc/table_format.txt61
-rw-r--r--include/cache.h99
-rw-r--r--include/comparator.h61
-rw-r--r--include/db.h137
-rw-r--r--include/env.h293
-rw-r--r--include/iterator.h95
-rw-r--r--include/options.h203
-rw-r--r--include/slice.h104
-rw-r--r--include/status.h86
-rw-r--r--include/table.h67
-rw-r--r--include/table_builder.h86
-rw-r--r--include/write_batch.h49
-rw-r--r--leveldb.gyp329
-rw-r--r--port/README10
-rw-r--r--port/port.h21
-rw-r--r--port/port_android.cc65
-rw-r--r--port/port_android.h131
-rw-r--r--port/port_chromium.cc83
-rw-r--r--port/port_chromium.h104
-rw-r--r--port/port_example.h119
-rw-r--r--port/port_posix.cc50
-rw-r--r--port/port_posix.h108
-rw-r--r--port/sha1_portable.cc298
-rw-r--r--port/sha1_portable.h25
-rw-r--r--port/sha1_test.cc55
-rw-r--r--table/block.cc261
-rw-r--r--table/block.h43
-rw-r--r--table/block_builder.cc109
-rw-r--r--table/block_builder.h57
-rw-r--r--table/format.cc131
-rw-r--r--table/format.h103
-rw-r--r--table/iterator.cc68
-rw-r--r--table/iterator_wrapper.h64
-rw-r--r--table/merger.cc143
-rw-r--r--table/merger.h26
-rw-r--r--table/table.cc175
-rw-r--r--table/table_builder.cc224
-rw-r--r--table/table_test.cc808
-rw-r--r--table/two_level_iterator.cc182
-rw-r--r--table/two_level_iterator.h34
-rw-r--r--util/arena.cc68
-rw-r--r--util/arena.h68
-rw-r--r--util/arena_test.cc68
-rw-r--r--util/cache.cc253
-rw-r--r--util/cache_test.cc169
-rw-r--r--util/coding.cc194
-rw-r--r--util/coding.h104
-rw-r--r--util/coding_test.cc173
-rw-r--r--util/comparator.cc72
-rw-r--r--util/crc32c.cc332
-rw-r--r--util/crc32c.h45
-rw-r--r--util/crc32c_test.cc86
-rw-r--r--util/env.cc77
-rw-r--r--util/env_chromium.cc608
-rw-r--r--util/env_posix.cc609
-rw-r--r--util/env_test.cc102
-rw-r--r--util/hash.cc45
-rw-r--r--util/hash.h19
-rw-r--r--util/histogram.cc128
-rw-r--r--util/histogram.h41
-rw-r--r--util/logging.cc81
-rw-r--r--util/logging.h47
-rw-r--r--util/mutexlock.h39
-rw-r--r--util/options.cc29
-rw-r--r--util/random.h59
-rw-r--r--util/status.cc59
-rw-r--r--util/testharness.cc65
-rw-r--r--util/testharness.h129
-rw-r--r--util/testutil.cc51
-rw-r--r--util/testutil.h53
118 files changed, 19207 insertions, 0 deletions
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..27a9407
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,8 @@
+# Names should be added to this file like so:
+# Name or Organization <email address>
+
+Google Inc.
+
+# Initial version authors:
+Jeffrey Dean <jeff@google.com>
+Sanjay Ghemawat <sanjay@google.com>
diff --git a/Android.mk b/Android.mk
new file mode 100644
index 0000000..fa4a3de
--- /dev/null
+++ b/Android.mk
@@ -0,0 +1,64 @@
+# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+# INSTRUCTIONS
+# After you've downloaded and installed the Android NDK from:
+# http://developer.android.com/sdk/ndk/index.html
+# 1. In the same directory as this file, Android.mk, type:
+# $ ln -s leveldb ../jni
+# (The Android NDK will only build native projects in
+# subdirectories named "jni".)
+# 2. $ cd ..
+# 3. Execute ndk-build:
+# $ $(ANDROID_NDK_DIR)/ndk-build
+
+LOCAL_PATH := $(call my-dir)
+
+include $(CLEAR_VARS)
+LOCAL_MODULE := leveldb
+# Build flags:
+# - LEVELDB_PLATFORM_ANDROID to use the correct port header: port_android.h
+LOCAL_CFLAGS := -DLEVELDB_PLATFORM_ANDROID -std=gnu++0x
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/../../
+LOCAL_CPP_EXTENSION := .cc
+
+LOCAL_SRC_FILES := ./db/builder.cc \
+./db/db_bench.cc \
+./db/db_impl.cc \
+./db/db_iter.cc \
+./db/filename.cc \
+./db/dbformat.cc \
+./db/log_reader.cc \
+./db/log_writer.cc \
+./db/memtable.cc \
+./db/repair.cc \
+./db/table_cache.cc \
+./db/version_edit.cc \
+./db/version_set.cc \
+./db/write_batch.cc \
+./port/port_android.cc \
+./table/block.cc \
+./table/block_builder.cc \
+./table/format.cc \
+./table/iterator.cc \
+./table/merger.cc \
+./table/table.cc \
+./table/table_builder.cc \
+./table/two_level_iterator.cc \
+./util/arena.cc \
+./util/cache.cc \
+./util/coding.cc \
+./util/comparator.cc \
+./util/crc32c.cc \
+./util/env.cc \
+./util/env_posix.cc \
+./util/hash.cc \
+./util/histogram.cc \
+./util/logging.cc \
+./util/options.cc \
+./util/status.cc \
+./util/testharness.cc \
+./util/testutil.cc
+
+include $(BUILD_SHARED_LIBRARY)
diff --git a/Application.mk b/Application.mk
new file mode 100644
index 0000000..9360a38
--- /dev/null
+++ b/Application.mk
@@ -0,0 +1,6 @@
+# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+APP_ABI := armeabi-v7a
+APP_STL := gnustl_static
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..8e80208
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..a60b4de
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,134 @@
+# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+CC = g++
+
+# Uncomment one of the following to switch between debug and opt mode
+#OPT = -O2 -DNDEBUG
+OPT = -g2
+
+CFLAGS = -c -DLEVELDB_PLATFORM_POSIX -I. -std=c++0x $(OPT)
+
+LDFLAGS=-lpthread
+
+LIBOBJECTS = \
+ ./db/builder.o \
+ ./db/db_impl.o \
+ ./db/db_iter.o \
+ ./db/filename.o \
+ ./db/dbformat.o \
+ ./db/log_reader.o \
+ ./db/log_writer.o \
+ ./db/memtable.o \
+ ./db/repair.o \
+ ./db/table_cache.o \
+ ./db/version_edit.o \
+ ./db/version_set.o \
+ ./db/write_batch.o \
+ ./port/port_posix.o \
+ ./port/sha1_portable.o \
+ ./table/block.o \
+ ./table/block_builder.o \
+ ./table/format.o \
+ ./table/iterator.o \
+ ./table/merger.o \
+ ./table/table.o \
+ ./table/table_builder.o \
+ ./table/two_level_iterator.o \
+ ./util/arena.o \
+ ./util/cache.o \
+ ./util/coding.o \
+ ./util/comparator.o \
+ ./util/crc32c.o \
+ ./util/env.o \
+ ./util/env_posix.o \
+ ./util/hash.o \
+ ./util/histogram.o \
+ ./util/logging.o \
+ ./util/options.o \
+ ./util/status.o
+
+TESTUTIL = ./util/testutil.o
+TESTHARNESS = ./util/testharness.o $(TESTUTIL)
+
+TESTS = \
+ arena_test \
+ cache_test \
+ coding_test \
+ corruption_test \
+ crc32c_test \
+ db_test \
+ dbformat_test \
+ env_test \
+ filename_test \
+ log_test \
+ sha1_test \
+ skiplist_test \
+ table_test \
+ version_edit_test \
+ write_batch_test
+
+PROGRAMS = db_bench $(TESTS)
+
+all: $(PROGRAMS)
+
+check: $(TESTS)
+ for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done
+
+clean:
+ rm -f $(PROGRAMS) */*.o
+
+db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL)
+ $(CC) $(LDFLAGS) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) -o $@
+
+arena_test: util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS)
+ $(CC) $(LDFLAGS) util/arena_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
+
+cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS)
+ $(CC) $(LDFLAGS) util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
+
+coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS)
+ $(CC) $(LDFLAGS) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
+
+corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS)
+ $(CC) $(LDFLAGS) db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
+
+crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
+ $(CC) $(LDFLAGS) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
+
+db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS)
+ $(CC) $(LDFLAGS) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
+
+dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS)
+ $(CC) $(LDFLAGS) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
+
+env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS)
+ $(CC) $(LDFLAGS) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
+
+filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
+ $(CC) $(LDFLAGS) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
+
+log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
+ $(CC) $(LDFLAGS) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
+
+table_test: table/table_test.o $(LIBOBJECTS) $(TESTHARNESS)
+ $(CC) $(LDFLAGS) table/table_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
+
+sha1_test: port/sha1_test.o $(LIBOBJECTS) $(TESTHARNESS)
+ $(CC) $(LDFLAGS) port/sha1_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
+
+skiplist_test: db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS)
+ $(CC) $(LDFLAGS) db/skiplist_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
+
+version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS)
+ $(CC) $(LDFLAGS) db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
+
+write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS)
+ $(CC) $(LDFLAGS) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) -o $@
+
+.cc.o:
+ $(CC) $(CFLAGS) $< -o $@
+
+# TODO(gabor): dependencies for .o files
+# TODO(gabor): Build library
diff --git a/README b/README
new file mode 100644
index 0000000..c97e43c
--- /dev/null
+++ b/README
@@ -0,0 +1,51 @@
+leveldb: A key-value store
+Authors: Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com)
+
+The code under this directory implements a system for maintaining a
+persistent key/value store.
+
+See doc/index.html for more explanation.
+See doc/impl.html for a brief overview of the implementation.
+
+The public interface is in include/*.h. Callers should not include or
+rely on the details of any other header files in this package. Those
+internal APIs may be changed without warning.
+
+Guide to header files:
+
+include/db.h
+ Main interface to the DB: Start here
+
+include/options.h
+ Control over the behavior of an entire database, and also
+ control over the behavior of individual reads and writes.
+
+include/comparator.h
+ Abstraction for user-specified comparison function. If you want
+ just bytewise comparison of keys, you can use the default comparator,
+ but clients can write their own comparator implementations if they
+ want custom ordering (e.g. to handle different character
+ encodings, etc.)
+
+include/iterator.h
+ Interface for iterating over data. You can get an iterator
+ from a DB object.
+
+include/write_batch.h
+ Interface for atomically applying multiple updates to a database.
+
+include/slice.h
+ A simple module for maintaining a pointer and a length into some
+ other byte array.
+
+include/status.h
+ Status is returned from many of the public interfaces and is used
+ to report success and various kinds of errors.
+
+include/env.h
+ Abstraction of the OS environment. A posix implementation of
+ this interface is in util/env_posix.cc
+
+include/table.h
+include/table_builder.h
+ Lower-level modules that most clients probably won't use directly
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..7d60b5a
--- /dev/null
+++ b/TODO
@@ -0,0 +1,23 @@
+Before adding to chrome
+-----------------------
+- multi-threaded test/benchmark
+- Allow missing crc32c in Table format?
+
+Maybe afterwards
+----------------
+
+ss
+- Stats
+- Speed up backwards scan (avoid three passes over data)
+
+db
+- Maybe implement DB::BulkDeleteForRange(start_key, end_key)
+ that would blow away files whose ranges are entirely contained
+ within [start_key..end_key]? For Chrome, deletion of obsolete
+ object stores, etc. can be done in the background anyway, so
+ probably not that important.
+
+api changes?
+- Efficient large value reading and writing
+
+Faster Get implementation
diff --git a/db/builder.cc b/db/builder.cc
new file mode 100644
index 0000000..f3d0fe2
--- /dev/null
+++ b/db/builder.cc
@@ -0,0 +1,97 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/builder.h"
+
+#include "db/filename.h"
+#include "db/dbformat.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "include/db.h"
+#include "include/env.h"
+#include "include/iterator.h"
+
+namespace leveldb {
+
+Status BuildTable(const std::string& dbname,  // combined with meta->number to form the table file name
+ Env* env,  // creates the output file and deletes it on failure
+ const Options& options,  // handed to TableBuilder unchanged
+ TableCache* table_cache,  // used to re-open the finished table as a sanity check
+ Iterator* iter,  // source entries; consumed from SeekToFirst() to the end
+ FileMetaData* meta,  // in: number; out: file_size, smallest, largest
+ VersionEdit* edit) {  // receives large-value refs and, on success, the new file entry
+ Status s;
+ meta->file_size = 0;  // stays 0 unless builder->Finish() succeeds below
+ iter->SeekToFirst();
+
+ std::string fname = TableFileName(dbname, meta->number);
+ if (iter->Valid()) {  // an empty iterator skips file creation entirely
+ WritableFile* file;
+ s = env->NewWritableFile(fname, &file);
+ if (!s.ok()) {
+ return s;  // early exit: *edit is untouched and no DeleteFile is attempted
+ }
+
+ TableBuilder* builder = new TableBuilder(options, file);
+ meta->smallest.DecodeFrom(iter->key());  // first key visited becomes the smallest
+ for (; iter->Valid(); iter->Next()) {
+ Slice key = iter->key();
+ meta->largest.DecodeFrom(key);  // overwritten every pass, so it ends as the last key visited
+ if (ExtractValueType(key) == kTypeLargeValueRef) {
+ if (iter->value().size() != LargeValueRef::ByteSize()) {
+ s = Status::Corruption("invalid indirect reference hash value (L0)");
+ break;  // stop adding entries; the builder is abandoned below
+ }
+ edit->AddLargeValueRef(LargeValueRef::FromRef(iter->value()),
+ meta->number,
+ iter->key());
+ }
+ builder->Add(key, iter->value());
+ }
+
+ // Finish and check for builder errors
+ if (s.ok()) {
+ s = builder->Finish();
+ if (s.ok()) {
+ meta->file_size = builder->FileSize();
+ assert(meta->file_size > 0);
+ }
+ } else {
+ builder->Abandon();  // loop hit a corruption error: give up on the builder instead of finishing it
+ }
+ delete builder;
+
+ // Finish and check for file errors
+ if (s.ok()) {
+ s = file->Sync();
+ }
+ if (s.ok()) {
+ s = file->Close();
+ }
+ delete file;  // deleted on every path, including when Sync()/Close() were skipped
+ file = NULL;
+
+ if (s.ok()) {
+ // Verify that the table is usable
+ Iterator* it = table_cache->NewIterator(ReadOptions(), meta->number);
+ s = it->status();
+ delete it;
+ }
+ }
+
+ // Check for input iterator errors
+ if (!iter->status().ok()) {
+ s = iter->status();  // an input error replaces whatever status was computed above
+ }
+
+ if (s.ok() && meta->file_size > 0) {
+ edit->AddFile(0, meta->number, meta->file_size,  // first argument is presumably the level (0) -- confirm in version_edit.h
+ meta->smallest, meta->largest);
+ } else {
+ env->DeleteFile(fname);  // also reached for an empty iterator, when no file was created; result is ignored
+ }
+ return s;
+}
+
+}
diff --git a/db/builder.h b/db/builder.h
new file mode 100644
index 0000000..2d8afdf
--- /dev/null
+++ b/db/builder.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_BUILDER_H_
+#define STORAGE_LEVELDB_DB_BUILDER_H_
+
+#include "include/status.h"
+
+namespace leveldb {
+
+struct Options;
+struct FileMetaData;
+
+class Env;
+class Iterator;
+class TableCache;
+class VersionEdit;
+
+// Build a Table file from the contents of *iter. The generated file
+// will be named according to meta->number. On success, the rest of
+// *meta will be filled with metadata about the generated table, and
+// large value refs and the added file information will be added to
+// *edit. If no data is present in *iter, meta->file_size will be set
+// to zero, and no Table file will be produced.
+extern Status BuildTable(const std::string& dbname,
+ Env* env,
+ const Options& options,
+ TableCache* table_cache,
+ Iterator* iter,
+ FileMetaData* meta,
+ VersionEdit* edit);
+
+}
+
+#endif // STORAGE_LEVELDB_DB_BUILDER_H_
diff --git a/db/corruption_test.cc b/db/corruption_test.cc
new file mode 100644
index 0000000..a59ab0e
--- /dev/null
+++ b/db/corruption_test.cc
@@ -0,0 +1,366 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/db.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include "include/env.h"
+#include "include/table.h"
+#include "include/write_batch.h"
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+static const int kValueSize = 1000;
+
+class CorruptionTest {  // Fixture: owns a scratch DB under test::TmpDir() plus helpers to fill, damage and verify it.
+ public:
+ test::ErrorEnv env_;  // Env installed for every open; tests toggle its writable_file_error_ flag
+ Random rnd_;  // picks which candidate file Corrupt() damages
+ std::string dbname_;
+ Options options_;  // defaults used by Reopen()/RepairDB() when no override is passed
+ DB* db_;  // currently open database, or NULL between close and reopen
+
+ CorruptionTest() : rnd_(test::RandomSeed()) {
+ options_.env = &env_;
+ dbname_ = test::TmpDir() + "/db_test";
+ DestroyDB(dbname_, options_);  // start from a clean slate
+
+ db_ = NULL;
+ options_.create_if_missing = true;  // only this first open may create the DB
+ Reopen();
+ options_.create_if_missing = false;
+ }
+
+ ~CorruptionTest() {
+ delete db_;
+ DestroyDB(dbname_, Options());
+ }
+ // Closes any open DB and opens dbname_ again with *options (or options_ when NULL); env_ is always forced in.
+ Status TryReopen(Options* options = NULL) {
+ delete db_;
+ db_ = NULL;
+ Options opt = (options ? *options : options_);
+ opt.env = &env_;
+ return DB::Open(opt, dbname_, &db_);
+ }
+ // Like TryReopen(), but asserts that the open succeeded.
+ void Reopen(Options* options = NULL) {
+ ASSERT_OK(TryReopen(options));
+ }
+ // Closes the DB and runs leveldb::RepairDB on it; the DB is left closed (db_ == NULL).
+ void RepairDB() {
+ delete db_;
+ db_ = NULL;
+ ASSERT_OK(::leveldb::RepairDB(dbname_, options_));
+ }
+ // Writes keys Key(0)..Key(n-1), one single-entry WriteBatch per key.
+ void Build(int n) {
+ std::string key_space, value_space;
+ WriteBatch batch;
+ for (int i = 0; i < n; i++) {
+ //if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
+ Slice key = Key(i, &key_space);
+ batch.Clear();
+ batch.Put(key, Value(i, &value_space));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ }
+ }
+ // Scans the whole DB and asserts min_expected <= (#entries with a valid key and matching value) <= max_expected.
+ void Check(int min_expected, int max_expected) {
+ int next_expected = 0;
+ int missed = 0;
+ int bad_keys = 0;
+ int bad_values = 0;
+ int correct = 0;
+ std::string value_space;
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ uint64_t key;
+ Slice in(iter->key());
+ if (!ConsumeDecimalNumber(&in, &key) ||
+ !in.empty() ||
+ key < next_expected) {  // not purely decimal, or lower than the next key expected
+ bad_keys++;
+ continue;
+ }
+ missed += (key - next_expected);  // size of the gap since the last good key
+ next_expected = key + 1;
+ if (iter->value() != Value(key, &value_space)) {
+ bad_values++;
+ } else {
+ correct++;
+ }
+ }
+ delete iter;
+
+ fprintf(stderr,
+ "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%d\n",
+ min_expected, max_expected, correct, bad_keys, bad_values, missed);
+ ASSERT_LE(min_expected, correct);
+ ASSERT_GE(max_expected, correct);
+ }
+ // XORs 0x80 into bytes_to_corrupt bytes at offset (negative = from end of file) of a random file of type filetype.
+ void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
+ // Pick file to corrupt
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_.GetChildren(dbname_, &filenames));
+ uint64_t number;
+ LargeValueRef large_ref;
+ FileType type;
+ std::vector<std::string> candidates;
+ for (size_t i = 0; i < filenames.size(); i++) {  // size_t: matches the unsigned type of size()
+ if (ParseFileName(filenames[i], &number, &large_ref, &type) &&
+ type == filetype) {
+ candidates.push_back(dbname_ + "/" + filenames[i]);
+ }
+ }
+ ASSERT_TRUE(!candidates.empty()) << filetype;
+ std::string fname = candidates[rnd_.Uniform(candidates.size())];
+
+ struct stat sbuf;
+ if (stat(fname.c_str(), &sbuf) != 0) {
+ const char* msg = strerror(errno);
+ ASSERT_TRUE(false) << fname << ": " << msg;
+ }
+
+ if (offset < 0) {
+ // Relative to end of file; make it absolute
+ if (-offset > sbuf.st_size) {
+ offset = 0;
+ } else {
+ offset = sbuf.st_size + offset;
+ }
+ }
+ if (offset > sbuf.st_size) {  // clamp the start to end of file
+ offset = sbuf.st_size;
+ }
+ if (offset + bytes_to_corrupt > sbuf.st_size) {  // clamp the length so the range stays inside the file
+ bytes_to_corrupt = sbuf.st_size - offset;
+ }
+
+ // Do it
+ std::string contents;
+ Status s = ReadFileToString(Env::Default(), fname, &contents);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ for (int i = 0; i < bytes_to_corrupt; i++) {
+ contents[i + offset] ^= 0x80;  // flip the top bit of each byte
+ }
+ s = WriteStringToFile(Env::Default(), contents, fname);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ }
+ // Returns the DB property called name, or all-ones (~0) when GetProperty() reports failure.
+ uint64_t Property(const std::string& name) {
+ uint64_t result;
+ if (!db_->GetProperty(name, &result)) {
+ result = ~static_cast<uint64_t>(0);
+ }
+ return result;
+ }
+
+ // Return the ith key
+ Slice Key(int i, std::string* storage) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%016d", i);  // zero-padded to 16 digits
+ storage->assign(buf, strlen(buf));
+ return Slice(*storage);  // the slice refers to *storage, which must outlive it
+ }
+
+ // Return the value to associate with the specified key
+ Slice Value(int k, std::string* storage) {
+ Random r(k);  // seeded with k; Check() regenerates the value the same way
+ return test::RandomString(&r, kValueSize, storage);
+ }
+};
+
+TEST(CorruptionTest, Recovery) {  // Damages one byte in each of two log records; exactly 8 of the 10 keys must survive a reopen.
+ Build(10);
+ Check(10, 10);
+ Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record
+ Corrupt(kLogFile, 2*kValueSize, 1); // Somewhere in second log record?
+ Reopen();
+ Check(8, 8);
+}
+
+TEST(CorruptionTest, RecoverWriteError) {  // Reopening must fail while env_.writable_file_error_ is set.
+ env_.writable_file_error_ = true;
+ Status s = TryReopen();
+ ASSERT_TRUE(!s.ok());
+}
+
+TEST(CorruptionTest, NewFileErrorDuringWrite) {  // With writable-file errors injected, writes must eventually fail and the DB must reopen afterwards.
+ // Do enough writing to force minor compaction
+ env_.writable_file_error_ = true;
+ const int num = 3 + (Options().write_buffer_size / kValueSize);  // slightly more kValueSize-byte values than the default write buffer holds
+ std::string value_storage;
+ Status s;
+ for (int i = 0; s.ok() && i < num; i++) {  // stops at the first failed write
+ WriteBatch batch;
+ batch.Put("a", Value(100, &value_storage));
+ s = db_->Write(WriteOptions(), &batch);
+ }
+ ASSERT_TRUE(!s.ok());
+ ASSERT_GE(env_.num_writable_file_errors_, 1);
+ env_.writable_file_error_ = false;
+ Reopen();  // with injection switched off, the open must succeed
+}
+
+TEST(CorruptionTest, TableFile) {  // One flipped byte at offset 100 of a table file must cost exactly one of the 100 keys.
+ Build(100);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);  // cast needed to reach the TEST_* hooks on the implementation class
+ dbi->TEST_CompactMemTable();
+ dbi->TEST_CompactRange(0, "", "~");
+ dbi->TEST_CompactRange(1, "", "~");
+
+ Corrupt(kTableFile, 100, 1);
+ Check(99, 99);
+}
+
+TEST(CorruptionTest, TableFileIndexData) {  // Damages 500 bytes starting 1000 bytes before the end of a table file; at least half the keys must remain.
+ Build(10000); // Enough to build multiple Tables
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_CompactMemTable();
+ dbi->TEST_CompactRange(0, "", "~");
+ dbi->TEST_CompactRange(1, "", "~");
+
+ Corrupt(kTableFile, -1000, 500);  // negative offset is measured from the end of the file
+ Reopen();
+ Check(5000, 9999);
+}
+
+TEST(CorruptionTest, MissingDescriptor) {  // After RepairDB and a reopen, all 1000 keys must still be present.
+ Build(1000);
+ RepairDB();  // NOTE(review): nothing here removes the descriptor despite the test name -- presumably RepairDB rebuilds it; confirm
+ Reopen();
+ Check(1000, 1000);
+}
+
+TEST(CorruptionTest, SequenceNumberRecovery) {  // A write made after RepairDB must win over the five earlier versions of the same key.
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
+ RepairDB();
+ Reopen();
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("v5", v);
+ // Write something. If sequence number was not recovered properly,
+ // it will be hidden by an earlier write.
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("v6", v);
+ Reopen();  // the newest value must also survive a second reopen
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("v6", v);
+}
+
+TEST(CorruptionTest, LargeValueRecovery) {  // A 100000-byte value written with large_value_threshold = 10000 must survive RepairDB and reopens.
+ Options options;
+ options.large_value_threshold = 10000;
+ Reopen(&options);
+
+ Random rnd(301);
+ std::string big;
+ ASSERT_OK(db_->Put(WriteOptions(),
+ "foo", test::RandomString(&rnd, 100000, &big)));
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ(big, v);
+
+ RepairDB();
+ Reopen();  // reopened with the fixture's options_, not the local options above
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ(big, v);
+
+ Reopen();
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ(big, v);
+}
+
+TEST(CorruptionTest, CorruptedDescriptor) {  // A damaged descriptor must make the open fail; RepairDB must then bring the data back.
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_CompactMemTable();
+ dbi->TEST_CompactRange(0, "", "~");
+
+ Corrupt(kDescriptorFile, 0, 1000);  // Corrupt() clamps the length to the file size
+ Status s = TryReopen();
+ ASSERT_TRUE(!s.ok());
+
+ RepairDB();
+ Reopen();
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("hello", v);
+}
+
+TEST(CorruptionTest, CompactionInputError) {  // Without paranoid checks, a corrupted table must not stop later writes or compaction.
+ Build(10);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_CompactMemTable();
+ ASSERT_EQ(1, Property("leveldb.num-files-at-level0"));
+
+ Corrupt(kTableFile, 100, 1);
+ Check(9, 9);  // the single flipped byte costs exactly one key
+
+ // Force compactions by writing lots of values
+ Build(10000);
+ Check(10000, 10000);
+ dbi->TEST_CompactRange(0, "", "~");
+ ASSERT_EQ(0, Property("leveldb.num-files-at-level0"));
+}
+
+TEST(CorruptionTest, CompactionInputErrorParanoid) {  // With paranoid_checks on, a corrupted table must eventually make a write fail.
+ Options options;
+ options.paranoid_checks = true;
+ Reopen(&options);
+
+ Build(10);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_CompactMemTable();
+ ASSERT_EQ(1, Property("leveldb.num-files-at-level0"));
+
+ Corrupt(kTableFile, 100, 1);
+ Check(9, 9);
+
+ // Write must eventually fail because of corrupted table
+ Status s;
+ std::string tmp1, tmp2;
+ for (int i = 0; i < 10000 && s.ok(); i++) {  // gives up after 10000 successful writes
+ s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
+ }
+ ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
+}
+
+TEST(CorruptionTest, UnrelatedKeys) {  // A key written after a table was corrupted must read back both before and after the next memtable compaction.
+ Build(10);
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_CompactMemTable();
+ Corrupt(kTableFile, 100, 1);
+
+ std::string tmp1, tmp2;
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));  // key 1000 lies outside the 0..9 range built above
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+ ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+ dbi->TEST_CompactMemTable();
+ ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+ ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+}
+
+}
+
+int main(int argc, char** argv) {  // argc/argv are unused; the process exit code is whatever RunAllTests() returns
+ return leveldb::test::RunAllTests();
+}
diff --git a/db/db_bench.cc b/db/db_bench.cc
new file mode 100644
index 0000000..4ccdd5a
--- /dev/null
+++ b/db/db_bench.cc
@@ -0,0 +1,376 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "db/db_impl.h"
+#include "db/version_set.h"
+#include "include/cache.h"
+#include "include/db.h"
+#include "include/env.h"
+#include "include/write_batch.h"
+#include "util/histogram.h"
+#include "util/random.h"
+#include "util/testutil.h"
+
+// Comma-separated list of operations to run in the specified order
+// Actual benchmarks:
+// writeseq -- write N values in sequential key order
+// writerandom -- write N values in random key order
+// writebig -- write N/1000 100K values in random order
+// readseq -- read N values sequentially
+// readrandom -- read N values in random order
+// Meta operations:
+// compact -- Compact the entire DB
+// heapprofile -- Dump a heap profile (if supported by this port)
+// sync -- switch to synchronous writes (not the default)
+// nosync -- switch to asynchronous writes (the default)
+// tenth -- divide N by 10 (i.e., following benchmarks are smaller)
+// normal -- reset N back to its normal value (1000000)
+// Default benchmark list; replaced by the --benchmarks= command-line flag.
+static const char* FLAGS_benchmarks =
+ "writeseq,"
+ "writeseq,"
+ "writerandom,"
+ "sync,tenth,tenth,writerandom,nosync,normal,"
+ "readseq,"
+ "readrandom,"
+ "compact,"
+ "readseq,"
+ "readrandom,"
+ "writebig";
+
+// Number of key/values to place in database
+// (set with --num=)
+static int FLAGS_num = 1000000;
+
+// Size of each value
+// (set with --value_size=)
+static int FLAGS_value_size = 100;
+
+// Arrange to generate values that shrink to this fraction of
+// their original size after compression
+// (set with --compression_ratio=)
+static double FLAGS_compression_ratio = 0.25;
+
+// Print histogram of operation timings
+// (set with --histogram=0 or --histogram=1)
+static bool FLAGS_histogram = false;
+
+// Number of bytes to buffer in memtable before compacting
+// (set with --write_buffer_size=)
+static int FLAGS_write_buffer_size = 1 << 20;
+
+namespace leveldb {
+
+// Helper for quickly generating random data.
+namespace {
+// Hands out slices of a pre-built buffer of compressible pseudo-random
+// bytes, so that producing a value costs no generation work per call.
+class RandomGenerator {
+ public:
+  // Fills the buffer once.  The same data is reused over and over; it is
+  // made larger than the compression window (32KB) and large enough to
+  // serve all typical value sizes we want to write.
+  RandomGenerator() : pos_(0) {
+    Random rnd(301);
+    std::string fragment;
+    while (data_.size() < 1048576) {
+      // Each fragment is as compressible as FLAGS_compression_ratio asks.
+      test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &fragment);
+      data_.append(fragment);
+    }
+  }
+
+  // Returns the next "len" bytes of the buffer, restarting from the front
+  // when fewer than "len" bytes remain.  The returned Slice points into
+  // this object's buffer.
+  Slice Generate(int len) {
+    if (pos_ + len > data_.size()) {
+      pos_ = 0;
+      assert(len < data_.size());
+    }
+    const char* const begin = data_.data() + pos_;
+    pos_ += len;
+    return Slice(begin, len);
+  }
+
+ private:
+  std::string data_;  // The reusable pool of generated bytes
+  int pos_;           // Offset in data_ of the next unused byte
+};
+}
+
+class Benchmark {
+ private:
+ Cache* cache_;
+ DB* db_;
+ int num_;
+ bool sync_;
+ int heap_counter_;
+ double start_;
+ double last_op_finish_;
+ int64_t bytes_;
+ std::string message_;
+ Histogram hist_;
+ RandomGenerator gen_;
+ Random rand_;
+
+ // State kept for progress messages
+ int done_;
+ int next_report_; // When to report next
+
+ // Resets all per-benchmark measurement state and records the start time
+ // (in seconds) of the benchmark that is about to run.
+ void Start() {
+   const double now = Env::Default()->NowMicros() * 1e-6;
+   start_ = now;
+   last_op_finish_ = now;
+
+   // Progress reporting starts over for every benchmark.
+   done_ = 0;
+   next_report_ = 100;
+
+   bytes_ = 0;
+   hist_.Clear();
+   message_.clear();
+ }
+
+ // Called after every individual operation.  Optionally records the
+ // operation's latency in the histogram, and prints a progress line to
+ // stderr at increasingly coarse intervals.
+ void FinishedSingleOp() {
+   if (FLAGS_histogram) {
+     const double now = Env::Default()->NowMicros() * 1e-6;
+     const double elapsed_micros = (now - last_op_finish_) * 1e6;
+     hist_.Add(elapsed_micros);
+     if (elapsed_micros > 20000) {
+       fprintf(stderr, "long op: %.1f micros%30s\r", elapsed_micros, "");
+       fflush(stderr);
+     }
+     last_op_finish_ = now;
+   }
+
+   ++done_;
+   if (done_ < next_report_) {
+     return;  // Not yet time for the next progress line
+   }
+
+   // The reporting interval grows with the number of operations done.
+   int step = 100000;
+   if (next_report_ < 1000) {
+     step = 100;
+   } else if (next_report_ < 10000) {
+     step = 1000;
+   } else if (next_report_ < 100000) {
+     step = 10000;
+   }
+   next_report_ += step;
+   fprintf(stderr, "... finished %d ops%30s\r", done_, "");
+   fflush(stderr);
+ }
+
+ // Ends the measurement begun by Start() and prints one result line for
+ // "name" to stdout: average microseconds per operation, followed by
+ // message_ (with the MB/s rate appended when bytes_ was counted).
+ // Prints the latency histogram as well when FLAGS_histogram is set.
+ void Stop(const Slice& name) {
+ double finish = Env::Default()->NowMicros() * 1e-6;
+
+ // Pretend at least one op was done in case we are running a benchmark
+ // that does not call FinishedSingleOp().
+ if (done_ < 1) done_ = 1;
+
+ if (bytes_ > 0) {
+ char rate[100];
+ snprintf(rate, sizeof(rate), "%5.1f MB/s",
+ (bytes_ / 1048576.0) / (finish - start_));
+ if (!message_.empty()) {
+ message_.push_back(' ');
+ }
+ message_.append(rate);
+ }
+
+ fprintf(stdout, "%-12s : %10.3f micros/op;%s%s\n",
+ name.ToString().c_str(),
+ (finish - start_) * 1e6 / done_,
+ (message_.empty() ? "" : " "),
+ message_.c_str());
+ if (FLAGS_histogram) {
+ fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str());
+ }
+ fflush(stdout);
+ }
+
+ public:
+ enum Order { SEQUENTIAL, RANDOM };
+
+ // Creates the block cache (200<<20 bytes), removes "heap-*" files left in
+ // /tmp/dbbench, and destroys any database already stored there.  The
+ // database itself is opened later, in Run().
+ // start_, last_op_finish_, done_ and next_report_ are not initialized
+ // here; Start() assigns them before each measurement.
+ Benchmark() : cache_(NewLRUCache(200<<20)),
+ db_(NULL),
+ num_(FLAGS_num),
+ sync_(false),
+ heap_counter_(0),
+ bytes_(0),
+ rand_(301) {
+ std::vector<std::string> files;
+ // The result of GetChildren is not checked; on failure "files" is
+ // presumably left empty -- TODO confirm.
+ Env::Default()->GetChildren("/tmp/dbbench", &files);
+ for (int i = 0; i < files.size(); i++) {
+ if (Slice(files[i]).starts_with("heap-")) {
+ Env::Default()->DeleteFile("/tmp/dbbench/" + files[i]);
+ }
+ }
+ DestroyDB("/tmp/dbbench", Options());
+ }
+
+ // Closes the database before releasing the cache it was opened with.
+ ~Benchmark() {
+ delete db_;
+ delete cache_;
+ }
+
+ // Opens the database in /tmp/dbbench and then executes, in order, every
+ // comma-separated operation named in FLAGS_benchmarks.  Each name --
+ // including the meta operations and unrecognized names -- is bracketed
+ // by Start()/Stop(), so each one produces a result line.
+ void Run() {
+ Options options;
+ options.create_if_missing = true;
+ options.max_open_files = 10000;
+ options.block_cache = cache_;
+ options.write_buffer_size = FLAGS_write_buffer_size;
+
+ Start();
+ Status s = DB::Open(options, "/tmp/dbbench", &db_);
+ // The "open" timing line is printed before the status is examined.
+ Stop("open");
+ if (!s.ok()) {
+ fprintf(stderr, "open error: %s\n", s.ToString().c_str());
+ exit(1);
+ }
+
+ // Walk the list one name at a time; "benchmarks" becomes NULL after the
+ // last (comma-less) name has been taken.
+ const char* benchmarks = FLAGS_benchmarks;
+ while (benchmarks != NULL) {
+ const char* sep = strchr(benchmarks, ',');
+ Slice name;
+ if (sep == NULL) {
+ name = benchmarks;
+ benchmarks = NULL;
+ } else {
+ name = Slice(benchmarks, sep - benchmarks);
+ benchmarks = sep + 1;
+ }
+
+ Start();
+ if (name == Slice("writeseq")) {
+ Write(SEQUENTIAL, num_, FLAGS_value_size);
+ } else if (name == Slice("writerandom")) {
+ Write(RANDOM, num_, FLAGS_value_size);
+ } else if (name == Slice("writebig")) {
+ Write(RANDOM, num_ / 1000, 100 * 1000);
+ } else if (name == Slice("readseq")) {
+ Read(SEQUENTIAL);
+ } else if (name == Slice("readrandom")) {
+ Read(RANDOM);
+ } else if (name == Slice("compact")) {
+ Compact();
+ } else if (name == Slice("heapprofile")) {
+ HeapProfile();
+ } else if (name == Slice("sync")) {
+ sync_ = true;
+ } else if (name == Slice("nosync")) {
+ sync_ = false;
+ } else if (name == Slice("tenth")) {
+ num_ = num_ / 10;
+ } else if (name == Slice("normal")) {
+ num_ = FLAGS_num;
+ } else {
+ // Unknown names are reported but do not abort the run.
+ fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
+ }
+ Stop(name);
+ }
+ }
+
+ // Writes "num_entries" key/value pairs, one single-entry batch at a time.
+ // Keys are 12-digit zero-padded numbers: the loop index for SEQUENTIAL
+ // order, otherwise a random number below FLAGS_num.  Values are
+ // "value_size" bytes long.  Exits the process on the first write error.
+ void Write(Order order, int num_entries, int value_size) {
+   WriteOptions write_options;
+   write_options.sync = sync_;
+
+   WriteBatch batch;
+   for (int i = 0; i < num_entries; i++) {
+     int k = i;
+     if (order != SEQUENTIAL) {
+       k = rand_.Next() % FLAGS_num;
+     }
+     char key[100];
+     snprintf(key, sizeof(key), "%012d", k);
+
+     batch.Clear();
+     batch.Put(key, gen_.Generate(value_size));
+     const Status s = db_->Write(write_options, &batch);
+     bytes_ += value_size + strlen(key);
+     if (!s.ok()) {
+       fprintf(stderr, "put error: %s\n", s.ToString().c_str());
+       exit(1);
+     }
+     FinishedSingleOp();
+   }
+ }
+
+ // Reads up to num_ entries.  SEQUENTIAL order scans with an iterator from
+ // the first key and counts the bytes seen; any other order issues num_
+ // point lookups of random keys below FLAGS_num, ignoring their results.
+ void Read(Order order) {
+   ReadOptions read_options;
+   if (order == SEQUENTIAL) {
+     Iterator* it = db_->NewIterator(read_options);
+     it->SeekToFirst();
+     for (int seen = 0; seen < num_ && it->Valid(); ++seen) {
+       bytes_ += it->key().size() + it->value().size();
+       FinishedSingleOp();
+       it->Next();
+     }
+     delete it;
+     return;
+   }
+
+   // Random point lookups
+   std::string value;
+   for (int i = 0; i < num_; i++) {
+     const int k = rand_.Next() % FLAGS_num;
+     char key[100];
+     snprintf(key, sizeof(key), "%012d", k);
+     db_->Get(read_options, key, &value);
+     FinishedSingleOp();
+   }
+ }
+
+ // Flushes the memtable, then manually compacts the key range ["", "~"]
+ // of every level below the deepest level that currently holds files.
+ void Compact() {
+ DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
+ dbi->TEST_CompactMemTable();
+ // Starts at 1, so level 0 is always compacted even when no deeper
+ // level reports any files.
+ int max_level_with_files = 1;
+ for (int level = 1; level < config::kNumLevels; level++) {
+ uint64_t v;
+ char name[100];
+ snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level);
+ if (db_->GetProperty(name, &v) && v > 0) {
+ max_level_with_files = level;
+ }
+ }
+ for (int level = 0; level < max_level_with_files; level++) {
+ dbi->TEST_CompactRange(level, "", "~");
+ }
+ }
+
+ // Callback handed to port::GetHeapProfile: appends the "n" bytes at "buf"
+ // to the WritableFile passed as "arg".  The Append status is ignored.
+ static void WriteToFile(void* arg, const char* buf, int n) {
+ reinterpret_cast<WritableFile*>(arg)->Append(Slice(buf, n));
+ }
+
+ // Dumps a heap profile into /tmp/dbbench/heap-NNNN, using the next value
+ // of heap_counter_ as NNNN.  On failure the reason is stored in message_,
+ // which Stop() prints on the result line.
+ void HeapProfile() {
+ char fname[100];
+ snprintf(fname, sizeof(fname), "/tmp/dbbench/heap-%04d", ++heap_counter_);
+ WritableFile* file;
+ Status s = Env::Default()->NewWritableFile(fname, &file);
+ if (!s.ok()) {
+ message_ = s.ToString();
+ return;
+ }
+ bool ok = port::GetHeapProfile(WriteToFile, file);
+ delete file;
+ if (!ok) {
+ // No profile was produced, so do not leave the output file behind.
+ message_ = "not supported";
+ Env::Default()->DeleteFile(fname);
+ }
+ }
+};
+
+}
+
+// Benchmark entry point: parses --flag=value arguments into the FLAGS_*
+// variables, then runs the benchmark list.  Any argument that does not
+// match a known flag is reported and terminates the program with status 1.
+int main(int argc, char** argv) {
+ for (int i = 1; i < argc; i++) {
+ double d;
+ int n;
+ // Each pattern ends in %c: sscanf returns exactly 1 only when the value
+ // parsed and no extra character follows it.
+ char junk;
+ if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) {
+ FLAGS_benchmarks = argv[i] + strlen("--benchmarks=");
+ } else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) {
+ FLAGS_compression_ratio = d;
+ } else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 &&
+ (n == 0 || n == 1)) {
+ FLAGS_histogram = n;
+ } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) {
+ FLAGS_num = n;
+ } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) {
+ FLAGS_value_size = n;
+ } else if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
+ FLAGS_write_buffer_size = n;
+ } else {
+ fprintf(stderr, "Invalid flag '%s'\n", argv[i]);
+ exit(1);
+ }
+ }
+
+ // Constructed only after flag parsing, since the Benchmark constructor
+ // reads FLAGS_num.
+ leveldb::Benchmark benchmark;
+ benchmark.Run();
+ return 0;
+}
diff --git a/db/db_impl.cc b/db/db_impl.cc
new file mode 100644
index 0000000..5008af0
--- /dev/null
+++ b/db/db_impl.cc
@@ -0,0 +1,1195 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl.h"
+
+#include <algorithm>
+#include <set>
+#include <string>
+#include <stdint.h>
+#include <stdio.h>
+#include <vector>
+#include "db/builder.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "include/db.h"
+#include "include/env.h"
+#include "include/status.h"
+#include "include/table.h"
+#include "include/table_builder.h"
+#include "port/port.h"
+#include "table/block.h"
+#include "table/merger.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+
+namespace leveldb {
+
+// Working state of one compaction: the compaction being run, the output
+// files produced so far, and the output file currently being built.
+struct DBImpl::CompactionState {
+  Compaction* const compaction;
+
+  // Sequence numbers < smallest_snapshot are not significant since we
+  // will never have to service a snapshot below smallest_snapshot.
+  // Therefore if we have seen a sequence number S <= smallest_snapshot,
+  // we can drop all entries for the same key with sequence numbers < S.
+  SequenceNumber smallest_snapshot;
+
+  // Files produced by compaction
+  struct Output {
+    uint64_t number;
+    uint64_t file_size;
+    InternalKey smallest, largest;
+  };
+  std::vector<Output> outputs;
+
+  // State kept for output being generated
+  WritableFile* outfile;
+  TableBuilder* builder;
+
+  // Sum of the sizes of all finished output files
+  uint64_t total_bytes;
+
+  // Returns the most recently added output.
+  // REQUIRES: outputs is not empty.
+  Output* current_output() { return &outputs[outputs.size()-1]; }
+
+  // smallest_snapshot starts at 0 so that it never holds an indeterminate
+  // value; DoCompactionWork() assigns the real value before using it.
+  explicit CompactionState(Compaction* c)
+      : compaction(c),
+        smallest_snapshot(0),
+        outfile(NULL),
+        builder(NULL),
+        total_bytes(0) {
+  }
+};
+
+namespace {
+// A WritableFile whose operations all do nothing and report success.
+// SanitizeOptions() installs one as the info log when no real log file
+// can be created.
+class NullWritableFile : public WritableFile {
+ public:
+ virtual Status Append(const Slice& data) { return Status::OK(); }
+ virtual Status Close() { return Status::OK(); }
+ virtual Status Flush() { return Status::OK(); }
+ virtual Status Sync() { return Status::OK(); }
+};
+}
+
+// Fix user-supplied options to be reasonable
+// Clamps *ptr into [minvalue, maxvalue].  The upper bound is applied
+// first and the lower bound second.
+template <class T,class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+  if (maxvalue < *ptr) {
+    *ptr = maxvalue;
+  }
+  if (minvalue > *ptr) {
+    *ptr = minvalue;
+  }
+}
+// Returns a copy of "src" adjusted for internal use: the comparator is
+// replaced by "icmp", the numeric limits are clamped to fixed ranges, and
+// an info log is supplied when the caller did not provide one.
+// When a log is supplied here, the caller owns result.info_log.
+Options SanitizeOptions(const std::string& dbname,
+ const InternalKeyComparator* icmp,
+ const Options& src) {
+ Options result = src;
+ result.comparator = icmp;
+ ClipToRange(&result.max_open_files, 20, 50000);
+ ClipToRange(&result.write_buffer_size, 64<<10, 1<<30);
+ ClipToRange(&result.large_value_threshold, 16<<10, 1<<30);
+ ClipToRange(&result.block_size, 1<<10, 4<<20);
+ if (result.info_log == NULL) {
+ // Open a log file in the same directory as the db
+ src.env->CreateDir(dbname); // In case it does not exist
+ // Keep the previous log under its "old" name; failures of CreateDir and
+ // RenameFile are not checked.
+ src.env->RenameFile(InfoLogFileName(dbname), OldInfoLogFileName(dbname));
+ Status s = src.env->NewWritableFile(InfoLogFileName(dbname),
+ &result.info_log);
+ if (!s.ok()) {
+ // No place suitable for logging
+ result.info_log = new NullWritableFile;
+ }
+ }
+ return result;
+}
+
+// Builds the in-memory state for the database named "dbname".  The
+// caller's options are sanitized first; the table cache and version set
+// are then created from the sanitized copy.
+DBImpl::DBImpl(const Options& options, const std::string& dbname)
+    : env_(options.env),
+      internal_comparator_(options.comparator),
+      options_(SanitizeOptions(dbname, &internal_comparator_, options)),
+      // If sanitizing replaced the info log, this object must delete it.
+      owns_info_log_(options_.info_log != options.info_log),
+      dbname_(dbname),
+      db_lock_(NULL),
+      shutting_down_(NULL),
+      bg_cv_(&mutex_),
+      compacting_cv_(&mutex_),
+      last_sequence_(0),
+      mem_(new MemTable(internal_comparator_)),
+      logfile_(NULL),
+      log_(NULL),
+      log_number_(0),
+      bg_compaction_scheduled_(false),
+      compacting_(false) {
+  // Reserve ten files or so for other uses and give the rest to TableCache.
+  // The sanitized options_ must be used here, not the caller's "options":
+  // SanitizeOptions() clamps max_open_files to at least 20, which keeps the
+  // cache size positive even when the caller passed a tiny or negative
+  // value.
+  const int table_cache_size = options_.max_open_files - 10;
+  table_cache_ = new TableCache(dbname_, &options_, table_cache_size);
+
+  versions_ = new VersionSet(dbname_, &options_, table_cache_,
+                             &internal_comparator_);
+}
+
+// Signals shutdown, waits for any scheduled background compaction to
+// finish, releases the database lock file, and frees everything owned.
+DBImpl::~DBImpl() {
+  // Wait for background work to finish
+  mutex_.Lock();
+  shutting_down_.Release_Store(this);  // Any non-NULL value is ok
+  while (bg_compaction_scheduled_) {
+    bg_cv_.Wait();
+  }
+  mutex_.Unlock();
+
+  if (db_lock_ != NULL) {
+    env_->UnlockFile(db_lock_);
+  }
+
+  delete versions_;
+  delete mem_;
+  delete log_;
+  delete logfile_;
+  delete table_cache_;
+
+  // The info log is deleted only when it was created on our behalf.
+  if (owns_info_log_) {
+    delete options_.info_log;
+  }
+}
+
+// Creates the initial on-disk state of a brand-new database: descriptor
+// (manifest) file number 1 holding a single VersionEdit, and a CURRENT
+// file pointing at it.  Returns the first error encountered; on error the
+// manifest file is deleted again.
+Status DBImpl::NewDB() {
+ assert(log_number_ == 0);
+ assert(last_sequence_ == 0);
+
+ VersionEdit new_db;
+ new_db.SetComparatorName(user_comparator()->Name());
+ new_db.SetLogNumber(log_number_);
+ // File number 1 is taken by the manifest written below.
+ new_db.SetNextFile(2);
+ new_db.SetLastSequence(0);
+
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ WritableFile* file;
+ Status s = env_->NewWritableFile(manifest, &file);
+ if (!s.ok()) {
+ return s;
+ }
+ {
+ // Scoped so that the log writer is gone before "file" is deleted.
+ log::Writer log(file);
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = log.AddRecord(record);
+ if (s.ok()) {
+ s = file->Close();
+ }
+ }
+ delete file;
+ if (s.ok()) {
+ // Make "CURRENT" file that points to the new manifest file.
+ s = SetCurrentFile(env_, dbname_, 1);
+ } else {
+ env_->DeleteFile(manifest);
+ }
+ return s;
+}
+
+// Stamps "edit" with "new_log_number" and the current last_sequence_, then
+// hands it, together with "cleanup_mem", to VersionSet::LogAndApply and
+// returns that call's status.
+// REQUIRES: mutex_ is held.
+Status DBImpl::Install(VersionEdit* edit,
+ uint64_t new_log_number,
+ MemTable* cleanup_mem) {
+ mutex_.AssertHeld();
+ edit->SetLogNumber(new_log_number);
+ edit->SetLastSequence(last_sequence_);
+ return versions_->LogAndApply(edit, cleanup_mem);
+}
+
+// With paranoid_checks off, logs a non-OK "*s" and resets it to OK.
+// With paranoid_checks on, or when "*s" is already OK, leaves it untouched.
+void DBImpl::MaybeIgnoreError(Status* s) const {
+  const bool keep_as_is = s->ok() || options_.paranoid_checks;
+  if (keep_as_is) {
+    return;
+  }
+  Log(env_, options_.info_log, "Ignoring error %s", s->ToString().c_str());
+  *s = Status::OK();
+}
+
+// Scans the database directory and deletes every file that is no longer
+// needed.  A file survives when it is the current log, the current or a
+// newer manifest, a table/temp file that is live or still being written,
+// a live large-value file, or one of CURRENT / LOCK / the info log.
+// Names that ParseFileName() does not recognize are left alone.
+void DBImpl::DeleteObsoleteFiles() {
+  // Make a set of all of the live files
+  std::set<uint64_t> live = pending_outputs_;
+  versions_->AddLiveFiles(&live);
+
+  versions_->CleanupLargeValueRefs(live, log_number_);
+
+  std::vector<std::string> filenames;
+  env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose
+  uint64_t number;
+  LargeValueRef large_ref;
+  FileType type;
+  for (size_t i = 0; i < filenames.size(); i++) {
+    if (ParseFileName(filenames[i], &number, &large_ref, &type)) {
+      bool keep = true;
+      switch (type) {
+        case kLogFile:
+          keep = (number == log_number_);
+          break;
+        case kDescriptorFile:
+          // Keep my manifest file, and any newer incarnations'
+          // (in case there is a race that allows other incarnations)
+          keep = (number >= versions_->ManifestFileNumber());
+          break;
+        case kTableFile:
+          keep = (live.find(number) != live.end());
+          break;
+        case kTempFile:
+          // Any temp files that are currently being written to must
+          // be recorded in pending_outputs_, which is inserted into "live"
+          keep = (live.find(number) != live.end());
+          break;
+        case kLargeValueFile:
+          keep = versions_->LargeValueIsLive(large_ref);
+          break;
+        case kCurrentFile:
+        case kDBLockFile:
+        case kInfoLogFile:
+          keep = true;
+          break;
+      }
+
+      if (!keep) {
+        if (type == kTableFile) {
+          // Drop any cached handle before the file disappears.
+          table_cache_->Evict(number);
+        }
+        // %llu matches the unsigned long long argument (was %lld).
+        Log(env_, options_.info_log, "Delete type=%d #%llu\n",
+            int(type),
+            static_cast<unsigned long long>(number));
+        env_->DeleteFile(dbname_ + "/" + filenames[i]);
+      }
+    }
+  }
+}
+
+// Locks the database directory, creates the database if it is missing and
+// create_if_missing allows it, loads the version state from the
+// descriptor, and replays the log file the descriptor names.  Changes
+// produced by the replay are recorded in "*edit".  On success
+// last_sequence_ is the larger of the recovered value and the highest
+// sequence number seen in the log.
+// REQUIRES: mutex_ is held.
+Status DBImpl::Recover(VersionEdit* edit) {
+ mutex_.AssertHeld();
+
+ // Ignore error from CreateDir since the creation of the DB is
+ // committed only when the descriptor is created, and this directory
+ // may already exist from a previous failed creation attempt.
+ env_->CreateDir(dbname_);
+ assert(db_lock_ == NULL);
+ Status s = env_->LockFile(LockFileName(dbname_), &db_lock_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ // The presence of the CURRENT file decides whether the database exists.
+ if (!env_->FileExists(CurrentFileName(dbname_))) {
+ if (options_.create_if_missing) {
+ s = NewDB();
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ return Status::InvalidArgument(
+ dbname_, "does not exist (create_if_missing is false)");
+ }
+ } else {
+ if (options_.error_if_exists) {
+ return Status::InvalidArgument(
+ dbname_, "exists (error_if_exists is true)");
+ }
+ }
+
+ s = versions_->Recover(&log_number_, &last_sequence_);
+ if (s.ok()) {
+ // Recover from the log file named in the descriptor
+ SequenceNumber max_sequence(0);
+ if (log_number_ != 0) { // log_number_ == 0 indicates initial empty state
+ s = RecoverLogFile(log_number_, edit, &max_sequence);
+ }
+ if (s.ok()) {
+ last_sequence_ =
+ last_sequence_ > max_sequence ? last_sequence_ : max_sequence;
+ }
+ }
+
+ return s;
+}
+
+// Replays log file "log_number": every record is applied to a temporary
+// memtable, which is written out as a level-0 table (recorded in "*edit")
+// whenever it exceeds write_buffer_size and once more at the end.
+// "*max_sequence" is raised to the highest sequence number applied.
+// With paranoid_checks off, read corruption and insert errors are logged
+// and skipped; with it on, the first such error stops the replay and is
+// returned.  Errors from writing a level-0 table are always returned.
+// REQUIRES: mutex_ is held.
+Status DBImpl::RecoverLogFile(uint64_t log_number,
+ VersionEdit* edit,
+ SequenceNumber* max_sequence) {
+ // Receives corruption reports from the log reader: always logs them, and
+ // stores the first one in "*status" when a status pointer was supplied.
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ WritableFile* info_log;
+ const char* fname;
+ Status* status; // NULL if options_.paranoid_checks==false
+ virtual void Corruption(size_t bytes, const Status& s) {
+ Log(env, info_log, "%s%s: dropping %d bytes; %s",
+ (this->status == NULL ? "(ignoring error) " : ""),
+ fname, static_cast<int>(bytes), s.ToString().c_str());
+ if (this->status != NULL && this->status->ok()) *this->status = s;
+ }
+ };
+
+ mutex_.AssertHeld();
+
+ // Open the log file
+ std::string fname = LogFileName(dbname_, log_number);
+ SequentialFile* file;
+ Status status = env_->NewSequentialFile(fname, &file);
+ if (!status.ok()) {
+ // A log that cannot be opened is fatal only with paranoid checks.
+ MaybeIgnoreError(&status);
+ return status;
+ }
+
+ // Create the log reader.
+ LogReporter reporter;
+ reporter.env = env_;
+ reporter.info_log = options_.info_log;
+ reporter.fname = fname.c_str();
+ reporter.status = (options_.paranoid_checks ? &status : NULL);
+ // We intentionally make log::Reader do checksumming even if
+ // paranoid_checks==false so that corruptions cause entire commits
+ // to be skipped instead of propagating bad information (like overly
+ // large sequence numbers).
+ log::Reader reader(file, &reporter, true/*checksum*/);
+ Log(env_, options_.info_log, "Recovering log #%llu",
+ (unsigned long long) log_number);
+
+ // Read all the records and add to a memtable
+ std::string scratch;
+ Slice record;
+ WriteBatch batch;
+ MemTable* mem = NULL;
+ // "status" can turn non-OK inside ReadRecord via the reporter (paranoid
+ // mode only), which also ends the loop.
+ while (reader.ReadRecord(&record, &scratch) &&
+ status.ok()) {
+ // Records shorter than 12 bytes are reported as corrupt and skipped.
+ if (record.size() < 12) {
+ reporter.Corruption(
+ record.size(), Status::Corruption("log record too small"));
+ continue;
+ }
+ WriteBatchInternal::SetContents(&batch, record);
+
+ // The memtable is created lazily, on the first usable record.
+ if (mem == NULL) {
+ mem = new MemTable(internal_comparator_);
+ }
+ status = WriteBatchInternal::InsertInto(&batch, mem);
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ break;
+ }
+ // Sequence number of the last entry in this batch
+ const SequenceNumber last_seq =
+ WriteBatchInternal::Sequence(&batch) +
+ WriteBatchInternal::Count(&batch) - 1;
+ if (last_seq > *max_sequence) {
+ *max_sequence = last_seq;
+ }
+
+ if (mem->ApproximateMemoryUsage() > options_.write_buffer_size) {
+ status = WriteLevel0Table(mem, edit);
+ if (!status.ok()) {
+ // Reflect errors immediately so that conditions like full
+ // file-systems cause the DB::Open() to fail.
+ break;
+ }
+ delete mem;
+ mem = NULL;
+ }
+ }
+
+ // Write out whatever is left in the memtable.
+ if (status.ok() && mem != NULL) {
+ status = WriteLevel0Table(mem, edit);
+ // Reflect errors immediately so that conditions like full
+ // file-systems cause the DB::Open() to fail.
+ }
+
+ delete mem;
+ delete file;
+ return status;
+}
+
+// Writes the contents of "mem" to a new table file through BuildTable(),
+// passing "edit" along so the new file can be recorded in it.  The file
+// number is kept in pending_outputs_ for the duration of the build so
+// that DeleteObsoleteFiles() will not remove the file while it is being
+// written.  Returns the status of BuildTable().
+// REQUIRES: mutex_ is held.
+Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit) {
+  mutex_.AssertHeld();
+  FileMetaData meta;
+  meta.number = versions_->NewFileNumber();
+  pending_outputs_.insert(meta.number);
+  Iterator* iter = mem->NewIterator();
+  Log(env_, options_.info_log, "Level-0 table #%llu: started",
+      (unsigned long long) meta.number);
+  Status s = BuildTable(dbname_, env_, options_, table_cache_,
+                        iter, &meta, edit);
+  // %llu matches the unsigned long long file size (was %lld).
+  Log(env_, options_.info_log, "Level-0 table #%llu: %llu bytes %s",
+      (unsigned long long) meta.number,
+      (unsigned long long) meta.file_size,
+      s.ToString().c_str());
+  delete iter;
+  pending_outputs_.erase(meta.number);
+  return s;
+}
+
+// Saves the current memtable as a level-0 table, creates a new log file,
+// and installs a descriptor edit naming both.  Only when all three steps
+// succeed are the memtable and log writer switched to fresh ones;
+// otherwise the new log file is removed and the old state stays in use.
+// REQUIRES: mutex_ is held.
+Status DBImpl::CompactMemTable() {
+ mutex_.AssertHeld();
+
+ WritableFile* lfile = NULL;
+ uint64_t new_log_number = versions_->NewFileNumber();
+
+ VersionEdit edit;
+
+ // Save the contents of the memtable as a new Table
+ Status s = WriteLevel0Table(mem_, &edit);
+ if (s.ok()) {
+ s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile);
+ }
+
+ // Save a new descriptor with the new table and log number.
+ if (s.ok()) {
+ // mem_ is passed as the cleanup memtable; it is not deleted here.
+ s = Install(&edit, new_log_number, mem_);
+ }
+
+ if (s.ok()) {
+ // Commit to the new state
+ mem_ = new MemTable(internal_comparator_);
+ delete log_;
+ delete logfile_;
+ logfile_ = lfile;
+ log_ = new log::Writer(lfile);
+ log_number_ = new_log_number;
+ DeleteObsoleteFiles();
+ MaybeScheduleCompaction();
+ } else {
+ // lfile is still NULL here if the failure happened before it was opened.
+ delete lfile;
+ env_->DeleteFile(LogFileName(dbname_, new_log_number));
+ }
+ return s;
+}
+
+// Test hook: synchronously compacts the files of "level" that overlap the
+// user-key range [begin, end].  Waits for any compaction already running,
+// performs the work on the calling thread, and ignores its result.
+void DBImpl::TEST_CompactRange(
+    int level,
+    const std::string& begin,
+    const std::string& end) {
+  MutexLock l(&mutex_);
+  while (compacting_) {
+    compacting_cv_.Wait();
+  }
+  Compaction* c = versions_->CompactRange(
+      level,
+      InternalKey(begin, kMaxSequenceNumber, kValueTypeForSeek),
+      InternalKey(end, 0, static_cast<ValueType>(0)));
+
+  if (c != NULL) {
+    CompactionState* compact = new CompactionState(c);
+    DoCompactionWork(compact);  // Ignore error in test compaction
+    CleanupCompaction(compact);
+    // CleanupCompaction() frees only the CompactionState; the Compaction
+    // itself must be released here, as BackgroundCompaction() does.
+    delete c;
+  }
+
+  // Start any background compaction that may have been delayed by this thread
+  MaybeScheduleCompaction();
+}
+
+// Test hook: takes mutex_ and runs CompactMemTable(), returning its status.
+Status DBImpl::TEST_CompactMemTable() {
+ MutexLock l(&mutex_);
+ return CompactMemTable();
+}
+
+// Schedules BGWork on the environment's background thread unless a
+// compaction is already scheduled or running, the DB is shutting down,
+// or the version set reports no compaction is needed.
+// REQUIRES: mutex_ is held.
+void DBImpl::MaybeScheduleCompaction() {
+  mutex_.AssertHeld();
+  if (bg_compaction_scheduled_) {
+    return;  // Already scheduled
+  }
+  if (compacting_) {
+    return;  // Some other thread is running a compaction; do not conflict
+  }
+  if (shutting_down_.Acquire_Load()) {
+    return;  // DB is being deleted; no more background compactions
+  }
+  if (!versions_->NeedsCompaction()) {
+    return;  // No work to be done
+  }
+  bg_compaction_scheduled_ = true;
+  env_->Schedule(&DBImpl::BGWork, this);
+}
+
+// Trampoline passed to Env::Schedule: "db" is the DBImpl whose
+// BackgroundCall() should run.
+void DBImpl::BGWork(void* db) {
+ reinterpret_cast<DBImpl*>(db)->BackgroundCall();
+}
+
+// Body of the scheduled background task.  Runs one background compaction
+// unless the DB is shutting down or another thread is compacting, then
+// clears the "scheduled" flag and wakes every waiter on bg_cv_ (the
+// destructor waits there).
+void DBImpl::BackgroundCall() {
+ MutexLock l(&mutex_);
+ assert(bg_compaction_scheduled_);
+ if (!shutting_down_.Acquire_Load() &&
+ !compacting_) {
+ BackgroundCompaction();
+ }
+ bg_compaction_scheduled_ = false;
+ bg_cv_.SignalAll();
+
+ // Previous compaction may have produced too many files in a level,
+ // so reschedule another compaction if needed.
+ MaybeScheduleCompaction();
+}
+
+// Picks one compaction from the version set and carries it out.  A
+// compaction with exactly one input file at its level and none at the
+// next level is done by moving the file to the next level in the
+// descriptor; anything else goes through DoCompactionWork().  Errors are
+// logged, and with paranoid_checks the first one is kept in bg_error_.
+// REQUIRES: mutex_ is held.
+void DBImpl::BackgroundCompaction() {
+  mutex_.AssertHeld();
+  Compaction* c = versions_->PickCompaction();
+  if (c == NULL) {
+    // Nothing to do
+    return;
+  }
+
+  Status status;
+  if (c->num_input_files(0) == 1 && c->num_input_files(1) == 0) {
+    // Move file to next level
+    FileMetaData* f = c->input(0, 0);
+    c->edit()->DeleteFile(c->level(), f->number);
+    c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
+                       f->smallest, f->largest);
+    status = Install(c->edit(), log_number_, NULL);
+    // %llu matches the unsigned long long arguments (was %lld).
+    Log(env_, options_.info_log, "Moved #%llu to level-%d %llu bytes %s\n",
+        static_cast<unsigned long long>(f->number),
+        c->level() + 1,
+        static_cast<unsigned long long>(f->file_size),
+        status.ToString().c_str());
+  } else {
+    CompactionState* compact = new CompactionState(c);
+    status = DoCompactionWork(compact);
+    CleanupCompaction(compact);
+  }
+  delete c;
+
+  if (status.ok()) {
+    // Done
+  } else if (shutting_down_.Acquire_Load()) {
+    // Ignore compaction errors found during shutting down
+  } else {
+    Log(env_, options_.info_log,
+        "Compaction error: %s", status.ToString().c_str());
+    if (options_.paranoid_checks && bg_error_.ok()) {
+      bg_error_ = status;
+    }
+  }
+}
+
+// Releases everything held by "compact", including "compact" itself:
+// abandons and frees an unfinished table builder, frees the output file
+// object, and removes the outputs' numbers from pending_outputs_.
+// REQUIRES: mutex_ is held.
+void DBImpl::CleanupCompaction(CompactionState* compact) {
+  mutex_.AssertHeld();
+  if (compact->builder == NULL) {
+    assert(compact->outfile == NULL);
+  } else {
+    // May happen if we get a shutdown call in the middle of compaction
+    compact->builder->Abandon();
+    delete compact->builder;
+  }
+  delete compact->outfile;
+
+  typedef std::vector<CompactionState::Output>::const_iterator OutputIter;
+  for (OutputIter it = compact->outputs.begin();
+       it != compact->outputs.end();
+       ++it) {
+    pending_outputs_.erase(it->number);
+  }
+  delete compact;
+}
+
+// Starts a new output table for "compact": allocates a file number,
+// records it in pending_outputs_ and in compact->outputs, creates the
+// file, and attaches a TableBuilder to it.  Returns the status of the
+// file creation; on failure no builder is created.
+// Takes mutex_ itself, so the caller must not be holding it.
+Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) {
+ assert(compact != NULL);
+ assert(compact->builder == NULL);
+ uint64_t file_number;
+ {
+ mutex_.Lock();
+ file_number = versions_->NewFileNumber();
+ pending_outputs_.insert(file_number);
+ CompactionState::Output out;
+ out.number = file_number;
+ // out.file_size is not set here; FinishCompactionOutputFile() sets it.
+ out.smallest.Clear();
+ out.largest.Clear();
+ compact->outputs.push_back(out);
+ mutex_.Unlock();
+ }
+
+ // Make the output file
+ std::string fname = TableFileName(dbname_, file_number);
+ Status s = env_->NewWritableFile(fname, &compact->outfile);
+ if (s.ok()) {
+ compact->builder = new TableBuilder(options_, compact->outfile);
+ }
+ return s;
+}
+
+// Completes the output table currently being built for "compact".  The
+// builder is finished (or abandoned when "input" reports an error), the
+// file is synced and closed, and -- if the table has entries -- it is
+// opened through the table cache to verify that it is usable.  Builder
+// and file objects are always freed and reset to NULL.  Returns the first
+// error encountered.
+Status DBImpl::FinishCompactionOutputFile(CompactionState* compact,
+                                          Iterator* input) {
+  assert(compact != NULL);
+  assert(compact->outfile != NULL);
+  assert(compact->builder != NULL);
+
+  const uint64_t output_number = compact->current_output()->number;
+  assert(output_number != 0);
+
+  // Check for iterator errors
+  Status s = input->status();
+  const uint64_t current_entries = compact->builder->NumEntries();
+  if (s.ok()) {
+    s = compact->builder->Finish();
+  } else {
+    compact->builder->Abandon();
+  }
+  // Record the size of this output and add it to the compaction total.
+  const uint64_t current_bytes = compact->builder->FileSize();
+  compact->current_output()->file_size = current_bytes;
+  compact->total_bytes += current_bytes;
+  delete compact->builder;
+  compact->builder = NULL;
+
+  // Finish and check for file errors
+  if (s.ok()) {
+    s = compact->outfile->Sync();
+  }
+  if (s.ok()) {
+    s = compact->outfile->Close();
+  }
+  delete compact->outfile;
+  compact->outfile = NULL;
+
+  if (s.ok() && current_entries > 0) {
+    // Verify that the table is usable
+    Iterator* iter = table_cache_->NewIterator(ReadOptions(),output_number);
+    s = iter->status();
+    delete iter;
+    if (s.ok()) {
+      // %llu matches the unsigned long long arguments (was %lld).
+      Log(env_, options_.info_log,
+          "Generated table #%llu: %llu keys, %llu bytes",
+          (unsigned long long) output_number,
+          (unsigned long long) current_entries,
+          (unsigned long long) current_bytes);
+    }
+  }
+  return s;
+}
+
+
+// Records the result of a finished compaction in the descriptor: the
+// input files are deleted from their levels and every output file is
+// added to level+1.  On success the inputs are released and obsolete
+// files are removed; on failure the output files produced by this
+// compaction are deleted.  Returns the status of the install.
+// REQUIRES: mutex_ is held.
+Status DBImpl::InstallCompactionResults(CompactionState* compact) {
+  mutex_.AssertHeld();
+  Log(env_, options_.info_log, "Compacted %d@%d + %d@%d files => %lld bytes",
+      compact->compaction->num_input_files(0),
+      compact->compaction->level(),
+      compact->compaction->num_input_files(1),
+      compact->compaction->level() + 1,
+      static_cast<long long>(compact->total_bytes));
+
+  // Add compaction outputs
+  compact->compaction->AddInputDeletions(compact->compaction->edit());
+  const int level = compact->compaction->level();
+  // compact->outputs is cleared below, so keep the file numbers: they are
+  // needed to remove the files if the install fails.  (Previously the
+  // failure path looped over the already-cleared vector and deleted
+  // nothing.)
+  std::vector<uint64_t> output_numbers;
+  output_numbers.reserve(compact->outputs.size());
+  for (size_t i = 0; i < compact->outputs.size(); i++) {
+    const CompactionState::Output& out = compact->outputs[i];
+    compact->compaction->edit()->AddFile(
+        level + 1,
+        out.number, out.file_size, out.smallest, out.largest);
+    output_numbers.push_back(out.number);
+    pending_outputs_.erase(out.number);
+  }
+  compact->outputs.clear();
+
+  Status s = Install(compact->compaction->edit(), log_number_, NULL);
+  if (s.ok()) {
+    compact->compaction->ReleaseInputs();
+    DeleteObsoleteFiles();
+  } else {
+    // Discard any files we may have created during this failed compaction
+    for (size_t i = 0; i < output_numbers.size(); i++) {
+      // Drop any cached handle first, as DeleteObsoleteFiles() does for
+      // table files.
+      table_cache_->Evict(output_numbers[i]);
+      env_->DeleteFile(TableFileName(dbname_, output_numbers[i]));
+    }
+  }
+  return s;
+}
+
+// Runs the compaction described by "compact": merges the input files,
+// drops entries that are shadowed or obsolete, writes the survivors to
+// new output tables, and installs the result.  mutex_ is released while
+// the merge runs and re-acquired before the results are installed;
+// compacting_ is true for that whole period.  Returns the first error
+// encountered, or an IOError if the DB began shutting down mid-way.
+// REQUIRES: mutex_ is held on entry (it is held again on return).
+Status DBImpl::DoCompactionWork(CompactionState* compact) {
+ Log(env_, options_.info_log, "Compacting %d@%d + %d@%d files",
+ compact->compaction->num_input_files(0),
+ compact->compaction->level(),
+ compact->compaction->num_input_files(1),
+ compact->compaction->level() + 1);
+
+ assert(versions_->NumLevelFiles(compact->compaction->level()) > 0);
+ assert(compact->builder == NULL);
+ assert(compact->outfile == NULL);
+ // With no snapshots outstanding, everything up to the latest sequence
+ // number may be collapsed; otherwise only up to the oldest snapshot.
+ if (snapshots_.empty()) {
+ compact->smallest_snapshot = last_sequence_;
+ } else {
+ compact->smallest_snapshot = snapshots_.oldest()->number_;
+ }
+
+ // Release mutex while we're actually doing the compaction work
+ compacting_ = true;
+ mutex_.Unlock();
+
+ Iterator* input = versions_->MakeInputIterator(compact->compaction);
+ input->SeekToFirst();
+ Status status;
+ ParsedInternalKey ikey;
+ // User key of the entry group currently being processed, and the
+ // sequence number of the previous entry seen for it (kMaxSequenceNumber
+ // means no earlier entry for this key).
+ std::string current_user_key;
+ bool has_current_user_key = false;
+ SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
+ for (; input->Valid() && !shutting_down_.Acquire_Load(); ) {
+ // Handle key/value, add to state, etc.
+ Slice key = input->key();
+ bool drop = false;
+ if (!ParseInternalKey(key, &ikey)) {
+ // Do not hide error keys
+ current_user_key.clear();
+ has_current_user_key = false;
+ last_sequence_for_key = kMaxSequenceNumber;
+ } else {
+ if (!has_current_user_key ||
+ user_comparator()->Compare(ikey.user_key,
+ Slice(current_user_key)) != 0) {
+ // First occurrence of this user key
+ current_user_key.assign(ikey.user_key.data(), ikey.user_key.size());
+ has_current_user_key = true;
+ last_sequence_for_key = kMaxSequenceNumber;
+ }
+
+ if (last_sequence_for_key <= compact->smallest_snapshot) {
+ // Hidden by a newer entry for same user key
+ drop = true; // (A)
+ } else if (ikey.type == kTypeDeletion &&
+ ikey.sequence <= compact->smallest_snapshot &&
+ compact->compaction->IsBaseLevelForKey(ikey.user_key)) {
+ // For this user key:
+ // (1) there is no data in higher levels
+ // (2) data in lower levels will have larger sequence numbers
+ // (3) data in layers that are being compacted here and have
+ // smaller sequence numbers will be dropped in the next
+ // few iterations of this loop (by rule (A) above).
+ // Therefore this deletion marker is obsolete and can be dropped.
+ drop = true;
+ }
+
+ last_sequence_for_key = ikey.sequence;
+ }
+#if 0
+ Log(env_, options_.info_log,
+ " Compact: %s, seq %d, type: %d %d, drop: %d, is_base: %d, "
+ "%d smallest_snapshot: %d",
+ ikey.user_key.ToString().c_str(),
+ (int)ikey.sequence, ikey.type, kTypeLargeValueRef, drop,
+ compact->compaction->IsBaseLevelForKey(ikey.user_key),
+ (int)last_sequence_for_key, (int)compact->smallest_snapshot);
+#endif
+
+ if (!drop) {
+ // Open output file if necessary
+ if (compact->builder == NULL) {
+ status = OpenCompactionOutputFile(compact);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ // Track the key range covered by the current output file.
+ if (compact->builder->NumEntries() == 0) {
+ compact->current_output()->smallest.DecodeFrom(key);
+ }
+ compact->current_output()->largest.DecodeFrom(key);
+
+ if (ikey.type == kTypeLargeValueRef) {
+ if (input->value().size() != LargeValueRef::ByteSize()) {
+ if (options_.paranoid_checks) {
+ status = Status::Corruption("invalid large value ref");
+ break;
+ } else {
+ // Without paranoid checks the bad entry is logged and is not
+ // written to the output.
+ Log(env_, options_.info_log,
+ "compaction found invalid large value ref");
+ }
+ } else {
+ // Record that this output file references the large value.
+ compact->compaction->edit()->AddLargeValueRef(
+ LargeValueRef::FromRef(input->value()),
+ compact->current_output()->number,
+ input->key());
+ compact->builder->Add(key, input->value());
+ }
+ } else {
+ compact->builder->Add(key, input->value());
+ }
+
+ // Close output file if it is big enough
+ if (compact->builder->FileSize() >=
+ compact->compaction->MaxOutputFileSize()) {
+ status = FinishCompactionOutputFile(compact, input);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ }
+
+ input->Next();
+ }
+
+ if (status.ok() && shutting_down_.Acquire_Load()) {
+ status = Status::IOError("Deleting DB during compaction");
+ }
+ // Finish the last, partially filled output file.
+ if (status.ok() && compact->builder != NULL) {
+ status = FinishCompactionOutputFile(compact, input);
+ }
+ if (status.ok()) {
+ status = input->status();
+ }
+ delete input;
+ input = NULL;
+
+ mutex_.Lock();
+
+ if (status.ok()) {
+ status = InstallCompactionResults(compact);
+ }
+ // Let threads waiting in TEST_CompactRange() proceed.
+ compacting_ = false;
+ compacting_cv_.SignalAll();
+ return status;
+}
+
+ // Builds a merging iterator over the memtable and the files of the current
+ // version. "*latest_snapshot" receives the last sequence number at the time
+ // the iterator was created. The current version is Ref()'d and released
+ // through DBImpl::Unref when the returned iterator is destroyed.
+ Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
+                                       SequenceNumber* latest_snapshot) {
+   MutexLock guard(&mutex_);
+   *latest_snapshot = last_sequence_;
+
+   // Gather the child iterators: memtable first, then the version's files.
+   Version* const current = versions_->current();
+   std::vector<Iterator*> children;
+   children.push_back(mem_->NewIterator());
+   current->AddIterators(options, &children);
+
+   Iterator* merged =
+       NewMergingIterator(&internal_comparator_, &children[0], children.size());
+
+   // Pin the version for as long as the iterator is alive.
+   current->Ref();
+   merged->RegisterCleanup(&DBImpl::Unref, this, current);
+   return merged;
+ }
+
+ // Test hook: internal iterator with default read options.
+ Iterator* DBImpl::TEST_NewInternalIterator() {
+   // NewInternalIterator() requires an out-parameter; its value is unused here.
+   SequenceNumber unused_sequence;
+   return NewInternalIterator(ReadOptions(), &unused_sequence);
+ }
+
+ // Looks up "key" by seeking a fresh iterator to it. On an exact match the
+ // stored value is copied into "*value". Returns NotFound (with an empty
+ // message) when there is no exact match, unless the iterator reports an
+ // error, which takes precedence.
+ Status DBImpl::Get(const ReadOptions& options,
+                    const Slice& key,
+                    std::string* value) {
+   // TODO(opt): faster implementation
+   Iterator* it = NewIterator(options);
+   it->Seek(key);
+   const bool hit =
+       it->Valid() && user_comparator()->Compare(key, it->key()) == 0;
+   if (hit) {
+     const Slice stored = it->value();
+     value->assign(stored.data(), stored.size());
+   }
+   // Non-OK iterator status trumps everything else
+   Status outcome = it->status();
+   delete it;
+   if (outcome.ok() && !hit) {
+     outcome = Status::NotFound(Slice());  // Empty error message for speed
+   }
+   return outcome;
+ }
+
+ // Returns a user-level iterator. It reads as of options.snapshot when one
+ // is supplied, otherwise as of the latest sequence number.
+ Iterator* DBImpl::NewIterator(const ReadOptions& options) {
+   SequenceNumber latest;
+   Iterator* internal = NewInternalIterator(options, &latest);
+   SequenceNumber visible = latest;
+   if (options.snapshot) {
+     visible = options.snapshot->number_;
+   }
+   return NewDBIterator(&dbname_, env_, user_comparator(), internal, visible);
+ }
+
+ // Iterator cleanup callback: arg1 is the owning DBImpl, arg2 the Version to
+ // release. The version is unreferenced with the DB mutex held.
+ void DBImpl::Unref(void* arg1, void* arg2) {
+   DBImpl* const db = reinterpret_cast<DBImpl*>(arg1);
+   Version* const version = reinterpret_cast<Version*>(arg2);
+   MutexLock guard(&db->mutex_);
+   version->Unref();
+ }
+
+ // Registers and returns a snapshot at the current last sequence number.
+ const Snapshot* DBImpl::GetSnapshot() {
+   MutexLock guard(&mutex_);
+   const Snapshot* snapshot = snapshots_.New(last_sequence_);
+   return snapshot;
+ }
+
+ // Removes "s" from the snapshot list while holding the DB mutex.
+ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
+   MutexLock guard(&mutex_);
+   snapshots_.Delete(s);
+ }
+
+// Convenience methods
+ // Forwards to the default DB::Put, which wraps the pair in a WriteBatch
+ // and calls Write().
+ Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) {
+   Status result = DB::Put(o, key, val);
+   return result;
+ }
+
+ // Forwards to the default DB::Delete, which wraps the key in a WriteBatch
+ // and calls Write().
+ Status DBImpl::Delete(const WriteOptions& options, const Slice& key) {
+   Status result = DB::Delete(options, key);
+   return result;
+ }
+
+ // Applies "*updates" to the database. With mutex_ held for the whole
+ // operation it: fails fast on a recorded background error, compacts the
+ // memtable if it has outgrown options_.write_buffer_size, rewrites large
+ // values, assigns sequence numbers, appends the batch to the log (syncing
+ // when options.sync is set) and finally inserts it into the memtable.
+ // Returns the first error encountered.
+ Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
+ Status status;
+
+ // The batch that is actually logged and applied. HandleLargeValues() sets
+ // it to "updates" itself or to a newly allocated, rewritten batch.
+ WriteBatch* final = NULL;
+ {
+ MutexLock l(&mutex_);
+ if (!bg_error_.ok()) {
+ status = bg_error_;
+ } else if (mem_->ApproximateMemoryUsage() > options_.write_buffer_size) {
+ status = CompactMemTable();
+ }
+ if (status.ok()) {
+ // The first entry of this batch will get sequence last_sequence_ + 1.
+ status = HandleLargeValues(last_sequence_ + 1, updates, &final);
+ }
+ if (status.ok()) {
+ WriteBatchInternal::SetSequence(final, last_sequence_ + 1);
+ // Note: the sequence range is consumed before the log write is attempted.
+ last_sequence_ += WriteBatchInternal::Count(final);
+
+ // Add to log and apply to memtable
+ status = log_->AddRecord(WriteBatchInternal::Contents(final));
+ if (status.ok() && options.sync) {
+ status = logfile_->Sync();
+ }
+ if (status.ok()) {
+ status = WriteBatchInternal::InsertInto(final, mem_);
+ }
+ }
+
+ // Optionally hand back a snapshot taken right after this write; NULL is
+ // stored when the write failed.
+ if (options.post_write_snapshot != NULL) {
+ *options.post_write_snapshot =
+ status.ok() ? snapshots_.New(last_sequence_) : NULL;
+ }
+ }
+ // "final" is owned here unless it aliases the caller's batch. It may still
+ // be NULL if an early failure skipped HandleLargeValues().
+ if (final != updates) {
+ delete final;
+ }
+
+ return status;
+ }
+
+ // Returns true iff some kTypeValue entry of "batch" has a value of at least
+ // options_.large_value_threshold bytes.
+ bool DBImpl::HasLargeValues(const WriteBatch& batch) const {
+   // A batch smaller than the threshold cannot contain a large value.
+   if (WriteBatchInternal::ByteSize(&batch) < options_.large_value_threshold) {
+     return false;
+   }
+   for (WriteBatchInternal::Iterator it(batch); !it.Done(); it.Next()) {
+     if (it.op() != kTypeValue) {
+       continue;
+     }
+     if (it.value().size() >= options_.large_value_threshold) {
+       return true;
+     }
+   }
+   return false;
+ }
+
+ // Chooses the on-disk representation for the large value "raw_value".
+ // "*file_bytes" is set to the bytes to store in the large value file and
+ // "*ref" to the matching large value reference. "*scratch" may be used as
+ // the backing store for "*file_bytes", so it must outlive "*file_bytes".
+ void DBImpl::MaybeCompressLargeValue(
+     const Slice& raw_value,
+     Slice* file_bytes,
+     std::string* scratch,
+     LargeValueRef* ref) {
+   if (options_.compression == kLightweightCompression) {
+     port::Lightweight_Compress(raw_value.data(), raw_value.size(), scratch);
+     // Keep the compressed form only if it saves at least 12.5%.
+     if (scratch->size() < (raw_value.size() / 8) * 7) {
+       *file_bytes = *scratch;
+       *ref = LargeValueRef::Make(raw_value, kLightweightCompression);
+       return;
+     }
+   }
+
+   // No compression requested, or compression was not worthwhile:
+   // store the raw bytes.
+   *file_bytes = raw_value;
+   *ref = LargeValueRef::Make(raw_value, kNoCompression);
+ }
+
+ // Produces in "*final" the batch that should actually be logged and applied.
+ // If "*updates" holds no large values, "*final" aliases "updates".
+ // Otherwise a new WriteBatch is allocated in which every value of at least
+ // options_.large_value_threshold bytes is replaced by an indirect reference,
+ // and the value bytes are written to a large value file where needed.
+ // "assigned_seq" is the sequence number of the first entry of "*updates".
+ // On error "*final" may already point at a new batch; the caller deletes it.
+ Status DBImpl::HandleLargeValues(SequenceNumber assigned_seq,
+                                  WriteBatch* updates,
+                                  WriteBatch** final) {
+   if (!HasLargeValues(*updates)) {
+     // Fast path: no large values found
+     *final = updates;
+   } else {
+     // Copy *updates to a new WriteBatch, replacing each large value with
+     // an indirect reference to a separate large value file.
+     *final = new WriteBatch;
+     SequenceNumber seq = assigned_seq;
+     for (WriteBatchInternal::Iterator it(*updates); !it.Done(); it.Next()) {
+       switch (it.op()) {
+         case kTypeValue:
+           if (it.value().size() < options_.large_value_threshold) {
+             (*final)->Put(it.key(), it.value());
+           } else {
+             std::string scratch;
+             Slice file_bytes;
+             LargeValueRef large_ref;
+             MaybeCompressLargeValue(
+                 it.value(), &file_bytes, &scratch, &large_ref);
+             InternalKey ikey(it.key(), seq, kTypeLargeValueRef);
+             // NOTE(review): presumably returns true when the value file
+             // still has to be written -- confirm against VersionSet.
+             if (versions_->RegisterLargeValueRef(large_ref, log_number_,
+                                                  ikey)) {
+               // TODO(opt): avoid holding the lock here (but be careful about
+               // another thread doing a Write and changing log_number_ or
+               // having us get a different "assigned_seq" value).
+
+               uint64_t tmp_number = versions_->NewFileNumber();
+               pending_outputs_.insert(tmp_number);
+               std::string tmp = TempFileName(dbname_, tmp_number);
+               WritableFile* file;
+               Status s = env_->NewWritableFile(tmp, &file);
+               if (!s.ok()) {
+                 // Do not leave a stale entry behind for a file that was
+                 // never created.
+                 pending_outputs_.erase(tmp_number);
+                 return s;  // Caller will delete *final
+               }
+
+               // A failed append must not be masked by a successful close.
+               s = file->Append(file_bytes);
+               if (s.ok()) {
+                 s = file->Close();
+               }
+               delete file;
+
+               if (s.ok()) {
+                 // Publish the finished file under its final name.
+                 const std::string fname =
+                     LargeValueFileName(dbname_, large_ref);
+                 s = env_->RenameFile(tmp, fname);
+               } else {
+                 Log(env_, options_.info_log, "Write large value: %s",
+                     s.ToString().c_str());
+               }
+               pending_outputs_.erase(tmp_number);
+
+               if (!s.ok()) {
+                 env_->DeleteFile(tmp);  // Cleanup; intentionally ignoring error
+                 return s;  // Caller will delete *final
+               }
+             }
+
+             // Put an indirect reference in the write batch in place
+             // of large value
+             WriteBatchInternal::PutLargeValueRef(*final, it.key(), large_ref);
+           }
+           break;
+         case kTypeLargeValueRef:
+           // Client-supplied batches must never contain indirect references.
+           return Status::Corruption("Corrupted write batch");
+         case kTypeDeletion:
+           (*final)->Delete(it.key());
+           break;
+       }
+       seq = seq + 1;
+     }
+   }
+   return Status::OK();
+ }
+
+ // Looks up a numeric DB property. The only property handled here is
+ // "leveldb.num-files-at-level<N>", which stores the number of files at
+ // level N in "*value". Returns false for an unknown property, a malformed
+ // level number, or a level that is not below config::kNumLevels.
+ bool DBImpl::GetProperty(const Slice& property, uint64_t* value) {
+   MutexLock l(&mutex_);
+   Slice in = property;
+   Slice prefix("leveldb.");
+   if (!in.starts_with(prefix)) return false;
+   in.remove_prefix(prefix.size());
+
+   if (in.starts_with("num-files-at-level")) {
+     in.remove_prefix(strlen("num-files-at-level"));
+     uint64_t level;
+     // The whole remainder must be a decimal number.
+     bool ok = ConsumeDecimalNumber(&in, &level) && in.empty();
+     // "level" is unsigned, so only the upper bound needs checking.
+     if (!ok || level >= config::kNumLevels) {
+       return false;
+     }
+     *value = versions_->NumLevelFiles(level);
+     return true;
+   }
+   return false;
+ }
+
+ // For each of the "n" ranges, stores in sizes[i] the difference between the
+ // approximate offsets of range[i].limit and range[i].start in the current
+ // version (0 if the limit offset is smaller than the start offset).
+ void DBImpl::GetApproximateSizes(
+     const Range* range, int n,
+     uint64_t* sizes) {
+   // TODO(opt): better implementation
+   Version* pinned;
+   {
+     // Pin the current version so it stays alive after the mutex is dropped.
+     MutexLock guard(&mutex_);
+     versions_->current()->Ref();
+     pinned = versions_->current();
+   }
+
+   for (int idx = 0; idx < n; ++idx) {
+     // Convert each user key into a corresponding internal key.
+     const Range& r = range[idx];
+     InternalKey lo(r.start, kMaxSequenceNumber, kValueTypeForSeek);
+     InternalKey hi(r.limit, kMaxSequenceNumber, kValueTypeForSeek);
+     const uint64_t begin = versions_->ApproximateOffsetOf(pinned, lo);
+     const uint64_t end = versions_->ApproximateOffsetOf(pinned, hi);
+     if (end >= begin) {
+       sizes[idx] = end - begin;
+     } else {
+       sizes[idx] = 0;
+     }
+   }
+
+   {
+     MutexLock guard(&mutex_);
+     pinned->Unref();
+   }
+ }
+
+// Default implementations of convenience methods that subclasses of DB
+// can call if they wish
+ // Default Put: a single-entry WriteBatch handed to Write().
+ Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
+   WriteBatch single;
+   single.Put(key, value);
+   return Write(opt, &single);
+ }
+
+ // Default Delete: a single-entry WriteBatch handed to Write().
+ Status DB::Delete(const WriteOptions& opt, const Slice& key) {
+   WriteBatch single;
+   single.Delete(key);
+   return Write(opt, &single);
+ }
+
+ // Out-of-line definition of the empty base-class destructor.
+ DB::~DB() {
+ }
+
+ // Opens the database "dbname". On success a heap-allocated DB is stored in
+ // "*dbptr"; on failure "*dbptr" stays NULL and the partially constructed
+ // DBImpl is deleted.
+ Status DB::Open(const Options& options, const std::string& dbname,
+ DB** dbptr) {
+ *dbptr = NULL;
+
+ DBImpl* impl = new DBImpl(options, dbname);
+ // Recovery, log creation and installation all run with the DB mutex held.
+ impl->mutex_.Lock();
+ VersionEdit edit;
+ Status s = impl->Recover(&edit); // Handles create_if_missing, error_if_exists
+ if (s.ok()) {
+ // Start a fresh log file under a newly allocated file number.
+ impl->log_number_ = impl->versions_->NewFileNumber();
+ WritableFile* lfile;
+ s = options.env->NewWritableFile(LogFileName(dbname, impl->log_number_),
+ &lfile);
+ if (s.ok()) {
+ impl->logfile_ = lfile;
+ impl->log_ = new log::Writer(lfile);
+ // Persist the edit produced by recovery together with the new log number.
+ s = impl->Install(&edit, impl->log_number_, NULL);
+ }
+ if (s.ok()) {
+ impl->DeleteObsoleteFiles();
+ }
+ }
+ impl->mutex_.Unlock();
+ if (s.ok()) {
+ *dbptr = impl;
+ } else {
+ delete impl;
+ }
+ return s;
+ }
+
+ // Deletes every file in directory "dbname" whose name ParseFileName()
+ // recognizes, then the lock file, and finally tries to remove the directory.
+ // Returns OK if the directory is missing or empty; otherwise returns the
+ // lock failure or the first file deletion error.
+ Status DestroyDB(const std::string& dbname, const Options& options) {
+   Env* env = options.env;
+   std::vector<std::string> filenames;
+   // Ignore error in case directory does not exist
+   env->GetChildren(dbname, &filenames);
+   if (filenames.empty()) {
+     return Status::OK();
+   }
+
+   const std::string lockname = LockFileName(dbname);
+   FileLock* lock = NULL;
+   Status result = env->LockFile(lockname, &lock);
+   if (result.ok()) {
+     uint64_t number;
+     LargeValueRef large_ref;
+     FileType type;
+     for (size_t i = 0; i < filenames.size(); i++) {
+       if (!ParseFileName(filenames[i], &number, &large_ref, &type)) {
+         continue;  // Not one of our files; leave it alone
+       }
+       const std::string path = dbname + "/" + filenames[i];
+       if (path == lockname) {
+         // The lock file is still held; it is deleted after the unlock below.
+         continue;
+       }
+       Status del = env->DeleteFile(path);
+       if (result.ok() && !del.ok()) {
+         result = del;
+       }
+     }
+     env->UnlockFile(lock);  // Ignore error since state is already gone
+     env->DeleteFile(lockname);
+     env->DeleteDir(dbname);  // Ignore error in case dir contains other files
+   }
+   return result;
+ }
+
+}
diff --git a/db/db_impl.h b/db/db_impl.h
new file mode 100644
index 0000000..fc3d3f2
--- /dev/null
+++ b/db/db_impl.h
@@ -0,0 +1,192 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_DB_IMPL_H_
+#define STORAGE_LEVELDB_DB_DB_IMPL_H_
+
+#include <set>
+#include "db/dbformat.h"
+#include "db/log_writer.h"
+#include "db/snapshot.h"
+#include "include/db.h"
+#include "include/env.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+class MemTable;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+
+ // Concrete implementation of the DB interface.
+ class DBImpl : public DB {
+ public:
+ DBImpl(const Options& options, const std::string& dbname);
+ virtual ~DBImpl();
+
+ // Implementations of the DB interface
+ virtual Status Put(const WriteOptions&, const Slice& key, const Slice& value);
+ virtual Status Delete(const WriteOptions&, const Slice& key);
+ virtual Status Write(const WriteOptions& options, WriteBatch* updates);
+ virtual Status Get(const ReadOptions& options,
+ const Slice& key,
+ std::string* value);
+ virtual Iterator* NewIterator(const ReadOptions&);
+ virtual const Snapshot* GetSnapshot();
+ virtual void ReleaseSnapshot(const Snapshot* snapshot);
+ virtual bool GetProperty(const Slice& property, uint64_t* value);
+ virtual void GetApproximateSizes(const Range* range, int n, uint64_t* sizes);
+
+ // Extra methods (for testing) that are not in the public DB interface
+
+ // Compact any files in the named level that overlap [begin,end]
+ void TEST_CompactRange(
+ int level,
+ const std::string& begin,
+ const std::string& end);
+
+ // Force current memtable contents to be compacted.
+ Status TEST_CompactMemTable();
+
+ // Return an internal iterator over the current state of the database.
+ // The keys of this iterator are internal keys (see db/dbformat.h).
+ // The returned iterator should be deleted when no longer needed.
+ Iterator* TEST_NewInternalIterator();
+
+ private:
+ friend class DB;
+
+ // Return a merging iterator over the memtable and the current version.
+ // Stores the last sequence number at creation time in *latest_snapshot.
+ Iterator* NewInternalIterator(const ReadOptions&,
+ SequenceNumber* latest_snapshot);
+
+ Status NewDB();
+
+ // Recover the descriptor from persistent storage. May do a significant
+ // amount of work to recover recently logged updates. Any changes to
+ // be made to the descriptor are added to *edit.
+ Status Recover(VersionEdit* edit);
+
+ // Apply the specified updates and save the resulting descriptor to
+ // persistent storage. If cleanup_mem is non-NULL, arrange to
+ // delete it when all existing snapshots have gone away iff Install()
+ // returns OK.
+ Status Install(VersionEdit* edit,
+ uint64_t new_log_number,
+ MemTable* cleanup_mem);
+
+ void MaybeIgnoreError(Status* s) const;
+
+ // Delete any unneeded files and stale in-memory entries.
+ void DeleteObsoleteFiles();
+
+ // Called when an iterator over a particular version of the
+ // descriptor goes away.
+ static void Unref(void* arg1, void* arg2);
+
+ // Compact the in-memory write buffer to disk. Switches to a new
+ // log-file/memtable and writes a new descriptor iff successful.
+ Status CompactMemTable();
+
+ Status RecoverLogFile(uint64_t log_number,
+ VersionEdit* edit,
+ SequenceNumber* max_sequence);
+
+ Status WriteLevel0Table(MemTable* mem, VersionEdit* edit);
+
+ // Returns true iff "batch" contains a value of at least
+ // options_.large_value_threshold bytes.
+ bool HasLargeValues(const WriteBatch& batch) const;
+
+ // Process data in "*updates" and return a status. "assigned_seq"
+ // is the sequence number assigned to the first mod in "*updates".
+ // If no large values are encountered, "*final" is set to "updates".
+ // If large values were encountered, registers the references of the
+ // large values with the VersionSet, writes the large values to
+ // files (if appropriate), and allocates a new WriteBatch with the
+ // large values replaced with indirect references and stores a
+ // pointer to the new WriteBatch in *final. If *final != updates on
+ // return, then the client should delete *final when no longer
+ // needed. Returns OK on success, and an appropriate error
+ // otherwise.
+ Status HandleLargeValues(SequenceNumber assigned_seq,
+ WriteBatch* updates,
+ WriteBatch** final);
+
+ // Helper routine for HandleLargeValues
+ void MaybeCompressLargeValue(
+ const Slice& raw_value,
+ Slice* file_bytes,
+ std::string* scratch,
+ LargeValueRef* ref);
+
+ // Per-compaction bookkeeping; defined in the implementation file.
+ struct CompactionState;
+
+ void MaybeScheduleCompaction();
+ static void BGWork(void* db);
+ void BackgroundCall();
+ void BackgroundCompaction();
+ void CleanupCompaction(CompactionState* compact);
+ // Runs the compaction described by *compact. Releases mutex_ while the
+ // inputs are merged and reacquires it before installing the results.
+ Status DoCompactionWork(CompactionState* compact);
+
+ Status OpenCompactionOutputFile(CompactionState* compact);
+ Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
+ Status InstallCompactionResults(CompactionState* compact);
+
+ // Constant after construction
+ Env* const env_;
+ const InternalKeyComparator internal_comparator_;
+ const Options options_; // options_.comparator == &internal_comparator_
+ bool owns_info_log_;
+ const std::string dbname_;
+
+ // table_cache_ provides its own synchronization
+ TableCache* table_cache_;
+
+ // Lock over the persistent DB state. Non-NULL iff successfully acquired.
+ FileLock* db_lock_;
+
+ // State below is protected by mutex_
+ port::Mutex mutex_;
+ // Polled with Acquire_Load() by the compaction loop, which runs with
+ // mutex_ released.
+ port::AtomicPointer shutting_down_;
+ port::CondVar bg_cv_; // Signalled when !bg_compaction_scheduled_
+ port::CondVar compacting_cv_; // Signalled when !compacting_
+ // Sequence number of the most recent write.
+ SequenceNumber last_sequence_;
+ MemTable* mem_;
+ WritableFile* logfile_;
+ log::Writer* log_;
+ // File number of the log file currently being written.
+ uint64_t log_number_;
+ SnapshotList snapshots_;
+
+ // Set of table files to protect from deletion because they are
+ // part of ongoing compactions.
+ std::set<uint64_t> pending_outputs_;
+
+ // Has a background compaction been scheduled or is running?
+ bool bg_compaction_scheduled_;
+
+ // Is there a compaction running?
+ bool compacting_;
+
+ VersionSet* versions_;
+
+ // Have we encountered a background error in paranoid mode?
+ Status bg_error_;
+
+ // No copying allowed
+ DBImpl(const DBImpl&);
+ void operator=(const DBImpl&);
+
+ // Comparator over user keys, extracted from the internal key comparator.
+ const Comparator* user_comparator() const {
+ return internal_comparator_.user_comparator();
+ }
+ };
+
+ // Sanitize db options. The caller should delete result.info_log if
+ // it is not equal to src.info_log.
+ // "db" is the database name, "icmp" the internal key comparator and "src"
+ // the user-supplied options; the adjusted options are returned by value.
+ extern Options SanitizeOptions(const std::string& db,
+ const InternalKeyComparator* icmp,
+ const Options& src);
+
+}
+
+#endif // STORAGE_LEVELDB_DB_DB_IMPL_H_
diff --git a/db/db_iter.cc b/db/db_iter.cc
new file mode 100644
index 0000000..c23de22
--- /dev/null
+++ b/db/db_iter.cc
@@ -0,0 +1,412 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_iter.h"
+
+#include "db/filename.h"
+#include "db/dbformat.h"
+#include "include/env.h"
+#include "include/iterator.h"
+#include "port/port.h"
+#include "util/logging.h"
+#include "util/mutexlock.h"
+
+namespace leveldb {
+
+#if 0
+ // Debugging aid: prints every entry of "iter" to stderr, flagging keys
+ // that do not parse as internal keys.
+ static void DumpInternalIter(Iterator* iter) {
+   iter->SeekToFirst();
+   while (iter->Valid()) {
+     ParsedInternalKey parsed;
+     if (ParseInternalKey(iter->key(), &parsed)) {
+       fprintf(stderr, "@ '%s'\n", parsed.DebugString().c_str());
+     } else {
+       fprintf(stderr, "Corrupt '%s'\n", EscapeString(iter->key()).c_str());
+     }
+     iter->Next();
+   }
+ }
+#endif
+
+namespace {
+
+// Memtables and sstables that make the DB representation contain
+// (userkey,seq,type) => uservalue entries. DBIter
+// combines multiple entries for the same userkey found in the DB
+// representation into a single entry while accounting for sequence
+// numbers, deletion markers, overwrites, etc.
+ class DBIter: public Iterator {
+ public:
+ // Wraps "iter", an iterator over internal keys, and exposes the entries
+ // visible at sequence number "s". Takes ownership of "iter" (deleted in
+ // the destructor); "dbname" and "env" are used to locate and open large
+ // value files.
+ DBIter(const std::string* dbname, Env* env,
+ const Comparator* cmp, Iterator* iter, SequenceNumber s)
+ : dbname_(dbname),
+ env_(env),
+ user_comparator_(cmp),
+ iter_(iter),
+ sequence_(s),
+ large_(NULL),
+ valid_(false) {
+ }
+ virtual ~DBIter() {
+ delete iter_;
+ delete large_;
+ }
+ virtual bool Valid() const { return valid_; }
+ virtual Slice key() const {
+ assert(valid_);
+ return key_;
+ }
+ // Returns the current value. For an indirect (large) value the backing
+ // file is read on the first call, under large_->mutex, and cached.
+ virtual Slice value() const {
+ assert(valid_);
+ if (large_ == NULL) {
+ return value_;
+ } else {
+ MutexLock l(&large_->mutex);
+ if (!large_->produced) {
+ ReadIndirectValue();
+ }
+ return large_->value;
+ }
+ }
+
+ virtual void Next() {
+ assert(valid_);
+ // iter_ is already positioned past DBIter::key()
+ FindNextUserEntry();
+ }
+
+ virtual void Prev() {
+ assert(valid_);
+ // First move iter_ before all entries for the current key, then search
+ // backwards for a live entry.
+ bool ignored;
+ ScanUntilBeforeCurrentKey(&ignored);
+ FindPrevUserEntry();
+ }
+
+ // Positions at the first live entry with user key >= target.
+ virtual void Seek(const Slice& target) {
+ // Build the internal key for "target" at this iterator's sequence number.
+ ParsedInternalKey ikey(target, sequence_, kValueTypeForSeek);
+ std::string tmp;
+ AppendInternalKey(&tmp, ikey);
+ iter_->Seek(tmp);
+ FindNextUserEntry();
+ }
+ virtual void SeekToFirst() {
+ iter_->SeekToFirst();
+ FindNextUserEntry();
+ }
+
+ virtual void SeekToLast();
+
+ // Error precedence: status_ first, then the large value read status,
+ // then the underlying iterator's status.
+ virtual Status status() const {
+ if (status_.ok()) {
+ if (large_ != NULL && !large_->status.ok()) return large_->status;
+ return iter_->status();
+ } else {
+ return status_;
+ }
+ }
+
+ private:
+ void FindNextUserEntry();
+ void FindPrevUserEntry();
+ void SaveKey(const Slice& k) { key_.assign(k.data(), k.size()); }
+ void SaveValue(const Slice& v) {
+ // Release the buffer first if it is over 1MB larger than needed, so one
+ // huge value does not pin memory for the iterator's lifetime.
+ if (value_.capacity() > v.size() + 1048576) {
+ std::string empty;
+ swap(empty, value_);
+ }
+ value_.assign(v.data(), v.size());
+ }
+ // Parses iter_->key(); records a corruption in status_ on failure.
+ bool ParseKey(ParsedInternalKey* key);
+ void SkipPast(const Slice& k);
+ void ScanUntilBeforeCurrentKey(bool* found_live);
+
+ // Const because it is called from value(); it mutates only *large_.
+ void ReadIndirectValue() const;
+
+ // State for a lazily read indirect (large) value.
+ struct Large {
+ port::Mutex mutex;
+ std::string value;
+ bool produced; // Set once ReadIndirectValue() has run
+ Status status;
+ };
+
+ const std::string* const dbname_;
+ Env* const env_;
+
+ const Comparator* const user_comparator_;
+
+ // iter_ is positioned just past current entry for DBIter if valid_
+ Iterator* const iter_;
+
+ // Entries with a larger sequence number are invisible to this iterator.
+ SequenceNumber const sequence_;
+ Status status_;
+ std::string key_; // Always a user key
+ std::string value_;
+ Large* large_; // Non-NULL if value is an indirect reference
+ bool valid_;
+
+ // No copying allowed
+ DBIter(const DBIter&);
+ void operator=(const DBIter&);
+ };
+
+ // Parses the key iter_ is positioned at into "*ikey". On failure records a
+ // corruption in status_ and returns false.
+ inline bool DBIter::ParseKey(ParsedInternalKey* ikey) {
+   if (ParseInternalKey(iter_->key(), ikey)) {
+     return true;
+   }
+   status_ = Status::Corruption("corrupted internal key in DBIter");
+   return false;
+ }
+
+ // Starting at iter_'s current position, scans forward for the first entry
+ // visible at sequence_ that is not a deletion. On success saves its user
+ // key and value, leaves iter_ just past all entries for that user key and
+ // sets valid_. If none is found, clears the saved key/value and resets
+ // valid_.
+ void DBIter::FindNextUserEntry() {
+ // Drop state of the previous indirect value, preserving any error it hit.
+ if (large_ != NULL) {
+ if (status_.ok() && !large_->status.ok()) {
+ status_ = large_->status;
+ }
+ delete large_;
+ large_ = NULL;
+ }
+ while (iter_->Valid()) {
+ ParsedInternalKey ikey;
+ if (!ParseKey(&ikey)) {
+ // Skip past corrupted entry
+ iter_->Next();
+ continue;
+ }
+ if (ikey.sequence > sequence_) {
+ // Ignore entries newer than the snapshot
+ iter_->Next();
+ continue;
+ }
+
+ // Each case copies the key (and value) before advancing iter_.
+ switch (ikey.type) {
+ case kTypeDeletion:
+ SaveKey(ikey.user_key); // Make local copy for use by SkipPast()
+ iter_->Next();
+ SkipPast(key_);
+ // Do not return deleted entries. Instead keep looping.
+ break;
+
+ case kTypeValue:
+ SaveKey(ikey.user_key);
+ SaveValue(iter_->value());
+ iter_->Next();
+ SkipPast(key_);
+ // Yield the value we just found.
+ valid_ = true;
+ return;
+
+ case kTypeLargeValueRef:
+ SaveKey(ikey.user_key);
+ // Save the large value ref as value_, and read it lazily on a call
+ // to value()
+ SaveValue(iter_->value());
+ large_ = new Large;
+ large_->produced = false;
+ iter_->Next();
+ SkipPast(key_);
+ // Yield the value we just found.
+ valid_ = true;
+ return;
+ }
+ }
+ // Ran off the end without finding a live entry.
+ valid_ = false;
+ key_.clear();
+ value_.clear();
+ assert(large_ == NULL);
+ }
+
+ // Advances iter_ until it is invalid or positioned at a parseable entry
+ // whose user key differs from "k".
+ void DBIter::SkipPast(const Slice& k) {
+   for (; iter_->Valid(); iter_->Next()) {
+     ParsedInternalKey parsed;
+     // Unparseable entries are stepped over rather than stopping the scan,
+     // so that in a run like
+     //    <x,100,v> => value100
+     //    <corrupted entry for user key x>
+     //    <x,50,v> => value50
+     // both the corrupted entry and value50 are skipped.
+     if (!ParseKey(&parsed)) {
+       continue;
+     }
+     if (user_comparator_->Compare(parsed.user_key, k) != 0) {
+       return;
+     }
+   }
+ }
+
+ // Positions the iterator at the last live user entry, if any.
+ void DBIter::SeekToLast() {
+   // Walk back from the end to the last entry whose key parses, remember
+   // its user key, and let FindPrevUserEntry() locate a live value.
+   ParsedInternalKey last;
+   for (iter_->SeekToLast(); iter_->Valid(); iter_->Prev()) {
+     if (ParseKey(&last)) {
+       break;
+     }
+   }
+   if (iter_->Valid()) {
+     SaveKey(last.user_key);
+   }
+   FindPrevUserEntry();
+ }
+
+ // Let X be the user key at which iter_ is currently positioned.
+ // Adjust DBIter to point at the last entry with a key <= X that
+ // has a live value.
+ // On entry key_ is expected to hold X (callers save it first). If no live
+ // entry is found, the iterator becomes invalid and key_/value_ are cleared.
+ void DBIter::FindPrevUserEntry() {
+ // Consider the following example:
+ //
+ // A@540
+ // A@400
+ //
+ // B@300
+ // B@200
+ // B@100 <- iter_
+ //
+ // C@301
+ // C@201
+ //
+ // The comments marked "(first iteration)" below relate what happens
+ // for the preceding example in the first iteration of the while loop
+ // below. There may be more than one iteration either if there are
+ // no live values for B, or if there is a corruption.
+ while (iter_->Valid()) {
+ // ScanUntilBeforeCurrentKey() overwrites key_ with the preceding user
+ // key, so remember the key being examined.
+ std::string saved = key_;
+ bool found_live;
+ ScanUntilBeforeCurrentKey(&found_live);
+ // (first iteration) iter_ at A@400
+ if (found_live) {
+ // Step forward into range of entries with user key >= saved
+ if (!iter_->Valid()) {
+ iter_->SeekToFirst();
+ } else {
+ iter_->Next();
+ }
+ // (first iteration) iter_ at B@300
+
+ FindNextUserEntry(); // Sets key_ to the key of the next value it found
+ if (valid_ && user_comparator_->Compare(key_, saved) == 0) {
+ // (first iteration) iter_ at C@301
+ return;
+ }
+
+ // FindNextUserEntry() could not find any entries under the
+ // user key "saved". This is probably a corruption since
+ // ScanUntilBeforeCurrentKey() found a live value for "saved". So we
+ // skip backwards to an earlier key and ignore the corrupted
+ // entries for "saved".
+ //
+ // (first iteration) iter_ at C@301 and saved == "B"
+ key_ = saved;
+ bool ignored;
+ ScanUntilBeforeCurrentKey(&ignored);
+ // (first iteration) iter_ at A@400
+ }
+ // Otherwise key_ now names the preceding user key; try again with it.
+ }
+ valid_ = false;
+ key_.clear();
+ value_.clear();
+ }
+
+ // Moves iter_ backwards until it reaches an entry, visible at sequence_,
+ // whose user key is smaller than key_; that key is then saved into key_.
+ // If no such entry exists, iter_ ends up invalid. "*found_live" is set
+ // from the last visible entry seen for key_ during the backward scan:
+ // true if it was a value or large value reference, false if it was a
+ // deletion or no such entry was seen.
+ void DBIter::ScanUntilBeforeCurrentKey(bool* found_live) {
+ *found_live = false;
+ if (!iter_->Valid()) {
+ // An invalid iter_ is treated as "past the end": restart from the last
+ // entry.
+ iter_->SeekToLast();
+ }
+
+ while (iter_->Valid()) {
+ ParsedInternalKey current;
+ if (!ParseKey(&current)) {
+ // Step over entries whose key cannot be parsed.
+ iter_->Prev();
+ continue;
+ }
+
+ if (current.sequence > sequence_) {
+ // Ignore entries that are serialized after this read
+ iter_->Prev();
+ continue;
+ }
+
+ const int cmp = user_comparator_->Compare(current.user_key, key_);
+ if (cmp < 0) {
+ // Reached the preceding user key: remember it and stop here.
+ SaveKey(current.user_key);
+ return;
+ } else if (cmp == 0) {
+ // Each visible entry for key_ overwrites the verdict as the scan moves
+ // backwards, so the last one visited decides.
+ switch (current.type) {
+ case kTypeDeletion:
+ *found_live = false;
+ break;
+
+ case kTypeValue:
+ case kTypeLargeValueRef:
+ *found_live = true;
+ break;
+ }
+ } else { // cmp > 0
+ *found_live = false;
+ }
+
+ iter_->Prev();
+ }
+ }
+
+ // Loads the large value that value_ refers to into large_->value.
+ // value_ must hold an encoded LargeValueRef; the referenced file is read
+ // in full and, if necessary, uncompressed. On any failure large_->value is
+ // cleared and the error is stored in large_->status. Marks the value as
+ // produced up front so the read is attempted only once.
+ void DBIter::ReadIndirectValue() const {
+   assert(!large_->produced);
+   large_->produced = true;
+   LargeValueRef large_ref;
+   if (value_.size() != LargeValueRef::ByteSize()) {
+     large_->status = Status::Corruption("malformed large value reference");
+     return;
+   }
+   memcpy(large_ref.data, value_.data(), LargeValueRef::ByteSize());
+   std::string fname = LargeValueFileName(*dbname_, large_ref);
+   RandomAccessFile* file;
+   Status s = env_->NewRandomAccessFile(fname, &file);
+   if (s.ok()) {
+     uint64_t file_size = file->Size();
+     uint64_t value_size = large_ref.ValueSize();
+     // Read() may write up to file_size bytes into the scratch buffer, so
+     // the buffer is sized by the file, not by the recorded value size
+     // (which can be smaller if the file is corrupt).
+     large_->value.resize(file_size);
+     Slice result;
+     s = file->Read(0, file_size, &result,
+                    const_cast<char*>(large_->value.data()));
+     if (s.ok()) {
+       if (result.size() == file_size) {
+         switch (large_ref.compression_type()) {
+           case kNoCompression: {
+             if (file_size != value_size) {
+               // Stored bytes are the value itself; the sizes must agree.
+               s = Status::Corruption(
+                   "Large value file size does not match reference");
+             } else if (result.data() != large_->value.data()) {
+               // Read() returned its own buffer; copy into ours.
+               large_->value.assign(result.data(), result.size());
+             }
+             break;
+           }
+           case kLightweightCompression: {
+             std::string uncompressed;
+             if (port::Lightweight_Uncompress(result.data(), result.size(),
+                                              &uncompressed) &&
+                 uncompressed.size() == value_size) {
+               swap(uncompressed, large_->value);
+             } else {
+               s = Status::Corruption(
+                   "Unable to read entire compressed large value file");
+             }
+             break;
+           }
+         }
+       } else {
+         s = Status::Corruption("Unable to read entire large value file");
+       }
+     }
+     delete file;  // Ignore errors on closing
+   }
+   if (!s.ok()) {
+     large_->value.clear();
+     large_->status = s;
+   }
+ }
+
+} // anonymous namespace
+
+ // Factory for DBIter, which is private to this file.
+ Iterator* NewDBIterator(
+     const std::string* dbname,
+     Env* env,
+     const Comparator* user_key_comparator,
+     Iterator* internal_iter,
+     const SequenceNumber& sequence) {
+   Iterator* wrapped =
+       new DBIter(dbname, env, user_key_comparator, internal_iter, sequence);
+   return wrapped;
+ }
+
+}
diff --git a/db/db_iter.h b/db/db_iter.h
new file mode 100644
index 0000000..a0be50e
--- /dev/null
+++ b/db/db_iter.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_DB_ITER_H_
+#define STORAGE_LEVELDB_DB_DB_ITER_H_
+
+#include <stdint.h>
+#include "include/db.h"
+#include "db/dbformat.h"
+
+namespace leveldb {
+
+ // Return a new iterator that converts internal keys (yielded by
+ // "*internal_iter") that were live at the specified "sequence" number
+ // into appropriate user keys.
+ // The returned iterator takes ownership of "internal_iter". "dbname" and
+ // "env" are used to read large values stored in separate files, and the
+ // string "*dbname" must outlive the returned iterator.
+ extern Iterator* NewDBIterator(
+ const std::string* dbname,
+ Env* env,
+ const Comparator* user_key_comparator,
+ Iterator* internal_iter,
+ const SequenceNumber& sequence);
+
+}
+
+#endif // STORAGE_LEVELDB_DB_DB_ITER_H_
diff --git a/db/db_test.cc b/db/db_test.cc
new file mode 100644
index 0000000..895a5e1
--- /dev/null
+++ b/db/db_test.cc
@@ -0,0 +1,963 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/db.h"
+
+#include "db/db_impl.h"
+#include "db/filename.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "include/env.h"
+#include "include/table.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+static std::string RandomString(Random* rnd, int len) {
+ std::string r;
+ test::RandomString(rnd, len, &r);
+ return r;
+}
+
+class DBTest {
+ public:
+ std::string dbname_;
+ Env* env_;
+ DB* db_;
+
+ Options last_options_;
+
+ DBTest() : env_(Env::Default()) {
+ dbname_ = test::TmpDir() + "/db_test";
+ DestroyDB(dbname_, Options());
+ db_ = NULL;
+ Reopen();
+ }
+
+ ~DBTest() {
+ delete db_;
+ DestroyDB(dbname_, Options());
+ }
+
+ DBImpl* dbfull() {
+ return reinterpret_cast<DBImpl*>(db_);
+ }
+
+ void Reopen(Options* options = NULL) {
+ ASSERT_OK(TryReopen(options));
+ }
+
+ void DestroyAndReopen(Options* options = NULL) {
+ delete db_;
+ db_ = NULL;
+ DestroyDB(dbname_, Options());
+ ASSERT_OK(TryReopen(options));
+ }
+
+ Status TryReopen(Options* options) {
+ delete db_;
+ db_ = NULL;
+ Options opts;
+ if (options != NULL) {
+ opts = *options;
+ } else {
+ opts.create_if_missing = true;
+ }
+ last_options_ = opts;
+
+ return DB::Open(opts, dbname_, &db_);
+ }
+
+ Status Put(const std::string& k, const std::string& v) {
+ WriteBatch batch;
+ batch.Put(k, v);
+ return db_->Write(WriteOptions(), &batch);
+ }
+
+ Status Delete(const std::string& k) {
+ WriteBatch batch;
+ batch.Delete(k);
+ return db_->Write(WriteOptions(), &batch);
+ }
+
+ std::string Get(const std::string& k, const Snapshot* snapshot = NULL) {
+ ReadOptions options;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ std::string AllEntriesFor(const Slice& user_key) {
+ Iterator* iter = dbfull()->TEST_NewInternalIterator();
+ InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
+ iter->Seek(target.Encode());
+ std::string result;
+ if (!iter->status().ok()) {
+ result = iter->status().ToString();
+ } else {
+ result = "[ ";
+ bool first = true;
+ while (iter->Valid()) {
+ ParsedInternalKey ikey;
+ if (!ParseInternalKey(iter->key(), &ikey)) {
+ result += "CORRUPTED";
+ } else {
+ if (last_options_.comparator->Compare(
+ ikey.user_key, user_key) != 0) {
+ break;
+ }
+ if (!first) {
+ result += ", ";
+ }
+ first = false;
+ switch (ikey.type) {
+ case kTypeValue:
+ result += iter->value().ToString();
+ break;
+ case kTypeLargeValueRef:
+ result += "LARGEVALUE(" + EscapeString(iter->value()) + ")";
+ break;
+ case kTypeDeletion:
+ result += "DEL";
+ break;
+ }
+ }
+ iter->Next();
+ }
+ if (!first) {
+ result += " ";
+ }
+ result += "]";
+ }
+ delete iter;
+ return result;
+ }
+
+ int NumTableFilesAtLevel(int level) {
+ uint64_t val;
+ ASSERT_TRUE(
+ db_->GetProperty("leveldb.num-files-at-level" + NumberToString(level),
+ &val));
+ return val;
+ }
+
+ uint64_t Size(const Slice& start, const Slice& limit) {
+ Range r(start, limit);
+ uint64_t size;
+ db_->GetApproximateSizes(&r, 1, &size);
+ return size;
+ }
+
+ std::set<LargeValueRef> LargeValueFiles() const {
+ // Return the set of large value files that exist in the database
+ std::vector<std::string> filenames;
+ env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose
+ uint64_t number;
+ LargeValueRef large_ref;
+ FileType type;
+ std::set<LargeValueRef> live;
+ for (int i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &large_ref, &type) &&
+ type == kLargeValueFile) {
+ fprintf(stderr, " live: %s\n",
+ LargeValueRefToFilenameString(large_ref).c_str());
+ live.insert(large_ref);
+ }
+ }
+ fprintf(stderr, "Found %d live large value files\n", (int)live.size());
+ return live;
+ }
+};
+
+TEST(DBTest, Empty) {
+ ASSERT_TRUE(db_ != NULL);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+}
+
+TEST(DBTest, ReadWrite) {
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_EQ("v1", Get("foo"));
+ ASSERT_OK(Put("bar", "v2"));
+ ASSERT_OK(Put("foo", "v3"));
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+}
+
+TEST(DBTest, PutDeleteGet) {
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
+ ASSERT_EQ("v1", Get("foo"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
+ ASSERT_EQ("v2", Get("foo"));
+ ASSERT_OK(db_->Delete(WriteOptions(), "foo"));
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+}
+
+TEST(DBTest, Recover) {
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("baz", "v5"));
+
+ Reopen();
+ ASSERT_EQ("v1", Get("foo"));
+
+ ASSERT_EQ("v1", Get("foo"));
+ ASSERT_EQ("v5", Get("baz"));
+ ASSERT_OK(Put("bar", "v2"));
+ ASSERT_OK(Put("foo", "v3"));
+
+ Reopen();
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_OK(Put("foo", "v4"));
+ ASSERT_EQ("v4", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ ASSERT_EQ("v5", Get("baz"));
+}
+
+TEST(DBTest, RecoveryWithEmptyLog) {
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("foo", "v2"));
+ Reopen();
+ Reopen();
+ ASSERT_OK(Put("foo", "v3"));
+ Reopen();
+ ASSERT_EQ("v3", Get("foo"));
+}
+
+static std::string Key(int i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key%06d", i);
+ return std::string(buf);
+}
+
+TEST(DBTest, MinorCompactionsHappen) {
+ Options options;
+ options.write_buffer_size = 10000;
+ Reopen(&options);
+
+ const int N = 100;
+
+ int starting_num_tables = NumTableFilesAtLevel(0);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i), Key(i) + std::string(1000, 'v')));
+ }
+ int ending_num_tables = NumTableFilesAtLevel(0);
+ ASSERT_GT(ending_num_tables, starting_num_tables);
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i)));
+ }
+
+ Reopen();
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(Key(i)));
+ }
+}
+
+TEST(DBTest, RecoverWithLargeLog) {
+ {
+ Options options;
+ options.large_value_threshold = 1048576;
+ Reopen(&options);
+ ASSERT_OK(Put("big1", std::string(200000, '1')));
+ ASSERT_OK(Put("big2", std::string(200000, '2')));
+ ASSERT_OK(Put("small3", std::string(10, '3')));
+ ASSERT_OK(Put("small4", std::string(10, '4')));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ }
+
+ // Make sure that if we re-open with a small write buffer size that
+ // we flush table files in the middle of a large log file.
+ Options options;
+ options.write_buffer_size = 100000;
+ options.large_value_threshold = 1048576;
+ Reopen(&options);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+ ASSERT_EQ(std::string(200000, '1'), Get("big1"));
+ ASSERT_EQ(std::string(200000, '2'), Get("big2"));
+ ASSERT_EQ(std::string(10, '3'), Get("small3"));
+ ASSERT_EQ(std::string(10, '4'), Get("small4"));
+ ASSERT_GT(NumTableFilesAtLevel(0), 1);
+}
+
+TEST(DBTest, CompactionsGenerateMultipleFiles) {
+ Options options;
+ options.write_buffer_size = 100000000; // Large write buffer
+ options.large_value_threshold = 1048576;
+ Reopen(&options);
+
+ Random rnd(301);
+
+ // Write 8MB (80 values, each 100K)
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ std::vector<std::string> values;
+ for (int i = 0; i < 80; i++) {
+ values.push_back(RandomString(&rnd, 100000));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+
+ // Reopening moves updates to level-0
+ Reopen(&options);
+ dbfull()->TEST_CompactRange(0, "", Key(100000));
+
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1), 1);
+ for (int i = 0; i < 80; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+}
+
+static bool Between(uint64_t val, uint64_t low, uint64_t high) {
+ bool result = (val >= low) && (val <= high);
+ if (!result) {
+ fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
+ (unsigned long long)(val),
+ (unsigned long long)(low),
+ (unsigned long long)(high));
+ }
+ return result;
+}
+
+TEST(DBTest, ApproximateSizes) {
+ for (int test = 0; test < 2; test++) {
+ // test==0: default large_value_threshold
+ // test==1: 1 MB large_value_threshold
+ Options options;
+ options.large_value_threshold = (test == 0) ? 65536 : 1048576;
+ options.write_buffer_size = 100000000; // Large write buffer
+ options.compression = kNoCompression;
+ DestroyAndReopen();
+
+ ASSERT_TRUE(Between(Size("", "xyz"), 0, 0));
+ Reopen(&options);
+ ASSERT_TRUE(Between(Size("", "xyz"), 0, 0));
+
+ // Write 8MB (80 values, each 100K)
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ const int N = 80;
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i), RandomString(&rnd, 100000)));
+ }
+ if (test == 1) {
+ // 0 because GetApproximateSizes() does not account for memtable space for
+ // non-large values
+ ASSERT_TRUE(Between(Size("", Key(50)), 0, 0));
+ } else {
+ ASSERT_TRUE(Between(Size("", Key(50)), 100000*50, 100000*50 + 10000));
+ ASSERT_TRUE(Between(Size(Key(20), Key(30)),
+ 100000*10, 100000*10 + 10000));
+ }
+
+ // Check sizes across recovery by reopening a few times
+ for (int run = 0; run < 3; run++) {
+ Reopen(&options);
+
+ for (int compact_start = 0; compact_start < N; compact_start += 10) {
+ for (int i = 0; i < N; i += 10) {
+ ASSERT_TRUE(Between(Size("", Key(i)), 100000*i, 100000*i + 10000));
+ ASSERT_TRUE(Between(Size("", Key(i)+".suffix"),
+ 100000 * (i+1), 100000 * (i+1) + 10000));
+ ASSERT_TRUE(Between(Size(Key(i), Key(i+10)),
+ 100000 * 10, 100000 * 10 + 10000));
+ }
+ ASSERT_TRUE(Between(Size("", Key(50)), 5000000, 5010000));
+ ASSERT_TRUE(Between(Size("", Key(50)+".suffix"), 5100000, 5110000));
+
+ dbfull()->TEST_CompactRange(0,
+ Key(compact_start),
+ Key(compact_start + 9));
+ }
+
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ }
+ }
+}
+
+TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
+ Options options;
+ options.large_value_threshold = 65536;
+ options.compression = kNoCompression;
+ Reopen();
+
+ Random rnd(301);
+ std::string big1 = RandomString(&rnd, 100000);
+ ASSERT_OK(Put(Key(0), RandomString(&rnd, 10000)));
+ ASSERT_OK(Put(Key(1), RandomString(&rnd, 10000)));
+ ASSERT_OK(Put(Key(2), big1));
+ ASSERT_OK(Put(Key(3), RandomString(&rnd, 10000)));
+ ASSERT_OK(Put(Key(4), big1));
+ ASSERT_OK(Put(Key(5), RandomString(&rnd, 10000)));
+ ASSERT_OK(Put(Key(6), RandomString(&rnd, 300000)));
+ ASSERT_OK(Put(Key(7), RandomString(&rnd, 10000)));
+
+ // Check sizes across recovery by reopening a few times
+ for (int run = 0; run < 3; run++) {
+ Reopen(&options);
+
+ ASSERT_TRUE(Between(Size("", Key(0)), 0, 0));
+ ASSERT_TRUE(Between(Size("", Key(1)), 10000, 11000));
+ ASSERT_TRUE(Between(Size("", Key(2)), 20000, 21000));
+ ASSERT_TRUE(Between(Size("", Key(3)), 120000, 121000));
+ ASSERT_TRUE(Between(Size("", Key(4)), 130000, 131000));
+ ASSERT_TRUE(Between(Size("", Key(5)), 230000, 231000));
+ ASSERT_TRUE(Between(Size("", Key(6)), 240000, 241000));
+ ASSERT_TRUE(Between(Size("", Key(7)), 540000, 541000));
+ ASSERT_TRUE(Between(Size("", Key(8)), 550000, 551000));
+
+ ASSERT_TRUE(Between(Size(Key(3), Key(5)), 110000, 111000));
+
+ dbfull()->TEST_CompactRange(0, Key(0), Key(100));
+ }
+}
+
+TEST(DBTest, IteratorPinsRef) {
+ Put("foo", "hello");
+
+ // Get iterator that will yield the current contents of the DB.
+ Iterator* iter = db_->NewIterator(ReadOptions());
+
+ // Write to force compactions
+ Put("foo", "newvalue1");
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(i), Key(i) + std::string(100000, 'v'))); // 100K values
+ }
+ Put("foo", "newvalue2");
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("hello", iter->value().ToString());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ delete iter;
+}
+
+TEST(DBTest, Snapshot) {
+ Put("foo", "v1");
+ const Snapshot* s1 = db_->GetSnapshot();
+ Put("foo", "v2");
+ const Snapshot* s2 = db_->GetSnapshot();
+ Put("foo", "v3");
+ const Snapshot* s3 = db_->GetSnapshot();
+
+ Put("foo", "v4");
+ ASSERT_EQ("v1", Get("foo", s1));
+ ASSERT_EQ("v2", Get("foo", s2));
+ ASSERT_EQ("v3", Get("foo", s3));
+ ASSERT_EQ("v4", Get("foo"));
+
+ db_->ReleaseSnapshot(s3);
+ ASSERT_EQ("v1", Get("foo", s1));
+ ASSERT_EQ("v2", Get("foo", s2));
+ ASSERT_EQ("v4", Get("foo"));
+
+ db_->ReleaseSnapshot(s1);
+ ASSERT_EQ("v2", Get("foo", s2));
+ ASSERT_EQ("v4", Get("foo"));
+
+ db_->ReleaseSnapshot(s2);
+ ASSERT_EQ("v4", Get("foo"));
+}
+
+TEST(DBTest, HiddenValuesAreRemoved) {
+ Random rnd(301);
+ std::string big = RandomString(&rnd, 50000);
+ Put("foo", big);
+ Put("pastfoo", "v");
+ const Snapshot* snapshot = db_->GetSnapshot();
+ Put("foo", "tiny");
+ Put("pastfoo2", "v2"); // Advance sequence number one more
+
+ ASSERT_OK(dbfull()->TEST_CompactMemTable());
+ ASSERT_GT(NumTableFilesAtLevel(0), 0);
+
+ ASSERT_EQ(big, Get("foo", snapshot));
+ ASSERT_TRUE(Between(Size("", "pastfoo"), 50000, 60000));
+ db_->ReleaseSnapshot(snapshot);
+ ASSERT_EQ(AllEntriesFor("foo"), "[ tiny, " + big + " ]");
+ dbfull()->TEST_CompactRange(0, "", "x");
+ ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_GE(NumTableFilesAtLevel(1), 1);
+ dbfull()->TEST_CompactRange(1, "", "x");
+ ASSERT_EQ(AllEntriesFor("foo"), "[ tiny ]");
+
+ ASSERT_TRUE(Between(Size("", "pastfoo"), 0, 1000));
+}
+
+TEST(DBTest, DeletionMarkers1) {
+ Put("foo", "v1");
+ ASSERT_OK(dbfull()->TEST_CompactMemTable());
+ dbfull()->TEST_CompactRange(0, "", "z");
+ dbfull()->TEST_CompactRange(1, "", "z");
+ ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file
+ Delete("foo");
+ Put("foo", "v2");
+ ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
+ ASSERT_OK(dbfull()->TEST_CompactMemTable());
+ ASSERT_EQ(AllEntriesFor("foo"), "[ v2, DEL, v1 ]");
+ dbfull()->TEST_CompactRange(0, "", "z");
+ // DEL eliminated, but v1 remains because we aren't compacting that level
+ // (DEL can be eliminated because v2 hides v1).
+ ASSERT_EQ(AllEntriesFor("foo"), "[ v2, v1 ]");
+ dbfull()->TEST_CompactRange(1, "", "z");
+ // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed.
+ // (as is v1).
+ ASSERT_EQ(AllEntriesFor("foo"), "[ v2 ]");
+}
+
+TEST(DBTest, DeletionMarkers2) {
+ Put("foo", "v1");
+ ASSERT_OK(dbfull()->TEST_CompactMemTable());
+ dbfull()->TEST_CompactRange(0, "", "z");
+ dbfull()->TEST_CompactRange(1, "", "z");
+ ASSERT_EQ(NumTableFilesAtLevel(2), 1); // foo => v1 is now in level 2 file
+ Delete("foo");
+ ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+ ASSERT_OK(dbfull()->TEST_CompactMemTable());
+ ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+ dbfull()->TEST_CompactRange(0, "", "z");
+ // DEL kept: L2 file overlaps
+ ASSERT_EQ(AllEntriesFor("foo"), "[ DEL, v1 ]");
+ dbfull()->TEST_CompactRange(1, "", "z");
+ // Merging L1 w/ L2, so we are the base level for "foo", so DEL is removed.
+ // (as is v1).
+ ASSERT_EQ(AllEntriesFor("foo"), "[ ]");
+}
+
+TEST(DBTest, ComparatorCheck) {
+ class NewComparator : public Comparator {
+ public:
+ virtual const char* Name() const { return "leveldb.NewComparator"; }
+ virtual int Compare(const Slice& a, const Slice& b) const {
+ return BytewiseComparator()->Compare(a, b);
+ }
+ virtual void FindShortestSeparator(std::string* s, const Slice& l) const {
+ BytewiseComparator()->FindShortestSeparator(s, l);
+ }
+ virtual void FindShortSuccessor(std::string* key) const {
+ BytewiseComparator()->FindShortSuccessor(key);
+ }
+ };
+ NewComparator cmp;
+ Options new_options;
+ new_options.comparator = &cmp;
+ Status s = TryReopen(&new_options);
+ ASSERT_TRUE(!s.ok());
+ ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
+ << s.ToString();
+}
+
+static bool LargeValuesOK(DBTest* db,
+ const std::set<LargeValueRef>& expected) {
+ std::set<LargeValueRef> actual = db->LargeValueFiles();
+ if (actual.size() != expected.size()) {
+ fprintf(stderr, "Sets differ in size: %d vs %d\n",
+ (int)actual.size(), (int)expected.size());
+ return false;
+ }
+ for (std::set<LargeValueRef>::const_iterator it = expected.begin();
+ it != expected.end();
+ ++it) {
+ if (actual.count(*it) != 1) {
+ fprintf(stderr, " key '%s' not found in actual set\n",
+ LargeValueRefToFilenameString(*it).c_str());
+ return false;
+ }
+ }
+ return true;
+}
+
+TEST(DBTest, LargeValues1) {
+ Options options;
+ options.large_value_threshold = 10000;
+ Reopen(&options);
+
+ Random rnd(301);
+
+ std::string big1;
+ test::CompressibleString(&rnd, 1.0, 100000, &big1); // Not compressible
+ std::set<LargeValueRef> expected;
+
+ ASSERT_OK(Put("big1", big1));
+ expected.insert(LargeValueRef::Make(big1, kNoCompression));
+ ASSERT_TRUE(LargeValuesOK(this, expected));
+
+ ASSERT_OK(Delete("big1"));
+ ASSERT_TRUE(LargeValuesOK(this, expected));
+ ASSERT_OK(dbfull()->TEST_CompactMemTable());
+ // No handling of deletion markers on memtable compactions, so big1 remains
+ ASSERT_TRUE(LargeValuesOK(this, expected));
+
+ dbfull()->TEST_CompactRange(0, "", "z");
+ expected.erase(LargeValueRef::Make(big1, kNoCompression));
+ ASSERT_TRUE(LargeValuesOK(this, expected));
+}
+
+TEST(DBTest, LargeValues2) {
+ Options options;
+ options.large_value_threshold = 10000;
+ Reopen(&options);
+
+ Random rnd(301);
+
+ std::string big1, big2;
+ test::CompressibleString(&rnd, 1.0, 20000, &big1); // Not compressible
+ test::CompressibleString(&rnd, 0.6, 40000, &big2); // Compressible
+ std::set<LargeValueRef> expected;
+ ASSERT_TRUE(LargeValuesOK(this, expected));
+
+ ASSERT_OK(Put("big1", big1));
+ expected.insert(LargeValueRef::Make(big1, kNoCompression));
+ ASSERT_EQ(big1, Get("big1"));
+ ASSERT_TRUE(LargeValuesOK(this, expected));
+
+ ASSERT_OK(Put("big2", big2));
+ ASSERT_EQ(big2, Get("big2"));
+#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_CHROMIUM)
+ // TODO(sanjay) Reenable after compression support is added
+ expected.insert(LargeValueRef::Make(big2, kNoCompression));
+#else
+ expected.insert(LargeValueRef::Make(big2, kLightweightCompression));
+#endif
+ ASSERT_TRUE(LargeValuesOK(this, expected));
+
+ ASSERT_OK(dbfull()->TEST_CompactMemTable());
+ ASSERT_TRUE(LargeValuesOK(this, expected));
+
+ dbfull()->TEST_CompactRange(0, "", "z");
+ ASSERT_TRUE(LargeValuesOK(this, expected));
+
+ ASSERT_OK(Put("big2", big2));
+ ASSERT_OK(Put("big2_b", big2));
+ ASSERT_EQ(big1, Get("big1"));
+ ASSERT_EQ(big2, Get("big2"));
+ ASSERT_EQ(big2, Get("big2_b"));
+ ASSERT_TRUE(LargeValuesOK(this, expected));
+
+ ASSERT_OK(Delete("big1"));
+ ASSERT_EQ("NOT_FOUND", Get("big1"));
+ ASSERT_TRUE(LargeValuesOK(this, expected));
+
+ ASSERT_OK(dbfull()->TEST_CompactMemTable());
+ ASSERT_TRUE(LargeValuesOK(this, expected));
+ dbfull()->TEST_CompactRange(0, "", "z");
+ expected.erase(LargeValueRef::Make(big1, kNoCompression));
+ ASSERT_TRUE(LargeValuesOK(this, expected));
+ dbfull()->TEST_CompactRange(1, "", "z");
+
+ ASSERT_OK(Delete("big2"));
+ ASSERT_EQ("NOT_FOUND", Get("big2"));
+ ASSERT_EQ(big2, Get("big2_b"));
+ ASSERT_OK(dbfull()->TEST_CompactMemTable());
+ ASSERT_TRUE(LargeValuesOK(this, expected));
+ dbfull()->TEST_CompactRange(0, "", "z");
+ ASSERT_TRUE(LargeValuesOK(this, expected));
+
+ // Make sure the large value refs survive a reload and compactions after
+ // the reload.
+ Reopen();
+ ASSERT_TRUE(LargeValuesOK(this, expected));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(dbfull()->TEST_CompactMemTable());
+ dbfull()->TEST_CompactRange(0, "", "z");
+ ASSERT_TRUE(LargeValuesOK(this, expected));
+}
+
+TEST(DBTest, LargeValues3) {
+  // Make sure we don't compress large values when compression is disabled
+  // (options.compression == kNoCompression).
+  Options options;
+  options.large_value_threshold = 10000;
+  options.compression = kNoCompression;
+  Reopen(&options);
+
+  std::string big1 = std::string(100000, 'x');  // Very compressible
+  std::set<LargeValueRef> expected;
+
+  ASSERT_OK(Put("big1", big1));
+  ASSERT_EQ(big1, Get("big1"));
+  // The ref must be tagged kNoCompression even though big1 compresses well.
+  expected.insert(LargeValueRef::Make(big1, kNoCompression));
+  ASSERT_TRUE(LargeValuesOK(this, expected));
+}
+
+
+TEST(DBTest, DBOpen_Options) {
+ std::string dbname = test::TmpDir() + "/db_options_test";
+ DestroyDB(dbname, Options());
+
+ // Does not exist, and create_if_missing == false: error
+ DB* db = NULL;
+ Options opts;
+ opts.create_if_missing = false;
+ Status s = DB::Open(opts, dbname, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != NULL);
+ ASSERT_TRUE(db == NULL);
+
+ // Does not exist, and create_if_missing == true: OK
+ opts.create_if_missing = true;
+ s = DB::Open(opts, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != NULL);
+
+ delete db;
+ db = NULL;
+
+ // Does exist, and error_if_exists == true: error
+ opts.create_if_missing = false;
+ opts.error_if_exists = true;
+ s = DB::Open(opts, dbname, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != NULL);
+ ASSERT_TRUE(db == NULL);
+
+ // Does exist, and error_if_exists == false: OK
+ opts.create_if_missing = true;
+ opts.error_if_exists = false;
+ s = DB::Open(opts, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != NULL);
+
+ delete db;
+ db = NULL;
+}
+
+class ModelDB: public DB {
+ public:
+ explicit ModelDB(const Options& options): options_(options) { }
+ ~ModelDB() { }
+ virtual Status Put(const WriteOptions& o, const Slice& k, const Slice& v) {
+ return DB::Put(o, k, v);
+ }
+ virtual Status Delete(const WriteOptions& o, const Slice& key) {
+ return DB::Delete(o, key);
+ }
+ virtual Status Get(const ReadOptions& options,
+ const Slice& key, std::string* value) {
+ assert(false); // Not implemented
+ return Status::NotFound(key);
+ }
+ virtual Iterator* NewIterator(const ReadOptions& options) {
+ if (options.snapshot == NULL) {
+ KVMap* saved = new KVMap;
+ *saved = map_;
+ return new ModelIter(saved, true);
+ } else {
+ const KVMap* snapshot_state =
+ reinterpret_cast<const KVMap*>(options.snapshot->number_);
+ return new ModelIter(snapshot_state, false);
+ }
+ }
+ virtual const Snapshot* GetSnapshot() {
+ KVMap* saved = new KVMap;
+ *saved = map_;
+ return snapshots_.New(
+ reinterpret_cast<SequenceNumber>(saved));
+ }
+
+ virtual void ReleaseSnapshot(const Snapshot* snapshot) {
+ const KVMap* saved = reinterpret_cast<const KVMap*>(snapshot->number_);
+ delete saved;
+ snapshots_.Delete(snapshot);
+ }
+ virtual Status Write(const WriteOptions& options, WriteBatch* batch) {
+ assert(options.post_write_snapshot == NULL); // Not supported
+ for (WriteBatchInternal::Iterator it(*batch); !it.Done(); it.Next()) {
+ switch (it.op()) {
+ case kTypeValue:
+ map_[it.key().ToString()] = it.value().ToString();
+ break;
+ case kTypeLargeValueRef:
+ assert(false); // Should not occur
+ break;
+ case kTypeDeletion:
+ map_.erase(it.key().ToString());
+ break;
+ }
+ }
+ return Status::OK();
+ }
+
+ virtual bool GetProperty(const Slice& property, uint64_t* value) {
+ return false;
+ }
+ virtual void GetApproximateSizes(const Range* r, int n, uint64_t* sizes) {
+ for (int i = 0; i < n; i++) {
+ sizes[i] = 0;
+ }
+ }
+ private:
+ typedef std::map<std::string, std::string> KVMap;
+ class ModelIter: public Iterator {
+ public:
+ ModelIter(const KVMap* map, bool owned)
+ : map_(map), owned_(owned), iter_(map_->end()) {
+ }
+ ~ModelIter() {
+ if (owned_) delete map_;
+ }
+ virtual bool Valid() const { return iter_ != map_->end(); }
+ virtual void SeekToFirst() { iter_ = map_->begin(); }
+ virtual void SeekToLast() {
+ if (map_->empty()) {
+ iter_ = map_->end();
+ } else {
+ iter_ = map_->find(map_->rbegin()->first);
+ }
+ }
+ virtual void Seek(const Slice& k) {
+ iter_ = map_->lower_bound(k.ToString());
+ }
+ virtual void Next() { ++iter_; }
+ virtual void Prev() { --iter_; }
+ virtual Slice key() const { return iter_->first; }
+ virtual Slice value() const { return iter_->second; }
+ virtual Status status() const { return Status::OK(); }
+ private:
+ const KVMap* const map_;
+ const bool owned_; // Do we own map_
+ KVMap::const_iterator iter_;
+ };
+ const Options options_;
+ KVMap map_;
+ SnapshotList snapshots_;
+};
+
+static std::string RandomKey(Random* rnd) {
+ int len = (rnd->OneIn(3)
+ ? 1 // Short sometimes to encourage collisions
+ : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10)));
+ return test::RandomKey(rnd, len);
+}
+
+// Walks "model" and "db" in lockstep, reading each through its own snapshot,
+// and returns true iff both yield the same key/value sequence (logs to stderr).
+static bool CompareIterators(int step,
+                             DB* model,
+                             DB* db,
+                             const Snapshot* model_snap,
+                             const Snapshot* db_snap) {
+  ReadOptions options;
+  options.snapshot = model_snap;
+  Iterator* miter = model->NewIterator(options);
+  options.snapshot = db_snap;
+  Iterator* dbiter = db->NewIterator(options);
+  bool ok = true;
+  int count = 0;
+  for (miter->SeekToFirst(), dbiter->SeekToFirst();
+       ok && miter->Valid() && dbiter->Valid();
+       miter->Next(), dbiter->Next()) {
+    count++;
+    if (miter->key().compare(dbiter->key()) != 0) {
+      fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n",
+              step,
+              EscapeString(miter->key()).c_str(),
+              EscapeString(dbiter->key()).c_str());
+      ok = false;
+      break;
+    }
+
+    if (miter->value().compare(dbiter->value()) != 0) {
+      fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n",
+              step,
+              EscapeString(miter->key()).c_str(),
+              EscapeString(miter->value()).c_str(),
+              EscapeString(dbiter->value()).c_str());
+      ok = false;
+    }
+  }
+
+  if (ok && miter->Valid() != dbiter->Valid()) {
+    fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n",
+            step, miter->Valid(), dbiter->Valid());
+    ok = false;
+  }
+  fprintf(stderr, "%d entries compared: ok=%d\n", count, ok);
+  delete miter;
+  delete dbiter;
+  return ok;
+}
+
+TEST(DBTest, Randomized) {
+ Random rnd(test::RandomSeed());
+ ModelDB model(last_options_);
+ const int N = 10000;
+ const Snapshot* model_snap = NULL;
+ const Snapshot* db_snap = NULL;
+ std::string k, v;
+ for (int step = 0; step < N; step++) {
+ if (step % 100 == 0) {
+ fprintf(stderr, "Step %d of %d\n", step, N);
+ }
+ int p = rnd.Uniform(100);
+ if (p < 45) { // Put
+ k = RandomKey(&rnd);
+ v = RandomString(&rnd,
+ rnd.OneIn(20)
+ ? 100 + rnd.Uniform(100)
+ : rnd.Uniform(8));
+ ASSERT_OK(model.Put(WriteOptions(), k, v));
+ ASSERT_OK(db_->Put(WriteOptions(), k, v));
+
+ } else if (p < 90) { // Delete
+ k = RandomKey(&rnd);
+ ASSERT_OK(model.Delete(WriteOptions(), k));
+ ASSERT_OK(db_->Delete(WriteOptions(), k));
+
+
+ } else { // Multi-element batch
+ WriteBatch b;
+ const int num = rnd.Uniform(8);
+ for (int i = 0; i < num; i++) {
+ if (i == 0 || !rnd.OneIn(10)) {
+ k = RandomKey(&rnd);
+ } else {
+ // Periodically re-use the same key from the previous iter, so
+ // we have multiple entries in the write batch for the same key
+ }
+ if (rnd.OneIn(2)) {
+ v = RandomString(&rnd, rnd.Uniform(10));
+ b.Put(k, v);
+ } else {
+ b.Delete(k);
+ }
+ }
+ ASSERT_OK(model.Write(WriteOptions(), &b));
+ ASSERT_OK(db_->Write(WriteOptions(), &b));
+ }
+
+ if ((step % 100) == 0) {
+ ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL));
+ ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+ // Save a snapshot from each DB this time that we'll use next
+ // time we compare things, to make sure the current state is
+ // preserved with the snapshot
+ if (model_snap != NULL) model.ReleaseSnapshot(model_snap);
+ if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
+
+ Reopen();
+ ASSERT_TRUE(CompareIterators(step, &model, db_, NULL, NULL));
+
+ model_snap = model.GetSnapshot();
+ db_snap = db_->GetSnapshot();
+ }
+ }
+ if (model_snap != NULL) model.ReleaseSnapshot(model_snap);
+ if (db_snap != NULL) db_->ReleaseSnapshot(db_snap);
+}
+
+}
+
+int main(int argc, char** argv) {
+ return leveldb::test::RunAllTests();
+}
diff --git a/db/dbformat.cc b/db/dbformat.cc
new file mode 100644
index 0000000..f09a729
--- /dev/null
+++ b/db/dbformat.cc
@@ -0,0 +1,152 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+static uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
+ assert(seq <= kMaxSequenceNumber);
+ assert(t <= kValueTypeForSeek);
+ return (seq << 8) | t;
+}
+
+void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
+ result->append(key.user_key.data(), key.user_key.size());
+ PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
+}
+
+std::string ParsedInternalKey::DebugString() const {
+ char buf[50];
+ snprintf(buf, sizeof(buf), "' @ %llu : %d",
+ (unsigned long long) sequence,
+ int(type));
+ std::string result = "'";
+ result += user_key.ToString();
+ result += buf;
+ return result;
+}
+
+const char* InternalKeyComparator::Name() const {
+ return "leveldb.InternalKeyComparator";
+}
+
+int InternalKeyComparator::Compare(const Slice& akey, const Slice& bkey) const {
+ // Order by:
+ // increasing user key (according to user-supplied comparator)
+ // decreasing sequence number
+ // decreasing type (though sequence# should be enough to disambiguate)
+ int r = user_comparator_->Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
+ if (r == 0) {
+ const uint64_t anum = DecodeFixed64(akey.data() + akey.size() - 8);
+ const uint64_t bnum = DecodeFixed64(bkey.data() + bkey.size() - 8);
+ if (anum > bnum) {
+ r = -1;
+ } else if (anum < bnum) {
+ r = +1;
+ }
+ }
+ return r;
+}
+
+// If the user comparator can shorten the user-key part of "*start" (keeping it
+// below "limit"), replaces "*start" with the resulting internal key.
+void InternalKeyComparator::FindShortestSeparator(std::string* start,
+                                                  const Slice& limit) const {
+  Slice user_start = ExtractUserKey(*start);
+  Slice user_limit = ExtractUserKey(limit);
+  std::string tmp(user_start.data(), user_start.size());
+  user_comparator_->FindShortestSeparator(&tmp, user_limit);
+  // Compare user keys only: "*start" still carries the 8-byte seq/type trailer.
+  if (user_comparator_->Compare(user_start, tmp) < 0) {
+    // User key has become larger; tack on the earliest possible number.
+    PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
+    assert(this->Compare(*start, tmp) < 0);
+    assert(this->Compare(tmp, limit) < 0);
+    start->swap(tmp);
+  }
+}
+
+void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
+ Slice user_key = ExtractUserKey(*key);
+ std::string tmp(user_key.data(), user_key.size());
+ user_comparator_->FindShortSuccessor(&tmp);
+ if (user_comparator_->Compare(user_key, tmp) < 0) {
+ // User key has become larger. Tack on the earliest possible
+ // number to the shortened user key.
+ PutFixed64(&tmp, PackSequenceAndType(kMaxSequenceNumber,kValueTypeForSeek));
+ assert(this->Compare(*key, tmp) < 0);
+ key->swap(tmp);
+ }
+}
+
+LargeValueRef LargeValueRef::Make(const Slice& value, CompressionType ctype) {
+ LargeValueRef result;
+ port::SHA1_Hash(value.data(), value.size(), &result.data[0]);
+ EncodeFixed64(&result.data[20], value.size());
+ result.data[28] = static_cast<unsigned char>(ctype);
+ return result;
+}
+
+std::string LargeValueRefToFilenameString(const LargeValueRef& h) {
+ assert(sizeof(h.data) == LargeValueRef::ByteSize());
+ assert(sizeof(h.data) == 29); // So we can hardcode the array size of buf
+ static const char tohex[] = "0123456789abcdef";
+ char buf[20*2];
+ for (int i = 0; i < 20; i++) {
+ buf[2*i] = tohex[(h.data[i] >> 4) & 0xf];
+ buf[2*i+1] = tohex[h.data[i] & 0xf];
+ }
+ std::string result = std::string(buf, sizeof(buf));
+ result += "-";
+ result += NumberToString(h.ValueSize());
+ result += "-";
+ result += NumberToString(static_cast<uint64_t>(h.compression_type()));
+ return result;
+}
+
+static uint32_t hexvalue(char c) {
+ if (c >= '0' && c <= '9') {
+ return c - '0';
+ } else if (c >= 'A' && c <= 'F') {
+ return 10 + c - 'A';
+ } else {
+ assert(c >= 'a' && c <= 'f');
+ return 10 + c - 'a';
+ }
+}
+
+// Parses "<40 hex digits>-<value size>-<compression type>" from "s" into "*h".
+// Returns false on malformed input; "*h" may be partially overwritten then.
+bool FilenameStringToLargeValueRef(const Slice& s, LargeValueRef* h) {
+  Slice in = s;
+  if (in.size() < 40) return false;
+  for (int i = 0; i < 20; i++) {
+    // Cast first: isxdigit() is undefined for negative (high-bit) char values.
+    const unsigned char hi = static_cast<unsigned char>(in[i*2]);
+    const unsigned char lo = static_cast<unsigned char>(in[i*2+1]);
+    if (!isxdigit(hi) || !isxdigit(lo)) {
+      return false;
+    }
+    h->data[i] = static_cast<unsigned char>((hexvalue(hi)<<4) | hexvalue(lo));
+  }
+  in.remove_prefix(40);
+  uint64_t value_size, ctype;
+  if (!ConsumeChar(&in, '-') ||
+      !ConsumeDecimalNumber(&in, &value_size) ||
+      !ConsumeChar(&in, '-') ||
+      !ConsumeDecimalNumber(&in, &ctype) ||
+      !in.empty() ||
+      ctype > kLightweightCompression) {
+    return false;
+  }
+  EncodeFixed64(&h->data[20], value_size);
+  h->data[28] = static_cast<unsigned char>(ctype);
+  return true;
+}
+
+}
diff --git a/db/dbformat.h b/db/dbformat.h
new file mode 100644
index 0000000..e784457
--- /dev/null
+++ b/db/dbformat.h
@@ -0,0 +1,198 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_FORMAT_H_
+#define STORAGE_LEVELDB_DB_FORMAT_H_
+
+#include <stdio.h>
+#include "include/comparator.h"
+#include "include/db.h"
+#include "include/slice.h"
+#include "include/table_builder.h"
+#include "util/coding.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+class InternalKey;
+
+// Value types encoded as the last component of internal keys.
+// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
+// data structures.
+enum ValueType {
+ // NOTE(review): presumably marks the key as deleted -- confirm against
+ // the write path.
+ kTypeDeletion = 0x0,
+ // NOTE(review): presumably an ordinary value stored inline -- confirm.
+ kTypeValue = 0x1,
+ // The stored value is the raw binary form of a LargeValueRef.
+ // This must stay the highest-numbered type: kValueTypeForSeek and
+ // ParseInternalKey() both use it as the upper bound.
+ kTypeLargeValueRef = 0x2,
+};
+// kValueTypeForSeek defines the ValueType that should be passed when
+// constructing a ParsedInternalKey object for seeking to a particular
+// sequence number (since we sort sequence numbers in decreasing order
+// and the value type is embedded as the low 8 bits in the sequence
+// number in internal keys, we need to use the highest-numbered
+// ValueType, not the lowest).
+static const ValueType kValueTypeForSeek = kTypeLargeValueRef;
+
+typedef uint64_t SequenceNumber;
+
+// We leave eight bits empty at the bottom so a type and sequence#
+// can be packed together into 64-bits.
+static const SequenceNumber kMaxSequenceNumber =
+ ((0x1ull << 56) - 1);
+
+// Decoded form of an internal key: the user key plus the sequence
+// number and value type that are packed into the key's 8-byte trailer.
+struct ParsedInternalKey {
+ // Everything in the internal key except the trailing 8 bytes.
+ Slice user_key;
+ // Upper 56 bits of the trailer.
+ SequenceNumber sequence;
+ // Low 8 bits of the trailer.
+ ValueType type;
+
+ ParsedInternalKey() { } // Intentionally left uninitialized (for speed)
+ ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
+ : user_key(u), sequence(seq), type(t) { }
+ // Human-readable rendering; defined outside this header.
+ std::string DebugString() const;
+};
+
+// Return the number of bytes the encoding of "key" occupies: the user
+// key followed by an 8-byte trailer.
+inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
+  const size_t user_key_bytes = key.user_key.size();
+  return user_key_bytes + 8;
+}
+
+// Append the serialization of "key" to *result.
+extern void AppendInternalKey(std::string* result,
+ const ParsedInternalKey& key);
+
+// Attempt to parse an internal key from "internal_key". On success,
+// stores the parsed data in "*result", and returns true.
+//
+// On error, returns false, leaves "*result" in an undefined state.
+extern bool ParseInternalKey(const Slice& internal_key,
+ ParsedInternalKey* result);
+
+// Returns the user key portion of an internal key, i.e. everything
+// except the trailing 8 bytes.  "internal_key" must be at least 8 bytes.
+inline Slice ExtractUserKey(const Slice& internal_key) {
+  const size_t total = internal_key.size();
+  assert(total >= 8);
+  return Slice(internal_key.data(), total - 8);
+}
+
+// Returns the value type stored in the low 8 bits of the 8-byte trailer
+// of "internal_key".  The result is not range-checked.
+inline ValueType ExtractValueType(const Slice& internal_key) {
+  assert(internal_key.size() >= 8);
+  const char* trailer = internal_key.data() + internal_key.size() - 8;
+  const uint64_t packed = DecodeFixed64(trailer);
+  const unsigned char type_byte = packed & 0xff;
+  return static_cast<ValueType>(type_byte);
+}
+
+// A comparator for internal keys that uses a specified comparator for
+// the user key portion and breaks ties by decreasing sequence number.
+class InternalKeyComparator : public Comparator {
+ private:
+ // Comparator applied to the user-key portion of each internal key.
+ // NOTE(review): nothing in this class deletes it, so it is presumably
+ // owned by the caller -- confirm.
+ const Comparator* user_comparator_;
+ public:
+ explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) { }
+ // The four virtuals below are declared here and defined outside this
+ // header; both Slice arguments of Compare() are encoded internal keys.
+ virtual const char* Name() const;
+ virtual int Compare(const Slice& a, const Slice& b) const;
+ virtual void FindShortestSeparator(
+ std::string* start,
+ const Slice& limit) const;
+ virtual void FindShortSuccessor(std::string* key) const;
+
+ const Comparator* user_comparator() const { return user_comparator_; }
+
+ // Convenience overload that compares the encoded forms of "a" and "b".
+ int Compare(const InternalKey& a, const InternalKey& b) const;
+};
+
+// Wrapper around the encoded bytes of an internal key.  Code in this
+// directory should pass internal keys around as InternalKey rather than
+// as plain strings, so that they are never compared with raw string
+// comparison instead of an InternalKeyComparator.
+class InternalKey {
+ public:
+  // An empty rep_ marks the key as invalid.
+  InternalKey() { }
+
+  // Encode (user_key, s, t) into a new internal key.
+  InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) {
+    const ParsedInternalKey parsed(user_key, s, t);
+    AppendInternalKey(&rep_, parsed);
+  }
+
+  // Replace the contents with the already-encoded bytes in "s".
+  void DecodeFrom(const Slice& s) {
+    rep_.assign(s.data(), s.size());
+  }
+
+  // Return the encoded bytes; the key must not be empty.
+  Slice Encode() const {
+    assert(!rep_.empty());
+    return rep_;
+  }
+
+  Slice user_key() const {
+    return ExtractUserKey(rep_);
+  }
+
+  // Replace the contents with the encoding of "p".
+  void SetFrom(const ParsedInternalKey& p) {
+    rep_.clear();
+    AppendInternalKey(&rep_, p);
+  }
+
+  // Reset to the invalid (empty) state.
+  void Clear() {
+    rep_.clear();
+  }
+
+ private:
+  std::string rep_;
+};
+
+// Compare two wrapped keys by delegating to the Slice overload on their
+// encoded forms.
+inline int InternalKeyComparator::Compare(
+    const InternalKey& a, const InternalKey& b) const {
+  const Slice lhs = a.Encode();
+  const Slice rhs = b.Encode();
+  return Compare(lhs, rhs);
+}
+
+// LargeValueRef is a 160-bit hash value (20 bytes), followed by an
+// 8 byte uncompressed size and a 1 byte CompressionType code.  An
+// encoded form of it is embedded in the filenames of large value
+// files stored in the database, and the raw binary form is stored as
+// the iter->value() result for values of type kTypeLargeValueRef in
+// the table and log files that make up the database.
+struct LargeValueRef {
+  // Layout: [0,20) hash, [20,28) fixed64 value size, [28] compression type.
+  char data[29];
+
+  // Initialize a large value ref for the given data
+  static LargeValueRef Make(const Slice& data,
+                            CompressionType compression_type);
+
+  // Initialize a large value ref from a serialized, 29-byte reference value
+  static LargeValueRef FromRef(const Slice& ref) {
+    LargeValueRef copy;
+    assert(ref.size() == sizeof(copy.data));
+    memcpy(copy.data, ref.data(), sizeof(copy.data));
+    return copy;
+  }
+
+  // Return the number of bytes in a LargeValueRef (not the
+  // number of bytes in the value referenced).
+  static size_t ByteSize() {
+    return sizeof(LargeValueRef().data);
+  }
+
+  // Return the number of bytes in the value referenced by "*this".
+  uint64_t ValueSize() const {
+    return DecodeFixed64(data + 20);
+  }
+
+  CompressionType compression_type() const {
+    const char code = data[28];
+    return static_cast<CompressionType>(code);
+  }
+
+  // Equality and ordering are bytewise over all 29 bytes.
+  bool operator==(const LargeValueRef& b) const {
+    return 0 == memcmp(data, b.data, sizeof(data));
+  }
+  bool operator<(const LargeValueRef& b) const {
+    return 0 > memcmp(data, b.data, sizeof(data));
+  }
+};
+
+// Convert the large value ref to a human-readable string suitable
+// for embedding in a large value filename.
+extern std::string LargeValueRefToFilenameString(const LargeValueRef& h);
+
+// Parse the large value filename string in "in" and store the result in
+// "*ref". If successful, returns true. Otherwise returns false.
+extern bool FilenameStringToLargeValueRef(const Slice& in, LargeValueRef* ref);
+
+// Split "internal_key" into user key, sequence number and type.
+// Returns false if the key is shorter than the 8-byte trailer or the
+// type byte is larger than kTypeLargeValueRef; in the latter case
+// "*result" has still been filled in.
+inline bool ParseInternalKey(const Slice& internal_key,
+                             ParsedInternalKey* result) {
+  const size_t total = internal_key.size();
+  if (total < 8) {
+    return false;
+  }
+  const size_t user_key_len = total - 8;
+  const uint64_t packed = DecodeFixed64(internal_key.data() + user_key_len);
+  const unsigned char type_byte = packed & 0xff;
+  result->sequence = packed >> 8;
+  result->type = static_cast<ValueType>(type_byte);
+  result->user_key = Slice(internal_key.data(), user_key_len);
+  return type_byte <= static_cast<unsigned char>(kTypeLargeValueRef);
+}
+
+}
+
+#endif // STORAGE_LEVELDB_DB_FORMAT_H_
diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc
new file mode 100644
index 0000000..5dfa101
--- /dev/null
+++ b/db/dbformat_test.cc
@@ -0,0 +1,127 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/dbformat.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+// Return the encoded internal key for (user_key, seq, vt).
+static std::string IKey(const std::string& user_key,
+                        uint64_t seq,
+                        ValueType vt) {
+  const ParsedInternalKey parsed(user_key, seq, vt);
+  std::string out;
+  AppendInternalKey(&out, parsed);
+  return out;
+}
+
+// Run FindShortestSeparator() on a copy of "s" with limit "l", using an
+// internal-key comparator layered over the bytewise comparator.
+static std::string Shorten(const std::string& s, const std::string& l) {
+  std::string separator(s);
+  InternalKeyComparator(BytewiseComparator()).FindShortestSeparator(
+      &separator, l);
+  return separator;
+}
+
+// Run FindShortSuccessor() on a copy of "s", using an internal-key
+// comparator layered over the bytewise comparator.
+static std::string ShortSuccessor(const std::string& s) {
+  std::string successor(s);
+  InternalKeyComparator(BytewiseComparator()).FindShortSuccessor(&successor);
+  return successor;
+}
+
+// Encode (key, seq, vt), parse it back and check every field survives.
+// Also checks that a 3-byte input is rejected by ParseInternalKey().
+static void TestKey(const std::string& key,
+                    uint64_t seq,
+                    ValueType vt) {
+  const std::string encoded = IKey(key, seq, vt);
+  const Slice input(encoded);
+
+  ParsedInternalKey parsed("", 0, kTypeValue);
+  ASSERT_TRUE(ParseInternalKey(input, &parsed));
+  ASSERT_EQ(key, parsed.user_key.ToString());
+  ASSERT_EQ(seq, parsed.sequence);
+  ASSERT_EQ(vt, parsed.type);
+
+  ASSERT_TRUE(!ParseInternalKey(Slice("bar"), &parsed));
+}
+
+class FormatTest { };
+
+// Round-trip every (key, sequence) combination below through
+// encode/parse, with sequence numbers clustered around byte boundaries.
+TEST(FormatTest, InternalKey_EncodeDecode) {
+  const char* keys[] = { "", "k", "hello", "longggggggggggggggggggggg" };
+  const uint64_t seq[] = {
+    1, 2, 3,
+    (1ull << 8) - 1, 1ull << 8, (1ull << 8) + 1,
+    (1ull << 16) - 1, 1ull << 16, (1ull << 16) + 1,
+    (1ull << 32) - 1, 1ull << 32, (1ull << 32) + 1
+  };
+  const size_t num_keys = sizeof(keys) / sizeof(keys[0]);
+  const size_t num_seqs = sizeof(seq) / sizeof(seq[0]);
+  for (size_t ki = 0; ki < num_keys; ki++) {
+    for (size_t si = 0; si < num_seqs; si++) {
+      TestKey(keys[ki], seq[si], kTypeValue);
+      TestKey("hello", 1, kTypeDeletion);
+    }
+  }
+}
+
+// Checks FindShortestSeparator() on internal keys: the start key is
+// returned unchanged in every case below except when the two user keys
+// differ and are correctly ordered.
+TEST(FormatTest, InternalKeyShortSeparator) {
+ // When user keys are same
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("foo", 99, kTypeValue)));
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("foo", 101, kTypeValue)));
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("foo", 100, kTypeValue)));
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("foo", 100, kTypeDeletion)));
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("foo", 100, kTypeLargeValueRef)));
+
+ // When user keys are misordered
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("bar", 99, kTypeValue)));
+
+ // When user keys are different, but correctly ordered
+ // (the shortened key carries the maximum sequence number and the seek
+ // value type)
+ ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("hello", 200, kTypeValue)));
+
+ // When start user key is prefix of limit user key
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue),
+ IKey("foobar", 200, kTypeValue)));
+
+ // When limit user key is prefix of start user key
+ ASSERT_EQ(IKey("foobar", 100, kTypeValue),
+ Shorten(IKey("foobar", 100, kTypeValue),
+ IKey("foo", 200, kTypeValue)));
+}
+
+// Checks FindShortSuccessor() on internal keys.
+TEST(FormatTest, InternalKeyShortestSuccessor) {
+ // "foo" is shortened to "g" with the maximum sequence number and the
+ // seek value type.
+ ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+ ShortSuccessor(IKey("foo", 100, kTypeValue)));
+ // A user key made only of 0xff bytes is expected to come back unchanged.
+ ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
+ ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
+}
+
+// Checks the filename string produced for a LargeValueRef built from
+// "hello" under each compression type.
+TEST(FormatTest, SHA1) {
+ // Check that we are computing the same value as sha1.
+ // Note that the last two numbers are the length of the input and the
+ // compression type.
+ ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-0", // SHA1, uncompr
+ LargeValueRefToFilenameString(
+ LargeValueRef::Make("hello", kNoCompression)));
+ ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d-5-1", // SHA1, lwcompr
+ LargeValueRefToFilenameString(
+ LargeValueRef::Make("hello", kLightweightCompression)));
+}
+
+}
+
+// Test entry point: run every registered test and use the result as
+// the process exit status.  The command line is not inspected.
+int main(int argc, char** argv) {
+  const int exit_status = leveldb::test::RunAllTests();
+  return exit_status;
+}
diff --git a/db/filename.cc b/db/filename.cc
new file mode 100644
index 0000000..55e6d28
--- /dev/null
+++ b/db/filename.cc
@@ -0,0 +1,154 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <ctype.h>
+#include <stdio.h>
+#include "db/filename.h"
+#include "db/dbformat.h"
+#include "include/env.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+// Return "<name>/<number>.<suffix>", with the number zero-padded to at
+// least six decimal digits.
+static std::string MakeFileName(const std::string& name, uint64_t number,
+                                const char* suffix) {
+  char tail[100];
+  const unsigned long long printable = static_cast<unsigned long long>(number);
+  snprintf(tail, sizeof(tail), "/%06llu.%s", printable, suffix);
+  std::string path(name);
+  path += tail;
+  return path;
+}
+
+// Return "<name>/<number>.log".  "number" must be positive.
+std::string LogFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  const std::string path = MakeFileName(name, number, "log");
+  return path;
+}
+
+// Return "<name>/<number>.sst".  "number" must be positive.
+std::string TableFileName(const std::string& name, uint64_t number) {
+  assert(number > 0);
+  const std::string path = MakeFileName(name, number, "sst");
+  return path;
+}
+
+// Return "<name>/<filename string of large_ref>.val".
+std::string LargeValueFileName(const std::string& name,
+                               const LargeValueRef& large_ref) {
+  std::string path(name);
+  path += "/";
+  path += LargeValueRefToFilenameString(large_ref);
+  path += ".val";
+  return path;
+}
+
+// Return "<dbname>/MANIFEST-<number>", with the number zero-padded to
+// at least six decimal digits.  "number" must be positive.
+std::string DescriptorFileName(const std::string& dbname, uint64_t number) {
+  assert(number > 0);
+  char tail[100];
+  const unsigned long long printable = static_cast<unsigned long long>(number);
+  snprintf(tail, sizeof(tail), "/MANIFEST-%06llu", printable);
+  std::string path(dbname);
+  path += tail;
+  return path;
+}
+
+// Return "<dbname>/CURRENT".
+std::string CurrentFileName(const std::string& dbname) {
+  std::string path(dbname);
+  path += "/CURRENT";
+  return path;
+}
+
+// Return "<dbname>/LOCK".
+std::string LockFileName(const std::string& dbname) {
+  std::string path(dbname);
+  path += "/LOCK";
+  return path;
+}
+
+// Return "<dbname>/<number>.dbtmp".  "number" must be positive.
+std::string TempFileName(const std::string& dbname, uint64_t number) {
+  assert(number > 0);
+  const std::string path = MakeFileName(dbname, number, "dbtmp");
+  return path;
+}
+
+// Return the name of the info log file for "dbname": "<dbname>/LOG".
+std::string InfoLogFileName(const std::string& dbname) {
+  std::string path(dbname);
+  path += "/LOG";
+  return path;
+}
+
+// Return the name of the old info log file for "dbname":
+// "<dbname>/LOG.old".
+std::string OldInfoLogFileName(const std::string& dbname) {
+  std::string path(dbname);
+  path += "/LOG.old";
+  return path;
+}
+
+
+// Owned filenames have the form:
+// dbname/CURRENT
+// dbname/LOCK
+// dbname/LOG
+// dbname/LOG.old
+// dbname/MANIFEST-[0-9]+
+// dbname/[0-9a-f]{40}-[0-9]+-[0-9]+.val
+// dbname/[0-9]+.(log|sst|dbtmp)
+// Classify "fname" (a name relative to the db directory).  On success
+// stores the file type in "*type" and returns true.  For large value
+// files the parsed reference goes to "*large_ref" and "*number" is left
+// untouched; for every other type "*number" receives the number in the
+// name, or 0 for the fixed names.  Returns false for unrecognized names.
+bool ParseFileName(const std::string& fname,
+                   uint64_t* number,
+                   LargeValueRef* large_ref,
+                   FileType* type) {
+  Slice rest(fname);
+
+  // Fixed names carry no number.
+  if (rest == "CURRENT") {
+    *number = 0;
+    *type = kCurrentFile;
+    return true;
+  }
+  if (rest == "LOCK") {
+    *number = 0;
+    *type = kDBLockFile;
+    return true;
+  }
+  if (rest == "LOG" || rest == "LOG.old") {
+    *number = 0;
+    *type = kInfoLogFile;
+    return true;
+  }
+
+  // Large value files: everything before ".val" must parse as a ref.
+  if (rest.size() >= 4 &&
+      Slice(rest.data() + rest.size() - 4, 4) == ".val") {
+    LargeValueRef parsed_ref;
+    const Slice stem(rest.data(), rest.size() - 4);
+    if (!FilenameStringToLargeValueRef(stem, &parsed_ref)) {
+      return false;
+    }
+    *large_ref = parsed_ref;
+    *type = kLargeValueFile;
+    return true;
+  }
+
+  // Descriptor files: "MANIFEST-" followed by nothing but a number.
+  if (rest.starts_with("MANIFEST-")) {
+    rest.remove_prefix(strlen("MANIFEST-"));
+    uint64_t manifest_number;
+    if (!ConsumeDecimalNumber(&rest, &manifest_number) || !rest.empty()) {
+      return false;
+    }
+    *type = kDescriptorFile;
+    *number = manifest_number;
+    return true;
+  }
+
+  // Numbered files.  Avoid strtoull() to keep filename format
+  // independent of the current locale
+  uint64_t file_number;
+  if (!ConsumeDecimalNumber(&rest, &file_number)) {
+    return false;
+  }
+  FileType numbered_type;
+  if (rest == Slice(".log")) {
+    numbered_type = kLogFile;
+  } else if (rest == Slice(".sst")) {
+    numbered_type = kTableFile;
+  } else if (rest == Slice(".dbtmp")) {
+    numbered_type = kTempFile;
+  } else {
+    return false;
+  }
+  *type = numbered_type;
+  *number = file_number;
+  return true;
+}
+
+// Point CURRENT at the descriptor file numbered "descriptor_number".
+// The manifest name (without the "dbname/" prefix, plus a newline) is
+// written to a temp file which is then renamed over CURRENT.  If either
+// step fails the temp file is deleted and the failing status returned.
+Status SetCurrentFile(Env* env, const std::string& dbname,
+                      uint64_t descriptor_number) {
+  const std::string manifest_path =
+      DescriptorFileName(dbname, descriptor_number);
+  Slice manifest_name(manifest_path);
+  assert(manifest_name.starts_with(dbname + "/"));
+  manifest_name.remove_prefix(dbname.size() + 1);
+
+  const std::string tmp_path = TempFileName(dbname, descriptor_number);
+  Status status =
+      WriteStringToFile(env, manifest_name.ToString() + "\n", tmp_path);
+  if (status.ok()) {
+    status = env->RenameFile(tmp_path, CurrentFileName(dbname));
+  }
+  if (status.ok()) {
+    return status;
+  }
+  // Best-effort cleanup; the result of the delete is ignored.
+  env->DeleteFile(tmp_path);
+  return status;
+}
+
+}
diff --git a/db/filename.h b/db/filename.h
new file mode 100644
index 0000000..3fd2ea4
--- /dev/null
+++ b/db/filename.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// File names used by DB code
+
+#ifndef STORAGE_LEVELDB_DB_FILENAME_H_
+#define STORAGE_LEVELDB_DB_FILENAME_H_
+
+#include <stdint.h>
+#include <string>
+#include "include/slice.h"
+#include "include/status.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+class Env;
+struct LargeValueRef;
+
+// Kinds of files recognized by ParseFileName().  The name patterns
+// below are the ones accepted by the parser in filename.cc.
+enum FileType {
+ kLogFile, // <number>.log
+ kDBLockFile, // LOCK
+ kTableFile, // <number>.sst
+ kLargeValueFile, // <large value ref string>.val
+ kDescriptorFile, // MANIFEST-<number>
+ kCurrentFile, // CURRENT
+ kTempFile, // <number>.dbtmp
+ kInfoLogFile, // Either the current one, or an old one
+};
+
+// Return the name of the log file with the specified number
+// in the db named by "dbname". The result will be prefixed with
+// "dbname".
+extern std::string LogFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the sstable with the specified number
+// in the db named by "dbname". The result will be prefixed with
+// "dbname".
+extern std::string TableFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the large value file with the specified large
+// value reference in the db named by "dbname". The result will be
+// prefixed with "dbname".
+extern std::string LargeValueFileName(const std::string& dbname,
+ const LargeValueRef& large_ref);
+
+// Return the name of the descriptor file for the db named by
+// "dbname" and the specified incarnation number. The result will be
+// prefixed with "dbname".
+extern std::string DescriptorFileName(const std::string& dbname,
+ uint64_t number);
+
+// Return the name of the current file. This file contains the name
+// of the current manifest file. The result will be prefixed with
+// "dbname".
+extern std::string CurrentFileName(const std::string& dbname);
+
+// Return the name of the lock file for the db named by
+// "dbname". The result will be prefixed with "dbname".
+extern std::string LockFileName(const std::string& dbname);
+
+// Return the name of a temporary file owned by the db named "dbname".
+// The result will be prefixed with "dbname".
+extern std::string TempFileName(const std::string& dbname, uint64_t number);
+
+// Return the name of the info log file for "dbname".
+extern std::string InfoLogFileName(const std::string& dbname);
+
+// Return the name of the old info log file for "dbname".
+extern std::string OldInfoLogFileName(const std::string& dbname);
+
+// If filename is a leveldb file, store the type of the file in *type.
+// If *type is kLargeValueFile, then the large value reference data
+// from the filename is stored in "*large_ref". For all other types of
+// files, the number encoded in the filename is stored in *number. If
+// the filename was successfully parsed, returns true. Else return
+// false.
+extern bool ParseFileName(const std::string& filename,
+ uint64_t* number,
+ LargeValueRef* large_ref,
+ FileType* type);
+
+// Make the CURRENT file point to the descriptor file with the
+// specified number.
+extern Status SetCurrentFile(Env* env, const std::string& dbname,
+ uint64_t descriptor_number);
+
+
+}
+
+#endif // STORAGE_LEVELDB_DB_FILENAME_H_
diff --git a/db/filename_test.cc b/db/filename_test.cc
new file mode 100644
index 0000000..08a54eb
--- /dev/null
+++ b/db/filename_test.cc
@@ -0,0 +1,156 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/filename.h"
+
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+class FileNameTest { };
+
+// Checks ParseFileName() against a table of names that must parse (with
+// the expected type and number / large value ref) and a list of names
+// that must be rejected.
+TEST(FileNameTest, Parse) {
+  Slice db;
+  FileType type;
+  uint64_t number;
+  LargeValueRef large_ref;
+
+  // Successful parses
+  static struct {
+    const char* fname;
+    uint64_t number;
+    const char* large_ref;
+    FileType type;
+  } cases[] = {
+    { "100.log", 100, "", kLogFile },
+    { "0.log", 0, "", kLogFile },
+    { "0.sst", 0, "", kTableFile },
+    { "CURRENT", 0, "", kCurrentFile },
+    { "LOCK", 0, "", kDBLockFile },
+    { "MANIFEST-2", 2, "", kDescriptorFile },
+    { "MANIFEST-7", 7, "", kDescriptorFile },
+    { "LOG", 0, "", kInfoLogFile },
+    { "LOG.old", 0, "", kInfoLogFile },
+    { "18446744073709551615.log", 18446744073709551615ull, "",
+      kLogFile },
+    { "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0.val", 0,
+      "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-1234-0", kLargeValueFile },
+    { "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0.val", 0,
+      "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-10000000000-0",
+      kLargeValueFile },
+  };
+  // size_t index: sizeof yields an unsigned count.
+  for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+    std::string f = cases[i].fname;
+    ASSERT_TRUE(ParseFileName(f, &number, &large_ref, &type)) << f;
+    ASSERT_EQ(cases[i].type, type) << f;
+    if (type == kLargeValueFile) {
+      // Large value files report a ref instead of a number.
+      ASSERT_EQ(cases[i].large_ref, LargeValueRefToFilenameString(large_ref))
+          << f;
+    } else {
+      ASSERT_EQ(cases[i].number, number) << f;
+    }
+  }
+
+  // Errors
+  static const char* errors[] = {
+    "",
+    "foo",
+    "foo-dx-100.log",
+    ".log",
+    "",
+    "manifest",
+    "CURREN",
+    "CURRENTX",
+    "MANIFES",
+    "MANIFEST",
+    "MANIFEST-",
+    "XMANIFEST-3",
+    "MANIFEST-3x",
+    "LOC",
+    "LOCKx",
+    "LO",
+    "LOGx",
+    "18446744073709551616.log",
+    "184467440737095516150.log",
+    "100",
+    "100.",
+    "100.lop",
+    "100.val",
+    ".val",
+    "123456789012345678901234567890123456789-12340.val",
+    "1234567890123456789012345678901234567-123-0.val",
+    "12345678901234567890123456789012345678902-100-1-.val",
+    // Overflow on value size
+    "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000000000000000000-1.val",
+    // 3 is not a valid compression type
+    "2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2e2323-100000-3.val" };
+  for (size_t i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
+    std::string f = errors[i];
+    ASSERT_TRUE(!ParseFileName(f, &number, &large_ref, &type)) << f;
+  }
+}
+
+// Checks that each *FileName() constructor produces "<dbname>/<name>"
+// and that <name> parses back to the type and number (or large value
+// ref) it was built from.  The db names are all three characters long,
+// so the "<dbname>/" prefix is always four characters.
+TEST(FileNameTest, Construction) {
+ uint64_t number;
+ FileType type;
+ LargeValueRef large_ref;
+ std::string fname;
+
+ fname = CurrentFileName("foo");
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
+ ASSERT_EQ(0, number);
+ ASSERT_EQ(kCurrentFile, type);
+
+ fname = LockFileName("foo");
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
+ ASSERT_EQ(0, number);
+ ASSERT_EQ(kDBLockFile, type);
+
+ fname = LogFileName("foo", 192);
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
+ ASSERT_EQ(192, number);
+ ASSERT_EQ(kLogFile, type);
+
+ fname = TableFileName("bar", 200);
+ ASSERT_EQ("bar/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
+ ASSERT_EQ(200, number);
+ ASSERT_EQ(kTableFile, type);
+
+ fname = DescriptorFileName("bar", 100);
+ ASSERT_EQ("bar/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
+ ASSERT_EQ(100, number);
+ ASSERT_EQ(kDescriptorFile, type);
+
+ fname = TempFileName("tmp", 999);
+ ASSERT_EQ("tmp/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
+ ASSERT_EQ(999, number);
+ ASSERT_EQ(kTempFile, type);
+
+ // Large value files: round-trip one reference per compression type up
+ // to and including kLightweightCompression.
+ for (int i = 0; i <= kLightweightCompression; i++) {
+ CompressionType ctype = static_cast<CompressionType>(i);
+ std::string value = "abcdef";
+ LargeValueRef real_large_ref = LargeValueRef::Make(Slice(value), ctype);
+ fname = LargeValueFileName("tmp", real_large_ref);
+ ASSERT_EQ("tmp/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &large_ref, &type));
+ ASSERT_TRUE(real_large_ref == large_ref);
+ ASSERT_EQ(kLargeValueFile, type);
+ ASSERT_EQ(large_ref.compression_type(), ctype);
+ }
+}
+
+}
+
+// Test entry point: run every registered test and use the result as
+// the process exit status.  The command line is not inspected.
+int main(int argc, char** argv) {
+  const int exit_status = leveldb::test::RunAllTests();
+  return exit_status;
+}
diff --git a/db/log_format.h b/db/log_format.h
new file mode 100644
index 0000000..137cd4a
--- /dev/null
+++ b/db/log_format.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Log format information shared by reader and writer.
+// See ../doc/log_format.txt for more detail.
+
+#ifndef STORAGE_LEVELDB_DB_LOG_FORMAT_H_
+#define STORAGE_LEVELDB_DB_LOG_FORMAT_H_
+
+namespace leveldb {
+namespace log {
+
+// Type tag stored in each physical record header.
+enum RecordType {
+ // Zero is reserved for preallocated files
+ kZeroType = 0,
+
+ // The physical record holds a complete logical record.
+ kFullType = 1,
+
+ // For fragments
+ // A logical record split across physical records is a kFirstType
+ // fragment, any number of kMiddleType fragments, then a kLastType one.
+ kFirstType = 2,
+ kMiddleType = 3,
+ kLastType = 4,
+};
+static const int kMaxRecordType = kLastType;
+
+static const int kBlockSize = 32768;
+
+// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
+static const int kHeaderSize = 4 + 1 + 2;
+
+}
+}
+
+#endif // STORAGE_LEVELDB_DB_LOG_FORMAT_H_
diff --git a/db/log_reader.cc b/db/log_reader.cc
new file mode 100644
index 0000000..243bd2c
--- /dev/null
+++ b/db/log_reader.cc
@@ -0,0 +1,172 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+
+#include <stdint.h>
+#include "include/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace leveldb {
+namespace log {
+
+// Out-of-line definition of the virtual destructor declared in
+// log_reader.h; there is nothing to release.
+Reader::Reporter::~Reporter() {
+}
+
+// Remember the file, reporter and checksum flag, and allocate a
+// kBlockSize-byte backing store (released in ~Reader).  buffer_ starts
+// empty and eof_ false, so the first ReadPhysicalRecord() call reads
+// from the file.
+Reader::Reader(SequentialFile* file, Reporter* reporter, bool checksum)
+ : file_(file),
+ reporter_(reporter),
+ checksum_(checksum),
+ backing_store_(new char[kBlockSize]),
+ buffer_(),
+ eof_(false) {
+}
+
+// Release the block buffer allocated in the constructor.  file_ and
+// reporter_ are not deleted here.
+Reader::~Reader() {
+ delete[] backing_store_;
+}
+
+// Assemble the next logical record from one or more physical records.
+// Returns true with "*record" set on success, false at end of input.
+// A record that fits in one physical record is returned as the fragment
+// produced by ReadPhysicalRecord(); a fragmented record is accumulated
+// in "*scratch" and "*record" points at "*scratch".  Malformed fragment
+// sequences are reported through ReportDrop() and skipped.
+bool Reader::ReadRecord(Slice* record, std::string* scratch) {
+ scratch->clear();
+ record->clear();
+ // True while a kFirstType fragment has been seen without its kLastType.
+ bool in_fragmented_record = false;
+
+ Slice fragment;
+ while (true) {
+ switch (ReadPhysicalRecord(&fragment)) {
+ case kFullType:
+ if (in_fragmented_record) {
+ // The pending fragments never got their kLastType; drop them.
+ ReportDrop(scratch->size(), "partial record without end");
+ }
+ scratch->clear();
+ *record = fragment;
+ return true;
+
+ case kFirstType:
+ if (in_fragmented_record) {
+ ReportDrop(scratch->size(), "partial record without end");
+ }
+ // Start (or restart) accumulation with this fragment.
+ scratch->assign(fragment.data(), fragment.size());
+ in_fragmented_record = true;
+ break;
+
+ case kMiddleType:
+ if (!in_fragmented_record) {
+ ReportDrop(fragment.size(), "missing start of fragmented record");
+ } else {
+ scratch->append(fragment.data(), fragment.size());
+ }
+ break;
+
+ case kLastType:
+ if (!in_fragmented_record) {
+ ReportDrop(fragment.size(), "missing start of fragmented record");
+ } else {
+ scratch->append(fragment.data(), fragment.size());
+ *record = Slice(*scratch);
+ return true;
+ }
+ break;
+
+ case kEof:
+ if (in_fragmented_record) {
+ ReportDrop(scratch->size(), "partial record without end");
+ scratch->clear();
+ }
+ return false;
+
+ case kBadRecord:
+ // ReadPhysicalRecord() skipped a record; any fragments accumulated
+ // so far can no longer be completed.
+ if (in_fragmented_record) {
+ ReportDrop(scratch->size(), "error in middle of record");
+ in_fragmented_record = false;
+ scratch->clear();
+ }
+ break;
+
+ default:
+ // Any type value not handled above (including kZeroType).
+ ReportDrop(
+ (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
+ "unknown record type");
+ in_fragmented_record = false;
+ scratch->clear();
+ break;
+ }
+ }
+ // Not reached: the loop above only exits via return.
+ return false;
+}
+
+// Tell the reporter, if there is one, that "bytes" bytes were dropped
+// for "reason".  The reason is wrapped in a Corruption status.
+void Reader::ReportDrop(size_t bytes, const char* reason) {
+  if (reporter_ == NULL) {
+    return;
+  }
+  reporter_->Corruption(bytes, Status::Corruption(reason));
+}
+
+// Return the next physical record from the current block, refilling
+// buffer_ from file_ when no more than a header's worth of bytes remain.
+// On success returns the record's type and points "*result" at its
+// payload.  Returns kEof at end of input or on a read error, and
+// kBadRecord when a record was skipped (bad length, checksum mismatch,
+// or zero-type/zero-length filler); "*result" is not set in those cases.
+unsigned int Reader::ReadPhysicalRecord(Slice* result) {
+  while (true) {
+    if (buffer_.size() <= kHeaderSize) {
+      if (!eof_) {
+        // Last read was a full read, so this is a trailer to skip
+        buffer_.clear();
+        Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
+        if (!status.ok()) {
+          if (reporter_ != NULL) {
+            reporter_->Corruption(kBlockSize, status);
+          }
+          buffer_.clear();
+          eof_ = true;
+          return kEof;
+        } else if (buffer_.size() < kBlockSize) {
+          // A short read means this was the final block of the file.
+          eof_ = true;
+        }
+        continue;
+      } else if (buffer_.size() == 0) {
+        // End of file
+        return kEof;
+      } else if (buffer_.size() < kHeaderSize) {
+        ReportDrop(buffer_.size(), "truncated record at end of file");
+        buffer_.clear();
+        return kEof;
+      } else {
+        // We have a trailing zero-length record.  Fall through and check it.
+      }
+    }
+
+    // Parse the header: bytes [0,4) checksum, [4,6) little-endian length,
+    // [6] type.  Every byte is read as unsigned so that values >= 0x80
+    // are not sign-extended where char is signed.
+    const char* header = buffer_.data();
+    const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+    const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+    const unsigned int type = static_cast<unsigned char>(header[6]);
+    const uint32_t length = a | (b << 8);
+    if (kHeaderSize + length > buffer_.size()) {
+      ReportDrop(buffer_.size(), "bad record length");
+      buffer_.clear();
+      return kBadRecord;
+    }
+
+    // Zero-type, zero-length records are filler (type zero is reserved
+    // for preallocated files), not data.  Skip them without reporting a
+    // drop whether or not checksums are being verified; otherwise they
+    // would surface as "unknown record type" when checksum_ is false.
+    if (type == kZeroType && length == 0) {
+      buffer_.remove_prefix(kHeaderSize + length);
+      return kBadRecord;
+    }
+
+    // Check crc.  The checksum covers the type byte and the payload.
+    if (checksum_) {
+      uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+      uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
+      if (actual_crc != expected_crc) {
+        ReportDrop(length, "checksum mismatch");
+        buffer_.remove_prefix(kHeaderSize + length);
+        return kBadRecord;
+      }
+    }
+
+    buffer_.remove_prefix(kHeaderSize + length);
+    *result = Slice(header + kHeaderSize, length);
+    return type;
+  }
+}
+
+}
+}
diff --git a/db/log_reader.h b/db/log_reader.h
new file mode 100644
index 0000000..515d2af
--- /dev/null
+++ b/db/log_reader.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_LOG_READER_H_
+#define STORAGE_LEVELDB_DB_LOG_READER_H_
+
+#include "db/log_format.h"
+#include "include/slice.h"
+#include "include/status.h"
+
+namespace leveldb {
+
+class SequentialFile;
+
+namespace log {
+
+// Reads logical records back from a log file, reassembling fragmented
+// records and reporting dropped data through an optional Reporter.
+class Reader {
+ public:
+ // Interface for reporting errors.
+ class Reporter {
+ public:
+ virtual ~Reporter();
+
+ // Some corruption was detected. "bytes" is the approximate number
+ // of bytes dropped due to the corruption.
+ virtual void Corruption(size_t bytes, const Status& status) = 0;
+ };
+
+ // Create a reader that will return log records from "*file".
+ // "*file" must remain live while this Reader is in use.
+ //
+ // If "reporter" is non-NULL, it is notified whenever some data is
+ // dropped due to a detected corruption. "*reporter" must remain
+ // live while this Reader is in use.
+ //
+ // If "checksum" is true, verify checksums if available.
+ Reader(SequentialFile* file, Reporter* reporter, bool checksum);
+
+ ~Reader();
+
+ // Read the next record into *record. Returns true if read
+ // successfully, false if we hit end of the input. May use
+ // "*scratch" as temporary storage. The contents filled in *record
+ // will only be valid until the next mutating operation on this
+ // reader or the next mutation to *scratch.
+ bool ReadRecord(Slice* record, std::string* scratch);
+
+ private:
+ SequentialFile* const file_;
+ Reporter* const reporter_; // May be NULL
+ bool const checksum_;
+ // kBlockSize-byte array passed to file_->Read() as scratch space;
+ // allocated in the constructor and freed in the destructor.
+ char* const backing_store_;
+ // Bytes of the most recently read block that have not been consumed yet.
+ Slice buffer_;
+ bool eof_; // Last Read() indicated EOF by returning < kBlockSize
+
+ // Extend record types with the following special values
+ enum {
+ kEof = kMaxRecordType + 1,
+ kBadRecord = kMaxRecordType + 2
+ };
+
+ // Return type, or one of the preceding special values
+ unsigned int ReadPhysicalRecord(Slice* result);
+ // Forward a drop of "bytes" bytes to reporter_, if there is one.
+ void ReportDrop(size_t bytes, const char* reason);
+
+ // No copying allowed
+ Reader(const Reader&);
+ void operator=(const Reader&);
+};
+
+}
+}
+
+#endif // STORAGE_LEVELDB_DB_LOG_READER_H_
diff --git a/db/log_test.cc b/db/log_test.cc
new file mode 100644
index 0000000..8c1915d
--- /dev/null
+++ b/db/log_test.cc
@@ -0,0 +1,361 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "include/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+namespace log {
+
+// Construct a string of the specified length made out of the supplied
+// partial string.
+static std::string BigString(const std::string& partial_string, size_t n) {
+ std::string result;
+ while (result.size() < n) {  // NOTE: never terminates if partial_string is empty and n > 0
+ result.append(partial_string);  // Append whole copies; may overshoot n
+ }
+ result.resize(n);  // Trim the overshoot so the result is exactly n characters
+ return result;
+}
+
+// Construct a string from a number
+static std::string NumberString(int n) {
+ char buf[50];
+ snprintf(buf, sizeof(buf), "%d.", n);  // Decimal value followed by a '.' terminator, e.g. "42."
+ return std::string(buf);
+}
+
+// Return a skewed potentially long string
+static std::string RandomSkewedString(int i, Random* rnd) {
+ return BigString(NumberString(i), rnd->Skewed(17));  // Content repeats NumberString(i); length is rnd->Skewed(17)
+}
+
+class LogTest {
+ private:
+ class StringDest : public WritableFile {  // In-memory sink: every Append() lands in contents_
+ public:
+ std::string contents_;
+
+ virtual Status Close() { return Status::OK(); }
+ virtual Status Flush() { return Status::OK(); }
+ virtual Status Sync() { return Status::OK(); }
+ virtual Status Append(const Slice& slice) {
+ contents_.append(slice.data(), slice.size());
+ return Status::OK();
+ }
+ };
+
+ class StringSource : public SequentialFile {  // In-memory source that serves contents_
+ public:
+ Slice contents_;  // Unread remainder; Read() consumes it from the front
+ bool force_error_;  // When set, the next Read() fails once with a Corruption status
+ bool returned_partial_;  // Set once a short read or an error has been returned
+ StringSource() : force_error_(false), returned_partial_(false) { }
+
+ virtual Status Read(size_t n, Slice* result, char* scratch) {  // "scratch" is not used here
+ ASSERT_TRUE(!returned_partial_) << "must not Read() after eof/error";
+ ASSERT_EQ(kBlockSize, n);  // The test only expects whole-block read requests
+
+ if (force_error_) {
+ force_error_ = false;  // One-shot: only a single read fails
+ returned_partial_ = true;
+ return Status::Corruption("read error");
+ }
+
+ if (contents_.size() < n) {  // Fewer than n bytes remain: return a short read
+ n = contents_.size();
+ returned_partial_ = true;
+ }
+ *result = Slice(contents_.data(), n);  // Result points directly at contents_' bytes
+ contents_.remove_prefix(n);
+ return Status::OK();
+ }
+ };
+
+ class ReportCollector : public Reader::Reporter {  // Accumulates everything the reader reports
+ public:
+ size_t dropped_bytes_;  // Sum of "bytes" over all Corruption() calls
+ std::string message_;  // Concatenation of all reported status strings
+
+ ReportCollector() : dropped_bytes_(0) { }
+ virtual void Corruption(size_t bytes, const Status& status) {
+ dropped_bytes_ += bytes;
+ message_.append(status.ToString());
+ }
+ };
+
+ StringDest dest_;
+ StringSource source_;
+ ReportCollector report_;
+ bool reading_;  // Becomes true on the first Read(); Write() asserts it is still false
+ Writer writer_;
+ Reader reader_;
+
+ public:
+ LogTest() : reading_(false),
+ writer_(&dest_),
+ reader_(&source_, &report_, true/*checksum*/) {
+ }
+
+ void Write(const std::string& msg) {
+ ASSERT_TRUE(!reading_) << "Write() after starting to read";
+ writer_.AddRecord(Slice(msg));  // The returned Status is not checked
+ }
+
+ size_t WrittenBytes() const {
+ return dest_.contents_.size();
+ }
+
+ std::string Read() {
+ if (!reading_) {
+ reading_ = true;
+ source_.contents_ = Slice(dest_.contents_);  // From here on the reader sees what dest_ holds now
+ }
+ std::string scratch;
+ Slice record;
+ if (reader_.ReadRecord(&record, &scratch)) {
+ return record.ToString();  // Copy out before the local scratch goes out of scope
+ } else {
+ return "EOF";
+ }
+ }
+
+ void IncrementByte(int offset, int delta) {
+ dest_.contents_[offset] += delta;
+ }
+
+ void SetByte(int offset, char new_byte) {
+ dest_.contents_[offset] = new_byte;
+ }
+
+ void ShrinkSize(int bytes) {
+ dest_.contents_.resize(dest_.contents_.size() - bytes);
+ }
+
+ void FixChecksum(int header_offset, int len) {
+ // Compute crc of type/data: starts at header byte 6 (the type), spans 1 + len bytes
+ uint32_t crc = crc32c::Value(&dest_.contents_[header_offset+6], 1 + len);
+ crc = crc32c::Mask(crc);
+ EncodeFixed32(&dest_.contents_[header_offset], crc);  // Stored at the start of the header
+ }
+
+ void ForceError() {
+ source_.force_error_ = true;
+ }
+
+ size_t DroppedBytes() const {
+ return report_.dropped_bytes_;
+ }
+
+ // Returns "OK" if the recorded error message contains "msg"; otherwise the recorded message
+ std::string MatchError(const std::string& msg) const {
+ if (report_.message_.find(msg) == std::string::npos) {
+ return report_.message_;
+ } else {
+ return "OK";
+ }
+ }
+};
+
+TEST(LogTest, Empty) {
+ ASSERT_EQ("EOF", Read());  // Nothing was written, so the very first read reports EOF
+}
+
+TEST(LogTest, ReadWrite) {
+ Write("foo");
+ Write("bar");
+ Write("");  // An empty record must round-trip as well
+ Write("xxxx");
+ ASSERT_EQ("foo", Read());  // Records come back in write order
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("xxxx", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ("EOF", Read()); // Make sure reads at eof work
+}
+
+TEST(LogTest, ManyBlocks) {
+ for (int i = 0; i < 100000; i++) {  // Write phase: 100000 short records
+ Write(NumberString(i));
+ }
+ for (int i = 0; i < 100000; i++) {  // Read phase: same records, same order
+ ASSERT_EQ(NumberString(i), Read());
+ }
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, Fragmentation) {
+ Write("small");
+ Write(BigString("medium", 50000));  // Presumably larger than one block (kBlockSize is defined elsewhere)
+ Write(BigString("large", 100000));
+ ASSERT_EQ("small", Read());
+ ASSERT_EQ(BigString("medium", 50000), Read());  // Must be reassembled into one logical record
+ ASSERT_EQ(BigString("large", 100000), Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, MarginalTrailer) {
+ // Make a trailer that is exactly the same length as an empty record.
+ const int n = kBlockSize - 2*kHeaderSize;
+ Write(BigString("foo", n));
+ ASSERT_EQ(kBlockSize - kHeaderSize, WrittenBytes());  // Exactly kHeaderSize bytes remain in the block
+ Write("");
+ Write("bar");
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, ShortTrailer) {
+ const int n = kBlockSize - 2*kHeaderSize + 4;
+ Write(BigString("foo", n));
+ ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());  // Only kHeaderSize - 4 bytes remain: less than a header
+ Write("");
+ Write("bar");
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST(LogTest, AlignedEof) {
+ const int n = kBlockSize - 2*kHeaderSize + 4;
+ Write(BigString("foo", n));
+ ASSERT_EQ(kBlockSize - kHeaderSize + 4, WrittenBytes());  // File ends kHeaderSize - 4 bytes short of a full block
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("EOF", Read());  // No further writes: the input simply ends here
+}
+
+TEST(LogTest, RandomRead) {
+ const int N = 500;
+ Random write_rnd(301);
+ for (int i = 0; i < N; i++) {
+ Write(RandomSkewedString(i, &write_rnd));
+ }
+ Random read_rnd(301);  // Same seed as write_rnd so the expected strings are regenerated
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
+ }
+ ASSERT_EQ("EOF", Read());
+}
+
+// Tests of all the error paths in log_reader.cc follow:
+
+TEST(LogTest, ReadError) {
+ Write("foo");
+ ForceError();  // The next StringSource::Read() returns Corruption("read error")
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(kBlockSize, DroppedBytes());  // A failed read is expected to be reported as one whole block dropped
+ ASSERT_EQ("OK", MatchError("read error"));
+}
+
+TEST(LogTest, BadRecordType) {
+ Write("foo");
+ // Type is stored in header[6]
+ IncrementByte(6, 100);
+ FixChecksum(0, 3);  // Recompute the crc so that only the type is wrong
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3, DroppedBytes());  // The 3-byte payload "foo" is reported as dropped
+ ASSERT_EQ("OK", MatchError("unknown record type"));
+}
+
+TEST(LogTest, TruncatedTrailingRecord) {
+ Write("foo");
+ ShrinkSize(4); // Drop all payload as well as a header byte
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(kHeaderSize - 1, DroppedBytes());  // Everything left in the file is the partial header
+ ASSERT_EQ("OK", MatchError("truncated record at end of file"));
+}
+
+TEST(LogTest, BadLength) {
+ Write("foo");
+ ShrinkSize(1);  // Drop the last payload byte; the header was written for 3 bytes
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(kHeaderSize + 2, DroppedBytes());  // Header plus the 2 payload bytes that remain
+ ASSERT_EQ("OK", MatchError("bad record length"));
+}
+
+TEST(LogTest, ChecksumMismatch) {
+ Write("foo");
+ IncrementByte(0, 10);  // Byte 0 belongs to the stored crc at the start of the header
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("checksum mismatch"));
+}
+
+TEST(LogTest, UnexpectedMiddleType) {
+ Write("foo");
+ SetByte(6, kMiddleType);  // Relabel the only record as a middle fragment
+ FixChecksum(0, 3);  // Keep the crc valid so only the type sequence is wrong
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST(LogTest, UnexpectedLastType) {
+ Write("foo");
+ SetByte(6, kLastType);  // Relabel the only record as a final fragment
+ FixChecksum(0, 3);  // Keep the crc valid so only the type sequence is wrong
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST(LogTest, UnexpectedFullType) {
+ Write("foo");
+ Write("bar");
+ SetByte(6, kFirstType);  // "foo" becomes a first fragment; the record "bar" then follows it unexpectedly
+ FixChecksum(0, 3);
+ ASSERT_EQ("bar", Read());  // The dangling "foo" fragment is skipped, "bar" is still returned
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST(LogTest, UnexpectedFirstType) {
+ Write("foo");
+ Write(BigString("bar", 100000));  // Presumably fragmented, so it begins with its own first-type fragment
+ SetByte(6, kFirstType);  // "foo" becomes a first fragment that is never completed
+ FixChecksum(0, 3);
+ ASSERT_EQ(BigString("bar", 100000), Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST(LogTest, ErrorJoinsRecords) {
+ // Consider two fragmented records:
+ // first(R1) last(R1) first(R2) last(R2)
+ // where the middle two fragments disappear. We do not want
+ // first(R1),last(R2) to get joined and returned as a valid record.
+
+ // Write records that span two blocks
+ Write(BigString("foo", kBlockSize));
+ Write(BigString("bar", kBlockSize));
+ Write("correct");
+
+ // Wipe the middle block
+ for (int offset = kBlockSize; offset < 2*kBlockSize; offset++) {
+ SetByte(offset, 'x');
+ }
+
+ ASSERT_EQ("correct", Read());  // Neither damaged record may be returned
+ ASSERT_EQ("EOF", Read());
+ const int dropped = DroppedBytes();
+ ASSERT_LE(dropped, 2*kBlockSize + 100);  // Upper bound leaves 100 bytes of slack for headers
+ ASSERT_GE(dropped, 2*kBlockSize);  // At least both payloads must be reported as dropped
+}
+
+}
+}
+
+int main(int argc, char** argv) {  // argc/argv are not used
+ return leveldb::test::RunAllTests();
+}
diff --git a/db/log_writer.cc b/db/log_writer.cc
new file mode 100644
index 0000000..465eca2
--- /dev/null
+++ b/db/log_writer.cc
@@ -0,0 +1,101 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_writer.h"
+
+#include <stdint.h>
+#include "include/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace leveldb {
+namespace log {
+
+Writer::Writer(WritableFile* dest)
+ : dest_(dest),
+ block_offset_(0) {
+ for (int i = 0; i <= kMaxRecordType; i++) {  // Precompute the crc of each possible type byte
+ char t = static_cast<char>(i);
+ type_crc_[i] = crc32c::Value(&t, 1);  // Later extended with the payload in EmitPhysicalRecord()
+ }
+}
+
+Writer::~Writer() {  // Empty: nothing is released here, and dest_ is not deleted
+}
+
+Status Writer::AddRecord(const Slice& slice) {
+ const char* ptr = slice.data();  // Next payload byte that has not been emitted yet
+ size_t left = slice.size();  // Payload bytes still to be emitted
+
+ // Fragment the record if necessary and emit it. Note that if slice
+ // is empty, we still want to iterate once to emit a single
+ // zero-length record
+ Status s;
+ do {
+ const int leftover = kBlockSize - block_offset_;  // Bytes remaining in the current block
+ assert(leftover >= 0);
+ if (leftover <= kHeaderSize) {
+ // Switch to a new block
+ if (leftover > 0) {
+ // Fill the trailer. NOTE(review): the Status returned by Append() is ignored here.
+ dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00", leftover));  // Literal supplies 7 zero bytes
+ }
+ block_offset_ = 0;
+ }
+
+ // Invariant: we never leave <= kHeaderSize bytes in a block.
+ const int avail = kBlockSize - block_offset_ - kHeaderSize;  // Payload room after a header
+ assert(avail > 0);
+
+ const size_t fragment_length = (left < avail) ? left : avail;  // min(left, avail)
+
+ RecordType type;
+ const bool begin = (ptr == slice.data());  // Nothing emitted yet for this record
+ const bool end = (left == fragment_length);  // This fragment finishes the record
+ if (begin && end) {
+ type = kFullType;
+ } else if (begin) {
+ type = kFirstType;
+ } else if (end) {
+ type = kLastType;
+ } else {
+ type = kMiddleType;
+ }
+
+ s = EmitPhysicalRecord(type, ptr, fragment_length);
+ ptr += fragment_length;
+ left -= fragment_length;
+ } while (s.ok() && left > 0);  // Stop at the first write error or when the payload is exhausted
+ return s;
+}
+
+Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
+ assert(n <= 0xffff); // Must fit in two bytes
+ assert(block_offset_ + kHeaderSize + n <= kBlockSize);  // Record must not cross the block boundary
+
+ // Format the header: bytes 0-3 masked crc, 4-5 length (low byte first), 6 type
+ char buf[kHeaderSize];
+ buf[4] = static_cast<char>(n & 0xff);
+ buf[5] = static_cast<char>(n >> 8);
+ buf[6] = static_cast<char>(t);
+
+ // Compute the crc of the record type and the payload.
+ uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);  // Starts from the precomputed crc of the type byte
+ crc = crc32c::Mask(crc); // Adjust for storage
+ EncodeFixed32(buf, crc);
+
+ // Write the header and the payload
+ Status s = dest_->Append(Slice(buf, kHeaderSize));
+ if (s.ok()) {
+ s = dest_->Append(Slice(ptr, n));
+ if (s.ok()) {
+ s = dest_->Flush();  // Flush after every physical record
+ }
+ }
+ block_offset_ += kHeaderSize + n;  // Advanced even when one of the writes above failed
+ return s;
+}
+
+}
+}
diff --git a/db/log_writer.h b/db/log_writer.h
new file mode 100644
index 0000000..13c64ba
--- /dev/null
+++ b/db/log_writer.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_LOG_WRITER_H_
+#define STORAGE_LEVELDB_DB_LOG_WRITER_H_
+
+#include <stdint.h>
+#include "db/log_format.h"
+#include "include/slice.h"
+#include "include/status.h"
+
+namespace leveldb {
+
+class WritableFile;
+
+namespace log {
+
+class Writer {
+ public:
+ // Create a writer that will append data to "*dest".
+ // "*dest" must be initially empty.
+ // "*dest" must remain live while this Writer is in use.
+ explicit Writer(WritableFile* dest);
+ ~Writer();
+
+ Status AddRecord(const Slice& slice);  // Appends "slice" as one logical record
+
+ private:
+ WritableFile* dest_;
+ int block_offset_; // Current offset in block
+
+ // crc32c values for all supported record types. These are
+ // pre-computed to reduce the overhead of computing the crc of the
+ // record type stored in the header.
+ uint32_t type_crc_[kMaxRecordType + 1];
+
+ Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);
+
+ // No copying allowed: the copy operations are declared private
+ Writer(const Writer&);
+ void operator=(const Writer&);
+};
+
+}
+}
+
+#endif // STORAGE_LEVELDB_DB_LOG_WRITER_H_
diff --git a/db/memtable.cc b/db/memtable.cc
new file mode 100644
index 0000000..349cfcc
--- /dev/null
+++ b/db/memtable.cc
@@ -0,0 +1,109 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/memtable.h"
+#include "db/dbformat.h"
+#include "include/comparator.h"
+#include "include/env.h"
+#include "include/iterator.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+static Slice GetLengthPrefixedSlice(const char* data) {
+ uint32_t len;
+ const char* p = data;
+ p = GetVarint32Ptr(p, p + 5, &len); // +5: we assume "p" is not corrupted
+ return Slice(p, len);  // The result points into the caller's buffer, just past the varint32 length
+}
+
+MemTable::MemTable(const InternalKeyComparator& cmp)
+ : comparator_(cmp),
+ table_(comparator_, &arena_) {  // The skip list is given this MemTable's comparator and arena
+}
+
+MemTable::~MemTable() {  // Empty body: members are destroyed by their own destructors
+}
+
+size_t MemTable::ApproximateMemoryUsage() { return arena_.MemoryUsage(); }  // Reports the arena's usage
+
+int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr)
+ const {
+ // Internal keys are encoded as length-prefixed strings.
+ Slice a = GetLengthPrefixedSlice(aptr);
+ Slice b = GetLengthPrefixedSlice(bptr);
+ return comparator.Compare(a, b);  // Ordering is delegated to the wrapped InternalKeyComparator
+}
+
+// Encode a suitable internal key target for "target" and return it.
+// Uses *scratch as scratch space, and the returned pointer will point
+// into this scratch space (so it is invalidated when *scratch changes).
+static const char* EncodeKey(std::string* scratch, const Slice& target) {
+ scratch->clear();
+ PutVarint32(scratch, target.size());  // Length prefix, as read back by GetLengthPrefixedSlice()
+ scratch->append(target.data(), target.size());
+ return scratch->data();
+}
+
+class MemTableIterator: public Iterator {  // Adapts MemTable::Table::Iterator to the Iterator interface
+ public:
+ explicit MemTableIterator(MemTable::Table* table) {
+ iter_ = new MemTable::Table::Iterator(table);  // Owned; released in the destructor
+ }
+ virtual ~MemTableIterator() { delete iter_; }
+
+ virtual bool Valid() const { return iter_->Valid(); }
+ virtual void Seek(const Slice& k) { iter_->Seek(EncodeKey(&tmp_, k)); }  // Table entries are length-prefixed
+ virtual void SeekToFirst() { iter_->SeekToFirst(); }
+ virtual void SeekToLast() { iter_->SeekToLast(); }
+ virtual void Next() { iter_->Next(); }
+ virtual void Prev() { iter_->Prev(); }
+ virtual Slice key() const { return GetLengthPrefixedSlice(iter_->key()); }
+ virtual Slice value() const {
+ Slice key_slice = GetLengthPrefixedSlice(iter_->key());
+ return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());  // Value follows the key bytes
+ }
+
+ virtual Status status() const { return Status::OK(); }  // This iterator never reports an error
+
+ private:
+ MemTable::Table::Iterator* iter_;
+ std::string tmp_; // For passing to EncodeKey
+
+ // No copying allowed
+ MemTableIterator(const MemTableIterator&);
+ void operator=(const MemTableIterator&);
+};
+
+Iterator* MemTable::NewIterator() {
+ return new MemTableIterator(&table_);  // Heap-allocated; presumably the caller deletes it
+}
+
+void MemTable::Add(SequenceNumber s, ValueType type,
+ const Slice& key,
+ const Slice& value) {
+ // Format of an entry is concatenation of:
+ // key_size : varint32 of internal_key.size()
+ // key bytes : char[internal_key.size()]
+ // value_size : varint32 of value.size()
+ // value bytes : char[value.size()]
+ size_t key_size = key.size();
+ size_t val_size = value.size();
+ size_t internal_key_size = key_size + 8;  // User key plus the 8-byte (sequence, type) tag
+ const size_t encoded_len =
+ VarintLength(internal_key_size) + internal_key_size +
+ VarintLength(val_size) + val_size;
+ char* buf = arena_.Allocate(encoded_len);  // Entry storage comes from the arena
+ char* p = EncodeVarint32(buf, internal_key_size);
+ memcpy(p, key.data(), key_size);
+ p += key_size;
+ EncodeFixed64(p, (s << 8) | type);  // Sequence shifted left by 8 bits, type in the low byte
+ p += 8;
+ p = EncodeVarint32(p, val_size);
+ memcpy(p, value.data(), val_size);
+ assert((p + val_size) - buf == encoded_len);  // Everything written must match the computed size
+ table_.Insert(buf);  // The skip list stores the pointer to the encoded entry
+}
+
+}
diff --git a/db/memtable.h b/db/memtable.h
new file mode 100644
index 0000000..fa95e15
--- /dev/null
+++ b/db/memtable.h
@@ -0,0 +1,69 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_MEMTABLE_H_
+#define STORAGE_LEVELDB_DB_MEMTABLE_H_
+
+#include <string>
+#include "include/db.h"
+#include "db/dbformat.h"
+#include "db/skiplist.h"
+#include "util/arena.h"
+
+namespace leveldb {
+
+class InternalKeyComparator;
+class Mutex;
+class MemTableIterator;
+
+class MemTable {
+ public:
+ explicit MemTable(const InternalKeyComparator& comparator);
+ ~MemTable();
+
+ // Returns an estimate of the number of bytes of data in use by this
+ // data structure.
+ //
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ size_t ApproximateMemoryUsage();
+
+ // Return an iterator that yields the contents of the memtable.
+ //
+ // The caller must ensure that the underlying MemTable remains live
+ // while the returned iterator is live. The keys returned by this
+ // iterator are internal keys encoded by AppendInternalKey in the
+ // db/dbformat.{h,cc} module.
+ Iterator* NewIterator();
+
+ // Add an entry into memtable that maps key to value at the
+ // specified sequence number and with the specified type.
+ // Typically value will be empty if type==kTypeDeletion.
+ void Add(SequenceNumber seq, ValueType type,
+ const Slice& key,
+ const Slice& value);
+
+ private:
+ struct KeyComparator {
+ const InternalKeyComparator comparator;
+ explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { }
+ int operator()(const char* a, const char* b) const;
+ };
+ friend class MemTableIterator;
+ friend class MemTableBackwardIterator;  // NOTE(review): not defined in the memtable.cc shown here
+
+ typedef SkipList<const char*, KeyComparator> Table;  // Keys are pointers to encoded entries
+
+ KeyComparator comparator_;
+ Arena arena_;
+ Table table_;
+
+ // No copying allowed
+ MemTable(const MemTable&);
+ void operator=(const MemTable&);
+};
+
+}
+
+#endif // STORAGE_LEVELDB_DB_MEMTABLE_H_
diff --git a/db/repair.cc b/db/repair.cc
new file mode 100644
index 0000000..0727914
--- /dev/null
+++ b/db/repair.cc
@@ -0,0 +1,396 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// We recover the contents of the descriptor from the other files we find.
+// (1) Any log files are first converted to tables
+// (2) We scan every table to compute
+// (a) smallest/largest for the table
+// (b) large value refs from the table
+// (c) largest sequence number in the table
+// (3) We generate descriptor contents:
+// - log number is set to zero
+// - next-file-number is set to 1 + largest file number we found
+// - last-sequence-number is set to largest sequence# found across
+// all tables (see 2c)
+// - compaction pointers are cleared
+// - every table file is added at level 0
+//
+// Possible optimization 1:
+// (a) Compute total size and use to pick appropriate max-level M
+// (b) Sort tables by largest sequence# in the table
+// (c) For each table: if it overlaps earlier table, place in level-0,
+// else place in level-M.
+// Possible optimization 2:
+// Store per-table metadata (smallest, largest, largest-seq#,
+// large-value-refs, ...) in the table's meta section to speed up
+// ScanTable.
+
+#include "db/builder.h"
+#include "db/db_impl.h"
+#include "db/dbformat.h"
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "db/write_batch_internal.h"
+#include "include/comparator.h"
+#include "include/db.h"
+#include "include/env.h"
+
+namespace leveldb {
+
+namespace {
+
+class Repairer {
+ public:
+ Repairer(const std::string& dbname, const Options& options)
+ : dbname_(dbname),
+ env_(options.env),
+ icmp_(options.comparator),
+ options_(SanitizeOptions(dbname, &icmp_, options)),
+ owns_info_log_(options_.info_log != options.info_log),  // True if sanitizing substituted a different info_log
+ next_file_number_(1) {
+ // TableCache can be small since we expect each table to be opened once.
+ table_cache_ = new TableCache(dbname_, &options_, 10);
+ }
+
+ ~Repairer() {
+ delete table_cache_;
+ if (owns_info_log_) {  // Delete the info log only when it differs from the caller's
+ delete options_.info_log;
+ }
+ }
+
+ Status Run() {  // Runs the whole repair: find files, convert logs, scan tables, write descriptor
+ Status status = FindFiles();
+ if (status.ok()) {
+ ConvertLogFilesToTables();  // Per-file failures are logged inside and do not abort the repair
+ ExtractMetaData();
+ status = WriteDescriptor();
+ }
+ if (status.ok()) {
+ unsigned long long bytes = 0;  // Total size of all recovered tables, for the summary line
+ for (size_t i = 0; i < tables_.size(); i++) {
+ bytes += tables_[i].meta.file_size;
+ }
+ Log(env_, options_.info_log,
+ "**** Repaired leveldb %s; "
+ "recovered %d files; %llu bytes. "
+ "Some data may have been lost. "
+ "****",
+ dbname_.c_str(),
+ static_cast<int>(tables_.size()),
+ bytes);
+ }
+ return status;
+ }
+
+ private:
+ struct TableInfo {
+ FileMetaData meta;  // number, file_size, smallest and largest are filled in during repair
+ SequenceNumber max_sequence;  // Largest sequence number found by ScanTable()
+ };
+
+ std::string const dbname_;
+ Env* const env_;
+ InternalKeyComparator const icmp_;
+ Options const options_;
+ bool owns_info_log_;
+ TableCache* table_cache_;
+ VersionEdit edit_;
+
+ std::vector<std::string> manifests_;
+ std::vector<uint64_t> table_numbers_;
+ std::vector<uint64_t> logs_;
+ std::vector<TableInfo> tables_;
+ uint64_t next_file_number_;
+
+ Status FindFiles() {  // Classifies the directory's files into manifests_, logs_ and table_numbers_
+ std::vector<std::string> filenames;
+ Status status = env_->GetChildren(dbname_, &filenames);
+ if (!status.ok()) {
+ return status;
+ }
+ if (filenames.empty()) {
+ return Status::IOError(dbname_, "repair found no files");
+ }
+
+ uint64_t number;
+ LargeValueRef large_ref;
+ FileType type;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &large_ref, &type)) {  // Unparsable names are skipped
+ if (type == kLargeValueFile) {
+ // Will be picked up when we process a Table that points to it
+ } else if (type == kDescriptorFile) {
+ manifests_.push_back(filenames[i]);
+ } else {
+ if (number + 1 > next_file_number_) {  // Keep next_file_number_ above every number seen
+ next_file_number_ = number + 1;
+ }
+ if (type == kLogFile) {
+ logs_.push_back(number);
+ } else if (type == kTableFile) {
+ table_numbers_.push_back(number);
+ } else {
+ // Ignore other files
+ }
+ }
+ }
+ }
+ return status;
+ }
+
+ void ConvertLogFilesToTables() {  // Converts every log to a table; the log is archived either way
+ for (size_t i = 0; i < logs_.size(); i++) {
+ std::string logname = LogFileName(dbname_, logs_[i]);
+ Status status = ConvertLogToTable(logs_[i]);
+ if (!status.ok()) {
+ Log(env_, options_.info_log, "Log #%llu: ignoring conversion error: %s",
+ (unsigned long long) logs_[i],
+ status.ToString().c_str());
+ }
+ ArchiveFile(logname);
+ }
+ }
+
+ Status ConvertLogToTable(uint64_t log) {
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ WritableFile* info_log;
+ uint64_t lognum;
+ virtual void Corruption(size_t bytes, const Status& s) {
+ // We print error messages for corruption, but continue repairing.
+ Log(env, info_log, "Log #%llu: dropping %d bytes; %s",
+ (unsigned long long) lognum,
+ static_cast<int>(bytes),
+ s.ToString().c_str());
+ }
+ };
+
+ // Open the log file
+ std::string logname = LogFileName(dbname_, log);
+ SequentialFile* lfile;
+ Status status = env_->NewSequentialFile(logname, &lfile);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Create the log reader.
+ LogReporter reporter;
+ reporter.env = env_;
+ reporter.info_log = options_.info_log;
+ reporter.lognum = log;
+ // NOTE(review): the stated intent is for log::Reader to checksum so that
+ // corruptions cause entire commits to be skipped instead of
+ // propagating bad information (like overly large sequence
+ // numbers), yet "false" (no checksumming) is passed below -- confirm.
+ log::Reader reader(lfile, &reporter, false/*do not checksum*/);
+
+ // Read all the records and add to a memtable
+ std::string scratch;
+ Slice record;
+ WriteBatch batch;
+ MemTable mem(icmp_);
+ int counter = 0;  // Number of operations successfully inserted, for the log line below
+ while (reader.ReadRecord(&record, &scratch)) {
+ if (record.size() < 12) {  // Presumably the fixed WriteBatch header size -- confirm in write_batch.cc
+ reporter.Corruption(
+ record.size(), Status::Corruption("log record too small"));
+ continue;
+ }
+ WriteBatchInternal::SetContents(&batch, record);
+ status = WriteBatchInternal::InsertInto(&batch, &mem);
+ if (status.ok()) {
+ counter += WriteBatchInternal::Count(&batch);
+ } else {
+ Log(env_, options_.info_log, "Log #%llu: ignoring %s",
+ (unsigned long long) log,
+ status.ToString().c_str());
+ status = Status::OK(); // Keep going with rest of file
+ }
+ }
+ delete lfile;
+
+ // We ignore any version edits generated by the conversion to a Table
+ // since ExtractMetaData() will also generate edits.
+ VersionEdit skipped;
+ FileMetaData meta;
+ meta.number = next_file_number_++;  // The number is consumed even if the table ends up empty or fails
+ Iterator* iter = mem.NewIterator();
+ status = BuildTable(dbname_, env_, options_, table_cache_, iter,
+ &meta, &skipped);
+ delete iter;
+ if (status.ok()) {
+ if (meta.file_size > 0) {  // Only a non-empty table is queued for ExtractMetaData()
+ table_numbers_.push_back(meta.number);
+ }
+ }
+ Log(env_, options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s",
+ (unsigned long long) log,
+ counter,
+ (unsigned long long) meta.number,
+ status.ToString().c_str());
+ return status;
+ }
+
+ void ExtractMetaData() {
+ // Scan every known table: keep those that scan cleanly, archive the rest.
+ for (size_t i = 0; i < table_numbers_.size(); i++) {
+ TableInfo t;
+ t.meta.number = table_numbers_[i];
+ Status status = ScanTable(&t);  // Fills in file size, key range and max sequence
+ if (!status.ok()) {
+ std::string fname = TableFileName(dbname_, table_numbers_[i]);
+ Log(env_, options_.info_log, "Table #%llu: ignoring %s",
+ (unsigned long long) table_numbers_[i],
+ status.ToString().c_str());
+ ArchiveFile(fname);
+ } else {
+ tables_.push_back(t);
+ }
+ }
+ }
+
+ Status ScanTable(TableInfo* t) {  // Fills *t from the table file named by t->meta.number
+ std::string fname = TableFileName(dbname_, t->meta.number);
+ int counter = 0;  // Number of parsable entries, for the log line below
+ Status status = env_->GetFileSize(fname, &t->meta.file_size);
+ if (status.ok()) {
+ Iterator* iter = table_cache_->NewIterator(
+ ReadOptions(), t->meta.number);
+ bool empty = true;  // True until the first parsable key has been seen
+ ParsedInternalKey parsed;
+ t->max_sequence = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ Slice key = iter->key();
+ if (!ParseInternalKey(key, &parsed)) {
+ Log(env_, options_.info_log, "Table #%llu: unparsable key %s",
+ (unsigned long long) t->meta.number,
+ EscapeString(key).c_str());
+ continue;  // Skip the entry but keep scanning
+ }
+
+ counter++;
+ if (empty) {
+ empty = false;
+ t->meta.smallest.DecodeFrom(key);  // First parsable key in iteration order
+ }
+ t->meta.largest.DecodeFrom(key);  // Overwritten each time, so it ends as the last parsable key
+ if (parsed.sequence > t->max_sequence) {
+ t->max_sequence = parsed.sequence;
+ }
+
+ if (ExtractValueType(key) == kTypeLargeValueRef) {
+ if (iter->value().size() != LargeValueRef::ByteSize()) {
+ Log(env_, options_.info_log, "Table #%llu: bad large value ref",
+ (unsigned long long) t->meta.number);
+ } else {
+ edit_.AddLargeValueRef(LargeValueRef::FromRef(iter->value()),
+ t->meta.number,
+ key);
+ }
+ }
+ }
+ if (!iter->status().ok()) {
+ status = iter->status();  // An iteration error makes the whole scan fail
+ }
+ delete iter;
+ }
+ Log(env_, options_.info_log, "Table #%llu: %d entries %s",
+ (unsigned long long) t->meta.number,
+ counter,
+ status.ToString().c_str());
+ return status;
+ }
+
+ Status WriteDescriptor() {  // Writes a fresh descriptor to a temp file, then installs it as number 1
+ std::string tmp = TempFileName(dbname_, 1);
+ WritableFile* file;
+ Status status = env_->NewWritableFile(tmp, &file);
+ if (!status.ok()) {
+ return status;
+ }
+
+ SequenceNumber max_sequence = 0;  // Largest sequence number across all recovered tables
+ for (size_t i = 0; i < tables_.size(); i++) {
+ if (max_sequence < tables_[i].max_sequence) {
+ max_sequence = tables_[i].max_sequence;
+ }
+ }
+
+ edit_.SetComparatorName(icmp_.user_comparator()->Name());
+ edit_.SetLogNumber(0);
+ edit_.SetNextFile(next_file_number_);
+ edit_.SetLastSequence(max_sequence);
+
+ for (size_t i = 0; i < tables_.size(); i++) {
+ // TODO(opt): separate out into multiple levels
+ const TableInfo& t = tables_[i];
+ edit_.AddFile(0, t.meta.number, t.meta.file_size,
+ t.meta.smallest, t.meta.largest);
+ }
+
+ //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
+ {
+ log::Writer log(file);  // Scoped so the writer is gone before the file is closed
+ std::string record;
+ edit_.EncodeTo(&record);
+ status = log.AddRecord(record);
+ }
+ if (status.ok()) {
+ status = file->Close();
+ }
+ delete file;
+ file = NULL;
+
+ if (!status.ok()) {
+ env_->DeleteFile(tmp);  // Best-effort cleanup; the original error is what gets returned
+ } else {
+ // Discard older manifests
+ for (size_t i = 0; i < manifests_.size(); i++) {
+ ArchiveFile(dbname_ + "/" + manifests_[i]);
+ }
+
+ // Install new manifest
+ status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1));
+ if (status.ok()) {
+ status = SetCurrentFile(env_, dbname_, 1);
+ } else {
+ env_->DeleteFile(tmp);
+ }
+ }
+ return status;
+ }
+
+ void ArchiveFile(const std::string& fname) {
+ // Move into another directory. E.g., for
+ // dir/foo
+ // rename to
+ // dir/lost/foo
+ const char* slash = strrchr(fname.c_str(), '/');  // Last path separator, or NULL for a bare name
+ std::string new_dir;
+ if (slash != NULL) {
+ new_dir.assign(fname.data(), slash - fname.data());
+ }
+ new_dir.append((slash != NULL) ? "/lost" : "lost");  // Bare name: use relative "lost", not "/lost" at the root
+ env_->CreateDir(new_dir); // Ignore error
+ std::string new_file = new_dir;
+ new_file.append("/");
+ new_file.append((slash == NULL) ? fname.c_str() : slash + 1);
+ Status s = env_->RenameFile(fname, new_file);
+ Log(env_, options_.info_log, "Archiving %s: %s\n",
+ fname.c_str(), s.ToString().c_str());
+ }
+};
+}
+
+Status RepairDB(const std::string& dbname, const Options& options) {
+ Repairer repairer(dbname, options);  // All repair state lives in this stack object
+ return repairer.Run();
+}
+
+}
diff --git a/db/skiplist.h b/db/skiplist.h
new file mode 100644
index 0000000..be39354
--- /dev/null
+++ b/db/skiplist.h
@@ -0,0 +1,378 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread safety
+// -------------
+//
+// Writes require external synchronization, most likely a mutex.
+// Reads require a guarantee that the SkipList will not be destroyed
+// while the read is in progress. Apart from that, reads progress
+// without any internal locking or synchronization.
+//
+// Invariants:
+//
+// (1) Allocated nodes are never deleted until the SkipList is
+// destroyed. This is trivially guaranteed by the code since we
+// never delete any skip list nodes.
+//
+// (2) The contents of a Node except for the next/prev pointers are
+// immutable after the Node has been linked into the SkipList.
+// Only Insert() modifies the list, and it is careful to initialize
+// a node and use release-stores to publish the nodes in one or
+// more lists.
+//
+// ... prev vs. next pointer ordering ...
+
+#include <assert.h>
+#include <stdlib.h>
+#include "port/port.h"
+#include "util/arena.h"
+#include "util/random.h"
+
+namespace leveldb {
+
+class Arena;
+
+template<typename Key, class Comparator>
+class SkipList { // Ordered collection of Keys supporting insertion, lookup and iteration; there is no removal operation
+ private:
+ struct Node; // Defined after the class body
+
+ public:
+ // Create a new SkipList object that will use "cmp" for comparing keys,
+ // and will allocate memory using "*arena". Objects allocated in the arena
+ // must remain allocated for the lifetime of the skiplist object.
+ explicit SkipList(Comparator cmp, Arena* arena);
+
+ // Insert key into the list.
+ // REQUIRES: nothing that compares equal to key is currently in the list.
+ void Insert(const Key& key);
+
+ // Returns true iff an entry that compares equal to key is in the list.
+ bool Contains(const Key& key) const;
+
+ // Iteration over the contents of a skip list
+ class Iterator {
+ public:
+ // Initialize an iterator over the specified list.
+ // The returned iterator is not valid.
+ explicit Iterator(const SkipList* list);
+
+ // Returns true iff the iterator is positioned at a valid node.
+ bool Valid() const;
+
+ // Returns the key at the current position.
+ // REQUIRES: Valid()
+ const Key& key() const;
+
+ // Advances to the next position.
+ // REQUIRES: Valid()
+ void Next();
+
+ // Advances to the previous position.
+ // REQUIRES: Valid()
+ void Prev(); // Implemented as a fresh search for the last node before the current key (there are no back links)
+
+ // Advance to the first entry with a key >= target; not Valid() afterwards if there is no such entry
+ void Seek(const Key& target);
+
+ // Position at the first entry in list.
+ // Final state of iterator is Valid() iff list is not empty.
+ void SeekToFirst();
+
+ // Position at the last entry in list.
+ // Final state of iterator is Valid() iff list is not empty.
+ void SeekToLast();
+
+ private:
+ const SkipList* list_; // List being iterated; not owned
+ Node* node_; // Current position; NULL means "not Valid()"
+ // Intentionally copyable
+ };
+
+ private:
+ enum { kMaxHeight = 12 }; // Upper bound on node height; the head node is allocated with this many links
+
+ // Immutable after construction
+ Comparator const compare_;
+ Arena* const arena_; // Arena used for allocations of nodes
+
+ Node* const head_; // Sentinel node preceding all entries; its key is never compared
+
+ // Modified only by Insert(). Read racily by readers, but stale
+ // values are ok.
+ port::AtomicPointer max_height_; // Height of the entire list
+
+ inline int GetMaxHeight() const {
+ return reinterpret_cast<intptr_t>(max_height_.NoBarrier_Load()); // The height is stored as an integer inside the pointer-sized slot
+ }
+
+ // Read/written only by Insert().
+ Random rnd_;
+
+ Node* NewNode(const Key& key, int height); // Arena-allocates a node with room for "height" links
+ int RandomHeight(); // Returns a height in [1, kMaxHeight]
+ bool Equal(const Key& a, const Key& b) const { return (compare_(a, b) == 0); }
+
+ // Return true if key is greater than the data stored in "n"
+ bool KeyIsAfterNode(const Key& key, Node* n) const;
+
+ // Return the earliest node that comes at or after key.
+ // Return NULL if there is no such node.
+ //
+ // If prev is non-NULL, fills prev[level] with pointer to previous
+ // node at "level" for every level in [0..max_height_-1].
+ Node* FindGreaterOrEqual(const Key& key, Node** prev) const;
+
+ // Return the latest node with a key < key.
+ // Return head_ if there is no such node.
+ Node* FindLessThan(const Key& key) const;
+
+ // Return the last node in the list.
+ // Return head_ if list is empty.
+ Node* FindLast() const;
+
+ // No copying allowed
+ SkipList(const SkipList&);
+ void operator=(const SkipList&);
+};
+
+// Implementation details follow
+template<typename Key, class Comparator>
+struct SkipList<Key,Comparator>::Node { // One list entry: an immutable key plus one forward link per level
+ explicit Node(const Key& k) : key(k) { } // Links are NOT initialized here; the creator must set them
+
+ Key const key;
+
+ // Accessors/mutators for links. Wrapped in methods so we can
+ // add the appropriate barriers as necessary.
+ Node* Next(int n) {
+ assert(n >= 0);
+ // Use an 'acquire load' so that we observe a fully initialized
+ // version of the returned Node.
+ return reinterpret_cast<Node*>(next_[n].Acquire_Load());
+ }
+ void SetNext(int n, Node* x) {
+ assert(n >= 0);
+ // Use a 'release store' so that anybody who reads through this
+ // pointer observes a fully initialized version of the inserted node.
+ next_[n].Release_Store(x);
+ }
+
+ // No-barrier variants that can be safely used in a few locations.
+ Node* NoBarrier_Next(int n) {
+ assert(n >= 0);
+ return reinterpret_cast<Node*>(next_[n].NoBarrier_Load());
+ }
+ void NoBarrier_SetNext(int n, Node* x) {
+ assert(n >= 0);
+ next_[n].NoBarrier_Store(x);
+ }
+
+ private:
+ // Array of length equal to the node height. next_[0] is lowest level link.
+ port::AtomicPointer next_[1]; // Declared with one element; NewNode() allocates extra space behind the struct for the remaining height - 1 links
+};
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node*
+SkipList<Key,Comparator>::NewNode(const Key& key, int height) {
+  // Node already embeds one link (next_[0]); reserve room for the other
+  // height - 1 links directly behind it, then construct the node in place.
+  // Only the key is initialized; the caller must set every link.
+  const size_t extra_links = sizeof(port::AtomicPointer) * (height - 1);
+  char* const storage = arena_->AllocateAligned(sizeof(Node) + extra_links);
+  return new (storage) Node(key);
+}
+
+template<typename Key, class Comparator>
+inline SkipList<Key,Comparator>::Iterator::Iterator(const SkipList* list)
+    // Starts unpositioned (node_ == NULL), so Valid() is false until a
+    // Seek*() call succeeds.
+    : list_(list),
+      node_(NULL) {
+}
+
+template<typename Key, class Comparator>
+inline bool SkipList<Key,Comparator>::Iterator::Valid() const {
+  // A NULL node_ is the single "not positioned" marker used by the iterator.
+  const bool positioned = (node_ != NULL);
+  return positioned;
+}
+
+template<typename Key, class Comparator>
+inline const Key& SkipList<Key,Comparator>::Iterator::key() const {
+  // Returns a reference to the key stored inside the current node.
+  // REQUIRES: Valid()
+  assert(Valid());
+  const Key& current = node_->key;
+  return current;
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Next() {
+  // Follow the level-0 link, which threads through every entry in order.
+  // Stepping past the last entry leaves node_ == NULL (not Valid()).
+  assert(Valid());
+  Node* const successor = node_->Next(0);
+  node_ = successor;
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Prev() {
+  // Nodes carry no backward links, so the predecessor is located by
+  // searching for the last node whose key is smaller than the current one.
+  assert(Valid());
+  Node* const before = list_->FindLessThan(node_->key);
+  // head_ is the "nothing smaller" answer; translate it to "not Valid()".
+  node_ = (before == list_->head_) ? NULL : before;
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::Seek(const Key& target) {
+  // No predecessor array is needed for a pure lookup, hence the NULL.
+  // A NULL result (no entry >= target) leaves the iterator not Valid().
+  Node* const hit = list_->FindGreaterOrEqual(target, NULL);
+  node_ = hit;
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::SeekToFirst() {
+  // The first entry is whatever the head's level-0 link points at;
+  // that is NULL for an empty list.
+  Node* const first = list_->head_->Next(0);
+  node_ = first;
+}
+
+template<typename Key, class Comparator>
+inline void SkipList<Key,Comparator>::Iterator::SeekToLast() {
+  // FindLast() answers head_ for an empty list; map that to "not Valid()".
+  Node* const last = list_->FindLast();
+  node_ = (last == list_->head_) ? NULL : last;
+}
+
+template<typename Key, class Comparator>
+int SkipList<Key,Comparator>::RandomHeight() {
+  // Starting from height 1, each additional level is granted with
+  // probability 1 in kBranching, up to kMaxHeight.
+  static const unsigned int kBranching = 4;
+  int height = 1;
+  for (; height < kMaxHeight; height++) {
+    const bool grow = ((rnd_.Next() % kBranching) == 0);
+    if (!grow) {
+      break;
+    }
+  }
+  assert(height > 0);
+  assert(height <= kMaxHeight);
+  return height;
+}
+
+template<typename Key, class Comparator>
+bool SkipList<Key,Comparator>::KeyIsAfterNode(const Key& key, Node* n) const {
+  // A NULL node stands for "end of list" and sorts after every key,
+  // so no key is ever after it.
+  if (n == NULL) {
+    return false;
+  }
+  return compare_(n->key, key) < 0;
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindGreaterOrEqual(const Key& key, Node** prev)
+    const {
+  // Standard skip-list descent: run right along the current level while the
+  // next node is still before "key", otherwise record the turning point
+  // (when requested) and drop one level.  The answer is the node to the
+  // right of the turning point on level 0, or NULL at end of list.
+  Node* cursor = head_;
+  int level = GetMaxHeight() - 1;
+  for (;;) {
+    Node* candidate = cursor->Next(level);
+    if (KeyIsAfterNode(key, candidate)) {
+      // Still short of key: keep moving along this level.
+      cursor = candidate;
+      continue;
+    }
+    if (prev != NULL) {
+      prev[level] = cursor;
+    }
+    if (level == 0) {
+      return candidate;
+    }
+    // Continue one level down from the same node.
+    level--;
+  }
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node*
+SkipList<Key,Comparator>::FindLessThan(const Key& key) const {
+  // Descend from the top level, always staying on a node that is strictly
+  // before "key" (or on head_).  The node we are standing on when level 0
+  // can no longer advance is the answer; it is head_ if nothing is smaller.
+  Node* cursor = head_;
+  int level = GetMaxHeight() - 1;
+  for (;;) {
+    assert(cursor == head_ || compare_(cursor->key, key) < 0);
+    Node* candidate = cursor->Next(level);
+    const bool can_advance =
+        (candidate != NULL) && (compare_(candidate->key, key) < 0);
+    if (can_advance) {
+      cursor = candidate;
+      continue;
+    }
+    if (level == 0) {
+      return cursor;
+    }
+    // Continue one level down from the same node.
+    level--;
+  }
+}
+
+template<typename Key, class Comparator>
+typename SkipList<Key,Comparator>::Node* SkipList<Key,Comparator>::FindLast()
+    const {
+  // Run to the end of each level in turn, starting at the top; the node
+  // reached at the end of level 0 is the last entry (head_ if empty).
+  Node* cursor = head_;
+  int level = GetMaxHeight() - 1;
+  for (;;) {
+    Node* candidate = cursor->Next(level);
+    if (candidate != NULL) {
+      cursor = candidate;
+      continue;
+    }
+    if (level == 0) {
+      return cursor;
+    }
+    // Continue one level down from the same node.
+    level--;
+  }
+}
+
+template<typename Key, class Comparator>
+SkipList<Key,Comparator>::SkipList(Comparator cmp, Arena* arena)
+    : compare_(cmp),
+      arena_(arena),
+      head_(NewNode(0 /* any key will do */, kMaxHeight)),
+      max_height_(reinterpret_cast<void*>(1)),
+      rnd_(0xdeadbeef) {
+  // The head node carries kMaxHeight links and NewNode() leaves them
+  // uninitialized, so mark every level as empty here.  The list height
+  // itself starts at 1 (see max_height_ above).
+  for (int level = 0; level != kMaxHeight; ++level) {
+    head_->SetNext(level, NULL);
+  }
+}
+
+template<typename Key, class Comparator>
+void SkipList<Key,Comparator>::Insert(const Key& key) {
+ // TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
+ // here since Insert() is externally synchronized.
+ Node* prev[kMaxHeight]; // prev[i] = node after which the new node gets linked on level i
+ Node* x = FindGreaterOrEqual(key, prev);
+
+ // Our data structure does not allow duplicate insertion
+ assert(x == NULL || !Equal(key, x->key));
+
+ int height = RandomHeight();
+ if (height > GetMaxHeight()) {
+ for (int i = GetMaxHeight(); i < height; i++) {
+ prev[i] = head_; // Levels above the old height were not filled in by the search; they hang directly off head_
+ }
+ //fprintf(stderr, "Change height from %d to %d\n", max_height_, height);
+
+ // It is ok to mutate max_height_ without any synchronization
+ // with concurrent readers. A concurrent reader that observes
+ // the new value of max_height_ will see either the old value of
+ // new level pointers from head_ (NULL), or a new value set in
+ // the loop below. In the former case the reader will
+ // immediately drop to the next level since NULL sorts after all
+ // keys. In the latter case the reader will use the new node.
+ max_height_.NoBarrier_Store(reinterpret_cast<void*>(height));
+ }
+
+ x = NewNode(key, height);
+ for (int i = 0; i < height; i++) { // Links are spliced in from the lowest level upwards
+ // NoBarrier_SetNext() suffices since we will add a barrier when
+ // we publish a pointer to "x" in prev[i].
+ x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
+ prev[i]->SetNext(i, x); // Release-store: makes "x" reachable on level i only after its own link is set
+ }
+}
+
+template<typename Key, class Comparator>
+bool SkipList<Key,Comparator>::Contains(const Key& key) const {
+  // The first node at or after "key" is the only possible match.
+  Node* const candidate = FindGreaterOrEqual(key, NULL);
+  return (candidate != NULL) && Equal(key, candidate->key);
+}
+
+}
diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc
new file mode 100644
index 0000000..0cfc893
--- /dev/null
+++ b/db/skiplist_test.cc
@@ -0,0 +1,378 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/skiplist.h"
+#include <set>
+#include "include/env.h"
+#include "util/arena.h"
+#include "util/hash.h"
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+typedef uint64_t Key;
+
+struct Comparator {
+  // Three-way comparison of two keys by numeric value:
+  // negative if a < b, positive if a > b, zero if they are equal.
+  int operator()(const Key& a, const Key& b) const {
+    if (a < b) return -1;
+    if (a > b) return +1;
+    return 0;
+  }
+};
+
+class SkipTest { }; // Empty type used as the first argument of every TEST(SkipTest, ...) below; presumably required by the TEST macro in util/testharness.h
+
+TEST(SkipTest, Empty) {
+  typedef SkipList<Key, Comparator> List;
+
+  Arena arena;
+  Comparator cmp;
+  List list(cmp, &arena);
+  ASSERT_TRUE(!list.Contains(10));
+
+  // On an empty list no positioning call may produce a valid iterator.
+  List::Iterator iter(&list);
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekToFirst();
+  ASSERT_TRUE(!iter.Valid());
+  iter.Seek(100);
+  ASSERT_TRUE(!iter.Valid());
+  iter.SeekToLast();
+  ASSERT_TRUE(!iter.Valid());
+}
+
+TEST(SkipTest, InsertAndLookup) { // Checks the skip list against a std::set model built from the same random keys
+ const int N = 2000; // Number of insertion attempts
+ const int R = 5000; // Keys are drawn from [0, R)
+ Random rnd(1000);
+ std::set<Key> keys;
+ Arena arena;
+ Comparator cmp;
+ SkipList<Key, Comparator> list(cmp, &arena);
+ for (int i = 0; i < N; i++) {
+ Key key = rnd.Next() % R;
+ if (keys.insert(key).second) { // Insert() forbids duplicates, so only keys new to the model are added
+ list.Insert(key);
+ }
+ }
+
+ for (int i = 0; i < R; i++) { // Membership must agree with the model for every possible key
+ if (list.Contains(i)) {
+ ASSERT_EQ(keys.count(i), 1);
+ } else {
+ ASSERT_EQ(keys.count(i), 0);
+ }
+ }
+
+ // Simple iterator tests
+ {
+ SkipList<Key, Comparator>::Iterator iter(&list);
+ ASSERT_TRUE(!iter.Valid());
+
+ iter.Seek(0);
+ ASSERT_TRUE(iter.Valid());
+ ASSERT_EQ(*(keys.begin()), iter.key());
+
+ iter.SeekToFirst();
+ ASSERT_TRUE(iter.Valid());
+ ASSERT_EQ(*(keys.begin()), iter.key());
+
+ iter.SeekToLast();
+ ASSERT_TRUE(iter.Valid());
+ ASSERT_EQ(*(keys.rbegin()), iter.key());
+ }
+
+ // Forward iteration test
+ for (int i = 0; i < R; i++) {
+ SkipList<Key, Comparator>::Iterator iter(&list);
+ iter.Seek(i);
+
+ // Compare against model iterator
+ std::set<Key>::iterator model_iter = keys.lower_bound(i);
+ for (int j = 0; j < 3; j++) { // Only the first three entries after each seek position are compared
+ if (model_iter == keys.end()) {
+ ASSERT_TRUE(!iter.Valid());
+ break;
+ } else {
+ ASSERT_TRUE(iter.Valid());
+ ASSERT_EQ(*model_iter, iter.key());
+ ++model_iter;
+ iter.Next();
+ }
+ }
+ }
+
+ // Backward iteration test
+ {
+ SkipList<Key, Comparator>::Iterator iter(&list);
+ iter.SeekToLast();
+
+ // Compare against model iterator
+ for (std::set<Key>::reverse_iterator model_iter = keys.rbegin();
+ model_iter != keys.rend();
+ ++model_iter) {
+ ASSERT_TRUE(iter.Valid());
+ ASSERT_EQ(*model_iter, iter.key());
+ iter.Prev();
+ }
+ ASSERT_TRUE(!iter.Valid()); // Prev() from the first entry must leave the iterator invalid
+ }
+}
+
+// We want to make sure that with a single writer and multiple
+// concurrent readers (with no synchronization other than when a
+// reader's iterator is created), the reader always observes all the
+// data that was present in the skip list when the iterator was
+// constructed. Because insertions are happening concurrently, we may
+// also observe new values that were inserted since the iterator was
+// constructed, but we should never miss any values that were present
+// at iterator construction time.
+//
+// We generate multi-part keys:
+// <key,gen,hash>
+// where:
+// key is in range [0..K-1]
+// gen is a generation number for key
+// hash is hash(key,gen)
+//
+// The insertion code picks a random key, sets gen to be 1 + the last
+// generation number inserted for that key, and sets hash to Hash(key,gen).
+//
+// At the beginning of a read, we snapshot the last inserted
+// generation number for each key. We then iterate, including random
+// calls to Next() and Seek(). For every key we encounter, we
+// check that it is either expected given the initial snapshot or has
+// been concurrently added since the iterator started.
+class ConcurrentTest {
+ private:
+ static const uint32_t K = 4; // Number of distinct "key" values; K itself is used only as an end-of-list seek target
+
+ static uint64_t key(Key key) { return (key >> 40); } // Bits 40 and up: key part
+ static uint64_t gen(Key key) { return (key >> 8) & 0xffffffffu; } // Bits 8..39: generation
+ static uint64_t hash(Key key) { return key & 0xff; } // Bits 0..7: check byte
+
+ static uint64_t HashNumbers(uint64_t k, uint64_t g) {
+ uint64_t data[2] = { k, g };
+ return Hash(reinterpret_cast<char*>(data), sizeof(data), 0);
+ }
+
+ static Key MakeKey(uint64_t k, uint64_t g) { // Packs <k, g, low byte of HashNumbers(k, g)> into one 64-bit Key
+ assert(sizeof(Key) == sizeof(uint64_t));
+ assert(k <= K); // We sometimes pass K to seek to the end of the skiplist
+ assert(g <= 0xffffffffu);
+ return ((k << 40) | (g << 8) | (HashNumbers(k, g) & 0xff));
+ }
+
+ static bool IsValidKey(Key k) { // True iff the check byte matches the key and generation parts
+ return hash(k) == (HashNumbers(key(k), gen(k)) & 0xff);
+ }
+
+ static Key RandomTarget(Random* rnd) { // 1 in 10: start of key space; 1 in 10: past the end; otherwise generation 0 of a random key
+ switch (rnd->Next() % 10) {
+ case 0:
+ // Seek to beginning
+ return MakeKey(0, 0);
+ case 1:
+ // Seek to end
+ return MakeKey(K, 0);
+ default:
+ // Seek to middle
+ return MakeKey(rnd->Next() % K, 0);
+ }
+ }
+
+ // Per-key generation
+ struct State {
+ port::AtomicPointer generation[K]; // Generation numbers stored as integers inside pointer-sized atomic slots
+ void Set(int k, intptr_t v) {
+ generation[k].Release_Store(reinterpret_cast<void*>(v));
+ }
+ intptr_t Get(int k) {
+ return reinterpret_cast<intptr_t>(generation[k].Acquire_Load());
+ }
+
+ State() {
+ for (int k = 0; k < K; k++) {
+ Set(k, 0);
+ }
+ }
+ };
+
+ // Current state of the test
+ State current_;
+
+ Arena arena_; // Declared before list_, which is constructed with a pointer to it
+
+ // SkipList is not protected by mu_. We just use a single writer
+ // thread to modify it.
+ SkipList<Key, Comparator> list_;
+
+ public:
+ ConcurrentTest() : list_(Comparator(), &arena_) { }
+
+ // REQUIRES: External synchronization
+ void WriteStep(Random* rnd) { // Inserts the next generation of one randomly chosen key
+ const uint32_t k = rnd->Next() % K;
+ const intptr_t g = current_.Get(k) + 1;
+ const Key key = MakeKey(k, g);
+ list_.Insert(key);
+ current_.Set(k, g); // Recorded only after the insert, so a recorded generation is always already in the list
+ }
+
+ void ReadStep(Random* rnd) { // One randomized pass over the list, checking that nothing recorded in the initial snapshot is skipped
+ // Remember the initial committed state of the skiplist.
+ State initial_state;
+ for (int k = 0; k < K; k++) {
+ initial_state.Set(k, current_.Get(k));
+ }
+
+ Key pos = RandomTarget(rnd); // Lower bound of the key space not yet accounted for
+ SkipList<Key, Comparator>::Iterator iter(&list_);
+ iter.Seek(pos);
+ while (true) {
+ Key current;
+ if (!iter.Valid()) {
+ current = MakeKey(K, 0); // Treat end-of-list as a key past every real key
+ } else {
+ current = iter.key();
+ ASSERT_TRUE(IsValidKey(current)) << std::hex << current;
+ }
+ ASSERT_LE(pos, current) << "should not go backwards";
+
+ // Verify that everything in [pos,current) was not present in
+ // initial_state.
+ while (pos < current) {
+ ASSERT_LT(key(pos), K) << std::hex << pos;
+
+ // Note that generation 0 is never inserted, so it is ok if
+ // <*,0,*> is missing.
+ ASSERT_TRUE((gen(pos) == 0) ||
+ (gen(pos) > initial_state.Get(key(pos)))
+ ) << "key: " << key(pos)
+ << "; gen: " << gen(pos)
+ << "; initgen: "
+ << initial_state.Get(key(pos));
+
+ // Advance to next key in the valid key space
+ if (key(pos) < key(current)) {
+ pos = MakeKey(key(pos) + 1, 0);
+ } else {
+ pos = MakeKey(key(pos), gen(pos) + 1);
+ }
+ }
+
+ if (!iter.Valid()) {
+ break;
+ }
+
+ if (rnd->Next() % 2) { // Randomly either step forward or jump to a new seek target
+ iter.Next();
+ pos = MakeKey(key(pos), gen(pos) + 1);
+ } else {
+ Key new_target = RandomTarget(rnd);
+ if (new_target > pos) { // Only forward jumps are taken
+ pos = new_target;
+ iter.Seek(new_target);
+ }
+ }
+ }
+ }
+};
+const uint32_t ConcurrentTest::K; // Out-of-class definition of the static constant declared in the class
+
+// Simple test that does single-threaded testing of the ConcurrentTest
+// scaffolding.
+TEST(SkipTest, ConcurrentWithoutThreads) {
+  // Alternate reads and writes on a single thread so that a failure here
+  // points at the ConcurrentTest scaffolding rather than at a race.
+  ConcurrentTest test;
+  Random rnd(test::RandomSeed());
+  const int kSteps = 10000;
+  for (int step = 0; step != kSteps; ++step) {
+    test.ReadStep(&rnd);
+    test.WriteStep(&rnd);
+  }
+}
+
+class TestState { // Shared between the writer (RunConcurrent) and one background reader (ConcurrentReader)
+ public:
+ ConcurrentTest t_;
+ int seed_; // Seed for the reader's Random
+ port::AtomicPointer quit_flag_; // NULL while the reader should keep going; any non-NULL value asks it to stop
+
+ enum ReaderState {
+ STARTING,
+ RUNNING,
+ DONE
+ };
+
+ explicit TestState(int s)
+ : seed_(s),
+ quit_flag_(NULL),
+ state_(STARTING),
+ state_cv_(&mu_) {}
+
+ void Wait(ReaderState s) { // Blocks until the reader state equals "s"
+ mu_.Lock();
+ while (state_ != s) {
+ state_cv_.Wait();
+ }
+ mu_.Unlock();
+ }
+
+ void Change(ReaderState s) { // Sets the reader state and signals the condition variable
+ mu_.Lock();
+ state_ = s;
+ state_cv_.Signal();
+ mu_.Unlock();
+ }
+
+ private:
+ port::Mutex mu_; // Must be declared before state_cv_, which is constructed from &mu_
+ ReaderState state_; // Accessed only with mu_ held
+ port::CondVar state_cv_;
+};
+
+// Body of the background reader thread.  "arg" is the TestState shared with
+// RunConcurrent().  Announces RUNNING, performs read steps until the quit
+// flag becomes non-NULL, then announces DONE.
+static void ConcurrentReader(void* arg) {
+  TestState* state = reinterpret_cast<TestState*>(arg);
+  Random rnd(state->seed_);
+  state->Change(TestState::RUNNING);
+  // The acquire load pairs with the release store of quit_flag_ done by
+  // the writer once it has finished inserting.
+  while (!state->quit_flag_.Acquire_Load()) {
+    state->t_.ReadStep(&rnd);
+  }
+  state->Change(TestState::DONE);
+}
+
+// Runs N rounds of "one background reader versus kSize foreground writes".
+// "run" only perturbs the random seed, so that the Concurrent1..5 tests
+// explore different insertion sequences.
+static void RunConcurrent(int run) {
+  const int seed = test::RandomSeed() + (run * 100);
+  Random rnd(seed);
+  const int N = 1000;      // Rounds, each with a fresh TestState
+  const int kSize = 1000;  // Writes performed per round
+  for (int i = 0; i < N; i++) {
+    if ((i % 100) == 0) {
+      fprintf(stderr, "Run %d of %d\n", i, N);
+    }
+    TestState state(seed + 1);
+    Env::Default()->Schedule(ConcurrentReader, &state);
+    // Do not start writing before the reader is actually running.
+    state.Wait(TestState::RUNNING);
+    // The write counter gets its own name so that it no longer shadows
+    // the round counter "i".
+    for (int w = 0; w < kSize; w++) {
+      state.t_.WriteStep(&rnd);
+    }
+    state.quit_flag_.Release_Store(&state);  // Any non-NULL arg will do
+    // "state" lives on this stack frame; wait for the reader to finish with
+    // it before the next iteration destroys it.
+    state.Wait(TestState::DONE);
+  }
+}
+
+TEST(SkipTest, Concurrent1) { RunConcurrent(1); } // The five variants differ only in the "run" index, which offsets the random seed inside RunConcurrent()
+TEST(SkipTest, Concurrent2) { RunConcurrent(2); }
+TEST(SkipTest, Concurrent3) { RunConcurrent(3); }
+TEST(SkipTest, Concurrent4) { RunConcurrent(4); }
+TEST(SkipTest, Concurrent5) { RunConcurrent(5); }
+
+}
+
+int main(int argc, char** argv) {
+  // Command-line arguments are accepted but not consulted; the process exit
+  // status is whatever the test harness reports.
+  const int exit_code = leveldb::test::RunAllTests();
+  return exit_code;
+}
diff --git a/db/snapshot.h b/db/snapshot.h
new file mode 100644
index 0000000..6a07f80
--- /dev/null
+++ b/db/snapshot.h
@@ -0,0 +1,66 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_SNAPSHOT_H_
+#define STORAGE_LEVELDB_DB_SNAPSHOT_H_
+
+#include "include/db.h"
+
+namespace leveldb {
+
+class SnapshotList;
+
+// Snapshots are kept in a doubly-linked list in the DB.
+// Each Snapshot corresponds to a particular sequence number.
+class Snapshot {
+ public:
+ SequenceNumber number_; // const after creation
+
+ private:
+ friend class SnapshotList; // Only SnapshotList creates, links and deletes snapshots
+
+ // Snapshot is kept in a doubly-linked circular list
+ Snapshot* prev_; // Toward older snapshots (SnapshotList::New links new entries in just before the dummy head)
+ Snapshot* next_; // Toward newer snapshots, ending at the dummy head
+
+ SnapshotList* list_; // just for sanity checks
+};
+
+class SnapshotList {
+ public:
+  // An empty list is a dummy head whose links point back at itself.
+  SnapshotList() {
+    list_.next_ = &list_;
+    list_.prev_ = &list_;
+  }
+
+  bool empty() const { return &list_ == list_.next_; }
+
+  // Oldest snapshot sits right after the dummy head, newest right before it.
+  // REQUIRES: !empty()
+  Snapshot* oldest() const { assert(!empty()); return list_.next_; }
+  Snapshot* newest() const { assert(!empty()); return list_.prev_; }
+
+  // Allocates a snapshot for sequence number "seq" and links it in as the
+  // newest entry.  The returned object must be released with Delete().
+  const Snapshot* New(SequenceNumber seq) {
+    Snapshot* const node = new Snapshot;
+    Snapshot* const tail = list_.prev_;
+    node->number_ = seq;
+    node->list_ = this;
+    node->prev_ = tail;
+    node->next_ = &list_;
+    tail->next_ = node;
+    list_.prev_ = node;
+    return node;
+  }
+
+  // Unlinks "s" from this list and frees it.
+  // REQUIRES: "s" was returned by New() on this list.
+  void Delete(const Snapshot* s) {
+    assert(s->list_ == this);
+    Snapshot* const before = s->prev_;
+    Snapshot* const after = s->next_;
+    before->next_ = after;
+    after->prev_ = before;
+    delete s;
+  }
+
+ private:
+  // Dummy head of doubly-linked list of snapshots
+  Snapshot list_;
+};
+
+}
+
+#endif // STORAGE_LEVELDB_DB_SNAPSHOT_H_
diff --git a/db/table_cache.cc b/db/table_cache.cc
new file mode 100644
index 0000000..604298d
--- /dev/null
+++ b/db/table_cache.cc
@@ -0,0 +1,94 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/table_cache.h"
+
+#include "db/filename.h"
+#include "include/env.h"
+#include "include/table.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+struct TableAndFile { // Value type stored in the table cache; both members are deleted together by DeleteEntry()
+ RandomAccessFile* file; // File handle the table was opened on
+ Table* table; // Table object opened from "file"
+};
+
+static void DeleteEntry(const Slice& key, void* value) {
+  // Cache deleter for the entries inserted by TableCache::NewIterator():
+  // "value" is a heap-allocated TableAndFile.  The table is destroyed
+  // before the file it was opened on.  "key" is not used.
+  TableAndFile* const entry = reinterpret_cast<TableAndFile*>(value);
+  delete entry->table;
+  delete entry->file;
+  delete entry;
+}
+
+static void UnrefEntry(void* arg1, void* arg2) {
+  // Iterator cleanup callback registered by TableCache::NewIterator():
+  // arg1 is the Cache, arg2 the Cache::Handle that was kept for the
+  // iterator.  Releases that handle.
+  Cache* const owner = reinterpret_cast<Cache*>(arg1);
+  Cache::Handle* const pinned = reinterpret_cast<Cache::Handle*>(arg2);
+  owner->Release(pinned);
+}
+
+TableCache::TableCache(const std::string& dbname,
+ const Options* options,
+ int entries) // "entries" is passed to NewLRUCache() as the capacity; every cached table is inserted with charge 1
+ : env_(options->env), // "options" is dereferenced here, so it must be non-NULL
+ dbname_(dbname),
+ options_(options), // Pointer is kept, not copied; it is dereferenced later by NewIterator()
+ cache_(NewLRUCache(entries)) { // Deleted in the destructor
+}
+
+TableCache::~TableCache() {
+ delete cache_; // Only the cache created in the constructor is deleted here; env_ and options_ are left untouched
+}
+
+Iterator* TableCache::NewIterator(const ReadOptions& options,
+                                  uint64_t file_number,
+                                  Table** tableptr) {
+  // Until a table is actually found, report "no table" to the caller.
+  if (tableptr != NULL) {
+    *tableptr = NULL;
+  }
+
+  // Cache entries are keyed by the fixed-width encoding of the file number.
+  char encoded_number[sizeof(file_number)];
+  EncodeFixed64(encoded_number, file_number);
+  Slice cache_key(encoded_number, sizeof(encoded_number));
+
+  Cache::Handle* handle = cache_->Lookup(cache_key);
+  if (handle == NULL) {
+    // Cache miss: open the file and the table, then cache the pair.
+    RandomAccessFile* file = NULL;
+    Table* opened = NULL;
+    std::string fname = TableFileName(dbname_, file_number);
+    Status s = env_->NewRandomAccessFile(fname, &file);
+    if (s.ok()) {
+      s = Table::Open(*options_, file, &opened);
+    }
+
+    if (!s.ok()) {
+      assert(opened == NULL);
+      delete file;
+      // We do not cache error results so that if the error is transient,
+      // or somebody repairs the file, we recover automatically.
+      return NewErrorIterator(s);
+    }
+
+    TableAndFile* entry = new TableAndFile;
+    entry->file = file;
+    entry->table = opened;
+    handle = cache_->Insert(cache_key, entry, 1, &DeleteEntry);
+  }
+
+  // The handle stays held for as long as the iterator lives; UnrefEntry
+  // releases it when the iterator runs its cleanup.
+  TableAndFile* cached = reinterpret_cast<TableAndFile*>(cache_->Value(handle));
+  Table* table = cached->table;
+  Iterator* result = table->NewIterator(options);
+  result->RegisterCleanup(&UnrefEntry, cache_, handle);
+  if (tableptr != NULL) {
+    *tableptr = table;
+  }
+  return result;
+}
+
+void TableCache::Evict(uint64_t file_number) {
+  // Build the same fixed-width key that NewIterator() uses and erase it
+  // from the cache.
+  char encoded_number[sizeof(file_number)];
+  EncodeFixed64(encoded_number, file_number);
+  Slice cache_key(encoded_number, sizeof(encoded_number));
+  cache_->Erase(cache_key);
+}
+
+}
diff --git a/db/table_cache.h b/db/table_cache.h
new file mode 100644
index 0000000..6c357df
--- /dev/null
+++ b/db/table_cache.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread-safe (provides internal synchronization)
+
+#ifndef STORAGE_LEVELDB_DB_TABLE_CACHE_H_
+#define STORAGE_LEVELDB_DB_TABLE_CACHE_H_
+
+#include <string>
+#include <stdint.h>
+#include "db/dbformat.h"
+#include "include/cache.h"
+#include "include/table.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+class Env;
+
+class TableCache { // Cache of open table files, keyed by file number
+ public:
+ TableCache(const std::string& dbname, const Options* options, int entries); // "entries" is the capacity handed to the underlying cache
+ ~TableCache();
+
+ // Get an iterator for the specified file number and return it. If
+ // "tableptr" is non-NULL, also sets "*tableptr" to point to the
+ // Table object underlying the returned iterator, or NULL if no
+ // Table object underlies the returned iterator. The returned
+ // "*tableptr" object is owned by the cache and should not be
+ // deleted, and is valid for as long as the returned iterator is
+ // live.
+ Iterator* NewIterator(const ReadOptions& options,
+ uint64_t file_number,
+ Table** tableptr = NULL);
+
+ // Evict any entry for the specified file number
+ void Evict(uint64_t file_number);
+
+ private:
+ Env* const env_; // Taken from options->env at construction
+ const std::string dbname_;
+ const Options* options_; // Not deleted by this class
+ Cache* cache_; // Created in the constructor, deleted in the destructor
+};
+
+}
+
+#endif // STORAGE_LEVELDB_DB_TABLE_CACHE_H_
diff --git a/db/version_edit.cc b/db/version_edit.cc
new file mode 100644
index 0000000..809dd82
--- /dev/null
+++ b/db/version_edit.cc
@@ -0,0 +1,282 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+
+#include "db/version_set.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+// Tag numbers for serialized VersionEdit. These numbers are written to
+// disk and should not be changed.
+enum Tag { // Payload layouts below are those written by VersionEdit::EncodeTo()
+ kComparator = 1, // length-prefixed comparator name
+ kLogNumber = 2, // varint64 log number
+ kNextFileNumber = 3, // varint64 next file number
+ kLastSequence = 4, // varint64 last sequence number
+ kCompactPointer = 5, // varint32 level, length-prefixed internal key
+ kDeletedFile = 6, // varint32 level, varint64 file number
+ kNewFile = 7, // varint32 level, varint64 file number, varint64 file size, length-prefixed smallest and largest keys
+ kLargeValueRef = 8, // length-prefixed large value ref, varint64 file number, length-prefixed internal key
+};
+
+// Resets the edit to the empty state: no scalar field is marked present and
+// every collection is empty.  Called by the constructor and at the start of
+// DecodeFrom().
+void VersionEdit::Clear() {
+  comparator_.clear();
+  log_number_ = 0;
+  last_sequence_ = 0;
+  next_file_number_ = 0;
+  has_comparator_ = false;
+  has_log_number_ = false;
+  has_next_file_number_ = false;
+  has_last_sequence_ = false;
+  // Compaction pointers have to be dropped too; otherwise entries left over
+  // from an earlier use of this object would be mixed with the ones decoded
+  // or set afterwards and written out again by EncodeTo().
+  compact_pointers_.clear();
+  deleted_files_.clear();
+  new_files_.clear();
+  large_refs_added_.clear();
+}
+
+void VersionEdit::EncodeTo(std::string* dst) const { // Appends the serialized edit to *dst as a sequence of <varint32 tag, payload> fields
+ if (has_comparator_) { // Scalar fields are written only when their has_* flag is set
+ PutVarint32(dst, kComparator);
+ PutLengthPrefixedSlice(dst, comparator_);
+ }
+ if (has_log_number_) {
+ PutVarint32(dst, kLogNumber);
+ PutVarint64(dst, log_number_);
+ }
+ if (has_next_file_number_) {
+ PutVarint32(dst, kNextFileNumber);
+ PutVarint64(dst, next_file_number_);
+ }
+ if (has_last_sequence_) {
+ PutVarint32(dst, kLastSequence);
+ PutVarint64(dst, last_sequence_);
+ }
+
+ for (int i = 0; i < compact_pointers_.size(); i++) { // Repeated fields: one tagged record per element
+ PutVarint32(dst, kCompactPointer);
+ PutVarint32(dst, compact_pointers_[i].first); // level
+ PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode());
+ }
+
+ for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
+ iter != deleted_files_.end();
+ ++iter) {
+ PutVarint32(dst, kDeletedFile);
+ PutVarint32(dst, iter->first); // level
+ PutVarint64(dst, iter->second); // file number
+ }
+
+ for (int i = 0; i < new_files_.size(); i++) {
+ const FileMetaData& f = new_files_[i].second;
+ PutVarint32(dst, kNewFile);
+ PutVarint32(dst, new_files_[i].first); // level
+ PutVarint64(dst, f.number);
+ PutVarint64(dst, f.file_size);
+ PutLengthPrefixedSlice(dst, f.smallest.Encode());
+ PutLengthPrefixedSlice(dst, f.largest.Encode());
+ }
+
+ for (int i = 0; i < large_refs_added_.size(); i++) {
+ const VersionEdit::Large& l = large_refs_added_[i];
+ PutVarint32(dst, kLargeValueRef);
+ PutLengthPrefixedSlice(dst,
+ Slice(l.large_ref.data, LargeValueRef::ByteSize())); // Raw ref bytes; DecodeFrom() rejects any other length
+ PutVarint64(dst, l.fnum);
+ PutLengthPrefixedSlice(dst, l.internal_key.Encode());
+ }
+}
+
+static bool GetInternalKey(Slice* input, InternalKey* dst) {
+  // Reads one length-prefixed string from the front of *input and decodes
+  // it into *dst.  Returns false, leaving *dst untouched, when the
+  // length-prefixed string cannot be read.
+  Slice encoded;
+  if (!GetLengthPrefixedSlice(input, &encoded)) {
+    return false;
+  }
+  dst->DecodeFrom(encoded);
+  return true;
+}
+
+static bool GetLevel(Slice* input, int* level) {
+  // Reads a varint32 from the front of *input and accepts it as a level
+  // only if it is below config::kNumLevels.  *level is written on success
+  // only.
+  uint32_t raw;
+  if (!GetVarint32(input, &raw)) {
+    return false;
+  }
+  if (!(raw < config::kNumLevels)) {
+    return false;
+  }
+  *level = raw;
+  return true;
+}
+
+Status VersionEdit::DecodeFrom(const Slice& src) { // Parses a record produced by EncodeTo(); returns Status::Corruption on any malformed or unknown field
+ Clear();
+ Slice input = src; // Local copy that the Get* helpers consume from the front
+ const char* msg = NULL; // Names the field that failed to parse; NULL while everything is fine
+ uint32_t tag;
+
+ // Temporary storage for parsing
+ int level;
+ uint64_t number;
+ FileMetaData f;
+ Slice str;
+ Large large;
+ InternalKey key;
+
+ while (msg == NULL && GetVarint32(&input, &tag)) { // One iteration per tagged field; stops at the first error or when no further tag can be read
+ switch (tag) {
+ case kComparator:
+ if (GetLengthPrefixedSlice(&input, &str)) {
+ comparator_ = str.ToString();
+ has_comparator_ = true;
+ } else {
+ msg = "comparator name";
+ }
+ break;
+
+ case kLogNumber:
+ if (GetVarint64(&input, &log_number_)) {
+ has_log_number_ = true;
+ } else {
+ msg = "log number";
+ }
+ break;
+
+ case kNextFileNumber:
+ if (GetVarint64(&input, &next_file_number_)) {
+ has_next_file_number_ = true;
+ } else {
+ msg = "next file number";
+ }
+ break;
+
+ case kLastSequence:
+ if (GetVarint64(&input, &last_sequence_)) {
+ has_last_sequence_ = true;
+ } else {
+ msg = "last sequence number";
+ }
+ break;
+
+ case kCompactPointer:
+ if (GetLevel(&input, &level) &&
+ GetInternalKey(&input, &key)) {
+ compact_pointers_.push_back(std::make_pair(level, key));
+ } else {
+ msg = "compaction pointer";
+ }
+ break;
+
+ case kDeletedFile:
+ if (GetLevel(&input, &level) &&
+ GetVarint64(&input, &number)) {
+ deleted_files_.insert(std::make_pair(level, number));
+ } else {
+ msg = "deleted file";
+ }
+ break;
+
+ case kNewFile:
+ if (GetLevel(&input, &level) &&
+ GetVarint64(&input, &f.number) &&
+ GetVarint64(&input, &f.file_size) &&
+ GetInternalKey(&input, &f.smallest) &&
+ GetInternalKey(&input, &f.largest)) {
+ new_files_.push_back(std::make_pair(level, f)); // "f" is copied into the vector, so reusing it for the next record is fine
+ } else {
+ msg = "new-file entry";
+ }
+ break;
+
+ case kLargeValueRef:
+ if (GetLengthPrefixedSlice(&input, &str) &&
+ (str.size() == LargeValueRef::ByteSize()) &&
+ GetVarint64(&input, &large.fnum) &&
+ GetInternalKey(&input, &large.internal_key)) {
+ large.large_ref = LargeValueRef::FromRef(str);
+ large_refs_added_.push_back(large);
+ } else {
+ msg = "large ref";
+ }
+ break;
+
+ default:
+ msg = "unknown tag";
+ break;
+ }
+ }
+
+ if (msg == NULL && !input.empty()) { // Bytes remain but no tag could be read from them
+ msg = "invalid tag";
+ }
+
+ Status result; // Default-constructed Status is returned when nothing went wrong
+ if (msg != NULL) {
+ result = Status::Corruption("VersionEdit", msg);
+ }
+ return result;
+}
+
+std::string VersionEdit::DebugString() const { // Multi-line human-readable dump; fields appear in the same order EncodeTo() writes them
+ std::string r;
+ r.append("VersionEdit {");
+ if (has_comparator_) { // Scalar fields are printed only when present
+ r.append("\n Comparator: ");
+ r.append(comparator_);
+ }
+ if (has_log_number_) {
+ r.append("\n LogNumber: ");
+ AppendNumberTo(&r, log_number_);
+ }
+ if (has_next_file_number_) {
+ r.append("\n NextFile: ");
+ AppendNumberTo(&r, next_file_number_);
+ }
+ if (has_last_sequence_) {
+ r.append("\n LastSeq: ");
+ AppendNumberTo(&r, last_sequence_);
+ }
+ for (int i = 0; i < compact_pointers_.size(); i++) { // Printed as: level 'escaped key'
+ r.append("\n CompactPointer: ");
+ AppendNumberTo(&r, compact_pointers_[i].first);
+ r.append(" '");
+ AppendEscapedStringTo(&r, compact_pointers_[i].second.Encode());
+ r.append("'");
+ }
+ for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
+ iter != deleted_files_.end();
+ ++iter) { // Printed as: level file-number
+ r.append("\n DeleteFile: ");
+ AppendNumberTo(&r, iter->first);
+ r.append(" ");
+ AppendNumberTo(&r, iter->second);
+ }
+ for (int i = 0; i < new_files_.size(); i++) { // Printed as: level file-number file-size 'smallest' .. 'largest'
+ const FileMetaData& f = new_files_[i].second;
+ r.append("\n AddFile: ");
+ AppendNumberTo(&r, new_files_[i].first);
+ r.append(" ");
+ AppendNumberTo(&r, f.number);
+ r.append(" ");
+ AppendNumberTo(&r, f.file_size);
+ r.append(" '");
+ AppendEscapedStringTo(&r, f.smallest.Encode());
+ r.append("' .. '");
+ AppendEscapedStringTo(&r, f.largest.Encode());
+ r.append("'");
+ }
+ for (int i = 0; i < large_refs_added_.size(); i++) { // Printed as: file-number ref-as-filename 'escaped key'
+ const VersionEdit::Large& l = large_refs_added_[i];
+ r.append("\n LargeRef: ");
+ AppendNumberTo(&r, l.fnum);
+ r.append(" ");
+ r.append(LargeValueRefToFilenameString(l.large_ref));
+ r.append(" '");
+ AppendEscapedStringTo(&r, l.internal_key.Encode());
+ r.append("'");
+ }
+ r.append("\n}\n");
+ return r;
+}
+
+}
diff --git a/db/version_edit.h b/db/version_edit.h
new file mode 100644
index 0000000..1b71283
--- /dev/null
+++ b/db/version_edit.h
@@ -0,0 +1,118 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_VERSION_EDIT_H_
+#define STORAGE_LEVELDB_DB_VERSION_EDIT_H_
+
+#include <set>
+#include <utility>
+#include <vector>
+#include "db/dbformat.h"
+
+namespace leveldb {
+
+class VersionSet;
+
+// Metadata describing a single table file that belongs to some level.
+struct FileMetaData {
+ // Reference count; the default constructor starts it at 0.
+ int refs;
+ // File number.  NOTE: not initialized by the default constructor; callers
+ // must assign it before use.
+ uint64_t number;
+ uint64_t file_size; // File size in bytes
+ InternalKey smallest; // Smallest internal key served by table
+ InternalKey largest; // Largest internal key served by table
+
+ FileMetaData() : refs(0), file_size(0) { }
+};
+
+// A VersionEdit records a set of changes (files added/deleted per level,
+// compaction pointers, large value references and a few optional scalar
+// fields) that can be encoded to / decoded from a string and printed for
+// debugging.
+class VersionEdit {
+ public:
+ VersionEdit() { Clear(); }
+ ~VersionEdit() { }
+
+ void Clear();
+
+ // Each of the following setters stores the value and marks the
+ // corresponding has_* flag so the field is known to be present.
+ void SetComparatorName(const Slice& name) {
+ has_comparator_ = true;
+ comparator_ = name.ToString();
+ }
+ void SetLogNumber(uint64_t num) {
+ has_log_number_ = true;
+ log_number_ = num;
+ }
+ void SetNextFile(uint64_t num) {
+ has_next_file_number_ = true;
+ next_file_number_ = num;
+ }
+ void SetLastSequence(SequenceNumber seq) {
+ has_last_sequence_ = true;
+ last_sequence_ = seq;
+ }
+ // Append a (level, key) compaction pointer.  Repeated calls for the same
+ // level accumulate; nothing is overwritten here.
+ void SetCompactPointer(int level, const InternalKey& key) {
+ compact_pointers_.push_back(std::make_pair(level, key));
+ }
+
+ // Add the specified file at the specified level.
+ // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
+ // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
+ void AddFile(int level, uint64_t file,
+ uint64_t file_size,
+ const InternalKey& smallest,
+ const InternalKey& largest) {
+ FileMetaData f;
+ f.number = file;
+ f.file_size = file_size;
+ f.smallest = smallest;
+ f.largest = largest;
+ new_files_.push_back(std::make_pair(level, f));
+ }
+
+ // Delete the specified "file" from the specified "level".
+ // Deletions are kept in a set, so recording the same pair twice is a no-op.
+ void DeleteFile(int level, uint64_t file) {
+ deleted_files_.insert(std::make_pair(level, file));
+ }
+
+ // Record that a large value with the specified large_ref was
+ // written to the output file numbered "fnum"
+ // "internal_key" is an encoded internal key; it is decoded into the
+ // stored entry.
+ void AddLargeValueRef(const LargeValueRef& large_ref,
+ uint64_t fnum,
+ const Slice& internal_key) {
+ // Grow by one default-constructed entry, then fill it in place.
+ large_refs_added_.resize(large_refs_added_.size() + 1);
+ Large* large = &(large_refs_added_.back());
+ large->large_ref = large_ref;
+ large->fnum = fnum;
+ large->internal_key.DecodeFrom(internal_key);
+ }
+
+ // Serialize this edit by appending to *dst / parse an edit from "src".
+ void EncodeTo(std::string* dst) const;
+ Status DecodeFrom(const Slice& src);
+
+ // Return a human readable description of this edit.
+ std::string DebugString() const;
+
+ private:
+ friend class VersionSet;
+
+ // Set of (level, file number) pairs.
+ typedef std::set< std::pair<int, uint64_t> > DeletedFileSet;
+
+ // Optional scalar fields; each is meaningful only if its has_* flag is set.
+ std::string comparator_;
+ uint64_t log_number_;
+ uint64_t next_file_number_;
+ SequenceNumber last_sequence_;
+ bool has_comparator_;
+ bool has_log_number_;
+ bool has_next_file_number_;
+ bool has_last_sequence_;
+
+ // (level, key) pairs in the order they were recorded.
+ std::vector< std::pair<int, InternalKey> > compact_pointers_;
+ DeletedFileSet deleted_files_;
+ // (level, file metadata) pairs in the order they were added.
+ std::vector< std::pair<int, FileMetaData> > new_files_;
+ // One recorded large value reference (see AddLargeValueRef).
+ struct Large {
+ LargeValueRef large_ref;
+ uint64_t fnum;
+ InternalKey internal_key;
+ };
+ std::vector<Large> large_refs_added_;
+};
+
+}
+
+#endif // STORAGE_LEVELDB_DB_VERSION_EDIT_H_
diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc
new file mode 100644
index 0000000..50913cd
--- /dev/null
+++ b/db/version_edit_test.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+// Round-trip check: encode "edit", decode the bytes into a fresh
+// VersionEdit, re-encode that one, and require both encodings to match.
+static void TestEncodeDecode(const VersionEdit& edit) {
+  std::string first_pass;
+  edit.EncodeTo(&first_pass);
+
+  VersionEdit decoded;
+  Status status = decoded.DecodeFrom(first_pass);
+  ASSERT_TRUE(status.ok()) << status.ToString();
+
+  std::string second_pass;
+  decoded.EncodeTo(&second_pass);
+  ASSERT_EQ(first_pass, second_pass);
+}
+
+// Empty fixture type used by the TEST macro below.
+class VersionEditTest { };
+
+// Grows an edit over four rounds, checking the encode/decode round trip
+// before each round and once more after the scalar fields are set.  All
+// numbers are offset by kBig so that values wider than 32 bits are covered.
+TEST(VersionEditTest, EncodeDecode) {
+  static const uint64_t kBig = 1ull << 50;
+
+  VersionEdit edit;
+  int round = 0;
+  while (round < 4) {
+    // Verify what has been accumulated so far (empty on the first round).
+    TestEncodeDecode(edit);
+
+    const InternalKey smallest("foo", kBig + 500 + round, kTypeLargeValueRef);
+    const InternalKey largest("zoo", kBig + 600 + round, kTypeDeletion);
+    edit.AddFile(3, kBig + 300 + round, kBig + 400 + round, smallest, largest);
+
+    edit.DeleteFile(4, kBig + 700 + round);
+
+    edit.AddLargeValueRef(LargeValueRef::Make("big", kNoCompression),
+                          kBig + 800 + round, "foobar");
+    edit.AddLargeValueRef(LargeValueRef::Make("big2", kLightweightCompression),
+                          kBig + 801 + round, "baz");
+
+    const InternalKey pointer("x", kBig + 900 + round, kTypeValue);
+    edit.SetCompactPointer(round, pointer);
+
+    round++;
+  }
+
+  edit.SetComparatorName("foo");
+  edit.SetLogNumber(kBig + 100);
+  edit.SetNextFile(kBig + 200);
+  edit.SetLastSequence(kBig + 1000);
+  TestEncodeDecode(edit);
+}
+
+}
+
+// Test binary entry point: runs every registered test and uses the result
+// as the process exit code.  Command-line arguments are ignored.
+int main(int argc, char** argv) {
+  const int exit_code = leveldb::test::RunAllTests();
+  return exit_code;
+}
diff --git a/db/version_set.cc b/db/version_set.cc
new file mode 100644
index 0000000..2435fa2
--- /dev/null
+++ b/db/version_set.cc
@@ -0,0 +1,1003 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+
+#include <algorithm>
+#include <stdio.h>
+#include "db/filename.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "include/env.h"
+#include "include/table_builder.h"
+#include "table/merger.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+// Byte budget for "level": 4MB for level 0, 10MB for level 1, and ten times
+// the previous level's budget for each level after that.
+static double MaxBytesForLevel(int level) {
+  if (level == 0) {
+    return 4 * 1048576.0;
+  }
+  double limit = 10 * 1048576.0;
+  for (int remaining = level; remaining > 1; remaining--) {
+    limit *= 10;
+  }
+  return limit;
+}
+
+// Maximum size of a compaction output file.  Currently 2MB regardless of
+// "level".  We could vary per level to reduce number of files?
+static uint64_t MaxFileSizeForLevel(int level) {
+  const uint64_t max_file_size = 2 << 20;
+  return max_file_size;
+}
+
+namespace {
+// Formats "s" as "{a,b,c}" in the set's iteration order; an empty set
+// yields "{}".
+std::string IntSetToString(const std::set<uint64_t>& s) {
+  std::string result = "{";
+  std::set<uint64_t>::const_iterator it = s.begin();
+  while (it != s.end()) {
+    // Anything beyond the opening brace means an element was already
+    // written, so a separator is needed.
+    if (result.size() > 1) {
+      result += ",";
+    }
+    result += NumberToString(*it);
+    ++it;
+  }
+  result += "}";
+  return result;
+}
+}
+
+// Drops this version's reference on every file it lists, deleting any
+// FileMetaData whose count reaches zero, and then deletes cleanup_mem_.
+// REQUIRES: no references to this version remain.
+Version::~Version() {
+  assert(refs_ == 0);
+  for (int level = 0; level < config::kNumLevels; level++) {
+    const std::vector<FileMetaData*>& level_files = files_[level];
+    for (size_t i = 0; i < level_files.size(); i++) {
+      FileMetaData* const meta = level_files[i];
+      assert(meta->refs >= 0);
+      meta->refs--;
+      if (meta->refs > 0) {
+        continue;
+      }
+      delete meta;
+    }
+  }
+  delete cleanup_mem_;
+}
+
+// An internal iterator.  For a given version/level pair, yields
+// information about the files in the level.  For a given entry, key()
+// is the largest key that occurs in the file, and value() is an
+// 8-byte value containing the file number of the file, encoded using
+// EncodeFixed64.
+//
+// The iterator only stores a pointer to the file list, so the list must
+// outlive the iterator.  The iterator is invalid exactly when
+// index_ == flist_->size().
+class Version::LevelFileNumIterator : public Iterator {
+ public:
+  LevelFileNumIterator(const Version* version,
+                       const std::vector<FileMetaData*>* flist)
+      : icmp_(version->vset_->icmp_.user_comparator()),
+        flist_(flist),
+        index_(flist->size()) {        // Marks as invalid
+  }
+  virtual bool Valid() const {
+    return index_ < flist_->size();
+  }
+  // Positions at the first file whose largest key is >= "target".  If no
+  // such file exists (including when the list is empty) the iterator
+  // becomes invalid.
+  virtual void Seek(const Slice& target) {
+    // Binary search over the half-open range [left, right).  Using
+    // size() rather than size()-1 as the upper bound keeps the search
+    // well-defined for an empty list and lets "past the last file" be
+    // reported as invalid instead of as the last file.
+    uint32_t left = 0;
+    uint32_t right = flist_->size();
+    while (left < right) {
+      uint32_t mid = left + (right - left) / 2;
+      int cmp = icmp_.Compare((*flist_)[mid]->largest.Encode(), target);
+      if (cmp < 0) {
+        // Key at "mid.largest" is < than "target".  Therefore all
+        // files at or before "mid" are uninteresting.
+        left = mid + 1;
+      } else {
+        // Key at "mid.largest" is >= "target".  Therefore all files
+        // after "mid" are uninteresting.
+        right = mid;
+      }
+    }
+    index_ = left;
+  }
+  virtual void SeekToFirst() { index_ = 0; }
+  virtual void SeekToLast() {
+    // For an empty list, index 0 == size(), i.e. invalid.
+    index_ = flist_->empty() ? 0 : flist_->size() - 1;
+  }
+  virtual void Next() {
+    assert(Valid());
+    index_++;
+  }
+  virtual void Prev() {
+    assert(Valid());
+    if (index_ == 0) {
+      index_ = flist_->size();  // Marks as invalid
+    } else {
+      index_--;
+    }
+  }
+  Slice key() const {
+    assert(Valid());
+    return (*flist_)[index_]->largest.Encode();
+  }
+  // The returned slice points into value_buf_, so it is only good until the
+  // next call to value().
+  Slice value() const {
+    assert(Valid());
+    EncodeFixed64(value_buf_, (*flist_)[index_]->number);
+    return Slice(value_buf_, sizeof(value_buf_));
+  }
+  virtual Status status() const { return Status::OK(); }
+ private:
+  const InternalKeyComparator icmp_;
+  const std::vector<FileMetaData*>* const flist_;
+  uint32_t index_;
+
+  mutable char value_buf_[8];  // Used for encoding the file number for value()
+};
+
+// Callback handed to NewTwoLevelIterator.  "arg" is the TableCache and
+// "file_value" is expected to be an 8-byte fixed64-encoded file number.
+// Returns an iterator over that table file, or an error iterator if
+// "file_value" is not exactly 8 bytes long.
+static Iterator* GetFileIterator(void* arg,
+                                 const ReadOptions& options,
+                                 const Slice& file_value) {
+  if (file_value.size() == 8) {
+    TableCache* const table_cache = reinterpret_cast<TableCache*>(arg);
+    return table_cache->NewIterator(options,
+                                    DecodeFixed64(file_value.data()));
+  }
+  return NewErrorIterator(
+      Status::Corruption("FileReader invoked with unexpected value"));
+}
+
+// Returns a two-level iterator over "level": the first level walks the
+// level's file list, the second opens each file through the table cache
+// via GetFileIterator.
+Iterator* Version::NewConcatenatingIterator(const ReadOptions& options,
+                                            int level) const {
+  Iterator* const file_index =
+      new LevelFileNumIterator(this, &files_[level]);
+  return NewTwoLevelIterator(
+      file_index, &GetFileIterator, vset_->table_cache_, options);
+}
+
+// Appends to *iters one iterator per level-0 file plus one concatenating
+// iterator for each non-empty level above 0.
+void Version::AddIterators(const ReadOptions& options,
+                           std::vector<Iterator*>* iters) {
+  // Level-0 files may overlap each other, so each one gets its own
+  // iterator and the caller merges them.
+  const std::vector<FileMetaData*>& level0 = files_[0];
+  for (size_t i = 0; i < level0.size(); i++) {
+    Iterator* const table_iter =
+        vset_->table_cache_->NewIterator(options, level0[i]->number);
+    iters->push_back(table_iter);
+  }
+
+  // Files in levels > 0 do not overlap, so a single concatenating iterator
+  // can walk them sequentially, opening them lazily.
+  for (int level = 1; level < config::kNumLevels; level++) {
+    if (files_[level].empty()) {
+      continue;
+    }
+    iters->push_back(NewConcatenatingIterator(options, level));
+  }
+}
+
+// Takes one reference on this version.
+void Version::Ref() {
+  refs_ += 1;
+}
+
+// Releases one reference.  When the count reaches zero the owning
+// VersionSet is asked to delete old versions, which may include this one,
+// so nothing may touch *this after that call.
+void Version::Unref() {
+  assert(refs_ >= 1);
+  refs_ -= 1;
+  if (refs_ != 0) {
+    return;
+  }
+  vset_->MaybeDeleteOldVersions();
+  // TODO: try to delete obsolete files
+}
+
+// Returns one line per level listing that level's files, e.g.
+//   level 1: 17:123['a' .. 'd'] 20:43['e' .. 'g']
+// where each entry is <file number>:<file size>['<smallest>' .. '<largest>'].
+std::string Version::DebugString() const {
+  std::string out;
+  for (int level = 0; level < config::kNumLevels; level++) {
+    out += "level ";
+    AppendNumberTo(&out, level);
+    out += ':';
+
+    const std::vector<FileMetaData*>& level_files = files_[level];
+    for (size_t i = 0; i < level_files.size(); i++) {
+      const FileMetaData* const meta = level_files[i];
+      out += ' ';
+      AppendNumberTo(&out, meta->number);
+      out += ':';
+      AppendNumberTo(&out, meta->file_size);
+      out += "['";
+      AppendEscapedStringTo(&out, meta->smallest.Encode());
+      out += "' .. '";
+      AppendEscapedStringTo(&out, meta->largest.Encode());
+      out += "']";
+    }
+    out += '\n';
+  }
+  return out;
+}
+
+// A helper class so we can efficiently apply a whole sequence
+// of edits to a particular state without creating intermediate
+// Versions that contain full copies of the intermediate state.
+//
+// The builder holds one reference on every FileMetaData it tracks and
+// releases those references in its destructor.
+class VersionSet::Builder {
+ private:
+ // Per-level map from file number to file metadata.
+ typedef std::map<uint64_t, FileMetaData*> FileMap;
+ VersionSet* vset_;
+ FileMap files_[config::kNumLevels];
+
+ public:
+ // Initialize a builder with the files from *base and other info from *vset
+ Builder(VersionSet* vset, Version* base)
+ : vset_(vset) {
+ for (int level = 0; level < config::kNumLevels; level++) {
+ const std::vector<FileMetaData*>& files = base->files_[level];
+ for (int i = 0; i < files.size(); i++) {
+ FileMetaData* f = files[i];
+ // The metadata object is shared with *base, so take a reference.
+ f->refs++;
+ files_[level].insert(std::make_pair(f->number, f));
+ }
+ }
+ }
+
+ // Drop the builder's reference on every tracked file, deleting metadata
+ // that nothing else references.
+ ~Builder() {
+ for (int level = 0; level < config::kNumLevels; level++) {
+ const FileMap& fmap = files_[level];
+ for (FileMap::const_iterator iter = fmap.begin();
+ iter != fmap.end();
+ ++iter) {
+ FileMetaData* f = iter->second;
+ f->refs--;
+ if (f->refs <= 0) {
+ delete f;
+ }
+ }
+ }
+ }
+
+ // Apply all of the edits in *edit to the current state.
+ // NOTE: compaction pointers and large value refs are written straight
+ // into *vset_, not into the builder's own state.
+ void Apply(VersionEdit* edit) {
+ // Update compaction pointers
+ for (int i = 0; i < edit->compact_pointers_.size(); i++) {
+ const int level = edit->compact_pointers_[i].first;
+ vset_->compact_pointer_[level] =
+ edit->compact_pointers_[i].second.Encode().ToString();
+ }
+
+ // Delete files
+ const VersionEdit::DeletedFileSet& del = edit->deleted_files_;
+ for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin();
+ iter != del.end();
+ ++iter) {
+ const int level = iter->first;
+ const uint64_t number = iter->second;
+ FileMap::iterator fiter = files_[level].find(number);
+ assert(fiter != files_[level].end()); // Sanity check for debug mode
+ // In non-debug builds a deletion of an unknown file is silently ignored.
+ if (fiter != files_[level].end()) {
+ FileMetaData* f = fiter->second;
+ f->refs--;
+ if (f->refs <= 0) {
+ delete f;
+ }
+ files_[level].erase(fiter);
+ }
+ }
+
+ // Add new files
+ for (int i = 0; i < edit->new_files_.size(); i++) {
+ const int level = edit->new_files_[i].first;
+ // Heap-allocate a copy of the edit's metadata; the builder owns the
+ // initial reference.
+ FileMetaData* f = new FileMetaData(edit->new_files_[i].second);
+ f->refs = 1;
+ assert(files_[level].count(f->number) == 0);
+ files_[level].insert(std::make_pair(f->number, f));
+ }
+
+ // Add large value refs
+ for (int i = 0; i < edit->large_refs_added_.size(); i++) {
+ const VersionEdit::Large& l = edit->large_refs_added_[i];
+ vset_->RegisterLargeValueRef(l.large_ref, l.fnum, l.internal_key);
+ }
+ }
+
+ // Save the current state in *v.
+ // Files are appended to v->files_[level] in increasing file-number order
+ // (the iteration order of the map); *v takes its own reference on each.
+ void SaveTo(Version* v) {
+ for (int level = 0; level < config::kNumLevels; level++) {
+ const FileMap& fmap = files_[level];
+ for (FileMap::const_iterator iter = fmap.begin();
+ iter != fmap.end();
+ ++iter) {
+ FileMetaData* f = iter->second;
+ f->refs++;
+ v->files_[level].push_back(f);
+ }
+ }
+ }
+};
+
+// Creates a version set for the database "dbname".  The set starts with a
+// single empty Version that is both current_ and oldest_.  The pointers
+// "options" and "table_cache" are stored, not copied; *cmp is copied into
+// icmp_.
+VersionSet::VersionSet(const std::string& dbname,
+ const Options* options,
+ TableCache* table_cache,
+ const InternalKeyComparator* cmp)
+ : env_(options->env),
+ dbname_(dbname),
+ options_(options),
+ table_cache_(table_cache),
+ icmp_(*cmp),
+ next_file_number_(2),
+ manifest_file_number_(0), // Filled by Recover()
+ descriptor_file_(NULL),
+ descriptor_log_(NULL),
+ current_(new Version(this)),
+ oldest_(current_) {
+}
+
+// Deletes every version on the list, oldest first, followed by the
+// descriptor log writer and its file.
+// REQUIRES: no version is still referenced.
+VersionSet::~VersionSet() {
+  Version* v = oldest_;
+  while (v != NULL) {
+    Version* const following = v->next_;
+    assert(v->refs_ == 0);
+    delete v;
+    v = following;
+  }
+  delete descriptor_log_;
+  delete descriptor_file_;
+}
+
+// Applies *edit to the current version to form a new version, appends the
+// encoded edit to the descriptor log (creating the log, with a snapshot of
+// the current state, if none is open yet), and on success installs the new
+// version as current_.
+//
+// On success "cleanup_mem" is stored in the previous current version's
+// cleanup_mem_ field.  On failure the new version is discarded, and a
+// descriptor file created by this call is closed and deleted.
+// NOTE: Builder::Apply writes compaction pointers and large value refs
+// into this VersionSet before anything is logged; those writes are not
+// undone on failure.
+Status VersionSet::LogAndApply(VersionEdit* edit, MemTable* cleanup_mem) {
+ edit->SetNextFile(next_file_number_);
+
+ Version* v = new Version(this);
+ {
+ // Scoped so the builder's file references are released before logging.
+ Builder builder(this, current_);
+ builder.Apply(edit);
+ builder.SaveTo(v);
+ }
+
+ // Non-empty only if this call creates a new descriptor file.
+ std::string new_manifest_file;
+ Status s = Finalize(v);
+
+ // Initialize new descriptor log file if necessary by creating
+ // a temporary file that contains a snapshot of the current version.
+ if (s.ok()) {
+ if (descriptor_log_ == NULL) {
+ assert(descriptor_file_ == NULL);
+ new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_);
+ // NOTE(review): repeats the SetNextFile call at the top of this function
+ // with what appears to be the same value -- confirm whether it is needed.
+ edit->SetNextFile(next_file_number_);
+ s = env_->NewWritableFile(new_manifest_file, &descriptor_file_);
+ if (s.ok()) {
+ descriptor_log_ = new log::Writer(descriptor_file_);
+ s = WriteSnapshot(descriptor_log_);
+ }
+ }
+ }
+
+ // Write new record to log file
+ if (s.ok()) {
+ std::string record;
+ edit->EncodeTo(&record);
+ s = descriptor_log_->AddRecord(record);
+ if (s.ok()) {
+ s = descriptor_file_->Sync();
+ }
+ }
+
+ // If we just created a new descriptor file, install it by writing a
+ // new CURRENT file that points to it.
+ if (s.ok() && !new_manifest_file.empty()) {
+ s = SetCurrentFile(env_, dbname_, manifest_file_number_);
+ }
+
+ // Install the new version
+ if (s.ok()) {
+ assert(current_->next_ == NULL);
+ assert(current_->cleanup_mem_ == NULL);
+ // The outgoing version takes the memtable; ~Version deletes it.
+ current_->cleanup_mem_ = cleanup_mem;
+ v->next_ = NULL;
+ current_->next_ = v;
+ current_ = v;
+ } else {
+ delete v;
+ if (!new_manifest_file.empty()) {
+ // Undo the descriptor file creation so a later call starts afresh.
+ delete descriptor_log_;
+ delete descriptor_file_;
+ descriptor_log_ = NULL;
+ descriptor_file_ = NULL;
+ env_->DeleteFile(new_manifest_file);
+ }
+ }
+ //Log(env_, options_->info_log, "State\n%s", current_->DebugString().c_str());
+
+ return s;
+}
+
+// Rebuilds the current version from persistent state: reads the CURRENT
+// file to find the descriptor (manifest) file, replays every VersionEdit
+// record in it on top of the current version, and installs the result as
+// current_.
+//
+// On success *log_number and *last_sequence hold the values from the last
+// edit that carried them, manifest_file_number_ is set to the recovered
+// next-file number and next_file_number_ to one past it.
+//
+// Returns a non-OK status if CURRENT or the descriptor cannot be read, a
+// record is corrupt, the recorded comparator name differs from the one in
+// use, a required metadata entry is missing, or the recovered files fail
+// the checks in Finalize().
+Status VersionSet::Recover(uint64_t* log_number,
+                           SequenceNumber* last_sequence) {
+  // Remembers the first corruption reported by the log reader.
+  struct LogReporter : public log::Reader::Reporter {
+    Status* status;
+    virtual void Corruption(size_t bytes, const Status& s) {
+      if (this->status->ok()) *this->status = s;
+    }
+  };
+
+  // Read "CURRENT" file, which contains a pointer to the current manifest file
+  std::string current;
+  Status s = ReadFileToString(env_, CurrentFileName(dbname_), &current);
+  if (!s.ok()) {
+    return s;
+  }
+  if (current.empty() || current[current.size()-1] != '\n') {
+    return Status::Corruption("CURRENT file does not end with newline");
+  }
+  current.resize(current.size() - 1);  // Strip the trailing newline
+
+  std::string dscname = dbname_ + "/" + current;
+  SequentialFile* file;
+  s = env_->NewSequentialFile(dscname, &file);
+  if (!s.ok()) {
+    return s;
+  }
+
+  bool have_log_number = false;
+  bool have_next_file = false;
+  bool have_last_sequence = false;
+  uint64_t next_file = 0;
+  Builder builder(this, current_);
+
+  {
+    LogReporter reporter;
+    reporter.status = &s;
+    log::Reader reader(file, &reporter, true/*checksum*/);
+    Slice record;
+    std::string scratch;
+    // Stop at end of file or at the first error, whichever comes first.
+    while (reader.ReadRecord(&record, &scratch) && s.ok()) {
+      VersionEdit edit;
+      s = edit.DecodeFrom(record);
+      if (s.ok()) {
+        if (edit.has_comparator_ &&
+            edit.comparator_ != icmp_.user_comparator()->Name()) {
+          // The leading space keeps the comparator name from running into
+          // the rest of the message.
+          s = Status::InvalidArgument(
+              edit.comparator_ + " does not match existing comparator ",
+              icmp_.user_comparator()->Name());
+        }
+      }
+
+      if (s.ok()) {
+        builder.Apply(&edit);
+      }
+
+      // Later records override earlier ones for each metadata field.
+      if (edit.has_log_number_) {
+        *log_number = edit.log_number_;
+        have_log_number = true;
+      }
+
+      if (edit.has_next_file_number_) {
+        next_file = edit.next_file_number_;
+        have_next_file = true;
+      }
+
+      if (edit.has_last_sequence_) {
+        *last_sequence = edit.last_sequence_;
+        have_last_sequence = true;
+      }
+    }
+  }
+  delete file;
+  file = NULL;
+
+  // All three metadata entries must have appeared at least once.
+  if (s.ok()) {
+    if (!have_next_file) {
+      s = Status::Corruption("no meta-nextfile entry in descriptor");
+    } else if (!have_log_number) {
+      s = Status::Corruption("no meta-lognumber entry in descriptor");
+    } else if (!have_last_sequence) {
+      s = Status::Corruption("no last-sequence-number entry in descriptor");
+    }
+  }
+
+  if (s.ok()) {
+    Version* v = new Version(this);
+    builder.SaveTo(v);
+    s = Finalize(v);
+    if (!s.ok()) {
+      delete v;
+    } else {
+      // Install recovered version
+      v->next_ = NULL;
+      current_->next_ = v;
+      current_ = v;
+      manifest_file_number_ = next_file;
+      next_file_number_ = next_file + 1;
+    }
+  }
+
+  return s;
+}
+
+// Sorts the files of every level of *v (SortLevel also checks levels > 0
+// for overlapping ranges) and precomputes which level most needs
+// compaction, storing the result in v->compaction_level_ and
+// v->compaction_score_.  Returns the first non-OK status from SortLevel;
+// levels after a failing one are left unsorted.
+//
+// A level's score is its total file size divided by MaxBytesForLevel();
+// level 0 additionally considers its file count.
+//
+// The last level is sorted but never chosen as the compaction level:
+// PickCompaction() collects overlapping files from "level+1", and no such
+// level exists for level kNumLevels-1.
+Status VersionSet::Finalize(Version* v) {
+  // Precomputed best level for next compaction
+  int best_level = -1;
+  double best_score = -1;
+
+  Status s;
+  for (int level = 0; s.ok() && level < config::kNumLevels; level++) {
+    s = SortLevel(v, level);
+
+    if (level == config::kNumLevels - 1) {
+      // Nothing to compact into, so this level is not a candidate.
+      continue;
+    }
+
+    // Compute the ratio of current size to size limit.
+    uint64_t level_bytes = 0;
+    for (size_t i = 0; i < v->files_[level].size(); i++) {
+      level_bytes += v->files_[level][i]->file_size;
+    }
+    double score = static_cast<double>(level_bytes) / MaxBytesForLevel(level);
+
+    if (level == 0) {
+      // Level-0 file sizes are going to be often much smaller than
+      // MaxBytesForLevel(0) since we do not account for compression
+      // when producing a level-0 file; and too many level-0 files
+      // increase merging costs.  So use a file-count limit for
+      // level-0 in addition to the byte-count limit.
+      double count_score = v->files_[level].size() / 4.0;
+      if (count_score > score) {
+        score = count_score;
+      }
+    }
+
+    if (score > best_score) {
+      best_level = level;
+      best_score = score;
+    }
+  }
+
+  v->compaction_level_ = best_level;
+  v->compaction_score_ = best_score;
+  return s;
+}
+
+// Writes one record to "log" describing the complete current state: the
+// comparator name, every non-empty compaction pointer, every file of the
+// current version and every registered large value reference.
+Status VersionSet::WriteSnapshot(log::Writer* log) {
+  // TODO: Break up into multiple records to reduce memory usage on recovery?
+
+  VersionEdit snapshot;
+
+  // Metadata
+  snapshot.SetComparatorName(icmp_.user_comparator()->Name());
+
+  // Compaction pointers, for the levels that have one
+  for (int level = 0; level < config::kNumLevels; level++) {
+    if (compact_pointer_[level].empty()) {
+      continue;
+    }
+    InternalKey pointer_key;
+    pointer_key.DecodeFrom(compact_pointer_[level]);
+    snapshot.SetCompactPointer(level, pointer_key);
+  }
+
+  // Files of the current version
+  for (int level = 0; level < config::kNumLevels; level++) {
+    const std::vector<FileMetaData*>& level_files = current_->files_[level];
+    for (size_t i = 0; i < level_files.size(); i++) {
+      const FileMetaData* const meta = level_files[i];
+      snapshot.AddFile(level, meta->number, meta->file_size,
+                       meta->smallest, meta->largest);
+    }
+  }
+
+  // Large value refs: one entry per (file number, internal key) reference
+  LargeValueMap::const_iterator value_it = large_value_refs_.begin();
+  while (value_it != large_value_refs_.end()) {
+    const LargeReferencesSet& references = value_it->second;
+    LargeReferencesSet::const_iterator ref_it = references.begin();
+    while (ref_it != references.end()) {
+      snapshot.AddLargeValueRef(value_it->first, ref_it->first, ref_it->second);
+      ++ref_it;
+    }
+    ++value_it;
+  }
+
+  std::string encoded;
+  snapshot.EncodeTo(&encoded);
+  return log->AddRecord(encoded);
+}
+
+// Ordering functor for std::sort: orders files by their smallest internal
+// key according to *internal_comparator.
+struct VersionSet::BySmallestKey {
+  const InternalKeyComparator* internal_comparator;
+
+  bool operator()(FileMetaData* f1, FileMetaData* f2) const {
+    const int order =
+        internal_comparator->Compare(f1->smallest, f2->smallest);
+    return order < 0;
+  }
+};
+
+// Sorts v->files_[level] by smallest key.  For levels above 0 it also
+// verifies that adjacent files do not overlap, returning a Corruption
+// status that names the first offending pair of keys.
+Status VersionSet::SortLevel(Version* v, uint64_t level) {
+  Status result;
+  std::vector<FileMetaData*>& level_files = v->files_[level];
+
+  BySmallestKey by_smallest;
+  by_smallest.internal_comparator = &icmp_;
+  std::sort(level_files.begin(), level_files.end(), by_smallest);
+
+  if (level == 0) {
+    // Level-0 files are allowed to overlap.
+    return result;
+  }
+
+  // There should be no overlap
+  for (size_t i = 1; i < level_files.size(); i++) {
+    const InternalKey& prev_end = level_files[i - 1]->largest;
+    const InternalKey& this_begin = level_files[i]->smallest;
+    if (icmp_.Compare(prev_end, this_begin) < 0) {
+      continue;
+    }
+    result = Status::Corruption(
+        "overlapping ranges in same level",
+        (EscapeString(prev_end.Encode()) + " vs. " +
+         EscapeString(this_begin.Encode())));
+    break;
+  }
+  return result;
+}
+
+// Returns the number of files at "level" in the current version.
+// REQUIRES: 0 <= level < config::kNumLevels
+int VersionSet::NumLevelFiles(int level) const {
+  assert(level >= 0);
+  assert(level < config::kNumLevels);
+  const std::vector<FileMetaData*>& level_files = current_->files_[level];
+  return level_files.size();
+}
+
+// Returns an estimate of how many bytes of data in version "v" precede
+// "ikey": the full size of every file that lies entirely before the key,
+// the approximate in-table offset for files whose range contains it, and
+// the size of every large value whose internal key is <= "ikey".
+uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
+ uint64_t result = 0;
+ for (int level = 0; level < config::kNumLevels; level++) {
+ const std::vector<FileMetaData*>& files = v->files_[level];
+ for (int i = 0; i < files.size(); i++) {
+ if (icmp_.Compare(files[i]->largest, ikey) <= 0) {
+ // Entire file is before "ikey", so just add the file size
+ result += files[i]->file_size;
+ } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) {
+ // Entire file is after "ikey", so ignore
+ if (level > 0) {
+ // Files other than level 0 are sorted by meta->smallest, so
+ // no further files in this level will contain data for
+ // "ikey".
+ break;
+ }
+ } else {
+ // "ikey" falls in the range for this table. Add the
+ // approximate offset of "ikey" within the table.
+ // NOTE(review): tableptr is not initialized here; this relies on
+ // NewIterator always storing to it (NULL on failure) -- confirm.
+ Table* tableptr;
+ Iterator* iter = table_cache_->NewIterator(
+ ReadOptions(), files[i]->number, &tableptr);
+ if (tableptr != NULL) {
+ result += tableptr->ApproximateOffsetOf(ikey.Encode());
+ }
+ // The iterator is only needed to obtain the table; release it.
+ delete iter;
+ }
+ }
+ }
+
+ // Add in large value files which are references from internal keys
+ // stored in the table files
+ //
+ // TODO(opt): this is O(# large values in db). If this becomes too slow,
+ // we could store an auxiliary data structure indexed by internal key
+ for (LargeValueMap::const_iterator it = large_value_refs_.begin();
+ it != large_value_refs_.end();
+ ++it) {
+ const LargeValueRef& lref = it->first;
+ // A large value is counted once per reference that precedes "ikey".
+ for (LargeReferencesSet::const_iterator it2 = it->second.begin();
+ it2 != it->second.end();
+ ++it2) {
+ if (icmp_.Compare(it2->second, ikey.Encode()) <= 0) {
+ // Internal key for large value is before our key of interest
+ result += lref.ValueSize();
+ }
+ }
+ }
+
+
+ return result;
+}
+
+// Records that file "fnum" references the large value "large_ref" under
+// "internal_key".  Returns true iff no reference to "large_ref" was
+// registered before this call.
+bool VersionSet::RegisterLargeValueRef(const LargeValueRef& large_ref,
+                                       uint64_t fnum,
+                                       const InternalKey& internal_key) {
+  // operator[] creates an empty reference set for a value not seen before.
+  LargeReferencesSet& references = large_value_refs_[large_ref];
+  const bool was_unreferenced = references.empty();
+  references.insert(make_pair(fnum, internal_key.Encode().ToString()));
+  return was_unreferenced;
+}
+
+// Drops every large value reference whose file number is neither
+// "log_file_num" nor a member of "live_tables".  A large value left with
+// no references is logged as dead and removed from large_value_refs_.
+void VersionSet::CleanupLargeValueRefs(const std::set<uint64_t>& live_tables,
+                                       uint64_t log_file_num) {
+  LargeValueMap::iterator value_it = large_value_refs_.begin();
+  while (value_it != large_value_refs_.end()) {
+    LargeReferencesSet& references = value_it->second;
+
+    LargeReferencesSet::iterator ref_it = references.begin();
+    while (ref_it != references.end()) {
+      const bool still_live =
+          ref_it->first == log_file_num ||        // In log file
+          live_tables.count(ref_it->first) != 0;  // In a live table
+      if (still_live) {
+        // Still live: leave this reference alone
+        ++ref_it;
+      } else {
+        // No longer live: erase.  The post-increment advances the iterator
+        // before its old position is invalidated.
+        references.erase(ref_it++);
+      }
+    }
+
+    if (!references.empty()) {
+      ++value_it;
+      continue;
+    }
+
+    // No longer any live references to this large value: remove from
+    // large_value_refs
+    Log(env_, options_->info_log, "large value is dead: '%s'",
+        LargeValueRefToFilenameString(value_it->first).c_str());
+    large_value_refs_.erase(value_it++);
+  }
+}
+
+// Returns true iff "large_ref" currently has an entry in large_value_refs_.
+bool VersionSet::LargeValueIsLive(const LargeValueRef& large_ref) {
+  LargeValueMap::iterator found = large_value_refs_.find(large_ref);
+  if (found == large_value_refs_.end()) {
+    return false;
+  }
+  // An entry is expected to hold at least one reference.
+  assert(!found->second.empty());
+  return true;
+}
+
+// Deletes unreferenced versions from the old end of the list, stopping at
+// the first version that is still referenced or is current_.
+void VersionSet::MaybeDeleteOldVersions() {
+  // Note: it is important to delete versions in order since a newer
+  // version with zero refs may be holding a pointer to a memtable
+  // that is used by somebody who has a ref on an older version.
+  for (;;) {
+    if (oldest_ == current_) {
+      break;
+    }
+    if (oldest_->refs_ != 0) {
+      break;
+    }
+    Version* const successor = oldest_->next_;
+    delete oldest_;
+    oldest_ = successor;
+  }
+}
+
+// Inserts into *live the number of every file listed by any version that
+// is still on the version list, at any level.
+void VersionSet::AddLiveFiles(std::set<uint64_t>* live) {
+  Version* v = oldest_;
+  while (v != NULL) {
+    for (int level = 0; level < config::kNumLevels; level++) {
+      const std::vector<FileMetaData*>& level_files = v->files_[level];
+      for (size_t i = 0; i < level_files.size(); i++) {
+        live->insert(level_files[i]->number);
+      }
+    }
+    v = v->next_;
+  }
+}
+
+// Replaces the contents of "*inputs" with all files in "level" of the
+// current version whose user-key range overlaps [begin,end].  Only the
+// user-key portions of "begin" and "end" are compared.
+void VersionSet::GetOverlappingInputs(
+    int level,
+    const InternalKey& begin,
+    const InternalKey& end,
+    std::vector<FileMetaData*>* inputs) {
+  inputs->clear();
+  const Slice range_begin = begin.user_key();
+  const Slice range_end = end.user_key();
+  const Comparator* const ucmp = icmp_.user_comparator();
+  const std::vector<FileMetaData*>& candidates = current_->files_[level];
+  for (size_t i = 0; i < candidates.size(); i++) {
+    FileMetaData* const f = candidates[i];
+    if (ucmp->Compare(f->largest.user_key(), range_begin) < 0) {
+      continue;  // File ends before the range starts
+    }
+    if (ucmp->Compare(f->smallest.user_key(), range_end) > 0) {
+      continue;  // File starts after the range ends
+    }
+    inputs->push_back(f);
+  }
+}
+
+// Computes the smallest internal-key range that covers every file in
+// "inputs" and stores its bounds in *smallest and *largest.
+// REQUIRES: inputs is not empty
+void VersionSet::GetRange(const std::vector<FileMetaData*>& inputs,
+                          InternalKey* smallest,
+                          InternalKey* largest) {
+  assert(!inputs.empty());
+  smallest->Clear();
+  largest->Clear();
+  if (inputs.empty()) {
+    return;  // Only reachable when asserts are compiled out
+  }
+
+  // Seed the range with the first file, then widen it with the rest.
+  *smallest = inputs[0]->smallest;
+  *largest = inputs[0]->largest;
+  for (size_t i = 1; i < inputs.size(); i++) {
+    const FileMetaData* const f = inputs[i];
+    if (icmp_.Compare(f->smallest, *smallest) < 0) {
+      *smallest = f->smallest;
+    }
+    if (icmp_.Compare(f->largest, *largest) > 0) {
+      *largest = f->largest;
+    }
+  }
+}
+
+// Returns a merging iterator over all input files of compaction "c".
+// Reads verify checksums iff options_->paranoid_checks is set, and do not
+// fill the cache.
+Iterator* VersionSet::MakeInputIterator(Compaction* c) {
+ ReadOptions options;
+ options.verify_checksums = options_->paranoid_checks;
+ options.fill_cache = false;
+
+ // Level-0 files have to be merged together. For other levels,
+ // we will make a concatenating iterator per level.
+ // TODO(opt): use concatenating iterator for level-0 if there is no overlap
+ // Slots needed: for a level-0 compaction, one per level-0 input file plus
+ // one for the level-1 inputs; otherwise one per input level.
+ const int space = (c->level() == 0 ? c->inputs_[0].size() + 1 : 2);
+ Iterator** list = new Iterator*[space];
+ int num = 0;
+ for (int which = 0; which < 2; which++) {
+ if (!c->inputs_[which].empty()) {
+ // True only for the first input set of a level-0 compaction.
+ if (c->level() + which == 0) {
+ const std::vector<FileMetaData*>& files = c->inputs_[which];
+ for (int i = 0; i < files.size(); i++) {
+ list[num++] = table_cache_->NewIterator(options, files[i]->number);
+ }
+ } else {
+ // Create concatenating iterator for the files from this level
+ list[num++] = NewTwoLevelIterator(
+ new Version::LevelFileNumIterator(
+ c->input_version_, &c->inputs_[which]),
+ &GetFileIterator, table_cache_, options);
+ }
+ }
+ }
+ assert(num <= space);
+ Iterator* result = NewMergingIterator(&icmp_, list, num);
+ // Only the temporary array is freed here, not the iterators it points to.
+ delete[] list;
+ return result;
+}
+
+// Chooses the inputs for the next compaction of the current version.
+// Returns NULL if NeedsCompaction() is false; otherwise returns a
+// heap-allocated Compaction that holds a reference on the current version.
+// Also advances compact_pointer_[level] past the chosen range.
+//
+// NOTE(review): this reads files_[level+1], so it assumes
+// current_->compaction_level_ < config::kNumLevels - 1 -- confirm that
+// whatever sets compaction_level_ guarantees this.
+Compaction* VersionSet::PickCompaction() {
+ if (!NeedsCompaction()) {
+ return NULL;
+ }
+ const int level = current_->compaction_level_;
+ assert(level >= 0);
+
+ Compaction* c = new Compaction(level);
+ c->input_version_ = current_;
+ c->input_version_->Ref();
+
+ // Pick the first file that comes after compact_pointer_[level]
+ for (int i = 0; i < current_->files_[level].size(); i++) {
+ FileMetaData* f = current_->files_[level][i];
+ if (compact_pointer_[level].empty() ||
+ icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) {
+ c->inputs_[0].push_back(f);
+ break;
+ }
+ }
+ if (c->inputs_[0].empty()) {
+ // Wrap-around to the beginning of the key space
+ // NOTE(review): assumes the level has at least one file -- confirm.
+ c->inputs_[0].push_back(current_->files_[level][0]);
+ }
+
+ // Find the range we are compacting
+ InternalKey smallest, largest;
+ GetRange(c->inputs_[0], &smallest, &largest);
+
+ // Files in level 0 may overlap each other, so pick up all overlapping ones
+ if (level == 0) {
+ // Note that the next call will discard the file we placed in
+ // c->inputs_[0] earlier and replace it with an overlapping set
+ // which will include the picked file.
+ GetOverlappingInputs(0, smallest, largest, &c->inputs_[0]);
+ assert(!c->inputs_[0].empty());
+ GetRange(c->inputs_[0], &smallest, &largest);
+ }
+
+ GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]);
+
+ // See if we can grow the number of inputs in "level" without
+ // changing the number of "level+1" files we pick up.
+ if (!c->inputs_[1].empty()) {
+ // Get entire range covered by compaction
+ std::vector<FileMetaData*> all = c->inputs_[0];
+ all.insert(all.end(), c->inputs_[1].begin(), c->inputs_[1].end());
+ InternalKey all_start, all_limit;
+ GetRange(all, &all_start, &all_limit);
+
+ // Candidate "level" inputs: everything overlapping the combined range.
+ std::vector<FileMetaData*> expanded0;
+ GetOverlappingInputs(level, all_start, all_limit, &expanded0);
+ if (expanded0.size() > c->inputs_[0].size()) {
+ InternalKey new_start, new_limit;
+ GetRange(expanded0, &new_start, &new_limit);
+ std::vector<FileMetaData*> expanded1;
+ GetOverlappingInputs(level+1, new_start, new_limit, &expanded1);
+ // Accept the expansion only if it pulls in no extra "level+1" files.
+ if (expanded1.size() == c->inputs_[1].size()) {
+ Log(env_, options_->info_log,
+ "Expanding@%d %d+%d to %d+%d\n",
+ level,
+ int(c->inputs_[0].size()),
+ int(c->inputs_[1].size()),
+ int(expanded0.size()),
+ int(expanded1.size()));
+ smallest = new_start;
+ largest = new_limit;
+ c->inputs_[0] = expanded0;
+ c->inputs_[1] = expanded1;
+ }
+ }
+ }
+
+ // Disabled debug logging.
+ if (false) {
+ Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'",
+ level,
+ EscapeString(smallest.Encode()).c_str(),
+ EscapeString(largest.Encode()).c_str());
+ }
+
+ // Update the place where we will do the next compaction for this level.
+ // We update this immediately instead of waiting for the VersionEdit
+ // to be applied so that if the compaction fails, we will try a different
+ // key range next time.
+ compact_pointer_[level] = largest.Encode().ToString();
+ c->edit_.SetCompactPointer(level, largest);
+
+ return c;
+}
+
+// Builds a compaction of every file in "level" that overlaps [begin,end]
+// together with the overlapping files in "level+1".  Returns NULL if no
+// file in "level" overlaps the range; otherwise the returned Compaction
+// holds a reference on the current version.
+Compaction* VersionSet::CompactRange(
+    int level,
+    const InternalKey& begin,
+    const InternalKey& end) {
+  std::vector<FileMetaData*> overlapping;
+  GetOverlappingInputs(level, begin, end, &overlapping);
+  if (overlapping.empty()) {
+    return NULL;
+  }
+
+  Compaction* const c = new Compaction(level);
+  c->input_version_ = current_;
+  current_->Ref();
+  c->inputs_[0] = overlapping;
+
+  // The key range covered by the chosen files determines which files of
+  // the next level take part.
+  InternalKey smallest, largest;
+  GetRange(c->inputs_[0], &smallest, &largest);
+  GetOverlappingInputs(level + 1, smallest, largest, &c->inputs_[1]);
+
+  // Disabled debug logging.
+  if (false) {
+    Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'",
+        level,
+        EscapeString(smallest.Encode()).c_str(),
+        EscapeString(largest.Encode()).c_str());
+  }
+  return c;
+}
+
+// Creates an empty compaction for "level": no input version yet, the
+// output file size limit taken from MaxFileSizeForLevel(), and every
+// per-level cursor used by IsBaseLevelForKey() reset to zero.
+Compaction::Compaction(int level)
+    : level_(level),
+      max_output_file_size_(MaxFileSizeForLevel(level)),
+      input_version_(NULL) {
+  int lvl = 0;
+  while (lvl < config::kNumLevels) {
+    level_ptrs_[lvl] = 0;
+    lvl++;
+  }
+}
+
+// Releases the reference on the input version, if one is still held.
+Compaction::~Compaction() {
+  if (input_version_ == NULL) {
+    return;
+  }
+  input_version_->Unref();
+}
+
+// Records in *edit the deletion of every input file of this compaction:
+// inputs_[0] from level_ and inputs_[1] from level_ + 1.
+void Compaction::AddInputDeletions(VersionEdit* edit) {
+  for (int which = 0; which < 2; which++) {
+    const std::vector<FileMetaData*>& input_files = inputs_[which];
+    for (size_t i = 0; i < input_files.size(); i++) {
+      edit->DeleteFile(level_ + which, input_files[i]->number);
+    }
+  }
+}
+
+// Returns true iff no file in levels level_+2 and above of the input
+// version has a user-key range containing "user_key".
+//
+// The per-level cursors in level_ptrs_ only move forward and persist
+// across calls, so the answer is only meaningful when successive calls
+// pass non-decreasing keys.
+bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
+  // Maybe use binary search to find right entry instead of linear search?
+  const Comparator* const ucmp =
+      input_version_->vset_->icmp_.user_comparator();
+  for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) {
+    const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
+    while (level_ptrs_[lvl] < files.size()) {
+      FileMetaData* const f = files[level_ptrs_[lvl]];
+      if (ucmp->Compare(user_key, f->largest.user_key()) > 0) {
+        // This file ends before user_key; move on to the next one.
+        level_ptrs_[lvl]++;
+        continue;
+      }
+      // We've advanced far enough
+      if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0) {
+        // Key falls in this file's range, so definitely not base level
+        return false;
+      }
+      break;
+    }
+  }
+  return true;
+}
+
+// Gives up the reference on the input version ahead of destruction.
+// Calling this again, or destroying the object afterwards, does not
+// release it a second time.
+void Compaction::ReleaseInputs() {
+  if (input_version_ == NULL) {
+    return;
+  }
+  input_version_->Unref();
+  input_version_ = NULL;
+}
+
+}
diff --git a/db/version_set.h b/db/version_set.h
new file mode 100644
index 0000000..b8eee3d
--- /dev/null
+++ b/db/version_set.h
@@ -0,0 +1,290 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The representation of a DBImpl consists of a set of Versions. The
+// newest version is called "current". Older versions may be kept
+// around to provide a consistent view to live iterators.
+//
+// Each Version keeps track of a set of Table files per level. The
+// entire set of versions is maintained in a VersionSet.
+//
+// Version,VersionSet are thread-compatible, but require external
+// synchronization on all accesses.
+
+#ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_
+#define STORAGE_LEVELDB_DB_VERSION_SET_H_
+
+#include <map>
+#include <set>
+#include <vector>
+#include "db/dbformat.h"
+#include "db/version_edit.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+// Grouping of constants. We may want to make some of these
+// parameters set via options.
+namespace config {
+// Number of levels. Sizes the per-level arrays declared in this header
+// (Version::files_, VersionSet::compact_pointer_, Compaction::level_ptrs_).
+static const int kNumLevels = 7;
+}
+
+namespace log { class Writer; }
+
+class Compaction;
+class Iterator;
+class MemTable;
+class TableBuilder;
+class TableCache;
+class Version;
+class VersionSet;
+class WritableFile;
+
+// A Version records the table files at each level (files_) together with
+// the compaction score/level computed for it. Instances are reference
+// counted and chained to one another through next_. The constructor and
+// destructor are private, so only the friend classes below create or
+// destroy Versions.
+class Version {
+ public:
+ // Append to *iters a sequence of iterators that will
+ // yield the contents of this Version when merged together.
+ // REQUIRES: This version has been saved (see VersionSet::SaveTo)
+ // NOTE(review): VersionSet declares no SaveTo in this header; presumably
+ // this refers to the nested VersionSet::Builder -- confirm.
+ void AddIterators(const ReadOptions&, std::vector<Iterator*>* iters);
+
+ // Reference count management (so Versions do not disappear out from
+ // under live iterators)
+ void Ref();
+ void Unref();
+
+ // Return a human readable string that describes this version's contents.
+ std::string DebugString() const;
+
+ private:
+ friend class Compaction;
+ friend class VersionSet;
+
+ class LevelFileNumIterator;
+ Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const;
+
+ VersionSet* vset_; // VersionSet to which this Version belongs
+ Version* next_; // Next version in linked list
+ int refs_; // Number of live refs to this version
+ MemTable* cleanup_mem_; // NULL, or table to delete when version dropped
+
+ // List of files per level
+ std::vector<FileMetaData*> files_[config::kNumLevels];
+
+ // Level that should be compacted next and its compaction score.
+ // Score < 1 means compaction is not strictly needed. These fields
+ // are initialized by Finalize().
+ double compaction_score_;
+ int compaction_level_;
+
+ // Starts with zero references, no successor, no memtable to clean up,
+ // and score/level of -1 until Finalize() fills them in.
+ explicit Version(VersionSet* vset)
+ : vset_(vset), next_(NULL), refs_(0),
+ cleanup_mem_(NULL),
+ compaction_score_(-1),
+ compaction_level_(-1) {
+ }
+
+ ~Version();
+
+ // No copying allowed
+ Version(const Version&);
+ void operator=(const Version&);
+};
+
+// A VersionSet owns the linked list of Versions for one database
+// (oldest_ .. current_), hands out file numbers, picks compactions, and
+// tracks references to large values.
+class VersionSet {
+ public:
+ VersionSet(const std::string& dbname,
+ const Options* options,
+ TableCache* table_cache,
+ const InternalKeyComparator*);
+ ~VersionSet();
+
+ // Apply *edit to the current version to form a new descriptor that
+ // is both saved to persistent state and installed as the new
+ // current version. Iff LogAndApply() returns OK, arrange to delete
+ // cleanup_mem (if cleanup_mem != NULL) when it is no longer needed
+ // by older versions.
+ Status LogAndApply(VersionEdit* edit, MemTable* cleanup_mem);
+
+ // Recover the last saved descriptor from persistent storage.
+ Status Recover(uint64_t* log_number, SequenceNumber* last_sequence);
+
+ // Save current contents to *log
+ Status WriteSnapshot(log::Writer* log);
+
+ // Return the current version.
+ Version* current() const { return current_; }
+
+ // Return the current manifest file number
+ uint64_t ManifestFileNumber() const { return manifest_file_number_; }
+
+ // Allocate and return a new file number
+ uint64_t NewFileNumber() { return next_file_number_++; }
+
+ // Return the number of Table files at the specified level.
+ int NumLevelFiles(int level) const;
+
+ // Pick level and inputs for a new compaction.
+ // Returns NULL if there is no compaction to be done.
+ // Otherwise returns a pointer to a heap-allocated object that
+ // describes the compaction. Caller should delete the result.
+ Compaction* PickCompaction();
+
+ // Return a compaction object for compacting the range [begin,end] in
+ // the specified level. Returns NULL if there is nothing in that
+ // level that overlaps the specified range. Caller should delete
+ // the result.
+ Compaction* CompactRange(
+ int level,
+ const InternalKey& begin,
+ const InternalKey& end);
+
+ // Create an iterator that reads over the compaction inputs for "*c".
+ // The caller should delete the iterator when no longer needed.
+ Iterator* MakeInputIterator(Compaction* c);
+
+ // Returns true iff some level needs a compaction.
+ bool NeedsCompaction() const { return current_->compaction_score_ >= 1; }
+
+ // Add all files listed in any live version to *live.
+ // May also mutate some internal state.
+ void AddLiveFiles(std::set<uint64_t>* live);
+
+ // Return the approximate offset in the database of the data for
+ // "key" as of version "v".
+ uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
+
+ // Register a reference to a large value with the specified
+ // large_ref from the specified file number. Returns "true" if this
+ // is the first recorded reference to the "large_ref" value in the
+ // database, and false otherwise.
+ bool RegisterLargeValueRef(const LargeValueRef& large_ref,
+ uint64_t filenum,
+ const InternalKey& internal_key);
+
+ // Cleanup the large value reference state by eliminating any
+ // references from files that are not included in either "live_tables"
+ // or "log_file_num".
+ void CleanupLargeValueRefs(const std::set<uint64_t>& live_tables,
+ uint64_t log_file_num);
+
+ // Returns true if a large value with the given reference is live.
+ bool LargeValueIsLive(const LargeValueRef& large_ref);
+
+ private:
+ class Builder;
+
+ friend class Compaction;
+ friend class Version;
+
+ Status Finalize(Version* v);
+
+ // Delete any old versions that are no longer needed.
+ void MaybeDeleteOldVersions();
+
+ struct BySmallestKey;
+ Status SortLevel(Version* v, uint64_t level);
+
+ void GetOverlappingInputs(
+ int level,
+ const InternalKey& begin,
+ const InternalKey& end,
+ std::vector<FileMetaData*>* inputs);
+
+ void GetRange(const std::vector<FileMetaData*>& inputs,
+ InternalKey* smallest,
+ InternalKey* largest);
+
+ Env* const env_;
+ const std::string dbname_;
+ const Options* const options_;
+ TableCache* const table_cache_;
+ const InternalKeyComparator icmp_;
+ uint64_t next_file_number_;
+ uint64_t manifest_file_number_;
+
+ // Opened lazily
+ WritableFile* descriptor_file_;
+ log::Writer* descriptor_log_;
+
+ // Versions are kept in a singly linked list that is never empty
+ Version* current_; // Pointer to the last (newest) list entry
+ Version* oldest_; // Pointer to the first (oldest) list entry
+
+ // Map from large value reference to the set of <file numbers,internal_key>
+ // values containing references to the value. We keep the
+ // internal key as a std::string rather than as an InternalKey because
+ // we want to be able to easily use a set.
+ typedef std::set<std::pair<uint64_t, std::string> > LargeReferencesSet;
+ typedef std::map<LargeValueRef, LargeReferencesSet> LargeValueMap;
+ LargeValueMap large_value_refs_;
+
+ // Per-level key at which the next compaction at that level should start.
+ // Either an empty string, or a valid InternalKey.
+ std::string compact_pointer_[config::kNumLevels];
+
+ // No copying allowed
+ VersionSet(const VersionSet&);
+ void operator=(const VersionSet&);
+};
+
+// A Compaction encapsulates information about a compaction: the level
+// being compacted, the two sets of input files, the version they were
+// taken from, and the edit that the compaction accumulates.
+// The constructor is private; instances are created by the friend classes.
+class Compaction {
+ public:
+ ~Compaction();
+
+ // Return the level that is being compacted. Inputs from "level"
+ // and "level+1" will be merged to produce a set of "level+1" files.
+ int level() const { return level_; }
+
+ // Return the object that holds the edits to the descriptor done
+ // by this compaction.
+ VersionEdit* edit() { return &edit_; }
+
+ // "which" must be either 0 or 1
+ int num_input_files(int which) const { return inputs_[which].size(); }
+
+ // Return the ith input file at "level()+which" ("which" must be 0 or 1).
+ FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
+
+ // Maximum size of files to build during this compaction.
+ uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
+
+ // Add all inputs to this compaction as delete operations to *edit.
+ void AddInputDeletions(VersionEdit* edit);
+
+ // Returns true if the information we have available guarantees that
+ // the compaction is producing data in "level+1" for which no data exists
+ // in levels greater than "level+1".
+ bool IsBaseLevelForKey(const Slice& user_key);
+
+ // Release the input version for the compaction, once the compaction
+ // is successful.
+ void ReleaseInputs();
+
+ private:
+ friend class Version;
+ friend class VersionSet;
+
+ explicit Compaction(int level);
+
+ int level_;
+ uint64_t max_output_file_size_;
+ Version* input_version_;
+ VersionEdit edit_;
+
+ // Each compaction reads inputs from "level_" and "level_+1"
+ std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs
+
+ // State for implementing IsBaseLevelForKey
+
+ // level_ptrs_ holds indices into input_version_->files_[L]: our state
+ // is that we are positioned at one of the file ranges for each
+ // higher level than the ones involved in this compaction (i.e. for
+ // all L >= level_ + 2).
+ int level_ptrs_[config::kNumLevels];
+};
+
+}
+
+#endif // STORAGE_LEVELDB_DB_VERSION_SET_H_
diff --git a/db/write_batch.cc b/db/write_batch.cc
new file mode 100644
index 0000000..b6c4979
--- /dev/null
+++ b/db/write_batch.cc
@@ -0,0 +1,164 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch::rep_ :=
+// sequence: fixed64
+// count: fixed32
+// data: record[count]
+// record :=
+// kTypeValue varstring varstring |
+// kTypeLargeValueRef varstring varstring |
+// kTypeDeletion varstring
+// varstring :=
+// len: varint32
+// data: uint8[len]
+
+#include "include/write_batch.h"
+
+#include "include/db.h"
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+// A freshly constructed batch is in the same state as one that has just
+// been cleared.
+WriteBatch::WriteBatch() {
+  this->Clear();
+}
+
+// Nothing to release beyond the members' own destructors.
+WriteBatch::~WriteBatch() {
+}
+
+// Reset rep_ to a zero-filled 12-byte header (fixed64 sequence followed
+// by fixed32 count) with no records after it.
+void WriteBatch::Clear() {
+  rep_.assign(12, '\0');
+}
+
+// Decode the entry count stored as a fixed32 at byte offset 8 of rep_.
+int WriteBatchInternal::Count(const WriteBatch* b) {
+  const char* count_field = b->rep_.data() + 8;
+  return DecodeFixed32(count_field);
+}
+
+// Overwrite the fixed32 entry count at byte offset 8 of rep_ with n.
+void WriteBatchInternal::SetCount(WriteBatch* b, int n) {
+  char* count_field = &b->rep_[8];
+  EncodeFixed32(count_field, n);
+}
+
+// Decode the starting sequence number stored as a fixed64 at the front
+// of rep_.
+SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
+  const char* seq_field = b->rep_.data();
+  return SequenceNumber(DecodeFixed64(seq_field));
+}
+
+// Overwrite the fixed64 sequence number at the front of rep_ with seq.
+void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
+  char* seq_field = &b->rep_[0];
+  EncodeFixed64(seq_field, seq);
+}
+
+// Bump the entry count, then append a kTypeValue record:
+// tag byte, length-prefixed key, length-prefixed value.
+void WriteBatch::Put(const Slice& key, const Slice& value) {
+  const int new_count = WriteBatchInternal::Count(this) + 1;
+  WriteBatchInternal::SetCount(this, new_count);
+  rep_.push_back(static_cast<char>(kTypeValue));
+  PutLengthPrefixedSlice(&rep_, key);
+  PutLengthPrefixedSlice(&rep_, value);
+}
+
+// Bump the entry count, then append a kTypeLargeValueRef record whose
+// value part is the raw bytes of large_ref.data.
+void WriteBatchInternal::PutLargeValueRef(WriteBatch* b,
+                                          const Slice& key,
+                                          const LargeValueRef& large_ref) {
+  const int new_count = WriteBatchInternal::Count(b) + 1;
+  WriteBatchInternal::SetCount(b, new_count);
+  b->rep_.push_back(static_cast<char>(kTypeLargeValueRef));
+  PutLengthPrefixedSlice(&b->rep_, key);
+  const Slice ref_bytes(large_ref.data, sizeof(large_ref.data));
+  PutLengthPrefixedSlice(&b->rep_, ref_bytes);
+}
+
+// Bump the entry count, then append a kTypeDeletion record:
+// tag byte followed by the length-prefixed key (no value).
+void WriteBatch::Delete(const Slice& key) {
+  const int new_count = WriteBatchInternal::Count(this) + 1;
+  WriteBatchInternal::SetCount(this, new_count);
+  rep_.push_back(static_cast<char>(kTypeDeletion));
+  PutLengthPrefixedSlice(&rep_, key);
+}
+
+// Replay every record of *b into *memtable, in order.
+// Returns the iterator's error if decoding failed, Corruption if the
+// number of records decoded differs from the count stored in the batch
+// header, and OK otherwise. Records decoded before a failure have
+// already been added to the memtable.
+Status WriteBatchInternal::InsertInto(const WriteBatch* b,
+                                      MemTable* memtable) {
+  const int expected = WriteBatchInternal::Count(b);
+  int applied = 0;
+  Iterator it(*b);
+  while (!it.Done()) {
+    switch (it.op()) {
+      case kTypeDeletion:
+        // Deletions carry no value; store an empty slice.
+        memtable->Add(it.sequence_number(), kTypeDeletion, it.key(), Slice());
+        break;
+      case kTypeValue:
+      case kTypeLargeValueRef:
+        // Both kinds are stored as (key, value) under their own type tag.
+        memtable->Add(it.sequence_number(), it.op(), it.key(), it.value());
+        break;
+    }
+    applied++;
+    it.Next();
+  }
+  if (!it.status().ok()) {
+    return it.status();
+  }
+  if (applied != expected) {
+    return Status::Corruption("wrong count in WriteBatch");
+  }
+  return Status::OK();
+}
+
+// Replace the batch's representation with a copy of "contents".
+// REQUIRES: contents holds at least the 12-byte header.
+void WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
+  const size_t n = contents.size();
+  assert(n >= 12);
+  b->rep_.assign(contents.data(), n);
+}
+
+// Position the iterator on the first record of "batch".
+//
+// If the batch is shorter than the 12-byte header the iterator is
+// immediately Done() and status() reports Corruption instead of silently
+// presenting the truncated batch as empty. Otherwise the starting
+// sequence number is read from the header, the header is skipped, and
+// the first record (if any) is decoded.
+WriteBatchInternal::Iterator::Iterator(const WriteBatch& batch)
+    : input_(WriteBatchInternal::Contents(&batch)),
+      done_(false),
+      // Placeholder values so the accessors never read indeterminate
+      // members when no record could be decoded.
+      op_(kTypeDeletion),
+      seq_(0) {
+  if (input_.size() < 12) {
+    status_ = Status::Corruption("malformed WriteBatch (too small)");
+    done_ = true;
+  } else {
+    // Was terminated with a stray ',' (comma operator) instead of ';'.
+    seq_ = WriteBatchInternal::Sequence(&batch);
+    input_.remove_prefix(12);
+    GetNextEntry();
+  }
+}
+
+// Advance to the following record; each record consumes one sequence
+// number. REQUIRES: !Done()
+void WriteBatchInternal::Iterator::Next() {
+  assert(!done_);
+  seq_ += 1;
+  GetNextEntry();
+}
+
+// Decode the record at the front of input_ into op_/key_/value_.
+// Marks the iterator done when input_ is exhausted. On a malformed
+// record it also records a Corruption status and discards the rest of
+// the input.
+void WriteBatchInternal::Iterator::GetNextEntry() {
+  if (input_.empty()) {
+    done_ = true;
+    return;
+  }
+
+  const char tag = input_[0];
+  input_.remove_prefix(1);
+
+  // Stays null while the record decodes cleanly; otherwise names the
+  // problem.
+  const char* problem = 0;
+  switch (tag) {
+    case kTypeValue:
+    case kTypeLargeValueRef:
+      if (GetLengthPrefixedSlice(&input_, &key_) &&
+          GetLengthPrefixedSlice(&input_, &value_)) {
+        op_ = static_cast<ValueType>(tag);
+      } else {
+        problem = "bad WriteBatch Put";
+      }
+      break;
+    case kTypeDeletion:
+      if (GetLengthPrefixedSlice(&input_, &key_)) {
+        op_ = kTypeDeletion;
+      } else {
+        problem = "bad WriteBatch Delete";
+      }
+      break;
+    default:
+      problem = "unknown WriteBatch tag";
+      break;
+  }
+
+  if (problem != 0) {
+    status_ = Status::Corruption(problem);
+    done_ = true;
+    input_.clear();
+  }
+}
+
+}
diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h
new file mode 100644
index 0000000..df750c7
--- /dev/null
+++ b/db/write_batch_internal.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
+#define STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
+
+#include "include/write_batch.h"
+
+namespace leveldb {
+
+// WriteBatchInternal provides static methods for manipulating a
+// WriteBatch that we don't want in the public WriteBatch interface.
+class WriteBatchInternal {
+ public:
+ // Append a large-value-reference record for "key" to the batch.
+ static void PutLargeValueRef(WriteBatch* batch,
+ const Slice& key,
+ const LargeValueRef& large_ref);
+
+ // Return the number of entries in the batch.
+ static int Count(const WriteBatch* batch);
+
+ // Set the count for the number of entries in the batch.
+ static void SetCount(WriteBatch* batch, int n);
+
+ // Return the sequence number for the start of this batch.
+ static SequenceNumber Sequence(const WriteBatch* batch);
+
+ // Store the specified number as the sequence number for the start of
+ // this batch.
+ static void SetSequence(WriteBatch* batch, SequenceNumber seq);
+
+ // Return a Slice over the batch's serialized representation (no copy).
+ static Slice Contents(const WriteBatch* batch) {
+ return Slice(batch->rep_);
+ }
+
+ // Return the size in bytes of the batch's serialized representation.
+ static size_t ByteSize(const WriteBatch* batch) {
+ return batch->rep_.size();
+ }
+
+ // Replace the batch's serialized representation with a copy of "contents".
+ static void SetContents(WriteBatch* batch, const Slice& contents);
+
+ // Add every entry of the batch to *memtable; returns a non-OK status
+ // if the batch could not be decoded.
+ static Status InsertInto(const WriteBatch* batch, MemTable* memtable);
+
+ // Iterate over the contents of a write batch.
+ class Iterator {
+ public:
+ explicit Iterator(const WriteBatch& batch);
+ bool Done() const { return done_; }
+ void Next();
+ ValueType op() const { return op_; }
+ const Slice& key() const { return key_; }
+ const Slice& value() const { return value_; }
+ SequenceNumber sequence_number() const { return seq_; }
+ Status status() const { return status_; }
+
+ private:
+ // Decode the next record from input_ into the fields below.
+ void GetNextEntry();
+
+ Slice input_; // Bytes of the batch not yet consumed
+ bool done_; // True once iteration has finished
+ ValueType op_; // Type of the current record
+ Slice key_; // Key of the current record
+ Slice value_; // Value of the current record
+ SequenceNumber seq_; // Sequence number of the current record
+ Status status_; // Error encountered while decoding, if any
+ };
+};
+
+}
+
+
+#endif // STORAGE_LEVELDB_DB_WRITE_BATCH_INTERNAL_H_
diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc
new file mode 100644
index 0000000..4963579
--- /dev/null
+++ b/db/write_batch_test.cc
@@ -0,0 +1,110 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/db.h"
+
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "include/env.h"
+#include "util/logging.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+// Insert *b into a scratch memtable and render what the memtable then
+// contains, in memtable iteration order, as a string of
+// "Put(k, v)@seq", "PutRef(k, v)@seq" and "Delete(k)@seq" items.
+// "ParseError()" is appended if the insertion reported a failure.
+static std::string PrintContents(WriteBatch* b) {
+  InternalKeyComparator cmp(BytewiseComparator());
+  MemTable mem(cmp);
+  std::string state;
+  Status s = WriteBatchInternal::InsertInto(b, &mem);
+  Iterator* iter = mem.NewIterator();
+  iter->SeekToFirst();
+  while (iter->Valid()) {
+    ParsedInternalKey ikey;
+    ASSERT_TRUE(ParseInternalKey(iter->key(), &ikey));
+    switch (ikey.type) {
+      case kTypeValue:
+      case kTypeLargeValueRef:
+        // Same layout for both; only the label differs.
+        state += (ikey.type == kTypeValue) ? "Put(" : "PutRef(";
+        state += ikey.user_key.ToString();
+        state += ", ";
+        state += iter->value().ToString();
+        state += ")";
+        break;
+      case kTypeDeletion:
+        state += "Delete(";
+        state += ikey.user_key.ToString();
+        state += ")";
+        break;
+    }
+    state += "@";
+    state += NumberToString(ikey.sequence);
+    iter->Next();
+  }
+  delete iter;
+  if (!s.ok()) {
+    state += "ParseError()";
+  }
+  return state;
+}
+
+// Empty class whose name is passed to the TEST(WriteBatchTest, ...) cases.
+class WriteBatchTest { };
+
+// A default-constructed batch prints as nothing and reports zero entries.
+TEST(WriteBatchTest, Empty) {
+  WriteBatch empty_batch;
+  ASSERT_EQ("", PrintContents(&empty_batch));
+  ASSERT_EQ(0, WriteBatchInternal::Count(&empty_batch));
+}
+
+// Three operations get consecutive sequence numbers starting at the
+// batch's sequence; the printout follows memtable order, not insertion
+// order.
+TEST(WriteBatchTest, Multiple) {
+  WriteBatch ops;
+  ops.Put(Slice("foo"), Slice("bar"));
+  ops.Delete(Slice("box"));
+  ops.Put(Slice("baz"), Slice("boo"));
+  WriteBatchInternal::SetSequence(&ops, 100);
+
+  ASSERT_EQ(100, WriteBatchInternal::Sequence(&ops));
+  ASSERT_EQ(3, WriteBatchInternal::Count(&ops));
+  ASSERT_EQ("Put(baz, boo)@102"
+            "Delete(box)@101"
+            "Put(foo, bar)@100",
+            PrintContents(&ops));
+}
+
+// A large-value reference added through WriteBatchInternal is counted
+// like any other entry and printed as PutRef with its raw bytes.
+TEST(WriteBatchTest, PutIndirect) {
+  WriteBatch ops;
+  ops.Put(Slice("baz"), Slice("boo"));
+
+  // Fill the reference with 'a' for the first 20 bytes and 'b' after that.
+  LargeValueRef ref;
+  for (int pos = 0; pos < LargeValueRef::ByteSize(); pos++) {
+    if (pos < 20) {
+      ref.data[pos] = 'a';
+    } else {
+      ref.data[pos] = 'b';
+    }
+  }
+  WriteBatchInternal::PutLargeValueRef(&ops, Slice("foo"), ref);
+  WriteBatchInternal::SetSequence(&ops, 100);
+
+  ASSERT_EQ(100, WriteBatchInternal::Sequence(&ops));
+  ASSERT_EQ(2, WriteBatchInternal::Count(&ops));
+  ASSERT_EQ("Put(baz, boo)@100"
+            "PutRef(foo, aaaaaaaaaaaaaaaaaaaabbbbbbbbb)@101",
+            PrintContents(&ops));
+}
+
+// Dropping the last byte of the serialized batch truncates the final
+// record: the first record still prints, then a parse error is reported.
+TEST(WriteBatchTest, Corruption) {
+  WriteBatch ops;
+  ops.Put(Slice("foo"), Slice("bar"));
+  ops.Delete(Slice("box"));
+  WriteBatchInternal::SetSequence(&ops, 200);
+
+  Slice whole = WriteBatchInternal::Contents(&ops);
+  Slice truncated(whole.data(), whole.size() - 1);
+  WriteBatchInternal::SetContents(&ops, truncated);
+
+  ASSERT_EQ("Put(foo, bar)@200"
+            "ParseError()",
+            PrintContents(&ops));
+}
+
+}
+
+// Test entry point: the process exit code is whatever RunAllTests()
+// returns. Command-line arguments are not used.
+int main(int argc, char** argv) {
+  const int exit_code = leveldb::test::RunAllTests();
+  return exit_code;
+}
diff --git a/doc/doc.css b/doc/doc.css
new file mode 100644
index 0000000..700c564
--- /dev/null
+++ b/doc/doc.css
@@ -0,0 +1,89 @@
+/* Page frame: half-inch side margins, black text on white. */
+body {
+  margin-left: 0.5in;
+  margin-right: 0.5in;
+  background: white;
+  color: black;
+}
+
+/* Top-level headings hang slightly into the left margin. */
+h1 {
+  margin-left: -0.2in;
+  font-size: 14pt;
+}
+h2 {
+  margin-left: 0;
+  font-size: 12pt;
+}
+/* Lower-level headings and rules start flush with the body text. */
+h3, h4, hr {
+  margin-left: 0;
+}
+
+/* Definition lists: definition term bold */
+dt {
+ font-weight: bold;
+}
+
+/* Address blocks are centered */
+address {
+ text-align: center;
+}
+/* Inline code-like elements are shown in blue */
+code,samp,var {
+ color: blue;
+}
+/* Keyboard input is shown in dark red */
+kbd {
+ color: #600000;
+}
+/* Paragraphs inside div.note float right in a 3in-wide bordered box */
+div.note p {
+ float: right;
+ width: 3in;
+ margin-right: 0%;
+ padding: 1px;
+ border: 2px solid #6060a0;
+ background-color: #fffff0;
+}
+
+/* Lists carry no extra vertical margin of their own. */
+ul, ol {
+  margin-top: 0;
+  margin-bottom: 0;
+}
+
+/* Bullet-less list variant, pulled one em to the left. */
+UL.nobullets {
+  list-style-type: none;
+  list-style-image: none;
+  margin-left: -1em;
+}
+
+/* Paragraphs: one em above and below, nothing on the sides, no padding. */
+p {
+  margin: 1em 0;
+  padding: 0;
+}
+
+/* Code listings: blue text, slightly open line spacing, no border. */
+pre {
+  line-height: 1.3em;
+  padding: 0.4em 0 0.8em 0;
+  margin: 0 0 0 0;
+  /* "border: 0 0 0 0" is not a valid border shorthand and is dropped by
+     the parser; a single 0 is the valid way to ask for no border. */
+  border: 0;
+  color: blue;
+}
+
+/* Data tables: horizontally centered, 2em above and below, thin border. */
+.datatable {
+  margin: 2em auto;
+  border: 1px solid;
+}
+
+/* Cells of a data table: right-aligned with half-em side padding.
+   Both selectors are scoped to .datatable; the previous "td,th" list
+   applied the rule to every th in the document. */
+.datatable td,
+.datatable th {
+  padding: 0 0.5em 0 0.5em;
+  text-align: right;
+}
diff --git a/doc/impl.html b/doc/impl.html
new file mode 100644
index 0000000..2f2c809
--- /dev/null
+++ b/doc/impl.html
@@ -0,0 +1,222 @@
+<!DOCTYPE html>
+<html>
+<head>
+<link rel="stylesheet" type="text/css" href="doc.css" />
+<title>Leveldb file layout and compactions</title>
+</head>
+
+<body>
+
+<h1>Files</h1>
+
+The implementation of leveldb is similar in spirit to the
+representation of a single
+<a href="http://labs.google.com/papers/bigtable.html">
+Bigtable tablet (section 5.3)</a>.
+However the organization of the files that make up the representation
+is somewhat different and is explained below.
+
+<p>
+Each database is represented by a set of files stored in a directory.
+There are several different types of files as documented below:
+<p>
+<h2>Log files</h2>
+<p>
+A log file (*.log) stores a sequence of recent updates. Each update
+is appended to the current log file. When the log file reaches a
+pre-determined size (approximately 1MB by default), it is converted
+to a sorted table (see below) and a new log file is created for future
+updates.
+<p>
+A copy of the current log file is kept in an in-memory structure (the
+<code>memtable</code>). This copy is consulted on every read so that read
+operations reflect all logged updates.
+<p>
+<h2>Sorted tables</h2>
+<p>
+A sorted table (*.sst) stores a sequence of entries sorted by key.
+Each entry is either a value for the key, or a deletion marker for the
+key. (Deletion markers are kept around to hide obsolete values
+present in older sorted tables).
+<p>
+The set of sorted tables is organized into a sequence of levels. The
+sorted table generated from a log file is placed in a special <code>young</code>
+level (also called level-0). When the number of young files exceeds a
+certain threshold (currently four), all of the young files are merged
+together with all of the overlapping level-1 files to produce a
+sequence of new level-1 files (we create a new level-1 file for every
+2MB of data.)
+<p>
+Files in the young level may contain overlapping keys. However files
+in other levels have distinct non-overlapping key ranges. Consider
+level number L where L >= 1. When the combined size of files in
+level-L exceeds (10^L) MB (i.e., 10MB for level-1, 100MB for level-2,
+...), one file in level-L, and all of the overlapping files in
+level-(L+1) are merged to form a set of new files for level-(L+1).
+These merges have the effect of gradually migrating new updates from
+the young level to the largest level using only bulk reads and writes
+(i.e., minimizing expensive seeks).
+
+<h2>Large value files</h2>
+<p>
+Each large value (greater than 64KB by default) is placed in a large
+value file (*.val) of its own. An entry is maintained in the log
+and/or sorted tables that maps from the corresponding key to the
+name of this large value file. The name of the large value file
+is derived from a SHA1 hash of the value and its length so that
+identical values share the same file.
+<p>
+<h2>Manifest</h2>
+<p>
+A MANIFEST file lists the set of sorted tables that make up each
+level, the corresponding key ranges, and other important metadata.
+A new MANIFEST file (with a new number embedded in the file name)
+is created whenever the database is reopened. The MANIFEST file is
+formatted as a log, and changes made to the serving state (as files
+are added or removed) are appended to this log.
+<p>
+<h2>Current</h2>
+<p>
+CURRENT is a simple text file that contains the name of the latest
+MANIFEST file.
+<p>
+<h2>Info logs</h2>
+<p>
+Informational messages are printed to files named LOG and LOG.old.
+<p>
+<h2>Others</h2>
+<p>
+Other files used for miscellaneous purposes may also be present
+(LOCK, *.dbtmp).
+
+<h1>Level 0</h1>
+When the log file grows above a certain size (1MB by default):
+<ul>
+<li>Write the contents of the current memtable to an sstable
+<li>Replace the current memtable by a brand new empty memtable
+<li>Switch to a new log file
+<li>Delete the old log file and the old memtable
+</ul>
+Experimental measurements show that generating an sstable from a 1MB
+log file takes ~12ms, which seems like an acceptable latency hiccup to
+add infrequently to a log write.
+
+<p>
+The new sstable is added to a special level-0 level. level-0 contains
+a set of files (up to 4 by default). However unlike other levels,
+these files do not cover disjoint ranges, but may overlap each other.
+
+<h1>Compactions</h1>
+
+<p>
+When the size of level L exceeds its limit, we compact it in a
+background thread. The compaction picks a file from level L and all
+overlapping files from the next level L+1. Note that if a level-L
+file overlaps only part of a level-(L+1) file, the entire file at
+level-(L+1) is used as an input to the compaction and will be
+discarded after the compaction. Aside: because level-0 is special
+(files in it may overlap each other), we treat compactions from
+level-0 to level-1 specially: a level-0 compaction may pick more than
+one level-0 file in case some of these files overlap each other.
+
+<p>
+A compaction merges the contents of the picked files to produce a
+sequence of level-(L+1) files. We switch to producing a new
+level-(L+1) file after the current output file has reached the target
+file size (2MB). The old files are discarded and the new files are
+added to the serving state.
+
+<p>
+Compactions for a particular level rotate through the key space. In
+more detail, for each level L, we remember the ending key of the last
+compaction at level L. The next compaction for level L will pick the
+first file that starts after this key (wrapping around to the
+beginning of the key space if there is no such file).
+
+<p>
+Compactions drop overwritten values. They also drop deletion markers
+if there are no higher numbered levels that contain a file whose range
+overlaps the current key.
+
+<h2>Timing</h2>
+
+Level-0 compactions will read up to four 1MB files from level-0, and
+at worst all the level-1 files (10MB). I.e., we will read 14MB and
+write 14MB.
+
+<p>
+Other than the special level-0 compactions, we will pick one 2MB file
+from level L. In the worst case, this will overlap ~ 12 files from
+level L+1 (10 because level-(L+1) is ten times the size of level-L,
+and another two at the boundaries since the file ranges at level-L
+will usually not be aligned with the file ranges at level-L+1). The
+compaction will therefore read 26MB and write 26MB. Assuming a disk
+IO rate of 100MB/s (ballpark range for modern drives), the worst
+compaction cost will be approximately 0.5 second.
+
+<p>
+If we throttle the background writing to something small, say 10% of
+the full 100MB/s speed, a compaction may take up to 5 seconds. If the
+user is writing at 10MB/s, we might build up lots of level-0 files
+(~50 to hold the 5*10MB). This may significantly increase the cost of
+reads due to the overhead of merging more files together on every
+read.
+
+<p>
+Solution 1: To reduce this problem, we might want to increase the log
+switching threshold when the number of level-0 files is large. Though
+the downside is that the larger this threshold, the larger the delay
+that we will add to write latency when a write triggers a log switch.
+
+<p>
+Solution 2: We might want to decrease write rate artificially when the
+number of level-0 files goes up.
+
+<p>
+Solution 3: We work on reducing the cost of very wide merges.
+Perhaps most of the level-0 files will have their blocks sitting
+uncompressed in the cache and we will only need to worry about the
+O(N) complexity in the merging iterator.
+
+<h2>Number of files</h2>
+
+Instead of always making 2MB files, we could make larger files for
+larger levels to reduce the total file count, though at the expense of
+more bursty compactions. Alternatively, we could shard the set of
+files into multiple directories.
+
+<p>
+An experiment on an <code>ext3</code> filesystem on Feb 04, 2011 shows
+the following timings to do 100K file opens in directories with
+varying number of files:
+<table class="datatable">
+<tr><th>Files in directory</th><th>Microseconds to open a file</th></tr>
+<tr><td>1000</td><td>9</td></tr>
+<tr><td>10000</td><td>10</td></tr>
+<tr><td>100000</td><td>16</td></tr>
+</table>
+So maybe even the sharding is not necessary on modern filesystems?
+
+<h1>Recovery</h1>
+
+<ul>
+<li> Read CURRENT to find name of the latest committed MANIFEST
+<li> Read the named MANIFEST file
+<li> Clean up stale files
+<li> We could open all sstables here, but it is probably better to be lazy...
+<li> Convert log chunk to a new level-0 sstable
+<li> Start directing new writes to a new log file with recovered sequence#
+</ul>
+
+<h1>Garbage collection of files</h1>
+
+<code>DeleteObsoleteFiles()</code> is called at the end of every
+compaction and at the end of recovery. It finds the names of all
+files in the database. It deletes all log files that are not the
+current log file. It deletes all table files that are not referenced
+from some level and are not the output of an active compaction. It
+deletes all large value files that are not referenced from any live
+table or log file.
+
+</body>
+</html>
diff --git a/doc/index.html b/doc/index.html
new file mode 100644
index 0000000..53471d2
--- /dev/null
+++ b/doc/index.html
@@ -0,0 +1,508 @@
+<!DOCTYPE html>
+<html>
+<head>
+<link rel="stylesheet" type="text/css" href="doc.css" />
+<title>Leveldb</title>
+</head>
+
+<body>
+<h1>Leveldb</h1>
+<address>Jeff Dean, Sanjay Ghemawat</address>
+<p>
+The <code>leveldb</code> library provides a persistent key value store. Keys and
+values are arbitrary byte arrays. The keys are ordered within the key
+value store according to a user-specified comparator function.
+
+<p>
+<h1>Opening A Database</h1>
+<p>
+A <code>leveldb</code> database has a name which corresponds to a file system
+directory. All of the contents of the database are stored in this
+directory. The following example shows how to open a database,
+creating it if necessary:
+<p>
+<pre>
+ #include &lt;cassert&gt;
+ #include "leveldb/include/db.h"
+
+ leveldb::DB* db;
+ leveldb::Options options;
+ options.create_if_missing = true;
+ leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &amp;db);
+ assert(status.ok());
+ ...
+</pre>
+If you want to raise an error if the database already exists, add
+the following line before the <code>leveldb::DB::Open</code> call:
+<pre>
+ options.error_if_exists = true;
+</pre>
+<h1>Status</h1>
+<p>
+You may have noticed the <code>leveldb::Status</code> type above. Values of this
+type are returned by most functions in <code>leveldb</code> that may encounter an
+error. You can check if such a result is ok, and also print an
+associated error message:
+<p>
+<pre>
+ leveldb::Status s = ...;
+ if (!s.ok()) cerr &lt;&lt; s.ToString() &lt;&lt; endl;
+</pre>
+<h1>Closing A Database</h1>
+<p>
+When you are done with a database, just delete the database object.
+Example:
+<p>
+<pre>
+ ... open the db as described above ...
+ ... do something with db ...
+ delete db;
+</pre>
+<h1>Reads And Writes</h1>
+<p>
+The database provides <code>Put</code>, <code>Delete</code>, and <code>Get</code> methods to
+modify/query the database. For example, the following code
+moves the value stored under key1 to key2.
+<p>
+<pre>
+ std::string value;
+ leveldb::Status s = db-&gt;Get(leveldb::ReadOptions(), key1, &amp;value);
+ if (s.ok()) s = db-&gt;Put(leveldb::WriteOptions(), key2, value);
+ if (s.ok()) s = db-&gt;Delete(leveldb::WriteOptions(), key1);
+</pre>
+See <a href="#async">important performance note</a> below for how to
+speed up writes significantly.
+
+<h1>Atomic Updates</h1>
+<p>
+Note that if the process dies after the Put of key2 but before the
+delete of key1, the same value may be left stored under multiple keys.
+Such problems can be avoided by using the <code>WriteBatch</code> class to
+atomically apply a set of updates:
+<p>
+<pre>
+ #include "leveldb/include/write_batch.h"
+ ...
+ std::string value;
+ leveldb::Status s = db-&gt;Get(leveldb::ReadOptions(), key1, &amp;value);
+ if (s.ok()) {
+ leveldb::WriteBatch batch;
+ batch.Delete(key1);
+ batch.Put(key2, value);
+ s = db-&gt;Write(leveldb::WriteOptions(), &amp;batch);
+ }
+</pre>
+The <code>WriteBatch</code> holds a sequence of edits to be made to the database,
+and these edits within the batch are applied in order. Note that we
+called <code>Delete</code> before <code>Put</code> so that if <code>key1</code> is identical to <code>key2</code>,
+we do not end up erroneously dropping the value entirely.
+<p>
+Apart from its atomicity benefits, <code>WriteBatch</code> may also be used to
+speed up bulk updates by placing lots of individual mutations into the
+same batch.
+<p>
+<h1>Concurrency</h1>
+<p>
+A database may only be opened by one process at a time. The <code>leveldb</code>
+implementation acquires a lock from the operating system to prevent
+misuse. Within a single process, the same <code>leveldb::DB</code> object may
+be safely used by multiple concurrent threads.
+<p>
+<h1>Iteration</h1>
+<p>
+The following example demonstrates how to print all key,value pairs
+in a database.
+<p>
+<pre>
+ leveldb::Iterator* it = db-&gt;NewIterator(leveldb::ReadOptions());
+ for (it-&gt;SeekToFirst(); it-&gt;Valid(); it-&gt;Next()) {
+ cout &lt;&lt; it-&gt;key().ToString() &lt;&lt; ": " &lt;&lt; it-&gt;value().ToString() &lt;&lt; endl;
+ }
+ assert(it-&gt;status().ok()); // Check for any errors found during the scan
+ delete it;
+</pre>
+The following variation shows how to process just the keys in the
+range <code>[start,limit)</code>:
+<p>
+<pre>
+ for (it-&gt;Seek(start);
+ it-&gt;Valid() &amp;&amp; it-&gt;key().ToString() &lt; limit;
+ it-&gt;Next()) {
+ ...
+ }
+</pre>
+You can also process entries in reverse order. (Caveat: reverse
+iteration is currently a factor of two or three slower than forward
+iteration.)
+<p>
+<pre>
+ for (it-&gt;SeekToLast(); it-&gt;Valid(); it-&gt;Prev()) {
+ ...
+ }
+</pre>
+<h1>Snapshots</h1>
+<p>
+Snapshots provide consistent read-only views over the entire state of
+the key-value store. <code>ReadOptions::snapshot</code> may be non-NULL to indicate
+that a read should operate on a particular version of the DB state.
+If <code>ReadOptions::snapshot</code> is NULL, the read will operate on an
+implicit snapshot of the current state.
+<p>
+Snapshots typically are created by the DB::GetSnapshot() method:
+<p>
+<pre>
+ leveldb::ReadOptions options;
+ options.snapshot = db-&gt;GetSnapshot();
+ ... apply some updates to db ...
+ leveldb::Iterator* iter = db-&gt;NewIterator(options);
+ ... read using iter to view the state when the snapshot was created ...
+ delete iter;
+ db-&gt;ReleaseSnapshot(options.snapshot);
+</pre>
+Note that when a snapshot is no longer needed, it should be released
+using the DB::ReleaseSnapshot interface. This allows the
+implementation to get rid of state that was being maintained just to
+support reading as of that snapshot.
+<p>
+A Write operation can also return a snapshot that
+represents the state of the database just after applying a particular
+set of updates:
+<p>
+<pre>
+ leveldb::Snapshot* snapshot;
+ leveldb::WriteOptions write_options;
+ write_options.post_write_snapshot = &amp;snapshot;
+ leveldb::Status status = db-&gt;Write(write_options, ...);
+ ... perform other mutations to db ...
+
+ leveldb::ReadOptions read_options;
+ read_options.snapshot = snapshot;
+ leveldb::Iterator* iter = db-&gt;NewIterator(read_options);
+ ... read as of the state just after the Write call returned ...
+ delete iter;
+
+ db-&gt;ReleaseSnapshot(snapshot);
+</pre>
+<h1>Slice</h1>
+<p>
+The return value of the <code>it-&gt;key()</code> and <code>it-&gt;value()</code> calls above
+are instances of the <code>leveldb::Slice</code> type. <code>Slice</code> is a simple
+structure that contains a length and a pointer to an external byte
+array. Returning a <code>Slice</code> is a cheaper alternative to returning a
+<code>std::string</code> since we do not need to copy potentially large keys and
+values. In addition, <code>leveldb</code> methods do not return null-terminated
+C-style strings since <code>leveldb</code> keys and values are allowed to
+contain '\0' bytes.
+<p>
+C++ strings and null-terminated C-style strings can be easily converted
+to a Slice:
+<p>
+<pre>
+ leveldb::Slice s1 = "hello";
+
+ std::string str("world");
+ leveldb::Slice s2 = str;
+</pre>
+A Slice can be easily converted back to a C++ string:
+<pre>
+ std::string str = s1.ToString();
+ assert(str == std::string("hello"));
+</pre>
+Be careful when using Slices since it is up to the caller to ensure that
+the external byte array into which the Slice points remains live while
+the Slice is in use. For example, the following is buggy:
+<p>
+<pre>
+ leveldb::Slice slice;
+ if (...) {
+ std::string str = ...;
+ slice = str;
+ }
+ Use(slice);
+</pre>
+When the <code>if</code> statement goes out of scope, <code>str</code> will be destroyed and the
+backing storage for <code>slice</code> will disappear.
+<p>
+<h1>Comparators</h1>
+<p>
+The preceding examples used the default ordering function for key,
+which orders bytes lexicographically. You can however supply a custom
+comparator when opening a database. For example, suppose each
+database key consists of two numbers and we should sort by the first
+number, breaking ties by the second number. First, define a proper
+subclass of <code>leveldb::Comparator</code> that expresses these rules:
+<p>
+<pre>
+ class TwoPartComparator : public leveldb::Comparator {
+ public:
+ // Three-way comparison function:
+ // if a &lt; b: negative result
+ // if a &gt; b: positive result
+ // else: zero result
+ int Compare(const leveldb::Slice&amp; a, const leveldb::Slice&amp; b) const {
+ int a1, a2, b1, b2;
+ ParseKey(a, &amp;a1, &amp;a2);
+ ParseKey(b, &amp;b1, &amp;b2);
+ if (a1 &lt; b1) return -1;
+ if (a1 &gt; b1) return +1;
+ if (a2 &lt; b2) return -1;
+ if (a2 &gt; b2) return +1;
+ return 0;
+ }
+
+ // Ignore the following methods for now:
+ const char* Name() const { return "TwoPartComparator"; }
+ void FindShortestSeparator(std::string*, const leveldb::Slice&amp;) const { }
+ void FindShortSuccessor(std::string*) const { }
+ };
+</pre>
+Now create a database using this custom comparator:
+<p>
+<pre>
+ TwoPartComparator cmp;
+ leveldb::DB* db;
+ leveldb::Options options;
+ options.create_if_missing = true;
+ options.comparator = &amp;cmp;
+ leveldb::Status status = leveldb::DB::Open(options, "/tmp/testdb", &amp;db);
+ ...
+</pre>
+<h2>Backwards compatibility</h2>
+<p>
+The result of the comparator's <code>Name</code> method is attached to the
+database when it is created, and is checked on every subsequent
+database open. If the name changes, the <code>leveldb::DB::Open</code> call will
+fail. Therefore, change the name if and only if the new key format
+and comparison function are incompatible with existing databases, and
+it is ok to discard the contents of all existing databases.
+<p>
+You can however still gradually evolve your key format over time with
+a little bit of pre-planning. For example, you could store a version
+number at the end of each key (one byte should suffice for most uses).
+When you wish to switch to a new key format (e.g., adding an optional
+third part to the keys processed by <code>TwoPartComparator</code>),
+(a) keep the same comparator name (b) increment the version number
+for new keys (c) change the comparator function so it uses the
+version numbers found in the keys to decide how to interpret them.
+<p>
+<h1>Performance</h1>
+<p>
+Performance can be tuned by changing the default values of the
+types defined in <code>leveldb/include/options.h</code>.
+
+<p>
+<h2><a name="async">Asynchronous Writes</a></h2>
+
+By default, each write to <code>leveldb</code> is synchronous: it does
+not return until the write has been pushed from memory to persistent
+storage. (On Posix systems, this is implemented by calling either
+<code>fdatasync(...)</code> or <code>msync(..., MS_SYNC)</code>.)
+<strong>Synchronous writes may be very slow and the synchrony can be
+optionally disabled</strong>:
+<pre>
+ leveldb::WriteOptions write_options;
+ write_options.sync = false;
+ db-&gt;Put(write_options, ...);
+</pre>
+Asynchronous writes are often more than a hundred times as fast as
+synchronous writes. The downside of asynchronous writes is that a
+crash of the machine may cause the last few updates to be lost. Note
+that a crash of just the writing process (i.e., not a reboot) will not
+cause any loss since even when <code>sync</code> is false, an update
+is pushed from the process memory into the operating system before it
+is considered done.
+
+<p>
+Asynchronous writes can be particularly beneficial when loading a
+large amount of data into the database since you can mitigate the
+problem of lost updates by restarting the bulk load. A hybrid scheme
+is also possible where every Nth write is synchronous, and in the
+event of a crash, the bulk load is restarted just after the last
+synchronous write finished by the previous run.
+
+<p>
+<code>WriteBatch</code> provides an alternative to asynchronous writes.
+Multiple updates may be placed in the same <code>WriteBatch</code> and
+applied together using a synchronous write. The extra cost of the
+synchronous write will be amortized across all of the writes in the batch.
+
+<p>
+<h2>Block size</h2>
+<p>
+<code>leveldb</code> groups adjacent keys together into the same block and such a
+block is the unit of transfer to and from persistent storage. The
+default block size is approximately 8192 uncompressed bytes.
+Applications that mostly do bulk scans over the contents of the
+database may wish to increase this size. Applications that do a lot
+of point reads of small values may wish to switch to a smaller block
+size if performance measurements indicate an improvement. There isn't
+much benefit in using blocks smaller than one kilobyte, or larger than
+a few megabytes. Also note that compression will be more effective
+with larger block sizes.
+<p>
+<h2>Compression</h2>
+<p>
+Each block is individually compressed before being written to
+persistent storage. Compression is on by default since the default
+compression method is very fast, and is automatically disabled for
+uncompressible data. In rare cases, applications may want to disable
+compression entirely, but should only do so if benchmarks show a
+performance improvement:
+<p>
+<pre>
+ leveldb::Options options;
+ options.compression = leveldb::kNoCompression;
+ ... leveldb::DB::Open(options, name, ...) ....
+</pre>
+<h2>Cache</h2>
+<p>
+The contents of the database are stored in a set of files in the
+filesystem and each file stores a sequence of compressed blocks. If
+<code>options.cache</code> is non-NULL, it is used to cache frequently used
+uncompressed block contents.
+<p>
+<pre>
+ #include "leveldb/include/cache.h"
+
+ leveldb::Options options;
+ options.cache = leveldb::NewLRUCache(100 * 1048576); // 100MB cache
+ leveldb::DB* db;
+ leveldb::DB::Open(options, name, &amp;db);
+ ... use the db ...
+ delete db;
+ delete options.cache;
+</pre>
+Note that the cache holds uncompressed data, and therefore it should
+be sized according to application level data sizes, without any
+reduction from compression. (Caching of compressed blocks is left to
+the operating system buffer cache, or any custom <code>Env</code>
+implementation provided by the client.)
+<p>
+When performing a bulk read, the application may wish to disable
+caching so that the data processed by the bulk read does not end up
+displacing most of the cached contents. A per-iterator option can be
+used to achieve this:
+<p>
+<pre>
+ leveldb::ReadOptions options;
+ options.fill_cache = false;
+ leveldb::Iterator* it = db-&gt;NewIterator(options);
+ for (it-&gt;SeekToFirst(); it-&gt;Valid(); it-&gt;Next()) {
+ ...
+ }
+</pre>
+<h2>Key Layout</h2>
+<p>
+Note that the unit of disk transfer and caching is a block. Adjacent
+keys (according to the database sort order) will usually be placed in
+the same block. Therefore the application can improve its performance
+by placing keys that are accessed together near each other and placing
+infrequently used keys in a separate region of the key space.
+<p>
+For example, suppose we are implementing a simple file system on top
+of <code>leveldb</code>. The types of entries we might wish to store are:
+<p>
+<pre>
+ filename -&gt; permission-bits, length, list of file_block_ids
+ file_block_id -&gt; data
+</pre>
+We might want to prefix <code>filename</code> keys with one letter (say '/') and the
+<code>file_block_id</code> keys with a different letter (say '0') so that scans
+over just the metadata do not force us to fetch and cache bulky file
+contents.
+<p>
+<h2>Large Values</h2>
+<p>
+<code>leveldb</code> has special treatment of large values (by default, a value
+of length greater than or equal to 64K is considered large, though a
+field in Options can be used to adjust this threshold). Each such
+large value is placed in a separate operating system file, and the
+normal database blocks just contain pointers to such files.
+<p>
+Furthermore, if the same large value occurs multiple times in a single
+database, it will be stored just once.
+<p>
+<h1>Checksums</h1>
+<p>
+<code>leveldb</code> associates checksums with all data it stores in the file system.
+There are two separate controls provided over how aggressively these
+checksums are verified:
+<p>
+<ul>
+<li> <code>ReadOptions::verify_checksums</code> may be set to true to force
+ checksum verification of all data that is read from the file system on
+ behalf of a particular read. By default, no such verification is
+ done.
+<p>
+<li> <code>Options::paranoid_checks</code> may be set to true before opening a
+ database to make the database implementation raise an error as soon as
+ it detects an internal corruption. Depending on which portion of the
+ database has been corrupted, the error may be raised when the database
+ is opened, or later by another database operation. By default,
+ paranoid checking is off so that the database can be used even if
+ parts of its persistent storage have been corrupted.
+<p>
+ If a database is corrupted (perhaps it cannot be opened when
+ paranoid checking is turned on), the <code>leveldb::RepairDB</code> function
+ may be used to recover as much of the data as possible.
+<p>
+</ul>
+<h1>Approximate Sizes</h1>
+<p>
+The <code>GetApproximateSizes</code> method can be used to get the approximate
+number of bytes of file system space used by one or more key ranges.
+<p>
+<pre>
+ leveldb::Range ranges[2] = {
+ leveldb::Range("a", "c"), leveldb::Range("x", "z")
+ };
+ uint64_t sizes[2];
+ leveldb::Status s = db-&gt;GetApproximateSizes(ranges, 2, sizes);
+</pre>
+The preceding call will set <code>sizes[0]</code> to the approximate number of
+bytes of file system space used by the key range <code>[a..c)</code> and
+<code>sizes[1]</code> to the approximate number of bytes used by the key range
+<code>[x..z)</code>.
+<p>
+<h1>Environment</h1>
+<p>
+All file operations (and other operating system calls) issued by the
+<code>leveldb</code> implementation are routed through a <code>leveldb::Env</code> object.
+Sophisticated clients may wish to provide their own <code>Env</code>
+implementation to get better control. For example, an application may
+introduce artificial delays in the file IO paths to limit the impact
+of <code>leveldb</code> on other activities in the system.
+<p>
+<pre>
+ class SlowEnv : public leveldb::Env {
+ ... implementation of the Env interface ...
+ };
+
+ SlowEnv env;
+ leveldb::Options options;
+ options.env = &amp;env;
+ leveldb::Status s = leveldb::DB::Open(options, ...);
+</pre>
+<h1>Porting</h1>
+<p>
+<code>leveldb</code> may be ported to a new platform by providing platform
+specific implementations of the types/methods/functions exported by
+<code>leveldb/port/port.h</code>. See <code>leveldb/port/port_example.h</code> for more
+details.
+<p>
+In addition, the new platform may need a new default <code>leveldb::Env</code>
+implementation. See <code>leveldb/util/env_posix.h</code> for an example.
+
+<h1>Other Information</h1>
+
+<p>
+Details about the <code>leveldb</code> implementation may be found in
+the following documents:
+<ul>
+<li> <a href="impl.html">Implementation notes</a>
+<li> <a href="table_format.txt">Format of an immutable Table file</a>
+<li> <a href="log_format.txt">Format of a log file</a>
+</ul>
+
+</body>
+</html>
diff --git a/doc/log_format.txt b/doc/log_format.txt
new file mode 100644
index 0000000..9a801d4
--- /dev/null
+++ b/doc/log_format.txt
@@ -0,0 +1,72 @@
+The log file contents are a sequence of 32KB blocks. The only
+exception is that the tail of the file may contain a partial block.
+
+Each block consists of a sequence of records:
+ block := record* trailer?
+ record :=
+ checksum: uint32 // crc32c of type and data[]
+ length: uint16
+ type: uint8 // One of FULL, FIRST, MIDDLE, LAST
+ data: uint8[length]
+
+A record never starts within the last seven bytes of a block. Any
+leftover bytes here form the trailer, which must consist entirely of
+zero bytes and must be skipped by readers. In particular, even if
+there are exactly seven bytes left in the block, and a zero-length
+user record is added (which will fit in these seven bytes), the writer
+must skip these trailer bytes and add the record to the next block.
+
+More types may be added in the future. Some Readers may skip record
+types they do not understand, others may report that some data was
+skipped.
+
+FULL == 1
+FIRST == 2
+MIDDLE == 3
+LAST == 4
+
+The FULL record contains the contents of an entire user record.
+
+FIRST, MIDDLE, LAST are types used for user records that have been
+split into multiple fragments (typically because of block boundaries).
+FIRST is the type of the first fragment of a user record, LAST is the
+type of the last fragment of a user record, and MIDDLE is the type of all
+interior fragments of a user record.
+
+Example: consider a sequence of user records:
+ A: length 1000
+ B: length 97270
+ C: length 8000
+A will be stored as a FULL record in the first block.
+
+B will be split into three fragments: first fragment occupies the rest
+of the first block, second fragment occupies the entirety of the
+second block, and the third fragment occupies a prefix of the third
+block. This will leave six bytes free in the third block, which will
+be left empty as the trailer.
+
+C will be stored as a FULL record in the fourth block.
+
+===================
+
+Some benefits over the recordio format:
+
+(1) We do not need any heuristics for resyncing - just go to next
+block boundary and scan. If there is a corruption, skip to the next
+block. As a side-benefit, we do not get confused when part of the
+contents of one log file are embedded as a record inside another log
+file.
+
+(2) Splitting at approximate boundaries (e.g., for mapreduce) is
+simple: find the next block boundary and skip records until we
+hit a FULL or FIRST record.
+
+(3) We do not need extra buffering for large records.
+
+Some downsides compared to recordio format:
+
+(1) No packing of tiny records. This could be fixed by adding a new
+record type, so it is a shortcoming of the current implementation,
+not necessarily the format.
+
+(2) No compression. Again, this could be fixed by adding new record types.
diff --git a/doc/table_format.txt b/doc/table_format.txt
new file mode 100644
index 0000000..ad5aa4b
--- /dev/null
+++ b/doc/table_format.txt
@@ -0,0 +1,61 @@
+File format
+===========
+
+ <beginning_of_file>
+ [data block 1]
+ [data block 2]
+ ...
+ [data block N]
+ [meta block 1]
+ ...
+ [meta block K]
+ [metaindex block]
+ [index block]
+ [Footer] (fixed size; starts at file_size - sizeof(Footer))
+ <end_of_file>
+
+The file contains internal pointers. Each such pointer is called
+a BlockHandle and contains the following information:
+ offset: varint64
+ size: varint64
+
+(1) The sequence of key/value pairs in the file are stored in sorted
+order and partitioned into a sequence of data blocks. These blocks
+come one after another at the beginning of the file. Each data block
+is formatted according to the code in block_builder.cc, and then
+optionally compressed.
+
+(2) After the data blocks we store a bunch of meta blocks. The
+supported meta block types are described below. More meta block types
+may be added in the future. Each meta block is again formatted using
+block_builder.cc and then optionally compressed.
+
+(3) A "metaindex" block. It contains one entry for every other meta
+block where the key is the name of the meta block and the value is a
+BlockHandle pointing to that meta block.
+
+(4) An "index" block. This block contains one entry per data block,
+where the key is a string >= last key in that data block and before
+the first key in the successive data block. The value is the
+BlockHandle for the data block.
+
+(5) At the very end of the file is a fixed length footer that contains
+the BlockHandle of the metaindex and index blocks as well as a magic number.
+ metaindex_handle: char[p]; // Block handle for metaindex
+ index_handle: char[q]; // Block handle for index
+ padding: char[40-p-q]; // 0 bytes to make fixed length
+ // (40==2*BlockHandle::kMaxEncodedLength)
+ magic: fixed64; // == 0xdb4775248b80fb57
+
+"stats" Meta Block
+------------------
+
+This meta block contains a bunch of stats. The key is the name
+of the statistic. The value contains the statistic.
+TODO(postrelease): record following stats.
+ data size
+ index size
+ key size (uncompressed)
+ value size (uncompressed)
+ number of entries
+ number of data blocks
diff --git a/include/cache.h b/include/cache.h
new file mode 100644
index 0000000..6c98cb8
--- /dev/null
+++ b/include/cache.h
@@ -0,0 +1,99 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Cache is an interface that maps keys to values. It has internal
+// synchronization and may be safely accessed concurrently from
+// multiple threads. It may automatically evict entries to make room
+// for new entries. Values have a specified charge against the cache
+// capacity. For example, a cache where the values are variable
+// length strings, may use the length of the string as the charge for
+// the string.
+//
+// A builtin cache implementation with a least-recently-used eviction
+// policy is provided. Clients may use their own implementations if
+// they want something more sophisticated (like scan-resistance, a
+// custom eviction policy, variable cache sizing, etc.)
+
+#ifndef STORAGE_LEVELDB_INCLUDE_CACHE_H_
+#define STORAGE_LEVELDB_INCLUDE_CACHE_H_
+
+#include <stdint.h>
+#include "include/slice.h"
+
+namespace leveldb {
+
+class Cache;
+
+// Create a new cache with a fixed size capacity (in units of "charge").
+// This implementation of Cache uses a least-recently-used eviction policy.
+extern Cache* NewLRUCache(size_t capacity);
+
+class Cache {
+ public:
+ Cache() { }
+
+ // Destroys all existing entries by calling the "deleter"
+ // function that was passed to Insert() along with each entry.
+ virtual ~Cache();
+
+ // Opaque handle to an entry stored in the cache.
+ struct Handle { };
+
+ // Insert a mapping from key->value into the cache and assign it
+ // the specified charge against the total cache capacity.
+ //
+ // Returns a handle that corresponds to the mapping. The caller
+ // must call this->Release(handle) when the returned mapping is no
+ // longer needed.
+ //
+ // When the inserted entry is no longer needed, the key and
+ // value will be passed to "deleter".
+ virtual Handle* Insert(const Slice& key, void* value, size_t charge,
+ void (*deleter)(const Slice& key, void* value)) = 0;
+
+ // If the cache has no mapping for "key", returns NULL.
+ //
+ // Else return a handle that corresponds to the mapping. The caller
+ // must call this->Release(handle) when the returned mapping is no
+ // longer needed.
+ virtual Handle* Lookup(const Slice& key) = 0;
+
+ // Release a mapping returned by a previous Insert() or Lookup().
+ // REQUIRES: handle must not have been released yet.
+ // REQUIRES: handle must have been returned by a method on *this.
+ virtual void Release(Handle* handle) = 0;
+
+ // Return the value encapsulated in a handle returned by a
+ // successful Insert() or Lookup().
+ // REQUIRES: handle must not have been released yet.
+ // REQUIRES: handle must have been returned by a method on *this.
+ virtual void* Value(Handle* handle) = 0;
+
+ // If the cache contains an entry for key, erase it. Note that the
+ // underlying entry will be kept around until all existing handles
+ // to it have been released.
+ virtual void Erase(const Slice& key) = 0;
+
+ // Return a new numeric id. May be used by multiple clients who are
+ // sharing the same cache to partition the key space. Typically the
+ // client will allocate a new id at startup and prepend the id to
+ // its cache keys.
+ virtual uint64_t NewId() = 0;
+
+ private:
+ void LRU_Remove(Handle* e);
+ void LRU_Append(Handle* e);
+ void Unref(Handle* e);
+
+ struct Rep;
+ Rep* rep_;
+
+ // No copying allowed
+ Cache(const Cache&);
+ void operator=(const Cache&);
+};
+
+}
+
+#endif // STORAGE_LEVELDB_INCLUDE_CACHE_H_
diff --git a/include/comparator.h b/include/comparator.h
new file mode 100644
index 0000000..4e00e4d
--- /dev/null
+++ b/include/comparator.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
+#define STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
+
+#include <string>
+
+namespace leveldb {
+
+class Slice;
+
+// A Comparator object provides a total order across slices that are
+// used as keys in an sstable or a database.
+class Comparator {
+ public:
+ virtual ~Comparator();
+
+ // Three-way comparison. Returns value:
+ // < 0 iff "a" < "b",
+ // == 0 iff "a" == "b",
+ // > 0 iff "a" > "b"
+ virtual int Compare(const Slice& a, const Slice& b) const = 0;
+
+ // The name of the comparator. Used to check for comparator
+ // mismatches (i.e., a DB created with one comparator is
+ // accessed using a different comparator).
+ //
+ // The client of this package should switch to a new name whenever
+ // the comparator implementation changes in a way that will cause
+ // the relative ordering of any two keys to change.
+ //
+ // Names starting with "leveldb." are reserved and should not be used
+ // by any clients of this package.
+ virtual const char* Name() const = 0;
+
+ // Advanced functions: these are used to reduce the space requirements
+ // for internal data structures like index blocks.
+
+ // If *start < limit, changes *start to a short string in [start,limit).
+ // Simple comparator implementations may return with *start unchanged,
+ // i.e., an implementation of this method that does nothing is correct.
+ virtual void FindShortestSeparator(
+ std::string* start,
+ const Slice& limit) const = 0;
+
+ // Changes *key to a short string >= *key.
+ // Simple comparator implementations may return with *key unchanged,
+ // i.e., an implementation of this method that does nothing is correct.
+ virtual void FindShortSuccessor(std::string* key) const = 0;
+};
+
+// Return a builtin comparator that uses lexicographic byte-wise
+// ordering. The result remains the property of this module and
+// must not be deleted.
+extern const Comparator* BytewiseComparator();
+
+}
+
+#endif // STORAGE_LEVELDB_INCLUDE_COMPARATOR_H_
diff --git a/include/db.h b/include/db.h
new file mode 100644
index 0000000..c4d152d
--- /dev/null
+++ b/include/db.h
@@ -0,0 +1,137 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_DB_H_
+#define STORAGE_LEVELDB_INCLUDE_DB_H_
+
+#include <stdint.h>
+#include <stdio.h>
+#include "include/iterator.h"
+#include "include/options.h"
+
+namespace leveldb {
+
+struct Options;
+struct ReadOptions;
+struct WriteOptions;
+
+class Snapshot;
+class WriteBatch;
+
+// Some internal types. Clients should ignore.
+class WriteBatchInternal;
+
+struct Range {
+ Slice start;
+ Slice limit;
+
+ Range(const Slice& s, const Slice& l) : start(s), limit(l) { }
+};
+
+// A DB is a persistent ordered map from keys to values.
+class DB {
+ public:
+ // Open the database with the specified "name".
+ // Stores a pointer to a heap-allocated database in *dbptr and returns
+ // OK on success.
+ // Stores NULL in *dbptr and returns a non-OK status on error.
+ // Caller should delete *dbptr when it is no longer needed.
+ static Status Open(const Options& options,
+ const std::string& name,
+ DB** dbptr);
+
+ DB() { }
+ virtual ~DB();
+
+ // Set the database entry for "key" to "value". Returns OK on success,
+ // and a non-OK status on error.
+ // Note: consider setting options.sync = false.
+ virtual Status Put(const WriteOptions& options,
+ const Slice& key,
+ const Slice& value) = 0;
+
+ // Remove the database entry (if any) for "key". Returns OK on
+ // success, and a non-OK status on error. It is not an error if "key"
+ // did not exist in the database.
+ // Note: consider setting options.sync = false.
+ virtual Status Delete(const WriteOptions& options, const Slice& key) = 0;
+
+ // Apply the specified updates to the database.
+ // Returns OK on success, non-OK on failure.
+ // Note: consider setting options.sync = false.
+ virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;
+
+ // If the database contains an entry for "key" store the
+ // corresponding value in *value and return OK.
+ //
+ // If there is no entry for "key" leave *value unchanged and return
+ // a status for which Status::IsNotFound() returns true.
+ //
+ // May return some other Status on an error.
+ virtual Status Get(const ReadOptions& options,
+ const Slice& key, std::string* value) = 0;
+
+ // Return a heap-allocated iterator over the contents of the database.
+ // The result of NewIterator() is initially invalid (caller must
+ // call one of the Seek methods on the iterator before using it).
+ //
+ // Caller should delete the iterator when it is no longer needed.
+ // The returned iterator should be deleted before this db is deleted.
+ virtual Iterator* NewIterator(const ReadOptions& options) = 0;
+
+ // Return a handle to the current DB state. Iterators created with
+ // this handle will all observe a stable snapshot of the current DB
+ // state. The caller must call ReleaseSnapshot(result) when the
+ // snapshot is no longer needed.
+ virtual const Snapshot* GetSnapshot() = 0;
+
+ // Release a previously acquired snapshot. The caller must not
+ // use "snapshot" after this call.
+ virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0;
+
+ // DB implementations can export properties about their state
+ // via this method. If "property" is a valid property understood by this
+ // DB implementation, fills "*value" with its current value and returns
+ // true. Otherwise returns false.
+ //
+ //
+ // Valid property names include:
+ //
+ // "leveldb.num-files-at-level<N>" - return the number of files at level <N>,
+ // where <N> is an ASCII representation of a level number (e.g. "0").
+ virtual bool GetProperty(const Slice& property, uint64_t* value) = 0;
+
+ // For each i in [0,n-1], store in "sizes[i]", the approximate
+ // file system space used by keys in "[range[i].start .. range[i].limit)".
+ //
+ // Note that the returned sizes measure file system space usage, so
+ // if the user data compresses by a factor of ten, the returned
+ // sizes will be one-tenth the size of the corresponding user data size.
+ //
+ // The results may not include the sizes of recently written data.
+ virtual void GetApproximateSizes(const Range* range, int n,
+ uint64_t* sizes) = 0;
+
+ // Possible extensions:
+ // (1) Add a method to compact a range of keys
+
+ private:
+ // No copying allowed
+ DB(const DB&);
+ void operator=(const DB&);
+};
+
+// Destroy the contents of the specified database.
+// Be very careful using this method.
+Status DestroyDB(const std::string& name, const Options& options);
+
+// If a DB cannot be opened, you may attempt to call this method to
+// resurrect as much of the contents of the database as possible.
+// Some data may be lost, so be careful when calling this function
+// on a database that contains important information.
+Status RepairDB(const std::string& dbname, const Options& options);
+
+}
+
+#endif // STORAGE_LEVELDB_INCLUDE_DB_H_
diff --git a/include/env.h b/include/env.h
new file mode 100644
index 0000000..a728f29
--- /dev/null
+++ b/include/env.h
@@ -0,0 +1,293 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An Env is an interface used by the leveldb implementation to access
+// operating system functionality like the filesystem etc. Callers
+// may wish to provide a custom Env object when opening a database to
+// get fine-grained control; e.g., to rate limit file system operations.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_ENV_H_
+#define STORAGE_LEVELDB_INCLUDE_ENV_H_
+
+#include <cstdarg>
+#include <string>
+#include <vector>
+#include <stdint.h>
+#include "include/status.h"
+
+namespace leveldb {
+
+class FileLock;
+class RandomAccessFile;
+class SequentialFile;
+class Slice;
+class WritableFile;
+
+class Env {
+ public:
+ Env() { }
+ virtual ~Env();
+
+ // Return a default environment suitable for the current operating
+ // system. Sophisticated users may wish to provide their own Env
+ // implementation instead of relying on this default environment.
+ //
+ // The result of Default() belongs to leveldb and must never be deleted.
+ static Env* Default();
+
+ // Create an object that sequentially reads the file with the specified name.
+ // On success, stores a pointer to the new file in *result and returns OK.
+ // On failure stores NULL in *result and returns non-OK. If the file does
+ // not exist, returns a non-OK status.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ virtual Status NewSequentialFile(const std::string& fname,
+ SequentialFile** result) = 0;
+
+ // Create an object supporting random-access reads from the file with the
+ // specified name. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure stores NULL in *result and
+ // returns non-OK. If the file does not exist, returns a non-OK
+ // status.
+ //
+ // The returned file may be concurrently accessed by multiple threads.
+ virtual Status NewRandomAccessFile(const std::string& fname,
+ RandomAccessFile** result) = 0;
+
+ // Create an object that writes to a new file with the specified
+ // name. Deletes any existing file with the same name and creates a
+ // new file. On success, stores a pointer to the new file in
+ // *result and returns OK. On failure stores NULL in *result and
+ // returns non-OK.
+ //
+ // The returned file will only be accessed by one thread at a time.
+ virtual Status NewWritableFile(const std::string& fname,
+ WritableFile** result) = 0;
+
+ // Returns true iff the named file exists.
+ virtual bool FileExists(const std::string& fname) = 0;
+
+ // Store in *result the names of the children of the specified directory.
+ // The names are relative to "dir".
+ // Original contents of *result are dropped.
+ virtual Status GetChildren(const std::string& dir,
+ std::vector<std::string>* result) = 0;
+
+ // Delete the named file.
+ virtual Status DeleteFile(const std::string& fname) = 0;
+
+ // Create the specified directory.
+ virtual Status CreateDir(const std::string& dirname) = 0;
+
+ // Delete the specified directory.
+ virtual Status DeleteDir(const std::string& dirname) = 0;
+
+ // Store the size of fname in *file_size.
+ virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) = 0;
+
+ // Rename file src to target.
+ virtual Status RenameFile(const std::string& src,
+ const std::string& target) = 0;
+
+ // Lock the specified file. Used to prevent concurrent access to
+ // the same db by multiple processes. On failure, stores NULL in
+ // *lock and returns non-OK.
+ //
+ // On success, stores a pointer to the object that represents the
+ // acquired lock in *lock and returns OK. The caller should call
+ // UnlockFile(*lock) to release the lock. If the process exits,
+ // the lock will be automatically released.
+ //
+ // If somebody else already holds the lock, finishes immediately
+ // with a failure. I.e., this call does not wait for existing locks
+ // to go away.
+ //
+ // May create the named file if it does not already exist.
+ virtual Status LockFile(const std::string& fname, FileLock** lock) = 0;
+
+ // Release the lock acquired by a previous successful call to LockFile.
+ // REQUIRES: lock was returned by a successful LockFile() call
+ // REQUIRES: lock has not already been unlocked.
+ virtual Status UnlockFile(FileLock* lock) = 0;
+
+ // Arrange to run "(*function)(arg)" once in a background thread.
+ //
+ // "function" may run in an unspecified thread. Multiple functions
+ // added to the same Env may run concurrently in different threads.
+ // I.e., the caller may not assume that background work items are
+ // serialized.
+ virtual void Schedule(
+ void (*function)(void* arg),
+ void* arg) = 0;
+
+ // Start a new thread, invoking "function(arg)" within the new thread.
+ // When "function(arg)" returns, the thread will be destroyed.
+ virtual void StartThread(void (*function)(void* arg), void* arg) = 0;
+
+ // *path is set to a temporary directory that can be used for testing. It may
+ // or may not have just been created. The directory may or may not differ
+ // between runs of the same process, but subsequent calls will return the
+ // same directory.
+ virtual Status GetTestDirectory(std::string* path) = 0;
+
+ // Write an entry to the log file with the specified format.
+ virtual void Logv(WritableFile* log, const char* format, va_list ap) = 0;
+
+ // Returns the number of micro-seconds since some fixed point in time. Only
+ // useful for computing deltas of time.
+ virtual uint64_t NowMicros() = 0;
+
+ // Sleep/delay the thread for the prescribed number of micro-seconds.
+ virtual void SleepForMicroseconds(int micros) = 0;
+
+ private:
+ // No copying allowed
+ Env(const Env&);
+ void operator=(const Env&);
+};
+
+// A file abstraction for reading sequentially through a file
+class SequentialFile {
+ public:
+ SequentialFile() { }
+ virtual ~SequentialFile();
+
+ // Read up to "n" bytes from the file. "scratch[0..n-1]" may be
+ // written by this routine. Sets "*result" to the data that was
+ // read (including if fewer than "n" bytes were successfully read).
+ // If an error was encountered, returns a non-OK status.
+ //
+ // REQUIRES: External synchronization
+ virtual Status Read(size_t n, Slice* result, char* scratch) = 0;
+};
+
+// A file abstraction for randomly reading the contents of a file.
+class RandomAccessFile {
+ public:
+ RandomAccessFile() { }
+ virtual ~RandomAccessFile();
+
+ // Return the length of this file in bytes.
+ virtual uint64_t Size() const = 0;
+
+ // Read up to "n" bytes from the file starting at "offset".
+ // "scratch[0..n-1]" may be written by this routine. Sets "*result"
+ // to the data that was read (including if fewer than "n" bytes were
+ // successfully read). If an error was encountered, returns a
+ // non-OK status.
+ //
+ // Safe for concurrent use by multiple threads.
+ virtual Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const = 0;
+};
+
+// A file abstraction for sequential writing. The implementation
+// must provide buffering since callers may append small fragments
+// at a time to the file.
+class WritableFile {
+ public:
+ WritableFile() { }
+ virtual ~WritableFile();
+
+ virtual Status Append(const Slice& data) = 0;
+ virtual Status Close() = 0;
+ virtual Status Flush() = 0;
+ virtual Status Sync() = 0;
+
+ private:
+ // No copying allowed
+ WritableFile(const WritableFile&);
+ void operator=(const WritableFile&);
+};
+
+// Identifies a locked file.
+class FileLock {
+ public:
+ FileLock() { }
+ virtual ~FileLock();
+ private:
+ // No copying allowed
+ FileLock(const FileLock&);
+ void operator=(const FileLock&);
+};
+
+// Log the specified data to *info_log if info_log is non-NULL.
+extern void Log(Env* env, WritableFile* info_log, const char* format, ...)
+# if defined(__GNUC__) || defined(__clang__)
+ __attribute__((__format__ (__printf__, 3, 4)))
+# endif
+ ;
+
+// A utility routine: write "data" to the named file.
+extern Status WriteStringToFile(Env* env, const Slice& data,
+ const std::string& fname);
+
+// A utility routine: read contents of named file into *data
+extern Status ReadFileToString(Env* env, const std::string& fname,
+ std::string* data);
+
+// An implementation of Env that forwards all calls to another Env.
+// May be useful to clients who wish to override just part of the
+// functionality of another Env.
+class EnvWrapper : public Env {
+ public:
+ // Initialize an EnvWrapper that delegates all calls to *target
+ explicit EnvWrapper(Env* target) : target_(target) { }
+ virtual ~EnvWrapper();
+
+ // Return the target to which this Env forwards all calls
+ Env* target() const { return target_; }
+
+ // The following text is boilerplate that forwards all methods to target()
+ Status NewSequentialFile(const std::string& f, SequentialFile** r) {
+ return target_->NewSequentialFile(f, r);
+ }
+ Status NewRandomAccessFile(const std::string& f, RandomAccessFile** r) {
+ return target_->NewRandomAccessFile(f, r);
+ }
+ Status NewWritableFile(const std::string& f, WritableFile** r) {
+ return target_->NewWritableFile(f, r);
+ }
+ bool FileExists(const std::string& f) { return target_->FileExists(f); }
+ Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
+ return target_->GetChildren(dir, r);
+ }
+ Status DeleteFile(const std::string& f) { return target_->DeleteFile(f); }
+ Status CreateDir(const std::string& d) { return target_->CreateDir(d); }
+ Status DeleteDir(const std::string& d) { return target_->DeleteDir(d); }
+ Status GetFileSize(const std::string& f, uint64_t* s) {
+ return target_->GetFileSize(f, s);
+ }
+ Status RenameFile(const std::string& s, const std::string& t) {
+ return target_->RenameFile(s, t);
+ }
+ Status LockFile(const std::string& f, FileLock** l) {
+ return target_->LockFile(f, l);
+ }
+ Status UnlockFile(FileLock* l) { return target_->UnlockFile(l); }
+ void Schedule(void (*f)(void*), void* a) {
+ return target_->Schedule(f, a);
+ }
+ void StartThread(void (*f)(void*), void* a) {
+ return target_->StartThread(f, a);
+ }
+ virtual Status GetTestDirectory(std::string* path) {
+ return target_->GetTestDirectory(path);
+ }
+ virtual void Logv(WritableFile* log, const char* format, va_list ap) {
+ return target_->Logv(log, format, ap);
+ }
+ uint64_t NowMicros() {
+ return target_->NowMicros();
+ }
+ void SleepForMicroseconds(int micros) {
+ target_->SleepForMicroseconds(micros);
+ }
+ private:
+ Env* target_;
+};
+
+}
+
+#endif // STORAGE_LEVELDB_INCLUDE_ENV_H_
diff --git a/include/iterator.h b/include/iterator.h
new file mode 100644
index 0000000..b0872a3
--- /dev/null
+++ b/include/iterator.h
@@ -0,0 +1,95 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// An iterator yields a sequence of key/value pairs from a source.
+// The following class defines the interface. Multiple implementations
+// are provided by this library. In particular, iterators are provided
+// to access the contents of a Table or a DB.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
+#define STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
+
+#include "include/slice.h"
+#include "include/status.h"
+
+namespace leveldb {
+
+class Iterator {
+ public:
+ Iterator();
+ virtual ~Iterator();
+
+ // An iterator is either positioned at a key/value pair, or
+ // not valid. This method returns true iff the iterator is valid.
+ virtual bool Valid() const = 0;
+
+ // Position at the first key in the source. The iterator is Valid()
+ // after this call iff the source is not empty.
+ virtual void SeekToFirst() = 0;
+
+ // Position at the last key in the source. The iterator is
+ // Valid() after this call iff the source is not empty.
+ virtual void SeekToLast() = 0;
+
+ // Position at the first key in the source that is at or past target.
+ // The iterator is Valid() after this call iff the source contains
+ // an entry that comes at or past target.
+ virtual void Seek(const Slice& target) = 0;
+
+ // Moves to the next entry in the source. After this call, Valid() is
+ // true iff the iterator was not positioned at the last entry in the source.
+ // REQUIRES: Valid()
+ virtual void Next() = 0;
+
+ // Moves to the previous entry in the source. After this call, Valid() is
+ // true iff the iterator was not positioned at the first entry in source.
+ // REQUIRES: Valid()
+ virtual void Prev() = 0;
+
+ // Return the key for the current entry. The underlying storage for
+ // the returned slice is valid only until the next modification of
+ // the iterator.
+ // REQUIRES: Valid()
+ virtual Slice key() const = 0;
+
+ // Return the value for the current entry. The underlying storage for
+ // the returned slice is valid only until the next modification of
+ // the iterator.
+ // REQUIRES: Valid()
+ virtual Slice value() const = 0;
+
+ // If an error has occurred, return it. Else return an ok status.
+ virtual Status status() const = 0;
+
+ // Clients are allowed to register function/arg1/arg2 triples that
+ // will be invoked when this iterator is destroyed.
+ //
+ // Note that unlike all of the preceding methods, this method is
+ // not abstract and therefore clients should not override it.
+ typedef void (*CleanupFunction)(void* arg1, void* arg2);
+ void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2);
+
+ private:
+ struct Cleanup {
+ CleanupFunction function;
+ void* arg1;
+ void* arg2;
+ Cleanup* next;
+ };
+ Cleanup cleanup_;
+
+ // No copying allowed
+ Iterator(const Iterator&);
+ void operator=(const Iterator&);
+};
+
+// Return an empty iterator (yields nothing).
+extern Iterator* NewEmptyIterator();
+
+// Return an empty iterator with the specified status.
+extern Iterator* NewErrorIterator(const Status& status);
+
+}
+
+#endif // STORAGE_LEVELDB_INCLUDE_ITERATOR_H_
diff --git a/include/options.h b/include/options.h
new file mode 100644
index 0000000..1105570
--- /dev/null
+++ b/include/options.h
@@ -0,0 +1,203 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
+#define STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
+
+#include <stddef.h>
+
+namespace leveldb {
+
+class Cache;
+class Comparator;
+class Env;
+class Snapshot;
+class WritableFile;
+
+// DB contents are stored in a set of blocks, each of which holds a
+// sequence of key,value pairs. Each block may be compressed before
+// being stored in a file. The following enum describes which
+// compression method (if any) is used to compress a block.
+enum CompressionType {
+ // NOTE: do not change the values of existing entries, as these are
+ // part of the persistent format on disk.
+ kNoCompression = 0x0,
+ kLightweightCompression = 0x1,
+};
+
+// Options to control the behavior of a database (passed to DB::Open)
+struct Options {
+ // -------------------
+ // Parameters that affect behavior
+
+ // Comparator used to define the order of keys in the table.
+ // Default: a comparator that uses lexicographic byte-wise ordering
+ //
+ // REQUIRES: The client must ensure that the comparator supplied
+ // here has the same name and orders keys *exactly* the same as the
+ // comparator provided to previous open calls on the same DB.
+ const Comparator* comparator;
+
+ // If true, the database will be created if it is missing.
+ // Default: false
+ bool create_if_missing;
+
+ // If true, an error is raised if the database already exists.
+ // Default: false
+ bool error_if_exists;
+
+ // If true, the implementation will do aggressive checking of the
+ // data it is processing and will stop early if it detects any
+ // errors. This may have unforeseen ramifications: for example, a
+ // corruption of one DB entry may cause a large number of entries to
+ // become unreadable or for the entire DB to become unopenable.
+ // Default: false
+ bool paranoid_checks;
+
+ // Use the specified object to interact with the environment,
+ // e.g. to read/write files, schedule background work, etc.
+ // Default: Env::Default()
+ Env* env;
+
+ // Any internal progress/error information generated by the db will
+ // be written to info_log if it is non-NULL, or to a file stored
+ // in the same directory as the DB contents if info_log is NULL.
+ // Default: NULL
+ WritableFile* info_log;
+
+ // -------------------
+ // Parameters that affect performance
+
+ // Amount of data to build up in memory before converting to an
+ // on-disk file.
+ //
+ // Some DB operations may encounter a delay proportional to the size
+ // of this parameter. Therefore we recommend against increasing
+ // this parameter unless you are willing to live with an occasional
+ // slow operation in exchange for faster bulk loading throughput.
+ //
+ // Default: 1MB
+ size_t write_buffer_size;
+
+ // Number of open files that can be used by the DB. You may need to
+ // increase this if your database has a large working set (budget
+ // one open file per 2MB of working set).
+ //
+ // Default: 1000
+ int max_open_files;
+
+ // Handle values larger than "large_value_threshold" bytes
+ // specially, by writing them into their own files (to avoid
+ // compaction overhead) and doing content-based elimination of
+ // duplicate values to save space.
+ //
+ // We recommend against changing this value.
+ //
+ // Default: 64K
+ size_t large_value_threshold;
+
+ // Control over blocks (user data is stored in a set of blocks, and
+ // a block is the unit of reading from disk).
+
+ // Use the specified cache for blocks (if non-NULL).
+ // Default: NULL
+ Cache* block_cache;
+
+ // Approximate size of user data packed per block. Note that the
+ // block size specified here corresponds to uncompressed data. The
+ // actual size of the unit read from disk may be smaller if
+ // compression is enabled. This parameter can be changed dynamically.
+ //
+ // Default: 8K
+ int block_size;
+
+ // Number of keys between restart points for delta encoding of keys.
+ // This parameter can be changed dynamically. Most clients should
+ // leave this parameter alone.
+ //
+ // Default: 16
+ int block_restart_interval;
+
+ // Compress blocks using the specified compression algorithm. This
+ // parameter can be changed dynamically.
+ //
+ // Default: kLightweightCompression, which gives lightweight but fast
+ // compression.
+ //
+ // Typical speeds of kLightweightCompression on an Intel(R) Core(TM)2 2.4GHz:
+ // ~200-500MB/s compression
+ // ~400-800MB/s decompression
+ // Note that these speeds are significantly faster than most
+ // persistent storage speeds, and therefore it is typically never
+ // worth switching to kNoCompression. Even if the input data is
+ // incompressible, the kLightweightCompression implementation will
+ // efficiently detect that and will switch to uncompressed mode.
+ CompressionType compression;
+
+ // Create an Options object with default values for all fields.
+ Options();
+};
+
+// Options that control read operations
+struct ReadOptions {
+ // If true, all data read from underlying storage will be
+ // verified against corresponding checksums.
+ // Default: false
+ bool verify_checksums;
+
+ // Should the data read for this iteration be cached in memory?
+ // Callers may wish to set this field to false for bulk scans.
+ // Default: true
+ bool fill_cache;
+
+ // If "snapshot" is non-NULL, read as of the supplied snapshot
+ // (which must belong to the DB that is being read and which must
+ // not have been released). If "snapshot" is NULL, use an implicit
+ // snapshot of the state at the beginning of this read operation.
+ // Default: NULL
+ const Snapshot* snapshot;
+
+ ReadOptions()
+ : verify_checksums(false),
+ fill_cache(true),
+ snapshot(NULL) {
+ }
+};
+
+// Options that control write operations
+struct WriteOptions {
+ // If true, the write will be flushed from the operating system
+ // buffer cache (by calling WritableFile::Sync()) before the write
+ // is considered complete. If this flag is true, writes will be
+ // slower.
+ //
+ // If this flag is false, and the machine crashes, some recent
+ // writes may be lost. Note that if it is just the process that
+ // crashes (i.e., the machine does not reboot), no writes will be
+ // lost even if sync==false.
+ //
+ // Default: true
+ bool sync;
+
+ // If "post_write_snapshot" is non-NULL, and the write succeeds,
+ // *post_write_snapshot will be modified to point to a snapshot of
+ // the DB state immediately after this write. The caller must call
+ // DB::ReleaseSnapshot(*post_write_snapshot) when the
+ // snapshot is no longer needed.
+ //
+ // If "post_write_snapshot" is non-NULL, and the write fails,
+ // *post_write_snapshot will be set to NULL.
+ //
+ // Default: NULL
+ const Snapshot** post_write_snapshot;
+
+ WriteOptions()
+ : sync(true),
+ post_write_snapshot(NULL) {
+ }
+};
+
+}
+
+#endif // STORAGE_LEVELDB_INCLUDE_OPTIONS_H_
diff --git a/include/slice.h b/include/slice.h
new file mode 100644
index 0000000..62cb894
--- /dev/null
+++ b/include/slice.h
@@ -0,0 +1,104 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Slice is a simple structure containing a pointer into some external
+// storage and a size. The user of a Slice must ensure that the slice
+// is not used after the corresponding external storage has been
+// deallocated.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_SLICE_H_
+#define STORAGE_LEVELDB_INCLUDE_SLICE_H_
+
+#include <assert.h>
+#include <stddef.h>
+#include <string.h>
+#include <string>
+
+namespace leveldb {
+
+class Slice {
+ public:
+ // Create an empty slice.
+ Slice() : data_(""), size_(0) { }
+
+ // Create a slice that refers to data[0,n-1].
+ Slice(const char* data, size_t n) : data_(data), size_(n) { }
+
+ // Create a slice that refers to the contents of "s"
+ Slice(const std::string& s) : data_(s.data()), size_(s.size()) { }
+
+ // Create a slice that refers to s[0,strlen(s)-1]
+ Slice(const char* s) : data_(s), size_(strlen(s)) { }
+
+ // Return a pointer to the beginning of the referenced data
+ const char* data() const { return data_; }
+
+ // Return the length (in bytes) of the referenced data
+ size_t size() const { return size_; }
+
+ // Return true iff the length of the referenced data is zero
+ bool empty() const { return size_ == 0; }
+
+ // Return the ith byte in the referenced data.
+ // REQUIRES: n < size()
+ char operator[](size_t n) const {
+ assert(n < size());
+ return data_[n];
+ }
+
+ // Change this slice to refer to an empty array
+ void clear() { data_ = ""; size_ = 0; }
+
+ // Drop the first "n" bytes from this slice.
+ void remove_prefix(size_t n) {
+ assert(n <= size());
+ data_ += n;
+ size_ -= n;
+ }
+
+ // Return a string that contains the copy of the referenced data.
+ std::string ToString() const { return std::string(data_, size_); }
+
+ // Three-way comparison. Returns value:
+ // < 0 iff "*this" < "b",
+ // == 0 iff "*this" == "b",
+ // > 0 iff "*this" > "b"
+ int compare(const Slice& b) const;
+
+ // Return true iff "x" is a prefix of "*this"
+ bool starts_with(const Slice& x) const {
+ return ((size_ >= x.size_) &&
+ (memcmp(data_, x.data_, x.size_) == 0));
+ }
+
+ private:
+ const char* data_;
+ size_t size_;
+
+ // Intentionally copyable
+};
+
+inline bool operator==(const Slice& x, const Slice& y) {
+ return ((x.size() == y.size()) &&
+ (memcmp(x.data(), y.data(), x.size()) == 0));
+}
+
+inline bool operator!=(const Slice& x, const Slice& y) {
+ return !(x == y);
+}
+
+inline int Slice::compare(const Slice& b) const {
+  // Compare the common prefix; the length stays size_t so it is never
+  // truncated (an int would wrap for slices of 2GB or more).
+  const size_t min_len = (size_ < b.size_) ? size_ : b.size_;
+  const int r = memcmp(data_, b.data_, min_len);
+  if (r != 0) return r;
+  // Equal prefixes: the shorter slice orders first.
+  return (size_ < b.size_) ? -1 : ((size_ > b.size_) ? +1 : 0);
+}
+
+}
+
+
+#endif // STORAGE_LEVELDB_INCLUDE_SLICE_H_
diff --git a/include/status.h b/include/status.h
new file mode 100644
index 0000000..cd148f6
--- /dev/null
+++ b/include/status.h
@@ -0,0 +1,86 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Status encapsulates the result of an operation. It may indicate success,
+// or it may indicate an error with an associated error message.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_STATUS_H_
+#define STORAGE_LEVELDB_INCLUDE_STATUS_H_
+
+#include <string>
+#include <utility>
+#include "include/slice.h"
+
+namespace leveldb {
+
+class Status {
+ public:
+ // Create a success status.
+ Status() : state_(NULL) { }
+ ~Status() { delete state_; }
+
+ // Copy the specified status.
+ Status(const Status& s);
+ void operator=(const Status& s);
+
+ // Return a success status.
+ static Status OK() { return Status(); }
+
+ // Return error status of an appropriate type.
+ static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) {
+   return Status(kNotFound, msg, msg2);  // forward msg2, as the other factories do
+ }
+ static Status Corruption(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kCorruption, msg, msg2);
+ }
+ static Status NotSupported(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kNotSupported, msg, msg2);
+ }
+ static Status InvalidArgument(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kInvalidArgument, msg, msg2);
+ }
+ static Status IOError(const Slice& msg, const Slice& msg2 = Slice()) {
+ return Status(kIOError, msg, msg2);
+ }
+
+ // Returns true iff the status indicates success.
+ bool ok() const { return (state_ == NULL); }
+
+ // Returns true iff the status indicates a NotFound error.
+ bool IsNotFound() const { return code() == kNotFound; }
+
+ // Return a string representation of this status suitable for printing.
+ // Returns the string "OK" for success.
+ std::string ToString() const;
+
+ private:
+ enum Code {
+ kOk = 0,
+ kNotFound = 1,
+ kCorruption = 2,
+ kNotSupported = 3,
+ kInvalidArgument = 4,
+ kIOError = 5,
+ };
+ Code code() const { return (state_ == NULL) ? kOk : state_->first; }
+
+ Status(Code code, const Slice& msg, const Slice& msg2);
+
+ typedef std::pair<Code, std::string> State;
+ State* state_;
+};
+
+inline Status::Status(const Status& s) {
+ state_ = (s.state_ == NULL) ? NULL : new State(*s.state_);
+}
+inline void Status::operator=(const Status& s) {
+ if (this != &s) {
+ delete state_;
+ state_ = (s.state_ == NULL) ? NULL : new State(*s.state_);
+ }
+}
+
+}
+
+#endif // STORAGE_LEVELDB_INCLUDE_STATUS_H_
diff --git a/include/table.h b/include/table.h
new file mode 100644
index 0000000..96b2196
--- /dev/null
+++ b/include/table.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_H_
+#define STORAGE_LEVELDB_INCLUDE_TABLE_H_
+
+#include <stdint.h>
+#include "include/iterator.h"
+
+namespace leveldb {
+
+class Block;
+class BlockHandle;
+struct Options;
+class RandomAccessFile;
+struct ReadOptions;
+
+// A Table is a sorted map from strings to strings. Tables are
+// immutable and persistent.
+class Table {
+ public:
+ // Attempt to open the table that is stored in "file", and read the
+ // metadata entries necessary to allow retrieving data from the table.
+ //
+ // If successful, returns ok and sets "*table" to the newly opened
+ // table. The client should delete "*table" when no longer needed.
+ // If there was an error while initializing the table, sets "*table"
+ // to NULL and returns a non-ok status. Does not take ownership of
+ // "*file", but the client must ensure that "file" remains live
+ // for the duration of the returned table's lifetime.
+ //
+ // *file must remain live while this Table is in use.
+ static Status Open(const Options& options,
+ RandomAccessFile* file,
+ Table** table);
+
+ ~Table();
+
+ // Returns a new iterator over the table contents.
+ // The result of NewIterator() is initially invalid (caller must
+ // call one of the Seek methods on the iterator before using it).
+ Iterator* NewIterator(const ReadOptions&) const;
+
+ // Given a key, return an approximate byte offset in the file where
+ // the data for that key begins (or would begin if the key were
+ // present in the file). The returned value is in terms of file
+ // bytes, and so includes effects like compression of the underlying data.
+ // E.g., the approximate offset of the last key in the table will
+ // be close to the file length.
+ uint64_t ApproximateOffsetOf(const Slice& key) const;
+
+ private:
+ // Opaque implementation state; Rep is only forward-declared in this header.
+ struct Rep;
+ Rep* rep_;
+
+ // Private: the only public way to obtain a Table is Open().
+ explicit Table(Rep* rep) { rep_ = rep; }
+ static Iterator* BlockReader(void*, const ReadOptions&, const Slice&);
+
+ // No copying allowed
+ Table(const Table&);
+ void operator=(const Table&);
+};
+
+}
+
+#endif // STORAGE_LEVELDB_INCLUDE_TABLE_H_
diff --git a/include/table_builder.h b/include/table_builder.h
new file mode 100644
index 0000000..ecd852e
--- /dev/null
+++ b/include/table_builder.h
@@ -0,0 +1,86 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// TableBuilder provides the interface used to build a Table
+// (an immutable and sorted map from keys to values).
+
+#ifndef STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
+#define STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
+
+#include <stdint.h>
+#include "include/options.h"
+#include "include/status.h"
+
+namespace leveldb {
+
+class BlockBuilder;
+class BlockHandle;
+class WritableFile;
+
+class TableBuilder {
+ public:
+ // Create a builder that will store the contents of the table it is
+ // building in *file. Does not close the file. It is up to the
+ // caller to close the file after calling Finish().
+ TableBuilder(const Options& options, WritableFile* file);
+
+ // REQUIRES: Either Finish() or Abandon() has been called.
+ ~TableBuilder();
+
+ // Change the options used by this builder. Note: only some of the
+ // option fields can be changed after construction. If a field is
+ // not allowed to change dynamically and its value in the structure
+ // passed to the constructor is different from its value in the
+ // structure passed to this method, this method will return an error
+ // without changing any fields.
+ Status ChangeOptions(const Options& options);
+
+ // Add key,value to the table being constructed.
+ // REQUIRES: key is after any previously added key according to comparator.
+ // REQUIRES: Finish(), Abandon() have not been called
+ void Add(const Slice& key, const Slice& value);
+
+ // Advanced operation: flush any buffered key/value pairs to file.
+ // Can be used to ensure that two adjacent entries never live in
+ // the same data block. Most clients should not need to use this method.
+ // REQUIRES: Finish(), Abandon() have not been called
+ void Flush();
+
+ // Return non-ok iff some error has been detected.
+ Status status() const;
+
+ // Finish building the table. Stops using the file passed to the
+ // constructor after this function returns.
+ // REQUIRES: Finish(), Abandon() have not been called
+ Status Finish();
+
+ // Indicate that the contents of this builder should be abandoned. Stops
+ // using the file passed to the constructor after this function returns.
+ // If the caller is not going to call Finish(), it must call Abandon()
+ // before destroying this builder.
+ // REQUIRES: Finish(), Abandon() have not been called
+ void Abandon();
+
+ // Number of calls to Add() so far.
+ uint64_t NumEntries() const;
+
+ // Size of the file generated so far. If invoked after a successful
+ // Finish() call, returns the size of the final generated file.
+ uint64_t FileSize() const;
+
+ private:
+ bool ok() const { return status().ok(); }
+ void WriteBlock(BlockBuilder* block, BlockHandle* handle);
+
+ struct Rep;
+ Rep* rep_;
+
+ // No copying allowed
+ TableBuilder(const TableBuilder&);
+ void operator=(const TableBuilder&);
+};
+
+}
+
+#endif // STORAGE_LEVELDB_INCLUDE_TABLE_BUILDER_H_
diff --git a/include/write_batch.h b/include/write_batch.h
new file mode 100644
index 0000000..3411952
--- /dev/null
+++ b/include/write_batch.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch holds a collection of updates to apply atomically to a DB.
+//
+// The updates are applied in the order in which they are added
+// to the WriteBatch. For example, the value of "key" will be "v3"
+// after the following batch is written:
+//
+// batch.Put("key", "v1");
+// batch.Delete("key");
+// batch.Put("key", "v2");
+// batch.Put("key", "v3");
+
+#ifndef STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
+#define STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
+
+#include <string>
+
+namespace leveldb {
+
+class Slice;
+
+// Holds a collection of Put()/Delete() updates, kept in rep_, that are
+// applied in the order in which they were added.
+class WriteBatch {
+ public:
+ WriteBatch();
+ ~WriteBatch();
+
+ // Store the mapping "key->value" in the database.
+ void Put(const Slice& key, const Slice& value);
+
+ // If the database contains a mapping for "key", erase it. Else do nothing.
+ void Delete(const Slice& key);
+
+ // Clear all updates buffered in this batch.
+ void Clear();
+
+ private:
+ // WriteBatchInternal is granted access to rep_.
+ friend class WriteBatchInternal;
+
+ std::string rep_; // See comment in write_batch.cc for the format of rep_
+
+ // Intentionally copyable: no copy constructor or assignment operator is
+ // declared, so the compiler-generated ones copy rep_.
+};
+
+}
+
+#endif // STORAGE_LEVELDB_INCLUDE_WRITE_BATCH_H_
diff --git a/leveldb.gyp b/leveldb.gyp
new file mode 100644
index 0000000..81dd523
--- /dev/null
+++ b/leveldb.gyp
@@ -0,0 +1,329 @@
+# Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+{
+ 'variables': {
+ 'use_snappy%': 0,
+ },
+ 'target_defaults': {
+ 'defines': [
+ 'LEVELDB_PLATFORM_CHROMIUM=1',
+ ],
+ 'include_dirs': [
+ # MOE:begin_strip
+ '../..',
+ # MOE:end_strip_and_replace '.',
+ ],
+ 'conditions': [
+ ['OS == "win"', {
+ 'include_dirs': [
+ 'port/win',
+ ],
+ }],
+ ['use_snappy', {
+ 'defines': [
+ 'USE_SNAPPY=1',
+ ],
+ }],
+ ],
+ },
+ 'targets': [
+ {
+ 'target_name': 'leveldb',
+ 'type': '<(library)',
+ 'dependencies': [
+ # The base library is a lightweight abstraction layer for things like
+ # threads and IO. http://src.chromium.org/viewvc/chrome/trunk/src/base/
+ # MOE:begin_strip
+ '../../../../base/base.gyp:base',
+ # MOE:end_strip_and_replace '../../base/base.gyp:base',
+ ],
+ 'conditions': [
+ ['use_snappy', {
+ 'dependencies': [
+ '../../../../third_party/snappy/snappy.gyp:snappy',
+ ],
+ }],
+ ],
+ 'sources': [
+ # Include and then exclude so that all files show up in IDEs, even if
+ # they don't build.
+ 'db/builder.cc',
+ 'db/builder.h',
+ 'db/db_impl.cc',
+ 'db/db_impl.h',
+ 'db/db_iter.cc',
+ 'db/db_iter.h',
+ 'db/filename.cc',
+ 'db/filename.h',
+ 'db/dbformat.cc',
+ 'db/dbformat.h',
+ 'db/log_format.h',
+ 'db/log_reader.cc',
+ 'db/log_reader.h',
+ 'db/log_writer.cc',
+ 'db/log_writer.h',
+ 'db/memtable.cc',
+ 'db/memtable.h',
+ 'db/repair.cc',
+ 'db/skiplist.h',
+ 'db/snapshot.h',
+ 'db/table_cache.cc',
+ 'db/table_cache.h',
+ 'db/version_edit.cc',
+ 'db/version_edit.h',
+ 'db/version_set.cc',
+ 'db/version_set.h',
+ 'db/write_batch.cc',
+ 'db/write_batch_internal.h',
+ 'include/cache.h',
+ 'include/comparator.h',
+ 'include/db.h',
+ 'include/env.h',
+ 'include/iterator.h',
+ 'include/options.h',
+ 'include/slice.h',
+ 'include/status.h',
+ 'include/table.h',
+ 'include/table_builder.h',
+ 'include/write_batch.h',
+ 'port/port.h',
+ 'port/port_chromium.cc',
+ 'port/port_chromium.h',
+ 'port/port_example.h',
+ 'port/port_posix.cc',
+ 'port/port_posix.h',
+ 'port/sha1_portable.cc',
+ 'port/sha1_portable.h',
+ 'table/block.cc',
+ 'table/block.h',
+ 'table/block_builder.cc',
+ 'table/block_builder.h',
+ 'table/format.cc',
+ 'table/format.h',
+ 'table/iterator.cc',
+ 'table/iterator_wrapper.h',
+ 'table/merger.cc',
+ 'table/merger.h',
+ 'table/table.cc',
+ 'table/table_builder.cc',
+ 'table/two_level_iterator.cc',
+ 'table/two_level_iterator.h',
+ 'util/arena.cc',
+ 'util/arena.h',
+ 'util/cache.cc',
+ 'util/coding.cc',
+ 'util/coding.h',
+ 'util/comparator.cc',
+ 'util/crc32c.cc',
+ 'util/crc32c.h',
+ 'util/env.cc',
+ 'util/env_chromium.cc',
+ 'util/env_posix.cc',
+ 'util/hash.cc',
+ 'util/hash.h',
+ 'util/logging.cc',
+ 'util/logging.h',
+ 'util/mutexlock.h',
+ 'util/options.cc',
+ 'util/random.h',
+ 'util/status.cc',
+ ],
+ 'sources/': [
+ ['exclude', '_(android|example|portable|posix)\\.cc$'],
+ ],
+ },
+ {
+ 'target_name': 'leveldb_testutil',
+ 'type': '<(library)',
+ 'dependencies': [
+ # MOE:begin_strip
+ '../../../../base/base.gyp:base',
+ # MOE:end_strip_and_replace '../../base/base.gyp:base',
+ 'leveldb',
+ ],
+ 'export_dependent_settings': [
+ # The tests use include directories from these projects.
+ # MOE:begin_strip
+ '../../../../base/base.gyp:base',
+ # MOE:end_strip_and_replace '../../base/base.gyp:base',
+ 'leveldb',
+ ],
+ 'sources': [
+ 'util/histogram.cc',
+ 'util/histogram.h',
+ 'util/testharness.cc',
+ 'util/testharness.h',
+ 'util/testutil.cc',
+ 'util/testutil.h',
+ ],
+ },
+ {
+ 'target_name': 'leveldb_arena_test',
+ 'type': 'executable',
+ 'dependencies': [
+ 'leveldb_testutil',
+ ],
+ 'sources': [
+ 'util/arena_test.cc',
+ ],
+ },
+ {
+ 'target_name': 'leveldb_cache_test',
+ 'type': 'executable',
+ 'dependencies': [
+ 'leveldb_testutil',
+ ],
+ 'sources': [
+ 'util/cache_test.cc',
+ ],
+ },
+ {
+ 'target_name': 'leveldb_coding_test',
+ 'type': 'executable',
+ 'dependencies': [
+ 'leveldb_testutil',
+ ],
+ 'sources': [
+ 'util/coding_test.cc',
+ ],
+ },
+ {
+ 'target_name': 'leveldb_corruption_test',
+ 'type': 'executable',
+ 'dependencies': [
+ 'leveldb_testutil',
+ ],
+ 'sources': [
+ 'db/corruption_test.cc',
+ ],
+ },
+ {
+ 'target_name': 'leveldb_crc32c_test',
+ 'type': 'executable',
+ 'dependencies': [
+ 'leveldb_testutil',
+ ],
+ 'sources': [
+ 'util/crc32c_test.cc',
+ ],
+ },
+ {
+ 'target_name': 'leveldb_db_bench',
+ 'type': 'executable',
+ 'dependencies': [
+ 'leveldb_testutil',
+ ],
+ 'sources': [
+ 'db/db_bench.cc',
+ ],
+ },
+ {
+ 'target_name': 'leveldb_db_test',
+ 'type': 'executable',
+ 'dependencies': [
+ 'leveldb_testutil',
+ ],
+ 'sources': [
+ 'db/db_test.cc',
+ ],
+ },
+ {
+ 'target_name': 'leveldb_dbformat_test',
+ 'type': 'executable',
+ 'dependencies': [
+ 'leveldb_testutil',
+ ],
+ 'sources': [
+ 'db/dbformat_test.cc',
+ ],
+ },
+ {
+ 'target_name': 'leveldb_env_test',
+ 'type': 'executable',
+ 'dependencies': [
+ 'leveldb_testutil',
+ ],
+ 'sources': [
+ 'util/env_test.cc',
+ ],
+ },
+ {
+ 'target_name': 'leveldb_filename_test',
+ 'type': 'executable',
+ 'dependencies': [
+ 'leveldb_testutil',
+ ],
+ 'sources': [
+ 'db/filename_test.cc',
+ ],
+ },
+ {
+ 'target_name': 'leveldb_log_test',
+ 'type': 'executable',
+ 'dependencies': [
+ 'leveldb_testutil',
+ ],
+ 'sources': [
+ 'db/log_test.cc',
+ ],
+ },
+ {
+ 'target_name': 'leveldb_sha1_test',
+ 'type': 'executable',
+ 'dependencies': [
+ 'leveldb_testutil',
+ ],
+ 'sources': [
+ 'port/sha1_test.cc',
+ ],
+ },
+ {
+ 'target_name': 'leveldb_skiplist_test',
+ 'type': 'executable',
+ 'dependencies': [
+ 'leveldb_testutil',
+ ],
+ 'sources': [
+ 'db/skiplist_test.cc',
+ ],
+ },
+ {
+ 'target_name': 'leveldb_table_test',
+ 'type': 'executable',
+ 'dependencies': [
+ 'leveldb_testutil',
+ ],
+ 'sources': [
+ 'table/table_test.cc',
+ ],
+ },
+ {
+ 'target_name': 'leveldb_version_edit_test',
+ 'type': 'executable',
+ 'dependencies': [
+ 'leveldb_testutil',
+ ],
+ 'sources': [
+ 'db/version_edit_test.cc',
+ ],
+ },
+ {
+ 'target_name': 'leveldb_write_batch_test',
+ 'type': 'executable',
+ 'dependencies': [
+ 'leveldb_testutil',
+ ],
+ 'sources': [
+ 'db/write_batch_test.cc',
+ ],
+ },
+ ],
+}
+
+# Local Variables:
+# tab-width:2
+# indent-tabs-mode:nil
+# End:
+# vim: set expandtab tabstop=2 shiftwidth=2:
diff --git a/port/README b/port/README
new file mode 100644
index 0000000..422563e
--- /dev/null
+++ b/port/README
@@ -0,0 +1,10 @@
+This directory contains interfaces and implementations that isolate the
+rest of the package from platform details.
+
+Code in the rest of the package includes "port.h" from this directory.
+"port.h" in turn includes a platform specific "port_<platform>.h" file
+that provides the platform specific implementation.
+
+See port_posix.h for an example of what must be provided in a platform
+specific header file.
+
diff --git a/port/port.h b/port/port.h
new file mode 100644
index 0000000..816826b
--- /dev/null
+++ b/port/port.h
@@ -0,0 +1,21 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_H_
+#define STORAGE_LEVELDB_PORT_PORT_H_
+
+#include <string.h>
+
+// Include the appropriate platform specific file below. If you are
+// porting to a new platform, see "port_example.h" for documentation
+// of what the new port_<platform>.h file must provide.
+#if defined(LEVELDB_PLATFORM_POSIX)
+# include "port/port_posix.h"
+#elif defined(LEVELDB_PLATFORM_CHROMIUM)
+# include "port/port_chromium.h"
+#elif defined(LEVELDB_PLATFORM_ANDROID)
+# include "port/port_android.h"
+#endif
+
+#endif // STORAGE_LEVELDB_PORT_PORT_H_
diff --git a/port/port_android.cc b/port/port_android.cc
new file mode 100644
index 0000000..8a74111
--- /dev/null
+++ b/port/port_android.cc
@@ -0,0 +1,65 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "port/port_android.h"
+
+#include <cstdlib>
+
+// C-linkage definitions of fread_unlocked(), fwrite_unlocked(),
+// fflush_unlocked() and fdatasync(). Each one simply forwards its arguments
+// to fread(), fwrite(), fflush() and fsync() respectively.
+// NOTE(review): presumably needed because Android's libc lacks them -- confirm.
+extern "C" {
+size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d) {
+ return fread(a, b, c, d);
+}
+
+size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d) {
+ return fwrite(a, b, c, d);
+}
+
+int fflush_unlocked(FILE *f) {
+ return fflush(f);
+}
+
+// fsync() is used as the stand-in for fdatasync().
+int fdatasync(int fd) {
+ return fsync(fd);
+}
+}
+
+// TODO(gabor): This is copied from port_posix.cc - not sure if I should do this?
+namespace leveldb {
+namespace port {
+
+// Checks the return value of a pthread call. A result of 0 is success and
+// returns immediately; any other value is reported on stderr, prefixed with
+// "label" and decoded by strerror(), and then the process is aborted.
+static void PthreadCall(const char* label, int result) {
+  if (result == 0) return;
+  fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
+  abort();
+}
+
+Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); }
+Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); }
+void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); }
+void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); }
+
+CondVar::CondVar(Mutex* mu)
+ : mu_(mu) {
+ PthreadCall("init cv", pthread_cond_init(&cv_, NULL));
+}
+
+CondVar::~CondVar() {
+ PthreadCall("destroy cv", pthread_cond_destroy(&cv_));
+}
+
+void CondVar::Wait() {
+ PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_));
+}
+
+void CondVar::Signal(){
+ PthreadCall("signal", pthread_cond_signal(&cv_));
+}
+
+void CondVar::SignalAll() {
+ PthreadCall("broadcast", pthread_cond_broadcast(&cv_));
+}
+
+}
+}
diff --git a/port/port_android.h b/port/port_android.h
new file mode 100644
index 0000000..2770a0c
--- /dev/null
+++ b/port/port_android.h
@@ -0,0 +1,131 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// See port_example.h for documentation for the following types/functions.
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_ANDROID_H_
+#define STORAGE_LEVELDB_PORT_PORT_ANDROID_H_
+
+#include <endian.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <sha1.h>
+#include <cstdatomic>
+#include <string>
+#include <cctype>
+
+extern "C" {
+ size_t fread_unlocked(void *a, size_t b, size_t c, FILE *d);
+ size_t fwrite_unlocked(const void *a, size_t b, size_t c, FILE *d);
+ int fflush_unlocked(FILE *f);
+ int fdatasync (int fd);
+}
+
+namespace leveldb {
+namespace port {
+
+static const bool kLittleEndian = __BYTE_ORDER == __LITTLE_ENDIAN;
+
+class CondVar;
+
+class Mutex {
+ public:
+ Mutex();
+ ~Mutex();
+
+ void Lock();
+ void Unlock();
+ void AssertHeld() {
+ //TODO(gabor): How can I implement this?
+ }
+
+ private:
+ friend class CondVar;
+ pthread_mutex_t mu_;
+
+ // No copying
+ Mutex(const Mutex&);
+ void operator=(const Mutex&);
+};
+
+class CondVar {
+ public:
+ explicit CondVar(Mutex* mu);
+ ~CondVar();
+ void Wait();
+ void Signal();
+ void SignalAll();
+ private:
+ Mutex* mu_;
+ pthread_cond_t cv_;
+};
+
+// Storage for a lock-free pointer
+class AtomicPointer {
+ private:
+ std::atomic<void*> rep_;
+ public:
+ AtomicPointer() { }
+ explicit AtomicPointer(void* v) : rep_(v) { }
+ inline void* Acquire_Load() const {
+ return rep_.load(std::memory_order_acquire);
+ }
+ inline void Release_Store(void* v) {
+ rep_.store(v, std::memory_order_release);
+ }
+ inline void* NoBarrier_Load() const {
+ return rep_.load(std::memory_order_relaxed);
+ }
+ inline void NoBarrier_Store(void* v) {
+ rep_.store(v, std::memory_order_relaxed);
+ }
+};
+
+/**
+ * TODO(gabor): Implement actual compress
+ * This is a placeholder - it just copies input[0,input_length-1] into
+ * *output, replacing any previous contents. No actual compression occurs.
+ */
+inline void Lightweight_Compress(
+    const char* input,
+    size_t input_length,
+    std::string* output) {
+  // std::string::copy() copies *out of* the string into the given buffer, so
+  // the previous call never filled *output (and threw std::out_of_range when
+  // input_length exceeded output->size()). assign() is the intended copy-in.
+  output->assign(input, input_length);
+}
+
+/**
+ * TODO(gabor): Implement actual uncompress
+ * This is a placeholder - it just copies input_data[0,input_length-1] into
+ * *output, replacing any previous contents, and always reports success.
+ * No actual uncompression occurs.
+ */
+inline bool Lightweight_Uncompress(
+    const char* input_data,
+    size_t input_length,
+    std::string* output) {
+  // std::string::copy() copies *out of* the string into the given buffer, so
+  // the previous call never filled *output (and threw std::out_of_range when
+  // input_length exceeded output->size()). assign() is the intended copy-in.
+  output->assign(input_data, input_length);
+  return true;
+}
+
+inline void SHA1_Hash(const char* data, size_t len, char* hash_array) {
+ SHA1_CTX sha1_ctx;
+ SHA1Init(&sha1_ctx);
+ SHA1Update(&sha1_ctx, (const u_char*)data, len);
+ SHA1Final((u_char*)hash_array, &sha1_ctx);
+}
+
+// Returns an integer derived from the calling thread's pthread_t: the first
+// min(sizeof(pthread_t), sizeof(uint64_t)) bytes of the handle are copied
+// into a zero-initialized uint64_t.
+inline uint64_t ThreadIdentifier() {
+  const pthread_t self = pthread_self();
+  uint64_t id = 0;
+  const size_t nbytes = (sizeof(self) < sizeof(id)) ? sizeof(self) : sizeof(id);
+  memcpy(&id, &self, nbytes);
+  return id;
+}
+
+inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
+ return false;
+}
+
+}
+}
+
+#endif // STORAGE_LEVELDB_PORT_PORT_ANDROID_H_
diff --git a/port/port_chromium.cc b/port/port_chromium.cc
new file mode 100644
index 0000000..c022ec4
--- /dev/null
+++ b/port/port_chromium.cc
@@ -0,0 +1,83 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "port/port_chromium.h"
+
+#include "util/logging.h"
+
+#if defined(USE_SNAPPY)
+# include "third_party/snappy/src/snappy.h"
+# include "third_party/snappy/src/snappy-stubs.h"
+#endif
+
+namespace leveldb {
+namespace port {
+
+Mutex::Mutex() {
+}
+
+Mutex::~Mutex() {
+}
+
+void Mutex::Lock() {
+ mu_.Acquire();
+}
+
+void Mutex::Unlock() {
+ mu_.Release();
+}
+
+void Mutex::AssertHeld() {
+ mu_.AssertAcquired();
+}
+
+CondVar::CondVar(Mutex* mu)
+ : cv_(&mu->mu_) {
+}
+
+CondVar::~CondVar() { }
+
+void CondVar::Wait() {
+ cv_.Wait();
+}
+
+void CondVar::Signal(){
+ cv_.Signal();
+}
+
+void CondVar::SignalAll() {
+ cv_.Broadcast();
+}
+
+void Lightweight_Compress(const char* input, size_t input_length,
+ std::string* output) {
+#if defined(USE_SNAPPY)
+ output->resize(snappy::MaxCompressedLength(input_length));
+ size_t outlen;
+ snappy::RawCompress(snappy::StringPiece(input, input_length),
+ &(*output)[0], &outlen);
+ output->resize(outlen);
+#else
+ output->assign(input, input_length);
+#endif
+}
+
+bool Lightweight_Uncompress(const char* input_data, size_t input_length,
+ std::string* output) {
+#if defined(USE_SNAPPY)
+ snappy::StringPiece input(input_data, input_length);
+ size_t ulength;
+ if (!snappy::GetUncompressedLength(input, &ulength)) {
+ return false;
+ }
+ output->resize(ulength);
+ return snappy::RawUncompress(input, &(*output)[0]);
+#else
+ output->assign(input_data, input_length);
+ return true;
+#endif
+}
+
+}
+}
diff --git a/port/port_chromium.h b/port/port_chromium.h
new file mode 100644
index 0000000..b33bdde
--- /dev/null
+++ b/port/port_chromium.h
@@ -0,0 +1,104 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// See port_example.h for documentation for the following types/functions.
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_
+#define STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_
+
+#include <stdint.h>
+#include <string>
+#include <cstring>
+#include "base/atomicops.h"
+#include "base/basictypes.h"
+#include "base/logging.h"
+#include "base/sha1.h"
+#include "base/synchronization/condition_variable.h"
+#include "base/synchronization/lock.h"
+
+// Linux's ThreadIdentifier() needs this.
+#if defined(OS_LINUX)
+# include <linux/unistd.h>
+#endif
+
+#if defined(OS_WIN)
+#define snprintf _snprintf
+#define va_copy(a, b) do { (a) = (b); } while (0)
+#endif
+
+namespace leveldb {
+namespace port {
+
+// Chromium only supports little endian.
+static const bool kLittleEndian = true;
+
+class Mutex {
+ public:
+ Mutex();
+ ~Mutex();
+ void Lock();
+ void Unlock();
+ void AssertHeld();
+
+ private:
+ base::Lock mu_;
+
+ friend class CondVar;
+ DISALLOW_COPY_AND_ASSIGN(Mutex);
+};
+
+class CondVar {
+ public:
+ explicit CondVar(Mutex* mu);
+ ~CondVar();
+ void Wait();
+ void Signal();
+ void SignalAll();
+
+ private:
+ base::ConditionVariable cv_;
+
+ DISALLOW_COPY_AND_ASSIGN(CondVar);
+};
+
+class AtomicPointer {
+ private:
+ typedef base::subtle::AtomicWord Rep;
+ Rep rep_;
+ public:
+ AtomicPointer() { }
+ explicit AtomicPointer(void* p) : rep_(reinterpret_cast<Rep>(p)) {}
+ inline void* Acquire_Load() const {
+ return reinterpret_cast<void*>(::base::subtle::Acquire_Load(&rep_));
+ }
+ inline void Release_Store(void* v) {
+ ::base::subtle::Release_Store(&rep_, reinterpret_cast<Rep>(v));
+ }
+ inline void* NoBarrier_Load() const {
+ return reinterpret_cast<void*>(::base::subtle::NoBarrier_Load(&rep_));
+ }
+ inline void NoBarrier_Store(void* v) {
+ ::base::subtle::NoBarrier_Store(&rep_, reinterpret_cast<Rep>(v));
+ }
+};
+
+inline void SHA1_Hash(const char* data, size_t len, char* hash_array) {
+ return ::base::SHA1HashBytes(reinterpret_cast<const unsigned char*>(data),
+ len,
+ reinterpret_cast<unsigned char*>(hash_array));
+}
+
+void Lightweight_Compress(const char* input, size_t input_length,
+ std::string* output);
+bool Lightweight_Uncompress(const char* input_data, size_t input_length,
+ std::string* output);
+
+inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
+ return false;
+}
+
+}
+}
+
+#endif // STORAGE_LEVELDB_PORT_PORT_CHROMIUM_H_
diff --git a/port/port_example.h b/port/port_example.h
new file mode 100644
index 0000000..ee25a01
--- /dev/null
+++ b/port/port_example.h
@@ -0,0 +1,119 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// This file contains the specification, but not the implementations,
+// of the types/operations/etc. that should be defined by a platform
+// specific port_<platform>.h file. Use this file as a reference for
+// how to port this package to a new platform.
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_
+#define STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_
+
+namespace leveldb {
+namespace port {
+
+// TODO(jorlow): Many of these belong more in the environment class rather than
+// here. We should try moving them and see if it affects perf.
+
+// The following boolean constant must be true on a little-endian machine
+// and false otherwise.
+static const bool kLittleEndian = true /* or some other expression */;
+
+// ------------------ Threading -------------------
+
+// A Mutex represents an exclusive lock.
+class Mutex {
+ public:
+ Mutex();
+ ~Mutex();
+
+ // Lock the mutex. Waits until other lockers have exited.
+ // Will deadlock if the mutex is already locked by this thread.
+ void Lock();
+
+ // Unlock the mutex.
+ // REQUIRES: This mutex was locked by this thread.
+ void Unlock();
+
+ // Optionally crash if this thread does not hold this mutex.
+ // The implementation must be fast, especially if NDEBUG is
+ // defined. The implementation is allowed to skip all checks.
+ void AssertHeld();
+};
+
+// A CondVar is a condition variable associated with the Mutex passed to
+// its constructor.
+class CondVar {
+ public:
+  explicit CondVar(Mutex* mu);
+  ~CondVar();
+
+  // Atomically release *mu and block on this condition variable until
+  // either a call to SignalAll(), or a call to Signal() that picks
+  // this thread to wakeup.
+  // REQUIRES: this thread holds *mu
+  void Wait();
+
+  // If there are some threads waiting, wake up at least one of them.
+  void Signal();
+
+  // Wake up all waiting threads.
+  // Spelled SignalAll, the name the Wait() comment above refers to.
+  void SignalAll();
+};
+
+// A type that holds a pointer that can be read or written atomically
+// (i.e., without word-tearing.)
+class AtomicPointer {
+ private:
+  intptr_t rep_;
+ public:
+  // Initialize to arbitrary value
+  AtomicPointer();
+
+  // Initialize to hold v. rep_ is an integer, so the pointer must be
+  // converted explicitly: a void* does not convert to intptr_t implicitly.
+  explicit AtomicPointer(void* v) : rep_(reinterpret_cast<intptr_t>(v)) { }
+
+  // Read and return the stored pointer with the guarantee that no
+  // later memory access (read or write) by this thread can be
+  // reordered ahead of this read.
+  void* Acquire_Load() const;
+
+  // Set v as the stored pointer with the guarantee that no earlier
+  // memory access (read or write) by this thread can be reordered
+  // after this store.
+  void Release_Store(void* v);
+
+  // Read the stored pointer with no ordering guarantees.
+  void* NoBarrier_Load() const;
+
+  // Set v as the stored pointer with no ordering guarantees.
+  void NoBarrier_Store(void* v);
+};
+
+// ------------------ Checksumming -------------------
+
+// Store a 160-bit hash of "data[0..len-1]" in "hash_array[0]..hash_array[19]"
+extern void SHA1_Hash(const char* data, size_t len, char* hash_array);
+
+// ------------------ Compression -------------------
+
+// Store the lightweight compression of "input[0,input_length-1]" in *output.
+extern void Lightweight_Compress(const char* input, size_t input_length,
+ std::string* output);
+
+// Attempt to lightweight uncompress input[0,input_length-1] into *output.
+// Returns true if successful, false if the input is invalid lightweight
+// compressed data.
+extern bool Lightweight_Uncompress(const char* input_data, size_t input_length,
+ std::string* output);
+
+// ------------------ Miscellaneous -------------------
+
+// If heap profiling is not supported, returns false.
+// Else repeatedly calls (*func)(arg, data, n) and then returns true.
+// The concatenation of all "data[0,n-1]" fragments is the heap profile.
+extern bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg);
+
+}
+}
+
+#endif // STORAGE_LEVELDB_PORT_PORT_EXAMPLE_H_
diff --git a/port/port_posix.cc b/port/port_posix.cc
new file mode 100644
index 0000000..e75da8b
--- /dev/null
+++ b/port/port_posix.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "port/port_posix.h"
+
+#include <cstdlib>
+#include <stdio.h>
+#include <string.h>
+#include "util/logging.h"
+
+namespace leveldb {
+namespace port {
+
+// Validates the return code of a pthread function. Zero means success and
+// the helper returns at once; a non-zero code is printed to stderr together
+// with "label" and its strerror() text, after which the process aborts.
+static void PthreadCall(const char* label, int result) {
+  if (result == 0) return;
+  fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
+  abort();
+}
+
+Mutex::Mutex() { PthreadCall("init mutex", pthread_mutex_init(&mu_, NULL)); }
+
+Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); }
+
+void Mutex::Lock() { PthreadCall("lock", pthread_mutex_lock(&mu_)); }
+
+void Mutex::Unlock() { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); }
+
+CondVar::CondVar(Mutex* mu)
+ : mu_(mu) {
+ PthreadCall("init cv", pthread_cond_init(&cv_, NULL));
+}
+
+CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); }
+
+void CondVar::Wait() {
+ PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_));
+}
+
+void CondVar::Signal() {
+ PthreadCall("signal", pthread_cond_signal(&cv_));
+}
+
+void CondVar::SignalAll() {
+ PthreadCall("broadcast", pthread_cond_broadcast(&cv_));
+}
+
+}
+}
diff --git a/port/port_posix.h b/port/port_posix.h
new file mode 100644
index 0000000..e7bc5b8
--- /dev/null
+++ b/port/port_posix.h
@@ -0,0 +1,108 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// See port_example.h for documentation for the following types/functions.
+
+#ifndef STORAGE_LEVELDB_PORT_PORT_POSIX_H_
+#define STORAGE_LEVELDB_PORT_PORT_POSIX_H_
+
+#include <endian.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <string>
+#include <cstdatomic>
+#include <cstring>
+#include "port/sha1_portable.h"
+
+namespace leveldb {
+namespace port {
+
+static const bool kLittleEndian = (__BYTE_ORDER == __LITTLE_ENDIAN);
+
+class CondVar;
+
+class Mutex {
+ public:
+ Mutex();
+ ~Mutex();
+
+ void Lock();
+ void Unlock();
+ void AssertHeld() { }
+
+ private:
+ friend class CondVar;
+ pthread_mutex_t mu_;
+
+ // No copying
+ Mutex(const Mutex&);
+ void operator=(const Mutex&);
+};
+
+class CondVar {
+ public:
+ explicit CondVar(Mutex* mu);
+ ~CondVar();
+ void Wait();
+ void Signal();
+ void SignalAll();
+ private:
+ pthread_cond_t cv_;
+ Mutex* mu_;
+};
+
+// Storage for a lock-free pointer
+class AtomicPointer {
+ private:
+ std::atomic<void*> rep_;
+ public:
+ AtomicPointer() { }
+ explicit AtomicPointer(void* v) : rep_(v) { }
+ inline void* Acquire_Load() const {
+ return rep_.load(std::memory_order_acquire);
+ }
+ inline void Release_Store(void* v) {
+ rep_.store(v, std::memory_order_release);
+ }
+ inline void* NoBarrier_Load() const {
+ return rep_.load(std::memory_order_relaxed);
+ }
+ inline void NoBarrier_Store(void* v) {
+ rep_.store(v, std::memory_order_relaxed);
+ }
+};
+
+inline void SHA1_Hash(const char* data, size_t len, char* hash_array) {
+ SHA1_Hash_Portable(data, len, hash_array);
+}
+
+/**
+ * TODO(gabor): Implement actual compress
+ * This is a hack - it just copies input to output.
+ * No actual compression occurs.
+ */
+inline void Lightweight_Compress(const char* input, size_t input_length,
+ std::string* output) {
+ output->assign(input, input_length);
+}
+
+/**
+ * TODO(gabor): Implement actual uncompress
+ * This is a hack - it just copies input to output.
+ * No actual uncompression occurs.
+ */
+inline bool Lightweight_Uncompress(const char* input_data, size_t input_length,
+ std::string* output) {
+ output->assign(input_data, input_length);
+ return true;
+}
+
+inline bool GetHeapProfile(void (*func)(void*, const char*, int), void* arg) {
+ return false;
+}
+
+}
+}
+
+#endif // STORAGE_LEVELDB_PORT_PORT_POSIX_H_
diff --git a/port/sha1_portable.cc b/port/sha1_portable.cc
new file mode 100644
index 0000000..8fa7277
--- /dev/null
+++ b/port/sha1_portable.cc
@@ -0,0 +1,298 @@
+// Portions copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// This module provides a slow but portable implementation of
+// the SHA1 hash function.
+//
+// It is adapted from free code written by Paul E. Jones
+// <paulej@packetizer.com>. See http://www.packetizer.com/security/sha1/
+//
+// The license for the original code is:
+/*
+ Copyright (C) 1998, 2009
+ Paul E. Jones <paulej@packetizer.com>
+
+ Freeware Public License (FPL)
+
+ This software is licensed as "freeware." Permission to distribute
+ this software in source and binary forms, including incorporation
+ into other products, is hereby granted without a fee. THIS SOFTWARE
+ IS PROVIDED 'AS IS' AND WITHOUT ANY EXPRESSED OR IMPLIED WARRANTIES,
+ INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ AND FITNESS FOR A PARTICULAR PURPOSE. THE AUTHOR SHALL NOT BE HELD
+ LIABLE FOR ANY DAMAGES RESULTING FROM THE USE OF THIS SOFTWARE, EITHER
+ DIRECTLY OR INDIRECTLY, INCLUDING, BUT NOT LIMITED TO, LOSS OF DATA
+ OR DATA BEING RENDERED INACCURATE.
+*/
+
+#include "port/sha1_portable.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+namespace leveldb {
+namespace port {
+
+/*
+ * Description:
+ * This class implements the Secure Hashing Standard as defined
+ * in FIPS PUB 180-1 published April 17, 1995.
+ */
+
+/*
+ * This structure will hold context information for the hashing
+ * operation
+ */
+typedef struct SHA1Context {
+ unsigned Message_Digest[5]; /* Message Digest (output) */
+
+ unsigned Length_Low; /* Message length in bits (low 32 bits) */
+ unsigned Length_High; /* Message length in bits (high 32 bits) */
+
+ unsigned char Message_Block[64]; /* 512-bit message blocks */
+ int Message_Block_Index; /* Index into message block array */
+
+ bool Computed; /* Is the digest computed? */
+ bool Corrupted; /* Is the message digest corrupted? */
+} SHA1Context;
+
+/*
+ * Portability Issues:
+ * SHA-1 is defined in terms of 32-bit "words". This code was
+ * written with the expectation that the processor has at least
+ * a 32-bit machine word size. If the machine word size is larger,
+ * the code should still function properly. One caveat to that
+ * is that the input functions taking characters and character
+ * arrays assume that only 8 bits of information are stored in each
+ * character.
+ */
+
+/*
+ * Define the circular shift macro
+ *
+ * Rotates "word" left by "bits" within 32 bits. The left-shifted half is
+ * masked to 32 bits; the right-shifted half is not, so "word" must not
+ * have any bits set above bit 31.
+ */
+#define SHA1CircularShift(bits,word) \
+ ((((word) << (bits)) & 0xFFFFFFFF) | \
+ ((word) >> (32-(bits))))
+
+/* Function prototypes */
+static void SHA1ProcessMessageBlock(SHA1Context *);
+static void SHA1PadMessage(SHA1Context *);
+
+// Put *context into its starting state so a fresh message digest can be
+// computed: the digest words get their initial constants, the bit length
+// and block index are zeroed, and both status flags are cleared.
+static void SHA1Reset(SHA1Context* context) {
+  static const unsigned kInitialDigest[5] = {
+    0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0
+  };
+  for (int word = 0; word < 5; word++) {
+    context->Message_Digest[word] = kInitialDigest[word];
+  }
+
+  context->Message_Block_Index = 0;
+  context->Length_High = 0;
+  context->Length_Low = 0;
+
+  context->Corrupted = false;
+  context->Computed = false;
+}
+
+// Finalize the digest held in context->Message_Digest.
+// Returns false if the context is marked corrupted. Otherwise pads and
+// processes the remaining input on the first call, marks the context as
+// computed, and returns true.
+static bool SHA1Result(SHA1Context *context) {
+  if (context->Corrupted) return false;
+  if (context->Computed) return true;
+
+  SHA1PadMessage(context);
+  context->Computed = true;
+  return true;
+}
+
+// This function accepts an array of bytes as the next portion of
+// the message.
+//
+// "length" is a size_t so that callers holding a size_t byte count (such
+// as SHA1_Hash_Portable) do not have it silently truncated to 32 bits.
+//
+// If the digest has already been computed, or the context is already
+// corrupted, the context is marked corrupted and no input is consumed.
+// The context also becomes corrupted if the 64-bit message bit length
+// overflows.
+static void SHA1Input(SHA1Context *context,
+                      const unsigned char *message_array,
+                      size_t length) {
+  if (!length) return;
+
+  if (context->Computed || context->Corrupted) {
+    context->Corrupted = true;
+    return;
+  }
+
+  while (length-- && !context->Corrupted) {
+    context->Message_Block[context->Message_Block_Index++] =
+        (*message_array & 0xFF);
+
+    context->Length_Low += 8;
+    /* Force it to 32 bits */
+    context->Length_Low &= 0xFFFFFFFF;
+    if (context->Length_Low == 0) {
+      // Low word wrapped: carry into the high word.
+      context->Length_High++;
+      /* Force it to 32 bits */
+      context->Length_High &= 0xFFFFFFFF;
+      if (context->Length_High == 0) {
+        /* Message is too long */
+        context->Corrupted = true;
+      }
+    }
+
+    // A full 64-byte block is buffered: fold it into the digest.
+    if (context->Message_Block_Index == 64) {
+      SHA1ProcessMessageBlock(context);
+    }
+
+    message_array++;
+  }
+}
+
+// This function will process the next 512 bits of the message stored
+// in the Message_Block array.
+//
+// Expands the 64 buffered bytes into an 80-word schedule, runs the four
+// 20-step rounds, adds the result into context->Message_Digest, and
+// resets Message_Block_Index to 0.
+static void SHA1ProcessMessageBlock(SHA1Context *context) {
+ const unsigned K[] = // Constants defined in SHA-1
+ {
+ 0x5A827999,
+ 0x6ED9EBA1,
+ 0x8F1BBCDC,
+ 0xCA62C1D6
+ };
+ int t; // Loop counter
+ unsigned temp; // Temporary word value
+ unsigned W[80]; // Word sequence
+ unsigned A, B, C, D, E; // Word buffers
+
+ // Initialize the first 16 words in the array W
+ // (each word is assembled big-endian from four message bytes)
+ for(t = 0; t < 16; t++) {
+ W[t] = ((unsigned) context->Message_Block[t * 4]) << 24;
+ W[t] |= ((unsigned) context->Message_Block[t * 4 + 1]) << 16;
+ W[t] |= ((unsigned) context->Message_Block[t * 4 + 2]) << 8;
+ W[t] |= ((unsigned) context->Message_Block[t * 4 + 3]);
+ }
+
+ // Remaining words: rotate-left-by-1 of the XOR of four earlier words
+ for(t = 16; t < 80; t++) {
+ W[t] = SHA1CircularShift(1,W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]);
+ }
+
+ A = context->Message_Digest[0];
+ B = context->Message_Digest[1];
+ C = context->Message_Digest[2];
+ D = context->Message_Digest[3];
+ E = context->Message_Digest[4];
+
+ // Steps 0-19: selection function (B & C) | (~B & D), constant K[0]
+ for(t = 0; t < 20; t++) {
+ temp = SHA1CircularShift(5,A) +
+ ((B & C) | ((~B) & D)) + E + W[t] + K[0];
+ temp &= 0xFFFFFFFF;
+ E = D;
+ D = C;
+ C = SHA1CircularShift(30,B);
+ B = A;
+ A = temp;
+ }
+
+ // Steps 20-39: parity function B ^ C ^ D, constant K[1]
+ for(t = 20; t < 40; t++) {
+ temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[1];
+ temp &= 0xFFFFFFFF;
+ E = D;
+ D = C;
+ C = SHA1CircularShift(30,B);
+ B = A;
+ A = temp;
+ }
+
+ // Steps 40-59: majority function, constant K[2]
+ for(t = 40; t < 60; t++) {
+ temp = SHA1CircularShift(5,A) +
+ ((B & C) | (B & D) | (C & D)) + E + W[t] + K[2];
+ temp &= 0xFFFFFFFF;
+ E = D;
+ D = C;
+ C = SHA1CircularShift(30,B);
+ B = A;
+ A = temp;
+ }
+
+ // Steps 60-79: parity function B ^ C ^ D, constant K[3]
+ for(t = 60; t < 80; t++) {
+ temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[3];
+ temp &= 0xFFFFFFFF;
+ E = D;
+ D = C;
+ C = SHA1CircularShift(30,B);
+ B = A;
+ A = temp;
+ }
+
+ // Fold this block's result into the running digest (kept to 32 bits)
+ context->Message_Digest[0] = (context->Message_Digest[0] + A) & 0xFFFFFFFF;
+ context->Message_Digest[1] = (context->Message_Digest[1] + B) & 0xFFFFFFFF;
+ context->Message_Digest[2] = (context->Message_Digest[2] + C) & 0xFFFFFFFF;
+ context->Message_Digest[3] = (context->Message_Digest[3] + D) & 0xFFFFFFFF;
+ context->Message_Digest[4] = (context->Message_Digest[4] + E) & 0xFFFFFFFF;
+
+ // The block buffer has been consumed
+ context->Message_Block_Index = 0;
+}
+
+// According to the standard, the message must be padded to an even
+// 512 bits. The first padding bit must be a '1'. The last 64 bits
+// represent the length of the original message. All bits in between
+// should be 0. This function will pad the message according to those
+// rules by filling the Message_Block array accordingly. It will also
+// call SHA1ProcessMessageBlock() appropriately. When it returns, it
+// can be assumed that the message digest has been computed.
+static void SHA1PadMessage(SHA1Context *context) {
+ // Check to see if the current message block is too small to hold
+ // the initial padding bits and length. If so, we will pad the
+ // block, process it, and then continue padding into a second block.
+ if (context->Message_Block_Index > 55) {
+ context->Message_Block[context->Message_Block_Index++] = 0x80;
+ while(context->Message_Block_Index < 64) {
+ context->Message_Block[context->Message_Block_Index++] = 0;
+ }
+
+ // Processing resets Message_Block_Index to 0, so the loop below
+ // zero-fills the first 56 bytes of the second block.
+ SHA1ProcessMessageBlock(context);
+
+ while(context->Message_Block_Index < 56) {
+ context->Message_Block[context->Message_Block_Index++] = 0;
+ }
+ } else {
+ // Padding and length fit in the current block.
+ context->Message_Block[context->Message_Block_Index++] = 0x80;
+ while(context->Message_Block_Index < 56) {
+ context->Message_Block[context->Message_Block_Index++] = 0;
+ }
+ }
+
+ // Store the message length as the last 8 octets
+ // (big-endian: high word first, most significant byte first)
+ context->Message_Block[56] = (context->Length_High >> 24) & 0xFF;
+ context->Message_Block[57] = (context->Length_High >> 16) & 0xFF;
+ context->Message_Block[58] = (context->Length_High >> 8) & 0xFF;
+ context->Message_Block[59] = (context->Length_High) & 0xFF;
+ context->Message_Block[60] = (context->Length_Low >> 24) & 0xFF;
+ context->Message_Block[61] = (context->Length_Low >> 16) & 0xFF;
+ context->Message_Block[62] = (context->Length_Low >> 8) & 0xFF;
+ context->Message_Block[63] = (context->Length_Low) & 0xFF;
+
+ SHA1ProcessMessageBlock(context);
+}
+
+
+// Hashes data[0..len-1] with a freshly reset context and writes the five
+// 32-bit digest words to hash_array[0..19], most significant byte first.
+// If the context reports an error, prints a message to stderr and calls
+// exit(1).
+void SHA1_Hash_Portable(const char* data, size_t len, char* hash_array) {
+ SHA1Context context;
+ SHA1Reset(&context);
+ SHA1Input(&context, reinterpret_cast<const unsigned char*>(data), len);
+ bool ok = SHA1Result(&context);
+ if (!ok) {
+ fprintf(stderr, "Unexpected error in SHA1_Hash_Portable code\n");
+ exit(1);
+ }
+ // Serialize each digest word as four big-endian bytes.
+ for (int i = 0; i < 5; i++) {
+ uint32_t value = context.Message_Digest[i];
+ hash_array[i*4 + 0] = (value >> 24) & 0xff;
+ hash_array[i*4 + 1] = (value >> 16) & 0xff;
+ hash_array[i*4 + 2] = (value >> 8) & 0xff;
+ hash_array[i*4 + 3] = value & 0xff;
+ }
+}
+
+}
+}
diff --git a/port/sha1_portable.h b/port/sha1_portable.h
new file mode 100644
index 0000000..31db305
--- /dev/null
+++ b/port/sha1_portable.h
@@ -0,0 +1,25 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_
+#define STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_
+
+#include <stddef.h>
+
+namespace leveldb {
+namespace port {
+
+// Compute the SHA1 hash value of "data[0..len-1]" and store it in
+// "hash_array[0..19]". hash_array must have 20 bytes of space available.
+//
+// This function is portable but may not be as fast as a version
+// optimized for your platform. It is provided as a default method
+// that can be used when porting leveldb to a new platform if no
+// better SHA1 hash implementation is available.
+void SHA1_Hash_Portable(const char* data, size_t len, char* hash_array);
+
+}
+}
+
+#endif // STORAGE_LEVELDB_PORT_SHA1_PORTABLE_H_
diff --git a/port/sha1_test.cc b/port/sha1_test.cc
new file mode 100644
index 0000000..46bbeba
--- /dev/null
+++ b/port/sha1_test.cc
@@ -0,0 +1,55 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "port/port.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+namespace port {
+
+// Empty class whose name is used as the first argument of the TEST()
+// macros below.
+class SHA1 { };
+
+// Returns the SHA1 hash of data[0..len-1] as a 40-character lowercase
+// hexadecimal string.
+static std::string TestSHA1(const char* data, size_t len) {
+  char digest[20];
+  SHA1_Hash(data, len, digest);
+
+  static const char kHexDigits[] = "0123456789abcdef";
+  std::string hex;
+  hex.reserve(40);
+  for (int i = 0; i < 20; i++) {
+    // Go through unsigned char so bytes >= 0x80 are not sign-extended.
+    const unsigned int byte = static_cast<unsigned char>(digest[i]);
+    hex.push_back(kHexDigits[byte >> 4]);
+    hex.push_back(kHexDigits[byte & 0xf]);
+  }
+  return hex;
+}
+
+// Compares the hex digest returned by TestSHA1 against fixed expected
+// values for the empty input, "hello", and a string of 10000 'x' bytes.
+TEST(SHA1, Simple) {
+ ASSERT_EQ("da39a3ee5e6b4b0d3255bfef95601890afd80709", TestSHA1("", 0));
+ ASSERT_EQ("aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d", TestSHA1("hello", 5));
+ std::string x(10000, 'x');
+ ASSERT_EQ("f8c5cde791c5056cf515881e701c8a9ecb439a75",
+ TestSHA1(x.data(), x.size()));
+}
+
+// Times kIters SHA1_Hash calls over a 100 MiB buffer and prints the
+// throughput to stderr. The first digest byte of every run is OR-ed into
+// "sha1" and printed as a dummy value so the hash results are used.
+TEST(SHA1, Benchmark) {
+  std::string data(1048576 * 100, 'x');
+  double start = Env::Default()->NowMicros() * 1e-6;
+  static const int kIters = 10;
+  uint32_t sha1 = 0;
+  for (int i = 0; i < kIters; i++) {
+    char hash_val[20];
+    SHA1_Hash(data.data(), data.size(), hash_val);
+    // Convert through unsigned char: a plain (possibly signed) char with
+    // the top bit set would sign-extend and set the upper 24 bits.
+    sha1 |= static_cast<unsigned char>(hash_val[0]);
+  }
+  double finish = Env::Default()->NowMicros() * 1e-6;
+  double mb = (static_cast<long long int>(data.size()) * kIters) / 1048576.0;
+  fprintf(stderr, "SHA1 %0.0f MB: %.3f secs; %.1f MB/s, dummy=0x%02x\n",
+          mb, (finish - start), mb / (finish - start), sha1);
+}
+
+}
+}
+
+// Program entry point: returns the result of leveldb::test::RunAllTests().
+// argc and argv are not used.
+int main(int argc, char** argv) {
+ return leveldb::test::RunAllTests();
+}
diff --git a/table/block.cc b/table/block.cc
new file mode 100644
index 0000000..351eb48
--- /dev/null
+++ b/table/block.cc
@@ -0,0 +1,261 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Decodes the blocks generated by block_builder.cc.
+
+#include "table/block.h"
+
+#include <vector>
+#include <algorithm>
+#include "include/comparator.h"
+#include "util/coding.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+// Returns the fixed32 value stored in the last four bytes of the block,
+// which is the number of entries in the restart array.
+// Asserts that the block holds at least two uint32_t values.
+inline uint32_t Block::NumRestarts() const {
+ assert(size_ >= 2*sizeof(uint32_t));
+ return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+}
+
+// Wraps data[0..size-1] as a block and locates its restart array.
+//
+// The last four bytes hold the restart count; the restart array (one
+// fixed32 per restart) sits immediately before them. If the contents are
+// too small for the count, or for the number of restarts the count
+// claims, size_ is set to 0 as an error marker.
+Block::Block(const char* data, size_t size)
+    : data_(data),
+      size_(size),
+      restart_offset_(0) {  // Never left uninitialized, even on error
+  if (size_ < sizeof(uint32_t)) {
+    size_ = 0;  // Error marker
+    return;
+  }
+
+  // Decode the count directly: NumRestarts() asserts a larger minimum
+  // size than the check above guarantees.
+  const uint32_t num_restarts =
+      DecodeFixed32(data_ + size_ - sizeof(uint32_t));
+
+  // Validate the count before doing any arithmetic with it, so that a
+  // huge value cannot wrap around and appear to fit.
+  const size_t max_restarts_allowed =
+      (size_ - sizeof(uint32_t)) / sizeof(uint32_t);
+  if (num_restarts > max_restarts_allowed) {
+    // The size is too small for num_restarts restart entries.
+    size_ = 0;
+    return;
+  }
+
+  restart_offset_ = static_cast<uint32_t>(
+      size_ - (1 + static_cast<size_t>(num_restarts)) * sizeof(uint32_t));
+}
+
+// Releases the block contents with delete[].
+Block::~Block() {
+ delete[] data_;
+}
+
+// Helper routine: decode the next block entry starting at "p",
+// storing the number of shared key bytes, non_shared key bytes,
+// and the length of the value in "*shared", "*non_shared", and
+// "*value_length", respectively. Will not dereference past "limit".
+//
+// If any errors are detected, returns NULL. Otherwise, returns a
+// pointer to the key delta (just past the three decoded values).
+static inline const char* DecodeEntry(const char* p, const char* limit,
+                                      uint32_t* shared,
+                                      uint32_t* non_shared,
+                                      uint32_t* value_length) {
+  if (limit - p < 3) return NULL;
+  *shared = reinterpret_cast<const unsigned char*>(p)[0];
+  *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
+  *value_length = reinterpret_cast<const unsigned char*>(p)[2];
+  if ((*shared | *non_shared | *value_length) < 128) {
+    // Fast path: all three values are encoded in one byte each
+    p += 3;
+  } else {
+    if ((p = GetVarint32Ptr(p, limit, shared)) == NULL) return NULL;
+    if ((p = GetVarint32Ptr(p, limit, non_shared)) == NULL) return NULL;
+    if ((p = GetVarint32Ptr(p, limit, value_length)) == NULL) return NULL;
+  }
+
+  // Sum in 64 bits: two large 32-bit lengths from a corrupt entry could
+  // otherwise wrap around to a small value and pass the bounds check.
+  const uint64_t needed = static_cast<uint64_t>(*non_shared) + *value_length;
+  if (static_cast<uint64_t>(limit - p) < needed) return NULL;
+  return p;
+}
+
+// Iterator over the entries of one block. Keys are rebuilt in key_ from
+// the shared prefix of the previous key plus the stored delta; value_
+// points into the block data. The iterator is invalid whenever
+// current_ >= restarts_.
+class Block::Iter : public Iterator {
+ private:
+ const Comparator* const comparator_;
+ const char* const data_; // underlying block contents
+ uint32_t const restarts_; // Offset of restart array (list of fixed32)
+ uint32_t const num_restarts_; // Number of uint32_t entries in restart array
+
+ // current_ is offset in data_ of current entry. >= restarts_ if !Valid
+ uint32_t current_;
+ uint32_t restart_index_; // Index of restart block in which current_ falls
+ std::string key_;
+ Slice value_;
+ Status status_;
+
+ inline int Compare(const Slice& a, const Slice& b) const {
+ return comparator_->Compare(a, b);
+ }
+
+ // Return the offset in data_ just past the end of the current entry.
+ inline uint32_t NextEntryOffset() const {
+ return (value_.data() + value_.size()) - data_;
+ }
+
+ // Return the block offset stored in entry "index" of the restart array.
+ uint32_t GetRestartPoint(uint32_t index) {
+ assert(index < num_restarts_);
+ return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
+ }
+
+ // Position just before the entry at restart point "index"; the caller
+ // must call ParseNextKey() to actually load that entry.
+ void SeekToRestartPoint(uint32_t index) {
+ key_.clear();
+ restart_index_ = index;
+ // current_ will be fixed by ParseNextKey();
+
+ // ParseNextKey() starts at the end of value_, so set value_ accordingly
+ uint32_t offset = GetRestartPoint(index);
+ value_ = Slice(data_ + offset, 0);
+ }
+
+ public:
+ // Starts out invalid (current_ == restarts_). Requires num_restarts > 0.
+ Iter(const Comparator* comparator,
+ const char* data,
+ uint32_t restarts,
+ uint32_t num_restarts)
+ : comparator_(comparator),
+ data_(data),
+ restarts_(restarts),
+ num_restarts_(num_restarts),
+ current_(restarts_),
+ restart_index_(num_restarts_) {
+ assert(num_restarts_ > 0);
+ }
+
+ virtual bool Valid() const { return current_ < restarts_; }
+ virtual Status status() const { return status_; }
+ virtual Slice key() const {
+ assert(Valid());
+ return key_;
+ }
+ virtual Slice value() const {
+ assert(Valid());
+ return value_;
+ }
+
+ virtual void Next() {
+ assert(Valid());
+ ParseNextKey();
+ }
+
+ // Entries only decode forwards, so step back by re-scanning from the
+ // nearest restart point that lies before the current entry.
+ virtual void Prev() {
+ assert(Valid());
+
+ // Scan backwards to a restart point before current_
+ const uint32_t original = current_;
+ while (GetRestartPoint(restart_index_) >= original) {
+ if (restart_index_ == 0) {
+ // No more entries
+ current_ = restarts_;
+ restart_index_ = num_restarts_;
+ return;
+ }
+ restart_index_--;
+ }
+
+ SeekToRestartPoint(restart_index_);
+ do {
+ // Loop until end of current entry hits the start of original entry
+ } while (ParseNextKey() && NextEntryOffset() < original);
+ }
+
+ virtual void Seek(const Slice& target) {
+ // Binary search in restart array to find the first restart point
+ // with a key >= target
+ uint32_t left = 0;
+ uint32_t right = num_restarts_ - 1;
+ while (left < right) {
+ uint32_t mid = (left + right + 1) / 2;
+ uint32_t region_offset = GetRestartPoint(mid);
+ uint32_t shared, non_shared, value_length;
+ const char* key_ptr = DecodeEntry(data_ + region_offset,
+ data_ + restarts_,
+ &shared, &non_shared, &value_length);
+ // An entry at a restart point must store its full key (shared == 0).
+ if (key_ptr == NULL || (shared != 0)) {
+ CorruptionError();
+ return;
+ }
+ Slice mid_key(key_ptr, non_shared);
+ if (Compare(mid_key, target) < 0) {
+ // Key at "mid" is smaller than "target". Therefore all
+ // blocks before "mid" are uninteresting.
+ left = mid;
+ } else {
+ // Key at "mid" is >= "target". Therefore all blocks at or
+ // after "mid" are uninteresting.
+ right = mid - 1;
+ }
+ }
+
+ // Linear search (within restart block) for first key >= target
+ SeekToRestartPoint(left);
+ while (true) {
+ if (!ParseNextKey()) {
+ return;
+ }
+ if (Compare(key_, target) >= 0) {
+ return;
+ }
+ }
+ }
+
+ virtual void SeekToFirst() {
+ SeekToRestartPoint(0);
+ ParseNextKey();
+ }
+
+ // Start at the last restart point and advance until the current entry
+ // ends where the restart array begins.
+ virtual void SeekToLast() {
+ SeekToRestartPoint(num_restarts_ - 1);
+ while (ParseNextKey() && NextEntryOffset() < restarts_) {
+ // Keep skipping
+ }
+ }
+
+ private:
+ // Mark the iterator invalid and record a Corruption status.
+ void CorruptionError() {
+ current_ = restarts_;
+ restart_index_ = num_restarts_;
+ status_ = Status::Corruption("bad entry in block");
+ key_.clear();
+ value_.clear();
+ }
+
+ // Advance to the entry that follows the current one. Returns false, and
+ // leaves the iterator invalid, at the end of the data or on a bad entry.
+ bool ParseNextKey() {
+ current_ = NextEntryOffset();
+ const char* p = data_ + current_;
+ const char* limit = data_ + restarts_; // Restarts come right after data
+ if (p >= limit) {
+ // No more entries to return. Mark as invalid.
+ current_ = restarts_;
+ restart_index_ = num_restarts_;
+ return false;
+ }
+
+ // Decode next entry
+ uint32_t shared, non_shared, value_length;
+ p = DecodeEntry(p, limit, &shared, &non_shared, &value_length);
+ // The shared prefix cannot be longer than the previous key.
+ if (p == NULL || key_.size() < shared) {
+ CorruptionError();
+ return false;
+ } else {
+ key_.resize(shared);
+ key_.append(p, non_shared);
+ value_ = Slice(p + non_shared, value_length);
+ // Keep restart_index_ in step with the restart block holding current_.
+ while (restart_index_ + 1 < num_restarts_ &&
+ GetRestartPoint(restart_index_ + 1) < current_) {
+ ++restart_index_;
+ }
+ return true;
+ }
+ }
+};
+
+// Returns a new heap-allocated iterator over this block, ordering keys
+// with "cmp". Yields an error iterator when the block is too small to be
+// valid, and an empty iterator when the block has no restart points.
+Iterator* Block::NewIterator(const Comparator* cmp) {
+  if (size_ < 2*sizeof(uint32_t)) {
+    return NewErrorIterator(Status::Corruption("bad block contents"));
+  }
+
+  const uint32_t restart_count = NumRestarts();
+  if (restart_count != 0) {
+    return new Iter(cmp, data_, restart_offset_, restart_count);
+  }
+  return NewEmptyIterator();
+}
+
+}
diff --git a/table/block.h b/table/block.h
new file mode 100644
index 0000000..9372001
--- /dev/null
+++ b/table/block.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_TABLE_BLOCK_H_
+#define STORAGE_LEVELDB_TABLE_BLOCK_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include "include/iterator.h"
+
+namespace leveldb {
+
+class Comparator;
+
+// An in-memory table block: owns a byte buffer and hands out iterators
+// over the entries stored in it.
+class Block {
+ public:
+ // Initialize the block with the specified contents.
+ // Takes ownership of data[] and will delete[] it when done.
+ Block(const char* data, size_t size);
+
+ ~Block();
+
+ // Returns the stored size_ member.
+ size_t size() const { return size_; }
+ // Returns a new iterator over the block that orders keys with
+ // "comparator".
+ Iterator* NewIterator(const Comparator* comparator);
+
+ private:
+ uint32_t NumRestarts() const;
+
+ const char* data_;
+ size_t size_;
+ uint32_t restart_offset_; // Offset in data_ of restart array
+
+ // No copying allowed
+ Block(const Block&);
+ void operator=(const Block&);
+
+ class Iter;
+};
+
+}
+
+#endif // STORAGE_LEVELDB_TABLE_BLOCK_H_
diff --git a/table/block_builder.cc b/table/block_builder.cc
new file mode 100644
index 0000000..2c33492
--- /dev/null
+++ b/table/block_builder.cc
@@ -0,0 +1,109 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// BlockBuilder generates blocks where keys are prefix-compressed:
+//
+// When we store a key, we drop the prefix shared with the previous
+// string. This helps reduce the space requirement significantly.
+// Furthermore, once every K keys, we do not apply the prefix
+// compression and store the entire key. We call this a "restart
+// point". The tail end of the block stores the offsets of all of the
+// restart points, and can be used to do a binary search when looking
+// for a particular key. Values are stored as-is (without compression)
+// immediately following the corresponding key.
+//
+// An entry for a particular key-value pair has the form:
+// shared_bytes: varint32
+// unshared_bytes: varint32
+// value_length: varint32
+// key_delta: char[unshared_bytes]
+// value: char[value_length]
+// shared_bytes == 0 for restart points.
+//
+// The trailer of the block has the form:
+// restarts: uint32[num_restarts]
+// num_restarts: uint32
+// restarts[i] contains the offset within the block of the ith restart point.
+
+#include "table/block_builder.h"
+
+#include <algorithm>
+#include <assert.h>
+#include "include/comparator.h"
+#include "include/table_builder.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+// Creates an empty builder that reads its settings from "*options".
+// Asserts that options->block_restart_interval is at least 1.
+BlockBuilder::BlockBuilder(const Options* options)
+ : options_(options),
+ restarts_(),
+ counter_(0),
+ finished_(false) {
+ assert(options->block_restart_interval >= 1);
+ restarts_.push_back(0); // First restart point is at offset 0
+}
+
+// Discards everything added so far and returns the builder to its
+// just-constructed state, with a single restart point at offset 0.
+void BlockBuilder::Reset() {
+  finished_ = false;
+  counter_ = 0;
+  last_key_.clear();
+  buffer_.clear();
+  restarts_.assign(1, 0);  // First restart point is at offset 0
+}
+
+// Returns the number of bytes the block would occupy if Finish() were
+// called now: the entry data, one fixed32 per restart point, and the
+// fixed32 restart count.
+size_t BlockBuilder::CurrentSizeEstimate() const {
+  const size_t data_bytes = buffer_.size();                          // Raw data buffer
+  const size_t restart_bytes = restarts_.size() * sizeof(uint32_t);  // Restart array
+  const size_t count_bytes = sizeof(uint32_t);                       // Restart array length
+  return data_bytes + restart_bytes + count_bytes;
+}
+
+// Appends the block trailer (every restart offset as a fixed32, then the
+// number of restarts as a fixed32), marks the builder finished, and
+// returns a slice referring to the builder's internal buffer.
+Slice BlockBuilder::Finish() {
+  // Append restart array
+  // (size_t index: avoids comparing a signed int with restarts_.size())
+  for (size_t i = 0; i < restarts_.size(); i++) {
+    PutFixed32(&buffer_, restarts_[i]);
+  }
+  PutFixed32(&buffer_, static_cast<uint32_t>(restarts_.size()));
+  finished_ = true;
+  return Slice(buffer_);
+}
+
+// Appends the entry (key, value) to the block. The key is stored as a
+// delta against the previous key, except at a restart point, where the
+// full key is stored. Asserts that Finish() has not been called and that
+// "key" compares greater than the previously added key.
+void BlockBuilder::Add(const Slice& key, const Slice& value) {
+ Slice last_key_piece(last_key_);
+ assert(!finished_);
+ assert(counter_ <= options_->block_restart_interval);
+ assert(buffer_.empty() // No values yet?
+ || options_->comparator->Compare(key, last_key_piece) > 0);
+ size_t shared = 0;
+ if (counter_ < options_->block_restart_interval) {
+ // See how much sharing to do with previous string
+ const size_t min_length = std::min(last_key_piece.size(), key.size());
+ while ((shared < min_length) && (last_key_[shared] == key[shared])) {
+ shared++;
+ }
+ } else {
+ // Restart compression
+ // (shared stays 0, so this entry stores the whole key)
+ restarts_.push_back(buffer_.size());
+ counter_ = 0;
+ }
+ const size_t non_shared = key.size() - shared;
+
+ // Add "<shared><non_shared><value_size>" to buffer_
+ PutVarint32(&buffer_, shared);
+ PutVarint32(&buffer_, non_shared);
+ PutVarint32(&buffer_, value.size());
+
+ // Add string delta to buffer_ followed by value
+ buffer_.append(key.data() + shared, non_shared);
+ buffer_.append(value.data(), value.size());
+
+ // Update state
+ // (keep the shared prefix of last_key_ and append the new suffix)
+ last_key_.resize(shared);
+ last_key_.append(key.data() + shared, non_shared);
+ assert(Slice(last_key_) == key);
+ counter_++;
+}
+
+}
diff --git a/table/block_builder.h b/table/block_builder.h
new file mode 100644
index 0000000..beab168
--- /dev/null
+++ b/table/block_builder.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_
+#define STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_
+
+#include <vector>
+
+#include <stdint.h>
+#include "include/slice.h"
+
+namespace leveldb {
+
+struct Options;
+
+// Builds a single block by accumulating key/value entries in an internal
+// buffer together with a list of restart points.
+class BlockBuilder {
+ public:
+ explicit BlockBuilder(const Options* options);
+
+ // Reset the contents as if the BlockBuilder was just constructed.
+ void Reset();
+
+ // REQUIRES: Finish() has not been called since the last call to Reset().
+ // REQUIRES: key is larger than any previously added key
+ void Add(const Slice& key, const Slice& value);
+
+ // Finish building the block and return a slice that refers to the
+ // block contents. The returned slice will remain valid for the
+ // lifetime of this builder or until Reset() is called.
+ Slice Finish();
+
+ // Returns an estimate of the current (uncompressed) size of the block
+ // we are building.
+ size_t CurrentSizeEstimate() const;
+
+ // Return true iff no entries have been added since the last Reset()
+ bool empty() const {
+ return buffer_.empty();
+ }
+
+ private:
+ const Options* options_;
+ std::string buffer_; // Destination buffer
+ std::vector<uint32_t> restarts_; // Restart points
+ int counter_; // Number of entries emitted since restart
+ bool finished_; // Has Finish() been called?
+ std::string last_key_; // Most recently added key
+
+ // No copying allowed
+ BlockBuilder(const BlockBuilder&);
+ void operator=(const BlockBuilder&);
+};
+
+}
+
+#endif // STORAGE_LEVELDB_TABLE_BLOCK_BUILDER_H_
diff --git a/table/format.cc b/table/format.cc
new file mode 100644
index 0000000..d292dad
--- /dev/null
+++ b/table/format.cc
@@ -0,0 +1,131 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/format.h"
+
+#include "include/env.h"
+#include "port/port.h"
+#include "table/block.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace leveldb {
+
+// Appends the handle to "*dst" as two varint64 values: offset, then size.
+void BlockHandle::EncodeTo(std::string* dst) const {
+ // Sanity check that all fields have been set
+ // (~0 is the value the default constructor stores in both fields)
+ assert(offset_ != ~static_cast<uint64_t>(0));
+ assert(size_ != ~static_cast<uint64_t>(0));
+ PutVarint64(dst, offset_);
+ PutVarint64(dst, size_);
+}
+
+// Reads the offset and the size, in that order, as varint64 values from
+// "*input". Returns OK on success, or a Corruption status if either
+// value cannot be decoded.
+Status BlockHandle::DecodeFrom(Slice* input) {
+  const bool parsed =
+      GetVarint64(input, &offset_) && GetVarint64(input, &size_);
+  if (!parsed) {
+    return Status::Corruption("bad block handle");
+  }
+  return Status::OK();
+}
+
+// Appends exactly kEncodedLength bytes to "*dst": the metaindex handle,
+// the index handle, padding up to 2 * BlockHandle::kMaxEncodedLength
+// bytes, and the 64-bit magic number as two fixed32 values (low half
+// first).
+void Footer::EncodeTo(std::string* dst) const {
+  const size_t original_size = dst->size();
+  metaindex_handle_.EncodeTo(dst);
+  index_handle_.EncodeTo(dst);
+  // Pad relative to where this footer starts, so that bytes already in
+  // "*dst" are preserved and the handle area keeps its fixed width.
+  dst->resize(original_size + 2 * BlockHandle::kMaxEncodedLength);  // Padding
+  PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber));
+  PutFixed32(dst, static_cast<uint32_t>(kTableMagicNumber >> 32));
+  assert(dst->size() == original_size + kEncodedLength);
+}
+
+// Parses a footer from the front of "*input".
+//
+// Returns InvalidArgument if the input is shorter than kEncodedLength or
+// does not carry the table magic number, and the status of
+// BlockHandle::DecodeFrom if a handle cannot be decoded. On success,
+// "*input" is advanced past the whole footer.
+Status Footer::DecodeFrom(Slice* input) {
+  // The magic number sits at a fixed offset; make sure it is in bounds
+  // before reading it.
+  if (input->size() < kEncodedLength) {
+    return Status::InvalidArgument("not an sstable (footer too short)");
+  }
+
+  const char* magic_ptr = input->data() + kEncodedLength - 8;
+  const uint32_t magic_lo = DecodeFixed32(magic_ptr);
+  const uint32_t magic_hi = DecodeFixed32(magic_ptr + 4);
+  const uint64_t magic = ((static_cast<uint64_t>(magic_hi) << 32) |
+                          (static_cast<uint64_t>(magic_lo)));
+  if (magic != kTableMagicNumber) {
+    return Status::InvalidArgument("not an sstable (bad magic number)");
+  }
+
+  Status result = metaindex_handle_.DecodeFrom(input);
+  if (result.ok()) {
+    result = index_handle_.DecodeFrom(input);
+  }
+  if (result.ok()) {
+    // We skip over any leftover data (just padding for now) in "input"
+    const char* end = magic_ptr + 8;
+    *input = Slice(end, input->data() + input->size() - end);
+  }
+  return result;
+}
+
+// Reads the block described by "handle" from "file", optionally verifies
+// its checksum, undoes any compression, and stores a new Block in
+// "*block". On any failure "*block" stays NULL, the temporary buffer is
+// freed, and a non-OK status is returned.
+Status ReadBlock(RandomAccessFile* file,
+ const ReadOptions& options,
+ const BlockHandle& handle,
+ Block** block) {
+ *block = NULL;
+
+ // Read the block contents as well as the type/crc footer.
+ // See table_builder.cc for the code that built this structure.
+ size_t n = handle.size();
+ char* buf = new char[n + kBlockTrailerSize];
+ Slice contents;
+ Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);
+ if (!s.ok()) {
+ delete[] buf;
+ return s;
+ }
+ if (contents.size() != n + kBlockTrailerSize) {
+ delete[] buf;
+ return Status::Corruption("truncated block read");
+ }
+
+ // Check the crc of the type and the block contents
+ // (the masked crc is stored right after the one-byte type at data[n])
+ const char* data = contents.data(); // Pointer to where Read put the data
+ if (options.verify_checksums) {
+ const uint32_t crc = crc32c::Unmask(DecodeFixed32(data + n + 1));
+ const uint32_t actual = crc32c::Value(data, n + 1);
+ if (actual != crc) {
+ delete[] buf;
+ s = Status::Corruption("block checksum mismatch");
+ return s;
+ }
+ }
+
+ // data[n] is the block type byte
+ switch (data[n]) {
+ case kNoCompression:
+ if (data != buf) {
+ // File implementation gave us pointer to some other data.
+ // Copy into buf[].
+ memcpy(buf, data, n + kBlockTrailerSize);
+ }
+
+ // Ok
+ break;
+ case kLightweightCompression: {
+ std::string decompressed;
+ if (!port::Lightweight_Uncompress(data, n, &decompressed)) {
+ delete[] buf;
+ s = Status::Corruption("corrupted compressed block contents");
+ return s;
+ }
+ delete[] buf; // Done with the buffer allocated for the compressed read
+ buf = new char[decompressed.size()];
+ memcpy(buf, decompressed.data(), decompressed.size());
+ n = decompressed.size();
+ break;
+ }
+ default:
+ delete[] buf;
+ return Status::Corruption("bad block type");
+ }
+
+ *block = new Block(buf, n); // Block takes ownership of buf[]
+ return Status::OK();
+}
+
+}
diff --git a/table/format.h b/table/format.h
new file mode 100644
index 0000000..03e3ee2
--- /dev/null
+++ b/table/format.h
@@ -0,0 +1,103 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_TABLE_FORMAT_H_
+#define STORAGE_LEVELDB_TABLE_FORMAT_H_
+
+#include <string>
+#include <stdint.h>
+#include "include/slice.h"
+#include "include/status.h"
+#include "include/table_builder.h"
+
+namespace leveldb {
+
+class Block;
+class RandomAccessFile;
+struct ReadOptions;
+
+// BlockHandle is a pointer to the extent of a file that stores a data
+// block or a meta block.
+class BlockHandle {
+ public:
+ // Sets both fields to ~0; EncodeTo() asserts that they have been
+ // changed from that value.
+ BlockHandle();
+
+ // The offset of the block in the file.
+ uint64_t offset() const { return offset_; }
+ void set_offset(uint64_t offset) { offset_ = offset; }
+
+ // The size of the stored block
+ uint64_t size() const { return size_; }
+ void set_size(uint64_t size) { size_ = size; }
+
+ // Appends the encoded handle to "*dst".
+ void EncodeTo(std::string* dst) const;
+ // Decodes a handle from "*input"; returns non-OK on malformed input.
+ Status DecodeFrom(Slice* input);
+
+ // Maximum encoding length of a BlockHandle
+ // (two varint64 values of at most 10 bytes each)
+ enum { kMaxEncodedLength = 10 + 10 };
+
+ private:
+ uint64_t offset_;
+ uint64_t size_;
+};
+
+// Footer encapsulates the fixed information stored at the tail
+// end of every table file.
+class Footer {
+ public:
+ // Both handles are default-constructed, i.e. not yet set.
+ Footer() { }
+
+ // The block handle for the metaindex block of the table
+ const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
+ void set_metaindex_handle(const BlockHandle& h) { metaindex_handle_ = h; }
+
+ // The block handle for the index block of the table
+ const BlockHandle& index_handle() const {
+ return index_handle_;
+ }
+ void set_index_handle(const BlockHandle& h) {
+ index_handle_ = h;
+ }
+
+ // Appends the encoded footer to "*dst".
+ void EncodeTo(std::string* dst) const;
+ // Decodes a footer from "*input"; returns non-OK on malformed input.
+ Status DecodeFrom(Slice* input);
+
+ // Encoded length of a Footer. Note that the serialization of a
+ // Footer will always occupy exactly this many bytes. It consists
+ // of two block handles and a magic number.
+ enum {
+ kEncodedLength = 2*BlockHandle::kMaxEncodedLength + 8
+ };
+
+ private:
+ BlockHandle metaindex_handle_;
+ BlockHandle index_handle_;
+};
+
+// kTableMagicNumber was picked by running
+// echo http://code.google.com/p/leveldb/ | sha1sum
+// and taking the leading 64 bits.
+static const uint64_t kTableMagicNumber = 0xdb4775248b80fb57ull;
+
+// 1-byte type + 32-bit crc
+static const size_t kBlockTrailerSize = 5;
+
+// Read the block identified by "handle" from "file". On success,
+// store a pointer to the heap-allocated result in *block and return
+// OK. On failure store NULL in *block and return non-OK.
+extern Status ReadBlock(RandomAccessFile* file,
+ const ReadOptions& options,
+ const BlockHandle& handle,
+ Block** block);
+
+// Implementation details follow. Clients should ignore them.
+
+// Initializes both fields to all-ones (~0), the value EncodeTo() treats
+// as "not set".
+inline BlockHandle::BlockHandle()
+ : offset_(~static_cast<uint64_t>(0)),
+ size_(~static_cast<uint64_t>(0)) {
+}
+
+}
+
+#endif // STORAGE_LEVELDB_TABLE_FORMAT_H_
diff --git a/table/iterator.cc b/table/iterator.cc
new file mode 100644
index 0000000..f3c0856
--- /dev/null
+++ b/table/iterator.cc
@@ -0,0 +1,68 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/iterator.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+// Start with an empty cleanup list. A NULL function in the embedded head
+// node means that no cleanup has been registered yet.
+Iterator::Iterator() {
+  cleanup_.next = NULL;
+  cleanup_.function = NULL;
+}
+
+// Invoke every registered cleanup function: first the one stored in the
+// embedded head node, then each heap-allocated node in list order, freeing
+// each node after its function has run.
+Iterator::~Iterator() {
+  if (cleanup_.function == NULL) {
+    return;  // Nothing was ever registered.
+  }
+  (*cleanup_.function)(cleanup_.arg1, cleanup_.arg2);
+  Cleanup* node = cleanup_.next;
+  while (node != NULL) {
+    (*node->function)(node->arg1, node->arg2);
+    Cleanup* const following = node->next;
+    delete node;
+    node = following;
+  }
+}
+
+// Arrange for func(arg1, arg2) to be called when this iterator is destroyed.
+// The first registration is stored in the embedded head node; later ones get
+// a heap-allocated node that is linked in directly after the head.
+// REQUIRES: func != NULL
+void Iterator::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) {
+  assert(func != NULL);
+  Cleanup* slot = &cleanup_;
+  if (cleanup_.function != NULL) {
+    // Head node already in use: allocate a node and splice it in.
+    slot = new Cleanup;
+    slot->next = cleanup_.next;
+    cleanup_.next = slot;
+  }
+  slot->function = func;
+  slot->arg1 = arg1;
+  slot->arg2 = arg2;
+}
+
+namespace {
+// An iterator over an empty sequence. It is never Valid(), all seek
+// operations are no-ops, and status() reports the Status supplied at
+// construction time.
+class EmptyIterator : public Iterator {
+ public:
+  EmptyIterator(const Status& s) : status_(s) { }
+
+  virtual bool Valid() const { return false; }
+  virtual Status status() const { return status_; }
+
+  // Positioning is a no-op: there is nothing to position on.
+  virtual void SeekToFirst() { }
+  virtual void SeekToLast() { }
+  virtual void Seek(const Slice& target) { }
+
+  // The remaining operations require Valid(), which never holds.
+  virtual void Next() { assert(false); }
+  virtual void Prev() { assert(false); }
+  virtual Slice key() const {
+    assert(false);
+    return Slice();
+  }
+  virtual Slice value() const {
+    assert(false);
+    return Slice();
+  }
+
+ private:
+  Status status_;
+};
+}
+
+// Return a new heap-allocated iterator that yields nothing and whose
+// status() is OK.
+Iterator* NewEmptyIterator() {
+  const Status ok_status = Status::OK();
+  return new EmptyIterator(ok_status);
+}
+
+// Return a new heap-allocated iterator that yields nothing and whose
+// status() is "status".
+Iterator* NewErrorIterator(const Status& status) {
+  Iterator* const result = new EmptyIterator(status);
+  return result;
+}
+
+}
diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h
new file mode 100644
index 0000000..158d3a7
--- /dev/null
+++ b/table/iterator_wrapper.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_
+#define STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_
+
+namespace leveldb {
+
+// An internal wrapper class with an interface similar to Iterator that
+// caches the Valid() and key() results of an underlying iterator.
+// This can help avoid virtual function calls and also gives better
+// cache locality.
+class IteratorWrapper {
+ public:
+  IteratorWrapper() : iter_(NULL), valid_(false) { }
+  explicit IteratorWrapper(Iterator* iter) : iter_(NULL) {
+    Set(iter);
+  }
+  ~IteratorWrapper() { delete iter_; }
+
+  Iterator* iter() const { return iter_; }
+
+  // Takes ownership of "iter" and will delete it when destroyed, or
+  // when Set() is invoked again.
+  void Set(Iterator* iter) {
+    delete iter_;
+    iter_ = iter;
+    if (iter_ != NULL) {
+      Update();
+    } else {
+      valid_ = false;
+    }
+  }
+
+  // Iterator interface methods, answered from the cached state where
+  // possible.
+  bool Valid() const { return valid_; }
+  Slice key() const {
+    assert(Valid());
+    return key_;
+  }
+  Slice value() const {
+    assert(Valid());
+    return iter_->value();
+  }
+
+  // Methods below require iter() != NULL. Each one that moves the
+  // underlying iterator refreshes the cached state afterwards.
+  Status status() const {
+    assert(iter_);
+    return iter_->status();
+  }
+  void Next() {
+    assert(iter_);
+    iter_->Next();
+    Update();
+  }
+  void Prev() {
+    assert(iter_);
+    iter_->Prev();
+    Update();
+  }
+  void Seek(const Slice& k) {
+    assert(iter_);
+    iter_->Seek(k);
+    Update();
+  }
+  void SeekToFirst() {
+    assert(iter_);
+    iter_->SeekToFirst();
+    Update();
+  }
+  void SeekToLast() {
+    assert(iter_);
+    iter_->SeekToLast();
+    Update();
+  }
+
+ private:
+  // Re-read validity (and, when valid, the key) from the underlying
+  // iterator.
+  void Update() {
+    valid_ = iter_->Valid();
+    if (!valid_) return;
+    key_ = iter_->key();
+  }
+
+  Iterator* iter_;
+  bool valid_;
+  Slice key_;
+};
+
+}
+
+
+#endif // STORAGE_LEVELDB_TABLE_ITERATOR_WRAPPER_H_
diff --git a/table/merger.cc b/table/merger.cc
new file mode 100644
index 0000000..74c1aaa
--- /dev/null
+++ b/table/merger.cc
@@ -0,0 +1,143 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/merger.h"
+
+#include "include/comparator.h"
+#include "include/iterator.h"
+#include "table/iterator_wrapper.h"
+
+namespace leveldb {
+
+namespace {
+// Iterator that yields the union of the entries of n child iterators in
+// comparator order. It takes ownership of the children (through
+// IteratorWrapper) and deletes them on destruction.
+//
+// The iterator remembers the direction of the last movement. While moving
+// forward, every non-current child is positioned after key(); while moving
+// backward, every non-current child is positioned before key(). When the
+// caller switches direction, Next()/Prev() first reposition the non-current
+// children so that this invariant holds before stepping; otherwise
+// FindSmallest()/FindLargest() would select entries on the wrong side of
+// the current key.
+class MergingIterator : public Iterator {
+ public:
+  MergingIterator(const Comparator* comparator, Iterator** children, int n)
+      : comparator_(comparator),
+        children_(new IteratorWrapper[n]),
+        n_(n),
+        current_(NULL),
+        direction_(kForward) {
+    for (int i = 0; i < n; i++) {
+      children_[i].Set(children[i]);
+    }
+  }
+
+  virtual ~MergingIterator() {
+    delete[] children_;
+  }
+
+  virtual bool Valid() const {
+    return (current_ != NULL);
+  }
+
+  virtual void SeekToFirst() {
+    for (int i = 0; i < n_; i++) {
+      children_[i].SeekToFirst();
+    }
+    FindSmallest();
+    direction_ = kForward;
+  }
+
+  virtual void SeekToLast() {
+    for (int i = 0; i < n_; i++) {
+      children_[i].SeekToLast();
+    }
+    FindLargest();
+    direction_ = kReverse;
+  }
+
+  virtual void Seek(const Slice& target) {
+    for (int i = 0; i < n_; i++) {
+      children_[i].Seek(target);
+    }
+    FindSmallest();
+    direction_ = kForward;
+  }
+
+  virtual void Next() {
+    assert(Valid());
+
+    // Ensure that all children are positioned after key(). If we are
+    // already moving forward this holds for every non-current child,
+    // because current_ is the smallest child. Otherwise explicitly
+    // reposition the non-current children.
+    if (direction_ != kForward) {
+      for (int i = 0; i < n_; i++) {
+        IteratorWrapper* child = &children_[i];
+        if (child != current_) {
+          child->Seek(key());
+          if (child->Valid() &&
+              comparator_->Compare(key(), child->key()) == 0) {
+            // Child sits on an entry equal to key(); step past it.
+            child->Next();
+          }
+        }
+      }
+      direction_ = kForward;
+    }
+
+    current_->Next();
+    FindSmallest();
+  }
+
+  virtual void Prev() {
+    assert(Valid());
+
+    // Ensure that all children are positioned before key(). If we are
+    // already moving backward this holds for every non-current child,
+    // because current_ is the largest child. Otherwise explicitly
+    // reposition the non-current children.
+    if (direction_ != kReverse) {
+      for (int i = 0; i < n_; i++) {
+        IteratorWrapper* child = &children_[i];
+        if (child != current_) {
+          child->Seek(key());
+          if (child->Valid()) {
+            // Child is at its first entry >= key(); step back to be < key().
+            child->Prev();
+          } else {
+            // Child has no entries >= key(); its last entry is < key().
+            child->SeekToLast();
+          }
+        }
+      }
+      direction_ = kReverse;
+    }
+
+    current_->Prev();
+    FindLargest();
+  }
+
+  virtual Slice key() const {
+    assert(Valid());
+    return current_->key();
+  }
+
+  virtual Slice value() const {
+    assert(Valid());
+    return current_->value();
+  }
+
+  // Returns the first non-OK status among the children, or OK if every
+  // child is OK.
+  virtual Status status() const {
+    Status status;
+    for (int i = 0; i < n_; i++) {
+      status = children_[i].status();
+      if (!status.ok()) {
+        break;
+      }
+    }
+    return status;
+  }
+
+ private:
+  // Direction of the most recent positioning operation.
+  enum Direction {
+    kForward,
+    kReverse
+  };
+
+  void FindSmallest();
+  void FindLargest();
+
+  // We might want to use a heap in case there are lots of children.
+  // For now we use a simple array since we expect a very small number
+  // of children in leveldb.
+  const Comparator* comparator_;
+  IteratorWrapper* children_;
+  int n_;
+  IteratorWrapper* current_;
+  Direction direction_;
+};
+
+// Point current_ at the valid child with the smallest key, or at NULL if
+// no child is valid. Children are scanned in increasing index order and a
+// later child replaces the choice only if its key is strictly smaller, so
+// among equal keys the lowest-indexed child wins.
+void MergingIterator::FindSmallest() {
+  IteratorWrapper* best = NULL;
+  for (int i = 0; i < n_; i++) {
+    IteratorWrapper* candidate = &children_[i];
+    if (!candidate->Valid()) continue;
+    if (best == NULL ||
+        comparator_->Compare(candidate->key(), best->key()) < 0) {
+      best = candidate;
+    }
+  }
+  current_ = best;
+}
+
+// Point current_ at the valid child with the largest key, or at NULL if
+// no child is valid. Children are scanned in decreasing index order and an
+// earlier-indexed child replaces the choice only if its key is strictly
+// larger, so among equal keys the highest-indexed child wins.
+void MergingIterator::FindLargest() {
+  IteratorWrapper* best = NULL;
+  for (int i = n_ - 1; i >= 0; i--) {
+    IteratorWrapper* candidate = &children_[i];
+    if (!candidate->Valid()) continue;
+    if (best == NULL ||
+        comparator_->Compare(candidate->key(), best->key()) > 0) {
+      best = candidate;
+    }
+  }
+  current_ = best;
+}
+}
+
+// Build an iterator over the union of list[0..n-1]. With no children an
+// empty iterator is returned; with exactly one child that child itself is
+// returned; otherwise a MergingIterator wrapping all children is created.
+// REQUIRES: n >= 0
+Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n) {
+  assert(n >= 0);
+  if (n == 0) return NewEmptyIterator();
+  if (n == 1) return list[0];
+  return new MergingIterator(cmp, list, n);
+}
+
+}
diff --git a/table/merger.h b/table/merger.h
new file mode 100644
index 0000000..71d9dc5
--- /dev/null
+++ b/table/merger.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_TABLE_MERGER_H_
+#define STORAGE_LEVELDB_TABLE_MERGER_H_
+
+namespace leveldb {
+
+class Comparator;
+class Iterator;
+
+// Return an iterator that provides the union of the data in
+// children[0,n-1]. Takes ownership of the child iterators and
+// will delete them when the result iterator is deleted.
+//
+// The result does no duplicate suppression. I.e., if a particular
+// key is present in K child iterators, it will be yielded K times.
+//
+// REQUIRES: n >= 0
+extern Iterator* NewMergingIterator(
+ const Comparator* comparator, Iterator** children, int n);
+
+}
+
+#endif // STORAGE_LEVELDB_TABLE_MERGER_H_
diff --git a/table/table.cc b/table/table.cc
new file mode 100644
index 0000000..dffc217
--- /dev/null
+++ b/table/table.cc
@@ -0,0 +1,175 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/table.h"
+
+#include "include/cache.h"
+#include "include/env.h"
+#include "table/block.h"
+#include "table/format.h"
+#include "table/two_level_iterator.h"
+#include "util/coding.h"
+
+namespace leveldb {
+
+// Private state of a Table.
+struct Table::Rep {
+  // Initialize the raw members so that a Rep destroyed before it has been
+  // fully populated does not delete an indeterminate index_block pointer.
+  Rep() : file(NULL), cache_id(0), index_block(NULL) { }
+
+  // Frees the index block. "file" is not deleted here.
+  ~Rep() {
+    delete index_block;
+  }
+
+  Options options;
+  Status status;
+  RandomAccessFile* file;
+  uint64_t cache_id;  // Prefix used when building block-cache keys
+
+  BlockHandle metaindex_handle;  // Handle to metaindex_block: saved from footer
+  Block* index_block;
+};
+
+// Open the table stored in "file". Reads and decodes the footer at the tail
+// of the file, then reads the index block it points at. On success stores a
+// newly allocated Table in *table and returns OK; on any failure *table is
+// left NULL and a non-OK status is returned.
+Status Table::Open(const Options& options,
+                   RandomAccessFile* file,
+                   Table** table) {
+  *table = NULL;
+  const uint64_t file_size = file->Size();
+  if (file_size < Footer::kEncodedLength) {
+    return Status::InvalidArgument("file is too short to be an sstable");
+  }
+
+  // Fetch the fixed-size footer from the end of the file and decode it.
+  char footer_buf[Footer::kEncodedLength];
+  Slice footer_slice;
+  Status s = file->Read(file_size - Footer::kEncodedLength,
+                        Footer::kEncodedLength,
+                        &footer_slice, footer_buf);
+  if (!s.ok()) return s;
+
+  Footer footer;
+  s = footer.DecodeFrom(&footer_slice);
+  if (!s.ok()) return s;
+
+  // Read the index block
+  Block* index_block = NULL;
+  s = ReadBlock(file, ReadOptions(), footer.index_handle(), &index_block);
+  if (!s.ok()) {
+    delete index_block;  // deleting NULL is a no-op
+    return s;
+  }
+
+  // We've successfully read the footer and the index block: we're
+  // ready to serve requests.
+  Rep* rep = new Table::Rep;
+  rep->options = options;
+  rep->file = file;
+  rep->metaindex_handle = footer.metaindex_handle();
+  rep->index_block = index_block;
+  rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0);
+  *table = new Table(rep);
+  return s;
+}
+
+// Destroys the table's private state (rep_).
+Table::~Table() {
+ delete rep_;
+}
+
+// Iterator cleanup callback: deletes the Block passed as "arg".
+// The second argument is unused.
+static void DeleteBlock(void* arg, void* ignored) {
+  Block* block = reinterpret_cast<Block*>(arg);
+  delete block;
+}
+
+// Deleter handed to the block cache on insertion: deletes the Block stored
+// as the entry's value. The key is unused.
+static void DeleteCachedBlock(const Slice& key, void* value) {
+  delete reinterpret_cast<Block*>(value);
+}
+
+// Iterator cleanup callback: releases cache handle "h" back to the Cache
+// passed as "arg".
+static void ReleaseBlock(void* arg, void* h) {
+  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
+  reinterpret_cast<Cache*>(arg)->Release(handle);
+}
+
+// Convert an index iterator value (i.e., an encoded BlockHandle)
+// into an iterator over the contents of the corresponding block.
+//
+// "arg" is the Table* whose file/cache should be used. If the table has a
+// block cache, the block is looked up there first and read from the file
+// only on a miss. The returned iterator has a cleanup registered that
+// either deletes the block (uncached) or releases the cache handle
+// (cached). If the handle cannot be decoded or the read fails, an error
+// iterator carrying the failing status is returned instead.
+Iterator* Table::BlockReader(void* arg,
+ const ReadOptions& options,
+ const Slice& index_value) {
+ Table* table = reinterpret_cast<Table*>(arg);
+ Cache* block_cache = table->rep_->options.block_cache;
+ Block* block = NULL;
+ Cache::Handle* cache_handle = NULL;
+
+ BlockHandle handle;
+ Slice input = index_value;
+ Status s = handle.DecodeFrom(&input);
+ // We intentionally allow extra stuff in index_value so that we
+ // can add more features in the future.
+
+ if (s.ok()) {
+ if (block_cache != NULL) {
+ // Cache key = 8-byte table cache id followed by 8-byte block offset.
+ char cache_key_buffer[16];
+ EncodeFixed64(cache_key_buffer, table->rep_->cache_id);
+ EncodeFixed64(cache_key_buffer+8, handle.offset());
+ Slice key(cache_key_buffer, sizeof(cache_key_buffer));
+ cache_handle = block_cache->Lookup(key);
+ if (cache_handle != NULL) {
+ block = reinterpret_cast<Block*>(block_cache->Value(cache_handle));
+ } else {
+ s = ReadBlock(table->rep_->file, options, handle, &block);
+ // Only insert into the cache when the caller asked for it; otherwise
+ // cache_handle stays NULL and the block is owned by the iterator below.
+ if (s.ok() && options.fill_cache) {
+ cache_handle = block_cache->Insert(
+ key, block, block->size(), &DeleteCachedBlock);
+ }
+ }
+ } else {
+ s = ReadBlock(table->rep_->file, options, handle, &block);
+ }
+ }
+
+ Iterator* iter;
+ if (block != NULL) {
+ iter = block->NewIterator(table->rep_->options.comparator);
+ if (cache_handle == NULL) {
+ // Block is not held by the cache: delete it along with the iterator.
+ iter->RegisterCleanup(&DeleteBlock, block, NULL);
+ } else {
+ // Block is held by the cache: just release our handle.
+ iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle);
+ }
+ } else {
+ iter = NewErrorIterator(s);
+ }
+ return iter;
+}
+
+// Return a two-level iterator over the table: an iterator over the index
+// block drives BlockReader, which opens the data block for each index entry.
+Iterator* Table::NewIterator(const ReadOptions& options) const {
+  Iterator* index_iter =
+      rep_->index_block->NewIterator(rep_->options.comparator);
+  return NewTwoLevelIterator(
+      index_iter, &Table::BlockReader, const_cast<Table*>(this), options);
+}
+
+// Return the approximate file offset at which the data for "key" begins.
+// The index block is searched for the first entry at or past "key"; if one
+// is found and its block handle decodes, that block's offset is returned.
+uint64_t Table::ApproximateOffsetOf(const Slice& key) const {
+  Iterator* index_iter =
+      rep_->index_block->NewIterator(rep_->options.comparator);
+  index_iter->Seek(key);
+
+  // Fallback answer: the offset of the metaindex block, which is right
+  // near the end of the file. It is used both when key is past the last
+  // key in the file and in the strange case where the block handle in the
+  // index block cannot be decoded.
+  uint64_t result = rep_->metaindex_handle.offset();
+  if (index_iter->Valid()) {
+    BlockHandle handle;
+    Slice input = index_iter->value();
+    if (handle.DecodeFrom(&input).ok()) {
+      result = handle.offset();
+    }
+  }
+  delete index_iter;
+  return result;
+}
+
+}
diff --git a/table/table_builder.cc b/table/table_builder.cc
new file mode 100644
index 0000000..38ad392
--- /dev/null
+++ b/table/table_builder.cc
@@ -0,0 +1,224 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/table_builder.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include "include/comparator.h"
+#include "include/env.h"
+#include "table/block_builder.h"
+#include "table/format.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+// Private state of a TableBuilder.
+struct TableBuilder::Rep {
+ Options options;
+ Options index_block_options; // Copy of options with restart interval 1
+ WritableFile* file;
+ uint64_t offset; // Bytes successfully appended to file so far
+ Status status; // Result of the most recent file operation
+ BlockBuilder data_block;
+ BlockBuilder index_block;
+ std::string last_key; // Most recently added key
+ int64_t num_entries;
+ bool closed; // Either Finish() or Abandon() has been called.
+
+ // We do not emit the index entry for a block until we have seen the
+ // first key for the next data block. This allows us to use shorter
+ // keys in the index block. For example, consider a block boundary
+ // between the keys "the quick brown fox" and "the who". We can use
+ // "the r" as the key for the index block entry since it is >= all
+ // entries in the first block and < all entries in subsequent
+ // blocks.
+ //
+ // Invariant: r->pending_index_entry is true only if data_block is empty.
+ bool pending_index_entry;
+ BlockHandle pending_handle; // Handle to add to index block
+
+ std::string compressed_output; // Scratch buffer used while writing a block
+
+ // The block builders are handed pointers to the Options members of this
+ // struct, so those members must be declared (and thus initialized) before
+ // the builders.
+ Rep(const Options& opt, WritableFile* f)
+ : options(opt),
+ index_block_options(opt),
+ file(f),
+ offset(0),
+ data_block(&options),
+ index_block(&index_block_options),
+ num_entries(0),
+ closed(false),
+ pending_index_entry(false) {
+ index_block_options.block_restart_interval = 1;
+ }
+};
+
+// Create a builder that will append the table contents to "file" using
+// "options". All state lives in a heap-allocated Rep.
+TableBuilder::TableBuilder(const Options& options, WritableFile* file)
+ : rep_(new Rep(options, file)) {
+}
+
+// REQUIRES: Either Finish() or Abandon() has been called.
+TableBuilder::~TableBuilder() {
+ assert(rep_->closed); // Catch errors where caller forgot to call Finish()
+ delete rep_;
+}
+
+// Replace the options used by this builder. Changing the comparator is
+// rejected with InvalidArgument; everything else is accepted.
+//
+// Note: if more fields are added to Options, update this function to catch
+// changes that should not be allowed in the middle of building a Table.
+Status TableBuilder::ChangeOptions(const Options& options) {
+  Rep* r = rep_;
+  if (r->options.comparator != options.comparator) {
+    return Status::InvalidArgument("changing comparator while building table");
+  }
+
+  // Any live BlockBuilders point at r->options / r->index_block_options
+  // and therefore pick up the new values automatically.
+  r->options = options;
+  r->index_block_options = options;
+  r->index_block_options.block_restart_interval = 1;
+  return Status::OK();
+}
+
+// Append key/value to the table being built. Does nothing if the builder
+// is already in an error state.
+// REQUIRES: key is after any previously added key according to comparator.
+// REQUIRES: Finish(), Abandon() have not been called
+void TableBuilder::Add(const Slice& key, const Slice& value) {
+  Rep* r = rep_;
+  assert(!r->closed);
+  if (!ok()) return;
+  // Keys must arrive in strictly increasing order.
+  assert(r->num_entries <= 0 ||
+         r->options.comparator->Compare(key, Slice(r->last_key)) > 0);
+
+  if (r->pending_index_entry) {
+    // This is the first key of a new data block: emit the index entry for
+    // the previous block now, using a short separator between its last key
+    // and this key.
+    assert(r->data_block.empty());
+    r->options.comparator->FindShortestSeparator(&r->last_key, key);
+    std::string encoded_handle;
+    r->pending_handle.EncodeTo(&encoded_handle);
+    r->index_block.Add(r->last_key, Slice(encoded_handle));
+    r->pending_index_entry = false;
+  }
+
+  r->last_key.assign(key.data(), key.size());
+  r->num_entries++;
+  r->data_block.Add(key, value);
+
+  // Cut the block once its estimated size reaches the configured limit.
+  const size_t block_bytes = r->data_block.CurrentSizeEstimate();
+  if (block_bytes >= r->options.block_size) {
+    Flush();
+  }
+}
+
+// Write the buffered data block (if any) to the file and flush the file.
+// Does nothing if the builder is in an error state or the data block is
+// empty. On success the block's handle is left pending so that its index
+// entry can be emitted later.
+// REQUIRES: Finish(), Abandon() have not been called
+void TableBuilder::Flush() {
+  Rep* r = rep_;
+  assert(!r->closed);
+  if (!ok() || r->data_block.empty()) return;
+  assert(!r->pending_index_entry);
+  WriteBlock(&r->data_block, &r->pending_handle);
+  if (!ok()) return;
+  r->pending_index_entry = true;
+  r->status = r->file->Flush();
+}
+
+// Finish "block", optionally compress it according to the builder's options,
+// and append it to the file followed by a kBlockTrailerSize-byte trailer.
+// The block's location (offset and size of the stored contents, excluding
+// the trailer) is recorded in *handle. The outcome is stored in r->status;
+// r->offset advances only if both appends succeed. "block" is reset before
+// returning.
+// REQUIRES: ok()
+void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) {
+ // File format contains a sequence of blocks where each block has:
+ // block_data: uint8[n]
+ // type: uint8
+ // crc: uint32
+ assert(ok());
+ Rep* r = rep_;
+ Slice raw = block->Finish();
+
+ Slice block_contents;
+ CompressionType type = r->options.compression;
+ // TODO(postrelease): Support more compression options: zlib?
+ switch (type) {
+ case kNoCompression:
+ block_contents = raw;
+ break;
+
+ case kLightweightCompression: {
+ port::Lightweight_Compress(raw.data(), raw.size(), &r->compressed_output);
+ block_contents = r->compressed_output;
+ if (block_contents.size() >= raw.size() - (raw.size() / 8u)) {
+ // Compressed less than 12.5%, so just store uncompressed form
+ block_contents = raw;
+ type = kNoCompression;
+ }
+ break;
+ }
+ }
+ handle->set_offset(r->offset);
+ handle->set_size(block_contents.size());
+ r->status = r->file->Append(block_contents);
+ if (r->status.ok()) {
+ // Trailer: the type actually stored, then the masked crc32c of the
+ // stored contents extended over the type byte.
+ char trailer[kBlockTrailerSize];
+ trailer[0] = type;
+ uint32_t crc = crc32c::Value(block_contents.data(), block_contents.size());
+ crc = crc32c::Extend(crc, trailer, 1); // Extend crc to cover block type
+ EncodeFixed32(trailer+1, crc32c::Mask(crc));
+ r->status = r->file->Append(Slice(trailer, kBlockTrailerSize));
+ if (r->status.ok()) {
+ r->offset += block_contents.size() + kBlockTrailerSize;
+ }
+ }
+ // block_contents may point into compressed_output; it is not used past
+ // this point, so the scratch buffer can be cleared.
+ r->compressed_output.clear();
+ block->Reset();
+}
+
+// Return the builder's current status (the result of the most recent file
+// operation recorded in rep_->status).
+Status TableBuilder::status() const {
+ return rep_->status;
+}
+
+// Finish building the table: flush any buffered data block, then write the
+// metaindex block, the index block and finally the footer. Each step is
+// skipped once the builder is in an error state. Marks the builder closed
+// and returns the final status.
+// REQUIRES: Finish(), Abandon() have not been called
+Status TableBuilder::Finish() {
+ Rep* r = rep_;
+ Flush();
+ assert(!r->closed);
+ r->closed = true;
+ BlockHandle metaindex_block_handle;
+ BlockHandle index_block_handle;
+ if (ok()) {
+ // The metaindex block is currently written with no entries.
+ BlockBuilder meta_index_block(&r->options);
+ // TODO(postrelease): Add stats and other meta blocks
+ WriteBlock(&meta_index_block, &metaindex_block_handle);
+ }
+ if (ok()) {
+ if (r->pending_index_entry) {
+ // Emit the index entry for the last data block; there is no following
+ // key, so use a short successor of the last key instead of a separator.
+ r->options.comparator->FindShortSuccessor(&r->last_key);
+ std::string handle_encoding;
+ r->pending_handle.EncodeTo(&handle_encoding);
+ r->index_block.Add(r->last_key, Slice(handle_encoding));
+ r->pending_index_entry = false;
+ }
+ WriteBlock(&r->index_block, &index_block_handle);
+ }
+ if (ok()) {
+ Footer footer;
+ footer.set_metaindex_handle(metaindex_block_handle);
+ footer.set_index_handle(index_block_handle);
+ std::string footer_encoding;
+ footer.EncodeTo(&footer_encoding);
+ r->status = r->file->Append(footer_encoding);
+ if (r->status.ok()) {
+ r->offset += footer_encoding.size();
+ }
+ }
+ return r->status;
+}
+
+// Give up on this table: mark the builder closed without writing anything
+// further to the file.
+// REQUIRES: Finish(), Abandon() have not been called
+void TableBuilder::Abandon() {
+  assert(!rep_->closed);
+  rep_->closed = true;
+}
+
+// Number of key/value pairs added via Add() so far.
+uint64_t TableBuilder::NumEntries() const {
+ return rep_->num_entries;
+}
+
+// Number of bytes successfully appended to the file so far. After a
+// successful Finish() this includes the footer.
+uint64_t TableBuilder::FileSize() const {
+ return rep_->offset;
+}
+
+}
diff --git a/table/table_test.cc b/table/table_test.cc
new file mode 100644
index 0000000..f4bd7c7
--- /dev/null
+++ b/table/table_test.cc
@@ -0,0 +1,808 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/table.h"
+
+#include <map>
+#include "db/dbformat.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "include/db.h"
+#include "include/env.h"
+#include "include/iterator.h"
+#include "include/table_builder.h"
+#include "table/block.h"
+#include "table/block_builder.h"
+#include "table/format.h"
+#include "util/random.h"
+#include "util/testharness.h"
+#include "util/testutil.h"
+
+namespace leveldb {
+
+// Return a copy of "key" with its bytes in reverse order.
+// Used to test non-lexicographic comparators.
+static std::string Reverse(const Slice& key) {
+  const std::string forward = key.ToString();
+  return std::string(forward.rbegin(), forward.rend());
+}
+
+namespace {
+// Comparator that orders keys by the bytewise order of their reversed
+// bytes. Every operation reverses its inputs, delegates to
+// BytewiseComparator(), and reverses the result back.
+class ReverseKeyComparator : public Comparator {
+ public:
+  virtual const char* Name() const {
+    return "leveldb.ReverseBytewiseComparator";
+  }
+
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    const std::string reversed_a = Reverse(a);
+    const std::string reversed_b = Reverse(b);
+    return BytewiseComparator()->Compare(reversed_a, reversed_b);
+  }
+
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+    std::string reversed_start = Reverse(*start);
+    const std::string reversed_limit = Reverse(limit);
+    BytewiseComparator()->FindShortestSeparator(&reversed_start,
+                                                reversed_limit);
+    *start = Reverse(reversed_start);
+  }
+
+  virtual void FindShortSuccessor(std::string* key) const {
+    std::string reversed_key = Reverse(*key);
+    BytewiseComparator()->FindShortSuccessor(&reversed_key);
+    *key = Reverse(reversed_key);
+  }
+};
+}
+static ReverseKeyComparator reverse_key_comparator;
+
+// Modify *key by adding a '\0' byte: at the end for the bytewise
+// comparator, at the front (i.e. at the end of the reversed key) for the
+// reverse-key comparator. Only those two comparators are supported.
+static void Increment(const Comparator* cmp, std::string* key) {
+  if (cmp == BytewiseComparator()) {
+    key->push_back('\0');
+    return;
+  }
+  assert(cmp == &reverse_key_comparator);
+  std::string reversed = Reverse(*key);
+  reversed.push_back('\0');
+  *key = Reverse(reversed);
+}
+
+// An STL-style strict-weak-ordering functor backed by a Comparator.
+// Defaults to BytewiseComparator() when no comparator is supplied.
+namespace {
+struct STLLessThan {
+  const Comparator* cmp;
+
+  STLLessThan() : cmp(BytewiseComparator()) { }
+  STLLessThan(const Comparator* c) : cmp(c) { }
+
+  // True iff "a" orders strictly before "b" under cmp.
+  bool operator()(const std::string& a, const std::string& b) const {
+    const int order = cmp->Compare(Slice(a), Slice(b));
+    return order < 0;
+  }
+};
+}
+
+// WritableFile that accumulates everything appended to it in an in-memory
+// string. Close, Flush and Sync are no-ops that report success.
+class StringSink: public WritableFile {
+ public:
+  ~StringSink() { }
+
+  // Everything appended so far.
+  const std::string& contents() const { return contents_; }
+
+  virtual Status Append(const Slice& data) {
+    contents_.append(data.data(), data.size());
+    return Status::OK();
+  }
+
+  virtual Status Close() { return Status::OK(); }
+  virtual Status Flush() { return Status::OK(); }
+  virtual Status Sync() { return Status::OK(); }
+
+ private:
+  std::string contents_;
+};
+
+
+// RandomAccessFile that serves reads from a private in-memory copy of the
+// contents supplied at construction time.
+class StringSource: public RandomAccessFile {
+ public:
+  StringSource(const Slice& contents)
+      : contents_(contents.data(), contents.size()) {
+  }
+
+  virtual ~StringSource() { }
+
+  virtual uint64_t Size() const { return contents_.size(); }
+
+  // Copy up to n bytes starting at "offset" into scratch and point *result
+  // at them. Reads that run past the end are truncated; an offset beyond
+  // the end is rejected with InvalidArgument.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    const size_t total = contents_.size();
+    if (offset > total) {
+      return Status::InvalidArgument("invalid Read offset");
+    }
+    if (offset + n > total) {
+      n = total - offset;
+    }
+    memcpy(scratch, &contents_[offset], n);
+    *result = Slice(scratch, n);
+    return Status::OK();
+  }
+
+ private:
+  std::string contents_;
+};
+
+typedef std::map<std::string, std::string, STLLessThan> KVMap;
+
+// Helper class for tests to unify the interface between
+// BlockBuilder/TableBuilder and Block/Table. Keys and values are collected
+// in a sorted map and handed to the concrete subclass when Finish() is
+// called.
+class Constructor {
+ public:
+  explicit Constructor(const Comparator* cmp) : data_(STLLessThan(cmp)) { }
+  virtual ~Constructor() { }
+
+  // Record key -> value, overwriting any earlier value for the same key.
+  void Add(const std::string& key, const Slice& value) {
+    data_[key] = value.ToString();
+  }
+
+  // Finish constructing the data structure with all the keys that have
+  // been added so far. Returns the keys in sorted order in "*keys"
+  // and stores the key/value pairs in "*kvmap". The internal map is
+  // emptied before the subclass builds its structure from "*kvmap".
+  void Finish(const Options& options,
+              std::vector<std::string>* keys,
+              KVMap* kvmap) {
+    *kvmap = data_;
+    keys->clear();
+    KVMap::const_iterator pos = data_.begin();
+    while (pos != data_.end()) {
+      keys->push_back(pos->first);
+      ++pos;
+    }
+    data_.clear();
+    Status s = FinishImpl(options, *kvmap);
+    ASSERT_TRUE(s.ok()) << s.ToString();
+  }
+
+  // Construct the data structure from the data in "data"
+  virtual Status FinishImpl(const Options& options, const KVMap& data) = 0;
+
+  // Size in bytes of the constructed data structure.
+  virtual size_t NumBytes() const = 0;
+
+  // New iterator over the constructed data structure.
+  virtual Iterator* NewIterator() const = 0;
+
+  virtual const KVMap& data() { return data_; }
+
+ private:
+  KVMap data_;
+};
+
+// Constructor that builds a single Block with BlockBuilder and iterates
+// over it directly.
+class BlockConstructor: public Constructor {
+ public:
+  explicit BlockConstructor(const Comparator* cmp)
+      : Constructor(cmp),
+        comparator_(cmp),
+        block_size_(-1),
+        block_(NULL) { }
+  ~BlockConstructor() {
+    delete block_;
+  }
+
+  // Build a fresh block from "data", replacing any previous block. The
+  // builder's output is copied into a heap buffer that is handed to the
+  // new Block.
+  virtual Status FinishImpl(const Options& options, const KVMap& data) {
+    delete block_;
+    block_ = NULL;
+    BlockBuilder builder(&options);
+
+    KVMap::const_iterator entry = data.begin();
+    while (entry != data.end()) {
+      builder.Add(entry->first, entry->second);
+      ++entry;
+    }
+
+    // Open the block
+    Slice finished = builder.Finish();
+    block_size_ = finished.size();
+    char* buffer = new char[block_size_];
+    memcpy(buffer, finished.data(), block_size_);
+    block_ = new Block(buffer, block_size_);
+    return Status::OK();
+  }
+
+  virtual size_t NumBytes() const { return block_size_; }
+
+  virtual Iterator* NewIterator() const {
+    return block_->NewIterator(comparator_);
+  }
+
+ private:
+  const Comparator* comparator_;
+  int block_size_;
+  Block* block_;
+
+  BlockConstructor();
+};
+
+// Constructor that writes a table into an in-memory StringSink with
+// TableBuilder and then reopens it from a StringSource with Table::Open.
+class TableConstructor: public Constructor {
+ public:
+  // Marked explicit, like the constructors of the other Constructor
+  // subclasses in this file, so that a Comparator pointer is never
+  // silently converted into a TableConstructor.
+  explicit TableConstructor(const Comparator* cmp)
+      : Constructor(cmp),
+        source_(NULL), table_(NULL) {
+  }
+  ~TableConstructor() {
+    Reset();
+  }
+
+  // Build a table from "data", replacing any previously built table, and
+  // open it. Returns the status of Table::Open.
+  virtual Status FinishImpl(const Options& options, const KVMap& data) {
+    Reset();
+    StringSink sink;
+    TableBuilder builder(options, &sink);
+
+    for (KVMap::const_iterator it = data.begin();
+         it != data.end();
+         ++it) {
+      builder.Add(it->first, it->second);
+      ASSERT_TRUE(builder.status().ok());
+    }
+    Status s = builder.Finish();
+    ASSERT_TRUE(s.ok()) << s.ToString();
+
+    // The builder's notion of the file size must match what was written.
+    ASSERT_EQ(sink.contents().size(), builder.FileSize());
+
+    // Open the table. Only the comparator is carried over from "options".
+    source_ = new StringSource(sink.contents());
+    Options table_options;
+    table_options.comparator = options.comparator;
+    return Table::Open(table_options, source_, &table_);
+  }
+
+  // Size in bytes of the serialized table.
+  virtual size_t NumBytes() const { return source_->Size(); }
+
+  virtual Iterator* NewIterator() const {
+    return table_->NewIterator(ReadOptions());
+  }
+
+  uint64_t ApproximateOffsetOf(const Slice& key) const {
+    return table_->ApproximateOffsetOf(key);
+  }
+
+ private:
+  // Destroy the current table and its backing source, if any.
+  void Reset() {
+    delete table_;
+    delete source_;
+    table_ = NULL;
+    source_ = NULL;
+  }
+
+  StringSource* source_;
+  Table* table_;
+
+  TableConstructor();
+};
+
+// A helper class that converts internal format keys into user keys.
+// It owns the wrapped iterator and deletes it on destruction.
+class KeyConvertingIterator: public Iterator {
+ public:
+  explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { }
+  virtual ~KeyConvertingIterator() { delete iter_; }
+
+  virtual bool Valid() const { return iter_->Valid(); }
+
+  // Seek using an internal key built from the user key "target" with the
+  // maximum sequence number and value type.
+  virtual void Seek(const Slice& target) {
+    std::string internal_target;
+    ParsedInternalKey parsed(target, kMaxSequenceNumber, kTypeValue);
+    AppendInternalKey(&internal_target, parsed);
+    iter_->Seek(internal_target);
+  }
+  virtual void SeekToFirst() { iter_->SeekToFirst(); }
+  virtual void SeekToLast() { iter_->SeekToLast(); }
+  virtual void Next() { iter_->Next(); }
+  virtual void Prev() { iter_->Prev(); }
+
+  // User-key portion of the current internal key. If the internal key
+  // cannot be parsed, a Corruption status is recorded and a placeholder
+  // key is returned.
+  virtual Slice key() const {
+    assert(Valid());
+    ParsedInternalKey parsed;
+    if (ParseInternalKey(iter_->key(), &parsed)) {
+      return parsed.user_key;
+    }
+    status_ = Status::Corruption("malformed internal key");
+    return Slice("corrupted key");
+  }
+
+  virtual Slice value() const { return iter_->value(); }
+
+  // A recorded corruption takes precedence over the wrapped iterator's
+  // status.
+  virtual Status status() const {
+    if (!status_.ok()) {
+      return status_;
+    }
+    return iter_->status();
+  }
+
+ private:
+  mutable Status status_;
+  Iterator* iter_;
+
+  // No copying allowed
+  KeyConvertingIterator(const KeyConvertingIterator&);
+  void operator=(const KeyConvertingIterator&);
+};
+
+// Constructor that stores the data in a MemTable and exposes it through a
+// KeyConvertingIterator so that callers see user keys.
+class MemTableConstructor: public Constructor {
+ public:
+  explicit MemTableConstructor(const Comparator* cmp)
+      : Constructor(cmp),
+        internal_comparator_(cmp) {
+    memtable_ = new MemTable(internal_comparator_);
+  }
+  ~MemTableConstructor() {
+    delete memtable_;
+  }
+
+  // Replace the memtable with a fresh one holding "data". Entries are
+  // added as values with sequence numbers 1, 2, 3, ... in map order.
+  virtual Status FinishImpl(const Options& options, const KVMap& data) {
+    delete memtable_;
+    memtable_ = new MemTable(internal_comparator_);
+    int seq = 1;
+    KVMap::const_iterator entry = data.begin();
+    while (entry != data.end()) {
+      memtable_->Add(seq, kTypeValue, entry->first, entry->second);
+      ++seq;
+      ++entry;
+    }
+    return Status::OK();
+  }
+
+  virtual size_t NumBytes() const {
+    return memtable_->ApproximateMemoryUsage();
+  }
+
+  virtual Iterator* NewIterator() const {
+    Iterator* internal_iter = memtable_->NewIterator();
+    return new KeyConvertingIterator(internal_iter);
+  }
+
+ private:
+  InternalKeyComparator internal_comparator_;
+  MemTable* memtable_;
+};
+
+// Constructor that stores the data in a real DB created under the test
+// temporary directory.
+class DBConstructor: public Constructor {
+ public:
+  explicit DBConstructor(const Comparator* cmp)
+      : Constructor(cmp),
+        comparator_(cmp),
+        db_(NULL) {
+    NewDB();
+  }
+  ~DBConstructor() {
+    delete db_;
+  }
+
+  // Recreate the database from scratch and write every entry of "data"
+  // to it, one WriteBatch per entry.
+  virtual Status FinishImpl(const Options& options, const KVMap& data) {
+    delete db_;
+    db_ = NULL;
+    NewDB();
+    KVMap::const_iterator entry = data.begin();
+    while (entry != data.end()) {
+      WriteBatch batch;
+      batch.Put(entry->first, entry->second);
+      ASSERT_TRUE(db_->Write(WriteOptions(), &batch).ok());
+      ++entry;
+    }
+    return Status::OK();
+  }
+
+  // Approximate size of the key range ["", "\xff\xff") as reported by
+  // the DB.
+  virtual size_t NumBytes() const {
+    Range r("", "\xff\xff");
+    uint64_t size;
+    db_->GetApproximateSizes(&r, 1, &size);
+    return size;
+  }
+
+  virtual Iterator* NewIterator() const {
+    return db_->NewIterator(ReadOptions());
+  }
+
+ private:
+  // Destroy any existing database with the test name, then create and
+  // open a new empty one using comparator_.
+  void NewDB() {
+    const std::string name = test::TmpDir() + "/table_testdb";
+
+    Options options;
+    options.comparator = comparator_;
+    Status status = DestroyDB(name, options);
+    ASSERT_TRUE(status.ok()) << status.ToString();
+
+    options.create_if_missing = true;
+    options.error_if_exists = true;
+    status = DB::Open(options, name, &db_);
+    ASSERT_TRUE(status.ok()) << status.ToString();
+  }
+
+  const Comparator* comparator_;
+  DB* db_;
+};
+
+// Which kind of Constructor a test run exercises; Harness::Init maps each
+// value to TableConstructor, BlockConstructor, MemTableConstructor or
+// DBConstructor respectively.
+enum TestType {
+ TABLE_TEST,
+ BLOCK_TEST,
+ MEMTABLE_TEST,
+ DB_TEST,
+};
+
+// One test configuration.
+struct TestArgs {
+ TestType type; // Data structure under test
+ bool reverse_compare; // Use reverse_key_comparator instead of the default
+ int restart_interval; // Value for Options::block_restart_interval
+};
+
+// All configurations to run: { type, reverse_compare, restart_interval }.
+static const TestArgs kTestArgList[] = {
+ { TABLE_TEST, false, 16 },
+ { TABLE_TEST, false, 1 },
+ { TABLE_TEST, false, 1024 },
+ { TABLE_TEST, true, 16 },
+ { TABLE_TEST, true, 1 },
+ { TABLE_TEST, true, 1024 },
+
+ { BLOCK_TEST, false, 16 },
+ { BLOCK_TEST, false, 1 },
+ { BLOCK_TEST, false, 1024 },
+ { BLOCK_TEST, true, 16 },
+ { BLOCK_TEST, true, 1 },
+ { BLOCK_TEST, true, 1024 },
+
+ // Restart interval does not matter for memtables
+ { MEMTABLE_TEST, false, 16 },
+ { MEMTABLE_TEST, true, 16 },
+
+ // Do not bother with restart interval variations for DB
+ { DB_TEST, false, 16 },
+ { DB_TEST, true, 16 },
+};
+// Number of entries in kTestArgList.
+static const int kNumTestArgs = sizeof(kTestArgList) / sizeof(kTestArgList[0]);
+
+class Harness {  // Test fixture: builds a constructor object per TestArgs and checks its iterator against a KVMap model.
+ public:
+ Harness() : constructor_(NULL) { }  // No constructor object until Init() is called.
+ // (Re)configures the fixture for "args": drops any previous constructor, resets options_, then creates the constructor selected by args.type.
+ void Init(const TestArgs& args) {
+ delete constructor_;
+ constructor_ = NULL;
+ options_ = Options();
+
+ options_.block_restart_interval = args.restart_interval;
+ // Use shorter block size for tests to exercise block boundary
+ // conditions more.
+ options_.block_size = 256;
+ if (args.reverse_compare) {  // Otherwise the comparator of the freshly assigned Options() is kept.
+ options_.comparator = &reverse_key_comparator;
+ }
+ switch (args.type) {
+ case TABLE_TEST:
+ constructor_ = new TableConstructor(options_.comparator);
+ break;
+ case BLOCK_TEST:
+ constructor_ = new BlockConstructor(options_.comparator);
+ break;
+ case MEMTABLE_TEST:
+ constructor_ = new MemTableConstructor(options_.comparator);
+ break;
+ case DB_TEST:
+ constructor_ = new DBConstructor(options_.comparator);
+ break;
+ }
+ }
+
+ ~Harness() {
+ delete constructor_;
+ }
+ // Forwards one key/value pair to the current constructor object.
+ void Add(const std::string& key, const std::string& value) {
+ constructor_->Add(key, value);
+ }
+ // Calls Finish() on the constructor, then runs the forward, backward and random-access checks.
+ void Test(Random* rnd) {
+ std::vector<std::string> keys;
+ KVMap data;
+ constructor_->Finish(options_, &keys, &data);  // NOTE(review): Finish() is defined outside this view; presumably it fills "keys" and "data" -- confirm.
+
+ TestForwardScan(keys, data);
+ TestBackwardScan(keys, data);
+ TestRandomAccess(rnd, keys, data);
+ }
+ // Walks the iterator first-to-last and requires every position to match the model map "data". "keys" is not used here.
+ void TestForwardScan(const std::vector<std::string>& keys,
+ const KVMap& data) {
+ Iterator* iter = constructor_->NewIterator();
+ ASSERT_TRUE(!iter->Valid());  // A new iterator must be invalid until it is positioned.
+ iter->SeekToFirst();
+ for (KVMap::const_iterator model_iter = data.begin();
+ model_iter != data.end();
+ ++model_iter) {
+ ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+ iter->Next();
+ }
+ ASSERT_TRUE(!iter->Valid());  // Stepping past the last entry must invalidate the iterator.
+ delete iter;
+ }
+ // Walks the iterator last-to-first and requires every position to match the model map in reverse order. "keys" is not used here.
+ void TestBackwardScan(const std::vector<std::string>& keys,
+ const KVMap& data) {
+ Iterator* iter = constructor_->NewIterator();
+ ASSERT_TRUE(!iter->Valid());
+ iter->SeekToLast();
+ for (KVMap::const_reverse_iterator model_iter = data.rbegin();
+ model_iter != data.rend();
+ ++model_iter) {
+ ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+ iter->Prev();
+ }
+ ASSERT_TRUE(!iter->Valid());  // Stepping before the first entry must invalidate the iterator.
+ delete iter;
+ }
+ // Applies 200 randomly chosen iterator operations, mirrors each one on model_iter, and compares both positions after every step.
+ void TestRandomAccess(Random* rnd,
+ const std::vector<std::string>& keys,
+ const KVMap& data) {
+ static const bool kVerbose = false;  // Set to true to log each operation to stderr.
+ Iterator* iter = constructor_->NewIterator();
+ ASSERT_TRUE(!iter->Valid());
+ KVMap::const_iterator model_iter = data.begin();
+ if (kVerbose) fprintf(stderr, "---\n");
+ for (int i = 0; i < 200; i++) {
+ const int toss = rnd->Uniform(5);  // Picks one of the five cases below.
+ switch (toss) {
+ case 0: {  // Next(), only attempted while the iterator is valid.
+ if (iter->Valid()) {
+ if (kVerbose) fprintf(stderr, "Next\n");
+ iter->Next();
+ ++model_iter;
+ ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+ }
+ break;
+ }
+
+ case 1: {  // SeekToFirst()
+ if (kVerbose) fprintf(stderr, "SeekToFirst\n");
+ iter->SeekToFirst();
+ model_iter = data.begin();
+ ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+ break;
+ }
+
+ case 2: {  // Seek(key): the model uses lower_bound, i.e. the first entry not ordered before key.
+ std::string key = PickRandomKey(rnd, keys);
+ model_iter = data.lower_bound(key);
+ if (kVerbose) fprintf(stderr, "Seek '%s'\n",
+ EscapeString(key).c_str());
+ iter->Seek(Slice(key));
+ ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+ break;
+ }
+
+ case 3: {  // Prev(), only attempted while the iterator is valid.
+ if (iter->Valid()) {
+ if (kVerbose) fprintf(stderr, "Prev\n");
+ iter->Prev();
+ if (model_iter == data.begin()) {
+ model_iter = data.end(); // Wrap around to invalid value
+ } else {
+ --model_iter;
+ }
+ ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+ }
+ break;
+ }
+
+ case 4: {  // SeekToLast()
+ if (kVerbose) fprintf(stderr, "SeekToLast\n");
+ iter->SeekToLast();
+ if (keys.empty()) {
+ model_iter = data.end();
+ } else {
+ std::string last = data.rbegin()->first;
+ model_iter = data.lower_bound(last);  // Forward iterator positioned on the last entry of data.
+ }
+ ASSERT_EQ(ToString(data, model_iter), ToString(iter));
+ break;
+ }
+ }
+ }
+ delete iter;
+ }
+ // Renders a forward model position as 'key->value', or "END" when it == data.end().
+ std::string ToString(const KVMap& data, const KVMap::const_iterator& it) {
+ if (it == data.end()) {
+ return "END";
+ } else {
+ return "'" + it->first + "->" + it->second + "'";
+ }
+ }
+ // Renders a reverse model position as 'key->value', or "END" when it == data.rend().
+ std::string ToString(const KVMap& data,
+ const KVMap::const_reverse_iterator& it) {
+ if (it == data.rend()) {
+ return "END";
+ } else {
+ return "'" + it->first + "->" + it->second + "'";
+ }
+ }
+ // Renders the iterator under test in the same format; "END" when it is not Valid().
+ std::string ToString(const Iterator* it) {
+ if (!it->Valid()) {
+ return "END";
+ } else {
+ return "'" + it->key().ToString() + "->" + it->value().ToString() + "'";
+ }
+ }
+ // Returns "foo" when keys is empty; otherwise a randomly chosen key, returned unchanged, with its last byte decremented, or passed through Increment().
+ std::string PickRandomKey(Random* rnd, const std::vector<std::string>& keys) {
+ if (keys.empty()) {
+ return "foo";
+ } else {
+ const int index = rnd->Uniform(keys.size());
+ std::string result = keys[index];
+ switch (rnd->Uniform(3)) {
+ case 0:
+ // Return an existing key
+ break;
+ case 1: {
+ // Attempt to return something smaller than an existing key
+ if (result.size() > 0 && result[result.size()-1] > '\0') {  // Left unchanged for an empty key or when the last char is not greater than '\0'.
+ result[result.size()-1]--;
+ }
+ break;
+ }
+ case 2: {
+ // Return something larger than an existing key
+ Increment(options_.comparator, &result);  // NOTE(review): Increment() is defined outside this view -- confirm its exact contract.
+ break;
+ }
+ }
+ return result;
+ }
+ }
+
+ private:
+ Options options_;  // Reset by Init(); passed to the constructor's Finish() in Test().
+ Constructor* constructor_;  // Deleted in Init() and in the destructor; NULL until Init() runs.
+};
+
+// Test the empty key
+TEST(Harness, SimpleEmptyKey) {  // A single entry whose key is the empty string.
+ for (int i = 0; i < kNumTestArgs; i++) {  // Repeat for every configuration in kTestArgList.
+ Init(kTestArgList[i]);
+ Random rnd(test::RandomSeed() + 1);  // Base seed plus a per-test offset.
+ Add("", "v");
+ Test(&rnd);
+ }
+}
+
+TEST(Harness, SimpleSingle) {  // A single ordinary entry.
+ for (int i = 0; i < kNumTestArgs; i++) {  // Repeat for every configuration in kTestArgList.
+ Init(kTestArgList[i]);
+ Random rnd(test::RandomSeed() + 2);  // Base seed plus a per-test offset.
+ Add("abc", "v");
+ Test(&rnd);
+ }
+}
+
+TEST(Harness, SimpleMulti) {  // Three entries, two of which share the prefix "abc".
+ for (int i = 0; i < kNumTestArgs; i++) {  // Repeat for every configuration in kTestArgList.
+ Init(kTestArgList[i]);
+ Random rnd(test::RandomSeed() + 3);  // Base seed plus a per-test offset.
+ Add("abc", "v");
+ Add("abcd", "v");
+ Add("ac", "v2");
+ Test(&rnd);
+ }
+}
+
+TEST(Harness, SimpleSpecialKey) {  // A single entry whose key consists of two 0xff bytes.
+ for (int i = 0; i < kNumTestArgs; i++) {  // Repeat for every configuration in kTestArgList.
+ Init(kTestArgList[i]);
+ Random rnd(test::RandomSeed() + 4);  // Base seed plus a per-test offset.
+ Add("\xff\xff", "v3");
+ Test(&rnd);
+ }
+}
+
+TEST(Harness, Randomized) {  // Random keys/values at a range of entry counts, for every configuration.
+ for (int i = 0; i < kNumTestArgs; i++) {
+ Init(kTestArgList[i]);  // Called once per configuration, not once per num_entries round.
+ Random rnd(test::RandomSeed() + 5);  // Base seed plus a per-test offset.
+ for (int num_entries = 0; num_entries < 2000;
+ num_entries += (num_entries < 50 ? 1 : 200)) {  // Step by 1 while below 50, then by 200.
+ if ((num_entries % 10) == 0) {  // Progress output on stderr.
+ fprintf(stderr, "case %d of %d: num_entries = %d\n",
+ (i + 1), int(kNumTestArgs), num_entries);
+ }
+ for (int e = 0; e < num_entries; e++) {
+ std::string v;  // Scratch buffer handed to test::RandomString().
+ Add(test::RandomKey(&rnd, rnd.Skewed(4)),
+ test::RandomString(&rnd, rnd.Skewed(5), &v).ToString());
+ }
+ Test(&rnd);
+ }
+ }
+}
+
+class MemTableTest { };  // Empty type named as the first argument of TEST(MemTableTest, ...).
+
+TEST(MemTableTest, Simple) {  // Inserts a four-entry WriteBatch into a MemTable and prints what the iterator yields.
+ InternalKeyComparator cmp(BytewiseComparator());
+ MemTable memtable(cmp);
+ WriteBatch batch;
+ WriteBatchInternal::SetSequence(&batch, 100);
+ batch.Put(std::string("k1"), std::string("v1"));
+ batch.Put(std::string("k2"), std::string("v2"));
+ batch.Put(std::string("k3"), std::string("v3"));
+ batch.Put(std::string("largekey"), std::string("vlarge"));
+ ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &memtable).ok());
+ // The loop below only prints entries to stderr; it asserts nothing about their content or order.
+ Iterator* iter = memtable.NewIterator();
+ iter->SeekToFirst();
+ while (iter->Valid()) {
+ fprintf(stderr, "key: '%s' -> '%s'\n",
+ iter->key().ToString().c_str(),
+ iter->value().ToString().c_str());
+ iter->Next();
+ }
+
+ delete iter;
+}
+
+// Returns true iff low <= val <= high. When the check fails, the value and
+// the expected range are printed to stderr before returning false.
+static bool Between(uint64_t val, uint64_t low, uint64_t high) {
+  if (val >= low && val <= high) {
+    return true;
+  }
+  fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
+          (unsigned long long)(val),
+          (unsigned long long)(low),
+          (unsigned long long)(high));
+  return false;
+}
+
+class TableTest { };  // Empty type named as the first argument of TEST(TableTest, ...).
+
+TEST(TableTest, ApproximateOffsetOfPlain) {  // Checks ApproximateOffsetOf() ranges for an uncompressed table.
+ TableConstructor c(BytewiseComparator());
+ c.Add("k01", "hello");
+ c.Add("k02", "hello2");
+ c.Add("k03", std::string(10000, 'x'));
+ c.Add("k04", std::string(200000, 'x'));
+ c.Add("k05", std::string(300000, 'x'));
+ c.Add("k06", "hello3");
+ c.Add("k07", std::string(100000, 'x'));
+ std::vector<std::string> keys;
+ KVMap kvmap;
+ Options options;
+ options.block_size = 1024;
+ options.compression = kNoCompression;  // The ranges asserted below are based on the raw value sizes added above.
+ c.Finish(options, &keys, &kvmap);
+ // Each range is [sum of the large value sizes added before the key, that sum + 1000].
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01a"), 0, 0));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 0, 0));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 10000, 11000));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04a"), 210000, 211000));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k05"), 210000, 211000));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k06"), 510000, 511000));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k07"), 510000, 511000));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 610000, 611000));
+
+}
+
+TEST(TableTest, ApproximateOffsetOfCompressed) {  // Same idea as the plain variant, but with kLightweightCompression.
+#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_CHROMIUM)
+ // Compression not supported yet, so skip this test.
+ // TODO(sanjay) Reenable after compression support is added
+ return;
+#endif
+ // Everything below is unreachable when either platform macro above is defined.
+ Random rnd(301);
+ TableConstructor c(BytewiseComparator());
+ std::string tmp;  // Scratch buffer handed to test::CompressibleString().
+ c.Add("k01", "hello");
+ c.Add("k02", test::CompressibleString(&rnd, 0.25, 10000, &tmp));
+ c.Add("k03", "hello3");
+ c.Add("k04", test::CompressibleString(&rnd, 0.25, 10000, &tmp));
+ std::vector<std::string> keys;
+ KVMap kvmap;
+ Options options;
+ options.block_size = 1024;
+ options.compression = kLightweightCompression;
+ c.Finish(options, &keys, &kvmap);
+ // Expected offsets are well below the 10000-byte raw value sizes added above.
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3000));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3000));
+ ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 6000));
+}
+
+}
+
+int main(int argc, char** argv) {  // argc/argv are unused.
+ return leveldb::test::RunAllTests();  // Its result becomes the process exit status.
+}
diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc
new file mode 100644
index 0000000..9b081f4
--- /dev/null
+++ b/table/two_level_iterator.cc
@@ -0,0 +1,182 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "table/two_level_iterator.h"
+
+#include "include/table.h"
+#include "table/block.h"
+#include "table/format.h"
+#include "table/iterator_wrapper.h"
+
+namespace leveldb {
+
+namespace {
+
+typedef Iterator* (*BlockFunction)(void*, const ReadOptions&, const Slice&);  // Called as (arg, options, index_value); returns the iterator for that block.
+
+class TwoLevelIterator: public Iterator {  // Iterates over the entries of the blocks that an index iterator's values refer to.
+ public:
+ TwoLevelIterator(
+ Iterator* index_iter,
+ BlockFunction block_function,
+ void* arg,
+ const ReadOptions& options);
+
+ virtual ~TwoLevelIterator();
+ // Positioning operations; each leaves data_iter_ on a valid entry or with no underlying iterator.
+ virtual void Seek(const Slice& target);
+ virtual void SeekToFirst();
+ virtual void SeekToLast();
+ virtual void Next();
+ virtual void Prev();
+ // Validity, key and value are all delegated to the current data-block iterator.
+ virtual bool Valid() const {
+ return data_iter_.Valid();
+ }
+ virtual Slice key() const {
+ assert(Valid());
+ return data_iter_.key();
+ }
+ virtual Slice value() const {
+ assert(Valid());
+ return data_iter_.value();
+ }
+ virtual Status status() const {  // Precedence: index iterator error, then current data iterator error, then the saved status_.
+ // It'd be nice if status() returned a const Status& instead of a Status
+ if (!index_iter_.status().ok()) {
+ return index_iter_.status();
+ } else if (data_iter_.iter() != NULL && !data_iter_.status().ok()) {
+ return data_iter_.status();
+ } else {
+ return status_;
+ }
+ }
+
+ private:
+ void SaveError(const Status& s) {  // Keeps only the first non-OK status seen.
+ if (status_.ok() && !s.ok()) status_ = s;
+ }
+ void SkipEmptyDataBlocksForward();
+ void SkipEmptyDataBlocksBackward();
+ void SetDataIterator(Iterator* data_iter);
+ void InitDataBlock();
+
+ BlockFunction block_function_;  // Creates a data-block iterator from an index value.
+ void* arg_;  // Passed as the first argument of block_function_.
+ const ReadOptions options_;  // Passed as the second argument of block_function_.
+ Status status_;  // First error saved from a data iterator that was replaced.
+ IteratorWrapper index_iter_;
+ IteratorWrapper data_iter_; // May be NULL
+ // If data_iter_ is non-NULL, then "data_block_handle_" holds the
+ // "index_value" passed to block_function_ to create the data_iter_.
+ std::string data_block_handle_;
+};
+
+TwoLevelIterator::TwoLevelIterator(  // Stores the arguments; no data block is opened until a Seek*() call.
+ Iterator* index_iter,
+ BlockFunction block_function,
+ void* arg,
+ const ReadOptions& options)
+ : block_function_(block_function),
+ arg_(arg),
+ options_(options),
+ index_iter_(index_iter),
+ data_iter_(NULL) {  // Starts without a data-block iterator.
+}
+
+TwoLevelIterator::~TwoLevelIterator() {  // No explicit work; members are destroyed by their own destructors.
+}
+
+void TwoLevelIterator::Seek(const Slice& target) {  // Seeks the index, opens that block, seeks within it, then skips forward past empty/exhausted blocks.
+ index_iter_.Seek(target);
+ InitDataBlock();
+ if (data_iter_.iter() != NULL) data_iter_.Seek(target);  // No data iterator exists if the index seek ended up invalid.
+ SkipEmptyDataBlocksForward();
+}
+
+void TwoLevelIterator::SeekToFirst() {  // Positions on the first entry of the first non-empty block, if any.
+ index_iter_.SeekToFirst();
+ InitDataBlock();
+ if (data_iter_.iter() != NULL) data_iter_.SeekToFirst();
+ SkipEmptyDataBlocksForward();
+}
+
+void TwoLevelIterator::SeekToLast() {  // Positions on the last entry of the last non-empty block, if any.
+ index_iter_.SeekToLast();
+ InitDataBlock();
+ if (data_iter_.iter() != NULL) data_iter_.SeekToLast();
+ SkipEmptyDataBlocksBackward();
+}
+
+void TwoLevelIterator::Next() {  // Requires Valid(); advances within the block, then moves to later blocks if this one is exhausted.
+ assert(Valid());
+ data_iter_.Next();
+ SkipEmptyDataBlocksForward();
+}
+
+void TwoLevelIterator::Prev() {  // Requires Valid(); steps back within the block, then moves to earlier blocks if this one is exhausted.
+ assert(Valid());
+ data_iter_.Prev();
+ SkipEmptyDataBlocksBackward();
+}
+
+
+void TwoLevelIterator::SkipEmptyDataBlocksForward() {  // Advances block by block until data_iter_ is valid or the index is exhausted.
+ while (data_iter_.iter() == NULL || !data_iter_.Valid()) {
+ // Move to next block
+ if (!index_iter_.Valid()) {  // No more blocks: drop the data iterator so Valid() reports false.
+ SetDataIterator(NULL);
+ return;
+ }
+ index_iter_.Next();
+ InitDataBlock();
+ if (data_iter_.iter() != NULL) data_iter_.SeekToFirst();  // Enter the new block at its first entry.
+ }
+}
+
+void TwoLevelIterator::SkipEmptyDataBlocksBackward() {  // Steps back block by block until data_iter_ is valid or the index is exhausted.
+ while (data_iter_.iter() == NULL || !data_iter_.Valid()) {
+ // Move to previous block
+ if (!index_iter_.Valid()) {  // No more blocks: drop the data iterator so Valid() reports false.
+ SetDataIterator(NULL);
+ return;
+ }
+ index_iter_.Prev();
+ InitDataBlock();
+ if (data_iter_.iter() != NULL) data_iter_.SeekToLast();  // Enter the new block at its last entry.
+ }
+}
+
+void TwoLevelIterator::SetDataIterator(Iterator* data_iter) {  // Replaces the data iterator; data_iter may be NULL.
+ if (data_iter_.iter() != NULL) SaveError(data_iter_.status());  // Preserve the outgoing iterator's error before it is replaced.
+ data_iter_.Set(data_iter);
+}
+
+// Makes data_iter_ refer to the block named by the current index entry.
+// With an invalid index iterator the data iterator is cleared. If the
+// existing data iterator was already created from the same index value it
+// is kept as is.
+void TwoLevelIterator::InitDataBlock() {
+  if (!index_iter_.Valid()) {
+    SetDataIterator(NULL);
+    return;
+  }
+
+  Slice handle = index_iter_.value();
+  const bool same_block =
+      data_iter_.iter() != NULL && handle.compare(data_block_handle_) == 0;
+  if (same_block) {
+    // data_iter_ was built from this very handle; nothing to change.
+    return;
+  }
+
+  Iterator* block_iter = (*block_function_)(arg_, options_, handle);
+  // Remember which handle the new iterator was created from.
+  data_block_handle_.assign(handle.data(), handle.size());
+  SetDataIterator(block_iter);
+}
+
+}
+
+Iterator* NewTwoLevelIterator(  // Factory: returns a heap-allocated TwoLevelIterator built from the arguments.
+ Iterator* index_iter,
+ BlockFunction block_function,
+ void* arg,
+ const ReadOptions& options) {
+ return new TwoLevelIterator(index_iter, block_function, arg, options);
+}
+
+}
diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h
new file mode 100644
index 0000000..57e439c
--- /dev/null
+++ b/table/two_level_iterator.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_
+#define STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_
+
+#include "include/iterator.h"
+
+namespace leveldb {
+
+struct ReadOptions;
+
+// Return a new two level iterator. A two-level iterator contains an
+// index iterator whose values point to a sequence of blocks where
+// each block is itself a sequence of key,value pairs. The returned
+// two-level iterator yields the concatenation of all key/value pairs
+// in the sequence of blocks. Takes ownership of "index_iter" and
+// will delete it when no longer needed.
+//
+// Uses a supplied function to convert an index_iter value into
+// an iterator over the contents of the corresponding block. "arg" and "options" are passed through to every block_function call.
+extern Iterator* NewTwoLevelIterator(
+ Iterator* index_iter,
+ Iterator* (*block_function)(
+ void* arg,
+ const ReadOptions& options,
+ const Slice& index_value),
+ void* arg,
+ const ReadOptions& options);
+
+}
+
+#endif // STORAGE_LEVELDB_TABLE_TWO_LEVEL_ITERATOR_H_
diff --git a/util/arena.cc b/util/arena.cc
new file mode 100644
index 0000000..4bf6e36
--- /dev/null
+++ b/util/arena.cc
@@ -0,0 +1,68 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/arena.h"
+#include <assert.h>
+
+namespace leveldb {
+
+static const int kBlockSize = 4096;  // Size in bytes of a standard arena block; requests above a quarter of it get a dedicated block.
+
+// Creates an empty arena. No block exists yet, so the first allocation
+// request allocates one.
+Arena::Arena()
+    : alloc_ptr_(NULL),
+      alloc_bytes_remaining_(0),
+      blocks_memory_(0) {
+}
+
+// Frees every block recorded in blocks_ (all were allocated with new[]).
+Arena::~Arena() {
+  // Use size_t for the index: blocks_.size() is unsigned, and comparing it
+  // with an int mixes signed and unsigned operands.
+  for (size_t i = 0; i < blocks_.size(); i++) {
+    delete[] blocks_[i];
+  }
+}
+
+// Allocation path that always obtains a new block and returns "bytes" bytes
+// from it.
+char* Arena::AllocateFallback(size_t bytes) {
+  const bool needs_own_block = bytes > kBlockSize / 4;
+  if (needs_own_block) {
+    // Object is more than a quarter of our block size. Allocate it separately
+    // to avoid wasting too much space in leftover bytes. The current block
+    // stays in use for later small requests.
+    return AllocateNewBlock(bytes);
+  }
+
+  // Start a fresh standard block; the space remaining in the current block
+  // is wasted.
+  char* const block_start = AllocateNewBlock(kBlockSize);
+  alloc_ptr_ = block_start + bytes;
+  alloc_bytes_remaining_ = kBlockSize - bytes;
+  return block_start;
+}
+
+char* Arena::AllocateAligned(size_t bytes) {  // Returns "bytes" bytes whose address is a multiple of sizeof(void*).
+ const int align = sizeof(void*); // We'll align to pointer size
+ assert((align & (align-1)) == 0); // Pointer size should be a power of 2
+ size_t current_mod = reinterpret_cast<uintptr_t>(alloc_ptr_) & (align-1);  // alloc_ptr_ modulo align.
+ size_t slop = (current_mod == 0 ? 0 : align - current_mod);  // Padding needed to reach the next aligned address.
+ size_t needed = bytes + slop;
+ char* result;
+ if (needed <= alloc_bytes_remaining_) {  // Fits in the current block including the padding.
+ result = alloc_ptr_ + slop;
+ alloc_ptr_ += needed;
+ alloc_bytes_remaining_ -= needed;
+ } else {
+ // AllocateFallback always returns the start of a freshly new[]-allocated block, which is assumed to be aligned (checked by the assert below)
+ result = AllocateFallback(bytes);
+ }
+ assert((reinterpret_cast<uintptr_t>(result) & (align-1)) == 0);
+ return result;
+}
+
+char* Arena::AllocateNewBlock(size_t block_bytes) {  // Allocates a block of block_bytes and records it for release in ~Arena().
+ char* result = new char[block_bytes];
+ blocks_memory_ += block_bytes;  // Feeds MemoryUsage().
+ blocks_.push_back(result);
+ return result;
+}
+
+}
diff --git a/util/arena.h b/util/arena.h
new file mode 100644
index 0000000..fcb5d5b
--- /dev/null
+++ b/util/arena.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_UTIL_ARENA_H_
+#define STORAGE_LEVELDB_UTIL_ARENA_H_
+
+#include <cstddef>
+#include <vector>
+#include <assert.h>
+#include <stdint.h>
+
+namespace leveldb {
+
+class Arena {  // Allocator that hands out pieces of larger new[] blocks and frees all of them in its destructor.
+ public:
+ Arena();
+ ~Arena();
+
+ // Return a pointer to a newly allocated memory block of "bytes" bytes.
+ char* Allocate(size_t bytes);  // "bytes" must be > 0 (asserted in the inline definition).
+
+ // Allocate memory with the normal alignment guarantees provided by malloc
+ char* AllocateAligned(size_t bytes);  // NOTE(review): the implementation aligns to sizeof(void*) -- confirm this is sufficient for all callers.
+
+ // Returns an estimate of the total memory usage of data allocated
+ // by the arena (including space allocated but not yet used for user
+ // allocations).
+ size_t MemoryUsage() const {
+ return blocks_memory_ + blocks_.capacity() * sizeof(char*);  // Block bytes plus the pointer array's own capacity.
+ }
+
+ private:
+ char* AllocateFallback(size_t bytes);  // Path taken when the current block cannot satisfy a request.
+ char* AllocateNewBlock(size_t block_bytes);  // new[]s a block and records it in blocks_.
+
+ // Allocation state
+ char* alloc_ptr_;  // Next free byte in the current block; NULL before the first block is allocated.
+ size_t alloc_bytes_remaining_;  // Bytes still available at alloc_ptr_.
+
+ // Array of new[] allocated memory blocks
+ std::vector<char*> blocks_;
+
+ // Bytes of memory in blocks allocated so far
+ size_t blocks_memory_;
+
+ // No copying allowed
+ Arena(const Arena&);
+ void operator=(const Arena&);
+};
+
+// Fast path: carve "bytes" bytes off the current block when it has room,
+// otherwise defer to AllocateFallback().
+inline char* Arena::Allocate(size_t bytes) {
+  // The semantics of what to return are a bit messy if we allow
+  // 0-byte allocations, so we disallow them here (we don't need
+  // them for our internal use).
+  assert(bytes > 0);
+  if (bytes > alloc_bytes_remaining_) {
+    return AllocateFallback(bytes);
+  }
+  char* const start = alloc_ptr_;
+  alloc_ptr_ = start + bytes;
+  alloc_bytes_remaining_ -= bytes;
+  return start;
+}
+
+}
+
+#endif // STORAGE_LEVELDB_UTIL_ARENA_H_
diff --git a/util/arena_test.cc b/util/arena_test.cc
new file mode 100644
index 0000000..c33b552
--- /dev/null
+++ b/util/arena_test.cc
@@ -0,0 +1,68 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/arena.h"
+
+#include "util/random.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+class ArenaTest { };  // Empty type named as the first argument of TEST(ArenaTest, ...).
+
+TEST(ArenaTest, Empty) {  // Constructing and destroying an unused Arena must work.
+ Arena arena;
+}
+
+// Performs N allocations of mostly small, occasionally large sizes, fills
+// each with a pattern derived from its index, and checks that
+//  - MemoryUsage() never drops below the bytes requested so far,
+//  - after the first N/10 allocations it stays within 10% of that total,
+//  - no allocation was overwritten by a later one.
+TEST(ArenaTest, Simple) {
+  std::vector<std::pair<size_t, char*> > allocated;
+  Arena arena;
+  const int N = 100000;
+  size_t bytes = 0;
+  Random rnd(301);
+  for (int i = 0; i < N; i++) {
+    size_t s;
+    if (i % (N / 10) == 0) {
+      // Every (N/10)th allocation uses its own index as the size.
+      s = i;
+    } else {
+      s = rnd.OneIn(4000) ? rnd.Uniform(6000) :
+          (rnd.OneIn(10) ? rnd.Uniform(100) : rnd.Uniform(20));
+    }
+    if (s == 0) {
+      // Our arena disallows size 0 allocations.
+      s = 1;
+    }
+    char* r;
+    if (rnd.OneIn(10)) {
+      r = arena.AllocateAligned(s);
+    } else {
+      r = arena.Allocate(s);
+    }
+
+    // size_t counter: "s" is a size_t, so an int counter would mix signed
+    // and unsigned operands in the comparison.
+    for (size_t b = 0; b < s; b++) {
+      // Fill the "i"th allocation with a known bit pattern
+      r[b] = i % 256;
+    }
+    bytes += s;
+    allocated.push_back(std::make_pair(s, r));
+    ASSERT_GE(arena.MemoryUsage(), bytes);
+    if (i > N/10) {
+      ASSERT_LE(arena.MemoryUsage(), bytes * 1.10);
+    }
+  }
+  for (size_t i = 0; i < allocated.size(); i++) {
+    size_t num_bytes = allocated[i].first;
+    const char* p = allocated[i].second;
+    for (size_t b = 0; b < num_bytes; b++) {
+      // Check the "i"th allocation for the known bit pattern
+      ASSERT_EQ(int(p[b]) & 0xff, static_cast<int>(i % 256));
+    }
+  }
+}
+
+}
+
+int main(int argc, char** argv) {  // argc/argv are unused.
+ return leveldb::test::RunAllTests();  // Its result becomes the process exit status.
+}
diff --git a/util/cache.cc b/util/cache.cc
new file mode 100644
index 0000000..958de66
--- /dev/null
+++ b/util/cache.cc
@@ -0,0 +1,253 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#if defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID)
+#include <unordered_set>
+#elif defined(LEVELDB_PLATFORM_CHROMIUM)
+#include "base/hash_tables.h"
+#else
+#include <hash_set> // TODO(sanjay): Switch to unordered_set when possible.
+#endif
+
+#include <assert.h>
+
+#include "include/cache.h"
+#include "port/port.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+
+namespace leveldb {
+
+Cache::~Cache() {  // Out-of-line definition of the Cache destructor; it does nothing itself.
+}
+
+namespace {
+
+// LRU cache implementation
+
+// An entry is a variable length heap-allocated structure. Entries
+// are kept in a circular doubly linked list ordered by access time.
+struct LRUHandle {  // One cache entry; Insert() mallocs it with room for the key after the fixed fields.
+ void* value;  // Cached value, or (for a temporary lookup handle) a pointer to the Slice being looked up.
+ void (*deleter)(const Slice&, void* value);  // Called with (key, value) when the last reference is dropped.
+ LRUHandle* next;  // LRU list link; next == this marks a temporary lookup handle (see key()).
+ LRUHandle* prev;  // LRU list link.
+ size_t charge; // TODO(opt): Only allow uint32_t?
+ size_t key_length;  // Number of key bytes stored at key_data.
+ size_t refs; // TODO(opt): Pack with "key_length"?
+ char key_data[1]; // Beginning of key
+ // Returns the entry's key; used by the hash and equality functors of the handle table.
+ Slice key() const {
+ // For cheaper lookups, we allow a temporary Handle object
+ // to store a pointer to a key in "value".
+ if (next == this) {
+ return *(reinterpret_cast<Slice*>(value));
+ } else {
+ return Slice(key_data, key_length);
+ }
+ }
+};
+
+// Pick a platform specific hash_set instantiation; every variant hashes and compares LRUHandle pointers by their key().
+#if defined(LEVELDB_PLATFORM_CHROMIUM) && defined(OS_WIN)
+ // Microsoft's hash_set deviates from the standard. See
+ // http://msdn.microsoft.com/en-us/library/1t4xas78(v=vs.80).aspx
+ // for details. Basically the 2 param () operator is a less than and
+ // the 1 param () operator is a hash function.
+ struct HandleHashCompare : public stdext::hash_compare<LRUHandle*> {
+ size_t operator() (LRUHandle* h) const {  // Hash of the handle's key bytes.
+ Slice k = h->key();
+ return Hash(k.data(), k.size(), 0);
+ }
+ bool operator() (LRUHandle* a, LRUHandle* b) const {  // Less-than on the keys.
+ return a->key().compare(b->key()) < 0;
+ }
+ };
+ typedef base::hash_set<LRUHandle*, HandleHashCompare> HandleTable;
+#else
+ struct HandleHash {  // Hash of the handle's key bytes.
+ inline size_t operator()(LRUHandle* h) const {
+ Slice k = h->key();
+ return Hash(k.data(), k.size(), 0);
+ }
+ };
+
+ struct HandleEq {  // Two handles are equal when their keys are equal.
+ inline bool operator()(LRUHandle* a, LRUHandle* b) const {
+ return a->key() == b->key();
+ }
+ };
+# if defined(LEVELDB_PLATFORM_CHROMIUM)
+ typedef base::hash_set<LRUHandle*, HandleHash, HandleEq> HandleTable;
+# elif defined(LEVELDB_PLATFORM_POSIX) || defined(LEVELDB_PLATFORM_ANDROID)
+ typedef std::unordered_set<LRUHandle*, HandleHash, HandleEq> HandleTable;
+# else
+ typedef __gnu_cxx::hash_set<LRUHandle*, HandleHash, HandleEq> HandleTable;
+# endif
+#endif
+
+class LRUCache : public Cache {  // Cache implementation that evicts the least recently used entries when usage_ exceeds capacity_.
+ public:
+ explicit LRUCache(size_t capacity);
+ virtual ~LRUCache();
+
+ virtual Handle* Insert(const Slice& key, void* value, size_t charge,
+ void (*deleter)(const Slice& key, void* value));
+ virtual Handle* Lookup(const Slice& key);
+ virtual void Release(Handle* handle);
+ virtual void* Value(Handle* handle);
+ virtual void Erase(const Slice& key);
+ virtual uint64_t NewId();
+
+ private:
+ void LRU_Remove(LRUHandle* e);  // Unlinks e from the LRU list.
+ void LRU_Append(LRUHandle* e);  // Links e in as the newest entry.
+ void Unref(LRUHandle* e);  // Drops one reference; destroys e when none remain.
+
+ // Constructor parameters
+ const size_t capacity_;  // Total charge allowed before Insert() starts evicting.
+
+ // mutex_ protects the following state.
+ port::Mutex mutex_;
+ size_t usage_;  // Sum of the charges of entries that have not been destroyed yet.
+ uint64_t last_id_;  // Last value returned by NewId().
+
+ // Dummy head of LRU list.
+ // lru.prev is newest entry, lru.next is oldest entry.
+ LRUHandle lru_;
+
+ HandleTable table_;  // Key -> entry index used by Lookup(), Insert() and Erase().
+};
+
+LRUCache::LRUCache(size_t capacity)  // Creates an empty cache with the given capacity.
+ : capacity_(capacity),
+ usage_(0),
+ last_id_(0) {
+ // Make empty circular linked list
+ lru_.next = &lru_;
+ lru_.prev = &lru_;  // Only the list links of the dummy head are initialized.
+}
+
+LRUCache::~LRUCache() {  // Drops the cache's own reference on every entry still in the LRU list.
+ table_.clear();
+ for (LRUHandle* e = lru_.next; e != &lru_; ) {
+ LRUHandle* next = e->next;  // Read before Unref(), which may free e.
+ assert(e->refs == 1); // Error if caller has an unreleased handle
+ Unref(e);
+ e = next;
+ }
+}
+
+// Drops one reference to "e". When that was the last reference, the entry's
+// charge is subtracted from usage_, its deleter is invoked with the key and
+// value, and the entry's memory is freed.
+void LRUCache::Unref(LRUHandle* e) {
+  assert(e->refs > 0);
+  e->refs -= 1;
+  if (e->refs > 0) {
+    return;  // Still referenced elsewhere.
+  }
+  usage_ -= e->charge;
+  (*e->deleter)(e->key(), e->value);
+  free(e);
+}
+
+void LRUCache::LRU_Remove(LRUHandle* e) {  // Unlinks e by joining its neighbours; e's own links are left untouched.
+ e->next->prev = e->prev;
+ e->prev->next = e->next;
+}
+
+void LRUCache::LRU_Append(LRUHandle* e) {  // e must not currently be linked into the list.
+ // Make "e" newest entry by inserting just before lru_
+ e->next = &lru_;
+ e->prev = lru_.prev;
+ e->prev->next = e;
+ e->next->prev = e;
+}
+
+// Returns a handle for the entry stored under "key", or NULL when there is
+// none. A returned handle holds one extra reference on the entry, and the
+// entry becomes the newest in the LRU list.
+Cache::Handle* LRUCache::Lookup(const Slice& key) {
+  MutexLock l(&mutex_);
+
+  // Temporary probe handle: next == self makes LRUHandle::key() return the
+  // Slice that "value" points to instead of reading key_data.
+  LRUHandle probe;
+  probe.next = &probe;
+  probe.value = const_cast<Slice*>(&key);
+
+  HandleTable::iterator pos = table_.find(&probe);
+  if (pos == table_.end()) {
+    return NULL;
+  }
+
+  LRUHandle* e = const_cast<LRUHandle*>(*pos);
+  e->refs++;
+  LRU_Remove(e);
+  LRU_Append(e);
+  return reinterpret_cast<Handle*>(e);
+}
+
+void* LRUCache::Value(Handle* handle) {  // Returns the value stored in the entry; mutex_ is not taken here.
+ return reinterpret_cast<LRUHandle*>(handle)->value;
+}
+
+void LRUCache::Release(Handle* handle) {  // Gives back the reference held by a handle from Insert() or Lookup().
+ MutexLock l(&mutex_);
+ Unref(reinterpret_cast<LRUHandle*>(handle));  // Destroys the entry if this was its last reference.
+}
+
+Cache::Handle* LRUCache::Insert(const Slice& key, void* value, size_t charge,  // Adds key->value (replacing any existing entry for key) and returns a handle holding one reference.
+ void (*deleter)(const Slice& key, void* value)) {
+ MutexLock l(&mutex_);
+
+ LRUHandle* e = reinterpret_cast<LRUHandle*>(
+ malloc(sizeof(LRUHandle)-1 + key.size()));  // -1: key_data[1] already provides one key byte. NOTE(review): the malloc result is not checked for NULL.
+ e->value = value;
+ e->deleter = deleter;
+ e->charge = charge;
+ e->key_length = key.size();
+ e->refs = 2; // One from LRUCache, one for the returned handle
+ memcpy(e->key_data, key.data(), key.size());
+ LRU_Append(e);
+ usage_ += charge;
+
+ std::pair<HandleTable::iterator,bool> p = table_.insert(e);
+ if (!p.second) {  // An entry with an equal key is already in the table.
+ // Kill existing entry
+ LRUHandle* old = const_cast<LRUHandle*>(*(p.first));
+ LRU_Remove(old);
+ table_.erase(p.first);
+ table_.insert(e);
+ Unref(old);  // Drops the cache's reference; old survives while callers still hold handles to it.
+ }
+ // Evict from the oldest end until usage fits the capacity or the list is empty; this can include the entry just inserted.
+ while (usage_ > capacity_ && lru_.next != &lru_) {
+ LRUHandle* old = lru_.next;
+ LRU_Remove(old);
+ table_.erase(old);
+ Unref(old);
+ }
+
+ return reinterpret_cast<Handle*>(e);
+}
+
+void LRUCache::Erase(const Slice& key) {  // Removes the entry for key from the cache, if present; a no-op otherwise.
+ MutexLock l(&mutex_);
+ // Temporary probe handle: next == self makes LRUHandle::key() return the Slice that "value" points to.
+ LRUHandle dummy;
+ dummy.next = &dummy;
+ dummy.value = const_cast<Slice*>(&key);
+ HandleTable::iterator iter = table_.find(&dummy);
+ if (iter != table_.end()) {
+ LRUHandle* e = const_cast<LRUHandle*>(*iter);
+ LRU_Remove(e);
+ table_.erase(iter);
+ Unref(e);  // Drops the cache's reference; the deleter runs only once no handle references e.
+ }
+}
+
+uint64_t LRUCache::NewId() {  // Returns the next value of a counter that starts at 0, so the first id is 1.
+ MutexLock l(&mutex_);
+ return ++(last_id_);
+}
+
+} // end anonymous namespace
+
+Cache* NewLRUCache(size_t capacity) {  // Factory: returns a heap-allocated LRUCache with the given capacity.
+ return new LRUCache(capacity);
+}
+
+}
diff --git a/util/cache_test.cc b/util/cache_test.cc
new file mode 100644
index 0000000..05de5d9
--- /dev/null
+++ b/util/cache_test.cc
@@ -0,0 +1,169 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/cache.h"
+
+#include <vector>
+#include "util/coding.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+// Conversions between numeric keys/values and the types expected by Cache.
+static std::string EncodeKey(int k) {  // Encodes k with PutFixed32() into a string usable as a cache key.
+ std::string result;
+ PutFixed32(&result, k);
+ return result;
+}
+static int DecodeKey(const Slice& k) {  // Inverse of EncodeKey(); expects exactly 4 bytes.
+ assert(k.size() == 4);
+ return DecodeFixed32(k.data());
+}
+static void* EncodeValue(uintptr_t v) { return reinterpret_cast<void*>(v); }  // Stores an integer directly in the cache's void* value slot.
+static int DecodeValue(void* v) { return reinterpret_cast<uintptr_t>(v); }  // Inverse of EncodeValue(); the result is narrowed to int.
+
+class CacheTest {  // Fixture owning a small LRU cache and recording every key/value passed to the deleter.
+ public:
+ static CacheTest* current_;  // Set by the constructor so the static Deleter() can reach the live fixture.
+
+ static void Deleter(const Slice& key, void* v) {  // Records the decoded key and value of each destroyed entry.
+ current_->deleted_keys_.push_back(DecodeKey(key));
+ current_->deleted_values_.push_back(DecodeValue(v));
+ }
+
+ static const int kCacheSize = 100;  // Capacity passed to NewLRUCache().
+ std::vector<int> deleted_keys_;
+ std::vector<int> deleted_values_;
+ Cache* cache_;
+
+ CacheTest() : cache_(NewLRUCache(kCacheSize)) {
+ current_ = this;
+ }
+
+ ~CacheTest() {
+ delete cache_;
+ }
+ // Returns the value stored under key, or -1 on a miss; the handle is released before returning.
+ int Lookup(int key) {
+ Cache::Handle* handle = cache_->Lookup(EncodeKey(key));
+ const int r = (handle == NULL) ? -1 : DecodeValue(cache_->Value(handle));
+ if (handle != NULL) {
+ cache_->Release(handle);
+ }
+ return r;
+ }
+ // Inserts key->value with the given charge and immediately releases the returned handle.
+ void Insert(int key, int value, int charge = 1) {
+ cache_->Release(cache_->Insert(EncodeKey(key), EncodeValue(value), charge,
+ &CacheTest::Deleter));
+ }
+
+ void Erase(int key) {
+ cache_->Erase(EncodeKey(key));
+ }
+};
+CacheTest* CacheTest::current_;  // Definition of the static member declared in CacheTest.
+
+TEST(CacheTest, HitAndMiss) {  // Lookups hit only inserted keys; re-inserting a key replaces its value.
+ ASSERT_EQ(-1, Lookup(100));
+
+ Insert(100, 101);
+ ASSERT_EQ(101, Lookup(100));
+ ASSERT_EQ(-1, Lookup(200));
+ ASSERT_EQ(-1, Lookup(300));
+
+ Insert(200, 201);
+ ASSERT_EQ(101, Lookup(100));
+ ASSERT_EQ(201, Lookup(200));
+ ASSERT_EQ(-1, Lookup(300));
+
+ Insert(100, 102);  // Replaces the entry for key 100.
+ ASSERT_EQ(102, Lookup(100));
+ ASSERT_EQ(201, Lookup(200));
+ ASSERT_EQ(-1, Lookup(300));
+ // The replaced entry (100 -> 101) must be the only one handed to the deleter.
+ ASSERT_EQ(1, deleted_keys_.size());
+ ASSERT_EQ(100, deleted_keys_[0]);
+ ASSERT_EQ(101, deleted_values_[0]);
+}
+
+TEST(CacheTest, Erase) {  // Erase removes only the named key and is harmless for absent keys.
+ Erase(200);  // Erasing from an empty cache deletes nothing.
+ ASSERT_EQ(0, deleted_keys_.size());
+
+ Insert(100, 101);
+ Insert(200, 201);
+ Erase(100);
+ ASSERT_EQ(-1, Lookup(100));
+ ASSERT_EQ(201, Lookup(200));
+ ASSERT_EQ(1, deleted_keys_.size());
+ ASSERT_EQ(100, deleted_keys_[0]);
+ ASSERT_EQ(101, deleted_values_[0]);
+
+ Erase(100);  // A second erase of the same key must not call the deleter again.
+ ASSERT_EQ(-1, Lookup(100));
+ ASSERT_EQ(201, Lookup(200));
+ ASSERT_EQ(1, deleted_keys_.size());
+}
+
+TEST(CacheTest, EntriesArePinned) {  // An entry is not destroyed while a handle to it is outstanding.
+ Insert(100, 101);
+ Cache::Handle* h1 = cache_->Lookup(EncodeKey(100));
+ ASSERT_EQ(101, DecodeValue(cache_->Value(h1)));
+
+ Insert(100, 102);  // Replaces the entry, but h1 still references the old one.
+ Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
+ ASSERT_EQ(102, DecodeValue(cache_->Value(h2)));
+ ASSERT_EQ(0, deleted_keys_.size());
+
+ cache_->Release(h1);  // Last reference to the old entry: the deleter runs now.
+ ASSERT_EQ(1, deleted_keys_.size());
+ ASSERT_EQ(100, deleted_keys_[0]);
+ ASSERT_EQ(101, deleted_values_[0]);
+
+ Erase(100);  // Removes the new entry from the cache, but h2 keeps it alive.
+ ASSERT_EQ(-1, Lookup(100));
+ ASSERT_EQ(1, deleted_keys_.size());
+
+ cache_->Release(h2);  // Last reference to the new entry.
+ ASSERT_EQ(2, deleted_keys_.size());
+ ASSERT_EQ(100, deleted_keys_[1]);
+ ASSERT_EQ(102, deleted_values_[1]);
+}
+
+TEST(CacheTest, EvictionPolicy) {  // A key that keeps being looked up survives; the untouched key 200 is evicted first.
+ Insert(100, 101);
+ Insert(200, 201);
+
+ // Frequently used entry must be kept around
+ for (int i = 0; i < kCacheSize; i++) {
+ Insert(1000+i, 2000+i);
+ ASSERT_EQ(2000+i, Lookup(1000+i));
+ ASSERT_EQ(101, Lookup(100));  // Each lookup makes key 100 the most recently used entry again.
+ }
+ ASSERT_EQ(101, Lookup(100));
+ ASSERT_EQ(2, deleted_keys_.size());
+ ASSERT_EQ(200, deleted_keys_[0]);
+ ASSERT_EQ(201, deleted_values_[0]);
+}
+
+TEST(CacheTest, HeavyEntry) {  // An entry charged the full capacity pushes out the existing light entry.
+ Insert(100, 101);
+ Insert(200, 201, kCacheSize);  // Usage becomes 1 + kCacheSize, which exceeds the capacity.
+ ASSERT_EQ(1, deleted_keys_.size());
+ ASSERT_EQ(100, deleted_keys_[0]);
+ ASSERT_EQ(101, deleted_values_[0]);
+}
+
+TEST(CacheTest, NewId) {  // Two consecutive ids must differ.
+ uint64_t a = cache_->NewId();
+ uint64_t b = cache_->NewId();
+ ASSERT_NE(a, b);
+}
+
+}
+
+int main(int argc, char** argv) {  // argc/argv are unused.
+ return leveldb::test::RunAllTests();  // Its result becomes the process exit status.
+}
diff --git a/util/coding.cc b/util/coding.cc
new file mode 100644
index 0000000..680e2ad
--- /dev/null
+++ b/util/coding.cc
@@ -0,0 +1,194 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/coding.h"
+
+namespace leveldb {
+
+// Writes "value" to buf[0..3] with the least-significant byte first.
+// REQUIRES: buf has room for 4 bytes.
+void EncodeFixed32(char* buf, uint32_t value) {
+  // Choose the fast path with port::kLittleEndian, the same switch the
+  // DecodeFixed32/DecodeFixed64 routines in coding.h use. A preprocessor test
+  // on __BYTE_ORDER treats undefined macros as 0 == 0 and would then select
+  // memcpy on every platform, including big-endian ones.
+  if (port::kLittleEndian) {
+    memcpy(buf, &value, sizeof(value));
+  } else {
+    buf[0] = value & 0xff;
+    buf[1] = (value >> 8) & 0xff;
+    buf[2] = (value >> 16) & 0xff;
+    buf[3] = (value >> 24) & 0xff;
+  }
+}
+
+// Writes "value" to buf[0..7] with the least-significant byte first.
+// REQUIRES: buf has room for 8 bytes.
+void EncodeFixed64(char* buf, uint64_t value) {
+  // Choose the fast path with port::kLittleEndian, the same switch the
+  // DecodeFixed32/DecodeFixed64 routines in coding.h use. A preprocessor test
+  // on __BYTE_ORDER treats undefined macros as 0 == 0 and would then select
+  // memcpy on every platform, including big-endian ones.
+  if (port::kLittleEndian) {
+    memcpy(buf, &value, sizeof(value));
+  } else {
+    buf[0] = value & 0xff;
+    buf[1] = (value >> 8) & 0xff;
+    buf[2] = (value >> 16) & 0xff;
+    buf[3] = (value >> 24) & 0xff;
+    buf[4] = (value >> 32) & 0xff;
+    buf[5] = (value >> 40) & 0xff;
+    buf[6] = (value >> 48) & 0xff;
+    buf[7] = (value >> 56) & 0xff;
+  }
+}
+
+// Appends the 4-byte fixed-width encoding of "value" (as produced by
+// EncodeFixed32) to *dst.
+void PutFixed32(std::string* dst, uint32_t value) {
+  char encoded[sizeof(uint32_t)];
+  EncodeFixed32(encoded, value);
+  dst->append(encoded, sizeof(encoded));
+}
+
+// Appends the 8-byte fixed-width encoding of "value" (as produced by
+// EncodeFixed64) to *dst.
+void PutFixed64(std::string* dst, uint64_t value) {
+  char encoded[sizeof(uint64_t)];
+  EncodeFixed64(encoded, value);
+  dst->append(encoded, sizeof(encoded));
+}
+
+// Writes the varint encoding of v starting at dst and returns a pointer just
+// past the last byte written. Each output byte carries 7 bits of v, lowest
+// group first; the high bit of a byte is set when another byte follows.
+// REQUIRES: dst has room for up to 5 bytes.
+char* EncodeVarint32(char* dst, uint32_t v) {
+  // Operate on characters as unsigneds
+  unsigned char* out = reinterpret_cast<unsigned char*>(dst);
+  const uint32_t kMore = 0x80;
+  while (v >= kMore) {
+    // Truncation to one byte keeps the low 7 payload bits plus the flag.
+    *out++ = static_cast<unsigned char>(v | kMore);
+    v >>= 7;
+  }
+  *out++ = static_cast<unsigned char>(v);
+  return reinterpret_cast<char*>(out);
+}
+
+// Appends the varint encoding of v to *dst.
+void PutVarint32(std::string* dst, uint32_t v) {
+  char scratch[5];  // EncodeVarint32 emits at most 5 bytes
+  const char* const end = EncodeVarint32(scratch, v);
+  dst->append(scratch, end - scratch);
+}
+
+// 64-bit counterpart of EncodeVarint32: emits 7 bits of v per byte, lowest
+// group first, setting the high bit on every byte except the last. Returns a
+// pointer just past the last byte written.
+// REQUIRES: dst has room for up to 10 bytes.
+char* EncodeVarint64(char* dst, uint64_t v) {
+  const uint64_t kMore = 0x80;
+  unsigned char* out = reinterpret_cast<unsigned char*>(dst);
+  for (; v >= kMore; v >>= 7) {
+    *out++ = static_cast<unsigned char>((v & 0x7f) | kMore);
+  }
+  *out++ = static_cast<unsigned char>(v);
+  return reinterpret_cast<char*>(out);
+}
+
+// Appends the varint encoding of v to *dst.
+void PutVarint64(std::string* dst, uint64_t v) {
+  char scratch[10];  // EncodeVarint64 emits at most 10 bytes
+  const char* const end = EncodeVarint64(scratch, v);
+  dst->append(scratch, end - scratch);
+}
+
+// Appends value.size() as a varint32 followed by the bytes of "value" to *dst.
+// NOTE(review): the size is passed to a uint32_t parameter; a slice larger
+// than 32 bits can express would presumably have its length truncated.
+void PutLengthPrefixedSlice(std::string* dst, const Slice& value) {
+  const char* const bytes = value.data();
+  PutVarint32(dst, value.size());
+  dst->append(bytes, value.size());
+}
+
+// Returns the number of bytes the varint encoding of v occupies: one byte
+// plus one more for every additional 7 bits needed.
+int VarintLength(uint64_t v) {
+  int bytes = 1;
+  for (; v >= 0x80; v >>= 7) {
+    ++bytes;
+  }
+  return bytes;
+}
+
+// General varint32 decoder. Reads at most 5 bytes from [p, limit); on success
+// stores the decoded number in *value and returns a pointer just past it.
+// Returns NULL if the input ends, or 5 bytes are consumed, before a byte
+// without the continuation bit is seen.
+const char* GetVarint32PtrFallback(const char* p,
+                                   const char* limit,
+                                   uint32_t* value) {
+  uint32_t acc = 0;
+  uint32_t shift = 0;
+  while (shift <= 28 && p < limit) {
+    const uint32_t b = static_cast<unsigned char>(*p);
+    ++p;
+    if ((b & 0x80) == 0) {
+      // Final byte: merge it in and report success.
+      *value = acc | (b << shift);
+      return p;
+    }
+    // More bytes are present
+    acc |= (b & 0x7f) << shift;
+    shift += 7;
+  }
+  return NULL;
+}
+
+// Parses a varint32 from the front of *input. On success stores it in *value,
+// advances *input past the parsed bytes and returns true; on failure returns
+// false and leaves *input untouched.
+bool GetVarint32(Slice* input, uint32_t* value) {
+  const char* const begin = input->data();
+  const char* const end = begin + input->size();
+  const char* const next = GetVarint32Ptr(begin, end, value);
+  if (next == NULL) {
+    return false;
+  }
+  *input = Slice(next, end - next);
+  return true;
+}
+
+// Decodes a varint64 from [p, limit). On success stores the number in *value
+// and returns a pointer just past it. Returns NULL if the input ends, or 10
+// bytes are consumed, before a byte without the continuation bit is seen.
+const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) {
+  uint64_t acc = 0;
+  uint32_t shift = 0;
+  while (shift <= 63 && p < limit) {
+    const uint64_t b = static_cast<unsigned char>(*p);
+    ++p;
+    if ((b & 0x80) == 0) {
+      // Final byte: merge it in and report success.
+      *value = acc | (b << shift);
+      return p;
+    }
+    // More bytes are present
+    acc |= (b & 0x7f) << shift;
+    shift += 7;
+  }
+  return NULL;
+}
+
+// Parses a varint64 from the front of *input. On success stores it in *value,
+// advances *input past the parsed bytes and returns true; on failure returns
+// false and leaves *input untouched.
+bool GetVarint64(Slice* input, uint64_t* value) {
+  const char* const begin = input->data();
+  const char* const end = begin + input->size();
+  const char* const next = GetVarint64Ptr(begin, end, value);
+  if (next == NULL) {
+    return false;
+  }
+  *input = Slice(next, end - next);
+  return true;
+}
+
+// Parses a varint32 length followed by that many bytes from [p, limit).
+// On success points *result at the payload (no copy is made) and returns a
+// pointer just past it. Returns NULL if the length cannot be parsed or the
+// payload would extend beyond limit.
+const char* GetLengthPrefixedSlice(const char* p, const char* limit,
+                                   Slice* result) {
+  uint32_t len;
+  p = GetVarint32Ptr(p, limit, &len);
+  if (p == NULL) return NULL;
+  // Compare against the bytes remaining instead of forming "p + len": with a
+  // corrupt length that pointer would lie beyond the buffer, which is
+  // undefined behaviour and may wrap around so that the check passes.
+  if (len > static_cast<size_t>(limit - p)) return NULL;
+  *result = Slice(p, len);
+  return p + len;
+}
+
+// Parses a varint32 length followed by that many bytes from the front of
+// *input. On success points *result at the payload, advances *input past it
+// and returns true. Note that when the length parses but the payload is too
+// short, *input has already been advanced past the length prefix.
+bool GetLengthPrefixedSlice(Slice* input, Slice* result) {
+  uint32_t len;
+  if (!GetVarint32(input, &len)) {
+    return false;
+  }
+  if (input->size() < len) {
+    return false;
+  }
+  *result = Slice(input->data(), len);
+  input->remove_prefix(len);
+  return true;
+}
+
+}
diff --git a/util/coding.h b/util/coding.h
new file mode 100644
index 0000000..a42e714
--- /dev/null
+++ b/util/coding.h
@@ -0,0 +1,104 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Endian-neutral encoding:
+// * Fixed-length numbers are encoded with least-significant byte first
+// * In addition we support variable length "varint" encoding
+// * Strings are encoded prefixed by their length in varint format
+
+#ifndef STORAGE_LEVELDB_UTIL_CODING_H_
+#define STORAGE_LEVELDB_UTIL_CODING_H_
+
+#include <stdint.h>
+#include <string.h>
+#include <string>
+#include "include/slice.h"
+#include "port/port.h"
+
+namespace leveldb {
+
+// Standard Put... routines append to a string
+extern void PutFixed32(std::string* dst, uint32_t value);
+extern void PutFixed64(std::string* dst, uint64_t value);
+extern void PutVarint32(std::string* dst, uint32_t value);
+extern void PutVarint64(std::string* dst, uint64_t value);
+extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value);
+
+// Standard Get... routines parse a value from the beginning of a Slice
+// and advance the slice past the parsed value.
+extern bool GetVarint32(Slice* input, uint32_t* value);
+extern bool GetVarint64(Slice* input, uint64_t* value);
+extern bool GetLengthPrefixedSlice(Slice* input, Slice* result);
+
+// Pointer-based variants of GetVarint... These either store a value
+// in *v and return a pointer just past the parsed value, or return
+// NULL on error. These routines only look at bytes in the range
+// [p..limit-1]
+extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v);
+extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v);
+
+// Returns the length of the varint32 or varint64 encoding of "v"
+extern int VarintLength(uint64_t v);
+
+// Lower-level versions of Put... that write directly into a character buffer
+// REQUIRES: dst has enough space for the value being written
+extern void EncodeFixed32(char* dst, uint32_t value);
+extern void EncodeFixed64(char* dst, uint64_t value);
+
+// Lower-level versions of Put... that write directly into a character buffer
+// and return a pointer just past the last byte written.
+// REQUIRES: dst has enough space for the value being written
+extern char* EncodeVarint32(char* dst, uint32_t value);
+extern char* EncodeVarint64(char* dst, uint64_t value);
+
+// Lower-level versions of Get... that read directly from a character buffer
+// without any bounds checking.
+
+// Decodes the 32-bit value stored least-significant byte first at ptr[0..3].
+// No bounds checking is performed.
+inline uint32_t DecodeFixed32(const char* ptr) {
+  if (port::kLittleEndian) {
+    // Load the raw bytes
+    uint32_t result;
+    memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
+    return result;
+  } else {
+    // Read the bytes as unsigned char: where plain char is signed, casting a
+    // byte >= 0x80 directly to uint32_t sign-extends it (0x80 -> 0xffffff80)
+    // and the stray high bits corrupt the other bytes once OR-ed together.
+    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(ptr);
+    return ((static_cast<uint32_t>(bytes[0]))
+        | (static_cast<uint32_t>(bytes[1]) << 8)
+        | (static_cast<uint32_t>(bytes[2]) << 16)
+        | (static_cast<uint32_t>(bytes[3]) << 24));
+  }
+}
+
+// Decodes the 64-bit value stored least-significant byte first at ptr[0..7].
+// No bounds checking is performed.
+inline uint64_t DecodeFixed64(const char* ptr) {
+  if (!port::kLittleEndian) {
+    // Assemble from two 32-bit halves; the low word is stored first.
+    const uint64_t low = DecodeFixed32(ptr);
+    const uint64_t high = DecodeFixed32(ptr + 4);
+    return low | (high << 32);
+  }
+  // Load the raw bytes
+  uint64_t raw;
+  memcpy(&raw, ptr, sizeof(raw));  // gcc optimizes this to a plain load
+  return raw;
+}
+
+// Internal routine for use by fallback path of GetVarint32Ptr
+extern const char* GetVarint32PtrFallback(const char* p,
+                                          const char* limit,
+                                          uint32_t* value);
+
+// Inline fast path: when the first byte is available and has no continuation
+// bit, it is the whole value. Everything else (multi-byte encodings and empty
+// input) is handed to GetVarint32PtrFallback.
+inline const char* GetVarint32Ptr(const char* p,
+                                  const char* limit,
+                                  uint32_t* value) {
+  const bool single_byte =
+      (p < limit) && ((static_cast<unsigned char>(*p) & 0x80) == 0);
+  if (!single_byte) {
+    return GetVarint32PtrFallback(p, limit, value);
+  }
+  *value = static_cast<unsigned char>(*p);
+  return p + 1;
+}
+
+}
+
+#endif // STORAGE_LEVELDB_UTIL_CODING_H_
diff --git a/util/coding_test.cc b/util/coding_test.cc
new file mode 100644
index 0000000..a8dba04
--- /dev/null
+++ b/util/coding_test.cc
@@ -0,0 +1,173 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/coding.h"
+
+#include "util/testharness.h"
+
+namespace leveldb {
+
+// Empty class whose name is the first argument of every TEST(Coding, ...)
+// in this file.
+class Coding { };
+
+// Appends 0..99999 with PutFixed32 and checks that DecodeFixed32 reads each
+// value back from consecutive 4-byte positions.
+TEST(Coding, Fixed32) {
+ std::string s;
+ for (uint32_t v = 0; v < 100000; v++) {
+ PutFixed32(&s, v);
+ }
+
+ const char* p = s.data();
+ for (uint32_t v = 0; v < 100000; v++) {
+ uint32_t actual = DecodeFixed32(p);
+ ASSERT_EQ(v, actual);
+ p += sizeof(uint32_t);
+ }
+}
+
+// Round-trips 2^power - 1, 2^power and 2^power + 1 for every power in 0..63
+// through PutFixed64/DecodeFixed64.
+TEST(Coding, Fixed64) {
+ std::string s;
+ for (int power = 0; power <= 63; power++) {
+ uint64_t v = static_cast<uint64_t>(1) << power;
+ PutFixed64(&s, v - 1);
+ PutFixed64(&s, v + 0);
+ PutFixed64(&s, v + 1);
+ }
+
+ // Decode in the same order the values were appended.
+ const char* p = s.data();
+ for (int power = 0; power <= 63; power++) {
+ uint64_t v = static_cast<uint64_t>(1) << power;
+ uint64_t actual;
+ actual = DecodeFixed64(p);
+ ASSERT_EQ(v-1, actual);
+ p += sizeof(uint64_t);
+
+ actual = DecodeFixed64(p);
+ ASSERT_EQ(v+0, actual);
+ p += sizeof(uint64_t);
+
+ actual = DecodeFixed64(p);
+ ASSERT_EQ(v+1, actual);
+ p += sizeof(uint64_t);
+ }
+}
+
+// Encodes (i / 32) << (i % 32) for i in 0..1023 with PutVarint32, then decodes
+// the stream with GetVarint32Ptr, checking each value, that the bytes consumed
+// match VarintLength(), and that the whole buffer is consumed.
+TEST(Coding, Varint32) {
+ std::string s;
+ for (uint32_t i = 0; i < (32 * 32); i++) {
+ uint32_t v = (i / 32) << (i % 32);
+ PutVarint32(&s, v);
+ }
+
+ const char* p = s.data();
+ const char* limit = p + s.size();
+ for (uint32_t i = 0; i < (32 * 32); i++) {
+ uint32_t expected = (i / 32) << (i % 32);
+ uint32_t actual;
+ const char* start = p;
+ p = GetVarint32Ptr(p, limit, &actual);
+ ASSERT_TRUE(p != NULL);
+ ASSERT_EQ(expected, actual);
+ ASSERT_EQ(VarintLength(actual), p - start);
+ }
+ ASSERT_EQ(p, s.data() + s.size());
+}
+
+// Round-trips a list of boundary values (0, 100, the two largest uint64
+// values, and every power of two with its neighbours) through
+// PutVarint64/GetVarint64Ptr, checking each decoded value, that the bytes
+// consumed match VarintLength(), and that the whole buffer is consumed.
+TEST(Coding, Varint64) {
+  // Construct the list of values to check
+  std::vector<uint64_t> values;
+  // Some special values
+  values.push_back(0);
+  values.push_back(100);
+  values.push_back(~static_cast<uint64_t>(0));
+  values.push_back(~static_cast<uint64_t>(0) - 1);
+  for (uint32_t k = 0; k < 64; k++) {
+    // Test values near powers of two
+    const uint64_t power = 1ull << k;
+    values.push_back(power);
+    values.push_back(power-1);
+    values.push_back(power+1);
+  }
+
+  std::string s;
+  // size_t index: avoids comparing a signed int against values.size().
+  for (size_t i = 0; i < values.size(); i++) {
+    PutVarint64(&s, values[i]);
+  }
+
+  const char* p = s.data();
+  const char* limit = p + s.size();
+  for (size_t i = 0; i < values.size(); i++) {
+    ASSERT_TRUE(p < limit);
+    uint64_t actual;
+    const char* start = p;
+    p = GetVarint64Ptr(p, limit, &actual);
+    ASSERT_TRUE(p != NULL);
+    ASSERT_EQ(values[i], actual);
+    ASSERT_EQ(VarintLength(actual), p - start);
+  }
+  ASSERT_EQ(p, limit);
+}
+
+// An input whose first five bytes all carry the continuation bit is longer
+// than a varint32 may be, so decoding must return NULL.
+TEST(Coding, Varint32Overflow) {
+ uint32_t result;
+ std::string input("\x81\x82\x83\x84\x85\x11");
+ ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(), &result)
+ == NULL);
+}
+
+// Every strict prefix of an encoded varint32 must fail to decode; only the
+// complete encoding yields the original value.
+TEST(Coding, Varint32Truncation) {
+  uint32_t large_value = (1u << 31) + 100;
+  std::string s;
+  PutVarint32(&s, large_value);
+  uint32_t result;
+  // Cover all strict prefixes, including the one that lacks only the final
+  // byte; a bound of "s.size() - 1" leaves that case untested.
+  for (size_t len = 0; len < s.size(); len++) {
+    ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == NULL);
+  }
+  ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != NULL);
+  ASSERT_EQ(large_value, result);
+}
+
+// An input whose first ten bytes all carry the continuation bit is longer
+// than a varint64 may be, so decoding must return NULL.
+TEST(Coding, Varint64Overflow) {
+ uint64_t result;
+ std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11");
+ ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(), &result)
+ == NULL);
+}
+
+// Every strict prefix of an encoded varint64 must fail to decode; only the
+// complete encoding yields the original value.
+TEST(Coding, Varint64Truncation) {
+  uint64_t large_value = (1ull << 63) + 100ull;
+  std::string s;
+  PutVarint64(&s, large_value);
+  uint64_t result;
+  // Cover all strict prefixes, including the one that lacks only the final
+  // byte; a bound of "s.size() - 1" leaves that case untested.
+  for (size_t len = 0; len < s.size(); len++) {
+    ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == NULL);
+  }
+  ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + s.size(), &result) != NULL);
+  ASSERT_EQ(large_value, result);
+}
+
+// Appends four length-prefixed strings (empty, two short ones and a
+// 200-character one) and checks that GetLengthPrefixedSlice returns them in
+// order and leaves the input empty.
+TEST(Coding, Strings) {
+ std::string s;
+ PutLengthPrefixedSlice(&s, Slice(""));
+ PutLengthPrefixedSlice(&s, Slice("foo"));
+ PutLengthPrefixedSlice(&s, Slice("bar"));
+ PutLengthPrefixedSlice(&s, Slice(std::string(200, 'x')));
+
+ Slice input(s);
+ Slice v;
+ ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+ ASSERT_EQ("", v.ToString());
+ ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+ ASSERT_EQ("foo", v.ToString());
+ ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+ ASSERT_EQ("bar", v.ToString());
+ ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v));
+ ASSERT_EQ(std::string(200, 'x'), v.ToString());
+ ASSERT_EQ("", input.ToString());
+}
+
+}
+
+// Test binary entry point: delegates to leveldb::test::RunAllTests() and uses
+// its return value as the process exit status. argc/argv are not used.
+int main(int argc, char** argv) {
+ return leveldb::test::RunAllTests();
+}
diff --git a/util/comparator.cc b/util/comparator.cc
new file mode 100644
index 0000000..dca3b4d
--- /dev/null
+++ b/util/comparator.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdint.h>
+#include "include/comparator.h"
+#include "include/slice.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+// Out-of-line definition of the (empty) Comparator destructor.
+Comparator::~Comparator() { }
+
+namespace {
+// Comparator implementation that delegates ordering to Slice::compare().
+class BytewiseComparatorImpl : public Comparator {
+ public:
+  BytewiseComparatorImpl() { }
+
+  // Name identifying this comparator.
+  virtual const char* Name() const {
+    return "leveldb.BytewiseComparator";
+  }
+
+  // Returns the result of a.compare(b).
+  virtual int Compare(const Slice& a, const Slice& b) const {
+    return a.compare(b);
+  }
+
+  // Shortens *start where possible: at the first index where *start and
+  // limit differ, if that byte of *start can be incremented and still stay
+  // below the corresponding byte of limit, it is incremented and *start is
+  // truncated right after it. Otherwise *start is left unchanged.
+  virtual void FindShortestSeparator(
+      std::string* start,
+      const Slice& limit) const {
+    // Find length of common prefix
+    size_t min_length = std::min(start->size(), limit.size());
+    size_t diff_index = 0;
+    while ((diff_index < min_length) &&
+           ((*start)[diff_index] == limit[diff_index])) {
+      diff_index++;
+    }
+
+    if (diff_index >= min_length) {
+      // Do not shorten if one string is a prefix of the other
+    } else {
+      uint8_t diff_byte = static_cast<uint8_t>((*start)[diff_index]);
+      if (diff_byte < static_cast<uint8_t>(0xff) &&
+          diff_byte + 1 < static_cast<uint8_t>(limit[diff_index])) {
+        (*start)[diff_index]++;
+        start->resize(diff_index + 1);
+        assert(Compare(*start, limit) < 0);
+      }
+    }
+  }
+
+  // Replaces *key with a short successor: the first byte that is not 0xff is
+  // incremented and everything after it is dropped.
+  virtual void FindShortSuccessor(std::string* key) const {
+    // Find first character that can be incremented
+    const size_t n = key->size();
+    // size_t index: avoids comparing a signed int against the unsigned size.
+    for (size_t i = 0; i < n; i++) {
+      const uint8_t byte = (*key)[i];
+      if (byte != static_cast<uint8_t>(0xff)) {
+        (*key)[i] = byte + 1;
+        key->resize(i+1);
+        return;
+      }
+    }
+    // *key is a run of 0xffs.  Leave it alone.
+  }
+};
+}
+// Single file-scope instance handed out by BytewiseComparator().
+// NOTE(review): as a namespace-scope object it is presumably set up during
+// static initialisation; confirm that no other translation unit's static
+// initialiser calls BytewiseComparator() before then.
+static const BytewiseComparatorImpl bytewise;
+
+// Returns the address of the file-scope instance above; the object is static,
+// so callers must not delete it.
+const Comparator* BytewiseComparator() {
+ return &bytewise;
+}
+
+}
diff --git a/util/crc32c.cc b/util/crc32c.cc
new file mode 100644
index 0000000..28c2401
--- /dev/null
+++ b/util/crc32c.cc
@@ -0,0 +1,332 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A portable implementation of crc32c, optimized to handle
+// four bytes at a time.
+
+#include "util/crc32c.h"
+
+#include <stdint.h>
+#include "util/coding.h"
+
+namespace leveldb {
+namespace crc32c {
+
+static const uint32_t table0_[256] = {
+ 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4,
+ 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb,
+ 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b,
+ 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24,
+ 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b,
+ 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384,
+ 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54,
+ 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b,
+ 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a,
+ 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35,
+ 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5,
+ 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa,
+ 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45,
+ 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a,
+ 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a,
+ 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595,
+ 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48,
+ 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957,
+ 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687,
+ 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198,
+ 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927,
+ 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38,
+ 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8,
+ 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7,
+ 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096,
+ 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789,
+ 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859,
+ 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46,
+ 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9,
+ 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6,
+ 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36,
+ 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829,
+ 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c,
+ 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93,
+ 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043,
+ 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c,
+ 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3,
+ 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc,
+ 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c,
+ 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033,
+ 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652,
+ 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d,
+ 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d,
+ 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982,
+ 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d,
+ 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622,
+ 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2,
+ 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed,
+ 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530,
+ 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f,
+ 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff,
+ 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0,
+ 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f,
+ 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540,
+ 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90,
+ 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f,
+ 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee,
+ 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1,
+ 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321,
+ 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e,
+ 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81,
+ 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e,
+ 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e,
+ 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351
+};
+static const uint32_t table1_[256] = {
+ 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899,
+ 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945,
+ 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21,
+ 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd,
+ 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918,
+ 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4,
+ 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0,
+ 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c,
+ 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b,
+ 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47,
+ 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823,
+ 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff,
+ 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a,
+ 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6,
+ 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2,
+ 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e,
+ 0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d,
+ 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41,
+ 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25,
+ 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9,
+ 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c,
+ 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0,
+ 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4,
+ 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78,
+ 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f,
+ 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43,
+ 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27,
+ 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb,
+ 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e,
+ 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2,
+ 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6,
+ 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a,
+ 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260,
+ 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc,
+ 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8,
+ 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004,
+ 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1,
+ 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d,
+ 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059,
+ 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185,
+ 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162,
+ 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be,
+ 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da,
+ 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306,
+ 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3,
+ 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f,
+ 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b,
+ 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287,
+ 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464,
+ 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8,
+ 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc,
+ 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600,
+ 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5,
+ 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439,
+ 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d,
+ 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781,
+ 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766,
+ 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba,
+ 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de,
+ 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502,
+ 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7,
+ 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b,
+ 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f,
+ 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483
+};
+static const uint32_t table2_[256] = {
+ 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073,
+ 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469,
+ 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6,
+ 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac,
+ 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9,
+ 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3,
+ 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c,
+ 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726,
+ 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67,
+ 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d,
+ 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2,
+ 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8,
+ 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed,
+ 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7,
+ 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828,
+ 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32,
+ 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa,
+ 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0,
+ 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f,
+ 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75,
+ 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20,
+ 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a,
+ 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5,
+ 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff,
+ 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe,
+ 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4,
+ 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b,
+ 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161,
+ 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634,
+ 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e,
+ 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1,
+ 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb,
+ 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730,
+ 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a,
+ 0xb3764986, 0x1637dbf8, 0xfc191b8b, 0x595889f5,
+ 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def,
+ 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba,
+ 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0,
+ 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f,
+ 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065,
+ 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24,
+ 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e,
+ 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1,
+ 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb,
+ 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae,
+ 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4,
+ 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b,
+ 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71,
+ 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9,
+ 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3,
+ 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c,
+ 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36,
+ 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63,
+ 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79,
+ 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6,
+ 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc,
+ 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd,
+ 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7,
+ 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238,
+ 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622,
+ 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177,
+ 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d,
+ 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2,
+ 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8
+};
+static const uint32_t table3_[256] = {
+ 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939,
+ 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca,
+ 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf,
+ 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c,
+ 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804,
+ 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7,
+ 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2,
+ 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11,
+ 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2,
+ 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41,
+ 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54,
+ 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7,
+ 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f,
+ 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c,
+ 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69,
+ 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a,
+ 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de,
+ 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d,
+ 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538,
+ 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb,
+ 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3,
+ 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610,
+ 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405,
+ 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6,
+ 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255,
+ 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6,
+ 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3,
+ 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040,
+ 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368,
+ 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b,
+ 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e,
+ 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d,
+ 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006,
+ 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5,
+ 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0,
+ 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213,
+ 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b,
+ 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8,
+ 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd,
+ 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e,
+ 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d,
+ 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e,
+ 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b,
+ 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698,
+ 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0,
+ 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443,
+ 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656,
+ 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5,
+ 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1,
+ 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12,
+ 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07,
+ 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4,
+ 0x106227e5, 0xcd278d5d, 0xaf050464, 0x7240aedc,
+ 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f,
+ 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a,
+ 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9,
+ 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a,
+ 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99,
+ 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c,
+ 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f,
+ 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57,
+ 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4,
+ 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1,
+ 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842
+};
+
+// Used to fetch a naturally-aligned 32-bit word in little endian byte-order
+// Thin wrapper: reinterprets p as const char* and delegates to DecodeFixed32.
+static inline uint32_t LE_LOAD32(const uint8_t *p) {
+ return DecodeFixed32(reinterpret_cast<const char*>(p));
+}
+
+// Feeds buf[0..size-1] into the running value "crc" and returns the updated
+// value. The working state is "crc" with all bits inverted; it is inverted
+// again before being returned.
+uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
+ const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
+ const uint8_t *e = p + size;
+ uint32_t l = crc ^ 0xffffffffu;
+
+// STEP1 consumes one input byte: the low byte of the state XOR the input byte
+// indexes table0_, and the result is XOR-ed with the state shifted right by 8.
+#define STEP1 do { \
+ int c = (l & 0xff) ^ *p++; \
+ l = table0_[c] ^ (l >> 8); \
+} while (0)
+// STEP4 consumes four input bytes at once: the state is XOR-ed with a
+// little-endian 32-bit load, and each byte of that word indexes one of the
+// four tables (lowest byte -> table3_, highest byte -> table0_).
+#define STEP4 do { \
+ uint32_t c = l ^ LE_LOAD32(p); \
+ p += 4; \
+ l = table3_[c & 0xff] ^ \
+ table2_[(c >> 8) & 0xff] ^ \
+ table1_[(c >> 16) & 0xff] ^ \
+ table0_[c >> 24]; \
+} while (0)
+
+ // Point x at first 4-byte aligned byte in string. This might be
+ // just past the end of the string.
+ const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
+ const uint8_t* x = reinterpret_cast<const uint8_t*>(((pval + 3) >> 2) << 2);
+ // Only align when the aligned address is not beyond the end of the input.
+ if (x <= e) {
+ // Process bytes until finished or p is 4-byte aligned
+ while (p != x) {
+ STEP1;
+ }
+ }
+ // Process bytes 16 at a time
+ while ((e-p) >= 16) {
+ STEP4; STEP4; STEP4; STEP4;
+ }
+ // Process bytes 4 at a time
+ while ((e-p) >= 4) {
+ STEP4;
+ }
+ // Process the last few bytes
+ while (p != e) {
+ STEP1;
+ }
+#undef STEP4
+#undef STEP1
+ return l ^ 0xffffffffu;
+}
+
+}
+}
diff --git a/util/crc32c.h b/util/crc32c.h
new file mode 100644
index 0000000..938d8ff
--- /dev/null
+++ b/util/crc32c.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_UTIL_CRC32C_H_
+#define STORAGE_LEVELDB_UTIL_CRC32C_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace leveldb {
+namespace crc32c {
+
+// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the
+// crc32c of some string A. Extend() is often used to maintain the
+// crc32c of a stream of data.
+extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n);
+
+// Return the crc32c of data[0,n-1]
+// Implemented as Extend() starting from an initial crc of 0.
+inline uint32_t Value(const char* data, size_t n) {
+ return Extend(0, data, n);
+}
+
+static const uint32_t kMaskDelta = 0xa282ead8ul;
+
+// Returns a masked form of crc.
+//
+// Motivation: it is problematic to compute the CRC of a string that
+// contains embedded CRCs, so CRCs that get stored somewhere (e.g., in
+// files) should be masked before being stored.
+inline uint32_t Mask(uint32_t crc) {
+  // Rotate right by 15 bits, then add the delta (wrapping modulo 2^32).
+  const uint32_t rotated = (crc << 17) | (crc >> 15);
+  return rotated + kMaskDelta;
+}
+
+// Inverse of Mask(): returns the crc whose masked representation is
+// masked_crc.
+inline uint32_t Unmask(uint32_t masked_crc) {
+  // Undo the addition first, then rotate left by 15 bits.
+  const uint32_t rotated = masked_crc - kMaskDelta;
+  return (rotated << 15) | (rotated >> 17);
+}
+
+}
+}
+
+#endif // STORAGE_LEVELDB_UTIL_CRC32C_H_
diff --git a/util/crc32c_test.cc b/util/crc32c_test.cc
new file mode 100644
index 0000000..a7fc758
--- /dev/null
+++ b/util/crc32c_test.cc
@@ -0,0 +1,86 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/crc32c.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+namespace crc32c {
+
+class CRC { };
+
+// Checks Value() against fixed expected checksums for five inputs; the
+// expected values are hard-coded constants.
+TEST(CRC, StandardResults) {
+ // From rfc3720 section B.4.
+ char buf[32];
+
+ // 32 bytes of 0x00.
+ memset(buf, 0, sizeof(buf));
+ ASSERT_EQ(0x8a9136aa, Value(buf, sizeof(buf)));
+
+ // 32 bytes of 0xff.
+ memset(buf, 0xff, sizeof(buf));
+ ASSERT_EQ(0x62a8ab43, Value(buf, sizeof(buf)));
+
+ // 32 ascending bytes: 0, 1, ..., 31.
+ for (int i = 0; i < 32; i++) {
+ buf[i] = i;
+ }
+ ASSERT_EQ(0x46dd794e, Value(buf, sizeof(buf)));
+
+ // 32 descending bytes: 31, 30, ..., 0.
+ for (int i = 0; i < 32; i++) {
+ buf[i] = 31 - i;
+ }
+ ASSERT_EQ(0x113fdb5c, Value(buf, sizeof(buf)));
+
+ // A fixed 48-byte sample buffer.
+ unsigned char data[48] = {
+ 0x01, 0xc0, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x14, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x04, 0x00,
+ 0x00, 0x00, 0x00, 0x14,
+ 0x00, 0x00, 0x00, 0x18,
+ 0x28, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x02, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ };
+ ASSERT_EQ(0xd9963a56, Value(reinterpret_cast<char*>(data), sizeof(data)));
+}
+
+// Two different inputs are expected to produce different checksums.
+TEST(CRC, Values) {
+  const uint32_t crc_a = Value("a", 1);
+  const uint32_t crc_foo = Value("foo", 3);
+  ASSERT_NE(crc_a, crc_foo);
+}
+
+// Extending the crc of a prefix with the remaining bytes must equal the
+// crc of the whole string.
+TEST(CRC, Extend) {
+  const uint32_t whole = Value("hello world", 11);
+  const uint32_t prefix = Value("hello ", 6);
+  ASSERT_EQ(whole, Extend(prefix, "world", 5));
+}
+
+// Masking must change the value (once or twice applied), and Unmask()
+// must undo Mask() for both one and two levels of masking.
+TEST(CRC, Mask) {
+  const uint32_t crc = Value("foo", 3);
+  const uint32_t masked_once = Mask(crc);
+  const uint32_t masked_twice = Mask(masked_once);
+  ASSERT_NE(crc, masked_once);
+  ASSERT_NE(crc, masked_twice);
+  ASSERT_EQ(crc, Unmask(masked_once));
+  ASSERT_EQ(crc, Unmask(Unmask(masked_twice)));
+}
+
+// Timing run rather than a check: computes the crc of a 100 * 1048576 byte
+// buffer of 'x' kIters times and prints the elapsed time and throughput to
+// stderr.  It contains no assertions.
+TEST(CRC, Benchmark) {
+ std::string data(1048576 * 100, 'x');
+ // NowMicros() is scaled by 1e-6, i.e. start/finish are in seconds.
+ double start = Env::Default()->NowMicros() * 1e-6;
+ static const int kIters = 10;
+ uint32_t crc = 0;
+ for (int i = 0; i < kIters; i++) {
+ // Every iteration hashes the same input; the results are OR-ed together
+ // and the combined value is printed below.
+ crc |= Value(data.data(), data.size());
+ }
+ double finish = Env::Default()->NowMicros() * 1e-6;
+ // Total bytes processed across all iterations, expressed in units of
+ // 1048576 bytes.
+ double mb = (static_cast<long long int>(data.size()) * kIters) / 1048576.0;
+ fprintf(stderr, "CRC %0.0f MB: %.3f secs; %.1f MB/s, crc=0x%08x\n",
+ mb, (finish - start), mb / (finish - start), crc);
+}
+
+}
+}
+
+int main(int argc, char** argv) {
+ return leveldb::test::RunAllTests();
+}
diff --git a/util/env.cc b/util/env.cc
new file mode 100644
index 0000000..3c2ca89
--- /dev/null
+++ b/util/env.cc
@@ -0,0 +1,77 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/env.h"
+
+namespace leveldb {
+
+// Out-of-line, empty destructor definitions for Env and the file/lock
+// classes (presumably the interfaces declared in include/env.h, which this
+// file includes).
+Env::~Env() {
+}
+
+SequentialFile::~SequentialFile() {
+}
+
+RandomAccessFile::~RandomAccessFile() {
+}
+
+WritableFile::~WritableFile() {
+}
+
+FileLock::~FileLock() {
+}
+
+// printf-style convenience wrapper: packages the variable arguments that
+// follow format into a va_list and forwards them to env->Logv() together
+// with info_log.
+void Log(Env* env, WritableFile* info_log, const char* format, ...) {
+ va_list ap;
+ va_start(ap, format);
+ env->Logv(info_log, format, ap);
+ va_end(ap);
+}
+
+// Creates the file fname through env and writes data to it, then closes
+// it.  Returns the first failing status.  If the append or the close
+// fails, the file is deleted again (the result of that delete is ignored).
+Status WriteStringToFile(Env* env, const Slice& data,
+                         const std::string& fname) {
+  WritableFile* out;
+  Status status = env->NewWritableFile(fname, &out);
+  if (status.ok()) {
+    status = out->Append(data);
+    if (status.ok()) {
+      status = out->Close();
+    }
+    delete out;  // Will auto-close if we did not close above
+    if (!status.ok()) {
+      env->DeleteFile(fname);
+    }
+  }
+  return status;
+}
+
+// Replaces *data with the contents of the file fname, read sequentially
+// through env in chunks of kBufferSize bytes.  Reading stops at the first
+// failed read or the first empty fragment; the status of the last read
+// (or of the failed open) is returned.  On a failed read, *data keeps
+// whatever was appended before the failure.
+Status ReadFileToString(Env* env, const std::string& fname, std::string* data) {
+  data->clear();
+  SequentialFile* in;
+  Status status = env->NewSequentialFile(fname, &in);
+  if (!status.ok()) {
+    return status;
+  }
+  static const int kBufferSize = 8192;
+  char* scratch = new char[kBufferSize];
+  for (;;) {
+    Slice piece;
+    status = in->Read(kBufferSize, &piece, scratch);
+    if (!status.ok() || piece.empty()) {
+      break;
+    }
+    data->append(piece.data(), piece.size());
+  }
+  delete[] scratch;
+  delete in;
+  return status;
+}
+
+EnvWrapper::~EnvWrapper() {
+}
+
+}
diff --git a/util/env_chromium.cc b/util/env_chromium.cc
new file mode 100644
index 0000000..e39ac71
--- /dev/null
+++ b/util/env_chromium.cc
@@ -0,0 +1,608 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <deque>
+#include <errno.h>
+#include <stdio.h>
+#include "base/at_exit.h"
+#include "base/file_path.h"
+#include "base/file_util.h"
+#include "base/lazy_instance.h"
+#include "base/message_loop.h"
+#include "base/platform_file.h"
+#include "base/process_util.h"
+#include "base/ref_counted.h"
+#include "base/synchronization/lock.h"
+#include "base/sys_info.h"
+#include "base/task.h"
+#include "base/threading/platform_thread.h"
+#include "base/threading/thread.h"
+#include "base/utf_string_conversions.h"
+#include "include/env.h"
+#include "include/slice.h"
+#include "port/port.h"
+#include "util/logging.h"
+
+#if defined(OS_WIN)
+#include <io.h>
+#include "base/win/win_util.h"
+#endif
+
+#if defined(OS_MACOSX) || defined(OS_WIN)
+// The following are glibc-specific
+// On these platforms the *_unlocked names and fdatasync() are defined here
+// as thin forwards to the regular stdio / sync calls, so the code below can
+// use the same names on every platform.
+extern "C" {
+// Forwards to fread().
+size_t fread_unlocked(void *ptr, size_t size, size_t n, FILE *file) {
+ return fread(ptr, size, n, file);
+}
+
+// Forwards to fwrite().
+size_t fwrite_unlocked(const void *ptr, size_t size, size_t n, FILE *file) {
+ return fwrite(ptr, size, n, file);
+}
+
+// Forwards to fflush().
+int fflush_unlocked(FILE *file) {
+ return fflush(file);
+}
+
+// Forwards to _commit() on Windows and to fsync() otherwise.
+int fdatasync(int fildes) {
+#if defined(OS_WIN)
+ return _commit(fildes);
+#else
+ return fsync(fildes);
+#endif
+}
+}
+#endif
+
+namespace leveldb {
+
+namespace {
+
+class Thread;
+
+::FilePath CreateFilePath(const std::string& file_path) {
+#if defined(OS_WIN)
+ return FilePath(UTF8ToUTF16(file_path));
+#else
+ return FilePath(file_path);
+#endif
+}
+
+std::string FilePathToString(const ::FilePath& file_path) {
+#if defined(OS_WIN)
+ return UTF16ToUTF8(file_path.value());
+#else
+ return file_path.value();
+#endif
+}
+
+// Maps a ::base::PlatformFileError value to a fixed, human-readable
+// message.  The returned pointer refers to a string literal.
+// TODO(jorlow): This should be moved into Chromium's base.
+const char* PlatformFileErrorString(const ::base::PlatformFileError& error) {
+ switch (error) {
+ case ::base::PLATFORM_FILE_ERROR_FAILED:
+ return "Opening file failed.";
+ case ::base::PLATFORM_FILE_ERROR_IN_USE:
+ return "File currently in use.";
+ case ::base::PLATFORM_FILE_ERROR_EXISTS:
+ return "File already exists.";
+ case ::base::PLATFORM_FILE_ERROR_NOT_FOUND:
+ return "File not found.";
+ case ::base::PLATFORM_FILE_ERROR_ACCESS_DENIED:
+ return "Access denied.";
+ case ::base::PLATFORM_FILE_ERROR_TOO_MANY_OPENED:
+ return "Too many files open.";
+ case ::base::PLATFORM_FILE_ERROR_NO_MEMORY:
+ return "Out of memory.";
+ case ::base::PLATFORM_FILE_ERROR_NO_SPACE:
+ return "No space left on drive.";
+ case ::base::PLATFORM_FILE_ERROR_NOT_A_DIRECTORY:
+ return "Not a directory.";
+ case ::base::PLATFORM_FILE_ERROR_INVALID_OPERATION:
+ return "Invalid operation.";
+ case ::base::PLATFORM_FILE_ERROR_SECURITY:
+ return "Security error.";
+ case ::base::PLATFORM_FILE_ERROR_ABORT:
+ return "File operation aborted.";
+ case ::base::PLATFORM_FILE_ERROR_NOT_A_FILE:
+ return "The supplied path was not a file.";
+ case ::base::PLATFORM_FILE_ERROR_NOT_EMPTY:
+ return "The file was not empty.";
+ }
+ // Reached for any value without a case above.  Note that there is no case
+ // for ::base::PLATFORM_FILE_OK, so passing the success code also ends here.
+ NOTIMPLEMENTED();
+ return "Unknown error.";
+}
+
+// SequentialFile implemented on top of a stdio FILE*.  The stream is
+// closed with fclose() when the object is destroyed.
+class ChromiumSequentialFile: public SequentialFile {
+ public:
+  ChromiumSequentialFile(const std::string& fname, FILE* f)
+      : filename_(fname), file_(f) { }
+  virtual ~ChromiumSequentialFile() { fclose(file_); }
+
+  // Reads up to n bytes into scratch and points *result at the bytes that
+  // were read.  A short read at end-of-file is still OK; any other short
+  // read yields an IOError carrying strerror(errno).
+  virtual Status Read(size_t n, Slice* result, char* scratch) {
+    const size_t got = fread_unlocked(scratch, 1, n, file_);
+    *result = Slice(scratch, got);
+    if (got < n && !feof(file_)) {
+      // A partial read with an error: return a non-ok status
+      return Status::IOError(filename_, strerror(errno));
+    }
+    return Status();
+  }
+
+ private:
+  std::string filename_;  // Used in error messages
+  FILE* file_;
+};
+
+// RandomAccessFile backed by a ::base::PlatformFile handle.  The handle is
+// closed with ::base::ClosePlatformFile() when the object is destroyed.
+class ChromiumRandomAccessFile: public RandomAccessFile {
+ private:
+  std::string filename_;        // Used in error messages
+  uint64_t size_;               // Size passed in by the creator
+  ::base::PlatformFile file_;
+
+ public:
+  ChromiumRandomAccessFile(const std::string& fname, uint64_t size,
+                           ::base::PlatformFile file)
+      : filename_(fname), size_(size), file_(file) { }
+  virtual ~ChromiumRandomAccessFile() { ::base::ClosePlatformFile(file_); }
+
+  // Returns the size that was supplied at construction time.
+  virtual uint64_t Size() const { return size_; }
+
+  // Reads up to n bytes starting at offset into scratch and points *result
+  // at them.  If ReadPlatformFile() reports a negative count, *result is
+  // set to an empty slice and an IOError is returned.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    Status s;
+    int r = ::base::ReadPlatformFile(file_, offset, scratch, n);
+    *result = Slice(scratch, (r < 0) ? 0 : r);
+    if (r < 0) {
+      // An error: return a non-ok status
+      s = Status::IOError(filename_, "Could not perform read");
+    }
+    return s;
+  }
+};
+
+// WritableFile implemented on top of a stdio FILE*.  Every failure is
+// reported as an IOError built from the file name and strerror(errno).
+class ChromiumWritableFile : public WritableFile {
+ public:
+  ChromiumWritableFile(const std::string& fname, FILE* f)
+      : filename_(fname), file_(f) { }
+
+  // Closes the stream if Close() has not already done so.
+  ~ChromiumWritableFile() {
+    // Ignoring any potential errors
+    if (file_ != NULL) {
+      fclose(file_);
+    }
+  }
+
+  // Writes all of data; fails if fewer bytes than requested were written.
+  virtual Status Append(const Slice& data) {
+    const size_t written =
+        fwrite_unlocked(data.data(), 1, data.size(), file_);
+    if (written == data.size()) {
+      return Status();
+    }
+    return Status::IOError(filename_, strerror(errno));
+  }
+
+  // Closes the stream and clears file_, whether or not fclose() succeeded.
+  virtual Status Close() {
+    Status outcome;
+    const int rc = fclose(file_);
+    if (rc != 0) {
+      outcome = Status::IOError(filename_, strerror(errno));
+    }
+    file_ = NULL;
+    return outcome;
+  }
+
+  // Flushes the stdio buffer.
+  virtual Status Flush() {
+    if (fflush_unlocked(file_) == 0) {
+      return Status();
+    }
+    return Status::IOError(filename_, strerror(errno));
+  }
+
+  // Flushes the stdio buffer and then calls fdatasync() on the descriptor;
+  // the fdatasync() is skipped when the flush already failed.
+  virtual Status Sync() {
+    if ((fflush_unlocked(file_) == 0) &&
+        (fdatasync(fileno(file_)) == 0)) {
+      return Status();
+    }
+    return Status::IOError(filename_, strerror(errno));
+  }
+
+ private:
+  std::string filename_;
+  FILE* file_;
+};
+
+class ChromiumFileLock : public FileLock {
+ public:
+ ::base::PlatformFile file_;
+};
+
+class ChromiumEnv : public Env {
+ public:
+ ChromiumEnv();
+ virtual ~ChromiumEnv() {
+ fprintf(stderr, "Destroying Env::Default()\n");
+ exit(1);
+ }
+
+ // Opens fname for binary reading with fopen().  On success *result is a
+ // new ChromiumSequentialFile; on failure *result is NULL and an IOError
+ // carrying strerror(errno) is returned.
+ virtual Status NewSequentialFile(const std::string& fname,
+                                  SequentialFile** result) {
+   FILE* stream = fopen(fname.c_str(), "rb");
+   if (stream != NULL) {
+     *result = new ChromiumSequentialFile(fname, stream);
+     return Status::OK();
+   }
+   *result = NULL;
+   return Status::IOError(fname, strerror(errno));
+ }
+
+ // Opens the existing file fname read-only as a PlatformFile and wraps it,
+ // together with its current size, in a ChromiumRandomAccessFile.
+ // On failure *result is NULL and an IOError is returned.
+ virtual Status NewRandomAccessFile(const std::string& fname,
+                                    RandomAccessFile** result) {
+   *result = NULL;
+   int flags = ::base::PLATFORM_FILE_READ | ::base::PLATFORM_FILE_OPEN;
+   bool created;
+   ::base::PlatformFileError error_code;
+   ::base::PlatformFile file = ::base::CreatePlatformFile(
+       CreateFilePath(fname), flags, &created, &error_code);
+   if (error_code != ::base::PLATFORM_FILE_OK) {
+     return Status::IOError(fname, PlatformFileErrorString(error_code));
+   }
+   ::base::PlatformFileInfo info;
+   if (!::base::GetPlatformFileInfo(file, &info)) {
+     ::base::ClosePlatformFile(file);
+     // error_code is PLATFORM_FILE_OK at this point (the open succeeded), so
+     // it cannot describe this failure; report the failing step instead of
+     // routing the success code through PlatformFileErrorString().
+     return Status::IOError(fname, "Could not determine file size.");
+   }
+   *result = new ChromiumRandomAccessFile(fname, info.size, file);
+   return Status::OK();
+ }
+
+ // Opens fname for binary writing with fopen() mode "wb".  On success
+ // *result is a new ChromiumWritableFile; on failure *result stays NULL and
+ // an IOError carrying strerror(errno) is returned.
+ virtual Status NewWritableFile(const std::string& fname,
+                                WritableFile** result) {
+   *result = NULL;
+   FILE* stream = fopen(fname.c_str(), "wb");
+   if (stream == NULL) {
+     return Status::IOError(fname, strerror(errno));
+   }
+   *result = new ChromiumWritableFile(fname, stream);
+   return Status::OK();
+ }
+
+ virtual bool FileExists(const std::string& fname) {
+ return ::file_util::PathExists(CreateFilePath(fname));
+ }
+
+ // Replaces *result with the base names of the entries that a
+ // FileEnumerator (FILES mode, second argument false) yields for dir.
+ // Always returns OK; see the TODO below.
+ virtual Status GetChildren(const std::string& dir,
+ std::vector<std::string>* result) {
+ result->clear();
+ ::file_util::FileEnumerator iter(
+ CreateFilePath(dir), false, ::file_util::FileEnumerator::FILES);
+ // An empty path from Next() ends the enumeration.
+ ::FilePath current = iter.Next();
+ while (!current.empty()) {
+ result->push_back(FilePathToString(current.BaseName()));
+ current = iter.Next();
+ }
+ // TODO(jorlow): Unfortunately, the FileEnumerator swallows errors, so
+ // we'll always return OK. Maybe manually check for error
+ // conditions like the file not existing?
+ return Status::OK();
+ }
+
+ // Deletes fname via ::file_util::Delete() (second argument false).
+ virtual Status DeleteFile(const std::string& fname) {
+   // TODO(jorlow): Should we assert this is a file?
+   const bool deleted = ::file_util::Delete(CreateFilePath(fname), false);
+   if (deleted) {
+     return Status();
+   }
+   return Status::IOError(fname, "Could not delete file.");
+ }
+
+ // Creates the directory name via ::file_util::CreateDirectory().
+ virtual Status CreateDir(const std::string& name) {
+   const bool created = ::file_util::CreateDirectory(CreateFilePath(name));
+   if (created) {
+     return Status();
+   }
+   return Status::IOError(name, "Could not create directory.");
+ }
+
+ // Deletes the directory name via ::file_util::Delete() (second argument
+ // false).
+ virtual Status DeleteDir(const std::string& name) {
+   // TODO(jorlow): Should we assert this is a directory?
+   const bool deleted = ::file_util::Delete(CreateFilePath(name), false);
+   if (deleted) {
+     return Status();
+   }
+   return Status::IOError(name, "Could not delete directory.");
+ }
+
+ // Stores the size of fname in *size.  On failure *size is set to 0 and an
+ // IOError is returned.
+ virtual Status GetFileSize(const std::string& fname, uint64_t* size) {
+   int64 signed_size;
+   if (::file_util::GetFileSize(CreateFilePath(fname), &signed_size)) {
+     *size = static_cast<uint64_t>(signed_size);
+     return Status();
+   }
+   *size = 0;
+   return Status::IOError(fname, "Could not determine file size.");
+ }
+
+ // Moves src onto dst via ::file_util::ReplaceFile().
+ virtual Status RenameFile(const std::string& src, const std::string& dst) {
+   const bool replaced =
+       ::file_util::ReplaceFile(CreateFilePath(src), CreateFilePath(dst));
+   if (replaced) {
+     return Status();
+   }
+   return Status::IOError(src, "Could not rename file.");
+ }
+
+ // Acquires the lock file fname by opening it (creating it if needed) with
+ // the exclusive read/write flags below.  On success *lock is a new
+ // ChromiumFileLock holding the open handle; on failure *lock stays NULL
+ // and an IOError describing the open error is returned.
+ virtual Status LockFile(const std::string& fname, FileLock** lock) {
+ *lock = NULL;
+ Status result;
+ int flags = ::base::PLATFORM_FILE_OPEN_ALWAYS |
+ ::base::PLATFORM_FILE_READ |
+ ::base::PLATFORM_FILE_WRITE |
+ ::base::PLATFORM_FILE_EXCLUSIVE_READ |
+ ::base::PLATFORM_FILE_EXCLUSIVE_WRITE;
+ bool created;
+ ::base::PlatformFileError error_code;
+ ::base::PlatformFile file = ::base::CreatePlatformFile(
+ CreateFilePath(fname), flags, &created, &error_code);
+ if (error_code != ::base::PLATFORM_FILE_OK) {
+ result = Status::IOError(fname, PlatformFileErrorString(error_code));
+ } else {
+ // The handle stays open for as long as the lock object exists;
+ // UnlockFile() closes it.
+ ChromiumFileLock* my_lock = new ChromiumFileLock;
+ my_lock->file_ = file;
+ *lock = my_lock;
+ }
+ return result;
+ }
+
+ // Releases a lock obtained from LockFile(): closes the underlying handle
+ // and deletes the lock object, even when the close fails.
+ virtual Status UnlockFile(FileLock* lock) {
+   ChromiumFileLock* held = reinterpret_cast<ChromiumFileLock*>(lock);
+   const bool closed = ::base::ClosePlatformFile(held->file_);
+   delete held;
+   if (closed) {
+     return Status();
+   }
+   return Status::IOError("Could not close lock file.");
+ }
+
+ virtual void Schedule(void (*function)(void*), void* arg);
+
+ virtual void StartThread(void (*function)(void* arg), void* arg);
+
+ // Returns a string identifying the current user: the user SID string on
+ // Windows, the decimal effective user id elsewhere.
+ virtual std::string UserIdentifier() {
+#if defined(OS_WIN)
+ std::wstring user_sid;
+ bool ret = ::base::win::GetUserSidString(&user_sid);
+ // Failure is only checked through DCHECK.
+ DCHECK(ret);
+ return UTF16ToUTF8(user_sid);
+#else
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%d", int(geteuid()));
+ return buf;
+#endif
+ }
+
+ // Stores the path of a temporary test directory in *path.  The directory
+ // is created on the first call and cached in test_directory_, so later
+ // calls return the same path.
+ virtual Status GetTestDirectory(std::string* path) {
+ if (test_directory_.empty()) {
+ // "leveldb-" is passed as the prefix argument of the new directory.
+ if (!::file_util::CreateNewTempDirectory("leveldb-", &test_directory_)) {
+ return Status::IOError("Could not create temp directory.");
+ }
+ }
+ *path = FilePathToString(test_directory_);
+ return Status::OK();
+ }
+
+ // Formats one log line -- local timestamp, thread id in hex, then the
+ // printf-style message described by format/ap -- appends it to info_log
+ // and flushes.  A newline is added if the message does not end in one.
+ // Results of Append()/Flush() are not checked.
+ virtual void Logv(WritableFile* info_log, const char* format, va_list ap) {
+   // TODO(jorlow): We may want to just use Chromium's built in logging.
+
+   uint64_t thread_id = 0;
+   // Copied from base/logging.cc.
+#if defined(OS_WIN)
+   thread_id = GetCurrentThreadId();
+#elif defined(OS_MACOSX)
+   thread_id = mach_thread_self();
+#elif defined(OS_LINUX)
+   thread_id = syscall(__NR_gettid);
+#elif defined(OS_FREEBSD) || defined(OS_NACL)
+   // TODO(BSD): find a better thread ID
+   pthread_t tid = pthread_self();
+   // Copy no more bytes than fit in both objects.  (This used to read
+   // min(sizeof(r), sizeof(tid)), which names an undeclared variable `r`.)
+   memcpy(&thread_id, &tid,
+          (sizeof(tid) < sizeof(thread_id)) ? sizeof(tid) : sizeof(thread_id));
+#endif
+
+   // We try twice: the first time with a fixed-size stack allocated buffer,
+   // and the second time with a much larger dynamically allocated buffer.
+   char buffer[500];
+   for (int iter = 0; iter < 2; iter++) {
+     char* base;
+     int bufsize;
+     if (iter == 0) {
+       bufsize = sizeof(buffer);
+       base = buffer;
+     } else {
+       bufsize = 30000;
+       base = new char[bufsize];
+     }
+     char* p = base;
+     char* limit = base + bufsize;
+
+     ::base::Time::Exploded t;
+     ::base::Time::Now().LocalExplode(&t);
+     p += snprintf(p, limit - p,
+                   "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ",
+                   t.year,
+                   t.month,
+                   t.day_of_month,
+                   t.hour,
+                   t.minute,
+                   t.second,
+                   static_cast<int>(t.millisecond) * 1000,
+                   static_cast<long long unsigned int>(thread_id));
+
+     // Print the message
+     if (p < limit) {
+       // Work on a copy so that ap is still usable on the second attempt.
+       va_list backup_ap;
+       va_copy(backup_ap, ap);
+       p += vsnprintf(p, limit - p, format, backup_ap);
+       va_end(backup_ap);
+     }
+
+     // Truncate to available space if necessary
+     if (p >= limit) {
+       if (iter == 0) {
+         continue;       // Try again with larger buffer
+       } else {
+         p = limit - 1;  // Leaves room for the newline added below
+       }
+     }
+
+     // Add newline if necessary
+     if (p == base || p[-1] != '\n') {
+       *p++ = '\n';
+     }
+
+     assert(p <= limit);
+     info_log->Append(Slice(base, p - base));
+     info_log->Flush();
+     if (base != buffer) {
+       delete[] base;
+     }
+     break;
+   }
+ }
+
+ // Writes the current local time into buffer (at most size bytes, via
+ // snprintf) as YYYY/MM/DD-HH:MM:SS.uuuuuu and returns snprintf's result.
+ // The fractional part is the millisecond field multiplied by 1000.
+ virtual int AppendLocalTimeToBuffer(char* buffer, size_t size) {
+ ::base::Time::Exploded t;
+ ::base::Time::Now().LocalExplode(&t);
+ return snprintf(buffer, size,
+ "%04d/%02d/%02d-%02d:%02d:%02d.%06d",
+ t.year,
+ t.month,
+ t.day_of_month,
+ t.hour,
+ t.minute,
+ t.second,
+ static_cast<int>(t.millisecond) * 1000);
+ }
+
+ virtual uint64_t NowMicros() {
+ return ::base::TimeTicks::HighResNow().ToInternalValue();
+ }
+
+ virtual void SleepForMicroseconds(int micros) {
+ // Round up to the next millisecond.
+ ::base::PlatformThread::Sleep((micros + 999) / 1000);
+ }
+
+ private:
+ // BGThread() is the body of the background thread
+ void BGThread();
+ static void BGThreadWrapper(void* arg) {
+ reinterpret_cast<ChromiumEnv*>(arg)->BGThread();
+ }
+
+ FilePath test_directory_;
+
+ size_t page_size_;
+ ::base::Lock mu_;
+ ::base::ConditionVariable bgsignal_;
+ bool started_bgthread_;
+
+ // Entry per Schedule() call
+ struct BGItem { void* arg; void (*function)(void*); };
+ typedef std::deque<BGItem> BGQueue;
+ BGQueue queue_;
+};
+
+ChromiumEnv::ChromiumEnv()
+ : page_size_(::base::SysInfo::VMAllocationGranularity()),
+ bgsignal_(&mu_),
+ started_bgthread_(false) {
+#if defined(OS_MACOSX)
+ ::base::EnableTerminationOnHeapCorruption();
+ ::base::EnableTerminationOnOutOfMemory();
+#endif // OS_MACOSX
+}
+
+// Runs function(arg) on a new platform thread.  The thread is created from
+// the constructor, and the object deletes itself when the function
+// returns, so instances must be allocated with new and not touched
+// afterwards by the creator.
+class Thread : public ::base::PlatformThread::Delegate {
+ public:
+ Thread(void (*function)(void* arg), void* arg)
+ : function_(function), arg_(arg) {
+ ::base::PlatformThreadHandle handle;
+ bool success = ::base::PlatformThread::Create(0, this, &handle);
+ // Creation failure is only checked through DCHECK; the handle is not
+ // kept.
+ DCHECK(success);
+ }
+ virtual ~Thread() {}
+ // Thread body: run the user function, then self-destruct.
+ virtual void ThreadMain() {
+ (*function_)(arg_);
+ delete this;
+ }
+
+ private:
+ void (*function_)(void* arg);
+ void* arg_;
+};
+
+// Queues function(arg) to be run by the background thread, starting that
+// thread on the first call.  The whole body runs with mu_ held.
+void ChromiumEnv::Schedule(void (*function)(void*), void* arg) {
+ mu_.Acquire();
+
+ // Start background thread if necessary
+ if (!started_bgthread_) {
+ started_bgthread_ = true;
+ StartThread(&ChromiumEnv::BGThreadWrapper, this);
+ }
+
+ // If the queue is currently empty, the background thread may currently be
+ // waiting.
+ if (queue_.empty()) {
+ bgsignal_.Signal();
+ }
+
+ // Append to the work queue (a std::deque; items are pushed at the back)
+ queue_.push_back(BGItem());
+ queue_.back().function = function;
+ queue_.back().arg = arg;
+
+ mu_.Release();
+}
+
+// Body of the background thread: loops forever, taking one item at a time
+// from the front of queue_ and running it.  mu_ is held only while the
+// queue is inspected and popped, not while the item's function runs.
+void ChromiumEnv::BGThread() {
+ while (true) {
+ // Wait until there is an item that is ready to run
+ mu_.Acquire();
+ while (queue_.empty()) {
+ bgsignal_.Wait();
+ }
+
+ // Copy the item out before popping it.
+ void (*function)(void*) = queue_.front().function;
+ void* arg = queue_.front().arg;
+ queue_.pop_front();
+
+ mu_.Release();
+ (*function)(arg);
+ }
+}
+
+void ChromiumEnv::StartThread(void (*function)(void* arg), void* arg) {
+ new Thread(function, arg); // Will self-delete.
+}
+
+// TODO(jorlow): This won't co-exist with Chrome. Need to find a better way.
+::base::AtExitManager exit_manager;
+
+::base::LazyInstance<ChromiumEnv> default_env(::base::LINKER_INITIALIZED);
+
+}
+
+Env* Env::Default() {
+ return default_env.Pointer();
+}
+
+}
diff --git a/util/env_posix.cc b/util/env_posix.cc
new file mode 100644
index 0000000..b662f9c
--- /dev/null
+++ b/util/env_posix.cc
@@ -0,0 +1,609 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <deque>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+#if defined(LEVELDB_PLATFORM_ANDROID)
+#include <sys/stat.h>
+#endif
+#include "include/env.h"
+#include "include/slice.h"
+#include "port/port.h"
+#include "util/logging.h"
+
+namespace leveldb {
+
+namespace {
+
+// SequentialFile implemented on top of a stdio FILE*.  The stream is
+// closed with fclose() when the object is destroyed.
+class PosixSequentialFile: public SequentialFile {
+ public:
+  PosixSequentialFile(const std::string& fname, FILE* f)
+      : filename_(fname), file_(f) { }
+  virtual ~PosixSequentialFile() { fclose(file_); }
+
+  // Reads up to n bytes into scratch and points *result at the bytes that
+  // were read.  A short read at end-of-file is still OK; any other short
+  // read yields an IOError carrying strerror(errno).
+  virtual Status Read(size_t n, Slice* result, char* scratch) {
+    const size_t got = fread_unlocked(scratch, 1, n, file_);
+    *result = Slice(scratch, got);
+    if (got < n && !feof(file_)) {
+      // A partial read with an error: return a non-ok status
+      return Status::IOError(filename_, strerror(errno));
+    }
+    return Status();
+  }
+
+ private:
+  std::string filename_;  // Used in error messages
+  FILE* file_;
+};
+
+// RandomAccessFile implemented with pread() on a file descriptor.  The
+// descriptor is closed when the object is destroyed.
+class PosixRandomAccessFile: public RandomAccessFile {
+ public:
+  PosixRandomAccessFile(const std::string& fname, uint64_t size, int fd)
+      : filename_(fname), size_(size), fd_(fd) { }
+  virtual ~PosixRandomAccessFile() { close(fd_); }
+
+  // Returns the size that was supplied at construction time.
+  virtual uint64_t Size() const { return size_; }
+
+  // Reads up to n bytes at offset into scratch and points *result at the
+  // bytes pread() returned.  If pread() fails, *result is an empty slice
+  // and an IOError carrying strerror(errno) is returned.
+  virtual Status Read(uint64_t offset, size_t n, Slice* result,
+                      char* scratch) const {
+    const ssize_t got = pread(fd_, scratch, n, static_cast<off_t>(offset));
+    if (got < 0) {
+      // An error: return a non-ok status
+      *result = Slice(scratch, 0);
+      return Status::IOError(filename_, strerror(errno));
+    }
+    *result = Slice(scratch, got);
+    return Status();
+  }
+
+ private:
+  std::string filename_;  // Used in error messages
+  uint64_t size_;
+  int fd_;
+};
+
+// We preallocate up to an extra megabyte and use memcpy to append new
+// data to the file. This is safe since we either properly close the
+// file before reading from it, or for log files, the reading code
+// knows enough to skip zero suffixes.
+class PosixMmapFile : public WritableFile {
+ private:
+ std::string filename_;
+ int fd_;
+ size_t page_size_;
+ size_t map_size_; // How much extra memory to map at a time
+ char* base_; // The mapped region
+ char* limit_; // Limit of the mapped region
+ char* dst_; // Where to write next (in range [base_,limit_])
+ char* last_sync_; // Where have we synced up to
+ uint64_t file_offset_; // Offset of base_ in file
+
+ // Have we done an munmap of unsynced data?
+ bool pending_sync_;
+
+ // Round x up to the nearest multiple of y.
+ static size_t Roundup(size_t x, size_t y) {
+   const size_t multiples = (x + y - 1) / y;
+   return multiples * y;
+ }
+
+ // Round s down to a multiple of page_size_ by removing the offset within
+ // the page (the low bits selected by page_size_ - 1).
+ size_t TruncateToPageBoundary(size_t s) {
+   const size_t offset_in_page = s & (page_size_ - 1);
+   s -= offset_in_page;
+   assert((s % page_size_) == 0);
+   return s;
+ }
+
+ // Unmaps the current region, if there is one, advances file_offset_ past
+ // it, resets the region pointers to NULL and doubles map_size_ while it
+ // is below 1MB.  The result of munmap() is not checked.
+ void UnmapCurrentRegion() {
+ if (base_ != NULL) {
+ if (last_sync_ < limit_) {
+ // Defer syncing this data until next Sync() call, if any
+ pending_sync_ = true;
+ }
+ munmap(base_, limit_ - base_);
+ file_offset_ += limit_ - base_;
+ base_ = NULL;
+ limit_ = NULL;
+ last_sync_ = NULL;
+ dst_ = NULL;
+
+ // Increase the amount we map the next time, but capped at 1MB
+ if (map_size_ < (1<<20)) {
+ map_size_ *= 2;
+ }
+ }
+ }
+
+ // Sets the file length to file_offset_ + map_size_ with ftruncate() and
+ // maps those map_size_ bytes read/write (MAP_SHARED).  Must only be called
+ // when no region is mapped.  Returns false, leaving the region pointers
+ // untouched, if ftruncate() or mmap() fails.
+ bool MapNewRegion() {
+ assert(base_ == NULL);
+ if (ftruncate(fd_, file_offset_ + map_size_) < 0) {
+ return false;
+ }
+ void* ptr = mmap(NULL, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED,
+ fd_, file_offset_);
+ if (ptr == MAP_FAILED) {
+ return false;
+ }
+ // Nothing has been written to or synced from the new region yet.
+ base_ = reinterpret_cast<char*>(ptr);
+ limit_ = base_ + map_size_;
+ dst_ = base_;
+ last_sync_ = base_;
+ return true;
+ }
+
+ public:
+ // Wraps the already-open descriptor fd.  No region is mapped yet.  The
+ // initial map_size_ is 65536 rounded up to a multiple of page_size, and
+ // page_size must be a power of two (asserted below).
+ PosixMmapFile(const std::string& fname, int fd, size_t page_size)
+ : filename_(fname),
+ fd_(fd),
+ page_size_(page_size),
+ map_size_(Roundup(65536, page_size)),
+ base_(NULL),
+ limit_(NULL),
+ dst_(NULL),
+ last_sync_(NULL),
+ file_offset_(0),
+ pending_sync_(false) {
+ assert((page_size & (page_size - 1)) == 0);
+ }
+
+
+ ~PosixMmapFile() {
+ if (fd_ >= 0) {
+ PosixMmapFile::Close();
+ }
+ }
+
+ // Copies data into the mapped region, mapping a fresh region whenever the
+ // current one is full (or when nothing is mapped yet).
+ //
+ // Returns an IOError carrying strerror(errno) if a new region cannot be
+ // mapped; bytes copied before the failure stay in the file.
+ virtual Status Append(const Slice& data) {
+   const char* src = data.data();
+   size_t left = data.size();
+   while (left > 0) {
+     assert(base_ <= dst_);
+     assert(dst_ <= limit_);
+     size_t avail = limit_ - dst_;
+     if (avail == 0) {
+       UnmapCurrentRegion();
+       // A failed mapping leaves limit_ == dst_ == NULL; without this check
+       // avail would stay 0 and the loop would never terminate.
+       if (!MapNewRegion()) {
+         return Status::IOError(filename_, strerror(errno));
+       }
+       avail = limit_ - dst_;
+     }
+
+     size_t n = (left <= avail) ? left : avail;
+     memcpy(dst_, src, n);
+     dst_ += n;
+     src += n;
+     left -= n;
+   }
+   return Status::OK();
+ }
+
+ // Unmaps the current region, truncates the file to drop the unused tail of
+ // that region, and closes the descriptor.  Returns the first error
+ // encountered.  fd_ is set to -1 afterwards.
+ virtual Status Close() {
+ Status s;
+ // Must be computed before UnmapCurrentRegion() resets limit_ and dst_.
+ size_t unused = limit_ - dst_;
+ UnmapCurrentRegion();
+ if (unused > 0) {
+ // Trim the extra space at the end of the file
+ if (ftruncate(fd_, file_offset_ - unused) < 0) {
+ s = Status::IOError(filename_, strerror(errno));
+ }
+ }
+
+ if (close(fd_) < 0) {
+ // Keep the ftruncate error, if any, rather than overwriting it.
+ if (s.ok()) {
+ s = Status::IOError(filename_, strerror(errno));
+ }
+ }
+
+ fd_ = -1;
+ base_ = NULL;
+ limit_ = NULL;
+ return s;
+ }
+
+ virtual Status Flush() {
+ return Status::OK();
+ }
+
+ // Syncs written data: an fdatasync() of the descriptor if a previously
+ // unmapped region still had unsynced bytes, then an msync() of the pages
+ // of the current region written since the last Sync().
+ virtual Status Sync() {
+ Status s;
+
+ if (pending_sync_) {
+ // Some unmapped data was not synced
+ pending_sync_ = false;
+ if (fdatasync(fd_) < 0) {
+ s = Status::IOError(filename_, strerror(errno));
+ }
+ }
+
+ if (dst_ > last_sync_) {
+ // Find the beginnings of the pages that contain the first and last
+ // bytes to be synced.
+ size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
+ size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
+ // last_sync_ is advanced even if the msync() below fails.
+ last_sync_ = dst_;
+ // The length covers page p1 through the end of page p2.
+ if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
+ s = Status::IOError(filename_, strerror(errno));
+ }
+ }
+
+ return s;
+ }
+};
+
+// Applies (lock == true) or removes (lock == false) a write lock covering
+// the whole file referred to by fd, using fcntl(F_SETLK).  Returns fcntl's
+// result.  errno is reset to 0 before the call.
+static int LockOrUnlock(int fd, bool lock) {
+  errno = 0;
+  struct flock request;
+  memset(&request, 0, sizeof(request));
+  if (lock) {
+    request.l_type = F_WRLCK;
+  } else {
+    request.l_type = F_UNLCK;
+  }
+  request.l_whence = SEEK_SET;
+  request.l_start = 0;
+  request.l_len = 0;        // Lock/unlock entire file
+  return fcntl(fd, F_SETLK, &request);
+}
+
+class PosixFileLock : public FileLock {
+ public:
+ int fd_;
+};
+
+class PosixEnv : public Env {
+ public:
+ PosixEnv();
+ virtual ~PosixEnv() {
+ fprintf(stderr, "Destroying Env::Default()\n");
+ exit(1);
+ }
+
+ // Opens fname for reading with fopen().  On success *result is a new
+ // PosixSequentialFile; on failure *result is NULL and an IOError carrying
+ // strerror(errno) is returned.
+ virtual Status NewSequentialFile(const std::string& fname,
+                                  SequentialFile** result) {
+   FILE* stream = fopen(fname.c_str(), "r");
+   if (stream != NULL) {
+     *result = new PosixSequentialFile(fname, stream);
+     return Status::OK();
+   }
+   *result = NULL;
+   return Status::IOError(fname, strerror(errno));
+ }
+
+ // Opens fname read-only and wraps the descriptor, together with the size
+ // reported by fstat(), in a PosixRandomAccessFile.  On failure *result is
+ // NULL, the descriptor (if any) is closed, and an IOError carrying
+ // strerror(errno) is returned.
+ virtual Status NewRandomAccessFile(const std::string& fname,
+                                    RandomAccessFile** result) {
+   *result = NULL;
+   const int fd = open(fname.c_str(), O_RDONLY);
+   if (fd < 0) {
+     return Status::IOError(fname, strerror(errno));
+   }
+   struct stat info;
+   if (fstat(fd, &info) != 0) {
+     // Build the status first so that close() cannot disturb errno.
+     Status failure = Status::IOError(fname, strerror(errno));
+     close(fd);
+     return failure;
+   }
+   *result = new PosixRandomAccessFile(fname, info.st_size, fd);
+   return Status::OK();
+ }
+
+ // Creates (or truncates) fname with mode 0644 and wraps the descriptor in
+ // a PosixMmapFile.  On failure *result is NULL and an IOError carrying
+ // strerror(errno) is returned.
+ virtual Status NewWritableFile(const std::string& fname,
+                                WritableFile** result) {
+   const int fd = open(fname.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
+   if (fd < 0) {
+     *result = NULL;
+     return Status::IOError(fname, strerror(errno));
+   }
+   *result = new PosixMmapFile(fname, fd, page_size_);
+   return Status();
+ }
+
+ virtual bool FileExists(const std::string& fname) {
+ return access(fname.c_str(), F_OK) == 0;
+ }
+
+ // Replaces *result with the names of every entry readdir() returns for
+ // dir.  Returns an IOError carrying strerror(errno) if the directory
+ // cannot be opened.
+ virtual Status GetChildren(const std::string& dir,
+                            std::vector<std::string>* result) {
+   result->clear();
+   DIR* handle = opendir(dir.c_str());
+   if (handle == NULL) {
+     return Status::IOError(dir, strerror(errno));
+   }
+   for (struct dirent* item = readdir(handle); item != NULL;
+        item = readdir(handle)) {
+     result->push_back(item->d_name);
+   }
+   closedir(handle);
+   return Status::OK();
+ }
+
+ // Removes fname with unlink().
+ virtual Status DeleteFile(const std::string& fname) {
+   if (unlink(fname.c_str()) == 0) {
+     return Status();
+   }
+   return Status::IOError(fname, strerror(errno));
+ }
+
+ // Creates the directory name with mode 0755.
+ virtual Status CreateDir(const std::string& name) {
+   if (mkdir(name.c_str(), 0755) == 0) {
+     return Status();
+   }
+   return Status::IOError(name, strerror(errno));
+ }
+
+ // Removes the directory name with rmdir().
+ virtual Status DeleteDir(const std::string& name) {
+   if (rmdir(name.c_str()) == 0) {
+     return Status();
+   }
+   return Status::IOError(name, strerror(errno));
+ }
+
+ // Stores the st_size reported by stat() for fname in *size.  On failure
+ // *size is set to 0 and an IOError carrying strerror(errno) is returned.
+ virtual Status GetFileSize(const std::string& fname, uint64_t* size) {
+   struct stat info;
+   if (stat(fname.c_str(), &info) == 0) {
+     *size = info.st_size;
+     return Status();
+   }
+   *size = 0;
+   return Status::IOError(fname, strerror(errno));
+ }
+
+ // Renames src to target with rename().
+ virtual Status RenameFile(const std::string& src, const std::string& target) {
+   if (rename(src.c_str(), target.c_str()) == 0) {
+     return Status();
+   }
+   return Status::IOError(src, strerror(errno));
+ }
+
+ // Opens (creating if necessary) fname and takes a whole-file lock on it
+ // via LockOrUnlock().  On success *lock is a new PosixFileLock holding the
+ // descriptor; on failure *lock stays NULL, the descriptor is closed and an
+ // IOError carrying strerror(errno) is returned.
+ virtual Status LockFile(const std::string& fname, FileLock** lock) {
+   *lock = NULL;
+   const int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
+   if (fd < 0) {
+     return Status::IOError(fname, strerror(errno));
+   }
+   if (LockOrUnlock(fd, true) == -1) {
+     // Build the status first so that close() cannot disturb errno.
+     Status failure = Status::IOError("lock " + fname, strerror(errno));
+     close(fd);
+     return failure;
+   }
+   PosixFileLock* held = new PosixFileLock;
+   held->fd_ = fd;
+   *lock = held;
+   return Status();
+ }
+
+ // Releases a lock obtained from LockFile(): removes the file lock, closes
+ // the descriptor and deletes the lock object.  The descriptor is closed
+ // and the object deleted even when removing the lock fails.
+ virtual Status UnlockFile(FileLock* lock) {
+   PosixFileLock* held = reinterpret_cast<PosixFileLock*>(lock);
+   Status outcome;
+   const int rc = LockOrUnlock(held->fd_, false);
+   if (rc == -1) {
+     outcome = Status::IOError(strerror(errno));
+   }
+   close(held->fd_);
+   delete held;
+   return outcome;
+ }
+
+ virtual void Schedule(void (*function)(void*), void* arg);
+
+ virtual void StartThread(void (*function)(void* arg), void* arg);
+
+ // Stores the test directory in *result: the value of $TEST_TMPDIR when it
+ // is set and non-empty, otherwise /tmp/leveldbtest-<effective uid>.  The
+ // directory is created if possible; the result of that attempt is
+ // ignored and OK is always returned.
+ virtual Status GetTestDirectory(std::string* result) {
+   const char* configured = getenv("TEST_TMPDIR");
+   const bool use_configured = (configured != NULL) && (configured[0] != '\0');
+   if (use_configured) {
+     *result = configured;
+   } else {
+     char fallback[100];
+     snprintf(fallback, sizeof(fallback), "/tmp/leveldbtest-%d",
+              int(geteuid()));
+     *result = fallback;
+   }
+   // Directory may already exist
+   CreateDir(*result);
+   return Status::OK();
+ }
+
+ // Formats one log line -- local timestamp with microseconds, thread id in
+ // hex, then the printf-style message described by format/ap -- appends it
+ // to info_log and flushes.  A newline is added if the message does not
+ // end in one.  Results of Append()/Flush() are not checked.
+ virtual void Logv(WritableFile* info_log, const char* format, va_list ap) {
+ // Derive a numeric id from the leading bytes of the pthread_t value.
+ // NOTE(review): `min` is unqualified here; presumably supplied by one of
+ // the included headers -- confirm.
+ pthread_t tid = pthread_self();
+ uint64_t thread_id = 0;
+ memcpy(&thread_id, &tid, min(sizeof(thread_id), sizeof(tid)));
+
+ // We try twice: the first time with a fixed-size stack allocated buffer,
+ // and the second time with a much larger dynamically allocated buffer.
+ char buffer[500];
+ for (int iter = 0; iter < 2; iter++) {
+ char* base;
+ int bufsize;
+ if (iter == 0) {
+ bufsize = sizeof(buffer);
+ base = buffer;
+ } else {
+ bufsize = 30000;
+ base = new char[bufsize];
+ }
+ char* p = base;
+ char* limit = base + bufsize;
+
+ struct timeval now_tv;
+ gettimeofday(&now_tv, NULL);
+ const time_t seconds = now_tv.tv_sec;
+ struct tm t;
+ localtime_r(&seconds, &t);
+ p += snprintf(p, limit - p,
+ "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ",
+ t.tm_year + 1900,
+ t.tm_mon + 1,
+ t.tm_mday,
+ t.tm_hour,
+ t.tm_min,
+ t.tm_sec,
+ static_cast<int>(now_tv.tv_usec),
+ static_cast<long long unsigned int>(thread_id));
+
+ // Print the message
+ if (p < limit) {
+ // Work on a copy so that ap is still usable on the second attempt.
+ va_list backup_ap;
+ va_copy(backup_ap, ap);
+ p += vsnprintf(p, limit - p, format, backup_ap);
+ va_end(backup_ap);
+ }
+
+ // Truncate to available space if necessary
+ if (p >= limit) {
+ if (iter == 0) {
+ continue; // Try again with larger buffer
+ } else {
+ // Leaves room for the newline added below.
+ p = limit - 1;
+ }
+ }
+
+ // Add newline if necessary
+ if (p == base || p[-1] != '\n') {
+ *p++ = '\n';
+ }
+
+ assert(p <= limit);
+ info_log->Append(Slice(base, p - base));
+ info_log->Flush();
+ // Only the second attempt uses a heap buffer.
+ if (base != buffer) {
+ delete[] base;
+ }
+ break;
+ }
+ }
+
+ // Returns the gettimeofday() time as a single microsecond count:
+ // tv_sec * 1000000 + tv_usec.
+ virtual uint64_t NowMicros() {
+   struct timeval now;
+   gettimeofday(&now, NULL);
+   const uint64_t whole_seconds = static_cast<uint64_t>(now.tv_sec);
+   return whole_seconds * 1000000 + now.tv_usec;
+ }
+
+ virtual void SleepForMicroseconds(int micros) {
+ usleep(micros);
+ }
+
+ private:
+ // Checks the return code of a pthread call.  A zero result is a no-op;
+ // anything else prints "pthread <label>: <strerror(result)>" to stderr and
+ // terminates the process with exit(1).
+ void PthreadCall(const char* label, int result) {
+   if (result == 0) {
+     return;
+   }
+   fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
+   exit(1);
+ }
+
+ // BGThread() is the body of the background thread
+ void BGThread();
+ static void* BGThreadWrapper(void* arg) {
+ reinterpret_cast<PosixEnv*>(arg)->BGThread();
+ return NULL;
+ }
+
+ size_t page_size_;
+ pthread_mutex_t mu_;
+ pthread_cond_t bgsignal_;
+ pthread_t bgthread_;
+ bool started_bgthread_;
+
+ // Entry per Schedule() call
+ struct BGItem { void* arg; void (*function)(void*); };
+ typedef std::deque<BGItem> BGQueue;
+ BGQueue queue_;
+};
+
+PosixEnv::PosixEnv() : page_size_(getpagesize()),
+ started_bgthread_(false) {
+ PthreadCall("mutex_init", pthread_mutex_init(&mu_, NULL));
+ PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, NULL));
+}
+
+// Queues function(arg) to be run by the background thread, creating that
+// thread on the first call.  The whole body runs with mu_ held; any pthread
+// failure terminates the process through PthreadCall().
+void PosixEnv::Schedule(void (*function)(void*), void* arg) {
+ PthreadCall("lock", pthread_mutex_lock(&mu_));
+
+ // Start background thread if necessary
+ if (!started_bgthread_) {
+ started_bgthread_ = true;
+ PthreadCall(
+ "create thread",
+ pthread_create(&bgthread_, NULL, &PosixEnv::BGThreadWrapper, this));
+ }
+
+ // If the queue is currently empty, the background thread may currently be
+ // waiting.
+ if (queue_.empty()) {
+ PthreadCall("signal", pthread_cond_signal(&bgsignal_));
+ }
+
+ // Append to the work queue (a std::deque; items are pushed at the back)
+ queue_.push_back(BGItem());
+ queue_.back().function = function;
+ queue_.back().arg = arg;
+
+ PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+}
+
+// Body of the background thread: loops forever, taking one item at a time
+// from the front of queue_ and running it.  mu_ is held only while the
+// queue is inspected and popped, not while the item's function runs.
+void PosixEnv::BGThread() {
+ while (true) {
+ // Wait until there is an item that is ready to run
+ PthreadCall("lock", pthread_mutex_lock(&mu_));
+ while (queue_.empty()) {
+ PthreadCall("wait", pthread_cond_wait(&bgsignal_, &mu_));
+ }
+
+ // Copy the item out before popping it.
+ void (*function)(void*) = queue_.front().function;
+ void* arg = queue_.front().arg;
+ queue_.pop_front();
+
+ PthreadCall("unlock", pthread_mutex_unlock(&mu_));
+ (*function)(arg);
+ }
+}
+
+namespace {
+// Heap-allocated bundle of the user's function and argument, handed to a
+// new thread through pthread_create's single void* parameter.
+struct StartThreadState {
+ void (*user_function)(void*);
+ void* arg;
+};
+}
+// pthread entry point: runs the user function with its argument, then
+// frees the StartThreadState it was given.
+static void* StartThreadWrapper(void* arg) {
+ StartThreadState* state = reinterpret_cast<StartThreadState*>(arg);
+ state->user_function(state->arg);
+ delete state;
+ return NULL;
+}
+
+void PosixEnv::StartThread(void (*function)(void* arg), void* arg) {
+ pthread_t t;
+ StartThreadState* state = new StartThreadState;
+ state->user_function = function;
+ state->arg = arg;
+ PthreadCall("start thread",
+ pthread_create(&t, NULL, &StartThreadWrapper, state));
+}
+
+}
+
+// The default environment is a single PosixEnv, created on the first call
+// to Env::Default() through pthread_once.  Nothing in this file deletes it.
+static pthread_once_t once = PTHREAD_ONCE_INIT;
+static Env* default_env;
+static void InitDefaultEnv() { default_env = new PosixEnv; }
+
+// Returns the shared default environment, creating it if necessary.
+Env* Env::Default() {
+ pthread_once(&once, InitDefaultEnv);
+ return default_env;
+}
+
+}
diff --git a/util/env_test.cc b/util/env_test.cc
new file mode 100644
index 0000000..4d17564
--- /dev/null
+++ b/util/env_test.cc
@@ -0,0 +1,102 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/env.h"
+
+#include "port/port.h"
+#include "util/testharness.h"
+
+namespace leveldb {
+
+static const int kDelayMicros = 100000;  // 0.1 s; passed to SleepForMicroseconds() while tests wait for other threads
+
+// Fixture for the tests in this file: each TEST(EnvPosixTest, ...) body runs
+// as a member function of a class derived from this one.
+class EnvPosixTest {
+ public:
+  EnvPosixTest() : env_(Env::Default()) { }
+
+  Env* env_;  // Environment under test
+
+ private:
+  port::Mutex mu_;
+  std::string events_;
+};
+
+// Callback for Schedule(): treats "ptr" as a bool* and sets the pointee to
+// true.
+static void SetBool(void* ptr) {
+  bool* const flag = reinterpret_cast<bool*>(ptr);
+  *flag = true;
+}
+
+// A single scheduled callback is expected to have run by the time
+// kDelayMicros have elapsed.
+TEST(EnvPosixTest, RunImmediately) {
+  bool called = false;
+  env_->Schedule(&SetBool, &called);
+
+  // Give the background thread time to pick the item up.
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+
+  ASSERT_TRUE(called);
+}
+
+// Callbacks scheduled back-to-back are expected to execute in the order in
+// which they were scheduled.
+TEST(EnvPosixTest, RunMany) {
+  int last_id = 0;
+
+  // Each callback checks that its predecessor was the most recent one to
+  // run, then records itself as the most recent.
+  struct CB {
+    int* last_id_ptr;  // Slot shared by all callbacks
+    int id;            // Position of this callback in the expected order
+
+    CB(int* p, int i) : last_id_ptr(p), id(i) { }
+
+    static void Run(void* v) {
+      CB* self = reinterpret_cast<CB*>(v);
+      ASSERT_EQ(self->id-1, *self->last_id_ptr);
+      *self->last_id_ptr = self->id;
+    }
+  };
+
+  // The objects live on this stack frame; the sleep below keeps the frame
+  // alive while the background thread works through the queue.
+  CB first(&last_id, 1);
+  CB second(&last_id, 2);
+  CB third(&last_id, 3);
+  CB fourth(&last_id, 4);
+  env_->Schedule(&CB::Run, &first);
+  env_->Schedule(&CB::Run, &second);
+  env_->Schedule(&CB::Run, &third);
+  env_->Schedule(&CB::Run, &fourth);
+
+  Env::Default()->SleepForMicroseconds(kDelayMicros);
+  ASSERT_EQ(4, last_id);
+}
+
+// Shared between the StartThread test and the threads it launches.
+struct State {
+  port::Mutex mu;   // Held while the counters below are read or written
+  int val;          // Incremented once by every ThreadBody call
+  int num_running;  // Decremented once by every ThreadBody call
+};
+
+// Thread entry point: bumps val and drops num_running, both under mu.
+static void ThreadBody(void* arg) {
+  State* const shared = reinterpret_cast<State*>(arg);
+  shared->mu.Lock();
+  ++shared->val;
+  --shared->num_running;
+  shared->mu.Unlock();
+}
+
+// Launches three threads and polls until all of them have reported in, then
+// checks that each one ran its body exactly once.
+TEST(EnvPosixTest, StartThread) {
+  State state;
+  state.val = 0;
+  state.num_running = 3;
+  for (int started = 0; started < 3; ++started) {
+    env_->StartThread(&ThreadBody, &state);
+  }
+
+  // Poll the counter under the mutex; sleep between polls.
+  for (;;) {
+    state.mu.Lock();
+    const int still_running = state.num_running;
+    state.mu.Unlock();
+    if (still_running == 0) {
+      break;
+    }
+    Env::Default()->SleepForMicroseconds(kDelayMicros);
+  }
+  ASSERT_EQ(state.val, 3);
+}
+
+}
+
+// Entry point of the test binary: runs every test registered via TEST() and
+// uses the result as the process exit status.
+int main(int argc, char** argv) {
+  const int exit_status = leveldb::test::RunAllTests();
+  return exit_status;
+}
diff --git a/util/hash.cc b/util/hash.cc
new file mode 100644
index 0000000..d19afd1
--- /dev/null
+++ b/util/hash.cc
@@ -0,0 +1,45 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <string.h>
+#include "util/coding.h"
+#include "util/hash.h"
+
+namespace leveldb {
+
+// Returns a 32-bit hash of the n bytes starting at data, mixed with seed.
+//
+// Full 4-byte groups are read with DecodeFixed32.  The 1-3 trailing bytes
+// are widened through uint8_t before being shifted in, so the result does
+// not depend on whether plain char is signed on the target platform and no
+// negative value is ever left-shifted.
+uint32_t Hash(const char* data, size_t n, uint32_t seed) {
+  // Similar to murmur hash
+  const uint32_t m = 0xc6a4a793;
+  const uint32_t r = 24;
+  const char* limit = data + n;
+  uint32_t h = seed ^ (n * m);
+
+  // Pick up four bytes at a time
+  while (data + 4 <= limit) {
+    uint32_t w = DecodeFixed32(data);
+    data += 4;
+    h += w;
+    h *= m;
+    h ^= (h >> 16);
+  }
+
+  // Pick up remaining bytes
+  switch (limit - data) {
+    case 3:
+      h += static_cast<uint32_t>(static_cast<uint8_t>(data[2])) << 16;
+      // fall through
+    case 2:
+      h += static_cast<uint32_t>(static_cast<uint8_t>(data[1])) << 8;
+      // fall through
+    case 1:
+      h += static_cast<uint8_t>(data[0]);
+      h *= m;
+      h ^= (h >> r);
+      break;
+  }
+  return h;
+}
+
+
+}
diff --git a/util/hash.h b/util/hash.h
new file mode 100644
index 0000000..8889d56
--- /dev/null
+++ b/util/hash.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Simple hash function used for internal data structures
+
+#ifndef STORAGE_LEVELDB_UTIL_HASH_H_
+#define STORAGE_LEVELDB_UTIL_HASH_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace leveldb {
+
+extern uint32_t Hash(const char* data, size_t n, uint32_t seed);
+
+}
+
+#endif // STORAGE_LEVELDB_UTIL_HASH_H_
diff --git a/util/histogram.cc b/util/histogram.cc
new file mode 100644
index 0000000..c5178ef
--- /dev/null
+++ b/util/histogram.cc
@@ -0,0 +1,128 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <math.h>
+#include <stdio.h>
+#include "port/port.h"
+#include "util/histogram.h"
+
+namespace leveldb {
+
+const double Histogram::kBucketLimit[kNumBuckets] = {  // Ascending upper limits; Add() counts a value in the first bucket whose limit is greater than the value
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 45,
+ 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 300, 350, 400, 450,
+ 500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000,
+ 3500, 4000, 4500, 5000, 6000, 7000, 8000, 9000, 10000, 12000, 14000,
+ 16000, 18000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 60000,
+ 70000, 80000, 90000, 100000, 120000, 140000, 160000, 180000, 200000,
+ 250000, 300000, 350000, 400000, 450000, 500000, 600000, 700000, 800000,
+ 900000, 1000000, 1200000, 1400000, 1600000, 1800000, 2000000, 2500000,
+ 3000000, 3500000, 4000000, 4500000, 5000000, 6000000, 7000000, 8000000,
+ 9000000, 10000000, 12000000, 14000000, 16000000, 18000000, 20000000,
+ 25000000, 30000000, 35000000, 40000000, 45000000, 50000000, 60000000,
+ 70000000, 80000000, 90000000, 100000000, 120000000, 140000000, 160000000,
+ 180000000, 200000000, 250000000, 300000000, 350000000, 400000000,
+ 450000000, 500000000, 600000000, 700000000, 800000000, 900000000,
+ 1000000000, 1200000000, 1400000000, 1600000000, 1800000000, 2000000000,
+ 2500000000.0, 3000000000.0, 3500000000.0, 4000000000.0, 4500000000.0,
+ 5000000000.0, 6000000000.0, 7000000000.0, 8000000000.0, 9000000000.0,
+ 1e200,  // Limit of the final, catch-all bucket; Clear() also uses it as the starting value of min_
+};
+
+// Resets the histogram to the empty state: every bucket, the count and both
+// sums become zero, max_ becomes 0 and min_ becomes the largest bucket
+// limit.
+void Histogram::Clear() {
+  num_ = 0;
+  sum_ = 0;
+  sum_squares_ = 0;
+  max_ = 0;
+  min_ = kBucketLimit[kNumBuckets-1];
+  for (double* slot = buckets_; slot != buckets_ + kNumBuckets; ++slot) {
+    *slot = 0;
+  }
+}
+
+// Records one sample: bumps the matching bucket and updates the count, the
+// running sums and the observed minimum and maximum.
+void Histogram::Add(double value) {
+  // Stop at the first bucket whose limit is not <= value; values beyond the
+  // table land in the last bucket.  Linear search is fast enough for our
+  // usage in db_bench.
+  int bucket = 0;
+  for (; bucket < kNumBuckets - 1; ++bucket) {
+    if (!(kBucketLimit[bucket] <= value)) {
+      break;
+    }
+  }
+  buckets_[bucket] += 1.0;
+
+  num_++;
+  sum_ += value;
+  sum_squares_ += (value * value);
+  if (value < min_) min_ = value;
+  if (value > max_) max_ = value;
+}
+
+// Returns the 50th percentile of the recorded samples.
+double Histogram::Median() const {
+  const double kMedianPercentile = 50.0;
+  return Percentile(kMedianPercentile);
+}
+
+// Estimates the value below which "p" percent of the samples fall.
+//
+// Walks the buckets until the cumulative count reaches p% of the total, then
+// interpolates linearly inside that bucket.  The result is clamped to the
+// observed [min_, max_] range.  For a histogram with no samples this
+// returns 0.
+double Histogram::Percentile(double p) const {
+  double threshold = num_ * (p / 100.0);
+  double sum = 0;
+  for (int b = 0; b < kNumBuckets; b++) {
+    sum += buckets_[b];
+    if (sum >= threshold) {
+      // Scale linearly within this bucket
+      double left_point = (b == 0) ? 0 : kBucketLimit[b-1];
+      double right_point = kBucketLimit[b];
+      double left_sum = sum - buckets_[b];
+      double right_sum = sum;
+      // The bucket that satisfies the threshold can be empty (no samples at
+      // all, or p == 0).  Interpolating there would divide 0 by 0 and return
+      // NaN, which also slips through the clamps below, so stay at the left
+      // edge of the bucket instead.
+      double pos = 0;
+      if (right_sum > left_sum) {
+        pos = (threshold - left_sum) / (right_sum - left_sum);
+      }
+      double r = left_point + (right_point - left_point) * pos;
+      if (r < min_) r = min_;
+      if (r > max_) r = max_;
+      return r;
+    }
+  }
+  return max_;
+}
+
+// Returns the arithmetic mean of the samples, or 0 when there are none.
+double Histogram::Average() const {
+  return (num_ == 0.0) ? 0 : (sum_ / num_);
+}
+
+// Returns the population standard deviation of the samples, or 0 when there
+// are none.  Computed from the running sum and sum of squares.
+double Histogram::StandardDeviation() const {
+  if (num_ == 0.0) return 0;
+  double variance = (sum_squares_ * num_ - sum_ * sum_) / (num_ * num_);
+  // The subtraction above cancels almost completely when the samples are
+  // (nearly) identical, and rounding can leave a tiny negative result.
+  // sqrt() of that would be NaN, so clamp to the mathematical minimum.
+  if (variance < 0.0) variance = 0.0;
+  return sqrt(variance);
+}
+
+// Renders the histogram as text: two summary lines, a separator, and then
+// one row per non-empty bucket showing its range, count, percentage,
+// cumulative percentage and a bar of '#' marks.
+std::string Histogram::ToString() const {
+  char line[200];
+  std::string out;
+
+  // Summary lines.
+  snprintf(line, sizeof(line),
+           "Count: %.0f Average: %.4f StdDev: %.2f\n",
+           num_, Average(), StandardDeviation());
+  out.append(line);
+  const double shown_min = (num_ == 0.0 ? 0.0 : min_);
+  snprintf(line, sizeof(line),
+           "Min: %.4f Median: %.4f Max: %.4f\n",
+           shown_min, Median(), max_);
+  out.append(line);
+  out.append("------------------------------------------------------\n");
+
+  // One row per bucket that received at least one sample.
+  const double percent_per_sample = 100.0 / num_;
+  double cumulative = 0;
+  for (int b = 0; b < kNumBuckets; b++) {
+    const double count = buckets_[b];
+    if (count <= 0.0) continue;
+    cumulative += count;
+    const double lower = (b == 0) ? 0.0 : kBucketLimit[b-1];
+    snprintf(line, sizeof(line),
+             "[ %7.0f, %7.0f ) %7.0f %7.3f%% %7.3f%% ",
+             lower,                             // left
+             kBucketLimit[b],                   // right
+             count,                             // count
+             percent_per_sample * count,        // percentage
+             percent_per_sample * cumulative);  // cumulative percentage
+    out.append(line);
+
+    // Add hash marks based on percentage; 20 marks for 100%.
+    const int marks = static_cast<int>(20*(count / num_) + 0.5);
+    out.append(marks, '#');
+    out.push_back('\n');
+  }
+  return out;
+}
+
+}
diff --git a/util/histogram.h b/util/histogram.h
new file mode 100644
index 0000000..f72f122
--- /dev/null
+++ b/util/histogram.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_UTIL_HISTOGRAM_H_
+#define STORAGE_LEVELDB_UTIL_HISTOGRAM_H_
+
+#include <string>
+
+namespace leveldb {
+
+// Collects double-valued samples into a fixed set of buckets and reports
+// summary statistics (count, min, max, average, standard deviation, median)
+// plus a textual rendering of the distribution.
+class Histogram {
+ public:
+  // Starts out empty, so Add() and ToString() are well defined without a
+  // prior call to Clear().
+  Histogram() { Clear(); }
+  ~Histogram() { }
+
+  // Discards all samples recorded so far.
+  void Clear();
+  // Records one sample.
+  void Add(double value);
+
+  // Returns a multi-line, human-readable report.
+  std::string ToString() const;
+
+ private:
+  double min_;          // Smallest sample seen
+  double max_;          // Largest sample seen
+  double num_;          // Number of samples
+  double sum_;          // Sum of all samples
+  double sum_squares_;  // Sum of the squares of all samples
+
+  enum { kNumBuckets = 154 };
+  static const double kBucketLimit[kNumBuckets];  // Upper limit per bucket
+  double buckets_[kNumBuckets];                   // Sample count per bucket
+
+  double Median() const;
+  double Percentile(double p) const;
+  double Average() const;
+  double StandardDeviation() const;
+};
+
+}
+
+#endif // STORAGE_LEVELDB_UTIL_HISTOGRAM_H_
diff --git a/util/logging.cc b/util/logging.cc
new file mode 100644
index 0000000..6b7c410
--- /dev/null
+++ b/util/logging.cc
@@ -0,0 +1,81 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/logging.h"
+
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "include/env.h"
+#include "include/slice.h"
+
+namespace leveldb {
+
+// Appends the decimal representation of "num" to *str.
+void AppendNumberTo(std::string* str, uint64_t num) {
+  // A uint64_t has at most 20 decimal digits, so 30 bytes is ample.
+  char digits[30];
+  const unsigned long long printable = static_cast<unsigned long long>(num);
+  snprintf(digits, sizeof(digits), "%llu", printable);
+  str->append(digits);
+}
+
+// Appends "value" to *str, copying bytes in the range ' ' .. '~' verbatim
+// and writing every other byte as a \xNN hex escape.
+void AppendEscapedStringTo(std::string* str, const Slice& value) {
+  // size_t index: matches the unsigned size being compared against and
+  // cannot overflow on very large inputs the way an int index could.
+  for (size_t i = 0; i < value.size(); i++) {
+    char c = value[i];
+    if (c >= ' ' && c <= '~') {
+      str->push_back(c);
+    } else {
+      char buf[10];
+      // Mask to one byte so a negative char does not print as ffffffNN.
+      snprintf(buf, sizeof(buf), "\\x%02x",
+               static_cast<unsigned int>(c) & 0xff);
+      str->append(buf);
+    }
+  }
+}
+
+// Returns the decimal representation of "num" as a new string.
+std::string NumberToString(uint64_t num) {
+  std::string text;
+  AppendNumberTo(&text, num);
+  return text;
+}
+
+// Returns a copy of "value" produced by AppendEscapedStringTo.
+std::string EscapeString(const Slice& value) {
+  std::string escaped;
+  AppendEscapedStringTo(&escaped, value);
+  return escaped;
+}
+
+// If *in begins with the character "c", drops that character from *in and
+// returns true.  Otherwise leaves *in untouched and returns false.
+bool ConsumeChar(Slice* in, char c) {
+  if (in->empty() || (*in)[0] != c) {
+    return false;
+  }
+  in->remove_prefix(1);
+  return true;
+}
+
+// Consumes the run of decimal digits at the front of *in and stores its
+// value in *val.  Returns true if at least one digit was consumed.  Returns
+// false without writing *val if the value would not fit in a uint64_t; in
+// that case the digits read so far have already been removed from *in.
+bool ConsumeDecimalNumber(Slice* in, uint64_t* val) {
+  static const uint64_t kMaxUint64 = ~static_cast<uint64_t>(0);
+  uint64_t parsed = 0;
+  int num_digits = 0;
+  for (; !in->empty(); in->remove_prefix(1)) {
+    const char ch = (*in)[0];
+    if (ch < '0' || ch > '9') {
+      break;
+    }
+    ++num_digits;
+    const int delta = (ch - '0');
+    // Check before multiplying so the accumulator itself never wraps.
+    if (parsed > kMaxUint64/10 ||
+        (parsed == kMaxUint64/10 && delta > kMaxUint64%10)) {
+      // Overflow
+      return false;
+    }
+    parsed = (parsed * 10) + delta;
+  }
+  *val = parsed;
+  return (num_digits > 0);
+}
+
+}
diff --git a/util/logging.h b/util/logging.h
new file mode 100644
index 0000000..1cd0a4b
--- /dev/null
+++ b/util/logging.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Must not be included from any .h files to avoid polluting the namespace
+// with macros.
+
+#ifndef STORAGE_LEVELDB_UTIL_LOGGING_H_
+#define STORAGE_LEVELDB_UTIL_LOGGING_H_
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string>
+#include "port/port.h"
+
+namespace leveldb {
+
+class Slice;
+class WritableFile;
+
+// Append a human-readable printout of "num" to *str
+extern void AppendNumberTo(std::string* str, uint64_t num);
+
+// Append a human-readable printout of "value" to *str.
+// Escapes any non-printable characters found in "value".
+extern void AppendEscapedStringTo(std::string* str, const Slice& value);
+
+// Return a human-readable printout of "num"
+extern std::string NumberToString(uint64_t num);
+
+// Return a human-readable version of "value".
+// Escapes any non-printable characters found in "value".
+extern std::string EscapeString(const Slice& value);
+
+// If *in starts with "c", advances *in past the first character and
+// returns true. Otherwise, returns false.
+extern bool ConsumeChar(Slice* in, char c);
+
+// Parse a decimal number from the front of "*in" into "*val".  On success,
+// returns true, advances "*in" past the consumed digits and sets "*val" to
+// the numeric value.  Otherwise, returns false and leaves "*in" in an
+// unspecified state.
+extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val);
+
+}
+
+#endif // STORAGE_LEVELDB_UTIL_LOGGING_H_
diff --git a/util/mutexlock.h b/util/mutexlock.h
new file mode 100644
index 0000000..05fe279
--- /dev/null
+++ b/util/mutexlock.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_
+#define STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_
+
+#include "port/port.h"
+
+namespace leveldb {
+
+// Helper class that locks a mutex on construction and unlocks the mutex when
+// the destructor of the MutexLock object is invoked.
+//
+// Typical usage:
+//
+// void MyClass::MyMethod() {
+// MutexLock l(&mu_); // mu_ is an instance variable
+// ... some complex code, possibly with multiple return paths ...
+// }
+
+class MutexLock {
+ public:
+  // Locks *mu; the destructor of this object unlocks it again.
+  explicit MutexLock(port::Mutex* mu) : mu_(mu) { mu_->Lock(); }
+
+  // Unlocks the mutex that the constructor locked.
+  ~MutexLock() { mu_->Unlock(); }
+
+ private:
+  // No copying allowed
+  MutexLock(const MutexLock&);
+  void operator=(const MutexLock&);
+
+  port::Mutex* const mu_;
+};
+
+}
+
+
+#endif // STORAGE_LEVELDB_UTIL_MUTEXLOCK_H_
diff --git a/util/options.cc b/util/options.cc
new file mode 100644
index 0000000..b792bb1
--- /dev/null
+++ b/util/options.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "include/options.h"
+
+#include "include/comparator.h"
+#include "include/env.h"
+
+namespace leveldb {
+
+Options::Options()  // Fills in the default value of every option
+ : comparator(BytewiseComparator()),
+ create_if_missing(false),
+ error_if_exists(false),
+ paranoid_checks(false),
+ env(Env::Default()),  // the shared default environment
+ info_log(NULL),
+ write_buffer_size(1<<20),  // 1048576; presumably bytes -- confirm in include/options.h
+ max_open_files(1000),
+ large_value_threshold(65536),  // presumably bytes -- confirm in include/options.h
+ block_cache(NULL),
+ block_size(8192),  // presumably bytes -- confirm in include/options.h
+ block_restart_interval(16),
+ compression(kLightweightCompression) {
+}
+
+
+}
diff --git a/util/random.h b/util/random.h
new file mode 100644
index 0000000..2d458e8
--- /dev/null
+++ b/util/random.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_UTIL_RANDOM_H_
+#define STORAGE_LEVELDB_UTIL_RANDOM_H_
+
+#include <stdint.h>
+
+namespace leveldb {
+
+// A very simple random number generator. Not especially good at
+// generating truly random bits, but good enough for our needs in this
+// package.
+class Random {
+ private:
+  uint32_t seed_;
+ public:
+  // Seeds the generator with the low 31 bits of "s".
+  //
+  // 0 and 2^31-1 are fixed points of the recurrence used by Next(): once
+  // seed_ holds either value, every later call returns that same value.
+  // Such seeds are therefore replaced by 1.
+  explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) {
+    // Avoid bad seeds.
+    if (seed_ == 0 || seed_ == 2147483647L) {
+      seed_ = 1;
+    }
+  }
+
+  // Advances the generator and returns the new state, a value in [1, M-1].
+  uint32_t Next() {
+    static const uint32_t M = 2147483647L;   // 2^31-1
+    static const uint64_t A = 16807;  // bits 14, 8, 7, 5, 2, 1, 0
+    // We are computing
+    //       seed_ = (seed_ * A) % M,    where M = 2^31-1
+    //
+    // seed_ must not be zero or M, or else all subsequent computed values
+    // will be zero or M respectively.  For all other values, seed_ will end
+    // up cycling through every number in [1,M-1]
+    uint64_t product = seed_ * A;
+
+    // Compute (product % M) using the fact that ((x << 31) % M) == x.
+    seed_ = (product >> 31) + (product & M);
+    // The first reduction may overflow by 1 bit, so we may need to
+    // repeat.  mod == M is not possible; using > allows the faster
+    // sign-bit-based test.
+    if (seed_ > M) {
+      seed_ -= M;
+    }
+    return seed_;
+  }
+  // Returns a uniformly distributed value in the range [0..n-1]
+  // REQUIRES: n > 0
+  uint32_t Uniform(int n) { return Next() % n; }
+
+  // Randomly returns true ~"1/n" of the time, and false otherwise.
+  // REQUIRES: n > 0
+  bool OneIn(int n) { return (Next() % n) == 0; }
+
+  // Skewed: pick "base" uniformly from range [0,max_log] and then
+  // return "base" random bits.  The effect is to pick a number in the
+  // range [0,2^max_log-1] with exponential bias towards smaller numbers.
+  uint32_t Skewed(int max_log) {
+    return Uniform(1 << Uniform(max_log + 1));
+  }
+};
+
+}
+
+#endif // STORAGE_LEVELDB_UTIL_RANDOM_H_
diff --git a/util/status.cc b/util/status.cc
new file mode 100644
index 0000000..2ed799d
--- /dev/null
+++ b/util/status.cc
@@ -0,0 +1,59 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+#include "port/port.h"
+#include "include/status.h"
+
+namespace leveldb {
+
+// Builds a non-OK status.  The stored message is "msg", followed by ": " and
+// "msg2" when "msg2" is not empty.
+Status::Status(Code code, const Slice& msg, const Slice& msg2) {
+  assert(code != kOk);
+  std::string text(msg.data(), msg.size());
+  if (!msg2.empty()) {
+    text.append(": ");
+    text.append(msg2.data(), msg2.size());
+  }
+  state_ = new State(make_pair(code, text));
+}
+
+std::string Status::ToString() const {
+ if (state_ == NULL) {
+ return "OK";
+ } else {
+ char tmp[30];
+ const char* type;
+ switch (state_->first) {
+ case kOk:
+ type = "OK";
+ break;
+ case kNotFound:
+ type = "NotFound";
+ break;
+ case kCorruption:
+ type = "Corruption: ";
+ break;
+ case kNotSupported:
+ type = "Not implemented: ";
+ break;
+ case kInvalidArgument:
+ type = "Invalid argument: ";
+ break;
+ case kIOError:
+ type = "IO error: ";
+ break;
+ default:
+ snprintf(tmp, sizeof(tmp), "Unknown code(%d): ",
+ static_cast<int>(state_->first));
+ type = tmp;
+ break;
+ }
+ std::string result(type);
+ if (!state_->second.empty()) {
+ result.append(state_->second);
+ }
+ return result;
+ }
+}
+
+}
diff --git a/util/testharness.cc b/util/testharness.cc
new file mode 100644
index 0000000..b686ac3
--- /dev/null
+++ b/util/testharness.cc
@@ -0,0 +1,65 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/testharness.h"
+
+#include <sys/stat.h>
+#include <sys/types.h>
+
+namespace leveldb {
+namespace test {
+
+namespace {
+// One registered test case: the fixture ("base") name, the test name, and
+// the function that runs it.
+struct Test {
+  const char* base;
+  const char* name;
+  void (*func)();
+};
+// All registered tests.  Allocated by the first RegisterTest() call.
+std::vector<Test>* tests;
+}
+
+// Adds a test to the registry.  Always returns true; the TEST macro uses
+// the return value to initialize a namespace-scope bool, which is how the
+// call gets made during static initialization.
+bool RegisterTest(const char* base, const char* name, void (*func)()) {
+  if (tests == NULL) {
+    tests = new std::vector<Test>;
+  }
+  const Test entry = { base, name, func };
+  tests->push_back(entry);
+  return true;
+}
+
+// Runs every registered test in registration order, announcing each one on
+// stderr, then prints how many were run and returns 0.
+int RunAllTests() {
+  int num_run = 0;
+  if (tests != NULL) {
+    for (size_t i = 0; i < tests->size(); ++i) {
+      const Test current = (*tests)[i];
+      fprintf(stderr, "==== Test %s.%s\n", current.base, current.name);
+      (*current.func)();
+      ++num_run;
+    }
+  }
+  fprintf(stderr, "==== PASSED %d tests\n", num_run);
+  return 0;
+}
+
+// Returns the directory reported by Env::Default()->GetTestDirectory().
+// Fails the running test if that call does not succeed.
+std::string TmpDir() {
+  std::string test_dir;
+  Status status = Env::Default()->GetTestDirectory(&test_dir);
+  ASSERT_TRUE(status.ok()) << status.ToString();
+  return test_dir;
+}
+
+// Returns the seed given by the TEST_RANDOM_SEED environment variable when
+// it parses to a positive integer, and 301 otherwise.
+int RandomSeed() {
+  const int kDefaultSeed = 301;
+  const char* override_text = getenv("TEST_RANDOM_SEED");
+  if (override_text == NULL) {
+    return kDefaultSeed;
+  }
+  const int requested = atoi(override_text);
+  return (requested > 0) ? requested : kDefaultSeed;
+}
+
+}
+}
diff --git a/util/testharness.h b/util/testharness.h
new file mode 100644
index 0000000..93309dc
--- /dev/null
+++ b/util/testharness.h
@@ -0,0 +1,129 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_UTIL_TESTHARNESS_H_
+#define STORAGE_LEVELDB_UTIL_TESTHARNESS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sstream>
+#include "include/env.h"
+#include "include/slice.h"
+#include "util/random.h"
+
+namespace leveldb {
+namespace test {
+
+// Run all tests registered by the TEST() macro.
+// Returns 0 if all tests pass.
+// Dies or returns a non-zero value if some test fails.
+extern int RunAllTests();
+
+// Return the directory to use for temporary storage.
+extern std::string TmpDir();
+
+// Return a randomization seed for this run. Typically returns the
+// same number on repeated invocations of this binary, but automated
+// runs may be able to vary the seed.
+extern int RandomSeed();
+
+// An instance of Tester is allocated to hold temporary state during
+// the execution of an assertion.
+class Tester {
+ private:
+  bool ok_;                // False once any check on this object has failed
+  const char* fname_;      // Source file of the assertion
+  int line_;               // Source line of the assertion
+  std::stringstream ss_;   // Failure message accumulated so far
+
+ public:
+  Tester(const char* f, int l)
+      : ok_(true), fname_(f), line_(l) {
+  }
+
+  // If any check failed, prints "file:line:message" to stderr and exits the
+  // process with status 1.
+  ~Tester() {
+    if (ok_) {
+      return;
+    }
+    fprintf(stderr, "%s:%d:%s\n", fname_, line_, ss_.str().c_str());
+    exit(1);
+  }
+
+  // Records a failure described by "msg" unless "b" is true.
+  Tester& Is(bool b, const char* msg) {
+    if (!b) {
+      ok_ = false;
+      ss_ << " Assertion failure " << msg;
+    }
+    return *this;
+  }
+
+  // Records a failure carrying the status text unless "s" is ok.
+  Tester& IsOk(const Status& s) {
+    if (!s.ok()) {
+      ok_ = false;
+      ss_ << " " << s.ToString();
+    }
+    return *this;
+  }
+
+// Generates one comparison checker per operator; a failed comparison
+// records both operands and the operator text.
+#define BINARY_OP(name,op)                                  \
+  template <class Lhs, class Rhs>                           \
+  Tester& name(const Lhs& lhs, const Rhs& rhs) {            \
+    if (lhs op rhs) {                                       \
+      return *this;                                         \
+    }                                                       \
+    ss_ << " failed: " << lhs << (" " #op " ") << rhs;      \
+    ok_ = false;                                            \
+    return *this;                                           \
+  }
+
+  BINARY_OP(IsEq, ==)
+  BINARY_OP(IsNe, !=)
+  BINARY_OP(IsGe, >=)
+  BINARY_OP(IsGt, >)
+  BINARY_OP(IsLe, <=)
+  BINARY_OP(IsLt, <)
+#undef BINARY_OP
+
+  // Attach the specified value to the error message if an error has occurred
+  template <class V>
+  Tester& operator<<(const V& value) {
+    if (!ok_) {
+      ss_ << " " << value;
+    }
+    return *this;
+  }
+};
+
+#define ASSERT_TRUE(c) ::leveldb::test::Tester(__FILE__, __LINE__).Is((c), #c)  // Each ASSERT_* builds a temporary Tester; if the check fails, its destructor prints file:line and the message, then calls exit(1). More context can be appended with <<
+#define ASSERT_OK(s) ::leveldb::test::Tester(__FILE__, __LINE__).IsOk((s))
+#define ASSERT_EQ(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsEq((a),(b))
+#define ASSERT_NE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsNe((a),(b))
+#define ASSERT_GE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsGe((a),(b))
+#define ASSERT_GT(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsGt((a),(b))
+#define ASSERT_LE(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsLe((a),(b))
+#define ASSERT_LT(a,b) ::leveldb::test::Tester(__FILE__, __LINE__).IsLt((a),(b))
+
+#define TCONCAT(a,b) TCONCAT1(a,b)  // Two-level paste, so macro arguments are expanded before ## joins them
+#define TCONCAT1(a,b) a##b
+
+#define TEST(base,name) \
+class TCONCAT(_Test_,name) : public base { \
+ public: \
+ void _Run(); \
+ static void _RunIt() { \
+ TCONCAT(_Test_,name) t; \
+ t._Run(); \
+ } \
+}; \
+bool TCONCAT(_Test_ignored_,name) = \
+ ::leveldb::test::RegisterTest(#base, #name, &TCONCAT(_Test_,name)::_RunIt); \
+void TCONCAT(_Test_,name)::_Run()  // The braces written after TEST(base,name) become the body of _Run()
+
+// Register the specified test. Typically not used directly, but
+// invoked via the macro expansion of TEST.
+extern bool RegisterTest(const char* base, const char* name, void (*func)());
+
+
+}
+}
+
+#endif // STORAGE_LEVELDB_UTIL_TESTHARNESS_H_
diff --git a/util/testutil.cc b/util/testutil.cc
new file mode 100644
index 0000000..8d6cf3c
--- /dev/null
+++ b/util/testutil.cc
@@ -0,0 +1,51 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "util/testutil.h"
+
+#include "util/random.h"
+
+namespace leveldb {
+namespace test {
+
+// Resizes *dst to "len" bytes, fills it with random characters in the range
+// ' ' .. '~', and returns a Slice constructed from *dst.
+Slice RandomString(Random* rnd, int len, std::string* dst) {
+  dst->resize(len);
+  for (int pos = 0; pos < len; ++pos) {
+    const uint32_t offset = rnd->Uniform(95);   // ' ' .. '~'
+    (*dst)[pos] = static_cast<char>(' ' + offset);
+  }
+  return Slice(*dst);
+}
+
+// Returns a string of "len" characters, each picked at random from a small
+// alphabet that includes the lowest and highest byte values.
+std::string RandomKey(Random* rnd, int len) {
+  // Make sure to generate a wide variety of characters so we
+  // test the boundary conditions for short-key optimizations.
+  static const char kTestChars[] = {
+    '\0', '\1', 'a', 'b', 'c', 'd', 'e', '\xfd', '\xfe', '\xff'
+  };
+  std::string key;
+  for (int produced = 0; produced < len; ++produced) {
+    const uint32_t pick = rnd->Uniform(sizeof(kTestChars));
+    key += kTestChars[pick];
+  }
+  return key;
+}
+
+
+// Fills *dst with "len" bytes made by repeating a random chunk of
+// len*compressed_fraction bytes (at least one byte), and returns a Slice
+// constructed from *dst.
+extern Slice CompressibleString(Random* rnd, double compressed_fraction,
+                                int len, std::string* dst) {
+  // Size of the random chunk that gets repeated.
+  const int scaled = static_cast<int>(len * compressed_fraction);
+  const int raw = (scaled < 1) ? 1 : scaled;
+  std::string raw_data;
+  RandomString(rnd, raw, &raw_data);
+
+  // Duplicate the random data until we have filled "len" bytes, then trim
+  // whatever the last copy overshot.
+  dst->clear();
+  while (dst->size() < len) {
+    dst->append(raw_data);
+  }
+  dst->resize(len);
+  return Slice(*dst);
+}
+}
+}
diff --git a/util/testutil.h b/util/testutil.h
new file mode 100644
index 0000000..0e8a177
--- /dev/null
+++ b/util/testutil.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef STORAGE_LEVELDB_UTIL_TESTUTIL_H_
+#define STORAGE_LEVELDB_UTIL_TESTUTIL_H_
+
+#include "include/env.h"
+#include "include/slice.h"
+#include "util/random.h"
+
+namespace leveldb {
+namespace test {
+
+// Store in *dst a random string of length "len" and return a Slice that
+// references the generated data.
+extern Slice RandomString(Random* rnd, int len, std::string* dst);
+
+// Return a random key with the specified length that may contain interesting
+// characters (e.g. \x00, \xff, etc.).
+extern std::string RandomKey(Random* rnd, int len);
+
+// Store in *dst a string of length "len" that should compress to roughly
+// "len*compressed_fraction" bytes and return a Slice that references
+// the generated data.
+extern Slice CompressibleString(Random* rnd, double compressed_fraction,
+ int len, std::string* dst);
+
+// A wrapper that allows injection of errors.
+class ErrorEnv : public EnvWrapper {
+ public:
+  bool writable_file_error_;      // While true, NewWritableFile() fails
+  int num_writable_file_errors_;  // Number of failures injected so far
+
+  ErrorEnv()
+      : EnvWrapper(Env::Default()),
+        writable_file_error_(false),
+        num_writable_file_errors_(0) { }
+
+  // Forwards to the wrapped Env unless error injection is switched on, in
+  // which case it counts the failure, sets *result to NULL and returns an
+  // IOError.
+  virtual Status NewWritableFile(const std::string& fname,
+                                 WritableFile** result) {
+    if (!writable_file_error_) {
+      return target()->NewWritableFile(fname, result);
+    }
+    ++num_writable_file_errors_;
+    *result = NULL;
+    return Status::IOError(fname, "fake error");
+  }
+};
+
+}
+}
+
+#endif // STORAGE_LEVELDB_UTIL_TESTUTIL_H_