author     jorlow@chromium.org <jorlow@chromium.org@62dab493-f737-651d-591e-8d6aee1b9529>  2011-03-22 18:32:49 +0000
committer  jorlow@chromium.org <jorlow@chromium.org@62dab493-f737-651d-591e-8d6aee1b9529>  2011-03-22 18:32:49 +0000
commit     13b72af77b2b7fed156c26107476f84d35e4481d (patch)
tree       d21e4ec2ca349fab041bda922f58303f1a2933a7
parent     9df5aa892ecf4f418f6777bcc18651ecd5f0d679 (diff)
download   leveldb-13b72af77b2b7fed156c26107476f84d35e4481d.tar.gz
More changes from upstream.
git-svn-id: https://leveldb.googlecode.com/svn/trunk@12 62dab493-f737-651d-591e-8d6aee1b9529
-rw-r--r--  db/db_bench.cc     264
-rw-r--r--  db/db_impl.cc       20
-rw-r--r--  db/db_impl.h         4
-rw-r--r--  db/db_test.cc       74
-rw-r--r--  db/version_set.cc  122
-rw-r--r--  db/version_set.h    25
-rw-r--r--  doc/impl.html       10
-rw-r--r--  include/db.h         3
-rw-r--r--  leveldb.gyp          2
9 files changed, 405 insertions, 119 deletions
diff --git a/db/db_bench.cc b/db/db_bench.cc
index db8deea..72e0699 100644
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@@ -17,11 +17,14 @@
// Comma-separated list of operations to run in the specified order
// Actual benchmarks:
-// writeseq -- write N values in sequential key order
-// writerandom -- write N values in random key order
-// writebig -- write N/1000 100K valuesin random order
-// readseq -- read N values sequentially
-// readrandom -- read N values in random order
+// fillseq -- write N values in sequential key order in async mode
+// fillrandom -- write N values in random key order in async mode
+// overwrite -- overwrite N values in random key order in async mode
+// fillsync -- write N/100 values in random key order in sync mode
+// fill100K -- write N/1000 100K values in random order in async mode
+// readseq -- read N values sequentially
+// readreverse -- read N values in reverse order
+// readrandom -- read N values in random order
// Meta operations:
// compact -- Compact the entire DB
// heapprofile -- Dump a heap profile (if supported by this port)
@@ -30,10 +33,10 @@
// tenth -- divide N by 10 (i.e., following benchmarks are smaller)
// normal -- reset N back to its normal value (1000000)
static const char* FLAGS_benchmarks =
- "writeseq,"
- "writeseq,"
- "writerandom,"
- "sync,tenth,tenth,writerandom,nosync,normal,"
+ "fillseq,"
+ "fillrandom,"
+ "overwrite,"
+ "fillsync,"
"readseq,"
"readreverse,"
"readrandom,"
@@ -41,7 +44,7 @@ static const char* FLAGS_benchmarks =
"readseq,"
"readreverse,"
"readrandom,"
- "writebig";
+ "fill100K";
// Number of key/values to place in database
static int FLAGS_num = 1000000;
@@ -51,7 +54,7 @@ static int FLAGS_value_size = 100;
// Arrange to generate values that shrink to this fraction of
// their original size after compression
-static double FLAGS_compression_ratio = 0.25;
+static double FLAGS_compression_ratio = 0.5;
// Print histogram of operation timings
static bool FLAGS_histogram = false;
@@ -93,6 +96,19 @@ class RandomGenerator {
return Slice(data_.data() + pos_ - len, len);
}
};
+
+static Slice TrimSpace(Slice s) {
+ int start = 0;
+ while (start < s.size() && isspace(s[start])) {
+ start++;
+ }
+ int limit = s.size();
+ while (limit > start && isspace(s[limit-1])) {
+ limit--;
+ }
+ return Slice(s.data() + start, limit - start);
+}
+
}
class Benchmark {
@@ -100,7 +116,6 @@ class Benchmark {
Cache* cache_;
DB* db_;
int num_;
- bool sync_;
int heap_counter_;
double start_;
double last_op_finish_;
@@ -114,6 +129,70 @@ class Benchmark {
int done_;
int next_report_; // When to report next
+ void PrintHeader() {
+ const int kKeySize = 16;
+ PrintEnvironment();
+ fprintf(stdout, "Keys: %d bytes each\n", kKeySize);
+ fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n",
+ FLAGS_value_size,
+ static_cast<int>(FLAGS_value_size * FLAGS_compression_ratio + 0.5));
+ fprintf(stdout, "Entries: %d\n", num_);
+ fprintf(stdout, "RawSize: %.1f MB (estimated)\n",
+ (((kKeySize + FLAGS_value_size) * num_) / 1048576.0));
+ fprintf(stdout, "FileSize: %.1f MB (estimated)\n",
+ (((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_)
+ / 1048576.0));
+ PrintWarnings();
+ fprintf(stdout, "------------------------------------------------\n");
+ }
+
+ void PrintWarnings() {
+#if defined(__GNUC__) && !defined(__OPTIMIZE__)
+ fprintf(stdout,
+ "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"
+ );
+#endif
+#ifndef NDEBUG
+ fprintf(stdout,
+ "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
+#endif
+ }
+
+ void PrintEnvironment() {
+ fprintf(stderr, "LevelDB: version %d.%d\n",
+ kMajorVersion, kMinorVersion);
+
+#if defined(__linux)
+ time_t now = time(NULL);
+ fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline
+
+ FILE* cpuinfo = fopen("/proc/cpuinfo", "r");
+ if (cpuinfo != NULL) {
+ char line[1000];
+ int num_cpus = 0;
+ std::string cpu_type;
+ std::string cache_size;
+ while (fgets(line, sizeof(line), cpuinfo) != NULL) {
+ const char* sep = strchr(line, ':');
+ if (sep == NULL) {
+ continue;
+ }
+ Slice key = TrimSpace(Slice(line, sep - 1 - line));
+ Slice val = TrimSpace(Slice(sep + 1));
+ if (key == "model name") {
+ ++num_cpus;
+ cpu_type = val.ToString();
+ } else if (key == "cache size") {
+ cache_size = val.ToString();
+ }
+ }
+ fclose(cpuinfo);
+ fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str());
+ fprintf(stderr, "CPUCache: %s\n", cache_size.c_str());
+ }
+#endif
+ }
+
void Start() {
start_ = Env::Default()->NowMicros() * 1e-6;
bytes_ = 0;
@@ -164,9 +243,10 @@ class Benchmark {
snprintf(rate, sizeof(rate), "%5.1f MB/s",
(bytes_ / 1048576.0) / (finish - start_));
if (!message_.empty()) {
- message_.push_back(' ');
+ message_ = std::string(rate) + " " + message_;
+ } else {
+ message_ = rate;
}
- message_.append(rate);
}
fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n",
@@ -183,14 +263,16 @@ class Benchmark {
public:
enum Order {
SEQUENTIAL,
- REVERSE, // Currently only supported for reads
RANDOM
};
+ enum DBState {
+ FRESH,
+ EXISTING
+ };
Benchmark() : cache_(NewLRUCache(200<<20)),
db_(NULL),
num_(FLAGS_num),
- sync_(false),
heap_counter_(0),
bytes_(0),
rand_(301) {
@@ -210,19 +292,8 @@ class Benchmark {
}
void Run() {
- Options options;
- options.create_if_missing = true;
- options.max_open_files = 10000;
- options.block_cache = cache_;
- options.write_buffer_size = FLAGS_write_buffer_size;
-
- Start();
- Status s = DB::Open(options, "/tmp/dbbench", &db_);
- Stop("open");
- if (!s.ok()) {
- fprintf(stderr, "open error: %s\n", s.ToString().c_str());
- exit(1);
- }
+ PrintHeader();
+ Open();
const char* benchmarks = FLAGS_benchmarks;
while (benchmarks != NULL) {
@@ -237,30 +308,30 @@ class Benchmark {
}
Start();
- if (name == Slice("writeseq")) {
- Write(SEQUENTIAL, num_, FLAGS_value_size);
- } else if (name == Slice("writerandom")) {
- Write(RANDOM, num_, FLAGS_value_size);
- } else if (name == Slice("writebig")) {
- Write(RANDOM, num_ / 1000, 100 * 1000);
+
+ WriteOptions write_options;
+ write_options.sync = false;
+ if (name == Slice("fillseq")) {
+ Write(write_options, SEQUENTIAL, FRESH, num_, FLAGS_value_size);
+ } else if (name == Slice("fillrandom")) {
+ Write(write_options, RANDOM, FRESH, num_, FLAGS_value_size);
+ } else if (name == Slice("overwrite")) {
+ Write(write_options, RANDOM, EXISTING, num_, FLAGS_value_size);
+ } else if (name == Slice("fillsync")) {
+ write_options.sync = true;
+ Write(write_options, RANDOM, FRESH, num_ / 100, FLAGS_value_size);
+ } else if (name == Slice("fill100K")) {
+ Write(write_options, RANDOM, FRESH, num_ / 1000, 100 * 1000);
} else if (name == Slice("readseq")) {
- Read(SEQUENTIAL);
+ ReadSequential();
} else if (name == Slice("readreverse")) {
- Read(REVERSE);
+ ReadReverse();
} else if (name == Slice("readrandom")) {
- Read(RANDOM);
+ ReadRandom();
} else if (name == Slice("compact")) {
Compact();
} else if (name == Slice("heapprofile")) {
HeapProfile();
- } else if (name == Slice("sync")) {
- sync_ = true;
- } else if (name == Slice("nosync")) {
- sync_ = false;
- } else if (name == Slice("tenth")) {
- num_ = num_ / 10;
- } else if (name == Slice("normal")) {
- num_ = FLAGS_num;
} else {
fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str());
}
@@ -268,16 +339,44 @@ class Benchmark {
}
}
- void Write(Order order, int num_entries, int value_size) {
+ private:
+ void Open() {
+ assert(db_ == NULL);
+ Options options;
+ options.create_if_missing = true;
+ options.max_open_files = 10000;
+ options.block_cache = cache_;
+ options.write_buffer_size = FLAGS_write_buffer_size;
+ Status s = DB::Open(options, "/tmp/dbbench", &db_);
+ if (!s.ok()) {
+ fprintf(stderr, "open error: %s\n", s.ToString().c_str());
+ exit(1);
+ }
+ }
+
+ void Write(const WriteOptions& options, Order order, DBState state,
+ int num_entries, int value_size) {
+ if (state == FRESH) {
+ delete db_;
+ db_ = NULL;
+ DestroyDB("/tmp/dbbench", Options());
+ Open();
+ Start(); // Do not count time taken to destroy/open
+ }
+
+ if (num_entries != num_) {
+ char msg[100];
+ snprintf(msg, sizeof(msg), "(%d ops)", num_entries);
+ message_ = msg;
+ }
+
WriteBatch batch;
Status s;
std::string val;
- WriteOptions options;
- options.sync = sync_;
for (int i = 0; i < num_entries; i++) {
const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num);
char key[100];
- snprintf(key, sizeof(key), "%012d", k);
+ snprintf(key, sizeof(key), "%016d", k);
batch.Clear();
batch.Put(key, gen_.Generate(value_size));
s = db_->Write(options, &batch);
@@ -290,42 +389,37 @@ class Benchmark {
}
}
- void Read(Order order) {
+ void ReadSequential() {
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ int i = 0;
+ for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) {
+ bytes_ += iter->key().size() + iter->value().size();
+ FinishedSingleOp();
+ ++i;
+ }
+ delete iter;
+ }
+
+ void ReadReverse() {
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ int i = 0;
+ for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) {
+ bytes_ += iter->key().size() + iter->value().size();
+ FinishedSingleOp();
+ ++i;
+ }
+ delete iter;
+ }
+
+ void ReadRandom() {
ReadOptions options;
- switch (order) {
- case SEQUENTIAL: {
- Iterator* iter = db_->NewIterator(options);
- int i = 0;
- for (iter->SeekToFirst(); i < num_ && iter->Valid(); iter->Next()) {
- bytes_ += iter->key().size() + iter->value().size();
- FinishedSingleOp();
- ++i;
- }
- delete iter;
- break;
- }
- case REVERSE: {
- Iterator* iter = db_->NewIterator(options);
- int i = 0;
- for (iter->SeekToLast(); i < num_ && iter->Valid(); iter->Prev()) {
- bytes_ += iter->key().size() + iter->value().size();
- FinishedSingleOp();
- ++i;
- }
- delete iter;
- break;
- }
- case RANDOM: {
- std::string value;
- for (int i = 0; i < num_; i++) {
- char key[100];
- const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % FLAGS_num);
- snprintf(key, sizeof(key), "%012d", k);
- db_->Get(options, key, &value);
- FinishedSingleOp();
- }
- break;
- }
+ std::string value;
+ for (int i = 0; i < num_; i++) {
+ char key[100];
+ const int k = rand_.Next() % FLAGS_num;
+ snprintf(key, sizeof(key), "%016d", k);
+ db_->Get(options, key, &value);
+ FinishedSingleOp();
}
}
diff --git a/db/db_impl.cc b/db/db_impl.cc
index 5008af0..4d66044 100644
--- a/db/db_impl.cc
+++ b/db/db_impl.cc
@@ -532,8 +532,9 @@ void DBImpl::BackgroundCompaction() {
}
Status status;
- if (c->num_input_files(0) == 1 && c->num_input_files(1) == 0) {
+ if (c->IsTrivialMove()) {
// Move file to next level
+ assert(c->num_input_files(0) == 1);
FileMetaData* f = c->input(0, 0);
c->edit()->DeleteFile(c->level(), f->number);
c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
@@ -718,8 +719,18 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
bool has_current_user_key = false;
SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
for (; input->Valid() && !shutting_down_.Acquire_Load(); ) {
- // Handle key/value, add to state, etc.
Slice key = input->key();
+ InternalKey tmp_internal_key;
+ tmp_internal_key.DecodeFrom(key);
+ if (compact->compaction->ShouldStopBefore(tmp_internal_key) &&
+ compact->builder != NULL) {
+ status = FinishCompactionOutputFile(compact, input);
+ if (!status.ok()) {
+ break;
+ }
+ }
+
+ // Handle key/value, add to state, etc.
bool drop = false;
if (!ParseInternalKey(key, &ikey)) {
// Do not hide error keys
@@ -855,6 +866,11 @@ Iterator* DBImpl::TEST_NewInternalIterator() {
return NewInternalIterator(ReadOptions(), &ignored);
}
+int64 DBImpl::TEST_MaxNextLevelOverlappingBytes() {
+ MutexLock l(&mutex_);
+ return versions_->MaxNextLevelOverlappingBytes();
+}
+
Status DBImpl::Get(const ReadOptions& options,
const Slice& key,
std::string* value) {
diff --git a/db/db_impl.h b/db/db_impl.h
index fc3d3f2..980d512 100644
--- a/db/db_impl.h
+++ b/db/db_impl.h
@@ -55,6 +55,10 @@ class DBImpl : public DB {
// The returned iterator should be deleted when no longer needed.
Iterator* TEST_NewInternalIterator();
+ // Return the maximum overlapping data (in bytes) at next level for any
+ // file at a level >= 1.
+ int64 TEST_MaxNextLevelOverlappingBytes();
+
private:
friend class DB;
diff --git a/db/db_test.cc b/db/db_test.cc
index 895a5e1..f8accf6 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -72,15 +72,19 @@ class DBTest {
}
Status Put(const std::string& k, const std::string& v) {
+ WriteOptions options;
+ options.sync = false;
WriteBatch batch;
batch.Put(k, v);
- return db_->Write(WriteOptions(), &batch);
+ return db_->Write(options, &batch);
}
Status Delete(const std::string& k) {
+ WriteOptions options;
+ options.sync = false;
WriteBatch batch;
batch.Delete(k);
- return db_->Write(WriteOptions(), &batch);
+ return db_->Write(options, &batch);
}
std::string Get(const std::string& k, const Snapshot* snapshot = NULL) {
@@ -176,6 +180,35 @@ class DBTest {
fprintf(stderr, "Found %d live large value files\n", (int)live.size());
return live;
}
+
+ void Compact(const Slice& start, const Slice& limit) {
+ dbfull()->TEST_CompactMemTable();
+ int max_level_with_files = 1;
+ for (int level = 1; level < config::kNumLevels; level++) {
+ uint64_t v;
+ char name[100];
+ snprintf(name, sizeof(name), "leveldb.num-files-at-level%d", level);
+ if (dbfull()->GetProperty(name, &v) && v > 0) {
+ max_level_with_files = level;
+ }
+ }
+ for (int level = 0; level < max_level_with_files; level++) {
+ dbfull()->TEST_CompactRange(level, "", "~");
+ }
+ }
+
+ void DumpFileCounts(const char* label) {
+ fprintf(stderr, "---\n%s:\n", label);
+ fprintf(stderr, "maxoverlap: %lld\n",
+ static_cast<long long>(
+ dbfull()->TEST_MaxNextLevelOverlappingBytes()));
+ for (int level = 0; level < config::kNumLevels; level++) {
+ int num = NumTableFilesAtLevel(level);
+ if (num > 0) {
+ fprintf(stderr, " level %3d : %d files\n", level, num);
+ }
+ }
+ }
};
TEST(DBTest, Empty) {
@@ -315,6 +348,43 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) {
}
}
+TEST(DBTest, SparseMerge) {
+ Options options;
+ options.compression = kNoCompression;
+ Reopen(&options);
+
+ // Suppose there is:
+ // small amount of data with prefix A
+ // large amount of data with prefix B
+ // small amount of data with prefix C
+ // and that recent updates have made small changes to all three prefixes.
+ // Check that we do not do a compaction that merges all of B in one shot.
+ const std::string value(1000, 'x');
+ Put("A", "va");
+ // Write approximately 100MB of "B" values
+ for (int i = 0; i < 100000; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ Put(key, value);
+ }
+ Put("C", "vc");
+ Compact("", "z");
+
+ // Make sparse update
+ Put("A", "va2");
+ Put("B100", "bvalue2");
+ Put("C", "vc2");
+ dbfull()->TEST_CompactMemTable();
+
+ // Compactions should not cause us to create a situation where
+ // a file overlaps too much data at the next level.
+ ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
+ dbfull()->TEST_CompactRange(0, "", "z");
+ ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
+ dbfull()->TEST_CompactRange(1, "", "z");
+ ASSERT_LE(dbfull()->TEST_MaxNextLevelOverlappingBytes(), 20*1048576);
+}
+
static bool Between(uint64_t val, uint64_t low, uint64_t high) {
bool result = (val >= low) && (val <= high);
if (!result) {
diff --git a/db/version_set.cc b/db/version_set.cc
index 2435fa2..46333f4 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -20,6 +20,10 @@
namespace leveldb {
+// Maximum number of overlaps in grandparent (i.e., level+2) before we
+// stop building a single file in a level->level+1 compaction.
+static const int kMaxGrandParentFiles = 10;
+
static double MaxBytesForLevel(int level) {
if (level == 0) {
return 4 * 1048576.0;
@@ -509,7 +513,7 @@ Status VersionSet::Finalize(Version* v) {
double best_score = -1;
Status s;
- for (int level = 0; s.ok() && level < config::kNumLevels; level++) {
+ for (int level = 0; s.ok() && level < config::kNumLevels-1; level++) {
s = SortLevel(v, level);
// Compute the ratio of current size to size limit.
@@ -751,6 +755,25 @@ void VersionSet::AddLiveFiles(std::set<uint64_t>* live) {
}
}
+int64 VersionSet::MaxNextLevelOverlappingBytes() {
+ int64 result = 0;
+ std::vector<FileMetaData*> overlaps;
+ for (int level = 0; level < config::kNumLevels - 1; level++) {
+ for (int i = 0; i < current_->files_[level].size(); i++) {
+ const FileMetaData* f = current_->files_[level][i];
+ GetOverlappingInputs(level+1, f->smallest, f->largest, &overlaps);
+ int64 sum = 0;
+ for (int j = 0; j < overlaps.size(); j++) {
+ sum += overlaps[j]->file_size;
+ }
+ if (sum > result) {
+ result = sum;
+ }
+ }
+ }
+ return result;
+}
+
// Store in "*inputs" all files in "level" that overlap [begin,end]
void VersionSet::GetOverlappingInputs(
int level,
@@ -797,6 +820,18 @@ void VersionSet::GetRange(const std::vector<FileMetaData*>& inputs,
}
}
+// Stores the minimal range that covers all entries in inputs1 and inputs2
+// in *smallest, *largest.
+// REQUIRES: inputs is not empty
+void VersionSet::GetRange2(const std::vector<FileMetaData*>& inputs1,
+ const std::vector<FileMetaData*>& inputs2,
+ InternalKey* smallest,
+ InternalKey* largest) {
+ std::vector<FileMetaData*> all = inputs1;
+ all.insert(all.end(), inputs2.begin(), inputs2.end());
+ GetRange(all, smallest, largest);
+}
+
Iterator* VersionSet::MakeInputIterator(Compaction* c) {
ReadOptions options;
options.verify_checksums = options_->paranoid_checks;
@@ -836,6 +871,7 @@ Compaction* VersionSet::PickCompaction() {
}
const int level = current_->compaction_level_;
assert(level >= 0);
+ assert(level+1 < config::kNumLevels);
Compaction* c = new Compaction(level);
c->input_version_ = current_;
@@ -855,31 +891,36 @@ Compaction* VersionSet::PickCompaction() {
c->inputs_[0].push_back(current_->files_[level][0]);
}
- // Find the range we are compacting
- InternalKey smallest, largest;
- GetRange(c->inputs_[0], &smallest, &largest);
-
// Files in level 0 may overlap each other, so pick up all overlapping ones
if (level == 0) {
+ InternalKey smallest, largest;
+ GetRange(c->inputs_[0], &smallest, &largest);
// Note that the next call will discard the file we placed in
// c->inputs_[0] earlier and replace it with an overlapping set
// which will include the picked file.
GetOverlappingInputs(0, smallest, largest, &c->inputs_[0]);
assert(!c->inputs_[0].empty());
- GetRange(c->inputs_[0], &smallest, &largest);
}
+ SetupOtherInputs(c);
+
+ return c;
+}
+
+void VersionSet::SetupOtherInputs(Compaction* c) {
+ const int level = c->level();
+ InternalKey smallest, largest;
+ GetRange(c->inputs_[0], &smallest, &largest);
+
GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]);
+ // Get entire range covered by compaction
+ InternalKey all_start, all_limit;
+ GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
+
// See if we can grow the number of inputs in "level" without
// changing the number of "level+1" files we pick up.
if (!c->inputs_[1].empty()) {
- // Get entire range covered by compaction
- std::vector<FileMetaData*> all = c->inputs_[0];
- all.insert(all.end(), c->inputs_[1].begin(), c->inputs_[1].end());
- InternalKey all_start, all_limit;
- GetRange(all, &all_start, &all_limit);
-
std::vector<FileMetaData*> expanded0;
GetOverlappingInputs(level, all_start, all_limit, &expanded0);
if (expanded0.size() > c->inputs_[0].size()) {
@@ -899,10 +940,17 @@ Compaction* VersionSet::PickCompaction() {
largest = new_limit;
c->inputs_[0] = expanded0;
c->inputs_[1] = expanded1;
+ GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
}
}
}
+ // Compute the set of grandparent files that overlap this compaction
+ // (parent == level+1; grandparent == level+2)
+ if (level + 2 < config::kNumLevels) {
+ GetOverlappingInputs(level + 2, all_start, all_limit, &c->grandparents_);
+ }
+
if (false) {
Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'",
level,
@@ -916,8 +964,6 @@ Compaction* VersionSet::PickCompaction() {
// key range next time.
compact_pointer_[level] = largest.Encode().ToString();
c->edit_.SetCompactPointer(level, largest);
-
- return c;
}
Compaction* VersionSet::CompactRange(
@@ -934,25 +980,16 @@ Compaction* VersionSet::CompactRange(
c->input_version_ = current_;
c->input_version_->Ref();
c->inputs_[0] = inputs;
-
- // Find the range we are compacting
- InternalKey smallest, largest;
- GetRange(c->inputs_[0], &smallest, &largest);
-
- GetOverlappingInputs(level+1, smallest, largest, &c->inputs_[1]);
- if (false) {
- Log(env_, options_->info_log, "Compacting %d '%s' .. '%s'",
- level,
- EscapeString(smallest.Encode()).c_str(),
- EscapeString(largest.Encode()).c_str());
- }
+ SetupOtherInputs(c);
return c;
}
Compaction::Compaction(int level)
: level_(level),
max_output_file_size_(MaxFileSizeForLevel(level)),
- input_version_(NULL) {
+ input_version_(NULL),
+ grandparent_index_(0),
+ output_start_(-1) {
for (int i = 0; i < config::kNumLevels; i++) {
level_ptrs_[i] = 0;
}
@@ -964,6 +1001,15 @@ Compaction::~Compaction() {
}
}
+bool Compaction::IsTrivialMove() const {
+ // Avoid a move if there are lots of overlapping grandparent files.
+ // Otherwise, the move could create a parent file that will require
+ // a very expensive merge later on.
+ return (num_input_files(0) == 1
+ && num_input_files(1) == 0
+ && grandparents_.size() <= kMaxGrandParentFiles);
+}
+
void Compaction::AddInputDeletions(VersionEdit* edit) {
for (int which = 0; which < 2; which++) {
for (int i = 0; i < inputs_[which].size(); i++) {
@@ -993,6 +1039,28 @@ bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
return true;
}
+bool Compaction::ShouldStopBefore(const InternalKey& key) {
+ // Scan to find earliest grandparent file that contains key.
+ const InternalKeyComparator* icmp = &input_version_->vset_->icmp_;
+ while (grandparent_index_ < grandparents_.size() &&
+ icmp->Compare(key, grandparents_[grandparent_index_]->largest) > 0) {
+ grandparent_index_++;
+ }
+
+ // First call?
+ if (output_start_ < 0) {
+ output_start_ = grandparent_index_;
+ }
+
+ if (grandparent_index_ - output_start_ + 1 > kMaxGrandParentFiles) {
+ // Too many overlaps for current output; start new output
+ output_start_ = grandparent_index_;
+ return true;
+ } else {
+ return false;
+ }
+}
+
void Compaction::ReleaseInputs() {
if (input_version_ != NULL) {
input_version_->Unref();
diff --git a/db/version_set.h b/db/version_set.h
index b8eee3d..0ea763d 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -139,6 +139,10 @@ class VersionSet {
const InternalKey& begin,
const InternalKey& end);
+ // Return the maximum overlapping data (in bytes) at next level for any
+ // file at a level >= 1.
+ int64 MaxNextLevelOverlappingBytes();
+
// Create an iterator that reads over the compaction inputs for "*c".
// The caller should delete the iterator when no longer needed.
Iterator* MakeInputIterator(Compaction* c);
@@ -195,6 +199,13 @@ class VersionSet {
InternalKey* smallest,
InternalKey* largest);
+ void GetRange2(const std::vector<FileMetaData*>& inputs1,
+ const std::vector<FileMetaData*>& inputs2,
+ InternalKey* smallest,
+ InternalKey* largest);
+
+ void SetupOtherInputs(Compaction* c);
+
Env* const env_;
const std::string dbname_;
const Options* const options_;
@@ -250,6 +261,10 @@ class Compaction {
// Maximum size of files to build during this compaction.
uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
+ // Is this a trivial compaction that can be implemented by just
+ // moving a single input file to the next level (no merging or splitting)
+ bool IsTrivialMove() const;
+
// Add all inputs to this compaction as delete operations to *edit.
void AddInputDeletions(VersionEdit* edit);
@@ -258,6 +273,10 @@ class Compaction {
// in levels greater than "level+1".
bool IsBaseLevelForKey(const Slice& user_key);
+ // Returns true iff we should stop building the current output
+ // before processing "key".
+ bool ShouldStopBefore(const InternalKey& key);
+
// Release the input version for the compaction, once the compaction
// is successful.
void ReleaseInputs();
@@ -276,6 +295,12 @@ class Compaction {
// Each compaction reads inputs from "level_" and "level_+1"
std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs
+ // State used to check for the number of overlapping grandparent files
+ // (parent == level_ + 1, grandparent == level_ + 2)
+ std::vector<FileMetaData*> grandparents_;
+ int grandparent_index_; // Index in grandparent_starts_
+ int output_start_; // Index in grandparent_starts_ where output started
+
// State for implementing IsBaseLevelForKey
// level_ptrs_ holds indices into input_version_->levels_: our state
diff --git a/doc/impl.html b/doc/impl.html
index 2f2c809..b190d2c 100644
--- a/doc/impl.html
+++ b/doc/impl.html
@@ -123,8 +123,14 @@ one level-0 file in case some of these files overlap each other.
A compaction merges the contents of the picked files to produce a
sequence of level-(L+1) files. We switch to producing a new
level-(L+1) file after the current output file has reached the target
-file size (2MB). The old files are discarded and the new files are
-added to the serving state.
+file size (2MB). We also switch to a new output file when the key
+range of the current output file has grown enough to overlap more than
+ten level-(L+2) files. This last rule ensures that a later compaction
+of a level-(L+1) file will not pick up too much data from level-(L+2).
+
+<p>
+The old files are discarded and the new files are added to the serving
+state.
<p>
Compactions for a particular level rotate through the key space. In
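The output-switching rule described in the doc change above is what Compaction::ShouldStopBefore implements in the version_set.cc hunk earlier in this change. A minimal standalone sketch of the same idea follows (illustrative only: FileRange, OutputSplitter, and the integer keys are hypothetical stand-ins, not LevelDB's actual types):

#include <cstdint>
#include <utility>
#include <vector>

// Hypothetical stand-in for a level-(L+2) ("grandparent") file's key range.
struct FileRange {
  int64_t smallest;
  int64_t largest;
};

class OutputSplitter {
 public:
  // "grandparents" must be sorted by key range and non-overlapping,
  // as files within a level >= 1 are.
  explicit OutputSplitter(std::vector<FileRange> grandparents)
      : grandparents_(std::move(grandparents)) {}

  // Returns true if the current output file should be finished before
  // emitting "key", i.e. the output's key range has grown to overlap
  // more than kMaxGrandParentFiles grandparent files.
  bool ShouldStopBefore(int64_t key) {
    static const int kMaxGrandParentFiles = 10;
    // Advance past grandparent files that end before "key".
    while (index_ < grandparents_.size() &&
           key > grandparents_[index_].largest) {
      index_++;
    }
    if (output_start_ < 0) {
      output_start_ = static_cast<int>(index_);  // first key of this output
    }
    if (static_cast<int>(index_) - output_start_ + 1 > kMaxGrandParentFiles) {
      output_start_ = static_cast<int>(index_);  // begin a new output file
      return true;
    }
    return false;
  }

 private:
  std::vector<FileRange> grandparents_;
  size_t index_ = 0;       // earliest grandparent that may contain the key
  int output_start_ = -1;  // grandparent index where the current output began
};

With the 2MB target file size, capping the overlap at ten grandparent files bounds the level-(L+2) data that a later compaction of any single level-(L+1) file must merge to roughly 20MB, which is the bound the new SparseMerge test asserts.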
diff --git a/include/db.h b/include/db.h
index c4d152d..75be1ca 100644
--- a/include/db.h
+++ b/include/db.h
@@ -12,6 +12,9 @@
namespace leveldb {
+static const int kMajorVersion = 1;
+static const int kMinorVersion = 0;
+
struct Options;
struct ReadOptions;
struct WriteOptions;
diff --git a/leveldb.gyp b/leveldb.gyp
index 934f2d0..eb809f3 100644
--- a/leveldb.gyp
+++ b/leveldb.gyp
@@ -4,7 +4,7 @@
{
'variables': {
- 'use_snappy%': 1,
+ 'use_snappy%': 0,
},
'target_defaults': {
'defines': [