summaryrefslogtreecommitdiff
path: root/src/os/LevelDBStore.h
blob: bc5b612a97a048159509e32e02007868888a3d75 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
#ifndef LEVEL_DB_STORE_H
#define LEVEL_DB_STORE_H

#include "include/types.h"
#include "include/buffer.h"
#include "KeyValueDB.h"
#include <set>
#include <map>
#include <string>
#include <tr1/memory>
#include <boost/scoped_ptr.hpp>
#include "leveldb/db.h"
#include "leveldb/env.h"
#include "leveldb/write_batch.h"
#include "leveldb/slice.h"
#include "leveldb/cache.h"
#ifdef HAVE_LEVELDB_FILTER_POLICY
#include "leveldb/filter_policy.h"
#endif

#include <errno.h>
#include "common/errno.h"
#include "common/dout.h"
#include "include/assert.h"
#include "common/Formatter.h"

#include "common/ceph_context.h"

class PerfCounters;

enum {
  l_leveldb_first = 34300,
  l_leveldb_gets,
  l_leveldb_txns,
  l_leveldb_compact,
  l_leveldb_compact_range,
  l_leveldb_compact_queue_merge,
  l_leveldb_compact_queue_len,
  l_leveldb_last,
};

/**
 * Uses LevelDB to implement the KeyValueDB interface
 */
class LevelDBStore : public KeyValueDB {
  CephContext *cct;
  PerfCounters *logger;
  string path;
  boost::scoped_ptr<leveldb::DB> db;
  boost::scoped_ptr<leveldb::Cache> db_cache;
#ifdef HAVE_LEVELDB_FILTER_POLICY
  boost::scoped_ptr<const leveldb::FilterPolicy> filterpolicy;
#endif

  int init(ostream &out, bool create_if_missing);

  // manage async compactions
  Mutex compact_queue_lock;
  Cond compact_queue_cond;
  list< pair<string,string> > compact_queue;
  bool compact_queue_stop;
  class CompactThread : public Thread {
    LevelDBStore *db;
  public:
    CompactThread(LevelDBStore *d) : db(d) {}
    void *entry() {
      db->compact_thread_entry();
      return NULL;
    }
    friend class LevelDBStore;
  } compact_thread;

  void compact_thread_entry();

  void compact_range(const string& start, const string& end) {
    leveldb::Slice cstart(start);
    leveldb::Slice cend(end);
    db->CompactRange(&cstart, &cend);
  }
  void compact_range_async(const string& start, const string& end);

public:
  /// compact the underlying leveldb store
  void compact();

  /// compact leveldb for all keys with a given prefix
  void compact_prefix(const string& prefix) {
    compact_range(prefix, past_prefix(prefix));
  }
  void compact_prefix_async(const string& prefix) {
    compact_range_async(prefix, past_prefix(prefix));
  }

  void compact_range(const string& prefix, const string& start, const string& end) {
    compact_range(combine_strings(prefix, start), combine_strings(prefix, end));
  }
  void compact_range_async(const string& prefix, const string& start, const string& end) {
    compact_range_async(combine_strings(prefix, start), combine_strings(prefix, end));
  }

  /**
   * options_t: Holds options which are minimally interpreted
   * on initialization and then passed through to LevelDB.
   * We transform a couple of these into actual LevelDB
   * structures, but the rest are simply passed through unchanged. See
   * leveldb/options.h for more precise details on each.
   *
   * Set them after constructing the LevelDBStore, but before calling
   * open() or create_and_open().
   */
  struct options_t {
    uint64_t write_buffer_size; /// in-memory write buffer size
    int max_open_files; /// maximum number of files LevelDB can open at once
    uint64_t cache_size; /// size of extra decompressed cache to use
    uint64_t block_size; /// user data per block
    int bloom_size; /// number of bits per entry to put in a bloom filter
    bool compression_enabled; /// whether to use libsnappy compression or not

    // don't change these ones. No, seriously
    int block_restart_interval;
    bool error_if_exists;
    bool paranoid_checks;

    string log_file;

    options_t() :
      write_buffer_size(0), //< 0 means default
      max_open_files(0), //< 0 means default
      cache_size(0), //< 0 means no cache (default)
      block_size(0), //< 0 means default
      bloom_size(0), //< 0 means no bloom filter (default)
      compression_enabled(true), //< set to false for no compression
      block_restart_interval(0), //< 0 means default
      error_if_exists(false), //< set to true if you want to check nonexistence
      paranoid_checks(false) //< set to true if you want paranoid checks
    {}
  } options;

  LevelDBStore(CephContext *c, const string &path) :
    cct(c),
    logger(NULL),
    path(path),
    db_cache(NULL),
#ifdef HAVE_LEVELDB_FILTER_POLICY
    filterpolicy(NULL),
#endif
    compact_queue_lock("LevelDBStore::compact_thread_lock"),
    compact_queue_stop(false),
    compact_thread(this),
    options()
  {}

  ~LevelDBStore();

  /// Opens underlying db
  int open(ostream &out) {
    return init(out, false);
  }
  /// Creates underlying db if missing and opens it
  int create_and_open(ostream &out) {
    return init(out, true);
  }

  void close();

  class LevelDBTransactionImpl : public KeyValueDB::TransactionImpl {
  public:
    leveldb::WriteBatch bat;
    list<bufferlist> buffers;
    list<string> keys;
    LevelDBStore *db;

    LevelDBTransactionImpl(LevelDBStore *db) : db(db) {}
    void set(
      const string &prefix,
      const string &k,
      const bufferlist &bl);
    void rmkey(
      const string &prefix,
      const string &k);
    void rmkeys_by_prefix(
      const string &prefix
      );
  };

  KeyValueDB::Transaction get_transaction() {
    return std::tr1::shared_ptr< LevelDBTransactionImpl >(
      new LevelDBTransactionImpl(this));
  }

  int submit_transaction(KeyValueDB::Transaction t);
  int submit_transaction_sync(KeyValueDB::Transaction t);
  int get(
    const string &prefix,
    const std::set<string> &key,
    std::map<string, bufferlist> *out
    );

  class LevelDBWholeSpaceIteratorImpl :
    public KeyValueDB::WholeSpaceIteratorImpl {
  protected:
    boost::scoped_ptr<leveldb::Iterator> dbiter;
  public:
    LevelDBWholeSpaceIteratorImpl(leveldb::Iterator *iter) :
      dbiter(iter) { }
    virtual ~LevelDBWholeSpaceIteratorImpl() { }

    int seek_to_first() {
      dbiter->SeekToFirst();
      return dbiter->status().ok() ? 0 : -1;
    }
    int seek_to_first(const string &prefix) {
      leveldb::Slice slice_prefix(prefix);
      dbiter->Seek(slice_prefix);
      return dbiter->status().ok() ? 0 : -1;
    }
    int seek_to_last() {
      dbiter->SeekToLast();
      return dbiter->status().ok() ? 0 : -1;
    }
    int seek_to_last(const string &prefix) {
      string limit = past_prefix(prefix);
      leveldb::Slice slice_limit(limit);
      dbiter->Seek(slice_limit);

      if (!dbiter->Valid()) {
        dbiter->SeekToLast();
      } else {
        dbiter->Prev();
      }
      return dbiter->status().ok() ? 0 : -1;
    }
    int upper_bound(const string &prefix, const string &after) {
      lower_bound(prefix, after);
      if (valid()) {
	pair<string,string> key = raw_key();
	if (key.first == prefix && key.second == after)
	  next();
      }
      return dbiter->status().ok() ? 0 : -1;
    }
    int lower_bound(const string &prefix, const string &to) {
      string bound = combine_strings(prefix, to);
      leveldb::Slice slice_bound(bound);
      dbiter->Seek(slice_bound);
      return dbiter->status().ok() ? 0 : -1;
    }
    bool valid() {
      return dbiter->Valid();
    }
    int next() {
      if (valid())
	dbiter->Next();
      return dbiter->status().ok() ? 0 : -1;
    }
    int prev() {
      if (valid())
	dbiter->Prev();
      return dbiter->status().ok() ? 0 : -1;
    }
    string key() {
      string out_key;
      split_key(dbiter->key(), 0, &out_key);
      return out_key;
    }
    pair<string,string> raw_key() {
      string prefix, key;
      split_key(dbiter->key(), &prefix, &key);
      return make_pair(prefix, key);
    }
    bufferlist value() {
      return to_bufferlist(dbiter->value());
    }
    int status() {
      return dbiter->status().ok() ? 0 : -1;
    }
  };

  class LevelDBSnapshotIteratorImpl : public LevelDBWholeSpaceIteratorImpl {
    leveldb::DB *db;
    const leveldb::Snapshot *snapshot;
  public:
    LevelDBSnapshotIteratorImpl(leveldb::DB *db, const leveldb::Snapshot *s,
				leveldb::Iterator *iter) :
      LevelDBWholeSpaceIteratorImpl(iter), db(db), snapshot(s) { }

    ~LevelDBSnapshotIteratorImpl() {
      assert(snapshot != NULL);
      db->ReleaseSnapshot(snapshot);
    }
  };

  /// Utility
  static string combine_strings(const string &prefix, const string &value);
  static int split_key(leveldb::Slice in, string *prefix, string *key);
  static bufferlist to_bufferlist(leveldb::Slice in);
  static bool in_prefix(const string &prefix, leveldb::Slice key) {
    return (key.compare(leveldb::Slice(past_prefix(prefix))) < 0) &&
      (key.compare(leveldb::Slice(prefix)) > 0);
  }
  static string past_prefix(const string &prefix) {
    string limit = prefix;
    limit.push_back(1);
    return limit;
  }

  virtual uint64_t get_estimated_size(map<string,uint64_t> &extra) {
    DIR *store_dir = opendir(path.c_str());
    if (!store_dir) {
      lderr(cct) << __func__ << " something happened opening the store: "
                 << cpp_strerror(errno) << dendl;
      return 0;
    }

    uint64_t total_size = 0;
    uint64_t sst_size = 0;
    uint64_t log_size = 0;
    uint64_t misc_size = 0;

    struct dirent *entry = NULL;
    while ((entry = readdir(store_dir)) != NULL) {
      string n(entry->d_name);

      if (n == "." || n == "..")
        continue;

      string fpath = path + '/' + n;
      struct stat s;
      int err = stat(fpath.c_str(), &s);
      if (err < 0)
	err = -errno;
      // we may race against leveldb while reading files; this should only
      // happen when those files are being updated, data is being shuffled
      // and files get removed, in which case there's not much of a problem
      // as we'll get to them next time around.
      if ((err < 0) && (err != -ENOENT)) {
        lderr(cct) << __func__ << " error obtaining stats for " << fpath
                   << ": " << cpp_strerror(err) << dendl;
        goto err;
      }

      size_t pos = n.find_last_of('.');
      if (pos == string::npos) {
        misc_size += s.st_size;
        continue;
      }

      string ext = n.substr(pos+1);
      if (ext == "sst") {
        sst_size += s.st_size;
      } else if (ext == "log") {
        log_size += s.st_size;
      } else {
        misc_size += s.st_size;
      }
    }

    total_size = sst_size + log_size + misc_size;

    extra["sst"] = sst_size;
    extra["log"] = log_size;
    extra["misc"] = misc_size;
    extra["total"] = total_size;

err:
    closedir(store_dir);
    return total_size;
  }


protected:
  WholeSpaceIterator _get_iterator() {
    return std::tr1::shared_ptr<KeyValueDB::WholeSpaceIteratorImpl>(
      new LevelDBWholeSpaceIteratorImpl(
	db->NewIterator(leveldb::ReadOptions())
      )
    );
  }

  WholeSpaceIterator _get_snapshot_iterator() {
    const leveldb::Snapshot *snapshot;
    leveldb::ReadOptions options;

    snapshot = db->GetSnapshot();
    options.snapshot = snapshot;

    return std::tr1::shared_ptr<KeyValueDB::WholeSpaceIteratorImpl>(
      new LevelDBSnapshotIteratorImpl(db.get(), snapshot,
	db->NewIterator(options))
    );
  }

};

#endif