From a786ad773cd33880075f1deb3691528d1afd03ec Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 9 Sep 2013 22:27:23 -0700 Subject: ceph-disk: make initial journal files 0 bytes The ceph-osd will resize journal files up and properly fallocate() them so that the blocks are preallocated and (hopefully) contiguous. We don't need to do it here too, and getting fallocate() to work from python is a pain in the butt. Fixes: #5981 Signed-off-by: Sage Weil --- src/ceph-disk | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/ceph-disk b/src/ceph-disk index 3d09bdf7418..641bd64d60e 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -888,15 +888,12 @@ def prepare_journal_dev( def prepare_journal_file( - journal, - journal_size): + journal): if not os.path.exists(journal): - LOG.debug('Creating journal file %s with size %dM', journal, journal_size) + LOG.debug('Creating journal file %s with size 0 (ceph-osd will resize and allocate)', journal) with file(journal, 'wb') as journal_file: - journal_file.truncate(journal_size * 1048576) - - # FIXME: should we resize an existing journal file? + pass LOG.debug('Journal is file %s', journal) LOG.warning('OSD will not be hot-swappable if journal is not the same device as the osd data') @@ -921,13 +918,13 @@ def prepare_journal( if not os.path.exists(journal): if force_dev: raise Error('Journal does not exist; not a block device', journal) - return prepare_journal_file(journal, journal_size) + return prepare_journal_file(journal) jmode = os.stat(journal).st_mode if stat.S_ISREG(jmode): if force_dev: raise Error('Journal is not a block device', journal) - return prepare_journal_file(journal, journal_size) + return prepare_journal_file(journal) if stat.S_ISBLK(jmode): if force_file: -- cgit v1.2.1 From eda807e01e39522ec20f4e90af8c44e7514d8af2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 18 Sep 2013 14:33:12 -0700 Subject: common/bloom_filter: make mode match formatting 3 space tabs.. blech. Let's not change it now, though. Signed-off-by: Sage Weil --- src/include/bloom_filter.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/include/bloom_filter.hpp b/src/include/bloom_filter.hpp index 41aba4bad47..5c4fb699587 100644 --- a/src/include/bloom_filter.hpp +++ b/src/include/bloom_filter.hpp @@ -1,3 +1,6 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:3; indent-tabs-mode:t -*- +// vim: ts=8 sw=3 smarttab + /* ******************************************************************* * * -- cgit v1.2.1 From 8dcdeb2e02ea86ceb40cbef812b33f4243838f7b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 18 Sep 2013 14:50:05 -0700 Subject: common/bloom_filter: make optimal parameter calculation static We pass the ctor our target behavior and calculate parameters based on that. Avoid storing the target behavior, and make that calc a static method. And add a new ctor that takes the parameters explicitly. 
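For reference, the parameter search that both constructors now rely on minimizes the bit-table size over the number of hash functions: for a target insert count n and false-positive probability p, each candidate k is scored as

    m(k) = -k * n / ln(1 - p^(1/k))

and the k yielding the smallest m wins, with the result then rounded up to a multiple of bits_per_char. A rough worked example (illustrative numbers, not from the patch): with n = 100 and p = 0.05, k = 3 gives m ~ 653 bits, k = 4 gives m ~ 625, and k = 5 gives m ~ 627, so the loop settles on k = 4 and roughly a 624-bit (78-byte) table.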
Signed-off-by: Sage Weil --- src/include/bloom_filter.hpp | 46 ++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/src/include/bloom_filter.hpp b/src/include/bloom_filter.hpp index 5c4fb699587..59dafc9b5f7 100644 --- a/src/include/bloom_filter.hpp +++ b/src/include/bloom_filter.hpp @@ -55,13 +55,26 @@ public: bloom_filter(const std::size_t& predicted_inserted_element_count, const double& false_positive_probability, const std::size_t& random_seed) - : bit_table_(0), - predicted_inserted_element_count_(predicted_inserted_element_count), - inserted_element_count_(0), - random_seed_((random_seed) ? random_seed : 0xA5A5A5A5), - desired_false_positive_probability_(false_positive_probability) + : bit_table_(0), + inserted_element_count_(0), + random_seed_((random_seed) ? random_seed : 0xA5A5A5A5) + { + find_optimal_parameters(predicted_inserted_element_count, false_positive_probability, + &salt_count_, &table_size_); + generate_unique_salt(); + raw_table_size_ = table_size_ / bits_per_char; + bit_table_ = new cell_type[raw_table_size_]; + std::fill_n(bit_table_,raw_table_size_,0x00); + } + + bloom_filter(const std::size_t& salt_count, std::size_t table_size, + const std::size_t& random_seed) + : bit_table_(0), + salt_count_(salt_count), + table_size_(table_size), + inserted_element_count_(0), + random_seed_((random_seed) ? random_seed : 0xA5A5A5A5) { - find_optimal_parameters(); generate_unique_salt(); raw_table_size_ = table_size_ / bits_per_char; bit_table_ = new cell_type[raw_table_size_]; @@ -79,10 +92,8 @@ public: salt_count_ = filter.salt_count_; table_size_ = filter.table_size_; raw_table_size_ = filter.raw_table_size_; - predicted_inserted_element_count_ = filter.predicted_inserted_element_count_; inserted_element_count_ = filter.inserted_element_count_; random_seed_ = filter.random_seed_; - desired_false_positive_probability_ = filter.desired_false_positive_probability_; delete[] bit_table_; bit_table_ = new cell_type[raw_table_size_]; std::copy(filter.bit_table_,filter.bit_table_ + raw_table_size_,bit_table_); @@ -370,7 +381,10 @@ protected: } } - void find_optimal_parameters() + static void find_optimal_parameters(std::size_t target_insert_count, + double target_fpp, + std::size_t *salt_count, + std::size_t *table_size) { /* Note: @@ -386,8 +400,8 @@ protected: double k = 1.0; while (k < 1000.0) { - double numerator = (- k * predicted_inserted_element_count_); - double denominator = std::log(1.0 - std::pow(desired_false_positive_probability_, 1.0 / k)); + double numerator = (- k * target_insert_count); + double denominator = std::log(1.0 - std::pow(target_fpp, 1.0 / k)); curr_m = numerator / denominator; if (curr_m < min_m) @@ -398,9 +412,10 @@ protected: k += 1.0; } - salt_count_ = static_cast(min_k); - table_size_ = static_cast(min_m); - table_size_ += (((table_size_ % bits_per_char) != 0) ? (bits_per_char - (table_size_ % bits_per_char)) : 0); + *salt_count = static_cast(min_k); + size_t t = static_cast(min_m); + t += (((t % bits_per_char) != 0) ? 
(bits_per_char - (t % bits_per_char)) : 0); + *table_size = t; } inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const @@ -436,10 +451,8 @@ protected: std::size_t salt_count_; std::size_t table_size_; std::size_t raw_table_size_; - std::size_t predicted_inserted_element_count_; std::size_t inserted_element_count_; std::size_t random_seed_; - double desired_false_positive_probability_; }; inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b) @@ -497,7 +510,6 @@ public: return false; } - desired_false_positive_probability_ = effective_fpp(); cell_type* tmp = new cell_type[new_table_size / bits_per_char]; std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp); cell_type* itr = bit_table_ + (new_table_size / bits_per_char); -- cgit v1.2.1 From 9df9155c4edc413185e6edeff4f2450af2461cfc Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 18 Sep 2013 20:07:01 -0700 Subject: common/bloom_filter: make bloom_filter encodable Signed-off-by: Sage Weil --- src/common/Makefile.am | 3 +- src/common/bloom_filter.cc | 76 ++++++++++++++++++++++++++++++++++++++++++++ src/include/bloom_filter.hpp | 27 +++++++++++++--- src/test/encoding/types.h | 3 ++ 4 files changed, 104 insertions(+), 5 deletions(-) create mode 100644 src/common/bloom_filter.cc diff --git a/src/common/Makefile.am b/src/common/Makefile.am index 3526118205f..9a368f91a07 100644 --- a/src/common/Makefile.am +++ b/src/common/Makefile.am @@ -64,7 +64,8 @@ libcommon_la_SOURCES = \ common/ceph_strings.cc \ common/ceph_frag.cc \ common/addr_parsing.c \ - common/hobject.cc + common/hobject.cc \ + common/bloom_filter.cc if LINUX libcommon_la_SOURCES += common/secret.c diff --git a/src/common/bloom_filter.cc b/src/common/bloom_filter.cc new file mode 100644 index 00000000000..f602b80149e --- /dev/null +++ b/src/common/bloom_filter.cc @@ -0,0 +1,76 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "include/types.h" +#include "common/bloom_filter.hpp" + +void bloom_filter::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + ::encode((uint64_t)salt_count_, bl); + ::encode((uint64_t)table_size_, bl); + ::encode((uint64_t)inserted_element_count_, bl); + ::encode((uint64_t)random_seed_, bl); + bufferptr bp((const char*)bit_table_, raw_table_size_); + ::encode(bp, bl); + ENCODE_FINISH(bl); +} + +void bloom_filter::decode(bufferlist::iterator& p) +{ + DECODE_START(1, p); + uint64_t v; + ::decode(v, p); + salt_count_ = v; + ::decode(v, p); + table_size_ = v; + ::decode(v, p); + inserted_element_count_ = v; + ::decode(v, p); + random_seed_ = v; + bufferlist t; + ::decode(t, p); + + salt_.clear(); + generate_unique_salt(); + raw_table_size_ = t.length(); + assert(raw_table_size_ == table_size_ / bits_per_char); + delete bit_table_; + bit_table_ = new cell_type[raw_table_size_]; + t.copy(0, raw_table_size_, (char *)bit_table_); + + DECODE_FINISH(p); +} + +void bloom_filter::dump(Formatter *f) const +{ + f->dump_unsigned("salt_count", salt_count_); + f->dump_unsigned("table_size", table_size_); + f->dump_unsigned("raw_table_size", raw_table_size_); + f->dump_unsigned("insert_count", inserted_element_count_); + f->dump_unsigned("random_seed", random_seed_); + + f->open_array_section("salt_table"); + for (std::vector::const_iterator i = salt_.begin(); i != salt_.end(); ++i) + f->dump_unsigned("salt", *i); + f->close_section(); + + f->open_array_section("bit_table"); + for (unsigned i = 0; i < 
raw_table_size_; ++i) + f->dump_unsigned("byte", (unsigned)bit_table_[i]); + f->close_section(); +} + +void bloom_filter::generate_test_instances(list& ls) +{ + ls.push_back(new bloom_filter(10, .5, 1)); + ls.push_back(new bloom_filter(10, .5, 1)); + ls.back()->insert("foo"); + ls.back()->insert("bar"); + ls.push_back(new bloom_filter(50, .5, 1)); + ls.back()->insert("foo"); + ls.back()->insert("bar"); + ls.back()->insert("baz"); + ls.back()->insert("boof"); + ls.back()->insert("boogggg"); +} diff --git a/src/include/bloom_filter.hpp b/src/include/bloom_filter.hpp index 59dafc9b5f7..a65543c88ed 100644 --- a/src/include/bloom_filter.hpp +++ b/src/include/bloom_filter.hpp @@ -29,6 +29,8 @@ #include #include +#include "include/encoding.h" +#include "common/Formatter.h" static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned) static const unsigned char bit_mask[bits_per_char] = { @@ -52,6 +54,15 @@ protected: public: + bloom_filter() + : bit_table_(0), + salt_count_(0), + table_size_(0), + raw_table_size_(0), + inserted_element_count_(0), + random_seed_(0) + {} + bloom_filter(const std::size_t& predicted_inserted_element_count, const double& false_positive_probability, const std::size_t& random_seed) @@ -61,10 +72,7 @@ public: { find_optimal_parameters(predicted_inserted_element_count, false_positive_probability, &salt_count_, &table_size_); - generate_unique_salt(); - raw_table_size_ = table_size_ / bits_per_char; - bit_table_ = new cell_type[raw_table_size_]; - std::fill_n(bit_table_,raw_table_size_,0x00); + init(); } bloom_filter(const std::size_t& salt_count, std::size_t table_size, @@ -75,6 +83,10 @@ public: inserted_element_count_(0), random_seed_((random_seed) ? random_seed : 0xA5A5A5A5) { + init(); + } + + void init() { generate_unique_salt(); raw_table_size_ = table_size_ / bits_per_char; bit_table_ = new cell_type[raw_table_size_]; @@ -453,7 +465,14 @@ protected: std::size_t raw_table_size_; std::size_t inserted_element_count_; std::size_t random_seed_; + +public: + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list& ls); }; +WRITE_CLASS_ENCODER(bloom_filter) inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b) { diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h index fe17f077d8e..dd1ea4d570c 100644 --- a/src/test/encoding/types.h +++ b/src/test/encoding/types.h @@ -4,6 +4,9 @@ TYPE(CompatSet) #include "include/filepath.h" TYPE(filepath) +#include "include/bloom_filter.hpp" +TYPE(bloom_filter) + #include "common/snap_types.h" TYPE(SnapContext) TYPE(SnapRealmInfo) -- cgit v1.2.1 From 12aa53cc940766b5ef1aabbbd1a252659d9654ef Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 18 Sep 2013 20:40:57 -0700 Subject: common/bloom_filter: move header from include/ Signed-off-by: Sage Weil --- src/common/Makefile.am | 1 + src/common/bloom_filter.hpp | 578 +++++++++++++++++++++++++++++++++++++++++++ src/include/Makefile.am | 1 - src/include/bloom_filter.hpp | 578 ------------------------------------------- src/mds/CDir.cc | 2 +- src/test/encoding/types.h | 2 +- 6 files changed, 581 insertions(+), 581 deletions(-) create mode 100644 src/common/bloom_filter.hpp delete mode 100644 src/include/bloom_filter.hpp diff --git a/src/common/Makefile.am b/src/common/Makefile.am index 9a368f91a07..c29195abade 100644 --- a/src/common/Makefile.am +++ b/src/common/Makefile.am @@ -97,6 +97,7 @@ LIBCOMMON_DEPS += libcommon_crc.la noinst_LTLIBRARIES += 
libcommon_crc.la noinst_HEADERS += \ + common/bloom_filter.hpp \ common/sctp_crc32.h \ common/crc32c_intel_baseline.h \ common/crc32c_intel_fast.h diff --git a/src/common/bloom_filter.hpp b/src/common/bloom_filter.hpp new file mode 100644 index 00000000000..2a1ee2c4217 --- /dev/null +++ b/src/common/bloom_filter.hpp @@ -0,0 +1,578 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:3; indent-tabs-mode:t -*- +// vim: ts=8 sw=3 smarttab + +/* + ******************************************************************* + * * + * Open Bloom Filter * + * * + * Author: Arash Partow - 2000 * + * URL: http://www.partow.net/programming/hashfunctions/index.html * + * * + * Copyright notice: * + * Free use of the Open Bloom Filter Library is permitted under * + * the guidelines and in accordance with the most current version * + * of the Boost Software License, Version 1.0 * + * http://www.opensource.org/licenses/bsl1.0.html * + * * + ******************************************************************* +*/ + + +#ifndef COMMON_BLOOM_FILTER_HPP +#define COMMON_BLOOM_FILTER_HPP + +#include +#include +#include +#include +#include +#include + +#include "include/encoding.h" +#include "common/Formatter.h" + +static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned) +static const unsigned char bit_mask[bits_per_char] = { + 0x01, //00000001 + 0x02, //00000010 + 0x04, //00000100 + 0x08, //00001000 + 0x10, //00010000 + 0x20, //00100000 + 0x40, //01000000 + 0x80 //10000000 + }; + + +class bloom_filter +{ +protected: + + typedef unsigned int bloom_type; + typedef unsigned char cell_type; + +public: + + bloom_filter() + : bit_table_(0), + salt_count_(0), + table_size_(0), + raw_table_size_(0), + inserted_element_count_(0), + random_seed_(0) + {} + + bloom_filter(const std::size_t& predicted_inserted_element_count, + const double& false_positive_probability, + const std::size_t& random_seed) + : bit_table_(0), + inserted_element_count_(0), + random_seed_((random_seed) ? random_seed : 0xA5A5A5A5) + { + find_optimal_parameters(predicted_inserted_element_count, false_positive_probability, + &salt_count_, &table_size_); + init(); + } + + bloom_filter(const std::size_t& salt_count, std::size_t table_size, + const std::size_t& random_seed) + : bit_table_(0), + salt_count_(salt_count), + table_size_(table_size), + inserted_element_count_(0), + random_seed_((random_seed) ? 
random_seed : 0xA5A5A5A5) + { + init(); + } + + void init() { + generate_unique_salt(); + raw_table_size_ = table_size_ / bits_per_char; + bit_table_ = new cell_type[raw_table_size_]; + std::fill_n(bit_table_,raw_table_size_,0x00); + } + + bloom_filter(const bloom_filter& filter) + { + this->operator=(filter); + } + + bloom_filter& operator = (const bloom_filter& filter) + { + if (this != &filter) { + salt_count_ = filter.salt_count_; + table_size_ = filter.table_size_; + raw_table_size_ = filter.raw_table_size_; + inserted_element_count_ = filter.inserted_element_count_; + random_seed_ = filter.random_seed_; + delete[] bit_table_; + bit_table_ = new cell_type[raw_table_size_]; + std::copy(filter.bit_table_,filter.bit_table_ + raw_table_size_,bit_table_); + salt_ = filter.salt_; + } + return *this; + } + + virtual ~bloom_filter() + { + delete[] bit_table_; + } + + inline bool operator!() const + { + return (0 == table_size_); + } + + inline void clear() + { + std::fill_n(bit_table_,raw_table_size_,0x00); + inserted_element_count_ = 0; + } + + inline void insert(const unsigned char* key_begin, const std::size_t& length) + { + std::size_t bit_index = 0; + std::size_t bit = 0; + for (std::size_t i = 0; i < salt_.size(); ++i) + { + compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit); + bit_table_[bit_index / bits_per_char] |= bit_mask[bit]; + } + ++inserted_element_count_; + } + + template + inline void insert(const T& t) + { + // Note: T must be a C++ POD type. + insert(reinterpret_cast(&t),sizeof(T)); + } + + inline void insert(const std::string& key) + { + insert(reinterpret_cast(key.c_str()),key.size()); + } + + inline void insert(const char* data, const std::size_t& length) + { + insert(reinterpret_cast(data),length); + } + + template + inline void insert(const InputIterator begin, const InputIterator end) + { + InputIterator itr = begin; + while (end != itr) + { + insert(*(itr++)); + } + } + + inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const + { + std::size_t bit_index = 0; + std::size_t bit = 0; + for (std::size_t i = 0; i < salt_.size(); ++i) + { + compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit); + if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit]) + { + return false; + } + } + return true; + } + + template + inline bool contains(const T& t) const + { + return contains(reinterpret_cast(&t),static_cast(sizeof(T))); + } + + inline bool contains(const std::string& key) const + { + return contains(reinterpret_cast(key.c_str()),key.size()); + } + + inline bool contains(const char* data, const std::size_t& length) const + { + return contains(reinterpret_cast(data),length); + } + + template + inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const + { + InputIterator itr = begin; + while (end != itr) + { + if (!contains(*itr)) + { + return itr; + } + ++itr; + } + return end; + } + + template + inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const + { + InputIterator itr = begin; + while (end != itr) + { + if (contains(*itr)) + { + return itr; + } + ++itr; + } + return end; + } + + inline virtual std::size_t size() const + { + return table_size_; + } + + inline std::size_t element_count() const + { + return inserted_element_count_; + } + + inline double effective_fpp() const + { + /* + Note: + The effective false positive probability is calculated using the + designated table size and hash function count in conjunction 
with + the current number of inserted elements - not the user defined + predicated/expected number of inserted elements. + */ + return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size()); + } + + inline bloom_filter& operator &= (const bloom_filter& filter) + { + /* intersection */ + if ( + (salt_count_ == filter.salt_count_) && + (table_size_ == filter.table_size_) && + (random_seed_ == filter.random_seed_) + ) + { + for (std::size_t i = 0; i < raw_table_size_; ++i) + { + bit_table_[i] &= filter.bit_table_[i]; + } + } + return *this; + } + + inline bloom_filter& operator |= (const bloom_filter& filter) + { + /* union */ + if ( + (salt_count_ == filter.salt_count_) && + (table_size_ == filter.table_size_) && + (random_seed_ == filter.random_seed_) + ) + { + for (std::size_t i = 0; i < raw_table_size_; ++i) + { + bit_table_[i] |= filter.bit_table_[i]; + } + } + return *this; + } + + inline bloom_filter& operator ^= (const bloom_filter& filter) + { + /* difference */ + if ( + (salt_count_ == filter.salt_count_) && + (table_size_ == filter.table_size_) && + (random_seed_ == filter.random_seed_) + ) + { + for (std::size_t i = 0; i < raw_table_size_; ++i) + { + bit_table_[i] ^= filter.bit_table_[i]; + } + } + return *this; + } + + inline const cell_type* table() const + { + return bit_table_; + } + +protected: + + inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const + { + bit_index = hash % table_size_; + bit = bit_index % bits_per_char; + } + + void generate_unique_salt() + { + /* + Note: + A distinct hash function need not be implementation-wise + distinct. In the current implementation "seeding" a common + hash function with different values seems to be adequate. 
+ */ + const unsigned int predef_salt_count = 128; + static const bloom_type predef_salt[predef_salt_count] = + { + 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC, + 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B, + 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66, + 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA, + 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99, + 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33, + 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5, + 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000, + 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F, + 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63, + 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7, + 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492, + 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A, + 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B, + 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3, + 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432, + 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC, + 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB, + 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331, + 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68, + 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8, + 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A, + 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF, + 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E, + 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39, + 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E, + 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355, + 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E, + 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79, + 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075, + 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC, + 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421 + }; + + if (salt_count_ <= predef_salt_count) + { + std::copy(predef_salt, + predef_salt + salt_count_, + std::back_inserter(salt_)); + for (unsigned int i = 0; i < salt_.size(); ++i) + { + /* + Note: + This is done to integrate the user defined random seed, + so as to allow for the generation of unique bloom filter + instances. + */ + salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_; + } + } + else + { + std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_)); + srand(static_cast(random_seed_)); + while (salt_.size() < salt_count_) + { + bloom_type current_salt = static_cast(rand()) * static_cast(rand()); + if (0 == current_salt) continue; + if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt)) + { + salt_.push_back(current_salt); + } + } + } + } + + static void find_optimal_parameters(std::size_t target_insert_count, + double target_fpp, + std::size_t *salt_count, + std::size_t *table_size) + { + /* + Note: + The following will attempt to find the number of hash functions + and minimum amount of storage bits required to construct a bloom + filter consistent with the user defined false positive probability + and estimated element insertion count. + */ + + double min_m = std::numeric_limits::infinity(); + double min_k = 0.0; + double curr_m = 0.0; + double k = 1.0; + while (k < 1000.0) + { + double numerator = (- k * target_insert_count); + double denominator = std::log(1.0 - std::pow(target_fpp, 1.0 / k)); + curr_m = numerator / denominator; + + if (curr_m < min_m) + { + min_m = curr_m; + min_k = k; + } + k += 1.0; + } + + *salt_count = static_cast(min_k); + size_t t = static_cast(min_m); + t += (((t % bits_per_char) != 0) ? 
(bits_per_char - (t % bits_per_char)) : 0); + *table_size = t; + } + + inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const + { + const unsigned char* itr = begin; + + while (remaining_length >= 4) + { + hash ^= (hash << 7) ^ (*itr++) * (hash >> 3); + hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5)))); + hash ^= (hash << 7) ^ (*itr++) * (hash >> 3); + hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5)))); + remaining_length -= 4; + } + + while (remaining_length >= 2) + { + hash ^= (hash << 7) ^ (*itr++) * (hash >> 3); + hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5)))); + remaining_length -= 2; + } + + if (remaining_length) + { + hash ^= (hash << 7) ^ (*itr) * (hash >> 3); + } + + return hash; + } + + std::vector salt_; + unsigned char* bit_table_; + std::size_t salt_count_; + std::size_t table_size_; + std::size_t raw_table_size_; + std::size_t inserted_element_count_; + std::size_t random_seed_; + +public: + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list& ls); +}; +WRITE_CLASS_ENCODER(bloom_filter) + +inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b) +{ + bloom_filter result = a; + result &= b; + return result; +} + +inline bloom_filter operator | (const bloom_filter& a, const bloom_filter& b) +{ + bloom_filter result = a; + result |= b; + return result; +} + +inline bloom_filter operator ^ (const bloom_filter& a, const bloom_filter& b) +{ + bloom_filter result = a; + result ^= b; + return result; +} + + +class compressible_bloom_filter : public bloom_filter +{ +public: + + compressible_bloom_filter(const std::size_t& predicted_element_count, + const double& false_positive_probability, + const std::size_t& random_seed) + : bloom_filter(predicted_element_count,false_positive_probability,random_seed) + { + size_list.push_back(table_size_); + } + + inline virtual std::size_t size() const + { + return size_list.back(); + } + + inline bool compress(const double& percentage) + { + if ((0.0 >= percentage) || (percentage >= 100.0)) + { + return false; + } + + std::size_t original_table_size = size_list.back(); + std::size_t new_table_size = static_cast((size_list.back() * (1.0 - (percentage / 100.0)))); + new_table_size -= (((new_table_size % bits_per_char) != 0) ? 
(new_table_size % bits_per_char) : 0); + + if ((bits_per_char > new_table_size) || (new_table_size >= original_table_size)) + { + return false; + } + + cell_type* tmp = new cell_type[new_table_size / bits_per_char]; + std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp); + cell_type* itr = bit_table_ + (new_table_size / bits_per_char); + cell_type* end = bit_table_ + (original_table_size / bits_per_char); + cell_type* itr_tmp = tmp; + + while (end != itr) + { + *(itr_tmp++) |= (*itr++); + } + + delete[] bit_table_; + bit_table_ = tmp; + size_list.push_back(new_table_size); + + return true; + } + +private: + + inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const + { + bit_index = hash; + for (std::size_t i = 0; i < size_list.size(); ++i) + { + bit_index %= size_list[i]; + } + bit = bit_index % bits_per_char; + } + + std::vector size_list; +}; + +#endif + + +/* + Note 1: + If it can be guaranteed that bits_per_char will be of the form 2^n then + the following optimization can be used: + + hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)]; + + Note 2: + For performance reasons where possible when allocating memory it should + be aligned (aligned_alloc) according to the architecture being used. +*/ diff --git a/src/include/Makefile.am b/src/include/Makefile.am index d702ebd2795..2d98e777f00 100644 --- a/src/include/Makefile.am +++ b/src/include/Makefile.am @@ -18,7 +18,6 @@ rados_include_DATA = \ $(srcdir)/include/crc32c.h noinst_HEADERS += \ - include/bloom_filter.hpp \ include/Context.h \ include/CompatSet.h \ include/Distribution.h \ diff --git a/src/include/bloom_filter.hpp b/src/include/bloom_filter.hpp deleted file mode 100644 index a65543c88ed..00000000000 --- a/src/include/bloom_filter.hpp +++ /dev/null @@ -1,578 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:3; indent-tabs-mode:t -*- -// vim: ts=8 sw=3 smarttab - -/* - ******************************************************************* - * * - * Open Bloom Filter * - * * - * Author: Arash Partow - 2000 * - * URL: http://www.partow.net/programming/hashfunctions/index.html * - * * - * Copyright notice: * - * Free use of the Open Bloom Filter Library is permitted under * - * the guidelines and in accordance with the most current version * - * of the Boost Software License, Version 1.0 * - * http://www.opensource.org/licenses/bsl1.0.html * - * * - ******************************************************************* -*/ - - -#ifndef INCLUDE_BLOOM_FILTER_HPP -#define INCLUDE_BLOOM_FILTER_HPP - -#include -#include -#include -#include -#include -#include - -#include "include/encoding.h" -#include "common/Formatter.h" - -static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned) -static const unsigned char bit_mask[bits_per_char] = { - 0x01, //00000001 - 0x02, //00000010 - 0x04, //00000100 - 0x08, //00001000 - 0x10, //00010000 - 0x20, //00100000 - 0x40, //01000000 - 0x80 //10000000 - }; - - -class bloom_filter -{ -protected: - - typedef unsigned int bloom_type; - typedef unsigned char cell_type; - -public: - - bloom_filter() - : bit_table_(0), - salt_count_(0), - table_size_(0), - raw_table_size_(0), - inserted_element_count_(0), - random_seed_(0) - {} - - bloom_filter(const std::size_t& predicted_inserted_element_count, - const double& false_positive_probability, - const std::size_t& random_seed) - : bit_table_(0), - inserted_element_count_(0), - random_seed_((random_seed) ? 
random_seed : 0xA5A5A5A5) - { - find_optimal_parameters(predicted_inserted_element_count, false_positive_probability, - &salt_count_, &table_size_); - init(); - } - - bloom_filter(const std::size_t& salt_count, std::size_t table_size, - const std::size_t& random_seed) - : bit_table_(0), - salt_count_(salt_count), - table_size_(table_size), - inserted_element_count_(0), - random_seed_((random_seed) ? random_seed : 0xA5A5A5A5) - { - init(); - } - - void init() { - generate_unique_salt(); - raw_table_size_ = table_size_ / bits_per_char; - bit_table_ = new cell_type[raw_table_size_]; - std::fill_n(bit_table_,raw_table_size_,0x00); - } - - bloom_filter(const bloom_filter& filter) - { - this->operator=(filter); - } - - bloom_filter& operator = (const bloom_filter& filter) - { - if (this != &filter) { - salt_count_ = filter.salt_count_; - table_size_ = filter.table_size_; - raw_table_size_ = filter.raw_table_size_; - inserted_element_count_ = filter.inserted_element_count_; - random_seed_ = filter.random_seed_; - delete[] bit_table_; - bit_table_ = new cell_type[raw_table_size_]; - std::copy(filter.bit_table_,filter.bit_table_ + raw_table_size_,bit_table_); - salt_ = filter.salt_; - } - return *this; - } - - virtual ~bloom_filter() - { - delete[] bit_table_; - } - - inline bool operator!() const - { - return (0 == table_size_); - } - - inline void clear() - { - std::fill_n(bit_table_,raw_table_size_,0x00); - inserted_element_count_ = 0; - } - - inline void insert(const unsigned char* key_begin, const std::size_t& length) - { - std::size_t bit_index = 0; - std::size_t bit = 0; - for (std::size_t i = 0; i < salt_.size(); ++i) - { - compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit); - bit_table_[bit_index / bits_per_char] |= bit_mask[bit]; - } - ++inserted_element_count_; - } - - template - inline void insert(const T& t) - { - // Note: T must be a C++ POD type. 
- insert(reinterpret_cast(&t),sizeof(T)); - } - - inline void insert(const std::string& key) - { - insert(reinterpret_cast(key.c_str()),key.size()); - } - - inline void insert(const char* data, const std::size_t& length) - { - insert(reinterpret_cast(data),length); - } - - template - inline void insert(const InputIterator begin, const InputIterator end) - { - InputIterator itr = begin; - while (end != itr) - { - insert(*(itr++)); - } - } - - inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const - { - std::size_t bit_index = 0; - std::size_t bit = 0; - for (std::size_t i = 0; i < salt_.size(); ++i) - { - compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit); - if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit]) - { - return false; - } - } - return true; - } - - template - inline bool contains(const T& t) const - { - return contains(reinterpret_cast(&t),static_cast(sizeof(T))); - } - - inline bool contains(const std::string& key) const - { - return contains(reinterpret_cast(key.c_str()),key.size()); - } - - inline bool contains(const char* data, const std::size_t& length) const - { - return contains(reinterpret_cast(data),length); - } - - template - inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const - { - InputIterator itr = begin; - while (end != itr) - { - if (!contains(*itr)) - { - return itr; - } - ++itr; - } - return end; - } - - template - inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const - { - InputIterator itr = begin; - while (end != itr) - { - if (contains(*itr)) - { - return itr; - } - ++itr; - } - return end; - } - - inline virtual std::size_t size() const - { - return table_size_; - } - - inline std::size_t element_count() const - { - return inserted_element_count_; - } - - inline double effective_fpp() const - { - /* - Note: - The effective false positive probability is calculated using the - designated table size and hash function count in conjunction with - the current number of inserted elements - not the user defined - predicated/expected number of inserted elements. 
- */ - return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size()); - } - - inline bloom_filter& operator &= (const bloom_filter& filter) - { - /* intersection */ - if ( - (salt_count_ == filter.salt_count_) && - (table_size_ == filter.table_size_) && - (random_seed_ == filter.random_seed_) - ) - { - for (std::size_t i = 0; i < raw_table_size_; ++i) - { - bit_table_[i] &= filter.bit_table_[i]; - } - } - return *this; - } - - inline bloom_filter& operator |= (const bloom_filter& filter) - { - /* union */ - if ( - (salt_count_ == filter.salt_count_) && - (table_size_ == filter.table_size_) && - (random_seed_ == filter.random_seed_) - ) - { - for (std::size_t i = 0; i < raw_table_size_; ++i) - { - bit_table_[i] |= filter.bit_table_[i]; - } - } - return *this; - } - - inline bloom_filter& operator ^= (const bloom_filter& filter) - { - /* difference */ - if ( - (salt_count_ == filter.salt_count_) && - (table_size_ == filter.table_size_) && - (random_seed_ == filter.random_seed_) - ) - { - for (std::size_t i = 0; i < raw_table_size_; ++i) - { - bit_table_[i] ^= filter.bit_table_[i]; - } - } - return *this; - } - - inline const cell_type* table() const - { - return bit_table_; - } - -protected: - - inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const - { - bit_index = hash % table_size_; - bit = bit_index % bits_per_char; - } - - void generate_unique_salt() - { - /* - Note: - A distinct hash function need not be implementation-wise - distinct. In the current implementation "seeding" a common - hash function with different values seems to be adequate. - */ - const unsigned int predef_salt_count = 128; - static const bloom_type predef_salt[predef_salt_count] = - { - 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC, - 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B, - 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66, - 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA, - 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99, - 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33, - 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5, - 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000, - 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F, - 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63, - 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7, - 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492, - 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A, - 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B, - 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3, - 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432, - 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC, - 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB, - 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331, - 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68, - 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8, - 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A, - 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF, - 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E, - 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39, - 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E, - 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355, - 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E, - 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79, - 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075, - 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC, - 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421 - }; - - if (salt_count_ <= predef_salt_count) - { - std::copy(predef_salt, - predef_salt + salt_count_, - std::back_inserter(salt_)); - for (unsigned int i = 0; i < 
salt_.size(); ++i) - { - /* - Note: - This is done to integrate the user defined random seed, - so as to allow for the generation of unique bloom filter - instances. - */ - salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_; - } - } - else - { - std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_)); - srand(static_cast(random_seed_)); - while (salt_.size() < salt_count_) - { - bloom_type current_salt = static_cast(rand()) * static_cast(rand()); - if (0 == current_salt) continue; - if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt)) - { - salt_.push_back(current_salt); - } - } - } - } - - static void find_optimal_parameters(std::size_t target_insert_count, - double target_fpp, - std::size_t *salt_count, - std::size_t *table_size) - { - /* - Note: - The following will attempt to find the number of hash functions - and minimum amount of storage bits required to construct a bloom - filter consistent with the user defined false positive probability - and estimated element insertion count. - */ - - double min_m = std::numeric_limits::infinity(); - double min_k = 0.0; - double curr_m = 0.0; - double k = 1.0; - while (k < 1000.0) - { - double numerator = (- k * target_insert_count); - double denominator = std::log(1.0 - std::pow(target_fpp, 1.0 / k)); - curr_m = numerator / denominator; - - if (curr_m < min_m) - { - min_m = curr_m; - min_k = k; - } - k += 1.0; - } - - *salt_count = static_cast(min_k); - size_t t = static_cast(min_m); - t += (((t % bits_per_char) != 0) ? (bits_per_char - (t % bits_per_char)) : 0); - *table_size = t; - } - - inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const - { - const unsigned char* itr = begin; - - while (remaining_length >= 4) - { - hash ^= (hash << 7) ^ (*itr++) * (hash >> 3); - hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5)))); - hash ^= (hash << 7) ^ (*itr++) * (hash >> 3); - hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5)))); - remaining_length -= 4; - } - - while (remaining_length >= 2) - { - hash ^= (hash << 7) ^ (*itr++) * (hash >> 3); - hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5)))); - remaining_length -= 2; - } - - if (remaining_length) - { - hash ^= (hash << 7) ^ (*itr) * (hash >> 3); - } - - return hash; - } - - std::vector salt_; - unsigned char* bit_table_; - std::size_t salt_count_; - std::size_t table_size_; - std::size_t raw_table_size_; - std::size_t inserted_element_count_; - std::size_t random_seed_; - -public: - void encode(bufferlist& bl) const; - void decode(bufferlist::iterator& bl); - void dump(Formatter *f) const; - static void generate_test_instances(list& ls); -}; -WRITE_CLASS_ENCODER(bloom_filter) - -inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b) -{ - bloom_filter result = a; - result &= b; - return result; -} - -inline bloom_filter operator | (const bloom_filter& a, const bloom_filter& b) -{ - bloom_filter result = a; - result |= b; - return result; -} - -inline bloom_filter operator ^ (const bloom_filter& a, const bloom_filter& b) -{ - bloom_filter result = a; - result ^= b; - return result; -} - - -class compressible_bloom_filter : public bloom_filter -{ -public: - - compressible_bloom_filter(const std::size_t& predicted_element_count, - const double& false_positive_probability, - const std::size_t& random_seed) - : bloom_filter(predicted_element_count,false_positive_probability,random_seed) - { - size_list.push_back(table_size_); - } - - inline virtual std::size_t size() 
const - { - return size_list.back(); - } - - inline bool compress(const double& percentage) - { - if ((0.0 >= percentage) || (percentage >= 100.0)) - { - return false; - } - - std::size_t original_table_size = size_list.back(); - std::size_t new_table_size = static_cast((size_list.back() * (1.0 - (percentage / 100.0)))); - new_table_size -= (((new_table_size % bits_per_char) != 0) ? (new_table_size % bits_per_char) : 0); - - if ((bits_per_char > new_table_size) || (new_table_size >= original_table_size)) - { - return false; - } - - cell_type* tmp = new cell_type[new_table_size / bits_per_char]; - std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp); - cell_type* itr = bit_table_ + (new_table_size / bits_per_char); - cell_type* end = bit_table_ + (original_table_size / bits_per_char); - cell_type* itr_tmp = tmp; - - while (end != itr) - { - *(itr_tmp++) |= (*itr++); - } - - delete[] bit_table_; - bit_table_ = tmp; - size_list.push_back(new_table_size); - - return true; - } - -private: - - inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const - { - bit_index = hash; - for (std::size_t i = 0; i < size_list.size(); ++i) - { - bit_index %= size_list[i]; - } - bit = bit_index % bits_per_char; - } - - std::vector size_list; -}; - -#endif - - -/* - Note 1: - If it can be guaranteed that bits_per_char will be of the form 2^n then - the following optimization can be used: - - hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)]; - - Note 2: - For performance reasons where possible when allocating memory it should - be aligned (aligned_alloc) according to the architecture being used. -*/ diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index c77ca180a6f..4a5e636d9a6 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -27,7 +27,7 @@ #include "MDLog.h" #include "LogSegment.h" -#include "include/bloom_filter.hpp" +#include "common/bloom_filter.hpp" #include "include/Context.h" #include "common/Clock.h" diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h index dd1ea4d570c..3ff2821c428 100644 --- a/src/test/encoding/types.h +++ b/src/test/encoding/types.h @@ -4,7 +4,7 @@ TYPE(CompatSet) #include "include/filepath.h" TYPE(filepath) -#include "include/bloom_filter.hpp" +#include "common/bloom_filter.hpp" TYPE(bloom_filter) #include "common/snap_types.h" -- cgit v1.2.1 From bab72ed394161feb47637f9d2d07ff421e97726c Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 26 Sep 2013 14:26:52 -0700 Subject: os: Simplify collection_list* funcs by removing dynamic_cast Signed-off-by: David Zafman --- src/os/ObjectStore.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc index 84549821aff..1a1bbcb0b67 100644 --- a/src/os/ObjectStore.cc +++ b/src/os/ObjectStore.cc @@ -501,8 +501,7 @@ void ObjectStore::Transaction::generate_test_instances(list& o) { vector go; - FileStore *fs = dynamic_cast(this); - int ret = fs->collection_list(c, go); + int ret = collection_list(c, go); if (ret == 0) { o.reserve(go.size()); for (vector::iterator i = go.begin(); i != go.end() ; i++) @@ -517,8 +516,7 @@ int ObjectStore::collection_list_partial(coll_t c, hobject_t start, { vector go; ghobject_t gnext, gstart(start); - FileStore *fs = dynamic_cast(this); - int ret = fs->collection_list_partial(c, gstart, min, max, snap, &go, &gnext); + int ret = collection_list_partial(c, gstart, min, max, snap, &go, &gnext); if (ret == 0) { *next = gnext.hobj; 
ls->reserve(go.size()); @@ -533,8 +531,7 @@ int ObjectStore::collection_list_range(coll_t c, hobject_t start, hobject_t end, { vector go; ghobject_t gstart(start), gend(end); - FileStore *fs = dynamic_cast(this); - int ret = fs->collection_list_range(c, gstart, gend, seq, &go); + int ret = collection_list_range(c, gstart, gend, seq, &go); if (ret == 0) { ls->reserve(go.size()); for (vector::iterator i = go.begin(); i != go.end() ; i++) -- cgit v1.2.1 From dbfd4781a20302af506847a78007e029e34856b0 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Tue, 1 Oct 2013 13:28:03 -0700 Subject: ReplicatedPG: update pg stats correctly when doing a copy The obs.oi.size needs to updated in the middle so that we actually change the stats -- this got set backwards by mistake during one of the refactors to support large objects! (See 4e29e362e7981634d751ee982144fbf602782a9a) Signed-off-by: Greg Farnum --- src/osd/ReplicatedPG.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index fcaca434ba8..7d4c79b9553 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -4549,8 +4549,8 @@ int ReplicatedPG::finish_copy(OpContext *ctx) if (cop->cursor.data_offset != obs.oi.size) { ctx->delta_stats.num_bytes -= obs.oi.size; - ctx->delta_stats.num_bytes += obs.oi.size; obs.oi.size = cop->cursor.data_offset; + ctx->delta_stats.num_bytes += obs.oi.size; } ctx->delta_stats.num_wr++; ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10); -- cgit v1.2.1 From 4f5b317714928a48b3cece80f7b99e52613e6944 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Mon, 30 Sep 2013 15:36:27 -0700 Subject: ReplicatedPG: copy: do not use an OpContext in _copy_some Signed-off-by: Greg Farnum --- src/osd/ReplicatedPG.cc | 10 +++++----- src/osd/ReplicatedPG.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 7d4c79b9553..846669d58db 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -4397,14 +4397,14 @@ int ReplicatedPG::start_copy(OpContext *ctx, ctx->copy_op = cop; ++ctx->obc->copyfrom_readside; - _copy_some(ctx, cop); + _copy_some(ctx->obc, cop); return 0; } -void ReplicatedPG::_copy_some(OpContext *ctx, CopyOpRef cop) +void ReplicatedPG::_copy_some(ObjectContextRef obc, CopyOpRef cop) { - dout(10) << __func__ << " " << ctx << " " << cop << dendl; + dout(10) << __func__ << " " << obc << " " << cop << dendl; ObjectOperation op; if (cop->version) { op.assert_version(cop->version); @@ -4418,7 +4418,7 @@ void ReplicatedPG::_copy_some(OpContext *ctx, CopyOpRef cop) &cop->data, &cop->omap, &cop->rval); - C_Copyfrom *fin = new C_Copyfrom(this, ctx->obs->oi.soid, + C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid, get_last_peering_reset()); osd->objecter_lock.Lock(); tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op, @@ -4479,7 +4479,7 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, tid_t tid, int r) repop->put(); dout(10) << __func__ << " fetching more" << dendl; - _copy_some(ctx, cop); + _copy_some(ctx->obc, cop); return; } diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index abee57ffe7d..f6898d80f13 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -725,7 +725,7 @@ protected: int start_copy(OpContext *ctx, hobject_t src, object_locator_t oloc, version_t version); void process_copy_chunk(hobject_t oid, tid_t tid, int r); void _write_copy_chunk(CopyOpRef cop, ObjectStore::Transaction *t); - void _copy_some(OpContext 
*ctx, CopyOpRef cop); + void _copy_some(ObjectContextRef obc, CopyOpRef cop); int finish_copy(OpContext *ctx); void cancel_copy(CopyOpRef cop); void cancel_copy_ops(); -- cgit v1.2.1 From 6658f3cef1062bdab78a2139cd939e5e11021bc5 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Mon, 30 Sep 2013 15:47:31 -0700 Subject: ReplicatedPG: copy: remove most references to OpContext from process_copy_chunk Signed-off-by: Greg Farnum --- src/osd/ReplicatedPG.cc | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 846669d58db..4748c31203b 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -4447,11 +4447,12 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, tid_t tid, int r) return; } OpContext *ctx = cop->ctx; + ObjectContextRef obc = ctx->obc; cop->objecter_tid = 0; if (r < 0) { - copy_ops.erase(ctx->obc->obs.oi.soid); - --ctx->obc->copyfrom_readside; - kick_object_context_blocked(ctx->obc); + copy_ops.erase(obc->obs.oi.soid); + obc->copyfrom_readside; + kick_object_context_blocked(obc); reply_ctx(ctx, r); return; } @@ -4462,9 +4463,9 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, tid_t tid, int r) vector ops; tid_t rep_tid = osd->get_tid(); osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid); - OpContext *tctx = new OpContext(OpRequestRef(), reqid, ops, &ctx->obc->obs, ctx->obc->ssc, this); + OpContext *tctx = new OpContext(OpRequestRef(), reqid, ops, &obc->obs, obc->ssc, this); tctx->mtime = ceph_clock_now(g_ceph_context); - RepGather *repop = new_repop(tctx, ctx->obc, rep_tid); + RepGather *repop = new_repop(tctx, obc, rep_tid); if (cop->temp_cursor.is_initial()) { cop->temp_coll = get_temp_coll(&tctx->local_t); @@ -4479,17 +4480,17 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, tid_t tid, int r) repop->put(); dout(10) << __func__ << " fetching more" << dendl; - _copy_some(ctx->obc, cop); + _copy_some(obc, cop); return; } dout(20) << __func__ << " complete; committing" << dendl; execute_ctx(ctx); - copy_ops.erase(ctx->obc->obs.oi.soid); - --ctx->obc->copyfrom_readside; + copy_ops.erase(obc->obs.oi.soid); + --obc->copyfrom_readside; ctx->copy_op.reset(); - kick_object_context_blocked(ctx->obc); + kick_object_context_blocked(obc); } void ReplicatedPG::_write_copy_chunk(CopyOpRef cop, ObjectStore::Transaction *t) -- cgit v1.2.1 From 5307703bf18fea717f9daadba1c1653a5a30b716 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Mon, 30 Sep 2013 15:52:33 -0700 Subject: ReplicatedPG: follow the same finish path for failed copy ops We don't necessarily want to respond to clients with a failure if a copy got an error code. Instead, conditionally execute the success path and always launch back into execute_ctx() when the copy has stopped (either due to completion or failure). Update the COPY_FROM section so it returns the CopyOp::rval (instead of always zero) and only launches finish_copy() on success. Signed-off-by: Greg Farnum --- src/osd/ReplicatedPG.cc | 59 ++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 4748c31203b..feee6de920f 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -3770,7 +3770,10 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) result = -EINPROGRESS; } else { // finish - result = finish_copy(ctx); + result = ctx->copy_op->rval; + if (ctx->copy_op->rval >= 0) { //success! 
+ result = finish_copy(ctx); + } } } break; @@ -4449,39 +4452,35 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, tid_t tid, int r) OpContext *ctx = cop->ctx; ObjectContextRef obc = ctx->obc; cop->objecter_tid = 0; - if (r < 0) { - copy_ops.erase(obc->obs.oi.soid); - obc->copyfrom_readside; - kick_object_context_blocked(obc); - reply_ctx(ctx, r); - return; - } - assert(cop->rval >= 0); - if (!cop->cursor.is_complete()) { - // write out what we have so far - vector ops; - tid_t rep_tid = osd->get_tid(); - osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid); - OpContext *tctx = new OpContext(OpRequestRef(), reqid, ops, &obc->obs, obc->ssc, this); - tctx->mtime = ceph_clock_now(g_ceph_context); - RepGather *repop = new_repop(tctx, obc, rep_tid); - - if (cop->temp_cursor.is_initial()) { - cop->temp_coll = get_temp_coll(&tctx->local_t); - cop->temp_oid = generate_temp_object(); - repop->ctx->new_temp_oid = cop->temp_oid; - } + if (r >= 0) { + assert(cop->rval >= 0); + + if (!cop->cursor.is_complete()) { + // write out what we have so far + vector ops; + tid_t rep_tid = osd->get_tid(); + osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid); + OpContext *tctx = new OpContext(OpRequestRef(), reqid, ops, &obc->obs, obc->ssc, this); + tctx->mtime = ceph_clock_now(g_ceph_context); + RepGather *repop = new_repop(tctx, obc, rep_tid); + + if (cop->temp_cursor.is_initial()) { + cop->temp_coll = get_temp_coll(&tctx->local_t); + cop->temp_oid = generate_temp_object(); + repop->ctx->new_temp_oid = cop->temp_oid; + } - _write_copy_chunk(cop, &tctx->op_t); + _write_copy_chunk(cop, &tctx->op_t); - issue_repop(repop, repop->ctx->mtime); - eval_repop(repop); - repop->put(); + issue_repop(repop, repop->ctx->mtime); + eval_repop(repop); + repop->put(); - dout(10) << __func__ << " fetching more" << dendl; - _copy_some(obc, cop); - return; + dout(10) << __func__ << " fetching more" << dendl; + _copy_some(obc, cop); + return; + } } dout(20) << __func__ << " complete; committing" << dendl; -- cgit v1.2.1 From 6ed8b7ae56d88c586e3d359e649b41b892d3cee1 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Mon, 30 Sep 2013 16:20:26 -0700 Subject: ReplicatedPG: copy: add an ObjectContextRef to CopyOp Use that instead of the OpContext::obc in copy codepaths. 
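The point of carrying the reference in the CopyOp itself is lifetime: the asynchronous copy can be cancelled, or finish on a different path than the client request, so the op should pin the object context directly instead of reaching through the OpContext. A minimal sketch of the resulting access pattern (illustrative only, not the patch text):

    // before: cop->ctx->obc->obs.oi.soid   (borrowed via the request)
    // after:  cop->obc->obs.oi.soid        (pinned by the copy itself;
    //         ObjectContextRef is reference-counted, so the context
    //         stays alive for the whole async operation)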
Signed-off-by: Greg Farnum --- src/osd/ReplicatedPG.cc | 15 +++++++-------- src/osd/ReplicatedPG.h | 5 +++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index feee6de920f..aba0a1067c0 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -4395,7 +4395,7 @@ int ReplicatedPG::start_copy(OpContext *ctx, cancel_copy(cop); } - CopyOpRef cop(new CopyOp(ctx, src, oloc, version)); + CopyOpRef cop(new CopyOp(ctx, ctx->obc, src, oloc, version)); copy_ops[dest] = cop; ctx->copy_op = cop; ++ctx->obc->copyfrom_readside; @@ -4449,8 +4449,7 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, tid_t tid, int r) << " tid " << cop->objecter_tid << dendl; return; } - OpContext *ctx = cop->ctx; - ObjectContextRef obc = ctx->obc; + ObjectContextRef obc = cop->obc; cop->objecter_tid = 0; if (r >= 0) { @@ -4484,11 +4483,11 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, tid_t tid, int r) } dout(20) << __func__ << " complete; committing" << dendl; - execute_ctx(ctx); + execute_ctx(cop->ctx); copy_ops.erase(obc->obs.oi.soid); --obc->copyfrom_readside; - ctx->copy_op.reset(); + cop->ctx->copy_op.reset(); kick_object_context_blocked(obc); } @@ -4571,11 +4570,11 @@ void ReplicatedPG::cancel_copy(CopyOpRef cop) osd->objecter->op_cancel(cop->objecter_tid); } - copy_ops.erase(ctx->obc->obs.oi.soid); - --ctx->obc->copyfrom_readside; + copy_ops.erase(cop->obc->obs.oi.soid); + --cop->obc->copyfrom_readside; ctx->copy_op.reset(); - kick_object_context_blocked(ctx->obc); + kick_object_context_blocked(cop->obc); delete ctx; } diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index f6898d80f13..c968ad3ea2f 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -96,6 +96,7 @@ public: struct CopyOp { OpContext *ctx; + ObjectContextRef obc; hobject_t src; object_locator_t oloc; version_t version; @@ -114,8 +115,8 @@ public: hobject_t temp_oid; object_copy_cursor_t temp_cursor; - CopyOp(OpContext *c, hobject_t s, object_locator_t l, version_t v) - : ctx(c), src(s), oloc(l), version(v), + CopyOp(OpContext *c, ObjectContextRef _obc, hobject_t s, object_locator_t l, version_t v) + : ctx(c), obc(_obc), src(s), oloc(l), version(v), objecter_tid(0), size(0), rval(-1) -- cgit v1.2.1 From 1ae8ef28e7c511ac754429c8c061513fdf1c22b6 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Mon, 30 Sep 2013 16:27:39 -0700 Subject: ReplicatedPG: copy: take an ObjectContextRef in start_copy and use that Signed-off-by: Greg Farnum --- src/osd/ReplicatedPG.cc | 10 +++++----- src/osd/ReplicatedPG.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index aba0a1067c0..cfac7dd7db2 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -3764,7 +3764,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) result = -EINVAL; break; } - result = start_copy(ctx, src, src_oloc, src_version); + result = start_copy(ctx, ctx->obc, src, src_oloc, src_version); if (result < 0) goto fail; result = -EINPROGRESS; @@ -4379,7 +4379,7 @@ struct C_Copyfrom : public Context { } }; -int ReplicatedPG::start_copy(OpContext *ctx, +int ReplicatedPG::start_copy(OpContext *ctx, ObjectContextRef obc, hobject_t src, object_locator_t oloc, version_t version) { const hobject_t& dest = ctx->obs->oi.soid; @@ -4395,12 +4395,12 @@ int ReplicatedPG::start_copy(OpContext *ctx, cancel_copy(cop); } - CopyOpRef cop(new CopyOp(ctx, ctx->obc, src, oloc, version)); + CopyOpRef cop(new 
CopyOp(ctx, obc, src, oloc, version)); copy_ops[dest] = cop; ctx->copy_op = cop; - ++ctx->obc->copyfrom_readside; + ++obc->copyfrom_readside; - _copy_some(ctx->obc, cop); + _copy_some(obc, cop); return 0; } diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index c968ad3ea2f..5f0c97b3716 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -723,7 +723,7 @@ protected: // -- copyfrom -- map copy_ops; - int start_copy(OpContext *ctx, hobject_t src, object_locator_t oloc, version_t version); + int start_copy(OpContext *ctx, ObjectContextRef obc, hobject_t src, object_locator_t oloc, version_t version); void process_copy_chunk(hobject_t oid, tid_t tid, int r); void _write_copy_chunk(CopyOpRef cop, ObjectStore::Transaction *t); void _copy_some(ObjectContextRef obc, CopyOpRef cop); -- cgit v1.2.1 From 010ff3759efc650d766348ab988c302996b8fc50 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Mon, 30 Sep 2013 16:59:52 -0700 Subject: ReplicatedPG: copy: specify the temp_oid in the caller Signed-off-by: Greg Farnum --- src/osd/ReplicatedPG.cc | 10 ++++++---- src/osd/ReplicatedPG.h | 10 +++++++--- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index cfac7dd7db2..c3a572509bc 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -3764,7 +3764,9 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) result = -EINVAL; break; } - result = start_copy(ctx, ctx->obc, src, src_oloc, src_version); + hobject_t temp_target = generate_temp_object(); + result = start_copy(ctx, ctx->obc, src, src_oloc, src_version, + temp_target); if (result < 0) goto fail; result = -EINPROGRESS; @@ -4380,7 +4382,8 @@ struct C_Copyfrom : public Context { }; int ReplicatedPG::start_copy(OpContext *ctx, ObjectContextRef obc, - hobject_t src, object_locator_t oloc, version_t version) + hobject_t src, object_locator_t oloc, version_t version, + const hobject_t& temp_dest_oid) { const hobject_t& dest = ctx->obs->oi.soid; dout(10) << __func__ << " " << dest << " ctx " << ctx @@ -4395,7 +4398,7 @@ int ReplicatedPG::start_copy(OpContext *ctx, ObjectContextRef obc, cancel_copy(cop); } - CopyOpRef cop(new CopyOp(ctx, obc, src, oloc, version)); + CopyOpRef cop(new CopyOp(ctx, obc, src, oloc, version, temp_dest_oid)); copy_ops[dest] = cop; ctx->copy_op = cop; ++obc->copyfrom_readside; @@ -4466,7 +4469,6 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, tid_t tid, int r) if (cop->temp_cursor.is_initial()) { cop->temp_coll = get_temp_coll(&tctx->local_t); - cop->temp_oid = generate_temp_object(); repop->ctx->new_temp_oid = cop->temp_oid; } diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 5f0c97b3716..e4f6848d6c0 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -115,11 +115,13 @@ public: hobject_t temp_oid; object_copy_cursor_t temp_cursor; - CopyOp(OpContext *c, ObjectContextRef _obc, hobject_t s, object_locator_t l, version_t v) + CopyOp(OpContext *c, ObjectContextRef _obc, hobject_t s, object_locator_t l, + version_t v, const hobject_t& dest) : ctx(c), obc(_obc), src(s), oloc(l), version(v), objecter_tid(0), size(0), - rval(-1) + rval(-1), + temp_oid(dest) {} }; typedef boost::shared_ptr CopyOpRef; @@ -723,7 +725,9 @@ protected: // -- copyfrom -- map copy_ops; - int start_copy(OpContext *ctx, ObjectContextRef obc, hobject_t src, object_locator_t oloc, version_t version); + int start_copy(OpContext *ctx, ObjectContextRef obc, hobject_t src, + object_locator_t oloc, version_t version, + const 
hobject_t& temp_dest_oid); void process_copy_chunk(hobject_t oid, tid_t tid, int r); void _write_copy_chunk(CopyOpRef cop, ObjectStore::Transaction *t); void _copy_some(ObjectContextRef obc, CopyOpRef cop); -- cgit v1.2.1 From 1784ef96f474ccc9a7f2d8ea8d6ce90daddd1fdb Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Tue, 1 Oct 2013 12:48:48 -0700 Subject: ReplicatedPG: copy: split up the transaction generation from the PG management Signed-off-by: Greg Farnum --- src/osd/ReplicatedPG.cc | 28 ++++++++++++++++++++-------- src/osd/ReplicatedPG.h | 2 ++ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index c3a572509bc..efc1cf8f603 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -4517,16 +4517,12 @@ void ReplicatedPG::_write_copy_chunk(CopyOpRef cop, ObjectStore::Transaction *t) cop->temp_cursor = cop->cursor; } -int ReplicatedPG::finish_copy(OpContext *ctx) +void ReplicatedPG::_build_finish_copy_transaction(CopyOpRef cop, + ObjectStore::Transaction& t) { - CopyOpRef cop = ctx->copy_op; - ObjectState& obs = ctx->new_obs; - ObjectStore::Transaction& t = ctx->op_t; + ObjectState& obs = cop->obc->obs; - if (!obs.exists) { - ctx->delta_stats.num_objects++; - obs.exists = true; - } else { + if (obs.exists) { t.remove(coll, obs.oi.soid); } @@ -4540,9 +4536,25 @@ int ReplicatedPG::finish_copy(OpContext *ctx) _write_copy_chunk(cop, &t); t.collection_move_rename(cop->temp_coll, cop->temp_oid, coll, obs.oi.soid); pgbackend->clear_temp_obj(cop->temp_oid); + } +} + +int ReplicatedPG::finish_copy(OpContext *ctx) +{ + CopyOpRef cop = ctx->copy_op; + ObjectState& obs = ctx->new_obs; + ObjectStore::Transaction& t = ctx->op_t; + + if (!ctx->obs->exists) { + ctx->delta_stats.num_objects++; + obs.exists = true; + } + if (cop->temp_cursor.is_initial()) { ctx->discard_temp_oid = cop->temp_oid; } + _build_finish_copy_transaction(cop, t); + interval_set ch; if (obs.oi.size > 0) ch.insert(0, obs.oi.size); diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index e4f6848d6c0..c4384e34057 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -731,6 +731,8 @@ protected: void process_copy_chunk(hobject_t oid, tid_t tid, int r); void _write_copy_chunk(CopyOpRef cop, ObjectStore::Transaction *t); void _copy_some(ObjectContextRef obc, CopyOpRef cop); + void _build_finish_copy_transaction(CopyOpRef cop, + ObjectStore::Transaction& t); int finish_copy(OpContext *ctx); void cancel_copy(CopyOpRef cop); void cancel_copy_ops(); -- cgit v1.2.1 From 0b472766f11e3bf30012d2958bf0564aa9354e17 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Tue, 1 Oct 2013 12:49:04 -0700 Subject: ReplicatedPG: copy: start defining CopyCallback structures Outline the basic interfaces we're going to use, and implement the more obvious ones. Signed-off-by: Greg Farnum --- src/osd/ReplicatedPG.h | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index c4384e34057..a93f53be414 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -93,6 +93,7 @@ public: * state associated with a copy operation */ struct OpContext; + class CopyCallback; struct CopyOp { OpContext *ctx; @@ -126,6 +127,62 @@ public: }; typedef boost::shared_ptr CopyOpRef; + /** + * The CopyCallback class defines an interface for completions to the + * copy_start code. Users of the copy infrastructure must implement + * one and give an instance of the class to start_copy. 
+ * In particular, + * 1) Once the copy code has placed data in the temp object, it calls + * the data_in_temp_obj() function. + * 2) If everything has succeeded, it may call copy_complete_ops() and + * pass in a Transaction which contains the ops that must be executed + * in order to complete the copy. The implementer must make sure these ops + * are executed if they are provided (at present, they are). + * 3) If everything has succeeded, it will call set_data_size() with the + * size of the copied object + * 4) It will call finish(). + * + * The implementer is responsible for making sure that the CopyCallback + * can associate itself with the correct copy operation. The presence + * of copy_complete_ops ensures that write operations can be performed + * atomically with the copy being completed (which doing them in separate + * transactions would not allow); if you are doing the copy for a read + * op you will have to generate a separate op to finish the copy with. + */ + class CopyCallback : public Context { + protected: + bool data_in_temp; + uint64_t data_size; + int result_code; + + CopyCallback() : data_in_temp(false), data_size((uint64_t)-1), + result_code(0) {} + virtual void finish(int r) { result_code = r; } + public: + /// Give the CopyCallback ops to perform to complete the copy + virtual void copy_complete_ops(ObjectStore::Transaction& t) = 0; + /// Tell the CopyCallback that there is now data in the temp object + virtual void data_in_temp_obj() { data_in_temp = true; }; + bool is_temp_obj_used() { return data_in_temp; } + /// Provide the final size of the copied object to the CopyCallback + virtual void set_data_size(uint64_t size) { data_size = size; } + uint64_t get_data_size() { return data_size; } + int get_result() { return result_code; } + virtual ~CopyCallback() {}; + }; + + class CopyFromCallback: public CopyCallback { + protected: + virtual void finish(int r) {} + public: + OpContext *ctx; + hobject_t temp_obj; + CopyFromCallback(OpContext *ctx_, const hobject_t& temp_obj_) : + ctx(ctx_), temp_obj(temp_obj_) {} + void copy_complete_ops(ObjectStore::Transaction& t); + ~CopyFromCallback() {} + }; + boost::scoped_ptr<PGBackend> pgbackend; /// Listener methods -- cgit v1.2.1 From 613841a6703bbf198b971dd3fdeda15446cabe82 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Tue, 1 Oct 2013 13:13:44 -0700 Subject: ReplicatedPG: copy: add CopyCallback pointer to CopyOp, and set it up We'll start using it in the next commit; eventually we can use the interfaces we're putting there to replace our link to the OpContext.
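To see what satisfying the contract in the CopyCallback comment above might look like for a future non-copyfrom user, here is a minimal standalone sketch; the Transaction stand-in and ReadSideCopyCallback are hypothetical illustrations, not Ceph code::

    #include <cstdint>
    #include <iostream>

    struct Transaction {};  // stand-in for ObjectStore::Transaction

    class CopyCallback {    // trimmed copy of the interface above
    protected:
      uint64_t data_size;
      int result_code;
      CopyCallback() : data_size((uint64_t)-1), result_code(0) {}
      virtual void finish(int r) { result_code = r; }
    public:
      virtual void copy_complete_ops(Transaction& t) = 0;
      virtual void set_data_size(uint64_t size) { data_size = size; }
      void complete(int r) { finish(r); }
      virtual ~CopyCallback() {}
    };

    // A read-side user has no client write to ride along with, so it must
    // stash the completion ops and submit them as its own transaction later
    // (the last paragraph of the comment above).
    class ReadSideCopyCallback : public CopyCallback {
      Transaction final_tx;
      void finish(int r) override {
        result_code = r;
        if (r >= 0)
          std::cout << "would submit final_tx as a separate repop\n";
      }
    public:
      void copy_complete_ops(Transaction& t) override { final_tx = t; }
    };

    int main() {
      ReadSideCopyCallback cb;
      Transaction t;
      cb.copy_complete_ops(t);  // 2) ops needed to complete the copy
      cb.set_data_size(42);     // 3) final size of the copied object
      cb.complete(0);           // 4) finish()
    }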
Signed-off-by: Greg Farnum --- src/osd/ReplicatedPG.cc | 7 ++++--- src/osd/ReplicatedPG.h | 9 +++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index efc1cf8f603..f83f2312bf3 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -3765,7 +3765,8 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) break; } hobject_t temp_target = generate_temp_object(); - result = start_copy(ctx, ctx->obc, src, src_oloc, src_version, + CopyFromCallback *cb = new CopyFromCallback(ctx, temp_target); + result = start_copy(ctx, cb, ctx->obc, src, src_oloc, src_version, temp_target); if (result < 0) goto fail; @@ -4381,7 +4382,7 @@ struct C_Copyfrom : public Context { } }; -int ReplicatedPG::start_copy(OpContext *ctx, ObjectContextRef obc, +int ReplicatedPG::start_copy(OpContext *ctx, CopyCallback *cb, ObjectContextRef obc, hobject_t src, object_locator_t oloc, version_t version, const hobject_t& temp_dest_oid) { @@ -4398,7 +4399,7 @@ int ReplicatedPG::start_copy(OpContext *ctx, ObjectContextRef obc, cancel_copy(cop); } - CopyOpRef cop(new CopyOp(ctx, obc, src, oloc, version, temp_dest_oid)); + CopyOpRef cop(new CopyOp(ctx, cb, obc, src, oloc, version, temp_dest_oid)); copy_ops[dest] = cop; ctx->copy_op = cop; ++obc->copyfrom_readside; diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index a93f53be414..6ca252214c0 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -97,6 +97,7 @@ public: struct CopyOp { OpContext *ctx; + CopyCallback *cb; ObjectContextRef obc; hobject_t src; object_locator_t oloc; @@ -116,9 +117,9 @@ public: hobject_t temp_oid; object_copy_cursor_t temp_cursor; - CopyOp(OpContext *c, ObjectContextRef _obc, hobject_t s, object_locator_t l, + CopyOp(OpContext *c, CopyCallback *cb_, ObjectContextRef _obc, hobject_t s, object_locator_t l, version_t v, const hobject_t& dest) - : ctx(c), obc(_obc), src(s), oloc(l), version(v), + : ctx(c), cb(cb_), obc(_obc), src(s), oloc(l), version(v), objecter_tid(0), size(0), rval(-1), @@ -179,7 +180,7 @@ public: hobject_t temp_obj; CopyFromCallback(OpContext *ctx_, const hobject_t& temp_obj_) : ctx(ctx_), temp_obj(temp_obj_) {} - void copy_complete_ops(ObjectStore::Transaction& t); + void copy_complete_ops(ObjectStore::Transaction& t) {} ~CopyFromCallback() {} }; @@ -782,7 +783,7 @@ protected: // -- copyfrom -- map copy_ops; - int start_copy(OpContext *ctx, ObjectContextRef obc, hobject_t src, + int start_copy(OpContext *ctx, CopyCallback *cb, ObjectContextRef obc, hobject_t src, object_locator_t oloc, version_t version, const hobject_t& temp_dest_oid); void process_copy_chunk(hobject_t oid, tid_t tid, int r); -- cgit v1.2.1 From da1b9b6c107d34392d24812da1501dd99cc483bf Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Tue, 1 Oct 2013 15:45:10 -0700 Subject: ReplicatedPG: copy: implement CopyFromCallback::finish, remove CopyOp::ctx We implement enough of the CopyFromCallback that CopyOp no longer needs a direct reference to the OpContext, so we remove it and replace all references with calls to cop->cb->complete(). 
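Note that a Context's complete(r) runs finish(r) and then deletes the Context, so each callback fires exactly once and owns its own cleanup; that is why the PG code can stop deleting the OpContext itself and simply hand -ECANCELED (or the copy result) to cop->cb->complete(). A standalone sketch of that one-shot pattern, with a stand-in for the real Context class::

    #include <cerrno>
    #include <iostream>

    struct Context {                 // stand-in mirroring Ceph's Context
      virtual void finish(int r) = 0;
      void complete(int r) { finish(r); delete this; }  // one-shot
      virtual ~Context() {}
    };

    struct CopyFromCb : Context {
      void finish(int r) override {
        if (r == -ECANCELED)
          std::cout << "cancelled; client will resend\n";
        else if (r < 0)
          std::cout << "error " << r << "\n";
        else
          std::cout << "success; would run execute_ctx here\n";
      }
    };

    int main() {
      Context *cb = new CopyFromCb;  // owned by whoever completes it
      cb->complete(-ECANCELED);      // as cancel_copy() now does
    }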
Signed-off-by: Greg Farnum --- src/osd/ReplicatedPG.cc | 12 ++++-------- src/osd/ReplicatedPG.h | 26 ++++++++++++++++++++++---- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index f83f2312bf3..8d8dde1b365 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -4399,7 +4399,7 @@ int ReplicatedPG::start_copy(OpContext *ctx, CopyCallback *cb, ObjectContextRef cancel_copy(cop); } - CopyOpRef cop(new CopyOp(ctx, cb, obc, src, oloc, version, temp_dest_oid)); + CopyOpRef cop(new CopyOp(cb, obc, src, oloc, version, temp_dest_oid)); copy_ops[dest] = cop; ctx->copy_op = cop; ++obc->copyfrom_readside; @@ -4486,11 +4486,10 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, tid_t tid, int r) } dout(20) << __func__ << " complete; committing" << dendl; - execute_ctx(cop->ctx); + cop->cb->complete(cop->rval); copy_ops.erase(obc->obs.oi.soid); --obc->copyfrom_readside; - cop->ctx->copy_op.reset(); kick_object_context_blocked(obc); } @@ -4574,8 +4573,7 @@ int ReplicatedPG::finish_copy(OpContext *ctx) void ReplicatedPG::cancel_copy(CopyOpRef cop) { - OpContext *ctx = cop->ctx; - dout(10) << __func__ << " " << ctx->obc->obs.oi.soid << " ctx " << ctx + dout(10) << __func__ << " " << cop->obc->obs.oi.soid << " from " << cop->src << " " << cop->oloc << " v" << cop->version << dendl; @@ -4587,11 +4585,9 @@ void ReplicatedPG::cancel_copy(CopyOpRef cop) copy_ops.erase(cop->obc->obs.oi.soid); --cop->obc->copyfrom_readside; - ctx->copy_op.reset(); kick_object_context_blocked(cop->obc); - - delete ctx; + cop->cb->complete(-ECANCELED); } void ReplicatedPG::cancel_copy_ops() diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 6ca252214c0..00611104555 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -96,7 +96,6 @@ public: class CopyCallback; struct CopyOp { - OpContext *ctx; CopyCallback *cb; ObjectContextRef obc; hobject_t src; @@ -117,9 +116,9 @@ public: hobject_t temp_oid; object_copy_cursor_t temp_cursor; - CopyOp(OpContext *c, CopyCallback *cb_, ObjectContextRef _obc, hobject_t s, object_locator_t l, + CopyOp(CopyCallback *cb_, ObjectContextRef _obc, hobject_t s, object_locator_t l, version_t v, const hobject_t& dest) - : ctx(c), cb(cb_), obc(_obc), src(s), oloc(l), version(v), + : cb(cb_), obc(_obc), src(s), oloc(l), version(v), objecter_tid(0), size(0), rval(-1), @@ -158,6 +157,10 @@ public: CopyCallback() : data_in_temp(false), data_size((uint64_t)-1), result_code(0) {} + /** + * @param r The copy return code. 0 for success; -ECANCELED if + * the operation was cancelled by the local OSD; -errno for other issues.
+ */ virtual void finish(int r) { result_code = r; } public: /// Give the CopyCallback ops to perform to complete the copy @@ -174,7 +177,21 @@ public: class CopyFromCallback: public CopyCallback { protected: - virtual void finish(int r) {} + virtual void finish(int r) { + result_code = r; + if (r >= 0) { + ctx->pg->execute_ctx(ctx); + } + ctx->copy_op.reset(); + ctx->copy_cb = NULL; + if (r < 0) { + if (r == -ECANCELED) { // toss it out; client resends + delete ctx; + } else { + ctx->pg->osd->reply_op_error(ctx->op, r); + } + } + } public: OpContext *ctx; hobject_t temp_obj; @@ -183,6 +200,7 @@ public: void copy_complete_ops(ObjectStore::Transaction& t) {} ~CopyFromCallback() {} }; + friend class CopyFromCallback; boost::scoped_ptr pgbackend; -- cgit v1.2.1 From 18fcd91319be2be11e643eb40c5bd0beee21cf51 Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Tue, 1 Oct 2013 14:28:58 +0100 Subject: test: test_store_tool: add 'crc ' command Returns the CRC of contents for a given key with a given prefix. Signed-off-by: Joao Eduardo Luis --- .../ObjectMap/test_store_tool/test_store_tool.cc | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/test/ObjectMap/test_store_tool/test_store_tool.cc b/src/test/ObjectMap/test_store_tool/test_store_tool.cc index f81598ccfb8..66d55bee587 100644 --- a/src/test/ObjectMap/test_store_tool/test_store_tool.cc +++ b/src/test/ObjectMap/test_store_tool/test_store_tool.cc @@ -111,7 +111,7 @@ void usage(const char *pname) << " list [prefix]\n" << " exists [key]\n" << " get \n" - << " verify \n" + << " crc \n" << " get-size\n" << std::endl; } @@ -183,8 +183,23 @@ int main(int argc, const char *argv[]) bl.hexdump(os); std::cout << os.str() << std::endl; - } else if (cmd == "verify") { - assert(0); + } else if (cmd == "crc") { + if (argc < 5) { + usage(argv[0]); + return 1; + } + string prefix(argv[3]); + string key(argv[4]); + + bool exists = false; + bufferlist bl = st.get(prefix, key, exists); + std::cout << "(" << prefix << ", " << key << ") "; + if (!exists) { + std::cout << " does not exist" << std::endl; + return 1; + } + std::cout << " crc " << bl.crc32c(0) << std::endl; + } else if (cmd == "get-size") { std::cout << "estimated store size: " << st.get_size() << std::endl; } else { -- cgit v1.2.1 From 398249a05f762f92a53f5add568e181a954a3f28 Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Wed, 2 Oct 2013 01:22:40 +0100 Subject: test: test_store_tool: optionally output value crc when listing keys Signed-off-by: Joao Eduardo Luis --- src/test/ObjectMap/test_store_tool/test_store_tool.cc | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/test/ObjectMap/test_store_tool/test_store_tool.cc b/src/test/ObjectMap/test_store_tool/test_store_tool.cc index 66d55bee587..7d943c9ca08 100644 --- a/src/test/ObjectMap/test_store_tool/test_store_tool.cc +++ b/src/test/ObjectMap/test_store_tool/test_store_tool.cc @@ -38,7 +38,7 @@ class StoreTool db.reset(db_ptr); } - void list(const string &prefix) { + void list(const string &prefix, const bool do_crc) { KeyValueDB::WholeSpaceIterator iter = db->get_iterator(); if (prefix.empty()) @@ -51,7 +51,11 @@ class StoreTool if (!prefix.empty() && (rk.first != prefix)) break; - std::cout << rk.first << ":" << rk.second << std::endl; + std::cout << rk.first << ":" << rk.second; + if (do_crc) { + std::cout << " (" << iter->value().crc32c(0) << ")"; + } + std::cout << std::endl; iter->next(); } } @@ -109,6 +113,7 @@ void usage(const char *pname) << "\n" << 
"Commands:\n" << " list [prefix]\n" + << " list-crc [prefix]\n" << " exists [key]\n" << " get \n" << " crc \n" @@ -140,12 +145,14 @@ int main(int argc, const char *argv[]) StoreTool st(path); - if (cmd == "list") { + if (cmd == "list" || cmd == "list-crc") { string prefix; if (argc > 3) prefix = argv[3]; - st.list(prefix); + bool do_crc = (cmd == "list-crc"); + + st.list(prefix, do_crc); } else if (cmd == "exists") { string key; -- cgit v1.2.1 From dfea81e77a3242c8232b99c1df1b88f6aff22866 Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Wed, 2 Oct 2013 01:30:19 +0100 Subject: ceph_test_store_tool: add 'set prefix key' feature Allow reading from a file. See --help for more info. Signed-off-by: Joao Eduardo Luis --- src/Makefile.am | 2 +- .../ObjectMap/test_store_tool/test_store_tool.cc | 54 +++++++++++++++++++++- 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/src/Makefile.am b/src/Makefile.am index 3bdec278c6f..d6c94efc25b 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1269,7 +1269,7 @@ ceph_test_keyvaluedb_iterators_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} ${ bin_DEBUGPROGRAMS += ceph_test_keyvaluedb_iterators ceph_test_store_tool_SOURCES = test/ObjectMap/test_store_tool/test_store_tool.cc \ - os/LevelDBStore.cc + os/LevelDBStore.cc common/strtol.cc ceph_test_store_tool_LDFLAGS = ${AM_LDFLAGS} ceph_test_store_tool_LDADD = $(LIBOS_LDA) $(LIBGLOBAL_LDA) ceph_test_store_tool_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS} diff --git a/src/test/ObjectMap/test_store_tool/test_store_tool.cc b/src/test/ObjectMap/test_store_tool/test_store_tool.cc index 7d943c9ca08..8fcf3f30e82 100644 --- a/src/test/ObjectMap/test_store_tool/test_store_tool.cc +++ b/src/test/ObjectMap/test_store_tool/test_store_tool.cc @@ -24,6 +24,7 @@ #include "common/errno.h" #include "common/safe_io.h" #include "common/config.h" +#include "common/strtol.h" using namespace std; @@ -83,7 +84,7 @@ class StoreTool assert(!prefix.empty() && !key.empty()); map result; - set keys; + std::set keys; keys.insert(key); db->get(prefix, keys, &result); @@ -105,6 +106,18 @@ class StoreTool std::cout << "total: " << s << std::endl; return s; } + + bool set(const string &prefix, const string &key, bufferlist &val) { + assert(!prefix.empty()); + assert(!key.empty()); + assert(val.length() > 0); + + KeyValueDB::Transaction tx = db->get_transaction(); + tx->set(prefix, key, val); + int ret = db->submit_transaction_sync(tx); + + return (ret == 0); + } }; void usage(const char *pname) @@ -118,6 +131,7 @@ void usage(const char *pname) << " get \n" << " crc \n" << " get-size\n" + << " set [ver |in ]\n" << std::endl; } @@ -209,6 +223,44 @@ int main(int argc, const char *argv[]) } else if (cmd == "get-size") { std::cout << "estimated store size: " << st.get_size() << std::endl; + + } else if (cmd == "set") { + if (argc < 7) { + usage(argv[0]); + return 1; + } + string prefix(argv[3]); + string key(argv[4]); + string subcmd(argv[5]); + + bufferlist val; + string errstr; + if (subcmd == "ver") { + version_t v = (version_t) strict_strtoll(argv[6], 10, &errstr); + if (!errstr.empty()) { + std::cerr << "error reading version: " << errstr << std::endl; + return 1; + } + ::encode(v, val); + } else if (subcmd == "in") { + int ret = val.read_file(argv[6], &errstr); + if (ret < 0 || !errstr.empty()) { + std::cerr << "error reading file: " << errstr << std::endl; + return 1; + } + } else { + std::cerr << "unrecognized subcommand '" << subcmd << "'" << std::endl; + usage(argv[0]); + return 1; + } + + bool ret = st.set(prefix, 
key, val); + if (!ret) { + std::cerr << "error setting (" + << prefix << "," << key << ")" << std::endl; + return 1; + } + } else { std::cerr << "Unrecognized command: " << cmd << std::endl; return 1; -- cgit v1.2.1 From fbeabccaf060fa79a65ffa61a0a1ac40100e1451 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 1 Oct 2013 21:07:49 -0700 Subject: os/FileStore: report errors from _crc_load_... and _crc_save Signed-off-by: Sage Weil --- src/os/GenericFileStoreBackend.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/os/GenericFileStoreBackend.cc b/src/os/GenericFileStoreBackend.cc index dad1a9c220c..81d896a0943 100644 --- a/src/os/GenericFileStoreBackend.cc +++ b/src/os/GenericFileStoreBackend.cc @@ -263,6 +263,7 @@ int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm) { char buf[100]; bufferptr bp; + int r = 0; int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf)); if (l == -ENODATA) { return 0; @@ -284,16 +285,21 @@ int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm) ::decode(*cm, p); } catch (buffer::error &e) { - return -EIO; + r = -EIO; } - return 0; + if (r < 0) + derr << __func__ << " got " << cpp_strerror(r) << dendl; + return r; } int GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm) { bufferlist bl; ::encode(*cm, bl); - return chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length()); + int r = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length()); + if (r < 0) + derr << __func__ << " got " << cpp_strerror(r) << dendl; + return r; } int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl) -- cgit v1.2.1 From d2cb2bf6bac83ac6db9df6cb876317d30e7493cc Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 2 Oct 2013 11:43:12 +0800 Subject: mds: return -EAGAIN if standby replay falls behind standby replay may fall behind and get -ENOENT when reading the journal. return -EAGAIN in this case, it makes the MDS respawn itself. fixes: #5458 Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- src/mds/MDLog.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc index bd89da71495..cacbebfd3f6 100644 --- a/src/mds/MDLog.cc +++ b/src/mds/MDLog.cc @@ -499,7 +499,11 @@ void MDLog::_replay_thread() if (journaler->get_error()) { r = journaler->get_error(); dout(0) << "_replay journaler got error " << r << ", aborting" << dendl; - if (r == -EINVAL) { + if (r == -ENOENT) { + // journal has been trimmed by somebody else? + assert(journaler->is_readonly()); + r = -EAGAIN; + } else if (r == -EINVAL) { if (journaler->get_read_pos() < journaler->get_expire_pos()) { // this should only happen if you're following somebody else assert(journaler->is_readonly()); -- cgit v1.2.1 From a96b12f03a5bf3189cf758379238c4c57202c4a7 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Tue, 1 Oct 2013 16:41:22 -0700 Subject: ReplicatedPG: copy: use CopyCallback instead of CopyOp in OpContext In order to make this happen, we make the switch to generate the complete transaction in the generic copy code and save it into the Callback. Then in finish_copy() we just take that transaction and prepend it to the existing transaction. With that change, and by making use of the existing CopyCallback data, we no longer need to access the CopyOp from the OpContext, so we can remove it. Hurray, the pipelines are now independent! 
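The prepend is done with a swap-then-append idiom, since ObjectStore::Transaction supports append but not prepend: after the swap, ctx->op_t holds the copy-completion ops and final_tx holds the original client ops, which append then puts back at the end. The same trick on a plain vector, as a standalone illustration (stand-in types, not Ceph code)::

    #include <cassert>
    #include <string>
    #include <vector>

    using Tx = std::vector<std::string>;  // stand-in for a transaction

    // Leaves t's ops in front of the ops already queued in op_t.
    void prepend_via_swap(Tx& op_t, Tx& t) {
      op_t.swap(t);                                 // op_t = copy ops
      op_t.insert(op_t.end(), t.begin(), t.end());  // re-append old ops
    }

    int main() {
      Tx op_t{"client write op"};
      Tx final_tx{"move temp obj into place"};
      prepend_via_swap(op_t, final_tx);
      Tx expected{"move temp obj into place", "client write op"};
      assert(op_t == expected);
    }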
Signed-off-by: Greg Farnum --- src/osd/ReplicatedPG.cc | 36 ++++++++++++++++++++---------------- src/osd/ReplicatedPG.h | 11 ++++++----- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 8d8dde1b365..35e28c18361 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -3752,7 +3752,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) result = -EINVAL; goto fail; } - if (!ctx->copy_op) { + if (!ctx->copy_cb) { // start pg_t raw_pg; get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg); @@ -3766,15 +3766,16 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) } hobject_t temp_target = generate_temp_object(); CopyFromCallback *cb = new CopyFromCallback(ctx, temp_target); - result = start_copy(ctx, cb, ctx->obc, src, src_oloc, src_version, + ctx->copy_cb = cb; + result = start_copy(cb, ctx->obc, src, src_oloc, src_version, temp_target); if (result < 0) goto fail; result = -EINPROGRESS; } else { // finish - result = ctx->copy_op->rval; - if (ctx->copy_op->rval >= 0) { //success! + result = ctx->copy_cb->get_result(); + if (result >= 0) { //success! result = finish_copy(ctx); } } @@ -4382,12 +4383,12 @@ struct C_Copyfrom : public Context { } }; -int ReplicatedPG::start_copy(OpContext *ctx, CopyCallback *cb, ObjectContextRef obc, +int ReplicatedPG::start_copy(CopyCallback *cb, ObjectContextRef obc, hobject_t src, object_locator_t oloc, version_t version, const hobject_t& temp_dest_oid) { - const hobject_t& dest = ctx->obs->oi.soid; - dout(10) << __func__ << " " << dest << " ctx " << ctx + const hobject_t& dest = obc->obs.oi.soid; + dout(10) << __func__ << " " << dest << " from " << src << " " << oloc << " v" << version << dendl; @@ -4401,7 +4402,6 @@ int ReplicatedPG::start_copy(OpContext *ctx, CopyCallback *cb, ObjectContextRef CopyOpRef cop(new CopyOp(cb, obc, src, oloc, version, temp_dest_oid)); copy_ops[dest] = cop; - ctx->copy_op = cop; ++obc->copyfrom_readside; _copy_some(obc, cop); @@ -4482,6 +4482,11 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, tid_t tid, int r) dout(10) << __func__ << " fetching more" << dendl; _copy_some(obc, cop); return; + } else { + ObjectStore::Transaction t; + _build_finish_copy_transaction(cop, t); + cop->cb->copy_complete_ops(t); + cop->cb->set_data_size(cop->temp_cursor.data_offset); } } @@ -4541,28 +4546,27 @@ void ReplicatedPG::_build_finish_copy_transaction(CopyOpRef cop, int ReplicatedPG::finish_copy(OpContext *ctx) { - CopyOpRef cop = ctx->copy_op; ObjectState& obs = ctx->new_obs; - ObjectStore::Transaction& t = ctx->op_t; + CopyFromCallback *cb = static_cast(ctx->copy_cb); if (!ctx->obs->exists) { ctx->delta_stats.num_objects++; obs.exists = true; } - if (cop->temp_cursor.is_initial()) { - ctx->discard_temp_oid = cop->temp_oid; + if (cb->is_temp_obj_used()) { + ctx->discard_temp_oid = cb->temp_obj; } - - _build_finish_copy_transaction(cop, t); + ctx->op_t.swap(cb->final_tx); + ctx->op_t.append(cb->final_tx); interval_set ch; if (obs.oi.size > 0) ch.insert(0, obs.oi.size); ctx->modified_ranges.union_of(ch); - if (cop->cursor.data_offset != obs.oi.size) { + if (cb->get_data_size() != obs.oi.size) { ctx->delta_stats.num_bytes -= obs.oi.size; - obs.oi.size = cop->cursor.data_offset; + obs.oi.size = cb->get_data_size(); ctx->delta_stats.num_bytes += obs.oi.size; } ctx->delta_stats.num_wr++; diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 00611104555..d5ef9b26ca3 100644 --- a/src/osd/ReplicatedPG.h +++ 
b/src/osd/ReplicatedPG.h @@ -182,7 +182,6 @@ public: if (r >= 0) { ctx->pg->execute_ctx(ctx); } - ctx->copy_op.reset(); ctx->copy_cb = NULL; if (r < 0) { if (r == -ECANCELED) { // toss it out; client resends @@ -195,9 +194,10 @@ public: public: OpContext *ctx; hobject_t temp_obj; + ObjectStore::Transaction final_tx; CopyFromCallback(OpContext *ctx_, const hobject_t& temp_obj_) : ctx(ctx_), temp_obj(temp_obj_) {} - void copy_complete_ops(ObjectStore::Transaction& t) {} + void copy_complete_ops(ObjectStore::Transaction& t) { final_tx.swap(t); } ~CopyFromCallback() {} }; friend class CopyFromCallback; @@ -375,7 +375,7 @@ public: int num_read; ///< count read ops int num_write; ///< count update ops - CopyOpRef copy_op; + CopyCallback *copy_cb; hobject_t new_temp_oid, discard_temp_oid; ///< temp objects we should start/stop tracking @@ -392,7 +392,8 @@ public: current_osd_subop_num(0), data_off(0), reply(NULL), pg(_pg), num_read(0), - num_write(0) { + num_write(0), + copy_cb(NULL) { if (_ssc) { new_snapset = _ssc->snapset; snapset = &_ssc->snapset; @@ -801,7 +802,7 @@ protected: // -- copyfrom -- map copy_ops; - int start_copy(OpContext *ctx, CopyCallback *cb, ObjectContextRef obc, hobject_t src, + int start_copy(CopyCallback *cb, ObjectContextRef obc, hobject_t src, object_locator_t oloc, version_t version, const hobject_t& temp_dest_oid); void process_copy_chunk(hobject_t oid, tid_t tid, int r); -- cgit v1.2.1 From d29be45319204d4f1be62404918a73bcbc6d543e Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Tue, 1 Oct 2013 16:42:55 -0700 Subject: ReplicatedPG: rename finish_copy -> finish_copyfrom Signed-off-by: Greg Farnum --- src/osd/ReplicatedPG.cc | 4 ++-- src/osd/ReplicatedPG.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 35e28c18361..6118a25d510 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -3776,7 +3776,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) // finish result = ctx->copy_cb->get_result(); if (result >= 0) { //success! 
- result = finish_copy(ctx); + result = finish_copyfrom(ctx); } } } @@ -4544,7 +4544,7 @@ void ReplicatedPG::_build_finish_copy_transaction(CopyOpRef cop, } } -int ReplicatedPG::finish_copy(OpContext *ctx) +int ReplicatedPG::finish_copyfrom(OpContext *ctx) { ObjectState& obs = ctx->new_obs; CopyFromCallback *cb = static_cast(ctx->copy_cb); diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index d5ef9b26ca3..f7e677f7b84 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -810,7 +810,7 @@ protected: void _copy_some(ObjectContextRef obc, CopyOpRef cop); void _build_finish_copy_transaction(CopyOpRef cop, ObjectStore::Transaction& t); - int finish_copy(OpContext *ctx); + int finish_copyfrom(OpContext *ctx); void cancel_copy(CopyOpRef cop); void cancel_copy_ops(); -- cgit v1.2.1 From 2d7dced184defeb46a462d89d89b3be529b356b6 Mon Sep 17 00:00:00 2001 From: Dan Mick Date: Tue, 1 Oct 2013 22:23:24 -0700 Subject: mon/PGMap.cc: don't output header for pg dump_stuck if nothing stuck Formatted output is already correct (no header) Fixes: #4577 Signed-off-by: Dan Mick --- src/mon/PGMap.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index e9a35c6b8ab..4be39aba902 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -701,7 +701,8 @@ void PGMap::dump_stuck_plain(ostream& ss, PGMap::StuckPG type, utime_t cutoff) c { hash_map stuck_pg_stats; get_stuck_stats(type, cutoff, stuck_pg_stats); - dump_pg_stats_plain(ss, stuck_pg_stats); + if (!stuck_pg_stats.empty()) + dump_pg_stats_plain(ss, stuck_pg_stats); } void PGMap::dump_osd_perf_stats(Formatter *f) const -- cgit v1.2.1 From 16fbdcdf9fd4fc28e67104f2cb20ff0e4043d3aa Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Fri, 27 Sep 2013 15:34:18 +0200 Subject: common: ghobject get_filestore_key* use hobject counterpart The get_filestore_key* methods are changed to just call the corresponding hobject methods instead of providing an identical implementation. Reviewed-by: David Zafman Signed-off-by: Loic Dachary --- src/common/hobject.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/common/hobject.h b/src/common/hobject.h index e483b664347..46fc67b893a 100644 --- a/src/common/hobject.h +++ b/src/common/hobject.h @@ -241,14 +241,10 @@ public: return ret; } filestore_hobject_key_t get_filestore_key_u32() const { - assert(!hobj.max); - return hobj._reverse_nibbles(hobj.hash); + return hobj.get_filestore_key_u32(); } filestore_hobject_key_t get_filestore_key() const { - if (hobj.max) - return 0x100000000ull; - else - return get_filestore_key_u32(); + return hobj.get_filestore_key(); } // maximum sorted value. -- cgit v1.2.1 From d1c1f3eb9098e74e9483609ff4f90e8de83c4016 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Fri, 27 Sep 2013 19:09:23 +0200 Subject: common: document ghobject sort order rationale Intuition differs regarding the sort order of the ghobject shard and generation. Document the rationale for the chosen sort order. 
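Concretely, the chosen order is plain lexicographic tuple comparison with generation as the least significant key, so all generations of a given shard sort next to each other. A toy standalone sketch of what that means in practice (hobj omitted, types simplified)::

    #include <algorithm>
    #include <cassert>
    #include <tuple>
    #include <vector>

    struct G { int shard; int gen; };

    // Mirrors the WRITE_CMP_OPERATORS_3(ghobject_t, hobj, shard_id, generation)
    // ordering below: shard is compared before generation.
    bool operator<(const G& a, const G& b) {
      return std::tie(a.shard, a.gen) < std::tie(b.shard, b.gen);
    }

    int main() {
      std::vector<G> v{{1, 0}, {0, 1}, {0, 0}, {1, 1}};
      std::sort(v.begin(), v.end());
      // both generations of shard 0 come first, then shard 1's:
      assert(v[0].shard == 0 && v[1].shard == 0);
      assert(v[0].gen == 0 && v[1].gen == 1);
    }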
Reviewed-by: Samuel Just Reviewed-by: David Zafman Signed-off-by: Loic Dachary --- src/common/hobject.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/common/hobject.h b/src/common/hobject.h index 46fc67b893a..82eecf3bfc7 100644 --- a/src/common/hobject.h +++ b/src/common/hobject.h @@ -288,8 +288,12 @@ namespace __gnu_cxx { ostream& operator<<(ostream& out, const ghobject_t& o); -WRITE_EQ_OPERATORS_3(ghobject_t, hobj, generation, shard_id) -// sort ghobject_t's by +WRITE_EQ_OPERATORS_3(ghobject_t, hobj, shard_id, generation) +// sort ghobject_t's by +// +// Two objects which differ by generation are more related than +// two objects of the same generation which differ by shard. +// WRITE_CMP_OPERATORS_3(ghobject_t, hobj, shard_id, -- cgit v1.2.1 From ff4887324ab020d129f9aa64077eb8696802a576 Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Sun, 22 Sep 2013 18:40:48 +0200 Subject: ErasureCode: doc updates * Update to the current state of the ghobject implementation and the fact that they encode the shard_t. Although the pool also contains the shard id, it is less relevant to understand the implementation. * Update with the erasure code plugin infrastructure and the example plugin now in master. * Move jerasure to a separate page to be expanded and link it from the toc * Kill the partial read and write notes as it will probably not be implemented in the near future. Kill some of the notes because they are no longer relevant. * Add a definition for "chunk rank" * Reword, update schemas, fix typos. Signed-off-by: Loic Dachary --- doc/dev/osd_internals/erasure_coding.rst | 25 +- .../erasure_coding/developer_notes.rst | 258 +++++++-------------- doc/dev/osd_internals/erasure_coding/jerasure.rst | 22 ++ 3 files changed, 129 insertions(+), 176 deletions(-) create mode 100644 doc/dev/osd_internals/erasure_coding/jerasure.rst diff --git a/doc/dev/osd_internals/erasure_coding.rst b/doc/dev/osd_internals/erasure_coding.rst index cc1efe4b4bf..d3f19b6fb8e 100644 --- a/doc/dev/osd_internals/erasure_coding.rst +++ b/doc/dev/osd_internals/erasure_coding.rst @@ -3,8 +3,8 @@ Erasure Coded Placement Groups ============================== The documentation of the erasure coding implementation in Ceph was -created in July 2013. It is included in Ceph even before erasure -coding is available because it drives a number of architectural +created in July 2013. It is included in Ceph even before erasure coded +pools are available because it drives a number of architectural changes. It is meant to be updated to reflect the `progress of these architectural changes `_, up to the point where it becomes a reference of the erasure coding @@ -14,8 +14,14 @@ Glossary -------- *chunk* - when the encoding function is called, it returns chunks of the - same size. + when the encoding function is called, it returns chunks of the same + size. Data chunks which can be concatenated to reconstruct the original + object and coding chunks which can be used to rebuild a lost chunk. + +*chunk rank* + the index of a chunk when returned by the encoding function. The + rank of the first chunk is 0, the rank of the second chunk is 1 + etc. *stripe* when an object is too large to be encoded with a single call, called a stripe. *shard|strip* - the file that holds all chunks of a same rank for a given object. + an ordered sequence of chunks of the same rank from the same + object. For a given placement group, each OSD contains shards of + the same rank.
When dealing with objects that are encoded with a + single operation, *chunk* is sometimes used instead of *shard* + because the shard is made of a single chunk. -Example: +The definitions are illustrated as follows: :: OSD 40 OSD 33 @@ -53,6 +63,7 @@ Table of content .. toctree:: :maxdepth: 1 - High level design document Developer notes + Jerasure plugin + High level design document Draft PGBackend.h header diff --git a/doc/dev/osd_internals/erasure_coding/developer_notes.rst b/doc/dev/osd_internals/erasure_coding/developer_notes.rst index 2bc796c67e5..568b2b4634a 100644 --- a/doc/dev/osd_internals/erasure_coding/developer_notes.rst +++ b/doc/dev/osd_internals/erasure_coding/developer_notes.rst @@ -10,7 +10,7 @@ of the erasure code within Ceph. It is mostly based on examples being explained to demonstrate how things work. It is written as if the implementation is complete although it may not be the case. For instance the plugin system and the jerasure plugin are implemented but -the erasure code pool is not. +the erasure coded pool is not. Reading and writing encoded chunks from and to OSDs --------------------------------------------------- An erasure coded pool stores each object as K+M chunks. It is divided into K data chunks and M coding chunks. The pool is configured to have a size of K+M so that each chunk is stored in an OSD in the acting -set. The rank of the chunks is stored as `an attribute of the pool -`_ containing the object. +set. The rank of the chunk is stored as `an attribute of the object +`_. For instance an erasure coded pool is created to use five OSDs ( K+M = 5 ) and sustain the loss of two of them ( M = 2 ). @@ -33,9 +33,9 @@ coding chunks : the fourth with *YXY* and the fifth with *GQC*. Each chunk is stored in an OSD in the acting set. The chunks are stored in objects that have the same name ( *NYAN* ) but reside on different OSDs. The order in which the chunks were created must be preserved and -is stored as an attribute of the pool containing the object. Chunk -*1* contains *ABC* and is stored on *OSD5* while chunk *4* contains -*XYY* and is stored on *OSD3*. +is stored as an attribute of the object ( shard_t ), in addition to its +name. Chunk *1* contains *ABC* and is stored on *OSD5* while chunk *4* +contains *XYY* and is stored on *OSD3*. :: +--v---+ +--v---+ +--v---+ +--v---+ +--v---+ name | NYAN | | NYAN | | NYAN | | NYAN | | NYAN | +------+ +------+ +------+ +------+ +------+ - pool shard | 1 | | 2 | | 3 | | 4 | | 5 | +------+ +------+ +------+ +------+ +------+ content | ABC | | DEF | | GHI | | YXY | | QGC | +--+---+ +--+---+ +--+---+ +--+---+ +--+---+ @@ -85,10 +85,12 @@ When the object *NYAN* is read from the erasure coded pool, the decoding function reads three chunks : chunk *1* containing *ABC*, chunk *3* containing *GHI* and chunk *4* containing *YXY* and rebuild the original content of the object *ABCDEFGHI*. The decoding function -is informed that the chunks *2* and *5* are missing. The chunk *5* -could not be read because the *OSD4* is *out*. The decoding function -is called as soon as three chunks are read : *OSD2* was the slowest -and its chunk was not taken into account. +is informed that the chunks *2* and *5* are missing ( they are called +*erasures* ). The chunk *5* could not be read because the *OSD4* is +*out*.
The decoding function can be called as soon as three chunks are +read : *OSD2* was the slowest and its chunk was not taken into +account. + :: +-------------------+ @@ -110,17 +112,17 @@ and its chunk was not taken into account. +--+---+ +------+ +--+---+ +--+---+ name | NYAN | | NYAN | | NYAN | | NYAN | +------+ +------+ +------+ +------+ - pool shard | 1 | | 2 | | 3 | | 4 | + shard | 1 | | 2 | | 3 | | 4 | +------+ +------+ +------+ +------+ content | ABC | | DEF | | GHI | | YXY | +--+---+ +--+---+ +--+---+ +--+---+ - ^ ^ ^ ^ - | | | | - | | +--+---+ | - | | | OSD1 | | + ^ . ^ ^ + | TOO . | | + | SLOW . +--+---+ | + | ^ | OSD1 | | | | +------+ | | | +------+ | - | SLOW +-------| OSD2 | | + | +-------| OSD2 | | | +------+ | | +------+ | | | OSD3 |-----+ @@ -137,8 +139,9 @@ Interrupted full writes In an erasure coded pool the primary OSD in the up set receives all write operations. It is responsible for encoding the payload into K+M -chunks and send them to the OSDs in the up set. It is also responsible +chunks and sends them to the other OSDs. It is also responsible for maintaining an authoritative version of the placement group logs. + :: primary @@ -168,8 +171,8 @@ set of the placement group is made of *OSD 1*, *OSD 2* and *OSD 3*. An object has been encoded and stored in the OSDs : the chunk D1v1 (i.e. Data chunk number 1 version 1) is on *OSD 1*, D2v1 on *OSD 2* and C1v1 (i.e. Coding chunk number 1 version 1) on *OSD 3*. The -placement group logs on each OSD are in sync at epoch 1 version 1 -(i.e. 1,1). +placement group logs on each OSD are identical (i.e. 1,1). + :: primary @@ -196,21 +199,23 @@ placement group logs on each OSD are in sync at epoch 1 version 1 +-----------+ *OSD 1* is the primary and receives a WRITE FULL from a client, which -means the payload is to replace the object entirely instead of only -overwriting a portion of it. Version two of the object is created -to override version one. *OSD 1* encodes the payload into three -chunks : D1v2 (i.e. Data chunk number 1 version 2) will be on *OSD 1*, -D2v2 on *OSD 2* and C1v2 (i.e. Coding chunk number 1 version 2) on -*OSD 3*. Each chunk is sent to the target OSD, including the primary -OSD which is responsible for storing chunks in addition to handling -write operations and maintaining an authoritative version of the -placement group logs. When an OSD receives the message instructing it -to write the chunk, it also creates a new entry in the placement group -logs to reflect the change. For instance, as soon as *OSD 3* stores -*C1v2*, it adds the entry 1,2 ( i.e. epoch 1, version 2 ) to its -logs. Because the OSDs work asynchronously, some chunks may still be -in flight ( such as *D2v2* ) while others are acknowledged and on disk -( such as *C1v1* and *D1v1* ). :: +means the payload is to replace the object entirely instead of +overwriting a portion of it. Version two of the object is created to +override version one. *OSD 1* encodes the payload into three chunks : +D1v2 (i.e. Data chunk number 1 version 2) will be on *OSD 1*, D2v2 on +*OSD 2* and C1v2 (i.e. Coding chunk number 1 version 2) on *OSD +3*. Each chunk is sent to the target OSD, including the primary OSD +which is responsible for storing chunks in addition to handling write +operations and maintaining an authoritative version of the placement +group logs. When an OSD receives the message instructing it to write +the chunk, it also creates a new entry in the placement group logs to +reflect the change. 
For instance, as soon as *OSD 3* stores *C1v2*, it +adds the entry 1,2 ( i.e. epoch 1, version 2 ) to its logs. Because +the OSDs work asynchronously, some chunks may still be in flight ( +such as *D2v2* ) while others are acknowledged and on disk ( such as +*C1v1* and *D1v1* ). + +:: primary +---OSD 1---+ @@ -243,6 +248,7 @@ acting set and the logs' *last_complete* pointer can move from *1,1* to *1,2* and the files used to store the chunks of the previous version of the object can be removed : *D1v1* on *OSD 1*, *D2v1* on *OSD 2* and *C1v1* on *OSD 3*. + :: +---OSD 1---+ @@ -271,13 +277,14 @@ version of the object can be removed : *D1v1* on *OSD 1*, *D2v1* on But accidents happen. If *OSD 1* goes down while *D2v2* is still in flight, the object's version 2 is partially written : *OSD 3* has -one chunk but does not have enough to recover. It lost two chunks : -*D1v2* and *D2v2* but the erasure coding parameters K = 2 + M = 1 -requires that at least two chunks are available to rebuild the +one chunk but that is no not enough to recover. It lost two chunks : +*D1v2* and *D2v2* and the erasure coding parameters K = 2 + M = 1 +require that at least two chunks are available to rebuild the third. *OSD 4* becomes the new primary and finds that the *last_complete* log entry ( i.e. all objects before this entry were known to be available on all OSDs in the previous acting set ) is -*1,1* and will be the head of the new authoritative log. +*1,1* and that will be the head of the new authoritative log. + :: +---OSD 2---+ @@ -299,6 +306,7 @@ known to be available on all OSDs in the previous acting set ) is The log entry *1,2* found on *OSD 3* is divergent from the new authoritative log provided by *OSD 4* : it is discarded and the file containing the *C1v2* chunk is removed. + :: +---OSD 2---+ @@ -323,14 +331,14 @@ coding library during scrubbing and stored on the new primary *OSD 4*. Interrupted append ------------------ -An object is coded in stripes, either because they are too big or -because they are created with multiple operations instead of a single -full write. A single stripe will exist/exists in the case of a full -write, assuming the object size is not too large to encode in memory. -When appending to an existing object, the stripe size is retrieved -from the attributes of the object. It applies, for instance, when -*rgw* writes an object with sequence of append instead of a single -write. :: +An object is coded in stripes, either because it is too big or because +it is created with multiple write operations instead of a single full +write. When appending to an existing object, the stripe size is +retrieved from the attributes of the object. It applies, for instance, +when *rgw* writes an object with a sequence of appends instead of a +single full write. + +:: primary +---OSD 1---+ @@ -354,7 +362,7 @@ write. :: +-----------+ *OSD 1* is the primary and receives an APPEND from a client, meaning -the payload is to be appended at the end of the object. *OSD 1* +the payload is to be appended to the end of the object. *OSD 1* encodes the payload into three chunks : S2D1 (i.e. Stripe two data chunk number 1 ) will be in s1 ( shard 1 ) on *OSD 1*, S2D2 in s2 on *OSD 2* and S2C1 (i.e. Stripe two coding chunk number 1 ) in s3 on @@ -368,8 +376,8 @@ logs to reflect the change. For instance, as soon as *OSD 3* stores logs. The log entry also carries the nature of the operation: in this case 1,2 is an APPEND where 1,1 was a CREATE. 
Because the OSDs work asynchronously, some chunks may still be in flight ( such as *S2D2* ) -while others are acknowledged and on disk ( such as *S2D1* and *S2C1* -). +while others are acknowledged and on disk (such as *S2D1* and *S2C1*). + :: +---OSD 1---+ @@ -396,14 +404,16 @@ while others are acknowledged and on disk ( such as *S2D1* and *S2C1* +-----------+ If *OSD 1* goes down while *S2D2* is still in flight, the payload is -partially appended : s3 ( shard 3) in *OSD 3* has one chunk but does -not have enough to recover because s1 and s2 don't have it. Two chunks -were lost (*S2D1* and S2D2) but the erasure coding parameters K = 2 + -M = 1 requires that at least two chunks are available to rebuild the -third. *OSD 4* becomes the new primary and finds that the -*last_complete* log entry ( i.e. all objects before this entry were -known to be available on all OSDs in the previous acting set ) is -*1,1* and will be the head of the new authoritative log. :: +partially appended : s3 (shard 3) in *OSD 3* has one chunk but does +not have enough to recover. Two chunks were lost (*S2D1* and S2D2) but +the erasure coding parameters K = 2 + M = 1 requires that at least two +chunks are available to rebuild the third. *OSD 4* becomes the new +primary and finds that the *last_complete* log entry ( i.e. all +objects before this entry were known to be available on all OSDs in +the previous acting set ) is *1,1* and will be the head of the new +authoritative log. + +:: +---OSD 2---+ |+-s2-+ log | @@ -429,8 +439,6 @@ the stripe size. Erasure code library -------------------- -See also `the corresponding tracker issue `_ - Using `Reed-Solomon `_, with parameters K+M, object O is encoded by dividing it into chunks O1, O2, ... OM and computing coding chunks P1, P2, ... PK. Any K chunks @@ -443,8 +451,8 @@ Reading the original content of object O could be a simple concatenation of O1, O2, ... OM, because the plugins are using `systematic codes `_. Otherwise the chunks -must be given to the erasure code library to retrieve the content of -the object. +must be given to the erasure code library *decode* method to retrieve +the content of the object. Reed-Solomon is significantly more expensive to encode than fountain codes with the current `jerasure implementation @@ -462,10 +470,11 @@ functions ( for Cauchy or Liberation for instance ): smaller packets means more calls and more overhead. Although Reed-Solomon is provided as a default, Ceph uses it via an -`abstract API `_ designed to +`abstract API `_ designed to allow each pool to choose the plugin that implements it using `key=value pairs when creating the pool -`_. +`_. + :: ceph osd pool create \ @@ -473,18 +482,22 @@ allow each pool to choose the plugin that implements it using erasure-code-plugin= The ** is dynamically loaded from ** (defaults to -*/usr/lib/ceph/erasure-code* ) and expected to implement the -*int __erasure_code_init(char *plugin_name)* function -which is responsible for registering an object derived from -*ErasureCodePlugin* in the registry : +*/usr/lib/ceph/erasure-code* ) and expected to implement the *int +__erasure_code_init(char *plugin_name)* function which is responsible +for registering an object derived from *ErasureCodePlugin* in the +registry. 
The `ErasureCodePluginExample `_ plugin reads: + :: - ErasureCodePluginRegistry::add(plugin_name, - new ErasureCodePluginExample()); + ErasureCodePluginRegistry &instance = + ErasureCodePluginRegistry::instance(); + instance.add(plugin_name, new ErasureCodePluginExample()); The *ErasureCodePlugin* derived object must provide a factory method from which the concrete implementation of the *ErasureCodeInterface* -object can be generated: +object can be generated. The `ErasureCodePluginExample `_ plugin +reads: + :: virtual int factory(const map ¶meters, @@ -493,39 +506,23 @@ object can be generated: return 0; } -The *parameters* is the list of *key=value* pairs that were set when the pool -was created. Each *key* must be prefixed with erasure-code to avoid name collisions +The *parameters* argument is the list of *key=value* pairs that were +set when the pool was created. Each *key* must be prefixed with +*erasure-code* to avoid name collisions: + :: - ceph osd pool create \ + ceph osd pool create poolname 123 \ erasure-code-directory= \ # mandatory erasure-code-plugin=jerasure \ # mandatory erasure-code-m=10 \ # optional and plugin dependant erasure-code-k=3 \ # optional and plugin dependant erasure-code-technique=reed_sol_van \ # optional and plugin dependant -Erasure code jerasure plugin ----------------------------- - -The parameters interpreted by the jerasure plugin are: -:: - - ceph osd pool create \ - erasure-code-directory= \ # plugin directory absolute path - erasure-code-plugin=jerasure \ # plugin name (only jerasure) - erasure-code-k= \ # data chunks (default 2) - erasure-code-m= \ # coding chunks (default 2) - erasure-code-technique= \ # coding technique - -The coding techniques can be chosen among *reed_sol_van*, -*reed_sol_r6_op*, *cauchy_orig*, *cauchy_good*, *liberation*, -*blaum_roth* and *liber8tion*. - Scrubbing --------- See also `Refactor scrub to use PGBackend methods `_ - The simplest form of scrubbing is to check with each OSDs holding a chunk if it exists locally. If more thank M chunks are missing the object is marked as lost. If up to M chunks are missing they are @@ -547,13 +544,6 @@ built-in on a per block basis. Notes ----- -This document is a description of how erasure coding could be -implemented, it does not reflect the current state of the code -base. Possible optimizations are mentionned where relevant but the -first implementation should not include any of them: they are -presented to show that there is a path toward optimization starting -from simple minded implementation. - If the objects are large, it may be impractical to encode and decode them in memory. However, when using *RBD* a 1TB device is divided in many individual 4MB objects and *RGW* does the same. @@ -561,73 +551,3 @@ many individual 4MB objects and *RGW* does the same. Encoding and decoding is implemented in the OSD. Although it could be implemented client side for read write, the OSD must be able to encode and decode on its own when scrubbing. - -If a partial read is required, an optimization could be to only fetch -the chunk that contains the data instead of always fetching all -chunks. For instance if *H* is required in the example above, chunk 3 -is read if available. Reading 3 chunks is a fallback in case chunk 3 is -not available. - -Partial reads and writes ------------------------- - -If an object is large, reading or writing all of it when changing only -a few bytes is expensive. It is more efficient to only read or write a -subset of the object. 
When a client writes on an existing object, it -can provide the offset and the length of the write as well as the -payload with the `CEPH_OSD_OP_WRITE -`_ -operation. It is refered to as *partial write* and is different from -the `CEPH_OSD_OP_WRITEFULL operation -`_ -which writes the entire object at once. - -When using replicas for partial writes or reads, the primary OSD -translates them into read(2) and write(2) POSIX system calls. When -writing, it then forwards the CEPH_OSD_OP_WRITE message to the -replicas and waits for them to acknowledge they are done. - -When reading erasure coded objects, at least M chunks must be read and -decoded to extract the desired bytes. If a `systematic code -`_ is used ( i.e. the -data chunks are readable by simple concatenation ) read can be -optimized to use the chunk containing the desired bytes and rely on -the erasure decoding function only if a chunk is missing. - -When writing an erasure coded object, changing even one byte requires -that it is encoded again in full. - -If Ceph is only used thru the *radosgw* or *librbd*, objects will mostly -have the same size. The *radosgw* user may upload a 1GB object, which will -be divided into smaller 4MB objects behind the scene ( or whatever is -set with *rgw obj stripe size* ). If a KVM is attached a 10GB RBD block -device, it will also be divided into smaller 4BM objects ( or whatever -size is given to the --stripe-unit argument when creating the RBD -block ). In both cases, writing one byte at the beginning will only -require to encode the first object and not all of them. - -Objects can be further divided into stripes to reduce the overhead of -partial writes. For instance: -:: - - +-----------------------+ - |+---------------------+| - || stripe 0 || - || [0,N) || - |+---------------------+| - |+---------------------+| - || stripe 1 || - || [N,N*2) || - |+---------------------+| - |+---------------------+| - || stripe 3 [N*2,len) || - |+---------------------+| - +-----------------------+ - object of size len - -Each stripe is encoded independantly and the same OSDs are used for -all of them. For instance, if stripe 0 is encoded into 3 chunks on -OSDs 5, 8 and 9, stripe 1 is also encoded into 3 chunks on the same -OSDs. The size of a stripe is stored as an attribute of the object. -When writing one byte at offset N, instead of re-encoding the whole -object it is enough to re-encode the stripe that contains it. diff --git a/doc/dev/osd_internals/erasure_coding/jerasure.rst b/doc/dev/osd_internals/erasure_coding/jerasure.rst new file mode 100644 index 00000000000..312eac52e5d --- /dev/null +++ b/doc/dev/osd_internals/erasure_coding/jerasure.rst @@ -0,0 +1,22 @@ +=============== +jerasure plugin +=============== + +Introduction +------------ + +The parameters interpreted by the jerasure plugin are: + +:: + + ceph osd pool create \ + erasure-code-directory= \ # plugin directory absolute path + erasure-code-plugin=jerasure \ # plugin name (only jerasure) + erasure-code-k= \ # data chunks (default 2) + erasure-code-m= \ # coding chunks (default 2) + erasure-code-technique= \ # coding technique + +The coding techniques can be chosen among *reed_sol_van*, +*reed_sol_r6_op*, *cauchy_orig*, *cauchy_good*, *liberation*, +*blaum_roth* and *liber8tion*. 
+ -- cgit v1.2.1 From 238a303cffc387548c9695ff75171d333ec6bedd Mon Sep 17 00:00:00 2001 From: Loic Dachary Date: Wed, 2 Oct 2013 15:06:27 +0200 Subject: ErasureCode: update PGBackend description Based on a dialog with Sam ( as published at http://dachary.org/?p=2320 ). * Remove PGBackend-h.rst because PGBackend.h is now in master. * Fix typos caught by ispell * Update recovery links to point to PGBackend recover methods * Workaround formating warning developer_notes.rst:3: WARNING: Duplicate explicit target name: "erasurecodepluginexample" which should be legitimate. Signed-off-by: Loic Dachary --- doc/dev/osd_internals/erasure_coding.rst | 1 - .../osd_internals/erasure_coding/PGBackend-h.rst | 156 --------------------- .../erasure_coding/developer_notes.rst | 3 +- doc/dev/osd_internals/erasure_coding/pgbackend.rst | 42 +++--- doc/dev/osd_internals/erasure_coding/recovery.rst | 4 - 5 files changed, 22 insertions(+), 184 deletions(-) delete mode 100644 doc/dev/osd_internals/erasure_coding/PGBackend-h.rst delete mode 100644 doc/dev/osd_internals/erasure_coding/recovery.rst diff --git a/doc/dev/osd_internals/erasure_coding.rst b/doc/dev/osd_internals/erasure_coding.rst index d3f19b6fb8e..0586c46c3bb 100644 --- a/doc/dev/osd_internals/erasure_coding.rst +++ b/doc/dev/osd_internals/erasure_coding.rst @@ -66,4 +66,3 @@ Table of content Developer notes Jerasure plugin High level design document - Draft PGBackend.h header diff --git a/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst b/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst deleted file mode 100644 index b39cdb0e88e..00000000000 --- a/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst +++ /dev/null @@ -1,156 +0,0 @@ -=========== -PGBackend.h -=========== - -Work in progress: -:: - - /** - * PGBackend - * - * PGBackend defines an interface for logic handling IO and - * replication on RADOS objects. The PGBackend implementation - * is responsible for: - * - * 1) Handling client operations - * 2) Handling object recovery - * 3) Handling object access - */ - class PGBackend { - public: - /// IO - - /// Perform write - int perform_write( - const vector &ops, ///< [in] ops to perform - Context *onreadable, ///< [in] called when readable on all reaplicas - Context *onreadable, ///< [in] called when durable on all replicas - ) = 0; ///< @return 0 or error - - /// Attempt to roll back a log entry - int try_rollback( - const pg_log_entry_t &entry, ///< [in] entry to roll back - ObjectStore::Transaction *t ///< [out] transaction - ) = 0; ///< @return 0 on success, -EINVAL if it can't be rolled back - - /// Perform async read, oncomplete is called when ops out_bls are filled in - int perform_read( - vector &ops, ///< [in, out] ops - Context *oncomplete ///< [out] called with r code - ) = 0; ///< @return 0 or error - - /// Peering - - /** - * have_enough_infos - * - * Allows PGBackend implementation to ensure that enough peers have - * been contacted to satisfy its requirements. - * - * TODO: this interface should yield diagnostic info about which infos - * are required - */ - bool have_enough_infos( - const map &past_intervals, ///< [in] intervals - const map > &peer_infos ///< [in] infos - ) = 0; ///< @return true if we can continue peering - - /** - * choose_acting - * - * Allows PGBackend implementation to select the acting set based on the - * received infos - * - * @return False if the current acting set is inadequate, *req_acting will - * be filled in with the requested new acting set. 
True if the - * current acting set is adequate, *auth_log will be filled in - * with the correct location of the authoritative log. - */ - bool choose_acting( - const map &peer_infos, ///< [in] received infos - int *auth_log, ///< [out] osd with auth log - vector *req_acting ///< [out] requested acting set - ) = 0; - - /// Scrub - - /// scan - int scan( - const hobject_t &start, ///< [in] scan objects >= start - const hobject_t &up_to, ///< [in] scan objects < up_to - vector *out ///< [out] objects returned - ) = 0; ///< @return 0 or error - - /// stat (TODO: ScrubMap::object needs to have PGBackend specific metadata) - int scrub( - const hobject_t &to_stat, ///< [in] object to stat - bool deep, ///< [in] true if deep scrub - ScrubMap::object *o ///< [out] result - ) = 0; ///< @return 0 or error - - /** - * compare_scrub_maps - * - * @param inconsistent [out] map of inconsistent pgs to pair - * @param errstr [out] stream of text about inconsistencies for user - * perusal - * - * TODO: this interface doesn't actually make sense... - */ - void compare_scrub_maps( - const map &maps, ///< [in] maps to compare - bool deep, ///< [in] true if scrub is deep - map, set > > *inconsistent, - std:ostream *errstr - ) = 0; - - /// Recovery - - /** - * might_have_unrecoverable - * - * @param missing [in] missing,info gathered so far (must include acting) - * @param intervals [in] past intervals - * @param should_query [out] pair shards to query - */ - void might_have_unrecoverable( - const map > &missing, - const map &past_intervals, - set > *should_query - ) = 0; - - /** - * might_have_unfound - * - * @param missing [in] missing,info gathered so far (must include acting) - */ - bool recoverable( - const map > &missing, - const hobject_t &hoid ///< [in] object to check - ) = 0; ///< @return true if object can be recovered given missing - - /** - * recover_object - * - * Triggers a recovery operation on the specified hobject_t - * onreadable must be called before onwriteable - * - * @param missing [in] set of info, missing pairs for queried nodes - */ - void recover_object( - const hobject_t &hoid, ///< [in] object to recover - const map > &missing - Context *onreadable, ///< [in] called when object can be read - Context *onwriteable ///< [in] called when object can be written - ) = 0; - - /// Backfill - - /// choose_backfill - void choose_backfill( - const map > &peer_infos ///< [in] infos - const vector &acting, ///< [in] acting set - const vector &up, ///< [in] up set - set *to_backfill ///< [out] osds to backfill - ) = 0; - }; diff --git a/doc/dev/osd_internals/erasure_coding/developer_notes.rst b/doc/dev/osd_internals/erasure_coding/developer_notes.rst index 568b2b4634a..454f087fe53 100644 --- a/doc/dev/osd_internals/erasure_coding/developer_notes.rst +++ b/doc/dev/osd_internals/erasure_coding/developer_notes.rst @@ -495,8 +495,7 @@ registry. The `ErasureCodePluginExample `_ plugin -reads: +object can be generated. The `ErasureCodePluginExample plugin `_ reads: :: diff --git a/doc/dev/osd_internals/erasure_coding/pgbackend.rst b/doc/dev/osd_internals/erasure_coding/pgbackend.rst index c16354f5116..43415ba4f7e 100644 --- a/doc/dev/osd_internals/erasure_coding/pgbackend.rst +++ b/doc/dev/osd_internals/erasure_coding/pgbackend.rst @@ -2,14 +2,13 @@ PG Backend Proposal =================== -See also `PGBackend.h <../PGBackend-h>`_ - Motivation ---------- -The purpose of the PG Backend interface is to abstract over the -differences between replication and erasure coding as failure recovery -mechanisms. 
+The purpose of the `PG Backend interface +`_ +is to abstract over the differences between replication and erasure +coding as failure recovery mechanisms. Much of the existing PG logic, particularly that for dealing with peering, will be common to each. With both schemes, a log of recent @@ -34,12 +33,12 @@ and erasure coding which PGBackend must abstract over: positions are not interchangeable. In particular, it might make sense for a single OSD to hold more than 1 PG copy for different acting set positions. -5. Selection of a pgtemp for backfill may difer between replicated +5. Selection of a pgtemp for backfill may differ between replicated and erasure coded backends. 6. The set of necessary osds from a particular interval required to - to continue peering may difer between replicated and erasure + to continue peering may differ between replicated and erasure coded backends. -7. The selection of the authoritative log may difer between replicated +7. The selection of the authoritative log may differ between replicated and erasure coded backends. Client Writes @@ -78,8 +77,9 @@ Core Changes: - Current code should be adapted to use and rollback as appropriate APPEND, DELETE, (SET|RM)ATTR log entries. - The filestore needs to be able to deal with multiply versioned - hobjects. This probably means adapting the filestore internally to - use a ghobject which is basically a tuple`_ + which is basically a tuple. The gen_t + shard_t need to be included in the on-disk filename. gen_t is a unique object identifier to make sure there are no name collisions when object N is created + @@ -114,7 +114,7 @@ divergent objects. Thus, we must choose the *oldest* last_update from the last interval which went active in order to minimize the number of divergent objects. -The dificulty is that the current code assumes that as long as it has +The difficulty is that the current code assumes that as long as it has an info from at least 1 osd from the prior interval, it can complete peering. In order to ensure that we do not end up with an unrecoverably divergent object, a K+M erasure coded PG must hear from at @@ -161,7 +161,7 @@ Client Reads ------------ Reads with the replicated strategy can always be satisfied -syncronously out of the primary osd. With an erasure coded strategy, +synchronously out of the primary osd. With an erasure coded strategy, the primary will need to request data from some number of replicas in order to satisfy a read. The perform_read() interface for PGBackend therefore will be async. @@ -192,7 +192,7 @@ include the chunk id in the object key. Core changes: - The filestore `ghobject_t needs to also include a chunk id - `_ making it more like + `_ making it more like tuple. - coll_t needs to include a shard_t. - The `OSD pg_map and similar pg mappings need to work in terms of a @@ -260,7 +260,7 @@ Core changes: Recovery -------- -See `Issue #5857`_. The logic for recovering an object depends on the backend. With +The logic for recovering an object depends on the backend. With the current replicated strategy, we first pull the object replica to the primary and then concurrently push it out to the replicas. With the erasure coded strategy, we probably want to read the @@ -270,7 +270,7 @@ and push out the replacement chunks concurrently. Another difference is that objects in erasure coded pg may be unrecoverable without being unfound. The "unfound" concept should probably then be renamed to unrecoverable. 
Also, the -PGBackend impementation will have to be able to direct the search +PGBackend implementation will have to be able to direct the search for pg replicas with unrecoverable object chunks and to be able to determine whether a particular object is recoverable. @@ -281,9 +281,11 @@ Core changes: PGBackend interfaces: -- might_have_unrecoverable() -- recoverable() -- recover_object() +- `on_local_recover_start `_ +- `on_local_recover `_ +- `on_global_recover `_ +- `on_peer_recover `_ +- `begin_peer_recover `_ Backfill -------- @@ -316,6 +318,4 @@ PGBackend interfaces: - choose_backfill(): allows the implementation to determine which osds should be backfilled in a particular interval. - -.. _Issue #5857: http://tracker.ceph.com/issues/5857 -.. _Issue #5856: http://tracker.ceph.com/issues/5856 \ No newline at end of file +.. _Issue #5856: http://tracker.ceph.com/issues/5856 diff --git a/doc/dev/osd_internals/erasure_coding/recovery.rst b/doc/dev/osd_internals/erasure_coding/recovery.rst deleted file mode 100644 index 793a5b003dc..00000000000 --- a/doc/dev/osd_internals/erasure_coding/recovery.rst +++ /dev/null @@ -1,4 +0,0 @@ -=================== -PGBackend Recovery -=================== - -- cgit v1.2.1 From c0cbd9aa5e5673ebf482458d08ce4f342b0c5493 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Wed, 2 Oct 2013 10:00:10 -0700 Subject: osd: In read_meta() leave an extra byte in buffer to nul terminate Signed-off-by: David Zafman --- src/osd/OSD.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 9a2fbb5c576..0a2d64ee6e1 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -840,7 +840,7 @@ int OSD::read_meta(const std::string &base, const std::string &file, int err = errno; return -err; } - len = safe_read(fd, val, vallen); + len = safe_read(fd, val, vallen - 1); if (len < 0) { TEMP_FAILURE_RETRY(::close(fd)); return len; -- cgit v1.2.1 From 8835ef8f9833b250e8d09716f683893073db5306 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Thu, 26 Sep 2013 17:42:13 -0700 Subject: common, os, osd: Use common functions for safe file reading and writing Add new safe_read_file() and safe_write_file() to update files atomically Used instead of original OSD::read_meta(), OSD::write_meta() they are based on Used by read_superblock() and write_superblock() Used by write_version_stamp() and version_stamp_is_valid() Fixes: #6422 Signed-off-by: David Zafman --- src/common/safe_io.c | 80 +++++++++++++++++++++++++++++++++++++++++ src/common/safe_io.h | 9 +++++ src/os/FileStore.cc | 68 ++++++++++------------------------- src/osd/OSD.cc | 100 +++++---------------------------------------------- src/osd/OSD.h | 4 --- 5 files changed, 116 insertions(+), 145 deletions(-) diff --git a/src/common/safe_io.c b/src/common/safe_io.c index ac99db04ad3..afee82edf07 100644 --- a/src/common/safe_io.c +++ b/src/common/safe_io.c @@ -14,8 +14,12 @@ #define _XOPEN_SOURCE 500 +#include +#include #include #include +#include +#include #include "common/safe_io.h" @@ -112,3 +116,79 @@ ssize_t safe_pwrite(int fd, const void *buf, size_t count, off_t offset) } return 0; } + +int safe_write_file(const char *base, const char *file, + const char *val, size_t vallen) +{ + int ret; + char fn[PATH_MAX]; + char tmp[PATH_MAX]; + int fd; + + // does the file already have correct content? + char oldval[80]; + ret = safe_read_file(base, file, oldval, sizeof(oldval)); + if (ret == (int)vallen && memcmp(oldval, val, vallen) == 0) + return 0; // yes. 
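+  /*
+   * Update atomically: write the new content to a .tmp sibling and
+   * fsync it, rename(2) it over the target, then fsync the containing
+   * directory so the rename itself is durable.
+   */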
+ + snprintf(fn, sizeof(fn), "%s/%s", base, file); + snprintf(tmp, sizeof(tmp), "%s/%s.tmp", base, file); + fd = open(tmp, O_WRONLY|O_CREAT|O_TRUNC, 0644); + if (fd < 0) { + ret = errno; + return -ret; + } + ret = safe_write(fd, val, vallen); + if (ret) { + TEMP_FAILURE_RETRY(close(fd)); + return ret; + } + + ret = fsync(fd); + if (ret < 0) ret = -errno; + TEMP_FAILURE_RETRY(close(fd)); + if (ret < 0) { + unlink(tmp); + return ret; + } + ret = rename(tmp, fn); + if (ret < 0) { + ret = -errno; + unlink(tmp); + return ret; + } + + fd = open(base, O_RDONLY); + if (fd < 0) { + ret = -errno; + return ret; + } + ret = fsync(fd); + if (ret < 0) ret = -errno; + TEMP_FAILURE_RETRY(close(fd)); + + return ret; +} + +int safe_read_file(const char *base, const char *file, + char *val, size_t vallen) +{ + char fn[PATH_MAX]; + int fd, len; + + snprintf(fn, sizeof(fn), "%s/%s", base, file); + fd = open(fn, O_RDONLY); + if (fd < 0) { + return -errno; + } + len = safe_read(fd, val, vallen - 1); + if (len < 0) { + TEMP_FAILURE_RETRY(close(fd)); + return len; + } + // close sometimes returns errors, but only after write() + TEMP_FAILURE_RETRY(close(fd)); + + val[len] = 0; + return len; +} diff --git a/src/common/safe_io.h b/src/common/safe_io.h index 4c2991fe6e8..a4c9bc7a72f 100644 --- a/src/common/safe_io.h +++ b/src/common/safe_io.h @@ -45,6 +45,15 @@ extern "C" { ssize_t safe_pread_exact(int fd, void *buf, size_t count, off_t offset) WARN_UNUSED_RESULT; + + /* + * Safe functions to read and write an entire file. + */ + int safe_write_file(const char *base, const char *file, + const char *val, size_t vallen); + int safe_read_file(const char *base, const char *file, + char *val, size_t vallen); + #ifdef __cplusplus } #endif diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index 343fb25c0e4..583147fb631 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -946,43 +946,25 @@ int FileStore::_sanity_check_fs() int FileStore::write_superblock() { - char fn[PATH_MAX]; - snprintf(fn, sizeof(fn), "%s/superblock", basedir.c_str()); - int fd = ::open(fn, O_WRONLY|O_CREAT|O_TRUNC, 0644); - if (fd < 0) - return -errno; bufferlist bl; ::encode(superblock, bl); - - int ret = safe_write(fd, bl.c_str(), bl.length()); - if (ret < 0) - goto out; - ret = ::fsync(fd); - if (ret < 0) - ret = -errno; - // XXX: fsync() man page says I need to sync containing directory -out: - TEMP_FAILURE_RETRY(::close(fd)); - return ret; + return safe_write_file(basedir.c_str(), "superblock", + bl.c_str(), bl.length()); } int FileStore::read_superblock() { - char fn[PATH_MAX]; - snprintf(fn, sizeof(fn), "%s/superblock", basedir.c_str()); - int fd = ::open(fn, O_RDONLY, 0644); - if (fd < 0) { - if (errno == ENOENT) { + bufferptr bp(PATH_MAX); + int ret = safe_read_file(basedir.c_str(), "superblock", + bp.c_str(), bp.length()); + if (ret < 0) { + if (ret == -ENOENT) { // If the file doesn't exist write initial CompatSet return write_superblock(); - } else - return -errno; - } - bufferptr bp(PATH_MAX); - int ret = safe_read(fd, bp.c_str(), bp.length()); - TEMP_FAILURE_RETRY(::close(fd)); - if (ret < 0) + } return ret; + } + bufferlist bl; bl.push_back(bp); bufferlist::iterator i = bl.begin(); @@ -1012,20 +994,14 @@ int FileStore::update_version_stamp() int FileStore::version_stamp_is_valid(uint32_t *version) { - char fn[PATH_MAX]; - snprintf(fn, sizeof(fn), "%s/store_version", basedir.c_str()); - int fd = ::open(fn, O_RDONLY, 0644); - if (fd < 0) { - if (errno == ENOENT) - return 0; - else - return -errno; - } bufferptr bp(PATH_MAX); 
- int ret = safe_read(fd, bp.c_str(), bp.length()); - TEMP_FAILURE_RETRY(::close(fd)); - if (ret < 0) + int ret = safe_read_file(basedir.c_str(), "store_version", + bp.c_str(), bp.length()); + if (ret < 0) { + if (ret == -ENOENT) + return 0; return ret; + } bufferlist bl; bl.push_back(bp); bufferlist::iterator i = bl.begin(); @@ -1038,17 +1014,11 @@ int FileStore::version_stamp_is_valid(uint32_t *version) int FileStore::write_version_stamp() { - char fn[PATH_MAX]; - snprintf(fn, sizeof(fn), "%s/store_version", basedir.c_str()); - int fd = ::open(fn, O_WRONLY|O_CREAT|O_TRUNC, 0644); - if (fd < 0) - return -errno; bufferlist bl; ::encode(target_version, bl); - - int ret = safe_write(fd, bl.c_str(), bl.length()); - TEMP_FAILURE_RETRY(::close(fd)); - return ret; + + return safe_write_file(basedir.c_str(), "store_version", + bl.c_str(), bl.length()); } int FileStore::read_op_seq(uint64_t *seq) diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 0a2d64ee6e1..916e002012a 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -712,7 +712,7 @@ int OSD::mkfs(CephContext *cct, const std::string &dev, const std::string &jdev, goto umount_store; } - ret = write_meta(dev, "ready", "ready\n", 6); + ret = safe_write_file(dev.c_str(), "ready", "ready\n", 6); if (ret) { derr << "OSD::mkfs: failed to write ready file: error " << ret << dendl; goto umount_store; @@ -768,103 +768,19 @@ int OSD::dump_journal(CephContext *cct, const std::string &dev, const std::strin return err; } -int OSD::write_meta(const std::string &base, const std::string &file, - const char *val, size_t vallen) -{ - int ret; - char fn[PATH_MAX]; - char tmp[PATH_MAX]; - int fd; - - // does the file already have correct content? - char oldval[80]; - ret = read_meta(base, file, oldval, sizeof(oldval)); - if (ret == (int)vallen && memcmp(oldval, val, vallen) == 0) - return 0; // yes. 
- - snprintf(fn, sizeof(fn), "%s/%s", base.c_str(), file.c_str()); - snprintf(tmp, sizeof(tmp), "%s/%s.tmp", base.c_str(), file.c_str()); - fd = ::open(tmp, O_WRONLY|O_CREAT|O_TRUNC, 0644); - if (fd < 0) { - ret = errno; - derr << "write_meta: error opening '" << tmp << "': " - << cpp_strerror(ret) << dendl; - return -ret; - } - ret = safe_write(fd, val, vallen); - if (ret) { - derr << "write_meta: failed to write to '" << tmp << "': " - << cpp_strerror(ret) << dendl; - TEMP_FAILURE_RETRY(::close(fd)); - return ret; - } - - ret = ::fsync(fd); - TEMP_FAILURE_RETRY(::close(fd)); - if (ret) { - ::unlink(tmp); - derr << "write_meta: failed to fsync to '" << tmp << "': " - << cpp_strerror(ret) << dendl; - return ret; - } - ret = ::rename(tmp, fn); - if (ret) { - ::unlink(tmp); - derr << "write_meta: failed to rename '" << tmp << "' to '" << fn << "': " - << cpp_strerror(ret) << dendl; - return ret; - } - - fd = ::open(base.c_str(), O_RDONLY); - if (fd < 0) { - ret = errno; - derr << "write_meta: failed to open dir '" << base << "': " - << cpp_strerror(ret) << dendl; - return -ret; - } - ::fsync(fd); - TEMP_FAILURE_RETRY(::close(fd)); - - return 0; -} - -int OSD::read_meta(const std::string &base, const std::string &file, - char *val, size_t vallen) -{ - char fn[PATH_MAX]; - int fd, len; - - snprintf(fn, sizeof(fn), "%s/%s", base.c_str(), file.c_str()); - fd = ::open(fn, O_RDONLY); - if (fd < 0) { - int err = errno; - return -err; - } - len = safe_read(fd, val, vallen - 1); - if (len < 0) { - TEMP_FAILURE_RETRY(::close(fd)); - return len; - } - // close sometimes returns errors, but only after write() - TEMP_FAILURE_RETRY(::close(fd)); - - val[len] = 0; - return len; -} - int OSD::write_meta(const std::string &base, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami) { char val[80]; snprintf(val, sizeof(val), "%s\n", CEPH_OSD_ONDISK_MAGIC); - write_meta(base, "magic", val, strlen(val)); + safe_write_file(base.c_str(), "magic", val, strlen(val)); snprintf(val, sizeof(val), "%d\n", whoami); - write_meta(base, "whoami", val, strlen(val)); + safe_write_file(base.c_str(), "whoami", val, strlen(val)); cluster_fsid.print(val); strcat(val, "\n"); - write_meta(base, "ceph_fsid", val, strlen(val)); + safe_write_file(base.c_str(), "ceph_fsid", val, strlen(val)); return 0; } @@ -874,24 +790,24 @@ int OSD::peek_meta(const std::string &dev, std::string& magic, { char val[80] = { 0 }; - if (read_meta(dev, "magic", val, sizeof(val)) < 0) + if (safe_read_file(dev.c_str(), "magic", val, sizeof(val)) < 0) return -errno; int l = strlen(val); if (l && val[l-1] == '\n') val[l-1] = 0; magic = val; - if (read_meta(dev, "whoami", val, sizeof(val)) < 0) + if (safe_read_file(dev.c_str(), "whoami", val, sizeof(val)) < 0) return -errno; whoami = atoi(val); - if (read_meta(dev, "ceph_fsid", val, sizeof(val)) < 0) + if (safe_read_file(dev.c_str(), "ceph_fsid", val, sizeof(val)) < 0) return -errno; if (strlen(val) > 36) val[36] = 0; cluster_fsid.parse(val); - if (read_meta(dev, "fsid", val, sizeof(val)) < 0) + if (safe_read_file(dev.c_str(), "fsid", val, sizeof(val)) < 0) osd_fsid = uuid_d(); else { if (strlen(val) > 36) diff --git a/src/osd/OSD.h b/src/osd/OSD.h index 5fe667344a9..9346cee6890 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -1738,10 +1738,6 @@ protected: } private: - static int write_meta(const std::string &base, const std::string &file, - const char *val, size_t vallen); - static int read_meta(const std::string &base, const std::string &file, - char *val, size_t vallen); static int write_meta(const 
std::string &base, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami); public: -- cgit v1.2.1 From f1584fb05c57e47dcee218982e27a74ee4d8a227 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 19 Sep 2013 17:57:14 -0700 Subject: common/bloom_filter: unit tests Fun facts: - fpp = false positive probability - fpp is a function of insert count only - at .1% fpp, we pay about 2 bytes per insert - at 1-2% fpp, we pay about 1 byte per insert - at 15% fpp, we pay about .5 bytes per insert Signed-off-by: Sage Weil --- src/common/bloom_filter.hpp | 3 +- src/test/Makefile.am | 5 +++ src/test/common/test_bloom_filter.cc | 62 ++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 src/test/common/test_bloom_filter.cc diff --git a/src/common/bloom_filter.hpp b/src/common/bloom_filter.hpp index 2a1ee2c4217..cc22136e5ca 100644 --- a/src/common/bloom_filter.hpp +++ b/src/common/bloom_filter.hpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -470,7 +471,7 @@ public: void encode(bufferlist& bl) const; void decode(bufferlist::iterator& bl); void dump(Formatter *f) const; - static void generate_test_instances(list& ls); + static void generate_test_instances(std::list& ls); }; WRITE_CLASS_ENCODER(bloom_filter) diff --git a/src/test/Makefile.am b/src/test/Makefile.am index 88cf1ce970f..d2d76f4008f 100644 --- a/src/test/Makefile.am +++ b/src/test/Makefile.am @@ -258,6 +258,11 @@ unittest_addrs_CXXFLAGS = $(UNITTEST_CXXFLAGS) unittest_addrs_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL) check_PROGRAMS += unittest_addrs +unittest_bloom_filter_SOURCES = test/common/test_bloom_filter.cc +unittest_bloom_filter_CXXFLAGS = $(UNITTEST_CXXFLAGS) +unittest_bloom_filter_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL) +check_PROGRAMS += unittest_bloom_filter + unittest_sharedptr_registry_SOURCES = test/common/test_sharedptr_registry.cc unittest_sharedptr_registry_CXXFLAGS = $(UNITTEST_CXXFLAGS) unittest_sharedptr_registry_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL) diff --git a/src/test/common/test_bloom_filter.cc b/src/test/common/test_bloom_filter.cc new file mode 100644 index 00000000000..8be52511362 --- /dev/null +++ b/src/test/common/test_bloom_filter.cc @@ -0,0 +1,62 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2013 Inktank + * + * LGPL2.1 (see COPYING-LGPL2.1) or later + */ + +#include +#include + +#include "include/stringify.h" +#include "common/bloom_filter.hpp" + +TEST(BloomFilter, Basic) { + bloom_filter bf(10, .1, 1); + bf.insert("foo"); + bf.insert("bar"); + + ASSERT_TRUE(bf.contains("foo")); + ASSERT_TRUE(bf.contains("bar")); +} + +TEST(BloomFilter, Sweep) { + std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl; + for (int ex = 3; ex < 12; ex++) { + for (float fpp = .001; fpp < .5; fpp *= 2.0) { + int max = 2 << ex; + bloom_filter bf(max, fpp, 1); + bf.insert("foo"); + bf.insert("bar"); + + ASSERT_TRUE(bf.contains("foo")); + ASSERT_TRUE(bf.contains("bar")); + + for (int n = 0; n < max; n++) + bf.insert("ok" + stringify(n)); + + int test = max * 100; + int hit = 0; + for (int n = 0; n < test; n++) + if (bf.contains("asdf" + stringify(n))) + hit++; + + ASSERT_TRUE(bf.contains("foo")); + ASSERT_TRUE(bf.contains("bar")); + + double actual = (double)hit / (double)test; + + bufferlist bl; + ::encode(bf, bl); + + double byte_per_insert = (double)bl.length() / (double)max; + + std::cout << max << "\t" << fpp << "\t" << 
actual << "\t" << bl.length() << "\t" << byte_per_insert << std::endl; + ASSERT_TRUE(actual < fpp * 10); + + } + } +} -- cgit v1.2.1 From fdb8b0d8ffdda20dae3fcc8c15aa62e645111fde Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 19 Sep 2013 18:23:07 -0700 Subject: common/bloom_filter: test behavior of sequences of bloom filters Signed-off-by: Sage Weil --- src/test/common/test_bloom_filter.cc | 78 ++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/src/test/common/test_bloom_filter.cc b/src/test/common/test_bloom_filter.cc index 8be52511362..66bda6bcd33 100644 --- a/src/test/common/test_bloom_filter.cc +++ b/src/test/common/test_bloom_filter.cc @@ -60,3 +60,81 @@ TEST(BloomFilter, Sweep) { } } } + +// test the fpp over a sequence of bloom filters, each with unique +// items inserted into it. +// +// we expect: actual_fpp = num_filters * per_filter_fpp +TEST(BloomFilter, Sequence) { + + int max = 1024; + double fpp = .01; + for (int seq = 2; seq <= 128; seq *= 2) { + std::vector ls; + for (int i=0; iinsert("ok" + stringify(j) + "_" + stringify(i)); + if (ls.size() > 1) + ls[ls.size() - 2]->insert("ok" + stringify(j) + "_" + stringify(i)); + } + } + + int hit = 0; + int test = max * 100; + for (int i=0; i::iterator j = ls.begin(); j != ls.end(); ++j) { + if ((*j)->contains("bad" + stringify(i))) { + hit++; + break; + } + } + } + + double actual = (double)hit / (double)test; + std::cout << "seq " << seq << " max " << max << " fpp " << fpp << " actual " << actual << std::endl; + } +} + +// test the ffp over a sequence of bloom filters, where actual values +// are always inserted into a consecutive pair of filters. in order +// to have a false positive, we need to falsely match two consecutive +// filters. +// +// we expect: actual_fpp = num_filters * per_filter_fpp^2 +TEST(BloomFilter, SequenceDouble) { + int max = 1024; + double fpp = .01; + for (int seq = 2; seq <= 128; seq *= 2) { + std::vector ls; + for (int i=0; iinsert("ok" + stringify(j) + "_" + stringify(i)); + if (ls.size() > 1) + ls[ls.size() - 2]->insert("ok" + stringify(j) + "_" + stringify(i)); + } + } + + int hit = 0; + int test = max * 100; + int run = 0; + for (int i=0; i::iterator j = ls.begin(); j != ls.end(); ++j) { + if ((*j)->contains("bad" + stringify(i))) { + run++; + if (run >= 2) { + hit++; + break; + } + } else { + run = 0; + } + } + } + + double actual = (double)hit / (double)test; + std::cout << "seq " << seq << " max " << max << " fpp " << fpp << " actual " << actual + << " expected " << (fpp*fpp*(double)seq) << std::endl; + } +} -- cgit v1.2.1 From f31d69127571a165c5d11deb3ab35c21d611ffff Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 19 Sep 2013 18:31:45 -0700 Subject: common/bloom_filter: fix whitespace Signed-off-by: Sage Weil --- src/common/bloom_filter.hpp | 946 ++++++++++++++++++++++---------------------- 1 file changed, 470 insertions(+), 476 deletions(-) diff --git a/src/common/bloom_filter.hpp b/src/common/bloom_filter.hpp index cc22136e5ca..15400b14b9e 100644 --- a/src/common/bloom_filter.hpp +++ b/src/common/bloom_filter.hpp @@ -1,5 +1,5 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:3; indent-tabs-mode:t -*- -// vim: ts=8 sw=3 smarttab +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab /* ******************************************************************* @@ -35,465 +35,459 @@ static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned) static const unsigned char bit_mask[bits_per_char] = { - 0x01, 
//00000001 - 0x02, //00000010 - 0x04, //00000100 - 0x08, //00001000 - 0x10, //00010000 - 0x20, //00100000 - 0x40, //01000000 - 0x80 //10000000 - }; + 0x01, //00000001 + 0x02, //00000010 + 0x04, //00000100 + 0x08, //00001000 + 0x10, //00010000 + 0x20, //00100000 + 0x40, //01000000 + 0x80 //10000000 +}; class bloom_filter { protected: - typedef unsigned int bloom_type; - typedef unsigned char cell_type; + typedef unsigned int bloom_type; + typedef unsigned char cell_type; public: - bloom_filter() - : bit_table_(0), - salt_count_(0), - table_size_(0), - raw_table_size_(0), - inserted_element_count_(0), - random_seed_(0) - {} - - bloom_filter(const std::size_t& predicted_inserted_element_count, - const double& false_positive_probability, - const std::size_t& random_seed) - : bit_table_(0), - inserted_element_count_(0), - random_seed_((random_seed) ? random_seed : 0xA5A5A5A5) - { - find_optimal_parameters(predicted_inserted_element_count, false_positive_probability, - &salt_count_, &table_size_); - init(); - } - - bloom_filter(const std::size_t& salt_count, std::size_t table_size, - const std::size_t& random_seed) - : bit_table_(0), - salt_count_(salt_count), - table_size_(table_size), - inserted_element_count_(0), - random_seed_((random_seed) ? random_seed : 0xA5A5A5A5) - { - init(); - } - - void init() { - generate_unique_salt(); - raw_table_size_ = table_size_ / bits_per_char; - bit_table_ = new cell_type[raw_table_size_]; - std::fill_n(bit_table_,raw_table_size_,0x00); - } - - bloom_filter(const bloom_filter& filter) - { - this->operator=(filter); - } - - bloom_filter& operator = (const bloom_filter& filter) - { - if (this != &filter) { - salt_count_ = filter.salt_count_; - table_size_ = filter.table_size_; - raw_table_size_ = filter.raw_table_size_; - inserted_element_count_ = filter.inserted_element_count_; - random_seed_ = filter.random_seed_; - delete[] bit_table_; - bit_table_ = new cell_type[raw_table_size_]; - std::copy(filter.bit_table_,filter.bit_table_ + raw_table_size_,bit_table_); - salt_ = filter.salt_; - } - return *this; - } - - virtual ~bloom_filter() - { + bloom_filter() + : bit_table_(0), + salt_count_(0), + table_size_(0), + raw_table_size_(0), + inserted_element_count_(0), + random_seed_(0) + {} + + bloom_filter(const std::size_t& predicted_inserted_element_count, + const double& false_positive_probability, + const std::size_t& random_seed) + : bit_table_(0), + inserted_element_count_(0), + random_seed_((random_seed) ? random_seed : 0xA5A5A5A5) + { + find_optimal_parameters(predicted_inserted_element_count, false_positive_probability, + &salt_count_, &table_size_); + init(); + } + + bloom_filter(const std::size_t& salt_count, std::size_t table_size, + const std::size_t& random_seed) + : bit_table_(0), + salt_count_(salt_count), + table_size_(table_size), + inserted_element_count_(0), + random_seed_((random_seed) ? 
random_seed : 0xA5A5A5A5) + { + init(); + } + + void init() { + generate_unique_salt(); + raw_table_size_ = table_size_ / bits_per_char; + bit_table_ = new cell_type[raw_table_size_]; + std::fill_n(bit_table_,raw_table_size_,0x00); + } + + bloom_filter(const bloom_filter& filter) + { + this->operator=(filter); + } + + bloom_filter& operator = (const bloom_filter& filter) + { + if (this != &filter) { + salt_count_ = filter.salt_count_; + table_size_ = filter.table_size_; + raw_table_size_ = filter.raw_table_size_; + inserted_element_count_ = filter.inserted_element_count_; + random_seed_ = filter.random_seed_; delete[] bit_table_; - } - - inline bool operator!() const - { - return (0 == table_size_); - } - - inline void clear() - { - std::fill_n(bit_table_,raw_table_size_,0x00); - inserted_element_count_ = 0; - } - - inline void insert(const unsigned char* key_begin, const std::size_t& length) - { - std::size_t bit_index = 0; - std::size_t bit = 0; - for (std::size_t i = 0; i < salt_.size(); ++i) - { - compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit); - bit_table_[bit_index / bits_per_char] |= bit_mask[bit]; - } - ++inserted_element_count_; - } - - template - inline void insert(const T& t) - { - // Note: T must be a C++ POD type. - insert(reinterpret_cast(&t),sizeof(T)); - } - - inline void insert(const std::string& key) - { - insert(reinterpret_cast(key.c_str()),key.size()); - } - - inline void insert(const char* data, const std::size_t& length) - { - insert(reinterpret_cast(data),length); - } - - template - inline void insert(const InputIterator begin, const InputIterator end) - { - InputIterator itr = begin; - while (end != itr) - { - insert(*(itr++)); - } - } - - inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const - { - std::size_t bit_index = 0; - std::size_t bit = 0; - for (std::size_t i = 0; i < salt_.size(); ++i) + bit_table_ = new cell_type[raw_table_size_]; + std::copy(filter.bit_table_,filter.bit_table_ + raw_table_size_,bit_table_); + salt_ = filter.salt_; + } + return *this; + } + + virtual ~bloom_filter() + { + delete[] bit_table_; + } + + inline bool operator!() const + { + return (0 == table_size_); + } + + inline void clear() + { + std::fill_n(bit_table_,raw_table_size_,0x00); + inserted_element_count_ = 0; + } + + inline void insert(const unsigned char* key_begin, const std::size_t& length) + { + std::size_t bit_index = 0; + std::size_t bit = 0; + for (std::size_t i = 0; i < salt_.size(); ++i) + { + compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit); + bit_table_[bit_index / bits_per_char] |= bit_mask[bit]; + } + ++inserted_element_count_; + } + + template + inline void insert(const T& t) + { + // Note: T must be a C++ POD type. 
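+    // (a non-POD type would hash its object representation, i.e.
+    // padding and pointers, rather than its logical value)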
+ insert(reinterpret_cast(&t),sizeof(T)); + } + + inline void insert(const std::string& key) + { + insert(reinterpret_cast(key.c_str()),key.size()); + } + + inline void insert(const char* data, const std::size_t& length) + { + insert(reinterpret_cast(data),length); + } + + template + inline void insert(const InputIterator begin, const InputIterator end) + { + InputIterator itr = begin; + while (end != itr) + { + insert(*(itr++)); + } + } + + inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const + { + std::size_t bit_index = 0; + std::size_t bit = 0; + for (std::size_t i = 0; i < salt_.size(); ++i) + { + compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit); + if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit]) { - compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit); - if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit]) - { - return false; - } + return false; } - return true; - } - - template - inline bool contains(const T& t) const - { - return contains(reinterpret_cast(&t),static_cast(sizeof(T))); - } - - inline bool contains(const std::string& key) const - { - return contains(reinterpret_cast(key.c_str()),key.size()); - } - - inline bool contains(const char* data, const std::size_t& length) const - { - return contains(reinterpret_cast(data),length); - } - - template - inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const - { - InputIterator itr = begin; - while (end != itr) + } + return true; + } + + template + inline bool contains(const T& t) const + { + return contains(reinterpret_cast(&t),static_cast(sizeof(T))); + } + + inline bool contains(const std::string& key) const + { + return contains(reinterpret_cast(key.c_str()),key.size()); + } + + inline bool contains(const char* data, const std::size_t& length) const + { + return contains(reinterpret_cast(data),length); + } + + template + inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const + { + InputIterator itr = begin; + while (end != itr) + { + if (!contains(*itr)) { - if (!contains(*itr)) - { - return itr; - } - ++itr; + return itr; } - return end; - } - - template - inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const - { - InputIterator itr = begin; - while (end != itr) + ++itr; + } + return end; + } + + template + inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const + { + InputIterator itr = begin; + while (end != itr) + { + if (contains(*itr)) { - if (contains(*itr)) - { - return itr; - } - ++itr; + return itr; } - return end; - } - - inline virtual std::size_t size() const - { - return table_size_; - } - - inline std::size_t element_count() const - { - return inserted_element_count_; - } - - inline double effective_fpp() const - { - /* - Note: - The effective false positive probability is calculated using the - designated table size and hash function count in conjunction with - the current number of inserted elements - not the user defined - predicated/expected number of inserted elements. 
- */ - return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size()); - } - - inline bloom_filter& operator &= (const bloom_filter& filter) - { - /* intersection */ - if ( - (salt_count_ == filter.salt_count_) && - (table_size_ == filter.table_size_) && - (random_seed_ == filter.random_seed_) - ) - { - for (std::size_t i = 0; i < raw_table_size_; ++i) - { - bit_table_[i] &= filter.bit_table_[i]; - } + ++itr; + } + return end; + } + + inline virtual std::size_t size() const + { + return table_size_; + } + + inline std::size_t element_count() const + { + return inserted_element_count_; + } + + inline double effective_fpp() const + { + /* + Note: + The effective false positive probability is calculated using the + designated table size and hash function count in conjunction with + the current number of inserted elements - not the user defined + predicated/expected number of inserted elements. + */ + return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size()); + } + + inline bloom_filter& operator &= (const bloom_filter& filter) + { + /* intersection */ + if ( + (salt_count_ == filter.salt_count_) && + (table_size_ == filter.table_size_) && + (random_seed_ == filter.random_seed_) + ) { + for (std::size_t i = 0; i < raw_table_size_; ++i) { + bit_table_[i] &= filter.bit_table_[i]; } - return *this; - } - - inline bloom_filter& operator |= (const bloom_filter& filter) - { - /* union */ - if ( - (salt_count_ == filter.salt_count_) && - (table_size_ == filter.table_size_) && - (random_seed_ == filter.random_seed_) - ) - { - for (std::size_t i = 0; i < raw_table_size_; ++i) - { - bit_table_[i] |= filter.bit_table_[i]; - } + } + return *this; + } + + inline bloom_filter& operator |= (const bloom_filter& filter) + { + /* union */ + if ( + (salt_count_ == filter.salt_count_) && + (table_size_ == filter.table_size_) && + (random_seed_ == filter.random_seed_) + ) { + for (std::size_t i = 0; i < raw_table_size_; ++i) { + bit_table_[i] |= filter.bit_table_[i]; } - return *this; - } - - inline bloom_filter& operator ^= (const bloom_filter& filter) - { - /* difference */ - if ( - (salt_count_ == filter.salt_count_) && - (table_size_ == filter.table_size_) && - (random_seed_ == filter.random_seed_) - ) - { - for (std::size_t i = 0; i < raw_table_size_; ++i) - { - bit_table_[i] ^= filter.bit_table_[i]; - } + } + return *this; + } + + inline bloom_filter& operator ^= (const bloom_filter& filter) + { + /* difference */ + if ( + (salt_count_ == filter.salt_count_) && + (table_size_ == filter.table_size_) && + (random_seed_ == filter.random_seed_) + ) { + for (std::size_t i = 0; i < raw_table_size_; ++i) { + bit_table_[i] ^= filter.bit_table_[i]; } - return *this; - } + } + return *this; + } - inline const cell_type* table() const - { - return bit_table_; - } + inline const cell_type* table() const + { + return bit_table_; + } protected: - inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const - { - bit_index = hash % table_size_; - bit = bit_index % bits_per_char; - } - - void generate_unique_salt() - { - /* - Note: - A distinct hash function need not be implementation-wise - distinct. In the current implementation "seeding" a common - hash function with different values seems to be adequate. 
- */ - const unsigned int predef_salt_count = 128; - static const bloom_type predef_salt[predef_salt_count] = - { - 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC, - 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B, - 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66, - 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA, - 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99, - 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33, - 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5, - 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000, - 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F, - 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63, - 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7, - 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492, - 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A, - 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B, - 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3, - 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432, - 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC, - 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB, - 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331, - 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68, - 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8, - 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A, - 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF, - 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E, - 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39, - 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E, - 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355, - 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E, - 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79, - 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075, - 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC, - 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421 - }; - - if (salt_count_ <= predef_salt_count) - { - std::copy(predef_salt, - predef_salt + salt_count_, - std::back_inserter(salt_)); - for (unsigned int i = 0; i < salt_.size(); ++i) - { - /* - Note: - This is done to integrate the user defined random seed, - so as to allow for the generation of unique bloom filter - instances. - */ - salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_; - } - } - else - { - std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_)); - srand(static_cast(random_seed_)); - while (salt_.size() < salt_count_) - { - bloom_type current_salt = static_cast(rand()) * static_cast(rand()); - if (0 == current_salt) continue; - if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt)) - { - salt_.push_back(current_salt); - } - } - } - } - - static void find_optimal_parameters(std::size_t target_insert_count, - double target_fpp, - std::size_t *salt_count, - std::size_t *table_size) - { - /* - Note: - The following will attempt to find the number of hash functions - and minimum amount of storage bits required to construct a bloom - filter consistent with the user defined false positive probability - and estimated element insertion count. - */ - - double min_m = std::numeric_limits::infinity(); - double min_k = 0.0; - double curr_m = 0.0; - double k = 1.0; - while (k < 1000.0) - { - double numerator = (- k * target_insert_count); - double denominator = std::log(1.0 - std::pow(target_fpp, 1.0 / k)); - curr_m = numerator / denominator; - - if (curr_m < min_m) - { - min_m = curr_m; - min_k = k; - } - k += 1.0; - } - - *salt_count = static_cast(min_k); - size_t t = static_cast(min_m); - t += (((t % bits_per_char) != 0) ? 
(bits_per_char - (t % bits_per_char)) : 0); - *table_size = t; - } - - inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const - { - const unsigned char* itr = begin; - - while (remaining_length >= 4) + inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const + { + bit_index = hash % table_size_; + bit = bit_index % bits_per_char; + } + + void generate_unique_salt() + { + /* + Note: + A distinct hash function need not be implementation-wise + distinct. In the current implementation "seeding" a common + hash function with different values seems to be adequate. + */ + const unsigned int predef_salt_count = 128; + static const bloom_type predef_salt[predef_salt_count] = { + 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC, + 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B, + 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66, + 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA, + 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99, + 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33, + 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5, + 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000, + 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F, + 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63, + 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7, + 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492, + 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A, + 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B, + 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3, + 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432, + 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC, + 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB, + 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331, + 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68, + 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8, + 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A, + 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF, + 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E, + 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39, + 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E, + 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355, + 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E, + 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79, + 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075, + 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC, + 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421 + }; + + if (salt_count_ <= predef_salt_count) + { + std::copy(predef_salt, + predef_salt + salt_count_, + std::back_inserter(salt_)); + for (unsigned int i = 0; i < salt_.size(); ++i) + { + /* + Note: + This is done to integrate the user defined random seed, + so as to allow for the generation of unique bloom filter + instances. 
+ */ + salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_; + } + } + else + { + std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_)); + srand(static_cast(random_seed_)); + while (salt_.size() < salt_count_) { - hash ^= (hash << 7) ^ (*itr++) * (hash >> 3); - hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5)))); - hash ^= (hash << 7) ^ (*itr++) * (hash >> 3); - hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5)))); - remaining_length -= 4; + bloom_type current_salt = static_cast(rand()) * static_cast(rand()); + if (0 == current_salt) + continue; + if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt)) + { + salt_.push_back(current_salt); + } } - - while (remaining_length >= 2) - { - hash ^= (hash << 7) ^ (*itr++) * (hash >> 3); - hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5)))); - remaining_length -= 2; - } - - if (remaining_length) + } + } + + static void find_optimal_parameters(std::size_t target_insert_count, + double target_fpp, + std::size_t *salt_count, + std::size_t *table_size) + { + /* + Note: + The following will attempt to find the number of hash functions + and minimum amount of storage bits required to construct a bloom + filter consistent with the user defined false positive probability + and estimated element insertion count. + */ + + double min_m = std::numeric_limits::infinity(); + double min_k = 0.0; + double curr_m = 0.0; + double k = 1.0; + while (k < 1000.0) + { + double numerator = (- k * target_insert_count); + double denominator = std::log(1.0 - std::pow(target_fpp, 1.0 / k)); + curr_m = numerator / denominator; + + if (curr_m < min_m) { - hash ^= (hash << 7) ^ (*itr) * (hash >> 3); + min_m = curr_m; + min_k = k; } - - return hash; - } - - std::vector salt_; - unsigned char* bit_table_; - std::size_t salt_count_; - std::size_t table_size_; - std::size_t raw_table_size_; - std::size_t inserted_element_count_; - std::size_t random_seed_; + k += 1.0; + } + + *salt_count = static_cast(min_k); + size_t t = static_cast(min_m); + t += (((t % bits_per_char) != 0) ? 
(bits_per_char - (t % bits_per_char)) : 0); + *table_size = t; + } + + inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const + { + const unsigned char* itr = begin; + + while (remaining_length >= 4) + { + hash ^= (hash << 7) ^ (*itr++) * (hash >> 3); + hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5)))); + hash ^= (hash << 7) ^ (*itr++) * (hash >> 3); + hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5)))); + remaining_length -= 4; + } + + while (remaining_length >= 2) + { + hash ^= (hash << 7) ^ (*itr++) * (hash >> 3); + hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5)))); + remaining_length -= 2; + } + + if (remaining_length) + { + hash ^= (hash << 7) ^ (*itr) * (hash >> 3); + } + + return hash; + } + + std::vector salt_; + unsigned char* bit_table_; + std::size_t salt_count_; + std::size_t table_size_; + std::size_t raw_table_size_; + std::size_t inserted_element_count_; + std::size_t random_seed_; public: - void encode(bufferlist& bl) const; - void decode(bufferlist::iterator& bl); - void dump(Formatter *f) const; - static void generate_test_instances(std::list& ls); + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(std::list& ls); }; WRITE_CLASS_ENCODER(bloom_filter) inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b) { - bloom_filter result = a; - result &= b; - return result; + bloom_filter result = a; + result &= b; + return result; } inline bloom_filter operator | (const bloom_filter& a, const bloom_filter& b) { - bloom_filter result = a; - result |= b; - return result; + bloom_filter result = a; + result |= b; + return result; } inline bloom_filter operator ^ (const bloom_filter& a, const bloom_filter& b) { - bloom_filter result = a; - result ^= b; - return result; + bloom_filter result = a; + result ^= b; + return result; } @@ -501,66 +495,66 @@ class compressible_bloom_filter : public bloom_filter { public: - compressible_bloom_filter(const std::size_t& predicted_element_count, - const double& false_positive_probability, - const std::size_t& random_seed) - : bloom_filter(predicted_element_count,false_positive_probability,random_seed) - { - size_list.push_back(table_size_); - } - - inline virtual std::size_t size() const - { - return size_list.back(); - } - - inline bool compress(const double& percentage) - { - if ((0.0 >= percentage) || (percentage >= 100.0)) - { - return false; - } - - std::size_t original_table_size = size_list.back(); - std::size_t new_table_size = static_cast((size_list.back() * (1.0 - (percentage / 100.0)))); - new_table_size -= (((new_table_size % bits_per_char) != 0) ? 
(new_table_size % bits_per_char) : 0); - - if ((bits_per_char > new_table_size) || (new_table_size >= original_table_size)) - { - return false; - } - - cell_type* tmp = new cell_type[new_table_size / bits_per_char]; - std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp); - cell_type* itr = bit_table_ + (new_table_size / bits_per_char); - cell_type* end = bit_table_ + (original_table_size / bits_per_char); - cell_type* itr_tmp = tmp; - - while (end != itr) - { - *(itr_tmp++) |= (*itr++); - } - - delete[] bit_table_; - bit_table_ = tmp; - size_list.push_back(new_table_size); - - return true; - } + compressible_bloom_filter(const std::size_t& predicted_element_count, + const double& false_positive_probability, + const std::size_t& random_seed) + : bloom_filter(predicted_element_count,false_positive_probability,random_seed) + { + size_list.push_back(table_size_); + } + + inline virtual std::size_t size() const + { + return size_list.back(); + } + + inline bool compress(const double& percentage) + { + if ((0.0 >= percentage) || (percentage >= 100.0)) + { + return false; + } + + std::size_t original_table_size = size_list.back(); + std::size_t new_table_size = static_cast((size_list.back() * (1.0 - (percentage / 100.0)))); + new_table_size -= (((new_table_size % bits_per_char) != 0) ? (new_table_size % bits_per_char) : 0); + + if ((bits_per_char > new_table_size) || (new_table_size >= original_table_size)) + { + return false; + } + + cell_type* tmp = new cell_type[new_table_size / bits_per_char]; + std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp); + cell_type* itr = bit_table_ + (new_table_size / bits_per_char); + cell_type* end = bit_table_ + (original_table_size / bits_per_char); + cell_type* itr_tmp = tmp; + + while (end != itr) + { + *(itr_tmp++) |= (*itr++); + } + + delete[] bit_table_; + bit_table_ = tmp; + size_list.push_back(new_table_size); + + return true; + } private: - inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const - { - bit_index = hash; - for (std::size_t i = 0; i < size_list.size(); ++i) - { - bit_index %= size_list[i]; - } - bit = bit_index % bits_per_char; - } - - std::vector size_list; + inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const + { + bit_index = hash; + for (std::size_t i = 0; i < size_list.size(); ++i) + { + bit_index %= size_list[i]; + } + bit = bit_index % bits_per_char; + } + + std::vector size_list; }; #endif -- cgit v1.2.1 From a2e175bf100d5318045b8bdbe94f43063ef2a638 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 19 Sep 2013 18:34:56 -0700 Subject: COPYING: make note of common/bloom_filer.hpp (boost) license Signed-off-by: Sage Weil --- COPYING | 4 ++++ debian/copyright | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/COPYING b/COPYING index 920b049b7fa..c301044a13c 100644 --- a/COPYING +++ b/COPYING @@ -18,6 +18,10 @@ Files: src/include/ceph_hash.cc Copyright: None License: Public domain +Files: src/common/bloom_filter.hpp +Copyright: Copyright (C) 2000 Arash Partow +License: Boost Software License, Version 1.0 + Files: m4/acx_pthread.m4 Copyright: Steven G. 
Johnson License: GPLWithACException diff --git a/debian/copyright b/debian/copyright index d11a0f7f5da..d9ba190f40e 100644 --- a/debian/copyright +++ b/debian/copyright @@ -19,6 +19,10 @@ Files: src/include/ceph_hash.cc Copyright: None License: Public domain +Files: src/common/bloom_filter.hpp +Copyright: Copyright (C) 2000 Arash Partow +License: Boost Software License, Version 1.0 + Files: m4/acx_pthread.m4 Copyright: Steven G. Johnson License: GPLWithACException -- cgit v1.2.1 From fea12e21e806a7c75bef8cb1013f5e0023d68d7d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 2 Oct 2013 14:14:48 -0700 Subject: COPYING: fix formatting Signed-off-by: Sage Weil --- COPYING | 60 ++++++++++++++++++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/COPYING b/COPYING index 920b049b7fa..ee3ed593eef 100644 --- a/COPYING +++ b/COPYING @@ -94,33 +94,33 @@ Copyright: Copyright (C) 2013 Cloudwatt License: LGPL2 or later Files: src/osd/ErasureCodePluginJerasure/*.{c,h} -Copyright (c) 2011, James S. Plank - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - - - Neither the name of the University of Tennessee nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. +Copyright: Copyright (c) 2011, James S. Plank +License: + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + - Neither the name of the University of Tennessee nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. -- cgit v1.2.1 From e70ea84cb903365f20bb66fbe2786643df5b521d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 2 Oct 2013 14:15:13 -0700 Subject: COPYING: add debian-style headers This may not be necessary here, but it makes this identical to the debian/copyright file, which is a win. Signed-off-by: Sage Weil --- COPYING | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/COPYING b/COPYING index ee3ed593eef..b47a28ef18f 100644 --- a/COPYING +++ b/COPYING @@ -1,3 +1,8 @@ +Format-Specification: http://svn.debian.org/wsvn/dep/web/deps/dep5.mdwn?op=file&rev=135 +Name: ceph +Maintainer: Sage Weil +Source: http://ceph.com/ + Files: * Copyright: (c) 2004-2010 by Sage Weil License: LGPL2.1 (see COPYING-LGPL2.1) -- cgit v1.2.1 From 1a56fe9935a33e50cab782479a03f2015b313725 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 2 Oct 2013 14:16:19 -0700 Subject: COPYING: add Packaging: section Again, debian-specific, but who cares. Signed-off-by: Sage Weil --- COPYING | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/COPYING b/COPYING index b47a28ef18f..18cb8a0d92b 100644 --- a/COPYING +++ b/COPYING @@ -129,3 +129,8 @@ License: LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Packaging: + Copyright (C) 2004-2009 by Sage Weil + Copyright (C) 2010 Canonical, Ltd. + Licensed under LGPL-2.1 -- cgit v1.2.1 From 11461cbeef1d2467b3ef60fa6b244a72acd92358 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 2 Oct 2013 14:18:07 -0700 Subject: debian/copyright: sync up with COPYING Signed-off-by: Sage Weil --- debian/copyright | 54 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/debian/copyright b/debian/copyright index d11a0f7f5da..18cb8a0d92b 100644 --- a/debian/copyright +++ b/debian/copyright @@ -5,7 +5,11 @@ Source: http://ceph.com/ Files: * Copyright: (c) 2004-2010 by Sage Weil -License: LGPL2.1 (see /usr/share/common-licenses/LGPL-2.1) +License: LGPL2.1 (see COPYING-LGPL2.1) + +Files: doc/* +Copyright: (c) 2010-2012 New Dream Network and contributors +License: Creative Commons Attribution-ShareAlike (CC BY-SA) Files: src/mount/canonicalize.c Copyright: Copyright (C) 1993 Rick Sladkey @@ -28,25 +32,25 @@ Copyright: Copyright 2012-2013 Intel Corporation All Rights Reserved. License: BSD 3-clause -Files: src/common/sctp_crc32.c: +Files: src/common/sctp_crc32.c: Copyright: Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. Copyright (c) 2004-2006 Intel Corporation - All Rights Reserved License: Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - + a) Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
- + b) Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - + c) Neither the name of Cisco Systems, Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. - + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -88,6 +92,44 @@ License: FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +Files: src/test/common/Throttle.cc src/test/filestore/chain_xattr.cc +Copyright: Copyright (C) 2013 Cloudwatt +License: LGPL2 or later + +Files: src/osd/ErasureCodePluginJerasure/*.{c,h} +Copyright: Copyright (c) 2011, James S. Plank +License: + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + - Neither the name of the University of Tennessee nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + Packaging: Copyright (C) 2004-2009 by Sage Weil Copyright (C) 2010 Canonical, Ltd. 
-- cgit v1.2.1 From 65ae9b8aebdac0df9731101ba28e0f0a81e62206 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 2 Oct 2013 14:30:19 -0700 Subject: COPYING: fix URL Signed-off-by: Sage Weil --- COPYING | 2 +- debian/copyright | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/COPYING b/COPYING index 18cb8a0d92b..f18b45ceec0 100644 --- a/COPYING +++ b/COPYING @@ -1,4 +1,4 @@ -Format-Specification: http://svn.debian.org/wsvn/dep/web/deps/dep5.mdwn?op=file&rev=135 +Format-Specification: http://anonscm.debian.org/viewvc/dep/web/deps/dep5/copyright-format.xml?revision=279&view=markup Name: ceph Maintainer: Sage Weil Source: http://ceph.com/ diff --git a/debian/copyright b/debian/copyright index 18cb8a0d92b..f18b45ceec0 100644 --- a/debian/copyright +++ b/debian/copyright @@ -1,4 +1,4 @@ -Format-Specification: http://svn.debian.org/wsvn/dep/web/deps/dep5.mdwn?op=file&rev=135 +Format-Specification: http://anonscm.debian.org/viewvc/dep/web/deps/dep5/copyright-format.xml?revision=279&view=markup Name: ceph Maintainer: Sage Weil Source: http://ceph.com/ -- cgit v1.2.1 From f8a947d92005a4cda42cab48ddfcd6e419c0b5d7 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 9 Aug 2013 13:43:54 +0800 Subject: client: trim deleted inode The previous patch makes the MDS send a notification to clients when an inode is deleted. When receiving such a notification, we invalidate any dentry linking to the deleted inode. If there is no other reference to the inode, the inode gets trimmed. For the cephfs fuse client, we use fuse_lowlevel_notify_inval_entry() or fuse_lowlevel_notify_delete() to notify the kernel to trim the deleted inode. (This is not completely reliable, because we play unlink/link tricks when handling MDS replies; it is difficult to keep the user-space cache and the kernel dcache in sync.) Signed-off-by: Yan, Zheng --- src/client/Client.cc | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++- src/client/Client.h | 14 +++++++++++ src/client/fuse_ll.cc | 18 ++++++++++++-- 3 files changed, 98 insertions(+), 3 deletions(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index 77fd2084cf1..285b7c543c6 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -148,9 +148,12 @@ Client::Client(Messenger *m, MonClient *mc) timer(m->cct, client_lock), ino_invalidate_cb(NULL), ino_invalidate_cb_handle(NULL), + dentry_invalidate_cb(NULL), + dentry_invalidate_cb_handle(NULL), getgroups_cb(NULL), getgroups_cb_handle(NULL), async_ino_invalidator(m->cct), + async_dentry_invalidator(m->cct), tick_event(NULL), monclient(mc), messenger(m), whoami(m->get_myname().num()), initialized(false), mounted(false), unmounting(false), @@ -410,11 +413,17 @@ void Client::shutdown() admin_socket->unregister_command("dump_cache"); if (ino_invalidate_cb) { - ldout(cct, 10) << "shutdown stopping invalidator finisher" << dendl; + ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl; async_ino_invalidator.wait_for_empty(); async_ino_invalidator.stop(); } + if (dentry_invalidate_cb) { + ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl; + async_dentry_invalidator.wait_for_empty(); + async_dentry_invalidator.stop(); + } + objectcacher->stop(); // outside of client_lock! this does a join.
client_lock.Lock(); @@ -3551,6 +3560,45 @@ void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCa m->put(); } +class C_Client_DentryInvalidate : public Context { +private: + Client *client; + vinodeno_t dirino; + vinodeno_t ino; + string name; +public: + C_Client_DentryInvalidate(Client *c, Dentry *dn) : + client(c), dirino(dn->dir->parent_inode->vino()), + ino(dn->inode->vino()), name(dn->name) { } + void finish(int r) { + client->_async_dentry_invalidate(dirino, ino, name); + } +}; + +void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name) +{ + ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino + << " in dir " << dirino << dendl; + dentry_invalidate_cb(dentry_invalidate_cb_handle, dirino, ino, name); +} + +void Client::_schedule_invalidate_dentry_callback(Dentry *dn) +{ + if (dentry_invalidate_cb && dn->inode->ll_ref > 0) + async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn)); +} + +void Client::_invalidate_inode_parents(Inode *in) +{ + set::iterator q = in->dn_set.begin(); + while (q != in->dn_set.end()) { + Dentry *dn = *q++; + // FIXME: we play lots of unlink/link tricks when handling MDS replies, + // so in->dn_set doesn't always reflect the state of kernel's dcache. + _schedule_invalidate_dentry_callback(dn); + unlink(dn, false); + } +} void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m) { @@ -3578,8 +3626,12 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient in->uid = m->head.uid; in->gid = m->head.gid; } + bool deleted_inode = false; if ((issued & CEPH_CAP_LINK_EXCL) == 0) { in->nlink = m->head.nlink; + if (in->nlink == 0 && + (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) + deleted_inode = true; } if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && m->xattrbl.length() && @@ -3633,6 +3685,10 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient if (new_caps) signal_cond_list(in->waitfor_caps); + // may drop inode's last ref + if (deleted_inode) + _invalidate_inode_parents(in); + m->put(); } @@ -6319,6 +6375,17 @@ void Client::ll_register_ino_invalidate_cb(client_ino_callback_t cb, void *handl async_ino_invalidator.start(); } +void Client::ll_register_dentry_invalidate_cb(client_dentry_callback_t cb, void *handle) +{ + Mutex::Locker l(client_lock); + ldout(cct, 10) << "ll_register_dentry_invalidate_cb cb " << (void*)cb << " p " << (void*)handle << dendl; + if (cb == NULL) + return; + dentry_invalidate_cb = cb; + dentry_invalidate_cb_handle = handle; + async_dentry_invalidator.start(); +} + void Client::ll_register_getgroups_cb(client_getgroups_callback_t cb, void *handle) { Mutex::Locker l(client_lock); diff --git a/src/client/Client.h b/src/client/Client.h index c7c9cef0e0c..61f29f39120 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -120,6 +120,9 @@ struct MetaRequest; typedef void (*client_ino_callback_t)(void *handle, vinodeno_t ino, int64_t off, int64_t len); +typedef void (*client_dentry_callback_t)(void *handle, vinodeno_t dirino, + vinodeno_t ino, string& name); + typedef int (*client_getgroups_callback_t)(void *handle, uid_t uid, gid_t **sgids); // ======================================================== @@ -211,10 +214,14 @@ class Client : public Dispatcher { client_ino_callback_t ino_invalidate_cb; void *ino_invalidate_cb_handle; + client_dentry_callback_t dentry_invalidate_cb; + void *dentry_invalidate_cb_handle; + client_getgroups_callback_t getgroups_cb; 
void *getgroups_cb_handle; Finisher async_ino_invalidator; + Finisher async_dentry_invalidator; Context *tick_event; utime_t last_cap_renew; @@ -357,6 +364,7 @@ protected: friend class C_Client_PutInode; // calls put_inode() friend class C_Client_CacheInvalidate; // calls ino_invalidate_cb + friend class C_Client_DentryInvalidate; // calls dentry_invalidate_cb //int get_cache_size() { return lru.lru_get_size(); } //void set_cache_size(int m) { lru.lru_set_max(m); } @@ -459,6 +467,10 @@ protected: void finish_cap_snap(Inode *in, CapSnap *capsnap, int used); void _flushed_cap_snap(Inode *in, snapid_t seq); + void _schedule_invalidate_dentry_callback(Dentry *dn); + void _async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name); + void _invalidate_inode_parents(Inode *in); + void _schedule_invalidate_callback(Inode *in, int64_t off, int64_t len, bool keep_caps); void _invalidate_inode_cache(Inode *in, bool keep_caps); void _invalidate_inode_cache(Inode *in, int64_t off, int64_t len, bool keep_caps); @@ -735,6 +747,8 @@ public: void ll_register_ino_invalidate_cb(client_ino_callback_t cb, void *handle); + void ll_register_dentry_invalidate_cb(client_dentry_callback_t cb, void *handle); + void ll_register_getgroups_cb(client_getgroups_callback_t cb, void *handle); }; diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc index 6bf5ea3d34f..2a25246905a 100644 --- a/src/client/fuse_ll.cc +++ b/src/client/fuse_ll.cc @@ -551,7 +551,7 @@ static int getgroups_cb(void *handle, uid_t uid, gid_t **sgids) } #endif -static void invalidate_cb(void *handle, vinodeno_t vino, int64_t off, int64_t len) +static void ino_invalidate_cb(void *handle, vinodeno_t vino, int64_t off, int64_t len) { #if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8) CephFuse::Handle *cfuse = (CephFuse::Handle *)handle; @@ -560,6 +560,19 @@ static void invalidate_cb(void *handle, vinodeno_t vino, int64_t off, int64_t le #endif } +static void dentry_invalidate_cb(void *handle, vinodeno_t dirino, + vinodeno_t ino, string& name) +{ + CephFuse::Handle *cfuse = (CephFuse::Handle *)handle; + fuse_ino_t fdirino = cfuse->make_fake_ino(dirino.ino, dirino.snapid); +#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 9) + fuse_ino_t fino = cfuse->make_fake_ino(ino.ino, ino.snapid); + fuse_lowlevel_notify_delete(cfuse->ch, fdirino, fino, name.c_str(), name.length()); +#elif FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8) + fuse_lowlevel_notify_inval_entry(cfuse->ch, fdirino, name.c_str(), name.length()); +#endif +} + static void do_init(void *data, fuse_conn_info *bar) { CephFuse::Handle *cfuse = (CephFuse::Handle *)data; @@ -743,9 +756,10 @@ int CephFuse::Handle::init(int argc, const char *argv[]) client->ll_register_getgroups_cb(getgroups_cb, this); */ + client->ll_register_dentry_invalidate_cb(dentry_invalidate_cb, this); if (client->cct->_conf->fuse_use_invalidate_cb) - client->ll_register_ino_invalidate_cb(invalidate_cb, this); + client->ll_register_ino_invalidate_cb(ino_invalidate_cb, this); done: fuse_opt_free_args(&args); -- cgit v1.2.1 From 63f5814855d36e9e79a125c9a3321cea62d9dd1c Mon Sep 17 00:00:00 2001 From: majianpeng Date: Thu, 1 Aug 2013 11:19:02 +0800 Subject: ceph: Update FUSE_USE_VERSION from 26 to 30. When compiling, we hit this error: >In file included from /usr/local/include/fuse/fuse.h:19:0, > from client/fuse_ll.cc:17: >/usr/local/include/fuse/fuse_common.h:474:4: error: #error only API >version 30 or greater is supported Update FUSE_USE_VERSION from 26 to 30.
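As an aside, the build error quoted above is purely a preprocessor check: FUSE_USE_VERSION has to be defined before the first fuse header is pulled in, or fuse_common.h rejects the API level. A minimal standalone sketch of the required ordering (illustrative only; it assumes libfuse development headers are on the include path):

  #define FUSE_USE_VERSION 30   // must precede any fuse header; 26 trips the "API version 30 or greater" check
  #include <fuse_lowlevel.h>    // assumption: header location per pkg-config fuse

  int main() {
    // nothing to do; compiling at all demonstrates the ordering is right
    return 0;
  }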
Signed-off-by: Jianpeng Ma --- fusetrace/fusetrace_ll.cc | 2 +- src/client/fuse_ll.cc | 2 +- src/rbd_fuse/rbd-fuse.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fusetrace/fusetrace_ll.cc b/fusetrace/fusetrace_ll.cc index eb7100a867f..7f2b8438f1f 100644 --- a/fusetrace/fusetrace_ll.cc +++ b/fusetrace/fusetrace_ll.cc @@ -11,7 +11,7 @@ gcc -Wall `pkg-config fuse --cflags --libs` -lulockmgr fusexmp_fh.c -o fusexmp_fh */ -#define FUSE_USE_VERSION 26 +#define FUSE_USE_VERSION 30 #ifdef HAVE_CONFIG_H #include diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc index 2a25246905a..88f727e454e 100644 --- a/src/client/fuse_ll.cc +++ b/src/client/fuse_ll.cc @@ -12,7 +12,7 @@ * */ -#define FUSE_USE_VERSION 26 +#define FUSE_USE_VERSION 30 #include #include diff --git a/src/rbd_fuse/rbd-fuse.c b/src/rbd_fuse/rbd-fuse.c index eea6edb9eb8..2a6a8d22e81 100644 --- a/src/rbd_fuse/rbd-fuse.c +++ b/src/rbd_fuse/rbd-fuse.c @@ -1,7 +1,7 @@ /* * rbd-fuse */ -#define FUSE_USE_VERSION 26 +#define FUSE_USE_VERSION 30 #include "include/int_types.h" -- cgit v1.2.1 From 721f1703a89d7ec629df2cbb368bee4bf5abfef6 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 30 Sep 2013 14:44:17 -0700 Subject: client: remove requests from closed MetaSession If we get a CLOSED message on a session, remove/kick any requests on that session before tearing it down. Otherwise, we get a crash like 2013-09-26 03:51:44.704446 7f4d35a46700 10 client.4111 kick_requests for mds.0 2013-09-26 03:51:45.014156 7f4d35a46700 -1 ./include/xlist.h: In function 'xlist::~xlist() [with T = MetaRequest*]' thread 7f4d35a46700 time 2013-09-26 03:51:44.751908 ./include/xlist.h: 69: FAILED assert(_size == 0) ceph version 0.61.5 (8ee10dc4bb73bdd918873f29c70eedc3c7ef1979) 1: (MetaSession::~MetaSession()+0x425) [0x4e0105] 2: (Client::_closed_mds_session(MetaSession*)+0x116) [0x48a696] 3: (Client::handle_client_session(MClientSession*)+0x2bb) [0x48bf5b] 4: (Client::ms_dispatch(Message*)+0x56b) [0x4bfa0b] 5: (DispatchQueue::entry()+0x3f1) [0x621b31] 6: (DispatchQueue::DispatchThread::entry()+0xd) [0x6191bd] 7: (()+0x7851) [0x7f4d3c168851] 8: (clone()+0x6d) [0x7f4d3b09d90d] Note that this can happen if we fail to reconnect to an MDS during its reconnect interval. If that happens, we probably have inodes in our cache with no caps and things are generally not going to work very well. This is but one step in improving the situation. Separate out the two methods since they share little/no behavior.
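The FAILED assert(_size == 0) in the backtrace is xlist's destructor insisting the intrusive list be empty. A toy model of that invariant (toy_xlist and ToySession are stand-ins for ceph's xlist<T> and MetaSession, not the real headers):

  #include <cassert>
  #include <cstddef>

  template <typename T>
  struct toy_xlist {                       // stand-in for xlist<T>
    std::size_t size = 0;
    ~toy_xlist() { assert(size == 0); }    // mirrors xlist.h:69 FAILED assert(_size == 0)
    void push(const T&) { ++size; }
    void remove_one() { --size; }          // analogue of item.remove_myself()
  };

  struct ToySession {                      // stand-in for MetaSession
    toy_xlist<int> requests;
  };

  int main() {
    ToySession* s = new ToySession;
    s->requests.push(1);
    // kick_requests_closed() must unlink every queued request first;
    // skip the next line and the delete below dies exactly like the crash above.
    s->requests.remove_one();
    delete s;                              // safe: the list is empty
    return 0;
  }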
Signed-off-by: Sage Weil --- src/client/Client.cc | 40 +++++++++++++++++++++++++--------------- src/client/Client.h | 3 ++- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index 285b7c543c6..60a5e4550b8 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -1541,7 +1541,7 @@ void Client::_closed_mds_session(MetaSession *s) signal_context_list(s->waiting_for_open); mount_cond.Signal(); remove_session_caps(s); - kick_requests(s, true); + kick_requests_closed(s); mds_sessions.erase(s->mds_num); delete s; } @@ -1914,7 +1914,7 @@ void Client::handle_mds_map(MMDSMap* m) if (newstate >= MDSMap::STATE_ACTIVE) { if (oldstate < MDSMap::STATE_ACTIVE) { - kick_requests(p->second, false); + kick_requests(p->second); kick_flushing_caps(p->second); signal_context_list(p->second->waiting_for_open); kick_maxsize_requests(p->second); @@ -1998,25 +1998,16 @@ void Client::send_reconnect(MetaSession *session) } -void Client::kick_requests(MetaSession *session, bool signal) +void Client::kick_requests(MetaSession *session) { ldout(cct, 10) << "kick_requests for mds." << session->mds_num << dendl; - for (map::iterator p = mds_requests.begin(); p != mds_requests.end(); - ++p) + ++p) { if (p->second->mds == session->mds_num) { - if (signal) { - // only signal caller if there is a caller - // otherwise, let resend_unsafe handle it - if (p->second->caller_cond) { - p->second->kick = true; - p->second->caller_cond->Signal(); - } - } else { - send_request(p->second, session); - } + send_request(p->second, session); } + } } void Client::resend_unsafe_requests(MetaSession *session) @@ -2027,6 +2018,25 @@ void Client::resend_unsafe_requests(MetaSession *session) send_request(*iter, session); } +void Client::kick_requests_closed(MetaSession *session) +{ + ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl; + for (map::iterator p = mds_requests.begin(); + p != mds_requests.end(); + ++p) { + if (p->second->mds == session->mds_num) { + if (p->second->caller_cond) { + p->second->kick = true; + p->second->caller_cond->Signal(); + } + p->second->item.remove_myself(); + p->second->unsafe_item.remove_myself(); + } + } + assert(session->requests.empty()); + assert(session->unsafe_requests.empty()); +} + diff --git a/src/client/Client.h b/src/client/Client.h index 61f29f39120..df59f235de4 100644 --- a/src/client/Client.h +++ b/src/client/Client.h @@ -277,7 +277,8 @@ public: void connect_mds_targets(int mds); void send_request(MetaRequest *request, MetaSession *session); MClientRequest *build_client_request(MetaRequest *request); - void kick_requests(MetaSession *session, bool signal); + void kick_requests(MetaSession *session); + void kick_requests_closed(MetaSession *session); void handle_client_request_forward(MClientRequestForward *reply); void handle_client_reply(MClientReply *reply); -- cgit v1.2.1 From 22f8325dbfce7ef2e97bf015c0f8bba53e75dfe9 Mon Sep 17 00:00:00 2001 From: git-harry Date: Thu, 3 Oct 2013 10:32:50 +0100 Subject: Make fsid comparison case-insensitive get_fsid and find_cluster_by_uuid are modified so ceph-disk activate and ceph-disk activate-all will work if the fsid uses uppercase characters. 
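The underlying rule is normalize-then-compare: lowercase both uuids before testing equality. ceph-disk does this in Python; the same idea sketched in C++ for reference (lower() and same_fsid() are made-up helpers, not part of the tree):

  #include <algorithm>
  #include <cctype>
  #include <string>

  static std::string lower(std::string s) {
    std::transform(s.begin(), s.end(), s.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
    return s;
  }

  // Case-insensitive fsid equality, as get_fsid()/find_cluster_by_uuid() now behave.
  bool same_fsid(const std::string& a, const std::string& b) {
    return lower(a) == lower(b);
  }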
Signed-off-by: Harry Harrington --- src/ceph-disk | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/ceph-disk b/src/ceph-disk index 939f65b85dd..64d944d9db0 100755 --- a/src/ceph-disk +++ b/src/ceph-disk @@ -570,7 +570,7 @@ def get_fsid(cluster): fsid = get_conf(cluster=cluster, variable='fsid') if fsid is None: raise Error('getting cluster uuid from configuration failed') - return fsid + return fsid.lower() def get_or_create_dmcrypt_key( @@ -1601,6 +1601,7 @@ def find_cluster_by_uuid(_uuid): Find a cluster name by searching /etc/ceph/*.conf for a conf file with the right uuid. """ + _uuid = _uuid.lower() no_fsid = [] if not os.path.exists('/etc/ceph'): return None @@ -1608,11 +1609,15 @@ def find_cluster_by_uuid(_uuid): if not conf_file.endswith('.conf'): continue cluster = conf_file[:-5] - fsid = get_conf(cluster, 'fsid') - if fsid is None: + try: + fsid = get_fsid(cluster) + except Error as e: + if e.message != 'getting cluster uuid from configuration failed': + raise e no_fsid.append(cluster) - elif fsid == _uuid: - return cluster + else: + if fsid == _uuid: + return cluster # be tolerant of /etc/ceph/ceph.conf without an fsid defined. if len(no_fsid) == 1 and no_fsid[0] == 'ceph': LOG.warning('No fsid defined in /etc/ceph/ceph.conf; using anyway') -- cgit v1.2.1 From 0122ee993e014dc2add67978e0a37902fa60b149 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 19 Sep 2013 20:02:50 -0700 Subject: common/bloom_filter: insert/contains methods for uint32_t This will let us pass in an hobject_t::hash directly (for example) without rehashing a string. Signed-off-by: Sage Weil --- src/common/bloom_filter.hpp | 35 +++++++++++++++++++++++++++++++++ src/test/common/test_bloom_filter.cc | 38 ++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/src/common/bloom_filter.hpp b/src/common/bloom_filter.hpp index 15400b14b9e..6d5f645d8c9 100644 --- a/src/common/bloom_filter.hpp +++ b/src/common/bloom_filter.hpp @@ -131,6 +131,17 @@ public: inserted_element_count_ = 0; } + inline void insert(uint32_t val) { + std::size_t bit_index = 0; + std::size_t bit = 0; + for (std::size_t i = 0; i < salt_.size(); ++i) + { + compute_indices(hash_ap(val,salt_[i]),bit_index,bit); + bit_table_[bit_index / bits_per_char] |= bit_mask[bit]; + } + ++inserted_element_count_; + } + inline void insert(const unsigned char* key_begin, const std::size_t& length) { std::size_t bit_index = 0; @@ -170,6 +181,21 @@ public: } } + inline virtual bool contains(uint32_t val) const + { + std::size_t bit_index = 0; + std::size_t bit = 0; + for (std::size_t i = 0; i < salt_.size(); ++i) + { + compute_indices(hash_ap(val,salt_[i]),bit_index,bit); + if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit]) + { + return false; + } + } + return true; + } + inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const { std::size_t bit_index = 0; @@ -425,6 +451,15 @@ protected: *table_size = t; } + inline bloom_type hash_ap(uint32_t val, bloom_type hash) const + { + hash ^= (hash << 7) ^ ((val & 0xff000000) >> 24) * (hash >> 3); + hash ^= (~((hash << 11) + (((val & 0xff0000) >> 16) ^ (hash >> 5)))); + hash ^= (hash << 7) ^ ((val & 0xff00) >> 8) * (hash >> 3); + hash ^= (~((hash << 11) + (((val & 0xff)) ^ (hash >> 5)))); + return hash; + } + inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const { const unsigned char* itr = begin; diff --git a/src/test/common/test_bloom_filter.cc 
b/src/test/common/test_bloom_filter.cc index 66bda6bcd33..5df2c15eb07 100644 --- a/src/test/common/test_bloom_filter.cc +++ b/src/test/common/test_bloom_filter.cc @@ -61,6 +61,44 @@ TEST(BloomFilter, Sweep) { } } +TEST(BloomFilter, SweepInt) { + std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl; + for (int ex = 3; ex < 12; ex++) { + for (float fpp = .001; fpp < .5; fpp *= 2.0) { + int max = 2 << ex; + bloom_filter bf(max, fpp, 1); + bf.insert("foo"); + bf.insert("bar"); + + ASSERT_TRUE(123); + ASSERT_TRUE(456); + + for (int n = 0; n < max; n++) + bf.insert(n); + + int test = max * 100; + int hit = 0; + for (int n = 0; n < test; n++) + if (bf.contains(100000 + n)) + hit++; + + ASSERT_TRUE(123); + ASSERT_TRUE(456); + + double actual = (double)hit / (double)test; + + bufferlist bl; + ::encode(bf, bl); + + double byte_per_insert = (double)bl.length() / (double)max; + + std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert << std::endl; + ASSERT_TRUE(actual < fpp * 10); + ASSERT_TRUE(actual > fpp / 10); + } + } +} + // test the fpp over a sequence of bloom filters, each with unique // items inserted into it. // -- cgit v1.2.1 From 0a69baeb3dd0bd85500ab1ca10d64e9c25e24356 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 25 Sep 2013 14:20:21 -0700 Subject: common/bloom_filter: disable sequential tests These are slow and are not useful. Signed-off-by: Sage Weil --- src/test/common/test_bloom_filter.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/test/common/test_bloom_filter.cc b/src/test/common/test_bloom_filter.cc index 5df2c15eb07..d074407a851 100644 --- a/src/test/common/test_bloom_filter.cc +++ b/src/test/common/test_bloom_filter.cc @@ -99,6 +99,12 @@ TEST(BloomFilter, SweepInt) { } } + +// disable these tests; doing dual insertions in consecutive filters +// appears to be equivalent to doing a single insertion in a bloom +// filter that is twice as big. +#if 0 + // test the fpp over a sequence of bloom filters, each with unique // items inserted into it. // @@ -176,3 +182,5 @@ TEST(BloomFilter, SequenceDouble) { << " expected " << (fpp*fpp*(double)seq) << std::endl; } } + +#endif -- cgit v1.2.1 From 4b23b653788bec82b10c4163006674e4158e3f3c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 25 Sep 2013 17:43:40 -0700 Subject: common/bloom_filter: test binning fpp behavior Signed-off-by: Sage Weil --- src/test/common/test_bloom_filter.cc | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/test/common/test_bloom_filter.cc b/src/test/common/test_bloom_filter.cc index d074407a851..f0714cb530c 100644 --- a/src/test/common/test_bloom_filter.cc +++ b/src/test/common/test_bloom_filter.cc @@ -100,6 +100,42 @@ TEST(BloomFilter, SweepInt) { } +TEST(BloomFilter, BinSweep) { + int total_max = 16384; + float total_fpp = .01; + std::cout << "total_inserts " << total_max << " target-fpp " << total_fpp << std::endl; + for (int bins = 1; bins < 16; ++bins) { + int max = total_max / bins; + float fpp = total_fpp / bins;//pow(total_fpp, bins); + + std::vector ls; + bufferlist bl; + for (int i=0; iinsert(10000 * (i+1) + j); + } + ::encode(*ls.front(), bl); + } + + int hit = 0; + int test = max * 100; + for (int i=0; i::iterator j = ls.begin(); j != ls.end(); ++j) { + if ((*j)->contains(i * 732)) { // note: sequential i does not work here; the intenral int hash is weak!! 
+ hit++; + break; + } + } + } + + double actual = (double)hit / (double)test; + std::cout << "bins " << bins << " bin-max " << max << " bin-fpp " << fpp + << " actual-fpp " << actual + << " total-size " << bl.length() << std::endl; + } +} + // disable these tests; doing dual insertions in consecutive filters // appears to be equivalent to doing a single insertion in a bloom // filter that is twice as big. -- cgit v1.2.1 From 9299f501ea07fce83dcd03cb8e6c9fec5496de57 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Oct 2013 09:14:34 -0700 Subject: common/bloom_filter: speed up unit tests a bit Signed-off-by: Sage Weil --- src/test/common/test_bloom_filter.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/test/common/test_bloom_filter.cc b/src/test/common/test_bloom_filter.cc index f0714cb530c..8e3661b2cc1 100644 --- a/src/test/common/test_bloom_filter.cc +++ b/src/test/common/test_bloom_filter.cc @@ -25,8 +25,8 @@ TEST(BloomFilter, Basic) { TEST(BloomFilter, Sweep) { std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl; - for (int ex = 3; ex < 12; ex++) { - for (float fpp = .001; fpp < .5; fpp *= 2.0) { + for (int ex = 3; ex < 12; ex += 2) { + for (float fpp = .001; fpp < .5; fpp *= 4.0) { int max = 2 << ex; bloom_filter bf(max, fpp, 1); bf.insert("foo"); @@ -63,8 +63,8 @@ TEST(BloomFilter, Sweep) { TEST(BloomFilter, SweepInt) { std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl; - for (int ex = 3; ex < 12; ex++) { - for (float fpp = .001; fpp < .5; fpp *= 2.0) { + for (int ex = 3; ex < 12; ex += 2) { + for (float fpp = .001; fpp < .5; fpp *= 4.0) { int max = 2 << ex; bloom_filter bf(max, fpp, 1); bf.insert("foo"); -- cgit v1.2.1 From 8cfeb8342a08774fb1030830859c3fc30514f0b5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Oct 2013 09:20:34 -0700 Subject: common/bloom_filter: note that uint32_t interface requires well-mixed values Signed-off-by: Sage Weil --- src/common/bloom_filter.hpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/common/bloom_filter.hpp b/src/common/bloom_filter.hpp index 6d5f645d8c9..6216c7fb34d 100644 --- a/src/common/bloom_filter.hpp +++ b/src/common/bloom_filter.hpp @@ -131,6 +131,15 @@ public: inserted_element_count_ = 0; } + /** + * insert a u32 into the set + * + * NOTE: the internal hash is weak enough that consecutive inputs do + * not achieve the desired fpp. Well-mixed values should be used + * here (e.g., put rjhash(x) into the filter instead of just x). + * + * @param val integer value to insert + */ inline void insert(uint32_t val) { std::size_t bit_index = 0; std::size_t bit = 0; @@ -181,6 +190,16 @@ public: } } + /** + * check if a u32 is contained by set + * + * NOTE: the internal hash is weak enough that consecutive inputs do + * not achieve the desired fpp. Well-mixed values should be used + * here (e.g., put rjhash(x) into the filter instead of just x). + * + * @param val integer value to query + * @returns true if value is (probably) in the set, false if it definitely is not + */ inline virtual bool contains(uint32_t val) const { std::size_t bit_index = 0; -- cgit v1.2.1 From dce3d26d84f140d01a968dc0a615372c1cb8f7f3 Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Thu, 3 Oct 2013 19:07:12 +0100 Subject: mon: MonmapMonitor: make 'ceph mon add' idempotent MonMap changes lead to bootstraps. Callbacks waiting for a proposal to finish can have several fates, depending on what happens: finished, rerun or aborted. 
In the case of a bootstrap right after a monmap change, callbacks are rerun. Considering we queued the message that led to the monmap change on this queue, if we end up rerunning it instead of finishing it, we will end up trying to perform the same modification twice -- the last one will try to modify an already existing state and we will return just that: whatever you're attempting to do has already been done. This patch makes 'ceph mon add' completely idempotent. If one tries to add an already existing monitor (i.e., same name, same ip:port), one simply gets a 'monitor foo added', with return 0, no matter how many times one runs the command. Fixes: #5896 Signed-off-by: Joao Eduardo Luis --- src/mon/MonmapMonitor.cc | 39 ++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc index 799f19df154..ca855592445 100644 --- a/src/mon/MonmapMonitor.cc +++ b/src/mon/MonmapMonitor.cc @@ -298,20 +298,45 @@ bool MonmapMonitor::prepare_command(MMonCommand *m) addr.set_port(CEPH_MON_PORT); } - if (pending_map.contains(addr) || - pending_map.contains(name)) { + /** + * If we have a monitor with the same name and different addr, then EEXIST + * If we have a monitor with the same addr and different name, then EEXIST + * If we have a monitor with the same addr and same name, then return as if + * we had just added the monitor. + * If we don't have the monitor, add it. + */ + + err = 0; + if (!ss.str().empty()) + ss << "; "; + + do { + if (pending_map.contains(addr)) { + string n = pending_map.get_name(addr); + if (n == name) + break; + } else if (pending_map.contains(name)) { + entity_addr_t tmp_addr = pending_map.get_addr(name); + if (tmp_addr == addr) + break; + } else { + break; + } err = -EEXIST; - if (!ss.str().empty()) - ss << "; "; - ss << "mon " << name << " " << addr << " already exists"; + ss << "mon." << name << " at " << addr << " already exists"; + goto out; + } while (false); + + ss << "added mon." << name << " at " << addr; + if (pending_map.contains(name)) { goto out; } pending_map.add(name, addr); pending_map.last_changed = ceph_clock_now(g_ceph_context); - ss << "added mon." << name << " at " << addr; getline(ss, rs); - wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, get_last_committed())); + wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, + get_last_committed())); return true; } else if (prefix == "mon remove") { -- cgit v1.2.1 From ed1a54ecd3be6fed3410b50f318086967d10ceda Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Wed, 2 Oct 2013 01:50:29 +0100 Subject: mon: Monitor: assert on absence of connection during dispatch We expect to always have a connection associated with a message. If that happens to not be so, assert.
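For reference, the decision table in the 'ceph mon add' patch above reduces to a handful of cases. A compact sketch, with a plain name-to-addr map standing in for the pending MonMap (types and names are illustrative only):

  #include <map>
  #include <string>

  enum class AddResult { Added, AlreadyPresent, Conflict /* maps to -EEXIST */ };

  AddResult add_mon(std::map<std::string, std::string>& mons,   // name -> addr
                    const std::string& name, const std::string& addr) {
    auto it = mons.find(name);
    if (it != mons.end())
      return it->second == addr ? AddResult::AlreadyPresent     // same name+addr: idempotent
                                : AddResult::Conflict;          // same name, different addr
    for (const auto& m : mons)
      if (m.second == addr)
        return AddResult::Conflict;                             // same addr, different name
    mons[name] = addr;                                          // genuinely new: add it
    return AddResult::Added;
  }
  // Both Added and AlreadyPresent report "added mon.<name>" with rc 0.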
Signed-off-by: Joao Eduardo Luis --- src/mon/Monitor.cc | 84 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 44 insertions(+), 40 deletions(-) diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 10f5bfb149c..54ea2f41fce 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -2561,53 +2561,57 @@ bool Monitor::_ms_dispatch(Message *m) EntityName entity_name; bool src_is_mon; - src_is_mon = !connection || (connection->get_peer_type() & CEPH_ENTITY_TYPE_MON); - - if (connection) { - bool reuse_caps = false; - dout(20) << "have connection" << dendl; - s = static_cast(connection->get_priv()); - if (s && s->closed) { - caps = s->caps; - reuse_caps = true; - s->put(); - s = NULL; + // regardless of who we are or who the sender is, the message must + // have a connection associated. If it doesn't then something fishy + // is going on. + assert(connection); + + src_is_mon = (connection->get_peer_type() & CEPH_ENTITY_TYPE_MON); + + bool reuse_caps = false; + dout(20) << "have connection" << dendl; + s = static_cast(connection->get_priv()); + if (s && s->closed) { + caps = s->caps; + reuse_caps = true; + s->put(); + s = NULL; + } + if (!s) { + if (!exited_quorum.is_zero() && !src_is_mon) { + waitlist_or_zap_client(m); + return true; } - if (!s) { - if (!exited_quorum.is_zero() && !src_is_mon) { - waitlist_or_zap_client(m); - return true; - } - dout(10) << "do not have session, making new one" << dendl; - s = session_map.new_session(m->get_source_inst(), m->get_connection().get()); - m->get_connection()->set_priv(s->get()); - dout(10) << "ms_dispatch new session " << s << " for " << s->inst << dendl; - - if (m->get_connection()->get_peer_type() != CEPH_ENTITY_TYPE_MON) { - dout(10) << "setting timeout on session" << dendl; - // set an initial timeout here, so we will trim this session even if they don't - // do anything. - s->until = ceph_clock_now(g_ceph_context); - s->until += g_conf->mon_subscribe_interval; - } else { - //give it monitor caps; the peer type has been authenticated - reuse_caps = false; - dout(5) << "setting monitor caps on this connection" << dendl; - if (!s->caps.is_allow_all()) //but no need to repeatedly copy - s->caps = *mon_caps; - } - if (reuse_caps) - s->caps = caps; + dout(10) << "do not have session, making new one" << dendl; + s = session_map.new_session(m->get_source_inst(), m->get_connection().get()); + m->get_connection()->set_priv(s->get()); + dout(10) << "ms_dispatch new session " << s << " for " << s->inst << dendl; + + if (m->get_connection()->get_peer_type() != CEPH_ENTITY_TYPE_MON) { + dout(10) << "setting timeout on session" << dendl; + // set an initial timeout here, so we will trim this session even if they don't + // do anything. 
+ s->until = ceph_clock_now(g_ceph_context); + s->until += g_conf->mon_subscribe_interval; } else { - dout(20) << "ms_dispatch existing session " << s << " for " << s->inst << dendl; + //give it monitor caps; the peer type has been authenticated + reuse_caps = false; + dout(5) << "setting monitor caps on this connection" << dendl; + if (!s->caps.is_allow_all()) //but no need to repeatedly copy + s->caps = *mon_caps; } + if (reuse_caps) + s->caps = caps; + } else { + dout(20) << "ms_dispatch existing session " << s << " for " << s->inst << dendl; + } + + if (s) { if (s->auth_handler) { entity_name = s->auth_handler->get_entity_name(); } - } - - if (s) dout(20) << " caps " << s->caps.get_str() << dendl; + } if (is_synchronizing() && !src_is_mon) { waitlist_or_zap_client(m); -- cgit v1.2.1 From d0d61b488a5eaf84bb115954272fb61735d505d2 Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Wed, 2 Oct 2013 01:54:09 +0100 Subject: mon: Monitor: drop client msg if no session exists and msg is not MAuth If we are not a monitor and we don't have a session yet, we must first authenticate with the cluster. Therefore, the first message to the monitor must be an MAuth. If not, we assume it's a stray message and just drop it. Signed-off-by: Joao Eduardo Luis --- src/mon/Monitor.cc | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index 54ea2f41fce..cb19dd4b25c 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -2578,10 +2578,23 @@ bool Monitor::_ms_dispatch(Message *m) s = NULL; } if (!s) { + // if the sender is not a monitor, make sure their first message for a + // session is an MAuth. If it is not, assume it's a stray message, + // and considering that we are creating a new session it is safe to + // assume that the sender hasn't authenticated yet, so we have no way + // of assessing whether we should handle it or not. + if (!src_is_mon && m->get_type() != CEPH_MSG_AUTH) { + dout(1) << __func__ << " dropping stray message " << *m + << " from " << m->get_source_inst() << dendl; + m->put(); + return false; + } + if (!exited_quorum.is_zero() && !src_is_mon) { waitlist_or_zap_client(m); return true; } + dout(10) << "do not have session, making new one" << dendl; s = session_map.new_session(m->get_source_inst(), m->get_connection().get()); m->get_connection()->set_priv(s->get()); -- cgit v1.2.1 From b8a148804d68f20aa7db8149b4363dc512faf23e Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Wed, 2 Oct 2013 01:56:55 +0100 Subject: mon: Monitor: dissociate msg handling from session & connection logic Makes for simpler logic for patches to come. 
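The MAuth-first rule introduced above is essentially a small gate in front of session creation. A simplified sketch (Incoming and MsgType are stand-ins, not the real Message hierarchy):

  enum class MsgType { Auth, Other };

  struct Incoming {
    MsgType type;
    bool src_is_mon;
    bool has_session;
  };

  // True if dispatch may proceed to create a session for this message.
  bool may_create_session(const Incoming& m) {
    if (m.has_session || m.src_is_mon)
      return true;                     // peers and known sessions pass through
    return m.type == MsgType::Auth;    // a client's first message must be MAuth
  }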
Signed-off-by: Joao Eduardo Luis --- src/mon/Monitor.cc | 109 +++++++++++++++++++++++++++++------------------------ src/mon/Monitor.h | 2 + 2 files changed, 61 insertions(+), 50 deletions(-) diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index cb19dd4b25c..baf6c035b7d 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -2631,14 +2631,28 @@ bool Monitor::_ms_dispatch(Message *m) return true; } - { - switch (m->get_type()) { - + ret = dispatch(s, m, src_is_mon); + + if (s) { + s->put(); + } + + return ret; +} + +bool Monitor::dispatch(MonSession *s, Message *m, const bool src_is_mon) +{ + bool ret = true; + + assert(m != NULL); + + switch (m->get_type()) { + case MSG_ROUTE: handle_route(static_cast(m)); break; - // misc + // misc case CEPH_MSG_MON_GET_MAP: handle_mon_get_map(static_cast(m)); break; @@ -2664,12 +2678,11 @@ bool Monitor::_ms_dispatch(Message *m) case MSG_MON_SYNC: handle_sync(static_cast(m)); break; - case MSG_MON_SCRUB: handle_scrub(static_cast(m)); break; - // OSDs + // OSDs case MSG_OSD_MARK_ME_DOWN: case MSG_OSD_FAILURE: case MSG_OSD_BOOT: @@ -2682,20 +2695,20 @@ bool Monitor::_ms_dispatch(Message *m) paxos_service[PAXOS_OSDMAP]->dispatch((PaxosServiceMessage*)m); break; - // MDSs + // MDSs case MSG_MDS_BEACON: case MSG_MDS_OFFLOAD_TARGETS: paxos_service[PAXOS_MDSMAP]->dispatch((PaxosServiceMessage*)m); break; - // auth + // auth case MSG_MON_GLOBAL_ID: case CEPH_MSG_AUTH: /* no need to check caps here */ paxos_service[PAXOS_AUTH]->dispatch((PaxosServiceMessage*)m); break; - // pg + // pg case CEPH_MSG_STATFS: case MSG_PGSTATS: case MSG_GETPOOLSTATS: @@ -2706,7 +2719,7 @@ bool Monitor::_ms_dispatch(Message *m) paxos_service[PAXOS_OSDMAP]->dispatch((PaxosServiceMessage*)m); break; - // log + // log case MSG_LOG: paxos_service[PAXOS_LOG]->dispatch((PaxosServiceMessage*)m); break; @@ -2715,60 +2728,60 @@ bool Monitor::_ms_dispatch(Message *m) clog.handle_log_ack((MLogAck*)m); break; - // monmap + // monmap case MSG_MON_JOIN: paxos_service[PAXOS_MONMAP]->dispatch((PaxosServiceMessage*)m); break; - // paxos + // paxos case MSG_MON_PAXOS: { - MMonPaxos *pm = static_cast(m); - if (!src_is_mon && - !s->is_capable("mon", MON_CAP_X)) { - //can't send these! - pm->put(); - break; - } + MMonPaxos *pm = static_cast(m); + if (!src_is_mon && + !s->is_capable("mon", MON_CAP_X)) { + //can't send these! + pm->put(); + break; + } - if (state == STATE_SYNCHRONIZING) { - // we are synchronizing. These messages would do us no - // good, thus just drop them and ignore them. - dout(10) << __func__ << " ignore paxos msg from " - << pm->get_source_inst() << dendl; - pm->put(); - break; - } + if (state == STATE_SYNCHRONIZING) { + // we are synchronizing. These messages would do us no + // good, thus just drop them and ignore them. + dout(10) << __func__ << " ignore paxos msg from " + << pm->get_source_inst() << dendl; + pm->put(); + break; + } - // sanitize - if (pm->epoch > get_epoch()) { - bootstrap(); - pm->put(); - break; - } - if (pm->epoch != get_epoch()) { - pm->put(); - break; - } + // sanitize + if (pm->epoch > get_epoch()) { + bootstrap(); + pm->put(); + break; + } + if (pm->epoch != get_epoch()) { + pm->put(); + break; + } - paxos->dispatch((PaxosServiceMessage*)m); + paxos->dispatch((PaxosServiceMessage*)m); } break; - // elector messages + // elector messages case MSG_MON_ELECTION: //check privileges here for simplicity if (s && - !s->is_capable("mon", MON_CAP_X)) { - dout(0) << "MMonElection received from entity without enough caps!" 
- << s->caps << dendl; - m->put(); - break; + !s->is_capable("mon", MON_CAP_X)) { + dout(0) << "MMonElection received from entity without enough caps!" + << s->caps << dendl; + m->put(); + break; } if (!is_probing() && !is_synchronizing()) { - elector.dispatch(m); + elector.dispatch(m); } else { - m->put(); + m->put(); } break; @@ -2786,10 +2799,6 @@ bool Monitor::_ms_dispatch(Message *m) default: ret = false; - } - } - if (s) { - s->put(); } return ret; diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h index df4a751361a..19adf20c474 100644 --- a/src/mon/Monitor.h +++ b/src/mon/Monitor.h @@ -700,6 +700,8 @@ public: lock.Unlock(); return ret; } + // dissociate message handling from session and connection logic + bool dispatch(MonSession *s, Message *m, const bool src_is_mon); //mon_caps is used for un-connected messages from monitors MonCap * mon_caps; bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new); -- cgit v1.2.1 From 29cf2ff02aa921ba619367158d68c579c337600e Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Mon, 23 Sep 2013 11:34:05 +0100 Subject: mon: Monitor: only handle paxos msgs from a mon && if session has exec caps Signed-off-by: Joao Eduardo Luis --- src/mon/Monitor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index baf6c035b7d..d60ffceb87a 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -2737,7 +2737,7 @@ bool Monitor::dispatch(MonSession *s, Message *m, const bool src_is_mon) case MSG_MON_PAXOS: { MMonPaxos *pm = static_cast(m); - if (!src_is_mon && + if (!src_is_mon || !s->is_capable("mon", MON_CAP_X)) { //can't send these! pm->put(); -- cgit v1.2.1 From f1e23937a6386c7d1e23af115098fa81a2ca4230 Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Wed, 2 Oct 2013 01:59:11 +0100 Subject: mon: Monitor: reuse 'src_is_mon' bool on dispatch Instead of making the same comparison twice with no gain at all. Signed-off-by: Joao Eduardo Luis --- src/mon/Monitor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index d60ffceb87a..42cf3fa661d 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -2600,7 +2600,7 @@ bool Monitor::_ms_dispatch(Message *m) m->get_connection()->set_priv(s->get()); dout(10) << "ms_dispatch new session " << s << " for " << s->inst << dendl; - if (m->get_connection()->get_peer_type() != CEPH_ENTITY_TYPE_MON) { + if (!src_is_mon) { dout(10) << "setting timeout on session" << dendl; // set an initial timeout here, so we will trim this session even if they don't // do anything. -- cgit v1.2.1 From afb4d83d0cdb8fce229ef4943eb6acf64403e5a8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Oct 2013 12:16:55 -0700 Subject: librados: drop #include of int_types.h from installed headers These are unnecessary, and break compilation for outside users. Prefer inttypes.h over stdint.h.
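The portability point: installed headers should depend only on the standard fixed-width typedefs, never on ceph-internal ones like __u32. A sketch of the preferred style (assumes a C11/C++11 toolchain, where inttypes.h also provides the PRI* format macros):

  #include <inttypes.h>   // preferred here over <stdint.h>
  #include <stdio.h>

  uint32_t crc_seed(void) {                 // was __u32 in the installed headers
    uint32_t crc = 0;
    printf("seed=%" PRIu32 "\n", crc);      // portable formatting, no kernel-style types
    return crc;
  }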
Signed-off-by: Sage Weil --- src/include/buffer.h | 9 ++++----- src/include/crc32c.h | 3 +-- src/include/rados/librados.h | 6 ++---- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/include/buffer.h b/src/include/buffer.h index f4a2f5c3264..ffa3d6e1b97 100644 --- a/src/include/buffer.h +++ b/src/include/buffer.h @@ -14,8 +14,6 @@ #ifndef CEPH_BUFFER_H #define CEPH_BUFFER_H -#include "include/int_types.h" - #if defined(__linux__) #include #include @@ -46,6 +44,7 @@ void *valloc(size_t); #include #endif +#include #include #include @@ -420,7 +419,7 @@ public: ssize_t read_fd(int fd, size_t len); int write_file(const char *fn, int mode=0644); int write_fd(int fd) const; - __u32 crc32c(__u32 crc) const; + uint32_t crc32c(uint32_t crc) const; }; /* @@ -428,7 +427,7 @@ public: */ class hash { - __u32 crc; + uint32_t crc; public: hash() : crc(0) { } @@ -437,7 +436,7 @@ public: crc = bl.crc32c(crc); } - __u32 digest() { + uint32_t digest() { return crc; } }; diff --git a/src/include/crc32c.h b/src/include/crc32c.h index 8e22c624636..49d68474d68 100644 --- a/src/include/crc32c.h +++ b/src/include/crc32c.h @@ -1,8 +1,7 @@ #ifndef CEPH_CRC32C_H #define CEPH_CRC32C_H -#include "include/int_types.h" - +#include #include typedef uint32_t (*ceph_crc32c_func_t)(uint32_t crc, unsigned char const *data, unsigned length); diff --git a/src/include/rados/librados.h b/src/include/rados/librados.h index a85ef3057bc..515663c2335 100644 --- a/src/include/rados/librados.h +++ b/src/include/rados/librados.h @@ -1,8 +1,6 @@ #ifndef CEPH_LIBRADOS_H #define CEPH_LIBRADOS_H -#include "include/int_types.h" - #ifdef __cplusplus extern "C" { #endif @@ -566,7 +564,7 @@ int rados_pool_create_with_auid(rados_t cluster, const char *pool_name, uint64_t * @returns 0 on success, negative error code on failure */ int rados_pool_create_with_crush_rule(rados_t cluster, const char *pool_name, - __u8 crush_rule_num); + uint8_t crush_rule_num); /** * Create a pool with a specific CRUSH rule and auid @@ -581,7 +579,7 @@ int rados_pool_create_with_crush_rule(rados_t cluster, const char *pool_name, * @returns 0 on success, negative error code on failure */ int rados_pool_create_with_all(rados_t cluster, const char *pool_name, uint64_t auid, - __u8 crush_rule_num); + uint8_t crush_rule_num); /** * Delete a pool and all data inside it -- cgit v1.2.1 From c19935cd09ad1f821c7648d2e08eb7dcce0b7f38 Mon Sep 17 00:00:00 2001 From: John Wilkins Date: Thu, 3 Oct 2013 13:30:18 -0700 Subject: doc: Fixed typo. Signed-off-by: John Wilkins --- doc/architecture.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/architecture.rst b/doc/architecture.rst index 9f57bbbd58a..988475f53b6 100644 --- a/doc/architecture.rst +++ b/doc/architecture.rst @@ -387,7 +387,7 @@ steps to compute PG IDs. #. CRUSH calculates the hash modulo the number of OSDs. (e.g., ``0x58``) to get a PG ID. #. CRUSH gets the pool ID given the pool name (e.g., "liverpool" = ``4``) -#. CRUSH prepends the pool ID to the pool ID to the PG ID (e.g., ``4.0x58``). +#. CRUSH prepends the pool ID to the PG ID (e.g., ``4.0x58``). Computing object locations is much faster than performing object location query over a chatty session. The :abbr:`CRUSH (Controlled Replication Under Scalable -- cgit v1.2.1 From 10335883a9e3d65ce546ee14890c27a53b26ef2c Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Thu, 3 Oct 2013 15:16:09 -0700 Subject: qa: fix rbd cli tests checking size b43bc1a0b0692818d789f9f489b9aba5dd40522f changed the kilo prefix from K to k in a few places. 
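The test churn that follows chases a pure formatting change: sizes are now printed with a lower-case k per SI convention. A sketch of what the updated greps expect (fmt_kb is a made-up helper, not ceph's actual si_t formatter):

  #include <stdio.h>
  #include <string>

  // Format a byte count in kilobytes with the SI-style lower-case 'k'.
  std::string fmt_kb(unsigned long long bytes) {
    char buf[32];
    snprintf(buf, sizeof(buf), "%lluk", bytes / 1024ULL);   // "1024k", not "1024K"
    return buf;
  }
  // e.g. fmt_kb(1048576) == "1024k", matching the updated greps in copy.sh.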
Signed-off-by: Josh Durgin --- qa/workunits/rbd/copy.sh | 12 ++++++------ qa/workunits/rbd/import_export.sh | 8 ++++---- src/test/cli-integration/rbd/formatted-output.t | 22 +++++++++++----------- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/qa/workunits/rbd/copy.sh b/qa/workunits/rbd/copy.sh index 8430fca7665..7abb3956c88 100755 --- a/qa/workunits/rbd/copy.sh +++ b/qa/workunits/rbd/copy.sh @@ -109,8 +109,8 @@ test_ls() { rbd ls | grep test2 rbd ls | wc -l | grep 2 # look for fields in output of ls -l without worrying about space - rbd ls -l | grep 'test1.*1024K.*1' - rbd ls -l | grep 'test2.*1024K.*1' + rbd ls -l | grep 'test1.*1024k.*1' + rbd ls -l | grep 'test2.*1024k.*1' rbd rm test1 rbd rm test2 @@ -120,8 +120,8 @@ test_ls() { rbd ls | grep test1 rbd ls | grep test2 rbd ls | wc -l | grep 2 - rbd ls -l | grep 'test1.*1024K.*2' - rbd ls -l | grep 'test2.*1024K.*2' + rbd ls -l | grep 'test1.*1024k.*2' + rbd ls -l | grep 'test2.*1024k.*2' rbd rm test1 rbd rm test2 @@ -131,8 +131,8 @@ test_ls() { rbd ls | grep test1 rbd ls | grep test2 rbd ls | wc -l | grep 2 - rbd ls -l | grep 'test1.*1024K.*2' - rbd ls -l | grep 'test2.*1024K.*1' + rbd ls -l | grep 'test1.*1024k.*2' + rbd ls -l | grep 'test2.*1024k.*1' remove_images # test that many images can be shown by ls diff --git a/qa/workunits/rbd/import_export.sh b/qa/workunits/rbd/import_export.sh index 353a47fffbe..1813f7a9a88 100755 --- a/qa/workunits/rbd/import_export.sh +++ b/qa/workunits/rbd/import_export.sh @@ -66,7 +66,7 @@ dd if=/dev/urandom bs=1M count=1 of=/tmp/sparse2; truncate /tmp/sparse2 -s 2M # 1M sparse, 1M data rbd import $RBD_CREATE_ARGS --order 20 /tmp/sparse1 -rbd ls -l | grep sparse1 | grep '2048K' +rbd ls -l | grep sparse1 | grep '2048k' [ "$(objects sparse1)" = '1' ] # export, compare contents and on-disk size @@ -77,7 +77,7 @@ rbd rm sparse1 # 1M data, 1M sparse rbd import $RBD_CREATE_ARGS --order 20 /tmp/sparse2 -rbd ls -l | grep sparse2 | grep '2048K' +rbd ls -l | grep sparse2 | grep '2048k' [ "$(objects sparse2)" = '0' ] rbd export sparse2 /tmp/sparse2.out compare_files_and_ondisk_sizes /tmp/sparse2 /tmp/sparse2.out @@ -88,7 +88,7 @@ rbd rm sparse2 truncate /tmp/sparse1 -s 10M # import from stdin just for fun, verify still sparse rbd import $RBD_CREATE_ARGS --order 20 - sparse1 < /tmp/sparse1 -rbd ls -l | grep sparse1 | grep '10240K' +rbd ls -l | grep sparse1 | grep '10240k' [ "$(objects sparse1)" = '1' ] rbd export sparse1 /tmp/sparse1.out compare_files_and_ondisk_sizes /tmp/sparse1 /tmp/sparse1.out @@ -99,7 +99,7 @@ rbd rm sparse1 dd if=/dev/urandom bs=2M count=1 of=/tmp/sparse2 oflag=append conv=notrunc # again from stding rbd import $RBD_CREATE_ARGS --order 20 - sparse2 < /tmp/sparse2 -rbd ls -l | grep sparse2 | grep '4096K' +rbd ls -l | grep sparse2 | grep '4096k' [ "$(objects sparse2)" = '0 2 3' ] rbd export sparse2 /tmp/sparse2.out compare_files_and_ondisk_sizes /tmp/sparse2 /tmp/sparse2.out diff --git a/src/test/cli-integration/rbd/formatted-output.t b/src/test/cli-integration/rbd/formatted-output.t index bece14f11f1..707e0749367 100644 --- a/src/test/cli-integration/rbd/formatted-output.t +++ b/src/test/cli-integration/rbd/formatted-output.t @@ -39,7 +39,7 @@ For now, use a more inclusive regex. $ rbd info foo rbd image 'foo': \tsize 1024 MB in 256 objects (esc) - \torder 22 (4096 KB objects) (esc) + \torder 22 (4096 kB objects) (esc) [^^]+ (re) \tformat: 1 (esc) $ rbd info foo --format json | python -mjson.tool @@ -67,7 +67,7 @@ whenever it is run. 
grep -v to ignore it, but still work on other distros. $ rbd info foo@snap rbd image 'foo': \tsize 1024 MB in 256 objects (esc) - \torder 22 (4096 KB objects) (esc) + \torder 22 (4096 kB objects) (esc) [^^]+ (re) \tformat: 1 (esc) \tprotected: False (esc) @@ -96,7 +96,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros. $ rbd info bar rbd image 'bar': \tsize 1024 MB in 256 objects (esc) - \torder 22 (4096 KB objects) (esc) + \torder 22 (4096 kB objects) (esc) [^^]+ (re) \tformat: 2 (esc) \tfeatures: layering (esc) @@ -131,7 +131,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros. $ rbd info bar@snap rbd image 'bar': \tsize 512 MB in 128 objects (esc) - \torder 22 (4096 KB objects) (esc) + \torder 22 (4096 kB objects) (esc) [^^]+ (re) \tformat: 2 (esc) \tfeatures: layering (esc) @@ -169,7 +169,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros. $ rbd info bar@snap2 rbd image 'bar': \tsize 1024 MB in 256 objects (esc) - \torder 22 (4096 KB objects) (esc) + \torder 22 (4096 kB objects) (esc) [^^]+ (re) \tformat: 2 (esc) \tfeatures: layering (esc) @@ -207,7 +207,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros. $ rbd info baz rbd image 'baz': \tsize 2048 MB in 512 objects (esc) - \torder 22 (4096 KB objects) (esc) + \torder 22 (4096 kB objects) (esc) [^^]+ (re) \tformat: 2 (esc) \tfeatures: layering (esc) @@ -241,8 +241,8 @@ whenever it is run. grep -v to ignore it, but still work on other distros. $ rbd info quux rbd image 'quux': - \tsize 1024 KB in 1 objects (esc) - \torder 22 (4096 KB objects) (esc) + \tsize 1024 kB in 1 objects (esc) + \torder 22 (4096 kB objects) (esc) [^^]+ (re) \tformat: 1 (esc) $ rbd info quux --format json | python -mjson.tool @@ -268,7 +268,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros. $ rbd info data/child rbd image 'child': \tsize 512 MB in 128 objects (esc) - \torder 22 (4096 KB objects) (esc) + \torder 22 (4096 kB objects) (esc) [^^]+ (re) \tformat: 2 (esc) \tfeatures: layering (esc) @@ -303,7 +303,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros. $ rbd info data/child@snap rbd image 'child': \tsize 512 MB in 128 objects (esc) - \torder 22 (4096 KB objects) (esc) + \torder 22 (4096 kB objects) (esc) [^^]+ (re) \tformat: 2 (esc) \tfeatures: layering (esc) @@ -375,7 +375,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros. NAME SIZE PARENT FMT PROT LOCK foo 1024M 1 foo@snap 1024M 1 - quux 1024K 1 excl + quux 1024k 1 excl bar 1024M 2 bar@snap 512M 2 yes bar@snap2 1024M 2 -- cgit v1.2.1 From fea1e0e56cbd3d82a173b50532ff51b8bcee9359 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Oct 2013 17:05:41 -0700 Subject: PendingReleaseNotes: make a note about K vs k Signed-off-by: Sage Weil --- PendingReleaseNotes | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/PendingReleaseNotes b/PendingReleaseNotes index 9a751ffdb49..a0817118df1 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -51,3 +51,7 @@ v0.71 * Any direct users of the 'tmap' portion of the librados API should be aware that the automatic tmap -> omap conversion functionality has been removed. + +* Most output that used K or KK (e.g., for kilobyte) now uses a + lower-case k to match the official SI convention. Any scripts that + parse output and check for an upper-case K will need to be modified. 
-- cgit v1.2.1 From aacd67e07174ca1e8687e78dcec1a2f68e3bdf5f Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Thu, 3 Oct 2013 20:07:13 -0700 Subject: PendingReleaseNotes: fix typo Signed-off-by: Josh Durgin --- PendingReleaseNotes | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PendingReleaseNotes b/PendingReleaseNotes index a0817118df1..779a081480f 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -52,6 +52,6 @@ v0.71 aware that the automatic tmap -> omap conversion functionality has been removed. -* Most output that used K or KK (e.g., for kilobyte) now uses a +* Most output that used K or KB (e.g., for kilobyte) now uses a lower-case k to match the official SI convention. Any scripts that parse output and check for an upper-case K will need to be modified. -- cgit v1.2.1 From 03ba7408db5d693dc8616c8146bbbb9f0ab66291 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Oct 2013 21:27:36 -0700 Subject: osd/ReplicatedPG: fix null deref on rollback_to whiteout check Bring this whole if/else chain up one level so that we can capture both ENOENT and whiteout in the same case. (And don't dereference the pointer when we know it is NULL.) Fixes: #6474 Signed-off-by: Sage Weil --- src/osd/ReplicatedPG.cc | 52 ++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index fb5e45a1a71..c4316196178 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -3785,37 +3785,35 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op) int ret = find_object_context( hobject_t(soid.oid, soid.get_key(), snapid, soid.hash, info.pgid.pool(), soid.get_namespace()), &rollback_to, false, &cloneid); - if (ret) { - if (-ENOENT == ret || rollback_to->obs.oi.is_whiteout()) { - // there's no snapshot here, or there's no object. - // if there's no snapshot, we delete the object; otherwise, do nothing. - dout(20) << "_rollback_to deleting head on " << soid.oid - << " because got ENOENT|whiteout on find_object_context" << dendl; - if (ctx->obc->obs.oi.watchers.size()) { - // Cannot delete an object with watchers - ret = -EBUSY; - } else { - _delete_head(ctx); - ret = 0; - } - } else if (-EAGAIN == ret) { - /* a different problem, like degraded pool - * with not-yet-restored object. We shouldn't have been able - * to get here; recovery should have completed first! */ - hobject_t rollback_target(soid.oid, soid.get_key(), cloneid, soid.hash, - info.pgid.pool(), soid.get_namespace()); - assert(is_missing_object(rollback_target)); - dout(20) << "_rollback_to attempted to roll back to a missing object " - << rollback_target << " (requested snapid: ) " << snapid << dendl; - wait_for_missing_object(rollback_target, ctx->op); + if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) { + // there's no snapshot here, or there's no object. + // if there's no snapshot, we delete the object; otherwise, do nothing. + dout(20) << "_rollback_to deleting head on " << soid.oid + << " because got ENOENT|whiteout on find_object_context" << dendl; + if (ctx->obc->obs.oi.watchers.size()) { + // Cannot delete an object with watchers + ret = -EBUSY; } else { - // ummm....huh? It *can't* return anything else at time of writing. - assert(0); - } + _delete_head(ctx); + ret = 0; + } + } else if (-EAGAIN == ret) { + /* a different problem, like degraded pool + * with not-yet-restored object. We shouldn't have been able + * to get here; recovery should have completed first! 
*/ + hobject_t rollback_target(soid.oid, soid.get_key(), cloneid, soid.hash, + info.pgid.pool(), soid.get_namespace()); + assert(is_missing_object(rollback_target)); + dout(20) << "_rollback_to attempted to roll back to a missing object " + << rollback_target << " (requested snapid: ) " << snapid << dendl; + wait_for_missing_object(rollback_target, ctx->op); + } else if (ret) { + // ummm....huh? It *can't* return anything else at time of writing. + assert(0 == "unexpected error code in _rollback_to"); } else { //we got our context, let's use it to do the rollback! hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid; if (is_degraded_object(rollback_to_sobject)) { - dout(20) << "_rollback_to attempted to roll back to a degraded object " + dout(20) << "_rollback_to attempted to roll back to a degraded object " << rollback_to_sobject << " (requested snapid: ) " << snapid << dendl; wait_for_degraded_object(rollback_to_sobject, ctx->op); ret = -EAGAIN; -- cgit v1.2.1 From b09a1ef946b0ee4bb4546a52e25caebc08677a67 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Oct 2013 21:44:06 -0700 Subject: ceph_test_rados: stop on read error! Signed-off-by: Sage Weil --- src/test/osd/RadosModel.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/test/osd/RadosModel.h b/src/test/osd/RadosModel.h index a87ecebb4c1..322a591b409 100644 --- a/src/test/osd/RadosModel.h +++ b/src/test/osd/RadosModel.h @@ -1043,6 +1043,7 @@ public: if (!(err == -ENOENT && old_value.deleted())) { cerr << num << ": Error: oid " << oid << " read returned error code " << err << std::endl; + context->errors++; } } else { cout << num << ": expect " << old_value.most_recent() << std::endl; -- cgit v1.2.1 From 55d279b98553ba4542219b126fc7159b20b18b1f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 3 Oct 2013 21:47:26 -0700 Subject: ceph_test_rados: do not let rollback race with snap delete Note that the OSD behaves in a weird way when you rollback to a non- existent snap, so the test probably isn't the only party at fault here. Fixes (test half of): #6254 Backport: dumpling, cuttlefish Signed-off-by: Sage Weil --- src/test/osd/RadosModel.h | 4 ++++ src/test/osd/TestRados.cc | 9 ++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/test/osd/RadosModel.h b/src/test/osd/RadosModel.h index 322a591b409..aba6a531c6f 100644 --- a/src/test/osd/RadosModel.h +++ b/src/test/osd/RadosModel.h @@ -143,6 +143,7 @@ public: map > pool_obj_cont; set oid_in_use; set oid_not_in_use; + set snaps_in_use; int current_snap; string pool_name; librados::IoCtx io_ctx; @@ -1315,6 +1316,8 @@ public: } context->oid_in_use.insert(oid); context->oid_not_in_use.erase(oid); + context->snaps_in_use.insert(roll_back_to); + context->roll_back(oid, roll_back_to); uint64_t snap = context->snaps[roll_back_to]; @@ -1342,6 +1345,7 @@ public: context->update_object_version(oid, comp->get_version64()); context->oid_in_use.erase(oid); context->oid_not_in_use.insert(oid); + context->snaps_in_use.erase(roll_back_to); context->kick(); } diff --git a/src/test/osd/TestRados.cc b/src/test/osd/TestRados.cc index be919161579..7158f50a74a 100644 --- a/src/test/osd/TestRados.cc +++ b/src/test/osd/TestRados.cc @@ -120,13 +120,16 @@ private: } case TEST_OP_ROLLBACK: - if (context.snaps.empty()) { + if (context.snaps.size() <= context.snaps_in_use.size()) { return NULL; - } else { + } + while (true) { int snap = rand_choose(context.snaps)->first; + if (context.snaps_in_use.count(snap)) + continue; // in use; try again! 
string oid = *(rand_choose(context.oid_not_in_use)); cout << "rollback oid " << oid << " to " << snap << std::endl; - return new RollbackOp(m_op, &context, oid, snap); + return new RollbackOp(m_op, &context, oid, snap); } case TEST_OP_SETATTR: -- cgit v1.2.1 From dc0dfb9e01d593afdd430ca776cf4da2c2240a20 Mon Sep 17 00:00:00 2001 From: David Zafman Date: Fri, 27 Sep 2013 16:23:09 -0700 Subject: common,os: Remove filestore_xattr_use_omap option Now we operate just like when this was set to true Fixes: #6143 Signed-off-by: David Zafman --- src/common/config_opts.h | 1 - src/os/FileStore.cc | 125 +++++++++++++++++++++-------------------------- 2 files changed, 56 insertions(+), 70 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index fad831f5543..f9a1e45ff80 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -545,7 +545,6 @@ OPTION(filestore_debug_inject_read_err, OPT_BOOL, false) OPTION(filestore_debug_omap_check, OPT_BOOL, 0) // Expensive debugging check on sync // Use omap for xattrs for attrs over -OPTION(filestore_xattr_use_omap, OPT_BOOL, false) // filestore_max_inline_xattr_size or OPTION(filestore_max_inline_xattr_size, OPT_U32, 512) // for more than filestore_max_inline_xattrs attrs diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index cd8a8e50658..1a9206083c9 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -887,14 +887,7 @@ int FileStore::_detect_fs() chain_fsetxattr(tmpfd, "user.test4", &buf, sizeof(buf)); ret = chain_fsetxattr(tmpfd, "user.test5", &buf, sizeof(buf)); if (ret == -ENOSPC) { - if (!g_conf->filestore_xattr_use_omap) { - dout(0) << "limited size xattrs -- automatically enabling filestore_xattr_use_omap" << dendl; - g_conf->set_val("filestore_xattr_use_omap", "true"); - g_conf->apply_changes(NULL); - assert(g_conf->filestore_xattr_use_omap == true); - } else { - dout(0) << "limited size xattrs -- filestore_xattr_use_omap already enabled" << dendl; - } + dout(0) << "limited size xattrs" << dendl; } chain_fremovexattr(tmpfd, "user.test"); chain_fremovexattr(tmpfd, "user.test2"); @@ -3397,7 +3390,7 @@ int FileStore::getattr(coll_t cid, const ghobject_t& oid, const char *name, buff get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN); r = _fgetattr(**fd, n, bp); lfn_close(fd); - if (r == -ENODATA && g_conf->filestore_xattr_use_omap) { + if (r == -ENODATA) { map got; set to_get; to_get.insert(string(name)); @@ -3433,6 +3426,9 @@ int FileStore::getattr(coll_t cid, const ghobject_t& oid, const char *name, buff int FileStore::getattrs(coll_t cid, const ghobject_t& oid, map& aset, bool user_only) { + set omap_attrs; + map omap_aset; + Index index; dout(15) << "getattrs " << cid << "/" << oid << dendl; FDRef fd; int r = lfn_open(cid, oid, false, &fd); @@ -3440,43 +3436,41 @@ int FileStore::getattrs(coll_t cid, const ghobject_t& oid, map goto out; } r = _fgetattrs(**fd, aset, user_only); + if (r < 0) { + goto out; + } lfn_close(fd); - if (g_conf->filestore_xattr_use_omap) { - set omap_attrs; - map omap_aset; - Index index; - int r = get_index(cid, &index); - if (r < 0) { - dout(10) << __func__ << " could not get index r = " << r << dendl; - goto out; - } - r = object_map->get_all_xattrs(oid, &omap_attrs); - if (r < 0 && r != -ENOENT) { - dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl; - goto out; - } - r = object_map->get_xattrs(oid, omap_attrs, &omap_aset); - if (r < 0 && r != -ENOENT) { - dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl; - goto out; - } - 
assert(omap_attrs.size() == omap_aset.size()); - for (map::iterator i = omap_aset.begin(); + r = get_index(cid, &index); + if (r < 0) { + dout(10) << __func__ << " could not get index r = " << r << dendl; + goto out; + } + r = object_map->get_all_xattrs(oid, &omap_attrs); + if (r < 0 && r != -ENOENT) { + dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl; + goto out; + } + r = object_map->get_xattrs(oid, omap_attrs, &omap_aset); + if (r < 0 && r != -ENOENT) { + dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl; + goto out; + } + assert(omap_attrs.size() == omap_aset.size()); + for (map::iterator i = omap_aset.begin(); i != omap_aset.end(); ++i) { - string key; - if (user_only) { + string key; + if (user_only) { if (i->first[0] != '_') continue; if (i->first == "_") continue; key = i->first.substr(1, i->first.size()); - } else { + } else { key = i->first; - } - aset.insert(make_pair(key, - bufferptr(i->second.c_str(), i->second.length()))); } + aset.insert(make_pair(key, + bufferptr(i->second.c_str(), i->second.length()))); } out: dout(10) << "getattrs " << cid << "/" << oid << " = " << r << dendl; @@ -3502,10 +3496,8 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, mapfilestore_xattr_use_omap) { - r = _fgetattrs(**fd, inline_set, false); - assert(!m_filestore_fail_eio || r != -EIO); - } + r = _fgetattrs(**fd, inline_set, false); + assert(!m_filestore_fail_eio || r != -EIO); dout(15) << "setattrs " << cid << "/" << oid << dendl; r = 0; for (map::iterator p = aset.begin(); @@ -3513,8 +3505,8 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, mapfirst.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN); - if (g_conf->filestore_xattr_use_omap) { - if (p->second.length() > g_conf->filestore_max_inline_xattr_size) { + + if (p->second.length() > g_conf->filestore_max_inline_xattr_size) { if (inline_set.count(p->first)) { inline_set.erase(p->first); r = chain_fremovexattr(**fd, n); @@ -3523,9 +3515,9 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, mapfirst].push_back(p->second); continue; - } + } - if (!inline_set.count(p->first) && + if (!inline_set.count(p->first) && inline_set.size() >= g_conf->filestore_max_inline_xattrs) { if (inline_set.count(p->first)) { inline_set.erase(p->first); @@ -3535,10 +3527,9 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, mapfirst].push_back(p->second); continue; - } - omap_remove.insert(p->first); - inline_set.insert(*p); } + omap_remove.insert(p->first); + inline_set.insert(*p); inline_to_set.insert(*p); @@ -3549,7 +3540,6 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, mapfilestore_xattr_use_omap); r = object_map->remove_xattrs(oid, omap_remove, &spos); if (r < 0 && r != -ENOENT) { dout(10) << __func__ << " could not remove_xattrs r = " << r << dendl; @@ -3559,7 +3549,6 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, mapfilestore_xattr_use_omap); r = object_map->set_xattrs(oid, omap_set, &spos); if (r < 0) { dout(10) << __func__ << " could not set_xattrs r = " << r << dendl; @@ -3587,7 +3576,7 @@ int FileStore::_rmattr(coll_t cid, const ghobject_t& oid, const char *name, char n[CHAIN_XATTR_MAX_NAME_LEN]; get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN); r = chain_fremovexattr(**fd, n); - if (r == -ENODATA && g_conf->filestore_xattr_use_omap) { + if (r == -ENODATA) { Index index; r = get_index(cid, &index); if (r < 0) { @@ -3617,6 +3606,8 @@ int FileStore::_rmattrs(coll_t cid, const ghobject_t& oid, map aset; FDRef fd; + set omap_attrs; + Index index; 
int r = lfn_open(cid, oid, false, &fd); if (r < 0) { goto out; @@ -3633,25 +3624,21 @@ int FileStore::_rmattrs(coll_t cid, const ghobject_t& oid, } lfn_close(fd); - if (g_conf->filestore_xattr_use_omap) { - set omap_attrs; - Index index; - r = get_index(cid, &index); - if (r < 0) { - dout(10) << __func__ << " could not get index r = " << r << dendl; - return r; - } - r = object_map->get_all_xattrs(oid, &omap_attrs); - if (r < 0 && r != -ENOENT) { - dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl; - assert(!m_filestore_fail_eio || r != -EIO); - return r; - } - r = object_map->remove_xattrs(oid, omap_attrs, &spos); - if (r < 0 && r != -ENOENT) { - dout(10) << __func__ << " could not remove omap_attrs r = " << r << dendl; - return r; - } + r = get_index(cid, &index); + if (r < 0) { + dout(10) << __func__ << " could not get index r = " << r << dendl; + return r; + } + r = object_map->get_all_xattrs(oid, &omap_attrs); + if (r < 0 && r != -ENOENT) { + dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + r = object_map->remove_xattrs(oid, omap_attrs, &spos); + if (r < 0 && r != -ENOENT) { + dout(10) << __func__ << " could not remove omap_attrs r = " << r << dendl; + return r; } out: dout(10) << "rmattrs " << cid << "/" << oid << " = " << r << dendl; -- cgit v1.2.1 From f3733a205238516ec2c20f24f61aa0366bac78e5 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Fri, 4 Oct 2013 09:10:20 -0700 Subject: ReplicatedPG: copy: switch CopyCallback to use a GenContext Signed-off-by: Greg Farnum --- src/osd/ReplicatedPG.cc | 18 +++++++------ src/osd/ReplicatedPG.h | 72 ++++++++++++++++++++++--------------------------- 2 files changed, 42 insertions(+), 48 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 6118a25d510..ce5d347cb3b 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -4456,6 +4456,7 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, tid_t tid, int r) ObjectContextRef obc = cop->obc; cop->objecter_tid = 0; + CopyResults results; if (r >= 0) { assert(cop->rval >= 0); @@ -4483,15 +4484,14 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, tid_t tid, int r) _copy_some(obc, cop); return; } else { - ObjectStore::Transaction t; - _build_finish_copy_transaction(cop, t); - cop->cb->copy_complete_ops(t); - cop->cb->set_data_size(cop->temp_cursor.data_offset); + _build_finish_copy_transaction(cop, results.get<3>()); + results.get<1>() = cop->temp_cursor.data_offset; } } dout(20) << __func__ << " complete; committing" << dendl; - cop->cb->complete(cop->rval); + results.get<0>() = cop->rval; + cop->cb->complete(results); copy_ops.erase(obc->obs.oi.soid); --obc->copyfrom_readside; @@ -4556,8 +4556,8 @@ int ReplicatedPG::finish_copyfrom(OpContext *ctx) if (cb->is_temp_obj_used()) { ctx->discard_temp_oid = cb->temp_obj; } - ctx->op_t.swap(cb->final_tx); - ctx->op_t.append(cb->final_tx); + ctx->op_t.swap(cb->results.get<3>()); + ctx->op_t.append(cb->results.get<3>()); interval_set ch; if (obs.oi.size > 0) @@ -4591,7 +4591,9 @@ void ReplicatedPG::cancel_copy(CopyOpRef cop) --cop->obc->copyfrom_readside; kick_object_context_blocked(cop->obc); - cop->cb->complete(-ECANCELED); + bool temp_obj_created = !cop->cursor.is_initial(); + CopyResults result(-ECANCELED, 0, temp_obj_created, ObjectStore::Transaction()); + cop->cb->complete(result); } void ReplicatedPG::cancel_copy_ops() diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 
f7e677f7b84..2a8b722c752 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -18,6 +18,7 @@ #define CEPH_REPLICATEDPG_H #include +#include #include "include/assert.h" #include "common/cmdparse.h" @@ -131,54 +132,49 @@ public: * The CopyCallback class defines an interface for completions to the * copy_start code. Users of the copy infrastructure must implement * one and give an instance of the class to start_copy. - * In particular, - * 1) Once the copy code has placed data in the temp object, it calls - * the data_in_temp_obj() function. - * 2) if everything has succeeded, it may call copy_complete_ops() and - * pass in a Transaction which contains the ops that must be executed - * in order to complete the copy. The implementer must make sure these ops - * are executed if they are provide (at present, they are). - * 3) If everything has succeeded, it will call data_size() with the - * size of copied object - * 4) It will call finish(). * * The implementer is responsible for making sure that the CopyCallback * can associate itself with the correct copy operation. The presence - * of copy_complete_ops ensures that write operations can be performed + * of the closing Transaction ensures that write operations can be performed * atomically with the copy being completed (which doing them in separate * transactions would not allow); if you are doing the copy for a read * op you will have to generate a separate op to finish the copy with. */ - class CopyCallback : public Context { + /// return code, total object size, data in temp object?, final Transaction + typedef boost::tuple CopyResults; + class CopyCallback : public GenContext { protected: - bool data_in_temp; - uint64_t data_size; - int result_code; - - CopyCallback() : data_in_temp(false), data_size((uint64_t)-1), - result_code(0) {} + CopyCallback() {} /** - * @param r The copy return code. 0 for success; -ECANCELLED if + * results.get<0>() is the return code: 0 for success; -ECANCELLED if * the operation was cancelled by the local OSD; -errno for other issues. + * results.get<1>() is the total size of the object (for updating pg stats) + * results.get<2>() indicates whether we have already written data to + * the temp object (so it needs to get cleaned up, if the return code + * indicates a failure) + * results.get<3>() is a Transaction; if non-empty you need to perform + * its results before any other accesses to the object in order to + * complete the copy. 
*/ - virtual void finish(int r) { result_code = r; } + virtual void finish(CopyResults& results_) = 0; + public: - /// Give the CopyCallback ops to perform to complete the copy - virtual void copy_complete_ops(ObjectStore::Transaction& t) = 0; - /// Tell the CopyCallback that there is now data in the temp object - virtual void data_in_temp_obj() { data_in_temp = true; }; - bool is_temp_obj_used() { return data_in_temp; } /// Provide the final size of the copied object to the CopyCallback - virtual void set_data_size(uint64_t size) { data_size = size; } - uint64_t get_data_size() { return data_size; } - int get_result() { return result_code; } virtual ~CopyCallback() {}; }; class CopyFromCallback: public CopyCallback { - protected: - virtual void finish(int r) { - result_code = r; + public: + CopyResults results; + OpContext *ctx; + hobject_t temp_obj; + CopyFromCallback(OpContext *ctx_, const hobject_t& temp_obj_) : + ctx(ctx_), temp_obj(temp_obj_) {} + ~CopyFromCallback() {} + + virtual void finish(CopyResults& results_) { + results = results_; + int r = results.get<0>(); if (r >= 0) { ctx->pg->execute_ctx(ctx); } @@ -191,14 +187,10 @@ public: } } } - public: - OpContext *ctx; - hobject_t temp_obj; - ObjectStore::Transaction final_tx; - CopyFromCallback(OpContext *ctx_, const hobject_t& temp_obj_) : - ctx(ctx_), temp_obj(temp_obj_) {} - void copy_complete_ops(ObjectStore::Transaction& t) { final_tx.swap(t); } - ~CopyFromCallback() {} + + bool is_temp_obj_used() { return results.get<2>(); } + uint64_t get_data_size() { return results.get<1>(); } + int get_result() { return results.get<0>(); } }; friend class CopyFromCallback; @@ -375,7 +367,7 @@ public: int num_read; ///< count read ops int num_write; ///< count update ops - CopyCallback *copy_cb; + CopyFromCallback *copy_cb; hobject_t new_temp_oid, discard_temp_oid; ///< temp objects we should start/stop tracking -- cgit v1.2.1 From 469d471a8b668e4001561898ab9b908aa28477a1 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Fri, 4 Oct 2013 09:26:02 -0700 Subject: ReplicatedPG: assert that we have succeeded in do_osd_ops on copyfrom repeats Our callback is handling errors on its own at this point. Signed-off-by: Greg Farnum --- src/osd/ReplicatedPG.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index ce5d347cb3b..8ba4c6cdc9f 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -3774,10 +3774,8 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) result = -EINPROGRESS; } else { // finish - result = ctx->copy_cb->get_result(); - if (result >= 0) { //success! 
- result = finish_copyfrom(ctx); - } + assert(ctx->copy_cb->get_result() >= 0); + result = finish_copyfrom(ctx); } } break; -- cgit v1.2.1 From 639ff9f776bf7e18ba9d8bd0f73784df0bf02ce0 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Fri, 4 Oct 2013 09:47:54 -0700 Subject: ReplicatedPG: copy: don't leak a ctx on failed copy ops Signed-off-by: Greg Farnum --- src/osd/ReplicatedPG.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 2a8b722c752..f337a8f4202 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -180,11 +180,10 @@ public: } ctx->copy_cb = NULL; if (r < 0) { - if (r == -ECANCELED) { // toss it out; client resends - delete ctx; - } else { + if (r != -ECANCELED) { // on cancel just toss it out; client resends ctx->pg->osd->reply_op_error(ctx->op, r); } + delete ctx; } } -- cgit v1.2.1 From 806725a8b0f3bf79af9bca3c2c6ed8d70655deff Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Fri, 4 Oct 2013 10:50:29 -0700 Subject: ReplicatedPG: copy: add op progression output Signed-off-by: Greg Farnum --- src/osd/ReplicatedPG.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 8ba4c6cdc9f..711dc8e93f7 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -4544,6 +4544,7 @@ void ReplicatedPG::_build_finish_copy_transaction(CopyOpRef cop, int ReplicatedPG::finish_copyfrom(OpContext *ctx) { + dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl; ObjectState& obs = ctx->new_obs; CopyFromCallback *cb = static_cast(ctx->copy_cb); -- cgit v1.2.1 From e3bb0656d92e74ead0342ae696039a51170fe941 Mon Sep 17 00:00:00 2001 From: Gary Lowell Date: Fri, 4 Oct 2013 20:12:24 +0000 Subject: v0.70 --- configure.ac | 2 +- debian/changelog | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index b45828f1602..b3568d8e4eb 100644 --- a/configure.ac +++ b/configure.ac @@ -8,7 +8,7 @@ AC_PREREQ(2.59) # VERSION define is not used by the code. It gets a version string # from 'git describe'; see src/ceph_ver.[ch] -AC_INIT([ceph], [0.69], [ceph-devel@vger.kernel.org]) +AC_INIT([ceph], [0.70], [ceph-devel@vger.kernel.org]) # Create release string. Used with VERSION for RPMs. RPM_RELEASE=0 diff --git a/debian/changelog b/debian/changelog index ce73472f9eb..4628bb52175 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +ceph (0.70-1) stable; urgency=low + + * New upstream release + + -- Gary Lowell Fri, 04 Oct 2013 20:11:51 +0000 + ceph (0.69-1) precise; urgency=low * New upstream release -- cgit v1.2.1 From 091809b8149c7595cbcca439c5b8b75a0c42efe1 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Fri, 4 Oct 2013 12:29:26 -0700 Subject: PGMap,PGMonitor: maintain mapping of osd to recent stat epoch Also, osd_stat will be empty for out osd. When an osd is marked out, rather than remove it from osd_stat, we instead 0 out the structure. This patch also makes osd_stat_updates and osd_stat_rm private. This should make it simpler to enforce invariants on these mappings. Each up osd will have a mapping since out osds are now included as empty stats. 
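The point of carrying an epoch alongside each stat update is that the monitor can later lower-bound osdmap trimming by the oldest epoch any up osd has reported at. A minimal sketch of that bound (simplified names; assumes the stat and epoch maps stay in lockstep, as the patch's accessors assert):

    #include <map>

    typedef unsigned epoch_t;

    // Never trim past the oldest osdmap epoch reported by any up osd.
    epoch_t min_reported_epoch(const std::map<int, epoch_t> &osd_epochs,
                               epoch_t min_last_epoch_clean) {
      epoch_t min = min_last_epoch_clean;
      for (std::map<int, epoch_t>::const_iterator i = osd_epochs.begin();
           i != osd_epochs.end(); ++i) {
        if (i->second < min)
          min = i->second;
      }
      return min;
    }
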
Signed-off-by: Samuel Just --- src/mon/PGMap.cc | 45 ++++++++++++++++++++++++++++++++++++++------- src/mon/PGMap.h | 37 +++++++++++++++++++++++++++++++++++-- src/mon/PGMonitor.cc | 24 ++++++++++++++---------- 3 files changed, 87 insertions(+), 19 deletions(-) diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index e9a35c6b8ab..0b40e9264ce 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -30,7 +30,7 @@ void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const return; } - ENCODE_START(6, 5, bl); + ENCODE_START(7, 5, bl); ::encode(version, bl); ::encode(pg_stat_updates, bl); ::encode(osd_stat_updates, bl); @@ -41,6 +41,7 @@ void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const ::encode(nearfull_ratio, bl); ::encode(pg_remove, bl); ::encode(stamp, bl); + ::encode(osd_epochs, bl); ENCODE_FINISH(bl); } @@ -89,6 +90,17 @@ void PGMap::Incremental::decode(bufferlist::iterator &bl) } if (struct_v >= 6) ::decode(stamp, bl); + if (struct_v >= 7) { + ::decode(osd_epochs, bl); + } else { + for (map::iterator i = osd_stat_updates.begin(); + i != osd_stat_updates.end(); + ++i) { + // This isn't accurate, but will cause trimming to behave like + // previously. + osd_epochs.insert(make_pair(i->first, osdmap_epoch)); + } + } DECODE_FINISH(bl); } @@ -195,8 +207,10 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc) } stat_pg_add(update_pg, update_stat); } - for (map::const_iterator p = inc.osd_stat_updates.begin(); - p != inc.osd_stat_updates.end(); + assert(osd_stat.size() == osd_epochs.size()); + for (map::const_iterator p = + inc.get_osd_stat_updates().begin(); + p != inc.get_osd_stat_updates().end(); ++p) { int osd = p->first; const osd_stat_t &new_stats(p->second); @@ -209,6 +223,8 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc) stat_osd_sub(t->second); t->second = new_stats; } + assert(inc.get_osd_epochs().find(osd) != inc.get_osd_epochs().end()); + osd_epochs.insert(*(inc.get_osd_epochs().find(osd))); stat_osd_add(new_stats); @@ -226,8 +242,8 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc) } } - for (set::iterator p = inc.osd_stat_rm.begin(); - p != inc.osd_stat_rm.end(); + for (set::iterator p = inc.get_osd_stat_rm().begin(); + p != inc.get_osd_stat_rm().end(); ++p) { hash_map::iterator t = osd_stat.find(*p); if (t != osd_stat.end()) { @@ -434,7 +450,7 @@ void PGMap::encode(bufferlist &bl, uint64_t features) const return; } - ENCODE_START(5, 4, bl); + ENCODE_START(6, 4, bl); ::encode(version, bl); ::encode(pg_stat, bl); ::encode(osd_stat, bl); @@ -443,6 +459,7 @@ void PGMap::encode(bufferlist &bl, uint64_t features) const ::encode(full_ratio, bl); ::encode(nearfull_ratio, bl); ::encode(stamp, bl); + ::encode(osd_epochs, bl); ENCODE_FINISH(bl); } @@ -472,6 +489,17 @@ void PGMap::decode(bufferlist::iterator &bl) } if (struct_v >= 5) ::decode(stamp, bl); + if (struct_v >= 6) { + ::decode(osd_epochs, bl); + } else { + for (hash_map::iterator i = osd_stat.begin(); + i != osd_stat.end(); + ++i) { + // This isn't accurate, but will cause trimming to behave like + // previously. 
+ osd_epochs.insert(make_pair(i->first, last_osdmap_epoch)); + } + } DECODE_FINISH(bl); calc_stats(); @@ -488,7 +516,10 @@ void PGMap::dirty_all(Incremental& inc) inc.pg_stat_updates[p->first] = p->second; } for (hash_map::const_iterator p = osd_stat.begin(); p != osd_stat.end(); ++p) { - inc.osd_stat_updates[p->first] = p->second; + assert(inc.get_osd_epochs().count(p->first)); + inc.update_stat(p->first, + inc.get_osd_epochs().find(p->first)->second, + p->second); } } diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h index 84d89f87517..7a202fc0006 100644 --- a/src/mon/PGMap.h +++ b/src/mon/PGMap.h @@ -43,12 +43,13 @@ public: float full_ratio; float nearfull_ratio; + // mapping of osd to most recently reported osdmap epoch + hash_map osd_epochs; + class Incremental { public: version_t version; map pg_stat_updates; - map osd_stat_updates; - set osd_stat_rm; epoch_t osdmap_epoch; epoch_t pg_scan; // osdmap epoch set pg_remove; @@ -56,6 +57,38 @@ public: float nearfull_ratio; utime_t stamp; + private: + map osd_stat_updates; + set osd_stat_rm; + + // mapping of osd to most recently reported osdmap epoch + map osd_epochs; + public: + + const map &get_osd_stat_updates() const { + return osd_stat_updates; + } + const set &get_osd_stat_rm() const { + return osd_stat_rm; + } + const map &get_osd_epochs() const { + return osd_epochs; + } + + void update_stat(int32_t osd, epoch_t epoch, const osd_stat_t &stat) { + osd_stat_updates[osd] = stat; + osd_epochs[osd] = epoch; + assert(osd_epochs.size() == osd_stat_updates.size()); + } + void stat_osd_out(int32_t osd) { + // 0 the stats for the osd + osd_stat_updates[osd] = osd_stat_t(); + } + void rm_stat(int32_t osd) { + osd_stat_rm.insert(osd); + osd_epochs.erase(osd); + osd_stat_updates.erase(osd); + } void encode(bufferlist &bl, uint64_t features=-1) const; void decode(bufferlist::iterator &bl); void dump(Formatter *f) const; diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index 0f495052747..0644922ddb4 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -494,15 +494,19 @@ void PGMonitor::encode_pending(MonitorDBStore::Transaction *t) { bufferlist dirty; string prefix = pgmap_osd_prefix; - for (map::const_iterator p = pending_inc.osd_stat_updates.begin(); - p != pending_inc.osd_stat_updates.end(); + for (map::const_iterator p = + pending_inc.get_osd_stat_updates().begin(); + p != pending_inc.get_osd_stat_updates().end(); ++p) { ::encode(p->first, dirty); bufferlist bl; ::encode(p->second, bl, features); t->put(prefix, stringify(p->first), bl); } - for (set::const_iterator p = pending_inc.osd_stat_rm.begin(); p != pending_inc.osd_stat_rm.end(); ++p) { + for (set::const_iterator p = + pending_inc.get_osd_stat_rm().begin(); + p != pending_inc.get_osd_stat_rm().end(); + ++p) { ::encode(*p, dirty); t->erase(prefix, stringify(*p)); } @@ -725,7 +729,11 @@ bool PGMonitor::prepare_pg_stats(MPGStats *stats) } // osd stat - pending_inc.osd_stat_updates[from] = stats->osd_stat; + if (mon->osdmon()->osdmap.is_in(from)) { + pending_inc.update_stat(from, stats->epoch, stats->osd_stat); + } else { + pending_inc.update_stat(from, stats->epoch, osd_stat_t()); + } if (pg_map.osd_stat.count(from)) dout(10) << " got osd." << from << " " << stats->osd_stat << " (was " << pg_map.osd_stat[from] << ")" << dendl; @@ -842,11 +850,7 @@ void PGMonitor::check_osd_map(epoch_t epoch) ++p) if (p->second == CEPH_OSD_OUT) { dout(10) << "check_osd_map osd." 
<< p->first << " went OUT" << dendl; - pending_inc.osd_stat_rm.insert(p->first); - } else { - dout(10) << "check_osd_map osd." << p->first << " is IN" << dendl; - pending_inc.osd_stat_rm.erase(p->first); - pending_inc.osd_stat_updates[p->first]; + pending_inc.stat_osd_out(p->first); } // this is conservative: we want to know if any osds (maybe) got marked down. @@ -867,7 +871,7 @@ void PGMonitor::check_osd_map(epoch_t epoch) // whether it was created *or* destroyed, we can safely drop // it's osd_stat_t record. dout(10) << "check_osd_map osd." << p->first << " created or destroyed" << dendl; - pending_inc.osd_stat_rm.insert(p->first); + pending_inc.rm_stat(p->first); // and adjust full, nearfull set pg_map.nearfull_osds.erase(p->first); -- cgit v1.2.1 From c8a4411db11b085ea0678bcf3f51aa411bd3e106 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Fri, 9 Aug 2013 17:59:25 -0700 Subject: PGMap: calc_min_last_epoch_clean() will now also use osd_epochs We don't want to trim past the current osd map for any up osd. osd_epochs provides a lower bound for that epoch for each osd. Fixes: 5869 Signed-off-by: Samuel Just --- src/mon/PGMap.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 0b40e9264ce..13a2af33213 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -432,6 +432,14 @@ epoch_t PGMap::calc_min_last_epoch_clean() const if (lec < min) min = lec; } + // also scan osd epochs + // don't trim past the oldest reported osd epoch + for (hash_map::const_iterator i = osd_epochs.begin(); + i != osd_epochs.end(); + ++i) { + if (i->second < min) + min = i->second; + } return min; } -- cgit v1.2.1 From e73ec48371fffbb16b03e57b157e35c087e0b342 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Thu, 26 Sep 2013 12:31:35 -0700 Subject: common/hobject: add is_degenerate method Signed-off-by: Samuel Just --- src/common/hobject.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/common/hobject.h b/src/common/hobject.h index e483b664347..0a140c3089b 100644 --- a/src/common/hobject.h +++ b/src/common/hobject.h @@ -251,6 +251,10 @@ public: return get_filestore_key_u32(); } + bool is_degenerate() const { + return generation == NO_GEN && shard_id == NO_SHARD; + } + // maximum sorted value. 
static ghobject_t get_max() { ghobject_t h(hobject_t::get_max()); -- cgit v1.2.1 From 664b589b05243b30a92ac3642958d56fb9144e3d Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Tue, 17 Sep 2013 08:26:51 -0700 Subject: ReplicatedPG: don't rescan the local collection if we can avoid it Signed-off-by: Samuel Just --- src/osd/PG.h | 2 ++ src/osd/ReplicatedPG.cc | 85 +++++++++++++++++++++++++++++++++++++++---------- src/osd/ReplicatedPG.h | 8 ++++- 3 files changed, 78 insertions(+), 17 deletions(-) diff --git a/src/osd/PG.h b/src/osd/PG.h index 74809eea268..78377d03ad6 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -439,6 +439,7 @@ protected: */ struct BackfillInterval { // info about a backfill interval on a peer + eversion_t version; /// version at which the scan occurred map objects; hobject_t begin; hobject_t end; @@ -447,6 +448,7 @@ protected: void clear() { objects.clear(); begin = end = hobject_t(); + version = eversion_t(); } void reset(hobject_t start) { diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index a661aa7f786..cc156a16e97 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -1531,8 +1531,9 @@ void ReplicatedPG::do_scan( BackfillInterval bi; osr->flush(); + bi.begin = m->begin; scan_range( - m->begin, cct->_conf->osd_backfill_scan_min, + cct->_conf->osd_backfill_scan_min, cct->_conf->osd_backfill_scan_max, &bi, handle); MOSDPGScan *reply = new MOSDPGScan(MOSDPGScan::OP_SCAN_DIGEST, get_osdmap()->get_epoch(), m->query_epoch, @@ -7953,17 +7954,12 @@ int ReplicatedPG::recover_backfill( << " interval " << pbi.begin << "-" << pbi.end << " " << pbi.objects.size() << " objects" << dendl; - int local_min = osd->store->get_ideal_list_min(); - int local_max = osd->store->get_ideal_list_max(); + int local_min = cct->_conf->osd_backfill_scan_min; + int local_max = cct->_conf->osd_backfill_scan_max; - // re-scan our local interval to cope with recent changes - // FIXME: we could track the eversion_t when we last scanned, and invalidate - // that way. or explicitly modify/invalidate when we actually change specific - // objects. 
- dout(10) << " rescanning local backfill_info from " << backfill_pos << dendl; - backfill_info.clear(); - osr->flush(); - scan_range(backfill_pos, local_min, local_max, &backfill_info, handle); + // update our local interval to cope with recent changes + backfill_info.begin = backfill_pos; + update_range(&backfill_info, handle); int ops = 0; map > to_push; @@ -7977,7 +7973,8 @@ int ReplicatedPG::recover_backfill( if (backfill_info.begin <= pbi.begin && !backfill_info.extends_to_end() && backfill_info.empty()) { osr->flush(); - scan_range(backfill_info.end, local_min, local_max, &backfill_info, + backfill_info.begin = backfill_info.end; + scan_range(local_min, local_max, &backfill_info, handle); backfill_info.trim(); } @@ -8138,25 +8135,81 @@ void ReplicatedPG::prep_backfill_object_push( start_recovery_op(oid); recovering.insert(oid); ObjectContextRef obc = get_object_context(oid, false); + + // We need to take the read_lock here in order to flush in-progress writes + obc->ondisk_read_lock(); pgbackend->recover_object( oid, ObjectContextRef(), obc, h); + obc->ondisk_read_unlock(); +} + +void ReplicatedPG::update_range( + BackfillInterval *bi, + ThreadPool::TPHandle &handle) +{ + int local_min = cct->_conf->osd_backfill_scan_min; + int local_max = cct->_conf->osd_backfill_scan_max; + if (bi->version >= info.last_update) { + dout(10) << __func__<< ": bi is current " << dendl; + assert(bi->version == info.last_update); + } else if (bi->version >= info.log_tail) { + assert(!pg_log.get_log().empty()); + dout(10) << __func__<< ": bi is old, (" << bi->version + << ") can be updated with log" << dendl; + list::const_iterator i = + pg_log.get_log().log.end(); + --i; + while (i != pg_log.get_log().log.begin() && + i->version > bi->version) { + --i; + } + if (i->version == bi->version) + ++i; + + assert(i != pg_log.get_log().log.end()); + dout(10) << __func__ << ": updating from version " << i->version + << dendl; + for (; i != pg_log.get_log().log.end(); ++i) { + const hobject_t &soid = i->soid; + if (soid >= bi->begin && soid < bi->end) { + if (i->is_update()) { + dout(10) << __func__ << ": " << i->soid << " updated to version " + << i->version << dendl; + bi->objects.erase(i->soid); + bi->objects.insert( + make_pair( + i->soid, + i->version)); + } else if (i->is_delete()) { + dout(10) << __func__ << ": " << i->soid << " removed" << dendl; + bi->objects.erase(i->soid); + } + } + } + bi->version = info.last_update; + } else { + dout(10) << __func__<< ": bi is old, rescanning local backfill_info" + << dendl; + osr->flush(); + scan_range(local_min, local_max, &backfill_info, handle); + } } void ReplicatedPG::scan_range( - hobject_t begin, int min, int max, BackfillInterval *bi, + int min, int max, BackfillInterval *bi, ThreadPool::TPHandle &handle) { assert(is_locked()); - dout(10) << "scan_range from " << begin << dendl; - bi->begin = begin; + dout(10) << "scan_range from " << bi->begin << dendl; + bi->version = info.last_update; bi->objects.clear(); // for good measure vector ls; ls.reserve(max); - int r = osd->store->collection_list_partial(coll, begin, min, max, + int r = osd->store->collection_list_partial(coll, bi->begin, min, max, 0, &ls, &bi->end); assert(r >= 0); dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl; diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index abee57ffe7d..bea793878d6 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -618,10 +618,16 @@ protected: * @bi [out] resulting map of objects to eversion_t's */ void scan_range( 
- hobject_t begin, int min, int max, BackfillInterval *bi, + int min, int max, BackfillInterval *bi, ThreadPool::TPHandle &handle ); + /// Update a hash range to reflect changes since the last scan + void update_range( + BackfillInterval *bi, ///< [in,out] interval to update + ThreadPool::TPHandle &handle ///< [in] tp handle + ); + void prep_backfill_object_push( hobject_t oid, eversion_t v, eversion_t have, int peer, PGBackend::RecoveryHandle *h); -- cgit v1.2.1 From 4df481c2da145d4ff649ce0b5131c03b4b7a8bc5 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Tue, 17 Sep 2013 10:11:54 -0700 Subject: PGBackend,ReplicatedBackend: add interfaces for scanning the pg This will be important since the erasure coded pg will have a different on-disk format than the replicated backend. Signed-off-by: Samuel Just --- src/osd/PGBackend.h | 20 ++++++++++++ src/osd/ReplicatedBackend.cc | 72 ++++++++++++++++++++++++++++++++++++++++++++ src/osd/ReplicatedBackend.h | 20 ++++++++++++ 3 files changed, 112 insertions(+) diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h index e3cc05bf345..408c589a08a 100644 --- a/src/osd/PGBackend.h +++ b/src/osd/PGBackend.h @@ -205,6 +205,26 @@ virtual void clear_temp_obj(const hobject_t &oid) = 0; virtual ~PGBackend() {} + + /// List objects in collection + virtual int objects_list_partial( + const hobject_t &begin, + int min, + int max, + snapid_t seq, + vector *ls, + hobject_t *next) = 0; + + virtual int objects_list_range( + const hobject_t &start, + const hobject_t &end, + snapid_t seq, + vector *ls) = 0; + + virtual int objects_get_attr( + const hobject_t &hoid, + const string &attr, + bufferlist *out) = 0; }; #endif diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc index 9868e7af2c8..ddc39d70372 100644 --- a/src/osd/ReplicatedBackend.cc +++ b/src/osd/ReplicatedBackend.cc @@ -194,3 +194,75 @@ void ReplicatedBackend::on_flushed() assert(0 == "found garbage in the temp collection"); } } + + +int ReplicatedBackend::objects_list_partial( + const hobject_t &begin, + int min, + int max, + snapid_t seq, + vector *ls, + hobject_t *next) +{ + vector objects; + ghobject_t _next; + int r = osd->store->collection_list_partial( + coll, + begin, + min, + max, + seq, + &objects, + &_next); + ls->reserve(objects.size()); + for (vector::iterator i = objects.begin(); + i != objects.end(); + ++i) { + assert(i->is_degenerate()); + ls->push_back(i->hobj); + } + assert(_next.is_degenerate()); + *next = _next.hobj; + return r; +} + +int ReplicatedBackend::objects_list_range( + const hobject_t &start, + const hobject_t &end, + snapid_t seq, + vector *ls) +{ + vector objects; + int r = osd->store->collection_list_range( + coll, + start, + end, + seq, + &objects); + ls->reserve(objects.size()); + for (vector::iterator i = objects.begin(); + i != objects.end(); + ++i) { + assert(i->is_degenerate()); + ls->push_back(i->hobj); + } + return r; +} + +int ReplicatedBackend::objects_get_attr( + const hobject_t &hoid, + const string &attr, + bufferlist *out) +{ + bufferptr bp; + int r = osd->store->getattr( + coll, + hoid, + attr.c_str(), + bp); + if (r >= 0 && out) { + out->clear(); + out->push_back(bp); + } + return r; +} diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h index e34e55a618e..cc5f060e136 100644 --- a/src/osd/ReplicatedBackend.h +++ b/src/osd/ReplicatedBackend.h @@ -148,6 +148,26 @@ public: f->close_section(); } } + + /// List objects in collection + int objects_list_partial( + const hobject_t &begin, + int min, + int max, + snapid_t seq, + 
vector *ls, + hobject_t *next); + + int objects_list_range( + const hobject_t &start, + const hobject_t &end, + snapid_t seq, + vector *ls); + + int objects_get_attr( + const hobject_t &hoid, + const string &attr, + bufferlist *out); private: // push struct PushInfo { -- cgit v1.2.1 From 9a10a801f05030ec9cf6db710eeea177fe3a0bd8 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Tue, 17 Sep 2013 10:18:57 -0700 Subject: PG.cc: remove leading empty space Signed-off-by: Samuel Just --- src/osd/PG.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index f1985bf961b..f21bb66f0e4 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1,4 +1,3 @@ - // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab /* -- cgit v1.2.1 From ff17e45fe27bf05e5f3f71ac697d44be9aad9d69 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Tue, 17 Sep 2013 10:33:49 -0700 Subject: PG,ReplicatedPG: expose PGBackend to PG Signed-off-by: Samuel Just --- src/osd/PG.h | 3 +++ src/osd/ReplicatedPG.h | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/osd/PG.h b/src/osd/PG.h index 78377d03ad6..275d30c7658 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -48,6 +48,7 @@ #include "common/WorkQueue.h" #include "common/ceph_context.h" #include "include/str_list.h" +#include "PGBackend.h" #include #include @@ -193,6 +194,8 @@ protected: CephContext *cct; OSDriver osdriver; SnapMapper snap_mapper; + + virtual PGBackend *get_pgbackend() = 0; public: void update_snap_mapper_bits(uint32_t bits) { snap_mapper.update_bits(bits); diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index bea793878d6..026c4df2a90 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -124,6 +124,9 @@ public: typedef boost::shared_ptr CopyOpRef; boost::scoped_ptr pgbackend; + PGBackend *get_pgbackend() { + return pgbackend.get(); + } /// Listener methods void on_local_recover_start( -- cgit v1.2.1 From 8a919fb41d4958f2856fe3631b748bdff5767563 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Tue, 17 Sep 2013 10:12:11 -0700 Subject: ReplicatedPG,PG: adapt collection_list* users to PGBackend interface Signed-off-by: Samuel Just --- src/osd/PG.cc | 22 +++++++++++----------- src/osd/ReplicatedPG.cc | 18 +++++++++--------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index f21bb66f0e4..e962ff64627 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -1996,8 +1996,7 @@ void PG::upgrade(ObjectStore *store, const interval_set &snapcolls) hobject_t cur; vector objects; while (1) { - int r = store->collection_list_partial( - cid, + int r = get_pgbackend()->objects_list_partial( cur, store->get_ideal_list_min(), store->get_ideal_list_max(), @@ -2045,8 +2044,7 @@ void PG::upgrade(ObjectStore *store, const interval_set &snapcolls) while (1) { dout(1) << "Updating snap_mapper from main collection, " << done << " objects done" << dendl; - int r = store->collection_list_partial( - cid, + int r = get_pgbackend()->objects_list_partial( cur, store->get_ideal_list_min(), store->get_ideal_list_max(), @@ -3038,9 +3036,9 @@ int PG::build_scrub_map_chunk( // objects vector ls; - int ret = osd->store->collection_list_range(coll, start, end, 0, &ls); + int ret = get_pgbackend()->objects_list_range(start, end, 0, &ls); if (ret < 0) { - dout(5) << "collection_list_range error: " << ret << dendl; + dout(5) << "objects_list_range error: " << ret << dendl; return ret; } @@ -3560,11 +3558,13 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle) hobject_t 
start = scrubber.start; while (!boundary_found) { vector objects; - ret = osd->store->collection_list_partial(coll, start, - cct->_conf->osd_scrub_chunk_min, - cct->_conf->osd_scrub_chunk_max, - 0, - &objects, &scrubber.end); + ret = get_pgbackend()->objects_list_partial( + start, + cct->_conf->osd_scrub_chunk_min, + cct->_conf->osd_scrub_chunk_max, + 0, + &objects, + &scrubber.end); assert(ret >= 0); // in case we don't find a boundary: start again at the end diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index cc156a16e97..646b01c47c7 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -639,12 +639,13 @@ void ReplicatedPG::do_pg_op(OpRequestRef op) hobject_t next; hobject_t current = response.handle; osr->flush(); - int r = osd->store->collection_list_partial(coll, current, - list_size, - list_size, - snapid, - &sentries, - &next); + int r = pgbackend->objects_list_partial( + current, + list_size, + list_size, + snapid, + &sentries, + &next); if (r != 0) { result = -EINVAL; break; @@ -8209,8 +8210,7 @@ void ReplicatedPG::scan_range( vector ls; ls.reserve(max); - int r = osd->store->collection_list_partial(coll, bi->begin, min, max, - 0, &ls, &bi->end); + int r = pgbackend->objects_list_partial(bi->begin, min, max, 0, &ls, &bi->end); assert(r >= 0); dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl; dout(20) << ls << dendl; @@ -8225,7 +8225,7 @@ void ReplicatedPG::scan_range( dout(20) << " " << *p << " " << obc->obs.oi.version << dendl; } else { bufferlist bl; - int r = osd->store->getattr(coll, *p, OI_ATTR, bl); + int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl); assert(r >= 0); object_info_t oi(bl); bi->objects[*p] = oi.version; -- cgit v1.2.1 From 2ae9ece1a88d6e428fb76d31b64aa5ec3f23c9e8 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Tue, 17 Sep 2013 10:35:47 -0700 Subject: ReplicatedPG,PG: use PGBackend methods for getattr Signed-off-by: Samuel Just --- src/osd/PG.cc | 9 +++------ src/osd/ReplicatedPG.cc | 35 +++++++++++++++++++++++++---------- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/src/osd/PG.cc b/src/osd/PG.cc index e962ff64627..17a80708324 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2067,19 +2067,16 @@ void PG::upgrade(ObjectStore *store, const interval_set &snapcolls) ++j) { if (j->snap < CEPH_MAXSNAP) { OSDriver::OSTransaction _t(osdriver.get_transaction(&t)); - bufferptr bp; - r = store->getattr( - cid, + bufferlist bl; + r = get_pgbackend()->objects_get_attr( *j, OI_ATTR, - bp); + &bl); if (r < 0) { derr << __func__ << ": getattr returned " << cpp_strerror(r) << dendl; assert(0); } - bufferlist bl; - bl.push_back(bp); object_info_t oi(bl); set oi_snaps(oi.snaps.begin(), oi.snaps.end()); set cur_snaps; diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 646b01c47c7..9c0b6b3d30b 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -398,8 +398,10 @@ bool PGLSPlainFilter::filter(bufferlist& xattr_data, bufferlist& outdata) bool ReplicatedPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata) { bufferlist bl; - - int ret = osd->store->getattr(coll_t(info.pgid), sobj, filter->get_xattr().c_str(), bl); + int ret = pgbackend->objects_get_attr( + sobj, + filter->get_xattr(), + &bl); dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl; if (ret < 0) return false; @@ -683,13 +685,17 @@ void ReplicatedPG::do_pg_op(OpRequestRef op) if (snapid != CEPH_NOSNAP) { bufferlist bl; if (candidate.snap 
== CEPH_NOSNAP) { - osd->store->getattr(coll, candidate, SS_ATTR, bl); + pgbackend->objects_get_attr( + candidate, + SS_ATTR, + &bl); SnapSet snapset(bl); if (snapid <= snapset.seq) continue; } else { bufferlist attr_bl; - osd->store->getattr(coll, candidate, OI_ATTR, attr_bl); + pgbackend->objects_get_attr( + candidate, OI_ATTR, &attr_bl); object_info_t oi(attr_bl); vector::iterator i = find(oi.snaps.begin(), oi.snaps.end(), @@ -2637,7 +2643,10 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) string aname; bp.copy(op.xattr.name_len, aname); string name = "_" + aname; - int r = osd->store->getattr(coll, soid, name.c_str(), osd_op.outdata); + int r = pgbackend->objects_get_attr( + soid, + name, + &(osd_op.outdata)); if (r >= 0) { op.xattr.value_len = r; result = 0; @@ -2680,9 +2689,15 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector& ops) bufferlist xattr; if (op.op == CEPH_OSD_OP_CMPXATTR) - result = osd->store->getattr(coll, soid, name.c_str(), xattr); + result = pgbackend->objects_get_attr( + soid, + name, + &xattr); else - result = osd->store->getattr(coll, src_obc->obs.oi.soid, name.c_str(), xattr); + result = pgbackend->objects_get_attr( + src_obc->obs.oi.soid, + name, + &xattr); if (result < 0 && result != -EEXIST && result != -ENODATA) break; @@ -5229,7 +5244,7 @@ ObjectContextRef ReplicatedPG::get_object_context(const hobject_t& soid, assert(attrs->count(OI_ATTR)); bv.push_back(attrs->find(OI_ATTR)->second); } else { - int r = osd->store->getattr(coll, soid, OI_ATTR, bv); + int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv); if (r < 0) { if (!can_create) return ObjectContextRef(); // -ENOENT! @@ -5493,12 +5508,12 @@ SnapSetContext *ReplicatedPG::get_snapset_context( if (!attrs) { hobject_t head(oid, key, CEPH_NOSNAP, seed, info.pgid.pool(), nspace); - int r = osd->store->getattr(coll, head, SS_ATTR, bv); + int r = pgbackend->objects_get_attr(head, SS_ATTR, &bv); if (r < 0) { // try _snapset hobject_t snapdir(oid, key, CEPH_SNAPDIR, seed, info.pgid.pool(), nspace); - r = osd->store->getattr(coll, snapdir, SS_ATTR, bv); + r = pgbackend->objects_get_attr(snapdir, SS_ATTR, &bv); if (r < 0 && !can_create) return NULL; } -- cgit v1.2.1 From 0c1e251351ca030cc9e8eaa753e3a70890c8b3a0 Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Fri, 20 Sep 2013 11:26:49 -0700 Subject: ReplicatedPG: add debugging in recover_replicas for objects added for backfill Signed-off-by: Samuel Just --- src/osd/ReplicatedPG.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 9c0b6b3d30b..2627d74738c 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -7890,6 +7890,8 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle) int peer = acting[i]; map::const_iterator pm = peer_missing.find(peer); assert(pm != peer_missing.end()); + map::const_iterator pi = peer_info.find(peer); + assert(pi != peer_info.end()); size_t m_sz = pm->second.num_missing(); dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl; @@ -7903,6 +7905,15 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle) handle.reset_tp_timeout(); const hobject_t soid(p->second); + if (soid > pi->second.last_backfill) { + if (!recovering.count(soid)) { + derr << __func__ << ": object added to missing set for backfill, but " + << "is not in recovering, error!" 
<< dendl; + assert(0); + } + continue; + } + if (recovering.count(soid)) { dout(10) << __func__ << ": already recovering" << soid << dendl; continue; -- cgit v1.2.1 From 391a885f703543b18b8f03265e429b3315abfaea Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Fri, 27 Sep 2013 15:31:56 -0700 Subject: FileStore: make _setattrs not return -ENOENT most of the time Signed-off-by: Samuel Just --- src/os/FileStore.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index a470e63dc1c..d37a65dbe5a 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -3585,6 +3585,8 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, mapversion <= pg_log.get_tail()) || // sloppy check - (pg_log.get_log().log.rbegin()->version != pg_log.get_head() && - !(pg_log.get_head() == pg_log.get_tail()))) + // sloppy check + if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail())) osd->clog.error() << info.pgid << " log bound mismatch, info (" << pg_log.get_tail() << "," << pg_log.get_head() << "]" diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc index 6e025f289bc..1949c96fd57 100644 --- a/src/osd/PGLog.cc +++ b/src/osd/PGLog.cc @@ -52,13 +52,9 @@ void PGLog::IndexedLog::split_into( if (log.empty()) tail = head; - else - head = log.rbegin()->version; if (olog->empty()) olog->tail = olog->head; - else - olog->head = olog->log.rbegin()->version; olog->index(); index(); -- cgit v1.2.1 From b87bc2311aa4da065477f402a869e2edc1558e2f Mon Sep 17 00:00:00 2001 From: Samuel Just Date: Wed, 2 Oct 2013 18:00:04 -0700 Subject: ReplicatedPG: lock snapdir obc during write Otherwise, we won't block properly in prep_push_backfill_object. Signed-off-by: Samuel Just --- src/osd/ReplicatedPG.cc | 13 +++++++++++-- src/osd/ReplicatedPG.h | 9 +++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 2627d74738c..eb32c6065ef 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -4651,10 +4651,19 @@ void ReplicatedPG::apply_repop(RepGather *repop) if (repop->ctx->clone_obc) repop->ctx->clone_obc->ondisk_write_lock(); + bool unlock_snapset_obc = false; + if (repop->ctx->snapset_obc && repop->ctx->snapset_obc->obs.oi.soid != + repop->obc->obs.oi.soid) { + repop->ctx->snapset_obc->ondisk_write_lock(); + unlock_snapset_obc = true; + } + Context *oncommit = new C_OSD_OpCommit(this, repop); Context *onapplied = new C_OSD_OpApplied(this, repop); - Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(repop->obc, - repop->ctx->clone_obc); + Context *onapplied_sync = new C_OSD_OndiskWriteUnlock( + repop->obc, + repop->ctx->clone_obc, + unlock_snapset_obc ? 
repop->ctx->snapset_obc : ObjectContextRef()); int r = osd->store->queue_transactions(osr.get(), repop->tls, onapplied, oncommit, onapplied_sync, repop->ctx->op); if (r) { derr << "apply_repop queue_transactions returned " << r << " on " << *repop << dendl; diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 026c4df2a90..b398dd2fea4 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -670,12 +670,17 @@ protected: } }; struct C_OSD_OndiskWriteUnlock : public Context { - ObjectContextRef obc, obc2; - C_OSD_OndiskWriteUnlock(ObjectContextRef o, ObjectContextRef o2 = ObjectContextRef()) : obc(o), obc2(o2) {} + ObjectContextRef obc, obc2, obc3; + C_OSD_OndiskWriteUnlock( + ObjectContextRef o, + ObjectContextRef o2 = ObjectContextRef(), + ObjectContextRef o3 = ObjectContextRef()) : obc(o), obc2(o2), obc3(o3) {} void finish(int r) { obc->ondisk_write_unlock(); if (obc2) obc2->ondisk_write_unlock(); + if (obc3) + obc3->ondisk_write_unlock(); } }; struct C_OSD_OndiskWriteUnlockList : public Context { -- cgit v1.2.1 From bb9b9c89537b10482d31b93e26b810edafb3492a Mon Sep 17 00:00:00 2001 From: David Zafman Date: Mon, 30 Sep 2013 15:53:35 -0700 Subject: common, os: Perform xattr handling based on detected fs type In FileStore::_detect_fs() store discovered filesystem type in m_fs_type Add per-filesystem filestore_max_inline_xattr_size_* variants Add per-filesystem filestore_max_inline_xattrs_* variants New function set_xattr_limits_via_conf() Set m_filestore_max_inline_xattr_size based on override or fs type Set m_filestore_max_inline_xattrs based on override or fs type Handle conf change of any relevant value by calling set_xattr_limits_via_conf() Change filestore_max_inline_xattr_size to override if non-zero Change filestore_max_inline_xattrs to override if non-zero Fixes: #6143 Signed-off-by: David Zafman --- src/common/config_opts.h | 12 +++++++-- src/os/FileStore.cc | 64 +++++++++++++++++++++++++++++++++++++++++++++--- src/os/FileStore.h | 15 ++++++++++++ 3 files changed, 86 insertions(+), 5 deletions(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index f9a1e45ff80..2d3f981379b 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -544,11 +544,19 @@ OPTION(filestore_index_retry_probability, OPT_DOUBLE, 0) OPTION(filestore_debug_inject_read_err, OPT_BOOL, false) OPTION(filestore_debug_omap_check, OPT_BOOL, 0) // Expensive debugging check on sync + // Use omap for xattrs for attrs over // filestore_max_inline_xattr_size or -OPTION(filestore_max_inline_xattr_size, OPT_U32, 512) +OPTION(filestore_max_inline_xattr_size, OPT_U32, 0) //Override +OPTION(filestore_max_inline_xattr_size_xfs, OPT_U32, 65536) +OPTION(filestore_max_inline_xattr_size_btrfs, OPT_U32, 2048) +OPTION(filestore_max_inline_xattr_size_other, OPT_U32, 512) + // for more than filestore_max_inline_xattrs attrs -OPTION(filestore_max_inline_xattrs, OPT_U32, 2) +OPTION(filestore_max_inline_xattrs, OPT_U32, 0) //Override +OPTION(filestore_max_inline_xattrs_xfs, OPT_U32, 10) +OPTION(filestore_max_inline_xattrs_btrfs, OPT_U32, 10) +OPTION(filestore_max_inline_xattrs_other, OPT_U32, 2) OPTION(filestore_sloppy_crc, OPT_BOOL, false) // track sloppy crcs OPTION(filestore_sloppy_crc_block_size, OPT_INT, 65536) diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index 1a9206083c9..8330f5d2d79 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -422,7 +422,10 @@ FileStore::FileStore(const std::string &base, const std::string &jdev, const cha m_filestore_do_dump(false), 
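// note: the members added below start out as 0 / FS_TYPE_NONE here; the
// effective limits are only computed once _detect_fs() identifies the
// backing filesystem and calls set_xattr_limits_via_conf()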
m_filestore_dump_fmt(true), m_filestore_sloppy_crc(g_conf->filestore_sloppy_crc), - m_filestore_sloppy_crc_block_size(g_conf->filestore_sloppy_crc_block_size) + m_filestore_sloppy_crc_block_size(g_conf->filestore_sloppy_crc_block_size), + m_fs_type(FS_TYPE_NONE), + m_filestore_max_inline_xattr_size(0), + m_filestore_max_inline_xattrs(0) { m_filestore_kill_at.set(g_conf->filestore_kill_at); @@ -825,12 +828,14 @@ int FileStore::_detect_fs() blk_size = st.f_bsize; + m_fs_type = FS_TYPE_OTHER; #if defined(__linux__) if (st.f_type == BTRFS_SUPER_MAGIC) { dout(0) << "mount detected btrfs" << dendl; backend = new BtrfsFileStoreBackend(this); wbthrottle.set_fs(WBThrottle::BTRFS); + m_fs_type = FS_TYPE_BTRFS; } else if (st.f_type == XFS_SUPER_MAGIC) { dout(1) << "mount detected xfs" << dendl; if (m_filestore_replica_fadvise) { @@ -838,15 +843,19 @@ int FileStore::_detect_fs() g_conf->set_val("filestore_replica_fadvise", "false"); g_conf->apply_changes(NULL); assert(m_filestore_replica_fadvise == false); + m_fs_type = FS_TYPE_XFS; } } #endif #ifdef HAVE_LIBZFS if (st.f_type == ZFS_SUPER_MAGIC) { backend = new ZFSFileStoreBackend(this); + m_fs_type = FS_TYPE_ZFS; } #endif + set_xattr_limits_via_conf(); + r = backend->detect_features(); if (r < 0) { derr << "_detect_fs: detect_features error: " << cpp_strerror(r) << dendl; @@ -3506,7 +3515,7 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, mapfirst.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN); - if (p->second.length() > g_conf->filestore_max_inline_xattr_size) { + if (p->second.length() > m_filestore_max_inline_xattr_size) { if (inline_set.count(p->first)) { inline_set.erase(p->first); r = chain_fremovexattr(**fd, n); @@ -3518,7 +3527,7 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, mapfirst) && - inline_set.size() >= g_conf->filestore_max_inline_xattrs) { + inline_set.size() >= m_filestore_max_inline_xattrs) { if (inline_set.count(p->first)) { inline_set.erase(p->first); r = chain_fremovexattr(**fd, n); @@ -4547,6 +4556,17 @@ const char** FileStore::get_tracked_conf_keys() const void FileStore::handle_conf_change(const struct md_config_t *conf, const std::set &changed) { + if (changed.count("filestore_max_inline_xattr_size") || + changed.count("filestore_max_inline_xattr_size_xfs") || + changed.count("filestore_max_inline_xattr_size_btrfs") || + changed.count("filestore_max_inline_xattr_size_other") || + changed.count("filestore_max_inline_xattrs") || + changed.count("filestore_max_inline_xattrs_xfs") || + changed.count("filestore_max_inline_xattrs_btrfs") || + changed.count("filestore_max_inline_xattrs_other")) { + Mutex::Locker l(lock); + set_xattr_limits_via_conf(); + } if (changed.count("filestore_min_sync_interval") || changed.count("filestore_max_sync_interval") || changed.count("filestore_queue_max_ops") || @@ -4626,6 +4646,44 @@ void FileStore::dump_transactions(list& ls, uint64_t m_filestore_dump.flush(); } +void FileStore::set_xattr_limits_via_conf() +{ + uint32_t fs_xattr_size; + uint32_t fs_xattrs; + + assert(m_fs_type != FS_TYPE_NONE); + + switch(m_fs_type) { + case FS_TYPE_XFS: + fs_xattr_size = g_conf->filestore_max_inline_xattr_size_xfs; + fs_xattrs = g_conf->filestore_max_inline_xattrs_xfs; + break; + case FS_TYPE_BTRFS: + fs_xattr_size = g_conf->filestore_max_inline_xattr_size_btrfs; + fs_xattrs = g_conf->filestore_max_inline_xattrs_btrfs; + break; + case FS_TYPE_ZFS: + case FS_TYPE_OTHER: + fs_xattr_size = g_conf->filestore_max_inline_xattr_size_other; + fs_xattrs = g_conf->filestore_max_inline_xattrs_other; 
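/* note: ZFS shares the conservative "other" defaults here; only XFS and
   btrfs get the larger inline-xattr budgets above */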
+ break; + default: + assert(!"Unknown fs type"); + } + + //Use override value if set + if (g_conf->filestore_max_inline_xattr_size) + m_filestore_max_inline_xattr_size = g_conf->filestore_max_inline_xattr_size; + else + m_filestore_max_inline_xattr_size = fs_xattr_size; + + //Use override value if set + if (g_conf->filestore_max_inline_xattrs) + m_filestore_max_inline_xattrs = g_conf->filestore_max_inline_xattrs; + else + m_filestore_max_inline_xattrs = fs_xattrs; +} + // -- FSSuperblock -- void FSSuperblock::encode(bufferlist &bl) const diff --git a/src/os/FileStore.h b/src/os/FileStore.h index fdab0ece34f..c489fdd5796 100644 --- a/src/os/FileStore.h +++ b/src/os/FileStore.h @@ -64,6 +64,14 @@ static const __SWORD_TYPE XFS_SUPER_MAGIC(0x58465342); static const __SWORD_TYPE ZFS_SUPER_MAGIC(0x2fc12fc1); #endif +enum fs_types { + FS_TYPE_NONE = 0, + FS_TYPE_XFS, + FS_TYPE_BTRFS, + FS_TYPE_ZFS, + FS_TYPE_OTHER +}; + class FileStoreBackend; #define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects") @@ -593,6 +601,13 @@ private: atomic_t m_filestore_kill_at; bool m_filestore_sloppy_crc; int m_filestore_sloppy_crc_block_size; + enum fs_types m_fs_type; + + //Determined xattr handling based on fs type + void set_xattr_limits_via_conf(); + uint32_t m_filestore_max_inline_xattr_size; + uint32_t m_filestore_max_inline_xattrs; + FSSuperblock superblock; /** -- cgit v1.2.1 From e927941fcadff56483137cffc0899b4ab9c6c297 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 4 Oct 2013 16:49:41 -0700 Subject: doc/release-notes: v0.67.4 Signed-off-by: Sage Weil --- doc/changelog/v0.67.4.txt | 550 ++++++++++++++++++++++++++++++++++++++++++++++ doc/release-notes.rst | 34 +++ 2 files changed, 584 insertions(+) create mode 100644 doc/changelog/v0.67.4.txt diff --git a/doc/changelog/v0.67.4.txt b/doc/changelog/v0.67.4.txt new file mode 100644 index 00000000000..73b997ea304 --- /dev/null +++ b/doc/changelog/v0.67.4.txt @@ -0,0 +1,550 @@ +commit ad85b8bfafea6232d64cb7ba76a8b6e8252fa0c7 +Author: Gary Lowell +Date: Thu Oct 3 22:41:31 2013 +0000 + + v0.67.4 + +commit 5cd66d3b4bca92b402c95ab256fbc3f0329c446f +Author: Yehuda Sadeh +Date: Fri Sep 20 14:04:47 2013 -0700 + + rgw: fix keystone token expiration test + + Fixes: #6360 + The test was inverted, need expiration to be greater than + current time in order for token to be valid. + + Signed-off-by: Yehuda Sadeh + Reviewed-by: Sage Weil + +commit e0203c61a3f45fdd6d3d3ece26fef6152bdc036d +Author: David Zafman +Date: Wed Sep 11 16:55:06 2013 -0700 + + osd/OSD.cc: Use MIN() so that we don't exceed osd_recovery_max_active + + Caused by 944f3b73531af791c90f0f061280160003545c63 + + Fixes: #6291 + + Backport: dumpling + + Signed-off-by: David Zafman + Reviewed-by: Samuel Just + (cherry picked from commit 139a714e13aa3c7f42091270b55dde8a17b3c4b8) + + Conflicts: + + src/osd/OSD.cc + +commit c376708358cedb5561fbb43e9b9e622df3ea7a58 +Author: Joao Eduardo Luis +Date: Wed Sep 25 22:08:24 2013 +0100 + + mon: OSDMonitor: do not write full_latest during trim + + On commit 81983bab we patched OSDMonitor::update_from_paxos() such that we + write the latest full map version to 'full_latest' each time the latest + full map was built from the incremental versions. + + This change however clashed with OSDMonitor::encode_trim_extra(), which + also wrote to 'full_latest' on each trim, writing instead the version of + the *oldest* full map. 
This duality of behaviors could lead the store + to an inconsistent state across the monitors (although there's no sign of + it actually imposing any issues besides rebuilding already existing full + maps on some monitors). + + We now stop OSDMonitor::encode_trim_extra() from writing to 'full_latest'. + This function will still write out the oldest full map it has in the store, + but it will no longer write to full_latest, instead leaving it up to + OSDMonitor::update_from_paxos() to figure it out -- and it already does. + + Fixes: #6378 + + Backport: dumpling + + Signed-off-by: Joao Eduardo Luis + Reviewed-by: Sage Weil + (cherry picked from commit bd0f29a2c28cca496ec830eac932477ebf3182ba) + +commit de40d0b3e35ab0124cd3c4ebfcaa435ab8abfab9 +Author: Sage Weil +Date: Tue Oct 1 15:53:42 2013 -0700 + + crush: invalidate rmap on create (and thus decode) + + If we have an existing CrushWrapper object and decode from a bufferlist, + reset build_rmaps so that they get rebuilt. + + Remove the build_rmaps() all in decode that was useless on a redecode + (because have_rmaps == true in that case and it did nothing). + + Fixes: #6442 + Backport: dumpling, maybe cuttlefish + Signed-off-by: Sage Weil + Reviewed-by: Joao Eduardo Luis + (cherry picked from commit 9b7a2ae329b6a511064dd3d6e549ba61f52cfd21) + +commit 32f5233288c47d95b87c0a9cab5f9c2ffcf15417 +Author: Dan Mick +Date: Mon Sep 30 14:58:11 2013 -0700 + + Invoke python with /usr/bin/env python instead of directly + + Fixes: #6311 + Signed-off-by: Dan Mick + (cherry picked from commit b9000b314b9166845ff302d4a827a996775d9a14) + +commit 66aeca5a9079be398403bbff67bd5bf68c6fb111 +Author: Sage Weil +Date: Wed Sep 25 10:10:21 2013 -0700 + + qa/workunits/mon/crush_ops.sh: fix test + + Fix root. + + Fixes: #6392 + Signed-off-by: Sage Weil + (cherry picked from commit c8cae87e9e08468cc86145e0fd60c05d12826239) + +commit beb366302a125dd422c4f092b12eb541cb3bc788 +Author: Sage Weil +Date: Mon Sep 23 09:04:34 2013 -0700 + + Revert "ceph: parse CEPH_ARGS environment variable" + + This reverts commit 67a95b9880c9bc6e858150352318d68d64ed74ad. + + We now put CEPH_ARGS in the actual args we parse in python, which are passed + to rados piecemeal later. This lets you put things like --id ... in there + that need to be parsed before librados is initialized. + (cherry picked from commit 97f462be4829f0167ed3d65e6694dfc16f1f3243) + +commit b475ff9576f145d31c053213c699e13df76d2bcb +Author: Benoît Knecht +Date: Mon Sep 23 15:58:42 2013 +0200 + + Add CEPH_ARGS at the end of sys.argv + + This allows, for instance, to pass a different client name to ceph by + exporting CEPH_ARGS="--id client_id". + + Signed-off-by: Benoît Knecht + Signed-off-by: Sage Weil + (cherry picked from commit 30abe3244c86cbbe1f5b005850c29c9c0eafcad4) + +commit 94548b4b67cca37366c7d8719209a6d2e7956811 +Author: Sage Weil +Date: Tue Sep 24 15:26:03 2013 -0700 + + mon/OSDMonitor: fix 'ceph osd crush reweight ...' + + The adjust method returns a count of adjusted items. + + Add a test. 
+ + Fixes: #6382 + Backport: dumpling + Signed-off-by: Sage Weil + Reviewed-by: Dan Mick + (cherry picked from commit 3de32562b55c6ece3a6ed783c36f8b9f21460339) + +commit 00ff7f5c20e13869d0694379739ba4e61d44b97c +Author: Joao Eduardo Luis +Date: Tue Sep 10 00:20:41 2013 +0100 + + qa: workunits: mon: crush_ops: test 'ceph osd crush move' + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 3bc618b7b46496c5110edde0da9cae5d3e68e0e1) + +commit 0ff5b4a96833681e92cc41f019a569134474f4cf +Author: Loic Dachary +Date: Tue Sep 24 19:04:23 2013 +0200 + + osd: change warn_interval_multiplier to uint32_t + + to prevent overflow in OpTracker::check_ops_in_flight when + multiplying warn_interval_multiplier *= 2 + + Backport: cuttlefish, dumpling + + http://tracker.ceph.com/issues/6370 fixes #6370 + + Signed-off-by: Loic Dachary + (cherry picked from commit 1bce1f009bffd3e28025a08775fec189907a81db) + +commit fb15040b6cec6221baa550ddfffade823f784c4a +Author: David Zafman +Date: Mon Sep 9 13:01:12 2013 -0700 + + crushtool: do not dump core with non-unique bucket IDs + + Return -EEXIST on duplicate ID + BUG FIX: crush_add_bucket() mixes error returns and IDs + Add optional argument to return generated ID + + Fixes: #6246 + + Signed-off-by: David Zafman + Reviewed-by: Sage Weil + (cherry picked from commit 8c76f3a0f9cf100ea2c941dc2b61c470aa5033d7) + +commit 410db3f30c6eb54b807908c1f251ad4026e7d446 +Author: Joao Eduardo Luis +Date: Fri Sep 20 17:06:30 2013 +0100 + + qa: workunits: cephtool: check if 'heap' commands are parseable + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit b1eeaddd5f214c1b0883b44fc8cae07c649be7c4) + +commit 062060a38bb26ff260cc51accc534413d726de49 +Author: Joao Eduardo Luis +Date: Fri Sep 20 17:50:27 2013 +0100 + + osd: OSD: add 'heap' command to known osd commands array + + Must have been forgotten during the cli rework. + + Backport: dumpling + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 296f2d0db31e9f5a59a3a62a1e95b6c440430fa3) + +commit 3f32f57b98e0224a1d30b2a81d7d260be0f53800 +Author: Joao Eduardo Luis +Date: Fri Sep 20 16:43:27 2013 +0100 + + mds: MDS: pass only heap profiler commands instead of the whole cmd vector + + The heap profiler doesn't care, nor should it, what our command name is. + It only cares about the commands it handles. + + Backport: dumpling + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 238fe272c6bdb62d4e57fd8555c0136de99c8129) + +commit 46dcc46617d8f35ab8433540b22343ddcbcc3716 +Author: Joao Eduardo Luis +Date: Fri Sep 20 16:41:14 2013 +0100 + + perfglue/heap_profiler.cc: expect args as first element on cmd vector + + We used to pass 'heap' as the first element of the cmd vector when + handling commands. We haven't been doing so for a while now, so we + needed to fix this. + + Not expecting 'heap' also makes sense, considering that what we need to + know when we reach this function is what command we should handle, and + we should not care what the caller calls us when handling his business. + + Fixes: #6361 + Backport: dumpling + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit c98b910d49bd2b46ceafdc430044a31524c29f5b) + +commit 9dc5f15fbae22244ad1f62925e17c9d81e856e55 +Author: Yehuda Sadeh +Date: Mon Sep 16 14:35:25 2013 -0700 + + rgw: destroy get_obj handle in copy_obj() + + Fixes: #6176 + Backport: dumpling + We take different code paths in copy_obj, make sure we close the handle + when we exit the function. 
Move the call to finish_get_obj() out of + copy_obj_data() as we don't create the handle there, so that should + makes code less confusing and less prone to errors. + Also, note that RGWRados::get_obj() also calls finish_get_obj(). For + everything to work in concert we need to pass a pointer to the handle + and not the handle itself. Therefore we needed to also change the call + to copy_obj_data(). + + Reviewed-by: Josh Durgin + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 9e98620e4325d15c88440a890b267131613e1aa1) + +commit 471233e98a9f64ad513a4a196b7661b80534cb00 +Author: Joao Eduardo Luis +Date: Mon Sep 9 23:14:11 2013 +0100 + + mon: MonCommands: expect a CephString as 1st arg for 'osd crush move' + + Fixes: #6230 + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 7d3799fde19138f957f26ec6be10a8a0000fc1f0) + +commit 2908225092bd2aa1b8afcb7848c1cdac5bd9e638 +Author: Sage Weil +Date: Mon Sep 23 16:23:33 2013 -0700 + + osd: revert 'osd max xattr size' limit + + Set it to 0 (unlimited) for now. + + Backport: dumpling + + Signed-off-by: Sage Weil + Reviewed-by: Yehuda Sadeh + (cherry picked from commit abb88d70643c3a76435b7a9d5b04ff29f7502361) + +commit b3d3b3747c1eef695138dac828e5fcb435309c7b +Author: Greg Farnum +Date: Wed Sep 11 16:24:32 2013 -0700 + + mds: be more careful about decoding LogEvents + + We need to wrap the full decode section or we can abort the process + if there's an issue (which we may want to just skip by). + + Signed-off-by: Greg Farnum + Reviewed-by: Sage Weil + (cherry picked from commit 73289b34b0be5b6612e38944794d59b5e789f841) + +commit 06c58132199ed22413b509dfa751321ccdb24225 +Author: Joao Eduardo Luis +Date: Tue Sep 17 17:58:20 2013 +0100 + + mon: OSDMonitor: multiple rebuilt full maps per transaction + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 0d20cae0be701c5b6151a26ee5e4fe24d89aa20a) + +commit 65bbcaf4b68790dae4506c1f5db237077e1ff0ae +Author: Joao Eduardo Luis +Date: Sun Sep 15 21:03:50 2013 +0100 + + mon: OSDMonitor: update latest_full while rebuilding full maps + + Not doing so will make the monitor rebuild the osdmap full versions, even + though they may have been rebuilt before, every time the monitor starts. + + This mostly happens when the cluster is left in an unhealthy state for + a long period of time and incremental versions build up. Even though we + build the full maps on update_from_paxos(), not updating 'full_latest' + leads to the situation initially described. + + Fixes: #6322 + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 81983bab3630520d6c7ee9b7e4a747bc17b8c5c3) + +commit 9b9edb04581cca15e67c567332529f5b3f426743 +Author: Joao Eduardo Luis +Date: Sun Sep 15 21:00:55 2013 +0100 + + mon: OSDMonitor: smaller transactions when rebuilding full versions + + Otherwise, for considerably sized rebuilds, the monitor will not only + consume vast amounts of memory, but it will also have troubles committing + the transaction. Anyway, it's also a good idea to adjust transactions to + the granularity we want, and to be fair we care that each rebuilt full map + gets to disk, even if subsequent full maps don't (those can be rebuilt + later). 
+ + Fixes: #6323 + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit 4ac1570c5cdcd6556dc291cc6d7878fd92d343ae) + +commit 298811f7a15541b9ec1015c416ad2aa075be5691 +Author: Joao Eduardo Luis +Date: Wed Aug 28 15:51:01 2013 +0100 + + mon: OSDMonitor: check if pool is on unmanaged snaps mode on mk/rmsnap + + Backport: dumpling + Fixes: #6047 + + Signed-off-by: Joao Eduardo Luis + (cherry picked from commit fab79543c54c2e446d3f76520d7906645c6b0075) + +commit a992664435db9dde3745eb7f354cce3fc5400a47 +Author: Yehuda Sadeh +Date: Thu Sep 12 14:32:17 2013 -0700 + + lru_map: don't use list::size() + + replace list::size() with map::size(), which should have + a constant time complexity. + + Reviewed-by: Sage Weil + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 7c1d2ded8fa8061bf3f14932800998b963745dd1) + +commit 788546ea71c994ff35323747294ed9c177fe7020 +Author: Yehuda Sadeh +Date: Thu Sep 12 14:30:19 2013 -0700 + + common/lru_map: rename tokens to entries + + This code was originally used in a token cache, now + as a generic infrastructure rename token fields. + + Reviewed-by: Sage Weil + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 532e41a9985a16b35a6e49cdcba38af0ad166fa8) + +commit babeb00c42af760b3e7575166479e95365cfcc0a +Author: Yehuda Sadeh +Date: Wed Sep 18 10:37:21 2013 -0700 + + rgw: use bufferlist::append() instead of bufferlist::push_back() + + push_back() expects char *, whereas append can append a single char. + Appending a NULL char to push_back is cast as a NULL pointer which is + bad. + + Signed-off-by: Yehuda Sadeh + Reviewed-by: Josh Durgin + (cherry picked from commit 08fe028bad13096d482454a2f303158727c363ff) + +commit daf85c45dd4d158bc7c33a2fb784857bc7db35cd +Author: Yehuda Sadeh +Date: Wed Sep 11 13:46:31 2013 -0700 + + rgw: NULL terminate buffer before parsing it + + Fixes: #6175 + Backport: dumpling + We get a buffer off the remote gateway which might + not be NULL terminated. The JSON parser needs the + buffer to be NULL terminated even though we provide + a buffer length as it calls strlen(). + + Reviewed-by: Josh Durgin + Signed-off-by: Yehuda Sadeh + (cherry picked from commit e7f7483192cddca1159aba439ce62b1e78669d51) + +commit c73040a5518971813b9ebaae1624c5bacef315d0 +Author: Yehuda Sadeh +Date: Wed Sep 11 22:30:12 2013 -0700 + + rgw: don't call list::size() in ObjectCache + + Fixes: #6286 + Use an external counter instead of calling list::size() + + Reviewed-by: Sage Weil + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 31e3a51e933429d286104fe077e98ea883437ad6) + +commit a855aba9d18936e9a060119e041518790cd4b831 +Author: Yehuda Sadeh +Date: Tue Sep 10 12:18:55 2013 -0700 + + rgw: drain pending requests before completing write + + Fixes: #6268 + When doing aio write of objects (either regular or multipart parts) we + need to drain pending aio requests. Otherwise if gateway goes down then + object might end up corrupted. + + Reviewed-by: Josh Durgin + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 626669afaa333d73707553a85f5c874e99e9cbd8) + +commit 670db7e80ddc9c26c43a4f66907a5996ce207c4d +Author: Yehuda Sadeh +Date: Fri Sep 6 22:33:38 2013 -0700 + + rgw: fix get cors, delete cors + + Remove a couple of variables that overrode class member. Not + really clear how it was working before, might have been a bad + merge / rebase. 
+ + Signed-off-by: Yehuda Sadeh + (cherry picked from commit 13872785aeeddbe1b8dd97e49fd6a2d879514f8d) + +commit a304016fa01b02efd500135c00b9bf3407a9999c +Merge: 408cd61 ac0a30f +Author: Yehuda Sadeh +Date: Wed Sep 11 09:47:10 2013 -0700 + + Merge branch 'wip-6078-dumpling' into dumpling + + Reviewed-by: Josh Durgin + +commit ac0a30feb8c64a3b80d9c519a7b561213403afab +Author: Yehuda Sadeh +Date: Wed Aug 28 21:25:20 2013 -0700 + + rgw: fix certain return status cases in CORS + + Change return values in certain cases, reorder + checks, etc. + + Signed-off-by: Yehuda Sadeh + +commit 13b28cc3f1eb8ef42875b630c485ee0105cd244a +Author: Yehuda Sadeh +Date: Wed Aug 28 21:24:36 2013 -0700 + + rgw: add COPY method to be handled by CORS + + Was missing this http method. + + Signed-off-by: Yehuda Sadeh + +commit d45c87ea738807487e72c0719b0d3d459cbe19e9 +Author: Yehuda Sadeh +Date: Tue Aug 27 19:38:45 2013 -0700 + + rgw: fix CORS rule check + + Signed-off-by: Yehuda Sadeh + +commit 986fa92a7a1d88111ba28457160adfcfdaabc5d2 +Author: Yehuda Sadeh +Date: Tue Aug 27 19:38:18 2013 -0700 + + rgw: don't handle CORS if rule not found (is NULL) + + Signed-off-by: Yehuda Sadeh + +commit 71873aba6553492d3ad71596cefd7c841030a277 +Author: Yehuda Sadeh +Date: Thu Aug 22 13:38:55 2013 -0700 + + rgw: tie CORS header response to all relevant operations + + Have the CORS responses on all relevant operations. Also add headers + on failure cases. + + Signed-off-by: Yehuda Sadeh + +commit 94e7b594d85dbd26e58d823b41f418032e9f163f +Author: Yehuda Sadeh +Date: Thu Aug 22 10:00:53 2013 -0700 + + rgw: add a generic CORS response handling + + Signed-off-by: Yehuda Sadeh + +commit c3385d8a102faf5379559bb98cf89637ceda1579 +Author: Yehuda Sadeh +Date: Wed Aug 21 17:22:46 2013 -0700 + + rgw: OPTIONS request doesn't need to read object info + + This is a bucket-only operation, so we shouldn't look at the + object. Object may not exist and we might respond with Not + Exists response which is not what we want. + + Signed-off-by: Yehuda Sadeh + +commit a5fdd44e5d8ce4b8d82273d83e27aea19e63aa7c +Author: Yehuda Sadeh +Date: Wed Aug 21 14:43:28 2013 -0700 + + rgw: remove use of s->bucket_cors + + Some old code still tried to use s->bucket_cors, which was + abandoned in a cleanup work. + + Signed-off-by: Yehuda Sadeh diff --git a/doc/release-notes.rst b/doc/release-notes.rst index 604b4fa296b..bb1dfe4bfec 100644 --- a/doc/release-notes.rst +++ b/doc/release-notes.rst @@ -120,6 +120,40 @@ Notable Changes * sysvinit: add condrestart command (Dan van der Ster) + +v0.67.4 "Dumpling" +------------------ + +This point release fixes an important performance issue with radosgw, +keystone authentication token caching, and CORS. All users +(especially those of rgw) are encouraged to upgrade. + +Notable changes +~~~~~~~~~~~~~~~ + +* crush: fix invalidation of cached names +* crushtool: do not crash on non-unique bucket ids +* mds: be more careful when decoding LogEvents +* mds: fix heap check debugging commands +* mon: avoid rebuilding old full osdmaps +* mon: fix 'ceph crush move ...' +* mon: fix 'ceph osd crush reweight ...' 
+* mon: fix writeout of full osdmaps during trim
+* mon: limit size of transactions
+* mon: prevent both unmanaged and pool snaps
+* osd: disable xattr size limit (prevents upload of large rgw objects)
+* osd: fix recovery op throttling
+* osd: fix throttling of log messages for very slow requests
+* rgw: drain pending requests before completing write
+* rgw: fix CORS
+* rgw: fix inefficient list::size() usage
+* rgw: fix keystone token expiration
+* rgw: fix minor memory leaks
+* rgw: fix null termination of buffer
+
+For more detailed information, see :download:`the complete changelog <changelog/v0.67.4.txt>`.
+
+
 v0.67.3 "Dumpling"
 ------------------
-- cgit v1.2.1


From 92a60a05841397aa0c59a1097e133eaca27ca532 Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Fri, 4 Oct 2013 22:06:04 -0700
Subject: mon/PGMap: make generated test instances obey new invariant

As of 091809b8149c7595cbcca439c5b8b75a0c42efe1 we keep an osd_map epoch
for any osd_stat update, and assert as much.

Signed-off-by: Sage Weil
---
 src/mon/PGMap.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
index 0b3a0a6506c..ea70bbd61c3 100644
--- a/src/mon/PGMap.cc
+++ b/src/mon/PGMap.cc
@@ -152,6 +152,7 @@ void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
   o.back()->version = 2;
   o.back()->pg_stat_updates[pg_t(1,2,3)] = pg_stat_t();
   o.back()->osd_stat_updates[5] = osd_stat_t();
+  o.back()->osd_epochs[5] = 12;
   o.push_back(new Incremental);
   o.back()->version = 3;
   o.back()->osdmap_epoch = 1;
@@ -160,6 +161,7 @@ void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
   o.back()->nearfull_ratio = .3;
   o.back()->pg_stat_updates[pg_t(4,5,6)] = pg_stat_t();
   o.back()->osd_stat_updates[6] = osd_stat_t();
+  o.back()->osd_epochs[6] = 12;
   o.back()->pg_remove.insert(pg_t(1,2,3));
   o.back()->osd_stat_rm.insert(5);
 }
-- cgit v1.2.1


From f27964189419f590c5025c515986b1a3af5e2748 Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Sun, 6 Oct 2013 15:12:57 -0700
Subject: mon: do not put() unhandled message

If we return false because we aren't handling a message, we should not
put the ref.  This fixes a double-free.

Signed-off-by: Sage Weil
---
 src/mon/Monitor.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 3fe658d9623..aea81102720 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -2586,7 +2586,6 @@ bool Monitor::_ms_dispatch(Message *m)
   if (!src_is_mon && m->get_type() != CEPH_MSG_AUTH) {
     dout(1) << __func__ << " dropping stray message " << *m
             << " from " << m->get_source_inst() << dendl;
-    m->put();
     return false;
   }
-- cgit v1.2.1


From 71ee6d7c8d3973361fa66f5e9eb4863cbc11c8a8 Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Sun, 6 Oct 2013 15:17:19 -0700
Subject: mon: allow MMonGetMap without authentication

This is used by the MonClient::get_monmap_privately() helper.

Signed-off-by: Sage Weil
---
 src/mon/Monitor.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index aea81102720..d8c90bc3d76 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -2583,7 +2583,8 @@ bool Monitor::_ms_dispatch(Message *m)
   // and considering that we are creating a new session it is safe to
   // assume that the sender hasn't authenticated yet, so we have no way
   // of assessing whether we should handle it or not.
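  // note: CEPH_MSG_AUTH and, with this change, CEPH_MSG_MON_GET_MAP are the
  // only message types a not-yet-authenticated client legitimately sends;
  // the latter is what MonClient::get_monmap_privately() uses to probe
  // monitors before any auth handshake exists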
-  if (!src_is_mon && m->get_type() != CEPH_MSG_AUTH) {
+  if (!src_is_mon && (m->get_type() != CEPH_MSG_AUTH &&
+                      m->get_type() != CEPH_MSG_MON_GET_MAP)) {
     dout(1) << __func__ << " dropping stray message " << *m
             << " from " << m->get_source_inst() << dendl;
     return false;
-- cgit v1.2.1


From 6da4b91c07878e07f23eee563cf1d2422f348c2f Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Mon, 7 Oct 2013 05:22:20 -0700
Subject: os/FileStore: fix ENOENT error code for getattrs()

In commit dc0dfb9e01d593afdd430ca776cf4da2c2240a20 the omap xattrs code
moved up a block and r was no longer local to the block.  Translate
ENOENT -> 0 to compensate.

Fix the same error in _rmattrs().

Signed-off-by: Sage Weil
Reviewed-by: Greg Farnum
Reviewed-by: Samuel Just
---
 src/os/FileStore.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc
index 514ff022bee..3506c4a4ccd 100644
--- a/src/os/FileStore.cc
+++ b/src/os/FileStore.cc
@@ -3464,6 +3464,8 @@ int FileStore::getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset
     dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
     goto out;
   }
+  if (r == -ENOENT)
+    r = 0;
   assert(omap_attrs.size() == omap_aset.size());
   for (map<string, bufferlist>::iterator i = omap_aset.begin();
        i != omap_aset.end();
@@ -3651,6 +3653,8 @@ int FileStore::_rmattrs(coll_t cid, const ghobject_t& oid,
     dout(10) << __func__ << " could not remove omap_attrs r = " << r << dendl;
     return r;
   }
+  if (r == -ENOENT)
+    r = 0;
 out:
   dout(10) << "rmattrs " << cid << "/" << oid << " = " << r << dendl;
   return r;
-- cgit v1.2.1


From 4b911cf81773d43f3574724a0ac97c79e1ab2b22 Mon Sep 17 00:00:00 2001
From: Greg Farnum
Date: Mon, 7 Oct 2013 13:11:21 -0700
Subject: ReplicatedPG: copy: use aggregate return code instead of individual Op return

It appears that the OSD is not filling in the individual return codes, and
they should be equivalent for all purposes we care about here (the only Op
we are doing is the copy-get, and if it fails we are getting its failure
code).

Reported-by: Sage Weil
Signed-off-by: Greg Farnum
Reviewed-by: Samuel Just
---
 src/osd/ReplicatedPG.cc | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index d02a9c9cc48..6c8b092ca01 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -4412,14 +4412,13 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, tid_t tid, int r)
       dout(10) << __func__ << " fetching more" << dendl;
       _copy_some(obc, cop);
       return;
-    } else {
-      _build_finish_copy_transaction(cop, results.get<3>());
-      results.get<1>() = cop->temp_cursor.data_offset;
     }
+    _build_finish_copy_transaction(cop, results.get<3>());
+    results.get<1>() = cop->temp_cursor.data_offset;
   }
   dout(20) << __func__ << " complete; committing" << dendl;
-  results.get<0>() = cop->rval;
+  results.get<0>() = r;
   cop->cb->complete(results);
   copy_ops.erase(obc->obs.oi.soid);
-- cgit v1.2.1


From 6ff9570726b8ba6a6d1434a40ae86ca9649b05e6 Mon Sep 17 00:00:00 2001
From: Sandon Van Ness
Date: Tue, 8 Oct 2013 11:58:57 -0700
Subject: Go back to $PWD in fsstress.sh if compiling from source.

Although fsstress was invoked by a static path, it wrote its files
relative to the current directory.  cd'ing into the source directory
created under /tmp and then removing that directory left the script
trying to write into a directory that no longer exists.  This change
records the current path first and cd's back into it once fsstress has
been compiled.

Issue #6479.
Signed-off-by: Sandon Van Ness Reviewed-by: Alfredo Deza --- qa/workunits/suites/fsstress.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/qa/workunits/suites/fsstress.sh b/qa/workunits/suites/fsstress.sh index 7f945172687..394e5fad991 100755 --- a/qa/workunits/suites/fsstress.sh +++ b/qa/workunits/suites/fsstress.sh @@ -2,6 +2,7 @@ if [ ! -f /usr/lib/ltp/testcases/bin/fsstress ] then + path=`pwd` mkdir -p /tmp/fsstress cd /tmp/fsstress wget -q -O /tmp/fsstress/ltp-full.tgz http://ceph.com/qa/ltp-full-20091231.tgz @@ -13,6 +14,7 @@ then sudo cp -avf /tmp/fsstress/ltp-full-20091231/testcases/kernel/fs/fsstress/fsstress /usr/lib/ltp/testcases/bin/fsstress sudo chmod 755 /usr/lib/ltp/testcases/bin/fsstress rm -Rf /tmp/fsstress + cd $path fi command="/usr/lib/ltp/testcases/bin/fsstress -d fsstress-`hostname`$$ -l 1 -n 1000 -p 10 -v" -- cgit v1.2.1 From 7ef5eb06ec07d4e7c3fee4d85de8a8310f7ed94f Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Thu, 3 Oct 2013 11:49:33 -0700 Subject: librados: drop reference to completion in container destructor Move the PoolAsyncCompletionImpl reference drop from C_PoolAsync_Safe::finish() to ~C_PoolAsyncSafe(), as finish() is only called when the async request is actually sent. Signed-off-by: Yehuda Sadeh Reviewed-by: Josh Durgin --- src/librados/PoolAsyncCompletionImpl.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/librados/PoolAsyncCompletionImpl.h b/src/librados/PoolAsyncCompletionImpl.h index efb89641466..443b2c23a17 100644 --- a/src/librados/PoolAsyncCompletionImpl.h +++ b/src/librados/PoolAsyncCompletionImpl.h @@ -94,6 +94,9 @@ namespace librados { C_PoolAsync_Safe(PoolAsyncCompletionImpl *_c) : c(_c) { c->get(); } + ~C_PoolAsync_Safe() { + c->put(); + } void finish(int r) { c->lock.Lock(); @@ -109,7 +112,7 @@ namespace librados { c->lock.Lock(); } - c->put_unlock(); + c->lock.Unlock(); } }; } -- cgit v1.2.1 From a1825356ad7e0747bbef2faa3a085fab2d52db97 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Fri, 11 Oct 2013 14:52:13 -0700 Subject: librados: add some clarifying comments about async pool operation handle Signed-off-by: Yehuda Sadeh --- src/include/rados/librados.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp index 5a750cbc0d1..23e270641cf 100644 --- a/src/include/rados/librados.hpp +++ b/src/include/rados/librados.hpp @@ -744,7 +744,12 @@ namespace librados int cluster_stat(cluster_stat_t& result); int cluster_fsid(std::string *fsid); - /* pool aio */ + /* + * pool aio + * + * It is up to the caller to release the completion handler, even if the pool_create_async() + * and/or pool_delete_async() fails and does not send the async request + */ static PoolAsyncCompletion *pool_async_create_completion(); // -- aio -- -- cgit v1.2.1 From f2645e1c6d7383a0ace3b239f4304e353249c4bb Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Fri, 4 Oct 2013 13:00:26 -0700 Subject: rgw: swift update obj metadata also add generic attrs Fixes: #6462 We were missing the generic attributes when we updated the object metadata (operation that only exists in the swift api). 
Reviewed-by: Josh Durgin Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_op.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc index 114b8709a22..fc4ad6d3511 100644 --- a/src/rgw/rgw_op.cc +++ b/src/rgw/rgw_op.cc @@ -1604,6 +1604,13 @@ void RGWPutMetadata::execute() } } + map::iterator giter; + for (giter = s->generic_attrs.begin(); giter != s->generic_attrs.end(); ++giter) { + bufferlist& attrbl = attrs[giter->first]; + const string& val = giter->second; + attrbl.append(val.c_str(), val.size() + 1); + } + if (has_policy) { policy.encode(bl); attrs[RGW_ATTR_ACL] = bl; -- cgit v1.2.1 From 82e3317d7990485f28edad8670e95414c438ad5c Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Tue, 15 Oct 2013 00:51:30 +0100 Subject: mon: PGMap: keep track of per-pool stats deltas Signed-off-by: Joao Eduardo Luis --- src/mon/PGMap.cc | 127 ++++++++++++++++++++++++++++++++++++++++++++------- src/mon/PGMap.h | 40 +++++++++++++++- src/mon/PGMonitor.cc | 33 ++++++++++++- 3 files changed, 181 insertions(+), 19 deletions(-) diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index ea70bbd61c3..666b932bd41 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -180,6 +180,7 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc) stamp = inc.stamp; pool_stat_t pg_sum_old = pg_sum; + hash_map pg_pool_sum_old; bool ratios_changed = false; if (inc.full_ratio != full_ratio && inc.full_ratio != -1) { @@ -199,6 +200,9 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc) const pg_t &update_pg(p->first); const pg_stat_t &update_stat(p->second); + if (pg_pool_sum_old.count(update_pg.pool()) == 0) + pg_pool_sum_old[update_pg.pool()] = pg_pool_sum[update_pg.pool()]; + hash_map::iterator t = pg_stat.find(update_pg); if (t == pg_stat.end()) { hash_map::value_type v(update_pg, update_stat); @@ -216,7 +220,7 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc) ++p) { int osd = p->first; const osd_stat_t &new_stats(p->second); - + hash_map::iterator t = osd_stat.find(osd); if (t == osd_stat.end()) { hash_map::value_type v(osd, new_stats); @@ -229,7 +233,7 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc) osd_epochs.insert(*(inc.get_osd_epochs().find(osd))); stat_osd_add(new_stats); - + // adjust [near]full status register_nearfull_status(osd, new_stats); } @@ -243,7 +247,7 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc) pg_stat.erase(s); } } - + for (set::iterator p = inc.get_osd_stat_rm().begin(); p != inc.get_osd_stat_rm().end(); ++p) { @@ -270,7 +274,9 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc) stamp_delta -= pg_sum_deltas.front().second; pg_sum_deltas.pop_front(); } - + + update_pool_deltas(cct, inc.stamp, pg_pool_sum_old); + if (inc.osdmap_epoch) last_osdmap_epoch = inc.osdmap_epoch; if (inc.pg_scan) @@ -841,24 +847,111 @@ void PGMap::recovery_rate_summary(Formatter *f, ostream *out) const } } -void PGMap::update_delta(CephContext *cct, utime_t inc_stamp, pool_stat_t& pg_sum_old) + +/** + * update aggregated delta + * + * @param cct ceph context + * @param ts Timestamp for the stats being delta'ed + * @param old_pool_sum Previous stats sum + * @param last_ts Last timestamp for pool + * @param result_pool_sum Resulting stats + * @param result_ts_delta Resulting timestamp delta + * @param delta_avg_list List of last N computed deltas, used to average + */ +void PGMap::update_delta(CephContext *cct, + const utime_t ts, + const 
pool_stat_t& old_pool_sum, + utime_t *last_ts, + const pool_stat_t& current_pool_sum, + pool_stat_t *result_pool_delta, + utime_t *result_ts_delta, + list > *delta_avg_list) { + /* @p ts is the timestamp we want to associate with the data + * in @p old_pool_sum, and on which we will base ourselves to + * calculate the delta, stored in 'delta_t'. + */ utime_t delta_t; - delta_t = inc_stamp; - delta_t -= stamp; - stamp = inc_stamp; + delta_t = ts; // start with the provided timestamp + delta_t -= *last_ts; // take the last timestamp we saw + *last_ts = ts; // @p ts becomes the last timestamp we saw // calculate a delta, and average over the last 2 deltas. - pool_stat_t d = pg_sum; - d.stats.sub(pg_sum_old.stats); - pg_sum_deltas.push_back(make_pair(d, delta_t)); - stamp_delta += delta_t; + /* start by taking a copy of our current @p result_pool_sum, and by + * taking out the stats from @p old_pool_sum. This generates a stats + * delta. Stash this stats delta in @p delta_avg_list, along with the + * timestamp delta for these results. + */ + pool_stat_t d = current_pool_sum; + d.stats.sub(old_pool_sum.stats); + delta_avg_list->push_back(make_pair(d,delta_t)); + *result_ts_delta += delta_t; + + /* Aggregate current delta, and take out the last seen delta (if any) to + * average it out. + */ + result_pool_delta->stats.add(d.stats); + size_t s = MAX(1, cct ? cct->_conf->mon_stat_smooth_intervals : 1); + if (delta_avg_list->size() > s) { + result_pool_delta->stats.sub(delta_avg_list->front().first.stats); + *result_ts_delta -= delta_avg_list->front().second; + delta_avg_list->pop_front(); + } +} - pg_sum_delta.stats.add(d.stats); - if (pg_sum_deltas.size() > (std::list< pair >::size_type)MAX(1, cct ? cct->_conf->mon_stat_smooth_intervals : 1)) { - pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats); - stamp_delta -= pg_sum_deltas.front().second; - pg_sum_deltas.pop_front(); +/** + * update aggregated delta + * + * @param cct ceph context + * @param ts Timestamp + * @param pg_sum_old Old pg_sum + */ +void PGMap::update_global_delta(CephContext *cct, + const utime_t ts, const pool_stat_t& pg_sum_old) +{ + update_delta(cct, ts, pg_sum_old, &stamp, pg_sum, &pg_sum_delta, + &stamp_delta, &pg_sum_deltas); +} + +/** + * Update a given pool's deltas + * + * @param cct Ceph Context + * @param ts Timestamp for the stats being delta'ed + * @param pool Pool's id + * @param old_pool_sum Previous stats sum + */ +void PGMap::update_one_pool_delta(CephContext *cct, + const utime_t ts, + const uint64_t pool, + const pool_stat_t& old_pool_sum) +{ + if (per_pool_sum_deltas.count(pool) == 0) { + assert(per_pool_sum_deltas_stamps.count(pool) == 0); + assert(per_pool_sum_delta.count(pool) == 0); + } + + pair& sum_delta = per_pool_sum_delta[pool]; + + update_delta(cct, ts, old_pool_sum, &sum_delta.second, pg_pool_sum[pool], + &sum_delta.first, &per_pool_sum_deltas_stamps[pool], + &per_pool_sum_deltas[pool]); +} + +/** + * Update pools' deltas + * + * @param cct CephContext + * @param ts Timestamp for the stats being delta'ed + * @param pg_pool_sum_old Map of pool stats for delta calcs. 
+ */ +void PGMap::update_pool_deltas(CephContext *cct, const utime_t ts, + const hash_map& pg_pool_sum_old) +{ + for (hash_map::const_iterator it = pg_pool_sum_old.begin(); + it != pg_pool_sum_old.end(); ++it) { + update_one_pool_delta(cct, ts, it->first, it->second); } } diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h index 7a202fc0006..c8ea6ee9e83 100644 --- a/src/mon/PGMap.h +++ b/src/mon/PGMap.h @@ -109,13 +109,51 @@ public: utime_t stamp; // recent deltas, and summation + /** + * keep track of last deltas for each pool, calculated using + * @p pg_pool_sum as baseline. + */ + hash_map > > per_pool_sum_deltas; + /** + * keep track of per-pool timestamp deltas, according to last update on + * each pool. + */ + hash_map per_pool_sum_deltas_stamps; + /** + * keep track of sum deltas, per-pool, taking into account any previous + * deltas existing in @p per_pool_sum_deltas. The utime_t as second member + * of the pair is the timestamp refering to the last update (i.e., the first + * member of the pair) for a given pool. + */ + hash_map > per_pool_sum_delta; + list< pair > pg_sum_deltas; pool_stat_t pg_sum_delta; utime_t stamp_delta; - void update_delta(CephContext *cct, utime_t inc_stamp, pool_stat_t& pg_sum_old); + void update_global_delta(CephContext *cct, + const utime_t ts, const pool_stat_t& pg_sum_old); + void update_pool_deltas(CephContext *cct, + const utime_t ts, + const hash_map& pg_pool_sum_old); void clear_delta(); + private: + void update_delta(CephContext *cct, + const utime_t ts, + const pool_stat_t& old_pool_sum, + utime_t *last_ts, + const pool_stat_t& current_pool_sum, + pool_stat_t *result_pool_delta, + utime_t *result_ts_delta, + list > *delta_avg_list); + + void update_one_pool_delta(CephContext *cct, + const utime_t ts, + const uint64_t pool, + const pool_stat_t& old_pool_sum); + public: + set creating_pgs; // lru: front = new additions, back = recently pinged map > creating_pgs_by_osd; diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index 0644922ddb4..d9d49e10a08 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -141,6 +141,31 @@ void PGMonitor::tick() } } + /* If we have deltas for pools, run through pgmap's 'per_pool_sum_delta' and + * clear any deltas that are old enough. + * + * Note that 'per_pool_sum_delta' keeps a pool id as key, and a pair containing + * the calc'ed stats delta and an absolute timestamp from when those stats were + * obtained -- the timestamp IS NOT a delta itself. 
+ */ + if (!pg_map.per_pool_sum_deltas.empty()) { + hash_map >::iterator it; + for (it = pg_map.per_pool_sum_delta.begin(); + it != pg_map.per_pool_sum_delta.end(); ) { + utime_t age = ceph_clock_now(g_ceph_context) - it->second.second; + if (age > 2*g_conf->mon_delta_reset_interval) { + dout(10) << " clearing pg_map delta for pool " << it->first + << " (" << age << " > " << g_conf->mon_delta_reset_interval + << " seconds old)" << dendl; + pg_map.per_pool_sum_deltas.erase(it->first); + pg_map.per_pool_sum_deltas_stamps.erase(it->first); + pg_map.per_pool_sum_delta.erase((it++)->first); + } else { + ++it; + } + } + } + dout(10) << pg_map << dendl; } @@ -401,6 +426,7 @@ void PGMonitor::apply_pgmap_delta(bufferlist& bl) } pool_stat_t pg_sum_old = pg_map.pg_sum; + hash_map pg_pool_sum_old; // pgs bufferlist::iterator p = dirty_pgs.begin(); @@ -410,6 +436,10 @@ void PGMonitor::apply_pgmap_delta(bufferlist& bl) dout(20) << " refreshing pg " << pgid << dendl; bufferlist bl; int r = mon->store->get(pgmap_pg_prefix, stringify(pgid), bl); + + if (pg_pool_sum_old.count(pgid.pool()) == 0) + pg_pool_sum_old[pgid.pool()] = pg_map.pg_pool_sum[pgid.pool()]; + if (r >= 0) { pg_map.update_pg(pgid, bl); } else { @@ -432,7 +462,8 @@ void PGMonitor::apply_pgmap_delta(bufferlist& bl) } } - pg_map.update_delta(g_ceph_context, inc_stamp, pg_sum_old); + pg_map.update_global_delta(g_ceph_context, inc_stamp, pg_sum_old); + pg_map.update_pool_deltas(g_ceph_context, inc_stamp, pg_pool_sum_old); // ok, we're now on the new version pg_map.version = v; -- cgit v1.2.1 From e3ba8e82ac8cf10f5e478ce704a4430dc5c2b2ed Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Thu, 3 Oct 2013 01:08:07 +0100 Subject: mon: PGMap: reuse existing summary functions to output pool stats Signed-off-by: Joao Eduardo Luis --- src/mon/PGMap.cc | 81 ++++++++++++++++++++++++++++++++++++++-------------- src/mon/PGMap.h | 13 +++++++-- src/mon/PGMonitor.cc | 2 +- 3 files changed, 71 insertions(+), 25 deletions(-) diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index 666b932bd41..e4a0cde0b20 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -786,54 +786,59 @@ void PGMap::print_osd_perf_stats(std::ostream *ss) const (*ss) << tab; } -void PGMap::recovery_summary(Formatter *f, ostream *out) const +void PGMap::recovery_summary(Formatter *f, ostream *out, + pool_stat_t delta_sum) const { bool first = true; - if (pg_sum.stats.sum.num_objects_degraded) { - double pc = (double)pg_sum.stats.sum.num_objects_degraded / (double)pg_sum.stats.sum.num_object_copies * (double)100.0; + if (delta_sum.stats.sum.num_objects_degraded) { + double pc = (double)delta_sum.stats.sum.num_objects_degraded / + (double)delta_sum.stats.sum.num_object_copies * (double)100.0; char b[20]; snprintf(b, sizeof(b), "%.3lf", pc); if (f) { - f->dump_unsigned("degraded_objects", pg_sum.stats.sum.num_objects_degraded); - f->dump_unsigned("degraded_total", pg_sum.stats.sum.num_object_copies); + f->dump_unsigned("degraded_objects", delta_sum.stats.sum.num_objects_degraded); + f->dump_unsigned("degraded_total", delta_sum.stats.sum.num_object_copies); f->dump_string("degrated_ratio", b); } else { - *out << pg_sum.stats.sum.num_objects_degraded - << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)"; + *out << delta_sum.stats.sum.num_objects_degraded + << "/" << delta_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)"; } first = false; } - if (pg_sum.stats.sum.num_objects_unfound) { - double pc = 
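/* same percentage computation as before; it now reads from the delta_sum
   parameter so the overall and per-pool summaries can share this code */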
(double)pg_sum.stats.sum.num_objects_unfound / (double)pg_sum.stats.sum.num_objects * (double)100.0; + if (delta_sum.stats.sum.num_objects_unfound) { + double pc = (double)delta_sum.stats.sum.num_objects_unfound / + (double)delta_sum.stats.sum.num_objects * (double)100.0; char b[20]; snprintf(b, sizeof(b), "%.3lf", pc); if (f) { - f->dump_unsigned("unfound_objects", pg_sum.stats.sum.num_objects_unfound); - f->dump_unsigned("unfound_total", pg_sum.stats.sum.num_objects); + f->dump_unsigned("unfound_objects", delta_sum.stats.sum.num_objects_unfound); + f->dump_unsigned("unfound_total", delta_sum.stats.sum.num_objects); f->dump_string("unfound_ratio", b); } else { if (!first) *out << "; "; - *out << pg_sum.stats.sum.num_objects_unfound - << "/" << pg_sum.stats.sum.num_objects << " unfound (" << b << "%)"; + *out << delta_sum.stats.sum.num_objects_unfound + << "/" << delta_sum.stats.sum.num_objects << " unfound (" << b << "%)"; } first = false; } } -void PGMap::recovery_rate_summary(Formatter *f, ostream *out) const +void PGMap::recovery_rate_summary(Formatter *f, ostream *out, + pool_stat_t delta_sum, + utime_t delta_stamp) const { // make non-negative; we can get negative values if osds send // uncommitted stats and then "go backward" or if they are just // buggy/wrong. - pool_stat_t pos_delta = pg_sum_delta; + pool_stat_t pos_delta = delta_sum; pos_delta.floor(0); if (pos_delta.stats.sum.num_objects_recovered || pos_delta.stats.sum.num_bytes_recovered || pos_delta.stats.sum.num_keys_recovered) { - int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)stamp_delta; - int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)stamp_delta; - int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)stamp_delta; + int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)delta_stamp; + int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)delta_stamp; + int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)delta_stamp; if (f) { f->dump_int("recovering_objects_per_sec", objps); f->dump_int("recovering_bytes_per_sec", bps); @@ -847,6 +852,38 @@ void PGMap::recovery_rate_summary(Formatter *f, ostream *out) const } } +void PGMap::overall_recovery_rate_summary(Formatter *f, ostream *out) const +{ + recovery_rate_summary(f, out, pg_sum_delta, stamp_delta); +} + +void PGMap::overall_recovery_summary(Formatter *f, ostream *out) const +{ + recovery_summary(f, out, pg_sum); +} + +void PGMap::pool_recovery_rate_summary(Formatter *f, ostream *out, + uint64_t poolid) const +{ + hash_map >::const_iterator p = + per_pool_sum_delta.find(poolid); + if (p == per_pool_sum_delta.end()) + return; + hash_map::const_iterator ts = + per_pool_sum_deltas_stamps.find(p->first); + assert(ts != per_pool_sum_deltas_stamps.end()); + recovery_rate_summary(f, out, p->second.first, ts->second); +} + +void PGMap::pool_recovery_summary(Formatter *f, ostream *out, + uint64_t poolid) const +{ + hash_map >::const_iterator p = + per_pool_sum_delta.find(poolid); + if (p == per_pool_sum_delta.end()) + return; + recovery_summary(f, out, p->second.first); +} /** * update aggregated delta @@ -1004,7 +1041,7 @@ void PGMap::print_summary(Formatter *f, ostream *out) const } std::stringstream ssr; - recovery_summary(f, &ssr); + overall_recovery_summary(f, &ssr); if (!f && ssr.str().length()) *out << " " << ssr.str() << "\n"; ssr.clear(); @@ -1013,7 +1050,7 @@ void PGMap::print_summary(Formatter *f, ostream *out) const if (!f) *out << ss.str(); // pgs by state - recovery_rate_summary(f, 
&ssr); + overall_recovery_rate_summary(f, &ssr); if (!f && ssr.str().length()) *out << "recovery io " << ssr.str() << "\n"; @@ -1095,12 +1132,12 @@ void PGMap::print_oneline_summary(ostream *out) const } std::stringstream ssr; - recovery_summary(NULL, &ssr); + overall_recovery_summary(NULL, &ssr); if (ssr.str().length()) *out << "; " << ssr.str(); ssr.clear(); ssr.str(""); - recovery_rate_summary(NULL, &ssr); + overall_recovery_rate_summary(NULL, &ssr); if (ssr.str().length()) *out << "; " << ssr.str() << " recovering"; } diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h index c8ea6ee9e83..81feed67384 100644 --- a/src/mon/PGMap.h +++ b/src/mon/PGMap.h @@ -243,8 +243,17 @@ public: void dump_osd_perf_stats(Formatter *f) const; void print_osd_perf_stats(std::ostream *ss) const; - void recovery_summary(Formatter *f, ostream *out) const; - void recovery_rate_summary(Formatter *f, ostream *out) const; + void recovery_summary(Formatter *f, ostream *out, + pool_stat_t delta_sum) const; + void overall_recovery_summary(Formatter *f, ostream *out) const; + void pool_recovery_summary(Formatter *f, ostream *out, + uint64_t poolid) const; + void recovery_rate_summary(Formatter *f, ostream *out, + pool_stat_t delta_sum, + utime_t delta_stamp) const; + void overall_recovery_rate_summary(Formatter *f, ostream *out) const; + void pool_recovery_rate_summary(Formatter *f, ostream *out, + uint64_t poolid) const; void print_summary(Formatter *f, ostream *out) const; void print_oneline_summary(ostream *out) const; diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index d9d49e10a08..2881e4985e3 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -1862,7 +1862,7 @@ void PGMonitor::get_health(list >& summary, // recovery stringstream rss; - pg_map.recovery_summary(NULL, &rss); + pg_map.overall_recovery_summary(NULL, &rss); if (!rss.str().empty()) { summary.push_back(make_pair(HEALTH_WARN, "recovery " + rss.str())); if (detail) -- cgit v1.2.1 From 2cd532001949a947bf5565620371eb4b4e930400 Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Thu, 3 Oct 2013 01:09:16 +0100 Subject: mon: PGMap: rework client IO rate calc and output Create a function so we can use the same format when outputting per-pool stats Signed-off-by: Joao Eduardo Luis --- src/mon/PGMap.cc | 91 ++++++++++++++++++++++++++++++++++++-------------------- src/mon/PGMap.h | 20 +++++++++++++ 2 files changed, 78 insertions(+), 33 deletions(-) diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc index e4a0cde0b20..39cb30f97c8 100644 --- a/src/mon/PGMap.cc +++ b/src/mon/PGMap.cc @@ -885,6 +885,57 @@ void PGMap::pool_recovery_summary(Formatter *f, ostream *out, recovery_summary(f, out, p->second.first); } +void PGMap::client_io_rate_summary(Formatter *f, ostream *out, + pool_stat_t delta_sum, + utime_t delta_stamp) const +{ + pool_stat_t pos_delta = delta_sum; + pos_delta.floor(0); + if (pos_delta.stats.sum.num_rd || + pos_delta.stats.sum.num_wr) { + if (pos_delta.stats.sum.num_rd) { + int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)delta_stamp; + if (f) { + f->dump_int("read_bytes_sec", rd); + } else { + *out << pretty_si_t(rd) << "B/s rd, "; + } + } + if (pos_delta.stats.sum.num_wr) { + int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)delta_stamp; + if (f) { + f->dump_int("write_bytes_sec", wr); + } else { + *out << pretty_si_t(wr) << "B/s wr, "; + } + } + int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)delta_stamp; + if (f) { + f->dump_int("op_per_sec", iops); + } else { + *out << 
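/* iops counts reads plus writes over the sampled interval; the byte rates
   above shift the *_kb counters left by 10 to convert to bytes first */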
pretty_si_t(iops) << "op/s"; + } + } +} + +void PGMap::overall_client_io_rate_summary(Formatter *f, ostream *out) const +{ + client_io_rate_summary(f, out, pg_sum_delta, stamp_delta); +} + +void PGMap::pool_client_io_rate_summary(Formatter *f, ostream *out, + uint64_t poolid) const +{ + hash_map >::const_iterator p = + per_pool_sum_delta.find(poolid); + if (p == per_pool_sum_delta.end()) + return; + hash_map::const_iterator ts = + per_pool_sum_deltas_stamps.find(p->first); + assert(ts != per_pool_sum_deltas_stamps.end()); + client_io_rate_summary(f, out, p->second.first, ts->second); +} + /** * update aggregated delta * @@ -1054,39 +1105,13 @@ void PGMap::print_summary(Formatter *f, ostream *out) const if (!f && ssr.str().length()) *out << "recovery io " << ssr.str() << "\n"; - // make non-negative; we can get negative values if osds send - // uncommitted stats and then "go backward" or if they are just - // buggy/wrong. - pool_stat_t pos_delta = pg_sum_delta; - pos_delta.floor(0); - if (pos_delta.stats.sum.num_rd || - pos_delta.stats.sum.num_wr) { - if (!f) - *out << " client io "; - if (pos_delta.stats.sum.num_rd) { - int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta; - if (f) { - f->dump_int("read_bytes_sec", rd); - } else { - *out << pretty_si_t(rd) << "B/s rd, "; - } - } - if (pos_delta.stats.sum.num_wr) { - int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta; - if (f) { - f->dump_int("write_bytes_sec", wr); - } else { - *out << pretty_si_t(wr) << "B/s wr, "; - } - } - int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta; - if (f) { - f->dump_int("op_per_sec", iops); - } else { - *out << pretty_si_t(iops) << "op/s"; - *out << "\n"; - } - } + ssr.clear(); + ssr.str(""); + + overall_client_io_rate_summary(f, &ssr); + if (!f && ssr.str().length()) + *out << " client io " << ssr.str() << "\n"; + } diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h index 81feed67384..c8ce7fd973e 100644 --- a/src/mon/PGMap.h +++ b/src/mon/PGMap.h @@ -254,6 +254,26 @@ public: void overall_recovery_rate_summary(Formatter *f, ostream *out) const; void pool_recovery_rate_summary(Formatter *f, ostream *out, uint64_t poolid) const; + /** + * Obtain a formatted/plain output for client I/O, source from stats for a + * given @p delta_sum pool over a given @p delta_stamp period of time. + */ + void client_io_rate_summary(Formatter *f, ostream *out, + pool_stat_t delta_sum, + utime_t delta_stamp) const; + /** + * Obtain a formatted/plain output for the overall client I/O, which is + * calculated resorting to @p pg_sum_delta and @p stamp_delta. + */ + void overall_client_io_rate_summary(Formatter *f, ostream *out) const; + /** + * Obtain a formatted/plain output for client I/O over a given pool + * with id @p pool_id. We will then obtain pool-specific data + * from @p per_pool_sum_delta. 
+ */ + void pool_client_io_rate_summary(Formatter *f, ostream *out, + uint64_t poolid) const; + void print_summary(Formatter *f, ostream *out) const; void print_oneline_summary(ostream *out) const; -- cgit v1.2.1 From 5abe5c273ae2197ae0539191f1c64d6a9ee41873 Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Wed, 2 Oct 2013 02:06:42 +0100 Subject: mon: OSDMonitor: add 'osd pool stats' command Fixes: #6147 Signed-off-by: Joao Eduardo Luis --- src/mon/MonCommands.h | 4 +++ src/mon/OSDMonitor.cc | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 33e00a98d30..149469c232b 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -518,6 +518,10 @@ COMMAND("osd pool set-quota " \ "name=field,type=CephChoices,strings=max_objects|max_bytes " \ "name=val,type=CephString", "set object or byte limit on pool", "osd", "rw", "cli,rest") +COMMAND("osd pool stats " \ + "name=name,type=CephString,req=false", + "obtain stats from all pools, or from specified pool", + "osd", "r", "cli,rest") COMMAND("osd reweight-by-utilization " \ "name=oload,type=CephInt,range=100,req=false", \ "reweight OSDs by utilization [overload-percentage-for-consideration, default 120]", \ diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 9144736d801..9d36e87788d 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -2296,6 +2296,105 @@ bool OSDMonitor::preprocess_command(MMonCommand *m) } r = 0; + } else if (prefix == "osd pool stats") { + string pool_name; + cmd_getval(g_ceph_context, cmdmap, "name", pool_name); + + PGMap& pg_map = mon->pgmon()->pg_map; + + int64_t poolid = -ENOENT; + bool one_pool = false; + if (!pool_name.empty()) { + poolid = osdmap.lookup_pg_pool_name(pool_name); + if (poolid < 0) { + assert(poolid == -ENOENT); + ss << "unrecognized pool '" << pool_name << "'"; + r = -ENOENT; + goto reply; + } + one_pool = true; + } + + stringstream rs; + + if (f) + f->open_array_section("pool_stats"); + if (osdmap.get_pools().size() == 0) { + if (!f) + ss << "there are no pools!"; + goto stats_out; + } + + for (map::const_iterator it = osdmap.get_pools().begin(); + it != osdmap.get_pools().end(); + ++it) { + + if (!one_pool) + poolid = it->first; + + pool_name = osdmap.get_pool_name(poolid); + + if (f) { + f->open_object_section("pool"); + f->dump_string("pool_name", pool_name.c_str()); + f->dump_int("pool_id", poolid); + f->open_object_section("recovery"); + } + + stringstream rss, tss; + pg_map.pool_recovery_summary(f.get(), &rss, poolid); + if (!f && !rss.str().empty()) + tss << " " << rss.str() << "\n"; + + if (f) { + f->close_section(); + f->open_object_section("recovery_rate"); + } + + rss.clear(); + rss.str(""); + + pg_map.pool_recovery_rate_summary(f.get(), &rss, poolid); + if (!f && !rss.str().empty()) + tss << " recovery io " << rss.str() << "\n"; + + if (f) { + f->close_section(); + f->open_object_section("client_io_rate"); + } + + rss.clear(); + rss.str(""); + + pg_map.pool_client_io_rate_summary(f.get(), &rss, poolid); + if (!f && !rss.str().empty()) + tss << " client io " << rss.str() << "\n"; + + if (f) { + f->close_section(); + f->close_section(); + } else { + rs << "pool " << pool_name << " id " << poolid << "\n"; + if (!tss.str().empty()) + rs << tss.str() << "\n"; + else + rs << " nothing is going on\n\n"; + } + + if (one_pool) + break; + } + +stats_out: + if (f) { + f->close_section(); + f->flush(rdata); + } else { + rdata.append(rs.str()); + } + 
rdata.append("\n"); + r = 0; + } else if (prefix == "osd crush rule list" || prefix == "osd crush rule ls") { string format; -- cgit v1.2.1 From bebbd6cb7b71697b34b8f27652cabdc40c97a33b Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Tue, 15 Oct 2013 10:20:48 -0700 Subject: rgw: fix authenticated users acl group check Fixes: #6553 Backport: bobtail, cuttlefish, dumpling Authenticated users group acl bit was not working correctly. The check to test whether the user is anonymous was wrong. Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_acl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rgw/rgw_acl.cc b/src/rgw/rgw_acl.cc index 3f99d72cd5b..02504524847 100644 --- a/src/rgw/rgw_acl.cc +++ b/src/rgw/rgw_acl.cc @@ -79,7 +79,7 @@ int RGWAccessControlPolicy::get_perm(string& id, int perm_mask) { if ((perm & perm_mask) != perm_mask) { perm |= acl.get_group_perm(ACL_GROUP_ALL_USERS, perm_mask); - if (compare_group_name(id, ACL_GROUP_ALL_USERS) != 0) { + if (!compare_group_name(id, ACL_GROUP_ALL_USERS)) { /* this is not the anonymous user */ perm |= acl.get_group_perm(ACL_GROUP_AUTHENTICATED_USERS, perm_mask); } -- cgit v1.2.1 From 70cc6813262507405f3f726d21cede165fe87660 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 15 Oct 2013 11:22:16 -0700 Subject: mon/PGMonitor: set floor below which we do not warn about objects/pg If a cluster has very few objects, do not generate warnings when the objects/pg for a pool diverges from the cluster average. This avoids spurious errors when you have a relatively empty cluster and a lone pool with a modest number of objects is too far off the (mostly meaningless) cluster-wide average. Also include a per-pool min so we ignore mostly-empty pools. Fixes: #6521 Signed-off-by: Sage Weil --- src/common/config_opts.h | 2 ++ src/mon/PGMonitor.cc | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 2d3f981379b..08c2b0b4cae 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -160,6 +160,8 @@ OPTION(mon_pg_create_interval, OPT_FLOAT, 30.0) // no more than every 30s OPTION(mon_pg_stuck_threshold, OPT_INT, 300) // number of seconds after which pgs can be considered inactive, unclean, or stale (see doc/control.rst under dump_stuck for more info) OPTION(mon_pg_warn_min_per_osd, OPT_INT, 20) // min # pgs per (in) osd before we warn the admin OPTION(mon_pg_warn_max_object_skew, OPT_FLOAT, 10.0) // max skew few average in objects per pg +OPTION(mon_pg_warn_min_objects, OPT_INT, 10000) // do not warn below this object # +OPTION(mon_pg_warn_min_pool_objects, OPT_INT, 1000) // do not warn on pools below this object # OPTION(mon_osd_full_ratio, OPT_FLOAT, .95) // what % full makes an OSD "full" OPTION(mon_osd_nearfull_ratio, OPT_FLOAT, .85) // what % full makes an OSD near full OPTION(mon_globalid_prealloc, OPT_INT, 100) // how many globalids to prealloc diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index 2881e4985e3..c14872d87ef 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -1911,7 +1911,9 @@ void PGMonitor::get_health(list >& summary, detail->push_back(make_pair(HEALTH_WARN, ss.str())); } int average_objects_per_pg = pg_map.pg_sum.stats.sum.num_objects / pg_map.pg_stat.size(); - if (average_objects_per_pg > 0) { + if (average_objects_per_pg > 0 && + pg_map.pg_sum.stats.sum.num_objects >= g_conf->mon_pg_warn_min_objects && + p->second.stats.sum.num_objects >= g_conf->mon_pg_warn_min_pool_objects) { int objects_per_pg =
p->second.stats.sum.num_objects / pi->get_pg_num(); float ratio = (float)objects_per_pg / (float)average_objects_per_pg; if (g_conf->mon_pg_warn_max_object_skew > 0 && -- cgit v1.2.1 From 8d7dbf85472cfca9268d81ecf057ea078cf345b3 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Tue, 15 Oct 2013 10:55:07 -0700 Subject: rgw: change default log level Fixes: #6554 Backport: cuttlefish, dumpling Default log level was just too high, bring it down a bit. Signed-off-by: Yehuda Sadeh Reviewed-by: Sage Weil --- src/rgw/rgw_main.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc index 54db609521c..2e0245587c9 100644 --- a/src/rgw/rgw_main.cc +++ b/src/rgw/rgw_main.cc @@ -461,7 +461,7 @@ int main(int argc, const char **argv) /* alternative default for module */ vector def_args; - def_args.push_back("--debug-rgw=20"); + def_args.push_back("--debug-rgw=1/5"); def_args.push_back("--keyring=$rgw_data/keyring"); def_args.push_back("--log-file=/var/log/radosgw/$cluster-$name"); -- cgit v1.2.1 From 270123124430f01957a5636db89a448c7d8b74d6 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 15 Oct 2013 15:39:04 -0700 Subject: os/LevelDBStore: handle deletion race when checking store size This fixes the fix in 64774e5792f136df2bc78db686440fc2f3a7643f which mixed up the return value and errno. Fixes: #6550 Signed-off-by: Sage Weil Reviewed-by: Joao Eduardo Luis --- src/os/LevelDBStore.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/os/LevelDBStore.h b/src/os/LevelDBStore.h index 89718ce1987..bc5b612a97a 100644 --- a/src/os/LevelDBStore.h +++ b/src/os/LevelDBStore.h @@ -329,13 +329,15 @@ public: string fpath = path + '/' + n; struct stat s; int err = stat(fpath.c_str(), &s); + if (err < 0) + err = -errno; // we may race against leveldb while reading files; this should only // happen when those files are being updated, data is being shuffled // and files get removed, in which case there's not much of a problem // as we'll get to them next time around. if ((err < 0) && (err != -ENOENT)) { lderr(cct) << __func__ << " error obtaining stats for " << fpath - << ": " << cpp_strerror(errno) << dendl; + << ": " << cpp_strerror(err) << dendl; goto err; } -- cgit v1.2.1 From 488678f31abfa08c375ee08f21d1c492d1d6aad0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 15 Oct 2013 15:50:16 -0700 Subject: ceph_test_rados: fix snap remove vs rollback fix In commit 55d279b98553ba4542219b126fc7159b20b18b1f we tried to fix a race between rollback and snap removal, but got the logic wrong: we need to prevent *snap removal* on in-use snaps, not prevent multiple rollbacks on the same snap. Fixes: #6254 (again) Signed-off-by: Sage Weil Reviewed-by: Samuel Just --- src/test/osd/TestRados.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/test/osd/TestRados.cc b/src/test/osd/TestRados.cc index 7158f50a74a..842f9d2bca3 100644 --- a/src/test/osd/TestRados.cc +++ b/src/test/osd/TestRados.cc @@ -111,22 +111,23 @@ private: return new SnapCreateOp(m_op, &context, m_stats); case TEST_OP_SNAP_REMOVE: - if (context.snaps.empty()) { + if (context.snaps.size() <= context.snaps_in_use.size()) { return NULL; - } else { + } + while (true) { int snap = rand_choose(context.snaps)->first; + if (context.snaps_in_use.count(snap)) + continue; // in use; try again! 
cout << "snap_remove snap " << snap << std::endl; return new SnapRemoveOp(m_op, &context, snap, m_stats); } case TEST_OP_ROLLBACK: - if (context.snaps.size() <= context.snaps_in_use.size()) { + if (context.snaps.empty()) { return NULL; } - while (true) { + { int snap = rand_choose(context.snaps)->first; - if (context.snaps_in_use.count(snap)) - continue; // in use; try again! string oid = *(rand_choose(context.oid_not_in_use)); cout << "rollback oid " << oid << " to " << snap << std::endl; return new RollbackOp(m_op, &context, oid, snap); -- cgit v1.2.1 From c7acc2aee2f4aef19386ede46b4562b6552a3955 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Tue, 15 Oct 2013 16:05:14 -0700 Subject: rgw: gracefully handle bad root pool names Fixes: #5716 When invalid root pools specified (don't start with a period) we return with an error instead of asserting. Signed-off-by: Yehuda Sadeh --- src/rgw/rgw_rados.cc | 82 +++++++++++++++++++++++++++++++++++++--------------- src/rgw/rgw_rados.h | 4 +-- 2 files changed, 60 insertions(+), 26 deletions(-) diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 8b4d18f4e68..6d2cc9159a6 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -82,18 +82,26 @@ void RGWDefaultRegionInfo::decode_json(JSONObj *obj) { JSONDecoder::decode_json("default_region", default_region, obj); } -string RGWRegion::get_pool_name(CephContext *cct) +int RGWRegion::get_pool_name(CephContext *cct, string *pool_name) { - string pool_name = cct->_conf->rgw_region_root_pool; - if (pool_name.empty()) { - pool_name = RGW_DEFAULT_REGION_ROOT_POOL; + *pool_name = cct->_conf->rgw_region_root_pool; + if (pool_name->empty()) { + *pool_name = RGW_DEFAULT_REGION_ROOT_POOL; + } else if ((*pool_name)[0] != '.') { + derr << "ERROR: region root pool name must start with a period" << dendl; + return -EINVAL; } - return pool_name; + return 0; } int RGWRegion::read_default(RGWDefaultRegionInfo& default_info) { - string pool_name = get_pool_name(cct); + string pool_name; + + int ret = get_pool_name(cct, &pool_name); + if (ret < 0) { + return ret; + } string oid = cct->_conf->rgw_default_region_info_oid; if (oid.empty()) { @@ -102,7 +110,7 @@ int RGWRegion::read_default(RGWDefaultRegionInfo& default_info) rgw_bucket pool(pool_name.c_str()); bufferlist bl; - int ret = rgw_get_system_obj(store, NULL, pool, oid, bl, NULL, NULL); + ret = rgw_get_system_obj(store, NULL, pool, oid, bl, NULL, NULL); if (ret < 0) return ret; @@ -121,7 +129,10 @@ int RGWRegion::read_default(RGWDefaultRegionInfo& default_info) int RGWRegion::set_as_default() { - string pool_name = get_pool_name(cct); + string pool_name; + int ret = get_pool_name(cct, &pool_name); + if (ret < 0) + return ret; string oid = cct->_conf->rgw_default_region_info_oid; if (oid.empty()) { @@ -136,7 +147,7 @@ int RGWRegion::set_as_default() ::encode(default_info, bl); - int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), false, NULL, 0, NULL); + ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), false, NULL, 0, NULL); if (ret < 0) return ret; @@ -185,7 +196,11 @@ int RGWRegion::init(CephContext *_cct, RGWRados *_store, bool setup_region) int RGWRegion::read_info(const string& region_name) { - string pool_name = get_pool_name(cct); + string pool_name; + int ret = get_pool_name(cct, &pool_name); + if (ret < 0) + return ret; + rgw_bucket pool(pool_name.c_str()); bufferlist bl; @@ -193,7 +208,7 @@ int RGWRegion::read_info(const string& region_name) string oid = region_info_oid_prefix + name; - int ret = 
rgw_get_system_obj(store, NULL, pool, oid, bl, NULL, NULL); + ret = rgw_get_system_obj(store, NULL, pool, oid, bl, NULL, NULL); if (ret < 0) { lderr(cct) << "failed reading region info from " << pool << ":" << oid << ": " << cpp_strerror(-ret) << dendl; return ret; @@ -246,7 +261,10 @@ int RGWRegion::create_default() int RGWRegion::store_info(bool exclusive) { - string pool_name = get_pool_name(cct); + string pool_name; + int ret = get_pool_name(cct, &pool_name); + if (ret < 0) + return ret; rgw_bucket pool(pool_name.c_str()); @@ -254,7 +272,7 @@ int RGWRegion::store_info(bool exclusive) bufferlist bl; ::encode(*this, bl); - int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, 0, NULL); + ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, 0, NULL); return ret; } @@ -293,13 +311,17 @@ void RGWZoneParams::init_default(RGWRados *store) } } -string RGWZoneParams::get_pool_name(CephContext *cct) +int RGWZoneParams::get_pool_name(CephContext *cct, string *pool_name) { - string pool_name = cct->_conf->rgw_zone_root_pool; - if (pool_name.empty()) { - pool_name = RGW_DEFAULT_ZONE_ROOT_POOL; + *pool_name = cct->_conf->rgw_zone_root_pool; + if (pool_name->empty()) { + *pool_name = RGW_DEFAULT_ZONE_ROOT_POOL; + } else if ((*pool_name)[0] != '.') { + derr << "ERROR: zone root pool name must start with a period" << dendl; + return -EINVAL; } - return pool_name; + + return 0; } void RGWZoneParams::init_name(CephContext *cct, RGWRegion& region) @@ -319,13 +341,16 @@ int RGWZoneParams::init(CephContext *cct, RGWRados *store, RGWRegion& region) { init_name(cct, region); - string pool_name = get_pool_name(cct); + string pool_name; + int ret = get_pool_name(cct, &pool_name); + if (ret < 0) + return ret; rgw_bucket pool(pool_name.c_str()); bufferlist bl; string oid = zone_info_oid_prefix + name; - int ret = rgw_get_system_obj(store, NULL, pool, oid, bl, NULL, NULL); + ret = rgw_get_system_obj(store, NULL, pool, oid, bl, NULL, NULL); if (ret < 0) return ret; @@ -344,14 +369,17 @@ int RGWZoneParams::store_info(CephContext *cct, RGWRados *store, RGWRegion& regi { init_name(cct, region); - string pool_name = get_pool_name(cct); + string pool_name; + int ret = get_pool_name(cct, &pool_name); + if (ret < 0) + return ret; rgw_bucket pool(pool_name.c_str()); string oid = zone_info_oid_prefix + name; bufferlist bl; ::encode(*this, bl); - int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), false, NULL, 0, NULL); + ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), false, NULL, 0, NULL); return ret; } @@ -1025,14 +1053,20 @@ int RGWRados::list_raw_prefixed_objs(string pool_name, const string& prefix, lis int RGWRados::list_regions(list& regions) { - string pool_name = RGWRegion::get_pool_name(cct); + string pool_name; + int ret = RGWRegion::get_pool_name(cct, &pool_name); + if (ret < 0) + return ret; return list_raw_prefixed_objs(pool_name, region_info_oid_prefix, regions); } int RGWRados::list_zones(list& zones) { - string pool_name = RGWZoneParams::get_pool_name(cct); + string pool_name; + int ret = RGWZoneParams::get_pool_name(cct, &pool_name); + if (ret < 0) + return ret; return list_raw_prefixed_objs(pool_name, zone_info_oid_prefix, zones); } diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h index 65765c414aa..72f0675e762 100644 --- a/src/rgw/rgw_rados.h +++ b/src/rgw/rgw_rados.h @@ -433,7 +433,7 @@ struct RGWZoneParams { map placement_pools; - static string get_pool_name(CephContext *cct); + static int 
get_pool_name(CephContext *cct, string *pool_name); void init_name(CephContext *cct, RGWRegion& region); int init(CephContext *cct, RGWRados *store, RGWRegion& region); void init_default(RGWRados *store); @@ -622,7 +622,7 @@ struct RGWRegion { int set_as_default(); int equals(const string& other_region); - static string get_pool_name(CephContext *cct); + static int get_pool_name(CephContext *cct, string *pool_name); void dump(Formatter *f) const; void decode_json(JSONObj *obj); -- cgit v1.2.1 From 982511e968a3a026266c29603a0d350aff2b3a47 Mon Sep 17 00:00:00 2001 From: Greg Farnum Date: Tue, 15 Oct 2013 15:45:05 -0700 Subject: MonCommands: note that pg dump options don't work in plaintext Signed-off-by: Greg Farnum Reviewed-by: Sage Weil --- src/mon/MonCommands.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 149469c232b..b7a5f853928 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -112,7 +112,7 @@ COMMAND("pg send_pg_creates", "trigger pg creates to be issued",\ "pg", "rw", "cli,rest") COMMAND("pg dump " \ "name=dumpcontents,type=CephChoices,strings=all|summary|sum|delta|pools|osds|pgs|pgs_brief,n=N,req=false", \ - "show human-readable versions of pg map", "pg", "r", "cli,rest") + "show human-readable versions of pg map (only 'all' valid with plain)", "pg", "r", "cli,rest") COMMAND("pg dump_json " \ "name=dumpcontents,type=CephChoices,strings=all|summary|sum|pools|osds|pgs,n=N,req=false", \ "show human-readable version of pg map in json only",\ -- cgit v1.2.1 From 1d4f501a015727a7ff4b2f9b20dc91f2bbd9707b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 15 Oct 2013 16:00:26 -0700 Subject: test/filestore/run_seed_to.sh: avoid obsolete --filestore-xattr-use-omap This option no longer exists. 
Signed-off-by: Sage Weil --- src/test/filestore/run_seed_to.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/test/filestore/run_seed_to.sh b/src/test/filestore/run_seed_to.sh index fdf56141e12..d5bb671138c 100755 --- a/src/test/filestore/run_seed_to.sh +++ b/src/test/filestore/run_seed_to.sh @@ -246,13 +246,13 @@ do do_rm $tmp_name_a $tmp_name_a.fail $tmp_name_a.recover $v ceph_test_filestore_idempotent_sequence run-sequence-to $to \ $tmp_name_a $tmp_name_a/journal \ - --filestore-xattr-use-omap --test-seed $seed --osd-journal-size 100 \ + --test-seed $seed --osd-journal-size 100 \ --filestore-kill-at $killat $tmp_opts_a \ --log-file $tmp_name_a.fail --debug-filestore 20 || true stop_at=`ceph_test_filestore_idempotent_sequence get-last-op \ $tmp_name_a $tmp_name_a/journal \ - --filestore-xattr-use-omap --log-file $tmp_name_a.recover \ + --log-file $tmp_name_a.recover \ --debug-filestore 20 --debug-journal 20` if [[ "`expr $stop_at - $stop_at 2>/dev/null`" != "0" ]]; then @@ -265,12 +265,11 @@ do do_rm $tmp_name_b $tmp_name_b.clean $v ceph_test_filestore_idempotent_sequence run-sequence-to \ $stop_at $tmp_name_b $tmp_name_b/journal \ - --filestore-xattr-use-omap --test-seed $seed --osd-journal-size 100 \ + --test-seed $seed --osd-journal-size 100 \ --log-file $tmp_name_b.clean --debug-filestore 20 $tmp_opts_b if $v ceph_test_filestore_idempotent_sequence diff \ - $tmp_name_a $tmp_name_a/journal $tmp_name_b $tmp_name_b/journal \ - --filestore-xattr-use-omap; then + $tmp_name_a $tmp_name_a/journal $tmp_name_b $tmp_name_b/journal ; then echo OK else echo "FAIL" -- cgit v1.2.1 From da69fa09c8274585225471c68c1acc788a3881f4 Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Sun, 13 Oct 2013 13:40:57 +0100 Subject: tools: move 'test_store_tool' to 'tools/ceph-kvstore-tool' ceph-kvstore-tool allows for lower-level access to leveldb stores. Signed-off-by: Joao Eduardo Luis --- src/test/Makefile.am | 5 - .../ObjectMap/test_store_tool/test_store_tool.cc | 270 --------------------- src/tools/Makefile.am | 6 + src/tools/ceph-kvstore-tool.cc | 270 +++++++++++++++++++++ 4 files changed, 276 insertions(+), 275 deletions(-) delete mode 100644 src/test/ObjectMap/test_store_tool/test_store_tool.cc create mode 100644 src/tools/ceph-kvstore-tool.cc diff --git a/src/test/Makefile.am b/src/test/Makefile.am index 59b4d89e930..84a228f1d4b 100644 --- a/src/test/Makefile.am +++ b/src/test/Makefile.am @@ -856,11 +856,6 @@ ceph_test_keyvaluedb_iterators_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL) ceph_test_keyvaluedb_iterators_CXXFLAGS = $(UNITTEST_CXXFLAGS) bin_DEBUGPROGRAMS += ceph_test_keyvaluedb_iterators -ceph_test_store_tool_SOURCES = test/ObjectMap/test_store_tool/test_store_tool.cc -ceph_test_store_tool_LDADD = $(LIBOS) $(CEPH_GLOBAL) -ceph_test_store_tool_CXXFLAGS = $(UNITTEST_CXXFLAGS) -bin_DEBUGPROGRAMS += ceph_test_store_tool - ceph_test_cfuse_cache_invalidate_SOURCES = test/test_cfuse_cache_invalidate.cc bin_DEBUGPROGRAMS += ceph_test_cfuse_cache_invalidate diff --git a/src/test/ObjectMap/test_store_tool/test_store_tool.cc b/src/test/ObjectMap/test_store_tool/test_store_tool.cc deleted file mode 100644 index 8fcf3f30e82..00000000000 --- a/src/test/ObjectMap/test_store_tool/test_store_tool.cc +++ /dev/null @@ -1,270 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* -* Ceph - scalable distributed file system -* -* Copyright (C) 2012 Inktank, Inc. 
-* -* This is free software; you can redistribute it and/or -* modify it under the terms of the GNU Lesser General Public -* License version 2.1, as published by the Free Software -* Foundation. See file COPYING. -*/ -#include -#include -#include -#include -#include -#include - -#include "os/LevelDBStore.h" - -#include "common/ceph_argparse.h" -#include "global/global_init.h" -#include "common/errno.h" -#include "common/safe_io.h" -#include "common/config.h" -#include "common/strtol.h" - -using namespace std; - -class StoreTool -{ - boost::scoped_ptr db; - - public: - StoreTool(const string &path) { - LevelDBStore *db_ptr = new LevelDBStore(g_ceph_context, path); - assert(!db_ptr->open(std::cerr)); - db.reset(db_ptr); - } - - void list(const string &prefix, const bool do_crc) { - KeyValueDB::WholeSpaceIterator iter = db->get_iterator(); - - if (prefix.empty()) - iter->seek_to_first(); - else - iter->seek_to_first(prefix); - - while (iter->valid()) { - pair rk = iter->raw_key(); - if (!prefix.empty() && (rk.first != prefix)) - break; - - std::cout << rk.first << ":" << rk.second; - if (do_crc) { - std::cout << " (" << iter->value().crc32c(0) << ")"; - } - std::cout << std::endl; - iter->next(); - } - } - - bool exists(const string &prefix) { - assert(!prefix.empty()); - KeyValueDB::WholeSpaceIterator iter = db->get_iterator(); - iter->seek_to_first(prefix); - return (iter->valid() && (iter->raw_key().first == prefix)); - } - - bool exists(const string &prefix, const string &key) { - assert(!prefix.empty()); - - if (key.empty()) { - return exists(prefix); - } - - bool exists = false; - get(prefix, key, exists); - return exists; - } - - bufferlist get(const string &prefix, const string &key, bool &exists) { - assert(!prefix.empty() && !key.empty()); - - map result; - std::set keys; - keys.insert(key); - db->get(prefix, keys, &result); - - if (result.count(key) > 0) { - exists = true; - return result[key]; - } - exists = false; - return bufferlist(); - } - - uint64_t get_size() { - map extras; - uint64_t s = db->get_estimated_size(extras); - for (map::iterator p = extras.begin(); - p != extras.end(); ++p) { - std::cout << p->first << " - " << p->second << std::endl; - } - std::cout << "total: " << s << std::endl; - return s; - } - - bool set(const string &prefix, const string &key, bufferlist &val) { - assert(!prefix.empty()); - assert(!key.empty()); - assert(val.length() > 0); - - KeyValueDB::Transaction tx = db->get_transaction(); - tx->set(prefix, key, val); - int ret = db->submit_transaction_sync(tx); - - return (ret == 0); - } -}; - -void usage(const char *pname) -{ - std::cerr << "Usage: " << pname << " command [args...]\n" - << "\n" - << "Commands:\n" - << " list [prefix]\n" - << " list-crc [prefix]\n" - << " exists [key]\n" - << " get \n" - << " crc \n" - << " get-size\n" - << " set [ver |in ]\n" - << std::endl; -} - -int main(int argc, const char *argv[]) -{ - vector args; - argv_to_vec(argc, argv, args); - env_to_vec(args); - - global_init( - NULL, args, - CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0); - common_init_finish(g_ceph_context); - - - if (args.size() < 2) { - usage(argv[0]); - return 1; - } - - string path(args[0]); - string cmd(args[1]); - - std::cout << "path: " << path << " cmd " << cmd << std::endl; - - StoreTool st(path); - - if (cmd == "list" || cmd == "list-crc") { - string prefix; - if (argc > 3) - prefix = argv[3]; - - bool do_crc = (cmd == "list-crc"); - - st.list(prefix, do_crc); - - } else if (cmd == "exists") { - string key; - if (argc < 4) { - 
usage(argv[0]); - return 1; - } - string prefix(argv[3]); - if (argc > 4) - key = argv[4]; - - bool ret = st.exists(prefix, key); - std::cout << "(" << prefix << ", " << key << ") " - << (ret ? "exists" : "does not exist") - << std::endl; - return (ret ? 0 : 1); - - } else if (cmd == "get") { - if (argc < 5) { - usage(argv[0]); - return 1; - } - string prefix(argv[3]); - string key(argv[4]); - - bool exists = false; - bufferlist bl = st.get(prefix, key, exists); - std::cout << "(" << prefix << ", " << key << ")"; - if (!exists) { - std::cout << " does not exist" << std::endl; - return 1; - } - std::cout << std::endl; - ostringstream os; - bl.hexdump(os); - std::cout << os.str() << std::endl; - - } else if (cmd == "crc") { - if (argc < 5) { - usage(argv[0]); - return 1; - } - string prefix(argv[3]); - string key(argv[4]); - - bool exists = false; - bufferlist bl = st.get(prefix, key, exists); - std::cout << "(" << prefix << ", " << key << ") "; - if (!exists) { - std::cout << " does not exist" << std::endl; - return 1; - } - std::cout << " crc " << bl.crc32c(0) << std::endl; - - } else if (cmd == "get-size") { - std::cout << "estimated store size: " << st.get_size() << std::endl; - - } else if (cmd == "set") { - if (argc < 7) { - usage(argv[0]); - return 1; - } - string prefix(argv[3]); - string key(argv[4]); - string subcmd(argv[5]); - - bufferlist val; - string errstr; - if (subcmd == "ver") { - version_t v = (version_t) strict_strtoll(argv[6], 10, &errstr); - if (!errstr.empty()) { - std::cerr << "error reading version: " << errstr << std::endl; - return 1; - } - ::encode(v, val); - } else if (subcmd == "in") { - int ret = val.read_file(argv[6], &errstr); - if (ret < 0 || !errstr.empty()) { - std::cerr << "error reading file: " << errstr << std::endl; - return 1; - } - } else { - std::cerr << "unrecognized subcommand '" << subcmd << "'" << std::endl; - usage(argv[0]); - return 1; - } - - bool ret = st.set(prefix, key, val); - if (!ret) { - std::cerr << "error setting (" - << prefix << "," << key << ")" << std::endl; - return 1; - } - - } else { - std::cerr << "Unrecognized command: " << cmd << std::endl; - return 1; - } - - return 0; -} diff --git a/src/tools/Makefile.am b/src/tools/Makefile.am index 4b8da77951a..89417014dd4 100644 --- a/src/tools/Makefile.am +++ b/src/tools/Makefile.am @@ -6,6 +6,12 @@ ceph_monstore_tool_SOURCES = tools/ceph-monstore-tool.cc ceph_monstore_tool_LDADD = $(LIBOS) $(CEPH_GLOBAL) -lboost_program_options bin_DEBUGPROGRAMS += ceph-monstore-tool +ceph_kvstore_tool_SOURCES = tools/ceph-kvstore-tool.cc +ceph_kvstore_tool_LDADD = $(LIBOS) $(CEPH_GLOBAL) +ceph_kvstore_tool_CXXFLAGS = $(UNITTEST_CXXFLAGS) +bin_DEBUGPROGRAMS += ceph-kvstore-tool + + ceph_filestore_dump_SOURCES = tools/ceph-filestore-dump.cc ceph_filestore_dump_LDADD = $(LIBOSD) $(LIBOS) $(CEPH_GLOBAL) -lboost_program_options if LINUX diff --git a/src/tools/ceph-kvstore-tool.cc b/src/tools/ceph-kvstore-tool.cc new file mode 100644 index 00000000000..8fcf3f30e82 --- /dev/null +++ b/src/tools/ceph-kvstore-tool.cc @@ -0,0 +1,270 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* +* Ceph - scalable distributed file system +* +* Copyright (C) 2012 Inktank, Inc. +* +* This is free software; you can redistribute it and/or +* modify it under the terms of the GNU Lesser General Public +* License version 2.1, as published by the Free Software +* Foundation. See file COPYING. 
+*/ +#include +#include +#include +#include +#include +#include + +#include "os/LevelDBStore.h" + +#include "common/ceph_argparse.h" +#include "global/global_init.h" +#include "common/errno.h" +#include "common/safe_io.h" +#include "common/config.h" +#include "common/strtol.h" + +using namespace std; + +class StoreTool +{ + boost::scoped_ptr db; + + public: + StoreTool(const string &path) { + LevelDBStore *db_ptr = new LevelDBStore(g_ceph_context, path); + assert(!db_ptr->open(std::cerr)); + db.reset(db_ptr); + } + + void list(const string &prefix, const bool do_crc) { + KeyValueDB::WholeSpaceIterator iter = db->get_iterator(); + + if (prefix.empty()) + iter->seek_to_first(); + else + iter->seek_to_first(prefix); + + while (iter->valid()) { + pair rk = iter->raw_key(); + if (!prefix.empty() && (rk.first != prefix)) + break; + + std::cout << rk.first << ":" << rk.second; + if (do_crc) { + std::cout << " (" << iter->value().crc32c(0) << ")"; + } + std::cout << std::endl; + iter->next(); + } + } + + bool exists(const string &prefix) { + assert(!prefix.empty()); + KeyValueDB::WholeSpaceIterator iter = db->get_iterator(); + iter->seek_to_first(prefix); + return (iter->valid() && (iter->raw_key().first == prefix)); + } + + bool exists(const string &prefix, const string &key) { + assert(!prefix.empty()); + + if (key.empty()) { + return exists(prefix); + } + + bool exists = false; + get(prefix, key, exists); + return exists; + } + + bufferlist get(const string &prefix, const string &key, bool &exists) { + assert(!prefix.empty() && !key.empty()); + + map result; + std::set keys; + keys.insert(key); + db->get(prefix, keys, &result); + + if (result.count(key) > 0) { + exists = true; + return result[key]; + } + exists = false; + return bufferlist(); + } + + uint64_t get_size() { + map extras; + uint64_t s = db->get_estimated_size(extras); + for (map::iterator p = extras.begin(); + p != extras.end(); ++p) { + std::cout << p->first << " - " << p->second << std::endl; + } + std::cout << "total: " << s << std::endl; + return s; + } + + bool set(const string &prefix, const string &key, bufferlist &val) { + assert(!prefix.empty()); + assert(!key.empty()); + assert(val.length() > 0); + + KeyValueDB::Transaction tx = db->get_transaction(); + tx->set(prefix, key, val); + int ret = db->submit_transaction_sync(tx); + + return (ret == 0); + } +}; + +void usage(const char *pname) +{ + std::cerr << "Usage: " << pname << " command [args...]\n" + << "\n" + << "Commands:\n" + << " list [prefix]\n" + << " list-crc [prefix]\n" + << " exists [key]\n" + << " get \n" + << " crc \n" + << " get-size\n" + << " set [ver |in ]\n" + << std::endl; +} + +int main(int argc, const char *argv[]) +{ + vector args; + argv_to_vec(argc, argv, args); + env_to_vec(args); + + global_init( + NULL, args, + CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0); + common_init_finish(g_ceph_context); + + + if (args.size() < 2) { + usage(argv[0]); + return 1; + } + + string path(args[0]); + string cmd(args[1]); + + std::cout << "path: " << path << " cmd " << cmd << std::endl; + + StoreTool st(path); + + if (cmd == "list" || cmd == "list-crc") { + string prefix; + if (argc > 3) + prefix = argv[3]; + + bool do_crc = (cmd == "list-crc"); + + st.list(prefix, do_crc); + + } else if (cmd == "exists") { + string key; + if (argc < 4) { + usage(argv[0]); + return 1; + } + string prefix(argv[3]); + if (argc > 4) + key = argv[4]; + + bool ret = st.exists(prefix, key); + std::cout << "(" << prefix << ", " << key << ") " + << (ret ? 
"exists" : "does not exist") + << std::endl; + return (ret ? 0 : 1); + + } else if (cmd == "get") { + if (argc < 5) { + usage(argv[0]); + return 1; + } + string prefix(argv[3]); + string key(argv[4]); + + bool exists = false; + bufferlist bl = st.get(prefix, key, exists); + std::cout << "(" << prefix << ", " << key << ")"; + if (!exists) { + std::cout << " does not exist" << std::endl; + return 1; + } + std::cout << std::endl; + ostringstream os; + bl.hexdump(os); + std::cout << os.str() << std::endl; + + } else if (cmd == "crc") { + if (argc < 5) { + usage(argv[0]); + return 1; + } + string prefix(argv[3]); + string key(argv[4]); + + bool exists = false; + bufferlist bl = st.get(prefix, key, exists); + std::cout << "(" << prefix << ", " << key << ") "; + if (!exists) { + std::cout << " does not exist" << std::endl; + return 1; + } + std::cout << " crc " << bl.crc32c(0) << std::endl; + + } else if (cmd == "get-size") { + std::cout << "estimated store size: " << st.get_size() << std::endl; + + } else if (cmd == "set") { + if (argc < 7) { + usage(argv[0]); + return 1; + } + string prefix(argv[3]); + string key(argv[4]); + string subcmd(argv[5]); + + bufferlist val; + string errstr; + if (subcmd == "ver") { + version_t v = (version_t) strict_strtoll(argv[6], 10, &errstr); + if (!errstr.empty()) { + std::cerr << "error reading version: " << errstr << std::endl; + return 1; + } + ::encode(v, val); + } else if (subcmd == "in") { + int ret = val.read_file(argv[6], &errstr); + if (ret < 0 || !errstr.empty()) { + std::cerr << "error reading file: " << errstr << std::endl; + return 1; + } + } else { + std::cerr << "unrecognized subcommand '" << subcmd << "'" << std::endl; + usage(argv[0]); + return 1; + } + + bool ret = st.set(prefix, key, val); + if (!ret) { + std::cerr << "error setting (" + << prefix << "," << key << ")" << std::endl; + return 1; + } + + } else { + std::cerr << "Unrecognized command: " << cmd << std::endl; + return 1; + } + + return 0; +} -- cgit v1.2.1 From 85914b27e67879e5d5b8f05c569919155b690d4f Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Sun, 13 Oct 2013 13:44:29 +0100 Subject: ceph-kvstore-tool: calc store crc Reuse 'list()' function to traverse the store and calc not only version's crcs, but also calc the store's crc. 
Signed-off-by: Joao Eduardo Luis --- src/tools/ceph-kvstore-tool.cc | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/src/tools/ceph-kvstore-tool.cc b/src/tools/ceph-kvstore-tool.cc index 8fcf3f30e82..ffd462bbabd 100644 --- a/src/tools/ceph-kvstore-tool.cc +++ b/src/tools/ceph-kvstore-tool.cc @@ -39,7 +39,9 @@ class StoreTool db.reset(db_ptr); } - void list(const string &prefix, const bool do_crc) { + uint32_t traverse(const string &prefix, + const bool do_crc, + ostream *out) { KeyValueDB::WholeSpaceIterator iter = db->get_iterator(); if (prefix.empty()) @@ -47,18 +49,36 @@ class StoreTool else iter->seek_to_first(prefix); + uint32_t crc = -1; + while (iter->valid()) { pair rk = iter->raw_key(); if (!prefix.empty() && (rk.first != prefix)) - break; + break; - std::cout << rk.first << ":" << rk.second; + if (out) + *out << rk.first << ":" << rk.second; if (do_crc) { - std::cout << " (" << iter->value().crc32c(0) << ")"; + bufferlist bl; + bl.append(rk.first); + bl.append(rk.second); + bl.append(iter->value()); + + crc = bl.crc32c(crc); + if (out) { + *out << " (" << bl.crc32c(0) << ")"; + } } - std::cout << std::endl; + if (out) + *out << std::endl; iter->next(); } + + return crc; + } + + void list(const string &prefix, const bool do_crc) { + traverse(prefix, do_crc, &std::cout); } bool exists(const string &prefix) { @@ -132,6 +152,7 @@ void usage(const char *pname) << " crc \n" << " get-size\n" << " set [ver |in ]\n" + << " store-crc \n" << std::endl; } @@ -260,6 +281,9 @@ int main(int argc, const char *argv[]) << prefix << "," << key << ")" << std::endl; return 1; } + } else if (cmd == "store-crc") { + uint32_t crc = st.traverse(string(), true, NULL); + std::cout << "store at '" << path << "' crc " << crc << std::endl; } else { std::cerr << "Unrecognized command: " << cmd << std::endl; -- cgit v1.2.1 From fd6e2b86188845e6c115d65a5cfaeb60f57c6a3a Mon Sep 17 00:00:00 2001 From: Joao Eduardo Luis Date: Sun, 13 Oct 2013 13:45:43 +0100 Subject: ceph-kvstore-tool: copy one leveldb store to some other place Iterates over the provided source store's keys and copies them to the provided destination store. 
Signed-off-by: Joao Eduardo Luis --- src/tools/ceph-kvstore-tool.cc | 94 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 4 deletions(-) diff --git a/src/tools/ceph-kvstore-tool.cc b/src/tools/ceph-kvstore-tool.cc index ffd462bbabd..e07391d5c51 100644 --- a/src/tools/ceph-kvstore-tool.cc +++ b/src/tools/ceph-kvstore-tool.cc @@ -25,16 +25,18 @@ #include "common/safe_io.h" #include "common/config.h" #include "common/strtol.h" +#include "include/stringify.h" using namespace std; class StoreTool { boost::scoped_ptr db; + string store_path; public: - StoreTool(const string &path) { - LevelDBStore *db_ptr = new LevelDBStore(g_ceph_context, path); + StoreTool(const string &path) : store_path(path) { + LevelDBStore *db_ptr = new LevelDBStore(g_ceph_context, store_path); assert(!db_ptr->open(std::cerr)); db.reset(db_ptr); } @@ -138,6 +140,70 @@ class StoreTool return (ret == 0); } + + int copy_store_to(const string &other_path, const int num_keys_per_tx) { + + if (num_keys_per_tx <= 0) { + std::cerr << "must specify a number of keys/tx > 0" << std::endl; + return -EINVAL; + } + + // open or create a leveldb store at @p other_path + LevelDBStore other(g_ceph_context, other_path); + int err = other.create_and_open(std::cerr); + if (err < 0) + return err; + + KeyValueDB::WholeSpaceIterator it = db->get_iterator(); + it->seek_to_first(); + uint64_t total_keys = 0; + uint64_t total_size = 0; + uint64_t total_txs = 0; + + utime_t started_at = ceph_clock_now(g_ceph_context); + + do { + int num_keys = 0; + + KeyValueDB::Transaction tx = other.get_transaction(); + + + while (it->valid() && num_keys < num_keys_per_tx) { + pair k = it->raw_key(); + bufferlist v = it->value(); + tx->set(k.first, k.second, v); + + num_keys ++; + total_size += v.length(); + + it->next(); + } + + total_txs ++; + total_keys += num_keys; + + if (num_keys > 0) + other.submit_transaction_sync(tx); + + utime_t cur_duration = ceph_clock_now(g_ceph_context) - started_at; + std::cout << "ts = " << cur_duration << "s, copied " << total_keys + << " keys so far (" << stringify(si_t(total_size)) << ")" + << std::endl; + + } while (it->valid()); + + utime_t time_taken = ceph_clock_now(g_ceph_context) - started_at; + + std::cout << "summary:" << std::endl; + std::cout << " copied " << total_keys << " keys" << std::endl; + std::cout << " used " << total_txs << " transactions" << std::endl; + std::cout << " total size " << stringify(si_t(total_size)) << std::endl; + std::cout << " from '" << store_path << "' to '" << other_path << "'" + << std::endl; + std::cout << " duration " << time_taken << " seconds" << std::endl; + + return 0; + } }; void usage(const char *pname) @@ -152,6 +218,7 @@ void usage(const char *pname) << " crc \n" << " get-size\n" << " set [ver |in ]\n" + << " store-copy [num-keys-per-tx]\n" << " store-crc \n" << std::endl; } @@ -176,8 +243,6 @@ int main(int argc, const char *argv[]) string path(args[0]); string cmd(args[1]); - std::cout << "path: " << path << " cmd " << cmd << std::endl; - StoreTool st(path); if (cmd == "list" || cmd == "list-crc") { @@ -281,6 +346,27 @@ int main(int argc, const char *argv[]) << prefix << "," << key << ")" << std::endl; return 1; } + } else if (cmd == "store-copy") { + int num_keys_per_tx = 128; // magic number that just feels right. 
+ if (argc < 4) { + usage(argv[0]); + return 1; + } else if (argc > 4) { + string err; + num_keys_per_tx = strict_strtol(argv[4], 10, &err); + if (!err.empty()) { + std::cerr << "invalid num_keys_per_tx: " << err << std::endl; + return 1; + } + } + + int ret = st.copy_store_to(argv[3], num_keys_per_tx); + if (ret < 0) { + std::cerr << "error copying store to path '" << argv[3] + << "': " << cpp_strerror(ret) << std::endl; + return 1; + } + } else if (cmd == "store-crc") { uint32_t crc = st.traverse(string(), true, NULL); std::cout << "store at '" << path << "' crc " << crc << std::endl; -- cgit v1.2.1 From 14e91bf1246eed944adf8bbad627c030654da142 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 16 Oct 2013 13:59:00 -0700 Subject: debian, specfile: fix ceph-kvstore-tool packaging See da69fa09c8274585225471c68c1acc788a3881f4 Signed-off-by: Sage Weil --- ceph.spec.in | 2 +- debian/ceph-test.install | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ceph.spec.in b/ceph.spec.in index a60d87ad814..3cee74b3d12 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -671,12 +671,12 @@ fi %{_bindir}/ceph_test_rados_watch_notify %{_bindir}/ceph_test_signal_handlers %{_bindir}/ceph_test_snap_mapper -%{_bindir}/ceph_test_store_tool %{_bindir}/ceph_test_timers %{_bindir}/ceph_tpbench %{_bindir}/ceph_xattr_bench %{_bindir}/ceph-monstore-tool %{_bindir}/ceph-osdomap-tool +%{_bindir}/ceph-kvstore-tool %files -n libcephfs_jni1 %defattr(-,root,root,-) diff --git a/debian/ceph-test.install b/debian/ceph-test.install index c5a5e0a9774..237a05850be 100644 --- a/debian/ceph-test.install +++ b/debian/ceph-test.install @@ -67,7 +67,6 @@ usr/bin/ceph_test_rados_watch_notify usr/bin/ceph_test_rewrite_latency usr/bin/ceph_test_signal_handlers usr/bin/ceph_test_snap_mapper -usr/bin/ceph_test_store_tool usr/bin/ceph_test_stress_watch usr/bin/ceph_test_timers usr/bin/ceph_test_trans @@ -75,4 +74,5 @@ usr/bin/ceph_tpbench usr/bin/ceph_xattr_bench usr/bin/ceph-monstore-tool usr/bin/ceph-osdomap-tool +usr/bin/ceph-kvstore-tool usr/share/java/libcephfs-test.jar -- cgit v1.2.1 From 5c280a242b69eb15a916f2a7b5032a2f7081913b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 16 Oct 2013 14:23:13 -0700 Subject: .gitignore: ceph-kvstore-tool Signed-off-by: Sage Weil --- src/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/src/.gitignore b/src/.gitignore index 6efe8dc6bc4..8542ba868f9 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -43,6 +43,7 @@ Makefile /ceph_smalliobenchrbd /ceph-monstore-tool /ceph-osdomap-tool +/ceph-kvstore-tool /ceph_ver.h /dev /init-ceph -- cgit v1.2.1 From e509cb1e69cd39e3702b5351188e60116bafc544 Mon Sep 17 00:00:00 2001 From: Gary Lowell Date: Thu, 17 Oct 2013 09:19:36 +0000 Subject: v0.71 --- configure.ac | 2 +- debian/changelog | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 1eee4609ec1..7fc21c37905 100644 --- a/configure.ac +++ b/configure.ac @@ -8,7 +8,7 @@ AC_PREREQ(2.59) # VERSION define is not used by the code. It gets a version string # from 'git describe'; see src/ceph_ver.[ch] -AC_INIT([ceph], [0.70], [ceph-devel@vger.kernel.org]) +AC_INIT([ceph], [0.71], [ceph-devel@vger.kernel.org]) # Create release string. Used with VERSION for RPMs. 
RPM_RELEASE=0 diff --git a/debian/changelog b/debian/changelog index 4628bb52175..49e08ace0a1 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +ceph (0.71-1) stable; urgency=low + + * New upstream release + + -- Gary Lowell Thu, 17 Oct 2013 09:19:02 +0000 + ceph (0.70-1) stable; urgency=low * New upstream release -- cgit v1.2.1
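
A note on the client IO rate calculation reworked in 2cd5320 above: the figures are plain deltas over the stats sampling window. The kB counters are shifted up to bytes (<< 10) and divided by the window length, and negative deltas are first clamped to zero by pool_stat_t::floor(0) to absorb OSDs whose uncommitted stats "go backward". A worked example, assuming a hypothetical 5-second window:

    num_rd_kb delta = 2048        ->  (2048 << 10) / 5.0  ~=  419430 B/s rd
    num_rd + num_wr delta = 1500  ->   1500 / 5.0          =  300 op/s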
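
A usage sketch for the 'osd pool stats' command added in 5abe5c2 above; the pool names, ids and rates are illustrative, but the plain-text shape follows the rs/tss assembly in preprocess_command():

    $ ceph osd pool stats           # all pools
    $ ceph osd pool stats rbd       # only the pool named 'rbd'
    pool rbd id 2
     client io 1284kB/s rd, 517kB/s wr, 371op/s

    pool quiet-pool id 7
     nothing is going on

With a formatter (e.g. -f json) the same data is emitted as a pool_stats array with recovery, recovery_rate and client_io_rate sections per pool.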
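
The thresholds added in 70cc681 above can be pinned in ceph.conf; a sketch showing the compiled-in defaults from config_opts.h:

    [mon]
        mon pg warn min objects = 10000      # skip the skew check below this many objects cluster-wide
        mon pg warn min pool objects = 1000  # skip pools holding fewer objects than this
        mon pg warn max object skew = 10.0   # pre-existing knob: objects/pg ratio vs. average that warns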
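
On the default log level change in 8d7dbf8 above: '1/5' is the usual ceph debug syntax, writing level 1 to the log while keeping up to level 5 in memory for crash dumps. Verbose logging can still be restored per daemon; the section name below is illustrative:

    [client.radosgw.gateway]
        debug rgw = 20    # the old, chatty default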
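
The LevelDBStore fix in 2701231 above restores the standard POSIX idiom: stat(2) only returns -1 on failure and leaves the detail in errno, so the caller must capture errno immediately and keep all comparisons in -errno space. A minimal standalone sketch of the pattern (not the Ceph code itself):

    #include <cerrno>
    #include <cstdio>
    #include <cstring>
    #include <sys/stat.h>

    // Returns 0 or a -errno code; a vanished file (ENOENT) is treated as
    // benign, mirroring the compaction race described in the commit.
    int stat_size(const char *path, long long *size)
    {
      *size = 0;
      struct stat s;
      int err = ::stat(path, &s);
      if (err < 0)
        err = -errno;   // capture before any other call can clobber errno
      if (err == -ENOENT)
        return 0;       // raced against a deletion: report size 0
      if (err < 0) {
        fprintf(stderr, "stat %s: %s\n", path, strerror(-err));
        return err;
      }
      *size = (long long)s.st_size;
      return 0;
    }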
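
The corrected invariant from 488678f above, condensed (names from TestRados.cc; snaps_in_use is the set of snap ids still referenced by in-flight operations, maintained elsewhere in the harness):

    // removal must skip snaps somebody still references...
    case TEST_OP_SNAP_REMOVE:
      if (context.snaps.size() <= context.snaps_in_use.size())
        return NULL;                        // every snap is busy
      while (true) {
        int snap = rand_choose(context.snaps)->first;
        if (context.snaps_in_use.count(snap))
          continue;                         // busy: pick another
        return new SnapRemoveOp(m_op, &context, snap, m_stats);
      }

    // ...while rollback may target any existing snap
    case TEST_OP_ROLLBACK:
      if (context.snaps.empty())
        return NULL;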
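
With c7acc2a above, a bad root pool name fails fast with EINVAL instead of tripping an assert later on. When set, both knobs must name a pool beginning with a period; the values below are illustrative, and the compiled-in defaults apply when they are left empty:

    rgw region root pool = .rgw.root
    rgw zone root pool = .rgw.root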
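
What the amended help text in 982511e above means in practice, as a hedged sketch (filter names from MonCommands.h; flag spelling may vary by release):

    ceph pg dump                           # plain text: always the full ('all') dump
    ceph pg dump pools osds --format=json  # dumpcontents filters only take effect with a formatter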
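
A usage sketch for the relocated ceph-kvstore-tool (da69fa0 and follow-ups above); the store path is illustrative, and the argument shapes are taken from the tool's main():

    STORE=/var/lib/ceph/mon/ceph-a/store.db
    ceph-kvstore-tool $STORE list                          # dump prefix:key pairs
    ceph-kvstore-tool $STORE store-crc                     # crc32c over the whole store
    ceph-kvstore-tool $STORE store-copy /tmp/store.db 128  # copy, 128 keys per transaction (the default)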