author    Mark Doffman <mark.doffman@gmail.com>  2013-10-19 19:51:30 +0000
committer Mark Doffman <mark.doffman@gmail.com>  2013-10-19 19:51:30 +0000
commit    0bad3151e50f2838193da8b3ed3e74c9758fb334 (patch)
tree      17c38637d0f5e9028a1838c301bcb64eb6cc72d3
parent    73b290bbc6d8965228a63930bf11aa41c88adcc2 (diff)
parent    e509cb1e69cd39e3702b5351188e60116bafc544 (diff)
download  ceph-0bad3151e50f2838193da8b3ed3e74c9758fb334.tar.gz
Merge tag 'v0.71' into baserock/morph
v0.71
-rw-r--r--  COPYING  74
-rw-r--r--  PendingReleaseNotes  4
-rw-r--r--  ceph.spec.in  2
-rw-r--r--  configure.ac  2
-rw-r--r--  debian/ceph-test.install  2
-rw-r--r--  debian/changelog  12
-rw-r--r--  debian/copyright  60
-rw-r--r--  doc/architecture.rst  2
-rw-r--r--  doc/changelog/v0.67.4.txt  550
-rw-r--r--  doc/dev/osd_internals/erasure_coding.rst  26
-rw-r--r--  doc/dev/osd_internals/erasure_coding/PGBackend-h.rst  156
-rw-r--r--  doc/dev/osd_internals/erasure_coding/developer_notes.rst  257
-rw-r--r--  doc/dev/osd_internals/erasure_coding/jerasure.rst  22
-rw-r--r--  doc/dev/osd_internals/erasure_coding/pgbackend.rst  42
-rw-r--r--  doc/dev/osd_internals/erasure_coding/recovery.rst  4
-rw-r--r--  doc/release-notes.rst  34
-rw-r--r--  fusetrace/fusetrace_ll.cc  2
-rwxr-xr-x  qa/workunits/rbd/copy.sh  12
-rwxr-xr-x  qa/workunits/rbd/import_export.sh  8
-rwxr-xr-x  qa/workunits/suites/fsstress.sh  2
-rw-r--r--  src/.gitignore  1
-rwxr-xr-x  src/ceph-disk  28
-rw-r--r--  src/client/Client.cc  109
-rw-r--r--  src/client/Client.h  17
-rw-r--r--  src/client/fuse_ll.cc  20
-rw-r--r--  src/common/Makefile.am  4
-rw-r--r--  src/common/bloom_filter.cc  76
-rw-r--r--  src/common/bloom_filter.hpp  627
-rw-r--r--  src/common/config_opts.h  15
-rw-r--r--  src/common/hobject.h  20
-rw-r--r--  src/common/safe_io.c  80
-rw-r--r--  src/common/safe_io.h  9
-rw-r--r--  src/include/Makefile.am  1
-rw-r--r--  src/include/bloom_filter.hpp  544
-rw-r--r--  src/include/buffer.h  9
-rw-r--r--  src/include/crc32c.h  3
-rw-r--r--  src/include/rados/librados.h  6
-rw-r--r--  src/include/rados/librados.hpp  7
-rw-r--r--  src/librados/PoolAsyncCompletionImpl.h  5
-rw-r--r--  src/mds/CDir.cc  2
-rw-r--r--  src/mds/MDLog.cc  6
-rw-r--r--  src/mon/MonCommands.h  6
-rw-r--r--  src/mon/Monitor.cc  206
-rw-r--r--  src/mon/Monitor.h  2
-rw-r--r--  src/mon/MonmapMonitor.cc  39
-rw-r--r--  src/mon/OSDMonitor.cc  99
-rw-r--r--  src/mon/PGMap.cc  357
-rw-r--r--  src/mon/PGMap.h  110
-rw-r--r--  src/mon/PGMonitor.cc  63
-rw-r--r--  src/os/FileStore.cc  261
-rw-r--r--  src/os/FileStore.h  15
-rw-r--r--  src/os/GenericFileStoreBackend.cc  12
-rw-r--r--  src/os/LevelDBStore.h  4
-rw-r--r--  src/os/ObjectStore.cc  9
-rw-r--r--  src/osd/OSD.cc  100
-rw-r--r--  src/osd/OSD.h  4
-rw-r--r--  src/osd/PG.cc  37
-rw-r--r--  src/osd/PG.h  5
-rw-r--r--  src/osd/PGBackend.h  20
-rw-r--r--  src/osd/PGLog.cc  4
-rw-r--r--  src/osd/ReplicatedBackend.cc  72
-rw-r--r--  src/osd/ReplicatedBackend.h  20
-rw-r--r--  src/osd/ReplicatedPG.cc  362
-rw-r--r--  src/osd/ReplicatedPG.h  113
-rw-r--r--  src/rbd_fuse/rbd-fuse.c  2
-rw-r--r--  src/rgw/rgw_acl.cc  2
-rw-r--r--  src/rgw/rgw_main.cc  2
-rw-r--r--  src/rgw/rgw_op.cc  7
-rw-r--r--  src/rgw/rgw_rados.cc  82
-rw-r--r--  src/rgw/rgw_rados.h  4
-rw-r--r--  src/test/Makefile.am  10
-rw-r--r--  src/test/ObjectMap/test_store_tool/test_store_tool.cc  196
-rw-r--r--  src/test/cli-integration/rbd/formatted-output.t  22
-rw-r--r--  src/test/common/test_bloom_filter.cc  222
-rw-r--r--  src/test/encoding/types.h  3
-rwxr-xr-x  src/test/filestore/run_seed_to.sh  9
-rw-r--r--  src/test/osd/RadosModel.h  5
-rw-r--r--  src/test/osd/TestRados.cc  12
-rw-r--r--  src/tools/Makefile.am  6
-rw-r--r--  src/tools/ceph-kvstore-tool.cc  380
80 files changed, 3891 insertions, 1855 deletions
diff --git a/COPYING b/COPYING
index 920b049b7fa..a0034d58c3b 100644
--- a/COPYING
+++ b/COPYING
@@ -1,3 +1,8 @@
+Format-Specification: http://anonscm.debian.org/viewvc/dep/web/deps/dep5/copyright-format.xml?revision=279&view=markup
+Name: ceph
+Maintainer: Sage Weil <sage@newdream.net>
+Source: http://ceph.com/
+
Files: *
Copyright: (c) 2004-2010 by Sage Weil <sage@newdream.net>
License: LGPL2.1 (see COPYING-LGPL2.1)
@@ -18,6 +23,10 @@ Files: src/include/ceph_hash.cc
Copyright: None
License: Public domain
+Files: src/common/bloom_filter.hpp
+Copyright: Copyright (C) 2000 Arash Partow <arash@partow.net>
+License: Boost Software License, Version 1.0
+
Files: m4/acx_pthread.m4
Copyright: Steven G. Johnson <stevenj@alum.mit.edu>
License: GPLWithACException
@@ -94,33 +103,38 @@ Copyright: Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
License: LGPL2 or later
Files: src/osd/ErasureCodePluginJerasure/*.{c,h}
-Copyright (c) 2011, James S. Plank <plank@cs.utk.edu>
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
- - Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- - Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
-
- - Neither the name of the University of Tennessee nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
-OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
-AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
-WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
+Copyright: Copyright (c) 2011, James S. Plank <plank@cs.utk.edu>
+License:
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ - Neither the name of the University of Tennessee nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+
+Packaging:
+ Copyright (C) 2004-2009 by Sage Weil <sage@newdream.net>
+ Copyright (C) 2010 Canonical, Ltd.
+ Licensed under LGPL-2.1
diff --git a/PendingReleaseNotes b/PendingReleaseNotes
index 9a751ffdb49..779a081480f 100644
--- a/PendingReleaseNotes
+++ b/PendingReleaseNotes
@@ -51,3 +51,7 @@ v0.71
* Any direct users of the 'tmap' portion of the librados API should be
aware that the automatic tmap -> omap conversion functionality has
been removed.
+
+* Most output that used K or KB (e.g., for kilobyte) now uses a
+ lower-case k to match the official SI convention. Any scripts that
+ parse output and check for an upper-case K will need to be modified.
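A tolerant parser that accepts both the old upper-case K and the new
lower-case k keeps such scripts working across versions; a minimal C++
sketch (the exact field format here is an assumption, not taken from
real ceph output):

    #include <regex>
    #include <string>

    // Matches "123 KB" (old output) as well as "123 kB" or "123 k" (new).
    bool parse_kilobytes(const std::string &field, long &kb) {
      static const std::regex re("([0-9]+)\\s*[kK]B?");
      std::smatch m;
      if (!std::regex_search(field, m, re))
        return false;
      kb = std::stol(m[1].str());
      return true;
    }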
diff --git a/ceph.spec.in b/ceph.spec.in
index a60d87ad814..3cee74b3d12 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -671,12 +671,12 @@ fi
%{_bindir}/ceph_test_rados_watch_notify
%{_bindir}/ceph_test_signal_handlers
%{_bindir}/ceph_test_snap_mapper
-%{_bindir}/ceph_test_store_tool
%{_bindir}/ceph_test_timers
%{_bindir}/ceph_tpbench
%{_bindir}/ceph_xattr_bench
%{_bindir}/ceph-monstore-tool
%{_bindir}/ceph-osdomap-tool
+%{_bindir}/ceph-kvstore-tool
%files -n libcephfs_jni1
%defattr(-,root,root,-)
diff --git a/configure.ac b/configure.ac
index eeecdbeffc8..7fc21c37905 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,7 +8,7 @@ AC_PREREQ(2.59)
# VERSION define is not used by the code. It gets a version string
# from 'git describe'; see src/ceph_ver.[ch]
-AC_INIT([ceph], [0.69], [ceph-devel@vger.kernel.org])
+AC_INIT([ceph], [0.71], [ceph-devel@vger.kernel.org])
# Create release string. Used with VERSION for RPMs.
RPM_RELEASE=0
diff --git a/debian/ceph-test.install b/debian/ceph-test.install
index c5a5e0a9774..237a05850be 100644
--- a/debian/ceph-test.install
+++ b/debian/ceph-test.install
@@ -67,7 +67,6 @@ usr/bin/ceph_test_rados_watch_notify
usr/bin/ceph_test_rewrite_latency
usr/bin/ceph_test_signal_handlers
usr/bin/ceph_test_snap_mapper
-usr/bin/ceph_test_store_tool
usr/bin/ceph_test_stress_watch
usr/bin/ceph_test_timers
usr/bin/ceph_test_trans
@@ -75,4 +74,5 @@ usr/bin/ceph_tpbench
usr/bin/ceph_xattr_bench
usr/bin/ceph-monstore-tool
usr/bin/ceph-osdomap-tool
+usr/bin/ceph-kvstore-tool
usr/share/java/libcephfs-test.jar
diff --git a/debian/changelog b/debian/changelog
index ce73472f9eb..49e08ace0a1 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,15 @@
+ceph (0.71-1) stable; urgency=low
+
+ * New upstream release
+
+ -- Gary Lowell <gary.lowell@inktank.com> Thu, 17 Oct 2013 09:19:02 +0000
+
+ceph (0.70-1) stable; urgency=low
+
+ * New upstream release
+
+ -- Gary Lowell <gary.lowell@inktank.com> Fri, 04 Oct 2013 20:11:51 +0000
+
ceph (0.69-1) precise; urgency=low
* New upstream release
diff --git a/debian/copyright b/debian/copyright
index d11a0f7f5da..d3906c44d35 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -1,11 +1,15 @@
-Format-Specification: http://svn.debian.org/wsvn/dep/web/deps/dep5.mdwn?op=file&rev=135
+Format-Specification: http://anonscm.debian.org/viewvc/dep/web/deps/dep5/copyright-format.xml?revision=279&view=markup
Name: ceph
Maintainer: Sage Weil <sage@newdream.net>
Source: http://ceph.com/
Files: *
Copyright: (c) 2004-2010 by Sage Weil <sage@newdream.net>
-License: LGPL2.1 (see /usr/share/common-licenses/LGPL-2.1)
+License: LGPL2.1 (see COPYING-LGPL2.1)
+
+Files: doc/*
+Copyright: (c) 2010-2012 New Dream Network and contributors
+License: Creative Commons Attribution-ShareAlike (CC BY-SA)
Files: src/mount/canonicalize.c
Copyright: Copyright (C) 1993 Rick Sladkey <jrs@world.std.com>
@@ -19,6 +23,10 @@ Files: src/include/ceph_hash.cc
Copyright: None
License: Public domain
+Files: src/common/bloom_filter.hpp
+Copyright: Copyright (C) 2000 Arash Partow
+License: Boost Software License, Version 1.0
+
Files: m4/acx_pthread.m4
Copyright: Steven G. Johnson <stevenj@alum.mit.edu>
License: GPLWithACException
@@ -28,25 +36,25 @@ Copyright:
Copyright 2012-2013 Intel Corporation All Rights Reserved.
License: BSD 3-clause
-Files: src/common/sctp_crc32.c:
+Files: src/common/sctp_crc32.c:
Copyright:
Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
Copyright (c) 2004-2006 Intel Corporation - All Rights Reserved
License:
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
-
+
a) Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
-
+
b) Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the distribution.
-
+
c) Neither the name of Cisco Systems, Inc. nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
-
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@@ -88,6 +96,44 @@ License:
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
+
+
+Files: src/test/common/Throttle.cc src/test/filestore/chain_xattr.cc
+Copyright: Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
+License: LGPL2 or later
+
+Files: src/osd/ErasureCodePluginJerasure/*.{c,h}
+Copyright: Copyright (c) 2011, James S. Plank <plank@cs.utk.edu>
+License:
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ - Neither the name of the University of Tennessee nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+
Packaging:
Copyright (C) 2004-2009 by Sage Weil <sage@newdream.net>
Copyright (C) 2010 Canonical, Ltd.
diff --git a/doc/architecture.rst b/doc/architecture.rst
index 9f57bbbd58a..988475f53b6 100644
--- a/doc/architecture.rst
+++ b/doc/architecture.rst
@@ -387,7 +387,7 @@ steps to compute PG IDs.
#. CRUSH calculates the hash modulo the number of PGs (e.g., ``0x58``) to get
a PG ID.
#. CRUSH gets the pool ID given the pool name (e.g., "liverpool" = ``4``)
-#. CRUSH prepends the pool ID to the pool ID to the PG ID (e.g., ``4.0x58``).
+#. CRUSH prepends the pool ID to the PG ID (e.g., ``4.0x58``).
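The whole computation is two integer operations; a toy walk-through
using the example numbers from the text (the hash value itself is made
up):

::

    #include <cstdio>

    int main() {
      unsigned hash = 0x7a3d0058;      // hypothetical hash of the object name
      unsigned pg_num = 0x100;         // placement groups in the pool
      unsigned pg_id = hash % pg_num;  // -> 0x58
      unsigned pool_id = 4;            // "liverpool" = 4 in the example
      std::printf("%u.0x%x\n", pool_id, pg_id);  // prints "4.0x58"
      return 0;
    }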
Computing object locations is much faster than performing object location query
over a chatty session. The :abbr:`CRUSH (Controlled Replication Under Scalable
diff --git a/doc/changelog/v0.67.4.txt b/doc/changelog/v0.67.4.txt
new file mode 100644
index 00000000000..73b997ea304
--- /dev/null
+++ b/doc/changelog/v0.67.4.txt
@@ -0,0 +1,550 @@
+commit ad85b8bfafea6232d64cb7ba76a8b6e8252fa0c7
+Author: Gary Lowell <gary.lowell@inktank.com>
+Date: Thu Oct 3 22:41:31 2013 +0000
+
+ v0.67.4
+
+commit 5cd66d3b4bca92b402c95ab256fbc3f0329c446f
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Fri Sep 20 14:04:47 2013 -0700
+
+ rgw: fix keystone token expiration test
+
+ Fixes: #6360
+ The test was inverted, need expiration to be greater than
+ current time in order for token to be valid.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+
+commit e0203c61a3f45fdd6d3d3ece26fef6152bdc036d
+Author: David Zafman <david.zafman@inktank.com>
+Date: Wed Sep 11 16:55:06 2013 -0700
+
+ osd/OSD.cc: Use MIN() so that we don't exceed osd_recovery_max_active
+
+ Caused by 944f3b73531af791c90f0f061280160003545c63
+
+ Fixes: #6291
+
+ Backport: dumpling
+
+ Signed-off-by: David Zafman <david.zafman@inktank.com>
+ Reviewed-by: Samuel Just <sam.just@inktank.com>
+ (cherry picked from commit 139a714e13aa3c7f42091270b55dde8a17b3c4b8)
+
+ Conflicts:
+
+ src/osd/OSD.cc
+
+commit c376708358cedb5561fbb43e9b9e622df3ea7a58
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Wed Sep 25 22:08:24 2013 +0100
+
+ mon: OSDMonitor: do not write full_latest during trim
+
+ On commit 81983bab we patched OSDMonitor::update_from_paxos() such that we
+ write the latest full map version to 'full_latest' each time the latest
+ full map was built from the incremental versions.
+
+ This change however clashed with OSDMonitor::encode_trim_extra(), which
+ also wrote to 'full_latest' on each trim, writing instead the version of
+ the *oldest* full map. This duality of behaviors could lead the store
+ to an inconsistent state across the monitors (although there's no sign of
+ it actually imposing any issues besides rebuilding already existing full
+ maps on some monitors).
+
+ We now stop OSDMonitor::encode_trim_extra() from writing to 'full_latest'.
+ This function will still write out the oldest full map it has in the store,
+ but it will no longer write to full_latest, instead leaving it up to
+ OSDMonitor::update_from_paxos() to figure it out -- and it already does.
+
+ Fixes: #6378
+
+ Backport: dumpling
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit bd0f29a2c28cca496ec830eac932477ebf3182ba)
+
+commit de40d0b3e35ab0124cd3c4ebfcaa435ab8abfab9
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Oct 1 15:53:42 2013 -0700
+
+ crush: invalidate rmap on create (and thus decode)
+
+ If we have an existing CrushWrapper object and decode from a bufferlist,
+ reset build_rmaps so that they get rebuilt.
+
+ Remove the build_rmaps() all in decode that was useless on a redecode
+ (because have_rmaps == true in that case and it did nothing).
+
+ Fixes: #6442
+ Backport: dumpling, maybe cuttlefish
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 9b7a2ae329b6a511064dd3d6e549ba61f52cfd21)
+
+commit 32f5233288c47d95b87c0a9cab5f9c2ffcf15417
+Author: Dan Mick <dan.mick@inktank.com>
+Date: Mon Sep 30 14:58:11 2013 -0700
+
+ Invoke python with /usr/bin/env python instead of directly
+
+ Fixes: #6311
+ Signed-off-by: Dan Mick <dan.mick@inktank.com>
+ (cherry picked from commit b9000b314b9166845ff302d4a827a996775d9a14)
+
+commit 66aeca5a9079be398403bbff67bd5bf68c6fb111
+Author: Sage Weil <sage@inktank.com>
+Date: Wed Sep 25 10:10:21 2013 -0700
+
+ qa/workunits/mon/crush_ops.sh: fix test
+
+ Fix root.
+
+ Fixes: #6392
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit c8cae87e9e08468cc86145e0fd60c05d12826239)
+
+commit beb366302a125dd422c4f092b12eb541cb3bc788
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Sep 23 09:04:34 2013 -0700
+
+ Revert "ceph: parse CEPH_ARGS environment variable"
+
+ This reverts commit 67a95b9880c9bc6e858150352318d68d64ed74ad.
+
+ We now put CEPH_ARGS in the actual args we parse in python, which are passed
+ to rados piecemeal later. This lets you put things like --id ... in there
+ that need to be parsed before librados is initialized.
+ (cherry picked from commit 97f462be4829f0167ed3d65e6694dfc16f1f3243)
+
+commit b475ff9576f145d31c053213c699e13df76d2bcb
+Author: Benoît Knecht <benoit.knecht@fsfe.org>
+Date: Mon Sep 23 15:58:42 2013 +0200
+
+ Add CEPH_ARGS at the end of sys.argv
+
+ This allows, for instance, to pass a different client name to ceph by
+ exporting CEPH_ARGS="--id client_id".
+
+ Signed-off-by: Benoît Knecht <benoit.knecht@fsfe.org>
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 30abe3244c86cbbe1f5b005850c29c9c0eafcad4)
+
+commit 94548b4b67cca37366c7d8719209a6d2e7956811
+Author: Sage Weil <sage@inktank.com>
+Date: Tue Sep 24 15:26:03 2013 -0700
+
+ mon/OSDMonitor: fix 'ceph osd crush reweight ...'
+
+ The adjust method returns a count of adjusted items.
+
+ Add a test.
+
+ Fixes: #6382
+ Backport: dumpling
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Dan Mick <dan.mick@inktank.com>
+ (cherry picked from commit 3de32562b55c6ece3a6ed783c36f8b9f21460339)
+
+commit 00ff7f5c20e13869d0694379739ba4e61d44b97c
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Tue Sep 10 00:20:41 2013 +0100
+
+ qa: workunits: mon: crush_ops: test 'ceph osd crush move'
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 3bc618b7b46496c5110edde0da9cae5d3e68e0e1)
+
+commit 0ff5b4a96833681e92cc41f019a569134474f4cf
+Author: Loic Dachary <loic@dachary.org>
+Date: Tue Sep 24 19:04:23 2013 +0200
+
+ osd: change warn_interval_multiplier to uint32_t
+
+ to prevent overflow in OpTracker::check_ops_in_flight when
+ multiplying warn_interval_multiplier *= 2
+
+ Backport: cuttlefish, dumpling
+
+ http://tracker.ceph.com/issues/6370 fixes #6370
+
+ Signed-off-by: Loic Dachary <loic@dachary.org>
+ (cherry picked from commit 1bce1f009bffd3e28025a08775fec189907a81db)
+
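A sketch of the wrap-around this guards against, with illustrative
widths (the field's previous type is not shown in this changelog):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint16_t narrow = 1;                        // stand-in for a too-small field
      for (int i = 0; i < 17; ++i) narrow *= 2;   // 2^16 truncates to 0 and stays there
      uint32_t wide = 1;
      for (int i = 0; i < 17; ++i) wide *= 2;     // exact: 131072
      std::printf("narrow=%u wide=%u\n", (unsigned)narrow, (unsigned)wide);
      return 0;
    }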
+commit fb15040b6cec6221baa550ddfffade823f784c4a
+Author: David Zafman <david.zafman@inktank.com>
+Date: Mon Sep 9 13:01:12 2013 -0700
+
+ crushtool: do not dump core with non-unique bucket IDs
+
+ Return -EEXIST on duplicate ID
+ BUG FIX: crush_add_bucket() mixes error returns and IDs
+ Add optional argument to return generated ID
+
+ Fixes: #6246
+
+ Signed-off-by: David Zafman <david.zafman@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 8c76f3a0f9cf100ea2c941dc2b61c470aa5033d7)
+
+commit 410db3f30c6eb54b807908c1f251ad4026e7d446
+Author: Joao Eduardo Luis <jecluis@gmail.com>
+Date: Fri Sep 20 17:06:30 2013 +0100
+
+ qa: workunits: cephtool: check if 'heap' commands are parseable
+
+ Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
+ (cherry picked from commit b1eeaddd5f214c1b0883b44fc8cae07c649be7c4)
+
+commit 062060a38bb26ff260cc51accc534413d726de49
+Author: Joao Eduardo Luis <jecluis@gmail.com>
+Date: Fri Sep 20 17:50:27 2013 +0100
+
+ osd: OSD: add 'heap' command to known osd commands array
+
+ Must have been forgotten during the cli rework.
+
+ Backport: dumpling
+
+ Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
+ (cherry picked from commit 296f2d0db31e9f5a59a3a62a1e95b6c440430fa3)
+
+commit 3f32f57b98e0224a1d30b2a81d7d260be0f53800
+Author: Joao Eduardo Luis <jecluis@gmail.com>
+Date: Fri Sep 20 16:43:27 2013 +0100
+
+ mds: MDS: pass only heap profiler commands instead of the whole cmd vector
+
+ The heap profiler doesn't care, nor should it, what our command name is.
+ It only cares about the commands it handles.
+
+ Backport: dumpling
+
+ Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
+ (cherry picked from commit 238fe272c6bdb62d4e57fd8555c0136de99c8129)
+
+commit 46dcc46617d8f35ab8433540b22343ddcbcc3716
+Author: Joao Eduardo Luis <jecluis@gmail.com>
+Date: Fri Sep 20 16:41:14 2013 +0100
+
+ perfglue/heap_profiler.cc: expect args as first element on cmd vector
+
+ We used to pass 'heap' as the first element of the cmd vector when
+ handling commands. We haven't been doing so for a while now, so we
+ needed to fix this.
+
+ Not expecting 'heap' also makes sense, considering that what we need to
+ know when we reach this function is what command we should handle, and
+ we should not care what the caller calls us when handling his business.
+
+ Fixes: #6361
+ Backport: dumpling
+
+ Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
+ (cherry picked from commit c98b910d49bd2b46ceafdc430044a31524c29f5b)
+
+commit 9dc5f15fbae22244ad1f62925e17c9d81e856e55
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Mon Sep 16 14:35:25 2013 -0700
+
+ rgw: destroy get_obj handle in copy_obj()
+
+ Fixes: #6176
+ Backport: dumpling
+ We take different code paths in copy_obj, make sure we close the handle
+ when we exit the function. Move the call to finish_get_obj() out of
+ copy_obj_data() as we don't create the handle there, so that should
+ make the code less confusing and less prone to errors.
+ Also, note that RGWRados::get_obj() also calls finish_get_obj(). For
+ everything to work in concert we need to pass a pointer to the handle
+ and not the handle itself. Therefore we needed to also change the call
+ to copy_obj_data().
+
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 9e98620e4325d15c88440a890b267131613e1aa1)
+
+commit 471233e98a9f64ad513a4a196b7661b80534cb00
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Mon Sep 9 23:14:11 2013 +0100
+
+ mon: MonCommands: expect a CephString as 1st arg for 'osd crush move'
+
+ Fixes: #6230
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 7d3799fde19138f957f26ec6be10a8a0000fc1f0)
+
+commit 2908225092bd2aa1b8afcb7848c1cdac5bd9e638
+Author: Sage Weil <sage@inktank.com>
+Date: Mon Sep 23 16:23:33 2013 -0700
+
+ osd: revert 'osd max xattr size' limit
+
+ Set it to 0 (unlimited) for now.
+
+ Backport: dumpling
+
+ Signed-off-by: Sage Weil <sage@inktank.com>
+ Reviewed-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit abb88d70643c3a76435b7a9d5b04ff29f7502361)
+
+commit b3d3b3747c1eef695138dac828e5fcb435309c7b
+Author: Greg Farnum <greg@inktank.com>
+Date: Wed Sep 11 16:24:32 2013 -0700
+
+ mds: be more careful about decoding LogEvents
+
+ We need to wrap the full decode section or we can abort the process
+ if there's an issue (which we may want to just skip by).
+
+ Signed-off-by: Greg Farnum <greg@inktank.com>
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ (cherry picked from commit 73289b34b0be5b6612e38944794d59b5e789f841)
+
+commit 06c58132199ed22413b509dfa751321ccdb24225
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Tue Sep 17 17:58:20 2013 +0100
+
+ mon: OSDMonitor: multiple rebuilt full maps per transaction
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 0d20cae0be701c5b6151a26ee5e4fe24d89aa20a)
+
+commit 65bbcaf4b68790dae4506c1f5db237077e1ff0ae
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Sun Sep 15 21:03:50 2013 +0100
+
+ mon: OSDMonitor: update latest_full while rebuilding full maps
+
+ Not doing so will make the monitor rebuild the osdmap full versions, even
+ though they may have been rebuilt before, every time the monitor starts.
+
+ This mostly happens when the cluster is left in an unhealthy state for
+ a long period of time and incremental versions build up. Even though we
+ build the full maps on update_from_paxos(), not updating 'full_latest'
+ leads to the situation initially described.
+
+ Fixes: #6322
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 81983bab3630520d6c7ee9b7e4a747bc17b8c5c3)
+
+commit 9b9edb04581cca15e67c567332529f5b3f426743
+Author: Joao Eduardo Luis <joao.luis@inktank.com>
+Date: Sun Sep 15 21:00:55 2013 +0100
+
+ mon: OSDMonitor: smaller transactions when rebuilding full versions
+
+ Otherwise, for considerably sized rebuilds, the monitor will not only
+ consume vast amounts of memory, but it will also have troubles committing
+ the transaction. Anyway, it's also a good idea to adjust transactions to
+ the granularity we want, and to be fair we care that each rebuilt full map
+ gets to disk, even if subsequent full maps don't (those can be rebuilt
+ later).
+
+ Fixes: #6323
+
+ Signed-off-by: Joao Eduardo Luis <joao.luis@inktank.com>
+ (cherry picked from commit 4ac1570c5cdcd6556dc291cc6d7878fd92d343ae)
+
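A sketch of the batching idea, flushing work in bounded transactions
instead of a single huge one (all names are hypothetical, not the
monitor's real API):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    struct Txn { std::vector<int> ops; };

    static void commit(Txn &t) {
      std::printf("committing %zu ops\n", t.ops.size());  // write batch to disk
      t.ops.clear();
    }

    int main() {
      const std::size_t kBatch = 1024;             // bound per-transaction work
      Txn txn;
      for (int v = 1; v <= 3000; ++v) {            // e.g. rebuilt full map versions
        txn.ops.push_back(v);
        if (txn.ops.size() >= kBatch) commit(txn);
      }
      if (!txn.ops.empty()) commit(txn);           // flush the tail
      return 0;
    }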
+commit 298811f7a15541b9ec1015c416ad2aa075be5691
+Author: Joao Eduardo Luis <jecluis@gmail.com>
+Date: Wed Aug 28 15:51:01 2013 +0100
+
+ mon: OSDMonitor: check if pool is on unmanaged snaps mode on mk/rmsnap
+
+ Backport: dumpling
+ Fixes: #6047
+
+ Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
+ (cherry picked from commit fab79543c54c2e446d3f76520d7906645c6b0075)
+
+commit a992664435db9dde3745eb7f354cce3fc5400a47
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu Sep 12 14:32:17 2013 -0700
+
+ lru_map: don't use list::size()
+
+ replace list::size() with map::size(), which should have
+ a constant time complexity.
+
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 7c1d2ded8fa8061bf3f14932800998b963745dd1)
+
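For context: in the libstdc++ Ceph built against at the time,
std::list::size() walked the whole list (O(n)) while std::map::size()
is constant time. A sketch of the pattern, assuming a simplified
stand-in for lru_map:

    #include <cstddef>
    #include <list>
    #include <map>
    #include <string>

    // Entries live in a list (LRU order) and a map (lookup). Asking the
    // map for the size avoids the O(n) list walk.
    struct lru_map_sketch {
      std::list<std::string> lru;   // most recently used at the front
      std::map<std::string, std::list<std::string>::iterator> index;

      std::size_t size() const { return index.size(); }  // O(1)
    };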
+commit 788546ea71c994ff35323747294ed9c177fe7020
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu Sep 12 14:30:19 2013 -0700
+
+ common/lru_map: rename tokens to entries
+
+ This code was originally used in a token cache, now
+ as a generic infrastructure rename token fields.
+
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 532e41a9985a16b35a6e49cdcba38af0ad166fa8)
+
+commit babeb00c42af760b3e7575166479e95365cfcc0a
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Sep 18 10:37:21 2013 -0700
+
+ rgw: use bufferlist::append() instead of bufferlist::push_back()
+
+ push_back() expects char *, whereas append can append a single char.
+ Appending a NULL char to push_back is cast as a NULL pointer which is
+ bad.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+ (cherry picked from commit 08fe028bad13096d482454a2f303158727c363ff)
+
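The trap is easy to reproduce: a literal zero passed to a pointer
overload becomes a null pointer, not a NUL byte. A minimal sketch with
a simplified stand-in for bufferlist:

    #include <cstdio>

    struct fake_bufferlist {
      void push_back(const char *p) {            // pointer overload
        std::printf("push_back(ptr=%p)\n", (void *)p);
      }
      void append(char c) {                      // single-character overload
        std::printf("append(char=%d)\n", c);
      }
    };

    int main() {
      fake_bufferlist bl;
      bl.push_back(0);    // 0 is a null pointer constant: passes a NULL char*
      bl.append('\0');    // appends an actual NUL byte, the intended behavior
      return 0;
    }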
+commit daf85c45dd4d158bc7c33a2fb784857bc7db35cd
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Sep 11 13:46:31 2013 -0700
+
+ rgw: NULL terminate buffer before parsing it
+
+ Fixes: #6175
+ Backport: dumpling
+ We get a buffer off the remote gateway which might
+ not be NULL terminated. The JSON parser needs the
+ buffer to be NULL terminated even though we provide
+ a buffer length as it calls strlen().
+
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit e7f7483192cddca1159aba439ce62b1e78669d51)
+
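The fix implies a standard pattern: copy the length-delimited buffer
into storage that guarantees NUL termination before handing it to a
parser that calls strlen(). A sketch, with a plain function standing
in for the rgw code:

    #include <cstddef>
    #include <string>

    // std::string copies exactly len bytes and c_str() is guaranteed
    // NUL-terminated, so strlen() inside the parser cannot run off the end.
    std::string terminate_for_parser(const char *buf, std::size_t len) {
      return std::string(buf, len);
    }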
+commit c73040a5518971813b9ebaae1624c5bacef315d0
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Sep 11 22:30:12 2013 -0700
+
+ rgw: don't call list::size() in ObjectCache
+
+ Fixes: #6286
+ Use an external counter instead of calling list::size()
+
+ Reviewed-by: Sage Weil <sage@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 31e3a51e933429d286104fe077e98ea883437ad6)
+
+commit a855aba9d18936e9a060119e041518790cd4b831
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Tue Sep 10 12:18:55 2013 -0700
+
+ rgw: drain pending requests before completing write
+
+ Fixes: #6268
+ When doing aio write of objects (either regular or multipart parts) we
+ need to drain pending aio requests. Otherwise if gateway goes down then
+ object might end up corrupted.
+
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 626669afaa333d73707553a85f5c874e99e9cbd8)
+
+commit 670db7e80ddc9c26c43a4f66907a5996ce207c4d
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Fri Sep 6 22:33:38 2013 -0700
+
+ rgw: fix get cors, delete cors
+
+ Remove a couple of variables that overrode class member. Not
+ really clear how it was working before, might have been a bad
+ merge / rebase.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+ (cherry picked from commit 13872785aeeddbe1b8dd97e49fd6a2d879514f8d)
+
+commit a304016fa01b02efd500135c00b9bf3407a9999c
+Merge: 408cd61 ac0a30f
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Sep 11 09:47:10 2013 -0700
+
+ Merge branch 'wip-6078-dumpling' into dumpling
+
+ Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
+
+commit ac0a30feb8c64a3b80d9c519a7b561213403afab
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Aug 28 21:25:20 2013 -0700
+
+ rgw: fix certain return status cases in CORS
+
+ Change return values in certain cases, reorder
+ checks, etc.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit 13b28cc3f1eb8ef42875b630c485ee0105cd244a
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Aug 28 21:24:36 2013 -0700
+
+ rgw: add COPY method to be handled by CORS
+
+ Was missing this http method.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit d45c87ea738807487e72c0719b0d3d459cbe19e9
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Tue Aug 27 19:38:45 2013 -0700
+
+ rgw: fix CORS rule check
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit 986fa92a7a1d88111ba28457160adfcfdaabc5d2
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Tue Aug 27 19:38:18 2013 -0700
+
+ rgw: don't handle CORS if rule not found (is NULL)
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit 71873aba6553492d3ad71596cefd7c841030a277
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu Aug 22 13:38:55 2013 -0700
+
+ rgw: tie CORS header response to all relevant operations
+
+ Have the CORS responses on all relevant operations. Also add headers
+ on failure cases.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit 94e7b594d85dbd26e58d823b41f418032e9f163f
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Thu Aug 22 10:00:53 2013 -0700
+
+ rgw: add a generic CORS response handling
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit c3385d8a102faf5379559bb98cf89637ceda1579
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Aug 21 17:22:46 2013 -0700
+
+ rgw: OPTIONS request doesn't need to read object info
+
+ This is a bucket-only operation, so we shouldn't look at the
+ object. Object may not exist and we might respond with Not
+ Exists response which is not what we want.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
+
+commit a5fdd44e5d8ce4b8d82273d83e27aea19e63aa7c
+Author: Yehuda Sadeh <yehuda@inktank.com>
+Date: Wed Aug 21 14:43:28 2013 -0700
+
+ rgw: remove use of s->bucket_cors
+
+ Some old code still tried to use s->bucket_cors, which was
+ abandoned in a cleanup work.
+
+ Signed-off-by: Yehuda Sadeh <yehuda@inktank.com>
diff --git a/doc/dev/osd_internals/erasure_coding.rst b/doc/dev/osd_internals/erasure_coding.rst
index cc1efe4b4bf..0586c46c3bb 100644
--- a/doc/dev/osd_internals/erasure_coding.rst
+++ b/doc/dev/osd_internals/erasure_coding.rst
@@ -3,8 +3,8 @@ Erasure Coded Placement Groups
==============================
The documentation of the erasure coding implementation in Ceph was
-created in July 2013. It is included in Ceph even before erasure
-coding is available because it drives a number of architectural
+created in July 2013. It is included in Ceph even before erasure coded
+pools are available because it drives a number of architectural
changes. It is meant to be updated to reflect the `progress of these
architectural changes <http://tracker.ceph.com/issues/4929>`_, up to
the point where it becomes a reference of the erasure coding
@@ -14,8 +14,14 @@ Glossary
--------
*chunk*
- when the encoding function is called, it returns chunks of the
- same size.
+ when the encoding function is called, it returns chunks of the same
+ size: data chunks, which can be concatenated to reconstruct the original
+ object, and coding chunks, which can be used to rebuild a lost chunk.
+
+*chunk rank*
+ the index of a chunk when returned by the encoding function. The
+ rank of the first chunk is 0, the rank of the second chunk is 1
+ etc.
*stripe*
when an object is too large to be encoded with a single call,
@@ -23,9 +29,13 @@ Glossary
called a stripe.
*shard|strip*
- the file that holds all chunks of a same rank for a given object.
+ an ordered sequence of chunks of the same rank from the same
+ object. For a given placement group, each OSD contains shards of
+ the same rank. When dealing with objects that are encoded with a
+ single operation, *chunk* is sometimes used instead of *shard*
+ because the shard is made of a single chunk.
-Example:
+The definitions are illustrated as follows:
::
OSD 40 OSD 33
@@ -53,6 +63,6 @@ Table of content
.. toctree::
:maxdepth: 1
- High level design document <erasure_coding/pgbackend>
Developer notes <erasure_coding/developer_notes>
- Draft PGBackend.h header <erasure_coding/PGBackend-h>
+ Jerasure plugin <erasure_coding/jerasure>
+ High level design document <erasure_coding/pgbackend>
diff --git a/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst b/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst
deleted file mode 100644
index b39cdb0e88e..00000000000
--- a/doc/dev/osd_internals/erasure_coding/PGBackend-h.rst
+++ /dev/null
@@ -1,156 +0,0 @@
-===========
-PGBackend.h
-===========
-
-Work in progress:
-::
-
- /**
- * PGBackend
- *
- * PGBackend defines an interface for logic handling IO and
- * replication on RADOS objects. The PGBackend implementation
- * is responsible for:
- *
- * 1) Handling client operations
- * 2) Handling object recovery
- * 3) Handling object access
- */
- class PGBackend {
- public:
- /// IO
-
- /// Perform write
- int perform_write(
- const vector<OSDOp> &ops, ///< [in] ops to perform
- Context *onreadable, ///< [in] called when readable on all reaplicas
- Context *onreadable, ///< [in] called when durable on all replicas
- ) = 0; ///< @return 0 or error
-
- /// Attempt to roll back a log entry
- int try_rollback(
- const pg_log_entry_t &entry, ///< [in] entry to roll back
- ObjectStore::Transaction *t ///< [out] transaction
- ) = 0; ///< @return 0 on success, -EINVAL if it can't be rolled back
-
- /// Perform async read, oncomplete is called when ops out_bls are filled in
- int perform_read(
- vector<OSDOp> &ops, ///< [in, out] ops
- Context *oncomplete ///< [out] called with r code
- ) = 0; ///< @return 0 or error
-
- /// Peering
-
- /**
- * have_enough_infos
- *
- * Allows PGBackend implementation to ensure that enough peers have
- * been contacted to satisfy its requirements.
- *
- * TODO: this interface should yield diagnostic info about which infos
- * are required
- */
- bool have_enough_infos(
- const map<epoch_t, pg_interval_t> &past_intervals, ///< [in] intervals
- const map<chunk_id_t, map<int, pg_info_t> > &peer_infos ///< [in] infos
- ) = 0; ///< @return true if we can continue peering
-
- /**
- * choose_acting
- *
- * Allows PGBackend implementation to select the acting set based on the
- * received infos
- *
- * @return False if the current acting set is inadequate, *req_acting will
- * be filled in with the requested new acting set. True if the
- * current acting set is adequate, *auth_log will be filled in
- * with the correct location of the authoritative log.
- */
- bool choose_acting(
- const map<int, pg_info_t> &peer_infos, ///< [in] received infos
- int *auth_log, ///< [out] osd with auth log
- vector<int> *req_acting ///< [out] requested acting set
- ) = 0;
-
- /// Scrub
-
- /// scan
- int scan(
- const hobject_t &start, ///< [in] scan objects >= start
- const hobject_t &up_to, ///< [in] scan objects < up_to
- vector<hobject_t> *out ///< [out] objects returned
- ) = 0; ///< @return 0 or error
-
- /// stat (TODO: ScrubMap::object needs to have PGBackend specific metadata)
- int scrub(
- const hobject_t &to_stat, ///< [in] object to stat
- bool deep, ///< [in] true if deep scrub
- ScrubMap::object *o ///< [out] result
- ) = 0; ///< @return 0 or error
-
- /**
- * compare_scrub_maps
- *
- * @param inconsistent [out] map of inconsistent pgs to pair<correct, incorrect>
- * @param errstr [out] stream of text about inconsistencies for user
- * perusal
- *
- * TODO: this interface doesn't actually make sense...
- */
- void compare_scrub_maps(
- const map<int, ScrubMap> &maps, ///< [in] maps to compare
- bool deep, ///< [in] true if scrub is deep
- map<hobject_t, pair<set<int>, set<int> > > *inconsistent,
- std:ostream *errstr
- ) = 0;
-
- /// Recovery
-
- /**
- * might_have_unrecoverable
- *
- * @param missing [in] missing,info gathered so far (must include acting)
- * @param intervals [in] past intervals
- * @param should_query [out] pair<int, cpg_t> shards to query
- */
- void might_have_unrecoverable(
- const map<chunk_id_t, map<int, pair<pg_info_t, pg_missing_t> > &missing,
- const map<epoch_t, pg_interval_t> &past_intervals,
- set<pair<int, cpg_t> > *should_query
- ) = 0;
-
- /**
- * might_have_unfound
- *
- * @param missing [in] missing,info gathered so far (must include acting)
- */
- bool recoverable(
- const map<chunk_id_t, map<int, pair<pg_info_t, pg_missing_t> > &missing,
- const hobject_t &hoid ///< [in] object to check
- ) = 0; ///< @return true if object can be recovered given missing
-
- /**
- * recover_object
- *
- * Triggers a recovery operation on the specified hobject_t
- * onreadable must be called before onwriteable
- *
- * @param missing [in] set of info, missing pairs for queried nodes
- */
- void recover_object(
- const hobject_t &hoid, ///< [in] object to recover
- const map<chunk_id_t, map<int, pair<pg_info_t, pg_missing_t> > &missing
- Context *onreadable, ///< [in] called when object can be read
- Context *onwriteable ///< [in] called when object can be written
- ) = 0;
-
- /// Backfill
-
- /// choose_backfill
- void choose_backfill(
- const map<chunk_id_t, map<int, pg_info_t> > &peer_infos ///< [in] infos
- const vector<int> &acting, ///< [in] acting set
- const vector<int> &up, ///< [in] up set
- set<int> *to_backfill ///< [out] osds to backfill
- ) = 0;
- };
diff --git a/doc/dev/osd_internals/erasure_coding/developer_notes.rst b/doc/dev/osd_internals/erasure_coding/developer_notes.rst
index 2bc796c67e5..454f087fe53 100644
--- a/doc/dev/osd_internals/erasure_coding/developer_notes.rst
+++ b/doc/dev/osd_internals/erasure_coding/developer_notes.rst
@@ -10,7 +10,7 @@ of the erasure code within Ceph. It is mostly based on examples being
explained to demonstrate how things work. It is written as if the
implementation is complete although it may not be the case. For
instance the plugin system and the jerasure plugin are implemented but
-the erasure code pool is not.
+the erasure coded pool is not.
Reading and writing encoded chunks from and to OSDs
---------------------------------------------------
@@ -18,8 +18,8 @@ Reading and writing encoded chunks from and to OSDs
An erasure coded pool stores each object as K+M chunks. It is divided
into K data chunks and M coding chunks. The pool is configured to have
a size of K+M so that each chunk is stored in an OSD in the acting
-set. The rank of the chunks is stored as `an attribute of the pool
-<http://tracker.ceph.com/issues/5862>`_ containing the object.
+set. The rank of the chunk is stored as `an attribute of the object
+<http://tracker.ceph.com/issues/5862>`_.
For instance an erasure coded pool is created to use five OSDs ( K+M =
5 ) and sustain the loss of two of them ( M = 2 ).
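A minimal sketch of the split step (toy code that ignores padding and
the real encoder API); the vector index plays the role of the chunk
rank stored with each object:

::

    #include <algorithm>
    #include <cstddef>
    #include <string>
    #include <vector>

    std::vector<std::string> split_into_data_chunks(const std::string &obj,
                                                    std::size_t k) {
      std::vector<std::string> chunks;
      std::size_t chunk_size = (obj.size() + k - 1) / k;
      for (std::size_t rank = 0; rank < k; ++rank) {
        std::size_t pos = std::min(rank * chunk_size, obj.size());
        chunks.push_back(obj.substr(pos, chunk_size));
      }
      return chunks;  // split_into_data_chunks("ABCDEFGHI", 3) -> ABC DEF GHI
    }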
@@ -33,9 +33,9 @@ coding chunks : the fourth with *YXY* and the fifth with *GQC*. Each
chunk is stored in an OSD in the acting set. The chunks are stored in
objects that have the same name ( *NYAN* ) but reside on different
OSDs. The order in which the chunks were created must be preserved and
-is stored as an attribute of the pool containing the object. Chunk
-*1* contains *ABC* and is stored on *OSD5* while chunk *4* contains
-*XYY* and is stored on *OSD3*.
+is stored as an attribute of the object ( shard_t ), in addition to its
+name. Chunk *1* contains *ABC* and is stored on *OSD5* while chunk *4*
+contains *XYY* and is stored on *OSD3*.
::
@@ -56,7 +56,7 @@ is stored as an attribute of the pool containing the object. Chunk
+--v---+ +--v---+ +--v---+ +--v---+ +--v---+
name | NYAN | | NYAN | | NYAN | | NYAN | | NYAN |
+------+ +------+ +------+ +------+ +------+
- pool shard | 1 | | 2 | | 3 | | 4 | | 5 |
+ shard | 1 | | 2 | | 3 | | 4 | | 5 |
+------+ +------+ +------+ +------+ +------+
content | ABC | | DEF | | GHI | | YXY | | QGC |
+--+---+ +--+---+ +--+---+ +--+---+ +--+---+
@@ -85,10 +85,12 @@ When the object *NYAN* is read from the erasure coded pool, the
decoding function reads three chunks : chunk *1* containing *ABC*,
chunk *3* containing *GHI* and chunk *4* containing *YXY* and rebuild
the original content of the object *ABCDEFGHI*. The decoding function
-is informed that the chunks *2* and *5* are missing. The chunk *5*
-could not be read because the *OSD4* is *out*. The decoding function
-is called as soon as three chunks are read : *OSD2* was the slowest
-and its chunk was not taken into account.
+is informed that the chunks *2* and *5* are missing ( they are called
+*erasures* ). The chunk *5* could not be read because the *OSD4* is
+*out*. The decoding function can be called as soon as three chunks are
+read : *OSD2* was the slowest and its chunk was not taken into
+account.
+
::
+-------------------+
@@ -110,17 +112,17 @@ and its chunk was not taken into account.
+--+---+ +------+ +--+---+ +--+---+
name | NYAN | | NYAN | | NYAN | | NYAN |
+------+ +------+ +------+ +------+
- pool shard | 1 | | 2 | | 3 | | 4 |
+ shard | 1 | | 2 | | 3 | | 4 |
+------+ +------+ +------+ +------+
content | ABC | | DEF | | GHI | | YXY |
+--+---+ +--+---+ +--+---+ +--+---+
- ^ ^ ^ ^
- | | | |
- | | +--+---+ |
- | | | OSD1 | |
+ ^ . ^ ^
+ | TOO . | |
+ | SLOW . +--+---+ |
+ | ^ | OSD1 | |
| | +------+ |
| | +------+ |
- | SLOW +-------| OSD2 | |
+ | +-------| OSD2 | |
| +------+ |
| +------+ |
| | OSD3 |-----+
@@ -137,8 +139,9 @@ Interrupted full writes
In an erasure coded pool the primary OSD in the up set receives all
write operations. It is responsible for encoding the payload into K+M
-chunks and send them to the OSDs in the up set. It is also responsible
+chunks and sends them to the other OSDs. It is also responsible
for maintaining an authoritative version of the placement group logs.
+
::
primary
@@ -168,8 +171,8 @@ set of the placement group is made of *OSD 1*, *OSD 2* and *OSD 3*. An
object has been encoded and stored in the OSDs : the chunk D1v1
(i.e. Data chunk number 1 version 1) is on *OSD 1*, D2v1 on *OSD 2*
and C1v1 (i.e. Coding chunk number 1 version 1) on *OSD 3*. The
-placement group logs on each OSD are in sync at epoch 1 version 1
-(i.e. 1,1).
+placement group logs on each OSD are identical (i.e. 1,1).
+
::
primary
@@ -196,21 +199,23 @@ placement group logs on each OSD are in sync at epoch 1 version 1
+-----------+
*OSD 1* is the primary and receives a WRITE FULL from a client, which
-means the payload is to replace the object entirely instead of only
-overwriting a portion of it. Version two of the object is created
-to override version one. *OSD 1* encodes the payload into three
-chunks : D1v2 (i.e. Data chunk number 1 version 2) will be on *OSD 1*,
-D2v2 on *OSD 2* and C1v2 (i.e. Coding chunk number 1 version 2) on
-*OSD 3*. Each chunk is sent to the target OSD, including the primary
-OSD which is responsible for storing chunks in addition to handling
-write operations and maintaining an authoritative version of the
-placement group logs. When an OSD receives the message instructing it
-to write the chunk, it also creates a new entry in the placement group
-logs to reflect the change. For instance, as soon as *OSD 3* stores
-*C1v2*, it adds the entry 1,2 ( i.e. epoch 1, version 2 ) to its
-logs. Because the OSDs work asynchronously, some chunks may still be
-in flight ( such as *D2v2* ) while others are acknowledged and on disk
-( such as *C1v1* and *D1v1* ). ::
+means the payload is to replace the object entirely instead of
+overwriting a portion of it. Version two of the object is created to
+override version one. *OSD 1* encodes the payload into three chunks :
+D1v2 (i.e. Data chunk number 1 version 2) will be on *OSD 1*, D2v2 on
+*OSD 2* and C1v2 (i.e. Coding chunk number 1 version 2) on *OSD
+3*. Each chunk is sent to the target OSD, including the primary OSD
+which is responsible for storing chunks in addition to handling write
+operations and maintaining an authoritative version of the placement
+group logs. When an OSD receives the message instructing it to write
+the chunk, it also creates a new entry in the placement group logs to
+reflect the change. For instance, as soon as *OSD 3* stores *C1v2*, it
+adds the entry 1,2 ( i.e. epoch 1, version 2 ) to its logs. Because
+the OSDs work asynchronously, some chunks may still be in flight (
+such as *D2v2* ) while others are acknowledged and on disk ( such as
+*C1v1* and *D1v1* ).
+
+::
primary
+---OSD 1---+
@@ -243,6 +248,7 @@ acting set and the logs' *last_complete* pointer can move from
*1,1* to *1,2* and the files used to store the chunks of the previous
version of the object can be removed : *D1v1* on *OSD 1*, *D2v1* on
*OSD 2* and *C1v1* on *OSD 3*.
+
::
+---OSD 1---+
@@ -271,13 +277,14 @@ version of the object can be removed : *D1v1* on *OSD 1*, *D2v1* on
But accidents happen. If *OSD 1* goes down while *D2v2* is still in
flight, the object's version 2 is partially written : *OSD 3* has
-one chunk but does not have enough to recover. It lost two chunks :
-*D1v2* and *D2v2* but the erasure coding parameters K = 2 + M = 1
-requires that at least two chunks are available to rebuild the
+one chunk but that is not enough to recover. It lost two chunks :
+*D1v2* and *D2v2* and the erasure coding parameters K = 2 + M = 1
+require that at least two chunks are available to rebuild the
third. *OSD 4* becomes the new primary and finds that the
*last_complete* log entry ( i.e. all objects before this entry were
known to be available on all OSDs in the previous acting set ) is
-*1,1* and will be the head of the new authoritative log.
+*1,1* and that will be the head of the new authoritative log.
+
::
+---OSD 2---+
@@ -299,6 +306,7 @@ known to be available on all OSDs in the previous acting set ) is
The log entry *1,2* found on *OSD 3* is divergent from the new
authoritative log provided by *OSD 4* : it is discarded and the file
containing the *C1v2* chunk is removed.
+
::
+---OSD 2---+
@@ -323,14 +331,14 @@ coding library during scrubbing and stored on the new primary *OSD 4*.
Interrupted append
------------------
-An object is coded in stripes, either because they are too big or
-because they are created with multiple operations instead of a single
-full write. A single stripe will exist/exists in the case of a full
-write, assuming the object size is not too large to encode in memory.
-When appending to an existing object, the stripe size is retrieved
-from the attributes of the object. It applies, for instance, when
-*rgw* writes an object with sequence of append instead of a single
-write. ::
+An object is coded in stripes, either because it is too big or because
+it is created with multiple write operations instead of a single full
+write. When appending to an existing object, the stripe size is
+retrieved from the attributes of the object. It applies, for instance,
+when *rgw* writes an object with a sequence of appends instead of a
+single full write.
+
+::
primary
+---OSD 1---+
@@ -354,7 +362,7 @@ write. ::
+-----------+
*OSD 1* is the primary and receives an APPEND from a client, meaning
-the payload is to be appended at the end of the object. *OSD 1*
+the payload is to be appended to the end of the object. *OSD 1*
encodes the payload into three chunks : S2D1 (i.e. Stripe two data
chunk number 1 ) will be in s1 ( shard 1 ) on *OSD 1*, S2D2 in s2 on
*OSD 2* and S2C1 (i.e. Stripe two coding chunk number 1 ) in s3 on
@@ -368,8 +376,8 @@ logs to reflect the change. For instance, as soon as *OSD 3* stores
logs. The log entry also carries the nature of the operation: in this
case 1,2 is an APPEND where 1,1 was a CREATE. Because the OSDs work
asynchronously, some chunks may still be in flight ( such as *S2D2* )
-while others are acknowledged and on disk ( such as *S2D1* and *S2C1*
-).
+while others are acknowledged and on disk (such as *S2D1* and *S2C1*).
+
::
+---OSD 1---+
@@ -396,14 +404,16 @@ while others are acknowledged and on disk ( such as *S2D1* and *S2C1*
+-----------+
If *OSD 1* goes down while *S2D2* is still in flight, the payload is
-partially appended : s3 ( shard 3) in *OSD 3* has one chunk but does
-not have enough to recover because s1 and s2 don't have it. Two chunks
-were lost (*S2D1* and S2D2) but the erasure coding parameters K = 2 +
-M = 1 requires that at least two chunks are available to rebuild the
-third. *OSD 4* becomes the new primary and finds that the
-*last_complete* log entry ( i.e. all objects before this entry were
-known to be available on all OSDs in the previous acting set ) is
-*1,1* and will be the head of the new authoritative log. ::
+partially appended : s3 (shard 3) in *OSD 3* has one chunk but does
+not have enough to recover. Two chunks were lost (*S2D1* and S2D2) but
+the erasure coding parameters K = 2 + M = 1 require that at least two
+chunks are available to rebuild the third. *OSD 4* becomes the new
+primary and finds that the *last_complete* log entry ( i.e. all
+objects before this entry were known to be available on all OSDs in
+the previous acting set ) is *1,1* and will be the head of the new
+authoritative log.
+
+::
+---OSD 2---+
|+-s2-+ log |
@@ -429,8 +439,6 @@ the stripe size.
Erasure code library
--------------------
-See also `the corresponding tracker issue <http://tracker.ceph.com/issues/5877>`_
-
Using `Reed-Solomon <https://en.wikipedia.org/wiki/Reed_Solomon>`_,
with parameters K+M, object O is encoded by dividing it into chunks O1,
O2, ... OM and computing coding chunks P1, P2, ... PK. Any K chunks
@@ -443,8 +451,8 @@ Reading the original content of object O could be a simple
concatenation of O1, O2, ... OM, because the plugins are using
`systematic codes
<http://en.wikipedia.org/wiki/Systematic_code>`_. Otherwise the chunks
-must be given to the erasure code library to retrieve the content of
-the object.
+must be given to the erasure code library *decode* method to retrieve
+the content of the object.
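XOR parity illustrates the "any K of K+M" property in its simplest
form ( M = 1 ); a toy sketch, far simpler than the Reed-Solomon
arithmetic jerasure actually implements:

::

    #include <cassert>
    #include <string>

    int main() {
      std::string o1 = "ABC", o2 = "DEF", o3 = "GHI";  // K = 3 data chunks
      std::string p(3, '\0');                          // M = 1 coding chunk
      for (int i = 0; i < 3; ++i)
        p[i] = o1[i] ^ o2[i] ^ o3[i];

      // Erase o2; the surviving K chunks rebuild it exactly.
      std::string rebuilt(3, '\0');
      for (int i = 0; i < 3; ++i)
        rebuilt[i] = o1[i] ^ o3[i] ^ p[i];
      assert(rebuilt == "DEF");
      return 0;
    }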
Reed-Solomon is significantly more expensive to encode than fountain
codes with the current `jerasure implementation
@@ -462,10 +470,11 @@ functions ( for Cauchy or Liberation for instance ): smaller packets
means more calls and more overhead.
Although Reed-Solomon is provided as a default, Ceph uses it via an
-`abstract API <http://tracker.ceph.com/issues/5878>`_ designed to
+`abstract API <https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/osd/ErasureCodeInterface.h>`_ designed to
allow each pool to choose the plugin that implements it using
`key=value pairs when creating the pool
-<http://tracker.ceph.com/issues/6113>`_.
+<https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/mon/MonCommands.h#L483>`_.
+
::
ceph osd pool create <pool> \
@@ -473,18 +482,21 @@ allow each pool to choose the plugin that implements it using
erasure-code-plugin=<plugin>
The *<plugin>* is dynamically loaded from *<dir>* (defaults to
-*/usr/lib/ceph/erasure-code* ) and expected to implement the
-*int __erasure_code_init(char *plugin_name)* function
-which is responsible for registering an object derived from
-*ErasureCodePlugin* in the registry :
+*/usr/lib/ceph/erasure-code* ) and expected to implement the *int
+__erasure_code_init(char *plugin_name)* function which is responsible
+for registering an object derived from *ErasureCodePlugin* in the
+registry. The `ErasureCodePluginExample <https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/test/osd/ErasureCodePluginExample.cc#L32>`_ plugin reads:
+
::
- ErasureCodePluginRegistry::add(plugin_name,
- new ErasureCodePluginExample());
+ ErasureCodePluginRegistry &instance =
+ ErasureCodePluginRegistry::instance();
+ instance.add(plugin_name, new ErasureCodePluginExample());
The *ErasureCodePlugin* derived object must provide a factory method
from which the concrete implementation of the *ErasureCodeInterface*
-object can be generated:
+object can be generated. The `ErasureCodePluginExample plugin <https://github.com/ceph/ceph/blob/08a97ae45f4df58a6a8ea8a6400934d860cf5eb4/src/test/osd/ErasureCodePluginExample.cc#L22>`_ reads:
+
::
virtual int factory(const map<std::string,std::string> &parameters,
@@ -493,39 +505,23 @@ object can be generated:
return 0;
}
-The *parameters* is the list of *key=value* pairs that were set when the pool
-was created. Each *key* must be prefixed with erasure-code to avoid name collisions
+The *parameters* argument is the list of *key=value* pairs that were
+set when the pool was created. Each *key* must be prefixed with
+*erasure-code* to avoid name collisions:
+
::
- ceph osd pool create <pool> \
+ ceph osd pool create poolname 123 \
erasure-code-directory=<dir> \ # mandatory
erasure-code-plugin=jerasure \ # mandatory
      erasure-code-m=10 \                   # optional and plugin dependent
      erasure-code-k=3 \                    # optional and plugin dependent
      erasure-code-technique=reed_sol_van \ # optional and plugin dependent
-Erasure code jerasure plugin
-----------------------------
-
-The parameters interpreted by the jerasure plugin are:
-::
-
- ceph osd pool create <pool> \
- erasure-code-directory=<dir> \ # plugin directory absolute path
- erasure-code-plugin=jerasure \ # plugin name (only jerasure)
- erasure-code-k=<k> \ # data chunks (default 2)
- erasure-code-m=<m> \ # coding chunks (default 2)
- erasure-code-technique=<technique> \ # coding technique
-
-The coding techniques can be chosen among *reed_sol_van*,
-*reed_sol_r6_op*, *cauchy_orig*, *cauchy_good*, *liberation*,
-*blaum_roth* and *liber8tion*.
-
Scrubbing
---------
See also `Refactor scrub to use PGBackend methods <http://tracker.ceph.com/issues/5861>`_
-
The simplest form of scrubbing is to check with each OSD holding a
chunk whether it exists locally. If more than M chunks are missing the
object is marked as lost. If up to M chunks are missing they are
@@ -547,13 +543,6 @@ built-in on a per block basis.
Notes
-----
-This document is a description of how erasure coding could be
-implemented, it does not reflect the current state of the code
-base. Possible optimizations are mentionned where relevant but the
-first implementation should not include any of them: they are
-presented to show that there is a path toward optimization starting
-from simple minded implementation.
-
If the objects are large, it may be impractical to encode and decode
them in memory. However, when using *RBD* a 1TB device is divided into
many individual 4MB objects and *RGW* does the same.
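For instance ( an arithmetic illustration ): a 1TB RBD device at the
default 4MB object size maps to 1TB / 4MB = 262144 objects, each of
which is encoded independently, so the encoding working set stays at a
few megabytes.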
@@ -561,73 +550,3 @@ many individual 4MB objects and *RGW* does the same.
Encoding and decoding are implemented in the OSD. Although they could
be implemented client side for reads and writes, the OSD must be able
to encode and decode on its own when scrubbing.
-
-If a partial read is required, an optimization could be to only fetch
-the chunk that contains the data instead of always fetching all
-chunks. For instance if *H* is required in the example above, chunk 3
-is read if available. Reading 3 chunks is a fallback in case chunk 3 is
-not available.
-
-Partial reads and writes
-------------------------
-
-If an object is large, reading or writing all of it when changing only
-a few bytes is expensive. It is more efficient to only read or write a
-subset of the object. When a client writes on an existing object, it
-can provide the offset and the length of the write as well as the
-payload with the `CEPH_OSD_OP_WRITE
-<https://github.com/ceph/ceph/blob/962b64a83037ff79855c5261325de0cd1541f582/src/osd/ReplicatedPG.cc#L2542>`_
-operation. It is refered to as *partial write* and is different from
-the `CEPH_OSD_OP_WRITEFULL operation
-<https://github.com/ceph/ceph/blob/962b64a83037ff79855c5261325de0cd1541f582/src/osd/ReplicatedPG.cc#L2552>`_
-which writes the entire object at once.
-
-When using replicas for partial writes or reads, the primary OSD
-translates them into read(2) and write(2) POSIX system calls. When
-writing, it then forwards the CEPH_OSD_OP_WRITE message to the
-replicas and waits for them to acknowledge they are done.
-
-When reading erasure coded objects, at least M chunks must be read and
-decoded to extract the desired bytes. If a `systematic code
-<https://en.wikipedia.org/wiki/Systematic_code>`_ is used ( i.e. the
-data chunks are readable by simple concatenation ) read can be
-optimized to use the chunk containing the desired bytes and rely on
-the erasure decoding function only if a chunk is missing.
-
-When writing an erasure coded object, changing even one byte requires
-that it is encoded again in full.
-
-If Ceph is only used thru the *radosgw* or *librbd*, objects will mostly
-have the same size. The *radosgw* user may upload a 1GB object, which will
-be divided into smaller 4MB objects behind the scene ( or whatever is
-set with *rgw obj stripe size* ). If a KVM is attached a 10GB RBD block
-device, it will also be divided into smaller 4BM objects ( or whatever
-size is given to the --stripe-unit argument when creating the RBD
-block ). In both cases, writing one byte at the beginning will only
-require to encode the first object and not all of them.
-
-Objects can be further divided into stripes to reduce the overhead of
-partial writes. For instance:
-::
-
- +-----------------------+
- |+---------------------+|
- || stripe 0 ||
- || [0,N) ||
- |+---------------------+|
- |+---------------------+|
- || stripe 1 ||
- || [N,N*2) ||
- |+---------------------+|
- |+---------------------+|
- || stripe 3 [N*2,len) ||
- |+---------------------+|
- +-----------------------+
- object of size len
-
-Each stripe is encoded independantly and the same OSDs are used for
-all of them. For instance, if stripe 0 is encoded into 3 chunks on
-OSDs 5, 8 and 9, stripe 1 is also encoded into 3 chunks on the same
-OSDs. The size of a stripe is stored as an attribute of the object.
-When writing one byte at offset N, instead of re-encoding the whole
-object it is enough to re-encode the stripe that contains it.
diff --git a/doc/dev/osd_internals/erasure_coding/jerasure.rst b/doc/dev/osd_internals/erasure_coding/jerasure.rst
new file mode 100644
index 00000000000..312eac52e5d
--- /dev/null
+++ b/doc/dev/osd_internals/erasure_coding/jerasure.rst
@@ -0,0 +1,22 @@
+===============
+jerasure plugin
+===============
+
+Introduction
+------------
+
+The parameters interpreted by the jerasure plugin are:
+
+::
+
+ ceph osd pool create <pool> \
+ erasure-code-directory=<dir> \ # plugin directory absolute path
+ erasure-code-plugin=jerasure \ # plugin name (only jerasure)
+ erasure-code-k=<k> \ # data chunks (default 2)
+ erasure-code-m=<m> \ # coding chunks (default 2)
+ erasure-code-technique=<technique> \ # coding technique
+
+The coding techniques can be chosen among *reed_sol_van*,
+*reed_sol_r6_op*, *cauchy_orig*, *cauchy_good*, *liberation*,
+*blaum_roth* and *liber8tion*.
+
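+For illustration, a pool using six data chunks and two coding chunks
+with the default Vandermonde technique could be created as follows
+( the pool name *ecpool* and the placement group count *12* are
+arbitrary values ):
+
+::
+
+ ceph osd pool create ecpool 12 \
+    erasure-code-directory=/usr/lib/ceph/erasure-code \
+    erasure-code-plugin=jerasure \
+    erasure-code-k=6 \
+    erasure-code-m=2 \
+    erasure-code-technique=reed_sol_van
+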
diff --git a/doc/dev/osd_internals/erasure_coding/pgbackend.rst b/doc/dev/osd_internals/erasure_coding/pgbackend.rst
index c16354f5116..43415ba4f7e 100644
--- a/doc/dev/osd_internals/erasure_coding/pgbackend.rst
+++ b/doc/dev/osd_internals/erasure_coding/pgbackend.rst
@@ -2,14 +2,13 @@
PG Backend Proposal
===================
-See also `PGBackend.h <../PGBackend-h>`_
-
Motivation
----------
-The purpose of the PG Backend interface is to abstract over the
-differences between replication and erasure coding as failure recovery
-mechanisms.
+The purpose of the `PG Backend interface
+<https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h>`_
+is to abstract over the differences between replication and erasure
+coding as failure recovery mechanisms.
Much of the existing PG logic, particularly that for dealing with
peering, will be common to each. With both schemes, a log of recent
@@ -34,12 +33,12 @@ and erasure coding which PGBackend must abstract over:
positions are not interchangeable. In particular, it might make
sense for a single OSD to hold more than 1 PG copy for different
acting set positions.
-5. Selection of a pgtemp for backfill may difer between replicated
+5. Selection of a pgtemp for backfill may differ between replicated
and erasure coded backends.
6. The set of necessary osds from a particular interval required to
- to continue peering may difer between replicated and erasure
+ to continue peering may differ between replicated and erasure
coded backends.
-7. The selection of the authoritative log may difer between replicated
+7. The selection of the authoritative log may differ between replicated
and erasure coded backends.
Client Writes
@@ -78,8 +77,9 @@ Core Changes:
- Current code should be adapted to use and rollback as appropriate
APPEND, DELETE, (SET|RM)ATTR log entries.
- The filestore needs to be able to deal with multiply versioned
- hobjects. This probably means adapting the filestore internally to
- use a ghobject which is basically a tuple<hobject_t, gen_t,
+ hobjects. This means adapting the filestore internally to
+ use a `ghobject <https://github.com/ceph/ceph/blob/aba6efda13eb6ab4b96930e9cc2dbddebbe03f26/src/common/hobject.h#L193>`_
+ which is basically a tuple<hobject_t, gen_t,
shard_t>. The gen_t + shard_t need to be included in the on-disk
filename. gen_t is a unique object identifier to make sure there
are no name collisions when object N is created +
@@ -114,7 +114,7 @@ divergent objects. Thus, we must choose the *oldest* last_update from
the last interval which went active in order to minimize the number of
divergent objects.
-The dificulty is that the current code assumes that as long as it has
+The difficulty is that the current code assumes that as long as it has
an info from at least 1 osd from the prior interval, it can complete
peering. In order to ensure that we do not end up with an
unrecoverably divergent object, a K+M erasure coded PG must hear from at
@@ -161,7 +161,7 @@ Client Reads
------------
Reads with the replicated strategy can always be satisfied
-syncronously out of the primary osd. With an erasure coded strategy,
+synchronously out of the primary osd. With an erasure coded strategy,
the primary will need to request data from some number of replicas in
order to satisfy a read. The perform_read() interface for PGBackend
therefore will be async.
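A hedged sketch ( names and types are hypothetical, not the actual
PGBackend API ) of why the interface must be asynchronous: a read
completes only once K of the K+M chunk replies have arrived.

::

  // Standalone illustration: gather chunk replies and fire a callback
  // once k of them are available; decoding would happen there.
  #include <cstddef>
  #include <functional>
  #include <iostream>
  #include <utility>
  #include <vector>

  struct ChunkReply { int osd; std::vector<unsigned char> data; };

  class ReadGatherer {
    std::size_t k;
    std::vector<ChunkReply> replies;
    std::function<void(const std::vector<ChunkReply>&)> on_complete;
  public:
    ReadGatherer(std::size_t k_,
                 std::function<void(const std::vector<ChunkReply>&)> cb)
      : k(k_), on_complete(std::move(cb)) {}
    void deliver(ChunkReply r) {
      replies.push_back(std::move(r));
      if (replies.size() == k)
        on_complete(replies);
    }
  };

  int main() {
    ReadGatherer gather(2, [](const std::vector<ChunkReply>& c) {
      std::cout << "read satisfied with " << c.size() << " chunks\n";
    });
    gather.deliver(ChunkReply{5, {1, 2}});
    gather.deliver(ChunkReply{8, {3, 4}}); // second reply completes it
  }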
@@ -192,7 +192,7 @@ include the chunk id in the object key.
Core changes:
- The filestore `ghobject_t needs to also include a chunk id
- <http://tracker.ceph.com/issues/5862>`_ making it more like
+ <https://github.com/ceph/ceph/blob/aba6efda13eb6ab4b96930e9cc2dbddebbe03f26/src/common/hobject.h#L193>`_ making it more like
tuple<hobject_t, gen_t, shard_t>.
- coll_t needs to include a shard_t.
- The `OSD pg_map and similar pg mappings need to work in terms of a
@@ -260,7 +260,7 @@ Core changes:
Recovery
--------
-See `Issue #5857`_. The logic for recovering an object depends on the backend. With
+The logic for recovering an object depends on the backend. With
the current replicated strategy, we first pull the object replica
to the primary and then concurrently push it out to the replicas.
With the erasure coded strategy, we probably want to read the
@@ -270,7 +270,7 @@ and push out the replacement chunks concurrently.
Another difference is that objects in erasure coded pg may be
unrecoverable without being unfound. The "unfound" concept
should probably then be renamed to unrecoverable. Also, the
-PGBackend impementation will have to be able to direct the search
+PGBackend implementation will have to be able to direct the search
for pg replicas with unrecoverable object chunks and to be able
to determine whether a particular object is recoverable.
@@ -281,9 +281,11 @@ Core changes:
PGBackend interfaces:
-- might_have_unrecoverable()
-- recoverable()
-- recover_object()
+- `on_local_recover_start <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L46>`_
+- `on_local_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L52>`_
+- `on_global_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L64>`_
+- `on_peer_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L69>`_
+- `begin_peer_recover <https://github.com/ceph/ceph/blob/a287167cf8625165249b7636540591aefc0a693d/src/osd/PGBackend.h#L76>`_
Backfill
--------
@@ -316,6 +318,4 @@ PGBackend interfaces:
- choose_backfill(): allows the implementation to determine which osds
should be backfilled in a particular interval.
-
-.. _Issue #5857: http://tracker.ceph.com/issues/5857
-.. _Issue #5856: http://tracker.ceph.com/issues/5856 \ No newline at end of file
+.. _Issue #5856: http://tracker.ceph.com/issues/5856
diff --git a/doc/dev/osd_internals/erasure_coding/recovery.rst b/doc/dev/osd_internals/erasure_coding/recovery.rst
deleted file mode 100644
index 793a5b003dc..00000000000
--- a/doc/dev/osd_internals/erasure_coding/recovery.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-===================
-PGBackend Recovery
-===================
-
diff --git a/doc/release-notes.rst b/doc/release-notes.rst
index 604b4fa296b..bb1dfe4bfec 100644
--- a/doc/release-notes.rst
+++ b/doc/release-notes.rst
@@ -120,6 +120,40 @@ Notable Changes
* sysvinit: add condrestart command (Dan van der Ster)
+
+v0.67.4 "Dumpling"
+------------------
+
+This point release fixes an important performance issue with radosgw,
+keystone authentication token caching, and CORS. All users
+(especially those of rgw) are encouraged to upgrade.
+
+Notable changes
+~~~~~~~~~~~~~~~
+
+* crush: fix invalidation of cached names
+* crushtool: do not crash on non-unique bucket ids
+* mds: be more careful when decoding LogEvents
+* mds: fix heap check debugging commands
+* mon: avoid rebuilding old full osdmaps
+* mon: fix 'ceph crush move ...'
+* mon: fix 'ceph osd crush reweight ...'
+* mon: fix writeout of full osdmaps during trim
+* mon: limit size of transactions
+* mon: prevent both unmanaged and pool snaps
+* osd: disable xattr size limit (prevents upload of large rgw objects)
+* osd: fix recovery op throttling
+* osd: fix throttling of log messages for very slow requests
+* rgw: drain pending requests before completing write
+* rgw: fix CORS
+* rgw: fix inefficient list::size() usage
+* rgw: fix keystone token expiration
+* rgw: fix minor memory leaks
+* rgw: fix null termination of buffer
+
+For more detailed information, see :download:`the complete changelog <changelog/v0.67.4.txt>`.
+
+
v0.67.3 "Dumpling"
------------------
diff --git a/fusetrace/fusetrace_ll.cc b/fusetrace/fusetrace_ll.cc
index eb7100a867f..7f2b8438f1f 100644
--- a/fusetrace/fusetrace_ll.cc
+++ b/fusetrace/fusetrace_ll.cc
@@ -11,7 +11,7 @@
gcc -Wall `pkg-config fuse --cflags --libs` -lulockmgr fusexmp_fh.c -o fusexmp_fh
*/
-#define FUSE_USE_VERSION 26
+#define FUSE_USE_VERSION 30
#ifdef HAVE_CONFIG_H
#include <config.h>
diff --git a/qa/workunits/rbd/copy.sh b/qa/workunits/rbd/copy.sh
index 8430fca7665..7abb3956c88 100755
--- a/qa/workunits/rbd/copy.sh
+++ b/qa/workunits/rbd/copy.sh
@@ -109,8 +109,8 @@ test_ls() {
rbd ls | grep test2
rbd ls | wc -l | grep 2
# look for fields in output of ls -l without worrying about space
- rbd ls -l | grep 'test1.*1024K.*1'
- rbd ls -l | grep 'test2.*1024K.*1'
+ rbd ls -l | grep 'test1.*1024k.*1'
+ rbd ls -l | grep 'test2.*1024k.*1'
rbd rm test1
rbd rm test2
@@ -120,8 +120,8 @@ test_ls() {
rbd ls | grep test1
rbd ls | grep test2
rbd ls | wc -l | grep 2
- rbd ls -l | grep 'test1.*1024K.*2'
- rbd ls -l | grep 'test2.*1024K.*2'
+ rbd ls -l | grep 'test1.*1024k.*2'
+ rbd ls -l | grep 'test2.*1024k.*2'
rbd rm test1
rbd rm test2
@@ -131,8 +131,8 @@ test_ls() {
rbd ls | grep test1
rbd ls | grep test2
rbd ls | wc -l | grep 2
- rbd ls -l | grep 'test1.*1024K.*2'
- rbd ls -l | grep 'test2.*1024K.*1'
+ rbd ls -l | grep 'test1.*1024k.*2'
+ rbd ls -l | grep 'test2.*1024k.*1'
remove_images
# test that many images can be shown by ls
diff --git a/qa/workunits/rbd/import_export.sh b/qa/workunits/rbd/import_export.sh
index 353a47fffbe..1813f7a9a88 100755
--- a/qa/workunits/rbd/import_export.sh
+++ b/qa/workunits/rbd/import_export.sh
@@ -66,7 +66,7 @@ dd if=/dev/urandom bs=1M count=1 of=/tmp/sparse2; truncate /tmp/sparse2 -s 2M
# 1M sparse, 1M data
rbd import $RBD_CREATE_ARGS --order 20 /tmp/sparse1
-rbd ls -l | grep sparse1 | grep '2048K'
+rbd ls -l | grep sparse1 | grep '2048k'
[ "$(objects sparse1)" = '1' ]
# export, compare contents and on-disk size
@@ -77,7 +77,7 @@ rbd rm sparse1
# 1M data, 1M sparse
rbd import $RBD_CREATE_ARGS --order 20 /tmp/sparse2
-rbd ls -l | grep sparse2 | grep '2048K'
+rbd ls -l | grep sparse2 | grep '2048k'
[ "$(objects sparse2)" = '0' ]
rbd export sparse2 /tmp/sparse2.out
compare_files_and_ondisk_sizes /tmp/sparse2 /tmp/sparse2.out
@@ -88,7 +88,7 @@ rbd rm sparse2
truncate /tmp/sparse1 -s 10M
# import from stdin just for fun, verify still sparse
rbd import $RBD_CREATE_ARGS --order 20 - sparse1 < /tmp/sparse1
-rbd ls -l | grep sparse1 | grep '10240K'
+rbd ls -l | grep sparse1 | grep '10240k'
[ "$(objects sparse1)" = '1' ]
rbd export sparse1 /tmp/sparse1.out
compare_files_and_ondisk_sizes /tmp/sparse1 /tmp/sparse1.out
@@ -99,7 +99,7 @@ rbd rm sparse1
dd if=/dev/urandom bs=2M count=1 of=/tmp/sparse2 oflag=append conv=notrunc
# again from stdin
rbd import $RBD_CREATE_ARGS --order 20 - sparse2 < /tmp/sparse2
-rbd ls -l | grep sparse2 | grep '4096K'
+rbd ls -l | grep sparse2 | grep '4096k'
[ "$(objects sparse2)" = '0 2 3' ]
rbd export sparse2 /tmp/sparse2.out
compare_files_and_ondisk_sizes /tmp/sparse2 /tmp/sparse2.out
diff --git a/qa/workunits/suites/fsstress.sh b/qa/workunits/suites/fsstress.sh
index 7f945172687..394e5fad991 100755
--- a/qa/workunits/suites/fsstress.sh
+++ b/qa/workunits/suites/fsstress.sh
@@ -2,6 +2,7 @@
if [ ! -f /usr/lib/ltp/testcases/bin/fsstress ]
then
+ path=`pwd`
mkdir -p /tmp/fsstress
cd /tmp/fsstress
wget -q -O /tmp/fsstress/ltp-full.tgz http://ceph.com/qa/ltp-full-20091231.tgz
@@ -13,6 +14,7 @@ then
sudo cp -avf /tmp/fsstress/ltp-full-20091231/testcases/kernel/fs/fsstress/fsstress /usr/lib/ltp/testcases/bin/fsstress
sudo chmod 755 /usr/lib/ltp/testcases/bin/fsstress
rm -Rf /tmp/fsstress
+ cd $path
fi
command="/usr/lib/ltp/testcases/bin/fsstress -d fsstress-`hostname`$$ -l 1 -n 1000 -p 10 -v"
diff --git a/src/.gitignore b/src/.gitignore
index 6efe8dc6bc4..8542ba868f9 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -43,6 +43,7 @@ Makefile
/ceph_smalliobenchrbd
/ceph-monstore-tool
/ceph-osdomap-tool
+/ceph-kvstore-tool
/ceph_ver.h
/dev
/init-ceph
diff --git a/src/ceph-disk b/src/ceph-disk
index 0691d252cd7..64d944d9db0 100755
--- a/src/ceph-disk
+++ b/src/ceph-disk
@@ -570,7 +570,7 @@ def get_fsid(cluster):
fsid = get_conf(cluster=cluster, variable='fsid')
if fsid is None:
raise Error('getting cluster uuid from configuration failed')
- return fsid
+ return fsid.lower()
def get_or_create_dmcrypt_key(
@@ -888,15 +888,12 @@ def prepare_journal_dev(
def prepare_journal_file(
- journal,
- journal_size):
+ journal):
if not os.path.exists(journal):
- LOG.debug('Creating journal file %s with size %dM', journal, journal_size)
+ LOG.debug('Creating journal file %s with size 0 (ceph-osd will resize and allocate)', journal)
with file(journal, 'wb') as journal_file:
- journal_file.truncate(journal_size * 1048576)
-
- # FIXME: should we resize an existing journal file?
+ pass
LOG.debug('Journal is file %s', journal)
LOG.warning('OSD will not be hot-swappable if journal is not the same device as the osd data')
@@ -921,13 +918,13 @@ def prepare_journal(
if not os.path.exists(journal):
if force_dev:
raise Error('Journal does not exist; not a block device', journal)
- return prepare_journal_file(journal, journal_size)
+ return prepare_journal_file(journal)
jmode = os.stat(journal).st_mode
if stat.S_ISREG(jmode):
if force_dev:
raise Error('Journal is not a block device', journal)
- return prepare_journal_file(journal, journal_size)
+ return prepare_journal_file(journal)
if stat.S_ISBLK(jmode):
if force_file:
@@ -1604,6 +1601,7 @@ def find_cluster_by_uuid(_uuid):
Find a cluster name by searching /etc/ceph/*.conf for a conf file
with the right uuid.
"""
+ _uuid = _uuid.lower()
no_fsid = []
if not os.path.exists('/etc/ceph'):
return None
@@ -1611,11 +1609,15 @@ def find_cluster_by_uuid(_uuid):
if not conf_file.endswith('.conf'):
continue
cluster = conf_file[:-5]
- fsid = get_conf(cluster, 'fsid')
- if fsid is None:
+ try:
+ fsid = get_fsid(cluster)
+ except Error as e:
+ if e.message != 'getting cluster uuid from configuration failed':
+ raise e
no_fsid.append(cluster)
- elif fsid == _uuid:
- return cluster
+ else:
+ if fsid == _uuid:
+ return cluster
# be tolerant of /etc/ceph/ceph.conf without an fsid defined.
if len(no_fsid) == 1 and no_fsid[0] == 'ceph':
LOG.warning('No fsid defined in /etc/ceph/ceph.conf; using anyway')
diff --git a/src/client/Client.cc b/src/client/Client.cc
index 77fd2084cf1..60a5e4550b8 100644
--- a/src/client/Client.cc
+++ b/src/client/Client.cc
@@ -148,9 +148,12 @@ Client::Client(Messenger *m, MonClient *mc)
timer(m->cct, client_lock),
ino_invalidate_cb(NULL),
ino_invalidate_cb_handle(NULL),
+ dentry_invalidate_cb(NULL),
+ dentry_invalidate_cb_handle(NULL),
getgroups_cb(NULL),
getgroups_cb_handle(NULL),
async_ino_invalidator(m->cct),
+ async_dentry_invalidator(m->cct),
tick_event(NULL),
monclient(mc), messenger(m), whoami(m->get_myname().num()),
initialized(false), mounted(false), unmounting(false),
@@ -410,11 +413,17 @@ void Client::shutdown()
admin_socket->unregister_command("dump_cache");
if (ino_invalidate_cb) {
- ldout(cct, 10) << "shutdown stopping invalidator finisher" << dendl;
+ ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
async_ino_invalidator.wait_for_empty();
async_ino_invalidator.stop();
}
+ if (dentry_invalidate_cb) {
+ ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
+ async_dentry_invalidator.wait_for_empty();
+ async_dentry_invalidator.stop();
+ }
+
objectcacher->stop(); // outside of client_lock! this does a join.
client_lock.Lock();
@@ -1532,7 +1541,7 @@ void Client::_closed_mds_session(MetaSession *s)
signal_context_list(s->waiting_for_open);
mount_cond.Signal();
remove_session_caps(s);
- kick_requests(s, true);
+ kick_requests_closed(s);
mds_sessions.erase(s->mds_num);
delete s;
}
@@ -1905,7 +1914,7 @@ void Client::handle_mds_map(MMDSMap* m)
if (newstate >= MDSMap::STATE_ACTIVE) {
if (oldstate < MDSMap::STATE_ACTIVE) {
- kick_requests(p->second, false);
+ kick_requests(p->second);
kick_flushing_caps(p->second);
signal_context_list(p->second->waiting_for_open);
kick_maxsize_requests(p->second);
@@ -1989,25 +1998,16 @@ void Client::send_reconnect(MetaSession *session)
}
-void Client::kick_requests(MetaSession *session, bool signal)
+void Client::kick_requests(MetaSession *session)
{
ldout(cct, 10) << "kick_requests for mds." << session->mds_num << dendl;
-
for (map<tid_t, MetaRequest*>::iterator p = mds_requests.begin();
p != mds_requests.end();
- ++p)
+ ++p) {
if (p->second->mds == session->mds_num) {
- if (signal) {
- // only signal caller if there is a caller
- // otherwise, let resend_unsafe handle it
- if (p->second->caller_cond) {
- p->second->kick = true;
- p->second->caller_cond->Signal();
- }
- } else {
- send_request(p->second, session);
- }
+ send_request(p->second, session);
}
+ }
}
void Client::resend_unsafe_requests(MetaSession *session)
@@ -2018,6 +2018,25 @@ void Client::resend_unsafe_requests(MetaSession *session)
send_request(*iter, session);
}
+void Client::kick_requests_closed(MetaSession *session)
+{
+ ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl;
+ for (map<tid_t, MetaRequest*>::iterator p = mds_requests.begin();
+ p != mds_requests.end();
+ ++p) {
+ if (p->second->mds == session->mds_num) {
+ if (p->second->caller_cond) {
+ p->second->kick = true;
+ p->second->caller_cond->Signal();
+ }
+ p->second->item.remove_myself();
+ p->second->unsafe_item.remove_myself();
+ }
+ }
+ assert(session->requests.empty());
+ assert(session->unsafe_requests.empty());
+}
+
@@ -3551,6 +3570,45 @@ void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCa
m->put();
}
+class C_Client_DentryInvalidate : public Context {
+private:
+ Client *client;
+ vinodeno_t dirino;
+ vinodeno_t ino;
+ string name;
+public:
+ C_Client_DentryInvalidate(Client *c, Dentry *dn) :
+ client(c), dirino(dn->dir->parent_inode->vino()),
+ ino(dn->inode->vino()), name(dn->name) { }
+ void finish(int r) {
+ client->_async_dentry_invalidate(dirino, ino, name);
+ }
+};
+
+void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
+{
+ ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino
+ << " in dir " << dirino << dendl;
+ dentry_invalidate_cb(dentry_invalidate_cb_handle, dirino, ino, name);
+}
+
+void Client::_schedule_invalidate_dentry_callback(Dentry *dn)
+{
+ if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
+ async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn));
+}
+
+void Client::_invalidate_inode_parents(Inode *in)
+{
+ set<Dentry*>::iterator q = in->dn_set.begin();
+ while (q != in->dn_set.end()) {
+ Dentry *dn = *q++;
+ // FIXME: we play lots of unlink/link tricks when handling MDS replies,
+ // so in->dn_set doesn't always reflect the state of kernel's dcache.
+ _schedule_invalidate_dentry_callback(dn);
+ unlink(dn, false);
+ }
+}
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
@@ -3578,8 +3636,12 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient
in->uid = m->head.uid;
in->gid = m->head.gid;
}
+ bool deleted_inode = false;
if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
in->nlink = m->head.nlink;
+ if (in->nlink == 0 &&
+ (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
+ deleted_inode = true;
}
if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
m->xattrbl.length() &&
@@ -3633,6 +3695,10 @@ void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClient
if (new_caps)
signal_cond_list(in->waitfor_caps);
+ // may drop inode's last ref
+ if (deleted_inode)
+ _invalidate_inode_parents(in);
+
m->put();
}
@@ -6319,6 +6385,17 @@ void Client::ll_register_ino_invalidate_cb(client_ino_callback_t cb, void *handl
async_ino_invalidator.start();
}
+void Client::ll_register_dentry_invalidate_cb(client_dentry_callback_t cb, void *handle)
+{
+ Mutex::Locker l(client_lock);
+ ldout(cct, 10) << "ll_register_dentry_invalidate_cb cb " << (void*)cb << " p " << (void*)handle << dendl;
+ if (cb == NULL)
+ return;
+ dentry_invalidate_cb = cb;
+ dentry_invalidate_cb_handle = handle;
+ async_dentry_invalidator.start();
+}
+
void Client::ll_register_getgroups_cb(client_getgroups_callback_t cb, void *handle)
{
Mutex::Locker l(client_lock);
diff --git a/src/client/Client.h b/src/client/Client.h
index c7c9cef0e0c..df59f235de4 100644
--- a/src/client/Client.h
+++ b/src/client/Client.h
@@ -120,6 +120,9 @@ struct MetaRequest;
typedef void (*client_ino_callback_t)(void *handle, vinodeno_t ino, int64_t off, int64_t len);
+typedef void (*client_dentry_callback_t)(void *handle, vinodeno_t dirino,
+ vinodeno_t ino, string& name);
+
typedef int (*client_getgroups_callback_t)(void *handle, uid_t uid, gid_t **sgids);
// ========================================================
@@ -211,10 +214,14 @@ class Client : public Dispatcher {
client_ino_callback_t ino_invalidate_cb;
void *ino_invalidate_cb_handle;
+ client_dentry_callback_t dentry_invalidate_cb;
+ void *dentry_invalidate_cb_handle;
+
client_getgroups_callback_t getgroups_cb;
void *getgroups_cb_handle;
Finisher async_ino_invalidator;
+ Finisher async_dentry_invalidator;
Context *tick_event;
utime_t last_cap_renew;
@@ -270,7 +277,8 @@ public:
void connect_mds_targets(int mds);
void send_request(MetaRequest *request, MetaSession *session);
MClientRequest *build_client_request(MetaRequest *request);
- void kick_requests(MetaSession *session, bool signal);
+ void kick_requests(MetaSession *session);
+ void kick_requests_closed(MetaSession *session);
void handle_client_request_forward(MClientRequestForward *reply);
void handle_client_reply(MClientReply *reply);
@@ -357,6 +365,7 @@ protected:
friend class C_Client_PutInode; // calls put_inode()
friend class C_Client_CacheInvalidate; // calls ino_invalidate_cb
+ friend class C_Client_DentryInvalidate; // calls dentry_invalidate_cb
//int get_cache_size() { return lru.lru_get_size(); }
//void set_cache_size(int m) { lru.lru_set_max(m); }
@@ -459,6 +468,10 @@ protected:
void finish_cap_snap(Inode *in, CapSnap *capsnap, int used);
void _flushed_cap_snap(Inode *in, snapid_t seq);
+ void _schedule_invalidate_dentry_callback(Dentry *dn);
+ void _async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name);
+ void _invalidate_inode_parents(Inode *in);
+
void _schedule_invalidate_callback(Inode *in, int64_t off, int64_t len, bool keep_caps);
void _invalidate_inode_cache(Inode *in, bool keep_caps);
void _invalidate_inode_cache(Inode *in, int64_t off, int64_t len, bool keep_caps);
@@ -735,6 +748,8 @@ public:
void ll_register_ino_invalidate_cb(client_ino_callback_t cb, void *handle);
+ void ll_register_dentry_invalidate_cb(client_dentry_callback_t cb, void *handle);
+
void ll_register_getgroups_cb(client_getgroups_callback_t cb, void *handle);
};
diff --git a/src/client/fuse_ll.cc b/src/client/fuse_ll.cc
index 6bf5ea3d34f..88f727e454e 100644
--- a/src/client/fuse_ll.cc
+++ b/src/client/fuse_ll.cc
@@ -12,7 +12,7 @@
*
*/
-#define FUSE_USE_VERSION 26
+#define FUSE_USE_VERSION 30
#include <fuse/fuse.h>
#include <fuse/fuse_lowlevel.h>
@@ -551,7 +551,7 @@ static int getgroups_cb(void *handle, uid_t uid, gid_t **sgids)
}
#endif
-static void invalidate_cb(void *handle, vinodeno_t vino, int64_t off, int64_t len)
+static void ino_invalidate_cb(void *handle, vinodeno_t vino, int64_t off, int64_t len)
{
#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8)
CephFuse::Handle *cfuse = (CephFuse::Handle *)handle;
@@ -560,6 +560,19 @@ static void invalidate_cb(void *handle, vinodeno_t vino, int64_t off, int64_t le
#endif
}
+static void dentry_invalidate_cb(void *handle, vinodeno_t dirino,
+ vinodeno_t ino, string& name)
+{
+ CephFuse::Handle *cfuse = (CephFuse::Handle *)handle;
+ fuse_ino_t fdirino = cfuse->make_fake_ino(dirino.ino, dirino.snapid);
+#if FUSE_VERSION >= FUSE_MAKE_VERSION(2, 9)
+ fuse_ino_t fino = cfuse->make_fake_ino(ino.ino, ino.snapid);
+ fuse_lowlevel_notify_delete(cfuse->ch, fdirino, fino, name.c_str(), name.length());
+#elif FUSE_VERSION >= FUSE_MAKE_VERSION(2, 8)
+ fuse_lowlevel_notify_inval_entry(cfuse->ch, fdirino, name.c_str(), name.length());
+#endif
+}
+
static void do_init(void *data, fuse_conn_info *bar)
{
CephFuse::Handle *cfuse = (CephFuse::Handle *)data;
@@ -743,9 +756,10 @@ int CephFuse::Handle::init(int argc, const char *argv[])
client->ll_register_getgroups_cb(getgroups_cb, this);
*/
+ client->ll_register_dentry_invalidate_cb(dentry_invalidate_cb, this);
if (client->cct->_conf->fuse_use_invalidate_cb)
- client->ll_register_ino_invalidate_cb(invalidate_cb, this);
+ client->ll_register_ino_invalidate_cb(ino_invalidate_cb, this);
done:
fuse_opt_free_args(&args);
diff --git a/src/common/Makefile.am b/src/common/Makefile.am
index deddc5d831c..9ec6c3e895b 100644
--- a/src/common/Makefile.am
+++ b/src/common/Makefile.am
@@ -65,7 +65,8 @@ libcommon_la_SOURCES = \
common/ceph_strings.cc \
common/ceph_frag.cc \
common/addr_parsing.c \
- common/hobject.cc
+ common/hobject.cc \
+ common/bloom_filter.cc
if LINUX
libcommon_la_SOURCES += common/secret.c
@@ -97,6 +98,7 @@ LIBCOMMON_DEPS += libcommon_crc.la
noinst_LTLIBRARIES += libcommon_crc.la
noinst_HEADERS += \
+ common/bloom_filter.hpp \
common/sctp_crc32.h \
common/crc32c_intel_baseline.h \
common/crc32c_intel_fast.h
diff --git a/src/common/bloom_filter.cc b/src/common/bloom_filter.cc
new file mode 100644
index 00000000000..f602b80149e
--- /dev/null
+++ b/src/common/bloom_filter.cc
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "include/types.h"
+#include "common/bloom_filter.hpp"
+
+void bloom_filter::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ ::encode((uint64_t)salt_count_, bl);
+ ::encode((uint64_t)table_size_, bl);
+ ::encode((uint64_t)inserted_element_count_, bl);
+ ::encode((uint64_t)random_seed_, bl);
+ bufferptr bp((const char*)bit_table_, raw_table_size_);
+ ::encode(bp, bl);
+ ENCODE_FINISH(bl);
+}
+
+void bloom_filter::decode(bufferlist::iterator& p)
+{
+ DECODE_START(1, p);
+ uint64_t v;
+ ::decode(v, p);
+ salt_count_ = v;
+ ::decode(v, p);
+ table_size_ = v;
+ ::decode(v, p);
+ inserted_element_count_ = v;
+ ::decode(v, p);
+ random_seed_ = v;
+ bufferlist t;
+ ::decode(t, p);
+
+ salt_.clear();
+ generate_unique_salt();
+ raw_table_size_ = t.length();
+ assert(raw_table_size_ == table_size_ / bits_per_char);
+ delete[] bit_table_;
+ bit_table_ = new cell_type[raw_table_size_];
+ t.copy(0, raw_table_size_, (char *)bit_table_);
+
+ DECODE_FINISH(p);
+}
+
+void bloom_filter::dump(Formatter *f) const
+{
+ f->dump_unsigned("salt_count", salt_count_);
+ f->dump_unsigned("table_size", table_size_);
+ f->dump_unsigned("raw_table_size", raw_table_size_);
+ f->dump_unsigned("insert_count", inserted_element_count_);
+ f->dump_unsigned("random_seed", random_seed_);
+
+ f->open_array_section("salt_table");
+ for (std::vector<bloom_type>::const_iterator i = salt_.begin(); i != salt_.end(); ++i)
+ f->dump_unsigned("salt", *i);
+ f->close_section();
+
+ f->open_array_section("bit_table");
+ for (unsigned i = 0; i < raw_table_size_; ++i)
+ f->dump_unsigned("byte", (unsigned)bit_table_[i]);
+ f->close_section();
+}
+
+void bloom_filter::generate_test_instances(list<bloom_filter*>& ls)
+{
+ ls.push_back(new bloom_filter(10, .5, 1));
+ ls.push_back(new bloom_filter(10, .5, 1));
+ ls.back()->insert("foo");
+ ls.back()->insert("bar");
+ ls.push_back(new bloom_filter(50, .5, 1));
+ ls.back()->insert("foo");
+ ls.back()->insert("bar");
+ ls.back()->insert("baz");
+ ls.back()->insert("boof");
+ ls.back()->insert("boogggg");
+}
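+
+// Hypothetical round-trip sketch ( illustration only, not part of the
+// unit tests ): encode a filter into a bufferlist and decode it back.
+//
+//   bloom_filter bf(10, .1, 1);
+//   bf.insert("foo");
+//   bufferlist bl;
+//   bf.encode(bl);
+//   bloom_filter bf2;
+//   bufferlist::iterator it = bl.begin();
+//   bf2.decode(it);
+//   assert(bf2.contains("foo"));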
diff --git a/src/common/bloom_filter.hpp b/src/common/bloom_filter.hpp
new file mode 100644
index 00000000000..6216c7fb34d
--- /dev/null
+++ b/src/common/bloom_filter.hpp
@@ -0,0 +1,627 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+/*
+ *******************************************************************
+ * *
+ * Open Bloom Filter *
+ * *
+ * Author: Arash Partow - 2000 *
+ * URL: http://www.partow.net/programming/hashfunctions/index.html *
+ * *
+ * Copyright notice: *
+ * Free use of the Open Bloom Filter Library is permitted under *
+ * the guidelines and in accordance with the most current version *
+ * of the Boost Software License, Version 1.0 *
+ * http://www.opensource.org/licenses/bsl1.0.html *
+ * *
+ *******************************************************************
+*/
+
+
+#ifndef COMMON_BLOOM_FILTER_HPP
+#define COMMON_BLOOM_FILTER_HPP
+
+#include <cstddef>
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <list>
+#include <string>
+#include <vector>
+
+#include "include/encoding.h"
+#include "common/Formatter.h"
+
+static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
+static const unsigned char bit_mask[bits_per_char] = {
+ 0x01, //00000001
+ 0x02, //00000010
+ 0x04, //00000100
+ 0x08, //00001000
+ 0x10, //00010000
+ 0x20, //00100000
+ 0x40, //01000000
+ 0x80 //10000000
+};
+
+
+class bloom_filter
+{
+protected:
+
+ typedef unsigned int bloom_type;
+ typedef unsigned char cell_type;
+
+public:
+
+ bloom_filter()
+ : bit_table_(0),
+ salt_count_(0),
+ table_size_(0),
+ raw_table_size_(0),
+ inserted_element_count_(0),
+ random_seed_(0)
+ {}
+
+ bloom_filter(const std::size_t& predicted_inserted_element_count,
+ const double& false_positive_probability,
+ const std::size_t& random_seed)
+ : bit_table_(0),
+ inserted_element_count_(0),
+ random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
+ {
+ find_optimal_parameters(predicted_inserted_element_count, false_positive_probability,
+ &salt_count_, &table_size_);
+ init();
+ }
+
+ bloom_filter(const std::size_t& salt_count, std::size_t table_size,
+ const std::size_t& random_seed)
+ : bit_table_(0),
+ salt_count_(salt_count),
+ table_size_(table_size),
+ inserted_element_count_(0),
+ random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
+ {
+ init();
+ }
+
+ void init() {
+ generate_unique_salt();
+ raw_table_size_ = table_size_ / bits_per_char;
+ bit_table_ = new cell_type[raw_table_size_];
+ std::fill_n(bit_table_,raw_table_size_,0x00);
+ }
+
+ bloom_filter(const bloom_filter& filter)
+   : bit_table_(0)  // so operator= can safely delete[] the old table
+ {
+ this->operator=(filter);
+ }
+
+ bloom_filter& operator = (const bloom_filter& filter)
+ {
+ if (this != &filter) {
+ salt_count_ = filter.salt_count_;
+ table_size_ = filter.table_size_;
+ raw_table_size_ = filter.raw_table_size_;
+ inserted_element_count_ = filter.inserted_element_count_;
+ random_seed_ = filter.random_seed_;
+ delete[] bit_table_;
+ bit_table_ = new cell_type[raw_table_size_];
+ std::copy(filter.bit_table_,filter.bit_table_ + raw_table_size_,bit_table_);
+ salt_ = filter.salt_;
+ }
+ return *this;
+ }
+
+ virtual ~bloom_filter()
+ {
+ delete[] bit_table_;
+ }
+
+ inline bool operator!() const
+ {
+ return (0 == table_size_);
+ }
+
+ inline void clear()
+ {
+ std::fill_n(bit_table_,raw_table_size_,0x00);
+ inserted_element_count_ = 0;
+ }
+
+ /**
+ * insert a u32 into the set
+ *
+ * NOTE: the internal hash is weak enough that consecutive inputs do
+ * not achieve the desired fpp. Well-mixed values should be used
+ * here (e.g., put rjhash(x) into the filter instead of just x).
+ *
+ * @param val integer value to insert
+ */
+ inline void insert(uint32_t val) {
+ std::size_t bit_index = 0;
+ std::size_t bit = 0;
+ for (std::size_t i = 0; i < salt_.size(); ++i)
+ {
+ compute_indices(hash_ap(val,salt_[i]),bit_index,bit);
+ bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
+ }
+ ++inserted_element_count_;
+ }
+
+ inline void insert(const unsigned char* key_begin, const std::size_t& length)
+ {
+ std::size_t bit_index = 0;
+ std::size_t bit = 0;
+ for (std::size_t i = 0; i < salt_.size(); ++i)
+ {
+ compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
+ bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
+ }
+ ++inserted_element_count_;
+ }
+
+ template<typename T>
+ inline void insert(const T& t)
+ {
+ // Note: T must be a C++ POD type.
+ insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T));
+ }
+
+ inline void insert(const std::string& key)
+ {
+ insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
+ }
+
+ inline void insert(const char* data, const std::size_t& length)
+ {
+ insert(reinterpret_cast<const unsigned char*>(data),length);
+ }
+
+ template<typename InputIterator>
+ inline void insert(const InputIterator begin, const InputIterator end)
+ {
+ InputIterator itr = begin;
+ while (end != itr)
+ {
+ insert(*(itr++));
+ }
+ }
+
+ /**
+ * check if a u32 is contained by the set
+ *
+ * NOTE: the internal hash is weak enough that consecutive inputs do
+ * not achieve the desired fpp. Well-mixed values should be used
+ * here (e.g., put rjhash(x) into the filter instead of just x).
+ *
+ * @param val integer value to query
+ * @returns true if value is (probably) in the set, false if it definitely is not
+ */
+ inline virtual bool contains(uint32_t val) const
+ {
+ std::size_t bit_index = 0;
+ std::size_t bit = 0;
+ for (std::size_t i = 0; i < salt_.size(); ++i)
+ {
+ compute_indices(hash_ap(val,salt_[i]),bit_index,bit);
+ if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
+ {
+ std::size_t bit_index = 0;
+ std::size_t bit = 0;
+ for (std::size_t i = 0; i < salt_.size(); ++i)
+ {
+ compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
+ if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ template<typename T>
+ inline bool contains(const T& t) const
+ {
+ return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T)));
+ }
+
+ inline bool contains(const std::string& key) const
+ {
+ return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
+ }
+
+ inline bool contains(const char* data, const std::size_t& length) const
+ {
+ return contains(reinterpret_cast<const unsigned char*>(data),length);
+ }
+
+ template<typename InputIterator>
+ inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
+ {
+ InputIterator itr = begin;
+ while (end != itr)
+ {
+ if (!contains(*itr))
+ {
+ return itr;
+ }
+ ++itr;
+ }
+ return end;
+ }
+
+ template<typename InputIterator>
+ inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
+ {
+ InputIterator itr = begin;
+ while (end != itr)
+ {
+ if (contains(*itr))
+ {
+ return itr;
+ }
+ ++itr;
+ }
+ return end;
+ }
+
+ inline virtual std::size_t size() const
+ {
+ return table_size_;
+ }
+
+ inline std::size_t element_count() const
+ {
+ return inserted_element_count_;
+ }
+
+ inline double effective_fpp() const
+ {
+ /*
+ Note:
+ The effective false positive probability is calculated using the
+ designated table size and hash function count in conjunction with
+ the current number of inserted elements - not the user defined
+ predicted/expected number of inserted elements.
+ */
+ return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size());
+ }
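+
+  // The expression above computes the standard Bloom filter estimate
+  //   p = (1 - e^(-k*n/m))^k
+  // with k = salt_.size() hash functions, n = inserted_element_count_
+  // and m = size() bits.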
+
+ inline bloom_filter& operator &= (const bloom_filter& filter)
+ {
+ /* intersection */
+ if (
+ (salt_count_ == filter.salt_count_) &&
+ (table_size_ == filter.table_size_) &&
+ (random_seed_ == filter.random_seed_)
+ ) {
+ for (std::size_t i = 0; i < raw_table_size_; ++i) {
+ bit_table_[i] &= filter.bit_table_[i];
+ }
+ }
+ return *this;
+ }
+
+ inline bloom_filter& operator |= (const bloom_filter& filter)
+ {
+ /* union */
+ if (
+ (salt_count_ == filter.salt_count_) &&
+ (table_size_ == filter.table_size_) &&
+ (random_seed_ == filter.random_seed_)
+ ) {
+ for (std::size_t i = 0; i < raw_table_size_; ++i) {
+ bit_table_[i] |= filter.bit_table_[i];
+ }
+ }
+ return *this;
+ }
+
+ inline bloom_filter& operator ^= (const bloom_filter& filter)
+ {
+ /* symmetric difference (xor) */
+ if (
+ (salt_count_ == filter.salt_count_) &&
+ (table_size_ == filter.table_size_) &&
+ (random_seed_ == filter.random_seed_)
+ ) {
+ for (std::size_t i = 0; i < raw_table_size_; ++i) {
+ bit_table_[i] ^= filter.bit_table_[i];
+ }
+ }
+ return *this;
+ }
+
+ inline const cell_type* table() const
+ {
+ return bit_table_;
+ }
+
+protected:
+
+ inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
+ {
+ bit_index = hash % table_size_;
+ bit = bit_index % bits_per_char;
+ }
+
+ void generate_unique_salt()
+ {
+ /*
+ Note:
+ A distinct hash function need not be implementation-wise
+ distinct. In the current implementation "seeding" a common
+ hash function with different values seems to be adequate.
+ */
+ const unsigned int predef_salt_count = 128;
+ static const bloom_type predef_salt[predef_salt_count] = {
+ 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
+ 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
+ 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
+ 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
+ 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
+ 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
+ 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
+ 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
+ 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
+ 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
+ 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
+ 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
+ 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
+ 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
+ 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
+ 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
+ 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
+ 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
+ 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
+ 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
+ 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
+ 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
+ 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
+ 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
+ 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
+ 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
+ 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
+ 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
+ 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
+ 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
+ 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
+ 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
+ };
+
+ if (salt_count_ <= predef_salt_count)
+ {
+ std::copy(predef_salt,
+ predef_salt + salt_count_,
+ std::back_inserter(salt_));
+ for (unsigned int i = 0; i < salt_.size(); ++i)
+ {
+ /*
+ Note:
+ This is done to integrate the user defined random seed,
+ so as to allow for the generation of unique bloom filter
+ instances.
+ */
+ salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_;
+ }
+ }
+ else
+ {
+ std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_));
+ srand(static_cast<unsigned int>(random_seed_));
+ while (salt_.size() < salt_count_)
+ {
+ bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
+ if (0 == current_salt)
+ continue;
+ if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
+ {
+ salt_.push_back(current_salt);
+ }
+ }
+ }
+ }
+
+ static void find_optimal_parameters(std::size_t target_insert_count,
+ double target_fpp,
+ std::size_t *salt_count,
+ std::size_t *table_size)
+ {
+ /*
+ Note:
+ The following will attempt to find the number of hash functions
+ and minimum amount of storage bits required to construct a bloom
+ filter consistent with the user defined false positive probability
+ and estimated element insertion count.
+ */
+
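+    // For each candidate hash count k, the minimal number of bits is
+    //   m = -k * n / ln(1 - p^(1/k))
+    // and the loop keeps the k that minimizes m.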
+ double min_m = std::numeric_limits<double>::infinity();
+ double min_k = 0.0;
+ double curr_m = 0.0;
+ double k = 1.0;
+ while (k < 1000.0)
+ {
+ double numerator = (- k * target_insert_count);
+ double denominator = std::log(1.0 - std::pow(target_fpp, 1.0 / k));
+ curr_m = numerator / denominator;
+
+ if (curr_m < min_m)
+ {
+ min_m = curr_m;
+ min_k = k;
+ }
+ k += 1.0;
+ }
+
+ *salt_count = static_cast<std::size_t>(min_k);
+ size_t t = static_cast<std::size_t>(min_m);
+ t += (((t % bits_per_char) != 0) ? (bits_per_char - (t % bits_per_char)) : 0);
+ *table_size = t;
+ }
+
+ inline bloom_type hash_ap(uint32_t val, bloom_type hash) const
+ {
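+    // Mix the four bytes of val into the hash, most significant byte
+    // first, mirroring the byte-array overload below.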
+ hash ^= (hash << 7) ^ ((val & 0xff000000) >> 24) * (hash >> 3);
+ hash ^= (~((hash << 11) + (((val & 0xff0000) >> 16) ^ (hash >> 5))));
+ hash ^= (hash << 7) ^ ((val & 0xff00) >> 8) * (hash >> 3);
+ hash ^= (~((hash << 11) + (((val & 0xff)) ^ (hash >> 5))));
+ return hash;
+ }
+
+ inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
+ {
+ const unsigned char* itr = begin;
+
+ while (remaining_length >= 4)
+ {
+ hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
+ hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
+ hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
+ hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
+ remaining_length -= 4;
+ }
+
+ while (remaining_length >= 2)
+ {
+ hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
+ hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
+ remaining_length -= 2;
+ }
+
+ if (remaining_length)
+ {
+ hash ^= (hash << 7) ^ (*itr) * (hash >> 3);
+ }
+
+ return hash;
+ }
+
+ std::vector<bloom_type> salt_;
+ unsigned char* bit_table_;
+ std::size_t salt_count_;
+ std::size_t table_size_;
+ std::size_t raw_table_size_;
+ std::size_t inserted_element_count_;
+ std::size_t random_seed_;
+
+public:
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(std::list<bloom_filter*>& ls);
+};
+WRITE_CLASS_ENCODER(bloom_filter)
+
+inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b)
+{
+ bloom_filter result = a;
+ result &= b;
+ return result;
+}
+
+inline bloom_filter operator | (const bloom_filter& a, const bloom_filter& b)
+{
+ bloom_filter result = a;
+ result |= b;
+ return result;
+}
+
+inline bloom_filter operator ^ (const bloom_filter& a, const bloom_filter& b)
+{
+ bloom_filter result = a;
+ result ^= b;
+ return result;
+}
+
+
+class compressible_bloom_filter : public bloom_filter
+{
+public:
+
+ compressible_bloom_filter(const std::size_t& predicted_element_count,
+ const double& false_positive_probability,
+ const std::size_t& random_seed)
+ : bloom_filter(predicted_element_count,false_positive_probability,random_seed)
+ {
+ size_list.push_back(table_size_);
+ }
+
+ inline virtual std::size_t size() const
+ {
+ return size_list.back();
+ }
+
+ inline bool compress(const double& percentage)
+ {
+ if ((0.0 >= percentage) || (percentage >= 100.0))
+ {
+ return false;
+ }
+
+ std::size_t original_table_size = size_list.back();
+ std::size_t new_table_size = static_cast<std::size_t>((size_list.back() * (1.0 - (percentage / 100.0))));
+ new_table_size -= (((new_table_size % bits_per_char) != 0) ? (new_table_size % bits_per_char) : 0);
+
+ if ((bits_per_char > new_table_size) || (new_table_size >= original_table_size))
+ {
+ return false;
+ }
+
+ cell_type* tmp = new cell_type[new_table_size / bits_per_char];
+ std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp);
+ cell_type* itr = bit_table_ + (new_table_size / bits_per_char);
+ cell_type* end = bit_table_ + (original_table_size / bits_per_char);
+ cell_type* itr_tmp = tmp;
+
+ while (end != itr)
+ {
+ *(itr_tmp++) |= (*itr++);
+ }
+
+ delete[] bit_table_;
+ bit_table_ = tmp;
+ size_list.push_back(new_table_size);
+
+ return true;
+ }
+
+private:
+
+ inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
+ {
+ bit_index = hash;
+ for (std::size_t i = 0; i < size_list.size(); ++i)
+ {
+ bit_index %= size_list[i];
+ }
+ bit = bit_index % bits_per_char;
+ }
+
+ std::vector<std::size_t> size_list;
+};
+
+#endif
+
+
+/*
+ Note 1:
+ If it can be guaranteed that bits_per_char will be of the form 2^n then
+ the following optimization can be used:
+
+ hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)];
+
+ Note 2:
+ For performance reasons where possible when allocating memory it should
+ be aligned (aligned_alloc) according to the architecture being used.
+*/
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index fad831f5543..08c2b0b4cae 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -160,6 +160,8 @@ OPTION(mon_pg_create_interval, OPT_FLOAT, 30.0) // no more than every 30s
OPTION(mon_pg_stuck_threshold, OPT_INT, 300) // number of seconds after which pgs can be considered inactive, unclean, or stale (see doc/control.rst under dump_stuck for more info)
OPTION(mon_pg_warn_min_per_osd, OPT_INT, 20) // min # pgs per (in) osd before we warn the admin
OPTION(mon_pg_warn_max_object_skew, OPT_FLOAT, 10.0) // max skew from average in objects per pg
+OPTION(mon_pg_warn_min_objects, OPT_INT, 10000) // do not warn below this object #
+OPTION(mon_pg_warn_min_pool_objects, OPT_INT, 1000) // do not warn on pools below this object #
OPTION(mon_osd_full_ratio, OPT_FLOAT, .95) // what % full makes an OSD "full"
OPTION(mon_osd_nearfull_ratio, OPT_FLOAT, .85) // what % full makes an OSD near full
OPTION(mon_globalid_prealloc, OPT_INT, 100) // how many globalids to prealloc
@@ -544,12 +546,19 @@ OPTION(filestore_index_retry_probability, OPT_DOUBLE, 0)
OPTION(filestore_debug_inject_read_err, OPT_BOOL, false)
OPTION(filestore_debug_omap_check, OPT_BOOL, 0) // Expensive debugging check on sync
+
// Use omap for xattrs for attrs over
-OPTION(filestore_xattr_use_omap, OPT_BOOL, false)
// filestore_max_inline_xattr_size or
-OPTION(filestore_max_inline_xattr_size, OPT_U32, 512)
+OPTION(filestore_max_inline_xattr_size, OPT_U32, 0) // 0 = use the fs-specific default below; non-zero overrides it
+OPTION(filestore_max_inline_xattr_size_xfs, OPT_U32, 65536)
+OPTION(filestore_max_inline_xattr_size_btrfs, OPT_U32, 2048)
+OPTION(filestore_max_inline_xattr_size_other, OPT_U32, 512)
+
// for more than filestore_max_inline_xattrs attrs
-OPTION(filestore_max_inline_xattrs, OPT_U32, 2)
+OPTION(filestore_max_inline_xattrs, OPT_U32, 0) // 0 = use the fs-specific default below; non-zero overrides it
+OPTION(filestore_max_inline_xattrs_xfs, OPT_U32, 10)
+OPTION(filestore_max_inline_xattrs_btrfs, OPT_U32, 10)
+OPTION(filestore_max_inline_xattrs_other, OPT_U32, 2)
OPTION(filestore_sloppy_crc, OPT_BOOL, false) // track sloppy crcs
OPTION(filestore_sloppy_crc_block_size, OPT_INT, 65536)
diff --git a/src/common/hobject.h b/src/common/hobject.h
index e483b664347..a769ad060d9 100644
--- a/src/common/hobject.h
+++ b/src/common/hobject.h
@@ -241,14 +241,14 @@ public:
return ret;
}
filestore_hobject_key_t get_filestore_key_u32() const {
- assert(!hobj.max);
- return hobj._reverse_nibbles(hobj.hash);
+ return hobj.get_filestore_key_u32();
}
filestore_hobject_key_t get_filestore_key() const {
- if (hobj.max)
- return 0x100000000ull;
- else
- return get_filestore_key_u32();
+ return hobj.get_filestore_key();
+ }
+
+ bool is_degenerate() const {
+ return generation == NO_GEN && shard_id == NO_SHARD;
}
// maximum sorted value.
@@ -292,8 +292,12 @@ namespace __gnu_cxx {
ostream& operator<<(ostream& out, const ghobject_t& o);
-WRITE_EQ_OPERATORS_3(ghobject_t, hobj, generation, shard_id)
-// sort ghobject_t's by <hobj, generation, shard_id>
+WRITE_EQ_OPERATORS_3(ghobject_t, hobj, shard_id, generation)
+// sort ghobject_t's by <hobj, shard_id, generation>
+//
+// Two objects which differ by generation are more related than
+// two objects of the same generation which differ by shard.
+//
WRITE_CMP_OPERATORS_3(ghobject_t,
hobj,
shard_id,
diff --git a/src/common/safe_io.c b/src/common/safe_io.c
index ac99db04ad3..afee82edf07 100644
--- a/src/common/safe_io.c
+++ b/src/common/safe_io.c
@@ -14,8 +14,12 @@
#define _XOPEN_SOURCE 500
+#include <stdio.h>
+#include <string.h>
#include <unistd.h>
#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
#include "common/safe_io.h"
@@ -112,3 +116,79 @@ ssize_t safe_pwrite(int fd, const void *buf, size_t count, off_t offset)
}
return 0;
}
+
+int safe_write_file(const char *base, const char *file,
+ const char *val, size_t vallen)
+{
+ int ret;
+ char fn[PATH_MAX];
+ char tmp[PATH_MAX];
+ int fd;
+
+ // does the file already have correct content?
+ char oldval[80];
+ ret = safe_read_file(base, file, oldval, sizeof(oldval));
+ if (ret == (int)vallen && memcmp(oldval, val, vallen) == 0)
+ return 0; // yes.
+
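+  // Crash-safe update: write the new value to a temporary file, fsync
+  // it, rename it over the target, then fsync the containing directory
+  // so the rename itself is durable.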
+ snprintf(fn, sizeof(fn), "%s/%s", base, file);
+ snprintf(tmp, sizeof(tmp), "%s/%s.tmp", base, file);
+ fd = open(tmp, O_WRONLY|O_CREAT|O_TRUNC, 0644);
+ if (fd < 0) {
+ ret = errno;
+ return -ret;
+ }
+ ret = safe_write(fd, val, vallen);
+ if (ret) {
+ TEMP_FAILURE_RETRY(close(fd));
+ return ret;
+ }
+
+ ret = fsync(fd);
+ if (ret < 0) ret = -errno;
+ TEMP_FAILURE_RETRY(close(fd));
+ if (ret < 0) {
+ unlink(tmp);
+ return ret;
+ }
+ ret = rename(tmp, fn);
+ if (ret < 0) {
+ ret = -errno;
+ unlink(tmp);
+ return ret;
+ }
+
+ fd = open(base, O_RDONLY);
+ if (fd < 0) {
+ ret = -errno;
+ return ret;
+ }
+ ret = fsync(fd);
+ if (ret < 0) ret = -errno;
+ TEMP_FAILURE_RETRY(close(fd));
+
+ return ret;
+}
+
+int safe_read_file(const char *base, const char *file,
+ char *val, size_t vallen)
+{
+ char fn[PATH_MAX];
+ int fd, len;
+
+ snprintf(fn, sizeof(fn), "%s/%s", base, file);
+ fd = open(fn, O_RDONLY);
+ if (fd < 0) {
+ return -errno;
+ }
+ len = safe_read(fd, val, vallen - 1);
+ if (len < 0) {
+ TEMP_FAILURE_RETRY(close(fd));
+ return len;
+ }
+ // close sometimes returns errors, but only after write()
+ TEMP_FAILURE_RETRY(close(fd));
+
+ val[len] = 0;
+ return len;
+}
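+
+/*
+ * Hypothetical usage sketch ( illustration only ): persist a small
+ * value and read it back; the base directory is arbitrary.
+ *
+ *   char buf[80];
+ *   int r = safe_write_file("/tmp", "fsid", "abc123", 6);
+ *   if (r < 0)
+ *     return r;
+ *   r = safe_read_file("/tmp", "fsid", buf, sizeof(buf));
+ *   // on success r is the byte count read and buf is 0-terminated
+ */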
diff --git a/src/common/safe_io.h b/src/common/safe_io.h
index 4c2991fe6e8..a4c9bc7a72f 100644
--- a/src/common/safe_io.h
+++ b/src/common/safe_io.h
@@ -45,6 +45,15 @@ extern "C" {
ssize_t safe_pread_exact(int fd, void *buf, size_t count, off_t offset)
WARN_UNUSED_RESULT;
+
+ /*
+ * Safe functions to read and write an entire file.
+ */
+ int safe_write_file(const char *base, const char *file,
+ const char *val, size_t vallen);
+ int safe_read_file(const char *base, const char *file,
+ char *val, size_t vallen);
+
#ifdef __cplusplus
}
#endif
diff --git a/src/include/Makefile.am b/src/include/Makefile.am
index d702ebd2795..2d98e777f00 100644
--- a/src/include/Makefile.am
+++ b/src/include/Makefile.am
@@ -18,7 +18,6 @@ rados_include_DATA = \
$(srcdir)/include/crc32c.h
noinst_HEADERS += \
- include/bloom_filter.hpp \
include/Context.h \
include/CompatSet.h \
include/Distribution.h \
diff --git a/src/include/bloom_filter.hpp b/src/include/bloom_filter.hpp
deleted file mode 100644
index 41aba4bad47..00000000000
--- a/src/include/bloom_filter.hpp
+++ /dev/null
@@ -1,544 +0,0 @@
-/*
- *******************************************************************
- * *
- * Open Bloom Filter *
- * *
- * Author: Arash Partow - 2000 *
- * URL: http://www.partow.net/programming/hashfunctions/index.html *
- * *
- * Copyright notice: *
- * Free use of the Open Bloom Filter Library is permitted under *
- * the guidelines and in accordance with the most current version *
- * of the Boost Software License, Version 1.0 *
- * http://www.opensource.org/licenses/bsl1.0.html *
- * *
- *******************************************************************
-*/
-
-
-#ifndef INCLUDE_BLOOM_FILTER_HPP
-#define INCLUDE_BLOOM_FILTER_HPP
-
-#include <cstddef>
-#include <algorithm>
-#include <cmath>
-#include <limits>
-#include <string>
-#include <vector>
-
-
-static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
-static const unsigned char bit_mask[bits_per_char] = {
- 0x01, //00000001
- 0x02, //00000010
- 0x04, //00000100
- 0x08, //00001000
- 0x10, //00010000
- 0x20, //00100000
- 0x40, //01000000
- 0x80 //10000000
- };
-
-
-class bloom_filter
-{
-protected:
-
- typedef unsigned int bloom_type;
- typedef unsigned char cell_type;
-
-public:
-
- bloom_filter(const std::size_t& predicted_inserted_element_count,
- const double& false_positive_probability,
- const std::size_t& random_seed)
- : bit_table_(0),
- predicted_inserted_element_count_(predicted_inserted_element_count),
- inserted_element_count_(0),
- random_seed_((random_seed) ? random_seed : 0xA5A5A5A5),
- desired_false_positive_probability_(false_positive_probability)
- {
- find_optimal_parameters();
- generate_unique_salt();
- raw_table_size_ = table_size_ / bits_per_char;
- bit_table_ = new cell_type[raw_table_size_];
- std::fill_n(bit_table_,raw_table_size_,0x00);
- }
-
- bloom_filter(const bloom_filter& filter)
- {
- this->operator=(filter);
- }
-
- bloom_filter& operator = (const bloom_filter& filter)
- {
- if (this != &filter) {
- salt_count_ = filter.salt_count_;
- table_size_ = filter.table_size_;
- raw_table_size_ = filter.raw_table_size_;
- predicted_inserted_element_count_ = filter.predicted_inserted_element_count_;
- inserted_element_count_ = filter.inserted_element_count_;
- random_seed_ = filter.random_seed_;
- desired_false_positive_probability_ = filter.desired_false_positive_probability_;
- delete[] bit_table_;
- bit_table_ = new cell_type[raw_table_size_];
- std::copy(filter.bit_table_,filter.bit_table_ + raw_table_size_,bit_table_);
- salt_ = filter.salt_;
- }
- return *this;
- }
-
- virtual ~bloom_filter()
- {
- delete[] bit_table_;
- }
-
- inline bool operator!() const
- {
- return (0 == table_size_);
- }
-
- inline void clear()
- {
- std::fill_n(bit_table_,raw_table_size_,0x00);
- inserted_element_count_ = 0;
- }
-
- inline void insert(const unsigned char* key_begin, const std::size_t& length)
- {
- std::size_t bit_index = 0;
- std::size_t bit = 0;
- for (std::size_t i = 0; i < salt_.size(); ++i)
- {
- compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
- bit_table_[bit_index / bits_per_char] |= bit_mask[bit];
- }
- ++inserted_element_count_;
- }
-
- template<typename T>
- inline void insert(const T& t)
- {
- // Note: T must be a C++ POD type.
- insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T));
- }
-
- inline void insert(const std::string& key)
- {
- insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
- }
-
- inline void insert(const char* data, const std::size_t& length)
- {
- insert(reinterpret_cast<const unsigned char*>(data),length);
- }
-
- template<typename InputIterator>
- inline void insert(const InputIterator begin, const InputIterator end)
- {
- InputIterator itr = begin;
- while (end != itr)
- {
- insert(*(itr++));
- }
- }
-
- inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
- {
- std::size_t bit_index = 0;
- std::size_t bit = 0;
- for (std::size_t i = 0; i < salt_.size(); ++i)
- {
- compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
- if ((bit_table_[bit_index / bits_per_char] & bit_mask[bit]) != bit_mask[bit])
- {
- return false;
- }
- }
- return true;
- }
-
- template<typename T>
- inline bool contains(const T& t) const
- {
- return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T)));
- }
-
- inline bool contains(const std::string& key) const
- {
- return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
- }
-
- inline bool contains(const char* data, const std::size_t& length) const
- {
- return contains(reinterpret_cast<const unsigned char*>(data),length);
- }
-
- template<typename InputIterator>
- inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
- {
- InputIterator itr = begin;
- while (end != itr)
- {
- if (!contains(*itr))
- {
- return itr;
- }
- ++itr;
- }
- return end;
- }
-
- template<typename InputIterator>
- inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
- {
- InputIterator itr = begin;
- while (end != itr)
- {
- if (contains(*itr))
- {
- return itr;
- }
- ++itr;
- }
- return end;
- }
-
- inline virtual std::size_t size() const
- {
- return table_size_;
- }
-
- inline std::size_t element_count() const
- {
- return inserted_element_count_;
- }
-
- inline double effective_fpp() const
- {
- /*
- Note:
- The effective false positive probability is calculated using the
- designated table size and hash function count in conjunction with
- the current number of inserted elements - not the user defined
- predicated/expected number of inserted elements.
- */
- return std::pow(1.0 - std::exp(-1.0 * salt_.size() * inserted_element_count_ / size()), 1.0 * salt_.size());
- }
-
- inline bloom_filter& operator &= (const bloom_filter& filter)
- {
- /* intersection */
- if (
- (salt_count_ == filter.salt_count_) &&
- (table_size_ == filter.table_size_) &&
- (random_seed_ == filter.random_seed_)
- )
- {
- for (std::size_t i = 0; i < raw_table_size_; ++i)
- {
- bit_table_[i] &= filter.bit_table_[i];
- }
- }
- return *this;
- }
-
- inline bloom_filter& operator |= (const bloom_filter& filter)
- {
- /* union */
- if (
- (salt_count_ == filter.salt_count_) &&
- (table_size_ == filter.table_size_) &&
- (random_seed_ == filter.random_seed_)
- )
- {
- for (std::size_t i = 0; i < raw_table_size_; ++i)
- {
- bit_table_[i] |= filter.bit_table_[i];
- }
- }
- return *this;
- }
-
- inline bloom_filter& operator ^= (const bloom_filter& filter)
- {
- /* difference */
- if (
- (salt_count_ == filter.salt_count_) &&
- (table_size_ == filter.table_size_) &&
- (random_seed_ == filter.random_seed_)
- )
- {
- for (std::size_t i = 0; i < raw_table_size_; ++i)
- {
- bit_table_[i] ^= filter.bit_table_[i];
- }
- }
- return *this;
- }
-
- inline const cell_type* table() const
- {
- return bit_table_;
- }
-
-protected:
-
- inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
- {
- bit_index = hash % table_size_;
- bit = bit_index % bits_per_char;
- }
-
- void generate_unique_salt()
- {
- /*
- Note:
- A distinct hash function need not be implementation-wise
- distinct. In the current implementation "seeding" a common
- hash function with different values seems to be adequate.
- */
- const unsigned int predef_salt_count = 128;
- static const bloom_type predef_salt[predef_salt_count] =
- {
- 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
- 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
- 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
- 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
- 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
- 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
- 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
- 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
- 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
- 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
- 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
- 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
- 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
- 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
- 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
- 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
- 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
- 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
- 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
- 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
- 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
- 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
- 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
- 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
- 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
- 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
- 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
- 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
- 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
- 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
- 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
- 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
- };
-
- if (salt_count_ <= predef_salt_count)
- {
- std::copy(predef_salt,
- predef_salt + salt_count_,
- std::back_inserter(salt_));
- for (unsigned int i = 0; i < salt_.size(); ++i)
- {
- /*
- Note:
- This is done to integrate the user defined random seed,
- so as to allow for the generation of unique bloom filter
- instances.
- */
- salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_;
- }
- }
- else
- {
- std::copy(predef_salt,predef_salt + predef_salt_count,std::back_inserter(salt_));
- srand(static_cast<unsigned int>(random_seed_));
- while (salt_.size() < salt_count_)
- {
- bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
- if (0 == current_salt) continue;
- if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
- {
- salt_.push_back(current_salt);
- }
- }
- }
- }
-
- void find_optimal_parameters()
- {
- /*
- Note:
- The following will attempt to find the number of hash functions
- and minimum amount of storage bits required to construct a bloom
- filter consistent with the user defined false positive probability
- and estimated element insertion count.
- */
-
- double min_m = std::numeric_limits<double>::infinity();
- double min_k = 0.0;
- double curr_m = 0.0;
- double k = 1.0;
- while (k < 1000.0)
- {
- double numerator = (- k * predicted_inserted_element_count_);
- double denominator = std::log(1.0 - std::pow(desired_false_positive_probability_, 1.0 / k));
- curr_m = numerator / denominator;
-
- if (curr_m < min_m)
- {
- min_m = curr_m;
- min_k = k;
- }
- k += 1.0;
- }
-
- salt_count_ = static_cast<std::size_t>(min_k);
- table_size_ = static_cast<std::size_t>(min_m);
- table_size_ += (((table_size_ % bits_per_char) != 0) ? (bits_per_char - (table_size_ % bits_per_char)) : 0);
- }
-
- inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
- {
- const unsigned char* itr = begin;
-
- while (remaining_length >= 4)
- {
- hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
- hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
- hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
- hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
- remaining_length -= 4;
- }
-
- while (remaining_length >= 2)
- {
- hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
- hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
- remaining_length -= 2;
- }
-
- if (remaining_length)
- {
- hash ^= (hash << 7) ^ (*itr) * (hash >> 3);
- }
-
- return hash;
- }
-
- std::vector<bloom_type> salt_;
- unsigned char* bit_table_;
- std::size_t salt_count_;
- std::size_t table_size_;
- std::size_t raw_table_size_;
- std::size_t predicted_inserted_element_count_;
- std::size_t inserted_element_count_;
- std::size_t random_seed_;
- double desired_false_positive_probability_;
-};
-
-inline bloom_filter operator & (const bloom_filter& a, const bloom_filter& b)
-{
- bloom_filter result = a;
- result &= b;
- return result;
-}
-
-inline bloom_filter operator | (const bloom_filter& a, const bloom_filter& b)
-{
- bloom_filter result = a;
- result |= b;
- return result;
-}
-
-inline bloom_filter operator ^ (const bloom_filter& a, const bloom_filter& b)
-{
- bloom_filter result = a;
- result ^= b;
- return result;
-}
-
-
-class compressible_bloom_filter : public bloom_filter
-{
-public:
-
- compressible_bloom_filter(const std::size_t& predicted_element_count,
- const double& false_positive_probability,
- const std::size_t& random_seed)
- : bloom_filter(predicted_element_count,false_positive_probability,random_seed)
- {
- size_list.push_back(table_size_);
- }
-
- inline virtual std::size_t size() const
- {
- return size_list.back();
- }
-
- inline bool compress(const double& percentage)
- {
- if ((0.0 >= percentage) || (percentage >= 100.0))
- {
- return false;
- }
-
- std::size_t original_table_size = size_list.back();
- std::size_t new_table_size = static_cast<std::size_t>((size_list.back() * (1.0 - (percentage / 100.0))));
- new_table_size -= (((new_table_size % bits_per_char) != 0) ? (new_table_size % bits_per_char) : 0);
-
- if ((bits_per_char > new_table_size) || (new_table_size >= original_table_size))
- {
- return false;
- }
-
- desired_false_positive_probability_ = effective_fpp();
- cell_type* tmp = new cell_type[new_table_size / bits_per_char];
- std::copy(bit_table_, bit_table_ + (new_table_size / bits_per_char), tmp);
- cell_type* itr = bit_table_ + (new_table_size / bits_per_char);
- cell_type* end = bit_table_ + (original_table_size / bits_per_char);
- cell_type* itr_tmp = tmp;
-
- while (end != itr)
- {
- *(itr_tmp++) |= (*itr++);
- }
-
- delete[] bit_table_;
- bit_table_ = tmp;
- size_list.push_back(new_table_size);
-
- return true;
- }
-
-private:
-
- inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
- {
- bit_index = hash;
- for (std::size_t i = 0; i < size_list.size(); ++i)
- {
- bit_index %= size_list[i];
- }
- bit = bit_index % bits_per_char;
- }
-
- std::vector<std::size_t> size_list;
-};
-
-#endif
-
-
-/*
- Note 1:
- If it can be guaranteed that bits_per_char will be of the form 2^n then
- the following optimization can be used:
-
- hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)];
-
- Note 2:
- For performance reasons where possible when allocating memory it should
- be aligned (aligned_alloc) according to the architecture being used.
-*/
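
For reference, the just-removed find_optimal_parameters() searches k = 1..999 hash functions and keeps the k that minimizes the table size implied by the desired false-positive probability p and the predicted insert count n (formulas transcribed from the code above):

  m(k) = \frac{-k\,n}{\ln\left(1 - p^{1/k}\right)}

with m then rounded up to a multiple of bits_per_char, while effective_fpp() reports the realized rate for the current insert count:

  \left(1 - e^{-k\,n/m}\right)^{k}

The replacement header is src/common/bloom_filter.hpp, as the CDir.cc hunk below shows.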
diff --git a/src/include/buffer.h b/src/include/buffer.h
index f4a2f5c3264..ffa3d6e1b97 100644
--- a/src/include/buffer.h
+++ b/src/include/buffer.h
@@ -14,8 +14,6 @@
#ifndef CEPH_BUFFER_H
#define CEPH_BUFFER_H
-#include "include/int_types.h"
-
#if defined(__linux__)
#include <stdlib.h>
#include <linux/types.h>
@@ -46,6 +44,7 @@ void *valloc(size_t);
#include <malloc.h>
#endif
+#include <inttypes.h>
#include <stdint.h>
#include <string.h>
@@ -420,7 +419,7 @@ public:
ssize_t read_fd(int fd, size_t len);
int write_file(const char *fn, int mode=0644);
int write_fd(int fd) const;
- __u32 crc32c(__u32 crc) const;
+ uint32_t crc32c(uint32_t crc) const;
};
/*
@@ -428,7 +427,7 @@ public:
*/
class hash {
- __u32 crc;
+ uint32_t crc;
public:
hash() : crc(0) { }
@@ -437,7 +436,7 @@ public:
crc = bl.crc32c(crc);
}
- __u32 digest() {
+ uint32_t digest() {
return crc;
}
};
diff --git a/src/include/crc32c.h b/src/include/crc32c.h
index 8e22c624636..49d68474d68 100644
--- a/src/include/crc32c.h
+++ b/src/include/crc32c.h
@@ -1,8 +1,7 @@
#ifndef CEPH_CRC32C_H
#define CEPH_CRC32C_H
-#include "include/int_types.h"
-
+#include <inttypes.h>
#include <string.h>
typedef uint32_t (*ceph_crc32c_func_t)(uint32_t crc, unsigned char const *data, unsigned length);
diff --git a/src/include/rados/librados.h b/src/include/rados/librados.h
index a85ef3057bc..515663c2335 100644
--- a/src/include/rados/librados.h
+++ b/src/include/rados/librados.h
@@ -1,8 +1,6 @@
#ifndef CEPH_LIBRADOS_H
#define CEPH_LIBRADOS_H
-#include "include/int_types.h"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -566,7 +564,7 @@ int rados_pool_create_with_auid(rados_t cluster, const char *pool_name, uint64_t
* @returns 0 on success, negative error code on failure
*/
int rados_pool_create_with_crush_rule(rados_t cluster, const char *pool_name,
- __u8 crush_rule_num);
+ uint8_t crush_rule_num);
/**
* Create a pool with a specific CRUSH rule and auid
@@ -581,7 +579,7 @@ int rados_pool_create_with_crush_rule(rados_t cluster, const char *pool_name,
* @returns 0 on success, negative error code on failure
*/
int rados_pool_create_with_all(rados_t cluster, const char *pool_name, uint64_t auid,
- __u8 crush_rule_num);
+ uint8_t crush_rule_num);
/**
* Delete a pool and all data inside it
diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp
index 3f6d025ff41..c8de9f9df33 100644
--- a/src/include/rados/librados.hpp
+++ b/src/include/rados/librados.hpp
@@ -789,7 +789,12 @@ namespace librados
int cluster_stat(cluster_stat_t& result);
int cluster_fsid(std::string *fsid);
- /* pool aio */
+ /*
+ * pool aio
+ *
+ * It is up to the caller to release the completion, even if pool_create_async()
+ * and/or pool_delete_async() fail and never send the async request
+ */
static PoolAsyncCompletion *pool_async_create_completion();
// -- aio --
diff --git a/src/librados/PoolAsyncCompletionImpl.h b/src/librados/PoolAsyncCompletionImpl.h
index efb89641466..443b2c23a17 100644
--- a/src/librados/PoolAsyncCompletionImpl.h
+++ b/src/librados/PoolAsyncCompletionImpl.h
@@ -94,6 +94,9 @@ namespace librados {
C_PoolAsync_Safe(PoolAsyncCompletionImpl *_c) : c(_c) {
c->get();
}
+ ~C_PoolAsync_Safe() {
+ c->put();
+ }
void finish(int r) {
c->lock.Lock();
@@ -109,7 +112,7 @@ namespace librados {
c->lock.Lock();
}
- c->put_unlock();
+ c->lock.Unlock();
}
};
}
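
The fix above moves the reference drop into the destructor, so the completion is released exactly once on every exit path, and finish() now only unlocks rather than unlock-and-put. A generic sketch of that get-in-constructor / put-in-destructor pattern (the Guard name is hypothetical; get()/put() are the refcount calls shown in the hunk):

  // Hold a reference for the lifetime of the callback object.
  struct Guard {
    PoolAsyncCompletionImpl *c;
    explicit Guard(PoolAsyncCompletionImpl *impl) : c(impl) { c->get(); }
    ~Guard() { c->put(); }   // runs however finish() returns
  };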
diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index c77ca180a6f..4a5e636d9a6 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -27,7 +27,7 @@
#include "MDLog.h"
#include "LogSegment.h"
-#include "include/bloom_filter.hpp"
+#include "common/bloom_filter.hpp"
#include "include/Context.h"
#include "common/Clock.h"
diff --git a/src/mds/MDLog.cc b/src/mds/MDLog.cc
index bd89da71495..cacbebfd3f6 100644
--- a/src/mds/MDLog.cc
+++ b/src/mds/MDLog.cc
@@ -499,7 +499,11 @@ void MDLog::_replay_thread()
if (journaler->get_error()) {
r = journaler->get_error();
dout(0) << "_replay journaler got error " << r << ", aborting" << dendl;
- if (r == -EINVAL) {
+ if (r == -ENOENT) {
+ // journal has been trimmed by somebody else?
+ assert(journaler->is_readonly());
+ r = -EAGAIN;
+ } else if (r == -EINVAL) {
if (journaler->get_read_pos() < journaler->get_expire_pos()) {
// this should only happen if you're following somebody else
assert(journaler->is_readonly());
diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h
index 33e00a98d30..b7a5f853928 100644
--- a/src/mon/MonCommands.h
+++ b/src/mon/MonCommands.h
@@ -112,7 +112,7 @@ COMMAND("pg send_pg_creates", "trigger pg creates to be issued",\
"pg", "rw", "cli,rest")
COMMAND("pg dump " \
"name=dumpcontents,type=CephChoices,strings=all|summary|sum|delta|pools|osds|pgs|pgs_brief,n=N,req=false", \
- "show human-readable versions of pg map", "pg", "r", "cli,rest")
+ "show human-readable versions of pg map (only 'all' valid with plain)", "pg", "r", "cli,rest")
COMMAND("pg dump_json " \
"name=dumpcontents,type=CephChoices,strings=all|summary|sum|pools|osds|pgs,n=N,req=false", \
"show human-readable version of pg map in json only",\
@@ -518,6 +518,10 @@ COMMAND("osd pool set-quota " \
"name=field,type=CephChoices,strings=max_objects|max_bytes " \
"name=val,type=CephString",
"set object or byte limit on pool", "osd", "rw", "cli,rest")
+COMMAND("osd pool stats " \
+ "name=name,type=CephString,req=false",
+ "obtain stats from all pools, or from specified pool",
+ "osd", "r", "cli,rest")
COMMAND("osd reweight-by-utilization " \
"name=oload,type=CephInt,range=100,req=false", \
"reweight OSDs by utilization [overload-percentage-for-consideration, default 120]", \
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 2c64a8f2ef2..d8c90bc3d76 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -2561,67 +2561,98 @@ bool Monitor::_ms_dispatch(Message *m)
EntityName entity_name;
bool src_is_mon;
- src_is_mon = !connection || (connection->get_peer_type() & CEPH_ENTITY_TYPE_MON);
-
- if (connection) {
- bool reuse_caps = false;
- dout(20) << "have connection" << dendl;
- s = static_cast<MonSession *>(connection->get_priv());
- if (s && s->closed) {
- caps = s->caps;
- reuse_caps = true;
- s->put();
- s = NULL;
+ // regardless of who we are or who the sender is, the message must
+ // have an associated connection. If it doesn't, something fishy
+ // is going on.
+ assert(connection);
+
+ src_is_mon = (connection->get_peer_type() & CEPH_ENTITY_TYPE_MON);
+
+ bool reuse_caps = false;
+ dout(20) << "have connection" << dendl;
+ s = static_cast<MonSession *>(connection->get_priv());
+ if (s && s->closed) {
+ caps = s->caps;
+ reuse_caps = true;
+ s->put();
+ s = NULL;
+ }
+ if (!s) {
+ // if the sender is not a monitor, make sure its first message for a
+ // session is an MAuth. If it is not, treat it as a stray message:
+ // since we are creating a new session, the sender cannot have
+ // authenticated yet, and we have no way of assessing whether we
+ // should handle the message at all.
+ if (!src_is_mon && (m->get_type() != CEPH_MSG_AUTH &&
+ m->get_type() != CEPH_MSG_MON_GET_MAP)) {
+ dout(1) << __func__ << " dropping stray message " << *m
+ << " from " << m->get_source_inst() << dendl;
+ return false;
}
- if (!s) {
- if (!exited_quorum.is_zero() && !src_is_mon) {
- waitlist_or_zap_client(m);
- return true;
- }
- dout(10) << "do not have session, making new one" << dendl;
- s = session_map.new_session(m->get_source_inst(), m->get_connection().get());
- m->get_connection()->set_priv(s->get());
- dout(10) << "ms_dispatch new session " << s << " for " << s->inst << dendl;
-
- if (m->get_connection()->get_peer_type() != CEPH_ENTITY_TYPE_MON) {
- dout(10) << "setting timeout on session" << dendl;
- // set an initial timeout here, so we will trim this session even if they don't
- // do anything.
- s->until = ceph_clock_now(g_ceph_context);
- s->until += g_conf->mon_subscribe_interval;
- } else {
- //give it monitor caps; the peer type has been authenticated
- reuse_caps = false;
- dout(5) << "setting monitor caps on this connection" << dendl;
- if (!s->caps.is_allow_all()) //but no need to repeatedly copy
- s->caps = *mon_caps;
- }
- if (reuse_caps)
- s->caps = caps;
+
+ if (!exited_quorum.is_zero() && !src_is_mon) {
+ waitlist_or_zap_client(m);
+ return true;
+ }
+
+ dout(10) << "do not have session, making new one" << dendl;
+ s = session_map.new_session(m->get_source_inst(), m->get_connection().get());
+ m->get_connection()->set_priv(s->get());
+ dout(10) << "ms_dispatch new session " << s << " for " << s->inst << dendl;
+
+ if (!src_is_mon) {
+ dout(10) << "setting timeout on session" << dendl;
+ // set an initial timeout here, so we will trim this session even if they don't
+ // do anything.
+ s->until = ceph_clock_now(g_ceph_context);
+ s->until += g_conf->mon_subscribe_interval;
} else {
- dout(20) << "ms_dispatch existing session " << s << " for " << s->inst << dendl;
+ //give it monitor caps; the peer type has been authenticated
+ reuse_caps = false;
+ dout(5) << "setting monitor caps on this connection" << dendl;
+ if (!s->caps.is_allow_all()) //but no need to repeatedly copy
+ s->caps = *mon_caps;
}
+ if (reuse_caps)
+ s->caps = caps;
+ } else {
+ dout(20) << "ms_dispatch existing session " << s << " for " << s->inst << dendl;
+ }
+
+ if (s) {
if (s->auth_handler) {
entity_name = s->auth_handler->get_entity_name();
}
- }
-
- if (s)
dout(20) << " caps " << s->caps.get_str() << dendl;
+ }
if (is_synchronizing() && !src_is_mon) {
waitlist_or_zap_client(m);
return true;
}
- {
- switch (m->get_type()) {
-
+ ret = dispatch(s, m, src_is_mon);
+
+ if (s) {
+ s->put();
+ }
+
+ return ret;
+}
+
+bool Monitor::dispatch(MonSession *s, Message *m, const bool src_is_mon)
+{
+ bool ret = true;
+
+ assert(m != NULL);
+
+ switch (m->get_type()) {
+
case MSG_ROUTE:
handle_route(static_cast<MRoute*>(m));
break;
- // misc
+ // misc
case CEPH_MSG_MON_GET_MAP:
handle_mon_get_map(static_cast<MMonGetMap*>(m));
break;
@@ -2647,12 +2678,11 @@ bool Monitor::_ms_dispatch(Message *m)
case MSG_MON_SYNC:
handle_sync(static_cast<MMonSync*>(m));
break;
-
case MSG_MON_SCRUB:
handle_scrub(static_cast<MMonScrub*>(m));
break;
- // OSDs
+ // OSDs
case MSG_OSD_MARK_ME_DOWN:
case MSG_OSD_FAILURE:
case MSG_OSD_BOOT:
@@ -2665,20 +2695,20 @@ bool Monitor::_ms_dispatch(Message *m)
paxos_service[PAXOS_OSDMAP]->dispatch((PaxosServiceMessage*)m);
break;
- // MDSs
+ // MDSs
case MSG_MDS_BEACON:
case MSG_MDS_OFFLOAD_TARGETS:
paxos_service[PAXOS_MDSMAP]->dispatch((PaxosServiceMessage*)m);
break;
- // auth
+ // auth
case MSG_MON_GLOBAL_ID:
case CEPH_MSG_AUTH:
/* no need to check caps here */
paxos_service[PAXOS_AUTH]->dispatch((PaxosServiceMessage*)m);
break;
- // pg
+ // pg
case CEPH_MSG_STATFS:
case MSG_PGSTATS:
case MSG_GETPOOLSTATS:
@@ -2689,7 +2719,7 @@ bool Monitor::_ms_dispatch(Message *m)
paxos_service[PAXOS_OSDMAP]->dispatch((PaxosServiceMessage*)m);
break;
- // log
+ // log
case MSG_LOG:
paxos_service[PAXOS_LOG]->dispatch((PaxosServiceMessage*)m);
break;
@@ -2698,60 +2728,60 @@ bool Monitor::_ms_dispatch(Message *m)
clog.handle_log_ack((MLogAck*)m);
break;
- // monmap
+ // monmap
case MSG_MON_JOIN:
paxos_service[PAXOS_MONMAP]->dispatch((PaxosServiceMessage*)m);
break;
- // paxos
+ // paxos
case MSG_MON_PAXOS:
{
- MMonPaxos *pm = static_cast<MMonPaxos*>(m);
- if (!src_is_mon &&
- !s->is_capable("mon", MON_CAP_X)) {
- //can't send these!
- pm->put();
- break;
- }
+ MMonPaxos *pm = static_cast<MMonPaxos*>(m);
+ if (!src_is_mon ||
+ !s->is_capable("mon", MON_CAP_X)) {
+ //can't send these!
+ pm->put();
+ break;
+ }
- if (state == STATE_SYNCHRONIZING) {
- // we are synchronizing. These messages would do us no
- // good, thus just drop them and ignore them.
- dout(10) << __func__ << " ignore paxos msg from "
- << pm->get_source_inst() << dendl;
- pm->put();
- break;
- }
+ if (state == STATE_SYNCHRONIZING) {
+ // we are synchronizing. These messages would do us no
+ // good, thus just drop them and ignore them.
+ dout(10) << __func__ << " ignore paxos msg from "
+ << pm->get_source_inst() << dendl;
+ pm->put();
+ break;
+ }
- // sanitize
- if (pm->epoch > get_epoch()) {
- bootstrap();
- pm->put();
- break;
- }
- if (pm->epoch != get_epoch()) {
- pm->put();
- break;
- }
+ // sanitize
+ if (pm->epoch > get_epoch()) {
+ bootstrap();
+ pm->put();
+ break;
+ }
+ if (pm->epoch != get_epoch()) {
+ pm->put();
+ break;
+ }
- paxos->dispatch((PaxosServiceMessage*)m);
+ paxos->dispatch((PaxosServiceMessage*)m);
}
break;
- // elector messages
+ // elector messages
case MSG_MON_ELECTION:
//check privileges here for simplicity
if (s &&
- !s->is_capable("mon", MON_CAP_X)) {
- dout(0) << "MMonElection received from entity without enough caps!"
- << s->caps << dendl;
- m->put();
- break;
+ !s->is_capable("mon", MON_CAP_X)) {
+ dout(0) << "MMonElection received from entity without enough caps!"
+ << s->caps << dendl;
+ m->put();
+ break;
}
if (!is_probing() && !is_synchronizing()) {
- elector.dispatch(m);
+ elector.dispatch(m);
} else {
- m->put();
+ m->put();
}
break;
@@ -2769,10 +2799,6 @@ bool Monitor::_ms_dispatch(Message *m)
default:
ret = false;
- }
- }
- if (s) {
- s->put();
}
return ret;
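
After this refactor, _ms_dispatch() keeps all session/caps handling and hands the per-type switch to the new dispatch(). A condensed sketch of the resulting control flow (not the literal code):

  bool Monitor::_ms_dispatch(Message *m) {
    assert(m->get_connection());               // every message needs one now
    MonSession *s = /* reuse or create session, set caps/timeouts */;
    if (is_synchronizing() && !src_is_mon) {
      waitlist_or_zap_client(m);
      return true;
    }
    bool ret = dispatch(s, m, src_is_mon);     // pure message-type switch
    if (s)
      s->put();
    return ret;
  }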
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h
index 9b304428732..2c1c2cdeb19 100644
--- a/src/mon/Monitor.h
+++ b/src/mon/Monitor.h
@@ -700,6 +700,8 @@ public:
lock.Unlock();
return ret;
}
+ // dissociate message handling from session and connection logic
+ bool dispatch(MonSession *s, Message *m, const bool src_is_mon);
//mon_caps is used for un-connected messages from monitors
MonCap * mon_caps;
bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new);
diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc
index 799f19df154..ca855592445 100644
--- a/src/mon/MonmapMonitor.cc
+++ b/src/mon/MonmapMonitor.cc
@@ -298,20 +298,45 @@ bool MonmapMonitor::prepare_command(MMonCommand *m)
addr.set_port(CEPH_MON_PORT);
}
- if (pending_map.contains(addr) ||
- pending_map.contains(name)) {
+ /**
+ * If we have a monitor with the same name and different addr, then EEXIST
+ * If we have a monitor with the same addr and different name, then EEXIST
+ * If we have a monitor with the same addr and same name, then return as if
+ * we had just added the monitor.
+ * If we don't have the monitor, add it.
+ */
+
+ err = 0;
+ if (!ss.str().empty())
+ ss << "; ";
+
+ do {
+ if (pending_map.contains(addr)) {
+ string n = pending_map.get_name(addr);
+ if (n == name)
+ break;
+ } else if (pending_map.contains(name)) {
+ entity_addr_t tmp_addr = pending_map.get_addr(name);
+ if (tmp_addr == addr)
+ break;
+ } else {
+ break;
+ }
err = -EEXIST;
- if (!ss.str().empty())
- ss << "; ";
- ss << "mon " << name << " " << addr << " already exists";
+ ss << "mon." << name << " at " << addr << " already exists";
+ goto out;
+ } while (false);
+
+ ss << "added mon." << name << " at " << addr;
+ if (pending_map.contains(name)) {
goto out;
}
pending_map.add(name, addr);
pending_map.last_changed = ceph_clock_now(g_ceph_context);
- ss << "added mon." << name << " at " << addr;
getline(ss, rs);
- wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs, get_last_committed()));
+ wait_for_finished_proposal(new Monitor::C_Command(mon, m, 0, rs,
+ get_last_committed()));
return true;
} else if (prefix == "mon remove") {
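The rewritten branch above reduces to a small decision table (checks against pending_map):

  same name, same addr   -> success, treated as already added (no re-add)
  same addr, other name  -> -EEXIST
  same name, other addr  -> -EEXIST
  neither known          -> pending_map.add(name, addr)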
diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc
index 9144736d801..9d36e87788d 100644
--- a/src/mon/OSDMonitor.cc
+++ b/src/mon/OSDMonitor.cc
@@ -2296,6 +2296,105 @@ bool OSDMonitor::preprocess_command(MMonCommand *m)
}
r = 0;
+ } else if (prefix == "osd pool stats") {
+ string pool_name;
+ cmd_getval(g_ceph_context, cmdmap, "name", pool_name);
+
+ PGMap& pg_map = mon->pgmon()->pg_map;
+
+ int64_t poolid = -ENOENT;
+ bool one_pool = false;
+ if (!pool_name.empty()) {
+ poolid = osdmap.lookup_pg_pool_name(pool_name);
+ if (poolid < 0) {
+ assert(poolid == -ENOENT);
+ ss << "unrecognized pool '" << pool_name << "'";
+ r = -ENOENT;
+ goto reply;
+ }
+ one_pool = true;
+ }
+
+ stringstream rs;
+
+ if (f)
+ f->open_array_section("pool_stats");
+ if (osdmap.get_pools().size() == 0) {
+ if (!f)
+ ss << "there are no pools!";
+ goto stats_out;
+ }
+
+ for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
+ it != osdmap.get_pools().end();
+ ++it) {
+
+ if (!one_pool)
+ poolid = it->first;
+
+ pool_name = osdmap.get_pool_name(poolid);
+
+ if (f) {
+ f->open_object_section("pool");
+ f->dump_string("pool_name", pool_name.c_str());
+ f->dump_int("pool_id", poolid);
+ f->open_object_section("recovery");
+ }
+
+ stringstream rss, tss;
+ pg_map.pool_recovery_summary(f.get(), &rss, poolid);
+ if (!f && !rss.str().empty())
+ tss << " " << rss.str() << "\n";
+
+ if (f) {
+ f->close_section();
+ f->open_object_section("recovery_rate");
+ }
+
+ rss.clear();
+ rss.str("");
+
+ pg_map.pool_recovery_rate_summary(f.get(), &rss, poolid);
+ if (!f && !rss.str().empty())
+ tss << " recovery io " << rss.str() << "\n";
+
+ if (f) {
+ f->close_section();
+ f->open_object_section("client_io_rate");
+ }
+
+ rss.clear();
+ rss.str("");
+
+ pg_map.pool_client_io_rate_summary(f.get(), &rss, poolid);
+ if (!f && !rss.str().empty())
+ tss << " client io " << rss.str() << "\n";
+
+ if (f) {
+ f->close_section();
+ f->close_section();
+ } else {
+ rs << "pool " << pool_name << " id " << poolid << "\n";
+ if (!tss.str().empty())
+ rs << tss.str() << "\n";
+ else
+ rs << " nothing is going on\n\n";
+ }
+
+ if (one_pool)
+ break;
+ }
+
+stats_out:
+ if (f) {
+ f->close_section();
+ f->flush(rdata);
+ } else {
+ rdata.append(rs.str());
+ }
+ rdata.append("\n");
+ r = 0;
+
} else if (prefix == "osd crush rule list" ||
prefix == "osd crush rule ls") {
string format;
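
Illustrative invocations of the new command (pool name hypothetical; in plain mode the output is assembled exactly as in the loop above, one "pool <name> id <id>" header per pool followed by recovery and client-io lines, or " nothing is going on" when both deltas are empty):

  ceph osd pool stats            # all pools
  ceph osd pool stats mypool     # a single pool; -ENOENT if unknown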
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
index e9a35c6b8ab..39cb30f97c8 100644
--- a/src/mon/PGMap.cc
+++ b/src/mon/PGMap.cc
@@ -30,7 +30,7 @@ void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const
return;
}
- ENCODE_START(6, 5, bl);
+ ENCODE_START(7, 5, bl);
::encode(version, bl);
::encode(pg_stat_updates, bl);
::encode(osd_stat_updates, bl);
@@ -41,6 +41,7 @@ void PGMap::Incremental::encode(bufferlist &bl, uint64_t features) const
::encode(nearfull_ratio, bl);
::encode(pg_remove, bl);
::encode(stamp, bl);
+ ::encode(osd_epochs, bl);
ENCODE_FINISH(bl);
}
@@ -89,6 +90,17 @@ void PGMap::Incremental::decode(bufferlist::iterator &bl)
}
if (struct_v >= 6)
::decode(stamp, bl);
+ if (struct_v >= 7) {
+ ::decode(osd_epochs, bl);
+ } else {
+ for (map<int32_t, osd_stat_t>::iterator i = osd_stat_updates.begin();
+ i != osd_stat_updates.end();
+ ++i) {
+ // This isn't accurate, but it makes trimming behave as it
+ // did previously.
+ osd_epochs.insert(make_pair(i->first, osdmap_epoch));
+ }
+ }
DECODE_FINISH(bl);
}
@@ -140,6 +152,7 @@ void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
o.back()->version = 2;
o.back()->pg_stat_updates[pg_t(1,2,3)] = pg_stat_t();
o.back()->osd_stat_updates[5] = osd_stat_t();
+ o.back()->osd_epochs[5] = 12;
o.push_back(new Incremental);
o.back()->version = 3;
o.back()->osdmap_epoch = 1;
@@ -148,6 +161,7 @@ void PGMap::Incremental::generate_test_instances(list<PGMap::Incremental*>& o)
o.back()->nearfull_ratio = .3;
o.back()->pg_stat_updates[pg_t(4,5,6)] = pg_stat_t();
o.back()->osd_stat_updates[6] = osd_stat_t();
+ o.back()->osd_epochs[6] = 12;
o.back()->pg_remove.insert(pg_t(1,2,3));
o.back()->osd_stat_rm.insert(5);
}
@@ -166,6 +180,7 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
stamp = inc.stamp;
pool_stat_t pg_sum_old = pg_sum;
+ hash_map<uint64_t, pool_stat_t> pg_pool_sum_old;
bool ratios_changed = false;
if (inc.full_ratio != full_ratio && inc.full_ratio != -1) {
@@ -185,6 +200,9 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
const pg_t &update_pg(p->first);
const pg_stat_t &update_stat(p->second);
+ if (pg_pool_sum_old.count(update_pg.pool()) == 0)
+ pg_pool_sum_old[update_pg.pool()] = pg_pool_sum[update_pg.pool()];
+
hash_map<pg_t,pg_stat_t>::iterator t = pg_stat.find(update_pg);
if (t == pg_stat.end()) {
hash_map<pg_t,pg_stat_t>::value_type v(update_pg, update_stat);
@@ -195,12 +213,14 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
}
stat_pg_add(update_pg, update_stat);
}
- for (map<int32_t,osd_stat_t>::const_iterator p = inc.osd_stat_updates.begin();
- p != inc.osd_stat_updates.end();
+ assert(osd_stat.size() == osd_epochs.size());
+ for (map<int32_t,osd_stat_t>::const_iterator p =
+ inc.get_osd_stat_updates().begin();
+ p != inc.get_osd_stat_updates().end();
++p) {
int osd = p->first;
const osd_stat_t &new_stats(p->second);
-
+
hash_map<int32_t,osd_stat_t>::iterator t = osd_stat.find(osd);
if (t == osd_stat.end()) {
hash_map<int32_t,osd_stat_t>::value_type v(osd, new_stats);
@@ -209,9 +229,11 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
stat_osd_sub(t->second);
t->second = new_stats;
}
+ assert(inc.get_osd_epochs().find(osd) != inc.get_osd_epochs().end());
+ osd_epochs.insert(*(inc.get_osd_epochs().find(osd)));
stat_osd_add(new_stats);
-
+
// adjust [near]full status
register_nearfull_status(osd, new_stats);
}
@@ -225,9 +247,9 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
pg_stat.erase(s);
}
}
-
- for (set<int>::iterator p = inc.osd_stat_rm.begin();
- p != inc.osd_stat_rm.end();
+
+ for (set<int>::iterator p = inc.get_osd_stat_rm().begin();
+ p != inc.get_osd_stat_rm().end();
++p) {
hash_map<int32_t,osd_stat_t>::iterator t = osd_stat.find(*p);
if (t != osd_stat.end()) {
@@ -252,7 +274,9 @@ void PGMap::apply_incremental(CephContext *cct, const Incremental& inc)
stamp_delta -= pg_sum_deltas.front().second;
pg_sum_deltas.pop_front();
}
-
+
+ update_pool_deltas(cct, inc.stamp, pg_pool_sum_old);
+
if (inc.osdmap_epoch)
last_osdmap_epoch = inc.osdmap_epoch;
if (inc.pg_scan)
@@ -416,6 +440,14 @@ epoch_t PGMap::calc_min_last_epoch_clean() const
if (lec < min)
min = lec;
}
+ // also scan osd epochs
+ // don't trim past the oldest reported osd epoch
+ for (hash_map<int32_t, epoch_t>::const_iterator i = osd_epochs.begin();
+ i != osd_epochs.end();
+ ++i) {
+ if (i->second < min)
+ min = i->second;
+ }
return min;
}
@@ -434,7 +466,7 @@ void PGMap::encode(bufferlist &bl, uint64_t features) const
return;
}
- ENCODE_START(5, 4, bl);
+ ENCODE_START(6, 4, bl);
::encode(version, bl);
::encode(pg_stat, bl);
::encode(osd_stat, bl);
@@ -443,6 +475,7 @@ void PGMap::encode(bufferlist &bl, uint64_t features) const
::encode(full_ratio, bl);
::encode(nearfull_ratio, bl);
::encode(stamp, bl);
+ ::encode(osd_epochs, bl);
ENCODE_FINISH(bl);
}
@@ -472,6 +505,17 @@ void PGMap::decode(bufferlist::iterator &bl)
}
if (struct_v >= 5)
::decode(stamp, bl);
+ if (struct_v >= 6) {
+ ::decode(osd_epochs, bl);
+ } else {
+ for (hash_map<int32_t, osd_stat_t>::iterator i = osd_stat.begin();
+ i != osd_stat.end();
+ ++i) {
+ // This isn't accurate, but it makes trimming behave as it
+ // did previously.
+ osd_epochs.insert(make_pair(i->first, last_osdmap_epoch));
+ }
+ }
DECODE_FINISH(bl);
calc_stats();
@@ -488,7 +532,10 @@ void PGMap::dirty_all(Incremental& inc)
inc.pg_stat_updates[p->first] = p->second;
}
for (hash_map<int32_t, osd_stat_t>::const_iterator p = osd_stat.begin(); p != osd_stat.end(); ++p) {
- inc.osd_stat_updates[p->first] = p->second;
+ assert(inc.get_osd_epochs().count(p->first));
+ inc.update_stat(p->first,
+ inc.get_osd_epochs().find(p->first)->second,
+ p->second);
}
}
@@ -701,7 +748,8 @@ void PGMap::dump_stuck_plain(ostream& ss, PGMap::StuckPG type, utime_t cutoff) c
{
hash_map<pg_t, pg_stat_t> stuck_pg_stats;
get_stuck_stats(type, cutoff, stuck_pg_stats);
- dump_pg_stats_plain(ss, stuck_pg_stats);
+ if (!stuck_pg_stats.empty())
+ dump_pg_stats_plain(ss, stuck_pg_stats);
}
void PGMap::dump_osd_perf_stats(Formatter *f) const
@@ -738,54 +786,59 @@ void PGMap::print_osd_perf_stats(std::ostream *ss) const
(*ss) << tab;
}
-void PGMap::recovery_summary(Formatter *f, ostream *out) const
+void PGMap::recovery_summary(Formatter *f, ostream *out,
+ pool_stat_t delta_sum) const
{
bool first = true;
- if (pg_sum.stats.sum.num_objects_degraded) {
- double pc = (double)pg_sum.stats.sum.num_objects_degraded / (double)pg_sum.stats.sum.num_object_copies * (double)100.0;
+ if (delta_sum.stats.sum.num_objects_degraded) {
+ double pc = (double)delta_sum.stats.sum.num_objects_degraded /
+ (double)delta_sum.stats.sum.num_object_copies * (double)100.0;
char b[20];
snprintf(b, sizeof(b), "%.3lf", pc);
if (f) {
- f->dump_unsigned("degraded_objects", pg_sum.stats.sum.num_objects_degraded);
- f->dump_unsigned("degraded_total", pg_sum.stats.sum.num_object_copies);
+ f->dump_unsigned("degraded_objects", delta_sum.stats.sum.num_objects_degraded);
+ f->dump_unsigned("degraded_total", delta_sum.stats.sum.num_object_copies);
f->dump_string("degrated_ratio", b);
} else {
- *out << pg_sum.stats.sum.num_objects_degraded
- << "/" << pg_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)";
+ *out << delta_sum.stats.sum.num_objects_degraded
+ << "/" << delta_sum.stats.sum.num_object_copies << " objects degraded (" << b << "%)";
}
first = false;
}
- if (pg_sum.stats.sum.num_objects_unfound) {
- double pc = (double)pg_sum.stats.sum.num_objects_unfound / (double)pg_sum.stats.sum.num_objects * (double)100.0;
+ if (delta_sum.stats.sum.num_objects_unfound) {
+ double pc = (double)delta_sum.stats.sum.num_objects_unfound /
+ (double)delta_sum.stats.sum.num_objects * (double)100.0;
char b[20];
snprintf(b, sizeof(b), "%.3lf", pc);
if (f) {
- f->dump_unsigned("unfound_objects", pg_sum.stats.sum.num_objects_unfound);
- f->dump_unsigned("unfound_total", pg_sum.stats.sum.num_objects);
+ f->dump_unsigned("unfound_objects", delta_sum.stats.sum.num_objects_unfound);
+ f->dump_unsigned("unfound_total", delta_sum.stats.sum.num_objects);
f->dump_string("unfound_ratio", b);
} else {
if (!first)
*out << "; ";
- *out << pg_sum.stats.sum.num_objects_unfound
- << "/" << pg_sum.stats.sum.num_objects << " unfound (" << b << "%)";
+ *out << delta_sum.stats.sum.num_objects_unfound
+ << "/" << delta_sum.stats.sum.num_objects << " unfound (" << b << "%)";
}
first = false;
}
}
-void PGMap::recovery_rate_summary(Formatter *f, ostream *out) const
+void PGMap::recovery_rate_summary(Formatter *f, ostream *out,
+ pool_stat_t delta_sum,
+ utime_t delta_stamp) const
{
// make non-negative; we can get negative values if osds send
// uncommitted stats and then "go backward" or if they are just
// buggy/wrong.
- pool_stat_t pos_delta = pg_sum_delta;
+ pool_stat_t pos_delta = delta_sum;
pos_delta.floor(0);
if (pos_delta.stats.sum.num_objects_recovered ||
pos_delta.stats.sum.num_bytes_recovered ||
pos_delta.stats.sum.num_keys_recovered) {
- int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)stamp_delta;
- int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)stamp_delta;
- int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)stamp_delta;
+ int64_t objps = pos_delta.stats.sum.num_objects_recovered / (double)delta_stamp;
+ int64_t bps = pos_delta.stats.sum.num_bytes_recovered / (double)delta_stamp;
+ int64_t kps = pos_delta.stats.sum.num_keys_recovered / (double)delta_stamp;
if (f) {
f->dump_int("recovering_objects_per_sec", objps);
f->dump_int("recovering_bytes_per_sec", bps);
@@ -799,24 +852,194 @@ void PGMap::recovery_rate_summary(Formatter *f, ostream *out) const
}
}
-void PGMap::update_delta(CephContext *cct, utime_t inc_stamp, pool_stat_t& pg_sum_old)
+void PGMap::overall_recovery_rate_summary(Formatter *f, ostream *out) const
+{
+ recovery_rate_summary(f, out, pg_sum_delta, stamp_delta);
+}
+
+void PGMap::overall_recovery_summary(Formatter *f, ostream *out) const
+{
+ recovery_summary(f, out, pg_sum);
+}
+
+void PGMap::pool_recovery_rate_summary(Formatter *f, ostream *out,
+ uint64_t poolid) const
+{
+ hash_map<uint64_t,pair<pool_stat_t,utime_t> >::const_iterator p =
+ per_pool_sum_delta.find(poolid);
+ if (p == per_pool_sum_delta.end())
+ return;
+ hash_map<uint64_t,utime_t>::const_iterator ts =
+ per_pool_sum_deltas_stamps.find(p->first);
+ assert(ts != per_pool_sum_deltas_stamps.end());
+ recovery_rate_summary(f, out, p->second.first, ts->second);
+}
+
+void PGMap::pool_recovery_summary(Formatter *f, ostream *out,
+ uint64_t poolid) const
+{
+ hash_map<uint64_t,pair<pool_stat_t,utime_t> >::const_iterator p =
+ per_pool_sum_delta.find(poolid);
+ if (p == per_pool_sum_delta.end())
+ return;
+ recovery_summary(f, out, p->second.first);
+}
+
+void PGMap::client_io_rate_summary(Formatter *f, ostream *out,
+ pool_stat_t delta_sum,
+ utime_t delta_stamp) const
{
+ pool_stat_t pos_delta = delta_sum;
+ pos_delta.floor(0);
+ if (pos_delta.stats.sum.num_rd ||
+ pos_delta.stats.sum.num_wr) {
+ if (pos_delta.stats.sum.num_rd) {
+ int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)delta_stamp;
+ if (f) {
+ f->dump_int("read_bytes_sec", rd);
+ } else {
+ *out << pretty_si_t(rd) << "B/s rd, ";
+ }
+ }
+ if (pos_delta.stats.sum.num_wr) {
+ int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)delta_stamp;
+ if (f) {
+ f->dump_int("write_bytes_sec", wr);
+ } else {
+ *out << pretty_si_t(wr) << "B/s wr, ";
+ }
+ }
+ int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)delta_stamp;
+ if (f) {
+ f->dump_int("op_per_sec", iops);
+ } else {
+ *out << pretty_si_t(iops) << "op/s";
+ }
+ }
+}
+
+void PGMap::overall_client_io_rate_summary(Formatter *f, ostream *out) const
+{
+ client_io_rate_summary(f, out, pg_sum_delta, stamp_delta);
+}
+
+void PGMap::pool_client_io_rate_summary(Formatter *f, ostream *out,
+ uint64_t poolid) const
+{
+ hash_map<uint64_t,pair<pool_stat_t,utime_t> >::const_iterator p =
+ per_pool_sum_delta.find(poolid);
+ if (p == per_pool_sum_delta.end())
+ return;
+ hash_map<uint64_t,utime_t>::const_iterator ts =
+ per_pool_sum_deltas_stamps.find(p->first);
+ assert(ts != per_pool_sum_deltas_stamps.end());
+ client_io_rate_summary(f, out, p->second.first, ts->second);
+}
+
+/**
+ * update aggregated delta
+ *
+ * @param cct ceph context
+ * @param ts Timestamp for the stats being delta'ed
+ * @param old_pool_sum Previous stats sum
+ * @param last_ts Last timestamp for pool
+ * @param result_pool_sum Resulting stats
+ * @param result_ts_delta Resulting timestamp delta
+ * @param delta_avg_list List of last N computed deltas, used to average
+ */
+void PGMap::update_delta(CephContext *cct,
+ const utime_t ts,
+ const pool_stat_t& old_pool_sum,
+ utime_t *last_ts,
+ const pool_stat_t& current_pool_sum,
+ pool_stat_t *result_pool_delta,
+ utime_t *result_ts_delta,
+ list<pair<pool_stat_t,utime_t> > *delta_avg_list)
+{
+ /* @p ts is the timestamp we want to associate with the data
+ * in @p old_pool_sum; the time elapsed since the last update is
+ * derived from it and stored in 'delta_t'.
+ */
utime_t delta_t;
- delta_t = inc_stamp;
- delta_t -= stamp;
- stamp = inc_stamp;
+ delta_t = ts; // start with the provided timestamp
+ delta_t -= *last_ts; // take the last timestamp we saw
+ *last_ts = ts; // @p ts becomes the last timestamp we saw
// calculate a delta, and average over the last 2 deltas.
- pool_stat_t d = pg_sum;
- d.stats.sub(pg_sum_old.stats);
- pg_sum_deltas.push_back(make_pair(d, delta_t));
- stamp_delta += delta_t;
+ /* start by taking a copy of our current @p current_pool_sum, and by
+ * subtracting the stats in @p old_pool_sum. This generates a stats
+ * delta. Stash this stats delta in @p delta_avg_list, along with the
+ * timestamp delta for these results.
+ */
+ pool_stat_t d = current_pool_sum;
+ d.stats.sub(old_pool_sum.stats);
+ delta_avg_list->push_back(make_pair(d,delta_t));
+ *result_ts_delta += delta_t;
+
+ /* Aggregate the current delta and, once we exceed the smoothing window,
+ * subtract the oldest delta so the result remains a moving average.
+ */
+ result_pool_delta->stats.add(d.stats);
+ size_t s = MAX(1, cct ? cct->_conf->mon_stat_smooth_intervals : 1);
+ if (delta_avg_list->size() > s) {
+ result_pool_delta->stats.sub(delta_avg_list->front().first.stats);
+ *result_ts_delta -= delta_avg_list->front().second;
+ delta_avg_list->pop_front();
+ }
+}
- pg_sum_delta.stats.add(d.stats);
- if (pg_sum_deltas.size() > (std::list< pair<pool_stat_t, utime_t> >::size_type)MAX(1, cct ? cct->_conf->mon_stat_smooth_intervals : 1)) {
- pg_sum_delta.stats.sub(pg_sum_deltas.front().first.stats);
- stamp_delta -= pg_sum_deltas.front().second;
- pg_sum_deltas.pop_front();
+/**
+ * update aggregated delta
+ *
+ * @param cct ceph context
+ * @param ts Timestamp
+ * @param pg_sum_old Old pg_sum
+ */
+void PGMap::update_global_delta(CephContext *cct,
+ const utime_t ts, const pool_stat_t& pg_sum_old)
+{
+ update_delta(cct, ts, pg_sum_old, &stamp, pg_sum, &pg_sum_delta,
+ &stamp_delta, &pg_sum_deltas);
+}
+
+/**
+ * Update a given pool's deltas
+ *
+ * @param cct Ceph Context
+ * @param ts Timestamp for the stats being delta'ed
+ * @param pool Pool's id
+ * @param old_pool_sum Previous stats sum
+ */
+void PGMap::update_one_pool_delta(CephContext *cct,
+ const utime_t ts,
+ const uint64_t pool,
+ const pool_stat_t& old_pool_sum)
+{
+ if (per_pool_sum_deltas.count(pool) == 0) {
+ assert(per_pool_sum_deltas_stamps.count(pool) == 0);
+ assert(per_pool_sum_delta.count(pool) == 0);
+ }
+
+ pair<pool_stat_t,utime_t>& sum_delta = per_pool_sum_delta[pool];
+
+ update_delta(cct, ts, old_pool_sum, &sum_delta.second, pg_pool_sum[pool],
+ &sum_delta.first, &per_pool_sum_deltas_stamps[pool],
+ &per_pool_sum_deltas[pool]);
+}
+
+/**
+ * Update pools' deltas
+ *
+ * @param cct CephContext
+ * @param ts Timestamp for the stats being delta'ed
+ * @param pg_pool_sum_old Map of pool stats for delta calcs.
+ */
+void PGMap::update_pool_deltas(CephContext *cct, const utime_t ts,
+ const hash_map<uint64_t,pool_stat_t>& pg_pool_sum_old)
+{
+ for (hash_map<uint64_t,pool_stat_t>::const_iterator it = pg_pool_sum_old.begin();
+ it != pg_pool_sum_old.end(); ++it) {
+ update_one_pool_delta(cct, ts, it->first, it->second);
}
}
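
The smoothing in update_delta() is a moving window over the last mon_stat_smooth_intervals samples: each new (stat delta, time delta) pair is appended and added to the running sums, and once the window is full the oldest pair is subtracted and dropped. A self-contained sketch of the same bookkeeping (DeltaSmoother is a hypothetical standalone type, not a Ceph class):

  #include <deque>
  #include <utility>

  struct DeltaSmoother {
    std::deque<std::pair<double, double>> window;  // (stat delta, time delta)
    double sum_stat = 0, sum_time = 0;
    size_t max_intervals;                          // mon_stat_smooth_intervals

    explicit DeltaSmoother(size_t n) : max_intervals(n ? n : 1) {}

    void add(double stat_delta, double time_delta) {
      window.emplace_back(stat_delta, time_delta);
      sum_stat += stat_delta;
      sum_time += time_delta;
      if (window.size() > max_intervals) {         // drop the oldest sample
        sum_stat -= window.front().first;
        sum_time -= window.front().second;
        window.pop_front();
      }
    }
    double rate() const { return sum_time > 0 ? sum_stat / sum_time : 0; }
  };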
@@ -869,7 +1092,7 @@ void PGMap::print_summary(Formatter *f, ostream *out) const
}
std::stringstream ssr;
- recovery_summary(f, &ssr);
+ overall_recovery_summary(f, &ssr);
if (!f && ssr.str().length())
*out << " " << ssr.str() << "\n";
ssr.clear();
@@ -878,43 +1101,17 @@ void PGMap::print_summary(Formatter *f, ostream *out) const
if (!f)
*out << ss.str(); // pgs by state
- recovery_rate_summary(f, &ssr);
+ overall_recovery_rate_summary(f, &ssr);
if (!f && ssr.str().length())
*out << "recovery io " << ssr.str() << "\n";
- // make non-negative; we can get negative values if osds send
- // uncommitted stats and then "go backward" or if they are just
- // buggy/wrong.
- pool_stat_t pos_delta = pg_sum_delta;
- pos_delta.floor(0);
- if (pos_delta.stats.sum.num_rd ||
- pos_delta.stats.sum.num_wr) {
- if (!f)
- *out << " client io ";
- if (pos_delta.stats.sum.num_rd) {
- int64_t rd = (pos_delta.stats.sum.num_rd_kb << 10) / (double)stamp_delta;
- if (f) {
- f->dump_int("read_bytes_sec", rd);
- } else {
- *out << pretty_si_t(rd) << "B/s rd, ";
- }
- }
- if (pos_delta.stats.sum.num_wr) {
- int64_t wr = (pos_delta.stats.sum.num_wr_kb << 10) / (double)stamp_delta;
- if (f) {
- f->dump_int("write_bytes_sec", wr);
- } else {
- *out << pretty_si_t(wr) << "B/s wr, ";
- }
- }
- int64_t iops = (pos_delta.stats.sum.num_rd + pos_delta.stats.sum.num_wr) / (double)stamp_delta;
- if (f) {
- f->dump_int("op_per_sec", iops);
- } else {
- *out << pretty_si_t(iops) << "op/s";
- *out << "\n";
- }
- }
+ ssr.clear();
+ ssr.str("");
+
+ overall_client_io_rate_summary(f, &ssr);
+ if (!f && ssr.str().length())
+ *out << " client io " << ssr.str() << "\n";
+
}
@@ -960,12 +1157,12 @@ void PGMap::print_oneline_summary(ostream *out) const
}
std::stringstream ssr;
- recovery_summary(NULL, &ssr);
+ overall_recovery_summary(NULL, &ssr);
if (ssr.str().length())
*out << "; " << ssr.str();
ssr.clear();
ssr.str("");
- recovery_rate_summary(NULL, &ssr);
+ overall_recovery_rate_summary(NULL, &ssr);
if (ssr.str().length())
*out << "; " << ssr.str() << " recovering";
}
diff --git a/src/mon/PGMap.h b/src/mon/PGMap.h
index 84d89f87517..c8ce7fd973e 100644
--- a/src/mon/PGMap.h
+++ b/src/mon/PGMap.h
@@ -43,12 +43,13 @@ public:
float full_ratio;
float nearfull_ratio;
+ // mapping of osd to most recently reported osdmap epoch
+ hash_map<int32_t,epoch_t> osd_epochs;
+
class Incremental {
public:
version_t version;
map<pg_t,pg_stat_t> pg_stat_updates;
- map<int32_t,osd_stat_t> osd_stat_updates;
- set<int32_t> osd_stat_rm;
epoch_t osdmap_epoch;
epoch_t pg_scan; // osdmap epoch
set<pg_t> pg_remove;
@@ -56,6 +57,38 @@ public:
float nearfull_ratio;
utime_t stamp;
+ private:
+ map<int32_t,osd_stat_t> osd_stat_updates;
+ set<int32_t> osd_stat_rm;
+
+ // mapping of osd to most recently reported osdmap epoch
+ map<int32_t,epoch_t> osd_epochs;
+ public:
+
+ const map<int32_t, osd_stat_t> &get_osd_stat_updates() const {
+ return osd_stat_updates;
+ }
+ const set<int32_t> &get_osd_stat_rm() const {
+ return osd_stat_rm;
+ }
+ const map<int32_t, epoch_t> &get_osd_epochs() const {
+ return osd_epochs;
+ }
+
+ void update_stat(int32_t osd, epoch_t epoch, const osd_stat_t &stat) {
+ osd_stat_updates[osd] = stat;
+ osd_epochs[osd] = epoch;
+ assert(osd_epochs.size() == osd_stat_updates.size());
+ }
+ void stat_osd_out(int32_t osd) {
+ // 0 the stats for the osd
+ osd_stat_updates[osd] = osd_stat_t();
+ }
+ void rm_stat(int32_t osd) {
+ osd_stat_rm.insert(osd);
+ osd_epochs.erase(osd);
+ osd_stat_updates.erase(osd);
+ }
void encode(bufferlist &bl, uint64_t features=-1) const;
void decode(bufferlist::iterator &bl);
void dump(Formatter *f) const;
@@ -76,13 +109,51 @@ public:
utime_t stamp;
// recent deltas, and summation
+ /**
+ * keep track of last deltas for each pool, calculated using
+ * @p pg_pool_sum as baseline.
+ */
+ hash_map<uint64_t, list< pair<pool_stat_t, utime_t> > > per_pool_sum_deltas;
+ /**
+ * keep track of per-pool timestamp deltas, according to last update on
+ * each pool.
+ */
+ hash_map<uint64_t, utime_t> per_pool_sum_deltas_stamps;
+ /**
+ * keep track of sum deltas, per-pool, taking into account any previous
+ * deltas existing in @p per_pool_sum_deltas. The utime_t as second member
+ * of the pair is the timestamp referring to the last update (i.e., the first
+ * member of the pair) for a given pool.
+ */
+ hash_map<uint64_t, pair<pool_stat_t,utime_t> > per_pool_sum_delta;
+
list< pair<pool_stat_t, utime_t> > pg_sum_deltas;
pool_stat_t pg_sum_delta;
utime_t stamp_delta;
- void update_delta(CephContext *cct, utime_t inc_stamp, pool_stat_t& pg_sum_old);
+ void update_global_delta(CephContext *cct,
+ const utime_t ts, const pool_stat_t& pg_sum_old);
+ void update_pool_deltas(CephContext *cct,
+ const utime_t ts,
+ const hash_map<uint64_t, pool_stat_t>& pg_pool_sum_old);
void clear_delta();
+ private:
+ void update_delta(CephContext *cct,
+ const utime_t ts,
+ const pool_stat_t& old_pool_sum,
+ utime_t *last_ts,
+ const pool_stat_t& current_pool_sum,
+ pool_stat_t *result_pool_delta,
+ utime_t *result_ts_delta,
+ list<pair<pool_stat_t,utime_t> > *delta_avg_list);
+
+ void update_one_pool_delta(CephContext *cct,
+ const utime_t ts,
+ const uint64_t pool,
+ const pool_stat_t& old_pool_sum);
+ public:
+
set<pg_t> creating_pgs; // lru: front = new additions, back = recently pinged
map<int,set<pg_t> > creating_pgs_by_osd;
@@ -172,8 +243,37 @@ public:
void dump_osd_perf_stats(Formatter *f) const;
void print_osd_perf_stats(std::ostream *ss) const;
- void recovery_summary(Formatter *f, ostream *out) const;
- void recovery_rate_summary(Formatter *f, ostream *out) const;
+ void recovery_summary(Formatter *f, ostream *out,
+ pool_stat_t delta_sum) const;
+ void overall_recovery_summary(Formatter *f, ostream *out) const;
+ void pool_recovery_summary(Formatter *f, ostream *out,
+ uint64_t poolid) const;
+ void recovery_rate_summary(Formatter *f, ostream *out,
+ pool_stat_t delta_sum,
+ utime_t delta_stamp) const;
+ void overall_recovery_rate_summary(Formatter *f, ostream *out) const;
+ void pool_recovery_rate_summary(Formatter *f, ostream *out,
+ uint64_t poolid) const;
+ /**
+ * Obtain a formatted/plain output for client I/O, sourced from the stats of
+ * a given pool (@p delta_sum) over a given @p delta_stamp period of time.
+ */
+ void client_io_rate_summary(Formatter *f, ostream *out,
+ pool_stat_t delta_sum,
+ utime_t delta_stamp) const;
+ /**
+ * Obtain a formatted/plain output for the overall client I/O, which is
+ * calculated from @p pg_sum_delta and @p stamp_delta.
+ */
+ void overall_client_io_rate_summary(Formatter *f, ostream *out) const;
+ /**
+ * Obtain a formatted/plain output for client I/O over a given pool
+ * with id @p pool_id. We will then obtain pool-specific data
+ * from @p per_pool_sum_delta.
+ */
+ void pool_client_io_rate_summary(Formatter *f, ostream *out,
+ uint64_t poolid) const;
+
void print_summary(Formatter *f, ostream *out) const;
void print_oneline_summary(ostream *out) const;
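
A caller-side sketch of the per-pool variants declared above, mirroring how OSDMonitor uses them (the print_pool_rates wrapper is hypothetical; passing NULL for the Formatter selects plain-text output, as in print_oneline_summary):

  #include <sstream>

  void print_pool_rates(const PGMap &pg_map, uint64_t poolid, std::ostream *out)
  {
    std::stringstream rss;
    pg_map.pool_recovery_rate_summary(NULL, &rss, poolid);
    if (!rss.str().empty())
      *out << "  recovery io " << rss.str() << "\n";

    rss.str("");
    pg_map.pool_client_io_rate_summary(NULL, &rss, poolid);
    if (!rss.str().empty())
      *out << "  client io " << rss.str() << "\n";
  }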
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index 0f495052747..c14872d87ef 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -141,6 +141,31 @@ void PGMonitor::tick()
}
}
+ /* If we have deltas for pools, run through pgmap's 'per_pool_sum_delta' and
+ * clear any deltas that are old enough.
+ *
+ * Note that 'per_pool_sum_delta' is keyed by pool id and holds a pair: the
+ * calculated stats delta and the absolute timestamp at which those stats were
+ * obtained -- the timestamp IS NOT a delta itself.
+ */
+ if (!pg_map.per_pool_sum_deltas.empty()) {
+ hash_map<uint64_t,pair<pool_stat_t,utime_t> >::iterator it;
+ for (it = pg_map.per_pool_sum_delta.begin();
+ it != pg_map.per_pool_sum_delta.end(); ) {
+ utime_t age = ceph_clock_now(g_ceph_context) - it->second.second;
+ if (age > 2*g_conf->mon_delta_reset_interval) {
+ dout(10) << " clearing pg_map delta for pool " << it->first
+ << " (" << age << " > " << g_conf->mon_delta_reset_interval
+ << " seconds old)" << dendl;
+ pg_map.per_pool_sum_deltas.erase(it->first);
+ pg_map.per_pool_sum_deltas_stamps.erase(it->first);
+ pg_map.per_pool_sum_delta.erase((it++)->first);
+ } else {
+ ++it;
+ }
+ }
+ }
+
dout(10) << pg_map << dendl;
}
@@ -401,6 +426,7 @@ void PGMonitor::apply_pgmap_delta(bufferlist& bl)
}
pool_stat_t pg_sum_old = pg_map.pg_sum;
+ hash_map<uint64_t, pool_stat_t> pg_pool_sum_old;
// pgs
bufferlist::iterator p = dirty_pgs.begin();
@@ -410,6 +436,10 @@ void PGMonitor::apply_pgmap_delta(bufferlist& bl)
dout(20) << " refreshing pg " << pgid << dendl;
bufferlist bl;
int r = mon->store->get(pgmap_pg_prefix, stringify(pgid), bl);
+
+ if (pg_pool_sum_old.count(pgid.pool()) == 0)
+ pg_pool_sum_old[pgid.pool()] = pg_map.pg_pool_sum[pgid.pool()];
+
if (r >= 0) {
pg_map.update_pg(pgid, bl);
} else {
@@ -432,7 +462,8 @@ void PGMonitor::apply_pgmap_delta(bufferlist& bl)
}
}
- pg_map.update_delta(g_ceph_context, inc_stamp, pg_sum_old);
+ pg_map.update_global_delta(g_ceph_context, inc_stamp, pg_sum_old);
+ pg_map.update_pool_deltas(g_ceph_context, inc_stamp, pg_pool_sum_old);
// ok, we're now on the new version
pg_map.version = v;
@@ -494,15 +525,19 @@ void PGMonitor::encode_pending(MonitorDBStore::Transaction *t)
{
bufferlist dirty;
string prefix = pgmap_osd_prefix;
- for (map<int32_t,osd_stat_t>::const_iterator p = pending_inc.osd_stat_updates.begin();
- p != pending_inc.osd_stat_updates.end();
+ for (map<int32_t,osd_stat_t>::const_iterator p =
+ pending_inc.get_osd_stat_updates().begin();
+ p != pending_inc.get_osd_stat_updates().end();
++p) {
::encode(p->first, dirty);
bufferlist bl;
::encode(p->second, bl, features);
t->put(prefix, stringify(p->first), bl);
}
- for (set<int32_t>::const_iterator p = pending_inc.osd_stat_rm.begin(); p != pending_inc.osd_stat_rm.end(); ++p) {
+ for (set<int32_t>::const_iterator p =
+ pending_inc.get_osd_stat_rm().begin();
+ p != pending_inc.get_osd_stat_rm().end();
+ ++p) {
::encode(*p, dirty);
t->erase(prefix, stringify(*p));
}
@@ -725,7 +760,11 @@ bool PGMonitor::prepare_pg_stats(MPGStats *stats)
}
// osd stat
- pending_inc.osd_stat_updates[from] = stats->osd_stat;
+ if (mon->osdmon()->osdmap.is_in(from)) {
+ pending_inc.update_stat(from, stats->epoch, stats->osd_stat);
+ } else {
+ pending_inc.update_stat(from, stats->epoch, osd_stat_t());
+ }
if (pg_map.osd_stat.count(from))
dout(10) << " got osd." << from << " " << stats->osd_stat << " (was " << pg_map.osd_stat[from] << ")" << dendl;
@@ -842,11 +881,7 @@ void PGMonitor::check_osd_map(epoch_t epoch)
++p)
if (p->second == CEPH_OSD_OUT) {
dout(10) << "check_osd_map osd." << p->first << " went OUT" << dendl;
- pending_inc.osd_stat_rm.insert(p->first);
- } else {
- dout(10) << "check_osd_map osd." << p->first << " is IN" << dendl;
- pending_inc.osd_stat_rm.erase(p->first);
- pending_inc.osd_stat_updates[p->first];
+ pending_inc.stat_osd_out(p->first);
}
// this is conservative: we want to know if any osds (maybe) got marked down.
@@ -867,7 +902,7 @@ void PGMonitor::check_osd_map(epoch_t epoch)
// whether it was created *or* destroyed, we can safely drop
// its osd_stat_t record.
dout(10) << "check_osd_map osd." << p->first << " created or destroyed" << dendl;
- pending_inc.osd_stat_rm.insert(p->first);
+ pending_inc.rm_stat(p->first);
// and adjust full, nearfull set
pg_map.nearfull_osds.erase(p->first);
@@ -1827,7 +1862,7 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
// recovery
stringstream rss;
- pg_map.recovery_summary(NULL, &rss);
+ pg_map.overall_recovery_summary(NULL, &rss);
if (!rss.str().empty()) {
summary.push_back(make_pair(HEALTH_WARN, "recovery " + rss.str()));
if (detail)
@@ -1876,7 +1911,9 @@ void PGMonitor::get_health(list<pair<health_status_t,string> >& summary,
detail->push_back(make_pair(HEALTH_WARN, ss.str()));
}
int average_objects_per_pg = pg_map.pg_sum.stats.sum.num_objects / pg_map.pg_stat.size();
- if (average_objects_per_pg > 0) {
+ if (average_objects_per_pg > 0 &&
+ pg_map.pg_sum.stats.sum.num_objects >= g_conf->mon_pg_warn_min_objects &&
+ p->second.stats.sum.num_objects >= g_conf->mon_pg_warn_min_pool_objects) {
int objects_per_pg = p->second.stats.sum.num_objects / pi->get_pg_num();
float ratio = (float)objects_per_pg / (float)average_objects_per_pg;
if (g_conf->mon_pg_warn_max_object_skew > 0 &&
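
The two added guards keep the per-pool object-skew warning quiet until both the cluster and the pool hold enough objects for the ratio to be meaningful. A hedged sketch of the check as now gated, with parameter names standing in for the mon_pg_warn_* options and the computed per-PG averages:

#include <cstdint>

// Illustrative restatement of the gated skew check, not the monitor code.
bool pool_is_skewed(uint64_t cluster_objects, uint64_t pool_objects,
                    int avg_objects_per_pg, int pool_objects_per_pg,
                    uint64_t min_objects, uint64_t min_pool_objects,
                    float max_object_skew) {
  if (avg_objects_per_pg <= 0 ||
      cluster_objects < min_objects ||
      pool_objects < min_pool_objects)
    return false;                 // too few objects for skew to mean much
  float ratio = (float)pool_objects_per_pg / (float)avg_objects_per_pg;
  return max_object_skew > 0 && ratio > max_object_skew;
}
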
diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc
index a470e63dc1c..3506c4a4ccd 100644
--- a/src/os/FileStore.cc
+++ b/src/os/FileStore.cc
@@ -422,7 +422,10 @@ FileStore::FileStore(const std::string &base, const std::string &jdev, const cha
m_filestore_do_dump(false),
m_filestore_dump_fmt(true),
m_filestore_sloppy_crc(g_conf->filestore_sloppy_crc),
- m_filestore_sloppy_crc_block_size(g_conf->filestore_sloppy_crc_block_size)
+ m_filestore_sloppy_crc_block_size(g_conf->filestore_sloppy_crc_block_size),
+ m_fs_type(FS_TYPE_NONE),
+ m_filestore_max_inline_xattr_size(0),
+ m_filestore_max_inline_xattrs(0)
{
m_filestore_kill_at.set(g_conf->filestore_kill_at);
@@ -825,12 +828,14 @@ int FileStore::_detect_fs()
blk_size = st.f_bsize;
+ m_fs_type = FS_TYPE_OTHER;
#if defined(__linux__)
if (st.f_type == BTRFS_SUPER_MAGIC) {
dout(0) << "mount detected btrfs" << dendl;
backend = new BtrfsFileStoreBackend(this);
wbthrottle.set_fs(WBThrottle::BTRFS);
+ m_fs_type = FS_TYPE_BTRFS;
} else if (st.f_type == XFS_SUPER_MAGIC) {
dout(1) << "mount detected xfs" << dendl;
if (m_filestore_replica_fadvise) {
@@ -838,15 +843,19 @@ int FileStore::_detect_fs()
g_conf->set_val("filestore_replica_fadvise", "false");
g_conf->apply_changes(NULL);
assert(m_filestore_replica_fadvise == false);
}
+ m_fs_type = FS_TYPE_XFS;
}
#endif
#ifdef HAVE_LIBZFS
if (st.f_type == ZFS_SUPER_MAGIC) {
backend = new ZFSFileStoreBackend(this);
+ m_fs_type = FS_TYPE_ZFS;
}
#endif
+ set_xattr_limits_via_conf();
+
r = backend->detect_features();
if (r < 0) {
derr << "_detect_fs: detect_features error: " << cpp_strerror(r) << dendl;
@@ -887,14 +896,7 @@ int FileStore::_detect_fs()
chain_fsetxattr(tmpfd, "user.test4", &buf, sizeof(buf));
ret = chain_fsetxattr(tmpfd, "user.test5", &buf, sizeof(buf));
if (ret == -ENOSPC) {
- if (!g_conf->filestore_xattr_use_omap) {
- dout(0) << "limited size xattrs -- automatically enabling filestore_xattr_use_omap" << dendl;
- g_conf->set_val("filestore_xattr_use_omap", "true");
- g_conf->apply_changes(NULL);
- assert(g_conf->filestore_xattr_use_omap == true);
- } else {
- dout(0) << "limited size xattrs -- filestore_xattr_use_omap already enabled" << dendl;
- }
+ dout(0) << "limited size xattrs" << dendl;
}
chain_fremovexattr(tmpfd, "user.test");
chain_fremovexattr(tmpfd, "user.test2");
@@ -953,43 +955,25 @@ int FileStore::_sanity_check_fs()
int FileStore::write_superblock()
{
- char fn[PATH_MAX];
- snprintf(fn, sizeof(fn), "%s/superblock", basedir.c_str());
- int fd = ::open(fn, O_WRONLY|O_CREAT|O_TRUNC, 0644);
- if (fd < 0)
- return -errno;
bufferlist bl;
::encode(superblock, bl);
-
- int ret = safe_write(fd, bl.c_str(), bl.length());
- if (ret < 0)
- goto out;
- ret = ::fsync(fd);
- if (ret < 0)
- ret = -errno;
- // XXX: fsync() man page says I need to sync containing directory
-out:
- TEMP_FAILURE_RETRY(::close(fd));
- return ret;
+ return safe_write_file(basedir.c_str(), "superblock",
+ bl.c_str(), bl.length());
}
int FileStore::read_superblock()
{
- char fn[PATH_MAX];
- snprintf(fn, sizeof(fn), "%s/superblock", basedir.c_str());
- int fd = ::open(fn, O_RDONLY, 0644);
- if (fd < 0) {
- if (errno == ENOENT) {
+ bufferptr bp(PATH_MAX);
+ int ret = safe_read_file(basedir.c_str(), "superblock",
+ bp.c_str(), bp.length());
+ if (ret < 0) {
+ if (ret == -ENOENT) {
// If the file doesn't exist write initial CompatSet
return write_superblock();
- } else
- return -errno;
- }
- bufferptr bp(PATH_MAX);
- int ret = safe_read(fd, bp.c_str(), bp.length());
- TEMP_FAILURE_RETRY(::close(fd));
- if (ret < 0)
+ }
return ret;
+ }
+
bufferlist bl;
bl.push_back(bp);
bufferlist::iterator i = bl.begin();
@@ -1019,20 +1003,14 @@ int FileStore::update_version_stamp()
int FileStore::version_stamp_is_valid(uint32_t *version)
{
- char fn[PATH_MAX];
- snprintf(fn, sizeof(fn), "%s/store_version", basedir.c_str());
- int fd = ::open(fn, O_RDONLY, 0644);
- if (fd < 0) {
- if (errno == ENOENT)
- return 0;
- else
- return -errno;
- }
bufferptr bp(PATH_MAX);
- int ret = safe_read(fd, bp.c_str(), bp.length());
- TEMP_FAILURE_RETRY(::close(fd));
- if (ret < 0)
+ int ret = safe_read_file(basedir.c_str(), "store_version",
+ bp.c_str(), bp.length());
+ if (ret < 0) {
+ if (ret == -ENOENT)
+ return 0;
return ret;
+ }
bufferlist bl;
bl.push_back(bp);
bufferlist::iterator i = bl.begin();
@@ -1045,17 +1023,11 @@ int FileStore::version_stamp_is_valid(uint32_t *version)
int FileStore::write_version_stamp()
{
- char fn[PATH_MAX];
- snprintf(fn, sizeof(fn), "%s/store_version", basedir.c_str());
- int fd = ::open(fn, O_WRONLY|O_CREAT|O_TRUNC, 0644);
- if (fd < 0)
- return -errno;
bufferlist bl;
::encode(target_version, bl);
-
- int ret = safe_write(fd, bl.c_str(), bl.length());
- TEMP_FAILURE_RETRY(::close(fd));
- return ret;
+
+ return safe_write_file(basedir.c_str(), "store_version",
+ bl.c_str(), bl.length());
}
int FileStore::read_op_seq(uint64_t *seq)
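
The superblock and version-stamp paths now delegate to the safe_write_file()/safe_read_file() helpers added to src/common/safe_io in this merge, replacing several hand-rolled open/write/fsync/close sequences. The expected shape of such a helper -- write a temp file, fsync it, rename into place, fsync the directory -- can be sketched as follows; this illustrates the pattern, not the helper's exact code:

#include <cerrno>
#include <climits>
#include <cstddef>
#include <cstdio>
#include <fcntl.h>
#include <unistd.h>

// Illustrative atomic write in the spirit of safe_write_file(); error
// handling and short-write retries are abbreviated.
int atomic_write_file(const char *dir, const char *name,
                      const char *val, size_t len) {
  char fn[PATH_MAX], tmp[PATH_MAX];
  snprintf(fn, sizeof(fn), "%s/%s", dir, name);
  snprintf(tmp, sizeof(tmp), "%s/%s.tmp", dir, name);

  int fd = ::open(tmp, O_WRONLY|O_CREAT|O_TRUNC, 0644);
  if (fd < 0)
    return -errno;
  if (::write(fd, val, len) != (ssize_t)len || ::fsync(fd) < 0) {
    int err = -errno;
    ::close(fd);
    ::unlink(tmp);
    return err;
  }
  ::close(fd);
  if (::rename(tmp, fn) < 0)
    return -errno;

  int dirfd = ::open(dir, O_RDONLY);  // fsync the dir so the rename is durable
  if (dirfd >= 0) {
    ::fsync(dirfd);
    ::close(dirfd);
  }
  return 0;
}

The OSD::write_meta() removed later in this diff followed the same recipe, which is why it could be dropped in favor of the shared helper.
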
@@ -3427,7 +3399,7 @@ int FileStore::getattr(coll_t cid, const ghobject_t& oid, const char *name, buff
get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN);
r = _fgetattr(**fd, n, bp);
lfn_close(fd);
- if (r == -ENODATA && g_conf->filestore_xattr_use_omap) {
+ if (r == -ENODATA) {
map<string, bufferlist> got;
set<string> to_get;
to_get.insert(string(name));
@@ -3463,6 +3435,9 @@ int FileStore::getattr(coll_t cid, const ghobject_t& oid, const char *name, buff
int FileStore::getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>& aset, bool user_only)
{
+ set<string> omap_attrs;
+ map<string, bufferlist> omap_aset;
+ Index index;
dout(15) << "getattrs " << cid << "/" << oid << dendl;
FDRef fd;
int r = lfn_open(cid, oid, false, &fd);
@@ -3470,43 +3445,43 @@ int FileStore::getattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr>
goto out;
}
r = _fgetattrs(**fd, aset, user_only);
+ if (r < 0) {
+ goto out;
+ }
lfn_close(fd);
- if (g_conf->filestore_xattr_use_omap) {
- set<string> omap_attrs;
- map<string, bufferlist> omap_aset;
- Index index;
- int r = get_index(cid, &index);
- if (r < 0) {
- dout(10) << __func__ << " could not get index r = " << r << dendl;
- goto out;
- }
- r = object_map->get_all_xattrs(oid, &omap_attrs);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
- goto out;
- }
- r = object_map->get_xattrs(oid, omap_attrs, &omap_aset);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
- goto out;
- }
- assert(omap_attrs.size() == omap_aset.size());
- for (map<string, bufferlist>::iterator i = omap_aset.begin();
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __func__ << " could not get index r = " << r << dendl;
+ goto out;
+ }
+ r = object_map->get_all_xattrs(oid, &omap_attrs);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
+ goto out;
+ }
+ r = object_map->get_xattrs(oid, omap_attrs, &omap_aset);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
+ goto out;
+ }
+ if (r == -ENOENT)
+ r = 0;
+ assert(omap_attrs.size() == omap_aset.size());
+ for (map<string, bufferlist>::iterator i = omap_aset.begin();
i != omap_aset.end();
++i) {
- string key;
- if (user_only) {
+ string key;
+ if (user_only) {
if (i->first[0] != '_')
continue;
if (i->first == "_")
continue;
key = i->first.substr(1, i->first.size());
- } else {
+ } else {
key = i->first;
- }
- aset.insert(make_pair(key,
- bufferptr(i->second.c_str(), i->second.length())));
}
+ aset.insert(make_pair(key,
+ bufferptr(i->second.c_str(), i->second.length())));
}
out:
dout(10) << "getattrs " << cid << "/" << oid << " = " << r << dendl;
@@ -3532,10 +3507,8 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr
if (r < 0) {
goto out;
}
- if (g_conf->filestore_xattr_use_omap) {
- r = _fgetattrs(**fd, inline_set, false);
- assert(!m_filestore_fail_eio || r != -EIO);
- }
+ r = _fgetattrs(**fd, inline_set, false);
+ assert(!m_filestore_fail_eio || r != -EIO);
dout(15) << "setattrs " << cid << "/" << oid << dendl;
r = 0;
for (map<string,bufferptr>::iterator p = aset.begin();
@@ -3543,8 +3516,8 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr
++p) {
char n[CHAIN_XATTR_MAX_NAME_LEN];
get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN);
- if (g_conf->filestore_xattr_use_omap) {
- if (p->second.length() > g_conf->filestore_max_inline_xattr_size) {
+
+ if (p->second.length() > m_filestore_max_inline_xattr_size) {
if (inline_set.count(p->first)) {
inline_set.erase(p->first);
r = chain_fremovexattr(**fd, n);
@@ -3553,10 +3526,10 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr
}
omap_set[p->first].push_back(p->second);
continue;
- }
+ }
- if (!inline_set.count(p->first) &&
- inline_set.size() >= g_conf->filestore_max_inline_xattrs) {
+ if (!inline_set.count(p->first) &&
+ inline_set.size() >= m_filestore_max_inline_xattrs) {
if (inline_set.count(p->first)) {
inline_set.erase(p->first);
r = chain_fremovexattr(**fd, n);
@@ -3565,10 +3538,9 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr
}
omap_set[p->first].push_back(p->second);
continue;
- }
- omap_remove.insert(p->first);
- inline_set.insert(*p);
}
+ omap_remove.insert(p->first);
+ inline_set.insert(*p);
inline_to_set.insert(*p);
@@ -3579,17 +3551,17 @@ int FileStore::_setattrs(coll_t cid, const ghobject_t& oid, map<string,bufferptr
goto out_close;
if (!omap_remove.empty()) {
- assert(g_conf->filestore_xattr_use_omap);
r = object_map->remove_xattrs(oid, omap_remove, &spos);
if (r < 0 && r != -ENOENT) {
dout(10) << __func__ << " could not remove_xattrs r = " << r << dendl;
assert(!m_filestore_fail_eio || r != -EIO);
goto out_close;
+ } else {
+ r = 0; // don't confuse the debug output
}
}
if (!omap_set.empty()) {
- assert(g_conf->filestore_xattr_use_omap);
r = object_map->set_xattrs(oid, omap_set, &spos);
if (r < 0) {
dout(10) << __func__ << " could not set_xattrs r = " << r << dendl;
@@ -3617,7 +3589,7 @@ int FileStore::_rmattr(coll_t cid, const ghobject_t& oid, const char *name,
char n[CHAIN_XATTR_MAX_NAME_LEN];
get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN);
r = chain_fremovexattr(**fd, n);
- if (r == -ENODATA && g_conf->filestore_xattr_use_omap) {
+ if (r == -ENODATA) {
Index index;
r = get_index(cid, &index);
if (r < 0) {
@@ -3647,6 +3619,8 @@ int FileStore::_rmattrs(coll_t cid, const ghobject_t& oid,
map<string,bufferptr> aset;
FDRef fd;
+ set<string> omap_attrs;
+ Index index;
int r = lfn_open(cid, oid, false, &fd);
if (r < 0) {
goto out;
@@ -3663,26 +3637,24 @@ int FileStore::_rmattrs(coll_t cid, const ghobject_t& oid,
}
lfn_close(fd);
- if (g_conf->filestore_xattr_use_omap) {
- set<string> omap_attrs;
- Index index;
- r = get_index(cid, &index);
- if (r < 0) {
- dout(10) << __func__ << " could not get index r = " << r << dendl;
- return r;
- }
- r = object_map->get_all_xattrs(oid, &omap_attrs);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
- assert(!m_filestore_fail_eio || r != -EIO);
- return r;
- }
- r = object_map->remove_xattrs(oid, omap_attrs, &spos);
- if (r < 0 && r != -ENOENT) {
- dout(10) << __func__ << " could not remove omap_attrs r = " << r << dendl;
- return r;
- }
+ r = get_index(cid, &index);
+ if (r < 0) {
+ dout(10) << __func__ << " could not get index r = " << r << dendl;
+ return r;
}
+ r = object_map->get_all_xattrs(oid, &omap_attrs);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not get omap_attrs r = " << r << dendl;
+ assert(!m_filestore_fail_eio || r != -EIO);
+ return r;
+ }
+ r = object_map->remove_xattrs(oid, omap_attrs, &spos);
+ if (r < 0 && r != -ENOENT) {
+ dout(10) << __func__ << " could not remove omap_attrs r = " << r << dendl;
+ return r;
+ }
+ if (r == -ENOENT)
+ r = 0;
out:
dout(10) << "rmattrs " << cid << "/" << oid << " = " << r << dendl;
return r;
@@ -4590,6 +4562,17 @@ const char** FileStore::get_tracked_conf_keys() const
void FileStore::handle_conf_change(const struct md_config_t *conf,
const std::set <std::string> &changed)
{
+ if (changed.count("filestore_max_inline_xattr_size") ||
+ changed.count("filestore_max_inline_xattr_size_xfs") ||
+ changed.count("filestore_max_inline_xattr_size_btrfs") ||
+ changed.count("filestore_max_inline_xattr_size_other") ||
+ changed.count("filestore_max_inline_xattrs") ||
+ changed.count("filestore_max_inline_xattrs_xfs") ||
+ changed.count("filestore_max_inline_xattrs_btrfs") ||
+ changed.count("filestore_max_inline_xattrs_other")) {
+ Mutex::Locker l(lock);
+ set_xattr_limits_via_conf();
+ }
if (changed.count("filestore_min_sync_interval") ||
changed.count("filestore_max_sync_interval") ||
changed.count("filestore_queue_max_ops") ||
@@ -4669,6 +4652,44 @@ void FileStore::dump_transactions(list<ObjectStore::Transaction*>& ls, uint64_t
m_filestore_dump.flush();
}
+void FileStore::set_xattr_limits_via_conf()
+{
+ uint32_t fs_xattr_size;
+ uint32_t fs_xattrs;
+
+ assert(m_fs_type != FS_TYPE_NONE);
+
+ switch(m_fs_type) {
+ case FS_TYPE_XFS:
+ fs_xattr_size = g_conf->filestore_max_inline_xattr_size_xfs;
+ fs_xattrs = g_conf->filestore_max_inline_xattrs_xfs;
+ break;
+ case FS_TYPE_BTRFS:
+ fs_xattr_size = g_conf->filestore_max_inline_xattr_size_btrfs;
+ fs_xattrs = g_conf->filestore_max_inline_xattrs_btrfs;
+ break;
+ case FS_TYPE_ZFS:
+ case FS_TYPE_OTHER:
+ fs_xattr_size = g_conf->filestore_max_inline_xattr_size_other;
+ fs_xattrs = g_conf->filestore_max_inline_xattrs_other;
+ break;
+ default:
+ assert(!"Unknown fs type");
+ }
+
+ // Use the override value if set
+ if (g_conf->filestore_max_inline_xattr_size)
+ m_filestore_max_inline_xattr_size = g_conf->filestore_max_inline_xattr_size;
+ else
+ m_filestore_max_inline_xattr_size = fs_xattr_size;
+
+ // Use the override value if set
+ if (g_conf->filestore_max_inline_xattrs)
+ m_filestore_max_inline_xattrs = g_conf->filestore_max_inline_xattrs;
+ else
+ m_filestore_max_inline_xattrs = fs_xattrs;
+}
+
// -- FSSuperblock --
void FSSuperblock::encode(bufferlist &bl) const
diff --git a/src/os/FileStore.h b/src/os/FileStore.h
index fdab0ece34f..c489fdd5796 100644
--- a/src/os/FileStore.h
+++ b/src/os/FileStore.h
@@ -64,6 +64,14 @@ static const __SWORD_TYPE XFS_SUPER_MAGIC(0x58465342);
static const __SWORD_TYPE ZFS_SUPER_MAGIC(0x2fc12fc1);
#endif
+enum fs_types {
+ FS_TYPE_NONE = 0,
+ FS_TYPE_XFS,
+ FS_TYPE_BTRFS,
+ FS_TYPE_ZFS,
+ FS_TYPE_OTHER
+};
+
class FileStoreBackend;
#define CEPH_FS_FEATURE_INCOMPAT_SHARDS CompatSet::Feature(1, "sharded objects")
@@ -593,6 +601,13 @@ private:
atomic_t m_filestore_kill_at;
bool m_filestore_sloppy_crc;
int m_filestore_sloppy_crc_block_size;
+ enum fs_types m_fs_type;
+
+ // Determine xattr handling based on fs type
+ void set_xattr_limits_via_conf();
+ uint32_t m_filestore_max_inline_xattr_size;
+ uint32_t m_filestore_max_inline_xattrs;
+
FSSuperblock superblock;
/**
diff --git a/src/os/GenericFileStoreBackend.cc b/src/os/GenericFileStoreBackend.cc
index dad1a9c220c..81d896a0943 100644
--- a/src/os/GenericFileStoreBackend.cc
+++ b/src/os/GenericFileStoreBackend.cc
@@ -263,6 +263,7 @@ int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm)
{
char buf[100];
bufferptr bp;
+ int r = 0;
int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf));
if (l == -ENODATA) {
return 0;
@@ -284,16 +285,21 @@ int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm)
::decode(*cm, p);
}
catch (buffer::error &e) {
- return -EIO;
+ r = -EIO;
}
- return 0;
+ if (r < 0)
+ derr << __func__ << " got " << cpp_strerror(r) << dendl;
+ return r;
}
int GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm)
{
bufferlist bl;
::encode(*cm, bl);
- return chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length());
+ int r = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length());
+ if (r < 0)
+ derr << __func__ << " got " << cpp_strerror(r) << dendl;
+ return r;
}
int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl)
diff --git a/src/os/LevelDBStore.h b/src/os/LevelDBStore.h
index 89718ce1987..bc5b612a97a 100644
--- a/src/os/LevelDBStore.h
+++ b/src/os/LevelDBStore.h
@@ -329,13 +329,15 @@ public:
string fpath = path + '/' + n;
struct stat s;
int err = stat(fpath.c_str(), &s);
+ if (err < 0)
+ err = -errno;
// we may race against leveldb while reading files; this should only
// happen when those files are being updated, data is being shuffled
// and files get removed, in which case there's not much of a problem
// as we'll get to them next time around.
if ((err < 0) && (err != -ENOENT)) {
lderr(cct) << __func__ << " error obtaining stats for " << fpath
- << ": " << cpp_strerror(errno) << dendl;
+ << ": " << cpp_strerror(err) << dendl;
goto err;
}
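
The fix above snapshots errno into a negative return immediately after the failing stat(); previously the error string was built from whatever errno happened to hold by the time the lderr stream ran. The idiom in isolation:

#include <cerrno>
#include <sys/stat.h>

// Capture errno right after the failing call, before any later libc call
// (logging included) can clobber it.
int stat_errno(const char *path, struct stat *st) {
  int err = ::stat(path, st);
  if (err < 0)
    err = -errno;      // snapshot the real cause immediately
  return err;          // 0 on success, -errno on failure
}
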
diff --git a/src/os/ObjectStore.cc b/src/os/ObjectStore.cc
index 84549821aff..1a1bbcb0b67 100644
--- a/src/os/ObjectStore.cc
+++ b/src/os/ObjectStore.cc
@@ -501,8 +501,7 @@ void ObjectStore::Transaction::generate_test_instances(list<ObjectStore::Transac
int ObjectStore::collection_list(coll_t c, vector<hobject_t>& o)
{
vector<ghobject_t> go;
- FileStore *fs = dynamic_cast<FileStore * >(this);
- int ret = fs->collection_list(c, go);
+ int ret = collection_list(c, go);
if (ret == 0) {
o.reserve(go.size());
for (vector<ghobject_t>::iterator i = go.begin(); i != go.end() ; i++)
@@ -517,8 +516,7 @@ int ObjectStore::collection_list_partial(coll_t c, hobject_t start,
{
vector<ghobject_t> go;
ghobject_t gnext, gstart(start);
- FileStore *fs = dynamic_cast<FileStore * >(this);
- int ret = fs->collection_list_partial(c, gstart, min, max, snap, &go, &gnext);
+ int ret = collection_list_partial(c, gstart, min, max, snap, &go, &gnext);
if (ret == 0) {
*next = gnext.hobj;
ls->reserve(go.size());
@@ -533,8 +531,7 @@ int ObjectStore::collection_list_range(coll_t c, hobject_t start, hobject_t end,
{
vector<ghobject_t> go;
ghobject_t gstart(start), gend(end);
- FileStore *fs = dynamic_cast<FileStore * >(this);
- int ret = fs->collection_list_range(c, gstart, gend, seq, &go);
+ int ret = collection_list_range(c, gstart, gend, seq, &go);
if (ret == 0) {
ls->reserve(go.size());
for (vector<ghobject_t>::iterator i = go.begin(); i != go.end() ; i++)
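
All three wrappers previously downcast `this` to FileStore with dynamic_cast, so any other ObjectStore implementation would have dereferenced a NULL pointer; invoking the ghobject_t overload directly lets virtual dispatch select the right backend. The hazard in miniature:

// Miniature of the removed bug: a base-class wrapper that downcasts 'this'
// to one concrete subclass dereferences NULL for every other subclass,
// while a plain virtual call dispatches correctly.
struct Store {
  virtual ~Store() {}
  virtual int list_objects() = 0;
  int list_bad();                               // defined below
  int list_good() { return list_objects(); }    // virtual dispatch
};
struct FileStoreImpl : public Store {
  int list_objects() { return 0; }
};
int Store::list_bad() {
  FileStoreImpl *fs = dynamic_cast<FileStoreImpl*>(this);
  return fs->list_objects();  // NULL deref if 'this' is not a FileStoreImpl
}
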
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 22366b0baca..8ce11bb558c 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -713,7 +713,7 @@ int OSD::mkfs(CephContext *cct, const std::string &dev, const std::string &jdev,
goto umount_store;
}
- ret = write_meta(dev, "ready", "ready\n", 6);
+ ret = safe_write_file(dev.c_str(), "ready", "ready\n", 6);
if (ret) {
derr << "OSD::mkfs: failed to write ready file: error " << ret << dendl;
goto umount_store;
@@ -769,103 +769,19 @@ int OSD::dump_journal(CephContext *cct, const std::string &dev, const std::strin
return err;
}
-int OSD::write_meta(const std::string &base, const std::string &file,
- const char *val, size_t vallen)
-{
- int ret;
- char fn[PATH_MAX];
- char tmp[PATH_MAX];
- int fd;
-
- // does the file already have correct content?
- char oldval[80];
- ret = read_meta(base, file, oldval, sizeof(oldval));
- if (ret == (int)vallen && memcmp(oldval, val, vallen) == 0)
- return 0; // yes.
-
- snprintf(fn, sizeof(fn), "%s/%s", base.c_str(), file.c_str());
- snprintf(tmp, sizeof(tmp), "%s/%s.tmp", base.c_str(), file.c_str());
- fd = ::open(tmp, O_WRONLY|O_CREAT|O_TRUNC, 0644);
- if (fd < 0) {
- ret = errno;
- derr << "write_meta: error opening '" << tmp << "': "
- << cpp_strerror(ret) << dendl;
- return -ret;
- }
- ret = safe_write(fd, val, vallen);
- if (ret) {
- derr << "write_meta: failed to write to '" << tmp << "': "
- << cpp_strerror(ret) << dendl;
- TEMP_FAILURE_RETRY(::close(fd));
- return ret;
- }
-
- ret = ::fsync(fd);
- TEMP_FAILURE_RETRY(::close(fd));
- if (ret) {
- ::unlink(tmp);
- derr << "write_meta: failed to fsync to '" << tmp << "': "
- << cpp_strerror(ret) << dendl;
- return ret;
- }
- ret = ::rename(tmp, fn);
- if (ret) {
- ::unlink(tmp);
- derr << "write_meta: failed to rename '" << tmp << "' to '" << fn << "': "
- << cpp_strerror(ret) << dendl;
- return ret;
- }
-
- fd = ::open(base.c_str(), O_RDONLY);
- if (fd < 0) {
- ret = errno;
- derr << "write_meta: failed to open dir '" << base << "': "
- << cpp_strerror(ret) << dendl;
- return -ret;
- }
- ::fsync(fd);
- TEMP_FAILURE_RETRY(::close(fd));
-
- return 0;
-}
-
-int OSD::read_meta(const std::string &base, const std::string &file,
- char *val, size_t vallen)
-{
- char fn[PATH_MAX];
- int fd, len;
-
- snprintf(fn, sizeof(fn), "%s/%s", base.c_str(), file.c_str());
- fd = ::open(fn, O_RDONLY);
- if (fd < 0) {
- int err = errno;
- return -err;
- }
- len = safe_read(fd, val, vallen);
- if (len < 0) {
- TEMP_FAILURE_RETRY(::close(fd));
- return len;
- }
- // close sometimes returns errors, but only after write()
- TEMP_FAILURE_RETRY(::close(fd));
-
- val[len] = 0;
- return len;
-}
-
int OSD::write_meta(const std::string &base, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
{
char val[80];
snprintf(val, sizeof(val), "%s\n", CEPH_OSD_ONDISK_MAGIC);
- write_meta(base, "magic", val, strlen(val));
+ safe_write_file(base.c_str(), "magic", val, strlen(val));
snprintf(val, sizeof(val), "%d\n", whoami);
- write_meta(base, "whoami", val, strlen(val));
+ safe_write_file(base.c_str(), "whoami", val, strlen(val));
cluster_fsid.print(val);
strcat(val, "\n");
- write_meta(base, "ceph_fsid", val, strlen(val));
+ safe_write_file(base.c_str(), "ceph_fsid", val, strlen(val));
return 0;
}
@@ -875,24 +791,24 @@ int OSD::peek_meta(const std::string &dev, std::string& magic,
{
char val[80] = { 0 };
- if (read_meta(dev, "magic", val, sizeof(val)) < 0)
+ if (safe_read_file(dev.c_str(), "magic", val, sizeof(val)) < 0)
return -errno;
int l = strlen(val);
if (l && val[l-1] == '\n')
val[l-1] = 0;
magic = val;
- if (read_meta(dev, "whoami", val, sizeof(val)) < 0)
+ if (safe_read_file(dev.c_str(), "whoami", val, sizeof(val)) < 0)
return -errno;
whoami = atoi(val);
- if (read_meta(dev, "ceph_fsid", val, sizeof(val)) < 0)
+ if (safe_read_file(dev.c_str(), "ceph_fsid", val, sizeof(val)) < 0)
return -errno;
if (strlen(val) > 36)
val[36] = 0;
cluster_fsid.parse(val);
- if (read_meta(dev, "fsid", val, sizeof(val)) < 0)
+ if (safe_read_file(dev.c_str(), "fsid", val, sizeof(val)) < 0)
osd_fsid = uuid_d();
else {
if (strlen(val) > 36)
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 5fe667344a9..9346cee6890 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -1738,10 +1738,6 @@ protected:
}
private:
- static int write_meta(const std::string &base, const std::string &file,
- const char *val, size_t vallen);
- static int read_meta(const std::string &base, const std::string &file,
- char *val, size_t vallen);
static int write_meta(const std::string &base,
uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami);
public:
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index f1985bf961b..1d9ed5f6a31 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -1,4 +1,3 @@
-
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
@@ -1997,8 +1996,7 @@ void PG::upgrade(ObjectStore *store, const interval_set<snapid_t> &snapcolls)
hobject_t cur;
vector<hobject_t> objects;
while (1) {
- int r = store->collection_list_partial(
- cid,
+ int r = get_pgbackend()->objects_list_partial(
cur,
store->get_ideal_list_min(),
store->get_ideal_list_max(),
@@ -2046,8 +2044,7 @@ void PG::upgrade(ObjectStore *store, const interval_set<snapid_t> &snapcolls)
while (1) {
dout(1) << "Updating snap_mapper from main collection, "
<< done << " objects done" << dendl;
- int r = store->collection_list_partial(
- cid,
+ int r = get_pgbackend()->objects_list_partial(
cur,
store->get_ideal_list_min(),
store->get_ideal_list_max(),
@@ -2070,19 +2067,16 @@ void PG::upgrade(ObjectStore *store, const interval_set<snapid_t> &snapcolls)
++j) {
if (j->snap < CEPH_MAXSNAP) {
OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
- bufferptr bp;
- r = store->getattr(
- cid,
+ bufferlist bl;
+ r = get_pgbackend()->objects_get_attr(
*j,
OI_ATTR,
- bp);
+ &bl);
if (r < 0) {
derr << __func__ << ": getattr returned "
<< cpp_strerror(r) << dendl;
assert(0);
}
- bufferlist bl;
- bl.push_back(bp);
object_info_t oi(bl);
set<snapid_t> oi_snaps(oi.snaps.begin(), oi.snaps.end());
set<snapid_t> cur_snaps;
@@ -2412,9 +2406,8 @@ void PG::log_weirdness()
<< " log bound mismatch, empty but (" << pg_log.get_tail() << ","
<< pg_log.get_head() << "]\n";
} else {
- if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()) || // sloppy check
- (pg_log.get_log().log.rbegin()->version != pg_log.get_head() &&
- !(pg_log.get_head() == pg_log.get_tail())))
+ // sloppy check
+ if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
osd->clog.error() << info.pgid
<< " log bound mismatch, info (" << pg_log.get_tail() << ","
<< pg_log.get_head() << "]"
@@ -3039,9 +3032,9 @@ int PG::build_scrub_map_chunk(
// objects
vector<hobject_t> ls;
- int ret = osd->store->collection_list_range(coll, start, end, 0, &ls);
+ int ret = get_pgbackend()->objects_list_range(start, end, 0, &ls);
if (ret < 0) {
- dout(5) << "collection_list_range error: " << ret << dendl;
+ dout(5) << "objects_list_range error: " << ret << dendl;
return ret;
}
@@ -3561,11 +3554,13 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
hobject_t start = scrubber.start;
while (!boundary_found) {
vector<hobject_t> objects;
- ret = osd->store->collection_list_partial(coll, start,
- cct->_conf->osd_scrub_chunk_min,
- cct->_conf->osd_scrub_chunk_max,
- 0,
- &objects, &scrubber.end);
+ ret = get_pgbackend()->objects_list_partial(
+ start,
+ cct->_conf->osd_scrub_chunk_min,
+ cct->_conf->osd_scrub_chunk_max,
+ 0,
+ &objects,
+ &scrubber.end);
assert(ret >= 0);
// in case we don't find a boundary: start again at the end
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 74809eea268..275d30c7658 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -48,6 +48,7 @@
#include "common/WorkQueue.h"
#include "common/ceph_context.h"
#include "include/str_list.h"
+#include "PGBackend.h"
#include <list>
#include <memory>
@@ -193,6 +194,8 @@ protected:
CephContext *cct;
OSDriver osdriver;
SnapMapper snap_mapper;
+
+ virtual PGBackend *get_pgbackend() = 0;
public:
void update_snap_mapper_bits(uint32_t bits) {
snap_mapper.update_bits(bits);
@@ -439,6 +442,7 @@ protected:
*/
struct BackfillInterval {
// info about a backfill interval on a peer
+ eversion_t version; ///< version at which the scan occurred
map<hobject_t,eversion_t> objects;
hobject_t begin;
hobject_t end;
@@ -447,6 +451,7 @@ protected:
void clear() {
objects.clear();
begin = end = hobject_t();
+ version = eversion_t();
}
void reset(hobject_t start) {
diff --git a/src/osd/PGBackend.h b/src/osd/PGBackend.h
index e3cc05bf345..408c589a08a 100644
--- a/src/osd/PGBackend.h
+++ b/src/osd/PGBackend.h
@@ -205,6 +205,26 @@
virtual void clear_temp_obj(const hobject_t &oid) = 0;
virtual ~PGBackend() {}
+
+ /// List objects in collection
+ virtual int objects_list_partial(
+ const hobject_t &begin,
+ int min,
+ int max,
+ snapid_t seq,
+ vector<hobject_t> *ls,
+ hobject_t *next) = 0;
+
+ virtual int objects_list_range(
+ const hobject_t &start,
+ const hobject_t &end,
+ snapid_t seq,
+ vector<hobject_t> *ls) = 0;
+
+ virtual int objects_get_attr(
+ const hobject_t &hoid,
+ const string &attr,
+ bufferlist *out) = 0;
};
#endif
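
These pure virtuals complete the listing/attr surface of the PGBackend seam: the PG and ReplicatedPG changes below stop touching osd->store directly and go through get_pgbackend(), the hook that lets an erasure-coded backend (per the erasure_coding design docs in this merge) impose its own on-disk object layout. For orientation, a skeletal implementer against the in-tree types, with bodies deliberately stubbed:

// Skeletal, non-functional PGBackend implementer (assumes the in-tree
// headers); it exists only to show the surface a backend must provide.
struct StubBackend : public PGBackend {
  int objects_list_partial(const hobject_t &begin, int min, int max,
                           snapid_t seq, vector<hobject_t> *ls,
                           hobject_t *next) { return -EOPNOTSUPP; }
  int objects_list_range(const hobject_t &start, const hobject_t &end,
                         snapid_t seq, vector<hobject_t> *ls)
  { return -EOPNOTSUPP; }
  int objects_get_attr(const hobject_t &hoid, const string &attr,
                       bufferlist *out) { return -EOPNOTSUPP; }
  // ...plus the pre-existing recovery/temp-object pure virtuals...
};
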
diff --git a/src/osd/PGLog.cc b/src/osd/PGLog.cc
index 6e025f289bc..1949c96fd57 100644
--- a/src/osd/PGLog.cc
+++ b/src/osd/PGLog.cc
@@ -52,13 +52,9 @@ void PGLog::IndexedLog::split_into(
if (log.empty())
tail = head;
- else
- head = log.rbegin()->version;
if (olog->empty())
olog->tail = olog->head;
- else
- olog->head = olog->log.rbegin()->version;
olog->index();
index();
diff --git a/src/osd/ReplicatedBackend.cc b/src/osd/ReplicatedBackend.cc
index 9868e7af2c8..ddc39d70372 100644
--- a/src/osd/ReplicatedBackend.cc
+++ b/src/osd/ReplicatedBackend.cc
@@ -194,3 +194,75 @@ void ReplicatedBackend::on_flushed()
assert(0 == "found garbage in the temp collection");
}
}
+
+
+int ReplicatedBackend::objects_list_partial(
+ const hobject_t &begin,
+ int min,
+ int max,
+ snapid_t seq,
+ vector<hobject_t> *ls,
+ hobject_t *next)
+{
+ vector<ghobject_t> objects;
+ ghobject_t _next;
+ int r = osd->store->collection_list_partial(
+ coll,
+ begin,
+ min,
+ max,
+ seq,
+ &objects,
+ &_next);
+ ls->reserve(objects.size());
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ assert(i->is_degenerate());
+ ls->push_back(i->hobj);
+ }
+ assert(_next.is_degenerate());
+ *next = _next.hobj;
+ return r;
+}
+
+int ReplicatedBackend::objects_list_range(
+ const hobject_t &start,
+ const hobject_t &end,
+ snapid_t seq,
+ vector<hobject_t> *ls)
+{
+ vector<ghobject_t> objects;
+ int r = osd->store->collection_list_range(
+ coll,
+ start,
+ end,
+ seq,
+ &objects);
+ ls->reserve(objects.size());
+ for (vector<ghobject_t>::iterator i = objects.begin();
+ i != objects.end();
+ ++i) {
+ assert(i->is_degenerate());
+ ls->push_back(i->hobj);
+ }
+ return r;
+}
+
+int ReplicatedBackend::objects_get_attr(
+ const hobject_t &hoid,
+ const string &attr,
+ bufferlist *out)
+{
+ bufferptr bp;
+ int r = osd->store->getattr(
+ coll,
+ hoid,
+ attr.c_str(),
+ bp);
+ if (r >= 0 && out) {
+ out->clear();
+ out->push_back(bp);
+ }
+ return r;
+}
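
A hedged usage sketch of the new listing call: walking an entire collection in bounded chunks, the same pagination that do_pg_op() and the backfill scan rely on. Treating is_max() as the end-of-collection sentinel is an assumption about hobject_t here:

// Assumes an in-tree PGBackend *backend; chunk sizes are arbitrary.
hobject_t cur;                        // default-constructed: start of pool
vector<hobject_t> batch;
while (true) {
  hobject_t next;
  batch.clear();
  int r = backend->objects_list_partial(cur, 64, 128, 0 /*seq*/,
                                        &batch, &next);
  if (r < 0)
    break;                            // listing error; caller handles r
  for (vector<hobject_t>::iterator i = batch.begin();
       i != batch.end(); ++i) {
    // ...process *i...
  }
  if (next.is_max())                  // assumed end-of-collection sentinel
    break;
  cur = next;
}
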
diff --git a/src/osd/ReplicatedBackend.h b/src/osd/ReplicatedBackend.h
index e34e55a618e..cc5f060e136 100644
--- a/src/osd/ReplicatedBackend.h
+++ b/src/osd/ReplicatedBackend.h
@@ -148,6 +148,26 @@ public:
f->close_section();
}
}
+
+ /// List objects in collection
+ int objects_list_partial(
+ const hobject_t &begin,
+ int min,
+ int max,
+ snapid_t seq,
+ vector<hobject_t> *ls,
+ hobject_t *next);
+
+ int objects_list_range(
+ const hobject_t &start,
+ const hobject_t &end,
+ snapid_t seq,
+ vector<hobject_t> *ls);
+
+ int objects_get_attr(
+ const hobject_t &hoid,
+ const string &attr,
+ bufferlist *out);
private:
// push
struct PushInfo {
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index fb5e45a1a71..6c8b092ca01 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -398,8 +398,10 @@ bool PGLSPlainFilter::filter(bufferlist& xattr_data, bufferlist& outdata)
bool ReplicatedPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata)
{
bufferlist bl;
-
- int ret = osd->store->getattr(coll_t(info.pgid), sobj, filter->get_xattr().c_str(), bl);
+ int ret = pgbackend->objects_get_attr(
+ sobj,
+ filter->get_xattr(),
+ &bl);
dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
if (ret < 0)
return false;
@@ -639,12 +641,13 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
hobject_t next;
hobject_t current = response.handle;
osr->flush();
- int r = osd->store->collection_list_partial(coll, current,
- list_size,
- list_size,
- snapid,
- &sentries,
- &next);
+ int r = pgbackend->objects_list_partial(
+ current,
+ list_size,
+ list_size,
+ snapid,
+ &sentries,
+ &next);
if (r != 0) {
result = -EINVAL;
break;
@@ -682,13 +685,17 @@ void ReplicatedPG::do_pg_op(OpRequestRef op)
if (snapid != CEPH_NOSNAP) {
bufferlist bl;
if (candidate.snap == CEPH_NOSNAP) {
- osd->store->getattr(coll, candidate, SS_ATTR, bl);
+ pgbackend->objects_get_attr(
+ candidate,
+ SS_ATTR,
+ &bl);
SnapSet snapset(bl);
if (snapid <= snapset.seq)
continue;
} else {
bufferlist attr_bl;
- osd->store->getattr(coll, candidate, OI_ATTR, attr_bl);
+ pgbackend->objects_get_attr(
+ candidate, OI_ATTR, &attr_bl);
object_info_t oi(attr_bl);
vector<snapid_t>::iterator i = find(oi.snaps.begin(),
oi.snaps.end(),
@@ -1536,8 +1543,9 @@ void ReplicatedPG::do_scan(
BackfillInterval bi;
osr->flush();
+ bi.begin = m->begin;
scan_range(
- m->begin, cct->_conf->osd_backfill_scan_min,
+ cct->_conf->osd_backfill_scan_min,
cct->_conf->osd_backfill_scan_max, &bi, handle);
MOSDPGScan *reply = new MOSDPGScan(MOSDPGScan::OP_SCAN_DIGEST,
get_osdmap()->get_epoch(), m->query_epoch,
@@ -2659,7 +2667,10 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
string aname;
bp.copy(op.xattr.name_len, aname);
string name = "_" + aname;
- int r = osd->store->getattr(coll, soid, name.c_str(), osd_op.outdata);
+ int r = pgbackend->objects_get_attr(
+ soid,
+ name,
+ &(osd_op.outdata));
if (r >= 0) {
op.xattr.value_len = r;
result = 0;
@@ -2702,9 +2713,15 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
bufferlist xattr;
if (op.op == CEPH_OSD_OP_CMPXATTR)
- result = osd->store->getattr(coll, soid, name.c_str(), xattr);
+ result = pgbackend->objects_get_attr(
+ soid,
+ name,
+ &xattr);
else
- result = osd->store->getattr(coll, src_obc->obs.oi.soid, name.c_str(), xattr);
+ result = pgbackend->objects_get_attr(
+ src_obc->obs.oi.soid,
+ name,
+ &xattr);
if (result < 0 && result != -EEXIST && result != -ENODATA)
break;
@@ -3675,7 +3692,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
result = -EINVAL;
goto fail;
}
- if (!ctx->copy_op) {
+ if (!ctx->copy_cb) {
// start
pg_t raw_pg;
get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
@@ -3687,13 +3704,18 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
result = -EINVAL;
break;
}
- result = start_copy(ctx, src, src_oloc, src_version);
+ hobject_t temp_target = generate_temp_object();
+ CopyFromCallback *cb = new CopyFromCallback(ctx, temp_target);
+ ctx->copy_cb = cb;
+ result = start_copy(cb, ctx->obc, src, src_oloc, src_version,
+ temp_target);
if (result < 0)
goto fail;
result = -EINPROGRESS;
} else {
// finish
- result = finish_copy(ctx);
+ assert(ctx->copy_cb->get_result() >= 0);
+ result = finish_copyfrom(ctx);
}
}
break;
@@ -3785,37 +3807,35 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
int ret = find_object_context(
hobject_t(soid.oid, soid.get_key(), snapid, soid.hash, info.pgid.pool(), soid.get_namespace()),
&rollback_to, false, &cloneid);
- if (ret) {
- if (-ENOENT == ret || rollback_to->obs.oi.is_whiteout()) {
- // there's no snapshot here, or there's no object.
- // if there's no snapshot, we delete the object; otherwise, do nothing.
- dout(20) << "_rollback_to deleting head on " << soid.oid
- << " because got ENOENT|whiteout on find_object_context" << dendl;
- if (ctx->obc->obs.oi.watchers.size()) {
- // Cannot delete an object with watchers
- ret = -EBUSY;
- } else {
- _delete_head(ctx);
- ret = 0;
- }
- } else if (-EAGAIN == ret) {
- /* a different problem, like degraded pool
- * with not-yet-restored object. We shouldn't have been able
- * to get here; recovery should have completed first! */
- hobject_t rollback_target(soid.oid, soid.get_key(), cloneid, soid.hash,
- info.pgid.pool(), soid.get_namespace());
- assert(is_missing_object(rollback_target));
- dout(20) << "_rollback_to attempted to roll back to a missing object "
- << rollback_target << " (requested snapid: ) " << snapid << dendl;
- wait_for_missing_object(rollback_target, ctx->op);
+ if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
+ // there's no snapshot here, or there's no object.
+ // if there's no snapshot, we delete the object; otherwise, do nothing.
+ dout(20) << "_rollback_to deleting head on " << soid.oid
+ << " because got ENOENT|whiteout on find_object_context" << dendl;
+ if (ctx->obc->obs.oi.watchers.size()) {
+ // Cannot delete an object with watchers
+ ret = -EBUSY;
} else {
- // ummm....huh? It *can't* return anything else at time of writing.
- assert(0);
- }
+ _delete_head(ctx);
+ ret = 0;
+ }
+ } else if (-EAGAIN == ret) {
+ /* a different problem, like degraded pool
+ * with not-yet-restored object. We shouldn't have been able
+ * to get here; recovery should have completed first! */
+ hobject_t rollback_target(soid.oid, soid.get_key(), cloneid, soid.hash,
+ info.pgid.pool(), soid.get_namespace());
+ assert(is_missing_object(rollback_target));
+ dout(20) << "_rollback_to attempted to roll back to a missing object "
+ << rollback_target << " (requested snapid: ) " << snapid << dendl;
+ wait_for_missing_object(rollback_target, ctx->op);
+ } else if (ret) {
+ // ummm....huh? It *can't* return anything else at time of writing.
+ assert(0 == "unexpected error code in _rollback_to");
} else { //we got our context, let's use it to do the rollback!
hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
if (is_degraded_object(rollback_to_sobject)) {
- dout(20) << "_rollback_to attempted to roll back to a degraded object "
+ dout(20) << "_rollback_to attempted to roll back to a degraded object "
<< rollback_to_sobject << " (requested snapid: ) " << snapid << dendl;
wait_for_degraded_object(rollback_to_sobject, ctx->op);
ret = -EAGAIN;
@@ -4292,11 +4312,12 @@ struct C_Copyfrom : public Context {
}
};
-int ReplicatedPG::start_copy(OpContext *ctx,
- hobject_t src, object_locator_t oloc, version_t version)
+int ReplicatedPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
+ hobject_t src, object_locator_t oloc, version_t version,
+ const hobject_t& temp_dest_oid)
{
- const hobject_t& dest = ctx->obs->oi.soid;
- dout(10) << __func__ << " " << dest << " ctx " << ctx
+ const hobject_t& dest = obc->obs.oi.soid;
+ dout(10) << __func__ << " " << dest
<< " from " << src << " " << oloc << " v" << version
<< dendl;
@@ -4308,19 +4329,18 @@ int ReplicatedPG::start_copy(OpContext *ctx,
cancel_copy(cop);
}
- CopyOpRef cop(new CopyOp(ctx, src, oloc, version));
+ CopyOpRef cop(new CopyOp(cb, obc, src, oloc, version, temp_dest_oid));
copy_ops[dest] = cop;
- ctx->copy_op = cop;
- ++ctx->obc->copyfrom_readside;
+ ++obc->copyfrom_readside;
- _copy_some(ctx, cop);
+ _copy_some(obc, cop);
return 0;
}
-void ReplicatedPG::_copy_some(OpContext *ctx, CopyOpRef cop)
+void ReplicatedPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
{
- dout(10) << __func__ << " " << ctx << " " << cop << dendl;
+ dout(10) << __func__ << " " << obc << " " << cop << dendl;
ObjectOperation op;
if (cop->version) {
op.assert_version(cop->version);
@@ -4334,7 +4354,7 @@ void ReplicatedPG::_copy_some(OpContext *ctx, CopyOpRef cop)
&cop->data, &cop->omap,
&cop->rval);
- C_Copyfrom *fin = new C_Copyfrom(this, ctx->obs->oi.soid,
+ C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
get_last_peering_reset());
osd->objecter_lock.Lock();
tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
@@ -4362,50 +4382,48 @@ void ReplicatedPG::process_copy_chunk(hobject_t oid, tid_t tid, int r)
<< " tid " << cop->objecter_tid << dendl;
return;
}
- OpContext *ctx = cop->ctx;
+ ObjectContextRef obc = cop->obc;
cop->objecter_tid = 0;
- if (r < 0) {
- copy_ops.erase(ctx->obc->obs.oi.soid);
- --ctx->obc->copyfrom_readside;
- kick_object_context_blocked(ctx->obc);
- reply_ctx(ctx, r);
- return;
- }
- assert(cop->rval >= 0);
- if (!cop->cursor.is_complete()) {
- // write out what we have so far
- vector<OSDOp> ops;
- tid_t rep_tid = osd->get_tid();
- osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
- OpContext *tctx = new OpContext(OpRequestRef(), reqid, ops, &ctx->obc->obs, ctx->obc->ssc, this);
- tctx->mtime = ceph_clock_now(g_ceph_context);
- RepGather *repop = new_repop(tctx, ctx->obc, rep_tid);
-
- if (cop->temp_cursor.is_initial()) {
- cop->temp_coll = get_temp_coll(&tctx->local_t);
- cop->temp_oid = generate_temp_object();
- repop->ctx->new_temp_oid = cop->temp_oid;
- }
+ CopyResults results;
+ if (r >= 0) {
+ assert(cop->rval >= 0);
+
+ if (!cop->cursor.is_complete()) {
+ // write out what we have so far
+ vector<OSDOp> ops;
+ tid_t rep_tid = osd->get_tid();
+ osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
+ OpContext *tctx = new OpContext(OpRequestRef(), reqid, ops, &obc->obs, obc->ssc, this);
+ tctx->mtime = ceph_clock_now(g_ceph_context);
+ RepGather *repop = new_repop(tctx, obc, rep_tid);
+
+ if (cop->temp_cursor.is_initial()) {
+ cop->temp_coll = get_temp_coll(&tctx->local_t);
+ repop->ctx->new_temp_oid = cop->temp_oid;
+ }
- _write_copy_chunk(cop, &tctx->op_t);
+ _write_copy_chunk(cop, &tctx->op_t);
- issue_repop(repop, repop->ctx->mtime);
- eval_repop(repop);
- repop->put();
+ issue_repop(repop, repop->ctx->mtime);
+ eval_repop(repop);
+ repop->put();
- dout(10) << __func__ << " fetching more" << dendl;
- _copy_some(ctx, cop);
- return;
+ dout(10) << __func__ << " fetching more" << dendl;
+ _copy_some(obc, cop);
+ return;
+ }
+ _build_finish_copy_transaction(cop, results.get<3>());
+ results.get<1>() = cop->temp_cursor.data_offset;
}
dout(20) << __func__ << " complete; committing" << dendl;
- execute_ctx(ctx);
+ results.get<0>() = r;
+ cop->cb->complete(results);
- copy_ops.erase(ctx->obc->obs.oi.soid);
- --ctx->obc->copyfrom_readside;
- ctx->copy_op.reset();
- kick_object_context_blocked(ctx->obc);
+ copy_ops.erase(obc->obs.oi.soid);
+ --obc->copyfrom_readside;
+ kick_object_context_blocked(obc);
}
void ReplicatedPG::_write_copy_chunk(CopyOpRef cop, ObjectStore::Transaction *t)
@@ -4432,16 +4450,12 @@ void ReplicatedPG::_write_copy_chunk(CopyOpRef cop, ObjectStore::Transaction *t)
cop->temp_cursor = cop->cursor;
}
-int ReplicatedPG::finish_copy(OpContext *ctx)
+void ReplicatedPG::_build_finish_copy_transaction(CopyOpRef cop,
+ ObjectStore::Transaction& t)
{
- CopyOpRef cop = ctx->copy_op;
- ObjectState& obs = ctx->new_obs;
- ObjectStore::Transaction& t = ctx->op_t;
+ ObjectState& obs = cop->obc->obs;
- if (!obs.exists) {
- ctx->delta_stats.num_objects++;
- obs.exists = true;
- } else {
+ if (obs.exists) {
t.remove(coll, obs.oi.soid);
}
@@ -4455,18 +4469,34 @@ int ReplicatedPG::finish_copy(OpContext *ctx)
_write_copy_chunk(cop, &t);
t.collection_move_rename(cop->temp_coll, cop->temp_oid, coll, obs.oi.soid);
pgbackend->clear_temp_obj(cop->temp_oid);
- ctx->discard_temp_oid = cop->temp_oid;
}
+}
+
+int ReplicatedPG::finish_copyfrom(OpContext *ctx)
+{
+ dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
+ ObjectState& obs = ctx->new_obs;
+ CopyFromCallback *cb = static_cast<CopyFromCallback*>(ctx->copy_cb);
+
+ if (!ctx->obs->exists) {
+ ctx->delta_stats.num_objects++;
+ obs.exists = true;
+ }
+ if (cb->is_temp_obj_used()) {
+ ctx->discard_temp_oid = cb->temp_obj;
+ }
+ ctx->op_t.swap(cb->results.get<3>());
+ ctx->op_t.append(cb->results.get<3>());
interval_set<uint64_t> ch;
if (obs.oi.size > 0)
ch.insert(0, obs.oi.size);
ctx->modified_ranges.union_of(ch);
- if (cop->cursor.data_offset != obs.oi.size) {
+ if (cb->get_data_size() != obs.oi.size) {
ctx->delta_stats.num_bytes -= obs.oi.size;
+ obs.oi.size = cb->get_data_size();
ctx->delta_stats.num_bytes += obs.oi.size;
- obs.oi.size = cop->cursor.data_offset;
}
ctx->delta_stats.num_wr++;
ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10);
@@ -4476,8 +4506,7 @@ int ReplicatedPG::finish_copy(OpContext *ctx)
void ReplicatedPG::cancel_copy(CopyOpRef cop)
{
- OpContext *ctx = cop->ctx;
- dout(10) << __func__ << " " << ctx->obc->obs.oi.soid << " ctx " << ctx
+ dout(10) << __func__ << " " << cop->obc->obs.oi.soid
<< " from " << cop->src << " " << cop->oloc << " v" << cop->version
<< dendl;
@@ -4487,13 +4516,13 @@ void ReplicatedPG::cancel_copy(CopyOpRef cop)
osd->objecter->op_cancel(cop->objecter_tid);
}
- copy_ops.erase(ctx->obc->obs.oi.soid);
- --ctx->obc->copyfrom_readside;
- ctx->copy_op.reset();
-
- kick_object_context_blocked(ctx->obc);
+ copy_ops.erase(cop->obc->obs.oi.soid);
+ --cop->obc->copyfrom_readside;
- delete ctx;
+ kick_object_context_blocked(cop->obc);
+ bool temp_obj_created = !cop->cursor.is_initial();
+ CopyResults result(-ECANCELED, 0, temp_obj_created, ObjectStore::Transaction());
+ cop->cb->complete(result);
}
void ReplicatedPG::cancel_copy_ops()
@@ -4552,10 +4581,19 @@ void ReplicatedPG::apply_repop(RepGather *repop)
if (repop->ctx->clone_obc)
repop->ctx->clone_obc->ondisk_write_lock();
+ bool unlock_snapset_obc = false;
+ if (repop->ctx->snapset_obc && repop->ctx->snapset_obc->obs.oi.soid !=
+ repop->obc->obs.oi.soid) {
+ repop->ctx->snapset_obc->ondisk_write_lock();
+ unlock_snapset_obc = true;
+ }
+
Context *oncommit = new C_OSD_OpCommit(this, repop);
Context *onapplied = new C_OSD_OpApplied(this, repop);
- Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(repop->obc,
- repop->ctx->clone_obc);
+ Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(
+ repop->obc,
+ repop->ctx->clone_obc,
+ unlock_snapset_obc ? repop->ctx->snapset_obc : ObjectContextRef());
int r = osd->store->queue_transactions(osr.get(), repop->tls, onapplied, oncommit, onapplied_sync, repop->ctx->op);
if (r) {
derr << "apply_repop queue_transactions returned " << r << " on " << *repop << dendl;
@@ -5145,7 +5183,7 @@ ObjectContextRef ReplicatedPG::get_object_context(const hobject_t& soid,
assert(attrs->count(OI_ATTR));
bv.push_back(attrs->find(OI_ATTR)->second);
} else {
- int r = osd->store->getattr(coll, soid, OI_ATTR, bv);
+ int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
if (r < 0) {
if (!can_create)
return ObjectContextRef(); // -ENOENT!
@@ -5409,12 +5447,12 @@ SnapSetContext *ReplicatedPG::get_snapset_context(
if (!attrs) {
hobject_t head(oid, key, CEPH_NOSNAP, seed,
info.pgid.pool(), nspace);
- int r = osd->store->getattr(coll, head, SS_ATTR, bv);
+ int r = pgbackend->objects_get_attr(head, SS_ATTR, &bv);
if (r < 0) {
// try _snapset
hobject_t snapdir(oid, key, CEPH_SNAPDIR, seed,
info.pgid.pool(), nspace);
- r = osd->store->getattr(coll, snapdir, SS_ATTR, bv);
+ r = pgbackend->objects_get_attr(snapdir, SS_ATTR, &bv);
if (r < 0 && !can_create)
return NULL;
}
@@ -7791,6 +7829,8 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
int peer = acting[i];
map<int, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
assert(pm != peer_missing.end());
+ map<int, pg_info_t>::const_iterator pi = peer_info.find(peer);
+ assert(pi != peer_info.end());
size_t m_sz = pm->second.num_missing();
dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
@@ -7804,6 +7844,15 @@ int ReplicatedPG::recover_replicas(int max, ThreadPool::TPHandle &handle)
handle.reset_tp_timeout();
const hobject_t soid(p->second);
+ if (soid > pi->second.last_backfill) {
+ if (!recovering.count(soid)) {
+ derr << __func__ << ": object added to missing set for backfill, but "
+ << "is not in recovering, error!" << dendl;
+ assert(0);
+ }
+ continue;
+ }
+
if (recovering.count(soid)) {
dout(10) << __func__ << ": already recovering" << soid << dendl;
continue;
@@ -7871,17 +7920,12 @@ int ReplicatedPG::recover_backfill(
<< " interval " << pbi.begin << "-" << pbi.end
<< " " << pbi.objects.size() << " objects" << dendl;
- int local_min = osd->store->get_ideal_list_min();
- int local_max = osd->store->get_ideal_list_max();
+ int local_min = cct->_conf->osd_backfill_scan_min;
+ int local_max = cct->_conf->osd_backfill_scan_max;
- // re-scan our local interval to cope with recent changes
- // FIXME: we could track the eversion_t when we last scanned, and invalidate
- // that way. or explicitly modify/invalidate when we actually change specific
- // objects.
- dout(10) << " rescanning local backfill_info from " << backfill_pos << dendl;
- backfill_info.clear();
- osr->flush();
- scan_range(backfill_pos, local_min, local_max, &backfill_info, handle);
+ // update our local interval to cope with recent changes
+ backfill_info.begin = backfill_pos;
+ update_range(&backfill_info, handle);
int ops = 0;
map<hobject_t, pair<eversion_t, eversion_t> > to_push;
@@ -7895,7 +7939,8 @@ int ReplicatedPG::recover_backfill(
if (backfill_info.begin <= pbi.begin &&
!backfill_info.extends_to_end() && backfill_info.empty()) {
osr->flush();
- scan_range(backfill_info.end, local_min, local_max, &backfill_info,
+ backfill_info.begin = backfill_info.end;
+ scan_range(local_min, local_max, &backfill_info,
handle);
backfill_info.trim();
}
@@ -8056,26 +8101,81 @@ void ReplicatedPG::prep_backfill_object_push(
start_recovery_op(oid);
recovering.insert(oid);
ObjectContextRef obc = get_object_context(oid, false);
+
+ // We need to take the read_lock here in order to flush in-progress writes
+ obc->ondisk_read_lock();
pgbackend->recover_object(
oid,
ObjectContextRef(),
obc,
h);
+ obc->ondisk_read_unlock();
+}
+
+void ReplicatedPG::update_range(
+ BackfillInterval *bi,
+ ThreadPool::TPHandle &handle)
+{
+ int local_min = cct->_conf->osd_backfill_scan_min;
+ int local_max = cct->_conf->osd_backfill_scan_max;
+ if (bi->version >= info.last_update) {
+ dout(10) << __func__<< ": bi is current " << dendl;
+ assert(bi->version == info.last_update);
+ } else if (bi->version >= info.log_tail) {
+ assert(!pg_log.get_log().empty());
+ dout(10) << __func__<< ": bi is old, (" << bi->version
+ << ") can be updated with log" << dendl;
+ list<pg_log_entry_t>::const_iterator i =
+ pg_log.get_log().log.end();
+ --i;
+ while (i != pg_log.get_log().log.begin() &&
+ i->version > bi->version) {
+ --i;
+ }
+ if (i->version == bi->version)
+ ++i;
+
+ assert(i != pg_log.get_log().log.end());
+ dout(10) << __func__ << ": updating from version " << i->version
+ << dendl;
+ for (; i != pg_log.get_log().log.end(); ++i) {
+ const hobject_t &soid = i->soid;
+ if (soid >= bi->begin && soid < bi->end) {
+ if (i->is_update()) {
+ dout(10) << __func__ << ": " << i->soid << " updated to version "
+ << i->version << dendl;
+ bi->objects.erase(i->soid);
+ bi->objects.insert(
+ make_pair(
+ i->soid,
+ i->version));
+ } else if (i->is_delete()) {
+ dout(10) << __func__ << ": " << i->soid << " removed" << dendl;
+ bi->objects.erase(i->soid);
+ }
+ }
+ }
+ bi->version = info.last_update;
+ } else {
+ dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
+ << dendl;
+ osr->flush();
+ scan_range(local_min, local_max, bi, handle);
+ }
}
void ReplicatedPG::scan_range(
- hobject_t begin, int min, int max, BackfillInterval *bi,
+ int min, int max, BackfillInterval *bi,
ThreadPool::TPHandle &handle)
{
assert(is_locked());
- dout(10) << "scan_range from " << begin << dendl;
- bi->begin = begin;
+ dout(10) << "scan_range from " << bi->begin << dendl;
+ bi->version = info.last_update;
bi->objects.clear(); // for good measure
vector<hobject_t> ls;
ls.reserve(max);
- int r = osd->store->collection_list_partial(coll, begin, min, max,
- 0, &ls, &bi->end);
+ int r = pgbackend->objects_list_partial(bi->begin, min, max, 0, &ls, &bi->end);
assert(r >= 0);
dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
dout(20) << ls << dendl;
@@ -8090,7 +8190,7 @@ void ReplicatedPG::scan_range(
dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
} else {
bufferlist bl;
- int r = osd->store->getattr(coll, *p, OI_ATTR, bl);
+ int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
assert(r >= 0);
object_info_t oi(bl);
bi->objects[*p] = oi.version;
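
update_range() above replaces the unconditional rescan that recover_backfill() used to perform: while the interval's recorded version is still covered by the pg log, replaying the newer log entries over the scanned map is sufficient, and a disk scan is only needed once the interval predates the log tail. The replay step, reduced to its essentials with illustrative types:

#include <map>
#include <string>
#include <vector>

// 'entry_t' abstracts pg_log_entry_t for illustration only.
struct entry_t {
  std::string soid;
  unsigned version;
  bool is_update;     // true = create/modify, false = delete
};

// Apply log entries newer than 'from_version' to a scanned interval map
// covering [begin, end); mirrors the loop in update_range().
void replay(std::map<std::string, unsigned> &objects,  // soid -> version
            const std::string &begin, const std::string &end,
            const std::vector<entry_t> &log, unsigned from_version) {
  for (std::vector<entry_t>::const_iterator i = log.begin();
       i != log.end(); ++i) {
    if (i->version <= from_version)
      continue;                        // already reflected in the scan
    if (i->soid < begin || i->soid >= end)
      continue;                        // outside this backfill interval
    if (i->is_update)
      objects[i->soid] = i->version;   // record the latest version
    else
      objects.erase(i->soid);          // deleted since the scan
  }
}
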
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index 05edcef6adf..c277c0d3f86 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -18,6 +18,7 @@
#define CEPH_REPLICATEDPG_H
#include <boost/optional.hpp>
+#include <boost/tuple/tuple.hpp>
#include "include/assert.h"
#include "common/cmdparse.h"
@@ -93,9 +94,11 @@ public:
* state associated with a copy operation
*/
struct OpContext;
+ class CopyCallback;
struct CopyOp {
- OpContext *ctx;
+ CopyCallback *cb;
+ ObjectContextRef obc;
hobject_t src;
object_locator_t oloc;
version_t version;
@@ -114,16 +117,86 @@ public:
hobject_t temp_oid;
object_copy_cursor_t temp_cursor;
- CopyOp(OpContext *c, hobject_t s, object_locator_t l, version_t v)
- : ctx(c), src(s), oloc(l), version(v),
+ CopyOp(CopyCallback *cb_, ObjectContextRef _obc, hobject_t s, object_locator_t l,
+ version_t v, const hobject_t& dest)
+ : cb(cb_), obc(_obc), src(s), oloc(l), version(v),
objecter_tid(0),
size(0),
- rval(-1)
+ rval(-1),
+ temp_oid(dest)
{}
};
typedef boost::shared_ptr<CopyOp> CopyOpRef;
+ /**
+ * The CopyCallback class defines an interface for completions to the
+ * copy_start code. Users of the copy infrastructure must implement
+ * one and give an instance of the class to start_copy.
+ *
+ * The implementer is responsible for making sure that the CopyCallback
+ * can associate itself with the correct copy operation. The presence
+ * of the closing Transaction ensures that write operations can be performed
+ * atomically with the copy being completed (which doing them in separate
+ * transactions would not allow); if you are doing the copy for a read
+ * op you will have to generate a separate op to finish the copy with.
+ */
+ /// return code, total object size, data in temp object?, final Transaction
+ typedef boost::tuple<int, size_t, bool, ObjectStore::Transaction> CopyResults;
+ class CopyCallback : public GenContext<CopyResults&> {
+ protected:
+ CopyCallback() {}
+ /**
+ * results.get<0>() is the return code: 0 for success; -ECANCELED if
+ * the operation was cancelled by the local OSD; -errno for other issues.
+ * results.get<1>() is the total size of the object (for updating pg stats)
+ * results.get<2>() indicates whether we have already written data to
+ * the temp object (so it needs to get cleaned up, if the return code
+ * indicates a failure)
+ * results.get<3>() is a Transaction; if non-empty you need to apply it
+ * before any other accesses to the object in order to complete the copy.
+ */
+ virtual void finish(CopyResults& results_) = 0;
+
+ public:
+ virtual ~CopyCallback() {}
+ };
+
+ class CopyFromCallback: public CopyCallback {
+ public:
+ CopyResults results;
+ OpContext *ctx;
+ hobject_t temp_obj;
+ CopyFromCallback(OpContext *ctx_, const hobject_t& temp_obj_) :
+ ctx(ctx_), temp_obj(temp_obj_) {}
+ ~CopyFromCallback() {}
+
+ virtual void finish(CopyResults& results_) {
+ results = results_;
+ int r = results.get<0>();
+ if (r >= 0) {
+ ctx->pg->execute_ctx(ctx);
+ }
+ ctx->copy_cb = NULL;
+ if (r < 0) {
+ if (r != -ECANCELED) { // on cancel just toss it out; client resends
+ ctx->pg->osd->reply_op_error(ctx->op, r);
+ }
+ delete ctx;
+ }
+ }
+
+ bool is_temp_obj_used() { return results.get<2>(); }
+ uint64_t get_data_size() { return results.get<1>(); }
+ int get_result() { return results.get<0>(); }
+ };
+ friend class CopyFromCallback;
+
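A minimal sketch of a CopyCallback implementation, assuming only the CopyResults layout documented above (ExampleCopyCallback and the cleanup/apply steps are hypothetical); an instance would be handed to start_copy() as declared further below:

    class ExampleCopyCallback : public CopyCallback {
      void finish(CopyResults& results) {
        int r = results.get<0>();
        if (r == -ECANCELED)
          return;                          // cancelled locally; the client resends
        if (r < 0 && results.get<2>()) {
          // failure after data reached the temp object: clean the temp object up
        } else if (r >= 0) {
          // apply the Transaction in results.get<3>() before any other access
          // to the object, so the copy completes atomically
        }
      }
    };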
boost::scoped_ptr<PGBackend> pgbackend;
+ PGBackend *get_pgbackend() {
+ return pgbackend.get();
+ }
/// Listener methods
void on_local_recover_start(
@@ -297,7 +370,7 @@ public:
int num_read; ///< count read ops
int num_write; ///< count update ops
- CopyOpRef copy_op;
+ CopyFromCallback *copy_cb;
hobject_t new_temp_oid, discard_temp_oid; ///< temp objects we should start/stop tracking
@@ -314,7 +387,8 @@ public:
current_osd_subop_num(0),
data_off(0), reply(NULL), pg(_pg),
num_read(0),
- num_write(0) {
+ num_write(0),
+ copy_cb(NULL) {
if (_ssc) {
new_snapset = _ssc->snapset;
snapset = &_ssc->snapset;
@@ -619,10 +693,16 @@ protected:
* @bi [out] resulting map of objects to eversion_t's
*/
void scan_range(
- hobject_t begin, int min, int max, BackfillInterval *bi,
+ int min, int max, BackfillInterval *bi,
ThreadPool::TPHandle &handle
);
+ /// Update a hash range to reflect changes since the last scan
+ void update_range(
+ BackfillInterval *bi, ///< [in,out] interval to update
+ ThreadPool::TPHandle &handle ///< [in] tp handle
+ );
+
void prep_backfill_object_push(
hobject_t oid, eversion_t v, eversion_t have, int peer,
PGBackend::RecoveryHandle *h);
@@ -662,12 +742,17 @@ protected:
}
};
struct C_OSD_OndiskWriteUnlock : public Context {
- ObjectContextRef obc, obc2;
- C_OSD_OndiskWriteUnlock(ObjectContextRef o, ObjectContextRef o2 = ObjectContextRef()) : obc(o), obc2(o2) {}
+ ObjectContextRef obc, obc2, obc3;
+ C_OSD_OndiskWriteUnlock(
+ ObjectContextRef o,
+ ObjectContextRef o2 = ObjectContextRef(),
+ ObjectContextRef o3 = ObjectContextRef()) : obc(o), obc2(o2), obc3(o3) {}
void finish(int r) {
obc->ondisk_write_unlock();
if (obc2)
obc2->ondisk_write_unlock();
+ if (obc3)
+ obc3->ondisk_write_unlock();
}
};
struct C_OSD_OndiskWriteUnlockList : public Context {
@@ -723,11 +808,15 @@ protected:
// -- copyfrom --
map<hobject_t, CopyOpRef> copy_ops;
- int start_copy(OpContext *ctx, hobject_t src, object_locator_t oloc, version_t version);
+ int start_copy(CopyCallback *cb, ObjectContextRef obc, hobject_t src,
+ object_locator_t oloc, version_t version,
+ const hobject_t& temp_dest_oid);
void process_copy_chunk(hobject_t oid, tid_t tid, int r);
void _write_copy_chunk(CopyOpRef cop, ObjectStore::Transaction *t);
- void _copy_some(OpContext *ctx, CopyOpRef cop);
- int finish_copy(OpContext *ctx);
+ void _copy_some(ObjectContextRef obc, CopyOpRef cop);
+ void _build_finish_copy_transaction(CopyOpRef cop,
+ ObjectStore::Transaction& t);
+ int finish_copyfrom(OpContext *ctx);
void cancel_copy(CopyOpRef cop);
void cancel_copy_ops();
diff --git a/src/rbd_fuse/rbd-fuse.c b/src/rbd_fuse/rbd-fuse.c
index eea6edb9eb8..2a6a8d22e81 100644
--- a/src/rbd_fuse/rbd-fuse.c
+++ b/src/rbd_fuse/rbd-fuse.c
@@ -1,7 +1,7 @@
/*
* rbd-fuse
*/
-#define FUSE_USE_VERSION 26
+#define FUSE_USE_VERSION 30
#include "include/int_types.h"
diff --git a/src/rgw/rgw_acl.cc b/src/rgw/rgw_acl.cc
index 3f99d72cd5b..02504524847 100644
--- a/src/rgw/rgw_acl.cc
+++ b/src/rgw/rgw_acl.cc
@@ -79,7 +79,7 @@ int RGWAccessControlPolicy::get_perm(string& id, int perm_mask) {
if ((perm & perm_mask) != perm_mask) {
perm |= acl.get_group_perm(ACL_GROUP_ALL_USERS, perm_mask);
- if (compare_group_name(id, ACL_GROUP_ALL_USERS) != 0) {
+ if (!compare_group_name(id, ACL_GROUP_ALL_USERS)) {
/* this is not the anonymous user */
perm |= acl.get_group_perm(ACL_GROUP_AUTHENTICATED_USERS, perm_mask);
}
diff --git a/src/rgw/rgw_main.cc b/src/rgw/rgw_main.cc
index 54db609521c..2e0245587c9 100644
--- a/src/rgw/rgw_main.cc
+++ b/src/rgw/rgw_main.cc
@@ -461,7 +461,7 @@ int main(int argc, const char **argv)
/* alternative default for module */
vector<const char *> def_args;
- def_args.push_back("--debug-rgw=20");
+ def_args.push_back("--debug-rgw=1/5");
def_args.push_back("--keyring=$rgw_data/keyring");
def_args.push_back("--log-file=/var/log/radosgw/$cluster-$name");
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index 114b8709a22..fc4ad6d3511 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -1604,6 +1604,13 @@ void RGWPutMetadata::execute()
}
}
+ map<string, string>::iterator giter;
+ for (giter = s->generic_attrs.begin(); giter != s->generic_attrs.end(); ++giter) {
+ bufferlist& attrbl = attrs[giter->first];
+ const string& val = giter->second;
+ attrbl.append(val.c_str(), val.size() + 1);
+ }
+
if (has_policy) {
policy.encode(bl);
attrs[RGW_ATTR_ACL] = bl;
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index 8b4d18f4e68..6d2cc9159a6 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -82,18 +82,26 @@ void RGWDefaultRegionInfo::decode_json(JSONObj *obj) {
JSONDecoder::decode_json("default_region", default_region, obj);
}
-string RGWRegion::get_pool_name(CephContext *cct)
+int RGWRegion::get_pool_name(CephContext *cct, string *pool_name)
{
- string pool_name = cct->_conf->rgw_region_root_pool;
- if (pool_name.empty()) {
- pool_name = RGW_DEFAULT_REGION_ROOT_POOL;
+ *pool_name = cct->_conf->rgw_region_root_pool;
+ if (pool_name->empty()) {
+ *pool_name = RGW_DEFAULT_REGION_ROOT_POOL;
+ } else if ((*pool_name)[0] != '.') {
+ derr << "ERROR: region root pool name must start with a period" << dendl;
+ return -EINVAL;
}
- return pool_name;
+ return 0;
}
int RGWRegion::read_default(RGWDefaultRegionInfo& default_info)
{
- string pool_name = get_pool_name(cct);
+ string pool_name;
+
+ int ret = get_pool_name(cct, &pool_name);
+ if (ret < 0) {
+ return ret;
+ }
string oid = cct->_conf->rgw_default_region_info_oid;
if (oid.empty()) {
@@ -102,7 +110,7 @@ int RGWRegion::read_default(RGWDefaultRegionInfo& default_info)
rgw_bucket pool(pool_name.c_str());
bufferlist bl;
- int ret = rgw_get_system_obj(store, NULL, pool, oid, bl, NULL, NULL);
+ ret = rgw_get_system_obj(store, NULL, pool, oid, bl, NULL, NULL);
if (ret < 0)
return ret;
@@ -121,7 +129,10 @@ int RGWRegion::read_default(RGWDefaultRegionInfo& default_info)
int RGWRegion::set_as_default()
{
- string pool_name = get_pool_name(cct);
+ string pool_name;
+ int ret = get_pool_name(cct, &pool_name);
+ if (ret < 0)
+ return ret;
string oid = cct->_conf->rgw_default_region_info_oid;
if (oid.empty()) {
@@ -136,7 +147,7 @@ int RGWRegion::set_as_default()
::encode(default_info, bl);
- int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), false, NULL, 0, NULL);
+ ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), false, NULL, 0, NULL);
if (ret < 0)
return ret;
@@ -185,7 +196,11 @@ int RGWRegion::init(CephContext *_cct, RGWRados *_store, bool setup_region)
int RGWRegion::read_info(const string& region_name)
{
- string pool_name = get_pool_name(cct);
+ string pool_name;
+ int ret = get_pool_name(cct, &pool_name);
+ if (ret < 0)
+ return ret;
+
rgw_bucket pool(pool_name.c_str());
bufferlist bl;
@@ -193,7 +208,7 @@ int RGWRegion::read_info(const string& region_name)
string oid = region_info_oid_prefix + name;
- int ret = rgw_get_system_obj(store, NULL, pool, oid, bl, NULL, NULL);
+ ret = rgw_get_system_obj(store, NULL, pool, oid, bl, NULL, NULL);
if (ret < 0) {
lderr(cct) << "failed reading region info from " << pool << ":" << oid << ": " << cpp_strerror(-ret) << dendl;
return ret;
@@ -246,7 +261,10 @@ int RGWRegion::create_default()
int RGWRegion::store_info(bool exclusive)
{
- string pool_name = get_pool_name(cct);
+ string pool_name;
+ int ret = get_pool_name(cct, &pool_name);
+ if (ret < 0)
+ return ret;
rgw_bucket pool(pool_name.c_str());
@@ -254,7 +272,7 @@ int RGWRegion::store_info(bool exclusive)
bufferlist bl;
::encode(*this, bl);
- int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, 0, NULL);
+ ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, 0, NULL);
return ret;
}
@@ -293,13 +311,17 @@ void RGWZoneParams::init_default(RGWRados *store)
}
}
-string RGWZoneParams::get_pool_name(CephContext *cct)
+int RGWZoneParams::get_pool_name(CephContext *cct, string *pool_name)
{
- string pool_name = cct->_conf->rgw_zone_root_pool;
- if (pool_name.empty()) {
- pool_name = RGW_DEFAULT_ZONE_ROOT_POOL;
+ *pool_name = cct->_conf->rgw_zone_root_pool;
+ if (pool_name->empty()) {
+ *pool_name = RGW_DEFAULT_ZONE_ROOT_POOL;
+ } else if ((*pool_name)[0] != '.') {
+ derr << "ERROR: zone root pool name must start with a period" << dendl;
+ return -EINVAL;
}
- return pool_name;
+
+ return 0;
}
void RGWZoneParams::init_name(CephContext *cct, RGWRegion& region)
@@ -319,13 +341,16 @@ int RGWZoneParams::init(CephContext *cct, RGWRados *store, RGWRegion& region)
{
init_name(cct, region);
- string pool_name = get_pool_name(cct);
+ string pool_name;
+ int ret = get_pool_name(cct, &pool_name);
+ if (ret < 0)
+ return ret;
rgw_bucket pool(pool_name.c_str());
bufferlist bl;
string oid = zone_info_oid_prefix + name;
- int ret = rgw_get_system_obj(store, NULL, pool, oid, bl, NULL, NULL);
+ ret = rgw_get_system_obj(store, NULL, pool, oid, bl, NULL, NULL);
if (ret < 0)
return ret;
@@ -344,14 +369,17 @@ int RGWZoneParams::store_info(CephContext *cct, RGWRados *store, RGWRegion& regi
{
init_name(cct, region);
- string pool_name = get_pool_name(cct);
+ string pool_name;
+ int ret = get_pool_name(cct, &pool_name);
+ if (ret < 0)
+ return ret;
rgw_bucket pool(pool_name.c_str());
string oid = zone_info_oid_prefix + name;
bufferlist bl;
::encode(*this, bl);
- int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), false, NULL, 0, NULL);
+ ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), false, NULL, 0, NULL);
return ret;
}
@@ -1025,14 +1053,20 @@ int RGWRados::list_raw_prefixed_objs(string pool_name, const string& prefix, lis
int RGWRados::list_regions(list<string>& regions)
{
- string pool_name = RGWRegion::get_pool_name(cct);
+ string pool_name;
+ int ret = RGWRegion::get_pool_name(cct, &pool_name);
+ if (ret < 0)
+ return ret;
return list_raw_prefixed_objs(pool_name, region_info_oid_prefix, regions);
}
int RGWRados::list_zones(list<string>& zones)
{
- string pool_name = RGWZoneParams::get_pool_name(cct);
+ string pool_name;
+ int ret = RGWZoneParams::get_pool_name(cct, &pool_name);
+ if (ret < 0)
+ return ret;
return list_raw_prefixed_objs(pool_name, zone_info_oid_prefix, zones);
}
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
index 65765c414aa..72f0675e762 100644
--- a/src/rgw/rgw_rados.h
+++ b/src/rgw/rgw_rados.h
@@ -433,7 +433,7 @@ struct RGWZoneParams {
map<string, RGWZonePlacementInfo> placement_pools;
- static string get_pool_name(CephContext *cct);
+ static int get_pool_name(CephContext *cct, string *pool_name);
void init_name(CephContext *cct, RGWRegion& region);
int init(CephContext *cct, RGWRados *store, RGWRegion& region);
void init_default(RGWRados *store);
@@ -622,7 +622,7 @@ struct RGWRegion {
int set_as_default();
int equals(const string& other_region);
- static string get_pool_name(CephContext *cct);
+ static int get_pool_name(CephContext *cct, string *pool_name);
void dump(Formatter *f) const;
void decode_json(JSONObj *obj);
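The get_pool_name() rework above turns a string-returning helper into one that reports errors, so a region or zone root pool name without the required leading period is rejected with -EINVAL instead of being used silently. Every call site now follows the pattern shown in the hunks above; a condensed sketch:

    string pool_name;
    int ret = RGWRegion::get_pool_name(cct, &pool_name);
    if (ret < 0)
      return ret;                      // -EINVAL: name did not start with '.'
    rgw_bucket pool(pool_name.c_str());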
diff --git a/src/test/Makefile.am b/src/test/Makefile.am
index debe18ff907..84a228f1d4b 100644
--- a/src/test/Makefile.am
+++ b/src/test/Makefile.am
@@ -258,6 +258,11 @@ unittest_addrs_CXXFLAGS = $(UNITTEST_CXXFLAGS)
unittest_addrs_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
check_PROGRAMS += unittest_addrs
+unittest_bloom_filter_SOURCES = test/common/test_bloom_filter.cc
+unittest_bloom_filter_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+unittest_bloom_filter_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
+check_PROGRAMS += unittest_bloom_filter
+
unittest_sharedptr_registry_SOURCES = test/common/test_sharedptr_registry.cc
unittest_sharedptr_registry_CXXFLAGS = $(UNITTEST_CXXFLAGS)
unittest_sharedptr_registry_LDADD = $(UNITTEST_LDADD) $(CEPH_GLOBAL)
@@ -851,11 +856,6 @@ ceph_test_keyvaluedb_iterators_LDADD = $(LIBOS) $(UNITTEST_LDADD) $(CEPH_GLOBAL)
ceph_test_keyvaluedb_iterators_CXXFLAGS = $(UNITTEST_CXXFLAGS)
bin_DEBUGPROGRAMS += ceph_test_keyvaluedb_iterators
-ceph_test_store_tool_SOURCES = test/ObjectMap/test_store_tool/test_store_tool.cc
-ceph_test_store_tool_LDADD = $(LIBOS) $(CEPH_GLOBAL)
-ceph_test_store_tool_CXXFLAGS = $(UNITTEST_CXXFLAGS)
-bin_DEBUGPROGRAMS += ceph_test_store_tool
-
ceph_test_cfuse_cache_invalidate_SOURCES = test/test_cfuse_cache_invalidate.cc
bin_DEBUGPROGRAMS += ceph_test_cfuse_cache_invalidate
diff --git a/src/test/ObjectMap/test_store_tool/test_store_tool.cc b/src/test/ObjectMap/test_store_tool/test_store_tool.cc
deleted file mode 100644
index f81598ccfb8..00000000000
--- a/src/test/ObjectMap/test_store_tool/test_store_tool.cc
+++ /dev/null
@@ -1,196 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
-* Ceph - scalable distributed file system
-*
-* Copyright (C) 2012 Inktank, Inc.
-*
-* This is free software; you can redistribute it and/or
-* modify it under the terms of the GNU Lesser General Public
-* License version 2.1, as published by the Free Software
-* Foundation. See file COPYING.
-*/
-#include <iostream>
-#include <string>
-#include <sstream>
-#include <map>
-#include <set>
-#include <boost/scoped_ptr.hpp>
-
-#include "os/LevelDBStore.h"
-
-#include "common/ceph_argparse.h"
-#include "global/global_init.h"
-#include "common/errno.h"
-#include "common/safe_io.h"
-#include "common/config.h"
-
-using namespace std;
-
-class StoreTool
-{
- boost::scoped_ptr<KeyValueDB> db;
-
- public:
- StoreTool(const string &path) {
- LevelDBStore *db_ptr = new LevelDBStore(g_ceph_context, path);
- assert(!db_ptr->open(std::cerr));
- db.reset(db_ptr);
- }
-
- void list(const string &prefix) {
- KeyValueDB::WholeSpaceIterator iter = db->get_iterator();
-
- if (prefix.empty())
- iter->seek_to_first();
- else
- iter->seek_to_first(prefix);
-
- while (iter->valid()) {
- pair<string,string> rk = iter->raw_key();
- if (!prefix.empty() && (rk.first != prefix))
- break;
-
- std::cout << rk.first << ":" << rk.second << std::endl;
- iter->next();
- }
- }
-
- bool exists(const string &prefix) {
- assert(!prefix.empty());
- KeyValueDB::WholeSpaceIterator iter = db->get_iterator();
- iter->seek_to_first(prefix);
- return (iter->valid() && (iter->raw_key().first == prefix));
- }
-
- bool exists(const string &prefix, const string &key) {
- assert(!prefix.empty());
-
- if (key.empty()) {
- return exists(prefix);
- }
-
- bool exists = false;
- get(prefix, key, exists);
- return exists;
- }
-
- bufferlist get(const string &prefix, const string &key, bool &exists) {
- assert(!prefix.empty() && !key.empty());
-
- map<string,bufferlist> result;
- set<string> keys;
- keys.insert(key);
- db->get(prefix, keys, &result);
-
- if (result.count(key) > 0) {
- exists = true;
- return result[key];
- }
- exists = false;
- return bufferlist();
- }
-
- uint64_t get_size() {
- map<string,uint64_t> extras;
- uint64_t s = db->get_estimated_size(extras);
- for (map<string,uint64_t>::iterator p = extras.begin();
- p != extras.end(); ++p) {
- std::cout << p->first << " - " << p->second << std::endl;
- }
- std::cout << "total: " << s << std::endl;
- return s;
- }
-};
-
-void usage(const char *pname)
-{
- std::cerr << "Usage: " << pname << " <store path> command [args...]\n"
- << "\n"
- << "Commands:\n"
- << " list [prefix]\n"
- << " exists <prefix> [key]\n"
- << " get <prefix> <key>\n"
- << " verify <store path>\n"
- << " get-size\n"
- << std::endl;
-}
-
-int main(int argc, const char *argv[])
-{
- vector<const char*> args;
- argv_to_vec(argc, argv, args);
- env_to_vec(args);
-
- global_init(
- NULL, args,
- CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
- common_init_finish(g_ceph_context);
-
-
- if (args.size() < 2) {
- usage(argv[0]);
- return 1;
- }
-
- string path(args[0]);
- string cmd(args[1]);
-
- std::cout << "path: " << path << " cmd " << cmd << std::endl;
-
- StoreTool st(path);
-
- if (cmd == "list") {
- string prefix;
- if (argc > 3)
- prefix = argv[3];
-
- st.list(prefix);
-
- } else if (cmd == "exists") {
- string key;
- if (argc < 4) {
- usage(argv[0]);
- return 1;
- }
- string prefix(argv[3]);
- if (argc > 4)
- key = argv[4];
-
- bool ret = st.exists(prefix, key);
- std::cout << "(" << prefix << ", " << key << ") "
- << (ret ? "exists" : "does not exist")
- << std::endl;
- return (ret ? 0 : 1);
-
- } else if (cmd == "get") {
- if (argc < 5) {
- usage(argv[0]);
- return 1;
- }
- string prefix(argv[3]);
- string key(argv[4]);
-
- bool exists = false;
- bufferlist bl = st.get(prefix, key, exists);
- std::cout << "(" << prefix << ", " << key << ")";
- if (!exists) {
- std::cout << " does not exist" << std::endl;
- return 1;
- }
- std::cout << std::endl;
- ostringstream os;
- bl.hexdump(os);
- std::cout << os.str() << std::endl;
-
- } else if (cmd == "verify") {
- assert(0);
- } else if (cmd == "get-size") {
- std::cout << "estimated store size: " << st.get_size() << std::endl;
- } else {
- std::cerr << "Unrecognized command: " << cmd << std::endl;
- return 1;
- }
-
- return 0;
-}
diff --git a/src/test/cli-integration/rbd/formatted-output.t b/src/test/cli-integration/rbd/formatted-output.t
index bece14f11f1..707e0749367 100644
--- a/src/test/cli-integration/rbd/formatted-output.t
+++ b/src/test/cli-integration/rbd/formatted-output.t
@@ -39,7 +39,7 @@ For now, use a more inclusive regex.
$ rbd info foo
rbd image 'foo':
\tsize 1024 MB in 256 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 1 (esc)
$ rbd info foo --format json | python -mjson.tool
@@ -67,7 +67,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info foo@snap
rbd image 'foo':
\tsize 1024 MB in 256 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 1 (esc)
\tprotected: False (esc)
@@ -96,7 +96,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info bar
rbd image 'bar':
\tsize 1024 MB in 256 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -131,7 +131,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info bar@snap
rbd image 'bar':
\tsize 512 MB in 128 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -169,7 +169,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info bar@snap2
rbd image 'bar':
\tsize 1024 MB in 256 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -207,7 +207,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info baz
rbd image 'baz':
\tsize 2048 MB in 512 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -241,8 +241,8 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
</image>
$ rbd info quux
rbd image 'quux':
- \tsize 1024 KB in 1 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \tsize 1024 kB in 1 objects (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 1 (esc)
$ rbd info quux --format json | python -mjson.tool
@@ -268,7 +268,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info data/child
rbd image 'child':
\tsize 512 MB in 128 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -303,7 +303,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
$ rbd info data/child@snap
rbd image 'child':
\tsize 512 MB in 128 objects (esc)
- \torder 22 (4096 KB objects) (esc)
+ \torder 22 (4096 kB objects) (esc)
[^^]+ (re)
\tformat: 2 (esc)
\tfeatures: layering (esc)
@@ -375,7 +375,7 @@ whenever it is run. grep -v to ignore it, but still work on other distros.
NAME SIZE PARENT FMT PROT LOCK
foo 1024M 1
foo@snap 1024M 1
- quux 1024K 1 excl
+ quux 1024k 1 excl
bar 1024M 2
bar@snap 512M 2 yes
bar@snap2 1024M 2
diff --git a/src/test/common/test_bloom_filter.cc b/src/test/common/test_bloom_filter.cc
new file mode 100644
index 00000000000..8e3661b2cc1
--- /dev/null
+++ b/src/test/common/test_bloom_filter.cc
@@ -0,0 +1,222 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 Inktank <info@inktank.com>
+ *
+ * LGPL2.1 (see COPYING-LGPL2.1) or later
+ */
+
+#include <iostream>
+#include <gtest/gtest.h>
+
+#include "include/stringify.h"
+#include "common/bloom_filter.hpp"
+
+TEST(BloomFilter, Basic) {
+ bloom_filter bf(10, .1, 1);
+ bf.insert("foo");
+ bf.insert("bar");
+
+ ASSERT_TRUE(bf.contains("foo"));
+ ASSERT_TRUE(bf.contains("bar"));
+}
+
+TEST(BloomFilter, Sweep) {
+ std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl;
+ for (int ex = 3; ex < 12; ex += 2) {
+ for (float fpp = .001; fpp < .5; fpp *= 4.0) {
+ int max = 2 << ex;
+ bloom_filter bf(max, fpp, 1);
+ bf.insert("foo");
+ bf.insert("bar");
+
+ ASSERT_TRUE(bf.contains("foo"));
+ ASSERT_TRUE(bf.contains("bar"));
+
+ for (int n = 0; n < max; n++)
+ bf.insert("ok" + stringify(n));
+
+ int test = max * 100;
+ int hit = 0;
+ for (int n = 0; n < test; n++)
+ if (bf.contains("asdf" + stringify(n)))
+ hit++;
+
+ ASSERT_TRUE(bf.contains("foo"));
+ ASSERT_TRUE(bf.contains("bar"));
+
+ double actual = (double)hit / (double)test;
+
+ bufferlist bl;
+ ::encode(bf, bl);
+
+ double byte_per_insert = (double)bl.length() / (double)max;
+
+ std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert << std::endl;
+ ASSERT_TRUE(actual < fpp * 10);
+
+ }
+ }
+}
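The B/insert column printed by this test can be sanity-checked against the standard bloom filter sizing formula: for a target false-positive probability p, an optimally sized filter needs

    \[
      \frac{m}{n} = \frac{-\ln p}{(\ln 2)^2} \text{ bits per insert,} \qquad
      \text{bytes/insert} \approx \frac{-\ln p}{8(\ln 2)^2},
    \]

so p = 0.01 works out to roughly 9.6 bits, i.e. about 1.2 bytes per insert, plus a small fixed encoding overhead.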
+
+TEST(BloomFilter, SweepInt) {
+ std::cout << "# max\tfpp\tactual\tsize\tB/insert" << std::endl;
+ for (int ex = 3; ex < 12; ex += 2) {
+ for (float fpp = .001; fpp < .5; fpp *= 4.0) {
+ int max = 2 << ex;
+ bloom_filter bf(max, fpp, 1);
+ bf.insert("foo");
+ bf.insert("bar");
+
+ ASSERT_TRUE(bf.contains("foo"));
+ ASSERT_TRUE(bf.contains("bar"));
+
+ for (int n = 0; n < max; n++)
+ bf.insert(n);
+
+ int test = max * 100;
+ int hit = 0;
+ for (int n = 0; n < test; n++)
+ if (bf.contains(100000 + n))
+ hit++;
+
+ ASSERT_TRUE(bf.contains("foo"));
+ ASSERT_TRUE(bf.contains("bar"));
+
+ double actual = (double)hit / (double)test;
+
+ bufferlist bl;
+ ::encode(bf, bl);
+
+ double byte_per_insert = (double)bl.length() / (double)max;
+
+ std::cout << max << "\t" << fpp << "\t" << actual << "\t" << bl.length() << "\t" << byte_per_insert << std::endl;
+ ASSERT_TRUE(actual < fpp * 10);
+ ASSERT_TRUE(actual > fpp / 10);
+ }
+ }
+}
+
+
+TEST(BloomFilter, BinSweep) {
+ int total_max = 16384;
+ float total_fpp = .01;
+ std::cout << "total_inserts " << total_max << " target-fpp " << total_fpp << std::endl;
+ for (int bins = 1; bins < 16; ++bins) {
+ int max = total_max / bins;
+ float fpp = total_fpp / bins; // (alternative: pow(total_fpp, bins))
+
+ std::vector<bloom_filter*> ls;
+ bufferlist bl;
+ for (int i=0; i<bins; i++) {
+ ls.push_back(new bloom_filter(max, fpp, i));
+ for (int j=0; j<max; j++) {
+ ls.back()->insert(10000 * (i+1) + j);
+ }
+ ::encode(*ls.front(), bl);
+ }
+
+ int hit = 0;
+ int test = max * 100;
+ for (int i=0; i<test; ++i) {
+ for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) {
+ if ((*j)->contains(i * 732)) { // note: sequential i does not work here; the internal int hash is weak!!
+ hit++;
+ break;
+ }
+ }
+ }
+
+ double actual = (double)hit / (double)test;
+ std::cout << "bins " << bins << " bin-max " << max << " bin-fpp " << fpp
+ << " actual-fpp " << actual
+ << " total-size " << bl.length() << std::endl;
+ }
+}
+
+// these tests are disabled: doing dual insertions in consecutive filters
+// appears to be equivalent to doing a single insertion in a bloom
+// filter that is twice as big.
+#if 0
+
+// test the fpp over a sequence of bloom filters, each with unique
+// items inserted into it.
+//
+// we expect: actual_fpp = num_filters * per_filter_fpp
+TEST(BloomFilter, Sequence) {
+
+ int max = 1024;
+ double fpp = .01;
+ for (int seq = 2; seq <= 128; seq *= 2) {
+ std::vector<bloom_filter*> ls;
+ for (int i=0; i<seq; i++) {
+ ls.push_back(new bloom_filter(max*2, fpp, i));
+ for (int j=0; j<max; j++) {
+ ls.back()->insert("ok" + stringify(j) + "_" + stringify(i));
+ if (ls.size() > 1)
+ ls[ls.size() - 2]->insert("ok" + stringify(j) + "_" + stringify(i));
+ }
+ }
+
+ int hit = 0;
+ int test = max * 100;
+ for (int i=0; i<test; ++i) {
+ for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) {
+ if ((*j)->contains("bad" + stringify(i))) {
+ hit++;
+ break;
+ }
+ }
+ }
+
+ double actual = (double)hit / (double)test;
+ std::cout << "seq " << seq << " max " << max << " fpp " << fpp << " actual " << actual << std::endl;
+ }
+}
+
+// test the fpp over a sequence of bloom filters, where actual values
+// are always inserted into a consecutive pair of filters. in order
+// to have a false positive, we need to falsely match two consecutive
+// filters.
+//
+// we expect: actual_fpp = num_filters * per_filter_fpp^2
+TEST(BloomFilter, SequenceDouble) {
+ int max = 1024;
+ double fpp = .01;
+ for (int seq = 2; seq <= 128; seq *= 2) {
+ std::vector<bloom_filter*> ls;
+ for (int i=0; i<seq; i++) {
+ ls.push_back(new bloom_filter(max*2, fpp, i));
+ for (int j=0; j<max; j++) {
+ ls.back()->insert("ok" + stringify(j) + "_" + stringify(i));
+ if (ls.size() > 1)
+ ls[ls.size() - 2]->insert("ok" + stringify(j) + "_" + stringify(i));
+ }
+ }
+
+ int hit = 0;
+ int test = max * 100;
+ int run = 0;
+ for (int i=0; i<test; ++i) {
+ for (std::vector<bloom_filter*>::iterator j = ls.begin(); j != ls.end(); ++j) {
+ if ((*j)->contains("bad" + stringify(i))) {
+ run++;
+ if (run >= 2) {
+ hit++;
+ break;
+ }
+ } else {
+ run = 0;
+ }
+ }
+ }
+
+ double actual = (double)hit / (double)test;
+ std::cout << "seq " << seq << " max " << max << " fpp " << fpp << " actual " << actual
+ << " expected " << (fpp*fpp*(double)seq) << std::endl;
+ }
+}
+
+#endif
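The expectations stated in the two disabled tests follow from a union bound: a never-inserted query item matches each filter independently with probability about p, so with s filters

    \[
      P(\text{Sequence fp}) \approx 1 - (1 - p)^{s} \approx s\,p, \qquad
      P(\text{SequenceDouble fp}) \approx (s - 1)\,p^{2} \approx s\,p^{2},
    \]

since a SequenceDouble false positive requires two consecutive filters to match and there are s-1 adjacent pairs.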
diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h
index 6dd180bc198..59e55a11b23 100644
--- a/src/test/encoding/types.h
+++ b/src/test/encoding/types.h
@@ -4,6 +4,9 @@ TYPE(CompatSet)
#include "include/filepath.h"
TYPE(filepath)
+#include "common/bloom_filter.hpp"
+TYPE(bloom_filter)
+
#include "common/snap_types.h"
TYPE(SnapContext)
TYPE(SnapRealmInfo)
diff --git a/src/test/filestore/run_seed_to.sh b/src/test/filestore/run_seed_to.sh
index fdf56141e12..d5bb671138c 100755
--- a/src/test/filestore/run_seed_to.sh
+++ b/src/test/filestore/run_seed_to.sh
@@ -246,13 +246,13 @@ do
do_rm $tmp_name_a $tmp_name_a.fail $tmp_name_a.recover
$v ceph_test_filestore_idempotent_sequence run-sequence-to $to \
$tmp_name_a $tmp_name_a/journal \
- --filestore-xattr-use-omap --test-seed $seed --osd-journal-size 100 \
+ --test-seed $seed --osd-journal-size 100 \
--filestore-kill-at $killat $tmp_opts_a \
--log-file $tmp_name_a.fail --debug-filestore 20 || true
stop_at=`ceph_test_filestore_idempotent_sequence get-last-op \
$tmp_name_a $tmp_name_a/journal \
- --filestore-xattr-use-omap --log-file $tmp_name_a.recover \
+ --log-file $tmp_name_a.recover \
--debug-filestore 20 --debug-journal 20`
if [[ "`expr $stop_at - $stop_at 2>/dev/null`" != "0" ]]; then
@@ -265,12 +265,11 @@ do
do_rm $tmp_name_b $tmp_name_b.clean
$v ceph_test_filestore_idempotent_sequence run-sequence-to \
$stop_at $tmp_name_b $tmp_name_b/journal \
- --filestore-xattr-use-omap --test-seed $seed --osd-journal-size 100 \
+ --test-seed $seed --osd-journal-size 100 \
--log-file $tmp_name_b.clean --debug-filestore 20 $tmp_opts_b
if $v ceph_test_filestore_idempotent_sequence diff \
- $tmp_name_a $tmp_name_a/journal $tmp_name_b $tmp_name_b/journal \
- --filestore-xattr-use-omap; then
+ $tmp_name_a $tmp_name_a/journal $tmp_name_b $tmp_name_b/journal ; then
echo OK
else
echo "FAIL"
diff --git a/src/test/osd/RadosModel.h b/src/test/osd/RadosModel.h
index a87ecebb4c1..aba6a531c6f 100644
--- a/src/test/osd/RadosModel.h
+++ b/src/test/osd/RadosModel.h
@@ -143,6 +143,7 @@ public:
map<int, map<string,ObjectDesc> > pool_obj_cont;
set<string> oid_in_use;
set<string> oid_not_in_use;
+ set<int> snaps_in_use;
int current_snap;
string pool_name;
librados::IoCtx io_ctx;
@@ -1043,6 +1044,7 @@ public:
if (!(err == -ENOENT && old_value.deleted())) {
cerr << num << ": Error: oid " << oid << " read returned error code "
<< err << std::endl;
+ context->errors++;
}
} else {
cout << num << ": expect " << old_value.most_recent() << std::endl;
@@ -1314,6 +1316,8 @@ public:
}
context->oid_in_use.insert(oid);
context->oid_not_in_use.erase(oid);
+ context->snaps_in_use.insert(roll_back_to);
+
context->roll_back(oid, roll_back_to);
uint64_t snap = context->snaps[roll_back_to];
@@ -1341,6 +1345,7 @@ public:
context->update_object_version(oid, comp->get_version64());
context->oid_in_use.erase(oid);
context->oid_not_in_use.insert(oid);
+ context->snaps_in_use.erase(roll_back_to);
context->kick();
}
diff --git a/src/test/osd/TestRados.cc b/src/test/osd/TestRados.cc
index be919161579..842f9d2bca3 100644
--- a/src/test/osd/TestRados.cc
+++ b/src/test/osd/TestRados.cc
@@ -111,10 +111,13 @@ private:
return new SnapCreateOp(m_op, &context, m_stats);
case TEST_OP_SNAP_REMOVE:
- if (context.snaps.empty()) {
+ if (context.snaps.size() <= context.snaps_in_use.size()) {
return NULL;
- } else {
+ }
+ while (true) {
int snap = rand_choose(context.snaps)->first;
+ if (context.snaps_in_use.count(snap))
+ continue; // in use; try again!
cout << "snap_remove snap " << snap << std::endl;
return new SnapRemoveOp(m_op, &context, snap, m_stats);
}
@@ -122,11 +125,12 @@ private:
case TEST_OP_ROLLBACK:
if (context.snaps.empty()) {
return NULL;
- } else {
+ }
+ {
int snap = rand_choose(context.snaps)->first;
string oid = *(rand_choose(context.oid_not_in_use));
cout << "rollback oid " << oid << " to " << snap << std::endl;
- return new RollbackOp(m_op, &context, oid, snap);
+ return new RollbackOp(m_op, &context, oid, snap);
}
case TEST_OP_SETATTR:
diff --git a/src/tools/Makefile.am b/src/tools/Makefile.am
index 4b8da77951a..89417014dd4 100644
--- a/src/tools/Makefile.am
+++ b/src/tools/Makefile.am
@@ -6,6 +6,12 @@ ceph_monstore_tool_SOURCES = tools/ceph-monstore-tool.cc
ceph_monstore_tool_LDADD = $(LIBOS) $(CEPH_GLOBAL) -lboost_program_options
bin_DEBUGPROGRAMS += ceph-monstore-tool
+ceph_kvstore_tool_SOURCES = tools/ceph-kvstore-tool.cc
+ceph_kvstore_tool_LDADD = $(LIBOS) $(CEPH_GLOBAL)
+ceph_kvstore_tool_CXXFLAGS = $(UNITTEST_CXXFLAGS)
+bin_DEBUGPROGRAMS += ceph-kvstore-tool
+
+
ceph_filestore_dump_SOURCES = tools/ceph-filestore-dump.cc
ceph_filestore_dump_LDADD = $(LIBOSD) $(LIBOS) $(CEPH_GLOBAL) -lboost_program_options
if LINUX
diff --git a/src/tools/ceph-kvstore-tool.cc b/src/tools/ceph-kvstore-tool.cc
new file mode 100644
index 00000000000..e07391d5c51
--- /dev/null
+++ b/src/tools/ceph-kvstore-tool.cc
@@ -0,0 +1,380 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+* Ceph - scalable distributed file system
+*
+* Copyright (C) 2012 Inktank, Inc.
+*
+* This is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License version 2.1, as published by the Free Software
+* Foundation. See file COPYING.
+*/
+#include <iostream>
+#include <string>
+#include <sstream>
+#include <map>
+#include <set>
+#include <boost/scoped_ptr.hpp>
+
+#include "os/LevelDBStore.h"
+
+#include "common/ceph_argparse.h"
+#include "global/global_init.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+#include "common/config.h"
+#include "common/strtol.h"
+#include "include/stringify.h"
+
+using namespace std;
+
+class StoreTool
+{
+ boost::scoped_ptr<KeyValueDB> db;
+ string store_path;
+
+ public:
+ StoreTool(const string &path) : store_path(path) {
+ LevelDBStore *db_ptr = new LevelDBStore(g_ceph_context, store_path);
+ assert(!db_ptr->open(std::cerr));
+ db.reset(db_ptr);
+ }
+
+ uint32_t traverse(const string &prefix,
+ const bool do_crc,
+ ostream *out) {
+ KeyValueDB::WholeSpaceIterator iter = db->get_iterator();
+
+ if (prefix.empty())
+ iter->seek_to_first();
+ else
+ iter->seek_to_first(prefix);
+
+ uint32_t crc = -1;
+
+ while (iter->valid()) {
+ pair<string,string> rk = iter->raw_key();
+ if (!prefix.empty() && (rk.first != prefix))
+ break;
+
+ if (out)
+ *out << rk.first << ":" << rk.second;
+ if (do_crc) {
+ bufferlist bl;
+ bl.append(rk.first);
+ bl.append(rk.second);
+ bl.append(iter->value());
+
+ crc = bl.crc32c(crc);
+ if (out) {
+ *out << " (" << bl.crc32c(0) << ")";
+ }
+ }
+ if (out)
+ *out << std::endl;
+ iter->next();
+ }
+
+ return crc;
+ }
+
+ void list(const string &prefix, const bool do_crc) {
+ traverse(prefix, do_crc, &std::cout);
+ }
+
+ bool exists(const string &prefix) {
+ assert(!prefix.empty());
+ KeyValueDB::WholeSpaceIterator iter = db->get_iterator();
+ iter->seek_to_first(prefix);
+ return (iter->valid() && (iter->raw_key().first == prefix));
+ }
+
+ bool exists(const string &prefix, const string &key) {
+ assert(!prefix.empty());
+
+ if (key.empty()) {
+ return exists(prefix);
+ }
+
+ bool exists = false;
+ get(prefix, key, exists);
+ return exists;
+ }
+
+ bufferlist get(const string &prefix, const string &key, bool &exists) {
+ assert(!prefix.empty() && !key.empty());
+
+ map<string,bufferlist> result;
+ std::set<std::string> keys;
+ keys.insert(key);
+ db->get(prefix, keys, &result);
+
+ if (result.count(key) > 0) {
+ exists = true;
+ return result[key];
+ }
+ exists = false;
+ return bufferlist();
+ }
+
+ uint64_t get_size() {
+ map<string,uint64_t> extras;
+ uint64_t s = db->get_estimated_size(extras);
+ for (map<string,uint64_t>::iterator p = extras.begin();
+ p != extras.end(); ++p) {
+ std::cout << p->first << " - " << p->second << std::endl;
+ }
+ std::cout << "total: " << s << std::endl;
+ return s;
+ }
+
+ bool set(const string &prefix, const string &key, bufferlist &val) {
+ assert(!prefix.empty());
+ assert(!key.empty());
+ assert(val.length() > 0);
+
+ KeyValueDB::Transaction tx = db->get_transaction();
+ tx->set(prefix, key, val);
+ int ret = db->submit_transaction_sync(tx);
+
+ return (ret == 0);
+ }
+
+ int copy_store_to(const string &other_path, const int num_keys_per_tx) {
+
+ if (num_keys_per_tx <= 0) {
+ std::cerr << "must specify a number of keys/tx > 0" << std::endl;
+ return -EINVAL;
+ }
+
+ // open or create a leveldb store at @p other_path
+ LevelDBStore other(g_ceph_context, other_path);
+ int err = other.create_and_open(std::cerr);
+ if (err < 0)
+ return err;
+
+ KeyValueDB::WholeSpaceIterator it = db->get_iterator();
+ it->seek_to_first();
+ uint64_t total_keys = 0;
+ uint64_t total_size = 0;
+ uint64_t total_txs = 0;
+
+ utime_t started_at = ceph_clock_now(g_ceph_context);
+
+ do {
+ int num_keys = 0;
+
+ KeyValueDB::Transaction tx = other.get_transaction();
+
+
+ while (it->valid() && num_keys < num_keys_per_tx) {
+ pair<string,string> k = it->raw_key();
+ bufferlist v = it->value();
+ tx->set(k.first, k.second, v);
+
+ num_keys ++;
+ total_size += v.length();
+
+ it->next();
+ }
+
+ total_txs ++;
+ total_keys += num_keys;
+
+ if (num_keys > 0)
+ other.submit_transaction_sync(tx);
+
+ utime_t cur_duration = ceph_clock_now(g_ceph_context) - started_at;
+ std::cout << "ts = " << cur_duration << "s, copied " << total_keys
+ << " keys so far (" << stringify(si_t(total_size)) << ")"
+ << std::endl;
+
+ } while (it->valid());
+
+ utime_t time_taken = ceph_clock_now(g_ceph_context) - started_at;
+
+ std::cout << "summary:" << std::endl;
+ std::cout << " copied " << total_keys << " keys" << std::endl;
+ std::cout << " used " << total_txs << " transactions" << std::endl;
+ std::cout << " total size " << stringify(si_t(total_size)) << std::endl;
+ std::cout << " from '" << store_path << "' to '" << other_path << "'"
+ << std::endl;
+ std::cout << " duration " << time_taken << " seconds" << std::endl;
+
+ return 0;
+ }
+};
+
+void usage(const char *pname)
+{
+ std::cerr << "Usage: " << pname << " <store path> command [args...]\n"
+ << "\n"
+ << "Commands:\n"
+ << " list [prefix]\n"
+ << " list-crc [prefix]\n"
+ << " exists <prefix> [key]\n"
+ << " get <prefix> <key>\n"
+ << " crc <prefix> <key>\n"
+ << " get-size\n"
+ << " set <prefix> <key> [ver <N>|in <file>]\n"
+ << " store-copy <path> [num-keys-per-tx]\n"
+ << " store-crc <path>\n"
+ << std::endl;
+}
+
+int main(int argc, const char *argv[])
+{
+ vector<const char*> args;
+ argv_to_vec(argc, argv, args);
+ env_to_vec(args);
+
+ global_init(
+ NULL, args,
+ CEPH_ENTITY_TYPE_CLIENT, CODE_ENVIRONMENT_UTILITY, 0);
+ common_init_finish(g_ceph_context);
+
+
+ if (args.size() < 2) {
+ usage(argv[0]);
+ return 1;
+ }
+
+ string path(args[0]);
+ string cmd(args[1]);
+
+ StoreTool st(path);
+
+ if (cmd == "list" || cmd == "list-crc") {
+ string prefix;
+ if (argc > 3)
+ prefix = argv[3];
+
+ bool do_crc = (cmd == "list-crc");
+
+ st.list(prefix, do_crc);
+
+ } else if (cmd == "exists") {
+ string key;
+ if (argc < 4) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(argv[3]);
+ if (argc > 4)
+ key = argv[4];
+
+ bool ret = st.exists(prefix, key);
+ std::cout << "(" << prefix << ", " << key << ") "
+ << (ret ? "exists" : "does not exist")
+ << std::endl;
+ return (ret ? 0 : 1);
+
+ } else if (cmd == "get") {
+ if (argc < 5) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(argv[3]);
+ string key(argv[4]);
+
+ bool exists = false;
+ bufferlist bl = st.get(prefix, key, exists);
+ std::cout << "(" << prefix << ", " << key << ")";
+ if (!exists) {
+ std::cout << " does not exist" << std::endl;
+ return 1;
+ }
+ std::cout << std::endl;
+ ostringstream os;
+ bl.hexdump(os);
+ std::cout << os.str() << std::endl;
+
+ } else if (cmd == "crc") {
+ if (argc < 5) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(argv[3]);
+ string key(argv[4]);
+
+ bool exists = false;
+ bufferlist bl = st.get(prefix, key, exists);
+ std::cout << "(" << prefix << ", " << key << ") ";
+ if (!exists) {
+ std::cout << " does not exist" << std::endl;
+ return 1;
+ }
+ std::cout << " crc " << bl.crc32c(0) << std::endl;
+
+ } else if (cmd == "get-size") {
+ std::cout << "estimated store size: " << st.get_size() << std::endl;
+
+ } else if (cmd == "set") {
+ if (argc < 7) {
+ usage(argv[0]);
+ return 1;
+ }
+ string prefix(argv[3]);
+ string key(argv[4]);
+ string subcmd(argv[5]);
+
+ bufferlist val;
+ string errstr;
+ if (subcmd == "ver") {
+ version_t v = (version_t) strict_strtoll(argv[6], 10, &errstr);
+ if (!errstr.empty()) {
+ std::cerr << "error reading version: " << errstr << std::endl;
+ return 1;
+ }
+ ::encode(v, val);
+ } else if (subcmd == "in") {
+ int ret = val.read_file(argv[6], &errstr);
+ if (ret < 0 || !errstr.empty()) {
+ std::cerr << "error reading file: " << errstr << std::endl;
+ return 1;
+ }
+ } else {
+ std::cerr << "unrecognized subcommand '" << subcmd << "'" << std::endl;
+ usage(argv[0]);
+ return 1;
+ }
+
+ bool ret = st.set(prefix, key, val);
+ if (!ret) {
+ std::cerr << "error setting ("
+ << prefix << "," << key << ")" << std::endl;
+ return 1;
+ }
+ } else if (cmd == "store-copy") {
+ int num_keys_per_tx = 128; // magic number that just feels right.
+ if (argc < 4) {
+ usage(argv[0]);
+ return 1;
+ } else if (argc > 4) {
+ string err;
+ num_keys_per_tx = strict_strtol(argv[4], 10, &err);
+ if (!err.empty()) {
+ std::cerr << "invalid num_keys_per_tx: " << err << std::endl;
+ return 1;
+ }
+ }
+
+ int ret = st.copy_store_to(argv[3], num_keys_per_tx);
+ if (ret < 0) {
+ std::cerr << "error copying store to path '" << argv[3]
+ << "': " << cpp_strerror(ret) << std::endl;
+ return 1;
+ }
+
+ } else if (cmd == "store-crc") {
+ uint32_t crc = st.traverse(string(), true, NULL);
+ std::cout << "store at '" << path << "' crc " << crc << std::endl;
+
+ } else {
+ std::cerr << "Unrecognized command: " << cmd << std::endl;
+ return 1;
+ }
+
+ return 0;
+}
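A minimal sketch (paths hypothetical) of driving the StoreTool class above programmatically; set() and get() each use a single synchronous transaction or lookup, and copy_store_to() batches the copy into one transaction per num_keys_per_tx keys:

    int example() {
      StoreTool st("/tmp/mon-store");          // asserts if the store cannot be opened
      bufferlist val;
      val.append("hello", 5);
      if (!st.set("prefix", "key", val))       // single synchronous transaction
        return 1;
      bool exists = false;
      bufferlist out = st.get("prefix", "key", exists);
      assert(exists);
      // stream the whole store into a second leveldb dir, 128 keys per tx
      return st.copy_store_to("/tmp/mon-store-copy", 128) < 0 ? 1 : 0;
    }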