Merge default (bug24969)

author: Simon MacMullen <simon@rabbitmq.com> 2013-08-19 17:14:13 +0100
committer: Simon MacMullen <simon@rabbitmq.com> 2013-08-19 17:14:13 +0100
commit: 11049881a87eb51e9bf6efbb4d2ef1ee4be62bfe (patch)
tree: 2a3f21103e1d6050802ed32714d1e62763aeb0a5 /src
parent: bd1305279e255adcf583afdd55a7cee18a9fcddb (diff)
parent: af4ef7640e817141615298c504e9129d14be1d9d (diff)
download: rabbitmq-server-bug24969.tar.gz
132 files changed, 13090 insertions, 6785 deletions
diff --git a/src/app_utils.erl b/src/app_utils.erl
new file mode 100644
index 00000000..5ae2d295
--- /dev/null
+++ b/src/app_utils.erl
@@ -0,0 +1,171 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License
+%% at http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and
+%% limitations under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
+%%
+-module(app_utils).
+
+-export([load_applications/1, start_applications/1, start_applications/2,
+         stop_applications/1, stop_applications/2, app_dependency_order/2,
+         wait_for_applications/1]).
+
+-ifdef(use_specs).
+
+-type error_handler() :: fun((atom(), any()) -> 'ok').
+
+-spec load_applications([atom()])                   -> 'ok'.
+-spec start_applications([atom()])                  -> 'ok'.
+-spec stop_applications([atom()])                   -> 'ok'.
+-spec start_applications([atom()], error_handler()) -> 'ok'.
+-spec stop_applications([atom()], error_handler())  -> 'ok'.
+-spec wait_for_applications([atom()])               -> 'ok'.
+-spec app_dependency_order([atom()], boolean())     -> [digraph:vertex()].
+
+-endif.
+
+%%---------------------------------------------------------------------------
+%% Public API
+
+%% Loads each application in Apps and, transitively, every application
+%% listed under its 'applications' key. Throws the error term returned by
+%% application:load/1 unless that error is 'already_loaded'.
+load_applications(Apps) ->
+    load_applications(queue:from_list(Apps), sets:new()),
+    ok.
+
+%% Starts Apps in list order; a failure throws
+%% {error, {cannot_start_application, App, Reason}}.
+start_applications(Apps) ->
+    start_applications(
+      Apps, fun (App, Reason) ->
+                    throw({error, {cannot_start_application, App, Reason}})
+            end).
+
+%% Stops Apps in reverse list order; a failure throws
+%% {error, {cannot_stop_application, App, Reason}}.
+stop_applications(Apps) ->
+    stop_applications(
+      Apps, fun (App, Reason) ->
+                    throw({error, {cannot_stop_application, App, Reason}})
+            end).
+
+%% Starts Apps front to back, skipping those that are already started.
+%% If one fails to start, the applications started so far by this call
+%% are stopped again and ErrorHandler(App, Reason) is invoked.
+start_applications(Apps, ErrorHandler) ->
+    manage_applications(fun lists:foldl/3,
+                        fun application:start/1,
+                        fun application:stop/1,
+                        already_started,
+                        ErrorHandler,
+                        Apps).
+
+%% Stops Apps back to front, skipping those that are not started.
+%% If one fails to stop, the applications stopped so far by this call
+%% are started again and ErrorHandler(App, Reason) is invoked.
+stop_applications(Apps, ErrorHandler) ->
+    manage_applications(fun lists:foldr/3,
+                        fun application:stop/1,
+                        fun application:start/1,
+                        not_started,
+                        ErrorHandler,
+                        Apps).
+
+%% Blocks until every application in Apps is listed by
+%% rabbit_misc:which_applications/0, polling once a second. There is no
+%% timeout, so this never returns if an application never appears.
+wait_for_applications(Apps) ->
+    [wait_for_application(App) || App <- Apps], ok.
+
+%% Returns the result of digraph_utils:topsort/1 over a graph built from
+%% all loaded applications, with a {Dependency, App} edge for each entry
+%% in an application's 'applications' key. When StripUnreachable is true,
+%% vertices not reachable from RootApps are removed before sorting. The
+%% temporary digraph is deleted before returning, even on failure.
+app_dependency_order(RootApps, StripUnreachable) ->
+    {ok, G} = rabbit_misc:build_acyclic_graph(
+                fun (App, _Deps) -> [{App, App}] end,
+                fun (App,  Deps) -> [{Dep, App} || Dep <- Deps] end,
+                [{App, app_dependencies(App)} ||
+                    {App, _Desc, _Vsn} <- application:loaded_applications()]),
+    try
+        case StripUnreachable of
+            true -> digraph:del_vertices(G, digraph:vertices(G) --
+                     digraph_utils:reachable(RootApps, G));
+            false -> ok
+        end,
+        digraph_utils:topsort(G)
+    after
+        true = digraph:delete(G)
+    end.
+
+%%---------------------------------------------------------------------------
+%% Private API
+
+%% Polls once a second until Application is listed by
+%% rabbit_misc:which_applications/0.
+wait_for_application(Application) ->
+    case lists:keymember(Application, 1, rabbit_misc:which_applications()) of
+         true  -> ok;
+         false -> timer:sleep(1000),
+                  wait_for_application(Application)
+    end.
+
+%% Worklist is a queue of applications still to visit; Loaded is the set
+%% of applications already visited, so each one is loaded at most once.
+load_applications(Worklist, Loaded) ->
+    case queue:out(Worklist) of
+        {empty, _WorkList} ->
+            ok;
+        {{value, App}, Worklist1} ->
+            case sets:is_element(App, Loaded) of
+                true  -> load_applications(Worklist1, Loaded);
+                false -> case application:load(App) of
+                             ok                             -> ok;
+                             {error, {already_loaded, App}} -> ok;
+                             Error                          -> throw(Error)
+                         end,
+                         load_applications(
+                           queue:join(Worklist1,
+                                      queue:from_list(app_dependencies(App))),
+                           sets:add_element(App, Loaded))
+            end
+    end.
+
+%% Returns the value of App's 'applications' key, or [] when
+%% application:get_key/2 reports it as undefined.
+app_dependencies(App) ->
+    case application:get_key(App, applications) of
+        undefined -> [];
+        {ok, Lst} -> Lst
+    end.
+
+%% Folds Do over Apps using Iterate, accumulating the applications for
+%% which Do returned 'ok'; {error, {SkipError, _}} is not a failure.
+%% On any other error the accumulated applications are rolled back with
+%% Undo and ErrorHandler is called. The accumulator is then reset to []
+%% so that a handler which returns (rather than throws) cannot leave its
+%% return value behind as the accumulator, and nothing is undone twice.
+manage_applications(Iterate, Do, Undo, SkipError, ErrorHandler, Apps) ->
+    Iterate(fun (App, Acc) ->
+                    case Do(App) of
+                        ok -> [App | Acc];
+                        {error, {SkipError, _}} -> Acc;
+                        {error, Reason} ->
+                            lists:foreach(Undo, Acc),
+                            ErrorHandler(App, Reason),
+                            []
+                    end
+            end, [], Apps),
+    ok.
+
diff --git a/src/background_gc.erl b/src/background_gc.erl
new file mode 100644
index 00000000..fbd7ce23
--- /dev/null
+++ b/src/background_gc.erl
@@ -0,0 +1,109 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License
+%% at http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and
+%% limitations under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
+%%
+
+%% A server that garbage-collects processes whose status is 'waiting',
+%% both on a self-rescheduling timer and on demand via run/0.
+
+-module(background_gc).
+
+-behaviour(gen_server2).
+
+-export([start_link/0, run/0]).
+-export([gc/0]). %% For run_interval only
+
+-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
+         terminate/2, code_change/3]).
+
+-define(MAX_RATIO, 0.01).
+-define(IDEAL_INTERVAL, 60000).
+
+-record(state, {last_interval}).
+
+%%----------------------------------------------------------------------------
+
+-ifdef(use_specs).
+
+-spec(start_link/0 :: () -> {'ok', pid()} | {'error', any()}).
+-spec(run/0 :: () -> 'ok').
+-spec(gc/0 :: () -> 'ok').
+
+-endif.
+
+%%----------------------------------------------------------------------------
+
+%% Starts the server, registered locally under the module name.
+start_link() ->
+    gen_server2:start_link({local, ?MODULE}, ?MODULE, [],
+                           [{timeout, infinity}]).
+
+%% Asynchronously asks the server to collect now.
+run() ->
+    gen_server2:cast(?MODULE, run).
+
+%%----------------------------------------------------------------------------
+
+%% Runs interval_gc/1 once, which also arms the first timer.
+init([]) ->
+    Initial = interval_gc(#state{last_interval = ?IDEAL_INTERVAL}),
+    {ok, Initial}.
+
+%% No calls are supported: stop, replying with the stop reason.
+handle_call(Msg, _From, State) ->
+    Unexpected = {unexpected_call, Msg},
+    {stop, Unexpected, Unexpected, State}.
+
+%% A 'run' cast collects immediately and leaves the state untouched.
+handle_cast(run, State) ->
+    gc(),
+    {noreply, State};
+handle_cast(Msg, State) ->
+    {stop, {unexpected_cast, Msg}, State}.
+
+%% A 'run' message is the timer tick armed by interval_gc/1.
+handle_info(run, State) ->
+    {noreply, interval_gc(State)};
+handle_info(Msg, State) ->
+    {stop, {unexpected_info, Msg}, State}.
+
+code_change(_OldVsn, State, _Extra) ->
+    {ok, State}.
+
+terminate(_Reason, State) ->
+    State.
+
+%%----------------------------------------------------------------------------
+
+%% Hands gc/0 to rabbit_misc:interval_operation/4, arms a 'run' timer for
+%% the interval it returns, and records that interval in the state.
+interval_gc(State = #state{last_interval = Previous}) ->
+    {ok, Next} = rabbit_misc:interval_operation(
+                   {?MODULE, gc, []},
+                   ?MAX_RATIO, ?IDEAL_INTERVAL, Previous),
+    erlang:send_after(Next, self(), run),
+    State#state{last_interval = Next}.
+
+%% Collects each process whose status is 'waiting' at the moment it is
+%% inspected, then collects the calling process itself.
+gc() ->
+    lists:foreach(
+      fun (Pid) ->
+              case process_info(Pid, status) of
+                  {status, waiting} -> garbage_collect(Pid);
+                  _                 -> ok
+              end
+      end, processes()),
+    garbage_collect(), %% since we will never be waiting...
+    ok.
diff --git a/src/credit_flow.erl b/src/credit_flow.erl
index ba99811f..d48d649e 100644
--- a/src/credit_flow.erl
+++ b/src/credit_flow.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(credit_flow).
@@ -52,6 +52,22 @@
 
 %%----------------------------------------------------------------------------
 
+%% process dict update macro - eliminates the performance-hurting
+%% closure creation a HOF would introduce
+-define(UPDATE(Key, Default, Var, Expr),
+        begin
+            %% We deliberately allow Var to escape from the case here
+            %% to be used in Expr. Any temporary var we introduced
+            %% would also escape, and might conflict.
+            case get(Key) of
+                undefined -> Var = Default;
+                Var       -> ok
+            end,
+            put(Key, Expr)
+        end).
+
+%%----------------------------------------------------------------------------
+
 %% There are two "flows" here; of messages and of credit, going in
 %% opposite directions. The variable names "From" and "To" refer to
 %% the flow of credit, but the function names refer to the flow of
@@ -66,29 +82,33 @@
 send(From) -> send(From, ?DEFAULT_CREDIT).
 
 send(From, {InitialCredit, _MoreCreditAfter}) ->
-    update({credit_from, From}, InitialCredit,
-           fun (1) -> block(From),
-                      0;
-               (C) -> C - 1
-           end).
+    ?UPDATE({credit_from, From}, InitialCredit, C,
+            if C == 1 -> block(From),
+                         0;
+               true   -> C - 1
+            end).
 
 ack(To) -> ack(To, ?DEFAULT_CREDIT).
 
 ack(To, {_InitialCredit, MoreCreditAfter}) ->
-    update({credit_to, To}, MoreCreditAfter,
-           fun (1) -> grant(To, MoreCreditAfter),
-                      MoreCreditAfter;
-               (C) -> C - 1
-           end).
+    ?UPDATE({credit_to, To}, MoreCreditAfter, C,
+            if C == 1 -> grant(To, MoreCreditAfter),
+                         MoreCreditAfter;
+               true   -> C - 1
+            end).
 
 handle_bump_msg({From, MoreCredit}) ->
-    update({credit_from, From}, 0,
-           fun (C) when C =< 0 andalso C + MoreCredit > 0 -> unblock(From),
-                                                             C + MoreCredit;
-               (C)                                        -> C + MoreCredit
-           end).
-
-blocked() -> get(credit_blocked, []) =/= [].
+    ?UPDATE({credit_from, From}, 0, C,
+            if C =< 0 andalso C + MoreCredit > 0 -> unblock(From),
+                                                    C + MoreCredit;
+               true                              -> C + MoreCredit
+            end).
+
+blocked() -> case get(credit_blocked) of
+                 undefined -> false;
+                 []        -> false;
+                 _         -> true
+             end.
 
 peer_down(Peer) ->
     %% In theory we could also remove it from credit_deferred here, but it
@@ -105,24 +125,17 @@ grant(To, Quantity) ->
     Msg = {bump_credit, {self(), Quantity}},
     case blocked() of
         false -> To ! Msg;
-        true  -> update(credit_deferred, [],
-                        fun (Deferred) -> [{To, Msg} | Deferred] end)
+        true  -> ?UPDATE(credit_deferred, [], Deferred, [{To, Msg} | Deferred])
     end.
 
-block(From) -> update(credit_blocked, [], fun (Blocks) -> [From | Blocks] end).
+block(From) -> ?UPDATE(credit_blocked, [], Blocks, [From | Blocks]).
 
 unblock(From) ->
-    update(credit_blocked, [], fun (Blocks) -> Blocks -- [From] end),
+    ?UPDATE(credit_blocked, [], Blocks, Blocks -- [From]),
     case blocked() of
-        false -> [To ! Msg || {To, Msg} <- get(credit_deferred, [])],
-                 erase(credit_deferred);
+        false -> case erase(credit_deferred) of
+                     undefined -> ok;
+                     Credits   -> [To ! Msg || {To, Msg} <- Credits]
+                 end;
         true  -> ok
     end.
-
-get(Key, Default) ->
-    case get(Key) of
-        undefined -> Default;
-        Value     -> Value
-    end.
-
-update(Key, Default, Fun) -> put(Key, Fun(get(Key, Default))), ok.
diff --git a/src/delegate.erl b/src/delegate.erl
index d595e481..0331ca01 100644
--- a/src/delegate.erl
+++ b/src/delegate.erl
@@ -10,31 +10,44 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(delegate).
 
 -behaviour(gen_server2).
 
--export([start_link/1, invoke_no_result/2, invoke/2]).
+-export([start_link/1, invoke_no_result/2, invoke/2,
+         monitor/2, demonitor/1, call/2, cast/2]).
 
 -export([init/1, handle_call/3, handle_cast/2, handle_info/2,
          terminate/2, code_change/3]).
 
+-record(state, {node, monitors, name}).
+
 %%----------------------------------------------------------------------------
 
 -ifdef(use_specs).
 
+-export_type([monitor_ref/0]).
+
+-type(monitor_ref() :: reference() | {atom(), pid()}).
+-type(fun_or_mfa(A) :: fun ((pid()) -> A) | {atom(), atom(), [any()]}).
+
 -spec(start_link/1 ::
         (non_neg_integer()) -> {'ok', pid()} | ignore | {'error', any()}).
--spec(invoke/2 ::
-        ( pid(),  fun ((pid()) -> A)) -> A;
-        ([pid()], fun ((pid()) -> A)) -> {[{pid(), A}],
-                                          [{pid(), term()}]}).
--spec(invoke_no_result/2 ::
-        (pid() | [pid()], fun ((pid()) -> any())) -> 'ok').
+-spec(invoke/2 :: ( pid(),  fun_or_mfa(A)) -> A;
+                  ([pid()], fun_or_mfa(A)) -> {[{pid(), A}],
+                                               [{pid(), term()}]}).
+-spec(invoke_no_result/2 :: (pid() | [pid()], fun_or_mfa(any())) -> 'ok').
+-spec(monitor/2 :: ('process', pid()) -> monitor_ref()).
+-spec(demonitor/1 :: (monitor_ref()) -> 'true').
+
+-spec(call/2 ::
+        ( pid(),  any()) -> any();
+        ([pid()], any()) -> {[{pid(), any()}], [{pid(), term()}]}).
+-spec(cast/2 :: (pid() | [pid()], any()) -> 'ok').
 
 -endif.
 
@@ -46,19 +59,27 @@
 %%----------------------------------------------------------------------------
 
 start_link(Num) ->
-    gen_server2:start_link({local, delegate_name(Num)}, ?MODULE, [], []).
+    Name = delegate_name(Num),
+    gen_server2:start_link({local, Name}, ?MODULE, [Name], []).
 
-invoke(Pid, Fun) when is_pid(Pid) andalso node(Pid) =:= node() ->
-    Fun(Pid);
-invoke(Pid, Fun) when is_pid(Pid) ->
-    case invoke([Pid], Fun) of
+invoke(Pid, FunOrMFA) when is_pid(Pid) andalso node(Pid) =:= node() ->
+    apply1(FunOrMFA, Pid);
+invoke(Pid, FunOrMFA) when is_pid(Pid) ->
+    case invoke([Pid], FunOrMFA) of
         {[{Pid, Result}], []} ->
             Result;
         {[], [{Pid, {Class, Reason, StackTrace}}]} ->
             erlang:raise(Class, Reason, StackTrace)
     end;
 
-invoke(Pids, Fun) when is_list(Pids) ->
+invoke([], _FunOrMFA) -> %% optimisation
+    {[], []};
+invoke([Pid], FunOrMFA) when node(Pid) =:= node() -> %% optimisation
+    case safe_invoke(Pid, FunOrMFA) of
+        {ok,    _, Result} -> {[{Pid, Result}], []};
+        {error, _, Error}  -> {[], [{Pid, Error}]}
+    end;
+invoke(Pids, FunOrMFA) when is_list(Pids) ->
     {LocalPids, Grouped} = group_pids_by_node(Pids),
     %% The use of multi_call is only safe because the timeout is
     %% infinity, and thus there is no process spawned in order to do
@@ -67,35 +88,59 @@ invoke(Pids, Fun) when is_list(Pids) ->
         case orddict:fetch_keys(Grouped) of
             []          -> {[], []};
             RemoteNodes -> gen_server2:multi_call(
-                             RemoteNodes, delegate(RemoteNodes),
-                             {invoke, Fun, Grouped}, infinity)
+                             RemoteNodes, delegate(self(), RemoteNodes),
+                             {invoke, FunOrMFA, Grouped}, infinity)
         end,
     BadPids = [{Pid, {exit, {nodedown, BadNode}, []}} ||
                   BadNode <- BadNodes,
                   Pid     <- orddict:fetch(BadNode, Grouped)],
-    ResultsNoNode = lists:append([safe_invoke(LocalPids, Fun) |
+    ResultsNoNode = lists:append([safe_invoke(LocalPids, FunOrMFA) |
                                   [Results || {_Node, Results} <- Replies]]),
     lists:foldl(
       fun ({ok,    Pid, Result}, {Good, Bad}) -> {[{Pid, Result} | Good], Bad};
           ({error, Pid, Error},  {Good, Bad}) -> {Good, [{Pid, Error} | Bad]}
       end, {[], BadPids}, ResultsNoNode).
 
-invoke_no_result(Pid, Fun) when is_pid(Pid) andalso node(Pid) =:= node() ->
-    safe_invoke(Pid, Fun), %% we don't care about any error
+invoke_no_result(Pid, FunOrMFA) when is_pid(Pid) andalso node(Pid) =:= node() ->
+    safe_invoke(Pid, FunOrMFA), %% we don't care about any error
     ok;
-invoke_no_result(Pid, Fun) when is_pid(Pid) ->
-    invoke_no_result([Pid], Fun);
+invoke_no_result(Pid, FunOrMFA) when is_pid(Pid) ->
+    invoke_no_result([Pid], FunOrMFA);
 
-invoke_no_result(Pids, Fun) when is_list(Pids) ->
+invoke_no_result([], _FunOrMFA) -> %% optimisation
+    ok;
+invoke_no_result([Pid], FunOrMFA) when node(Pid) =:= node() -> %% optimisation
+    safe_invoke(Pid, FunOrMFA), %% must not die
+    ok;
+invoke_no_result(Pids, FunOrMFA) when is_list(Pids) ->
     {LocalPids, Grouped} = group_pids_by_node(Pids),
     case orddict:fetch_keys(Grouped) of
         []          -> ok;
-        RemoteNodes -> gen_server2:abcast(RemoteNodes, delegate(RemoteNodes),
-                                          {invoke, Fun, Grouped})
+        RemoteNodes -> gen_server2:abcast(
+                         RemoteNodes, delegate(self(), RemoteNodes),
+                         {invoke, FunOrMFA, Grouped})
     end,
-    safe_invoke(LocalPids, Fun), %% must not die
+    safe_invoke(LocalPids, FunOrMFA), %% must not die
     ok.
 
+monitor(process, Pid) when node(Pid) =:= node() ->
+    erlang:monitor(process, Pid);
+monitor(process, Pid) ->
+    Name = delegate(Pid, [node(Pid)]),
+    gen_server2:cast(Name, {monitor, self(), Pid}),
+    {Name, Pid}.
+
+demonitor(Ref) when is_reference(Ref) -> %% local ref: plain BIF demonitor
+    erlang:demonitor(Ref);
+demonitor({Name, Pid}) -> %% remote ref: return 'true' like the BIF and spec
+    gen_server2:cast(Name, {demonitor, self(), Pid}), true.
+
+call(PidOrPids, Msg) ->
+    invoke(PidOrPids, {gen_server2, call, [Msg, infinity]}).
+
+cast(PidOrPids, Msg) ->
+    invoke_no_result(PidOrPids, {gen_server2, cast, [Msg]}).
+
 %%----------------------------------------------------------------------------
 
 group_pids_by_node(Pids) ->
@@ -112,43 +157,88 @@ group_pids_by_node(Pids) ->
 delegate_name(Hash) ->
     list_to_atom("delegate_" ++ integer_to_list(Hash)).
 
-delegate(RemoteNodes) ->
+delegate(Pid, RemoteNodes) ->
     case get(delegate) of
         undefined -> Name = delegate_name(
-                              erlang:phash2(self(),
+                              erlang:phash2(Pid,
                                             delegate_sup:count(RemoteNodes))),
                      put(delegate, Name),
                      Name;
         Name      -> Name
     end.
 
-safe_invoke(Pids, Fun) when is_list(Pids) ->
-    [safe_invoke(Pid, Fun) || Pid <- Pids];
-safe_invoke(Pid, Fun) when is_pid(Pid) ->
+safe_invoke(Pids, FunOrMFA) when is_list(Pids) ->
+    [safe_invoke(Pid, FunOrMFA) || Pid <- Pids];
+safe_invoke(Pid, FunOrMFA) when is_pid(Pid) ->
     try
-        {ok, Pid, Fun(Pid)}
+        {ok, Pid, apply1(FunOrMFA, Pid)}
     catch Class:Reason ->
             {error, Pid, {Class, Reason, erlang:get_stacktrace()}}
     end.
 
+apply1({M, F, A}, Arg) -> apply(M, F, [Arg | A]);
+apply1(Fun,       Arg) -> Fun(Arg).
+
 %%----------------------------------------------------------------------------
 
-init([]) ->
-    {ok, node(), hibernate,
+init([Name]) ->
+    {ok, #state{node = node(), monitors = dict:new(), name = Name}, hibernate,
      {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN, ?DESIRED_HIBERNATE}}.
 
-handle_call({invoke, Fun, Grouped}, _From, Node) ->
-    {reply, safe_invoke(orddict:fetch(Node, Grouped), Fun), Node, hibernate}.
-
-handle_cast({invoke, Fun, Grouped}, Node) ->
-    safe_invoke(orddict:fetch(Node, Grouped), Fun),
-    {noreply, Node, hibernate}.
-
-handle_info(_Info, Node) ->
-    {noreply, Node, hibernate}.
+handle_call({invoke, FunOrMFA, Grouped}, _From, State = #state{node = Node}) ->
+    {reply, safe_invoke(orddict:fetch(Node, Grouped), FunOrMFA), State,
+     hibernate}.
+
+handle_cast({monitor, MonitoringPid, Pid},
+            State = #state{monitors = Monitors}) ->
+    Monitors1 = case dict:find(Pid, Monitors) of
+                    {ok, {Ref, Pids}} ->
+                        Pids1 = gb_sets:add_element(MonitoringPid, Pids),
+                        dict:store(Pid, {Ref, Pids1}, Monitors);
+                    error ->
+                        Ref = erlang:monitor(process, Pid),
+                        Pids = gb_sets:singleton(MonitoringPid),
+                        dict:store(Pid, {Ref, Pids}, Monitors)
+                end,
+    {noreply, State#state{monitors = Monitors1}, hibernate};
+
+handle_cast({demonitor, MonitoringPid, Pid},
+            State = #state{monitors = Monitors}) ->
+    Monitors1 = case dict:find(Pid, Monitors) of
+                    {ok, {Ref, Pids}} ->
+                        Pids1 = gb_sets:del_element(MonitoringPid, Pids),
+                        case gb_sets:is_empty(Pids1) of
+                            true  -> erlang:demonitor(Ref),
+                                     dict:erase(Pid, Monitors);
+                            false -> dict:store(Pid, {Ref, Pids1}, Monitors)
+                        end;
+                    error ->
+                        Monitors
+                end,
+    {noreply, State#state{monitors = Monitors1}, hibernate};
+
+handle_cast({invoke, FunOrMFA, Grouped}, State = #state{node = Node}) ->
+    safe_invoke(orddict:fetch(Node, Grouped), FunOrMFA),
+    {noreply, State, hibernate}.
+
+handle_info({'DOWN', Ref, process, Pid, Info},
+            State = #state{monitors = Monitors, name = Name}) ->
+    {noreply,
+     case dict:find(Pid, Monitors) of
+         {ok, {Ref, Pids}} ->
+             Msg = {'DOWN', {Name, Pid}, process, Pid, Info},
+             gb_sets:fold(fun (MonitoringPid, _) -> MonitoringPid ! Msg end,
+                          none, Pids),
+             State#state{monitors = dict:erase(Pid, Monitors)};
+         error ->
+             State
+     end, hibernate};
+
+handle_info(_Info, State) ->
+    {noreply, State, hibernate}.
 
 terminate(_Reason, _State) ->
     ok.
 
-code_change(_OldVsn, Node, _Extra) ->
-    {ok, Node}.
+code_change(_OldVsn, State, _Extra) ->
+    {ok, State}.
diff --git a/src/delegate_sup.erl b/src/delegate_sup.erl
index 2a8b915b..e31d6d38 100644
--- a/src/delegate_sup.erl
+++ b/src/delegate_sup.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(delegate_sup).
diff --git a/src/dtree.erl b/src/dtree.erl
index ca2d30cf..5ff36bd9 100644
--- a/src/dtree.erl
+++ b/src/dtree.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 %% A dual-index tree.
diff --git a/src/file_handle_cache.erl b/src/file_handle_cache.erl
index f3b4dbaf..bac7c2c1 100644
--- a/src/file_handle_cache.erl
+++ b/src/file_handle_cache.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(file_handle_cache).
@@ -120,12 +120,12 @@
 %% do not need to worry about their handles being closed by the server
 %% - reopening them when necessary is handled transparently.
 %%
-%% The server also supports obtain, release and transfer. obtain/0
+%% The server also supports obtain, release and transfer. obtain/{0,1}
 %% blocks until a file descriptor is available, at which point the
-%% requesting process is considered to 'own' one more
-%% descriptor. release/0 is the inverse operation and releases a
-%% previously obtained descriptor. transfer/1 transfers ownership of a
-%% file descriptor between processes. It is non-blocking. Obtain has a
+%% requesting process is considered to 'own' more descriptor(s).
+%% release/{0,1} is the inverse operation and releases previously obtained
+%% descriptor(s). transfer/{1,2} transfers ownership of file descriptor(s)
+%% between processes. It is non-blocking. Obtain has a
 %% lower limit, set by the ?OBTAIN_LIMIT/1 macro. File handles can use
 %% the entire limit, but will be evicted by obtain calls up to the
 %% point at which no more obtain calls can be satisfied by the obtains
@@ -136,8 +136,8 @@
 %% as sockets can do so in such a way that the overall number of open
 %% file descriptors is managed.
 %%
-%% The callers of register_callback/3, obtain/0, and the argument of
-%% transfer/1 are monitored, reducing the count of handles in use
+%% The callers of register_callback/3, obtain, and the argument of
+%% transfer are monitored, reducing the count of handles in use
 %% appropriately when the processes terminate.
 
 -behaviour(gen_server2).
@@ -146,12 +146,13 @@
 -export([open/3, close/1, read/2, append/2, needs_sync/1, sync/1, position/2,
          truncate/1, current_virtual_offset/1, current_raw_offset/1, flush/1,
          copy/3, set_maximum_since_use/1, delete/1, clear/1]).
--export([obtain/0, release/0, transfer/1, set_limit/1, get_limit/0, info_keys/0,
+-export([obtain/0, obtain/1, release/0, release/1, transfer/1, transfer/2,
+         set_limit/1, get_limit/0, info_keys/0,
          info/0, info/1]).
 -export([ulimit/0]).
 
--export([start_link/0, init/1, handle_call/3, handle_cast/2, handle_info/2,
-         terminate/2, code_change/3, prioritise_cast/2]).
+-export([start_link/0, start_link/2, init/1, handle_call/3, handle_cast/2,
+         handle_info/2, terminate/2, code_change/3, prioritise_cast/3]).
 
 -define(SERVER, ?MODULE).
 -define(RESERVED_FOR_OTHERS, 100).
@@ -195,7 +196,9 @@
           obtain_count,
           obtain_pending,
           clients,
-          timer_ref
+          timer_ref,
+          alarm_set,
+          alarm_clear
         }).
 
 -record(cstate,
@@ -249,8 +252,11 @@
 -spec(clear/1 :: (ref()) -> ok_or_error()).
 -spec(set_maximum_since_use/1 :: (non_neg_integer()) -> 'ok').
 -spec(obtain/0 :: () -> 'ok').
+-spec(obtain/1 :: (non_neg_integer()) -> 'ok').
 -spec(release/0 :: () -> 'ok').
+-spec(release/1 :: (non_neg_integer()) -> 'ok').
 -spec(transfer/1 :: (pid()) -> 'ok').
+-spec(transfer/2 :: (pid(), non_neg_integer()) -> 'ok').
 -spec(set_limit/1 :: (non_neg_integer()) -> 'ok').
 -spec(get_limit/0 :: () -> non_neg_integer()).
 -spec(info_keys/0 :: () -> rabbit_types:info_keys()).
@@ -268,7 +274,11 @@
 %%----------------------------------------------------------------------------
 
 start_link() ->
-    gen_server2:start_link({local, ?SERVER}, ?MODULE, [], [{timeout, infinity}]).
+    start_link(fun alarm_handler:set_alarm/1, fun alarm_handler:clear_alarm/1).
+
+start_link(AlarmSet, AlarmClear) ->
+    gen_server2:start_link({local, ?SERVER}, ?MODULE, [AlarmSet, AlarmClear],
+                           [{timeout, infinity}]).
 
 register_callback(M, F, A)
   when is_atom(M) andalso is_atom(F) andalso is_list(A) ->
@@ -374,11 +384,11 @@ sync(Ref) ->
       end).
 
 needs_sync(Ref) ->
-    with_handles(
-      [Ref],
-      fun ([#handle { is_dirty = false, write_buffer = [] }]) -> false;
-          ([_Handle])                                         -> true
-      end).
+    %% This must *not* use with_handles/2; see bug 25052
+    case get({Ref, fhc_handle}) of
+        #handle { is_dirty = false, write_buffer = [] } -> false;
+        #handle {}                                      -> true
+    end.
 
 position(Ref, NewOffset) ->
     with_flushed_handles(
@@ -479,18 +489,22 @@ set_maximum_since_use(MaximumAge) ->
         true  -> ok
     end.
 
-obtain() ->
+obtain()      -> obtain(1).
+release()     -> release(1).
+transfer(Pid) -> transfer(Pid, 1).
+
+obtain(Count) when Count > 0 ->
     %% If the FHC isn't running, obtains succeed immediately.
     case whereis(?SERVER) of
         undefined -> ok;
-        _         -> gen_server2:call(?SERVER, {obtain, self()}, infinity)
+        _         -> gen_server2:call(?SERVER, {obtain, Count, self()}, infinity)
     end.
 
-release() ->
-    gen_server2:cast(?SERVER, {release, self()}).
+release(Count) when Count > 0 ->
+    gen_server2:cast(?SERVER, {release, Count, self()}).
 
-transfer(Pid) ->
-    gen_server2:cast(?SERVER, {transfer, self(), Pid}).
+transfer(Pid, Count) when Count > 0 ->
+    gen_server2:cast(?SERVER, {transfer, Count, self(), Pid}).
 
 set_limit(Limit) ->
     gen_server2:call(?SERVER, {set_limit, Limit}, infinity).
@@ -806,7 +820,7 @@ i(Item, _) -> throw({bad_argument, Item}).
 %% gen_server2 callbacks
 %%----------------------------------------------------------------------------
 
-init([]) ->
+init([AlarmSet, AlarmClear]) ->
     Limit = case application:get_env(file_handles_high_watermark) of
                 {ok, Watermark} when (is_integer(Watermark) andalso
                                       Watermark > 0) ->
@@ -830,11 +844,13 @@ init([]) ->
                       obtain_count   = 0,
                       obtain_pending = pending_new(),
                       clients        = Clients,
-                      timer_ref      = undefined }}.
+                      timer_ref      = undefined,
+                      alarm_set      = AlarmSet,
+                      alarm_clear    = AlarmClear }}.
 
-prioritise_cast(Msg, _State) ->
+prioritise_cast(Msg, _Len, _State) ->
     case Msg of
-        {release, _}                 -> 5;
+        {release, _, _}              -> 5;
         _                            -> 0
     end.
 
@@ -867,11 +883,12 @@ handle_call({open, Pid, Requested, EldestUnusedSince}, From,
         false -> {noreply, run_pending_item(Item, State)}
     end;
 
-handle_call({obtain, Pid}, From, State = #fhc_state { obtain_count   = Count,
-                                                      obtain_pending = Pending,
-                                                      clients = Clients }) ->
+handle_call({obtain, N, Pid}, From, State = #fhc_state {
+                                              obtain_count   = Count,
+                                              obtain_pending = Pending,
+                                              clients = Clients }) ->
     ok = track_client(Pid, Clients),
-    Item = #pending { kind = obtain, pid = Pid, requested = 1, from = From },
+    Item = #pending { kind = obtain, pid = Pid, requested = N, from = From },
     Enqueue = fun () ->
                       true = ets:update_element(Clients, Pid,
                                                 {#cstate.blocked, true}),
@@ -882,7 +899,7 @@ handle_call({obtain, Pid}, From, State = #fhc_state { obtain_count   = Count,
         case obtain_limit_reached(State) of
             true  -> Enqueue();
             false -> case needs_reduce(State #fhc_state {
-                                      obtain_count = Count + 1 }) of
+                                      obtain_count = Count + N }) of
                          true  -> reduce(Enqueue());
                          false -> adjust_alarm(
                                       State, run_pending_item(Item, State))
@@ -917,9 +934,9 @@ handle_cast({update, Pid, EldestUnusedSince},
     %% storm of messages
     {noreply, State};
 
-handle_cast({release, Pid}, State) ->
+handle_cast({release, N, Pid}, State) ->
     {noreply, adjust_alarm(State, process_pending(
-                                    update_counts(obtain, Pid, -1, State)))};
+                                    update_counts(obtain, Pid, -N, State)))};
 
 handle_cast({close, Pid, EldestUnusedSince},
             State = #fhc_state { elders = Elders, clients = Clients }) ->
@@ -931,11 +948,11 @@ handle_cast({close, Pid, EldestUnusedSince},
     {noreply, adjust_alarm(State, process_pending(
                 update_counts(open, Pid, -1, State)))};
 
-handle_cast({transfer, FromPid, ToPid}, State) ->
+handle_cast({transfer, N, FromPid, ToPid}, State) ->
     ok = track_client(ToPid, State#fhc_state.clients),
     {noreply, process_pending(
-                update_counts(obtain, ToPid, +1,
-                              update_counts(obtain, FromPid, -1, State)))}.
+                update_counts(obtain, ToPid, +N,
+                              update_counts(obtain, FromPid, -N, State)))}.
 
 handle_info(check_counts, State) ->
     {noreply, maybe_reduce(State #fhc_state { timer_ref = undefined })};
@@ -1026,10 +1043,11 @@ obtain_limit_reached(#fhc_state { obtain_limit = Limit,
                                   obtain_count = Count}) ->
     Limit =/= infinity andalso Count >= Limit.
 
-adjust_alarm(OldState, NewState) ->
+adjust_alarm(OldState = #fhc_state { alarm_set   = AlarmSet,
+                                     alarm_clear = AlarmClear }, NewState) ->
     case {obtain_limit_reached(OldState), obtain_limit_reached(NewState)} of
-        {false, true} -> alarm_handler:set_alarm({file_descriptor_limit, []});
-        {true, false} -> alarm_handler:clear_alarm(file_descriptor_limit);
+        {false, true} -> AlarmSet({file_descriptor_limit, []});
+        {true, false} -> AlarmClear(file_descriptor_limit);
         _             -> ok
     end,
     NewState.
diff --git a/src/gatherer.erl b/src/gatherer.erl
index 98b36038..c13298ca 100644
--- a/src/gatherer.erl
+++ b/src/gatherer.erl
@@ -10,15 +10,15 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(gatherer).
 
 -behaviour(gen_server2).
 
--export([start_link/0, stop/1, fork/1, finish/1, in/2, out/1]).
+-export([start_link/0, stop/1, fork/1, finish/1, in/2, sync_in/2, out/1]).
 
 -export([init/1, handle_call/3, handle_cast/2, handle_info/2,
          terminate/2, code_change/3]).
@@ -32,6 +32,7 @@
 -spec(fork/1 :: (pid()) -> 'ok').
 -spec(finish/1 :: (pid()) -> 'ok').
 -spec(in/2 :: (pid(), any()) -> 'ok').
+-spec(sync_in/2 :: (pid(), any()) -> 'ok').
 -spec(out/1 :: (pid()) -> {'value', any()} | 'empty').
 
 -endif.
@@ -62,6 +63,9 @@ finish(Pid) ->
 in(Pid, Value) ->
     gen_server2:cast(Pid, {in, Value}).
 
+sync_in(Pid, Value) ->
+    gen_server2:call(Pid, {in, Value}, infinity).
+
 out(Pid) ->
     gen_server2:call(Pid, out, infinity).
 
@@ -78,19 +82,22 @@ handle_call(stop, _From, State) ->
 handle_call(fork, _From, State = #gstate { forks = Forks }) ->
     {reply, ok, State #gstate { forks = Forks + 1 }, hibernate};
 
+handle_call({in, Value}, From, State) ->
+    {noreply, in(Value, From, State), hibernate};
+
 handle_call(out, From, State = #gstate { forks   = Forks,
                                          values  = Values,
                                          blocked = Blocked }) ->
     case queue:out(Values) of
+        {empty, _} when Forks == 0 ->
+            {reply, empty, State, hibernate};
         {empty, _} ->
-            case Forks of
-                0 -> {reply, empty, State, hibernate};
-                _ -> {noreply,
-                      State #gstate { blocked = queue:in(From, Blocked) },
-                      hibernate}
-            end;
-        {{value, _Value} = V, NewValues} ->
-            {reply, V, State #gstate { values = NewValues }, hibernate}
+            {noreply, State #gstate { blocked = queue:in(From, Blocked) },
+             hibernate};
+        {{value, {PendingIn, Value}}, NewValues} ->
+            reply(PendingIn, ok),
+            {reply, {value, Value}, State #gstate { values = NewValues },
+             hibernate}
     end;
 
 handle_call(Msg, _From, State) ->
@@ -107,15 +114,8 @@ handle_cast(finish, State = #gstate { forks = Forks, blocked = Blocked }) ->
     {noreply, State #gstate { forks = NewForks, blocked = NewBlocked },
      hibernate};
 
-handle_cast({in, Value}, State = #gstate { values  = Values,
-                                           blocked = Blocked }) ->
-    {noreply, case queue:out(Blocked) of
-                  {empty, _} ->
-                      State #gstate { values = queue:in(Value, Values) };
-                  {{value, From}, NewBlocked} ->
-                      gen_server2:reply(From, {value, Value}),
-                      State #gstate { blocked = NewBlocked }
-              end, hibernate};
+handle_cast({in, Value}, State) ->
+    {noreply, in(Value, undefined, State), hibernate};
 
 handle_cast(Msg, State) ->
     {stop, {unexpected_cast, Msg}, State}.
@@ -128,3 +128,18 @@ code_change(_OldVsn, State, _Extra) ->
 
 terminate(_Reason, State) ->
     State.
+
+%%----------------------------------------------------------------------------
+
+in(Value, From,  State = #gstate { values = Values, blocked = Blocked }) ->
+    case queue:out(Blocked) of
+        {empty, _} ->
+            State #gstate { values = queue:in({From, Value}, Values) };
+        {{value, PendingOut}, NewBlocked} ->
+            reply(From, ok),
+            gen_server2:reply(PendingOut, {value, Value}),
+            State #gstate { blocked = NewBlocked }
+    end.
+
+reply(undefined, _Reply) -> ok;
+reply(From,       Reply) -> gen_server2:reply(From, Reply).
diff --git a/src/gen_server2.erl b/src/gen_server2.erl
index 78bbbe06..6690d181 100644
--- a/src/gen_server2.erl
+++ b/src/gen_server2.erl
@@ -16,12 +16,15 @@
 %% The original code could reorder messages when communicating with a
 %% process on a remote node that was not currently connected.
 %%
-%% 4) The callback module can optionally implement prioritise_call/3,
-%% prioritise_cast/2 and prioritise_info/2.  These functions take
-%% Message, From and State or just Message and State and return a
-%% single integer representing the priority attached to the message.
-%% Messages with higher priorities are processed before requests with
-%% lower priorities. The default priority is 0.
+%% 4) The callback module can optionally implement prioritise_call/4,
+%% prioritise_cast/3 and prioritise_info/3.  These functions take
+%% Message, From, Length and State or just Message, Length and State
+%% (where Length is the current number of messages waiting to be
+%% processed) and return a single integer representing the priority
+%% attached to the message, or 'drop' to ignore it (for
+%% prioritise_cast/3 and prioritise_info/3 only).  Messages with
+%% higher priorities are processed before requests with lower
+%% priorities. The default priority is 0.
 %%
 %% 5) The callback module can optionally implement
 %% handle_pre_hibernate/1 and handle_post_hibernate/1. These will be
@@ -72,8 +75,14 @@
 %% format_message_queue/2 which is the equivalent of format_status/2
 %% but where the second argument is specifically the priority_queue
 %% which contains the prioritised message_queue.
+%%
+%% 9) The function with_state/2 can be used to debug a process with
+%% heavyweight state (without needing to copy the entire state out of
+%% the process as sys:get_status/1 would). Pass in a function to be
+%% invoked on the state; its result is returned to the caller. The
+%% state is not modified.
 
-%% All modifications are (C) 2009-2012 VMware, Inc.
+%% All modifications are (C) 2009-2013 GoPivotal, Inc.
 
 %% ``The contents of this file are subject to the Erlang Public License,
 %% Version 1.1, (the "License"); you may not use this file except in
@@ -181,6 +190,7 @@
          cast/2, reply/2,
          abcast/2, abcast/3,
          multi_call/2, multi_call/3, multi_call/4,
+         with_state/2,
          enter_loop/3, enter_loop/4, enter_loop/5, enter_loop/6, wake_hib/1]).
 
 %% System exports
@@ -196,8 +206,7 @@
 
 %% State record
 -record(gs2_state, {parent, name, state, mod, time,
-                    timeout_state, queue, debug, prioritise_call,
-                    prioritise_cast, prioritise_info}).
+                    timeout_state, queue, debug, prioritisers}).
 
 -ifdef(use_specs).
 
@@ -380,6 +389,16 @@ multi_call(Nodes, Name, Req, Timeout)
   when is_list(Nodes), is_atom(Name), is_integer(Timeout), Timeout >= 0 ->
     do_multi_call(Nodes, Name, Req, Timeout).
 
+%% -----------------------------------------------------------------
+%% Apply a function to a generic server's state.
+%% -----------------------------------------------------------------
+with_state(Name, Fun) ->
+    case catch gen:call(Name, '$with_state', Fun, infinity) of
+        {ok,Res} ->
+            Res;
+        {'EXIT',Reason} ->
+            exit({Reason, {?MODULE, with_state, [Name, Fun]}})
+    end.
 
 %%-----------------------------------------------------------------
 %% enter_loop(Mod, Options, State, <ServerName>, <TimeOut>, <Backoff>) ->_
@@ -638,17 +657,22 @@ adjust_timeout_state(SleptAt, AwokeAt, {backoff, CurrentTO, MinimumTO,
     {backoff, CurrentTO1, MinimumTO, DesiredHibPeriod, RandomState1}.
 
 in({'$gen_cast', Msg} = Input,
-   GS2State = #gs2_state { prioritise_cast = PC }) ->
-    in(Input, PC(Msg, GS2State), GS2State);
+   GS2State = #gs2_state { prioritisers = {_, F, _} }) ->
+    in(Input, F(Msg, GS2State), GS2State);
 in({'$gen_call', From, Msg} = Input,
-   GS2State = #gs2_state { prioritise_call = PC }) ->
-    in(Input, PC(Msg, From, GS2State), GS2State);
+   GS2State = #gs2_state { prioritisers = {F, _, _} }) ->
+    in(Input, F(Msg, From, GS2State), GS2State);
+in({'$with_state', _From, _Fun} = Input, GS2State) ->
+    in(Input, 0, GS2State);
 in({'EXIT', Parent, _R} = Input, GS2State = #gs2_state { parent = Parent }) ->
     in(Input, infinity, GS2State);
 in({system, _From, _Req} = Input, GS2State) ->
     in(Input, infinity, GS2State);
-in(Input, GS2State = #gs2_state { prioritise_info = PI }) ->
-    in(Input, PI(Input, GS2State), GS2State).
+in(Input, GS2State = #gs2_state { prioritisers = {_, _, F} }) ->
+    in(Input, F(Input, GS2State), GS2State).
+
+in(_Input, drop, GS2State) ->
+    GS2State;
 
 in(Input, Priority, GS2State = #gs2_state { queue = Queue }) ->
     GS2State # gs2_state { queue = priority_queue:in(Input, Priority, Queue) }.
@@ -658,6 +682,10 @@ process_msg({system, From, Req},
     %% gen_server puts Hib on the end as the 7th arg, but that version
     %% of the fun seems not to be documented so leaving out for now.
     sys:handle_system_msg(Req, From, Parent, ?MODULE, Debug, GS2State);
+process_msg({'$with_state', From, Fun},
+           GS2State = #gs2_state{state = State}) ->
+    reply(From, catch Fun(State)),
+    loop(GS2State);
 process_msg({'EXIT', Parent, Reason} = Msg,
             GS2State = #gs2_state { parent = Parent }) ->
     terminate(Reason, Msg, GS2State);
@@ -864,13 +892,19 @@ dispatch(Info, Mod, State) ->
 common_reply(_Name, From, Reply, _NState, [] = _Debug) ->
     reply(From, Reply),
     [];
-common_reply(Name, From, Reply, NState, Debug) ->
-    reply(Name, From, Reply, NState, Debug).
+common_reply(Name, {To, _Tag} = From, Reply, NState, Debug) ->
+    reply(From, Reply),
+    sys:handle_debug(Debug, fun print_event/3, Name, {out, Reply, To, NState}).
 
-common_debug([] = _Debug, _Func, _Info, _Event) ->
+common_noreply(_Name, _NState, [] = _Debug) ->
     [];
-common_debug(Debug, Func, Info, Event) ->
-    sys:handle_debug(Debug, Func, Info, Event).
+common_noreply(Name, NState, Debug) ->
+    sys:handle_debug(Debug, fun print_event/3, Name, {noreply, NState}).
+
+common_become(_Name, _Mod, _NState, [] = _Debug) ->
+    [];
+common_become(Name, Mod, NState, Debug) ->
+    sys:handle_debug(Debug, fun print_event/3, Name, {become, Mod, NState}).
 
 handle_msg({'$gen_call', From, Msg}, GS2State = #gs2_state { mod = Mod,
                                                              state = State,
@@ -887,23 +921,11 @@ handle_msg({'$gen_call', From, Msg}, GS2State = #gs2_state { mod = Mod,
             loop(GS2State #gs2_state { state = NState,
                                        time  = Time1,
                                        debug = Debug1});
-        {noreply, NState} ->
-            Debug1 = common_debug(Debug, fun print_event/3, Name,
-                                  {noreply, NState}),
-            loop(GS2State #gs2_state {state = NState,
-                                      time  = infinity,
-                                      debug = Debug1});
-        {noreply, NState, Time1} ->
-            Debug1 = common_debug(Debug, fun print_event/3, Name,
-                                  {noreply, NState}),
-            loop(GS2State #gs2_state {state = NState,
-                                      time  = Time1,
-                                      debug = Debug1});
         {stop, Reason, Reply, NState} ->
             {'EXIT', R} =
                 (catch terminate(Reason, Msg,
                                  GS2State #gs2_state { state = NState })),
-            reply(Name, From, Reply, NState, Debug),
+            common_reply(Name, From, Reply, NState, Debug),
             exit(R);
         Other ->
             handle_common_reply(Other, Msg, GS2State)
@@ -916,28 +938,24 @@ handle_common_reply(Reply, Msg, GS2State = #gs2_state { name  = Name,
                                                         debug = Debug}) ->
     case Reply of
         {noreply, NState} ->
-            Debug1 = common_debug(Debug, fun print_event/3, Name,
-                                  {noreply, NState}),
-            loop(GS2State #gs2_state { state = NState,
-                                       time  = infinity,
-                                       debug = Debug1 });
+            Debug1 = common_noreply(Name, NState, Debug),
+            loop(GS2State #gs2_state {state = NState,
+                                      time  = infinity,
+                                      debug = Debug1});
         {noreply, NState, Time1} ->
-            Debug1 = common_debug(Debug, fun print_event/3, Name,
-                                  {noreply, NState}),
-            loop(GS2State #gs2_state { state = NState,
-                                       time  = Time1,
-                                       debug = Debug1 });
+            Debug1 = common_noreply(Name, NState, Debug),
+            loop(GS2State #gs2_state {state = NState,
+                                      time  = Time1,
+                                      debug = Debug1});
         {become, Mod, NState} ->
-            Debug1 = common_debug(Debug, fun print_event/3, Name,
-                                  {become, Mod, NState}),
+            Debug1 = common_become(Name, Mod, NState, Debug),
             loop(find_prioritisers(
                    GS2State #gs2_state { mod   = Mod,
                                          state = NState,
                                          time  = infinity,
                                          debug = Debug1 }));
         {become, Mod, NState, Time1} ->
-            Debug1 = common_debug(Debug, fun print_event/3, Name,
-                                  {become, Mod, NState}),
+            Debug1 = common_become(Name, Mod, NState, Debug),
             loop(find_prioritisers(
                    GS2State #gs2_state { mod   = Mod,
                                          state = NState,
@@ -957,12 +975,6 @@ handle_common_termination(Reply, Msg, GS2State) ->
             terminate({bad_return_value, Reply}, Msg, GS2State)
     end.
 
-reply(Name, {To, Tag}, Reply, State, Debug) ->
-    reply({To, Tag}, Reply),
-    sys:handle_debug(
-      Debug, fun print_event/3, Name, {out, Reply, To, State}).
-
-
 %%-----------------------------------------------------------------
 %% Callback functions for system messages handling.
 %%-----------------------------------------------------------------
@@ -1165,30 +1177,33 @@ whereis_name(Name) ->
     end.
 
 find_prioritisers(GS2State = #gs2_state { mod = Mod }) ->
-    PrioriCall = function_exported_or_default(
-                   Mod, 'prioritise_call', 3,
-                   fun (_Msg, _From, _State) -> 0 end),
-    PrioriCast = function_exported_or_default(Mod, 'prioritise_cast', 2,
-                                              fun (_Msg, _State) -> 0 end),
-    PrioriInfo = function_exported_or_default(Mod, 'prioritise_info', 2,
-                                              fun (_Msg, _State) -> 0 end),
-    GS2State #gs2_state { prioritise_call = PrioriCall,
-                          prioritise_cast = PrioriCast,
-                          prioritise_info = PrioriInfo }.
+    PCall = function_exported_or_default(Mod, 'prioritise_call', 4,
+                                         fun (_Msg, _From, _State) -> 0 end),
+    PCast = function_exported_or_default(Mod, 'prioritise_cast', 3,
+                                         fun (_Msg, _State) -> 0 end),
+    PInfo = function_exported_or_default(Mod, 'prioritise_info', 3,
+                                         fun (_Msg, _State) -> 0 end),
+    GS2State #gs2_state { prioritisers = {PCall, PCast, PInfo} }.
 
 function_exported_or_default(Mod, Fun, Arity, Default) ->
     case erlang:function_exported(Mod, Fun, Arity) of
         true -> case Arity of
-                    2 -> fun (Msg, GS2State = #gs2_state { state = State }) ->
-                                 case catch Mod:Fun(Msg, State) of
+                    3 -> fun (Msg, GS2State = #gs2_state { queue = Queue,
+                                                           state = State }) ->
+                                 Length = priority_queue:len(Queue),
+                                 case catch Mod:Fun(Msg, Length, State) of
+                                     drop ->
+                                         drop;
                                      Res when is_integer(Res) ->
                                          Res;
                                      Err ->
                                          handle_common_termination(Err, Msg, GS2State)
                                  end
                          end;
-                    3 -> fun (Msg, From, GS2State = #gs2_state { state = State }) ->
-                                 case catch Mod:Fun(Msg, From, State) of
+                    4 -> fun (Msg, From, GS2State = #gs2_state { queue = Queue,
+                                                                 state = State }) ->
+                                 Length = priority_queue:len(Queue),
+                                 case catch Mod:Fun(Msg, From, Length, State) of
                                      Res when is_integer(Res) ->
                                          Res;
                                      Err ->
diff --git a/src/gm.erl b/src/gm.erl
index 97c81ec6..78099499 100644
--- a/src/gm.erl
+++ b/src/gm.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(gm).
@@ -77,9 +77,19 @@
 %% confirmed_broadcast/2 directly from the callback module otherwise
 %% you will deadlock the entire group.
 %%
-%% group_members/1
-%% Provide the Pid. Returns a list of the current group members.
+%% info/1
+%% Provide the Pid. Returns a proplist with various facts, including
+%% the group name and the current group members.
 %%
+%% validate_members/2
+%% Check whether a given member list agrees with the chosen member's
+%% view. Any differences will be communicated via the members_changed
+%% callback. If there are no differences then the callback is not
+%% invoked. Note that members will not necessarily share the same view.
+%%
+%% forget_group/1
+%% Provide the group name. Removes its mnesia record. Makes no attempt
+%% to ensure the group is empty.
 %%
 %% Implementation Overview
 %% -----------------------
@@ -372,11 +382,11 @@
 
 -behaviour(gen_server2).
 
--export([create_tables/0, start_link/3, leave/1, broadcast/2,
-         confirmed_broadcast/2, group_members/1]).
+-export([create_tables/0, start_link/4, leave/1, broadcast/2,
+         confirmed_broadcast/2, info/1, validate_members/2, forget_group/1]).
 
 -export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2,
-         code_change/3, prioritise_info/2]).
+         code_change/3, prioritise_info/3]).
 
 -ifndef(use_specs).
 -export([behaviour_info/1]).
@@ -404,7 +414,8 @@
           callback_args,
           confirms,
           broadcast_buffer,
-          broadcast_timer
+          broadcast_timer,
+          txn_executor
         }).
 
 -record(gm_group, { name, version, members }).
@@ -424,14 +435,17 @@
 -export_type([group_name/0]).
 
 -type(group_name() :: any()).
+-type(txn_fun() :: fun((fun(() -> any())) -> any())).
 
 -spec(create_tables/0 :: () -> 'ok' | {'aborted', any()}).
--spec(start_link/3 :: (group_name(), atom(), any()) ->
+-spec(start_link/4 :: (group_name(), atom(), any(), txn_fun()) ->
                            rabbit_types:ok_pid_or_error()).
 -spec(leave/1 :: (pid()) -> 'ok').
 -spec(broadcast/2 :: (pid(), any()) -> 'ok').
 -spec(confirmed_broadcast/2 :: (pid(), any()) -> 'ok').
--spec(group_members/1 :: (pid()) -> [pid()]).
+-spec(info/1 :: (pid()) -> rabbit_types:infos()).
+-spec(validate_members/2 :: (pid(), [pid()]) -> 'ok').
+-spec(forget_group/1 :: (group_name()) -> 'ok').
 
 %% The joined, members_changed and handle_msg callbacks can all return
 %% any of the following terms:
@@ -502,8 +516,8 @@ table_definitions() ->
     {Name, Attributes} = ?TABLE,
     [{Name, [?TABLE_MATCH | Attributes]}].
 
-start_link(GroupName, Module, Args) ->
-    gen_server2:start_link(?MODULE, [GroupName, Module, Args], []).
+start_link(GroupName, Module, Args, TxnFun) ->
+    gen_server2:start_link(?MODULE, [GroupName, Module, Args, TxnFun], []).
 
 leave(Server) ->
     gen_server2:cast(Server, leave).
@@ -514,11 +528,20 @@ broadcast(Server, Msg) ->
 confirmed_broadcast(Server, Msg) ->
     gen_server2:call(Server, {confirmed_broadcast, Msg}, infinity).
 
-group_members(Server) ->
-    gen_server2:call(Server, group_members, infinity).
+info(Server) ->
+    gen_server2:call(Server, info, infinity).
+
+validate_members(Server, Members) ->
+    gen_server2:cast(Server, {validate_members, Members}).
 
+forget_group(GroupName) ->
+    {atomic, ok} = mnesia:sync_transaction(
+                     fun () ->
+                             mnesia:delete({?GROUP_TABLE, GroupName})
+                     end),
+    ok.
 
-init([GroupName, Module, Args]) ->
+init([GroupName, Module, Args, TxnFun]) ->
     {MegaSecs, Secs, MicroSecs} = now(),
     random:seed(MegaSecs, Secs, MicroSecs),
     Self = make_member(GroupName),
@@ -534,7 +557,8 @@ init([GroupName, Module, Args]) ->
                   callback_args    = Args,
                   confirms         = queue:new(),
                   broadcast_buffer = [],
-                  broadcast_timer  = undefined }, hibernate,
+                  broadcast_timer  = undefined,
+                  txn_executor     = TxnFun }, hibernate,
      {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN, ?DESIRED_HIBERNATE}}.
 
 
@@ -553,12 +577,16 @@ handle_call({confirmed_broadcast, Msg}, _From,
 handle_call({confirmed_broadcast, Msg}, From, State) ->
     internal_broadcast(Msg, From, State);
 
-handle_call(group_members, _From,
+handle_call(info, _From,
             State = #state { members_state = undefined }) ->
     reply(not_joined, State);
 
-handle_call(group_members, _From, State = #state { view = View }) ->
-    reply(alive_view_members(View), State);
+handle_call(info, _From, State = #state { group_name = GroupName,
+                                          module     = Module,
+                                          view       = View }) ->
+    reply([{group_name,    GroupName},
+           {module,        Module},
+           {group_members, get_pids(alive_view_members(View))}], State);
 
 handle_call({add_on_right, _NewMember}, _From,
             State = #state { members_state = undefined }) ->
@@ -570,7 +598,8 @@ handle_call({add_on_right, NewMember}, _From,
                              view          = View,
                              members_state = MembersState,
                              module        = Module,
-                             callback_args = Args }) ->
+                             callback_args = Args,
+                             txn_executor  = TxnFun }) ->
     {MembersState1, Group} =
       record_new_member_in_group(
         GroupName, Self, NewMember,
@@ -581,7 +610,7 @@ handle_call({add_on_right, NewMember}, _From,
                                 {catchup, Self,
                                  prepare_members_state(MembersState1)}),
                 MembersState1
-        end),
+        end, TxnFun),
     View2 = group_to_view(Group),
     State1 = check_neighbours(State #state { view          = View2,
                                              members_state = MembersState1 }),
@@ -627,8 +656,9 @@ handle_cast(join, State = #state { self          = Self,
                                    group_name    = GroupName,
                                    members_state = undefined,
                                    module        = Module,
-                                   callback_args = Args }) ->
-    View = join_group(Self, GroupName),
+                                   callback_args = Args,
+                                   txn_executor  = TxnFun }) ->
+    View = join_group(Self, GroupName, TxnFun),
     MembersState =
         case alive_view_members(View) of
             [Self] -> blank_member_state();
@@ -639,6 +669,19 @@ handle_cast(join, State = #state { self          = Self,
     handle_callback_result(
       {Module:joined(Args, get_pids(all_known_members(View))), State1});
 
+handle_cast({validate_members, OldMembers},
+            State = #state { view          = View,
+                             module        = Module,
+                             callback_args = Args }) ->
+    NewMembers = get_pids(all_known_members(View)),
+    Births = NewMembers -- OldMembers,
+    Deaths = OldMembers -- NewMembers,
+    case {Births, Deaths} of
+        {[], []}  -> noreply(State);
+        _         -> Result = Module:members_changed(Args, Births, Deaths),
+                     handle_callback_result({Result, State})
+    end;
+
 handle_cast(leave, State) ->
     {stop, normal, State}.
 
@@ -647,7 +690,10 @@ handle_info(flush, State) ->
     noreply(
       flush_broadcast_buffer(State #state { broadcast_timer = undefined }));
 
-handle_info({'DOWN', MRef, process, _Pid, _Reason},
+handle_info(timeout, State) ->
+    noreply(flush_broadcast_buffer(State));
+
+handle_info({'DOWN', MRef, process, _Pid, Reason},
             State = #state { self          = Self,
                              left          = Left,
                              right         = Right,
@@ -655,18 +701,22 @@ handle_info({'DOWN', MRef, process, _Pid, _Reason},
                              view          = View,
                              module        = Module,
                              callback_args = Args,
-                             confirms      = Confirms }) ->
+                             confirms      = Confirms,
+                             txn_executor  = TxnFun }) ->
     Member = case {Left, Right} of
                  {{Member1, MRef}, _} -> Member1;
                  {_, {Member1, MRef}} -> Member1;
                  _                    -> undefined
              end,
-    case Member of
-        undefined ->
+    case {Member, Reason} of
+        {undefined, _} ->
+            noreply(State);
+        {_, {shutdown, ring_shutdown}} ->
             noreply(State);
         _ ->
             View1 =
-                group_to_view(record_dead_member_in_group(Member, GroupName)),
+                group_to_view(record_dead_member_in_group(Member,
+                                                          GroupName, TxnFun)),
             {Result, State2} =
                 case alive_view_members(View1) of
                     [Self] ->
@@ -694,12 +744,12 @@ terminate(Reason, State = #state { module        = Module,
 code_change(_OldVsn, State, _Extra) ->
     {ok, State}.
 
-prioritise_info(flush, _State) ->
+prioritise_info(flush, _Len, _State) ->
     1;
-prioritise_info({'DOWN', _MRef, process, _Pid, _Reason},
+prioritise_info({'DOWN', _MRef, process, _Pid, _Reason}, _Len,
                 #state { members_state = MS }) when MS /= undefined ->
     1;
-prioritise_info(_, _State) ->
+prioritise_info(_, _Len, _State) ->
     0.
 
 
@@ -810,10 +860,13 @@ handle_msg({activity, _NotLeft, _Activity}, State) ->
 
 
 noreply(State) ->
-    {noreply, ensure_broadcast_timer(State), hibernate}.
+    {noreply, ensure_broadcast_timer(State), flush_timeout(State)}.
 
 reply(Reply, State) ->
-    {reply, Reply, ensure_broadcast_timer(State), hibernate}.
+    {reply, Reply, ensure_broadcast_timer(State), flush_timeout(State)}.
+
+flush_timeout(#state{broadcast_buffer = []}) -> hibernate;
+flush_timeout(_)                             -> 0.
 
 ensure_broadcast_timer(State = #state { broadcast_buffer = [],
                                         broadcast_timer  = undefined }) ->
@@ -876,11 +929,9 @@ flush_broadcast_buffer(State = #state { self             = Self,
 %% View construction and inspection
 %% ---------------------------------------------------------------------------
 
-needs_view_update(ReqVer, {Ver, _View}) ->
-    Ver < ReqVer.
+needs_view_update(ReqVer, {Ver, _View}) -> Ver < ReqVer.
 
-view_version({Ver, _View}) ->
-    Ver.
+view_version({Ver, _View}) -> Ver.
 
 is_member_alive({dead, _Member}) -> false;
 is_member_alive(_)               -> true.
@@ -899,17 +950,13 @@ store_view_member(VMember = #view_member { id = Id }, {Ver, View}) ->
 with_view_member(Fun, View, Id) ->
     store_view_member(Fun(fetch_view_member(Id, View)), View).
 
-fetch_view_member(Id, {_Ver, View}) ->
-    ?DICT:fetch(Id, View).
+fetch_view_member(Id, {_Ver, View}) -> ?DICT:fetch(Id, View).
 
-find_view_member(Id, {_Ver, View}) ->
-    ?DICT:find(Id, View).
+find_view_member(Id, {_Ver, View}) -> ?DICT:find(Id, View).
 
-blank_view(Ver) ->
-    {Ver, ?DICT:new()}.
+blank_view(Ver) -> {Ver, ?DICT:new()}.
 
-alive_view_members({_Ver, View}) ->
-    ?DICT:fetch_keys(View).
+alive_view_members({_Ver, View}) -> ?DICT:fetch_keys(View).
 
 all_known_members({_Ver, View}) ->
     ?DICT:fold(
@@ -974,14 +1021,15 @@ ensure_alive_suffix1(MembersQ) ->
 %% View modification
 %% ---------------------------------------------------------------------------
 
-join_group(Self, GroupName) ->
-    join_group(Self, GroupName, read_group(GroupName)).
+join_group(Self, GroupName, TxnFun) ->
+    join_group(Self, GroupName, read_group(GroupName), TxnFun).
 
-join_group(Self, GroupName, {error, not_found}) ->
-    join_group(Self, GroupName, prune_or_create_group(Self, GroupName));
-join_group(Self, _GroupName, #gm_group { members = [Self] } = Group) ->
+join_group(Self, GroupName, {error, not_found}, TxnFun) ->
+    join_group(Self, GroupName,
+               prune_or_create_group(Self, GroupName, TxnFun), TxnFun);
+join_group(Self, _GroupName, #gm_group { members = [Self] } = Group, _TxnFun) ->
     group_to_view(Group);
-join_group(Self, GroupName, #gm_group { members = Members } = Group) ->
+join_group(Self, GroupName, #gm_group { members = Members } = Group, TxnFun) ->
     case lists:member(Self, Members) of
         true ->
             group_to_view(Group);
@@ -989,20 +1037,25 @@ join_group(Self, GroupName, #gm_group { members = Members } = Group) ->
             case lists:filter(fun is_member_alive/1, Members) of
                 [] ->
+                    %% Retry via join_group/4: with only three arguments the
+                    %% group record would be taken as TxnFun by join_group/3.
                     join_group(Self, GroupName,
-                               prune_or_create_group(Self, GroupName));
+                               prune_or_create_group(Self, GroupName, TxnFun),
+                               TxnFun);
                 Alive ->
                     Left = lists:nth(random:uniform(length(Alive)), Alive),
                     Handler =
                         fun () ->
                                 join_group(
                                   Self, GroupName,
-                                  record_dead_member_in_group(Left, GroupName))
+                                  record_dead_member_in_group(
+                                    Left, GroupName, TxnFun),
+                                  TxnFun)
                         end,
                     try
                         case gen_server2:call(
                                get_pid(Left), {add_on_right, Self}, infinity) of
                             {ok, Group1} -> group_to_view(Group1);
-                            not_ready    -> join_group(Self, GroupName)
+                            not_ready    -> join_group(Self, GroupName, TxnFun)
                         end
                     catch
                         exit:{R, _}
@@ -1021,29 +1071,29 @@ read_group(GroupName) ->
         [Group] -> Group
     end.
 
-prune_or_create_group(Self, GroupName) ->
-    {atomic, Group} =
-        mnesia:sync_transaction(
-          fun () -> GroupNew = #gm_group { name    = GroupName,
-                                           members = [Self],
-                                           version = ?VERSION_START },
-                    case mnesia:read({?GROUP_TABLE, GroupName}) of
-                        [] ->
-                            mnesia:write(GroupNew),
-                            GroupNew;
-                        [Group1 = #gm_group { members = Members }] ->
-                            case lists:any(fun is_member_alive/1, Members) of
-                                true  -> Group1;
-                                false -> mnesia:write(GroupNew),
-                                         GroupNew
-                            end
-                    end
-          end),
+prune_or_create_group(Self, GroupName, TxnFun) ->
+    Group = TxnFun(
+              fun () ->
+                      GroupNew = #gm_group { name    = GroupName,
+                                             members = [Self],
+                                             version = get_version(Self) },
+                      case mnesia:read({?GROUP_TABLE, GroupName}) of
+                          [] ->
+                              mnesia:write(GroupNew),
+                              GroupNew;
+                          [Group1 = #gm_group { members = Members }] ->
+                              case lists:any(fun is_member_alive/1, Members) of
+                                  true  -> Group1;
+                                  false -> mnesia:write(GroupNew),
+                                           GroupNew
+                              end
+                      end
+              end),
     Group.
 
-record_dead_member_in_group(Member, GroupName) ->
-    {atomic, Group} =
-        mnesia:sync_transaction(
+record_dead_member_in_group(Member, GroupName, TxnFun) ->
+    Group =
+        TxnFun(
           fun () -> [Group1 = #gm_group { members = Members, version = Ver }] =
                         mnesia:read({?GROUP_TABLE, GroupName}),
                     case lists:splitwith(
@@ -1060,9 +1110,9 @@ record_dead_member_in_group(Member, GroupName) ->
           end),
     Group.
 
-record_new_member_in_group(GroupName, Left, NewMember, Fun) ->
-    {atomic, {Result, Group}} =
-        mnesia:sync_transaction(
+record_new_member_in_group(GroupName, Left, NewMember, Fun, TxnFun) ->
+    {Result, Group} =
+        TxnFun(
           fun () ->
                   [#gm_group { members = Members, version = Ver } = Group1] =
                       mnesia:read({?GROUP_TABLE, GroupName}),
@@ -1077,10 +1127,10 @@ record_new_member_in_group(GroupName, Left, NewMember, Fun) ->
           end),
     {Result, Group}.
 
-erase_members_in_group(Members, GroupName) ->
+erase_members_in_group(Members, GroupName, TxnFun) ->
     DeadMembers = [{dead, Id} || Id <- Members],
-    {atomic, Group} =
-        mnesia:sync_transaction(
+    Group =
+        TxnFun(
           fun () ->
                   [Group1 = #gm_group { members = [_|_] = Members1,
                                         version = Ver }] =
@@ -1101,7 +1151,8 @@ maybe_erase_aliases(State = #state { self          = Self,
                                      view          = View0,
                                      members_state = MembersState,
                                      module        = Module,
-                                     callback_args = Args }, View) ->
+                                     callback_args = Args,
+                                     txn_executor  = TxnFun }, View) ->
     #view_member { aliases = Aliases } = fetch_view_member(Self, View),
     {Erasable, MembersState1}
         = ?SETS:fold(
@@ -1118,7 +1169,7 @@ maybe_erase_aliases(State = #state { self          = Self,
     case Erasable of
         [] -> {ok, State1 #state { view = View }};
         _  -> View1 = group_to_view(
-                        erase_members_in_group(Erasable, GroupName)),
+                        erase_members_in_group(Erasable, GroupName, TxnFun)),
               {callback_view_changed(Args, Module, View0, View1),
                check_neighbours(State1 #state { view = View1 })}
     end.
@@ -1150,10 +1201,8 @@ ensure_neighbour(Ver, Self, {RealNeighbour, MRef}, Neighbour) ->
          end,
     {Neighbour, maybe_monitor(Neighbour, Self)}.
 
-maybe_monitor(Self, Self) ->
-    undefined;
-maybe_monitor(Other, _Self) ->
-    erlang:monitor(process, get_pid(Other)).
+maybe_monitor( Self,  Self) -> undefined;
+maybe_monitor(Other, _Self) -> erlang:monitor(process, get_pid(Other)).
 
 check_neighbours(State = #state { self             = Self,
                                   left             = Left,
@@ -1242,23 +1291,19 @@ find_member_or_blank(Id, MembersState) ->
         error        -> blank_member()
     end.
 
-erase_member(Id, MembersState) ->
-    ?DICT:erase(Id, MembersState).
+erase_member(Id, MembersState) -> ?DICT:erase(Id, MembersState).
 
 blank_member() ->
     #member { pending_ack = queue:new(), last_pub = -1, last_ack = -1 }.
 
-blank_member_state() ->
-    ?DICT:new().
+blank_member_state() -> ?DICT:new().
 
 store_member(Id, MemberState, MembersState) ->
     ?DICT:store(Id, MemberState, MembersState).
 
-prepare_members_state(MembersState) ->
-    ?DICT:to_list(MembersState).
+prepare_members_state(MembersState) -> ?DICT:to_list(MembersState).
 
-build_members_state(MembersStateList) ->
-    ?DICT:from_list(MembersStateList).
+build_members_state(MembersStateList) -> ?DICT:from_list(MembersStateList).
 
 make_member(GroupName) ->
    {case read_group(GroupName) of
@@ -1272,6 +1317,8 @@ remove_erased_members(MembersState, View) ->
                                  MembersState1)
                 end, blank_member_state(), all_known_members(View)).
 
+%% Member ids are {Version, Pid} pairs (see get_pid/1); return the version.
+get_version({Ver, _}) -> Ver.
+
 get_pid({_Version, Pid}) -> Pid.
 
 get_pids(Ids) -> [Pid || {_Version, Pid} <- Ids].
@@ -1280,16 +1327,12 @@ get_pids(Ids) -> [Pid || {_Version, Pid} <- Ids].
 %% Activity assembly
 %% ---------------------------------------------------------------------------
 
-activity_nil() ->
-    queue:new().
+activity_nil() -> queue:new().
 
-activity_cons(_Id, [], [], Tail) ->
-    Tail;
-activity_cons(Sender, Pubs, Acks, Tail) ->
-    queue:in({Sender, Pubs, Acks}, Tail).
+activity_cons(   _Id,   [],   [], Tail) -> Tail;
+activity_cons(Sender, Pubs, Acks, Tail) -> queue:in({Sender, Pubs, Acks}, Tail).
 
-activity_finalise(Activity) ->
-    queue:to_list(Activity).
+activity_finalise(Activity) -> queue:to_list(Activity).
 
 maybe_send_activity([], _State) ->
     ok;
@@ -1393,34 +1436,25 @@ purge_confirms(Confirms) ->
 %% Msg transformation
 %% ---------------------------------------------------------------------------
 
-acks_from_queue(Q) ->
-    [PubNum || {PubNum, _Msg} <- queue:to_list(Q)].
+acks_from_queue(Q) -> [PubNum || {PubNum, _Msg} <- queue:to_list(Q)].
 
-pubs_from_queue(Q) ->
-    queue:to_list(Q).
+pubs_from_queue(Q) -> queue:to_list(Q).
 
-queue_from_pubs(Pubs) ->
-    queue:from_list(Pubs).
+queue_from_pubs(Pubs) -> queue:from_list(Pubs).
 
-apply_acks([], Pubs) ->
-    Pubs;
-apply_acks(List, Pubs) ->
-    {_, Pubs1} = queue:split(length(List), Pubs),
-    Pubs1.
+apply_acks(  [], Pubs) -> Pubs;
+apply_acks(List, Pubs) -> {_, Pubs1} = queue:split(length(List), Pubs),
+                          Pubs1.
 
 join_pubs(Q, [])   -> Q;
 join_pubs(Q, Pubs) -> queue:join(Q, queue_from_pubs(Pubs)).
 
-last_ack([], LA) ->
-    LA;
-last_ack(List, LA) ->
-    LA1 = lists:last(List),
-    true = LA1 > LA, %% ASSERTION
-    LA1.
-
-last_pub([], LP) ->
-    LP;
-last_pub(List, LP) ->
-    {PubNum, _Msg} = lists:last(List),
-    true = PubNum > LP, %% ASSERTION
-    PubNum.
+last_ack(  [], LA) -> LA;
+last_ack(List, LA) -> LA1 = lists:last(List),
+                      true = LA1 > LA, %% ASSERTION
+                      LA1.
+
+last_pub(  [], LP) -> LP;
+last_pub(List, LP) -> {PubNum, _Msg} = lists:last(List),
+                      true = PubNum > LP, %% ASSERTION
+                      PubNum.
diff --git a/src/gm_soak_test.erl b/src/gm_soak_test.erl
index 57217541..b379d218 100644
--- a/src/gm_soak_test.erl
+++ b/src/gm_soak_test.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(gm_soak_test).
@@ -105,7 +105,9 @@ spawn_member() ->
               random:seed(MegaSecs, Secs, MicroSecs),
               %% start up delay of no more than 10 seconds
               timer:sleep(random:uniform(10000)),
-              {ok, Pid} = gm:start_link(?MODULE, ?MODULE, []),
+              {ok, Pid} = gm:start_link(
+                            ?MODULE, ?MODULE, [],
+                            fun rabbit_misc:execute_mnesia_transaction/1),
               Start = random:uniform(10000),
               send_loop(Pid, Start, Start + random:uniform(10000)),
               gm:leave(Pid),
diff --git a/src/gm_speed_test.erl b/src/gm_speed_test.erl
index dad75bd4..768cc462 100644
--- a/src/gm_speed_test.erl
+++ b/src/gm_speed_test.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(gm_speed_test).
@@ -44,7 +44,8 @@ terminate(Owner, _Reason) ->
 %% other
 
 wile_e_coyote(Time, WriteUnit) ->
-    {ok, Pid} = gm:start_link(?MODULE, ?MODULE, self()),
+    {ok, Pid} = gm:start_link(?MODULE, ?MODULE, self(),
+                              fun rabbit_misc:execute_mnesia_transaction/1),
     receive joined -> ok end,
     timer:sleep(1000), %% wait for all to join
     timer:send_after(Time, stop),
diff --git a/src/gm_tests.erl b/src/gm_tests.erl
index 0a2d4204..233702ad 100644
--- a/src/gm_tests.erl
+++ b/src/gm_tests.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(gm_tests).
@@ -76,7 +76,9 @@ test_confirmed_broadcast() ->
 test_member_death() ->
     with_two_members(
       fun (Pid, Pid2) ->
-              {ok, Pid3} = gm:start_link(?MODULE, ?MODULE, self()),
+              {ok, Pid3} = gm:start_link(
+                             ?MODULE, ?MODULE, self(),
+                             fun rabbit_misc:execute_mnesia_transaction/1),
               passed = receive_joined(Pid3, [Pid, Pid2, Pid3],
                                       timeout_joining_gm_group_3),
               passed = receive_birth(Pid, Pid3, timeout_waiting_for_birth_3_1),
@@ -128,10 +130,12 @@ test_broadcast_fun(Fun) ->
 with_two_members(Fun) ->
     ok = gm:create_tables(),
 
-    {ok, Pid} = gm:start_link(?MODULE, ?MODULE, self()),
+    {ok, Pid} = gm:start_link(?MODULE, ?MODULE, self(),
+                              fun rabbit_misc:execute_mnesia_transaction/1),
     passed = receive_joined(Pid, [Pid], timeout_joining_gm_group_1),
 
-    {ok, Pid2} = gm:start_link(?MODULE, ?MODULE, self()),
+    {ok, Pid2} = gm:start_link(?MODULE, ?MODULE, self(),
+                               fun rabbit_misc:execute_mnesia_transaction/1),
     passed = receive_joined(Pid2, [Pid, Pid2], timeout_joining_gm_group_2),
     passed = receive_birth(Pid, Pid2, timeout_waiting_for_birth_2),
 
diff --git a/src/lqueue.erl b/src/lqueue.erl
index c4e046b5..4ff7cc0b 100644
--- a/src/lqueue.erl
+++ b/src/lqueue.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2011-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2011-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(lqueue).
diff --git a/src/mirrored_supervisor.erl b/src/mirrored_supervisor.erl
index 4fc488b8..d5f51db0 100644
--- a/src/mirrored_supervisor.erl
+++ b/src/mirrored_supervisor.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2011-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2011-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(mirrored_supervisor).
@@ -174,7 +174,7 @@
 -spec start_internal(Group, ChildSpecs) -> Result when
       Group :: group_name(),
       ChildSpecs :: [supervisor2:child_spec()],
-      Result :: supervisor2:startlink_ret().
+      Result :: {'ok', pid()} | {'error', term()}.
 
 -spec create_tables() -> Result when
       Result :: 'ok'.
@@ -212,9 +212,8 @@ start_link0(Prefix, Group, Init) ->
 init(Mod, Args) ->
     case Mod:init(Args) of
         {ok, {{Bad, _, _}, _ChildSpecs}} when
-              Bad =:= simple_one_for_one orelse
-              Bad =:= simple_one_for_one_terminate -> erlang:error(badarg);
-        Init                                       -> Init
+              Bad =:= simple_one_for_one -> erlang:error(badarg);
+        Init                             -> Init
     end.
 
 start_child(Sup, ChildSpec) -> call(Sup, {start_child,  ChildSpec}).
diff --git a/src/mirrored_supervisor_tests.erl b/src/mirrored_supervisor_tests.erl
index f8cbd853..780ef11d 100644
--- a/src/mirrored_supervisor_tests.erl
+++ b/src/mirrored_supervisor_tests.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2011-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2011-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(mirrored_supervisor_tests).
diff --git a/src/mnesia_sync.erl b/src/mnesia_sync.erl
index a3773d90..78c566e1 100644
--- a/src/mnesia_sync.erl
+++ b/src/mnesia_sync.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(mnesia_sync).
diff --git a/src/mochijson2.erl b/src/mochijson2.erl
new file mode 100644
index 00000000..bddb52cc
--- /dev/null
+++ b/src/mochijson2.erl
@@ -0,0 +1,893 @@
+%% This file is a copy of `mochijson2.erl' from mochiweb, revision
+%% d541e9a0f36c00dcadc2e589f20e47fbf46fc76f.  For the license, see
+%% `LICENSE-MIT-Mochi'.
+
+%% @author Bob Ippolito <bob@mochimedia.com>
+%% @copyright 2007 Mochi Media, Inc.
+
+%% @doc Yet another JSON (RFC 4627) library for Erlang. mochijson2 works
+%%      with binaries as strings, arrays as lists (without an {array, _})
+%%      wrapper and it only knows how to decode UTF-8 (and ASCII).
+%%
+%%      JSON terms are decoded as follows (javascript -> erlang):
+%%      <ul>
+%%          <li>{"key": "value"} ->
+%%              {struct, [{&lt;&lt;"key">>, &lt;&lt;"value">>}]}</li>
+%%          <li>["array", 123, 12.34, true, false, null] ->
+%%              [&lt;&lt;"array">>, 123, 12.34, true, false, null]
+%%          </li>
+%%      </ul>
+%%      <ul>
+%%          <li>Strings in JSON decode to UTF-8 binaries in Erlang</li>
+%%          <li>Objects decode to {struct, PropList}</li>
+%%          <li>Numbers decode to integer or float</li>
+%%          <li>true, false, null decode to their respective terms.</li>
+%%      </ul>
+%%      The encoder will accept the same format that the decoder will produce,
+%%      but will also allow additional cases for leniency:
+%%      <ul>
+%%          <li>atoms other than true, false, null will be considered UTF-8
+%%              strings (even as a proplist key)
+%%          </li>
+%%          <li>{json, IoList} will insert IoList directly into the output
+%%              with no validation
+%%          </li>
+%%          <li>{array, Array} will be encoded as Array
+%%              (legacy mochijson style)
+%%          </li>
+%%          <li>A non-empty raw proplist will be encoded as an object as long
+%%              as the first pair does not have an atom key of json, struct,
+%%              or array
+%%          </li>
+%%      </ul>
+
+-module(mochijson2).
+-author('bob@mochimedia.com').
+-export([encoder/1, encode/1]).
+-export([decoder/1, decode/1, decode/2]).
+
+%% The double-quote character, spelled as a macro to placate syntax
+%% highlighters.
+-define(Q, $\").
+%% Advance the #decoder{} position S by N characters on the current line
+%% (both offset and column move by N).
+-define(ADV_COL(S, N), S#decoder{offset=N+S#decoder.offset,
+                                 column=N+S#decoder.column}).
+%% Advance the #decoder{} position S by exactly one character.
+-define(INC_COL(S), S#decoder{offset=1+S#decoder.offset,
+                              column=1+S#decoder.column}).
+%% Step over a line break: offset moves by one, column resets to 1 and the
+%% line counter is incremented.
+-define(INC_LINE(S), S#decoder{offset=1+S#decoder.offset,
+                               column=1,
+                               line=1+S#decoder.line}).
+%% Step over the single character C, treating $\n as a line break and any
+%% other character as one column.
+-define(INC_CHAR(S, C),
+        case C of
+            $\n ->
+                S#decoder{column=1,
+                          line=1+S#decoder.line,
+                          offset=1+S#decoder.offset};
+            _ ->
+                S#decoder{column=1+S#decoder.column,
+                          offset=1+S#decoder.offset}
+        end).
+%% True when C is a space, tab, carriage return or newline.
+-define(IS_WHITESPACE(C),
+        (C =:= $\s orelse C =:= $\t orelse C =:= $\r orelse C =:= $\n)).
+
+%% @type json_string() = atom() | binary()
+%% @type json_number() = integer() | float()
+%% @type json_array() = [json_term()]
+%% @type json_object() = {struct, [{json_string(), json_term()}]}
+%% @type json_eep18_object() = {[{json_string(), json_term()}]}
+%% @type json_iolist() = {json, iolist()}
+%% @type json_term() = json_string() | json_number() | json_array() |
+%%                     json_object() | json_eep18_object() | json_iolist()
+
+%% Encoder settings.
+%%   handler - fun applied to a term json_encode/2 cannot encode; its result
+%%             is encoded instead. With `null' such a term causes an exit.
+%%   utf8    - when true, code points >= 16#7f are emitted as UTF-8 rather
+%%             than as \uXXXX escapes.
+-record(encoder, {handler=null,
+                  utf8=false}).
+
+%% Decoder settings and position.
+%%   object_hook - how decoded objects are returned (see make_object/2):
+%%                 null | struct | eep18 | proplist | a fun.
+%%   offset      - byte offset into the input binary.
+%%   line/column - position maintained by the ?ADV_COL/?INC_* macros.
+%%   state       - parser mode (null, any, key, comma and trim are used).
+-record(decoder, {object_hook=null,
+                  offset=0,
+                  line=1,
+                  column=1,
+                  state=null}).
+
+%% @spec encoder([encoder_option()]) -> function()
+%% @doc Build a one-argument encoding fun. The options are folded into an
+%%      #encoder{} record once, up front; the returned fun encodes whatever
+%%      term it is given using those settings.
+%% @type encoder_option() = handler_option() | utf8_option()
+%% @type utf8_option() = boolean(). Emit unicode as utf8 (default - false)
+encoder(Options) ->
+    Encoder = parse_encoder_options(Options, #encoder{}),
+    fun (Term) -> json_encode(Term, Encoder) end.
+
+%% @spec encode(json_term()) -> iolist()
+%% @doc Encode the given term as JSON using the default encoder settings
+%%      (no fallback handler, utf8 = false).
+encode(Term) -> json_encode(Term, #encoder{}).
+
+%% @spec decoder([decoder_option()]) -> function()
+%% @doc Build a one-argument decoding fun. The options are folded into a
+%%      #decoder{} record once; the returned fun decodes whatever input it
+%%      is given using those settings.
+decoder(Options) ->
+    Decoder = parse_decoder_options(Options, #decoder{}),
+    fun (Json) -> json_decode(Json, Decoder) end.
+
+%% @spec decode(iolist(), [{format, proplist | eep18 | struct}]) -> json_term()
+%% @doc Decode the given iolist to Erlang terms using the given object format
+%%      for decoding, where proplist returns JSON objects as
+%%      [{binary(), json_term()}] proplists, eep18 returns JSON objects as
+%%      {[{binary(), json_term()}]}, and struct returns them as
+%%      {struct, [{binary(), json_term()}]}. An {object_hook, Hook} option
+%%      is accepted as well (see parse_decoder_options/2).
+decode(S, Options) ->
+    Decoder = parse_decoder_options(Options, #decoder{}),
+    json_decode(S, Decoder).
+
+%% @spec decode(iolist()) -> json_term()
+%% @doc Decode the given iolist to Erlang terms with the default decoder
+%%      settings (objects come back as {struct, PropList}).
+decode(Json) -> json_decode(Json, #decoder{}).
+
+%% Internal API
+
+%% Fold a list of encoder options into the #encoder{} record State, left to
+%% right (a later option overrides an earlier one). Only {handler, Fun} and
+%% {utf8, Bool} are recognised; anything else fails with function_clause.
+parse_encoder_options(Options, State) ->
+    SetOption = fun ({handler, Handler}, Enc) -> Enc#encoder{handler=Handler};
+                    ({utf8, Switch}, Enc)     -> Enc#encoder{utf8=Switch}
+                end,
+    lists:foldl(SetOption, State, Options).
+
+%% Fold a list of decoder options into the #decoder{} record State, left to
+%% right. Both {object_hook, Hook} and {format, struct | eep18 | proplist}
+%% end up in the object_hook field; anything else fails with
+%% function_clause.
+parse_decoder_options(Options, State) ->
+    SetOption = fun ({object_hook, Hook}, Dec) ->
+                        Dec#decoder{object_hook=Hook};
+                    ({format, Format}, Dec)
+                      when Format =:= struct; Format =:= eep18;
+                           Format =:= proplist ->
+                        Dec#decoder{object_hook=Format}
+                end,
+    lists:foldl(SetOption, State, Options).
+
+%% Encode one term as JSON. Clause order matters: the literals true, false
+%% and null are tried before the generic atom clause, and a list whose first
+%% element is a pair (keyed by anything but struct, array or json) is
+%% treated as an object before the generic list-as-array clause. A term no
+%% clause understands is passed to the encoder's handler, or causes an exit
+%% when there is none.
+json_encode(true, _Enc)  -> <<"true">>;
+json_encode(false, _Enc) -> <<"false">>;
+json_encode(null, _Enc)  -> <<"null">>;
+json_encode(Int, _Enc) when is_integer(Int) -> integer_to_list(Int);
+json_encode(Float, _Enc) when is_float(Float) -> mochinum:digits(Float);
+json_encode(Str, Enc) when is_binary(Str) orelse is_atom(Str) ->
+    json_encode_string(Str, Enc);
+json_encode([{Key, _} | _] = Props, Enc)
+  when Key =/= struct, Key =/= array, Key =/= json ->
+    json_encode_proplist(Props, Enc);
+json_encode({struct, Props}, Enc) when is_list(Props) ->
+    json_encode_proplist(Props, Enc);
+json_encode({Props}, Enc) when is_list(Props) ->
+    json_encode_proplist(Props, Enc);
+json_encode({}, Enc) ->
+    json_encode_proplist([], Enc);
+json_encode(Array, Enc) when is_list(Array) ->
+    json_encode_array(Array, Enc);
+json_encode({array, Array}, Enc) when is_list(Array) ->
+    json_encode_array(Array, Enc);
+json_encode({json, IoList}, _Enc) ->
+    %% Pre-rendered JSON is passed through untouched.
+    IoList;
+json_encode(Bad, #encoder{handler=null}) ->
+    exit({json_encode, {bad_term, Bad}});
+json_encode(Bad, Enc=#encoder{handler=Handler}) ->
+    json_encode(Handler(Bad), Enc).
+
+%% Encode a list as a JSON array: "[", the encoded elements separated by
+%% commas, "]". Elements are encoded front to back.
+json_encode_array([], _State) ->
+    <<"[]">>;
+json_encode_array([First | Rest], State) ->
+    Head = [$[, json_encode(First, State)],
+    Tail = lists:flatmap(fun (O) -> [$,, json_encode(O, State)] end, Rest),
+    Head ++ Tail ++ "]".
+
+%% Encode a list of {Key, Value} pairs as a JSON object: "{", then
+%% key ":" value groups separated by commas, "}". Pairs are encoded front
+%% to back, each key before its value.
+json_encode_proplist([], _State) ->
+    <<"{}">>;
+json_encode_proplist([First | Rest], State) ->
+    Pair = fun ({K, V}) ->
+                   KS = json_encode_string(K, State),
+                   VS = json_encode(V, State),
+                   [KS, $:, VS]
+           end,
+    Head = [${ | Pair(First)],
+    Tail = lists:flatmap(fun (P) -> [$, | Pair(P)] end, Rest),
+    Head ++ Tail ++ "}".
+
+%% Encode an atom, binary, integer or character list as a quoted JSON
+%% string. Text that needs no escaping is simply wrapped in quotes; anything
+%% else goes through json_encode_string_unicode/3 as a list of code points
+%% (atoms and binaries are first decoded with xmerl_ucs:from_utf8/1).
+json_encode_string(Atom, Enc) when is_atom(Atom) ->
+    Chars = atom_to_list(Atom),
+    case json_string_is_safe(Chars) of
+        true  -> [?Q, Chars, ?Q];
+        false -> json_encode_string_unicode(
+                   xmerl_ucs:from_utf8(Chars), Enc, [?Q])
+    end;
+json_encode_string(Bin, Enc) when is_binary(Bin) ->
+    case json_bin_is_safe(Bin) of
+        true  -> [?Q, Bin, ?Q];
+        false -> json_encode_string_unicode(
+                   xmerl_ucs:from_utf8(Bin), Enc, [?Q])
+    end;
+json_encode_string(Int, _Enc) when is_integer(Int) ->
+    [?Q, integer_to_list(Int), ?Q];
+json_encode_string(Chars, Enc) when is_list(Chars) ->
+    case json_string_is_safe(Chars) of
+        true  -> [?Q, Chars, ?Q];
+        false -> json_encode_string_unicode(Chars, Enc, [?Q])
+    end.
+
+%% True when every element of the list can be emitted verbatim between
+%% quotes: below 16#7f, not a control character (0..31, which covers \b, \f,
+%% \n, \r and \t) and neither a double quote nor a backslash. Scanning stops
+%% at the first element that fails the test.
+json_string_is_safe([]) ->
+    true;
+json_string_is_safe([C | _]) when C =:= ?Q; C =:= $\\ ->
+    false;
+json_string_is_safe([C | _]) when C >= 0, C < $\s ->
+    false;
+json_string_is_safe([C | Rest]) when C < 16#7f ->
+    json_string_is_safe(Rest);
+json_string_is_safe([_ | _]) ->
+    %% 16#7f and above, or not a number at all.
+    false.
+
+%% Binary counterpart of json_string_is_safe/1: true when every byte lies in
+%% 32..126 and is neither a double quote nor a backslash, so the binary can
+%% be emitted verbatim between quotes.
+json_bin_is_safe(<<>>) ->
+    true;
+json_bin_is_safe(<<C, Rest/binary>>)
+  when C >= $\s, C < 16#7f, C =/= ?Q, C =/= $\\ ->
+    json_bin_is_safe(Rest);
+json_bin_is_safe(<<_, _/binary>>) ->
+    false.
+
+%% Slow path of json_encode_string/2: walk a list of code points and escape
+%% each one as required. Acc holds the output in reverse order; callers seed
+%% it with the opening quote, and the closing quote is added (and Acc
+%% reversed) once the input is exhausted.
+json_encode_string_unicode([], _State, Acc) ->
+    lists:reverse([$\" | Acc]);
+json_encode_string_unicode([C | Cs], State, Acc) ->
+    %% Acc is reversed, so two-character escapes are pushed as
+    %% [Char, $\\ | Acc] to come out as "\Char".
+    Acc1 = case C of
+               ?Q ->
+                   [?Q, $\\ | Acc];
+               %% Escaping solidus is only useful when trying to protect
+               %% against "</script>" injection attacks which are only
+               %% possible when JSON is inserted into a HTML document
+               %% in-line. mochijson2 does not protect you from this, so
+               %% if you do insert directly into HTML then you need to
+               %% uncomment the following case or escape the output of encode.
+               %%
+               %% $/ ->
+               %%    [$/, $\\ | Acc];
+               %%
+               $\\ ->
+                   [$\\, $\\ | Acc];
+               $\b ->
+                   [$b, $\\ | Acc];
+               $\f ->
+                   [$f, $\\ | Acc];
+               $\n ->
+                   [$n, $\\ | Acc];
+               $\r ->
+                   [$r, $\\ | Acc];
+               $\t ->
+                   [$t, $\\ | Acc];
+               %% Remaining control characters become \uXXXX escapes.
+               C when C >= 0, C < $\s ->
+                   [unihex(C) | Acc];
+               %% 16#7f and above: raw UTF-8 or \uXXXX, per the utf8 option.
+               C when C >= 16#7f, C =< 16#10FFFF, State#encoder.utf8 ->
+                   [xmerl_ucs:to_utf8(C) | Acc];
+               C when  C >= 16#7f, C =< 16#10FFFF, not State#encoder.utf8 ->
+                   [unihex(C) | Acc];
+               C when C < 16#7f ->
+                   [C | Acc];
+               _ ->
+                   exit({json_encode, {bad_char, C}})
+           end,
+    json_encode_string_unicode(Cs, State, Acc1).
+
+%% Map a 4-bit value to its lower-case hexadecimal digit character:
+%% 0..9 -> $0..$9, 10..15 -> $a..$f.
+hexdigit(C) when C >= 0, C =< 9 -> C + $0;
+hexdigit(C) when C =< 15        -> C + $a - 10.
+
+%% Render a code point as a JSON \uXXXX escape. Values below 16#10000 give a
+%% single escape of four hex digits; values up to 16#10FFFF are split into a
+%% surrogate pair and rendered as two escapes.
+unihex(C) when C < 16#10000 ->
+    Nibbles = [(C bsr Shift) band 16#f || Shift <- [12, 8, 4, 0]],
+    [$\\, $u | [hexdigit(Nibble) || Nibble <- Nibbles]];
+unihex(C) when C =< 16#10FFFF ->
+    N = C - 16#10000,
+    High = 16#d800 bor ((N bsr 10) band 16#3ff),
+    Low = 16#dc00 bor (N band 16#3ff),
+    [unihex(High), unihex(Low)].
+
+%% Decode a complete JSON document. List input is flattened to a binary
+%% first. After the value has been decoded, the tokenizer is run once more
+%% in `trim' state and must report eof; any other result fails the match.
+json_decode(IoList, Decoder) when is_list(IoList) ->
+    json_decode(iolist_to_binary(IoList), Decoder);
+json_decode(Bin, Decoder) ->
+    {Value, AfterValue} = decode1(Bin, Decoder),
+    {eof, _} = tokenize(Bin, AfterValue#decoder{state=trim}),
+    Value.
+
+%% Decode a single JSON value at the decoder's current position. The decoder
+%% must be in state `null'. Returns {Value, Decoder1}; constants come
+%% straight from the tokenizer, arrays and objects are handed to
+%% decode_array/2 and decode_object/2.
+decode1(Bin, Decoder=#decoder{state=null}) ->
+    case tokenize(Bin, Decoder#decoder{state=any}) of
+        {{const, Const}, Next} -> {Const, Next};
+        {start_array, Next}    -> decode_array(Bin, Next);
+        {start_object, Next}   -> decode_object(Bin, Next)
+    end.
+
+%% Shape a decoded {struct, PropList} according to the decoder's
+%% object_hook: null and struct leave it as it is, eep18 gives {PropList},
+%% proplist gives the bare PropList, and anything else is called as a fun
+%% with the value.
+make_object(V, #decoder{object_hook=null}) ->
+    V;
+make_object(V, #decoder{object_hook=struct}) ->
+    V;
+make_object({struct, Props}, #decoder{object_hook=eep18}) ->
+    {Props};
+make_object({struct, Props}, #decoder{object_hook=proplist}) ->
+    Props;
+make_object(V, #decoder{object_hook=Hook}) ->
+    Hook(V).
+
+%% Decode the members of a JSON object; the opening brace has already been
+%% consumed. The parser alternates between two states: `key' (expect a key
+%% or the closing brace) and `comma' (expect a comma or the closing brace).
+%% Pairs accumulate in reverse and are put in order when the object closes.
+%% Returns {Object, Decoder1} with the decoder back in state `null'.
+decode_object(Bin, Decoder) ->
+    decode_object(Bin, Decoder#decoder{state=key}, []).
+
+decode_object(Bin, Decoder=#decoder{state=key}, Pairs) ->
+    case tokenize(Bin, Decoder) of
+        {end_object, D1} ->
+            Object = make_object({struct, lists:reverse(Pairs)}, D1),
+            {Object, D1#decoder{state=null}};
+        {{const, Key}, D1} ->
+            {colon, D2} = tokenize(Bin, D1),
+            {Value, D3} = decode1(Bin, D2#decoder{state=null}),
+            decode_object(Bin, D3#decoder{state=comma},
+                          [{Key, Value} | Pairs])
+    end;
+decode_object(Bin, Decoder=#decoder{state=comma}, Pairs) ->
+    case tokenize(Bin, Decoder) of
+        {end_object, D1} ->
+            Object = make_object({struct, lists:reverse(Pairs)}, D1),
+            {Object, D1#decoder{state=null}};
+        {comma, D1} ->
+            decode_object(Bin, D1#decoder{state=key}, Pairs)
+    end.
+
+%% Decode the elements of a JSON array; the opening bracket has already
+%% been consumed. The parser alternates between two states: `any' (expect a
+%% value or the closing bracket) and `comma' (expect a comma or the closing
+%% bracket). Elements accumulate in reverse and are put in order when the
+%% array closes. Returns {List, Decoder1} with the decoder back in `null'.
+decode_array(Bin, Decoder) ->
+    decode_array(Bin, Decoder#decoder{state=any}, []).
+
+decode_array(Bin, Decoder=#decoder{state=any}, Items) ->
+    case tokenize(Bin, Decoder) of
+        {end_array, D1} ->
+            {lists:reverse(Items), D1#decoder{state=null}};
+        {start_array, D1} ->
+            {Nested, D2} = decode_array(Bin, D1),
+            decode_array(Bin, D2#decoder{state=comma}, [Nested | Items]);
+        {start_object, D1} ->
+            {Object, D2} = decode_object(Bin, D1),
+            decode_array(Bin, D2#decoder{state=comma}, [Object | Items]);
+        {{const, Const}, D1} ->
+            decode_array(Bin, D1#decoder{state=comma}, [Const | Items])
+    end;
+decode_array(Bin, Decoder=#decoder{state=comma}, Items) ->
+    case tokenize(Bin, Decoder) of
+        {end_array, D1} ->
+            {lists:reverse(Items), D1#decoder{state=null}};
+        {comma, D1} ->
+            decode_array(Bin, D1#decoder{state=any}, Items)
+    end.
+
+%% Tokenize a string body starting at the decoder's offset (the opening
+%% quote is expected to have been consumed already). tokenize_string_fast/2
+%% scans the escape-free prefix: if it reaches the closing quote, the string
+%% is returned as a slice of the input and the position moves past the
+%% quote; if it meets a backslash first, the prefix is handed, reversed, to
+%% tokenize_string/3, which deals with the escapes.
+tokenize_string(Bin, Decoder=#decoder{offset=Start}) ->
+    case tokenize_string_fast(Bin, Start) of
+        {escape, EscapeAt} ->
+            PrefixLen = EscapeAt - Start,
+            <<_:Start/binary, Prefix:PrefixLen/binary, _/binary>> = Bin,
+            Acc = lists:reverse(binary_to_list(Prefix)),
+            tokenize_string(Bin, ?ADV_COL(Decoder, PrefixLen), Acc);
+        QuoteAt ->
+            Len = QuoteAt - Start,
+            <<_:Start/binary, String:Len/binary, ?Q, _/binary>> = Bin,
+            {{const, String}, ?ADV_COL(Decoder, Len + 1)}
+    end.
+
+%% Scan B from byte offset O for the end of the escape-free part of a
+%% string. Returns the offset of the closing double quote, or {escape, O1}
+%% with the offset of the first backslash if one comes first. Bytes are
+%% consumed one UTF-8 sequence at a time, checking lead and continuation
+%% byte ranges; anything else - including input that ends before a closing
+%% quote - throws invalid_utf8.
+tokenize_string_fast(B, O) ->
+    case B of
+        <<_:O/binary, ?Q, _/binary>> ->
+            O;
+        <<_:O/binary, $\\, _/binary>> ->
+            {escape, O};
+        %% One-byte sequence (ASCII).
+        <<_:O/binary, C1, _/binary>> when C1 < 128 ->
+            tokenize_string_fast(B, 1 + O);
+        %% Two-byte sequence: lead byte 194..223.
+        <<_:O/binary, C1, C2, _/binary>> when C1 >= 194, C1 =< 223,
+                C2 >= 128, C2 =< 191 ->
+            tokenize_string_fast(B, 2 + O);
+        %% Three-byte sequence: lead byte 224..239.
+        <<_:O/binary, C1, C2, C3, _/binary>> when C1 >= 224, C1 =< 239,
+                C2 >= 128, C2 =< 191,
+                C3 >= 128, C3 =< 191 ->
+            tokenize_string_fast(B, 3 + O);
+        %% Four-byte sequence: lead byte 240..244.
+        <<_:O/binary, C1, C2, C3, C4, _/binary>> when C1 >= 240, C1 =< 244,
+                C2 >= 128, C2 =< 191,
+                C3 >= 128, C3 =< 191,
+                C4 >= 128, C4 =< 191 ->
+            tokenize_string_fast(B, 4 + O);
+        _ ->
+            throw(invalid_utf8)
+    end.
+
+tokenize_string(B, S=#decoder{offset=O}, Acc) ->
+    case B of
+        <<_:O/binary, ?Q, _/binary>> ->
+            {{const, iolist_to_binary(lists:reverse(Acc))}, ?INC_COL(S)};
+        <<_:O/binary, "\\\"", _/binary>> ->
+            tokenize_string(B, ?ADV_COL(S, 2), [$\" | Acc]);
+        <<_:O/binary, "\\\\", _/binary>> ->
+            tokenize_string(B, ?ADV_COL(S, 2), [$\\ | Acc]);
+        <<_:O/binary, "\\/", _/binary>> ->
+            tokenize_string(B, ?ADV_COL(S, 2), [$/ | Acc]);
+        <<_:O/binary, "\\b", _/binary>> ->
+            tokenize_string(B, ?ADV_COL(S, 2), [$\b | Acc]);
+        <<_:O/binary, "\\f", _/binary>> ->
+            tokenize_string(B, ?ADV_COL(S, 2), [$\f | Acc]);
+        <<_:O/binary, "\\n", _/binary>> ->
+            tokenize_string(B, ?ADV_COL(S, 2), [$\n | Acc]);
+        <<_:O/binary, "\\r", _/binary>> ->
+            tokenize_string(B, ?ADV_COL(S, 2), [$\r | Acc]);
+        <<_:O/binary, "\\t", _/binary>> ->
+            tokenize_string(B, ?ADV_COL(S, 2), [$\t | Acc]);
+        <<_:O/binary, "\\u", C3, C2, C1, C0, Rest/binary>> ->
+            C = erlang:list_to_integer([C3, C2, C1, C0], 16),
+            if C > 16#D7FF, C < 16#DC00 ->
+                %% coalesce UTF-16 surrogate pair
+                <<"\\u", D3, D2, D1, D0, _/binary>> = Rest,
+                D = erlang:list_to_integer([D3,D2,D1,D0], 16),
+                [CodePoint] = xmerl_ucs:from_utf16be(<<C:16/big-unsigned-integer,
+                    D:16/big-unsigned-integer>>),
+                Acc1 = lists:reverse(xmerl_ucs:to_utf8(CodePoint), Acc),
+                tokenize_string(B, ?ADV_COL(S, 12), Acc1);
+            true ->
+                Acc1 = lists:reverse(xmerl_ucs:to_utf8(C), Acc),
+                tokenize_string(B, ?ADV_COL(S, 6), Acc1)
+            end;
+        <<_:O/binary, C1, _/binary>> when C1 < 128 ->
+            tokenize_string(B, ?INC_CHAR(S, C1), [C1 | Acc]);
+        <<_:O/binary, C1, C2, _/binary>> when C1 >= 194, C1 =< 223,
+                C2 >= 128, C2 =< 191 ->
+            tokenize_string(B, ?ADV_COL(S, 2), [C2, C1 | Acc]);
+        <<_:O/binary, C1, C2, C3, _/binary>> when C1 >= 224, C1 =< 239,
+                C2 >= 128, C2 =< 191,
+                C3 >= 128, C3 =< 191 ->
+            tokenize_string(B, ?ADV_COL(S, 3), [C3, C2, C1 | Acc]);
+        <<_:O/binary, C1, C2, C3, C4, _/binary>> when C1 >= 240, C1 =< 244,
+                C2 >= 128, C2 =< 191,
+                C3 >= 128, C3 =< 191,
+                C4 >= 128, C4 =< 191 ->
+            tokenize_string(B, ?ADV_COL(S, 4), [C4, C3, C2, C1 | Acc]);
+        _ ->
+            throw(invalid_utf8)
+    end.
+
+tokenize_number(B, S) ->
+    case tokenize_number(B, sign, S, []) of
+        {{int, Int}, S1} ->
+            {{const, list_to_integer(Int)}, S1};
+        {{float, Float}, S1} ->
+            {{const, list_to_float(Float)}, S1}
+    end.
+
+tokenize_number(B, sign, S=#decoder{offset=O}, []) ->
+    case B of
+        <<_:O/binary, $-, _/binary>> ->
+            tokenize_number(B, int, ?INC_COL(S), [$-]);
+        _ ->
+            tokenize_number(B, int, S, [])
+    end;
+tokenize_number(B, int, S=#decoder{offset=O}, Acc) ->
+    case B of
+        <<_:O/binary, $0, _/binary>> ->
+            tokenize_number(B, frac, ?INC_COL(S), [$0 | Acc]);
+        <<_:O/binary, C, _/binary>> when C >= $1 andalso C =< $9 ->
+            tokenize_number(B, int1, ?INC_COL(S), [C | Acc])
+    end;
+tokenize_number(B, int1, S=#decoder{offset=O}, Acc) ->
+    case B of
+        <<_:O/binary, C, _/binary>> when C >= $0 andalso C =< $9 ->
+            tokenize_number(B, int1, ?INC_COL(S), [C | Acc]);
+        _ ->
+            tokenize_number(B, frac, S, Acc)
+    end;
+tokenize_number(B, frac, S=#decoder{offset=O}, Acc) ->
+    case B of
+        <<_:O/binary, $., C, _/binary>> when C >= $0, C =< $9 ->
+            tokenize_number(B, frac1, ?ADV_COL(S, 2), [C, $. | Acc]);
+        <<_:O/binary, E, _/binary>> when E =:= $e orelse E =:= $E ->
+            tokenize_number(B, esign, ?INC_COL(S), [$e, $0, $. | Acc]);
+        _ ->
+            {{int, lists:reverse(Acc)}, S}
+    end;
+tokenize_number(B, frac1, S=#decoder{offset=O}, Acc) ->
+    case B of
+        <<_:O/binary, C, _/binary>> when C >= $0 andalso C =< $9 ->
+            tokenize_number(B, frac1, ?INC_COL(S), [C | Acc]);
+        <<_:O/binary, E, _/binary>> when E =:= $e orelse E =:= $E ->
+            tokenize_number(B, esign, ?INC_COL(S), [$e | Acc]);
+        _ ->
+            {{float, lists:reverse(Acc)}, S}
+    end;
+tokenize_number(B, esign, S=#decoder{offset=O}, Acc) ->
+    case B of
+        <<_:O/binary, C, _/binary>> when C =:= $- orelse C=:= $+ ->
+            tokenize_number(B, eint, ?INC_COL(S), [C | Acc]);
+        _ ->
+            tokenize_number(B, eint, S, Acc)
+    end;
+tokenize_number(B, eint, S=#decoder{offset=O}, Acc) ->
+    case B of
+        <<_:O/binary, C, _/binary>> when C >= $0 andalso C =< $9 ->
+            tokenize_number(B, eint1, ?INC_COL(S), [C | Acc])
+    end;
+tokenize_number(B, eint1, S=#decoder{offset=O}, Acc) ->
+    case B of
+        <<_:O/binary, C, _/binary>> when C >= $0 andalso C =< $9 ->
+            tokenize_number(B, eint1, ?INC_COL(S), [C | Acc]);
+        _ ->
+            {{float, lists:reverse(Acc)}, S}
+    end.
+
+tokenize(B, S=#decoder{offset=O}) ->
+    case B of
+        <<_:O/binary, C, _/binary>> when ?IS_WHITESPACE(C) ->
+            tokenize(B, ?INC_CHAR(S, C));
+        <<_:O/binary, "{", _/binary>> ->
+            {start_object, ?INC_COL(S)};
+        <<_:O/binary, "}", _/binary>> ->
+            {end_object, ?INC_COL(S)};
+        <<_:O/binary, "[", _/binary>> ->
+            {start_array, ?INC_COL(S)};
+        <<_:O/binary, "]", _/binary>> ->
+            {end_array, ?INC_COL(S)};
+        <<_:O/binary, ",", _/binary>> ->
+            {comma, ?INC_COL(S)};
+        <<_:O/binary, ":", _/binary>> ->
+            {colon, ?INC_COL(S)};
+        <<_:O/binary, "null", _/binary>> ->
+            {{const, null}, ?ADV_COL(S, 4)};
+        <<_:O/binary, "true", _/binary>> ->
+            {{const, true}, ?ADV_COL(S, 4)};
+        <<_:O/binary, "false", _/binary>> ->
+            {{const, false}, ?ADV_COL(S, 5)};
+        <<_:O/binary, "\"", _/binary>> ->
+            tokenize_string(B, ?INC_COL(S));
+        <<_:O/binary, C, _/binary>> when (C >= $0 andalso C =< $9)
+                                         orelse C =:= $- ->
+            tokenize_number(B, S);
+        <<_:O/binary>> ->
+            trim = S#decoder.state,
+            {eof, S}
+    end.
+%%
+%% Tests
+%%
+-ifdef(TEST).
+-include_lib("eunit/include/eunit.hrl").
+
+
+%% testing constructs borrowed from the Yaws JSON implementation.
+
+%% Create an object from a list of Key/Value pairs.
+
+obj_new() ->
+    {struct, []}.
+
+is_obj({struct, Props}) -> %% true iff every property key is a binary
+    F = fun ({K, _}) -> is_binary(K) end, %% false, not a crash, on other keys
+    lists:all(F, Props).
+
+obj_from_list(Props) ->
+    Obj = {struct, Props},
+    ?assert(is_obj(Obj)),
+    Obj.
+
+%% Test for equivalence of Erlang terms.
+%% Due to arbitrary order of construction, equivalent objects might
+%% compare unequal as Erlang terms, so we need to carefully recurse
+%% through aggregates (lists and objects).
+
+equiv({struct, Props1}, {struct, Props2}) ->
+    equiv_object(Props1, Props2);
+equiv(L1, L2) when is_list(L1), is_list(L2) ->
+    equiv_list(L1, L2);
+equiv(N1, N2) when is_number(N1), is_number(N2) -> N1 == N2;
+equiv(B1, B2) when is_binary(B1), is_binary(B2) -> B1 == B2;
+equiv(A, A) when A =:= true orelse A =:= false orelse A =:= null -> true.
+
+%% Object representation and traversal order is unknown.
+%% Use the sledgehammer and sort property lists.
+
+equiv_object(Props1, Props2) -> %% compare two property lists irrespective of order
+    L1 = lists:keysort(1, Props1),
+    L2 = lists:keysort(1, Props2),
+    Pairs = lists:zip(L1, L2), %% crashes if the objects differ in size
+    true = lists:all(fun({{K1, V1}, {K2, V2}}) ->
+                             equiv(K1, K2) andalso equiv(V1, V2) %% skip values on key mismatch
+                     end, Pairs).
+
+%% Recursively compare list elements for equivalence.
+
+equiv_list([], []) ->
+    true;
+equiv_list([V1 | L1], [V2 | L2]) ->
+    equiv(V1, V2) andalso equiv_list(L1, L2).
+
+decode_test() ->
+    [1199344435545.0, 1] = decode(<<"[1199344435545.0,1]">>),
+    <<16#F0,16#9D,16#9C,16#95>> = decode([34,"\\ud835","\\udf15",34]).
+
+e2j_vec_test() ->
+    test_one(e2j_test_vec(utf8), 1).
+
+test_one([], _N) ->
+    %% io:format("~p tests passed~n", [N-1]),
+    ok;
+test_one([{E, J} | Rest], N) ->
+    %% io:format("[~p] ~p ~p~n", [N, E, J]),
+    true = equiv(E, decode(J)),
+    true = equiv(E, decode(encode(E))),
+    test_one(Rest, 1+N).
+
+e2j_test_vec(utf8) ->
+    [
+     {1, "1"},
+     {3.1416, "3.14160"}, %% text representation may truncate, trail zeroes
+     {-1, "-1"},
+     {-3.1416, "-3.14160"},
+     {12.0e10, "1.20000e+11"},
+     {1.234E+10, "1.23400e+10"},
+     {-1.234E-10, "-1.23400e-10"},
+     {10.0, "1.0e+01"},
+     {123.456, "1.23456E+2"},
+     {10.0, "1e1"},
+     {<<"foo">>, "\"foo\""},
+     {<<"foo", 5, "bar">>, "\"foo\\u0005bar\""},
+     {<<"">>, "\"\""},
+     {<<"\n\n\n">>, "\"\\n\\n\\n\""},
+     {<<"\" \b\f\r\n\t\"">>, "\"\\\" \\b\\f\\r\\n\\t\\\"\""},
+     {obj_new(), "{}"},
+     {obj_from_list([{<<"foo">>, <<"bar">>}]), "{\"foo\":\"bar\"}"},
+     {obj_from_list([{<<"foo">>, <<"bar">>}, {<<"baz">>, 123}]),
+      "{\"foo\":\"bar\",\"baz\":123}"},
+     {[], "[]"},
+     {[[]], "[[]]"},
+     {[1, <<"foo">>], "[1,\"foo\"]"},
+
+     %% json array in a json object
+     {obj_from_list([{<<"foo">>, [123]}]),
+      "{\"foo\":[123]}"},
+
+     %% json object in a json object
+     {obj_from_list([{<<"foo">>, obj_from_list([{<<"bar">>, true}])}]),
+      "{\"foo\":{\"bar\":true}}"},
+
+     %% fold evaluation order
+     {obj_from_list([{<<"foo">>, []},
+                     {<<"bar">>, obj_from_list([{<<"baz">>, true}])},
+                     {<<"alice">>, <<"bob">>}]),
+      "{\"foo\":[],\"bar\":{\"baz\":true},\"alice\":\"bob\"}"},
+
+     %% json object in a json array
+     {[-123, <<"foo">>, obj_from_list([{<<"bar">>, []}]), null],
+      "[-123,\"foo\",{\"bar\":[]},null]"}
+    ].
+
+%% test utf8 encoding
+encoder_utf8_test() ->
+    %% safe conversion case (default)
+    [34,"\\u0001","\\u0442","\\u0435","\\u0441","\\u0442",34] =
+        encode(<<1,"\321\202\320\265\321\201\321\202">>),
+
+    %% raw utf8 output (optional)
+    Enc = mochijson2:encoder([{utf8, true}]),
+    [34,"\\u0001",[209,130],[208,181],[209,129],[209,130],34] =
+        Enc(<<1,"\321\202\320\265\321\201\321\202">>).
+
+input_validation_test() ->
+    Good = [
+        {16#00A3, <<?Q, 16#C2, 16#A3, ?Q>>}, %% pound
+        {16#20AC, <<?Q, 16#E2, 16#82, 16#AC, ?Q>>}, %% euro
+        {16#10196, <<?Q, 16#F0, 16#90, 16#86, 16#96, ?Q>>} %% denarius
+    ],
+    lists:foreach(fun({CodePoint, UTF8}) ->
+        Expect = list_to_binary(xmerl_ucs:to_utf8(CodePoint)),
+        Expect = decode(UTF8)
+    end, Good),
+
+    Bad = [
+        %% 2nd, 3rd, or 4th byte of a multi-byte sequence w/o leading byte
+        <<?Q, 16#80, ?Q>>,
+        %% missing continuations, last byte in each should be 80-BF
+        <<?Q, 16#C2, 16#7F, ?Q>>,
+        <<?Q, 16#E0, 16#80,16#7F, ?Q>>,
+        <<?Q, 16#F0, 16#80, 16#80, 16#7F, ?Q>>,
+        %% we don't support code points > 10FFFF per RFC 3629
+        <<?Q, 16#F5, 16#80, 16#80, 16#80, ?Q>>,
+        %% escape characters trigger a different code path
+        <<?Q, $\\, $\n, 16#80, ?Q>>
+    ],
+    lists:foreach(
+      fun(X) ->
+              ok = try decode(X) catch invalid_utf8 -> ok end,
+              %% could be {ucs,{bad_utf8_character_code}} or
+              %%          {json_encode,{bad_char,_}}
+              {'EXIT', _} = (catch encode(X))
+      end, Bad).
+
+inline_json_test() ->
+    ?assertEqual(<<"\"iodata iodata\"">>,
+                 iolist_to_binary(
+                   encode({json, [<<"\"iodata">>, " iodata\""]}))),
+    ?assertEqual({struct, [{<<"key">>, <<"iodata iodata">>}]},
+                 decode(
+                   encode({struct,
+                           [{key, {json, [<<"\"iodata">>, " iodata\""]}}]}))),
+    ok.
+
+big_unicode_test() ->
+    UTF8Seq = list_to_binary(xmerl_ucs:to_utf8(16#0001d120)),
+    ?assertEqual(
+       <<"\"\\ud834\\udd20\"">>,
+       iolist_to_binary(encode(UTF8Seq))),
+    ?assertEqual(
+       UTF8Seq,
+       decode(iolist_to_binary(encode(UTF8Seq)))),
+    ok.
+
+custom_decoder_test() ->
+    ?assertEqual(
+       {struct, [{<<"key">>, <<"value">>}]},
+       (decoder([]))("{\"key\": \"value\"}")),
+    F = fun ({struct, [{<<"key">>, <<"value">>}]}) -> win end,
+    ?assertEqual(
+       win,
+       (decoder([{object_hook, F}]))("{\"key\": \"value\"}")),
+    ok.
+
+atom_test() ->
+    %% JSON native atoms
+    [begin
+         ?assertEqual(A, decode(atom_to_list(A))),
+         ?assertEqual(iolist_to_binary(atom_to_list(A)),
+                      iolist_to_binary(encode(A)))
+     end || A <- [true, false, null]],
+    %% Atom to string
+    ?assertEqual(
+       <<"\"foo\"">>,
+       iolist_to_binary(encode(foo))),
+    ?assertEqual(
+       <<"\"\\ud834\\udd20\"">>,
+       iolist_to_binary(encode(list_to_atom(xmerl_ucs:to_utf8(16#0001d120))))),
+    ok.
+
+key_encode_test() ->
+    %% Some forms are accepted as keys that would not be strings in other
+    %% cases
+    ?assertEqual(
+       <<"{\"foo\":1}">>,
+       iolist_to_binary(encode({struct, [{foo, 1}]}))),
+    ?assertEqual(
+       <<"{\"foo\":1}">>,
+       iolist_to_binary(encode({struct, [{<<"foo">>, 1}]}))),
+    ?assertEqual(
+       <<"{\"foo\":1}">>,
+       iolist_to_binary(encode({struct, [{"foo", 1}]}))),
+	?assertEqual(
+       <<"{\"foo\":1}">>,
+       iolist_to_binary(encode([{foo, 1}]))),
+    ?assertEqual(
+       <<"{\"foo\":1}">>,
+       iolist_to_binary(encode([{<<"foo">>, 1}]))),
+    ?assertEqual(
+       <<"{\"foo\":1}">>,
+       iolist_to_binary(encode([{"foo", 1}]))),
+    ?assertEqual(
+       <<"{\"\\ud834\\udd20\":1}">>,
+       iolist_to_binary(
+         encode({struct, [{[16#0001d120], 1}]}))),
+    ?assertEqual(
+       <<"{\"1\":1}">>,
+       iolist_to_binary(encode({struct, [{1, 1}]}))),
+    ok.
+
+unsafe_chars_test() ->
+    Chars = "\"\\\b\f\n\r\t",
+    [begin
+         ?assertEqual(false, json_string_is_safe([C])),
+         ?assertEqual(false, json_bin_is_safe(<<C>>)),
+         ?assertEqual(<<C>>, decode(encode(<<C>>)))
+     end || C <- Chars],
+    ?assertEqual(
+       false,
+       json_string_is_safe([16#0001d120])),
+    ?assertEqual(
+       false,
+       json_bin_is_safe(list_to_binary(xmerl_ucs:to_utf8(16#0001d120)))),
+    ?assertEqual(
+       [16#0001d120],
+       xmerl_ucs:from_utf8(
+         binary_to_list(
+           decode(encode(list_to_atom(xmerl_ucs:to_utf8(16#0001d120))))))),
+    ?assertEqual(
+       false,
+       json_string_is_safe([16#110000])),
+    ?assertEqual(
+       false,
+       json_bin_is_safe(list_to_binary(xmerl_ucs:to_utf8([16#110000])))),
+    %% solidus can be escaped but isn't unsafe by default
+    ?assertEqual(
+       <<"/">>,
+       decode(<<"\"\\/\"">>)),
+    ok.
+
+int_test() ->
+    ?assertEqual(0, decode("0")),
+    ?assertEqual(1, decode("1")),
+    ?assertEqual(11, decode("11")),
+    ok.
+
+large_int_test() ->
+    ?assertEqual(<<"-2147483649214748364921474836492147483649">>,
+        iolist_to_binary(encode(-2147483649214748364921474836492147483649))),
+    ?assertEqual(<<"2147483649214748364921474836492147483649">>,
+        iolist_to_binary(encode(2147483649214748364921474836492147483649))),
+    ok.
+
+float_test() ->
+    ?assertEqual(<<"-2147483649.0">>, iolist_to_binary(encode(-2147483649.0))),
+    ?assertEqual(<<"2147483648.0">>, iolist_to_binary(encode(2147483648.0))),
+    ok.
+
+handler_test() ->
+    ?assertEqual(
+       {'EXIT',{json_encode,{bad_term,{x,y}}}},
+       catch encode({x,y})),
+    F = fun ({x,y}) -> [] end,
+    ?assertEqual(
+       <<"[]">>,
+       iolist_to_binary((encoder([{handler, F}]))({x, y}))),
+    ok.
+
+encode_empty_test_() ->
+    [{A, ?_assertEqual(<<"{}">>, iolist_to_binary(encode(B)))}
+     || {A, B} <- [{"eep18 {}", {}},
+                   {"eep18 {[]}", {[]}},
+                   {"{struct, []}", {struct, []}}]].
+
+encode_test_() ->
+    P = [{<<"k">>, <<"v">>}],
+    JSON = iolist_to_binary(encode({struct, P})),
+    [{atom_to_list(F),
+      ?_assertEqual(JSON, iolist_to_binary(encode(decode(JSON, [{format, F}]))))}
+     || F <- [struct, eep18, proplist]].
+
+format_test_() ->
+    P = [{<<"k">>, <<"v">>}],
+    JSON = iolist_to_binary(encode({struct, P})),
+    [{atom_to_list(F),
+      ?_assertEqual(A, decode(JSON, [{format, F}]))}
+     || {F, A} <- [{struct, {struct, P}},
+                   {eep18, {P}},
+                   {proplist, P}]].
+
+-endif.
diff --git a/src/mochinum.erl b/src/mochinum.erl
new file mode 100644
index 00000000..4ea7a22a
--- /dev/null
+++ b/src/mochinum.erl
@@ -0,0 +1,358 @@
+%% This file is a copy of `mochinum.erl' from mochiweb, revision
+%% d541e9a0f36c00dcadc2e589f20e47fbf46fc76f.  For the license, see
+%% `LICENSE-MIT-Mochi'.
+
+%% @copyright 2007 Mochi Media, Inc.
+%% @author Bob Ippolito <bob@mochimedia.com>
+
+%% @doc Useful numeric algorithms for floats that cover some deficiencies
+%% in the math module. More interesting is digits/1, which implements
+%% the algorithm from:
+%% http://www.cs.indiana.edu/~burger/fp/index.html
+%% See also "Printing Floating-Point Numbers Quickly and Accurately"
+%% in Proceedings of the SIGPLAN '96 Conference on Programming Language
+%% Design and Implementation.
+
+-module(mochinum).
+-author("Bob Ippolito <bob@mochimedia.com>").
+-export([digits/1, frexp/1, int_pow/2, int_ceil/1]).
+
+%% IEEE 754 Float exponent bias
+-define(FLOAT_BIAS, 1022).
+-define(MIN_EXP, -1074).
+-define(BIG_POW, 4503599627370496).
+
+%% External API
+
+%% @spec digits(number()) -> string()
+%% @doc  Returns a string that accurately represents the given integer or float
+%%       using a conservative amount of digits. Great for generating
+%%       human-readable output, or compact ASCII serializations for floats.
+digits(N) when is_integer(N) ->
+    integer_to_list(N);
+digits(0.0) ->
+    "0.0";
+digits(Float) ->
+    {Frac1, Exp1} = frexp_int(Float),
+    [Place0 | Digits0] = digits1(Float, Exp1, Frac1),
+    {Place, Digits} = transform_digits(Place0, Digits0),
+    R = insert_decimal(Place, Digits),
+    case Float < 0 of
+        true ->
+            [$- | R];
+        _ ->
+            R
+    end.
+
+%% @spec frexp(F::float()) -> {Frac::float(), Exp::integer()}
+%% @doc  Return the fractional and exponent part of an IEEE 754 double,
+%%       equivalent to the libc function of the same name.
+%%       F = Frac * pow(2, Exp).
+frexp(F) ->
+    frexp1(unpack(F)).
+
+%% @spec int_pow(X::integer(), N::integer()) -> Y::integer()
+%% @doc  Moderately efficient way to exponentiate integers.
+%%       int_pow(10, 2) = 100.
+int_pow(_X, 0) ->
+    1;
+int_pow(X, N) when N > 0 ->
+    int_pow(X, N, 1).
+
+%% @spec int_ceil(F::float()) -> integer()
+%% @doc  Return the ceiling of F as an integer. The ceiling is defined as
+%%       F when F == trunc(F);
+%%       trunc(F) when F &lt; 0;
+%%       trunc(F) + 1 when F &gt; 0.
+int_ceil(X) ->
+    T = trunc(X),
+    case (X - T) of
+        Pos when Pos > 0 -> T + 1;
+        _ -> T
+    end.
+
+
+%% Internal API
+
+int_pow(X, N, R) when N < 2 ->
+    R * X;
+int_pow(X, N, R) ->
+    int_pow(X * X, N bsr 1, case N band 1 of 1 -> R * X; 0 -> R end).
+
+insert_decimal(0, S) ->
+    "0." ++ S;
+insert_decimal(Place, S) when Place > 0 ->
+    L = length(S),
+    case Place - L of
+         0 ->
+            S ++ ".0";
+        N when N < 0 ->
+            {S0, S1} = lists:split(L + N, S),
+            S0 ++ "." ++ S1;
+        N when N < 6 ->
+            %% More places than digits
+            S ++ lists:duplicate(N, $0) ++ ".0";
+        _ ->
+            insert_decimal_exp(Place, S)
+    end;
+insert_decimal(Place, S) when Place > -6 ->
+    "0." ++ lists:duplicate(abs(Place), $0) ++ S;
+insert_decimal(Place, S) ->
+    insert_decimal_exp(Place, S).
+
+insert_decimal_exp(Place, S) ->
+    [C | S0] = S,
+    S1 = case S0 of
+             [] ->
+                 "0";
+             _ ->
+                 S0
+         end,
+    Exp = case Place < 0 of
+              true ->
+                  "e-";
+              false ->
+                  "e+"
+          end,
+    [C] ++ "." ++ S1 ++ Exp ++ integer_to_list(abs(Place - 1)).
+
+
+digits1(Float, Exp, Frac) ->
+    Round = ((Frac band 1) =:= 0),
+    case Exp >= 0 of
+        true ->
+            BExp = 1 bsl Exp,
+            case (Frac =/= ?BIG_POW) of
+                true ->
+                    scale((Frac * BExp * 2), 2, BExp, BExp,
+                          Round, Round, Float);
+                false ->
+                    scale((Frac * BExp * 4), 4, (BExp * 2), BExp,
+                          Round, Round, Float)
+            end;
+        false ->
+            case (Exp =:= ?MIN_EXP) orelse (Frac =/= ?BIG_POW) of
+                true ->
+                    scale((Frac * 2), 1 bsl (1 - Exp), 1, 1,
+                          Round, Round, Float);
+                false ->
+                    scale((Frac * 4), 1 bsl (2 - Exp), 2, 1,
+                          Round, Round, Float)
+            end
+    end.
+
+scale(R, S, MPlus, MMinus, LowOk, HighOk, Float) ->
+    Est = int_ceil(math:log10(abs(Float)) - 1.0e-10),
+    %% Note that the scheme implementation uses a 326 element look-up table
+    %% for int_pow(10, N) where we do not.
+    case Est >= 0 of
+        true ->
+            fixup(R, S * int_pow(10, Est), MPlus, MMinus, Est,
+                  LowOk, HighOk);
+        false ->
+            Scale = int_pow(10, -Est),
+            fixup(R * Scale, S, MPlus * Scale, MMinus * Scale, Est,
+                  LowOk, HighOk)
+    end.
+
+fixup(R, S, MPlus, MMinus, K, LowOk, HighOk) ->
+    TooLow = case HighOk of
+                 true ->
+                     (R + MPlus) >= S;
+                 false ->
+                     (R + MPlus) > S
+             end,
+    case TooLow of
+        true ->
+            [(K + 1) | generate(R, S, MPlus, MMinus, LowOk, HighOk)];
+        false ->
+            [K | generate(R * 10, S, MPlus * 10, MMinus * 10, LowOk, HighOk)]
+    end.
+
+generate(R0, S, MPlus, MMinus, LowOk, HighOk) ->
+    D = R0 div S,
+    R = R0 rem S,
+    TC1 = case LowOk of
+              true ->
+                  R =< MMinus;
+              false ->
+                  R < MMinus
+          end,
+    TC2 = case HighOk of
+              true ->
+                  (R + MPlus) >= S;
+              false ->
+                  (R + MPlus) > S
+          end,
+    case TC1 of
+        false ->
+            case TC2 of
+                false ->
+                    [D | generate(R * 10, S, MPlus * 10, MMinus * 10,
+                                  LowOk, HighOk)];
+                true ->
+                    [D + 1]
+            end;
+        true ->
+            case TC2 of
+                false ->
+                    [D];
+                true ->
+                    case R * 2 < S of
+                        true ->
+                            [D];
+                        false ->
+                            [D + 1]
+                    end
+            end
+    end.
+
+unpack(Float) ->
+    <<Sign:1, Exp:11, Frac:52>> = <<Float:64/float>>,
+    {Sign, Exp, Frac}.
+
+frexp1({_Sign, 0, 0}) ->
+    {0.0, 0};
+frexp1({Sign, 0, Frac}) ->
+    Exp = log2floor(Frac),
+    <<Frac1:64/float>> = <<Sign:1, ?FLOAT_BIAS:11, (Frac-1):52>>,
+    {Frac1, -(?FLOAT_BIAS) - 52 + Exp};
+frexp1({Sign, Exp, Frac}) ->
+    <<Frac1:64/float>> = <<Sign:1, ?FLOAT_BIAS:11, Frac:52>>,
+    {Frac1, Exp - ?FLOAT_BIAS}.
+
+log2floor(Int) ->
+    log2floor(Int, 0).
+
+log2floor(0, N) ->
+    N;
+log2floor(Int, N) ->
+    log2floor(Int bsr 1, 1 + N).
+
+
+transform_digits(Place, [0 | Rest]) ->
+    transform_digits(Place, Rest);
+transform_digits(Place, Digits) ->
+    {Place, [$0 + D || D <- Digits]}.
+
+
+frexp_int(F) ->
+    case unpack(F) of
+        {_Sign, 0, Frac} ->
+            {Frac, ?MIN_EXP};
+        {_Sign, Exp, Frac} ->
+            {Frac + (1 bsl 52), Exp - 53 - ?FLOAT_BIAS}
+    end.
+
+%%
+%% Tests
+%%
+-ifdef(TEST).
+-include_lib("eunit/include/eunit.hrl").
+
+int_ceil_test() ->
+    ?assertEqual(1, int_ceil(0.0001)),
+    ?assertEqual(0, int_ceil(0.0)),
+    ?assertEqual(1, int_ceil(0.99)),
+    ?assertEqual(1, int_ceil(1.0)),
+    ?assertEqual(-1, int_ceil(-1.5)),
+    ?assertEqual(-2, int_ceil(-2.0)),
+    ok.
+
+int_pow_test() ->
+    ?assertEqual(1, int_pow(1, 1)),
+    ?assertEqual(1, int_pow(1, 0)),
+    ?assertEqual(1, int_pow(10, 0)),
+    ?assertEqual(10, int_pow(10, 1)),
+    ?assertEqual(100, int_pow(10, 2)),
+    ?assertEqual(1000, int_pow(10, 3)),
+    ok.
+
+digits_test() ->
+    ?assertEqual("0",
+                 digits(0)),
+    ?assertEqual("0.0",
+                 digits(0.0)),
+    ?assertEqual("1.0",
+                 digits(1.0)),
+    ?assertEqual("-1.0",
+                 digits(-1.0)),
+    ?assertEqual("0.1",
+                 digits(0.1)),
+    ?assertEqual("0.01",
+                 digits(0.01)),
+    ?assertEqual("0.001",
+                 digits(0.001)),
+    ?assertEqual("1.0e+6",
+                 digits(1000000.0)),
+    ?assertEqual("0.5",
+                 digits(0.5)),
+    ?assertEqual("4503599627370496.0",
+                 digits(4503599627370496.0)),
+    %% small denormalized number
+    %% 4.94065645841246544177e-324 =:= 5.0e-324
+    <<SmallDenorm/float>> = <<0,0,0,0,0,0,0,1>>,
+    ?assertEqual("5.0e-324",
+                 digits(SmallDenorm)),
+    ?assertEqual(SmallDenorm,
+                 list_to_float(digits(SmallDenorm))),
+    %% large denormalized number
+    %% 2.22507385850720088902e-308
+    <<BigDenorm/float>> = <<0,15,255,255,255,255,255,255>>,
+    ?assertEqual("2.225073858507201e-308",
+                 digits(BigDenorm)),
+    ?assertEqual(BigDenorm,
+                 list_to_float(digits(BigDenorm))),
+    %% small normalized number
+    %% 2.22507385850720138309e-308
+    <<SmallNorm/float>> = <<0,16,0,0,0,0,0,0>>,
+    ?assertEqual("2.2250738585072014e-308",
+                 digits(SmallNorm)),
+    ?assertEqual(SmallNorm,
+                 list_to_float(digits(SmallNorm))),
+    %% large normalized number
+    %% 1.79769313486231570815e+308
+    <<LargeNorm/float>> = <<127,239,255,255,255,255,255,255>>,
+    ?assertEqual("1.7976931348623157e+308",
+                 digits(LargeNorm)),
+    ?assertEqual(LargeNorm,
+                 list_to_float(digits(LargeNorm))),
+    %% issue #10 - mochinum:frexp(math:pow(2, -1074)).
+    ?assertEqual("5.0e-324",
+                 digits(math:pow(2, -1074))),
+    ok.
+
+frexp_test() ->
+    %% zero
+    ?assertEqual({0.0, 0}, frexp(0.0)),
+    %% one
+    ?assertEqual({0.5, 1}, frexp(1.0)),
+    %% negative one
+    ?assertEqual({-0.5, 1}, frexp(-1.0)),
+    %% small denormalized number
+    %% 4.94065645841246544177e-324
+    <<SmallDenorm/float>> = <<0,0,0,0,0,0,0,1>>,
+    ?assertEqual({0.5, -1073}, frexp(SmallDenorm)),
+    %% large denormalized number
+    %% 2.22507385850720088902e-308
+    <<BigDenorm/float>> = <<0,15,255,255,255,255,255,255>>,
+    ?assertEqual(
+       {0.99999999999999978, -1022},
+       frexp(BigDenorm)),
+    %% small normalized number
+    %% 2.22507385850720138309e-308
+    <<SmallNorm/float>> = <<0,16,0,0,0,0,0,0>>,
+    ?assertEqual({0.5, -1021}, frexp(SmallNorm)),
+    %% large normalized number
+    %% 1.79769313486231570815e+308
+    <<LargeNorm/float>> = <<127,239,255,255,255,255,255,255>>,
+    ?assertEqual(
+        {0.99999999999999989, 1024},
+        frexp(LargeNorm)),
+    %% issue #10 - mochinum:frexp(math:pow(2, -1074)).
+    ?assertEqual(
+       {0.5, -1073},
+       frexp(math:pow(2, -1074))),
+    ok.
+
+-endif.
diff --git a/src/pg_local.erl b/src/pg_local.erl
index e2e82f1f..f535b136 100644
--- a/src/pg_local.erl
+++ b/src/pg_local.erl
@@ -13,7 +13,7 @@
 %%    versions of Erlang/OTP. The remaining type specs have been
 %%    removed.
 
-%% All modifications are (C) 2010-2012 VMware, Inc.
+%% All modifications are (C) 2010-2013 GoPivotal, Inc.
 
 %% %CopyrightBegin%
 %% 
diff --git a/src/pmon.erl b/src/pmon.erl
index 45786577..86308167 100644
--- a/src/pmon.erl
+++ b/src/pmon.erl
@@ -10,14 +10,18 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2011-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2011-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(pmon).
 
--export([new/0, monitor/2, monitor_all/2, demonitor/2, is_monitored/2, erase/2,
-         monitored/1, is_empty/1]).
+-export([new/0, new/1, monitor/2, monitor_all/2, demonitor/2,
+         is_monitored/2, erase/2, monitored/1, is_empty/1]).
+
+-compile({no_auto_import, [monitor/2]}).
+
+-record(state, {dict, module}).
 
 -ifdef(use_specs).
 
@@ -25,40 +29,50 @@
 
 -export_type([?MODULE/0]).
 
--opaque(?MODULE()    :: dict()).
+-opaque(?MODULE() :: #state{dict   :: dict(),
+                            module :: atom()}).
+
+-type(item()         :: pid() | {atom(), node()}).
 
 -spec(new/0          :: () -> ?MODULE()).
--spec(monitor/2      :: (pid(), ?MODULE()) -> ?MODULE()).
--spec(monitor_all/2  :: ([pid()], ?MODULE()) -> ?MODULE()).
--spec(demonitor/2    :: (pid(), ?MODULE()) -> ?MODULE()).
--spec(is_monitored/2 :: (pid(), ?MODULE()) -> boolean()).
--spec(erase/2        :: (pid(), ?MODULE()) -> ?MODULE()).
--spec(monitored/1    :: (?MODULE()) -> [pid()]).
+-spec(new/1          :: ('erlang' | 'delegate') -> ?MODULE()).
+-spec(monitor/2      :: (item(), ?MODULE()) -> ?MODULE()).
+-spec(monitor_all/2  :: ([item()], ?MODULE()) -> ?MODULE()).
+-spec(demonitor/2    :: (item(), ?MODULE()) -> ?MODULE()).
+-spec(is_monitored/2 :: (item(), ?MODULE()) -> boolean()).
+-spec(erase/2        :: (item(), ?MODULE()) -> ?MODULE()).
+-spec(monitored/1    :: (?MODULE()) -> [item()]).
 -spec(is_empty/1     :: (?MODULE()) -> boolean()).
 
 -endif.
 
-new() -> dict:new().
+new() -> new(erlang).
+
+new(Module) -> #state{dict   = dict:new(),
+                      module = Module}.
 
-monitor(Pid, M) ->
-    case dict:is_key(Pid, M) of
-        true  -> M;
-        false -> dict:store(Pid, erlang:monitor(process, Pid), M)
+monitor(Item, S = #state{dict = M, module = Module}) ->
+    case dict:is_key(Item, M) of
+        true  -> S;
+        false -> S#state{dict = dict:store(
+                                  Item, Module:monitor(process, Item), M)}
     end.
 
-monitor_all(Pids, M) -> lists:foldl(fun monitor/2, M, Pids).
+monitor_all([],     S) -> S;                %% optimisation
+monitor_all([Item], S) -> monitor(Item, S); %% optimisation
+monitor_all(Items,  S) -> lists:foldl(fun monitor/2, S, Items).
 
-demonitor(Pid, M) ->
-    case dict:find(Pid, M) of
-        {ok, MRef} -> erlang:demonitor(MRef),
-                      dict:erase(Pid, M);
+demonitor(Item, S = #state{dict = M, module = Module}) ->
+    case dict:find(Item, M) of
+        {ok, MRef} -> Module:demonitor(MRef),
+                      S#state{dict = dict:erase(Item, M)};
-        error      -> M
+        error      -> S %% not monitored: return the state, not the bare dict
     end.
 
-is_monitored(Pid, M) -> dict:is_key(Pid, M).
+is_monitored(Item, #state{dict = M}) -> dict:is_key(Item, M).
 
-erase(Pid, M) -> dict:erase(Pid, M).
+erase(Item, S = #state{dict = M}) -> S#state{dict = dict:erase(Item, M)}.
 
-monitored(M) -> dict:fetch_keys(M).
+monitored(#state{dict = M}) -> dict:fetch_keys(M).
 
-is_empty(M) -> dict:size(M) == 0.
+is_empty(#state{dict = M}) -> dict:size(M) == 0.
diff --git a/src/priority_queue.erl b/src/priority_queue.erl
index 780fa2e9..6995c3be 100644
--- a/src/priority_queue.erl
+++ b/src/priority_queue.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 %% Priority queues have essentially the same interface as ordinary
@@ -51,7 +51,7 @@
 
 -type(q() :: pqueue()).
 -type(priority() :: integer() | 'infinity').
--type(squeue() :: {queue, [any()], [any()]}).
+-type(squeue() :: {queue, [any()], [any()], non_neg_integer()}).
 -type(pqueue() ::  squeue() | {pqueue, [{priority(), squeue()}]}).
 
 -spec(new/0 :: () -> pqueue()).
@@ -69,9 +69,9 @@
 %%----------------------------------------------------------------------------
 
 new() ->
-    {queue, [], []}.
+    {queue, [], [], 0}.
 
-is_queue({queue, R, F}) when is_list(R), is_list(F) ->
+is_queue({queue, R, F, L}) when is_list(R), is_list(F), is_integer(L) ->
     true;
 is_queue({pqueue, Queues}) when is_list(Queues) ->
     lists:all(fun ({infinity, Q}) -> is_queue(Q);
@@ -80,17 +80,17 @@ is_queue({pqueue, Queues}) when is_list(Queues) ->
 is_queue(_) ->
     false.
 
-is_empty({queue, [], []}) ->
+is_empty({queue, [], [], 0}) ->
     true;
 is_empty(_) ->
     false.
 
-len({queue, R, F}) when is_list(R), is_list(F) ->
-    length(R) + length(F);
+len({queue, _R, _F, L}) ->
+    L;
 len({pqueue, Queues}) ->
     lists:sum([len(Q) || {_, Q} <- Queues]).
 
-to_list({queue, In, Out}) when is_list(In), is_list(Out) ->
+to_list({queue, In, Out, _Len}) when is_list(In), is_list(Out) ->
     [{0, V} || V <- Out ++ lists:reverse(In, [])];
 to_list({pqueue, Queues}) ->
     [{maybe_negate_priority(P), V} || {P, Q} <- Queues,
@@ -99,13 +99,13 @@ to_list({pqueue, Queues}) ->
 in(Item, Q) ->
     in(Item, 0, Q).
 
-in(X, 0, {queue, [_] = In, []}) ->
-    {queue, [X], In};
-in(X, 0, {queue, In, Out}) when is_list(In), is_list(Out) ->
-    {queue, [X|In], Out};
-in(X, Priority, _Q = {queue, [], []}) ->
+in(X, 0, {queue, [_] = In, [], 1}) ->
+    {queue, [X], In, 2};
+in(X, 0, {queue, In, Out, Len}) when is_list(In), is_list(Out) ->
+    {queue, [X|In], Out, Len + 1};
+in(X, Priority, _Q = {queue, [], [], 0}) ->
     in(X, Priority, {pqueue, []});
-in(X, Priority, Q = {queue, _, _}) ->
+in(X, Priority, Q = {queue, _, _, _}) ->
     in(X, Priority, {pqueue, [{0, Q}]});
 in(X, Priority, {pqueue, Queues}) ->
     P = maybe_negate_priority(Priority),
@@ -113,33 +113,33 @@ in(X, Priority, {pqueue, Queues}) ->
                  {value, {_, Q}} ->
                      lists:keyreplace(P, 1, Queues, {P, in(X, Q)});
                  false when P == infinity ->
-                     [{P, {queue, [X], []}} | Queues];
+                     [{P, {queue, [X], [], 1}} | Queues];
                  false ->
                      case Queues of
                          [{infinity, InfQueue} | Queues1] ->
                              [{infinity, InfQueue} |
-                              lists:keysort(1, [{P, {queue, [X], []}} | Queues1])];
+                              lists:keysort(1, [{P, {queue, [X], [], 1}} | Queues1])];
                          _ ->
-                             lists:keysort(1, [{P, {queue, [X], []}} | Queues])
+                             lists:keysort(1, [{P, {queue, [X], [], 1}} | Queues])
                      end
              end}.
 
-out({queue, [], []} = Q) ->
+out({queue, [], [], 0} = Q) ->
     {empty, Q};
-out({queue, [V], []}) ->
-    {{value, V}, {queue, [], []}};
-out({queue, [Y|In], []}) ->
+out({queue, [V], [], 1}) ->
+    {{value, V}, {queue, [], [], 0}};
+out({queue, [Y|In], [], Len}) ->
     [V|Out] = lists:reverse(In, []),
-    {{value, V}, {queue, [Y], Out}};
-out({queue, In, [V]}) when is_list(In) ->
-    {{value,V}, r2f(In)};
-out({queue, In,[V|Out]}) when is_list(In) ->
-    {{value, V}, {queue, In, Out}};
+    {{value, V}, {queue, [Y], Out, Len - 1}}; %% Len is the 4th queue field
+out({queue, In, [V], Len}) when is_list(In) ->
+    {{value,V}, r2f(In, Len - 1)};
+out({queue, In,[V|Out], Len}) when is_list(In) ->
+    {{value, V}, {queue, In, Out, Len - 1}};
 out({pqueue, [{P, Q} | Queues]}) ->
     {R, Q1} = out(Q),
     NewQ = case is_empty(Q1) of
                true -> case Queues of
-                           []           -> {queue, [], []};
+                           []           -> {queue, [], [], 0};
                            [{0, OnlyQ}] -> OnlyQ;
                            [_|_]        -> {pqueue, Queues}
                        end;
@@ -147,13 +147,13 @@ out({pqueue, [{P, Q} | Queues]}) ->
            end,
     {R, NewQ}.
 
-join(A, {queue, [], []}) ->
+join(A, {queue, [], [], 0}) ->
     A;
-join({queue, [], []}, B) ->
+join({queue, [], [], 0}, B) ->
     B;
-join({queue, AIn, AOut}, {queue, BIn, BOut}) ->
-    {queue, BIn, AOut ++ lists:reverse(AIn, BOut)};
-join(A = {queue, _, _}, {pqueue, BPQ}) ->
+join({queue, AIn, AOut, ALen}, {queue, BIn, BOut, BLen}) ->
+    {queue, BIn, AOut ++ lists:reverse(AIn, BOut), ALen + BLen};
+join(A = {queue, _, _, _}, {pqueue, BPQ}) ->
     {Pre, Post} =
         lists:splitwith(fun ({P, _}) -> P < 0 orelse P == infinity end, BPQ),
     Post1 = case Post of
@@ -162,7 +162,7 @@ join(A = {queue, _, _}, {pqueue, BPQ}) ->
                 _                         -> [ {0, A} | Post ]
             end,
     {pqueue, Pre ++ Post1};
-join({pqueue, APQ}, B = {queue, _, _}) ->
+join({pqueue, APQ}, B = {queue, _, _, _}) ->
     {Pre, Post} =
         lists:splitwith(fun ({P, _}) -> P < 0 orelse P == infinity end, APQ),
     Post1 = case Post of
@@ -185,10 +185,10 @@ merge([{PA, A}|As], Bs = [{PB, _}|_], Acc) when PA < PB orelse PA == infinity ->
 merge(As = [{_, _}|_], [{PB, B}|Bs], Acc) ->
     merge(As, Bs, [ {PB, B} | Acc ]).
 
-r2f([])      -> {queue, [], []};
-r2f([_] = R) -> {queue, [], R};
-r2f([X,Y])   -> {queue, [X], [Y]};
-r2f([X,Y|R]) -> {queue, [X,Y], lists:reverse(R, [])}.
+r2f([],      0) -> {queue, [], [], 0};
+r2f([_] = R, 1) -> {queue, [], R, 1};
+r2f([X,Y],   2) -> {queue, [X], [Y], 2};
+r2f([X,Y|R], L) -> {queue, [X,Y], lists:reverse(R, []), L}.
 
 maybe_negate_priority(infinity) -> infinity;
 maybe_negate_priority(P)        -> -P.
diff --git a/src/rabbit.erl b/src/rabbit.erl
index df009529..a9974711 100644
--- a/src/rabbit.erl
+++ b/src/rabbit.erl
@@ -10,17 +10,18 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit).
 
 -behaviour(application).
 
--export([maybe_hipe_compile/0, prepare/0, start/0, stop/0, stop_and_halt/0,
-         status/0, is_running/0, is_running/1, environment/0,
-         rotate_logs/1, force_event_refresh/0]).
+-export([start/0, boot/0, stop/0,
+         stop_and_halt/0, await_startup/0, status/0, is_running/0,
+         is_running/1, environment/0, rotate_logs/1, force_event_refresh/0,
+         start_fhc/0]).
 
 -export([start/2, stop/1]).
 
@@ -35,7 +36,7 @@
 -rabbit_boot_step({codec_correctness_check,
                    [{description, "codec correctness check"},
                     {mfa,         {rabbit_binary_generator,
-                                   check_empty_content_body_frame_size,
+                                   check_empty_frame_size,
                                    []}},
                     {requires,    pre_boot},
                     {enables,     external_infrastructure}]}).
@@ -53,8 +54,7 @@
 
 -rabbit_boot_step({file_handle_cache,
                    [{description, "file handle cache server"},
-                    {mfa,         {rabbit_sup, start_restartable_child,
-                                   [file_handle_cache]}},
+                    {mfa,         {rabbit, start_fhc, []}},
                     {requires,    pre_boot},
                     {enables,     worker_pool}]}).
 
@@ -123,7 +123,7 @@
                    [{description, "node monitor"},
                     {mfa,         {rabbit_sup, start_restartable_child,
                                    [rabbit_node_monitor]}},
-                    {requires,    kernel_ready},
+                    {requires,    rabbit_alarm},
                     {enables,     core_initialized}]}).
 
 -rabbit_boot_step({core_initialized,
@@ -161,7 +161,12 @@
 
 -rabbit_boot_step({log_relay,
                    [{description, "error log relay"},
-                    {mfa,         {rabbit_error_logger, boot, []}},
+                    {mfa,         {rabbit_sup, start_child,
+                                   [rabbit_error_logger_lifecycle,
+                                    supervised_lifecycle,
+                                    [rabbit_error_logger_lifecycle,
+                                     {rabbit_error_logger, start, []},
+                                     {rabbit_error_logger, stop,  []}]]}},
                     {requires,    routing_ready},
                     {enables,     networking}]}).
 
@@ -176,9 +181,15 @@
 
 -rabbit_boot_step({notify_cluster,
                    [{description, "notify cluster nodes"},
-                    {mfa,         {rabbit_node_monitor, notify_cluster, []}},
+                    {mfa,         {rabbit_node_monitor, notify_node_up, []}},
                     {requires,    networking}]}).
 
+-rabbit_boot_step({background_gc,
+                   [{description, "background garbage collection"},
+                    {mfa,         {rabbit_sup, start_restartable_child,
+                                   [background_gc]}},
+                    {enables,     networking}]}).
+
 %%---------------------------------------------------------------------------
 
 -include("rabbit_framing.hrl").
@@ -186,21 +197,6 @@
 
 -define(APPS, [os_mon, mnesia, rabbit]).
 
-%% see bug 24513 for how this list was created
--define(HIPE_WORTHY,
-        [rabbit_reader, rabbit_channel, gen_server2,
-         rabbit_exchange, rabbit_command_assembler, rabbit_framing_amqp_0_9_1,
-         rabbit_basic, rabbit_event, lists, queue, priority_queue,
-         rabbit_router, rabbit_trace, rabbit_misc, rabbit_binary_parser,
-         rabbit_exchange_type_direct, rabbit_guid, rabbit_net,
-         rabbit_amqqueue_process, rabbit_variable_queue,
-         rabbit_binary_generator, rabbit_writer, delegate, gb_sets, lqueue,
-         sets, orddict, rabbit_amqqueue, rabbit_limiter, gb_trees,
-         rabbit_queue_index, gen, dict, ordsets, file_handle_cache,
-         rabbit_msg_store, array, rabbit_msg_store_ets_index, rabbit_msg_file,
-         rabbit_exchange_type_fanout, rabbit_exchange_type_topic, mnesia,
-         mnesia_lib, rpc, mnesia_tm, qlc, sofs, proplists, credit_flow, pmon]).
-
 %% HiPE compilation uses multiple cores anyway, but some bits are
 %% IO-bound so we can go faster if we parallelise a bit more. In
 %% practice 2 processes seems just as fast as any other number > 1,
@@ -216,11 +212,11 @@
 -type(log_location() :: 'tty' | 'undefined' | file:filename()).
 -type(param() :: atom()).
 
--spec(maybe_hipe_compile/0 :: () -> 'ok').
--spec(prepare/0 :: () -> 'ok').
 -spec(start/0 :: () -> 'ok').
+-spec(boot/0 :: () -> 'ok').
 -spec(stop/0 :: () -> 'ok').
 -spec(stop_and_halt/0 :: () -> no_return()).
+-spec(await_startup/0 :: () -> 'ok').
 -spec(status/0 ::
         () -> [{pid, integer()} |
                {running_applications, [{atom(), string(), string()}]} |
@@ -229,7 +225,7 @@
                {memory, any()}]).
 -spec(is_running/0 :: () -> boolean()).
 -spec(is_running/1 :: (node()) -> boolean()).
--spec(environment/0 :: () -> [{param() | term()}]).
+-spec(environment/0 :: () -> [{param(), term()}]).
 -spec(rotate_logs/1 :: (file_suffix()) -> rabbit_types:ok_or_error(any())).
 -spec(force_event_refresh/0 :: () -> 'ok').
 
@@ -251,19 +247,33 @@
 
 %%----------------------------------------------------------------------------
 
+%% HiPE compilation happens before we have log handlers, so
+%% io:format/2 is the only way we can report anything.
+
 maybe_hipe_compile() ->
     {ok, Want} = application:get_env(rabbit, hipe_compile),
     Can = code:which(hipe) =/= non_existing,
     case {Want, Can} of
-        {true,  true}  -> hipe_compile();
-        {true,  false} -> io:format("Not HiPE compiling: HiPE not found in "
-                                    "this Erlang installation.~n");
-        {false, _}     -> ok
+        {true,  true}  -> hipe_compile(),
+                          true;
+        {true,  false} -> false;
+        {false, _}     -> true
     end.
 
+warn_if_hipe_compilation_failed(true) ->
+    ok;
+warn_if_hipe_compilation_failed(false) ->
+    error_logger:warning_msg(
+      "Not HiPE compiling: HiPE not found in this Erlang installation.~n").
+
+%% HiPE compilation happens before we have log handlers and can take a
+%% long time, so make an exception to our no-stdout policy and display
+%% progress via stdout.
 hipe_compile() ->
-    Count = length(?HIPE_WORTHY),
-    io:format("HiPE compiling:  |~s|~n                 |",
+    {ok, HipeModulesAll} = application:get_env(rabbit, hipe_modules),
+    HipeModules = [HM || HM <- HipeModulesAll, code:which(HM) =/= non_existing],
+    Count = length(HipeModules),
+    io:format("~nHiPE compiling:  |~s|~n                 |",
               [string:copies("-", Count)]),
     T1 = erlang:now(),
     PidMRefs = [spawn_monitor(fun () -> [begin
@@ -271,7 +281,7 @@ hipe_compile() ->
                                              io:format("#")
                                          end || M <- Ms]
                               end) ||
-                   Ms <- split(?HIPE_WORTHY, ?HIPE_PROCESSES)],
+                   Ms <- split(HipeModules, ?HIPE_PROCESSES)],
     [receive
          {'DOWN', MRef, process, _, normal} -> ok;
          {'DOWN', MRef, process, _, Reason} -> exit(Reason)
@@ -285,29 +295,80 @@ split(L, N) -> split0(L, [[] || _ <- lists:seq(1, N)]).
 split0([],       Ls)       -> Ls;
 split0([I | Is], [L | Ls]) -> split0(Is, Ls ++ [[I | L]]).
 
-prepare() ->
-    ok = ensure_working_log_handlers(),
-    ok = rabbit_upgrade:maybe_upgrade_mnesia().
+ensure_application_loaded() ->
+    %% We end up looking at the rabbit app's env for HiPE and log
+    %% handling, so it needs to be loaded. But during the tests, it
+    %% may end up getting loaded twice, so guard against that.
+    case application:load(rabbit) of
+        ok                                -> ok;
+        {error, {already_loaded, rabbit}} -> ok
+    end.
 
 start() ->
+    start_it(fun() ->
+                     %% We do not want to HiPE compile or upgrade
+                     %% mnesia after just restarting the app
+                     ok = ensure_application_loaded(),
+                     ok = ensure_working_log_handlers(),
+                     rabbit_node_monitor:prepare_cluster_status_files(),
+                     rabbit_mnesia:check_cluster_consistency(),
+                     ok = app_utils:start_applications(
+                            app_startup_order(), fun handle_app_error/2),
+                     ok = log_broker_started(rabbit_plugins:active())
+             end).
+
+boot() ->
+    start_it(fun() ->
+                     ok = ensure_application_loaded(),
+                     Success = maybe_hipe_compile(),
+                     ok = ensure_working_log_handlers(),
+                     warn_if_hipe_compilation_failed(Success),
+                     rabbit_node_monitor:prepare_cluster_status_files(),
+                     ok = rabbit_upgrade:maybe_upgrade_mnesia(),
+                     %% It's important that the consistency check happens after
+                     %% the upgrade, since if we are a secondary node the
+                     %% primary node will have forgotten us
+                     rabbit_mnesia:check_cluster_consistency(),
+                     Plugins = rabbit_plugins:setup(),
+                     ToBeLoaded = Plugins ++ ?APPS,
+                     ok = app_utils:load_applications(ToBeLoaded),
+                     StartupApps = app_utils:app_dependency_order(ToBeLoaded,
+                                                                  false),
+                     ok = app_utils:start_applications(
+                            StartupApps, fun handle_app_error/2),
+                     ok = log_broker_started(Plugins)
+             end).
+
+handle_app_error(App, {bad_return, {_MFA, {'EXIT', {Reason, _}}}}) ->
+    throw({could_not_start, App, Reason});
+
+handle_app_error(App, Reason) ->
+    throw({could_not_start, App, Reason}).
+
+start_it(StartFun) ->
+    Marker = spawn_link(fun() -> receive stop -> ok end end),
+    register(rabbit_boot, Marker),
     try
-        %% prepare/1 ends up looking at the rabbit app's env, so it
-        %% needs to be loaded, but during the tests, it may end up
-        %% getting loaded twice, so guard against that
-        case application:load(rabbit) of
-            ok                                -> ok;
-            {error, {already_loaded, rabbit}} -> ok
-        end,
-        ok = prepare(),
-        ok = rabbit_misc:start_applications(application_load_order())
+        StartFun()
+    catch
+        throw:{could_not_start, _App, _Reason}=Err ->
+            boot_error(Err, not_available);
+         _:Reason ->
+            boot_error(Reason, erlang:get_stacktrace())
     after
-        %%give the error loggers some time to catch up
+        unlink(Marker),
+        Marker ! stop,
+        %% give the error loggers some time to catch up
         timer:sleep(100)
     end.
 
 stop() ->
-    rabbit_log:info("Stopping Rabbit~n"),
-    ok = rabbit_misc:stop_applications(application_load_order()).
+    case whereis(rabbit_boot) of
+        undefined -> ok;
+        _         -> await_startup()
+    end,
+    rabbit_log:info("Stopping RabbitMQ~n"),
+    ok = app_utils:stop_applications(app_shutdown_order()).
 
 stop_and_halt() ->
     try
@@ -318,12 +379,15 @@ stop_and_halt() ->
     end,
     ok.
 
+await_startup() ->
+    app_utils:wait_for_applications(app_startup_order()).
+
 status() ->
     S1 = [{pid,                  list_to_integer(os:getpid())},
-          {running_applications, application:which_applications(infinity)},
+          {running_applications, rabbit_misc:which_applications()},
           {os,                   os:type()},
           {erlang_version,       erlang:system_info(system_version)},
-          {memory,               erlang:memory()}],
+          {memory,               rabbit_vm:memory()}],
     S2 = rabbit_misc:filter_exit_map(
            fun ({Key, {M, F, A}}) -> {Key, erlang:apply(M, F, A)} end,
            [{vm_memory_high_watermark, {vm_memory_monitor,
@@ -348,13 +412,11 @@ status() ->
 
 is_running() -> is_running(node()).
 
-is_running(Node) ->
-    rabbit_nodes:is_running(Node, rabbit).
+is_running(Node) -> rabbit_nodes:is_process_running(Node, rabbit).
 
 environment() ->
-    lists:keysort(
-      1, [P || P = {K, _} <- application:get_all_env(rabbit),
-               K =/= default_pass]).
+    lists:keysort(1, [P || P = {K, _} <- application:get_all_env(rabbit),
+                           K =/= default_pass]).
 
 rotate_logs(BinarySuffix) ->
     Suffix = binary_to_list(BinarySuffix),
@@ -371,89 +433,55 @@ rotate_logs(BinarySuffix) ->
 start(normal, []) ->
     case erts_version_check() of
         ok ->
+            {ok, Vsn} = application:get_key(rabbit, vsn),
+            error_logger:info_msg("Starting RabbitMQ ~s on Erlang ~s~n~s~n~s~n",
+                                  [Vsn, erlang:system_info(otp_release),
+                                   ?COPYRIGHT_MESSAGE, ?INFORMATION_MESSAGE]),
             {ok, SupPid} = rabbit_sup:start_link(),
             true = register(rabbit, self()),
             print_banner(),
+            log_banner(),
             [ok = run_boot_step(Step) || Step <- boot_steps()],
-            io:format("~nbroker running~n"),
             {ok, SupPid};
         Error ->
             Error
     end.
 
 stop(_State) ->
-    ok = rabbit_mnesia:record_running_nodes(),
-    terminated_ok = error_logger:delete_report_handler(rabbit_error_logger),
     ok = rabbit_alarm:stop(),
     ok = case rabbit_mnesia:is_clustered() of
              true  -> rabbit_amqqueue:on_node_down(node());
-             false -> rabbit_mnesia:empty_ram_only_tables()
+             false -> rabbit_table:clear_ram_only_tables()
          end,
     ok.
 
 %%---------------------------------------------------------------------------
 %% application life cycle
 
-application_load_order() ->
-    ok = load_applications(),
-    {ok, G} = rabbit_misc:build_acyclic_graph(
-                fun (App, _Deps) -> [{App, App}] end,
-                fun (App,  Deps) -> [{Dep, App} || Dep <- Deps] end,
-                [{App, app_dependencies(App)} ||
-                    {App, _Desc, _Vsn} <- application:loaded_applications()]),
-    true = digraph:del_vertices(
-             G, digraph:vertices(G) -- digraph_utils:reachable(?APPS, G)),
-    Result = digraph_utils:topsort(G),
-    true = digraph:delete(G),
-    Result.
-
-load_applications() ->
-    load_applications(queue:from_list(?APPS), sets:new()).
-
-load_applications(Worklist, Loaded) ->
-    case queue:out(Worklist) of
-        {empty, _WorkList} ->
-            ok;
-        {{value, App}, Worklist1} ->
-            case sets:is_element(App, Loaded) of
-                true  -> load_applications(Worklist1, Loaded);
-                false -> case application:load(App) of
-                             ok                             -> ok;
-                             {error, {already_loaded, App}} -> ok;
-                             Error                          -> throw(Error)
-                         end,
-                         load_applications(
-                           queue:join(Worklist1,
-                                      queue:from_list(app_dependencies(App))),
-                           sets:add_element(App, Loaded))
-            end
-    end.
+app_startup_order() ->
+    ok = app_utils:load_applications(?APPS),
+    app_utils:app_dependency_order(?APPS, false).
 
-app_dependencies(App) ->
-    case application:get_key(App, applications) of
-        undefined -> [];
-        {ok, Lst} -> Lst
-    end.
+app_shutdown_order() ->
+    Apps = ?APPS ++ rabbit_plugins:active(),
+    app_utils:app_dependency_order(Apps, true).
 
 %%---------------------------------------------------------------------------
 %% boot step logic
 
-run_boot_step({StepName, Attributes}) ->
-    Description = case lists:keysearch(description, 1, Attributes) of
-                      {value, {_, D}} -> D;
-                      false           -> StepName
-                  end,
+run_boot_step({_StepName, Attributes}) ->
     case [MFA || {mfa, MFA} <- Attributes] of
         [] ->
-            io:format("-- ~s~n", [Description]);
+            ok;
         MFAs ->
-            io:format("starting ~-60s ...", [Description]),
             [try
                  apply(M,F,A)
+             of
+                 ok ->              ok;
+                 {error, Reason} -> boot_error(Reason, not_available)
              catch
-                 _:Reason -> boot_step_error(Reason, erlang:get_stacktrace())
+                 _:Reason -> boot_error(Reason, erlang:get_stacktrace())
              end || {M,F,A} <- MFAs],
-            io:format("done~n"),
             ok
     end.
 
@@ -479,7 +507,8 @@ sort_boot_steps(UnsortedSteps) ->
             %% there is one, otherwise fail).
             SortedSteps = lists:reverse(
                             [begin
-                                 {StepName, Step} = digraph:vertex(G, StepName),
+                                 {StepName, Step} = digraph:vertex(G,
+                                                                   StepName),
                                  Step
                              end || StepName <- digraph_utils:topsort(G)]),
             digraph:delete(G),
@@ -489,14 +518,17 @@ sort_boot_steps(UnsortedSteps) ->
                      {mfa, {M,F,A}}         <- Attributes,
                      not erlang:function_exported(M, F, length(A))] of
                 []               -> SortedSteps;
-                MissingFunctions -> boot_error(
+                MissingFunctions -> basic_boot_error(
+                                      {missing_functions, MissingFunctions},
                                       "Boot step functions not exported: ~p~n",
                                       [MissingFunctions])
             end;
         {error, {vertex, duplicate, StepName}} ->
-            boot_error("Duplicate boot step name: ~w~n", [StepName]);
+            basic_boot_error({duplicate_boot_step, StepName},
+                             "Duplicate boot step name: ~w~n", [StepName]);
         {error, {edge, Reason, From, To}} ->
-            boot_error(
+            basic_boot_error(
+              {invalid_boot_step_dependency, From, To},
               "Could not add boot step dependency of ~w on ~w:~n~s",
               [To, From,
                case Reason of
@@ -510,30 +542,43 @@ sort_boot_steps(UnsortedSteps) ->
                end])
     end.
 
-boot_step_error({error, {timeout_waiting_for_tables, _}}, _Stacktrace) ->
+-ifdef(use_specs).
+-spec(boot_error/2 :: (term(), not_available | [tuple()]) -> no_return()).
+-endif.
+boot_error(Term={error, {timeout_waiting_for_tables, _}}, _Stacktrace) ->
+    AllNodes = rabbit_mnesia:cluster_nodes(all),
     {Err, Nodes} =
-        case rabbit_mnesia:read_previously_running_nodes() of
+        case AllNodes -- [node()] of
             [] -> {"Timeout contacting cluster nodes. Since RabbitMQ was"
                    " shut down forcefully~nit cannot determine which nodes"
-                   " are timing out. Details on all nodes will~nfollow.~n",
-                   rabbit_mnesia:all_clustered_nodes() -- [node()]};
+                   " are timing out.~n", []};
             Ns -> {rabbit_misc:format(
                      "Timeout contacting cluster nodes: ~p.~n", [Ns]),
                    Ns}
         end,
-    boot_error(Err ++ rabbit_nodes:diagnostics(Nodes) ++ "~n~n", []);
+    basic_boot_error(Term,
+                     Err ++ rabbit_nodes:diagnostics(Nodes) ++ "~n~n", []);
+boot_error(Reason, Stacktrace) ->
+    Fmt = "Error description:~n   ~p~n~n" ++
+        "Log files (may contain more information):~n   ~s~n   ~s~n~n",
+    Args = [Reason, log_location(kernel), log_location(sasl)],
+    boot_error(Reason, Fmt, Args, Stacktrace).
 
-boot_step_error(Reason, Stacktrace) ->
-    boot_error("Error description:~n   ~p~n~n"
-               "Log files (may contain more information):~n   ~s~n   ~s~n~n"
-               "Stack trace:~n   ~p~n~n",
-               [Reason, log_location(kernel), log_location(sasl), Stacktrace]).
+-ifdef(use_specs).
+-spec(boot_error/4 :: (term(), string(), [any()], not_available | [tuple()])
+                      -> no_return()).
+-endif.
+boot_error(Reason, Fmt, Args, not_available) ->
+    basic_boot_error(Reason, Fmt, Args);
+boot_error(Reason, Fmt, Args, Stacktrace) ->
+    basic_boot_error(Reason, Fmt ++ "Stack trace:~n   ~p~n~n",
+                     Args ++ [Stacktrace]).
 
-boot_error(Format, Args) ->
+basic_boot_error(Reason, Format, Args) ->
     io:format("~n~nBOOT FAILED~n===========~n~n" ++ Format, Args),
-    error_logger:error_msg(Format, Args),
+    rabbit_misc:local_info_msg(Format, Args),
     timer:sleep(1000),
-    exit({?MODULE, failure_during_boot}).
+    exit({?MODULE, failure_during_boot, Reason}).
 
 %%---------------------------------------------------------------------------
 %% boot step functions
@@ -543,10 +588,14 @@ boot_delegate() ->
     rabbit_sup:start_supervisor_child(delegate_sup, [Count]).
 
 recover() ->
-    rabbit_binding:recover(rabbit_exchange:recover(), rabbit_amqqueue:start()).
+    rabbit_policy:recover(),
+    Qs = rabbit_amqqueue:recover(),
+    ok = rabbit_binding:recover(rabbit_exchange:recover(),
+                                [QName || #amqqueue{name = QName} <- Qs]),
+    rabbit_amqqueue:start(Qs).
 
 maybe_insert_default_data() ->
-    case rabbit_mnesia:is_db_empty() of
+    case rabbit_table:is_empty() of
         true -> insert_default_data();
         false -> ok
     end.
@@ -561,7 +610,8 @@ insert_default_data() ->
     ok = rabbit_vhost:add(DefaultVHost),
     ok = rabbit_auth_backend_internal:add_user(DefaultUser, DefaultPass),
     ok = rabbit_auth_backend_internal:set_tags(DefaultUser, DefaultTags),
-    ok = rabbit_auth_backend_internal:set_permissions(DefaultUser, DefaultVHost,
+    ok = rabbit_auth_backend_internal:set_permissions(DefaultUser,
+                                                      DefaultVHost,
                                                       DefaultConfigurePerm,
                                                       DefaultWritePerm,
                                                       DefaultReadPerm),
@@ -649,6 +699,17 @@ force_event_refresh() ->
 %%---------------------------------------------------------------------------
 %% misc
 
+log_broker_started(Plugins) ->
+    rabbit_misc:with_local_io(
+      fun() ->
+              PluginList = iolist_to_binary([rabbit_misc:format(" * ~s~n", [P])
+                                             || P <- Plugins]),
+              error_logger:info_msg(
+                "Server startup complete; ~b plugins started.~n~s",
+                [length(Plugins), PluginList]),
+              io:format(" completed with ~p plugins.~n", [length(Plugins)])
+      end).
+
 erts_version_check() ->
     FoundVer = erlang:system_info(version),
     case rabbit_misc:version_compare(?ERTS_MINIMUM, FoundVer, lte) of
@@ -660,49 +721,39 @@ erts_version_check() ->
 print_banner() ->
     {ok, Product} = application:get_key(id),
     {ok, Version} = application:get_key(vsn),
-    ProductLen = string:len(Product),
-    io:format("~n"
-              "+---+   +---+~n"
-              "|   |   |   |~n"
-              "|   |   |   |~n"
-              "|   |   |   |~n"
-              "|   +---+   +-------+~n"
-              "|                   |~n"
-              "| ~s  +---+   |~n"
-              "|           |   |   |~n"
-              "| ~s  +---+   |~n"
-              "|                   |~n"
-              "+-------------------+~n"
-              "~s~n~s~n~s~n~n",
-              [Product, string:right([$v|Version], ProductLen),
-               ?PROTOCOL_VERSION,
-               ?COPYRIGHT_MESSAGE, ?INFORMATION_MESSAGE]),
+    io:format("~n              ~s ~s. ~s"
+              "~n  ##  ##      ~s"
+              "~n  ##  ##"
+              "~n  ##########  Logs: ~s"
+              "~n  ######  ##        ~s"
+              "~n  ##########"
+              "~n              Starting broker...",
+              [Product, Version, ?COPYRIGHT_MESSAGE, ?INFORMATION_MESSAGE,
+               log_location(kernel), log_location(sasl)]).
+
+log_banner() ->
     Settings = [{"node",           node()},
-                {"app descriptor", app_location()},
                 {"home dir",       home_dir()},
                 {"config file(s)", config_files()},
                 {"cookie hash",    rabbit_nodes:cookie_hash()},
                 {"log",            log_location(kernel)},
                 {"sasl log",       log_location(sasl)},
-                {"database dir",   rabbit_mnesia:dir()},
-                {"erlang version", erlang:system_info(version)}],
+                {"database dir",   rabbit_mnesia:dir()}],
     DescrLen = 1 + lists:max([length(K) || {K, _V} <- Settings]),
     Format = fun (K, V) ->
-                     io:format("~-" ++ integer_to_list(DescrLen) ++ "s: ~s~n",
-                               [K, V])
+                     rabbit_misc:format(
+                       "~-" ++ integer_to_list(DescrLen) ++ "s: ~s~n", [K, V])
              end,
-    lists:foreach(fun ({"config file(s)" = K, []}) ->
-                          Format(K, "(none)");
-                      ({"config file(s)" = K, [V0 | Vs]}) ->
-                          Format(K, V0), [Format("", V) || V <- Vs];
-                      ({K, V}) ->
-                          Format(K, V)
-                  end, Settings),
-    io:nl().
-
-app_location() ->
-    {ok, Application} = application:get_application(),
-    filename:absname(code:where_is_file(atom_to_list(Application) ++ ".app")).
+    Banner = iolist_to_binary(
+               [case S of
+                    {"config file(s)" = K, []} ->
+                        Format(K, "(none)");
+                    {"config file(s)" = K, [V0 | Vs]} ->
+                        [Format(K, V0) | [Format("", V) || V <- Vs]];
+                    {K, V} ->
+                        Format(K, V)
+                end || S <- Settings]),
+    error_logger:info_msg("~s", [Banner]).
 
 home_dir() ->
     case init:get_argument(home) of
@@ -717,3 +768,10 @@ config_files() ->
                            [File] <- Files];
         error       -> []
     end.
+
+%% Starts file_handle_cache with the rabbit_alarm set/clear callbacks. Not in
+%% fhc as it references rabbit code; can't go in the bootstep directly either.
+start_fhc() ->
+    rabbit_sup:start_restartable_child(
+      file_handle_cache,
+      [fun rabbit_alarm:set_alarm/1, fun rabbit_alarm:clear_alarm/1]).
diff --git a/src/rabbit_access_control.erl b/src/rabbit_access_control.erl
index 75c53511..d54c2a8d 100644
--- a/src/rabbit_access_control.erl
+++ b/src/rabbit_access_control.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_access_control).
@@ -68,12 +68,13 @@ check_vhost_access(User = #user{ username     = Username,
                                  auth_backend = Module }, VHostPath) ->
     check_access(
       fun() ->
-              rabbit_vhost:exists(VHostPath) andalso
-                  Module:check_vhost_access(User, VHostPath)
+              %% TODO this could be an andalso shortcut under >R13A
+              case rabbit_vhost:exists(VHostPath) of
+                  false -> false;
+                  true  -> Module:check_vhost_access(User, VHostPath)
+              end
       end,
-      "~s failed checking vhost access to ~s for ~s: ~p~n",
-      [Module, VHostPath, Username],
-      "access to vhost '~s' refused for user '~s'",
+      Module, "access to vhost '~s' refused for user '~s'",
       [VHostPath, Username]).
 
 check_resource_access(User, R = #resource{kind = exchange, name = <<"">>},
@@ -84,15 +85,14 @@ check_resource_access(User = #user{username = Username, auth_backend = Module},
                       Resource, Permission) ->
     check_access(
       fun() -> Module:check_resource_access(User, Resource, Permission) end,
-      "~s failed checking resource access to ~p for ~s: ~p~n",
-      [Module, Resource, Username],
-      "access to ~s refused for user '~s'",
+      Module, "access to ~s refused for user '~s'",
       [rabbit_misc:rs(Resource), Username]).
 
-check_access(Fun, ErrStr, ErrArgs, RefStr, RefArgs) ->
+check_access(Fun, Module, ErrStr, ErrArgs) ->
     Allow = case Fun() of
-                {error, _} = E ->
-                    rabbit_log:error(ErrStr, ErrArgs ++ [E]),
+                {error, E}  ->
+                    rabbit_log:error(ErrStr ++ " by ~s: ~p~n",
+                                     ErrArgs ++ [Module, E]),
                     false;
                 Else ->
                     Else
@@ -101,5 +101,5 @@ check_access(Fun, ErrStr, ErrArgs, RefStr, RefArgs) ->
         true ->
             ok;
         false ->
-            rabbit_misc:protocol_error(access_refused, RefStr, RefArgs)
+            rabbit_misc:protocol_error(access_refused, ErrStr, ErrArgs)
     end.
diff --git a/src/rabbit_alarm.erl b/src/rabbit_alarm.erl
index d16d90a4..cd1d125b 100644
--- a/src/rabbit_alarm.erl
+++ b/src/rabbit_alarm.erl
@@ -10,30 +10,36 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_alarm).
 
 -behaviour(gen_event).
 
--export([start/0, stop/0, register/2, on_node_up/1, on_node_down/1]).
+-export([start_link/0, start/0, stop/0, register/2, set_alarm/1,
+         clear_alarm/1, get_alarms/0, on_node_up/1, on_node_down/1]).
 
 -export([init/1, handle_call/2, handle_event/2, handle_info/2,
          terminate/2, code_change/3]).
 
 -export([remote_conserve_resources/3]). %% Internal use only
 
--record(alarms, {alertees, alarmed_nodes}).
+-define(SERVER, ?MODULE).
+
+-record(alarms, {alertees, alarmed_nodes, alarms}).
 
 %%----------------------------------------------------------------------------
 
 -ifdef(use_specs).
 
+-spec(start_link/0 :: () -> rabbit_types:ok_pid_or_error()).
 -spec(start/0 :: () -> 'ok').
 -spec(stop/0 :: () -> 'ok').
--spec(register/2 :: (pid(), rabbit_types:mfargs()) -> boolean()).
+-spec(register/2 :: (pid(), rabbit_types:mfargs()) -> [atom()]).
+-spec(set_alarm/1 :: (any()) -> 'ok').
+-spec(clear_alarm/1 :: (any()) -> 'ok').
 -spec(on_node_up/1 :: (node()) -> 'ok').
 -spec(on_node_down/1 :: (node()) -> 'ok').
 
@@ -41,67 +47,90 @@
 
 %%----------------------------------------------------------------------------
 
+start_link() -> %% start the alarm event manager, locally registered as ?SERVER
+    gen_event:start_link({local, ?SERVER}).
+
 start() ->
-    ok = alarm_handler:add_alarm_handler(?MODULE, []),
+    ok = rabbit_sup:start_restartable_child(?MODULE),
+    ok = gen_event:add_handler(?SERVER, ?MODULE, []), %% attach this handler
     {ok, MemoryWatermark} = application:get_env(vm_memory_high_watermark),
-    rabbit_sup:start_restartable_child(vm_memory_monitor, [MemoryWatermark]),
-
+    rabbit_sup:start_restartable_child(
+      vm_memory_monitor, [MemoryWatermark,
+                          fun (Alarm) ->
+                                  background_gc:run(), %% before raising it
+                                  set_alarm(Alarm)
+                          end,
+                          fun clear_alarm/1]),
     {ok, DiskLimit} = application:get_env(disk_free_limit),
     rabbit_sup:start_restartable_child(rabbit_disk_monitor, [DiskLimit]),
     ok.
 
-stop() ->
-    ok = alarm_handler:delete_alarm_handler(?MODULE).
+stop() -> ok.
 
-register(Pid, HighMemMFA) ->
-    gen_event:call(alarm_handler, ?MODULE,
-                   {register, Pid, HighMemMFA},
-                   infinity).
+register(Pid, AlertMFA) ->
+    gen_event:call(?SERVER, ?MODULE, {register, Pid, AlertMFA}, infinity).
 
-on_node_up(Node) -> gen_event:notify(alarm_handler, {node_up, Node}).
+set_alarm(Alarm)   -> gen_event:notify(?SERVER, {set_alarm,   Alarm}).
+clear_alarm(Alarm) -> gen_event:notify(?SERVER, {clear_alarm, Alarm}).
 
-on_node_down(Node) -> gen_event:notify(alarm_handler, {node_down, Node}).
+get_alarms() -> gen_event:call(?SERVER, ?MODULE, get_alarms, infinity).
+
+on_node_up(Node)   -> gen_event:notify(?SERVER, {node_up,   Node}).
+on_node_down(Node) -> gen_event:notify(?SERVER, {node_down, Node}).
 
-%% Can't use alarm_handler:{set,clear}_alarm because that doesn't
-%% permit notifying a remote node.
 remote_conserve_resources(Pid, Source, true) ->
-    gen_event:notify({alarm_handler, node(Pid)},
+    gen_event:notify({?SERVER, node(Pid)},
                      {set_alarm, {{resource_limit, Source, node()}, []}});
 remote_conserve_resources(Pid, Source, false) ->
-    gen_event:notify({alarm_handler, node(Pid)},
+    gen_event:notify({?SERVER, node(Pid)},
                      {clear_alarm, {resource_limit, Source, node()}}).
 
+
 %%----------------------------------------------------------------------------
 
 init([]) ->
     {ok, #alarms{alertees      = dict:new(),
-                 alarmed_nodes = dict:new()}}.
+                 alarmed_nodes = dict:new(),
+                 alarms        = []}}.
 
-handle_call({register, Pid, HighMemMFA}, State) ->
-    {ok, 0 < dict:size(State#alarms.alarmed_nodes),
-     internal_register(Pid, HighMemMFA, State)};
+handle_call({register, Pid, AlertMFA}, State = #alarms{alarmed_nodes = AN}) ->
+    {ok, lists:usort(lists:append([V || {_, V} <- dict:to_list(AN)])),
+     internal_register(Pid, AlertMFA, State)};
+
+handle_call(get_alarms, State = #alarms{alarms = Alarms}) ->
+    {ok, Alarms, State};
 
 handle_call(_Request, State) ->
     {ok, not_understood, State}.
 
-handle_event({set_alarm, {{resource_limit, Source, Node}, []}}, State) ->
-    {ok, maybe_alert(fun dict:append/3, Node, Source, State)};
+handle_event({set_alarm, Alarm}, State = #alarms{alarms = Alarms}) ->
+    case lists:member(Alarm, Alarms) of %% exact duplicate?
+        true  -> {ok, State}; %% already set: ignore
+        false -> UpdatedAlarms = lists:usort([Alarm|Alarms]),
+                 handle_set_alarm(Alarm, State#alarms{alarms = UpdatedAlarms})
+    end;
+
+handle_event({clear_alarm, Alarm}, State = #alarms{alarms = Alarms}) ->
+    case lists:keymember(Alarm, 1, Alarms) of %% matches stored {Alarm, _} entries
+        true  -> handle_clear_alarm(
+                   Alarm, State#alarms{alarms = lists:keydelete(
+                                                  Alarm, 1, Alarms)});
+        false -> {ok, State} %% not set: ignore
 
-handle_event({clear_alarm, {resource_limit, Source, Node}}, State) ->
-    {ok, maybe_alert(fun dict_unappend/3, Node, Source, State)};
+    end;
 
 handle_event({node_up, Node}, State) ->
     %% Must do this via notify and not call to avoid possible deadlock.
     ok = gen_event:notify(
-           {alarm_handler, Node},
+           {?SERVER, Node},
            {register, self(), {?MODULE, remote_conserve_resources, []}}),
     {ok, State};
 
 handle_event({node_down, Node}, State) ->
-    {ok, maybe_alert(fun dict_unappend_all/3, Node, [], State)};
+    {ok, maybe_alert(fun dict_unappend_all/3, Node, [], false, State)};
 
-handle_event({register, Pid, HighMemMFA}, State) ->
-    {ok, internal_register(Pid, HighMemMFA, State)};
+handle_event({register, Pid, AlertMFA}, State) ->
+    {ok, internal_register(Pid, AlertMFA, State)};
 
 handle_event(_Event, State) ->
     {ok, State}.
@@ -121,45 +150,36 @@ code_change(_OldVsn, State, _Extra) ->
 
 %%----------------------------------------------------------------------------
 
+dict_append(Key, Val, Dict) -> %% add Val to Key's sorted, duplicate-free list
+    L = case dict:find(Key, Dict) of
+            {ok, V} -> V;
+            error   -> []
+        end,
+    dict:store(Key, lists:usort([Val|L]), Dict).
+
 dict_unappend_all(Key, _Val, Dict) ->
     dict:erase(Key, Dict).
 
 dict_unappend(Key, Val, Dict) ->
-    case lists:delete(Val, dict:fetch(Key, Dict)) of
+    L = case dict:find(Key, Dict) of
+            {ok, V} -> V;
+            error   -> [] %% unknown Key: treat as empty instead of crashing
+        end,
+
+    case lists:delete(Val, L) of
         [] -> dict:erase(Key, Dict);
         X  -> dict:store(Key, X, Dict)
     end.
 
-count_dict_values(Val, Dict) ->
-    dict:fold(fun (_Node, List, Count) ->
-                  Count + case lists:member(Val, List) of
-                              true  -> 1;
-                              false -> 0
-                          end
-              end, 0, Dict).
-
-maybe_alert(UpdateFun, Node, Source,
+maybe_alert(UpdateFun, Node, Source, Alert,
             State = #alarms{alarmed_nodes = AN,
                             alertees      = Alertees}) ->
     AN1 = UpdateFun(Node, Source, AN),
-    BeforeSz = count_dict_values(Source, AN),
-    AfterSz  = count_dict_values(Source, AN1),
-
-    %% If we have changed our alarm state, inform the remotes.
-    IsLocal = Node =:= node(),
-    if IsLocal andalso BeforeSz < AfterSz ->
-           ok = alert_remote(true,  Alertees, Source);
-       IsLocal andalso BeforeSz > AfterSz ->
-           ok = alert_remote(false, Alertees, Source);
-       true                               ->
-           ok
-    end,
-    %% If the overall alarm state has changed, inform the locals.
-    case {dict:size(AN), dict:size(AN1)} of
-        {0, 1} -> ok = alert_local(true,  Alertees, Source);
-        {1, 0} -> ok = alert_local(false, Alertees, Source);
-        {_, _} -> ok
+    case node() of %% alert_remote only when Node is this node
+        Node -> ok = alert_remote(Alert,  Alertees, Source);
+        _    -> ok
     end,
+    ok = alert_local(Alert, Alertees, Source), %% runs unconditionally
     State#alarms{alarmed_nodes = AN1}.
 
 alert_local(Alert, Alertees, Source) ->
@@ -177,12 +197,42 @@ alert(Alertees, Source, Alert, NodeComparator) ->
                       end
               end, ok, Alertees).
 
-internal_register(Pid, {M, F, A} = HighMemMFA,
+internal_register(Pid, {M, F, A} = AlertMFA,
                   State = #alarms{alertees = Alertees}) ->
     _MRef = erlang:monitor(process, Pid),
     case dict:find(node(), State#alarms.alarmed_nodes) of
         {ok, Sources} -> [apply(M, F, A ++ [Pid, R, true]) || R <- Sources];
         error          -> ok
     end,
-    NewAlertees = dict:store(Pid, HighMemMFA, Alertees),
+    NewAlertees = dict:store(Pid, AlertMFA, Alertees),
     State#alarms{alertees = NewAlertees}.
+
+handle_set_alarm({{resource_limit, Source, Node}, []}, State) ->
+    rabbit_log:warning(
+      "~s resource limit alarm set on node ~p.~n~n"
+      "**********************************************************~n"
+      "*** Publishers will be blocked until this alarm clears ***~n"
+      "**********************************************************~n",
+      [Source, Node]),
+    {ok, maybe_alert(fun dict_append/3, Node, Source, true, State)};
+handle_set_alarm({file_descriptor_limit, []}, State) ->
+    rabbit_log:warning(
+      "file descriptor limit alarm set.~n~n"
+      "********************************************************************~n"
+      "*** New connections will not be accepted until this alarm clears ***~n"
+      "********************************************************************~n"),
+    {ok, State}; %% logged only; maybe_alert is not called
+handle_set_alarm(Alarm, State) -> %% any other alarm: just log it
+    rabbit_log:warning("alarm '~p' set~n", [Alarm]),
+    {ok, State}.
+
+handle_clear_alarm({resource_limit, Source, Node}, State) ->
+    rabbit_log:warning("~s resource limit alarm cleared on node ~p~n",
+                       [Source, Node]),
+    {ok, maybe_alert(fun dict_unappend/3, Node, Source, false, State)};
+handle_clear_alarm(file_descriptor_limit, State) ->
+    rabbit_log:warning("file descriptor limit alarm cleared~n"),
+    {ok, State}; %% logged only; maybe_alert is not called
+handle_clear_alarm(Alarm, State) -> %% any other alarm: just log it
+    rabbit_log:warning("alarm '~p' cleared~n", [Alarm]),
+    {ok, State}.
diff --git a/src/rabbit_amqqueue.erl b/src/rabbit_amqqueue.erl
index c1673504..a1efaf65 100644
--- a/src/rabbit_amqqueue.erl
+++ b/src/rabbit_amqqueue.erl
@@ -10,29 +10,32 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_amqqueue).
 
--export([start/0, stop/0, declare/5, delete_immediately/1, delete/3, purge/1]).
+-export([recover/0, stop/0, start/1, declare/5,
+         delete_immediately/1, delete/3, purge/1, forget_all_durable/1]).
 -export([pseudo_queue/2]).
--export([lookup/1, with/2, with_or_die/2, assert_equivalence/5,
+-export([lookup/1, not_found_or_absent/1, with/2, with/3, with_or_die/2,
+         assert_equivalence/5,
          check_exclusive_access/2, with_exclusive_access_or_die/3,
          stat/1, deliver/2, deliver_flow/2, requeue/3, ack/3, reject/4]).
 -export([list/0, list/1, info_keys/0, info/1, info/2, info_all/1, info_all/2]).
--export([force_event_refresh/0]).
+-export([force_event_refresh/0, wake_up/1]).
 -export([consumers/1, consumers_all/1, consumer_info_keys/0]).
--export([basic_get/3, basic_consume/7, basic_cancel/4]).
--export([notify_sent/2, notify_sent_queue_down/1, unblock/2, flush_all/2]).
--export([notify_down_all/2, limit_all/3]).
+-export([basic_get/4, basic_consume/9, basic_cancel/4]).
+-export([notify_sent/2, notify_sent_queue_down/1, resume/2, flush_all/2]).
+-export([notify_down_all/2, activate_limit_all/2, credit/5]).
 -export([on_node_down/1]).
--export([store_queue/1]).
-
+-export([update/2, store_queue/1, policy_changed/2]).
+-export([start_mirroring/1, stop_mirroring/1, sync_mirrors/1,
+         cancel_sync_mirrors/1]).
 
 %% internal
--export([internal_declare/2, internal_delete/2, run_backing_queue/3,
+-export([internal_declare/2, internal_delete/1, run_backing_queue/3,
          set_ram_duration_target/2, set_maximum_since_use/2]).
 
 -include("rabbit.hrl").
@@ -58,24 +61,33 @@
 -type(msg_id() :: non_neg_integer()).
 -type(ok_or_errors() ::
         'ok' | {'error', [{'error' | 'exit' | 'throw', any()}]}).
--type(routing_result() :: 'routed' | 'unroutable' | 'not_delivered').
--type(queue_or_not_found() :: rabbit_types:amqqueue() | 'not_found').
-
--spec(start/0 :: () -> [name()]).
+-type(routing_result() :: 'routed' | 'unroutable').
+-type(queue_or_absent() :: rabbit_types:amqqueue() |
+                           {'absent', rabbit_types:amqqueue()}).
+-type(not_found_or_absent() :: 'not_found' |
+                               {'absent', rabbit_types:amqqueue()}).
+-spec(recover/0 :: () -> [rabbit_types:amqqueue()]).
 -spec(stop/0 :: () -> 'ok').
+-spec(start/1 :: ([rabbit_types:amqqueue()]) -> 'ok').
 -spec(declare/5 ::
         (name(), boolean(), boolean(),
          rabbit_framing:amqp_table(), rabbit_types:maybe(pid()))
-        -> {'new' | 'existing', rabbit_types:amqqueue()} |
+        -> {'new' | 'existing' | 'absent', rabbit_types:amqqueue()} |
            rabbit_types:channel_exit()).
 -spec(internal_declare/2 ::
         (rabbit_types:amqqueue(), boolean())
-        -> queue_or_not_found() | rabbit_misc:thunk(queue_or_not_found())).
+        -> queue_or_absent() | rabbit_misc:thunk(queue_or_absent())).
+-spec(update/2 ::
+        (name(),
+         fun((rabbit_types:amqqueue()) -> rabbit_types:amqqueue())) -> 'ok').
 -spec(lookup/1 ::
         (name()) -> rabbit_types:ok(rabbit_types:amqqueue()) |
                     rabbit_types:error('not_found');
         ([name()]) -> [rabbit_types:amqqueue()]).
--spec(with/2 :: (name(), qfun(A)) -> A | rabbit_types:error('not_found')).
+-spec(not_found_or_absent/1 :: (name()) -> not_found_or_absent()).
+-spec(with/2 :: (name(), qfun(A)) ->
+                     A | rabbit_types:error(not_found_or_absent())).
+-spec(with/3 :: (name(), qfun(A), fun((not_found_or_absent()) -> B)) -> A | B).
 -spec(with_or_die/2 ::
         (name(), qfun(A)) -> A | rabbit_types:channel_exit()).
 -spec(assert_equivalence/5 ::
@@ -99,6 +111,7 @@
 -spec(info_all/2 :: (rabbit_types:vhost(), rabbit_types:info_keys())
                     -> [rabbit_types:infos()]).
 -spec(force_event_refresh/0 :: () -> 'ok').
+-spec(wake_up/1 :: (rabbit_types:amqqueue()) -> 'ok').
 -spec(consumers/1 ::
         (rabbit_types:amqqueue())
         -> [{pid(), rabbit_types:ctag(), boolean()}]).
@@ -122,6 +135,7 @@
            rabbit_types:error('in_use') |
            rabbit_types:error('not_empty')).
 -spec(purge/1 :: (rabbit_types:amqqueue()) -> qlen()).
+-spec(forget_all_durable/1 :: (node()) -> 'ok').
 -spec(deliver/2 :: ([rabbit_types:amqqueue()], rabbit_types:delivery()) ->
                         {routing_result(), qpids()}).
 -spec(deliver_flow/2 :: ([rabbit_types:amqqueue()], rabbit_types:delivery()) ->
@@ -130,25 +144,26 @@
 -spec(ack/3 :: (pid(), [msg_id()], pid()) -> 'ok').
 -spec(reject/4 :: (pid(), [msg_id()], boolean(), pid()) -> 'ok').
 -spec(notify_down_all/2 :: (qpids(), pid()) -> ok_or_errors()).
--spec(limit_all/3 :: (qpids(), pid(), rabbit_limiter:token()) ->
-                          ok_or_errors()).
--spec(basic_get/3 :: (rabbit_types:amqqueue(), pid(), boolean()) ->
+-spec(activate_limit_all/2 :: (qpids(), pid()) -> ok_or_errors()).
+-spec(basic_get/4 :: (rabbit_types:amqqueue(), pid(), boolean(), pid()) ->
                           {'ok', non_neg_integer(), qmsg()} | 'empty').
--spec(basic_consume/7 ::
-        (rabbit_types:amqqueue(), boolean(), pid(),
-         rabbit_limiter:token(), rabbit_types:ctag(), boolean(), any())
+-spec(credit/5 :: (rabbit_types:amqqueue(), pid(), rabbit_types:ctag(),
+                   non_neg_integer(), boolean()) -> 'ok').
+-spec(basic_consume/9 ::
+        (rabbit_types:amqqueue(), boolean(), pid(), pid(), boolean(),
+         rabbit_types:ctag(), boolean(), {non_neg_integer(), boolean()} | 'none', any())
         -> rabbit_types:ok_or_error('exclusive_consume_unavailable')).
 -spec(basic_cancel/4 ::
         (rabbit_types:amqqueue(), pid(), rabbit_types:ctag(), any()) -> 'ok').
 -spec(notify_sent/2 :: (pid(), pid()) -> 'ok').
 -spec(notify_sent_queue_down/1 :: (pid()) -> 'ok').
--spec(unblock/2 :: (pid(), pid()) -> 'ok').
+-spec(resume/2 :: (pid(), pid()) -> 'ok').
 -spec(flush_all/2 :: (qpids(), pid()) -> 'ok').
--spec(internal_delete/2 ::
-        (name(), pid()) -> rabbit_types:ok_or_error('not_found') |
-                           rabbit_types:connection_exit() |
-                           fun (() -> rabbit_types:ok_or_error('not_found') |
-                                      rabbit_types:connection_exit())).
+-spec(internal_delete/1 ::
+        (name()) -> rabbit_types:ok_or_error('not_found') |
+                    rabbit_types:connection_exit() |
+                    fun (() -> rabbit_types:ok_or_error('not_found') |
+                               rabbit_types:connection_exit())).
 -spec(run_backing_queue/3 ::
         (pid(), atom(),
          (fun ((atom(), A) -> {[rabbit_types:msg_id()], A}))) -> 'ok').
@@ -157,6 +172,12 @@
 -spec(on_node_down/1 :: (node()) -> 'ok').
 -spec(pseudo_queue/2 :: (name(), pid()) -> rabbit_types:amqqueue()).
 -spec(store_queue/1 :: (rabbit_types:amqqueue()) -> 'ok').
+-spec(policy_changed/2 ::
+        (rabbit_types:amqqueue(), rabbit_types:amqqueue()) -> 'ok').
+-spec(start_mirroring/1 :: (pid()) -> 'ok').
+-spec(stop_mirroring/1 :: (pid()) -> 'ok').
+-spec(sync_mirrors/1 :: (pid()) -> 'ok' | rabbit_types:error('not_mirrored')).
+-spec(cancel_sync_mirrors/1 :: (pid()) -> 'ok' | {'ok', 'not_syncing'}).
 
 -endif.
 
@@ -165,7 +186,10 @@
 -define(CONSUMER_INFO_KEYS,
         [queue_name, channel_pid, consumer_tag, ack_required]).
 
-start() ->
+recover() ->
+    %% Clear out remnants of old incarnation, in case we restarted
+    %% faster than other nodes handled DOWN messages from us.
+    on_node_down(node()),
     DurableQueues = find_durable_queues(),
     {ok, BQ} = application:get_env(rabbit, backing_queue_module),
     ok = BQ:start([QName || #amqqueue{name = QName} <- DurableQueues]),
@@ -182,36 +206,45 @@ stop() ->
     {ok, BQ} = application:get_env(rabbit, backing_queue_module),
     ok = BQ:stop().
 
+start(Qs) ->
+    %% At this point all recovered queues and their bindings are
+    %% visible to routing, so now it is safe for them to complete
+    %% their initialisation (which may involve interacting with other
+    %% queues).
+    [Pid ! {self(), go} || #amqqueue{pid = Pid} <- Qs],
+    ok.
+
 find_durable_queues() ->
     Node = node(),
     %% TODO: use dirty ops instead
     rabbit_misc:execute_mnesia_transaction(
       fun () ->
-              qlc:e(qlc:q([Q || Q = #amqqueue{pid = Pid}
+              qlc:e(qlc:q([Q || Q = #amqqueue{name = Name,
+                                              pid  = Pid}
                                     <- mnesia:table(rabbit_durable_queue),
+                                mnesia:read(rabbit_queue, Name, read) =:= [],
                                 node(Pid) == Node]))
       end).
 
 recover_durable_queues(DurableQueues) ->
     Qs = [start_queue_process(node(), Q) || Q <- DurableQueues],
-    [QName || Q = #amqqueue{name = QName, pid = Pid} <- Qs,
-              gen_server2:call(Pid, {init, true}, infinity) == {new, Q}].
+    [Q || Q = #amqqueue{pid = Pid} <- Qs,
+          gen_server2:call(Pid, {init, self()}, infinity) == {new, Q}].
 
 declare(QueueName, Durable, AutoDelete, Args, Owner) ->
     ok = check_declare_arguments(QueueName, Args),
-    {Node, MNodes} = determine_queue_nodes(Args),
-    Q = start_queue_process(Node, #amqqueue{name            = QueueName,
-                                            durable         = Durable,
-                                            auto_delete     = AutoDelete,
-                                            arguments       = Args,
-                                            exclusive_owner = Owner,
-                                            pid             = none,
-                                            slave_pids      = [],
-                                            mirror_nodes    = MNodes}),
-    case gen_server2:call(Q#amqqueue.pid, {init, false}, infinity) of
-        not_found -> rabbit_misc:not_found(QueueName);
-        Q1        -> Q1
-    end.
+    Q0 = rabbit_policy:set(#amqqueue{name            = QueueName,
+                                     durable         = Durable,
+                                     auto_delete     = AutoDelete,
+                                     arguments       = Args,
+                                     exclusive_owner = Owner,
+                                     pid             = none,
+                                     slave_pids      = [],
+                                     sync_slave_pids = [],
+                                     gm_pids         = []}),
+    {Node, _MNodes} = rabbit_mirror_queue_misc:suggested_queue_nodes(Q0),
+    Q1 = start_queue_process(Node, Q0),
+    gen_server2:call(Q1#amqqueue.pid, {init, new}, infinity).
 
 internal_declare(Q, true) ->
     rabbit_misc:execute_mnesia_tx_with_tail(
@@ -221,45 +254,51 @@ internal_declare(Q = #amqqueue{name = QueueName}, false) ->
       fun () ->
               case mnesia:wread({rabbit_queue, QueueName}) of
                   [] ->
-                      case mnesia:read({rabbit_durable_queue, QueueName}) of
-                          []  -> ok = store_queue(Q),
-                                 B = add_default_binding(Q),
-                                 fun () -> B(), Q end;
-                          %% Q exists on stopped node
-                          [_] -> rabbit_misc:const(not_found)
+                      case not_found_or_absent(QueueName) of
+                          not_found        -> Q1 = rabbit_policy:set(Q),
+                                              ok = store_queue(Q1),
+                                              B = add_default_binding(Q1),
+                                              fun () -> B(), Q1 end;
+                          {absent, _Q} = R -> rabbit_misc:const(R)
                       end;
                   [ExistingQ = #amqqueue{pid = QPid}] ->
                       case rabbit_misc:is_process_alive(QPid) of
                           true  -> rabbit_misc:const(ExistingQ);
-                          false -> TailFun = internal_delete(QueueName, QPid),
+                          false -> TailFun = internal_delete(QueueName),
                                    fun () -> TailFun(), ExistingQ end
                       end
               end
       end).
 
+update(Name, Fun) -> %% rewrite queue Name with Fun(Q); no-op if it is absent
+    case mnesia:wread({rabbit_queue, Name}) of
+        [Q = #amqqueue{durable = Durable}] ->
+            Q1 = Fun(Q),
+            ok = mnesia:write(rabbit_queue, Q1, write),
+            case Durable of %% durability of the record as read, not of Q1
+                true -> ok = mnesia:write(rabbit_durable_queue, Q1, write);
+                _    -> ok
+            end;
+        [] ->
+            ok
+    end.
+
 store_queue(Q = #amqqueue{durable = true}) ->
-    ok = mnesia:write(rabbit_durable_queue, Q#amqqueue{slave_pids = []}, write),
+    ok = mnesia:write(rabbit_durable_queue,
+                      Q#amqqueue{slave_pids      = [],
+                                 sync_slave_pids = [],
+                                 gm_pids         = []}, write),
     ok = mnesia:write(rabbit_queue, Q, write),
     ok;
 store_queue(Q = #amqqueue{durable = false}) ->
     ok = mnesia:write(rabbit_queue, Q, write),
     ok.
 
-determine_queue_nodes(Args) ->
-    Policy = rabbit_misc:table_lookup(Args, <<"x-ha-policy">>),
-    PolicyParams = rabbit_misc:table_lookup(Args, <<"x-ha-policy-params">>),
-    case {Policy, PolicyParams} of
-        {{_Type, <<"nodes">>}, {array, Nodes}} ->
-            case [list_to_atom(binary_to_list(Node)) ||
-                     {longstr, Node} <- Nodes] of
-                [Node]         -> {Node,   undefined};
-                [First | Rest] -> {First,  [First | Rest]}
-            end;
-        {{_Type, <<"all">>}, _} ->
-            {node(), all};
-        _ ->
-            {node(), undefined}
-    end.
+policy_changed(Q1, Q2) ->
+    rabbit_mirror_queue_misc:update_mirrors(Q1, Q2),
+    %% Make sure we emit a stats event even if nothing
+    %% mirroring-related has changed - the policy may have changed anyway.
+    wake_up(Q1).
 
 start_queue_process(Node, Q) ->
     {ok, Pid} = rabbit_amqqueue_sup:start_child(Node, [Q]),
@@ -273,6 +312,8 @@ add_default_binding(#amqqueue{name = QueueName}) ->
                                 key         = RoutingKey,
                                 args        = []}).
 
+lookup([])     -> [];                             %% optimisation
+lookup([Name]) -> ets:lookup(rabbit_queue, Name); %% optimisation
 lookup(Names) when is_list(Names) ->
     %% Normally we'd call mnesia:dirty_read/1 here, but that is quite
     %% expensive for reasons explained in rabbit_misc:dirty_read/1.
@@ -280,21 +321,47 @@ lookup(Names) when is_list(Names) ->
 lookup(Name) ->
     rabbit_misc:dirty_read({rabbit_queue, Name}).
 
+not_found_or_absent(Name) ->
+    %% NB: we assume that the caller has already performed a lookup on
+    %% rabbit_queue and not found anything
+    case mnesia:read({rabbit_durable_queue, Name}) of
+        []  -> not_found;
+        [Q] -> {absent, Q} %% Q exists on stopped node
+    end.
+
+not_found_or_absent_dirty(Name) ->
+    %% We should read from both tables inside a tx, to get a
+    %% consistent view. But the chances of an inconsistency are small,
+    %% and only affect the error kind.
+    case rabbit_misc:dirty_read({rabbit_durable_queue, Name}) of
+        {error, not_found} -> not_found;
+        {ok, Q}            -> {absent, Q}
+    end.
+
 with(Name, F, E) ->
     case lookup(Name) of
-        {ok, Q = #amqqueue{slave_pids = []}} ->
-            rabbit_misc:with_exit_handler(E, fun () -> F(Q) end);
-        {ok, Q} ->
-            E1 = fun () -> timer:sleep(25), with(Name, F, E) end,
-            rabbit_misc:with_exit_handler(E1, fun () -> F(Q) end);
+        {ok, Q = #amqqueue{pid = QPid}} ->
+            %% We check is_process_alive(QPid) in case we receive a
+            %% nodedown (for example) in F() that has nothing to do
+            %% with the QPid.
+            rabbit_misc:with_exit_handler(
+              fun () ->
+                      case rabbit_misc:is_process_alive(QPid) of
+                          true  -> E(not_found_or_absent_dirty(Name));
+                          false -> timer:sleep(25), %% QPid is dead: pause,
+                                   with(Name, F, E) %% then redo the lookup
+                      end
+              end, fun () -> F(Q) end);
         {error, not_found} ->
-            E()
+            E(not_found_or_absent_dirty(Name))
     end.
 
-with(Name, F) ->
-    with(Name, F, fun () -> {error, not_found} end).
+with(Name, F) -> with(Name, F, fun (E) -> {error, E} end).
+
 with_or_die(Name, F) ->
-    with(Name, F, fun () -> rabbit_misc:not_found(Name) end).
+    with(Name, F, fun (not_found)   -> rabbit_misc:not_found(Name);
+                      ({absent, Q}) -> rabbit_misc:absent(Q)
+                  end).
 
 assert_equivalence(#amqqueue{durable     = Durable,
                              auto_delete = AutoDelete} = Q,
@@ -325,16 +392,10 @@ with_exclusive_access_or_die(Name, ReaderPid, F) ->
 
 assert_args_equivalence(#amqqueue{name = QueueName, arguments = Args},
                         RequiredArgs) ->
-    rabbit_misc:assert_args_equivalence(
-      Args, RequiredArgs, QueueName,
-      [<<"x-expires">>, <<"x-message-ttl">>, <<"x-ha-policy">>]).
+    rabbit_misc:assert_args_equivalence(Args, RequiredArgs, QueueName,
+                                        [Key || {Key, _Fun} <- args()]).
 
 check_declare_arguments(QueueName, Args) ->
-    Checks = [{<<"x-expires">>,                 fun check_positive_int_arg/2},
-              {<<"x-message-ttl">>,             fun check_non_neg_int_arg/2},
-              {<<"x-ha-policy">>,               fun check_ha_policy_arg/2},
-              {<<"x-dead-letter-exchange">>,    fun check_string_arg/2},
-              {<<"x-dead-letter-routing-key">>, fun check_dlxrk_arg/2}],
     [case rabbit_misc:table_lookup(Args, Key) of
          undefined -> ok;
          TypeVal   -> case Fun(TypeVal, Args) of
@@ -345,13 +406,14 @@ check_declare_arguments(QueueName, Args) ->
                                               [Key, rabbit_misc:rs(QueueName),
                                                Error])
                       end
-     end || {Key, Fun} <- Checks],
+     end || {Key, Fun} <- args()],
     ok.
 
-check_string_arg({longstr, _}, _Args) ->
-    ok;
-check_string_arg({Type, _}, _) ->
-    {error, {unacceptable_type, Type}}.
+args() ->
+    [{<<"x-expires">>,                 fun check_expires_arg/2},
+     {<<"x-message-ttl">>,             fun check_message_ttl_arg/2},
+     {<<"x-dead-letter-routing-key">>, fun check_dlxrk_arg/2},
+     {<<"x-max-length">>,              fun check_max_length_arg/2}].
 
 check_int_arg({Type, _}, _) ->
     case lists:member(Type, ?INTEGER_ARG_TYPES) of
@@ -359,53 +421,35 @@ check_int_arg({Type, _}, _) ->
         false -> {error, {unacceptable_type, Type}}
     end.
 
-check_positive_int_arg({Type, Val}, Args) ->
+check_max_length_arg({Type, Val}, Args) ->
     case check_int_arg({Type, Val}, Args) of
-        ok when Val > 0 -> ok;
-        ok              -> {error, {value_zero_or_less, Val}};
-        Error           -> Error
+        ok when Val >= 0 -> ok;
+        ok               -> {error, {value_negative, Val}};
+        Error            -> Error
     end.
 
-check_non_neg_int_arg({Type, Val}, Args) ->
+check_expires_arg({Type, Val}, Args) ->
     case check_int_arg({Type, Val}, Args) of
-        ok when Val >= 0 -> ok;
-        ok               -> {error, {value_less_than_zero, Val}};
+        ok when Val == 0 -> {error, {value_zero, Val}};
+        ok               -> rabbit_misc:check_expiry(Val);
         Error            -> Error
     end.
 
+check_message_ttl_arg({Type, Val}, Args) ->
+    case check_int_arg({Type, Val}, Args) of
+        ok    -> rabbit_misc:check_expiry(Val);
+        Error -> Error
+    end.
+
 check_dlxrk_arg({longstr, _}, Args) ->
     case rabbit_misc:table_lookup(Args, <<"x-dead-letter-exchange">>) of
         undefined -> {error, routing_key_but_no_dlx_defined};
         _         -> ok
     end;
-check_dlxrk_arg({Type, _}, _Args) ->
+check_dlxrk_arg({Type,    _}, _Args) ->
     {error, {unacceptable_type, Type}}.
 
-check_ha_policy_arg({longstr, <<"all">>}, _Args) ->
-    ok;
-check_ha_policy_arg({longstr, <<"nodes">>}, Args) ->
-    case rabbit_misc:table_lookup(Args, <<"x-ha-policy-params">>) of
-        undefined ->
-            {error, {require, 'x-ha-policy-params'}};
-        {array, []} ->
-            {error, {require_non_empty_list_of_nodes_for_ha}};
-        {array, Ary} ->
-            case lists:all(fun ({longstr, _Node}) -> true;
-                               (_               ) -> false
-                           end, Ary) of
-                true  -> ok;
-                false -> {error, {require_node_list_as_longstrs_for_ha, Ary}}
-            end;
-        {Type, _} ->
-            {error, {ha_nodes_policy_params_not_array_of_longstr, Type}}
-    end;
-check_ha_policy_arg({longstr, Policy}, _Args) ->
-    {error, {invalid_ha_policy, Policy}};
-check_ha_policy_arg({Type, _}, _Args) ->
-    {error, {unacceptable_type, Type}}.
-
-list() ->
-    mnesia:dirty_match_object(rabbit_queue, #amqqueue{_ = '_'}).
+list() -> mnesia:dirty_match_object(rabbit_queue, #amqqueue{_ = '_'}).
 
 list(VHostPath) ->
     mnesia:dirty_match_object(
@@ -416,11 +460,10 @@ info_keys() -> rabbit_amqqueue_process:info_keys().
 
 map(VHostPath, F) -> rabbit_misc:filter_exit_map(F, list(VHostPath)).
 
-info(#amqqueue{ pid = QPid }) ->
-    delegate_call(QPid, info).
+info(#amqqueue{ pid = QPid }) -> delegate:call(QPid, info).
 
 info(#amqqueue{ pid = QPid }, Items) ->
-    case delegate_call(QPid, {info, Items}) of
+    case delegate:call(QPid, {info, Items}) of
         {ok, Res}      -> Res;
         {error, Error} -> throw(Error)
     end.
@@ -434,8 +477,7 @@ info_all(VHostPath, Items) -> map(VHostPath, fun (Q) -> info(Q, Items) end).
 %% the first place since a node failed). Therefore we keep poking at
 %% the list of queues until we were able to talk to a live process or
 %% the queue no longer exists.
-force_event_refresh() ->
-    force_event_refresh([Q#amqqueue.name || Q <- list()]).
+force_event_refresh() -> force_event_refresh([Q#amqqueue.name || Q <- list()]).
 
 force_event_refresh(QNames) ->
     Qs = [Q || Q <- list(), lists:member(Q#amqqueue.name, QNames)],
@@ -450,8 +492,9 @@ force_event_refresh(QNames) ->
               force_event_refresh(Failed)
     end.
 
-consumers(#amqqueue{ pid = QPid }) ->
-    delegate_call(QPid, consumers).
+wake_up(#amqqueue{pid = QPid}) -> gen_server2:cast(QPid, wake_up).
+
+consumers(#amqqueue{ pid = QPid }) -> delegate:call(QPid, consumers).
 
 consumer_info_keys() -> ?CONSUMER_INFO_KEYS.
 
@@ -465,50 +508,54 @@ consumers_all(VHostPath) ->
                          {ChPid, ConsumerTag, AckRequired} <- consumers(Q)]
           end)).
 
-stat(#amqqueue{pid = QPid}) ->
-    delegate_call(QPid, stat).
+stat(#amqqueue{pid = QPid}) -> delegate:call(QPid, stat).
 
 delete_immediately(QPids) ->
     [gen_server2:cast(QPid, delete_immediately) || QPid <- QPids],
     ok.
 
 delete(#amqqueue{ pid = QPid }, IfUnused, IfEmpty) ->
-    delegate_call(QPid, {delete, IfUnused, IfEmpty}).
+    delegate:call(QPid, {delete, IfUnused, IfEmpty}).
 
-purge(#amqqueue{ pid = QPid }) -> delegate_call(QPid, purge).
+purge(#amqqueue{ pid = QPid }) -> delegate:call(QPid, purge).
 
 deliver(Qs, Delivery) -> deliver(Qs, Delivery, noflow).
 
 deliver_flow(Qs, Delivery) -> deliver(Qs, Delivery, flow).
 
-requeue(QPid, MsgIds, ChPid) ->
-    delegate_call(QPid, {requeue, MsgIds, ChPid}).
+requeue(QPid, MsgIds, ChPid) -> delegate:call(QPid, {requeue, MsgIds, ChPid}).
 
-ack(QPid, MsgIds, ChPid) ->
-    delegate_cast(QPid, {ack, MsgIds, ChPid}).
+ack(QPid, MsgIds, ChPid) -> delegate:cast(QPid, {ack, MsgIds, ChPid}).
 
 reject(QPid, MsgIds, Requeue, ChPid) ->
-    delegate_cast(QPid, {reject, MsgIds, Requeue, ChPid}).
+    delegate:cast(QPid, {reject, MsgIds, Requeue, ChPid}).
 
 notify_down_all(QPids, ChPid) ->
-    safe_delegate_call_ok(
-      fun (QPid) -> gen_server2:call(QPid, {notify_down, ChPid}, infinity) end,
-      QPids).
+    {_, Bads} = delegate:call(QPids, {notify_down, ChPid}),
+    case lists:filter(
+           fun ({_Pid, {exit, {R, _}, _}}) -> rabbit_misc:is_abnormal_exit(R);
+               ({_Pid, _})                 -> false
+           end, Bads) of
+        []    -> ok;
+        Bads1 -> {error, Bads1}
+    end.
 
-limit_all(QPids, ChPid, Limiter) ->
-    delegate:invoke_no_result(
-      QPids, fun (QPid) -> gen_server2:cast(QPid, {limit, ChPid, Limiter}) end).
+activate_limit_all(QPids, ChPid) ->
+    delegate:cast(QPids, {activate_limit, ChPid}).
 
-basic_get(#amqqueue{pid = QPid}, ChPid, NoAck) ->
-    delegate_call(QPid, {basic_get, ChPid, NoAck}).
+credit(#amqqueue{pid = QPid}, ChPid, CTag, Credit, Drain) ->
+    delegate:cast(QPid, {credit, ChPid, CTag, Credit, Drain}).
 
-basic_consume(#amqqueue{pid = QPid}, NoAck, ChPid, Limiter,
-              ConsumerTag, ExclusiveConsume, OkMsg) ->
-    delegate_call(QPid, {basic_consume, NoAck, ChPid,
-                         Limiter, ConsumerTag, ExclusiveConsume, OkMsg}).
+basic_get(#amqqueue{pid = QPid}, ChPid, NoAck, LimiterPid) ->
+    delegate:call(QPid, {basic_get, ChPid, NoAck, LimiterPid}).
+
+basic_consume(#amqqueue{pid = QPid}, NoAck, ChPid, LimiterPid, LimiterActive,
+              ConsumerTag, ExclusiveConsume, CreditArgs, OkMsg) ->
+    delegate:call(QPid, {basic_consume, NoAck, ChPid, LimiterPid, LimiterActive,
+                         ConsumerTag, ExclusiveConsume, CreditArgs, OkMsg}).
 
 basic_cancel(#amqqueue{pid = QPid}, ChPid, ConsumerTag, OkMsg) ->
-    ok = delegate_call(QPid, {basic_cancel, ChPid, ConsumerTag, OkMsg}).
+    delegate:call(QPid, {basic_cancel, ChPid, ConsumerTag, OkMsg}).
 
 notify_sent(QPid, ChPid) ->
     Key = {consumer_credit_to, QPid},
@@ -527,36 +574,58 @@ notify_sent_queue_down(QPid) ->
     erase({consumer_credit_to, QPid}),
     ok.
 
-unblock(QPid, ChPid) ->
-    delegate_cast(QPid, {unblock, ChPid}).
+resume(QPid, ChPid) -> delegate:cast(QPid, {resume, ChPid}).
 
-flush_all(QPids, ChPid) ->
-    delegate:invoke_no_result(
-      QPids, fun (QPid) -> gen_server2:cast(QPid, {flush, ChPid}) end).
+flush_all(QPids, ChPid) -> delegate:cast(QPids, {flush, ChPid}).
 
 internal_delete1(QueueName) ->
     ok = mnesia:delete({rabbit_queue, QueueName}),
-    ok = mnesia:delete({rabbit_durable_queue, QueueName}),
+    %% this 'guarded' delete prevents unnecessary writes to the mnesia
+    %% disk log
+    case mnesia:wread({rabbit_durable_queue, QueueName}) of
+        []  -> ok;
+        [_] -> ok = mnesia:delete({rabbit_durable_queue, QueueName})
+    end,
     %% we want to execute some things, as decided by rabbit_exchange,
     %% after the transaction.
     rabbit_binding:remove_for_destination(QueueName).
 
-internal_delete(QueueName, QPid) ->
+internal_delete(QueueName) ->
     rabbit_misc:execute_mnesia_tx_with_tail(
       fun () ->
-              case mnesia:wread({rabbit_queue, QueueName}) of
-                  []  -> rabbit_misc:const({error, not_found});
-                  [_] -> Deletions = internal_delete1(QueueName),
-                         T = rabbit_binding:process_deletions(Deletions),
-                         fun() ->
-                                 ok = T(),
-                                 ok = rabbit_event:notify(queue_deleted,
-                                                          [{pid,  QPid},
-                                                           {name, QueueName}])
-                         end
+              case {mnesia:wread({rabbit_queue, QueueName}),
+                    mnesia:wread({rabbit_durable_queue, QueueName})} of
+                  {[], []} ->
+                      rabbit_misc:const({error, not_found});
+                  _ ->
+                      Deletions = internal_delete1(QueueName),
+                      T = rabbit_binding:process_deletions(Deletions),
+                      fun() ->
+                              ok = T(),
+                              ok = rabbit_event:notify(queue_deleted,
+                                                       [{name, QueueName}])
+                      end
               end
       end).
 
+forget_all_durable(Node) ->
+    %% Note that rabbit is not running, so we avoid e.g. the worker pool. It is
+    %% also why the fun from rabbit_binding:process_deletions/1 is not invoked.
+    {atomic, ok} =
+        mnesia:sync_transaction(
+          fun () ->
+                  Qs = mnesia:match_object(rabbit_durable_queue,
+                                           #amqqueue{_ = '_'}, write),
+                  [rabbit_binding:process_deletions(
+                     internal_delete1(Name)) ||
+                      #amqqueue{name = Name, pid = Pid} = Q <- Qs,
+                      node(Pid) =:= Node,
+                      rabbit_policy:get(<<"ha-mode">>, Q)
+                          =:= {error, not_found}],
+                  ok
+          end),
+    ok.
+
 run_backing_queue(QPid, Mod, Fun) ->
     gen_server2:cast(QPid, {run_backing_queue, Mod, Fun}).
 
@@ -566,14 +635,21 @@ set_ram_duration_target(QPid, Duration) ->
 set_maximum_since_use(QPid, Age) ->
     gen_server2:cast(QPid, {set_maximum_since_use, Age}).
 
+start_mirroring(QPid) -> ok = delegate:cast(QPid, start_mirroring).
+stop_mirroring(QPid)  -> ok = delegate:cast(QPid, stop_mirroring).
+
+sync_mirrors(QPid)        -> delegate:call(QPid, sync_mirrors).
+cancel_sync_mirrors(QPid) -> delegate:call(QPid, cancel_sync_mirrors).
+
 on_node_down(Node) ->
     rabbit_misc:execute_mnesia_tx_with_tail(
       fun () -> QsDels =
-                    qlc:e(qlc:q([{{QName, Pid}, delete_queue(QName)} ||
+                    qlc:e(qlc:q([{QName, delete_queue(QName)} ||
                                     #amqqueue{name = QName, pid = Pid,
                                               slave_pids = []}
                                         <- mnesia:table(rabbit_queue),
-                                    node(Pid) == Node])),
+                                    node(Pid) == Node andalso
+                                    not rabbit_misc:is_process_alive(Pid)])),
                 {Qs, Dels} = lists:unzip(QsDels),
                 T = rabbit_binding:process_deletions(
                       lists:foldl(fun rabbit_binding:combine_deletions/2,
@@ -581,10 +657,9 @@ on_node_down(Node) ->
                 fun () ->
                         T(),
                         lists:foreach(
-                          fun({QName, QPid}) ->
+                          fun(QName) ->
                                   ok = rabbit_event:notify(queue_deleted,
-                                                           [{pid,  QPid},
-                                                            {name, QName}])
+                                                           [{name, QName}])
                           end, Qs)
                 end
       end).
@@ -599,64 +674,54 @@ pseudo_queue(QueueName, Pid) ->
               auto_delete  = false,
               arguments    = [],
               pid          = Pid,
-              slave_pids   = [],
-              mirror_nodes = undefined}.
+              slave_pids   = []}.
 
-deliver([], #delivery{mandatory = false, immediate = false}, _Flow) ->
+deliver([], #delivery{mandatory = false}, _Flow) ->
     %% /dev/null optimisation
     {routed, []};
 
-deliver(Qs, Delivery = #delivery{mandatory = false, immediate = false}, Flow) ->
-    %% optimisation: when Mandatory = false and Immediate = false,
-    %% rabbit_amqqueue:deliver will deliver the message to the queue
-    %% process asynchronously, and return true, which means all the
-    %% QPids will always be returned. It is therefore safe to use a
-    %% fire-and-forget cast here and return the QPids - the semantics
-    %% is preserved. This scales much better than the non-immediate
-    %% case below.
-    QPids = qpids(Qs),
+deliver(Qs, Delivery = #delivery{mandatory = false}, Flow) ->
+    %% optimisation: when Mandatory = false, rabbit_amqqueue:deliver
+    %% will deliver the message to the queue process asynchronously,
+    %% and return true, which means all the QPids will always be
+    %% returned. It is therefore safe to use a fire-and-forget cast
+    %% here and return the QPids - the semantics is preserved. This
+    %% scales much better than the case below.
+    {MPids, SPids} = qpids(Qs),
+    QPids = MPids ++ SPids,
     case Flow of
         flow   -> [credit_flow:send(QPid) || QPid <- QPids];
         noflow -> ok
     end,
-    delegate:invoke_no_result(
-      QPids, fun (QPid) ->
-                     gen_server2:cast(QPid, {deliver, Delivery, Flow})
-             end),
-    {routed, QPids};
 
-deliver(Qs, Delivery = #delivery{mandatory = Mandatory, immediate = Immediate},
-        _Flow) ->
-    QPids = qpids(Qs),
-    {Success, _} =
-        delegate:invoke(
-          QPids, fun (QPid) ->
-                         gen_server2:call(QPid, {deliver, Delivery}, infinity)
-                 end),
-    case {Mandatory, Immediate,
-          lists:foldl(fun ({QPid, true}, {_, H}) -> {true, [QPid | H]};
-                          ({_,   false}, {_, H}) -> {true, H}
-                      end, {false, []}, Success)} of
-        {true, _   , {false, []}} -> {unroutable,    []};
-        {_   , true, {_    , []}} -> {not_delivered, []};
-        {_   , _   , {_    ,  R}} -> {routed,         R}
-    end.
+    %% We let slaves know that they were being addressed as slaves at
+    %% the time - if they receive such a message from the channel
+    %% after they have become master they should mark the message as
+    %% 'delivered' since they do not know what the master may have
+    %% done with it.
+    MMsg = {deliver, Delivery, false, Flow},
+    SMsg = {deliver, Delivery, true,  Flow},
+    delegate:cast(MPids, MMsg),
+    delegate:cast(SPids, SMsg),
+    {routed, QPids};
 
-qpids(Qs) -> lists:append([[QPid | SPids] ||
-                              #amqqueue{pid = QPid, slave_pids = SPids} <- Qs]).
-
-safe_delegate_call_ok(F, Pids) ->
-    case delegate:invoke(Pids, fun (Pid) ->
-                                       rabbit_misc:with_exit_handler(
-                                         fun () -> ok end,
-                                         fun () -> F(Pid) end)
-                               end) of
-        {_,  []} -> ok;
-        {_, Bad} -> {error, Bad}
+deliver(Qs, Delivery, _Flow) ->
+    {MPids, SPids} = qpids(Qs),
+    %% see comment above
+    MMsg = {deliver, Delivery, false},
+    SMsg = {deliver, Delivery, true},
+    {MRouted, _} = delegate:call(MPids, MMsg),
+    {SRouted, _} = delegate:call(SPids, SMsg),
+    case MRouted ++ SRouted of
+        [] -> {unroutable, []};
+        R  -> {routed,     [QPid || {QPid, ok} <- R]}
     end.
 
-delegate_call(Pid, Msg) ->
-    delegate:invoke(Pid, fun (P) -> gen_server2:call(P, Msg, infinity) end).
-
-delegate_cast(Pid, Msg) ->
-    delegate:invoke_no_result(Pid, fun (P) -> gen_server2:cast(P, Msg) end).
+qpids([]) -> {[], []}; %% optimisation
+qpids([#amqqueue{pid = QPid, slave_pids = SPids}]) -> {[QPid], SPids}; %% opt
+qpids(Qs) ->
+    {MPids, SPids} = lists:foldl(fun (#amqqueue{pid = QPid, slave_pids = SPids},
+                                      {MPidAcc, SPidAcc}) ->
+                                         {[QPid | MPidAcc], [SPids | SPidAcc]}
+                                 end, {[], []}, Qs),
+    {MPids, lists:append(SPids)}.
diff --git a/src/rabbit_amqqueue_process.erl b/src/rabbit_amqqueue_process.erl
index 5701efeb..6e0eb9bf 100644
--- a/src/rabbit_amqqueue_process.erl
+++ b/src/rabbit_amqqueue_process.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_amqqueue_process).
@@ -26,11 +26,11 @@
 
 -export([start_link/1, info_keys/0]).
 
--export([init_with_backing_queue_state/8]).
+-export([init_with_backing_queue_state/7]).
 
 -export([init/1, terminate/2, code_change/3, handle_call/3, handle_cast/2,
-         handle_info/2, handle_pre_hibernate/1, prioritise_call/3,
-         prioritise_cast/2, prioritise_info/2, format_message_queue/2]).
+         handle_info/2, handle_pre_hibernate/1, prioritise_call/4,
+         prioritise_cast/3, prioritise_info/3, format_message_queue/2]).
 
 %% Queue's state
 -record(q, {q,
@@ -47,13 +47,12 @@
             msg_id_to_channel,
             ttl,
             ttl_timer_ref,
+            ttl_timer_expiry,
             senders,
-            publish_seqno,
-            unconfirmed,
-            delayed_stop,
-            queue_monitors,
             dlx,
-            dlx_routing_key
+            dlx_routing_key,
+            max_length,
+            status
            }).
 
 -record(consumer, {tag, ack_required}).
@@ -63,9 +62,12 @@
              monitor_ref,
              acktags,
              consumer_count,
+             %% Queue of {ChPid, #consumer{}} for consumers which have
+             %% been blocked for any reason
              blocked_consumers,
+             %% The limiter itself
              limiter,
-             is_limit_active,
+             %% Internal flow control for queue -> writer
              unsent_message_count}).
 
 %%----------------------------------------------------------------------------
@@ -75,8 +77,8 @@
 -spec(start_link/1 ::
         (rabbit_types:amqqueue()) -> rabbit_types:ok_pid_or_error()).
 -spec(info_keys/0 :: () -> rabbit_types:info_keys()).
--spec(init_with_backing_queue_state/8 ::
-        (rabbit_types:amqqueue(), atom(), tuple(), any(), [any()],
+-spec(init_with_backing_queue_state/7 ::
+        (rabbit_types:amqqueue(), atom(), tuple(), any(),
          [rabbit_types:delivery()], pmon:pmon(), dict()) -> #q{}).
 
 -endif.
@@ -84,7 +86,8 @@
 %%----------------------------------------------------------------------------
 
 -define(STATISTICS_KEYS,
-        [pid,
+        [name,
+         policy,
          exclusive_consumer_pid,
          exclusive_consumer_tag,
          messages_ready,
@@ -93,22 +96,20 @@
          consumers,
          memory,
          slave_pids,
-         backing_queue_status
+         synchronised_slave_pids,
+         backing_queue_status,
+         status
         ]).
 
 -define(CREATION_EVENT_KEYS,
-        [pid,
-         name,
+        [name,
          durable,
          auto_delete,
          arguments,
-         owner_pid,
-         slave_pids,
-         synchronised_slave_pids
+         owner_pid
         ]).
 
--define(INFO_KEYS,
-        ?CREATION_EVENT_KEYS ++ ?STATISTICS_KEYS -- [pid, slave_pids]).
+-define(INFO_KEYS, [pid | ?CREATION_EVENT_KEYS ++ ?STATISTICS_KEYS -- [name]]).
 
 %%----------------------------------------------------------------------------
 
@@ -120,73 +121,59 @@ info_keys() -> ?INFO_KEYS.
 
 init(Q) ->
     process_flag(trap_exit, true),
-
-    State = #q{q                   = Q#amqqueue{pid = self()},
-               exclusive_consumer  = none,
-               has_had_consumers   = false,
-               backing_queue       = backing_queue_module(Q),
-               backing_queue_state = undefined,
-               active_consumers    = queue:new(),
-               expires             = undefined,
-               sync_timer_ref      = undefined,
-               rate_timer_ref      = undefined,
-               expiry_timer_ref    = undefined,
-               ttl                 = undefined,
-               senders             = pmon:new(),
-               dlx                 = undefined,
-               dlx_routing_key     = undefined,
-               publish_seqno       = 1,
-               unconfirmed         = dtree:empty(),
-               delayed_stop        = undefined,
-               queue_monitors      = pmon:new(),
-               msg_id_to_channel   = gb_trees:empty()},
-    {ok, rabbit_event:init_stats_timer(State, #q.stats_timer), hibernate,
+    {ok, init_state(Q#amqqueue{pid = self()}), hibernate,
      {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN, ?DESIRED_HIBERNATE}}.
 
 init_with_backing_queue_state(Q = #amqqueue{exclusive_owner = Owner}, BQ, BQS,
-                              RateTRef, AckTags, Deliveries, Senders, MTC) ->
+                              RateTRef, Deliveries, Senders, MTC) ->
     case Owner of
         none -> ok;
         _    -> erlang:monitor(process, Owner)
     end,
+    State = init_state(Q),
+    State1 = State#q{backing_queue       = BQ,
+                     backing_queue_state = BQS,
+                     rate_timer_ref      = RateTRef,
+                     senders             = Senders,
+                     msg_id_to_channel   = MTC},
+    State2 = process_args(State1),
+    lists:foldl(fun (Delivery, StateN) ->
+                        deliver_or_enqueue(Delivery, true, StateN)
+                end, State2, Deliveries).
+
+init_state(Q) ->
     State = #q{q                   = Q,
                exclusive_consumer  = none,
                has_had_consumers   = false,
-               backing_queue       = BQ,
-               backing_queue_state = BQS,
                active_consumers    = queue:new(),
-               expires             = undefined,
-               sync_timer_ref      = undefined,
-               rate_timer_ref      = RateTRef,
-               expiry_timer_ref    = undefined,
-               ttl                 = undefined,
-               senders             = Senders,
-               publish_seqno       = 1,
-               unconfirmed         = dtree:empty(),
-               delayed_stop        = undefined,
-               queue_monitors      = pmon:new(),
-               msg_id_to_channel   = MTC},
-    State1 = requeue_and_run(AckTags, process_args(
-                                        rabbit_event:init_stats_timer(
-                                          State, #q.stats_timer))),
-    lists:foldl(
-      fun (Delivery, StateN) -> deliver_or_enqueue(Delivery, StateN) end,
-      State1, Deliveries).
+               senders             = pmon:new(delegate),
+               msg_id_to_channel   = gb_trees:empty(),
+               status              = running},
+    rabbit_event:init_stats_timer(State, #q.stats_timer).
 
 terminate(shutdown = R,      State = #q{backing_queue = BQ}) ->
     terminate_shutdown(fun (BQS) -> BQ:terminate(R, BQS) end, State);
+terminate({shutdown, missing_owner} = Reason, State) ->
+    %% if the owner was missing then there will be no queue, so don't emit stats
+    terminate_shutdown(terminate_delete(false, Reason, State), State);
 terminate({shutdown, _} = R, State = #q{backing_queue = BQ}) ->
     terminate_shutdown(fun (BQS) -> BQ:terminate(R, BQS) end, State);
-terminate(Reason,            State = #q{q             = #amqqueue{name = QName},
-                                        backing_queue = BQ}) ->
-    %% FIXME: How do we cancel active subscriptions?
-    terminate_shutdown(
-      fun (BQS) ->
-              BQS1 = BQ:delete_and_terminate(Reason, BQS),
-              %% don't care if the internal delete doesn't return 'ok'.
-              rabbit_amqqueue:internal_delete(QName, self()),
-              BQS1
-      end, State).
+terminate(Reason,            State) ->
+    terminate_shutdown(terminate_delete(true, Reason, State), State).
+
+terminate_delete(EmitStats, Reason,
+                 State = #q{q = #amqqueue{name          = QName},
+                                          backing_queue = BQ}) ->
+    fun (BQS) ->
+        BQS1 = BQ:delete_and_terminate(Reason, BQS),
+        if EmitStats -> rabbit_event:if_enabled(State, #q.stats_timer,
+                                                fun() -> emit_stats(State) end);
+           true      -> ok
+        end,
+        %% don't care if the internal delete doesn't return 'ok'.
+        rabbit_amqqueue:internal_delete(QName),
+        BQS1
+    end.
 
 code_change(_OldVsn, State, _Extra) ->
     {ok, State}.
@@ -194,34 +181,63 @@ code_change(_OldVsn, State, _Extra) ->
 %%----------------------------------------------------------------------------
 
 declare(Recover, From, State = #q{q                   = Q,
-                                  backing_queue       = BQ,
+                                  backing_queue       = undefined,
                                   backing_queue_state = undefined}) ->
-    case rabbit_amqqueue:internal_declare(Q, Recover) of
-        not_found -> {stop, normal, not_found, State};
-        Q         -> gen_server2:reply(From, {new, Q}),
-                     ok = file_handle_cache:register_callback(
-                            rabbit_amqqueue, set_maximum_since_use,
-                            [self()]),
-                     ok = rabbit_memory_monitor:register(
-                            self(), {rabbit_amqqueue,
-                                     set_ram_duration_target, [self()]}),
-                     BQS = bq_init(BQ, Q, Recover),
-                     State1 = process_args(State#q{backing_queue_state = BQS}),
-                     rabbit_event:notify(queue_created,
-                                         infos(?CREATION_EVENT_KEYS, State1)),
-                     rabbit_event:if_enabled(State1, #q.stats_timer,
-                                             fun() -> emit_stats(State1) end),
-                     noreply(State1);
-        Q1        -> {stop, normal, {existing, Q1}, State}
+    case rabbit_amqqueue:internal_declare(Q, Recover =/= new) of
+        #amqqueue{} = Q1 ->
+            case matches(Recover, Q, Q1) of
+                true ->
+                    gen_server2:reply(From, {new, Q}),
+                    ok = file_handle_cache:register_callback(
+                           rabbit_amqqueue, set_maximum_since_use, [self()]),
+                    ok = rabbit_memory_monitor:register(
+                           self(), {rabbit_amqqueue,
+                                    set_ram_duration_target, [self()]}),
+                    BQ = backing_queue_module(Q1),
+                    BQS = bq_init(BQ, Q, Recover),
+                    recovery_barrier(Recover),
+                    State1 = process_args(State#q{backing_queue       = BQ,
+                                                  backing_queue_state = BQS}),
+                    rabbit_event:notify(queue_created,
+                                        infos(?CREATION_EVENT_KEYS, State1)),
+                    rabbit_event:if_enabled(State1, #q.stats_timer,
+                                            fun() -> emit_stats(State1) end),
+                    noreply(State1);
+                false ->
+                    {stop, normal, {existing, Q1}, State}
+            end;
+        Err ->
+            {stop, normal, Err, State}
     end.
 
+matches(new, Q1, Q2) ->
+    %% i.e. everything compared below, but not the policy
+    Q1#amqqueue.name            =:= Q2#amqqueue.name            andalso
+    Q1#amqqueue.durable         =:= Q2#amqqueue.durable         andalso
+    Q1#amqqueue.auto_delete     =:= Q2#amqqueue.auto_delete     andalso
+    Q1#amqqueue.exclusive_owner =:= Q2#amqqueue.exclusive_owner andalso
+    Q1#amqqueue.arguments       =:= Q2#amqqueue.arguments       andalso
+    Q1#amqqueue.pid             =:= Q2#amqqueue.pid             andalso
+    Q1#amqqueue.slave_pids      =:= Q2#amqqueue.slave_pids;
+matches(_,  Q,   Q) -> true;
+matches(_, _Q, _Q1) -> false.
+
 bq_init(BQ, Q, Recover) ->
     Self = self(),
-    BQ:init(Q, Recover,
+    BQ:init(Q, Recover =/= new,
             fun (Mod, Fun) ->
                     rabbit_amqqueue:run_backing_queue(Self, Mod, Fun)
             end).
 
+recovery_barrier(new) ->
+    ok;
+recovery_barrier(BarrierPid) ->
+    MRef = erlang:monitor(process, BarrierPid),
+    receive
+        {BarrierPid, go}              -> erlang:demonitor(MRef, [flush]);
+        {'DOWN', MRef, process, _, _} -> ok
+    end.
+
 process_args(State = #q{q = #amqqueue{arguments = Arguments}}) ->
     lists:foldl(
       fun({Arg, Fun}, State1) ->
@@ -231,13 +247,14 @@ process_args(State = #q{q = #amqqueue{arguments = Arguments}}) ->
               end
       end, State,
       [{<<"x-expires">>,                 fun init_expires/2},
-       {<<"x-message-ttl">>,             fun init_ttl/2},
        {<<"x-dead-letter-exchange">>,    fun init_dlx/2},
-       {<<"x-dead-letter-routing-key">>, fun init_dlx_routing_key/2}]).
+       {<<"x-dead-letter-routing-key">>, fun init_dlx_routing_key/2},
+       {<<"x-message-ttl">>,             fun init_ttl/2},
+       {<<"x-max-length">>,              fun init_max_length/2}]).
 
 init_expires(Expires, State) -> ensure_expiry_timer(State#q{expires = Expires}).
 
-init_ttl(TTL, State) -> drop_expired_messages(State#q{ttl = TTL}).
+init_ttl(TTL, State) -> drop_expired_msgs(State#q{ttl = TTL}).
 
 init_dlx(DLX, State = #q{q = #amqqueue{name = QName}}) ->
     State#q{dlx = rabbit_misc:r(QName, exchange, DLX)}.
@@ -245,80 +262,61 @@ init_dlx(DLX, State = #q{q = #amqqueue{name = QName}}) ->
 init_dlx_routing_key(RoutingKey, State) ->
     State#q{dlx_routing_key = RoutingKey}.
 
+init_max_length(MaxLen, State) -> State#q{max_length = MaxLen}.
+
 terminate_shutdown(Fun, State) ->
     State1 = #q{backing_queue_state = BQS} =
-        stop_sync_timer(stop_rate_timer(State)),
+        lists:foldl(fun (F, S) -> F(S) end, State,
+                    [fun stop_sync_timer/1,
+                     fun stop_rate_timer/1,
+                     fun stop_expiry_timer/1,
+                     fun stop_ttl_timer/1]),
     case BQS of
         undefined -> State1;
         _         -> ok = rabbit_memory_monitor:deregister(self()),
-                     [emit_consumer_deleted(Ch, CTag)
+                     QName = qname(State),
+                     [emit_consumer_deleted(Ch, CTag, QName)
                       || {Ch, CTag, _} <- consumers(State1)],
                      State1#q{backing_queue_state = Fun(BQS)}
     end.
 
 reply(Reply, NewState) ->
-    assert_invariant(NewState),
     {NewState1, Timeout} = next_state(NewState),
-    {reply, Reply, NewState1, Timeout}.
+    {reply, Reply, ensure_stats_timer(ensure_rate_timer(NewState1)), Timeout}.
 
 noreply(NewState) ->
-    assert_invariant(NewState),
     {NewState1, Timeout} = next_state(NewState),
-    {noreply, NewState1, Timeout}.
+    {noreply, ensure_stats_timer(ensure_rate_timer(NewState1)), Timeout}.
 
 next_state(State = #q{backing_queue = BQ, backing_queue_state = BQS}) ->
+    assert_invariant(State),
     {MsgIds, BQS1} = BQ:drain_confirmed(BQS),
-    State1 = ensure_stats_timer(
-               ensure_rate_timer(
-                 confirm_messages(MsgIds, State#q{
-                                            backing_queue_state = BQS1}))),
+    State1 = confirm_messages(MsgIds, State#q{backing_queue_state = BQS1}),
     case BQ:needs_timeout(BQS1) of
         false -> {stop_sync_timer(State1),   hibernate     };
         idle  -> {stop_sync_timer(State1),   ?SYNC_INTERVAL};
         timed -> {ensure_sync_timer(State1), 0             }
     end.
 
-backing_queue_module(#amqqueue{arguments = Args}) ->
-    case rabbit_misc:table_lookup(Args, <<"x-ha-policy">>) of
-        undefined -> {ok, BQM} = application:get_env(backing_queue_module),
-                     BQM;
-        _Policy   -> rabbit_mirror_queue_master
+backing_queue_module(Q) ->
+    case rabbit_mirror_queue_misc:is_mirrored(Q) of
+        false -> {ok, BQM} = application:get_env(backing_queue_module),
+                 BQM;
+        true  -> rabbit_mirror_queue_master
     end.
 
-ensure_sync_timer(State = #q{sync_timer_ref = undefined}) ->
-    TRef = erlang:send_after(?SYNC_INTERVAL, self(), sync_timeout),
-    State#q{sync_timer_ref = TRef};
 ensure_sync_timer(State) ->
-    State.
+    rabbit_misc:ensure_timer(State, #q.sync_timer_ref,
+                             ?SYNC_INTERVAL, sync_timeout).
+
+stop_sync_timer(State) -> rabbit_misc:stop_timer(State, #q.sync_timer_ref).
 
-stop_sync_timer(State = #q{sync_timer_ref = undefined}) ->
-    State;
-stop_sync_timer(State = #q{sync_timer_ref = TRef}) ->
-    erlang:cancel_timer(TRef),
-    State#q{sync_timer_ref = undefined}.
-
-ensure_rate_timer(State = #q{rate_timer_ref = undefined}) ->
-    TRef = erlang:send_after(
-             ?RAM_DURATION_UPDATE_INTERVAL, self(), update_ram_duration),
-    State#q{rate_timer_ref = TRef};
-ensure_rate_timer(State = #q{rate_timer_ref = just_measured}) ->
-    State#q{rate_timer_ref = undefined};
 ensure_rate_timer(State) ->
-    State.
+    rabbit_misc:ensure_timer(State, #q.rate_timer_ref,
+                             ?RAM_DURATION_UPDATE_INTERVAL,
+                             update_ram_duration).
 
-stop_rate_timer(State = #q{rate_timer_ref = undefined}) ->
-    State;
-stop_rate_timer(State = #q{rate_timer_ref = just_measured}) ->
-    State#q{rate_timer_ref = undefined};
-stop_rate_timer(State = #q{rate_timer_ref = TRef}) ->
-    erlang:cancel_timer(TRef),
-    State#q{rate_timer_ref = undefined}.
-
-stop_expiry_timer(State = #q{expiry_timer_ref = undefined}) ->
-    State;
-stop_expiry_timer(State = #q{expiry_timer_ref = TRef}) ->
-    erlang:cancel_timer(TRef),
-    State#q{expiry_timer_ref = undefined}.
+stop_rate_timer(State) -> rabbit_misc:stop_timer(State, #q.rate_timer_ref).
 
 %% We wish to expire only when there are no consumers *and* the expiry
 %% hasn't been refreshed (by queue.declare or basic.get) for the
@@ -328,17 +326,41 @@ ensure_expiry_timer(State = #q{expires = undefined}) ->
 ensure_expiry_timer(State = #q{expires = Expires}) ->
     case is_unused(State) of
         true  -> NewState = stop_expiry_timer(State),
-                 TRef = erlang:send_after(Expires, self(), maybe_expire),
-                 NewState#q{expiry_timer_ref = TRef};
+                 rabbit_misc:ensure_timer(NewState, #q.expiry_timer_ref,
+                                          Expires, maybe_expire);
         false -> State
     end.
 
+stop_expiry_timer(State) -> rabbit_misc:stop_timer(State, #q.expiry_timer_ref).
+
+ensure_ttl_timer(undefined, State) ->
+    State;
+ensure_ttl_timer(Expiry, State = #q{ttl_timer_ref = undefined}) ->
+    After = (case Expiry - now_micros() of
+                 V when V > 0 -> V + 999; %% always fire later
+                 _            -> 0
+             end) div 1000,
+    TRef = erlang:send_after(After, self(), drop_expired),
+    State#q{ttl_timer_ref = TRef, ttl_timer_expiry = Expiry};
+ensure_ttl_timer(Expiry, State = #q{ttl_timer_ref    = TRef,
+                                    ttl_timer_expiry = TExpiry})
+  when Expiry + 1000 < TExpiry ->
+    case erlang:cancel_timer(TRef) of
+        false -> State;
+        _     -> ensure_ttl_timer(Expiry, State#q{ttl_timer_ref = undefined})
+    end;
+ensure_ttl_timer(_Expiry, State) ->
+    State.
+
+stop_ttl_timer(State) -> rabbit_misc:stop_timer(State, #q.ttl_timer_ref).
+
 ensure_stats_timer(State) ->
     rabbit_event:ensure_stats_timer(State, #q.stats_timer, emit_stats).
 
-assert_invariant(#q{active_consumers = AC,
-                    backing_queue = BQ, backing_queue_state = BQS}) ->
-    true = (queue:is_empty(AC) orelse BQ:is_empty(BQS)).
+assert_invariant(State = #q{active_consumers = AC}) ->
+    true = (queue:is_empty(AC) orelse is_empty(State)).
+
+is_empty(#q{backing_queue = BQ, backing_queue_state = BQS}) -> BQ:is_empty(BQS).
 
 lookup_ch(ChPid) ->
     case get({ch, ChPid}) of
@@ -346,17 +368,17 @@ lookup_ch(ChPid) ->
         C         -> C
     end.
 
-ch_record(ChPid) ->
+ch_record(ChPid, LimiterPid) ->
     Key = {ch, ChPid},
     case get(Key) of
         undefined -> MonitorRef = erlang:monitor(process, ChPid),
+                     Limiter = rabbit_limiter:client(LimiterPid),
                      C = #cr{ch_pid               = ChPid,
                              monitor_ref          = MonitorRef,
-                             acktags              = sets:new(),
+                             acktags              = queue:new(),
                              consumer_count       = 0,
                              blocked_consumers    = queue:new(),
-                             is_limit_active      = false,
-                             limiter              = rabbit_limiter:make_token(),
+                             limiter              = Limiter,
                              unsent_message_count = 0},
                      put(Key, C),
                      C;
@@ -366,9 +388,9 @@ ch_record(ChPid) ->
 update_ch_record(C = #cr{consumer_count       = ConsumerCount,
                          acktags              = ChAckTags,
                          unsent_message_count = UnsentMessageCount}) ->
-    case {sets:size(ChAckTags), ConsumerCount, UnsentMessageCount} of
-        {0, 0, 0} -> ok = erase_ch_record(C);
-        _         -> ok = store_ch_record(C)
+    case {queue:is_empty(ChAckTags), ConsumerCount, UnsentMessageCount} of
+        {true, 0, 0} -> ok = erase_ch_record(C);
+        _            -> ok = store_ch_record(C)
     end,
     C.
 
@@ -376,37 +398,32 @@ store_ch_record(C = #cr{ch_pid = ChPid}) ->
     put({ch, ChPid}, C),
     ok.
 
-erase_ch_record(#cr{ch_pid      = ChPid,
-                    limiter     = Limiter,
-                    monitor_ref = MonitorRef}) ->
-    ok = rabbit_limiter:unregister(Limiter, self()),
+erase_ch_record(#cr{ch_pid = ChPid, monitor_ref = MonitorRef}) ->
     erlang:demonitor(MonitorRef),
     erase({ch, ChPid}),
     ok.
 
-update_consumer_count(C = #cr{consumer_count = 0, limiter = Limiter}, +1) ->
-    ok = rabbit_limiter:register(Limiter, self()),
-    update_ch_record(C#cr{consumer_count = 1});
-update_consumer_count(C = #cr{consumer_count = 1, limiter = Limiter}, -1) ->
-    ok = rabbit_limiter:unregister(Limiter, self()),
-    update_ch_record(C#cr{consumer_count = 0,
-                          limiter = rabbit_limiter:make_token()});
-update_consumer_count(C = #cr{consumer_count = Count}, Delta) ->
-    update_ch_record(C#cr{consumer_count = Count + Delta}).
-
 all_ch_record() -> [C || {{ch, _}, C} <- get()].
 
 block_consumer(C = #cr{blocked_consumers = Blocked}, QEntry) ->
     update_ch_record(C#cr{blocked_consumers = queue:in(QEntry, Blocked)}).
 
-is_ch_blocked(#cr{unsent_message_count = Count, is_limit_active = Limited}) ->
-    Limited orelse Count >= ?UNSENT_MESSAGE_LIMIT.
+is_ch_blocked(#cr{unsent_message_count = Count, limiter = Limiter}) ->
+    Count >= ?UNSENT_MESSAGE_LIMIT orelse rabbit_limiter:is_suspended(Limiter).
 
-ch_record_state_transition(OldCR, NewCR) ->
-    case {is_ch_blocked(OldCR), is_ch_blocked(NewCR)} of
-        {true, false} -> unblock;
-        {false, true} -> block;
-        {_, _}        -> ok
+maybe_send_drained(WasEmpty, State) ->
+    case (not WasEmpty) andalso is_empty(State) of
+        true  -> [send_drained(C) || C <- all_ch_record()];
+        false -> ok
+    end,
+    State.
+
+send_drained(C = #cr{ch_pid = ChPid, limiter = Limiter}) ->
+    case rabbit_limiter:drained(Limiter) of
+        {[], Limiter}          -> ok;
+        {CTagCredit, Limiter2} -> rabbit_channel:send_drained(
+                                    ChPid, CTagCredit),
+                                  update_ch_record(C#cr{limiter = Limiter2})
     end.
 
 deliver_msgs_to_consumers(_DeliverFun, true, State) ->
@@ -424,18 +441,21 @@ deliver_msgs_to_consumers(DeliverFun, false,
     end.
 
 deliver_msg_to_consumer(DeliverFun, E = {ChPid, Consumer}, State) ->
-    C = ch_record(ChPid),
+    C = lookup_ch(ChPid),
     case is_ch_blocked(C) of
         true  -> block_consumer(C, E),
                  {false, State};
-        false -> case rabbit_limiter:can_send(C#cr.limiter, self(),
-                                              Consumer#consumer.ack_required) of
-                     false -> block_consumer(C#cr{is_limit_active = true}, E),
-                              {false, State};
-                     true  -> AC1 = queue:in(E, State#q.active_consumers),
-                              deliver_msg_to_consumer(
-                                DeliverFun, Consumer, C,
-                                State#q{active_consumers = AC1})
+        false -> case rabbit_limiter:can_send(C#cr.limiter,
+                                              Consumer#consumer.ack_required,
+                                              Consumer#consumer.tag) of
+                     {suspend, Limiter} ->
+                         block_consumer(C#cr{limiter = Limiter}, E),
+                         {false, State};
+                     {continue, Limiter} ->
+                         AC1 = queue:in(E, State#q.active_consumers),
+                         deliver_msg_to_consumer(
+                           DeliverFun, Consumer, C#cr{limiter = Limiter},
+                           State#q{active_consumers = AC1})
                  end
     end.
 
@@ -451,7 +471,7 @@ deliver_msg_to_consumer(DeliverFun,
     rabbit_channel:deliver(ChPid, ConsumerTag, AckRequired,
                            {QName, self(), AckTag, IsDelivered, Message}),
     ChAckTags1 = case AckRequired of
-                     true  -> sets:add_element(AckTag, ChAckTags);
+                     true  -> queue:in(AckTag, ChAckTags);
                      false -> ChAckTags
                  end,
     update_ch_record(C#cr{acktags              = ChAckTags1,
@@ -459,9 +479,8 @@ deliver_msg_to_consumer(DeliverFun,
     {Stop, State1}.
 
 deliver_from_queue_deliver(AckRequired, State) ->
-    {{Message, IsDelivered, AckTag, Remaining}, State1} =
-        fetch(AckRequired, State),
-    {{Message, IsDelivered, AckTag}, 0 == Remaining, State1}.
+    {Result, State1} = fetch(AckRequired, State),
+    {Result, is_empty(State1), State1}.
 
 confirm_messages([], State) ->
     State;
@@ -481,106 +500,142 @@ confirm_messages(MsgIds, State = #q{msg_id_to_channel = MTC}) ->
     rabbit_misc:gb_trees_foreach(fun rabbit_misc:confirm_to_sender/2, CMs),
     State#q{msg_id_to_channel = MTC1}.
 
-should_confirm_message(#delivery{msg_seq_no = undefined}, _State) ->
-    never;
-should_confirm_message(#delivery{sender     = SenderPid,
+send_or_record_confirm(#delivery{msg_seq_no = undefined}, State) ->
+    {never, State};
+send_or_record_confirm(#delivery{sender     = SenderPid,
                                  msg_seq_no = MsgSeqNo,
                                  message    = #basic_message {
                                    is_persistent = true,
                                    id            = MsgId}},
-                       #q{q = #amqqueue{durable = true}}) ->
-    {eventually, SenderPid, MsgSeqNo, MsgId};
-should_confirm_message(#delivery{sender     = SenderPid,
-                                 msg_seq_no = MsgSeqNo},
-                       _State) ->
-    {immediately, SenderPid, MsgSeqNo}.
-
-needs_confirming({eventually, _, _, _}) -> true;
-needs_confirming(_)                     -> false.
-
-maybe_record_confirm_message({eventually, SenderPid, MsgSeqNo, MsgId},
-                             State = #q{msg_id_to_channel = MTC}) ->
-    State#q{msg_id_to_channel =
-                gb_trees:insert(MsgId, {SenderPid, MsgSeqNo}, MTC)};
-maybe_record_confirm_message({immediately, SenderPid, MsgSeqNo}, State) ->
+                       State = #q{q                 = #amqqueue{durable = true},
+                                  msg_id_to_channel = MTC}) ->
+    MTC1 = gb_trees:insert(MsgId, {SenderPid, MsgSeqNo}, MTC),
+    {eventually, State#q{msg_id_to_channel = MTC1}};
+send_or_record_confirm(#delivery{sender     = SenderPid,
+                                 msg_seq_no = MsgSeqNo}, State) ->
     rabbit_misc:confirm_to_sender(SenderPid, [MsgSeqNo]),
-    State;
-maybe_record_confirm_message(_Confirm, State) ->
-    State.
+    {immediately, State}.
 
-run_message_queue(State) ->
+discard(#delivery{sender     = SenderPid,
+                  msg_seq_no = MsgSeqNo,
+                  message    = #basic_message{id = MsgId}}, State) ->
     State1 = #q{backing_queue = BQ, backing_queue_state = BQS} =
-        drop_expired_messages(State),
-    {_IsEmpty1, State2} = deliver_msgs_to_consumers(
+        case MsgSeqNo of
+            undefined -> State;
+            _         -> confirm_messages([MsgId], State)
+        end,
+    BQS1 = BQ:discard(MsgId, SenderPid, BQS),
+    State1#q{backing_queue_state = BQS1}.
+
+run_message_queue(State) ->
+    {_IsEmpty1, State1} = deliver_msgs_to_consumers(
                             fun deliver_from_queue_deliver/2,
-                            BQ:is_empty(BQS), State1),
-    State2.
+                            is_empty(State), State),
+    State1.
 
-attempt_delivery(#delivery{sender = SenderPid, message = Message}, Confirm,
-                 State = #q{backing_queue = BQ, backing_queue_state = BQS}) ->
+attempt_delivery(Delivery = #delivery{sender = SenderPid, message = Message},
+                 Props, Delivered, State = #q{backing_queue       = BQ,
+                                              backing_queue_state = BQS}) ->
     case BQ:is_duplicate(Message, BQS) of
         {false, BQS1} ->
             deliver_msgs_to_consumers(
-              fun (AckRequired, State1 = #q{backing_queue_state = BQS2}) ->
-                      Props = message_properties(Confirm, State1),
+              fun (true, State1 = #q{backing_queue_state = BQS2}) ->
+                      true = BQ:is_empty(BQS2),
                       {AckTag, BQS3} = BQ:publish_delivered(
-                                         AckRequired, Message, Props,
-                                         SenderPid, BQS2),
-                      {{Message, false, AckTag}, true,
-                       State1#q{backing_queue_state = BQS3}}
+                                         Message, Props, SenderPid, BQS2),
+                      {{Message, Delivered, AckTag},
+                       true, State1#q{backing_queue_state = BQS3}};
+                  (false, State1) ->
+                      {{Message, Delivered, undefined},
+                       true, discard(Delivery, State1)}
               end, false, State#q{backing_queue_state = BQS1});
-        {Duplicate, BQS1} ->
-            %% if the message has previously been seen by the BQ then
-            %% it must have been seen under the same circumstances as
-            %% now: i.e. if it is now a deliver_immediately then it
-            %% must have been before.
-            {case Duplicate of
-                 published -> true;
-                 discarded -> false
-             end,
-             State#q{backing_queue_state = BQS1}}
+        {true, BQS1} ->
+            {true, State#q{backing_queue_state = BQS1}}
+    end.
+
+deliver_or_enqueue(Delivery = #delivery{message = Message, sender = SenderPid},
+                   Delivered, State) ->
+    {Confirm, State1} = send_or_record_confirm(Delivery, State),
+    Props = message_properties(Message, Confirm, State),
+    case attempt_delivery(Delivery, Props, Delivered, State1) of
+        {true, State2} ->
+            State2;
+        %% The next one is an optimisation
+        {false, State2 = #q{ttl = 0, dlx = undefined}} ->
+            discard(Delivery, State2);
+        {false, State2 = #q{backing_queue = BQ, backing_queue_state = BQS}} ->
+            BQS1 = BQ:publish(Message, Props, Delivered, SenderPid, BQS),
+            {Dropped, State3 = #q{backing_queue_state = BQS2}} =
+              maybe_drop_head(State2#q{backing_queue_state = BQS1}),
+            QLen = BQ:len(BQS2),
+            %% optimisation: it would be perfectly safe to always
+            %% invoke drop_expired_msgs here, but that is expensive so
+            %% we only do that if a new message that might have an
+            %% expiry ends up at the head of the queue. If the head
+            %% remains unchanged, or if the newly published message
+            %% has no expiry and becomes the head of the queue then
+            %% the call is unnecessary.
+            case {Dropped > 0, QLen =:= 1, Props#message_properties.expiry} of
+                {false, false,         _} -> State3;
+                {true,  true,  undefined} -> State3;
+                {_,     _,             _} -> drop_expired_msgs(State3)
+            end
     end.
 
-deliver_or_enqueue(Delivery = #delivery{message    = Message,
-                                        msg_seq_no = MsgSeqNo,
-                                        sender     = SenderPid}, State) ->
-    Confirm = should_confirm_message(Delivery, State),
-    case attempt_delivery(Delivery, Confirm, State) of
-        {true, State1} ->
-            maybe_record_confirm_message(Confirm, State1);
-        %% the next two are optimisations
-        {false, State1 = #q{ttl = 0, dlx = undefined}} when Confirm == never ->
-            discard_delivery(Delivery, State1);
-        {false, State1 = #q{ttl = 0, dlx = undefined}} ->
-            rabbit_misc:confirm_to_sender(SenderPid, [MsgSeqNo]),
-            discard_delivery(Delivery, State1);
-        {false, State1} ->
-            State2 = #q{backing_queue = BQ, backing_queue_state = BQS} =
-                maybe_record_confirm_message(Confirm, State1),
-            Props = message_properties(Confirm, State2),
-            BQS1 = BQ:publish(Message, Props, SenderPid, BQS),
-            ensure_ttl_timer(State2#q{backing_queue_state = BQS1})
+maybe_drop_head(State = #q{max_length = undefined}) ->
+    {0, State};
+maybe_drop_head(State = #q{max_length          = MaxLen,
+                           backing_queue       = BQ,
+                           backing_queue_state = BQS}) ->
+    case BQ:len(BQS) - MaxLen of
+        Excess when Excess > 0 ->
+            {Excess,
+             with_dlx(
+               State#q.dlx,
+               fun (X) -> dead_letter_maxlen_msgs(X, Excess, State) end,
+               fun () ->
+                       {_, BQS1} = lists:foldl(fun (_, {_, BQS0}) ->
+                                                       BQ:drop(false, BQS0)
+                                               end, {ok, BQS},
+                                               lists:seq(1, Excess)),
+                       State#q{backing_queue_state = BQS1}
+               end)};
+        _ -> {0, State}
     end.
 
-requeue_and_run(AckTags, State = #q{backing_queue = BQ}) ->
-    run_backing_queue(BQ, fun (M, BQS) ->
-                                  {_MsgIds, BQS1} = M:requeue(AckTags, BQS),
-                                  BQS1
-                          end, State).
+requeue_and_run(AckTags, State = #q{backing_queue       = BQ,
+                                    backing_queue_state = BQS}) ->
+    WasEmpty = BQ:is_empty(BQS),
+    {_MsgIds, BQS1} = BQ:requeue(AckTags, BQS),
+    {_Dropped, State1} = maybe_drop_head(State#q{backing_queue_state = BQS1}),
+    run_message_queue(maybe_send_drained(WasEmpty, drop_expired_msgs(State1))).
 
-fetch(AckRequired, State = #q{backing_queue_state = BQS,
-                              backing_queue       = BQ}) ->
+fetch(AckRequired, State = #q{backing_queue       = BQ,
+                              backing_queue_state = BQS}) ->
     {Result, BQS1} = BQ:fetch(AckRequired, BQS),
-    {Result, State#q{backing_queue_state = BQS1}}.
+    State1 = drop_expired_msgs(State#q{backing_queue_state = BQS1}),
+    {Result, maybe_send_drained(Result =:= empty, State1)}.
+
+ack(AckTags, ChPid, State) ->
+    subtract_acks(ChPid, AckTags, State,
+                  fun (State1 = #q{backing_queue       = BQ,
+                                   backing_queue_state = BQS}) ->
+                          {_Guids, BQS1} = BQ:ack(AckTags, BQS),
+                          State1#q{backing_queue_state = BQS1}
+                  end).
+
+requeue(AckTags, ChPid, State) ->
+    subtract_acks(ChPid, AckTags, State,
+                  fun (State1) -> requeue_and_run(AckTags, State1) end).
 
 remove_consumer(ChPid, ConsumerTag, Queue) ->
     queue:filter(fun ({CP, #consumer{tag = CTag}}) ->
                          (CP /= ChPid) or (CTag /= ConsumerTag)
                  end, Queue).
 
-remove_consumers(ChPid, Queue) ->
+remove_consumers(ChPid, Queue, QName) ->
     queue:filter(fun ({CP, #consumer{tag = CTag}}) when CP =:= ChPid ->
-                         emit_consumer_deleted(ChPid, CTag),
+                         emit_consumer_deleted(ChPid, CTag, QName),
                          false;
                      (_) ->
                          true
@@ -588,20 +643,29 @@ remove_consumers(ChPid, Queue) ->
 
 possibly_unblock(State, ChPid, Update) ->
     case lookup_ch(ChPid) of
-        not_found ->
+        not_found -> State;
+        C         -> C1 = Update(C),
+                     case is_ch_blocked(C) andalso not is_ch_blocked(C1) of
+                         false -> update_ch_record(C1),
+                                  State;
+                         true  -> unblock(State, C1)
+                     end
+    end.
+
+unblock(State, C = #cr{limiter = Limiter}) ->
+    case lists:partition(
+           fun({_ChPid, #consumer{tag = CTag}}) ->
+                   rabbit_limiter:is_consumer_blocked(Limiter, CTag)
+           end, queue:to_list(C#cr.blocked_consumers)) of
+        {_, []} ->
+            update_ch_record(C),
             State;
-        C ->
-            C1 = Update(C),
-            case ch_record_state_transition(C, C1) of
-                ok      ->  update_ch_record(C1),
-                            State;
-                unblock -> #cr{blocked_consumers = Consumers} = C1,
-                           update_ch_record(
-                             C1#cr{blocked_consumers = queue:new()}),
-                           AC1 = queue:join(State#q.active_consumers,
-                                            Consumers),
-                           run_message_queue(State#q{active_consumers = AC1})
-            end
+        {Blocked, Unblocked} ->
+            BlockedQ   = queue:from_list(Blocked),
+            UnblockedQ = queue:from_list(Unblocked),
+            update_ch_record(C#cr{blocked_consumers = BlockedQ}),
+            AC1 = queue:join(State#q.active_consumers, UnblockedQ),
+            run_message_queue(State#q{active_consumers = AC1})
     end.
 
 should_auto_delete(#q{q = #amqqueue{auto_delete = false}}) -> false;
@@ -621,7 +685,8 @@ handle_ch_down(DownPid, State = #q{exclusive_consumer = Holder,
         C = #cr{ch_pid            = ChPid,
                 acktags           = ChAckTags,
                 blocked_consumers = Blocked} ->
-            _ = remove_consumers(ChPid, Blocked), %% for stats emission
+            QName = qname(State),
+            _ = remove_consumers(ChPid, Blocked, QName), %% for stats emission
             ok = erase_ch_record(C),
             State1 = State#q{
                        exclusive_consumer = case Holder of
@@ -629,11 +694,12 @@ handle_ch_down(DownPid, State = #q{exclusive_consumer = Holder,
                                                 Other      -> Other
                                             end,
                        active_consumers = remove_consumers(
-                                            ChPid, State#q.active_consumers),
+                                            ChPid, State#q.active_consumers,
+                                            QName),
                        senders          = Senders1},
             case should_auto_delete(State1) of
                 true  -> {stop, State1};
-                false -> {ok, requeue_and_run(sets:to_list(ChAckTags),
+                false -> {ok, requeue_and_run(queue:to_list(ChAckTags),
                                               ensure_expiry_timer(State1))}
             end
     end.
@@ -648,13 +714,8 @@ check_exclusive_access(none, true, State) ->
         false -> in_use
     end.
 
-consumer_count() -> consumer_count(fun (_) -> false end).
-
-active_consumer_count() -> consumer_count(fun is_ch_blocked/1).
-
-consumer_count(Exclude) ->
-    lists:sum([Count || C = #cr{consumer_count = Count} <- all_ch_record(),
-                        not Exclude(C)]).
+consumer_count() ->
+    lists:sum([Count || #cr{consumer_count = Count} <- all_ch_record()]).
 
 is_unused(_State) -> consumer_count() == 0.
 
@@ -663,158 +724,130 @@ maybe_send_reply(ChPid, Msg) -> ok = rabbit_channel:send_command(ChPid, Msg).
 
 qname(#q{q = #amqqueue{name = QName}}) -> QName.
 
-backing_queue_timeout(State = #q{backing_queue = BQ}) ->
-    run_backing_queue(BQ, fun (M, BQS) -> M:timeout(BQS) end, State).
-
-run_backing_queue(Mod, Fun, State = #q{backing_queue = BQ,
-                                       backing_queue_state = BQS}) ->
-    run_message_queue(State#q{backing_queue_state = BQ:invoke(Mod, Fun, BQS)}).
+backing_queue_timeout(State = #q{backing_queue       = BQ,
+                                 backing_queue_state = BQS}) ->
+    State#q{backing_queue_state = BQ:timeout(BQS)}.
 
 subtract_acks(ChPid, AckTags, State, Fun) ->
     case lookup_ch(ChPid) of
         not_found ->
             State;
         C = #cr{acktags = ChAckTags} ->
-            update_ch_record(C#cr{acktags = lists:foldl(fun sets:del_element/2,
-                                                        ChAckTags, AckTags)}),
+            update_ch_record(
+              C#cr{acktags = subtract_acks(AckTags, [], ChAckTags)}),
             Fun(State)
     end.
 
-discard_delivery(#delivery{sender = SenderPid,
-                           message = Message},
-                 State = #q{backing_queue = BQ,
-                            backing_queue_state = BQS}) ->
-    State#q{backing_queue_state = BQ:discard(Message, SenderPid, BQS)}.
-
-message_properties(Confirm, #q{ttl = TTL}) ->
-    #message_properties{expiry           = calculate_msg_expiry(TTL),
-                        needs_confirming = needs_confirming(Confirm)}.
-
-calculate_msg_expiry(undefined) -> undefined;
-calculate_msg_expiry(TTL)       -> now_micros() + (TTL * 1000).
+subtract_acks([], [], AckQ) ->
+    AckQ;
+subtract_acks([], Prefix, AckQ) ->
+    queue:join(queue:from_list(lists:reverse(Prefix)), AckQ);
+subtract_acks([T | TL] = AckTags, Prefix, AckQ) ->
+    case queue:out(AckQ) of
+        {{value,  T}, QTail} -> subtract_acks(TL,             Prefix, QTail);
+        {{value, AT}, QTail} -> subtract_acks(AckTags, [AT | Prefix], QTail)
+    end.
 
-drop_expired_messages(State = #q{ttl = undefined}) ->
-    State;
-drop_expired_messages(State = #q{backing_queue_state = BQS,
-                                 backing_queue       = BQ }) ->
-    Now = now_micros(),
-    DLXFun = dead_letter_fun(expired, State),
-    ExpirePred = fun (#message_properties{expiry = Expiry}) -> Now > Expiry end,
-    case DLXFun of
-        undefined -> {undefined, BQS1} = BQ:dropwhile(ExpirePred, false, BQS),
-                     BQS1;
-        _         -> {Msgs, BQS1} = BQ:dropwhile(ExpirePred, true, BQS),
-                     lists:foreach(
-                       fun({Msg, AckTag}) -> DLXFun(Msg, AckTag) end, Msgs),
-                     BQS1
-    end,
-    ensure_ttl_timer(State#q{backing_queue_state = BQS1}).
-
-ensure_ttl_timer(State = #q{backing_queue       = BQ,
-                            backing_queue_state = BQS,
-                            ttl                 = TTL,
-                            ttl_timer_ref       = undefined})
-  when TTL =/= undefined ->
-    case BQ:is_empty(BQS) of
-        true  -> State;
-        false -> TRef = erlang:send_after(TTL, self(), drop_expired),
-                 State#q{ttl_timer_ref = TRef}
-    end;
-ensure_ttl_timer(State) ->
-    State.
+message_properties(Message, Confirm, #q{ttl = TTL}) ->
+    #message_properties{expiry           = calculate_msg_expiry(Message, TTL),
+                        needs_confirming = Confirm == eventually}.
 
-ack_if_no_dlx(AckTags, State = #q{dlx                 = undefined,
-                                  backing_queue       = BQ,
-                                  backing_queue_state = BQS }) ->
-    {_Guids, BQS1} = BQ:ack(AckTags, BQS),
-    State#q{backing_queue_state = BQS1};
-ack_if_no_dlx(_AckTags, State) ->
-    State.
-
-dead_letter_fun(_Reason, #q{dlx = undefined}) ->
-    undefined;
-dead_letter_fun(Reason, _State) ->
-    fun(Msg, AckTag) ->
-            gen_server2:cast(self(), {dead_letter, {Msg, AckTag}, Reason})
+calculate_msg_expiry(#basic_message{content = Content}, TTL) ->
+    #content{properties = Props} =
+        rabbit_binary_parser:ensure_content_decoded(Content),
+    %% We assert that the expiration must be valid - we check in the channel.
+    {ok, MsgTTL} = rabbit_basic:parse_expiration(Props),
+    case lists:min([TTL, MsgTTL]) of
+        undefined -> undefined;
+        T         -> now_micros() + T * 1000
     end.
 
-dead_letter_publish(Msg, Reason, State = #q{publish_seqno = MsgSeqNo}) ->
-    DLMsg = #basic_message{exchange_name = XName} =
-        make_dead_letter_msg(Reason, Msg, State),
-    case rabbit_exchange:lookup(XName) of
-        {ok, X} ->
-            Delivery = rabbit_basic:delivery(false, false, DLMsg, MsgSeqNo),
-            {Queues, Cycles} = detect_dead_letter_cycles(
-                                 DLMsg, rabbit_exchange:route(X, Delivery)),
-            lists:foreach(fun log_cycle_once/1, Cycles),
-            QPids = rabbit_amqqueue:lookup(Queues),
-            {_, DeliveredQPids} = rabbit_amqqueue:deliver(QPids, Delivery),
-            DeliveredQPids;
-        {error, not_found} ->
-            []
+%% Logically this function should invoke maybe_send_drained/2.
+%% However, that is expensive. Since some frequent callers of
+%% drop_expired_msgs/1, in particular deliver_or_enqueue/3, cannot
+%% possibly cause the queue to become empty, we push the
+%% responsibility to the callers. So be cautious when adding new ones.
+drop_expired_msgs(State) ->
+    case is_empty(State) of
+        true  -> State;
+        false -> drop_expired_msgs(now_micros(), State)
     end.
 
-dead_letter_msg(Msg, AckTag, Reason, State = #q{publish_seqno = MsgSeqNo,
-                                                unconfirmed   = UC}) ->
-    QPids = dead_letter_publish(Msg, Reason, State),
-    State1 = State#q{queue_monitors = pmon:monitor_all(
-                                        QPids, State#q.queue_monitors),
-                     publish_seqno  = MsgSeqNo + 1},
-    case QPids of
-        [] -> cleanup_after_confirm([AckTag], State1);
-        _  -> UC1 = dtree:insert(MsgSeqNo, QPids, AckTag, UC),
-              noreply(State1#q{unconfirmed = UC1})
-    end.
+drop_expired_msgs(Now, State = #q{backing_queue_state = BQS,
+                                  backing_queue       = BQ }) ->
+    ExpirePred = fun (#message_properties{expiry = Exp}) -> Now >= Exp end,
+    {Props, State1} =
+        with_dlx(
+          State#q.dlx,
+          fun (X) -> dead_letter_expired_msgs(ExpirePred, X, State) end,
+          fun () -> {Next, BQS1} = BQ:dropwhile(ExpirePred, BQS),
+                    {Next, State#q{backing_queue_state = BQS1}} end),
+    ensure_ttl_timer(case Props of
+                         undefined                         -> undefined;
+                         #message_properties{expiry = Exp} -> Exp
+                     end, State1).
+
+with_dlx(undefined, _With,  Without) -> Without();
+with_dlx(DLX,        With,  Without) -> case rabbit_exchange:lookup(DLX) of
+                                            {ok, X}            -> With(X);
+                                            {error, not_found} -> Without()
+                                        end.
+
+dead_letter_expired_msgs(ExpirePred, X, State = #q{backing_queue = BQ}) ->
+    dead_letter_msgs(fun (DLFun, Acc, BQS1) ->
+                             BQ:fetchwhile(ExpirePred, DLFun, Acc, BQS1)
+                     end, expired, X, State).
+
+dead_letter_rejected_msgs(AckTags, X,  State = #q{backing_queue = BQ}) ->
+    {ok, State1} =
+        dead_letter_msgs(
+          fun (DLFun, Acc, BQS) ->
+                  {Acc1, BQS1} = BQ:ackfold(DLFun, Acc, BQS, AckTags),
+                  {ok, Acc1, BQS1}
+          end, rejected, X, State),
+    State1.
+
+dead_letter_maxlen_msgs(X, Excess, State = #q{backing_queue = BQ}) ->
+    {ok, State1} =
+        dead_letter_msgs(
+          fun (DLFun, Acc, BQS) ->
+                  lists:foldl(fun (_, {ok, Acc0, BQS0}) ->
+                                      {{Msg, _, AckTag}, BQS1} =
+                                        BQ:fetch(true, BQS0),
+                                      {ok, DLFun(Msg, AckTag, Acc0), BQS1}
+                              end, {ok, Acc, BQS}, lists:seq(1, Excess))
+          end, maxlen, X, State),
+    State1.
+
+dead_letter_msgs(Fun, Reason, X, State = #q{dlx_routing_key     = RK,
+                                            backing_queue_state = BQS,
+                                            backing_queue       = BQ}) ->
+    QName = qname(State),
+    {Res, Acks1, BQS1} =
+        Fun(fun (Msg, AckTag, Acks) ->
+                    dead_letter_publish(Msg, Reason, X, RK, QName),
+                    [AckTag | Acks]
+            end, [], BQS),
+    {_Guids, BQS2} = BQ:ack(Acks1, BQS1),
+    {Res, State#q{backing_queue_state = BQS2}}.
+
+dead_letter_publish(Msg, Reason, X, RK, QName) ->
+    DLMsg = make_dead_letter_msg(Msg, Reason, X#exchange.name, RK, QName),
+    Delivery = rabbit_basic:delivery(false, DLMsg, undefined),
+    {Queues, Cycles} = detect_dead_letter_cycles(
+                         Reason, DLMsg, rabbit_exchange:route(X, Delivery)),
+    lists:foreach(fun log_cycle_once/1, Cycles),
+    rabbit_amqqueue:deliver( rabbit_amqqueue:lookup(Queues), Delivery),
+    ok.
 
-handle_queue_down(QPid, Reason, State = #q{queue_monitors = QMons,
-                                           unconfirmed    = UC}) ->
-    case pmon:is_monitored(QPid, QMons) of
-        false -> noreply(State);
-        true  -> case rabbit_misc:is_abnormal_termination(Reason) of
-                     true  -> {Lost, _UC1} = dtree:take_all(QPid, UC),
-                              QNameS = rabbit_misc:rs(qname(State)),
-                              rabbit_log:warning("DLQ ~p for ~s died with "
-                                                 "~p unconfirmed messages~n",
-                                                 [QPid, QNameS, length(Lost)]);
-                     false -> ok
-                 end,
-                 {MsgSeqNoAckTags, UC1} = dtree:take(QPid, UC),
-                 cleanup_after_confirm(
-                   [AckTag || {_MsgSeqNo, AckTag} <- MsgSeqNoAckTags],
-                   State#q{queue_monitors = pmon:erase(QPid, QMons),
-                           unconfirmed    = UC1})
-    end.
+stop(State) -> stop(noreply, State).
 
-stop_later(Reason, State) ->
-    stop_later(Reason, undefined, noreply, State).
-
-stop_later(Reason, From, Reply, State = #q{unconfirmed = UC}) ->
-    case {dtree:is_empty(UC), Reply} of
-        {true, noreply} ->
-            {stop, Reason, State};
-        {true, _} ->
-            {stop, Reason, Reply, State};
-        {false, _} ->
-            noreply(State#q{delayed_stop = {Reason, {From, Reply}}})
-    end.
+stop(noreply, State) -> {stop, normal, State};
+stop(Reply,   State) -> {stop, normal, Reply, State}.
 
-cleanup_after_confirm(AckTags, State = #q{delayed_stop        = DS,
-                                          unconfirmed         = UC,
-                                          backing_queue       = BQ,
-                                          backing_queue_state = BQS}) ->
-    {_Guids, BQS1} = BQ:ack(AckTags, BQS),
-    State1 = State#q{backing_queue_state = BQS1},
-    case dtree:is_empty(UC) andalso DS =/= undefined of
-        true  -> case DS of
-                     {_, {_, noreply}}  -> ok;
-                     {_, {From, Reply}} -> gen_server2:reply(From, Reply)
-                 end,
-                 {Reason, _} = DS,
-                 {stop, Reason, State1};
-        false -> noreply(State1)
-    end.
 
-detect_dead_letter_cycles(#basic_message{content = Content}, Queues) ->
+detect_dead_letter_cycles(expired,
+                          #basic_message{content = Content}, Queues) ->
     #content{properties = #'P_basic'{headers = Headers}} =
         rabbit_binary_parser:ensure_content_decoded(Content),
     NoCycles = {Queues, []},
@@ -823,38 +856,56 @@ detect_dead_letter_cycles(#basic_message{content = Content}, Queues) ->
             NoCycles;
         _ ->
             case rabbit_misc:table_lookup(Headers, <<"x-death">>) of
-                {array, DeathTables} ->
-                    OldQueues = [rabbit_misc:table_lookup(D, <<"queue">>) ||
-                                    {table, D} <- DeathTables],
-                    OldQueues1 = [QName || {longstr, QName} <- OldQueues],
-                    OldQueuesSet = ordsets:from_list(OldQueues1),
+                {array, Deaths} ->
                     {Cycling, NotCycling} =
                         lists:partition(
-                          fun(Queue) ->
-                                  ordsets:is_element(Queue#resource.name,
-                                                     OldQueuesSet)
+                          fun (#resource{name = Queue}) ->
+                                  is_dead_letter_cycle(Queue, Deaths)
                           end, Queues),
+                    OldQueues = [rabbit_misc:table_lookup(D, <<"queue">>) ||
+                                    {table, D} <- Deaths],
+                    OldQueues1 = [QName || {longstr, QName} <- OldQueues],
                     {NotCycling, [[QName | OldQueues1] ||
                                      #resource{name = QName} <- Cycling]};
                 _ ->
                     NoCycles
             end
+    end;
+detect_dead_letter_cycles(_Reason, _Msg, Queues) ->
+    {Queues, []}.
+
+is_dead_letter_cycle(Queue, Deaths) ->
+    {Cycle, Rest} =
+        lists:splitwith(
+          fun ({table, D}) ->
+                  {longstr, Queue} =/= rabbit_misc:table_lookup(D, <<"queue">>);
+              (_) ->
+                  true
+          end, Deaths),
+    %% Is there a cycle, and if so, is it entirely due to expiry?
+    case Rest of
+        []    -> false;
+        [H|_] -> lists:all(
+                   fun ({table, D}) ->
+                           {longstr, <<"expired">>} =:=
+                               rabbit_misc:table_lookup(D, <<"reason">>);
+                       (_) ->
+                           false
+                   end, Cycle ++ [H])
     end.
 
-make_dead_letter_msg(Reason,
-                     Msg = #basic_message{content       = Content,
+make_dead_letter_msg(Msg = #basic_message{content       = Content,
                                           exchange_name = Exchange,
                                           routing_keys  = RoutingKeys},
-                     State = #q{dlx = DLX, dlx_routing_key = DlxRoutingKey}) ->
+                     Reason, DLX, RK, #resource{name = QName}) ->
     {DeathRoutingKeys, HeadersFun1} =
-        case DlxRoutingKey of
+        case RK of
             undefined -> {RoutingKeys, fun (H) -> H end};
-            _         -> {[DlxRoutingKey],
-                          fun (H) -> lists:keydelete(<<"CC">>, 1, H) end}
+            _         -> {[RK], fun (H) -> lists:keydelete(<<"CC">>, 1, H) end}
         end,
     ReasonBin = list_to_binary(atom_to_list(Reason)),
-    #resource{name = QName} = qname(State),
     TimeSec = rabbit_misc:now_ms() div 1000,
+    PerMsgTTL = per_msg_ttl_header(Content#content.properties),
     HeadersFun2 =
         fun (Headers) ->
                 %% The first routing key is the one specified in the
@@ -865,47 +916,29 @@ make_dead_letter_msg(Reason,
                         {<<"queue">>,        longstr,   QName},
                         {<<"time">>,         timestamp, TimeSec},
                         {<<"exchange">>,     longstr,   Exchange#resource.name},
-                        {<<"routing-keys">>, array,     RKs1}],
-                HeadersFun1(rabbit_basic:append_table_header(<<"x-death">>,
-                                                             Info, Headers))
+                        {<<"routing-keys">>, array,     RKs1}] ++ PerMsgTTL,
+                HeadersFun1(rabbit_basic:prepend_table_header(<<"x-death">>,
+                                                              Info, Headers))
         end,
-    Content1 = rabbit_basic:map_headers(HeadersFun2, Content),
-    Msg#basic_message{exchange_name = DLX, id = rabbit_guid:gen(),
-                      routing_keys = DeathRoutingKeys, content = Content1}.
+    Content1 = #content{properties = Props} =
+        rabbit_basic:map_headers(HeadersFun2, Content),
+    Content2 = Content1#content{properties =
+                                    Props#'P_basic'{expiration = undefined}},
+    Msg#basic_message{exchange_name = DLX,
+                      id            = rabbit_guid:gen(),
+                      routing_keys  = DeathRoutingKeys,
+                      content       = Content2}.
+
+per_msg_ttl_header(#'P_basic'{expiration = undefined}) ->
+    [];
+per_msg_ttl_header(#'P_basic'{expiration = Expiration}) ->
+    [{<<"original-expiration">>, longstr, Expiration}];
+per_msg_ttl_header(_) ->
+    [].
 
 now_micros() -> timer:now_diff(now(), {0,0,0}).
 
-infos(Items, State) ->
-    {Prefix, Items1} =
-        case lists:member(synchronised_slave_pids, Items) of
-            true  -> Prefix1 = slaves_status(State),
-                     case lists:member(slave_pids, Items) of
-                         true  -> {Prefix1, Items -- [slave_pids]};
-                         false -> {proplists:delete(slave_pids, Prefix1), Items}
-                     end;
-            false -> {[], Items}
-        end,
-    Prefix ++ [{Item, i(Item, State)}
-               || Item <- (Items1 -- [synchronised_slave_pids])].
-
-slaves_status(#q{q = #amqqueue{name = Name}}) ->
-    case rabbit_amqqueue:lookup(Name) of
-        {ok, #amqqueue{mirror_nodes = undefined}} ->
-            [{slave_pids, ''}, {synchronised_slave_pids, ''}];
-        {ok, #amqqueue{slave_pids = SPids}} ->
-            {Results, _Bad} =
-                delegate:invoke(SPids, fun rabbit_mirror_queue_slave:info/1),
-            {SPids1, SSPids} =
-                lists:foldl(
-                  fun ({Pid, Infos}, {SPidsN, SSPidsN}) ->
-                          {[Pid | SPidsN],
-                           case proplists:get_bool(is_synchronised, Infos) of
-                               true  -> [Pid | SSPidsN];
-                               false -> SSPidsN
-                           end}
-                  end, {[], []}, Results),
-            [{slave_pids, SPids1}, {synchronised_slave_pids, SSPids}]
-    end.
+infos(Items, State) -> [{Item, i(Item, State)} || Item <- Items].
 
 i(name,        #q{q = #amqqueue{name        = Name}})       -> Name;
 i(durable,     #q{q = #amqqueue{durable     = Durable}})    -> Durable;
@@ -917,6 +950,12 @@ i(owner_pid, #q{q = #amqqueue{exclusive_owner = none}}) ->
     '';
 i(owner_pid, #q{q = #amqqueue{exclusive_owner = ExclusiveOwner}}) ->
     ExclusiveOwner;
+i(policy,    #q{q = #amqqueue{name = Name}}) ->
+    {ok, Q} = rabbit_amqqueue:lookup(Name),
+    case rabbit_policy:name(Q) of
+        none   -> '';
+        Policy -> Policy
+    end;
 i(exclusive_consumer_pid, #q{exclusive_consumer = none}) ->
     '';
 i(exclusive_consumer_pid, #q{exclusive_consumer = {ChPid, _ConsumerTag}}) ->
@@ -928,7 +967,7 @@ i(exclusive_consumer_tag, #q{exclusive_consumer = {_ChPid, ConsumerTag}}) ->
 i(messages_ready, #q{backing_queue_state = BQS, backing_queue = BQ}) ->
     BQ:len(BQS);
 i(messages_unacknowledged, _) ->
-    lists:sum([sets:size(C#cr.acktags) || C <- all_ch_record()]);
+    lists:sum([queue:len(C#cr.acktags) || C <- all_ch_record()]);
 i(messages, State) ->
     lists:sum([i(Item, State) || Item <- [messages_ready,
                                           messages_unacknowledged]]);
@@ -938,10 +977,21 @@ i(memory, _) ->
     {memory, M} = process_info(self(), memory),
     M;
 i(slave_pids, #q{q = #amqqueue{name = Name}}) ->
-    case rabbit_amqqueue:lookup(Name) of
-        {ok, #amqqueue{mirror_nodes = undefined}} -> [];
-        {ok, #amqqueue{slave_pids = SPids}}       -> SPids
+    {ok, Q = #amqqueue{slave_pids = SPids}} =
+        rabbit_amqqueue:lookup(Name),
+    case rabbit_mirror_queue_misc:is_mirrored(Q) of
+        false -> '';
+        true  -> SPids
     end;
+i(synchronised_slave_pids, #q{q = #amqqueue{name = Name}}) ->
+    {ok, Q = #amqqueue{sync_slave_pids = SSPids}} =
+        rabbit_amqqueue:lookup(Name),
+    case rabbit_mirror_queue_misc:is_mirrored(Q) of
+        false -> '';
+        true  -> SSPids
+    end;
+i(status, #q{status = Status}) ->
+    Status;
 i(backing_queue_status, #q{backing_queue_state = BQS, backing_queue = BQ}) ->
     BQ:status(BQS);
 i(Item, _) ->
@@ -963,47 +1013,41 @@ emit_stats(State) ->
 emit_stats(State, Extra) ->
     rabbit_event:notify(queue_stats, Extra ++ infos(?STATISTICS_KEYS, State)).
 
-emit_consumer_created(ChPid, ConsumerTag, Exclusive, AckRequired) ->
+emit_consumer_created(ChPid, ConsumerTag, Exclusive, AckRequired, QName) ->
     rabbit_event:notify(consumer_created,
                         [{consumer_tag, ConsumerTag},
                          {exclusive,    Exclusive},
                          {ack_required, AckRequired},
                          {channel,      ChPid},
-                         {queue,        self()}]).
+                         {queue,        QName}]).
 
-emit_consumer_deleted(ChPid, ConsumerTag) ->
+emit_consumer_deleted(ChPid, ConsumerTag, QName) ->
     rabbit_event:notify(consumer_deleted,
                         [{consumer_tag, ConsumerTag},
                          {channel,      ChPid},
-                         {queue,        self()}]).
+                         {queue,        QName}]).
 
 %%----------------------------------------------------------------------------
 
-prioritise_call(Msg, _From, _State) ->
+prioritise_call(Msg, _From, _Len, _State) ->
     case Msg of
         info                                 -> 9;
         {info, _Items}                       -> 9;
         consumers                            -> 9;
-        {basic_consume, _, _, _, _, _, _}    -> 7;
-        {basic_cancel, _, _, _}              -> 7;
         stat                                 -> 7;
         _                                    -> 0
     end.
 
-prioritise_cast(Msg, _State) ->
+prioritise_cast(Msg, _Len, _State) ->
     case Msg of
         delete_immediately                   -> 8;
         {set_ram_duration_target, _Duration} -> 8;
         {set_maximum_since_use, _Age}        -> 8;
-        {ack, _AckTags, _ChPid}              -> 7;
-        {reject, _AckTags, _Requeue, _ChPid} -> 7;
-        {notify_sent, _ChPid, _Credit}       -> 7;
-        {unblock, _ChPid}                    -> 7;
         {run_backing_queue, _Mod, _Fun}      -> 6;
         _                                    -> 0
     end.
 
-prioritise_info(Msg, #q{q = #amqqueue{exclusive_owner = DownPid}}) ->
+prioritise_info(Msg, _Len, #q{q = #amqqueue{exclusive_owner = DownPid}}) ->
     case Msg of
         {'DOWN', _, process, DownPid, _}     -> 8;
         update_ram_duration                  -> 8;
@@ -1014,9 +1058,6 @@ prioritise_info(Msg, #q{q = #amqqueue{exclusive_owner = DownPid}}) ->
         _                                    -> 0
     end.
 
-handle_call(_, _, State = #q{delayed_stop = DS}) when DS =/= undefined ->
-    noreply(State);
-
 handle_call({init, Recover}, From,
             State = #q{q = #amqqueue{exclusive_owner = none}}) ->
     declare(Recover, From, State);
@@ -1026,17 +1067,21 @@ handle_call({init, Recover}, From,
     case rabbit_misc:is_process_alive(Owner) of
         true  -> erlang:monitor(process, Owner),
                  declare(Recover, From, State);
-        false -> #q{backing_queue = BQ, backing_queue_state = undefined,
-                    q = #amqqueue{name = QName} = Q} = State,
+        false -> #q{backing_queue       = undefined,
+                    backing_queue_state = undefined,
+                    q                   = #amqqueue{name = QName} = Q} = State,
                  gen_server2:reply(From, not_found),
                  case Recover of
-                     true -> ok;
-                     _    -> rabbit_log:warning(
-                               "Queue ~p exclusive owner went away~n", [QName])
+                     new -> rabbit_log:warning(
+                              "Queue ~p exclusive owner went away~n",
+                              [rabbit_misc:rs(QName)]);
+                     _   -> ok
                  end,
+                 BQ = backing_queue_module(Q),
                  BQS = bq_init(BQ, Q, Recover),
                  %% Rely on terminate to delete the queue.
-                 {stop, normal, State#q{backing_queue_state = BQS}}
+                 {stop, {shutdown, missing_owner},
+                  State#q{backing_queue = BQ, backing_queue_state = BQS}}
     end;
 
 handle_call(info, _From, State) ->
@@ -1051,30 +1096,12 @@ handle_call({info, Items}, _From, State) ->
 handle_call(consumers, _From, State) ->
     reply(consumers(State), State);
 
-handle_call({deliver, Delivery = #delivery{immediate = true}}, _From, State) ->
-    %% FIXME: Is this correct semantics?
-    %%
-    %% I'm worried in particular about the case where an exchange has
-    %% two queues against a particular routing key, and a message is
-    %% sent in immediate mode through the binding. In non-immediate
-    %% mode, both queues get the message, saving it for later if
-    %% there's noone ready to receive it just now. In immediate mode,
-    %% should both queues still get the message, somehow, or should
-    %% just all ready-to-consume queues get the message, with unready
-    %% queues discarding the message?
-    %%
-    Confirm = should_confirm_message(Delivery, State),
-    {Delivered, State1} = attempt_delivery(Delivery, Confirm, State),
-    reply(Delivered, case Delivered of
-                         true  -> maybe_record_confirm_message(Confirm, State1);
-                         false -> discard_delivery(Delivery, State1)
-                     end);
-
-handle_call({deliver, Delivery = #delivery{mandatory = true}}, From, State) ->
-    gen_server2:reply(From, true),
-    noreply(deliver_or_enqueue(Delivery, State));
-
-handle_call({notify_down, ChPid}, From, State) ->
+handle_call({deliver, Delivery, Delivered}, From, State) ->
+    %% Synchronous, "mandatory" deliver mode.
+    gen_server2:reply(From, ok),
+    noreply(deliver_or_enqueue(Delivery, Delivered, State));
+
+handle_call({notify_down, ChPid}, _From, State) ->
     %% we want to do this synchronously, so that auto_deleted queues
     %% are no longer visible by the time we send a response to the
     %% client.  The queue is ultimately deleted in terminate/2; if we
@@ -1082,71 +1109,87 @@ handle_call({notify_down, ChPid}, From, State) ->
     %% gen_server2 *before* the reply is sent.
     case handle_ch_down(ChPid, State) of
         {ok, State1}   -> reply(ok, State1);
-        {stop, State1} -> stop_later(normal, From, ok, State1)
+        {stop, State1} -> stop(ok, State1)
     end;
 
-handle_call({basic_get, ChPid, NoAck}, _From,
+handle_call({basic_get, ChPid, NoAck, LimiterPid}, _From,
             State = #q{q = #amqqueue{name = QName}}) ->
     AckRequired = not NoAck,
     State1 = ensure_expiry_timer(State),
-    case fetch(AckRequired, drop_expired_messages(State1)) of
+    case fetch(AckRequired, State1) of
         {empty, State2} ->
             reply(empty, State2);
-        {{Message, IsDelivered, AckTag, Remaining}, State2} ->
-            State3 =
+        {{Message, IsDelivered, AckTag}, State2} ->
+            State3 = #q{backing_queue = BQ, backing_queue_state = BQS} =
                 case AckRequired of
-                    true  -> C = #cr{acktags = ChAckTags} = ch_record(ChPid),
-                             ChAckTags1 = sets:add_element(AckTag, ChAckTags),
+                    true  -> C = #cr{acktags = ChAckTags} =
+                                 ch_record(ChPid, LimiterPid),
+                             ChAckTags1 = queue:in(AckTag, ChAckTags),
                              update_ch_record(C#cr{acktags = ChAckTags1}),
                              State2;
                     false -> State2
                 end,
             Msg = {QName, self(), AckTag, IsDelivered, Message},
-            reply({ok, Remaining, Msg}, State3)
+            reply({ok, BQ:len(BQS), Msg}, State3)
     end;
 
-handle_call({basic_consume, NoAck, ChPid, Limiter,
-             ConsumerTag, ExclusiveConsume, OkMsg},
-            _From, State = #q{exclusive_consumer = ExistingHolder}) ->
-    case check_exclusive_access(ExistingHolder, ExclusiveConsume,
-                                State) of
+handle_call({basic_consume, NoAck, ChPid, LimiterPid, LimiterActive,
+             ConsumerTag, ExclusiveConsume, CreditArgs, OkMsg},
+            _From, State = #q{exclusive_consumer = Holder}) ->
+    case check_exclusive_access(Holder, ExclusiveConsume, State) of
         in_use ->
             reply({error, exclusive_consume_unavailable}, State);
         ok ->
-            C = ch_record(ChPid),
-            C1 = update_consumer_count(C#cr{limiter = Limiter}, +1),
+            C = #cr{consumer_count = Count,
+                    limiter        = Limiter} = ch_record(ChPid, LimiterPid),
+            Limiter1 = case LimiterActive of
+                           true  -> rabbit_limiter:activate(Limiter);
+                           false -> Limiter
+                       end,
+            Limiter2 = case CreditArgs of
+                           none         -> Limiter1;
+                           {Crd, Drain} -> rabbit_limiter:credit(
+                                             Limiter1, ConsumerTag, Crd, Drain)
+                       end,
+            C1 = update_ch_record(C#cr{consumer_count = Count + 1,
+                                       limiter        = Limiter2}),
+            case is_empty(State) of
+                true  -> send_drained(C1);
+                false -> ok
+            end,
             Consumer = #consumer{tag = ConsumerTag,
                                  ack_required = not NoAck},
             ExclusiveConsumer = if ExclusiveConsume -> {ChPid, ConsumerTag};
-                                   true             -> ExistingHolder
+                                   true             -> Holder
                                 end,
             State1 = State#q{has_had_consumers = true,
                              exclusive_consumer = ExclusiveConsumer},
             ok = maybe_send_reply(ChPid, OkMsg),
-            E = {ChPid, Consumer},
-            State2 =
-                case is_ch_blocked(C1) of
-                    true  -> block_consumer(C1, E),
-                             State1;
-                    false -> update_ch_record(C1),
-                             AC1 = queue:in(E, State1#q.active_consumers),
-                             run_message_queue(State1#q{active_consumers = AC1})
-                end,
             emit_consumer_created(ChPid, ConsumerTag, ExclusiveConsume,
-                                  not NoAck),
-            reply(ok, State2)
+                                  not NoAck, qname(State1)),
+            AC1 = queue:in({ChPid, Consumer}, State1#q.active_consumers),
+            reply(ok, run_message_queue(State1#q{active_consumers = AC1}))
     end;
 
-handle_call({basic_cancel, ChPid, ConsumerTag, OkMsg}, From,
+handle_call({basic_cancel, ChPid, ConsumerTag, OkMsg}, _From,
             State = #q{exclusive_consumer = Holder}) ->
     ok = maybe_send_reply(ChPid, OkMsg),
     case lookup_ch(ChPid) of
         not_found ->
             reply(ok, State);
-        C = #cr{blocked_consumers = Blocked} ->
-            emit_consumer_deleted(ChPid, ConsumerTag),
+        C = #cr{consumer_count    = Count,
+                limiter           = Limiter,
+                blocked_consumers = Blocked} ->
+            emit_consumer_deleted(ChPid, ConsumerTag, qname(State)),
             Blocked1 = remove_consumer(ChPid, ConsumerTag, Blocked),
-            update_consumer_count(C#cr{blocked_consumers = Blocked1}, -1),
+            Limiter1 = case Count of
+                           1 -> rabbit_limiter:deactivate(Limiter);
+                           _ -> Limiter
+                       end,
+            Limiter2 = rabbit_limiter:forget_consumer(Limiter1, ConsumerTag),
+            update_ch_record(C#cr{consumer_count    = Count - 1,
+                                  limiter           = Limiter2,
+                                  blocked_consumers = Blocked1}),
             State1 = State#q{
                        exclusive_consumer = case Holder of
                                                 {ChPid, ConsumerTag} -> none;
@@ -1154,109 +1197,120 @@ handle_call({basic_cancel, ChPid, ConsumerTag, OkMsg}, From,
                                             end,
                        active_consumers   = remove_consumer(
                                               ChPid, ConsumerTag,
-                                             State#q.active_consumers)},
+                                              State#q.active_consumers)},
             case should_auto_delete(State1) of
                 false -> reply(ok, ensure_expiry_timer(State1));
-                true  -> stop_later(normal, From, ok, State1)
+                true  -> stop(ok, State1)
             end
     end;
 
 handle_call(stat, _From, State) ->
     State1 = #q{backing_queue = BQ, backing_queue_state = BQS} =
-        drop_expired_messages(ensure_expiry_timer(State)),
-    reply({ok, BQ:len(BQS), active_consumer_count()}, State1);
+        ensure_expiry_timer(State),
+    reply({ok, BQ:len(BQS), consumer_count()}, State1);
 
-handle_call({delete, IfUnused, IfEmpty}, From,
+handle_call({delete, IfUnused, IfEmpty}, _From,
             State = #q{backing_queue_state = BQS, backing_queue = BQ}) ->
-    IsEmpty = BQ:is_empty(BQS),
+    IsEmpty  = BQ:is_empty(BQS),
     IsUnused = is_unused(State),
     if
-        IfEmpty and not(IsEmpty)   -> reply({error, not_empty}, State);
-        IfUnused and not(IsUnused) -> reply({error, in_use}, State);
-        true                       -> stop_later(normal, From,
-                                                 {ok, BQ:len(BQS)}, State)
+        IfEmpty  and not(IsEmpty)  -> reply({error, not_empty}, State);
+        IfUnused and not(IsUnused) -> reply({error,    in_use}, State);
+        true                       -> stop({ok, BQ:len(BQS)}, State)
     end;
 
 handle_call(purge, _From, State = #q{backing_queue       = BQ,
                                      backing_queue_state = BQS}) ->
     {Count, BQS1} = BQ:purge(BQS),
-    reply({ok, Count}, State#q{backing_queue_state = BQS1});
+    State1 = State#q{backing_queue_state = BQS1},
+    reply({ok, Count}, maybe_send_drained(Count =:= 0, State1));
 
 handle_call({requeue, AckTags, ChPid}, From, State) ->
     gen_server2:reply(From, ok),
-    noreply(subtract_acks(
-              ChPid, AckTags, State,
-              fun (State1) -> requeue_and_run(AckTags, State1) end));
+    noreply(requeue(AckTags, ChPid, State));
+
+handle_call(sync_mirrors, _From,
+            State = #q{backing_queue       = rabbit_mirror_queue_master,
+                       backing_queue_state = BQS}) ->
+    S = fun(BQSN) -> State#q{backing_queue_state = BQSN} end,
+    HandleInfo = fun (Status) ->
+                         receive {'$gen_call', From, {info, Items}} ->
+                                 Infos = infos(Items, State#q{status = Status}),
+                                 gen_server2:reply(From, {ok, Infos})
+                         after 0 ->
+                                 ok
+                         end
+                 end,
+    EmitStats = fun (Status) ->
+                        rabbit_event:if_enabled(
+                          State, #q.stats_timer,
+                          fun() -> emit_stats(State#q{status = Status}) end)
+                end,
+    case rabbit_mirror_queue_master:sync_mirrors(HandleInfo, EmitStats, BQS) of
+        {ok, BQS1}           -> reply(ok, S(BQS1));
+        {stop, Reason, BQS1} -> {stop, Reason, S(BQS1)}
+    end;
+
+handle_call(sync_mirrors, _From, State) ->
+    reply({error, not_mirrored}, State);
+
+%% By definition if we get this message here we do not have to do anything.
+handle_call(cancel_sync_mirrors, _From, State) ->
+    reply({ok, not_syncing}, State);
 
 handle_call(force_event_refresh, _From,
             State = #q{exclusive_consumer = Exclusive}) ->
     rabbit_event:notify(queue_created, infos(?CREATION_EVENT_KEYS, State)),
+    QName = qname(State),
     case Exclusive of
-        none       -> [emit_consumer_created(Ch, CTag, false, AckRequired) ||
+        none       -> [emit_consumer_created(
+                         Ch, CTag, false, AckRequired, QName) ||
                           {Ch, CTag, AckRequired} <- consumers(State)];
         {Ch, CTag} -> [{Ch, CTag, AckRequired}] = consumers(State),
-                      emit_consumer_created(Ch, CTag, true, AckRequired)
+                      emit_consumer_created(Ch, CTag, true, AckRequired, QName)
     end,
     reply(ok, State).
 
-handle_cast({confirm, MsgSeqNos, QPid}, State = #q{unconfirmed = UC}) ->
-    {MsgSeqNoAckTags, UC1} = dtree:take(MsgSeqNos, QPid, UC),
-    State1 = case dtree:is_defined(QPid, UC1) of
-                 false -> QMons = State#q.queue_monitors,
-                          State#q{queue_monitors = pmon:demonitor(QPid, QMons)};
-                 true  -> State
-             end,
-    cleanup_after_confirm([AckTag || {_MsgSeqNo, AckTag} <- MsgSeqNoAckTags],
-                          State1#q{unconfirmed = UC1});
-
-handle_cast(_, State = #q{delayed_stop = DS}) when DS =/= undefined ->
-    noreply(State);
-
-handle_cast({run_backing_queue, Mod, Fun}, State) ->
-    noreply(run_backing_queue(Mod, Fun, State));
+handle_cast({run_backing_queue, Mod, Fun},
+            State = #q{backing_queue = BQ, backing_queue_state = BQS}) ->
+    noreply(State#q{backing_queue_state = BQ:invoke(Mod, Fun, BQS)});
 
-handle_cast({deliver, Delivery = #delivery{sender = Sender}, Flow},
+handle_cast({deliver, Delivery = #delivery{sender = Sender}, Delivered, Flow},
             State = #q{senders = Senders}) ->
-    %% Asynchronous, non-"mandatory", non-"immediate" deliver mode.
+    %% Asynchronous, non-"mandatory" deliver mode.
     Senders1 = case Flow of
                    flow   -> credit_flow:ack(Sender),
                              pmon:monitor(Sender, Senders);
                    noflow -> Senders
                end,
     State1 = State#q{senders = Senders1},
-    noreply(deliver_or_enqueue(Delivery, State1));
+    noreply(deliver_or_enqueue(Delivery, Delivered, State1));
 
 handle_cast({ack, AckTags, ChPid}, State) ->
-    noreply(subtract_acks(
-              ChPid, AckTags, State,
-              fun (State1 = #q{backing_queue       = BQ,
-                               backing_queue_state = BQS}) ->
-                      {_Guids, BQS1} = BQ:ack(AckTags, BQS),
-                      State1#q{backing_queue_state = BQS1}
-              end));
-
-handle_cast({reject, AckTags, Requeue, ChPid}, State) ->
-    noreply(subtract_acks(
-              ChPid, AckTags, State,
-              case Requeue of
-                  true  -> fun (State1) -> requeue_and_run(AckTags, State1) end;
-                  false -> fun (State1 = #q{backing_queue       = BQ,
-                                            backing_queue_state = BQS}) ->
-                                   Fun = dead_letter_fun(rejected, State1),
-                                   BQS1 = BQ:fold(Fun, BQS, AckTags),
-                                   ack_if_no_dlx(
-                                     AckTags,
-                                     State1#q{backing_queue_state = BQS1})
-                           end
-              end));
+    noreply(ack(AckTags, ChPid, State));
+
+handle_cast({reject, AckTags, true, ChPid}, State) ->
+    noreply(requeue(AckTags, ChPid, State));
+
+handle_cast({reject, AckTags, false, ChPid}, State) ->
+    noreply(with_dlx(
+              State#q.dlx,
+              fun (X) -> subtract_acks(ChPid, AckTags, State,
+                                       fun (State1) ->
+                                               dead_letter_rejected_msgs(
+                                                 AckTags, X, State1)
+                                       end) end,
+              fun () -> ack(AckTags, ChPid, State) end));
 
 handle_cast(delete_immediately, State) ->
-    stop_later(normal, State);
+    stop(State);
 
-handle_cast({unblock, ChPid}, State) ->
+handle_cast({resume, ChPid}, State) ->
     noreply(
       possibly_unblock(State, ChPid,
-                       fun (C) -> C#cr{is_limit_active = false} end));
+                       fun (C = #cr{limiter = Limiter}) ->
+                               C#cr{limiter = rabbit_limiter:resume(Limiter)}
+                       end));
 
 handle_cast({notify_sent, ChPid, Credit}, State) ->
     noreply(
@@ -1265,21 +1319,12 @@ handle_cast({notify_sent, ChPid, Credit}, State) ->
                                C#cr{unsent_message_count = Count - Credit}
                        end));
 
-handle_cast({limit, ChPid, Limiter}, State) ->
+handle_cast({activate_limit, ChPid}, State) ->
     noreply(
-      possibly_unblock(
-        State, ChPid,
-        fun (C = #cr{consumer_count  = ConsumerCount,
-                     limiter         = OldLimiter,
-                     is_limit_active = OldLimited}) ->
-                case (ConsumerCount =/= 0 andalso
-                      not rabbit_limiter:is_enabled(OldLimiter)) of
-                    true  -> ok = rabbit_limiter:register(Limiter, self());
-                    false -> ok
-                end,
-                Limited = OldLimited andalso rabbit_limiter:is_enabled(Limiter),
-                C#cr{limiter = Limiter, is_limit_active = Limited}
-        end));
+      possibly_unblock(State, ChPid,
+                       fun (C = #cr{limiter = Limiter}) ->
+                               C#cr{limiter = rabbit_limiter:activate(Limiter)}
+                       end));
 
 handle_cast({flush, ChPid}, State) ->
     ok = rabbit_channel:flushed(ChPid, self()),
@@ -1294,33 +1339,61 @@ handle_cast({set_maximum_since_use, Age}, State) ->
     ok = file_handle_cache:set_maximum_since_use(Age),
     noreply(State);
 
-handle_cast({dead_letter, {Msg, AckTag}, Reason}, State) ->
-    dead_letter_msg(Msg, AckTag, Reason, State).
-
-%% We need to not ignore this as we need to remove outstanding
-%% confirms due to queue death.
-handle_info({'DOWN', _MonitorRef, process, DownPid, Reason},
-            State = #q{delayed_stop = DS}) when DS =/= undefined ->
-    handle_queue_down(DownPid, Reason, State);
-
-handle_info(_, State = #q{delayed_stop = DS}) when DS =/= undefined ->
-    noreply(State);
+handle_cast(start_mirroring, State = #q{backing_queue       = BQ,
+					backing_queue_state = BQS}) ->
+    %% lookup again to get policy for init_with_existing_bq
+    {ok, Q} = rabbit_amqqueue:lookup(qname(State)),
+    true = BQ =/= rabbit_mirror_queue_master, %% assertion
+    BQ1 = rabbit_mirror_queue_master,
+    BQS1 = BQ1:init_with_existing_bq(Q, BQ, BQS),
+    noreply(State#q{backing_queue       = BQ1,
+		    backing_queue_state = BQS1});
+
+handle_cast(stop_mirroring, State = #q{backing_queue       = BQ,
+				       backing_queue_state = BQS}) ->
+    BQ = rabbit_mirror_queue_master, %% assertion
+    {BQ1, BQS1} = BQ:stop_mirroring(BQS),
+    noreply(State#q{backing_queue       = BQ1,
+		    backing_queue_state = BQS1});
+
+handle_cast({credit, ChPid, CTag, Credit, Drain},
+            State = #q{backing_queue = BQ, backing_queue_state = BQS}) ->
+    Len = BQ:len(BQS),
+    rabbit_channel:send_credit_reply(ChPid, Len), %% always report our length
+    noreply(case lookup_ch(ChPid) of
+                not_found -> State; %% no record for ChPid: nothing to credit
+                C = #cr{limiter = Limiter} ->
+                    Lim1 = rabbit_limiter:credit(Limiter, CTag, Credit, Drain),
+                    C1 = C#cr{limiter = Lim1},
+                    case Drain andalso Len == 0 of
+                        true  -> update_ch_record(C1), send_drained(C1), State;
+                        false -> case is_ch_blocked(C1) of
+                                     true  -> update_ch_record(C1), State;
+                                     false -> unblock(State, C1)
+                                 end
+                    end
+            end);
+
+handle_cast(wake_up, State) ->
+    noreply(State).
 
 handle_info(maybe_expire, State) ->
     case is_unused(State) of
-        true  -> stop_later(normal, State);
-        false -> noreply(ensure_expiry_timer(State))
+        true  -> stop(State);
+        false -> noreply(State#q{expiry_timer_ref = undefined})
     end;
 
 handle_info(drop_expired, State) ->
-    noreply(drop_expired_messages(State#q{ttl_timer_ref = undefined}));
+    WasEmpty = is_empty(State),
+    State1 = drop_expired_msgs(State#q{ttl_timer_ref = undefined}),
+    noreply(maybe_send_drained(WasEmpty, State1));
 
 handle_info(emit_stats, State) ->
-    %% Do not invoke noreply as it would see no timer and create a new one.
     emit_stats(State),
-    State1 = rabbit_event:reset_stats_timer(State, #q.stats_timer),
-    assert_invariant(State1),
-    {noreply, State1, hibernate};
+    %% Don't call noreply/1, we don't want to set timers
+    {State1, Timeout} = next_state(rabbit_event:reset_stats_timer(
+                                     State, #q.stats_timer)),
+    {noreply, State1, Timeout};
 
 handle_info({'DOWN', _MonitorRef, process, DownPid, _Reason},
             State = #q{q = #amqqueue{exclusive_owner = DownPid}}) ->
@@ -1330,12 +1403,12 @@ handle_info({'DOWN', _MonitorRef, process, DownPid, _Reason},
     %% match what people expect (see bug 21824). However we need this
     %% monitor-and-async- delete in case the connection goes away
     %% unexpectedly.
-    stop_later(normal, State);
+    stop(State);
 
-handle_info({'DOWN', _MonitorRef, process, DownPid, Reason}, State) ->
+handle_info({'DOWN', _MonitorRef, process, DownPid, _Reason}, State) ->
     case handle_ch_down(DownPid, State) of
-        {ok, State1}   -> handle_queue_down(DownPid, Reason, State1);
-        {stop, State1} -> stop_later(normal, State1)
+        {ok, State1}   -> noreply(State1);
+        {stop, State1} -> stop(State1)
     end;
 
 handle_info(update_ram_duration, State = #q{backing_queue = BQ,
@@ -1344,8 +1417,10 @@ handle_info(update_ram_duration, State = #q{backing_queue = BQ,
     DesiredDuration =
         rabbit_memory_monitor:report_ram_duration(self(), RamDuration),
     BQS2 = BQ:set_ram_duration_target(DesiredDuration, BQS1),
-    noreply(State#q{rate_timer_ref = just_measured,
-                    backing_queue_state = BQS2});
+    %% Don't call noreply/1, we don't want to set timers
+    {State1, Timeout} = next_state(State#q{rate_timer_ref      = undefined,
+                                           backing_queue_state = BQS2}),
+    {noreply, State1, Timeout};
 
 handle_info(sync_timeout, State) ->
     noreply(backing_queue_timeout(State#q{sync_timer_ref = undefined}));
diff --git a/src/rabbit_amqqueue_sup.erl b/src/rabbit_amqqueue_sup.erl
index a4305e5f..74ae59da 100644
--- a/src/rabbit_amqqueue_sup.erl
+++ b/src/rabbit_amqqueue_sup.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_amqqueue_sup).
@@ -47,6 +47,6 @@ start_child(Node, Args) ->
     supervisor2:start_child({?SERVER, Node}, Args).
 
 init([]) ->
-    {ok, {{simple_one_for_one_terminate, 10, 10},
+    {ok, {{simple_one_for_one, 10, 10},
           [{rabbit_amqqueue, {rabbit_amqqueue_process, start_link, []},
             temporary, ?MAX_WAIT, worker, [rabbit_amqqueue_process]}]}}.
diff --git a/src/rabbit_auth_backend.erl b/src/rabbit_auth_backend.erl
index e89951e7..4ffc8c3a 100644
--- a/src/rabbit_auth_backend.erl
+++ b/src/rabbit_auth_backend.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_auth_backend).
@@ -20,7 +20,7 @@
 
 %% A description proplist as with auth mechanisms,
 %% exchanges. Currently unused.
--callback description() -> [proplist:property()].
+-callback description() -> [proplists:property()].
 
 %% Check a user can log in, given a username and a proplist of
 %% authentication information (e.g. [{password, Password}]).
diff --git a/src/rabbit_auth_backend_internal.erl b/src/rabbit_auth_backend_internal.erl
index 7b9df81e..61919d05 100644
--- a/src/rabbit_auth_backend_internal.erl
+++ b/src/rabbit_auth_backend_internal.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_auth_backend_internal).
@@ -49,7 +49,7 @@
 -spec(hash_password/1 :: (rabbit_types:password())
                          -> rabbit_types:password_hash()).
 -spec(set_tags/2 :: (rabbit_types:username(), [atom()]) -> 'ok').
--spec(list_users/0 :: () -> rabbit_types:infos()).
+-spec(list_users/0 :: () -> [rabbit_types:infos()]).
 -spec(user_info_keys/0 :: () -> rabbit_types:info_keys()).
 -spec(lookup_user/1 :: (rabbit_types:username())
                        -> rabbit_types:ok(rabbit_types:internal_user())
@@ -58,14 +58,14 @@
                            regexp(), regexp(), regexp()) -> 'ok').
 -spec(clear_permissions/2 :: (rabbit_types:username(), rabbit_types:vhost())
                              -> 'ok').
--spec(list_permissions/0 :: () -> rabbit_types:infos()).
+-spec(list_permissions/0 :: () -> [rabbit_types:infos()]).
 -spec(list_vhost_permissions/1 ::
-        (rabbit_types:vhost()) -> rabbit_types:infos()).
+        (rabbit_types:vhost()) -> [rabbit_types:infos()]).
 -spec(list_user_permissions/1 ::
-        (rabbit_types:username()) -> rabbit_types:infos()).
+        (rabbit_types:username()) -> [rabbit_types:infos()]).
 -spec(list_user_vhost_permissions/2 ::
         (rabbit_types:username(), rabbit_types:vhost())
-        -> rabbit_types:infos()).
+        -> [rabbit_types:infos()]).
 -spec(perms_info_keys/0 :: () -> rabbit_types:info_keys()).
 -spec(vhost_perms_info_keys/0 :: () -> rabbit_types:info_keys()).
 -spec(user_perms_info_keys/0 :: () -> rabbit_types:info_keys()).
@@ -203,7 +203,9 @@ hash_password(Cleartext) ->
     <<Salt/binary, Hash/binary>>.
 
 check_password(Cleartext, <<Salt:4/binary, Hash/binary>>) ->
-    Hash =:= salted_md5(Salt, Cleartext).
+    Hash =:= salted_md5(Salt, Cleartext);
+check_password(_Cleartext, _Any) ->
+    false.
 
 make_salt() ->
     {A1,A2,A3} = now(),
diff --git a/src/rabbit_auth_mechanism.erl b/src/rabbit_auth_mechanism.erl
index eda6a743..21528b11 100644
--- a/src/rabbit_auth_mechanism.erl
+++ b/src/rabbit_auth_mechanism.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_auth_mechanism).
@@ -19,7 +19,7 @@
 -ifdef(use_specs).
 
 %% A description.
--callback description() -> [proplist:property()].
+-callback description() -> [proplists:property()].
 
 %% If this mechanism is enabled, should it be offered for a given socket?
 %% (primarily so EXTERNAL can be SSL-only)
diff --git a/src/rabbit_auth_mechanism_amqplain.erl b/src/rabbit_auth_mechanism_amqplain.erl
index c0d86cd1..8e896b45 100644
--- a/src/rabbit_auth_mechanism_amqplain.erl
+++ b/src/rabbit_auth_mechanism_amqplain.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_auth_mechanism_amqplain).
@@ -33,8 +33,7 @@
 %% referring generically to "SASL security mechanism", i.e. the above.
 
 description() ->
-    [{name, <<"AMQPLAIN">>},
-     {description, <<"QPid AMQPLAIN mechanism">>}].
+    [{description, <<"QPid AMQPLAIN mechanism">>}].
 
 should_offer(_Sock) ->
     true.
diff --git a/src/rabbit_auth_mechanism_cr_demo.erl b/src/rabbit_auth_mechanism_cr_demo.erl
index 5df1d5d7..8699a9fa 100644
--- a/src/rabbit_auth_mechanism_cr_demo.erl
+++ b/src/rabbit_auth_mechanism_cr_demo.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_auth_mechanism_cr_demo).
@@ -37,8 +37,7 @@
 %% SECURE-OK: "My password is ~s", [Password]
 
 description() ->
-    [{name, <<"RABBIT-CR-DEMO">>},
-     {description, <<"RabbitMQ Demo challenge-response authentication "
+    [{description, <<"RabbitMQ Demo challenge-response authentication "
                      "mechanism">>}].
 
 should_offer(_Sock) ->
diff --git a/src/rabbit_auth_mechanism_plain.erl b/src/rabbit_auth_mechanism_plain.erl
index 423170e1..5ab22e75 100644
--- a/src/rabbit_auth_mechanism_plain.erl
+++ b/src/rabbit_auth_mechanism_plain.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_auth_mechanism_plain).
@@ -31,13 +31,11 @@
 %% SASL PLAIN, as used by the Qpid Java client and our clients. Also,
 %% apparently, by OpenAMQ.
 
-%% TODO: once the minimum erlang becomes R13B03, reimplement this
-%% using the binary module - that makes use of BIFs to do binary
-%% matching and will thus be much faster.
+%% TODO: reimplement this using the binary module? - that makes use of
+%% BIFs to do binary matching and will thus be much faster.
 
 description() ->
-    [{name, <<"PLAIN">>},
-     {description, <<"SASL PLAIN authentication mechanism">>}].
+    [{description, <<"SASL PLAIN authentication mechanism">>}].
 
 should_offer(_Sock) ->
     true.
diff --git a/src/rabbit_autoheal.erl b/src/rabbit_autoheal.erl
new file mode 100644
index 00000000..a5b91867
--- /dev/null
+++ b/src/rabbit_autoheal.erl
@@ -0,0 +1,199 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License
+%% at http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and
+%% limitations under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
+%%
+
+-module(rabbit_autoheal).
+
+-export([init/0, maybe_start/1, node_down/2, handle_msg/3]).
+
+%% The named process we are running in.
+-define(SERVER, rabbit_node_monitor).
+
+%%----------------------------------------------------------------------------
+
+%% In order to autoheal we want to:
+%%
+%% * Find the winning partition
+%% * Stop all nodes in other partitions
+%% * Wait for them all to be stopped
+%% * Start them again
+%%
+%% To keep things simple, we assume all nodes are up. We don't start
+%% unless all nodes are up, and if a node goes down we abandon the
+%% whole process. To further keep things simple we also defer the
+%% decision as to the winning node to the "leader" - arbitrarily
+%% selected as the first node in the cluster.
+%%
+%% To coordinate the restarting nodes we pick a special node from the
+%% winning partition - the "winner". Restarting nodes then stop, tell
+%% the winner they have done so, and wait for it to tell them it is
+%% safe to start again. The winner and the leader are not necessarily
+%% the same node.
+%%
+%% Possible states:
+%%
+%% not_healing
+%%   - the default
+%%
+%% {winner_waiting, OutstandingStops, Notify}
+%%   - we are the winner and are waiting for all losing nodes to stop
+%%   before telling them they can restart
+%%
+%% restarting
+%%   - we are restarting. Of course the node monitor immediately dies
+%%   then so this state does not last long. We therefore send the
+%%   autoheal_safe_to_start message to the rabbit_outside_app_process
+%%   instead.
+
+%%----------------------------------------------------------------------------
+
+init() -> not_healing. %% the default state: no autoheal in progress
+
+%% If idle and autoheal is enabled, ask the leader (the first node of the
+%% sorted cluster list) to begin healing; the state is returned unchanged.
+maybe_start(State) ->
+    maybe_start(State, State =:= not_healing andalso enabled()).
+maybe_start(State, false) -> State;
+maybe_start(State, true) ->
+    [Leader | _] = lists:usort(rabbit_mnesia:cluster_nodes(all)),
+    send(Leader, {request_start, node()}),
+    rabbit_log:info("Autoheal request sent to ~p~n", [Leader]),
+    State.
+
+enabled() -> %% true only when cluster_partition_handling is set to autoheal
+    application:get_env(rabbit, cluster_partition_handling) =:= {ok, autoheal}.
+
+%% A node went down. A waiting winner or an idle node keeps its state; in any
+%% other state the autoheal is abandoned and we fall back to not_healing.
+node_down(_Down, Idle = not_healing)               -> Idle;
+node_down(_Down, Waiting = {winner_waiting, _, _}) -> Waiting;
+node_down(Down, _Other) ->
+    rabbit_log:info("Autoheal: aborting - ~p went down~n", [Down]),
+    not_healing.
+
+%% By receiving this message while idle we become the leader: if every node
+%% is up we pick the winner and tell each node its role.
+%% TODO should we try to debounce this?
+handle_msg({request_start, From}, not_healing, Partitions) ->
+    rabbit_log:info("Autoheal request received from ~p~n", [From]),
+    case rabbit_node_monitor:all_rabbit_nodes_up() of
+        false -> not_healing;
+        true  ->
+            All = all_partitions(Partitions),
+            {Winner, Losers} = make_decision(All),
+            rabbit_log:info("Autoheal decision~n"
+                            "  * Partitions: ~p~n"
+                            "  * Winner:     ~p~n"
+                            "  * Losers:     ~p~n",
+                            [All, Winner, Losers]),
+            send(Winner, {become_winner, Losers}),
+            lists:foreach(fun (L) -> send(L, {winner_is, Winner}) end, Losers),
+            not_healing
+    end;
+
+%% Not idle: an autoheal is already under way here, so ignore the request.
+handle_msg({request_start, From}, Busy, _Partitions) ->
+    rabbit_log:info("Autoheal request received from ~p when in state ~p; "
+                    "ignoring~n", [From, Busy]),
+    Busy;
+
+handle_msg({become_winner, Losers}, not_healing, _Partitions) ->
+    rabbit_log:info("Autoheal: I am the winner, waiting for ~p to stop~n",
+                    [Losers]),
+    {winner_waiting, Losers, Losers}; %% wait for, then notify, all losers
+
+%% Chosen again while already waiting: merge the new losers into both lists.
+handle_msg({become_winner, Losers}, {winner_waiting, Pending, Notify},
+           _Partitions) ->
+    rabbit_log:info("Autoheal: I am the winner, waiting additionally for "
+                    "~p to stop~n", [Losers]),
+    {winner_waiting, lists:usort(Losers ++ Pending),
+     lists:usort(Losers ++ Notify)};
+
+%% We lost: stop rabbit, report that to the winner, then start again once the
+%% winner says it is safe or the winner's node monitor goes down.
+%% NOTE(review): no clause takes {winner_is, _} in winner_waiting - confirm.
+handle_msg({winner_is, Winner}, not_healing, _Partitions) ->
+    rabbit_log:warning(
+      "Autoheal: we were selected to restart; winner is ~p~n", [Winner]),
+    Restart = fun () ->
+                      MRef = erlang:monitor(process, {?SERVER, Winner}),
+                      rabbit:stop(),
+                      send(Winner, {node_stopped, node()}),
+                      receive
+                          {'DOWN', MRef, process, {?SERVER, Winner}, _} -> ok;
+                          autoheal_safe_to_start                        -> ok
+                      end,
+                      erlang:demonitor(MRef, [flush]),
+                      rabbit:start()
+              end,
+    rabbit_node_monitor:run_outside_applications(Restart),
+    restarting;
+
+%% The winner's last outstanding loser has stopped - all may now start again.
+handle_msg({node_stopped, Node}, {winner_waiting, [Node], Notify}, _Parts) ->
+    rabbit_log:info("Autoheal: final node has stopped, starting...~n",[]),
+    lists:foreach(
+      fun (N) -> {rabbit_outside_app_process, N} ! autoheal_safe_to_start end,
+      Notify),
+    not_healing;
+
+handle_msg({node_stopped, Node}, {winner_waiting, Pending, Notify}, _Parts) ->
+    {winner_waiting, lists:delete(Node, Pending), Notify}; %% still waiting
+
+handle_msg(_Msg, restarting, _Partitions) ->
+    restarting; %% ignore, we can contribute no further
+
+handle_msg({node_stopped, _Node}, Cancelled, _Partitions) ->
+    Cancelled. %% ignore, we already cancelled the autoheal process
+
+%%----------------------------------------------------------------------------
+
+send(Node, Msg) -> {?SERVER, Node} ! {autoheal_msg, Msg}. %% to ?SERVER on Node
+
+make_decision(AllPartitions) -> %% first node of the best-valued partition wins
+    Valued = [{partition_value(P), P} || P <- AllPartitions],
+    [[Winner | _] | Lost] = [P || {_, P} <- lists:reverse(lists:sort(Valued))],
+    {Winner, lists:append(Lost)}.
+
+%% Rank a partition as {ConnectionCount, NodeCount}. Nodes whose rpc result
+%% is not a list (e.g. a failed call) add nothing to the connection count.
+partition_value(Partition) ->
+    Call = fun (N) -> rpc:call(N, rabbit_networking, connections_local, []) end,
+    Lists = [L || L <- [Call(N) || N <- Partition], is_list(L)],
+    {length(lists:append(Lists)), length(Partition)}.
+
+%% We only know which nodes we ourselves have been partitioned from, not
+%% which nodes are partitioned from each other. So start from one group of
+%% all cluster nodes and split it by each node's {Node, CantSee} view.
+all_partitions(PartitionedWith) ->
+    Everyone = rabbit_mnesia:cluster_nodes(all),
+    Others   = rabbit_node_monitor:partitions(Everyone -- [node()]),
+    Views    = [{node(), PartitionedWith} | Others],
+    all_partitions(Views, [Everyone]).
+
+%% Apply each {Node, CantSee} view in turn: the group containing Node is split
+%% into the nodes it can and cannot see, unless either part would be empty.
+all_partitions(Views, Partitions) ->
+    lists:foldl(fun split_view/2, Partitions, Views).
+
+split_view({Node, CantSee}, Partitions) ->
+    HasNode = fun (Part) -> lists:member(Node, Part) end,
+    {[Group], Others} = lists:partition(HasNode, Partitions),
+    Visible = Group -- CantSee,
+    case {Visible, Group -- Visible} of
+        {[_ | _], [_ | _] = Hidden} -> [Visible, Hidden | Others];
+        _                           -> Partitions
+    end.
diff --git a/src/rabbit_backing_queue.erl b/src/rabbit_backing_queue.erl
index dc144a0e..61b504bc 100644
--- a/src/rabbit_backing_queue.erl
+++ b/src/rabbit_backing_queue.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_backing_queue).
@@ -22,17 +22,18 @@
 -type(ack()   :: any()).
 -type(state() :: any()).
 
+-type(msg_ids() :: [rabbit_types:msg_id()]).
 -type(fetch_result(Ack) ::
-        ('empty' |
-         %% Message,                  IsDelivered, AckTag, Remaining_Len
-         {rabbit_types:basic_message(), boolean(), Ack, non_neg_integer()})).
+        ('empty' | {rabbit_types:basic_message(), boolean(), Ack})).
+-type(drop_result(Ack) ::
+        ('empty' | {rabbit_types:msg_id(), Ack})).
 -type(attempt_recovery() :: boolean()).
 -type(purged_msg_count() :: non_neg_integer()).
--type(async_callback() :: fun ((atom(), fun ((atom(), state()) -> state())) -> 'ok')).
+-type(async_callback() ::
+        fun ((atom(), fun ((atom(), state()) -> state())) -> 'ok')).
 -type(duration() :: ('undefined' | 'infinity' | number())).
 
--type(msg_fun() :: fun((rabbit_types:basic_message(), ack()) -> 'ok') |
-                   'undefined').
+-type(msg_fun(A) :: fun ((rabbit_types:basic_message(), ack(), A) -> A)).
 -type(msg_pred() :: fun ((rabbit_types:message_properties()) -> boolean())).
 
 %% Called on startup with a list of durable queue names. The queues
@@ -68,24 +69,29 @@
 %% content.
 -callback delete_and_terminate(any(), state()) -> state().
 
-%% Remove all messages in the queue, but not messages which have been
-%% fetched and are pending acks.
+%% Remove all 'fetchable' messages from the queue, i.e. all messages
+%% except those that have been fetched already and are pending acks.
 -callback purge(state()) -> {purged_msg_count(), state()}.
 
+%% Remove all messages in the queue which have been fetched and are
+%% pending acks.
+-callback purge_acks(state()) -> state().
+
 %% Publish a message.
 -callback publish(rabbit_types:basic_message(),
-                  rabbit_types:message_properties(), pid(), state()) ->
-    state().
+                  rabbit_types:message_properties(), boolean(), pid(),
+                  state()) -> state().
 
 %% Called for messages which have already been passed straight
 %% out to a client. The queue will be empty for these calls
 %% (i.e. saves the round trip through the backing queue).
--callback publish_delivered(true, rabbit_types:basic_message(),
-                            rabbit_types:message_properties(), pid(), state())
-                           -> {ack(), state()};
-                           (false, rabbit_types:basic_message(),
+-callback publish_delivered(rabbit_types:basic_message(),
                             rabbit_types:message_properties(), pid(), state())
-                           -> {undefined, state()}.
+                           -> {ack(), state()}.
+
+%% Called to inform the BQ about messages which have reached the
+%% queue, but are not going to be further passed to BQ.
+-callback discard(rabbit_types:msg_id(), pid(), state()) -> state().
 
 %% Return ids of messages which have been confirmed since the last
 %% invocation of this function (or initialisation).
@@ -114,32 +120,51 @@
 %% first time the message id appears in the result of
 %% drain_confirmed. All subsequent appearances of that message id will
 %% be ignored.
--callback drain_confirmed(state()) -> {[rabbit_guid:guid()], state()}.
-
-%% Drop messages from the head of the queue while the supplied predicate returns
-%% true. Also accepts a boolean parameter that determines whether the messages
-%% necessitate an ack or not. If they do, the function returns a list of
-%% messages with the respective acktags.
--callback dropwhile(msg_pred(), true, state())
-                   -> {[{rabbit_types:basic_message(), ack()}], state()};
-                   (msg_pred(), false, state())
-                   -> {undefined, state()}.
+-callback drain_confirmed(state()) -> {msg_ids(), state()}.
+
+%% Drop messages from the head of the queue while the supplied
+%% predicate on message properties returns true. Returns the first
+%% message properties for which the predicate returned false, or
+%% 'undefined' if the whole backing queue was traversed w/o the
+%% predicate ever returning false.
+-callback dropwhile(msg_pred(), state())
+                   -> {rabbit_types:message_properties() | undefined, state()}.
+
+%% Like dropwhile, except messages are fetched in "require
+%% acknowledgement" mode and are passed, together with their ack tag,
+%% to the supplied function. The function is also fed an
+%% accumulator. The result of fetchwhile is as for dropwhile plus the
+%% accumulator.
+-callback fetchwhile(msg_pred(), msg_fun(A), A, state())
+                     -> {rabbit_types:message_properties() | undefined,
+                         A, state()}.
 
 %% Produce the next message.
 -callback fetch(true,  state()) -> {fetch_result(ack()), state()};
                (false, state()) -> {fetch_result(undefined), state()}.
 
+%% Remove the next message.
+-callback drop(true,  state()) -> {drop_result(ack()), state()};
+              (false, state()) -> {drop_result(undefined), state()}.
+
 %% Acktags supplied are for messages which can now be forgotten
 %% about. Must return 1 msg_id per Ack, in the same order as Acks.
--callback ack([ack()], state()) -> {[rabbit_guid:guid()], state()}.
-
-%% Acktags supplied are for messages which should be processed. The
-%% provided callback function is called with each message.
--callback fold(msg_fun(), state(), [ack()]) -> state().
+-callback ack([ack()], state()) -> {msg_ids(), state()}.
 
 %% Reinsert messages into the queue which have already been delivered
 %% and were pending acknowledgement.
--callback requeue([ack()], state()) -> {[rabbit_guid:guid()], state()}.
+-callback requeue([ack()], state()) -> {msg_ids(), state()}.
+
+%% Fold over messages by ack tag. The supplied function is called with
+%% each message, its ack tag, and an accumulator.
+-callback ackfold(msg_fun(A), A, state(), [ack()]) -> {A, state()}.
+
+%% Fold over all the messages in a queue and return the accumulated
+%% results, leaving the queue undisturbed.
+-callback fold(fun((rabbit_types:basic_message(),
+                    rabbit_types:message_properties(),
+                    boolean(), A) -> {('stop' | 'cont'), A}),
+               A, state()) -> {A, state()}.
 
 %% How long is my queue?
 -callback len(state()) -> non_neg_integer().
@@ -147,6 +172,9 @@
 %% Is my queue empty?
 -callback is_empty(state()) -> boolean().
 
+%% What's the queue depth, where depth = length + number of pending acks
+-callback depth(state()) -> non_neg_integer().
+
 %% For the next three functions, the assumption is that you're
 %% monitoring something like the ingress and egress rates of the
 %% queue. The RAM duration is thus the length of time represented by
@@ -185,18 +213,10 @@
 -callback invoke(atom(), fun ((atom(), A) -> A), state()) -> state().
 
 %% Called prior to a publish or publish_delivered call. Allows the BQ
-%% to signal that it's already seen this message (and in what capacity
-%% - i.e. was it published previously or discarded previously) and
-%% thus the message should be dropped.
+%% to signal that it's already seen this message (e.g. it was published
+%% or discarded previously) and thus the message should be dropped.
 -callback is_duplicate(rabbit_types:basic_message(), state())
-                      -> {'false'|'published'|'discarded', state()}.
-
-%% Called to inform the BQ about messages which have reached the
-%% queue, but are not going to be further passed to BQ for some
-%% reason. Note that this is may be invoked for messages for which
-%% BQ:is_duplicate/2 has already returned {'published' | 'discarded',
-%% BQS}.
--callback discard(rabbit_types:basic_message(), pid(), state()) -> state().
+                      -> {boolean(), state()}.
 
 -else.
 
@@ -204,12 +224,13 @@
 
 behaviour_info(callbacks) ->
     [{start, 1}, {stop, 0}, {init, 3}, {terminate, 2},
-     {delete_and_terminate, 2}, {purge, 1}, {publish, 4},
-     {publish_delivered, 5}, {drain_confirmed, 1}, {dropwhile, 3},
-     {fetch, 2}, {ack, 2}, {fold, 3}, {requeue, 2}, {len, 1},
-     {is_empty, 1}, {set_ram_duration_target, 2}, {ram_duration, 1},
-     {needs_timeout, 1}, {timeout, 1}, {handle_pre_hibernate, 1},
-     {status, 1}, {invoke, 3}, {is_duplicate, 2}, {discard, 3}];
+     {delete_and_terminate, 2}, {purge, 1}, {purge_acks, 1}, {publish, 5},
+     {publish_delivered, 4}, {discard, 3}, {drain_confirmed, 1},
+     {dropwhile, 2}, {fetchwhile, 4},
+     {fetch, 2}, {ack, 2}, {requeue, 2}, {ackfold, 4}, {fold, 3}, {len, 1},
+     {is_empty, 1}, {depth, 1}, {set_ram_duration_target, 2},
+     {ram_duration, 1}, {needs_timeout, 1}, {timeout, 1},
+     {handle_pre_hibernate, 1}, {status, 1}, {invoke, 3}, {is_duplicate, 2}] ;
 behaviour_info(_Other) ->
     undefined.
 
diff --git a/src/rabbit_backing_queue_qc.erl b/src/rabbit_backing_queue_qc.erl
index a84800c0..e2bc3247 100644
--- a/src/rabbit_backing_queue_qc.erl
+++ b/src/rabbit_backing_queue_qc.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2011-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2011-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_backing_queue_qc).
@@ -85,17 +85,19 @@ backing_queue_test(Cmds) ->
 
 %% Commands
 
-%% Command frequencies are tuned so that queues are normally reasonably
-%% short, but they may sometimes exceed ?QUEUE_MAXLEN. Publish-multiple
-%% and purging cause extreme queue lengths, so these have lower probabilities.
-%% Fetches are sufficiently frequent so that commands that need acktags
-%% get decent coverage.
+%% Command frequencies are tuned so that queues are normally
+%% reasonably short, but they may sometimes exceed
+%% ?QUEUE_MAXLEN. Publish-multiple and purging cause extreme queue
+%% lengths, so these have lower probabilities.  Fetches/drops are
+%% sufficiently frequent so that commands that need acktags get decent
+%% coverage.
 
 command(S) ->
     frequency([{10, qc_publish(S)},
                {1,  qc_publish_delivered(S)},
                {1,  qc_publish_multiple(S)},  %% very slow
-               {15, qc_fetch(S)},             %% needed for ack and requeue
+               {9,  qc_fetch(S)},             %% needed for ack and requeue
+               {6,  qc_drop(S)},              %%
                {15, qc_ack(S)},
                {15, qc_requeue(S)},
                {3,  qc_set_ram_duration_target(S)},
@@ -104,7 +106,8 @@ command(S) ->
                {1,  qc_dropwhile(S)},
                {1,  qc_is_empty(S)},
                {1,  qc_timeout(S)},
-               {1,  qc_purge(S)}]).
+               {1,  qc_purge(S)},
+               {1,  qc_fold(S)}]).
 
 qc_publish(#state{bqstate = BQ}) ->
     {call, ?BQMOD, publish,
@@ -112,18 +115,21 @@ qc_publish(#state{bqstate = BQ}) ->
       #message_properties{needs_confirming = frequency([{1,  true},
                                                         {20, false}]),
                           expiry = oneof([undefined | lists:seq(1, 10)])},
-      self(), BQ]}.
+      false, self(), BQ]}.
 
 qc_publish_multiple(#state{}) ->
     {call, ?MODULE, publish_multiple, [resize(?QUEUE_MAXLEN, pos_integer())]}.
 
 qc_publish_delivered(#state{bqstate = BQ}) ->
     {call, ?BQMOD, publish_delivered,
-     [boolean(), qc_message(), #message_properties{}, self(), BQ]}.
+     [qc_message(), #message_properties{}, self(), BQ]}.
 
 qc_fetch(#state{bqstate = BQ}) ->
     {call, ?BQMOD, fetch, [boolean(), BQ]}.
 
+qc_drop(#state{bqstate = BQ}) ->
+    {call, ?BQMOD, drop, [boolean(), BQ]}.
+
 qc_ack(#state{bqstate = BQ, acks = Acks}) ->
     {call, ?BQMOD, ack, [rand_choice(proplists:get_keys(Acks)), BQ]}.
 
@@ -141,7 +147,7 @@ qc_drain_confirmed(#state{bqstate = BQ}) ->
     {call, ?BQMOD, drain_confirmed, [BQ]}.
 
 qc_dropwhile(#state{bqstate = BQ}) ->
-    {call, ?BQMOD, dropwhile, [fun dropfun/1, false, BQ]}.
+    {call, ?BQMOD, dropwhile, [fun dropfun/1, BQ]}.
 
 qc_is_empty(#state{bqstate = BQ}) ->
     {call, ?BQMOD, is_empty, [BQ]}.
@@ -152,6 +158,9 @@ qc_timeout(#state{bqstate = BQ}) ->
 qc_purge(#state{bqstate = BQ}) ->
     {call, ?BQMOD, purge, [BQ]}.
 
+qc_fold(#state{bqstate = BQ}) ->
+    {call, ?BQMOD, fold, [makefoldfun(pos_integer()), foldacc(), BQ]}.
+
 %% Preconditions
 
 %% Create long queues by only allowing publishing
@@ -173,7 +182,7 @@ precondition(#state{len = Len}, {call, ?MODULE, publish_multiple, _Arg}) ->
 
 %% Model updates
 
-next_state(S, BQ, {call, ?BQMOD, publish, [Msg, MsgProps, _Pid, _BQ]}) ->
+next_state(S, BQ, {call, ?BQMOD, publish, [Msg, MsgProps, _Del, _Pid, _BQ]}) ->
     #state{len         = Len,
            messages    = Messages,
            confirms    = Confirms,
@@ -199,7 +208,7 @@ next_state(S, _BQ, {call, ?MODULE, publish_multiple, [PublishCount]}) ->
 
 next_state(S, Res,
            {call, ?BQMOD, publish_delivered,
-            [AckReq, Msg, MsgProps, _Pid, _BQ]}) ->
+            [Msg, MsgProps, _Pid, _BQ]}) ->
     #state{confirms = Confirms, acks = Acks, next_seq_id = NextSeq} = S,
     AckTag = {call, erlang, element, [1, Res]},
     BQ1    = {call, erlang, element, [2, Res]},
@@ -213,29 +222,14 @@ next_state(S, Res,
                            true -> gb_sets:add(MsgId, Confirms);
                            _    -> Confirms
                        end,
-            acks = case AckReq of
-                       true  -> [{AckTag, {NextSeq, {MsgProps, Msg}}}|Acks];
-                       false -> Acks
-                   end
+            acks = [{AckTag, {NextSeq, {MsgProps, Msg}}}|Acks]
            };
 
 next_state(S, Res, {call, ?BQMOD, fetch, [AckReq, _BQ]}) ->
-    #state{len = Len, messages = Messages, acks = Acks} = S,
-    ResultInfo = {call, erlang, element, [1, Res]},
-    BQ1        = {call, erlang, element, [2, Res]},
-    AckTag     = {call, erlang, element, [3, ResultInfo]},
-    S1         = S#state{bqstate = BQ1},
-    case gb_trees:is_empty(Messages) of
-        true  -> S1;
-        false -> {SeqId, MsgProp_Msg, M2} = gb_trees:take_smallest(Messages),
-                 S2 = S1#state{len = Len - 1, messages = M2},
-                 case AckReq of
-                     true  ->
-                         S2#state{acks = [{AckTag, {SeqId, MsgProp_Msg}}|Acks]};
-                     false ->
-                         S2
-                 end
-    end;
+    next_state_fetch_and_drop(S, Res, AckReq, 3);
+
+next_state(S, Res, {call, ?BQMOD, drop, [AckReq, _BQ]}) ->
+    next_state_fetch_and_drop(S, Res, AckReq, 2);
 
 next_state(S, Res, {call, ?BQMOD, ack, [AcksArg, _BQ]}) ->
     #state{acks = AcksState} = S,
@@ -281,19 +275,38 @@ next_state(S, BQ, {call, ?MODULE, timeout, _Args}) ->
 
 next_state(S, Res, {call, ?BQMOD, purge, _Args}) ->
     BQ1 = {call, erlang, element, [2, Res]},
-    S#state{bqstate = BQ1, len = 0, messages = gb_trees:empty()}.
+    S#state{bqstate = BQ1, len = 0, messages = gb_trees:empty()};
+
+next_state(S, Res, {call, ?BQMOD, fold, _Args}) ->
+    BQ1 = {call, erlang, element, [2, Res]},
+    S#state{bqstate = BQ1}.
 
 %% Postconditions
 
 postcondition(S, {call, ?BQMOD, fetch, _Args}, Res) ->
     #state{messages = Messages, len = Len, acks = Acks, confirms = Confrms} = S,
     case Res of
-        {{MsgFetched, _IsDelivered, AckTag, RemainingLen}, _BQ} ->
+        {{MsgFetched, _IsDelivered, AckTag}, _BQ} ->
             {_SeqId, {_MsgProps, Msg}} = gb_trees:smallest(Messages),
             MsgFetched =:= Msg andalso
             not proplists:is_defined(AckTag, Acks) andalso
                 not gb_sets:is_element(AckTag, Confrms) andalso
-                RemainingLen =:= Len - 1;
+                Len =/= 0;
+        {empty, _BQ} ->
+            Len =:= 0
+    end;
+
+postcondition(S, {call, ?BQMOD, drop, _Args}, Res) ->
+    #state{messages = Messages, len = Len, acks = Acks, confirms = Confrms} = S,
+    case Res of
+        {{MsgIdFetched, AckTag}, _BQ} ->
+            {_SeqId, {_MsgProps, Msg}} = gb_trees:smallest(Messages),
+            MsgId = eval({call, erlang, element,
+                          [?RECORD_INDEX(id, basic_message), Msg]}),
+            MsgIdFetched =:= MsgId andalso
+            not proplists:is_defined(AckTag, Acks) andalso
+                not gb_sets:is_element(AckTag, Confrms) andalso
+                Len =/= 0;
         {empty, _BQ} ->
             Len =:= 0
     end;
@@ -316,6 +329,15 @@ postcondition(S, {call, ?BQMOD, drain_confirmed, _Args}, Res) ->
     lists:all(fun (M) -> gb_sets:is_element(M, Confirms) end,
               ReportedConfirmed);
 
+postcondition(S, {call, ?BQMOD, fold, [FoldFun, Acc0, _BQ0]}, {Res, _BQ1}) ->
+    #state{messages = Messages} = S,
+    {_, Model} = lists:foldl(fun ({_SeqId, {_MsgProps, _Msg}}, {stop, Acc}) ->
+                                     {stop, Acc};
+                                 ({_SeqId, {MsgProps, Msg}}, {cont, Acc}) ->
+                                     FoldFun(Msg, MsgProps, false, Acc)
+                             end, {cont, Acc0}, gb_trees:to_list(Messages)),
+    true = Model =:= Res;
+
 postcondition(#state{bqstate = BQ, len = Len}, {call, _M, _F, _A}, _Res) ->
     ?BQMOD:len(BQ) =:= Len.
 
@@ -374,6 +396,16 @@ rand_choice(List, Selection, N)  ->
                        rand_choice(List -- [Picked], [Picked | Selection],
                        N - 1).
 
+makefoldfun(Size) ->
+    fun (Msg, _MsgProps, Unacked, Acc) ->
+            case {length(Acc) > Size, Unacked} of
+                {false, false} -> {cont, [Msg | Acc]};
+                {false, true}  -> {cont, Acc};
+                {true, _}      -> {stop, Acc}
+            end
+    end.
+foldacc() -> [].
+
 dropfun(Props) ->
     Expiry = eval({call, erlang, element,
                    [?RECORD_INDEX(expiry, message_properties), Props]}),
@@ -391,4 +423,31 @@ drop_messages(Messages) ->
             end
     end.
 
+next_state_fetch_and_drop(S, Res, AckReq, AckTagIdx) ->
+    #state{len = Len, messages = Messages, acks = Acks} = S,
+    ResultInfo = {call, erlang, element, [1, Res]},
+    BQ1        = {call, erlang, element, [2, Res]},
+    AckTag     = {call, erlang, element, [AckTagIdx, ResultInfo]},
+    S1         = S#state{bqstate = BQ1},
+    case gb_trees:is_empty(Messages) of
+        true  -> S1;
+        false -> {SeqId, MsgProp_Msg, M2} = gb_trees:take_smallest(Messages),
+                 S2 = S1#state{len = Len - 1, messages = M2},
+                 case AckReq of
+                     true  ->
+                         S2#state{acks = [{AckTag, {SeqId, MsgProp_Msg}}|Acks]};
+                     false ->
+                         S2
+                 end
+    end.
+
+-else.
+
+-export([prop_disabled/0]).
+
+prop_disabled() ->
+    exit({compiled_without_proper,
+          "PropEr was not present during compilation of the test module. "
+          "Hence all tests are disabled."}).
+
 -endif.
diff --git a/src/rabbit_basic.erl b/src/rabbit_basic.erl
index 734456d3..2e825536 100644
--- a/src/rabbit_basic.erl
+++ b/src/rabbit_basic.erl
@@ -10,17 +10,18 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_basic).
 -include("rabbit.hrl").
 -include("rabbit_framing.hrl").
 
--export([publish/4, publish/6, publish/1,
-         message/3, message/4, properties/1, append_table_header/3,
-         extract_headers/1, map_headers/2, delivery/4, header_routes/1]).
+-export([publish/4, publish/5, publish/1,
+         message/3, message/4, properties/1, prepend_table_header/3,
+         extract_headers/1, map_headers/2, delivery/3, header_routes/1,
+         parse_expiration/1]).
 -export([build_content/2, from_content/1]).
 
 %%----------------------------------------------------------------------------
@@ -40,13 +41,13 @@
 -spec(publish/4 ::
         (exchange_input(), rabbit_router:routing_key(), properties_input(),
          body_input()) -> publish_result()).
--spec(publish/6 ::
-        (exchange_input(), rabbit_router:routing_key(), boolean(), boolean(),
+-spec(publish/5 ::
+        (exchange_input(), rabbit_router:routing_key(), boolean(),
          properties_input(), body_input()) -> publish_result()).
 -spec(publish/1 ::
         (rabbit_types:delivery()) -> publish_result()).
--spec(delivery/4 ::
-        (boolean(), boolean(), rabbit_types:message(), undefined | integer()) ->
+-spec(delivery/3 ::
+        (boolean(), rabbit_types:message(), undefined | integer()) ->
                          rabbit_types:delivery()).
 -spec(message/4 ::
         (rabbit_exchange:name(), rabbit_router:routing_key(),
@@ -58,7 +59,7 @@
 -spec(properties/1 ::
         (properties_input()) -> rabbit_framing:amqp_property_record()).
 
--spec(append_table_header/3 ::
+-spec(prepend_table_header/3 ::
         (binary(), rabbit_framing:amqp_table(), headers()) -> headers()).
 
 -spec(extract_headers/1 :: (rabbit_types:content()) -> headers()).
@@ -72,6 +73,9 @@
                           binary() | [binary()]) -> rabbit_types:content()).
 -spec(from_content/1 :: (rabbit_types:content()) ->
                              {rabbit_framing:amqp_property_record(), binary()}).
+-spec(parse_expiration/1 ::
+        (rabbit_framing:amqp_property_record())
+        -> rabbit_types:ok_or_error2('undefined' | non_neg_integer(), any())).
 
 -endif.
 
@@ -80,18 +84,16 @@
 %% Convenience function, for avoiding round-trips in calls across the
 %% erlang distributed network.
 publish(Exchange, RoutingKeyBin, Properties, Body) ->
-    publish(Exchange, RoutingKeyBin, false, false, Properties, Body).
+    publish(Exchange, RoutingKeyBin, false, Properties, Body).
 
 %% Convenience function, for avoiding round-trips in calls across the
 %% erlang distributed network.
-publish(X = #exchange{name = XName}, RKey, Mandatory, Immediate, Props, Body) ->
-    publish(X, delivery(Mandatory, Immediate,
-                        message(XName, RKey, properties(Props), Body),
-                        undefined));
-publish(XName, RKey, Mandatory, Immediate, Props, Body) ->
-    publish(delivery(Mandatory, Immediate,
-                     message(XName, RKey, properties(Props), Body),
-                     undefined)).
+publish(X = #exchange{name = XName}, RKey, Mandatory, Props, Body) ->
+    Message = message(XName, RKey, properties(Props), Body),
+    publish(X, delivery(Mandatory, Message, undefined));
+publish(XName, RKey, Mandatory, Props, Body) ->
+    Message = message(XName, RKey, properties(Props), Body),
+    publish(delivery(Mandatory, Message, undefined)).
 
 publish(Delivery = #delivery{
           message = #basic_message{exchange_name = XName}}) ->
@@ -105,8 +107,8 @@ publish(X, Delivery) ->
     {RoutingRes, DeliveredQPids} = rabbit_amqqueue:deliver(Qs, Delivery),
     {ok, RoutingRes, DeliveredQPids}.
 
-delivery(Mandatory, Immediate, Message, MsgSeqNo) ->
-    #delivery{mandatory = Mandatory, immediate = Immediate, sender = self(),
+delivery(Mandatory, Message, MsgSeqNo) ->
+    #delivery{mandatory = Mandatory, sender = self(),
               message = Message, msg_seq_no = MsgSeqNo}.
 
 build_content(Properties, BodyBin) when is_binary(BodyBin) ->
@@ -179,15 +181,45 @@ properties(P) when is_list(P) ->
                         end
                 end, #'P_basic'{}, P).
 
-append_table_header(Name, Info, undefined) ->
-    append_table_header(Name, Info, []);
-append_table_header(Name, Info, Headers) ->
-    Prior = case rabbit_misc:table_lookup(Headers, Name) of
-                undefined          -> [];
-                {array, Existing}  -> Existing
-            end,
+prepend_table_header(Name, Info, undefined) ->
+    prepend_table_header(Name, Info, []);
+prepend_table_header(Name, Info, Headers) ->
+    case rabbit_misc:table_lookup(Headers, Name) of
+        {array, Existing} ->
+            prepend_table(Name, Info, Existing, Headers);
+        undefined ->
+            prepend_table(Name, Info, [], Headers);
+        Other ->
+            Headers2 = prepend_table(Name, Info, [], Headers),
+            set_invalid_header(Name, Other, Headers2)
+    end.
+
+prepend_table(Name, Info, Prior, Headers) ->
     rabbit_misc:set_table_value(Headers, Name, array, [{table, Info} | Prior]).
 
+set_invalid_header(Name, {_, _}=Value, Headers) when is_list(Headers) ->
+    case rabbit_misc:table_lookup(Headers, ?INVALID_HEADERS_KEY) of
+        undefined ->
+            set_invalid([{Name, array, [Value]}], Headers);
+        {table, ExistingHdr} ->
+            update_invalid(Name, Value, ExistingHdr, Headers);
+        Other ->
+            %% somehow the x-invalid-headers header is corrupt
+            Invalid = [{?INVALID_HEADERS_KEY, array, [Other]}],
+            set_invalid_header(Name, Value, set_invalid(Invalid, Headers))
+    end.
+
+set_invalid(NewHdr, Headers) ->
+    rabbit_misc:set_table_value(Headers, ?INVALID_HEADERS_KEY, table, NewHdr).
+
+update_invalid(Name, Value, ExistingHdr, Header) ->
+    Values = case rabbit_misc:table_lookup(ExistingHdr, Name) of
+                 undefined      -> [Value];
+                 {array, Prior} -> [Value | Prior]
+             end,
+    NewHdr = rabbit_misc:set_table_value(ExistingHdr, Name, array, Values),
+    set_invalid(NewHdr, Header).
+
 extract_headers(Content) ->
     #content{properties = #'P_basic'{headers = Headers}} =
         rabbit_binary_parser:ensure_content_decoded(Content),
@@ -226,3 +258,19 @@ header_routes(HeadersTable) ->
            {Type, _Val}    -> throw({error, {unacceptable_type_in_header,
                                              binary_to_list(HeaderKey), Type}})
        end || HeaderKey <- ?ROUTING_HEADERS]).
+
+parse_expiration(#'P_basic'{expiration = undefined}) ->
+    {ok, undefined};
+parse_expiration(#'P_basic'{expiration = Expiration}) ->
+    case string:to_integer(binary_to_list(Expiration)) of
+        {error, no_integer} = E ->
+            E;
+        {N, ""} ->
+            case rabbit_misc:check_expiry(N) of
+                ok             -> {ok, N};
+                E = {error, _} -> E
+            end;
+        {_, S} ->
+            {error, {leftover_string, S}}
+    end.
+
diff --git a/src/rabbit_binary_generator.erl b/src/rabbit_binary_generator.erl
index d69376fb..ae5bbf51 100644
--- a/src/rabbit_binary_generator.erl
+++ b/src/rabbit_binary_generator.erl
@@ -10,28 +10,19 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_binary_generator).
 -include("rabbit_framing.hrl").
 -include("rabbit.hrl").
 
-%% EMPTY_CONTENT_BODY_FRAME_SIZE, 8 = 1 + 2 + 4 + 1
-%%  - 1 byte of frame type
-%%  - 2 bytes of channel number
-%%  - 4 bytes of frame payload length
-%%  - 1 byte of payload trailer FRAME_END byte
-%% See definition of check_empty_content_body_frame_size/0,
-%% an assertion called at startup.
--define(EMPTY_CONTENT_BODY_FRAME_SIZE, 8).
-
 -export([build_simple_method_frame/3,
          build_simple_content_frames/4,
          build_heartbeat_frame/0]).
--export([generate_table/1, encode_properties/2]).
--export([check_empty_content_body_frame_size/0]).
+-export([generate_table/1]).
+-export([check_empty_frame_size/0]).
 -export([ensure_content_encoded/2, clear_encoded_content/1]).
 -export([map_exception/3]).
 
@@ -51,9 +42,7 @@
         -> [frame()]).
 -spec(build_heartbeat_frame/0 :: () -> frame()).
 -spec(generate_table/1 :: (rabbit_framing:amqp_table()) -> binary()).
--spec(encode_properties/2 ::
-        ([rabbit_framing:amqp_property_type()], [any()]) -> binary()).
--spec(check_empty_content_body_frame_size/0 :: () -> 'ok').
+-spec(check_empty_frame_size/0 :: () -> 'ok').
 -spec(ensure_content_encoded/2 ::
         (rabbit_types:content(), rabbit_types:protocol()) ->
                                        rabbit_types:encoded_content()).
@@ -88,10 +77,8 @@ build_simple_content_frames(ChannelInt, Content, FrameMax, Protocol) ->
     [HeaderFrame | ContentFrames].
 
 build_content_frames(FragsRev, FrameMax, ChannelInt) ->
-    BodyPayloadMax = if FrameMax == 0 ->
-                             iolist_size(FragsRev);
-                        true ->
-                             FrameMax - ?EMPTY_CONTENT_BODY_FRAME_SIZE
+    BodyPayloadMax = if FrameMax == 0 -> iolist_size(FragsRev);
+                        true          -> FrameMax - ?EMPTY_FRAME_SIZE
                      end,
     build_content_frames(0, [], BodyPayloadMax, [],
                          lists:reverse(FragsRev), BodyPayloadMax, ChannelInt).
@@ -129,51 +116,24 @@ create_frame(TypeInt, ChannelInt, Payload) ->
 %% table_field_to_binary supports the AMQP 0-8/0-9 standard types, S,
 %% I, D, T and F, as well as the QPid extensions b, d, f, l, s, t, x,
 %% and V.
-
-table_field_to_binary({FName, Type, Value}) ->
-    [short_string_to_binary(FName) | field_value_to_binary(Type, Value)].
-
-field_value_to_binary(longstr, Value) ->
-    ["S", long_string_to_binary(Value)];
-
-field_value_to_binary(signedint, Value) ->
-    ["I", <<Value:32/signed>>];
-
-field_value_to_binary(decimal, {Before, After}) ->
-    ["D", Before, <<After:32>>];
-
-field_value_to_binary(timestamp, Value) ->
-    ["T", <<Value:64>>];
-
-field_value_to_binary(table, Value) ->
-    ["F", table_to_binary(Value)];
-
-field_value_to_binary(array, Value) ->
-    ["A", array_to_binary(Value)];
-
-field_value_to_binary(byte, Value) ->
-    ["b", <<Value:8/unsigned>>];
-
-field_value_to_binary(double, Value) ->
-    ["d", <<Value:64/float>>];
-
-field_value_to_binary(float, Value) ->
-    ["f", <<Value:32/float>>];
-
-field_value_to_binary(long, Value) ->
-    ["l", <<Value:64/signed>>];
-
-field_value_to_binary(short, Value) ->
-    ["s", <<Value:16/signed>>];
-
-field_value_to_binary(bool, Value) ->
-    ["t", if Value -> 1; true -> 0 end];
-
-field_value_to_binary(binary, Value) ->
-    ["x", long_string_to_binary(Value)];
-
-field_value_to_binary(void, _Value) ->
-    ["V"].
+table_field_to_binary({FName, T, V}) ->
+    [short_string_to_binary(FName) | field_value_to_binary(T, V)].
+
+field_value_to_binary(longstr,   V) -> ["S", long_string_to_binary(V)];
+field_value_to_binary(signedint, V) -> ["I", <<V:32/signed>>];
+field_value_to_binary(decimal,   V) -> {Before, After} = V,
+                                       ["D", Before, <<After:32>>];
+field_value_to_binary(timestamp, V) -> ["T", <<V:64>>];
+field_value_to_binary(table,     V) -> ["F", table_to_binary(V)];
+field_value_to_binary(array,     V) -> ["A", array_to_binary(V)];
+field_value_to_binary(byte,      V) -> ["b", <<V:8/unsigned>>];
+field_value_to_binary(double,    V) -> ["d", <<V:64/float>>];
+field_value_to_binary(float,     V) -> ["f", <<V:32/float>>];
+field_value_to_binary(long,      V) -> ["l", <<V:64/signed>>];
+field_value_to_binary(short,     V) -> ["s", <<V:16/signed>>];
+field_value_to_binary(bool,      V) -> ["t", if V -> 1; true -> 0 end];
+field_value_to_binary(binary,    V) -> ["x", long_string_to_binary(V)];
+field_value_to_binary(void,     _V) -> ["V"].
 
 table_to_binary(Table) when is_list(Table) ->
     BinTable = generate_table(Table),
@@ -187,9 +147,8 @@ generate_table(Table) when is_list(Table) ->
     list_to_binary(lists:map(fun table_field_to_binary/1, Table)).
 
 generate_array(Array) when is_list(Array) ->
-    list_to_binary(lists:map(
-                     fun ({Type, Value}) -> field_value_to_binary(Type, Value) end,
-                     Array)).
+    list_to_binary(lists:map(fun ({T, V}) -> field_value_to_binary(T, V) end,
+                             Array)).
 
 short_string_to_binary(String) when is_binary(String) ->
     Len = size(String),
@@ -207,65 +166,12 @@ long_string_to_binary(String) when is_binary(String) ->
 long_string_to_binary(String) ->
     [<<(length(String)):32>>, String].
 
-encode_properties([], []) ->
-    <<0, 0>>;
-encode_properties(TypeList, ValueList) ->
-    encode_properties(0, TypeList, ValueList, 0, [], []).
-
-encode_properties(_Bit, [], [], FirstShortAcc, FlagsAcc, PropsAcc) ->
-    list_to_binary([lists:reverse(FlagsAcc), <<FirstShortAcc:16>>, lists:reverse(PropsAcc)]);
-encode_properties(_Bit, [], _ValueList, _FirstShortAcc, _FlagsAcc, _PropsAcc) ->
-    exit(content_properties_values_overflow);
-encode_properties(15, TypeList, ValueList, FirstShortAcc, FlagsAcc, PropsAcc) ->
-    NewFlagsShort = FirstShortAcc bor 1, % set the continuation low bit
-    encode_properties(0, TypeList, ValueList, 0, [<<NewFlagsShort:16>> | FlagsAcc], PropsAcc);
-encode_properties(Bit, [bit | TypeList], [Value | ValueList], FirstShortAcc, FlagsAcc, PropsAcc) ->
-    case Value of
-        true -> encode_properties(Bit + 1, TypeList, ValueList,
-                                  FirstShortAcc bor (1 bsl (15 - Bit)), FlagsAcc, PropsAcc);
-        false -> encode_properties(Bit + 1, TypeList, ValueList,
-                                   FirstShortAcc, FlagsAcc, PropsAcc);
-        Other -> exit({content_properties_illegal_bit_value, Other})
-    end;
-encode_properties(Bit, [T | TypeList], [Value | ValueList], FirstShortAcc, FlagsAcc, PropsAcc) ->
-    case Value of
-        undefined -> encode_properties(Bit + 1, TypeList, ValueList,
-                                       FirstShortAcc, FlagsAcc, PropsAcc);
-        _ -> encode_properties(Bit + 1, TypeList, ValueList,
-                               FirstShortAcc bor (1 bsl (15 - Bit)),
-                               FlagsAcc,
-                               [encode_property(T, Value) | PropsAcc])
-    end.
-
-encode_property(shortstr, String) ->
-    Len = size(String),
-    if Len < 256 -> <<Len:8, String:Len/binary>>;
-       true      -> exit(content_properties_shortstr_overflow)
-    end;
-encode_property(longstr, String) ->
-    Len = size(String), <<Len:32, String:Len/binary>>;
-encode_property(octet, Int) ->
-    <<Int:8/unsigned>>;
-encode_property(shortint, Int) ->
-    <<Int:16/unsigned>>;
-encode_property(longint, Int) ->
-    <<Int:32/unsigned>>;
-encode_property(longlongint, Int) ->
-    <<Int:64/unsigned>>;
-encode_property(timestamp, Int) ->
-    <<Int:64/unsigned>>;
-encode_property(table, Table) ->
-    table_to_binary(Table).
-
-check_empty_content_body_frame_size() ->
-    %% Intended to ensure that EMPTY_CONTENT_BODY_FRAME_SIZE is
-    %% defined correctly.
-    ComputedSize = iolist_size(create_frame(?FRAME_BODY, 0, <<>>)),
-    if ComputedSize == ?EMPTY_CONTENT_BODY_FRAME_SIZE ->
-            ok;
-       true ->
-            exit({incorrect_empty_content_body_frame_size,
-                  ComputedSize, ?EMPTY_CONTENT_BODY_FRAME_SIZE})
+check_empty_frame_size() ->
+    %% Intended to ensure that EMPTY_FRAME_SIZE is defined correctly.
+    case iolist_size(create_frame(?FRAME_BODY, 0, <<>>)) of
+        ?EMPTY_FRAME_SIZE -> ok;
+        ComputedSize      -> exit({incorrect_empty_frame_size,
+                                   ComputedSize, ?EMPTY_FRAME_SIZE})
     end.
 
 ensure_content_encoded(Content = #content{properties_bin = PropBin,
diff --git a/src/rabbit_binary_parser.erl b/src/rabbit_binary_parser.erl
index 5f0016b6..dc6d090f 100644
--- a/src/rabbit_binary_parser.erl
+++ b/src/rabbit_binary_parser.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_binary_parser).
@@ -50,47 +50,36 @@ parse_array(<<ValueAndRest/binary>>) ->
     {Type, Value, Rest} = parse_field_value(ValueAndRest),
     [{Type, Value} | parse_array(Rest)].
 
-parse_field_value(<<"S", VLen:32/unsigned, ValueString:VLen/binary, Rest/binary>>) ->
-    {longstr, ValueString, Rest};
+parse_field_value(<<"S", VLen:32/unsigned, V:VLen/binary, R/binary>>) ->
+    {longstr, V, R};
 
-parse_field_value(<<"I", Value:32/signed, Rest/binary>>) ->
-    {signedint, Value, Rest};
+parse_field_value(<<"I", V:32/signed, R/binary>>) ->
+    {signedint, V, R};
 
-parse_field_value(<<"D", Before:8/unsigned, After:32/unsigned, Rest/binary>>) ->
-    {decimal, {Before, After}, Rest};
+parse_field_value(<<"D", Before:8/unsigned, After:32/unsigned, R/binary>>) ->
+    {decimal, {Before, After}, R};
 
-parse_field_value(<<"T", Value:64/unsigned, Rest/binary>>) ->
-    {timestamp, Value, Rest};
+parse_field_value(<<"T", V:64/unsigned, R/binary>>) ->
+    {timestamp, V, R};
 
-parse_field_value(<<"F", VLen:32/unsigned, Table:VLen/binary, Rest/binary>>) ->
-    {table, parse_table(Table), Rest};
+parse_field_value(<<"F", VLen:32/unsigned, Table:VLen/binary, R/binary>>) ->
+    {table, parse_table(Table), R};
 
-parse_field_value(<<"A", VLen:32/unsigned, Array:VLen/binary, Rest/binary>>) ->
-    {array, parse_array(Array), Rest};
+parse_field_value(<<"A", VLen:32/unsigned, Array:VLen/binary, R/binary>>) ->
+    {array, parse_array(Array), R};
 
-parse_field_value(<<"b", Value:8/unsigned, Rest/binary>>) ->
-    {byte, Value, Rest};
+parse_field_value(<<"b", V:8/unsigned, R/binary>>) -> {byte,        V, R};
+parse_field_value(<<"d", V:64/float,   R/binary>>) -> {double,      V, R};
+parse_field_value(<<"f", V:32/float,   R/binary>>) -> {float,       V, R};
+parse_field_value(<<"l", V:64/signed,  R/binary>>) -> {long,        V, R};
+parse_field_value(<<"s", V:16/signed,  R/binary>>) -> {short,       V, R};
+parse_field_value(<<"t", V:8/unsigned, R/binary>>) -> {bool, (V /= 0), R};
 
-parse_field_value(<<"d", Value:64/float, Rest/binary>>) ->
-    {double, Value, Rest};
+parse_field_value(<<"x", VLen:32/unsigned, V:VLen/binary, R/binary>>) ->
+    {binary, V, R};
 
-parse_field_value(<<"f", Value:32/float, Rest/binary>>) ->
-    {float, Value, Rest};
-
-parse_field_value(<<"l", Value:64/signed, Rest/binary>>) ->
-    {long, Value, Rest};
-
-parse_field_value(<<"s", Value:16/signed, Rest/binary>>) ->
-    {short, Value, Rest};
-
-parse_field_value(<<"t", Value:8/unsigned, Rest/binary>>) ->
-    {bool, (Value /= 0), Rest};
-
-parse_field_value(<<"x", VLen:32/unsigned, ValueString:VLen/binary, Rest/binary>>) ->
-    {binary, ValueString, Rest};
-
-parse_field_value(<<"V", Rest/binary>>) ->
-    {void, undefined, Rest}.
+parse_field_value(<<"V", R/binary>>) ->
+    {void, undefined, R}.
 
 ensure_content_decoded(Content = #content{properties = Props})
   when Props =/= none ->
diff --git a/src/rabbit_binding.erl b/src/rabbit_binding.erl
index bb44797e..91f42e9c 100644
--- a/src/rabbit_binding.erl
+++ b/src/rabbit_binding.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_binding).
@@ -35,11 +35,16 @@
 
 -type(key() :: binary()).
 
--type(bind_errors() :: rabbit_types:error('source_not_found' |
-                                          'destination_not_found' |
-                                          'source_and_destination_not_found')).
+-type(bind_errors() :: rabbit_types:error(
+                         {'resources_missing',
+                          [{'not_found', (rabbit_types:binding_source() |
+                                          rabbit_types:binding_destination())} |
+                           {'absent', rabbit_types:amqqueue()}]})).
+
 -type(bind_ok_or_error() :: 'ok' | bind_errors() |
-                            rabbit_types:error('binding_not_found')).
+                            rabbit_types:error(
+                              'binding_not_found' |
+                              {'binding_invalid', string(), [any()]})).
 -type(bind_res() :: bind_ok_or_error() | rabbit_misc:thunk(bind_ok_or_error())).
 -type(inner_fun() ::
         fun((rabbit_types:exchange(),
@@ -155,33 +160,38 @@ add(Binding, InnerFun) ->
     binding_action(
       Binding,
       fun (Src, Dst, B) ->
-              %% this argument is used to check queue exclusivity;
-              %% in general, we want to fail on that in preference to
-              %% anything else
-              case InnerFun(Src, Dst) of
-                  ok               -> case mnesia:read({rabbit_route, B}) of
-                                          []  -> add(Src, Dst, B);
-                                          [_] -> fun rabbit_misc:const_ok/0
-                                      end;
-                  {error, _} = Err -> rabbit_misc:const(Err)
+              case rabbit_exchange:validate_binding(Src, B) of
+                  ok ->
+                      %% this argument is used to check queue exclusivity;
+                      %% in general, we want to fail on that in preference to
+                      %% anything else
+                      case InnerFun(Src, Dst) of
+                          ok ->
+                              case mnesia:read({rabbit_route, B}) of
+                                  []  -> add(Src, Dst, B);
+                                  [_] -> fun rabbit_misc:const_ok/0
+                              end;
+                          {error, _} = Err ->
+                              rabbit_misc:const(Err)
+                      end;
+                  {error, _} = Err ->
+                      rabbit_misc:const(Err)
               end
       end).
 
 add(Src, Dst, B) ->
     [SrcDurable, DstDurable] = [durable(E) || E <- [Src, Dst]],
-    case (not (SrcDurable andalso DstDurable) orelse
-          mnesia:read({rabbit_durable_route, B}) =:= []) of
-        true  -> ok = sync_route(#route{binding = B}, SrcDurable, DstDurable,
+    case (SrcDurable andalso DstDurable andalso
+          mnesia:read({rabbit_durable_route, B}) =/= []) of
+        false -> ok = sync_route(#route{binding = B}, SrcDurable, DstDurable,
                                  fun mnesia:write/3),
-                 ok = rabbit_exchange:callback(
-                        Src, add_binding, [transaction, Src, B]),
+                 x_callback(transaction, Src, add_binding, B),
                  Serial = rabbit_exchange:serial(Src),
                  fun () ->
-                     ok = rabbit_exchange:callback(
-                            Src, add_binding, [Serial, Src, B]),
-                     ok = rabbit_event:notify(binding_created, info(B))
+                         x_callback(Serial, Src, add_binding, B),
+                         ok = rabbit_event:notify(binding_created, info(B))
                  end;
-        false -> rabbit_misc:const({error, binding_not_found})
+        true  -> rabbit_misc:const({error, binding_not_found})
     end.
 
 remove(Binding) -> remove(Binding, fun (_Src, _Dst) -> ok end).
@@ -279,21 +289,15 @@ has_for_source(SrcName) ->
 remove_for_source(SrcName) ->
     lock_route_tables(),
     Match = #route{binding = #binding{source = SrcName, _ = '_'}},
-    Routes = lists:usort(
-               mnesia:match_object(rabbit_route, Match, write) ++
-                   mnesia:match_object(rabbit_durable_route, Match, write)),
-    [begin
-         sync_route(Route, fun mnesia:delete_object/3),
-         Route#route.binding
-     end || Route <- Routes].
+    remove_routes(
+      lists:usort(mnesia:match_object(rabbit_route, Match, write) ++
+                      mnesia:match_object(rabbit_durable_route, Match, write))).
 
-remove_for_destination(Dst) ->
-    remove_for_destination(
-      Dst, fun (R) -> sync_route(R, fun mnesia:delete_object/3) end).
+remove_for_destination(DstName) ->
+    remove_for_destination(DstName, fun remove_routes/1).
 
-remove_transient_for_destination(Dst) ->
-    remove_for_destination(
-      Dst, fun (R) -> sync_transient_route(R, fun mnesia:delete_object/3) end).
+remove_transient_for_destination(DstName) ->
+    remove_for_destination(DstName, fun remove_transient_routes/1).
 
 %%----------------------------------------------------------------------------
 
@@ -310,6 +314,14 @@ binding_action(Binding = #binding{source      = SrcName,
               Fun(Src, Dst, Binding#binding{args = SortedArgs})
       end).
 
+delete_object(Tab, Record, LockKind) ->
+    %% this 'guarded' delete prevents unnecessary writes to the mnesia
+    %% disk log
+    case mnesia:match_object(Tab, Record, LockKind) of
+        []  -> ok;
+        [_] -> mnesia:delete_object(Tab, Record, LockKind)
+    end.
+
 sync_route(R, Fun) -> sync_route(R, true, true, Fun).
 
 sync_route(Route, true, true, Fun) ->
@@ -330,21 +342,32 @@ sync_transient_route(Route, Fun) ->
 call_with_source_and_destination(SrcName, DstName, Fun) ->
     SrcTable = table_for_resource(SrcName),
     DstTable = table_for_resource(DstName),
-    ErrFun = fun (Err) -> rabbit_misc:const({error, Err}) end,
+    ErrFun = fun (Names) ->
+                     Errs = [not_found_or_absent(Name) || Name <- Names],
+                     rabbit_misc:const({error, {resources_missing, Errs}})
+             end,
     rabbit_misc:execute_mnesia_tx_with_tail(
       fun () ->
               case {mnesia:read({SrcTable, SrcName}),
                     mnesia:read({DstTable, DstName})} of
                   {[Src], [Dst]} -> Fun(Src, Dst);
-                  {[],    [_]  } -> ErrFun(source_not_found);
-                  {[_],   []   } -> ErrFun(destination_not_found);
-                  {[],    []   } -> ErrFun(source_and_destination_not_found)
-               end
+                  {[],    [_]  } -> ErrFun([SrcName]);
+                  {[_],   []   } -> ErrFun([DstName]);
+                  {[],    []   } -> ErrFun([SrcName, DstName])
+              end
       end).
 
 table_for_resource(#resource{kind = exchange}) -> rabbit_exchange;
 table_for_resource(#resource{kind = queue})    -> rabbit_queue.
 
+not_found_or_absent(#resource{kind = exchange} = Name) ->
+    {not_found, Name};
+not_found_or_absent(#resource{kind = queue}    = Name) ->
+    case rabbit_amqqueue:not_found_or_absent(Name) of
+        not_found        -> {not_found, Name};
+        {absent, _Q} = R -> R
+    end.
+
 contains(Table, MatchHead) ->
     continue(mnesia:select(Table, [{MatchHead, [], ['$_']}], 1, read)).
 
@@ -372,16 +395,32 @@ lock_route_tables() ->
                                              rabbit_semi_durable_route,
                                              rabbit_durable_route]].
 
-remove_for_destination(DstName, DeleteFun) ->
+remove_routes(Routes) ->
+    %% This partitioning allows us to suppress unnecessary delete
+    %% operations on disk tables, which require an fsync.
+    {TransientRoutes, DurableRoutes} =
+        lists:partition(fun (R) -> mnesia:match_object(
+                                     rabbit_durable_route, R, write) == [] end,
+                        Routes),
+    [ok = sync_transient_route(R, fun mnesia:delete_object/3) ||
+        R <- TransientRoutes],
+    [ok = sync_route(R, fun mnesia:delete_object/3) ||
+        R <- DurableRoutes],
+    [R#route.binding || R <- Routes].
+
+remove_transient_routes(Routes) ->
+    [begin
+         ok = sync_transient_route(R, fun delete_object/3),
+         R#route.binding
+     end || R <- Routes].
+
+remove_for_destination(DstName, Fun) ->
     lock_route_tables(),
     Match = reverse_route(
               #route{binding = #binding{destination = DstName, _ = '_'}}),
-    ReverseRoutes = mnesia:match_object(rabbit_reverse_route, Match, write),
-    Bindings = [begin
-                    Route = reverse_route(ReverseRoute),
-                    ok = DeleteFun(Route),
-                    Route#route.binding
-                end || ReverseRoute <- ReverseRoutes],
+    Routes = [reverse_route(R) || R <- mnesia:match_object(
+                                         rabbit_reverse_route, Match, write)],
+    Bindings = Fun(Routes),
     group_bindings_fold(fun maybe_auto_delete/3, new_deletions(),
                         lists:keysort(#binding.source, Bindings)).
 
@@ -487,4 +526,5 @@ process_deletions(Deletions) ->
 
 del_notify(Bs) -> [rabbit_event:notify(binding_deleted, info(B)) || B <- Bs].
 
-x_callback(Arg, X, F, Bs) -> ok = rabbit_exchange:callback(X, F, [Arg, X, Bs]).
+x_callback(Serial, X, F, Bs) ->
+    ok = rabbit_exchange:callback(X, F, Serial, [X, Bs]).
diff --git a/src/rabbit_channel.erl b/src/rabbit_channel.erl
index 22c6a223..d6c1e8c0 100644
--- a/src/rabbit_channel.erl
+++ b/src/rabbit_channel.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_channel).
@@ -21,22 +21,23 @@
 -behaviour(gen_server2).
 
 -export([start_link/11, do/2, do/3, do_flow/3, flush/1, shutdown/1]).
--export([send_command/2, deliver/4, flushed/2]).
+-export([send_command/2, deliver/4, send_credit_reply/2, send_drained/2,
+         flushed/2]).
 -export([list/0, info_keys/0, info/1, info/2, info_all/0, info_all/1]).
 -export([refresh_config_local/0, ready_for_close/1]).
 -export([force_event_refresh/0]).
 
 -export([init/1, terminate/2, code_change/3, handle_call/3, handle_cast/2,
-         handle_info/2, handle_pre_hibernate/1, prioritise_call/3,
-         prioritise_cast/2, prioritise_info/2, format_message_queue/2]).
+         handle_info/2, handle_pre_hibernate/1, prioritise_call/4,
+         prioritise_cast/3, prioritise_info/3, format_message_queue/2]).
 %% Internal
 -export([list_local/0]).
 
 -record(ch, {state, protocol, channel, reader_pid, writer_pid, conn_pid,
-             conn_name, limiter, tx_status, next_tag, unacked_message_q,
-             uncommitted_message_q, uncommitted_acks, uncommitted_nacks, user,
-             virtual_host, most_recently_declared_queue, queue_monitors,
-             consumer_mapping, blocking, queue_consumers, delivering_queues,
+             conn_name, limiter, tx, next_tag, unacked_message_q, user,
+             virtual_host, most_recently_declared_queue,
+             queue_names, queue_monitors, consumer_mapping,
+             blocking, queue_consumers, delivering_queues,
              queue_collector_pid, stats_timer, confirm_enabled, publish_seqno,
              unconfirmed, confirmed, capabilities, trace_state}).
 
@@ -64,6 +65,12 @@
 
 -define(INFO_KEYS, ?CREATION_EVENT_KEYS ++ ?STATISTICS_KEYS -- [pid]).
 
+-define(INCR_STATS(Incs, Measure, State),
+        case rabbit_event:stats_level(State, #ch.stats_timer) of
+            fine -> incr_stats(Incs, Measure);
+            _    -> ok
+        end).
+
 %%----------------------------------------------------------------------------
 
 -ifdef(use_specs).
@@ -75,8 +82,8 @@
 -spec(start_link/11 ::
         (channel_number(), pid(), pid(), pid(), string(),
          rabbit_types:protocol(), rabbit_types:user(), rabbit_types:vhost(),
-         rabbit_framing:amqp_table(),
-         pid(), rabbit_limiter:token()) -> rabbit_types:ok_pid_or_error()).
+         rabbit_framing:amqp_table(), pid(), pid()) ->
+                            rabbit_types:ok_pid_or_error()).
 -spec(do/2 :: (pid(), rabbit_framing:amqp_method_record()) -> 'ok').
 -spec(do/3 :: (pid(), rabbit_framing:amqp_method_record(),
                rabbit_types:maybe(rabbit_types:content())) -> 'ok').
@@ -88,6 +95,9 @@
 -spec(deliver/4 ::
         (pid(), rabbit_types:ctag(), boolean(), rabbit_amqqueue:qmsg())
         -> 'ok').
+-spec(send_credit_reply/2 :: (pid(), non_neg_integer()) -> 'ok').
+-spec(send_drained/2 :: (pid(), [{rabbit_types:ctag(), non_neg_integer()}])
+                        -> 'ok').
 -spec(flushed/2 :: (pid(), pid()) -> 'ok').
 -spec(list/0 :: () -> [pid()]).
 -spec(list_local/0 :: () -> [pid()]).
@@ -132,11 +142,17 @@ send_command(Pid, Msg) ->
 deliver(Pid, ConsumerTag, AckRequired, Msg) ->
     gen_server2:cast(Pid, {deliver, ConsumerTag, AckRequired, Msg}).
 
+send_credit_reply(Pid, Len) ->
+    gen_server2:cast(Pid, {send_credit_reply, Len}).
+
+send_drained(Pid, CTagCredit) ->
+    gen_server2:cast(Pid, {send_drained, CTagCredit}).
+
 flushed(Pid, QPid) ->
     gen_server2:cast(Pid, {flushed, QPid}).
 
 list() ->
-    rabbit_misc:append_rpc_all_nodes(rabbit_mnesia:running_clustered_nodes(),
+    rabbit_misc:append_rpc_all_nodes(rabbit_mnesia:cluster_nodes(running),
                                      rabbit_channel, list_local, []).
 
 list_local() ->
@@ -174,7 +190,7 @@ force_event_refresh() ->
 %%---------------------------------------------------------------------------
 
 init([Channel, ReaderPid, WriterPid, ConnPid, ConnName, Protocol, User, VHost,
-      Capabilities, CollectorPid, Limiter]) ->
+      Capabilities, CollectorPid, LimiterPid]) ->
     process_flag(trap_exit, true),
     ok = pg_local:join(rabbit_channels, self()),
     State = #ch{state                   = starting,
@@ -184,16 +200,14 @@ init([Channel, ReaderPid, WriterPid, ConnPid, ConnName, Protocol, User, VHost,
                 writer_pid              = WriterPid,
                 conn_pid                = ConnPid,
                 conn_name               = ConnName,
-                limiter                 = Limiter,
-                tx_status               = none,
+                limiter                 = rabbit_limiter:new(LimiterPid),
+                tx                      = none,
                 next_tag                = 1,
                 unacked_message_q       = queue:new(),
-                uncommitted_message_q   = queue:new(),
-                uncommitted_acks        = [],
-                uncommitted_nacks       = [],
                 user                    = User,
                 virtual_host            = VHost,
                 most_recently_declared_queue = <<>>,
+                queue_names             = dict:new(),
                 queue_monitors          = pmon:new(),
                 consumer_mapping        = dict:new(),
                 blocking                = sets:new(),
@@ -213,20 +227,20 @@ init([Channel, ReaderPid, WriterPid, ConnPid, ConnName, Protocol, User, VHost,
     {ok, State1, hibernate,
      {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN, ?DESIRED_HIBERNATE}}.
 
-prioritise_call(Msg, _From, _State) ->
+prioritise_call(Msg, _From, _Len, _State) ->
     case Msg of
         info           -> 9;
         {info, _Items} -> 9;
         _              -> 0
     end.
 
-prioritise_cast(Msg, _State) ->
+prioritise_cast(Msg, _Len, _State) ->
     case Msg of
         {confirm, _MsgSeqNos, _QPid} -> 5;
         _                            -> 0
     end.
 
-prioritise_info(Msg, _State) ->
+prioritise_info(Msg, _Len, _State) ->
     case Msg of
         emit_stats                   -> 7;
         _                            -> 0
@@ -258,7 +272,7 @@ handle_cast({method, Method, Content, Flow},
     end,
     try handle_method(Method, Content, State) of
         {reply, Reply, NewState} ->
-            ok = rabbit_writer:send_command(NewState#ch.writer_pid, Reply),
+            ok = send(Reply, NewState),
             noreply(NewState);
         {noreply, NewState} ->
             noreply(NewState);
@@ -267,7 +281,7 @@ handle_cast({method, Method, Content, Flow},
     catch
         exit:Reason = #amqp_error{} ->
             MethodName = rabbit_misc:method_record_type(Method),
-            send_exception(Reason#amqp_error{method = MethodName}, State);
+            handle_exception(Reason#amqp_error{method = MethodName}, State);
         _:Reason ->
             {stop, {Reason, erlang:get_stacktrace()}, State}
     end;
@@ -280,18 +294,20 @@ handle_cast(ready_for_close, State = #ch{state      = closing,
     ok = rabbit_writer:send_command_sync(WriterPid, #'channel.close_ok'{}),
     {stop, normal, State};
 
-handle_cast(terminate, State) ->
+handle_cast(terminate, State = #ch{writer_pid = WriterPid}) ->
+    ok = rabbit_writer:flush(WriterPid),
     {stop, normal, State};
 
-handle_cast({command, #'basic.consume_ok'{consumer_tag = ConsumerTag} = Msg},
-            State = #ch{writer_pid = WriterPid}) ->
-    ok = rabbit_writer:send_command(WriterPid, Msg),
-    noreply(consumer_monitor(ConsumerTag, State));
+handle_cast({command, #'basic.consume_ok'{consumer_tag = CTag} = Msg}, State) ->
+    ok = send(Msg, State),
+    noreply(consumer_monitor(CTag, State));
 
-handle_cast({command, Msg}, State = #ch{writer_pid = WriterPid}) ->
-    ok = rabbit_writer:send_command(WriterPid, Msg),
+handle_cast({command, Msg}, State) ->
+    ok = send(Msg, State),
     noreply(State);
 
+handle_cast({deliver, _CTag, _AckReq, _Msg}, State = #ch{state = closing}) ->
+    noreply(State);
 handle_cast({deliver, ConsumerTag, AckRequired,
              Msg = {_QName, QPid, _MsgId, Redelivered,
                     #basic_message{exchange_name = ExchangeName,
@@ -309,12 +325,27 @@ handle_cast({deliver, ConsumerTag, AckRequired,
            Content),
     noreply(record_sent(ConsumerTag, AckRequired, Msg, State));
 
+handle_cast({send_credit_reply, Len}, State = #ch{writer_pid = WriterPid}) ->
+    ok = rabbit_writer:send_command(
+           WriterPid, #'basic.credit_ok'{available = Len}),
+    noreply(State);
+
+handle_cast({send_drained, CTagCredit}, State = #ch{writer_pid = WriterPid}) ->
+    [ok = rabbit_writer:send_command(
+            WriterPid, #'basic.credit_drained'{consumer_tag   = ConsumerTag,
+                                               credit_drained = CreditDrained})
+     || {ConsumerTag, CreditDrained} <- CTagCredit],
+    noreply(State);
+
 handle_cast(force_event_refresh, State) ->
     rabbit_event:notify(channel_created, infos(?CREATION_EVENT_KEYS, State)),
     noreply(State);
+
 handle_cast({confirm, MsgSeqNos, From}, State) ->
     State1 = #ch{confirmed = C} = confirm(MsgSeqNos, From, State),
-    noreply([send_confirms], State1, case C of [] -> hibernate; _ -> 0 end).
+    Timeout = case C of [] -> hibernate; _ -> 0 end,
+    %% NB: don't call noreply/1 since we don't want to send confirms.
+    {noreply, ensure_stats_timer(State1), Timeout}.
 
 handle_info({bump_credit, Msg}, State) ->
     credit_flow:handle_bump_msg(Msg),
@@ -325,8 +356,10 @@ handle_info(timeout, State) ->
 
 handle_info(emit_stats, State) ->
     emit_stats(State),
-    noreply([ensure_stats_timer],
-            rabbit_event:reset_stats_timer(State, #ch.stats_timer));
+    State1 = rabbit_event:reset_stats_timer(State, #ch.stats_timer),
+    %% NB: don't call noreply/1 since we don't want to kick off the
+    %% stats timer.
+    {noreply, send_confirms(State1), hibernate};
 
 handle_info({'DOWN', _MRef, process, QPid, Reason}, State) ->
     State1 = handle_publishing_queue_down(QPid, Reason, State),
@@ -334,9 +367,13 @@ handle_info({'DOWN', _MRef, process, QPid, Reason}, State) ->
     State3 = handle_consuming_queue_down(QPid, State2),
     State4 = handle_delivering_queue_down(QPid, State3),
     credit_flow:peer_down(QPid),
-    erase_queue_stats(QPid),
-    noreply(State3#ch{queue_monitors = pmon:erase(
-                                         QPid, State4#ch.queue_monitors)});
+    #ch{queue_names = QNames, queue_monitors = QMons} = State4,
+    case dict:find(QPid, QNames) of
+        {ok, QName} -> erase_queue_stats(QName);
+        error       -> ok
+    end,
+    noreply(State4#ch{queue_names    = dict:erase(QPid, QNames),
+                      queue_monitors = pmon:erase(QPid, QMons)});
 
 handle_info({'EXIT', _Pid, Reason}, State) ->
     {stop, Reason, State}.
@@ -357,6 +394,8 @@ terminate(Reason, State) ->
         _                 -> ok
     end,
     pg_local:leave(rabbit_channels, self()),
+    rabbit_event:if_enabled(State, #ch.stats_timer,
+                            fun() -> emit_stats(State) end),
     rabbit_event:notify(channel_closed, [{pid, self()}]).
 
 code_change(_OldVsn, State, _Extra) ->
@@ -366,30 +405,11 @@ format_message_queue(Opt, MQ) -> rabbit_misc:format_message_queue(Opt, MQ).
 
 %%---------------------------------------------------------------------------
 
-reply(Reply, NewState) -> reply(Reply, [], NewState).
-
-reply(Reply, Mask, NewState) -> reply(Reply, Mask, NewState, hibernate).
-
-reply(Reply, Mask, NewState, Timeout) ->
-    {reply, Reply, next_state(Mask, NewState), Timeout}.
-
-noreply(NewState) -> noreply([], NewState).
-
-noreply(Mask, NewState) -> noreply(Mask, NewState, hibernate).
-
-noreply(Mask, NewState, Timeout) ->
-    {noreply, next_state(Mask, NewState), Timeout}.
+reply(Reply, NewState) -> {reply, Reply, next_state(NewState), hibernate}.
 
--define(MASKED_CALL(Fun, Mask, State),
-        case lists:member(Fun, Mask) of
-            true  -> State;
-            false -> Fun(State)
-        end).
+noreply(NewState) -> {noreply, next_state(NewState), hibernate}.
 
-next_state(Mask, State) ->
-    State1 = ?MASKED_CALL(ensure_stats_timer, Mask, State),
-    State2 = ?MASKED_CALL(send_confirms,      Mask, State1),
-    State2.
+next_state(State) -> ensure_stats_timer(send_confirms(State)).
 
 ensure_stats_timer(State) ->
     rabbit_event:ensure_stats_timer(State, #ch.stats_timer, emit_stats).
@@ -400,24 +420,40 @@ return_ok(State, false, Msg)  -> {reply, Msg, State}.
 ok_msg(true, _Msg) -> undefined;
 ok_msg(false, Msg) -> Msg.
 
-send_exception(Reason, State = #ch{protocol   = Protocol,
-                                   channel    = Channel,
-                                   writer_pid = WriterPid,
-                                   reader_pid = ReaderPid,
-                                   conn_pid   = ConnPid}) ->
-    {CloseChannel, CloseMethod} =
-        rabbit_binary_generator:map_exception(Channel, Reason, Protocol),
-    rabbit_log:error("connection ~p, channel ~p - error:~n~p~n",
-                     [ConnPid, Channel, Reason]),
+send(_Command, #ch{state = closing}) ->
+    ok;
+send(Command, #ch{writer_pid = WriterPid}) ->
+    ok = rabbit_writer:send_command(WriterPid, Command).
+
+handle_exception(Reason, State = #ch{protocol   = Protocol,
+                                     channel    = Channel,
+                                     writer_pid = WriterPid,
+                                     reader_pid = ReaderPid,
+                                     conn_pid   = ConnPid}) ->
     %% something bad's happened: notify_queues may not be 'ok'
     {_Result, State1} = notify_queues(State),
-    case CloseChannel of
-        Channel -> ok = rabbit_writer:send_command(WriterPid, CloseMethod),
-                   {noreply, State1};
-        _       -> ReaderPid ! {channel_exit, Channel, Reason},
-                   {stop, normal, State1}
+    case rabbit_binary_generator:map_exception(Channel, Reason, Protocol) of
+        {Channel, CloseMethod} ->
+            rabbit_log:error("connection ~p, channel ~p - soft error:~n~p~n",
+                             [ConnPid, Channel, Reason]),
+            ok = rabbit_writer:send_command(WriterPid, CloseMethod),
+            {noreply, State1};
+        {0, _} ->
+            ReaderPid ! {channel_exit, Channel, Reason},
+            {stop, normal, State1}
     end.
 
+-ifdef(use_specs).
+-spec(precondition_failed/1 :: (string()) -> no_return()).
+-endif.
+precondition_failed(Format) -> precondition_failed(Format, []).
+
+-ifdef(use_specs).
+-spec(precondition_failed/2 :: (string(), [any()]) -> no_return()).
+-endif.
+precondition_failed(Format, Params) ->
+    rabbit_misc:protocol_error(precondition_failed, Format, Params).
+
 return_queue_declare_ok(#resource{name = ActualName},
                         NoWait, MessageCount, ConsumerCount, State) ->
     return_ok(State#ch{most_recently_declared_queue = ActualName}, NoWait,
@@ -431,15 +467,13 @@ check_resource_access(User, Resource, Perm) ->
                 undefined -> [];
                 Other     -> Other
             end,
-    CacheTail =
-        case lists:member(V, Cache) of
-            true  -> lists:delete(V, Cache);
-            false -> ok = rabbit_access_control:check_resource_access(
-                            User, Resource, Perm),
-                     lists:sublist(Cache, ?MAX_PERMISSION_CACHE_SIZE - 1)
-        end,
-    put(permission_cache, [V | CacheTail]),
-    ok.
+    case lists:member(V, Cache) of
+        true  -> ok;
+        false -> ok = rabbit_access_control:check_resource_access(
+                        User, Resource, Perm),
+                 CacheTail = lists:sublist(Cache, ?MAX_PERMISSION_CACHE_SIZE-1),
+                 put(permission_cache, [V | CacheTail])
+    end.
 
 clear_permission_cache() ->
     erase(permission_cache),
@@ -460,10 +494,21 @@ check_user_id_header(#'P_basic'{user_id = Username},
                      #ch{user = #user{username = Username}}) ->
     ok;
 check_user_id_header(#'P_basic'{user_id = Claimed},
-                     #ch{user = #user{username = Actual}}) ->
-    rabbit_misc:protocol_error(
-      precondition_failed, "user_id property set to '~s' but "
-      "authenticated user was '~s'", [Claimed, Actual]).
+                     #ch{user = #user{username = Actual,
+                                      tags     = Tags}}) ->
+    case lists:member(impersonator, Tags) of
+        true  -> ok;
+        false -> precondition_failed(
+                   "user_id property set to '~s' but authenticated user was "
+                   "'~s'", [Claimed, Actual])
+    end.
+
+check_expiration_header(Props) ->
+    case rabbit_basic:parse_expiration(Props) of % only validity matters here
+        {error, Why} -> precondition_failed("invalid expiration '~s': ~p",
+                                            [Props#'P_basic'.expiration, Why]);
+        {ok, _}      -> ok
+    end.
 
 check_internal_exchange(#exchange{name = Name, internal = true}) ->
     rabbit_misc:protocol_error(access_refused,
@@ -507,16 +552,12 @@ check_not_default_exchange(_) ->
 %% check that an exchange/queue name does not contain the reserved
 %% "amq."  prefix.
 %%
-%% One, quite reasonable, interpretation of the spec, taken by the
-%% QPid M1 Java client, is that the exclusion of "amq." prefixed names
+%% As per the AMQP 0-9-1 spec, the exclusion of "amq." prefixed names
 %% only applies on actual creation, and not in the cases where the
-%% entity already exists. This is how we use this function in the code
-%% below. However, AMQP JIRA 123 changes that in 0-10, and possibly
-%% 0-9SP1, making it illegal to attempt to declare an exchange/queue
-%% with an amq.* name when passive=false. So this will need
-%% revisiting.
+%% entity already exists or passive=true.
 %%
-%% TODO: enforce other constraints on name. See AMQP JIRA 69.
+%% NB: We deliberately do not enforce the other constraints on names
+%% required by the spec.
 check_name(Kind, NameBin = <<"amq.", _/binary>>) ->
     rabbit_misc:protocol_error(
       access_refused,
@@ -527,20 +568,16 @@ check_name(_Kind, NameBin) ->
 queue_blocked(QPid, State = #ch{blocking = Blocking}) ->
     case sets:is_element(QPid, Blocking) of
         false -> State;
-        true  -> Blocking1 = sets:del_element(QPid, Blocking),
-                 ok = case sets:size(Blocking1) of
-                          0 -> rabbit_writer:send_command(
-                                 State#ch.writer_pid,
-                                 #'channel.flow_ok'{active = false});
-                          _ -> ok
-                      end,
-                 State#ch{blocking = Blocking1}
+        true  -> Rest = sets:del_element(QPid, Blocking), % stop awaiting QPid
+                 maybe_send_flow_ok(State#ch{blocking = Rest})
     end.
 
-record_confirm(undefined, _, State) ->
-    State;
-record_confirm(MsgSeqNo, XName, State) ->
-    record_confirms([{MsgSeqNo, XName}], State).
+%% Send flow_ok{active = false} via send/2 once the 'blocking' set is empty;
+%% the state is handed back unchanged either way.
+maybe_send_flow_ok(State = #ch{blocking = Pending}) ->
+    NoneLeft = sets:size(Pending) =:= 0,
+    NoneLeft andalso (ok = send(#'channel.flow_ok'{active = false}, State)),
+    State.
 
 record_confirms([], State) ->
     State;
@@ -566,14 +603,25 @@ handle_method(_Method, _, #ch{state = starting}) ->
 handle_method(#'channel.close_ok'{}, _, #ch{state = closing}) ->
     stop;
 
-handle_method(#'channel.close'{}, _, State = #ch{state = closing}) ->
-    {reply, #'channel.close_ok'{}, State};
+handle_method(#'channel.close'{}, _, State = #ch{writer_pid = WriterPid,
+                                                 state      = closing}) ->
+    ok = rabbit_writer:send_command(WriterPid, #'channel.close_ok'{}),
+    {noreply, State};
 
 handle_method(_Method, _, State = #ch{state = closing}) ->
     {noreply, State};
 
 handle_method(#'channel.close'{}, _, State = #ch{reader_pid = ReaderPid}) ->
     {ok, State1} = notify_queues(State),
+    %% We issue the channel.close_ok response after a handshake with
+    %% the reader, the other half of which is ready_for_close. That
+    %% way the reader forgets about the channel before we send the
+    %% response (and this channel process terminates). If we didn't do
+    %% that, a channel.open for the same channel number, which a
+    %% client is entitled to send as soon as it has received the
+    %% close_ok, might be received by the reader before it has seen
+    %% the termination and hence be sent to the old, now dead/dying
+    %% channel process, instead of a new process, and thus lost.
     ReaderPid ! {channel_closing, self()},
     {noreply, State1};
 
@@ -581,20 +629,22 @@ handle_method(#'channel.close'{}, _, State = #ch{reader_pid = ReaderPid}) ->
 %% while waiting for the reply to a synchronous command, we generally
 %% do allow this...except in the case of a pending tx.commit, where
 %% it could wreak havoc.
-handle_method(_Method, _, #ch{tx_status = TxStatus})
-  when TxStatus =/= none andalso TxStatus =/= in_progress ->
+handle_method(_Method, _, #ch{tx = Tx})
+  when Tx =:= committing orelse Tx =:= failed ->
     rabbit_misc:protocol_error(
       channel_error, "unexpected command while processing 'tx.commit'", []);
 
 handle_method(#'access.request'{},_, State) ->
     {reply, #'access.request_ok'{ticket = 1}, State};
 
+handle_method(#'basic.publish'{immediate = true}, _Content, _State) ->
+    rabbit_misc:protocol_error(not_implemented, "immediate=true", []);
+
 handle_method(#'basic.publish'{exchange    = ExchangeNameBin,
                                routing_key = RoutingKey,
-                               mandatory   = Mandatory,
-                               immediate   = Immediate},
+                               mandatory   = Mandatory},
               Content, State = #ch{virtual_host    = VHostPath,
-                                   tx_status       = TxStatus,
+                                   tx              = Tx,
                                    confirm_enabled = ConfirmEnabled,
                                    trace_state     = TraceState}) ->
     ExchangeName = rabbit_misc:r(VHostPath, exchange, ExchangeNameBin),
@@ -603,30 +653,29 @@ handle_method(#'basic.publish'{exchange    = ExchangeNameBin,
     check_internal_exchange(Exchange),
     %% We decode the content's properties here because we're almost
     %% certain to want to look at delivery-mode and priority.
-    DecodedContent = rabbit_binary_parser:ensure_content_decoded(Content),
-    check_user_id_header(DecodedContent#content.properties, State),
+    DecodedContent = #content {properties = Props} =
+        rabbit_binary_parser:ensure_content_decoded(Content),
+    check_user_id_header(Props, State),
+    check_expiration_header(Props),
     {MsgSeqNo, State1} =
-        case {TxStatus, ConfirmEnabled} of
+        case {Tx, ConfirmEnabled} of
             {none, false} -> {undefined, State};
             {_, _}        -> SeqNo = State#ch.publish_seqno,
                              {SeqNo, State#ch{publish_seqno = SeqNo + 1}}
         end,
     case rabbit_basic:message(ExchangeName, RoutingKey, DecodedContent) of
         {ok, Message} ->
-            rabbit_trace:tap_trace_in(Message, TraceState),
-            Delivery = rabbit_basic:delivery(Mandatory, Immediate, Message,
-                                             MsgSeqNo),
+            rabbit_trace:tap_in(Message, TraceState),
+            Delivery = rabbit_basic:delivery(Mandatory, Message, MsgSeqNo),
             QNames = rabbit_exchange:route(Exchange, Delivery),
-            {noreply,
-             case TxStatus of
-                 none        -> deliver_to_queues({Delivery, QNames}, State1);
-                 in_progress -> TMQ = State1#ch.uncommitted_message_q,
-                                NewTMQ = queue:in({Delivery, QNames}, TMQ),
-                                State1#ch{uncommitted_message_q = NewTMQ}
-             end};
+            DQ = {Delivery, QNames},
+            {noreply, case Tx of
+                          none         -> deliver_to_queues(DQ, State1);
+                          {Msgs, Acks} -> Msgs1 = queue:in(DQ, Msgs),
+                                          State1#ch{tx = {Msgs1, Acks}}
+                      end};
         {error, Reason} ->
-            rabbit_misc:protocol_error(precondition_failed,
-                                       "invalid message: ~p", [Reason])
+            precondition_failed("invalid message: ~p", [Reason])
     end;
 
 handle_method(#'basic.nack'{delivery_tag = DeliveryTag,
@@ -637,29 +686,31 @@ handle_method(#'basic.nack'{delivery_tag = DeliveryTag,
 
 handle_method(#'basic.ack'{delivery_tag = DeliveryTag,
                            multiple = Multiple},
-              _, State = #ch{unacked_message_q = UAMQ, tx_status = TxStatus}) ->
+              _, State = #ch{unacked_message_q = UAMQ, tx = Tx}) ->
     {Acked, Remaining} = collect_acks(UAMQ, DeliveryTag, Multiple),
     State1 = State#ch{unacked_message_q = Remaining},
-    {noreply,
-     case TxStatus of
-         none        -> ack(Acked, State1),
-                        State1;
-         in_progress -> State1#ch{uncommitted_acks =
-                                      Acked ++ State1#ch.uncommitted_acks}
-     end};
+    {noreply, case Tx of
+                  none         -> ack(Acked, State1),
+                                  State1;
+                  {Msgs, Acks} -> Acks1 = ack_cons(ack, Acked, Acks),
+                                  State1#ch{tx = {Msgs, Acks1}}
+              end};
 
 handle_method(#'basic.get'{queue = QueueNameBin,
                            no_ack = NoAck},
               _, State = #ch{writer_pid = WriterPid,
                              conn_pid   = ConnPid,
+                             limiter    = Limiter,
                              next_tag   = DeliveryTag}) ->
     QueueName = expand_queue_name_shortcut(QueueNameBin, State),
     check_read_permitted(QueueName, State),
     case rabbit_amqqueue:with_exclusive_access_or_die(
            QueueName, ConnPid,
-           fun (Q) -> rabbit_amqqueue:basic_get(Q, self(), NoAck) end) of
+           fun (Q) -> rabbit_amqqueue:basic_get(
+                        Q, self(), NoAck, rabbit_limiter:pid(Limiter))
+           end) of
         {ok, MessageCount,
-         Msg = {_QName, QPid, _MsgId, Redelivered,
+         Msg = {QName, QPid, _MsgId, Redelivered,
                 #basic_message{exchange_name = ExchangeName,
                                routing_keys  = [RoutingKey | _CcRoutes],
                                content       = Content}}} ->
@@ -671,7 +722,7 @@ handle_method(#'basic.get'{queue = QueueNameBin,
                                    routing_key   = RoutingKey,
                                    message_count = MessageCount},
                    Content),
-            State1 = monitor_delivering_queue(NoAck, QPid, State),
+            State1 = monitor_delivering_queue(NoAck, QPid, QName, State),
             {noreply, record_sent(none, not(NoAck), Msg, State1)};
         empty ->
             {reply, #'basic.get_empty'{}, State}
@@ -682,7 +733,8 @@ handle_method(#'basic.consume'{queue        = QueueNameBin,
                                no_local     = _, % FIXME: implement
                                no_ack       = NoAck,
                                exclusive    = ExclusiveConsume,
-                               nowait       = NoWait},
+                               nowait       = NoWait,
+                               arguments    = Arguments},
               _, State = #ch{conn_pid          = ConnPid,
                              limiter           = Limiter,
                              consumer_mapping  = ConsumerMapping}) ->
@@ -704,16 +756,20 @@ handle_method(#'basic.consume'{queue        = QueueNameBin,
                    QueueName, ConnPid,
                    fun (Q) ->
                            {rabbit_amqqueue:basic_consume(
-                              Q, NoAck, self(), Limiter,
+                              Q, NoAck, self(),
+                              rabbit_limiter:pid(Limiter),
+                              rabbit_limiter:is_active(Limiter),
                               ActualConsumerTag, ExclusiveConsume,
+                              parse_credit_args(Arguments),
                               ok_msg(NoWait, #'basic.consume_ok'{
                                        consumer_tag = ActualConsumerTag})),
                             Q}
                    end) of
-                {ok, Q = #amqqueue{pid = QPid}} ->
+                {ok, Q = #amqqueue{pid = QPid, name = QName}} ->
                     CM1 = dict:store(ActualConsumerTag, Q, ConsumerMapping),
                     State1 = monitor_delivering_queue(
-                               NoAck, QPid, State#ch{consumer_mapping = CM1}),
+                               NoAck, QPid, QName,
+                               State#ch{consumer_mapping = CM1}),
                     {noreply,
                      case NoWait of
                          true  -> consumer_monitor(ActualConsumerTag, State1);
@@ -778,33 +834,31 @@ handle_method(#'basic.qos'{prefetch_size = Size}, _, _State) when Size /= 0 ->
     rabbit_misc:protocol_error(not_implemented,
                                "prefetch_size!=0 (~w)", [Size]);
 
-handle_method(#'basic.qos'{prefetch_count = PrefetchCount}, _,
+handle_method(#'basic.qos'{prefetch_count = 0}, _,
               State = #ch{limiter = Limiter}) ->
-    Limiter1 = case {rabbit_limiter:is_enabled(Limiter), PrefetchCount} of
-                   {false, 0} -> Limiter;
-                   {false, _} -> enable_limiter(State);
-                   {_, _}     -> Limiter
-               end,
-    Limiter3 = case rabbit_limiter:limit(Limiter1, PrefetchCount) of
-                   ok                   -> Limiter1;
-                   {disabled, Limiter2} -> ok = limit_queues(Limiter2, State),
-                                           Limiter2
-               end,
-    {reply, #'basic.qos_ok'{}, State#ch{limiter = Limiter3}};
+    Limiter1 = rabbit_limiter:unlimit_prefetch(Limiter),
+    {reply, #'basic.qos_ok'{}, State#ch{limiter = Limiter1}};
+
+handle_method(#'basic.qos'{prefetch_count = PrefetchCount}, _,
+              State = #ch{limiter = Limiter, unacked_message_q = UAMQ}) ->
+    %% TODO queue:len(UAMQ) is not strictly right since that counts
+    %% unacked messages from basic.get too. Pretty obscure though.
+    Limiter1 = rabbit_limiter:limit_prefetch(Limiter,
+                                             PrefetchCount, queue:len(UAMQ)),
+    {reply, #'basic.qos_ok'{},
+     maybe_limit_queues(Limiter, Limiter1, State#ch{limiter = Limiter1})};
 
 handle_method(#'basic.recover_async'{requeue = true},
               _, State = #ch{unacked_message_q = UAMQ,
                              limiter = Limiter}) ->
     OkFun = fun () -> ok end,
     UAMQL = queue:to_list(UAMQ),
-    ok = fold_per_queue(
-           fun (QPid, MsgIds, ok) ->
-                   rabbit_misc:with_exit_handler(
-                     OkFun, fun () ->
-                                    rabbit_amqqueue:requeue(
-                                      QPid, MsgIds, self())
-                            end)
-           end, ok, UAMQL),
+    foreach_per_queue(
+      fun (QPid, MsgIds) ->
+              rabbit_misc:with_exit_handler(
+                OkFun,
+                fun () -> rabbit_amqqueue:requeue(QPid, MsgIds, self()) end)
+      end, lists:reverse(UAMQL)),
     ok = notify_limiter(Limiter, UAMQL),
     %% No answer required - basic.recover is the newer, synchronous
     %% variant of this method
@@ -814,12 +868,9 @@ handle_method(#'basic.recover_async'{requeue = false}, _, _State) ->
     rabbit_misc:protocol_error(not_implemented, "requeue=false", []);
 
 handle_method(#'basic.recover'{requeue = Requeue}, Content, State) ->
-    {noreply, State2 = #ch{writer_pid = WriterPid}} =
-        handle_method(#'basic.recover_async'{requeue = Requeue},
-                      Content,
-                      State),
-    ok = rabbit_writer:send_command(WriterPid, #'basic.recover_ok'{}),
-    {noreply, State2};
+    {noreply, State1} = handle_method(#'basic.recover_async'{requeue = Requeue},
+                                      Content, State),
+    {reply, #'basic.recover_ok'{}, State1};
 
 handle_method(#'basic.reject'{delivery_tag = DeliveryTag,
                               requeue = Requeue},
@@ -843,9 +894,13 @@ handle_method(#'exchange.declare'{exchange = ExchangeNameBin,
             {ok, FoundX} -> FoundX;
             {error, not_found} ->
                 check_name('exchange', ExchangeNameBin),
-                case rabbit_misc:r_arg(VHostPath, exchange, Args,
-                                       <<"alternate-exchange">>) of
+                AeKey = <<"alternate-exchange">>,
+                case rabbit_misc:r_arg(VHostPath, exchange, Args, AeKey) of
                     undefined -> ok;
+                    {error, {invalid_type, Type}} ->
+                        precondition_failed(
+                          "invalid type '~s' for arg '~s' in ~s",
+                          [Type, AeKey, rabbit_misc:rs(ExchangeName)]);
                     AName     -> check_read_permitted(ExchangeName, State),
                                  check_write_permitted(AName, State),
                                  ok
@@ -881,8 +936,7 @@ handle_method(#'exchange.delete'{exchange = ExchangeNameBin,
         {error, not_found} ->
             rabbit_misc:not_found(ExchangeName);
         {error, in_use} ->
-            rabbit_misc:protocol_error(
-              precondition_failed, "~s in use", [rabbit_misc:rs(ExchangeName)]);
+            precondition_failed("~s in use", [rabbit_misc:rs(ExchangeName)]);
         ok ->
             return_ok(State, NoWait,  #'exchange.delete_ok'{})
     end;
@@ -936,6 +990,19 @@ handle_method(#'queue.declare'{queue       = QueueNameBin,
             return_queue_declare_ok(QueueName, NoWait, MessageCount,
                                     ConsumerCount, State);
         {error, not_found} ->
+            DlxKey = <<"x-dead-letter-exchange">>,
+            case rabbit_misc:r_arg(VHostPath, exchange, Args, DlxKey) of
+               undefined ->
+                   ok;
+               {error, {invalid_type, Type}} ->
+                    precondition_failed(
+                      "invalid type '~s' for arg '~s' in ~s",
+                      [Type, DlxKey, rabbit_misc:rs(QueueName)]);
+               DLX ->
+                   check_read_permitted(QueueName, State),
+                   check_write_permitted(DLX, State),
+                   ok
+            end,
             case rabbit_amqqueue:declare(QueueName, Durable, AutoDelete,
                                          Args, Owner) of
                 {new, #amqqueue{pid = QPid}} ->
@@ -952,8 +1019,12 @@ handle_method(#'queue.declare'{queue       = QueueNameBin,
                 {existing, _Q} ->
                     %% must have been created between the stat and the
                     %% declare. Loop around again.
-                    handle_method(Declare, none, State)
-            end
+                    handle_method(Declare, none, State);
+                {absent, Q} ->
+                    rabbit_misc:absent(Q)
+            end;
+        {error, {absent, Q}} ->
+            rabbit_misc:absent(Q)
     end;
 
 handle_method(#'queue.declare'{queue   = QueueNameBin,
@@ -980,11 +1051,9 @@ handle_method(#'queue.delete'{queue = QueueNameBin,
            QueueName, ConnPid,
            fun (Q) -> rabbit_amqqueue:delete(Q, IfUnused, IfEmpty) end) of
         {error, in_use} ->
-            rabbit_misc:protocol_error(
-              precondition_failed, "~s in use", [rabbit_misc:rs(QueueName)]);
+            precondition_failed("~s in use", [rabbit_misc:rs(QueueName)]);
         {error, not_empty} ->
-            rabbit_misc:protocol_error(
-              precondition_failed, "~s not empty", [rabbit_misc:rs(QueueName)]);
+            precondition_failed("~s not empty", [rabbit_misc:rs(QueueName)]);
         {ok, PurgedMessageCount} ->
             return_ok(State, NoWait,
                       #'queue.delete_ok'{message_count = PurgedMessageCount})
@@ -1019,41 +1088,38 @@ handle_method(#'queue.purge'{queue = QueueNameBin,
               #'queue.purge_ok'{message_count = PurgedMessageCount});
 
 handle_method(#'tx.select'{}, _, #ch{confirm_enabled = true}) ->
-    rabbit_misc:protocol_error(
-      precondition_failed, "cannot switch from confirm to tx mode", []);
+    precondition_failed("cannot switch from confirm to tx mode");
+
+handle_method(#'tx.select'{}, _, State = #ch{tx = none}) ->
+    {reply, #'tx.select_ok'{}, State#ch{tx = new_tx()}};
 
 handle_method(#'tx.select'{}, _, State) ->
-    {reply, #'tx.select_ok'{}, State#ch{tx_status = in_progress}};
+    {reply, #'tx.select_ok'{}, State};
 
-handle_method(#'tx.commit'{}, _, #ch{tx_status = none}) ->
-    rabbit_misc:protocol_error(
-      precondition_failed, "channel is not transactional", []);
-
-handle_method(#'tx.commit'{}, _,
-              State = #ch{uncommitted_message_q = TMQ,
-                          uncommitted_acks      = TAL,
-                          uncommitted_nacks     = TNL,
-                          limiter               = Limiter}) ->
-    State1 = rabbit_misc:queue_fold(fun deliver_to_queues/2, State, TMQ),
-    ack(TAL, State1),
-    lists:foreach(
-      fun({Requeue, Acked}) -> reject(Requeue, Acked, Limiter) end, TNL),
-    {noreply, maybe_complete_tx(new_tx(State1#ch{tx_status = committing}))};
-
-handle_method(#'tx.rollback'{}, _, #ch{tx_status = none}) ->
-    rabbit_misc:protocol_error(
-      precondition_failed, "channel is not transactional", []);
+handle_method(#'tx.commit'{}, _, #ch{tx = none}) ->
+    precondition_failed("channel is not transactional");
+
+handle_method(#'tx.commit'{}, _, State = #ch{tx      = {Msgs, Acks},
+                                             limiter = Limiter}) ->
+    State1 = rabbit_misc:queue_fold(fun deliver_to_queues/2, State, Msgs),
+    Rev = fun (X) -> lists:reverse(lists:sort(X)) end,
+    lists:foreach(fun ({ack,     A}) -> ack(Rev(A), State1);
+                      ({Requeue, A}) -> reject(Requeue, Rev(A), Limiter)
+                  end, lists:reverse(Acks)),
+    {noreply, maybe_complete_tx(State1#ch{tx = committing})};
+
+handle_method(#'tx.rollback'{}, _, #ch{tx = none}) ->
+    precondition_failed("channel is not transactional");
 
 handle_method(#'tx.rollback'{}, _, State = #ch{unacked_message_q = UAMQ,
-                                               uncommitted_acks  = TAL,
-                                               uncommitted_nacks = TNL}) ->
-    TNL1 = lists:append([L || {_, L} <- TNL]),
-    UAMQ1 = queue:from_list(lists:usort(TAL ++ TNL1 ++ queue:to_list(UAMQ))),
-    {reply, #'tx.rollback_ok'{}, new_tx(State#ch{unacked_message_q = UAMQ1})};
+                                               tx = {_Msgs, Acks}}) ->
+    AcksL = lists:append(lists:reverse([lists:reverse(L) || {_, L} <- Acks])),
+    UAMQ1 = queue:from_list(lists:usort(AcksL ++ queue:to_list(UAMQ))),
+    {reply, #'tx.rollback_ok'{}, State#ch{unacked_message_q = UAMQ1,
+                                          tx                = new_tx()}};
 
-handle_method(#'confirm.select'{}, _, #ch{tx_status = in_progress}) ->
-    rabbit_misc:protocol_error(
-      precondition_failed, "cannot switch from tx to confirm mode", []);
+handle_method(#'confirm.select'{}, _, #ch{tx = {_, _}}) ->
+    precondition_failed("cannot switch from tx to confirm mode");
 
 handle_method(#'confirm.select'{nowait = NoWait}, _, State) ->
     return_ok(State#ch{confirm_enabled = true},
@@ -1061,27 +1127,44 @@ handle_method(#'confirm.select'{nowait = NoWait}, _, State) ->
 
 handle_method(#'channel.flow'{active = true}, _,
               State = #ch{limiter = Limiter}) ->
-    Limiter2 = case rabbit_limiter:unblock(Limiter) of
-                   ok                   -> Limiter;
-                   {disabled, Limiter1} -> ok = limit_queues(Limiter1, State),
-                                           Limiter1
-               end,
-    {reply, #'channel.flow_ok'{active = true}, State#ch{limiter = Limiter2}};
+    Limiter1 = rabbit_limiter:unblock(Limiter),
+    {reply, #'channel.flow_ok'{active = true},
+     maybe_limit_queues(Limiter, Limiter1, State#ch{limiter = Limiter1})};
 
 handle_method(#'channel.flow'{active = false}, _,
               State = #ch{consumer_mapping = Consumers,
                           limiter          = Limiter}) ->
-    Limiter1 = case rabbit_limiter:is_enabled(Limiter) of
-                   true  -> Limiter;
-                   false -> enable_limiter(State)
-               end,
-    State1 = State#ch{limiter = Limiter1},
-    ok = rabbit_limiter:block(Limiter1),
-    case consumer_queues(Consumers) of
-        []    -> {reply, #'channel.flow_ok'{active = false}, State1};
-        QPids -> State2 = State1#ch{blocking = sets:from_list(QPids)},
+    case rabbit_limiter:is_blocked(Limiter) of
+        true  -> {noreply, maybe_send_flow_ok(State)};
+        false -> Limiter1 = rabbit_limiter:block(Limiter),
+                 State1 = maybe_limit_queues(Limiter, Limiter1,
+                                             State#ch{limiter = Limiter1}),
+                 %% The semantics of channel.flow{active=false}
+                 %% require that no messages are delivered after the
+                 %% channel.flow_ok has been sent. We accomplish that
+                 %% by "flushing" all messages in flight from the
+                 %% consumer queues to us. To do this we tell all the
+                 %% queues to invoke rabbit_channel:flushed/2, which
+                 %% will send us a {flushed, ...} message that appears
+                 %% *after* all the {deliver, ...} messages. We keep
+                 %% track of all the QPids thus asked, and once all of
+                 %% them have responded (or died) we send the
+                 %% channel.flow_ok.
+                 QPids = consumer_queues(Consumers),
                  ok = rabbit_amqqueue:flush_all(QPids, self()),
-                 {noreply, State2}
+                 {noreply, maybe_send_flow_ok(
+                             State1#ch{blocking = sets:from_list(QPids)})}
+    end;
+
+handle_method(#'basic.credit'{consumer_tag = CTag,
+                              credit       = Credit,
+                              drain        = Drain}, _,
+              State = #ch{consumer_mapping = Consumers}) ->
+    case dict:find(CTag, Consumers) of
+        {ok, Q} -> ok = rabbit_amqqueue:credit(
+                          Q, self(), CTag, Credit, Drain),
+                   {noreply, State};
+        error   -> precondition_failed("unknown consumer tag '~s'", [CTag])
     end;
 
 handle_method(_MethodRecord, _Content, _State) ->
@@ -1111,15 +1194,19 @@ consumer_monitor(ConsumerTag,
             State
     end.
 
-monitor_delivering_queue(true, _QPid, State) ->
-    State;
-monitor_delivering_queue(false, QPid, State = #ch{queue_monitors    = QMons,
-                                                  delivering_queues = DQ}) ->
-    State#ch{queue_monitors    = pmon:monitor(QPid, QMons),
-             delivering_queues = sets:add_element(QPid, DQ)}.
+%% Track QPid's name, monitor it and, when acks are due, add it to DQ.
+monitor_delivering_queue(NoAck, QPid, QName, State = #ch{}) ->
+    #ch{queue_names = Names, queue_monitors = Mons,
+        delivering_queues = DQ} = State,
+    State1 = State#ch{queue_names    = dict:store(QPid, QName, Names),
+                      queue_monitors = pmon:monitor(QPid, Mons)},
+    case NoAck of
+        true  -> State1;
+        false -> State1#ch{delivering_queues = sets:add_element(QPid, DQ)}
+    end.
 
 handle_publishing_queue_down(QPid, Reason, State = #ch{unconfirmed = UC}) ->
-    case rabbit_misc:is_abnormal_termination(Reason) of
+    case rabbit_misc:is_abnormal_exit(Reason) of
         true  -> {MXs, UC1} = dtree:take_all(QPid, UC),
                  send_nacks(MXs, State#ch{unconfirmed = UC1});
         false -> {MXs, UC1} = dtree:take(QPid, UC),
@@ -1129,16 +1216,21 @@ handle_publishing_queue_down(QPid, Reason, State = #ch{unconfirmed = UC}) ->
 handle_consuming_queue_down(QPid,
                             State = #ch{consumer_mapping = ConsumerMapping,
                                         queue_consumers  = QCons,
-                                        writer_pid       = WriterPid}) ->
+                                        queue_names      = QNames}) ->
     ConsumerTags = case dict:find(QPid, QCons) of
                        error       -> gb_sets:new();
                        {ok, CTags} -> CTags
                    end,
     ConsumerMapping1 =
         gb_sets:fold(fun (CTag, CMap) ->
-                             Cancel = #'basic.cancel'{consumer_tag = CTag,
-                                                      nowait       = true},
-                             ok = rabbit_writer:send_command(WriterPid, Cancel),
+                             ok = send(#'basic.cancel'{consumer_tag = CTag,
+                                                       nowait       = true},
+                                       State),
+                             rabbit_event:notify(
+                               consumer_deleted,
+                               [{consumer_tag, CTag},
+                                {channel,      self()},
+                                {queue,        dict:fetch(QPid, QNames)}]),
                              dict:erase(CTag, CMap)
                      end, ConsumerMapping, ConsumerTags),
     State#ch{consumer_mapping = ConsumerMapping1,
@@ -1147,14 +1239,20 @@ handle_consuming_queue_down(QPid,
 handle_delivering_queue_down(QPid, State = #ch{delivering_queues = DQ}) ->
     State#ch{delivering_queues = sets:del_element(QPid, DQ)}.
 
+parse_credit_args(Arguments) -> % {Credit, Drain} from "x-credit", or none
+    case rabbit_misc:table_lookup(Arguments, <<"x-credit">>) of
+        undefined  -> none;
+        {table, T} -> credit_args(rabbit_misc:table_lookup(T, <<"credit">>),
+                                  rabbit_misc:table_lookup(T, <<"drain">>))
+    end.
+%% NOTE(review): wire-decoded tables may tag booleans 'bool' - confirm.
+credit_args({long, Credit}, {boolean, Drain}) -> {Credit, Drain};
+credit_args(_, _)                             -> none.
+
 binding_action(Fun, ExchangeNameBin, DestinationType, DestinationNameBin,
                RoutingKey, Arguments, ReturnMethod, NoWait,
                State = #ch{virtual_host = VHostPath,
                            conn_pid     = ConnPid }) ->
-    %% FIXME: connection exception (!) on failure??
-    %% (see rule named "failure" in spec-XML)
-    %% FIXME: don't allow binding to internal exchanges -
-    %% including the one named "" !
     {DestinationName, ActualRoutingKey} =
         expand_binding(DestinationType, DestinationNameBin, RoutingKey, State),
     check_write_permitted(DestinationName, State),
@@ -1172,19 +1270,17 @@ binding_action(Fun, ExchangeNameBin, DestinationType, DestinationNameBin,
                  (_X, #exchange{}) ->
                      ok
              end) of
-        {error, source_not_found} ->
-            rabbit_misc:not_found(ExchangeName);
-        {error, destination_not_found} ->
-            rabbit_misc:not_found(DestinationName);
-        {error, source_and_destination_not_found} ->
-            rabbit_misc:protocol_error(
-              not_found, "no ~s and no ~s", [rabbit_misc:rs(ExchangeName),
-                                             rabbit_misc:rs(DestinationName)]);
+        {error, {resources_missing, [{not_found, Name} | _]}} ->
+            rabbit_misc:not_found(Name);
+        {error, {resources_missing, [{absent, Q} | _]}} ->
+            rabbit_misc:absent(Q);
         {error, binding_not_found} ->
             rabbit_misc:protocol_error(
               not_found, "no binding ~s between ~s and ~s",
               [RoutingKey, rabbit_misc:rs(ExchangeName),
                rabbit_misc:rs(DestinationName)]);
+        {error, {binding_invalid, Fmt, Args}} ->
+            rabbit_misc:protocol_error(precondition_failed, Fmt, Args);
         {error, #amqp_error{} = Error} ->
             rabbit_misc:protocol_error(Error);
         ok -> return_ok(State, NoWait, ReturnMethod)
@@ -1204,39 +1300,40 @@ basic_return(#basic_message{exchange_name = ExchangeName,
            Content).
 
 reject(DeliveryTag, Requeue, Multiple,
-       State = #ch{unacked_message_q = UAMQ, tx_status = TxStatus}) ->
+       State = #ch{unacked_message_q = UAMQ, tx = Tx}) ->
     {Acked, Remaining} = collect_acks(UAMQ, DeliveryTag, Multiple),
     State1 = State#ch{unacked_message_q = Remaining},
-    {noreply,
-     case TxStatus of
-         none ->
-             reject(Requeue, Acked, State1#ch.limiter),
-             State1;
-         in_progress ->
-             State1#ch{uncommitted_nacks =
-                           [{Requeue, Acked} | State1#ch.uncommitted_nacks]}
-     end}.
-
+    {noreply, case Tx of
+                  none         -> reject(Requeue, Acked, State1#ch.limiter),
+                                  State1;
+                  {Msgs, Acks} -> Acks1 = ack_cons(Requeue, Acked, Acks),
+                                  State1#ch{tx = {Msgs, Acks1}}
+              end}.
+
+%% NB: Acked is in youngest-first order
 reject(Requeue, Acked, Limiter) ->
-    ok = fold_per_queue(
-           fun (QPid, MsgIds, ok) ->
-                   rabbit_amqqueue:reject(QPid, MsgIds, Requeue, self())
-           end, ok, Acked),
+    foreach_per_queue(
+      fun (QPid, MsgIds) ->
+              rabbit_amqqueue:reject(QPid, MsgIds, Requeue, self())
+      end, Acked),
     ok = notify_limiter(Limiter, Acked).
 
 record_sent(ConsumerTag, AckRequired,
-            Msg = {_QName, QPid, MsgId, Redelivered, _Message},
+            Msg = {QName, QPid, MsgId, Redelivered, _Message},
             State = #ch{unacked_message_q = UAMQ,
                         next_tag          = DeliveryTag,
                         trace_state       = TraceState}) ->
-    maybe_incr_stats([{QPid, 1}], case {ConsumerTag, AckRequired} of
-                                      {none,  true} -> get;
-                                      {none, false} -> get_no_ack;
-                                      {_   ,  true} -> deliver;
-                                      {_   , false} -> deliver_no_ack
-                                  end, State),
-    maybe_incr_redeliver_stats(Redelivered, QPid, State),
-    rabbit_trace:tap_trace_out(Msg, TraceState),
+    ?INCR_STATS([{queue_stats, QName, 1}], case {ConsumerTag, AckRequired} of
+                                               {none,  true} -> get;
+                                               {none, false} -> get_no_ack;
+                                               {_   ,  true} -> deliver;
+                                               {_   , false} -> deliver_no_ack
+                                           end, State),
+    case Redelivered of
+        true  -> ?INCR_STATS([{queue_stats, QName, 1}], redeliver, State);
+        false -> ok
+    end,
+    rabbit_trace:tap_out(Msg, TraceState),
     UAMQ1 = case AckRequired of
                 true  -> queue:in({DeliveryTag, ConsumerTag, {QPid, MsgId}},
                                   UAMQ);
@@ -1244,41 +1341,61 @@ record_sent(ConsumerTag, AckRequired,
             end,
     State#ch{unacked_message_q = UAMQ1, next_tag = DeliveryTag + 1}.
 
+%% NB: returns acks in youngest-first order
 collect_acks(Q, 0, true) ->
-    {queue:to_list(Q), queue:new()};
+    {lists:reverse(queue:to_list(Q)), queue:new()};
 collect_acks(Q, DeliveryTag, Multiple) ->
-    collect_acks([], queue:new(), Q, DeliveryTag, Multiple).
+    collect_acks([], [], Q, DeliveryTag, Multiple).
 
 collect_acks(ToAcc, PrefixAcc, Q, DeliveryTag, Multiple) ->
     case queue:out(Q) of
         {{value, UnackedMsg = {CurrentDeliveryTag, _ConsumerTag, _Msg}},
          QTail} ->
             if CurrentDeliveryTag == DeliveryTag ->
-                    {[UnackedMsg | ToAcc], queue:join(PrefixAcc, QTail)};
+                    {[UnackedMsg | ToAcc],
+                     case PrefixAcc of
+                         [] -> QTail;
+                         _  -> queue:join(
+                                 queue:from_list(lists:reverse(PrefixAcc)),
+                                 QTail)
+                     end};
                Multiple ->
                     collect_acks([UnackedMsg | ToAcc], PrefixAcc,
                                  QTail, DeliveryTag, Multiple);
                true ->
-                    collect_acks(ToAcc, queue:in(UnackedMsg, PrefixAcc),
+                    collect_acks(ToAcc, [UnackedMsg | PrefixAcc],
                                  QTail, DeliveryTag, Multiple)
             end;
         {empty, _} ->
-            rabbit_misc:protocol_error(
-              precondition_failed, "unknown delivery tag ~w", [DeliveryTag])
+            precondition_failed("unknown delivery tag ~w", [DeliveryTag])
     end.
 
-ack(Acked, State) ->
-    QIncs = fold_per_queue(
-              fun (QPid, MsgIds, L) ->
-                      ok = rabbit_amqqueue:ack(QPid, MsgIds, self()),
-                      [{QPid, length(MsgIds)} | L]
-              end, [], Acked),
-    ok = notify_limiter(State#ch.limiter, Acked),
-    maybe_incr_stats(QIncs, ack, State).
-
-new_tx(State) -> State#ch{uncommitted_message_q = queue:new(),
-                          uncommitted_acks      = [],
-                          uncommitted_nacks     = []}.
+%% NB: Acked is in youngest-first order
+ack(Acked, State = #ch{queue_names = QNames}) ->
+    foreach_per_queue(
+      fun (QPid, MsgIds) ->
+              ok = rabbit_amqqueue:ack(QPid, MsgIds, self()),
+              ?INCR_STATS(case dict:find(QPid, QNames) of
+                              {ok, QName} -> Count = length(MsgIds),
+                                             [{queue_stats, QName, Count}];
+                              error       -> []
+                          end, ack, State)
+      end, Acked),
+    ok = notify_limiter(State#ch.limiter, Acked).
+
+%% {Msgs, Acks}
+%%
+%% Msgs is a queue.
+%%
+%% Acks looks something like this:
+%% [{false,[5,4]},{true,[3]},{ack,[2,1]}, ...]
+%%
+%% Each element is a pair consisting of a tag and a list of
+%% ack'ed/reject'ed msg ids. The tag is one of 'ack' (to ack), 'true'
+%% (reject w requeue), 'false' (reject w/o requeue). The msg ids, as
+%% well as the list overall, are in "most-recent (generally youngest)
+%% ack first" order.
+new_tx() -> {queue:new(), []}.
 
 notify_queues(State = #ch{state = closing}) ->
     {ok, State};
@@ -1288,24 +1405,26 @@ notify_queues(State = #ch{consumer_mapping  = Consumers,
               sets:union(sets:from_list(consumer_queues(Consumers)), DQ)),
     {rabbit_amqqueue:notify_down_all(QPids, self()), State#ch{state = closing}}.
 
-fold_per_queue(_F, Acc, []) ->
-    Acc;
-fold_per_queue(F, Acc, [{_DTag, _CTag, {QPid, MsgId}}]) -> %% common case
-    F(QPid, [MsgId], Acc);
-fold_per_queue(F, Acc, UAL) ->
+foreach_per_queue(_F, []) ->
+    ok;
+foreach_per_queue(F, [{_DTag, _CTag, {QPid, MsgId}}]) -> %% common case
+    F(QPid, [MsgId]);
+%% NB: UAL should be in youngest-first order; the tree values will
+%% then be in oldest-first order
+foreach_per_queue(F, UAL) ->
     T = lists:foldl(fun ({_DTag, _CTag, {QPid, MsgId}}, T) ->
                             rabbit_misc:gb_trees_cons(QPid, MsgId, T)
                     end, gb_trees:empty(), UAL),
-    rabbit_misc:gb_trees_fold(F, Acc, T).
-
-enable_limiter(State = #ch{unacked_message_q = UAMQ,
-                           limiter           = Limiter}) ->
-    Limiter1 = rabbit_limiter:enable(Limiter, queue:len(UAMQ)),
-    ok = limit_queues(Limiter1, State),
-    Limiter1.
-
-limit_queues(Limiter, #ch{consumer_mapping = Consumers}) ->
-    rabbit_amqqueue:limit_all(consumer_queues(Consumers), self(), Limiter).
+    rabbit_misc:gb_trees_foreach(F, T).
+
+maybe_limit_queues(OldLimiter, NewLimiter, State) ->
+    case ((not rabbit_limiter:is_active(OldLimiter)) andalso
+          rabbit_limiter:is_active(NewLimiter)) of
+        true  -> Queues = consumer_queues(State#ch.consumer_mapping),
+                 rabbit_amqqueue:activate_limit_all(Queues, self());
+        false -> ok
+    end,
+    State.
 
 consumer_queues(Consumers) ->
     lists:usort([QPid ||
@@ -1316,77 +1435,110 @@ consumer_queues(Consumers) ->
 %% messages sent in a response to a basic.get (identified by their
 %% 'none' consumer tag)
 notify_limiter(Limiter, Acked) ->
-    case rabbit_limiter:is_enabled(Limiter) of
+    %% optimisation: avoid the potentially expensive 'foldl' in the
+    %% common case.
+     case rabbit_limiter:is_prefetch_limited(Limiter) of
         false -> ok;
         true  -> case lists:foldl(fun ({_, none, _}, Acc) -> Acc;
-                                      ({_, _, _}, Acc)    -> Acc + 1
+                                      ({_,    _, _}, Acc) -> Acc + 1
                                   end, 0, Acked) of
                      0     -> ok;
                      Count -> rabbit_limiter:ack(Limiter, Count)
                  end
     end.
 
+deliver_to_queues({#delivery{message    = #basic_message{exchange_name = XName},
+                             msg_seq_no = undefined,
+                             mandatory  = false},
+                   []}, State) -> %% optimisation
+    ?INCR_STATS([{exchange_stats, XName, 1}], publish, State),
+    State;
 deliver_to_queues({Delivery = #delivery{message    = Message = #basic_message{
                                                        exchange_name = XName},
                                         msg_seq_no = MsgSeqNo},
-                   QNames}, State) ->
-    {RoutingRes, DeliveredQPids} =
-        rabbit_amqqueue:deliver_flow(rabbit_amqqueue:lookup(QNames), Delivery),
-    State1 = State#ch{queue_monitors =
-                          pmon:monitor_all(DeliveredQPids,
-                                           State#ch.queue_monitors)},
-    State2 = process_routing_result(RoutingRes, DeliveredQPids,
-                                    XName, MsgSeqNo, Message, State1),
-    maybe_incr_stats([{XName, 1} |
-                      [{{QPid, XName}, 1} ||
-                          QPid <- DeliveredQPids]], publish, State2),
-    State2.
-
-process_routing_result(unroutable,    _, XName,  MsgSeqNo, Msg, State) ->
-    ok = basic_return(Msg, State, no_route),
-    maybe_incr_stats([{Msg#basic_message.exchange_name, 1}],
-                     return_unroutable, State),
-    record_confirm(MsgSeqNo, XName, State);
-process_routing_result(not_delivered, _, XName,  MsgSeqNo, Msg, State) ->
-    ok = basic_return(Msg, State, no_consumers),
-    maybe_incr_stats([{XName, 1}], return_not_delivered, State),
-    record_confirm(MsgSeqNo, XName, State);
-process_routing_result(routed,       [], XName,  MsgSeqNo,   _, State) ->
-    record_confirm(MsgSeqNo, XName, State);
-process_routing_result(routed,        _,     _, undefined,   _, State) ->
+                   DelQNames}, State = #ch{queue_names    = QNames,
+                                           queue_monitors = QMons}) ->
+    Qs = rabbit_amqqueue:lookup(DelQNames),
+    {RoutingRes, DeliveredQPids} = rabbit_amqqueue:deliver_flow(Qs, Delivery),
+    %% pmon:monitor_all/2 monitors all queues to which we delivered.
+    %% But we want to monitor even queues we didn't deliver to, since
+    %% we need their 'DOWN' messages to clean queue_names. So we also
+    %% need to monitor the QPid of each queue in Qs. But that only
+    %% gets the masters (which is fine for cleaning queue_names), so
+    %% we need the union of both.
+    %%
+    %% ...and we need to add even non-delivered queues to queue_names
+    %% since alternative algorithms to update queue_names less
+    %% frequently would in fact be more expensive in the common case.
+    {QNames1, QMons1} =
+        lists:foldl(fun (#amqqueue{pid = QPid, name = QName},
+                         {QNames0, QMons0}) ->
+                            {case dict:is_key(QPid, QNames0) of
+                                 true  -> QNames0;
+                                 false -> dict:store(QPid, QName, QNames0)
+                             end, pmon:monitor(QPid, QMons0)}
+                    end, {QNames, pmon:monitor_all(DeliveredQPids, QMons)}, Qs),
+    State1 = process_routing_result(RoutingRes, DeliveredQPids,
+                                    XName, MsgSeqNo, Message,
+                                    State#ch{queue_names    = QNames1,
+                                             queue_monitors = QMons1}),
+    ?INCR_STATS([{exchange_stats, XName, 1} |
+                 [{queue_exchange_stats, {QName, XName}, 1} ||
+                     QPid        <- DeliveredQPids,
+                     {ok, QName} <- [dict:find(QPid, QNames1)]]],
+                publish, State1),
+    State1.
+
+process_routing_result(routed,     _,     _, undefined,   _, State) ->
     State;
-process_routing_result(routed,    QPids, XName,  MsgSeqNo,   _, State) ->
+process_routing_result(routed,    [], XName,  MsgSeqNo,   _, State) ->
+    record_confirms([{MsgSeqNo, XName}], State);
+process_routing_result(routed, QPids, XName,  MsgSeqNo,   _, State) ->
     State#ch{unconfirmed = dtree:insert(MsgSeqNo, QPids, XName,
-                                        State#ch.unconfirmed)}.
+                                        State#ch.unconfirmed)};
+process_routing_result(unroutable, _, XName,  MsgSeqNo, Msg, State) ->
+    ok = basic_return(Msg, State, no_route),
+    ?INCR_STATS([{exchange_stats, XName, 1}], return_unroutable, State),
+    case MsgSeqNo of
+        undefined -> State;
+        _         -> record_confirms([{MsgSeqNo, XName}], State)
+    end.
 
 send_nacks([], State) ->
     State;
-send_nacks(MXs, State = #ch{tx_status = none}) ->
+send_nacks(_MXs, State = #ch{state = closing,
+                             tx    = none}) -> %% optimisation
+    State;
+send_nacks(MXs, State = #ch{tx = none}) ->
     coalesce_and_send([MsgSeqNo || {MsgSeqNo, _} <- MXs],
                       fun(MsgSeqNo, Multiple) ->
                               #'basic.nack'{delivery_tag = MsgSeqNo,
                                             multiple     = Multiple}
                       end, State);
+send_nacks(_MXs, State = #ch{state = closing}) -> %% optimisation
+    State#ch{tx = failed};
 send_nacks(_, State) ->
-    maybe_complete_tx(State#ch{tx_status = failed}).
+    maybe_complete_tx(State#ch{tx = failed}).
 
-send_confirms(State = #ch{tx_status = none, confirmed = []}) ->
+send_confirms(State = #ch{tx = none, confirmed = []}) ->
     State;
-send_confirms(State = #ch{tx_status = none, confirmed = C}) ->
+send_confirms(State = #ch{tx = none, confirmed = C}) ->
     MsgSeqNos =
-        lists:foldl(fun ({MsgSeqNo, XName}, MSNs) ->
-                            maybe_incr_stats([{XName, 1}], confirm, State),
-                            [MsgSeqNo | MSNs]
-                    end, [], lists:append(C)),
+        lists:foldl(
+          fun ({MsgSeqNo, XName}, MSNs) ->
+                  ?INCR_STATS([{exchange_stats, XName, 1}], confirm, State),
+                  [MsgSeqNo | MSNs]
+          end, [], lists:append(C)),
     send_confirms(MsgSeqNos, State#ch{confirmed = []});
 send_confirms(State) ->
     maybe_complete_tx(State).
 
 send_confirms([], State) ->
     State;
-send_confirms([MsgSeqNo], State = #ch{writer_pid = WriterPid}) ->
-    ok = rabbit_writer:send_command(WriterPid,
-                                    #'basic.ack'{delivery_tag = MsgSeqNo}),
+send_confirms(_Cs, State = #ch{state = closing}) -> %% optimisation
+    State;
+send_confirms([MsgSeqNo], State) ->
+    ok = send(#'basic.ack'{delivery_tag = MsgSeqNo}, State),
     State;
 send_confirms(Cs, State) ->
     coalesce_and_send(Cs, fun(MsgSeqNo, Multiple) ->
@@ -1394,8 +1546,7 @@ send_confirms(Cs, State) ->
                                                multiple     = Multiple}
                           end, State).
 
-coalesce_and_send(MsgSeqNos, MkMsgFun,
-                  State = #ch{writer_pid = WriterPid, unconfirmed = UC}) ->
+coalesce_and_send(MsgSeqNos, MkMsgFun, State = #ch{unconfirmed = UC}) ->
     SMsgSeqNos = lists:usort(MsgSeqNos),
     CutOff = case dtree:is_empty(UC) of
                  true  -> lists:last(SMsgSeqNos) + 1;
@@ -1404,14 +1555,17 @@ coalesce_and_send(MsgSeqNos, MkMsgFun,
     {Ms, Ss} = lists:splitwith(fun(X) -> X < CutOff end, SMsgSeqNos),
     case Ms of
         [] -> ok;
-        _  -> ok = rabbit_writer:send_command(
-                     WriterPid, MkMsgFun(lists:last(Ms), true))
+        _  -> ok = send(MkMsgFun(lists:last(Ms), true), State)
     end,
-    [ok = rabbit_writer:send_command(
-            WriterPid, MkMsgFun(SeqNo, false)) || SeqNo <- Ss],
+    [ok = send(MkMsgFun(SeqNo, false), State) || SeqNo <- Ss],
     State.
 
-maybe_complete_tx(State = #ch{tx_status = in_progress}) ->
+ack_cons(Tag, Acked, [{Tag, Acks} | L]) -> [{Tag, Acked ++ Acks} | L];
+ack_cons(Tag, Acked, Acks)              -> [{Tag, Acked} | Acks].
+
+ack_len(Acks) -> lists:sum([length(L) || {ack, L} <- Acks]).
+
+maybe_complete_tx(State = #ch{tx = {_, _}}) ->
     State;
 maybe_complete_tx(State = #ch{unconfirmed = UC}) ->
     case dtree:is_empty(UC) of
@@ -1419,16 +1573,16 @@ maybe_complete_tx(State = #ch{unconfirmed = UC}) ->
         true  -> complete_tx(State#ch{confirmed = []})
     end.
 
-complete_tx(State = #ch{tx_status = committing}) ->
-    ok = rabbit_writer:send_command(State#ch.writer_pid, #'tx.commit_ok'{}),
-    State#ch{tx_status = in_progress};
-complete_tx(State = #ch{tx_status = failed}) ->
-    {noreply, State1} = send_exception(
+complete_tx(State = #ch{tx = committing}) ->
+    ok = send(#'tx.commit_ok'{}, State),
+    State#ch{tx = new_tx()};
+complete_tx(State = #ch{tx = failed}) ->
+    {noreply, State1} = handle_exception(
                           rabbit_misc:amqp_error(
                             precondition_failed, "partial tx completion", [],
                             'tx.commit'),
                           State),
-    State1#ch{tx_status = in_progress}.
+    State1#ch{tx = new_tx()}.
 
 infos(Items, State) -> [{Item, i(Item, State)} || Item <- Items].
 
@@ -1437,21 +1591,18 @@ i(connection,     #ch{conn_pid         = ConnPid}) -> ConnPid;
 i(number,         #ch{channel          = Channel}) -> Channel;
 i(user,           #ch{user             = User})    -> User#user.username;
 i(vhost,          #ch{virtual_host     = VHost})   -> VHost;
-i(transactional,  #ch{tx_status        = TE})      -> TE =/= none;
+i(transactional,  #ch{tx               = Tx})      -> Tx =/= none;
 i(confirm,        #ch{confirm_enabled  = CE})      -> CE;
 i(name,           State)                           -> name(State);
-i(consumer_count, #ch{consumer_mapping = ConsumerMapping}) ->
-    dict:size(ConsumerMapping);
-i(messages_unconfirmed, #ch{unconfirmed = UC}) ->
-    dtree:size(UC);
-i(messages_unacknowledged, #ch{unacked_message_q = UAMQ}) ->
-    queue:len(UAMQ);
-i(messages_uncommitted, #ch{uncommitted_message_q = TMQ}) ->
-    queue:len(TMQ);
-i(acks_uncommitted, #ch{uncommitted_acks = TAL}) ->
-    length(TAL);
+i(consumer_count,          #ch{consumer_mapping = CM})    -> dict:size(CM);
+i(messages_unconfirmed,    #ch{unconfirmed = UC})         -> dtree:size(UC);
+i(messages_unacknowledged, #ch{unacked_message_q = UAMQ}) -> queue:len(UAMQ);
+i(messages_uncommitted,    #ch{tx = {Msgs, _Acks}})       -> queue:len(Msgs);
+i(messages_uncommitted,    #ch{})                         -> 0;
+i(acks_uncommitted,        #ch{tx = {_Msgs, Acks}})       -> ack_len(Acks);
+i(acks_uncommitted,        #ch{})                         -> 0;
 i(prefetch_count, #ch{limiter = Limiter}) ->
-    rabbit_limiter:get_limit(Limiter);
+    rabbit_limiter:get_prefetch_limit(Limiter);
 i(client_flow_blocked, #ch{limiter = Limiter}) ->
     rabbit_limiter:is_blocked(Limiter);
 i(Item, _) ->
@@ -1460,26 +1611,11 @@ i(Item, _) ->
 name(#ch{conn_name = ConnName, channel = Channel}) ->
     list_to_binary(rabbit_misc:format("~s (~p)", [ConnName, Channel])).
 
-maybe_incr_redeliver_stats(true, QPid, State) ->
-    maybe_incr_stats([{QPid, 1}], redeliver, State);
-maybe_incr_redeliver_stats(_, _, _State) ->
-    ok.
-
-maybe_incr_stats(QXIncs, Measure, State) ->
-    case rabbit_event:stats_level(State, #ch.stats_timer) of
-        fine -> [incr_stats(QX, Inc, Measure) || {QX, Inc} <- QXIncs];
-        _    -> ok
-    end.
-
-incr_stats({_, _} = QX, Inc, Measure) ->
-    update_measures(queue_exchange_stats, QX, Inc, Measure);
-incr_stats(QPid, Inc, Measure) when is_pid(QPid) ->
-    update_measures(queue_stats, QPid, Inc, Measure);
-incr_stats(X, Inc, Measure) ->
-    update_measures(exchange_stats, X, Inc, Measure).
+incr_stats(Incs, Measure) ->
+    [update_measures(Type, Key, Inc, Measure) || {Type, Key, Inc} <- Incs].
 
-update_measures(Type, QX, Inc, Measure) ->
-    Measures = case get({Type, QX}) of
+update_measures(Type, Key, Inc, Measure) ->
+    Measures = case get({Type, Key}) of
                    undefined -> [];
                    D         -> D
                end,
@@ -1487,31 +1623,29 @@ update_measures(Type, QX, Inc, Measure) ->
               error   -> 0;
               {ok, C} -> C
           end,
-    put({Type, QX},
-        orddict:store(Measure, Cur + Inc, Measures)).
+    put({Type, Key}, orddict:store(Measure, Cur + Inc, Measures)).
 
 emit_stats(State) ->
     emit_stats(State, []).
 
 emit_stats(State, Extra) ->
-    CoarseStats = infos(?STATISTICS_KEYS, State),
+    Coarse = infos(?STATISTICS_KEYS, State),
     case rabbit_event:stats_level(State, #ch.stats_timer) of
-        coarse ->
-            rabbit_event:notify(channel_stats, Extra ++ CoarseStats);
-        fine ->
-            FineStats =
-                [{channel_queue_stats,
-                  [{QPid, Stats} || {{queue_stats, QPid}, Stats} <- get()]},
-                 {channel_exchange_stats,
-                  [{X, Stats} || {{exchange_stats, X}, Stats} <- get()]},
-                 {channel_queue_exchange_stats,
-                  [{QX, Stats} ||
-                      {{queue_exchange_stats, QX}, Stats} <- get()]}],
-            rabbit_event:notify(channel_stats,
-                                Extra ++ CoarseStats ++ FineStats)
+        coarse -> rabbit_event:notify(channel_stats, Extra ++ Coarse);
+        fine   -> Fine = [{channel_queue_stats,
+                           [{QName, Stats} ||
+                               {{queue_stats,       QName}, Stats} <- get()]},
+                          {channel_exchange_stats,
+                           [{XName, Stats} ||
+                               {{exchange_stats,    XName}, Stats} <- get()]},
+                          {channel_queue_exchange_stats,
+                           [{QX, Stats} ||
+                               {{queue_exchange_stats, QX}, Stats} <- get()]}],
+                  rabbit_event:notify(channel_stats, Extra ++ Coarse ++ Fine)
     end.
 
-erase_queue_stats(QPid) ->
-    erase({queue_stats, QPid}),
+erase_queue_stats(QName) ->
+    erase({queue_stats, QName}),
     [erase({queue_exchange_stats, QX}) ||
-        {{queue_exchange_stats, QX = {QPid0, _}}, _} <- get(), QPid =:= QPid0].
+        {{queue_exchange_stats, QX = {QName0, _}}, _} <- get(),
+        QName0 =:= QName].
diff --git a/src/rabbit_channel_sup.erl b/src/rabbit_channel_sup.erl
index bcb83851..df2e80ca 100644
--- a/src/rabbit_channel_sup.erl
+++ b/src/rabbit_channel_sup.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_channel_sup).
@@ -58,7 +58,7 @@ start_link({tcp, Sock, Channel, FrameMax, ReaderPid, ConnName, Protocol, User,
           {channel, {rabbit_channel, start_link,
                      [Channel, ReaderPid, WriterPid, ReaderPid, ConnName,
                       Protocol, User, VHost, Capabilities, Collector,
-                      rabbit_limiter:make_token(LimiterPid)]},
+                      LimiterPid]},
            intrinsic, ?MAX_WAIT, worker, [rabbit_channel]}),
     {ok, AState} = rabbit_command_assembler:init(Protocol),
     {ok, SupPid, {ChannelPid, AState}};
@@ -72,7 +72,7 @@ start_link({direct, Channel, ClientChannelPid, ConnPid, ConnName, Protocol,
           {channel, {rabbit_channel, start_link,
                      [Channel, ClientChannelPid, ClientChannelPid, ConnPid,
                       ConnName, Protocol, User, VHost, Capabilities, Collector,
-                      rabbit_limiter:make_token(LimiterPid)]},
+                      LimiterPid]},
            intrinsic, ?MAX_WAIT, worker, [rabbit_channel]}),
     {ok, SupPid, {ChannelPid, none}}.
 
@@ -83,7 +83,7 @@ init(Type) ->
 
 child_specs({tcp, Sock, Channel, FrameMax, ReaderPid, Protocol}) ->
     [{writer, {rabbit_writer, start_link,
-               [Sock, Channel, FrameMax, Protocol, ReaderPid]},
+               [Sock, Channel, FrameMax, Protocol, ReaderPid, true]},
       intrinsic, ?MAX_WAIT, worker, [rabbit_writer]} | child_specs(direct)];
 child_specs(direct) ->
     [{limiter, {rabbit_limiter, start_link, []},
diff --git a/src/rabbit_channel_sup_sup.erl b/src/rabbit_channel_sup_sup.erl
index 995c41fb..e2c255db 100644
--- a/src/rabbit_channel_sup_sup.erl
+++ b/src/rabbit_channel_sup_sup.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_channel_sup_sup).
@@ -43,6 +43,6 @@ start_channel(Pid, Args) ->
 %%----------------------------------------------------------------------------
 
 init([]) ->
-    {ok, {{simple_one_for_one_terminate, 0, 1},
+    {ok, {{simple_one_for_one, 0, 1},
           [{channel_sup, {rabbit_channel_sup, start_link, []},
             temporary, infinity, supervisor, [rabbit_channel_sup]}]}}.
diff --git a/src/rabbit_client_sup.erl b/src/rabbit_client_sup.erl
index c508f1b9..843bb615 100644
--- a/src/rabbit_client_sup.erl
+++ b/src/rabbit_client_sup.erl
@@ -10,15 +10,15 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_client_sup).
 
 -behaviour(supervisor2).
 
--export([start_link/1, start_link/2]).
+-export([start_link/1, start_link/2, start_link_worker/2]).
 
 -export([init/1]).
 
@@ -32,6 +32,8 @@
                            rabbit_types:ok_pid_or_error()).
 -spec(start_link/2 :: ({'local', atom()}, rabbit_types:mfargs()) ->
                            rabbit_types:ok_pid_or_error()).
+-spec(start_link_worker/2 :: ({'local', atom()}, rabbit_types:mfargs()) ->
+                                  rabbit_types:ok_pid_or_error()).
 
 -endif.
 
@@ -43,6 +45,13 @@ start_link(Callback) ->
 start_link(SupName, Callback) ->
     supervisor2:start_link(SupName, ?MODULE, Callback).
 
+start_link_worker(SupName, Callback) ->
+    supervisor2:start_link(SupName, ?MODULE, {Callback, worker}).
+
 init({M,F,A}) ->
-    {ok, {{simple_one_for_one_terminate, 0, 1},
-          [{client, {M,F,A}, temporary, infinity, supervisor, [M]}]}}.
+    {ok, {{simple_one_for_one, 0, 1},
+          [{client, {M,F,A}, temporary, infinity, supervisor, [M]}]}};
+init({{M,F,A}, worker}) ->
+    {ok, {{simple_one_for_one, 0, 1},
+          [{client, {M,F,A}, temporary, ?MAX_WAIT, worker, [M]}]}}.
+
diff --git a/src/rabbit_command_assembler.erl b/src/rabbit_command_assembler.erl
index adf6e417..4095ccf1 100644
--- a/src/rabbit_command_assembler.erl
+++ b/src/rabbit_command_assembler.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_command_assembler).
diff --git a/src/rabbit_connection_sup.erl b/src/rabbit_connection_sup.erl
index 12a532b6..fee377e7 100644
--- a/src/rabbit_connection_sup.erl
+++ b/src/rabbit_connection_sup.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_connection_sup).
@@ -42,16 +42,20 @@ start_link() ->
           SupPid,
           {collector, {rabbit_queue_collector, start_link, []},
            intrinsic, ?MAX_WAIT, worker, [rabbit_queue_collector]}),
-    {ok, ChannelSupSupPid} =
+    %% We need to get channels in the hierarchy here so they close
+    %% before the reader. But for 1.0 readers we can't start the real
+    %% ch_sup_sup (because we don't know if we will be 0-9-1 or 1.0) -
+    %% so we add another supervisor into the hierarchy.
+    {ok, ChannelSup3Pid} =
         supervisor2:start_child(
           SupPid,
-          {channel_sup_sup, {rabbit_channel_sup_sup, start_link, []},
-           intrinsic, infinity, supervisor, [rabbit_channel_sup_sup]}),
+          {channel_sup3, {rabbit_intermediate_sup, start_link, []},
+           intrinsic, infinity, supervisor, [rabbit_intermediate_sup]}),
     {ok, ReaderPid} =
         supervisor2:start_child(
           SupPid,
           {reader, {rabbit_reader, start_link,
-                    [ChannelSupSupPid, Collector,
+                    [ChannelSup3Pid, Collector,
                      rabbit_heartbeat:start_heartbeat_fun(SupPid)]},
            intrinsic, ?MAX_WAIT, worker, [rabbit_reader]}),
     {ok, SupPid, ReaderPid}.
diff --git a/src/rabbit_control.erl b/src/rabbit_control_main.erl
index 0c3ac966..6f36f99d 100644
--- a/src/rabbit_control.erl
+++ b/src/rabbit_control_main.erl
@@ -10,14 +10,15 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
--module(rabbit_control).
+-module(rabbit_control_main).
 -include("rabbit.hrl").
 
--export([start/0, stop/0, action/5]).
+-export([start/0, stop/0, parse_arguments/2, action/5,
+         sync_queue/1, cancel_sync_queue/1]).
 
 -define(RPC_TIMEOUT, infinity).
 -define(EXTERNAL_CHECK_INTERVAL, 1000).
@@ -25,10 +26,18 @@
 -define(QUIET_OPT, "-q").
 -define(NODE_OPT, "-n").
 -define(VHOST_OPT, "-p").
+-define(PRIORITY_OPT, "--priority").
+-define(APPLY_TO_OPT, "--apply-to").
+-define(RAM_OPT, "--ram").
+-define(OFFLINE_OPT, "--offline").
 
 -define(QUIET_DEF, {?QUIET_OPT, flag}).
 -define(NODE_DEF(Node), {?NODE_OPT, {option, Node}}).
 -define(VHOST_DEF, {?VHOST_OPT, {option, "/"}}).
+-define(PRIORITY_DEF, {?PRIORITY_OPT, {option, "0"}}).
+-define(APPLY_TO_DEF, {?APPLY_TO_OPT, {option, "all"}}).
+-define(RAM_DEF, {?RAM_OPT, flag}).
+-define(OFFLINE_DEF, {?OFFLINE_OPT, flag}).
 
 -define(GLOBAL_DEFS(Node), [?QUIET_DEF, ?NODE_DEF(Node)]).
 
@@ -41,9 +50,13 @@
          force_reset,
          rotate_logs,
 
-         cluster,
-         force_cluster,
+         {join_cluster, [?RAM_DEF]},
+         change_cluster_node_type,
+         update_cluster_nodes,
+         {forget_cluster_node, [?OFFLINE_DEF]},
          cluster_status,
+         {sync_queue, [?VHOST_DEF]},
+         {cancel_sync_queue, [?VHOST_DEF]},
 
          add_user,
          delete_user,
@@ -60,9 +73,13 @@
          {list_permissions, [?VHOST_DEF]},
          list_user_permissions,
 
-         set_parameter,
-         clear_parameter,
-         list_parameters,
+         {set_parameter, [?VHOST_DEF]},
+         {clear_parameter, [?VHOST_DEF]},
+         {list_parameters, [?VHOST_DEF]},
+
+         {set_policy, [?VHOST_DEF, ?PRIORITY_DEF, ?APPLY_TO_DEF]},
+         {clear_policy, [?VHOST_DEF]},
+         {list_policies, [?VHOST_DEF]},
 
          {list_queues, [?VHOST_DEF]},
          {list_exchanges, [?VHOST_DEF]},
@@ -92,7 +109,9 @@
          {"Bindings",  rabbit_binding,  info_all, info_keys},
          {"Consumers", rabbit_amqqueue, consumers_all, consumer_info_keys},
          {"Permissions", rabbit_auth_backend_internal, list_vhost_permissions,
-          vhost_perms_info_keys}]).
+          vhost_perms_info_keys},
+         {"Policies",   rabbit_policy,             list_formatted, info_keys},
+         {"Parameters", rabbit_runtime_parameters, list_formatted, info_keys}]).
 
 %%----------------------------------------------------------------------------
 
@@ -113,19 +132,13 @@
 start() ->
     {ok, [[NodeStr|_]|_]} = init:get_argument(nodename),
     {Command, Opts, Args} =
-        case rabbit_misc:parse_arguments(?COMMANDS, ?GLOBAL_DEFS(NodeStr),
-                                         init:get_plain_arguments())
-        of
+        case parse_arguments(init:get_plain_arguments(), NodeStr) of
             {ok, Res}  -> Res;
             no_command -> print_error("could not recognise command", []),
                           usage()
         end,
-    Opts1 = [case K of
-                 ?NODE_OPT -> {?NODE_OPT, rabbit_nodes:make(V)};
-                 _         -> {K, V}
-             end || {K, V} <- Opts],
-    Quiet = proplists:get_bool(?QUIET_OPT, Opts1),
-    Node = proplists:get_value(?NODE_OPT, Opts1),
+    Quiet = proplists:get_bool(?QUIET_OPT, Opts),
+    Node = proplists:get_value(?NODE_OPT, Opts),
     Inform = case Quiet of
                  true  -> fun (_Format, _Args1) -> ok end;
                  false -> fun (Format, Args1) ->
@@ -147,6 +160,12 @@ start() ->
                 false -> io:format("...done.~n")
             end,
             rabbit_misc:quit(0);
+        {ok, Info} ->
+            case Quiet of
+                true  -> ok;
+                false -> io:format("...done (~p).~n", [Info])
+            end,
+            rabbit_misc:quit(0);
         {'EXIT', {function_clause, [{?MODULE, action, _}    | _]}} -> %% < R15
             PrintInvalidCommandError(),
             usage();
@@ -156,6 +175,11 @@ start() ->
         {'EXIT', {badarg, _}} ->
             print_error("invalid parameter: ~p", [Args]),
             usage();
+        {error, {Problem, Reason}} when is_atom(Problem), is_binary(Reason) ->
+            %% We handle this common case specially to avoid ~p since
+            %% that has i18n issues
+            print_error("~s: ~s", [Problem, Reason]),
+            rabbit_misc:quit(2);
         {error, Reason} ->
             print_error("~p", [Reason]),
             rabbit_misc:quit(2);
@@ -185,11 +209,11 @@ print_report(Node, {Descr, Module, InfoFun, KeysFun}, VHostArg) ->
     print_report0(Node, {Module, InfoFun, KeysFun}, VHostArg).
 
 print_report0(Node, {Module, InfoFun, KeysFun}, VHostArg) ->
-    case Results = rpc_call(Node, Module, InfoFun, VHostArg) of
-        [_|_] -> InfoItems = rpc_call(Node, Module, KeysFun, []),
-                 display_row([atom_to_list(I) || I <- InfoItems]),
-                 display_info_list(Results, InfoItems);
-        _     -> ok
+    case rpc_call(Node, Module, InfoFun, VHostArg) of
+        [_|_] = Results -> InfoItems = rpc_call(Node, Module, KeysFun, []),
+                           display_row([atom_to_list(I) || I <- InfoItems]),
+                           display_info_list(Results, InfoItems);
+        _               -> ok
     end,
     io:nl().
 
@@ -205,6 +229,19 @@ usage() ->
     io:format("~s", [rabbit_ctl_usage:usage()]),
     rabbit_misc:quit(1).
 
+%% Parse CmdLine, passing the ?NODE_OPT value through rabbit_nodes:make/1.
+parse_arguments(CmdLine, NodeStr) ->
+    Parsed = rabbit_misc:parse_arguments(
+               ?COMMANDS, ?GLOBAL_DEFS(NodeStr), CmdLine),
+    MkNode = fun (?NODE_OPT, Str) -> rabbit_nodes:make(Str);
+                 (_Other,    Val) -> Val
+             end,
+    case Parsed of
+        {ok, {Command, RawOpts, Args}} ->
+            {ok, {Command, [{K, MkNode(K, V)} || {K, V} <- RawOpts], Args}};
+        Other -> Other
+    end.
+
 %%----------------------------------------------------------------------------
 
 action(stop, Node, Args, _Opts, Inform) ->
@@ -234,21 +271,50 @@ action(force_reset, Node, [], _Opts, Inform) ->
     Inform("Forcefully resetting node ~p", [Node]),
     call(Node, {rabbit_mnesia, force_reset, []});
 
-action(cluster, Node, ClusterNodeSs, _Opts, Inform) ->
-    ClusterNodes = lists:map(fun list_to_atom/1, ClusterNodeSs),
-    Inform("Clustering node ~p with ~p",
-           [Node, ClusterNodes]),
-    rpc_call(Node, rabbit_mnesia, cluster, [ClusterNodes]);
+action(join_cluster, Node, [ClusterNodeS], Opts, Inform) ->
+    ClusterNode = list_to_atom(ClusterNodeS),
+    NodeType = case proplists:get_bool(?RAM_OPT, Opts) of
+                   true  -> ram;
+                   false -> disc
+               end,
+    Inform("Clustering node ~p with ~p", [Node, ClusterNode]),
+    rpc_call(Node, rabbit_mnesia, join_cluster, [ClusterNode, NodeType]);
+
+action(change_cluster_node_type, Node, ["ram"], _Opts, Inform) ->
+    Inform("Turning ~p into a ram node", [Node]),
+    rpc_call(Node, rabbit_mnesia, change_cluster_node_type, [ram]);
+action(change_cluster_node_type, Node, [Type], _Opts, Inform)
+  when Type =:= "disc" orelse Type =:= "disk" ->
+    Inform("Turning ~p into a disc node", [Node]),
+    rpc_call(Node, rabbit_mnesia, change_cluster_node_type, [disc]);
+
+action(update_cluster_nodes, Node, [ClusterNodeS], _Opts, Inform) ->
+    ClusterNode = list_to_atom(ClusterNodeS),
+    Inform("Updating cluster nodes for ~p from ~p", [Node, ClusterNode]),
+    rpc_call(Node, rabbit_mnesia, update_cluster_nodes, [ClusterNode]);
+
+action(forget_cluster_node, Node, [ClusterNodeS], Opts, Inform) ->
+    ClusterNode = list_to_atom(ClusterNodeS),
+    RemoveWhenOffline = proplists:get_bool(?OFFLINE_OPT, Opts),
+    Inform("Removing node ~p from cluster", [ClusterNode]),
+    rpc_call(Node, rabbit_mnesia, forget_cluster_node,
+             [ClusterNode, RemoveWhenOffline]);
+
+action(sync_queue, Node, [Q], Opts, Inform) ->
+    VHost = proplists:get_value(?VHOST_OPT, Opts),
+    QName = rabbit_misc:r(list_to_binary(VHost), queue, list_to_binary(Q)),
+    Inform("Synchronising ~s", [rabbit_misc:rs(QName)]),
+    rpc_call(Node, rabbit_control_main, sync_queue, [QName]);
 
-action(force_cluster, Node, ClusterNodeSs, _Opts, Inform) ->
-    ClusterNodes = lists:map(fun list_to_atom/1, ClusterNodeSs),
-    Inform("Forcefully clustering node ~p with ~p (ignoring offline nodes)",
-           [Node, ClusterNodes]),
-    rpc_call(Node, rabbit_mnesia, force_cluster, [ClusterNodes]);
+action(cancel_sync_queue, Node, [Q], Opts, Inform) ->
+    VHost = proplists:get_value(?VHOST_OPT, Opts),
+    QName = rabbit_misc:r(list_to_binary(VHost), queue, list_to_binary(Q)),
+    Inform("Stopping synchronising ~s", [rabbit_misc:rs(QName)]),
+    rpc_call(Node, rabbit_control_main, cancel_sync_queue, [QName]);
 
 action(wait, Node, [PidFile], _Opts, Inform) ->
     Inform("Waiting for ~p", [Node]),
-    wait_for_application(Node, PidFile, rabbit, Inform);
+    wait_for_application(Node, PidFile, rabbit_and_plugins, Inform);
 action(wait, Node, [PidFile, App], _Opts, Inform) ->
     Inform("Waiting for ~p on ~p", [App, Node]),
     wait_for_application(Node, PidFile, list_to_atom(App), Inform);
@@ -352,7 +418,7 @@ action(list_bindings, Node, Args, Opts, Inform) ->
 
 action(list_connections, Node, Args, _Opts, Inform) ->
     Inform("Listing connections", []),
-    ArgAtoms = default_if_empty(Args, [user, peer_address, peer_port, state]),
+    ArgAtoms = default_if_empty(Args, [user, peer_host, peer_port, state]),
     display_info_list(rpc_call(Node, rabbit_networking, connection_info_all,
                                [ArgAtoms]),
                       ArgAtoms);
@@ -409,50 +475,85 @@ action(list_permissions, Node, [], Opts, Inform) ->
                              list_vhost_permissions, [VHost]}),
                       rabbit_auth_backend_internal:vhost_perms_info_keys());
 
-action(set_parameter, Node, [Component, Key, Value], _Opts, Inform) ->
+action(set_parameter, Node, [Component, Key, Value], Opts, Inform) ->
+    VHostArg = list_to_binary(proplists:get_value(?VHOST_OPT, Opts)),
     Inform("Setting runtime parameter ~p for component ~p to ~p",
            [Key, Component, Value]),
     rpc_call(Node, rabbit_runtime_parameters, parse_set,
-             [list_to_binary(Component), list_to_binary(Key), Value]);
+             [VHostArg, list_to_binary(Component), list_to_binary(Key), Value]);
 
-action(clear_parameter, Node, [Component, Key], _Opts, Inform) ->
+action(clear_parameter, Node, [Component, Key], Opts, Inform) ->
+    VHostArg = list_to_binary(proplists:get_value(?VHOST_OPT, Opts)),
     Inform("Clearing runtime parameter ~p for component ~p", [Key, Component]),
-    rpc_call(Node, rabbit_runtime_parameters, clear, [list_to_binary(Component),
+    rpc_call(Node, rabbit_runtime_parameters, clear, [VHostArg,
+                                                      list_to_binary(Component),
                                                       list_to_binary(Key)]);
 
-action(list_parameters, Node, Args = [], _Opts, Inform) ->
+action(list_parameters, Node, [], Opts, Inform) ->
+    VHostArg = list_to_binary(proplists:get_value(?VHOST_OPT, Opts)),
     Inform("Listing runtime parameters", []),
     display_info_list(
-      rpc_call(Node, rabbit_runtime_parameters, list_formatted, Args),
+      rpc_call(Node, rabbit_runtime_parameters, list_formatted, [VHostArg]),
       rabbit_runtime_parameters:info_keys());
 
+action(set_policy, Node, [Key, Pattern, Defn], Opts, Inform) ->
+    Msg = "Setting policy ~p for pattern ~p to ~p with priority ~p",
+    VHostArg = list_to_binary(proplists:get_value(?VHOST_OPT, Opts)),
+    PriorityArg = proplists:get_value(?PRIORITY_OPT, Opts),
+    ApplyToArg = list_to_binary(proplists:get_value(?APPLY_TO_OPT, Opts)),
+    Inform(Msg, [Key, Pattern, Defn, PriorityArg]),
+    rpc_call(
+      Node, rabbit_policy, parse_set,
+      [VHostArg, list_to_binary(Key), Pattern, Defn, PriorityArg, ApplyToArg]);
+
+action(clear_policy, Node, [Key], Opts, Inform) ->
+    VHostArg = list_to_binary(proplists:get_value(?VHOST_OPT, Opts)),
+    Inform("Clearing policy ~p", [Key]),
+    rpc_call(Node, rabbit_policy, delete, [VHostArg, list_to_binary(Key)]);
+
+action(list_policies, Node, [], Opts, Inform) ->
+    VHostArg = list_to_binary(proplists:get_value(?VHOST_OPT, Opts)),
+    Inform("Listing policies", []),
+    display_info_list(rpc_call(Node, rabbit_policy, list_formatted, [VHostArg]),
+                      rabbit_policy:info_keys());
+
 action(report, Node, _Args, _Opts, Inform) ->
-    io:format("Reporting server status on ~p~n~n", [erlang:universaltime()]),
+    Inform("Reporting server status on ~p~n~n", [erlang:universaltime()]),
     [begin ok = action(Action, N, [], [], Inform), io:nl() end ||
-        N      <- unsafe_rpc(Node, rabbit_mnesia, running_clustered_nodes, []),
+        N      <- unsafe_rpc(Node, rabbit_mnesia, cluster_nodes, [running]),
         Action <- [status, cluster_status, environment]],
     VHosts = unsafe_rpc(Node, rabbit_vhost, list, []),
     [print_report(Node, Q)      || Q <- ?GLOBAL_QUERIES],
     [print_report(Node, Q, [V]) || Q <- ?VHOST_QUERIES, V <- VHosts],
-    io:format("End of server status report~n"),
     ok;
 
 action(eval, Node, [Expr], _Opts, _Inform) ->
     case erl_scan:string(Expr) of
         {ok, Scanned, _} ->
             case erl_parse:parse_exprs(Scanned) of
-                {ok, Parsed} ->
-                    {value, Value, _} = unsafe_rpc(
-                                          Node, erl_eval, exprs, [Parsed, []]),
-                    io:format("~p~n", [Value]),
-                    ok;
-                {error, E} ->
-                    {error_string, format_parse_error(E)}
+                {ok, Parsed} -> {value, Value, _} =
+                                    unsafe_rpc(
+                                      Node, erl_eval, exprs, [Parsed, []]),
+                                io:format("~p~n", [Value]),
+                                ok;
+                {error, E}   -> {error_string, format_parse_error(E)}
             end;
         {error, E, _} ->
             {error_string, format_parse_error(E)}
     end.
 
+format_parse_error({_Line, Mod, Err}) -> lists:flatten(Mod:format_error(Err)).
+
+sync_queue(QName) ->
+    Sync = fun (#amqqueue{pid = Pid}) -> rabbit_amqqueue:sync_mirrors(Pid) end,
+    rabbit_amqqueue:with(QName, Sync).
+
+cancel_sync_queue(QName) ->
+    Cancel = fun (#amqqueue{pid = Pid}) ->
+                     rabbit_amqqueue:cancel_sync_mirrors(Pid)
+             end,
+    rabbit_amqqueue:with(QName, Cancel).
+
 %%----------------------------------------------------------------------------
 
 wait_for_application(Node, PidFile, Application, Inform) ->
@@ -460,12 +561,22 @@ wait_for_application(Node, PidFile, Application, Inform) ->
     Inform("pid is ~s", [Pid]),
     wait_for_application(Node, Pid, Application).
 
+wait_for_application(Node, Pid, rabbit_and_plugins) ->
+    wait_for_startup(Node, Pid);
 wait_for_application(Node, Pid, Application) ->
+    while_process_is_alive(
+      Node, Pid, fun() -> rabbit_nodes:is_running(Node, Application) end).
+
+wait_for_startup(Node, Pid) ->
+    Started = fun () -> ok =:= rpc:call(Node, rabbit, await_startup, []) end,
+    while_process_is_alive(Node, Pid, Started).
+
+while_process_is_alive(Node, Pid, Activity) ->
     case process_up(Pid) of
-        true  -> case rabbit_nodes:is_running(Node, Application) of
+        true  -> case Activity() of
                      true  -> ok;
                      false -> timer:sleep(?EXTERNAL_CHECK_INTERVAL),
-                              wait_for_application(Node, Pid, Application)
+                              while_process_is_alive(Node, Pid, Activity)
                  end;
         false -> {error, process_not_running}
     end.
@@ -480,12 +591,14 @@ wait_for_process_death(Pid) ->
 read_pid_file(PidFile, Wait) ->
     case {file:read_file(PidFile), Wait} of
         {{ok, Bin}, _} ->
-            S = string:strip(binary_to_list(Bin), right, $\n),
-            try list_to_integer(S)
+            S = binary_to_list(Bin),
+            {match, [PidS]} = re:run(S, "[^\\s]+",
+                                     [{capture, all, list}]),
+            try list_to_integer(PidS)
             catch error:badarg ->
                     exit({error, {garbage_in_pid_file, PidFile}})
             end,
-            S;
+            PidS;
         {{error, enoent}, true} ->
             timer:sleep(?EXTERNAL_CHECK_INTERVAL),
             read_pid_file(PidFile, Wait);
@@ -497,12 +610,11 @@ read_pid_file(PidFile, Wait) ->
 % rpc:call(os, getpid, []) at this point
 process_up(Pid) ->
     with_os([{unix, fun () ->
-                            system("ps -p " ++ Pid
-                                   ++ " >/dev/null 2>&1") =:= 0
+                            run_ps(Pid) =:= 0
                     end},
              {win32, fun () ->
-                             Res = os:cmd("tasklist /nh /fi \"pid eq " ++
-                                          Pid ++ "\" 2>&1"),
+                             Cmd = "tasklist /nh /fi \"pid eq " ++ Pid ++ "\" ",
+                             Res = rabbit_misc:os_cmd(Cmd ++ "2>&1"),
                              case re:run(Res, "erl\\.exe", [{capture, none}]) of
                                  match -> true;
                                  _     -> false
@@ -516,18 +628,17 @@ with_os(Handlers) ->
         Handler   -> Handler()
     end.
 
-% Like system(3)
-system(Cmd) ->
-    ShCmd = "sh -c '" ++ escape_quotes(Cmd) ++ "'",
-    Port = erlang:open_port({spawn, ShCmd}, [exit_status,nouse_stdio]),
-    receive {Port, {exit_status, Status}} -> Status end.
-
-% Escape the quotes in a shell command so that it can be used in "sh -c 'cmd'"
-escape_quotes(Cmd) ->
-    lists:flatten(lists:map(fun ($') -> "'\\''"; (Ch) -> Ch end, Cmd)).
+%% Run "ps -p Pid" through a port and return what exit_loop/1 yields for it.
+run_ps(Pid) ->
+    PortOpts = [exit_status, {line, 16384}, use_stdio, stderr_to_stdout],
+    PsPort   = erlang:open_port({spawn, "ps -p " ++ Pid}, PortOpts),
+    exit_loop(PsPort).
 
-format_parse_error({_Line, Mod, Err}) ->
-    lists:flatten(Mod:format_error(Err)).
+exit_loop(Port) ->
+    case receive {Port, Msg} -> Msg end of
+        {exit_status, Rc} -> Rc;
+        _Output           -> exit_loop(Port)
+    end.
 
 %%----------------------------------------------------------------------------
 
@@ -541,7 +652,7 @@ display_info_list(Results, InfoItemKeys) when is_list(Results) ->
       fun (Result) -> display_row(
                         [format_info_item(proplists:get_value(X, Result)) ||
                             X <- InfoItemKeys])
-      end, Results),
+      end, lists:sort(Results)),
     ok;
 display_info_list(Other, _) ->
     Other.
diff --git a/src/rabbit_direct.erl b/src/rabbit_direct.erl
index c07ad832..a7ee3276 100644
--- a/src/rabbit_direct.erl
+++ b/src/rabbit_direct.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_direct).
@@ -31,16 +31,18 @@
 -spec(force_event_refresh/0 :: () -> 'ok').
 -spec(list/0 :: () -> [pid()]).
 -spec(list_local/0 :: () -> [pid()]).
--spec(connect/5 :: (rabbit_types:username(), rabbit_types:vhost(),
-                    rabbit_types:protocol(), pid(),
+-spec(connect/5 :: ((rabbit_types:username() | rabbit_types:user() |
+                     {rabbit_types:username(), rabbit_types:password()}),
+                    rabbit_types:vhost(), rabbit_types:protocol(), pid(),
                     rabbit_event:event_props()) ->
-                        {'ok', {rabbit_types:user(),
-                                rabbit_framing:amqp_table()}}).
+                        rabbit_types:ok_or_error2(
+                          {rabbit_types:user(), rabbit_framing:amqp_table()},
+                          'broker_not_found_on_node' | 'auth_failure' |
+                          'access_refused')).
 -spec(start_channel/9 ::
         (rabbit_channel:channel_number(), pid(), pid(), string(),
          rabbit_types:protocol(), rabbit_types:user(), rabbit_types:vhost(),
          rabbit_framing:amqp_table(), pid()) -> {'ok', pid()}).
-
 -spec(disconnect/2 :: (pid(), rabbit_event:event_props()) -> 'ok').
 
 -endif.
@@ -60,32 +62,42 @@ list_local() ->
     pg_local:get_members(rabbit_direct).
 
 list() ->
-    rabbit_misc:append_rpc_all_nodes(rabbit_mnesia:running_clustered_nodes(),
+    rabbit_misc:append_rpc_all_nodes(rabbit_mnesia:cluster_nodes(running),
                                      rabbit_direct, list_local, []).
 
 %%----------------------------------------------------------------------------
 
+connect(User = #user{}, VHost, Protocol, Pid, Infos) ->
+    try rabbit_access_control:check_vhost_access(User, VHost) of
+        ok -> ok = pg_local:join(rabbit_direct, Pid),
+              rabbit_event:notify(connection_created, Infos),
+              {ok, {User, rabbit_reader:server_properties(Protocol)}}
+    catch
+        exit:#amqp_error{name = access_refused} ->
+            {error, access_refused}
+    end;
+
+connect({Username, Password}, VHost, Protocol, Pid, Infos) ->
+    connect0(fun () -> rabbit_access_control:check_user_pass_login(
+                         Username, Password) end,
+             VHost, Protocol, Pid, Infos);
+
 connect(Username, VHost, Protocol, Pid, Infos) ->
+    connect0(fun () -> rabbit_access_control:check_user_login(
+                         Username, []) end,
+             VHost, Protocol, Pid, Infos).
+
+connect0(AuthFun, VHost, Protocol, Pid, Infos) ->
     case rabbit:is_running() of
-        true  ->
-            case rabbit_access_control:check_user_login(Username, []) of
-                {ok, User} ->
-                    try rabbit_access_control:check_vhost_access(User, VHost) of
-                        ok -> ok = pg_local:join(rabbit_direct, Pid),
-                              rabbit_event:notify(connection_created, Infos),
-                              {ok, {User,
-                                    rabbit_reader:server_properties(Protocol)}}
-                    catch
-                        exit:#amqp_error{name = access_refused} ->
-                            {error, access_refused}
-                    end;
-                {refused, _Msg, _Args} ->
-                    {error, auth_failure}
-            end;
-        false ->
-            {error, broker_not_found_on_node}
+        true  -> case AuthFun() of
+                     {ok, User}        -> connect(User, VHost, Protocol, Pid,
+                                                  Infos);
+                     {refused, _M, _A} -> {error, auth_failure}
+                 end;
+        false -> {error, broker_not_found_on_node}
     end.
 
+
 start_channel(Number, ClientChannelPid, ConnPid, ConnName, Protocol, User,
               VHost, Capabilities, Collector) ->
     {ok, _, {ChannelPid, _}} =
diff --git a/src/rabbit_disk_monitor.erl b/src/rabbit_disk_monitor.erl
index d9e8e8e4..5aaa1b2d 100644
--- a/src/rabbit_disk_monitor.erl
+++ b/src/rabbit_disk_monitor.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_disk_monitor).
@@ -27,10 +27,11 @@
          set_check_interval/1, get_disk_free/0]).
 
 -define(SERVER, ?MODULE).
--define(DEFAULT_DISK_CHECK_INTERVAL, 60000).
+-define(DEFAULT_DISK_CHECK_INTERVAL, 10000).
 
 -record(state, {dir,
                 limit,
+                actual,
                 timeout,
                 timer,
                 alarmed
@@ -106,8 +107,8 @@ handle_call({set_check_interval, Timeout}, _From, State) ->
     {ok, cancel} = timer:cancel(State#state.timer),
     {reply, ok, State#state{timeout = Timeout, timer = start_timer(Timeout)}};
 
-handle_call(get_disk_free, _From, State = #state { dir = Dir }) ->
-    {reply, get_disk_free(Dir), State};
+handle_call(get_disk_free, _From, State = #state { actual = Actual }) ->
+    {reply, Actual, State};
 
 handle_call(_Request, _From, State) ->
     {noreply, State}.
@@ -137,7 +138,7 @@ dir() -> rabbit_mnesia:dir().
 set_disk_limits(State, Limit) ->
     State1 = State#state { limit = Limit },
     rabbit_log:info("Disk free limit set to ~pMB~n",
-                    [trunc(interpret_limit(Limit) / 1048576)]),
+                    [trunc(interpret_limit(Limit) / 1000000)]),
     internal_update(State1).
 
 internal_update(State = #state { limit   = Limit,
@@ -148,15 +149,15 @@ internal_update(State = #state { limit   = Limit,
     NewAlarmed = CurrentFreeBytes < LimitBytes,
     case {Alarmed, NewAlarmed} of
         {false, true} ->
-            emit_update_info("exceeded", CurrentFreeBytes, LimitBytes),
-            alarm_handler:set_alarm({{resource_limit, disk, node()}, []});
+            emit_update_info("insufficient", CurrentFreeBytes, LimitBytes),
+            rabbit_alarm:set_alarm({{resource_limit, disk, node()}, []});
         {true, false} ->
-            emit_update_info("below limit", CurrentFreeBytes, LimitBytes),
-            alarm_handler:clear_alarm({resource_limit, disk, node()});
+            emit_update_info("sufficient", CurrentFreeBytes, LimitBytes),
+            rabbit_alarm:clear_alarm({resource_limit, disk, node()});
         _ ->
             ok
     end,
-    State #state {alarmed = NewAlarmed}.
+    State #state {alarmed = NewAlarmed, actual = CurrentFreeBytes}.
 
 get_disk_free(Dir) ->
     get_disk_free(Dir, os:type()).
@@ -167,9 +168,9 @@ get_disk_free(Dir, {unix, Sun})
 get_disk_free(Dir, {unix, _}) ->
     parse_free_unix(rabbit_misc:os_cmd("/bin/df -kP " ++ Dir));
 get_disk_free(Dir, {win32, _}) ->
-    parse_free_win32(os:cmd("dir /-C /W \"" ++ Dir ++ [$"]));
-get_disk_free(_, _) ->
-    unknown.
+    parse_free_win32(rabbit_misc:os_cmd("dir /-C /W \"" ++ Dir ++ [$"]));
+get_disk_free(_, Platform) ->
+    {unknown, Platform}.
 
 parse_free_unix(CommandResult) ->
     [_, Stats | _] = string:tokens(CommandResult, "\n"),
@@ -187,10 +188,10 @@ interpret_limit({mem_relative, R}) ->
 interpret_limit(L) ->
     L.
 
-emit_update_info(State, CurrentFree, Limit) ->
+emit_update_info(StateStr, CurrentFree, Limit) ->
     rabbit_log:info(
-      "Disk free space limit now ~s. Free bytes:~p Limit:~p~n",
-      [State, CurrentFree, Limit]).
+      "Disk free space ~s. Free bytes:~p Limit:~p~n",
+      [StateStr, CurrentFree, Limit]).
 
 start_timer(Timeout) ->
     {ok, TRef} = timer:send_interval(Timeout, update),
diff --git a/src/rabbit_error_logger.erl b/src/rabbit_error_logger.erl
index f1672f4e..17ed8563 100644
--- a/src/rabbit_error_logger.erl
+++ b/src/rabbit_error_logger.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_error_logger).
@@ -22,7 +22,7 @@
 
 -behaviour(gen_event).
 
--export([boot/0]).
+-export([start/0, stop/0]).
 
 -export([init/1, terminate/2, code_change/3, handle_call/2, handle_event/2,
          handle_info/2]).
@@ -31,16 +31,23 @@
 
 -ifdef(use_specs).
 
--spec(boot/0 :: () -> 'ok').
+-spec(start/0 :: () -> 'ok').
+-spec(stop/0  :: () -> 'ok').
 
 -endif.
 
 %%----------------------------------------------------------------------------
 
-boot() ->
+start() ->
     {ok, DefaultVHost} = application:get_env(default_vhost),
     ok = error_logger:add_report_handler(?MODULE, [DefaultVHost]).
 
+stop() ->
+    terminated_ok = error_logger:delete_report_handler(?MODULE),
+    ok.
+
+%%----------------------------------------------------------------------------
+
 init([DefaultVHost]) ->
     #exchange{} = rabbit_exchange:declare(
                     rabbit_misc:r(DefaultVHost, exchange, ?LOG_EXCH_NAME),
@@ -81,7 +88,7 @@ publish1(RoutingKey, Format, Data, LogExch) ->
     %% second resolution, not millisecond.
     Timestamp = rabbit_misc:now_ms() div 1000,
     {ok, _RoutingRes, _DeliveredQPids} =
-        rabbit_basic:publish(LogExch, RoutingKey, false, false,
+        rabbit_basic:publish(LogExch, RoutingKey,
                              #'P_basic'{content_type = <<"text/plain">>,
                                         timestamp    = Timestamp},
                              list_to_binary(io_lib:format(Format, Data))),
diff --git a/src/rabbit_error_logger_file_h.erl b/src/rabbit_error_logger_file_h.erl
index 042ab23c..d59641b0 100644
--- a/src/rabbit_error_logger_file_h.erl
+++ b/src/rabbit_error_logger_file_h.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_error_logger_file_h).
@@ -76,6 +76,9 @@ init_file(File, PrevHandler) ->
         Error   -> Error
     end.
 
+%% filter out "application: foo; exited: stopped; type: temporary"
+handle_event({info_report, _, {_, std_info, _}}, State) ->
+    {ok, State};
 handle_event(Event, State) ->
     error_logger_file_h:handle_event(Event, State).
 
diff --git a/src/rabbit_event.erl b/src/rabbit_event.erl
index 3f1b20fe..a713d76b 100644
--- a/src/rabbit_event.erl
+++ b/src/rabbit_event.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_event).
@@ -19,8 +19,8 @@
 -include("rabbit.hrl").
 
 -export([start_link/0]).
--export([init_stats_timer/2, ensure_stats_timer/3, stop_stats_timer/2]).
--export([reset_stats_timer/2]).
+-export([init_stats_timer/2, init_disabled_stats_timer/2,
+         ensure_stats_timer/3, stop_stats_timer/2, reset_stats_timer/2]).
 -export([stats_level/2, if_enabled/3]).
 -export([notify/2, notify_if/3]).
 
@@ -51,6 +51,7 @@
 
 -spec(start_link/0 :: () -> rabbit_types:ok_pid_or_error()).
 -spec(init_stats_timer/2 :: (container(), pos()) -> container()).
+-spec(init_disabled_stats_timer/2 :: (container(), pos()) -> container()).
 -spec(ensure_stats_timer/3 :: (container(), pos(), term()) -> container()).
 -spec(stop_stats_timer/2 :: (container(), pos()) -> container()).
 -spec(reset_stats_timer/2 :: (container(), pos()) -> container()).
@@ -90,10 +91,13 @@ start_link() ->
 
 init_stats_timer(C, P) ->
     {ok, StatsLevel} = application:get_env(rabbit, collect_statistics),
-    {ok, Interval} = application:get_env(rabbit, collect_statistics_interval),
+    {ok, Interval}   = application:get_env(rabbit, collect_statistics_interval),
     setelement(P, C, #state{level = StatsLevel, interval = Interval,
                             timer = undefined}).
 
+init_disabled_stats_timer(C, P) -> % element P of C: level none, no timer
+    setelement(P, C, #state{level = none, interval = 0, timer = undefined}).
+
 ensure_stats_timer(C, P, Msg) ->
     case element(P, C) of
         #state{level = Level, interval = Interval, timer = undefined} = State
@@ -106,18 +110,18 @@ ensure_stats_timer(C, P, Msg) ->
 
 stop_stats_timer(C, P) ->
     case element(P, C) of
-        #state{level = Level, timer = TRef} = State
-          when Level =/= none andalso TRef =/= undefined ->
-            erlang:cancel_timer(TRef),
-            setelement(P, C, State#state{timer = undefined});
+        #state{timer = TRef} = State when TRef =/= undefined ->
+            case erlang:cancel_timer(TRef) of
+                false -> C;
+                _     -> setelement(P, C, State#state{timer = undefined})
+            end;
         #state{} ->
             C
     end.
 
 reset_stats_timer(C, P) ->
     case element(P, C) of
-        #state{timer = TRef} = State
-          when TRef =/= undefined ->
+        #state{timer = TRef} = State when TRef =/= undefined ->
             setelement(P, C, State#state{timer = undefined});
         #state{} ->
             C
@@ -137,8 +141,6 @@ notify_if(true,   Type,  Props) -> notify(Type, Props);
 notify_if(false, _Type, _Props) -> ok.
 
 notify(Type, Props) ->
-    %% TODO: switch to os:timestamp() when we drop support for
-    %% Erlang/OTP < R13B01
     gen_event:notify(?MODULE, #event{type      = Type,
                                      props     = Props,
-                                     timestamp = now()}).
+                                     timestamp = os:timestamp()}).
diff --git a/src/rabbit_exchange.erl b/src/rabbit_exchange.erl
index 910a89b4..49952a4d 100644
--- a/src/rabbit_exchange.erl
+++ b/src/rabbit_exchange.erl
@@ -10,21 +10,21 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_exchange).
 -include("rabbit.hrl").
 -include("rabbit_framing.hrl").
 
--export([recover/0, callback/3, declare/6,
+-export([recover/0, policy_changed/2, callback/4, declare/6,
          assert_equivalence/6, assert_args_equivalence/2, check_type/1,
-         lookup/1, lookup_or_die/1, list/1, update_scratch/2,
+         lookup/1, lookup_or_die/1, list/1, lookup_scratch/2, update_scratch/3,
          info_keys/0, info/1, info/2, info_all/1, info_all/2,
-         route/2, delete/2]).
+         route/2, delete/2, validate_binding/2]).
 %% these must be run inside a mnesia tx
--export([maybe_auto_delete/1, serial/1, peek_serial/1]).
+-export([maybe_auto_delete/1, serial/1, peek_serial/1, update/2]).
 
 %%----------------------------------------------------------------------------
 
@@ -37,7 +37,11 @@
 -type(fun_name() :: atom()).
 
 -spec(recover/0 :: () -> [name()]).
--spec(callback/3:: (rabbit_types:exchange(), fun_name(), [any()]) -> 'ok').
+-spec(callback/4::
+        (rabbit_types:exchange(), fun_name(),
+         fun((boolean()) -> non_neg_integer()) | atom(), [any()]) -> 'ok').
+-spec(policy_changed/2 ::
+        (rabbit_types:exchange(), rabbit_types:exchange()) -> 'ok').
 -spec(declare/6 ::
         (name(), type(), boolean(), boolean(), boolean(),
          rabbit_framing:amqp_table())
@@ -58,7 +62,14 @@
         (name()) -> rabbit_types:exchange() |
                     rabbit_types:channel_exit()).
 -spec(list/1 :: (rabbit_types:vhost()) -> [rabbit_types:exchange()]).
--spec(update_scratch/2 :: (name(), fun((any()) -> any())) -> 'ok').
+-spec(lookup_scratch/2 :: (name(), atom()) ->
+                               rabbit_types:ok(term()) |
+                               rabbit_types:error('not_found')).
+-spec(update_scratch/3 :: (name(), atom(), fun((any()) -> any())) -> 'ok').
+-spec(update/2 ::
+        (name(),
+         fun((rabbit_types:exchange()) -> rabbit_types:exchange()))
+         -> not_found | rabbit_types:exchange()).
 -spec(info_keys/0 :: () -> rabbit_types:info_keys()).
 -spec(info/1 :: (rabbit_types:exchange()) -> rabbit_types:infos()).
 -spec(info/2 ::
@@ -73,17 +84,22 @@
         (name(), boolean())-> 'ok' |
                               rabbit_types:error('not_found') |
                               rabbit_types:error('in_use')).
+-spec(validate_binding/2 ::
+        (rabbit_types:exchange(), rabbit_types:binding())
+        -> rabbit_types:ok_or_error({'binding_invalid', string(), [any()]})).
 -spec(maybe_auto_delete/1::
         (rabbit_types:exchange())
         -> 'not_deleted' | {'deleted', rabbit_binding:deletions()}).
--spec(serial/1 :: (rabbit_types:exchange()) -> 'none' | pos_integer()).
+-spec(serial/1 :: (rabbit_types:exchange()) ->
+                       fun((boolean()) -> 'none' | pos_integer())).
 -spec(peek_serial/1 :: (name()) -> pos_integer() | 'undefined').
 
 -endif.
 
 %%----------------------------------------------------------------------------
 
--define(INFO_KEYS, [name, type, durable, auto_delete, internal, arguments]).
+-define(INFO_KEYS, [name, type, durable, auto_delete, internal, arguments,
+                    policy]).
 
 recover() ->
     Xs = rabbit_misc:table_filter(
@@ -95,21 +111,60 @@ recover() ->
                        true  -> store(X);
                        false -> ok
                    end,
-                   rabbit_exchange:callback(X, create, [map_create_tx(Tx), X])
+                   callback(X, create, map_create_tx(Tx), [X])
            end,
            rabbit_durable_exchange),
+    report_missing_decorators(Xs),
     [XName || #exchange{name = XName} <- Xs].
 
-callback(#exchange{type = XType}, Fun, Args) ->
-    apply(type_to_module(XType), Fun, Args).
+report_missing_decorators(Xs) -> %% warn about decorators that cannot be found
+    Named = lists:usort(lists:append([rabbit_exchange_decorator:select(raw, Ds)
+                                      || #exchange{decorators = Ds} <- Xs])),
+    case lists:filter(fun (M) -> code:which(M) =:= non_existing end, Named) of
+        []   -> ok;
+        Gone -> rabbit_log:warning("Missing exchange decorators: ~p~n", [Gone])
+    end.
+
+callback(X = #exchange{type = XType, decorators = Ds}, Fun, Serial0, Args) ->
+    ToSerial = if is_function(Serial0) -> Serial0;
+                  is_atom(Serial0)     -> fun (_Bool) -> Serial0 end
+               end,
+    %% decorators are invoked first, then the exchange type module itself
+    [ok = apply(D, Fun, [ToSerial(D:serialise_events(X)) | Args]) ||
+        D <- rabbit_exchange_decorator:select(all, Ds)],
+    TypeMod = type_to_module(XType),
+    apply(TypeMod, Fun, [ToSerial(TypeMod:serialise_events()) | Args]).
+
+policy_changed(X  = #exchange{type = XType, decorators = Ds},
+               X1 = #exchange{decorators = Ds1}) ->
+    %% tell the type module, then each decorator selected for either record
+    Active = lists:usort(rabbit_exchange_decorator:select(all, Ds) ++
+                             rabbit_exchange_decorator:select(all, Ds1)),
+    lists:foreach(fun (M) -> ok = M:policy_changed(X, X1) end,
+                  [type_to_module(XType) | Active]),
+    ok.
+
+serialise_events(X = #exchange{type = Type, decorators = Ds}) ->
+    Wants = fun (Dec) -> Dec:serialise_events(X) end, %% ask each decorator
+    lists:any(Wants, rabbit_exchange_decorator:select(all, Ds))
+        orelse (type_to_module(Type)):serialise_events(). %% else ask the type
+
+%% Compute the serial once (none unless something on X serialises events) and
+%% return a fun yielding it for true and none for false.
+serial(#exchange{name = XName} = X) ->
+    Next = case serialise_events(X) of
+               true  -> next_serial(XName);
+               false -> none
+           end,
+    fun (true) -> Next; (false) -> none end.
 
 declare(XName, Type, Durable, AutoDelete, Internal, Args) ->
-    X = #exchange{name        = XName,
-                  type        = Type,
-                  durable     = Durable,
-                  auto_delete = AutoDelete,
-                  internal    = Internal,
-                  arguments   = Args},
+    X = rabbit_policy:set(#exchange{name        = XName,
+                                    type        = Type,
+                                    durable     = Durable,
+                                    auto_delete = AutoDelete,
+                                    internal    = Internal,
+                                    arguments   = Args}),
     XT = type_to_module(Type),
     %% We want to upset things if it isn't ok
     ok = XT:validate(X),
@@ -129,7 +184,7 @@ declare(XName, Type, Durable, AutoDelete, Internal, Args) ->
               end
       end,
       fun ({new, Exchange}, Tx) ->
-              ok = XT:create(map_create_tx(Tx), Exchange),
+              ok = callback(X, create, map_create_tx(Tx), [Exchange]),
               rabbit_event:notify_if(not Tx, exchange_created, info(Exchange)),
               Exchange;
           ({existing, Exchange}, _Tx) ->
@@ -141,13 +196,7 @@ declare(XName, Type, Durable, AutoDelete, Internal, Args) ->
 map_create_tx(true)  -> transaction;
 map_create_tx(false) -> none.
 
-store(X = #exchange{name = Name, type = Type}) ->
-    ok = mnesia:write(rabbit_exchange, X, write),
-    case (type_to_module(Type)):serialise_events() of
-        true  -> S = #exchange_serial{name = Name, next = 1},
-                 ok = mnesia:write(rabbit_exchange_serial, S, write);
-        false -> ok
-    end.
+store(X) -> ok = mnesia:write(rabbit_exchange, X, write).
 
 %% Used with binaries sent over the wire; the type may not exist.
 check_type(TypeBin) ->
@@ -200,23 +249,53 @@ list(VHostPath) ->
       rabbit_exchange,
       #exchange{name = rabbit_misc:r(VHostPath, exchange), _ = '_'}).
 
-update_scratch(Name, Fun) ->
+%% Look up the scratch stored under App on exchange Name: {ok, Value}, or
+%% {error, not_found} if the exchange, its scratches or that key is absent.
+lookup_scratch(Name, App) ->
+    case lookup(Name) of
+        {error, not_found} = Err               -> Err;
+        {ok, #exchange{scratches = undefined}} -> {error, not_found};
+        {ok, #exchange{scratches = Dict}}      ->
+            case orddict:find(App, Dict) of
+                error        -> {error, not_found};
+                {ok, _} = Ok -> Ok
+            end
+    end.
+
+update_scratch(Name, App, Fun) ->
     rabbit_misc:execute_mnesia_transaction(
       fun() ->
-              case mnesia:wread({rabbit_exchange, Name}) of
-                  [X = #exchange{durable = Durable, scratch = Scratch}] ->
-                      X1 = X#exchange{scratch = Fun(Scratch)},
-                      ok = mnesia:write(rabbit_exchange, X1, write),
-                      case Durable of
-                          true -> ok = mnesia:write(rabbit_durable_exchange,
-                                                    X1, write);
-                          _    -> ok
-                      end;
-                  [] ->
-                      ok
-              end
+              update(Name,
+                     fun(X = #exchange{scratches = Scratches0}) ->
+                             Scratches1 = case Scratches0 of
+                                              undefined -> orddict:new();
+                                              _         -> Scratches0
+                                          end,
+                             Scratch = case orddict:find(App, Scratches1) of
+                                           {ok, S} -> S;
+                                           error   -> undefined
+                                       end,
+                             Scratches2 = orddict:store(
+                                            App, Fun(Scratch), Scratches1),
+                             X#exchange{scratches = Scratches2}
+                     end),
+              ok
       end).
 
+%% Rewrite exchange Name via Fun (also in rabbit_durable_exchange if durable);
+%% must run inside a mnesia tx. Returns the new record, or not_found.
+update(Name, Fun) ->
+    case mnesia:wread({rabbit_exchange, Name}) of
+        [] ->
+            not_found;
+        [Old = #exchange{durable = Durable}] ->
+            New = Fun(Old),
+            ok = mnesia:write(rabbit_exchange, New, write),
+            Durable =:= true andalso
+                (ok = mnesia:write(rabbit_durable_exchange, New, write)),
+            New
+    end.
+
 info_keys() -> ?INFO_KEYS.
 
 map(VHostPath, F) ->
@@ -232,6 +311,10 @@ i(durable,     #exchange{durable     = Durable})    -> Durable;
 i(auto_delete, #exchange{auto_delete = AutoDelete}) -> AutoDelete;
 i(internal,    #exchange{internal    = Internal})   -> Internal;
 i(arguments,   #exchange{arguments   = Arguments})  -> Arguments;
+i(policy,      X) ->  case rabbit_policy:name(X) of
+                          none   -> '';
+                          Policy -> Policy
+                      end;
 i(Item, _) -> throw({bad_argument, Item}).
 
 info(X = #exchange{}) -> infos(?INFO_KEYS, X).
@@ -242,58 +325,67 @@ info_all(VHostPath) -> map(VHostPath, fun (X) -> info(X) end).
 
 info_all(VHostPath, Items) -> map(VHostPath, fun (X) -> info(X, Items) end).
 
-%% Optimisation
-route(#exchange{name = #resource{name = <<"">>, virtual_host = VHost}},
-      #delivery{message = #basic_message{routing_keys = RKs}}) ->
-    [rabbit_misc:r(VHost, queue, RK) || RK <- lists:usort(RKs)];
-
-route(X = #exchange{name = XName}, Delivery) ->
-    route1(Delivery, {queue:from_list([X]), XName, []}).
-
-route1(Delivery, {WorkList, SeenXs, QNames}) ->
-    case queue:out(WorkList) of
-        {empty, _WorkList} ->
-            lists:usort(QNames);
-        {{value, X = #exchange{type = Type}}, WorkList1} ->
-            DstNames = process_alternate(
-                         X, ((type_to_module(Type)):route(X, Delivery))),
-            route1(Delivery,
-                   lists:foldl(fun process_route/2, {WorkList1, SeenXs, QNames},
-                               DstNames))
+route(#exchange{name = #resource{virtual_host = VHost, name = RName} = XName,
+                decorators = Decorators} = X,
+      #delivery{message = #basic_message{routing_keys = RKs}} = Delivery) ->
+    case {RName, rabbit_exchange_decorator:select(route, Decorators)} of
+        {<<"">>, []} ->
+            %% Optimisation
+            [rabbit_misc:r(VHost, queue, RK) || RK <- lists:usort(RKs)];
+        {_, SelectedDecorators} ->
+            lists:usort(route1(Delivery, SelectedDecorators, {[X], XName, []}))
     end.
 
-process_alternate(#exchange{arguments = []}, Results) -> %% optimisation
-     Results;
+route1(_Delivery, _Decorators, {[], _SeenXs, QNames}) ->
+    QNames; %% work list exhausted
+route1(Delivery, Decorators,
+       {[X = #exchange{type = Type} | Rest], SeenXs, QNames}) ->
+    FromType = (type_to_module(Type)):route(X, Delivery),
+    FromDecs = process_decorators(X, Decorators, Delivery),
+    FromAlt  = process_alternate(X, FromType),
+    Dests    = FromAlt ++ FromDecs ++ FromType,
+    Acc = lists:foldl(fun process_route/2, {Rest, SeenXs, QNames}, Dests),
+    route1(Delivery, Decorators, Acc).
+
+process_alternate(#exchange{arguments = []}, _Results) -> %% optimisation
+    [];
 process_alternate(#exchange{name = XName, arguments = Args}, []) ->
     case rabbit_misc:r_arg(XName, exchange, Args, <<"alternate-exchange">>) of
         undefined -> [];
         AName     -> [AName]
     end;
-process_alternate(_X, Results) ->
-    Results.
+process_alternate(_X, _Results) ->
+    [].
+
+process_decorators(_X, [], _Delivery) -> %% optimisation: nobody to ask
+    [];
+process_decorators(X, Decorators, Delivery) ->
+    lists:flatmap(fun (D) -> D:route(X, Delivery) end, Decorators).
 
 process_route(#resource{kind = exchange} = XName,
               {_WorkList, XName, _QNames} = Acc) ->
     Acc;
 process_route(#resource{kind = exchange} = XName,
               {WorkList, #resource{kind = exchange} = SeenX, QNames}) ->
-    {case lookup(XName) of
-         {ok, X}            -> queue:in(X, WorkList);
-         {error, not_found} -> WorkList
-     end, gb_sets:from_list([SeenX, XName]), QNames};
+    {cons_if_present(XName, WorkList),
+     gb_sets:from_list([SeenX, XName]), QNames};
 process_route(#resource{kind = exchange} = XName,
               {WorkList, SeenXs, QNames} = Acc) ->
     case gb_sets:is_element(XName, SeenXs) of
         true  -> Acc;
-        false -> {case lookup(XName) of
-                      {ok, X}            -> queue:in(X, WorkList);
-                      {error, not_found} -> WorkList
-                  end, gb_sets:add_element(XName, SeenXs), QNames}
+        false -> {cons_if_present(XName, WorkList),
+                  gb_sets:add_element(XName, SeenXs), QNames}
     end;
 process_route(#resource{kind = queue} = QName,
               {WorkList, SeenXs, QNames}) ->
     {WorkList, SeenXs, [QName | QNames]}.
 
+cons_if_present(XName, L) -> %% prepend XName's record to L if it exists
+    case lookup(XName) of
+        {error, not_found} -> L;
+        {ok, Found}        -> [Found | L]
+    end.
+
 call_with_exchange(XName, Fun) ->
     rabbit_misc:execute_mnesia_tx_with_tail(
       fun () -> case mnesia:read({rabbit_exchange, XName}) of
@@ -320,6 +412,10 @@ delete(XName, IfUnused) ->
               end
       end).
 
+%% Delegate validation of Binding to the module implementing X's type.
+validate_binding(X = #exchange{type = XType}, Binding) ->
+    (type_to_module(XType)):validate_binding(X, Binding).
+
 maybe_auto_delete(#exchange{auto_delete = false}) ->
     not_deleted;
 maybe_auto_delete(#exchange{auto_delete = true} = X) ->
@@ -335,34 +431,33 @@ conditional_delete(X = #exchange{name = XName}) ->
     end.
 
 unconditional_delete(X = #exchange{name = XName}) ->
-    ok = mnesia:delete({rabbit_durable_exchange, XName}),
+    %% this 'guarded' delete prevents unnecessary writes to the mnesia
+    %% disk log
+    case mnesia:wread({rabbit_durable_exchange, XName}) of
+        []  -> ok;
+        [_] -> ok = mnesia:delete({rabbit_durable_exchange, XName})
+    end,
     ok = mnesia:delete({rabbit_exchange, XName}),
     ok = mnesia:delete({rabbit_exchange_serial, XName}),
     Bindings = rabbit_binding:remove_for_source(XName),
     {deleted, X, Bindings, rabbit_binding:remove_for_destination(XName)}.
 
-serial(#exchange{name = XName, type = Type}) ->
-    case (type_to_module(Type)):serialise_events() of
-        true  -> next_serial(XName);
-        false -> none
-    end.
-
 next_serial(XName) ->
-    [#exchange_serial{next = Serial}] =
-        mnesia:read(rabbit_exchange_serial, XName, write),
+    Serial = peek_serial(XName, write),
     ok = mnesia:write(rabbit_exchange_serial,
                       #exchange_serial{name = XName, next = Serial + 1}, write),
     Serial.
 
-peek_serial(XName) ->
-    case mnesia:read({rabbit_exchange_serial, XName}) of
+peek_serial(XName) -> peek_serial(XName, read).
+
+peek_serial(XName, LockType) ->
+    case mnesia:read(rabbit_exchange_serial, XName, LockType) of
         [#exchange_serial{next = Serial}]  -> Serial;
-        _                                  -> undefined
+        _                                  -> 1
     end.
 
 invalid_module(T) ->
-    rabbit_log:warning(
-      "Could not find exchange type ~s.~n", [T]),
+    rabbit_log:warning("Could not find exchange type ~s.~n", [T]),
     put({xtype_to_module, T}, rabbit_exchange_type_invalid),
     rabbit_exchange_type_invalid.
 
diff --git a/src/rabbit_exchange_decorator.erl b/src/rabbit_exchange_decorator.erl
new file mode 100644
index 00000000..505998b9
--- /dev/null
+++ b/src/rabbit_exchange_decorator.erl
@@ -0,0 +1,106 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License
+%% at http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and
+%% limitations under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
+%%
+
+-module(rabbit_exchange_decorator).
+
+-include("rabbit.hrl").
+
+-export([select/2, set/1]).
+
+%% This is like an exchange type except that:
+%%
+%% 1) It applies to all exchanges as soon as it is installed, therefore
+%% 2) It is not allowed to affect validation, so no validate/1 or
+%%    assert_args_equivalence/2
+%%
+%% It's possible in the future we might make decorators
+%% able to manipulate messages as they are published.
+
+-ifdef(use_specs).
+
+-type(tx() :: 'transaction' | 'none').
+-type(serial() :: pos_integer() | tx()).
+
+-callback description() -> [proplists:property()].
+
+%% Should Rabbit ensure that all binding events that are
+%% delivered to an individual exchange can be serialised? (they
+%% might still be delivered out of order, but there'll be a
+%% serial number).
+-callback serialise_events(rabbit_types:exchange()) -> boolean().
+
+%% called after declaration and recovery
+-callback create(tx(), rabbit_types:exchange()) -> 'ok'.
+
+%% called after exchange (auto)deletion.
+-callback delete(tx(), rabbit_types:exchange(), [rabbit_types:binding()]) ->
+    'ok'.
+
+%% called when the policy attached to this exchange changes.
+-callback policy_changed(rabbit_types:exchange(), rabbit_types:exchange()) ->
+    'ok'.
+
+%% called after a binding has been added or recovered
+-callback add_binding(serial(), rabbit_types:exchange(),
+                      rabbit_types:binding()) -> 'ok'.
+
+%% called after bindings have been deleted.
+-callback remove_bindings(serial(), rabbit_types:exchange(),
+                          [rabbit_types:binding()]) -> 'ok'.
+
+%% Allows additional destinations to be added to the routing decision.
+-callback route(rabbit_types:exchange(), rabbit_types:delivery()) ->
+    [rabbit_amqqueue:name() | rabbit_exchange:name()].
+
+%% Whether the decorator wishes to receive callbacks for the exchange
+%% none: no callbacks; noroute: all callbacks except route; all: all callbacks
+-callback active_for(rabbit_types:exchange()) -> 'none' | 'noroute' | 'all'.
+
+-else.
+
+-export([behaviour_info/1]).
+
+behaviour_info(callbacks) -> %% {Function, Arity} pairs, one per -callback
+    [{description, 0}, {serialise_events, 1}, {create, 2}, {delete, 3},
+     {policy_changed, 2}, {add_binding, 3}, {remove_bindings, 3},
+     {route, 2}, {active_for, 1}];
+behaviour_info(_Other) ->
+    undefined.
+
+-endif.
+
+%%----------------------------------------------------------------------------
+
+%% select a subset of active decorators; only 'raw' bypasses filter/1
+select(all,   {Route, NoRoute})  -> filter(Route ++ NoRoute);
+select(route, {Route, _NoRoute}) -> filter(Route);
+select(raw,   {Route, NoRoute})  -> Route ++ NoRoute.
+
+filter(Modules) -> %% drop modules that code:which/1 cannot find
+    lists:filter(fun (M) -> code:which(M) =/= non_existing end, Modules).
+
+%% Store on X the registered decorators that are active for it, as a
+%% {Route, NoRoute} pair: modules answering 'all' and 'noroute' respectively.
+set(X) ->
+    %% reversed to keep the ordering that consing in a left fold produces
+    Rev = lists:reverse([{D:active_for(X), D} || D <- list()]),
+    X#exchange{decorators = {[D || {all,     D} <- Rev],
+                             [D || {noroute, D} <- Rev]}}.
+
+list() -> [M || {_, M} <- rabbit_registry:lookup_all(exchange_decorator)].
+
+cons_if_eq(Select,  Select, Item,  List) -> [Item | List]; %% same term twice
+cons_if_eq(_Select, _Other, _Item, List) -> List.
diff --git a/src/rabbit_exchange_type.erl b/src/rabbit_exchange_type.erl
index 1027570c..ce7a436b 100644
--- a/src/rabbit_exchange_type.erl
+++ b/src/rabbit_exchange_type.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_exchange_type).
@@ -21,7 +21,7 @@
 -type(tx() :: 'transaction' | 'none').
 -type(serial() :: pos_integer() | tx()).
 
--callback description() -> [proplist:property()].
+-callback description() -> [proplists:property()].
 
 %% Should Rabbit ensure that all binding events that are
 %% delivered to an individual exchange can be serialised? (they
@@ -37,6 +37,10 @@
 %% called BEFORE declaration, to check args etc; may exit with #amqp_error{}
 -callback validate(rabbit_types:exchange()) -> 'ok'.
 
+%% called BEFORE binding declaration, to check the binding's args etc
+-callback validate_binding(rabbit_types:exchange(), rabbit_types:binding()) ->
+    rabbit_types:ok_or_error({'binding_invalid', string(), [any()]}).
+
 %% called after declaration and recovery
 -callback create(tx(), rabbit_types:exchange()) -> 'ok'.
 
@@ -44,6 +48,10 @@
 -callback delete(tx(), rabbit_types:exchange(), [rabbit_types:binding()]) ->
     'ok'.
 
+%% called when the policy attached to this exchange changes.
+-callback policy_changed(rabbit_types:exchange(), rabbit_types:exchange()) ->
+    'ok'.
+
 %% called after a binding has been added or recovered
 -callback add_binding(serial(), rabbit_types:exchange(),
                       rabbit_types:binding()) -> 'ok'.
@@ -54,8 +62,8 @@
 
 %% called when comparing exchanges for equivalence - should return ok or
 %% exit with #amqp_error{}
--callback assert_args_equivalence (rabbit_types:exchange(),
-                                   rabbit_framing:amqp_table()) ->
+-callback assert_args_equivalence(rabbit_types:exchange(),
+                                  rabbit_framing:amqp_table()) ->
     'ok' | rabbit_types:connection_exit().
 
 -else.
@@ -63,7 +71,8 @@
 -export([behaviour_info/1]).
 
 behaviour_info(callbacks) ->
-    [{description, 0}, {serialise_events, 0}, {route, 2}, {validate, 1},
+    [{description, 0}, {serialise_events, 0}, {route, 2},
+     {validate, 1}, {validate_binding, 2}, {policy_changed, 2},
      {create, 2}, {delete, 3}, {add_binding, 3}, {remove_bindings, 3},
      {assert_args_equivalence, 2}];
 behaviour_info(_Other) ->
diff --git a/src/rabbit_exchange_type_direct.erl b/src/rabbit_exchange_type_direct.erl
index cdec1cb9..52704ab6 100644
--- a/src/rabbit_exchange_type_direct.erl
+++ b/src/rabbit_exchange_type_direct.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_exchange_type_direct).
@@ -20,8 +20,9 @@
 -behaviour(rabbit_exchange_type).
 
 -export([description/0, serialise_events/0, route/2]).
--export([validate/1, create/2, delete/3,
-         add_binding/3, remove_bindings/3, assert_args_equivalence/2]).
+-export([validate/1, validate_binding/2,
+         create/2, delete/3, policy_changed/2, add_binding/3,
+         remove_bindings/3, assert_args_equivalence/2]).
 
 -rabbit_boot_step({?MODULE,
                    [{description, "exchange type direct"},
@@ -31,8 +32,7 @@
                     {enables,     kernel_ready}]}).
 
 description() ->
-    [{name, <<"direct">>},
-     {description, <<"AMQP direct exchange, as per the AMQP specification">>}].
+    [{description, <<"AMQP direct exchange, as per the AMQP specification">>}].
 
 serialise_events() -> false.
 
@@ -41,8 +41,10 @@ route(#exchange{name = Name},
     rabbit_router:match_routing_key(Name, Routes).
 
 validate(_X) -> ok.
+validate_binding(_X, _B) -> ok.
 create(_Tx, _X) -> ok.
 delete(_Tx, _X, _Bs) -> ok.
+policy_changed(_X1, _X2) -> ok.
 add_binding(_Tx, _X, _B) -> ok.
 remove_bindings(_Tx, _X, _Bs) -> ok.
 assert_args_equivalence(X, Args) ->
diff --git a/src/rabbit_exchange_type_fanout.erl b/src/rabbit_exchange_type_fanout.erl
index a64f2c29..068472bb 100644
--- a/src/rabbit_exchange_type_fanout.erl
+++ b/src/rabbit_exchange_type_fanout.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_exchange_type_fanout).
@@ -20,7 +20,8 @@
 -behaviour(rabbit_exchange_type).
 
 -export([description/0, serialise_events/0, route/2]).
--export([validate/1, create/2, delete/3, add_binding/3,
+-export([validate/1, validate_binding/2,
+         create/2, delete/3, policy_changed/2, add_binding/3,
          remove_bindings/3, assert_args_equivalence/2]).
 
 -rabbit_boot_step({?MODULE,
@@ -31,8 +32,7 @@
                     {enables,     kernel_ready}]}).
 
 description() ->
-    [{name, <<"fanout">>},
-     {description, <<"AMQP fanout exchange, as per the AMQP specification">>}].
+    [{description, <<"AMQP fanout exchange, as per the AMQP specification">>}].
 
 serialise_events() -> false.
 
@@ -40,8 +40,10 @@ route(#exchange{name = Name}, _Delivery) ->
     rabbit_router:match_routing_key(Name, ['_']).
 
 validate(_X) -> ok.
+validate_binding(_X, _B) -> ok.
 create(_Tx, _X) -> ok.
 delete(_Tx, _X, _Bs) -> ok.
+policy_changed(_X1, _X2) -> ok.
 add_binding(_Tx, _X, _B) -> ok.
 remove_bindings(_Tx, _X, _Bs) -> ok.
 assert_args_equivalence(X, Args) ->
diff --git a/src/rabbit_exchange_type_headers.erl b/src/rabbit_exchange_type_headers.erl
index 61917d8f..baec9c29 100644
--- a/src/rabbit_exchange_type_headers.erl
+++ b/src/rabbit_exchange_type_headers.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_exchange_type_headers).
@@ -21,7 +21,8 @@
 -behaviour(rabbit_exchange_type).
 
 -export([description/0, serialise_events/0, route/2]).
--export([validate/1, create/2, delete/3, add_binding/3,
+-export([validate/1, validate_binding/2,
+         create/2, delete/3, policy_changed/2, add_binding/3,
          remove_bindings/3, assert_args_equivalence/2]).
 
 -rabbit_boot_step({?MODULE,
@@ -37,8 +38,7 @@
 -endif.
 
 description() ->
-    [{name, <<"headers">>},
-     {description, <<"AMQP headers exchange, as per the AMQP specification">>}].
+    [{description, <<"AMQP headers exchange, as per the AMQP specification">>}].
 
 serialise_events() -> false.
 
@@ -51,14 +51,26 @@ route(#exchange{name = Name},
     rabbit_router:match_bindings(
       Name, fun (#binding{args = Spec}) -> headers_match(Spec, Headers) end).
 
-default_headers_match_kind() -> all.
+%% x-match may be omitted: the spec is vague on whether that is allowed,
+%% but in practice it's useful to let people do it.
+validate_binding(_X, #binding{args = Args}) ->
+    case rabbit_misc:table_lookup(Args, <<"x-match">>) of
+        undefined            -> ok;
+        {longstr, <<"all">>} -> ok;
+        {longstr, <<"any">>} -> ok;
+        {longstr, Bad}       -> {error,
+                                 {binding_invalid,
+                                  "Invalid x-match field value ~p; "
+                                  "expected all or any", [Bad]}};
+        {Type,    Bad}       -> {error,
+                                 {binding_invalid,
+                                  "Invalid x-match field type ~p (value ~p); "
+                                  "expected longstr", [Type, Bad]}}
+    end.
 
-parse_x_match(<<"all">>) -> all;
-parse_x_match(<<"any">>) -> any;
-parse_x_match(Other) ->
-    rabbit_log:warning("Invalid x-match field value ~p; expected all or any",
-                       [Other]),
-    default_headers_match_kind().
+%% bindings predating validation may hold anything; those are treated as all
+parse_x_match({longstr, <<"any">>}) -> any;
+parse_x_match(_NotAny)              -> all.
 
 %% Horrendous matching algorithm. Depends for its merge-like
 %% (linear-time) behaviour on the lists:keysort
@@ -69,17 +81,9 @@ parse_x_match(Other) ->
 %% In other words: REQUIRES BOTH PATTERN AND DATA TO BE SORTED ASCENDING BY KEY.
 %%                 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 %%
-headers_match(Pattern, Data) ->
-    MatchKind = case lists:keysearch(<<"x-match">>, 1, Pattern) of
-                    {value, {_, longstr, MK}} -> parse_x_match(MK);
-                    {value, {_, Type, MK}} ->
-                        rabbit_log:warning("Invalid x-match field type ~p "
-                                           "(value ~p); expected longstr",
-                                           [Type, MK]),
-                        default_headers_match_kind();
-                    _ -> default_headers_match_kind()
-                end,
-    headers_match(Pattern, Data, true, false, MatchKind).
+headers_match(Args, Data) ->
+    XMatch = rabbit_misc:table_lookup(Args, <<"x-match">>),
+    headers_match(Args, Data, true, false, parse_x_match(XMatch)).
 
 headers_match([], _Data, AllMatch, _AnyMatch, all) ->
     AllMatch;
@@ -116,6 +120,7 @@ headers_match([{PK, PT, PV} | PRest], [{DK, DT, DV} | DRest],
 validate(_X) -> ok.
 create(_Tx, _X) -> ok.
 delete(_Tx, _X, _Bs) -> ok.
+policy_changed(_X1, _X2) -> ok.
 add_binding(_Tx, _X, _B) -> ok.
 remove_bindings(_Tx, _X, _Bs) -> ok.
 assert_args_equivalence(X, Args) ->
diff --git a/src/rabbit_exchange_type_invalid.erl b/src/rabbit_exchange_type_invalid.erl
index 82d27960..84bb2182 100644
--- a/src/rabbit_exchange_type_invalid.erl
+++ b/src/rabbit_exchange_type_invalid.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_exchange_type_invalid).
@@ -20,17 +20,21 @@
 -behaviour(rabbit_exchange_type).
 
 -export([description/0, serialise_events/0, route/2]).
--export([validate/1, create/2, delete/3,
-         add_binding/3, remove_bindings/3, assert_args_equivalence/2]).
+-export([validate/1, validate_binding/2,
+         create/2, delete/3, policy_changed/2, add_binding/3,
+         remove_bindings/3, assert_args_equivalence/2]).
 
 description() ->
-    [{name, <<"invalid">>},
-     {description,
+    [{description,
       <<"Dummy exchange type, to be used when the intended one is not found.">>
      }].
 
 serialise_events() -> false.
 
+-ifdef(use_specs).
+-spec(route/2 :: (rabbit_types:exchange(), rabbit_types:delivery())
+                 -> no_return()).
+-endif.
 route(#exchange{name = Name, type = Type}, _) ->
     rabbit_misc:protocol_error(
       precondition_failed,
@@ -38,8 +42,10 @@ route(#exchange{name = Name, type = Type}, _) ->
       [rabbit_misc:rs(Name), Type]).
 
 validate(_X) -> ok.
+validate_binding(_X, _B) -> ok.
 create(_Tx, _X) -> ok.
 delete(_Tx, _X, _Bs) -> ok.
+policy_changed(_X1, _X2) -> ok.
 add_binding(_Tx, _X, _B) -> ok.
 remove_bindings(_Tx, _X, _Bs) -> ok.
 assert_args_equivalence(X, Args) ->
diff --git a/src/rabbit_exchange_type_topic.erl b/src/rabbit_exchange_type_topic.erl
index 3160fdf4..8ba29deb 100644
--- a/src/rabbit_exchange_type_topic.erl
+++ b/src/rabbit_exchange_type_topic.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_exchange_type_topic).
@@ -21,7 +21,8 @@
 -behaviour(rabbit_exchange_type).
 
 -export([description/0, serialise_events/0, route/2]).
--export([validate/1, create/2, delete/3, add_binding/3,
+-export([validate/1, validate_binding/2,
+         create/2, delete/3, policy_changed/2, add_binding/3,
          remove_bindings/3, assert_args_equivalence/2]).
 
 -rabbit_boot_step({?MODULE,
@@ -34,8 +35,7 @@
 %%----------------------------------------------------------------------------
 
 description() ->
-    [{name, <<"topic">>},
-     {description, <<"AMQP topic exchange, as per the AMQP specification">>}].
+    [{description, <<"AMQP topic exchange, as per the AMQP specification">>}].
 
 serialise_events() -> false.
 
@@ -48,6 +48,7 @@ route(#exchange{name = X},
                   end || RKey <- Routes]).
 
 validate(_X) -> ok.
+validate_binding(_X, _B) -> ok.
 create(_Tx, _X) -> ok.
 
 delete(transaction, #exchange{name = X}, _Bs) ->
@@ -58,6 +59,8 @@ delete(transaction, #exchange{name = X}, _Bs) ->
 delete(none, _Exchange, _Bs) ->
     ok.
 
+policy_changed(_X1, _X2) -> ok.
+
 add_binding(transaction, _Exchange, Binding) ->
     internal_add_binding(Binding);
 add_binding(none, _Exchange, _Binding) ->
diff --git a/src/rabbit_file.erl b/src/rabbit_file.erl
index 59df14f3..4cf314ca 100644
--- a/src/rabbit_file.erl
+++ b/src/rabbit_file.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2011-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2011-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_file).
@@ -24,6 +24,8 @@
 -export([rename/2, delete/1, recursive_delete/1, recursive_copy/2]).
 -export([lock_file/1]).
 
+-define(TMP_EXT, ".tmp").
+
 %%----------------------------------------------------------------------------
 
 -ifdef(use_specs).
@@ -102,9 +104,12 @@ read_file_info(File) ->
     with_fhc_handle(fun () -> prim_file:read_file_info(File) end).
 
 with_fhc_handle(Fun) ->
-    ok = file_handle_cache:obtain(),
+    with_fhc_handle(1, Fun).
+
+with_fhc_handle(N, Fun) ->
+    ok = file_handle_cache:obtain(N),
     try Fun()
-    after ok = file_handle_cache:release()
+    after ok = file_handle_cache:release(N)
     end.
 
 read_term_file(File) ->
@@ -133,29 +138,17 @@ write_term_file(File, Terms) ->
 
 write_file(Path, Data) -> write_file(Path, Data, []).
 
-%% write_file/3 and make_binary/1 are both based on corresponding
-%% functions in the kernel/file.erl module of the Erlang R14B02
-%% release, which is licensed under the EPL. That implementation of
-%% write_file/3 does not do an fsync prior to closing the file, hence
-%% the existence of this version. APIs are otherwise identical.
 write_file(Path, Data, Modes) ->
     Modes1 = [binary, write | (Modes -- [binary, write])],
     case make_binary(Data) of
-        Bin when is_binary(Bin) ->
-            with_fhc_handle(
-              fun () -> case prim_file:open(Path, Modes1) of
-                            {ok, Hdl}      -> try prim_file:write(Hdl, Bin) of
-                                                  ok -> prim_file:sync(Hdl);
-                                                  {error, _} = E -> E
-                                              after
-                                                  prim_file:close(Hdl)
-                                              end;
-                            {error, _} = E -> E
-                        end
-              end);
-        {error, _} = E -> E
+        Bin when is_binary(Bin) -> write_file1(Path, Bin, Modes1);
+        {error, _} = E          -> E
     end.
 
+%% make_binary/1 is based on the corresponding function in the
+%% kernel/file.erl module of the Erlang R14B02 release, which is
+%% licensed under the EPL.
+
 make_binary(Bin) when is_binary(Bin) ->
     Bin;
 make_binary(List) ->
@@ -165,7 +158,41 @@ make_binary(List) ->
             {error, Reason}
     end.
 
+write_file1(Path, Bin, Modes) ->
+    try
+        with_synced_copy(Path, Modes,
+                         fun (Hdl) ->
+                                 ok = prim_file:write(Hdl, Bin)
+                         end)
+    catch
+        error:{badmatch, Error} -> Error;
+            _:{error, Error}    -> {error, Error}
+    end.
+
+%% Run Fun on a handle to Path ++ ?TMP_EXT, sync it, then rename it over
+%% Path. Returns Fun's result, or an error tuple if append/open is refused.
+with_synced_copy(Path, Modes, Fun) ->
+    case lists:member(append, Modes) of
+        true  -> {error, append_not_supported, Path};
+        false ->
+            with_fhc_handle(
+              fun () ->
+                      Bak = Path ++ ?TMP_EXT,
+                      case prim_file:open(Bak, Modes) of
+                          {ok, Hdl} ->
+                              try Result = Fun(Hdl),
+                                  %% sync first, so rename only publishes flushed data
+                                  ok = prim_file:sync(Hdl),
+                                  ok = prim_file:rename(Bak, Path),
+                                  Result
+                              after prim_file:close(Hdl)
+                              end;
+                          {error, _} = E -> E
+                      end
+              end)
+    end.
 
+%% TODO the semantics of this function are rather odd. But see bug 25021.
 append_file(File, Suffix) ->
     case read_file_info(File) of
         {ok, FInfo}     -> append_file(File, FInfo#file_info.size, Suffix);
@@ -183,9 +210,11 @@ append_file(File, 0, Suffix) ->
                             end
                     end);
 append_file(File, _, Suffix) ->
-    case with_fhc_handle(fun () -> prim_file:read_file(File) end) of
-        {ok, Data} -> write_file([File, Suffix], Data, [append]);
-        Error      -> Error
+    case with_fhc_handle(2, fun () ->
+                                file:copy(File, {[File, Suffix], [append]})
+                            end) of
+        {ok, _BytesCopied} -> ok;
+        Error              -> Error
     end.
 
 ensure_parent_dirs_exist(Filename) ->
diff --git a/src/rabbit_framing.erl b/src/rabbit_framing.erl
index a79188ab..51aaa999 100644
--- a/src/rabbit_framing.erl
+++ b/src/rabbit_framing.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 %% TODO auto-generate
diff --git a/src/rabbit_guid.erl b/src/rabbit_guid.erl
index ba0cb04f..70d1f0c1 100644
--- a/src/rabbit_guid.erl
+++ b/src/rabbit_guid.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_guid).
@@ -63,6 +63,7 @@ update_disk_serial() ->
     Filename = filename(),
     Serial = case rabbit_file:read_term_file(Filename) of
                  {ok, [Num]}     -> Num;
+                 {ok, []}        -> 0; %% [1]
                  {error, enoent} -> 0;
                  {error, Reason} ->
                      throw({error, {cannot_read_serial_file, Filename, Reason}})
@@ -73,6 +74,10 @@ update_disk_serial() ->
             throw({error, {cannot_write_serial_file, Filename, Reason1}})
     end,
     Serial.
+%% [1] a couple of users have reported startup failures due to the
+%% file being empty, presumably as a result of filesystem
+%% corruption. While rabbit doesn't cope with that in general, in this
+%% specific case we can be more accommodating.
 
 %% Generate an un-hashed guid.
 fresh() ->
@@ -104,8 +109,6 @@ advance_blocks({B1, B2, B3, B4}, I) ->
     B5 = erlang:phash2({B1, I}, 4294967296),
     {{(B2 bxor B5), (B3 bxor B5), (B4 bxor B5), B5}, I+1}.
 
-blocks_to_binary({B1, B2, B3, B4}) -> <<B1:32, B2:32, B3:32, B4:32>>.
-
 %% generate a GUID. This function should be used when performance is a
 %% priority and predictability is not an issue. Otherwise use
 %% gen_secure/0.
@@ -114,14 +117,15 @@ gen() ->
     %% time we need a new guid we rotate them producing a new hash
     %% with the aid of the counter. Look at the comments in
     %% advance_blocks/2 for details.
-    {BS, I} = case get(guid) of
-                  undefined -> <<B1:32, B2:32, B3:32, B4:32>> =
-                                   erlang:md5(term_to_binary(fresh())),
-                               {{B1,B2,B3,B4}, 0};
-                  {BS0, I0} -> advance_blocks(BS0, I0)
-              end,
-    put(guid, {BS, I}),
-    blocks_to_binary(BS).
+    case get(guid) of
+        undefined -> <<B1:32, B2:32, B3:32, B4:32>> = Res =
+                         erlang:md5(term_to_binary(fresh())),
+                     put(guid, {{B1, B2, B3, B4}, 0}),
+                     Res;
+        {BS, I}   -> {{B1, B2, B3, B4}, _} = S = advance_blocks(BS, I),
+                     put(guid, S),
+                     <<B1:32, B2:32, B3:32, B4:32>>
+    end.
 
 %% generate a non-predictable GUID.
 %%
@@ -144,11 +148,7 @@ gen_secure() ->
 %% employs base64url encoding, which is safer in more contexts than
 %% plain base64.
 string(G, Prefix) ->
-    Prefix ++ "-" ++ lists:foldl(fun ($\+, Acc) -> [$\- | Acc];
-                                     ($\/, Acc) -> [$\_ | Acc];
-                                     ($\=, Acc) -> Acc;
-                                     (Chr, Acc) -> [Chr | Acc]
-                                 end, [], base64:encode_to_string(G)).
+    Prefix ++ "-" ++ rabbit_misc:base64url(G).
 
 binary(G, Prefix) ->
     list_to_binary(string(G, Prefix)).
diff --git a/src/rabbit_heartbeat.erl b/src/rabbit_heartbeat.erl
index 80b4e768..fac74edb 100644
--- a/src/rabbit_heartbeat.erl
+++ b/src/rabbit_heartbeat.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_heartbeat).
@@ -19,6 +19,8 @@
 -export([start_heartbeat_sender/3, start_heartbeat_receiver/3,
          start_heartbeat_fun/1, pause_monitor/1, resume_monitor/1]).
 
+-export([system_continue/3, system_terminate/4, system_code_change/4]).
+
 -include("rabbit.hrl").
 
 %%----------------------------------------------------------------------------
@@ -51,6 +53,10 @@
 -spec(pause_monitor/1 :: (heartbeaters()) -> 'ok').
 -spec(resume_monitor/1 :: (heartbeaters()) -> 'ok').
 
+-spec(system_code_change/4 :: (_,_,_,_) -> {'ok',_}).
+-spec(system_continue/3 :: (_,_,{_, _}) -> any()).
+-spec(system_terminate/4 :: (_,_,_,_) -> none()).
+
 -endif.
 
 %%----------------------------------------------------------------------------
@@ -59,21 +65,15 @@ start_heartbeat_sender(Sock, TimeoutSec, SendFun) ->
     %% the 'div 2' is there so that we don't end up waiting for nearly
     %% 2 * TimeoutSec before sending a heartbeat in the boundary case
     %% where the last message was sent just after a heartbeat.
-    heartbeater(
-      {Sock, TimeoutSec * 1000 div 2, send_oct, 0,
-       fun () ->
-               SendFun(),
-               continue
-       end}).
+    heartbeater({Sock, TimeoutSec * 1000 div 2, send_oct, 0,
+                 fun () -> SendFun(), continue end}).
 
 start_heartbeat_receiver(Sock, TimeoutSec, ReceiveFun) ->
     %% we check for incoming data every interval, and time out after
     %% two checks with no change. As a result we will time out between
     %% 2 and 3 intervals after the last data has been received.
-    heartbeater({Sock, TimeoutSec * 1000, recv_oct, 1, fun () ->
-                                                               ReceiveFun(),
-                                                               stop
-                                                       end}).
+    heartbeater({Sock, TimeoutSec * 1000, recv_oct, 1,
+                 fun () -> ReceiveFun(), stop end}).
 
 start_heartbeat_fun(SupPid) ->
     fun (Sock, SendTimeoutSec, SendFun, ReceiveTimeoutSec, ReceiveFun) ->
@@ -88,17 +88,20 @@ start_heartbeat_fun(SupPid) ->
             {Sender, Receiver}
     end.
 
-pause_monitor({_Sender, none}) ->
-    ok;
-pause_monitor({_Sender, Receiver}) ->
-    Receiver ! pause,
-    ok.
+pause_monitor({_Sender,     none}) -> ok;
+pause_monitor({_Sender, Receiver}) -> Receiver ! pause, ok.
+
+resume_monitor({_Sender,     none}) -> ok;
+resume_monitor({_Sender, Receiver}) -> Receiver ! resume, ok.
+
+system_continue(_Parent, Deb, {Params, State}) ->
+    heartbeater(Params, Deb, State).
+
+system_terminate(Reason, _Parent, _Deb, _State) ->
+    exit(Reason).
 
-resume_monitor({_Sender, none}) ->
-    ok;
-resume_monitor({_Sender, Receiver}) ->
-    Receiver ! resume,
-    ok.
+system_code_change(Misc, _Module, _OldVsn, _Extra) ->
+    {ok, Misc}.
 
 %%----------------------------------------------------------------------------
 start_heartbeater(0, _SupPid, _Sock, _TimeoutFun, _Name, _Callback) ->
@@ -106,24 +109,29 @@ start_heartbeater(0, _SupPid, _Sock, _TimeoutFun, _Name, _Callback) ->
 start_heartbeater(TimeoutSec, SupPid, Sock, TimeoutFun, Name, Callback) ->
     supervisor2:start_child(
       SupPid, {Name,
-               {rabbit_heartbeat, Callback,
-                [Sock, TimeoutSec, TimeoutFun]},
+               {rabbit_heartbeat, Callback, [Sock, TimeoutSec, TimeoutFun]},
                transient, ?MAX_WAIT, worker, [rabbit_heartbeat]}).
 
 heartbeater(Params) ->
-    {ok, proc_lib:spawn_link(fun () -> heartbeater(Params, {0, 0}) end)}.
+    Deb = sys:debug_options([]),
+    {ok, proc_lib:spawn_link(fun () -> heartbeater(Params, Deb, {0, 0}) end)}.
 
 heartbeater({Sock, TimeoutMillisec, StatName, Threshold, Handler} = Params,
-            {StatVal, SameCount}) ->
-    Recurse = fun (V) -> heartbeater(Params, V) end,
+            Deb, {StatVal, SameCount} = State) ->
+    Recurse = fun (State1) -> heartbeater(Params, Deb, State1) end,
+    System  = fun (From, Req) ->
+                      sys:handle_system_msg(
+                        Req, From, self(), ?MODULE, Deb, {Params, State})
+              end,
     receive
         pause ->
             receive
-                resume ->
-                    Recurse({0, 0});
-                Other ->
-                    exit({unexpected_message, Other})
+                resume              -> Recurse({0, 0});
+                {system, From, Req} -> System(From, Req);
+                Other               -> exit({unexpected_message, Other})
             end;
+        {system, From, Req} ->
+            System(From, Req);
         Other ->
             exit({unexpected_message, Other})
     after TimeoutMillisec ->
diff --git a/src/rabbit_intermediate_sup.erl b/src/rabbit_intermediate_sup.erl
new file mode 100644
index 00000000..a9381f20
--- /dev/null
+++ b/src/rabbit_intermediate_sup.erl
@@ -0,0 +1,39 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License
+%% at http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and
+%% limitations under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2013-2013 GoPivotal, Inc.  All rights reserved.
+%%
+
+-module(rabbit_intermediate_sup).
+
+-behaviour(supervisor2).
+
+-export([start_link/0]).
+
+-export([init/1]).
+
+%%----------------------------------------------------------------------------
+
+-ifdef(use_specs).
+-spec(start_link/0 :: () -> rabbit_types:ok_pid_or_error()).
+-endif.
+
+%%----------------------------------------------------------------------------
+
+start_link() ->
+    supervisor2:start_link(?MODULE, []).
+
+%%----------------------------------------------------------------------------
+
+init([]) ->
+    {ok, {{one_for_one, 10, 10}, []}}.
diff --git a/src/rabbit_limiter.erl b/src/rabbit_limiter.erl
index 2b15498e..12a13c00 100644
--- a/src/rabbit_limiter.erl
+++ b/src/rabbit_limiter.erl
@@ -10,46 +10,169 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
+%% The purpose of the limiter is to stem the flow of messages from
+%% queues to channels, in order to act upon various protocol-level
+%% flow control mechanisms, specifically AMQP 0-9-1's basic.qos
+%% prefetch_count and channel.flow, and AMQP 1.0's link (aka consumer)
+%% credit mechanism.
+%%
+%% Each channel has an associated limiter process, created with
+%% start_link/1, which it passes to queues on consumer creation with
+%% rabbit_amqqueue:basic_consume/9, and rabbit_amqqueue:basic_get/4.
+%% The latter isn't strictly necessary, since basic.get is not
+%% subject to limiting, but it means that whenever a queue knows about
+%% a channel, it also knows about its limiter, which is less fiddly.
+%%
+%% The limiter process holds state that is, in effect, shared between
+%% the channel and all queues from which the channel is
+%% consuming. Essentially all these queues are competing for access to
+%% a single, limited resource - the ability to deliver messages via
+%% the channel - and it is the job of the limiter process to mediate
+%% that access.
+%%
+%% The limiter process is separate from the channel process for two
+%% reasons: separation of concerns, and efficiency. Channels can get
+%% very busy, particularly if they are also dealing with publishes.
+%% With a separate limiter process all the aforementioned access
+%% mediation can take place without touching the channel.
+%%
+%% For efficiency, both the channel and the queues keep some local
+%% state, initialised from the limiter pid with new/1 and client/1,
+%% respectively. In particular this allows them to avoid any
+%% interaction with the limiter process when it is 'inactive', i.e. no
+%% protocol-level flow control is taking place.
+%%
+%% This optimisation does come at the cost of some complexity though:
+%% when a limiter becomes active, the channel needs to inform all its
+%% consumer queues of this change in status. It does this by invoking
+%% rabbit_amqqueue:activate_limit_all/2. Note that there is no inverse
+%% transition, i.e. once a queue has been told about an active
+%% limiter, it is not subsequently told when that limiter becomes
+%% inactive. In practice it is rare for that to happen, though we
+%% could optimise this case in the future.
+%%
+%% In addition, the consumer credit bookkeeping is local to queues, so
+%% it is not necessary to store information about it in the limiter
+%% process. But for abstraction we hide it from the queue behind the
+%% limiter API, and it therefore becomes part of the queue local
+%% state.
+%%
+%% The interactions with the limiter are as follows:
+%%
+%% 1. Channels tell the limiter about basic.qos prefetch counts -
+%%    that's what the limit_prefetch/3, unlimit_prefetch/1,
+%%    is_prefetch_limited/1, get_prefetch_limit/1 API functions are
+%%    about - and channel.flow blocking - that's what block/1,
+%%    unblock/1 and is_blocked/1 are for. They also tell the limiter
+%%    queue state (via the queue) about consumer credit changes -
+%%    that's what credit/4 is for.
+%%
+%% 2. Queues also tell the limiter queue state about the queue
+%%    becoming empty (via drained/1) and consumers leaving (via
+%%    forget_consumer/2).
+%%
+%% 3. Queues register with the limiter - this happens as part of
+%%    activate/1.
+%%
+%% 4. The limiter process maintains an internal counter of 'messages
+%%    sent but not yet acknowledged', called the 'volume'.
+%%
+%% 5. Queues ask the limiter for permission (with can_send/3) whenever
+%%    they want to deliver a message to a channel. The limiter checks
+%%    whether a) the channel isn't blocked by channel.flow, b) the
+%%    volume has not yet reached the prefetch limit, and c)
+%%    the consumer has enough credit. If so it increments the volume
+%%    and tells the queue to proceed. Otherwise it marks the queue as
+%%    requiring notification (see below) and tells the queue not to
+%%    proceed.
+%%
+%% 6. A queue that has been told to proceed (by the return value of
+%%    can_send/3) sends the message to the channel. Conversely, a
+%%    queue that has been told not to proceed, will not attempt to
+%%    deliver that message, or any future messages, to the
+%%    channel. This is accomplished by can_send/3 capturing the
+%%    outcome in the local state, where it can be accessed with
+%%    is_suspended/1.
+%%
+%% 7. When a channel receives an ack it tells the limiter (via ack/2)
+%%    how many messages were ack'ed. The limiter process decrements
+%%    the volume and if it falls below the prefetch_count then it
+%%    notifies (through rabbit_amqqueue:resume/2) all the queues
+%%    requiring notification, i.e. all those that had a can_send/3
+%%    request denied.
+%%
+%% 8. Upon receipt of such a notification, queues resume delivery to
+%%    the channel, i.e. they will once again start asking the limiter, as
+%%    described in (5).
+%%
+%% 9. When a queue has no more consumers associated with a particular
+%%    channel, it deactivates use of the limiter with deactivate/1,
+%%    which alters the local state such that no further interactions
+%%    with the limiter process take place until a subsequent
+%%    activate/1.
+
 -module(rabbit_limiter).
 
 -behaviour(gen_server2).
 
+-export([start_link/0]).
+%% channel API
+-export([new/1, limit_prefetch/3, unlimit_prefetch/1, block/1, unblock/1,
+         is_prefetch_limited/1, is_blocked/1, is_active/1,
+         get_prefetch_limit/1, ack/2, pid/1]).
+%% queue API
+-export([client/1, activate/1, can_send/3, resume/1, deactivate/1,
+         is_suspended/1, is_consumer_blocked/2, credit/4, drained/1,
+         forget_consumer/2]).
+%% callbacks
 -export([init/1, terminate/2, code_change/3, handle_call/3, handle_cast/2,
-         handle_info/2, prioritise_call/3]).
--export([start_link/0, make_token/0, make_token/1, is_enabled/1, enable/2,
-         disable/1]).
--export([limit/2, can_send/3, ack/2, register/2, unregister/2]).
--export([get_limit/1, block/1, unblock/1, is_blocked/1]).
+         handle_info/2, prioritise_call/4]).
 
 %%----------------------------------------------------------------------------
 
--record(token, {pid, enabled}).
+-record(lstate, {pid, prefetch_limited, blocked}).
+-record(qstate, {pid, state, credits}).
 
 -ifdef(use_specs).
 
--export_type([token/0]).
-
--opaque(token() :: #token{}).
+-type(lstate() :: #lstate{pid              :: pid(),
+                          prefetch_limited :: boolean(),
+                          blocked          :: boolean()}).
+-type(qstate() :: #qstate{pid :: pid(),
+                          state :: 'dormant' | 'active' | 'suspended'}).
 
 -spec(start_link/0 :: () -> rabbit_types:ok_pid_or_error()).
--spec(make_token/0 :: () -> token()).
--spec(make_token/1 :: ('undefined' | pid()) -> token()).
--spec(is_enabled/1 :: (token()) -> boolean()).
--spec(enable/2 :: (token(), non_neg_integer()) -> token()).
--spec(disable/1 :: (token()) -> token()).
--spec(limit/2 :: (token(), non_neg_integer()) -> 'ok' | {'disabled', token()}).
--spec(can_send/3 :: (token(), pid(), boolean()) -> boolean()).
--spec(ack/2 :: (token(), non_neg_integer()) -> 'ok').
--spec(register/2 :: (token(), pid()) -> 'ok').
--spec(unregister/2 :: (token(), pid()) -> 'ok').
--spec(get_limit/1 :: (token()) -> non_neg_integer()).
--spec(block/1 :: (token()) -> 'ok').
--spec(unblock/1 :: (token()) -> 'ok' | {'disabled', token()}).
--spec(is_blocked/1 :: (token()) -> boolean()).
+-spec(new/1 :: (pid()) -> lstate()).
+
+-spec(limit_prefetch/3      :: (lstate(), non_neg_integer(), non_neg_integer())
+                               -> lstate()).
+-spec(unlimit_prefetch/1    :: (lstate()) -> lstate()).
+-spec(block/1               :: (lstate()) -> lstate()).
+-spec(unblock/1             :: (lstate()) -> lstate()).
+-spec(is_prefetch_limited/1 :: (lstate()) -> boolean()).
+-spec(is_blocked/1          :: (lstate()) -> boolean()).
+-spec(is_active/1           :: (lstate()) -> boolean()).
+-spec(get_prefetch_limit/1  :: (lstate()) -> non_neg_integer()).
+-spec(ack/2                 :: (lstate(), non_neg_integer()) -> 'ok').
+-spec(pid/1                 :: (lstate()) -> pid()).
+
+-spec(client/1       :: (pid()) -> qstate()).
+-spec(activate/1     :: (qstate()) -> qstate()).
+-spec(can_send/3     :: (qstate(), boolean(), rabbit_types:ctag()) ->
+                             {'continue' | 'suspend', qstate()}).
+-spec(resume/1       :: (qstate()) -> qstate()).
+-spec(deactivate/1   :: (qstate()) -> qstate()).
+-spec(is_suspended/1 :: (qstate()) -> boolean()).
+-spec(is_consumer_blocked/2 :: (qstate(), rabbit_types:ctag()) -> boolean()).
+-spec(credit/4 :: (qstate(), rabbit_types:ctag(), non_neg_integer(), boolean())
+                  -> qstate()).
+-spec(drained/1 :: (qstate())
+                   -> {[{rabbit_types:ctag(), non_neg_integer()}], qstate()}).
+-spec(forget_consumer/2 :: (qstate(), rabbit_types:ctag()) -> qstate()).
 
 -endif.
 
@@ -64,120 +187,183 @@
 %% notified of a change in the limit or volume that may allow it to
 %% deliver more messages via the limiter's channel.
 
+-record(credit, {credit = 0, drain = false}).
+
 %%----------------------------------------------------------------------------
 %% API
 %%----------------------------------------------------------------------------
 
 start_link() -> gen_server2:start_link(?MODULE, [], []).
 
-make_token() -> make_token(undefined).
-make_token(Pid) -> #token{pid = Pid, enabled = false}.
+new(Pid) ->
+    %% this is a 'call' to ensure that it is invoked at most once.
+    ok = gen_server:call(Pid, {new, self()}),
+    #lstate{pid = Pid, prefetch_limited = false, blocked = false}.
 
-is_enabled(#token{enabled = Enabled}) -> Enabled.
+limit_prefetch(L, PrefetchCount, UnackedCount) when PrefetchCount > 0 ->
+    ok = gen_server:call(L#lstate.pid,
+                         {limit_prefetch, PrefetchCount, UnackedCount}),
+    L#lstate{prefetch_limited = true}.
 
-enable(#token{pid = Pid} = Token, Volume) ->
-    gen_server2:call(Pid, {enable, Token, self(), Volume}, infinity).
+unlimit_prefetch(L) ->
+    ok = gen_server:call(L#lstate.pid, unlimit_prefetch),
+    L#lstate{prefetch_limited = false}.
 
-disable(#token{pid = Pid} = Token) ->
-    gen_server2:call(Pid, {disable, Token}, infinity).
+block(L) ->
+    ok = gen_server:call(L#lstate.pid, block),
+    L#lstate{blocked = true}.
 
-limit(Limiter, PrefetchCount) ->
-    maybe_call(Limiter, {limit, PrefetchCount, Limiter}, ok).
+unblock(L) ->
+    ok = gen_server:call(L#lstate.pid, unblock),
+    L#lstate{blocked = false}.
 
-%% Ask the limiter whether the queue can deliver a message without
-%% breaching a limit. Note that we don't use maybe_call here in order
-%% to avoid always going through with_exit_handler/2, even when the
-%% limiter is disabled.
-can_send(#token{pid = Pid, enabled = true}, QPid, AckRequired) ->
-    rabbit_misc:with_exit_handler(
-      fun () -> true end,
-      fun () ->
-              gen_server2:call(Pid, {can_send, QPid, AckRequired}, infinity)
-      end);
-can_send(_, _, _) ->
-    true.
+is_prefetch_limited(#lstate{prefetch_limited = Limited}) -> Limited.
+
+is_blocked(#lstate{blocked = Blocked}) -> Blocked.
 
-%% Let the limiter know that the channel has received some acks from a
-%% consumer
-ack(Limiter, Count) -> maybe_cast(Limiter, {ack, Count}).
+is_active(L) -> is_prefetch_limited(L) orelse is_blocked(L).
 
-register(Limiter, QPid) -> maybe_cast(Limiter, {register, QPid}).
+get_prefetch_limit(#lstate{prefetch_limited = false}) -> 0;
+get_prefetch_limit(L) -> gen_server:call(L#lstate.pid, get_prefetch_limit).
 
-unregister(Limiter, QPid) -> maybe_cast(Limiter, {unregister, QPid}).
+ack(#lstate{prefetch_limited = false}, _AckCount) -> ok;
+ack(L, AckCount) -> gen_server:cast(L#lstate.pid, {ack, AckCount}).
 
-get_limit(Limiter) ->
+pid(#lstate{pid = Pid}) -> Pid.
+
+client(Pid) -> #qstate{pid = Pid, state = dormant, credits = gb_trees:empty()}.
+
+activate(L = #qstate{state = dormant}) ->
+    ok = gen_server:cast(L#qstate.pid, {register, self()}),
+    L#qstate{state = active};
+activate(L) -> L.
+
+can_send(L = #qstate{pid = Pid, state = State, credits = Credits},
+         AckRequired, CTag) ->
+    case is_consumer_blocked(L, CTag) of
+        false -> case (State =/= active orelse
+                       safe_call(Pid, {can_send, self(), AckRequired}, true)) of
+                     true  -> {continue, L#qstate{
+                                credits = record_send_q(CTag, Credits)}};
+                     false -> {suspend, L#qstate{state = suspended}}
+                 end;
+        true  -> {suspend, L}
+    end.
+
+safe_call(Pid, Msg, ExitValue) ->
     rabbit_misc:with_exit_handler(
-      fun () -> 0 end,
-      fun () -> maybe_call(Limiter, get_limit, 0) end).
+      fun () -> ExitValue end,
+      fun () -> gen_server2:call(Pid, Msg, infinity) end).
+
+resume(L = #qstate{state = suspended}) ->
+    L#qstate{state = active};
+resume(L) -> L.
+
+deactivate(L = #qstate{state = dormant}) -> L;
+deactivate(L) ->
+    ok = gen_server:cast(L#qstate.pid, {unregister, self()}),
+    L#qstate{state = dormant}.
+
+is_suspended(#qstate{state = suspended}) -> true;
+is_suspended(#qstate{})                  -> false.
+
+is_consumer_blocked(#qstate{credits = Credits}, CTag) ->
+    case gb_trees:lookup(CTag, Credits) of
+        {value, #credit{credit = C}} when C > 0 -> false;
+        {value, #credit{}}                      -> true;
+        none                                    -> false
+    end.
 
-block(Limiter) ->
-    maybe_call(Limiter, block, ok).
+credit(Limiter = #qstate{credits = Credits}, CTag, Credit, Drain) ->
+    Limiter#qstate{credits = update_credit(CTag, Credit, Drain, Credits)}.
 
-unblock(Limiter) ->
-    maybe_call(Limiter, {unblock, Limiter}, ok).
+drained(Limiter = #qstate{credits = Credits}) ->
+    {CTagCredits, Credits2} =
+        rabbit_misc:gb_trees_fold(
+          fun (CTag,  #credit{credit = C,  drain = true},  {Acc, Creds0}) ->
+                  {[{CTag, C} | Acc], update_credit(CTag, 0, false, Creds0)};
+              (_CTag, #credit{credit = _C, drain = false}, {Acc, Creds0}) ->
+                  {Acc, Creds0}
+          end, {[], Credits}, Credits),
+    {CTagCredits, Limiter#qstate{credits = Credits2}}.
 
-is_blocked(Limiter) ->
-    maybe_call(Limiter, is_blocked, false).
+forget_consumer(Limiter = #qstate{credits = Credits}, CTag) ->
+    Limiter#qstate{credits = gb_trees:delete_any(CTag, Credits)}.
+
+%%----------------------------------------------------------------------------
+%% Queue-local code
+%%----------------------------------------------------------------------------
+
+%% We want to do all the AMQP 1.0-ish link level credit calculations
+%% in the queue (to do them elsewhere introduces a ton of
+%% races). However, it's a big chunk of code that is conceptually very
+%% linked to the limiter concept. So we get the queue to hold a bit of
+%% state for us (#qstate.credits), and maintain a fiction that the
+%% limiter is making the decisions...
+
+record_send_q(CTag, Credits) ->
+    case gb_trees:lookup(CTag, Credits) of
+        {value, #credit{credit = Credit, drain = Drain}} ->
+            update_credit(CTag, Credit - 1, Drain, Credits);
+        none ->
+            Credits
+    end.
+
+update_credit(CTag, Credit, Drain, Credits) ->
+    %% Using up all credit implies no need to send a 'drained' event
+    Drain1 = Drain andalso Credit > 0,
+    gb_trees:enter(CTag, #credit{credit = Credit, drain = Drain1}, Credits).
 
 %%----------------------------------------------------------------------------
 %% gen_server callbacks
 %%----------------------------------------------------------------------------
 
-init([]) ->
-    {ok, #lim{}}.
+init([]) -> {ok, #lim{}}.
+
+prioritise_call(get_prefetch_limit, _From, _Len, _State) -> 9;
+prioritise_call(_Msg,               _From, _Len, _State) -> 0.
+
+handle_call({new, ChPid}, _From, State = #lim{ch_pid = undefined}) ->
+    {reply, ok, State#lim{ch_pid = ChPid}};
 
-prioritise_call(get_limit, _From, _State) -> 9;
-prioritise_call(_Msg,      _From, _State) -> 0.
+handle_call({limit_prefetch, PrefetchCount, UnackedCount}, _From,
+            State = #lim{prefetch_count = 0}) ->
+    {reply, ok, maybe_notify(State, State#lim{prefetch_count = PrefetchCount,
+                                              volume         = UnackedCount})};
+handle_call({limit_prefetch, PrefetchCount, _UnackedCount}, _From, State) ->
+    {reply, ok, maybe_notify(State, State#lim{prefetch_count = PrefetchCount})};
+
+handle_call(unlimit_prefetch, _From, State) ->
+    {reply, ok, maybe_notify(State, State#lim{prefetch_count = 0,
+                                              volume         = 0})};
+
+handle_call(block, _From, State) ->
+    {reply, ok, State#lim{blocked = true}};
+
+handle_call(unblock, _From, State) ->
+    {reply, ok, maybe_notify(State, State#lim{blocked = false})};
+
+handle_call(get_prefetch_limit, _From,
+            State = #lim{prefetch_count = PrefetchCount}) ->
+    {reply, PrefetchCount, State};
 
 handle_call({can_send, QPid, _AckRequired}, _From,
             State = #lim{blocked = true}) ->
     {reply, false, limit_queue(QPid, State)};
 handle_call({can_send, QPid, AckRequired}, _From,
             State = #lim{volume = Volume}) ->
-    case limit_reached(State) of
+    case prefetch_limit_reached(State) of
         true  -> {reply, false, limit_queue(QPid, State)};
         false -> {reply, true,  State#lim{volume = if AckRequired -> Volume + 1;
                                                       true        -> Volume
                                                    end}}
-    end;
-
-handle_call(get_limit, _From, State = #lim{prefetch_count = PrefetchCount}) ->
-    {reply, PrefetchCount, State};
-
-handle_call({limit, PrefetchCount, Token}, _From, State) ->
-    case maybe_notify(State, State#lim{prefetch_count = PrefetchCount}) of
-        {cont, State1} ->
-            {reply, ok, State1};
-        {stop, State1} ->
-            {reply, {disabled, Token#token{enabled = false}}, State1}
-    end;
-
-handle_call(block, _From, State) ->
-    {reply, ok, State#lim{blocked = true}};
-
-handle_call({unblock, Token}, _From, State) ->
-    case maybe_notify(State, State#lim{blocked = false}) of
-        {cont, State1} ->
-            {reply, ok, State1};
-        {stop, State1} ->
-            {reply, {disabled, Token#token{enabled = false}}, State1}
-    end;
-
-handle_call(is_blocked, _From, State) ->
-    {reply, blocked(State), State};
-
-handle_call({enable, Token, Channel, Volume}, _From, State) ->
-    {reply, Token#token{enabled = true},
-     State#lim{ch_pid = Channel, volume = Volume}};
-handle_call({disable, Token}, _From, State) ->
-    {reply, Token#token{enabled = false}, State}.
+    end.
 
 handle_cast({ack, Count}, State = #lim{volume = Volume}) ->
     NewVolume = if Volume == 0 -> 0;
                    true        -> Volume - Count
                 end,
-    {cont, State1} = maybe_notify(State, State#lim{volume = NewVolume}),
-    {noreply, State1};
+    {noreply, maybe_notify(State, State#lim{volume = NewVolume})};
 
 handle_cast({register, QPid}, State) ->
     {noreply, remember_queue(QPid, State)};
@@ -199,27 +385,13 @@ code_change(_, State, _) ->
 %%----------------------------------------------------------------------------
 
 maybe_notify(OldState, NewState) ->
-    case (limit_reached(OldState) orelse blocked(OldState)) andalso
-        not (limit_reached(NewState) orelse blocked(NewState)) of
-        true  -> NewState1 = notify_queues(NewState),
-                 {case NewState1#lim.prefetch_count of
-                      0 -> stop;
-                      _ -> cont
-                  end, NewState1};
-        false -> {cont, NewState}
+    case (prefetch_limit_reached(OldState) orelse blocked(OldState)) andalso
+        not (prefetch_limit_reached(NewState) orelse blocked(NewState)) of
+        true  -> notify_queues(NewState);
+        false -> NewState
     end.
 
-maybe_call(#token{pid = Pid, enabled = true}, Call, _Default) ->
-    gen_server2:call(Pid, Call, infinity);
-maybe_call(_, _Call, Default) ->
-    Default.
-
-maybe_cast(#token{pid = Pid, enabled = true}, Cast) ->
-    gen_server2:cast(Pid, Cast);
-maybe_cast(_, _Call) ->
-    ok.
-
-limit_reached(#lim{prefetch_count = Limit, volume = Volume}) ->
+prefetch_limit_reached(#lim{prefetch_count = Limit, volume = Volume}) ->
     Limit =/= 0 andalso Volume >= Limit.
 
 blocked(#lim{blocked = Blocked}) -> Blocked.
@@ -231,10 +403,9 @@ remember_queue(QPid, State = #lim{queues = Queues}) ->
         true  -> State
     end.
 
-forget_queue(QPid, State = #lim{ch_pid = ChPid, queues = Queues}) ->
+forget_queue(QPid, State = #lim{queues = Queues}) ->
     case orddict:find(QPid, Queues) of
         {ok, {MRef, _}} -> true = erlang:demonitor(MRef),
-                           ok = rabbit_amqqueue:unblock(QPid, ChPid),
                            State#lim{queues = orddict:erase(QPid, Queues)};
         error           -> State
     end.
@@ -251,13 +422,13 @@ notify_queues(State = #lim{ch_pid = ChPid, queues = Queues}) ->
                      end, {[], Queues}, Queues),
     case length(QList) of
         0 -> ok;
-        1 -> ok = rabbit_amqqueue:unblock(hd(QList), ChPid); %% common case
+        1 -> ok = rabbit_amqqueue:resume(hd(QList), ChPid); %% common case
         L ->
             %% We randomly vary the position of queues in the list,
             %% thus ensuring that each queue has an equal chance of
             %% being notified first.
             {L1, L2} = lists:split(random:uniform(L), QList),
-            [[ok = rabbit_amqqueue:unblock(Q, ChPid) || Q <- L3]
+            [[ok = rabbit_amqqueue:resume(Q, ChPid) || Q <- L3]
              || L3 <- [L2, L1]],
             ok
     end,
diff --git a/src/rabbit_log.erl b/src/rabbit_log.erl
index a6b4eeb0..2e3a1bbb 100644
--- a/src/rabbit_log.erl
+++ b/src/rabbit_log.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_log).
@@ -40,18 +40,20 @@
 
 -spec(log/3 :: (category(), level(), string()) -> 'ok').
 -spec(log/4 :: (category(), level(), string(), [any()]) -> 'ok').
--spec(info/1 :: (string()) -> 'ok').
--spec(info/2 :: (string(), [any()]) -> 'ok').
+
+-spec(info/1    :: (string()) -> 'ok').
+-spec(info/2    :: (string(), [any()]) -> 'ok').
 -spec(warning/1 :: (string()) -> 'ok').
 -spec(warning/2 :: (string(), [any()]) -> 'ok').
--spec(error/1 :: (string()) -> 'ok').
--spec(error/2 :: (string(), [any()]) -> 'ok').
+-spec(error/1   :: (string()) -> 'ok').
+-spec(error/2   :: (string(), [any()]) -> 'ok').
 
 -endif.
 
 %%----------------------------------------------------------------------------
 start_link() ->
     gen_server:start_link({local, ?SERVER}, ?MODULE, [], []).
+
 log(Category, Level, Fmt) -> log(Category, Level, Fmt, []).
 
 log(Category, Level, Fmt, Args) when is_list(Args) ->
diff --git a/src/rabbit_memory_monitor.erl b/src/rabbit_memory_monitor.erl
index f22ad874..b8d8023e 100644
--- a/src/rabbit_memory_monitor.erl
+++ b/src/rabbit_memory_monitor.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 
@@ -43,17 +43,6 @@
 -define(DEFAULT_UPDATE_INTERVAL, 2500).
 -define(TABLE_NAME, ?MODULE).
 
-%% Because we have a feedback loop here, we need to ensure that we
-%% have some space for when the queues don't quite respond as fast as
-%% we would like, or when there is buffering going on in other parts
-%% of the system. In short, we aim to stay some distance away from
-%% when the memory alarms will go off, which cause backpressure (of
-%% some sort) on producers. Note that all other Thresholds are
-%% relative to this scaling.
--define(MEMORY_LIMIT_SCALING, 0.4).
-
--define(LIMIT_THRESHOLD, 0.5). %% don't limit queues when mem use is < this
-
 %% If all queues are pushed to disk (duration 0), then the sum of
 %% their reported lengths will be 0. If memory then becomes available,
 %% unless we manually intervene, the sum will remain 0, and the queues
@@ -207,7 +196,9 @@ internal_update(State = #state { queue_durations = Durations,
                                  desired_duration = DesiredDurationAvg,
                                  queue_duration_sum = Sum,
                                  queue_duration_count = Count }) ->
-    MemoryLimit = ?MEMORY_LIMIT_SCALING * vm_memory_monitor:get_memory_limit(),
+    {ok, LimitThreshold} =
+        application:get_env(rabbit, vm_memory_high_watermark_paging_ratio),
+    MemoryLimit = vm_memory_monitor:get_memory_limit(),
     MemoryRatio = case MemoryLimit > 0.0 of
                       true  -> erlang:memory(total) / MemoryLimit;
                       false -> infinity
@@ -215,7 +206,7 @@ internal_update(State = #state { queue_durations = Durations,
     DesiredDurationAvg1 =
         if MemoryRatio =:= infinity ->
                 0.0;
-           MemoryRatio < ?LIMIT_THRESHOLD orelse Count == 0 ->
+           MemoryRatio < LimitThreshold orelse Count == 0 ->
                 infinity;
            MemoryRatio < ?SUM_INC_THRESHOLD ->
                 ((Sum + ?SUM_INC_AMOUNT) / Count) / MemoryRatio;
diff --git a/src/rabbit_mirror_queue_coordinator.erl b/src/rabbit_mirror_queue_coordinator.erl
index 17e2ffb4..f54e9bd1 100644
--- a/src/rabbit_mirror_queue_coordinator.erl
+++ b/src/rabbit_mirror_queue_coordinator.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2010-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2010-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_mirror_queue_coordinator).
@@ -33,16 +33,14 @@
                  gm,
                  monitors,
                  death_fun,
-                 length_fun
+                 depth_fun
                }).
 
--define(ONE_SECOND, 1000).
-
 -ifdef(use_specs).
 
 -spec(start_link/4 :: (rabbit_types:amqqueue(), pid() | 'undefined',
                        rabbit_mirror_queue_master:death_fun(),
-                       rabbit_mirror_queue_master:length_fun()) ->
+                       rabbit_mirror_queue_master:depth_fun()) ->
                            rabbit_types:ok_pid_or_error()).
 -spec(get_gm/1 :: (pid()) -> pid()).
 -spec(ensure_monitoring/2 :: (pid(), [pid()]) -> 'ok').
@@ -103,19 +101,25 @@
 %% channel during a publish, only some of the mirrors may receive that
 %% publish. As a result of this problem, the messages broadcast over
 %% the gm contain published content, and thus slaves can operate
-%% successfully on messages that they only receive via the gm. The key
-%% purpose of also sending messages directly from the channels to the
-%% slaves is that without this, in the event of the death of the
-%% master, messages could be lost until a suitable slave is promoted.
+%% successfully on messages that they only receive via the gm.
+%%
+%% The key purpose of also sending messages directly from the channels
+%% to the slaves is that without this, in the event of the death of
+%% the master, messages could be lost until a suitable slave is
+%% promoted. However, that is not the only reason. A slave cannot send
+%% confirms for a message until it has seen it from the
+%% channel. Otherwise, it might send a confirm to a channel for a
+%% message that it might *never* receive from that channel. This can
+%% happen because new slaves join the gm ring (and thus receive
+%% messages from the master) before inserting themselves in the
+%% queue's mnesia record (which is what channels look at for routing).
+%% As it turns out, channels will simply ignore such bogus confirms,
+%% but relying on that would introduce a dangerously tight coupling.
 %%
-%% However, that is not the only reason. For example, if confirms are
-%% in use, then there is no guarantee that every slave will see the
-%% delivery with the same msg_seq_no. As a result, the slaves have to
-%% wait until they've seen both the publish via gm, and the publish
-%% via the channel before they have enough information to be able to
-%% perform the publish to their own bq, and subsequently issue the
-%% confirm, if necessary. Either form of publish can arrive first, and
-%% a slave can be upgraded to the master at any point during this
+%% Hence the slaves have to wait until they've seen both the publish
+%% via gm, and the publish via the channel before they issue the
+%% confirm. Either form of publish can arrive first, and a slave can
+%% be upgraded to the master at any point during this
 %% process. Confirms continue to be issued correctly, however.
 %%
 %% Because the slave is a full process, it impersonates parts of the
@@ -134,25 +138,31 @@
 %% gm should be processed as normal, but fetches which are for
 %% messages the slave has never seen should be ignored. Similarly,
 %% acks for messages the slave never fetched should be
-%% ignored. Eventually, as the master is consumed from, the messages
-%% at the head of the queue which were there before the slave joined
-%% will disappear, and the slave will become fully synced with the
-%% state of the master. The detection of the sync-status of a slave is
-%% done entirely based on length: if the slave and the master both
-%% agree on the length of the queue after the fetch of the head of the
-%% queue (or a 'set_length' results in a slave having to drop some
-%% messages from the head of its queue), then the queues must be in
-%% sync. The only other possibility is that the slave's queue is
-%% shorter, and thus the fetch should be ignored. In case slaves are
-%% joined to an empty queue which only goes on to receive publishes,
-%% they start by asking the master to broadcast its length. This is
-%% enough for slaves to always be able to work out when their head
-%% does not differ from the master (and is much simpler and cheaper
-%% than getting the master to hang on to the guid of the msg at the
-%% head of its queue). When a slave is promoted to a master, it
-%% unilaterally broadcasts its length, in order to solve the problem
-%% of length requests from new slaves being unanswered by a dead
-%% master.
+%% ignored. Similarly, we don't republish rejected messages that we
+%% haven't seen. Eventually, as the master is consumed from, the
+%% messages at the head of the queue which were there before the slave
+%% joined will disappear, and the slave will become fully synced with
+%% the state of the master.
+%%
+%% The detection of the sync-status is based on the depth of the BQs,
+%% where the depth is defined as the sum of the length of the BQ (as
+%% per BQ:len) and the number of messages pending acknowledgement. When the
+%% depth of the slave is equal to the master's, then the slave is
+%% synchronised. We only store the difference between the two for
+%% simplicity. Comparing the length is not enough since we need to
+%% take into account rejected messages which will make it back into
+%% the master queue but can't go back in the slave, since we don't
+%% want "holes" in the slave queue. Note that the depth, and the
+%% length likewise, must always be shorter on the slave - we assert
+%% that in various places. In case slaves are joined to an empty queue
+%% which only goes on to receive publishes, they start by asking the
+%% master to broadcast its depth. This is enough for slaves to always
+%% be able to work out when their head does not differ from the master
+%% (and is much simpler and cheaper than getting the master to hang on
+%% to the guid of the msg at the head of its queue). When a slave is
+%% promoted to a master, it unilaterally broadcasts its depth, in
+%% order to solve the problem of depth requests from new slaves being
+%% unanswered by a dead master.
 %%
 %% Obviously, due to the async nature of communication across gm, the
 %% slaves can fall behind. This does not matter from a sync pov: if
@@ -212,20 +222,19 @@
 %% sender_death message to all the slaves, saying the sender has
 %% died. Once the slaves receive the sender_death message, they know
 %% that they're not going to receive any more instructions from the gm
-%% regarding that sender, thus they throw away any publications from
-%% the sender pending publication instructions. However, it is
-%% possible that the coordinator receives the DOWN and communicates
-%% that to the master before the master has finished receiving and
-%% processing publishes from the sender. This turns out not to be a
-%% problem: the sender has actually died, and so will not need to
-%% receive confirms or other feedback, and should further messages be
-%% "received" from the sender, the master will ask the coordinator to
-%% set up a new monitor, and will continue to process the messages
-%% normally. Slaves may thus receive publishes via gm from previously
-%% declared "dead" senders, but again, this is fine: should the slave
-%% have just thrown out the message it had received directly from the
-%% sender (due to receiving a sender_death message via gm), it will be
-%% able to cope with the publication purely from the master via gm.
+%% regarding that sender. However, it is possible that the coordinator
+%% receives the DOWN and communicates that to the master before the
+%% master has finished receiving and processing publishes from the
+%% sender. This turns out not to be a problem: the sender has actually
+%% died, and so will not need to receive confirms or other feedback,
+%% and should further messages be "received" from the sender, the
+%% master will ask the coordinator to set up a new monitor, and
+%% will continue to process the messages normally. Slaves may thus
+%% receive publishes via gm from previously declared "dead" senders,
+%% but again, this is fine: should the slave have just thrown out the
+%% message it had received directly from the sender (due to receiving
+%% a sender_death message via gm), it will be able to cope with the
+%% publication purely from the master via gm.
 %%
 %% When a slave receives a DOWN message for a sender, if it has not
 %% received the sender_death message from the master via gm already,
@@ -293,15 +302,15 @@
 %% if they have no mirrored content at all. This is not surprising: to
 %% achieve anything more sophisticated would require the master and
 %% recovering slave to be able to check to see whether they agree on
-%% the last seen state of the queue: checking length alone is not
+%% the last seen state of the queue: checking depth alone is not
 %% sufficient in this case.
 %%
 %% For more documentation see the comments in bug 23554.
 %%
 %%----------------------------------------------------------------------------
 
-start_link(Queue, GM, DeathFun, LengthFun) ->
-    gen_server2:start_link(?MODULE, [Queue, GM, DeathFun, LengthFun], []).
+start_link(Queue, GM, DeathFun, DepthFun) ->
+    gen_server2:start_link(?MODULE, [Queue, GM, DeathFun, DepthFun], []).
 
 get_gm(CPid) ->
     gen_server2:call(CPid, get_gm, infinity).
@@ -313,10 +322,12 @@ ensure_monitoring(CPid, Pids) ->
 %% gen_server
 %% ---------------------------------------------------------------------------
 
-init([#amqqueue { name = QueueName } = Q, GM, DeathFun, LengthFun]) ->
+init([#amqqueue { name = QueueName } = Q, GM, DeathFun, DepthFun]) ->
     GM1 = case GM of
               undefined ->
-                  {ok, GM2} = gm:start_link(QueueName, ?MODULE, [self()]),
+                  {ok, GM2} = gm:start_link(
+                                QueueName, ?MODULE, [self()],
+                                fun rabbit_misc:execute_mnesia_transaction/1),
                   receive {joined, GM2, _Members} ->
                           ok
                   end,
@@ -325,12 +336,11 @@ init([#amqqueue { name = QueueName } = Q, GM, DeathFun, LengthFun]) ->
                   true = link(GM),
                   GM
           end,
-    ensure_gm_heartbeat(),
     {ok, #state { q          = Q,
                   gm         = GM1,
                   monitors   = pmon:new(),
                   death_fun  = DeathFun,
-                  length_fun = LengthFun },
+                  depth_fun  = DepthFun },
      hibernate,
      {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN, ?DESIRED_HIBERNATE}}.
 
@@ -340,7 +350,7 @@ handle_call(get_gm, _From, State = #state { gm = GM }) ->
 handle_cast({gm_deaths, Deaths},
             State = #state { q  = #amqqueue { name = QueueName, pid = MPid } })
   when node(MPid) =:= node() ->
-    case rabbit_mirror_queue_misc:remove_from_queue(QueueName, Deaths) of
+    case rabbit_mirror_queue_misc:remove_from_queue(QueueName, MPid, Deaths) of
         {ok, MPid, DeadPids} ->
             rabbit_mirror_queue_misc:report_deaths(MPid, true, QueueName,
                                                    DeadPids),
@@ -349,17 +359,15 @@ handle_cast({gm_deaths, Deaths},
             {stop, normal, State}
     end;
 
-handle_cast(request_length, State = #state { length_fun = LengthFun }) ->
-    ok = LengthFun(),
+handle_cast(request_depth, State = #state { depth_fun = DepthFun }) ->
+    ok = DepthFun(),
     noreply(State);
 
 handle_cast({ensure_monitoring, Pids}, State = #state { monitors = Mons }) ->
-    noreply(State #state { monitors = pmon:monitor_all(Pids, Mons) }).
+    noreply(State #state { monitors = pmon:monitor_all(Pids, Mons) });
 
-handle_info(send_gm_heartbeat, State = #state { gm = GM }) ->
-    gm:broadcast(GM, heartbeat),
-    ensure_gm_heartbeat(),
-    noreply(State);
+handle_cast({delete_and_terminate, Reason}, State) ->
+    {stop, Reason, State}.
 
 handle_info({'DOWN', _MonitorRef, process, Pid, _Reason},
             State = #state { monitors  = Mons,
@@ -396,12 +404,13 @@ members_changed([_CPid], _Births, []) ->
 members_changed([CPid], _Births, Deaths) ->
     ok = gen_server2:cast(CPid, {gm_deaths, Deaths}).
 
-handle_msg([_CPid], _From, heartbeat) ->
-    ok;
-handle_msg([CPid], _From, request_length = Msg) ->
+handle_msg([CPid], _From, request_depth = Msg) ->
     ok = gen_server2:cast(CPid, Msg);
 handle_msg([CPid], _From, {ensure_monitoring, _Pids} = Msg) ->
     ok = gen_server2:cast(CPid, Msg);
+handle_msg([CPid], _From, {delete_and_terminate, _Reason} = Msg) ->
+    ok = gen_server2:cast(CPid, Msg),
+    {stop, {shutdown, ring_shutdown}};
 handle_msg([_CPid], _From, _Msg) ->
     ok.
 
@@ -414,6 +423,3 @@ noreply(State) ->
 
 reply(Reply, State) ->
     {reply, Reply, State, hibernate}.
-
-ensure_gm_heartbeat() ->
-    erlang:send_after(?ONE_SECOND, self(), send_gm_heartbeat).
diff --git a/src/rabbit_mirror_queue_master.erl b/src/rabbit_mirror_queue_master.erl
index 4e71cc43..3abd81f5 100644
--- a/src/rabbit_mirror_queue_master.erl
+++ b/src/rabbit_mirror_queue_master.erl
@@ -10,59 +10,67 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2010-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2010-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_mirror_queue_master).
 
 -export([init/3, terminate/2, delete_and_terminate/2,
-         purge/1, publish/4, publish_delivered/5, fetch/2, ack/2,
-         requeue/2, len/1, is_empty/1, drain_confirmed/1, dropwhile/3,
-         set_ram_duration_target/2, ram_duration/1,
+         purge/1, purge_acks/1, publish/5, publish_delivered/4,
+         discard/3, fetch/2, drop/2, ack/2, requeue/2, ackfold/4, fold/3,
+         len/1, is_empty/1, depth/1, drain_confirmed/1,
+         dropwhile/2, fetchwhile/4, set_ram_duration_target/2, ram_duration/1,
          needs_timeout/1, timeout/1, handle_pre_hibernate/1,
-         status/1, invoke/3, is_duplicate/2, discard/3, fold/3]).
+         status/1, invoke/3, is_duplicate/2]).
 
 -export([start/1, stop/0]).
 
--export([promote_backing_queue_state/6, sender_death_fun/0, length_fun/0]).
+-export([promote_backing_queue_state/8, sender_death_fun/0, depth_fun/0]).
+
+-export([init_with_existing_bq/3, stop_mirroring/1, sync_mirrors/3]).
 
 -behaviour(rabbit_backing_queue).
 
 -include("rabbit.hrl").
 
--record(state, { gm,
+-record(state, { name,
+                 gm,
                  coordinator,
                  backing_queue,
                  backing_queue_state,
-                 set_delivered,
                  seen_status,
                  confirmed,
-                 ack_msg_id,
                  known_senders
                }).
 
 -ifdef(use_specs).
 
--export_type([death_fun/0, length_fun/0]).
+-export_type([death_fun/0, depth_fun/0, stats_fun/0]).
 
 -type(death_fun() :: fun ((pid()) -> 'ok')).
--type(length_fun() :: fun (() -> 'ok')).
--type(master_state() :: #state { gm                  :: pid(),
+-type(depth_fun() :: fun (() -> 'ok')).
+-type(stats_fun() :: fun ((any()) -> 'ok')).
+-type(master_state() :: #state { name                :: rabbit_amqqueue:name(),
+                                 gm                  :: pid(),
                                  coordinator         :: pid(),
                                  backing_queue       :: atom(),
                                  backing_queue_state :: any(),
-                                 set_delivered       :: non_neg_integer(),
                                  seen_status         :: dict(),
                                  confirmed           :: [rabbit_guid:guid()],
-                                 ack_msg_id          :: dict(),
                                  known_senders       :: set()
                                }).
 
--spec(promote_backing_queue_state/6 ::
-        (pid(), atom(), any(), pid(), dict(), [pid()]) -> master_state()).
+-spec(promote_backing_queue_state/8 ::
+        (rabbit_amqqueue:name(), pid(), atom(), any(), pid(), [any()], dict(),
+         [pid()]) -> master_state()).
 -spec(sender_death_fun/0 :: () -> death_fun()).
--spec(length_fun/0 :: () -> length_fun()).
+-spec(depth_fun/0 :: () -> depth_fun()).
+-spec(init_with_existing_bq/3 :: (rabbit_types:amqqueue(), atom(), any()) ->
+                                      master_state()).
+-spec(stop_mirroring/1 :: (master_state()) -> {atom(), any()}).
+-spec(sync_mirrors/3 :: (stats_fun(), stats_fun(), master_state()) ->
+    {'ok', master_state()} | {stop, any(), master_state()}).
 
 -endif.
 
@@ -82,41 +90,79 @@ stop() ->
     %% Same as start/1.
     exit({not_valid_for_generic_backing_queue, ?MODULE}).
 
-init(#amqqueue { name = QName, mirror_nodes = MNodes } = Q, Recover,
-     AsyncCallback) ->
-    {ok, CPid} = rabbit_mirror_queue_coordinator:start_link(
-                   Q, undefined, sender_death_fun(), length_fun()),
-    GM = rabbit_mirror_queue_coordinator:get_gm(CPid),
-    MNodes1 =
-        (case MNodes of
-             all       -> rabbit_mnesia:all_clustered_nodes();
-             undefined -> [];
-             _         -> MNodes
-         end) -- [node()],
-    [rabbit_mirror_queue_misc:add_mirror(QName, Node) || Node <- MNodes1],
+init(Q, Recover, AsyncCallback) ->
     {ok, BQ} = application:get_env(backing_queue_module),
     BQS = BQ:init(Q, Recover, AsyncCallback),
-    ok = gm:broadcast(GM, {length, BQ:len(BQS)}),
-    #state { gm                  = GM,
+    State = #state{gm = GM} = init_with_existing_bq(Q, BQ, BQS),
+    ok = gm:broadcast(GM, {depth, BQ:depth(BQS)}),
+    State.
+
+init_with_existing_bq(Q = #amqqueue{name = QName}, BQ, BQS) ->
+    {ok, CPid} = rabbit_mirror_queue_coordinator:start_link(
+                   Q, undefined, sender_death_fun(), depth_fun()),
+    GM = rabbit_mirror_queue_coordinator:get_gm(CPid),
+    Self = self(),
+    ok = rabbit_misc:execute_mnesia_transaction(
+           fun () ->
+                   [Q1 = #amqqueue{gm_pids = GMPids}]
+                       = mnesia:read({rabbit_queue, QName}),
+                   ok = rabbit_amqqueue:store_queue(
+                          Q1#amqqueue{gm_pids = [{GM, Self} | GMPids]})
+           end),
+    {_MNode, SNodes} = rabbit_mirror_queue_misc:suggested_queue_nodes(Q),
+    rabbit_mirror_queue_misc:add_mirrors(QName, SNodes),
+    #state { name                = QName,
+             gm                  = GM,
              coordinator         = CPid,
              backing_queue       = BQ,
              backing_queue_state = BQS,
-             set_delivered       = 0,
              seen_status         = dict:new(),
              confirmed           = [],
-             ack_msg_id          = dict:new(),
              known_senders       = sets:new() }.
 
+stop_mirroring(State = #state { coordinator         = CPid,
+                                backing_queue       = BQ,
+                                backing_queue_state = BQS }) ->
+    unlink(CPid),
+    stop_all_slaves(shutdown, State),
+    {BQ, BQS}.
+
+sync_mirrors(HandleInfo, EmitStats,
+             State = #state { name                = QName,
+                              gm                  = GM,
+                              backing_queue       = BQ,
+                              backing_queue_state = BQS }) ->
+    Log = fun (Fmt, Params) ->
+                  rabbit_log:info("Synchronising ~s: " ++ Fmt ++ "~n",
+                                  [rabbit_misc:rs(QName) | Params])
+          end,
+    Log("~p messages to synchronise", [BQ:len(BQS)]),
+    {ok, #amqqueue{slave_pids = SPids}} = rabbit_amqqueue:lookup(QName),
+    Ref = make_ref(),
+    Syncer = rabbit_mirror_queue_sync:master_prepare(Ref, Log, SPids),
+    gm:broadcast(GM, {sync_start, Ref, Syncer, SPids}),
+    S = fun(BQSN) -> State#state{backing_queue_state = BQSN} end,
+    case rabbit_mirror_queue_sync:master_go(
+           Syncer, Ref, Log, HandleInfo, EmitStats, BQ, BQS) of
+        {shutdown,  R, BQS1}   -> {stop, R, S(BQS1)};
+        {sync_died, R, BQS1}   -> Log("~p", [R]),
+                                  {ok, S(BQS1)};
+        {already_synced, BQS1} -> {ok, S(BQS1)};
+        {ok, BQS1}             -> Log("complete", []),
+                                  {ok, S(BQS1)}
+    end.
+
 terminate({shutdown, dropped} = Reason,
-          State = #state { backing_queue = BQ, backing_queue_state = BQS }) ->
+          State = #state { backing_queue       = BQ,
+                           backing_queue_state = BQS }) ->
     %% Backing queue termination - this node has been explicitly
     %% dropped. Normally, non-durable queues would be tidied up on
     %% startup, but there's a possibility that we will be added back
     %% in without this node being restarted. Thus we must do the full
     %% blown delete_and_terminate now, but only locally: we do not
     %% broadcast delete_and_terminate.
-    State #state { backing_queue_state = BQ:delete_and_terminate(Reason, BQS),
-                   set_delivered       = 0 };
+    State#state{backing_queue_state = BQ:delete_and_terminate(Reason, BQS)};
+
 terminate(Reason,
           State = #state { backing_queue = BQ, backing_queue_state = BQS }) ->
     %% Backing queue termination. The queue is going down but
@@ -124,63 +170,77 @@ terminate(Reason,
     %% node. Thus just let some other slave take over.
     State #state { backing_queue_state = BQ:terminate(Reason, BQS) }.
 
-delete_and_terminate(Reason, State = #state { gm                  = GM,
-                                              backing_queue       = BQ,
+delete_and_terminate(Reason, State = #state { backing_queue       = BQ,
                                               backing_queue_state = BQS }) ->
+    stop_all_slaves(Reason, State),
+    State#state{backing_queue_state = BQ:delete_and_terminate(Reason, BQS)}.
+
+stop_all_slaves(Reason, #state{name = QName, gm   = GM}) ->
+    {ok, #amqqueue{slave_pids = SPids}} = rabbit_amqqueue:lookup(QName),
+    MRefs = [erlang:monitor(process, SPid) || SPid <- SPids],
     ok = gm:broadcast(GM, {delete_and_terminate, Reason}),
-    State #state { backing_queue_state = BQ:delete_and_terminate(Reason, BQS),
-                   set_delivered       = 0 }.
+    [receive {'DOWN', MRef, process, _Pid, _Info} -> ok end || MRef <- MRefs],
+    %% Normally when we remove a slave another slave or master will
+    %% notice and update Mnesia. But we just removed them all, and
+    %% have stopped listening ourselves. So manually clean up.
+    rabbit_misc:execute_mnesia_transaction(
+      fun () ->
+              [Q] = mnesia:read({rabbit_queue, QName}),
+              rabbit_mirror_queue_misc:store_updated_slaves(
+                Q #amqqueue { gm_pids = [], slave_pids = [] })
+      end),
+    ok = gm:forget_group(QName).
 
 purge(State = #state { gm                  = GM,
                        backing_queue       = BQ,
                        backing_queue_state = BQS }) ->
-    ok = gm:broadcast(GM, {set_length, 0, false}),
+    ok = gm:broadcast(GM, {drop, 0, BQ:len(BQS), false}),
     {Count, BQS1} = BQ:purge(BQS),
-    {Count, State #state { backing_queue_state = BQS1,
-                           set_delivered       = 0 }}.
+    {Count, State #state { backing_queue_state = BQS1 }}.
+
+purge_acks(_State) -> exit({not_implemented, {?MODULE, purge_acks}}).
 
-publish(Msg = #basic_message { id = MsgId }, MsgProps, ChPid,
+publish(Msg = #basic_message { id = MsgId }, MsgProps, IsDelivered, ChPid,
         State = #state { gm                  = GM,
                          seen_status         = SS,
                          backing_queue       = BQ,
                          backing_queue_state = BQS }) ->
     false = dict:is_key(MsgId, SS), %% ASSERTION
-    ok = gm:broadcast(GM, {publish, false, ChPid, MsgProps, Msg}),
-    BQS1 = BQ:publish(Msg, MsgProps, ChPid, BQS),
+    ok = gm:broadcast(GM, {publish, ChPid, MsgProps, Msg}),
+    BQS1 = BQ:publish(Msg, MsgProps, IsDelivered, ChPid, BQS),
     ensure_monitoring(ChPid, State #state { backing_queue_state = BQS1 }).
 
-publish_delivered(AckRequired, Msg = #basic_message { id = MsgId }, MsgProps,
+publish_delivered(Msg = #basic_message { id = MsgId }, MsgProps,
                   ChPid, State = #state { gm                  = GM,
                                           seen_status         = SS,
                                           backing_queue       = BQ,
-                                          backing_queue_state = BQS,
-                                          ack_msg_id          = AM }) ->
+                                          backing_queue_state = BQS }) ->
     false = dict:is_key(MsgId, SS), %% ASSERTION
-    %% Must use confirmed_broadcast here in order to guarantee that
-    %% all slaves are forced to interpret this publish_delivered at
-    %% the same point, especially if we die and a slave is promoted.
-    ok = gm:confirmed_broadcast(
-           GM, {publish, {true, AckRequired}, ChPid, MsgProps, Msg}),
-    {AckTag, BQS1} =
-        BQ:publish_delivered(AckRequired, Msg, MsgProps, ChPid, BQS),
-    AM1 = maybe_store_acktag(AckTag, MsgId, AM),
-    {AckTag,
-     ensure_monitoring(ChPid, State #state { backing_queue_state = BQS1,
-                                             ack_msg_id          = AM1 })}.
-
-dropwhile(Pred, AckRequired,
-          State = #state{gm                  = GM,
-                         backing_queue       = BQ,
-                         set_delivered       = SetDelivered,
-                         backing_queue_state = BQS }) ->
+    ok = gm:broadcast(GM, {publish_delivered, ChPid, MsgProps, Msg}),
+    {AckTag, BQS1} = BQ:publish_delivered(Msg, MsgProps, ChPid, BQS),
+    State1 = State #state { backing_queue_state = BQS1 },
+    {AckTag, ensure_monitoring(ChPid, State1)}.
+
+discard(MsgId, ChPid, State = #state { gm                  = GM,
+                                       backing_queue       = BQ,
+                                       backing_queue_state = BQS,
+                                       seen_status         = SS }) ->
+    false = dict:is_key(MsgId, SS), %% ASSERTION
+    ok = gm:broadcast(GM, {discard, ChPid, MsgId}),
+    ensure_monitoring(ChPid, State #state { backing_queue_state =
+                                                BQ:discard(MsgId, ChPid, BQS) }).
+
+dropwhile(Pred, State = #state{backing_queue       = BQ,
+                               backing_queue_state = BQS }) ->
     Len  = BQ:len(BQS),
-    {Msgs, BQS1} = BQ:dropwhile(Pred, AckRequired, BQS),
-    Len1 = BQ:len(BQS1),
-    ok = gm:broadcast(GM, {set_length, Len1, AckRequired}),
-    Dropped = Len - Len1,
-    SetDelivered1 = lists:max([0, SetDelivered - Dropped]),
-    {Msgs, State #state { backing_queue_state = BQS1,
-                          set_delivered       = SetDelivered1 } }.
+    {Next, BQS1} = BQ:dropwhile(Pred, BQS),
+    {Next, drop(Len, false, State #state { backing_queue_state = BQS1 })}.
+
+fetchwhile(Pred, Fun, Acc, State = #state{backing_queue       = BQ,
+                                          backing_queue_state = BQS }) ->
+    Len  = BQ:len(BQS),
+    {Next, Acc1, BQS1} = BQ:fetchwhile(Pred, Fun, Acc, BQS),
+    {Next, Acc1, drop(Len, true, State #state { backing_queue_state = BQS1 })}.
 
 drain_confirmed(State = #state { backing_queue       = BQ,
                                  backing_queue_state = BQS,
@@ -212,43 +272,33 @@ drain_confirmed(State = #state { backing_queue       = BQ,
                                           seen_status         = SS1,
                                           confirmed           = [] }}.
 
-fetch(AckRequired, State = #state { gm                  = GM,
-                                    backing_queue       = BQ,
-                                    backing_queue_state = BQS,
-                                    set_delivered       = SetDelivered,
-                                    ack_msg_id          = AM }) ->
+fetch(AckRequired, State = #state { backing_queue       = BQ,
+                                    backing_queue_state = BQS }) ->
     {Result, BQS1} = BQ:fetch(AckRequired, BQS),
     State1 = State #state { backing_queue_state = BQS1 },
-    case Result of
-        empty ->
-            {Result, State1};
-        {#basic_message { id = MsgId } = Message, IsDelivered, AckTag,
-         Remaining} ->
-            ok = gm:broadcast(GM, {fetch, AckRequired, MsgId, Remaining}),
-            IsDelivered1 = IsDelivered orelse SetDelivered > 0,
-            SetDelivered1 = lists:max([0, SetDelivered - 1]),
-            AM1 = maybe_store_acktag(AckTag, MsgId, AM),
-            {{Message, IsDelivered1, AckTag, Remaining},
-             State1 #state { set_delivered = SetDelivered1,
-                             ack_msg_id    = AM1 }}
-    end.
+    {Result, case Result of
+                 empty                          -> State1;
+                 {_MsgId, _IsDelivered, AckTag} -> drop_one(AckTag, State1)
+             end}.
+
+drop(AckRequired, State = #state { backing_queue       = BQ,
+                                   backing_queue_state = BQS }) ->
+    {Result, BQS1} = BQ:drop(AckRequired, BQS),
+    State1 = State #state { backing_queue_state = BQS1 },
+    {Result, case Result of
+                 empty            -> State1;
+                 {_MsgId, AckTag} -> drop_one(AckTag, State1)
+             end}.
 
 ack(AckTags, State = #state { gm                  = GM,
                               backing_queue       = BQ,
-                              backing_queue_state = BQS,
-                              ack_msg_id          = AM }) ->
+                              backing_queue_state = BQS }) ->
     {MsgIds, BQS1} = BQ:ack(AckTags, BQS),
     case MsgIds of
         [] -> ok;
         _  -> ok = gm:broadcast(GM, {ack, MsgIds})
     end,
-    AM1 = lists:foldl(fun dict:erase/2, AM, AckTags),
-    {MsgIds, State #state { backing_queue_state = BQS1,
-                            ack_msg_id          = AM1 }}.
-
-fold(MsgFun, State = #state { backing_queue       = BQ,
-                              backing_queue_state = BQS }, AckTags) ->
-    State #state { backing_queue_state = BQ:fold(MsgFun, BQS, AckTags) }.
+    {MsgIds, State #state { backing_queue_state = BQS1 }}.
 
 requeue(AckTags, State = #state { gm                  = GM,
                                   backing_queue       = BQ,
@@ -257,12 +307,25 @@ requeue(AckTags, State = #state { gm                  = GM,
     ok = gm:broadcast(GM, {requeue, MsgIds}),
     {MsgIds, State #state { backing_queue_state = BQS1 }}.
 
+ackfold(MsgFun, Acc, State = #state { backing_queue       = BQ,
+                                      backing_queue_state = BQS }, AckTags) ->
+    {Acc1, BQS1} = BQ:ackfold(MsgFun, Acc, BQS, AckTags),
+    {Acc1, State #state { backing_queue_state =  BQS1 }}.
+
+fold(Fun, Acc, State = #state { backing_queue = BQ,
+                                backing_queue_state = BQS }) ->
+    {Result, BQS1} = BQ:fold(Fun, Acc, BQS),
+    {Result, State #state { backing_queue_state = BQS1 }}.
+
 len(#state { backing_queue = BQ, backing_queue_state = BQS }) ->
     BQ:len(BQS).
 
 is_empty(#state { backing_queue = BQ, backing_queue_state = BQS }) ->
     BQ:is_empty(BQS).
 
+depth(#state { backing_queue = BQ, backing_queue_state = BQS }) ->
+    BQ:depth(BQS).
+
 set_ram_duration_target(Target, State = #state { backing_queue       = BQ,
                                                  backing_queue_state = BQS }) ->
     State #state { backing_queue_state =
@@ -319,8 +382,9 @@ is_duplicate(Message = #basic_message { id = MsgId },
             %% immediately after calling is_duplicate). The msg is
             %% invalid. We will not see this again, nor will we be
             %% further involved in confirming this message, so erase.
-            {published, State #state { seen_status = dict:erase(MsgId, SS) }};
-        {ok, confirmed} ->
+            {true, State #state { seen_status = dict:erase(MsgId, SS) }};
+        {ok, Disposition}
+          when Disposition =:= confirmed
             %% It got published when we were a slave via gm, and
             %% confirmed some time after that (maybe even after
             %% promotion), but before we received the publish from the
@@ -329,47 +393,31 @@ is_duplicate(Message = #basic_message { id = MsgId },
             %% need to confirm now. As above, amqqueue_process will
             %% have the entry for the msg_id_to_channel mapping added
             %% immediately after calling is_duplicate/2.
-            {published, State #state { seen_status = dict:erase(MsgId, SS),
-                                       confirmed = [MsgId | Confirmed] }};
-        {ok, discarded} ->
-            %% Don't erase from SS here because discard/2 is about to
-            %% be called and we need to be able to detect this case
-            {discarded, State}
-    end.
-
-discard(Msg = #basic_message { id = MsgId }, ChPid,
-        State = #state { gm                  = GM,
-                         backing_queue       = BQ,
-                         backing_queue_state = BQS,
-                         seen_status         = SS }) ->
-    %% It's a massive error if we get told to discard something that's
-    %% already been published or published-and-confirmed. To do that
-    %% would require non FIFO access. Hence we should not find
-    %% 'published' or 'confirmed' in this dict:find.
-    case dict:find(MsgId, SS) of
-        error ->
-            ok = gm:broadcast(GM, {discard, ChPid, Msg}),
-            State #state { backing_queue_state = BQ:discard(Msg, ChPid, BQS),
-                           seen_status         = dict:erase(MsgId, SS) };
-        {ok, discarded} ->
-            State
+          orelse Disposition =:= discarded ->
+            %% Message was discarded while we were a slave. Confirm now.
+            %% As above, amqqueue_process will have the entry for the
+            %% msg_id_to_channel mapping.
+            {true, State #state { seen_status = dict:erase(MsgId, SS),
+                                  confirmed = [MsgId | Confirmed] }}
     end.
 
 %% ---------------------------------------------------------------------------
 %% Other exported functions
 %% ---------------------------------------------------------------------------
 
-promote_backing_queue_state(CPid, BQ, BQS, GM, SeenStatus, KS) ->
-    Len = BQ:len(BQS),
-    ok = gm:broadcast(GM, {length, Len}),
-    #state { gm                  = GM,
+promote_backing_queue_state(QName, CPid, BQ, BQS, GM, AckTags, Seen, KS) ->
+    {_MsgIds, BQS1} = BQ:requeue(AckTags, BQS),
+    Len   = BQ:len(BQS1),
+    Depth = BQ:depth(BQS1),
+    true = Len == Depth, %% ASSERTION: everything must have been requeued
+    ok = gm:broadcast(GM, {depth, Depth}),
+    #state { name                = QName,
+             gm                  = GM,
              coordinator         = CPid,
              backing_queue       = BQ,
-             backing_queue_state = BQS,
-             set_delivered       = Len,
-             seen_status         = SeenStatus,
+             backing_queue_state = BQS1,
+             seen_status         = Seen,
              confirmed           = [],
-             ack_msg_id          = dict:new(),
              known_senders       = sets:from_list(KS) }.
 
 sender_death_fun() ->
@@ -384,7 +432,7 @@ sender_death_fun() ->
               end)
     end.
 
-length_fun() ->
+depth_fun() ->
     Self = self(),
     fun () ->
             rabbit_amqqueue:run_backing_queue(
@@ -392,15 +440,30 @@ length_fun() ->
               fun (?MODULE, State = #state { gm                  = GM,
                                              backing_queue       = BQ,
                                              backing_queue_state = BQS }) ->
-                      ok = gm:broadcast(GM, {length, BQ:len(BQS)}),
+                      ok = gm:broadcast(GM, {depth, BQ:depth(BQS)}),
                       State
               end)
     end.
 
-maybe_store_acktag(undefined, _MsgId, AM) ->
-    AM;
-maybe_store_acktag(AckTag, MsgId, AM) ->
-    dict:store(AckTag, MsgId, AM).
+%% ---------------------------------------------------------------------------
+%% Helpers
+%% ---------------------------------------------------------------------------
+
+drop_one(AckTag, State = #state { gm                  = GM,
+                                  backing_queue       = BQ,
+                                  backing_queue_state = BQS }) ->
+    ok = gm:broadcast(GM, {drop, BQ:len(BQS), 1, AckTag =/= undefined}),
+    State.
+
+drop(PrevLen, AckRequired, State = #state { gm                  = GM,
+                                            backing_queue       = BQ,
+                                            backing_queue_state = BQS }) ->
+    Len = BQ:len(BQS),
+    case PrevLen - Len of
+        0       -> State;
+        Dropped -> ok = gm:broadcast(GM, {drop, Len, Dropped, AckRequired}),
+                   State
+    end.
 
 ensure_monitoring(ChPid, State = #state { coordinator = CPid,
                                           known_senders = KS }) ->
diff --git a/src/rabbit_mirror_queue_misc.erl b/src/rabbit_mirror_queue_misc.erl
index 180677fe..eded0411 100644
--- a/src/rabbit_mirror_queue_misc.erl
+++ b/src/rabbit_mirror_queue_misc.erl
@@ -10,33 +10,52 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2010-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2010-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_mirror_queue_misc).
+-behaviour(rabbit_policy_validator).
 
--export([remove_from_queue/2, on_node_up/0,
-         drop_mirror/2, drop_mirror/3, add_mirror/2, add_mirror/3,
-         report_deaths/4]).
+-export([remove_from_queue/3, on_node_up/0, add_mirrors/2, add_mirror/2,
+         report_deaths/4, store_updated_slaves/1, suggested_queue_nodes/1,
+         is_mirrored/1, update_mirrors/2, validate_policy/1]).
+
+%% for testing only
+-export([module/1]).
 
 -include("rabbit.hrl").
 
+-rabbit_boot_step({?MODULE,
+                   [{description, "HA policy validation"},
+                    {mfa, {rabbit_registry, register,
+                           [policy_validator, <<"ha-mode">>, ?MODULE]}},
+                    {mfa, {rabbit_registry, register,
+                           [policy_validator, <<"ha-params">>, ?MODULE]}},
+                    {mfa, {rabbit_registry, register,
+                           [policy_validator, <<"ha-sync-mode">>, ?MODULE]}},
+                    {requires, rabbit_registry},
+                    {enables, recovery}]}).
+
 %%----------------------------------------------------------------------------
 
 -ifdef(use_specs).
 
--spec(remove_from_queue/2 ::
-        (rabbit_amqqueue:name(), [pid()])
+-spec(remove_from_queue/3 ::
+        (rabbit_amqqueue:name(), pid(), [pid()])
         -> {'ok', pid(), [pid()]} | {'error', 'not_found'}).
 -spec(on_node_up/0 :: () -> 'ok').
--spec(drop_mirror/2 ::
-        (rabbit_amqqueue:name(), node()) -> rabbit_types:ok_or_error(any())).
+-spec(add_mirrors/2 :: (rabbit_amqqueue:name(), [node()]) -> 'ok').
 -spec(add_mirror/2 ::
-        (rabbit_amqqueue:name(), node()) -> rabbit_types:ok_or_error(any())).
--spec(add_mirror/3 ::
-        (rabbit_types:vhost(), binary(), atom())
-        -> rabbit_types:ok_or_error(any())).
+        (rabbit_amqqueue:name(), node()) ->
+                           {'ok', atom()} | rabbit_types:error(any())).
+-spec(store_updated_slaves/1 :: (rabbit_types:amqqueue()) ->
+                                     rabbit_types:amqqueue()).
+-spec(suggested_queue_nodes/1 :: (rabbit_types:amqqueue()) ->
+                                      {node(), [node()]}).
+-spec(is_mirrored/1 :: (rabbit_types:amqqueue()) -> boolean()).
+-spec(update_mirrors/2 ::
+        (rabbit_types:amqqueue(), rabbit_types:amqqueue()) -> 'ok').
 
 -endif.
 
@@ -50,29 +69,35 @@
 %% slave (now master) receives messages it's not ready for (for
 %% example, new consumers).
 %% Returns {ok, NewMPid, DeadPids}
-remove_from_queue(QueueName, DeadPids) ->
-    DeadNodes = [node(DeadPid) || DeadPid <- DeadPids],
+remove_from_queue(QueueName, Self, DeadGMPids) ->
     rabbit_misc:execute_mnesia_transaction(
       fun () ->
               %% Someone else could have deleted the queue before we
               %% get here.
               case mnesia:read({rabbit_queue, QueueName}) of
                   [] -> {error, not_found};
-                  [Q = #amqqueue { pid          = QPid,
-                                   slave_pids   = SPids }] ->
-                      [QPid1 | SPids1] = Alive =
-                          [Pid || Pid <- [QPid | SPids],
-                                  not lists:member(node(Pid), DeadNodes)],
+                  [Q = #amqqueue { pid        = QPid,
+                                   slave_pids = SPids,
+                                   gm_pids    = GMPids }] ->
+                      {Dead, GMPids1} = lists:partition(
+                                          fun ({GM, _}) ->
+                                                  lists:member(GM, DeadGMPids)
+                                          end, GMPids),
+                      DeadPids = [Pid || {_GM, Pid} <- Dead],
+                      Alive = [QPid | SPids] -- DeadPids,
+                      {QPid1, SPids1} = promote_slave(Alive),
                       case {{QPid, SPids}, {QPid1, SPids1}} of
                           {Same, Same} ->
+                              GMPids = GMPids1, %% ASSERTION
                               {ok, QPid1, []};
-                          _ when QPid =:= QPid1 orelse node(QPid1) =:= node() ->
+                          _ when QPid =:= QPid1 orelse QPid1 =:= Self ->
                               %% Either master hasn't changed, so
                               %% we're ok to update mnesia; or we have
                               %% become the master.
-                              Q1 = Q #amqqueue { pid        = QPid1,
-                                                 slave_pids = SPids1 },
-                              ok = rabbit_amqqueue:store_queue(Q1),
+                              store_updated_slaves(
+                                Q #amqqueue { pid        = QPid1,
+                                              slave_pids = SPids1,
+                                              gm_pids    = GMPids1 }),
                               {ok, QPid1, [QPid | SPids] -- Alive};
                           _ ->
                               %% Master has changed, and we're not it,
@@ -85,32 +110,41 @@ remove_from_queue(QueueName, DeadPids) ->
       end).
 
 on_node_up() ->
-    Qs =
+    QNames =
         rabbit_misc:execute_mnesia_transaction(
           fun () ->
                   mnesia:foldl(
-                    fun (#amqqueue { mirror_nodes = undefined }, QsN) ->
-                            QsN;
-                        (#amqqueue { name         = QName,
-                                     mirror_nodes = all }, QsN) ->
-                            [QName | QsN];
-                        (#amqqueue { name         = QName,
-                                     mirror_nodes = MNodes }, QsN) ->
-                            case lists:member(node(), MNodes) of
-                                true  -> [QName | QsN];
-                                false -> QsN
+                    fun (Q = #amqqueue{name       = QName,
+                                       pid        = Pid,
+                                       slave_pids = SPids}, QNames0) ->
+                            %% We don't want to pass in the whole
+                            %% cluster - we don't want a situation
+                            %% where starting one node causes us to
+                            %% decide to start a mirror on another
+                            PossibleNodes0 = [node(P) || P <- [Pid | SPids]],
+                            PossibleNodes =
+                                case lists:member(node(), PossibleNodes0) of
+                                    true  -> PossibleNodes0;
+                                    false -> [node() | PossibleNodes0]
+                                end,
+                            {_MNode, SNodes} = suggested_queue_nodes(
+                                                 Q, PossibleNodes),
+                            case lists:member(node(), SNodes) of
+                                true  -> [QName | QNames0];
+                                false -> QNames0
                             end
                     end, [], rabbit_queue)
           end),
-    [add_mirror(Q, node()) || Q <- Qs],
+    [add_mirror(QName, node()) || QName <- QNames],
     ok.
 
-drop_mirror(VHostPath, QueueName, MirrorNode) ->
-    drop_mirror(rabbit_misc:r(VHostPath, queue, QueueName), MirrorNode).
+drop_mirrors(QName, Nodes) ->
+    [drop_mirror(QName, Node)  || Node <- Nodes],
+    ok.
 
-drop_mirror(Queue, MirrorNode) ->
-    if_mirrored_queue(
-      Queue,
+drop_mirror(QName, MirrorNode) ->
+    rabbit_amqqueue:with(
+      QName,
       fun (#amqqueue { name = Name, pid = QPid, slave_pids = SPids }) ->
               case [Pid || Pid <- [QPid | SPids], node(Pid) =:= MirrorNode] of
                   [] ->
@@ -122,48 +156,54 @@ drop_mirror(Queue, MirrorNode) ->
                         "Dropping queue mirror on node ~p for ~s~n",
                         [MirrorNode, rabbit_misc:rs(Name)]),
                       exit(Pid, {shutdown, dropped}),
-                      ok
+                      {ok, dropped}
               end
       end).
 
-add_mirror(VHostPath, QueueName, MirrorNode) ->
-    add_mirror(rabbit_misc:r(VHostPath, queue, QueueName), MirrorNode).
+%% Adds a mirror of queue QName on each node in Nodes, in list
+%% order. The individual add_mirror/2 results are discarded; the
+%% function always returns ok.
+add_mirrors(QName, Nodes) ->
+    lists:foreach(fun (Node) -> add_mirror(QName, Node) end, Nodes),
+    ok.
 
-add_mirror(Queue, MirrorNode) ->
-    if_mirrored_queue(
-      Queue,
+add_mirror(QName, MirrorNode) ->
+    rabbit_amqqueue:with(
+      QName,
       fun (#amqqueue { name = Name, pid = QPid, slave_pids = SPids } = Q) ->
               case [Pid || Pid <- [QPid | SPids], node(Pid) =:= MirrorNode] of
-                  []  -> case rabbit_mirror_queue_slave_sup:start_child(
-                                MirrorNode, [Q]) of
-                             {ok, undefined} -> %% Already running
-                                 ok;
-                             {ok, SPid} ->
-                                 rabbit_log:info(
-                                   "Adding mirror of ~s on node ~p: ~p~n",
-                                   [rabbit_misc:rs(Name), MirrorNode, SPid]),
-                                 ok;
-                             Other ->
-                                 Other
-                         end;
-                  [_] -> {error, {queue_already_mirrored_on_node, MirrorNode}}
+                  [] ->
+                      start_child(Name, MirrorNode, Q);
+                  [SPid] ->
+                      case rabbit_misc:is_process_alive(SPid) of
+                          true  -> {ok, already_mirrored};
+                          false -> start_child(Name, MirrorNode, Q)
+                      end
               end
       end).
 
-if_mirrored_queue(Queue, Fun) ->
-    rabbit_amqqueue:with(
-      Queue, fun (#amqqueue { arguments = Args } = Q) ->
-                     case rabbit_misc:table_lookup(Args, <<"x-ha-policy">>) of
-                         undefined -> ok;
-                         _         -> Fun(Q)
-                     end
-             end).
+%% Asks the slave supervisor to start a mirror of queue Q (named
+%% Name) on MirrorNode and translates the outcome:
+%%   {ok, started}      - a slave pid was returned; auto-sync is
+%%                        attempted and the event is logged
+%%   {ok, stale_master} - the slave reported a stale master pid
+%%   {duplicate_live_master, _} - returned as-is (note: not wrapped
+%%                        in an ok/error tuple)
+%%   anything else      - passed through unchanged
+start_child(Name, MirrorNode, Q) ->
+    %% NOTE(review): presumably with_exit_handler/2 evaluates the
+    %% const handler, yielding {ok, down}, when the supervisor call
+    %% exits (e.g. node down) - confirm against rabbit_misc. That
+    %% value would fall through to the Other clause below.
+    case rabbit_misc:with_exit_handler(
+           rabbit_misc:const({ok, down}),
+           fun () ->
+                   rabbit_mirror_queue_slave_sup:start_child(MirrorNode, [Q])
+           end) of
+        {ok, SPid} when is_pid(SPid)  ->
+            maybe_auto_sync(Q),
+            rabbit_log:info("Adding mirror of ~s on node ~p: ~p~n",
+                            [rabbit_misc:rs(Name), MirrorNode, SPid]),
+            {ok, started};
+        {error, {{stale_master_pid, StalePid}, _}} ->
+            rabbit_log:warning("Detected stale HA master while adding "
+                               "mirror of ~s on node ~p: ~p~n",
+                               [rabbit_misc:rs(Name), MirrorNode, StalePid]),
+            {ok, stale_master};
+        {error, {{duplicate_live_master, _}=Err, _}} ->
+            Err;
+        Other ->
+            Other
+    end.
 
 report_deaths(_MirrorPid, _IsMaster, _QueueName, []) ->
     ok;
 report_deaths(MirrorPid, IsMaster, QueueName, DeadPids) ->
-    rabbit_event:notify(queue_mirror_deaths, [{name, QueueName},
-                                              {pids, DeadPids}]),
     rabbit_log:info("Mirrored-queue (~s): ~s ~s saw deaths of mirrors ~s~n",
                     [rabbit_misc:rs(QueueName),
                      case IsMaster of
@@ -172,3 +212,134 @@ report_deaths(MirrorPid, IsMaster, QueueName, DeadPids) ->
                      end,
                      rabbit_misc:pid_to_string(MirrorPid),
                      [[rabbit_misc:pid_to_string(P), $ ] || P <- DeadPids]]).
+
+%% Stores the queue record after restricting sync_slave_pids to pids
+%% that are still present in slave_pids, wakes the queue up, and
+%% returns the record that was stored.
+store_updated_slaves(Q = #amqqueue{slave_pids      = SPids,
+                                   sync_slave_pids = SSPids}) ->
+    %% TODO now that we clear sync_slave_pids in rabbit_durable_queue,
+    %% do we still need this filtering?
+    StillSlaves = lists:filter(fun (P) -> lists:member(P, SPids) end,
+                               SSPids),
+    Updated = Q#amqqueue{sync_slave_pids = StillSlaves},
+    ok = rabbit_amqqueue:store_queue(Updated),
+    %% Wake it up so that we emit a stats event
+    rabbit_amqqueue:wake_up(Updated),
+    Updated.
+
+%%----------------------------------------------------------------------------
+
+%% Splits a non-empty slave pid list into {PidToPromote, Remaining}.
+promote_slave([Oldest | Younger]) ->
+    %% The slave pids are maintained in descending order of age, so
+    %% the one to promote is the oldest, i.e. the head of the list.
+    {Oldest, Younger}.
+
+%% Convenience form of suggested_queue_nodes/2 that considers the
+%% nodes returned by rabbit_mnesia:cluster_nodes(running).
+suggested_queue_nodes(Q) ->
+    Running = rabbit_mnesia:cluster_nodes(running),
+    suggested_queue_nodes(Q, Running).
+
+%% This variant exists so we can pull a call to
+%% rabbit_mnesia:cluster_nodes(running) out of a loop or
+%% transaction or both.
+%%
+%% Returns {MasterNode, SlaveNodes} as suggested by the queue's
+%% ha-mode module, given the candidate nodes in All. A queue with no
+%% usable ha-mode gets {MasterNode, []}. When the record has no
+%% master (pid = none) the local node is used as the master.
+suggested_queue_nodes(Q, All) ->
+    {MNode0, SNodes, SSNodes} = actual_queue_nodes(Q),
+    MNode = if MNode0 =:= none -> node();
+               true            -> MNode0
+            end,
+    Params = policy(<<"ha-params">>, Q),
+    case module(Q) of
+        {ok, Mod} ->
+            Mod:suggested_queue_nodes(Params, MNode, SNodes, SSNodes, All);
+        _ ->
+            {MNode, []}
+    end.
+
+%% Looks up the policy key Policy for queue Q via rabbit_policy:get/2.
+%% Returns the bare value on {ok, Value}, and the atom none for any
+%% other result.
+policy(Policy, Q) ->
+    case rabbit_policy:get(Policy, Q) of
+        {ok, P} -> P;
+        _       -> none
+    end.
+
+%% Resolves the ha-mode implementation module.
+%%
+%% Given a queue record, reads its <<"ha-mode">> policy; given a
+%% binary mode name, looks it up in the registry under class ha_mode.
+%% Returns {ok, Module}, or not_mirrored when no policy is set, the
+%% name is unknown, or no module is registered for it.
+module(#amqqueue{} = Q) ->
+    case rabbit_policy:get(<<"ha-mode">>, Q) of
+        {ok, Mode} -> module(Mode);
+        _          -> not_mirrored
+    end;
+
+module(Mode) when is_binary(Mode) ->
+    case rabbit_registry:binary_to_type(Mode) of
+        {error, not_found} ->
+            not_mirrored;
+        Type ->
+            case rabbit_registry:lookup_module(ha_mode, Type) of
+                {ok, _Module} = Found -> Found;
+                _                     -> not_mirrored
+            end
+    end.
+
+%% True when module/1 resolves an ha-mode module for Q ({ok, _}),
+%% false for any other result.
+is_mirrored(Q) ->
+    case module(Q) of
+        {ok, _}  -> true;
+        _        -> false
+    end.
+
+%% Returns {MasterNode, SlaveNodes, SyncSlaveNodes} derived from the
+%% pids stored in the queue record. MasterNode is the atom none when
+%% the record's pid field is none.
+actual_queue_nodes(#amqqueue{pid             = MPid,
+                             slave_pids      = SPids,
+                             sync_slave_pids = SSPids}) ->
+    MNode = case MPid of
+                none -> none;
+                _    -> node(MPid)
+            end,
+    SNodes  = [node(SPid)  || SPid  <- SPids],
+    SSNodes = [node(SSPid) || SSPid <- SSPids],
+    {MNode, SNodes, SSNodes}.
+
+%% If the queue's <<"ha-sync-mode">> policy is <<"automatic">>, spawns
+%% an unlinked process that calls rabbit_amqqueue:sync_mirrors/1 on
+%% the queue pid, so the caller is not blocked by the sync. Returns
+%% the spawned pid in that case and ok otherwise; callers visible in
+%% this module ignore the result.
+maybe_auto_sync(Q = #amqqueue{pid = QPid}) ->
+    case policy(<<"ha-sync-mode">>, Q) of
+        <<"automatic">> ->
+            spawn(fun() -> rabbit_amqqueue:sync_mirrors(QPid) end);
+        _ ->
+            ok
+    end.
+
+%% Reacts to a change from OldQ to NewQ (both heads must carry the
+%% same queue pid): starts or stops mirroring when the mirrored
+%% status flips, and reconciles the mirror set when the queue is
+%% mirrored both before and after.
+update_mirrors(OldQ = #amqqueue{pid = QPid},
+               NewQ = #amqqueue{pid = QPid}) ->
+    WasMirrored = is_mirrored(OldQ),
+    IsMirrored  = is_mirrored(NewQ),
+    case {WasMirrored, IsMirrored} of
+        {true,  true}  -> update_mirrors0(OldQ, NewQ);
+        {true,  false} -> rabbit_amqqueue:stop_mirroring(QPid);
+        {false, true}  -> rabbit_amqqueue:start_mirroring(QPid);
+        {false, false} -> ok
+    end.
+
+%% Reconciles mirrors for a queue that stays mirrored: adds mirrors
+%% on nodes that are suggested for NewQ but not currently in use by
+%% OldQ, drops mirrors on nodes in use but no longer suggested, then
+%% gives auto-sync a chance to run. Both records must share a name.
+update_mirrors0(OldQ = #amqqueue{name = QName},
+                NewQ = #amqqueue{name = QName}) ->
+    {CurMaster, CurSlaves, _}  = actual_queue_nodes(OldQ),
+    {WantMaster, WantSlaves}   = suggested_queue_nodes(NewQ),
+    Current = [CurMaster | CurSlaves],
+    Wanted  = [WantMaster | WantSlaves],
+    ToAdd   = Wanted -- Current,
+    ToDrop  = Current -- Wanted,
+    add_mirrors (QName, ToAdd),
+    drop_mirrors(QName, ToDrop),
+    maybe_auto_sync(NewQ),
+    ok.
+
+%%----------------------------------------------------------------------------
+
+%% Validates the HA-related keys of a policy definition.
+%%
+%% KeyList is a proplist keyed by binaries; the keys inspected are
+%% <<"ha-mode">>, <<"ha-params">> and <<"ha-sync-mode">>.
+%%
+%% Returns ok, or {error, Format, Args} describing the first problem:
+%%   - ha-params / ha-sync-mode given without ha-mode
+%%   - ha-mode is not a registered mode name
+%%   - the mode's own validate_policy/1 rejects ha-params
+%%   - ha-sync-mode is not "manual" or "automatic"
+validate_policy(KeyList) ->
+    Mode = proplists:get_value(<<"ha-mode">>, KeyList, none),
+    Params = proplists:get_value(<<"ha-params">>, KeyList, none),
+    SyncMode = proplists:get_value(<<"ha-sync-mode">>, KeyList, none),
+    case {Mode, Params, SyncMode} of
+        {none, none, none} ->
+            ok;
+        {none, _, _} ->
+            {error, "ha-mode must be specified to specify ha-params or "
+             "ha-sync-mode", []};
+        _ when is_binary(Mode) ->
+            case module(Mode) of
+                {ok, M} -> case M:validate_policy(Params) of
+                               ok -> validate_sync_mode(SyncMode);
+                               E  -> E
+                           end;
+                _       -> {error, "~p is not a valid ha-mode value", [Mode]}
+            end;
+        _ ->
+            %% The policy is user input, so ha-mode may be any term
+            %% (number, list, ...). module/1 only has clauses for
+            %% queue records and binaries; report a validation error
+            %% here instead of crashing with function_clause.
+            {error, "~p is not a valid ha-mode value", [Mode]}
+    end.
+
+%% Accepts <<"automatic">>, <<"manual">> or none (key absent); any
+%% other value yields an {error, Format, Args} validation result.
+validate_sync_mode(<<"automatic">>) -> ok;
+validate_sync_mode(<<"manual">>)    -> ok;
+validate_sync_mode(none)            -> ok;
+validate_sync_mode(Mode) ->
+    {error, "ha-sync-mode must be \"manual\" "
+     "or \"automatic\", got ~p", [Mode]}.
diff --git a/src/rabbit_mirror_queue_mode.erl b/src/rabbit_mirror_queue_mode.erl
new file mode 100644
index 00000000..9e2015d9
--- /dev/null
+++ b/src/rabbit_mirror_queue_mode.erl
@@ -0,0 +1,57 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License at
+%% http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+%% License for the specific language governing rights and limitations
+%% under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2010-2013 GoPivotal, Inc.  All rights reserved.
+%%
+
+-module(rabbit_mirror_queue_mode).
+
+-ifdef(use_specs).
+
+-type(master() :: node()).
+-type(slave() :: node()).
+-type(params() :: any()).
+
+-callback description() -> [proplists:property()].
+
+%% Called whenever we think we might need to change nodes for a
+%% mirrored queue. Note that this is called from a variety of
+%% contexts, both inside and outside Mnesia transactions. Ideally it
+%% will be pure-functional.
+%%
+%% Takes: parameters set in the policy,
+%%        current master,
+%%        current slaves,
+%%        current synchronised slaves,
+%%        all nodes to consider
+%%
+%% Returns: tuple of new master, new slaves
+%%
+-callback suggested_queue_nodes(
+            params(), master(), [slave()], [slave()], [node()]) ->
+    {master(), [slave()]}.
+
+%% Are the parameters valid for this mode?
+-callback validate_policy(params()) ->
+    rabbit_policy_validator:validate_results().
+
+-else.
+
+-export([behaviour_info/1]).
+
+%% Legacy behaviour declaration, compiled only when use_specs is not
+%% defined (the -callback attributes above are used otherwise). Lists
+%% the same three callbacks with their arities.
+behaviour_info(callbacks) ->
+    [{description, 0}, {suggested_queue_nodes, 5}, {validate_policy, 1}];
+behaviour_info(_Other) ->
+    undefined.
+
+-endif.
diff --git a/src/rabbit_mirror_queue_mode_all.erl b/src/rabbit_mirror_queue_mode_all.erl
new file mode 100644
index 00000000..3b5163a3
--- /dev/null
+++ b/src/rabbit_mirror_queue_mode_all.erl
@@ -0,0 +1,41 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License at
+%% http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+%% License for the specific language governing rights and limitations
+%% under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2010-2013 GoPivotal, Inc.  All rights reserved.
+%%
+
+-module(rabbit_mirror_queue_mode_all).
+
+-include("rabbit.hrl").
+
+-behaviour(rabbit_mirror_queue_mode).
+
+-export([description/0, suggested_queue_nodes/5, validate_policy/1]).
+
+-rabbit_boot_step({?MODULE,
+                   [{description, "mirror mode all"},
+                    {mfa,         {rabbit_registry, register,
+                                   [ha_mode, <<"all">>, ?MODULE]}},
+                    {requires,    rabbit_registry},
+                    {enables,     kernel_ready}]}).
+
+%% Behaviour callback: a proplist carrying a human-readable
+%% description of this ha-mode.
+description() ->
+    [{description, <<"Mirror queue to all nodes">>}].
+
+%% Mode "all": keep the current master and mirror on every other
+%% candidate node (the candidates with the first occurrence of the
+%% master removed).
+suggested_queue_nodes(_Params, MNode, _SNodes, _SSNodes, Poss) ->
+    Slaves = lists:delete(MNode, Poss),
+    {MNode, Slaves}.
+
+%% Mode "all" takes no ha-params: only none (key absent) is valid.
+validate_policy(Params) ->
+    case Params of
+        none -> ok;
+        _    -> {error, "ha-mode=\"all\" does not take parameters", []}
+    end.
diff --git a/src/rabbit_mirror_queue_mode_exactly.erl b/src/rabbit_mirror_queue_mode_exactly.erl
new file mode 100644
index 00000000..2841f87e
--- /dev/null
+++ b/src/rabbit_mirror_queue_mode_exactly.erl
@@ -0,0 +1,56 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License at
+%% http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+%% License for the specific language governing rights and limitations
+%% under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2010-2013 GoPivotal, Inc.  All rights reserved.
+%%
+
+-module(rabbit_mirror_queue_mode_exactly).
+
+-include("rabbit.hrl").
+
+-behaviour(rabbit_mirror_queue_mode).
+
+-export([description/0, suggested_queue_nodes/5, validate_policy/1]).
+
+-rabbit_boot_step({?MODULE,
+                   [{description, "mirror mode exactly"},
+                    {mfa,         {rabbit_registry, register,
+                                   [ha_mode, <<"exactly">>, ?MODULE]}},
+                    {requires,    rabbit_registry},
+                    {enables,     kernel_ready}]}).
+
+%% Behaviour callback: a proplist carrying a human-readable
+%% description of this ha-mode.
+description() ->
+    [{description, <<"Mirror queue to a specified number of nodes">>}].
+
+%% Mode "exactly": Count is the total number of nodes wanted
+%% (master included), so Count - 1 slaves are suggested. The master
+%% never changes.
+%%
+%% When we need to add nodes, we randomise our candidate list as a
+%% crude form of load-balancing. TODO it would also be nice to
+%% randomise the list of ones to remove when we have too many - we
+%% would have to take account of synchronisation though.
+suggested_queue_nodes(Count, MNode, SNodes, _SSNodes, Poss) ->
+    Wanted = Count - 1,
+    Have   = length(SNodes),
+    Slaves = if Wanted > Have ->
+                     %% Too few: keep the existing slaves and top up
+                     %% from unused candidates in random order.
+                     Spare = shuffle((Poss -- [MNode]) -- SNodes),
+                     SNodes ++ lists:sublist(Spare, Wanted - Have);
+                true ->
+                     %% Enough or too many: keep the first Wanted.
+                     lists:sublist(SNodes, Wanted)
+             end,
+    {MNode, Slaves}.
+
+%% Returns the elements of L in random order: each element is tagged
+%% with a random float and the pairs are sorted by tag. The process's
+%% random generator is re-seeded from now() on every call.
+shuffle(L) ->
+    {MegaSecs, Secs, MicroSecs} = now(),
+    random:seed(MegaSecs, Secs, MicroSecs),
+    Tagged = [{random:uniform(), Elem} || Elem <- L],
+    {_Tags, Shuffled} = lists:unzip(lists:keysort(1, Tagged)),
+    Shuffled.
+
+%% Mode "exactly" takes a strictly positive integer as ha-params;
+%% anything else is reported as a validation error.
+validate_policy(Count) when is_integer(Count), Count > 0 ->
+    ok;
+validate_policy(Other) ->
+    {error, "ha-mode=\"exactly\" takes an integer, ~p given", [Other]}.
diff --git a/src/rabbit_mirror_queue_mode_nodes.erl b/src/rabbit_mirror_queue_mode_nodes.erl
new file mode 100644
index 00000000..779b439d
--- /dev/null
+++ b/src/rabbit_mirror_queue_mode_nodes.erl
@@ -0,0 +1,70 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License at
+%% http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+%% License for the specific language governing rights and limitations
+%% under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2010-2013 GoPivotal, Inc.  All rights reserved.
+%%
+
+-module(rabbit_mirror_queue_mode_nodes).
+
+-include("rabbit.hrl").
+
+-behaviour(rabbit_mirror_queue_mode).
+
+-export([description/0, suggested_queue_nodes/5, validate_policy/1]).
+
+-rabbit_boot_step({?MODULE,
+                   [{description, "mirror mode nodes"},
+                    {mfa,         {rabbit_registry, register,
+                                   [ha_mode, <<"nodes">>, ?MODULE]}},
+                    {requires,    rabbit_registry},
+                    {enables,     kernel_ready}]}).
+
+%% Behaviour callback: a proplist carrying a human-readable
+%% description of this ha-mode.
+description() ->
+    [{description, <<"Mirror queue to specified nodes">>}].
+
+%% Mode "nodes": Nodes0 is the policy's list of node names (binaries).
+%% Returns {Master, Slaves} restricted to the policy nodes that are
+%% also in Poss (the nodes we are allowed to consider).
+%%
+%% NOTE(review): node names are turned into atoms with list_to_atom/1;
+%% atoms are never garbage collected, so this relies on policies being
+%% set by trusted administrators.
+suggested_queue_nodes(Nodes0, MNode, _SNodes, SSNodes, Poss) ->
+    Nodes1 = [list_to_atom(binary_to_list(Node)) || Node <- Nodes0],
+    %% If the current master is not in the nodes specified, then what we want
+    %% to do depends on whether there are any synchronised slaves. If there
+    %% are then we can just kill the current master - the admin has asked for
+    %% a migration and we should give it to them. If there are not however
+    %% then we must keep the master around so as not to lose messages.
+    Nodes = case SSNodes of
+                [] -> lists:usort([MNode | Nodes1]);
+                _  -> Nodes1
+            end,
+    Unavailable = Nodes -- Poss,
+    Available = Nodes -- Unavailable,
+    case Available of
+        [] -> %% We have never heard of anything? Not much we can do but
+              %% keep the master alive.
+              {MNode, []};
+        _  -> case lists:member(MNode, Available) of
+                  true  -> {MNode, Available -- [MNode]};
+                  false -> %% Make sure the new master is synced! In order to
+                           %% get here SSNodes must not be empty.
+                           NewMNode = pick_synced_master(Available, SSNodes),
+                           {NewMNode, Available -- [NewMNode]}
+              end
+    end.
+
+%% Chooses the node to promote when the master has to move. A
+%% synchronised slave that is also one of the available policy nodes
+%% is preferred, so the new master satisfies the policy; only when no
+%% such node exists do we fall back to the first synchronised slave,
+%% wherever it is. SSNodes must be non-empty.
+pick_synced_master(Available, SSNodes) ->
+    case [Node || Node <- Available, lists:member(Node, SSNodes)] of
+        [Preferred | _] -> Preferred;
+        []              -> hd(SSNodes)
+    end.
+
+%% Mode "nodes" takes a non-empty list of binaries (node names) as
+%% ha-params; anything else is reported as a validation error.
+validate_policy([]) ->
+    {error, "ha-mode=\"nodes\" list must be non-empty", []};
+validate_policy(Nodes) when is_list(Nodes) ->
+    reject_non_binaries([I || I <- Nodes, not is_binary(I)]);
+validate_policy(Params) ->
+    {error, "ha-mode=\"nodes\" takes a list, ~p given", [Params]}.
+
+%% ok when no offending entries were found, otherwise an error
+%% listing all of them.
+reject_non_binaries([]) ->
+    ok;
+reject_non_binaries(Invalid) ->
+    {error, "ha-mode=\"nodes\" takes a list of strings, "
+     "~p was not a string", [Invalid]}.
diff --git a/src/rabbit_mirror_queue_slave.erl b/src/rabbit_mirror_queue_slave.erl
index e412fbbc..18f848c3 100644
--- a/src/rabbit_mirror_queue_slave.erl
+++ b/src/rabbit_mirror_queue_slave.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2010-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2010-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_mirror_queue_slave).
@@ -19,25 +19,16 @@
 %% For general documentation of HA design, see
 %% rabbit_mirror_queue_coordinator
 %%
-%% We join the GM group before we add ourselves to the amqqueue
-%% record. As a result:
-%% 1. We can receive msgs from GM that correspond to messages we will
-%% never receive from publishers.
-%% 2. When we receive a message from publishers, we must receive a
-%% message from the GM group for it.
-%% 3. However, that instruction from the GM group can arrive either
-%% before or after the actual message. We need to be able to
-%% distinguish between GM instructions arriving early, and case (1)
-%% above.
-%%
+%% We receive messages from GM and from publishers, and the gm
+%% messages can arrive either before or after the 'actual' message.
 %% All instructions from the GM group must be processed in the order
 %% in which they're received.
 
 -export([start_link/1, set_maximum_since_use/2, info/1]).
 
 -export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2,
-         code_change/3, handle_pre_hibernate/1, prioritise_call/3,
-         prioritise_cast/2, prioritise_info/2]).
+         code_change/3, handle_pre_hibernate/1, prioritise_call/4,
+         prioritise_cast/3, prioritise_info/3, format_message_queue/2]).
 
 -export([joined/2, members_changed/3, handle_msg/3]).
 
@@ -46,90 +37,78 @@
 
 -include("rabbit.hrl").
 
-%%----------------------------------------------------------------------------
-
 -include("gm_specs.hrl").
 
--ifdef(use_specs).
-%% Shut dialyzer up
--spec(promote_me/2 :: (_, _) -> no_return()).
--endif.
-
 %%----------------------------------------------------------------------------
 
-
--define(CREATION_EVENT_KEYS,
+-define(INFO_KEYS,
         [pid,
          name,
          master_pid,
          is_synchronised
         ]).
 
--define(INFO_KEYS, ?CREATION_EVENT_KEYS).
-
 -define(SYNC_INTERVAL,                 25). %% milliseconds
 -define(RAM_DURATION_UPDATE_INTERVAL,  5000).
 -define(DEATH_TIMEOUT,                 20000). %% 20 seconds
 
 -record(state, { q,
                  gm,
-                 master_pid,
                  backing_queue,
                  backing_queue_state,
                  sync_timer_ref,
                  rate_timer_ref,
 
-                 sender_queues, %% :: Pid -> {Q {Msg, Bool}, Set MsgId}
+                 sender_queues, %% :: Pid -> {Q Msg, Set MsgId, ChState}
                  msg_id_ack,    %% :: MsgId -> AckTag
-                 ack_num,
 
                  msg_id_status,
                  known_senders,
 
-                 synchronised
+                 %% Master depth - local depth
+                 depth_delta
                }).
 
-start_link(Q) ->
-    gen_server2:start_link(?MODULE, Q, []).
+%%----------------------------------------------------------------------------
+
+%% Starts a linked slave process; the queue record Q is handed to
+%% init/1 as its argument.
+start_link(Q) -> gen_server2:start_link(?MODULE, Q, []).
 
 %% Asynchronously forwards Age to the slave process QPid as a
 %% {set_maximum_since_use, Age} cast.
 set_maximum_since_use(QPid, Age) ->
     gen_server2:cast(QPid, {set_maximum_since_use, Age}).
 
-info(QPid) ->
-    gen_server2:call(QPid, info, infinity).
-
-init(#amqqueue { name = QueueName } = Q) ->
+%% Synchronously asks the slave process QPid for its info items;
+%% waits without a timeout.
+info(QPid) -> gen_server2:call(QPid, info, infinity).
+
+init(Q = #amqqueue { name = QName }) ->
+    %% We join the GM group before we add ourselves to the amqqueue
+    %% record. As a result:
+    %% 1. We can receive msgs from GM that correspond to messages we will
+    %% never receive from publishers.
+    %% 2. When we receive a message from publishers, we must receive a
+    %% message from the GM group for it.
+    %% 3. However, that instruction from the GM group can arrive either
+    %% before or after the actual message. We need to be able to
+    %% distinguish between GM instructions arriving early, and case (1)
+    %% above.
+    %%
+    process_flag(trap_exit, true), %% amqqueue_process traps exits too.
+    {ok, GM} = gm:start_link(QName, ?MODULE, [self()],
+                             fun rabbit_misc:execute_mnesia_transaction/1),
+    receive {joined, GM} -> ok end,
     Self = self(),
     Node = node(),
     case rabbit_misc:execute_mnesia_transaction(
-           fun () ->
-                   [Q1 = #amqqueue { pid = QPid, slave_pids = MPids }] =
-                       mnesia:read({rabbit_queue, QueueName}),
-                   case [Pid || Pid <- [QPid | MPids], node(Pid) =:= Node] of
-                       []     -> MPids1 = MPids ++ [Self],
-                                 ok = rabbit_amqqueue:store_queue(
-                                        Q1 #amqqueue { slave_pids = MPids1 }),
-                                 {new, QPid};
-                       [SPid] -> true = rabbit_misc:is_process_alive(SPid),
-                                 existing
-                   end
-           end) of
-        {new, MPid} ->
-            process_flag(trap_exit, true), %% amqqueue_process traps exits too.
-            {ok, GM} = gm:start_link(QueueName, ?MODULE, [self()]),
-            receive {joined, GM} ->
-                    ok
-            end,
-            erlang:monitor(process, MPid),
+           fun() -> init_it(Self, GM, Node, QName) end) of
+        {new, QPid, GMPids} ->
+            erlang:monitor(process, QPid),
             ok = file_handle_cache:register_callback(
                    rabbit_amqqueue, set_maximum_since_use, [Self]),
             ok = rabbit_memory_monitor:register(
                    Self, {rabbit_amqqueue, set_ram_duration_target, [Self]}),
             {ok, BQ} = application:get_env(backing_queue_module),
-            BQS = bq_init(BQ, Q, false),
-            State = #state { q                   = Q,
+            Q1 = Q #amqqueue { pid = QPid },
+            BQS = bq_init(BQ, Q1, false),
+            State = #state { q                   = Q1,
                              gm                  = GM,
-                             master_pid          = MPid,
                              backing_queue       = BQ,
                              backing_queue_state = BQS,
                              rate_timer_ref      = undefined,
@@ -137,70 +116,82 @@ init(#amqqueue { name = QueueName } = Q) ->
 
                              sender_queues       = dict:new(),
                              msg_id_ack          = dict:new(),
-                             ack_num             = 0,
 
                              msg_id_status       = dict:new(),
-                             known_senders       = pmon:new(),
+                             known_senders       = pmon:new(delegate),
 
-                             synchronised        = false
+                             depth_delta         = undefined
                    },
-            rabbit_event:notify(queue_slave_created,
-                                infos(?CREATION_EVENT_KEYS, State)),
-            ok = gm:broadcast(GM, request_length),
+            ok = gm:broadcast(GM, request_depth),
+            ok = gm:validate_members(GM, [GM | [G || {G, _} <- GMPids]]),
             {ok, State, hibernate,
              {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN,
               ?DESIRED_HIBERNATE}};
+        {stale, StalePid} ->
+            {stop, {stale_master_pid, StalePid}};
+        duplicate_live_master ->
+            {stop, {duplicate_live_master, Node}};
         existing ->
+            gm:leave(GM),
             ignore
     end.
 
-handle_call({deliver, Delivery = #delivery { immediate = true }},
-            From, State) ->
-    %% It is safe to reply 'false' here even if a) we've not seen the
-    %% msg via gm, or b) the master dies before we receive the msg via
-    %% gm. In the case of (a), we will eventually receive the msg via
-    %% gm, and it's only the master's result to the channel that is
-    %% important. In the case of (b), if the master does die and we do
-    %% get promoted then at that point we have no consumers, thus
-    %% 'false' is precisely the correct answer. However, we must be
-    %% careful to _not_ enqueue the message in this case.
-
-    %% Note this is distinct from the case where we receive the msg
-    %% via gm first, then we're promoted to master, and only then do
-    %% we receive the msg from the channel.
-    gen_server2:reply(From, false), %% master may deliver it, not us
-    noreply(maybe_enqueue_message(Delivery, false, State));
-
-handle_call({deliver, Delivery = #delivery { mandatory = true }},
-            From, State) ->
-    gen_server2:reply(From, true), %% amqqueue throws away the result anyway
-    noreply(maybe_enqueue_message(Delivery, true, State));
+%% Decides whether this process (Self, with GM process GM) may become
+%% the slave for queue QName on Node. Runs inside the Mnesia
+%% transaction started by init/1. Crashes with badmatch if the queue
+%% record does not exist.
+%%
+%% Returns one of:
+%%   {new, MasterPid, GMPids} - Self was recorded as a slave
+%%   duplicate_live_master    - a live master already runs on Node
+%%   {stale, MasterPid}       - the master recorded on Node is dead
+%%   existing                 - a live slave already runs on Node
+init_it(Self, GM, Node, QName) ->
+    [Q = #amqqueue { pid = QPid, slave_pids = SPids, gm_pids = GMPids }] =
+        mnesia:read({rabbit_queue, QName}),
+    %% Find any queue process (master or slave) already on this node.
+    case [Pid || Pid <- [QPid | SPids], node(Pid) =:= Node] of
+        []     -> add_slave(Q, Self, GM),
+                  {new, QPid, GMPids};
+        %% QPid is already bound, so this clause matches only when the
+        %% process found on this node is the master itself.
+        [QPid] -> case rabbit_misc:is_process_alive(QPid) of
+                      true  -> duplicate_live_master;
+                      false -> {stale, QPid}
+                  end;
+        [SPid] -> case rabbit_misc:is_process_alive(SPid) of
+                      true  -> existing;
+                      %% Dead slave left in the record: remove it and
+                      %% its GM entry, then take its place. Note that
+                      %% the GMPids returned are the ones read above,
+                      %% i.e. still including the dead slave's entry.
+                      false -> Q1 = Q#amqqueue {
+                                      slave_pids = SPids -- [SPid],
+                                      gm_pids    = [T || T = {_, S} <- GMPids,
+                                                         S =/= SPid] },
+                               add_slave(Q1, Self, GM),
+                               {new, QPid, GMPids}
+                  end
+    end.
+
+%% Records New as a slave of Q together with its GM process and
+%% stores the updated record.
+%%
+%% Add to the end, so they are in descending order of age, see
+%% rabbit_mirror_queue_misc:promote_slave/1
+add_slave(Q = #amqqueue { slave_pids = SPids, gm_pids = GMPids }, New, GM) ->
+    WithNew = Q#amqqueue{slave_pids = SPids ++ [New],
+                         gm_pids    = [{GM, New} | GMPids]},
+    rabbit_mirror_queue_misc:store_updated_slaves(WithNew).
+
+handle_call({deliver, Delivery, true}, From, State) ->
+    %% Synchronous, "mandatory" deliver mode.
+    gen_server2:reply(From, ok),
+    noreply(maybe_enqueue_message(Delivery, State));
 
 handle_call({gm_deaths, Deaths}, From,
-            State = #state { q          = #amqqueue { name = QueueName },
-                             gm         = GM,
-                             master_pid = MPid }) ->
-    %% The GM has told us about deaths, which means we're not going to
-    %% receive any more messages from GM
-    case rabbit_mirror_queue_misc:remove_from_queue(QueueName, Deaths) of
+            State = #state { q = Q = #amqqueue { name = QName, pid = MPid }}) ->
+    Self = self(),
+    case rabbit_mirror_queue_misc:remove_from_queue(QName, Self, Deaths) of
         {error, not_found} ->
             gen_server2:reply(From, ok),
             {stop, normal, State};
         {ok, Pid, DeadPids} ->
-            rabbit_mirror_queue_misc:report_deaths(self(), false, QueueName,
+            rabbit_mirror_queue_misc:report_deaths(Self, false, QName,
                                                    DeadPids),
-            if node(Pid) =:= node(MPid) ->
+            case Pid of
+                MPid ->
                     %% master hasn't changed
-                    reply(ok, State);
-               node(Pid) =:= node() ->
+                    gen_server2:reply(From, ok),
+                    noreply(State);
+                Self ->
                     %% we've become master
-                    promote_me(From, State);
-               true ->
-                    %% master has changed to not us.
+                    QueueState = promote_me(From, State),
+                    {become, rabbit_amqqueue_process, QueueState, hibernate};
+                _ ->
+                    %% master has changed to not us
                     gen_server2:reply(From, ok),
                     erlang:monitor(process, Pid),
-                    ok = gm:broadcast(GM, heartbeat),
-                    noreply(State #state { master_pid = Pid })
+                    noreply(State #state { q = Q #amqqueue { pid = Pid } })
             end
     end;
 
@@ -213,13 +204,39 @@ handle_cast({run_backing_queue, Mod, Fun}, State) ->
 handle_cast({gm, Instruction}, State) ->
     handle_process_result(process_instruction(Instruction, State));
 
-handle_cast({deliver, Delivery = #delivery{sender = Sender}, Flow}, State) ->
-    %% Asynchronous, non-"mandatory", non-"immediate" deliver mode.
+handle_cast({deliver, Delivery = #delivery{sender = Sender}, true, Flow},
+            State) ->
+    %% Asynchronous, non-"mandatory", deliver mode.
     case Flow of
         flow   -> credit_flow:ack(Sender);
         noflow -> ok
     end,
-    noreply(maybe_enqueue_message(Delivery, true, State));
+    noreply(maybe_enqueue_message(Delivery, State));
+
+handle_cast({sync_start, Ref, Syncer},
+            State = #state { depth_delta         = DD,
+                             backing_queue       = BQ,
+                             backing_queue_state = BQS }) ->
+    State1 = #state{rate_timer_ref = TRef} = ensure_rate_timer(State),
+    S = fun({MA, TRefN, BQSN}) ->
+                State1#state{depth_delta         = undefined,
+                             msg_id_ack          = dict:from_list(MA),
+                             rate_timer_ref      = TRefN,
+                             backing_queue_state = BQSN}
+        end,
+    case rabbit_mirror_queue_sync:slave(
+           DD, Ref, TRef, Syncer, BQ, BQS,
+           fun (BQN, BQSN) ->
+                   BQSN1 = update_ram_duration(BQN, BQSN),
+                   TRefN = erlang:send_after(?RAM_DURATION_UPDATE_INTERVAL,
+                                             self(), update_ram_duration),
+                   {TRefN, BQSN1}
+           end) of
+        denied              -> noreply(State1);
+        {ok,           Res} -> noreply(set_delta(0, S(Res)));
+        {failed,       Res} -> noreply(S(Res));
+        {stop, Reason, Res} -> {stop, Reason, S(Res)}
+    end;
 
 handle_cast({set_maximum_since_use, Age}, State) ->
     ok = file_handle_cache:set_maximum_since_use(Age),
@@ -231,15 +248,14 @@ handle_cast({set_ram_duration_target, Duration},
     BQS1 = BQ:set_ram_duration_target(Duration, BQS),
     noreply(State #state { backing_queue_state = BQS1 }).
 
-handle_info(update_ram_duration,
-            State = #state { backing_queue = BQ,
-                             backing_queue_state = BQS }) ->
-    {RamDuration, BQS1} = BQ:ram_duration(BQS),
-    DesiredDuration =
-        rabbit_memory_monitor:report_ram_duration(self(), RamDuration),
-    BQS2 = BQ:set_ram_duration_target(DesiredDuration, BQS1),
-    noreply(State #state { rate_timer_ref = just_measured,
-                           backing_queue_state = BQS2 });
+handle_info(update_ram_duration, State = #state{backing_queue       = BQ,
+                                                backing_queue_state = BQS}) ->
+    BQS1 = update_ram_duration(BQ, BQS),
+    %% Don't call noreply/1 here, as it would set the rate timer again
+    {State1, Timeout} = next_state(State #state {
+                                     rate_timer_ref      = undefined,
+                                     backing_queue_state = BQS1 }),
+    {noreply, State1, Timeout};
 
 handle_info(sync_timeout, State) ->
     noreply(backing_queue_timeout(
@@ -249,12 +265,13 @@ handle_info(timeout, State) ->
     noreply(backing_queue_timeout(State));
 
 handle_info({'DOWN', _MonitorRef, process, MPid, _Reason},
-           State = #state { gm = GM, master_pid = MPid }) ->
-    ok = gm:broadcast(GM, {process_death, MPid}),
+            State = #state { gm = GM, q = #amqqueue { pid = MPid } }) ->
+    ok = gm:broadcast(GM, process_death),
     noreply(State);
 
 handle_info({'DOWN', _MonitorRef, process, ChPid, _Reason}, State) ->
-    noreply(local_sender_death(ChPid, State));
+    local_sender_death(ChPid, State),
+    noreply(maybe_forget_sender(ChPid, down_from_ch, State));
 
 handle_info({'EXIT', _Pid, Reason}, State) ->
     {stop, Reason, State};
@@ -286,7 +303,7 @@ terminate(Reason, #state { q                   = Q,
                            rate_timer_ref      = RateTRef }) ->
     ok = gm:leave(GM),
     QueueState = rabbit_amqqueue_process:init_with_backing_queue_state(
-                   Q, BQ, BQS, RateTRef, [], [], pmon:new(), dict:new()),
+                   Q, BQ, BQS, RateTRef, [], pmon:new(), dict:new()),
     rabbit_amqqueue_process:terminate(Reason, QueueState);
 terminate([_SPid], _Reason) ->
     %% gm case
@@ -304,67 +321,71 @@ handle_pre_hibernate(State = #state { backing_queue       = BQ,
     BQS3 = BQ:handle_pre_hibernate(BQS2),
     {hibernate, stop_rate_timer(State #state { backing_queue_state = BQS3 })}.
 
-prioritise_call(Msg, _From, _State) ->
+prioritise_call(Msg, _From, _Len, _State) ->
     case Msg of
         info                                 -> 9;
         {gm_deaths, _Deaths}                 -> 5;
         _                                    -> 0
     end.
 
-prioritise_cast(Msg, _State) ->
+prioritise_cast(Msg, _Len, _State) ->
     case Msg of
         {set_ram_duration_target, _Duration} -> 8;
         {set_maximum_since_use, _Age}        -> 8;
         {run_backing_queue, _Mod, _Fun}      -> 6;
         {gm, _Msg}                           -> 5;
-        {post_commit, _Txn, _AckTags}        -> 4;
         _                                    -> 0
     end.
 
-prioritise_info(Msg, _State) ->
+prioritise_info(Msg, _Len, _State) ->
     case Msg of
         update_ram_duration                  -> 8;
         sync_timeout                         -> 6;
         _                                    -> 0
     end.
 
+format_message_queue(Opt, MQ) -> rabbit_misc:format_message_queue(Opt, MQ).
+
 %% ---------------------------------------------------------------------------
 %% GM
 %% ---------------------------------------------------------------------------
 
-joined([SPid], _Members) ->
-    SPid ! {joined, self()},
-    ok.
+joined([SPid], _Members) -> SPid ! {joined, self()}, ok.
 
-members_changed([_SPid], _Births, []) ->
-    ok;
-members_changed([SPid], _Births, Deaths) ->
-    inform_deaths(SPid, Deaths).
+members_changed([_SPid], _Births,     []) -> ok;
+members_changed([ SPid], _Births, Deaths) -> inform_deaths(SPid, Deaths).
 
-handle_msg([_SPid], _From, heartbeat) ->
-    ok;
-handle_msg([_SPid], _From, request_length) ->
+handle_msg([_SPid], _From, request_depth) ->
     %% This is only of value to the master
     ok;
 handle_msg([_SPid], _From, {ensure_monitoring, _Pid}) ->
     %% This is only of value to the master
     ok;
-handle_msg([SPid], _From, {process_death, Pid}) ->
-    inform_deaths(SPid, [Pid]);
+handle_msg([_SPid], _From, process_death) ->
+    %% Since GM is by nature lazy we need to make sure there is some
+    %% traffic when a master dies, to make sure we get informed of the
+    %% death. That's all process_death does, create some traffic. We
+    %% must not take any notice of the master death here since it
+    %% comes without ordering guarantees - there could still be
+    %% messages from the master we have yet to receive. When we get
+    %% members_changed, then there will be no more messages.
+    ok;
+handle_msg([CPid], _From, {delete_and_terminate, _Reason} = Msg) ->
+    ok = gen_server2:cast(CPid, {gm, Msg}),
+    {stop, {shutdown, ring_shutdown}};
+handle_msg([SPid], _From, {sync_start, Ref, Syncer, SPids}) ->
+    case lists:member(SPid, SPids) of
+        true  -> gen_server2:cast(SPid, {sync_start, Ref, Syncer});
+        false -> ok
+    end;
 handle_msg([SPid], _From, Msg) ->
     ok = gen_server2:cast(SPid, {gm, Msg}).
 
 inform_deaths(SPid, Deaths) ->
-    rabbit_misc:with_exit_handler(
-      fun () -> {stop, normal} end,
-      fun () ->
-              case gen_server2:call(SPid, {gm_deaths, Deaths}, infinity) of
-                  ok ->
-                      ok;
-                  {promote, CPid} ->
-                      {become, rabbit_mirror_queue_coordinator, [CPid]}
-              end
-      end).
+    case gen_server2:call(SPid, {gm_deaths, Deaths}, infinity) of
+        ok              -> ok;
+        {promote, CPid} -> {become, rabbit_mirror_queue_coordinator, [CPid]}
+    end.
 
 %% ---------------------------------------------------------------------------
 %% Others
@@ -374,8 +395,8 @@ infos(Items, State) -> [{Item, i(Item, State)} || Item <- Items].
 
 i(pid,             _State)                                   -> self();
 i(name,            #state { q = #amqqueue { name = Name } }) -> Name;
-i(master_pid,      #state { master_pid = MPid })             -> MPid;
-i(is_synchronised, #state { synchronised = Synchronised })   -> Synchronised;
+i(master_pid,      #state { q = #amqqueue { pid  = MPid } }) -> MPid;
+i(is_synchronised, #state { depth_delta = DD })              -> DD =:= 0;
 i(Item,            _State) -> throw({bad_argument, Item}).
 
 bq_init(BQ, Q, Recover) ->
@@ -393,14 +414,20 @@ run_backing_queue(Mod, Fun, State = #state { backing_queue       = BQ,
                                              backing_queue_state = BQS }) ->
     State #state { backing_queue_state = BQ:invoke(Mod, Fun, BQS) }.
 
-needs_confirming(#delivery{ msg_seq_no = undefined }, _State) ->
-    never;
-needs_confirming(#delivery { message = #basic_message {
-                               is_persistent = true } },
-                 #state { q = #amqqueue { durable = true } }) ->
-    eventually;
-needs_confirming(_Delivery, _State) ->
-    immediately.
+send_or_record_confirm(_, #delivery{ msg_seq_no = undefined }, MS, _State) ->
+    MS;
+send_or_record_confirm(published, #delivery { sender     = ChPid,
+                                              msg_seq_no = MsgSeqNo,
+                                              message    = #basic_message {
+                                                id            = MsgId,
+                                                is_persistent = true } },
+                       MS, #state { q = #amqqueue { durable = true } }) ->
+    dict:store(MsgId, {published, ChPid, MsgSeqNo} , MS);
+send_or_record_confirm(_Status, #delivery { sender     = ChPid,
+                                            msg_seq_no = MsgSeqNo },
+                       MS, _State) ->
+    ok = rabbit_misc:confirm_to_sender(ChPid, [MsgSeqNo]),
+    MS.
 
 confirm_messages(MsgIds, State = #state { msg_id_status = MS }) ->
     {CMs, MS1} =
@@ -412,16 +439,16 @@ confirm_messages(MsgIds, State = #state { msg_id_status = MS }) ->
                           %% If it needed confirming, it'll have
                           %% already been done.
                           Acc;
-                      {ok, {published, ChPid}} ->
+                      {ok, published} ->
                           %% Still not seen it from the channel, just
                           %% record that it's been confirmed.
-                          {CMsN, dict:store(MsgId, {confirmed, ChPid}, MSN)};
+                          {CMsN, dict:store(MsgId, confirmed, MSN)};
                       {ok, {published, ChPid, MsgSeqNo}} ->
                           %% Seen from both GM and Channel. Can now
                           %% confirm.
                           {rabbit_misc:gb_trees_cons(ChPid, MsgSeqNo, CMsN),
                            dict:erase(MsgId, MSN)};
-                      {ok, {confirmed, _ChPid}} ->
+                      {ok, confirmed} ->
                           %% It's already been confirmed. This is
                           %% probably it's been both sync'd to disk
                           %% and then delivered and ack'd before we've
@@ -436,6 +463,9 @@ confirm_messages(MsgIds, State = #state { msg_id_status = MS }) ->
 handle_process_result({ok,   State}) -> noreply(State);
 handle_process_result({stop, State}) -> {stop, normal, State}.
 
+-ifdef(use_specs).
+-spec(promote_me/2 :: ({pid(), term()}, #state{}) -> no_return()).
+-endif.
 promote_me(From, #state { q                   = Q = #amqqueue { name = QName },
                           gm                  = GM,
                           backing_queue       = BQ,
@@ -445,17 +475,14 @@ promote_me(From, #state { q                   = Q = #amqqueue { name = QName },
                           msg_id_ack          = MA,
                           msg_id_status       = MS,
                           known_senders       = KS }) ->
-    rabbit_event:notify(queue_slave_promoted, [{pid,  self()},
-                                               {name, QName}]),
     rabbit_log:info("Mirrored-queue (~s): Promoting slave ~s to master~n",
                     [rabbit_misc:rs(QName), rabbit_misc:pid_to_string(self())]),
     Q1 = Q #amqqueue { pid = self() },
     {ok, CPid} = rabbit_mirror_queue_coordinator:start_link(
                    Q1, GM, rabbit_mirror_queue_master:sender_death_fun(),
-                   rabbit_mirror_queue_master:length_fun()),
+                   rabbit_mirror_queue_master:depth_fun()),
     true = unlink(GM),
     gen_server2:reply(From, {promote, CPid}),
-    ok = gm:confirmed_broadcast(GM, heartbeat),
 
     %% Everything that we're monitoring, we need to ensure our new
     %% coordinator is monitoring.
@@ -463,8 +490,7 @@ promote_me(From, #state { q                   = Q = #amqqueue { name = QName },
     ok = rabbit_mirror_queue_coordinator:ensure_monitoring(CPid, MPids),
 
     %% We find all the messages that we've received from channels but
-    %% not from gm, and if they're due to be enqueued on promotion
-    %% then we pass them to the
+    %% not from gm, and pass them to the
     %% queue_process:init_with_backing_queue_state to be enqueued.
     %%
     %% We also have to requeue messages which are pending acks: the
@@ -492,18 +518,18 @@ promote_me(From, #state { q                   = Q = #amqqueue { name = QName },
     %%
     %% MS contains the following three entry types:
     %%
-    %% a) {published, ChPid}:
+    %% a) published:
     %%   published via gm only; pending arrival of publication from
     %%   channel, maybe pending confirm.
     %%
     %% b) {published, ChPid, MsgSeqNo}:
     %%   published via gm and channel; pending confirm.
     %%
-    %% c) {confirmed, ChPid}:
+    %% c) confirmed:
     %%   published via gm only, and confirmed; pending publication
     %%   from channel.
     %%
-    %% d) discarded
+    %% d) discarded:
     %%   seen via gm only as discarded. Pending publication from
     %%   channel
     %%
@@ -520,96 +546,76 @@ promote_me(From, #state { q                   = Q = #amqqueue { name = QName },
     %% those messages are then requeued. However, as discussed above,
     %% this does not affect MS, nor which bits go through to SS in
     %% Master, or MTC in queue_process.
-    %%
-    %% Everything that's in MA gets requeued. Consequently the new
-    %% master should start with a fresh AM as there are no messages
-    %% pending acks.
 
-    MSList = dict:to_list(MS),
-    SS = dict:from_list(
-           [E || E = {_MsgId, discarded} <- MSList] ++
-               [{MsgId, Status}
-                || {MsgId, {Status, _ChPid}} <- MSList,
-                   Status =:= published orelse Status =:= confirmed]),
+    St = [published, confirmed, discarded],
+    SS = dict:filter(fun (_MsgId, Status) -> lists:member(Status, St) end, MS),
+    AckTags = [AckTag || {_MsgId, AckTag} <- dict:to_list(MA)],
 
     MasterState = rabbit_mirror_queue_master:promote_backing_queue_state(
-                    CPid, BQ, BQS, GM, SS, MPids),
-
-    MTC = lists:foldl(fun ({MsgId, {published, ChPid, MsgSeqNo}}, MTC0) ->
-                              gb_trees:insert(MsgId, {ChPid, MsgSeqNo}, MTC0);
-                          (_, MTC0) ->
-                              MTC0
-                      end, gb_trees:empty(), MSList),
-    NumAckTags = [NumAckTag || {_MsgId, NumAckTag} <- dict:to_list(MA)],
-    AckTags = [AckTag || {_Num, AckTag} <- lists:sort(NumAckTags)],
-    Deliveries = [Delivery || {_ChPid, {PubQ, _PendCh}} <- dict:to_list(SQ),
-                              {Delivery, true} <- queue:to_list(PubQ)],
-    QueueState = rabbit_amqqueue_process:init_with_backing_queue_state(
-                   Q1, rabbit_mirror_queue_master, MasterState, RateTRef,
-                   AckTags, Deliveries, KS, MTC),
-    {become, rabbit_amqqueue_process, QueueState, hibernate}.
+                    QName, CPid, BQ, BQS, GM, AckTags, SS, MPids),
+
+    MTC = dict:fold(fun (MsgId, {published, ChPid, MsgSeqNo}, MTC0) ->
+                            gb_trees:insert(MsgId, {ChPid, MsgSeqNo}, MTC0);
+                        (_Msgid, _Status, MTC0) ->
+                            MTC0
+                    end, gb_trees:empty(), MS),
+    Deliveries = [Delivery ||
+                   {_ChPid, {PubQ, _PendCh, _ChState}} <- dict:to_list(SQ),
+                   Delivery <- queue:to_list(PubQ)],
+    AwaitGmDown = [ChPid || {ChPid, {_, _, down_from_ch}} <- dict:to_list(SQ)],
+    KS1 = lists:foldl(fun (ChPid0, KS0) ->
+                              pmon:demonitor(ChPid0, KS0)
+                      end, KS, AwaitGmDown),
+    rabbit_amqqueue_process:init_with_backing_queue_state(
+      Q1, rabbit_mirror_queue_master, MasterState, RateTRef, Deliveries, KS1,
+      MTC).
 
 noreply(State) ->
     {NewState, Timeout} = next_state(State),
-    {noreply, NewState, Timeout}.
+    {noreply, ensure_rate_timer(NewState), Timeout}.
 
 reply(Reply, State) ->
     {NewState, Timeout} = next_state(State),
-    {reply, Reply, NewState, Timeout}.
+    {reply, Reply, ensure_rate_timer(NewState), Timeout}.
 
 next_state(State = #state{backing_queue = BQ, backing_queue_state = BQS}) ->
     {MsgIds, BQS1} = BQ:drain_confirmed(BQS),
-    State1 = ensure_rate_timer(
-               confirm_messages(MsgIds, State #state {
-                                          backing_queue_state = BQS1 })),
+    State1 = confirm_messages(MsgIds,
+                              State #state { backing_queue_state = BQS1 }),
     case BQ:needs_timeout(BQS1) of
-        false -> {stop_sync_timer(State1),   hibernate};
-        idle  -> {stop_sync_timer(State1),   0        };
-        timed -> {ensure_sync_timer(State1), 0        }
+        false -> {stop_sync_timer(State1),   hibernate     };
+        idle  -> {stop_sync_timer(State1),   ?SYNC_INTERVAL};
+        timed -> {ensure_sync_timer(State1), 0             }
     end.
 
 backing_queue_timeout(State = #state { backing_queue = BQ }) ->
     run_backing_queue(BQ, fun (M, BQS) -> M:timeout(BQS) end, State).
 
-ensure_sync_timer(State = #state { sync_timer_ref = undefined }) ->
-    TRef = erlang:send_after(?SYNC_INTERVAL, self(), sync_timeout),
-    State #state { sync_timer_ref = TRef };
 ensure_sync_timer(State) ->
-    State.
+    rabbit_misc:ensure_timer(State, #state.sync_timer_ref,
+                             ?SYNC_INTERVAL, sync_timeout).
+
+stop_sync_timer(State) -> rabbit_misc:stop_timer(State, #state.sync_timer_ref).
 
-stop_sync_timer(State = #state { sync_timer_ref = undefined }) ->
-    State;
-stop_sync_timer(State = #state { sync_timer_ref = TRef }) ->
-    erlang:cancel_timer(TRef),
-    State #state { sync_timer_ref = undefined }.
-
-ensure_rate_timer(State = #state { rate_timer_ref = undefined }) ->
-    TRef = erlang:send_after(?RAM_DURATION_UPDATE_INTERVAL,
-                             self(), update_ram_duration),
-    State #state { rate_timer_ref = TRef };
-ensure_rate_timer(State = #state { rate_timer_ref = just_measured }) ->
-    State #state { rate_timer_ref = undefined };
 ensure_rate_timer(State) ->
-    State.
+    rabbit_misc:ensure_timer(State, #state.rate_timer_ref,
+                             ?RAM_DURATION_UPDATE_INTERVAL,
+                             update_ram_duration).
 
-stop_rate_timer(State = #state { rate_timer_ref = undefined }) ->
-    State;
-stop_rate_timer(State = #state { rate_timer_ref = just_measured }) ->
-    State #state { rate_timer_ref = undefined };
-stop_rate_timer(State = #state { rate_timer_ref = TRef }) ->
-    erlang:cancel_timer(TRef),
-    State #state { rate_timer_ref = undefined }.
+stop_rate_timer(State) -> rabbit_misc:stop_timer(State, #state.rate_timer_ref).
 
 ensure_monitoring(ChPid, State = #state { known_senders = KS }) ->
     State #state { known_senders = pmon:monitor(ChPid, KS) }.
 
-local_sender_death(ChPid, State = #state { known_senders = KS }) ->
+local_sender_death(ChPid, #state { known_senders = KS }) ->
+    %% The channel will be monitored iff we have received a delivery
+    %% from it but not heard about its death from the master. So if it
+    %% is monitored we need to point the death out to the master (see
+    %% essay).
     ok = case pmon:is_monitored(ChPid, KS) of
              false -> ok;
-             true  -> credit_flow:peer_down(ChPid),
-                      confirm_sender_death(ChPid)
-         end,
-    State.
+             true  -> confirm_sender_death(ChPid)
+         end.
 
 confirm_sender_death(Pid) ->
     %% We have to deal with the possibility that we'll be promoted to
@@ -620,6 +626,10 @@ confirm_sender_death(Pid) ->
         fun (?MODULE, State = #state { known_senders = KS,
                                        gm            = GM }) ->
                 %% We're running still as a slave
+                %%
+                %% See comment in local_sender_death/2; we might have
+                %% received a sender_death in the meanwhile so check
+                %% again.
                 ok = case pmon:is_monitored(Pid, KS) of
                          false -> ok;
                          true  -> gm:broadcast(GM, {ensure_monitoring, [Pid]}),
@@ -634,62 +644,62 @@ confirm_sender_death(Pid) ->
                 State
         end,
     %% Note that we do not remove our knowledge of this ChPid until we
-    %% get the sender_death from GM.
+    %% get the sender_death from GM as well as a DOWN notification.
     {ok, _TRef} = timer:apply_after(
                     ?DEATH_TIMEOUT, rabbit_amqqueue, run_backing_queue,
                     [self(), rabbit_mirror_queue_master, Fun]),
     ok.
 
+forget_sender(running, _)                        -> false;
+forget_sender(_, running)                        -> false;
+forget_sender(Down1, Down2) when Down1 =/= Down2 -> true.
+
+%% Record and process lifetime events from channels. Forget all about a channel
+%% only when down notifications are received from both the channel and from gm.
+maybe_forget_sender(ChPid, ChState, State = #state { sender_queues = SQ,
+                                                     msg_id_status = MS,
+                                                     known_senders = KS }) ->
+    case dict:find(ChPid, SQ) of
+        error ->
+            State;
+        {ok, {MQ, PendCh, ChStateRecord}} ->
+            case forget_sender(ChState, ChStateRecord) of
+                true ->
+                    credit_flow:peer_down(ChPid),
+                    State #state { sender_queues = dict:erase(ChPid, SQ),
+                                   msg_id_status = lists:foldl(
+                                                     fun dict:erase/2,
+                                                     MS, sets:to_list(PendCh)),
+                                   known_senders = pmon:demonitor(ChPid, KS) };
+                false ->
+                    SQ1 = dict:store(ChPid, {MQ, PendCh, ChState}, SQ),
+                    State #state { sender_queues = SQ1 }
+            end
+    end.
+
 maybe_enqueue_message(
-  Delivery = #delivery { message    = #basic_message { id = MsgId },
-                         msg_seq_no = MsgSeqNo,
-                         sender     = ChPid },
-  EnqueueOnPromotion,
+  Delivery = #delivery { message = #basic_message { id = MsgId },
+                         sender  = ChPid },
   State = #state { sender_queues = SQ, msg_id_status = MS }) ->
     State1 = ensure_monitoring(ChPid, State),
     %% We will never see {published, ChPid, MsgSeqNo} here.
     case dict:find(MsgId, MS) of
         error ->
-            {MQ, PendingCh} = get_sender_queue(ChPid, SQ),
-            MQ1 = queue:in({Delivery, EnqueueOnPromotion}, MQ),
-            SQ1 = dict:store(ChPid, {MQ1, PendingCh}, SQ),
+            {MQ, PendingCh, ChState} = get_sender_queue(ChPid, SQ),
+            MQ1 = queue:in(Delivery, MQ),
+            SQ1 = dict:store(ChPid, {MQ1, PendingCh, ChState}, SQ),
             State1 #state { sender_queues = SQ1 };
-        {ok, {confirmed, ChPid}} ->
-            %% BQ has confirmed it but we didn't know what the
-            %% msg_seq_no was at the time. We do now!
-            ok = rabbit_misc:confirm_to_sender(ChPid, [MsgSeqNo]),
-            SQ1 = remove_from_pending_ch(MsgId, ChPid, SQ),
-            State1 #state { sender_queues = SQ1,
-                            msg_id_status = dict:erase(MsgId, MS) };
-        {ok, {published, ChPid}} ->
-            %% It was published to the BQ and we didn't know the
-            %% msg_seq_no so couldn't confirm it at the time.
-            case needs_confirming(Delivery, State1) of
-                never ->
-                    SQ1 = remove_from_pending_ch(MsgId, ChPid, SQ),
-                    State1 #state { msg_id_status = dict:erase(MsgId, MS),
-                                    sender_queues = SQ1 };
-                eventually ->
-                    State1 #state {
-                      msg_id_status =
-                          dict:store(MsgId, {published, ChPid, MsgSeqNo}, MS) };
-                immediately ->
-                    ok = rabbit_misc:confirm_to_sender(ChPid, [MsgSeqNo]),
-                    SQ1 = remove_from_pending_ch(MsgId, ChPid, SQ),
-                    State1 #state { msg_id_status = dict:erase(MsgId, MS),
-                                    sender_queues = SQ1 }
-            end;
-        {ok, discarded} ->
-            %% We've already heard from GM that the msg is to be
-            %% discarded. We won't see this again.
+        {ok, Status} ->
+            MS1 = send_or_record_confirm(
+                    Status, Delivery, dict:erase(MsgId, MS), State1),
             SQ1 = remove_from_pending_ch(MsgId, ChPid, SQ),
-            State1 #state { msg_id_status = dict:erase(MsgId, MS),
+            State1 #state { msg_id_status = MS1,
                             sender_queues = SQ1 }
     end.
 
 get_sender_queue(ChPid, SQ) ->
     case dict:find(ChPid, SQ) of
-        error     -> {queue:new(), sets:new()};
+        error     -> {queue:new(), sets:new(), running};
         {ok, Val} -> Val
     end.
 
@@ -697,49 +707,32 @@ remove_from_pending_ch(MsgId, ChPid, SQ) ->
     case dict:find(ChPid, SQ) of
         error ->
             SQ;
-        {ok, {MQ, PendingCh}} ->
-            dict:store(ChPid, {MQ, sets:del_element(MsgId, PendingCh)}, SQ)
+        {ok, {MQ, PendingCh, ChState}} ->
+            dict:store(ChPid, {MQ, sets:del_element(MsgId, PendingCh), ChState},
+                       SQ)
     end.
 
-process_instruction(
-  {publish, Deliver, ChPid, MsgProps, Msg = #basic_message { id = MsgId }},
-  State = #state { sender_queues       = SQ,
-                   backing_queue       = BQ,
-                   backing_queue_state = BQS,
-                   msg_id_status       = MS }) ->
-
-    %% We really are going to do the publish right now, even though we
-    %% may not have seen it directly from the channel. As a result, we
-    %% may know that it needs confirming without knowing its
-    %% msg_seq_no, which means that we can see the confirmation come
-    %% back from the backing queue without knowing the msg_seq_no,
-    %% which means that we're going to have to hang on to the fact
-    %% that we've seen the msg_id confirmed until we can associate it
-    %% with a msg_seq_no.
+publish_or_discard(Status, ChPid, MsgId,
+                   State = #state { sender_queues = SQ, msg_id_status = MS }) ->
+    %% We really are going to do the publish/discard right now, even
+    %% though we may not have seen it directly from the channel. But
+    %% we cannot issue confirms until the latter has happened. So we
+    %% need to keep track of the MsgId and its confirmation status in
+    %% the meantime.
     State1 = ensure_monitoring(ChPid, State),
-    {MQ, PendingCh} = get_sender_queue(ChPid, SQ),
+    {MQ, PendingCh, ChState} = get_sender_queue(ChPid, SQ),
     {MQ1, PendingCh1, MS1} =
         case queue:out(MQ) of
             {empty, _MQ2} ->
                 {MQ, sets:add_element(MsgId, PendingCh),
-                 dict:store(MsgId, {published, ChPid}, MS)};
-            {{value, {Delivery = #delivery {
-                        msg_seq_no = MsgSeqNo,
-                        message    = #basic_message { id = MsgId } },
-                      _EnqueueOnPromotion}}, MQ2} ->
-                %% We received the msg from the channel first. Thus we
-                %% need to deal with confirms here.
-                case needs_confirming(Delivery, State1) of
-                    never ->
-                        {MQ2, PendingCh, MS};
-                    eventually ->
-                        {MQ2, PendingCh,
-                         dict:store(MsgId, {published, ChPid, MsgSeqNo}, MS)};
-                    immediately ->
-                        ok = rabbit_misc:confirm_to_sender(ChPid, [MsgSeqNo]),
-                        {MQ2, PendingCh, MS}
-                end;
-            {{value, {#delivery {}, _EnqueueOnPromotion}}, _MQ2} ->
+                 dict:store(MsgId, Status, MS)};
+            {{value, Delivery = #delivery {
+                       message = #basic_message { id = MsgId } }}, MQ2} ->
+                {MQ2, PendingCh,
+                 %% We received the msg from the channel first. Thus
+                 %% we need to deal with confirms here.
+                 send_or_record_confirm(Status, Delivery, MS, State1)};
+            {{value, #delivery {}}, _MQ2} ->
                 %% The instruction was sent to us before we were
                 %% within the slave_pids within the #amqqueue{}
                 %% record. We'll never receive the message directly
@@ -747,88 +740,47 @@ process_instruction(
                 %% expecting any confirms from us.
                 {MQ, PendingCh, MS}
         end,
-
-    SQ1 = dict:store(ChPid, {MQ1, PendingCh1}, SQ),
-    State2 = State1 #state { sender_queues = SQ1, msg_id_status = MS1 },
-
-    {ok,
-     case Deliver of
-         false ->
-             BQS1 = BQ:publish(Msg, MsgProps, ChPid, BQS),
-             State2 #state { backing_queue_state = BQS1 };
-         {true, AckRequired} ->
-             {AckTag, BQS1} = BQ:publish_delivered(AckRequired, Msg, MsgProps,
-                                                   ChPid, BQS),
-             maybe_store_ack(AckRequired, MsgId, AckTag,
-                             State2 #state { backing_queue_state = BQS1 })
-     end};
-process_instruction({discard, ChPid, Msg = #basic_message { id = MsgId }},
-                    State = #state { sender_queues       = SQ,
-                                     backing_queue       = BQ,
-                                     backing_queue_state = BQS,
-                                     msg_id_status       = MS }) ->
-    %% Many of the comments around the publish head above apply here
-    %% too.
-    State1 = ensure_monitoring(ChPid, State),
-    {MQ, PendingCh} = get_sender_queue(ChPid, SQ),
-    {MQ1, PendingCh1, MS1} =
-        case queue:out(MQ) of
-            {empty, _MQ} ->
-                {MQ, sets:add_element(MsgId, PendingCh),
-                 dict:store(MsgId, discarded, MS)};
-            {{value, {#delivery { message = #basic_message { id = MsgId } },
-                      _EnqueueOnPromotion}}, MQ2} ->
-                %% We've already seen it from the channel, we're not
-                %% going to see this again, so don't add it to MS
-                {MQ2, PendingCh, MS};
-            {{value, {#delivery {}, _EnqueueOnPromotion}}, _MQ2} ->
-                %% The instruction was sent to us before we were
-                %% within the slave_pids within the #amqqueue{}
-                %% record. We'll never receive the message directly
-                %% from the channel.
-                {MQ, PendingCh, MS}
-        end,
-    SQ1 = dict:store(ChPid, {MQ1, PendingCh1}, SQ),
-    BQS1 = BQ:discard(Msg, ChPid, BQS),
-    {ok, State1 #state { sender_queues       = SQ1,
-                         msg_id_status       = MS1,
-                         backing_queue_state = BQS1 }};
-process_instruction({set_length, Length, AckRequired},
+    SQ1 = dict:store(ChPid, {MQ1, PendingCh1, ChState}, SQ),
+    State1 #state { sender_queues = SQ1, msg_id_status = MS1 }.
+
+
+process_instruction({publish, ChPid, MsgProps,
+                     Msg = #basic_message { id = MsgId }}, State) ->
+    State1 = #state { backing_queue = BQ, backing_queue_state = BQS } =
+        publish_or_discard(published, ChPid, MsgId, State),
+    BQS1 = BQ:publish(Msg, MsgProps, true, ChPid, BQS),
+    {ok, State1 #state { backing_queue_state = BQS1 }};
+process_instruction({publish_delivered, ChPid, MsgProps,
+                     Msg = #basic_message { id = MsgId }}, State) ->
+    State1 = #state { backing_queue = BQ, backing_queue_state = BQS } =
+        publish_or_discard(published, ChPid, MsgId, State),
+    true = BQ:is_empty(BQS),
+    {AckTag, BQS1} = BQ:publish_delivered(Msg, MsgProps, ChPid, BQS),
+    {ok, maybe_store_ack(true, MsgId, AckTag,
+                         State1 #state { backing_queue_state = BQS1 })};
+process_instruction({discard, ChPid, MsgId}, State) ->
+    State1 = #state { backing_queue = BQ, backing_queue_state = BQS } =
+        publish_or_discard(discarded, ChPid, MsgId, State),
+    BQS1 = BQ:discard(MsgId, ChPid, BQS),
+    {ok, State1 #state { backing_queue_state = BQS1 }};
+process_instruction({drop, Length, Dropped, AckRequired},
                     State = #state { backing_queue       = BQ,
                                      backing_queue_state = BQS }) ->
     QLen = BQ:len(BQS),
-    ToDrop = QLen - Length,
-    {ok,
-     case ToDrop >= 0 of
-         true ->
-             State1 =
-                 lists:foldl(
-                   fun (const, StateN = #state {backing_queue_state = BQSN}) ->
-                           {{#basic_message{id = MsgId}, _IsDelivered, AckTag,
-                             _Remaining}, BQSN1} = BQ:fetch(AckRequired, BQSN),
-                           maybe_store_ack(
-                             AckRequired, MsgId, AckTag,
-                             StateN #state { backing_queue_state = BQSN1 })
-                   end, State, lists:duplicate(ToDrop, const)),
-             set_synchronised(true, State1);
-         false ->
-             State
-     end};
-process_instruction({fetch, AckRequired, MsgId, Remaining},
-                    State = #state { backing_queue       = BQ,
-                                     backing_queue_state = BQS }) ->
-    QLen = BQ:len(BQS),
-    {ok, case QLen - 1 of
-             Remaining ->
-                 {{#basic_message{id = MsgId}, _IsDelivered,
-                   AckTag, Remaining}, BQS1} = BQ:fetch(AckRequired, BQS),
-                 maybe_store_ack(AckRequired, MsgId, AckTag,
-                                 State #state { backing_queue_state = BQS1 });
-             Other when Other + 1 =:= Remaining ->
-                 set_synchronised(true, State);
-             Other when Other < Remaining ->
-                 %% we must be shorter than the master
-                 State
+    ToDrop = case QLen - Length of
+                 N when N > 0 -> N;
+                 _            -> 0
+             end,
+    State1 = lists:foldl(
+               fun (const, StateN = #state{backing_queue_state = BQSN}) ->
+                       {{MsgId, AckTag}, BQSN1} = BQ:drop(AckRequired, BQSN),
+                       maybe_store_ack(
+                         AckRequired, MsgId, AckTag,
+                         StateN #state { backing_queue_state = BQSN1 })
+               end, State, lists:duplicate(ToDrop, const)),
+    {ok, case AckRequired of
+             true  -> State1;
+             false -> update_delta(ToDrop - Dropped, State1)
          end};
 process_instruction({ack, MsgIds},
                     State = #state { backing_queue       = BQ,
@@ -837,48 +789,31 @@ process_instruction({ack, MsgIds},
     {AckTags, MA1} = msg_ids_to_acktags(MsgIds, MA),
     {MsgIds1, BQS1} = BQ:ack(AckTags, BQS),
     [] = MsgIds1 -- MsgIds, %% ASSERTION
-    {ok, State #state { msg_id_ack          = MA1,
-                        backing_queue_state = BQS1 }};
+    {ok, update_delta(length(MsgIds1) - length(MsgIds),
+                      State #state { msg_id_ack          = MA1,
+                                     backing_queue_state = BQS1 })};
 process_instruction({requeue, MsgIds},
                     State = #state { backing_queue       = BQ,
                                      backing_queue_state = BQS,
                                      msg_id_ack          = MA }) ->
     {AckTags, MA1} = msg_ids_to_acktags(MsgIds, MA),
-    {ok, case length(AckTags) =:= length(MsgIds) of
-             true ->
-                 {MsgIds, BQS1} = BQ:requeue(AckTags, BQS),
-                 State #state { msg_id_ack          = MA1,
-                                backing_queue_state = BQS1 };
-             false ->
-                 %% The only thing we can safely do is nuke out our BQ
-                 %% and MA. The interaction between this and confirms
-                 %% doesn't really bear thinking about...
-                 {_Count, BQS1} = BQ:purge(BQS),
-                 {_MsgIds, BQS2} = ack_all(BQ, MA, BQS1),
-                 State #state { msg_id_ack          = dict:new(),
-                                backing_queue_state = BQS2 }
-         end};
+    {_MsgIds, BQS1} = BQ:requeue(AckTags, BQS),
+    {ok, State #state { msg_id_ack          = MA1,
+                        backing_queue_state = BQS1 }};
 process_instruction({sender_death, ChPid},
-                    State = #state { sender_queues = SQ,
-                                     msg_id_status = MS,
-                                     known_senders = KS }) ->
+                    State = #state { known_senders = KS }) ->
+    %% The channel will be monitored iff we have received a message
+    %% from it. In this case we just want to avoid doing work if we
+    %% never got any messages.
     {ok, case pmon:is_monitored(ChPid, KS) of
              false -> State;
-             true  -> MS1 = case dict:find(ChPid, SQ) of
-                                error ->
-                                    MS;
-                                {ok, {_MQ, PendingCh}} ->
-                                    lists:foldl(fun dict:erase/2, MS,
-                                                sets:to_list(PendingCh))
-                            end,
-                      State #state { sender_queues = dict:erase(ChPid, SQ),
-                                     msg_id_status = MS1,
-                                     known_senders = pmon:demonitor(ChPid, KS) }
+             true  -> maybe_forget_sender(ChPid, down_from_gm, State)
          end};
-process_instruction({length, Length},
-                    State = #state { backing_queue = BQ,
+process_instruction({depth, Depth},
+                    State = #state { backing_queue       = BQ,
                                      backing_queue_state = BQS }) ->
-    {ok, set_synchronised(Length =:= BQ:len(BQS), State)};
+    {ok, set_delta(Depth - BQ:depth(BQS), State)};
+
 process_instruction({delete_and_terminate, Reason},
                     State = #state { backing_queue       = BQ,
                                      backing_queue_state = BQS }) ->
@@ -890,31 +825,56 @@ msg_ids_to_acktags(MsgIds, MA) ->
         lists:foldl(
           fun (MsgId, {Acc, MAN}) ->
                   case dict:find(MsgId, MA) of
-                      error                -> {Acc, MAN};
-                      {ok, {_Num, AckTag}} -> {[AckTag | Acc],
-                                               dict:erase(MsgId, MAN)}
+                      error        -> {Acc, MAN};
+                      {ok, AckTag} -> {[AckTag | Acc], dict:erase(MsgId, MAN)}
                   end
           end, {[], MA}, MsgIds),
     {lists:reverse(AckTags), MA1}.
 
-ack_all(BQ, MA, BQS) ->
-    BQ:ack([AckTag || {_MsgId, {_Num, AckTag}} <- dict:to_list(MA)], BQS).
-
 maybe_store_ack(false, _MsgId, _AckTag, State) ->
     State;
-maybe_store_ack(true, MsgId, AckTag, State = #state { msg_id_ack = MA,
-                                                      ack_num    = Num }) ->
-    State #state { msg_id_ack = dict:store(MsgId, {Num, AckTag}, MA),
-                   ack_num    = Num + 1 }.
-
-%% We intentionally leave out the head where a slave becomes
-%% unsynchronised: we assert that can never happen.
-set_synchronised(true, State = #state { q = #amqqueue { name = QName },
-                                        synchronised = false }) ->
-    rabbit_event:notify(queue_slave_synchronised, [{pid,  self()},
-                                                   {name, QName}]),
-    State #state { synchronised = true };
-set_synchronised(true, State) ->
+maybe_store_ack(true, MsgId, AckTag, State = #state { msg_id_ack = MA }) ->
+    State #state { msg_id_ack = dict:store(MsgId, AckTag, MA) }.
+
+set_delta(0,        State = #state { depth_delta = undefined }) ->
+    ok = record_synchronised(State#state.q),
+    State #state { depth_delta = 0 };
+set_delta(NewDelta, State = #state { depth_delta = undefined }) ->
+    true = NewDelta > 0, %% assertion
+    State #state { depth_delta = NewDelta };
+set_delta(NewDelta, State = #state { depth_delta = Delta     }) ->
+    update_delta(NewDelta - Delta, State).
+
+update_delta(_DeltaChange, State = #state { depth_delta = undefined }) ->
     State;
-set_synchronised(false, State = #state { synchronised = false }) ->
-    State.
+update_delta( DeltaChange, State = #state { depth_delta = 0         }) ->
+    0 = DeltaChange, %% assertion: we cannot become unsync'ed
+    State;
+update_delta( DeltaChange, State = #state { depth_delta = Delta     }) ->
+    true = DeltaChange =< 0, %% assertion: we cannot become 'less' sync'ed
+    set_delta(Delta + DeltaChange, State #state { depth_delta = undefined }).
+
+update_ram_duration(BQ, BQS) ->
+    {RamDuration, BQS1} = BQ:ram_duration(BQS),
+    DesiredDuration =
+        rabbit_memory_monitor:report_ram_duration(self(), RamDuration),
+    BQ:set_ram_duration_target(DesiredDuration, BQS1).
+
+%% [1] - the arrival of this newly synced slave may cause the master to die if
+%% the admin has requested a migration-type change to policy.
+record_synchronised(#amqqueue { name = QName }) ->
+    Self = self(),
+    case rabbit_misc:execute_mnesia_transaction(
+           fun () ->
+                   case mnesia:read({rabbit_queue, QName}) of
+                       [] ->
+                           ok;
+                       [Q1 = #amqqueue { sync_slave_pids = SSPids }] ->
+                           Q2 = Q1#amqqueue{sync_slave_pids = [Self | SSPids]},
+                           rabbit_mirror_queue_misc:store_updated_slaves(Q2),
+                           {ok, Q1, Q2}
+                   end
+           end) of
+        ok           -> ok;
+        {ok, Q1, Q2} -> rabbit_mirror_queue_misc:update_mirrors(Q1, Q2) %% [1]
+    end.
diff --git a/src/rabbit_mirror_queue_slave_sup.erl b/src/rabbit_mirror_queue_slave_sup.erl
index a2034876..6fba99db 100644
--- a/src/rabbit_mirror_queue_slave_sup.erl
+++ b/src/rabbit_mirror_queue_slave_sup.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2010-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2010-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_mirror_queue_slave_sup).
@@ -31,7 +31,7 @@ start_link() -> supervisor2:start_link({local, ?SERVER}, ?MODULE, []).
 start_child(Node, Args) -> supervisor2:start_child({?SERVER, Node}, Args).
 
 init([]) ->
-    {ok, {{simple_one_for_one_terminate, 10, 10},
+    {ok, {{simple_one_for_one, 10, 10},
           [{rabbit_mirror_queue_slave,
             {rabbit_mirror_queue_slave, start_link, []},
             temporary, ?MAX_WAIT, worker, [rabbit_mirror_queue_slave]}]}}.
diff --git a/src/rabbit_mirror_queue_sync.erl b/src/rabbit_mirror_queue_sync.erl
new file mode 100644
index 00000000..61e90105
--- /dev/null
+++ b/src/rabbit_mirror_queue_sync.erl
@@ -0,0 +1,260 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License at
+%% http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+%% License for the specific language governing rights and limitations
+%% under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2010-2012 GoPivotal, Inc.  All rights reserved.
+%%
+
+-module(rabbit_mirror_queue_sync).
+
+-include("rabbit.hrl").
+
+-export([master_prepare/3, master_go/7, slave/7]).
+
+-define(SYNC_PROGRESS_INTERVAL, 1000000).
+
+%% There are three processes around, the master, the syncer and the
+%% slave(s). The syncer is an intermediary, linked to the master in
+%% order to make sure we do not mess with the master's credit flow or
+%% set of monitors.
+%%
+%% Interactions
+%% ------------
+%%
+%% '*' indicates repeating messages. All are standard Erlang messages
+%% except sync_start which is sent over GM to flush out any other
+%% messages that we might have sent that way already. (credit) is the
+%% usual credit_flow bump message every so often.
+%%
+%%               Master             Syncer                 Slave(s)
+%% sync_mirrors -> ||                                         ||
+%% (from channel)  || -- (spawns) --> ||                      ||
+%%                 || --------- sync_start (over GM) -------> ||
+%%                 ||                 || <--- sync_ready ---- ||
+%%                 ||                 ||         (or)         ||
+%%                 ||                 || <--- sync_deny ----- ||
+%%                 || <--- ready ---- ||                      ||
+%%                 || <--- next* ---- ||                      ||  }
+%%                 || ---- msg* ----> ||                      ||  } loop
+%%                 ||                 || ---- sync_msg* ----> ||  }
+%%                 ||                 || <--- (credit)* ----- ||  }
+%%                 || <--- next  ---- ||                      ||
+%%                 || ---- done ----> ||                      ||
+%%                 ||                 || -- sync_complete --> ||
+%%                 ||               (Dies)                    ||
+
+-ifdef(use_specs).
+
+-type(log_fun() :: fun ((string(), [any()]) -> 'ok')).
+-type(bq() :: atom()).
+-type(bqs() :: any()).
+-type(ack() :: any()).
+-type(slave_sync_state() :: {[{rabbit_types:msg_id(), ack()}], timer:tref(),
+                             bqs()}).
+
+-spec(master_prepare/3 :: (reference(), log_fun(), [pid()]) -> pid()).
+-spec(master_go/7 :: (pid(), reference(), log_fun(),
+                      rabbit_mirror_queue_master:stats_fun(),
+                      rabbit_mirror_queue_master:stats_fun(),
+                      bq(), bqs()) ->
+                          {'already_synced', bqs()} | {'ok', bqs()} |
+                          {'shutdown', any(), bqs()} |
+                          {'sync_died', any(), bqs()}).
+-spec(slave/7 :: (non_neg_integer(), reference(), timer:tref(), pid(),
+                  bq(), bqs(), fun((bq(), bqs()) -> {timer:tref(), bqs()})) ->
+                      'denied' |
+                      {'ok' | 'failed', slave_sync_state()} |
+                      {'stop', any(), slave_sync_state()}).
+
+-endif.
+
+%% ---------------------------------------------------------------------------
+%% Master
+
+master_prepare(Ref, Log, SPids) ->
+    MPid = self(),
+    spawn_link(fun () -> syncer(Ref, Log, MPid, SPids) end).
+
+master_go(Syncer, Ref, Log, HandleInfo, EmitStats, BQ, BQS) ->
+    Args = {Syncer, Ref, Log, HandleInfo, EmitStats, rabbit_misc:get_parent()},
+    receive
+        {'EXIT', Syncer, normal} -> {already_synced, BQS};
+        {'EXIT', Syncer, Reason} -> {sync_died, Reason, BQS};
+        {ready, Syncer}          -> EmitStats({syncing, 0}),
+                                    master_go0(Args, BQ, BQS)
+    end.
+
+master_go0(Args, BQ, BQS) ->
+    case BQ:fold(fun (Msg, MsgProps, Unacked, Acc) ->
+                         master_send(Msg, MsgProps, Unacked, Args, Acc)
+                 end, {0, erlang:now()}, BQS) of
+        {{shutdown,  Reason}, BQS1} -> {shutdown,  Reason, BQS1};
+        {{sync_died, Reason}, BQS1} -> {sync_died, Reason, BQS1};
+        {_,                   BQS1} -> master_done(Args, BQS1)
+    end.
+
+master_send(Msg, MsgProps, Unacked,
+            {Syncer, Ref, Log, HandleInfo, EmitStats, Parent}, {I, Last}) ->
+    T = case timer:now_diff(erlang:now(), Last) > ?SYNC_PROGRESS_INTERVAL of
+            true  -> EmitStats({syncing, I}),
+                     Log("~p messages", [I]),
+                     erlang:now();
+            false -> Last
+        end,
+    HandleInfo({syncing, I}),
+    receive
+        {'$gen_cast', {set_maximum_since_use, Age}} ->
+            ok = file_handle_cache:set_maximum_since_use(Age)
+    after 0 ->
+            ok
+    end,
+    receive
+        {'$gen_call', From,
+         cancel_sync_mirrors}    -> stop_syncer(Syncer, {cancel, Ref}),
+                                    gen_server2:reply(From, ok),
+                                    {stop, cancelled};
+        {next, Ref}              -> Syncer ! {msg, Ref, Msg, MsgProps, Unacked},
+                                    {cont, {I + 1, T}};
+        {'EXIT', Parent, Reason} -> {stop, {shutdown,  Reason}};
+        {'EXIT', Syncer, Reason} -> {stop, {sync_died, Reason}}
+    end.
+
+master_done({Syncer, Ref, _Log, _HandleInfo, _EmitStats, Parent}, BQS) ->
+    receive
+        {next, Ref}              -> stop_syncer(Syncer, {done, Ref}),
+                                    {ok, BQS};
+        {'EXIT', Parent, Reason} -> {shutdown,  Reason, BQS};
+        {'EXIT', Syncer, Reason} -> {sync_died, Reason, BQS}
+    end.
+
+stop_syncer(Syncer, Msg) ->
+    unlink(Syncer),
+    Syncer ! Msg,
+    receive {'EXIT', Syncer, _} -> ok
+    after 0 -> ok
+    end.
+
+%% Master
+%% ---------------------------------------------------------------------------
+%% Syncer
+
+syncer(Ref, Log, MPid, SPids) ->
+    [erlang:monitor(process, SPid) || SPid <- SPids],
+    %% We wait for a reply from the slaves so that we know they are in
+    %% a receive block and will thus receive messages we send to them
+    %% *without* those messages ending up in their gen_server2 pqueue.
+    case [SPid || SPid <- SPids,
+                  receive
+                      {sync_ready, Ref, SPid}       -> true;
+                      {sync_deny,  Ref, SPid}       -> false;
+                      {'DOWN', _, process, SPid, _} -> false
+                  end] of
+        []     -> Log("all slaves already synced", []);
+        SPids1 -> MPid ! {ready, self()},
+                  Log("mirrors ~p to sync", [[node(SPid) || SPid <- SPids1]]),
+                  syncer_loop(Ref, MPid, SPids1)
+    end.
+
+syncer_loop(Ref, MPid, SPids) ->
+    MPid ! {next, Ref},
+    receive
+        {msg, Ref, Msg, MsgProps, Unacked} ->
+            SPids1 = wait_for_credit(SPids),
+            [begin
+                 credit_flow:send(SPid),
+                 SPid ! {sync_msg, Ref, Msg, MsgProps, Unacked}
+             end || SPid <- SPids1],
+            syncer_loop(Ref, MPid, SPids1);
+        {cancel, Ref} ->
+            %% We don't tell the slaves we will die - so when we do
+            %% they interpret that as a failure, which is what we
+            %% want.
+            ok;
+        {done, Ref} ->
+            [SPid ! {sync_complete, Ref} || SPid <- SPids]
+    end.
+
+wait_for_credit(SPids) ->
+    case credit_flow:blocked() of
+        true  -> receive
+                     {bump_credit, Msg} ->
+                         credit_flow:handle_bump_msg(Msg),
+                         wait_for_credit(SPids);
+                     {'DOWN', _, process, SPid, _} ->
+                         credit_flow:peer_down(SPid),
+                         wait_for_credit(lists:delete(SPid, SPids))
+                 end;
+        false -> SPids
+    end.
+
+%% Syncer
+%% ---------------------------------------------------------------------------
+%% Slave
+
+slave(0, Ref, _TRef, Syncer, _BQ, _BQS, _UpdateRamDuration) ->
+    Syncer ! {sync_deny, Ref, self()},
+    denied;
+
+slave(_DD, Ref, TRef, Syncer, BQ, BQS, UpdateRamDuration) ->
+    MRef = erlang:monitor(process, Syncer),
+    Syncer ! {sync_ready, Ref, self()},
+    {_MsgCount, BQS1} = BQ:purge(BQ:purge_acks(BQS)),
+    slave_sync_loop({Ref, MRef, Syncer, BQ, UpdateRamDuration,
+                     rabbit_misc:get_parent()}, {[], TRef, BQS1}).
+
+slave_sync_loop(Args = {Ref, MRef, Syncer, BQ, UpdateRamDuration, Parent},
+                State = {MA, TRef, BQS}) ->
+    receive
+        {'DOWN', MRef, process, Syncer, _Reason} ->
+            %% If the syncer dies half way (e.g. the master died or the
+            %% sync was cancelled) we are not in the usual half-synced
+            %% state (with messages nearer the tail of the queue);
+            %% instead we have ones nearer the head. If we then sync with
+            %% a newly promoted master, or even just receive messages from
+            %% it, we have a hole in the middle. So all we can do is purge.
+            {_MsgCount, BQS1} = BQ:purge(BQ:purge_acks(BQS)),
+            credit_flow:peer_down(Syncer),
+            {failed, {[], TRef, BQS1}};
+        {bump_credit, Msg} ->
+            credit_flow:handle_bump_msg(Msg),
+            slave_sync_loop(Args, State);
+        {sync_complete, Ref} ->
+            erlang:demonitor(MRef, [flush]),
+            credit_flow:peer_down(Syncer),
+            {ok, State};
+        {'$gen_cast', {set_maximum_since_use, Age}} ->
+            ok = file_handle_cache:set_maximum_since_use(Age),
+            slave_sync_loop(Args, State);
+        {'$gen_cast', {set_ram_duration_target, Duration}} ->
+            BQS1 = BQ:set_ram_duration_target(Duration, BQS),
+            slave_sync_loop(Args, {MA, TRef, BQS1});
+        update_ram_duration ->
+            {TRef1, BQS1} = UpdateRamDuration(BQ, BQS),
+            slave_sync_loop(Args, {MA, TRef1, BQS1});
+        {sync_msg, Ref, Msg, Props, Unacked} ->
+            credit_flow:ack(Syncer),
+            Props1 = Props#message_properties{needs_confirming = false},
+            {MA1, BQS1} =
+                case Unacked of
+                    false -> {MA, BQ:publish(Msg, Props1, true, none, BQS)};
+                    true  -> {AckTag, BQS2} = BQ:publish_delivered(
+                                                Msg, Props1, none, BQS),
+                             {[{Msg#basic_message.id, AckTag} | MA], BQS2}
+                end,
+            slave_sync_loop(Args, {MA1, TRef, BQS1});
+        {'EXIT', Parent, Reason} ->
+            {stop, Reason, State};
+        %% If the master throws an exception
+        {'$gen_cast', {gm, {delete_and_terminate, Reason}}} ->
+            BQ:delete_and_terminate(Reason, BQS),
+            {stop, Reason, {[], TRef, undefined}}
+    end.
diff --git a/src/rabbit_misc.erl b/src/rabbit_misc.erl
index 93c784ec..bca9d5ce 100644
--- a/src/rabbit_misc.erl
+++ b/src/rabbit_misc.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_misc).
@@ -19,9 +19,9 @@
 -include("rabbit_framing.hrl").
 
 -export([method_record_type/1, polite_pause/0, polite_pause/1]).
--export([die/1, frame_error/2, amqp_error/4,
+-export([die/1, frame_error/2, amqp_error/4, quit/1,
          protocol_error/3, protocol_error/4, protocol_error/1]).
--export([not_found/1, assert_args_equivalence/4]).
+-export([not_found/1, absent/1, assert_args_equivalence/4]).
 -export([dirty_read/1]).
 -export([table_lookup/2, set_table_value/4]).
 -export([r/3, r/2, r_arg/4, rs/1]).
@@ -29,24 +29,24 @@
 -export([enable_cover/1, report_cover/1]).
 -export([start_cover/1]).
 -export([confirm_to_sender/2]).
--export([throw_on_error/2, with_exit_handler/2, filter_exit_map/2]).
--export([is_abnormal_termination/1]).
+-export([throw_on_error/2, with_exit_handler/2, is_abnormal_exit/1,
+         filter_exit_map/2]).
 -export([with_user/2, with_user_and_vhost/3]).
 -export([execute_mnesia_transaction/1]).
 -export([execute_mnesia_transaction/2]).
 -export([execute_mnesia_tx_with_tail/1]).
 -export([ensure_ok/2]).
--export([tcp_name/3]).
+-export([tcp_name/3, format_inet_error/1]).
 -export([upmap/2, map_in_order/2]).
 -export([table_filter/3]).
 -export([dirty_read_all/1, dirty_foreach_key/2, dirty_dump_log/1]).
 -export([format/2, format_many/1, format_stderr/2]).
 -export([with_local_io/1, local_info_msg/2]).
--export([start_applications/1, stop_applications/1]).
 -export([unfold/2, ceil/1, queue_fold/3]).
 -export([sort_field_table/1]).
 -export([pid_to_string/1, string_to_pid/1]).
 -export([version_compare/2, version_compare/3]).
+-export([version_minor_equivalent/2]).
 -export([dict_cons/3, orddict_cons/3, gb_trees_cons/3]).
 -export([gb_trees_fold/3, gb_trees_foreach/2]).
 -export([parse_arguments/3]).
@@ -59,9 +59,24 @@
 -export([format_message_queue/2]).
 -export([append_rpc_all_nodes/4]).
 -export([multi_call/2]).
--export([quit/1]).
 -export([os_cmd/1]).
 -export([gb_sets_difference/2]).
+-export([version/0, which_applications/0]).
+-export([sequence_error/1]).
+-export([json_encode/1, json_decode/1, json_to_term/1, term_to_json/1]).
+-export([check_expiry/1]).
+-export([base64url/1]).
+-export([interval_operation/4]).
+-export([ensure_timer/4, stop_timer/2]).
+-export([get_parent/0]).
+
+%% Horrible macro to use in guards
+-define(IS_BENIGN_EXIT(R),
+        R =:= noproc; R =:= noconnection; R =:= nodedown; R =:= normal;
+            R =:= shutdown).
+
+%% This is dictated by `erlang:send_after' on which we depend to implement TTL.
+-define(MAX_EXPIRY_TIMER, 4294967295).
 
 %%----------------------------------------------------------------------------
 
@@ -87,6 +102,9 @@
 -spec(polite_pause/1 :: (non_neg_integer()) -> 'done').
 -spec(die/1 ::
         (rabbit_framing:amqp_exception()) -> channel_or_connection_exit()).
+
+-spec(quit/1 :: (integer()) -> no_return()).
+
 -spec(frame_error/2 :: (rabbit_framing:amqp_method_name(), binary())
                        -> rabbit_types:connection_exit()).
 -spec(amqp_error/4 ::
@@ -101,6 +119,7 @@
 -spec(protocol_error/1 ::
         (rabbit_types:amqp_error()) -> channel_or_connection_exit()).
 -spec(not_found/1 :: (rabbit_types:r(atom())) -> rabbit_types:channel_exit()).
+-spec(absent/1 :: (rabbit_types:amqqueue()) -> rabbit_types:channel_exit()).
 -spec(assert_args_equivalence/4 :: (rabbit_framing:amqp_table(),
                                     rabbit_framing:amqp_table(),
                                     rabbit_types:r(any()), [binary()]) ->
@@ -123,9 +142,11 @@
                when is_subtype(K, atom())).
 -spec(r_arg/4 ::
         (rabbit_types:vhost() | rabbit_types:r(atom()), K,
-         rabbit_framing:amqp_table(), binary())
-        -> undefined | rabbit_types:r(K)
-               when is_subtype(K, atom())).
+         rabbit_framing:amqp_table(), binary()) ->
+                      undefined |
+                      rabbit_types:error(
+                        {invalid_type, rabbit_framing:amqp_field_type()}) |
+                      rabbit_types:r(K) when is_subtype(K, atom())).
 -spec(rs/1 :: (rabbit_types:r(atom())) -> string()).
 -spec(enable_cover/0 :: () -> ok_or_error()).
 -spec(start_cover/1 :: ([{string(), string()} | string()]) -> 'ok').
@@ -135,8 +156,8 @@
 -spec(throw_on_error/2 ::
         (atom(), thunk(rabbit_types:error(any()) | {ok, A} | A)) -> A).
 -spec(with_exit_handler/2 :: (thunk(A), thunk(A)) -> A).
+-spec(is_abnormal_exit/1 :: (any()) -> boolean()).
 -spec(filter_exit_map/2 :: (fun ((A) -> B), [A]) -> [B]).
--spec(is_abnormal_termination/1 :: (any()) -> boolean()).
 -spec(with_user/2 :: (rabbit_types:username(), thunk(A)) -> A).
 -spec(with_user_and_vhost/3 ::
         (rabbit_types:username(), rabbit_types:vhost(), thunk(A))
@@ -150,6 +171,7 @@
 -spec(tcp_name/3 ::
         (atom(), inet:ip_address(), rabbit_networking:ip_port())
         -> atom()).
+-spec(format_inet_error/1 :: (atom()) -> string()).
 -spec(upmap/2 :: (fun ((A) -> B), [A]) -> [B]).
 -spec(map_in_order/2 :: (fun ((A) -> B), [A]) -> [B]).
 -spec(table_filter/3:: (fun ((A) -> boolean()), fun ((A, boolean()) -> 'ok'),
@@ -163,8 +185,6 @@
 -spec(format_stderr/2 :: (string(), [any()]) -> 'ok').
 -spec(with_local_io/1 :: (fun (() -> A)) -> A).
 -spec(local_info_msg/2 :: (string(), [any()]) -> 'ok').
--spec(start_applications/1 :: ([atom()]) -> 'ok').
--spec(stop_applications/1 :: ([atom()]) -> 'ok').
 -spec(unfold/2  :: (fun ((A) -> ({'true', B, A} | 'false')), A) -> {[B], A}).
 -spec(ceil/1 :: (number()) -> integer()).
 -spec(queue_fold/3 :: (fun ((any(), B) -> B), B, queue()) -> B).
@@ -176,6 +196,7 @@
 -spec(version_compare/3 ::
         (string(), string(), ('lt' | 'lte' | 'eq' | 'gte' | 'gt'))
         -> boolean()).
+-spec(version_minor_equivalent/2 :: (string(), string()) -> boolean()).
 -spec(dict_cons/3 :: (any(), any(), dict()) -> dict()).
 -spec(orddict_cons/3 :: (any(), any(), orddict:orddict()) -> orddict:orddict()).
 -spec(gb_trees_cons/3 :: (any(), any(), gb_tree()) -> gb_tree()).
@@ -210,10 +231,24 @@
 -spec(append_rpc_all_nodes/4 :: ([node()], atom(), atom(), [any()]) -> [any()]).
 -spec(multi_call/2 ::
         ([pid()], any()) -> {[{pid(), any()}], [{pid(), any()}]}).
--spec(quit/1 :: (integer() | string()) -> no_return()).
 -spec(os_cmd/1 :: (string()) -> string()).
 -spec(gb_sets_difference/2 :: (gb_set(), gb_set()) -> gb_set()).
-
+-spec(version/0 :: () -> string()).
+-spec(which_applications/0 :: () -> [{atom(), string(), string()}]).
+-spec(sequence_error/1 :: ([({'error', any()} | any())])
+                       -> {'error', any()} | any()).
+-spec(json_encode/1 :: (any()) -> {'ok', string()} | {'error', any()}).
+-spec(json_decode/1 :: (string()) -> {'ok', any()} | 'error').
+-spec(json_to_term/1 :: (any()) -> any()).
+-spec(term_to_json/1 :: (any()) -> any()).
+-spec(check_expiry/1 :: (integer()) -> rabbit_types:ok_or_error(any())).
+-spec(base64url/1 :: (binary()) -> string()).
+-spec(interval_operation/4 ::
+        ({atom(), atom(), any()}, float(), non_neg_integer(), non_neg_integer())
+        -> {any(), non_neg_integer()}).
+-spec(ensure_timer/4 :: (A, non_neg_integer(), non_neg_integer(), any()) -> A).
+-spec(stop_timer/2 :: (A, non_neg_integer()) -> A).
+-spec(get_parent/0 :: () -> pid()).
 -endif.
 
 %%----------------------------------------------------------------------------
@@ -250,6 +285,15 @@ protocol_error(#amqp_error{} = Error) ->
 
 not_found(R) -> protocol_error(not_found, "no ~s", [rs(R)]).
 
+absent(#amqqueue{name = QueueName, pid = QPid, durable = true}) ->
+    %% The assertion of durability is mainly there because we mention
+    %% durability in the error message. That way we will hopefully
+    %% notice if at some future point our logic changes s.t. we get
+    %% here with non-durable queues.
+    protocol_error(not_found,
+                   "home node '~s' of durable ~s is down or inaccessible",
+                   [node(QPid), rs(QueueName)]).
+
 type_class(byte)      -> int;
 type_class(short)     -> int;
 type_class(signedint) -> int;
@@ -315,13 +359,12 @@ set_table_value(Table, Key, Type, Value) ->
     sort_field_table(
       lists:keystore(Key, 1, Table, {Key, Type, Value})).
 
-r(#resource{virtual_host = VHostPath}, Kind, Name)
-  when is_binary(Name) ->
+r(#resource{virtual_host = VHostPath}, Kind, Name) ->
     #resource{virtual_host = VHostPath, kind = Kind, name = Name};
-r(VHostPath, Kind, Name) when is_binary(Name) andalso is_binary(VHostPath) ->
+r(VHostPath, Kind, Name) ->
     #resource{virtual_host = VHostPath, kind = Kind, name = Name}.
 
-r(VHostPath, Kind) when is_binary(VHostPath) ->
+r(VHostPath, Kind) ->
     #resource{virtual_host = VHostPath, kind = Kind, name = '_'}.
 
 r_arg(#resource{virtual_host = VHostPath}, Kind, Table, Key) ->
@@ -329,7 +372,8 @@ r_arg(#resource{virtual_host = VHostPath}, Kind, Table, Key) ->
 r_arg(VHostPath, Kind, Table, Key) ->
     case table_lookup(Table, Key) of
         {longstr, NameBin} -> r(VHostPath, Kind, NameBin);
-        undefined          -> undefined
+        undefined          -> undefined;
+        {Type, _}          -> {error, {invalid_type, Type}}
     end.
 
 rs(#resource{virtual_host = VHostPath, kind = Kind, name = Name}) ->
@@ -391,6 +435,18 @@ report_coverage_percentage(File, Cov, NotCov, Mod) ->
 confirm_to_sender(Pid, MsgSeqNos) ->
     gen_server2:cast(Pid, {confirm, MsgSeqNos, self()}).
 
+%% @doc Halts the emulator, returning the given status code to the OS.
+%% On Windows this function will block indefinitely so as to give the I/O
+%% subsystem time to flush stdout completely.
+quit(Status) ->
+    case os:type() of
+        {unix,  _} -> halt(Status);
+        {win32, _} -> init:stop(Status),
+                      receive
+                      after infinity -> ok
+                      end
+    end.
+
 throw_on_error(E, Thunk) ->
     case Thunk() of
         {error, Reason} -> throw({E, Reason});
@@ -402,13 +458,14 @@ with_exit_handler(Handler, Thunk) ->
     try
         Thunk()
     catch
-        exit:{R, _} when R =:= noproc; R =:= nodedown;
-                         R =:= normal; R =:= shutdown ->
-            Handler();
-        exit:{{R, _}, _} when R =:= nodedown; R =:= shutdown ->
-            Handler()
+        exit:{R, _}      when ?IS_BENIGN_EXIT(R) -> Handler();
+        exit:{{R, _}, _} when ?IS_BENIGN_EXIT(R) -> Handler()
     end.
 
+is_abnormal_exit(R)      when ?IS_BENIGN_EXIT(R) -> false;
+is_abnormal_exit({R, _}) when ?IS_BENIGN_EXIT(R) -> false;
+is_abnormal_exit(_)                              -> true.
+
 filter_exit_map(F, L) ->
     Ref = make_ref(),
     lists:filter(fun (R) -> R =/= Ref end,
@@ -416,11 +473,6 @@ filter_exit_map(F, L) ->
                     fun () -> Ref end,
                     fun () -> F(I) end) || I <- L]).
 
-is_abnormal_termination(Reason)
-  when Reason =:= noproc; Reason =:= noconnection;
-       Reason =:= normal; Reason =:= shutdown -> false;
-is_abnormal_termination({shutdown, _})        -> false;
-is_abnormal_termination(_)                    -> true.
 
 with_user(Username, Thunk) ->
     fun () ->
@@ -489,6 +541,10 @@ tcp_name(Prefix, IPAddress, Port)
     list_to_atom(
       format("~w_~s:~w", [Prefix, inet_parse:ntoa(IPAddress), Port])).
 
+format_inet_error(address) -> "cannot connect to host/port";
+format_inet_error(timeout) -> "timed out";
+format_inet_error(Error)   -> inet:format_error(Error).
+
 %% This is a modified version of Luke Gorrie's pmap -
 %% http://lukego.livejournal.com/6753.html - that doesn't care about
 %% the order in which results are received.
@@ -593,34 +649,6 @@ with_local_io(Fun) ->
 local_info_msg(Format, Args) ->
     with_local_io(fun () -> error_logger:info_msg(Format, Args) end).
 
-manage_applications(Iterate, Do, Undo, SkipError, ErrorTag, Apps) ->
-    Iterate(fun (App, Acc) ->
-                    case Do(App) of
-                        ok -> [App | Acc];
-                        {error, {SkipError, _}} -> Acc;
-                        {error, Reason} ->
-                            lists:foreach(Undo, Acc),
-                            throw({error, {ErrorTag, App, Reason}})
-                    end
-            end, [], Apps),
-    ok.
-
-start_applications(Apps) ->
-    manage_applications(fun lists:foldl/3,
-                        fun application:start/1,
-                        fun application:stop/1,
-                        already_started,
-                        cannot_start_application,
-                        Apps).
-
-stop_applications(Apps) ->
-    manage_applications(fun lists:foldr/3,
-                        fun application:stop/1,
-                        fun application:start/1,
-                        not_started,
-                        cannot_stop_application,
-                        Apps).
-
 unfold(Fun, Init) ->
     unfold(Fun, [], Init).
 
@@ -715,6 +743,16 @@ version_compare(A,  B) ->
        ANum > BNum   -> gt
     end.
 
+%% a.b.c and a.b.d match, but a.b.c and a.d.e don't. If
+%% versions do not match that pattern, just compare them.
+version_minor_equivalent(A, B) ->
+    {ok, RE} = re:compile("^(\\d+\\.\\d+)(\\.\\d+)\$"),
+    Opts = [{capture, all_but_first, list}],
+    case {re:run(A, RE, Opts), re:run(B, RE, Opts)} of
+        {{match, [A1|_]}, {match, [B1|_]}} -> A1 =:= B1;
+        _                                  -> A =:= B
+    end.
+
 dropdot(A) -> lists:dropwhile(fun (X) -> X =:= $. end, A).
 
 dict_cons(Key, Value, Dict) ->
@@ -865,13 +903,8 @@ ntoab(IP) ->
         _ -> "[" ++ Str ++ "]"
     end.
 
-is_process_alive(Pid) when node(Pid) =:= node() ->
-    erlang:is_process_alive(Pid);
 is_process_alive(Pid) ->
-    case rpc:call(node(Pid), erlang, is_process_alive, [Pid]) of
-        true -> true;
-        _    -> false
-    end.
+    rpc:call(node(Pid), erlang, is_process_alive, [Pid]) =:= true.
 
 pget(K, P) -> proplists:get_value(K, P).
 pget(K, P, D) -> proplists:get_value(K, P, D).
@@ -937,19 +970,149 @@ receive_multi_call([{Mref, Pid} | MonitorPids], Good, Bad) ->
             receive_multi_call(MonitorPids, Good, [{Pid, Reason} | Bad])
     end.
 
-%% the slower shutdown on windows required to flush stdout
-quit(Status) ->
-    case os:type() of
-        {unix,  _} -> halt(Status);
-        {win32, _} -> init:stop(Status)
-    end.
-
 os_cmd(Command) ->
-    Exec = hd(string:tokens(Command, " ")),
-    case os:find_executable(Exec) of
-        false -> throw({command_not_found, Exec});
-        _     -> os:cmd(Command)
+    case os:type() of
+        {win32, _} ->
+            %% Clink workaround; see
+            %% http://code.google.com/p/clink/issues/detail?id=141
+            os:cmd(" " ++ Command);
+        _ ->
+            %% Don't just return "/bin/sh: <cmd>: not found" if not found
+            Exec = hd(string:tokens(Command, " ")),
+            case os:find_executable(Exec) of
+                false -> throw({command_not_found, Exec});
+                _     -> os:cmd(Command)
+            end
     end.
 
 gb_sets_difference(S1, S2) ->
     gb_sets:fold(fun gb_sets:delete_any/2, S1, S2).
+
+version() ->
+    {ok, VSN} = application:get_key(rabbit, vsn),
+    VSN.
+
+%% application:which_applications(infinity) is dangerous, since it can
+%% cause deadlocks on shutdown. So we have to use a timeout variant,
+%% but w/o creating spurious timeout errors.
+which_applications() ->
+    try
+        application:which_applications()
+    catch
+        exit:{timeout, _} -> []
+    end.
+
+sequence_error([T])                      -> T;
+sequence_error([{error, _} = Error | _]) -> Error;
+sequence_error([_ | Rest])               -> sequence_error(Rest).
+
+json_encode(Term) ->
+    try
+        {ok, mochijson2:encode(Term)}
+    catch
+        exit:{json_encode, E} ->
+            {error, E}
+    end.
+
+json_decode(Term) ->
+    try
+        {ok, mochijson2:decode(Term)}
+    catch
+        %% Sadly `mochijson2:decode/1' does not offer a nice way to catch
+        %% decoding errors...
+        error:_ -> error
+    end.
+
+json_to_term({struct, L}) ->
+    [{K, json_to_term(V)} || {K, V} <- L];
+json_to_term(L) when is_list(L) ->
+    [json_to_term(I) || I <- L];
+json_to_term(V) when is_binary(V) orelse is_number(V) orelse V =:= null orelse
+                     V =:= true orelse V =:= false ->
+    V.
+
+%% This has the flaw that empty lists will never be JSON objects, so use with
+%% care.
+term_to_json([{_, _}|_] = L) ->
+    {struct, [{K, term_to_json(V)} || {K, V} <- L]};
+term_to_json(L) when is_list(L) ->
+    [term_to_json(I) || I <- L];
+term_to_json(V) when is_binary(V) orelse is_number(V) orelse V =:= null orelse
+                     V =:= true orelse V =:= false ->
+    V.
+
+check_expiry(N) when N > ?MAX_EXPIRY_TIMER -> {error, {value_too_big, N}};
+check_expiry(N) when N < 0                 -> {error, {value_negative, N}};
+check_expiry(_N)                           -> ok.
+
+base64url(In) ->
+    lists:reverse(lists:foldl(fun ($\+, Acc) -> [$\- | Acc];
+                                  ($\/, Acc) -> [$\_ | Acc];
+                                  ($\=, Acc) -> Acc;
+                                  (Chr, Acc) -> [Chr | Acc]
+                              end, [], base64:encode_to_string(In))).
+
+%% Ideally, you'd want Fun to run every IdealInterval, but you don't
+%% want it to take more than MaxRatio of IdealInterval. So if it takes
+%% more then you want to run it less often. So we time how long it
+%% takes to run, and then suggest how long you should wait before
+%% running it again. Times are in millis.
+interval_operation({M, F, A}, MaxRatio, IdealInterval, LastInterval) ->
+    {Micros, Res} = timer:tc(M, F, A),
+    {Res, case {Micros > 1000 * (MaxRatio * IdealInterval),
+                Micros > 1000 * (MaxRatio * LastInterval)} of
+              {true,  true}  -> round(LastInterval * 1.5);
+              {true,  false} -> LastInterval;
+              {false, false} -> lists:max([IdealInterval,
+                                           round(LastInterval / 1.5)])
+          end}.
+
+ensure_timer(State, Idx, After, Msg) ->
+    case element(Idx, State) of
+        undefined -> TRef = erlang:send_after(After, self(), Msg),
+                     setelement(Idx, State, TRef);
+        _         -> State
+    end.
+
+stop_timer(State, Idx) ->
+    case element(Idx, State) of
+        undefined -> State;
+        TRef      -> case erlang:cancel_timer(TRef) of
+                         false -> State;
+                         _     -> setelement(Idx, State, undefined)
+                     end
+    end.
+
+%% -------------------------------------------------------------------------
+%% Begin copypasta from gen_server2.erl
+
+get_parent() ->
+    case get('$ancestors') of
+        [Parent | _] when is_pid (Parent) -> Parent;
+        [Parent | _] when is_atom(Parent) -> name_to_pid(Parent);
+        _ -> exit(process_was_not_started_by_proc_lib)
+    end.
+
+name_to_pid(Name) ->
+    case whereis(Name) of
+        undefined -> case whereis_name(Name) of
+                         undefined -> exit(could_not_find_registerd_name);
+                         Pid       -> Pid
+                     end;
+        Pid       -> Pid
+    end.
+
+whereis_name(Name) ->
+    case ets:lookup(global_names, Name) of
+        [{_Name, Pid, _Method, _RPid, _Ref}] ->
+            if node(Pid) == node() -> case erlang:is_process_alive(Pid) of
+                                          true  -> Pid;
+                                          false -> undefined
+                                      end;
+               true                -> Pid
+            end;
+        [] -> undefined
+    end.
+
+%% End copypasta from gen_server2.erl
+%% -------------------------------------------------------------------------
diff --git a/src/rabbit_mnesia.erl b/src/rabbit_mnesia.erl
index 7e9346f9..85958400 100644
--- a/src/rabbit_mnesia.erl
+++ b/src/rabbit_mnesia.erl
@@ -10,27 +10,38 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
-
 -module(rabbit_mnesia).
 
--export([ensure_mnesia_dir/0, dir/0, status/0, init/0, is_db_empty/0,
-         cluster/1, force_cluster/1, reset/0, force_reset/0, init_db/3,
-         is_clustered/0, running_clustered_nodes/0, all_clustered_nodes/0,
-         empty_ram_only_tables/0, copy_db/1, wait_for_tables/1,
-         create_cluster_nodes_config/1, read_cluster_nodes_config/0,
-         record_running_nodes/0, read_previously_running_nodes/0,
-         running_nodes_filename/0, is_disc_node/0, on_node_down/1,
-         on_node_up/1]).
-
--export([table_names/0]).
-
-%% create_tables/0 exported for helping embed RabbitMQ in or alongside
-%% other mnesia-using Erlang applications, such as ejabberd
--export([create_tables/0]).
+-export([init/0,
+         join_cluster/2,
+         reset/0,
+         force_reset/0,
+         update_cluster_nodes/1,
+         change_cluster_node_type/1,
+         forget_cluster_node/2,
+
+         status/0,
+         is_clustered/0,
+         cluster_nodes/1,
+         node_type/0,
+         dir/0,
+         cluster_status_from_mnesia/0,
+
+         init_db_unchecked/2,
+         copy_db/1,
+         check_cluster_consistency/0,
+         ensure_mnesia_dir/0,
+
+         on_node_up/1,
+         on_node_down/1
+        ]).
+
+%% Used internally in rpc calls
+-export([node_info/0, remove_node_if_mnesia_running/1]).
 
 -include("rabbit.hrl").
 
@@ -38,314 +49,432 @@
 
 -ifdef(use_specs).
 
--export_type([node_type/0]).
+-export_type([node_type/0, cluster_status/0]).
 
--type(node_type() :: disc_only | disc | ram | unknown).
--spec(status/0 :: () -> [{'nodes', [{node_type(), [node()]}]} |
-                         {'running_nodes', [node()]}]).
--spec(dir/0 :: () -> file:filename()).
--spec(ensure_mnesia_dir/0 :: () -> 'ok').
+-type(node_type() :: disc | ram).
+-type(cluster_status() :: {[node()], [node()], [node()]}).
+
+%% Main interface
 -spec(init/0 :: () -> 'ok').
--spec(init_db/3 :: ([node()], boolean(), rabbit_misc:thunk('ok')) -> 'ok').
--spec(is_db_empty/0 :: () -> boolean()).
--spec(cluster/1 :: ([node()]) -> 'ok').
--spec(force_cluster/1 :: ([node()]) -> 'ok').
--spec(cluster/2 :: ([node()], boolean()) -> 'ok').
+-spec(join_cluster/2 :: (node(), node_type())
+                        -> 'ok' | {'ok', 'already_member'}).
 -spec(reset/0 :: () -> 'ok').
 -spec(force_reset/0 :: () -> 'ok').
+-spec(update_cluster_nodes/1 :: (node()) -> 'ok').
+-spec(change_cluster_node_type/1 :: (node_type()) -> 'ok').
+-spec(forget_cluster_node/2 :: (node(), boolean()) -> 'ok').
+
+%% Various queries to get the status of the db
+-spec(status/0 :: () -> [{'nodes', [{node_type(), [node()]}]} |
+                         {'running_nodes', [node()]} |
+                         {'partitions', [{node(), [node()]}]}]).
 -spec(is_clustered/0 :: () -> boolean()).
--spec(running_clustered_nodes/0 :: () -> [node()]).
--spec(all_clustered_nodes/0 :: () -> [node()]).
--spec(empty_ram_only_tables/0 :: () -> 'ok').
--spec(create_tables/0 :: () -> 'ok').
+-spec(cluster_nodes/1 :: ('all' | 'disc' | 'ram' | 'running') -> [node()]).
+-spec(node_type/0 :: () -> node_type()).
+-spec(dir/0 :: () -> file:filename()).
+-spec(cluster_status_from_mnesia/0 :: () -> rabbit_types:ok_or_error2(
+                                              cluster_status(), any())).
+
+%% Operations on the db and utils, mainly used in `rabbit_upgrade' and `rabbit'
+-spec(init_db_unchecked/2 :: ([node()], node_type()) -> 'ok').
 -spec(copy_db/1 :: (file:filename()) ->  rabbit_types:ok_or_error(any())).
--spec(wait_for_tables/1 :: ([atom()]) -> 'ok').
--spec(create_cluster_nodes_config/1 :: ([node()]) ->  'ok').
--spec(read_cluster_nodes_config/0 :: () ->  [node()]).
--spec(record_running_nodes/0 :: () ->  'ok').
--spec(read_previously_running_nodes/0 :: () ->  [node()]).
--spec(running_nodes_filename/0 :: () -> file:filename()).
--spec(is_disc_node/0 :: () -> boolean()).
+-spec(check_cluster_consistency/0 :: () -> 'ok').
+-spec(ensure_mnesia_dir/0 :: () -> 'ok').
+
+%% Hooks used in `rabbit_node_monitor'
 -spec(on_node_up/1 :: (node()) -> 'ok').
 -spec(on_node_down/1 :: (node()) -> 'ok').
 
--spec(table_names/0 :: () -> [atom()]).
-
 -endif.
 
 %%----------------------------------------------------------------------------
-
-status() ->
-    [{nodes, case mnesia:system_info(is_running) of
-                 yes -> [{Key, Nodes} ||
-                            {Key, CopyType} <- [{disc_only, disc_only_copies},
-                                                {disc,      disc_copies},
-                                                {ram,       ram_copies}],
-                            begin
-                                Nodes = nodes_of_type(CopyType),
-                                Nodes =/= []
-                            end];
-                 no -> case all_clustered_nodes() of
-                           [] -> [];
-                           Nodes -> [{unknown, Nodes}]
-                       end;
-                 Reason when Reason =:= starting; Reason =:= stopping ->
-                     exit({rabbit_busy, try_again_later})
-             end},
-     {running_nodes, running_clustered_nodes()}].
+%% Main interface
+%%----------------------------------------------------------------------------
 
 init() ->
     ensure_mnesia_running(),
     ensure_mnesia_dir(),
-    Nodes = read_cluster_nodes_config(),
-    ok = init_db(Nodes, should_be_disc_node(Nodes)),
+    case is_virgin_node() of
+        true  -> init_from_config();
+        false -> NodeType = node_type(),
+                 init_db_and_upgrade(cluster_nodes(all), NodeType,
+                                     NodeType =:= ram)
+    end,
     %% We intuitively expect the global name server to be synced when
-    %% Mnesia is up. In fact that's not guaranteed to be the case - let's
-    %% make it so.
+    %% Mnesia is up. In fact that's not guaranteed to be the case -
+    %% let's make it so.
     ok = global:sync(),
-    ok = delete_previously_running_nodes(),
     ok.
 
-is_db_empty() ->
-    lists:all(fun (Tab) -> mnesia:dirty_first(Tab) == '$end_of_table' end,
-              table_names()).
-
-cluster(ClusterNodes) ->
-    cluster(ClusterNodes, false).
-force_cluster(ClusterNodes) ->
-    cluster(ClusterNodes, true).
-
-%% Alter which disk nodes this node is clustered with. This can be a
-%% subset of all the disk nodes in the cluster but can (and should)
-%% include the node itself if it is to be a disk rather than a ram
-%% node.  If Force is false, only connections to online nodes are
-%% allowed.
-cluster(ClusterNodes, Force) ->
-    rabbit_misc:local_info_msg("Clustering with ~p~s~n",
-                               [ClusterNodes, if Force -> " forcefully";
-                                                 true  -> ""
-                                              end]),
+init_from_config() ->
+    {TryNodes, NodeType} =
+        case application:get_env(rabbit, cluster_nodes) of
+            {ok, Nodes} when is_list(Nodes) ->
+                Config = {Nodes -- [node()], case lists:member(node(), Nodes) of
+                                                 true  -> disc;
+                                                 false -> ram
+                                             end},
+                error_logger:warning_msg(
+                  "Converting legacy 'cluster_nodes' configuration~n    ~w~n"
+                  "to~n    ~w.~n~n"
+                  "Please update the configuration to the new format "
+                  "{Nodes, NodeType}, where Nodes contains the nodes that the "
+                  "node will try to cluster with, and NodeType is either "
+                  "'disc' or 'ram'~n", [Nodes, Config]),
+                Config;
+            {ok, Config} ->
+                Config
+        end,
+    case find_good_node(nodes_excl_me(TryNodes)) of
+        {ok, Node} ->
+            rabbit_log:info("Node '~p' selected for clustering from "
+                            "configuration~n", [Node]),
+            {ok, {_, DiscNodes, _}} = discover_cluster(Node),
+            init_db_and_upgrade(DiscNodes, NodeType, true),
+            rabbit_node_monitor:notify_joined_cluster();
+        none ->
+            rabbit_log:warning("Could not find any suitable node amongst the "
+                               "ones provided in the configuration: ~p~n",
+                               [TryNodes]),
+            init_db_and_upgrade([node()], disc, false)
+    end.
+
+%% Make the node join a cluster. The node will be reset automatically
+%% before we actually cluster it. The nodes provided will be used to
+%% find out about the nodes in the cluster.
+%%
+%% This function will fail if:
+%%
+%%   * The node is currently the only disc node of its cluster
+%%   * We can't connect to any of the nodes provided
+%%   * The node is currently already clustered with the cluster of the nodes
+%%     provided
+%%
+%% Note that we make no attempt to verify that the nodes provided are
+%% all in the same cluster, we simply pick the first online node and
+%% we cluster to its cluster.
+join_cluster(DiscoveryNode, NodeType) ->
     ensure_mnesia_not_running(),
     ensure_mnesia_dir(),
+    case is_only_clustered_disc_node() of
+        true  -> e(clustering_only_disc_node);
+        false -> ok
+    end,
+    {ClusterNodes, _, _} = case discover_cluster(DiscoveryNode) of
+                               {ok, Res}      -> Res;
+                               {error, _} = E -> throw(E)
+                           end,
+    case me_in_nodes(ClusterNodes) of
+        false ->
+            %% reset the node. this simplifies things and it will be needed in
+            %% this case - we're joining a new cluster with new nodes which
+            %% are not in synch with the current node. It also lifts the burden
+            %% of resetting the node from the user.
+            reset_gracefully(),
+
+            %% Join the cluster
+            rabbit_misc:local_info_msg("Clustering with ~p as ~p node~n",
+                                       [ClusterNodes, NodeType]),
+            ok = init_db_with_mnesia(ClusterNodes, NodeType, true, true),
+            rabbit_node_monitor:notify_joined_cluster(),
+            ok;
+        true ->
+            rabbit_misc:local_info_msg("Already member of cluster: ~p~n",
+                                       [ClusterNodes]),
+            {ok, already_member}
+    end.
 
-    case not Force andalso is_clustered() andalso
-         is_only_disc_node(node(), false) andalso
-         not should_be_disc_node(ClusterNodes)
-    of
-        true -> log_both("last running disc node leaving cluster");
-        _    -> ok
+%% Return the node to its virgin state, where it is not a member of any
+%% cluster, has no cluster configuration, no local database, and no
+%% persisted messages
+reset() ->
+    ensure_mnesia_not_running(),
+    rabbit_misc:local_info_msg("Resetting Rabbit~n", []),
+    reset_gracefully().
+
+force_reset() ->
+    ensure_mnesia_not_running(),
+    rabbit_misc:local_info_msg("Resetting Rabbit forcefully~n", []),
+    wipe().
+
+reset_gracefully() ->
+    AllNodes = cluster_nodes(all),
+    %% Reconnecting so that we get an up-to-date list of nodes.  We don't
+    %% need to check for consistency because we are resetting.
+    %% Force=true here so that reset still works when clustered with a
+    %% node which is down.
+    init_db_with_mnesia(AllNodes, node_type(), false, false),
+    case is_only_clustered_disc_node() of
+        true  -> e(resetting_only_disc_node);
+        false -> ok
     end,
+    leave_cluster(),
+    rabbit_misc:ensure_ok(mnesia:delete_schema([node()]), cannot_delete_schema),
+    wipe().
+
+wipe() ->
+    %% We need to make sure that we don't end up in a distributed
+    %% Erlang system with nodes while not being in an Mnesia cluster
+    %% with them. We don't handle that well.
+    [erlang:disconnect_node(N) || N <- cluster_nodes(all)],
+    %% remove persisted messages and any other garbage we find
+    ok = rabbit_file:recursive_delete(filelib:wildcard(dir() ++ "/*")),
+    ok = rabbit_node_monitor:reset_cluster_status(),
+    ok.
 
-    %% Wipe mnesia if we're changing type from disc to ram
-    case {is_disc_node(), should_be_disc_node(ClusterNodes)} of
-        {true, false} -> rabbit_misc:with_local_io(
-                           fun () -> error_logger:warning_msg(
-                                       "changing node type; wiping "
-                                       "mnesia...~n~n")
-                           end),
-                         rabbit_misc:ensure_ok(mnesia:delete_schema([node()]),
-                                               cannot_delete_schema);
-        _             -> ok
+change_cluster_node_type(Type) ->
+    ensure_mnesia_not_running(),
+    ensure_mnesia_dir(),
+    case is_clustered() of
+        false -> e(not_clustered);
+        true  -> ok
     end,
+    {_, _, RunningNodes} = case discover_cluster(cluster_nodes(all)) of
+                               {ok, Status}     -> Status;
+                               {error, _Reason} -> e(cannot_connect_to_cluster)
+                           end,
+    %% We might still be marked as running by a remote node since the
+    %% information of us going down might not have propagated yet.
+    Node = case RunningNodes -- [node()] of
+               []        -> e(no_online_cluster_nodes);
+               [Node0|_] -> Node0
+           end,
+    ok = reset(),
+    ok = join_cluster(Node, Type).
 
-    %% Pre-emptively leave the cluster
-    %%
-    %% We're trying to handle the following two cases:
-    %% 1. We have a two-node cluster, where both nodes are disc nodes.
-    %% One node is re-clustered as a ram node.  When it tries to
-    %% re-join the cluster, but before it has time to update its
-    %% tables definitions, the other node will order it to re-create
-    %% its disc tables.  So, we need to leave the cluster before we
-    %% can join it again.
-    %% 2. We have a two-node cluster, where both nodes are disc nodes.
-    %% One node is forcefully reset (so, the other node thinks its
-    %% still a part of the cluster).  The reset node is re-clustered
-    %% as a ram node.  Same as above, we need to leave the cluster
-    %% before we can join it.  But, since we don't know if we're in a
-    %% cluster or not, we just pre-emptively leave it before joining.
-    ProperClusterNodes = ClusterNodes -- [node()],
-    try
-        ok = leave_cluster(ProperClusterNodes, ProperClusterNodes)
-    catch
-        {error, {no_running_cluster_nodes, _, _}} when Force ->
-            ok
+update_cluster_nodes(DiscoveryNode) ->
+    ensure_mnesia_not_running(),
+    ensure_mnesia_dir(),
+    Status = {AllNodes, _, _} =
+        case discover_cluster(DiscoveryNode) of
+            {ok, Status0}    -> Status0;
+            {error, _Reason} -> e(cannot_connect_to_node)
+        end,
+    case me_in_nodes(AllNodes) of
+        true ->
+            %% As in `check_cluster_consistency/0', we can safely delete the
+            %% schema here, since it'll be replicated from the other
+            %% nodes
+            mnesia:delete_schema([node()]),
+            rabbit_node_monitor:write_cluster_status(Status),
+            rabbit_misc:local_info_msg("Updating cluster nodes from ~p~n",
+                                       [DiscoveryNode]),
+            init_db_with_mnesia(AllNodes, node_type(), true, true);
+        false ->
+            e(inconsistent_cluster)
     end,
+    ok.
 
-    %% Join the cluster
-    start_mnesia(),
-    try
-        ok = init_db(ClusterNodes, Force),
-        ok = create_cluster_nodes_config(ClusterNodes)
-    after
-        stop_mnesia()
+%% We proceed like this: try to remove the node locally. If the node
+%% is offline, we remove the node if:
+%%   * This node is a disc node
+%%   * All other nodes are offline
+%%   * This node was, to the best of our knowledge (see comment below)
+%%     the last or second to last after the node we're removing to go
+%%     down
+forget_cluster_node(Node, RemoveWhenOffline) ->
+    case lists:member(Node, cluster_nodes(all)) of
+        true  -> ok;
+        false -> e(not_a_cluster_node)
     end,
+    case {RemoveWhenOffline, is_running()} of
+        {true,  false} -> remove_node_offline_node(Node);
+        {true,   true} -> e(online_node_offline_flag);
+        {false, false} -> e(offline_node_no_offline_flag);
+        {false,  true} -> rabbit_misc:local_info_msg(
+                            "Removing node ~p from cluster~n", [Node]),
+                          case remove_node_if_mnesia_running(Node) of
+                              ok               -> ok;
+                              {error, _} = Err -> throw(Err)
+                          end
+    end.
 
-    ok.
+remove_node_offline_node(Node) ->
+    %% Here `mnesia:system_info(running_db_nodes)' will RPC, but that's what we
+    %% want - we need to know the running nodes *now*.  If the current node is a
+    %% RAM node it will return bogus results, but we don't care since we only do
+    %% this operation from disc nodes.
+    case {mnesia:system_info(running_db_nodes) -- [Node], node_type()} of
+        {[], disc} ->
+            start_mnesia(),
+            try
+                %% What we want to do here is replace the last node to
+                %% go down with the current node.  The way we do this
+                %% is by force loading the tables, and making sure that
+                %% they are loaded.
+                rabbit_table:force_load(),
+                rabbit_table:wait_for_replicated(),
+                forget_cluster_node(Node, false),
+                force_load_next_boot()
+            after
+                stop_mnesia()
+            end;
+        {_, _} ->
+            e(removing_node_from_offline_node)
+    end.
 
-%% return node to its virgin state, where it is not member of any
-%% cluster, has no cluster configuration, no local database, and no
-%% persisted messages
-reset()       -> reset(false).
-force_reset() -> reset(true).
-
-is_clustered() ->
-    RunningNodes = running_clustered_nodes(),
-    [node()] /= RunningNodes andalso [] /= RunningNodes.
-
-all_clustered_nodes() ->
-    mnesia:system_info(db_nodes).
-
-running_clustered_nodes() ->
-    mnesia:system_info(running_db_nodes).
-
-empty_ram_only_tables() ->
-    Node = node(),
-    lists:foreach(
-      fun (TabName) ->
-              case lists:member(Node, mnesia:table_info(TabName, ram_copies)) of
-                  true  -> {atomic, ok} = mnesia:clear_table(TabName);
-                  false -> ok
-              end
-      end, table_names()),
-    ok.
 
-%%--------------------------------------------------------------------
+%%----------------------------------------------------------------------------
+%% Queries
+%%----------------------------------------------------------------------------
 
-nodes_of_type(Type) ->
-    %% This function should return the nodes of a certain type (ram,
-    %% disc or disc_only) in the current cluster.  The type of nodes
-    %% is determined when the cluster is initially configured.
-    mnesia:table_info(schema, Type).
-
-%% The tables aren't supposed to be on disk on a ram node
-table_definitions(disc) ->
-    table_definitions();
-table_definitions(ram) ->
-    [{Tab, copy_type_to_ram(TabDef)} || {Tab, TabDef} <- table_definitions()].
-
-table_definitions() ->
-    [{rabbit_user,
-      [{record_name, internal_user},
-       {attributes, record_info(fields, internal_user)},
-       {disc_copies, [node()]},
-       {match, #internal_user{_='_'}}]},
-     {rabbit_user_permission,
-      [{record_name, user_permission},
-       {attributes, record_info(fields, user_permission)},
-       {disc_copies, [node()]},
-       {match, #user_permission{user_vhost = #user_vhost{_='_'},
-                                permission = #permission{_='_'},
-                                _='_'}}]},
-     {rabbit_vhost,
-      [{record_name, vhost},
-       {attributes, record_info(fields, vhost)},
-       {disc_copies, [node()]},
-       {match, #vhost{_='_'}}]},
-     {rabbit_listener,
-      [{record_name, listener},
-       {attributes, record_info(fields, listener)},
-       {type, bag},
-       {match, #listener{_='_'}}]},
-     {rabbit_durable_route,
-      [{record_name, route},
-       {attributes, record_info(fields, route)},
-       {disc_copies, [node()]},
-       {match, #route{binding = binding_match(), _='_'}}]},
-     {rabbit_semi_durable_route,
-      [{record_name, route},
-       {attributes, record_info(fields, route)},
-       {type, ordered_set},
-       {match, #route{binding = binding_match(), _='_'}}]},
-     {rabbit_route,
-      [{record_name, route},
-       {attributes, record_info(fields, route)},
-       {type, ordered_set},
-       {match, #route{binding = binding_match(), _='_'}}]},
-     {rabbit_reverse_route,
-      [{record_name, reverse_route},
-       {attributes, record_info(fields, reverse_route)},
-       {type, ordered_set},
-       {match, #reverse_route{reverse_binding = reverse_binding_match(),
-                              _='_'}}]},
-     {rabbit_topic_trie_node,
-      [{record_name, topic_trie_node},
-       {attributes, record_info(fields, topic_trie_node)},
-       {type, ordered_set},
-       {match, #topic_trie_node{trie_node = trie_node_match(), _='_'}}]},
-     {rabbit_topic_trie_edge,
-      [{record_name, topic_trie_edge},
-       {attributes, record_info(fields, topic_trie_edge)},
-       {type, ordered_set},
-       {match, #topic_trie_edge{trie_edge = trie_edge_match(), _='_'}}]},
-     {rabbit_topic_trie_binding,
-      [{record_name, topic_trie_binding},
-       {attributes, record_info(fields, topic_trie_binding)},
-       {type, ordered_set},
-       {match, #topic_trie_binding{trie_binding = trie_binding_match(),
-                                   _='_'}}]},
-     {rabbit_durable_exchange,
-      [{record_name, exchange},
-       {attributes, record_info(fields, exchange)},
-       {disc_copies, [node()]},
-       {match, #exchange{name = exchange_name_match(), _='_'}}]},
-     {rabbit_exchange,
-      [{record_name, exchange},
-       {attributes, record_info(fields, exchange)},
-       {match, #exchange{name = exchange_name_match(), _='_'}}]},
-     {rabbit_exchange_serial,
-      [{record_name, exchange_serial},
-       {attributes, record_info(fields, exchange_serial)},
-       {match, #exchange_serial{name = exchange_name_match(), _='_'}}]},
-     {rabbit_runtime_parameters,
-      [{record_name, runtime_parameters},
-       {attributes, record_info(fields, runtime_parameters)},
-       {disc_copies, [node()]},
-       {match, #runtime_parameters{_='_'}}]},
-     {rabbit_durable_queue,
-      [{record_name, amqqueue},
-       {attributes, record_info(fields, amqqueue)},
-       {disc_copies, [node()]},
-       {match, #amqqueue{name = queue_name_match(), _='_'}}]},
-     {rabbit_queue,
-      [{record_name, amqqueue},
-       {attributes, record_info(fields, amqqueue)},
-       {match, #amqqueue{name = queue_name_match(), _='_'}}]}]
-        ++ gm:table_definitions()
-        ++ mirrored_supervisor:table_definitions().
-
-binding_match() ->
-    #binding{source = exchange_name_match(),
-             destination = binding_destination_match(),
-             _='_'}.
-reverse_binding_match() ->
-    #reverse_binding{destination = binding_destination_match(),
-                     source = exchange_name_match(),
-                     _='_'}.
-binding_destination_match() ->
-    resource_match('_').
-trie_node_match() ->
-    #trie_node{   exchange_name = exchange_name_match(), _='_'}.
-trie_edge_match() ->
-    #trie_edge{   exchange_name = exchange_name_match(), _='_'}.
-trie_binding_match() ->
-    #trie_binding{exchange_name = exchange_name_match(), _='_'}.
-exchange_name_match() ->
-    resource_match(exchange).
-queue_name_match() ->
-    resource_match(queue).
-resource_match(Kind) ->
-    #resource{kind = Kind, _='_'}.
-
-table_names() ->
-    [Tab || {Tab, _} <- table_definitions()].
-
-replicated_table_names() ->
-    [Tab || {Tab, TabDef} <- table_definitions(),
-            not lists:member({local_content, true}, TabDef)
-    ].
+status() ->
+    IfNonEmpty = fun (_,       []) -> [];
+                     (Type, Nodes) -> [{Type, Nodes}]
+                 end,
+    [{nodes, (IfNonEmpty(disc, cluster_nodes(disc)) ++
+                  IfNonEmpty(ram, cluster_nodes(ram)))}] ++
+        case is_running() of
+            true  -> RunningNodes = cluster_nodes(running),
+                     [{running_nodes, RunningNodes},
+                      {partitions,    mnesia_partitions(RunningNodes)}];
+            false -> []
+        end.
+
+mnesia_partitions(Nodes) ->
+    Replies = rabbit_node_monitor:partitions(Nodes),
+    [Reply || Reply = {_, R} <- Replies, R =/= []].
+
+is_running() -> mnesia:system_info(is_running) =:= yes.
+
+is_clustered() -> AllNodes = cluster_nodes(all),
+                  AllNodes =/= [] andalso AllNodes =/= [node()].
+
+cluster_nodes(WhichNodes) -> cluster_status(WhichNodes).
+
+%% This function is the actual source of information, since it gets
+%% the data from mnesia. Obviously it'll work only when mnesia is
+%% running.
+cluster_status_from_mnesia() ->
+    case is_running() of
+        false ->
+            {error, mnesia_not_running};
+        true ->
+            %% If the tables are not present, it means that
+            %% `init_db/3' hasn't been run yet. In other words, either
+            %% we are a virgin node or a restarted RAM node. In both
+            %% cases we're not interested in what mnesia has to say.
+            NodeType = case mnesia:system_info(use_dir) of
+                           true  -> disc;
+                           false -> ram
+                       end,
+            case rabbit_table:is_present() of
+                true  -> AllNodes = mnesia:system_info(db_nodes),
+                         DiscCopies = mnesia:table_info(schema, disc_copies),
+                         DiscNodes = case NodeType of
+                                         disc -> nodes_incl_me(DiscCopies);
+                                         ram  -> DiscCopies
+                                     end,
+                         %% `mnesia:system_info(running_db_nodes)' is safe since
+                         %% we know that mnesia is running
+                         RunningNodes = mnesia:system_info(running_db_nodes),
+                         {ok, {AllNodes, DiscNodes, RunningNodes}};
+                false -> {error, tables_not_present}
+            end
+    end.
+
+cluster_status(WhichNodes) ->
+    {AllNodes, DiscNodes, RunningNodes} = Nodes =
+        case cluster_status_from_mnesia() of
+            {ok, Nodes0} ->
+                Nodes0;
+            {error, _Reason} ->
+                {AllNodes0, DiscNodes0, RunningNodes0} =
+                    rabbit_node_monitor:read_cluster_status(),
+                %% The cluster status file records the status when the node is
+                %% online, but we know for sure that the node is offline now, so
+                %% we can remove it from the list of running nodes.
+                {AllNodes0, DiscNodes0, nodes_excl_me(RunningNodes0)}
+        end,
+    case WhichNodes of
+        status  -> Nodes;
+        all     -> AllNodes;
+        disc    -> DiscNodes;
+        ram     -> AllNodes -- DiscNodes;
+        running -> RunningNodes
+    end.
+
+node_info() ->
+    {erlang:system_info(otp_release), rabbit_misc:version(),
+     cluster_status_from_mnesia()}.
+
+node_type() ->
+    DiscNodes = cluster_nodes(disc),
+    case DiscNodes =:= [] orelse me_in_nodes(DiscNodes) of
+        true  -> disc;
+        false -> ram
+    end.
 
 dir() -> mnesia:system_info(directory).
 
+%%----------------------------------------------------------------------------
+%% Operations on the db
+%%----------------------------------------------------------------------------
+
+%% Adds the provided nodes to the mnesia cluster, creating a new
+%% schema if there is the need to and catching up if there are other
+%% nodes in the cluster already. It also updates the cluster status
+%% file.
+init_db(ClusterNodes, NodeType, CheckOtherNodes) ->
+    Nodes = change_extra_db_nodes(ClusterNodes, CheckOtherNodes),
+    %% Note that we use `system_info' here and not the cluster status
+    %% since when we start rabbit for the first time the cluster
+    %% status will say we are a disc node but the tables won't be
+    %% present yet.
+    WasDiscNode = mnesia:system_info(use_dir),
+    case {Nodes, WasDiscNode, NodeType} of
+        {[], _, ram} ->
+            %% Standalone ram node, we don't want that
+            throw({error, cannot_create_standalone_ram_node});
+        {[], false, disc} ->
+            %% RAM -> disc, starting from scratch
+            ok = create_schema();
+        {[], true, disc} ->
+            %% First disc node up
+            maybe_force_load(),
+            ok;
+        {[AnotherNode | _], _, _} ->
+            %% Subsequent node in cluster, catch up
+            ensure_version_ok(
+              rpc:call(AnotherNode, rabbit_version, recorded, [])),
+            maybe_force_load(),
+            ok = rabbit_table:wait_for_replicated(),
+            ok = rabbit_table:create_local_copy(NodeType)
+    end,
+    ensure_schema_integrity(),
+    rabbit_node_monitor:update_cluster_status(),
+    ok.
+
+init_db_unchecked(ClusterNodes, NodeType) ->
+    init_db(ClusterNodes, NodeType, false).
+
+init_db_and_upgrade(ClusterNodes, NodeType, CheckOtherNodes) ->
+    ok = init_db(ClusterNodes, NodeType, CheckOtherNodes),
+    ok = case rabbit_upgrade:maybe_upgrade_local() of
+             ok                    -> ok;
+             starting_from_scratch -> rabbit_version:record_desired();
+             version_not_available -> schema_ok_or_move()
+         end,
+    %% `maybe_upgrade_local' restarts mnesia, so ram nodes will forget
+    %% about the cluster
+    case NodeType of
+        ram  -> start_mnesia(),
+                change_extra_db_nodes(ClusterNodes, false);
+        disc -> ok
+    end,
+    %% ...and all nodes will need to wait for tables
+    rabbit_table:wait_for_replicated(),
+    ok.
+
+init_db_with_mnesia(ClusterNodes, NodeType,
+                    CheckOtherNodes, CheckConsistency) ->
+    start_mnesia(CheckConsistency),
+    try
+        init_db_and_upgrade(ClusterNodes, NodeType, CheckOtherNodes)
+    after
+        stop_mnesia()
+    end.
+
 ensure_mnesia_dir() ->
     MnesiaDir = dir() ++ "/",
     case filelib:ensure_dir(MnesiaDir) of
@@ -378,210 +507,124 @@ ensure_mnesia_not_running() ->
     end.
 
 ensure_schema_integrity() ->
-    case check_schema_integrity() of
+    case rabbit_table:check_schema_integrity() of
         ok ->
             ok;
         {error, Reason} ->
             throw({error, {schema_integrity_check_failed, Reason}})
     end.
 
-check_schema_integrity() ->
-    Tables = mnesia:system_info(tables),
-    case check_tables(fun (Tab, TabDef) ->
-                              case lists:member(Tab, Tables) of
-                                  false -> {error, {table_missing, Tab}};
-                                  true  -> check_table_attributes(Tab, TabDef)
-                              end
-                      end) of
-        ok     -> ok = wait_for_tables(),
-                  check_tables(fun check_table_content/2);
-        Other  -> Other
-    end.
+copy_db(Destination) ->
+    ok = ensure_mnesia_not_running(),
+    rabbit_file:recursive_copy(dir(), Destination).
+
+force_load_filename() ->
+    filename:join(rabbit_mnesia:dir(), "force_load").
+
+force_load_next_boot() ->
+    rabbit_file:write_file(force_load_filename(), <<"">>).
 
-check_table_attributes(Tab, TabDef) ->
-    {_, ExpAttrs} = proplists:lookup(attributes, TabDef),
-    case mnesia:table_info(Tab, attributes) of
-        ExpAttrs -> ok;
-        Attrs    -> {error, {table_attributes_mismatch, Tab, ExpAttrs, Attrs}}
+maybe_force_load() ->
+    case rabbit_file:is_file(force_load_filename()) of
+        true  -> rabbit_table:force_load(),
+                 rabbit_file:delete(force_load_filename());
+        false -> ok
     end.
 
-check_table_content(Tab, TabDef) ->
-    {_, Match} = proplists:lookup(match, TabDef),
-    case mnesia:dirty_first(Tab) of
-        '$end_of_table' ->
+%% This does not guarantee us much, but it avoids some situations that
+%% will definitely end up badly
+check_cluster_consistency() ->
+    %% We want to find 0 or 1 consistent nodes.
+    case lists:foldl(
+           fun (Node,  {error, _})    -> check_cluster_consistency(Node);
+               (_Node, {ok, Status})  -> {ok, Status}
+           end, {error, not_found}, nodes_excl_me(cluster_nodes(all)))
+    of
+        {ok, Status = {RemoteAllNodes, _, _}} ->
+            case ordsets:is_subset(ordsets:from_list(cluster_nodes(all)),
+                                   ordsets:from_list(RemoteAllNodes)) of
+                true  ->
+                    ok;
+                false ->
+                    %% We delete the schema here since we think we are
+                    %% clustered with nodes that are no longer in the
+                    %% cluster and there is no other way to remove
+                    %% them from our schema. On the other hand, we are
+                    %% sure that there is another online node that we
+                    %% can use to sync the tables with. There is a
+                    %% race here: if between this check and the
+                    %% `init_db' invocation the cluster gets
+                    %% disbanded, we're left with a node with no
+                    %% mnesia data that will try to connect to offline
+                    %% nodes.
+                    mnesia:delete_schema([node()])
+            end,
+            rabbit_node_monitor:write_cluster_status(Status);
+        {error, not_found} ->
             ok;
-        Key ->
-            ObjList = mnesia:dirty_read(Tab, Key),
-            MatchComp = ets:match_spec_compile([{Match, [], ['$_']}]),
-            case ets:match_spec_run(ObjList, MatchComp) of
-                ObjList -> ok;
-                _       -> {error, {table_content_invalid, Tab, Match, ObjList}}
-            end
+        {error, _} = E ->
+            throw(E)
     end.
 
-check_tables(Fun) ->
-    case [Error || {Tab, TabDef} <- table_definitions(
-                                      case is_disc_node() of
-                                          true  -> disc;
-                                          false -> ram
-                                      end),
-                   case Fun(Tab, TabDef) of
-                       ok             -> Error = none, false;
-                       {error, Error} -> true
-                   end] of
-        []     -> ok;
-        Errors -> {error, Errors}
+check_cluster_consistency(Node) ->
+    case rpc:call(Node, rabbit_mnesia, node_info, []) of
+        {badrpc, _Reason} ->
+            {error, not_found};
+        {_OTP, _Rabbit, {error, _}} ->
+            {error, not_found};
+        {OTP, Rabbit, {ok, Status}} ->
+            case check_consistency(OTP, Rabbit, Node, Status) of
+                {error, _} = E -> E;
+                {ok, Res}      -> {ok, Res}
+            end;
+        {_OTP, Rabbit, _Hash, _Status} ->
+            %% delegate hash checking implies version mismatch
+            version_error("Rabbit", rabbit_misc:version(), Rabbit)
     end.
 
-%% The cluster node config file contains some or all of the disk nodes
-%% that are members of the cluster this node is / should be a part of.
-%%
-%% If the file is absent, the list is empty, or only contains the
-%% current node, then the current node is a standalone (disk)
-%% node. Otherwise it is a node that is part of a cluster as either a
-%% disk node, if it appears in the cluster node config, or ram node if
-%% it doesn't.
-
-cluster_nodes_config_filename() ->
-    dir() ++ "/cluster_nodes.config".
-
-create_cluster_nodes_config(ClusterNodes) ->
-    FileName = cluster_nodes_config_filename(),
-    case rabbit_file:write_term_file(FileName, [ClusterNodes]) of
-        ok -> ok;
-        {error, Reason} ->
-            throw({error, {cannot_create_cluster_nodes_config,
-                           FileName, Reason}})
-    end.
+%%--------------------------------------------------------------------
+%% Hooks for `rabbit_node_monitor'
+%%--------------------------------------------------------------------
 
-read_cluster_nodes_config() ->
-    FileName = cluster_nodes_config_filename(),
-    case rabbit_file:read_term_file(FileName) of
-        {ok, [ClusterNodes]} -> ClusterNodes;
-        {error, enoent} ->
-            {ok, ClusterNodes} = application:get_env(rabbit, cluster_nodes),
-            ClusterNodes;
-        {error, Reason} ->
-            throw({error, {cannot_read_cluster_nodes_config,
-                           FileName, Reason}})
+on_node_up(Node) ->
+    case running_disc_nodes() of
+        [Node] -> rabbit_log:info("cluster contains disc nodes again~n");
+        _      -> ok
     end.
 
-delete_cluster_nodes_config() ->
-    FileName = cluster_nodes_config_filename(),
-    case file:delete(FileName) of
-        ok -> ok;
-        {error, enoent} -> ok;
-        {error, Reason} ->
-            throw({error, {cannot_delete_cluster_nodes_config,
-                           FileName, Reason}})
+on_node_down(_Node) ->
+    case running_disc_nodes() of
+        [] -> rabbit_log:info("only running disc node went down~n");
+        _  -> ok
     end.
 
-running_nodes_filename() ->
-    filename:join(dir(), "nodes_running_at_shutdown").
-
-record_running_nodes() ->
-    FileName = running_nodes_filename(),
-    Nodes = running_clustered_nodes() -- [node()],
-    %% Don't check the result: we're shutting down anyway and this is
-    %% a best-effort-basis.
-    rabbit_file:write_term_file(FileName, [Nodes]),
-    ok.
-
-read_previously_running_nodes() ->
-    FileName = running_nodes_filename(),
-    case rabbit_file:read_term_file(FileName) of
-        {ok, [Nodes]}   -> Nodes;
-        {error, enoent} -> [];
-        {error, Reason} -> throw({error, {cannot_read_previous_nodes_file,
-                                          FileName, Reason}})
-    end.
+running_disc_nodes() ->
+    {_AllNodes, DiscNodes, RunningNodes} = cluster_status(status),
+    ordsets:to_list(ordsets:intersection(ordsets:from_list(DiscNodes),
+                                         ordsets:from_list(RunningNodes))).
 
-delete_previously_running_nodes() ->
-    FileName = running_nodes_filename(),
-    case file:delete(FileName) of
-        ok              -> ok;
-        {error, enoent} -> ok;
-        {error, Reason} -> throw({error, {cannot_delete_previous_nodes_file,
-                                          FileName, Reason}})
-    end.
+%%--------------------------------------------------------------------
+%% Internal helpers
+%%--------------------------------------------------------------------
 
-init_db(ClusterNodes, Force) ->
-    init_db(
-      ClusterNodes, Force,
-      fun () ->
-              case rabbit_upgrade:maybe_upgrade_local() of
-                  ok                    -> ok;
-                  %% If we're just starting up a new node we won't have a
-                  %% version
-                  starting_from_scratch -> ok = rabbit_version:record_desired()
-              end
-      end).
-
-%% Take a cluster node config and create the right kind of node - a
-%% standalone disk node, or disk or ram node connected to the
-%% specified cluster nodes.  If Force is false, don't allow
-%% connections to offline nodes.
-init_db(ClusterNodes, Force, SecondaryPostMnesiaFun) ->
-    UClusterNodes = lists:usort(ClusterNodes),
-    ProperClusterNodes = UClusterNodes -- [node()],
-    case mnesia:change_config(extra_db_nodes, ProperClusterNodes) of
-        {ok, []} when not Force andalso ProperClusterNodes =/= [] ->
-            throw({error, {failed_to_cluster_with, ProperClusterNodes,
-                           "Mnesia could not connect to any disc nodes."}});
-        {ok, Nodes} ->
-            WasDiscNode = is_disc_node(),
-            WantDiscNode = should_be_disc_node(ClusterNodes),
-            %% We create a new db (on disk, or in ram) in the first
-            %% two cases and attempt to upgrade the in the other two
-            case {Nodes, WasDiscNode, WantDiscNode} of
-                {[], _, false} ->
-                    %% New ram node; start from scratch
-                    ok = create_schema(ram);
-                {[], false, true} ->
-                    %% Nothing there at all, start from scratch
-                    ok = create_schema(disc);
-                {[], true, true} ->
-                    %% We're the first node up
-                    case rabbit_upgrade:maybe_upgrade_local() of
-                        ok                    -> ensure_schema_integrity();
-                        version_not_available -> ok = schema_ok_or_move()
-                    end;
-                {[AnotherNode|_], _, _} ->
-                    %% Subsequent node in cluster, catch up
-                    ensure_version_ok(
-                      rpc:call(AnotherNode, rabbit_version, recorded, [])),
-                    {CopyType, CopyTypeAlt} =
-                        case WantDiscNode of
-                            true  -> {disc, disc_copies};
-                            false -> {ram, ram_copies}
-                        end,
-                    ok = wait_for_replicated_tables(),
-                    ok = create_local_table_copy(schema, CopyTypeAlt),
-                    ok = create_local_table_copies(CopyType),
-
-                    ok = SecondaryPostMnesiaFun(),
-                    %% We've taken down mnesia, so ram nodes will need
-                    %% to re-sync
-                    case is_disc_node() of
-                        false -> start_mnesia(),
-                                 mnesia:change_config(extra_db_nodes,
-                                                      ProperClusterNodes),
-                                 wait_for_replicated_tables();
-                        true  -> ok
-                    end,
-
-                    ensure_schema_integrity(),
-                    ok
-            end;
-        {error, Reason} ->
-            %% one reason we may end up here is if we try to join
-            %% nodes together that are currently running standalone or
-            %% are members of a different cluster
-            throw({error, {unable_to_join_cluster, ClusterNodes, Reason}})
+discover_cluster(Nodes) when is_list(Nodes) ->
+    lists:foldl(fun (_, {ok, Res})     -> {ok, Res};
+                    (Node, {error, _}) -> discover_cluster(Node)
+                end, {error, no_nodes_provided}, Nodes);
+discover_cluster(Node) when Node == node() ->
+    {error, {cannot_discover_cluster, "Cannot cluster node with itself"}};
+discover_cluster(Node) ->
+    OfflineError =
+        {error, {cannot_discover_cluster,
+                 "The nodes provided are either offline or not running"}},
+    case rpc:call(Node, rabbit_mnesia, cluster_status_from_mnesia, []) of
+        {badrpc, _Reason}           -> OfflineError;
+        {error, mnesia_not_running} -> OfflineError;
+        {ok, Res}                   -> {ok, Res}
     end.
 
 schema_ok_or_move() ->
-    case check_schema_integrity() of
+    case rabbit_table:check_schema_integrity() of
         ok ->
             ok;
         {error, Reason} ->
@@ -592,7 +635,7 @@ schema_ok_or_move() ->
                                      "and recreating schema from scratch~n",
                                      [Reason]),
             ok = move_db(),
-            ok = create_schema(disc)
+            ok = create_schema()
     end.
 
 ensure_version_ok({ok, DiscVersion}) ->
@@ -604,25 +647,16 @@ ensure_version_ok({ok, DiscVersion}) ->
 ensure_version_ok({error, _}) ->
     ok = rabbit_version:record_desired().
 
-create_schema(Type) ->
+%% We only care about disc nodes since ram nodes are supposed to catch
+%% up only
+create_schema() ->
     stop_mnesia(),
-    case Type of
-        disc -> rabbit_misc:ensure_ok(mnesia:create_schema([node()]),
-                                      cannot_create_schema);
-        ram  -> %% remove the disc schema since this is a ram node
-                rabbit_misc:ensure_ok(mnesia:delete_schema([node()]),
-                                      cannot_delete_schema)
-    end,
+    rabbit_misc:ensure_ok(mnesia:create_schema([node()]), cannot_create_schema),
     start_mnesia(),
-    ok = create_tables(Type),
+    ok = rabbit_table:create(),
     ensure_schema_integrity(),
     ok = rabbit_version:record_desired().
 
-is_disc_node() -> mnesia:system_info(use_dir).
-
-should_be_disc_node(ClusterNodes) ->
-    ClusterNodes == [] orelse lists:member(node(), ClusterNodes).
-
 move_db() ->
     stop_mnesia(),
     MnesiaDir = filename:dirname(dir() ++ "/"),
@@ -644,186 +678,192 @@ move_db() ->
     start_mnesia(),
     ok.
 
-copy_db(Destination) ->
-    ok = ensure_mnesia_not_running(),
-    rabbit_file:recursive_copy(dir(), Destination).
+remove_node_if_mnesia_running(Node) ->
+    case is_running() of
+        false ->
+            {error, mnesia_not_running};
+        true ->
+            %% Deleting the schema copy of the node will result in
+            %% the node being removed from the cluster, with that
+            %% change being propagated to all nodes
+            case mnesia:del_table_copy(schema, Node) of
+                {atomic, ok} ->
+                    rabbit_amqqueue:forget_all_durable(Node),
+                    rabbit_node_monitor:notify_left_cluster(Node),
+                    ok;
+                {aborted, Reason} ->
+                    {error, {failed_to_remove_node, Node, Reason}}
+            end
+    end.
 
-create_tables() -> create_tables(disc).
+leave_cluster() ->
+    case nodes_excl_me(cluster_nodes(all)) of
+        []       -> ok;
+        AllNodes -> case lists:any(fun leave_cluster/1, AllNodes) of
+                        true  -> ok;
+                        false -> e(no_running_cluster_nodes)
+                    end
+    end.
 
-create_tables(Type) ->
-    lists:foreach(fun ({Tab, TabDef}) ->
-                          TabDef1 = proplists:delete(match, TabDef),
-                          case mnesia:create_table(Tab, TabDef1) of
-                              {atomic, ok} -> ok;
-                              {aborted, Reason} ->
-                                  throw({error, {table_creation_failed,
-                                                 Tab, TabDef1, Reason}})
-                          end
-                  end,
-                  table_definitions(Type)),
-    ok.
+leave_cluster(Node) ->
+    case rpc:call(Node,
+                  rabbit_mnesia, remove_node_if_mnesia_running, [node()]) of
+        ok                          -> true;
+        {error, mnesia_not_running} -> false;
+        {error, Reason}             -> throw({error, Reason});
+        {badrpc, nodedown}          -> false
+    end.
 
-copy_type_to_ram(TabDef) ->
-    [{disc_copies, []}, {ram_copies, [node()]}
-     | proplists:delete(ram_copies, proplists:delete(disc_copies, TabDef))].
-
-table_has_copy_type(TabDef, DiscType) ->
-    lists:member(node(), proplists:get_value(DiscType, TabDef, [])).
-
-create_local_table_copies(Type) ->
-    lists:foreach(
-      fun ({Tab, TabDef}) ->
-              HasDiscCopies     = table_has_copy_type(TabDef, disc_copies),
-              HasDiscOnlyCopies = table_has_copy_type(TabDef, disc_only_copies),
-              LocalTab          = proplists:get_bool(local_content, TabDef),
-              StorageType =
-                  if
-                      Type =:= disc orelse LocalTab ->
-                          if
-                              HasDiscCopies     -> disc_copies;
-                              HasDiscOnlyCopies -> disc_only_copies;
-                              true              -> ram_copies
-                          end;
-%%% unused code - commented out to keep dialyzer happy
-%%%                      Type =:= disc_only ->
-%%%                          if
-%%%                              HasDiscCopies or HasDiscOnlyCopies ->
-%%%                                  disc_only_copies;
-%%%                              true -> ram_copies
-%%%                          end;
-                      Type =:= ram ->
-                          ram_copies
-                  end,
-              ok = create_local_table_copy(Tab, StorageType)
-      end,
-      table_definitions(Type)),
-    ok.
+wait_for(Condition) ->
+    error_logger:info_msg("Waiting for ~p...~n", [Condition]),
+    timer:sleep(1000).
 
-create_local_table_copy(Tab, Type) ->
-    StorageType = mnesia:table_info(Tab, storage_type),
-    {atomic, ok} =
-        if
-            StorageType == unknown ->
-                mnesia:add_table_copy(Tab, node(), Type);
-            StorageType /= Type ->
-                mnesia:change_table_copy_type(Tab, node(), Type);
-            true -> {atomic, ok}
-        end,
-    ok.
+start_mnesia(CheckConsistency) ->
+    case CheckConsistency of
+        true  -> check_cluster_consistency();
+        false -> ok
+    end,
+    rabbit_misc:ensure_ok(mnesia:start(), cannot_start_mnesia),
+    ensure_mnesia_running().
 
-wait_for_replicated_tables() -> wait_for_tables(replicated_table_names()).
+start_mnesia() ->
+    start_mnesia(true).
 
-wait_for_tables() -> wait_for_tables(table_names()).
+stop_mnesia() ->
+    stopped = mnesia:stop(),
+    ensure_mnesia_not_running().
 
-wait_for_tables(TableNames) ->
-    case mnesia:wait_for_tables(TableNames, 30000) of
-        ok ->
-            ok;
-        {timeout, BadTabs} ->
-            throw({error, {timeout_waiting_for_tables, BadTabs}});
-        {error, Reason} ->
-            throw({error, {failed_waiting_for_tables, Reason}})
+change_extra_db_nodes(ClusterNodes0, CheckOtherNodes) ->
+    ClusterNodes = nodes_excl_me(ClusterNodes0),
+    case {mnesia:change_config(extra_db_nodes, ClusterNodes), ClusterNodes} of
+        {{ok, []}, [_|_]} when CheckOtherNodes ->
+            throw({error, {failed_to_cluster_with, ClusterNodes,
+                           "Mnesia could not connect to any nodes."}});
+        {{ok, Nodes}, _} ->
+            Nodes
     end.
 
-reset(Force) ->
-    rabbit_misc:local_info_msg("Resetting Rabbit~s~n", [if Force -> " forcefully";
-                                                           true  -> ""
-                                                        end]),
-    ensure_mnesia_not_running(),
-    case not Force andalso is_clustered() andalso
-         is_only_disc_node(node(), false)
-    of
-        true  -> log_both("no other disc nodes running");
-        false -> ok
-    end,
-    Node = node(),
-    Nodes = all_clustered_nodes() -- [Node],
-    case Force of
-        true  -> ok;
+check_consistency(OTP, Rabbit) ->
+    rabbit_misc:sequence_error(
+      [check_otp_consistency(OTP),
+       check_rabbit_consistency(Rabbit)]).
+
+check_consistency(OTP, Rabbit, Node, Status) ->
+    rabbit_misc:sequence_error(
+      [check_otp_consistency(OTP),
+       check_rabbit_consistency(Rabbit),
+       check_nodes_consistency(Node, Status)]).
+
+check_nodes_consistency(Node, RemoteStatus = {RemoteAllNodes, _, _}) ->
+    case me_in_nodes(RemoteAllNodes) of
+        true ->
+            {ok, RemoteStatus};
         false ->
-            ensure_mnesia_dir(),
-            start_mnesia(),
-            RunningNodes =
-                try
-                    %% Force=true here so that reset still works when clustered
-                    %% with a node which is down
-                    ok = init_db(read_cluster_nodes_config(), true),
-                    running_clustered_nodes() -- [Node]
-                after
-                    stop_mnesia()
-                end,
-            leave_cluster(Nodes, RunningNodes),
-            rabbit_misc:ensure_ok(mnesia:delete_schema([Node]),
-                                  cannot_delete_schema)
-    end,
-    %% We need to make sure that we don't end up in a distributed
-    %% Erlang system with nodes while not being in an Mnesia cluster
-    %% with them. We don't handle that well.
-    [erlang:disconnect_node(N) || N <- Nodes],
-    ok = delete_cluster_nodes_config(),
-    %% remove persisted messages and any other garbage we find
-    ok = rabbit_file:recursive_delete(filelib:wildcard(dir() ++ "/*")),
-    ok.
-
-leave_cluster([], _) -> ok;
-leave_cluster(Nodes, RunningNodes) ->
-    %% find at least one running cluster node and instruct it to
-    %% remove our schema copy which will in turn result in our node
-    %% being removed as a cluster node from the schema, with that
-    %% change being propagated to all nodes
-    case lists:any(
-           fun (Node) ->
-                   case rpc:call(Node, mnesia, del_table_copy,
-                                 [schema, node()]) of
-                       {atomic, ok} -> true;
-                       {badrpc, nodedown} -> false;
-                       {aborted, {node_not_running, _}} -> false;
-                       {aborted, Reason} ->
-                           throw({error, {failed_to_leave_cluster,
-                                          Nodes, RunningNodes, Reason}})
-                   end
-           end,
-           RunningNodes) of
-        true -> ok;
-        false -> throw({error, {no_running_cluster_nodes,
-                                Nodes, RunningNodes}})
+            {error, {inconsistent_cluster,
+                     rabbit_misc:format("Node ~p thinks it's clustered "
+                                        "with node ~p, but ~p disagrees",
+                                        [node(), Node, Node])}}
     end.
 
-wait_for(Condition) ->
-    error_logger:info_msg("Waiting for ~p...~n", [Condition]),
-    timer:sleep(1000).
+check_version_consistency(This, Remote, Name) ->
+    check_version_consistency(This, Remote, Name, fun (A, B) -> A =:= B end).
 
-on_node_up(Node) ->
-    case is_only_disc_node(Node, true) of
-        true  -> rabbit_log:info("cluster contains disc nodes again~n");
-        false -> ok
+check_version_consistency(This, Remote, Name, Comp) ->
+    case Comp(This, Remote) of
+        true  -> ok;
+        false -> version_error(Name, This, Remote)
     end.
 
-on_node_down(Node) ->
-    case is_only_disc_node(Node, true) of
-        true  -> rabbit_log:info("only running disc node went down~n");
-        false -> ok
+version_error(Name, This, Remote) ->
+    {error, {inconsistent_cluster,
+             rabbit_misc:format("~s version mismatch: local node is ~s, "
+                                "remote node ~s", [Name, This, Remote])}}.
+
+check_otp_consistency(Remote) ->
+    check_version_consistency(erlang:system_info(otp_release), Remote, "OTP").
+
+check_rabbit_consistency(Remote) ->
+    check_version_consistency(
+      rabbit_misc:version(), Remote, "Rabbit",
+      fun rabbit_misc:version_minor_equivalent/2).
+
+%% This is fairly tricky.  We want to know if the node is in the state
+%% that a `reset' would leave it in.  We cannot simply check if the
+%% mnesia tables aren't there because restarted RAM nodes won't have
+%% tables while still being non-virgin.  What we do instead is to
+%% check if the mnesia directory is nonexistent or empty, with the
+%% exception of the cluster status files, which will be there thanks to
+%% `rabbit_node_monitor:prepare_cluster_status_files/0'.
+is_virgin_node() ->
+    case rabbit_file:list_dir(dir()) of
+        {error, enoent} ->
+            true;
+        {ok, []} ->
+            true;
+        {ok, [File1, File2]} ->
+            lists:usort([dir() ++ "/" ++ File1, dir() ++ "/" ++ File2]) =:=
+                lists:usort([rabbit_node_monitor:cluster_status_filename(),
+                             rabbit_node_monitor:running_nodes_filename()]);
+        {ok, _} ->
+            false
     end.
 
-is_only_disc_node(Node, _MnesiaRunning = true) ->
-    RunningSet = sets:from_list(running_clustered_nodes()),
-    DiscSet = sets:from_list(nodes_of_type(disc_copies)),
-    [Node] =:= sets:to_list(sets:intersection(RunningSet, DiscSet));
-is_only_disc_node(Node, false) ->
-    start_mnesia(),
-    Res = is_only_disc_node(Node, true),
-    stop_mnesia(),
-    Res.
-
-log_both(Warning) ->
-    io:format("Warning: ~s~n", [Warning]),
-    rabbit_misc:with_local_io(
-      fun () -> error_logger:warning_msg("~s~n", [Warning]) end).
-
-start_mnesia() ->
-    rabbit_misc:ensure_ok(mnesia:start(), cannot_start_mnesia),
-    ensure_mnesia_running().
+%% First node whose versions pass check_consistency/2, or none.
+find_good_node([]) -> none;
+find_good_node([Candidate | Rest]) ->
+    case rpc:call(Candidate, rabbit_mnesia, node_info, []) of
+        {OTP, Rabbit, _} -> case check_consistency(OTP, Rabbit) of
+                                ok         -> {ok, Candidate};
+                                {error, _} -> find_good_node(Rest)
+                            end;
+        %% old delegate hash check
+        {_OTP, _Rabbit, _Hash, _} -> find_good_node(Rest);
+        {badrpc, _Reason}         -> find_good_node(Rest)
+    end.
 
-stop_mnesia() ->
-    stopped = mnesia:stop(),
-    ensure_mnesia_not_running().
+is_only_clustered_disc_node() ->
+    node_type() =:= disc andalso is_clustered() andalso
+        cluster_nodes(disc) =:= [node()].
+
+me_in_nodes(Nodes) -> lists:member(node(), Nodes).
+
+nodes_incl_me(Nodes) -> lists:usort([node()|Nodes]).
+
+nodes_excl_me(Nodes) -> Nodes -- [node()].
+
+e(Tag) -> throw({error, {Tag, error_description(Tag)}}).
+
+error_description(clustering_only_disc_node) ->
+    "You cannot cluster a node if it is the only disc node in its existing "
+        "cluster. If new nodes joined while this node was offline, use "
+        "'update_cluster_nodes' to add them manually.";
+error_description(resetting_only_disc_node) ->
+    "You cannot reset a node when it is the only disc node in a cluster. "
+        "Please convert another node of the cluster to a disc node first.";
+error_description(not_clustered) ->
+    "Non-clustered nodes can only be disc nodes.";
+error_description(cannot_connect_to_cluster) ->
+    "Could not connect to the cluster nodes present in this node's "
+        "status file. If the cluster has changed, you can use the "
+        "'update_cluster_nodes' command to point to the new cluster nodes.";
+error_description(no_online_cluster_nodes) ->
+    "Could not find any online cluster nodes. If the cluster has changed, "
+        "you can use the 'update_cluster_nodes' command.";
+error_description(cannot_connect_to_node) ->
+    "Could not connect to the cluster node provided.";
+error_description(inconsistent_cluster) ->
+    "The nodes provided do not have this node as part of the cluster.";
+error_description(not_a_cluster_node) ->
+    "The node selected is not in the cluster.";
+error_description(online_node_offline_flag) ->
+    "You set the --offline flag, which is used to remove nodes remotely from "
+        "offline nodes, but this node is online.";
+error_description(offline_node_no_offline_flag) ->
+    "You are trying to remove a node from an offline node. That is dangerous, "
+        "but can be done with the --offline flag. Please consult the manual "
+        "for rabbitmqctl for more information.";
+error_description(removing_node_from_offline_node) ->
+    "To remove a node remotely from an offline node, the node you are removing "
+        "from must be a disc node and all the other nodes must be offline.";
+error_description(no_running_cluster_nodes) ->
+    "You cannot leave a cluster if no online nodes are present.".
diff --git a/src/rabbit_msg_file.erl b/src/rabbit_msg_file.erl
index f685b109..a37106d6 100644
--- a/src/rabbit_msg_file.erl
+++ b/src/rabbit_msg_file.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_msg_file).
diff --git a/src/rabbit_msg_store.erl b/src/rabbit_msg_store.erl
index d69dad1f..9a4439a7 100644
--- a/src/rabbit_msg_store.erl
+++ b/src/rabbit_msg_store.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_msg_store).
@@ -29,8 +29,8 @@
 -export([transform_dir/3, force_recovery/2]). %% upgrade
 
 -export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2,
-         code_change/3, prioritise_call/3, prioritise_cast/2,
-         prioritise_info/2, format_message_queue/2]).
+         code_change/3, prioritise_call/4, prioritise_cast/3,
+         prioritise_info/3, format_message_queue/2]).
 
 %%----------------------------------------------------------------------------
 
@@ -51,6 +51,9 @@
 
 -define(HANDLE_CACHE_BUFFER_SIZE, 1048576). %% 1MB
 
+ %% i.e. two pairs, so GC does not go idle when busy
+-define(MAXIMUM_SIMULTANEOUS_GC_FILES, 4).
+
 %%----------------------------------------------------------------------------
 
 -record(msstate,
@@ -624,7 +627,10 @@ client_update_flying(Diff, MsgId, #client_msstate { flying_ets = FlyingEts,
     Key = {MsgId, CRef},
     case ets:insert_new(FlyingEts, {Key, Diff}) of
         true  -> ok;
-        false -> try ets:update_counter(FlyingEts, Key, {2, Diff})
+        false -> try ets:update_counter(FlyingEts, Key, {2, Diff}) of
+                     0    -> ok;
+                     Diff -> ok;
+                     Err  -> throw({bad_flying_ets_update, Diff, Err, Key})
                  catch error:badarg ->
                          %% this is guaranteed to succeed since the
                          %% server only removes and updates flying_ets
@@ -738,7 +744,7 @@ init([Server, BaseDir, ClientRefs, StartupFunState]) ->
      hibernate,
      {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN, ?DESIRED_HIBERNATE}}.
 
-prioritise_call(Msg, _From, _State) ->
+prioritise_call(Msg, _From, _Len, _State) ->
     case Msg of
         successfully_recovered_state                        -> 7;
         {new_client_state, _Ref, _Pid, _MODC, _CloseFDsFun} -> 7;
@@ -746,7 +752,7 @@ prioritise_call(Msg, _From, _State) ->
         _                                                   -> 0
     end.
 
-prioritise_cast(Msg, _State) ->
+prioritise_cast(Msg, _Len, _State) ->
     case Msg of
         {combine_files, _Source, _Destination, _Reclaimed} -> 8;
         {delete_file, _File, _Reclaimed}                   -> 8;
@@ -755,7 +761,7 @@ prioritise_cast(Msg, _State) ->
         _                                                  -> 0
     end.
 
-prioritise_info(Msg, _State) ->
+prioritise_info(Msg, _Len, _State) ->
     case Msg of
         sync                                               -> 8;
         _                                                  -> 0
@@ -943,15 +949,12 @@ next_state(State = #msstate { cref_to_msg_ids = CTM }) ->
         _ -> {State, 0}
     end.
 
-start_sync_timer(State = #msstate { sync_timer_ref = undefined }) ->
-    TRef = erlang:send_after(?SYNC_INTERVAL, self(), sync),
-    State #msstate { sync_timer_ref = TRef }.
+start_sync_timer(State) ->
+    rabbit_misc:ensure_timer(State, #msstate.sync_timer_ref,
+                             ?SYNC_INTERVAL, sync).
 
-stop_sync_timer(State = #msstate { sync_timer_ref = undefined }) ->
-    State;
-stop_sync_timer(State = #msstate { sync_timer_ref = TRef }) ->
-    erlang:cancel_timer(TRef),
-    State #msstate { sync_timer_ref = undefined }.
+stop_sync_timer(State) ->
+    rabbit_misc:stop_timer(State, #msstate.sync_timer_ref).
 
 internal_sync(State = #msstate { current_file_handle = CurHdl,
                                  cref_to_msg_ids     = CTM }) ->
@@ -975,13 +978,21 @@ update_flying(Diff, MsgId, CRef, #msstate { flying_ets = FlyingEts }) ->
     NDiff = -Diff,
     case ets:lookup(FlyingEts, Key) of
         []           -> ignore;
-        [{_,  Diff}] -> ignore;
+        [{_,  Diff}] -> ignore; %% [1]
         [{_, NDiff}] -> ets:update_counter(FlyingEts, Key, {2, Diff}),
                         true = ets:delete_object(FlyingEts, {Key, 0}),
                         process;
         [{_, 0}]     -> true = ets:delete_object(FlyingEts, {Key, 0}),
-                        ignore
+                        ignore;
+        [{_, Err}]   -> throw({bad_flying_ets_record, Diff, Err, Key})
     end.
+%% [1] We can get here, for example, in the following scenario: There
+%% is a write followed by a remove in flight. The counter will be 0,
+%% so on processing the write the server attempts to delete the
+%% entry. If at that point the client injects another write it will
+%% either insert a new entry, containing +1, or increment the existing
+%% entry to +1, thus preventing its removal. Either way therefore when
+%% the server processes the read, the counter will be +1.
 
 write_action({true, not_found}, _MsgId, State) ->
     {ignore, undefined, State};
@@ -1394,7 +1405,7 @@ filenum_to_name(File) -> integer_to_list(File) ++ ?FILE_EXTENSION.
 
 filename_to_num(FileName) -> list_to_integer(filename:rootname(FileName)).
 
-list_sorted_file_names(Dir, Ext) ->
+list_sorted_filenames(Dir, Ext) ->
     lists:sort(fun (A, B) -> filename_to_num(A) < filename_to_num(B) end,
                filelib:wildcard("*" ++ Ext, Dir)).
 
@@ -1531,8 +1542,8 @@ count_msg_refs(Gen, Seed, State) ->
     end.
 
 recover_crashed_compactions(Dir) ->
-    FileNames =    list_sorted_file_names(Dir, ?FILE_EXTENSION),
-    TmpFileNames = list_sorted_file_names(Dir, ?FILE_EXTENSION_TMP),
+    FileNames =    list_sorted_filenames(Dir, ?FILE_EXTENSION),
+    TmpFileNames = list_sorted_filenames(Dir, ?FILE_EXTENSION_TMP),
     lists:foreach(
       fun (TmpFileName) ->
               NonTmpRelatedFileName =
@@ -1609,7 +1620,7 @@ build_index(false, {MsgRefDeltaGen, MsgRefDeltaGenInit},
     ok = count_msg_refs(MsgRefDeltaGen, MsgRefDeltaGenInit, State),
     {ok, Pid} = gatherer:start_link(),
     case [filename_to_num(FileName) ||
-             FileName <- list_sorted_file_names(Dir, ?FILE_EXTENSION)] of
+             FileName <- list_sorted_filenames(Dir, ?FILE_EXTENSION)] of
         []     -> build_index(Pid, undefined, [State #msstate.current_file],
                               State);
         Files  -> {Offset, State1} = build_index(Pid, undefined, Files, State),
@@ -1731,10 +1742,12 @@ maybe_compact(State = #msstate { sum_valid_data        = SumValid,
        (SumFileSize - SumValid) / SumFileSize > ?GARBAGE_FRACTION ->
     %% TODO: the algorithm here is sub-optimal - it may result in a
     %% complete traversal of FileSummaryEts.
-    case ets:first(FileSummaryEts) of
-        '$end_of_table' ->
+    First = ets:first(FileSummaryEts),
+    case First =:= '$end_of_table' orelse
+        orddict:size(Pending) >= ?MAXIMUM_SIMULTANEOUS_GC_FILES of
+        true ->
             State;
-        First ->
+        false ->
             case find_files_to_combine(FileSummaryEts, FileSizeLimit,
                                        ets:lookup(FileSummaryEts, First)) of
                 not_found ->
@@ -2023,7 +2036,7 @@ transform_dir(BaseDir, Store, TransformFun) ->
     CopyFile = fun (Src, Dst) -> {ok, _Bytes} = file:copy(Src, Dst), ok end,
     case filelib:is_dir(TmpDir) of
         true  -> throw({error, transform_failed_previously});
-        false -> FileList = list_sorted_file_names(Dir, ?FILE_EXTENSION),
+        false -> FileList = list_sorted_filenames(Dir, ?FILE_EXTENSION),
                  foreach_file(Dir, TmpDir, TransformFile,     FileList),
                  foreach_file(Dir,         fun file:delete/1, FileList),
                  foreach_file(TmpDir, Dir, CopyFile,          FileList),
diff --git a/src/rabbit_msg_store_ets_index.erl b/src/rabbit_msg_store_ets_index.erl
index 3defeaaf..c17ff2cb 100644
--- a/src/rabbit_msg_store_ets_index.erl
+++ b/src/rabbit_msg_store_ets_index.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_msg_store_ets_index).
diff --git a/src/rabbit_msg_store_gc.erl b/src/rabbit_msg_store_gc.erl
index 3b61ed0b..1edd7d51 100644
--- a/src/rabbit_msg_store_gc.erl
+++ b/src/rabbit_msg_store_gc.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_msg_store_gc).
@@ -23,7 +23,7 @@
 -export([set_maximum_since_use/2]).
 
 -export([init/1, handle_call/3, handle_cast/2, handle_info/2,
-         terminate/2, code_change/3, prioritise_cast/2]).
+         terminate/2, code_change/3, prioritise_cast/3]).
 
 -record(state,
         { pending_no_readers,
@@ -79,8 +79,8 @@ init([MsgStoreState]) ->
                   msg_store_state    = MsgStoreState }, hibernate,
      {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN, ?DESIRED_HIBERNATE}}.
 
-prioritise_cast({set_maximum_since_use, _Age}, _State) -> 8;
-prioritise_cast(_Msg,                          _State) -> 0.
+prioritise_cast({set_maximum_since_use, _Age}, _Len, _State) -> 8;
+prioritise_cast(_Msg,                          _Len, _State) -> 0.
 
 handle_call(stop, _From, State) ->
     {stop, normal, ok, State}.
diff --git a/src/rabbit_msg_store_index.erl b/src/rabbit_msg_store_index.erl
index 6cc0b2a7..bb5f11b0 100644
--- a/src/rabbit_msg_store_index.erl
+++ b/src/rabbit_msg_store_index.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_msg_store_index).
diff --git a/src/rabbit_net.erl b/src/rabbit_net.erl
index bedf5142..e8c96818 100644
--- a/src/rabbit_net.erl
+++ b/src/rabbit_net.erl
@@ -10,17 +10,17 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_net).
 -include("rabbit.hrl").
 
 -export([is_ssl/1, ssl_info/1, controlling_process/2, getstat/2,
-         recv/1, async_recv/3, port_command/2, getopts/2, setopts/2, send/2,
-         close/1, maybe_fast_close/1, sockname/1, peername/1, peercert/1,
-         tune_buffer_size/1, connection_string/2]).
+         recv/1, sync_recv/2, async_recv/3, port_command/2, getopts/2,
+         setopts/2, send/2, close/1, fast_close/1, sockname/1, peername/1,
+         peercert/1, connection_string/2, socket_ends/2]).
 
 %%---------------------------------------------------------------------------
 
@@ -36,7 +36,7 @@
 -type(socket() :: port() | #ssl_socket{}).
 -type(opts() :: [{atom(), any()} |
                  {raw, non_neg_integer(), non_neg_integer(), binary()}]).
-
+-type(host_or_ip() :: binary() | inet:ip_address()).
 -spec(is_ssl/1 :: (socket()) -> boolean()).
 -spec(ssl_info/1 :: (socket())
                     -> 'nossl' | ok_val_or_error(
@@ -48,6 +48,8 @@
 -spec(recv/1 :: (socket()) ->
                      {'data', [char()] | binary()} | 'closed' |
                      rabbit_types:error(any()) | {'other', any()}).
+-spec(sync_recv/2 :: (socket(), integer()) -> rabbit_types:ok(binary()) |
+                                              rabbit_types:error(any())).
 -spec(async_recv/3 ::
         (socket(), integer(), timeout()) -> rabbit_types:ok(any())).
 -spec(port_command/2 :: (socket(), iolist()) -> 'true').
@@ -59,7 +61,7 @@
 -spec(setopts/2 :: (socket(), opts()) -> ok_or_any_error()).
 -spec(send/2 :: (socket(), binary() | iolist()) -> ok_or_any_error()).
 -spec(close/1 :: (socket()) -> ok_or_any_error()).
--spec(maybe_fast_close/1 :: (socket()) -> ok_or_any_error()).
+-spec(fast_close/1 :: (socket()) -> ok_or_any_error()).
 -spec(sockname/1 ::
         (socket())
         -> ok_val_or_error({inet:ip_address(), rabbit_networking:ip_port()})).
@@ -69,14 +71,19 @@
 -spec(peercert/1 ::
         (socket())
         -> 'nossl' | ok_val_or_error(rabbit_ssl:certificate())).
--spec(tune_buffer_size/1 :: (socket()) -> ok_or_any_error()).
 -spec(connection_string/2 ::
         (socket(), 'inbound' | 'outbound') -> ok_val_or_error(string())).
+-spec(socket_ends/2 ::
+        (socket(), 'inbound' | 'outbound')
+        -> ok_val_or_error({host_or_ip(), rabbit_networking:ip_port(),
+                            host_or_ip(), rabbit_networking:ip_port()})).
 
 -endif.
 
 %%---------------------------------------------------------------------------
 
+-define(SSL_CLOSE_TIMEOUT, 5000).
+
 -define(IS_SSL(Sock), is_record(Sock, ssl_socket)).
 
 is_ssl(Sock) -> ?IS_SSL(Sock).
@@ -109,6 +116,11 @@ recv(S, {DataTag, ClosedTag, ErrorTag}) ->
         Other                 -> {other, Other}
     end.
 
+sync_recv(#ssl_socket{ssl = SslSock}, Length) ->
+    ssl:recv(SslSock, Length);
+sync_recv(TcpSock, Length) ->
+    gen_tcp:recv(TcpSock, Length).
+
 async_recv(Sock, Length, Timeout) when ?IS_SSL(Sock) ->
     Pid = self(),
     Ref = make_ref(),
@@ -148,8 +160,31 @@ send(Sock, Data) when is_port(Sock) -> gen_tcp:send(Sock, Data).
 close(Sock)      when ?IS_SSL(Sock) -> ssl:close(Sock#ssl_socket.ssl);
 close(Sock)      when is_port(Sock) -> gen_tcp:close(Sock).
 
-maybe_fast_close(Sock) when ?IS_SSL(Sock) -> ok;
-maybe_fast_close(Sock) when is_port(Sock) -> erlang:port_close(Sock), ok.
+fast_close(Sock) when ?IS_SSL(Sock) ->
+    %% We cannot simply port_close the underlying tcp socket since the
+    %% TLS protocol is quite insistent that a proper closing handshake
+    %% should take place (see RFC 5246 s7.2.1). So we call ssl:close
+    %% instead, but that can block for a very long time, e.g. when
+    %% there is lots of pending output and there is tcp backpressure,
+    %% or the ssl_connection process has entered the
+    %% workaround_transport_delivery_problems function during
+    %% termination, which, inexplicably, does a gen_tcp:recv(Socket,
+    %% 0), which may never return if the client doesn't send a FIN or
+    %% that gets swallowed by the network. Since there is no timeout
+    %% variant of ssl:close, we construct our own: close in a helper
+    %% process and kill it on timeout. Using 'after' (not send_after)
+    %% leaves no stray timeout message behind when the close is quick.
+    {Pid, MRef} = spawn_monitor(fun () -> ssl:close(Sock#ssl_socket.ssl) end),
+    receive
+        {'DOWN', MRef, process, Pid, _Reason} -> ok
+    after ?SSL_CLOSE_TIMEOUT ->
+            erlang:demonitor(MRef, [flush]),
+            exit(Pid, kill)
+    end,
+    catch port_close(Sock#ssl_socket.tcp),
+    ok;
+fast_close(Sock) when is_port(Sock) ->
+    catch port_close(Sock), ok.
 
 sockname(Sock)   when ?IS_SSL(Sock) -> ssl:sockname(Sock#ssl_socket.ssl);
 sockname(Sock)   when is_port(Sock) -> inet:sockname(Sock).
@@ -160,25 +195,38 @@ peername(Sock)   when is_port(Sock) -> inet:peername(Sock).
 peercert(Sock)   when ?IS_SSL(Sock) -> ssl:peercert(Sock#ssl_socket.ssl);
 peercert(Sock)   when is_port(Sock) -> nossl.
 
-tune_buffer_size(Sock) ->
-    case getopts(Sock, [sndbuf, recbuf, buffer]) of
-        {ok, BufSizes} -> BufSz = lists:max([Sz || {_Opt, Sz} <- BufSizes]),
-                          setopts(Sock, [{buffer, BufSz}]);
-        Err            -> Err
+connection_string(Sock, Direction) ->
+    case socket_ends(Sock, Direction) of
+        {ok, {FromAddress, FromPort, ToAddress, ToPort}} ->
+            {ok, rabbit_misc:format(
+                   "~s:~p -> ~s:~p",
+                   [maybe_ntoab(FromAddress), FromPort,
+                    maybe_ntoab(ToAddress),   ToPort])};
+        Error ->
+            Error
     end.
 
-connection_string(Sock, Direction) ->
-    {From, To} = case Direction of
-                     inbound  -> {fun peername/1, fun sockname/1};
-                     outbound -> {fun sockname/1, fun peername/1}
-                 end,
+socket_ends(Sock, Direction) ->
+    {From, To} = sock_funs(Direction),
     case {From(Sock), To(Sock)} of
         {{ok, {FromAddress, FromPort}}, {ok, {ToAddress, ToPort}}} ->
-            {ok, rabbit_misc:format("~s:~p -> ~s:~p",
-                                    [rabbit_misc:ntoab(FromAddress), FromPort,
-                                     rabbit_misc:ntoab(ToAddress),   ToPort])};
+            {ok, {rdns(FromAddress), FromPort,
+                  rdns(ToAddress),   ToPort}};
         {{error, _Reason} = Error, _} ->
             Error;
         {_, {error, _Reason} = Error} ->
             Error
     end.
+
+maybe_ntoab(Addr) when is_tuple(Addr) -> rabbit_misc:ntoab(Addr);
+maybe_ntoab(Host)                     -> Host.
+
+rdns(Addr) ->
+    {ok, Lookup} = application:get_env(rabbit, reverse_dns_lookups),
+    case Lookup of
+        true -> list_to_binary(rabbit_networking:tcp_host(Addr));
+        _    -> Addr
+    end.
+
+sock_funs(inbound)  -> {fun peername/1, fun sockname/1};
+sock_funs(outbound) -> {fun sockname/1, fun peername/1}.
diff --git a/src/rabbit_networking.erl b/src/rabbit_networking.erl
index 78deea97..46cfabe3 100644
--- a/src/rabbit_networking.erl
+++ b/src/rabbit_networking.erl
@@ -10,18 +10,19 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_networking).
 
 -export([boot/0, start/0, start_tcp_listener/1, start_ssl_listener/2,
          stop_tcp_listener/1, on_node_down/1, active_listeners/0,
-         node_listeners/1, connections/0, connection_info_keys/0,
+         node_listeners/1, register_connection/1, unregister_connection/1,
+         connections/0, connection_info_keys/0,
          connection_info/1, connection_info/2,
          connection_info_all/0, connection_info_all/1,
-         close_connection/2, force_connection_event_refresh/0]).
+         close_connection/2, force_connection_event_refresh/0, tcp_host/1]).
 
 %%used by TCP-based transports, e.g. STOMP adapter
 -export([tcp_listener_addresses/1, tcp_listener_spec/6,
@@ -65,6 +66,8 @@
 -spec(stop_tcp_listener/1 :: (listener_config()) -> 'ok').
 -spec(active_listeners/0 :: () -> [rabbit_types:listener()]).
 -spec(node_listeners/1 :: (node()) -> [rabbit_types:listener()]).
+-spec(register_connection/1 :: (pid()) -> ok).
+-spec(unregister_connection/1 :: (pid()) -> ok).
 -spec(connections/0 :: () -> [rabbit_types:connection()]).
 -spec(connections_local/0 :: () -> [rabbit_types:connection()]).
 -spec(connection_info_keys/0 :: () -> rabbit_types:info_keys()).
@@ -142,7 +145,8 @@ start() -> rabbit_sup:start_supervisor_child(
               {rabbit_connection_sup,start_link,[]}]).
 
 ensure_ssl() ->
-    ok = rabbit_misc:start_applications([crypto, public_key, ssl]),
+    {ok, SslAppsConfig} = application:get_env(rabbit, ssl_apps),
+    ok = app_utils:start_applications(SslAppsConfig),
     {ok, SslOptsConfig} = application:get_env(rabbit, ssl_options),
 
     % unknown_ca errors are silently ignored prior to R14B unless we
@@ -160,7 +164,19 @@ ssl_transform_fun(SslOpts) ->
             case catch ssl:ssl_accept(Sock, SslOpts, ?SSL_TIMEOUT * 1000) of
                 {ok, SslSock} ->
                     {ok, #ssl_socket{tcp = Sock, ssl = SslSock}};
+                {error, timeout} ->
+                    {error, {ssl_upgrade_error, timeout}};
                 {error, Reason} ->
+                    %% We have no idea what state the ssl_connection
+                    %% process is in - it could still be happily
+                    %% going, it might be stuck, or it could be just
+                    %% about to fail. There is little that our caller
+                    %% can do but close the TCP socket, but this could
+                    %% cause ssl alerts to get dropped (which is bad
+                    %% form, according to the TLS spec). So we give
+                    %% the ssl_connection a little bit of time to send
+                    %% such alerts.
+                    timer:sleep(?SSL_TIMEOUT * 1000),
                     {error, {ssl_upgrade_error, Reason}};
                 {'EXIT', Reason} ->
                     {error, {ssl_upgrade_failure, Reason}}
@@ -282,20 +298,15 @@ start_client(Sock) ->
 start_ssl_client(SslOpts, Sock) ->
     start_client(Sock, ssl_transform_fun(SslOpts)).
 
+register_connection(Pid) -> pg_local:join(rabbit_connections, Pid).
+
+unregister_connection(Pid) -> pg_local:leave(rabbit_connections, Pid).
+
 connections() ->
-    rabbit_misc:append_rpc_all_nodes(rabbit_mnesia:running_clustered_nodes(),
+    rabbit_misc:append_rpc_all_nodes(rabbit_mnesia:cluster_nodes(running),
                                      rabbit_networking, connections_local, []).
 
-connections_local() ->
-    [Reader ||
-        {_, ConnSup, supervisor, _}
-            <- supervisor:which_children(rabbit_tcp_client_sup),
-        Reader <- [try
-                       rabbit_connection_sup:reader(ConnSup)
-                   catch exit:{noproc, _} ->
-                           noproc
-                   end],
-        Reader =/= noproc].
+connections_local() -> pg_local:get_members(rabbit_connections).
 
 connection_info_keys() -> rabbit_reader:info_keys().
 
diff --git a/src/rabbit_node_monitor.erl b/src/rabbit_node_monitor.erl
index 323cf0ce..57dce7cd 100644
--- a/src/rabbit_node_monitor.erl
+++ b/src/rabbit_node_monitor.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_node_monitor).
@@ -19,78 +19,321 @@
 -behaviour(gen_server).
 
 -export([start_link/0]).
+-export([running_nodes_filename/0,
+         cluster_status_filename/0, prepare_cluster_status_files/0,
+         write_cluster_status/1, read_cluster_status/0,
+         update_cluster_status/0, reset_cluster_status/0]).
+-export([notify_node_up/0, notify_joined_cluster/0, notify_left_cluster/1]).
+-export([partitions/0, partitions/1, subscribe/1]).
 
--export([init/1, handle_call/3, handle_cast/2, handle_info/2,
-         terminate/2, code_change/3]).
--export([notify_cluster/0, rabbit_running_on/1]).
+%% gen_server callbacks
+-export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2,
+         code_change/3]).
+
+%% Utils
+-export([all_rabbit_nodes_up/0, run_outside_applications/1]).
 
 -define(SERVER, ?MODULE).
 -define(RABBIT_UP_RPC_TIMEOUT, 2000).
+-define(RABBIT_DOWN_PING_INTERVAL, 1000).
+
+-record(state, {monitors, partitions, subscribers, down_ping_timer, autoheal}).
 
 %%----------------------------------------------------------------------------
 
 -ifdef(use_specs).
 
 -spec(start_link/0 :: () -> rabbit_types:ok_pid_or_error()).
--spec(rabbit_running_on/1 :: (node()) -> 'ok').
--spec(notify_cluster/0 :: () -> 'ok').
+
+-spec(running_nodes_filename/0 :: () -> string()).
+-spec(cluster_status_filename/0 :: () -> string()).
+-spec(prepare_cluster_status_files/0 :: () -> 'ok').
+-spec(write_cluster_status/1 :: (rabbit_mnesia:cluster_status()) -> 'ok').
+-spec(read_cluster_status/0 :: () -> rabbit_mnesia:cluster_status()).
+-spec(update_cluster_status/0 :: () -> 'ok').
+-spec(reset_cluster_status/0 :: () -> 'ok').
+
+-spec(notify_node_up/0 :: () -> 'ok').
+-spec(notify_joined_cluster/0 :: () -> 'ok').
+-spec(notify_left_cluster/1 :: (node()) -> 'ok').
+
+-spec(partitions/0 :: () -> [node()]).
+-spec(partitions/1 :: ([node()]) -> [{node(), [node()]}]).
+-spec(subscribe/1 :: (pid()) -> 'ok').
+
+-spec(all_rabbit_nodes_up/0 :: () -> boolean()).
+-spec(run_outside_applications/1 :: (fun (() -> any())) -> pid()).
 
 -endif.
 
-%%--------------------------------------------------------------------
+%%----------------------------------------------------------------------------
+%% Start
+%%----------------------------------------------------------------------------
+
+start_link() -> gen_server:start_link({local, ?SERVER}, ?MODULE, [], []).
+
+%%----------------------------------------------------------------------------
+%% Cluster file operations
+%%----------------------------------------------------------------------------
+
+%% The cluster file information is kept in two files.  The "cluster
+%% status file" contains all the clustered nodes and the disc nodes.
+%% The "running nodes file" contains the currently running nodes or
+%% the running nodes at shutdown when the node is down.
+%%
+%% We strive to keep the files up to date and we rely on this
+%% assumption in various situations. Obviously when mnesia is offline
+%% the information we have will be outdated, but it cannot be
+%% otherwise.
+
+running_nodes_filename() ->
+    filename:join(rabbit_mnesia:dir(), "nodes_running_at_shutdown").
+
+cluster_status_filename() ->
+    filename:join(rabbit_mnesia:dir(), "cluster_nodes.config").
+
+prepare_cluster_status_files() ->
+    rabbit_mnesia:ensure_mnesia_dir(),
+    Corrupt = fun(F) -> throw({error, corrupt_cluster_status_files, F}) end,
+    RunningNodes1 = case try_read_file(running_nodes_filename()) of
+                        {ok, [Nodes]} when is_list(Nodes) -> Nodes;
+                        {ok, Other}                       -> Corrupt(Other);
+                        {error, enoent}                   -> []
+                    end,
+    ThisNode = [node()],
+    %% The running nodes file might contain a set or a list, in case
+    %% of the legacy file
+    RunningNodes2 = lists:usort(ThisNode ++ RunningNodes1),
+    {AllNodes1, WantDiscNode} =
+        case try_read_file(cluster_status_filename()) of
+            {ok, [{AllNodes, DiscNodes0}]} ->
+                {AllNodes, lists:member(node(), DiscNodes0)};
+            {ok, [AllNodes0]} when is_list(AllNodes0) ->
+                {legacy_cluster_nodes(AllNodes0),
+                 legacy_should_be_disc_node(AllNodes0)};
+            {ok, Files} ->
+                Corrupt(Files);
+            {error, enoent} ->
+                {legacy_cluster_nodes([]), true}
+        end,
+    AllNodes2 = lists:usort(AllNodes1 ++ RunningNodes2),
+    DiscNodes = case WantDiscNode of
+                    true  -> ThisNode;
+                    false -> []
+                end,
+    ok = write_cluster_status({AllNodes2, DiscNodes, RunningNodes2}).
+
+write_cluster_status({All, Disc, Running}) ->
+    ClusterStatusFN = cluster_status_filename(),
+    Res = case rabbit_file:write_term_file(ClusterStatusFN, [{All, Disc}]) of
+              ok ->
+                  RunningNodesFN = running_nodes_filename(),
+                  {RunningNodesFN,
+                   rabbit_file:write_term_file(RunningNodesFN, [Running])};
+              E1 = {error, _} ->
+                  {ClusterStatusFN, E1}
+          end,
+    case Res of
+        {_, ok}           -> ok;
+        {FN, {error, E2}} -> throw({error, {could_not_write_file, FN, E2}})
+    end.
 
-start_link() ->
-    gen_server:start_link({local, ?SERVER}, ?MODULE, [], []).
-
-rabbit_running_on(Node) ->
-    gen_server:cast(rabbit_node_monitor, {rabbit_running_on, Node}).
-
-notify_cluster() ->
-    Node = node(),
-    Nodes = rabbit_mnesia:running_clustered_nodes() -- [Node],
-    %% notify other rabbits of this rabbit
-    case rpc:multicall(Nodes, rabbit_node_monitor, rabbit_running_on,
-                       [Node], ?RABBIT_UP_RPC_TIMEOUT) of
-        {_, [] } -> ok;
-        {_, Bad} -> rabbit_log:info("failed to contact nodes ~p~n", [Bad])
-    end,
+read_cluster_status() ->
+    case {try_read_file(cluster_status_filename()),
+          try_read_file(running_nodes_filename())} of
+        {{ok, [{All, Disc}]}, {ok, [Running]}} when is_list(Running) ->
+            {All, Disc, Running};
+        {Stat, Run} ->
+            throw({error, {corrupt_or_missing_cluster_files, Stat, Run}})
+    end.
+
+update_cluster_status() ->
+    {ok, Status} = rabbit_mnesia:cluster_status_from_mnesia(),
+    write_cluster_status(Status).
+
+reset_cluster_status() ->
+    write_cluster_status({[node()], [node()], [node()]}).
+
+%%----------------------------------------------------------------------------
+%% Cluster notifications
+%%----------------------------------------------------------------------------
+
+notify_node_up() ->
+    Nodes = rabbit_mnesia:cluster_nodes(running) -- [node()],
+    gen_server:abcast(Nodes, ?SERVER,
+                      {node_up, node(), rabbit_mnesia:node_type()}),
     %% register other active rabbits with this rabbit
-    [ rabbit_running_on(N) || N <- Nodes ],
+    DiskNodes = rabbit_mnesia:cluster_nodes(disc),
+    [gen_server:cast(?SERVER, {node_up, N, case lists:member(N, DiskNodes) of
+                                               true  -> disc;
+                                               false -> ram
+                                           end}) || N <- Nodes],
     ok.
 
-%%--------------------------------------------------------------------
+notify_joined_cluster() ->
+    Nodes = rabbit_mnesia:cluster_nodes(running) -- [node()],
+    gen_server:abcast(Nodes, ?SERVER,
+                      {joined_cluster, node(), rabbit_mnesia:node_type()}),
+    ok.
+
+notify_left_cluster(Node) ->
+    Nodes = rabbit_mnesia:cluster_nodes(running),
+    gen_server:abcast(Nodes, ?SERVER, {left_cluster, Node}),
+    ok.
+
+%%----------------------------------------------------------------------------
+%% Server calls
+%%----------------------------------------------------------------------------
+
+partitions() ->
+    gen_server:call(?SERVER, partitions, infinity).
+
+partitions(Nodes) ->
+    {Replies, _} = gen_server:multi_call(Nodes, ?SERVER, partitions, infinity),
+    Replies.
+
+subscribe(Pid) ->
+    gen_server:cast(?SERVER, {subscribe, Pid}).
+
+%%----------------------------------------------------------------------------
+%% gen_server callbacks
+%%----------------------------------------------------------------------------
 
 init([]) ->
-    {ok, ordsets:new()}.
+    %% We trap exits so that the supervisor will not just kill us. We
+    %% want to be sure that we are not going to be killed while
+    %% writing out the cluster status files - bad things can then
+    %% happen.
+    process_flag(trap_exit, true),
+    net_kernel:monitor_nodes(true),
+    {ok, _} = mnesia:subscribe(system),
+    {ok, #state{monitors    = pmon:new(),
+                subscribers = pmon:new(),
+                partitions  = [],
+                autoheal    = rabbit_autoheal:init()}}.
+
+handle_call(partitions, _From, State = #state{partitions = Partitions}) ->
+    {reply, Partitions, State};
 
 handle_call(_Request, _From, State) ->
     {noreply, State}.
 
-handle_cast({rabbit_running_on, Node}, Nodes) ->
-    case ordsets:is_element(Node, Nodes) of
-        true  -> {noreply, Nodes};
+%% Note: when updating the status file, we can't simply write the
+%% mnesia information since the message can (and will) overtake the
+%% mnesia propagation.
+handle_cast({node_up, Node, NodeType},
+            State = #state{monitors = Monitors}) ->
+    case pmon:is_monitored({rabbit, Node}, Monitors) of
+        true  -> {noreply, State};
         false -> rabbit_log:info("rabbit on node ~p up~n", [Node]),
-                 erlang:monitor(process, {rabbit, Node}),
+                 {AllNodes, DiscNodes, RunningNodes} = read_cluster_status(),
+                 write_cluster_status({add_node(Node, AllNodes),
+                                       case NodeType of
+                                           disc -> add_node(Node, DiscNodes);
+                                           ram  -> DiscNodes
+                                       end,
+                                       add_node(Node, RunningNodes)}),
                  ok = handle_live_rabbit(Node),
-                 {noreply, ordsets:add_element(Node, Nodes)}
+                 {noreply, State#state{
+                             monitors = pmon:monitor({rabbit, Node}, Monitors)}}
     end;
+handle_cast({joined_cluster, Node, NodeType}, State) ->
+    {AllNodes, DiscNodes, RunningNodes} = read_cluster_status(),
+    write_cluster_status({add_node(Node, AllNodes),
+                          case NodeType of
+                              disc -> add_node(Node, DiscNodes);
+                              ram  -> DiscNodes
+                          end,
+                          RunningNodes}),
+    {noreply, State};
+handle_cast({left_cluster, Node}, State) ->
+    {AllNodes, DiscNodes, RunningNodes} = read_cluster_status(),
+    write_cluster_status({del_node(Node, AllNodes), del_node(Node, DiscNodes),
+                          del_node(Node, RunningNodes)}),
+    {noreply, State};
+handle_cast({subscribe, Pid}, State = #state{subscribers = Subscribers}) ->
+    {noreply, State#state{subscribers = pmon:monitor(Pid, Subscribers)}};
 handle_cast(_Msg, State) ->
     {noreply, State}.
 
-handle_info({'DOWN', _MRef, process, {rabbit, Node}, _Reason}, Nodes) ->
+handle_info({'DOWN', _MRef, process, {rabbit, Node}, _Reason},
+            State = #state{monitors = Monitors, subscribers = Subscribers}) ->
     rabbit_log:info("rabbit on node ~p down~n", [Node]),
+    {AllNodes, DiscNodes, RunningNodes} = read_cluster_status(),
+    write_cluster_status({AllNodes, DiscNodes, del_node(Node, RunningNodes)}),
     ok = handle_dead_rabbit(Node),
-    {noreply, ordsets:del_element(Node, Nodes)};
+    [P ! {node_down, Node} || P <- pmon:monitored(Subscribers)],
+    {noreply, handle_dead_rabbit_state(
+                Node,
+                State#state{monitors = pmon:erase({rabbit, Node}, Monitors)})};
+
+handle_info({'DOWN', _MRef, process, Pid, _Reason},
+            State = #state{subscribers = Subscribers}) ->
+    {noreply, State#state{subscribers = pmon:erase(Pid, Subscribers)}};
+
+handle_info({nodedown, Node}, State) ->
+    ok = handle_dead_node(Node),
+    {noreply, State};
+
+handle_info({mnesia_system_event,
+             {inconsistent_database, running_partitioned_network, Node}},
+            State = #state{partitions = Partitions,
+                           monitors   = Monitors,
+                           autoheal   = AState}) ->
+    %% We will not get a node_up from this node - yet we should treat it as
+    %% up (mostly).
+    State1 = case pmon:is_monitored({rabbit, Node}, Monitors) of
+                 true  -> State;
+                 false -> State#state{
+                            monitors = pmon:monitor({rabbit, Node}, Monitors)}
+             end,
+    ok = handle_live_rabbit(Node),
+    Partitions1 = ordsets:to_list(
+                    ordsets:add_element(Node, ordsets:from_list(Partitions))),
+    {noreply, State1#state{partitions = Partitions1,
+                           autoheal   = rabbit_autoheal:maybe_start(AState)}};
+
+handle_info({autoheal_msg, Msg}, State = #state{autoheal   = AState,
+                                                partitions = Partitions}) ->
+    AState1 = rabbit_autoheal:handle_msg(Msg, AState, Partitions),
+    {noreply, State#state{autoheal = AState1}};
+
+handle_info(ping_nodes, State) ->
+    %% We ping nodes when some are down to ensure that we find out
+    %% about healed partitions quickly. We ping all nodes rather than
+    %% just the ones we know are down for simplicity; it's not expensive
+    %% to ping the nodes that are up, after all.
+    State1 = State#state{down_ping_timer = undefined},
+    Self = self(),
+    %% all_nodes_up() pings every node and tells us whether to ping again.
+    %%
+    %% We ping in a separate process since in a partition it might
+    %% take some noticeable length of time and we don't want to block
+    %% the node monitor for that long.
+    spawn_link(fun () ->
+                       case all_nodes_up() of
+                           true  -> ok;
+                           false -> Self ! ping_again
+                       end
+               end),
+    {noreply, State1};
+
+handle_info(ping_again, State) ->
+    {noreply, ensure_ping_timer(State)};
+
 handle_info(_Info, State) ->
     {noreply, State}.
 
-terminate(_Reason, _State) ->
+terminate(_Reason, State) ->
+    rabbit_misc:stop_timer(State, #state.down_ping_timer),
     ok.
 
 code_change(_OldVsn, State, _Extra) ->
     {ok, State}.
 
-%%--------------------------------------------------------------------
+%%----------------------------------------------------------------------------
+%% Functions that call the module specific hooks when nodes go up/down
+%%----------------------------------------------------------------------------
 
 %% TODO: This may turn out to be a performance hog when there are lots
 %% of nodes.  We really only need to execute some of these statements
@@ -99,8 +342,135 @@ handle_dead_rabbit(Node) ->
     ok = rabbit_networking:on_node_down(Node),
     ok = rabbit_amqqueue:on_node_down(Node),
     ok = rabbit_alarm:on_node_down(Node),
-    ok = rabbit_mnesia:on_node_down(Node).
+    ok = rabbit_mnesia:on_node_down(Node),
+    ok.
+
+handle_dead_node(_Node) ->
+    %% In general in rabbit_node_monitor we care about whether the
+    %% rabbit application is up rather than the node; we do this so
+    %% that we can respond in the same way to "rabbitmqctl stop_app"
+    %% and "rabbitmqctl stop" as much as possible.
+    %%
+    %% However, for pause_minority mode we can't do this, since we
+    %% depend on looking at whether other nodes are up to decide
+    %% whether to come back up ourselves - if we decide that based on
+    %% the rabbit application we would go down and never come back.
+    case application:get_env(rabbit, cluster_partition_handling) of
+        {ok, pause_minority} ->
+            case majority() of
+                true  -> ok;
+                false -> await_cluster_recovery()
+            end;
+        {ok, ignore} ->
+            ok;
+        {ok, autoheal} ->
+            ok;
+        {ok, Term} ->
+            rabbit_log:warning("cluster_partition_handling ~p unrecognised, "
+                               "assuming 'ignore'~n", [Term]),
+            ok
+    end.
+
+await_cluster_recovery() ->
+    rabbit_log:warning("Cluster minority status detected - awaiting recovery~n",
+                       []),
+    Nodes = rabbit_mnesia:cluster_nodes(all),
+    run_outside_applications(fun () ->
+                                     rabbit:stop(),
+                                     wait_for_cluster_recovery(Nodes)
+                             end),
+    ok.
+
+run_outside_applications(Fun) ->
+    spawn(fun () ->
+                  %% If our group leader is inside an application we are about
+                  %% to stop, application:stop/1 does not return.
+                  group_leader(whereis(init), self()),
+                  %% Ensure only one such process runs at a time: register/2
+                  %% raises badarg if one already exists, which we ignore.
+                  try register(rabbit_outside_app_process, self()) of
+                      true           -> Fun()
+                  catch error:badarg -> ok
+                  end
+          end).
+
+wait_for_cluster_recovery(Nodes) ->
+    case majority() of
+        true  -> rabbit:start();
+        false -> timer:sleep(?RABBIT_DOWN_PING_INTERVAL),
+                 wait_for_cluster_recovery(Nodes)
+    end.
+
+handle_dead_rabbit_state(Node, State = #state{partitions = Partitions,
+                                              autoheal   = Autoheal}) ->
+    %% If we have been partitioned, and we are now in the only remaining
+    %% partition, we no longer care about partitions - forget them. Note
+    %% that we do not attempt to deal with individual (other) partitions
+    %% going away. It's only safe to forget anything about partitions when
+    %% there are no partitions.
+    Partitions1 = case Partitions -- (Partitions -- alive_rabbit_nodes()) of
+                      [] -> [];
+                      _  -> Partitions
+                  end,
+    ensure_ping_timer(
+      State#state{partitions = Partitions1,
+                  autoheal   = rabbit_autoheal:node_down(Node, Autoheal)}).
+
+ensure_ping_timer(State) ->
+    rabbit_misc:ensure_timer(
+      State, #state.down_ping_timer, ?RABBIT_DOWN_PING_INTERVAL, ping_nodes).
 
 handle_live_rabbit(Node) ->
     ok = rabbit_alarm:on_node_up(Node),
     ok = rabbit_mnesia:on_node_up(Node).
+
+%%--------------------------------------------------------------------
+%% Internal utils
+%%--------------------------------------------------------------------
+
+try_read_file(FileName) ->
+    case rabbit_file:read_term_file(FileName) of
+        {ok, _} = Read  -> Read;
+        {error, enoent} -> {error, enoent};
+        {error, Why}    -> throw({error, {cannot_read_file, FileName, Why}})
+    end.
+
+legacy_cluster_nodes(Nodes) ->
+    %% We get all the info that we can, including the nodes from
+    %% mnesia, which will be there if the node is a disc node (empty
+    %% list otherwise)
+    lists:usort(Nodes ++ mnesia:system_info(db_nodes)).
+
+legacy_should_be_disc_node([])        -> true;
+legacy_should_be_disc_node(DiscNodes) -> lists:member(node(), DiscNodes).
+
+add_node(Node, Nodes) -> lists:usort([Node | Nodes]).
+
+del_node(Node, Nodes) -> Nodes -- [Node].
+
+%%--------------------------------------------------------------------
+
+%% mnesia:system_info(db_nodes) (and hence
+%% rabbit_mnesia:cluster_nodes(running)) does not give reliable
+%% results when partitioned. So we have a small set of replacement
+%% functions here. "rabbit" in a function's name implies we test if
+%% the rabbit application is up, not just the node.
+
+%% True iff strictly more than half of all cluster nodes respond to a ping.
+majority() -> Nodes = rabbit_mnesia:cluster_nodes(all),
+              2 * length(alive_nodes(Nodes)) > length(Nodes).
+
+all_nodes_up() ->
+    Nodes = rabbit_mnesia:cluster_nodes(all),
+    length(alive_nodes(Nodes)) =:= length(Nodes).
+
+all_rabbit_nodes_up() ->
+    Nodes = rabbit_mnesia:cluster_nodes(all),
+    length(alive_rabbit_nodes(Nodes)) =:= length(Nodes).
+
+alive_nodes(Nodes) -> [N || N <- Nodes, pong =:= net_adm:ping(N)].
+
+alive_rabbit_nodes() -> alive_rabbit_nodes(rabbit_mnesia:cluster_nodes(all)).
+
+alive_rabbit_nodes(Nodes) ->
+    [N || N <- alive_nodes(Nodes), rabbit:is_running(N)].
diff --git a/src/rabbit_nodes.erl b/src/rabbit_nodes.erl
index 1c23632d..b54fdd2e 100644
--- a/src/rabbit_nodes.erl
+++ b/src/rabbit_nodes.erl
@@ -10,13 +10,14 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_nodes).
 
--export([names/1, diagnostics/1, make/1, parts/1, cookie_hash/0, is_running/2]).
+-export([names/1, diagnostics/1, make/1, parts/1, cookie_hash/0,
+         is_running/2, is_process_running/2]).
 
 -define(EPMD_TIMEOUT, 30000).
 
@@ -33,6 +34,7 @@
 -spec(parts/1 :: (node() | string()) -> {string(), string()}).
 -spec(cookie_hash/0 :: () -> string()).
 -spec(is_running/2 :: (node(), atom()) -> boolean()).
+-spec(is_process_running/2 :: (node(), atom()) -> boolean()).
 
 -endif.
 
@@ -70,8 +72,8 @@ diagnostics0() ->
 diagnostics_host(Host) ->
     case names(Host) of
         {error, EpmdReason} ->
-            {"- unable to connect to epmd on ~s: ~w",
-             [Host, EpmdReason]};
+            {"- unable to connect to epmd on ~s: ~w (~s)",
+             [Host, EpmdReason, rabbit_misc:format_inet_error(EpmdReason)]};
         {ok, NamePorts} ->
             {"- ~s: ~p",
              [Host, [{list_to_atom(Name), Port} ||
@@ -94,7 +96,14 @@ cookie_hash() ->
     base64:encode_to_string(erlang:md5(atom_to_list(erlang:get_cookie()))).
 
 is_running(Node, Application) ->
-    case rpc:call(Node, application, which_applications, [infinity]) of
+    case rpc:call(Node, rabbit_misc, which_applications, []) of
         {badrpc, _} -> false;
         Apps        -> proplists:is_defined(Application, Apps)
     end.
+
+is_process_running(Node, Process) ->
+    case rpc:call(Node, erlang, whereis, [Process]) of
+        Pid when is_pid(Pid) -> true;
+        undefined            -> false;
+        {badrpc, _}          -> false
+    end.
diff --git a/src/rabbit_parameter_validation.erl b/src/rabbit_parameter_validation.erl
new file mode 100644
index 00000000..0a878432
--- /dev/null
+++ b/src/rabbit_parameter_validation.erl
@@ -0,0 +1,87 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License
+%% at http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and
+%% limitations under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
+%%
+
+-module(rabbit_parameter_validation).
+
+-export([number/2, binary/2, boolean/2, list/2, regex/2, proplist/3, enum/1]).
+
+number(_Name, Term) when is_number(Term) ->
+    ok;
+
+number(Name, Term) ->
+    {error, "~s should be number, actually was ~p", [Name, Term]}.
+
+binary(_Name, Term) when is_binary(Term) ->
+    ok;
+
+binary(Name, Term) ->
+    {error, "~s should be binary, actually was ~p", [Name, Term]}.
+
+boolean(_Name, Term) when is_boolean(Term) ->
+    ok;
+boolean(Name, Term) ->
+    {error, "~s should be boolean, actually was ~p", [Name, Term]}.
+
+list(_Name, Term) when is_list(Term) ->
+    ok;
+
+list(Name, Term) ->
+    {error, "~s should be list, actually was ~p", [Name, Term]}.
+
+regex(Name, Term) when not is_binary(Term) ->
+    {error, "~s should be a binary but was ~p", [Name, Term]};
+regex(Name, Term) ->
+    case re:compile(Term) of
+        {error, Reason} -> {error, "~s should be regular expression "
+                                   "but is invalid: ~p", [Name, Reason]};
+        {ok, _}         -> ok
+    end.
+
+proplist(Name, Constraints, Term) when is_list(Term) ->
+    {Results, Remainder}
+        = lists:foldl(
+            fun ({Key, Fun, Needed}, {Results0, Term0}) ->
+                    case {lists:keytake(Key, 1, Term0), Needed} of
+                        {{value, {Key, Value}, Term1}, _} ->
+                            {[Fun(Key, Value) | Results0],
+                             Term1};
+                        {false, mandatory} ->
+                            {[{error, "Key \"~s\" not found in ~s",
+                               [Key, Name]} | Results0], Term0};
+                        {false, optional} ->
+                            {Results0, Term0}
+                    end
+            end, {[], Term}, Constraints),
+    case Remainder of
+        [] -> Results;
+        _  -> [{error, "Unrecognised terms ~p in ~s", [Remainder, Name]}
+               | Results]
+    end;
+
+proplist(Name, _Constraints, Term) ->
+    {error, "~s not a list ~p", [Name, Term]}.
+
+enum(OptionsA) ->
+    Allowed = [atom_to_binary(Opt, latin1) || Opt <- OptionsA],
+    fun (Name, Value) when not is_binary(Value) ->
+            {error, "~s should be binary, actually was ~p", [Name, Value]};
+        (Name, Value) ->
+            case lists:member(Value, Allowed) of
+                false -> {error, "~s should be one of ~p, actually was ~p",
+                          [Name, Allowed, Value]};
+                true  -> ok
+            end
+    end.
diff --git a/src/rabbit_plugins.erl b/src/rabbit_plugins.erl
index 30c7bb37..168ced3c 100644
--- a/src/rabbit_plugins.erl
+++ b/src/rabbit_plugins.erl
@@ -10,164 +10,48 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2011-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2011-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_plugins).
 -include("rabbit.hrl").
 
--export([start/0, stop/0, find_plugins/1, read_enabled_plugins/1,
-         lookup_plugins/2, calculate_required_plugins/2, plugin_names/1]).
-
--define(VERBOSE_OPT, "-v").
--define(MINIMAL_OPT, "-m").
--define(ENABLED_OPT, "-E").
--define(ENABLED_ALL_OPT, "-e").
-
--define(VERBOSE_DEF, {?VERBOSE_OPT, flag}).
--define(MINIMAL_DEF, {?MINIMAL_OPT, flag}).
--define(ENABLED_DEF, {?ENABLED_OPT, flag}).
--define(ENABLED_ALL_DEF, {?ENABLED_ALL_OPT, flag}).
-
--define(GLOBAL_DEFS, []).
-
--define(COMMANDS,
-        [{list, [?VERBOSE_DEF, ?MINIMAL_DEF, ?ENABLED_DEF, ?ENABLED_ALL_DEF]},
-         enable,
-         disable]).
+-export([setup/0, active/0, read_enabled/1, list/1, dependencies/3]).
 
 %%----------------------------------------------------------------------------
 
 -ifdef(use_specs).
 
--spec(start/0 :: () -> no_return()).
--spec(stop/0 :: () -> 'ok').
--spec(find_plugins/1 :: (file:filename()) -> [#plugin{}]).
--spec(read_enabled_plugins/1 :: (file:filename()) -> [atom()]).
--spec(lookup_plugins/2 :: ([atom()], [#plugin{}]) -> [#plugin{}]).
--spec(calculate_required_plugins/2 :: ([atom()], [#plugin{}]) -> [atom()]).
--spec(plugin_names/1 :: ([#plugin{}]) -> [atom()]).
-
--endif.
-
-%%----------------------------------------------------------------------------
-
-start() ->
-    {ok, [[PluginsFile|_]|_]} =
-        init:get_argument(enabled_plugins_file),
-    {ok, [[PluginsDir|_]|_]} = init:get_argument(plugins_dist_dir),
-    {Command, Opts, Args} =
-        case rabbit_misc:parse_arguments(?COMMANDS, ?GLOBAL_DEFS,
-                                         init:get_plain_arguments())
-        of
-            {ok, Res}  -> Res;
-            no_command -> print_error("could not recognise command", []),
-                          usage()
-        end,
-
-    PrintInvalidCommandError =
-        fun () ->
-                print_error("invalid command '~s'",
-                            [string:join([atom_to_list(Command) | Args], " ")])
-        end,
-
-    case catch action(Command, Args, Opts, PluginsFile, PluginsDir) of
-        ok ->
-            rabbit_misc:quit(0);
-        {'EXIT', {function_clause, [{?MODULE, action, _} | _]}} ->
-            PrintInvalidCommandError(),
-            usage();
-        {'EXIT', {function_clause, [{?MODULE, action, _, _} | _]}} ->
-            PrintInvalidCommandError(),
-            usage();
-        {error, Reason} ->
-            print_error("~p", [Reason]),
-            rabbit_misc:quit(2);
-        {error_string, Reason} ->
-            print_error("~s", [Reason]),
-            rabbit_misc:quit(2);
-        Other ->
-            print_error("~p", [Other]),
-            rabbit_misc:quit(2)
-    end.
-
-stop() ->
-    ok.
+-type(plugin_name() :: atom()).
 
-print_error(Format, Args) ->
-    rabbit_misc:format_stderr("Error: " ++ Format ++ "~n", Args).
+-spec(setup/0 :: () -> [plugin_name()]).
+-spec(active/0 :: () -> [plugin_name()]).
+-spec(list/1 :: (string()) -> [#plugin{}]).
+-spec(read_enabled/1 :: (file:filename()) -> [plugin_name()]).
+-spec(dependencies/3 :: (boolean(), [plugin_name()], [#plugin{}]) ->
+                             [plugin_name()]).
 
-usage() ->
-    io:format("~s", [rabbit_plugins_usage:usage()]),
-    rabbit_misc:quit(1).
-
-%%----------------------------------------------------------------------------
-
-action(list, [], Opts, PluginsFile, PluginsDir) ->
-    action(list, [".*"], Opts, PluginsFile, PluginsDir);
-action(list, [Pat], Opts, PluginsFile, PluginsDir) ->
-    format_plugins(Pat, Opts, PluginsFile, PluginsDir);
-
-action(enable, ToEnable0, _Opts, PluginsFile, PluginsDir) ->
-    case ToEnable0 of
-        [] -> throw({error_string, "Not enough arguments for 'enable'"});
-        _  -> ok
-    end,
-    AllPlugins = find_plugins(PluginsDir),
-    Enabled = read_enabled_plugins(PluginsFile),
-    ImplicitlyEnabled = calculate_required_plugins(Enabled, AllPlugins),
-    ToEnable = [list_to_atom(Name) || Name <- ToEnable0],
-    Missing = ToEnable -- plugin_names(AllPlugins),
-    case Missing of
-        [] -> ok;
-        _  -> throw({error_string,
-                     fmt_list("The following plugins could not be found:",
-                              Missing)})
-    end,
-    NewEnabled = lists:usort(Enabled ++ ToEnable),
-    write_enabled_plugins(PluginsFile, NewEnabled),
-    NewImplicitlyEnabled = calculate_required_plugins(NewEnabled, AllPlugins),
-    maybe_warn_mochiweb(NewImplicitlyEnabled),
-    case NewEnabled -- ImplicitlyEnabled of
-        [] -> io:format("Plugin configuration unchanged.~n");
-        _  -> print_list("The following plugins have been enabled:",
-                         NewImplicitlyEnabled -- ImplicitlyEnabled),
-              report_change()
-    end;
-
-action(disable, ToDisable0, _Opts, PluginsFile, PluginsDir) ->
-    case ToDisable0 of
-        [] -> throw({error_string, "Not enough arguments for 'disable'"});
-        _  -> ok
-    end,
-    ToDisable = [list_to_atom(Name) || Name <- ToDisable0],
-    Enabled = read_enabled_plugins(PluginsFile),
-    AllPlugins = find_plugins(PluginsDir),
-    Missing = ToDisable -- plugin_names(AllPlugins),
-    case Missing of
-        [] -> ok;
-        _  -> print_list("Warning: the following plugins could not be found:",
-                         Missing)
-    end,
-    ToDisableDeps = calculate_dependencies(true, ToDisable, AllPlugins),
-    NewEnabled = Enabled -- ToDisableDeps,
-    case length(Enabled) =:= length(NewEnabled) of
-        true  -> io:format("Plugin configuration unchanged.~n");
-        false -> ImplicitlyEnabled =
-                     calculate_required_plugins(Enabled, AllPlugins),
-                 NewImplicitlyEnabled =
-                     calculate_required_plugins(NewEnabled, AllPlugins),
-                 print_list("The following plugins have been disabled:",
-                            ImplicitlyEnabled -- NewImplicitlyEnabled),
-                 write_enabled_plugins(PluginsFile, NewEnabled),
-                 report_change()
-    end.
+-endif.
 
 %%----------------------------------------------------------------------------
 
-%% Get the #plugin{}s ready to be enabled.
-find_plugins(PluginsDir) ->
+%% @doc Prepares the file system and installs all enabled plugins.
+setup() ->
+    {ok, PluginDir}   = application:get_env(rabbit, plugins_dir),
+    {ok, ExpandDir}   = application:get_env(rabbit, plugins_expand_dir),
+    {ok, EnabledFile} = application:get_env(rabbit, enabled_plugins_file),
+    prepare_plugins(EnabledFile, PluginDir, ExpandDir).
+
+%% @doc Lists the plugins which are currently running.
+active() ->
+    {ok, ExpandDir} = application:get_env(rabbit, plugins_expand_dir),
+    InstalledPlugins = [ P#plugin.name || P <- list(ExpandDir) ],
+    [App || {App, _, _} <- rabbit_misc:which_applications(),
+            lists:member(App, InstalledPlugins)].
+
+%% @doc Get the list of plugins which are ready to be enabled.
+list(PluginsDir) ->
     EZs = [{ez, EZ} || EZ <- filelib:wildcard("*.ez", PluginsDir)],
     FreeApps = [{app, App} ||
                    App <- filelib:wildcard("*/ebin/*.app", PluginsDir)],
@@ -177,24 +61,105 @@ find_plugins(PluginsDir) ->
                         (Plugin = #plugin{}, {Plugins1, Problems1}) ->
                             {[Plugin|Plugins1], Problems1}
                     end, {[], []},
-                    [get_plugin_info(PluginsDir, Plug) ||
-                        Plug <- EZs ++ FreeApps]),
+                    [plugin_info(PluginsDir, Plug) || Plug <- EZs ++ FreeApps]),
     case Problems of
         [] -> ok;
-        _  -> io:format("Warning: Problem reading some plugins: ~p~n",
-                        [Problems])
+        _  -> error_logger:warning_msg(
+                "Problem reading some plugins: ~p~n", [Problems])
     end,
     Plugins.
 
-%% Get the #plugin{} from an .ez.
-get_plugin_info(Base, {ez, EZ0}) ->
+%% @doc Read the list of enabled plugins from the supplied term file.
+read_enabled(PluginsFile) ->
+    case rabbit_file:read_term_file(PluginsFile) of
+        {ok, [Plugins]} -> Plugins;
+        {ok, []}        -> [];
+        {ok, [_|_]}     -> throw({error, {malformed_enabled_plugins_file,
+                                          PluginsFile}});
+        {error, enoent} -> [];
+        {error, Reason} -> throw({error, {cannot_read_enabled_plugins_file,
+                                          PluginsFile, Reason}})
+    end.
+
+%% @doc Calculate the dependency graph from <i>Sources</i>.
+%% When Reverse =:= false the result is Sources plus all they depend on;
+%% when Reverse =:= true it is Sources plus everything depending on them.
+dependencies(Reverse, Sources, AllPlugins) ->
+    {ok, G} = rabbit_misc:build_acyclic_graph(
+                fun (App, _Deps) -> [{App, App}] end,
+                fun (App,  Deps) -> [{App, Dep} || Dep <- Deps] end,
+                lists:ukeysort(
+                  1, [{Name, Deps} ||
+                         #plugin{name         = Name,
+                                 dependencies = Deps} <- AllPlugins] ++
+                      [{Dep,   []} ||
+                          #plugin{dependencies = Deps} <- AllPlugins,
+                          Dep                          <- Deps])),
+    Dests = case Reverse of
+                false -> digraph_utils:reachable(Sources, G);
+                true  -> digraph_utils:reaching(Sources, G)
+            end,
+    true = digraph:delete(G),
+    Dests.
+
+%%----------------------------------------------------------------------------
+
+prepare_plugins(EnabledFile, PluginsDistDir, ExpandDir) ->
+    AllPlugins = list(PluginsDistDir),
+    Enabled = read_enabled(EnabledFile),
+    ToUnpack = dependencies(false, Enabled, AllPlugins),
+    ToUnpackPlugins = lookup_plugins(ToUnpack, AllPlugins),
+
+    case Enabled -- plugin_names(ToUnpackPlugins) of
+        []      -> ok;
+        Missing -> error_logger:warning_msg(
+                     "The following enabled plugins were not found: ~p~n",
+                     [Missing])
+    end,
+
+    %% Eliminate the contents of the destination directory
+    case delete_recursively(ExpandDir) of
+        ok          -> ok;
+        {error, E1} -> throw({error, {cannot_delete_plugins_expand_dir,
+                                      [ExpandDir, E1]}})
+    end,
+    case filelib:ensure_dir(ExpandDir ++ "/") of
+        ok          -> ok;
+        {error, E2} -> throw({error, {cannot_create_plugins_expand_dir,
+                                      [ExpandDir, E2]}})
+    end,
+
+    [prepare_plugin(Plugin, ExpandDir) || Plugin <- ToUnpackPlugins],
+
+    [prepare_dir_plugin(PluginAppDescPath) ||
+        PluginAppDescPath <- filelib:wildcard(ExpandDir ++ "/*/ebin/*.app")].
+
+prepare_dir_plugin(PluginAppDescPath) ->
+    code:add_path(filename:dirname(PluginAppDescPath)),
+    list_to_atom(filename:basename(PluginAppDescPath, ".app")).
+
+%%----------------------------------------------------------------------------
+
+delete_recursively(Fn) ->
+    case rabbit_file:recursive_delete([Fn]) of
+        ok                 -> ok;
+        {error, {Path, E}} -> {error, {cannot_delete, Path, E}};
+        Error              -> Error
+    end.
+
+prepare_plugin(#plugin{type = ez, location = Location}, ExpandDir) ->
+    zip:unzip(Location, [{cwd, ExpandDir}]);
+prepare_plugin(#plugin{type = dir, name = Name, location = Location},
+               ExpandDir) ->
+    rabbit_file:recursive_copy(Location, filename:join([ExpandDir, Name])).
+
+plugin_info(Base, {ez, EZ0}) ->
     EZ = filename:join([Base, EZ0]),
     case read_app_file(EZ) of
         {application, Name, Props} -> mkplugin(Name, Props, ez, EZ);
         {error, Reason}            -> {error, EZ, Reason}
     end;
-%% Get the #plugin{} from an .app.
-get_plugin_info(Base, {app, App0}) ->
+plugin_info(Base, {app, App0}) ->
     App = filename:join([Base, App0]),
     case rabbit_file:read_term_file(App) of
         {ok, [{application, Name, Props}]} ->
@@ -213,7 +178,6 @@ mkplugin(Name, Props, Type, Location) ->
     #plugin{name = Name, version = Version, description = Description,
             dependencies = Dependencies, location = Location, type = Type}.
 
-%% Read the .app file from an ez.
 read_app_file(EZ) ->
     case zip:list_dir(EZ) of
         {ok, [_|ZippedFiles]} ->
@@ -229,13 +193,11 @@ read_app_file(EZ) ->
             {error, {invalid_ez, Reason}}
     end.
 
-%% Return the path of the .app files in ebin/.
 find_app_files(ZippedFiles) ->
     {ok, RE} = re:compile("^.*/ebin/.*.app$"),
     [Path || {zip_file, Path, _, _, _, _} <- ZippedFiles,
              re:run(Path, RE, [{capture, none}]) =:= match].
 
-%% Parse a binary into a term.
 parse_binary(Bin) ->
     try
         {ok, Ts, _} = erl_scan:string(binary_to_list(Bin)),
@@ -245,85 +207,10 @@ parse_binary(Bin) ->
         Err -> {error, {invalid_app, Err}}
     end.
 
-%% Pretty print a list of plugins.
-format_plugins(Pattern, Opts, PluginsFile, PluginsDir) ->
-    Verbose = proplists:get_bool(?VERBOSE_OPT, Opts),
-    Minimal = proplists:get_bool(?MINIMAL_OPT, Opts),
-    Format = case {Verbose, Minimal} of
-                 {false, false} -> normal;
-                 {true,  false} -> verbose;
-                 {false, true}  -> minimal;
-                 {true,  true}  -> throw({error_string,
-                                          "Cannot specify -m and -v together"})
-             end,
-    OnlyEnabled    = proplists:get_bool(?ENABLED_OPT,     Opts),
-    OnlyEnabledAll = proplists:get_bool(?ENABLED_ALL_OPT, Opts),
-
-    AvailablePlugins = find_plugins(PluginsDir),
-    EnabledExplicitly = read_enabled_plugins(PluginsFile),
-    EnabledImplicitly =
-        calculate_required_plugins(EnabledExplicitly, AvailablePlugins) --
-        EnabledExplicitly,
-    {ok, RE} = re:compile(Pattern),
-    Plugins = [ Plugin ||
-                  Plugin = #plugin{name = Name} <- AvailablePlugins,
-                  re:run(atom_to_list(Name), RE, [{capture, none}]) =:= match,
-                  if OnlyEnabled    ->  lists:member(Name, EnabledExplicitly);
-                     OnlyEnabledAll -> (lists:member(Name, EnabledExplicitly) or
-                                        lists:member(Name, EnabledImplicitly));
-                     true           -> true
-                  end],
-    Plugins1 = usort_plugins(Plugins),
-    MaxWidth = lists:max([length(atom_to_list(Name)) ||
-                             #plugin{name = Name} <- Plugins1] ++ [0]),
-    [format_plugin(P, EnabledExplicitly, EnabledImplicitly, Format,
-                   MaxWidth) || P <- Plugins1],
-    ok.
-
-format_plugin(#plugin{name = Name, version = Version,
-                      description = Description, dependencies = Deps},
-              EnabledExplicitly, EnabledImplicitly, Format, MaxWidth) ->
-    Glyph = case {lists:member(Name, EnabledExplicitly),
-                  lists:member(Name, EnabledImplicitly)} of
-                {true, false} -> "[E]";
-                {false, true} -> "[e]";
-                _             -> "[ ]"
-            end,
-    case Format of
-        minimal -> io:format("~s~n", [Name]);
-        normal  -> io:format("~s ~-" ++ integer_to_list(MaxWidth) ++
-                                 "w ~s~n", [Glyph, Name, Version]);
-        verbose -> io:format("~s ~w~n", [Glyph, Name]),
-                   io:format("    Version:    \t~s~n", [Version]),
-                   case Deps of
-                       [] -> ok;
-                       _  -> io:format("    Dependencies:\t~p~n", [Deps])
-                   end,
-                   io:format("    Description:\t~s~n", [Description]),
-                   io:format("~n")
-    end.
-
-print_list(Header, Plugins) ->
-    io:format(fmt_list(Header, Plugins)).
-
-fmt_list(Header, Plugins) ->
-    lists:flatten(
-      [Header, $\n, [io_lib:format("  ~s~n", [P]) || P <- Plugins]]).
-
-usort_plugins(Plugins) ->
-    lists:usort(fun plugins_cmp/2, Plugins).
-
-plugins_cmp(#plugin{name = N1, version = V1},
-            #plugin{name = N2, version = V2}) ->
-    {N1, V1} =< {N2, V2}.
-
-%% Filter out applications that can be loaded *right now*.
 filter_applications(Applications) ->
     [Application || Application <- Applications,
                     not is_available_app(Application)].
 
-%% Return whether is application is already available (and hence
-%% doesn't need enabling).
 is_available_app(Application) ->
     case application:load(Application) of
         {error, {already_loaded, _}} -> true;
@@ -332,78 +219,8 @@ is_available_app(Application) ->
         _                            -> false
     end.
 
-%% Return the names of the given plugins.
 plugin_names(Plugins) ->
     [Name || #plugin{name = Name} <- Plugins].
 
-%% Find plugins by name in a list of plugins.
 lookup_plugins(Names, AllPlugins) ->
     [P || P = #plugin{name = Name} <- AllPlugins, lists:member(Name, Names)].
-
-%% Read the enabled plugin names from disk.
-read_enabled_plugins(PluginsFile) ->
-    case rabbit_file:read_term_file(PluginsFile) of
-        {ok, [Plugins]} -> Plugins;
-        {ok, []}        -> [];
-        {ok, [_|_]}     -> throw({error, {malformed_enabled_plugins_file,
-                                          PluginsFile}});
-        {error, enoent} -> [];
-        {error, Reason} -> throw({error, {cannot_read_enabled_plugins_file,
-                                          PluginsFile, Reason}})
-    end.
-
-%% Write the enabled plugin names on disk.
-write_enabled_plugins(PluginsFile, Plugins) ->
-    case rabbit_file:write_term_file(PluginsFile, [Plugins]) of
-        ok              -> ok;
-        {error, Reason} -> throw({error, {cannot_write_enabled_plugins_file,
-                                          PluginsFile, Reason}})
-    end.
-
-calculate_required_plugins(Sources, AllPlugins) ->
-    calculate_dependencies(false, Sources, AllPlugins).
-
-calculate_dependencies(Reverse, Sources, AllPlugins) ->
-    {ok, G} = rabbit_misc:build_acyclic_graph(
-                fun (App, _Deps) -> [{App, App}] end,
-                fun (App,  Deps) -> [{App, Dep} || Dep <- Deps] end,
-                [{Name, Deps}
-                 || #plugin{name = Name, dependencies = Deps} <- AllPlugins]),
-    Dests = case Reverse of
-                false -> digraph_utils:reachable(Sources, G);
-                true  -> digraph_utils:reaching(Sources, G)
-            end,
-    true = digraph:delete(G),
-    Dests.
-
-maybe_warn_mochiweb(Enabled) ->
-    V = erlang:system_info(otp_release),
-    case lists:member(mochiweb, Enabled) andalso V < "R13B01" of
-        true ->
-            Stars = string:copies("*", 80),
-            io:format("~n~n~s~n"
-                      "  Warning: Mochiweb enabled and Erlang version ~s "
-                      "detected.~n"
-                      "  Enabling plugins that depend on Mochiweb is not "
-                      "supported on this Erlang~n"
-                      "  version. At least R13B01 is required.~n~n"
-                      "  RabbitMQ will not start successfully in this "
-                      "configuration. You *must*~n"
-                      "  disable the Mochiweb plugin, or upgrade Erlang.~n"
-                      "~s~n~n~n", [Stars, V, Stars]);
-        false ->
-            ok
-    end.
-
-report_change() ->
-    io:format("Plugin configuration has changed. "
-              "Restart RabbitMQ for changes to take effect.~n"),
-    case os:type() of
-        {win32, _OsName} ->
-             io:format("If you have RabbitMQ running as a service then you must"
-                       " reinstall by running~n  rabbitmq-service.bat stop~n"
-                       "  rabbitmq-service.bat install~n"
-                       "  rabbitmq-service.bat start~n~n");
-        _ ->
-             ok
-    end.
diff --git a/src/rabbit_plugins_main.erl b/src/rabbit_plugins_main.erl
new file mode 100644
index 00000000..948d2ab0
--- /dev/null
+++ b/src/rabbit_plugins_main.erl
@@ -0,0 +1,267 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License
+%% at http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and
+%% limitations under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2011-2013 GoPivotal, Inc.  All rights reserved.
+%%
+
+-module(rabbit_plugins_main).
+-include("rabbit.hrl").
+
+-export([start/0, stop/0]).
+
+-define(VERBOSE_OPT, "-v").
+-define(MINIMAL_OPT, "-m").
+-define(ENABLED_OPT, "-E").
+-define(ENABLED_ALL_OPT, "-e").
+
+-define(VERBOSE_DEF, {?VERBOSE_OPT, flag}).
+-define(MINIMAL_DEF, {?MINIMAL_OPT, flag}).
+-define(ENABLED_DEF, {?ENABLED_OPT, flag}).
+-define(ENABLED_ALL_DEF, {?ENABLED_ALL_OPT, flag}).
+
+-define(GLOBAL_DEFS, []).
+
+-define(COMMANDS,
+        [{list, [?VERBOSE_DEF, ?MINIMAL_DEF, ?ENABLED_DEF, ?ENABLED_ALL_DEF]},
+         enable,
+         disable]).
+
+%%----------------------------------------------------------------------------
+
+-ifdef(use_specs).
+
+-spec(start/0 :: () -> no_return()).
+-spec(stop/0 :: () -> 'ok').
+-spec(usage/0 :: () -> no_return()).
+
+-endif.
+
+%%----------------------------------------------------------------------------
+
+start() ->
+    {ok, [[PluginsFile|_]|_]} =
+        init:get_argument(enabled_plugins_file),
+    {ok, [[PluginsDir|_]|_]} = init:get_argument(plugins_dist_dir),
+    {Command, Opts, Args} =
+        case rabbit_misc:parse_arguments(?COMMANDS, ?GLOBAL_DEFS,
+                                         init:get_plain_arguments())
+        of
+            {ok, Res}  -> Res;
+            no_command -> print_error("could not recognise command", []),
+                          usage()
+        end,
+
+    PrintInvalidCommandError =
+        fun () ->
+                print_error("invalid command '~s'",
+                            [string:join([atom_to_list(Command) | Args], " ")])
+        end,
+
+    case catch action(Command, Args, Opts, PluginsFile, PluginsDir) of
+        ok ->
+            rabbit_misc:quit(0);
+        {'EXIT', {function_clause, [{?MODULE, action, _} | _]}} ->
+            PrintInvalidCommandError(),
+            usage();
+        {'EXIT', {function_clause, [{?MODULE, action, _, _} | _]}} ->
+            PrintInvalidCommandError(),
+            usage();
+        {error, Reason} ->
+            print_error("~p", [Reason]),
+            rabbit_misc:quit(2);
+        {error_string, Reason} ->
+            print_error("~s", [Reason]),
+            rabbit_misc:quit(2);
+        Other ->
+            print_error("~p", [Other]),
+            rabbit_misc:quit(2)
+    end.
+
+stop() ->
+    ok.
+
+%%----------------------------------------------------------------------------
+
+action(list, [], Opts, PluginsFile, PluginsDir) ->
+    action(list, [".*"], Opts, PluginsFile, PluginsDir);
+action(list, [Pat], Opts, PluginsFile, PluginsDir) ->
+    format_plugins(Pat, Opts, PluginsFile, PluginsDir);
+
+action(enable, ToEnable0, _Opts, PluginsFile, PluginsDir) ->
+    case ToEnable0 of
+        [] -> throw({error_string, "Not enough arguments for 'enable'"});
+        _  -> ok
+    end,
+    AllPlugins = rabbit_plugins:list(PluginsDir),
+    Enabled = rabbit_plugins:read_enabled(PluginsFile),
+    ImplicitlyEnabled = rabbit_plugins:dependencies(false,
+                                                    Enabled, AllPlugins),
+    ToEnable = [list_to_atom(Name) || Name <- ToEnable0],
+    Missing = ToEnable -- plugin_names(AllPlugins),
+    NewEnabled = lists:usort(Enabled ++ ToEnable),
+    NewImplicitlyEnabled = rabbit_plugins:dependencies(false,
+                                                       NewEnabled, AllPlugins),
+    MissingDeps = (NewImplicitlyEnabled -- plugin_names(AllPlugins)) -- Missing,
+    case {Missing, MissingDeps} of
+        {[],   []} -> ok;
+        {Miss, []} -> throw({error_string, fmt_missing("plugins",      Miss)});
+        {[], Miss} -> throw({error_string, fmt_missing("dependencies", Miss)});
+        {_,     _} -> throw({error_string,
+                             fmt_missing("plugins", Missing) ++
+                                 fmt_missing("dependencies", MissingDeps)})
+    end,
+    write_enabled_plugins(PluginsFile, NewEnabled),
+    case NewEnabled -- ImplicitlyEnabled of
+        [] -> io:format("Plugin configuration unchanged.~n");
+        _  -> print_list("The following plugins have been enabled:",
+                         NewImplicitlyEnabled -- ImplicitlyEnabled),
+              report_change()
+    end;
+
+action(disable, ToDisable0, _Opts, PluginsFile, PluginsDir) ->
+    case ToDisable0 of
+        [] -> throw({error_string, "Not enough arguments for 'disable'"});
+        _  -> ok
+    end,
+    ToDisable = [list_to_atom(Name) || Name <- ToDisable0],
+    Enabled = rabbit_plugins:read_enabled(PluginsFile),
+    AllPlugins = rabbit_plugins:list(PluginsDir),
+    Missing = ToDisable -- plugin_names(AllPlugins),
+    case Missing of
+        [] -> ok;
+        _  -> print_list("Warning: the following plugins could not be found:",
+                         Missing)
+    end,
+    ToDisableDeps = rabbit_plugins:dependencies(true, ToDisable, AllPlugins),
+    NewEnabled = Enabled -- ToDisableDeps,
+    case length(Enabled) =:= length(NewEnabled) of
+        true  -> io:format("Plugin configuration unchanged.~n");
+        false -> ImplicitlyEnabled =
+                     rabbit_plugins:dependencies(false, Enabled, AllPlugins),
+                 NewImplicitlyEnabled =
+                     rabbit_plugins:dependencies(false,
+                                                 NewEnabled, AllPlugins),
+                 print_list("The following plugins have been disabled:",
+                            ImplicitlyEnabled -- NewImplicitlyEnabled),
+                 write_enabled_plugins(PluginsFile, NewEnabled),
+                 report_change()
+    end.
+
+%%----------------------------------------------------------------------------
+
+print_error(Format, Args) ->
+    rabbit_misc:format_stderr("Error: " ++ Format ++ "~n", Args).
+
+usage() ->
+    io:format("~s", [rabbit_plugins_usage:usage()]),
+    rabbit_misc:quit(1).
+
+%% Pretty print a list of plugins.
+format_plugins(Pattern, Opts, PluginsFile, PluginsDir) ->
+    Verbose = proplists:get_bool(?VERBOSE_OPT, Opts),
+    Minimal = proplists:get_bool(?MINIMAL_OPT, Opts),
+    Format = case {Verbose, Minimal} of
+                 {false, false} -> normal;
+                 {true,  false} -> verbose;
+                 {false, true}  -> minimal;
+                 {true,  true}  -> throw({error_string,
+                                          "Cannot specify -m and -v together"})
+             end,
+    OnlyEnabled    = proplists:get_bool(?ENABLED_OPT,     Opts),
+    OnlyEnabledAll = proplists:get_bool(?ENABLED_ALL_OPT, Opts),
+
+    AvailablePlugins = rabbit_plugins:list(PluginsDir),
+    EnabledExplicitly = rabbit_plugins:read_enabled(PluginsFile),
+    EnabledImplicitly =
+        rabbit_plugins:dependencies(false, EnabledExplicitly,
+                                    AvailablePlugins) -- EnabledExplicitly,
+    Missing = [#plugin{name = Name, dependencies = []} ||
+                  Name <- ((EnabledExplicitly ++ EnabledImplicitly) --
+                               plugin_names(AvailablePlugins))],
+    {ok, RE} = re:compile(Pattern),
+    Plugins = [ Plugin ||
+                  Plugin = #plugin{name = Name} <- AvailablePlugins ++ Missing,
+                  re:run(atom_to_list(Name), RE, [{capture, none}]) =:= match,
+                  if OnlyEnabled    ->  lists:member(Name, EnabledExplicitly);
+                     OnlyEnabledAll -> (lists:member(Name,
+                                                     EnabledExplicitly) or
+                                        lists:member(Name, EnabledImplicitly));
+                     true           -> true
+                  end],
+    Plugins1 = usort_plugins(Plugins),
+    MaxWidth = lists:max([length(atom_to_list(Name)) ||
+                             #plugin{name = Name} <- Plugins1] ++ [0]),
+    [format_plugin(P, EnabledExplicitly, EnabledImplicitly,
+                   plugin_names(Missing), Format, MaxWidth) || P <- Plugins1],
+    ok.
+
+format_plugin(#plugin{name = Name, version = Version,
+                      description = Description, dependencies = Deps},
+              EnabledExplicitly, EnabledImplicitly, Missing,
+              Format, MaxWidth) ->
+    Glyph = case {lists:member(Name, EnabledExplicitly),
+                  lists:member(Name, EnabledImplicitly),
+                  lists:member(Name, Missing)} of
+                {true, false, false} -> "[E]";
+                {false, true, false} -> "[e]";
+                {_,        _,  true} -> "[!]";
+                _                    -> "[ ]"
+            end,
+    Opt = fun (_F, A, A) -> ok;
+              ( F, A, _) -> io:format(F, [A])
+          end,
+    case Format of
+        minimal -> io:format("~s~n", [Name]);
+        normal  -> io:format("~s ~-" ++ integer_to_list(MaxWidth) ++ "w ",
+                             [Glyph, Name]),
+                   Opt("~s", Version, undefined),
+                   io:format("~n");
+        verbose -> io:format("~s ~w~n", [Glyph, Name]),
+                   Opt("    Version:     \t~s~n", Version,     undefined),
+                   Opt("    Dependencies:\t~p~n", Deps,        []),
+                   Opt("    Description: \t~s~n", Description, undefined),
+                   io:format("~n")
+    end.
+
+print_list(Header, Plugins) ->
+    io:format("~s", [fmt_list(Header, Plugins)]). % data, not a format
+
+fmt_list(Header, Plugins) ->
+    lists:flatten(
+      [Header, $\n, [io_lib:format("  ~s~n", [P]) || P <- Plugins]]).
+
+fmt_missing(Desc, Missing) ->
+    fmt_list("The following " ++ Desc ++ " could not be found:", Missing).
+
+usort_plugins(Plugins) ->
+    lists:usort(fun plugins_cmp/2, Plugins).
+
+plugins_cmp(#plugin{name = N1, version = V1},
+            #plugin{name = N2, version = V2}) ->
+    {N1, V1} =< {N2, V2}.
+
+%% Return the names of the given plugins.
+plugin_names(Plugins) ->
+    [Name || #plugin{name = Name} <- Plugins].
+
+%% Write the enabled plugin names on disk.
+write_enabled_plugins(PluginsFile, Plugins) ->
+    case rabbit_file:write_term_file(PluginsFile, [Plugins]) of
+        ok              -> ok;
+        {error, Reason} -> throw({error, {cannot_write_enabled_plugins_file,
+                                          PluginsFile, Reason}})
+    end.
+
+report_change() ->
+    io:format("Plugin configuration has changed. "
+              "Restart RabbitMQ for changes to take effect.~n").
diff --git a/src/rabbit_policy.erl b/src/rabbit_policy.erl
new file mode 100644
index 00000000..0785d278
--- /dev/null
+++ b/src/rabbit_policy.erl
@@ -0,0 +1,319 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License
+%% at http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and
+%% limitations under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
+%%
+
+-module(rabbit_policy).
+
+%% TODO specs
+
+-behaviour(rabbit_runtime_parameter).
+
+-include("rabbit.hrl").
+
+-import(rabbit_misc, [pget/2]).
+
+-export([register/0]).
+-export([invalidate/0, recover/0]).
+-export([name/1, get/2, set/1]).
+-export([validate/4, notify/4, notify_clear/3]).
+-export([parse_set/6, set/6, delete/2, lookup/2, list/0, list/1,
+         list_formatted/1, info_keys/0]).
+
+-rabbit_boot_step({?MODULE,
+                   [{description, "policy parameters"},
+                    {mfa, {rabbit_policy, register, []}},
+                    {requires, rabbit_registry},
+                    {enables, recovery}]}).
+
+register() ->
+    rabbit_registry:register(runtime_parameter, <<"policy">>, ?MODULE).
+
+name(#amqqueue{policy = Policy}) -> name0(Policy);
+name(#exchange{policy = Policy}) -> name0(Policy).
+
+name0(undefined) -> none;
+name0(Policy)    -> pget(name, Policy).
+
+set(Q = #amqqueue{name = Name}) -> Q#amqqueue{policy = set0(Name)};
+set(X = #exchange{name = Name}) -> rabbit_exchange_decorator:set(
+                                     X#exchange{policy = set0(Name)}).
+
+set0(Name = #resource{virtual_host = VHost}) -> match(Name, list(VHost)).
+
+set(Q = #amqqueue{name = Name}, Ps) -> Q#amqqueue{policy = match(Name, Ps)};
+set(X = #exchange{name = Name}, Ps) -> rabbit_exchange_decorator:set(
+                                         X#exchange{policy = match(Name, Ps)}).
+
+get(Name, #amqqueue{policy = Policy}) -> get0(Name, Policy);
+get(Name, #exchange{policy = Policy}) -> get0(Name, Policy);
+%% Caution - SLOW.
+get(Name, EntityName = #resource{virtual_host = VHost}) ->
+    get0(Name, match(EntityName, list(VHost))).
+
+get0(_Name, undefined) -> {error, not_found};
+get0(Name, List)       -> case pget(definition, List) of
+                              undefined -> {error, not_found};
+                              Policy    -> case pget(Name, Policy) of
+                                               undefined -> {error, not_found};
+                                               Value    -> {ok, Value}
+                                           end
+                          end.
+
+%%----------------------------------------------------------------------------
+
+%% Gets called during upgrades - therefore must not assume anything about the
+%% state of Mnesia
+invalidate() ->
+    rabbit_file:write_file(invalid_file(), <<"">>).
+
+recover() ->
+    case rabbit_file:is_file(invalid_file()) of
+        true  -> recover0(),
+                 rabbit_file:delete(invalid_file());
+        false -> ok
+    end.
+
+%% To get here we have to have just completed an Mnesia upgrade - i.e. we are
+%% the first node starting. So we can rewrite the whole database.  Note that
+%% recovery has not yet happened; we must work with the rabbit_durable_<thing>
+%% variants.
+recover0() ->
+    Xs = mnesia:dirty_match_object(rabbit_durable_exchange, #exchange{_ = '_'}),
+    Qs = mnesia:dirty_match_object(rabbit_durable_queue,    #amqqueue{_ = '_'}),
+    Policies = list(),
+    [rabbit_misc:execute_mnesia_transaction(
+       fun () ->
+               mnesia:write(rabbit_durable_exchange, set(X, Policies), write)
+       end) || X <- Xs],
+    [rabbit_misc:execute_mnesia_transaction(
+       fun () ->
+               mnesia:write(rabbit_durable_queue, set(Q, Policies), write)
+       end) || Q <- Qs],
+    ok.
+
+invalid_file() ->
+    filename:join(rabbit_mnesia:dir(), "policies_are_invalid").
+
+%%----------------------------------------------------------------------------
+
+parse_set(VHost, Name, Pattern, Definition, Priority, ApplyTo) ->
+    try list_to_integer(Priority) of
+        Num -> parse_set0(VHost, Name, Pattern, Definition, Num, ApplyTo)
+    catch
+        error:badarg -> {error, "~p priority must be a number", [Priority]}
+    end.
+
+parse_set0(VHost, Name, Pattern, Defn, Priority, ApplyTo) ->
+    case rabbit_misc:json_decode(Defn) of
+        {ok, JSON} ->
+            set0(VHost, Name,
+                 [{<<"pattern">>,    list_to_binary(Pattern)},
+                  {<<"definition">>, rabbit_misc:json_to_term(JSON)},
+                  {<<"priority">>,   Priority},
+                  {<<"apply-to">>,   ApplyTo}]);
+        error ->
+            {error_string, "JSON decoding error"}
+    end.
+
+set(VHost, Name, Pattern, Definition, Priority, ApplyTo) ->
+    PolicyProps = [{<<"pattern">>,    Pattern},
+                   {<<"definition">>, Definition},
+                   {<<"priority">>,   case Priority of
+                                          undefined -> 0;
+                                          _         -> Priority
+                                      end},
+                   {<<"apply-to">>,   case ApplyTo of
+                                          undefined -> <<"all">>;
+                                          _         -> ApplyTo
+                                      end}],
+    set0(VHost, Name, PolicyProps).
+
+set0(VHost, Name, Term) ->
+    rabbit_runtime_parameters:set_any(VHost, <<"policy">>, Name, Term).
+
+delete(VHost, Name) ->
+    rabbit_runtime_parameters:clear_any(VHost, <<"policy">>, Name).
+
+lookup(VHost, Name) ->
+    case rabbit_runtime_parameters:lookup(VHost, <<"policy">>, Name) of
+        not_found  -> not_found;
+        P          -> p(P, fun ident/1)
+    end.
+
+list() ->
+    list('_').
+
+list(VHost) ->
+    list0(VHost, fun ident/1).
+
+list_formatted(VHost) ->
+    order_policies(list0(VHost, fun format/1)).
+
+list0(VHost, DefnFun) ->
+    [p(P, DefnFun) || P <- rabbit_runtime_parameters:list(VHost, <<"policy">>)].
+
+order_policies(PropList) ->
+    lists:sort(fun (A, B) -> pget(priority, A) =< pget(priority, B) end,
+               PropList). % lists:sort/2 requires a =< ordering fun
+
+p(Parameter, DefnFun) ->
+    Value = pget(value, Parameter),
+    [{vhost,      pget(vhost, Parameter)},
+     {name,       pget(name, Parameter)},
+     {pattern,    pget(<<"pattern">>, Value)},
+     {'apply-to', pget(<<"apply-to">>, Value)},
+     {definition, DefnFun(pget(<<"definition">>, Value))},
+     {priority,   pget(<<"priority">>, Value)}].
+
+format(Term) ->
+    {ok, JSON} = rabbit_misc:json_encode(rabbit_misc:term_to_json(Term)),
+    list_to_binary(JSON).
+
+ident(X) -> X.
+
+info_keys() -> [vhost, name, 'apply-to', pattern, definition, priority].
+
+%%----------------------------------------------------------------------------
+
+validate(_VHost, <<"policy">>, Name, Term) ->
+    rabbit_parameter_validation:proplist(
+      Name, policy_validation(), Term).
+
+notify(VHost, <<"policy">>, _Name, _Term) ->
+    update_policies(VHost).
+
+notify_clear(VHost, <<"policy">>, _Name) ->
+    update_policies(VHost).
+
+%%----------------------------------------------------------------------------
+
+update_policies(VHost) ->
+    Policies = list(VHost),
+    {Xs, Qs} = rabbit_misc:execute_mnesia_transaction(
+                 fun() ->
+                         {[update_exchange(X, Policies) ||
+                              X <- rabbit_exchange:list(VHost)],
+                          [update_queue(Q, Policies) ||
+                              Q <- rabbit_amqqueue:list(VHost)]}
+                 end),
+    [catch notify(X) || X <- Xs],
+    [catch notify(Q) || Q <- Qs],
+    ok.
+
+update_exchange(X = #exchange{name = XName, policy = OldPolicy}, Policies) ->
+    case match(XName, Policies) of
+        OldPolicy -> no_change;
+        NewPolicy -> case rabbit_exchange:update(
+                            XName, fun (X0) ->
+                                           rabbit_exchange_decorator:set(
+                                             X0 #exchange{policy = NewPolicy})
+                                   end) of
+                         #exchange{} = X1 -> {X, X1};
+                         not_found        -> {X, X }
+                     end
+    end.
+
+update_queue(Q = #amqqueue{name = QName, policy = OldPolicy}, Policies) ->
+    case match(QName, Policies) of
+        OldPolicy -> no_change;
+        NewPolicy -> rabbit_amqqueue:update(
+                       QName, fun(Q1) -> Q1#amqqueue{policy = NewPolicy} end),
+                     {Q, Q#amqqueue{policy = NewPolicy}}
+    end.
+
+notify(no_change)->
+    ok;
+notify({X1 = #exchange{}, X2 = #exchange{}}) ->
+    rabbit_exchange:policy_changed(X1, X2);
+notify({Q1 = #amqqueue{}, Q2 = #amqqueue{}}) ->
+    rabbit_amqqueue:policy_changed(Q1, Q2).
+
+match(Name, Policies) ->
+    case lists:sort(fun sort_pred/2, [P || P <- Policies, matches(Name, P)]) of
+        []               -> undefined;
+        [Policy | _Rest] -> Policy
+    end.
+
+%% True when Policy applies to the given resource. The cheap vhost and
+%% kind comparisons run first so that the regex is only executed for
+%% policies that could actually apply; the result is the same boolean.
+matches(#resource{name = Name, kind = Kind, virtual_host = VHost}, Policy) ->
+    VHost =:= pget(vhost, Policy) andalso
+        matches_type(Kind, pget('apply-to', Policy)) andalso
+        match =:= re:run(Name, pget(pattern, Policy), [{capture, none}]).
+
+matches_type(exchange, <<"exchanges">>) -> true;
+matches_type(queue,    <<"queues">>)    -> true;
+matches_type(exchange, <<"all">>)       -> true;
+matches_type(queue,    <<"all">>)       -> true;
+matches_type(_,        _)               -> false.
+
+sort_pred(A, B) -> pget(priority, A) >= pget(priority, B).
+
+%%----------------------------------------------------------------------------
+
+policy_validation() ->
+    [{<<"priority">>,   fun rabbit_parameter_validation:number/2, mandatory},
+     {<<"pattern">>,    fun rabbit_parameter_validation:regex/2,  mandatory},
+     {<<"apply-to">>,   fun apply_to_validation/2,                optional},
+     {<<"definition">>, fun validation/2,                         mandatory}].
+
+validation(_Name, []) ->
+    {error, "no policy provided", []};
+validation(_Name, Terms) when is_list(Terms) ->
+    {Keys, Modules} = lists:unzip(
+                        rabbit_registry:lookup_all(policy_validator)),
+    [] = dups(Keys), %% ASSERTION
+    Validators = lists:zipwith(fun (M, K) ->  {M, a2b(K)} end, Modules, Keys),
+    case is_proplist(Terms) of
+        true  -> {TermKeys, _} = lists:unzip(Terms),
+                 case dups(TermKeys) of
+                     []   -> validation0(Validators, Terms);
+                     Dup  -> {error, "~p duplicate keys not allowed", [Dup]}
+                 end;
+        false -> {error, "definition must be a dictionary: ~p", [Terms]}
+    end;
+validation(_Name, Term) ->
+    {error, "parse error while reading policy: ~p", [Term]}.
+
+validation0(Validators, Terms) ->
+    case lists:foldl(
+           fun (Mod, {ok, TermsLeft}) ->
+                   ModKeys = proplists:get_all_values(Mod, Validators),
+                   case [T || {Key, _} = T <- TermsLeft,
+                              lists:member(Key, ModKeys)] of
+                       []    -> {ok, TermsLeft};
+                       Scope -> {Mod:validate_policy(Scope), TermsLeft -- Scope}
+                   end;
+               (_, Acc) ->
+                   Acc
+           end, {ok, Terms}, proplists:get_keys(Validators)) of
+         {ok, []} ->
+             ok;
+         {ok, Unvalidated} ->
+             {error, "~p are not recognised policy settings", [Unvalidated]};
+         {Error, _} ->
+             Error
+    end.
+
+a2b(A) -> list_to_binary(atom_to_list(A)).
+
+dups(L) -> L -- lists:usort(L).
+
+is_proplist(L) -> length(L) =:= length([I || I = {_, _} <- L]).
+
+%% Validator for the optional `apply-to' key: accepts exactly the three
+%% known targets. The rejected term is reported with ~p rather than ~s
+%% because it may not be a string or binary, in which case formatting the
+%% message with ~s would fail instead of producing a validation error.
+apply_to_validation(_Name, <<"all">>)       -> ok;
+apply_to_validation(_Name, <<"exchanges">>) -> ok;
+apply_to_validation(_Name, <<"queues">>)    -> ok;
+apply_to_validation(_Name, Term) ->
+    {error, "apply-to ~p unrecognised; should be 'queues', 'exchanges' "
+     "or 'all'", [Term]}.
diff --git a/src/rabbit_policy_validator.erl b/src/rabbit_policy_validator.erl
new file mode 100644
index 00000000..661db73d
--- /dev/null
+++ b/src/rabbit_policy_validator.erl
@@ -0,0 +1,39 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License
+%% at http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and
+%% limitations under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
+%%
+
+-module(rabbit_policy_validator).
+
+-ifdef(use_specs).
+
+-export_type([validate_results/0]).
+
+-type(validate_results() ::
+        'ok' | {error, string(), [term()]} | [validate_results()]).
+
+-callback validate_policy([{binary(), term()}]) -> validate_results().
+
+-else.
+
+-export([behaviour_info/1]).
+
+behaviour_info(callbacks) ->
+    [
+     {validate_policy, 1}
+    ];
+behaviour_info(_Other) ->
+    undefined.
+
+-endif.
diff --git a/src/rabbit_prelaunch.erl b/src/rabbit_prelaunch.erl
index 162d44f1..be407a02 100644
--- a/src/rabbit_prelaunch.erl
+++ b/src/rabbit_prelaunch.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_prelaunch).
@@ -31,212 +31,21 @@
 
 -spec(start/0 :: () -> no_return()).
 -spec(stop/0 :: () -> 'ok').
-%% Shut dialyzer up
--spec(terminate/1 :: (string()) -> no_return()).
--spec(terminate/2 :: (string(), [any()]) -> no_return()).
 
 -endif.
 
 %%----------------------------------------------------------------------------
 
 start() ->
-    io:format("Activating RabbitMQ plugins ...~n"),
-
-    %% Determine our various directories
-    [EnabledPluginsFile, PluginsDistDir, UnpackedPluginDir, NodeStr] =
-        init:get_plain_arguments(),
-    RootName = UnpackedPluginDir ++ "/rabbit",
-
-    prepare_plugins(EnabledPluginsFile, PluginsDistDir, UnpackedPluginDir),
-
-    %% Build a list of required apps based on the fixed set, and any plugins
-    PluginApps = find_plugins(UnpackedPluginDir),
-    RequiredApps = ?BaseApps ++ PluginApps,
-
-    %% Build the entire set of dependencies - this will load the
-    %% applications along the way
-    AllApps = case catch sets:to_list(expand_dependencies(RequiredApps)) of
-                  {failed_to_load_app, App, Err} ->
-                      terminate("failed to load application ~s:~n~p",
-                                [App, Err]);
-                  AppList ->
-                      AppList
-              end,
-    AppVersions = [determine_version(App) || App <- AllApps],
-    RabbitVersion = proplists:get_value(rabbit, AppVersions),
-
-    %% Build the overall release descriptor
-    RDesc = {release,
-             {"rabbit", RabbitVersion},
-             {erts, erlang:system_info(version)},
-             AppVersions},
-
-    %% Write it out to $RABBITMQ_PLUGINS_EXPAND_DIR/rabbit.rel
-    rabbit_file:write_file(RootName ++ ".rel", io_lib:format("~p.~n", [RDesc])),
-
-    %% We exclude mochiweb due to its optional use of fdsrv.
-    XRefExclude = [mochiweb],
-
-    %% Compile the script
-    ScriptFile = RootName ++ ".script",
-    case systools:make_script(RootName, [local, silent,
-                                         {exref, AllApps -- XRefExclude}]) of
-        {ok, Module, Warnings} ->
-            %% This gets lots of spurious no-source warnings when we
-            %% have .ez files, so we want to supress them to prevent
-            %% hiding real issues. On Ubuntu, we also get warnings
-            %% about kernel/stdlib sources being out of date, which we
-            %% also ignore for the same reason.
-            WarningStr = Module:format_warning(
-                           [W || W <- Warnings,
-                                 case W of
-                                     {warning, {source_not_found, _}} -> false;
-                                     {warning, {obj_out_of_date, {_,_,WApp,_,_}}}
-                                       when WApp == mnesia;
-                                            WApp == stdlib;
-                                            WApp == kernel;
-                                            WApp == sasl;
-                                            WApp == crypto;
-                                            WApp == os_mon -> false;
-                                     _ -> true
-                                 end]),
-            case length(WarningStr) of
-                0 -> ok;
-                _ -> S = string:copies("*", 80),
-                     io:format("~n~s~n~s~s~n~n", [S, WarningStr, S])
-            end,
-            ok;
-        {error, Module, Error} ->
-            terminate("generation of boot script file ~s failed:~n~s",
-                      [ScriptFile, Module:format_error(Error)])
-    end,
-
-    case post_process_script(ScriptFile) of
-        ok -> ok;
-        {error, Reason} ->
-            terminate("post processing of boot script file ~s failed:~n~w",
-                      [ScriptFile, Reason])
-    end,
-    case systools:script2boot(RootName) of
-        ok    -> ok;
-        error -> terminate("failed to compile boot script file ~s",
-                           [ScriptFile])
-    end,
-    io:format("~w plugins activated:~n", [length(PluginApps)]),
-    [io:format("* ~s-~s~n", [App, proplists:get_value(App, AppVersions)])
-     || App <- PluginApps],
-    io:nl(),
-
+    [NodeStr] = init:get_plain_arguments(),
     ok = duplicate_node_check(NodeStr),
-
-    terminate(0),
+    rabbit_misc:quit(0),
     ok.
 
 stop() ->
     ok.
 
-determine_version(App) ->
-    application:load(App),
-    {ok, Vsn} = application:get_key(App, vsn),
-    {App, Vsn}.
-
-delete_recursively(Fn) ->
-    case rabbit_file:recursive_delete([Fn]) of
-        ok                 -> ok;
-        {error, {Path, E}} -> {error, {cannot_delete, Path, E}};
-        Error              -> Error
-    end.
-
-prepare_plugins(EnabledPluginsFile, PluginsDistDir, DestDir) ->
-    AllPlugins = rabbit_plugins:find_plugins(PluginsDistDir),
-    Enabled = rabbit_plugins:read_enabled_plugins(EnabledPluginsFile),
-    ToUnpack = rabbit_plugins:calculate_required_plugins(Enabled, AllPlugins),
-    ToUnpackPlugins = rabbit_plugins:lookup_plugins(ToUnpack, AllPlugins),
-
-    Missing = Enabled -- rabbit_plugins:plugin_names(ToUnpackPlugins),
-    case Missing of
-        [] -> ok;
-        _  -> io:format("Warning: the following enabled plugins were "
-                        "not found: ~p~n", [Missing])
-    end,
-
-    %% Eliminate the contents of the destination directory
-    case delete_recursively(DestDir) of
-        ok         -> ok;
-        {error, E} -> terminate("Could not delete dir ~s (~p)", [DestDir, E])
-    end,
-    case filelib:ensure_dir(DestDir ++ "/") of
-        ok          -> ok;
-        {error, E2} -> terminate("Could not create dir ~s (~p)", [DestDir, E2])
-    end,
-
-    [prepare_plugin(Plugin, DestDir) || Plugin <- ToUnpackPlugins].
-
-prepare_plugin(#plugin{type = ez, location = Location}, PluginDestDir) ->
-    zip:unzip(Location, [{cwd, PluginDestDir}]);
-prepare_plugin(#plugin{type = dir, name = Name, location = Location},
-               PluginsDestDir) ->
-    rabbit_file:recursive_copy(Location,
-                               filename:join([PluginsDestDir, Name])).
-
-find_plugins(PluginDir) ->
-    [prepare_dir_plugin(PluginName) ||
-        PluginName <- filelib:wildcard(PluginDir ++ "/*/ebin/*.app")].
-
-prepare_dir_plugin(PluginAppDescFn) ->
-    %% Add the plugin ebin directory to the load path
-    PluginEBinDirN = filename:dirname(PluginAppDescFn),
-    code:add_path(PluginEBinDirN),
-
-    %% We want the second-last token
-    NameTokens = string:tokens(PluginAppDescFn,"/."),
-    PluginNameString = lists:nth(length(NameTokens) - 1, NameTokens),
-    list_to_atom(PluginNameString).
-
-expand_dependencies(Pending) ->
-    expand_dependencies(sets:new(), Pending).
-expand_dependencies(Current, []) ->
-    Current;
-expand_dependencies(Current, [Next|Rest]) ->
-    case sets:is_element(Next, Current) of
-        true ->
-            expand_dependencies(Current, Rest);
-        false ->
-            case application:load(Next) of
-                ok ->
-                    ok;
-                {error, {already_loaded, _}} ->
-                    ok;
-                {error, Reason} ->
-                    throw({failed_to_load_app, Next, Reason})
-            end,
-            {ok, Required} = application:get_key(Next, applications),
-            Unique = [A || A <- Required, not(sets:is_element(A, Current))],
-            expand_dependencies(sets:add_element(Next, Current), Rest ++ Unique)
-    end.
-
-post_process_script(ScriptFile) ->
-    case file:consult(ScriptFile) of
-        {ok, [{script, Name, Entries}]} ->
-            NewEntries = lists:flatmap(fun process_entry/1, Entries),
-            case file:open(ScriptFile, [write]) of
-                {ok, Fd} ->
-                    io:format(Fd, "%% script generated at ~w ~w~n~p.~n",
-                              [date(), time(), {script, Name, NewEntries}]),
-                    file:close(Fd),
-                    ok;
-                {error, OReason} ->
-                    {error, {failed_to_open_script_file_for_writing, OReason}}
-            end;
-        {error, Reason} ->
-            {error, {failed_to_load_script, Reason}}
-    end.
-
-process_entry(Entry = {apply,{application,start_boot,[mnesia,permanent]}}) ->
-    [{apply,{rabbit,maybe_hipe_compile,[]}},
-     {apply,{rabbit,prepare,[]}}, Entry];
-process_entry(Entry) ->
-    [Entry].
+%%----------------------------------------------------------------------------
 
 %% Check whether a node with the same name is already running
 duplicate_node_check([]) ->
@@ -248,32 +57,16 @@ duplicate_node_check(NodeStr) ->
     case rabbit_nodes:names(NodeHost) of
         {ok, NamePorts}  ->
             case proplists:is_defined(NodeName, NamePorts) of
-                true -> io:format("node with name ~p "
+                true -> io:format("ERROR: node with name ~p "
                                   "already running on ~p~n",
                                   [NodeName, NodeHost]),
                         io:format(rabbit_nodes:diagnostics([Node]) ++ "~n"),
-                        terminate(?ERROR_CODE);
+                        rabbit_misc:quit(?ERROR_CODE);
                 false -> ok
             end;
         {error, EpmdReason} ->
-            terminate("epmd error for host ~p: ~p (~s)~n",
+            io:format("ERROR: epmd error for host ~p: ~p (~s)~n",
                       [NodeHost, EpmdReason,
-                       case EpmdReason of
-                           address -> "unable to establish tcp connection";
-                           timeout -> "timed out establishing tcp connection";
-                           _       -> inet:format_error(EpmdReason)
-                       end])
-    end.
-
-terminate(Fmt, Args) ->
-    io:format("ERROR: " ++ Fmt ++ "~n", Args),
-    terminate(?ERROR_CODE).
-
-terminate(Status) ->
-    case os:type() of
-        {unix,  _} -> halt(Status);
-        {win32, _} -> init:stop(Status),
-                      receive
-                      after infinity -> ok
-                      end
+                       rabbit_misc:format_inet_error(EpmdReason)]),
+            rabbit_misc:quit(?ERROR_CODE)
     end.
diff --git a/src/rabbit_queue_collector.erl b/src/rabbit_queue_collector.erl
index 6dad01cc..6406f7e9 100644
--- a/src/rabbit_queue_collector.erl
+++ b/src/rabbit_queue_collector.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_queue_collector).
diff --git a/src/rabbit_queue_index.erl b/src/rabbit_queue_index.erl
index 3ef769c7..f69d8355 100644
--- a/src/rabbit_queue_index.erl
+++ b/src/rabbit_queue_index.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_queue_index).
@@ -23,7 +23,7 @@
 
 -export([scan/3]).
 
--export([add_queue_ttl/0]).
+-export([add_queue_ttl/0, avoid_zeroes/0]).
 
 -define(CLEAN_FILENAME, "clean.dot").
 
@@ -123,9 +123,9 @@
 -define(REL_SEQ_BITS, 14).
 -define(SEGMENT_ENTRY_COUNT, 16384). %% trunc(math:pow(2,?REL_SEQ_BITS))).
 
-%% seq only is binary 00 followed by 14 bits of rel seq id
+%% seq only is binary 01 followed by 14 bits of rel seq id
 %% (range: 0 - 16383)
--define(REL_SEQ_ONLY_PREFIX, 00).
+-define(REL_SEQ_ONLY_PREFIX, 01).
 -define(REL_SEQ_ONLY_PREFIX_BITS, 2).
 -define(REL_SEQ_ONLY_RECORD_BYTES, 2).
 
@@ -162,7 +162,7 @@
 %%----------------------------------------------------------------------------
 
 -record(qistate, { dir, segments, journal_handle, dirty_count,
-                   max_journal_entries, on_sync, unsynced_msg_ids }).
+                   max_journal_entries, on_sync, unconfirmed }).
 
 -record(segment, { num, path, journal_entries, unacked }).
 
@@ -171,6 +171,7 @@
 %%----------------------------------------------------------------------------
 
 -rabbit_upgrade({add_queue_ttl, local, []}).
+-rabbit_upgrade({avoid_zeroes,  local, [add_queue_ttl]}).
 
 -ifdef(use_specs).
 
@@ -190,7 +191,7 @@
                               dirty_count         :: integer(),
                               max_journal_entries :: non_neg_integer(),
                               on_sync             :: on_sync_fun(),
-                              unsynced_msg_ids    :: gb_set()
+                              unconfirmed         :: gb_set()
                             }).
 -type(contains_predicate() :: fun ((rabbit_types:msg_id()) -> boolean())).
 -type(walker(A) :: fun ((A) -> 'finished' |
@@ -210,7 +211,7 @@
 -spec(deliver/2 :: ([seq_id()], qistate()) -> qistate()).
 -spec(ack/2 :: ([seq_id()], qistate()) -> qistate()).
 -spec(sync/1 :: (qistate()) -> qistate()).
--spec(needs_sync/1 :: (qistate()) -> boolean()).
+-spec(needs_sync/1 :: (qistate()) -> 'confirms' | 'other' | 'false').
 -spec(flush/1 :: (qistate()) -> qistate()).
 -spec(read/3 :: (seq_id(), seq_id(), qistate()) ->
                      {[{rabbit_types:msg_id(), seq_id(),
@@ -269,13 +270,16 @@ delete_and_terminate(State) ->
     State1.
 
 publish(MsgId, SeqId, MsgProps, IsPersistent,
-        State = #qistate { unsynced_msg_ids = UnsyncedMsgIds })
+        State = #qistate { unconfirmed = Unconfirmed })
   when is_binary(MsgId) ->
     ?MSG_ID_BYTES = size(MsgId),
     {JournalHdl, State1} =
         get_journal_handle(
-          State #qistate {
-            unsynced_msg_ids = gb_sets:add_element(MsgId, UnsyncedMsgIds) }),
+          case MsgProps#message_properties.needs_confirming of
+              true  -> Unconfirmed1 = gb_sets:add_element(MsgId, Unconfirmed),
+                       State #qistate { unconfirmed = Unconfirmed1 };
+              false -> State
+          end),
     ok = file_handle_cache:append(
            JournalHdl, [<<(case IsPersistent of
                                true  -> ?PUB_PERSIST_JPREFIX;
@@ -302,8 +306,14 @@ sync(State = #qistate { journal_handle = JournalHdl }) ->
 
 needs_sync(#qistate { journal_handle = undefined }) ->
     false;
-needs_sync(#qistate { journal_handle = JournalHdl }) ->
-    file_handle_cache:needs_sync(JournalHdl).
+needs_sync(#qistate { journal_handle = JournalHdl, unconfirmed = UC }) ->
+    case gb_sets:is_empty(UC) of
+        true  -> case file_handle_cache:needs_sync(JournalHdl) of
+                     true  -> other;
+                     false -> false
+                 end;
+        false -> confirms
+    end.
 
 flush(State = #qistate { dirty_count = 0 }) -> State;
 flush(State)                                -> flush_journal(State).
@@ -398,21 +408,21 @@ blank_state_dir(Dir) ->
                dirty_count         = 0,
                max_journal_entries = MaxJournal,
                on_sync             = fun (_) -> ok end,
-               unsynced_msg_ids    = gb_sets:new() }.
+               unconfirmed         = gb_sets:new() }.
 
-clean_file_name(Dir) -> filename:join(Dir, ?CLEAN_FILENAME).
+clean_filename(Dir) -> filename:join(Dir, ?CLEAN_FILENAME).
 
 detect_clean_shutdown(Dir) ->
-    case rabbit_file:delete(clean_file_name(Dir)) of
+    case rabbit_file:delete(clean_filename(Dir)) of
         ok              -> true;
         {error, enoent} -> false
     end.
 
 read_shutdown_terms(Dir) ->
-    rabbit_file:read_term_file(clean_file_name(Dir)).
+    rabbit_file:read_term_file(clean_filename(Dir)).
 
 store_clean_shutdown(Terms, Dir) ->
-    CleanFileName = clean_file_name(Dir),
+    CleanFileName = clean_filename(Dir),
     ok = rabbit_file:ensure_dir(CleanFileName),
     rabbit_file:write_term_file(CleanFileName, Terms).
 
@@ -537,7 +547,7 @@ queue_index_walker_reader(QueueName, Gatherer) ->
     State = blank_state(QueueName),
     ok = scan_segments(
            fun (_SeqId, MsgId, _MsgProps, true, _IsDelivered, no_ack, ok) ->
-                   gatherer:in(Gatherer, {MsgId, 1});
+                   gatherer:sync_in(Gatherer, {MsgId, 1});
                (_SeqId, _MsgId, _MsgProps, _IsPersistent, _IsDelivered,
                 _IsAcked, Acc) ->
                    Acc
@@ -607,19 +617,21 @@ add_to_journal(RelSeq, Action,
                                end};
 
 add_to_journal(RelSeq, Action, JEntries) ->
-    Val = case array:get(RelSeq, JEntries) of
-              undefined ->
-                  case Action of
-                      ?PUB -> {Action, no_del, no_ack};
-                      del  -> {no_pub,    del, no_ack};
-                      ack  -> {no_pub, no_del,    ack}
-                  end;
-              ({Pub, no_del, no_ack}) when Action == del ->
-                  {Pub, del, no_ack};
-              ({Pub,    Del, no_ack}) when Action == ack ->
-                  {Pub, Del,    ack}
-          end,
-    array:set(RelSeq, Val, JEntries).
+    case array:get(RelSeq, JEntries) of
+        undefined ->
+            array:set(RelSeq,
+                      case Action of
+                          ?PUB -> {Action, no_del, no_ack};
+                          del  -> {no_pub,    del, no_ack};
+                          ack  -> {no_pub, no_del,    ack}
+                      end, JEntries);
+        ({Pub,    no_del, no_ack}) when Action == del ->
+            array:set(RelSeq, {Pub,    del, no_ack}, JEntries);
+        ({no_pub,    del, no_ack}) when Action == ack ->
+            array:set(RelSeq, {no_pub, del,    ack}, JEntries);
+        ({?PUB,      del, no_ack}) when Action == ack ->
+            array:reset(RelSeq, JEntries)
+    end.
 
 maybe_flush_journal(State = #qistate { dirty_count = DCount,
                                        max_journal_entries = MaxJournal })
@@ -704,7 +716,11 @@ load_journal_entries(State = #qistate { journal_handle = Hdl }) ->
                     load_journal_entries(add_to_journal(SeqId, ack, State));
                 _ ->
                     case file_handle_cache:read(Hdl, ?PUB_RECORD_BODY_BYTES) of
-                        {ok, Bin} ->
+                        %% Journal entry composed only of zeroes was probably
+                        %% produced during a dirty shutdown so stop reading
+                        {ok, <<0:?PUB_RECORD_BODY_BYTES/unit:8>>} ->
+                            State;
+                        {ok, <<Bin:?PUB_RECORD_BODY_BYTES/binary>>} ->
                             {MsgId, MsgProps} = parse_pub_record_body(Bin),
                             IsPersistent = case Prefix of
                                                ?PUB_PERSIST_JPREFIX -> true;
@@ -732,9 +748,12 @@ deliver_or_ack(Kind, SeqIds, State) ->
                                             add_to_journal(SeqId, Kind, StateN)
                                     end, State1, SeqIds)).
 
-notify_sync(State = #qistate { unsynced_msg_ids = UG, on_sync = OnSyncFun }) ->
-    OnSyncFun(UG),
-    State #qistate { unsynced_msg_ids = gb_sets:new() }.
+notify_sync(State = #qistate { unconfirmed = UC, on_sync = OnSyncFun }) ->
+    case gb_sets:is_empty(UC) of
+        true  -> State;
+        false -> OnSyncFun(UC),
+                 State #qistate { unconfirmed = gb_sets:new() }
+    end.
 
 %%----------------------------------------------------------------------------
 %% segment manipulation
@@ -1008,7 +1027,18 @@ journal_minus_segment1({no_pub, del, ack},         {?PUB, no_del, no_ack}) ->
 journal_minus_segment1({no_pub, del, ack},         {?PUB, del, no_ack}) ->
     {{no_pub, no_del, ack}, 0};
 journal_minus_segment1({no_pub, del, ack},         {?PUB, del, ack}) ->
-    {undefined, -1}.
+    {undefined, -1};
+
+%% Missing segment. If flush_journal/1 is interrupted after deleting
+%% the segment but before truncating the journal we can get these
+%% cases: a delivery and an acknowledgement in the journal, or just an
+%% acknowledgement in the journal, but with no segment. In both cases
+%% we have really forgotten the message; so ignore what's in the
+%% journal.
+journal_minus_segment1({no_pub, no_del, ack},      undefined) ->
+    {undefined, 0};
+journal_minus_segment1({no_pub, del, ack},         undefined) ->
+    {undefined, 0}.
 
 %%----------------------------------------------------------------------------
 %% upgrade
@@ -1043,6 +1073,21 @@ add_queue_ttl_segment(<<?REL_SEQ_ONLY_PREFIX:?REL_SEQ_ONLY_PREFIX_BITS,
 add_queue_ttl_segment(_) ->
     stop.
 
+avoid_zeroes() ->
+    foreach_queue_index({none, fun avoid_zeroes_segment/1}).
+
+avoid_zeroes_segment(<<?PUB_PREFIX:?PUB_PREFIX_BITS,  IsPersistentNum:1,
+                       RelSeq:?REL_SEQ_BITS, MsgId:?MSG_ID_BITS,
+                       Expiry:?EXPIRY_BITS, Rest/binary>>) ->
+    {<<?PUB_PREFIX:?PUB_PREFIX_BITS, IsPersistentNum:1, RelSeq:?REL_SEQ_BITS,
+       MsgId:?MSG_ID_BITS, Expiry:?EXPIRY_BITS>>, Rest};
+avoid_zeroes_segment(<<0:?REL_SEQ_ONLY_PREFIX_BITS,
+                       RelSeq:?REL_SEQ_BITS, Rest/binary>>) ->
+    {<<?REL_SEQ_ONLY_PREFIX:?REL_SEQ_ONLY_PREFIX_BITS, RelSeq:?REL_SEQ_BITS>>,
+     Rest};
+avoid_zeroes_segment(_) ->
+    stop.
+
 %%----------------------------------------------------------------------------
 
 foreach_queue_index(Funs) ->
@@ -1067,7 +1112,9 @@ transform_queue(Dir, Gatherer, {JournalFun, SegmentFun}) ->
      || Seg <- rabbit_file:wildcard(".*\\" ++ ?SEGMENT_EXTENSION, Dir)],
     ok = gatherer:finish(Gatherer).
 
-transform_file(Path, Fun) ->
+transform_file(_Path, none) ->
+    ok;
+transform_file(Path, Fun) when is_function(Fun)->
     PathTmp = Path ++ ".upgrade",
     case rabbit_file:file_size(Path) of
         0    -> ok;
diff --git a/src/rabbit_reader.erl b/src/rabbit_reader.erl
index 6f12e22e..8106a46a 100644
--- a/src/rabbit_reader.erl
+++ b/src/rabbit_reader.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_reader).
@@ -23,7 +23,7 @@
 
 -export([system_continue/3, system_terminate/4, system_code_change/4]).
 
--export([init/4, mainloop/2]).
+-export([init/4, mainloop/2, recvloop/2]).
 
 -export([conserve_resources/3, server_properties/1]).
 
@@ -37,21 +37,27 @@
 
 -record(v1, {parent, sock, connection, callback, recv_len, pending_recv,
              connection_state, queue_collector, heartbeater, stats_timer,
-             channel_sup_sup_pid, start_heartbeat_fun, buf, buf_len,
-             auth_mechanism, auth_state, conserve_resources,
-             last_blocked_by, last_blocked_at}).
+             ch_sup3_pid, channel_sup_sup_pid, start_heartbeat_fun,
+             buf, buf_len, throttle}).
+
+-record(connection, {name, host, peer_host, port, peer_port,
+                     protocol, user, timeout_sec, frame_max, vhost,
+                     client_properties, capabilities,
+                     auth_mechanism, auth_state}).
+
+-record(throttle, {alarmed_by, last_blocked_by, last_blocked_at,
+                   blocked_sent}).
 
 -define(STATISTICS_KEYS, [pid, recv_oct, recv_cnt, send_oct, send_cnt,
                           send_pend, state, last_blocked_by, last_blocked_age,
                           channels]).
 
--define(CREATION_EVENT_KEYS, [pid, name, address, port, peer_address, peer_port,
-                              ssl, peer_cert_subject, peer_cert_issuer,
-                              peer_cert_validity, auth_mechanism,
-                              ssl_protocol, ssl_key_exchange,
-                              ssl_cipher, ssl_hash,
-                              protocol, user, vhost, timeout, frame_max,
-                              client_properties]).
+-define(CREATION_EVENT_KEYS,
+        [pid, name, port, peer_port, host,
+        peer_host, ssl, peer_cert_subject, peer_cert_issuer,
+        peer_cert_validity, auth_mechanism, ssl_protocol,
+        ssl_key_exchange, ssl_cipher, ssl_hash, protocol, user, vhost,
+        timeout, frame_max, client_properties]).
 
 -define(INFO_KEYS, ?CREATION_EVENT_KEYS ++ ?STATISTICS_KEYS -- [pid]).
 
@@ -60,6 +66,10 @@
          State#v1.connection_state =:= blocking orelse
          State#v1.connection_state =:= blocked)).
 
+-define(IS_STOPPING(State),
+        (State#v1.connection_state =:= closing orelse
+         State#v1.connection_state =:= closed)).
+
 %%--------------------------------------------------------------------------
 
 -ifdef(use_specs).
@@ -94,24 +104,24 @@
 
 %%--------------------------------------------------------------------------
 
-start_link(ChannelSupSupPid, Collector, StartHeartbeatFun) ->
-    {ok, proc_lib:spawn_link(?MODULE, init, [self(), ChannelSupSupPid,
+start_link(ChannelSup3Pid, Collector, StartHeartbeatFun) ->
+    {ok, proc_lib:spawn_link(?MODULE, init, [self(), ChannelSup3Pid,
                                              Collector, StartHeartbeatFun])}.
 
 shutdown(Pid, Explanation) ->
     gen_server:call(Pid, {shutdown, Explanation}, infinity).
 
-init(Parent, ChannelSupSupPid, Collector, StartHeartbeatFun) ->
+init(Parent, ChSup3Pid, Collector, StartHeartbeatFun) ->
     Deb = sys:debug_options([]),
     receive
         {go, Sock, SockTransform} ->
             start_connection(
-              Parent, ChannelSupSupPid, Collector, StartHeartbeatFun, Deb, Sock,
+              Parent, ChSup3Pid, Collector, StartHeartbeatFun, Deb, Sock,
               SockTransform)
     end.
 
 system_continue(Parent, Deb, State) ->
-    ?MODULE:mainloop(Deb, State#v1{parent = Parent}).
+    mainloop(Deb, State#v1{parent = Parent}).
 
 system_terminate(Reason, _Parent, _Deb, _State) ->
     exit(Reason).
@@ -133,8 +143,8 @@ info(Pid, Items) ->
 force_event_refresh(Pid) ->
     gen_server:cast(Pid, force_event_refresh).
 
-conserve_resources(Pid, _Source, Conserve) ->
-    Pid ! {conserve_resources, Conserve},
+conserve_resources(Pid, Source, Conserve) ->
+    Pid ! {conserve_resources, Source, Conserve},
     ok.
 
 server_properties(Protocol) ->
@@ -169,84 +179,110 @@ server_capabilities(rabbit_framing_amqp_0_9_1) ->
     [{<<"publisher_confirms">>,         bool, true},
      {<<"exchange_exchange_bindings">>, bool, true},
      {<<"basic.nack">>,                 bool, true},
-     {<<"consumer_cancel_notify">>,     bool, true}];
+     {<<"consumer_cancel_notify">>,     bool, true},
+     {<<"connection.blocked">>,         bool, true}];
 server_capabilities(_) ->
     [].
 
+%%--------------------------------------------------------------------------
+
 log(Level, Fmt, Args) -> rabbit_log:log(connection, Level, Fmt, Args).
 
+socket_error(Reason) ->    % error-level log: our pid, raw Reason, formatted Reason
+    log(error, "error on AMQP connection ~p: ~p (~s)~n",
+        [self(), Reason, rabbit_misc:format_inet_error(Reason)]).
+
 inet_op(F) -> rabbit_misc:throw_on_error(inet_error, F).
 
 socket_op(Sock, Fun) ->
     case Fun(Sock) of
         {ok, Res}       -> Res;
-        {error, Reason} -> log(error, "error on AMQP connection ~p: ~p~n",
-                               [self(), Reason]),
+        {error, Reason} -> socket_error(Reason),
+                           %% NB: this is the tcp socket, even in the ssl case
+                           rabbit_net:fast_close(Sock),
                            exit(normal)
     end.
 
-name(Sock) ->
-    socket_op(Sock, fun (S) -> rabbit_net:connection_string(S, inbound) end).
-
-start_connection(Parent, ChannelSupSupPid, Collector, StartHeartbeatFun, Deb,
+start_connection(Parent, ChSup3Pid, Collector, StartHeartbeatFun, Deb,
                  Sock, SockTransform) ->
     process_flag(trap_exit, true),
-    ConnStr = name(Sock),
-    log(info, "accepting AMQP connection ~p (~s)~n", [self(), ConnStr]),
+    Name = case rabbit_net:connection_string(Sock, inbound) of
+               {ok, Str}         -> Str;
+               {error, enotconn} -> rabbit_net:fast_close(Sock),
+                                    exit(normal);
+               {error, Reason}   -> socket_error(Reason),
+                                    rabbit_net:fast_close(Sock),
+                                    exit(normal)
+           end,
+    log(info, "accepting AMQP connection ~p (~s)~n", [self(), Name]),
     ClientSock = socket_op(Sock, SockTransform),
-    erlang:send_after(?HANDSHAKE_TIMEOUT * 1000, self(),
-                      handshake_timeout),
+    erlang:send_after(?HANDSHAKE_TIMEOUT * 1000, self(), handshake_timeout),
+    {PeerHost, PeerPort, Host, Port} =
+        socket_op(Sock, fun (S) -> rabbit_net:socket_ends(S, inbound) end),
     State = #v1{parent              = Parent,
                 sock                = ClientSock,
                 connection          = #connection{
+                  name               = list_to_binary(Name),
+                  host               = Host,
+                  peer_host          = PeerHost,
+                  port               = Port,
+                  peer_port          = PeerPort,
                   protocol           = none,
                   user               = none,
                   timeout_sec        = ?HANDSHAKE_TIMEOUT,
                   frame_max          = ?FRAME_MIN_SIZE,
                   vhost              = none,
                   client_properties  = none,
-                  capabilities       = []},
+                  capabilities       = [],
+                  auth_mechanism     = none,
+                  auth_state         = none},
                 callback            = uninitialized_callback,
                 recv_len            = 0,
                 pending_recv        = false,
                 connection_state    = pre_init,
                 queue_collector     = Collector,
                 heartbeater         = none,
-                channel_sup_sup_pid = ChannelSupSupPid,
+                ch_sup3_pid         = ChSup3Pid,
+                channel_sup_sup_pid = none,
                 start_heartbeat_fun = StartHeartbeatFun,
                 buf                 = [],
                 buf_len             = 0,
-                auth_mechanism      = none,
-                auth_state          = none,
-                conserve_resources  = false,
-                last_blocked_by     = none,
-                last_blocked_at     = never},
+                throttle            = #throttle{
+                                         alarmed_by      = [],
+                                         last_blocked_by = none,
+                                         last_blocked_at = never,
+                                         blocked_sent    = false}},
     try
-        ok = inet_op(fun () -> rabbit_net:tune_buffer_size(ClientSock) end),
-        recvloop(Deb, switch_callback(rabbit_event:init_stats_timer(
-                                       State, #v1.stats_timer),
-                                      handshake, 8)),
-        log(info, "closing AMQP connection ~p (~s)~n", [self(), ConnStr])
+        run({?MODULE, recvloop,
+             [Deb, switch_callback(rabbit_event:init_stats_timer(
+                                     State, #v1.stats_timer),
+                                   handshake, 8)]}),
+        log(info, "closing AMQP connection ~p (~s)~n", [self(), Name])
     catch
         Ex -> log(case Ex of
                       connection_closed_abruptly -> warning;
                       _                          -> error
                   end, "closing AMQP connection ~p (~s):~n~p~n",
-                  [self(), ConnStr, Ex])
+                  [self(), Name, Ex])
     after
-        %% The reader is the controlling process and hence its
-        %% termination will close the socket. Furthermore,
-        %% gen_tcp:close/1 waits for pending output to be sent, which
-        %% results in unnecessary delays. However, to keep the
-        %% file_handle_cache accounting as accurate as possible it
-        %% would be good to close the socket immediately if we
-        %% can. But we can only do this for non-ssl sockets.
-        %%
-        rabbit_net:maybe_fast_close(ClientSock),
+        %% We don't call gen_tcp:close/1 here since it waits for
+        %% pending output to be sent, which results in unnecessary
+        %% delays. We could just terminate - the reader is the
+        %% controlling process and hence its termination will close
+        %% the socket. However, to keep the file_handle_cache
+        %% accounting as accurate as possible we ought to close the
+        %% socket w/o delay before termination.
+        rabbit_net:fast_close(ClientSock),
+        rabbit_networking:unregister_connection(self()),
         rabbit_event:notify(connection_closed, [{pid, self()}])
     end,
     done.
 
+run({M, F, A}) ->                    % apply M:F(A), following thrown 'become's
+    try apply(M, F, A)
+    catch {become, MFA} -> run(MFA)  % thrown {become, MFA}: run that MFA instead
+    end.
+
 recvloop(Deb, State = #v1{pending_recv = true}) ->
     mainloop(Deb, State);
 recvloop(Deb, State = #v1{connection_state = blocked}) ->
@@ -276,18 +312,32 @@ mainloop(Deb, State = #v1{sock = Sock, buf = Buf, buf_len = BufLen}) ->
                 _      -> throw(connection_closed_abruptly)
             end;
         {inet_async, _Sock, _Ref, {error, Reason}} ->
+            maybe_emit_stats(State),
             throw({inet_error, Reason});
+        {system, From, Request} ->
+            sys:handle_system_msg(Request, From, State#v1.parent,
+                                  ?MODULE, Deb, State);
         Other ->
-            handle_other(Other, Deb, State)
+            case handle_other(Other, State) of
+                stop     -> ok;
+                NewState -> recvloop(Deb, NewState)
+            end
     end.
 
-handle_other({conserve_resources, Conserve}, Deb, State) ->
-    recvloop(Deb, control_throttle(State#v1{conserve_resources = Conserve}));
-handle_other({channel_closing, ChPid}, Deb, State) ->
+handle_other({conserve_resources, Source, Conserve},
+             State = #v1{throttle = Throttle =
+                             #throttle{alarmed_by = CR}}) ->
+    CR1 = case Conserve of
+              true  -> lists:usort([Source | CR]);
+              false -> CR -- [Source]
+          end,
+    Throttle1 = Throttle#throttle{alarmed_by = CR1},
+    control_throttle(State#v1{throttle = Throttle1});
+handle_other({channel_closing, ChPid}, State) ->
     ok = rabbit_channel:ready_for_close(ChPid),
     channel_cleanup(ChPid),
-    mainloop(Deb, maybe_close(control_throttle(State)));
-handle_other({'EXIT', Parent, Reason}, _Deb, State = #v1{parent = Parent}) ->
+    maybe_close(control_throttle(State));
+handle_other({'EXIT', Parent, Reason}, State = #v1{parent = Parent}) ->
     terminate(io_lib:format("broker forced connection closure "
                             "with reason '~w'", [Reason]), State),
     %% this is what we are expected to do according to
@@ -298,94 +348,133 @@ handle_other({'EXIT', Parent, Reason}, _Deb, State = #v1{parent = Parent}) ->
     %% ordinary error case. However, since this termination is
     %% initiated by our parent it is probably more important to exit
     %% quickly.
+    maybe_emit_stats(State),
     exit(Reason);
-handle_other({channel_exit, _Channel, E = {writer, send_failed, _Error}},
-             _Deb, _State) ->
+handle_other({channel_exit, _Channel, E = {writer, send_failed, _E}}, State) ->
+    maybe_emit_stats(State),
     throw(E);
-handle_other({channel_exit, Channel, Reason}, Deb, State) ->
-    mainloop(Deb, handle_exception(State, Channel, Reason));
-handle_other({'DOWN', _MRef, process, ChPid, Reason}, Deb, State) ->
-    mainloop(Deb, handle_dependent_exit(ChPid, Reason, State));
-handle_other(terminate_connection, _Deb, State) ->
+handle_other({channel_exit, Channel, Reason}, State) ->
+    handle_exception(State, Channel, Reason);
+handle_other({'DOWN', _MRef, process, ChPid, Reason}, State) ->
+    handle_dependent_exit(ChPid, Reason, State);
+handle_other(terminate_connection, State) ->
+    maybe_emit_stats(State),
+    stop;
+handle_other(handshake_timeout, State)
+  when ?IS_RUNNING(State) orelse ?IS_STOPPING(State) ->
     State;
-handle_other(handshake_timeout, Deb, State)
-  when ?IS_RUNNING(State) orelse
-       State#v1.connection_state =:= closing orelse
-       State#v1.connection_state =:= closed ->
-    mainloop(Deb, State);
-handle_other(handshake_timeout, _Deb, State) ->
+handle_other(handshake_timeout, State) ->
+    maybe_emit_stats(State),
     throw({handshake_timeout, State#v1.callback});
-handle_other(timeout, Deb, State = #v1{connection_state = closed}) ->
-    mainloop(Deb, State);
-handle_other(timeout, _Deb, #v1{connection_state = S}) ->
-    throw({timeout, S});
-handle_other({'$gen_call', From, {shutdown, Explanation}}, Deb, State) ->
+handle_other(heartbeat_timeout, State = #v1{connection_state = closed}) ->
+    State;
+handle_other(heartbeat_timeout, State = #v1{connection_state = S}) ->
+    maybe_emit_stats(State),
+    throw({heartbeat_timeout, S});
+handle_other({'$gen_call', From, {shutdown, Explanation}}, State) ->
     {ForceTermination, NewState} = terminate(Explanation, State),
     gen_server:reply(From, ok),
     case ForceTermination of
-        force  -> ok;
-        normal -> mainloop(Deb, NewState)
+        force  -> stop;
+        normal -> NewState
     end;
-handle_other({'$gen_call', From, info}, Deb, State) ->
+handle_other({'$gen_call', From, info}, State) ->
     gen_server:reply(From, infos(?INFO_KEYS, State)),
-    mainloop(Deb, State);
-handle_other({'$gen_call', From, {info, Items}}, Deb, State) ->
+    State;
+handle_other({'$gen_call', From, {info, Items}}, State) ->
     gen_server:reply(From, try {ok, infos(Items, State)}
                            catch Error -> {error, Error}
                            end),
-    mainloop(Deb, State);
-handle_other({'$gen_cast', force_event_refresh}, Deb, State)
+    State;
+handle_other({'$gen_cast', force_event_refresh}, State)
   when ?IS_RUNNING(State) ->
     rabbit_event:notify(connection_created,
                         [{type, network} | infos(?CREATION_EVENT_KEYS, State)]),
-    mainloop(Deb, State);
-handle_other({'$gen_cast', force_event_refresh}, Deb, State) ->
+    State;
+handle_other({'$gen_cast', force_event_refresh}, State) ->
     %% Ignore, we will emit a created event once we start running.
-    mainloop(Deb, State);
-handle_other(emit_stats, Deb, State) ->
-    mainloop(Deb, emit_stats(State));
-handle_other({system, From, Request}, Deb, State = #v1{parent = Parent}) ->
-    sys:handle_system_msg(Request, From, Parent, ?MODULE, Deb, State);
-handle_other({bump_credit, Msg}, Deb, State) ->
+    State;
+handle_other(ensure_stats, State) ->
+    ensure_stats_timer(State);
+handle_other(emit_stats, State) ->
+    emit_stats(State);
+handle_other({bump_credit, Msg}, State) ->
     credit_flow:handle_bump_msg(Msg),
-    recvloop(Deb, control_throttle(State));
-handle_other(Other, _Deb, _State) ->
+    control_throttle(State);
+handle_other(Other, State) ->
     %% internal error -> something worth dying for
+    maybe_emit_stats(State),
     exit({unexpected_message, Other}).
 
 switch_callback(State, Callback, Length) ->
     State#v1{callback = Callback, recv_len = Length}.
 
 terminate(Explanation, State) when ?IS_RUNNING(State) ->
-    {normal, send_exception(State, 0,
-                            rabbit_misc:amqp_error(
-                              connection_forced, Explanation, [], none))};
+    {normal, handle_exception(State, 0,
+                              rabbit_misc:amqp_error(
+                                connection_forced, Explanation, [], none))};
 terminate(_Explanation, State) ->
     {force, State}.
 
-control_throttle(State = #v1{connection_state   = CS,
-                             conserve_resources = Mem}) ->
-    case {CS, Mem orelse credit_flow:blocked()} of
+control_throttle(State = #v1{connection_state = CS, throttle = Throttle}) ->
+    IsThrottled = ((Throttle#throttle.alarmed_by =/= []) orelse
+               credit_flow:blocked()),
+    case {CS, IsThrottled} of
         {running,   true} -> State#v1{connection_state = blocking};
         {blocking, false} -> State#v1{connection_state = running};
         {blocked,  false} -> ok = rabbit_heartbeat:resume_monitor(
                                     State#v1.heartbeater),
-                             State#v1{connection_state = running};
-        {blocked,   true} -> update_last_blocked_by(State);
+                             maybe_send_unblocked(State),
+                             State#v1{connection_state = running,
+                                      throttle = Throttle#throttle{
+                                                   blocked_sent = false}};
+        {blocked,   true} -> State#v1{throttle = update_last_blocked_by(
+                                                   Throttle)};
         {_,            _} -> State
     end.
 
-maybe_block(State = #v1{connection_state = blocking}) ->
+maybe_block(State = #v1{connection_state = blocking,
+                        throttle         = Throttle}) ->
     ok = rabbit_heartbeat:pause_monitor(State#v1.heartbeater),
-    update_last_blocked_by(State#v1{connection_state = blocked,
-                                    last_blocked_at  = erlang:now()});
+    Sent = maybe_send_blocked(State),
+    State#v1{connection_state = blocked,
+             throttle = update_last_blocked_by(
+                          Throttle#throttle{last_blocked_at = erlang:now(),
+                                            blocked_sent    = Sent})};
 maybe_block(State) ->
     State.
 
-update_last_blocked_by(State = #v1{conserve_resources = true}) ->
-    State#v1{last_blocked_by = resource};
-update_last_blocked_by(State = #v1{conserve_resources = false}) ->
-    State#v1{last_blocked_by = flow}.
+maybe_send_blocked(#v1{throttle = #throttle{alarmed_by = []}}) ->
+    false;                    % no alarm sources recorded: nothing is sent
+maybe_send_blocked(#v1{throttle   = #throttle{alarmed_by = CR},
+                       connection = #connection{
+                                       protocol     = Protocol,
+                                       capabilities = Capabilities},
+                       sock       = Sock}) ->
+    case rabbit_misc:table_lookup(Capabilities, <<"connection.blocked">>) of
+        {bool, true} ->       % capability is present and set to true
+            RStr = string:join([atom_to_list(A) || A <- CR], " & "),
+            Reason = list_to_binary(rabbit_misc:format("low on ~s", [RStr])),
+            ok = send_on_channel0(Sock, #'connection.blocked'{reason = Reason},
+                                  Protocol),
+            true;             % connection.blocked was sent
+        _ ->                  % capability absent or not {bool, true}
+            false
+    end.
+
+maybe_send_unblocked(#v1{throttle = #throttle{blocked_sent = false}}) ->
+    ok;                       % blocked_sent is false: nothing to send
+maybe_send_unblocked(#v1{connection = #connection{protocol = Protocol},
+                         sock       = Sock}) ->    % otherwise send unblocked
+    ok = send_on_channel0(Sock, #'connection.unblocked'{}, Protocol).
+
+update_last_blocked_by(Throttle = #throttle{alarmed_by = []}) ->
+    Throttle#throttle{last_blocked_by = flow};       % alarmed_by is empty
+update_last_blocked_by(Throttle) ->
+    Throttle#throttle{last_blocked_by = resource}.   % alarmed_by is not []
+
+%%--------------------------------------------------------------------------
+%% error handling / termination
 
 close_connection(State = #v1{queue_collector = Collector,
                              connection = #connection{
@@ -405,29 +494,15 @@ close_connection(State = #v1{queue_collector = Collector,
 
 handle_dependent_exit(ChPid, Reason, State) ->
     case {channel_cleanup(ChPid), termination_kind(Reason)} of
-        {undefined, uncontrolled} ->
-            exit({abnormal_dependent_exit, ChPid, Reason});
-        {_Channel, controlled} ->
-            maybe_close(control_throttle(State));
-        {Channel, uncontrolled} ->
-            log(error, "AMQP connection ~p, channel ~p - error:~n~p~n",
-                [self(), Channel, Reason]),
-            maybe_close(handle_exception(control_throttle(State),
-                                         Channel, Reason))
-    end.
-
-channel_cleanup(ChPid) ->
-    case get({ch_pid, ChPid}) of
-        undefined       -> undefined;
-        {Channel, MRef} -> credit_flow:peer_down(ChPid),
-                           erase({channel, Channel}),
-                           erase({ch_pid, ChPid}),
-                           erlang:demonitor(MRef, [flush]),
-                           Channel
+        {undefined,   controlled} -> State;
+        {undefined, uncontrolled} -> exit({abnormal_dependent_exit,
+                                           ChPid, Reason});
+        {_Channel,    controlled} -> maybe_close(control_throttle(State));
+        {Channel,   uncontrolled} -> State1 = handle_exception(
+                                                State, Channel, Reason),
+                                     maybe_close(control_throttle(State1))
     end.
 
-all_channels() -> [ChPid || {{ch_pid, ChPid}, _ChannelMRef} <- get()].
-
 terminate_channels() ->
     NChannels =
         length([rabbit_channel:shutdown(ChPid) || ChPid <- all_channels()]),
@@ -481,78 +556,179 @@ maybe_close(State) ->
 termination_kind(normal) -> controlled;
 termination_kind(_)      -> uncontrolled.
 
+handle_exception(State = #v1{connection_state = closed}, Channel, Reason) ->
+    log(error, "AMQP connection ~p (~p), channel ~p - error:~n~p~n",
+        [self(), closed, Channel, Reason]),
+    State;                         % already closed: log only, state unchanged
+handle_exception(State = #v1{connection = #connection{protocol = Protocol},
+                             connection_state = CS},
+                 Channel, Reason)
+  when ?IS_RUNNING(State) orelse CS =:= closing ->
+    log(error, "AMQP connection ~p (~p), channel ~p - error:~n~p~n",
+        [self(), CS, Channel, Reason]),
+    {0, CloseMethod} =             % badmatch unless the first element is 0
+        rabbit_binary_generator:map_exception(Channel, Reason, Protocol),
+    terminate_channels(),
+    State1 = close_connection(State),
+    ok = send_on_channel0(State1#v1.sock, CloseMethod, Protocol),
+    State1;                        % state as returned by close_connection/1
+handle_exception(State, Channel, Reason) ->
+    %% We don't trust the client at this point - force them to wait
+    %% for a bit so they can't DOS us with repeated failed logins etc.
+    timer:sleep(?SILENT_CLOSE_DELAY * 1000),
+    throw({handshake_error, State#v1.connection_state, Channel, Reason}).
+
+%% we've "lost sync" with the client and hence must not accept any
+%% more input
+fatal_frame_error(Error, Type, Channel, Payload, State) ->
+    frame_error(Error, Type, Channel, Payload, State),  % result is discarded
+    %% grace period to allow transmission of error
+    timer:sleep(?SILENT_CLOSE_DELAY * 1000),
+    throw(fatal_frame_error).      % always ends in a throw; no state is returned
+
+frame_error(Error, Type, Channel, Payload, State) ->
+    {Str, Bin} = payload_snippet(Payload),    % label plus at most 16 octets
+    handle_exception(State, Channel,
+                     rabbit_misc:amqp_error(frame_error,
+                                            "type ~p, ~s octets = ~p: ~p",
+                                            [Type, Str, Bin, Error], none)).
+
+unexpected_frame(Type, Channel, Payload, State) ->
+    {Str, Bin} = payload_snippet(Payload),    % label plus at most 16 octets
+    handle_exception(State, Channel,
+                     rabbit_misc:amqp_error(unexpected_frame,
+                                            "type ~p, ~s octets = ~p",
+                                            [Type, Str, Bin], none)).
+
+payload_snippet(Payload) when size(Payload) =< 16 ->  % size =< 16: keep it all
+    {"all", Payload};
+payload_snippet(<<Snippet:16/binary, _/binary>>) ->   % else the first 16 bytes
+    {"first 16", Snippet}.
+
+%%--------------------------------------------------------------------------
+
+create_channel(Channel, State) ->
+    #v1{sock = Sock, queue_collector = Collector,
+        channel_sup_sup_pid = ChanSupSup,
+        connection = #connection{name         = Name,
+                                 protocol     = Protocol,
+                                 frame_max    = FrameMax,
+                                 user         = User,
+                                 vhost        = VHost,
+                                 capabilities = Capabilities}} = State,
+    {ok, _ChSupPid, {ChPid, AState}} =
+        rabbit_channel_sup_sup:start_channel(
+          ChanSupSup, {tcp, Sock, Channel, FrameMax, self(), Name,
+                       Protocol, User, VHost, Capabilities, Collector}),
+    MRef = erlang:monitor(process, ChPid),
+    put({ch_pid, ChPid}, {Channel, MRef}),      % lookup by pid
+    put({channel, Channel}, {ChPid, AState}),   % lookup by channel
+    {ChPid, AState}.
+
+channel_cleanup(ChPid) ->
+    case get({ch_pid, ChPid}) of
+        undefined       -> undefined;    % no entry for this pid: nothing to undo
+        {Channel, MRef} -> credit_flow:peer_down(ChPid),
+                           erase({channel, Channel}),
+                           erase({ch_pid, ChPid}),
+                           erlang:demonitor(MRef, [flush]),
+                           Channel       % the Channel stored under {ch_pid, ChPid}
+    end.
+
+all_channels() -> [ChPid || {{ch_pid, ChPid}, _ChannelMRef} <- get()].
+
+%%--------------------------------------------------------------------------
+
 handle_frame(Type, 0, Payload,
-             State = #v1{connection_state = CS,
-                         connection = #connection{protocol = Protocol}})
-  when CS =:= closing; CS =:= closed ->
+             State = #v1{connection = #connection{protocol = Protocol}})
+  when ?IS_STOPPING(State) ->
     case rabbit_command_assembler:analyze_frame(Type, Payload, Protocol) of
         {method, MethodName, FieldsBin} ->
             handle_method0(MethodName, FieldsBin, State);
         _Other -> State
     end;
-handle_frame(_Type, _Channel, _Payload, State = #v1{connection_state = CS})
-  when CS =:= closing; CS =:= closed ->
-    State;
 handle_frame(Type, 0, Payload,
              State = #v1{connection = #connection{protocol = Protocol}}) ->
     case rabbit_command_assembler:analyze_frame(Type, Payload, Protocol) of
-        error     -> throw({unknown_frame, 0, Type, Payload});
+        error     -> frame_error(unknown_frame, Type, 0, Payload, State);
         heartbeat -> State;
         {method, MethodName, FieldsBin} ->
             handle_method0(MethodName, FieldsBin, State);
-        Other -> throw({unexpected_frame_on_channel0, Other})
+        _Other    -> unexpected_frame(Type, 0, Payload, State)
     end;
 handle_frame(Type, Channel, Payload,
-             State = #v1{connection = #connection{protocol = Protocol}}) ->
+             State = #v1{connection = #connection{protocol = Protocol}})
+  when ?IS_RUNNING(State) ->
     case rabbit_command_assembler:analyze_frame(Type, Payload, Protocol) of
-        error         -> throw({unknown_frame, Channel, Type, Payload});
-        heartbeat     -> throw({unexpected_heartbeat_frame, Channel});
-        AnalyzedFrame -> process_frame(AnalyzedFrame, Channel, State)
-    end.
+        error     -> frame_error(unknown_frame, Type, Channel, Payload, State);
+        heartbeat -> unexpected_frame(Type, Channel, Payload, State);
+        Frame     -> process_frame(Frame, Channel, State)
+    end;
+handle_frame(_Type, _Channel, _Payload, State) when ?IS_STOPPING(State) ->
+    State;
+handle_frame(Type, Channel, Payload, State) ->
+    unexpected_frame(Type, Channel, Payload, State).
 
 process_frame(Frame, Channel, State) ->
-    case get({channel, Channel}) of
-        {ChPid, AState} ->
-            case process_channel_frame(Frame,  ChPid, AState) of
-                {ok, NewAState} -> put({channel, Channel}, {ChPid, NewAState}),
-                                   post_process_frame(Frame, ChPid, State);
-                {error, Reason} -> handle_exception(State, Channel, Reason)
-            end;
-        undefined when ?IS_RUNNING(State) ->
-            ok = create_channel(Channel, State),
-            process_frame(Frame, Channel, State);
-        undefined ->
-            throw({channel_frame_while_starting,
-                   Channel, State#v1.connection_state, Frame})
+    ChKey = {channel, Channel},
+    {ChPid, AState} = case get(ChKey) of
+                          undefined -> create_channel(Channel, State);
+                          Other     -> Other
+                      end,
+    case rabbit_command_assembler:process(Frame, AState) of
+        {ok, NewAState} ->
+            put(ChKey, {ChPid, NewAState}),
+            post_process_frame(Frame, ChPid, State);
+        {ok, Method, NewAState} ->
+            rabbit_channel:do(ChPid, Method),
+            put(ChKey, {ChPid, NewAState}),
+            post_process_frame(Frame, ChPid, State);
+        {ok, Method, Content, NewAState} ->
+            rabbit_channel:do_flow(ChPid, Method, Content),
+            put(ChKey, {ChPid, NewAState}),
+            post_process_frame(Frame, ChPid, control_throttle(State));
+        {error, Reason} ->
+            handle_exception(State, Channel, Reason)
     end.
 
 post_process_frame({method, 'channel.close_ok', _}, ChPid, State) ->
     channel_cleanup(ChPid),
+    %% This is not strictly necessary, but more obviously
+    %% correct. Also note that we do not need to call maybe_close/1
+    %% since we cannot possibly be in the 'closing' state.
     control_throttle(State);
-post_process_frame({method, MethodName, _}, _ChPid,
-                   State = #v1{connection = #connection{
-                                 protocol = Protocol}}) ->
-    case Protocol:method_has_content(MethodName) of
-        true  -> erlang:bump_reductions(2000),
-                 maybe_block(control_throttle(State));
-        false -> control_throttle(State)
-    end;
+post_process_frame({content_header, _, _, _, _}, _ChPid, State) ->
+    maybe_block(State);
+post_process_frame({content_body, _}, _ChPid, State) ->
+    maybe_block(State);
 post_process_frame(_Frame, _ChPid, State) ->
-    control_throttle(State).
+    State.
+
+%%--------------------------------------------------------------------------
 
+%% We allow clients to exceed the frame size a little bit since quite
+%% a few get it wrong - off-by 1 or 8 (empty frame size) are typical.
+-define(FRAME_SIZE_FUDGE, ?EMPTY_FRAME_SIZE).
+
+handle_input(frame_header, <<Type:8,Channel:16,PayloadSize:32>>,
+             State = #v1{connection = #connection{frame_max = FrameMax}})
+  when FrameMax /= 0 andalso
+       PayloadSize > FrameMax - ?EMPTY_FRAME_SIZE + ?FRAME_SIZE_FUDGE ->
+    fatal_frame_error(
+      {frame_too_large, PayloadSize, FrameMax - ?EMPTY_FRAME_SIZE},
+      Type, Channel, <<>>, State);
 handle_input(frame_header, <<Type:8,Channel:16,PayloadSize:32>>, State) ->
     ensure_stats_timer(
       switch_callback(State, {frame_payload, Type, Channel, PayloadSize},
                       PayloadSize + 1));
 
-handle_input({frame_payload, Type, Channel, PayloadSize},
-             PayloadAndMarker, State) ->
-    case PayloadAndMarker of
-        <<Payload:PayloadSize/binary, ?FRAME_END>> ->
-            switch_callback(handle_frame(Type, Channel, Payload, State),
-                            frame_header, 7);
-        _ ->
-            throw({bad_payload, Type, Channel, PayloadSize, PayloadAndMarker})
+handle_input({frame_payload, Type, Channel, PayloadSize}, Data, State) ->
+    <<Payload:PayloadSize/binary, EndMarker>> = Data,
+    case EndMarker of
+        ?FRAME_END -> State1 = handle_frame(Type, Channel, Payload, State),
+                      switch_callback(State1, frame_header, 7);
+        _          -> fatal_frame_error({invalid_frame_end_marker, EndMarker},
+                                        Type, Channel, Payload, State)
     end;
 
 %% The two rules pertaining to version negotiation:
@@ -582,8 +758,12 @@ handle_input(handshake, <<"AMQP", 1, 1, 8, 0>>, State) ->
 handle_input(handshake, <<"AMQP", 1, 1, 9, 1>>, State) ->
     start_connection({8, 0, 0}, rabbit_framing_amqp_0_8, State);
 
+%% ... and finally, the 1.0 spec is crystal clear!
+handle_input(handshake, <<"AMQP", Id, 1, 0, 0>>, State) ->
+    become_1_0(Id, State);
+
 handle_input(handshake, <<"AMQP", A, B, C, D>>, #v1{sock = Sock}) ->
-    refuse_connection(Sock, {bad_version, A, B, C, D});
+    refuse_connection(Sock, {bad_version, {A, B, C, D}});
 
 handle_input(handshake, Other, #v1{sock = Sock}) ->
     refuse_connection(Sock, {bad_header, Other});
@@ -597,6 +777,7 @@ handle_input(Callback, Data, _State) ->
 start_connection({ProtocolMajor, ProtocolMinor, _ProtocolRevision},
                  Protocol,
                  State = #v1{sock = Sock, connection = Connection}) ->
+    rabbit_networking:register_connection(self()),
     Start = #'connection.start'{
       version_major = ProtocolMajor,
       version_minor = ProtocolMinor,
@@ -610,10 +791,16 @@ start_connection({ProtocolMajor, ProtocolMinor, _ProtocolRevision},
                              connection_state = starting},
                     frame_header, 7).
 
-refuse_connection(Sock, Exception) ->
-    ok = inet_op(fun () -> rabbit_net:send(Sock, <<"AMQP",0,0,9,1>>) end),
+refuse_connection(Sock, Exception, {A, B, C, D}) ->
+    ok = inet_op(fun () -> rabbit_net:send(Sock, <<"AMQP",A,B,C,D>>) end),
     throw(Exception).
 
+-ifdef(use_specs).
+-spec(refuse_connection/2 :: (rabbit_net:socket(), any()) -> no_return()).
+-endif.
+refuse_connection(Sock, Exception) ->
+    refuse_connection(Sock, Exception, {0, 0, 9, 1}).
+
 ensure_stats_timer(State = #v1{connection_state = running}) ->
     rabbit_event:ensure_stats_timer(State, #v1.stats_timer, emit_stats);
 ensure_stats_timer(State) ->
@@ -623,24 +810,14 @@ ensure_stats_timer(State) ->
 
 handle_method0(MethodName, FieldsBin,
                State = #v1{connection = #connection{protocol = Protocol}}) ->
-    HandleException =
-        fun(R) ->
-                case ?IS_RUNNING(State) of
-                    true  -> send_exception(State, 0, R);
-                    %% We don't trust the client at this point - force
-                    %% them to wait for a bit so they can't DOS us with
-                    %% repeated failed logins etc.
-                    false -> timer:sleep(?SILENT_CLOSE_DELAY * 1000),
-                             throw({channel0_error, State#v1.connection_state, R})
-                end
-        end,
     try
         handle_method0(Protocol:decode_method_fields(MethodName, FieldsBin),
                        State)
     catch exit:#amqp_error{method = none} = Reason ->
-            HandleException(Reason#amqp_error{method = MethodName});
+            handle_exception(State, 0, Reason#amqp_error{method = MethodName});
           Type:Reason ->
-            HandleException({Type, Reason, MethodName, erlang:get_stacktrace()})
+            Stack = erlang:get_stacktrace(),
+            handle_exception(State, 0, {Type, Reason, MethodName, Stack})
     end.
 
 handle_method0(#'connection.start_ok'{mechanism = Mechanism,
@@ -655,13 +832,13 @@ handle_method0(#'connection.start_ok'{mechanism = Mechanism,
             {table, Capabilities1} -> Capabilities1;
             _                      -> []
         end,
-    State = State0#v1{auth_mechanism   = AuthMechanism,
-                      auth_state       = AuthMechanism:init(Sock),
-                      connection_state = securing,
+    State = State0#v1{connection_state = securing,
                       connection       =
                           Connection#connection{
                             client_properties = ClientProperties,
-                            capabilities      = Capabilities}},
+                            capabilities      = Capabilities,
+                            auth_mechanism    = {Mechanism, AuthMechanism},
+                            auth_state        = AuthMechanism:init(Sock)}},
     auth_phase(Response, State);
 
 handle_method0(#'connection.secure_ok'{response = Response},
@@ -687,7 +864,7 @@ handle_method0(#'connection.tune_ok'{frame_max = FrameMax,
             Frame = rabbit_binary_generator:build_heartbeat_frame(),
             SendFun = fun() -> catch rabbit_net:send(Sock, Frame) end,
             Parent = self(),
-            ReceiveFun = fun() -> Parent ! timeout end,
+            ReceiveFun = fun() -> Parent ! heartbeat_timeout end,
             Heartbeater = SHF(Sock, ClientHeartbeat, SendFun,
                               ClientHeartbeat, ReceiveFun),
             State#v1{connection_state = opening,
@@ -699,32 +876,39 @@ handle_method0(#'connection.tune_ok'{frame_max = FrameMax,
 
 handle_method0(#'connection.open'{virtual_host = VHostPath},
                State = #v1{connection_state = opening,
-                           connection = Connection = #connection{
-                                          user = User,
-                                          protocol = Protocol},
-                           sock = Sock}) ->
+                           connection       = Connection = #connection{
+                                                user = User,
+                                                protocol = Protocol},
+                           ch_sup3_pid      = ChSup3Pid,
+                           sock             = Sock,
+                           throttle         = Throttle}) ->
     ok = rabbit_access_control:check_vhost_access(User, VHostPath),
     NewConnection = Connection#connection{vhost = VHostPath},
     ok = send_on_channel0(Sock, #'connection.open_ok'{}, Protocol),
     Conserve = rabbit_alarm:register(self(), {?MODULE, conserve_resources, []}),
+    Throttle1 = Throttle#throttle{alarmed_by = Conserve},
+    {ok, ChannelSupSupPid} =
+        supervisor2:start_child(
+          ChSup3Pid,
+          {channel_sup_sup, {rabbit_channel_sup_sup, start_link, []},
+           intrinsic, infinity, supervisor, [rabbit_channel_sup_sup]}),
     State1 = control_throttle(
-               State#v1{connection_state   = running,
-                        connection         = NewConnection,
-                        conserve_resources = Conserve}),
+               State#v1{connection_state    = running,
+                        connection          = NewConnection,
+                        channel_sup_sup_pid = ChannelSupSupPid,
+                        throttle            = Throttle1}),
     rabbit_event:notify(connection_created,
                         [{type, network} |
                          infos(?CREATION_EVENT_KEYS, State1)]),
-    rabbit_event:if_enabled(State1, #v1.stats_timer,
-                            fun() -> emit_stats(State1) end),
+    maybe_emit_stats(State1),
     State1;
 handle_method0(#'connection.close'{}, State) when ?IS_RUNNING(State) ->
     lists:foreach(fun rabbit_channel:shutdown/1, all_channels()),
     maybe_close(State#v1{connection_state = closing});
 handle_method0(#'connection.close'{},
-               State = #v1{connection_state = CS,
-                           connection = #connection{protocol = Protocol},
+               State = #v1{connection = #connection{protocol = Protocol},
                            sock = Sock})
-  when CS =:= closing; CS =:= closed ->
+  when ?IS_STOPPING(State) ->
     %% We're already closed or closing, so we don't need to cleanup
     %% anything.
     ok = send_on_channel0(Sock, #'connection.close_ok'{}, Protocol),
@@ -733,19 +917,20 @@ handle_method0(#'connection.close_ok'{},
                State = #v1{connection_state = closed}) ->
     self() ! terminate_connection,
     State;
-handle_method0(_Method, State = #v1{connection_state = CS})
-  when CS =:= closing; CS =:= closed ->
+handle_method0(_Method, State) when ?IS_STOPPING(State) ->
     State;
 handle_method0(_Method, #v1{connection_state = S}) ->
     rabbit_misc:protocol_error(
       channel_error, "unexpected method in connection state ~w", [S]).
 
-%% Compute frame_max for this instance. Could simply use 0, but breaks
-%% QPid Java client.
 server_frame_max() ->
     {ok, FrameMax} = application:get_env(rabbit, frame_max),
     FrameMax.
 
+server_heartbeat() ->
+    {ok, Heartbeat} = application:get_env(rabbit, heartbeat),
+    Heartbeat.
+
 send_on_channel0(Sock, Method, Protocol) ->
     ok = rabbit_writer:internal_send_command(Sock, 0, Method, Protocol).
 
@@ -777,115 +962,86 @@ auth_mechanisms_binary(Sock) ->
       string:join([atom_to_list(A) || A <- auth_mechanisms(Sock)], " ")).
 
 auth_phase(Response,
-           State = #v1{auth_mechanism = AuthMechanism,
-                       auth_state = AuthState,
-                       connection = Connection =
-                           #connection{protocol = Protocol},
+           State = #v1{connection = Connection =
+                           #connection{protocol       = Protocol,
+                                       auth_mechanism = {Name, AuthMechanism},
+                                       auth_state     = AuthState},
                        sock = Sock}) ->
     case AuthMechanism:handle_response(Response, AuthState) of
         {refused, Msg, Args} ->
             rabbit_misc:protocol_error(
               access_refused, "~s login refused: ~s",
-              [proplists:get_value(name, AuthMechanism:description()),
-               io_lib:format(Msg, Args)]);
+              [Name, io_lib:format(Msg, Args)]);
         {protocol_error, Msg, Args} ->
             rabbit_misc:protocol_error(syntax_error, Msg, Args);
         {challenge, Challenge, AuthState1} ->
             Secure = #'connection.secure'{challenge = Challenge},
             ok = send_on_channel0(Sock, Secure, Protocol),
-            State#v1{auth_state = AuthState1};
+            State#v1{connection = Connection#connection{
+                                    auth_state = AuthState1}};
         {ok, User} ->
             Tune = #'connection.tune'{channel_max = 0,
                                       frame_max = server_frame_max(),
-                                      heartbeat = 0},
+                                      heartbeat = server_heartbeat()},
             ok = send_on_channel0(Sock, Tune, Protocol),
             State#v1{connection_state = tuning,
-                     connection = Connection#connection{user = User}}
+                     connection = Connection#connection{user       = User,
+                                                        auth_state = none}}
     end.
 
 %%--------------------------------------------------------------------------
 
 infos(Items, State) -> [{Item, i(Item, State)} || Item <- Items].
 
-i(pid, #v1{}) ->
-    self();
-i(name, #v1{sock = Sock}) ->
-    list_to_binary(name(Sock));
-i(address, #v1{sock = Sock}) ->
-    socket_info(fun rabbit_net:sockname/1, fun ({A, _}) -> A end, Sock);
-i(port, #v1{sock = Sock}) ->
-    socket_info(fun rabbit_net:sockname/1, fun ({_, P}) -> P end, Sock);
-i(peer_address, #v1{sock = Sock}) ->
-    socket_info(fun rabbit_net:peername/1, fun ({A, _}) -> A end, Sock);
-i(peer_port, #v1{sock = Sock}) ->
-    socket_info(fun rabbit_net:peername/1, fun ({_, P}) -> P end, Sock);
-i(ssl, #v1{sock = Sock}) ->
-    rabbit_net:is_ssl(Sock);
-i(ssl_protocol, #v1{sock = Sock}) ->
-    ssl_info(fun ({P, _}) -> P end, Sock);
-i(ssl_key_exchange, #v1{sock = Sock}) ->
-    ssl_info(fun ({_, {K, _, _}}) -> K end, Sock);
-i(ssl_cipher, #v1{sock = Sock}) ->
-    ssl_info(fun ({_, {_, C, _}}) -> C end, Sock);
-i(ssl_hash, #v1{sock = Sock}) ->
-    ssl_info(fun ({_, {_, _, H}}) -> H end, Sock);
-i(peer_cert_issuer, #v1{sock = Sock}) ->
-    cert_info(fun rabbit_ssl:peer_cert_issuer/1, Sock);
-i(peer_cert_subject, #v1{sock = Sock}) ->
-    cert_info(fun rabbit_ssl:peer_cert_subject/1, Sock);
-i(peer_cert_validity, #v1{sock = Sock}) ->
-    cert_info(fun rabbit_ssl:peer_cert_validity/1, Sock);
-i(SockStat, #v1{sock = Sock}) when SockStat =:= recv_oct;
-                                   SockStat =:= recv_cnt;
-                                   SockStat =:= send_oct;
-                                   SockStat =:= send_cnt;
-                                   SockStat =:= send_pend ->
-    socket_info(fun () -> rabbit_net:getstat(Sock, [SockStat]) end,
-                fun ([{_, I}]) -> I end);
-i(state, #v1{connection_state = S}) ->
-    S;
-i(last_blocked_by, #v1{last_blocked_by = By}) ->
-    By;
-i(last_blocked_age, #v1{last_blocked_at = never}) ->
+i(pid,                #v1{}) -> self();
+i(SockStat,           S) when SockStat =:= recv_oct;
+                              SockStat =:= recv_cnt;
+                              SockStat =:= send_oct;
+                              SockStat =:= send_cnt;
+                              SockStat =:= send_pend ->
+    socket_info(fun (Sock) -> rabbit_net:getstat(Sock, [SockStat]) end,
+                fun ([{_, I}]) -> I end, S);
+i(ssl,                #v1{sock = Sock}) -> rabbit_net:is_ssl(Sock);
+i(ssl_protocol,       S) -> ssl_info(fun ({P,         _}) -> P end, S);
+i(ssl_key_exchange,   S) -> ssl_info(fun ({_, {K, _, _}}) -> K end, S);
+i(ssl_cipher,         S) -> ssl_info(fun ({_, {_, C, _}}) -> C end, S);
+i(ssl_hash,           S) -> ssl_info(fun ({_, {_, _, H}}) -> H end, S);
+i(peer_cert_issuer,   S) -> cert_info(fun rabbit_ssl:peer_cert_issuer/1,   S);
+i(peer_cert_subject,  S) -> cert_info(fun rabbit_ssl:peer_cert_subject/1,  S);
+i(peer_cert_validity, S) -> cert_info(fun rabbit_ssl:peer_cert_validity/1, S);
+i(state,              #v1{connection_state = CS}) -> CS;
+i(last_blocked_by,    #v1{throttle = #throttle{last_blocked_by = By}}) -> By;
+i(last_blocked_age,   #v1{throttle = #throttle{last_blocked_at = never}}) ->
     infinity;
-i(last_blocked_age, #v1{last_blocked_at = T}) ->
+i(last_blocked_age,   #v1{throttle = #throttle{last_blocked_at = T}}) ->
     timer:now_diff(erlang:now(), T) / 1000000;
-i(channels, #v1{}) ->
-    length(all_channels());
-i(protocol, #v1{connection = #connection{protocol = none}}) ->
-    none;
-i(protocol, #v1{connection = #connection{protocol = Protocol}}) ->
-    Protocol:version();
-i(auth_mechanism, #v1{auth_mechanism = none}) ->
-    none;
-i(auth_mechanism, #v1{auth_mechanism = Mechanism}) ->
-    proplists:get_value(name, Mechanism:description());
-i(user, #v1{connection = #connection{user = #user{username = Username}}}) ->
-    Username;
-i(user, #v1{connection = #connection{user = none}}) ->
-    '';
-i(vhost, #v1{connection = #connection{vhost = VHost}}) ->
-    VHost;
-i(timeout, #v1{connection = #connection{timeout_sec = Timeout}}) ->
-    Timeout;
-i(frame_max, #v1{connection = #connection{frame_max = FrameMax}}) ->
-    FrameMax;
-i(client_properties, #v1{connection = #connection{
-                           client_properties = ClientProperties}}) ->
-    ClientProperties;
-i(Item, #v1{}) ->
-    throw({bad_argument, Item}).
-
-socket_info(Get, Select, Sock) ->
-    socket_info(fun() -> Get(Sock) end, Select).
-
-socket_info(Get, Select) ->
-    case Get() of
+i(channels,           #v1{}) -> length(all_channels());
+i(Item,               #v1{connection = Conn}) -> ic(Item, Conn).
+
+ic(name,              #connection{name        = Name})     -> Name;
+ic(host,              #connection{host        = Host})     -> Host;
+ic(peer_host,         #connection{peer_host   = PeerHost}) -> PeerHost;
+ic(port,              #connection{port        = Port})     -> Port;
+ic(peer_port,         #connection{peer_port   = PeerPort}) -> PeerPort;
+ic(protocol,          #connection{protocol    = none})     -> none;
+ic(protocol,          #connection{protocol    = P})        -> P:version();
+ic(user,              #connection{user        = none})     -> '';
+ic(user,              #connection{user        = U})        -> U#user.username;
+ic(vhost,             #connection{vhost       = VHost})    -> VHost;
+ic(timeout,           #connection{timeout_sec = Timeout})  -> Timeout;
+ic(frame_max,         #connection{frame_max   = FrameMax}) -> FrameMax;
+ic(client_properties, #connection{client_properties = CP}) -> CP;
+ic(auth_mechanism,    #connection{auth_mechanism = none})  -> none;
+ic(auth_mechanism,    #connection{auth_mechanism = {Name, _Mod}}) -> Name;
+ic(Item,              #connection{}) -> throw({bad_argument, Item}).
+
+socket_info(Get, Select, #v1{sock = Sock}) ->
+    case Get(Sock) of
         {ok,    T} -> Select(T);
         {error, _} -> ''
     end.
 
-ssl_info(F, Sock) ->
+ssl_info(F, #v1{sock = Sock}) ->
     %% The first ok form is R14
     %% The second is R13 - the extra term is exportability (by inspection,
     %% the docs are wrong)
@@ -896,58 +1052,47 @@ ssl_info(F, Sock) ->
         {ok, {P, {K, C, H, _}}} -> F({P, {K, C, H}})
     end.
 
-cert_info(F, Sock) ->
+cert_info(F, #v1{sock = Sock}) ->
     case rabbit_net:peercert(Sock) of
         nossl                -> '';
         {error, no_peercert} -> '';
         {ok, Cert}           -> list_to_binary(F(Cert))
     end.
 
-%%--------------------------------------------------------------------------
-
-create_channel(Channel, State) ->
-    #v1{sock = Sock, queue_collector = Collector,
-        channel_sup_sup_pid = ChanSupSup,
-        connection = #connection{protocol     = Protocol,
-                                 frame_max    = FrameMax,
-                                 user         = User,
-                                 vhost        = VHost,
-                                 capabilities = Capabilities}} = State,
-    {ok, _ChSupPid, {ChPid, AState}} =
-        rabbit_channel_sup_sup:start_channel(
-          ChanSupSup, {tcp, Sock, Channel, FrameMax, self(), name(Sock),
-                       Protocol, User, VHost, Capabilities, Collector}),
-    MRef = erlang:monitor(process, ChPid),
-    put({ch_pid, ChPid}, {Channel, MRef}),
-    put({channel, Channel}, {ChPid, AState}),
-    ok.
-
-process_channel_frame(Frame, ChPid, AState) ->
-    case rabbit_command_assembler:process(Frame, AState) of
-        {ok, NewAState}                  -> {ok, NewAState};
-        {ok, Method, NewAState}          -> rabbit_channel:do(ChPid, Method),
-                                            {ok, NewAState};
-        {ok, Method, Content, NewAState} -> rabbit_channel:do_flow(
-                                              ChPid, Method, Content),
-                                            {ok, NewAState};
-        {error, Reason}                  -> {error, Reason}
-    end.
-
-handle_exception(State = #v1{connection_state = closed}, _Channel, _Reason) ->
-    State;
-handle_exception(State, Channel, Reason) ->
-    send_exception(State, Channel, Reason).
-
-send_exception(State = #v1{connection = #connection{protocol = Protocol}},
-               Channel, Reason) ->
-    {0, CloseMethod} =
-        rabbit_binary_generator:map_exception(Channel, Reason, Protocol),
-    terminate_channels(),
-    State1 = close_connection(State),
-    ok = rabbit_writer:internal_send_command(
-           State1#v1.sock, 0, CloseMethod, Protocol),
-    State1.
+maybe_emit_stats(State) ->
+    rabbit_event:if_enabled(State, #v1.stats_timer,
+                            fun() -> emit_stats(State) end).
 
 emit_stats(State) ->
     rabbit_event:notify(connection_stats, infos(?STATISTICS_KEYS, State)),
     rabbit_event:reset_stats_timer(State, #v1.stats_timer).
+
+%% AMQP 1.0 stub: hands over to rabbit_amqp1_0_reader when it is loaded
+-ifdef(use_specs).
+-spec(become_1_0/2 :: (non_neg_integer(), #v1{}) -> no_return()).
+-endif.
+become_1_0(Id, State = #v1{sock = Sock}) ->
+    case code:is_loaded(rabbit_amqp1_0_reader) of
+        false -> refuse_connection(Sock, amqp1_0_plugin_not_enabled);
+        _     -> Mode = case Id of
+                            0 -> amqp;
+                            3 -> sasl;
+                            _ -> refuse_connection(
+                                   Sock, {unsupported_amqp1_0_protocol_id, Id},
+                                   {3, 1, 0, 0})
+                        end,
+                 throw({become, {rabbit_amqp1_0_reader, init,
+                                 [Mode, pack_for_1_0(State)]}})
+    end.
+
+pack_for_1_0(#v1{parent              = Parent,
+                 sock                = Sock,
+                 recv_len            = RecvLen,
+                 pending_recv        = PendingRecv,
+                 queue_collector     = QueueCollector,
+                 ch_sup3_pid         = ChSup3Pid,
+                 start_heartbeat_fun = SHF,
+                 buf                 = Buf,
+                 buf_len             = BufLen}) ->
+    {Parent, Sock, RecvLen, PendingRecv, QueueCollector, ChSup3Pid, SHF,
+     Buf, BufLen}.
diff --git a/src/rabbit_registry.erl b/src/rabbit_registry.erl
index 637835c3..f933e4e9 100644
--- a/src/rabbit_registry.erl
+++ b/src/rabbit_registry.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_registry).
@@ -84,12 +84,34 @@ internal_binary_to_type(TypeBin) when is_binary(TypeBin) ->
 internal_register(Class, TypeName, ModuleName)
   when is_atom(Class), is_binary(TypeName), is_atom(ModuleName) ->
     ok = sanity_check_module(class_module(Class), ModuleName),
-    true = ets:insert(?ETS_NAME,
-                      {{Class, internal_binary_to_type(TypeName)}, ModuleName}),
+    RegArg = {{Class, internal_binary_to_type(TypeName)}, ModuleName},
+    true = ets:insert(?ETS_NAME, RegArg),
+    conditional_register(RegArg),
     ok.
 
 internal_unregister(Class, TypeName) ->
-    true = ets:delete(?ETS_NAME, {Class, internal_binary_to_type(TypeName)}),
+    UnregArg = {Class, internal_binary_to_type(TypeName)},
+    conditional_unregister(UnregArg),
+    true = ets:delete(?ETS_NAME, UnregArg),
+    ok.
+
+%% register exchange decorator route callback only when implemented,
+%% in order to avoid unnecessary decorator calls on the fast
+%% publishing path
+conditional_register({{exchange_decorator, Type}, ModuleName}) ->
+    case erlang:function_exported(ModuleName, route, 2) of
+        true  -> true = ets:insert(?ETS_NAME,
+                                   {{exchange_decorator_route, Type},
+                                    ModuleName});
+        false -> ok
+    end;
+conditional_register(_) ->
+    ok.
+
+conditional_unregister({exchange_decorator, Type}) ->
+    true = ets:delete(?ETS_NAME, {exchange_decorator_route, Type}),
+    ok;
+conditional_unregister(_) ->
     ok.
 
 sanity_check_module(ClassModule, Module) ->
@@ -104,9 +126,12 @@ sanity_check_module(ClassModule, Module) ->
         true                  -> ok
     end.
 
-class_module(exchange)          -> rabbit_exchange_type;
-class_module(auth_mechanism)    -> rabbit_auth_mechanism;
-class_module(runtime_parameter) -> rabbit_runtime_parameter.
+class_module(exchange)           -> rabbit_exchange_type;
+class_module(auth_mechanism)     -> rabbit_auth_mechanism;
+class_module(runtime_parameter)  -> rabbit_runtime_parameter;
+class_module(exchange_decorator) -> rabbit_exchange_decorator;
+class_module(policy_validator)   -> rabbit_policy_validator;
+class_module(ha_mode)            -> rabbit_mirror_queue_mode.
 
 %%---------------------------------------------------------------------------
 
diff --git a/src/rabbit_restartable_sup.erl b/src/rabbit_restartable_sup.erl
index 237ab78c..65a2ca0a 100644
--- a/src/rabbit_restartable_sup.erl
+++ b/src/rabbit_restartable_sup.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_restartable_sup).
diff --git a/src/rabbit_router.erl b/src/rabbit_router.erl
index f4bbda0f..00343570 100644
--- a/src/rabbit_router.erl
+++ b/src/rabbit_router.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_router).
diff --git a/src/rabbit_runtime_parameter.erl b/src/rabbit_runtime_parameter.erl
index c7d30116..ee48165b 100644
--- a/src/rabbit_runtime_parameter.erl
+++ b/src/rabbit_runtime_parameter.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_runtime_parameter).
@@ -21,10 +21,10 @@
 -type(validate_results() ::
         'ok' | {error, string(), [term()]} | [validate_results()]).
 
--callback validate(binary(), binary(), term()) -> validate_results().
--callback validate_clear(binary(), binary()) -> validate_results().
--callback notify(binary(), binary(), term()) -> 'ok'.
--callback notify_clear(binary(), binary()) -> 'ok'.
+-callback validate(rabbit_types:vhost(), binary(), binary(),
+                   term()) -> validate_results().
+-callback notify(rabbit_types:vhost(), binary(), binary(), term()) -> 'ok'.
+-callback notify_clear(rabbit_types:vhost(), binary(), binary()) -> 'ok'.
 
 -else.
 
@@ -32,10 +32,9 @@
 
 behaviour_info(callbacks) ->
     [
-     {validate, 3},
-     {validate_clear, 2},
-     {notify, 3},
-     {notify_clear, 2}
+     {validate, 4},
+     {notify, 4},
+     {notify_clear, 3}
     ];
 behaviour_info(_Other) ->
     undefined.
diff --git a/src/rabbit_runtime_parameters.erl b/src/rabbit_runtime_parameters.erl
index 172cee92..c13c333e 100644
--- a/src/rabbit_runtime_parameters.erl
+++ b/src/rabbit_runtime_parameters.erl
@@ -10,16 +10,17 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_runtime_parameters).
 
 -include("rabbit.hrl").
 
--export([parse_set/3, set/3, clear/2, list/0, list/1, list_formatted/0,
-         lookup/2, value/2, value/3, info_keys/0]).
+-export([parse_set/4, set/4, set_any/4, clear/3, clear_any/3, list/0, list/1,
+         list_component/1, list/2, list_formatted/1, lookup/3,
+         value/3, value/4, info_keys/0]).
 
 %%----------------------------------------------------------------------------
 
@@ -27,15 +28,26 @@
 
 -type(ok_or_error_string() :: 'ok' | {'error_string', string()}).
 
--spec(parse_set/3 :: (binary(), binary(), string()) -> ok_or_error_string()).
--spec(set/3 :: (binary(), binary(), term()) -> ok_or_error_string()).
--spec(clear/2 :: (binary(), binary()) -> ok_or_error_string()).
+-spec(parse_set/4 :: (rabbit_types:vhost(), binary(), binary(), string())
+                     -> ok_or_error_string()).
+-spec(set/4 :: (rabbit_types:vhost(), binary(), binary(), term())
+               -> ok_or_error_string()).
+-spec(set_any/4 :: (rabbit_types:vhost(), binary(), binary(), term())
+                   -> ok_or_error_string()).
+-spec(clear/3 :: (rabbit_types:vhost(), binary(), binary())
+                 -> ok_or_error_string()).
+-spec(clear_any/3 :: (rabbit_types:vhost(), binary(), binary())
+                     -> ok_or_error_string()).
 -spec(list/0 :: () -> [rabbit_types:infos()]).
--spec(list/1 :: (binary()) -> [rabbit_types:infos()] | 'not_found').
--spec(list_formatted/0 :: () -> [rabbit_types:infos()]).
--spec(lookup/2 :: (binary(), binary()) -> rabbit_types:infos()).
--spec(value/2 :: (binary(), binary()) -> term()).
--spec(value/3 :: (binary(), binary(), term()) -> term()).
+-spec(list/1 :: (rabbit_types:vhost() | '_') -> [rabbit_types:infos()]).
+-spec(list_component/1 :: (binary()) -> [rabbit_types:infos()]).
+-spec(list/2 :: (rabbit_types:vhost() | '_', binary() | '_')
+                -> [rabbit_types:infos()]).
+-spec(list_formatted/1 :: (rabbit_types:vhost()) -> [rabbit_types:infos()]).
+-spec(lookup/3 :: (rabbit_types:vhost(), binary(), binary())
+                  -> rabbit_types:infos() | 'not_found').
+-spec(value/3 :: (rabbit_types:vhost(), binary(), binary()) -> term()).
+-spec(value/4 :: (rabbit_types:vhost(), binary(), binary(), term()) -> term()).
 -spec(info_keys/0 :: () -> rabbit_types:info_keys()).
 
 -endif.
@@ -48,36 +60,39 @@
 
 %%---------------------------------------------------------------------------
 
-parse_set(Component, Key, String) ->
-    case parse(String) of
-        {ok, Term}  -> set(Component, Key, Term);
-        {errors, L} -> format_error(L)
+parse_set(_, <<"policy">>, _, _) ->
+    {error_string, "policies may not be set using this method"};
+parse_set(VHost, Component, Name, String) ->
+    case rabbit_misc:json_decode(String) of
+        {ok, JSON} -> set(VHost, Component, Name,
+                          rabbit_misc:json_to_term(JSON));
+        error      -> {error_string, "JSON decoding error"}
     end.
 
-set(Component, Key, Term) ->
-    case set0(Component, Key, Term) of
-        ok          -> ok;
-        {errors, L} -> format_error(L)
-    end.
+set(_, <<"policy">>, _, _) ->
+    {error_string, "policies may not be set using this method"};
+set(VHost, Component, Name, Term) ->
+    set_any(VHost, Component, Name, Term).
 
 format_error(L) ->
     {error_string, rabbit_misc:format_many([{"Validation failed~n", []} | L])}.
 
-set0(Component, Key, Term) ->
+set_any(VHost, Component, Name, Term) ->
+    case set_any0(VHost, Component, Name, Term) of
+        ok          -> ok;
+        {errors, L} -> format_error(L)
+    end.
+
+set_any0(VHost, Component, Name, Term) ->
     case lookup_component(Component) of
         {ok, Mod} ->
-            case flatten_errors(validate(Term)) of
+            case flatten_errors(Mod:validate(VHost, Component, Name, Term)) of
                 ok ->
-                    case flatten_errors(Mod:validate(Component, Key, Term)) of
-                        ok ->
-                            case mnesia_update(Component, Key, Term) of
-                                {old, Term} -> ok;
-                                _           -> Mod:notify(Component, Key, Term)
-                            end,
-                            ok;
-                        E ->
-                            E
-                    end;
+                    case mnesia_update(VHost, Component, Name, Term) of
+                        {old, Term} -> ok;
+                        _           -> Mod:notify(VHost, Component, Name, Term)
+                    end,
+                    ok;
                 E ->
                     E
             end;
@@ -85,96 +100,105 @@ set0(Component, Key, Term) ->
             E
     end.
 
-mnesia_update(Component, Key, Term) ->
-    rabbit_misc:execute_mnesia_transaction(
-      fun () ->
-              Res = case mnesia:read(?TABLE, {Component, Key}, read) of
-                        []       -> new;
-                        [Params] -> {old, Params#runtime_parameters.value}
-                    end,
-              ok = mnesia:write(?TABLE, c(Component, Key, Term), write),
-              Res
-      end).
-
-clear(Component, Key) ->
-    case clear0(Component, Key) of
-        ok          -> ok;
-        {errors, L} -> format_error(L)
+mnesia_update(VHost, Comp, Name, Term) ->
+    F = fun () ->
+                Res = case mnesia:read(?TABLE, {VHost, Comp, Name}, read) of
+                          []       -> new;
+                          [Params] -> {old, Params#runtime_parameters.value}
+                      end,
+                ok = mnesia:write(?TABLE, c(VHost, Comp, Name, Term), write),
+                Res
+        end,
+    rabbit_misc:execute_mnesia_transaction(rabbit_vhost:with(VHost, F)).
+
+clear(_, <<"policy">> , _) ->
+    {error_string, "policies may not be cleared using this method"};
+clear(VHost, Component, Name) ->
+    clear_any(VHost, Component, Name).
+
+clear_any(VHost, Component, Name) ->
+    case lookup(VHost, Component, Name) of
+        not_found -> {error_string, "Parameter does not exist"};
+        _         -> mnesia_clear(VHost, Component, Name),
+                     case lookup_component(Component) of
+                         {ok, Mod} -> Mod:notify_clear(VHost, Component, Name);
+                         _         -> ok
+                     end
     end.
 
-clear0(Component, Key) ->
-    case lookup_component(Component) of
-        {ok, Mod} -> case flatten_errors(Mod:validate_clear(Component, Key)) of
-                         ok -> mnesia_clear(Component, Key),
-                               Mod:notify_clear(Component, Key),
-                               ok;
-                         E  -> E
-                     end;
-        E         -> E
-    end.
-
-mnesia_clear(Component, Key) ->
-    ok = rabbit_misc:execute_mnesia_transaction(
-           fun () ->
-                   ok = mnesia:delete(?TABLE, {Component, Key}, write)
-           end).
+mnesia_clear(VHost, Component, Name) ->
+    F = fun () ->
+                ok = mnesia:delete(?TABLE, {VHost, Component, Name}, write)
+        end,
+    ok = rabbit_misc:execute_mnesia_transaction(rabbit_vhost:with(VHost, F)).
 
 list() ->
-    [p(P) || P <- rabbit_misc:dirty_read_all(?TABLE)].
-
-list(Component) ->
-    case lookup_component(Component) of
-        {ok, _} -> Match = #runtime_parameters{key = {Component, '_'}, _ = '_'},
-                   [p(P) || P <- mnesia:dirty_match_object(?TABLE, Match)];
-        _       -> not_found
-    end.
-
-list_formatted() ->
-    [pset(value, format(pget(value, P)), P) || P <- list()].
-
-lookup(Component, Key) ->
-    case lookup0(Component, Key, rabbit_misc:const(not_found)) of
+    [p(P) || #runtime_parameters{ key = {_VHost, Comp, _Name}} = P <-
+             rabbit_misc:dirty_read_all(?TABLE), Comp /= <<"policy">>].
+
+list(VHost)               -> list(VHost, '_').
+list_component(Component) -> list('_',   Component).
+
+list(VHost, Component) ->
+    case VHost of
+        '_' -> ok;
+        _   -> rabbit_vhost:assert(VHost)
+    end,
+    Match = #runtime_parameters{key = {VHost, Component, '_'}, _ = '_'},
+    [p(P) || #runtime_parameters{key = {_VHost, Comp, _Name}} = P <-
+                 mnesia:dirty_match_object(?TABLE, Match),
+             Comp =/= <<"policy">> orelse Component =:= <<"policy">>].
+
+list_formatted(VHost) ->
+    [pset(value, format(pget(value, P)), P) || P <- list(VHost)].
+
+lookup(VHost, Component, Name) ->
+    case lookup0(VHost, Component, Name, rabbit_misc:const(not_found)) of
         not_found -> not_found;
         Params    -> p(Params)
     end.
 
-value(Component, Key) ->
-    case lookup0(Component, Key, rabbit_misc:const(not_found)) of
+value(VHost, Component, Name) ->
+    case lookup0(VHost, Component, Name, rabbit_misc:const(not_found)) of
         not_found -> not_found;
         Params    -> Params#runtime_parameters.value
     end.
 
-value(Component, Key, Default) ->
-    Params = lookup0(Component, Key,
-                     fun () -> lookup_missing(Component, Key, Default) end),
+value(VHost, Component, Name, Default) ->
+    Params = lookup0(VHost, Component, Name,
+                     fun () ->
+                             lookup_missing(VHost, Component, Name, Default)
+                     end),
     Params#runtime_parameters.value.
 
-lookup0(Component, Key, DefaultFun) ->
-    case mnesia:dirty_read(?TABLE, {Component, Key}) of
+lookup0(VHost, Component, Name, DefaultFun) ->
+    case mnesia:dirty_read(?TABLE, {VHost, Component, Name}) of
         []  -> DefaultFun();
         [R] -> R
     end.
 
-lookup_missing(Component, Key, Default) ->
+lookup_missing(VHost, Component, Name, Default) ->
     rabbit_misc:execute_mnesia_transaction(
       fun () ->
-              case mnesia:read(?TABLE, {Component, Key}, read) of
-                  []  -> Record = c(Component, Key, Default),
+              case mnesia:read(?TABLE, {VHost, Component, Name}, read) of
+                  []  -> Record = c(VHost, Component, Name, Default),
                          mnesia:write(?TABLE, Record, write),
                          Record;
                   [R] -> R
               end
       end).
 
-c(Component, Key, Default) -> #runtime_parameters{key = {Component, Key},
-                                                  value = Default}.
+c(VHost, Component, Name, Default) ->
+    #runtime_parameters{key = {VHost, Component, Name},
+                        value = Default}.
 
-p(#runtime_parameters{key = {Component, Key}, value = Value}) ->
-    [{component, Component},
-     {key,       Key},
+p(#runtime_parameters{key = {VHost, Component, Name}, value = Value}) ->
+    [{vhost,     VHost},
+     {component, Component},
+     {name,      Name},
      {value,     Value}].
 
-info_keys() -> [component, key, value].
+info_keys() -> [component, name, value].
 
 %%---------------------------------------------------------------------------
 
@@ -186,51 +210,9 @@ lookup_component(Component) ->
         {ok, Module}       -> {ok, Module}
     end.
 
-parse(Src0) ->
-    Src1 = string:strip(Src0),
-    Src = case lists:reverse(Src1) of
-              [$. |_] -> Src1;
-              _       -> Src1 ++ "."
-          end,
-    case erl_scan:string(Src) of
-        {ok, Scanned, _} ->
-            case erl_parse:parse_term(Scanned) of
-                {ok, Parsed} ->
-                    {ok, Parsed};
-                {error, E} ->
-                    {errors,
-                     [{"Could not parse value: ~s", [format_parse_error(E)]}]}
-            end;
-        {error, E, _} ->
-            {errors, [{"Could not scan value: ~s", [format_parse_error(E)]}]}
-    end.
-
-format_parse_error({_Line, Mod, Err}) ->
-    lists:flatten(Mod:format_error(Err)).
-
 format(Term) ->
-    list_to_binary(rabbit_misc:format("~p", [Term])).
-
-%%---------------------------------------------------------------------------
-
-%% We will want to be able to biject these to JSON. So we have some
-%% generic restrictions on what we consider acceptable.
-validate(Proplist = [T | _]) when is_tuple(T) -> validate_proplist(Proplist);
-validate(L) when is_list(L)                   -> validate_list(L);
-validate(T) when is_tuple(T)                  -> {error, "tuple: ~p", [T]};
-validate(B) when is_boolean(B)                -> ok;
-validate(null)                                -> ok;
-validate(A) when is_atom(A)                   -> {error, "atom: ~p", [A]};
-validate(N) when is_number(N)                 -> ok;
-validate(B) when is_binary(B)                 -> ok;
-validate(B) when is_bitstring(B)              -> {error, "bitstring: ~p", [B]}.
-
-validate_list(L) -> [validate(I) || I <- L].
-validate_proplist(L) -> [vp(I) || I <- L].
-
-vp({K, V}) when is_binary(K) -> validate(V);
-vp({K, _V})                  -> {error, "bad key: ~p", [K]};
-vp(H)                        -> {error, "not two tuple: ~p", [H]}.
+    {ok, JSON} = rabbit_misc:json_encode(rabbit_misc:term_to_json(Term)),
+    list_to_binary(JSON).
 
 flatten_errors(L) ->
     case [{F, A} || I <- lists:flatten([L]), {error, F, A} <- [I]] of
diff --git a/src/rabbit_runtime_parameters_test.erl b/src/rabbit_runtime_parameters_test.erl
index f23b3227..05c85881 100644
--- a/src/rabbit_runtime_parameters_test.erl
+++ b/src/rabbit_runtime_parameters_test.erl
@@ -10,15 +10,20 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_runtime_parameters_test).
 -behaviour(rabbit_runtime_parameter).
+-behaviour(rabbit_policy_validator).
 
--export([validate/3, validate_clear/2, notify/3, notify_clear/2]).
+-export([validate/4, notify/4, notify_clear/3]).
 -export([register/0, unregister/0]).
+-export([validate_policy/1]).
+-export([register_policy_validator/0, unregister_policy_validator/0]).
+
+%----------------------------------------------------------------------------
 
 register() ->
     rabbit_registry:register(runtime_parameter, <<"test">>, ?MODULE).
@@ -26,13 +31,34 @@ register() ->
 unregister() ->
     rabbit_registry:unregister(runtime_parameter, <<"test">>).
 
-validate(<<"test">>, <<"good">>,  _Term)      -> ok;
-validate(<<"test">>, <<"maybe">>, <<"good">>) -> ok;
-validate(<<"test">>, _, _)                    -> {error, "meh", []}.
+validate(_, <<"test">>, <<"good">>,  _Term)      -> ok;
+validate(_, <<"test">>, <<"maybe">>, <<"good">>) -> ok;
+validate(_, <<"test">>, _, _)                    -> {error, "meh", []}.
+
+notify(_, _, _, _) -> ok.
+notify_clear(_, _, _) -> ok.
+
+%----------------------------------------------------------------------------
+
+register_policy_validator() ->
+    rabbit_registry:register(policy_validator, <<"testeven">>, ?MODULE),
+    rabbit_registry:register(policy_validator, <<"testpos">>,  ?MODULE).
+
+unregister_policy_validator() ->
+    rabbit_registry:unregister(policy_validator, <<"testeven">>),
+    rabbit_registry:unregister(policy_validator, <<"testpos">>).
+
+validate_policy([{<<"testeven">>, Terms}]) when is_list(Terms) ->
+    case  length(Terms) rem 2 =:= 0 of
+        true  -> ok;
+        false -> {error, "meh", []}
+    end;
 
-validate_clear(<<"test">>, <<"good">>)  -> ok;
-validate_clear(<<"test">>, <<"maybe">>) -> ok;
-validate_clear(<<"test">>, _)           -> {error, "meh", []}.
+validate_policy([{<<"testpos">>, Terms}]) when is_list(Terms) ->
+    case lists:all(fun (N) -> is_integer(N) andalso N > 0 end, Terms) of
+        true  -> ok;
+        false -> {error, "meh", []}
+    end;
 
-notify(_, _, _) -> ok.
-notify_clear(_, _) -> ok.
+validate_policy(_) ->
+    {error, "meh", []}.
diff --git a/src/rabbit_sasl_report_file_h.erl b/src/rabbit_sasl_report_file_h.erl
index e8beecfe..39a10ac3 100644
--- a/src/rabbit_sasl_report_file_h.erl
+++ b/src/rabbit_sasl_report_file_h.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_sasl_report_file_h).
diff --git a/src/rabbit_ssl.erl b/src/rabbit_ssl.erl
index 22ff555f..109bff30 100644
--- a/src/rabbit_ssl.erl
+++ b/src/rabbit_ssl.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_ssl).
@@ -162,15 +162,16 @@ format_rdn(#'AttributeTypeAndValue'{type = T, value = V}) ->
             {?'id-at-pseudonym'              , "PSEUDONYM"},
             {?'id-domainComponent'           , "DC"},
             {?'id-emailAddress'              , "EMAILADDRESS"},
-            {?'street-address'               , "STREET"}],
+            {?'street-address'               , "STREET"},
+            {{0,9,2342,19200300,100,1,1}     , "UID"}], %% Not in public_key.hrl
     case proplists:lookup(T, Fmts) of
         {_, Fmt} ->
-            io_lib:format(Fmt ++ "=~s", [FV]);
+            rabbit_misc:format(Fmt ++ "=~s", [FV]);
         none when is_tuple(T) ->
-            TypeL = [io_lib:format("~w", [X]) || X <- tuple_to_list(T)],
-            io_lib:format("~s:~s", [string:join(TypeL, "."), FV]);
+            TypeL = [rabbit_misc:format("~w", [X]) || X <- tuple_to_list(T)],
+            rabbit_misc:format("~s=~s", [string:join(TypeL, "."), FV]);
         none ->
-            io_lib:format("~p:~s", [T, FV])
+            rabbit_misc:format("~p=~s", [T, FV])
     end.
 
 %% Escape a string as per RFC4514.
@@ -204,14 +205,26 @@ format_asn1_value({ST, S}) when ST =:= teletexString; ST =:= printableString;
     format_directory_string(ST, S);
 format_asn1_value({utcTime, [Y1, Y2, M1, M2, D1, D2, H1, H2,
                              Min1, Min2, S1, S2, $Z]}) ->
-    io_lib:format("20~c~c-~c~c-~c~cT~c~c:~c~c:~c~cZ",
-                  [Y1, Y2, M1, M2, D1, D2, H1, H2, Min1, Min2, S1, S2]);
+    rabbit_misc:format("20~c~c-~c~c-~c~cT~c~c:~c~c:~c~cZ",
+                       [Y1, Y2, M1, M2, D1, D2, H1, H2, Min1, Min2, S1, S2]);
 %% We appear to get an untagged value back for an ia5string
 %% (e.g. domainComponent).
 format_asn1_value(V) when is_list(V) ->
     V;
+format_asn1_value(V) when is_binary(V) ->
+    %% OTP does not decode some values when combined with an unknown
+    %% type. That's probably wrong, so as a last ditch effort let's
+    %% try manually decoding. 'DirectoryString' is semi-arbitrary -
+    %% but it is the type which covers the various string types we
+    %% handle below.
+    try
+        {ST, S} = public_key:der_decode('DirectoryString', V),
+        format_directory_string(ST, S)
+    catch _:_ ->
+            rabbit_misc:format("~p", [V])
+    end;
 format_asn1_value(V) ->
-    io_lib:format("~p", [V]).
+    rabbit_misc:format("~p", [V]).
 
 %% DirectoryString { INTEGER : maxSize } ::= CHOICE {
 %%     teletexString     TeletexString (SIZE (1..maxSize)),
diff --git a/src/rabbit_sup.erl b/src/rabbit_sup.erl
index f142d233..c1deb14b 100644
--- a/src/rabbit_sup.erl
+++ b/src/rabbit_sup.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_sup).
diff --git a/src/rabbit_table.erl b/src/rabbit_table.erl
new file mode 100644
index 00000000..a29c57d5
--- /dev/null
+++ b/src/rabbit_table.erl
@@ -0,0 +1,311 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License
+%% at http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and
+%% limitations under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
+%%
+
+-module(rabbit_table).
+
+-export([create/0, create_local_copy/1, wait_for_replicated/0, wait/1,
+         force_load/0, is_present/0, is_empty/0,
+         check_schema_integrity/0, clear_ram_only_tables/0]).
+
+-include("rabbit.hrl").
+
+%%----------------------------------------------------------------------------
+
+-ifdef(use_specs).
+
+-spec(create/0 :: () -> 'ok').
+-spec(create_local_copy/1 :: ('disc' | 'ram') -> 'ok').
+-spec(wait_for_replicated/0 :: () -> 'ok').
+-spec(wait/1 :: ([atom()]) -> 'ok').
+-spec(force_load/0 :: () -> 'ok').
+-spec(is_present/0 :: () -> boolean()).
+-spec(is_empty/0 :: () -> boolean()).
+-spec(check_schema_integrity/0 :: () -> rabbit_types:ok_or_error(any())).
+-spec(clear_ram_only_tables/0 :: () -> 'ok').
+
+-endif.
+
+%%----------------------------------------------------------------------------
+%% Main interface
+%%----------------------------------------------------------------------------
+
+create() ->
+    lists:foreach(fun ({Tab, TabDef}) ->
+                          TabDef1 = proplists:delete(match, TabDef),
+                          case mnesia:create_table(Tab, TabDef1) of
+                              {atomic, ok} -> ok;
+                              {aborted, Reason} ->
+                                  throw({error, {table_creation_failed,
+                                                 Tab, TabDef1, Reason}})
+                          end
+                  end, definitions()),
+    ok.
+
+%% The order in which we convert the schema and the other tables is
+%% important: if we convert the schema first when moving to RAM,
+%% mnesia will loudly complain since it doesn't make much sense to do
+%% that. But when moving to disc, we need to move the schema first.
+create_local_copy(disc) ->
+    create_local_copy(schema, disc_copies),
+    create_local_copies(disc);
+create_local_copy(ram)  ->
+    create_local_copies(ram),
+    create_local_copy(schema, ram_copies).
+
+wait_for_replicated() ->
+    wait([Tab || {Tab, TabDef} <- definitions(),
+                 not lists:member({local_content, true}, TabDef)]).
+
+wait(TableNames) ->
+    case mnesia:wait_for_tables(TableNames, 30000) of
+        ok ->
+            ok;
+        {timeout, BadTabs} ->
+            throw({error, {timeout_waiting_for_tables, BadTabs}});
+        {error, Reason} ->
+            throw({error, {failed_waiting_for_tables, Reason}})
+    end.
+
+force_load() -> [mnesia:force_load_table(T) || T <- names()], ok.
+
+is_present() -> names() -- mnesia:system_info(tables) =:= [].
+
+is_empty() ->
+    lists:all(fun (Tab) -> mnesia:dirty_first(Tab) == '$end_of_table' end,
+              names()).
+
+check_schema_integrity() ->
+    Tables = mnesia:system_info(tables),
+    case check(fun (Tab, TabDef) ->
+                       case lists:member(Tab, Tables) of
+                           false -> {error, {table_missing, Tab}};
+                           true  -> check_attributes(Tab, TabDef)
+                       end
+               end) of
+        ok     -> ok = wait(names()),
+                  check(fun check_content/2);
+        Other  -> Other
+    end.
+
+clear_ram_only_tables() ->
+    Node = node(),
+    lists:foreach(
+      fun (TabName) ->
+              case lists:member(Node, mnesia:table_info(TabName, ram_copies)) of
+                  true  -> {atomic, ok} = mnesia:clear_table(TabName);
+                  false -> ok
+              end
+      end, names()),
+    ok.
+
+%%--------------------------------------------------------------------
+%% Internal helpers
+%%--------------------------------------------------------------------
+
+create_local_copies(Type) ->
+    lists:foreach(
+      fun ({Tab, TabDef}) ->
+              HasDiscCopies     = has_copy_type(TabDef, disc_copies),
+              HasDiscOnlyCopies = has_copy_type(TabDef, disc_only_copies),
+              LocalTab          = proplists:get_bool(local_content, TabDef),
+              StorageType =
+                  if
+                      Type =:= disc orelse LocalTab ->
+                          if
+                              HasDiscCopies     -> disc_copies;
+                              HasDiscOnlyCopies -> disc_only_copies;
+                              true              -> ram_copies
+                          end;
+                      Type =:= ram ->
+                          ram_copies
+                  end,
+              ok = create_local_copy(Tab, StorageType)
+      end, definitions(Type)),
+    ok.
+
+create_local_copy(Tab, Type) ->
+    StorageType = mnesia:table_info(Tab, storage_type),
+    {atomic, ok} =
+        if
+            StorageType == unknown ->
+                mnesia:add_table_copy(Tab, node(), Type);
+            StorageType /= Type ->
+                mnesia:change_table_copy_type(Tab, node(), Type);
+            true -> {atomic, ok}
+        end,
+    ok.
+
+has_copy_type(TabDef, DiscType) ->
+    lists:member(node(), proplists:get_value(DiscType, TabDef, [])).
+
+check_attributes(Tab, TabDef) ->
+    {_, ExpAttrs} = proplists:lookup(attributes, TabDef),
+    case mnesia:table_info(Tab, attributes) of
+        ExpAttrs -> ok;
+        Attrs    -> {error, {table_attributes_mismatch, Tab, ExpAttrs, Attrs}}
+    end.
+
+check_content(Tab, TabDef) ->
+    {_, Match} = proplists:lookup(match, TabDef),
+    case mnesia:dirty_first(Tab) of
+        '$end_of_table' ->
+            ok;
+        Key ->
+            ObjList = mnesia:dirty_read(Tab, Key),
+            MatchComp = ets:match_spec_compile([{Match, [], ['$_']}]),
+            case ets:match_spec_run(ObjList, MatchComp) of
+                ObjList -> ok;
+                _       -> {error, {table_content_invalid, Tab, Match, ObjList}}
+            end
+    end.
+
+check(Fun) ->
+    case [Error || {Tab, TabDef} <- definitions(),
+                   case Fun(Tab, TabDef) of
+                       ok             -> Error = none, false;
+                       {error, Error} -> true
+                   end] of
+        []     -> ok;
+        Errors -> {error, Errors}
+    end.
+
+%%--------------------------------------------------------------------
+%% Table definitions
+%%--------------------------------------------------------------------
+
+names() -> [Tab || {Tab, _} <- definitions()].
+
+%% The tables aren't supposed to be on disk on a ram node
+definitions(disc) ->
+    definitions();
+definitions(ram) ->
+    [{Tab, [{disc_copies, []}, {ram_copies, [node()]} |
+            proplists:delete(
+              ram_copies, proplists:delete(disc_copies, TabDef))]} ||
+        {Tab, TabDef} <- definitions()].
+
+definitions() ->
+    [{rabbit_user,
+      [{record_name, internal_user},
+       {attributes, record_info(fields, internal_user)},
+       {disc_copies, [node()]},
+       {match, #internal_user{_='_'}}]},
+     {rabbit_user_permission,
+      [{record_name, user_permission},
+       {attributes, record_info(fields, user_permission)},
+       {disc_copies, [node()]},
+       {match, #user_permission{user_vhost = #user_vhost{_='_'},
+                                permission = #permission{_='_'},
+                                _='_'}}]},
+     {rabbit_vhost,
+      [{record_name, vhost},
+       {attributes, record_info(fields, vhost)},
+       {disc_copies, [node()]},
+       {match, #vhost{_='_'}}]},
+     {rabbit_listener,
+      [{record_name, listener},
+       {attributes, record_info(fields, listener)},
+       {type, bag},
+       {match, #listener{_='_'}}]},
+     {rabbit_durable_route,
+      [{record_name, route},
+       {attributes, record_info(fields, route)},
+       {disc_copies, [node()]},
+       {match, #route{binding = binding_match(), _='_'}}]},
+     {rabbit_semi_durable_route,
+      [{record_name, route},
+       {attributes, record_info(fields, route)},
+       {type, ordered_set},
+       {match, #route{binding = binding_match(), _='_'}}]},
+     {rabbit_route,
+      [{record_name, route},
+       {attributes, record_info(fields, route)},
+       {type, ordered_set},
+       {match, #route{binding = binding_match(), _='_'}}]},
+     {rabbit_reverse_route,
+      [{record_name, reverse_route},
+       {attributes, record_info(fields, reverse_route)},
+       {type, ordered_set},
+       {match, #reverse_route{reverse_binding = reverse_binding_match(),
+                              _='_'}}]},
+     {rabbit_topic_trie_node,
+      [{record_name, topic_trie_node},
+       {attributes, record_info(fields, topic_trie_node)},
+       {type, ordered_set},
+       {match, #topic_trie_node{trie_node = trie_node_match(), _='_'}}]},
+     {rabbit_topic_trie_edge,
+      [{record_name, topic_trie_edge},
+       {attributes, record_info(fields, topic_trie_edge)},
+       {type, ordered_set},
+       {match, #topic_trie_edge{trie_edge = trie_edge_match(), _='_'}}]},
+     {rabbit_topic_trie_binding,
+      [{record_name, topic_trie_binding},
+       {attributes, record_info(fields, topic_trie_binding)},
+       {type, ordered_set},
+       {match, #topic_trie_binding{trie_binding = trie_binding_match(),
+                                   _='_'}}]},
+     {rabbit_durable_exchange,
+      [{record_name, exchange},
+       {attributes, record_info(fields, exchange)},
+       {disc_copies, [node()]},
+       {match, #exchange{name = exchange_name_match(), _='_'}}]},
+     {rabbit_exchange,
+      [{record_name, exchange},
+       {attributes, record_info(fields, exchange)},
+       {match, #exchange{name = exchange_name_match(), _='_'}}]},
+     {rabbit_exchange_serial,
+      [{record_name, exchange_serial},
+       {attributes, record_info(fields, exchange_serial)},
+       {match, #exchange_serial{name = exchange_name_match(), _='_'}}]},
+     {rabbit_runtime_parameters,
+      [{record_name, runtime_parameters},
+       {attributes, record_info(fields, runtime_parameters)},
+       {disc_copies, [node()]},
+       {match, #runtime_parameters{_='_'}}]},
+     {rabbit_durable_queue,
+      [{record_name, amqqueue},
+       {attributes, record_info(fields, amqqueue)},
+       {disc_copies, [node()]},
+       {match, #amqqueue{name = queue_name_match(), _='_'}}]},
+     {rabbit_queue,
+      [{record_name, amqqueue},
+       {attributes, record_info(fields, amqqueue)},
+       {match, #amqqueue{name = queue_name_match(), _='_'}}]}]
+        ++ gm:table_definitions()
+        ++ mirrored_supervisor:table_definitions().
+
+binding_match() ->
+    #binding{source = exchange_name_match(),
+             destination = binding_destination_match(),
+             _='_'}.
+reverse_binding_match() ->
+    #reverse_binding{destination = binding_destination_match(),
+                     source = exchange_name_match(),
+                     _='_'}.
+binding_destination_match() ->
+    resource_match('_').
+trie_node_match() ->
+    #trie_node{   exchange_name = exchange_name_match(), _='_'}.
+trie_edge_match() ->
+    #trie_edge{   exchange_name = exchange_name_match(), _='_'}.
+trie_binding_match() ->
+    #trie_binding{exchange_name = exchange_name_match(), _='_'}.
+exchange_name_match() ->
+    resource_match(exchange).
+queue_name_match() ->
+    resource_match(queue).
+resource_match(Kind) ->
+    #resource{kind = Kind, _='_'}.
diff --git a/src/rabbit_tests.erl b/src/rabbit_tests.erl
index bae4928d..5af4969a 100644
--- a/src/rabbit_tests.erl
+++ b/src/rabbit_tests.erl
@@ -10,15 +10,15 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_tests).
 
 -compile([export_all]).
 
--export([all_tests/0, test_parsing/0]).
+-export([all_tests/0]).
 
 -import(rabbit_misc, [pget/2]).
 
@@ -29,20 +29,25 @@
 -define(PERSISTENT_MSG_STORE, msg_store_persistent).
 -define(TRANSIENT_MSG_STORE,  msg_store_transient).
 -define(CLEANUP_QUEUE_NAME, <<"cleanup-queue">>).
+-define(TIMEOUT, 5000).
 
 all_tests() ->
+    ok = setup_cluster(),
+    ok = supervisor2_tests:test_all(),
     passed = gm_tests:all_tests(),
     passed = mirrored_supervisor_tests:all_tests(),
     application:set_env(rabbit, file_handles_high_watermark, 10, infinity),
     ok = file_handle_cache:set_limit(10),
+    passed = test_version_equivalance(),
     passed = test_multi_call(),
     passed = test_file_handle_cache(),
     passed = test_backing_queue(),
+    passed = test_rabbit_basic_header_handling(),
     passed = test_priority_queue(),
     passed = test_pg_local(),
     passed = test_unfold(),
     passed = test_supervisor_delayed_restart(),
-    passed = test_parsing(),
+    passed = test_table_codec(),
     passed = test_content_framing(),
     passed = test_content_transcoding(),
     passed = test_topic_matching(),
@@ -51,36 +56,68 @@ all_tests() ->
     passed = test_log_management_during_startup(),
     passed = test_statistics(),
     passed = test_arguments_parser(),
-    passed = test_cluster_management(),
+    passed = test_dynamic_mirroring(),
     passed = test_user_management(),
     passed = test_runtime_parameters(),
+    passed = test_policy_validation(),
+    passed = test_policy_opts_validation(),
+    passed = test_ha_policy_validation(),
     passed = test_server_status(),
+    passed = test_amqp_connection_refusal(),
     passed = test_confirms(),
-    passed = maybe_run_cluster_dependent_tests(),
+    passed = test_with_state(),
+    passed =
+        do_if_secondary_node(
+          fun run_cluster_dependent_tests/1,
+          fun (SecondaryNode) ->
+                  io:format("Skipping cluster dependent tests with node ~p~n",
+                            [SecondaryNode]),
+                  passed
+          end),
     passed = test_configurable_server_properties(),
     passed.
 
-maybe_run_cluster_dependent_tests() ->
+
+do_if_secondary_node(Up, Down) ->
     SecondaryNode = rabbit_nodes:make("hare"),
 
     case net_adm:ping(SecondaryNode) of
-        pong -> passed = run_cluster_dependent_tests(SecondaryNode);
-        pang -> io:format("Skipping cluster dependent tests with node ~p~n",
-                          [SecondaryNode])
-    end,
-    passed.
+        pong -> Up(SecondaryNode);
+        pang -> Down(SecondaryNode)
+    end.
 
-run_cluster_dependent_tests(SecondaryNode) ->
-    SecondaryNodeS = atom_to_list(SecondaryNode),
+setup_cluster() ->
+    do_if_secondary_node(
+      fun (SecondaryNode) ->
+              cover:stop(SecondaryNode),
+              ok = control_action(stop_app, []),
+              %% 'cover' does not cope at all well with nodes disconnecting,
+              %% which happens as part of reset. So we turn it off
+              %% temporarily. That is ok even if we're not in general using
+              %% cover, it just turns the engine on / off and doesn't log
+              %% anything.  Note that this way cover won't be on when joining
+              %% the cluster, but this is OK since we're testing the clustering
+              %% interface elsewhere anyway.
+              cover:stop(nodes()),
+              ok = control_action(join_cluster,
+                                  [atom_to_list(SecondaryNode)]),
+              cover:start(nodes()),
+              ok = control_action(start_app, []),
+              ok = control_action(start_app, SecondaryNode, [], [])
+      end,
+      fun (_) -> ok end).
 
-    cover:stop(SecondaryNode),
-    ok = control_action(stop_app, []),
-    ok = control_action(reset, []),
-    ok = control_action(cluster, [SecondaryNodeS]),
-    ok = control_action(start_app, []),
-    cover:start(SecondaryNode),
-    ok = control_action(start_app, SecondaryNode, [], []),
+maybe_run_cluster_dependent_tests() ->
+    do_if_secondary_node(
+      fun (SecondaryNode) ->
+              passed = run_cluster_dependent_tests(SecondaryNode)
+      end,
+      fun (SecondaryNode) ->
+              io:format("Skipping cluster dependent tests with node ~p~n",
+                        [SecondaryNode])
+      end).
 
+run_cluster_dependent_tests(SecondaryNode) ->
     io:format("Running cluster dependent tests with node ~p~n", [SecondaryNode]),
     passed = test_delegates_async(SecondaryNode),
     passed = test_delegates_sync(SecondaryNode),
@@ -109,6 +146,16 @@ run_cluster_dependent_tests(SecondaryNode) ->
 
     passed.
 
+test_version_equivalance() ->
+    true = rabbit_misc:version_minor_equivalent("3.0.0", "3.0.0"),
+    true = rabbit_misc:version_minor_equivalent("3.0.0", "3.0.1"),
+    true = rabbit_misc:version_minor_equivalent("%%VSN%%", "%%VSN%%"),
+    false = rabbit_misc:version_minor_equivalent("3.0.0", "3.1.0"),
+    false = rabbit_misc:version_minor_equivalent("3.0.0", "3.0"),
+    false = rabbit_misc:version_minor_equivalent("3.0.0", "3.0.0.1"),
+    false = rabbit_misc:version_minor_equivalent("3.0.0", "3.0.foo"),
+    passed.
+
 test_multi_call() ->
     Fun = fun() ->
                   receive
@@ -129,6 +176,78 @@ test_multi_call() ->
     exit(Pid3, bang),
     passed.
 
+test_rabbit_basic_header_handling() ->
+    passed = write_table_with_invalid_existing_type_test(),
+    passed = invalid_existing_headers_test(),
+    passed = disparate_invalid_header_entries_accumulate_separately_test(),
+    passed = corrupt_or_invalid_headers_are_overwritten_test(),
+    passed = invalid_same_header_entry_accumulation_test(),
+    passed.
+
+-define(XDEATH_TABLE,
+        [{<<"reason">>,       longstr,   <<"blah">>},
+         {<<"queue">>,        longstr,   <<"foo.bar.baz">>},
+         {<<"exchange">>,     longstr,   <<"my-exchange">>},
+         {<<"routing-keys">>, array,     []}]).
+
+-define(ROUTE_TABLE, [{<<"redelivered">>, bool, <<"true">>}]).
+
+-define(BAD_HEADER(K), {<<K>>, longstr, <<"bad ", K>>}).
+-define(BAD_HEADER2(K, Suf), {<<K>>, longstr, <<"bad ", K, Suf>>}).
+-define(FOUND_BAD_HEADER(K), {<<K>>, array, [{longstr, <<"bad ", K>>}]}).
+
+write_table_with_invalid_existing_type_test() ->
+    prepend_check(<<"header1">>, ?XDEATH_TABLE, [?BAD_HEADER("header1")]),
+    passed.
+
+invalid_existing_headers_test() ->
+    Headers =
+        prepend_check(<<"header2">>, ?ROUTE_TABLE, [?BAD_HEADER("header2")]),
+    {array, [{table, ?ROUTE_TABLE}]} =
+        rabbit_misc:table_lookup(Headers, <<"header2">>),
+    passed.
+
+disparate_invalid_header_entries_accumulate_separately_test() ->
+    BadHeaders = [?BAD_HEADER("header2")],
+    Headers = prepend_check(<<"header2">>, ?ROUTE_TABLE, BadHeaders),
+    Headers2 = prepend_check(<<"header1">>, ?XDEATH_TABLE,
+                             [?BAD_HEADER("header1") | Headers]),
+    {table, [?FOUND_BAD_HEADER("header1"),
+             ?FOUND_BAD_HEADER("header2")]} =
+        rabbit_misc:table_lookup(Headers2, ?INVALID_HEADERS_KEY),
+    passed.
+
+corrupt_or_invalid_headers_are_overwritten_test() ->
+    Headers0 = [?BAD_HEADER("header1"),
+                ?BAD_HEADER("x-invalid-headers")],
+    Headers1 = prepend_check(<<"header1">>, ?XDEATH_TABLE, Headers0),
+    {table,[?FOUND_BAD_HEADER("header1"),
+            ?FOUND_BAD_HEADER("x-invalid-headers")]} =
+        rabbit_misc:table_lookup(Headers1, ?INVALID_HEADERS_KEY),
+    passed.
+
+invalid_same_header_entry_accumulation_test() ->
+    BadHeader1 = ?BAD_HEADER2("header1", "a"),
+    Headers = prepend_check(<<"header1">>, ?ROUTE_TABLE, [BadHeader1]),
+    Headers2 = prepend_check(<<"header1">>, ?ROUTE_TABLE,
+                             [?BAD_HEADER2("header1", "b") | Headers]),
+    {table, InvalidHeaders} =
+        rabbit_misc:table_lookup(Headers2, ?INVALID_HEADERS_KEY),
+    {array, [{longstr,<<"bad header1b">>},
+             {longstr,<<"bad header1a">>}]} =
+        rabbit_misc:table_lookup(InvalidHeaders, <<"header1">>),
+    passed.
+
+prepend_check(HeaderKey, HeaderTable, Headers) ->
+    Headers1 = rabbit_basic:prepend_table_header(
+                HeaderKey, HeaderTable, Headers),
+    {table, Invalid} =
+        rabbit_misc:table_lookup(Headers1, ?INVALID_HEADERS_KEY),
+    {Type, Value} = rabbit_misc:table_lookup(Headers, HeaderKey),
+    {array, [{Type, Value} | _]} =
+        rabbit_misc:table_lookup(Invalid, HeaderKey),
+    Headers1.
+
 test_priority_queue() ->
 
     false = priority_queue:is_queue(not_a_queue),
@@ -320,113 +439,45 @@ test_unfold() ->
                                    end, 10),
     passed.
 
-test_parsing() ->
-    passed = test_content_properties(),
-    passed = test_field_values(),
-    passed.
-
-test_content_prop_encoding(Datum, Binary) ->
-    Types =  [element(1, E) || E <- Datum],
-    Values = [element(2, E) || E <- Datum],
-    Binary = rabbit_binary_generator:encode_properties(Types, Values). %% assertion
-
-test_content_properties() ->
-    test_content_prop_encoding([], <<0, 0>>),
-    test_content_prop_encoding([{bit, true}, {bit, false}, {bit, true}, {bit, false}],
-                               <<16#A0, 0>>),
-    test_content_prop_encoding([{bit, true}, {octet, 123}, {bit, true}, {octet, undefined},
-                                {bit, true}],
-                               <<16#E8,0,123>>),
-    test_content_prop_encoding([{bit, true}, {octet, 123}, {octet, 123}, {bit, true}],
-                               <<16#F0,0,123,123>>),
-    test_content_prop_encoding([{bit, true}, {shortstr, <<"hi">>}, {bit, true},
-                                {shortint, 54321}, {bit, true}],
-                               <<16#F8,0,2,"hi",16#D4,16#31>>),
-    test_content_prop_encoding([{bit, true}, {shortstr, undefined}, {bit, true},
-                                {shortint, 54321}, {bit, true}],
-                               <<16#B8,0,16#D4,16#31>>),
-    test_content_prop_encoding([{table, [{<<"a signedint">>, signedint, 12345678},
-                                         {<<"a longstr">>, longstr, <<"yes please">>},
-                                         {<<"a decimal">>, decimal, {123, 12345678}},
-                                         {<<"a timestamp">>, timestamp, 123456789012345},
-                                         {<<"a nested table">>, table,
-                                          [{<<"one">>, signedint, 1},
-                                           {<<"two">>, signedint, 2}]}]}],
-                               <<
-                                 %% property-flags
-                                 16#8000:16,
-
-                                 %% property-list:
-
-                                 %% table
-                                 117:32,                % table length in bytes
-
-                                 11,"a signedint",      % name
-                                 "I",12345678:32,       % type and value
-
-                                 9,"a longstr",
-                                 "S",10:32,"yes please",
-
-                                 9,"a decimal",
-                                 "D",123,12345678:32,
-
-                                 11,"a timestamp",
-                                 "T", 123456789012345:64,
-
-                                 14,"a nested table",
-                                 "F",
-                                 18:32,
-
-                                 3,"one",
-                                 "I",1:32,
-
-                                 3,"two",
-                                 "I",2:32 >>),
-    passed.
-
-test_field_values() ->
+test_table_codec() ->
     %% FIXME this does not test inexact numbers (double and float) yet,
     %% because they won't pass the equality assertions
-    test_content_prop_encoding(
-      [{table, [{<<"longstr">>, longstr, <<"Here is a long string">>},
-                {<<"signedint">>, signedint, 12345},
-                {<<"decimal">>, decimal, {3, 123456}},
-                {<<"timestamp">>, timestamp, 109876543209876},
-                {<<"table">>, table, [{<<"one">>, signedint, 54321},
-                                      {<<"two">>, longstr, <<"A long string">>}]},
-                {<<"byte">>, byte, 255},
-                {<<"long">>, long, 1234567890},
-                {<<"short">>, short, 655},
-                {<<"bool">>, bool, true},
-                {<<"binary">>, binary, <<"a binary string">>},
-                {<<"void">>, void, undefined},
-                {<<"array">>, array, [{signedint, 54321},
-                                      {longstr, <<"A long string">>}]}
-
-               ]}],
-      <<
-        %% property-flags
-        16#8000:16,
-        %% table length in bytes
-        228:32,
-
-        7,"longstr",   "S", 21:32, "Here is a long string",      %      = 34
-        9,"signedint", "I", 12345:32/signed,                     % + 15 = 49
-        7,"decimal",   "D", 3, 123456:32,                        % + 14 = 63
-        9,"timestamp", "T", 109876543209876:64,                  % + 19 = 82
-        5,"table",     "F", 31:32, % length of table             % + 11 = 93
-        3,"one", "I", 54321:32,                                  % +  9 = 102
-        3,"two", "S", 13:32, "A long string",                    % + 22 = 124
-        4,"byte",      "b", 255:8,                               % +  7 = 131
-        4,"long",      "l", 1234567890:64,                       % + 14 = 145
-        5,"short",     "s", 655:16,                              % +  9 = 154
-        4,"bool",      "t", 1,                                   % +  7 = 161
-        6,"binary",    "x", 15:32, "a binary string",            % + 27 = 188
-        4,"void",      "V",                                      % +  6 = 194
-        5,"array",     "A", 23:32,                               % + 11 = 205
-        "I", 54321:32,                                           % +  5 = 210
-        "S", 13:32, "A long string"                              % + 18 = 228
-      >>),
+    Table = [{<<"longstr">>,   longstr,   <<"Here is a long string">>},
+             {<<"signedint">>, signedint, 12345},
+             {<<"decimal">>,   decimal,   {3, 123456}},
+             {<<"timestamp">>, timestamp, 109876543209876},
+             {<<"table">>,     table,     [{<<"one">>, signedint, 54321},
+                                           {<<"two">>, longstr,
+                                            <<"A long string">>}]},
+             {<<"byte">>,      byte,      255},
+             {<<"long">>,      long,      1234567890},
+             {<<"short">>,     short,     655},
+             {<<"bool">>,      bool,      true},
+             {<<"binary">>,    binary,    <<"a binary string">>},
+             {<<"void">>,      void,      undefined},
+             {<<"array">>,     array,     [{signedint, 54321},
+                                           {longstr, <<"A long string">>}]}
+            ],
+    Binary = <<
+               7,"longstr",   "S", 21:32, "Here is a long string",
+               9,"signedint", "I", 12345:32/signed,
+               7,"decimal",   "D", 3, 123456:32,
+               9,"timestamp", "T", 109876543209876:64,
+               5,"table",     "F", 31:32, % length of table
+               3,"one",       "I", 54321:32,
+               3,"two",       "S", 13:32, "A long string",
+               4,"byte",      "b", 255:8,
+               4,"long",      "l", 1234567890:64,
+               5,"short",     "s", 655:16,
+               4,"bool",      "t", 1,
+               6,"binary",    "x", 15:32, "a binary string",
+               4,"void",      "V",
+               5,"array",     "A", 23:32,
+               "I", 54321:32,
+               "S", 13:32, "A long string"
+             >>,
+    Binary = rabbit_binary_generator:generate_table(Table),
+    Table  = rabbit_binary_parser:parse_table(Binary),
     passed.
 
 %% Test that content frames don't exceed frame-max
@@ -515,8 +566,9 @@ test_topic_matching() ->
     XName = #resource{virtual_host = <<"/">>,
                       kind = exchange,
                       name = <<"test_exchange">>},
-    X = #exchange{name = XName, type = topic, durable = false,
-                  auto_delete = false, arguments = []},
+    X0 = #exchange{name = XName, type = topic, durable = false,
+                   auto_delete = false, arguments = []},
+    X = rabbit_exchange_decorator:set(X0),
     %% create
     rabbit_exchange_type_topic:validate(X),
     exchange_op_callback(X, create, []),
@@ -617,8 +669,8 @@ test_topic_matching() ->
 
 exchange_op_callback(X, Fun, Args) ->
     rabbit_misc:execute_mnesia_transaction(
-      fun () -> rabbit_exchange:callback(X, Fun, [transaction, X] ++ Args) end),
-    rabbit_exchange:callback(X, Fun, [none, X] ++ Args).
+      fun () -> rabbit_exchange:callback(X, Fun, transaction, [X] ++ Args) end),
+    rabbit_exchange:callback(X, Fun, none, [X] ++ Args).
 
 test_topic_expect_match(X, List) ->
     lists:foreach(
@@ -628,7 +680,6 @@ test_topic_expect_match(X, List) ->
                                              #'P_basic'{}, <<>>),
               Res = rabbit_exchange_type_topic:route(
                       X, #delivery{mandatory = false,
-                                   immediate = false,
                                    sender    = self(),
                                    message   = Message}),
               ExpectedRes = lists:map(
@@ -746,7 +797,9 @@ test_log_management_during_startup() ->
     ok = case catch control_action(start_app, []) of
              ok -> exit({got_success_but_expected_failure,
                          log_rotation_tty_no_handlers_test});
-             {error, {cannot_log_to_tty, _, _}} -> ok
+             {badrpc, {'EXIT', {rabbit,failure_during_boot,
+               {error,{cannot_log_to_tty,
+                       _, not_installed}}}}} -> ok
          end,
 
     %% fix sasl logging
@@ -770,7 +823,9 @@ test_log_management_during_startup() ->
     ok = case control_action(start_app, []) of
              ok -> exit({got_success_but_expected_failure,
                          log_rotation_no_write_permission_dir_test});
-             {error, {cannot_log_to_file, _, _}} -> ok
+             {badrpc, {'EXIT',
+               {rabbit, failure_during_boot,
+                {error, {cannot_log_to_file, _, _}}}}} -> ok
          end,
 
     %% start application with logging to a subdirectory which
@@ -781,8 +836,11 @@ test_log_management_during_startup() ->
     ok = case control_action(start_app, []) of
              ok -> exit({got_success_but_expected_failure,
                          log_rotatation_parent_dirs_test});
-             {error, {cannot_log_to_file, _,
-                      {error, {cannot_create_parent_dirs, _, eacces}}}} -> ok
+             {badrpc,
+              {'EXIT', {rabbit,failure_during_boot,
+                {error, {cannot_log_to_file, _,
+                  {error,
+                   {cannot_create_parent_dirs, _, eacces}}}}}}} -> ok
          end,
     ok = set_permissions(TmpDir, 8#00700),
     ok = set_permissions(TmpLog, 8#00600),
@@ -855,199 +913,60 @@ test_arguments_parser() ->
 
     passed.
 
-test_cluster_management() ->
-    %% 'cluster' and 'reset' should only work if the app is stopped
-    {error, _} = control_action(cluster, []),
-    {error, _} = control_action(reset, []),
-    {error, _} = control_action(force_reset, []),
+test_dynamic_mirroring() ->
+    %% Just unit tests of the node selection logic, see multi node
+    %% tests for the rest...
+    Test = fun ({NewM, NewSs, ExtraSs}, Policy, Params,
+                {MNode, SNodes, SSNodes}, All) ->
+                   {ok, M} = rabbit_mirror_queue_misc:module(Policy),
+                   {NewM, NewSs0} = M:suggested_queue_nodes(
+                                      Params, MNode, SNodes, SSNodes, All),
+                   NewSs1 = lists:sort(NewSs0),
+                   case dm_list_match(NewSs, NewSs1, ExtraSs) of
+                       ok    -> ok;
+                       error -> exit({no_match, NewSs, NewSs1, ExtraSs})
+                   end
+           end,
+
+    Test({a,[b,c],0},<<"all">>,'_',{a,[],   []},   [a,b,c]),
+    Test({a,[b,c],0},<<"all">>,'_',{a,[b,c],[b,c]},[a,b,c]),
+    Test({a,[b,c],0},<<"all">>,'_',{a,[d],  [d]},  [a,b,c]),
+
+    N = fun (Atoms) -> [list_to_binary(atom_to_list(A)) || A <- Atoms] end,
+
+    %% Add a node
+    Test({a,[b,c],0},<<"nodes">>,N([a,b,c]),{a,[b],[b]},[a,b,c,d]),
+    Test({b,[a,c],0},<<"nodes">>,N([a,b,c]),{b,[a],[a]},[a,b,c,d]),
+    %% Add two nodes and drop one
+    Test({a,[b,c],0},<<"nodes">>,N([a,b,c]),{a,[d],[d]},[a,b,c,d]),
+    %% Don't try to include nodes that are not running
+    Test({a,[b],  0},<<"nodes">>,N([a,b,f]),{a,[b],[b]},[a,b,c,d]),
+    %% If we can't find any of the nodes listed then just keep the master
+    Test({a,[],   0},<<"nodes">>,N([f,g,h]),{a,[b],[b]},[a,b,c,d]),
+    %% And once that's happened, still keep the master even when not listed,
+    %% if nothing is synced
+    Test({a,[b,c],0},<<"nodes">>,N([b,c]),  {a,[], []}, [a,b,c,d]),
+    Test({a,[b,c],0},<<"nodes">>,N([b,c]),  {a,[b],[]}, [a,b,c,d]),
+    %% But if something is synced we can lose the master - but make
+    %% sure we pick the new master from the nodes which are synced!
+    Test({b,[c],  0},<<"nodes">>,N([b,c]),  {a,[b],[b]},[a,b,c,d]),
+    Test({b,[c],  0},<<"nodes">>,N([c,b]),  {a,[b],[b]},[a,b,c,d]),
+
+    Test({a,[],   1},<<"exactly">>,2,{a,[],   []},   [a,b,c,d]),
+    Test({a,[],   2},<<"exactly">>,3,{a,[],   []},   [a,b,c,d]),
+    Test({a,[c],  0},<<"exactly">>,2,{a,[c],  [c]},  [a,b,c,d]),
+    Test({a,[c],  1},<<"exactly">>,3,{a,[c],  [c]},  [a,b,c,d]),
+    Test({a,[c],  0},<<"exactly">>,2,{a,[c,d],[c,d]},[a,b,c,d]),
+    Test({a,[c,d],0},<<"exactly">>,3,{a,[c,d],[c,d]},[a,b,c,d]),
 
-    ok = control_action(stop_app, []),
-
-    %% various ways of creating a standalone node
-    NodeS = atom_to_list(node()),
-    ClusteringSequence = [[],
-                          [NodeS],
-                          ["invalid@invalid", NodeS],
-                          [NodeS, "invalid@invalid"]],
-
-    ok = control_action(reset, []),
-    lists:foreach(fun (Arg) ->
-                          ok = control_action(force_cluster, Arg),
-                          ok
-                  end,
-                  ClusteringSequence),
-    lists:foreach(fun (Arg) ->
-                          ok = control_action(reset, []),
-                          ok = control_action(force_cluster, Arg),
-                          ok
-                  end,
-                  ClusteringSequence),
-    ok = control_action(reset, []),
-    lists:foreach(fun (Arg) ->
-                          ok = control_action(force_cluster, Arg),
-                          ok = control_action(start_app, []),
-                          ok = control_action(stop_app, []),
-                          ok
-                  end,
-                  ClusteringSequence),
-    lists:foreach(fun (Arg) ->
-                          ok = control_action(reset, []),
-                          ok = control_action(force_cluster, Arg),
-                          ok = control_action(start_app, []),
-                          ok = control_action(stop_app, []),
-                          ok
-                  end,
-                  ClusteringSequence),
-
-    %% convert a disk node into a ram node
-    ok = control_action(reset, []),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, []),
-    ok = assert_disc_node(),
-    ok = control_action(force_cluster, ["invalid1@invalid",
-                                        "invalid2@invalid"]),
-    ok = assert_ram_node(),
-
-    %% join a non-existing cluster as a ram node
-    ok = control_action(reset, []),
-    ok = control_action(force_cluster, ["invalid1@invalid",
-                                        "invalid2@invalid"]),
-    ok = assert_ram_node(),
-
-    ok = control_action(reset, []),
-
-    SecondaryNode = rabbit_nodes:make("hare"),
-    case net_adm:ping(SecondaryNode) of
-        pong -> passed = test_cluster_management2(SecondaryNode);
-        pang -> io:format("Skipping clustering tests with node ~p~n",
-                          [SecondaryNode])
-    end,
-
-    ok = control_action(start_app, []),
     passed.
 
-test_cluster_management2(SecondaryNode) ->
-    NodeS = atom_to_list(node()),
-    SecondaryNodeS = atom_to_list(SecondaryNode),
-
-    %% make a disk node
-    ok = control_action(cluster, [NodeS]),
-    ok = assert_disc_node(),
-    %% make a ram node
-    ok = control_action(reset, []),
-    ok = control_action(cluster, [SecondaryNodeS]),
-    ok = assert_ram_node(),
-
-    %% join cluster as a ram node
-    ok = control_action(reset, []),
-    ok = control_action(force_cluster, [SecondaryNodeS, "invalid1@invalid"]),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, []),
-    ok = assert_ram_node(),
-
-    %% ram node will not start by itself
-    ok = control_action(stop_app, []),
-    ok = control_action(stop_app, SecondaryNode, [], []),
-    {error, _} = control_action(start_app, []),
-    ok = control_action(start_app, SecondaryNode, [], []),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, []),
-
-    %% change cluster config while remaining in same cluster
-    ok = control_action(force_cluster, ["invalid2@invalid", SecondaryNodeS]),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, []),
-
-    %% join non-existing cluster as a ram node
-    ok = control_action(force_cluster, ["invalid1@invalid",
-                                        "invalid2@invalid"]),
-    {error, _} = control_action(start_app, []),
-    ok = assert_ram_node(),
-
-    %% join empty cluster as a ram node (converts to disc)
-    ok = control_action(cluster, []),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, []),
-    ok = assert_disc_node(),
-
-    %% make a new ram node
-    ok = control_action(reset, []),
-    ok = control_action(force_cluster, [SecondaryNodeS]),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, []),
-    ok = assert_ram_node(),
-
-    %% turn ram node into disk node
-    ok = control_action(cluster, [SecondaryNodeS, NodeS]),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, []),
-    ok = assert_disc_node(),
-
-    %% convert a disk node into a ram node
-    ok = assert_disc_node(),
-    ok = control_action(force_cluster, ["invalid1@invalid",
-                                        "invalid2@invalid"]),
-    ok = assert_ram_node(),
-
-    %% make a new disk node
-    ok = control_action(force_reset, []),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, []),
-    ok = assert_disc_node(),
-
-    %% turn a disk node into a ram node
-    ok = control_action(reset, []),
-    ok = control_action(cluster, [SecondaryNodeS]),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, []),
-    ok = assert_ram_node(),
-
-    %% NB: this will log an inconsistent_database error, which is harmless
-    %% Turning cover on / off is OK even if we're not in general using cover,
-    %% it just turns the engine on / off, doesn't actually log anything.
-    cover:stop([SecondaryNode]),
-    true = disconnect_node(SecondaryNode),
-    pong = net_adm:ping(SecondaryNode),
-    cover:start([SecondaryNode]),
-
-    %% leaving a cluster as a ram node
-    ok = control_action(reset, []),
-    %% ...and as a disk node
-    ok = control_action(cluster, [SecondaryNodeS, NodeS]),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, []),
-    cover:stop(SecondaryNode),
-    ok = control_action(reset, []),
-    cover:start(SecondaryNode),
-
-    %% attempt to leave cluster when no other node is alive
-    ok = control_action(cluster, [SecondaryNodeS, NodeS]),
-    ok = control_action(start_app, []),
-    ok = control_action(stop_app, SecondaryNode, [], []),
-    ok = control_action(stop_app, []),
-    {error, {no_running_cluster_nodes, _, _}} =
-        control_action(reset, []),
-
-    %% attempt to change type when no other node is alive
-    {error, {no_running_cluster_nodes, _, _}} =
-        control_action(cluster, [SecondaryNodeS]),
-
-    %% leave system clustered, with the secondary node as a ram node
-    ok = control_action(force_reset, []),
-    ok = control_action(start_app, []),
-    %% Yes, this is rather ugly. But since we're a clustered Mnesia
-    %% node and we're telling another clustered node to reset itself,
-    %% we will get disconnected half way through causing a
-    %% badrpc. This never happens in real life since rabbitmqctl is
-    %% not a clustered Mnesia node.
-    cover:stop(SecondaryNode),
-    {badrpc, nodedown} = control_action(force_reset, SecondaryNode, [], []),
-    pong = net_adm:ping(SecondaryNode),
-    cover:start(SecondaryNode),
-    ok = control_action(cluster, SecondaryNode, [NodeS], []),
-    ok = control_action(start_app, SecondaryNode, [], []),
-
-    passed.
+%% Does the first list match the second where the second is required
+%% to have exactly Extra superfluous items?
+dm_list_match([],     [],      0)     -> ok;
+dm_list_match(_,      [],     _Extra) -> error;
+dm_list_match([H|T1], [H |T2], Extra) -> dm_list_match(T1, T2, Extra);
+dm_list_match(L1,     [_H|T2], Extra) -> dm_list_match(L1, T2, Extra - 1).
 
 test_user_management() ->
 
@@ -1134,22 +1053,21 @@ test_runtime_parameters() ->
     Bad  = fun(L) -> {error_string, _} = control_action(set_parameter, L) end,
 
     %% Acceptable for bijection
-    Good(["test", "good", "<<\"ignore\">>"]),
+    Good(["test", "good", "\"ignore\""]),
     Good(["test", "good", "123"]),
     Good(["test", "good", "true"]),
     Good(["test", "good", "false"]),
     Good(["test", "good", "null"]),
-    Good(["test", "good", "[{<<\"key\">>, <<\"value\">>}]"]),
+    Good(["test", "good", "{\"key\": \"value\"}"]),
 
-    %% Various forms of fail due to non-bijectability
+    %% Invalid JSON
     Bad(["test", "good", "atom"]),
-    Bad(["test", "good", "{tuple, foo}"]),
-    Bad(["test", "good", "[{<<\"key\">>, <<\"value\">>, 1}]"]),
-    Bad(["test", "good", "[{key, <<\"value\">>}]"]),
+    Bad(["test", "good", "{\"foo\": \"bar\""]),
+    Bad(["test", "good", "{foo: \"bar\"}"]),
 
     %% Test actual validation hook
-    Good(["test", "maybe", "<<\"good\">>"]),
-    Bad(["test", "maybe", "<<\"bad\">>"]),
+    Good(["test", "maybe", "\"good\""]),
+    Bad(["test", "maybe", "\"bad\""]),
 
     ok = control_action(list_parameters, []),
 
@@ -1157,25 +1075,100 @@ test_runtime_parameters() ->
     ok = control_action(clear_parameter, ["test", "maybe"]),
     {error_string, _} =
         control_action(clear_parameter, ["test", "neverexisted"]),
+
+    %% We can clear a parameter for a component that no longer exists
+    Good(["test", "good", "\"ignore\""]),
     rabbit_runtime_parameters_test:unregister(),
+    ok = control_action(clear_parameter, ["test", "good"]),
+    passed.
+
+test_policy_validation() ->
+    rabbit_runtime_parameters_test:register_policy_validator(),
+    SetPol = fun (Key, Val) ->
+                     control_action_opts(
+                       ["set_policy", "name", ".*",
+                        rabbit_misc:format("{\"~s\":~p}", [Key, Val])])
+             end,
+
+    ok    = SetPol("testeven", []),
+    ok    = SetPol("testeven", [1, 2]),
+    ok    = SetPol("testeven", [1, 2, 3, 4]),
+    ok    = SetPol("testpos",  [2, 5, 5678]),
+
+    error = SetPol("testpos",  [-1, 0, 1]),
+    error = SetPol("testeven", [ 1, 2, 3]),
+
+    ok = control_action(clear_policy, ["name"]),
+    rabbit_runtime_parameters_test:unregister_policy_validator(),
+    passed.
+
+test_policy_opts_validation() ->
+    Set  = fun (Extra) -> control_action_opts(
+                            ["set_policy", "name", ".*", "{\"ha-mode\":\"all\"}"
+                             | Extra]) end,
+    OK   = fun (Extra) -> ok = Set(Extra) end,
+    Fail = fun (Extra) -> error = Set(Extra) end,
+
+    OK  ([]),
+
+    OK  (["--priority", "0"]),
+    OK  (["--priority", "3"]),
+    Fail(["--priority", "banana"]),
+    Fail(["--priority"]),
+
+    OK  (["--apply-to", "all"]),
+    OK  (["--apply-to", "queues"]),
+    Fail(["--apply-to", "bananas"]),
+    Fail(["--apply-to"]),
+
+    OK  (["--priority", "3",      "--apply-to", "queues"]),
+    Fail(["--priority", "banana", "--apply-to", "queues"]),
+    Fail(["--priority", "3",      "--apply-to", "bananas"]),
+
+    Fail(["--offline"]),
+
+    ok = control_action(clear_policy, ["name"]),
+    passed.
+
+test_ha_policy_validation() ->
+    Set  = fun (JSON) -> control_action_opts(
+                           ["set_policy", "name", ".*", JSON]) end,
+    OK   = fun (JSON) -> ok = Set(JSON) end,
+    Fail = fun (JSON) -> error = Set(JSON) end,
+
+    OK  ("{\"ha-mode\":\"all\"}"),
+    Fail("{\"ha-mode\":\"made_up\"}"),
+
+    Fail("{\"ha-mode\":\"nodes\"}"),
+    Fail("{\"ha-mode\":\"nodes\",\"ha-params\":2}"),
+    Fail("{\"ha-mode\":\"nodes\",\"ha-params\":[\"a\",2]}"),
+    OK  ("{\"ha-mode\":\"nodes\",\"ha-params\":[\"a\",\"b\"]}"),
+    Fail("{\"ha-params\":[\"a\",\"b\"]}"),
+
+    Fail("{\"ha-mode\":\"exactly\"}"),
+    Fail("{\"ha-mode\":\"exactly\",\"ha-params\":[\"a\",\"b\"]}"),
+    OK  ("{\"ha-mode\":\"exactly\",\"ha-params\":2}"),
+    Fail("{\"ha-params\":2}"),
+
+    OK  ("{\"ha-mode\":\"all\",\"ha-sync-mode\":\"manual\"}"),
+    OK  ("{\"ha-mode\":\"all\",\"ha-sync-mode\":\"automatic\"}"),
+    Fail("{\"ha-mode\":\"all\",\"ha-sync-mode\":\"made_up\"}"),
+    Fail("{\"ha-sync-mode\":\"manual\"}"),
+    Fail("{\"ha-sync-mode\":\"automatic\"}"),
+
+    ok = control_action(clear_policy, ["name"]),
     passed.
 
 test_server_status() ->
     %% create a few things so there is some useful information to list
-    Writer = spawn(fun () -> receive shutdown -> ok end end),
-    {ok, Ch} = rabbit_channel:start_link(
-                 1, self(), Writer, self(), "", rabbit_framing_amqp_0_9_1,
-                 user(<<"user">>), <<"/">>, [], self(),
-                 rabbit_limiter:make_token(self())),
+    {_Writer, Limiter, Ch} = test_channel(),
     [Q, Q2] = [Queue || Name <- [<<"foo">>, <<"bar">>],
                         {new, Queue = #amqqueue{}} <-
                             [rabbit_amqqueue:declare(
                                rabbit_misc:r(<<"/">>, queue, Name),
                                false, false, [], none)]],
-
     ok = rabbit_amqqueue:basic_consume(
-           Q, true, Ch, rabbit_limiter:make_token(),
-           <<"ctag">>, true, undefined),
+           Q, true, Ch, Limiter, false, <<"ctag">>, true, none, undefined),
 
     %% list queues
     ok = info_action(list_queues, rabbit_amqqueue:info_keys(), true),
@@ -1195,11 +1188,9 @@ test_server_status() ->
             rabbit_misc:r(<<"/">>, queue, <<"foo">>)),
 
     %% list connections
-    [#listener{host = H, port = P} | _] =
-        [L || L = #listener{node = N} <- rabbit_networking:active_listeners(),
-              N =:= node()],
-
-    {ok, _C} = gen_tcp:connect(H, P, []),
+    {H, P} = find_listener(),
+    {ok, C} = gen_tcp:connect(H, P, []),
+    gen_tcp:send(C, <<"AMQP", 0, 0, 9, 1>>),
     timer:sleep(100),
     ok = info_action(list_connections,
                      rabbit_networking:connection_info_keys(), false),
@@ -1215,7 +1206,18 @@ test_server_status() ->
     ok = control_action(list_consumers, []),
 
     %% set vm memory high watermark
+    HWM = vm_memory_monitor:get_vm_memory_high_watermark(),
+    ok = control_action(set_vm_memory_high_watermark, ["1"]),
     ok = control_action(set_vm_memory_high_watermark, ["1.0"]),
+    %% this will trigger an alarm
+    ok = control_action(set_vm_memory_high_watermark, ["0.0"]),
+    %% reset
+    ok = control_action(set_vm_memory_high_watermark, [float_to_list(HWM)]),
+
+    %% eval
+    {error_string, _} = control_action(eval, ["\""]),
+    {error_string, _} = control_action(eval, ["a("]),
+    ok = control_action(eval, ["a."]),
 
     %% cleanup
     [{ok, _} = rabbit_amqqueue:delete(QR, false, false) || QR <- [Q, Q2]],
@@ -1225,22 +1227,48 @@ test_server_status() ->
 
     passed.
 
+test_amqp_connection_refusal() ->
+    [passed = test_amqp_connection_refusal(V) ||
+        V <- [<<"AMQP",9,9,9,9>>, <<"AMQP",0,1,0,0>>, <<"XXXX",0,0,9,1>>]],
+    passed.
+
+test_amqp_connection_refusal(Header) ->
+    {H, P} = find_listener(),
+    {ok, C} = gen_tcp:connect(H, P, [binary, {active, false}]),
+    ok = gen_tcp:send(C, Header),
+    {ok, <<"AMQP",0,0,9,1>>} = gen_tcp:recv(C, 8, 100),
+    ok = gen_tcp:close(C),
+    passed.
+
+find_listener() ->
+    [#listener{host = H, port = P} | _] =
+        [L || L = #listener{node = N} <- rabbit_networking:active_listeners(),
+              N =:= node()],
+    {H, P}.
+
 test_writer(Pid) ->
     receive
-        shutdown               -> ok;
-        {send_command, Method} -> Pid ! Method, test_writer(Pid)
+        {'$gen_call', From, flush} -> gen_server:reply(From, ok),
+                                      test_writer(Pid);
+        {send_command, Method}     -> Pid ! Method,
+                                      test_writer(Pid);
+        shutdown                   -> ok
     end.
 
-test_spawn() ->
+test_channel() ->
     Me = self(),
     Writer = spawn(fun () -> test_writer(Me) end),
+    {ok, Limiter} = rabbit_limiter:start_link(),
     {ok, Ch} = rabbit_channel:start_link(
                  1, Me, Writer, Me, "", rabbit_framing_amqp_0_9_1,
-                 user(<<"guest">>), <<"/">>, [], Me,
-                  rabbit_limiter:make_token(self())),
+                 user(<<"guest">>), <<"/">>, [], Me, Limiter),
+    {Writer, Limiter, Ch}.
+
+test_spawn() ->
+    {Writer, _Limiter, Ch} = test_channel(),
     ok = rabbit_channel:do(Ch, #'channel.open'{}),
     receive #'channel.open_ok'{} -> ok
-    after 1000 -> throw(failed_to_receive_channel_open_ok)
+    after ?TIMEOUT -> throw(failed_to_receive_channel_open_ok)
     end,
     {Writer, Ch}.
 
@@ -1261,7 +1289,7 @@ test_spawn_remote() ->
                   end
           end),
     receive Res -> Res
-    after 1000  -> throw(failed_to_receive_result)
+    after ?TIMEOUT  -> throw(failed_to_receive_result)
     end.
 
 user(Username) ->
@@ -1281,13 +1309,10 @@ test_confirms() ->
                                             queue = Q0,
                                             exchange = <<"amq.direct">>,
                                             routing_key = "magic" }),
-                        receive #'queue.bind_ok'{} ->
-                                Q0
-                        after 1000 ->
-                                throw(failed_to_bind_queue)
+                        receive #'queue.bind_ok'{} -> Q0
+                        after ?TIMEOUT -> throw(failed_to_bind_queue)
                         end
-                after 1000 ->
-                        throw(failed_to_declare_queue)
+                after ?TIMEOUT -> throw(failed_to_declare_queue)
                 end
         end,
     %% Declare and bind two queues
@@ -1300,7 +1325,7 @@ test_confirms() ->
     rabbit_channel:do(Ch, #'confirm.select'{}),
     receive
         #'confirm.select_ok'{} -> ok
-    after 1000 -> throw(failed_to_enable_confirms)
+    after ?TIMEOUT -> throw(failed_to_enable_confirms)
     end,
     %% Publish a message
     rabbit_channel:do(Ch, #'basic.publish'{exchange = <<"amq.direct">>,
@@ -1317,7 +1342,7 @@ test_confirms() ->
     receive
         #'basic.nack'{} -> ok;
         #'basic.ack'{}  -> throw(received_ack_instead_of_nack)
-    after 2000 -> throw(did_not_receive_nack)
+    after ?TIMEOUT-> throw(did_not_receive_nack)
     end,
     receive
         #'basic.ack'{} -> throw(received_ack_when_none_expected)
@@ -1327,13 +1352,18 @@ test_confirms() ->
     rabbit_channel:do(Ch, #'queue.delete'{queue = QName2}),
     receive
         #'queue.delete_ok'{} -> ok
-    after 1000 -> throw(failed_to_cleanup_queue)
+    after ?TIMEOUT -> throw(failed_to_cleanup_queue)
     end,
     unlink(Ch),
     ok = rabbit_channel:shutdown(Ch),
 
     passed.
 
+test_with_state() ->
+    fhc_state = gen_server2:with_state(file_handle_cache,
+                                       fun (S) -> element(1, S) end),
+    passed.
+
 test_statistics_event_receiver(Pid) ->
     receive
         Foo -> Pid ! Foo, test_statistics_event_receiver(Pid)
@@ -1350,7 +1380,7 @@ test_statistics_receive_event1(Ch, Matcher) ->
                 true -> Props;
                 _    -> test_statistics_receive_event1(Ch, Matcher)
             end
-    after 1000 -> throw(failed_to_receive_event)
+    after ?TIMEOUT -> throw(failed_to_receive_event)
     end.
 
 test_statistics() ->
@@ -1362,12 +1392,10 @@ test_statistics() ->
     %% Set up a channel and queue
     {_Writer, Ch} = test_spawn(),
     rabbit_channel:do(Ch, #'queue.declare'{}),
-    QName = receive #'queue.declare_ok'{queue = Q0} ->
-                    Q0
-            after 1000 -> throw(failed_to_receive_queue_declare_ok)
+    QName = receive #'queue.declare_ok'{queue = Q0} -> Q0
+            after ?TIMEOUT -> throw(failed_to_receive_queue_declare_ok)
             end,
-    {ok, Q} = rabbit_amqqueue:lookup(rabbit_misc:r(<<"/">>, queue, QName)),
-    QPid = Q#amqqueue.pid,
+    QRes = rabbit_misc:r(<<"/">>, queue, QName),
     X = rabbit_misc:r(<<"/">>, exchange, <<"">>),
 
     rabbit_tests_event_receiver:start(self(), [node()], [channel_stats]),
@@ -1391,9 +1419,9 @@ test_statistics() ->
                        length(proplists:get_value(
                                 channel_queue_exchange_stats, E)) > 0
                end),
-    [{QPid,[{get,1}]}] = proplists:get_value(channel_queue_stats, Event2),
+    [{QRes, [{get,1}]}] = proplists:get_value(channel_queue_stats,    Event2),
     [{X,[{publish,1}]}] = proplists:get_value(channel_exchange_stats, Event2),
-    [{{QPid,X},[{publish,1}]}] =
+    [{{QRes,X},[{publish,1}]}] =
         proplists:get_value(channel_queue_exchange_stats, Event2),
 
     %% Check the stats remove stuff on queue deletion
@@ -1418,33 +1446,33 @@ test_refresh_events(SecondaryNode) ->
                                       [channel_created, queue_created]),
 
     {_Writer, Ch} = test_spawn(),
-    expect_events(Ch, channel_created),
+    expect_events(pid, Ch, channel_created),
     rabbit_channel:shutdown(Ch),
 
     {_Writer2, Ch2} = test_spawn(SecondaryNode),
-    expect_events(Ch2, channel_created),
+    expect_events(pid, Ch2, channel_created),
     rabbit_channel:shutdown(Ch2),
 
-    {new, #amqqueue { pid = QPid } = Q} =
+    {new, #amqqueue{name = QName} = Q} =
         rabbit_amqqueue:declare(test_queue(), false, false, [], none),
-    expect_events(QPid, queue_created),
+    expect_events(name, QName, queue_created),
     rabbit_amqqueue:delete(Q, false, false),
 
     rabbit_tests_event_receiver:stop(),
     passed.
 
-expect_events(Pid, Type) ->
-    expect_event(Pid, Type),
+expect_events(Tag, Key, Type) ->
+    expect_event(Tag, Key, Type),
     rabbit:force_event_refresh(),
-    expect_event(Pid, Type).
+    expect_event(Tag, Key, Type).
 
-expect_event(Pid, Type) ->
+expect_event(Tag, Key, Type) ->
     receive #event{type = Type, props = Props} ->
-            case pget(pid, Props) of
-                Pid -> ok;
-                _   -> expect_event(Pid, Type)
+            case pget(Tag, Props) of
+                Key -> ok;
+                _   -> expect_event(Tag, Key, Type)
             end
-    after 1000 -> throw({failed_to_receive_event, Type})
+    after ?TIMEOUT -> throw({failed_to_receive_event, Type})
     end.
 
 test_delegates_async(SecondaryNode) ->
@@ -1468,7 +1496,7 @@ make_responder(FMsg) -> make_responder(FMsg, timeout).
 make_responder(FMsg, Throw) ->
     fun () ->
             receive Msg -> FMsg(Msg)
-            after 1000 -> throw(Throw)
+            after ?TIMEOUT -> throw(Throw)
             end
     end.
 
@@ -1481,9 +1509,7 @@ await_response(Count) ->
     receive
         response -> ok,
                     await_response(Count - 1)
-    after 1000 ->
-            io:format("Async reply not received~n"),
-            throw(timeout)
+    after ?TIMEOUT -> throw(timeout)
     end.
 
 must_exit(Fun) ->
@@ -1550,7 +1576,7 @@ test_queue_cleanup(_SecondaryNode) ->
     rabbit_channel:do(Ch, #'queue.declare'{ queue = ?CLEANUP_QUEUE_NAME }),
     receive #'queue.declare_ok'{queue = ?CLEANUP_QUEUE_NAME} ->
             ok
-    after 1000 -> throw(failed_to_receive_queue_declare_ok)
+    after ?TIMEOUT -> throw(failed_to_receive_queue_declare_ok)
     end,
     rabbit_channel:shutdown(Ch),
     rabbit:stop(),
@@ -1561,8 +1587,7 @@ test_queue_cleanup(_SecondaryNode) ->
     receive
         #'channel.close'{reply_code = ?NOT_FOUND} ->
             ok
-    after 2000 ->
-            throw(failed_to_receive_channel_exit)
+    after ?TIMEOUT -> throw(failed_to_receive_channel_exit)
     end,
     rabbit_channel:shutdown(Ch2),
     passed.
@@ -1589,8 +1614,7 @@ test_declare_on_dead_queue(SecondaryNode) ->
             true = rabbit_misc:is_process_alive(Q#amqqueue.pid),
             {ok, 0} = rabbit_amqqueue:delete(Q, false, false),
             passed
-    after 2000 ->
-            throw(failed_to_create_and_kill_queue)
+    after ?TIMEOUT -> throw(failed_to_create_and_kill_queue)
     end.
 
 %%---------------------------------------------------------------------
@@ -1603,7 +1627,7 @@ control_action(Command, Args, NewOpts) ->
                    expand_options(default_options(), NewOpts)).
 
 control_action(Command, Node, Args, Opts) ->
-    case catch rabbit_control:action(
+    case catch rabbit_control_main:action(
                  Command, Node, Args, Opts,
                  fun (Format, Args1) ->
                          io:format(Format ++ " ...~n", Args1)
@@ -1616,9 +1640,21 @@ control_action(Command, Node, Args, Opts) ->
             Other
     end.
 
+control_action_opts(Raw) ->
+    NodeStr = atom_to_list(node()),
+    case rabbit_control_main:parse_arguments(Raw, NodeStr) of
+        {ok, {Cmd, Opts, Args}} ->
+            case control_action(Cmd, node(), Args, Opts) of
+                ok -> ok;
+                _  -> error
+            end;
+        _ ->
+            error
+    end.
+
 info_action(Command, Args, CheckVHost) ->
     ok = control_action(Command, []),
-    if CheckVHost -> ok = control_action(Command, []);
+    if CheckVHost -> ok = control_action(Command, [], ["-p", "/"]);
        true       -> ok
     end,
     ok = control_action(Command, lists:map(fun atom_to_list/1, Args)),
@@ -1679,15 +1715,15 @@ clean_logs(Files, Suffix) ->
     ok.
 
 assert_ram_node() ->
-    case rabbit_mnesia:is_disc_node() of
-        true  -> exit('not_ram_node');
-        false -> ok
+    case rabbit_mnesia:node_type() of
+        disc -> exit('not_ram_node');
+        ram  -> ok
     end.
 
 assert_disc_node() ->
-    case rabbit_mnesia:is_disc_node() of
-        true  -> ok;
-        false -> exit('not_disc_node')
+    case rabbit_mnesia:node_type() of
+        disc -> ok;
+        ram  -> exit('not_disc_node')
     end.
 
 delete_file(File) ->
@@ -1821,7 +1857,7 @@ on_disk_capture(OnDisk, Awaiting, Pid) ->
                             Pid);
         stop ->
             done
-    after (case Awaiting of [] -> 200; _ -> 1000 end) ->
+    after (case Awaiting of [] -> 200; _ -> ?TIMEOUT end) ->
             case Awaiting of
                 [] -> Pid ! {self(), arrived}, on_disk_capture();
                 _  -> Pid ! {self(), timeout}
@@ -2301,6 +2337,10 @@ variable_queue_publish(IsPersistent, Count, VQ) ->
     variable_queue_publish(IsPersistent, Count, fun (_N, P) -> P end, VQ).
 
 variable_queue_publish(IsPersistent, Count, PropFun, VQ) ->
+    variable_queue_publish(IsPersistent, 1, Count, PropFun,
+                           fun (_N) -> <<>> end, VQ).
+
+variable_queue_publish(IsPersistent, Start, Count, PropFun, PayloadFun, VQ) ->
     lists:foldl(
       fun (N, VQN) ->
               rabbit_variable_queue:publish(
@@ -2309,16 +2349,18 @@ variable_queue_publish(IsPersistent, Count, PropFun, VQ) ->
                   <<>>, #'P_basic'{delivery_mode = case IsPersistent of
                                                        true  -> 2;
                                                        false -> 1
-                                                   end}, <<>>),
-                PropFun(N, #message_properties{}), self(), VQN)
-      end, VQ, lists:seq(1, Count)).
+                                                   end},
+                                   PayloadFun(N)),
+                PropFun(N, #message_properties{}), false, self(), VQN)
+      end, VQ, lists:seq(Start, Start + Count - 1)).
 
 variable_queue_fetch(Count, IsPersistent, IsDelivered, Len, VQ) ->
     lists:foldl(fun (N, {VQN, AckTagsAcc}) ->
                         Rem = Len - N,
                         {{#basic_message { is_persistent = IsPersistent },
-                          IsDelivered, AckTagN, Rem}, VQM} =
+                          IsDelivered, AckTagN}, VQM} =
                             rabbit_variable_queue:fetch(true, VQN),
+                        Rem = rabbit_variable_queue:len(VQM),
                         {VQM, [AckTagN | AckTagsAcc]}
                 end, {VQ, []}, lists:seq(1, Count)).
 
@@ -2361,8 +2403,8 @@ publish_and_confirm(Q, Payload, Count) ->
          Msg = rabbit_basic:message(rabbit_misc:r(<<>>, exchange, <<>>),
                                     <<>>, #'P_basic'{delivery_mode = 2},
                                     Payload),
-         Delivery = #delivery{mandatory = false, immediate = false,
-                              sender = self(), message = Msg, msg_seq_no = Seq},
+         Delivery = #delivery{mandatory = false, sender = self(),
+                              message = Msg, msg_seq_no = Seq},
          {routed, _} = rabbit_amqqueue:deliver([Q], Delivery)
      end || Seq <- Seqs],
     wait_for_confirms(gb_sets:from_list(Seqs)).
@@ -2374,7 +2416,7 @@ wait_for_confirms(Unconfirmed) ->
                          wait_for_confirms(
                            rabbit_misc:gb_sets_difference(
                              Unconfirmed, gb_sets:from_list(Confirmed)))
-                 after 5000 -> exit(timeout_waiting_for_confirm)
+                 after ?TIMEOUT -> exit(timeout_waiting_for_confirm)
                  end
     end.
 
@@ -2384,37 +2426,141 @@ test_variable_queue() ->
               fun test_variable_queue_partial_segments_delta_thing/1,
               fun test_variable_queue_all_the_bits_not_covered_elsewhere1/1,
               fun test_variable_queue_all_the_bits_not_covered_elsewhere2/1,
-              fun test_dropwhile/1,
+              fun test_drop/1,
+              fun test_variable_queue_fold_msg_on_disk/1,
+              fun test_dropfetchwhile/1,
               fun test_dropwhile_varying_ram_duration/1,
+              fun test_fetchwhile_varying_ram_duration/1,
               fun test_variable_queue_ack_limiting/1,
-              fun test_variable_queue_requeue/1]],
+              fun test_variable_queue_purge/1,
+              fun test_variable_queue_requeue/1,
+              fun test_variable_queue_requeue_ram_beta/1,
+              fun test_variable_queue_fold/1]],
     passed.
 
-test_variable_queue_requeue(VQ0) ->
-    Interval = 50,
-    Count = rabbit_queue_index:next_segment_boundary(0) + 2 * Interval,
+test_variable_queue_fold(VQ0) ->
+    {PendingMsgs, RequeuedMsgs, FreshMsgs, VQ1} =
+        variable_queue_with_holes(VQ0),
+    Count = rabbit_variable_queue:depth(VQ1),
+    Msgs = lists:sort(PendingMsgs ++ RequeuedMsgs ++ FreshMsgs),
+    lists:foldl(fun (Cut, VQ2) ->
+                        test_variable_queue_fold(Cut, Msgs, PendingMsgs, VQ2)
+                end, VQ1, [0, 1, 2, Count div 2,
+                           Count - 1, Count, Count + 1, Count * 2]).
+
+test_variable_queue_fold(Cut, Msgs, PendingMsgs, VQ0) ->
+    {Acc, VQ1} = rabbit_variable_queue:fold(
+                   fun (M, _, Pending, A) ->
+                           MInt = msg2int(M),
+                           Pending = lists:member(MInt, PendingMsgs), %% assert
+                           case MInt =< Cut of
+                               true  -> {cont, [MInt | A]};
+                               false -> {stop, A}
+                           end
+                   end, [], VQ0),
+    Expected = lists:takewhile(fun (I) -> I =< Cut end, Msgs),
+    Expected = lists:reverse(Acc), %% assertion
+    VQ1.
+
+msg2int(#basic_message{content = #content{ payload_fragments_rev = P}}) ->
+    binary_to_term(list_to_binary(lists:reverse(P))).
+
+ack_subset(AckSeqs, Interval, Rem) ->
+    lists:filter(fun ({_Ack, N}) -> (N + Rem) rem Interval == 0 end, AckSeqs).
+
+requeue_one_by_one(Acks, VQ) ->
+    lists:foldl(fun (AckTag, VQN) ->
+                        {_MsgId, VQM} = rabbit_variable_queue:requeue(
+                                          [AckTag], VQN),
+                        VQM
+                end, VQ, Acks).
+
+%% Create a vq with messages in q1, delta, and q3, and holes (in the
+%% form of pending acks) in the latter two.
+variable_queue_with_holes(VQ0) ->
+    Interval = 64,
+    Count = rabbit_queue_index:next_segment_boundary(0)*2 + 2 * Interval,
     Seq = lists:seq(1, Count),
     VQ1 = rabbit_variable_queue:set_ram_duration_target(0, VQ0),
-    VQ2 = variable_queue_publish(false, Count, VQ1),
-    {VQ3, Acks} = variable_queue_fetch(Count, false, false, Count, VQ2),
-    Subset = lists:foldl(fun ({Ack, N}, Acc) when N rem Interval == 0 ->
-                                 [Ack | Acc];
-                             (_, Acc) ->
-                                 Acc
-                         end, [], lists:zip(Acks, Seq)),
-    {_MsgIds, VQ4} = rabbit_variable_queue:requeue(Acks -- Subset, VQ3),
-    VQ5 = lists:foldl(fun (AckTag, VQN) ->
-                              {_MsgId, VQM} = rabbit_variable_queue:requeue(
-                                                [AckTag], VQN),
-                              VQM
-                      end, VQ4, Subset),
-    VQ6 = lists:foldl(fun (AckTag, VQa) ->
-                              {{#basic_message{}, true, AckTag, _}, VQb} =
+    VQ2 = variable_queue_publish(
+            false, 1, Count,
+            fun (_, P) -> P end, fun erlang:term_to_binary/1, VQ1),
+    {VQ3, AcksR} = variable_queue_fetch(Count, false, false, Count, VQ2),
+    Acks = lists:reverse(AcksR),
+    AckSeqs = lists:zip(Acks, Seq),
+    [{Subset1, _Seq1}, {Subset2, _Seq2}, {Subset3, Seq3}] =
+        [lists:unzip(ack_subset(AckSeqs, Interval, I)) || I <- [0, 1, 2]],
+    %% we requeue in three phases in order to exercise requeuing logic
+    %% in various vq states
+    {_MsgIds, VQ4} = rabbit_variable_queue:requeue(
+                       Acks -- (Subset1 ++ Subset2 ++ Subset3), VQ3),
+    VQ5 = requeue_one_by_one(Subset1, VQ4),
+    %% by now we have some messages (and holes) in delta
+    VQ6 = requeue_one_by_one(Subset2, VQ5),
+    VQ7 = rabbit_variable_queue:set_ram_duration_target(infinity, VQ6),
+    %% add the q1 tail
+    VQ8 = variable_queue_publish(
+            true, Count + 1, 64,
+            fun (_, P) -> P end, fun erlang:term_to_binary/1, VQ7),
+    %% assertions
+    [false = case V of
+                 {delta, _, 0, _} -> true;
+                 0                -> true;
+                 _                -> false
+             end || {K, V} <- rabbit_variable_queue:status(VQ8),
+                    lists:member(K, [q1, delta, q3])],
+    Depth = Count + 64,
+    Depth = rabbit_variable_queue:depth(VQ8),
+    Len = Depth - length(Subset3),
+    Len = rabbit_variable_queue:len(VQ8),
+    {Seq3, Seq -- Seq3, lists:seq(Count + 1, Count + 64), VQ8}.
+
+test_variable_queue_requeue(VQ0) ->
+    {_PendingMsgs, RequeuedMsgs, FreshMsgs, VQ1} =
+        variable_queue_with_holes(VQ0),
+    Msgs =
+        lists:zip(RequeuedMsgs,
+                  lists:duplicate(length(RequeuedMsgs), true)) ++
+        lists:zip(FreshMsgs,
+                  lists:duplicate(length(FreshMsgs), false)),
+    VQ2 = lists:foldl(fun ({I, Requeued}, VQa) ->
+                              {{M, MRequeued, _}, VQb} =
                                   rabbit_variable_queue:fetch(true, VQa),
+                              Requeued = MRequeued, %% assertion
+                              I = msg2int(M),       %% assertion
                               VQb
-                      end, VQ5, lists:reverse(Acks)),
-    {empty, VQ7} = rabbit_variable_queue:fetch(true, VQ6),
-    VQ7.
+                      end, VQ1, Msgs),
+    {empty, VQ3} = rabbit_variable_queue:fetch(true, VQ2),
+    VQ3.
+
+%% requeue from ram_pending_ack into q3, move to delta and then empty queue
+test_variable_queue_requeue_ram_beta(VQ0) ->
+    Count = rabbit_queue_index:next_segment_boundary(0)*2 + 2,
+    VQ1 = variable_queue_publish(false, Count, VQ0),
+    {VQ2, AcksR} = variable_queue_fetch(Count, false, false, Count, VQ1),
+    {Back, Front} = lists:split(Count div 2, AcksR),
+    {_, VQ3} = rabbit_variable_queue:requeue(erlang:tl(Back), VQ2),
+    VQ4 = rabbit_variable_queue:set_ram_duration_target(0, VQ3),
+    {_, VQ5} = rabbit_variable_queue:requeue([erlang:hd(Back)], VQ4),
+    VQ6 = requeue_one_by_one(Front, VQ5),
+    {VQ7, AcksAll} = variable_queue_fetch(Count, false, true, Count, VQ6),
+    {_, VQ8} = rabbit_variable_queue:ack(AcksAll, VQ7),
+    VQ8.
+
+test_variable_queue_purge(VQ0) ->
+    LenDepth = fun (VQ) ->
+                       {rabbit_variable_queue:len(VQ),
+                        rabbit_variable_queue:depth(VQ)}
+               end,
+    VQ1         = variable_queue_publish(false, 10, VQ0),
+    {VQ2, Acks} = variable_queue_fetch(6, false, false, 10, VQ1),
+    {4, VQ3}    = rabbit_variable_queue:purge(VQ2),
+    {0, 6}      = LenDepth(VQ3),
+    {_, VQ4}    = rabbit_variable_queue:requeue(lists:sublist(Acks, 2), VQ3),
+    {2, 6}      = LenDepth(VQ4),
+    VQ5         = rabbit_variable_queue:purge_acks(VQ4),
+    {2, 2}      = LenDepth(VQ5),
+    VQ5.
 
 test_variable_queue_ack_limiting(VQ0) ->
     %% start by sending in a bunch of messages
@@ -2445,41 +2591,86 @@ test_variable_queue_ack_limiting(VQ0) ->
 
     VQ6.
 
-test_dropwhile(VQ0) ->
+test_drop(VQ0) ->
+    %% start by publishing a single message
+    VQ1 = variable_queue_publish(false, 1, VQ0),
+    %% drop message with AckRequired = true; a real ack tag must come back
+    {{MsgId, AckTag}, VQ2} = rabbit_variable_queue:drop(true, VQ1),
+    true = rabbit_variable_queue:is_empty(VQ2),
+    true = AckTag =/= undefined,
+    %% drop again -> empty
+    {empty, VQ3} = rabbit_variable_queue:drop(false, VQ2),
+    %% requeue the pending message via its ack tag
+    {[MsgId], VQ4} = rabbit_variable_queue:requeue([AckTag], VQ3),
+    %% drop message with AckRequired = false; no ack tag is issued
+    {{MsgId, undefined}, VQ5} = rabbit_variable_queue:drop(false, VQ4),
+    true = rabbit_variable_queue:is_empty(VQ5),
+    VQ5.
+
+test_dropfetchwhile(VQ0) ->
     Count = 10,
 
     %% add messages with sequential expiry
     VQ1 = variable_queue_publish(
-            false, Count,
-            fun (N, Props) -> Props#message_properties{expiry = N} end, VQ0),
+            false, 1, Count,
+            fun (N, Props) -> Props#message_properties{expiry = N} end,
+            fun erlang:term_to_binary/1, VQ0),
+
+    %% fetch the first 5 messages
+    {#message_properties{expiry = 6}, {Msgs, AckTags}, VQ2} =
+        rabbit_variable_queue:fetchwhile(
+          fun (#message_properties{expiry = Expiry}) -> Expiry =< 5 end,
+          fun (Msg, AckTag, {MsgAcc, AckAcc}) ->
+                  {[Msg | MsgAcc], [AckTag | AckAcc]}
+          end, {[], []}, VQ1),
+    true = lists:seq(1, 5) == [msg2int(M) || M <- lists:reverse(Msgs)],
+
+    %% requeue them
+    {_MsgIds, VQ3} = rabbit_variable_queue:requeue(AckTags, VQ2),
 
     %% drop the first 5 messages
-    {undefined, VQ2} = rabbit_variable_queue:dropwhile(
-                         fun(#message_properties { expiry = Expiry }) ->
-                                 Expiry =< 5
-                         end, false, VQ1),
-
-    %% fetch five now
-    VQ3 = lists:foldl(fun (_N, VQN) ->
-                              {{#basic_message{}, _, _, _}, VQM} =
+    {#message_properties{expiry = 6}, VQ4} =
+        rabbit_variable_queue:dropwhile(
+          fun (#message_properties {expiry = Expiry}) -> Expiry =< 5 end, VQ3),
+
+    %% fetch 5
+    VQ5 = lists:foldl(fun (N, VQN) ->
+                              {{Msg, _, _}, VQM} =
                                   rabbit_variable_queue:fetch(false, VQN),
+                              true = msg2int(Msg) == N,
                               VQM
-                      end, VQ2, lists:seq(6, Count)),
+                      end, VQ4, lists:seq(6, Count)),
 
     %% should be empty now
-    {empty, VQ4} = rabbit_variable_queue:fetch(false, VQ3),
+    true = rabbit_variable_queue:is_empty(VQ5),
 
-    VQ4.
+    VQ5.
 
 test_dropwhile_varying_ram_duration(VQ0) ->
+    test_dropfetchwhile_varying_ram_duration(
+      fun (VQ1) ->
+              {_, VQ2} = rabbit_variable_queue:dropwhile(
+                           fun (_) -> false end, VQ1),
+              VQ2
+      end, VQ0).
+
+test_fetchwhile_varying_ram_duration(VQ0) ->
+    test_dropfetchwhile_varying_ram_duration(
+      fun (VQ1) ->
+              {_, ok, VQ2} = rabbit_variable_queue:fetchwhile(
+                               fun (_) -> false end,
+                               fun (_, _, A) -> A end,
+                               ok, VQ1),
+              VQ2
+      end, VQ0).
+
+test_dropfetchwhile_varying_ram_duration(Fun, VQ0) ->
     VQ1 = variable_queue_publish(false, 1, VQ0),
     VQ2 = rabbit_variable_queue:set_ram_duration_target(0, VQ1),
-    {undefined, VQ3} = rabbit_variable_queue:dropwhile(
-                         fun(_) -> false end, false, VQ2),
+    VQ3 = Fun(VQ2),
     VQ4 = rabbit_variable_queue:set_ram_duration_target(infinity, VQ3),
     VQ5 = variable_queue_publish(false, 1, VQ4),
-    {undefined, VQ6} =
-        rabbit_variable_queue:dropwhile(fun(_) -> false end, false, VQ5),
+    VQ6 = Fun(VQ5),
     VQ6.
 
 test_variable_queue_dynamic_duration_change(VQ0) ->
@@ -2514,7 +2705,8 @@ publish_fetch_and_ack(0, _Len, VQ0) ->
     VQ0;
 publish_fetch_and_ack(N, Len, VQ0) ->
     VQ1 = variable_queue_publish(false, 1, VQ0),
-    {{_Msg, false, AckTag, Len}, VQ2} = rabbit_variable_queue:fetch(true, VQ1),
+    {{_Msg, false, AckTag}, VQ2} = rabbit_variable_queue:fetch(true, VQ1),
+    Len = rabbit_variable_queue:len(VQ2),
     {_Guids, VQ3} = rabbit_variable_queue:ack([AckTag], VQ2),
     publish_fetch_and_ack(N-1, Len, VQ3).
 
@@ -2579,8 +2771,8 @@ test_variable_queue_all_the_bits_not_covered_elsewhere1(VQ0) ->
                                             Count, VQ4),
     _VQ6 = rabbit_variable_queue:terminate(shutdown, VQ5),
     VQ7 = variable_queue_init(test_amqqueue(true), true),
-    {{_Msg1, true, _AckTag1, Count1}, VQ8} =
-        rabbit_variable_queue:fetch(true, VQ7),
+    {{_Msg1, true, _AckTag1}, VQ8} = rabbit_variable_queue:fetch(true, VQ7),
+    Count1 = rabbit_variable_queue:len(VQ8),
     VQ9 = variable_queue_publish(false, 1, VQ8),
     VQ10 = rabbit_variable_queue:set_ram_duration_target(0, VQ9),
     {VQ11, _AckTags2} = variable_queue_fetch(Count1, true, true, Count, VQ10),
@@ -2599,6 +2791,13 @@ test_variable_queue_all_the_bits_not_covered_elsewhere2(VQ0) ->
     {empty, VQ8} = rabbit_variable_queue:fetch(false, VQ7),
     VQ8.
 
+test_variable_queue_fold_msg_on_disk(VQ0) ->
+    VQ1 = variable_queue_publish(true, 1, VQ0),
+    {VQ2, AckTags} = variable_queue_fetch(1, true, false, 1, VQ1),
+    {ok, VQ3} = rabbit_variable_queue:ackfold(fun (_M, _A, ok) -> ok end,
+                                              ok, VQ2, AckTags),
+    VQ3.
+
 test_queue_recover() ->
     Count = 2 * rabbit_queue_index:next_segment_boundary(0),
     {new, #amqqueue { pid = QPid, name = QName } = Q} =
@@ -2611,19 +2810,21 @@ test_queue_recover() ->
     after 10000 -> exit(timeout_waiting_for_queue_death)
     end,
     rabbit_amqqueue:stop(),
-    rabbit_amqqueue:start(),
+    rabbit_amqqueue:start(rabbit_amqqueue:recover()),
+    {ok, Limiter} = rabbit_limiter:start_link(),
     rabbit_amqqueue:with_or_die(
       QName,
       fun (Q1 = #amqqueue { pid = QPid1 }) ->
               CountMinusOne = Count - 1,
               {ok, CountMinusOne, {QName, QPid1, _AckTag, true, _Msg}} =
-                  rabbit_amqqueue:basic_get(Q1, self(), false),
+                  rabbit_amqqueue:basic_get(Q1, self(), false, Limiter),
               exit(QPid1, shutdown),
               VQ1 = variable_queue_init(Q, true),
-              {{_Msg1, true, _AckTag1, CountMinusOne}, VQ2} =
+              {{_Msg1, true, _AckTag1}, VQ2} =
                   rabbit_variable_queue:fetch(true, VQ1),
+              CountMinusOne = rabbit_variable_queue:len(VQ2),
               _VQ3 = rabbit_variable_queue:delete_and_terminate(shutdown, VQ2),
-              rabbit_amqqueue:internal_delete(QName, QPid1)
+              rabbit_amqqueue:internal_delete(QName)
       end),
     passed.
 
@@ -2637,9 +2838,11 @@ test_variable_queue_delete_msg_store_files_callback() ->
 
     rabbit_amqqueue:set_ram_duration_target(QPid, 0),
 
+    {ok, Limiter} = rabbit_limiter:start_link(),
+
     CountMinusOne = Count - 1,
     {ok, CountMinusOne, {QName, QPid, _AckTag, false, _Msg}} =
-        rabbit_amqqueue:basic_get(Q, self(), true),
+        rabbit_amqqueue:basic_get(Q, self(), true, Limiter),
     {ok, CountMinusOne} = rabbit_amqqueue:purge(Q),
 
     %% give the queue a second to receive the close_fds callback msg
diff --git a/src/rabbit_tests_event_receiver.erl b/src/rabbit_tests_event_receiver.erl
index 72c07b51..7b756cbc 100644
--- a/src/rabbit_tests_event_receiver.erl
+++ b/src/rabbit_tests_event_receiver.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_tests_event_receiver).
diff --git a/src/rabbit_trace.erl b/src/rabbit_trace.erl
index 3a5b96de..d0dcaa71 100644
--- a/src/rabbit_trace.erl
+++ b/src/rabbit_trace.erl
@@ -10,13 +10,13 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_trace).
 
--export([init/1, tracing/1, tap_trace_in/2, tap_trace_out/2, start/1, stop/1]).
+-export([init/1, enabled/1, tap_in/2, tap_out/2, start/1, stop/1]).
 
 -include("rabbit.hrl").
 -include("rabbit_framing.hrl").
@@ -31,9 +31,9 @@
 -type(state() :: rabbit_types:exchange() | 'none').
 
 -spec(init/1 :: (rabbit_types:vhost()) -> state()).
--spec(tracing/1 :: (rabbit_types:vhost()) -> boolean()).
--spec(tap_trace_in/2 :: (rabbit_types:basic_message(), state()) -> 'ok').
--spec(tap_trace_out/2 :: (rabbit_amqqueue:qmsg(), state()) -> 'ok').
+-spec(enabled/1 :: (rabbit_types:vhost()) -> boolean()).
+-spec(tap_in/2 :: (rabbit_types:basic_message(), state()) -> 'ok').
+-spec(tap_out/2 :: (rabbit_amqqueue:qmsg(), state()) -> 'ok').
 
 -spec(start/1 :: (rabbit_types:vhost()) -> 'ok').
 -spec(stop/1 :: (rabbit_types:vhost()) -> 'ok').
@@ -43,26 +43,26 @@
 %%----------------------------------------------------------------------------
 
 init(VHost) ->
-    case tracing(VHost) of
+    case enabled(VHost) of
         false -> none;
         true  -> {ok, X} = rabbit_exchange:lookup(
                              rabbit_misc:r(VHost, exchange, ?XNAME)),
                  X
     end.
 
-tracing(VHost) ->
+enabled(VHost) ->
     {ok, VHosts} = application:get_env(rabbit, ?TRACE_VHOSTS),
     lists:member(VHost, VHosts).
 
-tap_trace_in(Msg = #basic_message{exchange_name = #resource{name = XName}},
-             TraceX) ->
-    maybe_trace(TraceX, Msg, <<"publish">>, XName, []).
+tap_in(_Msg, none) -> ok;
+tap_in(Msg = #basic_message{exchange_name = #resource{name = XName}}, TraceX) ->
+    trace(TraceX, Msg, <<"publish">>, XName, []).
 
-tap_trace_out({#resource{name = QName}, _QPid, _QMsgId, Redelivered, Msg},
-              TraceX) ->
+tap_out(_Msg, none) -> ok;
+tap_out({#resource{name = QName}, _QPid, _QMsgId, Redelivered, Msg}, TraceX) ->
     RedeliveredNum = case Redelivered of true -> 1; false -> 0 end,
-    maybe_trace(TraceX, Msg, <<"deliver">>, QName,
-                [{<<"redelivered">>, signedint, RedeliveredNum}]).
+    trace(TraceX, Msg, <<"deliver">>, QName,
+          [{<<"redelivered">>, signedint, RedeliveredNum}]).
 
 %%----------------------------------------------------------------------------
 
@@ -83,14 +83,11 @@ update_config(Fun) ->
 
 %%----------------------------------------------------------------------------
 
-maybe_trace(none, _Msg, _RKPrefix, _RKSuffix, _Extra) ->
+trace(#exchange{name = Name}, #basic_message{exchange_name = Name},
+      _RKPrefix, _RKSuffix, _Extra) ->
     ok;
-maybe_trace(#exchange{name = Name}, #basic_message{exchange_name = Name},
-            _RKPrefix, _RKSuffix, _Extra) ->
-    ok;
-maybe_trace(X, Msg = #basic_message{content = #content{
-                                      payload_fragments_rev = PFR}},
-            RKPrefix, RKSuffix, Extra) ->
+trace(X, Msg = #basic_message{content = #content{payload_fragments_rev = PFR}},
+      RKPrefix, RKSuffix, Extra) ->
     {ok, _, _} = rabbit_basic:publish(
                    X, <<RKPrefix/binary, ".", RKSuffix/binary>>,
                    #'P_basic'{headers = msg_to_table(Msg) ++ Extra}, PFR),
diff --git a/src/rabbit_types.erl b/src/rabbit_types.erl
index 732c29b6..a36613db 100644
--- a/src/rabbit_types.erl
+++ b/src/rabbit_types.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_types).
@@ -64,12 +64,11 @@
         #basic_message{exchange_name  :: rabbit_exchange:name(),
                        routing_keys   :: [rabbit_router:routing_key()],
                        content        :: content(),
-                     id             :: msg_id(),
+                       id             :: msg_id(),
                        is_persistent  :: boolean()}).
 -type(message() :: basic_message()).
 -type(delivery() ::
         #delivery{mandatory :: boolean(),
-                  immediate :: boolean(),
                   sender    :: pid(),
                   message   :: message()}).
 -type(message_properties() ::
@@ -118,8 +117,7 @@
                   exclusive_owner :: rabbit_types:maybe(pid()),
                   arguments       :: rabbit_framing:amqp_table(),
                   pid             :: rabbit_types:maybe(pid()),
-                  slave_pids      :: [pid()],
-                  mirror_nodes    :: [node()] | 'undefined' | 'all'}).
+                  slave_pids      :: [pid()]}).
 
 -type(exchange() ::
         #exchange{name        :: rabbit_exchange:name(),
diff --git a/src/rabbit_upgrade.erl b/src/rabbit_upgrade.erl
index e1a7bcae..1047b823 100644
--- a/src/rabbit_upgrade.erl
+++ b/src/rabbit_upgrade.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_upgrade).
@@ -66,11 +66,11 @@
 %% into the boot process by prelaunch before the mnesia application is
 %% started. By the time Mnesia is started the upgrades have happened
 %% (on the primary), or Mnesia has been reset (on the secondary) and
-%% rabbit_mnesia:init_db/3 can then make the node rejoin the cluster
+%% rabbit_mnesia:init_db_unchecked/2 can then make the node rejoin the cluster
 %% in the normal way.
 %%
 %% The non-mnesia upgrades are then triggered by
-%% rabbit_mnesia:init_db/3. Of course, it's possible for a given
+%% rabbit_mnesia:init_db_unchecked/2. Of course, it's possible for a given
 %% upgrade process to only require Mnesia upgrades, or only require
 %% non-Mnesia upgrades. In the latter case no Mnesia resets and
 %% reclusterings occur.
@@ -121,19 +121,16 @@ remove_backup() ->
     info("upgrades: Mnesia backup removed~n", []).
 
 maybe_upgrade_mnesia() ->
-    %% rabbit_mnesia:all_clustered_nodes/0 will return [] at this point
-    %% if we are a RAM node since Mnesia has not started yet.
-    AllNodes = lists:usort(rabbit_mnesia:all_clustered_nodes() ++
-                               rabbit_mnesia:read_cluster_nodes_config()),
+    AllNodes = rabbit_mnesia:cluster_nodes(all),
     case rabbit_version:upgrades_required(mnesia) of
         {error, starting_from_scratch} ->
             ok;
         {error, version_not_available} ->
             case AllNodes of
-                [_] -> ok;
-                _   -> die("Cluster upgrade needed but upgrading from "
-                           "< 2.1.1.~nUnfortunately you will need to "
-                           "rebuild the cluster.", [])
+                [] -> die("Cluster upgrade needed but upgrading from "
+                          "< 2.1.1.~nUnfortunately you will need to "
+                          "rebuild the cluster.", []);
+                _  -> ok
             end;
         {error, _} = Err ->
             throw(Err);
@@ -150,12 +147,12 @@ maybe_upgrade_mnesia() ->
 upgrade_mode(AllNodes) ->
     case nodes_running(AllNodes) of
         [] ->
-            AfterUs = rabbit_mnesia:read_previously_running_nodes(),
-            case {is_disc_node_legacy(), AfterUs} of
-                {true, []}  ->
+            AfterUs = rabbit_mnesia:cluster_nodes(running) -- [node()],
+            case {node_type_legacy(), AfterUs} of
+                {disc, []}  ->
                     primary;
-                {true, _}  ->
-                    Filename = rabbit_mnesia:running_nodes_filename(),
+                {disc, _}  ->
+                    Filename = rabbit_node_monitor:running_nodes_filename(),
                     die("Cluster upgrade needed but other disc nodes shut "
                         "down after this one.~nPlease first start the last "
                         "disc node to shut down.~n~nNote: if several disc "
@@ -163,7 +160,7 @@ upgrade_mode(AllNodes) ->
                         "all~nshow this message. In which case, remove "
                         "the lock file on one of them and~nstart that node. "
                         "The lock file on this node is:~n~n ~s ", [Filename]);
-                {false, _} ->
+                {ram, _} ->
                     die("Cluster upgrade needed but this is a ram node.~n"
                         "Please first start the last disc node to shut down.",
                         [])
@@ -204,7 +201,7 @@ primary_upgrade(Upgrades, Nodes) ->
            mnesia,
            Upgrades,
            fun () ->
-                   force_tables(),
+                   rabbit_table:force_load(),
                    case Others of
                        [] -> ok;
                        _  -> info("mnesia upgrades: Breaking cluster~n", []),
@@ -214,23 +211,13 @@ primary_upgrade(Upgrades, Nodes) ->
            end),
     ok.
 
-force_tables() ->
-    [mnesia:force_load_table(T) || T <- rabbit_mnesia:table_names()].
-
 secondary_upgrade(AllNodes) ->
     %% must do this before we wipe out schema
-    IsDiscNode = is_disc_node_legacy(),
+    NodeType = node_type_legacy(),
     rabbit_misc:ensure_ok(mnesia:delete_schema([node()]),
                           cannot_delete_schema),
-    %% Note that we cluster with all nodes, rather than all disc nodes
-    %% (as we can't know all disc nodes at this point). This is safe as
-    %% we're not writing the cluster config, just setting up Mnesia.
-    ClusterNodes = case IsDiscNode of
-                       true  -> AllNodes;
-                       false -> AllNodes -- [node()]
-                   end,
     rabbit_misc:ensure_ok(mnesia:start(), cannot_start_mnesia),
-    ok = rabbit_mnesia:init_db(ClusterNodes, true, fun () -> ok end),
+    ok = rabbit_mnesia:init_db_unchecked(AllNodes, NodeType),
     ok = rabbit_version:record_desired_for_scope(mnesia),
     ok.
 
@@ -278,13 +265,16 @@ lock_filename() -> lock_filename(dir()).
 lock_filename(Dir) -> filename:join(Dir, ?LOCK_FILENAME).
 backup_dir() -> dir() ++ "-upgrade-backup".
 
-is_disc_node_legacy() ->
+node_type_legacy() ->
     %% This is pretty ugly but we can't start Mnesia and ask it (will
     %% hang), we can't look at the config file (may not include us
     %% even if we're a disc node).  We also can't use
-    %% rabbit_mnesia:is_disc_node/0 because that will give false
+    %% rabbit_mnesia:node_type/0 because that will give false
     %% positives on Rabbit up to 2.5.1.
-    filelib:is_regular(filename:join(dir(), "rabbit_durable_exchange.DCD")).
+    case filelib:is_regular(filename:join(dir(), "rabbit_durable_exchange.DCD")) of
+        true  -> disc;
+        false -> ram
+    end.
 
 %% NB: we cannot use rabbit_log here since it may not have been
 %% started yet
diff --git a/src/rabbit_upgrade_functions.erl b/src/rabbit_upgrade_functions.erl
index 485ccc5f..d50cb282 100644
--- a/src/rabbit_upgrade_functions.erl
+++ b/src/rabbit_upgrade_functions.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_upgrade_functions).
@@ -37,6 +37,14 @@
 -rabbit_upgrade({mirrored_supervisor,   mnesia, []}).
 -rabbit_upgrade({topic_trie_node,       mnesia, []}).
 -rabbit_upgrade({runtime_parameters,    mnesia, []}).
+-rabbit_upgrade({exchange_scratches,    mnesia, [exchange_scratch]}).
+-rabbit_upgrade({policy,                mnesia,
+                 [exchange_scratches, ha_mirrors]}).
+-rabbit_upgrade({sync_slave_pids,       mnesia, [policy]}).
+-rabbit_upgrade({no_mirror_nodes,       mnesia, [sync_slave_pids]}).
+-rabbit_upgrade({gm_pids,               mnesia, [no_mirror_nodes]}).
+-rabbit_upgrade({exchange_decorators,   mnesia, [policy]}).
+-rabbit_upgrade({policy_apply_to,       mnesia, [runtime_parameters]}).
 
 %% -------------------------------------------------------------------
 
@@ -58,6 +66,12 @@
 -spec(mirrored_supervisor/0   :: () -> 'ok').
 -spec(topic_trie_node/0       :: () -> 'ok').
 -spec(runtime_parameters/0    :: () -> 'ok').
+-spec(policy/0                :: () -> 'ok').
+-spec(sync_slave_pids/0       :: () -> 'ok').
+-spec(no_mirror_nodes/0       :: () -> 'ok').
+-spec(gm_pids/0               :: () -> 'ok').
+-spec(exchange_decorators/0   :: () -> 'ok').
+-spec(policy_apply_to/0       :: () -> 'ok').
 
 -endif.
 
@@ -193,15 +207,131 @@ runtime_parameters() ->
             {attributes, [key, value]},
             {disc_copies, [node()]}]).
 
+exchange_scratches() ->
+    ok = exchange_scratches(rabbit_exchange),
+    ok = exchange_scratches(rabbit_durable_exchange).
+
+exchange_scratches(Table) ->
+    transform(
+      Table,
+      fun ({exchange, Name, Type = <<"x-federation">>, Dur, AutoDel, Int, Args,
+            Scratch}) ->
+              Scratches = orddict:store(federation, Scratch, orddict:new()),
+              {exchange, Name, Type, Dur, AutoDel, Int, Args, Scratches};
+          %% We assert here that nothing else uses the scratch mechanism ATM
+          ({exchange, Name, Type, Dur, AutoDel, Int, Args, undefined}) ->
+              {exchange, Name, Type, Dur, AutoDel, Int, Args, undefined}
+      end,
+      [name, type, durable, auto_delete, internal, arguments, scratches]).
+
+policy() ->
+    ok = exchange_policy(rabbit_exchange),
+    ok = exchange_policy(rabbit_durable_exchange),
+    ok = queue_policy(rabbit_queue),
+    ok = queue_policy(rabbit_durable_queue).
+
+exchange_policy(Table) ->
+    transform(
+      Table,
+      fun ({exchange, Name, Type, Dur, AutoDel, Int, Args, Scratches}) ->
+              {exchange, Name, Type, Dur, AutoDel, Int, Args, Scratches,
+               undefined}
+      end,
+      [name, type, durable, auto_delete, internal, arguments, scratches,
+       policy]).
+
+queue_policy(Table) ->
+    transform(
+      Table,
+      fun ({amqqueue, Name, Dur, AutoDel, Excl, Args, Pid, SPids, MNodes}) ->
+              {amqqueue, Name, Dur, AutoDel, Excl, Args, Pid, SPids, MNodes,
+               undefined}
+      end,
+      [name, durable, auto_delete, exclusive_owner, arguments, pid,
+       slave_pids, mirror_nodes, policy]).
+
+sync_slave_pids() ->
+    Tables = [rabbit_queue, rabbit_durable_queue],
+    AddSyncSlavesFun =
+        fun ({amqqueue, N, D, AD, Excl, Args, Pid, SPids, MNodes, Pol}) ->
+                {amqqueue, N, D, AD, Excl, Args, Pid, SPids, [], MNodes, Pol}
+        end,
+    [ok = transform(T, AddSyncSlavesFun,
+                    [name, durable, auto_delete, exclusive_owner, arguments,
+                     pid, slave_pids, sync_slave_pids, mirror_nodes, policy])
+     || T <- Tables],
+    ok.
+
+no_mirror_nodes() ->
+    Tables = [rabbit_queue, rabbit_durable_queue],
+    RemoveMirrorNodesFun =
+        fun ({amqqueue, N, D, AD, O, A, Pid, SPids, SSPids, _MNodes, Pol}) ->
+                {amqqueue, N, D, AD, O, A, Pid, SPids, SSPids, Pol}
+        end,
+    [ok = transform(T, RemoveMirrorNodesFun,
+                    [name, durable, auto_delete, exclusive_owner, arguments,
+                     pid, slave_pids, sync_slave_pids, policy])
+     || T <- Tables],
+    ok.
+
+gm_pids() ->
+    Tables = [rabbit_queue, rabbit_durable_queue],
+    AddGMPidsFun =
+        fun ({amqqueue, N, D, AD, O, A, Pid, SPids, SSPids, Pol}) ->
+                {amqqueue, N, D, AD, O, A, Pid, SPids, SSPids, Pol, []}
+        end,
+    [ok = transform(T, AddGMPidsFun,
+                    [name, durable, auto_delete, exclusive_owner, arguments,
+                     pid, slave_pids, sync_slave_pids, policy, gm_pids])
+     || T <- Tables],
+    ok.
+
+exchange_decorators() ->
+    ok = exchange_decorators(rabbit_exchange),
+    ok = exchange_decorators(rabbit_durable_exchange).
+
+exchange_decorators(Table) ->
+    transform(
+      Table,
+      fun ({exchange, Name, Type, Dur, AutoDel, Int, Args, Scratches,
+            Policy}) ->
+              {exchange, Name, Type, Dur, AutoDel, Int, Args, Scratches, Policy,
+                {[], []}}
+      end,
+      [name, type, durable, auto_delete, internal, arguments, scratches, policy,
+       decorators]).
+
+policy_apply_to() ->
+    transform(
+      rabbit_runtime_parameters,
+      fun ({runtime_parameters, Key = {_VHost, <<"policy">>, _Name}, Value}) ->
+              ApplyTo = apply_to(proplists:get_value(<<"definition">>, Value)),
+              {runtime_parameters, Key, [{<<"apply-to">>, ApplyTo} | Value]};
+          ({runtime_parameters, Key, Value}) ->
+              {runtime_parameters, Key, Value}
+      end,
+      [key, value]),
+    rabbit_policy:invalidate(),
+    ok.
+
+apply_to(Def) ->
+    case [proplists:get_value(K, Def) ||
+             K <- [<<"federation-upstream-set">>, <<"ha-mode">>]] of
+        [undefined, undefined] -> <<"all">>;
+        [_,         undefined] -> <<"exchanges">>;
+        [undefined, _]         -> <<"queues">>;
+        [_,         _]         -> <<"all">>
+    end.
+
 %%--------------------------------------------------------------------
 
 transform(TableName, Fun, FieldList) ->
-    rabbit_mnesia:wait_for_tables([TableName]),
+    rabbit_table:wait([TableName]),
     {atomic, ok} = mnesia:transform_table(TableName, Fun, FieldList),
     ok.
 
 transform(TableName, Fun, FieldList, NewRecordName) ->
-    rabbit_mnesia:wait_for_tables([TableName]),
+    rabbit_table:wait([TableName]),
     {atomic, ok} = mnesia:transform_table(TableName, Fun, FieldList,
                                           NewRecordName),
     ok.
diff --git a/src/rabbit_variable_queue.erl b/src/rabbit_variable_queue.erl
index 49213c95..ac2b9f52 100644
--- a/src/rabbit_variable_queue.erl
+++ b/src/rabbit_variable_queue.erl
@@ -10,18 +10,19 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_variable_queue).
 
--export([init/3, terminate/2, delete_and_terminate/2, purge/1,
-         publish/4, publish_delivered/5, drain_confirmed/1,
-         dropwhile/3, fetch/2, ack/2, requeue/2, len/1, is_empty/1,
-         set_ram_duration_target/2, ram_duration/1, needs_timeout/1,
-         timeout/1, handle_pre_hibernate/1, status/1, invoke/3,
-         is_duplicate/2, discard/3, multiple_routing_keys/0, fold/3]).
+-export([init/3, terminate/2, delete_and_terminate/2, purge/1, purge_acks/1,
+         publish/5, publish_delivered/4, discard/3, drain_confirmed/1,
+         dropwhile/2, fetchwhile/4,
+         fetch/2, drop/2, ack/2, requeue/2, ackfold/4, fold/3, len/1,
+         is_empty/1, depth/1, set_ram_duration_target/2, ram_duration/1,
+         needs_timeout/1, timeout/1, handle_pre_hibernate/1, status/1, invoke/3,
+         is_duplicate/2, multiple_routing_keys/0]).
 
 -export([start/1, stop/0]).
 
@@ -254,16 +255,13 @@
           q3,
           q4,
           next_seq_id,
-          pending_ack,
-          pending_ack_index,
-          ram_ack_index,
+          ram_pending_ack,
+          disk_pending_ack,
           index_state,
           msg_store_clients,
           durable,
           transient_threshold,
 
-          async_callback,
-
           len,
           persistent_count,
 
@@ -348,16 +346,14 @@
              q3                    :: ?QUEUE:?QUEUE(),
              q4                    :: ?QUEUE:?QUEUE(),
              next_seq_id           :: seq_id(),
-             pending_ack           :: gb_tree(),
-             ram_ack_index         :: gb_tree(),
+             ram_pending_ack       :: gb_tree(),
+             disk_pending_ack      :: gb_tree(),
              index_state           :: any(),
              msg_store_clients     :: 'undefined' | {{any(), binary()},
                                                     {any(), binary()}},
              durable               :: boolean(),
              transient_threshold   :: non_neg_integer(),
 
-             async_callback        :: rabbit_backing_queue:async_callback(),
-
              len                   :: non_neg_integer(),
              persistent_count      :: non_neg_integer(),
 
@@ -426,7 +422,7 @@ init(Queue, Recover, AsyncCallback) ->
 init(#amqqueue { name = QueueName, durable = IsDurable }, false,
      AsyncCallback, MsgOnDiskFun, MsgIdxOnDiskFun) ->
     IndexState = rabbit_queue_index:init(QueueName, MsgIdxOnDiskFun),
-    init(IsDurable, IndexState, 0, [], AsyncCallback,
+    init(IsDurable, IndexState, 0, [],
          case IsDurable of
              true  -> msg_store_client_init(?PERSISTENT_MSG_STORE,
                                             MsgOnDiskFun, AsyncCallback);
@@ -454,7 +450,7 @@ init(#amqqueue { name = QueueName, durable = true }, true,
                   rabbit_msg_store:contains(MsgId, PersistentClient)
           end,
           MsgIdxOnDiskFun),
-    init(true, IndexState, DeltaCount, Terms1, AsyncCallback,
+    init(true, IndexState, DeltaCount, Terms1,
          PersistentClient, TransientClient).
 
 terminate(_Reason, State) ->
@@ -519,18 +515,19 @@ purge(State = #vqstate { q4                = Q4,
                               ram_msg_count     = 0,
                               persistent_count  = PCount1 })}.
 
+purge_acks(State) -> a(purge_pending_ack(false, State)).
+
 publish(Msg = #basic_message { is_persistent = IsPersistent, id = MsgId },
         MsgProps = #message_properties { needs_confirming = NeedsConfirming },
-        _ChPid, State = #vqstate { q1 = Q1, q3 = Q3, q4 = Q4,
-                                   next_seq_id      = SeqId,
-                                   len              = Len,
-                                   in_counter       = InCount,
-                                   persistent_count = PCount,
-                                   durable          = IsDurable,
-                                   ram_msg_count    = RamMsgCount,
-                                   unconfirmed      = UC }) ->
+        IsDelivered, _ChPid, State = #vqstate { q1 = Q1, q3 = Q3, q4 = Q4,
+                                                next_seq_id      = SeqId,
+                                                len              = Len,
+                                                in_counter       = InCount,
+                                                persistent_count = PCount,
+                                                durable          = IsDurable,
+                                                unconfirmed      = UC }) ->
     IsPersistent1 = IsDurable andalso IsPersistent,
-    MsgStatus = msg_status(IsPersistent1, SeqId, Msg, MsgProps),
+    MsgStatus = msg_status(IsPersistent1, IsDelivered, SeqId, Msg, MsgProps),
     {MsgStatus1, State1} = maybe_write_to_disk(false, false, MsgStatus, State),
     State2 = case ?QUEUE:is_empty(Q3) of
                  false -> State1 #vqstate { q1 = ?QUEUE:in(m(MsgStatus1), Q1) };
@@ -538,36 +535,25 @@ publish(Msg = #basic_message { is_persistent = IsPersistent, id = MsgId },
              end,
     PCount1 = PCount + one_if(IsPersistent1),
     UC1 = gb_sets_maybe_insert(NeedsConfirming, MsgId, UC),
-    a(reduce_memory_use(State2 #vqstate { next_seq_id      = SeqId   + 1,
-                                          len              = Len     + 1,
-                                          in_counter       = InCount + 1,
-                                          persistent_count = PCount1,
-                                          ram_msg_count    = RamMsgCount + 1,
-                                          unconfirmed      = UC1 })).
-
-publish_delivered(false, #basic_message { id = MsgId },
-                  #message_properties { needs_confirming = NeedsConfirming },
-                  _ChPid, State = #vqstate { async_callback = Callback,
-                                             len = 0 }) ->
-    case NeedsConfirming of
-        true  -> blind_confirm(Callback, gb_sets:singleton(MsgId));
-        false -> ok
-    end,
-    {undefined, a(State)};
-publish_delivered(true, Msg = #basic_message { is_persistent = IsPersistent,
-                                               id = MsgId },
+    a(reduce_memory_use(
+        inc_ram_msg_count(State2 #vqstate { next_seq_id      = SeqId   + 1,
+                                            len              = Len     + 1,
+                                            in_counter       = InCount + 1,
+                                            persistent_count = PCount1,
+                                            unconfirmed      = UC1 }))).
+
+publish_delivered(Msg = #basic_message { is_persistent = IsPersistent,
+                                         id = MsgId },
                   MsgProps = #message_properties {
                     needs_confirming = NeedsConfirming },
-                  _ChPid, State = #vqstate { len              = 0,
-                                             next_seq_id      = SeqId,
+                  _ChPid, State = #vqstate { next_seq_id      = SeqId,
                                              out_counter      = OutCount,
                                              in_counter       = InCount,
                                              persistent_count = PCount,
                                              durable          = IsDurable,
                                              unconfirmed      = UC }) ->
     IsPersistent1 = IsDurable andalso IsPersistent,
-    MsgStatus = (msg_status(IsPersistent1, SeqId, Msg, MsgProps))
-        #msg_status { is_delivered = true },
+    MsgStatus = msg_status(IsPersistent1, true, SeqId, Msg, MsgProps),
     {MsgStatus1, State1} = maybe_write_to_disk(false, false, MsgStatus, State),
     State2 = record_pending_ack(m(MsgStatus1), State1),
     PCount1 = PCount + one_if(IsPersistent1),
@@ -579,6 +565,8 @@ publish_delivered(true, Msg = #basic_message { is_persistent = IsPersistent,
                                   persistent_count = PCount1,
                                   unconfirmed      = UC1 }))}.
 
+discard(_MsgId, _ChPid, State) -> State.
+
 drain_confirmed(State = #vqstate { confirmed = C }) ->
     case gb_sets:is_empty(C) of
         true  -> {[], State}; %% common case
@@ -586,27 +574,28 @@ drain_confirmed(State = #vqstate { confirmed = C }) ->
                                         confirmed = gb_sets:new() }}
     end.
 
-dropwhile(Pred, AckRequired, State) -> dropwhile(Pred, AckRequired, State, []).
+dropwhile(Pred, State) ->
+    case queue_out(State) of
+        {empty, State1} ->
+            {undefined, a(State1)};
+        {{value, MsgStatus = #msg_status { msg_props = MsgProps }}, State1} ->
+            case Pred(MsgProps) of
+                true  -> {_, State2} = remove(false, MsgStatus, State1),
+                         dropwhile(Pred, State2);
+                false -> {MsgProps, a(in_r(MsgStatus, State1))}
+            end
+    end.
 
-dropwhile(Pred, AckRequired, State, Msgs) ->
-    End = fun(S) when AckRequired -> {lists:reverse(Msgs), S};
-             (S)                  -> {undefined, S}
-          end,
+fetchwhile(Pred, Fun, Acc, State) ->
     case queue_out(State) of
         {empty, State1} ->
-            End(a(State1));
+            {undefined, Acc, a(State1)};
         {{value, MsgStatus = #msg_status { msg_props = MsgProps }}, State1} ->
-            case {Pred(MsgProps), AckRequired} of
-                {true, true} ->
-                    {MsgStatus1, State2} = read_msg(MsgStatus, State1),
-                    {{Msg, _, AckTag, _}, State3} =
-                         internal_fetch(true, MsgStatus1, State2),
-                    dropwhile(Pred, AckRequired, State3, [{Msg, AckTag} | Msgs]);
-                {true, false} ->
-                    {_, State2} = internal_fetch(false, MsgStatus, State1),
-                    dropwhile(Pred, AckRequired, State2, undefined);
-                {false, _} ->
-                    End(a(in_r(MsgStatus, State1)))
+            case Pred(MsgProps) of
+                true  -> {Msg, State2} = read_msg(MsgStatus, State1),
+                         {AckTag, State3} = remove(true, MsgStatus, State2),
+                         fetchwhile(Pred, Fun, Fun(Msg, AckTag, Acc), State3);
+                false -> {MsgProps, Acc, a(in_r(MsgStatus, State1))}
             end
     end.
 
@@ -617,9 +606,18 @@ fetch(AckRequired, State) ->
         {{value, MsgStatus}, State1} ->
             %% it is possible that the message wasn't read from disk
             %% at this point, so read it in.
-            {MsgStatus1, State2} = read_msg(MsgStatus, State1),
-            {Res, State3} = internal_fetch(AckRequired, MsgStatus1, State2),
-            {Res, a(State3)}
+            {Msg, State2} = read_msg(MsgStatus, State1),
+            {AckTag, State3} = remove(AckRequired, MsgStatus, State2),
+            {{Msg, MsgStatus#msg_status.is_delivered, AckTag}, a(State3)}
+    end.
+
+drop(AckRequired, State) ->
+    case queue_out(State) of
+        {empty, State1} ->
+            {empty, a(State1)};
+        {{value, MsgStatus}, State1} ->
+            {AckTag, State2} = remove(AckRequired, MsgStatus, State1),
+            {{MsgStatus#msg_status.msg_id, AckTag}, a(State2)}
     end.
 
 ack([], State) ->
@@ -645,17 +643,6 @@ ack(AckTags, State) ->
                          persistent_count = PCount1,
                          ack_out_counter  = AckOutCount + length(AckTags) })}.
 
-fold(undefined, State, _AckTags) ->
-    State;
-fold(MsgFun, State = #vqstate{pending_ack = PA}, AckTags) ->
-    lists:foldl(
-      fun(SeqId, State1) ->
-              {MsgStatus, State2} =
-                  read_msg(gb_trees:get(SeqId, PA), State1),
-              MsgFun(MsgStatus#msg_status.msg, SeqId),
-              State2
-      end, State, AckTags).
-
 requeue(AckTags, #vqstate { delta      = Delta,
                             q3         = Q3,
                             q4         = Q4,
@@ -677,10 +664,29 @@ requeue(AckTags, #vqstate { delta      = Delta,
                                     in_counter = InCounter + MsgCount,
                                     len        = Len + MsgCount }))}.
 
+ackfold(MsgFun, Acc, State, AckTags) ->
+    {AccN, StateN} =
+        lists:foldl(fun(SeqId, {Acc0, State0}) ->
+                            MsgStatus = lookup_pending_ack(SeqId, State0),
+                            {Msg, State1} = read_msg(MsgStatus, State0),
+                            {MsgFun(Msg, SeqId, Acc0), State1}
+                    end, {Acc, State}, AckTags),
+    {AccN, a(StateN)}.
+
+fold(Fun, Acc, State = #vqstate{index_state = IndexState}) ->
+    {Its, IndexState1} = lists:foldl(fun inext/2, {[], IndexState},
+                                     [msg_iterator(State),
+                                      disk_ack_iterator(State),
+                                      ram_ack_iterator(State)]),
+    ifold(Fun, Acc, Its, State#vqstate{index_state = IndexState1}).
+
 len(#vqstate { len = Len }) -> Len.
 
 is_empty(State) -> 0 == len(State).
 
+depth(State = #vqstate { ram_pending_ack = RPA, disk_pending_ack = DPA }) ->
+    len(State) + gb_trees:size(RPA) + gb_trees:size(DPA).
+
 set_ram_duration_target(
   DurationTarget, State = #vqstate {
                     rates     = #rates { avg_egress  = AvgEgressRate,
@@ -716,7 +722,7 @@ ram_duration(State = #vqstate {
                ack_out_counter    = AckOutCount,
                ram_msg_count      = RamMsgCount,
                ram_msg_count_prev = RamMsgCountPrev,
-               ram_ack_index      = RamAckIndex,
+               ram_pending_ack    = RPA,
                ram_ack_count_prev = RamAckCountPrev }) ->
     Now = now(),
     {AvgEgressRate,   Egress1} = update_rate(Now, Timestamp, OutCount, Egress),
@@ -727,7 +733,7 @@ ram_duration(State = #vqstate {
     {AvgAckIngressRate, AckIngress1} =
         update_rate(Now, AckTimestamp, AckInCount, AckIngress),
 
-    RamAckCount = gb_trees:size(RamAckIndex),
+    RamAckCount = gb_trees:size(RPA),
 
     Duration = %% msgs+acks / (msgs+acks/sec) == sec
         case (AvgEgressRate == 0 andalso AvgIngressRate == 0 andalso
@@ -759,21 +765,20 @@ ram_duration(State = #vqstate {
                  ram_msg_count_prev = RamMsgCount,
                  ram_ack_count_prev = RamAckCount }}.
 
-needs_timeout(State = #vqstate { index_state = IndexState }) ->
-    case must_sync_index(State) of
-        true  -> timed;
-        false ->
-            case rabbit_queue_index:needs_sync(IndexState) of
-                true  -> idle;
-                false -> case reduce_memory_use(
-                                fun (_Quota, State1) -> {0, State1} end,
-                                fun (_Quota, State1) -> State1 end,
-                                fun (_Quota, State1) -> {0, State1} end,
-                                State) of
-                             {true,  _State} -> idle;
-                             {false, _State} -> false
-                         end
-            end
+needs_timeout(State = #vqstate { index_state      = IndexState,
+                                 target_ram_count = TargetRamCount }) ->
+    case rabbit_queue_index:needs_sync(IndexState) of % index sync wins
+        confirms                              -> timed;
+        other                                 -> idle;
+        false when TargetRamCount == infinity -> false; % no RAM target set
+        false -> case reduce_memory_use( % no-op funs: only the flag is used
+                        fun (_Quota, State1) -> {0, State1} end,
+                        fun (_Quota, State1) -> State1 end,
+                        fun (_Quota, State1) -> {0, State1} end,
+                        State) of
+                     {true,  _State} -> idle;
+                     {false, _State} -> false
+                 end
     end.
 
 timeout(State = #vqstate { index_state = IndexState }) ->
@@ -787,8 +792,8 @@ handle_pre_hibernate(State = #vqstate { index_state = IndexState }) ->
 status(#vqstate {
           q1 = Q1, q2 = Q2, delta = Delta, q3 = Q3, q4 = Q4,
           len              = Len,
-          pending_ack      = PA,
-          ram_ack_index    = RAI,
+          ram_pending_ack  = RPA,
+          disk_pending_ack = DPA,
           target_ram_count = TargetRamCount,
           ram_msg_count    = RamMsgCount,
           next_seq_id      = NextSeqId,
@@ -803,10 +808,10 @@ status(#vqstate {
       {q3                  , ?QUEUE:len(Q3)},
       {q4                  , ?QUEUE:len(Q4)},
       {len                 , Len},
-      {pending_acks        , gb_trees:size(PA)},
+      {pending_acks        , gb_trees:size(RPA) + gb_trees:size(DPA)},
       {target_ram_count    , TargetRamCount},
       {ram_msg_count       , RamMsgCount},
-      {ram_ack_count       , gb_trees:size(RAI)},
+      {ram_ack_count       , gb_trees:size(RPA)},
       {next_seq_id         , NextSeqId},
       {persistent_count    , PersistentCount},
       {avg_ingress_rate    , AvgIngressRate},
@@ -814,12 +819,11 @@ status(#vqstate {
       {avg_ack_ingress_rate, AvgAckIngressRate},
       {avg_ack_egress_rate , AvgAckEgressRate} ].
 
-invoke(?MODULE, Fun, State) -> Fun(?MODULE, State).
+invoke(?MODULE,   Fun,  State) -> Fun(?MODULE, State);
+invoke(_OtherMod, _Fun, State) -> State. % addressed elsewhere: unchanged
 
 is_duplicate(_Msg, State) -> {false, State}.
 
-discard(_Msg, _ChPid, State) -> State.
-
 %%----------------------------------------------------------------------------
 %% Minor helpers
 %%----------------------------------------------------------------------------
@@ -843,6 +847,7 @@ a(State = #vqstate { q1 = Q1, q2 = Q2, delta = Delta, q3 = Q3, q4 = Q4,
     true = Len             >= 0,
     true = PersistentCount >= 0,
     true = RamMsgCount     >= 0,
+    true = RamMsgCount     =< Len,
 
     State.
 
@@ -867,15 +872,28 @@ cons_if(true,   E, L) -> [E | L];
 cons_if(false, _E, L) -> L.
 
 gb_sets_maybe_insert(false, _Val, Set) -> Set;
-%% when requeueing, we re-add a msg_id to the unconfirmed set
-gb_sets_maybe_insert(true,  Val,  Set) -> gb_sets:add(Val, Set).
-
-msg_status(IsPersistent, SeqId, Msg = #basic_message { id = MsgId },
-           MsgProps) ->
-    #msg_status { seq_id = SeqId, msg_id = MsgId, msg = Msg,
-                  is_persistent = IsPersistent, is_delivered = false,
-                  msg_on_disk = false, index_on_disk = false,
-                  msg_props = MsgProps }.
+gb_sets_maybe_insert(true,   Val, Set) -> gb_sets:add(Val, Set).
+
+msg_status(IsPersistent, IsDelivered, SeqId,
+           #basic_message {id = Id} = Msg, MsgProps) ->
+    %% Fresh status carrying the message body; both on-disk flags start false.
+    #msg_status{msg_on_disk   = false, index_on_disk = false,
+                seq_id        = SeqId,
+                msg_id        = Id,
+                msg           = Msg,
+                msg_props     = MsgProps,
+                is_persistent = IsPersistent,
+                is_delivered  = IsDelivered}.
+
+beta_msg_status({Id, Seq, Props, Persistent, Delivered}) ->
+    %% Index entry -> status without a body; msg and index flagged on disk.
+    #msg_status{msg           = undefined,
+                msg_on_disk   = true, index_on_disk = true,
+                seq_id        = Seq,
+                msg_id        = Id,
+                msg_props     = Props,
+                is_persistent = Persistent,
+                is_delivered  = Delivered}.
 
 trim_msg_status(MsgStatus) -> MsgStatus #msg_status { msg = undefined }.
 
@@ -939,31 +957,21 @@ maybe_write_delivered(false, _SeqId, IndexState) ->
 maybe_write_delivered(true, SeqId, IndexState) ->
     rabbit_queue_index:deliver([SeqId], IndexState).
 
-betas_from_index_entries(List, TransientThreshold, PA, IndexState) ->
+betas_from_index_entries(List, TransientThreshold, RPA, DPA, IndexState) ->
     {Filtered, Delivers, Acks} =
         lists:foldr(
-          fun ({MsgId, SeqId, MsgProps, IsPersistent, IsDelivered},
+          fun ({_MsgId, SeqId, _MsgProps, IsPersistent, IsDelivered} = M,
                {Filtered1, Delivers1, Acks1} = Acc) ->
                   case SeqId < TransientThreshold andalso not IsPersistent of
                       true  -> {Filtered1,
                                 cons_if(not IsDelivered, SeqId, Delivers1),
                                 [SeqId | Acks1]};
-                      false -> case gb_trees:is_defined(SeqId, PA) of
-                                   false ->
-                                       {?QUEUE:in_r(
-                                           m(#msg_status {
-                                                seq_id        = SeqId,
-                                                msg_id        = MsgId,
-                                                msg           = undefined,
-                                                is_persistent = IsPersistent,
-                                                is_delivered  = IsDelivered,
-                                                msg_on_disk   = true,
-                                                index_on_disk = true,
-                                                msg_props     = MsgProps
-                                               }), Filtered1),
-                                        Delivers1, Acks1};
-                                   true ->
-                                       Acc
+                      false -> case (gb_trees:is_defined(SeqId, RPA) orelse
+                                     gb_trees:is_defined(SeqId, DPA)) of
+                                   false -> {?QUEUE:in_r(m(beta_msg_status(M)),
+                                                         Filtered1),
+                                             Delivers1, Acks1};
+                                   true  -> Acc
                            end
                   end
           end, {?QUEUE:new(), [], []}, List),
@@ -991,7 +999,7 @@ update_rate(Now, Then, Count, {OThen, OCount}) ->
 %% Internal major helpers for Public API
 %%----------------------------------------------------------------------------
 
-init(IsDurable, IndexState, DeltaCount, Terms, AsyncCallback,
+init(IsDurable, IndexState, DeltaCount, Terms,
      PersistentClient, TransientClient) ->
     {LowSeqId, NextSeqId, IndexState1} = rabbit_queue_index:bounds(IndexState),
 
@@ -1010,15 +1018,13 @@ init(IsDurable, IndexState, DeltaCount, Terms, AsyncCallback,
       q3                  = ?QUEUE:new(),
       q4                  = ?QUEUE:new(),
       next_seq_id         = NextSeqId,
-      pending_ack         = gb_trees:empty(),
-      ram_ack_index       = gb_trees:empty(),
+      ram_pending_ack     = gb_trees:empty(),
+      disk_pending_ack    = gb_trees:empty(),
       index_state         = IndexState1,
       msg_store_clients   = {PersistentClient, TransientClient},
       durable             = IsDurable,
       transient_threshold = NextSeqId,
 
-      async_callback      = AsyncCallback,
-
       len                 = DeltaCount1,
       persistent_count    = DeltaCount1,
 
@@ -1049,9 +1055,11 @@ in_r(MsgStatus = #msg_status { msg = undefined },
      State = #vqstate { q3 = Q3, q4 = Q4 }) ->
     case ?QUEUE:is_empty(Q4) of
         true  -> State #vqstate { q3 = ?QUEUE:in_r(MsgStatus, Q3) };
-        false -> {MsgStatus1, State1 = #vqstate { q4 = Q4a }} =
+        false -> {Msg, State1 = #vqstate { q4 = Q4a }} =
                      read_msg(MsgStatus, State),
-                 State1 #vqstate { q4 = ?QUEUE:in_r(MsgStatus1, Q4a) }
+                 inc_ram_msg_count(
+                   State1 #vqstate { q4 = ?QUEUE:in_r(MsgStatus#msg_status {
+                                                        msg = Msg }, Q4a) })
     end;
 in_r(MsgStatus, State = #vqstate { q4 = Q4 }) ->
     State #vqstate { q4 = ?QUEUE:in_r(MsgStatus, Q4) }.
@@ -1067,33 +1075,35 @@ queue_out(State = #vqstate { q4 = Q4 }) ->
             {{value, MsgStatus}, State #vqstate { q4 = Q4a }}
     end.
 
-read_msg(MsgStatus = #msg_status { msg           = undefined,
-                                   msg_id        = MsgId,
-                                   is_persistent = IsPersistent },
-         State = #vqstate { ram_msg_count     = RamMsgCount,
-                            msg_store_clients = MSCState}) ->
+read_msg(#msg_status{msg = undefined, msg_id = Id, is_persistent = IsPers},
+         State) ->
+    %% No body held in the status record: fetch it through read_msg/3.
+    read_msg(Id, IsPers, State);
+read_msg(#msg_status{msg = Body}, State) ->
+    {Body, State}.
+
+read_msg(MsgId, IsPersistent, State = #vqstate{msg_store_clients = MSCState}) ->
     {{ok, Msg = #basic_message {}}, MSCState1} =
         msg_store_read(MSCState, IsPersistent, MsgId),
-    {MsgStatus #msg_status { msg = Msg },
-     State #vqstate { ram_msg_count     = RamMsgCount + 1,
-                      msg_store_clients = MSCState1 }};
-read_msg(MsgStatus, State) ->
-    {MsgStatus, State}.
-
-internal_fetch(AckRequired, MsgStatus = #msg_status {
-                              seq_id        = SeqId,
-                              msg_id        = MsgId,
-                              msg           = Msg,
-                              is_persistent = IsPersistent,
-                              is_delivered  = IsDelivered,
-                              msg_on_disk   = MsgOnDisk,
-                              index_on_disk = IndexOnDisk },
-               State = #vqstate {ram_msg_count     = RamMsgCount,
-                                 out_counter       = OutCount,
-                                 index_state       = IndexState,
-                                 msg_store_clients = MSCState,
-                                 len               = Len,
-                                 persistent_count  = PCount }) ->
+    {Msg, State #vqstate {msg_store_clients = MSCState1}}.
+
+inc_ram_msg_count(#vqstate{ram_msg_count = Count} = VQ) ->
+    VQ#vqstate{ram_msg_count = Count + 1}. % bump the in-RAM message counter
+
+remove(AckRequired, MsgStatus = #msg_status {
+                      seq_id        = SeqId,
+                      msg_id        = MsgId,
+                      msg           = Msg,
+                      is_persistent = IsPersistent,
+                      is_delivered  = IsDelivered,
+                      msg_on_disk   = MsgOnDisk,
+                      index_on_disk = IndexOnDisk },
+       State = #vqstate {ram_msg_count     = RamMsgCount,
+                         out_counter       = OutCount,
+                         index_state       = IndexState,
+                         msg_store_clients = MSCState,
+                         len               = Len,
+                         persistent_count  = PCount}) ->
     %% 1. Mark it delivered if necessary
     IndexState1 = maybe_write_delivered(
                     IndexOnDisk andalso not IsDelivered,
@@ -1104,12 +1114,11 @@ internal_fetch(AckRequired, MsgStatus = #msg_status {
                   ok = msg_store_remove(MSCState, IsPersistent, [MsgId])
           end,
     Ack = fun () -> rabbit_queue_index:ack([SeqId], IndexState1) end,
-    IndexState2 =
-        case {AckRequired, MsgOnDisk, IndexOnDisk} of
-            {false, true, false} -> Rem(), IndexState1;
-            {false, true,  true} -> Rem(), Ack();
-            _                    -> IndexState1
-        end,
+    IndexState2 = case {AckRequired, MsgOnDisk, IndexOnDisk} of
+                      {false, true, false} -> Rem(), IndexState1;
+                      {false, true,  true} -> Rem(), Ack();
+                      _                    -> IndexState1
+                  end,
 
     %% 3. If an ack is required, add something sensible to PA
     {AckTag, State1} = case AckRequired of
@@ -1120,16 +1129,14 @@ internal_fetch(AckRequired, MsgStatus = #msg_status {
                            false -> {undefined, State}
                        end,
 
-    PCount1 = PCount - one_if(IsPersistent andalso not AckRequired),
-    Len1 = Len - 1,
+    PCount1      = PCount      - one_if(IsPersistent andalso not AckRequired),
     RamMsgCount1 = RamMsgCount - one_if(Msg =/= undefined),
 
-    {{Msg, IsDelivered, AckTag, Len1},
-     State1 #vqstate { ram_msg_count    = RamMsgCount1,
-                       out_counter      = OutCount + 1,
-                       index_state      = IndexState2,
-                       len              = Len1,
-                       persistent_count = PCount1 }}.
+    {AckTag, State1 #vqstate {ram_msg_count    = RamMsgCount1,
+                              out_counter      = OutCount + 1,
+                              index_state      = IndexState2,
+                              len              = Len - 1,
+                              persistent_count = PCount1}}.
 
 purge_betas_and_deltas(LensByStore,
                        State = #vqstate { q3                = Q3,
@@ -1226,37 +1233,48 @@ maybe_write_to_disk(ForceMsg, ForceIndex, MsgStatus,
 %% Internal gubbins for acks
 %%----------------------------------------------------------------------------
 
-record_pending_ack(#msg_status { seq_id        = SeqId,
-                                 msg_id        = MsgId,
-                                 msg_on_disk   = MsgOnDisk } = MsgStatus,
-                   State = #vqstate { pending_ack     = PA,
-                                      ram_ack_index   = RAI,
-                                      ack_in_counter  = AckInCount}) ->
-    {AckEntry, RAI1} =
-        case MsgOnDisk of
-            true  -> {m(trim_msg_status(MsgStatus)), RAI};
-            false -> {MsgStatus, gb_trees:insert(SeqId, MsgId, RAI)}
+record_pending_ack(#msg_status { seq_id = SeqId, msg = Msg } = MsgStatus,
+                   State = #vqstate { ram_pending_ack  = RPA,
+                                      disk_pending_ack = DPA,
+                                      ack_in_counter   = AckInCount}) ->
+    {RPA1, DPA1} =
+        case Msg of
+            undefined -> {RPA, gb_trees:insert(SeqId, MsgStatus, DPA)};
+            _         -> {gb_trees:insert(SeqId, MsgStatus, RPA), DPA}
         end,
-    State #vqstate { pending_ack    = gb_trees:insert(SeqId, AckEntry, PA),
-                     ram_ack_index  = RAI1,
-                     ack_in_counter = AckInCount + 1}.
+    State #vqstate { ram_pending_ack  = RPA1,
+                     disk_pending_ack = DPA1,
+                     ack_in_counter   = AckInCount + 1}.
+
+lookup_pending_ack(SeqId, #vqstate { ram_pending_ack  = InRam,
+                                     disk_pending_ack = OnDisk }) ->
+    case gb_trees:lookup(SeqId, InRam) of
+        none               -> gb_trees:get(SeqId, OnDisk); % crashes if absent
+        {value, MsgStatus} -> MsgStatus
+    end.
 
-remove_pending_ack(SeqId, State = #vqstate { pending_ack   = PA,
-                                             ram_ack_index = RAI }) ->
-    {gb_trees:get(SeqId, PA),
-     State #vqstate { pending_ack   = gb_trees:delete(SeqId, PA),
-                      ram_ack_index = gb_trees:delete_any(SeqId, RAI) }}.
+remove_pending_ack(SeqId, State = #vqstate { ram_pending_ack  = InRam,
+                                             disk_pending_ack = OnDisk }) ->
+    case gb_trees:lookup(SeqId, InRam) of % RAM tree first, then disk tree
+        none       -> OnDisk1 = gb_trees:delete(SeqId, OnDisk),
+                      {gb_trees:get(SeqId, OnDisk),
+                       State #vqstate { disk_pending_ack = OnDisk1 }};
+        {value, S} -> InRam1 = gb_trees:delete(SeqId, InRam),
+                      {S, State #vqstate { ram_pending_ack = InRam1 }}
+    end.
 
 purge_pending_ack(KeepPersistent,
-                  State = #vqstate { pending_ack       = PA,
+                  State = #vqstate { ram_pending_ack   = RPA,
+                                     disk_pending_ack  = DPA,
                                      index_state       = IndexState,
                                      msg_store_clients = MSCState }) ->
+    F = fun (_SeqId, MsgStatus, Acc) -> accumulate_ack(MsgStatus, Acc) end,
     {IndexOnDiskSeqIds, MsgIdsByStore, _AllMsgIds} =
-        rabbit_misc:gb_trees_fold(fun (_SeqId, MsgStatus, Acc) ->
-                                          accumulate_ack(MsgStatus, Acc)
-                                  end, accumulate_ack_init(), PA),
-    State1 = State #vqstate { pending_ack   = gb_trees:empty(),
-                              ram_ack_index = gb_trees:empty() },
+        rabbit_misc:gb_trees_fold(
+          F, rabbit_misc:gb_trees_fold(F, accumulate_ack_init(), RPA), DPA),
+    State1 = State #vqstate { ram_pending_ack  = gb_trees:empty(),
+                              disk_pending_ack = gb_trees:empty() },
+
     case KeepPersistent of
         true  -> case orddict:find(false, MsgIdsByStore) of
                      error        -> State1;
@@ -1306,27 +1324,9 @@ record_confirms(MsgIdSet, State = #vqstate { msgs_on_disk        = MOD,
       unconfirmed         = rabbit_misc:gb_sets_difference(UC,   MsgIdSet),
       confirmed           = gb_sets:union(C, MsgIdSet) }.
 
-must_sync_index(#vqstate { msg_indices_on_disk = MIOD,
-                           unconfirmed = UC }) ->
-    %% If UC is empty then by definition, MIOD and MOD are also empty
-    %% and there's nothing that can be pending a sync.
-
-    %% If UC is not empty, then we want to find is_empty(UC - MIOD),
-    %% but the subtraction can be expensive. Thus instead, we test to
-    %% see if UC is a subset of MIOD. This can only be the case if
-    %% MIOD == UC, which would indicate that every message in UC is
-    %% also in MIOD and is thus _all_ pending on a msg_store sync, not
-    %% on a qi sync. Thus the negation of this is sufficient. Because
-    %% is_subset is short circuiting, this is more efficient than the
-    %% subtraction.
-    not (gb_sets:is_empty(UC) orelse gb_sets:is_subset(UC, MIOD)).
-
-blind_confirm(Callback, MsgIdSet) ->
-    Callback(?MODULE,
-             fun (?MODULE, State) -> record_confirms(MsgIdSet, State) end).
-
 msgs_written_to_disk(Callback, MsgIdSet, ignored) ->
-    blind_confirm(Callback, MsgIdSet);
+    Callback(?MODULE,
+             fun (?MODULE, State) -> record_confirms(MsgIdSet, State) end);
 msgs_written_to_disk(Callback, MsgIdSet, written) ->
     Callback(?MODULE,
              fun (?MODULE, State = #vqstate { msgs_on_disk        = MOD,
@@ -1356,16 +1356,14 @@ msg_indices_written_to_disk(Callback, MsgIdSet) ->
 %%----------------------------------------------------------------------------
 
 publish_alpha(#msg_status { msg = undefined } = MsgStatus, State) ->
-    read_msg(MsgStatus, State);
-publish_alpha(MsgStatus, #vqstate {ram_msg_count = RamMsgCount } = State) ->
-    {MsgStatus, State #vqstate { ram_msg_count = RamMsgCount + 1 }}.
+    {Msg, State1} = read_msg(MsgStatus, State),
+    {MsgStatus#msg_status { msg = Msg }, inc_ram_msg_count(State1)};
+publish_alpha(MsgStatus, State) ->
+    {MsgStatus, inc_ram_msg_count(State)}.
 
 publish_beta(MsgStatus, State) ->
-    {#msg_status { msg = Msg} = MsgStatus1,
-     #vqstate { ram_msg_count = RamMsgCount } = State1} =
-        maybe_write_to_disk(true, false, MsgStatus, State),
-    {MsgStatus1, State1 #vqstate {
-                   ram_msg_count = RamMsgCount + one_if(Msg =/= undefined) }}.
+    {MsgStatus1, State1} = maybe_write_to_disk(true, false, MsgStatus, State),
+    {m(trim_msg_status(MsgStatus1)), State1}.
 
 %% Rebuild queue, inserting sequence ids to maintain ordering
 queue_merge(SeqIds, Q, MsgIds, Limit, PubFun, State) ->
@@ -1422,6 +1420,82 @@ delta_limit(?BLANK_DELTA_PATTERN(_X))             -> undefined;
 delta_limit(#delta { start_seq_id = StartSeqId }) -> StartSeqId.
 
 %%----------------------------------------------------------------------------
+%% Iterator
+%%----------------------------------------------------------------------------
+
+ram_ack_iterator(State) -> % 'ack' tag: next/2 walks it with gb_trees:next/1
+    {ack, gb_trees:iterator(State#vqstate.ram_pending_ack)}.
+
+disk_ack_iterator(State) -> % same 'ack' shape, over the disk_pending_ack tree
+    {ack, gb_trees:iterator(State#vqstate.disk_pending_ack)}.
+
+msg_iterator(State) -> istate(start, State). % i.e. positioned on q4
+
+istate(start, State) -> {q4,    State#vqstate.q4,    State}; % first source
+istate(q4,    State) -> {q3,    State#vqstate.q3,    State};
+istate(q3,    State) -> {delta, State#vqstate.delta, State};
+istate(delta, State) -> {q2,    State#vqstate.q2,    State};
+istate(q2,    State) -> {q1,    State#vqstate.q1,    State};
+istate(q1,   _State) -> done.                                % nothing after q1
+
+next({ack, It}, IndexState) -> % pending-ack tree: elements carry Unacked=true
+    case gb_trees:next(It) of
+        none                     -> {empty, IndexState};
+        {_SeqId, MsgStatus, It1} -> Next = {ack, It1},
+                                    {value, MsgStatus, true, Next, IndexState}
+    end;
+next(done, IndexState) -> {empty, IndexState}; % istate/2 ran out of sources
+next({delta, #delta{start_seq_id = SeqId,
+                    end_seq_id   = SeqId}, State}, IndexState) ->
+    next(istate(delta, State), IndexState); % start == end: go past delta
+next({delta, #delta{start_seq_id = SeqId,
+                    end_seq_id   = SeqIdEnd} = Delta, State}, IndexState) ->
+    SeqIdB = rabbit_queue_index:next_segment_boundary(SeqId),
+    SeqId1 = lists:min([SeqIdB, SeqIdEnd]), % stop at boundary or delta end
+    {List, IndexState1} = rabbit_queue_index:read(SeqId, SeqId1, IndexState),
+    next({delta, Delta#delta{start_seq_id = SeqId1}, List, State}, IndexState1);
+next({delta, Delta, [], State}, IndexState) ->
+    next({delta, Delta, State}, IndexState); % chunk used up: read the next
+next({delta, Delta, [{_, SeqId, _, _, _} = M | Rest], State}, IndexState) ->
+    case (gb_trees:is_defined(SeqId, State#vqstate.ram_pending_ack) orelse
+          gb_trees:is_defined(SeqId, State#vqstate.disk_pending_ack)) of
+        false -> Next = {delta, Delta, Rest, State},
+                 {value, beta_msg_status(M), false, Next, IndexState};
+        true  -> next({delta, Delta, Rest, State}, IndexState) % pending ack
+    end;
+next({Key, Q, State}, IndexState) -> % a ?QUEUE source tagged by istate/2
+    case ?QUEUE:out(Q) of
+        {empty, _Q}              -> next(istate(Key, State), IndexState);
+        {{value, MsgStatus}, QN} -> Next = {Key, QN, State},
+                                    {value, MsgStatus, false, Next, IndexState}
+    end.
+
+inext(It, {Its, IndexState}) ->
+    %% Advance It; it joins Its only if it yielded an element.
+    case next(It, IndexState) of
+        {value, Head, Unacked, It1, QI} ->
+            {[{Head, Unacked, It1} | Its], QI};
+        {empty, QI} -> {Its, QI}
+    end.
+
+%% Merge the iterators in seq_id order, applying Fun until it says stop.
+ifold(_Fun, Acc, [], State) -> {Acc, State};
+ifold(Fun, Acc, Its, State) ->
+    [{MsgStatus, Unacked, It} | Rest] =          % lowest seq_id goes first
+        lists:sort(fun ({#msg_status{seq_id = SeqId1}, _, _},
+                        {#msg_status{seq_id = SeqId2}, _, _}) ->
+                           SeqId1 =< SeqId2
+                   end, Its),
+    {Msg, State1} = read_msg(MsgStatus, State),
+    case Fun(Msg, MsgStatus#msg_status.msg_props, Unacked, Acc) of
+        {stop, Acc1} ->
+            {Acc1, State1}; % keep the state read_msg/2 returned, as cont does
+        {cont, Acc1} ->
+            {Its1, IndexState1} = inext(It, {Rest, State1#vqstate.index_state}),
+            ifold(Fun, Acc1, Its1, State1#vqstate{index_state = IndexState1})
+    end.
+
+%%----------------------------------------------------------------------------
 %% Phase changes
 %%----------------------------------------------------------------------------
 
@@ -1444,12 +1518,9 @@ delta_limit(#delta { start_seq_id = StartSeqId }) -> StartSeqId.
 %% one segment's worth of messages in q3 - and thus would risk
 %% perpetually reporting the need for a conversion when no such
 %% conversion is needed. That in turn could cause an infinite loop.
-reduce_memory_use(_AlphaBetaFun, _BetaDeltaFun, _AckFun,
-                  State = #vqstate {target_ram_count = infinity}) ->
-    {false, State};
 reduce_memory_use(AlphaBetaFun, BetaDeltaFun, AckFun,
                   State = #vqstate {
-                    ram_ack_index    = RamAckIndex,
+                    ram_pending_ack  = RPA,
                     ram_msg_count    = RamMsgCount,
                     target_ram_count = TargetRamCount,
                     rates            = #rates { avg_ingress = AvgIngress,
@@ -1459,8 +1530,7 @@ reduce_memory_use(AlphaBetaFun, BetaDeltaFun, AckFun,
                    }) ->
 
     {Reduce, State1 = #vqstate { q2 = Q2, q3 = Q3 }} =
-        case chunk_size(RamMsgCount + gb_trees:size(RamAckIndex),
-                        TargetRamCount) of
+        case chunk_size(RamMsgCount + gb_trees:size(RPA), TargetRamCount) of
             0  -> {false, State};
             %% Reduce memory of pending acks and alphas. The order is
             %% determined based on which is growing faster. Whichever
@@ -1485,23 +1555,23 @@ reduce_memory_use(AlphaBetaFun, BetaDeltaFun, AckFun,
 
 limit_ram_acks(0, State) ->
     {0, State};
-limit_ram_acks(Quota, State = #vqstate { pending_ack   = PA,
-                                         ram_ack_index = RAI }) ->
-    case gb_trees:is_empty(RAI) of
+limit_ram_acks(Quota, State = #vqstate { ram_pending_ack  = RPA,
+                                         disk_pending_ack = DPA }) ->
+    case gb_trees:is_empty(RPA) of
         true ->
             {Quota, State};
         false ->
-            {SeqId, MsgId, RAI1} = gb_trees:take_largest(RAI),
-            MsgStatus = #msg_status { msg_id = MsgId, is_persistent = false} =
-                gb_trees:get(SeqId, PA),
+            {SeqId, MsgStatus, RPA1} = gb_trees:take_largest(RPA),
             {MsgStatus1, State1} =
                 maybe_write_to_disk(true, false, MsgStatus, State),
-            PA1 = gb_trees:update(SeqId, m(trim_msg_status(MsgStatus1)), PA),
+            DPA1 = gb_trees:insert(SeqId, m(trim_msg_status(MsgStatus1)), DPA),
             limit_ram_acks(Quota - 1,
-                           State1 #vqstate { pending_ack   = PA1,
-                                             ram_ack_index = RAI1 })
+                           State1 #vqstate { ram_pending_ack  = RPA1,
+                                             disk_pending_ack = DPA1 })
     end.
 
+reduce_memory_use(State = #vqstate { target_ram_count = infinity }) ->
+    State;
 reduce_memory_use(State) ->
     {_, State1} = reduce_memory_use(fun push_alphas_to_betas/2,
                                     fun push_betas_to_deltas/2,
@@ -1567,7 +1637,8 @@ maybe_deltas_to_betas(State = #vqstate {
                         delta                = Delta,
                         q3                   = Q3,
                         index_state          = IndexState,
-                        pending_ack          = PA,
+                        ram_pending_ack      = RPA,
+                        disk_pending_ack     = DPA,
                         transient_threshold  = TransientThreshold }) ->
     #delta { start_seq_id = DeltaSeqId,
              count        = DeltaCount,
@@ -1575,10 +1646,10 @@ maybe_deltas_to_betas(State = #vqstate {
     DeltaSeqId1 =
         lists:min([rabbit_queue_index:next_segment_boundary(DeltaSeqId),
                    DeltaSeqIdEnd]),
-    {List, IndexState1} =
-        rabbit_queue_index:read(DeltaSeqId, DeltaSeqId1, IndexState),
-    {Q3a, IndexState2} =
-        betas_from_index_entries(List, TransientThreshold, PA, IndexState1),
+    {List, IndexState1} = rabbit_queue_index:read(DeltaSeqId, DeltaSeqId1,
+                                                  IndexState),
+    {Q3a, IndexState2} = betas_from_index_entries(List, TransientThreshold,
+                                                  RPA, DPA, IndexState1),
     State1 = State #vqstate { index_state = IndexState2 },
     case ?QUEUE:len(Q3a) of
         0 ->
diff --git a/src/rabbit_version.erl b/src/rabbit_version.erl
index 1cc7d6c8..c629180e 100644
--- a/src/rabbit_version.erl
+++ b/src/rabbit_version.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_version).
diff --git a/src/rabbit_vhost.erl b/src/rabbit_vhost.erl
index 5548ef6d..fcf77bf9 100644
--- a/src/rabbit_vhost.erl
+++ b/src/rabbit_vhost.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_vhost).
@@ -20,7 +20,7 @@
 
 %%----------------------------------------------------------------------------
 
--export([add/1, delete/1, exists/1, list/0, with/2]).
+-export([add/1, delete/1, exists/1, list/0, with/2, assert/1]).
 -export([info/1, info/2, info_all/0, info_all/1]).
 
 -ifdef(use_specs).
@@ -30,6 +30,7 @@
 -spec(exists/1 :: (rabbit_types:vhost()) -> boolean()).
 -spec(list/0 :: () -> [rabbit_types:vhost()]).
 -spec(with/2 :: (rabbit_types:vhost(), rabbit_misc:thunk(A)) -> A).
+-spec(assert/1 :: (rabbit_types:vhost()) -> 'ok').
 
 -spec(info/1 :: (rabbit_types:vhost()) -> rabbit_types:infos()).
 -spec(info/2 :: (rabbit_types:vhost(), rabbit_types:info_keys())
@@ -70,6 +71,7 @@ add(VHostPath) ->
                            {<<"amq.rabbitmq.trace">>, topic}]],
                   ok
           end),
+    rabbit_event:notify(vhost_created, info(VHostPath)),
     R.
 
 delete(VHostPath) ->
@@ -87,15 +89,19 @@ delete(VHostPath) ->
           with(VHostPath, fun () ->
                                   ok = internal_delete(VHostPath)
                           end)),
+    ok = rabbit_event:notify(vhost_deleted, [{name, VHostPath}]),
     R.
 
 internal_delete(VHostPath) ->
-    lists:foreach(
-      fun (Info) ->
-              ok = rabbit_auth_backend_internal:clear_permissions(
-                     proplists:get_value(user, Info), VHostPath)
-      end,
-      rabbit_auth_backend_internal:list_vhost_permissions(VHostPath)),
+    [ok = rabbit_auth_backend_internal:clear_permissions(
+            proplists:get_value(user, Info), VHostPath)
+     || Info <- rabbit_auth_backend_internal:list_vhost_permissions(VHostPath)],
+    [ok = rabbit_runtime_parameters:clear(VHostPath,
+                                          proplists:get_value(component, Info),
+                                          proplists:get_value(name, Info))
+     || Info <- rabbit_runtime_parameters:list(VHostPath)],
+    [ok = rabbit_policy:delete(VHostPath, proplists:get_value(name, Info))
+     || Info <- rabbit_policy:list(VHostPath)],
     ok = mnesia:delete({rabbit_vhost, VHostPath}),
     ok.
 
@@ -115,12 +121,18 @@ with(VHostPath, Thunk) ->
             end
     end.
 
+%% Like with/2 but outside an Mnesia tx
+assert(VHostPath) -> case rabbit_vhost:exists(VHostPath) of
+                         true  -> ok;
+                         false -> throw({error, {no_such_vhost, VHostPath}})
+                     end.
+
 %%----------------------------------------------------------------------------
 
 infos(Items, X) -> [{Item, i(Item, X)} || Item <- Items].
 
 i(name,    VHost) -> VHost;
-i(tracing, VHost) -> rabbit_trace:tracing(VHost);
+i(tracing, VHost) -> rabbit_trace:enabled(VHost);
 i(Item, _)        -> throw({bad_argument, Item}).
 
 info(VHost)        -> infos(?INFO_KEYS, VHost).
diff --git a/src/rabbit_vm.erl b/src/rabbit_vm.erl
new file mode 100644
index 00000000..597f9094
--- /dev/null
+++ b/src/rabbit_vm.erl
@@ -0,0 +1,228 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License
+%% at http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and
+%% limitations under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
+%%
+
+-module(rabbit_vm).
+
+-export([memory/0]).
+
+-define(MAGIC_PLUGINS, ["mochiweb", "webmachine", "cowboy", "sockjs",
+                        "rfc4627_jsonrpc"]).
+
+%%----------------------------------------------------------------------------
+
+-ifdef(use_specs).
+
+-spec(memory/0 :: () -> rabbit_types:infos()).
+
+-endif.
+
+%%----------------------------------------------------------------------------
+
+%% Break this node's memory use down into RabbitMQ-specific
+%% categories. Like erlang:memory(), but process and ETS memory is
+%% attributed to connections, queues, plugins, the message store
+%% index, the management database and Mnesia. Returns a proplist of
+%% {Category, Bytes}.
+memory() ->
+    %% Each group lists the accumulation points (supervisor names or
+    %% pids) whose descendants' memory is summed into one figure.
+    Groups = [[rabbit_tcp_client_sup, ssl_connection_sup, amqp_sup],
+              [rabbit_amqqueue_sup, rabbit_mirror_queue_slave_sup],
+              [msg_store_transient, msg_store_persistent],
+              [rabbit_mgmt_sup],
+              plugin_sups()],
+
+    {PerName, _Unattributed} = sum_processes(lists:append(Groups), [memory]),
+
+    [ConnMem, QueueMem, MsgIdxProcMem, MgmtProcMem, AllPluginMem] =
+        lists:map(fun (Group) -> aggregate_memory(Group, PerName) end,
+                  Groups),
+
+    MnesiaMem    = mnesia_memory(),
+    MsgIdxEtsMem = ets_memory(rabbit_msg_store_ets_index),
+    MgmtEtsMem   = ets_memory(rabbit_mgmt_db),
+
+    [{total,     Total},
+     {processes, ProcMem},
+     {ets,       EtsMem},
+     {atom,      AtomMem},
+     {binary,    BinMem},
+     {code,      CodeMem},
+     {system,    SysMem}] =
+        erlang:memory([total, processes, ets, atom, binary, code, system]),
+
+    %% The management db processes are reported under mgmt_db, so
+    %% their share is removed from the plugins figure (presumably
+    %% rabbit_mgmt_sup is also one of the plugin supervisors - confirm).
+    PluginMem    = AllPluginMem - MgmtProcMem,
+    OtherProcMem = ProcMem - ConnMem - QueueMem - MsgIdxProcMem - AllPluginMem,
+    OtherEtsMem  = EtsMem - MnesiaMem - MsgIdxEtsMem - MgmtEtsMem,
+    OtherSysMem  = SysMem - EtsMem - AtomMem - BinMem - CodeMem,
+
+    [{total,            Total},
+     {connection_procs, ConnMem},
+     {queue_procs,      QueueMem},
+     {plugins,          PluginMem},
+     {other_proc,       erlang:max(0, OtherProcMem)}, %% [1]
+     {mnesia,           MnesiaMem},
+     {mgmt_db,          MgmtEtsMem + MgmtProcMem},
+     {msg_index,        MsgIdxEtsMem + MsgIdxProcMem},
+     {other_ets,        OtherEtsMem},
+     {binary,           BinMem},
+     {code,             CodeMem},
+     {atom,             AtomMem},
+     {other_system,     OtherSysMem}].
+
+%% [1] - erlang:memory(processes) can be less than the sum of its
+%% parts. Rather than display something nonsensical, just silence any
+%% claims about negative memory. See
+%% http://erlang.org/pipermail/erlang-questions/2012-September/069320.html
+
+%%----------------------------------------------------------------------------
+
+%% Total memory used by all Mnesia tables known to this node, with
+%% each table's reported size converted via bytes/1.
+%%
+%% mnesia:system_info(is_running) is not a boolean: besides 'yes' and
+%% 'no' it can report 'starting' and 'stopping'. Every state other
+%% than 'yes' is treated as "not running" and yields 0, so that a
+%% memory query issued while Mnesia starts up or shuts down does not
+%% crash with a case_clause error.
+mnesia_memory() ->
+    case mnesia:system_info(is_running) of
+        yes -> lists:sum([bytes(mnesia:table_info(Tab, memory)) ||
+                             Tab <- mnesia:system_info(tables)]);
+        _   -> 0
+    end.
+
+%% Memory, in bytes, summed over every ETS table whose name is Name.
+ets_memory(Name) ->
+    Matching = [Tab || Tab <- ets:all(), ets:info(Tab, name) =:= Name],
+    lists:sum([bytes(ets:info(Tab, memory)) || Tab <- Matching]).
+
+%% Convert a size expressed in machine words into bytes.
+bytes(Words) ->
+    WordSize = erlang:system_info(wordsize),
+    Words * WordSize.
+
+%% Names (or pids) of the top-level processes of every running
+%% application that is_plugin/1 recognises as a plugin.
+plugin_sups() ->
+    PluginApps = [App || {App, _, _} <- rabbit_misc:which_applications(),
+                         is_plugin(atom_to_list(App))],
+    lists:flatmap(fun plugin_sup/1, PluginApps).
+
+%% Return a zero- or one-element list holding the name (or pid) of the
+%% process the application master of App reports as its child. The
+%% list is empty when App has no master or no child pid is reported.
+plugin_sup(App) ->
+    case application_controller:get_master(App) of
+        undefined -> [];
+        Master    -> master_child(application_master:get_child(Master))
+    end.
+
+%% The child may be reported either as {Pid, _} or as a bare pid;
+%% anything else is taken to mean "no child".
+master_child({Pid, _}) when is_pid(Pid) -> [process_name(Pid)];
+master_child(Pid)      when is_pid(Pid) -> [process_name(Pid)];
+master_child(_)                         -> [].
+
+%% The registered name of Pid when it has one, otherwise Pid itself
+%% (this also covers a process that has already terminated).
+process_name(Pid) ->
+    case erlang:process_info(Pid, registered_name) of
+        {registered_name, Registered} -> Registered;
+        _NoName                       -> Pid
+    end.
+
+%% An application counts as a plugin when its name (as a string)
+%% starts with "rabbitmq_" or is listed in ?MAGIC_PLUGINS.
+is_plugin(App) ->
+    case App of
+        "rabbitmq_" ++ _ -> true;
+        _                -> lists:member(App, ?MAGIC_PLUGINS)
+    end.
+
+%% Add up the accumulated 'memory' figure of each name in Names, as
+%% found in the per-name results Sums.
+aggregate_memory(Names, Sums) ->
+    lists:foldl(fun (Name, Total) -> Total + extract_memory(Name, Sums) end,
+                0, Names).
+
+%% Look up the 'memory' info item accumulated for Name in Sums, a list
+%% of {Name, InfoItems} pairs. Crashes with badmatch if either the
+%% name or its 'memory' item is missing.
+extract_memory(Name, Sums) ->
+    {_, InfoItems} = lists:keyfind(Name, 1, Sums),
+    {memory, Bytes} = lists:keyfind(memory, 1, InfoItems),
+    Bytes.
+
+%%----------------------------------------------------------------------------
+
+%% NB: this code is non-rabbit specific.
+
+-ifdef(use_specs).
+-type(process() :: pid() | atom()).
+-type(info_key() :: atom()).
+-type(info_value() :: any()).
+-type(info_item() :: {info_key(), info_value()}).
+-type(accumulate() :: fun ((info_key(), info_value(), info_value()) ->
+                                  info_value())).
+-spec(sum_processes/2 :: ([process()], [info_key()]) ->
+                              {[{process(), [info_item()]}], [info_item()]}).
+-spec(sum_processes/3 :: ([process()], accumulate(), [info_item()]) ->
+                              {[{process(), [info_item()]}], [info_item()]}).
+-endif.
+
+%% Convenience form of sum_processes/3 for numeric info items: every
+%% item starts at 0 and values are accumulated by addition.
+sum_processes(Names, Items) ->
+    Add    = fun (_Key, Value, Total) -> Value + Total end,
+    Zeroes = lists:map(fun (Item) -> {Item, 0} end, Items),
+    sum_processes(Names, Add, Zeroes).
+
+%% Summarize the process_info of all processes based on their
+%% '$ancestors' hierarchy, recorded in their process dictionary.
+%%
+%% The function takes
+%%
+%% 1) a list of names/pids of processes that are accumulation points
+%%    in the hierarchy.
+%%
+%% 2) a function that aggregates individual info items, taking the
+%%    info item key, value and accumulated value as the input and
+%%    producing a new accumulated value.
+%%
+%% 3) a list of info item key / initial accumulator value pairs.
+%%
+%% The process_info of a process is accumulated at the nearest of its
+%% ancestors that is mentioned in the first argument, or, if no such
+%% ancestor exists or the ancestor information is absent, in a special
+%% 'other' bucket. A registered process is considered before its own
+%% ancestors, so a process whose registered name is an accumulation
+%% point is counted under that name.
+%%
+%% The result is a pair consisting of
+%%
+%% 1) a k/v list, containing for each of the accumulation names/pids a
+%%    list of info items, containing the accumulated data, and
+%%
+%% 2) the 'other' bucket - a list of info items containing the
+%%    accumulated data of all processes with no matching ancestors
+%%
+%% Note that this function operates on names as well as pids, but
+%% these must match whatever is contained in the '$ancestors' process
+%% dictionary entry. Generally that means for all registered processes
+%% the name should be used.
+sum_processes(Names, Fun, Acc0) ->
+    Items = [Item || {Item, _Val0} <- Acc0],
+    Acc0Dict  = orddict:from_list(Acc0),
+    %% every accumulation point starts from the same initial values
+    NameAccs0 = orddict:from_list([{Name, Acc0Dict} || Name <- Names]),
+    {NameAccs, OtherAcc} =
+        lists:foldl(
+          fun (Pid, Acc) ->
+                  %% registered_name and dictionary are requested
+                  %% first so that they can be matched off the front
+                  %% of the result, leaving only the caller's items
+                  InfoItems = [registered_name, dictionary | Items],
+                  case process_info(Pid, InfoItems) of
+                      undefined ->
+                          %% the process died since processes() was
+                          %% called; it contributes nothing
+                          Acc;
+                      [{registered_name, RegName}, {dictionary, D} | Vals] ->
+                          %% see docs for process_info/2 for the
+                          %% special handling of 'registered_name'
+                          %% info items
+                          Extra = case RegName of
+                                      [] -> [];
+                                      N  -> [N]
+                                  end,
+                          accumulate(find_ancestor(Extra, D, Names), Fun,
+                                     orddict:from_list(Vals), Acc)
+                  end
+          end, {NameAccs0, Acc0Dict}, processes()),
+    %% these conversions aren't strictly necessary; we do them simply
+    %% for the sake of encapsulating the representation.
+    {[{Name, orddict:to_list(Accs)} ||
+         {Name, Accs} <- orddict:to_list(NameAccs)],
+     orddict:to_list(OtherAcc)}.
+
+%% Find the accumulation point for a process: the first element of
+%% Extra ++ Ancestors that is a member of Names, where Ancestors is
+%% the '$ancestors' entry of the process dictionary D (or empty when
+%% absent). Returns 'undefined' when no element matches.
+find_ancestor(Extra, D, Names) ->
+    Ancestors = case lists:keyfind('$ancestors', 1, D) of
+                    {_, Ancs} -> Ancs;
+                    false     -> []
+                end,
+    IsUnknown = fun (Candidate) -> not lists:member(Candidate, Names) end,
+    case lists:dropwhile(IsUnknown, Extra ++ Ancestors) of
+        []          -> undefined;
+        [Point | _] -> Point
+    end.
+
+%% Merge the info items of one process (ValsDict) into the accumulator
+%% of the accumulation point Target, or into the 'other' bucket when
+%% Target is 'undefined'. Fun combines values of the same key.
+accumulate(Target, Fun, ValsDict, {NameAccs, OtherAcc}) ->
+    MergeInto = fun (Acc) -> orddict:merge(Fun, ValsDict, Acc) end,
+    case Target of
+        undefined -> {NameAccs, MergeInto(OtherAcc)};
+        _         -> {orddict:update(Target, MergeInto, NameAccs), OtherAcc}
+    end.
diff --git a/src/rabbit_writer.erl b/src/rabbit_writer.erl
index f3a8cacf..bf6964d8 100644
--- a/src/rabbit_writer.erl
+++ b/src/rabbit_writer.erl
@@ -10,21 +10,29 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(rabbit_writer).
 -include("rabbit.hrl").
 -include("rabbit_framing.hrl").
 
--export([start/5, start_link/5, mainloop/2, mainloop1/2]).
+-export([start/5, start_link/5, start/6, start_link/6]).
+
+-export([system_continue/3, system_terminate/4, system_code_change/4]).
+
 -export([send_command/2, send_command/3,
          send_command_sync/2, send_command_sync/3,
-         send_command_and_notify/4, send_command_and_notify/5]).
+         send_command_and_notify/4, send_command_and_notify/5,
+         flush/1]).
 -export([internal_send_command/4, internal_send_command/6]).
 
--record(wstate, {sock, channel, frame_max, protocol, pending}).
+%% internal
+-export([mainloop/2, mainloop1/2]).
+
+-record(wstate, {sock, channel, frame_max, protocol, reader,
+                 stats_timer, pending}).
 
 -define(HIBERNATE_AFTER, 5000).
 
@@ -40,6 +48,19 @@
         (rabbit_net:socket(), rabbit_channel:channel_number(),
          non_neg_integer(), rabbit_types:protocol(), pid())
         -> rabbit_types:ok(pid())).
+-spec(start/6 ::
+        (rabbit_net:socket(), rabbit_channel:channel_number(),
+         non_neg_integer(), rabbit_types:protocol(), pid(), boolean())
+        -> rabbit_types:ok(pid())).
+-spec(start_link/6 ::
+        (rabbit_net:socket(), rabbit_channel:channel_number(),
+         non_neg_integer(), rabbit_types:protocol(), pid(), boolean())
+        -> rabbit_types:ok(pid())).
+
+-spec(system_code_change/4 :: (_,_,_,_) -> {'ok',_}).
+-spec(system_continue/3 :: (_,_,#wstate{}) -> any()).
+-spec(system_terminate/4 :: (_,_,_,_) -> none()).
+
 -spec(send_command/2 ::
         (pid(), rabbit_framing:amqp_method_record()) -> 'ok').
 -spec(send_command/3 ::
@@ -57,6 +78,7 @@
         (pid(), pid(), pid(), rabbit_framing:amqp_method_record(),
          rabbit_types:content())
         -> 'ok').
+-spec(flush/1 :: (pid()) -> 'ok').
 -spec(internal_send_command/4 ::
         (rabbit_net:socket(), rabbit_channel:channel_number(),
          rabbit_framing:amqp_method_record(), rabbit_types:protocol())
@@ -67,63 +89,95 @@
          non_neg_integer(), rabbit_types:protocol())
         -> 'ok').
 
--spec(mainloop/2 :: (_,_) -> 'done').
--spec(mainloop1/2 :: (_,_) -> any()).
-
 -endif.
 
 %%---------------------------------------------------------------------------
 
 start(Sock, Channel, FrameMax, Protocol, ReaderPid) ->
-    {ok,
-     proc_lib:spawn(?MODULE, mainloop, [ReaderPid,
-                                        #wstate{sock = Sock,
-                                                channel = Channel,
-                                                frame_max = FrameMax,
-                                                protocol = Protocol,
-                                                pending = []}])}.
+    start(Sock, Channel, FrameMax, Protocol, ReaderPid, false).
 
 start_link(Sock, Channel, FrameMax, Protocol, ReaderPid) ->
-    {ok,
-     proc_lib:spawn_link(?MODULE, mainloop, [ReaderPid,
-                                             #wstate{sock = Sock,
-                                                     channel = Channel,
-                                                     frame_max = FrameMax,
-                                                     protocol = Protocol,
-                                                     pending = []}])}.
-
-mainloop(ReaderPid, State) ->
+    start_link(Sock, Channel, FrameMax, Protocol, ReaderPid, false).
+
+start(Sock, Channel, FrameMax, Protocol, ReaderPid, ReaderWantsStats) ->
+    State = initial_state(Sock, Channel, FrameMax, Protocol, ReaderPid,
+                          ReaderWantsStats),
+    Deb = sys:debug_options([]),
+    {ok, proc_lib:spawn(?MODULE, mainloop, [Deb, State])}.
+
+start_link(Sock, Channel, FrameMax, Protocol, ReaderPid, ReaderWantsStats) ->
+    State = initial_state(Sock, Channel, FrameMax, Protocol, ReaderPid,
+                          ReaderWantsStats),
+    Deb = sys:debug_options([]),
+    {ok, proc_lib:spawn_link(?MODULE, mainloop, [Deb, State])}.
+
+initial_state(Sock, Channel, FrameMax, Protocol, ReaderPid, ReaderWantsStats) ->
+    (case ReaderWantsStats of
+         true  -> fun rabbit_event:init_stats_timer/2;
+         false -> fun rabbit_event:init_disabled_stats_timer/2
+     end)(#wstate{sock      = Sock,
+                  channel   = Channel,
+                  frame_max = FrameMax,
+                  protocol  = Protocol,
+                  reader    = ReaderPid,
+                  pending   = []},
+          #wstate.stats_timer).
+
+system_continue(Parent, Deb, State) ->
+    mainloop(Deb, State#wstate{reader = Parent}).
+
+system_terminate(Reason, _Parent, _Deb, _State) ->
+    exit(Reason).
+
+system_code_change(Misc, _Module, _OldVsn, _Extra) ->
+    {ok, Misc}.
+
+mainloop(Deb, State) ->
     try
-        mainloop1(ReaderPid, State)
+        mainloop1(Deb, State)
     catch
-        exit:Error -> ReaderPid ! {channel_exit, #wstate.channel, Error}
+        exit:Error -> #wstate{reader = ReaderPid, channel = Channel} = State,
+                      ReaderPid ! {channel_exit, Channel, Error}
     end,
     done.
 
-mainloop1(ReaderPid, State = #wstate{pending = []}) ->
+mainloop1(Deb, State = #wstate{pending = []}) ->
     receive
-        Message -> ?MODULE:mainloop1(ReaderPid, handle_message(Message, State))
+        Message -> {Deb1, State1} = handle_message(Deb, Message, State),
+                   ?MODULE:mainloop1(Deb1, State1)
     after ?HIBERNATE_AFTER ->
-            erlang:hibernate(?MODULE, mainloop, [ReaderPid, State])
+            erlang:hibernate(?MODULE, mainloop, [Deb, State])
     end;
-mainloop1(ReaderPid, State) ->
+mainloop1(Deb, State) ->
     receive
-        Message -> ?MODULE:mainloop1(ReaderPid, handle_message(Message, State))
+        Message -> {Deb1, State1} = handle_message(Deb, Message, State),
+                   ?MODULE:mainloop1(Deb1, State1)
     after 0 ->
-            ?MODULE:mainloop1(ReaderPid, flush(State))
+            ?MODULE:mainloop1(Deb, internal_flush(State))
     end.
 
+handle_message(Deb, {system, From, Req}, State = #wstate{reader = Parent}) ->
+    sys:handle_system_msg(Req, From, Parent, ?MODULE, Deb, State);
+handle_message(Deb, Message, State) ->
+    {Deb, handle_message(Message, State)}.
+
 handle_message({send_command, MethodRecord}, State) ->
     internal_send_command_async(MethodRecord, State);
 handle_message({send_command, MethodRecord, Content}, State) ->
     internal_send_command_async(MethodRecord, Content, State);
 handle_message({'$gen_call', From, {send_command_sync, MethodRecord}}, State) ->
-    State1 = flush(internal_send_command_async(MethodRecord, State)),
+    State1 = internal_flush(
+               internal_send_command_async(MethodRecord, State)),
     gen_server:reply(From, ok),
     State1;
 handle_message({'$gen_call', From, {send_command_sync, MethodRecord, Content}},
                State) ->
-    State1 = flush(internal_send_command_async(MethodRecord, Content, State)),
+    State1 = internal_flush(
+               internal_send_command_async(MethodRecord, Content, State)),
+    gen_server:reply(From, ok),
+    State1;
+handle_message({'$gen_call', From, flush}, State) ->
+    State1 = internal_flush(State),
     gen_server:reply(From, ok),
     State1;
 handle_message({send_command_and_notify, QPid, ChPid, MethodRecord}, State) ->
@@ -139,9 +193,12 @@ handle_message({'DOWN', _MRef, process, QPid, _Reason}, State) ->
     rabbit_amqqueue:notify_sent_queue_down(QPid),
     State;
 handle_message({inet_reply, _, ok}, State) ->
-    State;
+    rabbit_event:ensure_stats_timer(State, #wstate.stats_timer, emit_stats);
 handle_message({inet_reply, _, Status}, _State) ->
     exit({writer, send_failed, Status});
+handle_message(emit_stats, State = #wstate{reader = ReaderPid}) ->
+    ReaderPid ! ensure_stats,
+    rabbit_event:reset_stats_timer(State, #wstate.stats_timer);
 handle_message(Message, _State) ->
     exit({writer, message_not_understood, Message}).
 
@@ -169,6 +226,8 @@ send_command_and_notify(W, Q, ChPid, MethodRecord, Content) ->
     W ! {send_command_and_notify, Q, ChPid, MethodRecord, Content},
     ok.
 
+flush(W) -> call(W, flush).
+
 %%---------------------------------------------------------------------------
 
 call(Pid, Msg) ->
@@ -228,13 +287,13 @@ internal_send_command_async(MethodRecord, Content,
 
 maybe_flush(State = #wstate{pending = Pending}) ->
     case iolist_size(Pending) >= ?FLUSH_THRESHOLD of
-        true  -> flush(State);
+        true  -> internal_flush(State);
         false -> State
     end.
 
-flush(State = #wstate{pending = []}) ->
+internal_flush(State = #wstate{pending = []}) ->
     State;
-flush(State = #wstate{sock = Sock, pending = Pending}) ->
+internal_flush(State = #wstate{sock = Sock, pending = Pending}) ->
     ok = port_cmd(Sock, lists:reverse(Pending)),
     State#wstate{pending = []}.
 
diff --git a/src/supervised_lifecycle.erl b/src/supervised_lifecycle.erl
new file mode 100644
index 00000000..8b306f6f
--- /dev/null
+++ b/src/supervised_lifecycle.erl
@@ -0,0 +1,68 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License
+%% at http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and
+%% limitations under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
+%%
+
+%% Invoke callbacks on startup and termination.
+%%
+%% Simply hook this process into a supervision hierarchy, to have the
+%% callbacks invoked at a precise point during the establishment and
+%% teardown of that hierarchy, respectively.
+%%
+%% Or launch the process independently, and link to it, to have the
+%% callbacks invoked on startup and when the linked process
+%% terminates, respectively.
+
+-module(supervised_lifecycle).
+
+-behavior(gen_server).
+
+-export([start_link/3]).
+
+%% gen_server callbacks
+-export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2,
+         code_change/3]).
+
+%%----------------------------------------------------------------------------
+
+-ifdef(use_specs).
+
+-spec(start_link/3 :: (atom(), rabbit_types:mfargs(), rabbit_types:mfargs()) ->
+                           rabbit_types:ok_pid_or_error()).
+
+-endif.
+
+%%----------------------------------------------------------------------------
+
+%% Start the lifecycle process, registered locally as Name and linked
+%% to the caller. StartMFA is invoked from init/1 and StopMFA from
+%% terminate/2.
+start_link(Name, StartMFA, StopMFA) ->
+    ServerName = {local, Name},
+    InitArgs   = [StartMFA, StopMFA],
+    gen_server:start_link(ServerName, ?MODULE, InitArgs, []).
+
+%%----------------------------------------------------------------------------
+
+%% Run the start callback and keep the stop callback as the server
+%% state. Exits are trapped so that terminate/2 - and with it the
+%% stop callback - runs when the parent shuts this process down.
+init([{Mod, Fun, Args}, StopMFA]) ->
+    process_flag(trap_exit, true),
+    erlang:apply(Mod, Fun, Args),
+    {ok, StopMFA}.
+
+%% No calls are supported. NOTE(review): no reply is ever sent, so a
+%% gen_server:call to this process would block until the caller's
+%% timeout - presumably nothing calls it; confirm.
+handle_call(_Request, _From, State) ->
+    {noreply, State}.
+
+%% Casts are ignored; the state is left untouched.
+handle_cast(_Msg, State) ->
+    {noreply, State}.
+
+%% Any other message is ignored; the state is left untouched.
+handle_info(_Info, State) ->
+    {noreply, State}.
+
+%% Run the stop callback stored as the server state by init/1. Its
+%% result is discarded.
+terminate(_Reason, {Mod, Fun, Args}) ->
+    erlang:apply(Mod, Fun, Args),
+    ok.
+
+%% Nothing to convert on code upgrade; the state is kept as is.
+code_change(_OldVsn, State, _Extra) ->
+    {ok, State}.
diff --git a/src/supervisor2.erl b/src/supervisor2.erl
index 3d3623d7..5a6dc887 100644
--- a/src/supervisor2.erl
+++ b/src/supervisor2.erl
@@ -1,14 +1,18 @@
-%% This file is a copy of supervisor.erl from the R13B-3 Erlang/OTP
+%% This file is a copy of supervisor.erl from the R16B Erlang/OTP
 %% distribution, with the following modifications:
 %%
 %% 1) the module name is supervisor2
 %%
-%% 2) there is a new strategy called
-%%    simple_one_for_one_terminate. This is exactly the same as for
-%%    simple_one_for_one, except that children *are* explicitly
-%%    terminated as per the shutdown component of the child_spec.
+%% 2) a find_child/2 utility function has been added
 %%
-%% 3) child specifications can contain, as the restart type, a tuple
+%% 3) Added an 'intrinsic' restart type. Like the transient type, this
+%%    type means the child should only be restarted if the child exits
+%%    abnormally. Unlike the transient type, if the child exits
+%%    normally, the supervisor itself also exits normally. If the
+%%    child is a supervisor and it exits normally (i.e. with reason of
+%%    'shutdown') then the child's parent also exits normally.
+%%
+%% 4) child specifications can contain, as the restart type, a tuple
 %%    {permanent, Delay} | {transient, Delay} | {intrinsic, Delay}
 %%    where Delay >= 0 (see point (4) below for intrinsic). The delay,
 %%    in seconds, indicates what should happen if a child, upon being
@@ -41,21 +45,14 @@
 %%    perspective it's a normal exit, whilst from supervisor's
 %%    perspective, it's an abnormal exit.
 %%
-%% 4) Added an 'intrinsic' restart type. Like the transient type, this
-%%    type means the child should only be restarted if the child exits
-%%    abnormally. Unlike the transient type, if the child exits
-%%    normally, the supervisor itself also exits normally. If the
-%%    child is a supervisor and it exits normally (i.e. with reason of
-%%    'shutdown') then the child's parent also exits normally.
-%%
 %% 5) normal, and {shutdown, _} exit reasons are all treated the same
 %%    (i.e. are regarded as normal exits)
 %%
-%% All modifications are (C) 2010-2012 VMware, Inc.
+%% All modifications are (C) 2010-2013 GoPivotal, Inc.
 %%
 %% %CopyrightBegin%
 %%
-%% Copyright Ericsson AB 1996-2009. All Rights Reserved.
+%% Copyright Ericsson AB 1996-2012. All Rights Reserved.
 %%
 %% The contents of this file are subject to the Erlang Public License,
 %% Version 1.1, (the "License"); you may not use this file except in
@@ -75,61 +72,34 @@
 -behaviour(gen_server).
 
 %% External exports
--export([start_link/2,start_link/3,
+-export([start_link/2, start_link/3,
 	 start_child/2, restart_child/2,
 	 delete_child/2, terminate_child/2,
-	 which_children/1, find_child/2,
-	 check_childspecs/1]).
+	 which_children/1, count_children/1,
+	 find_child/2, check_childspecs/1]).
 
 %% Internal exports
--export([init/1, handle_call/3, handle_info/2, terminate/2, code_change/3]).
--export([handle_cast/2]).
-
--define(DICT, dict).
-
--record(state, {name,
-		strategy,
-		children = [],
-		dynamics = ?DICT:new(),
-		intensity,
-		period,
-		restarts = [],
-	        module,
-	        args}).
-
--record(child, {pid = undefined,  % pid is undefined when child is not running
-		name,
-		mfa,
-		restart_type,
-		shutdown,
-		child_type,
-		modules = []}).
-
--define(is_simple(State), State#state.strategy =:= simple_one_for_one orelse
-        State#state.strategy =:= simple_one_for_one_terminate).
--define(is_terminate_simple(State),
-        State#state.strategy =:= simple_one_for_one_terminate).
-
--ifdef(use_specs).
+-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
+	 terminate/2, code_change/3]).
+-export([try_again_restart/3]).
 
 %%--------------------------------------------------------------------------
-%% Types
+-ifdef(use_specs).
+-export_type([child_spec/0, startchild_ret/0, strategy/0]).
+-endif.
 %%--------------------------------------------------------------------------
 
--export_type([child_spec/0, startchild_ret/0, strategy/0, sup_name/0]).
-
--type child() :: 'undefined' | pid().
+-ifdef(use_specs).
+-type child()    :: 'undefined' | pid().
 -type child_id() :: term().
--type mfargs() :: {M :: module(), F :: atom(), A :: [term()] | undefined}.
--type modules() :: [module()] | 'dynamic'.
--type delay() :: non_neg_integer().
--type restart() :: 'permanent' | 'transient' | 'temporary' | 'intrinsic'
-                 | {'permanent', delay()} | {'transient', delay()}
-                 | {'intrinsic', delay()}.
+-type mfargs()   :: {M :: module(), F :: atom(), A :: [term()] | undefined}.
+-type modules()  :: [module()] | 'dynamic'.
+-type delay()    :: non_neg_integer().
+-type restart()  :: 'permanent' | 'transient' | 'temporary' | 'intrinsic' | {'permanent', delay()} | {'transient', delay()} | {'intrinsic', delay()}.
 -type shutdown() :: 'brutal_kill' | timeout().
--type worker() :: 'worker' | 'supervisor'.
+-type worker()   :: 'worker' | 'supervisor'.
 -type sup_name() :: {'local', Name :: atom()} | {'global', Name :: atom()}.
--type sup_ref() :: (Name :: atom())
+-type sup_ref()  :: (Name :: atom())
                   | {Name :: atom(), Node :: node()}
                   | {'global', Name :: atom()}
                   | pid().
@@ -140,40 +110,110 @@
                        Type :: worker(),
                        Modules :: modules()}.
 
-
 -type strategy() :: 'one_for_all' | 'one_for_one'
-                  | 'rest_for_one' | 'simple_one_for_one'
-                  | 'simple_one_for_one_terminate'.
-
--type child_rec() :: #child{pid :: child() | {restarting,pid()} | [pid()],
-                            name :: child_id(),
-                            mfa :: mfargs(),
-                            restart_type :: restart(),
-                            shutdown :: shutdown(),
-                            child_type :: worker(),
-                            modules :: modules()}.
-
--type state() :: #state{strategy :: strategy(),
-                        children :: [child_rec()],
-                        dynamics :: ?DICT(),
-                        intensity :: non_neg_integer(),
-                        period :: pos_integer()}.
+                  | 'rest_for_one' | 'simple_one_for_one'.
+-endif.
 
 %%--------------------------------------------------------------------------
-%% Callback behaviour
-%%--------------------------------------------------------------------------
 
+-ifdef(use_specs).
+-record(child, {% pid is undefined when child is not running
+	        pid = undefined :: child() | {restarting,pid()} | [pid()],
+		name            :: child_id(),
+		mfargs          :: mfargs(),
+		restart_type    :: restart(),
+		shutdown        :: shutdown(),
+		child_type      :: worker(),
+		modules = []    :: modules()}).
+-type child_rec() :: #child{}.
+-else.
+-record(child, {
+	        pid = undefined,
+		name,
+		mfargs,
+		restart_type,
+		shutdown,
+		child_type,
+		modules = []}).
+-endif.
+
+-define(DICT, dict).
+-define(SETS, sets).
+-define(SET, set).
+
+-ifdef(use_specs).
+-record(state, {name,
+		strategy               :: strategy(),
+		children = []          :: [child_rec()],
+		dynamics               :: ?DICT() | ?SET(),
+		intensity              :: non_neg_integer(),
+		period                 :: pos_integer(),
+		restarts = [],
+	        module,
+	        args}).
+-type state() :: #state{}.
+-else.
+-record(state, {name,
+		strategy,
+		children = [],
+		dynamics,
+		intensity,
+		period,
+		restarts = [],
+	        module,
+	        args}).
+-endif.
+
+-define(is_simple(State), State#state.strategy =:= simple_one_for_one).
+-define(is_permanent(R), ((R =:= permanent) orelse
+                          (is_tuple(R) andalso
+                           tuple_size(R) == 2 andalso
+                           element(1, R) =:= permanent))).
+-define(is_explicit_restart(R),
+        R == {shutdown, restart}).
+
+-ifdef(use_specs).
 -callback init(Args :: term()) ->
     {ok, {{RestartStrategy :: strategy(),
-           MaxR :: non_neg_integer(),
-           MaxT :: non_neg_integer()},
+           MaxR            :: non_neg_integer(),
+           MaxT            :: non_neg_integer()},
            [ChildSpec :: child_spec()]}}
     | ignore.
+-endif.
+-define(restarting(_Pid_), {restarting,_Pid_}).
 
-%%--------------------------------------------------------------------------
-%% Specs
-%%--------------------------------------------------------------------------
+%%% ---------------------------------------------------
+%%% This is a general process supervisor built upon gen_server.erl.
+%%% Servers/processes should/could also be built using gen_server.erl.
+%%% SupName = {local, atom()} | {global, atom()}.
+%%% ---------------------------------------------------
+-ifdef(use_specs).
+-type startlink_err() :: {'already_started', pid()}
+                         | {'shutdown', term()}
+                         | term().
+-type startlink_ret() :: {'ok', pid()} | 'ignore' | {'error', startlink_err()}.
+
+-spec start_link(Module, Args) -> startlink_ret() when
+      Module :: module(),
+      Args :: term().
 
+-endif.
+start_link(Mod, Args) ->
+    gen_server:start_link(?MODULE, {self, Mod, Args}, []).
+ 
+-ifdef(use_specs).
+-spec start_link(SupName, Module, Args) -> startlink_ret() when
+      SupName :: sup_name(),
+      Module :: module(),
+      Args :: term().
+-endif.
+start_link(SupName, Mod, Args) ->
+    gen_server:start_link(SupName, ?MODULE, {SupName, Mod, Args}, []).
+ 
+%%% ---------------------------------------------------
+%%% Interface functions.
+%%% ---------------------------------------------------
+-ifdef(use_specs).
 -type startchild_err() :: 'already_present'
 			| {'already_started', Child :: child()} | term().
 -type startchild_ret() :: {'ok', Child :: child()}
@@ -183,91 +223,30 @@
 -spec start_child(SupRef, ChildSpec) -> startchild_ret() when
       SupRef :: sup_ref(),
       ChildSpec :: child_spec() | (List :: [term()]).
+-endif.
+start_child(Supervisor, ChildSpec) ->
+    call(Supervisor, {start_child, ChildSpec}).
 
+-ifdef(use_specs).
 -spec restart_child(SupRef, Id) -> Result when
       SupRef :: sup_ref(),
       Id :: child_id(),
       Result :: {'ok', Child :: child()}
               | {'ok', Child :: child(), Info :: term()}
               | {'error', Error},
-      Error :: 'running' | 'not_found' | 'simple_one_for_one' | term().
+      Error :: 'running' | 'restarting' | 'not_found' | 'simple_one_for_one' |
+	       term().
+-endif.
+restart_child(Supervisor, Name) ->
+    call(Supervisor, {restart_child, Name}).
 
+-ifdef(use_specs).
 -spec delete_child(SupRef, Id) -> Result when
       SupRef :: sup_ref(),
       Id :: child_id(),
       Result :: 'ok' | {'error', Error},
-      Error :: 'running' | 'not_found' | 'simple_one_for_one'.
-
--spec terminate_child(SupRef, Id) -> Result when
-      SupRef :: sup_ref(),
-      Id :: pid() | child_id(),
-      Result :: 'ok' | {'error', Error},
-      Error :: 'not_found' | 'simple_one_for_one'.
-
--spec which_children(SupRef) -> [{Id,Child,Type,Modules}] when
-      SupRef :: sup_ref(),
-      Id :: child_id() | 'undefined',
-      Child :: child(),
-      Type :: worker(),
-      Modules :: modules().
-
--spec check_childspecs(ChildSpecs) -> Result when
-      ChildSpecs :: [child_spec()],
-      Result :: 'ok' | {'error', Error :: term()}.
-
--type init_sup_name() :: sup_name() | 'self'.
-
--type stop_rsn() :: 'shutdown' | {'bad_return', {module(),'init', term()}}
-                  | {'bad_start_spec', term()} | {'start_spec', term()}
-                  | {'supervisor_data', term()}.
-
--spec init({init_sup_name(), module(), [term()]}) ->
-        {'ok', state()} | 'ignore' | {'stop', stop_rsn()}.
-
--type call() :: 'which_children'.
--spec handle_call(call(), term(), state()) -> {'reply', term(), state()}.
-
--spec handle_cast('null', state()) -> {'noreply', state()}.
-
--spec handle_info(term(), state()) ->
-        {'noreply', state()} | {'stop', 'shutdown', state()}.
-
--spec terminate(term(), state()) -> 'ok'.
-
--spec code_change(term(), state(), term()) ->
-        {'ok', state()} | {'error', term()}.
-
--else.
-
--export([behaviour_info/1]).
-
-behaviour_info(callbacks) ->
-    [{init,1}];
-behaviour_info(_Other) ->
-    undefined.
-
+      Error :: 'running' | 'restarting' | 'not_found' | 'simple_one_for_one'.
 -endif.
-
-%%% ---------------------------------------------------
-%%% This is a general process supervisor built upon gen_server.erl.
-%%% Servers/processes should/could also be built using gen_server.erl.
-%%% SupName = {local, atom()} | {global, atom()}.
-%%% ---------------------------------------------------
-start_link(Mod, Args) ->
-    gen_server:start_link(?MODULE, {self, Mod, Args}, []).
- 
-start_link(SupName, Mod, Args) ->
-    gen_server:start_link(SupName, ?MODULE, {SupName, Mod, Args}, []).
- 
-%%% ---------------------------------------------------
-%%% Interface functions.
-%%% ---------------------------------------------------
-start_child(Supervisor, ChildSpec) ->
-    call(Supervisor, {start_child, ChildSpec}).
-
-restart_child(Supervisor, Name) ->
-    call(Supervisor, {restart_child, Name}).
-
 delete_child(Supervisor, Name) ->
     call(Supervisor, {delete_child, Name}).
 
@@ -277,12 +256,44 @@ delete_child(Supervisor, Name) ->
 %%          Note that the child is *always* terminated in some
 %%          way (maybe killed).
 %%-----------------------------------------------------------------
+-ifdef(use_specs).
+-spec terminate_child(SupRef, Id) -> Result when
+      SupRef :: sup_ref(),
+      Id :: pid() | child_id(),
+      Result :: 'ok' | {'error', Error},
+      Error :: 'not_found' | 'simple_one_for_one'.
+-endif.
 terminate_child(Supervisor, Name) ->
     call(Supervisor, {terminate_child, Name}).
 
+-ifdef(use_specs).
+-spec which_children(SupRef) -> [{Id,Child,Type,Modules}] when
+      SupRef :: sup_ref(),
+      Id :: child_id() | undefined,
+      Child :: child() | 'restarting',
+      Type :: worker(),
+      Modules :: modules().
+-endif.
 which_children(Supervisor) ->
     call(Supervisor, which_children).
 
+-ifdef(use_specs).
+-spec count_children(SupRef) -> PropListOfCounts when
+      SupRef :: sup_ref(),
+      PropListOfCounts :: [Count],
+      Count :: {specs, ChildSpecCount :: non_neg_integer()}
+             | {active, ActiveProcessCount :: non_neg_integer()}
+             | {supervisors, ChildSupervisorCount :: non_neg_integer()}
+             |{workers, ChildWorkerCount :: non_neg_integer()}.
+-endif.
+count_children(Supervisor) ->
+    call(Supervisor, count_children).
+
+-ifdef(use_specs).
+-spec find_child(Supervisor, Name) -> [pid()] when
+      Supervisor :: sup_ref(),
+      Name :: child_id().
+-endif.
 find_child(Supervisor, Name) ->
     [Pid || {Name1, Pid, _Type, _Modules} <- which_children(Supervisor),
             Name1 =:= Name].
@@ -290,6 +301,11 @@ find_child(Supervisor, Name) ->
 call(Supervisor, Req) ->
     gen_server:call(Supervisor, Req, infinity).
 
+-ifdef(use_specs).
+-spec check_childspecs(ChildSpecs) -> Result when
+      ChildSpecs :: [child_spec()],
+      Result :: 'ok' | {'error', Error :: term()}.
+-endif.
 check_childspecs(ChildSpecs) when is_list(ChildSpecs) ->
     case check_startspec(ChildSpecs) of
 	{ok, _} -> ok;
@@ -297,11 +313,37 @@ check_childspecs(ChildSpecs) when is_list(ChildSpecs) ->
     end;
 check_childspecs(X) -> {error, {badarg, X}}.
 
+%%%-----------------------------------------------------------------
+%%% Called by timer:apply_after from maybe_restart/3
+-ifdef(use_specs).
+-spec try_again_restart(SupRef, Child, Reason) -> ok when
+      SupRef :: sup_ref(),
+      Child :: child_id() | pid(),
+      Reason :: term().
+-endif.
+try_again_restart(Supervisor, Child, Reason) ->
+    cast(Supervisor, {try_again_restart, Child, Reason}).
+
+cast(Supervisor, Req) ->
+    gen_server:cast(Supervisor, Req).
+
 %%% ---------------------------------------------------
 %%% 
 %%% Initialize the supervisor.
 %%% 
 %%% ---------------------------------------------------
+-ifdef(use_specs).
+-type init_sup_name() :: sup_name() | 'self'.
+
+-type stop_rsn() :: {'shutdown', term()}
+                  | {'bad_return', {module(),'init', term()}}
+                  | {'bad_start_spec', term()}
+                  | {'start_spec', term()}
+                  | {'supervisor_data', term()}.
+
+-spec init({init_sup_name(), module(), [term()]}) ->
+        {'ok', state()} | 'ignore' | {'stop', stop_rsn()}.
+-endif.
 init({SupName, Mod, Args}) ->
     process_flag(trap_exit, true),
     case Mod:init(Args) of
@@ -319,7 +361,7 @@ init({SupName, Mod, Args}) ->
 	Error ->
 	    {stop, {bad_return, {Mod, init, Error}}}
     end.
-	
+
 init_children(State, StartSpec) ->
     SupName = State#state.name,
     case check_startspec(StartSpec) of
@@ -327,9 +369,9 @@ init_children(State, StartSpec) ->
             case start_children(Children, SupName) of
                 {ok, NChildren} ->
                     {ok, State#state{children = NChildren}};
-                {error, NChildren} ->
+                {error, NChildren, Reason} ->
                     terminate_children(NChildren, SupName),
-                    {stop, shutdown}
+                    {stop, {shutdown, Reason}}
             end;
         Error ->
             {stop, {start_spec, Error}}
@@ -347,32 +389,35 @@ init_dynamic(_State, StartSpec) ->
 
 %%-----------------------------------------------------------------
 %% Func: start_children/2
-%% Args: Children = [#child] in start order
-%%       SupName = {local, atom()} | {global, atom()} | {pid(),Mod}
-%% Purpose: Start all children.  The new list contains #child's 
+%% Args: Children = [child_rec()] in start order
+%%       SupName = {local, atom()} | {global, atom()} | {pid(), Mod}
+%% Purpose: Start all children.  The new list contains #child's
 %%          with pids.
-%% Returns: {ok, NChildren} | {error, NChildren}
-%%          NChildren = [#child] in termination order (reversed
+%% Returns: {ok, NChildren} | {error, NChildren, Reason}
+%%          NChildren = [child_rec()] in termination order (reversed
 %%                        start order)
 %%-----------------------------------------------------------------
 start_children(Children, SupName) -> start_children(Children, [], SupName).
 
 start_children([Child|Chs], NChildren, SupName) ->
     case do_start_child(SupName, Child) of
+	{ok, undefined} when Child#child.restart_type =:= temporary ->
+	    start_children(Chs, NChildren, SupName);
 	{ok, Pid} ->
 	    start_children(Chs, [Child#child{pid = Pid}|NChildren], SupName);
 	{ok, Pid, _Extra} ->
 	    start_children(Chs, [Child#child{pid = Pid}|NChildren], SupName);
 	{error, Reason} ->
 	    report_error(start_error, Reason, Child, SupName),
-	    {error, lists:reverse(Chs) ++ [Child | NChildren]}
+	    {error, lists:reverse(Chs) ++ [Child | NChildren],
+	     {failed_to_start_child,Child#child.name,Reason}}
     end;
 start_children([], NChildren, _SupName) ->
     {ok, NChildren}.
 
 do_start_child(SupName, Child) ->
-    #child{mfa = {M, F, A}} = Child,
-    case catch apply(M, F, A) of
+    #child{mfargs = {M, F, Args}} = Child,
+    case catch apply(M, F, Args) of
 	{ok, Pid} when is_pid(Pid) ->
 	    NChild = Child#child{pid = Pid},
 	    report_progress(NChild, SupName),
@@ -381,7 +426,7 @@ do_start_child(SupName, Child) ->
 	    NChild = Child#child{pid = Pid},
 	    report_progress(NChild, SupName),
 	    {ok, Pid, Extra};
-	ignore -> 
+	ignore ->
 	    {ok, undefined};
 	{error, What} -> {error, What};
 	What -> {error, What}
@@ -400,36 +445,55 @@ do_start_child_i(M, F, A) ->
 	What ->
 	    {error, What}
     end.
-    
 
 %%% ---------------------------------------------------
 %%% 
 %%% Callback functions.
 %%% 
 %%% ---------------------------------------------------
+-ifdef(use_specs).
+-type call() :: 'which_children' | 'count_children' | {_, _}.	% XXX: refine
+-spec handle_call(call(), term(), state()) -> {'reply', term(), state()}.
+-endif.
 handle_call({start_child, EArgs}, _From, State) when ?is_simple(State) ->
-    #child{mfa = {M, F, A}} = hd(State#state.children),
+    Child = hd(State#state.children),
+    #child{mfargs = {M, F, A}} = Child,
     Args = A ++ EArgs,
     case do_start_child_i(M, F, Args) of
-        {ok, undefined} ->
-            {reply, {ok, undefined}, State};
+	{ok, undefined} when Child#child.restart_type =:= temporary ->
+	    {reply, {ok, undefined}, State};
 	{ok, Pid} ->
-	    NState = State#state{dynamics = 
-				 ?DICT:store(Pid, Args, State#state.dynamics)},
+	    NState = save_dynamic_child(Child#child.restart_type, Pid, Args, State),
 	    {reply, {ok, Pid}, NState};
 	{ok, Pid, Extra} ->
-	    NState = State#state{dynamics = 
-				 ?DICT:store(Pid, Args, State#state.dynamics)},
+	    NState = save_dynamic_child(Child#child.restart_type, Pid, Args, State),
 	    {reply, {ok, Pid, Extra}, NState};
 	What ->
 	    {reply, What, State}
     end;
 
-%%% The requests terminate_child, delete_child and restart_child are
-%%% invalid for simple_one_for_one and simple_one_for_one_terminate
-%%% supervisors.
+%% terminate_child for simple_one_for_one can only be done with pid
+handle_call({terminate_child, Name}, _From, State) when not is_pid(Name),
+							?is_simple(State) ->
+    {reply, {error, simple_one_for_one}, State};
+
+handle_call({terminate_child, Name}, _From, State) ->
+    case get_child(Name, State, ?is_simple(State)) of
+	{value, Child} ->
+	    case do_terminate(Child, State#state.name) of
+		#child{restart_type=RT} when RT=:=temporary; ?is_simple(State) ->
+		    {reply, ok, state_del_child(Child, State)};
+		NChild ->
+		    {reply, ok, replace_child(NChild, State)}
+		end;
+	false ->
+	    {reply, {error, not_found}, State}
+    end;
+
+%%% The requests delete_child and restart_child are invalid for
+%%% simple_one_for_one supervisors.
 handle_call({_Req, _Data}, _From, State) when ?is_simple(State) ->
-    {reply, {error, State#state.strategy}, State};
+    {reply, {error, simple_one_for_one}, State};
 
 handle_call({start_child, ChildSpec}, _From, State) ->
     case check_childspec(ChildSpec) of
@@ -453,6 +517,8 @@ handle_call({restart_child, Name}, _From, State) ->
 		Error ->
 		    {reply, Error, State}
 	    end;
+	{value, #child{pid=?restarting(_)}} ->
+	    {reply, {error, restarting}, State};
 	{value, _} ->
 	    {reply, {error, running}, State};
 	_ ->
@@ -464,60 +530,146 @@ handle_call({delete_child, Name}, _From, State) ->
 	{value, Child} when Child#child.pid =:= undefined ->
 	    NState = remove_child(Child, State),
 	    {reply, ok, NState};
+	{value, #child{pid=?restarting(_)}} ->
+	    {reply, {error, restarting}, State};
 	{value, _} ->
 	    {reply, {error, running}, State};
 	_ ->
 	    {reply, {error, not_found}, State}
     end;
 
-handle_call({terminate_child, Name}, _From, State) ->
-    case get_child(Name, State) of
-	{value, Child} ->
-	    NChild = do_terminate(Child, State#state.name),
-	    {reply, ok, replace_child(NChild, State)};
-	_ ->
-	    {reply, {error, not_found}, State}
-    end;
+handle_call(which_children, _From, #state{children = [#child{restart_type = temporary,
+							     child_type = CT,
+							     modules = Mods}]} =
+		State) when ?is_simple(State) ->
+    Reply = lists:map(fun(Pid) -> {undefined, Pid, CT, Mods} end,
+                      ?SETS:to_list(dynamics_db(temporary, State#state.dynamics))),
+    {reply, Reply, State};
 
-handle_call(which_children, _From, State) when ?is_simple(State) ->
-    [#child{child_type = CT, modules = Mods}] = State#state.children,
-    Reply = lists:map(fun ({Pid, _}) -> {undefined, Pid, CT, Mods} end,
-		      ?DICT:to_list(State#state.dynamics)),
+handle_call(which_children, _From, #state{children = [#child{restart_type = RType,
+							 child_type = CT,
+							 modules = Mods}]} =
+		State) when ?is_simple(State) ->
+    Reply = lists:map(fun({?restarting(_),_}) -> {undefined,restarting,CT,Mods};
+			 ({Pid, _}) -> {undefined, Pid, CT, Mods} end,
+		      ?DICT:to_list(dynamics_db(RType, State#state.dynamics))),
     {reply, Reply, State};
 
 handle_call(which_children, _From, State) ->
     Resp =
-	lists:map(fun (#child{pid = Pid, name = Name,
+	lists:map(fun(#child{pid = ?restarting(_), name = Name,
+			     child_type = ChildType, modules = Mods}) ->
+			  {Name, restarting, ChildType, Mods};
+		     (#child{pid = Pid, name = Name,
 			     child_type = ChildType, modules = Mods}) ->
-		    {Name, Pid, ChildType, Mods}
+			  {Name, Pid, ChildType, Mods}
 		  end,
 		  State#state.children),
-    {reply, Resp, State}.
+    {reply, Resp, State};
 
-%%% Hopefully cause a function-clause as there is no API function
-%%% that utilizes cast.
-handle_cast(null, State) ->
-    error_logger:error_msg("ERROR: Supervisor received cast-message 'null'~n", 
-			   []),
 
-    {noreply, State}.
+handle_call(count_children, _From, #state{children = [#child{restart_type = temporary,
+							     child_type = CT}]} = State)
+  when ?is_simple(State) ->
+    {Active, Count} =
+	?SETS:fold(fun(Pid, {Alive, Tot}) ->
+			   case is_pid(Pid) andalso is_process_alive(Pid) of
+			       true ->{Alive+1, Tot +1};
+			       false ->
+				   {Alive, Tot + 1}
+			   end
+		   end, {0, 0}, dynamics_db(temporary, State#state.dynamics)),
+    Reply = case CT of
+		supervisor -> [{specs, 1}, {active, Active},
+			       {supervisors, Count}, {workers, 0}];
+		worker -> [{specs, 1}, {active, Active},
+			   {supervisors, 0}, {workers, Count}]
+	    end,
+    {reply, Reply, State};
 
-handle_info({delayed_restart, {RestartType, Reason, Child}}, State)
+handle_call(count_children, _From,  #state{children = [#child{restart_type = RType,
+							      child_type = CT}]} = State)
   when ?is_simple(State) ->
-    {ok, NState} = do_restart(RestartType, Reason, Child, State),
-    {noreply, NState};
-handle_info({delayed_restart, {RestartType, Reason, Child}}, State) ->
-    case get_child(Child#child.name, State) of
-        {value, Child1} ->
-            {ok, NState} = do_restart(RestartType, Reason, Child1, State),
-            {noreply, NState};
-        _ ->
+    {Active, Count} =
+	?DICT:fold(fun(Pid, _Val, {Alive, Tot}) ->
+			   case is_pid(Pid) andalso is_process_alive(Pid) of
+			       true ->
+				   {Alive+1, Tot +1};
+			       false ->
+				   {Alive, Tot + 1}
+			   end
+		   end, {0, 0}, dynamics_db(RType, State#state.dynamics)),
+    Reply = case CT of
+		supervisor -> [{specs, 1}, {active, Active},
+			       {supervisors, Count}, {workers, 0}];
+		worker -> [{specs, 1}, {active, Active},
+			   {supervisors, 0}, {workers, Count}]
+	    end,
+    {reply, Reply, State};
+
+handle_call(count_children, _From, State) ->
+    %% Specs and children are together on the children list...
+    {Specs, Active, Supers, Workers} =
+	lists:foldl(fun(Child, Counts) ->
+			   count_child(Child, Counts)
+		   end, {0,0,0,0}, State#state.children),
+
+    %% Reformat counts to a property list.
+    Reply = [{specs, Specs}, {active, Active},
+	     {supervisors, Supers}, {workers, Workers}],
+    {reply, Reply, State}.
+
+
+count_child(#child{pid = Pid, child_type = worker},
+	    {Specs, Active, Supers, Workers}) ->
+    case is_pid(Pid) andalso is_process_alive(Pid) of
+	true ->  {Specs+1, Active+1, Supers, Workers+1};
+	false -> {Specs+1, Active, Supers, Workers+1}
+    end;
+count_child(#child{pid = Pid, child_type = supervisor},
+	    {Specs, Active, Supers, Workers}) ->
+    case is_pid(Pid) andalso is_process_alive(Pid) of
+	true ->  {Specs+1, Active+1, Supers+1, Workers};
+	false -> {Specs+1, Active, Supers+1, Workers}
+    end.
+
+
+%%% If a restart attempt failed, this message is sent via
+%%% timer:apply_after(0,...) in order to give gen_server the chance to
+%%% check its inbox before trying again.
+-ifdef(use_specs).
+-spec handle_cast({try_again_restart, child_id() | pid(), term()}, state()) ->
+			 {'noreply', state()} | {stop, shutdown, state()}.
+-endif.
+handle_cast({try_again_restart,Pid,Reason}, #state{children=[Child]}=State)
+  when ?is_simple(State) ->
+    RT = Child#child.restart_type,
+    RPid = restarting(Pid),
+    case dynamic_child_args(RPid, dynamics_db(RT, State#state.dynamics)) of
+	{ok, Args} ->
+	    {M, F, _} = Child#child.mfargs,
+	    NChild = Child#child{pid = RPid, mfargs = {M, F, Args}},
+            try_restart(Child#child.restart_type, Reason, NChild, State);
+	error ->
             {noreply, State}
     end;
 
+handle_cast({try_again_restart,Name,Reason}, State) ->
+    %% we still support >= R12-B3 in which lists:keyfind/3 doesn't exist
+    case lists:keysearch(Name,#child.name,State#state.children) of
+	{value, Child = #child{pid=?restarting(_), restart_type=RestartType}} ->
+            try_restart(RestartType, Reason, Child, State);
+	_ ->
+	    {noreply,State}
+    end.
+
 %%
 %% Take care of terminated children.
 %%
+-ifdef(use_specs).
+-spec handle_info(term(), state()) ->
+        {'noreply', state()} | {'stop', 'shutdown', state()}.
+-endif.
 handle_info({'EXIT', Pid, Reason}, State) ->
     case restart_child(Pid, Reason, State) of
 	{ok, State1} ->
@@ -526,20 +678,34 @@ handle_info({'EXIT', Pid, Reason}, State) ->
 	    {stop, shutdown, State1}
     end;
 
+handle_info({delayed_restart, {RestartType, Reason, Child}}, State)
+  when ?is_simple(State) ->
+    try_restart(RestartType, Reason, Child, State);
+handle_info({delayed_restart, {RestartType, Reason, Child}}, State) ->
+    case get_child(Child#child.name, State) of
+        {value, Child1} ->
+            try_restart(RestartType, Reason, Child1, State);
+        _What ->
+            {noreply, State}
+    end;
+
 handle_info(Msg, State) ->
     error_logger:error_msg("Supervisor received unexpected message: ~p~n", 
 			   [Msg]),
     {noreply, State}.
+
 %%
 %% Terminate this server.
 %%
-terminate(_Reason, State) when ?is_terminate_simple(State) ->
-    terminate_simple_children(
-      hd(State#state.children), State#state.dynamics, State#state.name),
-    ok;
+-ifdef(use_specs).
+-spec terminate(term(), state()) -> 'ok'.
+-endif.
+terminate(_Reason, #state{children=[Child]} = State) when ?is_simple(State) ->
+    terminate_dynamic_children(Child, dynamics_db(Child#child.restart_type,
+                                                  State#state.dynamics),
+                               State#state.name);
 terminate(_Reason, State) ->
-    terminate_children(State#state.children, State#state.name),
-    ok.
+    terminate_children(State#state.children, State#state.name).
 
 %%
 %% Change code for the supervisor.
@@ -550,6 +716,10 @@ terminate(_Reason, State) ->
 %% NOTE: This requires that the init function of the call-back module
 %%       does not have any side effects.
 %%
+-ifdef(use_specs).
+-spec code_change(term(), state(), term()) ->
+        {'ok', state()} | {'error', term()}.
+-endif.
 code_change(_, State, _) ->
     case (State#state.module):init(State#state.args) of
 	{ok, {SupFlags, StartSpec}} ->
@@ -577,14 +747,13 @@ check_flags({Strategy, MaxIntensity, Period}) ->
 check_flags(What) ->
     {bad_flags, What}.
 
-update_childspec(State, StartSpec)  when ?is_simple(State) -> 
-    case check_startspec(StartSpec) of                        
-        {ok, [Child]} ->                                      
-            {ok, State#state{children = [Child]}};            
-        Error ->                                              
-            {error, Error}                                    
-    end;                                                      
-
+update_childspec(State, StartSpec) when ?is_simple(State) ->
+    case check_startspec(StartSpec) of
+        {ok, [Child]} ->
+            {ok, State#state{children = [Child]}};
+        Error ->
+            {error, Error}
+    end;
 update_childspec(State, StartSpec) ->
     case check_startspec(StartSpec) of
 	{ok, Children} ->
@@ -603,11 +772,11 @@ update_childspec1([Child|OldC], Children, KeepOld) ->
 	    update_childspec1(OldC, Children, [Child|KeepOld])
     end;
 update_childspec1([], Children, KeepOld) ->
-    % Return them in (keeped) reverse start order.
-    lists:reverse(Children ++ KeepOld).  
+    %% Return them in (kept) reverse start order.
+    lists:reverse(Children ++ KeepOld).
 
 update_chsp(OldCh, Children) ->
-    case lists:map(fun (Ch) when OldCh#child.name =:= Ch#child.name ->
+    case lists:map(fun(Ch) when OldCh#child.name =:= Ch#child.name ->
 			   Ch#child{pid = OldCh#child.pid};
 		      (Ch) ->
 			   Ch
@@ -627,20 +796,16 @@ handle_start_child(Child, State) ->
     case get_child(Child#child.name, State) of
 	false ->
 	    case do_start_child(State#state.name, Child) of
+		{ok, undefined} when Child#child.restart_type =:= temporary ->
+		    {{ok, undefined}, State};
 		{ok, Pid} ->
-		    Children = State#state.children,
-		    {{ok, Pid},
-		     State#state{children = 
-				 [Child#child{pid = Pid}|Children]}};
+		    {{ok, Pid}, save_child(Child#child{pid = Pid}, State)};
 		{ok, Pid, Extra} ->
-		    Children = State#state.children,
-		    {{ok, Pid, Extra},
-		     State#state{children = 
-				 [Child#child{pid = Pid}|Children]}};
+		    {{ok, Pid, Extra}, save_child(Child#child{pid = Pid}, State)};
 		{error, What} ->
 		    {{error, {What, Child}}, State}
 	    end;
-	{value, OldChild} when OldChild#child.pid =/= undefined ->
+	{value, OldChild} when is_pid(OldChild#child.pid) ->
 	    {{error, {already_started, OldChild#child.pid}}, State};
 	{value, _OldChild} ->
 	    {{error, already_present}, State}
@@ -648,105 +813,145 @@ handle_start_child(Child, State) ->
 
 %%% ---------------------------------------------------
 %%% Restart. A process has terminated.
-%%% Returns: {ok, #state} | {shutdown, #state}
+%%% Returns: {ok, state()} | {shutdown, state()}
 %%% ---------------------------------------------------
 
-restart_child(Pid, Reason, State) when ?is_simple(State) ->
-    case ?DICT:find(Pid, State#state.dynamics) of
+restart_child(Pid, Reason, #state{children = [Child]} = State) when ?is_simple(State) ->
+    RestartType = Child#child.restart_type,
+    case dynamic_child_args(Pid, dynamics_db(RestartType, State#state.dynamics)) of
 	{ok, Args} ->
-	    [Child] = State#state.children,
-	    RestartType = Child#child.restart_type,
-	    {M, F, _} = Child#child.mfa,
-	    NChild = Child#child{pid = Pid, mfa = {M, F, Args}},
+	    {M, F, _} = Child#child.mfargs,
+	    NChild = Child#child{pid = Pid, mfargs = {M, F, Args}},
 	    do_restart(RestartType, Reason, NChild, State);
 	error ->
-	    {ok, State}
+            {ok, State}
     end;
+
 restart_child(Pid, Reason, State) ->
     Children = State#state.children,
+    %% we still support >= R12-B3 in which lists:keyfind/3 doesn't exist
     case lists:keysearch(Pid, #child.pid, Children) of
-	{value, Child} ->
-	    RestartType = Child#child.restart_type,
+	{value, #child{restart_type = RestartType} = Child} ->
 	    do_restart(RestartType, Reason, Child, State);
-	_ ->
+	false ->
 	    {ok, State}
     end.
 
-do_restart({permanent = RestartType, Delay}, Reason, Child, State) ->
-    do_restart_delay({RestartType, Delay}, Reason, Child, State);
-do_restart(permanent, Reason, Child, State) ->
-    report_error(child_terminated, Reason, Child, State#state.name),
-    restart(Child, State);
-do_restart(Type, normal, Child, State) ->
-    del_child_and_maybe_shutdown(Type, Child, State);
-do_restart({RestartType, Delay}, {shutdown, restart} = Reason, Child, State)
-  when RestartType =:= transient orelse RestartType =:= intrinsic ->
-    do_restart_delay({RestartType, Delay}, Reason, Child, State);
-do_restart(Type, {shutdown, _}, Child, State) ->
-    del_child_and_maybe_shutdown(Type, Child, State);
-do_restart(Type, shutdown, Child = #child{child_type = supervisor}, State) ->
-    del_child_and_maybe_shutdown(Type, Child, State);
-do_restart({RestartType, Delay}, Reason, Child, State)
-  when RestartType =:= transient orelse RestartType =:= intrinsic ->
-    do_restart_delay({RestartType, Delay}, Reason, Child, State);
-do_restart(Type, Reason, Child, State) when Type =:= transient orelse
-                                            Type =:= intrinsic ->
-    report_error(child_terminated, Reason, Child, State#state.name),
+try_restart(RestartType, Reason, Child, State) ->
+    case handle_restart(RestartType, Reason, Child, State) of
+        {ok, NState}       -> {noreply, NState};
+        {shutdown, State2} -> {stop, shutdown, State2}
+    end.
+
+do_restart(RestartType, Reason, Child, State) ->
+    maybe_report_error(RestartType, Reason, Child, State),
+    handle_restart(RestartType, Reason, Child, State).
+
+maybe_report_error(permanent, Reason, Child, State) ->
+    report_child_termination(Reason, Child, State);
+maybe_report_error({permanent, _}, Reason, Child, State) ->
+    report_child_termination(Reason, Child, State);
+maybe_report_error(_Type, Reason, Child, State) ->
+    case is_abnormal_termination(Reason) of
+        true  -> report_child_termination(Reason, Child, State);
+        false -> ok
+    end.
+
+report_child_termination(Reason, Child, State) ->
+    report_error(child_terminated, Reason, Child, State#state.name).
+
+handle_restart(permanent, _Reason, Child, State) ->
     restart(Child, State);
-do_restart(temporary, Reason, Child, State) ->
-    report_error(child_terminated, Reason, Child, State#state.name),
-    NState = state_del_child(Child, State),
-    {ok, NState}.
+handle_restart(transient, Reason, Child, State) ->
+    restart_if_explicit_or_abnormal(fun restart/2,
+                                    fun delete_child_and_continue/2,
+                                    Reason, Child, State);
+handle_restart(intrinsic, Reason, Child, State) ->
+    restart_if_explicit_or_abnormal(fun restart/2,
+                                    fun delete_child_and_stop/2,
+                                    Reason, Child, State);
+handle_restart(temporary, _Reason, Child, State) ->
+    delete_child_and_continue(Child, State);
+handle_restart({permanent, _Delay}=Restart, Reason, Child, State) ->
+    do_restart_delay(Restart, Reason, Child, State);
+handle_restart({transient, _Delay}=Restart, Reason, Child, State) ->
+    restart_if_explicit_or_abnormal(defer_to_restart_delay(Restart, Reason),
+                                    fun delete_child_and_continue/2,
+                                    Reason, Child, State);
+handle_restart({intrinsic, _Delay}=Restart, Reason, Child, State) ->
+    restart_if_explicit_or_abnormal(defer_to_restart_delay(Restart, Reason),
+                                    fun delete_child_and_stop/2,
+                                    Reason, Child, State).
+
+restart_if_explicit_or_abnormal(RestartHow, Otherwise, Reason, Child, State) ->
+    case ?is_explicit_restart(Reason) orelse is_abnormal_termination(Reason) of
+        true  -> RestartHow(Child, State);
+        false -> Otherwise(Child, State)
+    end.
+
+defer_to_restart_delay(Restart, Reason) ->
+    fun(Child, State) -> do_restart_delay(Restart, Reason, Child, State) end.
+
+delete_child_and_continue(Child, State) ->
+    {ok, state_del_child(Child, State)}.
+
+delete_child_and_stop(Child, State) ->
+    {shutdown, state_del_child(Child, State)}.
+
+is_abnormal_termination(normal)        -> false;
+is_abnormal_termination(shutdown)      -> false;
+is_abnormal_termination({shutdown, _}) -> false;
+is_abnormal_termination(_Other)        -> true.
 
 do_restart_delay({RestartType, Delay}, Reason, Child, State) ->
-    case restart1(Child, State) of
+    case add_restart(State) of
         {ok, NState} ->
-            {ok, NState};
-        {terminate, NState} ->
+            maybe_restart(NState#state.strategy, Child, NState);
+        {terminate, _NState} ->
+            %% we've reached the max restart intensity, but the
+            %% add_restart will have added to the restarts
+            %% field. Given we don't want to die here, we need to go
+            %% back to the old restarts field otherwise we'll never
+            %% attempt to restart later, which is why we ignore
+            %% NState for this clause.
             _TRef = erlang:send_after(trunc(Delay*1000), self(),
                                       {delayed_restart,
                                        {{RestartType, Delay}, Reason, Child}}),
-            {ok, state_del_child(Child, NState)}
+            {ok, state_del_child(Child, State)}
     end.
 
-del_child_and_maybe_shutdown(intrinsic, Child, State) ->
-    {shutdown, state_del_child(Child, State)};
-del_child_and_maybe_shutdown({intrinsic, _Delay}, Child, State) ->
-    {shutdown, state_del_child(Child, State)};
-del_child_and_maybe_shutdown(_, Child, State) ->
-    {ok, state_del_child(Child, State)}.
-
 restart(Child, State) ->
     case add_restart(State) of
 	{ok, NState} ->
-	    restart(NState#state.strategy, Child, NState, fun restart/2);
+	    maybe_restart(NState#state.strategy, Child, NState);
 	{terminate, NState} ->
 	    report_error(shutdown, reached_max_restart_intensity,
 			 Child, State#state.name),
-	    {shutdown, state_del_child(Child, NState)}
+	    {shutdown, remove_child(Child, NState)}
     end.
 
-restart1(Child, State) ->
-    case add_restart(State) of
-	{ok, NState} ->
-	    restart(NState#state.strategy, Child, NState, fun restart1/2);
-	{terminate, _NState} ->
-            %% we've reached the max restart intensity, but the
-            %% add_restart will have added to the restarts
-            %% field. Given we don't want to die here, we need to go
-            %% back to the old restarts field otherwise we'll never
-            %% attempt to restart later.
-            {terminate, State}
+maybe_restart(Strategy, Child, State) ->
+    case restart(Strategy, Child, State) of
+        {try_again, Reason, NState2} ->
+            %% Hand control back to gen_server before trying
+            %% again, so that other incoming requests for the
+            %% supervisor can be handled - e.g. a shutdown
+            %% request for the supervisor or for the child
+            %% being restarted.
+            Id = if ?is_simple(State) -> Child#child.pid;
+                    true -> Child#child.name
+                 end,
+            timer:apply_after(0,?MODULE,try_again_restart,[self(),Id,Reason]),
+            {ok,NState2};
+        Other ->
+            Other
+    end.
 
-restart(Strategy, Child, State, Restart)
-  when Strategy =:= simple_one_for_one orelse
-       Strategy =:= simple_one_for_one_terminate ->
-    #child{mfa = {M, F, A}} = Child,
-    Dynamics = ?DICT:erase(Child#child.pid, State#state.dynamics),
+restart(simple_one_for_one, Child, State) ->
+    #child{pid = OldPid, mfargs = {M, F, A}} = Child,
+    Dynamics = ?DICT:erase(OldPid, dynamics_db(Child#child.restart_type,
+					       State#state.dynamics)),
     case do_start_child_i(M, F, A) of
-        {ok, undefined} ->
-            {ok, State};
 	{ok, Pid} ->
 	    NState = State#state{dynamics = ?DICT:store(Pid, A, Dynamics)},
 	    {ok, NState};
@@ -754,10 +959,13 @@ restart(Strategy, Child, State, Restart)
 	    NState = State#state{dynamics = ?DICT:store(Pid, A, Dynamics)},
 	    {ok, NState};
 	{error, Error} ->
+	    NState = State#state{dynamics = ?DICT:store(restarting(OldPid), A,
+							Dynamics)},
 	    report_error(start_error, Error, Child, State#state.name),
-	    Restart(Child, State)
+	    {try_again, Error, NState}
     end;
-restart(one_for_one, Child, State, Restart) ->
+restart(one_for_one, Child, State) ->
+    OldPid = Child#child.pid,
     case do_start_child(State#state.name, Child) of
 	{ok, Pid} ->
 	    NState = replace_child(Child#child{pid = Pid}, State),
@@ -766,123 +974,71 @@ restart(one_for_one, Child, State, Restart) ->
 	    NState = replace_child(Child#child{pid = Pid}, State),
 	    {ok, NState};
 	{error, Reason} ->
+	    NState = replace_child(Child#child{pid = restarting(OldPid)}, State),
 	    report_error(start_error, Reason, Child, State#state.name),
-	    Restart(Child, State)
+	    {try_again, Reason, NState}
     end;
-restart(rest_for_one, Child, State, Restart) ->
+restart(rest_for_one, Child, State) ->
     {ChAfter, ChBefore} = split_child(Child#child.pid, State#state.children),
     ChAfter2 = terminate_children(ChAfter, State#state.name),
     case start_children(ChAfter2, State#state.name) of
 	{ok, ChAfter3} ->
 	    {ok, State#state{children = ChAfter3 ++ ChBefore}};
-	{error, ChAfter3} ->
-	    Restart(Child, State#state{children = ChAfter3 ++ ChBefore})
+	{error, ChAfter3, Reason} ->
+	    NChild = Child#child{pid=restarting(Child#child.pid)},
+	    NState = State#state{children = ChAfter3 ++ ChBefore},
+	    {try_again, Reason, replace_child(NChild,NState)}
     end;
-restart(one_for_all, Child, State, Restart) ->
+restart(one_for_all, Child, State) ->
     Children1 = del_child(Child#child.pid, State#state.children),
     Children2 = terminate_children(Children1, State#state.name),
     case start_children(Children2, State#state.name) of
 	{ok, NChs} ->
 	    {ok, State#state{children = NChs}};
-	{error, NChs} ->
-	    Restart(Child, State#state{children = NChs})
+	{error, NChs, Reason} ->
+	    NChild = Child#child{pid=restarting(Child#child.pid)},
+	    NState = State#state{children = NChs},
+	    {try_again, Reason, replace_child(NChild,NState)}
     end.
 
+restarting(Pid) when is_pid(Pid) -> ?restarting(Pid);
+restarting(RPid) -> RPid.
+
 %%-----------------------------------------------------------------
 %% Func: terminate_children/2
-%% Args: Children = [#child] in termination order
+%% Args: Children = [child_rec()] in termination order
 %%       SupName = {local, atom()} | {global, atom()} | {pid(),Mod}
-%% Returns: NChildren = [#child] in
+%% Returns: NChildren = [child_rec()] in
 %%          startup order (reversed termination order)
 %%-----------------------------------------------------------------
 terminate_children(Children, SupName) ->
     terminate_children(Children, SupName, []).
 
+%% Temporary children should not be restarted and thus should
+%% be skipped when building the list of terminated children, although
+%% we do want them to be shut down as many functions from this module
+%% use this function to just clear everything.
+terminate_children([Child = #child{restart_type=temporary} | Children], SupName, Res) ->
+    do_terminate(Child, SupName),
+    terminate_children(Children, SupName, Res);
 terminate_children([Child | Children], SupName, Res) ->
     NChild = do_terminate(Child, SupName),
     terminate_children(Children, SupName, [NChild | Res]);
 terminate_children([], _SupName, Res) ->
     Res.
 
-terminate_simple_children(Child, Dynamics, SupName) ->
-    Pids = dict:fold(fun (Pid, _Args, Pids) ->
-                         erlang:monitor(process, Pid),
-                         unlink(Pid),
-                         exit(Pid, child_exit_reason(Child)),
-                         [Pid | Pids]
-                     end, [], Dynamics),
-    TimeoutMsg = {timeout, make_ref()},
-    TRef = timeout_start(Child, TimeoutMsg),
-    {Replies, Timedout} =
-        lists:foldl(
-          fun (_Pid, {Replies, Timedout}) ->
-                  {Reply, Timedout1} =
-                      receive
-                          TimeoutMsg ->
-                              Remaining = Pids -- [P || {P, _} <- Replies],
-                              [exit(P, kill) || P <- Remaining],
-                              receive {'DOWN', _MRef, process, Pid, Reason} ->
-                                      {{error, Reason}, true}
-                              end;
-                          {'DOWN', _MRef, process, Pid, Reason} ->
-                              {child_res(Child, Reason, Timedout), Timedout};
-                          {'EXIT', Pid, Reason} ->
-                              receive {'DOWN', _MRef, process, Pid, _} ->
-                                      {{error, Reason}, Timedout}
-                              end
-                      end,
-                  {[{Pid, Reply} | Replies], Timedout1}
-          end, {[], false}, Pids),
-    timeout_stop(Child, TRef, TimeoutMsg, Timedout),
-    ReportError = shutdown_error_reporter(SupName),
-    [case Reply of
-         {_Pid, ok}         -> ok;
-         {Pid,  {error, R}} -> ReportError(R, Child#child{pid = Pid})
-     end || Reply <- Replies],
-    ok.
-
-child_exit_reason(#child{shutdown = brutal_kill}) -> kill;
-child_exit_reason(#child{})                       -> shutdown.
-
-child_res(#child{shutdown=brutal_kill},   killed,    false) -> ok;
-child_res(#child{},                       shutdown,  false) -> ok;
-child_res(#child{restart_type=permanent}, normal,    false) -> {error, normal};
-child_res(#child{restart_type={permanent,_}},normal, false) -> {error, normal};
-child_res(#child{},                       normal,    false) -> ok;
-child_res(#child{},                       R,         _)     -> {error, R}.
-
-timeout_start(#child{shutdown = Time}, Msg) when is_integer(Time) ->
-    erlang:send_after(Time, self(), Msg);
-timeout_start(#child{}, _Msg) ->
-    ok.
-
-timeout_stop(#child{shutdown = Time}, TRef, Msg, false) when is_integer(Time) ->
-    erlang:cancel_timer(TRef),
-    receive
-        Msg -> ok
-    after
-        0 -> ok
-    end;
-timeout_stop(#child{}, ok, _Msg, _Timedout) ->
-    ok.
-
-do_terminate(Child, SupName) when Child#child.pid =/= undefined ->
-    ReportError = shutdown_error_reporter(SupName),
+do_terminate(Child, SupName) when is_pid(Child#child.pid) ->
     case shutdown(Child#child.pid, Child#child.shutdown) of
         ok ->
             ok;
-        {error, normal} ->
-            case Child#child.restart_type of
-                permanent           -> ReportError(normal, Child);
-                {permanent, _Delay} -> ReportError(normal, Child);
-                _                   -> ok
-            end;
+        {error, normal} when not ?is_permanent(Child#child.restart_type) ->
+            ok;
         {error, OtherReason} ->
-            ReportError(OtherReason, Child)
+            report_error(shutdown_error, OtherReason, Child, SupName)
     end,
     Child#child{pid = undefined};
 do_terminate(Child, _SupName) ->
-    Child.
+    Child#child{pid = undefined}.
 
 %%-----------------------------------------------------------------
 %% Shutdowns a child. We must check the EXIT value 
@@ -895,7 +1051,6 @@ do_terminate(Child, _SupName) ->
 %% Returns: ok | {error, OtherReason}  (this should be reported)
 %%-----------------------------------------------------------------
 shutdown(Pid, brutal_kill) ->
-  
     case monitor_child(Pid) of
 	ok ->
 	    exit(Pid, kill),
@@ -908,9 +1063,7 @@ shutdown(Pid, brutal_kill) ->
 	{error, Reason} ->      
 	    {error, Reason}
     end;
-
 shutdown(Pid, Time) ->
-    
     case monitor_child(Pid) of
 	ok ->
 	    exit(Pid, shutdown), %% Try to shutdown gracefully
@@ -957,20 +1110,163 @@ monitor_child(Pid) ->
 	    %% that will be handled in shutdown/2. 
 	    ok   
     end.
-    
-   
+
+
+%%-----------------------------------------------------------------
+%% Func: terminate_dynamic_children/3
+%% Args: Child    = child_rec()
+%%       Dynamics = ?DICT() | ?SET()
+%%       SupName  = {local, atom()} | {global, atom()} | {pid(),Mod}
+%% Returns: ok
+%%
+%%
+%% Shut down all dynamic children. This happens when the supervisor is
+%% stopped. Because the supervisor can have millions of dynamic children,
+%% this can incur a significant overhead.
+%%-----------------------------------------------------------------
+terminate_dynamic_children(Child, Dynamics, SupName) ->
+    {Pids, EStack0} = monitor_dynamic_children(Child, Dynamics),
+    Sz = ?SETS:size(Pids),
+    EStack = case Child#child.shutdown of
+                 brutal_kill ->
+                     ?SETS:fold(fun(P, _) -> exit(P, kill) end, ok, Pids),
+                     wait_dynamic_children(Child, Pids, Sz, undefined, EStack0);
+                 infinity ->
+                     ?SETS:fold(fun(P, _) -> exit(P, shutdown) end, ok, Pids),
+                     wait_dynamic_children(Child, Pids, Sz, undefined, EStack0);
+                 Time ->
+                     ?SETS:fold(fun(P, _) -> exit(P, shutdown) end, ok, Pids),
+                     TRef = erlang:start_timer(Time, self(), kill),
+                     wait_dynamic_children(Child, Pids, Sz, TRef, EStack0)
+             end,
+    %% Unroll stacked errors and report them
+    ?DICT:fold(fun(Reason, Ls, _) ->
+                       report_error(shutdown_error, Reason,
+                                    Child#child{pid=Ls}, SupName)
+               end, ok, EStack).
+
+
+monitor_dynamic_children(#child{restart_type=temporary}, Dynamics) ->
+    ?SETS:fold(fun(P, {Pids, EStack}) ->
+                       case monitor_child(P) of
+                           ok ->
+                               {?SETS:add_element(P, Pids), EStack};
+                           {error, normal} ->
+                               {Pids, EStack};
+                           {error, Reason} ->
+                               {Pids, ?DICT:append(Reason, P, EStack)}
+                       end
+               end, {?SETS:new(), ?DICT:new()}, Dynamics);
+monitor_dynamic_children(#child{restart_type=RType}, Dynamics) ->
+    ?DICT:fold(fun(P, _, {Pids, EStack}) when is_pid(P) ->
+                       case monitor_child(P) of
+                           ok ->
+                               {?SETS:add_element(P, Pids), EStack};
+                           {error, normal} when not ?is_permanent(RType) ->
+                               {Pids, EStack};
+                           {error, Reason} ->
+                               {Pids, ?DICT:append(Reason, P, EStack)}
+                       end;
+		  (?restarting(_), _, {Pids, EStack}) ->
+		       {Pids, EStack}
+               end, {?SETS:new(), ?DICT:new()}, Dynamics).
+
+%% Wait for Sz monitored children to exit; return the stacked exit errors.
+wait_dynamic_children(_Child, _Pids, 0, undefined, EStack) -> EStack;
+wait_dynamic_children(_Child, _Pids, 0, TRef, EStack) ->
+    %% If the timer has expired before its cancellation, we must empty the
+    %% mailbox of the 'timeout' message.
+    erlang:cancel_timer(TRef),
+    receive
+        {timeout, TRef, kill} ->
+            EStack
+    after 0 ->
+            EStack
+    end;
+wait_dynamic_children(#child{shutdown=brutal_kill} = Child, Pids, Sz,
+                      TRef, EStack) ->
+    receive
+        {'DOWN', _MRef, process, Pid, killed} ->
+            wait_dynamic_children(Child, ?SETS:del_element(Pid, Pids), Sz-1,
+                                  TRef, EStack);
+        %% Any exit reason other than 'killed' is stacked for reporting.
+        {'DOWN', _MRef, process, Pid, Reason} ->
+            wait_dynamic_children(Child, ?SETS:del_element(Pid, Pids), Sz-1,
+                                  TRef, ?DICT:append(Reason, Pid, EStack))
+    end;
+wait_dynamic_children(#child{restart_type=RType} = Child, Pids, Sz,
+                      TRef, EStack) ->
+    receive
+        {'DOWN', _MRef, process, Pid, shutdown} ->
+            wait_dynamic_children(Child, ?SETS:del_element(Pid, Pids), Sz-1,
+                                  TRef, EStack);
+        %% A 'normal' exit is only stacked as an error for permanent children.
+        {'DOWN', _MRef, process, Pid, normal} when not ?is_permanent(RType) ->
+            wait_dynamic_children(Child, ?SETS:del_element(Pid, Pids), Sz-1,
+                                  TRef, EStack);
+        %% Every other exit reason is stacked for reporting.
+        {'DOWN', _MRef, process, Pid, Reason} ->
+            wait_dynamic_children(Child, ?SETS:del_element(Pid, Pids), Sz-1,
+                                  TRef, ?DICT:append(Reason, Pid, EStack));
+        %% On timeout kill the survivors; Sz is unchanged, none has exited yet.
+        {timeout, TRef, kill} ->
+            ?SETS:fold(fun(P, _) -> exit(P, kill) end, ok, Pids),
+            wait_dynamic_children(Child, Pids, Sz, undefined, EStack)
+    end.
+
 %%-----------------------------------------------------------------
 %% Child/State manipulating functions.
 %%-----------------------------------------------------------------
-state_del_child(#child{pid = Pid}, State) when ?is_simple(State) ->
-    NDynamics = ?DICT:erase(Pid, State#state.dynamics),
+
+%% Note we do not want to save the parameter list for temporary processes as
+%% they will not be restarted, and hence we do not need this information.
+%% Especially for dynamic children to simple_one_for_one supervisors
+%% it could become very costly as it is not uncommon to spawn
+%% very many such processes.
+save_child(#child{restart_type = temporary,
+		  mfargs = {M, F, _}} = Child, #state{children = Children} = State) ->
+    State#state{children = [Child#child{mfargs = {M, F, undefined}} |Children]};
+save_child(Child, #state{children = Children} = State) ->
+    State#state{children = [Child |Children]}.
+
+save_dynamic_child(temporary, Pid, _, #state{dynamics = Dynamics} = State) ->
+    State#state{dynamics = ?SETS:add_element(Pid, dynamics_db(temporary, Dynamics))};
+save_dynamic_child(RestartType, Pid, Args, #state{dynamics = Dynamics} = State) ->
+    State#state{dynamics = ?DICT:store(Pid, Args, dynamics_db(RestartType, Dynamics))}.
+
+dynamics_db(temporary, undefined) ->
+    ?SETS:new();
+dynamics_db(_, undefined) ->
+    ?DICT:new();
+dynamics_db(_,Dynamics) ->
+    Dynamics.
+
+dynamic_child_args(Pid, Dynamics) ->
+    case ?SETS:is_set(Dynamics) of
+        true ->
+            {ok, undefined};
+        false ->
+            ?DICT:find(Pid, Dynamics)
+    end.
+
+state_del_child(#child{pid = Pid, restart_type = temporary}, State) when ?is_simple(State) ->
+    NDynamics = ?SETS:del_element(Pid, dynamics_db(temporary, State#state.dynamics)),
+    State#state{dynamics = NDynamics};
+state_del_child(#child{pid = Pid, restart_type = RType}, State) when ?is_simple(State) ->
+    NDynamics = ?DICT:erase(Pid, dynamics_db(RType, State#state.dynamics)),
     State#state{dynamics = NDynamics};
 state_del_child(Child, State) ->
     NChildren = del_child(Child#child.name, State#state.children),
     State#state{children = NChildren}.
 
+del_child(Name, [Ch=#child{pid = ?restarting(_)}|_]=Chs) when Ch#child.name =:= Name ->
+    Chs;
+del_child(Name, [Ch|Chs]) when Ch#child.name =:= Name, Ch#child.restart_type =:= temporary ->
+    Chs;
 del_child(Name, [Ch|Chs]) when Ch#child.name =:= Name ->
     [Ch#child{pid = undefined} | Chs];
+del_child(Pid, [Ch|Chs]) when Ch#child.pid =:= Pid, Ch#child.restart_type =:= temporary ->
+    Chs;
 del_child(Pid, [Ch|Chs]) when Ch#child.pid =:= Pid ->
     [Ch#child{pid = undefined} | Chs];
 del_child(Name, [Ch|Chs]) ->
@@ -993,7 +1289,38 @@ split_child(_, [], After) ->
     {lists:reverse(After), []}.
 
 get_child(Name, State) ->
+    get_child(Name, State, false).
+get_child(Pid, State, AllowPid) when AllowPid, is_pid(Pid) ->
+    get_dynamic_child(Pid, State);
+get_child(Name, State, _) ->
     lists:keysearch(Name, #child.name, State#state.children).
+
+get_dynamic_child(Pid, #state{children=[Child], dynamics=Dynamics}) ->
+    DynamicsDb = dynamics_db(Child#child.restart_type, Dynamics),
+    case is_dynamic_pid(Pid, DynamicsDb) of
+	true ->
+	    {value, Child#child{pid=Pid}};
+	false ->
+	    RPid = restarting(Pid),
+	    case is_dynamic_pid(RPid, DynamicsDb) of
+		true ->
+		    {value, Child#child{pid=RPid}};
+		false ->
+		    case erlang:is_process_alive(Pid) of
+			true -> false;
+			false -> {value, Child}
+		    end
+	    end
+    end.
+
+is_dynamic_pid(Pid, Dynamics) ->
+    case ?SETS:is_set(Dynamics) of
+	true ->
+	    ?SETS:is_element(Pid, Dynamics);
+	false ->
+	    ?DICT:is_key(Pid, Dynamics)
+    end.
+
 replace_child(Child, State) ->
     Chs = do_replace_child(Child, State#state.children),
     State#state{children = Chs}.
@@ -1012,13 +1339,13 @@ remove_child(Child, State) ->
 %% Args: SupName = {local, atom()} | {global, atom()} | self
 %%       Type = {Strategy, MaxIntensity, Period}
 %%         Strategy = one_for_one | one_for_all | simple_one_for_one |
-%%                    rest_for_one 
-%%         MaxIntensity = integer()
-%%         Period = integer()
+%%                    rest_for_one
+%%         MaxIntensity = integer() >= 0
+%%         Period = integer() > 0
 %%       Mod :== atom()
-%%       Arsg :== term()
+%%       Args :== term()
 %% Purpose: Check that Type is of correct type (!)
-%% Returns: {ok, #state} | Error
+%% Returns: {ok, state()} | Error
 %%-----------------------------------------------------------------
 init_state(SupName, Type, Mod, Args) ->
     case catch init_state1(SupName, Type, Mod, Args) of
@@ -1033,46 +1360,45 @@ init_state1(SupName, {Strategy, MaxIntensity, Period}, Mod, Args) ->
     validIntensity(MaxIntensity),
     validPeriod(Period),
     {ok, #state{name = supname(SupName,Mod),
-	       strategy = Strategy,
-	       intensity = MaxIntensity,
-	       period = Period,
-	       module = Mod,
-	       args = Args}};
+		strategy = Strategy,
+		intensity = MaxIntensity,
+		period = Period,
+		module = Mod,
+		args = Args}};
 init_state1(_SupName, Type, _, _) ->
     {invalid_type, Type}.
 
-validStrategy(simple_one_for_one_terminate) -> true;
-validStrategy(simple_one_for_one)           -> true;
-validStrategy(one_for_one)                  -> true;
-validStrategy(one_for_all)                  -> true;
-validStrategy(rest_for_one)                 -> true;
-validStrategy(What)                         -> throw({invalid_strategy, What}).
+validStrategy(simple_one_for_one) -> true;
+validStrategy(one_for_one)        -> true;
+validStrategy(one_for_all)        -> true;
+validStrategy(rest_for_one)       -> true;
+validStrategy(What)               -> throw({invalid_strategy, What}).
 
 validIntensity(Max) when is_integer(Max),
                          Max >=  0 -> true;
-validIntensity(What)              -> throw({invalid_intensity, What}).
+validIntensity(What)               -> throw({invalid_intensity, What}).
 
 validPeriod(Period) when is_integer(Period),
                          Period > 0 -> true;
 validPeriod(What)                   -> throw({invalid_period, What}).
 
-supname(self,Mod) -> {self(),Mod};
-supname(N,_)      -> N.
+supname(self, Mod) -> {self(), Mod};
+supname(N, _)      -> N.
 
 %%% ------------------------------------------------------
 %%% Check that the children start specification is valid.
 %%% Shall be a six (6) tuple
 %%%    {Name, Func, RestartType, Shutdown, ChildType, Modules}
 %%% where Name is an atom
-%%%       Func is {Mod, Fun, Args} == {atom, atom, list}
+%%%       Func is {Mod, Fun, Args} == {atom(), atom(), list()}
 %%%       RestartType is permanent | temporary | transient |
 %%%                      intrinsic | {permanent, Delay} |
 %%%                      {transient, Delay} | {intrinsic, Delay}
 %%                       where Delay >= 0
-%%%       Shutdown = integer() | infinity | brutal_kill
+%%%       Shutdown = integer() > 0 | infinity | brutal_kill
 %%%       ChildType = supervisor | worker
 %%%       Modules = [atom()] | dynamic
-%%% Returns: {ok, [#child]} | Error
+%%% Returns: {ok, [child_rec()]} | Error
 %%% ------------------------------------------------------
 
 check_startspec(Children) -> check_startspec(Children, []).
@@ -1100,14 +1426,14 @@ check_childspec(Name, Func, RestartType, Shutdown, ChildType, Mods) ->
     validChildType(ChildType),
     validShutdown(Shutdown, ChildType),
     validMods(Mods),
-    {ok, #child{name = Name, mfa = Func, restart_type = RestartType,
+    {ok, #child{name = Name, mfargs = Func, restart_type = RestartType,
 		shutdown = Shutdown, child_type = ChildType, modules = Mods}}.
 
 validChildType(supervisor) -> true;
 validChildType(worker) -> true;
 validChildType(What) -> throw({invalid_child_type, What}).
 
-validName(_Name) -> true. 
+validName(_Name) -> true.
 
 validFunc({M, F, A}) when is_atom(M), 
                           is_atom(F), 
@@ -1130,13 +1456,13 @@ validDelay(What)                  -> throw({invalid_delay, What}).
 
 validShutdown(Shutdown, _) 
   when is_integer(Shutdown), Shutdown > 0 -> true;
-validShutdown(infinity, supervisor)    -> true;
+validShutdown(infinity, _)             -> true;
 validShutdown(brutal_kill, _)          -> true;
 validShutdown(Shutdown, _)             -> throw({invalid_shutdown, Shutdown}).
 
 validMods(dynamic) -> true;
 validMods(Mods) when is_list(Mods) ->
-    lists:foreach(fun (Mod) ->
+    lists:foreach(fun(Mod) ->
 		    if
 			is_atom(Mod) -> ok;
 			true -> throw({invalid_module, Mod})
@@ -1210,15 +1536,18 @@ report_error(Error, Reason, Child, SupName) ->
 		{offender, extract_child(Child)}],
     error_logger:error_report(supervisor_report, ErrorMsg).
 
-shutdown_error_reporter(SupName) ->
-    fun(Reason, Child) ->
-        report_error(shutdown_error, Reason, Child, SupName)
-    end.
 
+extract_child(Child) when is_list(Child#child.pid) ->
+    [{nb_children, length(Child#child.pid)},
+     {name, Child#child.name},
+     {mfargs, Child#child.mfargs},
+     {restart_type, Child#child.restart_type},
+     {shutdown, Child#child.shutdown},
+     {child_type, Child#child.child_type}];
 extract_child(Child) ->
     [{pid, Child#child.pid},
      {name, Child#child.name},
-     {mfa, Child#child.mfa},
+     {mfargs, Child#child.mfargs},
      {restart_type, Child#child.restart_type},
      {shutdown, Child#child.shutdown},
      {child_type, Child#child.child_type}].
diff --git a/src/supervisor2_tests.erl b/src/supervisor2_tests.erl
new file mode 100644
index 00000000..5a47e309
--- /dev/null
+++ b/src/supervisor2_tests.erl
@@ -0,0 +1,70 @@
+%% The contents of this file are subject to the Mozilla Public License
+%% Version 1.1 (the "License"); you may not use this file except in
+%% compliance with the License. You may obtain a copy of the License at
+%% http://www.mozilla.org/MPL/
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+%% License for the specific language governing rights and limitations
+%% under the License.
+%%
+%% The Original Code is RabbitMQ.
+%%
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2011-2013 GoPivotal, Inc.  All rights reserved.
+%%
+
+-module(supervisor2_tests).
+-behaviour(supervisor2).
+
+-export([test_all/0, start_link/0]).
+-export([init/1]).
+
+test_all() ->
+    ok = check_shutdown(stop,    200, 200, 2000),
+    ok = check_shutdown(ignored,   1,   2, 2000).
+
+check_shutdown(SigStop, Iterations, ChildCount, SupTimeout) ->
+    {ok, Sup} = supervisor2:start_link(?MODULE, [SupTimeout]),
+    Res = lists:foldl(
+            fun (I, ok) ->
+                    TestSupPid = erlang:whereis(?MODULE),
+                    ChildPids =
+                        [begin
+                             {ok, ChildPid} =
+                                 supervisor2:start_child(TestSupPid, []),
+                             ChildPid
+                         end || _ <- lists:seq(1, ChildCount)],
+                    MRef = erlang:monitor(process, TestSupPid),
+                    [P ! SigStop || P <- ChildPids],
+                    ok = supervisor2:terminate_child(Sup, test_sup),
+                    {ok, _} = supervisor2:restart_child(Sup, test_sup),
+                    receive
+                        {'DOWN', MRef, process, TestSupPid, shutdown} ->
+                            ok;
+                        {'DOWN', MRef, process, TestSupPid, Reason} ->
+                            {error, {I, Reason}}
+                    end;
+                (_, R) ->
+                    R
+            end, ok, lists:seq(1, Iterations)),
+    unlink(Sup),
+    exit(Sup, shutdown),
+    Res.
+
+start_link() ->
+    Pid = spawn_link(fun () ->
+                             process_flag(trap_exit, true),
+                             receive stop -> ok end
+                     end),
+    {ok, Pid}.
+
+init([Timeout]) ->
+    {ok, {{one_for_one, 0, 1},
+          [{test_sup, {supervisor2, start_link,
+                       [{local, ?MODULE}, ?MODULE, []]},
+            transient, Timeout, supervisor, [?MODULE]}]}};
+init([]) ->
+    {ok, {{simple_one_for_one, 0, 1},
+          [{test_worker, {?MODULE, start_link, []},
+            temporary, 1000, worker, [?MODULE]}]}}.
diff --git a/src/tcp_acceptor.erl b/src/tcp_acceptor.erl
index 344196d7..267ce4f1 100644
--- a/src/tcp_acceptor.erl
+++ b/src/tcp_acceptor.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(tcp_acceptor).
@@ -55,21 +55,30 @@ handle_info({inet_async, LSock, Ref, {ok, Sock}},
     inet_db:register_socket(Sock, Mod),
 
     %% handle
-    file_handle_cache:transfer(apply(M, F, A ++ [Sock])),
-    ok = file_handle_cache:obtain(),
+    case tune_buffer_size(Sock) of
+        ok                -> file_handle_cache:transfer(
+                               apply(M, F, A ++ [Sock])),
+                             ok = file_handle_cache:obtain();
+        {error, enotconn} -> catch port_close(Sock);
+        {error, Err}      -> {ok, {IPAddress, Port}} = inet:sockname(LSock),
+                             error_logger:error_msg(
+                               "failed to tune buffer size of "
+                               "connection accepted on ~s:~p - ~p (~s)~n",
+                               [rabbit_misc:ntoab(IPAddress), Port,
+                                Err, rabbit_misc:format_inet_error(Err)]),
+                             catch port_close(Sock)
+    end,
 
     %% accept more
     accept(State);
 
-handle_info({inet_async, LSock, Ref, {error, closed}},
-            State=#state{sock=LSock, ref=Ref}) ->
-    %% It would be wrong to attempt to restart the acceptor when we
-    %% know this will fail.
-    {stop, normal, State};
-
 handle_info({inet_async, LSock, Ref, {error, Reason}},
             State=#state{sock=LSock, ref=Ref}) ->
-    {stop, {accept_failed, Reason}, State};
+    case Reason of
+        closed       -> {stop, normal, State}; %% listening socket closed
+        econnaborted -> accept(State); %% client sent RST before we accepted
+        _            -> {stop, {accept_failed, Reason}, State}
+    end;
 
 handle_info(_Info, State) ->
     {noreply, State}.
@@ -87,3 +96,10 @@ accept(State = #state{sock=LSock}) ->
         {ok, Ref} -> {noreply, State#state{ref=Ref}};
         Error     -> {stop, {cannot_accept, Error}, State}
     end.
+
+tune_buffer_size(Sock) ->
+    case inet:getopts(Sock, [sndbuf, recbuf, buffer]) of
+        {ok, BufSizes} -> BufSz = lists:max([Sz || {_Opt, Sz} <- BufSizes]),
+                          inet:setopts(Sock, [{buffer, BufSz}]);
+        Error          -> Error
+    end.
diff --git a/src/tcp_acceptor_sup.erl b/src/tcp_acceptor_sup.erl
index d8844441..3619875f 100644
--- a/src/tcp_acceptor_sup.erl
+++ b/src/tcp_acceptor_sup.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(tcp_acceptor_sup).
diff --git a/src/tcp_listener.erl b/src/tcp_listener.erl
index fb01c792..4b4a31b5 100644
--- a/src/tcp_listener.erl
+++ b/src/tcp_listener.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(tcp_listener).
diff --git a/src/tcp_listener_sup.erl b/src/tcp_listener_sup.erl
index 9ee921b4..2a65cc17 100644
--- a/src/tcp_listener_sup.erl
+++ b/src/tcp_listener_sup.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(tcp_listener_sup).
diff --git a/src/test_sup.erl b/src/test_sup.erl
index 7f4b5049..da325f1e 100644
--- a/src/test_sup.erl
+++ b/src/test_sup.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(test_sup).
@@ -34,7 +34,7 @@
 %%----------------------------------------------------------------------------
 
 test_supervisor_delayed_restart() ->
-    passed = with_sup(simple_one_for_one_terminate,
+    passed = with_sup(simple_one_for_one,
                       fun (SupPid) ->
                               {ok, _ChildPid} =
                                   supervisor2:start_child(SupPid, []),
diff --git a/src/vm_memory_monitor.erl b/src/vm_memory_monitor.erl
index fb184d1a..a07f6c65 100644
--- a/src/vm_memory_monitor.erl
+++ b/src/vm_memory_monitor.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 %% In practice Erlang shouldn't be allowed to grow to more than a half
@@ -27,7 +27,7 @@
 
 -behaviour(gen_server).
 
--export([start_link/1]).
+-export([start_link/1, start_link/3]).
 
 -export([init/1, handle_call/3, handle_cast/2, handle_info/2,
          terminate/2, code_change/3]).
@@ -49,9 +49,11 @@
 
 -record(state, {total_memory,
                 memory_limit,
+                memory_fraction,
                 timeout,
                 timer,
-                alarmed
+                alarmed,
+                alarm_funs
                }).
 
 %%----------------------------------------------------------------------------
@@ -59,6 +61,8 @@
 -ifdef(use_specs).
 
 -spec(start_link/1 :: (float()) -> rabbit_types:ok_pid_or_error()).
+-spec(start_link/3 :: (float(), fun ((any()) -> 'ok'),
+                       fun ((any()) -> 'ok')) -> rabbit_types:ok_pid_or_error()).
 -spec(get_total_memory/0 :: () -> (non_neg_integer() | 'unknown')).
 -spec(get_vm_limit/0 :: () -> non_neg_integer()).
 -spec(get_check_interval/0 :: () -> non_neg_integer()).
@@ -73,11 +77,9 @@
 %% Public API
 %%----------------------------------------------------------------------------
 
-get_total_memory() ->
-    get_total_memory(os:type()).
+get_total_memory() -> get_total_memory(os:type()).
 
-get_vm_limit() ->
-    get_vm_limit(os:type()).
+get_vm_limit() -> get_vm_limit(os:type()).
 
 get_check_interval() ->
     gen_server:call(?MODULE, get_check_interval, infinity).
@@ -99,24 +101,27 @@ get_memory_limit() ->
 %% gen_server callbacks
 %%----------------------------------------------------------------------------
 
-start_link(Args) ->
-    gen_server:start_link({local, ?SERVER}, ?MODULE, [Args], []).
+start_link(MemFraction) ->
+    start_link(MemFraction,
+               fun alarm_handler:set_alarm/1, fun alarm_handler:clear_alarm/1).
 
-init([MemFraction]) ->
+start_link(MemFraction, AlarmSet, AlarmClear) ->
+    gen_server:start_link({local, ?SERVER}, ?MODULE,
+                          [MemFraction, {AlarmSet, AlarmClear}], []).
+
+init([MemFraction, AlarmFuns]) ->
     TRef = start_timer(?DEFAULT_MEMORY_CHECK_INTERVAL),
-    State = #state { timeout = ?DEFAULT_MEMORY_CHECK_INTERVAL,
-                     timer = TRef,
-                     alarmed = false},
+    State = #state { timeout    = ?DEFAULT_MEMORY_CHECK_INTERVAL,
+                     timer      = TRef,
+                     alarmed    = false,
+                     alarm_funs = AlarmFuns },
     {ok, set_mem_limits(State, MemFraction)}.
 
 handle_call(get_vm_memory_high_watermark, _From, State) ->
-    {reply, State#state.memory_limit / State#state.total_memory, State};
+    {reply, State#state.memory_fraction, State};
 
 handle_call({set_vm_memory_high_watermark, MemFraction}, _From, State) ->
-    State1 = set_mem_limits(State, MemFraction),
-    error_logger:info_msg("Memory alarm changed to ~p, ~p bytes.~n",
-                          [MemFraction, State1#state.memory_limit]),
-    {reply, ok, State1};
+    {reply, ok, set_mem_limits(State, MemFraction)};
 
 handle_call(get_check_interval, _From, State) ->
     {reply, State#state.timeout, State};
@@ -168,32 +173,41 @@ set_mem_limits(State, MemFraction) ->
                 ?MEMORY_SIZE_FOR_UNKNOWN_OS;
             M -> M
         end,
-    MemLim = get_mem_limit(MemFraction, TotalMemory),
+    UsableMemory = case get_vm_limit() of
+                       Limit when Limit < TotalMemory ->
+                           error_logger:warning_msg(
+                             "Only ~pMB of ~pMB memory usable due to "
+                             "limited address space.~n",
+                             [trunc(V/?ONE_MB) || V <- [Limit, TotalMemory]]),
+                           Limit;
+                       _ ->
+                           TotalMemory
+                   end,
+    MemLim = trunc(MemFraction * UsableMemory),
     error_logger:info_msg("Memory limit set to ~pMB of ~pMB total.~n",
                           [trunc(MemLim/?ONE_MB), trunc(TotalMemory/?ONE_MB)]),
-    internal_update(State #state { total_memory = TotalMemory,
-                                   memory_limit = MemLim }).
+    internal_update(State #state { total_memory    = TotalMemory,
+                                   memory_limit    = MemLim,
+                                   memory_fraction = MemFraction}).
 
 internal_update(State = #state { memory_limit = MemLimit,
-                                 alarmed = Alarmed}) ->
+                                 alarmed      = Alarmed,
+                                 alarm_funs   = {AlarmSet, AlarmClear} }) ->
     MemUsed = erlang:memory(total),
     NewAlarmed = MemUsed > MemLimit,
     case {Alarmed, NewAlarmed} of
-        {false, true} ->
-            emit_update_info(set, MemUsed, MemLimit),
-            alarm_handler:set_alarm({{resource_limit, memory, node()}, []});
-        {true, false} ->
-            emit_update_info(clear, MemUsed, MemLimit),
-            alarm_handler:clear_alarm({resource_limit, memory, node()});
-        _ ->
-            ok
+        {false, true} -> emit_update_info(set, MemUsed, MemLimit),
+                         AlarmSet({{resource_limit, memory, node()}, []});
+        {true, false} -> emit_update_info(clear, MemUsed, MemLimit),
+                         AlarmClear({resource_limit, memory, node()});
+        _             -> ok
     end,
     State #state {alarmed = NewAlarmed}.
 
-emit_update_info(State, MemUsed, MemLimit) ->
+emit_update_info(AlarmState, MemUsed, MemLimit) ->
     error_logger:info_msg(
       "vm_memory_high_watermark ~p. Memory used:~p allowed:~p~n",
-      [State, MemUsed, MemLimit]).
+      [AlarmState, MemUsed, MemLimit]).
 
 start_timer(Timeout) ->
     {ok, TRef} = timer:send_interval(Timeout, update),
@@ -207,7 +221,7 @@ get_vm_limit({win32,_OSname}) ->
         8 -> 8*1024*1024*1024*1024      %% 8 TB for 64 bits  2^42
     end;
 
-%% On a 32-bit machine, if you're using more than 2 gigs of RAM you're
+%% On a 32-bit machine, if you're using more than 4 gigs of RAM you're
 %% in big trouble anyway.
 get_vm_limit(_OsType) ->
     case erlang:system_info(wordsize) of
@@ -216,10 +230,6 @@ get_vm_limit(_OsType) ->
              %%http://en.wikipedia.org/wiki/X86-64#Virtual_address_space_details
     end.
 
-get_mem_limit(MemFraction, TotalMemory) ->
-    AvMem = lists:min([TotalMemory, get_vm_limit()]),
-    trunc(AvMem * MemFraction).
-
 %%----------------------------------------------------------------------------
 %% Internal Helpers
 %%----------------------------------------------------------------------------
@@ -253,29 +263,11 @@ get_total_memory({unix,openbsd}) ->
     sysctl("hw.usermem");
 
 get_total_memory({win32,_OSname}) ->
-    %% Due to the Erlang print format bug, on Windows boxes the memory
-    %% size is broken. For example Windows 7 64 bit with 4Gigs of RAM
-    %% we get negative memory size:
-    %% > os_mon_sysinfo:get_mem_info().
-    %% ["76 -1658880 1016913920 -1 -1021628416 2147352576 2134794240\n"]
-    %% Due to this bug, we don't actually know anything. Even if the
-    %% number is postive we can't be sure if it's correct. This only
-    %% affects us on os_mon versions prior to 2.2.1.
-    case application:get_key(os_mon, vsn) of
-        undefined ->
-            unknown;
-        {ok, Version} ->
-            case rabbit_misc:version_compare(Version, "2.2.1", lt) of
-                true -> %% os_mon is < 2.2.1, so we know nothing
-                    unknown;
-                false ->
-                    [Result|_] = os_mon_sysinfo:get_mem_info(),
-                    {ok, [_MemLoad, TotPhys, _AvailPhys,
-                          _TotPage, _AvailPage, _TotV, _AvailV], _RestStr} =
-                        io_lib:fread("~d~d~d~d~d~d~d", Result),
-                    TotPhys
-            end
-    end;
+    [Result|_] = os_mon_sysinfo:get_mem_info(),
+    {ok, [_MemLoad, TotPhys, _AvailPhys, _TotPage, _AvailPage, _TotV, _AvailV],
+     _RestStr} =
+        io_lib:fread("~d~d~d~d~d~d~d", Result),
+    TotPhys;
 
 get_total_memory({unix, linux}) ->
     File = read_proc_file("/proc/meminfo"),
diff --git a/src/worker_pool.erl b/src/worker_pool.erl
index c9ecccd6..488db5ec 100644
--- a/src/worker_pool.erl
+++ b/src/worker_pool.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(worker_pool).
diff --git a/src/worker_pool_sup.erl b/src/worker_pool_sup.erl
index ff356366..24bc375c 100644
--- a/src/worker_pool_sup.erl
+++ b/src/worker_pool_sup.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(worker_pool_sup).
diff --git a/src/worker_pool_worker.erl b/src/worker_pool_worker.erl
index 1ddcebb2..a976503f 100644
--- a/src/worker_pool_worker.erl
+++ b/src/worker_pool_worker.erl
@@ -10,8 +10,8 @@
 %%
 %% The Original Code is RabbitMQ.
 %%
-%% The Initial Developer of the Original Code is VMware, Inc.
-%% Copyright (c) 2007-2012 VMware, Inc.  All rights reserved.
+%% The Initial Developer of the Original Code is GoPivotal, Inc.
+%% Copyright (c) 2007-2013 GoPivotal, Inc.  All rights reserved.
 %%
 
 -module(worker_pool_worker).
@@ -23,7 +23,7 @@
 -export([set_maximum_since_use/2]).
 
 -export([init/1, handle_call/3, handle_cast/2, handle_info/2,
-         terminate/2, code_change/3, prioritise_cast/2]).
+         terminate/2, code_change/3, prioritise_cast/3]).
 
 %%----------------------------------------------------------------------------
 
@@ -73,8 +73,8 @@ init([WId]) ->
     {ok, WId, hibernate,
      {backoff, ?HIBERNATE_AFTER_MIN, ?HIBERNATE_AFTER_MIN, ?DESIRED_HIBERNATE}}.
 
-prioritise_cast({set_maximum_since_use, _Age}, _State) -> 8;
-prioritise_cast(_Msg,                          _State) -> 0.
+prioritise_cast({set_maximum_since_use, _Age}, _Len, _State) -> 8;
+prioritise_cast(_Msg,                          _Len, _State) -> 0.
 
 handle_call({submit, Fun}, From, WId) ->
     gen_server2:reply(From, run(Fun)),
author	Simon MacMullen <simon@rabbitmq.com>	2013-08-19 17:14:13 +0100
committer	Simon MacMullen <simon@rabbitmq.com>	2013-08-19 17:14:13 +0100
commit	11049881a87eb51e9bf6efbb4d2ef1ee4be62bfe (patch)
tree	2a3f21103e1d6050802ed32714d1e62763aeb0a5 /src
parent	bd1305279e255adcf583afdd55a7cee18a9fcddb (diff)
parent	af4ef7640e817141615298c504e9129d14be1d9d (diff)
download	rabbitmq-server-bug24969.tar.gz