57 files changed, 2604 insertions, 2430 deletions
diff --git a/erts/emulator/beam/atom.names b/erts/emulator/beam/atom.names
index 93ba56dccd..4c67c2029f 100644
--- a/erts/emulator/beam/atom.names
+++ b/erts/emulator/beam/atom.names
@@ -404,6 +404,7 @@ atom min_bin_vheap_size
 atom minor
 atom minor_version
 atom Minus='-'
+atom MinusMinus='--'
 atom module
 atom module_info
 atom monitored_by
@@ -494,6 +495,7 @@ atom packet
 atom packet_size
 atom parallelism
 atom Plus='+'
+atom PlusPlus='++'
 atom pause
 atom pending
 atom pending_driver
diff --git a/erts/emulator/beam/beam_bif_load.c b/erts/emulator/beam/beam_bif_load.c
index 4c8ee5178a..dcbff99f54 100644
--- a/erts/emulator/beam/beam_bif_load.c
+++ b/erts/emulator/beam/beam_bif_load.c
@@ -835,21 +835,25 @@ BIF_RETTYPE finish_after_on_load_2(BIF_ALIST_2)
 	 */
 	num_exps = export_list_size(code_ix);
 	for (i = 0; i < num_exps; i++) {
-	    Export *ep = export_list(i,code_ix);
-	    if (ep == NULL || ep->info.mfa.module != BIF_ARG_1) {
-		continue;
-	    }
-	    if (ep->beam[1] != 0) {
-		ep->addressv[code_ix] = (void *) ep->beam[1];
-		ep->beam[1] = 0;
-	    } else {
-		if (ep->addressv[code_ix] == ep->beam &&
-		    BeamIsOpCode(ep->beam[0], op_apply_bif)) {
-		    continue;
-		}
-                ep->addressv[code_ix] = ep->beam;
-                ep->beam[0] = BeamOpCodeAddr(op_call_error_handler);
-	    }
+            Export *ep = export_list(i, code_ix);
+
+            if (ep == NULL || ep->info.mfa.module != BIF_ARG_1) {
+                continue;
+            }
+
+            DBG_CHECK_EXPORT(ep, code_ix);
+
+            if (ep->trampoline.not_loaded.deferred != 0) {
+                    ep->addressv[code_ix] = (void*)ep->trampoline.not_loaded.deferred;
+                    ep->trampoline.not_loaded.deferred = 0;
+            } else {
+                if (ep->bif_table_index != -1) {
+                    continue;
+                }
+
+                ep->addressv[code_ix] = ep->trampoline.raw;
+                ep->trampoline.op = BeamOpCodeAddr(op_call_error_handler);
+            }
 	}
 	modp->curr.code_hdr->on_load_function_ptr = NULL;
 
@@ -872,10 +876,11 @@ BIF_RETTYPE finish_after_on_load_2(BIF_ALIST_2)
 	    if (ep == NULL || ep->info.mfa.module != BIF_ARG_1) {
 		continue;
 	    }
-	    if (BeamIsOpCode(ep->beam[0], op_apply_bif)) {
+	    if (ep->bif_table_index != -1) {
 		continue;
 	    }
-	    ep->beam[1] = 0;
+
+            ep->trampoline.not_loaded.deferred = 0;
 	}
     }
     erts_release_code_write_permission();
@@ -1125,16 +1130,15 @@ check_process_code(Process* rp, Module* modp, int *redsp, int fcalls)
     mod_size = modp->old.code_length;
 
     /*
-     * Check if current instruction or continuation pointer points into module.
+     * Check if the instruction pointer points into the module.
      */
-    if (ErtsInArea(rp->i, mod_start, mod_size)
-	|| ErtsInArea(rp->cp, mod_start, mod_size)) {
+    if (ErtsInArea(rp->i, mod_start, mod_size)) {
 	return am_true;
     }
- 
+
     *redsp += 1;
 
-    if (erts_check_nif_export_in_area(rp, mod_start, mod_size))
+    if (erts_check_nfunc_in_area(rp, mod_start, mod_size))
 	return am_true;
 
     *redsp += (STACK_START(rp) - rp->stop) / 32;
@@ -1885,25 +1889,28 @@ delete_code(Module* modp)
     for (i = 0; i < num_exps; i++) {
 	Export *ep = export_list(i, code_ix);
         if (ep != NULL && (ep->info.mfa.module == module)) {
-	    if (ep->addressv[code_ix] == ep->beam) {
-		if (BeamIsOpCode(ep->beam[0], op_apply_bif)) {
-		    continue;
-		}
-		else if (BeamIsOpCode(ep->beam[0], op_i_generic_breakpoint)) {
+	    if (ep->addressv[code_ix] == ep->trampoline.raw) {
+                if (BeamIsOpCode(ep->trampoline.op, op_i_generic_breakpoint)) {
 		    ERTS_LC_ASSERT(erts_thr_progress_is_blocking());
 		    ASSERT(modp->curr.num_traced_exports > 0);
 		    DBG_TRACE_MFA_P(&ep->info.mfa,
 				  "export trace cleared, code_ix=%d", code_ix);
-		    erts_clear_export_break(modp, &ep->info);
+		    erts_clear_export_break(modp, ep);
 		}
 		else {
-                    ASSERT(BeamIsOpCode(ep->beam[0], op_call_error_handler) ||
+                    ASSERT(BeamIsOpCode(ep->trampoline.op, op_call_error_handler) ||
                            !erts_initialized);
                 }
             }
-	    ep->addressv[code_ix] = ep->beam;
-	    ep->beam[0] = BeamOpCodeAddr(op_call_error_handler);
-	    ep->beam[1] = 0;
+
+            if (ep->bif_table_index != -1 && ep->is_bif_traced) {
+                /* Code unloading kills both global and local call tracing. */
+                ep->is_bif_traced = 0;
+            }
+
+	    ep->addressv[code_ix] = ep->trampoline.raw;
+	    ep->trampoline.op = BeamOpCodeAddr(op_call_error_handler);
+	    ep->trampoline.not_loaded.deferred = 0;
 	    DBG_TRACE_MFA_P(&ep->info.mfa,
 			    "export invalidation, code_ix=%d", code_ix);
 	}
diff --git a/erts/emulator/beam/beam_bp.c b/erts/emulator/beam/beam_bp.c
index 0832b3f374..1bb20f6ae3 100644
--- a/erts/emulator/beam/beam_bp.c
+++ b/erts/emulator/beam/beam_bp.c
@@ -207,9 +207,6 @@ erts_bp_match_functions(BpFunctions* f, ErtsCodeMFA *mfa, int specified)
 	    if (erts_is_function_native(ci)) {
 		continue;
 	    }
-	    if (is_nil(ci->mfa.module)) { /* Ignore BIF stub */
-		continue;
-	    }
             switch (specified) {
             case 3:
                 if (ci->mfa.arity != mfa->arity)
@@ -244,8 +241,10 @@ erts_bp_match_export(BpFunctions* f, ErtsCodeMFA *mfa, int specified)
     f->matching = (BpFunction *) Alloc(num_exps*sizeof(BpFunction));
     ne = 0;
     for (i = 0; i < num_exps; i++) {
-	Export* ep = export_list(i, code_ix);
-	BeamInstr* pc;
+        BeamInstr *func;
+        Export* ep;
+
+        ep = export_list(i, code_ix);
 
         switch (specified) {
         case 3:
@@ -263,19 +262,20 @@ erts_bp_match_export(BpFunctions* f, ErtsCodeMFA *mfa, int specified)
             ASSERT(0);
         }
 
-	pc = ep->beam;
-	if (ep->addressv[code_ix] == pc) {
-	    if (BeamIsOpCode(*pc, op_apply_bif) ||
-                BeamIsOpCode(*pc, op_call_error_handler)) {
-                continue;
-	    }
-	    ASSERT(BeamIsOpCode(*pc, op_i_generic_breakpoint));
-	} else if (erts_is_function_native(erts_code_to_codeinfo(ep->addressv[code_ix]))) {
-	    continue;
-	}
+        func = ep->addressv[code_ix];
+
+        if (func == ep->trampoline.raw) {
+            if (BeamIsOpCode(*func, op_call_error_handler)) {
+                    continue;
+            }
+            ASSERT(BeamIsOpCode(*func, op_i_generic_breakpoint));
+        } else if (erts_is_function_native(erts_code_to_codeinfo(func))) {
+            continue;
+        }
 
 	f->matching[ne].ci = &ep->info;
 	f->matching[ne].mod = erts_get_module(ep->info.mfa.module, code_ix);
+
 	ne++;
 
     }
@@ -305,18 +305,6 @@ erts_consolidate_bp_data(BpFunctions* f, int local)
     }
 }
 
-void
-erts_consolidate_bif_bp_data(void)
-{
-    int i;
-
-    ERTS_LC_ASSERT(erts_has_code_write_permission());
-    for (i = 0; i < BIF_SIZE; i++) {
-	Export *ep = bif_export[i];
-	consolidate_bp_data(0, &ep->info, 0);
-    }
-}
-
 static void
 consolidate_bp_data(Module* modp, ErtsCodeInfo *ci, int local)
 {
@@ -495,7 +483,7 @@ erts_set_mtrace_break(BpFunctions* f, Binary *match_spec, ErtsTracer tracer)
 }
 
 void
-erts_set_call_trace_bif(ErtsCodeInfo *ci, Binary *match_spec, int local)
+erts_set_export_trace(ErtsCodeInfo *ci, Binary *match_spec, int local)
 {
     Uint flags = local ? ERTS_BPF_LOCAL_TRACE : ERTS_BPF_GLOBAL_TRACE;
 
@@ -503,25 +491,6 @@ erts_set_call_trace_bif(ErtsCodeInfo *ci, Binary *match_spec, int local)
 }
 
 void
-erts_set_mtrace_bif(ErtsCodeInfo *ci, Binary *match_spec, ErtsTracer tracer)
-{
-    set_function_break(ci, match_spec, ERTS_BPF_META_TRACE, 0, tracer);
-}
-
-void
-erts_set_time_trace_bif(ErtsCodeInfo *ci, enum erts_break_op count_op)
-{
-    set_function_break(ci, NULL,
-		       ERTS_BPF_TIME_TRACE|ERTS_BPF_TIME_TRACE_ACTIVE,
-		       count_op, erts_tracer_nil);
-}
-
-void
-erts_clear_time_trace_bif(ErtsCodeInfo *ci) {
-    clear_function_break(ci, ERTS_BPF_TIME_TRACE|ERTS_BPF_TIME_TRACE_ACTIVE);
-}
-
-void
 erts_set_debug_break(BpFunctions* f) {
     set_break(f, NULL, ERTS_BPF_DEBUG, 0, erts_tracer_nil);
 }
@@ -547,7 +516,7 @@ erts_clear_trace_break(BpFunctions* f)
 }
 
 void
-erts_clear_call_trace_bif(ErtsCodeInfo *ci, int local)
+erts_clear_export_trace(ErtsCodeInfo *ci, int local)
 {
     GenericBp* g = ci->u.gen_bp;
 
@@ -566,12 +535,6 @@ erts_clear_mtrace_break(BpFunctions* f)
 }
 
 void
-erts_clear_mtrace_bif(ErtsCodeInfo *ci)
-{
-    clear_function_break(ci, ERTS_BPF_META_TRACE);
-}
-
-void
 erts_clear_debug_break(BpFunctions* f)
 {
     ERTS_LC_ASSERT(erts_thr_progress_is_blocking());
@@ -630,58 +593,56 @@ erts_clear_module_break(Module *modp) {
 }
 
 void
-erts_clear_export_break(Module* modp, ErtsCodeInfo *ci)
+erts_clear_export_break(Module* modp, Export *ep)
 {
+    ErtsCodeInfo *ci; /* code info embedded in the export entry */
+
     ERTS_LC_ASSERT(erts_thr_progress_is_blocking());
 
+    ci = &ep->info; /* breakpoint data (ci->u.gen_bp) hangs off this info */
+
+    ASSERT(erts_codeinfo_to_code(ci) == ep->trampoline.raw); /* code after ci must be this entry's trampoline */
+
+    ASSERT(BeamIsOpCode(ep->trampoline.op, op_i_generic_breakpoint));
+    ep->trampoline.op = 0; /* drop the breakpoint instruction from the trampoline */
+
     clear_function_break(ci, ERTS_BPF_ALL);
     erts_commit_staged_bp();
-    *erts_codeinfo_to_code(ci) = (BeamInstr) 0;
+
     consolidate_bp_data(modp, ci, 0);
     ASSERT(ci->u.gen_bp == NULL);
 }
 
 /*
- * If c_p->cp is a trace return instruction, we set cp
- * to be the place where we again start to execute code.
+ * If the topmost continuation pointer on the stack is a trace return
+ * instruction, we modify it to be the place where we again start to
+ * execute code.
  *
- * cp is used by match spec {caller} to get the calling
- * function, and if we don't do this fixup it will be
- * 'undefined'. This has the odd side effect of {caller}
- * not really being which function is the caller, but
- * rather which function we are about to return to.
+ * This continuation pointer is used by match spec {caller} to get the
+ * calling function, and if we don't do this fixup it will be
+ * 'undefined'. This has the odd side effect of {caller} not really
+ * being the function which is the caller, but rather the function
+ * which we are about to return to.
  */
 static void fixup_cp_before_trace(Process *c_p, int *return_to_trace)
 {
-    Eterm *cpp, *E = c_p->stop;
-    BeamInstr w = *c_p->cp;
-    if (BeamIsOpCode(w, op_return_trace)) {
-        cpp = &E[2];
-    } else if (BeamIsOpCode(w, op_i_return_to_trace)) {
-        *return_to_trace = 1;
-        cpp = &E[0];
-    } else if (BeamIsOpCode(w, op_i_return_time_trace)) {
-        cpp = &E[0];
-    } else {
-        cpp = NULL;
-    }
-    if (cpp) {
-        for (;;) {
-            BeamInstr w = *cp_val(*cpp);
-            if (BeamIsOpCode(w, op_return_trace)) {
-                cpp += 3;
-            } else if (BeamIsOpCode(w, op_i_return_to_trace)) {
-                *return_to_trace = 1;
-                cpp += 1;
-            } else if (BeamIsOpCode(w, op_i_return_time_trace)) {
-                cpp += 2;
-            } else {
-                break;
-            }
+    Eterm *cpp = c_p->stop; /* start at the topmost stack word */
+
+    for (;;) {
+        BeamInstr w = *cp_val(*cpp); /* instruction word at the return address stored in *cpp */
+        if (BeamIsOpCode(w, op_return_trace)) {
+            cpp += 3; /* step over the three-word return_trace frame */
+        } else if (BeamIsOpCode(w, op_i_return_to_trace)) {
+            *return_to_trace = 1; /* report that a return_to frame is already on the stack */
+            cpp += 1; /* step over the one-word return_to_trace frame */
+        } else if (BeamIsOpCode(w, op_i_return_time_trace)) {
+            cpp += 2; /* step over the two-word return_time_trace frame */
+        } else {
+            break; /* *cpp is the first return address that is not a trace instruction */
         }
-        c_p->cp = (BeamInstr *) cp_val(*cpp);
-        ASSERT(is_CP(*cpp));
     }
+    c_p->stop[0] = (Eterm) cp_val(*cpp); /* overwrite the topmost slot; do_call_trace() saves and restores the old value */
+    ASSERT(is_CP(*cpp));
 }
 
 BeamInstr
@@ -743,12 +704,13 @@ erts_generic_breakpoint(Process* c_p, ErtsCodeInfo *info, Eterm* reg)
 
     if (bp_flags & ERTS_BPF_TIME_TRACE_ACTIVE) {
 	Eterm w;
+        Eterm* E;
 	erts_trace_time_call(c_p, info, bp->time);
-	w = (BeamInstr) *c_p->cp;
+        E = c_p->stop;
+        w = (BeamInstr) *cp_val(E[0]); /* instruction AT the return address, not the address itself */
 	if (! (BeamIsOpCode(w, op_i_return_time_trace) ||
 	       BeamIsOpCode(w, op_return_trace) ||
                BeamIsOpCode(w, op_i_return_to_trace)) ) {
-	    Eterm* E = c_p->stop;
 	    ASSERT(c_p->htop <= E && E <= c_p->hend);
 	    if (E - 2 < c_p->htop) {
 		(void) erts_garbage_collect(c_p, 2, reg, info->mfa.arity);
@@ -759,9 +721,8 @@ erts_generic_breakpoint(Process* c_p, ErtsCodeInfo *info, Eterm* reg)
 	    ASSERT(c_p->htop <= E && E <= c_p->hend);
 
 	    E -= 2;
-	    E[0] = make_cp(erts_codeinfo_to_code(info));
-	    E[1] = make_cp(c_p->cp);     /* original return address */
-	    c_p->cp = beam_return_time_trace;
+	    E[1] = make_cp(erts_codeinfo_to_code(info));
+	    E[0] = (Eterm) beam_return_time_trace;
 	    c_p->stop = E;
 	}
     }
@@ -773,237 +734,24 @@ erts_generic_breakpoint(Process* c_p, ErtsCodeInfo *info, Eterm* reg)
     }
 }
 
-/*
- * Entry point called by the trace wrap functions in erl_bif_wrap.c
- *
- * The trace wrap functions are themselves called through the export
- * entries instead of the original BIF functions.
- */
-Eterm
-erts_bif_trace(int bif_index, Process* p, Eterm* args, BeamInstr* I)
-{
-    Eterm result;
-    Eterm (*func)(Process*, Eterm*, BeamInstr*);
-    Export* ep = bif_export[bif_index];
-    Uint32 flags = 0, flags_meta = 0;
-    ErtsTracer meta_tracer = erts_tracer_nil;
-    int applying = (I == ep->beam); /* Yup, the apply code for a bif
-                                      * is actually in the
-                                      * export entry */
-    BeamInstr *cp = p->cp;
-    GenericBp* g;
-    GenericBpData* bp = NULL;
-    Uint bp_flags = 0;
-    int return_to_trace = 0;
-
-    ERTS_CHK_HAVE_ONLY_MAIN_PROC_LOCK(p);
-
-    g = ep->info.u.gen_bp;
-    if (g) {
-	bp = &g->data[erts_active_bp_ix()];
-	bp_flags = bp->flags;
-    }
-
-    /*
-     * Make continuation pointer OK, it is not during direct BIF calls,
-     * but it is correct during apply of bif.
-     */
-    if (!applying) {
-	p->cp = I;
-    } else {
-        fixup_cp_before_trace(p, &return_to_trace);
-    }
-    if (bp_flags & (ERTS_BPF_LOCAL_TRACE|ERTS_BPF_GLOBAL_TRACE) &&
-	IS_TRACED_FL(p, F_TRACE_CALLS)) {
-	int local = !!(bp_flags & ERTS_BPF_LOCAL_TRACE);
-	flags = erts_call_trace(p, &ep->info, bp->local_ms, args,
-				local, &ERTS_TRACER(p));
-    }
-    if (bp_flags & ERTS_BPF_META_TRACE) {
-	ErtsTracer old_tracer;
-
-        meta_tracer = erts_atomic_read_nob(&bp->meta_tracer->tracer);
-        old_tracer = meta_tracer;
-	flags_meta = erts_call_trace(p, &ep->info, bp->meta_ms, args,
-				     0, &meta_tracer);
-
-	if (!ERTS_TRACER_COMPARE(old_tracer, meta_tracer)) {
-            ErtsTracer new_tracer = erts_tracer_nil;
-            erts_tracer_update(&new_tracer, meta_tracer);
-	    if (old_tracer == erts_atomic_cmpxchg_acqb(
-                    &bp->meta_tracer->tracer,
-                    (erts_aint_t)new_tracer,
-                    (erts_aint_t)old_tracer)) {
-                ERTS_TRACER_CLEAR(&old_tracer);
-            } else {
-                ERTS_TRACER_CLEAR(&new_tracer);
-            }
-	}
-    }
-    if (bp_flags & ERTS_BPF_TIME_TRACE_ACTIVE &&
-	IS_TRACED_FL(p, F_TRACE_CALLS)) {
-	erts_trace_time_call(p, &ep->info, bp->time);
-    }
-
-    /* Restore original continuation pointer (if changed). */
-    p->cp = cp;
-
-    func = bif_table[bif_index].f;
-
-    result = func(p, args, I);
-
-    if (erts_nif_export_check_save_trace(p, result,
-					 applying, ep,
-					 cp, flags,
-					 flags_meta, I,
-					 meta_tracer)) {
-	/*
-	 * erts_bif_trace_epilogue() will be called
-	 * later when appropriate via the NIF export
-	 * scheduling functionality...
-	 */
-	return result;
-    }
-
-    return erts_bif_trace_epilogue(p, result, applying, ep, cp,
-				   flags, flags_meta, I,
-				   meta_tracer);
-}
-
-Eterm
-erts_bif_trace_epilogue(Process *p, Eterm result, int applying,
-			Export* ep, BeamInstr *cp, Uint32 flags,
-			Uint32 flags_meta, BeamInstr* I,
-			ErtsTracer meta_tracer)
-{
-    if (applying && (flags & MATCH_SET_RETURN_TO_TRACE)) {
-	BeamInstr i_return_trace      = beam_return_trace[0];
-	BeamInstr i_return_to_trace   = beam_return_to_trace[0];
-	BeamInstr i_return_time_trace = beam_return_time_trace[0];
-	Eterm *cpp;
-	/* Maybe advance cp to skip trace stack frames */
-	for (cpp = p->stop;  ;  cp = cp_val(*cpp++)) {
-	    if (*cp == i_return_trace) {
-		/* Skip stack frame variables */
-		while (is_not_CP(*cpp)) cpp++;
-		cpp += 2; /* Skip return_trace parameters */
-	    } else if (*cp == i_return_time_trace) {
-		/* Skip stack frame variables */
-		while (is_not_CP(*cpp)) cpp++;
-		cpp += 1; /* Skip return_time_trace parameters */
-	    } else if (*cp == i_return_to_trace) {
-		/* A return_to trace message is going to be generated
-		 * by normal means, so we do not have to.
-		 */
-		cp = NULL;
-		break;
-	    } else break;
-	}
-    }
-
-    /* Try to get these in the order
-     * they usually appear in normal code... */
-    if (is_non_value(result)) {
-	Uint reason = p->freason;
-	if (reason != TRAP) {
-	    Eterm class;
-	    Eterm value = p->fvalue;
-	    /* Expand error value like in handle_error() */
-	    if (reason & EXF_ARGLIST) {
-		Eterm *tp;
-		ASSERT(is_tuple(value));
-		tp = tuple_val(value);
-		value = tp[1];
-	    }
-	    if ((reason & EXF_THROWN) && (p->catches <= 0)) {
-                Eterm *hp = HAlloc(p, 3);
-		value = TUPLE2(hp, am_nocatch, value);
-		reason = EXC_ERROR;
-	    }
-	    /* Note: expand_error_value() could theoretically
-	     * allocate on the heap, but not for any error
-	     * returned by a BIF, and it would do no harm,
-	     * just be annoying.
-	     */
-	    value = expand_error_value(p, reason, value);
-	    class = exception_tag[GET_EXC_CLASS(reason)];
-
-	    if (flags_meta & MATCH_SET_EXCEPTION_TRACE) {
-		erts_trace_exception(p, &ep->info.mfa, class, value,
-				     &meta_tracer);
-	    }
-	    if (flags & MATCH_SET_EXCEPTION_TRACE) {
-		erts_trace_exception(p, &ep->info.mfa, class, value,
-				     &ERTS_TRACER(p));
-	    }
-	    if ((flags & MATCH_SET_RETURN_TO_TRACE) && p->catches > 0) {
-		/* can only happen if(local)*/
-		Eterm *ptr = p->stop;
-		ASSERT(is_CP(*ptr));
-		ASSERT(ptr <= STACK_START(p));
-		/* Search the nearest stack frame for a catch */
-		while (++ptr < STACK_START(p)) {
-		    if (is_CP(*ptr)) break;
-		    if (is_catch(*ptr)) {
-			if (applying) {
-			    /* Apply of BIF, cp is in calling function */
-			    if (cp) erts_trace_return_to(p, cp);
-			} else {
-			    /* Direct bif call, I points into
-			     * calling function */
-			    erts_trace_return_to(p, I);
-			}
-		    }
-		}
-	    }
-	    if ((flags_meta|flags) & MATCH_SET_EXCEPTION_TRACE) {
-		erts_proc_lock(p, ERTS_PROC_LOCKS_ALL_MINOR);
-		ERTS_TRACE_FLAGS(p) |= F_EXCEPTION_TRACE;
-		erts_proc_unlock(p, ERTS_PROC_LOCKS_ALL_MINOR);
-	    }
-	}
-    } else {
-	if (flags_meta & MATCH_SET_RX_TRACE) {
-	    erts_trace_return(p, &ep->info.mfa, result, &meta_tracer);
-	}
-	/* MATCH_SET_RETURN_TO_TRACE cannot occur if(meta) */
-	if (flags & MATCH_SET_RX_TRACE) {
-	    erts_trace_return(p, &ep->info.mfa, result, &ERTS_TRACER(p));
-	}
-	if (flags & MATCH_SET_RETURN_TO_TRACE &&
-            IS_TRACED_FL(p, F_TRACE_RETURN_TO)) {
-	    /* can only happen if(local)*/
-	    if (applying) {
-		/* Apply of BIF, cp is in calling function */
-		if (cp) erts_trace_return_to(p, cp);
-	    } else {
-		/* Direct bif call, I points into calling function */
-		erts_trace_return_to(p, I);
-	    }
-	}
-    }
-    ERTS_CHK_HAVE_ONLY_MAIN_PROC_LOCK(p);
-    return result;
-}
-
 static ErtsTracer
 do_call_trace(Process* c_p, ErtsCodeInfo* info, Eterm* reg,
 	      int local, Binary* ms, ErtsTracer tracer)
 {
     int return_to_trace = 0;
-    BeamInstr *cp_save = c_p->cp;
     Uint32 flags;
     Uint need = 0;
+    Eterm cp_save;
     Eterm* E = c_p->stop;
 
-    fixup_cp_before_trace(c_p, &return_to_trace);
+    cp_save = E[0];
 
+    fixup_cp_before_trace(c_p, &return_to_trace);
     ERTS_UNREQ_PROC_MAIN_LOCK(c_p);
     flags = erts_call_trace(c_p, info, ms, reg, local, &tracer);
     ERTS_REQ_PROC_MAIN_LOCK(c_p);
 
-    /* restore cp after potential fixup */
-    c_p->cp = cp_save;
+    E[0] = cp_save;
 
     ASSERT(!ERTS_PROC_IS_EXITING(c_p));
     if ((flags & MATCH_SET_RETURN_TO_TRACE) && !return_to_trace) {
@@ -1023,28 +771,23 @@ do_call_trace(Process* c_p, ErtsCodeInfo* info, Eterm* reg,
     if (flags & MATCH_SET_RETURN_TO_TRACE && !return_to_trace) {
 	E -= 1;
 	ASSERT(c_p->htop <= E && E <= c_p->hend);
-	E[0] = make_cp(c_p->cp);
-	c_p->cp = beam_return_to_trace;
+	E[0] = (Eterm) beam_return_to_trace;
+        c_p->stop = E;
     }
-    if (flags & MATCH_SET_RX_TRACE)
-    {
+    if (flags & MATCH_SET_RX_TRACE) {
 	E -= 3;
         c_p->stop = E;
 	ASSERT(c_p->htop <= E && E <= c_p->hend);
 	ASSERT(is_CP((Eterm) (UWord) (&info->mfa.module)));
 	ASSERT(IS_TRACER_VALID(tracer));
-	E[2] = make_cp(c_p->cp);
-        E[1] = copy_object(tracer, c_p);
-	E[0] = make_cp(&info->mfa.module);
-                               /* We ARE at the beginning of an instruction,
-				  the funcinfo is above i. */
-	c_p->cp = (flags & MATCH_SET_EXCEPTION_TRACE) ?
-	    beam_exception_trace : beam_return_trace;
+        E[2] = copy_object(tracer, c_p);
+        E[1] = make_cp(&info->mfa.module);
+        E[0] = (Eterm) ((flags & MATCH_SET_EXCEPTION_TRACE) ?
+                        beam_exception_trace : beam_return_trace);
 	erts_proc_lock(c_p, ERTS_PROC_LOCKS_ALL_MINOR);
 	ERTS_TRACE_FLAGS(c_p) |= F_EXCEPTION_TRACE;
 	erts_proc_unlock(c_p, ERTS_PROC_LOCKS_ALL_MINOR);
-    } else
-        c_p->stop = E;
+    }
     return tracer;
 }
 
diff --git a/erts/emulator/beam/beam_bp.h b/erts/emulator/beam/beam_bp.h
index a64765822b..54e84e7e4f 100644
--- a/erts/emulator/beam/beam_bp.h
+++ b/erts/emulator/beam/beam_bp.h
@@ -119,20 +119,16 @@ void erts_bp_free_matched_functions(BpFunctions* f);
 void erts_install_breakpoints(BpFunctions* f);
 void erts_uninstall_breakpoints(BpFunctions* f);
 void erts_consolidate_bp_data(BpFunctions* f, int local);
-void erts_consolidate_bif_bp_data(void);
 
 void erts_set_trace_break(BpFunctions *f, Binary *match_spec);
 void erts_clear_trace_break(BpFunctions *f);
 
-void erts_set_call_trace_bif(ErtsCodeInfo *ci, Binary *match_spec, int local);
-void erts_clear_call_trace_bif(ErtsCodeInfo *ci, int local);
+void erts_set_export_trace(ErtsCodeInfo *ci, Binary *match_spec, int local);
+void erts_clear_export_trace(ErtsCodeInfo *ci, int local);
 
 void erts_set_mtrace_break(BpFunctions *f, Binary *match_spec,
 			  ErtsTracer tracer);
 void erts_clear_mtrace_break(BpFunctions *f);
-void erts_set_mtrace_bif(ErtsCodeInfo *ci, Binary *match_spec,
-			 ErtsTracer tracer);
-void erts_clear_mtrace_bif(ErtsCodeInfo *ci);
 
 void erts_set_debug_break(BpFunctions *f);
 void erts_clear_debug_break(BpFunctions *f);
@@ -142,7 +138,7 @@ void erts_clear_count_break(BpFunctions *f);
 
 void erts_clear_all_breaks(BpFunctions* f);
 int erts_clear_module_break(Module *modp);
-void erts_clear_export_break(Module *modp, ErtsCodeInfo* ci);
+void erts_clear_export_break(Module *modp, Export *ep);
 
 BeamInstr erts_generic_breakpoint(Process* c_p, ErtsCodeInfo *ci, Eterm* reg);
 BeamInstr erts_trace_break(Process *p, ErtsCodeInfo *ci, Eterm *args,
@@ -151,8 +147,6 @@ BeamInstr erts_trace_break(Process *p, ErtsCodeInfo *ci, Eterm *args,
 int erts_is_trace_break(ErtsCodeInfo *ci, Binary **match_spec_ret, int local);
 int erts_is_mtrace_break(ErtsCodeInfo *ci, Binary **match_spec_ret,
 			 ErtsTracer *tracer_ret);
-int erts_is_mtrace_bif(ErtsCodeInfo *ci, Binary **match_spec_ret,
-		       ErtsTracer *tracer_ret);
 int erts_is_native_break(ErtsCodeInfo *ci);
 int erts_is_count_break(ErtsCodeInfo *ci, Uint *count_ret);
 int erts_is_time_break(Process *p, ErtsCodeInfo *ci, Eterm *call_time);
@@ -163,10 +157,6 @@ void erts_schedule_time_break(Process *p, Uint out);
 void erts_set_time_break(BpFunctions *f, enum erts_break_op);
 void erts_clear_time_break(BpFunctions *f);
 
-int erts_is_time_trace_bif(Process *p, ErtsCodeInfo *ci, Eterm *call_time);
-void erts_set_time_trace_bif(ErtsCodeInfo *ci, enum erts_break_op);
-void erts_clear_time_trace_bif(ErtsCodeInfo *ci);
-
 ErtsCodeInfo *erts_find_local_func(ErtsCodeMFA *mfa);
 
 #if ERTS_GLB_INLINE_INCL_FUNC_DEF
diff --git a/erts/emulator/beam/beam_debug.c b/erts/emulator/beam/beam_debug.c
index 4d52435139..6a9a6b7dc9 100644
--- a/erts/emulator/beam/beam_debug.c
+++ b/erts/emulator/beam/beam_debug.c
@@ -332,7 +332,7 @@ erts_debug_disassemble_1(BIF_ALIST_1)
 		   "unknown " HEXF "\n", instr);
 	code_ptr++;
     }
-    if (i == op_call_nif) {
+    if (i == op_call_nif_WWW) {
         /*
          * The rest of the code will not be executed. Don't disassemble any
          * more code in this function.
diff --git a/erts/emulator/beam/beam_emu.c b/erts/emulator/beam/beam_emu.c
index 07c16e3415..ee70b138e5 100644
--- a/erts/emulator/beam/beam_emu.c
+++ b/erts/emulator/beam/beam_emu.c
@@ -111,10 +111,9 @@ do {                                     \
 
 #define CHECK_ALIGNED(Dst) ASSERT((((Uint)&Dst) & (sizeof(Uint)-1)) == 0)
 
-#define GET_BIF_MODULE(p)  (p->info.mfa.module)
-#define GET_BIF_FUNCTION(p)  (p->info.mfa.function)
-#define GET_BIF_ARITY(p)  (p->info.mfa.arity)
-#define GET_BIF_ADDRESS(p) ((BifFunction) (p->beam[1]))
+#define GET_EXPORT_MODULE(p)  ((p)->info.mfa.module)
+#define GET_EXPORT_FUNCTION(p)  ((p)->info.mfa.function)
+#define GET_EXPORT_ARITY(p)  ((p)->info.mfa.arity)
 #define TermWords(t) (((t) / (sizeof(BeamInstr)/sizeof(Eterm))) + !!((t) % (sizeof(BeamInstr)/sizeof(Eterm))))
 
 
@@ -141,10 +140,6 @@ do {                                     \
      BeamCodeAddr(IP) < (BeamInstr)LabelAddr(end_emulator_loop))
 #endif /* NO_JUMP_TABLE */
 
-#define SET_CP(p, ip)           \
-   ASSERT(VALID_INSTR(*(ip)));  \
-   (p)->cp = (ip)
-
 #define SET_I(ip) \
    ASSERT(VALID_INSTR(* (Eterm *)(ip))); \
    I = (ip)
@@ -254,72 +249,6 @@ void** beam_ops;
 #define Q(N) (N*sizeof(Eterm *))
 #define l(N) (freg[N].fd)
 
-/*
- * Check that we haven't used the reductions and jump to function pointed to by
- * the I register.  If we are out of reductions, do a context switch.
- */
-
-#define DispatchMacro()				\
-  do {						\
-     BeamInstr dis_next;                        \
-     dis_next = *I;                             \
-     CHECK_ARGS(I);				\
-     if (FCALLS > 0 || FCALLS > neg_o_reds) {	\
-        FCALLS--;				\
-        Goto(dis_next);				\
-     } else {					\
-	goto context_switch;			\
-     }						\
- } while (0)                                    \
-
-#define DispatchMacroFun()			\
-  do {						\
-     BeamInstr dis_next;                        \
-     dis_next = *I;                             \
-     CHECK_ARGS(I);				\
-     if (FCALLS > 0 || FCALLS > neg_o_reds) {	\
-        FCALLS--;				\
-        Goto(dis_next);				\
-     } else {					\
-	goto context_switch_fun;		\
-     }						\
- } while (0)
-
-#define DispatchMacrox()                                                \
-  do {                                                                  \
-     if (FCALLS > 0) {                                                  \
-        BeamInstr dis_next;                                             \
-        SET_I(((Export *) Arg(0))->addressv[erts_active_code_ix()]);    \
-        dis_next = *I;                                                  \
-        FCALLS--;                                                       \
-        CHECK_ARGS(I);                                                  \
-        Goto(dis_next);                                                 \
-     } else if (ERTS_PROC_GET_SAVED_CALLS_BUF(c_p)                      \
-		&& FCALLS > neg_o_reds) {                               \
-        goto save_calls1;                                               \
-     } else {                                                           \
-        SET_I(((Export *) Arg(0))->addressv[erts_active_code_ix()]);    \
-        CHECK_ARGS(I);                                                  \
-	goto context_switch;                                            \
-     }                                                                  \
- } while (0)
-
-#ifdef DEBUG
-/*
- * To simplify breakpoint setting, put the code in one place only and jump to it.
- */
-#  define Dispatch() goto do_dispatch
-#  define Dispatchx() goto do_dispatchx
-#  define Dispatchfun() goto do_dispatchfun
-#else
-/*
- * Inline for speed.
- */
-#  define Dispatch() DispatchMacro()
-#  define Dispatchx() DispatchMacrox()
-#  define Dispatchfun() DispatchMacroFun()
-#endif
-
 #define Arg(N)       I[(N)+1]
 
 #define GetSource(raw, dst)			\
@@ -352,19 +281,6 @@ do {						\
     }						\
 } while(0)
 
-#define DispatchReturn                          \
-do {                                            \
-    if (FCALLS > 0 || FCALLS > neg_o_reds) {	\
-        FCALLS--;				\
-        Goto(*I);                               \
-    }                                           \
-    else {					\
-        c_p->current = NULL;                    \
-        c_p->arity = 1;                         \
-        goto context_switch3;			\
-    }						\
-} while (0)
-
 #ifdef DEBUG
 /* Better static type testing by the C compiler */
 #  define BEAM_IS_TUPLE(Src) is_tuple(Src)
@@ -521,10 +437,10 @@ init_emulator(void)
         }                                                                                      \
     } while(0)
 
-#define DTRACE_RETURN_FROM_PC(p)                                                        \
+#define DTRACE_RETURN_FROM_PC(p, i)                                                        \
     do {                                                                                \
         ErtsCodeMFA* cmfa;                                                                  \
-        if (DTRACE_ENABLED(function_return) && (cmfa = find_function_from_pc((p)->cp))) { \
+        if (DTRACE_ENABLED(function_return) && (cmfa = find_function_from_pc(i))) { \
             DTRACE_RETURN((p), cmfa);                               \
         }                                                                               \
     } while(0)
@@ -534,7 +450,7 @@ init_emulator(void)
 #define DTRACE_GLOBAL_CALL(p, mfa)       do {} while (0)
 #define DTRACE_GLOBAL_CALL_FROM_EXPORT(p, e) do {} while (0)
 #define DTRACE_RETURN(p, mfa)            do {} while (0)
-#define DTRACE_RETURN_FROM_PC(p)             do {} while (0)
+#define DTRACE_RETURN_FROM_PC(p, i)      do {} while (0)
 #define DTRACE_BIF_ENTRY(p, mfa)         do {} while (0)
 #define DTRACE_BIF_RETURN(p, mfa)        do {} while (0)
 #define DTRACE_NIF_ENTRY(p, mfa)         do {} while (0)
@@ -772,27 +688,9 @@ void process_main(Eterm * x_reg_array, FloatDef* f_reg_array)
 #endif
 
 #include "beam_hot.h"
-
-#ifdef DEBUG
     /*
-     * Set a breakpoint here to get control just after a call instruction.
-     * I points to the first instruction in the called function.
-     *
-     * In gdb, use 'call dis(I-5, 1)' to show the name of the function.
-     */
- do_dispatch:
-     DispatchMacro();
-
- do_dispatchx:
-     DispatchMacrox();
-
- do_dispatchfun:
-     DispatchMacroFun();
-
-#endif
-
-    /*
-     * Jumped to from the Dispatch() macro when the reductions are used up.
+     * The labels are jumped to from the $DISPATCH() macros when the reductions
+     * are used up.
      *
      * Since the I register points just beyond the FuncBegin instruction, we
      * can get the module, function, and arity for the function being
@@ -986,18 +884,42 @@ void process_main(Eterm * x_reg_array, FloatDef* f_reg_array)
   }
 #endif
     return;			/* Never executed */
+}
 
-  save_calls1:
-    {
-	BeamInstr dis_next;
+/*
+ * Enter all BIFs into the export table.
+ *
+ * Note that they will all call the error_handler until their modules have been
+ * loaded, which may prevent the system from booting if BIFs from non-preloaded
+ * modules are apply/3'd while loading code. Ordinary BIF calls will work fine
+ * however since they won't go through export entries.
+ */
+static void install_bifs(void) {
+    int i;
+
+    for (i = 0; i < BIF_SIZE; i++) {
+        BifEntry *entry;
+        Export *ep;
+        int j;
 
-	save_calls(c_p, (Export *) Arg(0));
+        entry = &bif_table[i];
 
-	SET_I(((Export *) Arg(0))->addressv[erts_active_code_ix()]);
+        ep = erts_export_put(entry->module, entry->name, entry->arity);
 
-	dis_next = *I;
-	FCALLS--;
-	Goto(dis_next);
+        ep->info.op = BeamOpCodeAddr(op_i_func_info_IaaI);
+        ep->info.mfa.module = entry->module;
+        ep->info.mfa.function = entry->name;
+        ep->info.mfa.arity = entry->arity;
+        ep->bif_table_index = i;
+
+        memset(&ep->trampoline, 0, sizeof(ep->trampoline));
+        ep->trampoline.op = BeamOpCodeAddr(op_call_error_handler);
+
+        for (j = 0; j < ERTS_NUM_CODE_IX; j++) {
+            ep->addressv[j] = ep->trampoline.raw;
+        }
+
+        bif_export[i] = ep;
     }
 }
 
@@ -1008,43 +930,30 @@ void process_main(Eterm * x_reg_array, FloatDef* f_reg_array)
 static void
 init_emulator_finish(void)
 {
-     int i;
-     Export* ep;
-
 #if defined(ARCH_64) && defined(CODE_MODEL_SMALL)
-     for (i = 0; i < NUMBER_OF_OPCODES; i++) {
-         BeamInstr instr = BeamOpCodeAddr(i);
-         if (instr >= (1ull << 32)) {
-             erts_exit(ERTS_ERROR_EXIT,
-                       "This run-time was supposed be compiled with all code below 2Gb,\n"
-                       "but the instruction '%s' is located at %016lx.\n",
-                       opc[i].name, instr);
-         }
-     }
+    int i;
+
+    for (i = 0; i < NUMBER_OF_OPCODES; i++) {
+        BeamInstr instr = BeamOpCodeAddr(i);
+        if (instr >= (1ull << 32)) {
+            erts_exit(ERTS_ERROR_EXIT,
+                      "This run-time was supposed be compiled with all code below 2Gb,\n"
+                      "but the instruction '%s' is located at %016lx.\n",
+                      opc[i].name, instr);
+        }
+    }
 #endif
 
-     beam_apply[0]             = BeamOpCodeAddr(op_i_apply);
-     beam_apply[1]             = BeamOpCodeAddr(op_normal_exit);
-     beam_exit[0]              = BeamOpCodeAddr(op_error_action_code);
-     beam_continue_exit[0]     = BeamOpCodeAddr(op_continue_exit);
-     beam_return_to_trace[0]   = BeamOpCodeAddr(op_i_return_to_trace);
-     beam_return_trace[0]      = BeamOpCodeAddr(op_return_trace);
-     beam_exception_trace[0]   = BeamOpCodeAddr(op_return_trace); /* UGLY */
-     beam_return_time_trace[0] = BeamOpCodeAddr(op_i_return_time_trace);
+    beam_apply[0]             = BeamOpCodeAddr(op_i_apply);
+    beam_apply[1]             = BeamOpCodeAddr(op_normal_exit);
+    beam_exit[0]              = BeamOpCodeAddr(op_error_action_code);
+    beam_continue_exit[0]     = BeamOpCodeAddr(op_continue_exit);
+    beam_return_to_trace[0]   = BeamOpCodeAddr(op_i_return_to_trace);
+    beam_return_trace[0]      = BeamOpCodeAddr(op_return_trace);
+    beam_exception_trace[0]   = BeamOpCodeAddr(op_return_trace); /* UGLY */
+    beam_return_time_trace[0] = BeamOpCodeAddr(op_i_return_time_trace);
 
-     /*
-      * Enter all BIFs into the export table.
-      */
-     for (i = 0; i < BIF_SIZE; i++) {
-	 ep = erts_export_put(bif_table[i].module,
-			      bif_table[i].name,
-			      bif_table[i].arity);
-	 bif_export[i] = ep;
-	 ep->beam[0] = BeamOpCodeAddr(op_apply_bif);
-	 ep->beam[1] = (BeamInstr) bif_table[i].f;
-	 /* XXX: set func info for bifs */
-	 ep->info.op = BeamOpCodeAddr(op_i_func_info_IaaI);
-     }
+    install_bifs();
 }
 
 /*
@@ -1257,7 +1166,7 @@ void erts_dirty_process_main(ErtsSchedulerData *esdp)
 	 * I[2]: Pointer to erl_module_nif
 	 * I[3]: Function pointer to dirty NIF
 	 *
-	 * This layout is determined by the NifExport struct
+	 * This layout is determined by the ErtsNativeFunc struct
 	 */
 
 	ERTS_MSACC_SET_STATE_CACHED_M_X(ERTS_MSACC_STATE_NIF);
@@ -1271,11 +1180,11 @@ void erts_dirty_process_main(ErtsSchedulerData *esdp)
 	ERTS_UNREQ_PROC_MAIN_LOCK(c_p);
 
 	ASSERT(!ERTS_PROC_IS_EXITING(c_p));
-	if (BeamIsOpCode(*I, op_apply_bif)) {
+	if (BeamIsOpCode(*I, op_call_bif_W)) {
 	    exiting = erts_call_dirty_bif(esdp, c_p, I, reg);
 	}
 	else {
-	    ASSERT(BeamIsOpCode(*I, op_call_nif));
+	    ASSERT(BeamIsOpCode(*I, op_call_nif_WWW));
             exiting = erts_call_dirty_nif(esdp, c_p, I, reg);
 	}
 
@@ -1344,6 +1253,33 @@ Eterm error_atom[NUMBER_EXIT_CODES] = {
   am_badkey,		/* 19 */
 };
 
+/* Returns the return address at E[0] in printable form, skipping tracing in
+ * the same manner as gather_stacktrace.
+ *
+ * This is needed to generate correct stacktraces when throwing errors from
+ * instructions that return like an ordinary function, such as call_nif. */
+BeamInstr *erts_printable_return_address(Process* p, Eterm *E) {
+    Eterm *ptr = E;
+
+    ASSERT(is_CP(*ptr));
+
+    while (ptr < STACK_START(p)) {
+        BeamInstr *cp = cp_val(*ptr);
+
+        if (cp == beam_exception_trace || cp == beam_return_trace) {
+            ptr += 3;
+        } else if (cp == beam_return_time_trace) {
+            ptr += 2;
+        } else if (cp == beam_return_to_trace) {
+            ptr += 1;
+        } else {
+            return cp;
+        }
+    }
+
+    ERTS_ASSERT(!"No continuation pointer on stack");
+}
+
 /*
  * To fully understand the error handling, one must keep in mind that
  * when an exception is thrown, the search for a handler can jump back
@@ -1373,14 +1309,14 @@ handle_error(Process* c_p, BeamInstr* pc, Eterm* reg, ErtsCodeMFA *bif_mfa)
 
     ASSERT(c_p->freason != TRAP); /* Should have been handled earlier. */
 
-    if (c_p->freason & EXF_RESTORE_NIF)
-	erts_nif_export_restore_error(c_p, &pc, reg, &bif_mfa);
+    if (c_p->freason & EXF_RESTORE_NFUNC)
+	erts_nfunc_restore_error(c_p, &pc, reg, &bif_mfa);
 
 #ifdef DEBUG
     if (bif_mfa) {
-	/* Verify that bif_mfa does not point into our nif export */
-	NifExport *nep = ERTS_PROC_GET_NIF_TRAP_EXPORT(c_p);
-	ASSERT(!nep || !ErtsInArea(bif_mfa, (char *)nep, sizeof(NifExport)));
+	/* Verify that bif_mfa does not point into our native function wrapper */
+	ErtsNativeFunc *nep = ERTS_PROC_GET_NFUNC_TRAP_WRAPPER(c_p);
+	ASSERT(!nep || !ErtsInArea(bif_mfa, (char *)nep, sizeof(ErtsNativeFunc)));
     }
 #endif
 
@@ -1443,7 +1379,7 @@ handle_error(Process* c_p, BeamInstr* pc, Eterm* reg, ErtsCodeMFA *bif_mfa)
 	reg[2] = Value;
 	reg[3] = c_p->ftrace;
         if ((new_pc = next_catch(c_p, reg))) {
-	    c_p->cp = 0;	/* To avoid keeping stale references. */
+            c_p->stop[0] = NIL;  /* To avoid keeping stale references. */
             ERTS_RECV_MARK_CLEAR(c_p); /* No longer safe to use this position */
 	    return new_pc;
 	}
@@ -1481,35 +1417,6 @@ next_catch(Process* c_p, Eterm *reg) {
         return NULL;
     }
 
-    /*
-     * Better safe than sorry here. In debug builds, produce a core
-     * dump if the top of the stack doesn't point to a continuation
-     * pointer. In other builds, ignore a non-CP at the top of stack.
-     */
-    ASSERT(is_CP(*ptr));
-    if ((is_not_CP(*ptr) || (*cp_val(*ptr) != i_return_trace &&
-			     *cp_val(*ptr) != i_return_to_trace &&
-			     *cp_val(*ptr) != i_return_time_trace ))
-	&& c_p->cp) {
-	/* Can not follow cp here - code may be unloaded */
-	BeamInstr *cpp = c_p->cp;
-	if (cpp == beam_exception_trace) {
-            ErtsCodeMFA *mfa = (ErtsCodeMFA*)cp_val(ptr[0]);
-	    erts_trace_exception(c_p, mfa,
-				 reg[1], reg[2],
-                                 ERTS_TRACER_FROM_ETERM(ptr+1));
-	    /* Skip return_trace parameters */
-	    ptr += 2;
-	} else if (cpp == beam_return_trace) {
-	    /* Skip return_trace parameters */
-	    ptr += 2;
-	} else if (cpp == beam_return_time_trace) {
-	    /* Skip return_trace parameters */
-	    ptr += 1;
-	} else if (cpp == beam_return_to_trace) {
-	    have_return_to_trace = !0; /* Record next cp */
-	}
-    }
     while (ptr < STACK_START(c_p)) {
 	if (is_catch(*ptr)) {
 	    if (active_catches) goto found_catch;
@@ -1664,6 +1571,54 @@ expand_error_value(Process* c_p, Uint freason, Eterm Value) {
     return Value;
 }
 
+
+static void
+gather_stacktrace(Process* p, struct StackTrace* s, int depth)
+{
+    BeamInstr *prev;
+    Eterm *ptr;
+
+    if (depth == 0) {
+        return;
+    }
+
+    prev = s->depth ? s->trace[s->depth - 1] : s->pc;
+    ptr = p->stop;
+
+    /*
+     * Traverse the stack backwards and add all unique continuation
+     * pointers to the buffer, up to the maximum stack trace size.
+     *
+     * Skip trace stack frames.
+     */
+
+    ASSERT(ptr >= STACK_TOP(p) && ptr <= STACK_START(p));
+
+    while (ptr < STACK_START(p) && depth > 0) {
+        if (is_CP(*ptr)) {
+            BeamInstr *cp = cp_val(*ptr);
+
+            if (cp == beam_exception_trace || cp == beam_return_trace) {
+                ptr += 3;
+            } else if (cp == beam_return_time_trace) {
+                ptr += 2;
+            } else if (cp == beam_return_to_trace) {
+                ptr += 1;
+            } else {
+                if (cp != prev) {
+                    /* Record non-duplicates only */
+                    prev = cp;
+                    s->trace[s->depth++] = cp - 1;
+                    depth--;
+                }
+                ptr++;
+            }
+        } else {
+            ptr++;
+        }
+    }
+}
+
 /*
  * Quick-saving the stack trace in an internal form on the heap. Note
  * that c_p->ftrace will point to a cons cell which holds the given args
@@ -1750,11 +1705,6 @@ save_stacktrace(Process* c_p, BeamInstr* pc, Eterm* reg,
 	    s->trace[s->depth++] = pc;
 	    depth--;
 	}
-	/* Save second stack entry if CP is valid and different from pc */
-	if (depth > 0 && c_p->cp != 0 && c_p->cp != pc) {
-	    s->trace[s->depth++] = c_p->cp - 1;
-	    depth--;
-	}
 	s->pc = NULL;
 	args = make_arglist(c_p, reg, bif_mfa->arity); /* Overwrite CAR(c_p->ftrace) */
     } else {
@@ -1762,9 +1712,9 @@ save_stacktrace(Process* c_p, BeamInstr* pc, Eterm* reg,
     non_bif_stacktrace:
 
 	s->current = c_p->current;
-        /* 
+        /*
 	 * For a function_clause error, the arguments are in the beam
-	 * registers, c_p->cp is valid, and c_p->current is set.
+	 * registers and c_p->current is set.
 	 */
 	if ( (GET_EXC_INDEX(s->freason)) ==
 	     (GET_EXC_INDEX(EXC_FUNCTION_CLAUSE)) ) {
@@ -1772,18 +1722,8 @@ save_stacktrace(Process* c_p, BeamInstr* pc, Eterm* reg,
 	    ASSERT(s->current);
 	    a = s->current->arity;
 	    args = make_arglist(c_p, reg, a); /* Overwrite CAR(c_p->ftrace) */
-	    /* Save first stack entry */
-	    ASSERT(c_p->cp);
-	    if (depth > 0) {
-		s->trace[s->depth++] = c_p->cp - 1;
-		depth--;
-	    }
 	    s->pc = NULL; /* Ignore pc */
 	} else {
-	    if (depth > 0 && c_p->cp != 0 && c_p->cp != pc) {
-		s->trace[s->depth++] = c_p->cp - 1;
-		depth--;
-	    }
 	    s->pc = pc;
 	}
     }
@@ -1796,80 +1736,13 @@ save_stacktrace(Process* c_p, BeamInstr* pc, Eterm* reg,
     }
 
     /* Save the actual stack trace */
-    erts_save_stacktrace(c_p, s, depth);
+    gather_stacktrace(c_p, s, depth);
 }
 
 void
 erts_save_stacktrace(Process* p, struct StackTrace* s, int depth)
 {
-    if (depth > 0) {
-	Eterm *ptr;
-	BeamInstr *prev = s->depth ? s->trace[s->depth-1] : NULL;
-	BeamInstr i_return_trace = beam_return_trace[0];
-	BeamInstr i_return_to_trace = beam_return_to_trace[0];
-
-	/*
-	 * Traverse the stack backwards and add all unique continuation
-	 * pointers to the buffer, up to the maximum stack trace size.
-	 * 
-	 * Skip trace stack frames.
-	 */
-	ptr = p->stop;
-	if (ptr < STACK_START(p) &&
-	    (is_not_CP(*ptr)|| (*cp_val(*ptr) != i_return_trace &&
-				*cp_val(*ptr) != i_return_to_trace)) &&
-	    p->cp) {
-	    /* Cannot follow cp here - code may be unloaded */
-	    BeamInstr *cpp = p->cp;
-	    int trace_cp;
-	    if (cpp == beam_exception_trace || cpp == beam_return_trace) {
-		/* Skip return_trace parameters */
-		ptr += 2;
-		trace_cp = 1;
-	    } else if (cpp == beam_return_to_trace) {
-		/* Skip return_to_trace parameters */
-		ptr += 1;
-		trace_cp = 1;
-	    }
-	    else {
-		trace_cp = 0;
-	    }
-	    if (trace_cp && s->pc == cpp) {
-		/*
-		 * If process 'cp' points to a return/exception trace
-		 * instruction and 'cp' has been saved as 'pc' in
-		 * stacktrace, we need to update 'pc' in stacktrace
-		 * with the actual 'cp' located on the top of the
-		 * stack; otherwise, we will lose the top stackframe
-		 * when building the stack trace.
-		 */
-		ASSERT(is_CP(p->stop[0]));
-		s->pc = cp_val(p->stop[0]);
-	    }
-	}
-	while (ptr < STACK_START(p) && depth > 0) {
-	    if (is_CP(*ptr)) {
-		if (*cp_val(*ptr) == i_return_trace) {
-		    /* Skip stack frame variables */
-		    do ++ptr; while (is_not_CP(*ptr));
-		    /* Skip return_trace parameters */
-		    ptr += 2;
-		} else if (*cp_val(*ptr) == i_return_to_trace) {
-		    /* Skip stack frame variables */
-		    do ++ptr; while (is_not_CP(*ptr));
-		} else {
-		    BeamInstr *cp = cp_val(*ptr);
-		    if (cp != prev) {
-			/* Record non-duplicates only */
-			prev = cp;
-			s->trace[s->depth++] = cp - 1;
-			depth--;
-		    }
-		    ptr++;
-		}
-	    } else ptr++;
-	}
-    }
+    gather_stacktrace(p, s, depth);
 }
 
 /*
@@ -2128,95 +2001,66 @@ apply_bif_error_adjustment(Process *p, Export *ep,
 			   Eterm *reg, Uint arity,
 			   BeamInstr *I, Uint stack_offset)
 {
+    int apply_only;
+    Uint need;
+
+    need = stack_offset /* bytes */ / sizeof(Eterm);
+    apply_only = stack_offset == 0;
+
     /*
      * I is only set when the apply is a tail call, i.e.,
      * from the instructions i_apply_only, i_apply_last_P,
      * and apply_last_IP.
      */
-    if (I
-	&& BeamIsOpCode(ep->beam[0], op_apply_bif)
-        && (ep == bif_export[BIF_error_1]
-	    || ep == bif_export[BIF_error_2]
-	    || ep == bif_export[BIF_exit_1]
-	    || ep == bif_export[BIF_throw_1])) {
-	/*
-	 * We are about to tail apply one of the BIFs
-	 * erlang:error/1, erlang:error/2, erlang:exit/1,
-	 * or erlang:throw/1. Error handling of these BIFs is
-	 * special!
-	 *
-	 * We need 'p->cp' to point into the calling
-	 * function when handling the error after the BIF has
-	 * been applied. This in order to get the topmost
-	 * stackframe correct. Without the following adjustment,
-	 * 'p->cp' will point into the function that called
-	 * current function when handling the error. We add a
-	 * dummy stackframe in order to achieve this.
-	 *
-	 * Note that these BIFs unconditionally will cause
-	 * an exception to be raised. That is, our modifications
-	 * of 'p->cp' as well as the stack will be corrected by
-	 * the error handling code.
-	 *
-	 * If we find an exception/return-to trace continuation
-	 * pointer as the topmost continuation pointer, we do not
-	 * need to do anything since the information already will
-	 * be available for generation of the stacktrace.
-	 */
-	int apply_only = stack_offset == 0;
-	BeamInstr *cpp;
+    if (!(I && (ep == bif_export[BIF_error_1] ||
+                ep == bif_export[BIF_error_2] ||
+                ep == bif_export[BIF_exit_1] ||
+                ep == bif_export[BIF_throw_1]))) {
+        return;
+    }
 
-	if (apply_only) {
-	    ASSERT(p->cp != NULL);
-	    cpp = p->cp;
-	}
-	else {
-	    ASSERT(is_CP(p->stop[0]));
-	    cpp = cp_val(p->stop[0]);
-	}
+    /*
+     * We are about to tail apply one of the BIFs erlang:error/1,
+     * erlang:error/2, erlang:exit/1, or erlang:throw/1. Error handling of
+     * these BIFs is special!
+     *
+     * We need the topmost continuation pointer to point into the calling
+     * function when handling the error after the BIF has been applied. This is
+     * needed in order to get the topmost stackframe correct.
+     *
+     * Note that these BIFs will unconditionally cause an exception to be
+     * raised. That is, our modifications of the stack will be corrected by the
+     * error handling code.
+     */
+    if (need == 0) {
+        need = 1; /* i_apply_only */
+    }
 
-	if (cpp != beam_exception_trace
-	    && cpp != beam_return_trace
-	    && cpp != beam_return_to_trace) {
-	    Uint need = stack_offset /* bytes */ / sizeof(Eterm);
-	    if (need == 0)
-		need = 1; /* i_apply_only */
-	    if (p->stop - p->htop < need)
-		erts_garbage_collect(p, (int) need, reg, arity+1);
-	    p->stop -= need;
-
-	    if (apply_only) {
-		/*
-		 * Called from the i_apply_only instruction.
-		 *
-		 * 'p->cp' contains continuation pointer pointing
-		 * into the function that called current function.
-		 * We push that continuation pointer onto the stack,
-		 * and set 'p->cp' to point into current function.
-		 */
+    if (p->stop - p->htop < need) {
+        erts_garbage_collect(p, (int) need, reg, arity+1);
+    }
 
-		p->stop[0] = make_cp(p->cp);
-		p->cp = I;
-	    }
-	    else {
-		/*
-		 * Called from an i_apply_last_p, or apply_last_IP,
-		 * instruction.
-		 *
-		 * Calling instruction will after we return read
-		 * a continuation pointer from the stack and write
-		 * it to 'p->cp', and then remove the topmost
-		 * stackframe of size 'stack_offset'.
-		 *
-		 * We have sized the dummy-stackframe so that it
-		 * will be removed by the instruction we currently
-		 * are executing, and leave the stackframe that
-		 * normally would have been removed intact.
-		 *
-		 */
-		p->stop[0] = make_cp(I);
-	    }
-	}
+    if (apply_only) {
+        /*
+         * Called from the i_apply_only instruction.
+         *
+         * Push the continuation pointer for the current function to the stack.
+         */
+        p->stop -= need;
+        p->stop[0] = make_cp(I);
+    } else {
+        /*
+         * Called from an i_apply_last_* instruction.
+         *
+         * The calling instruction will deallocate a stack frame of size
+         * 'stack_offset'.
+         *
+         * Push the continuation pointer for the current function to the stack,
+         * and then add a dummy stackframe for the i_apply_last* instruction
+         * to discard.
+         */
+        p->stop[0] = make_cp(I);
+        p->stop -= need;
     }
 }
 
@@ -2437,10 +2281,10 @@ erts_hibernate(Process* c_p, Eterm* reg)
     c_p->arg_reg[0] = module;
     c_p->arg_reg[1] = function;
     c_p->arg_reg[2] = args;
-    c_p->stop = STACK_START(c_p);
+    c_p->stop = c_p->hend - 1;  /* Keep first continuation pointer */
+    ASSERT(c_p->stop[0] == make_cp(beam_apply+1));
     c_p->catches = 0;
     c_p->i = beam_apply;
-    c_p->cp = (BeamInstr *) beam_apply+1;
 
     /*
      * If there are no waiting messages, garbage collect and
@@ -3268,10 +3112,10 @@ erts_is_builtin(Eterm Mod, Eterm Name, int arity)
     e.info.mfa.arity = arity;
 
     if ((ep = export_get(&e)) == NULL) {
-	return 0;
+        return 0;
     }
-    return ep->addressv[erts_active_code_ix()] == ep->beam &&
-	BeamIsOpCode(ep->beam[0], op_apply_bif);
+
+    return ep->bif_table_index != -1;
 }
 
 
diff --git a/erts/emulator/beam/beam_load.c b/erts/emulator/beam/beam_load.c
index 35f2ea6688..8fe046095f 100644
--- a/erts/emulator/beam/beam_load.c
+++ b/erts/emulator/beam/beam_load.c
@@ -141,7 +141,7 @@ typedef struct {
 				 * eventually patch with a pointer into
 				 * the export entry.
 				 */
-    BifFunction bf;		/* Pointer to BIF function if BIF;
+    Export *bif;		/* Pointer to export entry if BIF;
 				 * NULL otherwise.
 				 */
 } ImportEntry;
@@ -315,6 +315,7 @@ typedef struct LoaderState {
 				 * (or 0 if there is no on_load function)
 				 */
     int otp_20_or_higher;       /* Compiled with OTP 20 or higher */
+    unsigned max_opcode;        /* Highest opcode used in module */
 
     /*
      * Atom table.
@@ -844,17 +845,23 @@ erts_finish_loading(Binary* magic, Process* c_p,
 	    if (ep == NULL || ep->info.mfa.module != module) {
 		continue;
 	    }
-	    if (ep->addressv[code_ix] == ep->beam) {
-		if (BeamIsOpCode(ep->beam[0], op_apply_bif)) {
-		    continue;
-		} else if (BeamIsOpCode(ep->beam[0], op_i_generic_breakpoint)) {
+
+            DBG_CHECK_EXPORT(ep, code_ix);
+
+	    if (ep->addressv[code_ix] == ep->trampoline.raw) {
+		if (BeamIsOpCode(ep->trampoline.op, op_i_generic_breakpoint)) {
 		    ERTS_LC_ASSERT(erts_thr_progress_is_blocking());
 		    ASSERT(mod_tab_p->curr.num_traced_exports > 0);
-		    erts_clear_export_break(mod_tab_p, &ep->info);
-		    ep->addressv[code_ix] = (BeamInstr *) ep->beam[1];
-		    ep->beam[1] = 0;
+
+                    erts_clear_export_break(mod_tab_p, ep);
+
+                    ep->addressv[code_ix] =
+                        (BeamInstr*)ep->trampoline.breakpoint.address;
+                    ep->trampoline.breakpoint.address = 0;
+
+                    ASSERT(ep->addressv[code_ix] != ep->trampoline.raw);
 		}
-		ASSERT(ep->beam[1] == 0);
+		ASSERT(ep->trampoline.breakpoint.address == 0);
 	    }
 	}
 	ASSERT(mod_tab_p->curr.num_breakpoints == 0);
@@ -1470,15 +1477,14 @@ load_import_table(LoaderState* stp)
 	}
 	stp->import[i].arity = arity;
 	stp->import[i].patches = 0;
-	stp->import[i].bf = NULL;
+	stp->import[i].bif = NULL;
 
 	/*
-	 * If the export entry refers to a BIF, get the pointer to
-	 * the BIF function.
+	 * If the export entry refers to a BIF, save a pointer to the export entry.
 	 */
 	if ((e = erts_active_export_entry(mod, func, arity)) != NULL) {
-	    if (BeamIsOpCode(e->beam[0], op_apply_bif)) {
-		stp->import[i].bf = (BifFunction) e->beam[1];
+	    if (e->bif_table_index != -1) {
+		stp->import[i].bif = e;
 		if (func == am_load_nif && mod == am_erlang && arity == 2) {
 		    stp->may_load_nif = 1;
 		}
@@ -1529,33 +1535,6 @@ read_export_table(LoaderState* stp)
 	    LoadError2(stp, "export table entry %u: label %u not resolved", i, n);
 	}
 	stp->export[i].address = address = stp->codev + value;
-
-	/*
-	 * Find out if there is a BIF with the same name.
-	 */
-
-	if (!is_bif(stp->module, func, arity)) {
-	    continue;
-	}
-
-	/*
-	 * This is a stub for a BIF.
-	 *
-	 * It should not be exported, and the information in its
-	 * func_info instruction should be invalidated so that it
-	 * can be filtered out by module_info(functions) and by
-	 * any other functions that walk through all local functions.
-	 */
-
-	if (stp->labels[n].num_patches > 0) {
-	    LoadError3(stp, "there are local calls to the stub for "
-		       "the BIF %T:%T/%d",
-		       stp->module, func, arity);
-	}
-	stp->export[i].address = NULL;
-	address[-1] = 0;
-	address[-2] = NIL;
-	address[-3] = NIL;
     }
     return 1;
 
@@ -1563,31 +1542,33 @@ read_export_table(LoaderState* stp)
     return 0;
 }
 
-
 static int
 is_bif(Eterm mod, Eterm func, unsigned arity)
 {
-    Export* e = erts_active_export_entry(mod, func, arity);
-    if (e == NULL) {
-	return 0;
-    }
-    if (! BeamIsOpCode(e->beam[0], op_apply_bif)) {
-	return 0;
-    }
-    if (mod == am_erlang && func == am_apply && arity == 3) {
-	/*
-	 * erlang:apply/3 is a special case -- it is implemented
-	 * as an instruction and it is OK to redefine it.
-	 */
-	return 0;
+    Export *e = erts_active_export_entry(mod, func, arity);
+
+    if (e != NULL) {
+        return e->bif_table_index != -1;
     }
-    return 1;
+
+    return 0;
 }
 
 static int
 read_lambda_table(LoaderState* stp)
 {
     unsigned int i;
+    unsigned int otp_22_or_lower;
+
+    /*
+     * Determine whether this module was compiled with OTP 22 or lower
+     * by looking at the max opcode number. The compiler in OTP 23 will
+     * always set the max opcode to the opcode for `swap` (whether
+     * actually used or not) so that a module compiled for OTP 23
+     * cannot be loaded in earlier versions.
+     */
+
+    otp_22_or_lower = stp->max_opcode < genop_swap_2;
 
     GetInt(stp, 4, stp->num_lambdas);
     if (stp->num_lambdas > stp->lambdas_allocated) {
@@ -1619,6 +1600,29 @@ read_lambda_table(LoaderState* stp)
 	GetInt(stp, 4, Index);
 	GetInt(stp, 4, stp->lambdas[i].num_free);
 	GetInt(stp, 4, OldUniq);
+
+        /*
+         * Fun entries are now keyed by the explicit ("new") index in
+         * the fun entry. That allows multiple make_fun2 instructions
+         * to share the same fun entry (when the `fun F/A` syntax is
+         * used). Before OTP 23, fun entries were keyed by the old
+         * index, which is the order of the entries in the fun
+         * chunk. Each make_fun2 needed to refer to its own fun entry.
+         *
+         * Modules compiled before OTP 23 can safely be loaded if the
+         * old index and the new index are equal. That is true for all
+         * modules compiled with OTP R15 and later.
+         */
+        if (otp_22_or_lower && i != Index) {
+            /*
+             * Compiled with a compiler before OTP R15B. The new indices
+             * are not reliable, so it is not safe to load this module.
+             */
+            LoadError2(stp, "please re-compile this module with an "
+                       ERLANG_OTP_RELEASE " compiler "
+                       "(old-style fun with indices: %d/%d)",
+                       i, Index);
+        }
 	fe = erts_put_fun_entry2(stp->module, OldUniq, i, stp->mod_md5,
 				 Index, arity-stp->lambdas[i].num_free);
 	stp->lambdas[i].fe = fe;
@@ -1839,7 +1843,6 @@ read_code_header(LoaderState* stp)
 {
     unsigned head_size;
     unsigned version;
-    unsigned opcode_max;
     int i;
 
     /*
@@ -1871,8 +1874,8 @@ read_code_header(LoaderState* stp)
     /*
      * Verify the number of the highest opcode used.
      */
-    GetInt(stp, 4, opcode_max);
-    if (opcode_max > MAX_GENERIC_OPCODE) {
+    GetInt(stp, 4, stp->max_opcode);
+    if (stp->max_opcode > MAX_GENERIC_OPCODE) {
 	LoadError2(stp,
 		   "This BEAM file was compiled for a later version"
 		   " of the run-time system than " ERLANG_OTP_RELEASE ".\n"
@@ -1880,7 +1883,7 @@ read_code_header(LoaderState* stp)
 		   ERLANG_OTP_RELEASE " compiler.\n"
 		   "  (Use of opcode %d; this emulator supports "
 		   "only up to %d.)",
-		   opcode_max, MAX_GENERIC_OPCODE);
+		   stp->max_opcode, MAX_GENERIC_OPCODE);
     }
 
     GetInt(stp, 4, stp->num_labels);
@@ -2500,10 +2503,14 @@ load_code(LoaderState* stp)
 		if (i >= stp->num_imports) {
 		    LoadError1(stp, "invalid import table index %d", i);
 		}
-		if (stp->import[i].bf == NULL) {
+		if (stp->import[i].bif == NULL) {
 		    LoadError1(stp, "not a BIF: import table index %d", i);
 		}
-		code[ci++] = (BeamInstr) stp->import[i].bf;
+		{
+		    int bif_index = stp->import[i].bif->bif_table_index;
+		    BifEntry *bif_entry = &bif_table[bif_index];
+		    code[ci++] = (BeamInstr) bif_entry->f;
+		}
 		break;
 	    case 'P':		/* Byte offset into tuple or stack */
 	    case 'Q':		/* Like 'P', but packable */
@@ -2811,18 +2818,43 @@ load_code(LoaderState* stp)
 	switch (stp->specific_op) {
 	case op_i_func_info_IaaI:
 	    {
+                int padding_required;
 		Sint offset;
+
 		if (function_number >= stp->num_functions) {
 		    LoadError1(stp, "too many functions in module (header said %u)",
 			       stp->num_functions); 
 		}
 
-		if (stp->may_load_nif) {
+                /* Native function calls may be larger than their stubs, so
+                 * we'll need to make sure any potentially-native function stub
+                 * is padded with enough room.
+                 *
+                 * Note that the padding is applied for the previous function,
+                 * not the current one, so we check whether the old F/A is
+                 * a BIF. */
+                padding_required = last_func_start && (stp->may_load_nif ||
+                    is_bif(stp->module, stp->function, stp->arity));
+
+		/*
+		 * Save context for error messages.
+		 */
+		stp->function = code[ci-2];
+		stp->arity = code[ci-1];
+
+		/*
+		 * Save current offset into the line instruction array.
+		 */
+		if (stp->func_line) {
+		    stp->func_line[function_number] = stp->current_li;
+		}
+
+		if (padding_required) {
 		    const int finfo_ix = ci - FUNC_INFO_SZ;
-		    if (finfo_ix - last_func_start < BEAM_NIF_MIN_FUNC_SZ && last_func_start) {
+		    if (finfo_ix - last_func_start < BEAM_NATIVE_MIN_FUNC_SZ) {
 			/* Must make room for call_nif op */
-			int pad = BEAM_NIF_MIN_FUNC_SZ - (finfo_ix - last_func_start);
-			ASSERT(pad > 0 && pad < BEAM_NIF_MIN_FUNC_SZ);
+			int pad = BEAM_NATIVE_MIN_FUNC_SZ - (finfo_ix - last_func_start);
+			ASSERT(pad > 0 && pad < BEAM_NATIVE_MIN_FUNC_SZ);
 			CodeNeed(pad);
 			sys_memmove(&code[finfo_ix+pad], &code[finfo_ix],
 				    FUNC_INFO_SZ*sizeof(BeamInstr));
@@ -2833,20 +2865,6 @@ load_code(LoaderState* stp)
 		}
 		last_func_start = ci;
 
-		/*
-		 * Save current offset of into the line instruction array.
-		 */
-
-		if (stp->func_line) {
-		    stp->func_line[function_number] = stp->current_li;
-		}
-
-		/*
-		 * Save context for error messages.
-		 */
-		stp->function = code[ci-2];
-		stp->arity = code[ci-1];
-
                 /* When this assert is triggered, it is normally a sign that
                    the size of the ops.tab i_func_info instruction is not
                    the same as FUNC_INFO_SZ */
@@ -2876,7 +2894,6 @@ load_code(LoaderState* stp)
 	case op_i_bs_match_string_yfWW:
 	    new_string_patch(stp, ci-1);
 	    break;
-
 	case op_catch_yf:
 	    /* code[ci-3]	&&lb_catch_yf
 	     * code[ci-2]	y-register offset in E
@@ -3131,27 +3148,6 @@ mixed_types(LoaderState* stp, GenOpArg Size, GenOpArg* Rest)
     return 0;
 }
 
-static int
-is_killed_apply(LoaderState* stp, GenOpArg Reg, GenOpArg Live)
-{
-    return Reg.type == TAG_x && Live.type == TAG_u &&
-	Live.val+2 <= Reg.val;
-}
-
-static int
-is_killed(LoaderState* stp, GenOpArg Reg, GenOpArg Live)
-{
-    return Reg.type == TAG_x && Live.type == TAG_u &&
-	Live.val <= Reg.val;
-}
-
-static int
-is_killed_by_call_fun(LoaderState* stp, GenOpArg Reg, GenOpArg Live)
-{
-    return Reg.type == TAG_x && Live.type == TAG_u &&
-	Live.val+1 <= Reg.val;
-}
-
 /*
  * Test whether register Reg is killed by make_fun instruction that
  * creates the fun given by index idx.
@@ -3171,14 +3167,24 @@ is_killed_by_make_fun(LoaderState* stp, GenOpArg Reg, GenOpArg idx)
     }
 }
 
-/*
- * Test whether register Reg is killed by the send instruction that follows.
- */
-
+/* Test whether Bif is "heavy" and should always go through its export entry */
 static int
-is_killed_by_send(LoaderState* stp, GenOpArg Reg)
+is_heavy_bif(LoaderState* stp, GenOpArg Bif)
 {
-    return Reg.type == TAG_x && 2 <= Reg.val;
+    Export *bif_export;
+
+    if (Bif.type != TAG_u || Bif.val >= stp->num_imports) {
+        return 0;
+    }
+
+    bif_export = stp->import[Bif.val].bif;
+
+    if (bif_export) {
+        int bif_index = bif_export->bif_table_index;
+        return bif_table[bif_index].kind == BIF_KIND_HEAVY;
+    }
+
+    return 0;
 }
 
 /*
@@ -5208,27 +5214,52 @@ final_touch(LoaderState* stp, struct erl_module_instance* inst_p)
      */
 
     for (i = 0; i < stp->num_exps; i++) {
-	Export* ep;
-	BeamInstr* address = stp->export[i].address;
+        Export* ep;
+        BeamInstr* address = stp->export[i].address;
 
-	if (address == NULL) {
-	    /* Skip stub for a BIF */
-	    continue;
-	}
-	ep = erts_export_put(stp->module, stp->export[i].function,
-			     stp->export[i].arity);
-	if (on_load) {
-	    /*
-	     * on_load: Don't make any of the exported functions
-	     * callable yet. Keep any function in the current
-	     * code callable.
-	     */
-	    ep->beam[1] = (BeamInstr) address;
-	}
-        else
+        ep = erts_export_put(stp->module,
+                             stp->export[i].function,
+                             stp->export[i].arity);
+
+        /* Fill in BIF stubs with a proper call to said BIF. */
+        if (ep->bif_table_index != -1) {
+            erts_write_bif_wrapper(ep, address);
+        }
+
+        if (on_load) {
+            /*
+             * on_load: Don't make any of the exported functions
+             * callable yet. Keep any function in the current
+             * code callable.
+             */
+            ep->trampoline.not_loaded.deferred = (BeamInstr) address;
+        } else {
             ep->addressv[erts_staging_code_ix()] = address;
+        }
     }
 
+#ifdef DEBUG
+    /* Ensure that we've loaded stubs for all BIFs in this module. */
+    for (i = 0; i < BIF_SIZE; i++) {
+        BifEntry *entry = &bif_table[i];
+
+        if (stp->module == entry->module) {
+            Export *ep = erts_export_put(entry->module,
+                                         entry->name,
+                                         entry->arity);
+            BeamInstr *addr = ep->addressv[erts_staging_code_ix()];
+
+            if (!ErtsInArea(addr, stp->codev, stp->ci * sizeof(BeamInstr))) {
+                erts_exit(ERTS_ABORT_EXIT,
+                          "Module %T doesn't export BIF %T/%i\n",
+                          entry->module,
+                          entry->name,
+                          entry->arity);
+            }
+        }
+    }
+#endif
+
     /*
      * Import functions and patch all callers.
      */
@@ -5400,15 +5431,14 @@ transform_engine(LoaderState* st)
 
 		i = instr->a[ap].val;
 		ASSERT(i < st->num_imports);
-		if (i >= st->num_imports || st->import[i].bf == NULL)
+		if (i >= st->num_imports || st->import[i].bif == NULL)
 		    goto restart;
 		if (bif_number != -1 &&
-		    bif_export[bif_number]->beam[1] != (BeamInstr) st->import[i].bf) {
+		    bif_export[bif_number] != st->import[i].bif) {
 		    goto restart;
 		}
 	    }
 	    break;
-
 #endif
 #if defined(TOP_is_not_bif)
 	case TOP_is_not_bif:
@@ -5438,7 +5468,7 @@ transform_engine(LoaderState* st)
 		 * they are special.
 		 */
 		if (i < st->num_imports) {
-		    if (st->import[i].bf != NULL ||
+		    if (st->import[i].bif != NULL ||
 			(st->import[i].module == am_erlang &&
 			 st->import[i].function == am_apply &&
 			 (st->import[i].arity == 2 || st->import[i].arity == 3))) {
@@ -6283,12 +6313,12 @@ exported_from_module(Process* p, /* Process whose heap to use. */
 	
 	if (ep->info.mfa.module == mod) {
 	    Eterm tuple;
-	    
-	    if (ep->addressv[code_ix] == ep->beam &&
-		BeamIsOpCode(ep->beam[0], op_call_error_handler)) {
-		/* There is a call to the function, but it does not exist. */ 
-		continue;
-	    }
+
+            if (ep->addressv[code_ix] == ep->trampoline.raw &&
+                BeamIsOpCode(ep->trampoline.op, op_call_error_handler)) {
+                /* There is a call to the function, but it does not exist. */ 
+                continue;
+            }
 
 	    if (hp == hend) {
 		int need = 10 * 5;
diff --git a/erts/emulator/beam/beam_load.h b/erts/emulator/beam/beam_load.h
index 156c3c45e2..e7127c5b08 100644
--- a/erts/emulator/beam/beam_load.h
+++ b/erts/emulator/beam/beam_load.h
@@ -106,7 +106,7 @@ typedef struct beam_code_header {
 
 }BeamCodeHeader;
 
-#  define BEAM_NIF_MIN_FUNC_SZ 4
+#  define BEAM_NATIVE_MIN_FUNC_SZ 4
 
 void erts_release_literal_area(struct ErtsLiteralArea_* literal_area);
 int erts_is_module_native(BeamCodeHeader* code);
diff --git a/erts/emulator/beam/bif.c b/erts/emulator/beam/bif.c
index b81056c774..d3fd99932a 100644
--- a/erts/emulator/beam/bif.c
+++ b/erts/emulator/beam/bif.c
@@ -1915,7 +1915,7 @@ do_send(Process *p, Eterm to, Eterm msg, Eterm return_term, Eterm *refp,
 	    erts_dsprintf_buf_t *dsbufp = erts_create_logger_dsbuf();
 	    erts_dsprintf(dsbufp,
 			  "Discarding message %T from %T to %T in an old "
-			  "incarnation (%d) of this node (%d)\n",
+			  "incarnation (%u) of this node (%u)\n",
 			  msg,
 			  p->common.id,
 			  to,
@@ -1959,7 +1959,7 @@ do_send(Process *p, Eterm to, Eterm msg, Eterm return_term, Eterm *refp,
 	erts_dsprintf_buf_t *dsbufp = erts_create_logger_dsbuf();
 	erts_dsprintf(dsbufp,
 		      "Discarding message %T from %T to %T in an old "
-		      "incarnation (%d) of this node (%d)\n",
+		      "incarnation (%u) of this node (%u)\n",
 		      msg,
 		      p->common.id,
 		      to,
@@ -1987,7 +1987,7 @@ do_send(Process *p, Eterm to, Eterm msg, Eterm return_term, Eterm *refp,
                 trace_send(p, portid, msg);
 
             if (have_seqtrace(SEQ_TRACE_TOKEN(p))) {
-                seq_trace_update_send(p);
+                seq_trace_update_serial(p);
                 seq_trace_output(SEQ_TRACE_TOKEN(p), msg,
                                  SEQ_TRACE_SEND, portid, p);
             }
@@ -4866,9 +4866,13 @@ BIF_RETTYPE phash_2(BIF_ALIST_2)
 BIF_RETTYPE phash2_1(BIF_ALIST_1)
 {
     Uint32 hash;
-
-    hash = make_hash2(BIF_ARG_1);
-    BIF_RET(make_small(hash & ((1L << 27) - 1)));
+    Eterm trap_state = THE_NON_VALUE; /* left as THE_NON_VALUE when the hash completes without trapping */
+    hash = trapping_make_hash2(BIF_ARG_1, &trap_state, BIF_P);
+    if (trap_state == THE_NON_VALUE) {
+        BIF_RET(make_small(hash & ((1L << 27) - 1))); /* done: return the low 27 bits of the hash */
+    } else {
+        BIF_TRAP1(bif_export[BIF_phash2_1], BIF_P, trap_state); /* not done: re-enter phash2/1 with the trap state as its argument */
+    }
 }
 
 BIF_RETTYPE phash2_2(BIF_ALIST_2)
@@ -4876,6 +4880,7 @@ BIF_RETTYPE phash2_2(BIF_ALIST_2)
     Uint32 hash;
     Uint32 final_hash;
     Uint32 range;
+    Eterm trap_state = THE_NON_VALUE;
 
     /* Check for special case 2^32 */
     if (term_equals_2pow32(BIF_ARG_2)) {
@@ -4887,7 +4892,10 @@ BIF_RETTYPE phash2_2(BIF_ALIST_2)
 	}
 	range = (Uint32) u;
     }
-    hash = make_hash2(BIF_ARG_1);
+    hash = trapping_make_hash2(BIF_ARG_1, &trap_state, BIF_P);
+    if (trap_state != THE_NON_VALUE) {
+        BIF_TRAP2(bif_export[BIF_phash2_2], BIF_P, trap_state, BIF_ARG_2);
+    }
     if (range) {
 	final_hash = hash % range; /* [0..range-1] */
     } else {
@@ -4963,15 +4971,28 @@ void erts_init_trap_export(Export* ep, Eterm m, Eterm f, Uint a,
 			   Eterm (*bif)(BIF_ALIST))
 {
     int i;
+
     sys_memset((void *) ep, 0, sizeof(Export));
+
     for (i=0; i<ERTS_NUM_CODE_IX; i++) {
-	ep->addressv[i] = ep->beam;
+        ep->addressv[i] = ep->trampoline.raw;
     }
+
     ep->info.mfa.module = m;
     ep->info.mfa.function = f;
     ep->info.mfa.arity = a;
-    ep->beam[0] = BeamOpCodeAddr(op_apply_bif);
-    ep->beam[1] = (BeamInstr) bif;
+    ep->trampoline.op = BeamOpCodeAddr(op_call_bif_W);
+    ep->trampoline.raw[1] = (BeamInstr)bif;
+}
+
+/*
+ * Writes a BIF call wrapper to the given address.
+ */
+void erts_write_bif_wrapper(Export *export, BeamInstr *address) {
+    BifEntry *entry = &bif_table[export->bif_table_index]; /* index is not validated here; the caller must pass a BIF export */
+
+    address[0] = BeamOpCodeAddr(op_call_bif_W); /* first word: the call_bif instruction */
+    address[1] = (BeamInstr)entry->f;           /* second word: the BIF's function pointer from bif_table */
 }
 
 void erts_init_bif(void)
@@ -5023,7 +5044,7 @@ void erts_init_bif(void)
 }
 
 /*
- * Scheduling of BIFs via NifExport...
+ * Scheduling of BIFs via ErtsNativeFunc...
  */
 #define ERTS_WANT_NFUNC_SCHED_INTERNALS__
 #include "erl_nfunc_sched.h"
@@ -5038,8 +5059,8 @@ schedule(Process *c_p, Process *dirty_shadow_proc,
 	 int argc, Eterm *argv)
 {
     ERTS_LC_ASSERT(ERTS_PROC_LOCK_MAIN & erts_proc_lc_my_proc_locks(c_p));
-    (void) erts_nif_export_schedule(c_p, dirty_shadow_proc,
-				    mfa, pc, BeamOpCodeAddr(op_apply_bif),
+    (void) erts_nfunc_schedule(c_p, dirty_shadow_proc,
+				    mfa, pc, BeamOpCodeAddr(op_call_bif_W),
 				    dfunc, ifunc,
 				    module, function,
 				    argc, argv);
@@ -5048,23 +5069,23 @@ schedule(Process *c_p, Process *dirty_shadow_proc,
 
 static BIF_RETTYPE dirty_bif_result(BIF_ALIST_1)
 {
-    NifExport *nep = (NifExport *) ERTS_PROC_GET_NIF_TRAP_EXPORT(BIF_P);
-    erts_nif_export_restore(BIF_P, nep, BIF_ARG_1);
+    ErtsNativeFunc *nep = (ErtsNativeFunc *) ERTS_PROC_GET_NFUNC_TRAP_WRAPPER(BIF_P);
+    erts_nfunc_restore(BIF_P, nep, BIF_ARG_1);
     BIF_RET(BIF_ARG_1);
 }
 
 static BIF_RETTYPE dirty_bif_trap(BIF_ALIST)
 {
-    NifExport *nep = (NifExport *) ERTS_PROC_GET_NIF_TRAP_EXPORT(BIF_P);
+    ErtsNativeFunc *nep = (ErtsNativeFunc *) ERTS_PROC_GET_NFUNC_TRAP_WRAPPER(BIF_P);
 
     /*
      * Arity and argument registers already set
      * correct by call to dirty_bif_trap()...
      */
 
-    ASSERT(BIF_P->arity == nep->exp.info.mfa.arity);
+    ASSERT(BIF_P->arity == nep->trampoline.info.mfa.arity);
 
-    erts_nif_export_restore(BIF_P, nep, THE_NON_VALUE);
+    erts_nfunc_restore(BIF_P, nep, THE_NON_VALUE);
 
     BIF_P->i = (BeamInstr *) nep->func;
     BIF_P->freason = TRAP;
@@ -5079,8 +5100,8 @@ static BIF_RETTYPE dirty_bif_exception(BIF_ALIST_2)
 
     freason = signed_val(BIF_ARG_1);
 
-    /* Restore orig info for error and clear nif export in handle_error() */
-    freason |= EXF_RESTORE_NIF;
+    /* Restore orig info for error and clear nif wrapper in handle_error() */
+    freason |= EXF_RESTORE_NFUNC;
 
     BIF_P->fvalue = BIF_ARG_2;
 
@@ -5118,6 +5139,7 @@ erts_schedule_bif(Process *proc,
     if (!ERTS_PROC_IS_EXITING(c_p)) {
 	Export *exp;
 	BifFunction dbif, ibif;
+        BeamInstr call_instr;
 	BeamInstr *pc;
 
 	/*
@@ -5152,29 +5174,41 @@ erts_schedule_bif(Process *proc,
 	if (i == NULL) {
 	    ERTS_INTERNAL_ERROR("Missing instruction pointer");
 	}
+
+        if (BeamIsOpCode(*i, op_i_generic_breakpoint)) {
+            ErtsCodeInfo *ci;
+            GenericBp *bp;
+    
+            ci = erts_code_to_codeinfo(i);
+            bp = ci->u.gen_bp;
+
+            call_instr = bp->orig_instr;
+        } else {
+            call_instr = *i;
+        }
+
 #ifdef HIPE
-	else if (proc->flags & F_HIPE_MODE) {
+	if (proc->flags & F_HIPE_MODE) {
 	    /* Pointer to bif export in i */
 	    exp = (Export *) i;
-	    pc = c_p->cp;
+            pc = cp_val(c_p->stop[0]);
 	    mfa = &exp->info.mfa;
-	}
+	} else /* !! This is part of the if clause below !! */
 #endif
-	else if (BeamIsOpCode(*i, op_call_bif_e)) {
-	    /* Pointer to bif export in i+1 */
-	    exp = (Export *) i[1];
+	if (BeamIsOpCode(call_instr, op_call_light_bif_be)) {
+	    /* Pointer to bif export in i+2 */
+	    exp = (Export *) i[2];
 	    pc = i;
 	    mfa = &exp->info.mfa;
 	}
-	else if (BeamIsOpCode(*i, op_call_bif_only_e)) {
-	    /* Pointer to bif export in i+1 */
-	    exp = (Export *) i[1];
+	else if (BeamIsOpCode(call_instr, op_call_light_bif_only_be)) {
+	    /* Pointer to bif export in i+2 */
+	    exp = (Export *) i[2];
 	    pc = i;
 	    mfa = &exp->info.mfa;
 	}
-	else if (BeamIsOpCode(*i, op_apply_bif)) {
-	    /* Pointer to bif in i+1, and mfa in i-3 */	    
-	    pc = c_p->cp;
+	else if (BeamIsOpCode(call_instr, op_call_bif_W)) {
+            pc = cp_val(c_p->stop[0]);
 	    mfa = erts_code_to_codemfa(i);
 	}
 	else {
@@ -5202,7 +5236,7 @@ erts_schedule_bif(Process *proc,
 static BIF_RETTYPE
 call_bif(Process *c_p, Eterm *reg, BeamInstr *I)
 {
-    NifExport *nep = ERTS_I_BEAM_OP_TO_NIF_EXPORT(I);
+    ErtsNativeFunc *nep = ERTS_I_BEAM_OP_TO_NFUNC(I);
     ErtsBifFunc bif = (ErtsBifFunc) nep->func;
     BIF_RETTYPE ret;
 
@@ -5215,12 +5249,12 @@ call_bif(Process *c_p, Eterm *reg, BeamInstr *I)
     ret = (*bif)(c_p, reg, I);
 
     if (is_value(ret))
-	erts_nif_export_restore(c_p, nep, ret);
+	erts_nfunc_restore(c_p, nep, ret);
     else if (c_p->freason != TRAP)
-	c_p->freason |= EXF_RESTORE_NIF; /* restore in handle_error() */
+	c_p->freason |= EXF_RESTORE_NFUNC; /* restore in handle_error() */
     else if (nep->func == ERTS_SCHED_BIF_TRAP_MARKER) {
 	/* BIF did an ordinary trap... */
-	erts_nif_export_restore(c_p, nep, ret);
+	erts_nfunc_restore(c_p, nep, ret);
     }
     /* else:
      *   BIF rescheduled itself using erts_schedule_bif().
@@ -5237,7 +5271,7 @@ erts_call_dirty_bif(ErtsSchedulerData *esdp, Process *c_p, BeamInstr *I, Eterm *
     int exiting;
     Process *dirty_shadow_proc;
     ErtsBifFunc bf;
-    NifExport *nep;
+    ErtsNativeFunc *nep;
 #ifdef DEBUG
     Eterm *c_p_htop;
     erts_aint32_t state;
@@ -5250,8 +5284,8 @@ erts_call_dirty_bif(ErtsSchedulerData *esdp, Process *c_p, BeamInstr *I, Eterm *
 
 #endif
 
-    nep = ERTS_I_BEAM_OP_TO_NIF_EXPORT(I);
-    ASSERT(nep == ERTS_PROC_GET_NIF_TRAP_EXPORT(c_p));
+    nep = ERTS_I_BEAM_OP_TO_NFUNC(I);
+    ASSERT(nep == ERTS_PROC_GET_NFUNC_TRAP_WRAPPER(c_p));
 
     nep->func = ERTS_SCHED_BIF_TRAP_MARKER;
 
@@ -5265,7 +5299,6 @@ erts_call_dirty_bif(ErtsSchedulerData *esdp, Process *c_p, BeamInstr *I, Eterm *
     dirty_shadow_proc->freason = c_p->freason;
     dirty_shadow_proc->fvalue = c_p->fvalue;
     dirty_shadow_proc->ftrace = c_p->ftrace;
-    dirty_shadow_proc->cp = c_p->cp;
     dirty_shadow_proc->i = c_p->i;
 
 #ifdef DEBUG
@@ -5312,7 +5345,6 @@ erts_call_dirty_bif(ErtsSchedulerData *esdp, Process *c_p, BeamInstr *I, Eterm *
 	c_p->freason = dirty_shadow_proc->freason;
 	c_p->fvalue = dirty_shadow_proc->fvalue;
 	c_p->ftrace = dirty_shadow_proc->ftrace;
-	c_p->cp = dirty_shadow_proc->cp;
 	c_p->i = dirty_shadow_proc->i;
 	c_p->arity = dirty_shadow_proc->arity;
     }
diff --git a/erts/emulator/beam/bif.tab b/erts/emulator/beam/bif.tab
index c9f5177bd3..19dabc0514 100644
--- a/erts/emulator/beam/bif.tab
+++ b/erts/emulator/beam/bif.tab
@@ -25,13 +25,14 @@
 #
 # <bif-decl> ::= "bif" <bif> <C-name>* |
 #                "ubif" <bif> <C-name>* |
-#                "gcbif" <bif> <C-name>*
+#                "hbif" <bif> <C-name>*
 # <bif> ::= <module> ":" <name> "/" <arity>
 #
-# ubif:   Use for operators and guard BIFs that never build anything
-#         on the heap (such as tuple_size/1) and operators.
+# ubif:   Use for operators and guard BIFs.
 #
-# gcbif:  Use for guard BIFs that may build on the heap (such as abs/1).
+# hbif:   Use for BIFs that perform garbage collection or need up-to-date
+#         information on where they were called from. These must be called
+#         through the export entry.
 #
 # bif:    Use for all other BIFs.
 #
@@ -60,7 +61,7 @@ bif erlang:display_string/1
 bif erlang:display_nl/0
 ubif erlang:element/2
 bif erlang:erase/0
-bif erlang:erase/1
+hbif erlang:erase/1
 bif erlang:exit/1
 bif erlang:exit/2
 bif erlang:exit_signal/2
@@ -70,7 +71,7 @@ ubif erlang:float/1
 bif erlang:float_to_list/1
 bif erlang:float_to_list/2
 bif erlang:fun_info/2
-bif erts_internal:garbage_collect/1
+hbif erts_internal:garbage_collect/1
 bif erlang:get/0
 bif erlang:get/1
 bif erlang:get_keys/1
@@ -127,10 +128,10 @@ bif erlang:ports/0
 bif erlang:pre_loaded/0
 bif erlang:process_flag/2
 bif erts_internal:process_flag/3
-bif erlang:process_info/1
-bif erlang:process_info/2
+hbif erlang:process_info/1
+hbif erlang:process_info/2
 bif erlang:processes/0
-bif erlang:put/2
+hbif erlang:put/2
 bif erlang:register/2
 bif erlang:registered/0
 ubif erlang:round/1
@@ -174,7 +175,7 @@ bif erts_internal:port_connect/2
 
 bif erts_internal:request_system_task/3
 bif erts_internal:request_system_task/4
-bif erts_internal:check_process_code/1
+hbif erts_internal:check_process_code/1
 
 bif erts_internal:map_to_tuple_keys/1
 bif erts_internal:term_type/1
@@ -466,7 +467,7 @@ bif code:is_module_native/1
 # New Bifs in R9C.
 #
 
-bif erlang:hibernate/3
+hbif erlang:hibernate/3
 bif error_logger:warning_map/0
 
 #
diff --git a/erts/emulator/beam/bif_instrs.tab b/erts/emulator/beam/bif_instrs.tab
index 8e0caa38a3..de5305bde4 100644
--- a/erts/emulator/beam/bif_instrs.tab
+++ b/erts/emulator/beam/bif_instrs.tab
@@ -212,26 +212,32 @@ i_length.execute(Fail, Live, Dst) {
 // Call a BIF, store the result in x(0) and transfer control to the
 // next instruction.
 //
-call_bif(Exp) {
+call_light_bif(Bif, Exp) {
+    Export *export;
     ErtsBifFunc bf;
+
     Eterm result;
     ErlHeapFragment *live_hf_end;
-    Export *export = (Export*) $Exp;
+
+    bf = (ErtsBifFunc) $Bif;
+    export = (Export*) $Exp;
 
     if (!((FCALLS - 1) > 0 || (FCALLS-1) > neg_o_reds)) {
         /*
          * If we have run out of reductions, do a context
          * switch before calling the BIF.
          */
-        c_p->arity = GET_BIF_ARITY(export);
+        c_p->arity = GET_EXPORT_ARITY(export);
         c_p->current = &export->info.mfa;
         goto context_switch3;
     }
 
-    ERTS_MSACC_SET_BIF_STATE_CACHED_X(GET_BIF_MODULE(export),
-                                      GET_BIF_ADDRESS(export));
+    if (ERTS_UNLIKELY(export->is_bif_traced)) {
+        $SAVE_CONTINUATION_POINTER($NEXT_INSTRUCTION);
+        $DISPATCH_EXPORT(export);
+    }
 
-    bf = GET_BIF_ADDRESS(export);
+    ERTS_MSACC_SET_BIF_STATE_CACHED_X(GET_EXPORT_MODULE(export), bf);
 
     PRE_BIF_SWAPOUT(c_p);
     ERTS_DBG_CHK_REDS(c_p, FCALLS);
@@ -243,21 +249,26 @@ call_bif(Exp) {
     ERTS_VERIFY_UNUSED_TEMP_ALLOC(c_p);
     live_hf_end = c_p->mbuf;
     ERTS_CHK_MBUF_SZ(c_p);
+
     result = (*bf)(c_p, reg, I);
+
+    /* Only heavy BIFs may GC. */
+    ASSERT(E == c_p->stop);
+
     ERTS_CHK_MBUF_SZ(c_p);
     ASSERT(!ERTS_PROC_IS_EXITING(c_p) || is_non_value(result));
     ERTS_VERIFY_UNUSED_TEMP_ALLOC(c_p);
     ERTS_HOLE_CHECK(c_p);
     ERTS_REQ_PROC_MAIN_LOCK(c_p);
     if (ERTS_IS_GC_DESIRED(c_p)) {
-        Uint arity = GET_BIF_ARITY(export);
+        Uint arity = GET_EXPORT_ARITY(export);
         result = erts_gc_after_bif_call_lhf(c_p, live_hf_end, result,
                                             reg, arity);
         E = c_p->stop;
     }
-    PROCESS_MAIN_CHK_LOCKS(c_p);
     HTOP = HEAP_TOP(c_p);
     FCALLS = c_p->fcalls;
+    PROCESS_MAIN_CHK_LOCKS(c_p);
     ERTS_DBG_CHK_REDS(c_p, FCALLS);
 
     /*
@@ -280,10 +291,9 @@ call_bif(Exp) {
          * erlang code or by nif_bif.epilogue() when the BIF
          * is done).
          */
-        SET_CP(c_p, $NEXT_INSTRUCTION);
+        $SAVE_CONTINUATION_POINTER($NEXT_INSTRUCTION);
         SET_I(c_p->i);
-        SWAPIN;
-        Dispatch();
+        $DISPATCH();
     }
 
     /*
@@ -297,29 +307,38 @@ call_bif(Exp) {
 
 //
 // Call a BIF tail-recursively, storing the result in x(0) and doing
-// a return to the continuation poiner (c_p->cp).
+// a return to the continuation pointer.
 //
-
-call_bif_only(Exp) {
+call_light_bif_only(Bif, Exp) {
+    ErlHeapFragment *live_hf_end;
     ErtsBifFunc bf;
+    Export *export;
     Eterm result;
-    ErlHeapFragment *live_hf_end;
-    Export *export = (Export*) $Exp;
+
+    bf = (ErtsBifFunc) $Bif;
+    export = (Export*) $Exp;
 
     if (!((FCALLS - 1) > 0 || (FCALLS-1) > neg_o_reds)) {
         /*
          * If we have run out of reductions, do a context
          * switch before calling the BIF.
          */
-        c_p->arity = GET_BIF_ARITY(export);
+        c_p->arity = GET_EXPORT_ARITY(export);
         c_p->current = &export->info.mfa;
         goto context_switch3;
     }
 
-    ERTS_MSACC_SET_BIF_STATE_CACHED_X(GET_BIF_MODULE(export),
-                                      GET_BIF_ADDRESS(export));
+    if (ERTS_UNLIKELY(export->is_bif_traced)) {
+        /* Set up a dummy stack frame so we can perform a normal call. Loader
+         * transformations ensure that the next instruction after this is
+         * 'deallocate_return 0'. */
+        $AH(0, 0, GET_EXPORT_ARITY(export));
+
+        $SAVE_CONTINUATION_POINTER($NEXT_INSTRUCTION);
+        $DISPATCH_EXPORT(export);
+    }
 
-    bf = GET_BIF_ADDRESS(export);
+    ERTS_MSACC_SET_BIF_STATE_CACHED_X(GET_EXPORT_MODULE(export), bf);
 
     PRE_BIF_SWAPOUT(c_p);
     ERTS_DBG_CHK_REDS(c_p, FCALLS);
@@ -331,21 +350,26 @@ call_bif_only(Exp) {
     ERTS_VERIFY_UNUSED_TEMP_ALLOC(c_p);
     live_hf_end = c_p->mbuf;
     ERTS_CHK_MBUF_SZ(c_p);
+
     result = (*bf)(c_p, reg, I);
+
+    /* Only heavy BIFs may GC. */
+    ASSERT(E == c_p->stop);
+
     ERTS_CHK_MBUF_SZ(c_p);
     ASSERT(!ERTS_PROC_IS_EXITING(c_p) || is_non_value(result));
     ERTS_VERIFY_UNUSED_TEMP_ALLOC(c_p);
     ERTS_HOLE_CHECK(c_p);
     ERTS_REQ_PROC_MAIN_LOCK(c_p);
     if (ERTS_IS_GC_DESIRED(c_p)) {
-        Uint arity = GET_BIF_ARITY(export);
+        Uint arity = GET_EXPORT_ARITY(export);
         result = erts_gc_after_bif_call_lhf(c_p, live_hf_end, result,
                                             reg, arity);
         E = c_p->stop;
     }
-    PROCESS_MAIN_CHK_LOCKS(c_p);
     HTOP = HEAP_TOP(c_p);
     FCALLS = c_p->fcalls;
+    PROCESS_MAIN_CHK_LOCKS(c_p);
     ERTS_DBG_CHK_REDS(c_p, FCALLS);
 
     /*
@@ -367,11 +391,10 @@ call_bif_only(Exp) {
     } else if (c_p->freason == TRAP) {
         /*
          * Dispatch to a trap. When the trap is done, a jump
-         * to the continuation pointer (c_p->cp) will be done.
+         * to the continuation pointer on the stack will be done.
          */
         SET_I(c_p->i);
-        SWAPIN;
-        Dispatch();
+        $DISPATCH();
     }
 
     /*
@@ -413,17 +436,17 @@ send() {
         r(0) = result;
         CHECK_TERM(r(0));
     } else if (c_p->freason == TRAP) {
-        SET_CP(c_p, $NEXT_INSTRUCTION);
+        $SAVE_CONTINUATION_POINTER($NEXT_INSTRUCTION);
         SET_I(c_p->i);
         SWAPIN;
-        Dispatch();
+        $DISPATCH();
     } else {
         goto find_func_info;
     }
 }
 
 call_nif := nif_bif.call_nif.epilogue;
-apply_bif := nif_bif.apply_bif.epilogue;
+call_bif := nif_bif.call_bif.epilogue;
 
 nif_bif.head() {
     Eterm nif_bif_result;
@@ -433,7 +456,7 @@ nif_bif.head() {
     ErtsCodeMFA *codemfa;
 }
 
-nif_bif.call_nif() {
+nif_bif.call_nif(Func, NifMod, DirtyFunc) {
     /*
      * call_nif is always first instruction in function:
      *
@@ -443,11 +466,14 @@ nif_bif.call_nif() {
      * I[0]: &&call_nif
      * I[1]: Function pointer to NIF function
      * I[2]: Pointer to erl_module_nif
-     * I[3]: Function pointer to dirty NIF
+     * I[3]: Function pointer to dirty NIF. This is not used in this
+     *       instruction, but dirty schedulers look at it.
      *
-     * This layout is determined by the NifExport struct
+     * This layout is determined by the ErtsNativeFunc struct
      */
 
+    (void)$DirtyFunc;
+
     ERTS_MSACC_SET_STATE_CACHED_M_X(ERTS_MSACC_STATE_NIF);
 
     codemfa = erts_code_to_codemfa(I);
@@ -465,12 +491,12 @@ nif_bif.call_nif() {
     ASSERT(!ERTS_PROC_IS_EXITING(c_p));
     {
         typedef Eterm NifF(struct enif_environment_t*, int argc, Eterm argv[]);
-        NifF* fp = vbf = (NifF*) I[1];
+        NifF* fp = vbf = (NifF*) $Func;
         struct enif_environment_t env;
         ASSERT(c_p->scheduler_data);
         live_hf_end = c_p->mbuf;
         ERTS_CHK_MBUF_SZ(c_p);
-        erts_pre_nif(&env, c_p, (struct erl_module_nif*)I[2], NULL);
+        erts_pre_nif(&env, c_p, (struct erl_module_nif*)$NifMod, NULL);
 
         ASSERT((c_p->scheduler_data)->current_nif == NULL);
         (c_p->scheduler_data)->current_nif = &env;
@@ -495,15 +521,15 @@ nif_bif.call_nif() {
     DTRACE_NIF_RETURN(c_p, codemfa);
 }
 
-nif_bif.apply_bif() {
+nif_bif.call_bif(Func) {
     /*
-     * At this point, I points to the code[0] in the export entry for
-     * the BIF:
+     * At this point, I points to the code[0] in the native function wrapper
+     * for the BIF:
      *
      * code[-3]: Module
      * code[-2]: Function
      * code[-1]: Arity
-     * code[0]: &&apply_bif
+     * code[0]: &&call_bif
      * code[1]: Function pointer to BIF function
      */
 
@@ -515,21 +541,19 @@ nif_bif.apply_bif() {
 
     codemfa = erts_code_to_codemfa(I);
 
-    ERTS_MSACC_SET_BIF_STATE_CACHED_X(codemfa->module, (BifFunction)Arg(0));
-
+    ERTS_MSACC_SET_BIF_STATE_CACHED_X(codemfa->module, (BifFunction)$Func);
 
     /* In case we apply process_info/1,2 or load_nif/1 */
     c_p->current = codemfa;
     $SET_CP_I_ABS(I);     /* In case we apply check_process_code/2. */
     c_p->arity = 0;       /* To allow garbage collection on ourselves
-                           * (check_process_code/2).
-                           */
+                           * (check_process_code/2, put/2, etc). */
     DTRACE_BIF_ENTRY(c_p, codemfa);
 
     SWAPOUT;
     ERTS_DBG_CHK_REDS(c_p, FCALLS - 1);
     c_p->fcalls = FCALLS - 1;
-    vbf = (BifFunction) Arg(0);
+    vbf = (BifFunction)$Func;
     PROCESS_MAIN_CHK_LOCKS(c_p);
     bif_nif_arity = codemfa->arity;
     ASSERT(bif_nif_arity <= 4);
@@ -557,6 +581,7 @@ nif_bif.apply_bif() {
 }
 
 nif_bif.epilogue() {
+    //| -no_next
     ERTS_REQ_PROC_MAIN_LOCK(c_p);
     ERTS_HOLE_CHECK(c_p);
     if (ERTS_IS_GC_DESIRED(c_p)) {
@@ -570,8 +595,7 @@ nif_bif.epilogue() {
     if (ERTS_LIKELY(is_value(nif_bif_result))) {
         r(0) = nif_bif_result;
         CHECK_TERM(r(0));
-        SET_I(c_p->cp);
-        c_p->cp = 0;
+        $RETURN();
         Goto(*I);
     } else if (c_p->freason == TRAP) {
         SET_I(c_p->i);
@@ -579,8 +603,42 @@ nif_bif.epilogue() {
             c_p->flags &= ~F_HIBERNATE_SCHED;
             goto do_schedule;
         }
-        Dispatch();
+        $DISPATCH();
+    }
+    {
+        BeamInstr *cp = erts_printable_return_address(c_p, E);
+        ASSERT(VALID_INSTR(*cp));
+        I = handle_error(c_p, cp, reg, c_p->current);
     }
-    I = handle_error(c_p, c_p->cp, reg, c_p->current);
     goto post_error_handling;
 }
+
+i_load_nif() {
+    //| -no_next
+    if (erts_try_seize_code_write_permission(c_p)) {
+        Eterm result;
+
+        PRE_BIF_SWAPOUT(c_p);
+        result = erts_load_nif(c_p, I, r(0), r(1));
+        erts_release_code_write_permission(); /* released before the result is inspected, so both outcomes drop the permission */
+        ERTS_REQ_PROC_MAIN_LOCK(c_p);
+        SWAPIN;
+
+        if (ERTS_LIKELY(is_value(result))) {
+            r(0) = result;
+            $NEXT0();
+        } else {
+            static ErtsCodeMFA mfa = {am_erlang, am_load_nif, 2}; /* the error is attributed to erlang:load_nif/2 */
+            c_p->freason = BADARG; /* a non-value result is always raised as badarg here */
+            I = handle_error(c_p, I, reg, &mfa);
+            goto post_error_handling;
+        }
+    } else {
+        /* Yield and try again */
+        $SET_CP_I_ABS(I);
+        SWAPOUT;
+        c_p->current = NULL;
+        c_p->arity = 2; /* NOTE(review): presumably keeps the two arguments in r(0)/r(1) live while scheduled out -- confirm */
+        goto do_schedule;
+    }
+}
diff --git a/erts/emulator/beam/big.c b/erts/emulator/beam/big.c
index 522f50287a..7666f23a4f 100644
--- a/erts/emulator/beam/big.c
+++ b/erts/emulator/beam/big.c
@@ -2176,6 +2176,24 @@ term_to_Uint64(Eterm term, Uint64 *up)
 #endif
 }
 
+/* Store term in *up as a Uint32. Returns 1 if term is a non-negative integer that fits in 32 bits, else 0 (the #else branch then sets *up = BADARG). */
+int term_to_Uint32(Eterm term, Uint32 *up)
+{
+#if ERTS_SIZEOF_ETERM == 4
+    return term_to_Uint(term,up); /* 4-byte Eterm: delegate; result and *up are whatever term_to_Uint() yields */
+#else
+    if (is_small(term)) {
+	Sint i = signed_val(term);
+	if (i >= 0 && i <= (Sint) 0xFFFFFFFF) { /* upper bound needed: a wider small would be silently truncated by the cast */
+            *up = (Uint32) i;
+            return 1;
+        }
+    }
+    *up = BADARG;
+    return 0;
+#endif
+}
+
 
 int term_to_Sint(Eterm term, Sint *sp)
 {
diff --git a/erts/emulator/beam/big.h b/erts/emulator/beam/big.h
index ad19cce395..3fed076419 100644
--- a/erts/emulator/beam/big.h
+++ b/erts/emulator/beam/big.h
@@ -168,6 +168,8 @@ Eterm erts_uint64_array_to_big(Uint **, int, int, Uint64 *);
 int term_to_Uint64(Eterm, Uint64*);
 int term_to_Sint64(Eterm, Sint64*);
 #endif
+int term_to_Uint32(Eterm, Uint32*);
+
 
 Uint32 big_to_uint32(Eterm b);
 int term_equals_2pow32(Eterm);
diff --git a/erts/emulator/beam/dist.c b/erts/emulator/beam/dist.c
index eb9e749a08..dafe805a6f 100644
--- a/erts/emulator/beam/dist.c
+++ b/erts/emulator/beam/dist.c
@@ -1051,7 +1051,7 @@ erts_dsig_send_msg(ErtsDSigSendContext* ctx, Eterm remote, Eterm message)
 #endif
 
     if (have_seqtrace(SEQ_TRACE_TOKEN(sender))) {
-	seq_trace_update_send(sender);
+	seq_trace_update_serial(sender);
 	token = SEQ_TRACE_TOKEN(sender);
 	seq_trace_output(token, message, SEQ_TRACE_SEND, remote, sender);
     }
@@ -1125,7 +1125,7 @@ erts_dsig_send_reg_msg(ErtsDSigSendContext* ctx, Eterm remote_name, Eterm messag
 #endif
 
     if (have_seqtrace(SEQ_TRACE_TOKEN(sender))) {
-	seq_trace_update_send(sender);
+	seq_trace_update_serial(sender);
 	token = SEQ_TRACE_TOKEN(sender);
 	seq_trace_output(token, message, SEQ_TRACE_SEND, remote_name, sender);
     }
@@ -1184,7 +1184,7 @@ erts_dsig_send_exit_tt(ErtsDSigSendContext *ctx, Eterm local, Eterm remote,
         msg = reason;
 
     if (have_seqtrace(token)) {
-	seq_trace_update_send(ctx->c_p);
+	seq_trace_update_serial(ctx->c_p);
 	seq_trace_output_exit(token, reason, SEQ_TRACE_SEND, remote, local);
         if (ctx->dep->flags & DFLAG_EXIT_PAYLOAD) {
             ctl = TUPLE4(&ctx->ctl_heap[0],
@@ -3762,12 +3762,10 @@ int distribution_info(fmtfn_t to, void *arg)	/* Called by break handler */
 BIF_RETTYPE setnode_2(BIF_ALIST_2)
 {
     Process *net_kernel;
-    Uint creation;
+    Uint32 creation;
 
     /* valid creation ? */
-    if(!term_to_Uint(BIF_ARG_2, &creation))
-	goto error;
-    if(creation > 3)
+    if(!term_to_Uint32(BIF_ARG_2, &creation))
 	goto error;
 
     /* valid node name ? */
@@ -3811,7 +3809,7 @@ BIF_RETTYPE setnode_2(BIF_ALIST_2)
     erts_proc_unlock(BIF_P, ERTS_PROC_LOCK_MAIN);
     erts_thr_progress_block();
     inc_no_nodes();
-    erts_set_this_node(BIF_ARG_1, (Uint32) creation);
+    erts_set_this_node(BIF_ARG_1, creation);
     erts_is_alive = 1;
     send_nodes_mon_msgs(NULL, am_nodeup, BIF_ARG_1, am_visible, NIL);
     erts_thr_progress_unblock();
diff --git a/erts/emulator/beam/dist.h b/erts/emulator/beam/dist.h
index a33fb7efcf..37ec88cc55 100644
--- a/erts/emulator/beam/dist.h
+++ b/erts/emulator/beam/dist.h
@@ -54,11 +54,12 @@
 #define DFLAG_DIST_MANDATORY (DFLAG_EXTENDED_REFERENCES         \
                               | DFLAG_EXTENDED_PIDS_PORTS       \
 			      | DFLAG_UTF8_ATOMS                \
-			      | DFLAG_NEW_FUN_TAGS)
+			      | DFLAG_NEW_FUN_TAGS              \
+                              | DFLAG_BIG_CREATION)
 
 /*
  * Additional optimistic flags when encoding toward pending connection.
- * If remote node (erl_interface) does not supporting these then we may need
+ * If remote node (erl_interface) does not support these then we may need
  * to transcode messages enqueued before connection setup was finished.
  */
 #define DFLAG_DIST_HOPEFULLY (DFLAG_EXPORT_PTR_TAG              \
@@ -75,7 +76,6 @@
                             | DFLAG_SMALL_ATOM_TAGS           \
                             | DFLAG_UTF8_ATOMS                \
                             | DFLAG_MAP_TAG                   \
-                            | DFLAG_BIG_CREATION              \
                             | DFLAG_SEND_SENDER               \
                             | DFLAG_BIG_SEQTRACE_LABELS       \
                             | DFLAG_EXIT_PAYLOAD              \
diff --git a/erts/emulator/beam/erl_alloc.c b/erts/emulator/beam/erl_alloc.c
index b9f0334172..1bbc7d7f1e 100644
--- a/erts/emulator/beam/erl_alloc.c
+++ b/erts/emulator/beam/erl_alloc.c
@@ -653,8 +653,6 @@ erts_alloc_init(int *argc, char **argv, ErtsAllocInitOpts *eaiop)
 	= erts_timer_type_size(ERTS_ALC_T_HL_PTIMER);
     fix_type_sizes[ERTS_ALC_FIX_TYPE_IX(ERTS_ALC_T_BIF_TIMER)]
 	= erts_timer_type_size(ERTS_ALC_T_BIF_TIMER);
-    fix_type_sizes[ERTS_ALC_FIX_TYPE_IX(ERTS_ALC_T_NIF_EXP_TRACE)]
-	= sizeof(NifExportTrace);
     fix_type_sizes[ERTS_ALC_FIX_TYPE_IX(ERTS_ALC_T_MREF_NSCHED_ENT)]
 	= sizeof(ErtsNSchedMagicRefTableEntry);
     fix_type_sizes[ERTS_ALC_FIX_TYPE_IX(ERTS_ALC_T_MINDIRECTION)]
@@ -2392,10 +2390,6 @@ erts_memory(fmtfn_t *print_to_p, void *print_to_arg, void *proc, Eterm earg)
 		       &size.processes_used,
 		       fi,
 		       ERTS_ALC_T_BIF_TIMER);
-	add_fix_values(&size.processes,
-		       &size.processes_used,
-		       fi,
-		       ERTS_ALC_T_NIF_EXP_TRACE);
     }
 
     if (want.atom || want.atom_used) {
diff --git a/erts/emulator/beam/erl_alloc.types b/erts/emulator/beam/erl_alloc.types
index 21941ba96e..3e643a6223 100644
--- a/erts/emulator/beam/erl_alloc.types
+++ b/erts/emulator/beam/erl_alloc.types
@@ -278,6 +278,7 @@ type	SETUP_CONN_ARG	SHORT_LIVED	PROCESSES	setup_connection_argument
 type    LIST_TRAP       SHORT_LIVED     PROCESSES       list_bif_trap_state
 type    CONT_EXIT_TRAP  SHORT_LIVED     PROCESSES       continue_exit_trap_state
 type    SEQ_YIELD_STATE SHORT_LIVED     SYSTEM          dist_seq_yield_state
+type    PHASH2_TRAP     SHORT_LIVED     PROCESSES       phash2_trap_state
 
 type	ENVIRONMENT	SYSTEM		SYSTEM		environment
 
@@ -330,8 +331,7 @@ type	DB_HEIR_DATA	STANDARD	ETS		db_heir_data
 type	DB_MS_PSDO_PROC	LONG_LIVED	ETS		db_match_pseudo_proc
 type	SCHDLR_DATA	LONG_LIVED	SYSTEM		scheduler_data
 
-type	NIF_TRAP_EXPORT	STANDARD	PROCESSES	nif_trap_export_entry
-type	NIF_EXP_TRACE	FIXED_SIZE	PROCESSES	nif_export_trace
+type	NFUNC_TRAP_WRAPPER	STANDARD	PROCESSES	nfunc_trap_wrapper
 type	EXPORT		LONG_LIVED	CODE		export_entry
 type	MONITOR		FIXED_SIZE	PROCESSES	monitor
 type	MONITOR_SUSPEND	STANDARD	PROCESSES	monitor_suspend
diff --git a/erts/emulator/beam/erl_bif_info.c b/erts/emulator/beam/erl_bif_info.c
index b06b5fc1ab..57fc5ec131 100644
--- a/erts/emulator/beam/erl_bif_info.c
+++ b/erts/emulator/beam/erl_bif_info.c
@@ -158,8 +158,9 @@ static Eterm os_version_tuple;
 static Eterm
 current_function(Process* p, ErtsHeapFactory *hfact, Process* rp,
                  int full_info, Uint reserve_size, int flags);
-static Eterm current_stacktrace(ErtsHeapFactory *hfact, Process* rp,
-                                Uint reserve_size);
+static Eterm
+current_stacktrace(Process* p, ErtsHeapFactory *hfact, Process* rp,
+                   Uint reserve_size, int flags);
 
 Eterm
 erts_bld_bin_list(Uint **hpp, Uint *szp, ErlOffHeap* oh, Eterm tail)
@@ -1384,7 +1385,7 @@ process_info_aux(Process *c_p,
 	break;
 
     case ERTS_PI_IX_CURRENT_STACKTRACE:
-	res = current_stacktrace(hfact, rp, reserve_size);
+	res = current_stacktrace(c_p, hfact, rp, reserve_size, flags);
 	break;
 
     case ERTS_PI_IX_INITIAL_CALL:
@@ -2022,19 +2023,23 @@ current_function(Process *c_p, ErtsHeapFactory *hfact, Process* rp,
     }
 
     if (c_p == rp && !(flags & ERTS_PI_FLAG_REQUEST_FOR_OTHER)) {
-	FunctionInfo fi2;
+        BeamInstr* return_address;
+        FunctionInfo caller_fi;
 
-	/*
-	 * The current function is erlang:process_info/{1,2},
-	 * which is not the answer that the application want.
-	 * We will use the function pointed into by rp->cp
-	 * instead if it can be looked up.
-	 */
-	erts_lookup_function_info(&fi2, rp->cp, full_info);
-	if (fi2.mfa) {
-	    fi = fi2;
-	    rp->current = fi2.mfa;
-	}
+        /*
+         * The current function is erlang:process_info/{1,2}, and we've
+         * historically returned the *calling* function in that case. We
+         * therefore use the continuation pointer stored at the top of the
+         * stack instead, which is safe since process_info is a "heavy" BIF
+         * that is only called through its export entry.
+         */
+        return_address = erts_printable_return_address(rp, STACK_TOP(rp));
+
+        erts_lookup_function_info(&caller_fi, return_address, full_info);
+        if (caller_fi.mfa) {
+            fi = caller_fi;
+            rp->current = caller_fi.mfa;
+        }
     }
 
     /*
@@ -2055,8 +2060,8 @@ current_function(Process *c_p, ErtsHeapFactory *hfact, Process* rp,
 }
 
 static Eterm
-current_stacktrace(ErtsHeapFactory *hfact, Process* rp,
-                   Uint reserve_size)
+current_stacktrace(Process *p, ErtsHeapFactory *hfact, Process* rp,
+                   Uint reserve_size, int flags)
 {
     Uint sz;
     struct StackTrace* s;
@@ -2073,13 +2078,14 @@ current_stacktrace(ErtsHeapFactory *hfact, Process* rp,
     sz = offsetof(struct StackTrace, trace) + sizeof(BeamInstr *)*depth;
     s = (struct StackTrace *) erts_alloc(ERTS_ALC_T_TMP, sz);
     s->depth = 0;
-    if (depth > 0 && rp->i) {
-	s->trace[s->depth++] = rp->i;
-	depth--;
-    }
-    if (depth > 0 && rp->cp != 0) {
-	s->trace[s->depth++] = rp->cp - 1;
-	depth--;
+    s->pc = NULL;
+
+    /* We skip the current pc when requesting our own stack trace, since it
+     * will inevitably point to process_info/1,2. */
+    if ((p != rp || (flags & ERTS_PI_FLAG_REQUEST_FOR_OTHER)) &&
+        depth > 0 && rp->i) {
+        s->trace[s->depth++] = rp->i;
+        depth--;
     }
     erts_save_stacktrace(rp, s, depth);
 
@@ -2814,7 +2820,10 @@ BIF_RETTYPE system_info_1(BIF_ALIST_1)
     } else if (BIF_ARG_1 == am_threads) {
 	return am_true;
     } else if (BIF_ARG_1 == am_creation) {
-	return make_small(erts_this_node->creation);
+        Uint hsz = 0;
+        erts_bld_uint(NULL, &hsz, erts_this_node->creation);
+        hp = hsz ? HAlloc(BIF_P, hsz) : NULL;
+        BIF_RET(erts_bld_uint(&hp, NULL, erts_this_node->creation));
     } else if (BIF_ARG_1 == am_break_ignored) {
       extern int ignore_break;
       if (ignore_break) 
diff --git a/erts/emulator/beam/erl_bif_lists.c b/erts/emulator/beam/erl_bif_lists.c
index fa2edfef1e..9d485abc35 100644
--- a/erts/emulator/beam/erl_bif_lists.c
+++ b/erts/emulator/beam/erl_bif_lists.c
@@ -32,8 +32,46 @@
 #include "bif.h"
 #include "erl_binary.h"
 
+static Export plusplus_trap_export;
+static Export append_trap_export;
 
-static Eterm keyfind(int Bif, Process* p, Eterm Key, Eterm Pos, Eterm List);
+static Export minusminus_trap_export;
+static Export subtract_trap_export;
+
+static Export member_trap_export;
+
+static Export reverse_trap_export;
+
+static Export keymember_trap_export;
+static Export keysearch_trap_export;
+static Export keyfind_trap_export;
+
+void erts_init_bif_lists(void) {
+    erts_init_trap_export(&plusplus_trap_export, am_erlang, am_PlusPlus, 2,
+                          ebif_plusplus_2);
+    erts_init_trap_export(&append_trap_export, am_erlang, am_append, 2,
+                          append_2);
+
+    erts_init_trap_export(&minusminus_trap_export, am_erlang, am_MinusMinus, 2,
+                          ebif_minusminus_2);
+    erts_init_trap_export(&subtract_trap_export, am_lists, am_subtract, 2,
+                          subtract_2);
+
+    erts_init_trap_export(&reverse_trap_export, am_lists, am_reverse, 2,
+                          lists_reverse_2);
+
+    erts_init_trap_export(&member_trap_export, am_lists, am_member, 2,
+                          lists_member_2);
+
+    erts_init_trap_export(&keymember_trap_export, am_lists, am_keymember, 3,
+                          lists_keymember_3);
+    erts_init_trap_export(&keysearch_trap_export, am_lists, am_keysearch, 3,
+                          lists_keysearch_3);
+    erts_init_trap_export(&keyfind_trap_export, am_lists, am_keyfind, 3,
+                          lists_keyfind_3);
+}
+
+static Eterm keyfind(Export* Bif, Process* p, Eterm Key, Eterm Pos, Eterm List);
 
 /* erlang:'++'/2
  *
@@ -308,12 +346,12 @@ static Eterm append(Export *bif_entry, BIF_ALIST_2) {
 Eterm
 ebif_plusplus_2(BIF_ALIST_2)
 {
-    return append(bif_export[BIF_ebif_plusplus_2], BIF_CALL_ARGS);
+    return append(&plusplus_trap_export, BIF_CALL_ARGS);
 }
 
 BIF_RETTYPE append_2(BIF_ALIST_2)
 {
-    return append(bif_export[BIF_append_2], BIF_CALL_ARGS);
+    return append(&append_trap_export, BIF_CALL_ARGS);
 }
 
 /* erlang:'--'/2
@@ -1039,11 +1077,11 @@ static Eterm subtract(Export *bif_entry, BIF_ALIST_2) {
 }
 
 BIF_RETTYPE ebif_minusminus_2(BIF_ALIST_2) {
-    return subtract(bif_export[BIF_ebif_minusminus_2], BIF_CALL_ARGS);
+    return subtract(&minusminus_trap_export, BIF_CALL_ARGS);
 }
 
 BIF_RETTYPE subtract_2(BIF_ALIST_2) {
-    return subtract(bif_export[BIF_subtract_2], BIF_CALL_ARGS);
+    return subtract(&subtract_trap_export, BIF_CALL_ARGS);
 }
 
 
@@ -1068,7 +1106,7 @@ BIF_RETTYPE lists_member_2(BIF_ALIST_2)
     while (is_list(list)) {
 	if (--max_iter < 0) {
 	    BUMP_ALL_REDS(BIF_P);
-	    BIF_TRAP2(bif_export[BIF_lists_member_2], BIF_P, term, list);
+	    BIF_TRAP2(&member_trap_export, BIF_P, term, list);
 	}
 	item = CAR(list_val(list));
 	if ((item == term) || (non_immed_key && eq(item, term))) {
@@ -1130,7 +1168,7 @@ static BIF_RETTYPE lists_reverse_alloc(Process *c_p,
     }
 
     ASSERT(is_list(tail) && cells_left == 0);
-    BIF_TRAP2(bif_export[BIF_lists_reverse_2], c_p, list, tail);
+    BIF_TRAP2(&reverse_trap_export, c_p, list, tail);
 }
 
 static BIF_RETTYPE lists_reverse_onheap(Process *c_p,
@@ -1179,7 +1217,7 @@ static BIF_RETTYPE lists_reverse_onheap(Process *c_p,
         }
 
         BUMP_ALL_REDS(c_p);
-        BIF_TRAP2(bif_export[BIF_lists_reverse_2], c_p, list, tail);
+        BIF_TRAP2(&reverse_trap_export, c_p, list, tail);
     }
 
     BIF_ERROR(c_p, BADARG);
@@ -1209,7 +1247,7 @@ lists_keymember_3(BIF_ALIST_3)
 {
     Eterm res;
 
-    res = keyfind(BIF_lists_keymember_3, BIF_P,
+    res = keyfind(&keymember_trap_export, BIF_P,
 		  BIF_ARG_1, BIF_ARG_2, BIF_ARG_3);
     if (is_value(res) && is_tuple(res)) {
 	return am_true;
@@ -1223,7 +1261,7 @@ lists_keysearch_3(BIF_ALIST_3)
 {
     Eterm res;
     
-    res = keyfind(BIF_lists_keysearch_3, BIF_P,
+    res = keyfind(&keysearch_trap_export, BIF_P,
 		  BIF_ARG_1, BIF_ARG_2, BIF_ARG_3);
     if (is_non_value(res) || is_not_tuple(res)) {
 	return res;
@@ -1236,12 +1274,12 @@ lists_keysearch_3(BIF_ALIST_3)
 BIF_RETTYPE
 lists_keyfind_3(BIF_ALIST_3)
 {
-    return keyfind(BIF_lists_keyfind_3, BIF_P,
+    return keyfind(&keyfind_trap_export, BIF_P,
 		   BIF_ARG_1, BIF_ARG_2, BIF_ARG_3);
 }
 
 static Eterm
-keyfind(int Bif, Process* p, Eterm Key, Eterm Pos, Eterm List)
+keyfind(Export *Bif, Process* p, Eterm Key, Eterm Pos, Eterm List)
 {
     int max_iter = 10 * CONTEXT_REDS;
     Sint pos;
@@ -1257,7 +1295,7 @@ keyfind(int Bif, Process* p, Eterm Key, Eterm Pos, Eterm List)
 	while (is_list(List)) {
 	    if (--max_iter < 0) {
 		BUMP_ALL_REDS(p);
-		BIF_TRAP3(bif_export[Bif], p, Key, Pos, List);
+		BIF_TRAP3(Bif, p, Key, Pos, List);
 	    }
 	    term = CAR(list_val(List));
 	    List = CDR(list_val(List));
@@ -1282,7 +1320,7 @@ keyfind(int Bif, Process* p, Eterm Key, Eterm Pos, Eterm List)
 	while (is_list(List)) {
 	    if (--max_iter < 0) {
 		BUMP_ALL_REDS(p);
-		BIF_TRAP3(bif_export[Bif], p, Key, Pos, List);
+		BIF_TRAP3(Bif, p, Key, Pos, List);
 	    }
 	    term = CAR(list_val(List));
 	    List = CDR(list_val(List));
@@ -1300,7 +1338,7 @@ keyfind(int Bif, Process* p, Eterm Key, Eterm Pos, Eterm List)
 	while (is_list(List)) {
 	    if (--max_iter < 0) {
 		BUMP_ALL_REDS(p);
-		BIF_TRAP3(bif_export[Bif], p, Key, Pos, List);
+		BIF_TRAP3(Bif, p, Key, Pos, List);
 	    }
 	    term = CAR(list_val(List));
 	    List = CDR(list_val(List));
diff --git a/erts/emulator/beam/erl_bif_port.c b/erts/emulator/beam/erl_bif_port.c
index ed825d3dda..dd1e884705 100644
--- a/erts/emulator/beam/erl_bif_port.c
+++ b/erts/emulator/beam/erl_bif_port.c
@@ -44,6 +44,7 @@
 #include "erl_bif_unique.h"
 #include "dtrace-wrapper.h"
 #include "erl_proc_sig_queue.h"
+#include "erl_osenv.h"
 
 static Port *open_port(Process* p, Eterm name, Eterm settings, int *err_typep, int *err_nump);
 static int merge_global_environment(erts_osenv_t *env, Eterm key_value_pairs);
diff --git a/erts/emulator/beam/erl_bif_trace.c b/erts/emulator/beam/erl_bif_trace.c
index b31d5b86cb..e03c97fe10 100644
--- a/erts/emulator/beam/erl_bif_trace.c
+++ b/erts/emulator/beam/erl_bif_trace.c
@@ -80,9 +80,6 @@ static Eterm trace_info_func(Process* p, Eterm pid_spec, Eterm key);
 static Eterm trace_info_on_load(Process* p, Eterm key);
 static Eterm trace_info_event(Process* p, Eterm event, Eterm key);
 
-
-static void reset_bif_trace(void);
-static void setup_bif_trace(void);
 static void install_exp_breakpoints(BpFunctions* f);
 static void uninstall_exp_breakpoints(BpFunctions* f);
 static void clean_export_entries(BpFunctions* f);
@@ -1047,14 +1044,13 @@ static int function_is_traced(Process *p,
     e.info.mfa.function = mfa[1];
     e.info.mfa.arity = mfa[2];
     if ((ep = export_get(&e)) != NULL) {
-	pc = ep->beam;
+	pc = ep->trampoline.raw;
 	if (ep->addressv[erts_active_code_ix()] == pc &&
 	    ! BeamIsOpCode(*pc, op_call_error_handler)) {
 
 	    int r = 0;
 
-	    ASSERT(BeamIsOpCode(*pc, op_apply_bif) ||
-		   BeamIsOpCode(*pc, op_i_generic_breakpoint));
+	    ASSERT(BeamIsOpCode(*pc, op_i_generic_breakpoint));
 
 	    if (erts_is_trace_break(&ep->info, ms, 0)) {
 		return FUNC_TRACE_GLOBAL_TRACE;
@@ -1426,18 +1422,21 @@ erts_set_trace_pattern(Process*p, ErtsCodeMFA *mfa, int specified,
     int n;
     BpFunction* fp;
 
-    /*
-     * First work on normal functions (not real BIFs).
-     */
-
     erts_bp_match_export(&finish_bp.e, mfa, specified);
     fp = finish_bp.e.matching;
     n = finish_bp.e.matched;
 
     for (i = 0; i < n; i++) {
         ErtsCodeInfo *ci = fp[i].ci;
-	BeamInstr* pc = erts_codeinfo_to_code(ci);
-	Export* ep = ErtsContainerStruct(ci, Export, info);
+        BeamInstr* pc;
+        Export* ep;
+
+        pc = erts_codeinfo_to_code(ci);
+        ep = ErtsContainerStruct(ci, Export, info);
+
+        if (ep->bif_table_index != -1) {
+            ep->is_bif_traced = !!on;
+        }
 
 	if (on && !flags.breakpoint) {
 	    /* Turn on global call tracing */
@@ -1446,12 +1445,12 @@ erts_set_trace_pattern(Process*p, ErtsCodeMFA *mfa, int specified,
 #ifdef DEBUG
 		ep->info.op = BeamOpCodeAddr(op_i_func_info_IaaI);
 #endif
-                ep->beam[0] = BeamOpCodeAddr(op_trace_jump_W);
-		ep->beam[1] = (BeamInstr) ep->addressv[code_ix];
+                ep->trampoline.op = BeamOpCodeAddr(op_trace_jump_W);
+                ep->trampoline.trace.address = (BeamInstr) ep->addressv[code_ix];
 	    }
-	    erts_set_call_trace_bif(ci, match_prog_set, 0);
+	    erts_set_export_trace(ci, match_prog_set, 0);
 	    if (ep->addressv[code_ix] != pc) {
-		ep->beam[0] = BeamOpCodeAddr(op_i_generic_breakpoint);
+                ep->trampoline.op = BeamOpCodeAddr(op_i_generic_breakpoint);
 	    }
 	} else if (!on && flags.breakpoint) {
 	    /* Turn off breakpoint tracing -- nothing to do here. */
@@ -1460,91 +1459,14 @@ erts_set_trace_pattern(Process*p, ErtsCodeMFA *mfa, int specified,
 	     * Turn off global tracing, either explicitly or implicitly
 	     * before turning on breakpoint tracing.
 	     */
-	    erts_clear_call_trace_bif(ci, 0);
-	    if (BeamIsOpCode(ep->beam[0], op_i_generic_breakpoint)) {
-		ep->beam[0] = BeamOpCodeAddr(op_trace_jump_W);
+	    erts_clear_export_trace(ci, 0);
+            if (BeamIsOpCode(ep->trampoline.op, op_i_generic_breakpoint)) {
+                ep->trampoline.op = BeamOpCodeAddr(op_trace_jump_W);
 	    }
 	}
     }
 
     /*
-    ** OK, now for the bif's
-    */
-    for (i = 0; i < BIF_SIZE; ++i) {
-	Export *ep = bif_export[i];
-
-	if (!ExportIsBuiltIn(ep)) {
-	    continue;
-	}
-
-	if (bif_table[i].f == bif_table[i].traced) {
-	    /* Trace wrapper same as regular function - untraceable */
-	    continue;
-	}
-
-        switch (specified) {
-        case 3:
-            if (mfa->arity != ep->info.mfa.arity)
-                continue;
-        case 2:
-            if (mfa->function != ep->info.mfa.function)
-                continue;
-        case 1:
-            if (mfa->module != ep->info.mfa.module)
-                continue;
-        case 0:
-            break;
-        default:
-            ASSERT(0);
-        }
-
-        if (! flags.breakpoint) { /* Export entry call trace */
-            if (on) {
-                erts_clear_call_trace_bif(&ep->info, 1);
-                erts_clear_mtrace_bif(&ep->info);
-                erts_set_call_trace_bif(&ep->info, match_prog_set, 0);
-            } else { /* off */
-                erts_clear_call_trace_bif(&ep->info, 0);
-            }
-            matches++;
-        } else { /* Breakpoint call trace */
-            int m = 0;
-
-            if (on) {
-                if (flags.local) {
-                    erts_clear_call_trace_bif(&ep->info, 0);
-                    erts_set_call_trace_bif(&ep->info, match_prog_set, 1);
-                    m = 1;
-                }
-                if (flags.meta) {
-                    erts_set_mtrace_bif(&ep->info, meta_match_prog_set,
-                                        meta_tracer);
-                    m = 1;
-                }
-                if (flags.call_time) {
-                    erts_set_time_trace_bif(&ep->info, on);
-                    /* I don't want to remove any other tracers */
-                    m = 1;
-                }
-            } else { /* off */
-                if (flags.local) {
-                    erts_clear_call_trace_bif(&ep->info, 1);
-                    m = 1;
-                }
-                if (flags.meta) {
-                    erts_clear_mtrace_bif(&ep->info);
-                    m = 1;
-                }
-                if (flags.call_time) {
-                    erts_clear_time_trace_bif(&ep->info);
-                    m = 1;
-                }
-            }
-            matches += m;
-        }
-    }
-
-    /*
     ** So, now for breakpoint tracing
     */
     erts_bp_match_functions(&finish_bp.f, mfa, specified);
@@ -1670,7 +1592,6 @@ erts_finish_breakpointing(void)
 		install_exp_breakpoints(&finish_bp.e);
 	    }
 	}
-	setup_bif_trace();
 	return 1;
     case 1:
 	/*
@@ -1699,7 +1620,6 @@ erts_finish_breakpointing(void)
 		uninstall_exp_breakpoints(&finish_bp.e);
 	    }
 	}
-	reset_bif_trace();
 	return 1;
     case 3:
 	/*
@@ -1710,7 +1630,6 @@ erts_finish_breakpointing(void)
 	 * updated).  If any breakpoints have been totally disabled,
 	 * deallocate the GenericBp structs for them.
 	 */
-	erts_consolidate_bif_bp_data();
 	clean_export_entries(&finish_bp.e);
 	erts_consolidate_bp_data(&finish_bp.e, 0);
 	erts_consolidate_bp_data(&finish_bp.f, 1);
@@ -1736,7 +1655,7 @@ install_exp_breakpoints(BpFunctions* f)
     for (i = 0; i < ne; i++) {
 	Export* ep = ErtsContainerStruct(fp[i].ci, Export, info);
 
-	ep->addressv[code_ix] = ep->beam;
+	ep->addressv[code_ix] = ep->trampoline.raw;
     }
 }
 
@@ -1751,11 +1670,12 @@ uninstall_exp_breakpoints(BpFunctions* f)
     for (i = 0; i < ne; i++) {
 	Export* ep = ErtsContainerStruct(fp[i].ci, Export, info);
 
-	if (ep->addressv[code_ix] != ep->beam) {
-	    continue;
-	}
-	ASSERT(BeamIsOpCode(ep->beam[0], op_trace_jump_W));
-	ep->addressv[code_ix] = (BeamInstr *) ep->beam[1];
+        if (ep->addressv[code_ix] != ep->trampoline.raw) {
+            continue;
+        }
+
+        ASSERT(BeamIsOpCode(ep->trampoline.op, op_trace_jump_W));
+        ep->addressv[code_ix] = (BeamInstr *) ep->trampoline.trace.address;
     }
 }
 
@@ -1770,48 +1690,14 @@ clean_export_entries(BpFunctions* f)
     for (i = 0; i < ne; i++) {
 	Export* ep = ErtsContainerStruct(fp[i].ci, Export, info);
 
-	if (ep->addressv[code_ix] == ep->beam) {
-	    continue;
-	}
-	if (BeamIsOpCode(ep->beam[0], op_trace_jump_W)) {
-	    ep->beam[0] = (BeamInstr) 0;
-	    ep->beam[1] = (BeamInstr) 0;
-	}
-    }
-}
-
-static void
-setup_bif_trace(void)
-{
-    int i;
-
-    for (i = 0; i < BIF_SIZE; ++i) {
-	Export *ep = bif_export[i];
-	GenericBp* g = ep->info.u.gen_bp;
-	if (g) {
-	    if (ExportIsBuiltIn(ep)) {
-		ASSERT(ep->beam[1]);
-		ep->beam[1] = (BeamInstr) bif_table[i].traced;
-	    }
-	}
-    }
-}
+        if (ep->addressv[code_ix] == ep->trampoline.raw) {
+            continue;
+        }
 
-static void
-reset_bif_trace(void)
-{
-    int i;
-    ErtsBpIndex active = erts_active_bp_ix();
-
-    for (i = 0; i < BIF_SIZE; ++i) {
-	Export *ep = bif_export[i];
-	GenericBp* g = ep->info.u.gen_bp;
-	if (g && g->data[active].flags == 0) {
-	    if (ExportIsBuiltIn(ep)) {
-		ASSERT(ep->beam[1]);
-		ep->beam[1] = (BeamInstr) bif_table[i].f;
-	    }
-	}
+        if (BeamIsOpCode(ep->trampoline.op, op_trace_jump_W)) {
+            ep->trampoline.op = (BeamInstr) 0;
+            ep->trampoline.trace.address = (BeamInstr) 0;
+        }
     }
 }
 
@@ -1858,6 +1744,8 @@ Eterm erts_seq_trace(Process *p, Eterm arg1, Eterm arg2,
 
     if (arg1 == am_send) {
 	current_flag = SEQ_TRACE_SEND;
+    } else if (arg1 == am_spawn) {
+	current_flag = SEQ_TRACE_SPAWN;
     } else if (arg1 == am_receive) {
 	current_flag = SEQ_TRACE_RECEIVE; 
     } else if (arg1 == am_print) {
@@ -1976,8 +1864,9 @@ BIF_RETTYPE erl_seq_trace_info(Process *p, Eterm item)
     }
 
     if (have_no_seqtrace(SEQ_TRACE_TOKEN(p))) {
-	if ((item == am_send)  || (item == am_receive) || 
-	    (item == am_print) || (item == am_timestamp)
+	if ((item == am_send) || (item == am_spawn) ||
+        (item == am_receive) || (item == am_print)
+        || (item == am_timestamp)
 	    || (item == am_monotonic_timestamp)
 	    || (item == am_strict_monotonic_timestamp)) {
 	    hp = HAlloc(p,3);
@@ -1992,6 +1881,8 @@ BIF_RETTYPE erl_seq_trace_info(Process *p, Eterm item)
 
     if (item == am_send) {
 	current_flag = SEQ_TRACE_SEND;
+    } else if (item == am_spawn) {
+	current_flag = SEQ_TRACE_SPAWN;
     } else if (item == am_receive) {
 	current_flag = SEQ_TRACE_RECEIVE; 
     } else if (item == am_print) {
@@ -2041,7 +1932,7 @@ BIF_RETTYPE seq_trace_print_1(BIF_ALIST_1)
     if (have_no_seqtrace(SEQ_TRACE_TOKEN(BIF_P))) {
 	BIF_RET(am_false);
     }
-    seq_trace_update_send(BIF_P);
+    seq_trace_update_serial(BIF_P);
     seq_trace_output(SEQ_TRACE_TOKEN(BIF_P), BIF_ARG_1, 
 		     SEQ_TRACE_PRINT, NIL, BIF_P);
     BIF_RET(am_true);
@@ -2062,7 +1953,7 @@ BIF_RETTYPE seq_trace_print_2(BIF_ALIST_2)
     }
     if (!EQ(BIF_ARG_1, SEQ_TRACE_TOKEN_LABEL(BIF_P)))
 	BIF_RET(am_false);
-    seq_trace_update_send(BIF_P);
+    seq_trace_update_serial(BIF_P);
     seq_trace_output(SEQ_TRACE_TOKEN(BIF_P), BIF_ARG_2, 
 		     SEQ_TRACE_PRINT, NIL, BIF_P);
     BIF_RET(am_true);
diff --git a/erts/emulator/beam/erl_db_hash.c b/erts/emulator/beam/erl_db_hash.c
index 5937bd64ec..2bcdb47a54 100644
--- a/erts/emulator/beam/erl_db_hash.c
+++ b/erts/emulator/beam/erl_db_hash.c
@@ -93,11 +93,9 @@
     erts_flxctr_dec_read_centralized(&(DB)->common.counters, ERTS_DB_TABLE_NITEMS_COUNTER_ID)
 #define RESET_NITEMS(DB)                                                \
     erts_flxctr_reset(&(DB)->common.counters, ERTS_DB_TABLE_NITEMS_COUNTER_ID)
-/* 
- * The following symbols can be manipulated to "tune" the linear hash array 
- */
+
 #define GROW_LIMIT(NACTIVE) ((NACTIVE)*1)
-#define SHRINK_LIMIT(NACTIVE) ((NACTIVE) / 2)
+#define SHRINK_LIMIT(TB) erts_atomic_read_nob(&(TB)->shrink_limit)
 
 /*
 ** We want the first mandatory segment to be small (to reduce minimal footprint)
@@ -137,6 +135,11 @@
 
 #define BUCKET(tb, i) SEGTAB(tb)[SLOT_IX_TO_SEG_IX(i)]->buckets[(i) & EXT_SEGSZ_MASK]
 
+#ifdef DEBUG
+#  define DBG_BUCKET_INACTIVE ((HashDbTerm*)0xdead5107)
+#endif
+
+
 /*
  * When deleting a table, the number of records to delete.
  * Approximate number, because we must delete entire buckets.
@@ -377,7 +380,7 @@ typedef int (*extra_match_validator_t)(int keypos, Eterm match, Eterm guard, Ete
 */
 static struct ext_segtab* alloc_ext_segtab(DbTableHash* tb, unsigned seg_ix);
 static void alloc_seg(DbTableHash *tb);
-static int free_seg(DbTableHash *tb, int free_records);
+static int free_seg(DbTableHash *tb);
 static HashDbTerm* next_live(DbTableHash *tb, Uint *iptr, erts_rwmtx_t** lck_ptr,
 			     HashDbTerm *list);
 static HashDbTerm* search_list(DbTableHash* tb, Eterm key, 
@@ -476,10 +479,8 @@ static int db_raw_next_hash(Process* p, DbTable *tbl, Eterm key, Eterm *ret);
 
 static ERTS_INLINE void try_shrink(DbTableHash* tb)
 {
-    int nactive = NACTIVE(tb);
     int nitems = NITEMS(tb);
-    if (nactive > FIRST_SEGSZ && nitems < SHRINK_LIMIT(nactive)
-	&& !IS_FIXED(tb)) {
+    if (nitems < SHRINK_LIMIT(tb) && !IS_FIXED(tb)) {
 	shrink(tb, nitems);
     }
 }	
@@ -693,6 +694,7 @@ int db_create_hash(Process *p, DbTable *tbl)
 
     erts_atomic_init_nob(&tb->szm, FIRST_SEGSZ_MASK);
     erts_atomic_init_nob(&tb->nactive, FIRST_SEGSZ);
+    erts_atomic_init_nob(&tb->shrink_limit, 0);
     erts_atomic_init_nob(&tb->fixdel, (erts_aint_t)NULL);
     erts_atomic_init_nob(&tb->segtab, (erts_aint_t)NULL);
     SET_SEGTAB(tb, tb->first_segtab);
@@ -779,7 +781,7 @@ static int db_next_hash(Process *p, DbTable *tbl, Eterm key, Eterm *ret)
     b = next_live(tb, &ix, &lck, b->next);
     if (tb->common.status & (DB_BAG | DB_DUPLICATE_BAG)) {
 	while (b != 0) {
-	    if (!has_key(tb, b, key, hval) && !is_pseudo_deleted(b)) {
+	    if (!has_key(tb, b, key, hval)) {
 		break;
 	    }
 	    b = next_live(tb, &ix, &lck, b->next);
@@ -789,6 +791,7 @@ static int db_next_hash(Process *p, DbTable *tbl, Eterm key, Eterm *ret)
 	*ret = am_EOT;
     }
     else {
+        ASSERT(!is_pseudo_deleted(b));
 	*ret = db_copy_key(p, tbl, &b->dbterm);
 	RUNLOCK_HASH(lck);
     }    
@@ -2474,7 +2477,7 @@ static SWord db_free_table_continue_hash(DbTable *tbl, SWord reds)
     erts_atomic_set_relb(&tb->fixdel, (erts_aint_t)NULL);
 
     while(tb->nslots != 0) {
-	reds -= EXT_SEGSZ/64 + free_seg(tb, 1);
+	reds -= EXT_SEGSZ/64 + free_seg(tb);
 
 	/*
 	 * If we have done enough work, get out here.
@@ -2672,6 +2675,34 @@ static struct ext_segtab* alloc_ext_segtab(DbTableHash* tb, unsigned seg_ix)
     return est;
 }
 
+static void calc_shrink_limit(DbTableHash* tb)
+{
+    erts_aint_t shrink_limit;
+
+    if (tb->nslots >= (FIRST_SEGSZ + 2*EXT_SEGSZ)) {
+        /*
+         * Start shrinking when we can remove one extra segment
+         * and still remain below 50% load.
+         */
+        shrink_limit = (tb->nslots - EXT_SEGSZ) / 2;
+    }
+    else {
+        /*
+         * But don't shrink below two segments.
+         * Why? To have a chance of getting rid of the last extra segment,
+         * and rehash it into the first small segment, we would either have to
+         * start early and do speculative joining of buckets, or have to join a
+         * lot of buckets during each delete-op.
+         *
+         * Instead keep segment #2 once allocated. It is also a good bet that
+         * a shrinking large table will grow large again.
+         */
+        shrink_limit = 0;
+    }
+    erts_atomic_set_nob(&tb->shrink_limit, shrink_limit);
+}
+
+
 /* Extend table with one new segment
 */
 static void alloc_seg(DbTableHash *tb)
@@ -2690,8 +2721,17 @@ static void alloc_seg(DbTableHash *tb)
     segtab[seg_ix] = (struct segment*) erts_db_alloc(ERTS_ALC_T_DB_SEG,
                                                      (DbTable *) tb,
                                                      SIZEOF_SEGMENT(EXT_SEGSZ));
-    sys_memset(segtab[seg_ix], 0, SIZEOF_SEGMENT(EXT_SEGSZ));
+#ifdef DEBUG
+    {
+        int i;
+        for (i = 0; i < EXT_SEGSZ; i++) {
+            segtab[seg_ix]->buckets[i] = DBG_BUCKET_INACTIVE;
+        }
+    }
+#endif
     tb->nslots += EXT_SEGSZ;
+
+    calc_shrink_limit(tb);
 }
 
 static void dealloc_ext_segtab(void* lop_data)
@@ -2701,10 +2741,19 @@ static void dealloc_ext_segtab(void* lop_data)
     erts_free(ERTS_ALC_T_DB_SEG, est);
 }
 
-/* Shrink table by freeing the top segment
+struct dealloc_seg_ops {
+    struct segment* segp;
+    Uint seg_sz;
+
+    struct ext_segtab* est;
+};
+
+/* Shrink table by removing the top segment
 ** free_records: 1=free any records in segment, 0=assume segment is empty 
+** ds_ops: (out) Instructions for dealloc_seg().
 */
-static int free_seg(DbTableHash *tb, int free_records)
+static int remove_seg(DbTableHash *tb, int free_records,
+                      struct dealloc_seg_ops *ds_ops)
 {
     const int seg_ix = SLOT_IX_TO_SEG_IX(tb->nslots) - 1;
     struct segment** const segtab = SEGTAB(tb);
@@ -2712,24 +2761,47 @@ static int free_seg(DbTableHash *tb, int free_records)
     Uint seg_sz;
     int nrecords = 0;
 
+    ERTS_LC_ASSERT(IS_TAB_WLOCKED(tb) || tb->common.status & DB_DELETE
+                   || erts_atomic_read_nob(&tb->is_resizing));
+
     ASSERT(segp != NULL);
-#ifndef DEBUG
-    if (free_records)
-#endif
-    {	
-	int i = (seg_ix == 0) ? FIRST_SEGSZ : EXT_SEGSZ;
-	while (i--) {
-	    HashDbTerm* p = segp->buckets[i];
+    if (free_records) {
+        int ix, n;
+        if (seg_ix == 0) {
+            /* First segment (always fully active) */
+            n = FIRST_SEGSZ;
+            ix = FIRST_SEGSZ-1;
+        }
+        else if (NACTIVE(tb) < tb->nslots) {
+            /* Last extended segment partially active */
+            n = (NACTIVE(tb) - FIRST_SEGSZ) & EXT_SEGSZ_MASK;
+            ix = (NACTIVE(tb)-1) & EXT_SEGSZ_MASK;
+        }
+        else {
+            /* Full extended segment */
+            n = EXT_SEGSZ;
+            ix = EXT_SEGSZ - 1;
+        }
+        for ( ; n > 0; n--, ix--) {
+	    HashDbTerm* p = segp->buckets[ix & EXT_SEGSZ_MASK];
 	    while(p != 0) {		
 		HashDbTerm* nxt = p->next;
-		ASSERT(free_records); /* segment not empty as assumed? */
 		free_term(tb, p);
 		p = nxt;
 		++nrecords;
 	    }
 	}
     }
-    
+#ifdef DEBUG
+    else {
+        int ix = (seg_ix == 0) ? FIRST_SEGSZ-1 : EXT_SEGSZ-1;
+        for ( ; ix >= 0; ix--) {
+            ASSERT(segp->buckets[ix] == DBG_BUCKET_INACTIVE);
+        }
+    }
+#endif
+
+    ds_ops->est = NULL;
     if (seg_ix >= NSEG_1) {
         struct ext_segtab* est = ErtsContainerStruct_(segtab,struct ext_segtab,segtab);
 
@@ -2738,35 +2810,64 @@ static int free_seg(DbTableHash *tb, int free_records)
             SET_SEGTAB(tb, est->prev_segtab);
             tb->nsegs = est->prev_nsegs;
 
-            if (!tb->common.is_thread_safe) {
-                /*
-                 * Table is doing a graceful shrink operation and we must avoid
-                 * deallocating this segtab while it may still be read by other
-                 * threads. Schedule deallocation with thread progress to make
-                 * sure no lingering threads are still hanging in BUCKET macro
-                 * with an old segtab pointer.
-                 */
-                erts_schedule_db_free(&tb->common, dealloc_ext_segtab,
-                                      est, &est->lop,
-                                      SIZEOF_EXT_SEGTAB(est->nsegs));
-            }
-            else
-                erts_db_free(ERTS_ALC_T_DB_SEG, (DbTable*)tb, est,
-                             SIZEOF_EXT_SEGTAB(est->nsegs));
+            ds_ops->est = est;
         }
     }
+
     seg_sz = (seg_ix == 0) ? FIRST_SEGSZ : EXT_SEGSZ;
-    erts_db_free(ERTS_ALC_T_DB_SEG, (DbTable *)tb, segp, SIZEOF_SEGMENT(seg_sz));
+    tb->nslots -= seg_sz;
+    ASSERT(tb->nslots >= 0);
+
+    ds_ops->segp = segp;
+    ds_ops->seg_sz = seg_sz;
     
 #ifdef DEBUG
     if (seg_ix < tb->nsegs)
         SEGTAB(tb)[seg_ix] = NULL;
 #endif
-    tb->nslots -= seg_sz;
-    ASSERT(tb->nslots >= 0);
+    calc_shrink_limit(tb);
     return nrecords;
 }
 
+/*
+ * Deallocate segment removed by remove_seg()
+ */
+static void dealloc_seg(DbTableHash *tb, struct dealloc_seg_ops* ds_ops)
+{
+    struct ext_segtab* est = ds_ops->est;
+
+    if (est) {
+        if (!tb->common.is_thread_safe) {
+            /*
+             * Table is doing a graceful shrink operation and we must avoid
+             * deallocating this segtab while it may still be read by other
+             * threads. Schedule deallocation with thread progress to make
+             * sure no lingering threads are still hanging in BUCKET macro
+             * with an old segtab pointer.
+             */
+            erts_schedule_db_free(&tb->common, dealloc_ext_segtab,
+                                  est, &est->lop,
+                                  SIZEOF_EXT_SEGTAB(est->nsegs));
+        }
+        else
+            erts_db_free(ERTS_ALC_T_DB_SEG, (DbTable*)tb, est,
+                         SIZEOF_EXT_SEGTAB(est->nsegs));
+    }
+
+    erts_db_free(ERTS_ALC_T_DB_SEG, (DbTable *)tb,
+                 ds_ops->segp, SIZEOF_SEGMENT(ds_ops->seg_sz));
+}
+
+/* Remove and deallocate top segment and all its contained objects */
+static int free_seg(DbTableHash *tb)
+{
+    struct dealloc_seg_ops ds_ops;
+    int reds;
+
+    reds = remove_seg(tb, 1, &ds_ops);
+    dealloc_seg(tb, &ds_ops);
+    return reds;
+}
 
 /*
 ** Copy terms from ptr1 until ptr2
@@ -2888,6 +2989,7 @@ static void grow(DbTableHash* tb, int nitems)
         pnext = &BUCKET(tb, from_ix);
         p = *pnext;
         to_pnext = &BUCKET(tb, to_ix);
+        ASSERT(*to_pnext == DBG_BUCKET_INACTIVE);
         while (p != NULL) {
             if (is_pseudo_deleted(p)) { /* rare but possible with fine locking */
                 *pnext = p->next;
@@ -2924,19 +3026,21 @@ abort:
 */
 static void shrink(DbTableHash* tb, int nitems)
 {
-    HashDbTerm** src_bp;
-    HashDbTerm** dst_bp;
+    struct dealloc_seg_ops ds_ops;
+    HashDbTerm* src;
+    HashDbTerm* tail;
     HashDbTerm** bp;
     erts_rwmtx_t* lck;
     int src_ix, dst_ix, low_szm;
     int nactive;
     int loop_limit = 5;
 
+    ds_ops.segp = NULL;
     do {
         if (!begin_resizing(tb))
             return; /* already in progress */
         nactive = NACTIVE(tb);
-        if (!(nactive > FIRST_SEGSZ && nitems < SHRINK_LIMIT(nactive))) {
+        if (!(nitems < SHRINK_LIMIT(tb))) {
             goto abort; /* already done (race) */
         }
         src_ix = nactive - 1;
@@ -2953,41 +3057,49 @@ static void shrink(DbTableHash* tb, int nitems)
             goto abort;
         }
 
-        src_bp = &BUCKET(tb, src_ix);
-        dst_bp = &BUCKET(tb, dst_ix);
-        bp = src_bp;
-
-        /*
-         * We join lists by appending "dst" at the end of "src"
-         * as we must step through "src" anyway to purge pseudo deleted.
-         */
-        while(*bp != NULL) {
-            if (is_pseudo_deleted(*bp)) {
-                HashDbTerm* deleted = *bp;
-                *bp = deleted->next;
-                free_term(tb, deleted);
-            } else {
-                bp = &(*bp)->next;
-            }
-        }
-        *bp = *dst_bp;
-        *dst_bp = *src_bp;
-        *src_bp = NULL;
-
+        src = BUCKET(tb, src_ix);
+#ifdef DEBUG
+        BUCKET(tb, src_ix) = DBG_BUCKET_INACTIVE;
+#endif
         nactive = src_ix;
         erts_atomic_set_nob(&tb->nactive, nactive);
         if (dst_ix == 0) {
             erts_atomic_set_relb(&tb->szm, low_szm);
         }
-        WUNLOCK_HASH(lck);
-
         if (tb->nslots - src_ix >= EXT_SEGSZ) {
-            free_seg(tb, 0);
+            remove_seg(tb, 0, &ds_ops);
         }
         done_resizing(tb);
 
-    } while (--loop_limit
-             && nactive > FIRST_SEGSZ && nitems < SHRINK_LIMIT(nactive));
+        if (src) {
+            /*
+             * We join buckets by appending "dst" list at the end of "src" list
+             * as we must step through "src" anyway to purge pseudo deleted.
+             */
+            bp = &BUCKET(tb, dst_ix);
+            tail = *bp;
+            *bp = src;
+
+            while(*bp != NULL) {
+                if (is_pseudo_deleted(*bp)) {
+                    HashDbTerm* deleted = *bp;
+                    *bp = deleted->next;
+                    free_term(tb, deleted);
+                } else {
+                    bp = &(*bp)->next;
+                }
+            }
+            *bp = tail;
+        }
+
+        WUNLOCK_HASH(lck);
+
+        if (ds_ops.segp) {
+            dealloc_seg(tb, &ds_ops);
+            ds_ops.segp = NULL;
+        }
+
+    } while (--loop_limit && nitems < SHRINK_LIMIT(tb));
     return;
 
 abort:
diff --git a/erts/emulator/beam/erl_db_hash.h b/erts/emulator/beam/erl_db_hash.h
index 9759d8b466..b26b82056f 100644
--- a/erts/emulator/beam/erl_db_hash.h
+++ b/erts/emulator/beam/erl_db_hash.h
@@ -63,9 +63,10 @@ typedef struct db_table_hash_fine_locks {
 typedef struct db_table_hash {
     DbTableCommon common;
 
-    /* SMP: szm and nactive are write-protected by is_resizing or table write lock */
+    /* szm, nactive, shrink_limit are write-protected by is_resizing or table write lock */
     erts_atomic_t szm;     /* current size mask. */
     erts_atomic_t nactive; /* Number of "active" slots */
+    erts_atomic_t shrink_limit; /* Shrink table when fewer objects than this */
 
     erts_atomic_t segtab;  /* The segment table (struct segment**) */
     struct segment* first_segtab[1];
diff --git a/erts/emulator/beam/erl_db_util.c b/erts/emulator/beam/erl_db_util.c
index ed09a34ae4..096cb8a778 100644
--- a/erts/emulator/beam/erl_db_util.c
+++ b/erts/emulator/beam/erl_db_util.c
@@ -2612,7 +2612,10 @@ restart:
 	    break;
         case matchCaller:
             ASSERT(c_p == self);
-	    if (!(c_p->cp) || !(cp = find_function_from_pc(c_p->cp))) {
+            t = c_p->stop[0];
+            if (is_not_CP(t)) {
+                *esp++ = am_undefined;
+            } else if (!(cp = find_function_from_pc(cp_val(t)))) {
  		*esp++ = am_undefined;
  	    } else {
 		ehp = HAllocX(build_proc, 4, HEAP_XTRA);
@@ -5218,7 +5221,7 @@ static Eterm match_spec_test(Process *p, Eterm against, Eterm spec, int trace)
     Eterm l;
     Uint32 ret_flags;
     Uint sz;
-    BeamInstr *save_cp;
+    Eterm save_cp;
 
     if (trace && !(is_list(against) || against == NIL)) {
 	return THE_NON_VALUE;
@@ -5262,13 +5265,13 @@ static Eterm match_spec_test(Process *p, Eterm against, Eterm spec, int trace)
 		++n;
 		l = CDR(list_val(l));
 	    }
-	    save_cp = p->cp;
-	    p->cp = NULL;
+	    save_cp = p->stop[0];
+	    p->stop[0] = NIL;
 	    res = erts_match_set_run_trace(p, p,
                       mps, arr, n,
 		      ERTS_PAM_COPY_RESULT|ERTS_PAM_IGNORE_TRACE_SILENT,
 		      &ret_flags);
-	    p->cp = save_cp;
+	    p->stop[0] = save_cp;
 	} else {
 	    n = 0;
 	    arr = NULL;
diff --git a/erts/emulator/beam/erl_fun.c b/erts/emulator/beam/erl_fun.c
index 9c866250bb..79a1fdb8b9 100644
--- a/erts/emulator/beam/erl_fun.c
+++ b/erts/emulator/beam/erl_fun.c
@@ -100,27 +100,6 @@ int erts_fun_table_sz(void)
 }
 
 ErlFunEntry*
-erts_put_fun_entry(Eterm mod, int uniq, int index)
-{
-    ErlFunEntry template;
-    ErlFunEntry* fe;
-    erts_aint_t refc;
-    ASSERT(is_atom(mod));
-    template.old_uniq = uniq;
-    template.old_index = index;
-    template.module = mod;
-    erts_fun_write_lock();
-    fe = (ErlFunEntry *) hash_put(&erts_fun_table, (void*) &template);
-    sys_memset(fe->uniq, 0, sizeof(fe->uniq));
-    fe->index = 0;
-    refc = erts_refc_inctest(&fe->refc, 0);
-    if (refc < 2) /* New or pending delete */
-	erts_refc_inc(&fe->refc, 1);
-    erts_fun_write_unlock();
-    return fe;
-}
-
-ErlFunEntry*
 erts_put_fun_entry2(Eterm mod, int old_uniq, int old_index,
 		    byte* uniq, int index, int arity)
 {
@@ -130,12 +109,12 @@ erts_put_fun_entry2(Eterm mod, int old_uniq, int old_index,
 
     ASSERT(is_atom(mod));
     template.old_uniq = old_uniq;
-    template.old_index = old_index;
+    template.index = index;
     template.module = mod;
     erts_fun_write_lock();
     fe = (ErlFunEntry *) hash_put(&erts_fun_table, (void*) &template);
     sys_memcpy(fe->uniq, uniq, sizeof(fe->uniq));
-    fe->index = index;
+    fe->old_index = old_index;
     fe->arity = arity;
     refc = erts_refc_inctest(&fe->refc, 0);
     if (refc < 2) /* New or pending delete */
@@ -144,13 +123,6 @@ erts_put_fun_entry2(Eterm mod, int old_uniq, int old_index,
     return fe;
 }
 
-struct my_key {
-    Eterm mod;
-    byte* uniq;
-    int index;
-    ErlFunEntry* fe;
-};
-
 ErlFunEntry*
 erts_get_fun_entry(Eterm mod, int uniq, int index)
 {
@@ -159,7 +131,7 @@ erts_get_fun_entry(Eterm mod, int uniq, int index)
 
     ASSERT(is_atom(mod));
     template.old_uniq = uniq;
-    template.old_index = index;
+    template.index = index;
     template.module = mod;
     erts_fun_read_lock();
     ret = (ErlFunEntry *) hash_get(&erts_fun_table, (void*) &template);
@@ -199,36 +171,33 @@ erts_erase_fun_entry(ErlFunEntry* fe)
     erts_fun_write_unlock();
 }
 
+struct fun_purge_foreach_args {
+    BeamInstr *start;
+    BeamInstr *end;
+};
+
+static void fun_purge_foreach(ErlFunEntry *fe, struct fun_purge_foreach_args *arg)
+{
+    BeamInstr* addr = fe->address;
+    if (arg->start <= addr && addr < arg->end) {
+        fe->pend_purge_address = addr;
+        ERTS_THR_WRITE_MEMORY_BARRIER;
+        fe->address = unloaded_fun;
+#ifdef HIPE
+        fe->pend_purge_native_address = fe->native_address;
+        hipe_set_closure_stub(fe);
+#endif
+        erts_purge_state_add_fun(fe);
+    }
+}
+
 void
 erts_fun_purge_prepare(BeamInstr* start, BeamInstr* end)
 {
-    int limit;
-    HashBucket** bucket;
-    int i;
+    struct fun_purge_foreach_args args = {start, end};
 
     erts_fun_read_lock();
-    limit = erts_fun_table.size;
-    bucket = erts_fun_table.bucket;
-    for (i = 0; i < limit; i++) {
-	HashBucket* b = bucket[i];
-
-	while (b) {
-	    ErlFunEntry* fe = (ErlFunEntry *) b;
-	    BeamInstr* addr = fe->address;
-
-	    if (start <= addr && addr < end) {
-		fe->pend_purge_address = addr;
-		ERTS_THR_WRITE_MEMORY_BARRIER;
-		fe->address = unloaded_fun;
-#ifdef HIPE
-                fe->pend_purge_native_address = fe->native_address;
-                hipe_set_closure_stub(fe);
-#endif
-		erts_purge_state_add_fun(fe);
-	    }
-	    b = b->next;
-	}
-    }
+    hash_foreach(&erts_fun_table, (HFOREACH_FUN)fun_purge_foreach, &args);
     erts_fun_read_unlock();
 }
 
@@ -278,36 +247,34 @@ erts_fun_purge_complete(ErlFunEntry **funs, Uint no)
     ERTS_THR_WRITE_MEMORY_BARRIER;
 }
 
+struct dump_fun_foreach_args {
+    fmtfn_t to;
+    void *to_arg;
+};
+
+static void
+dump_fun_foreach(ErlFunEntry *fe, struct dump_fun_foreach_args *args)
+{
+    erts_print(args->to, args->to_arg, "=fun\n");
+    erts_print(args->to, args->to_arg, "Module: %T\n", fe->module);
+    erts_print(args->to, args->to_arg, "Uniq: %d\n", fe->old_uniq);
+    erts_print(args->to, args->to_arg, "Index: %d\n",fe->old_index);
+    erts_print(args->to, args->to_arg, "Address: %p\n", fe->address);
+#ifdef HIPE
+    erts_print(args->to, args->to_arg, "Native_address: %p\n", fe->native_address);
+#endif
+    erts_print(args->to, args->to_arg, "Refc: %ld\n", erts_refc_read(&fe->refc, 1));
+}
+
 void
 erts_dump_fun_entries(fmtfn_t to, void *to_arg)
 {
-    int limit;
-    HashBucket** bucket;
-    int i;
+    struct dump_fun_foreach_args args = {to, to_arg};
     int lock = !ERTS_IS_CRASH_DUMPING;
 
-
     if (lock)
 	erts_fun_read_lock();
-    limit = erts_fun_table.size;
-    bucket = erts_fun_table.bucket;
-    for (i = 0; i < limit; i++) {
-	HashBucket* b = bucket[i];
-
-	while (b) {
-	    ErlFunEntry* fe = (ErlFunEntry *) b;
-	    erts_print(to, to_arg, "=fun\n");
-	    erts_print(to, to_arg, "Module: %T\n", fe->module);
-	    erts_print(to, to_arg, "Uniq: %d\n", fe->old_uniq);
-	    erts_print(to, to_arg, "Index: %d\n",fe->old_index);
-	    erts_print(to, to_arg, "Address: %p\n", fe->address);
-#ifdef HIPE
-	    erts_print(to, to_arg, "Native_address: %p\n", fe->native_address);
-#endif
-	    erts_print(to, to_arg, "Refc: %ld\n", erts_refc_read(&fe->refc, 1));
-	    b = b->next;
-	}
-    }
+    hash_foreach(&erts_fun_table, (HFOREACH_FUN)dump_fun_foreach, &args);
     if (lock)
 	erts_fun_read_unlock();
 }
@@ -315,15 +282,27 @@ erts_dump_fun_entries(fmtfn_t to, void *to_arg)
 static HashValue
 fun_hash(ErlFunEntry* obj)
 {
-    return (HashValue) (obj->old_uniq ^ obj->old_index ^ atom_val(obj->module));
+    return (HashValue) (obj->old_uniq ^ obj->index ^ atom_val(obj->module));
 }
 
 static int
 fun_cmp(ErlFunEntry* obj1, ErlFunEntry* obj2)
 {
-    return !(obj1->module == obj2->module && 
+    /*
+     * OTP 23: Use 'index' (instead of 'old_index') when comparing fun
+     * entries. In OTP 23, multiple make_fun2 instructions may refer to the
+     * same 'index' (for the wrapper function generated for the
+     * 'fun F/A' syntax).
+     *
+     * This is safe when loading code compiled with OTP R15 and later,
+     * because since R15 (2011), the 'index' has been reliably equal
+     * to 'old_index'. The loader refuses to load modules compiled before
+     * OTP R15.
+     */
+
+    return !(obj1->module == obj2->module &&
 	     obj1->old_uniq == obj2->old_uniq &&
-	     obj1->old_index == obj2->old_index);
+	     obj1->index == obj2->index);
 }
 
 static ErlFunEntry*
@@ -333,7 +312,7 @@ fun_alloc(ErlFunEntry* template)
 						  sizeof(ErlFunEntry));
 
     obj->old_uniq = template->old_uniq;
-    obj->old_index = template->old_index;
+    obj->index = template->index;
     obj->module = template->module;
     erts_refc_init(&obj->refc, -1);
     obj->address = unloaded_fun;
diff --git a/erts/emulator/beam/erl_fun.h b/erts/emulator/beam/erl_fun.h
index fb2901d866..eefc7a95bb 100644
--- a/erts/emulator/beam/erl_fun.h
+++ b/erts/emulator/beam/erl_fun.h
@@ -74,7 +74,6 @@ void erts_init_fun_table(void);
 void erts_fun_info(fmtfn_t, void *);
 int erts_fun_table_sz(void);
 
-ErlFunEntry* erts_put_fun_entry(Eterm mod, int uniq, int index);
 ErlFunEntry* erts_get_fun_entry(Eterm mod, int uniq, int index);
 
 ErlFunEntry* erts_put_fun_entry2(Eterm mod, int old_uniq, int old_index,
diff --git a/erts/emulator/beam/erl_gc.c b/erts/emulator/beam/erl_gc.c
index 13b1f8ab4d..4a6f204cb5 100644
--- a/erts/emulator/beam/erl_gc.c
+++ b/erts/emulator/beam/erl_gc.c
@@ -65,6 +65,8 @@
 #  define HARDDEBUG 1
 #endif
 
+extern BeamInstr beam_apply[2];
+
 /*
  * Returns number of elements in an array.
  */
@@ -934,13 +936,15 @@ garbage_collect_hibernate(Process* p, int check_long_gc)
      */
     erts_atomic32_read_bor_nob(&p->state, ERTS_PSFLG_GC);
     ErtsGcQuickSanityCheck(p);
-    ASSERT(p->stop == p->hend);	/* Stack must be empty. */
+    ASSERT(p->stop == p->hend - 1); /* Only allow one continuation pointer. */
+    ASSERT(p->stop[0] == make_cp(beam_apply+1));
 
     /*
      * Do it.
      */
 
     heap_size = p->heap_sz + (p->old_htop - p->old_heap) + p->mbuf_sz;
+    heap_size += 1;             /* Reserve place for continuation pointer */
 
     heap = (Eterm*) ERTS_HEAP_ALLOC(ERTS_ALC_T_TMP_HEAP,
 				    sizeof(Eterm)*heap_size);
@@ -966,13 +970,11 @@ garbage_collect_hibernate(Process* p, int check_long_gc)
     p->high_water = htop;
     p->htop = htop;
     p->hend = p->heap + heap_size;
-    p->stop = p->hend;
+    p->stop = p->hend - 1;
     p->heap_sz = heap_size;
 
     heap_size = actual_size = p->htop - p->heap;
-    if (heap_size == 0) {
-	heap_size = 1; /* We want a heap... */
-    }
+    heap_size += 1;             /* Reserve place for continuation pointer */
 
     FLAGS(p) &= ~F_FORCE_GC;
     p->live_hf_end = ERTS_INVALID_HFRAG_PTR;
@@ -988,14 +990,15 @@ garbage_collect_hibernate(Process* p, int check_long_gc)
      * hibernated.
      */
 
-    ASSERT(p->hend - p->stop == 0); /* Empty stack */
     ASSERT(actual_size < p->heap_sz);
 
     heap = ERTS_HEAP_ALLOC(ERTS_ALC_T_HEAP, sizeof(Eterm)*heap_size);
     sys_memcpy((void *) heap, (void *) p->heap, actual_size*sizeof(Eterm));
     ERTS_HEAP_FREE(ERTS_ALC_T_TMP_HEAP, p->heap, p->heap_sz*sizeof(Eterm));
 
-    p->stop = p->hend = heap + heap_size;
+    p->hend = heap + heap_size;
+    p->stop = p->hend - 1;
+    p->stop[0] = make_cp(beam_apply+1);
 
     offs = heap - p->heap;
     area = (char *) p->heap;
@@ -2585,7 +2588,7 @@ setup_rootset(Process *p, Eterm *objv, int nobj, Rootset *rootset)
     /*
      * If a NIF or BIF has saved arguments, they need to be added
      */
-    if (erts_setup_nif_export_rootset(p, &roots[n].v, &roots[n].sz))
+    if (erts_setup_nfunc_rootset(p, &roots[n].v, &roots[n].sz))
 	n++;
 
     ASSERT(n <= rootset->size);
@@ -3233,7 +3236,7 @@ offset_one_rootset(Process *p, Sint offs, char* area, Uint area_size,
 	offset_heap_ptr(objv, nobj, offs, area, area_size);
     }
     offset_off_heap(p, offs, area, area_size);
-    if (erts_setup_nif_export_rootset(p, &v, &sz))
+    if (erts_setup_nfunc_rootset(p, &v, &sz))
 	offset_heap_ptr(v, sz, offs, area, area_size);
 }
 
diff --git a/erts/emulator/beam/erl_init.c b/erts/emulator/beam/erl_init.c
index 547e4064a2..4d0ebbd1ed 100644
--- a/erts/emulator/beam/erl_init.c
+++ b/erts/emulator/beam/erl_init.c
@@ -354,6 +354,7 @@ erl_init(int ncpu,
     erts_init_bif_chksum();
     erts_init_bif_binary();
     erts_init_bif_guard();
+    erts_init_bif_lists();
     erts_init_bif_persistent_term();
     erts_init_bif_re();
     erts_init_unicode(); /* after RE to get access to PCRE unicode */
diff --git a/erts/emulator/beam/erl_message.c b/erts/emulator/beam/erl_message.c
index 1bebf6efe2..42a07a59d6 100644
--- a/erts/emulator/beam/erl_message.c
+++ b/erts/emulator/beam/erl_message.c
@@ -674,7 +674,7 @@ erts_send_message(Process* sender,
          * Make sure we don't use the heap between those instances.
          */
         if (have_seqtrace(stoken)) {
-	    seq_trace_update_send(sender);
+	    seq_trace_update_serial(sender);
 	    seq_trace_output(stoken, message, SEQ_TRACE_SEND,
 			     receiver->common.id, sender);
 
diff --git a/erts/emulator/beam/erl_nfunc_sched.c b/erts/emulator/beam/erl_nfunc_sched.c
index b8cf2bee0e..8263a6e9b7 100644
--- a/erts/emulator/beam/erl_nfunc_sched.c
+++ b/erts/emulator/beam/erl_nfunc_sched.c
@@ -30,77 +30,37 @@
 #include "erl_nfunc_sched.h"
 #include "erl_trace.h"
 
-NifExport *
-erts_new_proc_nif_export(Process *c_p, int argc)
+ErtsNativeFunc *
+erts_new_proc_nfunc(Process *c_p, int argc)
 {
+    ErtsNativeFunc *nep, *old_nep;
     size_t size;
-    int i;
-    NifExport *nep, *old_nep;
-
-    size = sizeof(NifExport) + (argc-1)*sizeof(Eterm);
-    nep = erts_alloc(ERTS_ALC_T_NIF_TRAP_EXPORT, size);
 
-    for (i = 0; i < ERTS_NUM_CODE_IX; i++)
-	nep->exp.addressv[i] = &nep->exp.beam[0];
+    size = sizeof(ErtsNativeFunc) + (argc-1)*sizeof(Eterm);
+    nep = erts_alloc(ERTS_ALC_T_NFUNC_TRAP_WRAPPER, size);
 
     nep->argc = -1; /* unused marker */
     nep->argv_size = argc;
-    nep->trace = NULL;
-    old_nep = ERTS_PROC_SET_NIF_TRAP_EXPORT(c_p, nep);
+    old_nep = ERTS_PROC_SET_NFUNC_TRAP_WRAPPER(c_p, nep);
     if (old_nep) {
-	ASSERT(!nep->trace);
-	erts_free(ERTS_ALC_T_NIF_TRAP_EXPORT, old_nep);
+	erts_free(ERTS_ALC_T_NFUNC_TRAP_WRAPPER, old_nep);
     }
     return nep;
 }
 
 void
-erts_destroy_nif_export(Process *p)
+erts_destroy_nfunc(Process *p)
 {
-    NifExport *nep = ERTS_PROC_SET_NIF_TRAP_EXPORT(p, NULL);
+    ErtsNativeFunc *nep = ERTS_PROC_SET_NFUNC_TRAP_WRAPPER(p, NULL);
     if (nep) {
 	if (nep->m)
-	    erts_nif_export_cleanup_nif_mod(nep);
-	erts_free(ERTS_ALC_T_NIF_TRAP_EXPORT, nep);
+	    erts_nfunc_cleanup_nif_mod(nep);
+	erts_free(ERTS_ALC_T_NFUNC_TRAP_WRAPPER, nep);
     }
 }
 
-void
-erts_nif_export_save_trace(Process *c_p, NifExport *nep, int applying,
-			   Export* ep, BeamInstr *cp, Uint32 flags,
-			   Uint32 flags_meta, BeamInstr* I,
-			   ErtsTracer meta_tracer)
-{
-    NifExportTrace *netp;
-    ASSERT(nep && nep->argc >= 0);
-    ASSERT(!nep->trace);
-    netp = erts_alloc(ERTS_ALC_T_NIF_EXP_TRACE,
-		      sizeof(NifExportTrace));
-    netp->applying = applying;
-    netp->ep = ep;
-    netp->cp = cp;
-    netp->flags = flags;
-    netp->flags_meta = flags_meta;
-    netp->I = I;
-    netp->meta_tracer = NIL;
-    erts_tracer_update(&netp->meta_tracer, meta_tracer);
-    nep->trace = netp;
-}
-
-void
-erts_nif_export_restore_trace(Process *c_p, Eterm result, NifExport *nep)
-{
-    NifExportTrace *netp = nep->trace;
-    nep->trace = NULL;
-    erts_bif_trace_epilogue(c_p, result, netp->applying, netp->ep,
-			    netp->cp, netp->flags, netp->flags_meta,
-			    netp->I, netp->meta_tracer);
-    erts_tracer_update(&netp->meta_tracer, NIL);
-    erts_free(ERTS_ALC_T_NIF_EXP_TRACE, netp);
-}
-
-NifExport *
-erts_nif_export_schedule(Process *c_p, Process *dirty_shadow_proc,
+ErtsNativeFunc *
+erts_nfunc_schedule(Process *c_p, Process *dirty_shadow_proc,
 			 ErtsCodeMFA *mfa, BeamInstr *pc,
 			 BeamInstr instr,
 			 void *dfunc, void *ifunc,
@@ -110,7 +70,7 @@ erts_nif_export_schedule(Process *c_p, Process *dirty_shadow_proc,
     Process *used_proc;
     ErtsSchedulerData *esdp;
     Eterm* reg;
-    NifExport* nep;
+    ErtsNativeFunc* nep;
     int i;
 
     ERTS_LC_ASSERT(erts_proc_lc_my_proc_locks(c_p)
@@ -133,10 +93,10 @@ erts_nif_export_schedule(Process *c_p, Process *dirty_shadow_proc,
     reg = esdp->x_reg_array;
 
     if (mfa)
-	nep = erts_get_proc_nif_export(c_p, (int) mfa->arity);
+	nep = erts_get_proc_nfunc(c_p, (int) mfa->arity);
     else {
 	/* If no mfa, this is not the first schedule... */
-	nep = ERTS_PROC_GET_NIF_TRAP_EXPORT(c_p);
+	nep = ERTS_PROC_GET_NFUNC_TRAP_WRAPPER(c_p);
 	ASSERT(nep && nep->argc >= 0);
     }
 
@@ -148,16 +108,15 @@ erts_nif_export_schedule(Process *c_p, Process *dirty_shadow_proc,
 	for (i = 0; i < (int) mfa->arity; i++)
 	    nep->argv[i] = reg[i];
 	nep->pc = pc;
-	nep->cp = c_p->cp;
 	nep->mfa = mfa;
 	nep->current = c_p->current;
 	ASSERT(argc >= 0);
 	nep->argc = (int) mfa->arity;
 	nep->m = NULL;
 
-	ASSERT(!erts_check_nif_export_in_area(c_p,
+	ASSERT(!erts_check_nfunc_in_area(c_p,
 					      (char *) nep,
-					      (sizeof(NifExport)
+					      (sizeof(ErtsNativeFunc)
 					       + (sizeof(Eterm)
 						  *(nep->argc-1)))));
     }
@@ -167,14 +126,14 @@ erts_nif_export_schedule(Process *c_p, Process *dirty_shadow_proc,
 	    reg[i] = argv[i];
     }
     ASSERT(is_atom(mod) && is_atom(func));
-    nep->exp.info.mfa.module = mod;
-    nep->exp.info.mfa.function = func;
-    nep->exp.info.mfa.arity = (Uint) argc;
-    nep->exp.beam[0] = (BeamInstr) instr; /* call_nif || apply_bif */
-    nep->exp.beam[1] = (BeamInstr) dfunc;
+    nep->trampoline.info.mfa.module = mod;
+    nep->trampoline.info.mfa.function = func;
+    nep->trampoline.info.mfa.arity = (Uint) argc;
+    nep->trampoline.call_op = (BeamInstr) instr; /* call_bif || call_nif */
+    nep->trampoline.dfunc = (BeamInstr) dfunc;
     nep->func = ifunc;
     used_proc->arity = argc;
     used_proc->freason = TRAP;
-    used_proc->i = (BeamInstr*) nep->exp.addressv[0];
+    used_proc->i = (BeamInstr*)&nep->trampoline.call_op;
     return nep;
 }
diff --git a/erts/emulator/beam/erl_nfunc_sched.h b/erts/emulator/beam/erl_nfunc_sched.h
index 1cb252eba5..4dae242d4f 100644
--- a/erts/emulator/beam/erl_nfunc_sched.h
+++ b/erts/emulator/beam/erl_nfunc_sched.h
@@ -25,92 +25,78 @@
 #include "bif.h"
 #include "error.h"
 
-typedef struct {
-    int applying;
-    Export* ep;
-    BeamInstr *cp;
-    Uint32 flags;
-    Uint32 flags_meta;
-    BeamInstr* I;
-    ErtsTracer meta_tracer;
-} NifExportTrace;
-
 /*
- * NIF exports need a few more items than the Export struct provides,
- * including the erl_module_nif* and a NIF function pointer, so the
- * NifExport below adds those. The Export member must be first in the
- * struct. A number of values are stored for error handling purposes
- * only.
+ * Native function wrappers are used to schedule native functions on both
+ * normal and dirty schedulers.
+ *
+ * A number of values are only stored for error handling, and the fields
+ * following `current` can be omitted when a wrapper is statically "scheduled"
+ * through placement in a function stub.
  *
- * 'argc' is >= 0 when NifExport is in use, and < 0 when not.
+ * 'argc' is >= 0 when ErtsNativeFunc is in use, and < 0 when not.
  */
 
 typedef struct {
-    Export exp;
+    struct {
+        ErtsCodeInfo info;
+        BeamInstr call_op; /* call_bif || call_nif */
+        BeamInstr dfunc;
+    } trampoline;
+
     struct erl_module_nif* m; /* NIF module, or NULL if BIF */
     void *func;		/* Indirect NIF or BIF to execute (may be unused) */
     ErtsCodeMFA *current;/* Current as set when originally called */
-    NifExportTrace *trace;
     /* --- The following is only used on error --- */
     BeamInstr *pc;	/* Program counter */
-    BeamInstr *cp;	/* Continuation pointer */
     ErtsCodeMFA *mfa;	/* MFA of original call */
     int argc;		/* Number of arguments in original call */
     int argv_size;	/* Allocated size of argv */
     Eterm argv[1];	/* Saved arguments from the original call */
-} NifExport;
-
-NifExport *erts_new_proc_nif_export(Process *c_p, int argc);
-void erts_nif_export_save_trace(Process *c_p, NifExport *nep, int applying,
-				Export* ep, BeamInstr *cp, Uint32 flags,
-				Uint32 flags_meta, BeamInstr* I,
-				ErtsTracer meta_tracer);
-void erts_nif_export_restore_trace(Process *c_p, Eterm result, NifExport *nep);
-void erts_destroy_nif_export(Process *p);
-NifExport *erts_nif_export_schedule(Process *c_p, Process *dirty_shadow_proc,
-				    ErtsCodeMFA *mfa, BeamInstr *pc,
-				    BeamInstr instr,
-				    void *dfunc, void *ifunc,
-				    Eterm mod, Eterm func,
-				    int argc, const Eterm *argv);
-void erts_nif_export_cleanup_nif_mod(NifExport *ep); /* erl_nif.c */
-ERTS_GLB_INLINE NifExport *erts_get_proc_nif_export(Process *c_p, int extra);
-ERTS_GLB_INLINE int erts_setup_nif_export_rootset(Process* proc, Eterm** objv,
-						  Uint* nobj);
-ERTS_GLB_INLINE int erts_check_nif_export_in_area(Process *p,
-						  char *start, Uint size);
-ERTS_GLB_INLINE void erts_nif_export_restore(Process *c_p, NifExport *ep,
-					     Eterm result);
-ERTS_GLB_INLINE void erts_nif_export_restore_error(Process* c_p, BeamInstr **pc,
-						   Eterm *reg, ErtsCodeMFA **nif_mfa);
-ERTS_GLB_INLINE int erts_nif_export_check_save_trace(Process *c_p, Eterm result,
-						     int applying, Export* ep,
-						     BeamInstr *cp, Uint32 flags,
-						     Uint32 flags_meta, BeamInstr* I,
-						     ErtsTracer meta_tracer);
+} ErtsNativeFunc;
+
+ErtsNativeFunc *erts_new_proc_nfunc(Process *c_p, int argc);
+void erts_destroy_nfunc(Process *p);
+ErtsNativeFunc *erts_nfunc_schedule(Process *c_p, Process *dirty_shadow_proc,
+                               ErtsCodeMFA *mfa, BeamInstr *pc,
+                               BeamInstr instr,
+                               void *dfunc, void *ifunc,
+                               Eterm mod, Eterm func,
+                               int argc, const Eterm *argv);
+void erts_nfunc_cleanup_nif_mod(ErtsNativeFunc *ep); /* erl_nif.c */
+ERTS_GLB_INLINE ErtsNativeFunc *erts_get_proc_nfunc(Process *c_p, int extra);
+ERTS_GLB_INLINE int erts_setup_nfunc_rootset(Process* proc, Eterm** objv,
+                                             Uint* nobj);
+ERTS_GLB_INLINE int erts_check_nfunc_in_area(Process *p,
+                                             char *start, Uint size);
+ERTS_GLB_INLINE void erts_nfunc_restore(Process *c_p, ErtsNativeFunc *ep,
+                                        Eterm result);
+ERTS_GLB_INLINE void erts_nfunc_restore_error(Process* c_p,
+                                              BeamInstr **pc,
+                                              Eterm *reg,
+                                              ErtsCodeMFA **nif_mfa);
 ERTS_GLB_INLINE Process *erts_proc_shadow2real(Process *c_p);
 
 #if ERTS_GLB_INLINE_INCL_FUNC_DEF
 
-ERTS_GLB_INLINE NifExport *
-erts_get_proc_nif_export(Process *c_p, int argc)
+ERTS_GLB_INLINE ErtsNativeFunc *
+erts_get_proc_nfunc(Process *c_p, int argc)
 {
-    NifExport *nep = ERTS_PROC_GET_NIF_TRAP_EXPORT(c_p);
+    ErtsNativeFunc *nep = ERTS_PROC_GET_NFUNC_TRAP_WRAPPER(c_p);
     if (!nep || (nep->argc < 0 && nep->argv_size < argc))
-	return erts_new_proc_nif_export(c_p, argc);
+	return erts_new_proc_nfunc(c_p, argc);
     return nep;
 }
 
 /*
  * If a process has saved arguments, they need to be part of the GC
  * rootset. The function below is called from setup_rootset() in
- * erl_gc.c. Any exception term saved in the NifExport is also made
+ * erl_gc.c. Any exception term saved in the ErtsNativeFunc is also made
  * part of the GC rootset here; it always resides in rootset[0].
  */
 ERTS_GLB_INLINE int
-erts_setup_nif_export_rootset(Process* proc, Eterm** objv, Uint* nobj)
+erts_setup_nfunc_rootset(Process* proc, Eterm** objv, Uint* nobj)
 {
-    NifExport* ep = (NifExport*) ERTS_PROC_GET_NIF_TRAP_EXPORT(proc);
+    ErtsNativeFunc* ep = (ErtsNativeFunc*) ERTS_PROC_GET_NFUNC_TRAP_WRAPPER(proc);
 
     if (!ep || ep->argc <= 0)
 	return 0;
@@ -121,18 +107,16 @@ erts_setup_nif_export_rootset(Process* proc, Eterm** objv, Uint* nobj)
 }
 
 /*
- * Check if nif export points into code area...
+ * Check if native func wrapper points into code area...
  */
 ERTS_GLB_INLINE int
-erts_check_nif_export_in_area(Process *p, char *start, Uint size)
+erts_check_nfunc_in_area(Process *p, char *start, Uint size)
 {
-    NifExport *nep = ERTS_PROC_GET_NIF_TRAP_EXPORT(p);
+    ErtsNativeFunc *nep = ERTS_PROC_GET_NFUNC_TRAP_WRAPPER(p);
     if (!nep || nep->argc < 0)
 	return 0;
     if (ErtsInArea(nep->pc, start, size))
 	return 1;
-    if (ErtsInArea(nep->cp, start, size))
-	return 1;
     if (ErtsInArea(nep->mfa, start, size))
 	return 1;
     if (ErtsInArea(nep->current, start, size))
@@ -141,7 +125,7 @@ erts_check_nif_export_in_area(Process *p, char *start, Uint size)
 }
 
 ERTS_GLB_INLINE void
-erts_nif_export_restore(Process *c_p, NifExport *ep, Eterm result)
+erts_nfunc_restore(Process *c_p, ErtsNativeFunc *ep, Eterm result)
 {
     ASSERT(!ERTS_SCHEDULER_IS_DIRTY(erts_get_scheduler_data()));
     ERTS_LC_ASSERT(!(c_p->static_flags
@@ -151,43 +135,21 @@ erts_nif_export_restore(Process *c_p, NifExport *ep, Eterm result)
 
     c_p->current = ep->current;
     ep->argc = -1; /* Unused nif-export marker... */
-    if (ep->trace)
-	erts_nif_export_restore_trace(c_p, result, ep);
 }
 
 ERTS_GLB_INLINE void
-erts_nif_export_restore_error(Process* c_p, BeamInstr **pc,
+erts_nfunc_restore_error(Process* c_p, BeamInstr **pc,
 			      Eterm *reg, ErtsCodeMFA **nif_mfa)
 {
-    NifExport *nep = (NifExport *) ERTS_PROC_GET_NIF_TRAP_EXPORT(c_p);
+    ErtsNativeFunc *nep = (ErtsNativeFunc *) ERTS_PROC_GET_NFUNC_TRAP_WRAPPER(c_p);
     int ix;
 
     ASSERT(nep);
     *pc = nep->pc;
-    c_p->cp = nep->cp;
     *nif_mfa = nep->mfa;
     for (ix = 0; ix < nep->argc; ix++)
 	reg[ix] = nep->argv[ix];
-    erts_nif_export_restore(c_p, nep, THE_NON_VALUE);
-}
-
-ERTS_GLB_INLINE int
-erts_nif_export_check_save_trace(Process *c_p, Eterm result,
-				 int applying, Export* ep,
-				 BeamInstr *cp, Uint32 flags,
-				 Uint32 flags_meta, BeamInstr* I,
-				 ErtsTracer meta_tracer)
-{
-    if (is_non_value(result) && c_p->freason == TRAP) {
-	NifExport *nep = ERTS_PROC_GET_NIF_TRAP_EXPORT(c_p);
-	if (nep && nep->argc >= 0) {
-	    erts_nif_export_save_trace(c_p, nep, applying, ep,
-				       cp, flags, flags_meta,
-				       I, meta_tracer);
-	    return 1;
-	}
-    }
-    return 0;
+    erts_nfunc_restore(c_p, nep, THE_NON_VALUE);
 }
 
 ERTS_GLB_INLINE Process *
@@ -210,10 +172,10 @@ erts_proc_shadow2real(Process *c_p)
 #if defined(ERTS_WANT_NFUNC_SCHED_INTERNALS__) && !defined(ERTS_NFUNC_SCHED_INTERNALS__)
 #define ERTS_NFUNC_SCHED_INTERNALS__
 
-#define ERTS_I_BEAM_OP_TO_NIF_EXPORT(I)					\
-    (ASSERT(BeamIsOpCode(*(I), op_apply_bif) ||                         \
-            BeamIsOpCode(*(I), op_call_nif)),                           \
-     ((NifExport *) (((char *) (I)) - offsetof(NifExport, exp.beam[0]))))
+#define ERTS_I_BEAM_OP_TO_NFUNC(I)					\
+    (ASSERT(BeamIsOpCode(*(I), op_call_bif_W) ||                          \
+            BeamIsOpCode(*(I), op_call_nif_WWW)),                           \
+     ((ErtsNativeFunc *) (((char *) (I)) - offsetof(ErtsNativeFunc, trampoline.call_op))))
 
 
 #include "erl_message.h"
diff --git a/erts/emulator/beam/erl_nif.c b/erts/emulator/beam/erl_nif.c
index 1fbe362330..6e27b4b7cb 100644
--- a/erts/emulator/beam/erl_nif.c
+++ b/erts/emulator/beam/erl_nif.c
@@ -309,10 +309,10 @@ void erts_post_nif(ErlNifEnv* env)
 
 
 /*
- * Initialize a NifExport struct. Create it if needed and store it in the
+ * Initialize an ErtsNativeFunc struct. Create it if needed and store it in the
  * proc. The direct_fp function is what will be invoked by op_call_nif, and
  * the indirect_fp function, if not NULL, is what the direct_fp function
- * will call. If the allocated NifExport isn't enough to hold all of argv,
+ * will call. If the allocated ErtsNativeFunc isn't enough to hold all of argv,
  * allocate a larger one. Save 'current' and registers if first time this
  * call is scheduled.
  */
@@ -321,7 +321,7 @@ static ERTS_INLINE ERL_NIF_TERM
 schedule(ErlNifEnv* env, NativeFunPtr direct_fp, NativeFunPtr indirect_fp,
 	 Eterm mod, Eterm func_name, int argc, const ERL_NIF_TERM argv[])
 {
-    NifExport *ep;
+    ErtsNativeFunc *ep;
     Process *c_p, *dirty_shadow_proc;
 
     execution_state(env, &c_p, NULL);
@@ -332,10 +332,10 @@ schedule(ErlNifEnv* env, NativeFunPtr direct_fp, NativeFunPtr indirect_fp,
 
     ERTS_LC_ASSERT(ERTS_PROC_LOCK_MAIN & erts_proc_lc_my_proc_locks(c_p));
 
-    ep = erts_nif_export_schedule(c_p, dirty_shadow_proc,
+    ep = erts_nfunc_schedule(c_p, dirty_shadow_proc,
 				  c_p->current,
-				  c_p->cp,
-				  BeamOpCodeAddr(op_call_nif),
+                                  cp_val(c_p->stop[0]),
+				  BeamOpCodeAddr(op_call_nif_WWW),
 				  direct_fp, indirect_fp,
 				  mod, func_name,
 				  argc, (const Eterm *) argv);
@@ -356,7 +356,7 @@ erts_call_dirty_nif(ErtsSchedulerData *esdp, Process *c_p, BeamInstr *I, Eterm *
 {
     int exiting;
     ERL_NIF_TERM *argv = (ERL_NIF_TERM *) reg;
-    NifExport *nep = ERTS_I_BEAM_OP_TO_NIF_EXPORT(I);
+    ErtsNativeFunc *nep = ERTS_I_BEAM_OP_TO_NFUNC(I);
     ErtsCodeMFA *codemfa = erts_code_to_codemfa(I);
     NativeFunPtr dirty_nif = (NativeFunPtr) I[1];
     ErlNifEnv env;
@@ -364,7 +364,7 @@ erts_call_dirty_nif(ErtsSchedulerData *esdp, Process *c_p, BeamInstr *I, Eterm *
 #ifdef DEBUG
     erts_aint32_t state = erts_atomic32_read_nob(&c_p->state);
 
-    ASSERT(nep == ERTS_PROC_GET_NIF_TRAP_EXPORT(c_p));
+    ASSERT(nep == ERTS_PROC_GET_NFUNC_TRAP_WRAPPER(c_p));
 
     ASSERT(!c_p->scheduler_data);
     ASSERT((state & ERTS_PSFLG_DIRTY_RUNNING)
@@ -815,7 +815,7 @@ int enif_send(ErlNifEnv* env, const ErlNifPid* to_pid,
             }
 #endif
             if (have_seqtrace(stoken)) {
-                seq_trace_update_send(c_p);
+                seq_trace_update_serial(c_p);
                 seq_trace_output(stoken, msg, SEQ_TRACE_SEND,
                                  rp->common.id, c_p);
             }
@@ -2823,7 +2823,7 @@ int enif_consume_timeslice(ErlNifEnv* env, int percent)
 }
 
 static ERTS_INLINE void
-nif_export_cleanup_nif_mod(NifExport *ep)
+nfunc_cleanup_nif_mod(ErtsNativeFunc *ep)
 {
     if (erts_refc_dectest(&ep->m->rt_dtor_cnt, 0) == 0 && ep->m->mod == NULL)
 	close_lib(ep->m);
@@ -2831,17 +2831,17 @@ nif_export_cleanup_nif_mod(NifExport *ep)
 }
 
 void
-erts_nif_export_cleanup_nif_mod(NifExport *ep)
+erts_nfunc_cleanup_nif_mod(ErtsNativeFunc *ep)
 {
-    nif_export_cleanup_nif_mod(ep);
+    nfunc_cleanup_nif_mod(ep);
 }
 
 static ERTS_INLINE void
-nif_export_restore(Process *c_p, NifExport *ep, Eterm res)
+nfunc_restore(Process *c_p, ErtsNativeFunc *ep, Eterm res)
 {
-    erts_nif_export_restore(c_p, ep, res);
+    erts_nfunc_restore(c_p, ep, res);
     ASSERT(ep->m);
-    nif_export_cleanup_nif_mod(ep);
+    nfunc_cleanup_nif_mod(ep);
 }
 
 
@@ -2858,15 +2858,15 @@ static ERL_NIF_TERM
 dirty_nif_finalizer(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
 {
     Process* proc;
-    NifExport* ep;
+    ErtsNativeFunc* ep;
 
     execution_state(env, &proc, NULL);
 
     ASSERT(argc == 1);
     ASSERT(!ERTS_SCHEDULER_IS_DIRTY(erts_proc_sched_data(proc)));
-    ep = (NifExport*) ERTS_PROC_GET_NIF_TRAP_EXPORT(proc);
+    ep = (ErtsNativeFunc*) ERTS_PROC_GET_NFUNC_TRAP_WRAPPER(proc);
     ASSERT(ep);
-    nif_export_restore(proc, ep, argv[0]);
+    nfunc_restore(proc, ep, argv[0]);
     return argv[0];
 }
 
@@ -2878,21 +2878,22 @@ dirty_nif_exception(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
 {
     ERL_NIF_TERM ret;
     Process* proc;
-    NifExport* ep;
+    ErtsNativeFunc* ep;
     Eterm exception;
 
     execution_state(env, &proc, NULL);
 
     ASSERT(argc == 1);
     ASSERT(!ERTS_SCHEDULER_IS_DIRTY(erts_proc_sched_data(proc)));
-    ep = (NifExport*) ERTS_PROC_GET_NIF_TRAP_EXPORT(proc);
+    ep = (ErtsNativeFunc*) ERTS_PROC_GET_NFUNC_TRAP_WRAPPER(proc);
     ASSERT(ep);
     exception = argv[0]; /* argv overwritten by restore below... */
-    nif_export_cleanup_nif_mod(ep);
+    nfunc_cleanup_nif_mod(ep);
     ret = enif_raise_exception(env, exception);
 
-    /* Restore orig info for error and clear nif export in handle_error() */
-    proc->freason |= EXF_RESTORE_NIF;
+    /* Restore orig info for error and clear native func wrapper in
+     * handle_error() */
+    proc->freason |= EXF_RESTORE_NFUNC;
     return ret;
 }
 
@@ -2929,7 +2930,7 @@ static_schedule_dirty_nif(ErlNifEnv* env, erts_aint32_t dirty_psflg,
 			     int argc, const ERL_NIF_TERM argv[])
 {
     Process *proc;
-    NifExport *ep;
+    ErtsNativeFunc *ep;
     Eterm mod, func;
     NativeFunPtr fp;
 
@@ -2939,12 +2940,11 @@ static_schedule_dirty_nif(ErlNifEnv* env, erts_aint32_t dirty_psflg,
      * Called in order to schedule statically determined
      * dirty NIF calls...
      *
-     * Note that 'current' does not point into a NifExport
-     * structure; only a structure with similar
-     * parts (located in code).
+     * Note that 'current' does not point into an ErtsNativeFunc
+     * structure; only a structure with similar parts (located in code).
      */
 
-    ep = ErtsContainerStruct(proc->current, NifExport, exp.info.mfa);
+    ep = ErtsContainerStruct(proc->current, ErtsNativeFunc, trampoline.info.mfa);
     mod = proc->current->module;
     func = proc->current->function;
     fp = (NativeFunPtr) ep->func;
@@ -2983,12 +2983,12 @@ execute_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
 {
     Process* proc;
     NativeFunPtr fp;
-    NifExport* ep;
+    ErtsNativeFunc* ep;
     ERL_NIF_TERM result;
 
     execution_state(env, &proc, NULL);
 
-    ep = ErtsContainerStruct(proc->current, NifExport, exp.info.mfa);
+    ep = ErtsContainerStruct(proc->current, ErtsNativeFunc, trampoline.info.mfa);
     fp = ep->func;
     ASSERT(ep);
     ASSERT(!env->exception_thrown);
@@ -3001,20 +3001,20 @@ execute_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
 
     result = (*fp)(env, argc, argv);
 
-    ASSERT(ep == ERTS_PROC_GET_NIF_TRAP_EXPORT(proc));
+    ASSERT(ep == ERTS_PROC_GET_NFUNC_TRAP_WRAPPER(proc));
 
     if (is_value(result) || proc->freason != TRAP) {
 	/* Done (not rescheduled)... */
 	ASSERT(ep->func == ERTS_DBG_NIF_NOT_SCHED_MARKER);
 	if (!env->exception_thrown)
-	    nif_export_restore(proc, ep, result);
+	    nfunc_restore(proc, ep, result);
 	else {
-	    nif_export_cleanup_nif_mod(ep);
+	    nfunc_cleanup_nif_mod(ep);
 	    /*
 	     * Restore orig info for error and clear nif
 	     * export in handle_error()
 	     */
-	    proc->freason |= EXF_RESTORE_NIF;
+	    proc->freason |= EXF_RESTORE_NFUNC;
 	}
     }
 
@@ -4117,8 +4117,23 @@ static struct erl_module_nif* create_lib(const ErlNifEntry* src)
     return lib;
 };
 
+/* erlang:load_nif/2 is implemented as an instruction: it has to know its call
+ * site, and that is awkward to find out from within a BIF.
+ *
+ * This small stub rejects apply(erlang, load_nif, [Path, Args]). */
+BIF_RETTYPE load_nif_2(BIF_ALIST_2) {
+    const int from_hipe = (BIF_P->flags & F_HIPE_MODE) != 0;
+    if (!from_hipe) {
+        BIF_RET(load_nif_error(BIF_P, "bad_lib",
+                               "load_nif/2 must be explicitly called from the NIF "
+                               "module. It cannot be called through apply/3."));
+    }
+    BIF_RET(load_nif_error(BIF_P, "notsup",
+                           "Calling load_nif from HiPE compiled modules "
+                           "not supported"));
+}
 
-BIF_RETTYPE load_nif_2(BIF_ALIST_2)
+Eterm erts_load_nif(Process *c_p, BeamInstr *I, Eterm filename, Eterm args)
 {
     static const char bad_lib[] = "bad_lib";
     static const char upgrade[] = "upgrade";
@@ -4141,41 +4156,25 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2)
     struct erl_module_instance* this_mi;
     struct erl_module_instance* prev_mi;
 
-    if (BIF_P->flags & F_HIPE_MODE) {
-	ret = load_nif_error(BIF_P, "notsup", "Calling load_nif from HiPE compiled "
-			     "modules not supported");
-	BIF_RET(ret);
-    }
-
     encoding = erts_get_native_filename_encoding();
     if (encoding == ERL_FILENAME_WIN_WCHAR) {
         /* Do not convert the lib name to utf-16le yet, do that in win32 specific code */
         /* since lib_name is used in error messages */
         encoding = ERL_FILENAME_UTF8;
     }
-    lib_name = erts_convert_filename_to_encoding(BIF_ARG_1, NULL, 0,
+    lib_name = erts_convert_filename_to_encoding(filename, NULL, 0,
                                                  ERTS_ALC_T_TMP, 1, 0, encoding,
 						 NULL, 0);
     if (!lib_name) {
-	BIF_ERROR(BIF_P, BADARG);
-    }
-
-    if (!erts_try_seize_code_write_permission(BIF_P)) {
-	erts_free(ERTS_ALC_T_TMP, lib_name);
-	ERTS_BIF_YIELD2(bif_export[BIF_load_nif_2],
-			BIF_P, BIF_ARG_1, BIF_ARG_2);
+        return THE_NON_VALUE;
     }
 
     /* Block system (is this the right place to do it?) */
-    erts_proc_unlock(BIF_P, ERTS_PROC_LOCK_MAIN);
+    erts_proc_unlock(c_p, ERTS_PROC_LOCK_MAIN);
     erts_thr_progress_block();
 
     /* Find calling module */
-    ASSERT(BIF_P->current != NULL);
-    ASSERT(BIF_P->current->module == am_erlang
-	   && BIF_P->current->function == am_load_nif 
-	   && BIF_P->current->arity == 2);
-    caller = find_function_from_pc(BIF_P->cp);
+    caller = find_function_from_pc(I);
     ASSERT(caller != NULL);
     mod_atom = caller->module;
     ASSERT(is_atom(mod_atom));
@@ -4195,7 +4194,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2)
     this_mi = &module_p->curr;
     prev_mi = &module_p->old;
     if (in_area(caller, module_p->old.code_hdr, module_p->old.code_length)) {
-	ret = load_nif_error(BIF_P, "old_code", "Calling load_nif from old "
+	ret = load_nif_error(c_p, "old_code", "Calling load_nif from old "
 			     "module '%T' not allowed", mod_atom);
 	goto error;
     } else if (module_p->on_load) {
@@ -4209,52 +4208,52 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2)
     }
 
     if (this_mi->nif != NULL) {
-        ret = load_nif_error(BIF_P,"reload","NIF library already loaded"
+        ret = load_nif_error(c_p,"reload","NIF library already loaded"
                              " (reload disallowed since OTP 20).");
     }
     else if (init_func == NULL &&
              (err=erts_sys_ddll_open(lib_name, &handle, &errdesc)) != ERL_DE_NO_ERROR) {
 	const char slogan[] = "Failed to load NIF library";
 	if (strstr(errdesc.str, lib_name) != NULL) {
-	    ret = load_nif_error(BIF_P, "load_failed", "%s: '%s'", slogan, errdesc.str);
+	    ret = load_nif_error(c_p, "load_failed", "%s: '%s'", slogan, errdesc.str);
 	}
 	else {
-	    ret = load_nif_error(BIF_P, "load_failed", "%s %s: '%s'", slogan, lib_name, errdesc.str);
+	    ret = load_nif_error(c_p, "load_failed", "%s %s: '%s'", slogan, lib_name, errdesc.str);
 	}
     }
     else if (init_func == NULL &&
 	     erts_sys_ddll_load_nif_init(handle, &init_func, &errdesc) != ERL_DE_NO_ERROR) {
-	ret  = load_nif_error(BIF_P, bad_lib, "Failed to find library init"
+	ret  = load_nif_error(c_p, bad_lib, "Failed to find library init"
 			      " function: '%s'", errdesc.str);
 	
     }
     else if ((taint ? erts_add_taint(mod_atom) : 0,
 	      (entry = erts_sys_ddll_call_nif_init(init_func)) == NULL)) {
-	ret = load_nif_error(BIF_P, bad_lib, "Library init-call unsuccessful");
+	ret = load_nif_error(c_p, bad_lib, "Library init-call unsuccessful");
     }
     else if (entry->major > ERL_NIF_MAJOR_VERSION
              || (entry->major == ERL_NIF_MAJOR_VERSION
                  && entry->minor > ERL_NIF_MINOR_VERSION)) {
         char* fmt = "That '%T' NIF library needs %s or newer. Either try to"
             " recompile the NIF lib or use a newer erts runtime.";
-        ret = load_nif_error(BIF_P, bad_lib, fmt, mod_atom, entry->min_erts);
+        ret = load_nif_error(c_p, bad_lib, fmt, mod_atom, entry->min_erts);
     }
     else if (entry->major < ERL_NIF_MIN_REQUIRED_MAJOR_VERSION_ON_LOAD
 	     || (entry->major==2 && entry->minor == 5)) { /* experimental maps */
 	
         char* fmt = "That old NIF library (%d.%d) is not compatible with this "
             "erts runtime (%d.%d). Try recompile the NIF lib.";
-        ret = load_nif_error(BIF_P, bad_lib, fmt, entry->major, entry->minor,
+        ret = load_nif_error(c_p, bad_lib, fmt, entry->major, entry->minor,
                              ERL_NIF_MAJOR_VERSION, ERL_NIF_MINOR_VERSION);
     }   
     else if (AT_LEAST_VERSION(entry, 2, 1)
 	     && sys_strcmp(entry->vm_variant, ERL_NIF_VM_VARIANT) != 0) {
-	ret = load_nif_error(BIF_P, bad_lib, "Library (%s) not compiled for "
+	ret = load_nif_error(c_p, bad_lib, "Library (%s) not compiled for "
 			     "this vm variant (%s).",
 			     entry->vm_variant, ERL_NIF_VM_VARIANT);
     }
     else if (!erts_is_atom_str((char*)entry->name, mod_atom, 1)) {
-	ret = load_nif_error(BIF_P, bad_lib, "Library module name '%s' does not"
+	ret = load_nif_error(c_p, bad_lib, "Library module name '%s' does not"
 			     " match calling module '%T'", entry->name, mod_atom);
     }
     else {
@@ -4273,7 +4272,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2)
 
 	    if (!erts_atom_get(f->name, sys_strlen(f->name), &f_atom, ERTS_ATOM_ENC_LATIN1)
 		|| (ci_pp = get_func_pp(this_mi->code_hdr, f_atom, f->arity))==NULL) {
-		ret = load_nif_error(BIF_P,bad_lib,"Function not found %T:%s/%u",
+		ret = load_nif_error(c_p,bad_lib,"Function not found %T:%s/%u",
 				     mod_atom, f->name, f->arity);
 	    }
 	    else if (f->flags) {
@@ -4285,16 +4284,13 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2)
 		 * a load error.
 		 */
 		if (f->flags != ERL_NIF_DIRTY_JOB_IO_BOUND && f->flags != ERL_NIF_DIRTY_JOB_CPU_BOUND)
-		    ret = load_nif_error(BIF_P, bad_lib, "Illegal flags field value %d for NIF %T:%s/%u",
+		    ret = load_nif_error(c_p, bad_lib, "Illegal flags field value %d for NIF %T:%s/%u",
 					 f->flags, mod_atom, f->name, f->arity);
 	    }
-	    else if (erts_codeinfo_to_code(ci_pp[1]) - erts_codeinfo_to_code(ci_pp[0])
-                     < BEAM_NIF_MIN_FUNC_SZ)
-	    {
-		ret = load_nif_error(BIF_P,bad_lib,"No explicit call to load_nif"
-				     " in module (%T:%s/%u too small)",
-				     mod_atom, f->name, f->arity);
-	    }
+
+            /* ci_pp is only valid if the function lookup above succeeded. */
+            ASSERT(ret != am_ok || (erts_codeinfo_to_code(ci_pp[1]) -
+                   erts_codeinfo_to_code(ci_pp[0]) >= BEAM_NATIVE_MIN_FUNC_SZ));
 	    /*erts_fprintf(stderr, "Found NIF %T:%s/%u\r\n",
 	      mod_atom, f->name, f->arity);*/
 	}
@@ -4313,23 +4309,23 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2)
     if (prev_mi->nif != NULL) { /**************** Upgrade ***************/
         void* prev_old_data = prev_mi->nif->priv_data;
         if (entry->upgrade == NULL) {
-            ret = load_nif_error(BIF_P, upgrade, "Upgrade not supported by this NIF library.");
+            ret = load_nif_error(c_p, upgrade, "Upgrade not supported by this NIF library.");
             goto error;
         }
-        erts_pre_nif(&env, BIF_P, lib, NULL);
-        veto = entry->upgrade(&env, &lib->priv_data, &prev_mi->nif->priv_data, BIF_ARG_2);
+        erts_pre_nif(&env, c_p, lib, NULL);
+        veto = entry->upgrade(&env, &lib->priv_data, &prev_mi->nif->priv_data, args);
         erts_post_nif(&env);
         if (veto) {
             prev_mi->nif->priv_data = prev_old_data;
-            ret = load_nif_error(BIF_P, upgrade, "Library upgrade-call unsuccessful (%d).", veto);
+            ret = load_nif_error(c_p, upgrade, "Library upgrade-call unsuccessful (%d).", veto);
         }
     }
     else if (entry->load != NULL) { /********* Initial load ***********/
-        erts_pre_nif(&env, BIF_P, lib, NULL);
-        veto = entry->load(&env, &lib->priv_data, BIF_ARG_2);
+        erts_pre_nif(&env, c_p, lib, NULL);
+        veto = entry->load(&env, &lib->priv_data, args);
         erts_post_nif(&env);
         if (veto) {
-            ret = load_nif_error(BIF_P, "load", "Library load-call unsuccessful (%d).", veto);
+            ret = load_nif_error(c_p, "load", "Library load-call unsuccessful (%d).", veto);
         }
     }
     if (ret == am_ok) {
@@ -4351,12 +4347,12 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2)
             code_ptr = erts_codeinfo_to_code(ci);
 
 	    if (ci->u.gen_bp == NULL) {
-		code_ptr[0] = BeamOpCodeAddr(op_call_nif);
+		code_ptr[0] = BeamOpCodeAddr(op_call_nif_WWW);
 	    }
 	    else { /* Function traced, patch the original instruction word */
 		GenericBp* g = ci->u.gen_bp;
 		ASSERT(BeamIsOpCode(code_ptr[0], op_i_generic_breakpoint));
-		g->orig_instr = BeamOpCodeAddr(op_call_nif);
+		g->orig_instr = BeamOpCodeAddr(op_call_nif_WWW);
 	    }
 	    if (f->flags) {
 		code_ptr[3] = (BeamInstr) f->fptr;
@@ -4383,8 +4379,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2)
     }
 
     erts_thr_progress_unblock();
-    erts_proc_lock(BIF_P, ERTS_PROC_LOCK_MAIN);
-    erts_release_code_write_permission();
+    erts_proc_lock(c_p, ERTS_PROC_LOCK_MAIN);
     erts_free(ERTS_ALC_T_TMP, lib_name);
 
     BIF_RET(ret);
diff --git a/erts/emulator/beam/erl_node_tables.c b/erts/emulator/beam/erl_node_tables.c
index 8863e219e2..ae3193be81 100644
--- a/erts/emulator/beam/erl_node_tables.c
+++ b/erts/emulator/beam/erl_node_tables.c
@@ -977,7 +977,7 @@ static void print_node(void *venp, void *vpndp)
 	if(pndp->sysname == NIL) {
 	    erts_print(pndp->to, pndp->to_arg, "Name: %T ", enp->sysname);
 	}
-	erts_print(pndp->to, pndp->to_arg, " %d", enp->creation);
+	erts_print(pndp->to, pndp->to_arg, " %u", enp->creation);
 #ifdef DEBUG
 	erts_print(pndp->to, pndp->to_arg, " (refc=%ld)",
 		   erts_refc_read(&enp->refc, 0));
@@ -1020,7 +1020,7 @@ void erts_print_node_info(fmtfn_t to,
 /* ----------------------------------------------------------------------- */
 
 void
-erts_set_this_node(Eterm sysname, Uint creation)
+erts_set_this_node(Eterm sysname, Uint32 creation)
 {
     ERTS_LC_ASSERT(erts_thr_progress_is_blocking());
     ASSERT(2 <= de_refc_read(erts_this_dist_entry, 2));
diff --git a/erts/emulator/beam/erl_node_tables.h b/erts/emulator/beam/erl_node_tables.h
index ffaafbbbea..beae2df75f 100644
--- a/erts/emulator/beam/erl_node_tables.h
+++ b/erts/emulator/beam/erl_node_tables.h
@@ -264,7 +264,7 @@ void erts_set_dist_entry_pending(DistEntry *);
 void erts_set_dist_entry_connected(DistEntry *, Eterm, Uint);
 ErlNode *erts_find_or_insert_node(Eterm, Uint32, Eterm);
 void erts_schedule_delete_node(ErlNode *);
-void erts_set_this_node(Eterm, Uint);
+void erts_set_this_node(Eterm, Uint32);
 Uint erts_node_table_size(void);
 void erts_init_node_tables(int);
 void erts_node_table_info(fmtfn_t, void *);
diff --git a/erts/emulator/beam/erl_proc_sig_queue.c b/erts/emulator/beam/erl_proc_sig_queue.c
index d5e0e3b218..b60fb64342 100644
--- a/erts/emulator/beam/erl_proc_sig_queue.c
+++ b/erts/emulator/beam/erl_proc_sig_queue.c
@@ -995,7 +995,7 @@ send_gen_exit_signal(Process *c_p, Eterm from_tag,
 
     seq_trace = c_p && have_seqtrace(token);
     if (seq_trace)
-        seq_trace_update_send(c_p);
+        seq_trace_update_serial(c_p);
 
 #ifdef USE_VM_PROBES
     utag_sz = 0;
diff --git a/erts/emulator/beam/erl_process.c b/erts/emulator/beam/erl_process.c
index ac0a4f8902..b62ec77d65 100644
--- a/erts/emulator/beam/erl_process.c
+++ b/erts/emulator/beam/erl_process.c
@@ -708,10 +708,10 @@ erts_pre_init_process(void)
     erts_psd_required_locks[ERTS_PSD_DELAYED_GC_TASK_QS].set_locks
 	= ERTS_PSD_DELAYED_GC_TASK_QS_SET_LOCKS;
 
-    erts_psd_required_locks[ERTS_PSD_NIF_TRAP_EXPORT].get_locks
-	= ERTS_PSD_NIF_TRAP_EXPORT_GET_LOCKS;
-    erts_psd_required_locks[ERTS_PSD_NIF_TRAP_EXPORT].set_locks
-	= ERTS_PSD_NIF_TRAP_EXPORT_SET_LOCKS;
+    erts_psd_required_locks[ERTS_PSD_NFUNC_TRAP_WRAPPER].get_locks
+	= ERTS_PSD_NFUNC_TRAP_WRAPPER_GET_LOCKS;
+    erts_psd_required_locks[ERTS_PSD_NFUNC_TRAP_WRAPPER].set_locks
+	= ERTS_PSD_NFUNC_TRAP_WRAPPER_SET_LOCKS;
 
     erts_psd_required_locks[ERTS_PSD_ETS_OWNED_TABLES].get_locks
         = ERTS_PSD_ETS_OWNED_TABLES_GET_LOCKS;
@@ -6478,8 +6478,8 @@ schedule_out_process(ErtsRunQueue *c_rq, erts_aint32_t state, Process *p,
 
     ASSERT(!(state & (ERTS_PSFLG_DIRTY_IO_PROC
                       |ERTS_PSFLG_DIRTY_CPU_PROC))
-           || (BeamIsOpCode(*p->i, op_call_nif)
-               || BeamIsOpCode(*p->i, op_apply_bif)));
+           || (BeamIsOpCode(*p->i, op_call_nif_WWW)
+               || BeamIsOpCode(*p->i, op_call_bif_W)));
 
     a = state;
 
@@ -10992,8 +10992,13 @@ erts_set_gc_state(Process *c_p, int enable)
     ERTS_LC_ASSERT(ERTS_PROC_LOCK_MAIN == erts_proc_lc_my_proc_locks(c_p));
 
     if (!enable) {
-	c_p->flags |= F_DISABLE_GC;
-	return 0;
+        /* Strictly speaking it's not illegal to disable the GC when it's
+         * already disabled, but we risk enabling the GC prematurely if (for
+         * example) a BIF were to blindly disable it when trapping and then
+         * re-enable it before returning its result. */
+        ASSERT(!(c_p->flags & F_DISABLE_GC));
+        c_p->flags |= F_DISABLE_GC;
+        return 0;
     }
 
     c_p->flags &= ~F_DISABLE_GC;
@@ -11453,7 +11458,7 @@ erl_create_process(Process* parent, /* Parent of process (default group leader).
 #else
     arg_size = size_object_litopt(args, &litarea);
 #endif
-    heap_need = arg_size;
+    heap_need = arg_size + 1;   /* Reserve place for continuation pointer */
 
     p->flags = flags;
 
@@ -11502,7 +11507,8 @@ erl_create_process(Process* parent, /* Parent of process (default group leader).
     p->old_hend = p->old_htop = p->old_heap = NULL;
     p->high_water = p->heap;
     p->gen_gcs = 0;
-    p->stop = p->hend = p->heap + sz;
+    p->hend = p->heap + sz;
+    p->stop = p->hend - 1;     /* Reserve place for continuation pointer */
     p->htop = p->heap;
     p->heap_sz = sz;
     p->abandoned_heap = NULL;
@@ -11520,7 +11526,7 @@ erl_create_process(Process* parent, /* Parent of process (default group leader).
     p->current = &p->u.initial;
 
     p->i = (BeamInstr *) beam_apply;
-    p->cp = (BeamInstr *) beam_apply+1;
+    p->stop[0] = make_cp(beam_apply + 1);
 
     p->arg_reg = p->def_arg_reg;
     p->max_arg_reg = sizeof(p->def_arg_reg)/sizeof(p->def_arg_reg[0]);
@@ -11583,9 +11589,6 @@ erl_create_process(Process* parent, /* Parent of process (default group leader).
     p->mbuf_sz = 0;
     erts_atomic_init_nob(&p->psd, (erts_aint_t) NULL);
     p->dictionary = NULL;
-    p->seq_trace_lastcnt = 0;
-    p->seq_trace_clock = 0;
-    SEQ_TRACE_TOKEN(p) = NIL;
 #ifdef USE_VM_PROBES
     DT_UTAG(p) = NIL;
     DT_UTAG_FLAGS(p) = 0;
@@ -11606,6 +11609,45 @@ erl_create_process(Process* parent, /* Parent of process (default group leader).
     p->fp_exception = 0;
 #endif
 
+    /* seq_trace is handled before regular tracing as the latter may touch the
+     * trace token. */
+    if (have_seqtrace(SEQ_TRACE_TOKEN(parent))) {
+        Eterm token;
+        Uint token_sz;
+        Eterm *hp;
+
+        ASSERT(SEQ_TRACE_TOKEN_ARITY(parent) == 5);
+        ASSERT(is_immed(SEQ_TRACE_TOKEN_FLAGS(parent)));
+        ASSERT(is_immed(SEQ_TRACE_TOKEN_SERIAL(parent)));
+        ASSERT(is_immed(SEQ_TRACE_TOKEN_LASTCNT(parent)));
+
+        seq_trace_update_serial(parent);
+
+        token = SEQ_TRACE_TOKEN(parent);
+        token_sz = size_object(token);
+
+        hp = HAlloc(p, token_sz);
+        SEQ_TRACE_TOKEN(p) = copy_struct(token, token_sz, &hp, &MSO(p));
+
+        /* The counters behave the same way on spawning as they do on messages;
+         * we don't inherit our parent's lastcnt. */
+        p->seq_trace_lastcnt = parent->seq_trace_clock;
+        p->seq_trace_clock = parent->seq_trace_clock;
+
+        ASSERT((locks & (ERTS_PROC_LOCK_STATUS|ERTS_PROC_LOCK_TRACE)) ==
+               (ERTS_PROC_LOCK_STATUS|ERTS_PROC_LOCK_TRACE));
+
+        locks &= ~(ERTS_PROC_LOCK_STATUS|ERTS_PROC_LOCK_TRACE);
+        erts_proc_unlock(p, ERTS_PROC_LOCK_STATUS|ERTS_PROC_LOCK_TRACE);
+        erts_proc_unlock(parent, ERTS_PROC_LOCK_STATUS|ERTS_PROC_LOCK_TRACE);
+
+        seq_trace_output(token, NIL, SEQ_TRACE_SPAWN, p->common.id, parent);
+    } else {
+        SEQ_TRACE_TOKEN(p) = NIL;
+        p->seq_trace_lastcnt = 0;
+        p->seq_trace_clock = 0;
+    }
+
     if (IS_TRACED(parent)) {
 	if (ERTS_TRACE_FLAGS(parent) & F_TRACE_SOS) {
 	    ERTS_TRACE_FLAGS(p) |= (ERTS_TRACE_FLAGS(parent) & TRACEE_FLAGS);
@@ -11627,9 +11669,14 @@ erl_create_process(Process* parent, /* Parent of process (default group leader).
 		}
         }
         if (ARE_TRACE_FLAGS_ON(parent, F_TRACE_PROCS)) {
-            locks &= ~(ERTS_PROC_LOCK_STATUS|ERTS_PROC_LOCK_TRACE);
-            erts_proc_unlock(p, ERTS_PROC_LOCK_STATUS|ERTS_PROC_LOCK_TRACE);
-            erts_proc_unlock(parent, ERTS_PROC_LOCK_STATUS|ERTS_PROC_LOCK_TRACE);
+            /* The locks may already be released if seq_trace is enabled as
+             * well. */
+            if ((locks & (ERTS_PROC_LOCK_STATUS|ERTS_PROC_LOCK_TRACE))
+                  == (ERTS_PROC_LOCK_STATUS|ERTS_PROC_LOCK_TRACE)) {
+                locks &= ~(ERTS_PROC_LOCK_STATUS|ERTS_PROC_LOCK_TRACE);
+                erts_proc_unlock(p, ERTS_PROC_LOCK_STATUS|ERTS_PROC_LOCK_TRACE);
+                erts_proc_unlock(parent, ERTS_PROC_LOCK_STATUS|ERTS_PROC_LOCK_TRACE);
+            }
             trace_proc_spawn(parent, am_spawn, p->common.id, mod, func, args);
             if (so->flags & SPO_LINK)
                 trace_proc(parent, locks, parent, am_link, p->common.id);
@@ -11793,7 +11840,6 @@ void erts_init_empty_process(Process *p)
     p->u.initial.function = 0;
     p->u.initial.arity = 0;
     p->catches = 0;
-    p->cp = NULL;
     p->i = NULL;
     p->current = NULL;
 
@@ -11871,7 +11917,6 @@ erts_debug_verify_clean_empty_process(Process* p)
     ASSERT(p->bif_timers == NULL);
     ASSERT(p->dictionary == NULL);
     ASSERT(p->catches == 0);
-    ASSERT(p->cp == NULL);
     ASSERT(p->i == NULL);
     ASSERT(p->current == NULL);
 
@@ -11931,7 +11976,7 @@ delete_process(Process* p)
     if (pbt)
         erts_free(ERTS_ALC_T_BPD, (void *) pbt);
 
-    erts_destroy_nif_export(p);
+    erts_destroy_nfunc(p);
 
     /* Cleanup psd */
 
@@ -13117,9 +13162,6 @@ erts_program_counter_info(fmtfn_t to, void *to_arg, Process *p)
     erts_print(to, to_arg, "Program counter: %p (", p->i);
     print_function_from_pc(to, to_arg, p->i);
     erts_print(to, to_arg, ")\n");
-    erts_print(to, to_arg, "CP: %p (", p->cp);
-    print_function_from_pc(to, to_arg, p->cp);
-    erts_print(to, to_arg, ")\n");
     state = erts_atomic32_read_acqb(&p->state);
     if (!(state & (ERTS_PSFLG_RUNNING
 		   | ERTS_PSFLG_RUNNING_SYS
@@ -13396,9 +13438,6 @@ static void print_current_process_info(fmtfn_t to, void *to_arg,
 	erts_print(to, to_arg, "Current Process Program counter: %p (", p->i);
 	print_function_from_pc(to, to_arg, p->i);
 	erts_print(to, to_arg, ")\n");
-	erts_print(to, to_arg, "Current Process CP: %p (", p->cp);
-	print_function_from_pc(to, to_arg, p->cp);
-	erts_print(to, to_arg, ")\n");
 
 	/* Getting this stacktrace can segfault if we are very very
 	   unlucky if called while a process is being garbage collected.
diff --git a/erts/emulator/beam/erl_process.h b/erts/emulator/beam/erl_process.h
index 5886b576e0..09a6c0e961 100644
--- a/erts/emulator/beam/erl_process.h
+++ b/erts/emulator/beam/erl_process.h
@@ -812,7 +812,7 @@ erts_reset_max_len(ErtsRunQueue *rq, ErtsRunQueueInfo *rqi)
 #define ERTS_PSD_SCHED_ID			2
 #define ERTS_PSD_CALL_TIME_BP			3
 #define ERTS_PSD_DELAYED_GC_TASK_QS		4
-#define ERTS_PSD_NIF_TRAP_EXPORT		5
+#define ERTS_PSD_NFUNC_TRAP_WRAPPER		5
 #define ERTS_PSD_ETS_OWNED_TABLES               6
 #define ERTS_PSD_ETS_FIXED_TABLES               7
 #define ERTS_PSD_DIST_ENTRY	                8
@@ -849,8 +849,8 @@ typedef struct {
 #define ERTS_PSD_DELAYED_GC_TASK_QS_GET_LOCKS ERTS_PROC_LOCK_MAIN
 #define ERTS_PSD_DELAYED_GC_TASK_QS_SET_LOCKS ERTS_PROC_LOCK_MAIN
 
-#define ERTS_PSD_NIF_TRAP_EXPORT_GET_LOCKS ERTS_PROC_LOCK_MAIN
-#define ERTS_PSD_NIF_TRAP_EXPORT_SET_LOCKS ERTS_PROC_LOCK_MAIN
+#define ERTS_PSD_NFUNC_TRAP_WRAPPER_GET_LOCKS ERTS_PROC_LOCK_MAIN
+#define ERTS_PSD_NFUNC_TRAP_WRAPPER_SET_LOCKS ERTS_PROC_LOCK_MAIN
 
 #define ERTS_PSD_ETS_OWNED_TABLES_GET_LOCKS ERTS_PROC_LOCK_STATUS
 #define ERTS_PSD_ETS_OWNED_TABLES_SET_LOCKS ERTS_PROC_LOCK_STATUS
@@ -975,7 +975,6 @@ struct process {
     unsigned max_arg_reg;	/* Maximum number of argument registers available. */
     Eterm def_arg_reg[6];	/* Default array for argument registers. */
 
-    BeamInstr* cp;		/* (untagged) Continuation pointer (for threaded code). */
     BeamInstr* i;		/* Program counter for threaded code. */
     Sint catches;		/* Number of catches on stack */
     Sint fcalls;		/* 
@@ -1489,6 +1488,8 @@ extern int erts_system_profile_ts_type;
 #define SEQ_TRACE_SEND     (1 << 0)
 #define SEQ_TRACE_RECEIVE  (1 << 1)
 #define SEQ_TRACE_PRINT    (1 << 2)
+/* (This three-bit gap contains the timestamp.) */
+#define SEQ_TRACE_SPAWN    (1 << 6)
 
 #define ERTS_SEQ_TRACE_FLAGS_TS_TYPE_SHIFT 3
 
@@ -2037,10 +2038,10 @@ erts_psd_set(Process *p, int ix, void *data)
 #define ERTS_PROC_SET_DELAYED_GC_TASK_QS(P, PBT) \
     ((ErtsProcSysTaskQs *) erts_psd_set((P), ERTS_PSD_DELAYED_GC_TASK_QS, (void *) (PBT)))
 
-#define ERTS_PROC_GET_NIF_TRAP_EXPORT(P) \
-    erts_psd_get((P), ERTS_PSD_NIF_TRAP_EXPORT)
-#define ERTS_PROC_SET_NIF_TRAP_EXPORT(P, NTE) \
-    erts_psd_set((P), ERTS_PSD_NIF_TRAP_EXPORT, (void *) (NTE))
+#define ERTS_PROC_GET_NFUNC_TRAP_WRAPPER(P) \
+    erts_psd_get((P), ERTS_PSD_NFUNC_TRAP_WRAPPER)
+#define ERTS_PROC_SET_NFUNC_TRAP_WRAPPER(P, NTE) \
+    erts_psd_set((P), ERTS_PSD_NFUNC_TRAP_WRAPPER, (void *) (NTE))
 
 #define ERTS_PROC_GET_DIST_ENTRY(P) \
     ((DistEntry *) erts_psd_get((P), ERTS_PSD_DIST_ENTRY))
diff --git a/erts/emulator/beam/erl_trace.c b/erts/emulator/beam/erl_trace.c
index f6f177887c..5c46a10d64 100644
--- a/erts/emulator/beam/erl_trace.c
+++ b/erts/emulator/beam/erl_trace.c
@@ -830,7 +830,7 @@ trace_receive(Process* receiver,
 }
 
 int
-seq_trace_update_send(Process *p)
+seq_trace_update_serial(Process *p)
 {
     ErtsTracer seq_tracer = erts_get_system_seq_tracer();
     ASSERT((is_tuple(SEQ_TRACE_TOKEN(p)) || is_nil(SEQ_TRACE_TOKEN(p))));
@@ -898,6 +898,7 @@ seq_trace_output_generic(Eterm token, Eterm msg, Uint type,
 
     switch (type) {
     case SEQ_TRACE_SEND:    type_atom = am_send; break;
+    case SEQ_TRACE_SPAWN:   type_atom = am_spawn; break;
     case SEQ_TRACE_PRINT:   type_atom = am_print; break;
     case SEQ_TRACE_RECEIVE: type_atom = am_receive; break;
     default:
diff --git a/erts/emulator/beam/erl_trace.h b/erts/emulator/beam/erl_trace.h
index af38ef52db..c0f31e0cb6 100644
--- a/erts/emulator/beam/erl_trace.h
+++ b/erts/emulator/beam/erl_trace.h
@@ -142,12 +142,6 @@ void monitor_generic(Process *p, Eterm type, Eterm spec);
 Uint erts_trace_flag2bit(Eterm flag);
 int erts_trace_flags(Eterm List, 
 		 Uint *pMask, ErtsTracer *pTracer, int *pCpuTimestamp);
-Eterm erts_bif_trace(int bif_index, Process* p, Eterm* args, BeamInstr *I);
-Eterm
-erts_bif_trace_epilogue(Process *p, Eterm result, int applying,
-			Export* ep, BeamInstr *cp, Uint32 flags,
-			Uint32 flags_meta, BeamInstr* I,
-			ErtsTracer meta_tracer);
 
 void erts_send_pending_trace_msgs(ErtsSchedulerData *esdp);
 #define ERTS_CHK_PEND_TRACE_MSGS(ESDP)				\
@@ -163,7 +157,9 @@ seq_trace_output_generic((token), (msg), (type), (receiver), NULL, (exitfrom))
 void seq_trace_output_generic(Eterm token, Eterm msg, Uint type, 
 			      Eterm receiver, Process *process, Eterm exitfrom);
 
-int seq_trace_update_send(Process *process);
+/* Bump the sequence number if tracing is enabled; must be used before sending
+ * send/spawn trace messages. */
+int seq_trace_update_serial(Process *process);
 
 Eterm erts_seq_trace(Process *process, 
 		     Eterm atom_type, Eterm atom_true_or_false, 
diff --git a/erts/emulator/beam/erl_utils.h b/erts/emulator/beam/erl_utils.h
index 430ac305c5..449243a9b7 100644
--- a/erts/emulator/beam/erl_utils.h
+++ b/erts/emulator/beam/erl_utils.h
@@ -70,6 +70,7 @@ int erts_fit_in_bits_uint(Uint);
 Sint erts_list_length(Eterm);
 int erts_is_builtin(Eterm, Eterm, int);
 Uint32 make_hash2(Eterm);
+Uint32 trapping_make_hash2(Eterm, Eterm*, struct process*);
 Uint32 make_hash(Eterm);
 Uint32 make_internal_hash(Eterm, Uint32 salt);
 
diff --git a/erts/emulator/beam/erlang_dtrace.d b/erts/emulator/beam/erlang_dtrace.d
index 8792138d53..8864a8ec84 100644
--- a/erts/emulator/beam/erlang_dtrace.d
+++ b/erts/emulator/beam/erlang_dtrace.d
@@ -176,7 +176,7 @@ provider erlang {
      * Fired whenever a user function returns.
      *
      * @param p the PID (string form) of the process
-     * @param mfa the m:f/a of the function
+     * @param mfa the m:f/a of the function being returned from
      * @param depth the stack depth
      */
     probe function__return(char *p, char *mfa, int depth);
@@ -193,7 +193,7 @@ provider erlang {
      * Fired whenever a Built In Function returns.
      *
      * @param p the PID (string form) of the process
-     * @param mfa the m:f/a of the function
+     * @param mfa the m:f/a of the function being returned from
      */
     probe bif__return(char *p, char *mfa);
 
@@ -209,7 +209,7 @@ provider erlang {
      * Fired whenever a Native Function returns.
      *
      * @param p the PID (string form) of the process
-     * @param mfa the m:f/a of the function
+     * @param mfa the m:f/a of the function being returned from
      */
     probe nif__return(char *p, char *mfa);
 
diff --git a/erts/emulator/beam/error.h b/erts/emulator/beam/error.h
index 64c08b1570..44a9809a18 100644
--- a/erts/emulator/beam/error.h
+++ b/erts/emulator/beam/error.h
@@ -66,13 +66,13 @@
 #define EXF_OFFSET	EXTAG_BITS
 #define EXF_BITS	7
 
-#define EXF_PANIC	(1<<(0+EXF_OFFSET))	/* ignore catches */
-#define EXF_THROWN	(1<<(1+EXF_OFFSET))	/* nonlocal return */
-#define EXF_LOG		(1<<(2+EXF_OFFSET))	/* write to logger on termination */
-#define EXF_NATIVE	(1<<(3+EXF_OFFSET))	/* occurred in native code */
-#define EXF_SAVETRACE	(1<<(4+EXF_OFFSET))	/* save stack trace in internal form */
-#define EXF_ARGLIST	(1<<(5+EXF_OFFSET))	/* has arglist for top of trace */
-#define EXF_RESTORE_NIF	(1<<(6+EXF_OFFSET))	/* restore original bif/nif */
+#define EXF_PANIC         (1<<(0+EXF_OFFSET)) /* ignore catches */
+#define EXF_THROWN        (1<<(1+EXF_OFFSET)) /* nonlocal return */
+#define EXF_LOG           (1<<(2+EXF_OFFSET)) /* write to logger on termination */
+#define EXF_NATIVE        (1<<(3+EXF_OFFSET)) /* occurred in native code */
+#define EXF_SAVETRACE     (1<<(4+EXF_OFFSET)) /* save stack trace in internal form */
+#define EXF_ARGLIST       (1<<(5+EXF_OFFSET)) /* has arglist for top of trace */
+#define EXF_RESTORE_NFUNC (1<<(6+EXF_OFFSET)) /* restore original bif/nif */
 
 #define EXC_FLAGBITS	(((1<<(EXF_BITS+EXF_OFFSET))-1) \
 			 & ~((1<<(EXF_OFFSET))-1))
diff --git a/erts/emulator/beam/export.c b/erts/emulator/beam/export.c
index 946ffeffb8..ca16bfd20e 100644
--- a/erts/emulator/beam/export.c
+++ b/erts/emulator/beam/export.c
@@ -129,14 +129,17 @@ export_alloc(struct export_entry* tmpl_e)
 	obj->info.mfa.module = tmpl->info.mfa.module;
 	obj->info.mfa.function = tmpl->info.mfa.function;
 	obj->info.mfa.arity = tmpl->info.mfa.arity;
-        obj->beam[0] = 0;
+        obj->bif_table_index = -1;
+        obj->is_bif_traced = 0;
+
+        memset(&obj->trampoline, 0, sizeof(obj->trampoline));
+
         if (BeamOpsAreInitialized()) {
-            obj->beam[0] = BeamOpCodeAddr(op_call_error_handler);
+            obj->trampoline.op = BeamOpCodeAddr(op_call_error_handler);
         }
-	obj->beam[1] = 0;
 
 	for (ix=0; ix<ERTS_NUM_CODE_IX; ix++) {
-	    obj->addressv[ix] = obj->beam;
+	    obj->addressv[ix] = obj->trampoline.raw;
 
 	    blob->entryv[ix].slot.index = -1;
 	    blob->entryv[ix].ep = &blob->exp;
@@ -196,6 +199,19 @@ init_export_table(void)
     }
 }
 
+static struct export_entry* init_template(struct export_templ* templ,
+					  Eterm m, Eterm f, unsigned a)
+{
+    templ->entry.ep = &templ->exp;
+    templ->entry.slot.index = -1;
+    templ->exp.info.mfa.module = m;
+    templ->exp.info.mfa.function = f;
+    templ->exp.info.mfa.arity = a;
+    templ->exp.bif_table_index = -1;
+    templ->exp.is_bif_traced = 0;
+    return &templ->entry;
+}
+
 /*
  * Return a pointer to the export entry for the given function,
  * or NULL otherwise.  Notes:
@@ -214,41 +230,15 @@ erts_find_export_entry(Eterm m, Eterm f, unsigned int a,ErtsCodeIndex code_ix);
 Export*
 erts_find_export_entry(Eterm m, Eterm f, unsigned int a, ErtsCodeIndex code_ix)
 {
-    HashValue hval = EXPORT_HASH((BeamInstr) m, (BeamInstr) f, (BeamInstr) a);
-    int ix;
-    HashBucket* b;
-
-    ix = hval % export_tables[code_ix].htable.size;
-    b = export_tables[code_ix].htable.bucket[ix];
-
-    /*
-     * Note: We have inlined the code from hash.c for speed.
-     */
-	
-    while (b != (HashBucket*) 0) {
-	Export* ep = ((struct export_entry*) b)->ep;
-	if (ep->info.mfa.module == m &&
-            ep->info.mfa.function == f &&
-            ep->info.mfa.arity == a) {
-	    return ep;
-	}
-	b = b->next;
-    }
+    struct export_templ templ;
+    struct export_entry *ee =
+        hash_fetch(&export_tables[code_ix].htable,
+                   init_template(&templ, m, f, a),
+                   (H_FUN)export_hash, (HCMP_FUN)export_cmp);
+    if (ee) return ee->ep;
     return NULL;
 }
 
-static struct export_entry* init_template(struct export_templ* templ,
-					  Eterm m, Eterm f, unsigned a)
-{
-    templ->entry.ep = &templ->exp;
-    templ->entry.slot.index = -1;
-    templ->exp.info.mfa.module = m;
-    templ->exp.info.mfa.function = f;
-    templ->exp.info.mfa.arity = a;
-    return &templ->entry;
-}
-
-
 /*
  * Find the export entry for a loaded function.
  * Returns a NULL pointer if the given function is not loaded, or
@@ -268,8 +258,8 @@ erts_find_function(Eterm m, Eterm f, unsigned int a, ErtsCodeIndex code_ix)
 
     ee = hash_get(&export_tables[code_ix].htable, init_template(&templ, m, f, a));
     if (ee == NULL ||
-	(ee->ep->addressv[code_ix] == ee->ep->beam &&
-	 ! BeamIsOpCode(ee->ep->beam[0], op_i_generic_breakpoint))) {
+	(ee->ep->addressv[code_ix] == ee->ep->trampoline.raw &&
+	 ! BeamIsOpCode(ee->ep->trampoline.op, op_i_generic_breakpoint))) {
 	return NULL;
     }
     return ee->ep;
diff --git a/erts/emulator/beam/export.h b/erts/emulator/beam/export.h
index ae8dfa4cf8..0190624f79 100644
--- a/erts/emulator/beam/export.h
+++ b/erts/emulator/beam/export.h
@@ -31,24 +31,72 @@
 
 typedef struct export
 {
-    void* addressv[ERTS_NUM_CODE_IX];  /* Pointer to code for function. */
-
-    ErtsCodeInfo info; /* MUST be just before beam[] */
-
-    /*
-     * beam[0]: This entry is 0 unless the 'addressv' field points to it.
-     *          Threaded code instruction to load function
-     *		(em_call_error_handler), execute BIF (em_apply_bif),
-     *		or a breakpoint instruction (op_i_generic_breakpoint).
-     * beam[1]: Function pointer to BIF function (for BIFs only),
-     *		or pointer to threaded code if the module has an
-     *		on_load function that has not been run yet, or pointer
-     *          to code if function beam[0] is a breakpoint instruction.
-     *		Otherwise: 0.
-     */
-    BeamInstr beam[2];
+    /* Pointer to code for function. */
+    void* addressv[ERTS_NUM_CODE_IX];
+
+    /* Index into bif_table[], or -1 if not a BIF. */
+    int bif_table_index;
+    /* Non-zero if this is a BIF that's traced. */
+    int is_bif_traced;
+
+    /* This is a small trampoline function that can be used for lazy code
+     * loading, global call tracing, and so on. It's only valid when
+     * addressv points to it and should otherwise be left zeroed.
+     *
+     * Needless to say, the order of the fields below is significant. */
+    ErtsCodeInfo info;
+    union {
+        BeamInstr op;           /* Union discriminant. */
+
+        struct {
+            BeamInstr op;       /* op_i_generic_breakpoint */
+            BeamInstr address;  /* Address of the original function */
+        } breakpoint;
+
+        /* This is used when a module refers to (imports) a function that
+         * hasn't been loaded yet. Upon loading we create an export entry which
+         * redirects to the error_handler so that the appropriate module will
+         * be loaded when called (or crash).
+         *
+         * This is also used when a module has an on_load callback as we need
+         * to defer all calls until the callback returns. `deferred` contains
+         * the address of the original function in this case, and there's an
+         * awkward condition where `deferred` may be set while op is zero. See
+         * erlang:finish_after_on_load/2 for details. */
+        struct {
+            BeamInstr op;       /* op_call_error_handler, or 0 during the last
+                                 * phase of code loading when on_load is
+                                 * present. See above. */
+            BeamInstr deferred;
+        } not_loaded;
+
+        struct {
+            BeamInstr op;       /* op_trace_jump_W */
+            BeamInstr address;  /* Address of the traced function */
+        } trace;
+
+        BeamInstr raw[2];       /* For use in address comparisons, should not
+                                 * be tampered with directly. */
+    } trampoline;
 } Export;
 
+#ifdef DEBUG
+#define DBG_CHECK_EXPORT(EP, CX) \
+    do { \
+        if((EP)->addressv[CX] == (EP)->trampoline.raw) { \
+            /* The entry currently points at the trampoline, so the
+             * instructions must be valid. */ \
+            ASSERT(((BeamIsOpCode((EP)->trampoline.op, op_i_generic_breakpoint)) && \
+                    (EP)->trampoline.breakpoint.address != 0) || \
+                   ((BeamIsOpCode((EP)->trampoline.op, op_trace_jump_W)) && \
+                    (EP)->trampoline.trace.address != 0) || \
+                   /* (EP)->trampoline.not_loaded.deferred may be zero. */ \
+                   (BeamIsOpCode((EP)->trampoline.op, op_call_error_handler))); \
+        } \
+    } while(0)
+#else
+#define DBG_CHECK_EXPORT(EP, CX) ((void)(EP), (void)(CX))
+#endif
 
 void init_export_table(void);
 void export_info(fmtfn_t, void *);
@@ -71,9 +119,6 @@ extern erts_mtx_t export_staging_lock;
 #define export_staging_unlock()	erts_mtx_unlock(&export_staging_lock)
 
 #include "beam_load.h" /* For em_* extern declarations */ 
-#define ExportIsBuiltIn(EntryPtr) 			\
-(((EntryPtr)->addressv[erts_active_code_ix()] == (EntryPtr)->beam) && \
- (BeamIsOpCode((EntryPtr)->beam[0], op_apply_bif)))
 
 #if ERTS_GLB_INLINE_INCL_FUNC_DEF
 
diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c
index 39bbf62eae..a575e1d743 100644
--- a/erts/emulator/beam/external.c
+++ b/erts/emulator/beam/external.c
@@ -51,18 +51,17 @@
 
 #define MAX_STRING_LEN 0xffff
 
-/* MAX value for the creation field in pid, port and reference
-   for the local node and for the current external format.
-
-   Larger creation values than this are allowed in external pid, port and refs
-   encoded with NEW_PID_EXT, NEW_PORT_EXT and NEWER_REFERENCE_EXT.
-   The point here is to prepare for future upgrade to 32-bit creation.
-   OTP-19 (erts-8.0) can handle big creation values from other (newer) nodes,
-   but do not use big creation values for the local node yet,
-   as we still may have to communicate with older nodes.
+/*
+ * MAX value for the creation field in pid, port and reference
+ * for the old PID_EXT, PORT_EXT, REFERENCE_EXT and NEW_REFERENCE_EXT.
+ * Older nodes (OTP 19-22) will send us these so we must be able to decode them.
+ *
+ * From OTP 23 DFLAG_BIG_CREATION is mandatory so this node will always
+ * encode with new big 32-bit creations using NEW_PID_EXT, NEW_PORT_EXT
+ * and NEWER_REFERENCE_EXT.
 */
-#define ERTS_MAX_LOCAL_CREATION (3)
-#define is_valid_creation(Cre) ((unsigned)(Cre) <= ERTS_MAX_LOCAL_CREATION)
+#define ERTS_MAX_TINY_CREATION (3)
+#define is_tiny_creation(Cre) ((unsigned)(Cre) <= ERTS_MAX_TINY_CREATION)
 
 #undef ERTS_DEBUG_USE_DIST_SEP
 #ifdef DEBUG
@@ -2469,7 +2468,8 @@ enc_pid(ErtsAtomCacheMap *acmp, Eterm pid, byte* ep, Uint32 dflags)
     Eterm sysname = ((is_internal_pid(pid) && (dflags & DFLAG_INTERNAL_TAGS))
 		      ? INTERNAL_LOCAL_SYSNAME : pid_node_name(pid));
     Uint32 creation = pid_creation(pid);
-    byte* tagp = ep++;
+
+    *ep++ = NEW_PID_EXT;
 
     /* insert  atom here containing host and sysname  */
     ep = enc_atom(acmp, sysname, ep, dflags);
@@ -2481,15 +2481,8 @@ enc_pid(ErtsAtomCacheMap *acmp, Eterm pid, byte* ep, Uint32 dflags)
     ep += 4;
     put_int32(os, ep);
     ep += 4;
-    if (creation <= ERTS_MAX_LOCAL_CREATION) {
-        *tagp = PID_EXT;
-        *ep++ = creation;
-    } else {
-        ASSERT(is_external_pid(pid));
-        *tagp = NEW_PID_EXT;
-        put_int32(creation, ep);
-        ep += 4;
-    }
+    put_int32(creation, ep);
+    ep += 4;
     return ep;
 }
 
@@ -2609,7 +2602,7 @@ dec_pid(ErtsDistExternal *edep, ErtsHeapFactory* factory, byte* ep,
     if (tag == PID_EXT) {
         cre = get_int8(ep);
         ep += 1;
-        if (!is_valid_creation(cre)) {
+        if (!is_tiny_creation(cre)) {
             return NULL;
         }
     } else {
@@ -2870,25 +2863,18 @@ enc_term_int(TTBEncodeContext* ctx, ErtsAtomCacheMap *acmp, Eterm obj, byte* ep,
 	    Eterm sysname = (((dflags & DFLAG_INTERNAL_TAGS) && is_internal_ref(obj))
 			     ? INTERNAL_LOCAL_SYSNAME : ref_node_name(obj));
             Uint32 creation = ref_creation(obj);
-            byte* tagp = ep++;
 
 	    ASSERT(dflags & DFLAG_EXTENDED_REFERENCES);
 
 	    erts_magic_ref_save_bin(obj);
 
+            *ep++ = NEWER_REFERENCE_EXT;
 	    i = ref_no_numbers(obj);
 	    put_int16(i, ep);
 	    ep += 2;
 	    ep = enc_atom(acmp, sysname, ep, dflags);
-            if (creation <= ERTS_MAX_LOCAL_CREATION) {
-                *tagp = NEW_REFERENCE_EXT;
-                *ep++ = creation;
-            } else {
-                ASSERT(is_external_ref(obj));
-                *tagp = NEWER_REFERENCE_EXT;
-                put_int32(creation, ep);
-                ep += 4;
-            }
+            put_int32(creation, ep);
+            ep += 4;
 	    ref_num = ref_numbers(obj);
 	    for (j = 0; j < i; j++) {
 		put_int32(ref_num[j], ep);
@@ -2901,21 +2887,14 @@ enc_term_int(TTBEncodeContext* ctx, ErtsAtomCacheMap *acmp, Eterm obj, byte* ep,
 	    Eterm sysname = (((dflags & DFLAG_INTERNAL_TAGS) && is_internal_port(obj))
 			     ? INTERNAL_LOCAL_SYSNAME : port_node_name(obj));
             Uint32 creation = port_creation(obj);
-            byte* tagp = ep++;
 
+            *ep++ = NEW_PORT_EXT;
 	    ep = enc_atom(acmp, sysname, ep, dflags);
 	    j = port_number(obj);
 	    put_int32(j, ep);
 	    ep += 4;
-            if (creation <= ERTS_MAX_LOCAL_CREATION) {
-                *tagp = PORT_EXT;
-                *ep++ = creation;
-            } else {
-                ASSERT(is_external_port(obj));
-                *tagp = NEW_PORT_EXT;
-                put_int32(creation, ep);
-                ep += 4;
-            }
+            put_int32(creation, ep);
+            ep += 4;
 	    break;
 	}
 	case LIST_DEF:
@@ -3610,7 +3589,7 @@ dec_term_atom_common:
                 if (tag == PORT_EXT) {
                     cre = get_int8(ep);
                     ep++;
-                    if (!is_valid_creation(cre)) {
+                    if (!is_tiny_creation(cre)) {
                         goto error;
                     }
                 }
@@ -3657,7 +3636,7 @@ dec_term_atom_common:
 
 		cre = get_int8(ep);
 		ep += 1;
-		if (!is_valid_creation(cre)) {
+		if (!is_tiny_creation(cre)) {
 		    goto error;
 		}
 		goto ref_ext_common;
@@ -3671,7 +3650,7 @@ dec_term_atom_common:
 
 		cre = get_int8(ep);
 		ep += 1;
-		if (!is_valid_creation(cre)) {
+		if (!is_tiny_creation(cre)) {
 		    goto error;
 		}
 		r0 = get_int32(ep);
@@ -4066,73 +4045,6 @@ dec_term_atom_common:
 		next = &(funp->creator);
 		break;
 	    }
-	case FUN_EXT:
-	    {
-		ErlFunThing* funp = (ErlFunThing *) hp;
-		Eterm module;
-		Sint old_uniq;
-		Sint old_index;
-		unsigned num_free;
-		int i;
-		Eterm temp;
-
-		num_free = get_int32(ep);
-		ep += 4;
-		hp += ERL_FUN_SIZE;
-		hp += num_free;
-		factory->hp = hp;
-		funp->thing_word = HEADER_FUN;
-		funp->num_free = num_free;
-		*objp = make_fun(funp);
-
-		/* Creator pid */
-		if ((*ep != PID_EXT && *ep != NEW_PID_EXT)
-		    || (ep = dec_pid(edep, factory, ep+1,
-				     &funp->creator, *ep))==NULL) {
-		    goto error;
-		}
-
-		/* Module */
-		if ((ep = dec_atom(edep, ep, &module)) == NULL) {
-		    goto error;
-		}
-
-		/* Index */
-		if ((ep = dec_term(edep, factory, ep, &temp, NULL)) == NULL) {
-		    goto error;
-		}
-		if (!is_small(temp)) {
-		    goto error;
-		}
-		old_index = unsigned_val(temp);
-
-		/* Uniq */
-		if ((ep = dec_term(edep, factory, ep, &temp, NULL)) == NULL) {
-		    goto error;
-		}
-		if (!is_small(temp)) {
-		    goto error;
-		}
-		
-		/*
-		 * It is safe to link the fun into the fun list only when
-		 * no more validity tests can fail.
-		 */
-		funp->next = factory->off_heap->first;
-		factory->off_heap->first = (struct erl_off_heap_header*)funp;
-		old_uniq = unsigned_val(temp);
-
-		funp->fe = erts_put_fun_entry(module, old_uniq, old_index);
-		funp->arity = funp->fe->address[-1] - num_free;
-		hp = factory->hp;
-
-		/* Environment */
-		for (i = num_free-1; i >= 0; i--) {
-		    funp->env[i] = (Eterm) next;
-		    next = funp->env + i;
-		}
-		break;
-	    }
 	case ATOM_INTERNAL_REF2:
 	    n = get_int16(ep);
 	    ep += 2;
@@ -4401,30 +4313,21 @@ encode_size_struct_int(TTBSizeContext* ctx, ErtsAtomCacheMap *acmp, Eterm obj,
 		result += 1 + 4 + 1 + i;  /* tag,size,sign,digits */
 	    break;
         case EXTERNAL_PID_DEF:
-            if (external_pid_creation(obj) > ERTS_MAX_LOCAL_CREATION)
-                result += 3;
-            /*fall through*/
 	case PID_DEF:
 	    result += (1 + encode_size_struct2(acmp, pid_node_name(obj), dflags) +
-		       4 + 4 + 1);
+		       4 + 4 + 4);
 	    break;
         case EXTERNAL_REF_DEF:
-            if (external_ref_creation(obj) > ERTS_MAX_LOCAL_CREATION)
-                result += 3;
-            /*fall through*/
 	case REF_DEF:
 	    ASSERT(dflags & DFLAG_EXTENDED_REFERENCES);
 	    i = ref_no_numbers(obj);
 	    result += (1 + 2 + encode_size_struct2(acmp, ref_node_name(obj), dflags) +
-		       1 + 4*i);
+		       4 + 4*i);
 	    break;
         case EXTERNAL_PORT_DEF:
-            if (external_port_creation(obj) > ERTS_MAX_LOCAL_CREATION)
-                result += 3;
-            /*fall through*/
         case PORT_DEF:
 	    result += (1 + encode_size_struct2(acmp, port_node_name(obj), dflags) +
-		      4 + 1);
+		      4 + 4);
 	    break;
 	case LIST_DEF: {
 	    int is_str = is_external_string(obj, &m);
@@ -4891,9 +4794,6 @@ init_done:
 		total_size = get_int32(ep);
 		CHKSIZE(total_size);		
 		ep += 1+16+4+4;
-		/*FALLTHROUGH*/
-
-	    case FUN_EXT:
 		CHKSIZE(4);
 		num_free = get_int32(ep);
 		ep += 4;
@@ -4904,6 +4804,12 @@ init_done:
 		heap_size += ERL_FUN_SIZE + num_free;
 		break;
 	    }
+	case FUN_EXT:
+            /*
+             * OTP 23: No longer support decoding the old fun
+             * representation.
+             */
+            goto error;
 	case ATOM_INTERNAL_REF2:
 	    SKIP(2+atom_extra_skip);
 	    atom_extra_skip = 0;
diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h
index 40c65461bc..b86709b093 100644
--- a/erts/emulator/beam/global.h
+++ b/erts/emulator/beam/global.h
@@ -122,6 +122,10 @@ void erts_nif_demonitored(ErtsResource* resource);
 extern void erts_add_taint(Eterm mod_atom);
 extern Eterm erts_nif_taints(Process* p);
 extern void erts_print_nif_taints(fmtfn_t to, void* to_arg);
+
+/* Loads the specified NIF. The caller must have code write permission. */
+Eterm erts_load_nif(Process *c_p, BeamInstr *I, Eterm filename, Eterm args);
+
 void erts_unload_nif(struct erl_module_nif* nif);
 extern void erl_nif_init(void);
 extern int erts_nif_get_funcs(struct erl_module_nif*,
@@ -885,6 +889,8 @@ void erts_bif_info_init(void);
 
 /* bif.c */
 
+void erts_write_bif_wrapper(Export *export, BeamInstr *address);
+
 void erts_queue_monitor_message(Process *,
 				ErtsProcLocks*,
 				Eterm,
@@ -906,6 +912,9 @@ Eterm erts_trapping_length_1(Process* p, Eterm* args);
 
 Eterm erl_is_function(Process* p, Eterm arg1, Eterm arg2);
 
+/* beam_bif_lists.c */
+void erts_init_bif_lists(void);
+
 /* beam_bif_load.c */
 Eterm erts_check_process_code(Process *c_p, Eterm module, int *redsp, int fcalls);
 Eterm erts_proc_copy_literal_area(Process *c_p, int *redsp, int fcalls, int gc_allowed);
@@ -1147,6 +1156,7 @@ void erts_dirty_process_main(ErtsSchedulerData *);
 Eterm build_stacktrace(Process* c_p, Eterm exc);
 Eterm expand_error_value(Process* c_p, Uint freason, Eterm Value);
 void erts_save_stacktrace(Process* p, struct StackTrace* s, int depth);
+BeamInstr *erts_printable_return_address(Process* p, Eterm *E) ERTS_NOINLINE;
 
 /* erl_init.c */
 
diff --git a/erts/emulator/beam/hash.c b/erts/emulator/beam/hash.c
index 8954dbb06c..177b7cc3d1 100644
--- a/erts/emulator/beam/hash.c
+++ b/erts/emulator/beam/hash.c
@@ -30,37 +30,19 @@
 #include "hash.h"
 
 /*
-** List of sizes (all are primes)
-*/
-static const int h_size_table[] = {
-    2, 5, 11, 23, 47, 97, 197, 397, 797,  /* double upto here */
-    1201,   1597,
-    2411,   3203,
-    4813,   6421,
-    9643,   12853,
-    19289,  25717,
-    51437,
-    102877,
-    205759,
-    411527,
-    823117,
-    1646237,
-    3292489,
-    6584983,
-    13169977,
-    26339969,
-    52679969,
-    -1
-};
-
-/*
 ** Get info about hash
 **
 */
 
+#define MAX_SHIFT (ERTS_SIZEOF_TERM * 8)
+
+static int hash_get_slots(Hash *h) {
+    return UWORD_CONSTANT(1) << (MAX_SHIFT - h->shift);
+}
+
 void hash_get_info(HashInfo *hi, Hash *h)
 {
-    int size = h->size;
+    int size = hash_get_slots(h);
     int i;
     int max_depth = 0;
     int objects = 0;
@@ -84,7 +66,7 @@ void hash_get_info(HashInfo *hi, Hash *h)
     ASSERT(objects == h->nobjs);
 
     hi->name  = h->name;
-    hi->size  = h->size;
+    hi->size  = hash_get_slots(h);
     hi->used  = used;
     hi->objs  = h->nobjs;
     hi->depth = max_depth;
@@ -118,15 +100,15 @@ hash_table_sz(Hash *h)
   int i;
   for(i=0;h->name[i];i++);
   i++;
-  return sizeof(Hash) + h->size*sizeof(HashBucket*) + i;
+  return sizeof(Hash) + hash_get_slots(h)*sizeof(HashBucket*) + i;
 }
 
 
 static ERTS_INLINE void set_thresholds(Hash* h)
 {
-    h->grow_threshold = (8*h->size)/5;   /* grow at 160% load */
-    if (h->size_ix > h->min_size_ix)
-        h->shrink_threshold = h->size / 5;  /* shrink at 20% load */
+    h->grow_threshold = (8*hash_get_slots(h))/5;   /* grow at 160% load */
+    if (h->shift < h->max_shift)
+        h->shrink_threshold = hash_get_slots(h) / 5;  /* shrink at 20% load */
     else
         h->shrink_threshold = -1;  /* never shrink below inital size */
 }
@@ -138,29 +120,27 @@ static ERTS_INLINE void set_thresholds(Hash* h)
 Hash* hash_init(int type, Hash* h, char* name, int size, HashFunctions fun)
 {
     int sz;
-    int ix = 0;
+    int shift = 1;
 
     h->meta_alloc_type = type;
 
-    while (h_size_table[ix] != -1 && h_size_table[ix] < size)
-	ix++;
-    if (h_size_table[ix] == -1)
-	return NULL;
-
-    size = h_size_table[ix];
-    sz = size*sizeof(HashBucket*);
+    while ((UWORD_CONSTANT(1) << shift) < size)
+        shift++;
 
-    h->bucket = (HashBucket**) fun.meta_alloc(h->meta_alloc_type, sz);
-
-    memzero(h->bucket, sz);
     h->is_allocated = 0;
     h->name = name;
     h->fun = fun;
-    h->size = size;
-    h->size_ix = ix;
-    h->min_size_ix = ix;
+    h->shift = MAX_SHIFT - shift;
+    h->max_shift = h->shift;
     h->nobjs = 0;
     set_thresholds(h);
+
+    sz = hash_get_slots(h) * sizeof(HashBucket*);
+    h->bucket = (HashBucket**) fun.meta_alloc(h->meta_alloc_type, sz);
+    memzero(h->bucket, sz);
+
+    ASSERT(h->shift > 0 && h->shift < 64);
+
     return h;
 }
 
@@ -183,7 +163,7 @@ Hash* hash_new(int type, char* name, int size, HashFunctions fun)
 */
 void hash_delete(Hash* h)
 {
-    int old_size = h->size;
+    int old_size = hash_get_slots(h);
     int i;
 
     for (i = 0; i < old_size; i++) {
@@ -206,22 +186,20 @@ void hash_delete(Hash* h)
 static void rehash(Hash* h, int grow)
 {
     int sz;
-    int old_size = h->size;
+    int old_size = hash_get_slots(h);
     HashBucket** new_bucket;
     int i;
 
     if (grow) {
-	if ((h_size_table[h->size_ix+1]) == -1)
-	    return;
-	h->size_ix++;
+	h->shift--;
     }
     else {
-	if (h->size_ix == 0)
+	if (h->shift == h->max_shift)
 	    return;
-	h->size_ix--;
+	h->shift++;
     }
-    h->size = h_size_table[h->size_ix];
-    sz = h->size*sizeof(HashBucket*);
+
+    sz = hash_get_slots(h)*sizeof(HashBucket*);
 
     new_bucket = (HashBucket **) h->fun.meta_alloc(h->meta_alloc_type, sz);
     memzero(new_bucket, sz);
@@ -230,7 +208,7 @@ static void rehash(Hash* h, int grow)
 	HashBucket* b = h->bucket[i];
 	while (b != (HashBucket*) 0) {
 	    HashBucket* b_next = b->next;
-	    int ix = b->hvalue % h->size;
+	    Uint ix = hash_get_slot(h, b->hvalue);
 	    b->next = new_bucket[ix];
 	    new_bucket[ix] = b;
 	    b = b_next;
@@ -247,16 +225,7 @@ static void rehash(Hash* h, int grow)
 */
 void* hash_get(Hash* h, void* tmpl)
 {
-    HashValue hval = h->fun.hash(tmpl);
-    int ix = hval % h->size;
-    HashBucket* b = h->bucket[ix];
-
-    while(b != (HashBucket*) 0) {
-	if ((b->hvalue == hval) && (h->fun.cmp(tmpl, (void*)b) == 0))
-	    return (void*) b;
-	b = b->next;
-    }
-    return (void*) 0;
+    return hash_fetch(h, tmpl, h->fun.hash, h->fun.cmp);
 }
 
 /*
@@ -265,7 +234,7 @@ void* hash_get(Hash* h, void* tmpl)
 void* hash_put(Hash* h, void* tmpl)
 {
     HashValue hval = h->fun.hash(tmpl);
-    int ix = hval % h->size;
+    Uint ix = hash_get_slot(h, hval);
     HashBucket* b = h->bucket[ix];
 
     while(b != (HashBucket*) 0) {
@@ -291,7 +260,7 @@ void* hash_put(Hash* h, void* tmpl)
 void* hash_erase(Hash* h, void* tmpl)
 {
     HashValue hval = h->fun.hash(tmpl);
-    int ix = hval % h->size;
+    Uint ix = hash_get_slot(h, hval);
     HashBucket* b = h->bucket[ix];
     HashBucket* prev = 0;
 
@@ -323,7 +292,7 @@ void *
 hash_remove(Hash *h, void *tmpl)
 {
     HashValue hval = h->fun.hash(tmpl);
-    int ix = hval % h->size;
+    Uint ix = hash_get_slot(h, hval);
     HashBucket *b = h->bucket[ix];
     HashBucket *prev = NULL;
 
@@ -343,11 +312,11 @@ hash_remove(Hash *h, void *tmpl)
     return NULL;
 }
 
-void hash_foreach(Hash* h, void (*func)(void *, void *), void *func_arg2)
+void hash_foreach(Hash* h, HFOREACH_FUN func, void *func_arg2)
 {
     int i;
 
-    for (i = 0; i < h->size; i++) {
+    for (i = 0; i < hash_get_slots(h); i++) {
 	HashBucket* b = h->bucket[i];
 	while(b != (HashBucket*) 0) {
 	    (*func)((void *) b, func_arg2);
diff --git a/erts/emulator/beam/hash.h b/erts/emulator/beam/hash.h
index d319aaca83..4e8eb6594b 100644
--- a/erts/emulator/beam/hash.h
+++ b/erts/emulator/beam/hash.h
@@ -18,16 +18,16 @@
  * %CopyrightEnd%
  */
 
-/*
-** General hash functions
-**
-*/
+/**
+ * General hash functions
+ *
+ **/
 #ifndef __HASH_H__
 #define __HASH_H__
 
 #include "sys.h"
 
-typedef unsigned long HashValue;
+typedef UWord HashValue;
 typedef struct hash Hash;
 
 typedef int (*HCMP_FUN)(void*, void*);
@@ -38,6 +38,7 @@ typedef void (*HFREE_FUN)(void*);
 typedef void* (*HMALLOC_FUN)(int,size_t);
 typedef void (*HMFREE_FUN)(int,void*);
 typedef int (*HMPRINT_FUN)(fmtfn_t,void*,char*, ...);
+typedef void (*HFOREACH_FUN)(void *, void *);
 
 /*
 ** This bucket must be placed in top of 
@@ -75,11 +76,10 @@ struct hash
     int is_allocated;    /* 0 iff hash structure is on stack or is static */
     int meta_alloc_type; /* argument to pass to meta_alloc and meta_free */
     char* name;          /* Table name (static string, for debugging) */
-    int size;		 /* Number of slots */
+    int shift;		 /* How much to shift the hash value */
+    int max_shift;       /* Never shift more than this value */
     int shrink_threshold;
     int grow_threshold;
-    int size_ix;         /* Size index in size table */
-    int min_size_ix;     /* Never shrink table smaller than this */
     int nobjs;		 /* Number of objects in table */
     HashBucket** bucket; /* Vector of bucket pointers (objects) */
 };
@@ -96,6 +96,54 @@ void* hash_get(Hash*, void*);
 void* hash_put(Hash*, void*);
 void* hash_erase(Hash*, void*);
 void* hash_remove(Hash*, void*);
-void  hash_foreach(Hash*, void (*func)(void *, void *), void *);
+void  hash_foreach(Hash*, HFOREACH_FUN, void *);
+
+ERTS_GLB_INLINE Uint hash_get_slot(Hash *h, HashValue hv);
+ERTS_GLB_INLINE void* hash_fetch(Hash *, void*, H_FUN, HCMP_FUN);
+
+#if ERTS_GLB_INLINE_INCL_FUNC_DEF
+
+ERTS_GLB_INLINE Uint
+hash_get_slot(Hash *h, HashValue hv)
+{
+    /* This slot mapping function uses Fibonacci hashing in order to
+     * protect itself against a very bad hash function. This is not
+     * a hash function, so the user of hash.h should still spend time
+     * to figure out a good hash function for its data.
+     *
+     * See https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
+     * for some thoughts and ideas about Fibonacci hashing.
+     */
+
+    /* This is not strictly part of the Fibonacci hashing algorithm
+     * but it does help to spread the values of the mapping function better.
+     */
+    hv ^= hv >> h->shift;
+#ifdef ARCH_64
+    /* 2^64 / 1.61803398875 = 11400714819323198485.... */
+    return (UWORD_CONSTANT(11400714819323198485) * hv) >> h->shift;
+#else
+    /* 2^32 / 1.61803398875 = 2654435769.... */
+    return (UWORD_CONSTANT(2654435769) * hv) >> h->shift;
+#endif
+}
+
+ERTS_GLB_INLINE void* hash_fetch(Hash *h, void* tmpl, H_FUN hash, HCMP_FUN cmp)
+{
+    HashValue hval = hash(tmpl);
+    Uint ix = hash_get_slot(h, hval);
+    HashBucket* b = h->bucket[ix];
+    ASSERT(h->fun.hash == hash);
+    ASSERT(h->fun.cmp == cmp);
+
+    while(b != (HashBucket*) 0) {
+	if ((b->hvalue == hval) && (cmp(tmpl, (void*)b) == 0))
+	    return (void*) b;
+	b = b->next;
+    }
+    return (void*) 0;
+}
+
+#endif /* ERTS_GLB_INLINE_INCL_FUNC_DEF */
 
 #endif
diff --git a/erts/emulator/beam/index.c b/erts/emulator/beam/index.c
index be1771b037..09d3c24424 100644
--- a/erts/emulator/beam/index.c
+++ b/erts/emulator/beam/index.c
@@ -114,35 +114,26 @@ int index_get(IndexTable* t, void* tmpl)
     return -1;
 }
 
-void erts_index_merge(Hash* src, IndexTable* dst)
+static void index_merge_foreach(IndexSlot *p, IndexTable *dst)
 {
-    int limit = src->size;
-    HashBucket** bucket = src->bucket;
-    int i;
-
-    for (i = 0; i < limit; i++) {
-	HashBucket* b = bucket[i];
-	IndexSlot* p;
-	int ix;
-
-	while (b) {
-	    Uint sz;
-	    ix = dst->entries++;
-	    if (ix >= dst->size) {
-		if (ix >= dst->limit) {
-		    erts_exit(ERTS_ERROR_EXIT, "no more index entries in %s (max=%d)\n",
-			     dst->htable.name, dst->limit);
-		}
-		sz = INDEX_PAGE_SIZE*sizeof(IndexSlot*);
-		dst->seg_table[ix>>INDEX_PAGE_SHIFT] = erts_alloc(dst->type, sz);
-		dst->size += INDEX_PAGE_SIZE;
-	    }
-	    p = (IndexSlot*) b;
-	    p->index = ix;
-	    dst->seg_table[ix>>INDEX_PAGE_SHIFT][ix&INDEX_PAGE_MASK] = p;
-	    b = b->next;
-	}
+    Uint sz;
+    int ix = dst->entries++;
+    if (ix >= dst->size) {
+        if (ix >= dst->limit) {
+            erts_exit(ERTS_ERROR_EXIT, "no more index entries in %s (max=%d)\n",
+                      dst->htable.name, dst->limit);
+        }
+        sz = INDEX_PAGE_SIZE*sizeof(IndexSlot*);
+        dst->seg_table[ix>>INDEX_PAGE_SHIFT] = erts_alloc(dst->type, sz);
+        dst->size += INDEX_PAGE_SIZE;
     }
+    p->index = ix;
+    dst->seg_table[ix>>INDEX_PAGE_SHIFT][ix&INDEX_PAGE_MASK] = p;
+}
+
+void erts_index_merge(Hash* src, IndexTable* dst)
+{
+    hash_foreach(src, (HFOREACH_FUN)index_merge_foreach, dst);
 }
 
 void index_erase_latest_from(IndexTable* t, Uint from_ix)
diff --git a/erts/emulator/beam/instrs.tab b/erts/emulator/beam/instrs.tab
index 7cffe7fb5c..f53d60a5db 100644
--- a/erts/emulator/beam/instrs.tab
+++ b/erts/emulator/beam/instrs.tab
@@ -19,7 +19,12 @@
 // %CopyrightEnd%
 //
 
-// Stack manipulation instructions
+//
+// Stack manipulation instructions follow.
+//
+// See the comment for AH() in macros.tab for information about
+// the layout of stack frames.
+//
 
 allocate(NeedStack, Live) {
     $AH($NeedStack, 0, $Live);
@@ -58,105 +63,170 @@ allocate_heap_zero(NeedStack, NeedHeap, Live) {
 
 deallocate(Deallocate) {
     //| -no_prefetch
-    SET_CP(c_p, (BeamInstr *) cp_val(*E));
     E = ADD_BYTE_OFFSET(E, $Deallocate);
 }
 
-deallocate_return(Deallocate) {
-    //| -no_next
-    int words_to_pop = $Deallocate;
-    SET_I((BeamInstr *) cp_val(*E));
-    E = ADD_BYTE_OFFSET(E, words_to_pop);
-    CHECK_TERM(x(0));
-    DispatchReturn;
+//
+// Micro-benchmarks showed that the deallocate_return instruction
+// became slower when the continuation pointer was moved from
+// the process struct to the stack. The reason seems to be read
+// dependencies, i.e. that the CPU cannot figure out beforehand
+// from which position on the stack the continuation pointer
+// should be fetched.
+//
+// Initializing num_bytes with a constant value seems to restore
+// the lost speed, so we've specialized the instruction for the
+// most common values.
+//
+
+deallocate_return0 := dealloc_ret.n0.execute;
+deallocate_return1 := dealloc_ret.n1.execute;
+deallocate_return2 := dealloc_ret.n2.execute;
+deallocate_return3 := dealloc_ret.n3.execute;
+deallocate_return4 := dealloc_ret.n4.execute;
+deallocate_return := dealloc_ret.var.execute;
+
+dealloc_ret.head() {
+    Uint num_bytes;
 }
 
-move_deallocate_return(Src, Deallocate) {
-    x(0) = $Src;
-    $deallocate_return($Deallocate);
+dealloc_ret.n0() {
+    num_bytes = (0+1) * sizeof(Eterm);
 }
 
-// Call instructions
+dealloc_ret.n1() {
+    num_bytes = (1+1) * sizeof(Eterm);
+}
+
+dealloc_ret.n2() {
+    num_bytes = (2+1) * sizeof(Eterm);
+}
+
+dealloc_ret.n3() {
+    num_bytes = (3+1) * sizeof(Eterm);
+}
+
+dealloc_ret.n4() {
+    num_bytes = (4+1) * sizeof(Eterm);
+}
+
+dealloc_ret.var(Deallocate) {
+    num_bytes = $Deallocate;
+}
 
-DISPATCH_REL(CallDest) {
+dealloc_ret.execute() {
     //| -no_next
-    $SET_I_REL($CallDest);
-    DTRACE_LOCAL_CALL(c_p, erts_code_to_codemfa(I));
-    Dispatch();
+
+    E = ADD_BYTE_OFFSET(E, num_bytes);
+    $RETURN();
+    CHECK_TERM(x(0));
+    $DISPATCH_RETURN();
 }
 
-DISPATCH_ABS(CallDest) {
+move_deallocate_return(Src, Deallocate) {
     //| -no_next
-    SET_I((BeamInstr *) $CallDest);
-    DTRACE_LOCAL_CALL(c_p, erts_code_to_codemfa(I));
-    Dispatch();
+
+    /*
+     * Explicitly do reads first to mitigate the impact of read
+     * dependencies.
+     */
+
+    Uint bytes_to_pop = $Deallocate;
+    Eterm src = $Src;
+    E = ADD_BYTE_OFFSET(E, bytes_to_pop);
+    x(0) = src;
+    DTRACE_RETURN_FROM_PC(c_p, I);
+    $RETURN();
+    CHECK_TERM(x(0));
+    $DISPATCH_RETURN();
 }
 
+// Call instructions
+
 i_call(CallDest) {
-    SET_CP(c_p, $NEXT_INSTRUCTION);
+    //| -no_next
+    $SAVE_CONTINUATION_POINTER($NEXT_INSTRUCTION);
     $DISPATCH_REL($CallDest);
 }
 
 move_call(Src, CallDest) {
-    x(0) = $Src;
-    SET_CP(c_p, $NEXT_INSTRUCTION);
-    $DISPATCH_REL($CallDest);
+    //| -no_next
+    Eterm call_dest = $CallDest;
+    Eterm src = $Src;
+    $SAVE_CONTINUATION_POINTER($NEXT_INSTRUCTION);
+    x(0) = src;
+    $DISPATCH_REL(call_dest);
 }
 
 i_call_last(CallDest, Deallocate) {
+    //| -no_next
     $deallocate($Deallocate);
     $DISPATCH_REL($CallDest);
 }
 
 move_call_last(Src, CallDest, Deallocate) {
-    x(0) = $Src;
-    $i_call_last($CallDest, $Deallocate);
+    //| -no_next
+    Eterm call_dest = $CallDest;
+    Eterm src = $Src;
+    $deallocate($Deallocate);
+    x(0) = src;
+    $DISPATCH_REL(call_dest);
 }
 
 i_call_only(CallDest) {
+    //| -no_next
     $DISPATCH_REL($CallDest);
 }
 
 move_call_only(Src, CallDest) {
-    x(0) = $Src;
-    $i_call_only($CallDest);
-}
-
-DISPATCHX(Dest) {
     //| -no_next
-    DTRACE_GLOBAL_CALL_FROM_EXPORT(c_p, $Dest);
-    // Dispatchx assumes the Export* is in Arg(0)
-    I = (&$Dest) - 1;
-    Dispatchx();
+    Eterm call_dest = $CallDest;
+    Eterm src = $Src;
+    x(0) = src;
+    $DISPATCH_REL(call_dest);
 }
 
 i_call_ext(Dest) {
-    SET_CP(c_p, $NEXT_INSTRUCTION);
-    $DISPATCHX($Dest);
+    //| -no_next
+    $SAVE_CONTINUATION_POINTER($NEXT_INSTRUCTION);
+    $DISPATCH_EXPORT($Dest);
 }
 
-i_move_call_ext(Src, Dest) {
-    x(0) = $Src;
-    $i_call_ext($Dest);
+i_move_call_ext(Src, CallDest) {
+    //| -no_next
+    Eterm call_dest = $CallDest;
+    Eterm src = $Src;
+    $SAVE_CONTINUATION_POINTER($NEXT_INSTRUCTION);
+    x(0) = src;
+    $DISPATCH_EXPORT(call_dest);
 }
 
 i_call_ext_only(Dest) {
-    $DISPATCHX($Dest);
+    //| -no_next
+    $DISPATCH_EXPORT($Dest);
 }
 
-i_move_call_ext_only(Dest, Src) {
-    x(0) = $Src;
-    $i_call_ext_only($Dest);
+i_move_call_ext_only(CallDest, Src) {
+    //| -no_next
+    Eterm call_dest = $CallDest;
+    Eterm src = $Src;
+    x(0) = src;
+    $DISPATCH_EXPORT(call_dest);
 }
 
 i_call_ext_last(Dest, Deallocate) {
+    //| -no_next
     $deallocate($Deallocate);
-    $DISPATCHX($Dest);
+    $DISPATCH_EXPORT($Dest);
 }
 
-i_move_call_ext_last(Dest, StackOffset, Src) {
-    x(0) = $Src;
-    $i_call_ext_last($Dest, $StackOffset);
+i_move_call_ext_last(CallDest, Deallocate, Src) {
+    //| -no_next
+    Eterm call_dest = $CallDest;
+    Eterm src = $Src;
+    $deallocate($Deallocate);
+    x(0) = src;
+    $DISPATCH_EXPORT(call_dest);
 }
 
 APPLY(I, Deallocate, Next) {
@@ -172,16 +242,18 @@ HANDLE_APPLY_ERROR() {
 }
 
 i_apply() {
+    //| -no_next
     BeamInstr *next;
     $APPLY(NULL, 0, next);
     if (ERTS_LIKELY(next != NULL)) {
-        SET_CP(c_p, $NEXT_INSTRUCTION);
+        $SAVE_CONTINUATION_POINTER($NEXT_INSTRUCTION);
         $DISPATCH_ABS(next);
     }
     $HANDLE_APPLY_ERROR();
 }
 
 i_apply_last(Deallocate) {
+    //| -no_next
     BeamInstr *next;
     $APPLY(I, $Deallocate, next);
     if (ERTS_LIKELY(next != NULL)) {
@@ -192,6 +264,7 @@ i_apply_last(Deallocate) {
 }
 
 i_apply_only() {
+    //| -no_next
     BeamInstr *next;
     $APPLY(I, 0, next);
     if (ERTS_LIKELY(next != NULL)) {
@@ -208,16 +281,18 @@ FIXED_APPLY(Arity, I, Deallocate, Next) {
 }
 
 apply(Arity) {
+    //| -no_next
     BeamInstr *next;
     $FIXED_APPLY($Arity, NULL, 0, next);
     if (ERTS_LIKELY(next != NULL)) {
-        SET_CP(c_p, $NEXT_INSTRUCTION);
+        $SAVE_CONTINUATION_POINTER($NEXT_INSTRUCTION);
         $DISPATCH_ABS(next);
     }
     $HANDLE_APPLY_ERROR();
 }
 
 apply_last(Arity, Deallocate) {
+    //| -no_next
     BeamInstr *next;
     $FIXED_APPLY($Arity, I, $Deallocate, next);
     if (ERTS_LIKELY(next != NULL)) {
@@ -237,23 +312,19 @@ HANDLE_APPLY_FUN_ERROR() {
      goto find_func_info;
 }
 
-DISPATCH_FUN(I) {
-    //| -no_next
-    SET_I($I);
-    Dispatchfun();
-}
-
 i_apply_fun() {
+    //| -no_next
     BeamInstr *next;
     $APPLY_FUN(next);
     if (ERTS_LIKELY(next != NULL)) {
-        SET_CP(c_p, $NEXT_INSTRUCTION);
+        $SAVE_CONTINUATION_POINTER($NEXT_INSTRUCTION);
         $DISPATCH_FUN(next);
     }
     $HANDLE_APPLY_FUN_ERROR();
 }
 
 i_apply_fun_last(Deallocate) {
+    //| -no_next
     BeamInstr *next;
     $APPLY_FUN(next);
     if (ERTS_LIKELY(next != NULL)) {
@@ -264,6 +335,7 @@ i_apply_fun_last(Deallocate) {
 }
 
 i_apply_fun_only() {
+    //| -no_next
     BeamInstr *next;
     $APPLY_FUN(next);
     if (ERTS_LIKELY(next != NULL)) {
@@ -280,16 +352,18 @@ CALL_FUN(Fun, Next) {
 }
 
 i_call_fun(Fun) {
+    //| -no_next
     BeamInstr *next;
     $CALL_FUN($Fun, next);
     if (ERTS_LIKELY(next != NULL)) {
-        SET_CP(c_p, $NEXT_INSTRUCTION);
+        $SAVE_CONTINUATION_POINTER($NEXT_INSTRUCTION);
         $DISPATCH_FUN(next);
     }
     $HANDLE_APPLY_FUN_ERROR();
 }
 
 i_call_fun_last(Fun, Deallocate) {
+    //| -no_next
     BeamInstr *next;
     $CALL_FUN($Fun, next);
     if (ERTS_LIKELY(next != NULL)) {
@@ -301,18 +375,12 @@ i_call_fun_last(Fun, Deallocate) {
 
 return() {
     //| -no_next
-    SET_I(c_p->cp);
-    DTRACE_RETURN_FROM_PC(c_p);
-
-    /*
-     * We must clear the CP to make sure that a stale value do not
-     * create a false module dependcy preventing code upgrading.
-     * It also means that we can use the CP in stack backtraces.
-     */
-    c_p->cp = 0;
+    DTRACE_RETURN_FROM_PC(c_p, I);
+    $RETURN();
     CHECK_TERM(r(0));
     HEAP_SPACE_VERIFIED(0);
-    DispatchReturn;
+
+    $DISPATCH_RETURN();
 }
 
 get_list(Src, Hd, Tl) {
@@ -478,16 +546,21 @@ i_make_fun(FunP, NumFree) {
 }
 
 move_trim(Src, Dst, Words) {
-    Uint cp = E[0];
     $Dst = $Src;
-    E += $Words;
-    E[0] = cp;
+    $i_trim($Words);
 }
 
 i_trim(Words) {
-    Uint cp = E[0];
     E += $Words;
-    E[0] = cp;
+
+    /*
+     * Clear the reserved location for the continuation pointer at
+     * E[0]. This is not strictly necessary for correctness, but if a
+     * GC is triggered before E[0] is overwritten by another
+     * continuation pointer the now dead term at E[0] would be
+     * retained by the GC.
+     */
+    E[0] = NIL;
 }
 
 move(Src, Dst) {
@@ -599,9 +672,9 @@ move_window5(S1, S2, S3, S4, S5, D) {
 move_return(Src) {
     //| -no_next
     x(0) = $Src;
-    SET_I(c_p->cp);
-    c_p->cp = 0;
-    DispatchReturn;
+    DTRACE_RETURN_FROM_PC(c_p, I);
+    $RETURN();
+    $DISPATCH_RETURN();
 }
 
 move_x1(Src) {
@@ -683,10 +756,11 @@ swap(R1, R2) {
     $R2 = V;
 }
 
-swap_temp(R1, R2, Tmp) {
-    Eterm V = $R1;
-    $R1 = $R2;
-    $R2 = $Tmp = V;
+swap2(R1, R2, R3) {
+    Eterm V = $R2;
+    $R2 = $R1;
+    $R1 = $R3;
+    $R3 = V;
 }
 
 test_heap(Nh, Live) {
diff --git a/erts/emulator/beam/macros.tab b/erts/emulator/beam/macros.tab
index 1b5e5f66b0..848e35d45c 100644
--- a/erts/emulator/beam/macros.tab
+++ b/erts/emulator/beam/macros.tab
@@ -104,14 +104,136 @@ GC_TEST_PRESERVE(NeedHeap, Live, PreserveTerm) {
 
 
 // Make sure that there are NeedStack + NeedHeap + 1 words available
-// on the combined heap/stack segment, then allocates NeedHeap + 1
-// words on the stack and saves CP.
+// on the combined heap/stack segment, then decrement the stack
+// pointer by (NeedStack + 1) words. Finally clear the word reserved
+// for the continuation pointer at the top of the stack.
+//
+// Stack frame layout:
+//
+//       +-----------+
+// y(N)  | Term      |
+//       +-----------+
+//            .
+//            .
+//            .
+//       +-----------+
+// y(0)  | Term      |
+//       +-----------+
+// E ==> | NIL or CP |
+//       +-----------+
+//
+// When the function owning the stack frame is the currently executing
+// function, the word at the top of the stack is NIL. When calling
+// another function, the continuation pointer will be stored in the
+// word at the top of the stack. When returning to the function
+// owning the stack frame, the word at the stack top will again be set
+// to NIL.
+
 AH(NeedStack, NeedHeap, Live) {
     unsigned needed = $NeedStack + 1;
     $GC_TEST(needed, $NeedHeap, $Live);
     E -= needed;
-    *E = make_cp(c_p->cp);
-    c_p->cp = 0;
+    *E = NIL;
+}
+
+
+//
+// Helpers for call instructions
+//
+
+DISPATCH() {
+    BeamInstr dis_next;
+
+    dis_next = *I;
+    CHECK_ARGS(I);
+
+    if (FCALLS > 0 || FCALLS > neg_o_reds) {
+        FCALLS--;
+        Goto(dis_next);
+    } else {
+        goto context_switch;
+    }
+}
+
+DISPATCH_ABS(CallDest) {
+    SET_I((BeamInstr *) $CallDest);
+    DTRACE_LOCAL_CALL(c_p, erts_code_to_codemfa(I));
+
+    $DISPATCH();
+}
+
+DISPATCH_EXPORT(Export) {
+    BeamInstr dis_next;
+    Export *ep;
+
+    ep = (Export*)($Export);
+
+    DTRACE_GLOBAL_CALL_FROM_EXPORT(c_p, ep);
+
+    SET_I(ep->addressv[erts_active_code_ix()]);
+    CHECK_ARGS(I);
+    dis_next = *I;
+
+    if (ERTS_UNLIKELY(FCALLS <= 0)) {
+        if (ERTS_PROC_GET_SAVED_CALLS_BUF(c_p) && FCALLS > neg_o_reds) {
+            save_calls(c_p, ep);
+        } else {
+            goto context_switch;
+        }
+    }
+
+    FCALLS--;
+    Goto(dis_next);
+}
+
+DISPATCH_FUN(I) {
+    BeamInstr dis_next;
+
+    SET_I($I);
+
+    dis_next = *I;
+    CHECK_ARGS(I);
+
+    if (FCALLS > 0 || FCALLS > neg_o_reds) {
+        FCALLS--;
+        Goto(dis_next);
+    } else {
+        goto context_switch_fun;
+    }
+}
+
+DISPATCH_REL(CallDest) {
+    $SET_I_REL($CallDest);
+    DTRACE_LOCAL_CALL(c_p, erts_code_to_codemfa(I));
+
+    $DISPATCH();
+}
+
+DISPATCH_RETURN() {
+    if (FCALLS > 0 || FCALLS > neg_o_reds) {
+        FCALLS--;
+        Goto(*I);
+    } else {
+        c_p->current = NULL;
+        c_p->arity = 1;
+        goto context_switch3;
+    }
+}
+
+// Save the continuation pointer in the reserved slot at the
+// top of the stack as preparation for doing a function call.
+
+SAVE_CONTINUATION_POINTER(IP) {
+    ASSERT(VALID_INSTR(*($IP)));
+    *E = (BeamInstr) ($IP);
+}
+
+// Return to the function whose continuation pointer is stored
+// at the top of the stack and set that word to NIL.
+
+RETURN() {
+    SET_I(cp_val(*E));
+    *E = NIL;
 }
 
 NEXT0() {
diff --git a/erts/emulator/beam/ops.tab b/erts/emulator/beam/ops.tab
index b9d4f6afcc..1d336e4b7b 100644
--- a/erts/emulator/beam/ops.tab
+++ b/erts/emulator/beam/ops.tab
@@ -77,19 +77,32 @@ return
 # To ensure that a "move Src x(0)" instruction can be combined with
 # the following call instruction, we need to make sure that there is
 # no line/1 instruction between the move and the call.
-#
-# A tail-recursive call to an external function (BIF or non-BIF) will
-# never be saved on the stack, so there is no reason to keep the line
-# instruction.
+
+move S X0=x==0 | line Loc | call Ar Func => \
+     line Loc | move S X0 | call Ar Func
 
 move S X0=x==0 | line Loc | call_ext Ar Func => \
      line Loc | move S X0 | call_ext Ar Func
+
+#
+# A tail call will not refer to the current function on error unless it's a
+# BIF, so we can omit the line instruction for non-BIFs.
+#
+
+move S X0=x==0 | line Loc | call_ext_last Ar Func=u$is_bif D => \
+     line Loc | move S X0 | call_ext_last Ar Func D
+move S X0=x==0 | line Loc | call_ext_only Ar Func=u$is_bif => \
+     line Loc | move S X0 | call_ext_only Ar Func
+
 move S X0=x==0 | line Loc | call_ext_last Ar Func D => \
      move S X0 | call_ext_last Ar Func D
 move S X0=x==0 | line Loc | call_ext_only Ar Func => \
      move S X0 | call_ext_only Ar Func
-move S X0=x==0 | line Loc | call Ar Func => \
-     line Loc | move S X0 | call Ar Func
+
+move S X0=x==0 | line Loc | call_last Ar Func D => \
+     move S X0 | call_last Ar Func D
+move S X0=x==0 | line Loc | call_only Ar Func => \
+     move S X0 | call_only Ar Func
 
 line Loc | func_info M F A => func_info M F A | line Loc
 
@@ -324,76 +337,15 @@ move_src_window2 y x x
 move_src_window3 y x x x
 move_src_window4 y x x x x
 
-# Swap registers.
-move R1=xy Tmp=x | move R2=xy R1 | move Tmp R2 => swap_temp R1 R2 Tmp
-
-# The compiler uses x(1022) when swapping registers. It will definitely
-# not be used again.
-swap_temp R1 R2 Tmp=x==1022 => swap R1 R2
-
-swap_temp R1 R2 Tmp | move Src Tmp => swap R1 R2 | move Src Tmp
-
-swap_temp R1 R2 Tmp | line Loc | apply Live | is_killed_apply(Tmp, Live) => \
-  swap R1 R2 | line Loc | apply Live
-swap_temp R1 R2 Tmp | line Loc | apply_last Live D | is_killed_apply(Tmp, Live) => \
-  swap R1 R2 | line Loc | apply_last Live D
-
-swap_temp R1 R2 Tmp | line Loc | call_fun Live | is_killed_by_call_fun(Tmp, Live) => \
-  swap R1 R2 | line Loc | call_fun Live
-swap_temp R1 R2 Tmp | make_fun2 OldIndex=u | is_killed_by_make_fun(Tmp, OldIndex) => \
-  swap R1 R2 | make_fun2 OldIndex
-
-swap_temp R1 R2 Tmp | line Loc | call Live Addr | is_killed(Tmp, Live) => \
-  swap R1 R2 | line Loc | call Live Addr
-swap_temp R1 R2 Tmp | call_only Live Addr | \
-  is_killed(Tmp, Live) => swap R1 R2 | call_only Live Addr
-swap_temp R1 R2 Tmp | call_last Live Addr D | \
-  is_killed(Tmp, Live) => swap R1 R2 | call_last Live Addr D
-
-swap_temp R1 R2 Tmp | line Loc | call_ext Live Addr | is_killed(Tmp, Live) => \
-  swap R1 R2 | line Loc | call_ext Live Addr
-swap_temp R1 R2 Tmp | line Loc | call_ext_only Live Addr | \
-  is_killed(Tmp, Live) => swap R1 R2 | line Loc | call_ext_only Live Addr
-swap_temp R1 R2 Tmp | line Loc | call_ext_last Live Addr D | \
-  is_killed(Tmp, Live) => swap R1 R2 | line Loc | call_ext_last Live Addr D
-
-swap_temp R1 R2 Tmp | call_ext Live Addr | is_killed(Tmp, Live) => \
-  swap R1 R2 | call_ext Live Addr
-swap_temp R1 R2 Tmp | call_ext_only Live Addr | is_killed(Tmp, Live) => \
-  swap R1 R2 | call_ext_only Live Addr
-swap_temp R1 R2 Tmp | call_ext_last Live Addr D | is_killed(Tmp, Live) => \
-  swap R1 R2 | call_ext_last Live Addr D
-
-swap_temp R1 R2 Tmp | move Src Any | line Loc | call Live Addr | \
-  is_killed(Tmp, Live) | distinct(Tmp, Src) => \
-     swap R1 R2 | move Src Any | line Loc | call Live Addr
-swap_temp R1 R2 Tmp | move Src Any | line Loc | call_ext Live Addr | \
-  is_killed(Tmp, Live) | distinct(Tmp, Src) => \
-     swap R1 R2 | move Src Any | line Loc | call_ext Live Addr
-swap_temp R1 R2 Tmp | move Src Any | call_only Live Addr | \
-  is_killed(Tmp, Live) | distinct(Tmp, Src) => \
-    swap R1 R2 | move Src Any | call_only Live Addr
-swap_temp R1 R2 Tmp | move Src Any | line Loc | call_ext_only Live Addr | \
-  is_killed(Tmp, Live) | distinct(Tmp, Src) => \
-    swap R1 R2 | move Src Any | line Loc | call_ext_only Live Addr
-swap_temp R1 R2 Tmp | move Src Any | line Loc | call_fun Live | \
-  is_killed(Tmp, Live) | distinct(Tmp, Src) => \
-    swap R1 R2 | move Src Any | line Loc | call_fun Live
-
-swap_temp R1 R2 Tmp | line Loc | send | is_killed_by_send(Tmp) => \
-  swap R1 R2 | line Loc | send
-
-# swap_temp/3 with Y register operands are rare.
-swap_temp R1 R2=y Tmp => swap R1 R2 | move R2 Tmp
-swap_temp R1=y R2 Tmp => swap R1 R2 | move R2 Tmp
-
 swap R1=x R2=y => swap R2 R1
 
-swap_temp x x x
-
 swap xy x
 swap y y
 
+swap R1=x R2=x | swap R3=x R1 => swap2 R1 R2 R3
+
+swap2 x x x
+
 # move_shift
 
 move SD=x    D=x | move Src=cxy SD=x  | distinct(D, Src) => move_shift Src SD D
@@ -635,8 +587,8 @@ put_list s s d
 %cold
 normal_exit
 continue_exit
-apply_bif
-call_nif
+call_bif W
+call_nif W W W
 call_error_handler
 error_action_code
 return_trace
@@ -657,8 +609,20 @@ move S x==0 | deallocate D | return => move_deallocate_return S D
 
 move_deallocate_return xycn Q
 
+deallocate u==0 | return => deallocate_return0
+deallocate u==1 | return => deallocate_return1
+deallocate u==2 | return => deallocate_return2
+deallocate u==3 | return => deallocate_return3
+deallocate u==4 | return => deallocate_return4
+
 deallocate D | return => deallocate_return D
 
+deallocate_return0
+deallocate_return1
+deallocate_return2
+deallocate_return3
+deallocate_return4
+
 deallocate_return Q
 
 test_heap Need u==1 | put_list Y=y x==0 x==0 => test_heap_1_put_list Need Y
@@ -836,62 +800,22 @@ allocate_init t t? y
 # External function and bif calls.
 #################################################################
 
-#
-# The BIFs erts_internal:check_process_code/1 must be called like a function,
-# to ensure that c_p->i (program counter) is set correctly (an ordinary
-# BIF call doesn't set it).
-#
-
-call_ext u==1 Bif=u$bif:erts_internal:check_process_code/1 => i_call_ext Bif
-call_ext_last u==1 Bif=u$bif:erts_internal:check_process_code/1 D => i_call_ext_last Bif D
-call_ext_only u==1 Bif=u$bif:erts_internal:check_process_code/1 => i_call_ext_only Bif
+# Expands into call_light_bif(_only)/2
+call_light_bif/1
+call_light_bif_only/1
+call_light_bif_last/2
 
 #
-# The BIFs erts_internal:garbage_collect/1 must be called like a function,
-# to allow them to invoke the garbage collector. (The stack pointer must
-# be saved and p->arity must be zeroed, which is not done on ordinary BIF calls.)
-#
-call_ext u==1 Bif=u$bif:erts_internal:garbage_collect/1 => i_call_ext Bif
-call_ext_last u==1 Bif=u$bif:erts_internal:garbage_collect/1 D => i_call_ext_last Bif D
-call_ext_only u==1 Bif=u$bif:erts_internal:garbage_collect/1 => i_call_ext_only Bif
-
-#
-# put/2 and erase/1 must be able to do garbage collection, so we must call
-# them like functions.
-#
-
-call_ext u==2 Bif=u$bif:erlang:put/2 => i_call_ext Bif
-call_ext_last u==2 Bif=u$bif:erlang:put/2 D => i_call_ext_last Bif D
-call_ext_only u==2 Bif=u$bif:erlang:put/2 => i_call_ext_only Bif
-
-call_ext u==1 Bif=u$bif:erlang:erase/1 => i_call_ext Bif
-call_ext_last u==1 Bif=u$bif:erlang:erase/1 D => i_call_ext_last Bif D
-call_ext_only u==1 Bif=u$bif:erlang:erase/1 => i_call_ext_only Bif
-
-#
-# The process_info/1,2 BIF should be called like a function, to force
-# the emulator to set c_p->current before calling it (a BIF call doesn't
-# set it).
-#
-# In addition, we force the use of a non-tail-recursive call.  This will ensure
-# that c_p->cp points into the function making the call.
+# The load_nif/2 BIF is an instruction.
 #
 
-call_ext u==1 Bif=u$bif:erlang:process_info/1 => i_call_ext Bif
-call_ext_last u==1 Bif=u$bif:erlang:process_info/1 D => i_call_ext Bif | deallocate_return D
-call_ext_only Ar=u==1 Bif=u$bif:erlang:process_info/1 => allocate u Ar | i_call_ext Bif | deallocate_return u
-
-call_ext u==2 Bif=u$bif:erlang:process_info/2 => i_call_ext Bif
-call_ext_last u==2 Bif=u$bif:erlang:process_info/2 D => i_call_ext Bif | deallocate_return D
-call_ext_only Ar=u==2 Bif=u$bif:erlang:process_info/2 => allocate u Ar | i_call_ext Bif | deallocate_return u
-
-#
-# load_nif/2 also needs to know calling function like process_info
-#
-call_ext u==2 Bif=u$bif:erlang:load_nif/2 => i_call_ext Bif
-call_ext_last u==2 Bif=u$bif:erlang:load_nif/2 D => i_call_ext Bif | deallocate_return D
-call_ext_only Ar=u==2 Bif=u$bif:erlang:load_nif/2 => allocate u Ar | i_call_ext Bif | deallocate_return u
+call_ext u==2 u$func:erlang:load_nif/2 => i_load_nif
+call_ext_last u==2 u$func:erlang:load_nif/2 D => i_load_nif | deallocate_return D
+call_ext_only u==2 u$func:erlang:load_nif/2 => i_load_nif | return
 
+%cold
+i_load_nif
+%hot
 
 #
 # apply/2 is an instruction, not a BIF.
@@ -910,33 +834,6 @@ call_ext_last u==3 u$bif:erlang:apply/3 D => i_apply_last D
 call_ext_only u==3 u$bif:erlang:apply/3 => i_apply_only
 
 #
-# The exit/1 and throw/1 BIFs never execute the instruction following them;
-# thus there is no need to generate any return instruction.
-#
-
-call_ext_last u==1 Bif=u$bif:erlang:exit/1 D => call_bif Bif
-call_ext_last u==1 Bif=u$bif:erlang:throw/1 D => call_bif Bif
-
-call_ext_only u==1 Bif=u$bif:erlang:exit/1 => call_bif Bif
-call_ext_only u==1 Bif=u$bif:erlang:throw/1 => call_bif Bif
-
-#
-# The error/1 and error/2 BIFs never execute the instruction following them;
-# thus there is no need to generate any return instruction.
-# However, they generate stack backtraces, so if the call instruction
-# is call_ext_only/2 instruction, we explicitly do an allocate/2 to store
-# the continuation pointer on the stack.
-#
-
-call_ext_last u==1 Bif=u$bif:erlang:error/1 D => call_bif Bif
-call_ext_last u==2 Bif=u$bif:erlang:error/2 D => call_bif Bif
-
-call_ext_only Ar=u==1 Bif=u$bif:erlang:error/1 => \
-  allocate u Ar | call_bif Bif
-call_ext_only Ar=u==2 Bif=u$bif:erlang:error/2 => \
-  allocate u Ar | call_bif Bif
-
-#
 # The yield/0 BIF is an instruction
 #
 
@@ -1048,17 +945,24 @@ call_ext_only u==0 u$func:os:perf_counter/0 => \
     i_perf_counter | return
 
 #
-# The general case for BIFs that have no special instructions.
-# A BIF used in the tail must be followed by a return instruction.
+# BIFs like process_info/1,2 require up-to-date information about the current
+# emulator state, which the ordinary call_light_bif instruction doesn't save.
 #
-# To make trapping and stack backtraces work correctly, we make sure that
-# the continuation pointer is always stored on the stack.
 
-call_ext u Bif=u$is_bif => call_bif Bif
+call_ext u Bif=u$is_bif | is_heavy_bif(Bif) => \
+    i_call_ext Bif
+call_ext_last u Bif=u$is_bif D | is_heavy_bif(Bif) => \
+    i_call_ext Bif | deallocate_return D
+call_ext_only Ar=u Bif=u$is_bif | is_heavy_bif(Bif) => \
+    allocate u Ar | i_call_ext Bif | deallocate_return u
 
-call_ext_last u Bif=u$is_bif D => deallocate D | call_bif_only Bif
+#
+# The general case for BIFs that have no special requirements.
+#
 
-call_ext_only Ar=u Bif=u$is_bif => call_bif_only Bif
+call_ext u Bif=u$is_bif => call_light_bif Bif
+call_ext_last u Bif=u$is_bif D => call_light_bif_last Bif D
+call_ext_only Ar=u Bif=u$is_bif => call_light_bif_only Bif
 
 #
 # Any remaining calls are calls to Erlang functions, not BIFs.
@@ -1083,14 +987,32 @@ i_apply_fun
 i_apply_fun_last Q
 i_apply_fun_only
 
+#
+# When a BIF is traced, these instructions make a body call through the export
+# entry instead of calling the BIF directly (setting up a temporary stack frame
+# if needed). We therefore retain the stack frame in call_light_bif_last, and
+# add a deallocate_return after call_light_bif_only to remove the temporary
+# stack frame before returning.
+#
+
+call_light_bif Bif=u$is_bif => \
+    call_light_bif Bif Bif
+
+call_light_bif_last Bif=u$is_bif D => \
+    call_light_bif Bif Bif | deallocate_return D
+
+call_light_bif_only Bif=u$is_bif => \
+    call_light_bif_only Bif Bif | deallocate_return u
+
+call_light_bif b e
+call_light_bif_only b e
+
 %cold
-i_hibernate
 
+i_hibernate
 i_perf_counter
-%hot
 
-call_bif e
-call_bif_only e
+%hot
 
 #
 # Calls to non-building and guard BIFs.
diff --git a/erts/emulator/beam/register.c b/erts/emulator/beam/register.c
index c7e02c6d48..8e44b527a2 100644
--- a/erts/emulator/beam/register.c
+++ b/erts/emulator/beam/register.c
@@ -265,10 +265,8 @@ Eterm
 erts_whereis_name_to_id(Process *c_p, Eterm name)
 {
     Eterm res = am_undefined;
-    HashValue hval;
-    int ix;
-    HashBucket* b;
     ErtsProcLocks c_p_locks = 0;
+    RegProc *rp, tmpl;
     if (c_p) {
         c_p_locks = ERTS_PROC_LOCK_MAIN;
         ERTS_CHK_HAVE_ONLY_MAIN_PROC_LOCK(c_p);
@@ -278,29 +276,14 @@ erts_whereis_name_to_id(Process *c_p, Eterm name)
     if (c_p && !c_p_locks)
         erts_proc_lock(c_p, ERTS_PROC_LOCK_MAIN);
 
-    hval = REG_HASH(name);
-    ix = hval % process_reg.size;
-    b = process_reg.bucket[ix];
+    tmpl.name = name;
+    rp = hash_fetch(&process_reg, &tmpl, (H_FUN)reg_hash, (HCMP_FUN)reg_cmp);
 
-    /*
-     * Note: We have inlined the code from hash.c for speed.
-     */
-	
-    while (b) {
-	RegProc* rp = (RegProc *) b;
-	if (rp->name == name) {
-	    /*
-	     * SMP NOTE: No need to lock registered entity since it cannot
-	     * be removed without acquiring write reg lock and id on entity
-	     * is read only.
-	     */
-	    if (rp->p)
-		res = rp->p->common.id;
-	    else if (rp->pt)
-		res = rp->pt->common.id;
-	    break;
-	}
-	b = b->next;
+    if (rp) {
+        if (rp->p)
+            res = rp->p->common.id;
+        else if (rp->pt)
+            res = rp->pt->common.id;
     }
 
     reg_read_unlock();
@@ -321,10 +304,7 @@ erts_whereis_name(Process *c_p,
 		  Port** port,
                   int lock_port)
 {
-    RegProc* rp = NULL;
-    HashValue hval;
-    int ix;
-    HashBucket* b;
+    RegProc* rp = NULL, tmpl;
     ErtsProcLocks current_c_p_locks;
     Port *pending_port = NULL;
 
@@ -342,21 +322,8 @@ erts_whereis_name(Process *c_p,
      * - current_c_p_locks (either c_p_locks or 0) on c_p
      */
 
-    hval = REG_HASH(name);
-    ix = hval % process_reg.size;
-    b = process_reg.bucket[ix];
-
-    /*
-     * Note: We have inlined the code from hash.c for speed.
-     */
-
-    while (b) {
-	if (((RegProc *) b)->name == name) {
-	    rp = (RegProc *) b;
-	    break;
-	}
-	b = b->next;
-    }
+    tmpl.name = name;
+    rp = hash_fetch(&process_reg, &tmpl, (H_FUN)reg_hash, (HCMP_FUN)reg_cmp);
 
     if (proc) {
 	if (!rp)
@@ -564,18 +531,6 @@ int erts_unregister_name(Process *c_p,
     return res;
 }
 
-int process_reg_size(void)
-{
-    int size;
-    int lock = !ERTS_IS_CRASH_DUMPING;
-    if (lock)
-	reg_read_lock();
-    size = process_reg.size;
-    if (lock)
-	reg_read_unlock();
-    return size;
-}
-
 int process_reg_sz(void)
 {
     int sz;
@@ -592,15 +547,24 @@ int process_reg_sz(void)
 
 #include "bif.h"
 
+struct registered_foreach_arg {
+    Eterm res;
+    Eterm *hp;
+};
+
+static void
+registered_foreach(RegProc *reg, struct registered_foreach_arg *arg)
+{
+    arg->res = CONS(arg->hp, reg->name, arg->res);
+    arg->hp += 2;
+}
+
 /* return a list of the registered processes */
 
 BIF_RETTYPE registered_0(BIF_ALIST_0)
 {
-    int i;
-    Eterm res;
+    struct registered_foreach_arg arg;
     Uint need;
-    Eterm* hp;
-    HashBucket **bucket;
     ErtsProcLocks proc_locks = ERTS_PROC_LOCK_MAIN;
 
     ERTS_CHK_HAVE_ONLY_MAIN_PROC_LOCK(BIF_P);
@@ -608,41 +572,21 @@ BIF_RETTYPE registered_0(BIF_ALIST_0)
     if (!proc_locks)
 	erts_proc_lock(BIF_P, ERTS_PROC_LOCK_MAIN);
 
-    bucket = process_reg.bucket;
-
-    /* work out how much heap we need & maybe garb, by scanning through
-       the registered process table */
-    need = 0;
-    for (i = 0; i < process_reg.size; i++) {
-	HashBucket *b = bucket[i];
-	while (b != NULL) {
-	    need += 2;
-	    b = b->next;
-	}
-    }
+    /* work out how much heap we need */
+    need = process_reg.nobjs * 2;
 
     if (need == 0) {
 	reg_read_unlock();
 	BIF_RET(NIL);
     }
 
-    hp = HAlloc(BIF_P, need);
-     
-     /* scan through again and make the list */ 
-    res = NIL;
+    /* walk the registry table and build the result list */
+    arg.hp = HAlloc(BIF_P, need);
+    arg.res = NIL;
 
-    for (i = 0; i < process_reg.size; i++) {
-	HashBucket *b = bucket[i];
-	while (b != NULL) {
-	    RegProc *reg = (RegProc *) b;
-
-	    res = CONS(hp, reg->name, res);
-	    hp += 2;
-	    b = b->next;
-	}
-    }
+    hash_foreach(&process_reg, (HFOREACH_FUN)registered_foreach, &arg);
 
     reg_read_unlock();
 
-    BIF_RET(res);
+    BIF_RET(arg.res);
 }
diff --git a/erts/emulator/beam/register.h b/erts/emulator/beam/register.h
index 27a314ca78..c77bd03653 100644
--- a/erts/emulator/beam/register.h
+++ b/erts/emulator/beam/register.h
@@ -41,7 +41,6 @@ typedef struct reg_proc
     Eterm name;         /* Atom name */
 } RegProc;
 
-int process_reg_size(void);
 int process_reg_sz(void);
 void init_register_table(void);
 void register_info(fmtfn_t, void *);
diff --git a/erts/emulator/beam/sys.h b/erts/emulator/beam/sys.h
index c261c8e117..8a59b61b63 100644
--- a/erts/emulator/beam/sys.h
+++ b/erts/emulator/beam/sys.h
@@ -92,6 +92,12 @@
 #  define ERTS_GLB_INLINE_INCL_FUNC_DEF 0
 #endif
 
+#ifdef __GNUC__
+#  define ERTS_NOINLINE __attribute__((__noinline__))
+#else
+#  define ERTS_NOINLINE
+#endif
+
 #if defined(VALGRIND) && !defined(NO_FPE_SIGNALS)
 #  define NO_FPE_SIGNALS
 #endif
@@ -172,7 +178,8 @@ typedef ERTS_SYS_FD_TYPE ErtsSysFdType;
 #  define ERTS_UNLIKELY(BOOL) (BOOL)
 #endif
 
-#if ERTS_AT_LEAST_GCC_VSN__(2, 96, 0)
+/* AIX doesn't like this and claims section conflicts */
+#if ERTS_AT_LEAST_GCC_VSN__(2, 96, 0) && !defined(_AIX)
 #if (defined(__APPLE__) && defined(__MACH__)) || defined(__DARWIN__)
 #  define ERTS_WRITE_UNLIKELY(X) X __attribute__ ((section ("__DATA,ERTS_LOW_WRITE") ))
 #else
@@ -666,7 +673,16 @@ typedef struct preload {
  */
 typedef Eterm ErtsTracer;
 
-#include "erl_osenv.h"
+
+/*
+ * This structure contains the rb tree for the Erlang osenv copy;
+ * see erl_osenv.h for more details.
+ */
+typedef struct __erts_osenv_t {
+    struct __env_rbtnode_t *tree;
+    int variable_count;
+    int content_size;
+} erts_osenv_t;
 
 /*
  * This structure contains options to all built in drivers.
diff --git a/erts/emulator/beam/trace_instrs.tab b/erts/emulator/beam/trace_instrs.tab
index 3eee81c053..9f22587f96 100644
--- a/erts/emulator/beam/trace_instrs.tab
+++ b/erts/emulator/beam/trace_instrs.tab
@@ -20,16 +20,15 @@
 //
 
 return_trace() {
-    ErtsCodeMFA* mfa = (ErtsCodeMFA *)(E[0]);
+    ErtsCodeMFA* mfa = (ErtsCodeMFA *)(E[1]);
 
     SWAPOUT;		/* Needed for shared heap */
     ERTS_UNREQ_PROC_MAIN_LOCK(c_p);
-    erts_trace_return(c_p, mfa, r(0), ERTS_TRACER_FROM_ETERM(E+1)/* tracer */);
+    erts_trace_return(c_p, mfa, r(0), ERTS_TRACER_FROM_ETERM(E+2)/* tracer */);
     ERTS_REQ_PROC_MAIN_LOCK(c_p);
     SWAPIN;
-    c_p->cp = NULL;
-    SET_I((BeamInstr *) cp_val(E[2]));
     E += 3;
+    $RETURN();
     Goto(*I);
     //| -no_next
 }
@@ -45,13 +44,12 @@ i_generic_breakpoint() {
 }
 
 i_return_time_trace() {
-    BeamInstr *pc = (BeamInstr *) (UWord) E[0];
+    BeamInstr *pc = (BeamInstr *) (UWord) E[1];
     SWAPOUT;
     erts_trace_time_return(c_p, erts_code_to_codeinfo(pc));
     SWAPIN;
-    c_p->cp = NULL;
-    SET_I((BeamInstr *) cp_val(E[1]));
     E += 2;
+    $RETURN();
     Goto(*I);
     //| -no_next
 }
@@ -59,8 +57,10 @@ i_return_time_trace() {
 i_return_to_trace() {
     if (IS_TRACED_FL(c_p, F_TRACE_RETURN_TO)) {
         Uint *cpp = (Uint*) E;
+        while (is_not_CP(*cpp)) {
+            cpp++;
+        }
         for(;;) {
-            ASSERT(is_CP(*cpp));
             if (IsOpCode(*cp_val(*cpp), return_trace)) {
                 do
                     ++cpp;
@@ -80,9 +80,8 @@ i_return_to_trace() {
         ERTS_REQ_PROC_MAIN_LOCK(c_p);
         SWAPIN;
     }
-    c_p->cp = NULL;
-    SET_I((BeamInstr *) cp_val(E[0]));
     E += 1;
+    $RETURN();
     Goto(*I);
     //| -no_next
 }
diff --git a/erts/emulator/beam/utils.c b/erts/emulator/beam/utils.c
index 0bbae65e28..fb06d60768 100644
--- a/erts/emulator/beam/utils.c
+++ b/erts/emulator/beam/utils.c
@@ -66,7 +66,7 @@
 #undef M_MMAP_THRESHOLD
 #undef M_MMAP_MAX
 
-#if defined(__GLIBC__) && defined(HAVE_MALLOC_H)
+#if (defined(__GLIBC__) || defined(_AIX)) && defined(HAVE_MALLOC_H)
 #include <malloc.h>
 #endif
 
@@ -907,7 +907,7 @@ tail_recur:
 	    hash = hash * FUNNY_NUMBER10 + num_free;
 	    hash = hash*FUNNY_NUMBER1 +
 		(atom_tab(atom_val(funp->fe->module))->slot.bucket.hvalue);
-	    hash = hash*FUNNY_NUMBER2 + funp->fe->old_index;
+	    hash = hash*FUNNY_NUMBER2 + funp->fe->index;
 	    hash = hash*FUNNY_NUMBER2 + funp->fe->old_uniq;
 	    if (num_free > 0) {
 		if (num_free > 1) {
@@ -1069,54 +1069,237 @@ do {                               \
 
 #define HCONST 0x9e3779b9UL /* the golden ratio; an arbitrary value */
 
-static Uint32
-block_hash(byte *k, Uint length, Uint32 initval)
+typedef struct {
+    Uint32 a,b,c;
+} ErtsBlockHashHelperCtx;
+
+#define BLOCK_HASH_BYTES_PER_ITER 12
+
+/* The three functions below are always used together, but they are kept
+   as separate functions to make trapping and the handling of unaligned
+   binaries easier. Examples of how they are used can be found in
+   block_hash and make_hash2_helper. */
+static ERTS_INLINE
+void block_hash_setup(Uint32 initval,
+                      ErtsBlockHashHelperCtx* ctx /* out parameter */)
+{
+    ctx->a = ctx->b = HCONST;
+    ctx->c = initval;           /* the previous hash value */
+}
+
+static ERTS_INLINE
+void block_hash_buffer(byte *buf,
+                       Uint buf_length,
+                       ErtsBlockHashHelperCtx* ctx /* in/out parameter */)
 {
-   Uint32 a,b,c;
-   Uint len;
-
-   /* Set up the internal state */
-   len = length;
-   a = b = HCONST;
-   c = initval;           /* the previous hash value */
-
-   while (len >= 12)
-   {
-      a += (k[0] +((Uint32)k[1]<<8) +((Uint32)k[2]<<16) +((Uint32)k[3]<<24));
-      b += (k[4] +((Uint32)k[5]<<8) +((Uint32)k[6]<<16) +((Uint32)k[7]<<24));
-      c += (k[8] +((Uint32)k[9]<<8) +((Uint32)k[10]<<16)+((Uint32)k[11]<<24));
-      MIX(a,b,c);
-      k += 12; len -= 12;
-   }
-
-   c += length;
-   switch(len)              /* all the case statements fall through */
-   {
-   case 11: c+=((Uint32)k[10]<<24);
-   case 10: c+=((Uint32)k[9]<<16);
-   case 9 : c+=((Uint32)k[8]<<8);
-      /* the first byte of c is reserved for the length */
-   case 8 : b+=((Uint32)k[7]<<24);
-   case 7 : b+=((Uint32)k[6]<<16);
-   case 6 : b+=((Uint32)k[5]<<8);
-   case 5 : b+=k[4];
-   case 4 : a+=((Uint32)k[3]<<24);
-   case 3 : a+=((Uint32)k[2]<<16);
-   case 2 : a+=((Uint32)k[1]<<8);
-   case 1 : a+=k[0];
-     /* case 0: nothing left to add */
-   }
-   MIX(a,b,c);
-   return c;
+    Uint len = buf_length;
+    byte *k = buf;
+    ASSERT(buf_length % BLOCK_HASH_BYTES_PER_ITER == 0);
+    while (len >= BLOCK_HASH_BYTES_PER_ITER) {
+        ctx->a += (k[0] +((Uint32)k[1]<<8) +((Uint32)k[2]<<16) +((Uint32)k[3]<<24));
+        ctx->b += (k[4] +((Uint32)k[5]<<8) +((Uint32)k[6]<<16) +((Uint32)k[7]<<24));
+        ctx->c += (k[8] +((Uint32)k[9]<<8) +((Uint32)k[10]<<16)+((Uint32)k[11]<<24));
+        MIX(ctx->a,ctx->b,ctx->c);
+        k += BLOCK_HASH_BYTES_PER_ITER; len -= BLOCK_HASH_BYTES_PER_ITER;
+    }
 }
 
+static ERTS_INLINE
+Uint32 block_hash_final_bytes(byte *buf,
+                              Uint buf_length,
+                              Uint full_length,
+                              ErtsBlockHashHelperCtx* ctx)
+{
+    Uint len = buf_length;
+    byte *k = buf;
+    ctx->c += full_length;
+    switch(len)
+    { /* all the case statements fall through */      
+    case 11: ctx->c+=((Uint32)k[10]<<24);
+    case 10: ctx->c+=((Uint32)k[9]<<16);
+    case 9 : ctx->c+=((Uint32)k[8]<<8);
+    /* the first byte of c is reserved for the length */
+    case 8 : ctx->b+=((Uint32)k[7]<<24);
+    case 7 : ctx->b+=((Uint32)k[6]<<16);
+    case 6 : ctx->b+=((Uint32)k[5]<<8);
+    case 5 : ctx->b+=k[4];
+    case 4 : ctx->a+=((Uint32)k[3]<<24);
+    case 3 : ctx->a+=((Uint32)k[2]<<16);
+    case 2 : ctx->a+=((Uint32)k[1]<<8);
+    case 1 : ctx->a+=k[0];
+    /* case 0: nothing left to add */
+    }
+    MIX(ctx->a,ctx->b,ctx->c);
+    return ctx->c;
+}
+
+static
 Uint32
-make_hash2(Eterm term)
+block_hash(byte *block, Uint block_length, Uint32 initval)
 {
+    ErtsBlockHashHelperCtx ctx;
+    Uint no_bytes_not_in_loop =
+        (block_length % BLOCK_HASH_BYTES_PER_ITER);
+    Uint no_bytes_to_process_in_loop =
+        block_length - no_bytes_not_in_loop;
+    byte *final_bytes = block + no_bytes_to_process_in_loop;
+    block_hash_setup(initval, &ctx);
+    block_hash_buffer(block,
+                      no_bytes_to_process_in_loop,
+                      &ctx);
+    return block_hash_final_bytes(final_bytes,
+                                  no_bytes_not_in_loop,
+                                  block_length,
+                                  &ctx);
+}
+
+typedef enum {
+    tag_primary_list,
+    arityval_subtag,
+    hamt_subtag_head_flatmap,
+    map_subtag,
+    fun_subtag,
+    neg_big_subtag,
+    sub_binary_subtag_1,
+    sub_binary_subtag_2,
+    hash2_common_1,
+    hash2_common_2,
+    hash2_common_3,
+} ErtsMakeHash2TrapLocation; 
+
+typedef struct {
+    int c;
+    Uint32 sh;
+    Eterm* ptr;
+} ErtsMakeHash2Context_TAG_PRIMARY_LIST;
+
+typedef struct {
+    int i;
+    int arity;
+    Eterm* elem;
+} ErtsMakeHash2Context_ARITYVAL_SUBTAG;
+
+typedef struct {
+    Eterm *ks;
+    Eterm *vs;
+    int i;
+    Uint size;
+} ErtsMakeHash2Context_HAMT_SUBTAG_HEAD_FLATMAP;
+
+typedef struct {
+    Eterm* ptr;
+    int i;
+} ErtsMakeHash2Context_MAP_SUBTAG;
+
+typedef struct {
+    Uint num_free;
+    Eterm* bptr;
+} ErtsMakeHash2Context_FUN_SUBTAG;
+
+typedef struct {
+    Eterm* ptr;
+    Uint i;
+    Uint n;
+    Uint32 con;
+} ErtsMakeHash2Context_NEG_BIG_SUBTAG;
+
+typedef struct {
+    byte* bptr;
+    Uint sz;
+    Uint bitsize;
+    Uint bitoffs;
+    Uint no_bytes_processed;
+    ErtsBlockHashHelperCtx block_hash_ctx;
+    /* The following fields are only used when bitoffs != 0 */
+    byte* buf;
+    int done;
+
+} ErtsMakeHash2Context_SUB_BINARY_SUBTAG;
+
+typedef struct {
+    int dummy__; /* Empty structs are not supported on all platforms */
+} ErtsMakeHash2Context_EMPTY;
+
+typedef struct {
+    ErtsMakeHash2TrapLocation trap_location;
+    /* specific to the trap location: */
+    union {
+        ErtsMakeHash2Context_TAG_PRIMARY_LIST tag_primary_list;
+        ErtsMakeHash2Context_ARITYVAL_SUBTAG arityval_subtag;
+        ErtsMakeHash2Context_HAMT_SUBTAG_HEAD_FLATMAP hamt_subtag_head_flatmap;
+        ErtsMakeHash2Context_MAP_SUBTAG map_subtag;
+        ErtsMakeHash2Context_FUN_SUBTAG fun_subtag;
+        ErtsMakeHash2Context_NEG_BIG_SUBTAG neg_big_subtag;
+        ErtsMakeHash2Context_SUB_BINARY_SUBTAG sub_binary_subtag_1;
+        ErtsMakeHash2Context_SUB_BINARY_SUBTAG sub_binary_subtag_2;
+        ErtsMakeHash2Context_EMPTY hash2_common_1;
+        ErtsMakeHash2Context_EMPTY hash2_common_2;
+        ErtsMakeHash2Context_EMPTY hash2_common_3;
+    } trap_location_state;
+    /* same for all trap locations: */
+    Eterm term; 
     Uint32 hash;
     Uint32 hash_xor_pairs;
-    DeclareTmpHeapNoproc(tmp_big,2);
+    ErtsEStack stack;
+} ErtsMakeHash2Context;
+
+static int make_hash2_ctx_bin_dtor(Binary *context_bin) {
+    ErtsMakeHash2Context* context = ERTS_MAGIC_BIN_DATA(context_bin);
+    DESTROY_SAVED_ESTACK(&context->stack);
+    if (context->trap_location == sub_binary_subtag_2 &&
+        context->trap_location_state.sub_binary_subtag_2.buf != NULL) {
+        erts_free(ERTS_ALC_T_PHASH2_TRAP, context->trap_location_state.sub_binary_subtag_2.buf);
+    }
+    return 1;
+}
 
+/* hash2_save_trap_state is seldom called, so we want to avoid inlining it */
+static ERTS_NOINLINE
+Eterm hash2_save_trap_state(Eterm state_mref,
+                            Uint32 hash_xor_pairs,
+                            Uint32 hash,
+                            Process* p,
+                            Eterm term,
+                            Eterm* ESTK_DEF_STACK(s),
+                            ErtsEStack s,
+                            ErtsMakeHash2TrapLocation trap_location,
+                            void* trap_location_state_ptr,
+                            size_t trap_location_state_size) {
+    Binary* state_bin;
+    ErtsMakeHash2Context* context;
+    if (state_mref == THE_NON_VALUE) {
+        Eterm* hp;
+        state_bin = erts_create_magic_binary(sizeof(ErtsMakeHash2Context),
+                                             make_hash2_ctx_bin_dtor);
+        hp = HAlloc(p, ERTS_MAGIC_REF_THING_SIZE);
+        state_mref = erts_mk_magic_ref(&hp, &MSO(p), state_bin);
+    } else {
+        state_bin = erts_magic_ref2bin(state_mref);
+    }
+    context = ERTS_MAGIC_BIN_DATA(state_bin);
+    context->term = term;
+    context->hash = hash;
+    context->hash_xor_pairs = hash_xor_pairs;
+    ESTACK_SAVE(s, &context->stack);
+    context->trap_location = trap_location;
+    sys_memcpy(&context->trap_location_state,
+               trap_location_state_ptr,
+               trap_location_state_size);
+    erts_set_gc_state(p, 0);
+    BUMP_ALL_REDS(p);
+    return state_mref;
+}
+#undef NOINLINE_HASH2_SAVE_TRAP_STATE
+
+/* Writes back a magic reference to *state_mref_write_back when the
+   function traps */
+static ERTS_INLINE Uint32
+make_hash2_helper(Eterm term_param, const int can_trap, Eterm* state_mref_write_back, Process* p)
+{
+    static const Uint ITERATIONS_PER_RED = 64;
+    Uint32 hash;
+    Uint32 hash_xor_pairs;
+    Eterm term = term_param;
     ERTS_UNDEF(hash_xor_pairs, 0);
 
 /* (HCONST * {2, ..., 22}) mod 2^32 */
@@ -1168,12 +1351,63 @@ make_hash2(Eterm term)
 
 #define IS_SSMALL28(x) (((Uint) (((x) >> (28-1)) + 1)) < 2)
 
+#define NOT_SSMALL28_HASH(SMALL)                          \
+    do {                                                  \
+        Uint64 t;                                         \
+        Uint32 x, y;                                      \
+        Uint32 con;                                       \
+        if (SMALL < 0) {                                  \
+            con = HCONST_10;                              \
+            t = (Uint64)(SMALL * (-1));                   \
+        } else {                                          \
+            con = HCONST_11;                              \
+            t = SMALL;                                    \
+        }                                                 \
+        x = t & 0xffffffff;                               \
+        y = t >> 32;                                      \
+        UINT32_HASH_2(x, y, con);                         \
+    } while(0)
+    
 #ifdef ARCH_64
 #  define POINTER_HASH(Ptr, AConst) UINT32_HASH_2((Uint32)(UWord)(Ptr), (((UWord)(Ptr)) >> 32), AConst)
 #else
 #  define POINTER_HASH(Ptr, AConst) UINT32_HASH(Ptr, AConst)
 #endif
 
+#define TRAP_LOCATION_NO_RED(location_name)                             \
+    do {                                                                \
+        if(can_trap && iterations_until_trap <= 0) {                    \
+                *state_mref_write_back  =                               \
+                    hash2_save_trap_state(state_mref,                   \
+                                          hash_xor_pairs,               \
+                                          hash,                         \
+                                          p,                            \
+                                          term,                         \
+                                          ESTK_DEF_STACK(s),            \
+                                          s,                            \
+                                          location_name,                \
+                                          &ctx,                         \
+                                          sizeof(ctx));                 \
+                return 0;                                               \
+            L_##location_name:                                          \
+                ctx = context->trap_location_state. location_name;      \
+        }                                                               \
+    } while(0)
+
+#define TRAP_LOCATION(location_name)                            \
+    do {                                                        \
+        if (can_trap) {                                         \
+            iterations_until_trap--;                            \
+            TRAP_LOCATION_NO_RED(location_name);                \
+        }                                                       \
+    } while(0)
+
+#define TRAP_LOCATION_NO_CTX(location_name)                             \
+    do {                                                                \
+        ErtsMakeHash2Context_EMPTY ctx;                                 \
+        TRAP_LOCATION(location_name);                                   \
+    } while(0)
+    
     /* Optimization. Simple cases before declaration of estack. */
     if (primary_tag(term) == TAG_PRIMARY_IMMED1) {
 	switch (term & _TAG_IMMED1_MASK) {
@@ -1186,51 +1420,94 @@ make_hash2(Eterm term)
 	    break;
 	case _TAG_IMMED1_SMALL:
 	  {
-	      Sint x = signed_val(term);
-
-	      if (SMALL_BITS > 28 && !IS_SSMALL28(x)) {
-		  term = small_to_big(x, tmp_big);
-		  break;
+	      Sint small = signed_val(term);
+	      if (SMALL_BITS > 28 && !IS_SSMALL28(small)) {
+                  hash = 0;
+                  NOT_SSMALL28_HASH(small);
+                  return hash;
 	      }
 	      hash = 0;
-	      SINT32_HASH(x, HCONST);
+	      SINT32_HASH(small, HCONST);
 	      return hash;
 	  }
 	}
     };
     {
     Eterm tmp;
+    long max_iterations = 0;
+    long iterations_until_trap = 0;
+    Eterm state_mref = THE_NON_VALUE;
+    ErtsMakeHash2Context* context = NULL;
     DECLARE_ESTACK(s);
-
-    UseTmpHeapNoproc(2);
+    ESTACK_CHANGE_ALLOCATOR(s, ERTS_ALC_T_SAVED_ESTACK);
+    if(can_trap){
+#ifdef DEBUG
+        (void)ITERATIONS_PER_RED;
+        iterations_until_trap = max_iterations =
+            (1103515245 * (ERTS_BIF_REDS_LEFT(p)) + 12345)  % 227;
+#else
+        iterations_until_trap = max_iterations =
+            ITERATIONS_PER_RED * ERTS_BIF_REDS_LEFT(p);
+#endif
+    }
+    if (can_trap && is_internal_magic_ref(term)) {
+        Binary* state_bin;
+        state_mref = term;
+        state_bin = erts_magic_ref2bin(state_mref);
+        if (ERTS_MAGIC_BIN_DESTRUCTOR(state_bin) == make_hash2_ctx_bin_dtor) {
+            /* Restore state after a trap */
+            context = ERTS_MAGIC_BIN_DATA(state_bin);
+            term = context->term;
+            hash = context->hash;
+            hash_xor_pairs = context->hash_xor_pairs;
+            ESTACK_RESTORE(s, &context->stack);
+            ASSERT(p->flags & F_DISABLE_GC);
+            erts_set_gc_state(p, 1);
+            switch (context->trap_location) {
+            case hash2_common_3:           goto L_hash2_common_3;
+            case tag_primary_list:         goto L_tag_primary_list;
+            case arityval_subtag:          goto L_arityval_subtag;
+            case hamt_subtag_head_flatmap: goto L_hamt_subtag_head_flatmap;
+            case map_subtag:               goto L_map_subtag;
+            case fun_subtag:               goto L_fun_subtag;
+            case neg_big_subtag:           goto L_neg_big_subtag;
+            case sub_binary_subtag_1:      goto L_sub_binary_subtag_1;
+            case sub_binary_subtag_2:      goto L_sub_binary_subtag_2;
+            case hash2_common_1:           goto L_hash2_common_1;
+            case hash2_common_2:           goto L_hash2_common_2;
+            }
+        }
+    }
     hash = 0;
     for (;;) {
 	switch (primary_tag(term)) {
 	case TAG_PRIMARY_LIST:
 	{
-	    int c = 0;
-	    Uint32 sh = 0;
-	    Eterm* ptr = list_val(term);
-	    while (is_byte(*ptr)) {
+            ErtsMakeHash2Context_TAG_PRIMARY_LIST ctx = {
+                .c =  0,
+                .sh = 0,
+                .ptr = list_val(term)};
+	    while (is_byte(*ctx.ptr)) {
 		/* Optimization for strings. */
-		sh = (sh << 8) + unsigned_val(*ptr);
-		if (c == 3) {
-		    UINT32_HASH(sh, HCONST_4);
-		    c = sh = 0;
+		ctx.sh = (ctx.sh << 8) + unsigned_val(*ctx.ptr);
+		if (ctx.c == 3) {
+		    UINT32_HASH(ctx.sh, HCONST_4);
+		    ctx.c = ctx.sh = 0;
 		} else {
-		    c++;
+		    ctx.c++;
 		}
-		term = CDR(ptr);
+		term = CDR(ctx.ptr);
 		if (is_not_list(term))
 		    break;
-		ptr = list_val(term);
+		ctx.ptr = list_val(term);
+                TRAP_LOCATION(tag_primary_list);
 	    }
-	    if (c > 0)
-		UINT32_HASH(sh, HCONST_4);
+	    if (ctx.c > 0)
+		UINT32_HASH(ctx.sh, HCONST_4);
 	    if (is_list(term)) {
-		tmp = CDR(ptr);
+		tmp = CDR(ctx.ptr);
                 ESTACK_PUSH(s, tmp);
-		term = CAR(ptr);
+		term = CAR(ctx.ptr);
 	    }
 	}
 	break;
@@ -1241,34 +1518,39 @@ make_hash2(Eterm term)
 	    switch (hdr & _TAG_HEADER_MASK) {
 	    case ARITYVAL_SUBTAG:
 	    {
-		int i;
-		int arity = header_arity(hdr);
-		Eterm* elem = tuple_val(term);
-		UINT32_HASH(arity, HCONST_9);
-		if (arity == 0) /* Empty tuple */
+                ErtsMakeHash2Context_ARITYVAL_SUBTAG ctx = {
+                    .i =  0,
+                    .arity = header_arity(hdr),
+                    .elem = tuple_val(term)};
+		UINT32_HASH(ctx.arity, HCONST_9);
+		if (ctx.arity == 0) /* Empty tuple */
 		    goto hash2_common;
-		for (i = arity; ; i--) {
-		    term = elem[i];
-                    if (i == 1)
+		for (ctx.i = ctx.arity; ; ctx.i--) {
+		    term = ctx.elem[ctx.i];
+                    if (ctx.i == 1)
                         break;
                     ESTACK_PUSH(s, term);
+                    TRAP_LOCATION(arityval_subtag);
 		}
 	    }
 	    break;
             case MAP_SUBTAG:
             {
-                Eterm* ptr = boxed_val(term) + 1;
                 Uint size;
-                int i;
+                ErtsMakeHash2Context_MAP_SUBTAG ctx = {
+                    .ptr = boxed_val(term) + 1,
+                    .i = 0};
                 switch (hdr & _HEADER_MAP_SUBTAG_MASK) {
                 case HAMT_SUBTAG_HEAD_FLATMAP:
                 {
                     flatmap_t *mp = (flatmap_t *)flatmap_val(term);
-                    Eterm *ks = flatmap_get_keys(mp);
-                    Eterm *vs = flatmap_get_values(mp);
-                    size      = flatmap_get_size(mp);
-                    UINT32_HASH(size, HCONST_16);
-                    if (size == 0)
+                    ErtsMakeHash2Context_HAMT_SUBTAG_HEAD_FLATMAP ctx = {
+                        .ks = flatmap_get_keys(mp),
+                        .vs = flatmap_get_values(mp),
+                        .i = 0,
+                        .size = flatmap_get_size(mp)};
+                    UINT32_HASH(ctx.size, HCONST_16);
+                    if (ctx.size == 0)
                         goto hash2_common;
 
                     /* We want a portable hash function that is *independent* of
@@ -1281,17 +1563,18 @@ make_hash2(Eterm term)
                     ESTACK_PUSH(s, HASH_MAP_TAIL);
                     hash = 0;
                     hash_xor_pairs = 0;
-                    for (i = size - 1; i >= 0; i--) {
+                    for (ctx.i = ctx.size - 1; ctx.i >= 0; ctx.i--) {
                         ESTACK_PUSH(s, HASH_MAP_PAIR);
-                        ESTACK_PUSH(s, vs[i]);
-                        ESTACK_PUSH(s, ks[i]);
+                        ESTACK_PUSH(s, ctx.vs[ctx.i]);
+                        ESTACK_PUSH(s, ctx.ks[ctx.i]);
+                        TRAP_LOCATION(hamt_subtag_head_flatmap);
                     }
                     goto hash2_common;
                 }
 
                 case HAMT_SUBTAG_HEAD_ARRAY:
                 case HAMT_SUBTAG_HEAD_BITMAP:
-                    size = *ptr++;
+                    size = *ctx.ptr++;
                     UINT32_HASH(size, HCONST_16);
                     if (size == 0)
                         goto hash2_common;
@@ -1303,27 +1586,28 @@ make_hash2(Eterm term)
                 }
                 switch (hdr & _HEADER_MAP_SUBTAG_MASK) {
                 case HAMT_SUBTAG_HEAD_ARRAY:
-                    i = 16;
+                    ctx.i = 16;
                     break;
                 case HAMT_SUBTAG_HEAD_BITMAP:
                 case HAMT_SUBTAG_NODE_BITMAP:
-                    i = hashmap_bitcount(MAP_HEADER_VAL(hdr));
+                    ctx.i = hashmap_bitcount(MAP_HEADER_VAL(hdr));
                     break;
                 default:
                     erts_exit(ERTS_ERROR_EXIT, "bad header");
                 }
-                while (i) {
-                    if (is_list(*ptr)) {
-                        Eterm* cons = list_val(*ptr);
+                while (ctx.i) {
+                    if (is_list(*ctx.ptr)) {
+                        Eterm* cons = list_val(*ctx.ptr);
                         ESTACK_PUSH(s, HASH_MAP_PAIR);
                         ESTACK_PUSH(s, CDR(cons));
                         ESTACK_PUSH(s, CAR(cons));
                     }
                     else {
-                        ASSERT(is_boxed(*ptr));
-                        ESTACK_PUSH(s, *ptr);
+                        ASSERT(is_boxed(*ctx.ptr));
+                        ESTACK_PUSH(s, *ctx.ptr);
                     }
-                    i--; ptr++;
+                    ctx.i--; ctx.ptr++;
+                    TRAP_LOCATION(map_subtag);
                 }
                 goto hash2_common;
             }
@@ -1344,22 +1628,25 @@ make_hash2(Eterm term)
 	    case FUN_SUBTAG:
 	    {
 		ErlFunThing* funp = (ErlFunThing *) fun_val(term);
-		Uint num_free = funp->num_free;
+                ErtsMakeHash2Context_FUN_SUBTAG ctx = {
+                    .num_free = funp->num_free,
+                    .bptr = NULL};
 		UINT32_HASH_2
-		    (num_free,
+		    (ctx.num_free,
 		     atom_tab(atom_val(funp->fe->module))->slot.bucket.hvalue,
 		     HCONST);
 		UINT32_HASH_2
-		    (funp->fe->old_index, funp->fe->old_uniq, HCONST);
-		if (num_free == 0) {
+		    (funp->fe->index, funp->fe->old_uniq, HCONST);
+		if (ctx.num_free == 0) {
 		    goto hash2_common;
 		} else {
-		    Eterm* bptr = funp->env + num_free - 1;
-		    while (num_free-- > 1) {
-			term = *bptr--;
+		    ctx.bptr = funp->env + ctx.num_free - 1;
+		    while (ctx.num_free-- > 1) {
+			term = *ctx.bptr--;
 			ESTACK_PUSH(s, term);
+                        TRAP_LOCATION(fun_subtag);
 		    }
-		    term = *bptr;
+		    term = *ctx.bptr;
 		}
 	    }
 	    break;
@@ -1367,70 +1654,190 @@ make_hash2(Eterm term)
 	    case HEAP_BINARY_SUBTAG:
 	    case SUB_BINARY_SUBTAG:
 	    {
-		byte* bptr;
-		unsigned sz = binary_size(term);
+#define BYTE_BITS 8
+                ErtsMakeHash2Context_SUB_BINARY_SUBTAG ctx = {
+                    .bptr = 0,
+                    /* !!!!!!!!!!!!!!!!!!!! OBS !!!!!!!!!!!!!!!!!!!!
+                     *
+                     * The size is truncated to 32 bits on the line
+                     * below so that the code is compatible with old
+                     * versions of the code. This means that hash
+                     * values for binaries with a size greater than
+                     * 4GB do not take all bytes into consideration.
+                     *
+                     * !!!!!!!!!!!!!!!!!!!! OBS !!!!!!!!!!!!!!!!!!!!
+                     */ 
+                    .sz = (0xFFFFFFFF & binary_size(term)),
+                    .bitsize = 0,
+                    .bitoffs = 0,
+                    .no_bytes_processed = 0
+                };
 		Uint32 con = HCONST_13 + hash;
-		Uint bitoffs;
-		Uint bitsize;
-
-		ERTS_GET_BINARY_BYTES(term, bptr, bitoffs, bitsize);
-		if (sz == 0 && bitsize == 0) {
+                Uint iters_for_bin = MAX(1, ctx.sz / BLOCK_HASH_BYTES_PER_ITER);
+		ERTS_GET_BINARY_BYTES(term, ctx.bptr, ctx.bitoffs, ctx.bitsize);
+		if (ctx.sz == 0 && ctx.bitsize == 0) {
 		    hash = con;
-		} else {
-		    if (bitoffs == 0) {
-			hash = block_hash(bptr, sz, con);
-			if (bitsize > 0) {
-			    UINT32_HASH_2(bitsize, (bptr[sz] >> (8 - bitsize)),
-					  HCONST_15);
-			}
-		    } else {
-			byte* buf = (byte *) erts_alloc(ERTS_ALC_T_TMP,
-							sz + (bitsize != 0));
-			erts_copy_bits(bptr, bitoffs, 1, buf, 0, 1, sz*8+bitsize);
-			hash = block_hash(buf, sz, con);
-			if (bitsize > 0) {
-			    UINT32_HASH_2(bitsize, (buf[sz] >> (8 - bitsize)),
-					  HCONST_15);
-			}
-			erts_free(ERTS_ALC_T_TMP, (void *) buf);
-		    }
+		} else if (ctx.bitoffs == 0 &&
+                           (!can_trap ||
+                            (iterations_until_trap - iters_for_bin) > 0)) {
+                    /* No need to trap while hashing binary */
+                    if (can_trap) iterations_until_trap -= iters_for_bin;
+                    hash = block_hash(ctx.bptr, ctx.sz, con);
+                    if (ctx.bitsize > 0) {
+                        UINT32_HASH_2(ctx.bitsize,
+                                      (ctx.bptr[ctx.sz] >> (BYTE_BITS - ctx.bitsize)),
+                                      HCONST_15);
+                    }
+                } else if (ctx.bitoffs == 0) {
+                    /* Need to trap while hashing binary */
+                    ErtsBlockHashHelperCtx* block_hash_ctx = &ctx.block_hash_ctx;
+                    block_hash_setup(con, block_hash_ctx);
+                    do {
+                        Uint max_bytes_to_process =
+                            iterations_until_trap <= 0 ? BLOCK_HASH_BYTES_PER_ITER :
+                            iterations_until_trap * BLOCK_HASH_BYTES_PER_ITER;
+                        Uint bytes_left = ctx.sz - ctx.no_bytes_processed;
+                        Uint even_bytes_left =
+                            bytes_left - (bytes_left % BLOCK_HASH_BYTES_PER_ITER);
+                        Uint bytes_to_process =
+                            MIN(max_bytes_to_process, even_bytes_left);
+                        block_hash_buffer(&ctx.bptr[ctx.no_bytes_processed],
+                                          bytes_to_process,
+                                          block_hash_ctx);
+                        ctx.no_bytes_processed += bytes_to_process;
+                        iterations_until_trap -=
+                            MAX(1, bytes_to_process / BLOCK_HASH_BYTES_PER_ITER);
+                        TRAP_LOCATION_NO_RED(sub_binary_subtag_1);
+                        block_hash_ctx = &ctx.block_hash_ctx; /* Restore after trap */
+                    } while ((ctx.sz - ctx.no_bytes_processed) >=
+                             BLOCK_HASH_BYTES_PER_ITER);
+                    hash = block_hash_final_bytes(ctx.bptr +
+                                                  ctx.no_bytes_processed,
+                                                  ctx.sz - ctx.no_bytes_processed,
+                                                  ctx.sz,
+                                                  block_hash_ctx);
+                    if (ctx.bitsize > 0) {
+                        UINT32_HASH_2(ctx.bitsize,
+                                      (ctx.bptr[ctx.sz] >> (BYTE_BITS - ctx.bitsize)),
+                                      HCONST_15);
+                    }
+                } else if (/* ctx.bitoffs != 0 && */
+                           (!can_trap ||
+                            (iterations_until_trap - iters_for_bin) > 0)) {
+                    /* No need to trap while hashing binary */
+                    Uint nr_of_bytes = ctx.sz + (ctx.bitsize != 0);
+                    byte *buf = erts_alloc(ERTS_ALC_T_TMP, nr_of_bytes);
+                    Uint nr_of_bits_to_copy = ctx.sz*BYTE_BITS+ctx.bitsize;
+                    if (can_trap) iterations_until_trap -= iters_for_bin;
+                    erts_copy_bits(ctx.bptr,
+                                   ctx.bitoffs, 1, buf, 0, 1, nr_of_bits_to_copy);
+                    hash = block_hash(buf, ctx.sz, con);
+                    if (ctx.bitsize > 0) {
+                        UINT32_HASH_2(ctx.bitsize,
+                                      (buf[ctx.sz] >> (BYTE_BITS - ctx.bitsize)),
+                                      HCONST_15);
+                    }
+                    erts_free(ERTS_ALC_T_TMP, buf);
+                } else /* ctx.bitoffs != 0 && */ {
+#ifdef DEBUG
+#define BINARY_BUF_SIZE (BLOCK_HASH_BYTES_PER_ITER * 3)
+#else
+#define BINARY_BUF_SIZE (BLOCK_HASH_BYTES_PER_ITER * 256)
+#endif
+#define BINARY_BUF_SIZE_BITS (BINARY_BUF_SIZE*BYTE_BITS)
+                    /* Need to trap while hashing binary */
+                    ErtsBlockHashHelperCtx* block_hash_ctx = &ctx.block_hash_ctx;
+                    Uint nr_of_bytes = ctx.sz + (ctx.bitsize != 0);
+                    ERTS_CT_ASSERT(BINARY_BUF_SIZE % BLOCK_HASH_BYTES_PER_ITER == 0);
+                    ctx.buf = erts_alloc(ERTS_ALC_T_PHASH2_TRAP,
+                                         MIN(nr_of_bytes, BINARY_BUF_SIZE));
+                    block_hash_setup(con, block_hash_ctx);
+                    do {
+                        Uint bytes_left =
+                            ctx.sz - ctx.no_bytes_processed;
+                        Uint even_bytes_left =
+                            bytes_left - (bytes_left % BLOCK_HASH_BYTES_PER_ITER);
+                        Uint bytes_to_process =
+                            MIN(BINARY_BUF_SIZE, even_bytes_left);
+                        Uint nr_of_bits_left =
+                            (ctx.sz*BYTE_BITS+ctx.bitsize) -
+                            ctx.no_bytes_processed*BYTE_BITS; 
+                        Uint nr_of_bits_to_copy =
+                            MIN(nr_of_bits_left, BINARY_BUF_SIZE_BITS);
+                        ctx.done = nr_of_bits_left == nr_of_bits_to_copy;
+                        erts_copy_bits(ctx.bptr + ctx.no_bytes_processed,
+                                       ctx.bitoffs, 1, ctx.buf, 0, 1,
+                                       nr_of_bits_to_copy);
+                        block_hash_buffer(ctx.buf,
+                                          bytes_to_process,
+                                          block_hash_ctx);
+                        ctx.no_bytes_processed += bytes_to_process;
+                        iterations_until_trap -=
+                            MAX(1, bytes_to_process / BLOCK_HASH_BYTES_PER_ITER);
+                        TRAP_LOCATION_NO_RED(sub_binary_subtag_2);
+                        block_hash_ctx = &ctx.block_hash_ctx; /* Restore after trap */
+                    } while (!ctx.done);
+                    nr_of_bytes = ctx.sz + (ctx.bitsize != 0);
+                    hash = block_hash_final_bytes(ctx.buf +
+                                                  (ctx.no_bytes_processed -
+                                                   ((nr_of_bytes-1) / BINARY_BUF_SIZE) *  BINARY_BUF_SIZE),
+                                                  ctx.sz - ctx.no_bytes_processed,
+                                                  ctx.sz,
+                                                  block_hash_ctx);
+                    if (ctx.bitsize > 0) {
+                        Uint last_byte_index =
+                            nr_of_bytes - (((nr_of_bytes-1) / BINARY_BUF_SIZE) *  BINARY_BUF_SIZE) -1;
+                        UINT32_HASH_2(ctx.bitsize,
+                                      (ctx.buf[last_byte_index] >> (BYTE_BITS - ctx.bitsize)),
+                                      HCONST_15);
+                    }
+                    erts_free(ERTS_ALC_T_PHASH2_TRAP, ctx.buf);
+                    context->trap_location_state.sub_binary_subtag_2.buf = NULL;
 		}
 		goto hash2_common;
+#undef BYTE_BITS
+#undef BINARY_BUF_SIZE
+#undef BINARY_BUF_SIZE_BITS
 	    }
 	    break;
 	    case POS_BIG_SUBTAG:
 	    case NEG_BIG_SUBTAG:
 	    {
-		Eterm* ptr = big_val(term);
-		Uint i = 0;
-		Uint n = BIG_SIZE(ptr);
-		Uint32 con = BIG_SIGN(ptr) ? HCONST_10 : HCONST_11;
+		Eterm* big_val_ptr = big_val(term);
+                ErtsMakeHash2Context_NEG_BIG_SUBTAG ctx = {
+                    .ptr = big_val_ptr,
+                    .i = 0,
+                    .n = BIG_SIZE(big_val_ptr),
+                    .con = BIG_SIGN(big_val_ptr) ? HCONST_10 : HCONST_11};
 #if D_EXP == 16
 		do {
 		    Uint32 x, y;
-		    x = i < n ? BIG_DIGIT(ptr, i++) : 0;
-		    x += (Uint32)(i < n ? BIG_DIGIT(ptr, i++) : 0) << 16;
-		    y = i < n ? BIG_DIGIT(ptr, i++) : 0;
-		    y += (Uint32)(i < n ? BIG_DIGIT(ptr, i++) : 0) << 16;
-		    UINT32_HASH_2(x, y, con);
-		} while (i < n);
+		    x = ctx.i < ctx.n ? BIG_DIGIT(ctx.ptr, ctx.i++) : 0;
+		    x += (Uint32)(ctx.i < ctx.n ? BIG_DIGIT(ctx.ptr, ctx.i++) : 0) << 16;
+		    y = ctx.i < ctx.n ? BIG_DIGIT(ctx.ptr, ctx.i++) : 0;
+		    y += (Uint32)(ctx.i < ctx.n ? BIG_DIGIT(ctx.ptr, ctx.i++) : 0) << 16;
+		    UINT32_HASH_2(x, y, ctx.con);
+                    TRAP_LOCATION(neg_big_subtag);
+		} while (ctx.i < ctx.n);
 #elif D_EXP == 32
 		do {
 		    Uint32 x, y;
-		    x = i < n ? BIG_DIGIT(ptr, i++) : 0;
-		    y = i < n ? BIG_DIGIT(ptr, i++) : 0;
-		    UINT32_HASH_2(x, y, con);
-		} while (i < n);
+		    x = ctx.i < ctx.n ? BIG_DIGIT(ctx.ptr, ctx.i++) : 0;
+		    y = ctx.i < ctx.n ? BIG_DIGIT(ctx.ptr, ctx.i++) : 0;
+		    UINT32_HASH_2(x, y, ctx.con);
+                    TRAP_LOCATION(neg_big_subtag);
+		} while (ctx.i < ctx.n);
 #elif D_EXP == 64
 		do {
 		    Uint t;
 		    Uint32 x, y;
-                    ASSERT(i < n);
-		    t = BIG_DIGIT(ptr, i++);
+                    ASSERT(ctx.i < ctx.n);
+		    t = BIG_DIGIT(ctx.ptr, ctx.i++);
 		    x = t & 0xffffffff;
 		    y = t >> 32;
-		    UINT32_HASH_2(x, y, con);
-		} while (i < n);
+		    UINT32_HASH_2(x, y, ctx.con);
+                    TRAP_LOCATION(neg_big_subtag);
+		} while (ctx.i < ctx.n);
 #else
 #error "unsupported D_EXP size"
 #endif
@@ -1508,13 +1915,13 @@ make_hash2(Eterm term)
 		}
 	    case _TAG_IMMED1_SMALL:
 	      {
-		  Sint x = signed_val(term);
+		  Sint small = signed_val(term);
+		  if (SMALL_BITS > 28 && !IS_SSMALL28(small)) {
+                      NOT_SSMALL28_HASH(small);
+		  } else {
+		      SINT32_HASH(small, HCONST);
+                  }
 
-		  if (SMALL_BITS > 28 && !IS_SSMALL28(x)) {
-		      term = small_to_big(x, tmp_big);
-		      break;
-		  }
-		  SINT32_HASH(x, HCONST);
 		  goto hash2_common;
 	      }
 	    }
@@ -1529,7 +1936,10 @@ make_hash2(Eterm term)
 
 	    if (ESTACK_ISEMPTY(s)) {
 		DESTROY_ESTACK(s);
-		UnUseTmpHeapNoproc(2);
+                if (can_trap) {
+                    BUMP_REDS(p, (max_iterations - iterations_until_trap) / ITERATIONS_PER_RED);
+                    ASSERT(!(p->flags & F_DISABLE_GC));
+                }
 		return hash;
 	    }
 
@@ -1540,18 +1950,37 @@ make_hash2(Eterm term)
 		    hash = (Uint32) ESTACK_POP(s);
                     UINT32_HASH(hash_xor_pairs, HCONST_19);
 		    hash_xor_pairs = (Uint32) ESTACK_POP(s);
+                    TRAP_LOCATION_NO_CTX(hash2_common_1);
 		    goto hash2_common;
 		}
 		case HASH_MAP_PAIR:
 		    hash_xor_pairs ^= hash;
                     hash = 0;
+                    TRAP_LOCATION_NO_CTX(hash2_common_2);
 		    goto hash2_common;
 		default:
 		    break;
 	    }
+
 	}
+        TRAP_LOCATION_NO_CTX(hash2_common_3);
     }
     }
+#undef TRAP_LOCATION_NO_RED
+#undef TRAP_LOCATION
+#undef TRAP_LOCATION_NO_CTX
+}
+
+Uint32
+make_hash2(Eterm term)
+{   /* Hash `term` by delegating to make_hash2_helper with 0 as the second argument and NULL for both pointer arguments. */
+    return make_hash2_helper(term, 0, NULL, NULL);
+}   /* NOTE(review): the 0 is presumably the helper's can_trap flag (helper signature is not visible here) -- confirm. */
+
+Uint32
+trapping_make_hash2(Eterm term, Eterm* state_mref_write_back, Process* p)
+{   /* Hash `term` by delegating to make_hash2_helper with 1 as the second argument, forwarding state_mref_write_back and p unchanged. */
+    return make_hash2_helper(term, 1, state_mref_write_back, p);
+}   /* NOTE(review): the 1 presumably enables the helper's can_trap path, which uses `p`; how callers should treat the return value when a trap occurs is not visible here -- confirm. */
 
 /* Term hash function for internal use.
@@ -1731,7 +2160,7 @@ make_internal_hash(Eterm term, Uint32 salt)
 		ErlFunThing* funp = (ErlFunThing *) fun_val(term);
 		Uint num_free = funp->num_free;
                 UINT32_HASH_2(num_free, funp->fe->module, HCONST_20);
-                UINT32_HASH_2(funp->fe->old_index, funp->fe->old_uniq, HCONST_21);
+                UINT32_HASH_2(funp->fe->index, funp->fe->old_uniq, HCONST_21);
 		if (num_free == 0) {
 		    goto pop_next;
 		} else {
@@ -2381,7 +2810,7 @@ tailrecur_ne:
 		    f1 = (ErlFunThing *) fun_val(a);
 		    f2 = (ErlFunThing *) fun_val(b);
 		    if (f1->fe->module != f2->fe->module ||
-			f1->fe->old_index != f2->fe->old_index ||
+			f1->fe->index != f2->fe->index ||
 			f1->fe->old_uniq != f2->fe->old_uniq ||
 			f1->num_free != f2->num_free) {
 			goto not_equal;
@@ -2976,7 +3405,7 @@ tailrecur_ne:
 		    if (diff != 0) {
 			RETURN_NEQ(diff);
 		    }
-		    diff = f1->fe->old_index - f2->fe->old_index;
+		    diff = f1->fe->index - f2->fe->index;
 		    if (diff != 0) {
 			RETURN_NEQ(diff);
 		    }