Diffstat (limited to 'erts/emulator/beam/jit/x86/instr_bif.cpp')
-rw-r--r-- | erts/emulator/beam/jit/x86/instr_bif.cpp | 619
1 file changed, 344 insertions, 275 deletions
diff --git a/erts/emulator/beam/jit/x86/instr_bif.cpp b/erts/emulator/beam/jit/x86/instr_bif.cpp index bc390b0fa8..4c2d4f007e 100644 --- a/erts/emulator/beam/jit/x86/instr_bif.cpp +++ b/erts/emulator/beam/jit/x86/instr_bif.cpp @@ -1,7 +1,7 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2020-2022. All Rights Reserved. + * Copyright Ericsson AB 2020-2023. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,6 +34,7 @@ extern "C" * * Result is returned in RET, error is indicated by ZF. */ void BeamGlobalAssembler::emit_i_bif_guard_shared() { + emit_enter_frame(); emit_enter_runtime<Update::eReductions>(); a.mov(ARG1, c_p); @@ -42,6 +43,7 @@ void BeamGlobalAssembler::emit_i_bif_guard_shared() { runtime_call(ARG4, 3); emit_leave_runtime<Update::eReductions>(); + emit_leave_frame(); emit_test_the_non_value(RET); a.ret(); @@ -53,6 +55,7 @@ void BeamGlobalAssembler::emit_i_bif_guard_shared() { void BeamGlobalAssembler::emit_i_bif_body_shared() { Label error = a.newLabel(); + emit_enter_frame(); emit_enter_runtime<Update::eReductions>(); /* Save current BIF and argument vector for the error path. */ @@ -68,6 +71,7 @@ void BeamGlobalAssembler::emit_i_bif_body_shared() { a.short_().je(error); emit_leave_runtime<Update::eReductions>(); + emit_leave_frame(); a.ret(); @@ -87,14 +91,15 @@ void BeamGlobalAssembler::emit_i_bif_body_shared() { runtime_call<1>(ubif2mfa); emit_leave_runtime<Update::eReductions>(); + emit_leave_frame(); a.mov(ARG4, RET); - a.jmp(labels[handle_error_shared_prologue]); + a.jmp(labels[raise_exception]); } } void BeamModuleAssembler::emit_setup_guard_bif(const std::vector<ArgVal> &args, - const ArgVal &bif) { + const ArgWord &bif) { bool is_contiguous_mem = false; ASSERT(args.size() > 0 && args.size() <= 3); @@ -102,12 +107,12 @@ void BeamModuleAssembler::emit_setup_guard_bif(const std::vector<ArgVal> &args, /* If the guard BIF's arguments are in memory and continuous, for example * `map_get(x0, x1)`, then we can pass the address of the first argument * instead of filling in the argument vector. 
*/ - is_contiguous_mem = args.size() && args[0].isMem(); + is_contiguous_mem = args.size() && args[0].isRegister(); for (size_t i = 1; i < args.size() && is_contiguous_mem; i++) { - const ArgVal &curr = args[i], &prev = args[i - 1]; + const ArgSource &curr = args[i], &prev = args[i - 1]; - is_contiguous_mem = curr.getType() == prev.getType() && - curr.getValue() == prev.getValue() + 1; + is_contiguous_mem = + ArgVal::memory_relation(prev, curr) == ArgVal::consecutive; } if (is_contiguous_mem) { @@ -123,15 +128,15 @@ void BeamModuleAssembler::emit_setup_guard_bif(const std::vector<ArgVal> &args, mov_arg(ARG4, bif); } -void BeamModuleAssembler::emit_i_bif1(const ArgVal &Src1, - const ArgVal &Fail, - const ArgVal &Bif, - const ArgVal &Dst) { +void BeamModuleAssembler::emit_i_bif1(const ArgSource &Src1, + const ArgLabel &Fail, + const ArgWord &Bif, + const ArgRegister &Dst) { emit_setup_guard_bif({Src1}, Bif); - if (Fail.getValue() != 0) { + if (Fail.get() != 0) { safe_fragment_call(ga->get_i_bif_guard_shared()); - a.je(labels[Fail.getValue()]); + a.je(resolve_beam_label(Fail)); } else { safe_fragment_call(ga->get_i_bif_body_shared()); } @@ -139,16 +144,16 @@ void BeamModuleAssembler::emit_i_bif1(const ArgVal &Src1, mov_arg(Dst, RET); } -void BeamModuleAssembler::emit_i_bif2(const ArgVal &Src1, - const ArgVal &Src2, - const ArgVal &Fail, - const ArgVal &Bif, - const ArgVal &Dst) { +void BeamModuleAssembler::emit_i_bif2(const ArgSource &Src1, + const ArgSource &Src2, + const ArgLabel &Fail, + const ArgWord &Bif, + const ArgRegister &Dst) { emit_setup_guard_bif({Src1, Src2}, Bif); - if (Fail.getValue() != 0) { + if (Fail.get() != 0) { safe_fragment_call(ga->get_i_bif_guard_shared()); - a.je(labels[Fail.getValue()]); + a.je(resolve_beam_label(Fail)); } else { safe_fragment_call(ga->get_i_bif_body_shared()); } @@ -156,17 +161,17 @@ void BeamModuleAssembler::emit_i_bif2(const ArgVal &Src1, mov_arg(Dst, RET); } -void BeamModuleAssembler::emit_i_bif3(const ArgVal &Src1, - const ArgVal &Src2, - const ArgVal &Src3, - const ArgVal &Fail, - const ArgVal &Bif, - const ArgVal &Dst) { +void BeamModuleAssembler::emit_i_bif3(const ArgSource &Src1, + const ArgSource &Src2, + const ArgSource &Src3, + const ArgLabel &Fail, + const ArgWord &Bif, + const ArgRegister &Dst) { emit_setup_guard_bif({Src1, Src2, Src3}, Bif); - if (Fail.getValue() != 0) { + if (Fail.get() != 0) { safe_fragment_call(ga->get_i_bif_guard_shared()); - a.je(labels[Fail.getValue()]); + a.je(resolve_beam_label(Fail)); } else { safe_fragment_call(ga->get_i_bif_body_shared()); } @@ -180,34 +185,34 @@ void BeamModuleAssembler::emit_i_bif3(const ArgVal &Src1, * to align the call targeting the shared fragment. 
*/ -void BeamModuleAssembler::emit_nofail_bif1(const ArgVal &Src1, - const ArgVal &Bif, - const ArgVal &Dst) { +void BeamModuleAssembler::emit_nofail_bif1(const ArgSource &Src1, + const ArgWord &Bif, + const ArgRegister &Dst) { emit_setup_guard_bif({Src1}, Bif); safe_fragment_call(ga->get_i_bif_guard_shared()); mov_arg(Dst, RET); } -void BeamModuleAssembler::emit_nofail_bif2(const ArgVal &Src1, - const ArgVal &Src2, - const ArgVal &Bif, - const ArgVal &Dst) { +void BeamModuleAssembler::emit_nofail_bif2(const ArgSource &Src1, + const ArgSource &Src2, + const ArgWord &Bif, + const ArgRegister &Dst) { emit_setup_guard_bif({Src1, Src2}, Bif); safe_fragment_call(ga->get_i_bif_guard_shared()); mov_arg(Dst, RET); } -void BeamModuleAssembler::emit_i_length_setup(const ArgVal &Fail, - const ArgVal &Live, - const ArgVal &Src) { +void BeamModuleAssembler::emit_i_length_setup(const ArgLabel &Fail, + const ArgWord &Live, + const ArgSource &Src) { x86::Mem trap_state; /* Store trap state after the currently live registers. There's an extra 3 * registers beyond the ordinary ones that we're free to use for whatever * purpose. */ ERTS_CT_ASSERT(ERTS_X_REGS_ALLOCATED - MAX_REG >= 3); - ASSERT(Live.getValue() <= MAX_REG); - trap_state = getXRef(Live.getValue()); + ASSERT(Live.get() <= MAX_REG); + trap_state = getXRef(Live.get()); /* Remainder of the list. */ mov_arg(trap_state, Src); @@ -217,7 +222,7 @@ void BeamModuleAssembler::emit_i_length_setup(const ArgVal &Fail, /* Original argument. This is only needed for exceptions and can be safely * skipped in guards. */ - if (Fail.getValue() == 0) { + if (Fail.get() == 0) { x86::Mem original_argument; original_argument = trap_state.cloneAdjusted(2 * sizeof(Eterm)); @@ -238,6 +243,8 @@ x86::Mem BeamGlobalAssembler::emit_i_length_common(Label fail, int state_size) { trap_state = getXRef(0); trap_state.setIndex(ARG2, 3); + emit_enter_frame(); + /* Save arguments for error/trapping path. */ a.mov(TMP_MEM1q, ARG2); a.mov(TMP_MEM2q, ARG3); @@ -249,6 +256,7 @@ x86::Mem BeamGlobalAssembler::emit_i_length_common(Label fail, int state_size) { runtime_call<2>(erts_trapping_length_1); emit_leave_runtime<Update::eReductions>(); + emit_leave_frame(); emit_test_the_non_value(RET); a.short_().je(trap); @@ -268,7 +276,7 @@ x86::Mem BeamGlobalAssembler::emit_i_length_common(Label fail, int state_size) { a.add(ARG2, imm(state_size)); /* We'll find our way back through the entry address (ARG3). */ - emit_discard_cp(); + a.add(x86::rsp, imm(sizeof(UWord))); a.mov(x86::qword_ptr(c_p, offsetof(Process, current)), imm(0)); a.mov(x86::qword_ptr(c_p, offsetof(Process, arity)), ARG2); @@ -298,7 +306,7 @@ void BeamGlobalAssembler::emit_i_length_body_shared() { a.mov(getXRef(0), ARG1); a.mov(ARG4, imm(&bif_mfa)); - emit_handle_error(); + a.jmp(labels[raise_exception]); } } @@ -312,14 +320,14 @@ void BeamGlobalAssembler::emit_i_length_guard_shared() { a.bind(error); { - mov_imm(RET, 0); + a.sub(RET, RET); a.ret(); } } -void BeamModuleAssembler::emit_i_length(const ArgVal &Fail, - const ArgVal &Live, - const ArgVal &Dst) { +void BeamModuleAssembler::emit_i_length(const ArgLabel &Fail, + const ArgWord &Live, + const ArgRegister &Dst) { Label entry = a.newLabel(); align_erlang_cp(); @@ -328,13 +336,13 @@ void BeamModuleAssembler::emit_i_length(const ArgVal &Fail, mov_arg(ARG2, Live); a.lea(ARG3, x86::qword_ptr(entry)); - if (Fail.getValue() != 0) { + if (Fail.get() != 0) { /* The return address is discarded when yielding, so it doesn't need to * be aligned. 
*/ safe_fragment_call(ga->get_i_length_guard_shared()); - a.je(labels[Fail.getValue()]); + a.je(resolve_beam_label(Fail)); } else { - fragment_call(ga->get_i_length_body_shared()); + safe_fragment_call(ga->get_i_length_body_shared()); } mov_arg(Dst, RET); @@ -375,31 +383,35 @@ static Eterm debug_call_light_bif(Process *c_p, * RET = BIF pointer */ void BeamGlobalAssembler::emit_call_light_bif_shared() { - /* We use the HTOP and FCALLS registers as they are - not used on the runtime-stack and are caller save. */ + x86::Mem entry_mem = TMP_MEM1q, export_mem = TMP_MEM2q, + mbuf_mem = TMP_MEM3q; + + Label trace = a.newLabel(), yield = a.newLabel(); - x86::Gp I = HTOP, exp = FCALLS; + emit_enter_frame(); - Label error = a.newLabel(), trace = a.newLabel(), trap = a.newLabel(), - yield = a.newLabel(), call_save_calls = a.newLabel(), - call_bif = a.newLabel(), gc_after_bif_call = a.newLabel(), - check_bif_return = a.newLabel(); + /* Spill everything we may need on the error and GC paths. */ + a.mov(ARG1, x86::qword_ptr(c_p, offsetof(Process, mbuf))); + a.mov(entry_mem, ARG3); + a.mov(export_mem, ARG4); + a.mov(mbuf_mem, ARG1); - /* Check if we should trace this bif call */ + /* Check if we should trace this bif call or handle save_calls. Both + * variants dispatch through the export entry. */ a.cmp(x86::dword_ptr(ARG4, offsetof(Export, is_bif_traced)), imm(0)); a.jne(trace); + a.cmp(active_code_ix, imm(ERTS_SAVE_CALLS_CODE_IX)); + a.je(trace); a.dec(FCALLS); a.jle(yield); { + Label check_bif_return = a.newLabel(), gc_after_bif_call = a.newLabel(); + emit_enter_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); - /* Spill the arguments we may need on the error path. */ - a.mov(I, ARG3); - a.mov(exp, ARG4); - #ifdef ERTS_MSACC_EXTENDED_STATES { Label skip_msacc = a.newLabel(); @@ -407,7 +419,7 @@ void BeamGlobalAssembler::emit_call_light_bif_shared() { a.cmp(erts_msacc_cache, imm(0)); a.short_().je(skip_msacc); - a.mov(TMP_MEM1q, RET); + a.mov(TMP_MEM4q, ARG3); a.mov(ARG1, erts_msacc_cache); a.mov(ARG2, @@ -415,29 +427,23 @@ void BeamGlobalAssembler::emit_call_light_bif_shared() { a.mov(ARG3, RET); runtime_call<3>(erts_msacc_set_bif_state); - a.mov(ARG3, I); - a.mov(RET, TMP_MEM1q); + a.mov(ARG3, TMP_MEM4q); a.bind(skip_msacc); } #endif - /* Check if we need to call save_calls */ - a.cmp(active_code_ix, imm(ERTS_SAVE_CALLS_CODE_IX)); - a.je(call_save_calls); - a.bind(call_bif); - a.mov(ARG1, x86::qword_ptr(c_p, offsetof(Process, mbuf))); - a.mov(TMP_MEM1q, ARG1); - - /* ARG3 and RET have been set earlier. */ - a.mov(ARG1, c_p); - load_x_reg_array(ARG2); + { + /* Call the BIF proper. ARG3 and RET have been set earlier. */ + a.mov(ARG1, c_p); + load_x_reg_array(ARG2); #if defined(DEBUG) || defined(ERTS_ENABLE_LOCK_CHECK) - a.mov(ARG4, RET); - runtime_call<4>(debug_call_light_bif); + a.mov(ARG4, RET); + runtime_call<4>(debug_call_light_bif); #else - runtime_call(RET, 3); + runtime_call(RET, 3); #endif + } #ifdef ERTS_MSACC_EXTENDED_STATES { @@ -445,180 +451,153 @@ void BeamGlobalAssembler::emit_call_light_bif_shared() { a.cmp(erts_msacc_cache, imm(0)); a.short_().je(skip_msacc); - - /* update cache if it was changed in the bif. 
- TMP_MEM1q is already taken to save ARG1 above */ - a.mov(TMP_MEM2q, RET); - a.lea(ARG1, erts_msacc_cache); - runtime_call<1>(erts_msacc_update_cache); - a.mov(RET, TMP_MEM2q); - - /* set state to emulator if msacc has been enabled */ - a.cmp(erts_msacc_cache, imm(0)); - a.short_().je(skip_msacc); - a.mov(ARG1, erts_msacc_cache); - a.mov(ARG2, imm(ERTS_MSACC_STATE_EMULATOR)); - a.mov(ARG3, imm(1)); - runtime_call<3>(erts_msacc_set_state_m__); - a.mov(RET, TMP_MEM2q); - + { + /* Update cache if it was changed in the BIF, stashing the + * return value in TMP_MEM4q. */ + a.mov(TMP_MEM4q, RET); + a.lea(ARG1, erts_msacc_cache); + runtime_call<1>(erts_msacc_update_cache); + a.mov(RET, TMP_MEM4q); + + /* set state to emulator if msacc has been enabled */ + a.cmp(erts_msacc_cache, imm(0)); + a.short_().je(skip_msacc); + + a.mov(ARG1, erts_msacc_cache); + a.mov(ARG2, imm(ERTS_MSACC_STATE_EMULATOR)); + a.mov(ARG3, imm(1)); + runtime_call<3>(erts_msacc_set_state_m__); + a.mov(RET, TMP_MEM4q); + } a.bind(skip_msacc); } #endif + /* We must update the active code index in case another process has + * loaded new code, as the result of this BIF may be observable on + * both ends. + * + * It doesn't matter whether the BIF modifies anything; if process + * A loads new code and calls erlang:monotonic_time/0 soon after, + * we'd break the illusion of atomic upgrades if process B still + * ran old code after seeing a later timestamp from its own call to + * erlang:monotonic_time/0. */ + emit_leave_runtime<Update::eReductions | Update::eStack | + Update::eHeap | Update::eCodeIndex>(); + /* ERTS_IS_GC_DESIRED_INTERNAL */ { - a.mov(ARG2, x86::qword_ptr(c_p, offsetof(Process, stop))); - a.mov(ARG3, RET); - a.mov(ARG5, x86::qword_ptr(c_p, offsetof(Process, htop))); + /* Test whether GC is forced. */ + a.test(x86::dword_ptr(c_p, offsetof(Process, flags)), + imm(F_FORCE_GC | F_DISABLE_GC)); + a.jne(gc_after_bif_call); - /* Test if binary heap size should trigger gc */ - a.mov(RET, x86::qword_ptr(c_p, offsetof(Process, bin_vheap_sz))); + /* Test if binary heap size should trigger GC. */ + a.mov(ARG1, x86::qword_ptr(c_p, offsetof(Process, bin_vheap_sz))); a.cmp(x86::qword_ptr(c_p, offsetof(Process, off_heap.overhead)), - RET); - a.mov(RETd, x86::dword_ptr(c_p, offsetof(Process, flags))); - a.seta(x86::cl); /* Clobber ARG1 on windows and ARG4 on Linux */ - a.and_(RETd, imm(F_FORCE_GC | F_DISABLE_GC)); - a.or_(x86::cl, RETb); - a.jne(gc_after_bif_call); + ARG1); + a.ja(gc_after_bif_call); /* Test if heap fragment size is larger than remaining heap size. */ - a.mov(RET, ARG2); - a.sub(RET, ARG5); - a.sar(RET, imm(3)); - a.cmp(RET, x86::qword_ptr(c_p, offsetof(Process, mbuf_sz))); + a.mov(ARG2, x86::qword_ptr(c_p, offsetof(Process, mbuf_sz))); + a.lea(ARG1, x86::qword_ptr(HTOP, ARG2, 0, 3)); + a.cmp(E, ARG1); a.jl(gc_after_bif_call); } - /* - ARG2 is set to E - ARG3 is set to bif return - ARG5 is set to HTOP - - HTOP is exp - E_saved|E is I - */ + /* ! FALL THROUGH ! */ a.bind(check_bif_return); - emit_test_the_non_value(ARG3); - - /* NOTE: Short won't reach if JIT_HARD_DEBUG is defined. */ - a.je(trap); - - a.mov(HTOP, ARG5); -#ifdef NATIVE_ERLANG_STACK - a.mov(E_saved, ARG2); -#else - a.mov(E, ARG2); -#endif - - /* We must update the active code index in case another process has - * loaded new code, as the result of this BIF may be observable on both - * ends. 
- * - * It doesn't matter whether the BIF modifies anything; if process A - * loads new code and calls erlang:monotonic_time/0 soon after, we'd - * break the illusion of atomic upgrades if process B still ran old code - * after seeing a later timestamp from its own call to - * erlang:monotonic_time/0. */ - - emit_leave_runtime<Update::eReductions | Update::eCodeIndex>(); - - a.mov(getXRef(0), ARG3); - a.ret(); - - a.bind(call_save_calls); { - /* Stash the bif function pointer */ - a.mov(TMP_MEM1q, RET); + Label trap = a.newLabel(), error = a.newLabel(); - /* Setup the arguments to call */ - a.mov(ARG1, c_p); - a.mov(ARG2, exp); - runtime_call<2>(save_calls); - - /* Restore RET and ARG3 to the values expected - by the bif call */ - a.mov(RET, TMP_MEM1q); - a.mov(ARG3, I); - a.jmp(call_bif); - } + emit_test_the_non_value(RET); + a.short_().je(trap); - a.bind(trap); - { - a.cmp(x86::qword_ptr(c_p, offsetof(Process, freason)), imm(TRAP)); - a.short_().jne(error); + a.mov(getXRef(0), RET); - emit_leave_runtime<Update::eHeap | Update::eStack | - Update::eReductions | Update::eCodeIndex>(); + emit_leave_frame(); + a.ret(); + + a.bind(trap); + { + a.cmp(x86::qword_ptr(c_p, offsetof(Process, freason)), + imm(TRAP)); + a.short_().jne(error); #if !defined(NATIVE_ERLANG_STACK) - a.pop(getCPRef()); + a.pop(getCPRef()); #endif - /* Trap out, our return address is on the Erlang stack. - * - * The BIF_TRAP macros all set up c_p->arity and c_p->current, so - * we can use a simplified context switch. */ - a.mov(ARG3, x86::qword_ptr(c_p, offsetof(Process, i))); - a.jmp(labels[context_switch_simplified]); - } - - a.bind(error); - { - a.mov(ARG4, exp); - a.mov(RET, I); - - /* Update::eCodeIndex clobbers ARG1 + ARG2 */ - emit_leave_runtime<Update::eHeap | Update::eStack | - Update::eReductions | Update::eCodeIndex>(); + /* Trap out, our return address is on the Erlang stack. + * + * The BIF_TRAP macros all set up c_p->arity and c_p->current, + * so we can use a simplified context switch. */ + a.mov(ARG3, x86::qword_ptr(c_p, offsetof(Process, i))); + a.jmp(labels[context_switch_simplified]); + } - /* handle_error_shared needs the entry address in ARG2 */ - a.mov(ARG2, RET); + a.bind(error); + { + a.mov(ARG2, entry_mem); + a.mov(ARG4, export_mem); + a.add(ARG4, imm(offsetof(Export, info.mfa))); #if !defined(NATIVE_ERLANG_STACK) - /* Discard the continuation pointer as it will never be used. */ - emit_discard_cp(); + /* Discard the continuation pointer as it will never be + * used. */ + emit_unwind_frame(); #endif - /* get_handle_error expects current PC in ARG2 and MFA in ARG4. */ - a.lea(ARG4, x86::qword_ptr(ARG4, offsetof(Export, info.mfa))); - - /* Overwrite the return address with the entry address to ensure - * that only the entry address ends up in the stack trace. */ - a.mov(x86::qword_ptr(E), ARG2); - - a.jmp(labels[handle_error_shared]); + /* Overwrite the return address with the entry address to + * ensure that only the entry address ends up in the stack + * trace. 
*/ + if (erts_frame_layout == ERTS_FRAME_LAYOUT_RA) { + a.mov(x86::qword_ptr(E), ARG2); + } else { + ASSERT(erts_frame_layout == ERTS_FRAME_LAYOUT_FP_RA); + a.mov(x86::qword_ptr(E, 8), ARG2); + } + + a.jmp(labels[raise_exception_shared]); + } } a.bind(gc_after_bif_call); { + a.mov(ARG2, mbuf_mem); + a.mov(ARG5, export_mem); + a.mov(ARG5, x86::qword_ptr(ARG5, offsetof(Export, info.mfa.arity))); + + emit_enter_runtime<Update::eReductions | Update::eStack | + Update::eHeap>(); + a.mov(ARG1, c_p); - a.mov(ARG2, TMP_MEM1q); - /* ARG3 already contains result */ + a.mov(ARG3, RET); load_x_reg_array(ARG4); - a.mov(ARG5, x86::qword_ptr(exp, offsetof(Export, info.mfa.arity))); runtime_call<5>(erts_gc_after_bif_call_lhf); - a.mov(ARG3, RET); - a.mov(ARG5, x86::qword_ptr(c_p, offsetof(Process, htop))); - a.mov(ARG2, x86::qword_ptr(c_p, offsetof(Process, stop))); + + emit_leave_runtime<Update::eReductions | Update::eStack | + Update::eHeap>(); + a.jmp(check_bif_return); } } a.bind(trace); { - /* Call the export entry instead of the BIF. If we use the - * native stack as the Erlang stack our return address is - * already on the Erlang stack. Otherwise we will have to move - * the return address from the native stack to the Erlang - * stack. */ + /* Tail call the export entry instead of the BIF. If we use the native + * stack as the Erlang stack our return address is already on the + * Erlang stack. Otherwise we will have to move the return address from + * the native stack to the Erlang stack. */ + + emit_leave_frame(); #if !defined(NATIVE_ERLANG_STACK) /* The return address must be on the Erlang stack. */ a.pop(getCPRef()); #endif - x86::Mem destination = emit_setup_export_call(ARG4); + x86::Mem destination = emit_setup_dispatchable_call(ARG4); a.jmp(destination); } @@ -630,21 +609,21 @@ void BeamGlobalAssembler::emit_call_light_bif_shared() { a.mov(x86::qword_ptr(c_p, offsetof(Process, current)), ARG4); /* We'll find our way back through ARG3 (entry address). */ - emit_discard_cp(); + emit_unwind_frame(); a.jmp(labels[context_switch_simplified]); } } -void BeamModuleAssembler::emit_call_light_bif(const ArgVal &Bif, - const ArgVal &Exp) { +void BeamModuleAssembler::emit_call_light_bif(const ArgWord &Bif, + const ArgExport &Exp) { Label entry = a.newLabel(); align_erlang_cp(); a.bind(entry); - make_move_patch(ARG4, imports[Exp.getValue()].patches); - a.mov(RET, imm(Bif.getValue())); + mov_arg(ARG4, Exp); + a.mov(RET, imm(Bif.get())); a.lea(ARG3, x86::qword_ptr(entry)); fragment_call(ga->get_call_light_bif_shared()); @@ -654,9 +633,8 @@ void BeamModuleAssembler::emit_send() { Label entry = a.newLabel(); /* This is essentially a mirror of call_light_bif, there's no point to - * specializing send/2 anymore. - * - * FIXME: Rewrite this to an ordinary BIF in the loader instead. */ + * specializing send/2 anymore. We do it here because it's far more work to + * do it in the loader. 
*/ align_erlang_cp(); a.bind(entry); @@ -667,6 +645,10 @@ void BeamModuleAssembler::emit_send() { fragment_call(ga->get_call_light_bif_shared()); } +void BeamModuleAssembler::emit_nif_start() { + /* load time only instruction */ +} + void BeamGlobalAssembler::emit_bif_nif_epilogue(void) { Label check_trap = a.newLabel(), trap = a.newLabel(), error = a.newLabel(); @@ -696,6 +678,9 @@ void BeamGlobalAssembler::emit_bif_nif_epilogue(void) { comment("Do return and dispatch to it"); a.mov(getXRef(0), RET); + + emit_leave_frame(); + #ifdef NATIVE_ERLANG_STACK a.ret(); #else @@ -745,7 +730,7 @@ void BeamGlobalAssembler::emit_bif_nif_epilogue(void) { a.mov(ARG2, RET); a.mov(ARG4, x86::qword_ptr(c_p, offsetof(Process, current))); - a.jmp(labels[handle_error_shared]); + a.jmp(labels[raise_exception_shared]); } } @@ -779,17 +764,16 @@ void BeamGlobalAssembler::emit_call_bif_shared(void) { a.short_().je(skip_msacc); a.mov(TMP_MEM1q, ARG3); - a.mov(TMP_MEM2q, ARG4); - a.mov(TMP_MEM3q, ARG5); + a.mov(TMP_MEM2q, ARG5); a.mov(ARG1, erts_msacc_cache); a.mov(ARG2, x86::qword_ptr(ARG2, offsetof(ErtsCodeMFA, module))); a.mov(ARG3, ARG4); runtime_call<3>(erts_msacc_set_bif_state); + a.mov(ARG4, RET); a.mov(ARG3, TMP_MEM1q); - a.mov(ARG4, TMP_MEM2q); - a.mov(ARG5, TMP_MEM3q); + a.mov(ARG5, TMP_MEM2q); a.bind(skip_msacc); } #endif @@ -814,41 +798,52 @@ void BeamGlobalAssembler::emit_dispatch_bif(void) { * `info` structure. */ a.mov(ARG3, x86::qword_ptr(c_p, offsetof(Process, i))); - ERTS_CT_ASSERT(offsetof(ErtsNativeFunc, trampoline.trace) == + ERTS_CT_ASSERT(offsetof(ErtsNativeFunc, trampoline.call_bif_nif) == sizeof(ErtsCodeInfo)); ssize_t mfa_offset = offsetof(ErtsNativeFunc, trampoline.info.mfa) - - offsetof(ErtsNativeFunc, trampoline.trace); + offsetof(ErtsNativeFunc, trampoline.call_bif_nif); a.lea(ARG2, x86::qword_ptr(ARG3, mfa_offset)); ssize_t dfunc_offset = offsetof(ErtsNativeFunc, trampoline.dfunc) - - offsetof(ErtsNativeFunc, trampoline.trace); + offsetof(ErtsNativeFunc, trampoline.call_bif_nif); a.mov(ARG4, x86::qword_ptr(ARG3, dfunc_offset)); a.jmp(labels[call_bif_shared]); } -void BeamModuleAssembler::emit_call_bif(const ArgVal &Func) { +void BeamModuleAssembler::emit_call_bif(const ArgWord &Func) { int mfa_offset = -(int)sizeof(ErtsCodeMFA); - a.lea(ARG2, x86::qword_ptr(currLabel, mfa_offset)); - a.lea(ARG3, x86::qword_ptr(currLabel)); - mov_arg(ARG4, Func); + Label entry = a.newLabel(); + + /* This is _always_ the first instruction in a function and replaces the + * yield test that would otherwise add a frame, so we must add a frame + * here. */ + emit_enter_frame(); + + /* Yield entry point; must be after entering frame. 
*/ + a.bind(entry); + { + a.lea(ARG2, x86::qword_ptr(current_label, mfa_offset)); + a.lea(ARG3, x86::qword_ptr(entry)); + mov_arg(ARG4, Func); - abs_jmp(ga->get_call_bif_shared()); + a.jmp(resolve_fragment(ga->get_call_bif_shared())); + } } -void BeamModuleAssembler::emit_call_bif_mfa(const ArgVal &M, - const ArgVal &F, - const ArgVal &A) { +void BeamModuleAssembler::emit_call_bif_mfa(const ArgAtom &M, + const ArgAtom &F, + const ArgWord &A) { BeamInstr func; Export *e; - e = erts_active_export_entry(M.getValue(), F.getValue(), A.getValue()); + e = erts_active_export_entry(M.get(), F.get(), A.get()); ASSERT(e != NULL && e->bif_number != -1); func = (BeamInstr)bif_table[e->bif_number].f; - emit_call_bif(ArgVal(ArgVal::i, func)); + emit_call_bif(ArgWord(func)); } void BeamGlobalAssembler::emit_call_nif_early() { @@ -866,6 +861,7 @@ void BeamGlobalAssembler::emit_call_nif_early() { a.test(ARG2, imm(sizeof(UWord) - 1)); a.short_().je(next); + comment("# Return address isn't word-aligned"); a.ud2(); a.bind(next); @@ -879,11 +875,14 @@ void BeamGlobalAssembler::emit_call_nif_early() { emit_leave_runtime(); - /* We won't return to the original code. */ - emit_discard_cp(); + /* We won't return to the original code. We KNOW that the stack points at + * a return address. */ + a.add(x86::rsp, imm(8)); /* Emulate `emit_call_nif`, loading the current (phony) instruction - * pointer into ARG2. */ + * pointer into ARG2. We push a (redundant) frame pointer to match the + * corresponding `emit_leave_frame` in `call_nif_shared`. */ + emit_enter_frame(); a.mov(ARG3, RET); a.jmp(labels[call_nif_shared]); } @@ -932,48 +931,71 @@ void BeamGlobalAssembler::emit_dispatch_nif(void) { * * ErtsNativeFunc already follows the NIF call layout, so we don't need to * do anything beyond loading the address. */ - ERTS_CT_ASSERT(offsetof(ErtsNativeFunc, trampoline.trace) == + ERTS_CT_ASSERT(offsetof(ErtsNativeFunc, trampoline.call_bif_nif) == sizeof(ErtsCodeInfo)); a.mov(ARG3, x86::qword_ptr(c_p, offsetof(Process, i))); a.jmp(labels[call_nif_shared]); } -/* WARNING: This stub is memcpy'd, so all code herein must be explicitly - * position-independent. */ -void BeamModuleAssembler::emit_call_nif(const ArgVal &Func, - const ArgVal &NifMod, - const ArgVal &DirtyFunc) { - Label dispatch = a.newLabel(); - uint64_t val; - - /* The start of this function has to mimic the layout of ErtsNativeFunc. */ - a.jmp(dispatch); /* call_op */ - - a.align(AlignMode::kCode, 8); - /* ErtsNativeFunc.dfunc */ - val = Func.getValue(); - a.embed(&val, sizeof(val)); - /* ErtsNativeFunc.m */ - val = NifMod.getValue(); - a.embed(&val, sizeof(val)); - /* ErtsNativeFunc.func */ - val = DirtyFunc.getValue(); - a.embed(&val, sizeof(val)); - - /* The real code starts here */ - a.bind(dispatch); +void BeamGlobalAssembler::emit_call_nif_yield_helper() { + Label yield = a.newLabel(); + + a.dec(FCALLS); + a.short_().jl(yield); + a.jmp(labels[call_nif_shared]); + + a.bind(yield); { - Label yield = a.newLabel(); + int mfa_offset = -(int)sizeof(ErtsCodeMFA); + int arity_offset = mfa_offset + (int)offsetof(ErtsCodeMFA, arity); - a.lea(ARG3, x86::qword_ptr(currLabel)); + a.mov(ARG1, x86::qword_ptr(ARG3, arity_offset)); + a.mov(x86::qword_ptr(c_p, offsetof(Process, arity)), ARG1); - a.dec(FCALLS); - a.jl(yield); + a.lea(ARG1, x86::qword_ptr(ARG3, mfa_offset)); + a.mov(x86::qword_ptr(c_p, offsetof(Process, current)), ARG1); - pic_jmp(ga->get_call_nif_shared()); + /* Yield to `dispatch` rather than `entry` to avoid pushing too many + * frames to the stack. 
See `emit_call_nif` for details. */ + a.add(ARG3, imm(BEAM_ASM_NFUNC_SIZE + sizeof(UWord[3]))); + a.jmp(labels[context_switch_simplified]); + } +} - a.bind(yield); - pic_jmp(ga->get_context_switch()); +/* WARNING: This stub is memcpy'd, so all code herein must be explicitly + * position-independent. */ +void BeamModuleAssembler::emit_call_nif(const ArgWord &Func, + const ArgWord &NifMod, + const ArgWord &DirtyFunc) { + Label entry = a.newLabel(), dispatch = a.newLabel(); + + /* The start of this function must mimic the layout of ErtsNativeFunc. + * + * We jump here on the very first entry, pushing a stack frame if + * applicable. */ + a.bind(entry); + { + emit_enter_frame(); + a.short_().jmp(dispatch); /* call_op */ + + a.align(AlignMode::kCode, 8); + /* ErtsNativeFunc.dfunc */ + a.embedUInt64(Func.get()); + /* ErtsNativeFunc.m */ + a.embedUInt64(NifMod.get()); + /* ErtsNativeFunc.func */ + a.embedUInt64(DirtyFunc.get()); + } + + /* `emit_call_nif_yield_helper` relies on this to compute the address of + * `dispatch` */ + ASSERT((a.offset() - code.labelOffsetFromBase(current_label)) == + BEAM_ASM_NFUNC_SIZE + sizeof(UWord[3])); + + a.bind(dispatch); + { + a.lea(ARG3, x86::qword_ptr(current_label)); + pic_jmp(ga->get_call_nif_yield_helper()); } } @@ -996,15 +1018,18 @@ void BeamGlobalAssembler::emit_i_load_nif_shared() { a.cmp(RET, RET_NIF_yield); a.short_().je(yield); + + /* We entered the frame in module code. */ + emit_leave_frame(); + a.cmp(RET, RET_NIF_success); a.short_().jne(error); - a.ret(); a.bind(error); { a.mov(ARG4, imm(&bif_mfa)); - emit_handle_error(); + a.jmp(labels[raise_exception]); } a.bind(yield); @@ -1014,10 +1039,49 @@ void BeamGlobalAssembler::emit_i_load_nif_shared() { } } +static ErtsCodePtr get_on_load_address(Process *c_p, Eterm module) { + const Module *modp = erts_get_module(module, erts_active_code_ix()); + + if (modp && modp->on_load) { + const BeamCodeHeader *hdr = (modp->on_load)->code_hdr; + + if (hdr) { + return erts_codeinfo_to_code(hdr->on_load); + } + } + + c_p->freason = BADARG; + + return NULL; +} + +/* Implements the internal and undocumented erlang:call_on_load_function/1, + * which is tricky to implement in the face of frame pointers. */ +void BeamModuleAssembler::emit_i_call_on_load_function() { + static ErtsCodeMFA mfa = {am_erlang, am_call_on_load_function, 1}; + Label next = a.newLabel(); + + emit_enter_runtime(); + + a.mov(ARG1, c_p); + a.mov(ARG2, getXRef(0)); + runtime_call<2>(get_on_load_address); + + emit_leave_runtime(); + + a.test(RET, RET); + a.jne(next); + + emit_raise_exception(&mfa); + + a.bind(next); + erlang_call(RET, ARG1); +} + #ifdef NATIVE_ERLANG_STACK void BeamModuleAssembler::emit_i_load_nif() { - Label entry = a.newLabel(), next = a.newLabel(); + Label entry = a.newLabel(), yield = a.newLabel(), next = a.newLabel(); /* i_load_nif is a rewrite of a call_ext instruction, so we'll body-call * ourselves to ensure the stack is consistent with that. 
This greatly @@ -1028,8 +1092,13 @@ void BeamModuleAssembler::emit_i_load_nif() { align_erlang_cp(); a.bind(entry); { - a.lea(ARG2, x86::qword_ptr(entry)); - abs_jmp(ga->get_i_load_nif_shared()); + emit_enter_frame(); + + a.bind(yield); + { + a.lea(ARG2, x86::qword_ptr(yield)); + a.jmp(resolve_fragment(ga->get_i_load_nif_shared())); + } } a.bind(next); @@ -1048,7 +1117,7 @@ void BeamModuleAssembler::emit_i_load_nif() { emit_enter_runtime<Update::eStack | Update::eHeap>(); a.mov(ARG1, c_p); - a.lea(ARG2, x86::qword_ptr(currLabel)); + a.lea(ARG2, x86::qword_ptr(current_label)); load_x_reg_array(ARG3); runtime_call<3>(beam_jit_load_nif); @@ -1059,12 +1128,12 @@ void BeamModuleAssembler::emit_i_load_nif() { a.cmp(RET, imm(RET_NIF_success)); a.je(next); - emit_handle_error(currLabel, &mfa); + emit_raise_exception(current_label, &mfa); a.bind(schedule); { a.lea(ARG3, x86::qword_ptr(entry)); - abs_jmp(ga->get_context_switch_simplified()); + a.jmp(resolve_fragment(ga->get_context_switch_simplified())); } a.bind(next); |