1 files changed, 1073 insertions, 0 deletions
diff --git a/erts/emulator/beam/jit/x86/instr_bif.cpp b/erts/emulator/beam/jit/x86/instr_bif.cpp
new file mode 100644
index 0000000000..ae329e3f0c
--- /dev/null
+++ b/erts/emulator/beam/jit/x86/instr_bif.cpp
@@ -0,0 +1,1073 @@
+/*
+ * %CopyrightBegin%
+ *
+ * Copyright Ericsson AB 2020-2020. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * %CopyrightEnd%
+ */
+
+#include "beam_asm.hpp"
+
+extern "C"
+{
+#include "beam_common.h"
+#include "code_ix.h"
+#include "erl_bif_table.h"
+#include "erl_nfunc_sched.h"
+#include "bif.h"
+#include "erl_msacc.h"
+}
+
+/* ARG2 = argument vector, ARG4 (!) = bif function pointer
+ *
+ * Result is returned in RET, error is indicated by ZF. */
+void BeamGlobalAssembler::emit_i_bif_guard_shared() {
+    emit_enter_runtime<Update::eReductions>();
+
+    a.mov(ARG1, c_p);
+    /* ARG2 has been set by caller; ARG3 is never used by guard BIFs. */
+    mov_imm(ARG3, 0);
+    runtime_call(ARG4, 3);
+
+    emit_leave_runtime<Update::eReductions>();
+
+    emit_test_the_non_value(RET);
+    a.ret();
+}
+
+/* ARG2 = argument vector, ARG4 (!) = bif function pointer
+ *
+ * Result is returned in RET. */
+void BeamGlobalAssembler::emit_i_bif_body_shared() {
+    Label error = a.newLabel();
+
+    emit_enter_runtime<Update::eReductions>();
+
+    /* Save current BIF and argument vector for the error path. */
+    a.mov(TMP_MEM1q, ARG2);
+    a.mov(TMP_MEM2q, ARG4);
+
+    a.mov(ARG1, c_p);
+    /* ARG2 has been set by caller; ARG3 is never used by guard BIFs. */
+    mov_imm(ARG3, 0);
+    runtime_call(ARG4, 3);
+
+    emit_test_the_non_value(RET);
+    a.short_().je(error);
+
+    emit_leave_runtime<Update::eReductions>();
+
+    a.ret();
+
+    a.bind(error);
+    {
+        /* Copy arguments into x-registers from the argument vector. We don't
+         * need to care about the actual arity since all x-registers are
+         * clobbered on exceptions. */
+        a.mov(ARG2, TMP_MEM1q);
+        for (int i = 0; i < 3; i++) {
+            a.mov(ARG1, x86::qword_ptr(ARG2, i * sizeof(Eterm)));
+            a.mov(getXRef(i), ARG1);
+        }
+
+        /* Find the correct MFA from the BIF's function address. */
+        a.mov(ARG1, TMP_MEM2q);
+        runtime_call<1>(ubif2mfa);
+
+        emit_leave_runtime<Update::eReductions>();
+
+        a.mov(ARG4, RET);
+        a.jmp(labels[handle_error_shared_prologue]);
+    }
+}
+
+void BeamModuleAssembler::emit_setup_guard_bif(const std::vector<ArgVal> &args,
+                                               const ArgVal &bif) {
+    bool is_contiguous_mem = false;
+
+    ASSERT(args.size() > 0 && args.size() <= 3);
+
+    /* If the guard BIF's arguments are in memory and continuous, for example
+     * `map_get(x0, x1)`, then we can pass the address of the first argument
+     * instead of filling in the argument vector. */
+    is_contiguous_mem = args.size() && args[0].isMem();
+    for (size_t i = 1; i < args.size() && is_contiguous_mem; i++) {
+        const ArgVal &curr = args[i], &prev = args[i - 1];
+
+        is_contiguous_mem = curr.getType() == prev.getType() &&
+                            curr.getValue() == prev.getValue() + 1;
+    }
+
+    if (is_contiguous_mem) {
+        a.lea(ARG2, getArgRef(args[0]));
+    } else {
+        a.lea(ARG2, TMP_MEM3q);
+
+        for (size_t i = 0; i < args.size(); i++) {
+            mov_arg(x86::qword_ptr(ARG2, i * sizeof(Eterm)), args[i]);
+        }
+    }
+
+    mov_arg(ARG4, bif);
+}
+
+void BeamModuleAssembler::emit_i_bif1(const ArgVal &Src1,
+                                      const ArgVal &Fail,
+                                      const ArgVal &Bif,
+                                      const ArgVal &Dst) {
+    emit_setup_guard_bif({Src1}, Bif);
+
+    if (Fail.getValue() != 0) {
+        safe_fragment_call(ga->get_i_bif_guard_shared());
+        a.je(labels[Fail.getValue()]);
+    } else {
+        safe_fragment_call(ga->get_i_bif_body_shared());
+    }
+
+    mov_arg(Dst, RET);
+}
+
+void BeamModuleAssembler::emit_i_bif2(const ArgVal &Src1,
+                                      const ArgVal &Src2,
+                                      const ArgVal &Fail,
+                                      const ArgVal &Bif,
+                                      const ArgVal &Dst) {
+    emit_setup_guard_bif({Src1, Src2}, Bif);
+
+    if (Fail.getValue() != 0) {
+        safe_fragment_call(ga->get_i_bif_guard_shared());
+        a.je(labels[Fail.getValue()]);
+    } else {
+        safe_fragment_call(ga->get_i_bif_body_shared());
+    }
+
+    mov_arg(Dst, RET);
+}
+
+void BeamModuleAssembler::emit_i_bif3(const ArgVal &Src1,
+                                      const ArgVal &Src2,
+                                      const ArgVal &Src3,
+                                      const ArgVal &Fail,
+                                      const ArgVal &Bif,
+                                      const ArgVal &Dst) {
+    emit_setup_guard_bif({Src1, Src2, Src3}, Bif);
+
+    if (Fail.getValue() != 0) {
+        safe_fragment_call(ga->get_i_bif_guard_shared());
+        a.je(labels[Fail.getValue()]);
+    } else {
+        safe_fragment_call(ga->get_i_bif_body_shared());
+    }
+
+    mov_arg(Dst, RET);
+}
+
+/*
+ * Emit code for guard BIFs that can't fail (e.g. is_list/1).  We
+ * don't need to test for failure and even in a body there is no need
+ * to align the call targeting the shared fragment.
+ */
+
+void BeamModuleAssembler::emit_nofail_bif1(const ArgVal &Src1,
+                                           const ArgVal &Bif,
+                                           const ArgVal &Dst) {
+    emit_setup_guard_bif({Src1}, Bif);
+    safe_fragment_call(ga->get_i_bif_guard_shared());
+    mov_arg(Dst, RET);
+}
+
+void BeamModuleAssembler::emit_nofail_bif2(const ArgVal &Src1,
+                                           const ArgVal &Src2,
+                                           const ArgVal &Bif,
+                                           const ArgVal &Dst) {
+    emit_setup_guard_bif({Src1, Src2}, Bif);
+    safe_fragment_call(ga->get_i_bif_guard_shared());
+    mov_arg(Dst, RET);
+}
+
+void BeamModuleAssembler::emit_i_length_setup(const ArgVal &Fail,
+                                              const ArgVal &Live,
+                                              const ArgVal &Src) {
+    x86::Mem trap_state;
+
+    /* Store trap state after the currently live registers. There's an extra 3
+     * registers beyond the ordinary ones that we're free to use for whatever
+     * purpose. */
+    ERTS_CT_ASSERT(ERTS_X_REGS_ALLOCATED - MAX_REG >= 3);
+    ASSERT(Live.getValue() <= MAX_REG);
+    trap_state = getXRef(Live.getValue());
+
+    /* Remainder of the list. */
+    mov_arg(trap_state, Src);
+
+    /* Accumulated length. */
+    a.mov(trap_state.cloneAdjusted(1 * sizeof(Eterm)), imm(make_small(0)));
+
+    /* Original argument. This is only needed for exceptions and can be safely
+     * skipped in guards. */
+    if (Fail.getValue() == 0) {
+        x86::Mem original_argument;
+
+        original_argument = trap_state.cloneAdjusted(2 * sizeof(Eterm));
+        mov_arg(original_argument, Src);
+    }
+}
+
+/* ARG2 = live registers, ARG3 = entry address
+ *
+ * Result is returned in RET. */
+x86::Mem BeamGlobalAssembler::emit_i_length_common(Label fail, int state_size) {
+    Label trap = a.newLabel();
+    x86::Mem trap_state;
+
+    ASSERT(state_size >= 2 && state_size <= ERTS_X_REGS_ALLOCATED - MAX_REG);
+
+    /* getXRef(Live) */
+    trap_state = getXRef(0);
+    trap_state.setIndex(ARG2, 3);
+
+    /* Save arguments for error/trapping path. */
+    a.mov(TMP_MEM1q, ARG2);
+    a.mov(TMP_MEM2q, ARG3);
+
+    emit_enter_runtime<Update::eReductions>();
+
+    a.mov(ARG1, c_p);
+    a.lea(ARG2, trap_state);
+    runtime_call<2>(erts_trapping_length_1);
+
+    emit_leave_runtime<Update::eReductions>();
+
+    emit_test_the_non_value(RET);
+    a.short_().je(trap);
+
+    a.ret();
+
+    a.bind(trap);
+    {
+        a.mov(ARG2, TMP_MEM1q);
+        a.mov(ARG3, TMP_MEM2q);
+
+        a.cmp(x86::qword_ptr(c_p, offsetof(Process, freason)), imm(TRAP));
+        a.jne(fail);
+
+        /* The trap state is stored in the registers above the current live
+         * ones, so we add the state size (in words) to keep it alive. */
+        a.add(ARG2, imm(state_size));
+
+        /* We'll find our way back through the entry address (ARG3). */
+        emit_discard_cp();
+
+        a.mov(x86::qword_ptr(c_p, offsetof(Process, current)), imm(0));
+        a.mov(x86::qword_ptr(c_p, offsetof(Process, arity)), ARG2);
+        a.jmp(labels[context_switch_simplified]);
+    }
+
+    return trap_state;
+}
+
+/* ARG2 = live registers, ARG3 = entry address
+ *
+ * Result is returned in RET. */
+void BeamGlobalAssembler::emit_i_length_body_shared() {
+    Label error = a.newLabel();
+    x86::Mem trap_state;
+
+    /* `state_size = 3` to include the original argument. */
+    trap_state = emit_i_length_common(error, 3);
+
+    a.bind(error);
+    {
+        static const ErtsCodeMFA bif_mfa = {am_erlang, am_length, 1};
+
+        /* Move the original argument to x0. It's stored in the third word of
+         * the trap state. */
+        a.mov(ARG1, trap_state.cloneAdjusted(2 * sizeof(Eterm)));
+        a.mov(getXRef(0), ARG1);
+
+        a.mov(ARG4, imm(&bif_mfa));
+        emit_handle_error();
+    }
+}
+
+/* ARG2 = live registers, ARG3 = entry address
+ *
+ * Result is returned in RET, error is indicated by ZF. */
+void BeamGlobalAssembler::emit_i_length_guard_shared() {
+    Label error = a.newLabel();
+
+    emit_i_length_common(error, 2);
+
+    a.bind(error);
+    {
+        mov_imm(RET, 0);
+        a.ret();
+    }
+}
+
+void BeamModuleAssembler::emit_i_length(const ArgVal &Fail,
+                                        const ArgVal &Live,
+                                        const ArgVal &Dst) {
+    Label entry = a.newLabel();
+
+    align_erlang_cp();
+    a.bind(entry);
+
+    mov_arg(ARG2, Live);
+    a.lea(ARG3, x86::qword_ptr(entry));
+
+    if (Fail.getValue() != 0) {
+        /* The return address is discarded when yielding, so it doesn't need to
+         * be aligned. */
+        safe_fragment_call(ga->get_i_length_guard_shared());
+        a.je(labels[Fail.getValue()]);
+    } else {
+        fragment_call(ga->get_i_length_body_shared());
+    }
+
+    mov_arg(Dst, RET);
+}
+
+#if defined(DEBUG) || defined(ERTS_ENABLE_LOCK_CHECK)
+
+static Eterm debug_call_light_bif(Process *c_p,
+                                  Eterm *reg,
+                                  ErtsCodePtr I,
+                                  ErtsBifFunc vbf) {
+    Eterm result;
+
+    ERTS_UNREQ_PROC_MAIN_LOCK(c_p);
+    {
+        ERTS_CHK_MBUF_SZ(c_p);
+        ASSERT(!ERTS_PROC_IS_EXITING(c_p));
+        result = vbf(c_p, reg, I);
+        ASSERT(!ERTS_PROC_IS_EXITING(c_p) || is_non_value(result));
+        ERTS_CHK_MBUF_SZ(c_p);
+
+        ERTS_VERIFY_UNUSED_TEMP_ALLOC(c_p);
+        ERTS_HOLE_CHECK(c_p);
+    }
+    PROCESS_MAIN_CHK_LOCKS(c_p);
+    ERTS_REQ_PROC_MAIN_LOCK(c_p);
+
+    return result;
+}
+#endif
+
+/* It is important that the below code is as optimized as possible.
+ * When doing any changes, make sure to look at the estone bif_dispatch
+ * benchmark to make sure you don't introduce any regressions.
+ *
+ * ARG3 = entry
+ * ARG4 = export entry
+ * RET  = BIF pointer
+ */
+void BeamGlobalAssembler::emit_call_light_bif_shared() {
+    /* We use the HTOP and FCALLS registers as they are
+       not used on the runtime-stack and are caller save. */
+
+    x86::Gp I = HTOP, exp = FCALLS;
+
+    Label error = a.newLabel(), trace = a.newLabel(), trap = a.newLabel(),
+          yield = a.newLabel(), call_save_calls = a.newLabel(),
+          call_bif = a.newLabel(), gc_after_bif_call = a.newLabel(),
+          check_bif_return = a.newLabel();
+
+    /* Check if we should trace this bif call */
+    a.cmp(x86::dword_ptr(ARG4, offsetof(Export, is_bif_traced)), imm(0));
+    a.jne(trace);
+
+    a.dec(FCALLS);
+    a.jle(yield);
+
+    {
+        emit_enter_runtime<Update::eReductions | Update::eStack |
+                           Update::eHeap>();
+
+        /* Spill the arguments we may need on the error path. */
+        a.mov(I, ARG3);
+        a.mov(exp, ARG4);
+
+#ifdef ERTS_MSACC_EXTENDED_STATES
+        {
+            Label skip_msacc = a.newLabel();
+
+            a.cmp(erts_msacc_cache, imm(0));
+            a.short_().je(skip_msacc);
+
+            a.mov(TMP_MEM1q, RET);
+
+            a.mov(ARG1, erts_msacc_cache);
+            a.mov(ARG2,
+                  x86::qword_ptr(ARG4, offsetof(Export, info.mfa.module)));
+            a.mov(ARG3, RET);
+            runtime_call<3>(erts_msacc_set_bif_state);
+
+            a.mov(ARG3, I);
+            a.mov(RET, TMP_MEM1q);
+            a.bind(skip_msacc);
+        }
+#endif
+        /* Check if we need to call save_calls */
+        a.cmp(active_code_ix, imm(ERTS_SAVE_CALLS_CODE_IX));
+        a.je(call_save_calls);
+        a.bind(call_bif);
+
+        a.mov(ARG1, x86::qword_ptr(c_p, offsetof(Process, mbuf)));
+        a.mov(TMP_MEM1q, ARG1);
+
+        /* ARG3 and RET have been set earlier. */
+        a.mov(ARG1, c_p);
+        load_x_reg_array(ARG2);
+
+#if defined(DEBUG) || defined(ERTS_ENABLE_LOCK_CHECK)
+        a.mov(ARG4, RET);
+        runtime_call<4>(debug_call_light_bif);
+#else
+        runtime_call(RET, 3);
+#endif
+
+#ifdef ERTS_MSACC_EXTENDED_STATES
+        {
+            Label skip_msacc = a.newLabel();
+
+            a.cmp(erts_msacc_cache, imm(0));
+            a.short_().je(skip_msacc);
+
+            /* update cache if it was changed in the bif.
+               TMP_MEM1q is already taken to save ARG1 above */
+            a.mov(TMP_MEM2q, RET);
+            a.lea(ARG1, erts_msacc_cache);
+            runtime_call<1>(erts_msacc_update_cache);
+            a.mov(RET, TMP_MEM2q);
+
+            /* set state to emulator if msacc has been enabled */
+            a.cmp(erts_msacc_cache, imm(0));
+            a.short_().je(skip_msacc);
+            a.mov(ARG1, erts_msacc_cache);
+            a.mov(ARG2, imm(ERTS_MSACC_STATE_EMULATOR));
+            a.mov(ARG3, imm(1));
+            runtime_call<3>(erts_msacc_set_state_m__);
+            a.mov(RET, TMP_MEM2q);
+
+            a.bind(skip_msacc);
+        }
+#endif
+
+        /* ERTS_IS_GC_DESIRED_INTERNAL */
+        {
+            a.mov(ARG2, x86::qword_ptr(c_p, offsetof(Process, stop)));
+            a.mov(ARG3, RET);
+            a.mov(ARG5, x86::qword_ptr(c_p, offsetof(Process, htop)));
+
+            /* Test if binary heap size should trigger gc */
+            a.mov(RET, x86::qword_ptr(c_p, offsetof(Process, bin_vheap_sz)));
+            a.cmp(x86::qword_ptr(c_p, offsetof(Process, off_heap.overhead)),
+                  RET);
+            a.mov(RETd, x86::dword_ptr(c_p, offsetof(Process, flags)));
+            a.seta(x86::cl); /* Clobber ARG1 on windows and ARG4 on Linux */
+            a.and_(RETd, imm(F_FORCE_GC));
+            a.or_(x86::cl, RETb);
+            a.jne(gc_after_bif_call);
+
+            /* Test if heap fragment size is larger than remaining heap size. */
+            a.mov(RET, ARG2);
+            a.sub(RET, ARG5);
+            a.sar(RET, imm(3));
+            a.cmp(RET, x86::qword_ptr(c_p, offsetof(Process, mbuf_sz)));
+            a.jl(gc_after_bif_call);
+        }
+
+        /*
+           ARG2 is set to E
+           ARG3 is set to bif return
+           ARG5 is set to HTOP
+
+           HTOP is exp
+           E_saved|E is I
+        */
+        a.bind(check_bif_return);
+        emit_test_the_non_value(ARG3);
+
+        /* NOTE: Short won't reach if JIT_HARD_DEBUG is defined. */
+        a.je(trap);
+
+        a.mov(HTOP, ARG5);
+#ifdef NATIVE_ERLANG_STACK
+        a.mov(E_saved, ARG2);
+#else
+        a.mov(E, ARG2);
+#endif
+
+        /* We must update the active code index in case another process has
+         * loaded new code, as the result of this BIF may be observable on both
+         * ends.
+         *
+         * It doesn't matter whether the BIF modifies anything; if process A
+         * loads new code and calls erlang:monotonic_time/0 soon after, we'd
+         * break the illusion of atomic upgrades if process B still ran old code
+         * after seeing a later timestamp from its own call to
+         * erlang:monotonic_time/0. */
+
+        emit_leave_runtime<Update::eReductions | Update::eCodeIndex>();
+
+        a.mov(getXRef(0), ARG3);
+        a.ret();
+
+        a.bind(call_save_calls);
+        {
+            /* Stash the bif function pointer */
+            a.mov(TMP_MEM1q, RET);
+
+            /* Setup the arguments to call */
+            a.mov(ARG1, c_p);
+            a.mov(ARG2, exp);
+            runtime_call<2>(save_calls);
+
+            /* Restore RET and ARG3 to the values expected
+               by the bif call */
+            a.mov(RET, TMP_MEM1q);
+            a.mov(ARG3, I);
+            a.jmp(call_bif);
+        }
+
+        a.bind(trap);
+        {
+            a.cmp(x86::qword_ptr(c_p, offsetof(Process, freason)), imm(TRAP));
+            a.short_().jne(error);
+
+            emit_leave_runtime<Update::eHeap | Update::eStack |
+                               Update::eReductions | Update::eCodeIndex>();
+
+#if !defined(NATIVE_ERLANG_STACK)
+            a.pop(getCPRef());
+#endif
+
+            /* Trap out, our return address is on the Erlang stack.
+             *
+             * The BIF_TRAP macros all set up c_p->arity and c_p->current, so
+             * we can use a simplified context switch. */
+            a.mov(ARG3, x86::qword_ptr(c_p, offsetof(Process, i)));
+            a.jmp(labels[context_switch_simplified]);
+        }
+
+        a.bind(error);
+        {
+            a.mov(ARG4, exp);
+            a.mov(RET, I);
+
+            /* Update::eCodeIndex clobbers ARG1 + ARG2 */
+            emit_leave_runtime<Update::eHeap | Update::eStack |
+                               Update::eReductions | Update::eCodeIndex>();
+
+            /* handle_error_shared needs the entry address in ARG2 */
+            a.mov(ARG2, RET);
+
+#if !defined(NATIVE_ERLANG_STACK)
+            /* Discard the continuation pointer as it will never be used. */
+            emit_discard_cp();
+#endif
+
+            /* get_handle_error expects current PC in ARG2 and MFA in ARG4. */
+            a.lea(ARG4, x86::qword_ptr(ARG4, offsetof(Export, info.mfa)));
+
+            /* Overwrite the return address with the entry address to ensure
+             * that only the entry address ends up in the stack trace. */
+            a.mov(x86::qword_ptr(E), ARG2);
+
+            a.jmp(labels[handle_error_shared]);
+        }
+
+        a.bind(gc_after_bif_call);
+        {
+            a.mov(ARG1, c_p);
+            a.mov(ARG2, TMP_MEM1q);
+            /* ARG3 already contains result */
+            load_x_reg_array(ARG4);
+            a.mov(ARG5, x86::qword_ptr(exp, offsetof(Export, info.mfa.arity)));
+            runtime_call<5>(erts_gc_after_bif_call_lhf);
+            a.mov(ARG3, RET);
+            a.mov(ARG5, x86::qword_ptr(c_p, offsetof(Process, htop)));
+            a.mov(ARG2, x86::qword_ptr(c_p, offsetof(Process, stop)));
+            a.jmp(check_bif_return);
+        }
+    }
+
+    a.bind(trace);
+    {
+        /* Call the export entry instead of the BIF. If we use the
+         * native stack as the Erlang stack our return address is
+         * already on the Erlang stack. Otherwise we will have to move
+         * the return address from the native stack to the Erlang
+         * stack. */
+
+#if !defined(NATIVE_ERLANG_STACK)
+        /* The return address must be on the Erlang stack. */
+        a.pop(getCPRef());
+#endif
+
+        x86::Mem destination = emit_setup_export_call(ARG4);
+        a.jmp(destination);
+    }
+
+    a.bind(yield);
+    {
+        a.mov(ARG2, x86::qword_ptr(ARG4, offsetof(Export, info.mfa.arity)));
+        a.lea(ARG4, x86::qword_ptr(ARG4, offsetof(Export, info.mfa)));
+        a.mov(x86::qword_ptr(c_p, offsetof(Process, arity)), ARG2);
+        a.mov(x86::qword_ptr(c_p, offsetof(Process, current)), ARG4);
+
+        /* We'll find our way back through ARG3 (entry address). */
+        emit_discard_cp();
+
+        a.jmp(labels[context_switch_simplified]);
+    }
+}
+
+void BeamModuleAssembler::emit_call_light_bif(const ArgVal &Bif,
+                                              const ArgVal &Exp) {
+    Label entry = a.newLabel();
+
+    align_erlang_cp();
+    a.bind(entry);
+
+    make_move_patch(ARG4, imports[Exp.getValue()].patches);
+    a.mov(RET, imm(Bif.getValue()));
+    a.lea(ARG3, x86::qword_ptr(entry));
+
+    fragment_call(ga->get_call_light_bif_shared());
+}
+
+void BeamModuleAssembler::emit_send() {
+    Label entry = a.newLabel();
+
+    /* This is essentially a mirror of call_light_bif, there's no point to
+     * specializing send/2 anymore.
+     *
+     * FIXME: Rewrite this to an ordinary BIF in the loader instead. */
+    align_erlang_cp();
+    a.bind(entry);
+
+    a.mov(ARG4, imm(BIF_TRAP_EXPORT(BIF_send_2)));
+    a.mov(RET, imm(send_2));
+    a.lea(ARG3, x86::qword_ptr(entry));
+
+    fragment_call(ga->get_call_light_bif_shared());
+}
+
+void BeamGlobalAssembler::emit_bif_nif_epilogue(void) {
+    Label check_trap = a.newLabel(), trap = a.newLabel(), error = a.newLabel();
+
+#ifdef ERTS_MSACC_EXTENDED_STATES
+    {
+        Label skip_msacc = a.newLabel();
+
+        a.cmp(erts_msacc_cache, 0);
+        a.short_().je(skip_msacc);
+        a.mov(TMP_MEM1q, RET);
+        a.mov(ARG1, erts_msacc_cache);
+        a.mov(ARG2, imm(ERTS_MSACC_STATE_EMULATOR));
+        a.mov(ARG3, imm(1));
+        runtime_call<3>(erts_msacc_set_state_m__);
+        a.mov(RET, TMP_MEM1q);
+        a.bind(skip_msacc);
+    }
+#endif
+
+    /* Another process may have loaded new code and somehow notified us through
+     * this call, so we must update the active code index. */
+    emit_leave_runtime<Update::eReductions | Update::eStack | Update::eHeap |
+                       Update::eCodeIndex>();
+
+    emit_test_the_non_value(RET);
+    a.short_().je(check_trap);
+
+    comment("Do return and dispatch to it");
+    a.mov(getXRef(0), RET);
+#ifdef NATIVE_ERLANG_STACK
+    a.ret();
+#else
+    a.mov(RET, getCPRef());
+    a.mov(getCPRef(), imm(NIL));
+    a.jmp(RET);
+#endif
+
+    a.bind(check_trap);
+    a.cmp(x86::qword_ptr(c_p, offsetof(Process, freason)), imm(TRAP));
+    a.jne(error);
+    {
+        comment("yield");
+
+        comment("test trap to hibernate");
+        a.mov(ARG1, x86::qword_ptr(c_p, offsetof(Process, flags)));
+        a.mov(ARG2, ARG1);
+        a.and_(ARG2, imm(F_HIBERNATE_SCHED));
+        a.short_().je(trap);
+
+        comment("do hibernate trap");
+        a.and_(ARG1, imm(~F_HIBERNATE_SCHED));
+        a.mov(x86::qword_ptr(c_p, offsetof(Process, flags)), ARG1);
+        a.jmp(labels[do_schedule]);
+    }
+
+    a.bind(trap);
+    {
+        comment("do normal trap");
+
+        /* The BIF_TRAP macros all set up c_p->arity and c_p->current, so we
+         * can use a simplified context switch. */
+        a.mov(ARG3, x86::qword_ptr(c_p, offsetof(Process, i)));
+        a.jmp(labels[context_switch_simplified]);
+    }
+
+    a.bind(error);
+    {
+        a.mov(ARG2, E);
+
+        emit_enter_runtime<Update::eStack>();
+
+        a.mov(ARG1, c_p);
+        runtime_call<2>(erts_printable_return_address);
+
+        emit_leave_runtime<Update::eStack>();
+
+        a.mov(ARG2, RET);
+        a.mov(ARG4, x86::qword_ptr(c_p, offsetof(Process, current)));
+        a.jmp(labels[handle_error_shared]);
+    }
+}
+
+/* Used by call_bif, dispatch_bif, and export_trampoline.
+ *
+ * Note that we don't check reductions here as we may have jumped here through
+ * interpreted code (e.g. an ErtsNativeFunc or export entry) and it's very
+ * tricky to yield back. Reductions are checked in module code instead.
+ *
+ * ARG2 = BIF MFA
+ * ARG3 = I (rip), doesn't need to point past an MFA
+ * ARG4 = function to be called */
+void BeamGlobalAssembler::emit_call_bif_shared(void) {
+    /* "Heavy" BIFs need up-to-date values for `c_p->i`, `c_p->current`, and
+     * `c_p->arity`. */
+
+    a.mov(x86::qword_ptr(c_p, offsetof(Process, current)), ARG2);
+    /* `call_bif` wants arity in ARG5. */
+    a.mov(ARG5, x86::qword_ptr(ARG2, offsetof(ErtsCodeMFA, arity)));
+    a.mov(x86::qword_ptr(c_p, offsetof(Process, arity)), ARG5);
+    a.mov(x86::qword_ptr(c_p, offsetof(Process, i)), ARG3);
+
+    /* The corresponding leave can be found in the epilogue. */
+    emit_enter_runtime<Update::eReductions | Update::eStack | Update::eHeap>();
+
+#ifdef ERTS_MSACC_EXTENDED_STATES
+    {
+        Label skip_msacc = a.newLabel();
+
+        a.cmp(erts_msacc_cache, 0);
+        a.short_().je(skip_msacc);
+
+        a.mov(TMP_MEM1q, ARG3);
+        a.mov(TMP_MEM2q, ARG4);
+        a.mov(TMP_MEM3q, ARG5);
+
+        a.mov(ARG1, erts_msacc_cache);
+        a.mov(ARG2, x86::qword_ptr(ARG2, offsetof(ErtsCodeMFA, module)));
+        a.mov(ARG3, ARG4);
+        runtime_call<3>(erts_msacc_set_bif_state);
+
+        a.mov(ARG3, TMP_MEM1q);
+        a.mov(ARG4, TMP_MEM2q);
+        a.mov(ARG5, TMP_MEM3q);
+        a.bind(skip_msacc);
+    }
+#endif
+
+    a.mov(ARG1, c_p);
+    load_x_reg_array(ARG2);
+    /* ARG3 (I), ARG4 (func), and ARG5 (arity) have already been provided. */
+    runtime_call<5>(beam_jit_call_bif);
+
+#ifdef ERTS_MSACC_EXTENDED_STATES
+    a.mov(TMP_MEM1q, RET);
+    a.lea(ARG1, erts_msacc_cache);
+    runtime_call<1>(erts_msacc_update_cache);
+    a.mov(RET, TMP_MEM1q);
+#endif
+
+    emit_bif_nif_epilogue();
+}
+
+void BeamGlobalAssembler::emit_dispatch_bif(void) {
+    /* c_p->i points into the trampoline of a ErtsNativeFunc, right after the
+     * `info` structure. */
+    a.mov(ARG3, x86::qword_ptr(c_p, offsetof(Process, i)));
+
+    ERTS_CT_ASSERT(offsetof(ErtsNativeFunc, trampoline.trace) ==
+                   sizeof(ErtsCodeInfo));
+
+    ssize_t mfa_offset = offsetof(ErtsNativeFunc, trampoline.info.mfa) -
+                         offsetof(ErtsNativeFunc, trampoline.trace);
+    a.lea(ARG2, x86::qword_ptr(ARG3, mfa_offset));
+
+    ssize_t dfunc_offset = offsetof(ErtsNativeFunc, trampoline.dfunc) -
+                           offsetof(ErtsNativeFunc, trampoline.trace);
+    a.mov(ARG4, x86::qword_ptr(ARG3, dfunc_offset));
+
+    a.jmp(labels[call_bif_shared]);
+}
+
+void BeamModuleAssembler::emit_call_bif(const ArgVal &Func) {
+    int mfa_offset = -(int)sizeof(ErtsCodeMFA);
+
+    a.lea(ARG2, x86::qword_ptr(currLabel, mfa_offset));
+    a.lea(ARG3, x86::qword_ptr(currLabel));
+    mov_arg(ARG4, Func);
+
+    abs_jmp(ga->get_call_bif_shared());
+}
+
+void BeamModuleAssembler::emit_call_bif_mfa(const ArgVal &M,
+                                            const ArgVal &F,
+                                            const ArgVal &A) {
+    BeamInstr func;
+    Export *e;
+
+    e = erts_active_export_entry(M.getValue(), F.getValue(), A.getValue());
+    ASSERT(e != NULL && e->bif_number != -1);
+
+    func = (BeamInstr)bif_table[e->bif_number].f;
+    emit_call_bif(ArgVal(ArgVal::i, func));
+}
+
+void BeamGlobalAssembler::emit_call_nif_early() {
+    /* Fetch and align the return address so we can tell where we came from. It
+     * points just after the trampoline word so we'll need to skip that to find
+     * our ErtsCodeInfo. */
+    a.mov(ARG2, x86::qword_ptr(x86::rsp));
+    a.sub(ARG2, imm(sizeof(UWord) + sizeof(ErtsCodeInfo)));
+
+#ifdef DEBUG
+    {
+        Label next = a.newLabel();
+
+        /* Crash if our return address isn't word-aligned. */
+        a.test(ARG2, imm(sizeof(UWord) - 1));
+        a.short_().je(next);
+
+        a.ud2();
+
+        a.bind(next);
+    }
+#endif
+
+    emit_enter_runtime();
+
+    a.mov(ARG1, c_p);
+    runtime_call<2>(erts_call_nif_early);
+
+    emit_leave_runtime();
+
+    /* We won't return to the original code. */
+    emit_discard_cp();
+
+    /* Emulate `emit_call_nif`, loading the current (phony) instruction
+     * pointer into ARG2. */
+    a.mov(ARG3, RET);
+    a.jmp(labels[call_nif_shared]);
+}
+
+/* Used by call_nif, call_nif_early, and dispatch_nif.
+ *
+ * Note that we don't check reductions here as we may have jumped here through
+ * interpreted code (e.g. an ErtsNativeFunc or export entry) and it's very
+ * tricky to yield back. Reductions are checked in module code instead.
+ *
+ * ARG3 = current I, just past the end of an ErtsCodeInfo. */
+void BeamGlobalAssembler::emit_call_nif_shared(void) {
+    /* The corresponding leave can be found in the epilogue. */
+    emit_enter_runtime<Update::eReductions | Update::eStack | Update::eHeap>();
+
+#ifdef ERTS_MSACC_EXTENDED_STATES
+    {
+        Label skip_msacc = a.newLabel();
+
+        a.cmp(erts_msacc_cache, 0);
+        a.short_().je(skip_msacc);
+        a.mov(TMP_MEM1q, ARG3);
+        a.mov(ARG1, erts_msacc_cache);
+        a.mov(ARG2, imm(ERTS_MSACC_STATE_NIF));
+        a.mov(ARG3, imm(1));
+        runtime_call<3>(erts_msacc_set_state_m__);
+        a.mov(ARG3, TMP_MEM1q);
+        a.bind(skip_msacc);
+    }
+#endif
+
+    a.mov(ARG1, c_p);
+    a.mov(ARG2, ARG3);
+    load_x_reg_array(ARG3);
+    a.mov(ARG4, x86::qword_ptr(ARG2, 8 + BEAM_ASM_FUNC_PROLOGUE_SIZE));
+    a.mov(ARG5, x86::qword_ptr(ARG2, 16 + BEAM_ASM_FUNC_PROLOGUE_SIZE));
+    a.mov(ARG6, x86::qword_ptr(ARG2, 24 + BEAM_ASM_FUNC_PROLOGUE_SIZE));
+    runtime_call<5>(beam_jit_call_nif);
+
+    emit_bif_nif_epilogue();
+}
+
+void BeamGlobalAssembler::emit_dispatch_nif(void) {
+    /* c_p->i points into the trampoline of a ErtsNativeFunc, right after the
+     * `info` structure.
+     *
+     * ErtsNativeFunc already follows the NIF call layout, so we don't need to
+     * do anything beyond loading the address. */
+    ERTS_CT_ASSERT(offsetof(ErtsNativeFunc, trampoline.trace) ==
+                   sizeof(ErtsCodeInfo));
+    a.mov(ARG3, x86::qword_ptr(c_p, offsetof(Process, i)));
+    a.jmp(labels[call_nif_shared]);
+}
+
+/* WARNING: This stub is memcpy'd, so all code herein must be explicitly
+ * position-independent. */
+void BeamModuleAssembler::emit_call_nif(const ArgVal &Func,
+                                        const ArgVal &NifMod,
+                                        const ArgVal &DirtyFunc) {
+    Label dispatch = a.newLabel();
+    uint64_t val;
+
+    /* The start of this function has to mimic the layout of ErtsNativeFunc. */
+    a.jmp(dispatch); /* call_op */
+
+    a.align(kAlignCode, 8);
+    /* ErtsNativeFunc.dfunc */
+    val = Func.getValue();
+    a.embed(&val, sizeof(val));
+    /* ErtsNativeFunc.m */
+    val = NifMod.getValue();
+    a.embed(&val, sizeof(val));
+    /* ErtsNativeFunc.func */
+    val = DirtyFunc.getValue();
+    a.embed(&val, sizeof(val));
+
+    /* The real code starts here */
+    a.bind(dispatch);
+    {
+        Label yield = a.newLabel();
+
+        a.lea(ARG3, x86::qword_ptr(currLabel));
+
+        a.dec(FCALLS);
+        a.jl(yield);
+
+        pic_jmp(ga->get_call_nif_shared());
+
+        a.bind(yield);
+        pic_jmp(ga->get_context_switch());
+    }
+}
+
+/* ARG2 = entry address. */
+void BeamGlobalAssembler::emit_i_load_nif_shared() {
+    static ErtsCodeMFA bif_mfa = {am_erlang, am_load_nif, 2};
+
+    Label yield = a.newLabel(), error = a.newLabel();
+
+    a.mov(TMP_MEM1q, ARG2);
+
+    emit_enter_runtime<Update::eStack | Update::eHeap>();
+
+    a.mov(ARG1, c_p);
+    /* ARG2 has already been set by caller */
+    load_x_reg_array(ARG3);
+    runtime_call<3>(beam_jit_load_nif);
+
+    emit_leave_runtime<Update::eStack | Update::eHeap>();
+
+    a.cmp(RET, RET_NIF_yield);
+    a.short_().je(yield);
+    a.cmp(RET, RET_NIF_success);
+    a.short_().jne(error);
+
+    a.ret();
+
+    a.bind(error);
+    {
+        a.mov(ARG4, imm(&bif_mfa));
+        emit_handle_error();
+    }
+
+    a.bind(yield);
+    {
+        a.mov(ARG3, TMP_MEM1q);
+        a.jmp(labels[context_switch_simplified]);
+    }
+}
+
+#ifdef NATIVE_ERLANG_STACK
+
+void BeamModuleAssembler::emit_i_load_nif() {
+    Label entry = a.newLabel(), next = a.newLabel();
+
+    /* i_load_nif is a rewrite of a call_ext instruction, so we'll body-call
+     * ourselves to ensure the stack is consistent with that. This greatly
+     * simplifies yielding and error handling. */
+    fragment_call(entry);
+    a.short_().jmp(next);
+
+    align_erlang_cp();
+    a.bind(entry);
+    {
+        a.lea(ARG2, x86::qword_ptr(entry));
+        abs_jmp(ga->get_i_load_nif_shared());
+    }
+
+    a.bind(next);
+}
+
+#else
+
+void BeamModuleAssembler::emit_i_load_nif() {
+    static ErtsCodeMFA mfa = {am_erlang, am_load_nif, 2};
+
+    Label entry = a.newLabel(), next = a.newLabel(), schedule = a.newLabel();
+
+    align_erlang_cp();
+    a.bind(entry);
+
+    emit_enter_runtime<Update::eStack | Update::eHeap>();
+
+    a.mov(ARG1, c_p);
+    a.lea(ARG2, x86::qword_ptr(currLabel));
+    load_x_reg_array(ARG3);
+    runtime_call<3>(beam_jit_load_nif);
+
+    emit_leave_runtime<Update::eStack | Update::eHeap>();
+
+    a.cmp(RET, imm(RET_NIF_yield));
+    a.je(schedule);
+    a.cmp(RET, imm(RET_NIF_success));
+    a.je(next);
+
+    emit_handle_error(currLabel, &mfa);
+
+    a.bind(schedule);
+    {
+        a.lea(ARG3, x86::qword_ptr(entry));
+        abs_jmp(ga->get_context_switch_simplified());
+    }
+
+    a.bind(next);
+}
+
+#endif