author     John Högberg <john@erlang.org>    2022-02-07 23:05:19 +0100
committer  John Högberg <john@erlang.org>    2022-02-17 12:50:43 +0100
commit     48643c2ee9c9d74b2e642dbd6d6802fda502e370 (patch)
tree       3da6eb2b02d1a18c98cab9a51a3c4e0944308413 /erts/emulator
parent     df9d0cb725f83b498bd620555f62962b9c0f1df5 (diff)
download   erlang-48643c2ee9c9d74b2e642dbd6d6802fda502e370.tar.gz
jit: Optimize map lookups
Diffstat (limited to 'erts/emulator')
-rw-r--r--  erts/emulator/beam/jit/arm/beam_asm.hpp            6
-rw-r--r--  erts/emulator/beam/jit/arm/instr_common.cpp        2
-rw-r--r--  erts/emulator/beam/jit/arm/instr_guard_bifs.cpp    4
-rw-r--r--  erts/emulator/beam/jit/arm/instr_map.cpp         400
-rw-r--r--  erts/emulator/beam/jit/x86/beam_asm.hpp           11
-rw-r--r--  erts/emulator/beam/jit/x86/instr_map.cpp         403
-rw-r--r--  erts/emulator/test/map_SUITE.erl                  15
7 files changed, 787 insertions, 54 deletions
diff --git a/erts/emulator/beam/jit/arm/beam_asm.hpp b/erts/emulator/beam/jit/arm/beam_asm.hpp
index a29f6953a9..4a1125d998 100644
--- a/erts/emulator/beam/jit/arm/beam_asm.hpp
+++ b/erts/emulator/beam/jit/arm/beam_asm.hpp
@@ -879,6 +879,8 @@ class BeamGlobalAssembler : public BeamAssembler {
_(i_breakpoint_trampoline_shared) \
_(i_bsr_body_shared) \
_(i_bsl_body_shared) \
+ _(i_get_map_element_shared) \
+ _(i_get_map_element_hash_shared) \
_(i_func_info_shared) \
_(i_length_guard_shared) \
_(i_length_body_shared) \
@@ -888,6 +890,7 @@ class BeamGlobalAssembler : public BeamAssembler {
_(i_bxor_body_shared) \
_(int_div_rem_body_shared) \
_(int_div_rem_guard_shared) \
+ _(internal_hash_helper) \
_(minus_body_shared) \
_(new_map_shared) \
_(update_map_assoc_shared) \
@@ -938,6 +941,9 @@ class BeamGlobalAssembler : public BeamAssembler {
void emit_bif_element_helper(Label fail);
void emit_bif_tuple_size_helper(Label fail);
+ void emit_flatmap_get_element();
+ void emit_hashmap_get_element();
+
public:
BeamGlobalAssembler(JitAllocator *allocator);
diff --git a/erts/emulator/beam/jit/arm/instr_common.cpp b/erts/emulator/beam/jit/arm/instr_common.cpp
index 1b80e05e9e..c759ebcf1e 100644
--- a/erts/emulator/beam/jit/arm/instr_common.cpp
+++ b/erts/emulator/beam/jit/arm/instr_common.cpp
@@ -641,7 +641,7 @@ void BeamModuleAssembler::emit_is_nonempty_list(const ArgVal &Fail,
const int bitNumber = 1;
ERTS_CT_ASSERT(_TAG_PRIMARY_MASK - TAG_PRIMARY_LIST == (1 << bitNumber));
- a.tbnz(list_ptr.reg, bitNumber, resolve_beam_label(Fail, disp32K));
+ a.tbnz(list_ptr.reg, imm(bitNumber), resolve_beam_label(Fail, disp32K));
}
void BeamModuleAssembler::emit_jump(const ArgVal &Fail) {
diff --git a/erts/emulator/beam/jit/arm/instr_guard_bifs.cpp b/erts/emulator/beam/jit/arm/instr_guard_bifs.cpp
index 30434001a2..9987e60aa6 100644
--- a/erts/emulator/beam/jit/arm/instr_guard_bifs.cpp
+++ b/erts/emulator/beam/jit/arm/instr_guard_bifs.cpp
@@ -570,7 +570,7 @@ void BeamModuleAssembler::emit_bif_hd(const ArgVal &Src, const ArgVal &Hd) {
ERTS_CT_ASSERT(_TAG_PRIMARY_MASK - TAG_PRIMARY_LIST == (1 << bitNumber));
- a.tbz(src.reg, bitNumber, good_cons);
+ a.tbz(src.reg, imm(bitNumber), good_cons);
mov_var(XREG0, src);
fragment_call(ga->get_handle_hd_error());
@@ -736,7 +736,7 @@ void BeamModuleAssembler::emit_bif_tl(const ArgVal &Src, const ArgVal &Tl) {
ERTS_CT_ASSERT(_TAG_PRIMARY_MASK - TAG_PRIMARY_LIST == (1 << bitNumber));
- a.tbz(src.reg, bitNumber, good_cons);
+ a.tbz(src.reg, imm(bitNumber), good_cons);
mov_var(XREG0, src);
fragment_call(ga->get_handle_tl_error());
diff --git a/erts/emulator/beam/jit/arm/instr_map.cpp b/erts/emulator/beam/jit/arm/instr_map.cpp
index 0203a32362..2272366188 100644
--- a/erts/emulator/beam/jit/arm/instr_map.cpp
+++ b/erts/emulator/beam/jit/arm/instr_map.cpp
@@ -28,6 +28,205 @@ extern "C"
#include "beam_common.h"
}
+static const Uint32 INTERNAL_HASH_SALT = 3432918353;
+static const Uint32 HCONST_22 = 0x98C475E6UL;
+static const Uint32 HCONST = 0x9E3779B9;
+
+/* ARG3 = incoming hash
+ * ARG6 = lower 32
+ * ARG7 = upper 32
+ * ARG8 = type constant
+ *
+ * Helper function for calculating the internal hash of keys before looking
+ * them up in a map.
+ *
+ * This is essentially just a manual expansion of the `UINT32_HASH_2` macro.
+ * Whenever the internal hash algorithm is updated, this and all of its users
+ * must follow suit.
+ *
+ * Result is returned in ARG3. All arguments are clobbered. */
+void BeamGlobalAssembler::emit_internal_hash_helper() {
+ a64::Gp hash = ARG3.w(), lower = ARG6.w(), upper = ARG7.w(),
+ constant = ARG8.w();
+
+ a.add(lower, lower, constant);
+ a.add(upper, upper, constant);
+
+ using rounds =
+ std::initializer_list<std::tuple<a64::Gp, a64::Gp, a64::Gp, int>>;
+ for (const auto &round : rounds{{lower, upper, hash, 13},
+ {upper, hash, lower, -8},
+ {hash, lower, upper, 13},
+ {lower, upper, hash, 12},
+ {upper, hash, lower, -16},
+ {hash, lower, upper, 5},
+ {lower, upper, hash, 3},
+ {upper, hash, lower, -10},
+ {hash, lower, upper, 15}}) {
+ const auto &[r_a, r_b, r_c, shift] = round;
+
+ a.sub(r_a, r_a, r_b);
+ a.sub(r_a, r_a, r_c);
+
+ if (shift > 0) {
+ a.eor(r_a, r_a, r_c, arm::lsr(shift));
+ } else {
+ a.eor(r_a, r_a, r_c, arm::lsl(-shift));
+ }
+ }
+
+ a.ret(a64::x30);
+}
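
The nine rounds above are the classic Bob Jenkins 96-bit mix, fed with the two
32-bit halves exactly as `UINT32_HASH_2` does. As a reading aid, here is a
minimal C sketch of what the fragment computes (the function name is
illustrative, not from the tree; a negative shift in the round table maps to a
left shift below):

    static Uint32 hash_mix(Uint32 hash, Uint32 lower, Uint32 upper,
                           Uint32 constant) {
        Uint32 a = lower + constant, b = upper + constant, c = hash;
        a -= b; a -= c; a ^= (c >> 13);
        b -= c; b -= a; b ^= (a << 8);
        c -= a; c -= b; c ^= (b >> 13);
        a -= b; a -= c; a ^= (c >> 12);
        b -= c; b -= a; b ^= (a << 16);
        c -= a; c -= b; c ^= (b >> 5);
        a -= b; a -= c; a ^= (c >> 3);
        b -= c; b -= a; b ^= (a << 10);
        c -= a; c -= b; c ^= (b >> 15);
        return c; /* handed back in ARG3 */
    }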
+
+/* ARG1 = untagged hash map root
+ * ARG2 = key
+ * ARG3 = key hash
+ * ARG4 = node header
+ *
+ * Result is returned in RET. ZF is set on success. */
+void BeamGlobalAssembler::emit_hashmap_get_element() {
+ Label node_loop = a.newLabel();
+
+ arm::Gp node = ARG1, key = ARG2, key_hash = ARG3, header_val = ARG4,
+ depth = TMP4, index = TMP5, one = TMP6;
+
+ const int header_shift =
+ (_HEADER_ARITY_OFFS + MAP_HEADER_TAG_SZ + MAP_HEADER_ARITY_SZ);
+
+ /* Skip the root header, which is two words long (child headers are one
+ * word). */
+ a.add(node, node, imm(sizeof(Eterm[2])));
+ mov_imm(depth, 0);
+
+ a.bind(node_loop);
+ {
+ Label fail = a.newLabel(), leaf_node = a.newLabel(),
+ skip_index_adjustment = a.newLabel(), update_hash = a.newLabel();
+
+ /* Find out which child we should follow, and shift the hash for the
+ * next round. */
+ a.and_(index, key_hash, imm(0xF));
+ a.lsr(key_hash, key_hash, imm(4));
+ a.add(depth, depth, imm(1));
+
+ /* The entry offset is always equal to the index on fully populated
+ * nodes, so we skip the adjustment in that case. */
+ ERTS_CT_ASSERT(header_shift == 16);
+ a.asr(header_val.w(), header_val.w(), imm(header_shift));
+ a.cmn(header_val.w(), imm(1));
+ a.b_eq(skip_index_adjustment);
+ {
+ /* If our bit isn't set on this node, the key can't be found.
+ *
+ * Note that we jump directly to the return sequence as ZF is clear
+ * at this point. */
+ a.lsr(TMP1, header_val, index);
+ a.tbz(TMP1, imm(0), fail);
+
+ /* The actual offset of our entry is the number of bits set (in
+ * essence "entries present") before our index in the bitmap. Clear
+ * the upper bits and count the remainder. */
+ a.lsl(TMP2, TMP1, index);
+ a.eor(TMP1, TMP2, header_val);
+ a.fmov(a64::d0, TMP1);
+ a.cnt(a64::v0.b8(), a64::v0.b8());
+ a.addv(a64::b0, a64::v0.b8());
+ a.fmov(index, a64::d0);
+ }
+ a.bind(skip_index_adjustment);
+
+ a.ldr(TMP1, arm::Mem(node, index, arm::lsl(3)));
+ emit_untag_ptr(node, TMP1);
+
+ /* Have we found our leaf? */
+ emit_is_boxed(leaf_node, TMP1);
+
+ /* Nope, we have to search another node. Read and skip past the header
+ * word. */
+ a.ldr(header_val, arm::Mem(node).post(sizeof(Eterm)));
+
+ /* After 8 nodes we've run out of the 32 bits we started with, so we
+ * need to update the hash to keep going. */
+ a.tst(depth, imm(0x7));
+ a.b_eq(update_hash);
+ a.b(node_loop);
+
+ a.bind(leaf_node);
+ {
+ /* We've arrived at a leaf; set ZF according to whether its key
+ * matches ours and speculatively place the element in ARG1. */
+ a.ldp(TMP1, ARG1, arm::Mem(node));
+ a.cmp(TMP1, key);
+
+ /* See comment at the jump. */
+ a.bind(fail);
+ a.ret(a64::x30);
+ }
+
+ /* After 8 nodes we've run out of the 32 bits we started with, so we
+ * must calculate a new hash to continue.
+ *
+ * This is a manual expansion of `make_map_hash` from utils.c, and all
+ * changes to that function must be mirrored here. */
+ a.bind(update_hash);
+ {
+ emit_enter_runtime_frame();
+
+ /* NOTE: ARG3 (key_hash) is always 0 at this point. */
+ a.lsr(ARG6, depth, imm(3));
+ mov_imm(ARG7, 1);
+ mov_imm(ARG8, HCONST_22);
+ a.bl(labels[internal_hash_helper]);
+
+ mov_imm(TMP1, INTERNAL_HASH_SALT);
+ a.eor(ARG3, ARG3, TMP1);
+
+ a.mov(ARG6.w(), key.w());
+ a.lsr(ARG7, key, imm(32));
+ mov_imm(ARG8, HCONST);
+ a.bl(labels[internal_hash_helper]);
+
+ emit_leave_runtime_frame();
+
+ a.b(node_loop);
+ }
+ }
+}
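
In rough C terms the walk above behaves like the sketch below; `popcount` and
`rehash` stand in for emitted instruction sequences, and leaves are
`[Key|Value]` cons cells, mirroring the C implementation of hashmaps:

    /* Sketch: inputs are the untagged root, the key, its 32-bit hash, and
     * the root node's header word. */
    Eterm *node = root + 2;                /* root header is two words */
    Uint32 depth = 0;
    for (;;) {
        Uint32 ix = hash & 0xF;            /* child index at this level */
        hash >>= 4;
        depth++;
        Uint32 bitmap = header >> 16;      /* header_shift == 16 */
        if (bitmap != 0xFFFF) {            /* not fully populated */
            if (((bitmap >> ix) & 1) == 0)
                return THE_NON_VALUE;      /* our bit is clear: absent */
            ix = popcount(bitmap & ((1u << ix) - 1));
        }
        Eterm child = node[ix];
        if (is_list(child)) {              /* leaf */
            Eterm *cell = list_val(child);
            return cell[0] == key ? cell[1] : THE_NON_VALUE;
        }
        node = boxed_val(child);
        header = *node++;                  /* read and skip node header */
        if ((depth & 7) == 0)              /* 32 hash bits consumed */
            hash = rehash(key, depth);
    }

A single `cmp` against the leaf key suffices because these fragments are only
entered for immediate keys, which are equal exactly when their term words are
equal.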
+
+/* ARG1 = untagged flat map
+ * ARG2 = key
+ * ARG5 = size
+ *
+ * Result is returned in ARG1. ZF is set on success. */
+void BeamGlobalAssembler::emit_flatmap_get_element() {
+ Label fail = a.newLabel(), loop = a.newLabel();
+
+ /* Bump size by 1 to slide past the `keys` field in the map, and the header
+ * word in the key array. Also set flags to ensure ZF == 0 when entering
+ * the loop. */
+ a.adds(ARG5, ARG5, imm(1));
+
+ /* Adjust our map pointer to the `keys` field before loading it. This
+ * saves us from having to bump it to point at the values later on. */
+ a.ldr(TMP1, arm::Mem(ARG1).pre(offsetof(flatmap_t, keys)));
+ emit_untag_ptr(TMP1, TMP1);
+
+ a.bind(loop);
+ {
+ a.sub(ARG5, ARG5, imm(1));
+ a.cbz(ARG5, fail);
+
+ a.ldr(TMP4, arm::Mem(TMP1, ARG5, arm::lsl(3)));
+ a.cmp(ARG2, TMP4);
+ a.b_ne(loop);
+ }
+
+ a.ldr(ARG1, arm::Mem(ARG1, ARG5, arm::lsl(3)));
+
+ a.bind(fail);
+ a.ret(a64::x30);
+}
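
This is a plain backwards scan over the key tuple. Roughly, in C (a sketch;
`flatmap_get_values` approximates the layout macros in erl_map.h):

    Eterm flatmap_get(flatmap_t *mp, Eterm key) {
        Eterm *keys = tuple_val(mp->keys); /* keys[0] is the tuple header */
        for (Uint i = mp->size; i >= 1; i--) {
            if (keys[i] == key)            /* immediates: word equality */
                return flatmap_get_values(mp)[i - 1];
        }
        return THE_NON_VALUE;
    }

The `adds` and the flag juggling in the emitted version exist so that callers
can branch on ZF instead of comparing the result against THE_NON_VALUE.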
+
void BeamGlobalAssembler::emit_new_map_shared() {
emit_enter_runtime_frame();
emit_enter_runtime<Update::eStack | Update::eHeap | Update::eXRegs |
@@ -91,6 +290,64 @@ void BeamModuleAssembler::emit_i_new_small_map_lit(const ArgVal &Dst,
mov_arg(Dst, ARG1);
}
+/* ARG1 = map
+ * ARG2 = key
+ *
+ * Result is returned in RET. ZF is set on success. */
+void BeamGlobalAssembler::emit_i_get_map_element_shared() {
+ Label generic = a.newLabel(), hashmap = a.newLabel();
+
+ a.and_(TMP1, ARG2, imm(_TAG_PRIMARY_MASK));
+ a.cmp(TMP1, imm(TAG_PRIMARY_IMMED1));
+ a.b_ne(generic);
+
+ emit_untag_ptr(ARG1, ARG1);
+
+ /* hashmap_get_element expects node header in ARG4, flatmap_get_element
+ * expects size in ARG5 */
+ ERTS_CT_ASSERT_FIELD_PAIR(flatmap_t, thing_word, size);
+ a.ldp(ARG4, ARG5, arm::Mem(ARG1));
+ a.and_(TMP1, ARG4, imm(_HEADER_MAP_SUBTAG_MASK));
+ a.cmp(TMP1, imm(HAMT_SUBTAG_HEAD_FLATMAP));
+ a.b_ne(hashmap);
+
+ emit_flatmap_get_element();
+
+ a.bind(generic);
+ {
+ emit_enter_runtime_frame();
+
+ emit_enter_runtime();
+ runtime_call<2>(get_map_element);
+ emit_leave_runtime();
+
+ a.cmp(ARG1, imm(THE_NON_VALUE));
+
+ /* Invert ZF, we want it to be set when RET is a value. */
+ a.cset(TMP1, arm::CondCode::kEQ);
+ a.tst(TMP1, TMP1);
+
+ emit_leave_runtime_frame();
+ a.ret(a64::x30);
+ }
+
+ a.bind(hashmap);
+ {
+ emit_enter_runtime_frame();
+
+ /* Calculate the internal hash of ARG2 before diving into the HAMT. */
+ mov_imm(ARG3, INTERNAL_HASH_SALT);
+ a.mov(ARG6.w(), ARG2.w());
+ a.lsr(ARG7, ARG2, imm(32));
+ mov_imm(ARG8, HCONST);
+ a.bl(labels[internal_hash_helper]);
+
+ emit_leave_runtime_frame();
+
+ emit_hashmap_get_element();
+ }
+}
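
Schematically, the fragment is a three-way dispatch (C-style sketch, helper
names illustrative):

    if (!is_immed(key))
        return get_map_element(map, key);   /* generic runtime call */
    if (is_flatmap(map))
        return flatmap_get(map, key);       /* inline linear scan */
    return hashmap_get(map, key, internal_hash(key)); /* inline HAMT walk */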
+
void BeamModuleAssembler::emit_i_get_map_element(const ArgVal &Fail,
const ArgVal &Src,
const ArgVal &Key,
@@ -98,13 +355,16 @@ void BeamModuleAssembler::emit_i_get_map_element(const ArgVal &Fail,
mov_arg(ARG1, Src);
mov_arg(ARG2, Key);
- emit_enter_runtime();
-
- runtime_call<2>(get_map_element);
-
- emit_leave_runtime();
+ if (masked_types(Key, BEAM_TYPE_MASK_IMMEDIATE) != BEAM_TYPE_NONE) {
+ fragment_call(ga->get_i_get_map_element_shared());
+ a.b_ne(resolve_beam_label(Fail, disp1MB));
+ } else {
+ emit_enter_runtime();
+ runtime_call<2>(get_map_element);
+ emit_leave_runtime();
- emit_branch_if_not_value(ARG1, resolve_beam_label(Fail, dispUnknown));
+ emit_branch_if_not_value(ARG1, resolve_beam_label(Fail, dispUnknown));
+ }
/*
* Don't store the result if the destination is the scratch X register.
@@ -119,22 +379,119 @@ void BeamModuleAssembler::emit_i_get_map_elements(const ArgVal &Fail,
const ArgVal &Src,
const ArgVal &Size,
const Span<ArgVal> &args) {
+ Label generic = a.newLabel(), next = a.newLabel();
+
+ /* We're not likely to gain much from inlining huge extractions, and the
+ * resulting code is quite large, so we'll cut it off after a handful of
+ * elements.
+ *
+ * Note that the arguments come in flattened triplets of
+ * `{Key, Dst, KeyHash}` */
+ bool can_inline = args.size() < (8 * 3);
+
ASSERT(Size.getValue() == args.size());
+ ASSERT((Size.getValue() % 3) == 0);
- embed_vararg_rodata(args, ARG5);
+ for (size_t i = 0; i < args.size(); i += 3) {
+ can_inline &= args[i].isImmed();
+ }
mov_arg(ARG1, Src);
- a.mov(ARG3, E);
- emit_enter_runtime<Update::eXRegs>();
+ if (can_inline) {
+ comment("simplified multi-element lookup");
- mov_imm(ARG4, args.size() / 3);
- load_x_reg_array(ARG2);
- runtime_call<5>(beam_jit_get_map_elements);
+ emit_untag_ptr(TMP1, ARG1);
+
+ ERTS_CT_ASSERT_FIELD_PAIR(flatmap_t, thing_word, size);
+ a.ldp(TMP2, TMP3, arm::Mem(TMP1, offsetof(flatmap_t, thing_word)));
+ a.and_(TMP2, TMP2, imm(_HEADER_MAP_SUBTAG_MASK));
+ a.cmp(TMP2, imm(HAMT_SUBTAG_HEAD_FLATMAP));
+ a.b_ne(generic);
+
+ check_pending_stubs();
+
+ /* Bump size by 1 to slide past the `keys` field in the map, and the
+ * header word in the key array. */
+ a.add(TMP3, TMP3, imm(1));
+
+ /* Adjust our map pointer to the `keys` field before loading it. This
+ * saves us from having to bump it to point at the values later on. */
+ ERTS_CT_ASSERT(sizeof(flatmap_t) ==
+ offsetof(flatmap_t, keys) + sizeof(Eterm));
+ a.ldr(TMP2, arm::Mem(TMP1).pre(offsetof(flatmap_t, keys)));
+ emit_untag_ptr(TMP2, TMP2);
+
+ for (ssize_t i = args.size() - 3; i >= 0; i -= 3) {
+ Label loop = a.newLabel();
+
+ a.bind(loop);
+ {
+ a.sub(TMP3, TMP3, imm(1));
+ a.cbz(TMP3, resolve_beam_label(Fail, disp1MB));
+
+ a.ldr(TMP4, arm::Mem(TMP2, TMP3, arm::lsl(3)));
+
+ const auto &Comparand = args[i];
+ cmp_arg(TMP4, Comparand);
+ a.b_ne(loop);
+ }
- emit_leave_runtime<Update::eXRegs>();
+ /* Don't store the result if the destination is the scratch X
+ * register. (This instruction was originally a has_map_fields
+ * instruction.) */
+ const auto &Dst = args[i + 1];
+ if (!(Dst.getType() == ArgVal::XReg &&
+ Dst.getValue() == SCRATCH_X_REG)) {
+ mov_arg(Dst, arm::Mem(TMP1, TMP3, arm::lsl(3)));
+ }
+ }
- a.cbz(ARG1, resolve_beam_label(Fail, disp1MB));
+ a.b(next);
+ }
+
+ a.bind(generic);
+ {
+ embed_vararg_rodata(args, ARG5);
+
+ a.mov(ARG3, E);
+
+ emit_enter_runtime<Update::eXRegs>();
+
+ mov_imm(ARG4, args.size() / 3);
+ load_x_reg_array(ARG2);
+ runtime_call<5>(beam_jit_get_map_elements);
+
+ emit_leave_runtime<Update::eXRegs>();
+
+ a.cbz(ARG1, resolve_beam_label(Fail, disp1MB));
+ }
+
+ a.bind(next);
+}
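
The interesting part is the single index that only ever moves downward: each
key resumes the scan where the previous one left off, so the whole extraction
costs one pass over the key array rather than one pass per key. This relies on
the literal keys in `args` being sorted consistently with the flatmap's sorted
key array. A sketch in C:

    /* Sketch: extract nkeys sorted literal keys from a flatmap in one pass. */
    Sint j = mp->size + 1;                  /* +1 slides past the headers */
    for (Sint i = nkeys - 1; i >= 0; i--) { /* last (largest) key first */
        do {
            if (--j == 0) goto fail;        /* ran out of keys: no match */
        } while (keys[j] != key[i]);        /* immediates: word equality */
        dst[i] = values[j];                 /* omitted for the scratch reg */
    }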
+
+/* ARG1 = map
+ * ARG2 = key
+ * ARG3 = key hash
+ *
+ * Result is returned in RET. ZF is set on success. */
+void BeamGlobalAssembler::emit_i_get_map_element_hash_shared() {
+ Label hashmap = a.newLabel();
+
+ emit_untag_ptr(ARG1, ARG1);
+
+ /* hashmap_get_element expects node header in ARG4, flatmap_get_element
+ * expects size in ARG5 */
+ ERTS_CT_ASSERT_FIELD_PAIR(flatmap_t, thing_word, size);
+ a.ldp(ARG4, ARG5, arm::Mem(ARG1));
+ a.and_(TMP1, ARG4, imm(_HEADER_MAP_SUBTAG_MASK));
+ a.cmp(TMP1, imm(HAMT_SUBTAG_HEAD_FLATMAP));
+ a.b_ne(hashmap);
+
+ emit_flatmap_get_element();
+
+ a.bind(hashmap);
+ emit_hashmap_get_element();
}
void BeamModuleAssembler::emit_i_get_map_element_hash(const ArgVal &Fail,
@@ -146,13 +503,16 @@ void BeamModuleAssembler::emit_i_get_map_element_hash(const ArgVal &Fail,
mov_arg(ARG2, Key);
mov_arg(ARG3, Hx);
- emit_enter_runtime();
-
- runtime_call<3>(get_map_element_hash);
-
- emit_leave_runtime();
+ if (Key.isImmed()) {
+ fragment_call(ga->get_i_get_map_element_hash_shared());
+ a.b_ne(resolve_beam_label(Fail, disp1MB));
+ } else {
+ emit_enter_runtime();
+ runtime_call<3>(get_map_element_hash);
+ emit_leave_runtime();
- emit_branch_if_not_value(ARG1, resolve_beam_label(Fail, dispUnknown));
+ emit_branch_if_not_value(ARG1, resolve_beam_label(Fail, dispUnknown));
+ }
/*
* Don't store the result if the destination is the scratch X register.
diff --git a/erts/emulator/beam/jit/x86/beam_asm.hpp b/erts/emulator/beam/jit/x86/beam_asm.hpp
index 59d7834765..5bbb7fa95d 100644
--- a/erts/emulator/beam/jit/x86/beam_asm.hpp
+++ b/erts/emulator/beam/jit/x86/beam_asm.hpp
@@ -957,6 +957,8 @@ class BeamGlobalAssembler : public BeamAssembler {
_(i_bxor_body_shared) \
_(i_bxor_guard_shared) \
_(i_func_info_shared) \
+ _(i_get_map_element_shared) \
+ _(i_get_map_element_hash_shared) \
_(i_load_nif_shared) \
_(i_length_guard_shared) \
_(i_length_body_shared) \
@@ -966,6 +968,7 @@ class BeamGlobalAssembler : public BeamAssembler {
_(increment_body_shared) \
_(int_div_rem_body_shared) \
_(int_div_rem_guard_shared) \
+ _(internal_hash_helper) \
_(minus_body_shared) \
_(minus_guard_shared) \
_(new_map_shared) \
@@ -1015,6 +1018,9 @@ class BeamGlobalAssembler : public BeamAssembler {
x86::Mem emit_i_length_common(Label fail, int state_size);
+ void emit_flatmap_get_element();
+ void emit_hashmap_get_element();
+
public:
BeamGlobalAssembler(JitAllocator *allocator);
@@ -1520,6 +1526,11 @@ protected:
a.mov(getArgRef(to), from);
}
+ void mov_arg(const ArgVal &to, x86::Mem from, const x86::Gp &spill) {
+ a.mov(spill, from);
+ a.mov(getArgRef(to), spill);
+ }
+
void mov_arg(const ArgVal &to, BeamInstr from, const x86::Gp &spill) {
if (Support::isInt32((Sint)from)) {
a.mov(getArgRef(to), imm(from));
diff --git a/erts/emulator/beam/jit/x86/instr_map.cpp b/erts/emulator/beam/jit/x86/instr_map.cpp
index c94a746523..16770c0a9d 100644
--- a/erts/emulator/beam/jit/x86/instr_map.cpp
+++ b/erts/emulator/beam/jit/x86/instr_map.cpp
@@ -28,6 +28,200 @@ extern "C"
#include "beam_common.h"
}
+static const Uint32 INTERNAL_HASH_SALT = 3432918353;
+static const Uint32 HCONST_22 = 0x98C475E6UL;
+static const Uint32 HCONST = 0x9E3779B9;
+
+/* ARG3 = incoming hash
+ * ARG4 = lower 32
+ * ARG5 = upper 32
+ * ARG6 = type constant
+ *
+ * Helper function for calculating the internal hash of keys before looking
+ * them up in a map.
+ *
+ * This is essentially just a manual expansion of the `UINT32_HASH_2` macro.
+ * Whenever the internal hash algorithm is updated, this and all of its users
+ * must follow suit.
+ *
+ * Result is returned in ARG3. */
+void BeamGlobalAssembler::emit_internal_hash_helper() {
+ x86::Gp hash = ARG3d, lower = ARG4d, upper = ARG5d;
+
+ a.add(lower, ARG6d);
+ a.add(upper, ARG6d);
+
+ using rounds =
+ std::initializer_list<std::tuple<x86::Gp, x86::Gp, x86::Gp, int>>;
+ for (const auto &round : rounds{{lower, upper, hash, 13},
+ {upper, hash, lower, -8},
+ {hash, lower, upper, 13},
+ {lower, upper, hash, 12},
+ {upper, hash, lower, -16},
+ {hash, lower, upper, 5},
+ {lower, upper, hash, 3},
+ {upper, hash, lower, -10},
+ {hash, lower, upper, 15}}) {
+ const auto &[r_a, r_b, r_c, shift] = round;
+
+ a.sub(r_a, r_b);
+ a.sub(r_a, r_c);
+
+ /* We have no use for the type constant anymore, so we reuse its register for
+ * the `a ^= r_c << shift` expression. */
+ a.mov(ARG6d, r_c);
+
+ if (shift > 0) {
+ a.shr(ARG6d, imm(shift));
+ } else {
+ a.shl(ARG6d, imm(-shift));
+ }
+
+ a.xor_(r_a, ARG6d);
+ }
+
+ a.ret();
+}
+
+/* ARG1 = hash map root, ARG2 = key, ARG3 = key hash, RETd = node header
+ *
+ * Result is returned in RET. ZF is set on success. */
+void BeamGlobalAssembler::emit_hashmap_get_element() {
+ Label node_loop = a.newLabel();
+
+ x86::Gp node = ARG1, key = ARG2, key_hash = ARG3d, header_val = RETd,
+ index = ARG4d, depth = ARG5d;
+
+ const int header_shift =
+ (_HEADER_ARITY_OFFS + MAP_HEADER_TAG_SZ + MAP_HEADER_ARITY_SZ);
+
+ /* Skip the root header. This is not required for child nodes. */
+ a.add(node, imm(sizeof(Eterm)));
+ mov_imm(depth, 0);
+
+ a.bind(node_loop);
+ {
+ Label fail = a.newLabel(), leaf_node = a.newLabel(),
+ skip_index_adjustment = a.newLabel(), update_hash = a.newLabel();
+
+ /* Find out which child we should follow, and shift the hash for the
+ * next round. */
+ a.mov(index, key_hash);
+ a.and_(index, imm(0xF));
+ a.shr(key_hash, imm(4));
+ a.inc(depth);
+
+ /* The entry offset is always equal to the index on fully populated
+ * nodes, so we skip the adjustment in that case. */
+ ERTS_CT_ASSERT(header_shift == 16);
+ a.sar(header_val, imm(header_shift));
+ a.cmp(header_val, imm(-1));
+ a.short_().je(skip_index_adjustment);
+ {
+ /* If our bit isn't set on this node, the key can't be found.
+ *
+ * Note that we jump directly to a `RET` instruction, as `BT` only
+ * affects CF, and ZF ("not found") is clear at this point. */
+ a.bt(header_val, index);
+ a.short_().jnc(fail);
+
+ /* The actual offset of our entry is the number of bits set (in
+ * essence "entries present") before our index in the bitmap. */
+ a.bzhi(header_val, header_val, index);
+ a.popcnt(index, header_val);
+ }
+ a.bind(skip_index_adjustment);
+
+ a.mov(node,
+ x86::qword_ptr(node,
+ index.r64(),
+ 3,
+ sizeof(Eterm) - TAG_PRIMARY_BOXED));
+ emit_ptr_val(node, node);
+
+ /* Have we found our leaf? */
+ a.test(node.r32(), imm(_TAG_PRIMARY_MASK - TAG_PRIMARY_LIST));
+ a.short_().je(leaf_node);
+
+ /* Nope, we have to search another node. */
+ a.mov(header_val, emit_boxed_val(node, 0, sizeof(Uint32)));
+
+ /* After 8 nodes we've run out of the 32 bits we started with, so we
+ * need to update the hash to keep going. */
+ a.test(depth, imm(0x7));
+ a.short_().jz(update_hash);
+ a.short_().jmp(node_loop);
+
+ a.bind(leaf_node);
+ {
+ /* We've arrived at a leaf; set ZF according to whether its key
+ * matches ours and speculatively place the element in RET. */
+ a.cmp(getCARRef(node), key);
+ a.mov(RET, getCDRRef(node));
+
+ /* See comment at the jump. */
+ a.bind(fail);
+ a.ret();
+ }
+
+ /* After 8 nodes we've run out of the 32 bits we started with, so we
+ * must calculate a new hash to continue.
+ *
+ * This is a manual expansion of `make_map_hash` from utils.c, and all
+ * changes to that function must be mirrored here. */
+ a.bind(update_hash);
+ {
+ a.mov(TMP_MEM1d, depth);
+
+ /* NOTE: ARG3d is always 0 at this point. */
+ a.mov(ARG4d, depth);
+ a.shr(ARG4d, imm(3));
+ mov_imm(ARG5d, 1);
+ a.mov(ARG6d, imm(HCONST_22));
+ a.call(labels[internal_hash_helper]);
+
+ a.xor_(ARG3d, imm(INTERNAL_HASH_SALT));
+ a.mov(ARG4d, key.r32());
+ a.mov(ARG5, key);
+ a.shr(ARG5, imm(32));
+ a.mov(ARG6d, imm(HCONST));
+ a.call(labels[internal_hash_helper]);
+
+ a.mov(depth, TMP_MEM1d);
+
+ a.jmp(node_loop);
+ }
+ }
+}
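
On x86 the bitmap bookkeeping collapses into two instructions: `BZHI` (from
BMI2) clears all bits at position `index` and above, and `POPCNT` counts the
survivors. This is also why the callers below only take the inline path when
`kBMI2` is reported. In C terms, the pair computes:

    offset = popcount(bitmap & ((1u << index) - 1)); /* popcnt(bzhi(bitmap, index)) */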
+
+/* ARG1 = flat map, ARG2 = key
+ *
+ * Result is returned in RET. ZF is set on success. */
+void BeamGlobalAssembler::emit_flatmap_get_element() {
+ Label fail = a.newLabel(), loop = a.newLabel();
+
+ a.mov(RETd, emit_boxed_val(ARG1, offsetof(flatmap_t, size), 4));
+ a.mov(ARG4, emit_boxed_val(ARG1, offsetof(flatmap_t, keys)));
+
+ emit_ptr_val(ARG4, ARG4);
+
+ a.bind(loop);
+ {
+ a.dec(RETd);
+ a.short_().jl(fail);
+
+ a.cmp(ARG2,
+ x86::qword_ptr(ARG4, RET, 3, sizeof(Eterm) - TAG_PRIMARY_BOXED));
+ a.short_().jne(loop);
+ }
+
+ int value_offset = sizeof(flatmap_t) - TAG_PRIMARY_BOXED;
+ a.mov(RET, x86::qword_ptr(ARG1, RET, 3, value_offset));
+
+ a.bind(fail);
+ a.ret();
+}
+
void BeamGlobalAssembler::emit_new_map_shared() {
emit_enter_frame();
emit_enter_runtime<Update::eReductions | Update::eStack | Update::eHeap>();
@@ -91,6 +285,59 @@ void BeamModuleAssembler::emit_i_new_small_map_lit(const ArgVal &Dst,
mov_arg(Dst, RET);
}
+/* ARG1 = map, ARG2 = key
+ *
+ * Result is returned in RET. ZF is set on success. */
+void BeamGlobalAssembler::emit_i_get_map_element_shared() {
+ Label generic = a.newLabel(), hashmap = a.newLabel();
+
+ a.mov(RETd, ARG2d);
+
+ a.and_(RETb, imm(_TAG_PRIMARY_MASK));
+ a.cmp(RETb, imm(TAG_PRIMARY_IMMED1));
+ a.short_().jne(generic);
+
+ emit_ptr_val(ARG1, ARG1);
+
+ a.mov(RETd, emit_boxed_val(ARG1, 0, sizeof(Uint32)));
+ a.mov(ARG4d, RETd);
+
+ a.and_(ARG4d, imm(_HEADER_MAP_SUBTAG_MASK));
+ a.cmp(ARG4d, imm(HAMT_SUBTAG_HEAD_FLATMAP));
+ a.short_().jne(hashmap);
+
+ emit_flatmap_get_element();
+
+ a.bind(generic);
+ {
+ emit_enter_runtime();
+ runtime_call<2>(get_map_element);
+ emit_leave_runtime();
+
+ emit_test_the_non_value(RET);
+
+ /* Invert ZF, we want it to be set when RET is a value. */
+ a.setnz(ARG1.r8());
+ a.dec(ARG1.r8());
+
+ a.ret();
+ }
+
+ a.bind(hashmap);
+ {
+ /* Calculate the internal hash of ARG2 before diving into the HAMT. */
+ a.mov(ARG5, ARG2);
+ a.shr(ARG5, imm(32));
+ a.mov(ARG4d, ARG2d);
+
+ a.mov(ARG3d, imm(INTERNAL_HASH_SALT));
+ a.mov(ARG6d, imm(HCONST));
+ a.call(labels[internal_hash_helper]);
+
+ emit_hashmap_get_element();
+ }
+}
+
void BeamModuleAssembler::emit_i_get_map_element(const ArgVal &Fail,
const ArgVal &Src,
const ArgVal &Key,
@@ -98,19 +345,21 @@ void BeamModuleAssembler::emit_i_get_map_element(const ArgVal &Fail,
mov_arg(ARG1, Src);
mov_arg(ARG2, Key);
- emit_enter_runtime();
-
- runtime_call<2>(get_map_element);
-
- emit_leave_runtime();
+ if (masked_types(Key, BEAM_TYPE_MASK_IMMEDIATE) != BEAM_TYPE_NONE &&
+ hasCpuFeature(CpuFeatures::X86::kBMI2)) {
+ safe_fragment_call(ga->get_i_get_map_element_shared());
+ a.jne(resolve_beam_label(Fail));
+ } else {
+ emit_enter_runtime();
+ runtime_call<2>(get_map_element);
+ emit_leave_runtime();
- emit_test_the_non_value(RET);
- a.je(resolve_beam_label(Fail));
+ emit_test_the_non_value(RET);
+ a.je(resolve_beam_label(Fail));
+ }
- /*
- * Don't store the result if the destination is the scratch X register.
- * (This instruction was originally a has_map_fields instruction.)
- */
+ /* Don't store the result if the destination is the scratch X register.
+ * (This instruction was originally a has_map_fields instruction.) */
if (!(Dst.getType() == ArgVal::XReg && Dst.getValue() == SCRATCH_X_REG)) {
mov_arg(Dst, RET);
}
@@ -120,24 +369,115 @@ void BeamModuleAssembler::emit_i_get_map_elements(const ArgVal &Fail,
const ArgVal &Src,
const ArgVal &Size,
const Span<ArgVal> &args) {
+ Label generic = a.newLabel(), next = a.newLabel();
Label data = embed_vararg_rodata(args, 0);
+ /* We're not likely to gain much from inlining huge extractions, and the
+ * resulting code is quite large, so we'll cut it off after a handful of
+ * elements.
+ *
+ * Note that the arguments come in flattened triplets of
+ * `{Key, Dst, KeyHash}` */
+ bool can_inline = args.size() < (8 * 3);
+
ASSERT(Size.getValue() == args.size());
+ ASSERT((Size.getValue() % 3) == 0);
+
+ for (size_t i = 0; i < args.size(); i += 3) {
+ can_inline &= args[i].isImmed();
+ }
mov_arg(ARG1, Src);
- a.mov(ARG3, E);
- emit_enter_runtime();
+ if (can_inline) {
+ comment("simplified multi-element lookup");
+
+ emit_ptr_val(ARG1, ARG1);
+
+ a.mov(RETd, emit_boxed_val(ARG1, 0, sizeof(Uint32)));
+ a.and_(RETb, imm(_HEADER_MAP_SUBTAG_MASK));
+ a.cmp(RETb, imm(HAMT_SUBTAG_HEAD_FLATMAP));
+ a.jne(generic);
+
+ ERTS_CT_ASSERT(MAP_SMALL_MAP_LIMIT <= ERTS_UINT32_MAX);
+ a.mov(RETd,
+ emit_boxed_val(ARG1, offsetof(flatmap_t, size), sizeof(Uint32)));
+ a.mov(ARG2, emit_boxed_val(ARG1, offsetof(flatmap_t, keys)));
+
+ emit_ptr_val(ARG2, ARG2);
+
+ for (ssize_t i = args.size() - 3; i >= 0; i -= 3) {
+ Label loop = a.newLabel();
+
+ a.bind(loop);
+ {
+ x86::Mem candidate =
+ x86::qword_ptr(ARG2,
+ RET,
+ 3,
+ sizeof(Eterm) - TAG_PRIMARY_BOXED);
+
+ a.dec(RETd);
+ a.jl(resolve_beam_label(Fail));
+
+ const auto &Comparand = args[i];
+ cmp_arg(candidate, Comparand, ARG3);
+ a.short_().jne(loop);
+ }
+
+ /* Don't store the result if the destination is the scratch X
+ * register. (This instruction was originally a has_map_fields
+ * instruction.) */
+ const auto &Dst = args[i + 1];
+ if (!(Dst.getType() == ArgVal::XReg &&
+ Dst.getValue() == SCRATCH_X_REG)) {
+ const int value_offset = sizeof(flatmap_t) - TAG_PRIMARY_BOXED;
+ mov_arg(Dst, x86::qword_ptr(ARG1, RET, 3, value_offset), ARG3);
+ }
+ }
+
+ a.short_().jmp(next);
+ }
- mov_imm(ARG4, args.size() / 3);
- a.lea(ARG5, x86::qword_ptr(data));
- load_x_reg_array(ARG2);
- runtime_call<5>(beam_jit_get_map_elements);
+ a.bind(generic);
+ {
+ mov_imm(ARG4, args.size() / 3);
+ a.lea(ARG5, x86::qword_ptr(data));
+ a.mov(ARG3, E);
+
+ emit_enter_runtime();
+
+ load_x_reg_array(ARG2);
+ runtime_call<5>(beam_jit_get_map_elements);
+
+ emit_leave_runtime();
+
+ a.test(RET, RET);
+ a.je(resolve_beam_label(Fail));
+ }
+
+ a.bind(next);
+}
+
+/* ARG1 = map, ARG2 = key, ARG3 = key hash
+ *
+ * Result is returned in RET. ZF is set on success. */
+void BeamGlobalAssembler::emit_i_get_map_element_hash_shared() {
+ Label hashmap = a.newLabel();
+
+ emit_ptr_val(ARG1, ARG1);
+
+ a.mov(RETd, emit_boxed_val(ARG1, 0, sizeof(Uint32)));
+ a.mov(ARG4d, RETd);
- emit_leave_runtime();
+ a.and_(ARG4d, imm(_HEADER_MAP_SUBTAG_MASK));
+ a.cmp(ARG4d, imm(HAMT_SUBTAG_HEAD_FLATMAP));
+ a.short_().jne(hashmap);
- a.test(RET, RET);
- a.je(resolve_beam_label(Fail));
+ emit_flatmap_get_element();
+
+ a.bind(hashmap);
+ emit_hashmap_get_element();
}
void BeamModuleAssembler::emit_i_get_map_element_hash(const ArgVal &Fail,
@@ -149,19 +489,20 @@ void BeamModuleAssembler::emit_i_get_map_element_hash(const ArgVal &Fail,
mov_arg(ARG2, Key);
mov_arg(ARG3, Hx);
- emit_enter_runtime();
-
- runtime_call<3>(get_map_element_hash);
-
- emit_leave_runtime();
+ if (Key.isImmed() && hasCpuFeature(CpuFeatures::X86::kBMI2)) {
+ safe_fragment_call(ga->get_i_get_map_element_hash_shared());
+ a.jne(resolve_beam_label(Fail));
+ } else {
+ emit_enter_runtime();
+ runtime_call<3>(get_map_element_hash);
+ emit_leave_runtime();
- emit_test_the_non_value(RET);
- a.je(resolve_beam_label(Fail));
+ emit_test_the_non_value(RET);
+ a.je(resolve_beam_label(Fail));
+ }
- /*
- * Don't store the result if the destination is the scratch X register.
- * (This instruction was originally a has_map_fields instruction.)
- */
+ /* Don't store the result if the destination is the scratch X register.
+ * (This instruction was originally a has_map_fields instruction.) */
if (!(Dst.getType() == ArgVal::XReg && Dst.getValue() == SCRATCH_X_REG)) {
mov_arg(Dst, RET);
}
diff --git a/erts/emulator/test/map_SUITE.erl b/erts/emulator/test/map_SUITE.erl
index 9cc954a937..b06c308cb1 100644
--- a/erts/emulator/test/map_SUITE.erl
+++ b/erts/emulator/test/map_SUITE.erl
@@ -84,6 +84,7 @@
%% instruction-level tests
t_has_map_fields/1,
+ t_get_map_elements/1,
y_regs/1,
%%Bugs
@@ -157,6 +158,7 @@ groups() ->
%% instruction-level tests
t_has_map_fields,
+ t_get_map_elements,
y_regs,
%% Bugs
@@ -3142,6 +3144,19 @@ has_map_fields_3(#{a:=_,b:=_}) -> true;
has_map_fields_3(#{[]:=_,42.0:=_}) -> true;
has_map_fields_3(#{}) -> false.
+t_get_map_elements(Config) when is_list(Config) ->
+ %% Tests that the JIT implementation of `get_map_elements` handles
+ %% collisions properly.
+ N = 500000,
+ Is = lists:seq(1, N),
+ Test = maps:from_list([{I, I} || I <- Is]),
+ [begin
+ #{ Key := Val } = Test,
+ Key = Val
+ end || Key <- Is],
+
+ ok.
+
y_regs(Config) when is_list(Config) ->
Val = [length(Config)],
Map0 = y_regs_update(#{}, Val),