diff options
author | John Högberg <john@erlang.org> | 2022-02-07 23:05:19 +0100 |
---|---|---|
committer | John Högberg <john@erlang.org> | 2022-02-17 12:50:43 +0100 |
commit | 48643c2ee9c9d74b2e642dbd6d6802fda502e370 (patch) | |
tree | 3da6eb2b02d1a18c98cab9a51a3c4e0944308413 /erts | |
parent | df9d0cb725f83b498bd620555f62962b9c0f1df5 (diff) | |
download | erlang-48643c2ee9c9d74b2e642dbd6d6802fda502e370.tar.gz |
jit: Optimize map lookups
Diffstat (limited to 'erts')
-rw-r--r-- | erts/emulator/beam/jit/arm/beam_asm.hpp | 6 | ||||
-rw-r--r-- | erts/emulator/beam/jit/arm/instr_common.cpp | 2 | ||||
-rw-r--r-- | erts/emulator/beam/jit/arm/instr_guard_bifs.cpp | 4 | ||||
-rw-r--r-- | erts/emulator/beam/jit/arm/instr_map.cpp | 400 | ||||
-rw-r--r-- | erts/emulator/beam/jit/x86/beam_asm.hpp | 11 | ||||
-rw-r--r-- | erts/emulator/beam/jit/x86/instr_map.cpp | 403 | ||||
-rw-r--r-- | erts/emulator/test/map_SUITE.erl | 15 |
7 files changed, 787 insertions, 54 deletions
diff --git a/erts/emulator/beam/jit/arm/beam_asm.hpp b/erts/emulator/beam/jit/arm/beam_asm.hpp index a29f6953a9..4a1125d998 100644 --- a/erts/emulator/beam/jit/arm/beam_asm.hpp +++ b/erts/emulator/beam/jit/arm/beam_asm.hpp @@ -879,6 +879,8 @@ class BeamGlobalAssembler : public BeamAssembler { _(i_breakpoint_trampoline_shared) \ _(i_bsr_body_shared) \ _(i_bsl_body_shared) \ + _(i_get_map_element_shared) \ + _(i_get_map_element_hash_shared) \ _(i_func_info_shared) \ _(i_length_guard_shared) \ _(i_length_body_shared) \ @@ -888,6 +890,7 @@ class BeamGlobalAssembler : public BeamAssembler { _(i_bxor_body_shared) \ _(int_div_rem_body_shared) \ _(int_div_rem_guard_shared) \ + _(internal_hash_helper) \ _(minus_body_shared) \ _(new_map_shared) \ _(update_map_assoc_shared) \ @@ -938,6 +941,9 @@ class BeamGlobalAssembler : public BeamAssembler { void emit_bif_element_helper(Label fail); void emit_bif_tuple_size_helper(Label fail); + void emit_flatmap_get_element(); + void emit_hashmap_get_element(); + public: BeamGlobalAssembler(JitAllocator *allocator); diff --git a/erts/emulator/beam/jit/arm/instr_common.cpp b/erts/emulator/beam/jit/arm/instr_common.cpp index 1b80e05e9e..c759ebcf1e 100644 --- a/erts/emulator/beam/jit/arm/instr_common.cpp +++ b/erts/emulator/beam/jit/arm/instr_common.cpp @@ -641,7 +641,7 @@ void BeamModuleAssembler::emit_is_nonempty_list(const ArgVal &Fail, const int bitNumber = 1; ERTS_CT_ASSERT(_TAG_PRIMARY_MASK - TAG_PRIMARY_LIST == (1 << bitNumber)); - a.tbnz(list_ptr.reg, bitNumber, resolve_beam_label(Fail, disp32K)); + a.tbnz(list_ptr.reg, imm(bitNumber), resolve_beam_label(Fail, disp32K)); } void BeamModuleAssembler::emit_jump(const ArgVal &Fail) { diff --git a/erts/emulator/beam/jit/arm/instr_guard_bifs.cpp b/erts/emulator/beam/jit/arm/instr_guard_bifs.cpp index 30434001a2..9987e60aa6 100644 --- a/erts/emulator/beam/jit/arm/instr_guard_bifs.cpp +++ b/erts/emulator/beam/jit/arm/instr_guard_bifs.cpp @@ -570,7 +570,7 @@ void BeamModuleAssembler::emit_bif_hd(const ArgVal &Src, const ArgVal &Hd) { ERTS_CT_ASSERT(_TAG_PRIMARY_MASK - TAG_PRIMARY_LIST == (1 << bitNumber)); - a.tbz(src.reg, bitNumber, good_cons); + a.tbz(src.reg, imm(bitNumber), good_cons); mov_var(XREG0, src); fragment_call(ga->get_handle_hd_error()); @@ -736,7 +736,7 @@ void BeamModuleAssembler::emit_bif_tl(const ArgVal &Src, const ArgVal &Tl) { ERTS_CT_ASSERT(_TAG_PRIMARY_MASK - TAG_PRIMARY_LIST == (1 << bitNumber)); - a.tbz(src.reg, bitNumber, good_cons); + a.tbz(src.reg, imm(bitNumber), good_cons); mov_var(XREG0, src); fragment_call(ga->get_handle_tl_error()); diff --git a/erts/emulator/beam/jit/arm/instr_map.cpp b/erts/emulator/beam/jit/arm/instr_map.cpp index 0203a32362..2272366188 100644 --- a/erts/emulator/beam/jit/arm/instr_map.cpp +++ b/erts/emulator/beam/jit/arm/instr_map.cpp @@ -28,6 +28,205 @@ extern "C" #include "beam_common.h" } +static const Uint32 INTERNAL_HASH_SALT = 3432918353; +static const Uint32 HCONST_22 = 0x98C475E6UL; +static const Uint32 HCONST = 0x9E3779B9; + +/* ARG3 = incoming hash + * ARG6 = lower 32 + * ARG7 = upper 32 + * ARG8 = type constant + * + * Helper function for calculating the internal hash of keys before looking + * them up in a map. + * + * This is essentially just a manual expansion of the `UINT32_HASH_2` macro. + * Whenever the internal hash algorithm is updated, this and all of its users + * must follow suit. + * + * Result is returned in ARG3. All arguments are clobbered. */ +void BeamGlobalAssembler::emit_internal_hash_helper() { + a64::Gp hash = ARG3.w(), lower = ARG6.w(), upper = ARG7.w(), + constant = ARG8.w(); + + a.add(lower, lower, constant); + a.add(upper, upper, constant); + + using rounds = + std::initializer_list<std::tuple<a64::Gp, a64::Gp, a64::Gp, int>>; + for (const auto &round : rounds{{lower, upper, hash, 13}, + {upper, hash, lower, -8}, + {hash, lower, upper, 13}, + {lower, upper, hash, 12}, + {upper, hash, lower, -16}, + {hash, lower, upper, 5}, + {lower, upper, hash, 3}, + {upper, hash, lower, -10}, + {hash, lower, upper, 15}}) { + const auto &[r_a, r_b, r_c, shift] = round; + + a.sub(r_a, r_a, r_b); + a.sub(r_a, r_a, r_c); + + if (shift > 0) { + a.eor(r_a, r_a, r_c, arm::lsr(shift)); + } else { + a.eor(r_a, r_a, r_c, arm::lsl(-shift)); + } + } + + a.ret(a64::x30); +} + +/* ARG1 = untagged hash map root + * ARG2 = key + * ARG3 = key hash + * ARG4 = node header + * + * Result is returned in RET. ZF is set on success. */ +void BeamGlobalAssembler::emit_hashmap_get_element() { + Label node_loop = a.newLabel(); + + arm::Gp node = ARG1, key = ARG2, key_hash = ARG3, header_val = ARG4, + depth = TMP4, index = TMP5, one = TMP6; + + const int header_shift = + (_HEADER_ARITY_OFFS + MAP_HEADER_TAG_SZ + MAP_HEADER_ARITY_SZ); + + /* Skip the root header, which is two words long (child headers are one + * word). */ + a.add(node, node, imm(sizeof(Eterm[2]))); + mov_imm(depth, 0); + + a.bind(node_loop); + { + Label fail = a.newLabel(), leaf_node = a.newLabel(), + skip_index_adjustment = a.newLabel(), update_hash = a.newLabel(); + + /* Find out which child we should follow, and shift the hash for the + * next round. */ + a.and_(index, key_hash, imm(0xF)); + a.lsr(key_hash, key_hash, imm(4)); + a.add(depth, depth, imm(1)); + + /* The entry offset is always equal to the index on fully populated + * nodes, so we'll skip adjusting them. */ + ERTS_CT_ASSERT(header_shift == 16); + a.asr(header_val.w(), header_val.w(), imm(header_shift)); + a.cmn(header_val.w(), imm(1)); + a.b_eq(skip_index_adjustment); + { + /* If our bit isn't set on this node, the key can't be found. + * + * Note that we jump directly to the return sequence as ZF is clear + * at this point. */ + a.lsr(TMP1, header_val, index); + a.tbz(TMP1, imm(0), fail); + + /* The actual offset of our entry is the number of bits set (in + * essence "entries present") before our index in the bitmap. Clear + * the upper bits and count the remainder. */ + a.lsl(TMP2, TMP1, index); + a.eor(TMP1, TMP2, header_val); + a.fmov(a64::d0, TMP1); + a.cnt(a64::v0.b8(), a64::v0.b8()); + a.addv(a64::b0, a64::v0.b8()); + a.fmov(index, a64::d0); + } + a.bind(skip_index_adjustment); + + a.ldr(TMP1, arm::Mem(node, index, arm::lsl(3))); + emit_untag_ptr(node, TMP1); + + /* Have we found our leaf? */ + emit_is_boxed(leaf_node, TMP1); + + /* Nope, we have to search another node. Read and skip past the header + * word. */ + a.ldr(header_val, arm::Mem(node).post(sizeof(Eterm))); + + /* After 8 nodes we've run out of the 32 bits we started with, so we + * need to update the hash to keep going. */ + a.tst(depth, imm(0x7)); + a.b_eq(update_hash); + a.b(node_loop); + + a.bind(leaf_node); + { + /* We've arrived at a leaf, set ZF according to whether its key + * matches ours and speculatively place the element in ARG1. */ + a.ldp(TMP1, ARG1, arm::Mem(node)); + a.cmp(TMP1, key); + + /* See comment at the jump. */ + a.bind(fail); + a.ret(a64::x30); + } + + /* After 8 nodes we've run out of the 32 bits we started with, so we + * must calculate a new hash to continue. + * + * This is a manual expansion `make_map_hash` from utils.c, and all + * changes to that function must be mirrored here. */ + a.bind(update_hash); + { + emit_enter_runtime_frame(); + + /* NOTE: ARG3 (key_hash) is always 0 at this point. */ + a.lsr(ARG6, depth, imm(3)); + mov_imm(ARG7, 1); + mov_imm(ARG8, HCONST_22); + a.bl(labels[internal_hash_helper]); + + mov_imm(TMP1, INTERNAL_HASH_SALT); + a.eor(ARG3, ARG3, TMP1); + + a.mov(ARG6.w(), key.w()); + a.lsr(ARG7, key, imm(32)); + mov_imm(ARG8, HCONST); + a.bl(labels[internal_hash_helper]); + + emit_leave_runtime_frame(); + + a.b(node_loop); + } + } +} + +/* ARG1 = untagged flat map + * ARG2 = key + * ARG5 = size + * + * Result is returned in ARG1. ZF is set on success. */ +void BeamGlobalAssembler::emit_flatmap_get_element() { + Label fail = a.newLabel(), loop = a.newLabel(); + + /* Bump size by 1 to slide past the `keys` field in the map, and the header + * word in the key array. Also set flags to ensure ZF == 0 when entering + * the loop. */ + a.adds(ARG5, ARG5, imm(1)); + + /* Adjust our map pointer to the `keys` field before loading it. This + * saves us from having to bump it to point at the values later on. */ + a.ldr(TMP1, arm::Mem(ARG1).pre(offsetof(flatmap_t, keys))); + emit_untag_ptr(TMP1, TMP1); + + a.bind(loop); + { + a.sub(ARG5, ARG5, imm(1)); + a.cbz(ARG5, fail); + + a.ldr(TMP4, arm::Mem(TMP1, ARG5, arm::lsl(3))); + a.cmp(ARG2, TMP4); + a.b_ne(loop); + } + + a.ldr(ARG1, arm::Mem(ARG1, ARG5, arm::lsl(3))); + + a.bind(fail); + a.ret(a64::x30); +} + void BeamGlobalAssembler::emit_new_map_shared() { emit_enter_runtime_frame(); emit_enter_runtime<Update::eStack | Update::eHeap | Update::eXRegs | @@ -91,6 +290,64 @@ void BeamModuleAssembler::emit_i_new_small_map_lit(const ArgVal &Dst, mov_arg(Dst, ARG1); } +/* ARG1 = map + * ARG2 = key + * + * Result is returned in RET. ZF is set on success. */ +void BeamGlobalAssembler::emit_i_get_map_element_shared() { + Label generic = a.newLabel(), hashmap = a.newLabel(); + + a.and_(TMP1, ARG2, imm(_TAG_PRIMARY_MASK)); + a.cmp(TMP1, imm(TAG_PRIMARY_IMMED1)); + a.b_ne(generic); + + emit_untag_ptr(ARG1, ARG1); + + /* hashmap_get_element expects node header in ARG4, flatmap_get_element + * expects size in ARG5 */ + ERTS_CT_ASSERT_FIELD_PAIR(flatmap_t, thing_word, size); + a.ldp(ARG4, ARG5, arm::Mem(ARG1)); + a.and_(TMP1, ARG4, imm(_HEADER_MAP_SUBTAG_MASK)); + a.cmp(TMP1, imm(HAMT_SUBTAG_HEAD_FLATMAP)); + a.b_ne(hashmap); + + emit_flatmap_get_element(); + + a.bind(generic); + { + emit_enter_runtime_frame(); + + emit_enter_runtime(); + runtime_call<2>(get_map_element); + emit_leave_runtime(); + + a.cmp(ARG1, imm(THE_NON_VALUE)); + + /* Invert ZF, we want it to be set when RET is a value. */ + a.cset(TMP1, arm::CondCode::kEQ); + a.tst(TMP1, TMP1); + + emit_leave_runtime_frame(); + a.ret(a64::x30); + } + + a.bind(hashmap); + { + emit_enter_runtime_frame(); + + /* Calculate the internal hash of ARG2 before diving into the HAMT. */ + mov_imm(ARG3, INTERNAL_HASH_SALT); + a.mov(ARG6.w(), ARG2.w()); + a.lsr(ARG7, ARG2, imm(32)); + mov_imm(ARG8, HCONST); + a.bl(labels[internal_hash_helper]); + + emit_leave_runtime_frame(); + + emit_hashmap_get_element(); + } +} + void BeamModuleAssembler::emit_i_get_map_element(const ArgVal &Fail, const ArgVal &Src, const ArgVal &Key, @@ -98,13 +355,16 @@ void BeamModuleAssembler::emit_i_get_map_element(const ArgVal &Fail, mov_arg(ARG1, Src); mov_arg(ARG2, Key); - emit_enter_runtime(); - - runtime_call<2>(get_map_element); - - emit_leave_runtime(); + if (masked_types(Key, BEAM_TYPE_MASK_IMMEDIATE) != BEAM_TYPE_NONE) { + fragment_call(ga->get_i_get_map_element_shared()); + a.b_ne(resolve_beam_label(Fail, disp1MB)); + } else { + emit_enter_runtime(); + runtime_call<2>(get_map_element); + emit_leave_runtime(); - emit_branch_if_not_value(ARG1, resolve_beam_label(Fail, dispUnknown)); + emit_branch_if_not_value(ARG1, resolve_beam_label(Fail, dispUnknown)); + } /* * Don't store the result if the destination is the scratch X register. @@ -119,22 +379,119 @@ void BeamModuleAssembler::emit_i_get_map_elements(const ArgVal &Fail, const ArgVal &Src, const ArgVal &Size, const Span<ArgVal> &args) { + Label generic = a.newLabel(), next = a.newLabel(); + + /* We're not likely to gain much from inlining huge extractions, and the + * resulting code is quite large, so we'll cut it off after a handful + * elements. + * + * Note that the arguments come in flattened triplets of + * `{Key, Dst, KeyHash}` */ + bool can_inline = args.size() < (8 * 3); + ASSERT(Size.getValue() == args.size()); + ASSERT((Size.getValue() % 3) == 0); - embed_vararg_rodata(args, ARG5); + for (size_t i = 0; i < args.size(); i += 3) { + can_inline &= args[i].isImmed(); + } mov_arg(ARG1, Src); - a.mov(ARG3, E); - emit_enter_runtime<Update::eXRegs>(); + if (can_inline) { + comment("simplified multi-element lookup"); - mov_imm(ARG4, args.size() / 3); - load_x_reg_array(ARG2); - runtime_call<5>(beam_jit_get_map_elements); + emit_untag_ptr(TMP1, ARG1); + + ERTS_CT_ASSERT_FIELD_PAIR(flatmap_t, thing_word, size); + a.ldp(TMP2, TMP3, arm::Mem(TMP1, offsetof(flatmap_t, thing_word))); + a.and_(TMP2, TMP2, imm(_HEADER_MAP_SUBTAG_MASK)); + a.cmp(TMP2, imm(HAMT_SUBTAG_HEAD_FLATMAP)); + a.b_ne(generic); + + check_pending_stubs(); + + /* Bump size by 1 to slide past the `keys` field in the map, and the + * header word in the key array. */ + a.add(TMP3, TMP3, imm(1)); + + /* Adjust our map pointer to the `keys` field before loading it. This + * saves us from having to bump it to point at the values later on. */ + ERTS_CT_ASSERT(sizeof(flatmap_t) == + offsetof(flatmap_t, keys) + sizeof(Eterm)); + a.ldr(TMP2, arm::Mem(TMP1).pre(offsetof(flatmap_t, keys))); + emit_untag_ptr(TMP2, TMP2); + + for (ssize_t i = args.size() - 3; i >= 0; i -= 3) { + Label loop = a.newLabel(); + + a.bind(loop); + { + a.sub(TMP3, TMP3, imm(1)); + a.cbz(TMP3, resolve_beam_label(Fail, disp1MB)); + + a.ldr(TMP4, arm::Mem(TMP2, TMP3, arm::lsl(3))); + + const auto &Comparand = args[i]; + cmp_arg(TMP4, Comparand); + a.b_ne(loop); + } - emit_leave_runtime<Update::eXRegs>(); + /* Don't store the result if the destination is the scratch X + * register. (This instruction was originally a has_map_fields + * instruction.) */ + const auto &Dst = args[i + 1]; + if (!(Dst.getType() == ArgVal::XReg && + Dst.getValue() == SCRATCH_X_REG)) { + mov_arg(Dst, arm::Mem(TMP1, TMP3, arm::lsl(3))); + } + } - a.cbz(ARG1, resolve_beam_label(Fail, disp1MB)); + a.b(next); + } + + a.bind(generic); + { + embed_vararg_rodata(args, ARG5); + + a.mov(ARG3, E); + + emit_enter_runtime<Update::eXRegs>(); + + mov_imm(ARG4, args.size() / 3); + load_x_reg_array(ARG2); + runtime_call<5>(beam_jit_get_map_elements); + + emit_leave_runtime<Update::eXRegs>(); + + a.cbz(ARG1, resolve_beam_label(Fail, disp1MB)); + } + + a.bind(next); +} + +/* ARG1 = map + * ARG2 = key + * ARG3 = key hash + * + * Result is returned in RET. ZF is set on success. */ +void BeamGlobalAssembler::emit_i_get_map_element_hash_shared() { + Label hashmap = a.newLabel(); + + emit_untag_ptr(ARG1, ARG1); + + /* hashmap_get_element expects node header in ARG4, flatmap_get_element + * expects size in ARG5 */ + ERTS_CT_ASSERT_FIELD_PAIR(flatmap_t, thing_word, size); + a.ldp(ARG4, ARG5, arm::Mem(ARG1)); + a.and_(TMP1, ARG4, imm(_HEADER_MAP_SUBTAG_MASK)); + a.cmp(TMP1, imm(HAMT_SUBTAG_HEAD_FLATMAP)); + a.b_ne(hashmap); + + emit_flatmap_get_element(); + + a.bind(hashmap); + emit_hashmap_get_element(); } void BeamModuleAssembler::emit_i_get_map_element_hash(const ArgVal &Fail, @@ -146,13 +503,16 @@ void BeamModuleAssembler::emit_i_get_map_element_hash(const ArgVal &Fail, mov_arg(ARG2, Key); mov_arg(ARG3, Hx); - emit_enter_runtime(); - - runtime_call<3>(get_map_element_hash); - - emit_leave_runtime(); + if (Key.isImmed()) { + fragment_call(ga->get_i_get_map_element_hash_shared()); + a.b_ne(resolve_beam_label(Fail, disp1MB)); + } else { + emit_enter_runtime(); + runtime_call<3>(get_map_element_hash); + emit_leave_runtime(); - emit_branch_if_not_value(ARG1, resolve_beam_label(Fail, dispUnknown)); + emit_branch_if_not_value(ARG1, resolve_beam_label(Fail, dispUnknown)); + } /* * Don't store the result if the destination is the scratch X register. diff --git a/erts/emulator/beam/jit/x86/beam_asm.hpp b/erts/emulator/beam/jit/x86/beam_asm.hpp index 59d7834765..5bbb7fa95d 100644 --- a/erts/emulator/beam/jit/x86/beam_asm.hpp +++ b/erts/emulator/beam/jit/x86/beam_asm.hpp @@ -957,6 +957,8 @@ class BeamGlobalAssembler : public BeamAssembler { _(i_bxor_body_shared) \ _(i_bxor_guard_shared) \ _(i_func_info_shared) \ + _(i_get_map_element_shared) \ + _(i_get_map_element_hash_shared) \ _(i_load_nif_shared) \ _(i_length_guard_shared) \ _(i_length_body_shared) \ @@ -966,6 +968,7 @@ class BeamGlobalAssembler : public BeamAssembler { _(increment_body_shared) \ _(int_div_rem_body_shared) \ _(int_div_rem_guard_shared) \ + _(internal_hash_helper) \ _(minus_body_shared) \ _(minus_guard_shared) \ _(new_map_shared) \ @@ -1015,6 +1018,9 @@ class BeamGlobalAssembler : public BeamAssembler { x86::Mem emit_i_length_common(Label fail, int state_size); + void emit_flatmap_get_element(); + void emit_hashmap_get_element(); + public: BeamGlobalAssembler(JitAllocator *allocator); @@ -1520,6 +1526,11 @@ protected: a.mov(getArgRef(to), from); } + void mov_arg(const ArgVal &to, x86::Mem from, const x86::Gp &spill) { + a.mov(spill, from); + a.mov(getArgRef(to), spill); + } + void mov_arg(const ArgVal &to, BeamInstr from, const x86::Gp &spill) { if (Support::isInt32((Sint)from)) { a.mov(getArgRef(to), imm(from)); diff --git a/erts/emulator/beam/jit/x86/instr_map.cpp b/erts/emulator/beam/jit/x86/instr_map.cpp index c94a746523..16770c0a9d 100644 --- a/erts/emulator/beam/jit/x86/instr_map.cpp +++ b/erts/emulator/beam/jit/x86/instr_map.cpp @@ -28,6 +28,200 @@ extern "C" #include "beam_common.h" } +static const Uint32 INTERNAL_HASH_SALT = 3432918353; +static const Uint32 HCONST_22 = 0x98C475E6UL; +static const Uint32 HCONST = 0x9E3779B9; + +/* ARG3 = incoming hash + * ARG4 = lower 32 + * ARG5 = upper 32 + * ARG6 = type constant + * + * Helper function for calculating the internal hash of keys before looking + * them up in a map. + * + * This is essentially just a manual expansion of the `UINT32_HASH_2` macro. + * Whenever the internal hash algorithm is updated, this and all of its users + * must follow suit. + * + * Result is returned in ARG3. */ +void BeamGlobalAssembler::emit_internal_hash_helper() { + x86::Gp hash = ARG3d, lower = ARG4d, upper = ARG5d; + + a.add(lower, ARG6d); + a.add(upper, ARG6d); + + using rounds = + std::initializer_list<std::tuple<x86::Gp, x86::Gp, x86::Gp, int>>; + for (const auto &round : rounds{{lower, upper, hash, 13}, + {upper, hash, lower, -8}, + {hash, lower, upper, 13}, + {lower, upper, hash, 12}, + {upper, hash, lower, -16}, + {hash, lower, upper, 5}, + {lower, upper, hash, 3}, + {upper, hash, lower, -10}, + {hash, lower, upper, 15}}) { + const auto &[r_a, r_b, r_c, shift] = round; + + a.sub(r_a, r_b); + a.sub(r_a, r_c); + + /* We have no use for the type constant anymore, reuse its register for + * the `a ^= r_c << shift` expression. */ + a.mov(ARG6d, r_c); + + if (shift > 0) { + a.shr(ARG6d, imm(shift)); + } else { + a.shl(ARG6d, imm(-shift)); + } + + a.xor_(r_a, ARG6d); + } + + a.ret(); +} + +/* ARG1 = hash map root, ARG2 = key, ARG3 = key hash, RETd = node header + * + * Result is returned in RET. ZF is set on success. */ +void BeamGlobalAssembler::emit_hashmap_get_element() { + Label node_loop = a.newLabel(); + + x86::Gp node = ARG1, key = ARG2, key_hash = ARG3d, header_val = RETd, + index = ARG4d, depth = ARG5d; + + const int header_shift = + (_HEADER_ARITY_OFFS + MAP_HEADER_TAG_SZ + MAP_HEADER_ARITY_SZ); + + /* Skip the root header. This is not required for child nodes. */ + a.add(node, imm(sizeof(Eterm))); + mov_imm(depth, 0); + + a.bind(node_loop); + { + Label fail = a.newLabel(), leaf_node = a.newLabel(), + skip_index_adjustment = a.newLabel(), update_hash = a.newLabel(); + + /* Find out which child we should follow, and shift the hash for the + * next round. */ + a.mov(index, key_hash); + a.and_(index, imm(0xF)); + a.shr(key_hash, imm(4)); + a.inc(depth); + + /* The entry offset is always equal to the index on fully populated + * nodes, so we'll skip adjusting them. */ + ERTS_CT_ASSERT(header_shift == 16); + a.sar(header_val, imm(header_shift)); + a.cmp(header_val, imm(-1)); + a.short_().je(skip_index_adjustment); + { + /* If our bit isn't set on this node, the key can't be found. + * + * Note that we jump directly to a `RET` instruction, as `BT` only + * affects CF, and ZF ("not found") is clear at this point. */ + a.bt(header_val, index); + a.short_().jnc(fail); + + /* The actual offset of our entry is the number of bits set (in + * essence "entries present") before our index in the bitmap. */ + a.bzhi(header_val, header_val, index); + a.popcnt(index, header_val); + } + a.bind(skip_index_adjustment); + + a.mov(node, + x86::qword_ptr(node, + index.r64(), + 3, + sizeof(Eterm) - TAG_PRIMARY_BOXED)); + emit_ptr_val(node, node); + + /* Have we found our leaf? */ + a.test(node.r32(), imm(_TAG_PRIMARY_MASK - TAG_PRIMARY_LIST)); + a.short_().je(leaf_node); + + /* Nope, we have to search another node. */ + a.mov(header_val, emit_boxed_val(node, 0, sizeof(Uint32))); + + /* After 8 nodes we've run out of the 32 bits we started with, so we + * need to update the hash to keep going. */ + a.test(depth, imm(0x7)); + a.short_().jz(update_hash); + a.short_().jmp(node_loop); + + a.bind(leaf_node); + { + /* We've arrived at a leaf, set ZF according to whether its key + * matches ours and speculatively place the element in RET. */ + a.cmp(getCARRef(node), key); + a.mov(RET, getCDRRef(node)); + + /* See comment at the jump. */ + a.bind(fail); + a.ret(); + } + + /* After 8 nodes we've run out of the 32 bits we started with, so we + * must calculate a new hash to continue. + * + * This is a manual expansion `make_map_hash` from utils.c, and all + * changes to that function must be mirrored here. */ + a.bind(update_hash); + { + a.mov(TMP_MEM1d, depth); + + /* NOTE: ARG3d is always 0 at this point. */ + a.mov(ARG4d, depth); + a.shr(ARG4d, imm(3)); + mov_imm(ARG5d, 1); + a.mov(ARG6d, imm(HCONST_22)); + a.call(labels[internal_hash_helper]); + + a.xor_(ARG3d, imm(INTERNAL_HASH_SALT)); + a.mov(ARG4d, key.r32()); + a.mov(ARG5, key); + a.shr(ARG5, imm(32)); + a.mov(ARG6d, imm(HCONST)); + a.call(labels[internal_hash_helper]); + + a.mov(depth, TMP_MEM1d); + + a.jmp(node_loop); + } + } +} + +/* ARG1 = flat map, ARG2 = key + * + * Result is returned in RET. ZF is set on success. */ +void BeamGlobalAssembler::emit_flatmap_get_element() { + Label fail = a.newLabel(), loop = a.newLabel(); + + a.mov(RETd, emit_boxed_val(ARG1, offsetof(flatmap_t, size), 4)); + a.mov(ARG4, emit_boxed_val(ARG1, offsetof(flatmap_t, keys))); + + emit_ptr_val(ARG4, ARG4); + + a.bind(loop); + { + a.dec(RETd); + a.short_().jl(fail); + + a.cmp(ARG2, + x86::qword_ptr(ARG4, RET, 3, sizeof(Eterm) - TAG_PRIMARY_BOXED)); + a.short_().jne(loop); + } + + int value_offset = sizeof(flatmap_t) - TAG_PRIMARY_BOXED; + a.mov(RET, x86::qword_ptr(ARG1, RET, 3, value_offset)); + + a.bind(fail); + a.ret(); +} + void BeamGlobalAssembler::emit_new_map_shared() { emit_enter_frame(); emit_enter_runtime<Update::eReductions | Update::eStack | Update::eHeap>(); @@ -91,6 +285,59 @@ void BeamModuleAssembler::emit_i_new_small_map_lit(const ArgVal &Dst, mov_arg(Dst, RET); } +/* ARG1 = map, ARG2 = key + * + * Result is returned in RET. ZF is set on success. */ +void BeamGlobalAssembler::emit_i_get_map_element_shared() { + Label generic = a.newLabel(), hashmap = a.newLabel(); + + a.mov(RETd, ARG2d); + + a.and_(RETb, imm(_TAG_PRIMARY_MASK)); + a.cmp(RETb, imm(TAG_PRIMARY_IMMED1)); + a.short_().jne(generic); + + emit_ptr_val(ARG1, ARG1); + + a.mov(RETd, emit_boxed_val(ARG1, 0, sizeof(Uint32))); + a.mov(ARG4d, RETd); + + a.and_(ARG4d, imm(_HEADER_MAP_SUBTAG_MASK)); + a.cmp(ARG4d, imm(HAMT_SUBTAG_HEAD_FLATMAP)); + a.short_().jne(hashmap); + + emit_flatmap_get_element(); + + a.bind(generic); + { + emit_enter_runtime(); + runtime_call<2>(get_map_element); + emit_leave_runtime(); + + emit_test_the_non_value(RET); + + /* Invert ZF, we want it to be set when RET is a value. */ + a.setnz(ARG1.r8()); + a.dec(ARG1.r8()); + + a.ret(); + } + + a.bind(hashmap); + { + /* Calculate the internal hash of ARG2 before diving into the HAMT. */ + a.mov(ARG5, ARG2); + a.shr(ARG5, imm(32)); + a.mov(ARG4d, ARG2d); + + a.mov(ARG3d, imm(INTERNAL_HASH_SALT)); + a.mov(ARG6d, imm(HCONST)); + a.call(labels[internal_hash_helper]); + + emit_hashmap_get_element(); + } +} + void BeamModuleAssembler::emit_i_get_map_element(const ArgVal &Fail, const ArgVal &Src, const ArgVal &Key, @@ -98,19 +345,21 @@ void BeamModuleAssembler::emit_i_get_map_element(const ArgVal &Fail, mov_arg(ARG1, Src); mov_arg(ARG2, Key); - emit_enter_runtime(); - - runtime_call<2>(get_map_element); - - emit_leave_runtime(); + if (masked_types(Key, BEAM_TYPE_MASK_IMMEDIATE) != BEAM_TYPE_NONE && + hasCpuFeature(CpuFeatures::X86::kBMI2)) { + safe_fragment_call(ga->get_i_get_map_element_shared()); + a.jne(resolve_beam_label(Fail)); + } else { + emit_enter_runtime(); + runtime_call<2>(get_map_element); + emit_leave_runtime(); - emit_test_the_non_value(RET); - a.je(resolve_beam_label(Fail)); + emit_test_the_non_value(RET); + a.je(resolve_beam_label(Fail)); + } - /* - * Don't store the result if the destination is the scratch X register. - * (This instruction was originally a has_map_fields instruction.) - */ + /* Don't store the result if the destination is the scratch X register. + * (This instruction was originally a has_map_fields instruction.) */ if (!(Dst.getType() == ArgVal::XReg && Dst.getValue() == SCRATCH_X_REG)) { mov_arg(Dst, RET); } @@ -120,24 +369,115 @@ void BeamModuleAssembler::emit_i_get_map_elements(const ArgVal &Fail, const ArgVal &Src, const ArgVal &Size, const Span<ArgVal> &args) { + Label generic = a.newLabel(), next = a.newLabel(); Label data = embed_vararg_rodata(args, 0); + /* We're not likely to gain much from inlining huge extractions, and the + * resulting code is quite large, so we'll cut it off after a handful + * elements. + * + * Note that the arguments come in flattened triplets of + * `{Key, Dst, KeyHash}` */ + bool can_inline = args.size() < (8 * 3); + ASSERT(Size.getValue() == args.size()); + ASSERT((Size.getValue() % 3) == 0); + + for (size_t i = 0; i < args.size(); i += 3) { + can_inline &= args[i].isImmed(); + } mov_arg(ARG1, Src); - a.mov(ARG3, E); - emit_enter_runtime(); + if (can_inline) { + comment("simplified multi-element lookup"); + + emit_ptr_val(ARG1, ARG1); + + a.mov(RETd, emit_boxed_val(ARG1, 0, sizeof(Uint32))); + a.and_(RETb, imm(_HEADER_MAP_SUBTAG_MASK)); + a.cmp(RETb, imm(HAMT_SUBTAG_HEAD_FLATMAP)); + a.jne(generic); + + ERTS_CT_ASSERT(MAP_SMALL_MAP_LIMIT <= ERTS_UINT32_MAX); + a.mov(RETd, + emit_boxed_val(ARG1, offsetof(flatmap_t, size), sizeof(Uint32))); + a.mov(ARG2, emit_boxed_val(ARG1, offsetof(flatmap_t, keys))); + + emit_ptr_val(ARG2, ARG2); + + for (ssize_t i = args.size() - 3; i >= 0; i -= 3) { + Label loop = a.newLabel(); + + a.bind(loop); + { + x86::Mem candidate = + x86::qword_ptr(ARG2, + RET, + 3, + sizeof(Eterm) - TAG_PRIMARY_BOXED); + + a.dec(RETd); + a.jl(resolve_beam_label(Fail)); + + const auto &Comparand = args[i]; + cmp_arg(candidate, Comparand, ARG3); + a.short_().jne(loop); + } + + /* Don't store the result if the destination is the scratch X + * register. (This instruction was originally a has_map_fields + * instruction.) */ + const auto &Dst = args[i + 1]; + if (!(Dst.getType() == ArgVal::XReg && + Dst.getValue() == SCRATCH_X_REG)) { + const int value_offset = sizeof(flatmap_t) - TAG_PRIMARY_BOXED; + mov_arg(Dst, x86::qword_ptr(ARG1, RET, 3, value_offset), ARG3); + } + } + + a.short_().jmp(next); + } - mov_imm(ARG4, args.size() / 3); - a.lea(ARG5, x86::qword_ptr(data)); - load_x_reg_array(ARG2); - runtime_call<5>(beam_jit_get_map_elements); + a.bind(generic); + { + mov_imm(ARG4, args.size() / 3); + a.lea(ARG5, x86::qword_ptr(data)); + a.mov(ARG3, E); + + emit_enter_runtime(); + + load_x_reg_array(ARG2); + runtime_call<5>(beam_jit_get_map_elements); + + emit_leave_runtime(); + + a.test(RET, RET); + a.je(resolve_beam_label(Fail)); + } + + a.bind(next); +} + +/* ARG1 = map, ARG2 = key, ARG3 = key hash + * + * Result is returned in RET. ZF is set on success. */ +void BeamGlobalAssembler::emit_i_get_map_element_hash_shared() { + Label hashmap = a.newLabel(); + + emit_ptr_val(ARG1, ARG1); + + a.mov(RETd, emit_boxed_val(ARG1, 0, sizeof(Uint32))); + a.mov(ARG4d, RETd); - emit_leave_runtime(); + a.and_(ARG4d, imm(_HEADER_MAP_SUBTAG_MASK)); + a.cmp(ARG4d, imm(HAMT_SUBTAG_HEAD_FLATMAP)); + a.short_().jne(hashmap); - a.test(RET, RET); - a.je(resolve_beam_label(Fail)); + emit_flatmap_get_element(); + + a.bind(hashmap); + emit_hashmap_get_element(); } void BeamModuleAssembler::emit_i_get_map_element_hash(const ArgVal &Fail, @@ -149,19 +489,20 @@ void BeamModuleAssembler::emit_i_get_map_element_hash(const ArgVal &Fail, mov_arg(ARG2, Key); mov_arg(ARG3, Hx); - emit_enter_runtime(); - - runtime_call<3>(get_map_element_hash); - - emit_leave_runtime(); + if (Key.isImmed() && hasCpuFeature(CpuFeatures::X86::kBMI2)) { + safe_fragment_call(ga->get_i_get_map_element_hash_shared()); + a.jne(resolve_beam_label(Fail)); + } else { + emit_enter_runtime(); + runtime_call<3>(get_map_element_hash); + emit_leave_runtime(); - emit_test_the_non_value(RET); - a.je(resolve_beam_label(Fail)); + emit_test_the_non_value(RET); + a.je(resolve_beam_label(Fail)); + } - /* - * Don't store the result if the destination is the scratch X register. - * (This instruction was originally a has_map_fields instruction.) - */ + /* Don't store the result if the destination is the scratch X register. + * (This instruction was originally a has_map_fields instruction.) */ if (!(Dst.getType() == ArgVal::XReg && Dst.getValue() == SCRATCH_X_REG)) { mov_arg(Dst, RET); } diff --git a/erts/emulator/test/map_SUITE.erl b/erts/emulator/test/map_SUITE.erl index 9cc954a937..b06c308cb1 100644 --- a/erts/emulator/test/map_SUITE.erl +++ b/erts/emulator/test/map_SUITE.erl @@ -84,6 +84,7 @@ %% instruction-level tests t_has_map_fields/1, + t_get_map_elements/1, y_regs/1, %%Bugs @@ -157,6 +158,7 @@ groups() -> %% instruction-level tests t_has_map_fields, + t_get_map_elements, y_regs, %% Bugs @@ -3142,6 +3144,19 @@ has_map_fields_3(#{a:=_,b:=_}) -> true; has_map_fields_3(#{[]:=_,42.0:=_}) -> true; has_map_fields_3(#{}) -> false. +t_get_map_elements(Config) when is_list(Config) -> + %% Tests that the JIT implementation of `get_map_elements` handles + %% collisions properly. + N = 500000, + Is = lists:seq(1, N), + Test = maps:from_list([{I,I}||I<-Is]), + [begin + #{ Key := Val } = Test, + Key = Val + end || Key <- Is], + + ok. + y_regs(Config) when is_list(Config) -> Val = [length(Config)], Map0 = y_regs_update(#{}, Val), |