Diffstat (limited to 'gcc/brig/brigfrontend/brig-basic-inst-handler.cc')
-rw-r--r--  gcc/brig/brigfrontend/brig-basic-inst-handler.cc  865
1 file changed, 865 insertions(+), 0 deletions(-)
diff --git a/gcc/brig/brigfrontend/brig-basic-inst-handler.cc b/gcc/brig/brigfrontend/brig-basic-inst-handler.cc
new file mode 100644
index 00000000000..638f818ef0b
--- /dev/null
+++ b/gcc/brig/brigfrontend/brig-basic-inst-handler.cc
@@ -0,0 +1,865 @@
+/* brig-basic-inst-handler.cc -- brig basic instruction handling
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
+ for General Processor Tech.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation; either version 3, or (at your option) any later
+ version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sstream>
+
+#include "brig-code-entry-handler.h"
+#include "brig-util.h"
+
+#include "errors.h"
+#include "gimple-expr.h"
+#include "convert.h"
+#include "print-tree.h"
+#include "tree-pretty-print.h"
+#include "langhooks.h"
+#include "stor-layout.h"
+#include "diagnostic-core.h"
+#include "brig-builtins.h"
+
+brig_basic_inst_handler::brig_basic_inst_handler (brig_to_generic &parent)
+ : brig_code_entry_handler (parent)
+{
+}
+
+class scalarized_sat_arithmetics : public tree_element_binary_visitor
+{
+public:
+ scalarized_sat_arithmetics (const BrigInstBase &brig_inst)
+ : m_brig_inst (brig_inst)
+ {
+ BrigType16_t element_type = brig_inst.type & BRIG_TYPE_BASE_MASK;
+
+#undef DEF_HSAIL_SAT_BUILTIN
+#undef DEF_HSAIL_BUILTIN
+#undef DEF_HSAIL_ATOMIC_BUILTIN
+#undef DEF_HSAIL_INTR_BUILTIN
+#undef DEF_HSAIL_CVT_ZEROI_SAT_BUILTIN
+
+#define DEF_HSAIL_SAT_BUILTIN(ENUM, BRIG_OPCODE, HSAIL_TYPE, \
+ NAME, TYPE, ATTRS) \
+ if (brig_inst.opcode == BRIG_OPCODE && element_type == HSAIL_TYPE) \
+ m_builtin = builtin_decl_explicit (ENUM); \
+ else
+#include "brig-builtins.def"
+ gcc_unreachable ();
+ }
+
+ virtual tree
+ visit_element (brig_code_entry_handler &, tree operand0, tree operand1)
+ {
+    /* Implement saturating arithmetic with scalar built-ins for now.
+       TODO: emit GENERIC nodes for the simplest cases, or at least
+       emit vector built-ins.  */
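+    /* For example, an s8 saturating add is assumed to clamp the
+       result to [-128, 127] instead of wrapping around.  */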
+ return call_builtin (m_builtin, 2, TREE_TYPE (operand0),
+ TREE_TYPE (operand0), operand0,
+ TREE_TYPE (operand1), operand1);
+ }
+ const BrigInstBase &m_brig_inst;
+ tree m_builtin;
+};
+
+/* Implements a vector shuffle.  ARITH_TYPE is the type of the vector,
+   OPERANDS[0] is the first vector, OPERANDS[1] the second vector and
+   OPERANDS[2] the shuffle mask in HSAIL format.  The output is a
+   VEC_PERM_EXPR that implements the shuffle as a GENERIC expression.  */
+
+tree
+brig_basic_inst_handler::build_shuffle (tree arith_type,
+ tree_stl_vec &operands)
+{
+ tree element_type
+ = get_unsigned_int_type (TREE_TYPE (TREE_TYPE (operands[0])));
+
+  /* Offsets to add to the mask values to convert from the
+     HSAIL mask to VEC_PERM_EXPR masks.  A VEC_PERM_EXPR mask
+     assumes an index spanning from 0 to 2 times the vector
+     width, while HSAIL refers separately to two different
+     input vectors, and thus is not a "full shuffle" where all
+     output elements can originate from any input element.  */
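+  /* For example, in a hypothetical 4-element shuffle the lower half
+     of the output picks from the first input and the upper half from
+     the second, so an HSAIL mask value of 2 in an upper output
+     element becomes 2 + 4 = 6 in the VEC_PERM_EXPR mask.  */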
+ vec<constructor_elt, va_gc> *mask_offset_vals = NULL;
+
+ vec<constructor_elt, va_gc> *input_mask_vals = NULL;
+ size_t input_mask_element_size
+ = exact_log2 (TYPE_VECTOR_SUBPARTS (arith_type));
+
+ /* Unpack the tightly packed mask elements to BIT_FIELD_REFs
+ from which to construct the mask vector as understood by
+ VEC_PERM_EXPR. */
+ tree mask_operand = add_temp_var ("shuffle_mask", operands[2]);
+
+ tree mask_element_type
+ = build_nonstandard_integer_type (input_mask_element_size, true);
+
+ for (size_t i = 0; i < TYPE_VECTOR_SUBPARTS (arith_type); ++i)
+ {
+ tree mask_element
+ = build3 (BIT_FIELD_REF, mask_element_type, mask_operand,
+ build_int_cst (unsigned_char_type_node,
+ input_mask_element_size),
+ build_int_cst (unsigned_char_type_node,
+ i * input_mask_element_size));
+
+ mask_element = convert (element_type, mask_element);
+
+ tree offset;
+ if (i < TYPE_VECTOR_SUBPARTS (arith_type) / 2)
+ offset = build_int_cst (element_type, 0);
+ else
+ offset
+ = build_int_cst (element_type, TYPE_VECTOR_SUBPARTS (arith_type));
+
+ CONSTRUCTOR_APPEND_ELT (mask_offset_vals, NULL_TREE, offset);
+ CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, mask_element);
+ }
+ tree mask_vec_type
+ = build_vector_type (element_type, TYPE_VECTOR_SUBPARTS (arith_type));
+
+ tree mask_vec = build_constructor (mask_vec_type, input_mask_vals);
+ tree offset_vec = build_constructor (mask_vec_type, mask_offset_vals);
+
+ tree mask = build2 (PLUS_EXPR, mask_vec_type, mask_vec, offset_vec);
+
+ tree perm = build3 (VEC_PERM_EXPR, TREE_TYPE (operands[0]), operands[0],
+ operands[1], mask);
+ return perm;
+}
+
+/* Unpacks (extracts) a scalar element with an index in OPERANDS[1]
+ from the vector expression in OPERANDS[0]. */
+
+tree
+brig_basic_inst_handler::build_unpack (tree_stl_vec &operands)
+{
+  /* Implement the unpack with a shuffle that moves the wanted
+     element to the lowest element position in the dest, followed by
+     a bitwise AND that clears the uppermost bits.  */
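+  /* For example, assuming a 4-element vector and an index of 2, the
+     permutation mask becomes {2, 0, 0, 0} and the AND mask
+     {-1, 0, 0, 0}, leaving the selected element in the lowest
+     position and zeros elsewhere.  */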
+ tree src_element_type = TREE_TYPE (TREE_TYPE (operands[0]));
+
+ /* Perform the operations with a raw (unsigned int type) type. */
+ tree element_type = get_unsigned_int_type (src_element_type);
+
+ vec<constructor_elt, va_gc> *input_mask_vals = NULL;
+ vec<constructor_elt, va_gc> *and_mask_vals = NULL;
+
+ size_t element_count = TYPE_VECTOR_SUBPARTS (TREE_TYPE (operands[0]));
+ tree vec_type = build_vector_type (element_type, element_count);
+
+ for (size_t i = 0; i < element_count; ++i)
+ {
+ tree mask_element;
+ if (i == 0)
+ mask_element = convert (element_type, operands[1]);
+ else
+ mask_element = build_int_cst (element_type, 0);
+
+ CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, mask_element);
+
+ tree and_mask_element;
+ if (i == 0)
+ and_mask_element = build_int_cst (element_type, -1);
+ else
+ and_mask_element = build_int_cst (element_type, 0);
+ CONSTRUCTOR_APPEND_ELT (and_mask_vals, NULL_TREE, and_mask_element);
+ }
+
+ tree mask_vec = build_constructor (vec_type, input_mask_vals);
+
+ tree and_mask_vec = build_constructor (vec_type, and_mask_vals);
+
+ tree perm = build3 (VEC_PERM_EXPR, vec_type,
+ build_reinterpret_cast (vec_type, operands[0]),
+ build_reinterpret_cast (vec_type, operands[0]), mask_vec);
+
+ tree cleared = build2 (BIT_AND_EXPR, vec_type, perm, and_mask_vec);
+
+ size_t s = int_size_in_bytes (TREE_TYPE (cleared)) * BITS_PER_UNIT;
+ tree raw_type = build_nonstandard_integer_type (s, true);
+
+ tree as_int = build_reinterpret_cast (raw_type, cleared);
+
+ if (int_size_in_bytes (src_element_type) < 4)
+ {
+ if (INTEGRAL_TYPE_P (src_element_type))
+ return extend_int (as_int, uint32_type_node, src_element_type);
+ }
+ return as_int;
+}
+
+/* Packs (inserts) the scalar element in OPERANDS[1]
+   into the vector in OPERANDS[0] at the element position defined by
+   OPERANDS[2].  */
+
+tree
+brig_basic_inst_handler::build_pack (tree_stl_vec &operands)
+{
+ /* Implement using a bit level insertion.
+ TODO: Reuse this for implementing 'bitinsert'
+ without a builtin call. */
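+  /* For example, assuming a 4 x u8 vector and position 2, the bit
+     offset is 2 * 8 = 16: bits 16..23 of the vector are cleared and
+     the new element, masked to 8 bits, is shifted left by 16 and
+     ORed in.  */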
+
+ size_t ecount = TYPE_VECTOR_SUBPARTS (TREE_TYPE (operands[0]));
+ size_t vecsize = int_size_in_bytes (TREE_TYPE (operands[0])) * BITS_PER_UNIT;
+ tree wide_type = build_nonstandard_integer_type (vecsize, 1);
+
+ tree src_vect = build_reinterpret_cast (wide_type, operands[0]);
+ src_vect = add_temp_var ("src_vect", src_vect);
+
+ tree scalar = operands[1];
+ scalar = add_temp_var ("scalar", convert_to_integer (wide_type, scalar));
+
+ tree pos = operands[2];
+
+ /* The upper bits of the position can contain garbage.
+ Zero them for well-defined semantics. */
+ tree t = build2 (BIT_AND_EXPR, TREE_TYPE (pos), operands[2],
+ build_int_cstu (TREE_TYPE (pos), ecount - 1));
+ pos = add_temp_var ("pos", convert (wide_type, t));
+
+ tree element_type = TREE_TYPE (TREE_TYPE (operands[0]));
+ size_t element_width = int_size_in_bytes (element_type) * BITS_PER_UNIT;
+ tree ewidth = build_int_cstu (wide_type, element_width);
+
+ tree bitoffset = build2 (MULT_EXPR, wide_type, ewidth, pos);
+ bitoffset = add_temp_var ("offset", bitoffset);
+
+ uint64_t mask_int
+ = element_width == 64 ? (uint64_t) -1 : ((uint64_t) 1 << element_width) - 1;
+
+ tree mask = build_int_cstu (wide_type, mask_int);
+
+ mask = add_temp_var ("mask", convert_to_integer (wide_type, mask));
+
+ tree clearing_mask
+ = build1 (BIT_NOT_EXPR, wide_type,
+ build2 (LSHIFT_EXPR, wide_type, mask, bitoffset));
+
+ tree zeroed_element
+ = build2 (BIT_AND_EXPR, wide_type, src_vect, clearing_mask);
+
+  /* TODO: Is the AND necessary?  Does HSA define what
+     happens if the upper bits in the inserted element are not
+     zero?  */
+ tree element_in_position
+ = build2 (LSHIFT_EXPR, wide_type,
+ build2 (BIT_AND_EXPR, wide_type, scalar, mask), bitoffset);
+
+ tree inserted
+ = build2 (BIT_IOR_EXPR, wide_type, zeroed_element, element_in_position);
+ return inserted;
+}
+
+/* Implements unpack{lo,hi}.  BRIG_OPCODE tells which one and
+   ARITH_TYPE describes the type of the vector arithmetic.
+   OPERANDS[0] and OPERANDS[1] are the input vectors.  */
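+/* For example, assuming 4-element inputs {a0,a1,a2,a3} and
+   {b0,b1,b2,b3}, unpacklo produces {a0,b0,a1,b1} and unpackhi
+   produces {a2,b2,a3,b3}.  */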
+
+tree
+brig_basic_inst_handler::build_unpack_lo_or_hi (BrigOpcode16_t brig_opcode,
+ tree arith_type,
+ tree_stl_vec &operands)
+{
+ tree element_type = get_unsigned_int_type (TREE_TYPE (arith_type));
+ tree mask_vec_type
+ = build_vector_type (element_type, TYPE_VECTOR_SUBPARTS (arith_type));
+
+ size_t element_count = TYPE_VECTOR_SUBPARTS (arith_type);
+ vec<constructor_elt, va_gc> *input_mask_vals = NULL;
+
+ size_t offset = (brig_opcode == BRIG_OPCODE_UNPACKLO) ? 0 : element_count / 2;
+
+ for (size_t i = 0; i < element_count / 2; ++i)
+ {
+ CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE,
+ build_int_cst (element_type, offset + i));
+ CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE,
+ build_int_cst (element_type,
+ offset + i + element_count));
+ }
+
+ tree mask_vec = build_constructor (mask_vec_type, input_mask_vals);
+
+ tree perm = build3 (VEC_PERM_EXPR, TREE_TYPE (operands[0]), operands[0],
+ operands[1], mask_vec);
+ return perm;
+}
+
+/* Builds a basic instruction expression from a BRIG instruction. BRIG_OPCODE
+ is the opcode, BRIG_TYPE the brig type of the instruction, ARITH_TYPE the
+ desired tree type for the instruction, and OPERANDS the instruction's
+ input operands already converted to tree nodes. */
+
+tree
+brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode,
+ BrigType16_t brig_type,
+ tree arith_type,
+ tree_stl_vec &operands)
+{
+ tree_code opcode = get_tree_code_for_hsa_opcode (brig_opcode, brig_type);
+
+ BrigType16_t inner_type = brig_type & BRIG_TYPE_BASE_MASK;
+
+ tree instr_inner_type
+ = VECTOR_TYPE_P (arith_type) ? TREE_TYPE (arith_type) : arith_type;
+
+ if (opcode == RSHIFT_EXPR || opcode == LSHIFT_EXPR)
+ {
+ /* HSA defines modulo/clipping behavior for shift amounts larger
+ than the bit width, while tree.def leaves it undefined.
+ We need to mask the upper bits to ensure the defined behavior. */
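+      /* For example, a shift of a 32-bit value by 33 must behave as
+	 a shift by 33 % 32 = 1, which the masking below implements.  */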
+ tree scalar_mask
+ = build_int_cst (instr_inner_type,
+ gccbrig_hsa_type_bit_size (inner_type) - 1);
+
+ tree mask = VECTOR_TYPE_P (arith_type)
+ ? build_vector_from_val (arith_type, scalar_mask)
+ : scalar_mask;
+
+ /* The shift amount is a scalar, broadcast it to produce
+ a vector shift. */
+ if (VECTOR_TYPE_P (arith_type))
+ operands[1] = build_vector_from_val (arith_type, operands[1]);
+ operands[1] = build2 (BIT_AND_EXPR, arith_type, operands[1], mask);
+ }
+
+ size_t input_count = operands.size ();
+  size_t output_count
+    = gccbrig_hsa_opcode_op_output_p (brig_opcode, 0) ? 1 : 0;
+
+ if (opcode == TREE_LIST)
+ {
+ /* There was no direct GENERIC opcode for the instruction;
+ try to emulate it with a chain of GENERIC nodes. */
+ if (brig_opcode == BRIG_OPCODE_MAD || brig_opcode == BRIG_OPCODE_MAD24)
+ {
+	  /* There doesn't seem to be a "standard" MAD built-in in gcc, so
+	     let's use a chain of multiply + add for now (the double
+	     rounding method).  It should be easier for optimizers than a
+	     custom built-in call.  WIDEN_MULT_EXPR is close, but requires
+	     a double size result type.  */
+ tree mult_res
+ = build2 (MULT_EXPR, arith_type, operands[0], operands[1]);
+ return build2 (PLUS_EXPR, arith_type, mult_res, operands[2]);
+ }
+ else if (brig_opcode == BRIG_OPCODE_MAD24HI)
+ {
+ tree mult_res
+ = build2 (MULT_HIGHPART_EXPR, arith_type, operands[0], operands[1]);
+ return build2 (PLUS_EXPR, arith_type, mult_res, operands[2]);
+ }
+ else if (brig_opcode == BRIG_OPCODE_SHUFFLE)
+ {
+ return build_shuffle (arith_type, operands);
+ }
+ else if (brig_opcode == BRIG_OPCODE_UNPACKLO
+ || brig_opcode == BRIG_OPCODE_UNPACKHI)
+ {
+ return build_unpack_lo_or_hi (brig_opcode, arith_type, operands);
+ }
+ else if (brig_opcode == BRIG_OPCODE_UNPACK)
+ {
+ return build_unpack (operands);
+ }
+ else if (brig_opcode == BRIG_OPCODE_PACK)
+ {
+ return build_pack (operands);
+ }
+ else if (brig_opcode == BRIG_OPCODE_NRSQRT)
+ {
+	  /* Implement as 1.0 / sqrt (x) and assume gcc's instruction
+	     selection maps it to a native ISA instruction other than a
+	     division, if one is available.
+	     TODO: this happens only with unsafe math optimizations
+	     enabled, which cannot be used in general while remaining
+	     HSAIL compliant.  Perhaps a builtin call would be a better
+	     option here.  */
+ return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
+ expand_or_call_builtin (BRIG_OPCODE_SQRT, brig_type,
+ arith_type, operands));
+ }
+ else if (brig_opcode == BRIG_OPCODE_NRCP)
+ {
+	  /* Implement as 1.0 / x and assume gcc's instruction selection
+	     maps it to a native ISA instruction other than a division,
+	     if one is available.  */
+ return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
+ operands[0]);
+ }
+ else if (brig_opcode == BRIG_OPCODE_LANEID
+ || brig_opcode == BRIG_OPCODE_MAXWAVEID
+ || brig_opcode == BRIG_OPCODE_WAVEID)
+ {
+ /* Assuming WAVESIZE 1 (for now), therefore LANEID, WAVEID and
+ MAXWAVEID always return 0. */
+ return build_zero_cst (arith_type);
+ }
+ else
+ gcc_unreachable ();
+ }
+ else if (opcode == CALL_EXPR)
+ return expand_or_call_builtin (brig_opcode, brig_type, arith_type,
+ operands);
+ else if (output_count == 1)
+ {
+ if (input_count == 1)
+ {
+ if (opcode == MODIFY_EXPR)
+ return operands[0];
+ else
+ return build1 (opcode, arith_type, operands[0]);
+ }
+ else if (input_count == 2)
+ return build2 (opcode, arith_type, operands[0], operands[1]);
+ else if (input_count == 3)
+ return build3 (opcode, arith_type, operands[0], operands[1],
+ operands[2]);
+ else
+ gcc_unreachable ();
+ }
+ else
+ gcc_unreachable ();
+
+ return NULL_TREE;
+}
+
+/* Handles the basic instructions, including packed instructions. Deals
+ with the different packing modes by unpacking/packing the wanted
+ elements. Delegates most of the instruction cases to build_inst_expr(). */
+
+size_t
+brig_basic_inst_handler::operator () (const BrigBase *base)
+{
+ const BrigInstBase *brig_inst = (const BrigInstBase *) base;
+
+ tree_stl_vec operands = build_operands (*brig_inst);
+
+ size_t output_count
+ = gccbrig_hsa_opcode_op_output_p (brig_inst->opcode, 0) ? 1 : 0;
+ size_t input_count
+ = operands.size () == 0 ? 0 : (operands.size () - output_count);
+
+ gcc_assert (output_count == 0 || output_count == 1);
+
+ tree_stl_vec::iterator first_input_i = operands.begin ();
+ if (output_count > 0 && operands.size () > 0)
+ ++first_input_i;
+
+ tree_stl_vec in_operands;
+ in_operands.assign (first_input_i, operands.end ());
+
+ BrigType16_t brig_inst_type = brig_inst->type;
+
+ if (brig_inst->opcode == BRIG_OPCODE_NOP)
+ return base->byteCount;
+ else if (brig_inst->opcode == BRIG_OPCODE_FIRSTBIT
+ || brig_inst->opcode == BRIG_OPCODE_LASTBIT
+ || brig_inst->opcode == BRIG_OPCODE_SAD)
+ /* These instructions are reported to be always 32b in HSAIL, but we want
+ to treat them according to their input argument's type to select the
+ correct instruction/builtin. */
+ brig_inst_type
+ = gccbrig_tree_type_to_hsa_type (TREE_TYPE (in_operands[0]));
+
+ tree instr_type = gccbrig_tree_type_for_hsa_type (brig_inst_type);
+
+ if (!instr_type)
+ {
+ gcc_unreachable ();
+ return base->byteCount;
+ }
+
+ bool is_vec_instr = hsa_type_packed_p (brig_inst_type);
+
+ size_t element_size_bits;
+ size_t element_count;
+
+ if (is_vec_instr)
+ {
+ BrigType16_t brig_element_type = brig_inst_type & BRIG_TYPE_BASE_MASK;
+ element_size_bits = gccbrig_hsa_type_bit_size (brig_element_type);
+ element_count = gccbrig_hsa_type_bit_size (brig_inst_type)
+ / gccbrig_hsa_type_bit_size (brig_element_type);
+ }
+ else
+ {
+ element_size_bits = gccbrig_hsa_type_bit_size (brig_inst_type);
+ element_count = 1;
+ }
+
+  /* The actual arithmetic type the operation should be performed
+     in.  This is not always the same as the original BRIG opcode's
+     type, due to implicit conversions of the storage-only f16.  */
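+  /* For example, f16 is a storage-only format: its arithmetic is
+     assumed to be performed in f32, with the result converted back
+     when stored.  */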
+ tree arith_type = gccbrig_is_bit_operation (brig_inst->opcode)
+ ? gccbrig_tree_type_for_hsa_type (brig_inst_type)
+ : get_tree_expr_type_for_hsa_type (brig_inst_type);
+
+ tree instr_expr = NULL_TREE;
+
+ BrigPack8_t p = BRIG_PACK_NONE;
+ if (brig_inst->base.kind == BRIG_KIND_INST_MOD)
+ p = ((const BrigInstMod *) brig_inst)->pack;
+ else if (brig_inst->base.kind == BRIG_KIND_INST_CMP)
+ p = ((const BrigInstCmp *) brig_inst)->pack;
+
+ if (p == BRIG_PACK_PS || p == BRIG_PACK_PSSAT)
+ in_operands[1] = build_lower_element_broadcast (in_operands[1]);
+ else if (p == BRIG_PACK_SP || p == BRIG_PACK_SPSAT)
+ in_operands[0] = build_lower_element_broadcast (in_operands[0]);
+
+ tree_code opcode
+ = get_tree_code_for_hsa_opcode (brig_inst->opcode, brig_inst_type);
+
+ if (p >= BRIG_PACK_PPSAT && p <= BRIG_PACK_PSAT)
+ {
+ scalarized_sat_arithmetics sat_arith (*brig_inst);
+ gcc_assert (input_count == 2);
+ instr_expr = sat_arith (*this, in_operands[0], in_operands[1]);
+ }
+ else if (opcode == RETURN_EXPR)
+ {
+ if (m_parent.m_cf->m_is_kernel)
+ {
+ tree goto_stmt
+ = build1 (GOTO_EXPR, void_type_node, m_parent.m_cf->m_exit_label);
+ m_parent.m_cf->append_statement (goto_stmt);
+ return base->byteCount;
+ }
+ else
+ {
+ m_parent.m_cf->append_return_stmt ();
+ return base->byteCount;
+ }
+ }
+  else if (opcode == MULT_HIGHPART_EXPR
+	   && is_vec_instr && element_size_bits < 64)
+ {
+      /* MULT_HIGHPART_EXPR works only on target dependent vector sizes,
+	 and even the scalar versions do not seem to work, at least
+	 for char elements.
+
+	 Fall back to scalarization and promotion of the vector elements
+	 to larger types, with the MULHI computed as a regular MUL
+	 followed by a shift.  MULHI for 2x64b seems to work with the
+	 Intel CPUs tested so far, so that case is passed on for vector
+	 processing and no 128b scalar arithmetic is needed.
+
+	 This is not modular, as this kind of lowering does not belong
+	 in the frontend; there should be a legalization phase before
+	 the backend that figures out the best way to compute the MULHI
+	 for any integer vector datatype.
+
+	 TODO: promote to larger vector types instead.  For example
+	 MULT_HIGHPART_EXPR with s8x8 doesn't work, but s16x8 seems to,
+	 at least on x86-64.  */
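+      /* For example, in an assumed u8 case the operands are promoted
+	 to u16, multiplied, and the high half is taken with a right
+	 shift by 8.  */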
+ tree_stl_vec operand0_elements;
+ if (input_count > 0)
+ unpack (in_operands[0], operand0_elements);
+
+ tree_stl_vec operand1_elements;
+ if (input_count > 1)
+ unpack (in_operands[1], operand1_elements);
+
+ tree_stl_vec result_elements;
+
+ tree scalar_type = TREE_TYPE (arith_type);
+ BrigType16_t element_type = brig_inst_type & BRIG_TYPE_BASE_MASK;
+ tree promoted_type = short_integer_type_node;
+ switch (element_type)
+ {
+ case BRIG_TYPE_S8:
+ promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S16);
+ break;
+ case BRIG_TYPE_U8:
+ promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U16);
+ break;
+ case BRIG_TYPE_S16:
+ promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S32);
+ break;
+ case BRIG_TYPE_U16:
+ promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U32);
+ break;
+ case BRIG_TYPE_S32:
+ promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S64);
+ break;
+ case BRIG_TYPE_U32:
+ promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U64);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ size_t promoted_type_size = int_size_in_bytes (promoted_type) * 8;
+
+ for (size_t i = 0; i < TYPE_VECTOR_SUBPARTS (arith_type); ++i)
+ {
+ tree operand0 = convert (promoted_type, operand0_elements.at (i));
+ tree operand1 = convert (promoted_type, operand1_elements.at (i));
+
+ tree scalar_expr
+ = build2 (MULT_EXPR, promoted_type, operand0, operand1);
+
+ scalar_expr
+ = build2 (RSHIFT_EXPR, promoted_type, scalar_expr,
+ build_int_cstu (promoted_type, promoted_type_size / 2));
+
+ result_elements.push_back (convert (scalar_type, scalar_expr));
+ }
+ instr_expr = pack (result_elements);
+ }
+ else
+ {
+      /* 'class' always has the b1 type; consider it by its float
+	 source type when building the instruction, so that the
+	 correct builtin is found.  */
+ if (brig_inst->opcode == BRIG_OPCODE_CLASS)
+ brig_inst_type = ((const BrigInstSourceType *) base)->sourceType;
+ instr_expr = build_inst_expr (brig_inst->opcode, brig_inst_type,
+ arith_type, in_operands);
+ }
+
+ if (instr_expr == NULL_TREE)
+ {
+ gcc_unreachable ();
+ return base->byteCount;
+ }
+
+ if (p == BRIG_PACK_SS || p == BRIG_PACK_S || p == BRIG_PACK_SSSAT
+ || p == BRIG_PACK_SSAT)
+ {
+      /* In case of _s_ or _ss_, select only the lowest element
+	 of the new result into the output.  We could extract
+	 the element and use a scalar operation, but try
+	 to keep the data in vector registers as much as possible
+	 to avoid copies between the scalar and vector datapaths.  */
+ tree old_value;
+ tree half_storage_type = gccbrig_tree_type_for_hsa_type (brig_inst_type);
+ bool is_fp16_operation
+ = (brig_inst_type & BRIG_TYPE_BASE_MASK) == BRIG_TYPE_F16
+ && !gccbrig_is_bit_operation (brig_inst->opcode);
+
+ if (is_fp16_operation)
+ old_value = build_h2f_conversion
+ (build_reinterpret_cast (half_storage_type, operands[0]));
+ else
+ old_value
+ = build_reinterpret_cast (TREE_TYPE (instr_expr), operands[0]);
+
+ size_t esize = is_fp16_operation ? 32 : element_size_bits;
+
+      /* Construct a permutation mask where all elements other than
+	 the lowest one are picked from old_value.  */
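+      /* With a hypothetical 4-element vector the mask becomes
+	 {4, 1, 2, 3}: index 4 picks the lowest element of new_value,
+	 while 1..3 keep the upper elements of old_value.  */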
+ tree mask_inner_type = build_nonstandard_integer_type (esize, 1);
+ vec<constructor_elt, va_gc> *constructor_vals = NULL;
+ for (size_t i = 0; i < element_count; ++i)
+ {
+ tree cst;
+
+ if (i == 0)
+ cst = build_int_cstu (mask_inner_type, element_count);
+ else
+ cst = build_int_cstu (mask_inner_type, i);
+ CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, cst);
+ }
+ tree mask_vec_type = build_vector_type (mask_inner_type, element_count);
+ tree mask = build_vector_from_ctor (mask_vec_type, constructor_vals);
+
+ tree new_value = create_tmp_var (TREE_TYPE (instr_expr), "new_output");
+ tree assign
+ = build2 (MODIFY_EXPR, TREE_TYPE (instr_expr), new_value, instr_expr);
+ m_parent.m_cf->append_statement (assign);
+
+ instr_expr
+ = build3 (VEC_PERM_EXPR, arith_type, old_value, new_value, mask);
+
+ tree lower_output = create_tmp_var (TREE_TYPE (instr_expr), "s_output");
+ tree assign_lower = build2 (MODIFY_EXPR, TREE_TYPE (instr_expr),
+ lower_output, instr_expr);
+ m_parent.m_cf->append_statement (assign_lower);
+ instr_expr = lower_output;
+ }
+
+ if (output_count == 1)
+ build_output_assignment (*brig_inst, operands[0], instr_expr);
+ else
+ m_parent.m_cf->append_statement (instr_expr);
+ return base->byteCount;
+}
+
+/* Create an expression that broadcasts the lowest element of the
+ vector in VEC_OPERAND to all elements of the returned vector. */
+
+tree
+brig_basic_inst_handler::build_lower_element_broadcast (tree vec_operand)
+{
+ /* Build the broadcast using shuffle because there's no
+ direct broadcast in GENERIC and this way there's no need for
+ a separate extract of the lowest element. */
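+  /* With a hypothetical 4-element vector the mask becomes
+     {4, 4, 4, 4}: as the same vector is given as both inputs,
+     index 4 refers to its lowest element, which is thus replicated
+     to every output position.  */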
+ tree element_type = TREE_TYPE (TREE_TYPE (vec_operand));
+ size_t esize = 8 * int_size_in_bytes (element_type);
+
+ size_t element_count = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_operand));
+ tree mask_inner_type = build_nonstandard_integer_type (esize, 1);
+ vec<constructor_elt, va_gc> *constructor_vals = NULL;
+
+ /* Construct the mask. */
+ for (size_t i = 0; i < element_count; ++i)
+ {
+ tree cst = build_int_cstu (mask_inner_type, element_count);
+ CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, cst);
+ }
+ tree mask_vec_type = build_vector_type (mask_inner_type, element_count);
+ tree mask = build_vector_from_ctor (mask_vec_type, constructor_vals);
+
+ return build3 (VEC_PERM_EXPR, TREE_TYPE (vec_operand), vec_operand,
+ vec_operand, mask);
+}
+
+/* Returns the tree code that should be used to implement the given
+ HSA instruction opcode (BRIG_OPCODE) for the given type of instruction
+ (BRIG_TYPE). In case the opcode cannot be mapped to a TREE node directly,
+ returns TREE_LIST (if it can be emulated with a simple chain of tree
+ nodes) or CALL_EXPR if the opcode should be implemented using a builtin
+ call. */
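+/* For example, BRIG_OPCODE_MAD has no direct GENERIC counterpart and
+   thus maps to TREE_LIST; build_inst_expr () then emulates it with a
+   MULT_EXPR followed by a PLUS_EXPR.  */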
+
+tree_code
+brig_basic_inst_handler::get_tree_code_for_hsa_opcode
+ (BrigOpcode16_t brig_opcode, BrigType16_t brig_type) const
+{
+ BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK;
+ switch (brig_opcode)
+ {
+ case BRIG_OPCODE_NOP:
+ return NOP_EXPR;
+ case BRIG_OPCODE_ADD:
+ return PLUS_EXPR;
+ case BRIG_OPCODE_CMOV:
+ if (brig_inner_type == brig_type)
+ return COND_EXPR;
+ else
+ return VEC_COND_EXPR;
+ case BRIG_OPCODE_SUB:
+ return MINUS_EXPR;
+ case BRIG_OPCODE_MUL:
+ case BRIG_OPCODE_MUL24:
+ return MULT_EXPR;
+ case BRIG_OPCODE_MULHI:
+ case BRIG_OPCODE_MUL24HI:
+ return MULT_HIGHPART_EXPR;
+ case BRIG_OPCODE_DIV:
+ if (gccbrig_is_float_type (brig_inner_type))
+ return RDIV_EXPR;
+ else
+ return TRUNC_DIV_EXPR;
+ case BRIG_OPCODE_NEG:
+ return NEGATE_EXPR;
+ case BRIG_OPCODE_MIN:
+ if (gccbrig_is_float_type (brig_inner_type))
+ return CALL_EXPR;
+ else
+ return MIN_EXPR;
+ case BRIG_OPCODE_MAX:
+ if (gccbrig_is_float_type (brig_inner_type))
+ return CALL_EXPR;
+ else
+ return MAX_EXPR;
+ case BRIG_OPCODE_FMA:
+ return FMA_EXPR;
+ case BRIG_OPCODE_ABS:
+ return ABS_EXPR;
+ case BRIG_OPCODE_SHL:
+ return LSHIFT_EXPR;
+ case BRIG_OPCODE_SHR:
+ return RSHIFT_EXPR;
+ case BRIG_OPCODE_OR:
+ return BIT_IOR_EXPR;
+ case BRIG_OPCODE_XOR:
+ return BIT_XOR_EXPR;
+ case BRIG_OPCODE_AND:
+ return BIT_AND_EXPR;
+ case BRIG_OPCODE_NOT:
+ return BIT_NOT_EXPR;
+ case BRIG_OPCODE_RET:
+ return RETURN_EXPR;
+ case BRIG_OPCODE_MOV:
+ case BRIG_OPCODE_LDF:
+ return MODIFY_EXPR;
+ case BRIG_OPCODE_LD:
+ case BRIG_OPCODE_ST:
+ return MEM_REF;
+ case BRIG_OPCODE_BR:
+ return GOTO_EXPR;
+ case BRIG_OPCODE_REM:
+ if (brig_type == BRIG_TYPE_U64 || brig_type == BRIG_TYPE_U32)
+ return TRUNC_MOD_EXPR;
+ else
+ return CALL_EXPR;
+ case BRIG_OPCODE_NRCP:
+ case BRIG_OPCODE_NRSQRT:
+    /* Implement as 1 / f (x).  gcc should pattern-detect that and
+       use a native instruction for it, if available.  */
+ return TREE_LIST;
+ case BRIG_OPCODE_FLOOR:
+ case BRIG_OPCODE_CEIL:
+ case BRIG_OPCODE_SQRT:
+ case BRIG_OPCODE_NSQRT:
+ case BRIG_OPCODE_RINT:
+ case BRIG_OPCODE_TRUNC:
+ case BRIG_OPCODE_POPCOUNT:
+ case BRIG_OPCODE_COPYSIGN:
+ case BRIG_OPCODE_NCOS:
+ case BRIG_OPCODE_NSIN:
+ case BRIG_OPCODE_NLOG2:
+ case BRIG_OPCODE_NEXP2:
+ case BRIG_OPCODE_NFMA:
+      /* Class has the type B1 regardless of the float type, so the
+	 builtin map search below cannot find it.  */
+ case BRIG_OPCODE_CLASS:
+ case BRIG_OPCODE_WORKITEMABSID:
+ return CALL_EXPR;
+    default:
+      /* Some BRIG opcodes can use the same builtins for unsigned and
+	 signed types.  Force these cases to unsigned types.  */
+
+ if (brig_opcode == BRIG_OPCODE_BORROW
+ || brig_opcode == BRIG_OPCODE_CARRY
+ || brig_opcode == BRIG_OPCODE_LASTBIT
+ || brig_opcode == BRIG_OPCODE_BITINSERT)
+ {
+ if (brig_type == BRIG_TYPE_S32)
+ brig_type = BRIG_TYPE_U32;
+ else if (brig_type == BRIG_TYPE_S64)
+ brig_type = BRIG_TYPE_U64;
+ }
+
+ builtin_map::const_iterator i
+ = s_custom_builtins.find (std::make_pair (brig_opcode, brig_type));
+ if (i != s_custom_builtins.end ())
+ return CALL_EXPR;
+ else if (s_custom_builtins.find
+ (std::make_pair (brig_opcode, brig_inner_type))
+ != s_custom_builtins.end ())
+ return CALL_EXPR;
+ if (brig_inner_type == BRIG_TYPE_F16
+ && s_custom_builtins.find
+ (std::make_pair (brig_opcode, BRIG_TYPE_F32))
+ != s_custom_builtins.end ())
+ return CALL_EXPR;
+ break;
+ }
+ return TREE_LIST; /* Emulate using a chain of nodes. */
+}