Diffstat (limited to 'gcc/brig/brigfrontend/brig-basic-inst-handler.cc')
-rw-r--r--  gcc/brig/brigfrontend/brig-basic-inst-handler.cc  865
1 file changed, 865 insertions(+), 0 deletions(-)
diff --git a/gcc/brig/brigfrontend/brig-basic-inst-handler.cc b/gcc/brig/brigfrontend/brig-basic-inst-handler.cc
new file mode 100644
index 00000000000..638f818ef0b
--- /dev/null
+++ b/gcc/brig/brigfrontend/brig-basic-inst-handler.cc
@@ -0,0 +1,865 @@
+/* brig-basic-inst-handler.cc -- brig basic instruction handling
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
+ for General Processor Tech.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation; either version 3, or (at your option) any later
+ version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sstream>
+
+#include "brig-code-entry-handler.h"
+#include "brig-util.h"
+
+#include "errors.h"
+#include "gimple-expr.h"
+#include "convert.h"
+#include "print-tree.h"
+#include "tree-pretty-print.h"
+#include "langhooks.h"
+#include "stor-layout.h"
+#include "diagnostic-core.h"
+#include "brig-builtins.h"
+
+brig_basic_inst_handler::brig_basic_inst_handler (brig_to_generic &parent)
+ : brig_code_entry_handler (parent)
+{
+}
+
+class scalarized_sat_arithmetics : public tree_element_binary_visitor
+{
+public:
+ scalarized_sat_arithmetics (const BrigInstBase &brig_inst)
+ : m_brig_inst (brig_inst)
+ {
+ BrigType16_t element_type = brig_inst.type & BRIG_TYPE_BASE_MASK;
+
+#undef DEF_HSAIL_SAT_BUILTIN
+#undef DEF_HSAIL_BUILTIN
+#undef DEF_HSAIL_ATOMIC_BUILTIN
+#undef DEF_HSAIL_INTR_BUILTIN
+#undef DEF_HSAIL_CVT_ZEROI_SAT_BUILTIN
+
+#define DEF_HSAIL_SAT_BUILTIN(ENUM, BRIG_OPCODE, HSAIL_TYPE, \
+ NAME, TYPE, ATTRS) \
+ if (brig_inst.opcode == BRIG_OPCODE && element_type == HSAIL_TYPE) \
+ m_builtin = builtin_decl_explicit (ENUM); \
+ else
+#include "brig-builtins.def"
+ gcc_unreachable ();
+ }
+
+ virtual tree
+ visit_element (brig_code_entry_handler &, tree operand0, tree operand1)
+ {
+    /* Implement saturating arithmetic with scalar built-ins for now.
+       TODO: emit GENERIC nodes for the simplest cases, or at least
+       emit vector built-ins.  */
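+    /* For example, an s8 saturating add is assumed to clamp the
+       result to [-128, 127] instead of wrapping around.  */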
+ return call_builtin (m_builtin, 2, TREE_TYPE (operand0),
+ TREE_TYPE (operand0), operand0,
+ TREE_TYPE (operand1), operand1);
+ }
+ const BrigInstBase &m_brig_inst;
+ tree m_builtin;
+};
+
+/* Implements a vector shuffle.  ARITH_TYPE is the type of the vector,
+   OPERANDS[0] is the first vector, OPERANDS[1] the second vector and
+   OPERANDS[2] the shuffle mask in HSAIL format.  The output is a
+   VEC_PERM_EXPR that implements the shuffle as a GENERIC expression.  */
+
+tree
+brig_basic_inst_handler::build_shuffle (tree arith_type,
+ tree_stl_vec &operands)
+{
+ tree element_type
+ = get_unsigned_int_type (TREE_TYPE (TREE_TYPE (operands[0])));
+
+  /* Offsets to add to the mask values to convert from the
+     HSAIL mask to VEC_PERM_EXPR masks.  A VEC_PERM_EXPR mask
+     assumes an index spanning from 0 to 2 times the vector
+     width, while HSAIL refers separately to two different
+     input vectors, and thus is not a "full shuffle" where all
+     output elements can originate from any input element.  */
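+  /* For example, in a hypothetical 4-element shuffle the lower half
+     of the output picks from the first input and the upper half from
+     the second, so an HSAIL mask value of 2 in an upper output
+     element becomes 2 + 4 = 6 in the VEC_PERM_EXPR mask.  */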
+ vec<constructor_elt, va_gc> *mask_offset_vals = NULL;
+
+ vec<constructor_elt, va_gc> *input_mask_vals = NULL;
+ size_t input_mask_element_size
+ = exact_log2 (TYPE_VECTOR_SUBPARTS (arith_type));
+
+ /* Unpack the tightly packed mask elements to BIT_FIELD_REFs
+ from which to construct the mask vector as understood by
+ VEC_PERM_EXPR. */
+ tree mask_operand = add_temp_var ("shuffle_mask", operands[2]);
+
+ tree mask_element_type
+ = build_nonstandard_integer_type (input_mask_element_size, true);
+
+ for (size_t i = 0; i < TYPE_VECTOR_SUBPARTS (arith_type); ++i)
+ {
+ tree mask_element
+ = build3 (BIT_FIELD_REF, mask_element_type, mask_operand,
+ build_int_cst (unsigned_char_type_node,
+ input_mask_element_size),
+ build_int_cst (unsigned_char_type_node,
+ i * input_mask_element_size));
+
+ mask_element = convert (element_type, mask_element);
+
+ tree offset;
+ if (i < TYPE_VECTOR_SUBPARTS (arith_type) / 2)
+ offset = build_int_cst (element_type, 0);
+ else
+ offset
+ = build_int_cst (element_type, TYPE_VECTOR_SUBPARTS (arith_type));
+
+ CONSTRUCTOR_APPEND_ELT (mask_offset_vals, NULL_TREE, offset);
+ CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, mask_element);
+ }
+ tree mask_vec_type
+ = build_vector_type (element_type, TYPE_VECTOR_SUBPARTS (arith_type));
+
+ tree mask_vec = build_constructor (mask_vec_type, input_mask_vals);
+ tree offset_vec = build_constructor (mask_vec_type, mask_offset_vals);
+
+ tree mask = build2 (PLUS_EXPR, mask_vec_type, mask_vec, offset_vec);
+
+ tree perm = build3 (VEC_PERM_EXPR, TREE_TYPE (operands[0]), operands[0],
+ operands[1], mask);
+ return perm;
+}
+
+/* Unpacks (extracts) a scalar element with an index in OPERANDS[1]
+ from the vector expression in OPERANDS[0]. */
+
+tree
+brig_basic_inst_handler::build_unpack (tree_stl_vec &operands)
+{
+  /* Implement the unpack with a shuffle that moves the wanted
+     element to the lowest element position in the dest, followed by
+     a bitwise AND that clears the uppermost bits.  */
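+  /* For example, assuming a 4-element vector and an index of 2, the
+     permutation mask becomes {2, 0, 0, 0} and the AND mask
+     {-1, 0, 0, 0}, leaving the selected element in the lowest
+     position and zeros elsewhere.  */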
+ tree src_element_type = TREE_TYPE (TREE_TYPE (operands[0]));
+
+ /* Perform the operations with a raw (unsigned int type) type. */
+ tree element_type = get_unsigned_int_type (src_element_type);
+
+ vec<constructor_elt, va_gc> *input_mask_vals = NULL;
+ vec<constructor_elt, va_gc> *and_mask_vals = NULL;
+
+ size_t element_count = TYPE_VECTOR_SUBPARTS (TREE_TYPE (operands[0]));
+ tree vec_type = build_vector_type (element_type, element_count);
+
+ for (size_t i = 0; i < element_count; ++i)
+ {
+ tree mask_element;
+ if (i == 0)
+ mask_element = convert (element_type, operands[1]);
+ else
+ mask_element = build_int_cst (element_type, 0);
+
+ CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, mask_element);
+
+ tree and_mask_element;
+ if (i == 0)
+ and_mask_element = build_int_cst (element_type, -1);
+ else
+ and_mask_element = build_int_cst (element_type, 0);
+ CONSTRUCTOR_APPEND_ELT (and_mask_vals, NULL_TREE, and_mask_element);
+ }
+
+ tree mask_vec = build_constructor (vec_type, input_mask_vals);
+
+ tree and_mask_vec = build_constructor (vec_type, and_mask_vals);
+
+ tree perm = build3 (VEC_PERM_EXPR, vec_type,
+ build_reinterpret_cast (vec_type, operands[0]),
+ build_reinterpret_cast (vec_type, operands[0]), mask_vec);
+
+ tree cleared = build2 (BIT_AND_EXPR, vec_type, perm, and_mask_vec);
+
+ size_t s = int_size_in_bytes (TREE_TYPE (cleared)) * BITS_PER_UNIT;
+ tree raw_type = build_nonstandard_integer_type (s, true);
+
+ tree as_int = build_reinterpret_cast (raw_type, cleared);
+
+ if (int_size_in_bytes (src_element_type) < 4)
+ {
+ if (INTEGRAL_TYPE_P (src_element_type))
+ return extend_int (as_int, uint32_type_node, src_element_type);
+ }
+ return as_int;
+}
+
+/* Packs (inserts) the scalar element in OPERANDS[1]
+   into the vector in OPERANDS[0] at the element position defined by
+   OPERANDS[2].  */
+
+tree
+brig_basic_inst_handler::build_pack (tree_stl_vec &operands)
+{
+ /* Implement using a bit level insertion.
+ TODO: Reuse this for implementing 'bitinsert'
+ without a builtin call. */
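+  /* For example, assuming a 4 x u8 vector and position 2, the bit
+     offset is 2 * 8 = 16: bits 16..23 of the vector are cleared and
+     the new element, masked to 8 bits, is shifted left by 16 and
+     ORed in.  */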
+
+ size_t ecount = TYPE_VECTOR_SUBPARTS (TREE_TYPE (operands[0]));
+ size_t vecsize = int_size_in_bytes (TREE_TYPE (operands[0])) * BITS_PER_UNIT;
+ tree wide_type = build_nonstandard_integer_type (vecsize, 1);
+
+ tree src_vect = build_reinterpret_cast (wide_type, operands[0]);
+ src_vect = add_temp_var ("src_vect", src_vect);
+
+ tree scalar = operands[1];
+ scalar = add_temp_var ("scalar", convert_to_integer (wide_type, scalar));
+
+ tree pos = operands[2];
+
+ /* The upper bits of the position can contain garbage.
+ Zero them for well-defined semantics. */
+ tree t = build2 (BIT_AND_EXPR, TREE_TYPE (pos), operands[2],
+ build_int_cstu (TREE_TYPE (pos), ecount - 1));
+ pos = add_temp_var ("pos", convert (wide_type, t));
+
+ tree element_type = TREE_TYPE (TREE_TYPE (operands[0]));
+ size_t element_width = int_size_in_bytes (element_type) * BITS_PER_UNIT;
+ tree ewidth = build_int_cstu (wide_type, element_width);
+
+ tree bitoffset = build2 (MULT_EXPR, wide_type, ewidth, pos);
+ bitoffset = add_temp_var ("offset", bitoffset);
+
+ uint64_t mask_int
+ = element_width == 64 ? (uint64_t) -1 : ((uint64_t) 1 << element_width) - 1;
+
+ tree mask = build_int_cstu (wide_type, mask_int);
+
+ mask = add_temp_var ("mask", convert_to_integer (wide_type, mask));
+
+ tree clearing_mask
+ = build1 (BIT_NOT_EXPR, wide_type,
+ build2 (LSHIFT_EXPR, wide_type, mask, bitoffset));
+
+ tree zeroed_element
+ = build2 (BIT_AND_EXPR, wide_type, src_vect, clearing_mask);
+
+  /* TODO: Is the AND necessary?  Does HSA define what
+     happens if the upper bits in the inserted element are not
+     zero?  */
+ tree element_in_position
+ = build2 (LSHIFT_EXPR, wide_type,
+ build2 (BIT_AND_EXPR, wide_type, scalar, mask), bitoffset);
+
+ tree inserted
+ = build2 (BIT_IOR_EXPR, wide_type, zeroed_element, element_in_position);
+ return inserted;
+}
+
+/* Implements unpack{lo,hi}.  BRIG_OPCODE tells which one and
+   ARITH_TYPE describes the type of the vector arithmetic.
+   OPERANDS[0] and OPERANDS[1] are the input vectors.  */
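+/* For example, assuming 4-element inputs {a0,a1,a2,a3} and
+   {b0,b1,b2,b3}, unpacklo produces {a0,b0,a1,b1} and unpackhi
+   produces {a2,b2,a3,b3}.  */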
+
+tree
+brig_basic_inst_handler::build_unpack_lo_or_hi (BrigOpcode16_t brig_opcode,
+ tree arith_type,
+ tree_stl_vec &operands)
+{
+ tree element_type = get_unsigned_int_type (TREE_TYPE (arith_type));
+ tree mask_vec_type
+ = build_vector_type (element_type, TYPE_VECTOR_SUBPARTS (arith_type));
+
+ size_t element_count = TYPE_VECTOR_SUBPARTS (arith_type);
+ vec<constructor_elt, va_gc> *input_mask_vals = NULL;
+
+ size_t offset = (brig_opcode == BRIG_OPCODE_UNPACKLO) ? 0 : element_count / 2;
+
+ for (size_t i = 0; i < element_count / 2; ++i)
+ {
+ CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE,
+ build_int_cst (element_type, offset + i));
+ CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE,
+ build_int_cst (element_type,
+ offset + i + element_count));
+ }
+
+ tree mask_vec = build_constructor (mask_vec_type, input_mask_vals);
+
+ tree perm = build3 (VEC_PERM_EXPR, TREE_TYPE (operands[0]), operands[0],
+ operands[1], mask_vec);
+ return perm;
+}
+
+/* Builds a basic instruction expression from a BRIG instruction. BRIG_OPCODE
+ is the opcode, BRIG_TYPE the brig type of the instruction, ARITH_TYPE the
+ desired tree type for the instruction, and OPERANDS the instruction's
+ input operands already converted to tree nodes. */
+
+tree
+brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode,
+ BrigType16_t brig_type,
+ tree arith_type,
+ tree_stl_vec &operands)
+{
+ tree_code opcode = get_tree_code_for_hsa_opcode (brig_opcode, brig_type);
+
+ BrigType16_t inner_type = brig_type & BRIG_TYPE_BASE_MASK;
+
+ tree instr_inner_type
+ = VECTOR_TYPE_P (arith_type) ? TREE_TYPE (arith_type) : arith_type;
+
+ if (opcode == RSHIFT_EXPR || opcode == LSHIFT_EXPR)
+ {
+ /* HSA defines modulo/clipping behavior for shift amounts larger
+ than the bit width, while tree.def leaves it undefined.
+ We need to mask the upper bits to ensure the defined behavior. */
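+      /* For example, a shift of a 32-bit value by 33 must behave as
+	 a shift by 33 % 32 = 1, which the masking below implements.  */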
+ tree scalar_mask
+ = build_int_cst (instr_inner_type,
+ gccbrig_hsa_type_bit_size (inner_type) - 1);
+
+ tree mask = VECTOR_TYPE_P (arith_type)
+ ? build_vector_from_val (arith_type, scalar_mask)
+ : scalar_mask;
+
+ /* The shift amount is a scalar, broadcast it to produce
+ a vector shift. */
+ if (VECTOR_TYPE_P (arith_type))
+ operands[1] = build_vector_from_val (arith_type, operands[1]);
+ operands[1] = build2 (BIT_AND_EXPR, arith_type, operands[1], mask);
+ }
+
+ size_t input_count = operands.size ();
+  size_t output_count
+    = gccbrig_hsa_opcode_op_output_p (brig_opcode, 0) ? 1 : 0;
+
+ if (opcode == TREE_LIST)
+ {
+ /* There was no direct GENERIC opcode for the instruction;
+ try to emulate it with a chain of GENERIC nodes. */
+ if (brig_opcode == BRIG_OPCODE_MAD || brig_opcode == BRIG_OPCODE_MAD24)
+ {
+	  /* There doesn't seem to be a "standard" MAD built-in in gcc, so
+	     let's use a chain of multiply + add for now (the double
+	     rounding method).  It should be easier for optimizers than a
+	     custom built-in call.  WIDEN_MULT_EXPR is close, but requires
+	     a double size result type.  */
+ tree mult_res
+ = build2 (MULT_EXPR, arith_type, operands[0], operands[1]);
+ return build2 (PLUS_EXPR, arith_type, mult_res, operands[2]);
+ }
+ else if (brig_opcode == BRIG_OPCODE_MAD24HI)
+ {
+ tree mult_res
+ = build2 (MULT_HIGHPART_EXPR, arith_type, operands[0], operands[1]);
+ return build2 (PLUS_EXPR, arith_type, mult_res, operands[2]);
+ }
+ else if (brig_opcode == BRIG_OPCODE_SHUFFLE)
+ {
+ return build_shuffle (arith_type, operands);
+ }
+ else if (brig_opcode == BRIG_OPCODE_UNPACKLO
+ || brig_opcode == BRIG_OPCODE_UNPACKHI)
+ {
+ return build_unpack_lo_or_hi (brig_opcode, arith_type, operands);
+ }
+ else if (brig_opcode == BRIG_OPCODE_UNPACK)
+ {
+ return build_unpack (operands);
+ }
+ else if (brig_opcode == BRIG_OPCODE_PACK)
+ {
+ return build_pack (operands);
+ }
+ else if (brig_opcode == BRIG_OPCODE_NRSQRT)
+ {
+	  /* Implement as 1.0 / sqrt (x) and assume gcc's instruction
+	     selection maps it to a native ISA instruction other than a
+	     division, if one is available.
+	     TODO: this happens only with unsafe math optimizations
+	     enabled, which cannot be used in general while remaining
+	     HSAIL compliant.  Perhaps a builtin call would be a better
+	     option here.  */
+ return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
+ expand_or_call_builtin (BRIG_OPCODE_SQRT, brig_type,
+ arith_type, operands));
+ }
+ else if (brig_opcode == BRIG_OPCODE_NRCP)
+ {
+	  /* Implement as 1.0 / x and assume gcc's instruction selection
+	     maps it to a native ISA instruction other than a division,
+	     if one is available.  */
+ return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
+ operands[0]);
+ }
+ else if (brig_opcode == BRIG_OPCODE_LANEID
+ || brig_opcode == BRIG_OPCODE_MAXWAVEID
+ || brig_opcode == BRIG_OPCODE_WAVEID)
+ {
+ /* Assuming WAVESIZE 1 (for now), therefore LANEID, WAVEID and
+ MAXWAVEID always return 0. */
+ return build_zero_cst (arith_type);
+ }
+ else
+ gcc_unreachable ();
+ }
+ else if (opcode == CALL_EXPR)
+ return expand_or_call_builtin (brig_opcode, brig_type, arith_type,
+ operands);
+ else if (output_count == 1)
+ {
+ if (input_count == 1)
+ {
+ if (opcode == MODIFY_EXPR)
+ return operands[0];
+ else
+ return build1 (opcode, arith_type, operands[0]);
+ }
+ else if (input_count == 2)
+ return build2 (opcode, arith_type, operands[0], operands[1]);
+ else if (input_count == 3)
+ return build3 (opcode, arith_type, operands[0], operands[1],
+ operands[2]);
+ else
+ gcc_unreachable ();
+ }
+ else
+ gcc_unreachable ();
+
+ return NULL_TREE;
+}
+
+/* Handles the basic instructions, including packed instructions. Deals
+ with the different packing modes by unpacking/packing the wanted
+ elements. Delegates most of the instruction cases to build_inst_expr(). */
+
+size_t
+brig_basic_inst_handler::operator () (const BrigBase *base)
+{
+ const BrigInstBase *brig_inst = (const BrigInstBase *) base;
+
+ tree_stl_vec operands = build_operands (*brig_inst);
+
+ size_t output_count
+ = gccbrig_hsa_opcode_op_output_p (brig_inst->opcode, 0) ? 1 : 0;
+ size_t input_count
+ = operands.size () == 0 ? 0 : (operands.size () - output_count);
+
+ gcc_assert (output_count == 0 || output_count == 1);
+
+ tree_stl_vec::iterator first_input_i = operands.begin ();
+ if (output_count > 0 && operands.size () > 0)
+ ++first_input_i;
+
+ tree_stl_vec in_operands;
+ in_operands.assign (first_input_i, operands.end ());
+
+ BrigType16_t brig_inst_type = brig_inst->type;
+
+ if (brig_inst->opcode == BRIG_OPCODE_NOP)
+ return base->byteCount;
+ else if (brig_inst->opcode == BRIG_OPCODE_FIRSTBIT
+ || brig_inst->opcode == BRIG_OPCODE_LASTBIT
+ || brig_inst->opcode == BRIG_OPCODE_SAD)
+ /* These instructions are reported to be always 32b in HSAIL, but we want
+ to treat them according to their input argument's type to select the
+ correct instruction/builtin. */
+ brig_inst_type
+ = gccbrig_tree_type_to_hsa_type (TREE_TYPE (in_operands[0]));
+
+ tree instr_type = gccbrig_tree_type_for_hsa_type (brig_inst_type);
+
+ if (!instr_type)
+ {
+ gcc_unreachable ();
+ return base->byteCount;
+ }
+
+ bool is_vec_instr = hsa_type_packed_p (brig_inst_type);
+
+ size_t element_size_bits;
+ size_t element_count;
+
+ if (is_vec_instr)
+ {
+ BrigType16_t brig_element_type = brig_inst_type & BRIG_TYPE_BASE_MASK;
+ element_size_bits = gccbrig_hsa_type_bit_size (brig_element_type);
+ element_count = gccbrig_hsa_type_bit_size (brig_inst_type)
+ / gccbrig_hsa_type_bit_size (brig_element_type);
+ }
+ else
+ {
+ element_size_bits = gccbrig_hsa_type_bit_size (brig_inst_type);
+ element_count = 1;
+ }
+
+  /* The actual arithmetic type the operation should be performed
+     in.  This is not always the same as the original BRIG opcode's
+     type, due to implicit conversions of the storage-only f16.  */
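+  /* For example, f16 is a storage-only format: its arithmetic is
+     assumed to be performed in f32, with the result converted back
+     when stored.  */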
+ tree arith_type = gccbrig_is_bit_operation (brig_inst->opcode)
+ ? gccbrig_tree_type_for_hsa_type (brig_inst_type)
+ : get_tree_expr_type_for_hsa_type (brig_inst_type);
+
+ tree instr_expr = NULL_TREE;
+
+ BrigPack8_t p = BRIG_PACK_NONE;
+ if (brig_inst->base.kind == BRIG_KIND_INST_MOD)
+ p = ((const BrigInstMod *) brig_inst)->pack;
+ else if (brig_inst->base.kind == BRIG_KIND_INST_CMP)
+ p = ((const BrigInstCmp *) brig_inst)->pack;
+
+ if (p == BRIG_PACK_PS || p == BRIG_PACK_PSSAT)
+ in_operands[1] = build_lower_element_broadcast (in_operands[1]);
+ else if (p == BRIG_PACK_SP || p == BRIG_PACK_SPSAT)
+ in_operands[0] = build_lower_element_broadcast (in_operands[0]);
+
+ tree_code opcode
+ = get_tree_code_for_hsa_opcode (brig_inst->opcode, brig_inst_type);
+
+ if (p >= BRIG_PACK_PPSAT && p <= BRIG_PACK_PSAT)
+ {
+ scalarized_sat_arithmetics sat_arith (*brig_inst);
+ gcc_assert (input_count == 2);
+ instr_expr = sat_arith (*this, in_operands[0], in_operands[1]);
+ }
+ else if (opcode == RETURN_EXPR)
+ {
+ if (m_parent.m_cf->m_is_kernel)
+ {
+ tree goto_stmt
+ = build1 (GOTO_EXPR, void_type_node, m_parent.m_cf->m_exit_label);
+ m_parent.m_cf->append_statement (goto_stmt);
+ return base->byteCount;
+ }
+ else
+ {
+ m_parent.m_cf->append_return_stmt ();
+ return base->byteCount;
+ }
+ }
+  else if (opcode == MULT_HIGHPART_EXPR
+	   && is_vec_instr && element_size_bits < 64)
+ {
+      /* MULT_HIGHPART_EXPR works only on target dependent vector sizes,
+	 and even the scalar versions do not seem to work, at least
+	 for char elements.
+
+	 Fall back to scalarization and promotion of the vector elements
+	 to larger types, with the MULHI computed as a regular MUL
+	 followed by a shift.  MULHI for 2x64b seems to work with the
+	 Intel CPUs tested so far, so that case is passed on for vector
+	 processing and no 128b scalar arithmetic is needed.
+
+	 This is not modular, as this kind of lowering does not belong
+	 in the frontend; there should be a legalization phase before
+	 the backend that figures out the best way to compute the MULHI
+	 for any integer vector datatype.
+
+	 TODO: promote to larger vector types instead.  For example
+	 MULT_HIGHPART_EXPR with s8x8 doesn't work, but s16x8 seems to,
+	 at least on x86-64.  */
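+      /* For example, in an assumed u8 case the operands are promoted
+	 to u16, multiplied, and the high half is taken with a right
+	 shift by 8.  */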
+ tree_stl_vec operand0_elements;
+ if (input_count > 0)
+ unpack (in_operands[0], operand0_elements);
+
+ tree_stl_vec operand1_elements;
+ if (input_count > 1)
+ unpack (in_operands[1], operand1_elements);
+
+ tree_stl_vec result_elements;
+
+ tree scalar_type = TREE_TYPE (arith_type);
+ BrigType16_t element_type = brig_inst_type & BRIG_TYPE_BASE_MASK;
+ tree promoted_type = short_integer_type_node;
+ switch (element_type)
+ {
+ case BRIG_TYPE_S8:
+ promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S16);
+ break;
+ case BRIG_TYPE_U8:
+ promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U16);
+ break;
+ case BRIG_TYPE_S16:
+ promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S32);
+ break;
+ case BRIG_TYPE_U16:
+ promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U32);
+ break;
+ case BRIG_TYPE_S32:
+ promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S64);
+ break;
+ case BRIG_TYPE_U32:
+ promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U64);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ size_t promoted_type_size = int_size_in_bytes (promoted_type) * 8;
+
+ for (size_t i = 0; i < TYPE_VECTOR_SUBPARTS (arith_type); ++i)
+ {
+ tree operand0 = convert (promoted_type, operand0_elements.at (i));
+ tree operand1 = convert (promoted_type, operand1_elements.at (i));
+
+ tree scalar_expr
+ = build2 (MULT_EXPR, promoted_type, operand0, operand1);
+
+ scalar_expr
+ = build2 (RSHIFT_EXPR, promoted_type, scalar_expr,
+ build_int_cstu (promoted_type, promoted_type_size / 2));
+
+ result_elements.push_back (convert (scalar_type, scalar_expr));
+ }
+ instr_expr = pack (result_elements);
+ }
+ else
+ {
+      /* 'class' always has the b1 type; consider it by its float
+	 source type when building the instruction, so that the
+	 correct builtin is found.  */
+ if (brig_inst->opcode == BRIG_OPCODE_CLASS)
+ brig_inst_type = ((const BrigInstSourceType *) base)->sourceType;
+ instr_expr = build_inst_expr (brig_inst->opcode, brig_inst_type,
+ arith_type, in_operands);
+ }
+
+ if (instr_expr == NULL_TREE)
+ {
+ gcc_unreachable ();
+ return base->byteCount;
+ }
+
+ if (p == BRIG_PACK_SS || p == BRIG_PACK_S || p == BRIG_PACK_SSSAT
+ || p == BRIG_PACK_SSAT)
+ {
+      /* In case of _s_ or _ss_, select only the lowest element
+	 of the new result into the output.  We could extract
+	 the element and use a scalar operation, but try
+	 to keep the data in vector registers as much as possible
+	 to avoid copies between the scalar and vector datapaths.  */
+ tree old_value;
+ tree half_storage_type = gccbrig_tree_type_for_hsa_type (brig_inst_type);
+ bool is_fp16_operation
+ = (brig_inst_type & BRIG_TYPE_BASE_MASK) == BRIG_TYPE_F16
+ && !gccbrig_is_bit_operation (brig_inst->opcode);
+
+ if (is_fp16_operation)
+ old_value = build_h2f_conversion
+ (build_reinterpret_cast (half_storage_type, operands[0]));
+ else
+ old_value
+ = build_reinterpret_cast (TREE_TYPE (instr_expr), operands[0]);
+
+ size_t esize = is_fp16_operation ? 32 : element_size_bits;
+
+      /* Construct a permutation mask where all elements other than
+	 the lowest one are picked from old_value.  */
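+      /* With a hypothetical 4-element vector the mask becomes
+	 {4, 1, 2, 3}: index 4 picks the lowest element of new_value,
+	 while 1..3 keep the upper elements of old_value.  */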
+ tree mask_inner_type = build_nonstandard_integer_type (esize, 1);
+ vec<constructor_elt, va_gc> *constructor_vals = NULL;
+ for (size_t i = 0; i < element_count; ++i)
+ {
+ tree cst;
+
+ if (i == 0)
+ cst = build_int_cstu (mask_inner_type, element_count);
+ else
+ cst = build_int_cstu (mask_inner_type, i);
+ CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, cst);
+ }
+ tree mask_vec_type = build_vector_type (mask_inner_type, element_count);
+ tree mask = build_vector_from_ctor (mask_vec_type, constructor_vals);
+
+ tree new_value = create_tmp_var (TREE_TYPE (instr_expr), "new_output");
+ tree assign
+ = build2 (MODIFY_EXPR, TREE_TYPE (instr_expr), new_value, instr_expr);
+ m_parent.m_cf->append_statement (assign);
+
+ instr_expr
+ = build3 (VEC_PERM_EXPR, arith_type, old_value, new_value, mask);
+
+ tree lower_output = create_tmp_var (TREE_TYPE (instr_expr), "s_output");
+ tree assign_lower = build2 (MODIFY_EXPR, TREE_TYPE (instr_expr),
+ lower_output, instr_expr);
+ m_parent.m_cf->append_statement (assign_lower);
+ instr_expr = lower_output;
+ }
+
+ if (output_count == 1)
+ build_output_assignment (*brig_inst, operands[0], instr_expr);
+ else
+ m_parent.m_cf->append_statement (instr_expr);
+ return base->byteCount;
+}
+
+/* Create an expression that broadcasts the lowest element of the
+ vector in VEC_OPERAND to all elements of the returned vector. */
+
+tree
+brig_basic_inst_handler::build_lower_element_broadcast (tree vec_operand)
+{
+ /* Build the broadcast using shuffle because there's no
+ direct broadcast in GENERIC and this way there's no need for
+ a separate extract of the lowest element. */
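+  /* With a hypothetical 4-element vector the mask becomes
+     {4, 4, 4, 4}: as the same vector is given as both inputs,
+     index 4 refers to its lowest element, which is thus replicated
+     to every output position.  */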
+ tree element_type = TREE_TYPE (TREE_TYPE (vec_operand));
+ size_t esize = 8 * int_size_in_bytes (element_type);
+
+ size_t element_count = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_operand));
+ tree mask_inner_type = build_nonstandard_integer_type (esize, 1);
+ vec<constructor_elt, va_gc> *constructor_vals = NULL;
+
+ /* Construct the mask. */
+ for (size_t i = 0; i < element_count; ++i)
+ {
+ tree cst = build_int_cstu (mask_inner_type, element_count);
+ CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, cst);
+ }
+ tree mask_vec_type = build_vector_type (mask_inner_type, element_count);
+ tree mask = build_vector_from_ctor (mask_vec_type, constructor_vals);
+
+ return build3 (VEC_PERM_EXPR, TREE_TYPE (vec_operand), vec_operand,
+ vec_operand, mask);
+}
+
+/* Returns the tree code that should be used to implement the given
+ HSA instruction opcode (BRIG_OPCODE) for the given type of instruction
+ (BRIG_TYPE). In case the opcode cannot be mapped to a TREE node directly,
+ returns TREE_LIST (if it can be emulated with a simple chain of tree
+ nodes) or CALL_EXPR if the opcode should be implemented using a builtin
+ call. */
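+/* For example, BRIG_OPCODE_MAD has no direct GENERIC counterpart and
+   thus maps to TREE_LIST; build_inst_expr () then emulates it with a
+   MULT_EXPR followed by a PLUS_EXPR.  */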
+
+tree_code
+brig_basic_inst_handler::get_tree_code_for_hsa_opcode
+ (BrigOpcode16_t brig_opcode, BrigType16_t brig_type) const
+{
+ BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK;
+ switch (brig_opcode)
+ {
+ case BRIG_OPCODE_NOP:
+ return NOP_EXPR;
+ case BRIG_OPCODE_ADD:
+ return PLUS_EXPR;
+ case BRIG_OPCODE_CMOV:
+ if (brig_inner_type == brig_type)
+ return COND_EXPR;
+ else
+ return VEC_COND_EXPR;
+ case BRIG_OPCODE_SUB:
+ return MINUS_EXPR;
+ case BRIG_OPCODE_MUL:
+ case BRIG_OPCODE_MUL24:
+ return MULT_EXPR;
+ case BRIG_OPCODE_MULHI:
+ case BRIG_OPCODE_MUL24HI:
+ return MULT_HIGHPART_EXPR;
+ case BRIG_OPCODE_DIV:
+ if (gccbrig_is_float_type (brig_inner_type))
+ return RDIV_EXPR;
+ else
+ return TRUNC_DIV_EXPR;
+ case BRIG_OPCODE_NEG:
+ return NEGATE_EXPR;
+ case BRIG_OPCODE_MIN:
+ if (gccbrig_is_float_type (brig_inner_type))
+ return CALL_EXPR;
+ else
+ return MIN_EXPR;
+ case BRIG_OPCODE_MAX:
+ if (gccbrig_is_float_type (brig_inner_type))
+ return CALL_EXPR;
+ else
+ return MAX_EXPR;
+ case BRIG_OPCODE_FMA:
+ return FMA_EXPR;
+ case BRIG_OPCODE_ABS:
+ return ABS_EXPR;
+ case BRIG_OPCODE_SHL:
+ return LSHIFT_EXPR;
+ case BRIG_OPCODE_SHR:
+ return RSHIFT_EXPR;
+ case BRIG_OPCODE_OR:
+ return BIT_IOR_EXPR;
+ case BRIG_OPCODE_XOR:
+ return BIT_XOR_EXPR;
+ case BRIG_OPCODE_AND:
+ return BIT_AND_EXPR;
+ case BRIG_OPCODE_NOT:
+ return BIT_NOT_EXPR;
+ case BRIG_OPCODE_RET:
+ return RETURN_EXPR;
+ case BRIG_OPCODE_MOV:
+ case BRIG_OPCODE_LDF:
+ return MODIFY_EXPR;
+ case BRIG_OPCODE_LD:
+ case BRIG_OPCODE_ST:
+ return MEM_REF;
+ case BRIG_OPCODE_BR:
+ return GOTO_EXPR;
+ case BRIG_OPCODE_REM:
+ if (brig_type == BRIG_TYPE_U64 || brig_type == BRIG_TYPE_U32)
+ return TRUNC_MOD_EXPR;
+ else
+ return CALL_EXPR;
+ case BRIG_OPCODE_NRCP:
+ case BRIG_OPCODE_NRSQRT:
+    /* Implement as 1 / f (x).  gcc should pattern-detect that and
+       use a native instruction for it, if available.  */
+ return TREE_LIST;
+ case BRIG_OPCODE_FLOOR:
+ case BRIG_OPCODE_CEIL:
+ case BRIG_OPCODE_SQRT:
+ case BRIG_OPCODE_NSQRT:
+ case BRIG_OPCODE_RINT:
+ case BRIG_OPCODE_TRUNC:
+ case BRIG_OPCODE_POPCOUNT:
+ case BRIG_OPCODE_COPYSIGN:
+ case BRIG_OPCODE_NCOS:
+ case BRIG_OPCODE_NSIN:
+ case BRIG_OPCODE_NLOG2:
+ case BRIG_OPCODE_NEXP2:
+ case BRIG_OPCODE_NFMA:
+      /* Class has the type B1 regardless of the float type, so the
+	 builtin map search below cannot find it.  */
+ case BRIG_OPCODE_CLASS:
+ case BRIG_OPCODE_WORKITEMABSID:
+ return CALL_EXPR;
+    default:
+      /* Some BRIG opcodes can use the same builtins for unsigned and
+	 signed types.  Force these cases to unsigned types.  */
+
+ if (brig_opcode == BRIG_OPCODE_BORROW
+ || brig_opcode == BRIG_OPCODE_CARRY
+ || brig_opcode == BRIG_OPCODE_LASTBIT
+ || brig_opcode == BRIG_OPCODE_BITINSERT)
+ {
+ if (brig_type == BRIG_TYPE_S32)
+ brig_type = BRIG_TYPE_U32;
+ else if (brig_type == BRIG_TYPE_S64)
+ brig_type = BRIG_TYPE_U64;
+ }
+
+ builtin_map::const_iterator i
+ = s_custom_builtins.find (std::make_pair (brig_opcode, brig_type));
+ if (i != s_custom_builtins.end ())
+ return CALL_EXPR;
+ else if (s_custom_builtins.find
+ (std::make_pair (brig_opcode, brig_inner_type))
+ != s_custom_builtins.end ())
+ return CALL_EXPR;
+ if (brig_inner_type == BRIG_TYPE_F16
+ && s_custom_builtins.find
+ (std::make_pair (brig_opcode, BRIG_TYPE_F32))
+ != s_custom_builtins.end ())
+ return CALL_EXPR;
+ break;
+ }
+ return TREE_LIST; /* Emulate using a chain of nodes. */
+}