/* gcc/testsuite/gcc.target/aarch64/sve_slp_7.c */

/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize -march=armv8-a+sve -msve-vector-bits=scalable -ffast-math" } */

#include <stdint.h>

/* Define vec_slp_<TYPE>, a weak function that performs four interleaved
   sum reductions over A.

   The four accumulators start from b[0]..b[3].  For each I in [0, N),
   accumulator K (K = 0..3) has a[I * 4 + K] added to it, so A is read
   as N consecutive groups of four elements.  The final totals are
   written back to b[0]..b[3].

   A and B are restrict-qualified.  The dg-final directives in this file
   scan the assembly generated for this exact loop shape, so the body
   should not be restructured without revisiting the expected counts.  */
#define VEC_PERM(TYPE)						\
void __attribute__ ((weak))					\
vec_slp_##TYPE (TYPE *restrict a, TYPE *restrict b, int n)	\
{								\
  TYPE x0 = b[0];						\
  TYPE x1 = b[1];						\
  TYPE x2 = b[2];						\
  TYPE x3 = b[3];						\
  for (int i = 0; i < n; ++i)					\
    {								\
      x0 += a[i * 4];						\
      x1 += a[i * 4 + 1];					\
      x2 += a[i * 4 + 2];					\
      x3 += a[i * 4 + 3];					\
    }								\
  b[0] = x0;							\
  b[1] = x1;							\
  b[2] = x2;							\
  b[3] = x3;							\
}

/* Invoke the macro T once for each element type under test: the signed
   and unsigned 8-, 16-, 32- and 64-bit integer types from <stdint.h>,
   followed by float and double (ten invocations in total).  */
#define TEST_ALL(T)				\
  T (int8_t)					\
  T (uint8_t)					\
  T (int16_t)					\
  T (uint16_t)					\
  T (int32_t)					\
  T (uint32_t)					\
  T (int64_t)					\
  T (uint64_t)					\
  T (float)					\
  T (double)

/* Emit one vec_slp_<type> function for every type listed in TEST_ALL.  */
TEST_ALL (VEC_PERM)

/* We can't use SLP for the 64-bit loops, since the number of reduction
   results might be greater than the number of elements in the vector.
   Otherwise we have two loads per loop, one for the initial vector
   and one for the loop body.  */
/* ??? At present we don't treat the int8_t and int16_t loops as
   reductions.  */
/* { dg-final { scan-assembler-times {\tld1b\t} 2 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tld1h\t} 2 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */
/* { dg-final { scan-assembler-times {\tld1h\t} 1 } } */
/* { dg-final { scan-assembler-times {\tld1w\t} 3 } } */
/* { dg-final { scan-assembler-times {\tld4d\t} 3 } } */
/* { dg-final { scan-assembler-not {\tld4b\t} } } */
/* { dg-final { scan-assembler-not {\tld4h\t} } } */
/* { dg-final { scan-assembler-not {\tld4w\t} } } */
/* { dg-final { scan-assembler-not {\tld1d\t} } } */
/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 8 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 8 { xfail *-*-* } } } */
/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 4 } } */
/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */
/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s} 8 } } */
/* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 8 } } */
/* { dg-final { scan-assembler-times {\tfaddv\ts[0-9]+, p[0-7], z[0-9]+\.s} 4 } } */
/* { dg-final { scan-assembler-times {\tfaddv\td[0-9]+, p[0-7], z[0-9]+\.d} 4 } } */

/* Should be 4, if we used reductions for int8_t and int16_t.  */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b} 2 } } */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h} 2 } } */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */

/* { dg-final { scan-assembler-not {\tuqdec} } } */