/* Big-endian AArch64 code-generation test for the AdvSIMD BFDOT intrinsics
   (vbfdot_f32 and its q / lane / laneq variants).  */
/* { dg-do assemble { target { aarch64*-*-* } } } */
/* { dg-require-effective-target stdint_types_mbig_endian } */
/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
/* { dg-add-options arm_v8_2a_bf16_neon } */
/* { dg-additional-options "-mbig-endian --save-temps" } */
/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
#include <arm_neon.h>
/*
**ufoo:
** bfdot v0.2s, (v1.4h, v2.4h|v2.4h, v1.4h)
** ret
*/
/* Exercise vbfdot_f32: R, X and Y are handed to the intrinsic unchanged
   and its result is returned.  The expected-assembly comment for ufoo
   accepts the two bf16 source registers in either order.  */
float32x2_t
ufoo (float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
{
  float32x2_t acc = vbfdot_f32 (r, x, y);
  return acc;
}
/*
**ufooq:
** bfdot v0.4s, (v1.8h, v2.8h|v2.8h, v1.8h)
** ret
*/
/* Exercise vbfdotq_f32, the variant taking a float32x4_t accumulator and
   bfloat16x8_t operands.  The arguments are forwarded unchanged; the
   expected assembly for ufooq is one BFDOT on .4s/.8h registers.  */
float32x4_t
ufooq (float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
{
  float32x4_t acc = vbfdotq_f32 (r, x, y);
  return acc;
}
/*
**ufoo_lane:
** bfdot v0.2s, v1.4h, v2.2h\[0\]
** ret
*/
/* Exercise vbfdot_lane_f32 with the literal lane index 0.  Y is a
   bfloat16x4_t here; the expected assembly for ufoo_lane selects
   v2.2h[0].  */
float32x2_t
ufoo_lane (float32x2_t r, bfloat16x4_t x, bfloat16x4_t y)
{
  float32x2_t acc = vbfdot_lane_f32 (r, x, y, 0);
  return acc;
}
/*
**ufooq_laneq:
** bfdot v0.4s, v1.8h, v2.2h\[2\]
** ret
*/
/* Exercise vbfdotq_laneq_f32 with the literal lane index 2.  Both bf16
   operands are bfloat16x8_t; the expected assembly for ufooq_laneq
   selects v2.2h[2].  */
float32x4_t
ufooq_laneq (float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
{
  float32x4_t acc = vbfdotq_laneq_f32 (r, x, y, 2);
  return acc;
}
/*
**ufoo_laneq:
** bfdot v0.2s, v1.4h, v2.2h\[3\]
** ret
*/
/* Exercise vbfdot_laneq_f32 with the literal lane index 3.  X is a
   bfloat16x4_t while the lane source Y is a bfloat16x8_t; the expected
   assembly for ufoo_laneq selects v2.2h[3].  */
float32x2_t
ufoo_laneq (float32x2_t r, bfloat16x4_t x, bfloat16x8_t y)
{
  float32x2_t acc = vbfdot_laneq_f32 (r, x, y, 3);
  return acc;
}
/*
**ufooq_lane:
** bfdot v0.4s, v1.8h, v2.2h\[1\]
** ret
*/
/* Exercise vbfdotq_lane_f32 with the literal lane index 1.  X is a
   bfloat16x8_t while the lane source Y is a bfloat16x4_t; the expected
   assembly for ufooq_lane selects v2.2h[1].  */
float32x4_t
ufooq_lane (float32x4_t r, bfloat16x8_t x, bfloat16x4_t y)
{
  float32x4_t acc = vbfdotq_lane_f32 (r, x, y, 1);
  return acc;
}
/*
**ufoo_untied:
** (
** mov v0.8b, v1.8b
** bfdot v0.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
** |
** bfdot v1.2s, (v2.4h, v3.4h|v3.4h, v2.4h)
** mov v0.8b, v1.8b
** )
** ret
*/
/* Same vbfdot_f32 call as ufoo, but preceded by a parameter UNUSED that
   is never read.  In the expected assembly for ufoo_untied the BFDOT
   sources are v2/v3 and a MOV between v1 and v0 appears either before
   or after the BFDOT.  */
float32x2_t
ufoo_untied (float32x4_t unused, float32x2_t r, bfloat16x4_t x,
	     bfloat16x4_t y)
{
  float32x2_t acc = vbfdot_f32 (r, x, y);
  return acc;
}
/*
**ufooq_lane_untied:
** (
** mov v0.16b, v1.16b
** bfdot v0.4s, v2.8h, v3.2h\[1\]
** |
** bfdot v1.4s, v2.8h, v3.2h\[1\]
** mov v0.16b, v1.16b
** )
** ret
*/
/* Same vbfdotq_lane_f32 call (literal lane index 1) as ufooq_lane, but
   preceded by a parameter UNUSED that is never read.  In the expected
   assembly for ufooq_lane_untied the BFDOT sources are v2 and v3.2h[1],
   and a MOV between v1 and v0 appears either before or after the
   BFDOT.  */
float32x4_t
ufooq_lane_untied (float32x4_t unused, float32x4_t r, bfloat16x8_t x,
		   bfloat16x4_t y)
{
  float32x4_t acc = vbfdotq_lane_f32 (r, x, y, 1);
  return acc;
}