summaryrefslogtreecommitdiff
path: root/utests/compiler_long_hi_sat.cpp
blob: 1c57d0c5bc010a07233368e08efae6dd6eb3898d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#include <cstdint>
#include <cstring>
#include <iostream>
#include "utest_helper.hpp"

/*
 * Unsigned 64 x 64 -> 128 bit multiplication.
 *
 * Both operands are split into 32-bit halves and the four partial products
 * are combined by hand.
 *
 *   destLow  receives bits 0..63 of sourceA * sourceB
 *   destHi   receives bits 64..127 of sourceA * sourceB
 */
static void __u64_mul_u64(uint64_t sourceA, uint64_t sourceB, uint64_t &destLow, uint64_t &destHi)
{
  const uint64_t kLow32 = 0xFFFFFFFFULL;

  // 32-bit halves of each operand (index 0 = low half, 1 = high half).
  const uint64_t a0 = sourceA & kLow32;
  const uint64_t a1 = sourceA >> 32;
  const uint64_t b0 = sourceB & kLow32;
  const uint64_t b1 = sourceB >> 32;

  // The four 32 x 32 -> 64 bit partial products.
  const uint64_t p00 = a0 * b0;
  const uint64_t p01 = a0 * b1;
  const uint64_t p10 = a1 * b0;
  const uint64_t p11 = a1 * b1;

  // Middle column: a1*b0 plus the carry out of a0*b0 plus the low half of
  // a0*b1. The sum cannot wrap: (2^32-1)^2 + 2*(2^32-1) == 2^64 - 1.
  const uint64_t mid = p10 + ((p00 >> 32) + (p01 & kLow32));

  // High word: top product plus everything that spilled above bit 63.
  destHi = p11 + (mid >> 32) + (p01 >> 32);
  // Low word: low half of the middle column stacked on the low half of a0*b0.
  destLow = (mid << 32) | (p00 & kLow32);
}

/*
 * Signed 64 x 64 -> 128 bit multiplication.
 *
 * The product of the operand magnitudes is computed with __u64_mul_u64 and
 * then negated as a 128-bit two's complement value when the operands have
 * opposite signs.
 *
 *   destLow  receives bits 0..63 of sourceA * sourceB
 *   destHi   receives bits 64..127 of sourceA * sourceB, as a signed value
 */
static void __64_mul_64(int64_t sourceA, int64_t sourceB, uint64_t &destLow, int64_t &destHi)
{
  const bool aNegative = sourceA < 0;
  const bool bNegative = sourceB < 0;

  // Take the magnitudes in unsigned arithmetic. Negating INT64_MIN as a
  // signed value would overflow; 0 - (uint64_t)x is well defined modulo 2^64
  // and yields 2^63 for INT64_MIN, which is the correct magnitude.
  const uint64_t magA = aNegative ? (uint64_t) 0 - (uint64_t) sourceA : (uint64_t) sourceA;
  const uint64_t magB = bNegative ? (uint64_t) 0 - (uint64_t) sourceB : (uint64_t) sourceB;

  uint64_t hi;
  __u64_mul_u64( magA, magB, destLow, hi );

  // Fix the sign: negate the 128-bit value {hi, destLow} (invert, then add 1).
  if( aNegative != bNegative ) {
    destLow = ~destLow + 1;
    hi = ~hi;
    // carry if necessary: the +1 wrapped the low word around to zero
    if( 0 == destLow )
      hi += 1;
  }

  destHi = (int64_t) hi;
}

static void __mad_sat(int64_t sourceA, int64_t sourceB, int64_t sourceC, int64_t& dst)
{
  cl_long multHi;
  cl_ulong multLo;
  __64_mul_64(sourceA, sourceB, multLo, multHi);
  cl_ulong sum = multLo + sourceC;

  // carry if overflow
  if(sourceC >= 0) {
    if(multLo > sum) {
      multHi++;
      if(CL_LONG_MIN == multHi) {
        multHi = CL_LONG_MAX;
        sum = CL_ULONG_MAX;
      }
    }
  } else {
    if( multLo < sum ) {
      multHi--;
      if( CL_LONG_MAX == multHi ) {
        multHi = CL_LONG_MIN;
        sum = 0;
      }
    }
  }

  // saturate
  if( multHi > 0 )
    sum = CL_LONG_MAX;
  else if ( multHi == 0 && sum > CL_LONG_MAX)
    sum = CL_LONG_MAX;
  else if ( multHi == -1 && sum < (cl_ulong)CL_LONG_MIN)
    sum = CL_LONG_MIN;
  else if( multHi < -1 )
    sum = CL_LONG_MIN;

  dst = (cl_long) sum;
}

/*
 * Host-side check of the "compiler_long_mul_hi" kernel.
 *
 * Uploads n 64-bit inputs, runs the kernel with num0 and num1 as extra
 * arguments, and asserts that output element i equals the high 64 bits of
 * src[i] * num0 for even i, or src[i] * num1 for odd i, as computed on the
 * host by __64_mul_64.
 */
void compiler_long_mul_hi(void)
{
  const size_t n = 32;
  int64_t num0 = 0xF00A00CED0090B0CUL;
  int64_t num1 = 0x7FABCD57FC098FC1UL;
  int64_t src[n];
  memset(src, 0, sizeof(src));

  // Setup kernel and buffers
  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_hi_sat", "compiler_long_mul_hi");
  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint64_t), NULL);
  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint64_t), NULL);
  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
  OCL_SET_ARG(2, sizeof(cl_long), &num0);
  OCL_SET_ARG(3, sizeof(cl_long), &num1);
  globals[0] = n;
  locals[0] = 32;

  // Generate the inputs: one rand() result replicated into both 32-bit halves.
  for (int64_t *slot = src; slot != src + n; ++slot) {
    const uint64_t r = rand();
    *slot = (r << 32) | r;
  }

  // Upload the inputs into buffer 0.
  OCL_MAP_BUFFER(0);
  memcpy(buf_data[0], src, sizeof(uint64_t) * n);
  OCL_UNMAP_BUFFER(0);

  // Run the kernel on GPU
  OCL_NDRANGE(1);

  // Compare each output against the host reference.
  OCL_MAP_BUFFER(1);
  for (int32_t i = 0; i < (int32_t) n; ++i) {
    uint64_t res_lo;
    int64_t res_hi;
    // Even indices are multiplied by num0, odd indices by num1.
    __64_mul_64(src[i], (i % 2 == 0) ? num0 : num1, res_lo, res_hi);

    OCL_ASSERT(((int64_t *)(buf_data[1]))[i] == res_hi);
  }
  OCL_UNMAP_BUFFER(1);
}

/*
 * Host-side check of the "compiler_long_mul_sat" kernel.
 *
 * Uploads n 64-bit inputs, runs the kernel with num0 and num1 as extra
 * arguments, and asserts that output element i equals the value computed on
 * the host by __mad_sat(src[i], num0, num1).
 */
void compiler_long_mul_sat(void)
{
  const size_t n = 32;
  int64_t num0 = 0xF00000CED8090B0CUL;
  int64_t num1 = 0x0000000000098FC1UL;
  int64_t src[n];
  memset(src, 0, sizeof(src));

  // Setup kernel and buffers
  OCL_CREATE_KERNEL_FROM_FILE("compiler_long_hi_sat", "compiler_long_mul_sat");
  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint64_t), NULL);
  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint64_t), NULL);
  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
  OCL_SET_ARG(2, sizeof(cl_long), &num0);
  OCL_SET_ARG(3, sizeof(cl_long), &num1);
  globals[0] = n;
  locals[0] = 32;

  // Generate the inputs: one rand() result replicated into both 32-bit halves.
  for (int64_t *slot = src; slot != src + n; ++slot) {
    const uint64_t r = rand();
    *slot = (r << 32) | r;
  }

  // Upload the inputs into buffer 0.
  OCL_MAP_BUFFER(0);
  memcpy(buf_data[0], src, sizeof(uint64_t) * n);
  OCL_UNMAP_BUFFER(0);

  // Run the kernel on GPU
  OCL_NDRANGE(1);

  // Compare each output against the host reference.
  OCL_MAP_BUFFER(1);
  for (int32_t i = 0; i < (int32_t) n; ++i) {
    int64_t res;
    __mad_sat(src[i], num0, num1, res);

    OCL_ASSERT(((int64_t *)(buf_data[1]))[i] == res);
  }
  OCL_UNMAP_BUFFER(1);
}

// Pass both test entry points to MAKE_UTEST_FROM_FUNCTION. The macro is
// defined outside this file; presumably it registers each function with the
// unit-test runner -- confirm against utest_helper.hpp.
MAKE_UTEST_FROM_FUNCTION(compiler_long_mul_hi);
MAKE_UTEST_FROM_FUNCTION(compiler_long_mul_sat);