chromium/third_party/blink/renderer/modules/webaudio/cpu/x86/oscillator_kernel_sse2.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288

// Copyright 2020 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "third_party/blink/renderer/modules/webaudio/oscillator_node.h"

#include "third_party/blink/renderer/modules/webaudio/periodic_wave.h"

#include <xmmintrin.h>

namespace blink {

static __m128 WrapVirtualIndexVector(__m128 x,
                                     __m128 wave_size,
                                     __m128 inv_wave_size) {
  // Wrap the virtual index |x| to the range 0 to wave_size - 1.  This is done
  // by computing x - floor(x/wave_size)*wave_size.
  //
  // But there's no SSE2 SIMD instruction for this, so we do it the following
  // way.

  // f = truncate(x/wave_size), truncating towards 0.
  const __m128 r = _mm_mul_ps(x, inv_wave_size);
  __m128i f = _mm_cvttps_epi32(r);

  // Note that if r >= 0, then f <= r. But if r < 0, then r <= f, with equality
  // only if r is already an integer.  Hence if r < f, we want to subtract 1
  // from f to get floor(r).

  // cmplt(a,b) returns 0xffffffff (-1) if a < b and 0 if not.  So cmp is -1 or
  // 0 depending on whether r < f, which is what we need to compute floor(r).
  const __m128i cmp =
      reinterpret_cast<__m128i>(_mm_cmplt_ps(r, _mm_cvtepi32_ps(f)));

  // This subtracts 1 if needed to get floor(r).
  f = _mm_add_epi32(f, cmp);

  // Convert back to float, and scale by wave_size.  And finally subtract that
  // from x.
  return _mm_sub_ps(x, _mm_mul_ps(_mm_cvtepi32_ps(f), wave_size));
}

std::tuple<int, double> OscillatorHandler::ProcessKRateVector(
    int n,
    float* dest_p,
    double virtual_read_index,
    float frequency,
    float rate_scale) const {
  const unsigned periodic_wave_size = periodic_wave_->PeriodicWaveSize();
  const double inv_periodic_wave_size = 1.0 / periodic_wave_size;

  float* higher_wave_data = nullptr;
  float* lower_wave_data = nullptr;
  float table_interpolation_factor = 0;
  float incr = frequency * rate_scale;
  DCHECK_GE(incr, kInterpolate2Point);

  periodic_wave_->WaveDataForFundamentalFrequency(
      frequency, lower_wave_data, higher_wave_data, table_interpolation_factor);

  const __m128 v_wave_size = _mm_set1_ps(periodic_wave_size);
  const __m128 v_inv_wave_size = _mm_set1_ps(1.0f / periodic_wave_size);

  // Mask to use to wrap the read indices to the proper range.
  const __m128i v_read_mask = _mm_set1_epi32(periodic_wave_size - 1);
  const __m128i one = _mm_set1_epi32(1);

  const __m128 v_table_factor = _mm_set1_ps(table_interpolation_factor);

  // The loop processes 4 items at a time, so we need to increment the
  // virtual index by 4*incr each time.
  const __m128 v_incr = _mm_set1_ps(4 * incr);

  // The virtual index vector.  Ideally, to preserve accuracy, we should use
  // (two) packed double vectors for this, but that degrades performance quite a
  // bit.
  __m128 v_virt_index =
      _mm_set_ps(virtual_read_index + 3 * incr, virtual_read_index + 2 * incr,
                 virtual_read_index + incr, virtual_read_index);

  // It's possible that adding the incr above exceeded the bounds, so wrap them
  // if needed.
  v_virt_index =
      WrapVirtualIndexVector(v_virt_index, v_wave_size, v_inv_wave_size);

  // Temporary arrays where we can gather up the wave data we need for
  // interpolation.  Align these for best efficiency on older CPUs where aligned
  // access is much faster than unaliged.
  // TODO(1013118): Is there a faster way to do this?
  float sample1_lower[4] __attribute__((aligned(16)));
  float sample2_lower[4] __attribute__((aligned(16)));
  float sample1_higher[4] __attribute__((aligned(16)));
  float sample2_higher[4] __attribute__((aligned(16)));

  int k = 0;
  int n_loops = n / 4;

  for (int loop = 0; loop < n_loops; ++loop, k += 4) {
    // Compute indices for the samples.  Clamp the index to lie in the range 0
    // to periodic_wave_size-1 by applying a mask to the index.
    const __m128i read_index_0 =
        _mm_and_si128(_mm_cvttps_epi32(v_virt_index), v_read_mask);
    const __m128i read_index_1 =
        _mm_and_si128(_mm_add_epi32(read_index_0, one), v_read_mask);

    // Extract the components of the indices so we can get the samples
    // associated with the lower and higher wave data.
    const uint32_t* r0 = reinterpret_cast<const uint32_t*>(&read_index_0);
    const uint32_t* r1 = reinterpret_cast<const uint32_t*>(&read_index_1);

    // Get the samples from the wave tables and save them in work arrays so we
    // can load them into simd registers.
    for (int m = 0; m < 4; ++m) {
      sample1_lower[m] = lower_wave_data[r0[m]];
      sample2_lower[m] = lower_wave_data[r1[m]];
      sample1_higher[m] = higher_wave_data[r0[m]];
      sample2_higher[m] = higher_wave_data[r1[m]];
    }

    const __m128 s1_low = _mm_load_ps(sample1_lower);
    const __m128 s2_low = _mm_load_ps(sample2_lower);
    const __m128 s1_high = _mm_load_ps(sample1_higher);
    const __m128 s2_high = _mm_load_ps(sample2_higher);

    // Linearly interpolate within each table (lower and higher).
    const __m128 interpolation_factor =
        _mm_sub_ps(v_virt_index, _mm_cvtepi32_ps(read_index_0));
    const __m128 sample_higher = _mm_add_ps(
        s1_high,
        _mm_mul_ps(interpolation_factor, _mm_sub_ps(s2_high, s1_high)));
    const __m128 sample_lower = _mm_add_ps(
        s1_low, _mm_mul_ps(interpolation_factor, _mm_sub_ps(s2_low, s1_low)));

    // Then interpolate between the two tables.
    const __m128 sample = _mm_add_ps(
        sample_higher,
        _mm_mul_ps(v_table_factor, _mm_sub_ps(sample_lower, sample_higher)));

    // WARNING: dest_p may not be aligned!
    _mm_storeu_ps(dest_p + k, sample);

    // Increment virtual read index and wrap virtualReadIndex into the range
    // 0 -> periodicWaveSize.
    v_virt_index = _mm_add_ps(v_virt_index, v_incr);
    v_virt_index =
        WrapVirtualIndexVector(v_virt_index, v_wave_size, v_inv_wave_size);
  }

  // There's a bit of round-off above, so update the index more accurately so at
  // least the next render starts over with a more accurate value.
  virtual_read_index += k * incr;
  virtual_read_index -=
      floor(virtual_read_index * inv_periodic_wave_size) * periodic_wave_size;

  return std::make_tuple(k, virtual_read_index);
}

static __m128d WrapVirtualIndexVectorPd(__m128d x,
                                        __m128d wave_size,
                                        __m128d inv_wave_size) {
  // Wrap the virtual index |x| to the range 0 to wave_size - 1.  This is done
  // by computing x - floor(x/wave_size)*wave_size.
  //
  // But there's no SSE2 SIMD instruction for this, so we do it the following
  // way.

  // f = truncate(x/wave_size), truncating towards 0.
  const __m128d r = _mm_mul_pd(x, inv_wave_size);
  __m128i f = _mm_cvttpd_epi32(r);

  // Note that if r >= 0, then f <= r. But if r < 0, then r <= f, with equality
  // only if r is already an integer.  Hence if r < f, we want to subtract 1
  // from f to get floor(r).

  // cmplt(a,b) returns 0xffffffffffffffff (-1) if a < b and 0 if not.  So cmp
  // is -1 or 0 depending on whether r < f, which is what we need to compute
  // floor(r).
  __m128i cmp = reinterpret_cast<__m128i>(_mm_cmplt_pd(r, _mm_cvtepi32_pd(f)));

  // Take the low 32 bits of each 64-bit result and move them into the two
  // lowest 32-bit fields.
  cmp = _mm_shuffle_epi32(cmp, (2 << 2) | 0);

  // This subtracts 1 if needed to get floor(r).
  f = _mm_add_epi32(f, cmp);

  // Convert back to float, and scale by wave_size.  And finally subtract that
  // from x.
  return _mm_sub_pd(x, _mm_mul_pd(_mm_cvtepi32_pd(f), wave_size));
}

double OscillatorHandler::ProcessARateVectorKernel(
    float* dest_p,
    double virtual_read_index,
    const float* phase_increments,
    unsigned periodic_wave_size,
    const float* const lower_wave_data[4],
    const float* const higher_wave_data[4],
    const float table_interpolation_factor[4]) const {
  // See the scalar version in oscillator_node.cc for the basic algorithm.
  double inv_periodic_wave_size = 1.0 / periodic_wave_size;
  unsigned read_index_mask = periodic_wave_size - 1;

  // Accumulate the phase increments so we can set up the virtual read index
  // vector appropriately.  This must be a double to preserve accuracy and
  // to match the scalar version.
  double incr_sum[4];
  incr_sum[0] = phase_increments[0];
  for (int m = 1; m < 4; ++m) {
    incr_sum[m] = incr_sum[m - 1] + phase_increments[m];
  }

  // It's really important for accuracy that we use doubles instead of
  // floats for the virtual_read_index.  Without this, we can only get some
  // 30-50 dB in the sweep tests instead of 100+ dB.
  __m128d v_read_index_hi = _mm_set_pd(virtual_read_index + incr_sum[2],
                                       virtual_read_index + incr_sum[1]);
  __m128d v_read_index_lo =
      _mm_set_pd(virtual_read_index + incr_sum[0], virtual_read_index);

  v_read_index_hi =
      WrapVirtualIndexVectorPd(v_read_index_hi, _mm_set1_pd(periodic_wave_size),
                               _mm_set1_pd(inv_periodic_wave_size));
  v_read_index_lo =
      WrapVirtualIndexVectorPd(v_read_index_lo, _mm_set1_pd(periodic_wave_size),
                               _mm_set1_pd(inv_periodic_wave_size));

  // Convert the virtual read index (parts) to an integer, and carefully
  // merge them into one vector.
  __m128i v_read0 = reinterpret_cast<__m128i>(_mm_movelh_ps(
      reinterpret_cast<__m128>(_mm_cvttpd_epi32(v_read_index_lo)),
      reinterpret_cast<__m128>(_mm_cvttpd_epi32(v_read_index_hi))));

  // Get index to next element being sure to wrap the index around if needed.
  __m128i v_read1 = _mm_add_epi32(v_read0, _mm_set1_epi32(1));

  // Make sure the index lies in 0 to periodic_wave_size - 1 (the size of the
  // arrays) by applying a mask to the values.
  {
    const __m128i v_mask = _mm_set1_epi32(read_index_mask);
    v_read0 = _mm_and_si128(v_read0, v_mask);
    v_read1 = _mm_and_si128(v_read1, v_mask);
  }

  float sample1_lower[4] __attribute__((aligned(16)));
  float sample2_lower[4] __attribute__((aligned(16)));
  float sample1_higher[4] __attribute__((aligned(16)));
  float sample2_higher[4] __attribute__((aligned(16)));

  const unsigned* read0 = reinterpret_cast<const unsigned*>(&v_read0);
  const unsigned* read1 = reinterpret_cast<const unsigned*>(&v_read1);

  for (int m = 0; m < 4; ++m) {
    DCHECK_LT(read0[m], periodic_wave_size);
    DCHECK_LT(read1[m], periodic_wave_size);

    sample1_lower[m] = lower_wave_data[m][read0[m]];
    sample2_lower[m] = lower_wave_data[m][read1[m]];
    sample1_higher[m] = higher_wave_data[m][read0[m]];
    sample2_higher[m] = higher_wave_data[m][read1[m]];
  }

  const __m128 v_factor =
      _mm_sub_ps(_mm_movelh_ps(_mm_cvtpd_ps(v_read_index_lo),
                               _mm_cvtpd_ps(v_read_index_hi)),
                 _mm_cvtepi32_ps(v_read0));
  const __m128 sample_higher =
      _mm_add_ps(_mm_load_ps(sample1_higher),
                 _mm_mul_ps(v_factor, _mm_sub_ps(_mm_load_ps(sample2_higher),
                                                 _mm_load_ps(sample1_higher))));
  const __m128 sample_lower =
      _mm_add_ps(_mm_load_ps(sample1_lower),
                 _mm_mul_ps(v_factor, _mm_sub_ps(_mm_load_ps(sample2_lower),
                                                 _mm_load_ps(sample1_lower))));
  const __m128 sample = _mm_add_ps(
      sample_higher, _mm_mul_ps(_mm_load_ps(table_interpolation_factor),
                                _mm_sub_ps(sample_lower, sample_higher)));

  _mm_storeu_ps(dest_p, sample);

  virtual_read_index += incr_sum[3];
  virtual_read_index -=
      floor(virtual_read_index * inv_periodic_wave_size) * periodic_wave_size;

  return virtual_read_index;
}

}  // namespace blink