Diffstat (limited to 'chromium/third_party/blink/renderer/platform/audio/cpu/x86/vector_math_impl.h')
-rw-r--r-- | chromium/third_party/blink/renderer/platform/audio/cpu/x86/vector_math_impl.h | 357
1 file changed, 357 insertions, 0 deletions
diff --git a/chromium/third_party/blink/renderer/platform/audio/cpu/x86/vector_math_impl.h b/chromium/third_party/blink/renderer/platform/audio/cpu/x86/vector_math_impl.h
new file mode 100644
index 00000000000..8562a56d8d8
--- /dev/null
+++ b/chromium/third_party/blink/renderer/platform/audio/cpu/x86/vector_math_impl.h
@@ -0,0 +1,357 @@
+// Copyright 2017 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// This file intentionally does not have header guards, it's included from
+// VectorMathAVX.h and from VectorMathSSE.h with different macro definitions.
+// The following line silences a presubmit warning that would otherwise be
+// triggered by this: no-include-guard-because-multiply-included
+
+#include "build/build_config.h"
+
+#if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_MACOSX)
+
+#include <algorithm>
+#include <cmath>
+
+#include "third_party/blink/renderer/platform/audio/audio_array.h"
+#include "third_party/blink/renderer/platform/wtf/assertions.h"
+
+namespace blink {
+namespace VectorMath {
+namespace VECTOR_MATH_SIMD_NAMESPACE_NAME {
+
+// This stride is chosen so that the same prepared filter created by
+// AVX::PrepareFilterForConv can be used by both AVX::Conv and SSE::Conv.
+// A prepared filter created by SSE::PrepareFilterForConv can only be used
+// by SSE::Conv.
+constexpr size_t kReversedFilterStride = 8u / kPackedFloatsPerRegister;
+
+bool IsAligned(const float* p) {
+  constexpr size_t kBytesPerRegister = kBitsPerRegister / 8u;
+  constexpr size_t kAlignmentOffsetMask = kBytesPerRegister - 1u;
+  return (reinterpret_cast<size_t>(p) & kAlignmentOffsetMask) == 0u;
+}
+
+void PrepareFilterForConv(const float* filter_p,
+                          int filter_stride,
+                          size_t filter_size,
+                          AudioFloatArray* prepared_filter) {
+  // Only contiguous convolution is implemented. Correlation (positive
+  // |filter_stride|) and support for non-contiguous vectors are not
+  // implemented.
+  DCHECK_EQ(-1, filter_stride);
+  DCHECK(prepared_filter);
+
+  // Reverse the filter and repeat each value across a vector.
+  prepared_filter->Allocate(kReversedFilterStride * kPackedFloatsPerRegister *
+                            filter_size);
+  MType* reversed_filter = reinterpret_cast<MType*>(prepared_filter->Data());
+  for (size_t i = 0; i < filter_size; ++i) {
+    reversed_filter[kReversedFilterStride * i] = MM_PS(set1)(*(filter_p - i));
+  }
+}
+
+// Direct vector convolution:
+//   dest[k] = sum(source[k+m]*filter[m*filter_stride]) for all m
+// provided that |prepared_filter_p| is |prepared_filter->Data()| and that
+// |prepared_filter| is prepared with |PrepareFilterForConv|.
+void Conv(const float* source_p,
+          const float* prepared_filter_p,
+          float* dest_p,
+          size_t frames_to_process,
+          size_t filter_size) {
+  const float* const dest_end_p = dest_p + frames_to_process;
+
+  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
+  DCHECK_EQ(0u, filter_size % kPackedFloatsPerRegister);
+
+  const MType* reversed_filter =
+      reinterpret_cast<const MType*>(prepared_filter_p);
+
+  // Do convolution with kPackedFloatsPerRegister inputs at a time.
+  while (dest_p < dest_end_p) {
+    MType m_convolution_sum = MM_PS(setzero)();
+
+    // |filter_size| is a multiple of kPackedFloatsPerRegister so we can unroll
+    // the loop by kPackedFloatsPerRegister, manually.
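+    // For example, with SSE (kPackedFloatsPerRegister == 4 and
+    // kReversedFilterStride == 2), each pass of the inner loop below adds
+    // one broadcast filter tap to four neighboring convolution sums:
+    //   m_convolution_sum[0..3] += filter[-k] * source_p[k + 0..3]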
+    for (size_t i = 0; i < filter_size; i += kPackedFloatsPerRegister) {
+      for (size_t j = 0; j < kPackedFloatsPerRegister; ++j) {
+        size_t k = i + j;
+        MType m_product;
+        MType m_source;
+
+        m_source = MM_PS(loadu)(source_p + k);
+        m_product =
+            MM_PS(mul)(reversed_filter[kReversedFilterStride * k], m_source);
+        m_convolution_sum = MM_PS(add)(m_convolution_sum, m_product);
+      }
+    }
+    MM_PS(storeu)(dest_p, m_convolution_sum);
+
+    source_p += kPackedFloatsPerRegister;
+    dest_p += kPackedFloatsPerRegister;
+  }
+}
+
+// dest[k] = source1[k] + source2[k]
+void Vadd(const float* source1p,
+          const float* source2p,
+          float* dest_p,
+          size_t frames_to_process) {
+  const float* const source1_end_p = source1p + frames_to_process;
+
+  DCHECK(IsAligned(source1p));
+  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
+
+#define ADD_ALL(loadSource2, storeDest)              \
+  while (source1p < source1_end_p) {                 \
+    MType m_source1 = MM_PS(load)(source1p);         \
+    MType m_source2 = MM_PS(loadSource2)(source2p);  \
+    MType m_dest = MM_PS(add)(m_source1, m_source2); \
+    MM_PS(storeDest)(dest_p, m_dest);                \
+    source1p += kPackedFloatsPerRegister;            \
+    source2p += kPackedFloatsPerRegister;            \
+    dest_p += kPackedFloatsPerRegister;              \
+  }
+
+  if (IsAligned(source2p)) {
+    if (IsAligned(dest_p)) {
+      ADD_ALL(load, store);
+    } else {
+      ADD_ALL(load, storeu);
+    }
+  } else {
+    if (IsAligned(dest_p)) {
+      ADD_ALL(loadu, store);
+    } else {
+      ADD_ALL(loadu, storeu);
+    }
+  }
+
+#undef ADD_ALL
+}
+
+// dest[k] = clip(source[k], low_threshold, high_threshold)
+//         = max(low_threshold, min(high_threshold, source[k]))
+void Vclip(const float* source_p,
+           const float* low_threshold_p,
+           const float* high_threshold_p,
+           float* dest_p,
+           size_t frames_to_process) {
+  const float* const source_end_p = source_p + frames_to_process;
+
+  DCHECK(IsAligned(source_p));
+  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
+
+  MType m_low_threshold = MM_PS(set1)(*low_threshold_p);
+  MType m_high_threshold = MM_PS(set1)(*high_threshold_p);
+
+#define CLIP_ALL(storeDest)                                                  \
+  while (source_p < source_end_p) {                                         \
+    MType m_source = MM_PS(load)(source_p);                                  \
+    MType m_dest =                                                           \
+        MM_PS(max)(m_low_threshold, MM_PS(min)(m_high_threshold, m_source)); \
+    MM_PS(storeDest)(dest_p, m_dest);                                        \
+    source_p += kPackedFloatsPerRegister;                                    \
+    dest_p += kPackedFloatsPerRegister;                                      \
+  }
+
+  if (IsAligned(dest_p)) {
+    CLIP_ALL(store);
+  } else {
+    CLIP_ALL(storeu);
+  }
+
+#undef CLIP_ALL
+}
+
+// *max_p = max(*max_p, source_max) where
+//   source_max = max(abs(source[k])) for all k
+void Vmaxmgv(const float* source_p, float* max_p, size_t frames_to_process) {
+  constexpr uint32_t kMask = 0x7FFFFFFFu;
+
+  const float* const source_end_p = source_p + frames_to_process;
+
+  DCHECK(IsAligned(source_p));
+  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
+
+  MType m_mask = MM_PS(set1)(*reinterpret_cast<const float*>(&kMask));
+  MType m_max = MM_PS(setzero)();
+
+  while (source_p < source_end_p) {
+    MType m_source = MM_PS(load)(source_p);
+    // Calculate the absolute value by ANDing the source with the mask,
+    // which will set the sign bit to 0.
+    m_source = MM_PS(and)(m_source, m_mask);
+    m_max = MM_PS(max)(m_source, m_max);
+    source_p += kPackedFloatsPerRegister;
+  }
+
+  // Combine the packed floats.
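+  // E.g. with SSE, m_max holds four per-lane maxima; the scalar loop below
+  // folds them into one value and merges it with the caller's *max_p.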
+  const float* maxes = reinterpret_cast<const float*>(&m_max);
+  for (unsigned i = 0u; i < kPackedFloatsPerRegister; ++i)
+    *max_p = std::max(*max_p, maxes[i]);
+}
+
+// dest[k] = source1[k] * source2[k]
+void Vmul(const float* source1p,
+          const float* source2p,
+          float* dest_p,
+          size_t frames_to_process) {
+  const float* const source1_end_p = source1p + frames_to_process;
+
+  DCHECK(IsAligned(source1p));
+  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
+
+#define MULTIPLY_ALL(loadSource2, storeDest)         \
+  while (source1p < source1_end_p) {                 \
+    MType m_source1 = MM_PS(load)(source1p);         \
+    MType m_source2 = MM_PS(loadSource2)(source2p);  \
+    MType m_dest = MM_PS(mul)(m_source1, m_source2); \
+    MM_PS(storeDest)(dest_p, m_dest);                \
+    source1p += kPackedFloatsPerRegister;            \
+    source2p += kPackedFloatsPerRegister;            \
+    dest_p += kPackedFloatsPerRegister;              \
+  }
+
+  if (IsAligned(source2p)) {
+    if (IsAligned(dest_p)) {
+      MULTIPLY_ALL(load, store);
+    } else {
+      MULTIPLY_ALL(load, storeu);
+    }
+  } else {
+    if (IsAligned(dest_p)) {
+      MULTIPLY_ALL(loadu, store);
+    } else {
+      MULTIPLY_ALL(loadu, storeu);
+    }
+  }
+
+#undef MULTIPLY_ALL
+}
+
+// dest[k] += scale * source[k]
+void Vsma(const float* source_p,
+          const float* scale,
+          float* dest_p,
+          size_t frames_to_process) {
+  const float* const source_end_p = source_p + frames_to_process;
+
+  DCHECK(IsAligned(source_p));
+  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
+
+  const MType m_scale = MM_PS(set1)(*scale);
+
+#define SCALAR_MULTIPLY_AND_ADD_ALL(loadDest, storeDest)        \
+  while (source_p < source_end_p) {                             \
+    MType m_source = MM_PS(load)(source_p);                     \
+    MType m_dest = MM_PS(loadDest)(dest_p);                     \
+    m_dest = MM_PS(add)(m_dest, MM_PS(mul)(m_scale, m_source)); \
+    MM_PS(storeDest)(dest_p, m_dest);                           \
+    source_p += kPackedFloatsPerRegister;                       \
+    dest_p += kPackedFloatsPerRegister;                         \
+  }
+
+  if (IsAligned(dest_p)) {
+    SCALAR_MULTIPLY_AND_ADD_ALL(load, store);
+  } else {
+    SCALAR_MULTIPLY_AND_ADD_ALL(loadu, storeu);
+  }
+
+#undef SCALAR_MULTIPLY_AND_ADD_ALL
+}
+
+// dest[k] = scale * source[k]
+void Vsmul(const float* source_p,
+           const float* scale,
+           float* dest_p,
+           size_t frames_to_process) {
+  const float* const source_end_p = source_p + frames_to_process;
+
+  DCHECK(IsAligned(source_p));
+  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
+
+  const MType m_scale = MM_PS(set1)(*scale);
+
+#define SCALAR_MULTIPLY_ALL(storeDest)            \
+  while (source_p < source_end_p) {               \
+    MType m_source = MM_PS(load)(source_p);       \
+    MType m_dest = MM_PS(mul)(m_scale, m_source); \
+    MM_PS(storeDest)(dest_p, m_dest);             \
+    source_p += kPackedFloatsPerRegister;         \
+    dest_p += kPackedFloatsPerRegister;           \
+  }
+
+  if (IsAligned(dest_p)) {
+    SCALAR_MULTIPLY_ALL(store);
+  } else {
+    SCALAR_MULTIPLY_ALL(storeu);
+  }
+
+#undef SCALAR_MULTIPLY_ALL
+}
+
+// sum += sum(source[k]^2) for all k
+void Vsvesq(const float* source_p, float* sum_p, size_t frames_to_process) {
+  const float* const source_end_p = source_p + frames_to_process;
+
+  DCHECK(IsAligned(source_p));
+  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
+
+  MType m_sum = MM_PS(setzero)();
+
+  while (source_p < source_end_p) {
+    MType m_source = MM_PS(load)(source_p);
+    m_sum = MM_PS(add)(m_sum, MM_PS(mul)(m_source, m_source));
+    source_p += kPackedFloatsPerRegister;
+  }
+
+  // Combine the packed floats.
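+  // As in Vmaxmgv, fold the per-lane partial sums into a scalar. The lanes
+  // are summed in a different order than a plain scalar loop would use, so
+  // the result can differ from a scalar implementation by rounding error.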
+  const float* sums = reinterpret_cast<const float*>(&m_sum);
+  for (unsigned i = 0u; i < kPackedFloatsPerRegister; ++i)
+    *sum_p += sums[i];
+}
+
+// real_dest[k] = real1[k] * real2[k] - imag1[k] * imag2[k]
+// imag_dest[k] = real1[k] * imag2[k] + imag1[k] * real2[k]
+void Zvmul(const float* real1p,
+           const float* imag1p,
+           const float* real2p,
+           const float* imag2p,
+           float* real_dest_p,
+           float* imag_dest_p,
+           size_t frames_to_process) {
+  DCHECK(IsAligned(real1p));
+  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
+
+#define MULTIPLY_ALL(loadOtherThanReal1, storeDest)                           \
+  for (size_t i = 0u; i < frames_to_process; i += kPackedFloatsPerRegister) { \
+    MType real1 = MM_PS(load)(real1p + i);                                    \
+    MType real2 = MM_PS(loadOtherThanReal1)(real2p + i);                      \
+    MType imag1 = MM_PS(loadOtherThanReal1)(imag1p + i);                      \
+    MType imag2 = MM_PS(loadOtherThanReal1)(imag2p + i);                      \
+    MType real =                                                              \
+        MM_PS(sub)(MM_PS(mul)(real1, real2), MM_PS(mul)(imag1, imag2));       \
+    MType imag =                                                              \
+        MM_PS(add)(MM_PS(mul)(real1, imag2), MM_PS(mul)(imag1, real2));       \
+    MM_PS(storeDest)(real_dest_p + i, real);                                  \
+    MM_PS(storeDest)(imag_dest_p + i, imag);                                  \
+  }
+
+  if (IsAligned(imag1p) && IsAligned(real2p) && IsAligned(imag2p) &&
+      IsAligned(real_dest_p) && IsAligned(imag_dest_p)) {
+    MULTIPLY_ALL(load, store);
+  } else {
+    MULTIPLY_ALL(loadu, storeu);
+  }
+
+#undef MULTIPLY_ALL
+}
+
+}  // namespace VECTOR_MATH_SIMD_NAMESPACE_NAME
+}  // namespace VectorMath
+}  // namespace blink
+
+#endif  // defined(ARCH_CPU_X86_FAMILY) && !defined(OS_MACOSX)
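For readers without the surrounding headers: everything above compiles only after the including header has defined VECTOR_MATH_SIMD_NAMESPACE_NAME, MType, the MM_PS macro, kBitsPerRegister, and kPackedFloatsPerRegister. A minimal sketch of how VectorMathSSE.h could plausibly provide them (an assumption inferred from how the macros are used above, not the actual Chromium header):

    // Hypothetical sketch of the definitions VectorMathSSE.h is expected to
    // supply before including vector_math_impl.h; the real header may differ.
    #include <xmmintrin.h>

    #define VECTOR_MATH_SIMD_NAMESPACE_NAME SSE
    #define MType __m128
    // MM_PS(add) expands to _mm_add_ps, MM_PS(loadu) to _mm_loadu_ps, etc.
    #define MM_PS(name) _mm_##name##_ps

    constexpr size_t kBitsPerRegister = 128u;
    constexpr size_t kPackedFloatsPerRegister = 4u;

VectorMathAVX.h would analogously map MType to __m256 and MM_PS(name) to _mm256_##name##_ps (from <immintrin.h>), with kBitsPerRegister = 256u and kPackedFloatsPerRegister = 8u. That makes kReversedFilterStride 1 for AVX and 2 for SSE, which is why a filter prepared by AVX::PrepareFilterForConv can be consumed by both Conv variants: SSE::Conv reads each 256-bit broadcast entry as two identical 128-bit halves.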