diff options
Diffstat (limited to 'src/libFLAC/stream_encoder_intrin_avx2.c')
-rw-r--r-- | src/libFLAC/stream_encoder_intrin_avx2.c | 18 |
1 files changed, 15 insertions, 3 deletions
diff --git a/src/libFLAC/stream_encoder_intrin_avx2.c b/src/libFLAC/stream_encoder_intrin_avx2.c index 7a06ea15..863ae4d2 100644 --- a/src/libFLAC/stream_encoder_intrin_avx2.c +++ b/src/libFLAC/stream_encoder_intrin_avx2.c @@ -48,7 +48,7 @@ FLAC__SSE_TARGET("avx2") void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], - uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps) + uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t max_residual_bps) { const uint32_t default_partition_samples = (residual_samples + predictor_order) >> max_partition_order; uint32_t partitions = 1u << max_partition_order; @@ -60,7 +60,7 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual const uint32_t threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples); uint32_t partition, residual_sample, end = (uint32_t)(-(int32_t)predictor_order); - if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) { + if(max_residual_bps < threshold) { for(partition = residual_sample = 0; partition < partitions; partition++) { __m256i sum256 = _mm256_setzero_si256(); __m128i sum128; @@ -92,7 +92,7 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual #endif } } - else { /* have to pessimistically use 64 bits for accumulator */ + else if(max_residual_bps < 32) { /* have to pessimistically use 64 bits for accumulator */ for(partition = residual_sample = 0; partition < partitions; partition++) { __m256i sum256 = _mm256_setzero_si256(); __m128i sum128; @@ -121,6 +121,18 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual _mm_storel_epi64((__m128i*)(void*)(abs_residual_partition_sums+partition), sum128); } } + else { /* must handle abs(INT32_MIN) */ + for(partition = residual_sample = 0; partition < partitions; partition++) { + FLAC__uint64 abs_residual_partition_sum64 = 0; + end += default_partition_samples; + for( ; residual_sample < end; residual_sample++) + if(residual[residual_sample] == INT32_MIN) + abs_residual_partition_sum64 -= (FLAC__int64)INT32_MIN; + else + abs_residual_partition_sum64 += abs(residual[residual_sample]); + abs_residual_partition_sums[partition] = abs_residual_partition_sum64; + } + } } /* now merge partitions for lower orders */ |