diff options
author | Martijn van Beurden <mvanb1@gmail.com> | 2022-06-29 12:00:13 +0200 |
---|---|---|
committer | Martijn van Beurden <mvanb1@gmail.com> | 2022-07-01 21:51:31 +0200 |
commit | 7e0a0e572305e9004a6fa9bba3dd6be936553b03 (patch) | |
tree | a344ad5eccc1acb81a16f1f756a14c09e7e035e8 /src | |
parent | 633ab36ec51bada8737459e51cc54441544e63b2 (diff) | |
download | flac-7e0a0e572305e9004a6fa9bba3dd6be936553b03.tar.gz |
Enable encoder to use INT32_MIN as residual value
Because abs(INT32_MIN) is undefined behavior in C (the result cannot be
represented as a 32-bit int), it took some extra work to enable the encoder
to do this. While expected gains are zero, this is done to ensure full spec
coverage in this regard.
Diffstat (limited to 'src')
-rw-r--r-- | src/libFLAC/fixed.c | 75 | ||||
-rw-r--r-- | src/libFLAC/include/private/stream_encoder.h | 12 | ||||
-rw-r--r-- | src/libFLAC/lpc.c | 6 | ||||
-rw-r--r-- | src/libFLAC/stream_encoder.c | 37 | ||||
-rw-r--r-- | src/libFLAC/stream_encoder_intrin_avx2.c | 18 | ||||
-rw-r--r-- | src/libFLAC/stream_encoder_intrin_sse2.c | 19 | ||||
-rw-r--r-- | src/libFLAC/stream_encoder_intrin_ssse3.c | 18 |
7 files changed, 112 insertions, 73 deletions
diff --git a/src/libFLAC/fixed.c b/src/libFLAC/fixed.c index 26f46e6c..9dfe4f19 100644 --- a/src/libFLAC/fixed.c +++ b/src/libFLAC/fixed.c @@ -377,33 +377,32 @@ uint32_t FLAC__fixed_compute_best_predictor_limit_residual(const FLAC__int32 dat #endif { FLAC__uint64 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0, smallest_error = UINT64_MAX; - FLAC__uint64 error_0, error_1, error_2, error_3, error_4; + FLAC__int64 error_0, error_1, error_2, error_3, error_4; FLAC__bool order_0_is_valid = true, order_1_is_valid = true, order_2_is_valid = true, order_3_is_valid = true, order_4_is_valid = true; uint32_t order = 0; for(int i = 0; i < (int)data_len; i++) { - error_0 = local_abs64((FLAC__int64)data[i]); - error_1 = (i > 0) ? local_abs64((FLAC__int64)data[i] - data[i-1]) : 0 ; - error_2 = (i > 1) ? local_abs64((FLAC__int64)data[i] - 2 * (FLAC__int64)data[i-1] + data[i-2]) : 0; - error_3 = (i > 2) ? local_abs64((FLAC__int64)data[i] - 3 * (FLAC__int64)data[i-1] + 3 * (FLAC__int64)data[i-2] - data[i-3]) : 0; - error_4 = (i > 3) ? local_abs64((FLAC__int64)data[i] - 4 * (FLAC__int64)data[i-1] + 6 * (FLAC__int64)data[i-2] - 4 * (FLAC__int64)data[i-3] + data[i-4]) : 0; - - total_error_0 += error_0; - total_error_1 += error_1; - total_error_2 += error_2; - total_error_3 += error_3; - total_error_4 += error_4; - - /* residual must not be INT32_MIN because abs(INT32_MIN) is undefined */ - if(error_0 > INT32_MAX) + error_0 = (FLAC__int64)data[i]; + error_1 = (i > 0) ? (FLAC__int64)data[i] - data[i-1] : 0 ; + error_2 = (i > 1) ? (FLAC__int64)data[i] - 2 * (FLAC__int64)data[i-1] + data[i-2] : 0; + error_3 = (i > 2) ? (FLAC__int64)data[i] - 3 * (FLAC__int64)data[i-1] + 3 * (FLAC__int64)data[i-2] - data[i-3] : 0; + error_4 = (i > 3) ? 
(FLAC__int64)data[i] - 4 * (FLAC__int64)data[i-1] + 6 * (FLAC__int64)data[i-2] - 4 * (FLAC__int64)data[i-3] + data[i-4] : 0; + + total_error_0 += local_abs64(error_0); + total_error_1 += local_abs64(error_1); + total_error_2 += local_abs64(error_2); + total_error_3 += local_abs64(error_3); + total_error_4 += local_abs64(error_4); + + if(error_0 > INT32_MAX || error_0 < INT32_MIN) order_0_is_valid = false; - if(error_1 > INT32_MAX) + if(error_1 > INT32_MAX || error_1 < INT32_MIN) order_1_is_valid = false; - if(error_2 > INT32_MAX) + if(error_2 > INT32_MAX || error_2 < INT32_MIN) order_2_is_valid = false; - if(error_3 > INT32_MAX) + if(error_3 > INT32_MAX || error_3 < INT32_MIN) order_3_is_valid = false; - if(error_4 > INT32_MAX) + if(error_4 > INT32_MAX || error_4 < INT32_MIN) order_4_is_valid = false; } @@ -423,33 +422,33 @@ uint32_t FLAC__fixed_compute_best_predictor_limit_residual_33bit(const FLAC__int #endif { FLAC__uint64 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0, smallest_error = UINT64_MAX; - FLAC__uint64 error_0, error_1, error_2, error_3, error_4; + FLAC__int64 error_0, error_1, error_2, error_3, error_4; FLAC__bool order_0_is_valid = true, order_1_is_valid = true, order_2_is_valid = true, order_3_is_valid = true, order_4_is_valid = true; uint32_t order = 0; for(int i = 0; i < (int)data_len; i++) { - error_0 = local_abs64(data[i]); - error_1 = (i > 0) ? local_abs64(data[i] - data[i-1]) : 0 ; - error_2 = (i > 1) ? local_abs64(data[i] - 2 * data[i-1] + data[i-2]) : 0; - error_3 = (i > 2) ? local_abs64(data[i] - 3 * data[i-1] + 3 * data[i-2] - data[i-3]) : 0; - error_4 = (i > 3) ? 
local_abs64(data[i] - 4 * data[i-1] + 6 * data[i-2] - 4 * data[i-3] + data[i-4]) : 0; - - total_error_0 += error_0; - total_error_1 += error_1; - total_error_2 += error_2; - total_error_3 += error_3; - total_error_4 += error_4; - - /* residual must not be INT32_MIN because abs(INT32_MIN) is undefined */ - if(error_0 > INT32_MAX) + error_0 = data[i]; + error_1 = (i > 0) ? data[i] - data[i-1] : 0 ; + error_2 = (i > 1) ? data[i] - 2 * data[i-1] + data[i-2] : 0; + error_3 = (i > 2) ? data[i] - 3 * data[i-1] + 3 * data[i-2] - data[i-3] : 0; + error_4 = (i > 3) ? data[i] - 4 * data[i-1] + 6 * data[i-2] - 4 * data[i-3] + data[i-4] : 0; + + total_error_0 += local_abs64(error_0); + total_error_1 += local_abs64(error_1); + total_error_2 += local_abs64(error_2); + total_error_3 += local_abs64(error_3); + total_error_4 += local_abs64(error_4); + + + if(error_0 > INT32_MAX || error_0 < INT32_MIN) order_0_is_valid = false; - if(error_1 > INT32_MAX) + if(error_1 > INT32_MAX || error_1 < INT32_MIN) order_1_is_valid = false; - if(error_2 > INT32_MAX) + if(error_2 > INT32_MAX || error_2 < INT32_MIN) order_2_is_valid = false; - if(error_3 > INT32_MAX) + if(error_3 > INT32_MAX || error_3 < INT32_MIN) order_3_is_valid = false; - if(error_4 > INT32_MAX) + if(error_4 > INT32_MAX || error_4 < INT32_MIN) order_4_is_valid = false; } diff --git a/src/libFLAC/include/private/stream_encoder.h b/src/libFLAC/include/private/stream_encoder.h index ade648bf..ed6af122 100644 --- a/src/libFLAC/include/private/stream_encoder.h +++ b/src/libFLAC/include/private/stream_encoder.h @@ -37,29 +37,23 @@ #include <config.h> #endif -/* - * This is used to avoid overflow with unusual signals in 32-bit - * accumulator in the *precompute_partition_info_sums_* functions. 
- */ -#define FLAC__MAX_EXTRA_RESIDUAL_BPS 4 - #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN #include "private/cpu.h" #include "FLAC/format.h" #ifdef FLAC__SSE2_SUPPORTED extern void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], - uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps); + uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t max_residual_bps); #endif #ifdef FLAC__SSSE3_SUPPORTED extern void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], - uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps); + uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t max_residual_bps); #endif #ifdef FLAC__AVX2_SUPPORTED extern void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], - uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps); + uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t max_residual_bps); #endif #endif diff --git a/src/libFLAC/lpc.c b/src/libFLAC/lpc.c index 3fe3a2b7..2e8fa51c 100644 --- a/src/libFLAC/lpc.c +++ b/src/libFLAC/lpc.c @@ -828,8 +828,7 @@ FLAC__bool FLAC__lpc_compute_residual_from_qlp_coefficients_limit_residual(const case 1: sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1]; } residual_to_check = data[i] - (sum >> lp_quantization); - /* residual must not be INT32_MIN because abs(INT32_MIN) is undefined */ - if(residual_to_check <= INT32_MIN || 
residual_to_check > INT32_MAX) + if(residual_to_check < INT32_MIN || residual_to_check > INT32_MAX) return false; else residual[i] = residual_to_check; @@ -882,8 +881,7 @@ FLAC__bool FLAC__lpc_compute_residual_from_qlp_coefficients_limit_residual_33bit case 1: sum += qlp_coeff[ 0] * data[i- 1]; } residual_to_check = data[i] - (sum >> lp_quantization); - /* residual must not be INT32_MIN because abs(INT32_MIN) is undefined */ - if(residual_to_check <= INT32_MIN || residual_to_check > INT32_MAX) + if(residual_to_check < INT32_MIN || residual_to_check > INT32_MAX) return false; else residual[i] = residual_to_check; diff --git a/src/libFLAC/stream_encoder.c b/src/libFLAC/stream_encoder.c index 34222174..3f3dc326 100644 --- a/src/libFLAC/stream_encoder.c +++ b/src/libFLAC/stream_encoder.c @@ -231,7 +231,7 @@ static uint32_t find_best_partition_order_( uint32_t rice_parameter_limit, uint32_t min_partition_order, uint32_t max_partition_order, - uint32_t bps, + uint32_t max_residual_bps, FLAC__bool do_escape_coding, uint32_t rice_parameter_search_dist, FLAC__EntropyCodingMethod *best_ecm @@ -244,7 +244,7 @@ static void precompute_partition_info_sums_( uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, - uint32_t bps + uint32_t max_residual_bps ); static void precompute_partition_info_escapes_( @@ -349,7 +349,7 @@ typedef struct FLAC__StreamEncoderPrivate { uint32_t current_frame_number; FLAC__MD5Context md5context; FLAC__CPUInfo cpuinfo; - void (*local_precompute_partition_info_sums)(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps); + void (*local_precompute_partition_info_sums)(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t max_residual_bps); 
#ifndef FLAC__INTEGER_ONLY_LIBRARY uint32_t (*local_fixed_compute_best_predictor)(const FLAC__int32 data[], uint32_t data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]); uint32_t (*local_fixed_compute_best_predictor_wide)(const FLAC__int32 data[], uint32_t data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]); @@ -3873,7 +3873,7 @@ uint32_t evaluate_fixed_subframe_( rice_parameter_limit, min_partition_order, max_partition_order, - subframe_bps, + (subframe_bps + order), do_escape_coding, rice_parameter_search_dist, &subframe->data.fixed.entropy_coding_method @@ -3972,7 +3972,7 @@ uint32_t evaluate_lpc_subframe_( rice_parameter_limit, min_partition_order, max_partition_order, - subframe_bps, + FLAC__lpc_max_residual_bps(subframe_bps, qlp_coeff, order, quantization), do_escape_coding, rice_parameter_search_dist, &subframe->data.lpc.entropy_coding_method @@ -4046,7 +4046,7 @@ uint32_t find_best_partition_order_( uint32_t rice_parameter_limit, uint32_t min_partition_order, uint32_t max_partition_order, - uint32_t bps, + uint32_t max_residual_bps, FLAC__bool do_escape_coding, uint32_t rice_parameter_search_dist, FLAC__EntropyCodingMethod *best_ecm @@ -4060,7 +4060,7 @@ uint32_t find_best_partition_order_( max_partition_order = FLAC__format_get_max_rice_partition_order_from_blocksize_limited_max_and_predictor_order(max_partition_order, blocksize, predictor_order); min_partition_order = flac_min(min_partition_order, max_partition_order); - private_->local_precompute_partition_info_sums(residual, abs_residual_partition_sums, residual_samples, predictor_order, min_partition_order, max_partition_order, bps); + private_->local_precompute_partition_info_sums(residual, abs_residual_partition_sums, residual_samples, predictor_order, min_partition_order, max_partition_order, max_residual_bps); if(do_escape_coding) precompute_partition_info_escapes_(residual, raw_bits_per_partition, residual_samples, predictor_order, min_partition_order, 
max_partition_order); @@ -4138,7 +4138,7 @@ void precompute_partition_info_sums_( uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, - uint32_t bps + uint32_t max_residual_bps ) { const uint32_t default_partition_samples = (residual_samples + predictor_order) >> max_partition_order; @@ -4150,22 +4150,33 @@ void precompute_partition_info_sums_( { const uint32_t threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples); uint32_t partition, residual_sample, end = (uint32_t)(-(int)predictor_order); - /* WATCHOUT: "bps + FLAC__MAX_EXTRA_RESIDUAL_BPS" is the maximum assumed size of the average residual magnitude */ - if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) { + if(max_residual_bps < threshold) { for(partition = residual_sample = 0; partition < partitions; partition++) { FLAC__uint32 abs_residual_partition_sum = 0; end += default_partition_samples; for( ; residual_sample < end; residual_sample++) - abs_residual_partition_sum += abs(residual[residual_sample]); /* abs(INT_MIN) is undefined, but if the residual is INT_MIN we have bigger problems */ + abs_residual_partition_sum += abs(residual[residual_sample]); abs_residual_partition_sums[partition] = abs_residual_partition_sum; } } - else { /* have to pessimistically use 64 bits for accumulator */ + else if(max_residual_bps < 32) { /* have to pessimistically use 64 bits for accumulator */ for(partition = residual_sample = 0; partition < partitions; partition++) { FLAC__uint64 abs_residual_partition_sum64 = 0; end += default_partition_samples; for( ; residual_sample < end; residual_sample++) - abs_residual_partition_sum64 += abs(residual[residual_sample]); /* abs(INT_MIN) is undefined, but if the residual is INT_MIN we have bigger problems */ + abs_residual_partition_sum64 += abs(residual[residual_sample]); + abs_residual_partition_sums[partition] = abs_residual_partition_sum64; + } + } + else { /* must handle abs(INT32_MIN) */ + for(partition = residual_sample = 0; 
partition < partitions; partition++) { + FLAC__uint64 abs_residual_partition_sum64 = 0; + end += default_partition_samples; + for( ; residual_sample < end; residual_sample++) + if(residual[residual_sample] == INT32_MIN) + abs_residual_partition_sum64 -= (FLAC__int64)INT32_MIN; + else + abs_residual_partition_sum64 += abs(residual[residual_sample]); abs_residual_partition_sums[partition] = abs_residual_partition_sum64; } } diff --git a/src/libFLAC/stream_encoder_intrin_avx2.c b/src/libFLAC/stream_encoder_intrin_avx2.c index 7a06ea15..863ae4d2 100644 --- a/src/libFLAC/stream_encoder_intrin_avx2.c +++ b/src/libFLAC/stream_encoder_intrin_avx2.c @@ -48,7 +48,7 @@ FLAC__SSE_TARGET("avx2") void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], - uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps) + uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t max_residual_bps) { const uint32_t default_partition_samples = (residual_samples + predictor_order) >> max_partition_order; uint32_t partitions = 1u << max_partition_order; @@ -60,7 +60,7 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual const uint32_t threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples); uint32_t partition, residual_sample, end = (uint32_t)(-(int32_t)predictor_order); - if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) { + if(max_residual_bps < threshold) { for(partition = residual_sample = 0; partition < partitions; partition++) { __m256i sum256 = _mm256_setzero_si256(); __m128i sum128; @@ -92,7 +92,7 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual #endif } } - else { /* have to pessimistically use 64 bits for accumulator */ + else if(max_residual_bps < 32) { /* have to pessimistically use 64 bits for accumulator 
*/ for(partition = residual_sample = 0; partition < partitions; partition++) { __m256i sum256 = _mm256_setzero_si256(); __m128i sum128; @@ -121,6 +121,18 @@ void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual _mm_storel_epi64((__m128i*)(void*)(abs_residual_partition_sums+partition), sum128); } } + else { /* must handle abs(INT32_MIN) */ + for(partition = residual_sample = 0; partition < partitions; partition++) { + FLAC__uint64 abs_residual_partition_sum64 = 0; + end += default_partition_samples; + for( ; residual_sample < end; residual_sample++) + if(residual[residual_sample] == INT32_MIN) + abs_residual_partition_sum64 -= (FLAC__int64)INT32_MIN; + else + abs_residual_partition_sum64 += abs(residual[residual_sample]); + abs_residual_partition_sums[partition] = abs_residual_partition_sum64; + } + } } /* now merge partitions for lower orders */ diff --git a/src/libFLAC/stream_encoder_intrin_sse2.c b/src/libFLAC/stream_encoder_intrin_sse2.c index 04e560ba..a8490f4c 100644 --- a/src/libFLAC/stream_encoder_intrin_sse2.c +++ b/src/libFLAC/stream_encoder_intrin_sse2.c @@ -59,7 +59,7 @@ static inline __m128i local_abs_epi32(__m128i val) FLAC__SSE_TARGET("sse2") void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], - uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps) + uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t max_residual_bps) { const uint32_t default_partition_samples = (residual_samples + predictor_order) >> max_partition_order; uint32_t partitions = 1u << max_partition_order; @@ -71,7 +71,7 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual const uint32_t threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples); uint32_t partition, residual_sample, end = 
(uint32_t)(-(int32_t)predictor_order); - if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) { + if(max_residual_bps < threshold) { for(partition = residual_sample = 0; partition < partitions; partition++) { __m128i mm_sum = _mm_setzero_si128(); uint32_t e1, e3; @@ -106,7 +106,7 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual #endif } } - else { /* have to pessimistically use 64 bits for accumulator */ + else if(max_residual_bps < 32) { /* have to pessimistically use 64 bits for accumulator */ for(partition = residual_sample = 0; partition < partitions; partition++) { __m128i mm_sum = _mm_setzero_si128(); uint32_t e1, e3; @@ -135,6 +135,19 @@ void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual _mm_storel_epi64((__m128i*)(void*)(abs_residual_partition_sums+partition), mm_sum); } } + else { /* must handle abs(INT32_MIN) */ + for(partition = residual_sample = 0; partition < partitions; partition++) { + FLAC__uint64 abs_residual_partition_sum64 = 0; + end += default_partition_samples; + for( ; residual_sample < end; residual_sample++) + if(residual[residual_sample] == INT32_MIN) + abs_residual_partition_sum64 -= (FLAC__int64)INT32_MIN; + else + abs_residual_partition_sum64 += abs(residual[residual_sample]); + abs_residual_partition_sums[partition] = abs_residual_partition_sum64; + } + } + } /* now merge partitions for lower orders */ diff --git a/src/libFLAC/stream_encoder_intrin_ssse3.c b/src/libFLAC/stream_encoder_intrin_ssse3.c index d7395710..628cd5ca 100644 --- a/src/libFLAC/stream_encoder_intrin_ssse3.c +++ b/src/libFLAC/stream_encoder_intrin_ssse3.c @@ -48,7 +48,7 @@ FLAC__SSE_TARGET("ssse3") void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], - uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps) + uint32_t residual_samples, uint32_t predictor_order, 
uint32_t min_partition_order, uint32_t max_partition_order, uint32_t max_residual_bps) { const uint32_t default_partition_samples = (residual_samples + predictor_order) >> max_partition_order; uint32_t partitions = 1u << max_partition_order; @@ -60,7 +60,7 @@ void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residua const uint32_t threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples); uint32_t partition, residual_sample, end = (uint32_t)(-(int32_t)predictor_order); - if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) { + if(max_residual_bps < threshold) { for(partition = residual_sample = 0; partition < partitions; partition++) { __m128i mm_sum = _mm_setzero_si128(); uint32_t e1, e3; @@ -95,7 +95,7 @@ void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residua #endif } } - else { /* have to pessimistically use 64 bits for accumulator */ + else if(max_residual_bps < 32) { /* have to pessimistically use 64 bits for accumulator */ for(partition = residual_sample = 0; partition < partitions; partition++) { __m128i mm_sum = _mm_setzero_si128(); uint32_t e1, e3; @@ -124,6 +124,18 @@ void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residua _mm_storel_epi64((__m128i*)(void*)(abs_residual_partition_sums+partition), mm_sum); } } + else { /* must handle abs(INT32_MIN) */ + for(partition = residual_sample = 0; partition < partitions; partition++) { + FLAC__uint64 abs_residual_partition_sum64 = 0; + end += default_partition_samples; + for( ; residual_sample < end; residual_sample++) + if(residual[residual_sample] == INT32_MIN) + abs_residual_partition_sum64 -= (FLAC__int64)INT32_MIN; + else + abs_residual_partition_sum64 += abs(residual[residual_sample]); + abs_residual_partition_sums[partition] = abs_residual_partition_sum64; + } + } } /* now merge partitions for lower orders */ |