summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSnappy Team <no-reply@google.com>2020-11-13 17:58:32 +0000
committerVictor Costan <costan@google.com>2020-11-18 23:21:30 +0000
commit3bfa265a04da212cfcb6cc7acf4868647860579e (patch)
tree77945b721f73205544cdbf2ebc86450380f116a8
parent4d2dc9dcbb4c4c68872e9a2952426dd7dee91e56 (diff)
downloadsnappy-git-3bfa265a04da212cfcb6cc7acf4868647860579e.tar.gz
Revert zippy optimization that causes heap buffer overflows.
PiperOrigin-RevId: 342283314
-rw-r--r--snappy.cc194
1 files changed, 54 insertions, 140 deletions
diff --git a/snappy.cc b/snappy.cc
index b0e4414..41776a9 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -137,16 +137,6 @@ void UnalignedCopy128(const void* src, void* dst) {
std::memcpy(dst, tmp, 16);
}
-template <bool use_16bytes_chunk>
-inline void ConditionalUnalignedCopy128(const char* src, char* dst) {
- if (use_16bytes_chunk) {
- UnalignedCopy128(src, dst);
- } else {
- UnalignedCopy64(src, dst);
- UnalignedCopy64(src + 8, dst + 8);
- }
-}
-
// Copy [src, src+(op_limit-op)) to [op, (op_limit-op)) a byte at a time. Used
// for handling COPY operations where the input and output regions may overlap.
// For example, suppose:
@@ -174,54 +164,21 @@ inline char* IncrementalCopySlow(const char* src, char* op,
#if SNAPPY_HAVE_SSSE3
-// Computes the bytes for shuffle control mask (please read comments on
-// 'pattern_generation_masks' as well) for the given index_offset and
-// pattern_size. For example, when the 'offset' is 6, it will generate a
-// repeating pattern of size 6. So, the first 16 byte indexes will correspond to
-// the pattern-bytes {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3} and the
-// next 16 byte indexes will correspond to the pattern-bytes {4, 5, 0, 1, 2, 3,
-// 4, 5, 0, 1, 2, 3, 4, 5, 0, 1}. These byte index sequences are generated by
-// calling MakePatternMaskBytes(0, 6, std::make_integer_sequence<int, 16>()) and
-// MakePatternMaskBytes(16, 6, std::make_integer_sequence<int, 16>())
-// respectively.
-template <int... indexes>
-constexpr std::array<char, sizeof...(indexes)> MakePatternMaskBytes(
- int index_offset, int pattern_size,
- std::integer_sequence<int, indexes...>) {
- return {static_cast<char>((index_offset + indexes) % pattern_size)...};
-}
-
-// Computes the shuffle control mask bytes array for given pattern-sizes and
-// returns an array.
-template <int... pattern_sizes_minus_one>
-constexpr std::array<std::array<char, sizeof(__m128i)>,
- sizeof...(pattern_sizes_minus_one)>
-MakePatternMaskBytesTable(
- int index_offset, std::integer_sequence<int, pattern_sizes_minus_one...>) {
- return {MakePatternMaskBytes(
- index_offset, pattern_sizes_minus_one + 1,
- std::make_integer_sequence<int, /*indexes=*/sizeof(__m128i)>())...};
-}
-
-// This is an array of shuffle control masks that can be used as the source
+// This is a table of shuffle control masks that can be used as the source
// operand for PSHUFB to permute the contents of the destination XMM register
// into a repeating byte pattern.
-alignas(16) inline constexpr std::array<std::array<char, sizeof(__m128i)>,
- 16> pattern_generation_masks =
- MakePatternMaskBytesTable(
- /*index_offset=*/0,
- /*pattern_sizes_minus_one=*/std::make_integer_sequence<int, 16>());
-
-// Similar to 'pattern_generation_masks', this table is used to "rotate" the
-// pattern so that we can copy the *next 16 bytes* consistent with the pattern.
-// Basically, pattern_reshuffle_masks is a continuation of
-// pattern_generation_masks. It follows that, pattern_reshuffle_masks is same as
-// pattern_generation_masks for offsets 1, 2, 4, 8 and 16.
-alignas(16) inline constexpr std::array<std::array<char, sizeof(__m128i)>,
- 16> pattern_reshuffle_masks =
- MakePatternMaskBytesTable(
- /*index_offset=*/16,
- /*pattern_sizes_minus_one=*/std::make_integer_sequence<int, 16>());
+alignas(16) const char pshufb_fill_patterns[7][16] = {
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
+ {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
+ {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
+ {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},
+ {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},
+ {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},
+};
+
+// j * (16 / j) for all j from 0 to 7. 0 is not actually used.
+const uint8_t pattern_size_table[8] = {0, 16, 16, 15, 16, 15, 12, 14};
#endif // SNAPPY_HAVE_SSSE3
@@ -230,19 +187,13 @@ alignas(16) inline constexpr std::array<std::array<char, sizeof(__m128i)>,
// region of the buffer.
inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
char* const buf_limit) {
-#if SNAPPY_HAVE_SSSE3
- constexpr int big_pattern_size_lower_bound = 16;
-#else
- constexpr int big_pattern_size_lower_bound = 8;
-#endif
-
// Terminology:
//
// slop = buf_limit - op
// pat = op - src
// len = limit - op
assert(src < op);
- assert(op < op_limit);
+ assert(op <= op_limit);
assert(op_limit <= buf_limit);
// NOTE: The copy tags use 3 or 6 bits to store the copy length, so len <= 64.
assert(op_limit - op <= 64);
@@ -273,13 +224,11 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
// input. In general if we always predict len <= 16 it would be an ok
// prediction.
//
- // In order to be fast we want a pattern >= 16 bytes (or 8 bytes in non-SSE)
- // and an unrolled loop copying 1x 16 bytes (or 2x 8 bytes in non-SSE) at a
- // time.
+ // In order to be fast we want a pattern >= 8 bytes and an unrolled loop
+ // copying 2x 8 bytes at a time.
- // Handle the uncommon case where pattern is less than 16 (or 8 in non-SSE)
- // bytes.
- if (SNAPPY_PREDICT_FALSE(pattern_size < big_pattern_size_lower_bound)) {
+ // Handle the uncommon case where pattern is less than 8 bytes.
+ if (SNAPPY_PREDICT_FALSE(pattern_size < 8)) {
#if SNAPPY_HAVE_SSSE3
// Load the first eight bytes into an 128-bit XMM register, then use PSHUFB
// to permute the register's contents in-place into a repeating sequence of
@@ -293,63 +242,24 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
// The non-SSE fallback implementation suffers from store-forwarding stalls
// because its loads and stores partly overlap. By expanding the pattern
// in-place, we avoid the penalty.
- const __m128i pattern_generation_mask =
- _mm_load_si128(reinterpret_cast<const __m128i*>(
- pattern_generation_masks[pattern_size - 1].data()));
- // Uninitialized bytes are masked out by the shuffle mask.
- // TODO: remove annotation and macro defs once MSan is fixed.
- SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(op, 16 - pattern_size);
- __m128i pattern =
- _mm_shuffle_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(src)),
- pattern_generation_mask);
-
- // In addition to the 'pattern_generation_mask', this mask will generate the
- // next 16 bytes in-place. Doing so enables us to write 4 conditionals
- // similar to when 'pattern_size >= big_pattern_size_lower_bound'.
- // For example, suppose pattern is abcabcabcabcabca. Shuffling with this
- // mask will generate bcabcabcabcabcab. Shuffling again will generate
- // cabcabcabcabcabc.
- const __m128i pattern_reshuffle_mask =
- _mm_load_si128(reinterpret_cast<const __m128i*>(
- pattern_reshuffle_masks[pattern_size - 1].data()));
-
- // Typically, the op_limit is the gating factor so try to simplify the loop
- // based on that.
- if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) {
- // There is at least one, and at most four 16-byte blocks. Writing four
- // conditionals instead of a loop allows FDO to layout the code with
- // respect to the actual probabilities of each length.
- // TODO: Replace with loop with trip count hint.
- _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
-
- if (op + 16 < op_limit) {
- pattern = _mm_shuffle_epi8(pattern, pattern_reshuffle_mask);
- _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 16), pattern);
- }
- if (op + 32 < op_limit) {
- pattern = _mm_shuffle_epi8(pattern, pattern_reshuffle_mask);
- _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 32), pattern);
- }
- if (op + 48 < op_limit) {
- pattern = _mm_shuffle_epi8(pattern, pattern_reshuffle_mask);
- _mm_storeu_si128(reinterpret_cast<__m128i*>(op + 48), pattern);
+ if (SNAPPY_PREDICT_TRUE(op <= buf_limit - 16)) {
+ const __m128i shuffle_mask = _mm_load_si128(
+ reinterpret_cast<const __m128i*>(pshufb_fill_patterns) +
+ pattern_size - 1);
+ const __m128i pattern = _mm_shuffle_epi8(
+ _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src)), shuffle_mask);
+ // Uninitialized bytes are masked out by the shuffle mask.
+ // TODO: remove annotation and macro defs once MSan is fixed.
+ SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(&pattern, sizeof(pattern));
+ pattern_size = pattern_size_table[pattern_size];
+ char* op_end = std::min(op_limit, buf_limit - 15);
+ while (op < op_end) {
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
+ op += pattern_size;
}
- return op_limit;
- }
-
- // Fall back to doing as much as we can with the available slop in the
- // buffer. This code path is relatively cold however so we save code size by
- // avoiding unrolling and vectorizing.
- //
- // TODO: Remove pragma when when cold regions don't get
- // vectorized or unrolled.
-#pragma nounroll
- while (op + 16 <= op_limit) {
- _mm_storeu_si128(reinterpret_cast<__m128i*>(op), pattern);
- pattern = _mm_shuffle_epi8(pattern, pattern_reshuffle_mask);
- op += 16;
+ if (SNAPPY_PREDICT_TRUE(op >= op_limit)) return op_limit;
}
- return IncrementalCopySlow(op - pattern_size, op, op_limit);
+ return IncrementalCopySlow(src, op, op_limit);
#else // !SNAPPY_HAVE_SSSE3
// If plenty of buffer space remains, expand the pattern to at least 8
// bytes. The way the following loop is written, we need 8 bytes of buffer
@@ -369,30 +279,34 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
}
#endif // SNAPPY_HAVE_SSSE3
}
- assert(pattern_size >= big_pattern_size_lower_bound);
- constexpr bool use_16bytes_chunk = big_pattern_size_lower_bound == 16;
+ assert(pattern_size >= 8);
- // Copy 1x 16 bytes (or 2x 8 bytes in non-SSE) at a time. Because op - src can
- // be < 16 in non-SSE, a single UnalignedCopy128 might overwrite data in op.
- // UnalignedCopy64 is safe because expanding the pattern to at least 8 bytes
- // guarantees that op - src >= 8.
+ // Copy 2x 8 bytes at a time. Because op - src can be < 16, a single
+ // UnalignedCopy128 might overwrite data in op. UnalignedCopy64 is safe
+ // because expanding the pattern to at least 8 bytes guarantees that
+ // op - src >= 8.
//
// Typically, the op_limit is the gating factor so try to simplify the loop
// based on that.
- if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) {
+ if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 16)) {
// There is at least one, and at most four 16-byte blocks. Writing four
// conditionals instead of a loop allows FDO to layout the code with respect
// to the actual probabilities of each length.
// TODO: Replace with loop with trip count hint.
- ConditionalUnalignedCopy128<use_16bytes_chunk>(src, op);
+ UnalignedCopy64(src, op);
+ UnalignedCopy64(src + 8, op + 8);
+
if (op + 16 < op_limit) {
- ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 16, op + 16);
+ UnalignedCopy64(src + 16, op + 16);
+ UnalignedCopy64(src + 24, op + 24);
}
if (op + 32 < op_limit) {
- ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 32, op + 32);
+ UnalignedCopy64(src + 32, op + 32);
+ UnalignedCopy64(src + 40, op + 40);
}
if (op + 48 < op_limit) {
- ConditionalUnalignedCopy128<use_16bytes_chunk>(src + 48, op + 48);
+ UnalignedCopy64(src + 48, op + 48);
+ UnalignedCopy64(src + 56, op + 56);
}
return op_limit;
}
@@ -403,9 +317,12 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
//
// TODO: Remove pragma when when cold regions don't get vectorized
// or unrolled.
-#pragma nounroll
+#ifdef __clang__
+#pragma clang loop unroll(disable)
+#endif
for (char* op_end = buf_limit - 16; op < op_end; op += 16, src += 16) {
- ConditionalUnalignedCopy128<use_16bytes_chunk>(src, op);
+ UnalignedCopy64(src, op);
+ UnalignedCopy64(src + 8, op + 8);
}
if (op >= op_limit) return op_limit;
@@ -999,7 +916,6 @@ class SnappyDecompressor {
preload = LittleEndian::Load32(ip);
const uint32_t trailer = ExtractLowBytes(preload, c & 3);
const uint32_t length = entry & 0xff;
- assert(length > 0);
// copy_offset/256 is encoded in bits 8..10. By just fetching
// those bits, we get copy_offset (since the bit-field starts at
@@ -1345,7 +1261,6 @@ class SnappyIOVecWriter {
if (to_copy > len) {
to_copy = len;
}
- assert(to_copy > 0);
IncrementalCopy(GetIOVecPointer(from_iov, from_iov_offset),
curr_iov_output_, curr_iov_output_ + to_copy,
@@ -1435,7 +1350,6 @@ class SnappyArrayWriter {
SNAPPY_ATTRIBUTE_ALWAYS_INLINE
inline bool AppendFromSelf(size_t offset, size_t len, char** op_p) {
- assert(len > 0);
char* const op = *op_p;
assert(op >= base_);
char* const op_end = op + len;