diff options
author: Snappy Team <> 2022-10-11 16:00:34 +0000
committer: Victor Costan <> 2023-01-12 13:32:43 +0000
commit: a2d219a8a801ae522bac8e966de005fcb336821b (patch)
parent: 984b191f0fefdeb17050b42a90b7625999c13b8d (diff)
Modify MemCopy64 to use AVX 32-byte copies instead of SSE2 16-byte copies on capable x86 platforms. This gives an average speedup of 6.87% on Milan and 1.90% on Skylake.
PiperOrigin-RevId: 480370725
1 file changed, 16 insertions, 7 deletions
diff --git a/ b/
index b072e5d..932f59f 100644
--- a/
+++ b/
@@ -989,27 +989,36 @@ inline bool Copy64BytesWithPatternExtension(ptrdiff_t dst, size_t offset) {
// so gives better performance. [src, src + size) must not overlap with
// [dst, dst + size), but [src, src + 64) may overlap with [dst, dst + 64).
void MemCopy64(char* dst, const void* src, size_t size) {
- // Always copy this many bytes, test if we need to copy more.
+ // Always copy this many bytes. If that's below size then copy the full 64.
constexpr int kShortMemCopy = 32;
- // We're always allowed to copy 64 bytes, so if we exceed kShortMemCopy just
- // copy 64 rather than the exact amount.
- constexpr int kLongMemCopy = 64;
- assert(size <= kLongMemCopy);
+ assert(size <= 64);
assert(std::less_equal<const void*>()(static_cast<const char*>(src) + size,
dst) ||
std::less_equal<const void*>()(dst + size, src));
// We know that src and dst are at least size bytes apart. However, because we
// might copy more than size bytes the copy still might overlap past size.
- // E.g. if src and dst appear consecutively in memory (src + size == dst).
+ // E.g. if src and dst appear consecutively in memory (src + size >= dst).
+ // TODO: Investigate wider copies on other platforms.
+#if defined(__x86_64__) && defined(__AVX__)
+ assert(kShortMemCopy <= 32);
+ __m256i data = _mm256_lddqu_si256(static_cast<const __m256i *>(src));
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), data);
+ // Profiling shows that nearly all copies are short.
+ if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
+ data = _mm256_lddqu_si256(static_cast<const __m256i *>(src) + 1);
+ _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, data);
+ }
std::memmove(dst, src, kShortMemCopy);
// Profiling shows that nearly all copies are short.
if (SNAPPY_PREDICT_FALSE(size > kShortMemCopy)) {
std::memmove(dst + kShortMemCopy,
static_cast<const uint8_t*>(src) + kShortMemCopy,
- kLongMemCopy - kShortMemCopy);
+ 64 - kShortMemCopy);
void MemCopy64(ptrdiff_t dst, const void* src, size_t size) {