summaryrefslogtreecommitdiff
path: root/snappy-internal.h
diff options
context:
space:
mode:
authoratdt <atdt@google.com>2018-03-26 21:55:23 -0700
committerVictor Costan <costan@gmail.com>2018-08-04 18:51:07 -0700
commit8f469d97e2ccbd87e5c00d25b1969daebfbfa350 (patch)
tree81dc12de971007e567688266f2d4cf756e1ed963 /snappy-internal.h
parent4f7bd2dbfd12bfda77488baf46c2f7648c9f1999 (diff)
downloadsnappy-git-8f469d97e2ccbd87e5c00d25b1969daebfbfa350.tar.gz
Avoid store-forwarding stalls in Zippy's IncrementalCopy
NEW: Annotate `pattern` as initialized, for MSan. Snappy's IncrementalCopy routine optimizes for speed by reading and writing memory in blocks of eight or sixteen bytes. If the gap between the source and destination pointers is smaller than eight bytes, snappy's strategy is to expand the gap by issuing a series of partly-overlapping eight-byte loads+stores. Because the range of each load partly overlaps that of the store which preceded it, the store buffer cannot be forwarded to the load, and the load stalls while it waits for the store to retire. This is called a store-forwarding stall. We can use fewer loads and avoid most of the stalls by loading the first eight bytes into an 128-bit XMM register, then using PSHUFB to permute the register's contents in-place into the desired repeating sequence of bytes. When falling back to IncrementalCopySlow, use memset if the pattern size == 1. This eliminates around 60% of the stalls. name old time/op new time/op delta BM_UFlat/0 [html] 48.6µs ± 0% 48.2µs ± 0% -0.92% (p=0.000 n=19+18) BM_UFlat/1 [urls] 589µs ± 0% 576µs ± 0% -2.17% (p=0.000 n=19+18) BM_UFlat/2 [jpg] 7.12µs ± 0% 7.10µs ± 0% ~ (p=0.071 n=19+18) BM_UFlat/3 [jpg_200] 162ns ± 0% 151ns ± 0% -7.06% (p=0.000 n=19+18) BM_UFlat/4 [pdf] 8.25µs ± 0% 8.19µs ± 0% -0.74% (p=0.000 n=19+18) BM_UFlat/5 [html4] 218µs ± 0% 218µs ± 0% +0.09% (p=0.000 n=17+18) BM_UFlat/6 [txt1] 191µs ± 0% 189µs ± 0% -1.12% (p=0.000 n=19+18) BM_UFlat/7 [txt2] 168µs ± 0% 167µs ± 0% -1.01% (p=0.000 n=19+18) BM_UFlat/8 [txt3] 502µs ± 0% 499µs ± 0% -0.52% (p=0.000 n=19+18) BM_UFlat/9 [txt4] 704µs ± 0% 695µs ± 0% -1.26% (p=0.000 n=19+18) BM_UFlat/10 [pb] 45.6µs ± 0% 44.2µs ± 0% -3.13% (p=0.000 n=19+15) BM_UFlat/11 [gaviota] 188µs ± 0% 194µs ± 0% +3.06% (p=0.000 n=15+18) BM_UFlat/12 [cp] 15.1µs ± 2% 14.7µs ± 1% -2.09% (p=0.000 n=18+18) BM_UFlat/13 [c] 7.38µs ± 0% 7.36µs ± 0% -0.28% (p=0.000 n=16+18) BM_UFlat/14 [lsp] 2.31µs ± 0% 2.37µs ± 0% +2.64% (p=0.000 n=19+18) BM_UFlat/15 [xls] 984µs ± 0% 909µs ± 0% -7.59% (p=0.000 n=19+18) BM_UFlat/16 [xls_200] 215ns ± 0% 217ns ± 0% +0.71% (p=0.000 n=19+15) BM_UFlat/17 [bin] 289µs ± 0% 287µs ± 0% -0.71% (p=0.000 n=19+18) BM_UFlat/18 [bin_200] 161ns ± 0% 116ns ± 0% -28.09% (p=0.000 n=19+16) BM_UFlat/19 [sum] 31.9µs ± 0% 29.2µs ± 0% -8.37% (p=0.000 n=19+18) BM_UFlat/20 [man] 3.13µs ± 1% 3.07µs ± 0% -1.79% (p=0.000 n=19+18) name old allocs/op new allocs/op delta BM_UFlat/0 [html] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/1 [urls] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/2 [jpg] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/3 [jpg_200] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/4 [pdf] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/5 [html4] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/6 [txt1] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/7 [txt2] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/8 [txt3] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/9 [txt4] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/10 [pb] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/11 [gaviota] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/12 [cp] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/13 [c] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/14 [lsp] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/15 [xls] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/16 [xls_200] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/17 [bin] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/18 [bin_200] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/19 [sum] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) BM_UFlat/20 [man] 0.00 ±NaN% 0.00 ±NaN% ~ (all samples are equal) name old speed new speed delta BM_UFlat/0 [html] 2.11GB/s ± 0% 2.13GB/s ± 0% +0.92% (p=0.000 n=19+18) BM_UFlat/1 [urls] 1.19GB/s ± 0% 1.22GB/s ± 0% +2.22% (p=0.000 n=16+17) BM_UFlat/2 [jpg] 17.3GB/s ± 0% 17.3GB/s ± 0% ~ (p=0.074 n=19+18) BM_UFlat/3 [jpg_200] 1.23GB/s ± 0% 1.33GB/s ± 0% +7.58% (p=0.000 n=19+18) BM_UFlat/4 [pdf] 12.4GB/s ± 0% 12.5GB/s ± 0% +0.74% (p=0.000 n=19+18) BM_UFlat/5 [html4] 1.88GB/s ± 0% 1.88GB/s ± 0% -0.09% (p=0.000 n=18+18) BM_UFlat/6 [txt1] 798MB/s ± 0% 807MB/s ± 0% +1.13% (p=0.000 n=19+18) BM_UFlat/7 [txt2] 743MB/s ± 0% 751MB/s ± 0% +1.02% (p=0.000 n=19+18) BM_UFlat/8 [txt3] 850MB/s ± 0% 855MB/s ± 0% +0.52% (p=0.000 n=19+18) BM_UFlat/9 [txt4] 684MB/s ± 0% 693MB/s ± 0% +1.28% (p=0.000 n=19+18) BM_UFlat/10 [pb] 2.60GB/s ± 0% 2.69GB/s ± 0% +3.25% (p=0.000 n=19+16) BM_UFlat/11 [gaviota] 979MB/s ± 0% 950MB/s ± 0% -2.97% (p=0.000 n=15+18) BM_UFlat/12 [cp] 1.63GB/s ± 2% 1.67GB/s ± 1% +2.13% (p=0.000 n=18+18) BM_UFlat/13 [c] 1.51GB/s ± 0% 1.52GB/s ± 0% +0.29% (p=0.000 n=16+18) BM_UFlat/14 [lsp] 1.61GB/s ± 1% 1.57GB/s ± 0% -2.57% (p=0.000 n=19+18) BM_UFlat/15 [xls] 1.05GB/s ± 0% 1.13GB/s ± 0% +8.22% (p=0.000 n=19+18) BM_UFlat/16 [xls_200] 928MB/s ± 0% 921MB/s ± 0% -0.81% (p=0.000 n=19+17) BM_UFlat/17 [bin] 1.78GB/s ± 0% 1.79GB/s ± 0% +0.71% (p=0.000 n=19+18) BM_UFlat/18 [bin_200] 1.24GB/s ± 0% 1.72GB/s ± 0% +38.92% (p=0.000 n=19+18) BM_UFlat/19 [sum] 1.20GB/s ± 0% 1.31GB/s ± 0% +9.15% (p=0.000 n=19+18) BM_UFlat/20 [man] 1.35GB/s ± 1% 1.38GB/s ± 0% +1.84% (p=0.000 n=19+18)
Diffstat (limited to 'snappy-internal.h')
-rw-r--r--snappy-internal.h13
1 files changed, 13 insertions, 0 deletions
diff --git a/snappy-internal.h b/snappy-internal.h
index 4b53d59..ceec337 100644
--- a/snappy-internal.h
+++ b/snappy-internal.h
@@ -218,6 +218,19 @@ static const uint16 char_table[256] = {
0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
};
+// This is a table of shuffle control masks that can be used as the source
+// operand for PSHUFB to permute the contents of the destination XMM register
+// into a repeating byte pattern.
+alignas(16) static const char pshufb_fill_patterns[7][16] = {
+ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
+ {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0},
+ {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3},
+ {0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0},
+ {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3},
+ {0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1},
+};
+
} // end namespace internal
} // end namespace snappy