author     Martin Möhrmann <moehrmann@google.com>  2019-08-24 10:38:23 +0200
committer  Martin Möhrmann <moehrmann@google.com>  2019-09-09 06:49:17 +0000
commit     9ec7074a946b7c2812a1a044e84276a36f46d14d (patch)
tree       04d57d7f886f2df72d35cfe5108dd04aef6d2e45 /src
parent     844e642392e1586e6631aafeda5007f9f0f55145 (diff)
cmd/compile: prefer an AND instead of SHR+SHL instructions
On modern 64-bit CPUs a SHR, SHL or AND instruction takes 1 cycle to execute. A pair of shifts that operate on the same register will take 2 cycles and needs to wait for the input register value to be available.

Large constants used to mask the high bits of a register with an AND instruction cannot be encoded as an immediate in the AND instruction on amd64 and therefore need to be loaded into a register with a MOV instruction. However, that MOV instruction is not dependent on the output register and on many CPUs does not compete with the AND or shift instructions for execution ports.

Using a pair of shifts to mask the high bits of a register instead of an AND has a shorter encoding and uses one less general-purpose register, but it is slower: it takes one clock cycle longer, unless there is register pressure that would force the AND variant to generate a spill.

For example, the instructions emitted for (x & 1 << 63) before this CL are:

48c1ea3f                SHRQ $0x3f, DX
48c1e23f                SHLQ $0x3f, DX

After this CL the instructions are the same as GCC and LLVM use:

48b80000000000000080    MOVQ $0x8000000000000000, AX
4821d0                  ANDQ DX, AX

Some platforms, such as arm64, already have SSA optimization rules to fuse two shift instructions back into an AND. Removing the general rule that rewrites AND to SHR+SHL speeds up this benchmark:

var GlobalU uint

func BenchmarkAndHighBits(b *testing.B) {
	x := uint(0)
	for i := 0; i < b.N; i++ {
		x &= 1 << 63
	}
	GlobalU = x
}

amd64/darwin on Intel(R) Core(TM) i7-3520M CPU @ 2.90GHz:

name           old time/op  new time/op  delta
AndHighBits-4  0.61ns ± 6%  0.42ns ± 6%  -31.42%  (p=0.000 n=25+25)

Updates #33826
Updates #32781

Change-Id: I862d3587446410c447b9a7265196b57f85358633
Reviewed-on: https://go-review.googlesource.com/c/go/+/191780
Run-TryBot: Martin Möhrmann <moehrmann@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
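To reproduce the codegen difference locally, here is a minimal sketch (the file and function names are illustrative, not part of this CL); feeding it to the compiler with -S dumps the instructions chosen for the masking pattern:

// highbit.go — the masking pattern whose lowering this CL changes.
// Inspect the emitted code with: go tool compile -S highbit.go
package p

func andHighBit(x uint) uint {
	// amd64, before this CL: SHRQ $0x3f + SHLQ $0x3f
	// amd64, after this CL:  MOVQ $0x8000000000000000, reg + ANDQ
	return x & (1 << 63)
}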
Diffstat (limited to 'src')
 src/cmd/compile/internal/ssa/gen/ARM64.rules   |   5
 src/cmd/compile/internal/ssa/gen/generic.rules |   8
 src/cmd/compile/internal/ssa/rewritegeneric.go | 114
 3 files changed, 6 insertions(+), 121 deletions(-)
diff --git a/src/cmd/compile/internal/ssa/gen/ARM64.rules b/src/cmd/compile/internal/ssa/gen/ARM64.rules
index d4b47bfb0b..6539a0ce7b 100644
--- a/src/cmd/compile/internal/ssa/gen/ARM64.rules
+++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules
@@ -1863,9 +1863,8 @@
(XORshiftLL <t> [c] (UBFX [bfc] x) x2) && c < 32 && t.Size() == 4 && bfc == armBFAuxInt(32-c, c)
-> (EXTRWconst [32-c] x2 x)
-// Generic rules rewrite certain AND to a pair of shifts.
-// However, on ARM64 the bitmask can fit into an instruction.
-// Rewrite it back to AND.
+// Rewrite special pairs of shifts to AND.
+// On ARM64 the bitmask can fit into an instruction.
(SRLconst [c] (SLLconst [c] x)) && 0 < c && c < 64 -> (ANDconst [1<<uint(64-c)-1] x) // mask out high bits
(SLLconst [c] (SRLconst [c] x)) && 0 < c && c < 64 -> (ANDconst [^(1<<uint(c)-1)] x) // mask out low bits
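As a sanity check on the two ANDconst masks above, the following self-contained sketch (an illustration, not part of the CL) verifies that for every 0 < c < 64 the shift pairs compute exactly 1<<uint(64-c)-1 and ^(1<<uint(c)-1):

package main

import "fmt"

func main() {
	const x uint64 = 0xdeadbeefcafef00d // arbitrary bit pattern
	for c := uint(1); c < 64; c++ {
		// (SRLconst [c] (SLLconst [c] x)) should equal (ANDconst [1<<(64-c)-1] x).
		if (x<<c)>>c != x&(1<<(64-c)-1) {
			fmt.Println("high-bit mask mismatch at c =", c)
		}
		// (SLLconst [c] (SRLconst [c] x)) should equal (ANDconst [^(1<<c-1)] x).
		if (x>>c)<<c != x&^(1<<c-1) {
			fmt.Println("low-bit mask mismatch at c =", c)
		}
	}
	fmt.Println("shift pairs and AND masks agree for 0 < c < 64")
}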
diff --git a/src/cmd/compile/internal/ssa/gen/generic.rules b/src/cmd/compile/internal/ssa/gen/generic.rules
index ef5d7a63ff..8696464a70 100644
--- a/src/cmd/compile/internal/ssa/gen/generic.rules
+++ b/src/cmd/compile/internal/ssa/gen/generic.rules
@@ -542,14 +542,6 @@
(Slicemask (Const64 [x])) && x > 0 -> (Const64 [-1])
(Slicemask (Const64 [0])) -> (Const64 [0])
-// Rewrite AND of consts as shifts if possible, slightly faster for 64 bit operands
-// leading zeros can be shifted left, then right
-(And64 <t> (Const64 [y]) x) && nlz(y) + nto(y) == 64 && nto(y) >= 32
- -> (Rsh64Ux64 (Lsh64x64 <t> x (Const64 <t> [nlz(y)])) (Const64 <t> [nlz(y)]))
-// trailing zeros can be shifted right, then left
-(And64 <t> (Const64 [y]) x) && nlo(y) + ntz(y) == 64 && ntz(y) >= 32
- -> (Lsh64x64 (Rsh64Ux64 <t> x (Const64 <t> [ntz(y)])) (Const64 <t> [ntz(y)]))
-
// simplifications often used for lengths. e.g. len(s[i:i+5])==5
(Sub(64|32|16|8) (Add(64|32|16|8) x y) x) -> y
(Sub(64|32|16|8) (Add(64|32|16|8) x y) y) -> x
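The conditions on the deleted rules are written in terms of the compiler's bit-counting helpers. Their semantics, paraphrased via math/bits (a sketch for illustration, not the compiler's source):

package main

import (
	"fmt"
	"math/bits"
)

func nlz(y int64) int64 { return int64(bits.LeadingZeros64(uint64(y))) }  // leading zeros
func ntz(y int64) int64 { return int64(bits.TrailingZeros64(uint64(y))) } // trailing zeros
func nlo(y int64) int64 { return nlz(^y) }                                // leading ones
func nto(y int64) int64 { return ntz(^y) }                                // trailing ones

func main() {
	// y = 32 trailing ones: nlz(y)+nto(y) == 64 and nto(y) >= 32, so the first
	// deleted rule used to turn x & y into (x << 32) >> 32.
	y := int64(1)<<32 - 1
	fmt.Println(nlz(y)+nto(y) == 64, nto(y) >= 32) // true true
}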
diff --git a/src/cmd/compile/internal/ssa/rewritegeneric.go b/src/cmd/compile/internal/ssa/rewritegeneric.go
index 8aa07d20db..a2d091d3d6 100644
--- a/src/cmd/compile/internal/ssa/rewritegeneric.go
+++ b/src/cmd/compile/internal/ssa/rewritegeneric.go
@@ -5735,112 +5735,6 @@ func rewriteValuegeneric_OpAnd64_10(v *Value) bool {
v.AddArg(y)
return true
}
- // match: (And64 <t> (Const64 [y]) x)
- // cond: nlz(y) + nto(y) == 64 && nto(y) >= 32
- // result: (Rsh64Ux64 (Lsh64x64 <t> x (Const64 <t> [nlz(y)])) (Const64 <t> [nlz(y)]))
- for {
- t := v.Type
- x := v.Args[1]
- v_0 := v.Args[0]
- if v_0.Op != OpConst64 {
- break
- }
- y := v_0.AuxInt
- if !(nlz(y)+nto(y) == 64 && nto(y) >= 32) {
- break
- }
- v.reset(OpRsh64Ux64)
- v0 := b.NewValue0(v.Pos, OpLsh64x64, t)
- v0.AddArg(x)
- v1 := b.NewValue0(v.Pos, OpConst64, t)
- v1.AuxInt = nlz(y)
- v0.AddArg(v1)
- v.AddArg(v0)
- v2 := b.NewValue0(v.Pos, OpConst64, t)
- v2.AuxInt = nlz(y)
- v.AddArg(v2)
- return true
- }
- // match: (And64 <t> x (Const64 [y]))
- // cond: nlz(y) + nto(y) == 64 && nto(y) >= 32
- // result: (Rsh64Ux64 (Lsh64x64 <t> x (Const64 <t> [nlz(y)])) (Const64 <t> [nlz(y)]))
- for {
- t := v.Type
- _ = v.Args[1]
- x := v.Args[0]
- v_1 := v.Args[1]
- if v_1.Op != OpConst64 {
- break
- }
- y := v_1.AuxInt
- if !(nlz(y)+nto(y) == 64 && nto(y) >= 32) {
- break
- }
- v.reset(OpRsh64Ux64)
- v0 := b.NewValue0(v.Pos, OpLsh64x64, t)
- v0.AddArg(x)
- v1 := b.NewValue0(v.Pos, OpConst64, t)
- v1.AuxInt = nlz(y)
- v0.AddArg(v1)
- v.AddArg(v0)
- v2 := b.NewValue0(v.Pos, OpConst64, t)
- v2.AuxInt = nlz(y)
- v.AddArg(v2)
- return true
- }
- // match: (And64 <t> (Const64 [y]) x)
- // cond: nlo(y) + ntz(y) == 64 && ntz(y) >= 32
- // result: (Lsh64x64 (Rsh64Ux64 <t> x (Const64 <t> [ntz(y)])) (Const64 <t> [ntz(y)]))
- for {
- t := v.Type
- x := v.Args[1]
- v_0 := v.Args[0]
- if v_0.Op != OpConst64 {
- break
- }
- y := v_0.AuxInt
- if !(nlo(y)+ntz(y) == 64 && ntz(y) >= 32) {
- break
- }
- v.reset(OpLsh64x64)
- v0 := b.NewValue0(v.Pos, OpRsh64Ux64, t)
- v0.AddArg(x)
- v1 := b.NewValue0(v.Pos, OpConst64, t)
- v1.AuxInt = ntz(y)
- v0.AddArg(v1)
- v.AddArg(v0)
- v2 := b.NewValue0(v.Pos, OpConst64, t)
- v2.AuxInt = ntz(y)
- v.AddArg(v2)
- return true
- }
- // match: (And64 <t> x (Const64 [y]))
- // cond: nlo(y) + ntz(y) == 64 && ntz(y) >= 32
- // result: (Lsh64x64 (Rsh64Ux64 <t> x (Const64 <t> [ntz(y)])) (Const64 <t> [ntz(y)]))
- for {
- t := v.Type
- _ = v.Args[1]
- x := v.Args[0]
- v_1 := v.Args[1]
- if v_1.Op != OpConst64 {
- break
- }
- y := v_1.AuxInt
- if !(nlo(y)+ntz(y) == 64 && ntz(y) >= 32) {
- break
- }
- v.reset(OpLsh64x64)
- v0 := b.NewValue0(v.Pos, OpRsh64Ux64, t)
- v0.AddArg(x)
- v1 := b.NewValue0(v.Pos, OpConst64, t)
- v1.AuxInt = ntz(y)
- v0.AddArg(v1)
- v.AddArg(v0)
- v2 := b.NewValue0(v.Pos, OpConst64, t)
- v2.AuxInt = ntz(y)
- v.AddArg(v2)
- return true
- }
// match: (And64 (And64 i:(Const64 <t>) z) x)
// cond: (z.Op != OpConst64 && x.Op != OpConst64)
// result: (And64 i (And64 <t> z x))
@@ -5867,10 +5761,6 @@ func rewriteValuegeneric_OpAnd64_10(v *Value) bool {
v.AddArg(v0)
return true
}
- return false
-}
-func rewriteValuegeneric_OpAnd64_20(v *Value) bool {
- b := v.Block
// match: (And64 (And64 z i:(Const64 <t>)) x)
// cond: (z.Op != OpConst64 && x.Op != OpConst64)
// result: (And64 i (And64 <t> z x))
@@ -5984,6 +5874,10 @@ func rewriteValuegeneric_OpAnd64_20(v *Value) bool {
v.AddArg(x)
return true
}
+ return false
+}
+func rewriteValuegeneric_OpAnd64_20(v *Value) bool {
+ b := v.Block
// match: (And64 (Const64 <t> [c]) (And64 x (Const64 <t> [d])))
// cond:
// result: (And64 (Const64 <t> [c&d]) x)