diff options
| author | Lynn Boger <laboger@linux.vnet.ibm.com> | 2020-10-02 17:51:13 -0400 |
|---|---|---|
| committer | Lynn Boger <laboger@linux.vnet.ibm.com> | 2020-10-06 19:40:46 +0000 |
| commit | bdab5df40f474c7768a945ef4fcf5aab634f7af5 (patch) | |
| tree | 7a06719f0ae6723ac96ee476ab869513c627f206 /src/cmd/internal | |
| parent | 1fb149fd640f2e83f17206aa6eb530d664b0b5ed (diff) | |
| download | go-git-bdab5df40f474c7768a945ef4fcf5aab634f7af5.tar.gz | |
cmd/compile,cmd/internal/obj/ppc64: use mulli where possible
This adds support to allow the use of mulli when one of the multiply
operands is a constant that fits in 16 bits.
This especially helps when the multiply appears in a loop, because the
load of the constant is not moved out of the loop; mulli encodes the
constant as an immediate, so no load is needed.
Some improvements seen in compress/flate on power9 (columns: benchmark, old time/op, new time/op, delta):
Decode/Digits/Huffman/1e4 259µs ± 0% 261µs ± 0% +0.57% (p=1.000 n=1+1)
Decode/Digits/Huffman/1e5 2.43ms ± 0% 2.45ms ± 0% +0.79% (p=1.000 n=1+1)
Decode/Digits/Huffman/1e6 23.9ms ± 0% 24.2ms ± 0% +0.86% (p=1.000 n=1+1)
Decode/Digits/Speed/1e4 278µs ± 0% 279µs ± 0% +0.34% (p=1.000 n=1+1)
Decode/Digits/Speed/1e5 2.80ms ± 0% 2.81ms ± 0% +0.29% (p=1.000 n=1+1)
Decode/Digits/Speed/1e6 28.0ms ± 0% 28.1ms ± 0% +0.28% (p=1.000 n=1+1)
Decode/Digits/Default/1e4 278µs ± 0% 278µs ± 0% +0.28% (p=1.000 n=1+1)
Decode/Digits/Default/1e5 2.68ms ± 0% 2.69ms ± 0% +0.19% (p=1.000 n=1+1)
Decode/Digits/Default/1e6 26.6ms ± 0% 26.6ms ± 0% +0.21% (p=1.000 n=1+1)
Decode/Digits/Compression/1e4 278µs ± 0% 278µs ± 0% +0.00% (p=1.000 n=1+1)
Decode/Digits/Compression/1e5 2.68ms ± 0% 2.69ms ± 0% +0.21% (p=1.000 n=1+1)
Decode/Digits/Compression/1e6 26.6ms ± 0% 26.6ms ± 0% +0.07% (p=1.000 n=1+1)
Decode/Newton/Huffman/1e4 322µs ± 0% 312µs ± 0% -2.84% (p=1.000 n=1+1)
Decode/Newton/Huffman/1e5 3.11ms ± 0% 2.91ms ± 0% -6.41% (p=1.000 n=1+1)
Decode/Newton/Huffman/1e6 31.4ms ± 0% 29.3ms ± 0% -6.85% (p=1.000 n=1+1)
Decode/Newton/Speed/1e4 282µs ± 0% 269µs ± 0% -4.69% (p=1.000 n=1+1)
Decode/Newton/Speed/1e5 2.29ms ± 0% 2.20ms ± 0% -4.13% (p=1.000 n=1+1)
Decode/Newton/Speed/1e6 22.7ms ± 0% 21.3ms ± 0% -6.06% (p=1.000 n=1+1)
Decode/Newton/Default/1e4 254µs ± 0% 237µs ± 0% -6.60% (p=1.000 n=1+1)
Decode/Newton/Default/1e5 1.86ms ± 0% 1.75ms ± 0% -5.99% (p=1.000 n=1+1)
Decode/Newton/Default/1e6 18.1ms ± 0% 17.4ms ± 0% -4.10% (p=1.000 n=1+1)
Decode/Newton/Compression/1e4 254µs ± 0% 244µs ± 0% -3.91% (p=1.000 n=1+1)
Decode/Newton/Compression/1e5 1.85ms ± 0% 1.79ms ± 0% -3.10% (p=1.000 n=1+1)
Decode/Newton/Compression/1e6 18.0ms ± 0% 17.3ms ± 0% -3.88% (p=1.000 n=1+1)
Change-Id: I840320fab1c4bf64c76b001c2651ab79f23df4eb
Reviewed-on: https://go-review.googlesource.com/c/go/+/259444
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Paul Murphy <murp@ibm.com>
Reviewed-by: Carlos Eduardo Seo <carlos.seo@gmail.com>
Trust: Lynn Boger <laboger@linux.vnet.ibm.com>
Diffstat (limited to 'src/cmd/internal')
| -rw-r--r-- | src/cmd/internal/obj/ppc64/asm9.go | 9 |
1 files changed, 5 insertions, 4 deletions
diff --git a/src/cmd/internal/obj/ppc64/asm9.go b/src/cmd/internal/obj/ppc64/asm9.go
index 928e299f43..c2e8e9e9d0 100644
--- a/src/cmd/internal/obj/ppc64/asm9.go
+++ b/src/cmd/internal/obj/ppc64/asm9.go
@@ -1279,6 +1279,9 @@ func buildop(ctxt *obj.Link) {
 		case AREMD:
 			opset(AREMDU, r0)
 
+		case AMULLW:
+			opset(AMULLD, r0)
+
 		case ADIVW: /* op Rb[,Ra],Rd */
 			opset(AMULHW, r0)
 
@@ -1312,7 +1315,6 @@ func buildop(ctxt *obj.Link) {
 			opset(AMULHDCC, r0)
 			opset(AMULHDU, r0)
 			opset(AMULHDUCC, r0)
-			opset(AMULLD, r0)
 			opset(AMULLDCC, r0)
 			opset(AMULLDVCC, r0)
 			opset(AMULLDV, r0)
@@ -1996,7 +1998,6 @@ func buildop(ctxt *obj.Link) {
 			AMOVB,  /* macro: move byte with sign extension */
 			AMOVBU, /* macro: move byte with sign extension & update */
 			AMOVFL,
-			AMULLW,
 			/* op $s[,r2],r3; op r1[,r2],r3; no cc/v */
 			ASUBC, /* op r1,$s,r3; op r1[,r2],r3 */
 			ASTSW,
@@ -4990,8 +4991,8 @@ func (c *ctxt9) opirr(a obj.As) uint32 {
 	case ADARN:
 		return OPVCC(31, 755, 0, 0) /* darn - v3.00 */
 
-	case AMULLW:
-		return OPVCC(7, 0, 0, 0)
+	case AMULLW, AMULLD:
+		return OPVCC(7, 0, 0, 0) /* mulli works with MULLW or MULLD */
 
 	case AOR:
 		return OPVCC(24, 0, 0, 0)
