// Copyright 2022 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package main import ( . "github.com/mmcloughlin/avo/build" . "github.com/mmcloughlin/avo/operand" . "github.com/mmcloughlin/avo/reg" ) //go:generate go run . -out ../nat_amd64.s -stubs ../nat_amd64.go -pkg bigmod func main() { Package("crypto/internal/bigmod") ConstraintExpr("amd64,gc,!purego") Implement("montgomeryLoop") Pragma("noescape") size := Load(Param("d").Len(), GP64()) d := Mem{Base: Load(Param("d").Base(), GP64())} b := Mem{Base: Load(Param("b").Base(), GP64())} m := Mem{Base: Load(Param("m").Base(), GP64())} m0inv := Load(Param("m0inv"), GP64()) overflow := zero() i := zero() Label("outerLoop") ai := Load(Param("a").Base(), GP64()) MOVQ(Mem{Base: ai}.Idx(i, 8), ai) z := uint128{GP64(), GP64()} mul64(z, b, ai) add64(z, d) f := GP64() MOVQ(m0inv, f) IMULQ(z.lo, f) _MASK(f) addMul64(z, m, f) carry := shiftBy63(z) j := zero() INCQ(j) JMP(LabelRef("innerLoopCondition")) Label("innerLoop") // z = d[j] + a[i] * b[j] + f * m[j] + carry z = uint128{GP64(), GP64()} mul64(z, b.Idx(j, 8), ai) addMul64(z, m.Idx(j, 8), f) add64(z, d.Idx(j, 8)) add64(z, carry) // d[j-1] = z_lo & _MASK storeMasked(z.lo, d.Idx(j, 8).Offset(-8)) // carry = z_hi<<1 | z_lo>>_W MOVQ(shiftBy63(z), carry) INCQ(j) Label("innerLoopCondition") CMPQ(size, j) JGT(LabelRef("innerLoop")) ADDQ(carry, overflow) storeMasked(overflow, d.Idx(size, 8).Offset(-8)) SHRQ(Imm(63), overflow) INCQ(i) CMPQ(size, i) JGT(LabelRef("outerLoop")) Store(overflow, ReturnIndex(0)) RET() Generate() } // zero zeroes a new register and returns it. func zero() Register { r := GP64() XORQ(r, r) return r } // _MASK masks out the top bit of r. func _MASK(r Register) { BTRQ(Imm(63), r) } type uint128 struct { hi, lo GPVirtual } // storeMasked stores _MASK(src) in dst. It doesn't modify src. func storeMasked(src, dst Op) { out := GP64() MOVQ(src, out) _MASK(out) MOVQ(out, dst) } // shiftBy63 returns z >> 63. It reuses z.lo. func shiftBy63(z uint128) Register { SHRQ(Imm(63), z.hi, z.lo) result := z.lo z.hi, z.lo = nil, nil return result } // add64 sets r to r + a. func add64(r uint128, a Op) { ADDQ(a, r.lo) ADCQ(Imm(0), r.hi) } // mul64 sets r to a * b. func mul64(r uint128, a, b Op) { MOVQ(a, RAX) MULQ(b) // RDX, RAX = RAX * b MOVQ(RAX, r.lo) MOVQ(RDX, r.hi) } // addMul64 sets r to r + a * b. func addMul64(r uint128, a, b Op) { MOVQ(a, RAX) MULQ(b) // RDX, RAX = RAX * b ADDQ(RAX, r.lo) ADCQ(RDX, r.hi) }