cmd/compile: intrinsify Sub64 on riscv64

After this CL, the performance difference in crypto/elliptic
benchmarks on linux/riscv64 are:

name                 old time/op    new time/op    delta
ScalarBaseMult/P256    1.64ms ± 1%    1.60ms ± 1%   -2.36%  (p=0.008 n=5+5)
ScalarBaseMult/P224    1.53ms ± 1%    1.47ms ± 2%   -4.24%  (p=0.008 n=5+5)
ScalarBaseMult/P384    5.12ms ± 2%    5.03ms ± 2%     ~     (p=0.095 n=5+5)
ScalarBaseMult/P521    22.3ms ± 2%    13.8ms ± 1%  -37.89%  (p=0.008 n=5+5)
ScalarMult/P256        4.49ms ± 2%    4.26ms ± 2%   -5.13%  (p=0.008 n=5+5)
ScalarMult/P224        4.33ms ± 1%    4.09ms ± 1%   -5.59%  (p=0.008 n=5+5)
ScalarMult/P384        16.3ms ± 1%    15.5ms ± 2%   -4.78%  (p=0.008 n=5+5)
ScalarMult/P521         101ms ± 0%      47ms ± 2%  -53.36%  (p=0.008 n=5+5)

Change-Id: I31cf0506e27f9d85f576af1813630a19c20dda8a
Reviewed-on: https://go-review.googlesource.com/c/go/+/420095
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Joel Sing <joel@sing.id.au>
Reviewed-by: David Chase <drchase@google.com>
Run-TryBot: Wayne Zuo <wdvxdr@golangcn.org>
TryBot-Result: Gopher Robot <gobot@golang.org>
This commit is contained in:
Wayne Zuo 2022-07-29 22:14:53 +08:00 committed by Joel Sing
parent 969f48a3a2
commit a6219737e3
4 changed files with 50 additions and 2 deletions

View File

@ -56,6 +56,10 @@
(Select1 (Add64carry x y c)) => (Select1 (Add64carry x y c)) =>
(OR (SLTU <typ.UInt64> s:(ADD <typ.UInt64> x y) x) (SLTU <typ.UInt64> (ADD <typ.UInt64> s c) s)) (OR (SLTU <typ.UInt64> s:(ADD <typ.UInt64> x y) x) (SLTU <typ.UInt64> (ADD <typ.UInt64> s c) s))
(Select0 (Sub64borrow x y c)) => (SUB (SUB <typ.UInt64> x y) c)
(Select1 (Sub64borrow x y c)) =>
(OR (SLTU <typ.UInt64> x s:(SUB <typ.UInt64> x y)) (SLTU <typ.UInt64> s (SUB <typ.UInt64> s c)))
// (x + y) / 2 => (x / 2) + (y / 2) + (x & y & 1) // (x + y) / 2 => (x / 2) + (y / 2) + (x & y & 1)
(Avg64u <t> x y) => (ADD (ADD <t> (SRLI <t> [1] x) (SRLI <t> [1] y)) (ANDI <t> [1] (AND <t> x y))) (Avg64u <t> x y) => (ADD (ADD <t> (SRLI <t> [1] x) (SRLI <t> [1] y)) (ANDI <t> [1] (AND <t> x y)))

View File

@ -6089,6 +6089,21 @@ func rewriteValueRISCV64_OpSelect0(v *Value) bool {
v.AddArg2(v0, c) v.AddArg2(v0, c)
return true return true
} }
// match: (Select0 (Sub64borrow x y c))
// result: (SUB (SUB <typ.UInt64> x y) c)
for {
if v_0.Op != OpSub64borrow {
break
}
c := v_0.Args[2]
x := v_0.Args[0]
y := v_0.Args[1]
v.reset(OpRISCV64SUB)
v0 := b.NewValue0(v.Pos, OpRISCV64SUB, typ.UInt64)
v0.AddArg2(x, y)
v.AddArg2(v0, c)
return true
}
// match: (Select0 m:(LoweredMuluhilo x y)) // match: (Select0 m:(LoweredMuluhilo x y))
// cond: m.Uses == 1 // cond: m.Uses == 1
// result: (MULHU x y) // result: (MULHU x y)
@ -6133,6 +6148,27 @@ func rewriteValueRISCV64_OpSelect1(v *Value) bool {
v.AddArg2(v0, v2) v.AddArg2(v0, v2)
return true return true
} }
// match: (Select1 (Sub64borrow x y c))
// result: (OR (SLTU <typ.UInt64> x s:(SUB <typ.UInt64> x y)) (SLTU <typ.UInt64> s (SUB <typ.UInt64> s c)))
for {
if v_0.Op != OpSub64borrow {
break
}
c := v_0.Args[2]
x := v_0.Args[0]
y := v_0.Args[1]
v.reset(OpRISCV64OR)
v0 := b.NewValue0(v.Pos, OpRISCV64SLTU, typ.UInt64)
s := b.NewValue0(v.Pos, OpRISCV64SUB, typ.UInt64)
s.AddArg2(x, y)
v0.AddArg2(x, s)
v2 := b.NewValue0(v.Pos, OpRISCV64SLTU, typ.UInt64)
v3 := b.NewValue0(v.Pos, OpRISCV64SUB, typ.UInt64)
v3.AddArg2(s, c)
v2.AddArg2(s, v3)
v.AddArg2(v0, v2)
return true
}
// match: (Select1 m:(LoweredMuluhilo x y)) // match: (Select1 m:(LoweredMuluhilo x y))
// cond: m.Uses == 1 // cond: m.Uses == 1
// result: (MUL x y) // result: (MUL x y)

View File

@ -4732,8 +4732,8 @@ func InitTables() {
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue3(ssa.OpSub64borrow, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2]) return s.newValue3(ssa.OpSub64borrow, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
}, },
sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X) sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64)
alias("math/bits", "Sub", "math/bits", "Sub64", sys.ArchAMD64, sys.ArchARM64, sys.ArchS390X) alias("math/bits", "Sub", "math/bits", "Sub64", sys.ArchAMD64, sys.ArchARM64, sys.ArchS390X, sys.ArchRISCV64)
addF("math/bits", "Div64", addF("math/bits", "Div64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// check for divide-by-zero/overflow and panic with appropriate message // check for divide-by-zero/overflow and panic with appropriate message

View File

@ -621,6 +621,7 @@ func Sub(x, y, ci uint) (r, co uint) {
// ppc64:"SUBC", "SUBE", "SUBZE", "NEG" // ppc64:"SUBC", "SUBE", "SUBZE", "NEG"
// ppc64le:"SUBC", "SUBE", "SUBZE", "NEG" // ppc64le:"SUBC", "SUBE", "SUBZE", "NEG"
// s390x:"SUBE" // s390x:"SUBE"
// riscv64: "SUB","SLTU"
return bits.Sub(x, y, ci) return bits.Sub(x, y, ci)
} }
@ -630,6 +631,7 @@ func SubC(x, ci uint) (r, co uint) {
// ppc64:"SUBC", "SUBE", "SUBZE", "NEG" // ppc64:"SUBC", "SUBE", "SUBZE", "NEG"
// ppc64le:"SUBC", "SUBE", "SUBZE", "NEG" // ppc64le:"SUBC", "SUBE", "SUBZE", "NEG"
// s390x:"SUBE" // s390x:"SUBE"
// riscv64: "SUB","SLTU"
return bits.Sub(x, 7, ci) return bits.Sub(x, 7, ci)
} }
@ -639,6 +641,7 @@ func SubZ(x, y uint) (r, co uint) {
// ppc64:"SUBC", -"SUBE", "SUBZE", "NEG" // ppc64:"SUBC", -"SUBE", "SUBZE", "NEG"
// ppc64le:"SUBC", -"SUBE", "SUBZE", "NEG" // ppc64le:"SUBC", -"SUBE", "SUBZE", "NEG"
// s390x:"SUBC" // s390x:"SUBC"
// riscv64: "SUB","SLTU"
return bits.Sub(x, y, 0) return bits.Sub(x, y, 0)
} }
@ -648,6 +651,7 @@ func SubR(x, y, ci uint) uint {
// ppc64:"SUBC", "SUBE", -"SUBZE", -"NEG" // ppc64:"SUBC", "SUBE", -"SUBZE", -"NEG"
// ppc64le:"SUBC", "SUBE", -"SUBZE", -"NEG" // ppc64le:"SUBC", "SUBE", -"SUBZE", -"NEG"
// s390x:"SUBE" // s390x:"SUBE"
// riscv64: "SUB",-"SLTU"
r, _ := bits.Sub(x, y, ci) r, _ := bits.Sub(x, y, ci)
return r return r
} }
@ -669,6 +673,7 @@ func Sub64(x, y, ci uint64) (r, co uint64) {
// ppc64:"SUBC", "SUBE", "SUBZE", "NEG" // ppc64:"SUBC", "SUBE", "SUBZE", "NEG"
// ppc64le:"SUBC", "SUBE", "SUBZE", "NEG" // ppc64le:"SUBC", "SUBE", "SUBZE", "NEG"
// s390x:"SUBE" // s390x:"SUBE"
// riscv64: "SUB","SLTU"
return bits.Sub64(x, y, ci) return bits.Sub64(x, y, ci)
} }
@ -678,6 +683,7 @@ func Sub64C(x, ci uint64) (r, co uint64) {
// ppc64:"SUBC", "SUBE", "SUBZE", "NEG" // ppc64:"SUBC", "SUBE", "SUBZE", "NEG"
// ppc64le:"SUBC", "SUBE", "SUBZE", "NEG" // ppc64le:"SUBC", "SUBE", "SUBZE", "NEG"
// s390x:"SUBE" // s390x:"SUBE"
// riscv64: "SUB","SLTU"
return bits.Sub64(x, 7, ci) return bits.Sub64(x, 7, ci)
} }
@ -687,6 +693,7 @@ func Sub64Z(x, y uint64) (r, co uint64) {
// ppc64:"SUBC", -"SUBE", "SUBZE", "NEG" // ppc64:"SUBC", -"SUBE", "SUBZE", "NEG"
// ppc64le:"SUBC", -"SUBE", "SUBZE", "NEG" // ppc64le:"SUBC", -"SUBE", "SUBZE", "NEG"
// s390x:"SUBC" // s390x:"SUBC"
// riscv64: "SUB","SLTU"
return bits.Sub64(x, y, 0) return bits.Sub64(x, y, 0)
} }
@ -696,6 +703,7 @@ func Sub64R(x, y, ci uint64) uint64 {
// ppc64:"SUBC", "SUBE", -"SUBZE", -"NEG" // ppc64:"SUBC", "SUBE", -"SUBZE", -"NEG"
// ppc64le:"SUBC", "SUBE", -"SUBZE", -"NEG" // ppc64le:"SUBC", "SUBE", -"SUBZE", -"NEG"
// s390x:"SUBE" // s390x:"SUBE"
// riscv64: "SUB",-"SLTU"
r, _ := bits.Sub64(x, y, ci) r, _ := bits.Sub64(x, y, ci)
return r return r
} }