mirror of https://github.com/golang/go.git
math/big: implement mulAddVWW in riscv64 assembly
This provides an assembly implementation of mulAddVWW for riscv64,
processing up to four words per loop, resulting in a significant
performance gain.
On a StarFive VisionFive 2:
│ muladdvww.1 │ muladdvww.2 │
│ sec/op │ sec/op vs base │
MulAddVWW/1-4 68.18n ± 0% 65.49n ± 0% -3.95% (p=0.000 n=10)
MulAddVWW/2-4 82.81n ± 0% 78.85n ± 0% -4.78% (p=0.000 n=10)
MulAddVWW/3-4 97.49n ± 0% 72.18n ± 0% -25.96% (p=0.000 n=10)
MulAddVWW/4-4 112.20n ± 0% 85.54n ± 0% -23.76% (p=0.000 n=10)
MulAddVWW/5-4 126.90n ± 0% 98.90n ± 0% -22.06% (p=0.000 n=10)
MulAddVWW/10-4 200.3n ± 0% 144.3n ± 0% -27.96% (p=0.000 n=10)
MulAddVWW/100-4 1532.0n ± 0% 860.0n ± 0% -43.86% (p=0.000 n=10)
MulAddVWW/1000-4 14.757µ ± 0% 8.076µ ± 0% -45.27% (p=0.000 n=10)
MulAddVWW/10000-4 204.0µ ± 0% 137.1µ ± 0% -32.77% (p=0.000 n=10)
MulAddVWW/100000-4 2.066m ± 0% 1.382m ± 0% -33.12% (p=0.000 n=10)
geomean 1.311µ 950.0n -27.51%
│ muladdvww.1 │ muladdvww.2 │
│ B/s │ B/s vs base │
MulAddVWW/1-4 895.1Mi ± 0% 932.0Mi ± 0% +4.11% (p=0.000 n=10)
MulAddVWW/2-4 1.440Gi ± 0% 1.512Gi ± 0% +5.02% (p=0.000 n=10)
MulAddVWW/3-4 1.834Gi ± 0% 2.477Gi ± 0% +35.07% (p=0.000 n=10)
MulAddVWW/4-4 2.125Gi ± 0% 2.787Gi ± 0% +31.15% (p=0.000 n=10)
MulAddVWW/5-4 2.349Gi ± 0% 3.013Gi ± 0% +28.28% (p=0.000 n=10)
MulAddVWW/10-4 2.975Gi ± 0% 4.130Gi ± 0% +38.79% (p=0.000 n=10)
MulAddVWW/100-4 3.891Gi ± 0% 6.930Gi ± 0% +78.11% (p=0.000 n=10)
MulAddVWW/1000-4 4.039Gi ± 0% 7.380Gi ± 0% +82.72% (p=0.000 n=10)
MulAddVWW/10000-4 2.922Gi ± 0% 4.346Gi ± 0% +48.74% (p=0.000 n=10)
MulAddVWW/100000-4 2.884Gi ± 0% 4.313Gi ± 0% +49.52% (p=0.000 n=10)
geomean 2.321Gi 3.202Gi +37.95%
Change-Id: If08191607913ce5c7641f34bae8fa5c9dfb44777
Reviewed-on: https://go-review.googlesource.com/c/go/+/595399
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Mark Ryan <markdryan@rivosinc.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com>
This commit is contained in:
parent
6d55a017fa
commit
86cd5c4034
|
|
@ -300,7 +300,79 @@ TEXT ·shrVU(SB),NOSPLIT,$0
|
|||
JMP ·shrVU_g(SB)
|
||||
|
||||
TEXT ·mulAddVWW(SB),NOSPLIT,$0
|
||||
JMP ·mulAddVWW_g(SB)
|
||||
MOV x+24(FP), X5
|
||||
MOV y+48(FP), X6
|
||||
MOV z+0(FP), X7
|
||||
MOV z_len+8(FP), X30
|
||||
MOV r+56(FP), X29
|
||||
|
||||
MOV $4, X28
|
||||
|
||||
BEQ ZERO, X30, done
|
||||
BLTU X30, X28, loop1
|
||||
|
||||
loop4:
|
||||
MOV 0(X5), X8 // x[0]
|
||||
MOV 8(X5), X11 // x[1]
|
||||
MOV 16(X5), X14 // x[2]
|
||||
MOV 24(X5), X17 // x[3]
|
||||
|
||||
MULHU X8, X6, X9 // z_hi[0] = x[0] * y
|
||||
MUL X8, X6, X8 // z_lo[0] = x[0] * y
|
||||
ADD X8, X29, X10 // z[0] = z_lo[0] + c
|
||||
SLTU X8, X10, X23
|
||||
ADD X23, X9, X29 // next c
|
||||
|
||||
MULHU X11, X6, X12 // z_hi[1] = x[1] * y
|
||||
MUL X11, X6, X11 // z_lo[1] = x[1] * y
|
||||
ADD X11, X29, X13 // z[1] = z_lo[1] + c
|
||||
SLTU X11, X13, X23
|
||||
ADD X23, X12, X29 // next c
|
||||
|
||||
MULHU X14, X6, X15 // z_hi[2] = x[2] * y
|
||||
MUL X14, X6, X14 // z_lo[2] = x[2] * y
|
||||
ADD X14, X29, X16 // z[2] = z_lo[2] + c
|
||||
SLTU X14, X16, X23
|
||||
ADD X23, X15, X29 // next c
|
||||
|
||||
MULHU X17, X6, X18 // z_hi[3] = x[3] * y
|
||||
MUL X17, X6, X17 // z_lo[3] = x[3] * y
|
||||
ADD X17, X29, X19 // z[3] = z_lo[3] + c
|
||||
SLTU X17, X19, X23
|
||||
ADD X23, X18, X29 // next c
|
||||
|
||||
MOV X10, 0(X7) // z[0]
|
||||
MOV X13, 8(X7) // z[1]
|
||||
MOV X16, 16(X7) // z[2]
|
||||
MOV X19, 24(X7) // z[3]
|
||||
|
||||
ADD $32, X5
|
||||
ADD $32, X7
|
||||
SUB $4, X30
|
||||
|
||||
BGEU X30, X28, loop4
|
||||
BEQZ X30, done
|
||||
|
||||
loop1:
|
||||
MOV 0(X5), X10 // x
|
||||
|
||||
MULHU X10, X6, X12 // z_hi = x * y
|
||||
MUL X10, X6, X10 // z_lo = x * y
|
||||
ADD X10, X29, X13 // z_lo + c
|
||||
SLTU X10, X13, X15
|
||||
ADD X12, X15, X29 // next c
|
||||
|
||||
MOV X13, 0(X7) // z
|
||||
|
||||
ADD $8, X5
|
||||
ADD $8, X7
|
||||
SUB $1, X30
|
||||
|
||||
BNEZ X30, loop1
|
||||
|
||||
done:
|
||||
MOV X29, c+64(FP) // return c
|
||||
RET
|
||||
|
||||
TEXT ·addMulVVW(SB),NOSPLIT,$0
|
||||
JMP ·addMulVVW_g(SB)
|
||||
|
|
|
|||
Loading…
Reference in New Issue