mirror of https://github.com/golang/go.git
math/big: implement addMulVVW on arm64
The lack of a proper addMulVVW implementation for arm64 hurts RSA performance.
This assembly implementation is optimized for arm64-based servers.

name old time/op new time/op delta
pkg:math/big goos:linux goarch:arm64
AddMulVVW/1 55.2ns ± 0% 11.9ns ± 1% -78.37% (p=0.000 n=8+10)
AddMulVVW/2 67.0ns ± 0% 11.2ns ± 0% -83.28% (p=0.000 n=7+10)
AddMulVVW/3 93.2ns ± 0% 13.2ns ± 0% -85.84% (p=0.000 n=10+10)
AddMulVVW/4 126ns ± 0% 13ns ± 1% -89.82% (p=0.000 n=10+10)
AddMulVVW/5 151ns ± 0% 17ns ± 0% -88.87% (p=0.000 n=10+9)
AddMulVVW/10 323ns ± 0% 25ns ± 0% -92.20% (p=0.000 n=10+10)
AddMulVVW/100 3.28µs ± 0% 0.14µs ± 0% -95.82% (p=0.000 n=10+10)
AddMulVVW/1000 31.7µs ± 0% 1.3µs ± 0% -96.00% (p=0.000 n=10+8)
AddMulVVW/10000 313µs ± 0% 13µs ± 0% -95.98% (p=0.000 n=10+10)
AddMulVVW/100000 3.24ms ± 0% 0.13ms ± 1% -96.13% (p=0.000 n=9+9)
pkg:crypto/rsa goos:linux goarch:arm64
RSA2048Decrypt 44.7ms ± 0% 4.0ms ± 6% -91.08% (p=0.000 n=8+10)
RSA2048Sign 46.3ms ± 0% 5.0ms ± 0% -89.29% (p=0.000 n=9+10)
3PrimeRSA2048Decrypt 22.3ms ± 0% 2.4ms ± 0% -89.26% (p=0.000 n=10+10)

Change-Id: I295f0bd5c51a4442d02c44ece1f6026d30dff0bc
Reviewed-on: https://go-review.googlesource.com/76270
Reviewed-by: Vlad Krasnov <vlad@cloudflare.com>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Vlad Krasnov <vlad@cloudflare.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
This commit is contained in:
parent
b1335037fa
commit
fd3d27938a
|
|
@ -199,8 +199,89 @@ done:
|
|||
|
||||
// func addMulVVW(z, x []Word, y Word) (c Word)
//
// As written below, the routine walks len(z) words, replacing each z[i] with
// the low 64 bits of z[i] + x[i]*y + carry, and returns the final carry word
// in c.
//
// Register roles:
//   R0 = remaining word count (initialised from z_len)
//   R1 = pointer into z, R2 = pointer into x (both advanced by post-increment)
//   R3 = y
//   R4 = carry word passed between stages
//
// Only z_len is read; len(x) is never loaded, so x is presumably at least as
// long as z — verify against callers.
//
// The length is consumed as: one word if bit 0 is set, then two words if
// bit 1 is set, then the remainder four words at a time.
|
||||
TEXT ·addMulVVW(SB),NOSPLIT,$0
|
||||
// NOTE(review): B is an unconditional branch, so as listed here control
// would leave for the generic addMulVVW_g and nothing below would run.
// This looks like the line the commit deletes, shown without its diff
// marker — confirm against the real arith_arm64.s.
B ·addMulVVW_g(SB)
|
||||
// Load arguments: z base, len(z), x base and y.
MOVD z+0(FP), R1
|
||||
MOVD z_len+8(FP), R0
|
||||
MOVD x+24(FP), R2
|
||||
MOVD y+48(FP), R3
|
||||
// Carry starts at zero.
MOVD $0, R4
|
||||
|
||||
// Bit 0 of the length clear: no single leading word, skip to the pair stage.
TBZ $0, R0, two
|
||||
|
||||
// Single-word stage: R5 = *x++, R6 = *z.
MOVD.P 8(R2), R5
|
||||
MOVD (R1), R6
|
||||
|
||||
// R8:R7 = 128-bit product R5 * y (MUL gives the low half, UMULH the high).
MUL R5, R3, R7
|
||||
UMULH R5, R3, R8
|
||||
|
||||
// R6 += low half; the carry out of that add is folded into the high half to
// form the new carry word R4. R4 is still zero on entry here, so it is not
// added in.
ADDS R7, R6
|
||||
ADC $0, R8, R4
|
||||
|
||||
// *z++ = R6, one word consumed.
MOVD.P R6, 8(R1)
|
||||
SUB $1, R0
|
||||
|
||||
two:
|
||||
// Bit 1 of the remaining length clear: no pair to handle, go to the main loop.
TBZ $1, R0, loop
|
||||
|
||||
// Two-word stage: (R5, R10) = next two words of x, (R6, R11) = two words of z.
LDP.P 16(R2), (R5, R10)
|
||||
LDP (R1), (R6, R11)
|
||||
|
||||
// R12:R13 = R10 * y (second word).
MUL R10, R3, R13
|
||||
UMULH R10, R3, R12
|
||||
|
||||
// R8:R7 = R5 * y (first word).
MUL R5, R3, R7
|
||||
UMULH R5, R3, R8
|
||||
|
||||
// First carry chain: add the incoming carry to word 0 and the low half of the
// second product to word 1; the carry out lands in R12.
ADDS R4, R6
|
||||
ADCS R13, R11
|
||||
ADC $0, R12
|
||||
|
||||
// Second carry chain: add the first product (low half to word 0, high half
// to word 1); R12 plus the final carry flag becomes the new carry word R4.
ADDS R7, R6
|
||||
ADCS R8, R11
|
||||
ADC $0, R12, R4
|
||||
|
||||
// Store both result words, advance z, two words consumed.
STP.P (R6, R11), 16(R1)
|
||||
SUB $2, R0
|
||||
|
||||
// The main loop of this code operates on a block of 4 words every iteration
|
||||
// performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
|
||||
// where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
|
||||
// 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
//
// Two ADDS/ADCS chains are interleaved with the multiplies, loads and stores.
// This relies on MUL, UMULH, LDP and STP leaving the condition flags
// untouched, so the instruction order within each chain must be preserved.
|
||||
loop:
|
||||
// Exit once no words remain.
CBZ R0, done
|
||||
|
||||
// Load four words of x, advancing R2 by 32 bytes in total.
LDP.P 16(R2), (R5, R6)
|
||||
LDP.P 16(R2), (R7, R8)
|
||||
|
||||
// Chain 1: z0 += carry-in; z1 += lo(x1*y); z2 += lo(x2*y); z3 += lo(x3*y);
// R20 = hi(x3*y) + carry flag.
LDP (R1), (R9, R10)
|
||||
ADDS R4, R9
|
||||
MUL R6, R3, R14
|
||||
ADCS R14, R10
|
||||
MUL R7, R3, R15
|
||||
LDP 16(R1), (R11, R12)
|
||||
ADCS R15, R11
|
||||
MUL R8, R3, R16
|
||||
ADCS R16, R12
|
||||
UMULH R8, R3, R20
|
||||
ADC $0, R20
|
||||
|
||||
// Chain 2: z0 += lo(x0*y); z1 += hi(x0*y); z2 += hi(x1*y); z3 += hi(x2*y);
// carry-out R4 = R20 + carry flag. Each pair of result words is stored as
// soon as it is final.
MUL R5, R3, R13
|
||||
ADDS R13, R9
|
||||
UMULH R5, R3, R17
|
||||
ADCS R17, R10
|
||||
UMULH R6, R3, R21
|
||||
STP.P (R9, R10), 16(R1)
|
||||
ADCS R21, R11
|
||||
UMULH R7, R3, R19
|
||||
ADCS R19, R12
|
||||
STP.P (R11, R12), 16(R1)
|
||||
ADC $0, R20, R4
|
||||
|
||||
// Four words consumed; iterate.
SUB $4, R0
|
||||
B loop
|
||||
|
||||
done:
|
||||
// Return the final carry word in c.
MOVD R4, c+56(FP)
|
||||
RET
|
||||
|
||||
// func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
|
||||
TEXT ·divWVW(SB),NOSPLIT,$0
|
||||
|
|
|
|||
Loading…
Reference in New Issue