mirror of https://github.com/golang/go.git
crypto/internal/fips140/edwards25519/field: optimize AMD64
Replace constant multiplication with shift and adds,
this reduces pressure on multiplications, making things overall
faster.
goos: windows
goarch: amd64
pkg: crypto/internal/fips140/edwards25519/field
cpu: AMD Ryzen Threadripper 2950X 16-Core Processor
│ v0.log~ │ v1.log~ │
│ sec/op │ sec/op vs base │
Add-32 4.768n ± 1% 4.763n ± 0% ~ (p=0.683 n=20)
Multiply-32 20.93n ± 0% 19.48n ± 0% -6.88% (p=0.000 n=20)
Square-32 15.88n ± 0% 15.00n ± 0% -5.51% (p=0.000 n=20)
Invert-32 4.291µ ± 0% 4.072µ ± 0% -5.10% (p=0.000 n=20)
Mult32-32 5.184n ± 0% 5.169n ± 0% -0.30% (p=0.032 n=20)
Bytes-32 11.36n ± 0% 11.34n ± 0% ~ (p=0.106 n=20)
geomean 27.15n 26.32n -3.06%
Change-Id: I9c2f588fad29d89c3e6c712c092b32b66479f596
Reviewed-on: https://go-review.googlesource.com/c/go/+/652716
Reviewed-by: Filippo Valsorda <filippo@golang.org>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Commit-Queue: Junyang Shao <shaojunyang@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
Auto-Submit: Filippo Valsorda <filippo@golang.org>
This commit is contained in:
parent
16959799bd
commit
983e30bd3b
|
|
@ -256,6 +256,23 @@ func addMul64(r uint128, i uint64, aX, bX namedComponent) {
|
|||
case 1:
|
||||
Comment(fmt.Sprintf("%s += %s×%s", r, aX, bX))
|
||||
Load(aX, RAX)
|
||||
case 2:
|
||||
Comment(fmt.Sprintf("%s += %d×%s×%s", r, i, aX, bX))
|
||||
Load(aX, RAX)
|
||||
SHLQ(U8(1), RAX)
|
||||
case 19:
|
||||
Comment(fmt.Sprintf("%s += %d×%s×%s", r, i, aX, bX))
|
||||
// 19 * v ==> v + (v+v*8)*2
|
||||
tmp := Load(aX, GP64())
|
||||
LEAQ(Mem{Base: tmp, Index: tmp, Scale: 8}, RAX)
|
||||
LEAQ(Mem{Base: tmp, Index: RAX, Scale: 2}, RAX)
|
||||
case 38:
|
||||
Comment(fmt.Sprintf("%s += %d×%s×%s", r, i, aX, bX))
|
||||
// 38 * v ==> (v + (v+v*8)*2) * 2
|
||||
tmp := Load(aX, GP64())
|
||||
LEAQ(Mem{Base: tmp, Index: tmp, Scale: 8}, RAX)
|
||||
LEAQ(Mem{Base: tmp, Index: RAX, Scale: 2}, RAX)
|
||||
SHLQ(U8(1), RAX)
|
||||
default:
|
||||
Comment(fmt.Sprintf("%s += %d×%s×%s", r, i, aX, bX))
|
||||
IMUL3Q(Imm(i), Load(aX, GP64()), RAX)
|
||||
|
|
|
|||
|
|
@ -16,32 +16,36 @@ TEXT ·feMul(SB), NOSPLIT, $0-24
|
|||
MOVQ DX, SI
|
||||
|
||||
// r0 += 19×a1×b4
|
||||
MOVQ 8(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 32(BX)
|
||||
ADDQ AX, DI
|
||||
ADCQ DX, SI
|
||||
MOVQ 8(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
MULQ 32(BX)
|
||||
ADDQ AX, DI
|
||||
ADCQ DX, SI
|
||||
|
||||
// r0 += 19×a2×b3
|
||||
MOVQ 16(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 24(BX)
|
||||
ADDQ AX, DI
|
||||
ADCQ DX, SI
|
||||
MOVQ 16(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
MULQ 24(BX)
|
||||
ADDQ AX, DI
|
||||
ADCQ DX, SI
|
||||
|
||||
// r0 += 19×a3×b2
|
||||
MOVQ 24(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 16(BX)
|
||||
ADDQ AX, DI
|
||||
ADCQ DX, SI
|
||||
MOVQ 24(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
MULQ 16(BX)
|
||||
ADDQ AX, DI
|
||||
ADCQ DX, SI
|
||||
|
||||
// r0 += 19×a4×b1
|
||||
MOVQ 32(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 8(BX)
|
||||
ADDQ AX, DI
|
||||
ADCQ DX, SI
|
||||
MOVQ 32(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
MULQ 8(BX)
|
||||
ADDQ AX, DI
|
||||
ADCQ DX, SI
|
||||
|
||||
// r1 = a0×b1
|
||||
MOVQ (CX), AX
|
||||
|
|
@ -56,25 +60,28 @@ TEXT ·feMul(SB), NOSPLIT, $0-24
|
|||
ADCQ DX, R8
|
||||
|
||||
// r1 += 19×a2×b4
|
||||
MOVQ 16(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 32(BX)
|
||||
ADDQ AX, R9
|
||||
ADCQ DX, R8
|
||||
MOVQ 16(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
MULQ 32(BX)
|
||||
ADDQ AX, R9
|
||||
ADCQ DX, R8
|
||||
|
||||
// r1 += 19×a3×b3
|
||||
MOVQ 24(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 24(BX)
|
||||
ADDQ AX, R9
|
||||
ADCQ DX, R8
|
||||
MOVQ 24(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
MULQ 24(BX)
|
||||
ADDQ AX, R9
|
||||
ADCQ DX, R8
|
||||
|
||||
// r1 += 19×a4×b2
|
||||
MOVQ 32(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 16(BX)
|
||||
ADDQ AX, R9
|
||||
ADCQ DX, R8
|
||||
MOVQ 32(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
MULQ 16(BX)
|
||||
ADDQ AX, R9
|
||||
ADCQ DX, R8
|
||||
|
||||
// r2 = a0×b2
|
||||
MOVQ (CX), AX
|
||||
|
|
@ -95,18 +102,20 @@ TEXT ·feMul(SB), NOSPLIT, $0-24
|
|||
ADCQ DX, R10
|
||||
|
||||
// r2 += 19×a3×b4
|
||||
MOVQ 24(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 32(BX)
|
||||
ADDQ AX, R11
|
||||
ADCQ DX, R10
|
||||
MOVQ 24(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
MULQ 32(BX)
|
||||
ADDQ AX, R11
|
||||
ADCQ DX, R10
|
||||
|
||||
// r2 += 19×a4×b3
|
||||
MOVQ 32(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 24(BX)
|
||||
ADDQ AX, R11
|
||||
ADCQ DX, R10
|
||||
MOVQ 32(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
MULQ 24(BX)
|
||||
ADDQ AX, R11
|
||||
ADCQ DX, R10
|
||||
|
||||
// r3 = a0×b3
|
||||
MOVQ (CX), AX
|
||||
|
|
@ -133,11 +142,12 @@ TEXT ·feMul(SB), NOSPLIT, $0-24
|
|||
ADCQ DX, R12
|
||||
|
||||
// r3 += 19×a4×b4
|
||||
MOVQ 32(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 32(BX)
|
||||
ADDQ AX, R13
|
||||
ADCQ DX, R12
|
||||
MOVQ 32(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
MULQ 32(BX)
|
||||
ADDQ AX, R13
|
||||
ADCQ DX, R12
|
||||
|
||||
// r4 = a0×b4
|
||||
MOVQ (CX), AX
|
||||
|
|
@ -231,18 +241,22 @@ TEXT ·feSquare(SB), NOSPLIT, $0-16
|
|||
MOVQ DX, BX
|
||||
|
||||
// r0 += 38×l1×l4
|
||||
MOVQ 8(CX), AX
|
||||
IMUL3Q $0x26, AX, AX
|
||||
MULQ 32(CX)
|
||||
ADDQ AX, SI
|
||||
ADCQ DX, BX
|
||||
MOVQ 8(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
SHLQ $0x01, AX
|
||||
MULQ 32(CX)
|
||||
ADDQ AX, SI
|
||||
ADCQ DX, BX
|
||||
|
||||
// r0 += 38×l2×l3
|
||||
MOVQ 16(CX), AX
|
||||
IMUL3Q $0x26, AX, AX
|
||||
MULQ 24(CX)
|
||||
ADDQ AX, SI
|
||||
ADCQ DX, BX
|
||||
MOVQ 16(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
SHLQ $0x01, AX
|
||||
MULQ 24(CX)
|
||||
ADDQ AX, SI
|
||||
ADCQ DX, BX
|
||||
|
||||
// r1 = 2×l0×l1
|
||||
MOVQ (CX), AX
|
||||
|
|
@ -252,18 +266,21 @@ TEXT ·feSquare(SB), NOSPLIT, $0-16
|
|||
MOVQ DX, DI
|
||||
|
||||
// r1 += 38×l2×l4
|
||||
MOVQ 16(CX), AX
|
||||
IMUL3Q $0x26, AX, AX
|
||||
MULQ 32(CX)
|
||||
ADDQ AX, R8
|
||||
ADCQ DX, DI
|
||||
MOVQ 16(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
SHLQ $0x01, AX
|
||||
MULQ 32(CX)
|
||||
ADDQ AX, R8
|
||||
ADCQ DX, DI
|
||||
|
||||
// r1 += 19×l3×l3
|
||||
MOVQ 24(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 24(CX)
|
||||
ADDQ AX, R8
|
||||
ADCQ DX, DI
|
||||
MOVQ 24(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
MULQ 24(CX)
|
||||
ADDQ AX, R8
|
||||
ADCQ DX, DI
|
||||
|
||||
// r2 = 2×l0×l2
|
||||
MOVQ (CX), AX
|
||||
|
|
@ -279,11 +296,13 @@ TEXT ·feSquare(SB), NOSPLIT, $0-16
|
|||
ADCQ DX, R9
|
||||
|
||||
// r2 += 38×l3×l4
|
||||
MOVQ 24(CX), AX
|
||||
IMUL3Q $0x26, AX, AX
|
||||
MULQ 32(CX)
|
||||
ADDQ AX, R10
|
||||
ADCQ DX, R9
|
||||
MOVQ 24(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
SHLQ $0x01, AX
|
||||
MULQ 32(CX)
|
||||
ADDQ AX, R10
|
||||
ADCQ DX, R9
|
||||
|
||||
// r3 = 2×l0×l3
|
||||
MOVQ (CX), AX
|
||||
|
|
@ -293,18 +312,19 @@ TEXT ·feSquare(SB), NOSPLIT, $0-16
|
|||
MOVQ DX, R11
|
||||
|
||||
// r3 += 2×l1×l2
|
||||
MOVQ 8(CX), AX
|
||||
IMUL3Q $0x02, AX, AX
|
||||
MULQ 16(CX)
|
||||
ADDQ AX, R12
|
||||
ADCQ DX, R11
|
||||
MOVQ 8(CX), AX
|
||||
SHLQ $0x01, AX
|
||||
MULQ 16(CX)
|
||||
ADDQ AX, R12
|
||||
ADCQ DX, R11
|
||||
|
||||
// r3 += 19×l4×l4
|
||||
MOVQ 32(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 32(CX)
|
||||
ADDQ AX, R12
|
||||
ADCQ DX, R11
|
||||
MOVQ 32(CX), DX
|
||||
LEAQ (DX)(DX*8), AX
|
||||
LEAQ (DX)(AX*2), AX
|
||||
MULQ 32(CX)
|
||||
ADDQ AX, R12
|
||||
ADCQ DX, R11
|
||||
|
||||
// r4 = 2×l0×l4
|
||||
MOVQ (CX), AX
|
||||
|
|
@ -314,11 +334,11 @@ TEXT ·feSquare(SB), NOSPLIT, $0-16
|
|||
MOVQ DX, R13
|
||||
|
||||
// r4 += 2×l1×l3
|
||||
MOVQ 8(CX), AX
|
||||
IMUL3Q $0x02, AX, AX
|
||||
MULQ 24(CX)
|
||||
ADDQ AX, R14
|
||||
ADCQ DX, R13
|
||||
MOVQ 8(CX), AX
|
||||
SHLQ $0x01, AX
|
||||
MULQ 24(CX)
|
||||
ADDQ AX, R14
|
||||
ADCQ DX, R13
|
||||
|
||||
// r4 += l2×l2
|
||||
MOVQ 16(CX), AX
|
||||
|
|
|
|||
Loading…
Reference in New Issue