crypto/subtle: implement xorBytes in hardware on loong64

goos: linux
goarch: loong64
pkg: crypto/subtle
cpu: Loongson-3A6000 @ 2500.00MHz
                    │  bench.old   │              bench.new              │
                    │    sec/op    │   sec/op     vs base                │
XORBytes/8Bytes       11.250n ± 0%   6.403n ± 0%  -43.08% (p=0.000 n=20)
XORBytes/128Bytes      24.61n ± 0%   12.21n ± 0%  -50.39% (p=0.000 n=20)
XORBytes/2048Bytes     216.7n ± 0%   108.3n ± 0%  -50.02% (p=0.000 n=20)
XORBytes/32768Bytes    3.657µ ± 0%   1.683µ ± 0%  -53.98% (p=0.000 n=20)
geomean                121.7n        61.44n       -49.52%

                    │  bench.old   │               bench.new                │
                    │     B/s      │      B/s       vs base                 │
XORBytes/8Bytes       678.1Mi ± 0%   1191.5Mi ± 0%   +75.72% (p=0.000 n=20)
XORBytes/128Bytes     4.844Gi ± 0%    9.766Gi ± 0%  +101.63% (p=0.000 n=20)
XORBytes/2048Bytes    8.801Gi ± 0%   17.619Gi ± 0%  +100.18% (p=0.000 n=20)
XORBytes/32768Bytes   8.346Gi ± 0%   18.137Gi ± 0%  +117.32% (p=0.000 n=20)
geomean               3.918Gi         7.763Gi        +98.14%

goos: linux
goarch: loong64
pkg: crypto/subtle
cpu: Loongson-3A5000 @ 2500.00MHz
                    │  bench.old   │              bench.new              │
                    │    sec/op    │   sec/op     vs base                │
XORBytes/8Bytes       16.420n ± 0%   8.806n ± 0%  -46.37% (p=0.000 n=20)
XORBytes/128Bytes      35.84n ± 0%   16.42n ± 0%  -54.19% (p=0.000 n=20)
XORBytes/2048Bytes     332.0n ± 0%   160.5n ± 0%  -51.64% (p=0.000 n=20)
XORBytes/32768Bytes    4.944µ ± 0%   2.474µ ± 0%  -49.96% (p=0.000 n=20)
geomean                176.3n        87.05n       -50.62%

                    │  bench.old   │               bench.new                │
                    │     B/s      │      B/s       vs base                 │
XORBytes/8Bytes       464.7Mi ± 0%    866.4Mi ± 0%   +86.45% (p=0.000 n=20)
XORBytes/128Bytes     3.326Gi ± 0%    7.261Gi ± 0%  +118.31% (p=0.000 n=20)
XORBytes/2048Bytes    5.745Gi ± 0%   11.880Gi ± 0%  +106.80% (p=0.000 n=20)
XORBytes/32768Bytes   6.172Gi ± 0%   12.334Gi ± 0%   +99.83% (p=0.000 n=20)
geomean               2.705Gi         5.477Gi       +102.52%

Change-Id: Id404f9023a57025f78b6922659cfa8870881d646
Reviewed-on: https://go-review.googlesource.com/c/go/+/590175
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Roland Shoemaker <roland@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Tim King <taking@google.com>
Reviewed-by: Tim King <taking@google.com>
This commit is contained in:
Xiaolin Zhao 2024-06-04 13:03:06 +08:00 committed by Gopher Robot
parent 1dfb33e861
commit 69827b5c8d
3 changed files with 177 additions and 1 deletions

View File

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build (!amd64 && !arm64 && !ppc64 && !ppc64le) || purego
//go:build (!amd64 && !arm64 && !loong64 && !ppc64 && !ppc64le) || purego
package subtle

View File

@ -0,0 +1,10 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !purego
package subtle
//go:noescape
func xorBytes(dst, a, b *byte, n int)

View File

@ -0,0 +1,166 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !purego
#include "textflag.h"
// func xorBytes(dst, a, b *byte, n int)
TEXT ·xorBytes(SB), NOSPLIT, $0
MOVV dst+0(FP), R4
MOVV a+8(FP), R5
MOVV b+16(FP), R6
MOVV n+24(FP), R7
MOVV $64, R9
BGEU R7, R9, loop64 // n >= 64
tail:
SRLV $1, R9
BGEU R7, R9, xor_32 // n >= 32 && n < 64
SRLV $1, R9
BGEU R7, R9, xor_16 // n >= 16 && n < 32
SRLV $1, R9
BGEU R7, R9, xor_8 // n >= 8 && n < 16
SRLV $1, R9
BGEU R7, R9, xor_4 // n >= 4 && n < 8
SRLV $1, R9
BGEU R7, R9, xor_2 // n >= 2 && n < 4
SRLV $1, R9
BGEU R7, R9, xor_1 // n = 1
loop64:
MOVV (R5), R10
MOVV 8(R5), R11
MOVV 16(R5), R12
MOVV 24(R5), R13
MOVV (R6), R14
MOVV 8(R6), R15
MOVV 16(R6), R16
MOVV 24(R6), R17
XOR R10, R14
XOR R11, R15
XOR R12, R16
XOR R13, R17
MOVV R14, (R4)
MOVV R15, 8(R4)
MOVV R16, 16(R4)
MOVV R17, 24(R4)
MOVV 32(R5), R10
MOVV 40(R5), R11
MOVV 48(R5), R12
MOVV 56(R5), R13
MOVV 32(R6), R14
MOVV 40(R6), R15
MOVV 48(R6), R16
MOVV 56(R6), R17
XOR R10, R14
XOR R11, R15
XOR R12, R16
XOR R13, R17
MOVV R14, 32(R4)
MOVV R15, 40(R4)
MOVV R16, 48(R4)
MOVV R17, 56(R4)
ADDV $64, R5
ADDV $64, R6
ADDV $64, R4
SUBV $64, R7
// 64 in R9
BGEU R7, R9, loop64
BEQ R7, R0, end
xor_32_check:
SRLV $1, R9
BLT R7, R9, xor_16_check
xor_32:
MOVV (R5), R10
MOVV 8(R5), R11
MOVV 16(R5), R12
MOVV 24(R5), R13
MOVV (R6), R14
MOVV 8(R6), R15
MOVV 16(R6), R16
MOVV 24(R6), R17
XOR R10, R14
XOR R11, R15
XOR R12, R16
XOR R13, R17
MOVV R14, (R4)
MOVV R15, 8(R4)
MOVV R16, 16(R4)
MOVV R17, 24(R4)
ADDV $32, R5
ADDV $32, R6
ADDV $32, R4
SUBV $32, R7
BEQ R7, R0, end
xor_16_check:
SRLV $1, R9
BLT R7, R9, xor_8_check
xor_16:
MOVV (R5), R10
MOVV 8(R5), R11
MOVV (R6), R12
MOVV 8(R6), R13
XOR R10, R12
XOR R11, R13
MOVV R12, (R4)
MOVV R13, 8(R4)
ADDV $16, R5
ADDV $16, R6
ADDV $16, R4
SUBV $16, R7
BEQ R7, R0, end
xor_8_check:
SRLV $1, R9
BLT R7, R9, xor_4_check
xor_8:
MOVV (R5), R10
MOVV (R6), R11
XOR R10, R11
MOVV R11, (R4)
ADDV $8, R5
ADDV $8, R6
ADDV $8, R4
SUBV $8, R7
BEQ R7, R0, end
xor_4_check:
SRLV $1, R9
BLT R7, R9, xor_2_check
xor_4:
MOVW (R5), R10
MOVW (R6), R11
XOR R10, R11
MOVW R11, (R4)
ADDV $4, R5
ADDV $4, R6
ADDV $4, R4
SUBV $4, R7
BEQ R7, R0, end
xor_2_check:
SRLV $1, R9
BLT R7, R9, xor_1
xor_2:
MOVH (R5), R10
MOVH (R6), R11
XOR R10, R11
MOVH R11, (R4)
ADDV $2, R5
ADDV $2, R6
ADDV $2, R4
SUBV $2, R7
BEQ R7, R0, end
xor_1:
MOVB (R5), R10
MOVB (R6), R11
XOR R10, R11
MOVB R11, (R4)
end:
RET