internal/bytealg: optimize the function compare using SIMD on loong64

goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000-HV @ 2500.00MHz
                  │      old      │                 new                  │
                  │    sec/op     │    sec/op     vs base                │
BytesCompare/1       7.238n ± 25%    5.204n ± 0%  -28.10% (p=0.001 n=10)
BytesCompare/2       7.242n ±  6%    5.204n ± 0%  -28.14% (p=0.000 n=10)
BytesCompare/4       7.229n ±  5%    4.403n ± 0%  -39.10% (p=0.000 n=10)
BytesCompare/8       7.077n ± 36%    4.403n ± 0%  -37.78% (p=0.000 n=10)
BytesCompare/16      8.373n ±  6%    6.004n ± 0%  -28.30% (p=0.000 n=10)
BytesCompare/32      8.040n ±  3%    4.803n ± 0%  -40.26% (p=0.000 n=10)
BytesCompare/64      8.434n ± 24%   10.410n ± 0%  +23.42% (p=0.014 n=10)
BytesCompare/128    11.530n ± 23%    5.604n ± 0%  -51.40% (p=0.000 n=10)
BytesCompare/256    14.180n ±  0%    7.606n ± 0%  -46.36% (p=0.000 n=10)
BytesCompare/512     26.83n ±  0%    10.81n ± 0%  -59.71% (p=0.000 n=10)
BytesCompare/1024    52.60n ±  0%    17.21n ± 0%  -67.28% (p=0.000 n=10)
BytesCompare/2048   103.70n ±  0%    30.02n ± 0%  -71.05% (p=0.000 n=10)
geomean              13.49n          7.607n       -43.63%

goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000-HV @ 2500.00MHz
                                      │     old     │                 new                  │
                                      │   sec/op    │    sec/op     vs base                │
CompareBytesEqual                       5.603n ± 0%   5.604n ±  0%        ~ (p=0.191 n=10)
CompareBytesToNil                       3.202n ± 0%   3.202n ±  0%        ~ (p=1.000 n=10)
CompareBytesEmpty                       2.802n ± 0%   2.802n ±  0%        ~ (p=1.000 n=10)
CompareBytesIdentical                   3.202n ± 0%   2.538n ±  1%  -20.72% (p=0.000 n=10)
CompareBytesSameLength                  8.805n ± 0%   4.803n ±  0%  -45.45% (p=0.000 n=10)
CompareBytesDifferentLength             9.206n ± 0%   4.403n ±  0%  -52.17% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=1       82.04µ ± 0%   45.91µ ±  0%  -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=2       82.04µ ± 0%   45.91µ ±  0%  -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=3       82.04µ ± 0%   45.91µ ±  0%  -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=4       82.04µ ± 0%   45.91µ ±  0%  -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=5       82.04µ ± 0%   45.91µ ±  0%  -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=6       82.03µ ± 0%   45.93µ ±  0%  -44.01% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=7       82.04µ ± 0%   45.93µ ±  0%  -44.01% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=0   78.76µ ± 0%   45.69µ ±  0%  -41.98% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=1   85.32µ ± 0%   46.04µ ±  0%  -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=2   85.31µ ± 0%   46.04µ ±  0%  -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=3   85.32µ ± 0%   46.04µ ±  0%  -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=4   85.32µ ± 0%   46.04µ ±  0%  -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=5   85.32µ ± 0%   46.04µ ±  0%  -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=6   85.31µ ± 0%   46.06µ ±  0%  -46.02% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=7   85.32µ ± 0%   52.32µ ±  7%  -38.68% (p=0.000 n=10)
CompareBytesBig                         78.76µ ± 0%   50.20µ ±  6%  -36.26% (p=0.000 n=10)
CompareBytesBigIdentical                3.202n ± 0%   3.442n ± 24%        ~ (p=0.462 n=10)
geomean                                 4.197µ        2.630µ        -37.34%

Change-Id: I621145aef3e6a2c68e7127152f26ed047c6b2ece
Reviewed-on: https://go-review.googlesource.com/c/go/+/671315
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Cherry Mui <cherryyz@google.com>
This commit is contained in:
limeidan 2025-05-07 17:04:54 +08:00 committed by abner chenc
parent 9e026bf9cc
commit a1c3e2f008
1 changed files with 302 additions and 105 deletions

View File

@ -23,139 +23,140 @@ TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
// R7 = b_len
JMP cmpbody<>(SB)
// On entry:
// R5 length of a
// R7 length of b
// R4 points to the start of a
// R6 points to the start of b
// input:
// R4: points to the start of a
// R5: length of a
// R6: points to the start of b
// R7: length of b
// for regabi the return value (-1/0/1) in R4
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0
BEQ R4, R6, cmp_len // same start of a and b, then compare lengths
BEQ R4, R6, cmp_len // same start of a and b, then compare lengths
SGTU R5, R7, R9
BNE R9, b_lt_a
MOVV R5, R14
JMP entry
b_lt_a:
MOVV R7, R14 // R14 is min(R5, R7)
MOVV R7, R14
entry:
ADDV R4, R14, R12 // R4 start of a, R12 end of a
BEQ R4, R12, cmp_len // minlength is 0
BEQ R14, cmp_len // minlength is 0
MOVV $32, R15
BGE R14, R15, lasx
tail:
MOVV $2, R15
BLT R14, R15, cmp1 // min < 2
SLLV $1, R15
BLT R14, R15, cmp2 // min < 4
SLLV $1, R15
BLT R14, R15, cmp4 // min < 8
SLLV $1, R15
BLT R14, R15, cmp8 // min < 16
SLLV $1, R15
BLT R14, R15, cmp16 // min < 32
// When min >= 32 bytes, enter the cmp32_loop loop processing:
// take out 4 8-bytes from a and b in turn for comparison.
cmp32_loop:
MOVV (R4), R8
MOVV (R6), R9
MOVV 8(R4), R10
MOVV 8(R6), R11
BNE R8, R9, cmp8a
BNE R10, R11, cmp8b
MOVV 16(R4), R8
MOVV 16(R6), R9
MOVV 24(R4), R10
MOVV 24(R6), R11
BNE R8, R9, cmp8a
BNE R10, R11, cmp8b
ADDV $32, R4
ADDV $32, R6
SUBV $32, R14
BGE R14, R15, cmp32_loop
BEQ R14, cmp_len
check16:
MOVV $16, R15
BLT R14, R15, check8
cmp16:
MOVV (R4), R8
MOVV (R6), R9
MOVV 8(R4), R10
MOVV 8(R6), R11
BNE R8, R9, cmp8a
BNE R10, R11, cmp8b
ADDV $16, R4
ADDV $16, R6
SUBV $16, R14
BEQ R14, cmp_len
check8:
MOVV $8, R15
BLT R14, R15, check4
BLT R14, R15, lt_8
generic8_loop:
MOVV (R4), R10
MOVV (R6), R11
BEQ R10, R11, generic8_equal
cmp8:
MOVV (R4), R8
MOVV (R6), R9
BNE R8, R9, cmp8a
AND $0xff, R10, R16
AND $0xff, R11, R17
BNE R16, R17, cmp_byte
BSTRPICKV $15, R10, $8, R16
BSTRPICKV $15, R11, $8, R17
BNE R16, R17, cmp_byte
BSTRPICKV $23, R10, $16, R16
BSTRPICKV $23, R11, $16, R17
BNE R16, R17, cmp_byte
BSTRPICKV $31, R10, $24, R16
BSTRPICKV $31, R11, $24, R17
BNE R16, R17, cmp_byte
BSTRPICKV $39, R10, $32, R16
BSTRPICKV $39, R11, $32, R17
BNE R16, R17, cmp_byte
BSTRPICKV $47, R10, $40, R16
BSTRPICKV $47, R11, $40, R17
BNE R16, R17, cmp_byte
BSTRPICKV $55, R10, $48, R16
BSTRPICKV $55, R11, $48, R17
BNE R16, R17, cmp_byte
BSTRPICKV $63, R10, $56, R16
BSTRPICKV $63, R11, $56, R17
BNE R16, R17, cmp_byte
generic8_equal:
ADDV $-8, R14
BEQ R14, cmp_len
ADDV $8, R4
ADDV $8, R6
SUBV $8, R14
BEQ R14, cmp_len
BGE R14, R15, generic8_loop
check4:
lt_8:
MOVV $4, R15
BLT R14, R15, check2
cmp4:
MOVW (R4), R8
MOVW (R6), R9
BNE R8, R9, cmp8a
BLT R14, R15, lt_4
MOVWU (R4), R10
MOVWU (R6), R11
BEQ R10, R11, lt_8_equal
AND $0xff, R10, R16
AND $0xff, R11, R17
BNE R16, R17, cmp_byte
BSTRPICKV $15, R10, $8, R16
BSTRPICKV $15, R11, $8, R17
BNE R16, R17, cmp_byte
BSTRPICKV $23, R10, $16, R16
BSTRPICKV $23, R11, $16, R17
BNE R16, R17, cmp_byte
BSTRPICKV $31, R10, $24, R16
BSTRPICKV $31, R11, $24, R17
BNE R16, R17, cmp_byte
lt_8_equal:
ADDV $-4, R14
BEQ R14, cmp_len
ADDV $4, R4
ADDV $4, R6
SUBV $4, R14
BEQ R14, cmp_len
check2:
lt_4:
MOVV $2, R15
BLT R14, R15, cmp1
cmp2:
MOVH (R4), R8
MOVH (R6), R9
BNE R8, R9, cmp8a
BLT R14, R15, lt_2
MOVHU (R4), R10
MOVHU (R6), R11
BEQ R10, R11, lt_4_equal
AND $0xff, R10, R16
AND $0xff, R11, R17
BNE R16, R17, cmp_byte
BSTRPICKV $15, R10, $8, R16
BSTRPICKV $15, R11, $8, R17
BNE R16, R17, cmp_byte
lt_4_equal:
ADDV $-2, R14
BEQ R14, cmp_len
ADDV $2, R4
ADDV $2, R6
SUBV $2, R14
BEQ R14, cmp_len
cmp1:
BEQ R14, cmp_len
MOVBU (R4), R8
MOVBU (R6), R9
BNE R8, R9, byte_cmp
lt_2:
MOVBU (R4), R16
MOVBU (R6), R17
BNE R16, R17, cmp_byte
JMP cmp_len
// Compare 8/4/2 bytes taken from R8/R9 that are known to differ.
cmp8a:
MOVV R8, R10
MOVV R9, R11
// Compare 8/4/2 bytes taken from R10/R11 that are known to differ.
cmp8b:
MOVV $0xff, R15
// Take single bytes from R10/R11 in turn for cyclic comparison.
cmp8_loop:
AND R10, R15, R8
AND R11, R15, R9
BNE R8, R9, byte_cmp
SLLV $8, R15
JMP cmp8_loop
// Compare 1 bytes taken from R8/R9 that are known to differ.
byte_cmp:
SGTU R8, R9, R4 // R4 = 1 if (R8 > R9)
// Compare 1 byte taken from R16/R17 that are known to differ.
cmp_byte:
SGTU R16, R17, R4 // R4 = 1 if (R16 > R17)
BNE R0, R4, ret
MOVV $-1, R4
JMP ret
RET
cmp_len:
SGTU R5, R7, R8
@ -164,3 +165,199 @@ cmp_len:
ret:
RET
lasx:
MOVV $64, R20
MOVBU internalcpu·Loong64+const_offsetLOONG64HasLASX(SB), R9
BEQ R9, lsx
MOVV $128, R15
BLT R14, R15, lasx32_loop
lasx128_loop:
XVMOVQ (R4), X0
XVMOVQ (R6), X1
XVSEQB X0, X1, X0
XVSETANYEQB X0, FCC0
BFPT lasx_found_0
XVMOVQ 32(R4), X0
XVMOVQ 32(R6), X1
XVSEQB X0, X1, X0
XVSETANYEQB X0, FCC0
BFPT lasx_found_32
XVMOVQ 64(R4), X0
XVMOVQ 64(R6), X1
XVSEQB X0, X1, X0
XVSETANYEQB X0, FCC0
BFPT lasx_found_64
XVMOVQ 96(R4), X0
XVMOVQ 96(R6), X1
XVSEQB X0, X1, X0
XVSETANYEQB X0, FCC0
BFPT lasx_found_96
ADDV $-128, R14
BEQ R14, cmp_len
ADDV $128, R4
ADDV $128, R6
BGE R14, R15, lasx128_loop
MOVV $32, R15
BLT R14, R15, tail
lasx32_loop:
XVMOVQ (R4), X0
XVMOVQ (R6), X1
XVSEQB X0, X1, X0
XVSETANYEQB X0, FCC0
BFPT lasx_found_0
ADDV $-32, R14
BEQ R14, cmp_len
ADDV $32, R4
ADDV $32, R6
BGE R14, R15, lasx32_loop
JMP tail
lasx_found_0:
MOVV R0, R11
JMP lasx_find_byte
lasx_found_32:
MOVV $32, R11
JMP lasx_find_byte
lasx_found_64:
MOVV $64, R11
JMP lasx_find_byte
lasx_found_96:
MOVV $96, R11
lasx_find_byte:
XVMOVQ X0.V[0], R10
CTOV R10, R10
BNE R10, R20, find_byte
ADDV $8, R11
XVMOVQ X0.V[1], R10
CTOV R10, R10
BNE R10, R20, find_byte
ADDV $8, R11
XVMOVQ X0.V[2], R10
CTOV R10, R10
BNE R10, R20, find_byte
ADDV $8, R11
XVMOVQ X0.V[3], R10
CTOV R10, R10
JMP find_byte
lsx:
MOVBU internalcpu·Loong64+const_offsetLOONG64HasLSX(SB), R9
BEQ R9, generic32_loop
MOVV $64, R15
BLT R14, R15, lsx16_loop
lsx64_loop:
VMOVQ (R4), V0
VMOVQ (R6), V1
VSEQB V0, V1, V0
VSETANYEQB V0, FCC0
BFPT lsx_found_0
VMOVQ 16(R4), V0
VMOVQ 16(R6), V1
VSEQB V0, V1, V0
VSETANYEQB V0, FCC0
BFPT lsx_found_16
VMOVQ 32(R4), V0
VMOVQ 32(R6), V1
VSEQB V0, V1, V0
VSETANYEQB V0, FCC0
BFPT lsx_found_32
VMOVQ 48(R4), V0
VMOVQ 48(R6), V1
VSEQB V0, V1, V0
VSETANYEQB V0, FCC0
BFPT lsx_found_48
ADDV $-64, R14
BEQ R14, cmp_len
ADDV $64, R4
ADDV $64, R6
BGE R14, R15, lsx64_loop
MOVV $16, R15
BLT R14, R15, tail
lsx16_loop:
VMOVQ (R4), V0
VMOVQ (R6), V1
VSEQB V0, V1, V0
VSETANYEQB V0, FCC0
BFPT lsx_found_0
ADDV $-16, R14
BEQ R14, cmp_len
ADDV $16, R4
ADDV $16, R6
BGE R14, R15, lsx16_loop
JMP tail
lsx_found_0:
MOVV R0, R11
JMP lsx_find_byte
lsx_found_16:
MOVV $16, R11
JMP lsx_find_byte
lsx_found_32:
MOVV $32, R11
JMP lsx_find_byte
lsx_found_48:
MOVV $48, R11
lsx_find_byte:
VMOVQ V0.V[0], R10
CTOV R10, R10
BNE R10, R20, find_byte
ADDV $8, R11
VMOVQ V0.V[1], R10
CTOV R10, R10
find_byte:
SRLV $3, R10
ADDV R10, R11
ADDV R11, R4
ADDV R11, R6
MOVB (R4), R16
MOVB (R6), R17
JMP cmp_byte
generic32_loop:
MOVV (R4), R10
MOVV (R6), R11
BNE R10, R11, cmp8
MOVV 8(R4), R10
MOVV 8(R6), R11
BNE R10, R11, cmp8
MOVV 16(R4), R10
MOVV 16(R6), R11
BNE R10, R11, cmp8
MOVV 24(R4), R10
MOVV 24(R6), R11
BNE R10, R11, cmp8
ADDV $-32, R14
BEQ R14, cmp_len
ADDV $32, R4
ADDV $32, R6
MOVV $32, R15
BGE R14, R15, generic32_loop
JMP tail