mirror of https://github.com/golang/go.git
internal/bytealg: optimize the compare function using SIMD on loong64
goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000-HV @ 2500.00MHz
│ old │ new │
│ sec/op │ sec/op vs base │
BytesCompare/1 7.238n ± 25% 5.204n ± 0% -28.10% (p=0.001 n=10)
BytesCompare/2 7.242n ± 6% 5.204n ± 0% -28.14% (p=0.000 n=10)
BytesCompare/4 7.229n ± 5% 4.403n ± 0% -39.10% (p=0.000 n=10)
BytesCompare/8 7.077n ± 36% 4.403n ± 0% -37.78% (p=0.000 n=10)
BytesCompare/16 8.373n ± 6% 6.004n ± 0% -28.30% (p=0.000 n=10)
BytesCompare/32 8.040n ± 3% 4.803n ± 0% -40.26% (p=0.000 n=10)
BytesCompare/64 8.434n ± 24% 10.410n ± 0% +23.42% (p=0.014 n=10)
BytesCompare/128 11.530n ± 23% 5.604n ± 0% -51.40% (p=0.000 n=10)
BytesCompare/256 14.180n ± 0% 7.606n ± 0% -46.36% (p=0.000 n=10)
BytesCompare/512 26.83n ± 0% 10.81n ± 0% -59.71% (p=0.000 n=10)
BytesCompare/1024 52.60n ± 0% 17.21n ± 0% -67.28% (p=0.000 n=10)
BytesCompare/2048 103.70n ± 0% 30.02n ± 0% -71.05% (p=0.000 n=10)
geomean 13.49n 7.607n -43.63%
goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000-HV @ 2500.00MHz
│ old │ new │
│ sec/op │ sec/op vs base │
CompareBytesEqual 5.603n ± 0% 5.604n ± 0% ~ (p=0.191 n=10)
CompareBytesToNil 3.202n ± 0% 3.202n ± 0% ~ (p=1.000 n=10)
CompareBytesEmpty 2.802n ± 0% 2.802n ± 0% ~ (p=1.000 n=10)
CompareBytesIdentical 3.202n ± 0% 2.538n ± 1% -20.72% (p=0.000 n=10)
CompareBytesSameLength 8.805n ± 0% 4.803n ± 0% -45.45% (p=0.000 n=10)
CompareBytesDifferentLength 9.206n ± 0% 4.403n ± 0% -52.17% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=1 82.04µ ± 0% 45.91µ ± 0% -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=2 82.04µ ± 0% 45.91µ ± 0% -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=3 82.04µ ± 0% 45.91µ ± 0% -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=4 82.04µ ± 0% 45.91µ ± 0% -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=5 82.04µ ± 0% 45.91µ ± 0% -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=6 82.03µ ± 0% 45.93µ ± 0% -44.01% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=7 82.04µ ± 0% 45.93µ ± 0% -44.01% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=0 78.76µ ± 0% 45.69µ ± 0% -41.98% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=1 85.32µ ± 0% 46.04µ ± 0% -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=2 85.31µ ± 0% 46.04µ ± 0% -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=3 85.32µ ± 0% 46.04µ ± 0% -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=4 85.32µ ± 0% 46.04µ ± 0% -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=5 85.32µ ± 0% 46.04µ ± 0% -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=6 85.31µ ± 0% 46.06µ ± 0% -46.02% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=7 85.32µ ± 0% 52.32µ ± 7% -38.68% (p=0.000 n=10)
CompareBytesBig 78.76µ ± 0% 50.20µ ± 6% -36.26% (p=0.000 n=10)
CompareBytesBigIdentical 3.202n ± 0% 3.442n ± 24% ~ (p=0.462 n=10)
geomean 4.197µ 2.630µ -37.34%
Change-Id: I621145aef3e6a2c68e7127152f26ed047c6b2ece
Reviewed-on: https://go-review.googlesource.com/c/go/+/671315
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Cherry Mui <cherryyz@google.com>
This commit is contained in:
parent
9e026bf9cc
commit
a1c3e2f008
|
|
@ -23,139 +23,140 @@ TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
|
|||
// R7 = b_len
|
||||
JMP cmpbody<>(SB)
|
||||
|
||||
// On entry:
|
||||
// R5 length of a
|
||||
// R7 length of b
|
||||
// R4 points to the start of a
|
||||
// R6 points to the start of b
|
||||
// input:
|
||||
// R4: points to the start of a
|
||||
// R5: length of a
|
||||
// R6: points to the start of b
|
||||
// R7: length of b
|
||||
// for regabi, the return value (-1/0/1) is in R4
|
||||
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0
|
||||
BEQ R4, R6, cmp_len // same start of a and b, then compare lengths
|
||||
BEQ R4, R6, cmp_len // same start of a and b, then compare lengths
|
||||
|
||||
SGTU R5, R7, R9
|
||||
BNE R9, b_lt_a
|
||||
MOVV R5, R14
|
||||
JMP entry
|
||||
|
||||
b_lt_a:
|
||||
MOVV R7, R14 // R14 is min(R5, R7)
|
||||
MOVV R7, R14
|
||||
|
||||
entry:
|
||||
ADDV R4, R14, R12 // R4 start of a, R12 end of a
|
||||
BEQ R4, R12, cmp_len // minlength is 0
|
||||
BEQ R14, cmp_len // minlength is 0
|
||||
|
||||
MOVV $32, R15
|
||||
BGE R14, R15, lasx
|
||||
tail:
|
||||
MOVV $2, R15
|
||||
BLT R14, R15, cmp1 // min < 2
|
||||
SLLV $1, R15
|
||||
BLT R14, R15, cmp2 // min < 4
|
||||
SLLV $1, R15
|
||||
BLT R14, R15, cmp4 // min < 8
|
||||
SLLV $1, R15
|
||||
BLT R14, R15, cmp8 // min < 16
|
||||
SLLV $1, R15
|
||||
BLT R14, R15, cmp16 // min < 32
|
||||
|
||||
// When min >= 32 bytes, enter the cmp32_loop loop processing:
|
||||
// take out 4 8-bytes from a and b in turn for comparison.
|
||||
cmp32_loop:
|
||||
MOVV (R4), R8
|
||||
MOVV (R6), R9
|
||||
MOVV 8(R4), R10
|
||||
MOVV 8(R6), R11
|
||||
BNE R8, R9, cmp8a
|
||||
BNE R10, R11, cmp8b
|
||||
MOVV 16(R4), R8
|
||||
MOVV 16(R6), R9
|
||||
MOVV 24(R4), R10
|
||||
MOVV 24(R6), R11
|
||||
BNE R8, R9, cmp8a
|
||||
BNE R10, R11, cmp8b
|
||||
ADDV $32, R4
|
||||
ADDV $32, R6
|
||||
SUBV $32, R14
|
||||
BGE R14, R15, cmp32_loop
|
||||
BEQ R14, cmp_len
|
||||
|
||||
check16:
|
||||
MOVV $16, R15
|
||||
BLT R14, R15, check8
|
||||
cmp16:
|
||||
MOVV (R4), R8
|
||||
MOVV (R6), R9
|
||||
MOVV 8(R4), R10
|
||||
MOVV 8(R6), R11
|
||||
BNE R8, R9, cmp8a
|
||||
BNE R10, R11, cmp8b
|
||||
ADDV $16, R4
|
||||
ADDV $16, R6
|
||||
SUBV $16, R14
|
||||
BEQ R14, cmp_len
|
||||
|
||||
check8:
|
||||
MOVV $8, R15
|
||||
BLT R14, R15, check4
|
||||
BLT R14, R15, lt_8
|
||||
generic8_loop:
|
||||
MOVV (R4), R10
|
||||
MOVV (R6), R11
|
||||
BEQ R10, R11, generic8_equal
|
||||
|
||||
cmp8:
|
||||
MOVV (R4), R8
|
||||
MOVV (R6), R9
|
||||
BNE R8, R9, cmp8a
|
||||
AND $0xff, R10, R16
|
||||
AND $0xff, R11, R17
|
||||
BNE R16, R17, cmp_byte
|
||||
|
||||
BSTRPICKV $15, R10, $8, R16
|
||||
BSTRPICKV $15, R11, $8, R17
|
||||
BNE R16, R17, cmp_byte
|
||||
|
||||
BSTRPICKV $23, R10, $16, R16
|
||||
BSTRPICKV $23, R11, $16, R17
|
||||
BNE R16, R17, cmp_byte
|
||||
|
||||
BSTRPICKV $31, R10, $24, R16
|
||||
BSTRPICKV $31, R11, $24, R17
|
||||
BNE R16, R17, cmp_byte
|
||||
|
||||
BSTRPICKV $39, R10, $32, R16
|
||||
BSTRPICKV $39, R11, $32, R17
|
||||
BNE R16, R17, cmp_byte
|
||||
|
||||
BSTRPICKV $47, R10, $40, R16
|
||||
BSTRPICKV $47, R11, $40, R17
|
||||
BNE R16, R17, cmp_byte
|
||||
|
||||
BSTRPICKV $55, R10, $48, R16
|
||||
BSTRPICKV $55, R11, $48, R17
|
||||
BNE R16, R17, cmp_byte
|
||||
|
||||
BSTRPICKV $63, R10, $56, R16
|
||||
BSTRPICKV $63, R11, $56, R17
|
||||
BNE R16, R17, cmp_byte
|
||||
|
||||
generic8_equal:
|
||||
ADDV $-8, R14
|
||||
BEQ R14, cmp_len
|
||||
ADDV $8, R4
|
||||
ADDV $8, R6
|
||||
SUBV $8, R14
|
||||
BEQ R14, cmp_len
|
||||
BGE R14, R15, generic8_loop
|
||||
|
||||
check4:
|
||||
lt_8:
|
||||
MOVV $4, R15
|
||||
BLT R14, R15, check2
|
||||
cmp4:
|
||||
MOVW (R4), R8
|
||||
MOVW (R6), R9
|
||||
BNE R8, R9, cmp8a
|
||||
BLT R14, R15, lt_4
|
||||
|
||||
MOVWU (R4), R10
|
||||
MOVWU (R6), R11
|
||||
BEQ R10, R11, lt_8_equal
|
||||
|
||||
AND $0xff, R10, R16
|
||||
AND $0xff, R11, R17
|
||||
BNE R16, R17, cmp_byte
|
||||
|
||||
BSTRPICKV $15, R10, $8, R16
|
||||
BSTRPICKV $15, R11, $8, R17
|
||||
BNE R16, R17, cmp_byte
|
||||
|
||||
BSTRPICKV $23, R10, $16, R16
|
||||
BSTRPICKV $23, R11, $16, R17
|
||||
BNE R16, R17, cmp_byte
|
||||
|
||||
BSTRPICKV $31, R10, $24, R16
|
||||
BSTRPICKV $31, R11, $24, R17
|
||||
BNE R16, R17, cmp_byte
|
||||
|
||||
lt_8_equal:
|
||||
ADDV $-4, R14
|
||||
BEQ R14, cmp_len
|
||||
ADDV $4, R4
|
||||
ADDV $4, R6
|
||||
SUBV $4, R14
|
||||
BEQ R14, cmp_len
|
||||
|
||||
check2:
|
||||
lt_4:
|
||||
MOVV $2, R15
|
||||
BLT R14, R15, cmp1
|
||||
cmp2:
|
||||
MOVH (R4), R8
|
||||
MOVH (R6), R9
|
||||
BNE R8, R9, cmp8a
|
||||
BLT R14, R15, lt_2
|
||||
|
||||
MOVHU (R4), R10
|
||||
MOVHU (R6), R11
|
||||
BEQ R10, R11, lt_4_equal
|
||||
|
||||
AND $0xff, R10, R16
|
||||
AND $0xff, R11, R17
|
||||
BNE R16, R17, cmp_byte
|
||||
|
||||
BSTRPICKV $15, R10, $8, R16
|
||||
BSTRPICKV $15, R11, $8, R17
|
||||
BNE R16, R17, cmp_byte
|
||||
|
||||
lt_4_equal:
|
||||
ADDV $-2, R14
|
||||
BEQ R14, cmp_len
|
||||
ADDV $2, R4
|
||||
ADDV $2, R6
|
||||
SUBV $2, R14
|
||||
BEQ R14, cmp_len
|
||||
|
||||
cmp1:
|
||||
BEQ R14, cmp_len
|
||||
MOVBU (R4), R8
|
||||
MOVBU (R6), R9
|
||||
BNE R8, R9, byte_cmp
|
||||
lt_2:
|
||||
MOVBU (R4), R16
|
||||
MOVBU (R6), R17
|
||||
BNE R16, R17, cmp_byte
|
||||
JMP cmp_len
|
||||
|
||||
// Compare 8/4/2 bytes taken from R8/R9 that are known to differ.
|
||||
cmp8a:
|
||||
MOVV R8, R10
|
||||
MOVV R9, R11
|
||||
|
||||
// Compare 8/4/2 bytes taken from R10/R11 that are known to differ.
|
||||
cmp8b:
|
||||
MOVV $0xff, R15
|
||||
|
||||
// Take single bytes from R10/R11 in turn for cyclic comparison.
|
||||
cmp8_loop:
|
||||
AND R10, R15, R8
|
||||
AND R11, R15, R9
|
||||
BNE R8, R9, byte_cmp
|
||||
SLLV $8, R15
|
||||
JMP cmp8_loop
|
||||
|
||||
// Compare 1 bytes taken from R8/R9 that are known to differ.
|
||||
byte_cmp:
|
||||
SGTU R8, R9, R4 // R4 = 1 if (R8 > R9)
|
||||
// Compare 1 byte taken from R16/R17 that are known to differ.
|
||||
cmp_byte:
|
||||
SGTU R16, R17, R4 // R4 = 1 if (R16 > R17)
|
||||
BNE R0, R4, ret
|
||||
MOVV $-1, R4
|
||||
JMP ret
|
||||
RET
|
||||
|
||||
cmp_len:
|
||||
SGTU R5, R7, R8
|
||||
|
|
@ -164,3 +165,199 @@ cmp_len:
|
|||
|
||||
ret:
|
||||
RET
|
||||
|
||||
lasx:
|
||||
MOVV $64, R20
|
||||
MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R9
|
||||
BEQ R9, lsx
|
||||
|
||||
MOVV $128, R15
|
||||
BLT R14, R15, lasx32_loop
|
||||
lasx128_loop:
|
||||
XVMOVQ (R4), X0
|
||||
XVMOVQ (R6), X1
|
||||
XVSEQB X0, X1, X0
|
||||
XVSETANYEQB X0, FCC0
|
||||
BFPT lasx_found_0
|
||||
|
||||
XVMOVQ 32(R4), X0
|
||||
XVMOVQ 32(R6), X1
|
||||
XVSEQB X0, X1, X0
|
||||
XVSETANYEQB X0, FCC0
|
||||
BFPT lasx_found_32
|
||||
|
||||
XVMOVQ 64(R4), X0
|
||||
XVMOVQ 64(R6), X1
|
||||
XVSEQB X0, X1, X0
|
||||
XVSETANYEQB X0, FCC0
|
||||
BFPT lasx_found_64
|
||||
|
||||
XVMOVQ 96(R4), X0
|
||||
XVMOVQ 96(R6), X1
|
||||
XVSEQB X0, X1, X0
|
||||
XVSETANYEQB X0, FCC0
|
||||
BFPT lasx_found_96
|
||||
|
||||
ADDV $-128, R14
|
||||
BEQ R14, cmp_len
|
||||
ADDV $128, R4
|
||||
ADDV $128, R6
|
||||
BGE R14, R15, lasx128_loop
|
||||
|
||||
MOVV $32, R15
|
||||
BLT R14, R15, tail
|
||||
lasx32_loop:
|
||||
XVMOVQ (R4), X0
|
||||
XVMOVQ (R6), X1
|
||||
XVSEQB X0, X1, X0
|
||||
XVSETANYEQB X0, FCC0
|
||||
BFPT lasx_found_0
|
||||
|
||||
ADDV $-32, R14
|
||||
BEQ R14, cmp_len
|
||||
ADDV $32, R4
|
||||
ADDV $32, R6
|
||||
BGE R14, R15, lasx32_loop
|
||||
JMP tail
|
||||
|
||||
lasx_found_0:
|
||||
MOVV R0, R11
|
||||
JMP lasx_find_byte
|
||||
|
||||
lasx_found_32:
|
||||
MOVV $32, R11
|
||||
JMP lasx_find_byte
|
||||
|
||||
lasx_found_64:
|
||||
MOVV $64, R11
|
||||
JMP lasx_find_byte
|
||||
|
||||
lasx_found_96:
|
||||
MOVV $96, R11
|
||||
|
||||
lasx_find_byte:
|
||||
XVMOVQ X0.V[0], R10
|
||||
CTOV R10, R10
|
||||
BNE R10, R20, find_byte
|
||||
ADDV $8, R11
|
||||
|
||||
XVMOVQ X0.V[1], R10
|
||||
CTOV R10, R10
|
||||
BNE R10, R20, find_byte
|
||||
ADDV $8, R11
|
||||
|
||||
XVMOVQ X0.V[2], R10
|
||||
CTOV R10, R10
|
||||
BNE R10, R20, find_byte
|
||||
ADDV $8, R11
|
||||
|
||||
XVMOVQ X0.V[3], R10
|
||||
CTOV R10, R10
|
||||
JMP find_byte
|
||||
|
||||
lsx:
|
||||
MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R9
|
||||
BEQ R9, generic32_loop
|
||||
|
||||
MOVV $64, R15
|
||||
BLT R14, R15, lsx16_loop
|
||||
lsx64_loop:
|
||||
VMOVQ (R4), V0
|
||||
VMOVQ (R6), V1
|
||||
VSEQB V0, V1, V0
|
||||
VSETANYEQB V0, FCC0
|
||||
BFPT lsx_found_0
|
||||
|
||||
VMOVQ 16(R4), V0
|
||||
VMOVQ 16(R6), V1
|
||||
VSEQB V0, V1, V0
|
||||
VSETANYEQB V0, FCC0
|
||||
BFPT lsx_found_16
|
||||
|
||||
VMOVQ 32(R4), V0
|
||||
VMOVQ 32(R6), V1
|
||||
VSEQB V0, V1, V0
|
||||
VSETANYEQB V0, FCC0
|
||||
BFPT lsx_found_32
|
||||
|
||||
VMOVQ 48(R4), V0
|
||||
VMOVQ 48(R6), V1
|
||||
VSEQB V0, V1, V0
|
||||
VSETANYEQB V0, FCC0
|
||||
BFPT lsx_found_48
|
||||
|
||||
ADDV $-64, R14
|
||||
BEQ R14, cmp_len
|
||||
ADDV $64, R4
|
||||
ADDV $64, R6
|
||||
BGE R14, R15, lsx64_loop
|
||||
|
||||
MOVV $16, R15
|
||||
BLT R14, R15, tail
|
||||
lsx16_loop:
|
||||
VMOVQ (R4), V0
|
||||
VMOVQ (R6), V1
|
||||
VSEQB V0, V1, V0
|
||||
VSETANYEQB V0, FCC0
|
||||
BFPT lsx_found_0
|
||||
|
||||
ADDV $-16, R14
|
||||
BEQ R14, cmp_len
|
||||
ADDV $16, R4
|
||||
ADDV $16, R6
|
||||
BGE R14, R15, lsx16_loop
|
||||
JMP tail
|
||||
|
||||
lsx_found_0:
|
||||
MOVV R0, R11
|
||||
JMP lsx_find_byte
|
||||
|
||||
lsx_found_16:
|
||||
MOVV $16, R11
|
||||
JMP lsx_find_byte
|
||||
|
||||
lsx_found_32:
|
||||
MOVV $32, R11
|
||||
JMP lsx_find_byte
|
||||
|
||||
lsx_found_48:
|
||||
MOVV $48, R11
|
||||
|
||||
lsx_find_byte:
|
||||
VMOVQ V0.V[0], R10
|
||||
CTOV R10, R10
|
||||
BNE R10, R20, find_byte
|
||||
ADDV $8, R11
|
||||
|
||||
VMOVQ V0.V[1], R10
|
||||
CTOV R10, R10
|
||||
|
||||
find_byte:
|
||||
SRLV $3, R10
|
||||
ADDV R10, R11
|
||||
ADDV R11, R4
|
||||
ADDV R11, R6
|
||||
MOVB (R4), R16
|
||||
MOVB (R6), R17
|
||||
JMP cmp_byte
|
||||
|
||||
generic32_loop:
|
||||
MOVV (R4), R10
|
||||
MOVV (R6), R11
|
||||
BNE R10, R11, cmp8
|
||||
MOVV 8(R4), R10
|
||||
MOVV 8(R6), R11
|
||||
BNE R10, R11, cmp8
|
||||
MOVV 16(R4), R10
|
||||
MOVV 16(R6), R11
|
||||
BNE R10, R11, cmp8
|
||||
MOVV 24(R4), R10
|
||||
MOVV 24(R6), R11
|
||||
BNE R10, R11, cmp8
|
||||
ADDV $-32, R14
|
||||
BEQ R14, cmp_len
|
||||
ADDV $32, R4
|
||||
ADDV $32, R6
|
||||
MOVV $32, R15
|
||||
BGE R14, R15, generic32_loop
|
||||
JMP tail
|
||||
|
|
|
|||
Loading…
Reference in New Issue