mirror of https://github.com/golang/go.git
bytes: add optimized Compare for arm64
Use LDP instructions to load 16 bytes per loop when the source length is long. Specially process the 8 bytes length, 4 bytes length and 2 bytes length to get a better performance. Benchmark result: name old time/op new time/op delta BytesCompare/1-8 21.0ns ± 0% 10.5ns ± 0% ~ (p=0.079 n=4+5) BytesCompare/2-8 11.5ns ± 0% 10.5ns ± 0% -8.70% (p=0.008 n=5+5) BytesCompare/4-8 13.5ns ± 0% 10.0ns ± 0% -25.93% (p=0.008 n=5+5) BytesCompare/8-8 28.8ns ± 0% 9.5ns ± 0% ~ (p=0.079 n=4+5) BytesCompare/16-8 40.5ns ± 0% 10.5ns ± 0% -74.07% (p=0.008 n=5+5) BytesCompare/32-8 64.6ns ± 0% 12.5ns ± 0% -80.65% (p=0.008 n=5+5) BytesCompare/64-8 112ns ± 0% 16ns ± 0% -85.27% (p=0.008 n=5+5) BytesCompare/128-8 208ns ± 0% 24ns ± 0% -88.22% (p=0.008 n=5+5) BytesCompare/256-8 400ns ± 0% 50ns ± 0% -87.62% (p=0.008 n=5+5) BytesCompare/512-8 785ns ± 0% 82ns ± 0% -89.61% (p=0.008 n=5+5) BytesCompare/1024-8 1.55µs ± 0% 0.14µs ± 0% ~ (p=0.079 n=4+5) BytesCompare/2048-8 3.09µs ± 0% 0.27µs ± 0% ~ (p=0.079 n=4+5) CompareBytesEqual-8 39.0ns ± 0% 12.0ns ± 0% -69.23% (p=0.008 n=5+5) CompareBytesToNil-8 8.57ns ± 5% 8.23ns ± 2% -3.99% (p=0.016 n=5+5) CompareBytesEmpty-8 7.37ns ± 0% 7.36ns ± 4% ~ (p=0.690 n=5+5) CompareBytesIdentical-8 7.39ns ± 0% 7.46ns ± 2% ~ (p=0.667 n=5+5) CompareBytesSameLength-8 17.0ns ± 0% 10.5ns ± 0% -38.24% (p=0.008 n=5+5) CompareBytesDifferentLength-8 17.0ns ± 0% 10.5ns ± 0% -38.24% (p=0.008 n=5+5) CompareBytesBigUnaligned-8 1.58ms ± 0% 0.19ms ± 0% -88.31% (p=0.016 n=4+5) CompareBytesBig-8 1.59ms ± 0% 0.19ms ± 0% -88.27% (p=0.016 n=5+4) CompareBytesBigIdentical-8 7.01ns ± 0% 6.60ns ± 3% -5.91% (p=0.008 n=5+5) name old speed new speed delta CompareBytesBigUnaligned-8 662MB/s ± 0% 5660MB/s ± 0% +755.15% (p=0.016 n=4+5) CompareBytesBig-8 661MB/s ± 0% 5636MB/s ± 0% +752.57% (p=0.016 n=5+4) CompareBytesBigIdentical-8 150TB/s ± 0% 159TB/s ± 3% +6.27% (p=0.008 n=5+5) Change-Id: I70332de06f873df3bc12c4a5af1028307b670046 Reviewed-on: https://go-review.googlesource.com/90175 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
This commit is contained in:
parent
910c3a9dfc
commit
bfa8b6f8ff
|
|
@ -10,7 +10,7 @@ TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
|
|||
MOVD a_len+8(FP), R0
|
||||
MOVD b_base+24(FP), R3
|
||||
MOVD b_len+32(FP), R1
|
||||
ADD $56, RSP, R7
|
||||
MOVD $ret+48(FP), R7
|
||||
B cmpbody<>(SB)
|
||||
|
||||
TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56
|
||||
|
|
@ -18,7 +18,7 @@ TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56
|
|||
MOVD a_len+8(FP), R0
|
||||
MOVD b_base+24(FP), R3
|
||||
MOVD b_len+32(FP), R1
|
||||
ADD $56, RSP, R7
|
||||
MOVD $ret+48(FP), R7
|
||||
B cmpbody<>(SB)
|
||||
|
||||
TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
|
||||
|
|
@ -26,7 +26,7 @@ TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
|
|||
MOVD a_len+8(FP), R0
|
||||
MOVD b_base+16(FP), R3
|
||||
MOVD b_len+24(FP), R1
|
||||
ADD $40, RSP, R7
|
||||
MOVD $ret+32(FP), R7
|
||||
B cmpbody<>(SB)
|
||||
|
||||
// On entry:
|
||||
|
|
@ -37,30 +37,98 @@ TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
|
|||
// R7 points to return value (-1/0/1 will be written here)
|
||||
//
|
||||
// On exit:
|
||||
// R4, R5, and R6 are clobbered
|
||||
// R4, R5, R6, R8, R9 and R10 are clobbered
|
||||
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
|
||||
CMP R2, R3
|
||||
BEQ samebytes // same starting pointers; compare lengths
|
||||
BEQ samebytes // same starting pointers; compare lengths
|
||||
CMP R0, R1
|
||||
CSEL LT, R1, R0, R6 // R6 is min(R0, R1)
|
||||
CSEL LT, R1, R0, R6 // R6 is min(R0, R1)
|
||||
|
||||
ADD R2, R6 // R2 is current byte in a, R6 is last byte in a to compare
|
||||
loop:
|
||||
CMP R2, R6
|
||||
BEQ samebytes // all compared bytes were the same; compare lengths
|
||||
MOVBU.P 1(R2), R4
|
||||
MOVBU.P 1(R3), R5
|
||||
CMP $0, R6
|
||||
BEQ samebytes
|
||||
BIC $0xf, R6, R10
|
||||
CBZ R10, small // length < 16
|
||||
ADD R2, R10 // end of chunk16
|
||||
// length >= 16
|
||||
chunk16_loop:
|
||||
LDP.P 16(R2), (R4, R8)
|
||||
LDP.P 16(R3), (R5, R9)
|
||||
CMP R4, R5
|
||||
BEQ loop
|
||||
// bytes differed
|
||||
BNE cmp
|
||||
CMP R8, R9
|
||||
BNE cmpnext
|
||||
CMP R10, R2
|
||||
BNE chunk16_loop
|
||||
AND $0xf, R6, R6
|
||||
CBZ R6, samebytes
|
||||
SUBS $8, R6
|
||||
BLT tail
|
||||
// the length of tail > 8 bytes
|
||||
MOVD.P 8(R2), R4
|
||||
MOVD.P 8(R3), R5
|
||||
CMP R4, R5
|
||||
BNE cmp
|
||||
SUB $8, R6
|
||||
// compare last 8 bytes
|
||||
tail:
|
||||
MOVD (R2)(R6), R4
|
||||
MOVD (R3)(R6), R5
|
||||
CMP R4, R5
|
||||
BEQ samebytes
|
||||
cmp:
|
||||
REV R4, R4
|
||||
REV R5, R5
|
||||
CMP R4, R5
|
||||
ret:
|
||||
MOVD $1, R4
|
||||
CSNEG LT, R4, R4, R4
|
||||
CNEG HI, R4, R4
|
||||
MOVD R4, (R7)
|
||||
RET
|
||||
small:
|
||||
TBZ $3, R6, lt_8
|
||||
MOVD (R2), R4
|
||||
MOVD (R3), R5
|
||||
CMP R4, R5
|
||||
BNE cmp
|
||||
SUBS $8, R6
|
||||
BEQ samebytes
|
||||
ADD $8, R2
|
||||
ADD $8, R3
|
||||
SUB $8, R6
|
||||
B tail
|
||||
lt_8:
|
||||
TBZ $2, R6, lt_4
|
||||
MOVWU (R2), R4
|
||||
MOVWU (R3), R5
|
||||
CMPW R4, R5
|
||||
BNE cmp
|
||||
SUBS $4, R6
|
||||
BEQ samebytes
|
||||
ADD $4, R2
|
||||
ADD $4, R3
|
||||
lt_4:
|
||||
TBZ $1, R6, lt_2
|
||||
MOVHU (R2), R4
|
||||
MOVHU (R3), R5
|
||||
CMPW R4, R5
|
||||
BNE cmp
|
||||
ADD $2, R2
|
||||
ADD $2, R3
|
||||
lt_2:
|
||||
TBZ $0, R6, samebytes
|
||||
one:
|
||||
MOVBU (R2), R4
|
||||
MOVBU (R3), R5
|
||||
CMPW R4, R5
|
||||
BNE ret
|
||||
samebytes:
|
||||
MOVD $1, R4
|
||||
CMP R0, R1
|
||||
CSNEG LT, R4, R4, R4
|
||||
CSEL EQ, ZR, R4, R4
|
||||
CMP R1, R0
|
||||
CSET NE, R4
|
||||
CNEG LO, R4, R4
|
||||
MOVD R4, (R7)
|
||||
RET
|
||||
cmpnext:
|
||||
REV R8, R4
|
||||
REV R9, R5
|
||||
CMP R4, R5
|
||||
B ret
|
||||
|
|
|
|||
Loading…
Reference in New Issue