internal/bytealg: optimise memequal on riscv64

Implement memequal using loops that process 32 bytes, 16 bytes, 4 bytes
or 1 byte depending on size and alignment. For comparisons that are less
than 32 bytes the overhead of checking and adjusting alignment usually
exceeds the overhead of reading and processing 4 bytes at a time.

Updates #50615

name                 old time/op    new time/op     delta
Equal/0-4              38.3ns _ 0%     43.1ns _ 0%    +12.54%  (p=0.000 n=3+3)
Equal/1-4              77.7ns _ 0%     90.3ns _ 0%    +16.27%  (p=0.000 n=3+3)
Equal/6-4               116ns _ 0%      121ns _ 0%     +3.85%  (p=0.002 n=3+3)
Equal/9-4               137ns _ 0%      126ns _ 0%     -7.98%  (p=0.000 n=3+3)
Equal/15-4              179ns _ 0%      170ns _ 0%     -4.77%  (p=0.001 n=3+3)
Equal/16-4              186ns _ 0%      159ns _ 0%    -14.65%  (p=0.000 n=3+3)
Equal/20-4              215ns _ 0%      178ns _ 0%    -17.18%  (p=0.000 n=3+3)
Equal/32-4              298ns _ 0%      101ns _ 0%    -66.22%  (p=0.000 n=3+3)
Equal/4K-4             28.9_s _ 0%      2.2_s _ 0%    -92.56%  (p=0.000 n=3+3)
Equal/4M-4             29.6ms _ 0%      2.2ms _ 0%    -92.72%  (p=0.000 n=3+3)
Equal/64M-4             758ms _75%       35ms _ 0%       ~     (p=0.127 n=3+3)
CompareBytesEqual-4     226ns _ 0%      131ns _ 0%    -41.76%  (p=0.000 n=3+3)

name                 old speed      new speed       delta
Equal/1-4            12.9MB/s _ 0%   11.1MB/s _ 0%    -13.98%  (p=0.000 n=3+3)
Equal/6-4            51.7MB/s _ 0%   49.8MB/s _ 0%     -3.72%  (p=0.002 n=3+3)
Equal/9-4            65.7MB/s _ 0%   71.4MB/s _ 0%     +8.67%  (p=0.000 n=3+3)
Equal/15-4           83.8MB/s _ 0%   88.0MB/s _ 0%     +5.02%  (p=0.001 n=3+3)
Equal/16-4           85.9MB/s _ 0%  100.6MB/s _ 0%    +17.19%  (p=0.000 n=3+3)
Equal/20-4           93.2MB/s _ 0%  112.6MB/s _ 0%    +20.74%  (p=0.000 n=3+3)
Equal/32-4            107MB/s _ 0%    317MB/s _ 0%   +195.97%  (p=0.000 n=3+3)
Equal/4K-4            142MB/s _ 0%   1902MB/s _ 0%  +1243.76%  (p=0.000 n=3+3)
Equal/4M-4            142MB/s _ 0%   1946MB/s _ 0%  +1274.22%  (p=0.000 n=3+3)
Equal/64M-4           111MB/s _55%   1941MB/s _ 0%  +1641.21%  (p=0.000 n=3+3)

Change-Id: I9af7e82de3c4c5af8813772ed139230900c03b92
Reviewed-on: https://go-review.googlesource.com/c/go/+/380075
Trust: Joel Sing <joel@sing.id.au>
Trust: mzh <mzh@golangcn.org>
Reviewed-by: mzh <mzh@golangcn.org>
Run-TryBot: Joel Sing <joel@sing.id.au>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
This commit is contained in:
Joel Sing 2022-01-21 18:34:03 +11:00
parent 7fd9564fcd
commit 3c42ebf3e2
1 changed files with 111 additions and 33 deletions

View File

@ -9,41 +9,119 @@
// func memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
MOV a+0(FP), A1
MOV b+8(FP), A2
BEQ A1, A2, eq
MOV size+16(FP), A3
ADD A1, A3, A4
loop:
BEQ A1, A4, eq
MOVBU (A1), A6
ADD $1, A1
MOVBU (A2), A7
ADD $1, A2
BEQ A6, A7, loop
MOVB ZERO, ret+24(FP)
RET
eq:
MOV $1, A1
MOVB A1, ret+24(FP)
RET
MOV a+0(FP), X5
MOV b+8(FP), X6
MOV size+16(FP), X7
MOV $ret+24(FP), X19
JMP memequal<>(SB)
// func memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
MOV a+0(FP), A1
MOV b+8(FP), A2
BEQ A1, A2, eq
MOV 8(CTXT), A3 // compiler stores size at offset 8 in the closure
MOV A1, 8(X2)
MOV A2, 16(X2)
MOV A3, 24(X2)
CALL runtime·memequal(SB)
MOVBU 32(X2), A1
MOVB A1, ret+16(FP)
TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-17
MOV a+0(FP), X5
MOV b+8(FP), X6
MOV 8(CTXT), X7 // compiler stores size at offset 8 in the closure
MOV $ret+16(FP), X19
JMP memequal<>(SB)
// On entry X5 and X6 contain pointers, X7 contains length.
// X19 contains address for return value.
TEXT memequal<>(SB),NOSPLIT|NOFRAME,$0
BEQ X5, X6, eq
MOV $32, X8
BLT X7, X8, loop4_check
// Check alignment - if alignment differs we have to do one byte at a time.
AND $3, X5, X9
AND $3, X6, X10
BNE X9, X10, loop4_check
BEQZ X9, loop32_check
// Check one byte at a time until we reach 8 byte alignment.
SUB X9, X7, X7
align:
ADD $-1, X9
MOVBU 0(X5), X10
MOVBU 0(X6), X11
BNE X10, X11, not_eq
ADD $1, X5
ADD $1, X6
BNEZ X9, align
loop32_check:
MOV $32, X9
BLT X7, X9, loop16_check
loop32:
MOV 0(X5), X10
MOV 0(X6), X11
MOV 8(X5), X12
MOV 8(X6), X13
BNE X10, X11, not_eq
BNE X12, X13, not_eq
MOV 16(X5), X14
MOV 16(X6), X15
MOV 24(X5), X16
MOV 24(X6), X17
BNE X14, X15, not_eq
BNE X16, X17, not_eq
ADD $32, X5
ADD $32, X6
ADD $-32, X7
BGE X7, X9, loop32
BEQZ X7, eq
loop16_check:
MOV $16, X8
BLT X7, X8, loop4_check
loop16:
MOV 0(X5), X10
MOV 0(X6), X11
MOV 8(X5), X12
MOV 8(X6), X13
BNE X10, X11, not_eq
BNE X12, X13, not_eq
ADD $16, X5
ADD $16, X6
ADD $-16, X7
BGE X7, X8, loop16
BEQZ X7, eq
loop4_check:
MOV $4, X8
BLT X7, X8, loop1
loop4:
MOVBU 0(X5), X10
MOVBU 0(X6), X11
MOVBU 1(X5), X12
MOVBU 1(X6), X13
BNE X10, X11, not_eq
BNE X12, X13, not_eq
MOVBU 2(X5), X14
MOVBU 2(X6), X15
MOVBU 3(X5), X16
MOVBU 3(X6), X17
BNE X14, X15, not_eq
BNE X16, X17, not_eq
ADD $4, X5
ADD $4, X6
ADD $-4, X7
BGE X7, X8, loop4
loop1:
BEQZ X7, eq
MOVBU 0(X5), X10
MOVBU 0(X6), X11
BNE X10, X11, not_eq
ADD $1, X5
ADD $1, X6
ADD $-1, X7
JMP loop1
not_eq:
MOV $0, X5
MOVB X5, (X19)
RET
eq:
MOV $1, A1
MOVB A1, ret+16(FP)
MOV $1, X5
MOVB X5, (X19)
RET