runtime: improve memmove on ppc64x/power10

Rewrite memmove asm function to use the new power10 instructions
lxvl and stxvl (load and store vector with length), which take the
number of bytes to be loaded/stored in a register, thereby avoiding
multiple instructions to process 8 bytes, 4 bytes, 2 bytes and a
single byte while storing the tail end bytes. On power9 and power8
the code remains unchanged.
The performance for all sizes <= 16 improves on power10 with this change.

name                          old time/op    new time/op    delta
Memmove/1                2.87ns ±  0%    2.64ns ±  1%    -8.11%
Memmove/2                2.85ns ±  0%    2.62ns ±  1%    -8.12%
Memmove/3                2.78ns ±  0%    2.63ns ±  1%    -5.33%
Memmove/4                2.83ns ±  0%    2.63ns ±  2%    -7.33%
Memmove/5                2.78ns ±  0%    2.63ns ±  1%    -5.40%
Memmove/6                2.61ns ±  3%    2.61ns ±  1%      ~
Memmove/7                2.82ns ±  0%    2.61ns ±  1%    -7.48%
Memmove/8                2.82ns ±  0%    2.65ns ±  1%    -6.11%
Memmove/9                6.41ns ±  0%    2.62ns ±  1%   -59.17%
Memmove/10               5.09ns ±  1%    2.60ns ±  1%   -48.90%
Memmove/11               4.68ns ±  7%    2.59ns ±  1%   -44.56%
Memmove/12               6.25ns ±  2%    2.60ns ±  1%   -58.46%
Memmove/13               4.15ns ± 25%    2.59ns ±  1%   -37.66%
Memmove/14               3.76ns ± 11%    2.59ns ±  1%   -30.94%
Memmove/15               3.82ns ±  1%    2.60ns ±  1%   -31.93%
Memmove/16               2.96ns ±  1%    2.59ns ±  1%   -12.63%
MemmoveUnalignedDst/1    3.07ns ±  0%    2.77ns ±  0%    -9.75%
MemmoveUnalignedDst/2    2.82ns ±  0%    2.77ns ±  0%    -1.73%
MemmoveUnalignedDst/3    3.03ns ±  0%    2.77ns ±  0%    -8.75%
MemmoveUnalignedDst/4    2.85ns ±  1%    2.77ns ±  0%    -2.90%
MemmoveUnalignedDst/5    3.03ns ±  0%    2.77ns ±  0%    -8.75%
MemmoveUnalignedDst/6    2.88ns ±  0%    2.77ns ±  0%    -4.04%
MemmoveUnalignedDst/7    3.11ns ±  0%    2.77ns ±  0%   -11.10%
MemmoveUnalignedDst/8    4.18ns ±  2%    2.77ns ±  0%   -33.90%
MemmoveUnalignedDst/9    6.36ns ±  1%    2.77ns ±  0%   -56.53%
MemmoveUnalignedDst/10   5.77ns ±  1%    2.77ns ±  0%   -52.09%
MemmoveUnalignedDst/11   4.68ns ±  1%    2.77ns ±  0%   -40.86%
MemmoveUnalignedDst/12   4.54ns ±  2%    2.77ns ±  0%   -39.05%
MemmoveUnalignedDst/13   6.16ns ±  5%    2.77ns ±  0%   -55.14%
MemmoveUnalignedDst/14   4.03ns ±  2%    2.77ns ±  0%   -31.41%
MemmoveUnalignedDst/15   4.11ns ±  0%    2.77ns ±  0%   -32.74%
MemmoveUnalignedDst/16   3.49ns ±  4%    2.79ns ±  1%   -20.04%
MemmoveUnalignedSrc/1    3.06ns ±  0%    2.77ns ±  0%    -9.68%
MemmoveUnalignedSrc/2    2.82ns ±  1%    2.77ns ±  0%    -1.93%
MemmoveUnalignedSrc/3    3.04ns ±  0%    2.77ns ±  0%    -8.95%
MemmoveUnalignedSrc/4    2.85ns ±  0%    2.77ns ±  0%    -2.86%
MemmoveUnalignedSrc/5    3.04ns ±  0%    2.77ns ±  0%    -8.97%
MemmoveUnalignedSrc/6    2.93ns ±  0%    2.77ns ±  0%    -5.43%
MemmoveUnalignedSrc/7    3.13ns ±  0%    2.77ns ±  0%   -11.56%
MemmoveUnalignedSrc/8    3.71ns ±  2%    2.77ns ±  0%   -25.46%
MemmoveUnalignedSrc/9    6.04ns ±  0%    2.77ns ±  0%   -54.16%
MemmoveUnalignedSrc/10   6.86ns ±  5%    2.77ns ±  0%   -59.69%
MemmoveUnalignedSrc/11   4.18ns ±  3%    2.77ns ±  0%   -33.81%
MemmoveUnalignedSrc/12   4.75ns ±  2%    2.77ns ±  0%   -41.81%
MemmoveUnalignedSrc/13   4.78ns ±  3%    2.77ns ±  0%   -42.15%
MemmoveUnalignedSrc/14   3.89ns ±  5%    2.77ns ±  0%   -28.80%
MemmoveUnalignedSrc/15   4.09ns ±  0%    2.77ns ±  0%   -32.30%
MemmoveUnalignedSrc/16   3.15ns ±  1%    2.77ns ±  0%   -12.05%
Change-Id: Ia3c09d968dada71a794e5ccab3300ea9c46d8374
Reviewed-on: https://go-review.googlesource.com/c/go/+/470135
Run-TryBot: Archana Ravindar <aravind5@in.ibm.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Heschi Kreinick <heschi@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
This commit is contained in:
Archana R 2023-02-22 05:52:15 -06:00 committed by Lynn Boger
parent bd20bf4807
commit cdf77c7209
1 changed files with 24 additions and 0 deletions

View File

@ -39,6 +39,15 @@ TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
// Determine if there are doublewords to // Determine if there are doublewords to
// copy so a more efficient move can be done // copy so a more efficient move can be done
check: check:
#ifdef GOPPC64_power10
CMP LEN, $16
BGT mcopy
SLD $56, LEN, TMP
LXVL SRC, TMP, V0
STXVL V0, TGT, TMP
RET
#endif
mcopy:
ANDCC $7, LEN, BYTES // R7: bytes to copy ANDCC $7, LEN, BYTES // R7: bytes to copy
SRD $3, LEN, DWORDS // R6: double words to copy SRD $3, LEN, DWORDS // R6: double words to copy
MOVFL CR0, CR3 // save CR from ANDCC MOVFL CR0, CR3 // save CR from ANDCC
@ -110,12 +119,26 @@ lt32gt8:
lt16: // Move 8 bytes if possible lt16: // Move 8 bytes if possible
CMP DWORDS, $1 CMP DWORDS, $1
BLT checkbytes BLT checkbytes
#ifdef GOPPC64_power10
ADD $8, BYTES
SLD $56, BYTES, TMP
LXVL SRC, TMP, V0
STXVL V0, TGT, TMP
RET
#endif
MOVD 0(SRC), TMP MOVD 0(SRC), TMP
ADD $8, SRC ADD $8, SRC
MOVD TMP, 0(TGT) MOVD TMP, 0(TGT)
ADD $8, TGT ADD $8, TGT
checkbytes: checkbytes:
BC 12, 14, LR // BEQ lr BC 12, 14, LR // BEQ lr
#ifdef GOPPC64_power10
SLD $56, BYTES, TMP
LXVL SRC, TMP, V0
STXVL V0, TGT, TMP
RET
#endif
lt8: // Move word if possible lt8: // Move word if possible
CMP BYTES, $4 CMP BYTES, $4
BLT lt4 BLT lt4
@ -183,6 +206,7 @@ backward32setup:
ANDCC $3,DWORDS // Compute remaining DWORDS and compare to 0 ANDCC $3,DWORDS // Compute remaining DWORDS and compare to 0
MOVD QWORDS, CTR // set up loop ctr MOVD QWORDS, CTR // set up loop ctr
MOVD $16, IDX16 // 32 bytes at a time MOVD $16, IDX16 // 32 bytes at a time
PCALIGN $32
backward32loop: backward32loop:
SUB $32, TGT SUB $32, TGT