mirror of https://github.com/golang/go.git
runtime: improve memmove on ppc64x/power10
Rewrite memmove asm function to use the new power10 instructions lxvl and stxvl or the load and store vector with length which can specify the number of bytes to be loaded/stored in a register, thereby avoiding multiple instructions to process 8bytes, 4bytes, 2bytes and a single byte while storing the tail end bytes. On power9 and power8 the code remains unchanged. The performance for all sizes<=16 improve on power10 with this change. name old time/op new time/op delta Memmove/1 2.87ns ± 0% 2.64ns ± 1% -8.11% Memmove/2 2.85ns ± 0% 2.62ns ± 1% -8.12% Memmove/3 2.78ns ± 0% 2.63ns ± 1% -5.33% Memmove/4 2.83ns ± 0% 2.63ns ± 2% -7.33% Memmove/5 2.78ns ± 0% 2.63ns ± 1% -5.40% Memmove/6 2.61ns ± 3% 2.61ns ± 1% ~ Memmove/7 2.82ns ± 0% 2.61ns ± 1% -7.48% Memmove/8 2.82ns ± 0% 2.65ns ± 1% -6.11% Memmove/9 6.41ns ± 0% 2.62ns ± 1% -59.17% Memmove/10 5.09ns ± 1% 2.60ns ± 1% -48.90% Memmove/11 4.68ns ± 7% 2.59ns ± 1% -44.56% Memmove/12 6.25ns ± 2% 2.60ns ± 1% -58.46% Memmove/13 4.15ns ± 25% 2.59ns ± 1% -37.66% Memmove/14 3.76ns ± 11% 2.59ns ± 1% -30.94% Memmove/15 3.82ns ± 1% 2.60ns ± 1% -31.93% Memmove/16 2.96ns ± 1% 2.59ns ± 1% -12.63% MemmoveUnalignedDst/1 3.07ns ± 0% 2.77ns ± 0% -9.75% MemmoveUnalignedDst/2 2.82ns ± 0% 2.77ns ± 0% -1.73% MemmoveUnalignedDst/3 3.03ns ± 0% 2.77ns ± 0% -8.75% MemmoveUnalignedDst/4 2.85ns ± 1% 2.77ns ± 0% -2.90% MemmoveUnalignedDst/5 3.03ns ± 0% 2.77ns ± 0% -8.75% MemmoveUnalignedDst/6 2.88ns ± 0% 2.77ns ± 0% -4.04% MemmoveUnalignedDst/7 3.11ns ± 0% 2.77ns ± 0% -11.10% MemmoveUnalignedDst/8 4.18ns ± 2% 2.77ns ± 0% -33.90% MemmoveUnalignedDst/9 6.36ns ± 1% 2.77ns ± 0% -56.53% MemmoveUnalignedDst/10 5.77ns ± 1% 2.77ns ± 0% -52.09% MemmoveUnalignedDst/11 4.68ns ± 1% 2.77ns ± 0% -40.86% MemmoveUnalignedDst/12 4.54ns ± 2% 2.77ns ± 0% -39.05% MemmoveUnalignedDst/13 6.16ns ± 5% 2.77ns ± 0% -55.14% MemmoveUnalignedDst/14 4.03ns ± 2% 2.77ns ± 0% -31.41% MemmoveUnalignedDst/15 4.11ns ± 0% 2.77ns ± 0% -32.74% MemmoveUnalignedDst/16 3.49ns ± 4% 2.79ns ± 1% -20.04% MemmoveUnalignedSrc/1 3.06ns ± 0% 2.77ns ± 0% -9.68% MemmoveUnalignedSrc/2 2.82ns ± 1% 2.77ns ± 0% -1.93% MemmoveUnalignedSrc/3 3.04ns ± 0% 2.77ns ± 0% -8.95% MemmoveUnalignedSrc/4 2.85ns ± 0% 2.77ns ± 0% -2.86% MemmoveUnalignedSrc/5 3.04ns ± 0% 2.77ns ± 0% -8.97% MemmoveUnalignedSrc/6 2.93ns ± 0% 2.77ns ± 0% -5.43% MemmoveUnalignedSrc/7 3.13ns ± 0% 2.77ns ± 0% -11.56% MemmoveUnalignedSrc/8 3.71ns ± 2% 2.77ns ± 0% -25.46% MemmoveUnalignedSrc/9 6.04ns ± 0% 2.77ns ± 0% -54.16% MemmoveUnalignedSrc/10 6.86ns ± 5% 2.77ns ± 0% -59.69% MemmoveUnalignedSrc/11 4.18ns ± 3% 2.77ns ± 0% -33.81% MemmoveUnalignedSrc/12 4.75ns ± 2% 2.77ns ± 0% -41.81% MemmoveUnalignedSrc/13 4.78ns ± 3% 2.77ns ± 0% -42.15% MemmoveUnalignedSrc/14 3.89ns ± 5% 2.77ns ± 0% -28.80% MemmoveUnalignedSrc/15 4.09ns ± 0% 2.77ns ± 0% -32.30% MemmoveUnalignedSrc/16 3.15ns ± 1% 2.77ns ± 0% -12.05% Change-Id: Ia3c09d968dada71a794e5ccab3300ea9c46d8374 Reviewed-on: https://go-review.googlesource.com/c/go/+/470135 Run-TryBot: Archana Ravindar <aravind5@in.ibm.com> Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Heschi Kreinick <heschi@google.com> TryBot-Result: Gopher Robot <gobot@golang.org>
This commit is contained in:
parent
bd20bf4807
commit
cdf77c7209
|
|
@ -39,6 +39,15 @@ TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
|
||||||
// Determine if there are doublewords to
|
// Determine if there are doublewords to
|
||||||
// copy so a more efficient move can be done
|
// copy so a more efficient move can be done
|
||||||
check:
|
check:
|
||||||
|
#ifdef GOPPC64_power10
|
||||||
|
CMP LEN, $16
|
||||||
|
BGT mcopy
|
||||||
|
SLD $56, LEN, TMP
|
||||||
|
LXVL SRC, TMP, V0
|
||||||
|
STXVL V0, TGT, TMP
|
||||||
|
RET
|
||||||
|
#endif
|
||||||
|
mcopy:
|
||||||
ANDCC $7, LEN, BYTES // R7: bytes to copy
|
ANDCC $7, LEN, BYTES // R7: bytes to copy
|
||||||
SRD $3, LEN, DWORDS // R6: double words to copy
|
SRD $3, LEN, DWORDS // R6: double words to copy
|
||||||
MOVFL CR0, CR3 // save CR from ANDCC
|
MOVFL CR0, CR3 // save CR from ANDCC
|
||||||
|
|
@ -110,12 +119,26 @@ lt32gt8:
|
||||||
lt16: // Move 8 bytes if possible
|
lt16: // Move 8 bytes if possible
|
||||||
CMP DWORDS, $1
|
CMP DWORDS, $1
|
||||||
BLT checkbytes
|
BLT checkbytes
|
||||||
|
#ifdef GOPPC64_power10
|
||||||
|
ADD $8, BYTES
|
||||||
|
SLD $56, BYTES, TMP
|
||||||
|
LXVL SRC, TMP, V0
|
||||||
|
STXVL V0, TGT, TMP
|
||||||
|
RET
|
||||||
|
#endif
|
||||||
|
|
||||||
MOVD 0(SRC), TMP
|
MOVD 0(SRC), TMP
|
||||||
ADD $8, SRC
|
ADD $8, SRC
|
||||||
MOVD TMP, 0(TGT)
|
MOVD TMP, 0(TGT)
|
||||||
ADD $8, TGT
|
ADD $8, TGT
|
||||||
checkbytes:
|
checkbytes:
|
||||||
BC 12, 14, LR // BEQ lr
|
BC 12, 14, LR // BEQ lr
|
||||||
|
#ifdef GOPPC64_power10
|
||||||
|
SLD $56, BYTES, TMP
|
||||||
|
LXVL SRC, TMP, V0
|
||||||
|
STXVL V0, TGT, TMP
|
||||||
|
RET
|
||||||
|
#endif
|
||||||
lt8: // Move word if possible
|
lt8: // Move word if possible
|
||||||
CMP BYTES, $4
|
CMP BYTES, $4
|
||||||
BLT lt4
|
BLT lt4
|
||||||
|
|
@ -183,6 +206,7 @@ backward32setup:
|
||||||
ANDCC $3,DWORDS // Compute remaining DWORDS and compare to 0
|
ANDCC $3,DWORDS // Compute remaining DWORDS and compare to 0
|
||||||
MOVD QWORDS, CTR // set up loop ctr
|
MOVD QWORDS, CTR // set up loop ctr
|
||||||
MOVD $16, IDX16 // 32 bytes at a time
|
MOVD $16, IDX16 // 32 bytes at a time
|
||||||
|
PCALIGN $32
|
||||||
|
|
||||||
backward32loop:
|
backward32loop:
|
||||||
SUB $32, TGT
|
SUB $32, TGT
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue