runtime: improve memmove on ppc64x/power10

Rewrite memmove asm function to use the new power10 instructions
lxvl and stxvl or the load and store vector with length which can
specify the number of bytes to be loaded/stored in a register,
thereby avoiding multiple instructions to process 8bytes, 4bytes,
2bytes and a single byte while storing the tail end bytes. On power9
and power8 the code remains unchanged.
The performance for all sizes<=16 improve on power10 with this change.

name                          old time/op    new time/op    delta
Memmove/1                2.87ns ±  0%    2.64ns ±  1%    -8.11%
Memmove/2                2.85ns ±  0%    2.62ns ±  1%    -8.12%
Memmove/3                2.78ns ±  0%    2.63ns ±  1%    -5.33%
Memmove/4                2.83ns ±  0%    2.63ns ±  2%    -7.33%
Memmove/5                2.78ns ±  0%    2.63ns ±  1%    -5.40%
Memmove/6                2.61ns ±  3%    2.61ns ±  1%      ~
Memmove/7                2.82ns ±  0%    2.61ns ±  1%    -7.48%
Memmove/8                2.82ns ±  0%    2.65ns ±  1%    -6.11%
Memmove/9                6.41ns ±  0%    2.62ns ±  1%   -59.17%
Memmove/10               5.09ns ±  1%    2.60ns ±  1%   -48.90%
Memmove/11               4.68ns ±  7%    2.59ns ±  1%   -44.56%
Memmove/12               6.25ns ±  2%    2.60ns ±  1%   -58.46%
Memmove/13               4.15ns ± 25%    2.59ns ±  1%   -37.66%
Memmove/14               3.76ns ± 11%    2.59ns ±  1%   -30.94%
Memmove/15               3.82ns ±  1%    2.60ns ±  1%   -31.93%
Memmove/16               2.96ns ±  1%    2.59ns ±  1%   -12.63%
MemmoveUnalignedDst/1    3.07ns ±  0%    2.77ns ±  0%    -9.75%
MemmoveUnalignedDst/2    2.82ns ±  0%    2.77ns ±  0%    -1.73%
MemmoveUnalignedDst/3    3.03ns ±  0%    2.77ns ±  0%    -8.75%
MemmoveUnalignedDst/4    2.85ns ±  1%    2.77ns ±  0%    -2.90%
MemmoveUnalignedDst/5    3.03ns ±  0%    2.77ns ±  0%    -8.75%
MemmoveUnalignedDst/6    2.88ns ±  0%    2.77ns ±  0%    -4.04%
MemmoveUnalignedDst/7    3.11ns ±  0%    2.77ns ±  0%   -11.10%
MemmoveUnalignedDst/8    4.18ns ±  2%    2.77ns ±  0%   -33.90%
MemmoveUnalignedDst/9    6.36ns ±  1%    2.77ns ±  0%   -56.53%
MemmoveUnalignedDst/10   5.77ns ±  1%    2.77ns ±  0%   -52.09%
MemmoveUnalignedDst/11   4.68ns ±  1%    2.77ns ±  0%   -40.86%
MemmoveUnalignedDst/12   4.54ns ±  2%    2.77ns ±  0%   -39.05%
MemmoveUnalignedDst/13   6.16ns ±  5%    2.77ns ±  0%   -55.14%
MemmoveUnalignedDst/14   4.03ns ±  2%    2.77ns ±  0%   -31.41%
MemmoveUnalignedDst/15   4.11ns ±  0%    2.77ns ±  0%   -32.74%
MemmoveUnalignedDst/16   3.49ns ±  4%    2.79ns ±  1%   -20.04%
MemmoveUnalignedSrc/1    3.06ns ±  0%    2.77ns ±  0%    -9.68%
MemmoveUnalignedSrc/2    2.82ns ±  1%    2.77ns ±  0%    -1.93%
MemmoveUnalignedSrc/3    3.04ns ±  0%    2.77ns ±  0%    -8.95%
MemmoveUnalignedSrc/4    2.85ns ±  0%    2.77ns ±  0%    -2.86%
MemmoveUnalignedSrc/5    3.04ns ±  0%    2.77ns ±  0%    -8.97%
MemmoveUnalignedSrc/6    2.93ns ±  0%    2.77ns ±  0%    -5.43%
MemmoveUnalignedSrc/7    3.13ns ±  0%    2.77ns ±  0%   -11.56%
MemmoveUnalignedSrc/8    3.71ns ±  2%    2.77ns ±  0%   -25.46%
MemmoveUnalignedSrc/9    6.04ns ±  0%    2.77ns ±  0%   -54.16%
MemmoveUnalignedSrc/10   6.86ns ±  5%    2.77ns ±  0%   -59.69%
MemmoveUnalignedSrc/11   4.18ns ±  3%    2.77ns ±  0%   -33.81%
MemmoveUnalignedSrc/12   4.75ns ±  2%    2.77ns ±  0%   -41.81%
MemmoveUnalignedSrc/13   4.78ns ±  3%    2.77ns ±  0%   -42.15%
MemmoveUnalignedSrc/14   3.89ns ±  5%    2.77ns ±  0%   -28.80%
MemmoveUnalignedSrc/15   4.09ns ±  0%    2.77ns ±  0%   -32.30%
MemmoveUnalignedSrc/16   3.15ns ±  1%    2.77ns ±  0%   -12.05%
Change-Id: Ia3c09d968dada71a794e5ccab3300ea9c46d8374
Reviewed-on: https://go-review.googlesource.com/c/go/+/470135
Run-TryBot: Archana Ravindar <aravind5@in.ibm.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Heschi Kreinick <heschi@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>

This commit is contained in:

Archana R

2023-02-22 05:52:15 -06:00

committed by

Lynn Boger

parent bd20bf4807

commit cdf77c7209

1 changed files with 24 additions and 0 deletions

									
										24

src/runtime/memmove_ppc64x.s

										View File
									
					@ -39,6 +39,15 @@ TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24

						// Determine if there are doublewords to

						// Determine if there are doublewords to

						// copy so a more efficient move can be done

						// copy so a more efficient move can be done

					check:

					check:

					#ifdef GOPPC64_power10

						CMP	LEN, $16

						BGT	mcopy

						SLD	$56, LEN, TMP

						LXVL	SRC, TMP, V0

						STXVL	V0, TGT, TMP

						RET

					#endif

					mcopy:

						ANDCC	$7, LEN, BYTES	// R7: bytes to copy

						ANDCC	$7, LEN, BYTES	// R7: bytes to copy

						SRD	$3, LEN, DWORDS	// R6: double words to copy

						SRD	$3, LEN, DWORDS	// R6: double words to copy

						MOVFL	CR0, CR3	// save CR from ANDCC

						MOVFL	CR0, CR3	// save CR from ANDCC

					@ -110,12 +119,26 @@ lt32gt8:

					lt16:	// Move 8 bytes if possible

					lt16:	// Move 8 bytes if possible

						CMP     DWORDS, $1

						CMP     DWORDS, $1

						BLT     checkbytes

						BLT     checkbytes

					#ifdef GOPPC64_power10

						ADD	$8, BYTES

						SLD	$56, BYTES, TMP

						LXVL	SRC, TMP, V0

						STXVL	V0, TGT, TMP

						RET

					#endif

						MOVD    0(SRC), TMP

						MOVD    0(SRC), TMP

						ADD	$8, SRC

						ADD	$8, SRC

						MOVD    TMP, 0(TGT)

						MOVD    TMP, 0(TGT)

						ADD     $8, TGT

						ADD     $8, TGT

					checkbytes:

					checkbytes:

						BC	12, 14, LR		// BEQ lr

						BC	12, 14, LR		// BEQ lr

					#ifdef GOPPC64_power10

						SLD	$56, BYTES, TMP

						LXVL	SRC, TMP, V0

						STXVL	V0, TGT, TMP

						RET

					#endif

					lt8:	// Move word if possible

					lt8:	// Move word if possible

						CMP BYTES, $4

						CMP BYTES, $4

						BLT lt4

						BLT lt4

					@ -183,6 +206,7 @@ backward32setup:

						ANDCC   $3,DWORDS		// Compute remaining DWORDS and compare to 0

						ANDCC   $3,DWORDS		// Compute remaining DWORDS and compare to 0

						MOVD	QWORDS, CTR		// set up loop ctr

						MOVD	QWORDS, CTR		// set up loop ctr

						MOVD	$16, IDX16		// 32 bytes at a time

						MOVD	$16, IDX16		// 32 bytes at a time

						PCALIGN	$32

					backward32loop:

					backward32loop:

						SUB	$32, TGT

						SUB	$32, TGT

runtime: improve memmove on ppc64x/power10

24 src/runtime/memmove_ppc64x.s Unescape Escape View File

24

src/runtime/memmove_ppc64x.s

View File