mirror of https://github.com/golang/go.git
runtime: fix 32B backward copy on ppc64x
The test to enter the 32b copy loop always fails, and execution falls back to a single 8B/iteration copy loop for copies of more than 7 bytes. Likewise, the 32B loop has SRC/DST args mixed, and fails to truncate DWORDS after completing. Fix these, and unroll the 8B/iteration loop as it will only execute 1-3 times if reached. POWER10 benchmarks: name old speed new speed delta MemmoveOverlap/32 5.28GB/s ± 0% 10.37GB/s ± 0% +96.22% MemmoveOverlap/64 5.97GB/s ± 0% 18.15GB/s ± 0% +203.95% MemmoveOverlap/128 7.67GB/s ± 0% 24.35GB/s ± 0% +217.41% MemmoveOverlap/256 14.1GB/s ± 0% 25.0GB/s ± 0% +77.48% MemmoveOverlap/512 14.2GB/s ± 0% 30.9GB/s ± 0% +118.19% MemmoveOverlap/1024 12.3GB/s ± 0% 36.4GB/s ± 0% +194.75% MemmoveOverlap/2048 13.7GB/s ± 0% 48.8GB/s ± 0% +255.24% MemmoveOverlap/4096 14.1GB/s ± 0% 43.4GB/s ± 0% +208.80% MemmoveUnalignedDstOverlap/32 5.07GB/s ± 0% 3.78GB/s ± 0% -25.33% MemmoveUnalignedDstOverlap/64 6.00GB/s ± 0% 9.59GB/s ± 0% +59.78% MemmoveUnalignedDstOverlap/128 7.66GB/s ± 0% 13.51GB/s ± 0% +76.42% MemmoveUnalignedDstOverlap/256 13.4GB/s ± 0% 24.3GB/s ± 0% +80.92% MemmoveUnalignedDstOverlap/512 13.9GB/s ± 0% 30.3GB/s ± 0% +118.29% MemmoveUnalignedDstOverlap/1024 12.3GB/s ± 0% 37.3GB/s ± 0% +203.07% MemmoveUnalignedDstOverlap/2048 13.7GB/s ± 0% 45.9GB/s ± 0% +235.39% MemmoveUnalignedDstOverlap/4096 13.9GB/s ± 0% 41.2GB/s ± 0% +196.34% MemmoveUnalignedSrcOverlap/32 5.13GB/s ± 0% 5.18GB/s ± 0% +0.98% MemmoveUnalignedSrcOverlap/64 6.26GB/s ± 0% 9.53GB/s ± 0% +52.29% MemmoveUnalignedSrcOverlap/128 7.94GB/s ± 0% 18.40GB/s ± 0% +131.76% MemmoveUnalignedSrcOverlap/256 14.1GB/s ± 0% 25.5GB/s ± 0% +81.40% MemmoveUnalignedSrcOverlap/512 14.2GB/s ± 0% 30.9GB/s ± 0% +116.76% MemmoveUnalignedSrcOverlap/1024 12.4GB/s ± 0% 46.4GB/s ± 0% +275.22% MemmoveUnalignedSrcOverlap/2048 13.7GB/s ± 0% 48.7GB/s ± 0% +255.16% MemmoveUnalignedSrcOverlap/4096 14.0GB/s ± 0% 43.2GB/s ± 0% +208.89% Change-Id: I9fc6956ff454a2856d56077d1014388fb74c1f52 Reviewed-on: https://go-review.googlesource.com/c/go/+/384074 Trust: Paul Murphy <murp@ibm.com> Run-TryBot: Paul Murphy <murp@ibm.com> Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com> Reviewed-by: Cherry Mui <cherryyz@google.com> TryBot-Result: Gopher Robot <gobot@golang.org>
This commit is contained in:
parent
da6f9a54ed
commit
d82c294da7
|
|
@ -139,36 +139,38 @@ backwardtailloop:
|
|||
BC 16, 0, backwardtailloop // bndz
|
||||
|
||||
nobackwardtail:
|
||||
BC 4, 5, LR // ble CR1 lr
|
||||
BC 4, 5, LR // blelr cr1, return if DWORDS == 0
|
||||
SRDCC $2,DWORDS,QWORDS // Compute number of 32B blocks and compare to 0
|
||||
BNE backward32setup // If QWORDS != 0, start the 32B copy loop.
|
||||
|
||||
backwardlarge:
|
||||
MOVD DWORDS, CTR
|
||||
SUB TGT, SRC, TMP // Use vsx if moving
|
||||
CMP TMP, $32 // at least 32 byte chunks
|
||||
BLT backwardlargeloop // and distance >= 32
|
||||
SRDCC $2,DWORDS,QWORDS // 32 byte chunks
|
||||
BNE backward32setup
|
||||
backward24:
|
||||
// DWORDS is a value between 1-3.
|
||||
CMP DWORDS, $2
|
||||
|
||||
backwardlargeloop:
|
||||
MOVD -8(SRC), TMP
|
||||
SUB $8,SRC
|
||||
MOVD TMP, -8(TGT)
|
||||
SUB $8,TGT
|
||||
BC 16, 0, backwardlargeloop // bndz
|
||||
BC 12, 0, LR // bltlr, return if DWORDS == 1
|
||||
|
||||
MOVD -16(SRC), TMP
|
||||
MOVD TMP, -16(TGT)
|
||||
BC 12, 2, LR // beqlr, return if DWORDS == 2
|
||||
|
||||
MOVD -24(SRC), TMP
|
||||
MOVD TMP, -24(TGT)
|
||||
RET
|
||||
|
||||
backward32setup:
|
||||
MOVD QWORDS, CTR // set up loop ctr
|
||||
MOVD $16, IDX16 // 32 bytes at a time
|
||||
ANDCC $3,DWORDS // Compute remaining DWORDS and compare to 0
|
||||
MOVD QWORDS, CTR // set up loop ctr
|
||||
MOVD $16, IDX16 // 32 bytes at a time
|
||||
|
||||
backward32loop:
|
||||
SUB $32, TGT
|
||||
SUB $32, SRC
|
||||
LXVD2X (R0)(TGT), VS32 // load 16 bytes
|
||||
LXVD2X (IDX16)(TGT), VS33
|
||||
STXVD2X VS32, (R0)(SRC) // store 16 bytes
|
||||
STXVD2X VS33, (IDX16)(SRC)
|
||||
BC 16, 0, backward32loop // bndz
|
||||
BC 4, 5, LR // ble CR1 lr
|
||||
MOVD DWORDS, CTR
|
||||
BR backwardlargeloop
|
||||
LXVD2X (R0)(SRC), VS32 // load 16x2 bytes
|
||||
LXVD2X (IDX16)(SRC), VS33
|
||||
STXVD2X VS32, (R0)(TGT) // store 16x2 bytes
|
||||
STXVD2X VS33, (IDX16)(TGT)
|
||||
BC 16, 0, backward32loop // bndz
|
||||
BC 12, 2, LR // beqlr, return if DWORDS == 0
|
||||
BR backward24
|
||||
|
|
|
|||
Loading…
Reference in New Issue