mirror of https://github.com/golang/go.git
runtime: improve memclr on ppc64x/power10
Rewrite memclr asm function to use the new power10 instruction stxvl or the store vector with length which can specify the number of bytes to be stored in a register, thereby avoiding loops to store the tail end bytes. On power9 and power8 the code remains unchanged. The performance for all sizes<16 improve on power10 with this change. name old time/op new time/op delta Memclr/1 2.59ns ± 1% 2.80ns ± 9% ~ Memclr/2 2.77ns ± 0% 2.90ns ± 8% ~ Memclr/3 3.70ns ± 1% 3.00ns ± 1% -19.02% Memclr/4 4.56ns ± 5% 2.98ns ± 1% -34.56% Memclr/5 10.1ns ± 8% 2.9ns ± 4% -70.82% Memclr/6 6.99ns ± 4% 2.84ns ± 9% -59.40% Memclr/7 8.66ns ± 5% 2.92ns ± 4% -66.27% Memclr/8 2.75ns ± 1% 2.75ns ± 0% ~ Memclr/9 3.43ns ± 1% 2.75ns ± 0% -19.78% Memclr/10 3.30ns ± 0% 2.77ns ± 0% -15.81% Memclr/12 5.09ns ± 0% 2.78ns ± 0% -45.49% Memclr/15 9.79ns ± 0% 2.78ns ± 0% -71.64% Memclr/16 2.65ns ± 1% 2.77ns ± 0% +4.77% GoMemclr/1 2.46ns ± 3% 2.46ns ± 3% ~ GoMemclr/2 2.91ns ± 0% 2.46ns ± 1% -15.38% GoMemclr/3 4.85ns ± 6% 2.45ns ± 1% -49.50% GoMemclr/4 5.44ns ± 5% 2.47ns ± 1% -54.63% GoMemclr/5 6.48ns ± 0% 2.45ns ± 1% -62.20% GoMemclr/6 9.33ns ±20% 2.45ns ± 1% -73.76% GoMemclr/7 8.79ns ± 2% 2.46ns ± 1% -72.03% GoMemclr/8 2.91ns ± 0% 2.84ns ± 0% -2.35% GoMemclr/9 5.04ns ± 0% 2.84ns ± 0% -43.67% GoMemclr/10 5.17ns ± 1% 2.84ns ± 0% -44.99% GoMemclr/12 6.22ns ± 0% 2.84ns ± 0% -54.41% GoMemclr/15 10.1ns ± 0% 2.8ns ± 0% -72.04% GoMemclr/16 2.90ns ± 0% 2.92ns ± 0% +0.83% Change-Id: I2b7e1a9ceafba11c57c04a2e8cbd12b5ac87ec9b Reviewed-on: https://go-review.googlesource.com/c/go/+/467635 Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com> TryBot-Result: Gopher Robot <gobot@golang.org> Run-TryBot: Archana Ravindar <aravind5@in.ibm.com>
This commit is contained in:
parent
cad872f75a
commit
e6faa375b4
|
|
@ -91,24 +91,33 @@ lt32gt8:
|
|||
ADD $16, R3
|
||||
ADD $-16, R4
|
||||
lt16gt8:
|
||||
#ifdef GOPPC64_power10
|
||||
SLD $56, R4, R7
|
||||
STXVL V0, R3, R7
|
||||
RET
|
||||
#else
|
||||
CMP R4, $8
|
||||
BLT nozerolarge
|
||||
MOVD R0, 0(R3)
|
||||
ADD $8, R3
|
||||
ADD $-8, R4
|
||||
|
||||
#endif
|
||||
nozerolarge:
|
||||
ANDCC $7, R4, R5 // any remaining bytes
|
||||
BC 4, 1, LR // ble lr
|
||||
|
||||
zerotail:
|
||||
#ifdef GOPPC64_power10
|
||||
XXLXOR VS32, VS32, VS32 // clear VS32 (V0)
|
||||
SLD $56, R5, R7
|
||||
STXVL V0, R3, R7
|
||||
RET
|
||||
#else
|
||||
MOVD R5, CTR // set up to clear tail bytes
|
||||
|
||||
zerotailloop:
|
||||
MOVB R0, 0(R3) // clear single bytes
|
||||
ADD $1, R3
|
||||
BDNZ zerotailloop // dec ctr, br zerotailloop if ctr not 0
|
||||
RET
|
||||
#endif
|
||||
|
||||
zero512xsetup: // 512 chunk with extra needed
|
||||
ANDCC $8, R3, R11 // 8 byte alignment?
|
||||
|
|
@ -123,8 +132,6 @@ zero512setup16:
|
|||
MOVD $128, R15
|
||||
SUB R14, R15, R14 // find increment to 128 alignment
|
||||
SRD $4, R14, R15 // number of 16 byte chunks
|
||||
|
||||
zero512presetup:
|
||||
MOVD R15, CTR // loop counter of 16 bytes
|
||||
XXLXOR VS32, VS32, VS32 // clear VS32 (V0)
|
||||
|
||||
|
|
@ -142,8 +149,7 @@ zero512setup: // setup for dcbz loop
|
|||
MOVD $128, R9 // index regs for 128 bytes
|
||||
MOVD $256, R10
|
||||
MOVD $384, R11
|
||||
PCALIGN $32
|
||||
|
||||
PCALIGN $32
|
||||
zero512:
|
||||
DCBZ (R3+R0) // clear first chunk
|
||||
DCBZ (R3+R9) // clear second chunk
|
||||
|
|
|
|||
Loading…
Reference in New Issue