runtime: memclr perf improvements on ppc64x

This updates runtime/memclr_ppc64x.s to improve performance, by unrolling loops for larger clears. Fixes #17348 benchmark old MB/s new MB/s speedup BenchmarkMemclr/5-80 199.71 406.63 2.04x BenchmarkMemclr/16-80 693.66 1817.41 2.62x BenchmarkMemclr/64-80 2309.35 5793.34 2.51x BenchmarkMemclr/256-80 5428.18 14765.81 2.72x BenchmarkMemclr/4096-80 8611.65 27191.94 3.16x BenchmarkMemclr/65536-80 8736.69 28604.23 3.27x BenchmarkMemclr/1M-80 9304.94 27600.09 2.97x BenchmarkMemclr/4M-80 8705.66 27589.64 3.17x BenchmarkMemclr/8M-80 8575.74 23631.04 2.76x BenchmarkMemclr/16M-80 8443.10 19240.68 2.28x BenchmarkMemclr/64M-80 8390.40 9493.04 1.13x BenchmarkGoMemclr/5-80 263.05 630.37 2.40x BenchmarkGoMemclr/16-80 904.33 1148.49 1.27x BenchmarkGoMemclr/64-80 2830.20 8756.70 3.09x BenchmarkGoMemclr/256-80 6064.59 20299.46 3.35x Change-Id: Ic76c9183c8b4129ba3df512ca8b0fe6bd424e088 Reviewed-on: https://go-review.googlesource.com/30373 Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Munday <munday@ca.ibm.com> Reviewed-by: David Chase <drchase@google.com>
2016-10-05 15:12:05 -05:00 · 2016-10-05 15:12:05 -05:00 · 3107c91e2d
parent ce645534e4
commit 3107c91e2d
1 changed files with 49 additions and 18 deletions
--- a/src/runtime/memclr_ppc64x.s
+++ b/src/runtime/memclr_ppc64x.s
@ -7,25 +7,56 @@
 #include "textflag.h"

 // void runtime·memclr(void*, uintptr)
-TEXT runtime·memclr(SB),NOSPLIT|NOFRAME,$0-16
-	MOVD	ptr+0(FP), R3
-	MOVD	n+8(FP), R4
-	SRADCC	$3, R4, R6	// R6 is the number of words to zero
-	BEQ	bytes
+TEXT runtime·memclr(SB), NOSPLIT|NOFRAME, $0-16
+	MOVD ptr+0(FP), R3
+	MOVD n+8(FP), R4

-	SUB	$8, R3
-	MOVD	R6, CTR
-	MOVDU	R0, 8(R3)
-	BC	25, 0, -1(PC)	// bdnz+ $-4
-	ADD	$8, R3
+	// Determine if there are doublewords to clear
+check:
+	ANDCC $7, R4, R5  // R5: leftover bytes to clear
+	SRAD  $3, R4, R6  // R6: double words to clear
+	CMP   R6, $0, CR1 // CR1[EQ] set if no double words

-bytes:
-	ANDCC	$7, R4, R7	// R7 is the number of bytes to zero
-	BEQ	done
-	SUB	$1, R3
-	MOVD	R7, CTR
-	MOVBU	R0, 1(R3)
-	BC	25, 0, -1(PC)	// bdnz+ $-4
+	BC     12, 6, nozerolarge // only single bytes
+	MOVD   R6, CTR            // R6 = number of double words
+	SRADCC $2, R6, R7         // 32 byte chunks?
+	BNE    zero32setup

-done:
+	// Clear double words
+
+zero8:
+	MOVD R0, 0(R3)    // double word
+	ADD  $8, R3
+	BC   16, 0, zero8 // dec ctr, br zero8 if ctr not 0
+	BR   nozerolarge  // handle remainder
+
+	// Prepare to clear 32 bytes at a time.
+
+zero32setup:
+	DCBTST (R3)    // prepare data cache
+	MOVD   R7, CTR // number of 32 byte chunks
+
+zero32:
+	MOVD    R0, 0(R3)       // clear 4 double words
+	MOVD    R0, 8(R3)
+	MOVD    R0, 16(R3)
+	MOVD    R0, 24(R3)
+	ADD     $32, R3
+	BC      16, 0, zero32   // dec ctr, br zero32 if ctr not 0
+	RLDCLCC $61, R4, $3, R6 // remaining doublewords
+	BEQ     nozerolarge
+	MOVD    R6, CTR         // set up the CTR for doublewords
+	BR      zero8
+
+nozerolarge:
+	CMP R5, $0   // any remaining bytes
+	BC  4, 1, LR // ble lr
+
+zerotail:
+	MOVD R5, CTR // set up to clear tail bytes
+
+zerotailloop:
+	MOVB R0, 0(R3)           // clear single bytes
+	ADD  $1, R3
+	BC   16, 0, zerotailloop // dec ctr, br zerotailloop if ctr not 0
 	RET