cmd/compile: improve lowered moves and zeros for ppc64le
This change includes the following:

- Generate LXV/STXV sequences instead of LXVD2X/STXVD2X on power9.
  These instructions do not require an index register, which allows
  more loads and stores within a loop without initializing multiple
  index registers. The LoweredQuadXXX ops generate LXV/STXV.
- Create LoweredMoveXXXShort and LoweredZeroXXXShort for short moves
  that don't generate loops, and therefore don't clobber the address
  registers or flags.
- Use registers other than R3 and R4, so as not to conflict with
  registers that have already been allocated and thereby avoid
  unnecessary register moves.
- Eliminate the use of R14 as a scratch register, using R31 instead.
- Add PCALIGN when LoweredMoveXXX or LoweredZeroXXX generates a loop
  with more than 3 iterations.

This performance opportunity was noticed in github.com/golang/snappy
benchmarks. Results on power9:

WordsDecode1e1    54.1ns ± 0%    53.8ns ± 0%    -0.51%  (p=0.029 n=4+4)
WordsDecode1e2     287ns ± 0%     282ns ± 1%    -1.83%  (p=0.029 n=4+4)
WordsDecode1e3    3.98µs ± 0%    3.64µs ± 0%    -8.52%  (p=0.029 n=4+4)
WordsDecode1e4    66.9µs ± 0%    67.0µs ± 0%    +0.20%  (p=0.029 n=4+4)
WordsDecode1e5     723µs ± 0%     723µs ± 0%    -0.01%  (p=0.200 n=4+4)
WordsDecode1e6    7.21ms ± 0%    7.21ms ± 0%    -0.02%  (p=1.000 n=4+4)
WordsEncode1e1    29.9ns ± 0%    29.4ns ± 0%    -1.51%  (p=0.029 n=4+4)
WordsEncode1e2    2.12µs ± 0%    1.75µs ± 0%   -17.70%  (p=0.029 n=4+4)
WordsEncode1e3    11.7µs ± 0%    11.2µs ± 0%    -4.61%  (p=0.029 n=4+4)
WordsEncode1e4     119µs ± 0%     120µs ± 0%    +0.36%  (p=0.029 n=4+4)
WordsEncode1e5    1.21ms ± 0%    1.22ms ± 0%    +0.41%  (p=0.029 n=4+4)
WordsEncode1e6    12.0ms ± 0%    12.0ms ± 0%    +0.57%  (p=0.029 n=4+4)
RandomEncode       286µs ± 0%     203µs ± 0%   -28.82%  (p=0.029 n=4+4)
ExtendMatch       47.4µs ± 0%    47.0µs ± 0%    -0.85%  (p=0.029 n=4+4)

Change-Id: Iecad3a39ae55280286e42760a5c9d5c1168f5858
Reviewed-on: https://go-review.googlesource.com/c/go/+/226539
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
commit 815509ae31 (parent 5f3354d1bf), from the mirror of https://github.com/golang/go.git
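Before reading the diff, here is a small sketch of the size decomposition the power9 zeroing path below performs (illustrative only; zeroPlan is not part of the compiler, just a model of the ctr/rem logic in ssaGenValue):

```go
package main

import "fmt"

// zeroPlan mirrors how LoweredQuadZero decomposes a zeroing request:
// a loop of 64-byte STXV groups (only when more than one iteration is
// needed), then 32- and 16-byte STXV stores, then 8/4/2/1-byte integer
// stores for the tail.
func zeroPlan(size int64) []string {
	var plan []string
	ctr := size / 64 // loop iterations
	rem := size % 64 // remainder bytes
	if ctr > 1 {
		plan = append(plan, fmt.Sprintf("loop x%d: 4 STXV per iteration (64 bytes)", ctr))
	} else {
		// A single iteration is not worth a loop; fold it into the remainder.
		rem += 64 * ctr
	}
	for rem >= 32 {
		plan = append(plan, "2 STXV (32 bytes)")
		rem -= 32
	}
	if rem >= 16 {
		plan = append(plan, "1 STXV (16 bytes)")
		rem -= 16
	}
	for _, mov := range []struct {
		op string
		n  int64
	}{{"MOVD", 8}, {"MOVW", 4}, {"MOVH", 2}, {"MOVB", 1}} {
		for rem >= mov.n {
			plan = append(plan, mov.op)
			rem -= mov.n
		}
	}
	return plan
}

func main() {
	// 200 bytes: 3 loop iterations (192 bytes) plus one MOVD for the last 8.
	fmt.Println(zeroPlan(200))
}
```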
src/cmd/compile/internal/ppc64/ssa.go

@@ -850,39 +850,226 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.From.Type = obj.TYPE_CONST
 		p.From.Offset = v.AuxInt & 3

-	case ssa.OpPPC64LoweredZero:
+	case ssa.OpPPC64LoweredQuadZero, ssa.OpPPC64LoweredQuadZeroShort:
+		// The LoweredQuad code generation
+		// generates STXV instructions on
+		// power9. The Short variation is used
+		// if no loop is generated.
+
+		// sizes >= 64 generate a loop as follows:
+		// Set up loop counter in CTR, used by BC
+		//	XXLXOR clears VS32
+		//	XXLXOR VS32,VS32,VS32
+		//	MOVD len/64,REG_TMP
+		//	MOVD REG_TMP,CTR
+		//	loop:
+		//	STXV VS32,0(R20)
+		//	STXV VS32,16(R20)
+		//	STXV VS32,32(R20)
+		//	STXV VS32,48(R20)
+		//	ADD $64,R20
+		//	BC 16, 0, loop
+
+		// Bytes per iteration
+		ctr := v.AuxInt / 64
+
+		// Remainder bytes
+		rem := v.AuxInt % 64
+
+		// Only generate a loop if there is more
+		// than 1 iteration.
+		if ctr > 1 {
+			// Set up VS32 (V0) to hold 0s
+			p := s.Prog(ppc64.AXXLXOR)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS32
+			p.Reg = ppc64.REG_VS32
+
+			// Set up CTR loop counter
+			p = s.Prog(ppc64.AMOVD)
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = ctr
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REGTMP
+
+			p = s.Prog(ppc64.AMOVD)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REGTMP
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_CTR
+
+			// Don't generate padding for
+			// loops with few iterations.
+			if ctr > 3 {
+				p = s.Prog(obj.APCALIGN)
+				p.From.Type = obj.TYPE_CONST
+				p.From.Offset = 16
+			}
+
+			// generate 4 STXVs to zero 64 bytes
+			var top *obj.Prog
+
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = v.Args[0].Reg()
+
+			// Save the top of loop
+			if top == nil {
+				top = p
+			}
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = v.Args[0].Reg()
+			p.To.Offset = 16
+
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = v.Args[0].Reg()
+			p.To.Offset = 32
+
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = v.Args[0].Reg()
+			p.To.Offset = 48
+
+			// Increment address for the
+			// 64 bytes just zeroed.
+			p = s.Prog(ppc64.AADD)
+			p.Reg = v.Args[0].Reg()
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = 64
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = v.Args[0].Reg()
+
+			// Branch back to top of loop
+			// based on CTR
+			// BC with BO_BCTR generates bdnz
+			p = s.Prog(ppc64.ABC)
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = ppc64.BO_BCTR
+			p.Reg = ppc64.REG_R0
+			p.To.Type = obj.TYPE_BRANCH
+			gc.Patch(p, top)
+		}
+
+		// When ctr == 1 the loop was not generated but
+		// there are at least 64 bytes to clear, so add
+		// that to the remainder to generate the code
+		// to clear those doublewords
+		if ctr == 1 {
+			rem += 64
+		}
+
+		// Clear the remainder starting at offset zero
+		offset := int64(0)
+
+		if rem >= 16 && ctr <= 1 {
+			// If the XXLXOR hasn't already been
+			// generated, do it here to initialize
+			// VS32 (V0) to 0.
+			p := s.Prog(ppc64.AXXLXOR)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS32
+			p.Reg = ppc64.REG_VS32
+		}
+		// Generate STXV for 32 or 64
+		// bytes.
+		for rem >= 32 {
+			p := s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = v.Args[0].Reg()
+			p.To.Offset = offset
+
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = v.Args[0].Reg()
+			p.To.Offset = offset + 16
+			offset += 32
+			rem -= 32
+		}
+		// Generate 16 bytes
+		if rem >= 16 {
+			p := s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = v.Args[0].Reg()
+			p.To.Offset = offset
+			offset += 16
+			rem -= 16
+		}
+
+		// first clear as many doublewords as possible
+		// then clear remaining sizes as available
+		for rem > 0 {
+			op, size := ppc64.AMOVB, int64(1)
+			switch {
+			case rem >= 8:
+				op, size = ppc64.AMOVD, 8
+			case rem >= 4:
+				op, size = ppc64.AMOVW, 4
+			case rem >= 2:
+				op, size = ppc64.AMOVH, 2
+			}
+			p := s.Prog(op)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_R0
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = v.Args[0].Reg()
+			p.To.Offset = offset
+			rem -= size
+			offset += size
+		}
+
+	case ssa.OpPPC64LoweredZero, ssa.OpPPC64LoweredZeroShort:
+
-		// unaligned data doesn't hurt performance
-		// for these instructions on power8 or later
+		// Unaligned data doesn't hurt performance
+		// for these instructions on power8.

-		// for sizes >= 64 generate a loop as follows:
+		// For sizes >= 64 generate a loop as follows:

-		// set up loop counter in CTR, used by BC
+		// Set up loop counter in CTR, used by BC
 		//	XXLXOR VS32,VS32,VS32
 		//	MOVD len/32,REG_TMP
 		//	MOVD REG_TMP,CTR
 		//	MOVD $16,REG_TMP
 		//	loop:
-		//	STXVD2X VS32,(R0)(R3)
-		//	STXVD2X VS32,(R31)(R3)
-		//	ADD $32,R3
+		//	STXVD2X VS32,(R0)(R20)
+		//	STXVD2X VS32,(R31)(R20)
+		//	ADD $32,R20
 		//	BC 16, 0, loop
 		//
 		// any remainder is done as described below

 		// for sizes < 64 bytes, first clear as many doublewords as possible,
 		// then handle the remainder
-		//	MOVD R0,(R3)
-		//	MOVD R0,8(R3)
+		//	MOVD R0,(R20)
+		//	MOVD R0,8(R20)
 		//	.... etc.
 		//
 		// the remainder bytes are cleared using one or more
 		// of the following instructions with the appropriate
 		// offsets depending which instructions are needed
 		//
-		//	MOVW R0,n1(R3)	4 bytes
-		//	MOVH R0,n2(R3)	2 bytes
-		//	MOVB R0,n3(R3)	1 byte
+		//	MOVW R0,n1(R20)	4 bytes
+		//	MOVH R0,n2(R20)	2 bytes
+		//	MOVB R0,n3(R20)	1 byte
 		//
 		// 7 bytes: MOVW, MOVH, MOVB
 		// 6 bytes: MOVW, MOVH
@@ -926,10 +1113,19 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			p.To.Type = obj.TYPE_REG
 			p.To.Reg = ppc64.REGTMP

+			// Don't add padding for alignment
+			// with few loop iterations.
+			if ctr > 3 {
+				p = s.Prog(obj.APCALIGN)
+				p.From.Type = obj.TYPE_CONST
+				p.From.Offset = 16
+			}
+
 			// generate 2 STXVD2Xs to store 16 bytes
+			// when this is a loop then the top must be saved
 			var top *obj.Prog
 			// This is the top of loop

 			p = s.Prog(ppc64.ASTXVD2X)
 			p.From.Type = obj.TYPE_REG
 			p.From.Reg = ppc64.REG_VS32
@@ -940,7 +1136,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			if top == nil {
 				top = p
 			}
-
 			p = s.Prog(ppc64.ASTXVD2X)
 			p.From.Type = obj.TYPE_REG
 			p.From.Reg = ppc64.REG_VS32
@@ -1001,8 +1196,9 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			offset += size
 		}

-	case ssa.OpPPC64LoweredMove:
+	case ssa.OpPPC64LoweredMove, ssa.OpPPC64LoweredMoveShort:

+		bytesPerLoop := int64(32)
 		// This will be used when moving more
 		// than 8 bytes.  Moves start with
 		// as many 8 byte moves as possible, then
@@ -1019,34 +1215,34 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		//	MOVD REG_TMP,CTR
 		//	MOVD $16,REG_TMP
 		//	top:
-		//	LXVD2X (R0)(R4),VS32
-		//	LXVD2X (R31)(R4),VS33
-		//	ADD $32,R4
-		//	STXVD2X VS32,(R0)(R3)
-		//	STXVD2X VS33,(R31)(R4)
-		//	ADD $32,R3
+		//	LXVD2X (R0)(R21),VS32
+		//	LXVD2X (R31)(R21),VS33
+		//	ADD $32,R21
+		//	STXVD2X VS32,(R0)(R20)
+		//	STXVD2X VS33,(R31)(R20)
+		//	ADD $32,R20
 		//	BC 16,0,top
 		// Bytes not moved by this loop are moved
 		// with a combination of the following instructions,
 		// starting with the largest sizes and generating as
 		// many as needed, using the appropriate offset value.
-		//	MOVD n(R4),R14
-		//	MOVD R14,n(R3)
-		//	MOVW n1(R4),R14
-		//	MOVW R14,n1(R3)
-		//	MOVH n2(R4),R14
-		//	MOVH R14,n2(R3)
-		//	MOVB n3(R4),R14
-		//	MOVB R14,n3(R3)
+		//	MOVD n(R21),R31
+		//	MOVD R31,n(R20)
+		//	MOVW n1(R21),R31
+		//	MOVW R31,n1(R20)
+		//	MOVH n2(R21),R31
+		//	MOVH R31,n2(R20)
+		//	MOVB n3(R21),R31
+		//	MOVB R31,n3(R20)

 		// Each loop iteration moves 32 bytes
-		ctr := v.AuxInt / 32
+		ctr := v.AuxInt / bytesPerLoop

 		// Remainder after the loop
-		rem := v.AuxInt % 32
+		rem := v.AuxInt % bytesPerLoop

-		dst_reg := v.Args[0].Reg()
-		src_reg := v.Args[1].Reg()
+		dstReg := v.Args[0].Reg()
+		srcReg := v.Args[1].Reg()

 		// The set of registers used here, must match the clobbered reg list
 		// in PPC64Ops.go.
@@ -1076,57 +1272,65 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			p.To.Type = obj.TYPE_REG
 			p.To.Reg = ppc64.REGTMP

+			// Don't add padding for
+			// alignment with small iteration
+			// counts.
+			if ctr > 3 {
+				p = s.Prog(obj.APCALIGN)
+				p.From.Type = obj.TYPE_CONST
+				p.From.Offset = 16
+			}
+
 			// Generate 16 byte loads and stores.
 			// Use temp register for index (16)
 			// on the second one.

 			p = s.Prog(ppc64.ALXVD2X)
 			p.From.Type = obj.TYPE_MEM
-			p.From.Reg = src_reg
+			p.From.Reg = srcReg
 			p.From.Index = ppc64.REGZERO
 			p.To.Type = obj.TYPE_REG
 			p.To.Reg = ppc64.REG_VS32

 			if top == nil {
 				top = p
 			}

 			p = s.Prog(ppc64.ALXVD2X)
 			p.From.Type = obj.TYPE_MEM
-			p.From.Reg = src_reg
+			p.From.Reg = srcReg
 			p.From.Index = ppc64.REGTMP
 			p.To.Type = obj.TYPE_REG
 			p.To.Reg = ppc64.REG_VS33

 			// increment the src reg for next iteration
 			p = s.Prog(ppc64.AADD)
-			p.Reg = src_reg
+			p.Reg = srcReg
 			p.From.Type = obj.TYPE_CONST
-			p.From.Offset = 32
+			p.From.Offset = bytesPerLoop
 			p.To.Type = obj.TYPE_REG
-			p.To.Reg = src_reg
+			p.To.Reg = srcReg

 			// generate 16 byte stores
 			p = s.Prog(ppc64.ASTXVD2X)
 			p.From.Type = obj.TYPE_REG
 			p.From.Reg = ppc64.REG_VS32
 			p.To.Type = obj.TYPE_MEM
-			p.To.Reg = dst_reg
+			p.To.Reg = dstReg
 			p.To.Index = ppc64.REGZERO

 			p = s.Prog(ppc64.ASTXVD2X)
 			p.From.Type = obj.TYPE_REG
 			p.From.Reg = ppc64.REG_VS33
 			p.To.Type = obj.TYPE_MEM
-			p.To.Reg = dst_reg
+			p.To.Reg = dstReg
 			p.To.Index = ppc64.REGTMP

 			// increment the dst reg for next iteration
 			p = s.Prog(ppc64.AADD)
-			p.Reg = dst_reg
+			p.Reg = dstReg
 			p.From.Type = obj.TYPE_CONST
-			p.From.Offset = 32
+			p.From.Offset = bytesPerLoop
 			p.To.Type = obj.TYPE_REG
-			p.To.Reg = dst_reg
+			p.To.Reg = dstReg

 			// BC with BO_BCTR generates bdnz to branch on nonzero CTR
 			// to loop top.
@@ -1137,7 +1341,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			p.To.Type = obj.TYPE_BRANCH
 			gc.Patch(p, top)

-			// src_reg and dst_reg were incremented in the loop, so
+			// srcReg and dstReg were incremented in the loop, so
 			// later instructions start with offset 0.
 			offset = int64(0)
 		}
@@ -1145,7 +1349,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		// No loop was generated for one iteration, so
 		// add 32 bytes to the remainder to move those bytes.
 		if ctr == 1 {
-			rem += 32
+			rem += bytesPerLoop
 		}

 		if rem >= 16 {
@@ -1154,7 +1358,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			// on the second one.
 			p := s.Prog(ppc64.ALXVD2X)
 			p.From.Type = obj.TYPE_MEM
-			p.From.Reg = src_reg
+			p.From.Reg = srcReg
 			p.From.Index = ppc64.REGZERO
 			p.To.Type = obj.TYPE_REG
 			p.To.Reg = ppc64.REG_VS32
@@ -1163,7 +1367,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			p.From.Type = obj.TYPE_REG
 			p.From.Reg = ppc64.REG_VS32
 			p.To.Type = obj.TYPE_MEM
-			p.To.Reg = dst_reg
+			p.To.Reg = dstReg
 			p.To.Index = ppc64.REGZERO

 			offset = 16
@@ -1171,18 +1375,15 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {

 		if rem >= 16 {
 			// Use REGTMP as index reg
-			p = s.Prog(ppc64.AMOVD)
+			p := s.Prog(ppc64.AMOVD)
 			p.From.Type = obj.TYPE_CONST
 			p.From.Offset = 16
 			p.To.Type = obj.TYPE_REG
 			p.To.Reg = ppc64.REGTMP

 			// Generate 16 byte loads and stores.
-			// Use temp register for index (16)
-			// on the second one.
 			p = s.Prog(ppc64.ALXVD2X)
 			p.From.Type = obj.TYPE_MEM
-			p.From.Reg = src_reg
+			p.From.Reg = srcReg
 			p.From.Index = ppc64.REGTMP
 			p.To.Type = obj.TYPE_REG
 			p.To.Reg = ppc64.REG_VS32
@@ -1191,7 +1392,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			p.From.Type = obj.TYPE_REG
 			p.From.Reg = ppc64.REG_VS32
 			p.To.Type = obj.TYPE_MEM
-			p.To.Reg = dst_reg
+			p.To.Reg = dstReg
 			p.To.Index = ppc64.REGTMP

 			offset = 32
@@ -1214,17 +1415,284 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			// Load
 			p := s.Prog(op)
 			p.To.Type = obj.TYPE_REG
-			p.To.Reg = ppc64.REG_R14
+			p.To.Reg = ppc64.REGTMP
 			p.From.Type = obj.TYPE_MEM
-			p.From.Reg = src_reg
+			p.From.Reg = srcReg
 			p.From.Offset = offset

 			// Store
 			p = s.Prog(op)
 			p.From.Type = obj.TYPE_REG
-			p.From.Reg = ppc64.REG_R14
+			p.From.Reg = ppc64.REGTMP
 			p.To.Type = obj.TYPE_MEM
-			p.To.Reg = dst_reg
+			p.To.Reg = dstReg
 			p.To.Offset = offset
 			rem -= size
 			offset += size
 		}

+	case ssa.OpPPC64LoweredQuadMove, ssa.OpPPC64LoweredQuadMoveShort:
+		bytesPerLoop := int64(64)
+		// This is used when moving more
+		// than 8 bytes on power9. Moves start with
+		// as many 8 byte moves as possible, then
+		// 4, 2, or 1 byte(s) as remaining. This will
+		// work and be efficient for power8 or later.
+		// If there are 64 or more bytes, then a
+		// loop is generated to move 32 bytes and
+		// update the src and dst addresses on each
+		// iteration. When < 64 bytes, the appropriate
+		// number of moves are generated based on the
+		// size.
+		// When moving >= 64 bytes a loop is used
+		//	MOVD len/32,REG_TMP
+		//	MOVD REG_TMP,CTR
+		//	top:
+		//	LXV 0(R21),VS32
+		//	LXV 16(R21),VS33
+		//	ADD $32,R21
+		//	STXV VS32,0(R20)
+		//	STXV VS33,16(R20)
+		//	ADD $32,R20
+		//	BC 16,0,top
+		// Bytes not moved by this loop are moved
+		// with a combination of the following instructions,
+		// starting with the largest sizes and generating as
+		// many as needed, using the appropriate offset value.
+		//	MOVD n(R21),R31
+		//	MOVD R31,n(R20)
+		//	MOVW n1(R21),R31
+		//	MOVW R31,n1(R20)
+		//	MOVH n2(R21),R31
+		//	MOVH R31,n2(R20)
+		//	MOVB n3(R21),R31
+		//	MOVB R31,n3(R20)
+
+		// Each loop iteration moves 32 bytes
+		ctr := v.AuxInt / bytesPerLoop
+
+		// Remainder after the loop
+		rem := v.AuxInt % bytesPerLoop
+
+		dstReg := v.Args[0].Reg()
+		srcReg := v.Args[1].Reg()
+
+		offset := int64(0)
+
+		// top of the loop
+		var top *obj.Prog
+
+		// Only generate looping code when loop counter is > 1 for >= 64 bytes
+		if ctr > 1 {
+			// Set up the CTR
+			p := s.Prog(ppc64.AMOVD)
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = ctr
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REGTMP
+
+			p = s.Prog(ppc64.AMOVD)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REGTMP
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_CTR
+
+			p = s.Prog(obj.APCALIGN)
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = 16
+
+			// Generate 16 byte loads and stores.
+			p = s.Prog(ppc64.ALXV)
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = srcReg
+			p.From.Offset = offset
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS32
+			if top == nil {
+				top = p
+			}
+			p = s.Prog(ppc64.ALXV)
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = srcReg
+			p.From.Offset = offset + 16
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS33
+
+			// generate 16 byte stores
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = dstReg
+			p.To.Offset = offset
+
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS33
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = dstReg
+			p.To.Offset = offset + 16
+
+			// Generate 16 byte loads and stores.
+			p = s.Prog(ppc64.ALXV)
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = srcReg
+			p.From.Offset = offset + 32
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS32
+
+			p = s.Prog(ppc64.ALXV)
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = srcReg
+			p.From.Offset = offset + 48
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS33
+
+			// generate 16 byte stores
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = dstReg
+			p.To.Offset = offset + 32
+
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS33
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = dstReg
+			p.To.Offset = offset + 48
+
+			// increment the src reg for next iteration
+			p = s.Prog(ppc64.AADD)
+			p.Reg = srcReg
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = bytesPerLoop
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = srcReg
+
+			// increment the dst reg for next iteration
+			p = s.Prog(ppc64.AADD)
+			p.Reg = dstReg
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = bytesPerLoop
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = dstReg
+
+			// BC with BO_BCTR generates bdnz to branch on nonzero CTR
+			// to loop top.
+			p = s.Prog(ppc64.ABC)
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = ppc64.BO_BCTR
+			p.Reg = ppc64.REG_R0
+			p.To.Type = obj.TYPE_BRANCH
+			gc.Patch(p, top)
+
+			// srcReg and dstReg were incremented in the loop, so
+			// later instructions start with offset 0.
+			offset = int64(0)
+		}
+
+		// No loop was generated for one iteration, so
+		// add 32 bytes to the remainder to move those bytes.
+		if ctr == 1 {
+			rem += bytesPerLoop
+		}
+		if rem >= 32 {
+			p := s.Prog(ppc64.ALXV)
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = srcReg
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS32
+
+			p = s.Prog(ppc64.ALXV)
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = srcReg
+			p.From.Offset = 16
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS33
+
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = dstReg
+
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS33
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = dstReg
+			p.To.Offset = 16
+
+			offset = 32
+			rem -= 32
+		}
+
+		if rem >= 16 {
+			// Generate 16 byte loads and stores.
+			p := s.Prog(ppc64.ALXV)
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = srcReg
+			p.From.Offset = offset
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS32
+
+			p = s.Prog(ppc64.ASTXV)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = dstReg
+			p.To.Offset = offset
+
+			offset += 16
+			rem -= 16
+
+			if rem >= 16 {
+				p := s.Prog(ppc64.ALXV)
+				p.From.Type = obj.TYPE_MEM
+				p.From.Reg = srcReg
+				p.From.Offset = offset
+				p.To.Type = obj.TYPE_REG
+				p.To.Reg = ppc64.REG_VS32
+
+				p = s.Prog(ppc64.ASTXV)
+				p.From.Type = obj.TYPE_REG
+				p.From.Reg = ppc64.REG_VS32
+				p.To.Type = obj.TYPE_MEM
+				p.To.Reg = dstReg
+				p.To.Offset = offset
+
+				offset += 16
+				rem -= 16
+			}
+		}
+		// Generate all the remaining load and store pairs, starting with
+		// as many 8 byte moves as possible, then 4, 2, 1.
+		for rem > 0 {
+			op, size := ppc64.AMOVB, int64(1)
+			switch {
+			case rem >= 8:
+				op, size = ppc64.AMOVD, 8
+			case rem >= 4:
+				op, size = ppc64.AMOVW, 4
+			case rem >= 2:
+				op, size = ppc64.AMOVH, 2
+			}
+			// Load
+			p := s.Prog(op)
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REGTMP
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = srcReg
+			p.From.Offset = offset
+
+			// Store
+			p = s.Prog(op)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REGTMP
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = dstReg
+			p.To.Offset = offset
+			rem -= size
+			offset += size
src/cmd/compile/internal/ssa/gen/PPC64.rules

@@ -574,7 +574,12 @@
         (MOVDstorezero [0] destptr mem))))

 // Handle cases not handled above
-(Zero [s] ptr mem) -> (LoweredZero [s] ptr mem)
+// Lowered Short cases do not generate loops, and as a result don't clobber
+// the address registers or flags.
+(Zero [s] ptr mem) && objabi.GOPPC64 <= 8 && s < 64 -> (LoweredZeroShort [s] ptr mem)
+(Zero [s] ptr mem) && objabi.GOPPC64 <= 8 -> (LoweredZero [s] ptr mem)
+(Zero [s] ptr mem) && s < 128 && objabi.GOPPC64 >= 9 -> (LoweredQuadZeroShort [s] ptr mem)
+(Zero [s] ptr mem) && objabi.GOPPC64 >= 9 -> (LoweredQuadZero [s] ptr mem)

 // moves
 // Only the MOVD and MOVW instructions require 4 byte
@@ -608,8 +613,12 @@

 // Large move uses a loop. Since the address is computed and the
 // offset is zero, any alignment can be used.
-(Move [s] dst src mem) && s > 8 && logLargeCopy(v, s) ->
+(Move [s] dst src mem) && s > 8 && objabi.GOPPC64 <= 8 && logLargeCopy(v, s) ->
         (LoweredMove [s] dst src mem)
+(Move [s] dst src mem) && s > 8 && s <= 64 && objabi.GOPPC64 >= 9 ->
+        (LoweredQuadMoveShort [s] dst src mem)
+(Move [s] dst src mem) && s > 8 && objabi.GOPPC64 >= 9 && logLargeCopy(v, s) ->
+        (LoweredQuadMove [s] dst src mem)

 // Calls
 // Lowering calls
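Read together, the Zero and Move rules above dispatch on size and target ISA level. Here is a minimal Go sketch of that selection logic (illustrative only, not compiler code; the real Move rules also consult logLargeCopy before choosing the looping ops):

```go
// lowerZeroOp names the op the Zero rules above would pick for s bytes.
func lowerZeroOp(s int64, goppc64 int) string {
	switch {
	case goppc64 <= 8 && s < 64:
		return "LoweredZeroShort"
	case goppc64 <= 8:
		return "LoweredZero"
	case s < 128: // goppc64 >= 9 from here on
		return "LoweredQuadZeroShort"
	default:
		return "LoweredQuadZero"
	}
}

// lowerMoveOp does the same for a Move of s > 8 bytes.
func lowerMoveOp(s int64, goppc64 int) string {
	switch {
	case goppc64 <= 8:
		return "LoweredMove"
	case s <= 64:
		return "LoweredQuadMoveShort"
	default:
		return "LoweredQuadMove"
	}
}
```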
src/cmd/compile/internal/ssa/gen/PPC64Ops.go

@@ -445,14 +445,49 @@ func init() {
 			aux:       "Int64",
 			argLength: 2,
 			reg: regInfo{
-				inputs:   []regMask{buildReg("R3")},
-				clobbers: buildReg("R3"),
+				inputs:   []regMask{buildReg("R20")},
+				clobbers: buildReg("R20"),
 			},
 			clobberFlags:   true,
 			typ:            "Mem",
 			faultOnNilArg0: true,
 			unsafePoint:    true,
 		},
+		{
+			name:      "LoweredZeroShort",
+			aux:       "Int64",
+			argLength: 2,
+			reg: regInfo{
+				inputs: []regMask{gp}},
+			typ:            "Mem",
+			faultOnNilArg0: true,
+			unsafePoint:    true,
+		},
+		{
+			name:      "LoweredQuadZeroShort",
+			aux:       "Int64",
+			argLength: 2,
+			reg: regInfo{
+				inputs: []regMask{gp},
+			},
+			typ:            "Mem",
+			faultOnNilArg0: true,
+			unsafePoint:    true,
+		},
+		{
+			name:      "LoweredQuadZero",
+			aux:       "Int64",
+			argLength: 2,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("R20")},
+				clobbers: buildReg("R20"),
+			},
+			clobberFlags:   true,
+			typ:            "Mem",
+			faultOnNilArg0: true,
+			unsafePoint:    true,
+		},

 		// R31 is temp register
 		// Loop code:
 		//	MOVD len/32,R31		set up loop ctr
@@ -491,8 +526,8 @@ func init() {
 			aux:       "Int64",
 			argLength: 3,
 			reg: regInfo{
-				inputs:   []regMask{buildReg("R3"), buildReg("R4")},
-				clobbers: buildReg("R3 R4 R14"),
+				inputs:   []regMask{buildReg("R20"), buildReg("R21")},
+				clobbers: buildReg("R20 R21"),
 			},
 			clobberFlags: true,
 			typ:          "Mem",
@@ -500,6 +535,49 @@ func init() {
 			faultOnNilArg1: true,
 			unsafePoint:    true,
 		},
+		{
+			name:      "LoweredMoveShort",
+			aux:       "Int64",
+			argLength: 3,
+			reg: regInfo{
+				inputs: []regMask{gp, gp},
+			},
+			typ:            "Mem",
+			faultOnNilArg0: true,
+			faultOnNilArg1: true,
+			unsafePoint:    true,
+		},
+
+		// The following is similar to the LoweredMove, but uses
+		// LXV instead of LXVD2X, which does not require an index
+		// register and will do 4 in a loop instead of only 2.
+		{
+			name:      "LoweredQuadMove",
+			aux:       "Int64",
+			argLength: 3,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("R20"), buildReg("R21")},
+				clobbers: buildReg("R20 R21"),
+			},
+			clobberFlags:   true,
+			typ:            "Mem",
+			faultOnNilArg0: true,
+			faultOnNilArg1: true,
+			unsafePoint:    true,
+		},
+
+		{
+			name:      "LoweredQuadMoveShort",
+			aux:       "Int64",
+			argLength: 3,
+			reg: regInfo{
+				inputs: []regMask{gp, gp},
+			},
+			typ:            "Mem",
+			faultOnNilArg0: true,
+			faultOnNilArg1: true,
+			unsafePoint:    true,
+		},

 		{name: "LoweredAtomicStore8", argLength: 3, reg: gpstore, typ: "Mem", aux: "Int64", faultOnNilArg0: true, hasSideEffects: true},
 		{name: "LoweredAtomicStore32", argLength: 3, reg: gpstore, typ: "Mem", aux: "Int64", faultOnNilArg0: true, hasSideEffects: true},
src/cmd/compile/internal/ssa/opGen.go

@@ -1872,7 +1872,13 @@ const (
 	OpPPC64CALLclosure
 	OpPPC64CALLinter
 	OpPPC64LoweredZero
+	OpPPC64LoweredZeroShort
+	OpPPC64LoweredQuadZeroShort
+	OpPPC64LoweredQuadZero
 	OpPPC64LoweredMove
+	OpPPC64LoweredMoveShort
+	OpPPC64LoweredQuadMove
+	OpPPC64LoweredQuadMoveShort
 	OpPPC64LoweredAtomicStore8
 	OpPPC64LoweredAtomicStore32
 	OpPPC64LoweredAtomicStore64
@@ -24865,9 +24871,47 @@ var opcodeTable = [...]opInfo{
 		unsafePoint:    true,
 		reg: regInfo{
 			inputs: []inputInfo{
-				{0, 8}, // R3
+				{0, 1048576}, // R20
 			},
-			clobbers: 8, // R3
+			clobbers: 1048576, // R20
 		},
 	},
+	{
+		name:           "LoweredZeroShort",
+		auxType:        auxInt64,
+		argLen:         2,
+		faultOnNilArg0: true,
+		unsafePoint:    true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+		},
+	},
+	{
+		name:           "LoweredQuadZeroShort",
+		auxType:        auxInt64,
+		argLen:         2,
+		faultOnNilArg0: true,
+		unsafePoint:    true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+		},
+	},
+	{
+		name:           "LoweredQuadZero",
+		auxType:        auxInt64,
+		argLen:         2,
+		clobberFlags:   true,
+		faultOnNilArg0: true,
+		unsafePoint:    true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1048576}, // R20
+			},
+			clobbers: 1048576, // R20
+		},
+	},
 	{
@@ -24880,10 +24924,54 @@ var opcodeTable = [...]opInfo{
 		unsafePoint:    true,
 		reg: regInfo{
 			inputs: []inputInfo{
-				{0, 8},  // R3
-				{1, 16}, // R4
+				{0, 1048576}, // R20
+				{1, 2097152}, // R21
 			},
-			clobbers: 16408, // R3 R4 R14
+			clobbers: 3145728, // R20 R21
 		},
 	},
+	{
+		name:           "LoweredMoveShort",
+		auxType:        auxInt64,
+		argLen:         3,
+		faultOnNilArg0: true,
+		faultOnNilArg1: true,
+		unsafePoint:    true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+				{1, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+		},
+	},
+	{
+		name:           "LoweredQuadMove",
+		auxType:        auxInt64,
+		argLen:         3,
+		clobberFlags:   true,
+		faultOnNilArg0: true,
+		faultOnNilArg1: true,
+		unsafePoint:    true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1048576}, // R20
+				{1, 2097152}, // R21
+			},
+			clobbers: 3145728, // R20 R21
+		},
+	},
+	{
+		name:           "LoweredQuadMoveShort",
+		auxType:        auxInt64,
+		argLen:         3,
+		faultOnNilArg0: true,
+		faultOnNilArg1: true,
+		unsafePoint:    true,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+				{1, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+			},
+		},
+	},
 	{
src/cmd/compile/internal/ssa/rewrite.go

@@ -1075,9 +1075,9 @@ func isInlinableMemmove(dst, src *Value, sz int64, c *Config) bool {
 	switch c.arch {
 	case "amd64":
 		return sz <= 16 || (sz < 1024 && disjoint(dst, sz, src, sz))
-	case "386", "ppc64", "ppc64le", "arm64":
+	case "386", "arm64":
 		return sz <= 8
-	case "s390x":
+	case "s390x", "ppc64", "ppc64le":
 		return sz <= 8 || disjoint(dst, sz, src, sz)
 	case "arm", "mips", "mips64", "mipsle", "mips64le":
 		return sz <= 4
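The isInlinableMemmove change above lets ppc64/ppc64le inline any copy whose ranges are provably non-overlapping, not just copies of 8 bytes or less. As a hedged sketch of the disjointness idea this relies on (the compiler's disjoint helper works on SSA values with symbolic bases, not raw integers as here):

```go
// disjointRanges reports whether [dst, dst+n) and [src, src+n) do not
// overlap, which is what makes a straight block copy safe without
// memmove's direction handling.
func disjointRanges(dst, src, n int64) bool {
	return dst+n <= src || src+n <= dst
}
```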
src/cmd/compile/internal/ssa/rewritePPC64.go

@@ -3486,14 +3486,14 @@ func rewriteValuePPC64_OpMove(v *Value) bool {
 		return true
 	}
 	// match: (Move [s] dst src mem)
-	// cond: s > 8 && logLargeCopy(v, s)
+	// cond: s > 8 && objabi.GOPPC64 <= 8 && logLargeCopy(v, s)
 	// result: (LoweredMove [s] dst src mem)
 	for {
 		s := v.AuxInt
 		dst := v_0
 		src := v_1
 		mem := v_2
-		if !(s > 8 && logLargeCopy(v, s)) {
+		if !(s > 8 && objabi.GOPPC64 <= 8 && logLargeCopy(v, s)) {
 			break
 		}
 		v.reset(OpPPC64LoweredMove)
@@ -3501,6 +3501,38 @@ func rewriteValuePPC64_OpMove(v *Value) bool {
 		v.AddArg3(dst, src, mem)
 		return true
 	}
+	// match: (Move [s] dst src mem)
+	// cond: s > 8 && s <= 64 && objabi.GOPPC64 >= 9
+	// result: (LoweredQuadMoveShort [s] dst src mem)
+	for {
+		s := v.AuxInt
+		dst := v_0
+		src := v_1
+		mem := v_2
+		if !(s > 8 && s <= 64 && objabi.GOPPC64 >= 9) {
+			break
+		}
+		v.reset(OpPPC64LoweredQuadMoveShort)
+		v.AuxInt = s
+		v.AddArg3(dst, src, mem)
+		return true
+	}
+	// match: (Move [s] dst src mem)
+	// cond: s > 8 && objabi.GOPPC64 >= 9 && logLargeCopy(v, s)
+	// result: (LoweredQuadMove [s] dst src mem)
+	for {
+		s := v.AuxInt
+		dst := v_0
+		src := v_1
+		mem := v_2
+		if !(s > 8 && objabi.GOPPC64 >= 9 && logLargeCopy(v, s)) {
+			break
+		}
+		v.reset(OpPPC64LoweredQuadMove)
+		v.AuxInt = s
+		v.AddArg3(dst, src, mem)
+		return true
+	}
 	return false
 }
 func rewriteValuePPC64_OpNeq16(v *Value) bool {
@@ -14953,16 +14985,66 @@ func rewriteValuePPC64_OpZero(v *Value) bool {
 		return true
 	}
 	// match: (Zero [s] ptr mem)
+	// cond: objabi.GOPPC64 <= 8 && s < 64
+	// result: (LoweredZeroShort [s] ptr mem)
+	for {
+		s := v.AuxInt
+		ptr := v_0
+		mem := v_1
+		if !(objabi.GOPPC64 <= 8 && s < 64) {
+			break
+		}
+		v.reset(OpPPC64LoweredZeroShort)
+		v.AuxInt = s
+		v.AddArg2(ptr, mem)
+		return true
+	}
+	// match: (Zero [s] ptr mem)
+	// cond: objabi.GOPPC64 <= 8
 	// result: (LoweredZero [s] ptr mem)
 	for {
 		s := v.AuxInt
 		ptr := v_0
 		mem := v_1
+		if !(objabi.GOPPC64 <= 8) {
+			break
+		}
 		v.reset(OpPPC64LoweredZero)
 		v.AuxInt = s
 		v.AddArg2(ptr, mem)
 		return true
 	}
+	// match: (Zero [s] ptr mem)
+	// cond: s < 128 && objabi.GOPPC64 >= 9
+	// result: (LoweredQuadZeroShort [s] ptr mem)
+	for {
+		s := v.AuxInt
+		ptr := v_0
+		mem := v_1
+		if !(s < 128 && objabi.GOPPC64 >= 9) {
+			break
+		}
+		v.reset(OpPPC64LoweredQuadZeroShort)
+		v.AuxInt = s
+		v.AddArg2(ptr, mem)
+		return true
+	}
+	// match: (Zero [s] ptr mem)
+	// cond: objabi.GOPPC64 >= 9
+	// result: (LoweredQuadZero [s] ptr mem)
+	for {
+		s := v.AuxInt
+		ptr := v_0
+		mem := v_1
+		if !(objabi.GOPPC64 >= 9) {
+			break
+		}
+		v.reset(OpPPC64LoweredQuadZero)
+		v.AuxInt = s
+		v.AddArg2(ptr, mem)
+		return true
+	}
 	return false
 }
 func rewriteBlockPPC64(b *Block) bool {
 	switch b.Kind {
test/codegen/copy.go

@@ -34,6 +34,8 @@ func movesmall7() {
 func movesmall16() {
 	x := [...]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
 	// amd64:-".*memmove"
+	// ppc64:".*memmove"
+	// ppc64le:".*memmove"
 	copy(x[1:], x[:])
 }

@@ -41,10 +43,34 @@ var x [256]byte

 // Check that large disjoint copies are replaced with moves.

+func moveDisjointStack32() {
+	var s [32]byte
+	// ppc64:-".*memmove"
+	// ppc64le:-".*memmove"
+	// ppc64le/power8:"LXVD2X",-"ADD",-"BC"
+	// ppc64le/power9:"LXV",-"LXVD2X",-"ADD",-"BC"
+	copy(s[:], x[:32])
+	runtime.KeepAlive(&s)
+}
+
+func moveDisjointStack64() {
+	var s [96]byte
+	// ppc64:-".*memmove"
+	// ppc64le:-".*memmove"
+	// ppc64le/power8:"LXVD2X","ADD","BC"
+	// ppc64le/power9:"LXV",-"LXVD2X",-"ADD",-"BC"
+	copy(s[:], x[:96])
+	runtime.KeepAlive(&s)
+}
+
 func moveDisjointStack() {
 	var s [256]byte
 	// s390x:-".*memmove"
 	// amd64:-".*memmove"
+	// ppc64:-".*memmove"
+	// ppc64le:-".*memmove"
+	// ppc64le/power8:"LXVD2X"
+	// ppc64le/power9:"LXV",-"LXVD2X"
 	copy(s[:], x[:])
 	runtime.KeepAlive(&s)
 }
@@ -53,6 +79,10 @@ func moveDisjointArg(b *[256]byte) {
 	var s [256]byte
 	// s390x:-".*memmove"
 	// amd64:-".*memmove"
+	// ppc64:-".*memmove"
+	// ppc64le:-".*memmove"
+	// ppc64le/power8:"LXVD2X"
+	// ppc64le/power9:"LXV",-"LXVD2X"
 	copy(s[:], b[:])
 	runtime.KeepAlive(&s)
 }
@@ -60,6 +90,10 @@ func moveDisjointArg(b *[256]byte) {
 func moveDisjointNoOverlap(a *[256]byte) {
 	// s390x:-".*memmove"
 	// amd64:-".*memmove"
+	// ppc64:-".*memmove"
+	// ppc64le:-".*memmove"
+	// ppc64le/power8:"LXVD2X"
+	// ppc64le/power9:"LXV",-"LXVD2X"
 	copy(a[:], a[128:])
 }