mirror of https://github.com/golang/go.git
cmd/compile: add prefetch intrinsic support on PPC64
This CL enables intrinsic support to emit the following prefetch instructions for PPC64 platform that are already emitted on other platforms 1. Prefetch - prefetches data from memory address to cache; 2. PrefetchStreamed - prefetches data from memory address, with a hint that this data is being streamed. Benchmarks picked from go/test/bench/garbage Parameters tested with: GOMAXPROCS=8 tree2 -heapsize=1000000000 -cpus=8 tree -n=18 parser peano Performance results with this change on POWER9 name old time/op new time/op delta Tree2-8 75.3ms ± 2% 65.0ms ± 6% -13.61% (p=0.003 n=5+7) Tree-8 576ms ± 2% 576ms ± 1% ~ (p=0.756 n=11+10) Parser-8 3.60s ± 2% 3.59s ± 1% ~ (p=0.818 n=6+6) Peano-8 84.8ms ± 1% 84.6ms ± 1% ~ (p=0.180 n=6+6) Results on POWER8 and POWER10 are similar Change-Id: If4ac95a85aaa7b2266014e1f8fb7cd7440cbf906 Reviewed-on: https://go-review.googlesource.com/c/go/+/353730 Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com> Trust: Michael Knyszek <mknyszek@google.com>
This commit is contained in:
parent
8444a545e3
commit
6ae3afa7e7
|
|
@ -901,6 +901,13 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
|||
p.To.Type = obj.TYPE_REG
|
||||
p.To.Reg = v.Reg()
|
||||
|
||||
case ssa.OpPPC64DCBT:
|
||||
p := s.Prog(v.Op.Asm())
|
||||
p.From.Type = obj.TYPE_MEM
|
||||
p.From.Reg = v.Args[0].Reg()
|
||||
p.To.Type = obj.TYPE_CONST
|
||||
p.To.Offset = v.AuxInt
|
||||
|
||||
case ssa.OpPPC64MOVWstorezero, ssa.OpPPC64MOVHstorezero, ssa.OpPPC64MOVBstorezero:
|
||||
p := s.Prog(v.Op.Asm())
|
||||
p.From.Type = obj.TYPE_REG
|
||||
|
|
|
|||
|
|
@ -1470,3 +1470,7 @@
|
|||
&& clobber(call)
|
||||
=> (Move [sz] dst src mem)
|
||||
|
||||
// Prefetch instructions (aux is option: 0 - DCBT ; 8 - DCBT stream)
|
||||
(PrefetchCache ptr mem) => (DCBT ptr mem [0])
|
||||
(PrefetchCacheStreamed ptr mem) => (DCBT ptr mem [8])
|
||||
|
||||
|
|
|
|||
|
|
@ -149,6 +149,7 @@ func init() {
|
|||
crgp21 = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}}
|
||||
gpload = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{gp}}
|
||||
gploadidx = regInfo{inputs: []regMask{gp | sp | sb, gp}, outputs: []regMask{gp}}
|
||||
prefreg = regInfo{inputs: []regMask{gp | sp | sb}}
|
||||
gpstore = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}}
|
||||
gpstoreidx = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, gp | sp | sb}}
|
||||
gpstorezero = regInfo{inputs: []regMask{gp | sp | sb}} // ppc64.REGZERO is reserved zero value
|
||||
|
|
@ -336,6 +337,10 @@ func init() {
|
|||
{name: "FMOVDloadidx", argLength: 3, reg: fploadidx, asm: "FMOVD", typ: "Float64"},
|
||||
{name: "FMOVSloadidx", argLength: 3, reg: fploadidx, asm: "FMOVS", typ: "Float32"},
|
||||
|
||||
// Prefetch instruction
|
||||
// Do prefetch of address generated with arg0 and arg1 with option aux. arg0=addr,arg1=memory, aux=option.
|
||||
{name: "DCBT", argLength: 2, aux: "Int64", reg: prefreg, asm: "DCBT", hasSideEffects: true},
|
||||
|
||||
// Store bytes in the reverse endian order of the arch into arg0.
|
||||
// These are indexed stores with no offset field in the instruction so the auxint fields are not used.
|
||||
{name: "MOVDBRstore", argLength: 3, reg: gpstore, asm: "MOVDBR", aux: "Sym", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes reverse order
|
||||
|
|
|
|||
|
|
@ -1994,6 +1994,7 @@ const (
|
|||
OpPPC64MOVDBRloadidx
|
||||
OpPPC64FMOVDloadidx
|
||||
OpPPC64FMOVSloadidx
|
||||
OpPPC64DCBT
|
||||
OpPPC64MOVDBRstore
|
||||
OpPPC64MOVWBRstore
|
||||
OpPPC64MOVHBRstore
|
||||
|
|
@ -26715,6 +26716,18 @@ var opcodeTable = [...]opInfo{
|
|||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "DCBT",
|
||||
auxType: auxInt64,
|
||||
argLen: 2,
|
||||
hasSideEffects: true,
|
||||
asm: ppc64.ADCBT,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "MOVDBRstore",
|
||||
auxType: auxSym,
|
||||
|
|
|
|||
|
|
@ -639,6 +639,10 @@ func rewriteValuePPC64(v *Value) bool {
|
|||
return true
|
||||
case OpPopCount8:
|
||||
return rewriteValuePPC64_OpPopCount8(v)
|
||||
case OpPrefetchCache:
|
||||
return rewriteValuePPC64_OpPrefetchCache(v)
|
||||
case OpPrefetchCacheStreamed:
|
||||
return rewriteValuePPC64_OpPrefetchCacheStreamed(v)
|
||||
case OpRotateLeft16:
|
||||
return rewriteValuePPC64_OpRotateLeft16(v)
|
||||
case OpRotateLeft32:
|
||||
|
|
@ -14005,6 +14009,34 @@ func rewriteValuePPC64_OpPopCount8(v *Value) bool {
|
|||
return true
|
||||
}
|
||||
}
|
||||
func rewriteValuePPC64_OpPrefetchCache(v *Value) bool {
|
||||
v_1 := v.Args[1]
|
||||
v_0 := v.Args[0]
|
||||
// match: (PrefetchCache ptr mem)
|
||||
// result: (DCBT ptr mem [0])
|
||||
for {
|
||||
ptr := v_0
|
||||
mem := v_1
|
||||
v.reset(OpPPC64DCBT)
|
||||
v.AuxInt = int64ToAuxInt(0)
|
||||
v.AddArg2(ptr, mem)
|
||||
return true
|
||||
}
|
||||
}
|
||||
func rewriteValuePPC64_OpPrefetchCacheStreamed(v *Value) bool {
|
||||
v_1 := v.Args[1]
|
||||
v_0 := v.Args[0]
|
||||
// match: (PrefetchCacheStreamed ptr mem)
|
||||
// result: (DCBT ptr mem [8])
|
||||
for {
|
||||
ptr := v_0
|
||||
mem := v_1
|
||||
v.reset(OpPPC64DCBT)
|
||||
v.AuxInt = int64ToAuxInt(8)
|
||||
v.AddArg2(ptr, mem)
|
||||
return true
|
||||
}
|
||||
}
|
||||
func rewriteValuePPC64_OpRotateLeft16(v *Value) bool {
|
||||
v_1 := v.Args[1]
|
||||
v_0 := v.Args[0]
|
||||
|
|
|
|||
|
|
@ -3897,9 +3897,9 @@ func InitTables() {
|
|||
// Make Prefetch intrinsics for supported platforms
|
||||
// On the unsupported platforms stub function will be eliminated
|
||||
addF("runtime/internal/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache),
|
||||
sys.AMD64, sys.ARM64)
|
||||
sys.AMD64, sys.ARM64, sys.PPC64)
|
||||
addF("runtime/internal/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed),
|
||||
sys.AMD64, sys.ARM64)
|
||||
sys.AMD64, sys.ARM64, sys.PPC64)
|
||||
|
||||
/******** runtime/internal/atomic ********/
|
||||
addF("runtime/internal/atomic", "Load",
|
||||
|
|
|
|||
Loading…
Reference in New Issue