This commit is contained in:
Ryan Berger 2025-06-20 15:37:16 -04:00 committed by GitHub
commit 10ee430ec5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 180 additions and 3 deletions

View File

@ -461,8 +461,8 @@ var passes = [...]pass{
{name: "short circuit", fn: shortcircuit},
{name: "decompose user", fn: decomposeUser, required: true},
{name: "pre-opt deadcode", fn: deadcode},
{name: "opt", fn: opt, required: true}, // NB: some generic rules know the name of the opt pass. TODO: split required rules and optimizing rules
{name: "zero arg cse", fn: zcse, required: true}, // required to merge OpSB values
{name: "opt", fn: opt, required: true}, // NB: some generic rules know the name of the opt pass. TODO: split required rules and optimizing rules
{name: "zero arg cse", fn: zcse, required: true}, // required to merge OpSB values
{name: "opt deadcode", fn: deadcode, required: true}, // remove any blocks orphaned during opt
{name: "generic cse", fn: cse},
{name: "phiopt", fn: phiopt},
@ -482,6 +482,7 @@ var passes = [...]pass{
{name: "check bce", fn: checkbce},
{name: "dse", fn: dse},
{name: "memcombine", fn: memcombine},
{name: "ilp", fn: ilp},
{name: "writebarrier", fn: writebarrier, required: true}, // expand write barrier ops
{name: "insert resched checks", fn: insertLoopReschedChecks,
disabled: !buildcfg.Experiment.PreemptibleLoops}, // insert resched checks in loops.
@ -583,6 +584,8 @@ var passOrder = [...]constraint{
{"late fuse", "memcombine"},
// memcombine is a arch-independent pass.
{"memcombine", "lower"},
// ilp works best after ORs have been combined to loads
{"memcombine", "ilp"},
}
func init() {

View File

@ -0,0 +1,152 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package ssa
// ilp pass (Instruction Level Parallelism) balances trees of associative computation
// to help CPU pipeline instructions more efficiently. It only works block by block
// so that it doesn't end up pulling loop invariant expressions into tight loops
func ilp(f *Func) {
for _, b := range f.Blocks {
for _, v := range findRoots(b) {
rebalance(v)
}
}
}
// isILPOp only returns true if the operation is associative,
// which for our case would be only commutative integer ops.
func isILPOp(o Op) bool {
// if the op isn't a commutative integer op, it won't be associative
if !opcodeTable[o].commutative {
return false
}
// filter out float ops because they are not associative, leaving int ops
switch o {
case OpAdd32F, OpAdd64F, OpMul32F, OpMul64F:
return false
default:
return true
}
}
// findRoots looks for the root of a rebalanceable expressions.
// It does this by building a poor man's def-use chain, counting up the uses of
// a given expression as an argument within the block.
// Any roots of rebalanceable expressions should have a count of zero meaning
// they aren't used as arguments in rebalanceable expressions.
func findRoots(b *Block) []*Value {
uses := make(map[*Value]int)
candidates := make([]*Value, 0)
roots := make([]*Value, 0)
for _, v := range b.Values {
if !isILPOp(v.Op) {
continue
}
// could be a possible root, add it as a candidate to remove later
candidates = append(candidates, v)
// mark the arguments of the expression as being used
// if they are also a rebalanceable op (making them a non-root node)
if isILPOp(v.Args[0].Op) && v.Op == v.Args[0].Op {
uses[v.Args[0]]++
}
if isILPOp(v.Args[1].Op) && v.Op == v.Args[1].Op {
uses[v.Args[1]]++
}
}
for _, c := range candidates {
if uses[c] == 0 {
roots = append(roots, c)
}
}
return roots
}
// balanceExprTree repurposes all nodes and leaves into a well-balanced expression tree.
// It doesn't truly balance the tree in the sense of a BST, rather it
// prioritizes pairing up innermost (rightmost) expressions and their results and only
// pairing results of outermost (leftmost) expressions up with them when no other nice pairing exists
// TODO(ryan-berger): implement Huffman Tree-Height Reduction instead?
func balanceExprTree(nodes, leaves []*Value) {
// reset all arguments of nodes to reuse them
for _, n := range nodes {
n.reset(n.Op)
}
// rebuild expression trees from the bottom up, prioritizing right grouping.
// if the number of leaves is not even, skip the first leaf
// and add it to be paired up later
subTrees := leaves
i := len(nodes)-1
for len(subTrees) != 1 {
nextSubTrees := subTrees[:0]
start := len(subTrees) % 2
if start != 0 {
nextSubTrees = append(nextSubTrees, subTrees[0])
}
// pair leaves using the last nodes and move towards nodes[0] which is the root
for j := start; j < len(subTrees)-1; j += 2 {
nodes[i].AddArgs(subTrees[j], subTrees[j+1])
nextSubTrees = append(nextSubTrees, nodes[i])
i--
}
subTrees = nextSubTrees
}
}
// rebalance balances associative computation to better help CPU instruction pipelining (#49331)
//
// a + b + c + d compiles to v1:(a + v2:(b + v3:(c + d)) which is an unbalanced expression tree.
// It is suboptimal since it requires the CPU to compute v3 before fetching it use its result in
// v2, and v2 before its use in v1
//
// This optimization rebalances this expression tree to look like v1:(v2:(a + b) + v3:(c + d)),
// which removes such dependencies and frees up the CPU pipeline.
func rebalance(v *Value) {
// The smallest possible rebalanceable binary expression has 3 nodes and 4 leaves,
// so preallocate the lists to save time if it is not rebalanceable
leaves := make([]*Value, 0, 4)
nodes := make([]*Value, 0, 3)
// Do a bfs on v to keep a nice reverse topological order
haystack := []*Value{v}
for i := 0; i < len(haystack); i++ {
needle := haystack[i]
// if we are searching a value, it must be a node so add it to our node list
nodes = append(nodes, needle)
for _, a := range needle.Args {
// If the ops aren't the same, have more than one use, or not in the same BB it must be a leaf.
if a.Op != v.Op || a.Uses != 1 || a.Block != v.Block {
leaves = append(leaves, a)
continue
}
// nodes in the tree now hold the invariants that:
// - they are of a common associative operation as the rest of the tree
// - they have only a single use
// - they are in the same basic block
haystack = append(haystack, a)
}
}
// we need at least 3 nodes (root, two children) and 4 leaves
// for this expression to be rebalanceable
if len(nodes) < 3 || len(leaves) < 4 {
return
}
balanceExprTree(nodes, leaves)
}

View File

@ -508,7 +508,7 @@ func (v *Value) moveTo(dst *Block, i int) {
}
src := v.Block
if src.Values[i] != v {
v.Fatalf("moveTo bad index %d", v, i)
v.Fatalf("moveTo bad index %d, src.Values[i] = %v, expected %v", i, src.Values[i], v)
}
if src == dst {
return

View File

@ -0,0 +1,22 @@
// asmcheck
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package codegen
// reassociateAddition expects very specific sequence of registers
// of the form:
// R2 += R3
// R1 += R0
// R1 += R2
func reassociateAddition(a, b, c, d int) int {
// arm64:`ADD\tR2,\sR3,\sR2`
x := b + a
// arm64:`ADD\tR0,\sR1,\sR1`
y := x + c
// arm64:`ADD\tR1,\sR2,\sR0`
z := y + d
return z
}