diff --git a/src/cmd/compile/internal/ssa/compile.go b/src/cmd/compile/internal/ssa/compile.go
index e9500a24ed..7698326f8d 100644
--- a/src/cmd/compile/internal/ssa/compile.go
+++ b/src/cmd/compile/internal/ssa/compile.go
@@ -482,6 +482,7 @@ var passes = [...]pass{
 	{name: "check bce", fn: checkbce},
 	{name: "dse", fn: dse},
 	{name: "memcombine", fn: memcombine},
+	{name: "ilp", fn: ilp},
 	{name: "writebarrier", fn: writebarrier, required: true}, // expand write barrier ops
 	{name: "insert resched checks", fn: insertLoopReschedChecks,
 		disabled: !buildcfg.Experiment.PreemptibleLoops}, // insert resched checks in loops.
@@ -583,6 +584,8 @@ var passOrder = [...]constraint{
 	{"late fuse", "memcombine"},
 	// memcombine is a arch-independent pass.
 	{"memcombine", "lower"},
+	// ilp works best after ORs have been combined to loads
+	{"memcombine", "ilp"},
 }
 
 func init() {
diff --git a/src/cmd/compile/internal/ssa/ilp.go b/src/cmd/compile/internal/ssa/ilp.go
new file mode 100644
index 0000000000..205f52bf83
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/ilp.go
@@ -0,0 +1,163 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package ssa
+
+// ilp (Instruction Level Parallelism) rebalances trees of associative integer
+// computation so that the CPU can pipeline the instructions more efficiently.
+// It works block by block so that it does not end up pulling loop invariant
+// expressions into tight loops.
+func ilp(f *Func) {
+	for _, b := range f.Blocks {
+		for _, v := range findRoots(b) {
+			rebalance(v)
+		}
+	}
+}
+
+// isILPOp reports whether o is an associative operation whose expression
+// trees may be rebalanced.
+//
+// The commutative flag in opcodeTable is not a sufficient test because
+// commutativity does not imply associativity: ops such as Hmul64 are marked
+// commutative but are not associative. Only the wrapping integer add and
+// multiply ops and the bitwise and/or/xor ops are accepted. Float add and
+// multiply are rejected because rounding makes them non-associative.
+func isILPOp(o Op) bool {
+	switch o {
+	case OpAdd8, OpAdd16, OpAdd32, OpAdd64,
+		OpMul8, OpMul16, OpMul32, OpMul64,
+		OpAnd8, OpAnd16, OpAnd32, OpAnd64,
+		OpOr8, OpOr16, OpOr32, OpOr64,
+		OpXor8, OpXor16, OpXor32, OpXor64:
+		return true
+	}
+	return false
+}
+
+// findRoots returns the roots of the rebalanceable expression trees in b.
+//
+// A value is an interior node of a tree when it meets the conditions that
+// rebalance uses to descend into an argument: it has the same op as its user,
+// it has exactly one use, and it is in the same block. Every other
+// rebalanceable value in b is a root. In particular, a subexpression with
+// several uses is a leaf of the trees that use it and the root of its own.
+func findRoots(b *Block) []*Value {
+	interior := make(map[*Value]bool)
+	var candidates []*Value
+
+	for _, v := range b.Values {
+		if !isILPOp(v.Op) {
+			continue
+		}
+
+		// v may be a root; interior nodes are filtered out below because
+		// b.Values is not ordered and a user may come after its argument.
+		candidates = append(candidates, v)
+
+		for _, a := range v.Args {
+			if a.Op == v.Op && a.Uses == 1 && a.Block == b {
+				interior[a] = true
+			}
+		}
+	}
+
+	var roots []*Value
+	for _, c := range candidates {
+		if !interior[c] {
+			roots = append(roots, c)
+		}
+	}
+
+	return roots
+}
+
+// balanceExprTree rewires nodes and leaves into a balanced expression tree.
+// nodes[0] must be the root and, for binary ops, len(nodes) must equal
+// len(leaves)-1. All nodes share one op, so they are interchangeable and are
+// reused in place.
+//
+// The tree is not balanced in the sense of a BST: subtrees are paired up level
+// by level, and when a level has an odd number of subtrees the first one is
+// carried over unpaired to the next level.
+// TODO(ryan-berger): implement Huffman Tree-Height Reduction instead?
+func balanceExprTree(nodes, leaves []*Value) {
+	// Drop the arguments of every node so that the nodes can be reused.
+	for _, n := range nodes {
+		n.reset(n.Op)
+	}
+
+	// Rebuild the tree from the bottom up. Nodes are taken from the end of
+	// the list so that nodes[0], the root, is used for the final pairing.
+	// nextSubTrees reuses the backing array of subTrees, which is safe because
+	// it never grows past the part of subTrees that has already been read.
+	subTrees := leaves
+	i := len(nodes) - 1
+	for len(subTrees) > 1 {
+		nextSubTrees := subTrees[:0]
+
+		// With an odd count, carry the first subtree over to the next level.
+		start := len(subTrees) % 2
+		if start != 0 {
+			nextSubTrees = append(nextSubTrees, subTrees[0])
+		}
+
+		for j := start; j < len(subTrees)-1; j += 2 {
+			nodes[i].AddArgs(subTrees[j], subTrees[j+1])
+			nextSubTrees = append(nextSubTrees, nodes[i])
+			i--
+		}
+
+		subTrees = nextSubTrees
+	}
+}
+
+// rebalance balances the tree of associative computation rooted at v to help
+// CPU instruction pipelining (#49331).
+//
+// A chain such as v1:(v2:(v3:(a + b) + c) + d) is an unbalanced expression
+// tree. It is suboptimal because the CPU has to compute v3 before it can use
+// its result in v2, and v2 before its use in v1.
+//
+// This optimization rebalances the tree to look like
+// v1:(v2:(a + b) + v3:(c + d)), which removes the dependency between v2 and
+// v3 and frees up the CPU pipeline.
+func rebalance(v *Value) {
+	// The smallest possible rebalanceable binary expression has 3 nodes and 4 leaves,
+	// so preallocate the lists to save time if it is not rebalanceable.
+	leaves := make([]*Value, 0, 4)
+	nodes := make([]*Value, 0, 3)
+
+	// Do a BFS from v so that the root comes first and parents precede children.
+	haystack := []*Value{v}
+	for i := 0; i < len(haystack); i++ {
+		needle := haystack[i]
+
+		// Every value that is searched is a node of the tree.
+		nodes = append(nodes, needle)
+
+		for _, a := range needle.Args {
+			// If the op differs, a has more than one use, or a is in another
+			// block, it must be a leaf.
+			if a.Op != v.Op || a.Uses != 1 || a.Block != v.Block {
+				leaves = append(leaves, a)
+				continue
+			}
+
+			// a is part of the tree, so it holds the invariants that:
+			// - it has the same associative op as the rest of the tree
+			// - it has only a single use
+			// - it is in the same basic block
+			haystack = append(haystack, a)
+		}
+	}
+
+	// We need at least 3 nodes (root, two children) and 4 leaves
+	// for this expression to be rebalanceable.
+	if len(nodes) < 3 || len(leaves) < 4 {
+		return
+	}
+
+	balanceExprTree(nodes, leaves)
+}
diff --git a/src/cmd/compile/internal/ssa/shortcircuit.go b/src/cmd/compile/internal/ssa/shortcircuit.go
index b86596026b..a224ebbbbe 100644
--- a/src/cmd/compile/internal/ssa/shortcircuit.go
+++ b/src/cmd/compile/internal/ssa/shortcircuit.go
@@ -508,7 +508,7 @@ func (v *Value) moveTo(dst *Block, i int) {
 	}
 	src := v.Block
 	if src.Values[i] != v {
-		v.Fatalf("moveTo bad index %d", v, i)
+		v.Fatalf("moveTo bad index %d, src.Values[i] = %v, expected %v", i, src.Values[i], v)
 	}
 	if src == dst {
 		return
diff --git a/test/codegen/reassociate.go b/test/codegen/reassociate.go
new file mode 100644
index 0000000000..9fa795b530
--- /dev/null
+++ b/test/codegen/reassociate.go
@@ -0,0 +1,22 @@
+// asmcheck
+
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package codegen
+
+// reassociateAddition expects a very specific sequence of registers
+// of the form:
+// R2 += R3
+// R1 += R0
+// R0 = R2 + R1
+func reassociateAddition(a, b, c, d int) int {
+	// arm64:`ADD\tR2,\sR3,\sR2`
+	x := b + a
+	// arm64:`ADD\tR0,\sR1,\sR1`
+	y := x + c
+	// arm64:`ADD\tR1,\sR2,\sR0`
+	z := y + d
+	return z
+}