From 15729abf855fcd1e92c277e29c23a7257a1fe6f9 Mon Sep 17 00:00:00 2001 From: Ryan Berger Date: Wed, 19 Apr 2023 22:13:04 -0600 Subject: [PATCH 1/9] cmd/compile: add reassociate ssa pass to rebalance commutative operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the compiler groups expressions with commutative operations such as a + b + c + d as so: (a + (b + (c + d))) which is suboptimal for CPU instruction pipelining. This pass balances commutative expressions as shown above to (a + b) + (c + d) to optimally pipeline them. It also attempts to reassociate constants as far to the right of the commutative expression as possible for better constant folding opportunities. Below is a benchmark from crypto/md5 on a MacBook Pro M2: trunk reassociate Hash1K-8 433.7Mi ± 0% 499.4Mi ± 4% +15.17% (p=0.000 n=10) Hash8K-8 454.3Mi ± 1% 524.9Mi ± 1% +15.53% (p=0.000 n=10) .... geomean 284.4Mi 327.5Mi +15.15% Other CPU architectures tried showed very little change (+/-1%) on this particular benchmark but tight mathematical code stands to gain greatly from this optimization. Fixes #49331 --- src/cmd/compile/internal/ssa/compile.go | 5 +- src/cmd/compile/internal/ssa/reassociate.go | 190 +++++++++++++++++++ src/cmd/compile/internal/ssa/shortcircuit.go | 2 +- 3 files changed, 194 insertions(+), 3 deletions(-) create mode 100644 src/cmd/compile/internal/ssa/reassociate.go diff --git a/src/cmd/compile/internal/ssa/compile.go b/src/cmd/compile/internal/ssa/compile.go index d125891f88..b67c0115de 100644 --- a/src/cmd/compile/internal/ssa/compile.go +++ b/src/cmd/compile/internal/ssa/compile.go @@ -463,8 +463,9 @@ var passes = [...]pass{ {name: "short circuit", fn: shortcircuit}, {name: "decompose user", fn: decomposeUser, required: true}, {name: "pre-opt deadcode", fn: deadcode}, - {name: "opt", fn: opt, required: true}, // NB: some generic rules know the name of the opt pass. 
TODO: split required rules and optimizing rules - {name: "zero arg cse", fn: zcse, required: true}, // required to merge OpSB values + {name: "opt", fn: opt, required: true}, // NB: some generic rules know the name of the opt pass. TODO: split required rules and optimizing rules + {name: "zero arg cse", fn: zcse, required: true}, // required to merge OpSB values + {name: "reassociate", fn: reassociate}, {name: "opt deadcode", fn: deadcode, required: true}, // remove any blocks orphaned during opt {name: "generic cse", fn: cse}, {name: "phiopt", fn: phiopt}, diff --git a/src/cmd/compile/internal/ssa/reassociate.go b/src/cmd/compile/internal/ssa/reassociate.go new file mode 100644 index 0000000000..42a5e4f135 --- /dev/null +++ b/src/cmd/compile/internal/ssa/reassociate.go @@ -0,0 +1,190 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package ssa + +import ( + "fmt" + "sort" +) + +// balanceExprTree repurposes all nodes and leafs into a +// balanced expression tree +func balanceExprTree(v *Value, visited map[*Value]bool, nodes, leafs []*Value) { + // reset all arguments of nodes to help rebalancing + for i, n := range nodes { + n.reset(n.Op) + + // sometimes nodes in the tree are in different blocks + // so pull them in into a common block (v's block) + // to make sure nodes don't end up dominating their leaves + if v.Block != n.Block { + copied := n.copyInto(v.Block) + n.Op = OpInvalid + visited[n] = true // "revisit" the copied node + nodes[i] = copied + } + } + + // we bfs'ed through the nodes in reverse topological order + // (expression dominated by all others to expression dominated by none of the others), + // we want to rebuild the tree reverse topological order + for i, j := 0, len(nodes)-1; i <= j; i, j = i+1, j-1 { + nodes[i], nodes[j] = nodes[j], nodes[i] + } + + // push all leafs which are constants as far off to the + // right as 
possible to give the constant folder more opportunities + sort.Slice(leafs, func(i, j int) bool { + switch leafs[j].Op { + case OpConst8, OpConst16, OpConst32, OpConst64: + return false + default: + return true + } + }) + + // build tree in reverse topological order + for i := 0; i < len(nodes); i++ { + if len(leafs) < 2 { // we need at least two leafs per node, balance went very wrong + panic(fmt.Sprint("leafs needs to be >= 2, got", len(leafs))) + } + + // Take two leaves out and attach them to a node, + // use the node as a new leaf in the "next layer" of the tree + nodes[i].AddArg2(leafs[0], leafs[1]) + leafs = append(leafs[2:], nodes[i]) + } +} + +func isOr(op Op) bool { + switch op { + case OpOr8, OpOr16, OpOr32, OpOr64: + return true + default: + return false + } +} + +// probablyMemcombine helps find a pattern of leaves that form +// a load that can be widened which looks like: +// +// (l | l << 8 | l << 18 | l << 24) +// +// which cannot be rebalanced or else it won't fire rewrite rules +func probablyMemcombine(op Op, leafs []*Value) bool { + if !isOr(op) { + return false + } + + lshCount := 0 + for _, l := range leafs { + switch l.Op { + case OpLsh8x8, OpLsh8x16, OpLsh8x32, OpLsh8x64, + OpLsh16x8, OpLsh16x16, OpLsh16x32, OpLsh16x64, + OpLsh32x8, OpLsh32x16, OpLsh32x32, OpLsh32x64, + OpLsh64x8, OpLsh64x16, OpLsh64x32, OpLsh64x64: + lshCount++ + } + } + + return lshCount == len(leafs)-1 +} + +// rebalance balances associative computation to better help CPU instruction pipelining (#49331) +// and groups constants together catch more constant folding opportunities. 
+// +// a + b + c + d compiles to to (a + (b + (c + d)) which is an unbalanced expression tree +// that looks like +// +// - (v1) +// / \ +// a + (v2) +// / \ +// b + (v3) +// / \ +// c d +// +// Which is suboptimal since it requires the CPU to compute v3 before fetching it use its result in +// v2, and v2 before its use in v1 +// This optimization rebalances the expression tree to: +// +// - (v1) +// / \ +// +// (v2) + + (v3) +// +// / \ / \ +// a b c d +// +// Which removes such dependencies and frees up the CPU pipeline. +// +// The above optimization is a good starting point for other sorts of operations such as +// turning a + a + a => 3*a, cancelling pairs a + (-a), collecting up common factors TODO(ryan-berger) +func rebalance(v *Value, visited map[*Value]bool) { + // We cannot apply this optimization to non-commutative operations, + // values that have more than one use, or non-binary ops (would need more log math). + // Try and save time by not revisiting nodes + if visited[v] || !(v.Uses == 1 && opcodeTable[v.Op].commutative) || len(v.Args) > 2 { + return + } + + // The smallest possible rebalanceable expression has 3 nodes and 4 leafs, + // so preallocate the lists to save time if it is not rebalanceable + leafs := make([]*Value, 0, 4) + nodes := make([]*Value, 0, 3) + + // Do a bfs on v to keep a nice reverse topological order + haystack := []*Value{v} + for len(haystack) != 0 { + nextHaystack := make([]*Value, 0, len(v.Args)*len(haystack)) + for _, needle := range haystack { + // if we are searching a value, it must be a node so add it to our node list + nodes = append(nodes, needle) + + // Only visit nodes. Leafs may be rebalancable for a different op type + visited[needle] = true + + for _, a := range needle.Args { + // If the ops aren't the same or have more than one use it must be a leaf. 
+ if a.Op != v.Op || a.Uses != 1 { + leafs = append(leafs, a) + continue + } + + // nodes in the tree now hold the invariants that: + // - they are of a common associative operation as the rest of the tree + // - they have only a single use (this invariant could be removed with further analysis TODO(ryan-berger) + nextHaystack = append(nextHaystack, a) + } + } + haystack = nextHaystack + } + + // we need at least 4 leafs for this expression to be rebalanceable, + // and we can't balance a potential load widening (memcombine) + if len(leafs) < 4 || probablyMemcombine(v.Op, leafs) { + return + } + + balanceExprTree(v, visited, nodes, leafs) +} + +// reassociate balances trees of commutative computation +// to better group expressions for better constant folding, +// cse, etc. +func reassociate(f *Func) { + visited := make(map[*Value]bool) + + for _, b := range f.Postorder() { + for i := len(b.Values) - 1; i >= 0; i-- { + val := b.Values[i] + rebalance(val, visited) + } + } + + for k := range visited { + delete(visited, k) + } +} diff --git a/src/cmd/compile/internal/ssa/shortcircuit.go b/src/cmd/compile/internal/ssa/shortcircuit.go index d7d0b6fe33..9ee55941da 100644 --- a/src/cmd/compile/internal/ssa/shortcircuit.go +++ b/src/cmd/compile/internal/ssa/shortcircuit.go @@ -499,7 +499,7 @@ func (v *Value) moveTo(dst *Block, i int) { } src := v.Block if src.Values[i] != v { - v.Fatalf("moveTo bad index %d", v, i) + v.Fatalf("moveTo bad index %d", i) } if src == dst { return From 47986f1517347fd82bd6b8a84b8fbc6f7b03af05 Mon Sep 17 00:00:00 2001 From: Ryan Berger Date: Fri, 5 May 2023 16:38:50 -0600 Subject: [PATCH 2/9] remove badly fmt'd ASCII art from comments --- src/cmd/compile/internal/ssa/reassociate.go | 22 ++------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/src/cmd/compile/internal/ssa/reassociate.go b/src/cmd/compile/internal/ssa/reassociate.go index 42a5e4f135..56d3c31132 100644 --- a/src/cmd/compile/internal/ssa/reassociate.go +++ 
b/src/cmd/compile/internal/ssa/reassociate.go @@ -96,29 +96,11 @@ func probablyMemcombine(op Op, leafs []*Value) bool { // and groups constants together catch more constant folding opportunities. // // a + b + c + d compiles to to (a + (b + (c + d)) which is an unbalanced expression tree -// that looks like -// -// - (v1) -// / \ -// a + (v2) -// / \ -// b + (v3) -// / \ -// c d -// // Which is suboptimal since it requires the CPU to compute v3 before fetching it use its result in // v2, and v2 before its use in v1 -// This optimization rebalances the expression tree to: // -// - (v1) -// / \ -// -// (v2) + + (v3) -// -// / \ / \ -// a b c d -// -// Which removes such dependencies and frees up the CPU pipeline. +// This optimization rebalances this expression tree to look like (a + b) + (c + d) , +// which removes such dependencies and frees up the CPU pipeline. // // The above optimization is a good starting point for other sorts of operations such as // turning a + a + a => 3*a, cancelling pairs a + (-a), collecting up common factors TODO(ryan-berger) From 26f212799ffbdc587174b5c80372721648fc664d Mon Sep 17 00:00:00 2001 From: Ryan Berger Date: Sun, 7 May 2023 21:56:06 -0600 Subject: [PATCH 3/9] add basic codegen test, fix binary tree rebalancing --- src/cmd/compile/internal/ssa/reassociate.go | 63 +++++++++++---------- test/codegen/reassociate.go | 18 ++++++ 2 files changed, 50 insertions(+), 31 deletions(-) create mode 100644 test/codegen/reassociate.go diff --git a/src/cmd/compile/internal/ssa/reassociate.go b/src/cmd/compile/internal/ssa/reassociate.go index 56d3c31132..e11f5b7bb1 100644 --- a/src/cmd/compile/internal/ssa/reassociate.go +++ b/src/cmd/compile/internal/ssa/reassociate.go @@ -4,13 +4,10 @@ package ssa -import ( - "fmt" - "sort" -) - -// balanceExprTree repurposes all nodes and leafs into a -// balanced expression tree +// balanceExprTree repurposes all nodes and leafs into a well-balanced expression tree. 
+// It doesn't truly balance the tree in the sense of a BST, rather it +// prioritizes pairing up innermost (rightmost) expressions and their results and only +// pairing results of outermost (leftmost) expressions up with them when no other nice pairing exists func balanceExprTree(v *Value, visited map[*Value]bool, nodes, leafs []*Value) { // reset all arguments of nodes to help rebalancing for i, n := range nodes { @@ -34,27 +31,27 @@ func balanceExprTree(v *Value, visited map[*Value]bool, nodes, leafs []*Value) { nodes[i], nodes[j] = nodes[j], nodes[i] } - // push all leafs which are constants as far off to the - // right as possible to give the constant folder more opportunities - sort.Slice(leafs, func(i, j int) bool { - switch leafs[j].Op { - case OpConst8, OpConst16, OpConst32, OpConst64: - return false - default: - return true + // rebuild expression trees from the bottom up, prioritizing + // right grouping. + // if the number of leaves is not even, skip the first leaf + // and add it to be paired up later + i := 0 + subTrees := leafs + for len(subTrees) != 1 { + nextSubTrees := make([]*Value, 0, (len(subTrees)+1)/2) + + start := len(subTrees)%2 + if start != 0 { + nextSubTrees = append(nextSubTrees, subTrees[0]) } - }) - - // build tree in reverse topological order - for i := 0; i < len(nodes); i++ { - if len(leafs) < 2 { // we need at least two leafs per node, balance went very wrong - panic(fmt.Sprint("leafs needs to be >= 2, got", len(leafs))) + + for j := start; j < len(subTrees)-1; j+=2 { + nodes[i].AddArg2(subTrees[j], subTrees[j+1]) + nextSubTrees = append(nextSubTrees, nodes[i]) + i++ } - - // Take two leaves out and attach them to a node, - // use the node as a new leaf in the "next layer" of the tree - nodes[i].AddArg2(leafs[0], leafs[1]) - leafs = append(leafs[2:], nodes[i]) + + subTrees = nextSubTrees } } @@ -72,7 +69,7 @@ func isOr(op Op) bool { // // (l | l << 8 | l << 18 | l << 24) // -// which cannot be rebalanced or else it won't fire 
rewrite rules +// which cannot be rebalanced or else it won't fire load widening rewrite rules func probablyMemcombine(op Op, leafs []*Value) bool { if !isOr(op) { return false @@ -89,7 +86,11 @@ func probablyMemcombine(op Op, leafs []*Value) bool { } } - return lshCount == len(leafs)-1 + // there are a few algorithms in the std lib expressed as two 32 bit loads + // which can get turned into a 64 bit load + // conservatively estimate that if there are more shifts than not then it is + // some sort of load waiting to be widened + return lshCount > len(leafs)/2 } // rebalance balances associative computation to better help CPU instruction pipelining (#49331) @@ -145,7 +146,7 @@ func rebalance(v *Value, visited map[*Value]bool) { } // we need at least 4 leafs for this expression to be rebalanceable, - // and we can't balance a potential load widening (memcombine) + // and we can't balance a potential load widening (see memcombine) if len(leafs) < 4 || probablyMemcombine(v.Op, leafs) { return } @@ -154,8 +155,8 @@ func rebalance(v *Value, visited map[*Value]bool) { } // reassociate balances trees of commutative computation -// to better group expressions for better constant folding, -// cse, etc. +// to better group expressions to expose easy optimizations in +// cse, cancelling/counting/factoring expressions, etc. 
func reassociate(f *Func) { visited := make(map[*Value]bool) diff --git a/test/codegen/reassociate.go b/test/codegen/reassociate.go new file mode 100644 index 0000000000..1309d9dc0a --- /dev/null +++ b/test/codegen/reassociate.go @@ -0,0 +1,18 @@ +// asmcheck + +package codegen + +// reassociateAddition expects very specific sequence of registers +// of the form: +// R2 += R3 +// R1 += R0 +// R1 += R2 +func reassociateAddition(a, b, c, d int) int { + // arm64:`ADD\tR2,\sR3,\sR2` + x := b + a + // arm64:`ADD\tR0,\sR1,\sR1` + y := x + c + // arm64:`ADD\tR1,\sR2,\sR0` + z := y + d + return z +} \ No newline at end of file From 9d1cc219b84a06302b9850cc51a9ca5050c5bb7c Mon Sep 17 00:00:00 2001 From: Ryan Berger Date: Mon, 8 May 2023 08:14:03 -0600 Subject: [PATCH 4/9] use correct plural of leaf, fix license file --- src/cmd/compile/internal/ssa/reassociate.go | 24 ++++++++++----------- test/codegen/reassociate.go | 4 ++++ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/cmd/compile/internal/ssa/reassociate.go b/src/cmd/compile/internal/ssa/reassociate.go index e11f5b7bb1..3363d778df 100644 --- a/src/cmd/compile/internal/ssa/reassociate.go +++ b/src/cmd/compile/internal/ssa/reassociate.go @@ -4,11 +4,11 @@ package ssa -// balanceExprTree repurposes all nodes and leafs into a well-balanced expression tree. +// balanceExprTree repurposes all nodes and leaves into a well-balanced expression tree. 
// It doesn't truly balance the tree in the sense of a BST, rather it // prioritizes pairing up innermost (rightmost) expressions and their results and only // pairing results of outermost (leftmost) expressions up with them when no other nice pairing exists -func balanceExprTree(v *Value, visited map[*Value]bool, nodes, leafs []*Value) { +func balanceExprTree(v *Value, visited map[*Value]bool, nodes, leaves []*Value) { // reset all arguments of nodes to help rebalancing for i, n := range nodes { n.reset(n.Op) @@ -36,7 +36,7 @@ func balanceExprTree(v *Value, visited map[*Value]bool, nodes, leafs []*Value) { // if the number of leaves is not even, skip the first leaf // and add it to be paired up later i := 0 - subTrees := leafs + subTrees := leaves for len(subTrees) != 1 { nextSubTrees := make([]*Value, 0, (len(subTrees)+1)/2) @@ -70,13 +70,13 @@ func isOr(op Op) bool { // (l | l << 8 | l << 18 | l << 24) // // which cannot be rebalanced or else it won't fire load widening rewrite rules -func probablyMemcombine(op Op, leafs []*Value) bool { +func probablyMemcombine(op Op, leaves []*Value) bool { if !isOr(op) { return false } lshCount := 0 - for _, l := range leafs { + for _, l := range leaves { switch l.Op { case OpLsh8x8, OpLsh8x16, OpLsh8x32, OpLsh8x64, OpLsh16x8, OpLsh16x16, OpLsh16x32, OpLsh16x64, @@ -90,7 +90,7 @@ func probablyMemcombine(op Op, leafs []*Value) bool { // which can get turned into a 64 bit load // conservatively estimate that if there are more shifts than not then it is // some sort of load waiting to be widened - return lshCount > len(leafs)/2 + return lshCount > len(leaves)/2 } // rebalance balances associative computation to better help CPU instruction pipelining (#49331) @@ -113,9 +113,9 @@ func rebalance(v *Value, visited map[*Value]bool) { return } - // The smallest possible rebalanceable expression has 3 nodes and 4 leafs, + // The smallest possible rebalanceable expression has 3 nodes and 4 leaves, // so preallocate the lists to save 
time if it is not rebalanceable - leafs := make([]*Value, 0, 4) + leaves := make([]*Value, 0, 4) nodes := make([]*Value, 0, 3) // Do a bfs on v to keep a nice reverse topological order @@ -132,7 +132,7 @@ func rebalance(v *Value, visited map[*Value]bool) { for _, a := range needle.Args { // If the ops aren't the same or have more than one use it must be a leaf. if a.Op != v.Op || a.Uses != 1 { - leafs = append(leafs, a) + leaves = append(leaves, a) continue } @@ -145,13 +145,13 @@ func rebalance(v *Value, visited map[*Value]bool) { haystack = nextHaystack } - // we need at least 4 leafs for this expression to be rebalanceable, + // we need at least 4 leaves for this expression to be rebalanceable, // and we can't balance a potential load widening (see memcombine) - if len(leafs) < 4 || probablyMemcombine(v.Op, leafs) { + if len(leaves) < 4 || probablyMemcombine(v.Op, leaves) { return } - balanceExprTree(v, visited, nodes, leafs) + balanceExprTree(v, visited, nodes, leaves) } // reassociate balances trees of commutative computation diff --git a/test/codegen/reassociate.go b/test/codegen/reassociate.go index 1309d9dc0a..9fa795b530 100644 --- a/test/codegen/reassociate.go +++ b/test/codegen/reassociate.go @@ -1,5 +1,9 @@ // asmcheck +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ package codegen // reassociateAddition expects very specific sequence of registers From 41c68cbddf067f50563d375eb8dcb74eee698d9f Mon Sep 17 00:00:00 2001 From: Ryan Berger Date: Mon, 8 May 2023 15:16:56 -0600 Subject: [PATCH 5/9] simplify bfs, other code review comments --- src/cmd/compile/internal/ssa/reassociate.go | 106 +++++++++----------- 1 file changed, 50 insertions(+), 56 deletions(-) diff --git a/src/cmd/compile/internal/ssa/reassociate.go b/src/cmd/compile/internal/ssa/reassociate.go index 3363d778df..2a6a23776e 100644 --- a/src/cmd/compile/internal/ssa/reassociate.go +++ b/src/cmd/compile/internal/ssa/reassociate.go @@ -4,22 +4,36 @@ package ssa +// reassociate balances trees of commutative computation +// to better group expressions to expose easy optimizations in +// cse, cancelling/counting/factoring expressions, etc. +func reassociate(f *Func) { + visited := f.newSparseSet(f.NumValues()) + + for _, b := range f.Postorder() { + for i := len(b.Values) - 1; i >= 0; i-- { + val := b.Values[i] + rebalance(val, visited) + } + } +} + // balanceExprTree repurposes all nodes and leaves into a well-balanced expression tree. 
-// It doesn't truly balance the tree in the sense of a BST, rather it +// It doesn't truly balance the tree in the sense of a BST, rather it // prioritizes pairing up innermost (rightmost) expressions and their results and only -// pairing results of outermost (leftmost) expressions up with them when no other nice pairing exists -func balanceExprTree(v *Value, visited map[*Value]bool, nodes, leaves []*Value) { +// pairing results of outermost (leftmost) expressions up with them when no other nice pairing exists +func balanceExprTree(v *Value, visited *sparseSet, nodes, leaves []*Value) { // reset all arguments of nodes to help rebalancing for i, n := range nodes { n.reset(n.Op) // sometimes nodes in the tree are in different blocks // so pull them in into a common block (v's block) - // to make sure nodes don't end up dominating their leaves + // to make sure nodes don't end up dominating their leaves TODO(ryan-berger), not necessary if v.Block != n.Block { copied := n.copyInto(v.Block) n.Op = OpInvalid - visited[n] = true // "revisit" the copied node + visited.add(copied.ID) // "revisit" the copied node nodes[i] = copied } } @@ -33,24 +47,24 @@ func balanceExprTree(v *Value, visited map[*Value]bool, nodes, leaves []*Value) // rebuild expression trees from the bottom up, prioritizing // right grouping. 
- // if the number of leaves is not even, skip the first leaf + // if the number of leaves is not even, skip the first leaf // and add it to be paired up later i := 0 subTrees := leaves for len(subTrees) != 1 { nextSubTrees := make([]*Value, 0, (len(subTrees)+1)/2) - - start := len(subTrees)%2 + + start := len(subTrees) % 2 if start != 0 { nextSubTrees = append(nextSubTrees, subTrees[0]) } - - for j := start; j < len(subTrees)-1; j+=2 { + + for j := start; j < len(subTrees)-1; j += 2 { nodes[i].AddArg2(subTrees[j], subTrees[j+1]) nextSubTrees = append(nextSubTrees, nodes[i]) i++ } - + subTrees = nextSubTrees } } @@ -96,78 +110,58 @@ func probablyMemcombine(op Op, leaves []*Value) bool { // rebalance balances associative computation to better help CPU instruction pipelining (#49331) // and groups constants together catch more constant folding opportunities. // -// a + b + c + d compiles to to (a + (b + (c + d)) which is an unbalanced expression tree +// a + b + c + d compiles to to v1:(a + v2:(b + v3:(c + d)) which is an unbalanced expression tree // Which is suboptimal since it requires the CPU to compute v3 before fetching it use its result in // v2, and v2 before its use in v1 // // This optimization rebalances this expression tree to look like (a + b) + (c + d) , // which removes such dependencies and frees up the CPU pipeline. // -// The above optimization is a good starting point for other sorts of operations such as +// The above optimization is also a good starting point for other sorts of operations such as // turning a + a + a => 3*a, cancelling pairs a + (-a), collecting up common factors TODO(ryan-berger) -func rebalance(v *Value, visited map[*Value]bool) { +func rebalance(v *Value, visited *sparseSet) { // We cannot apply this optimization to non-commutative operations, - // values that have more than one use, or non-binary ops (would need more log math). 
// Try and save time by not revisiting nodes - if visited[v] || !(v.Uses == 1 && opcodeTable[v.Op].commutative) || len(v.Args) > 2 { + if visited.contains(v.ID) || !opcodeTable[v.Op].commutative { return } - // The smallest possible rebalanceable expression has 3 nodes and 4 leaves, + // The smallest possible rebalanceable binary expression has 3 nodes and 4 leaves, // so preallocate the lists to save time if it is not rebalanceable leaves := make([]*Value, 0, 4) nodes := make([]*Value, 0, 3) // Do a bfs on v to keep a nice reverse topological order haystack := []*Value{v} - for len(haystack) != 0 { - nextHaystack := make([]*Value, 0, len(v.Args)*len(haystack)) - for _, needle := range haystack { - // if we are searching a value, it must be a node so add it to our node list - nodes = append(nodes, needle) + for i := 0; i < len(haystack); i++ { + needle := haystack[i] + // if we are searching a value, it must be a node so add it to our node list + nodes = append(nodes, needle) - // Only visit nodes. Leafs may be rebalancable for a different op type - visited[needle] = true + // Only visit nodes. Leafs may be rebalancable for a different op type + visited.add(v.ID) - for _, a := range needle.Args { - // If the ops aren't the same or have more than one use it must be a leaf. - if a.Op != v.Op || a.Uses != 1 { - leaves = append(leaves, a) - continue - } - - // nodes in the tree now hold the invariants that: - // - they are of a common associative operation as the rest of the tree - // - they have only a single use (this invariant could be removed with further analysis TODO(ryan-berger) - nextHaystack = append(nextHaystack, a) + for _, a := range needle.Args { + // If the ops aren't the same or have more than one use it must be a leaf. 
+ if a.Op != v.Op || a.Uses != 1 { + leaves = append(leaves, a) + continue } + + // nodes in the tree now hold the invariants that: + // - they are of a common associative operation as the rest of the tree + // - they have only a single use (this invariant could be removed with further analysis TODO(ryan-berger)) + haystack = append(haystack, a) } - haystack = nextHaystack } - // we need at least 4 leaves for this expression to be rebalanceable, + minLeaves := len(v.Args) * len(v.Args) + + // we need at least args^2 leaves for this expression to be rebalanceable, // and we can't balance a potential load widening (see memcombine) - if len(leaves) < 4 || probablyMemcombine(v.Op, leaves) { + if len(leaves) < minLeaves || probablyMemcombine(v.Op, leaves) { return } balanceExprTree(v, visited, nodes, leaves) } - -// reassociate balances trees of commutative computation -// to better group expressions to expose easy optimizations in -// cse, cancelling/counting/factoring expressions, etc. -func reassociate(f *Func) { - visited := make(map[*Value]bool) - - for _, b := range f.Postorder() { - for i := len(b.Values) - 1; i >= 0; i-- { - val := b.Values[i] - rebalance(val, visited) - } - } - - for k := range visited { - delete(visited, k) - } -} From ba4aaa7fdae4e77d13e4ac1398ec95dc67d1dfc2 Mon Sep 17 00:00:00 2001 From: Ryan Berger Date: Wed, 17 May 2023 20:58:32 -0600 Subject: [PATCH 6/9] specialize reassociate to do ilp --- src/cmd/compile/internal/ssa/compile.go | 4 +- .../internal/ssa/{reassociate.go => ilp.go} | 88 ++++--------------- 2 files changed, 21 insertions(+), 71 deletions(-) rename src/cmd/compile/internal/ssa/{reassociate.go => ilp.go} (55%) diff --git a/src/cmd/compile/internal/ssa/compile.go b/src/cmd/compile/internal/ssa/compile.go index b67c0115de..aef62b0543 100644 --- a/src/cmd/compile/internal/ssa/compile.go +++ b/src/cmd/compile/internal/ssa/compile.go @@ -465,7 +465,6 @@ var passes = [...]pass{ {name: "pre-opt deadcode", fn: deadcode}, {name: "opt", 
fn: opt, required: true}, // NB: some generic rules know the name of the opt pass. TODO: split required rules and optimizing rules {name: "zero arg cse", fn: zcse, required: true}, // required to merge OpSB values - {name: "reassociate", fn: reassociate}, {name: "opt deadcode", fn: deadcode, required: true}, // remove any blocks orphaned during opt {name: "generic cse", fn: cse}, {name: "phiopt", fn: phiopt}, @@ -485,6 +484,7 @@ var passes = [...]pass{ {name: "late fuse", fn: fuseLate}, {name: "dse", fn: dse}, {name: "memcombine", fn: memcombine}, + {name: "ilp", fn: ilp}, {name: "writebarrier", fn: writebarrier, required: true}, // expand write barrier ops {name: "insert resched checks", fn: insertLoopReschedChecks, disabled: !buildcfg.Experiment.PreemptibleLoops}, // insert resched checks in loops. @@ -586,6 +586,8 @@ var passOrder = [...]constraint{ {"late fuse", "memcombine"}, // memcombine is a arch-independent pass. {"memcombine", "lower"}, + // ilp works best after ORs have been combined to loads + {"memcombine", "ilp"}, } func init() { diff --git a/src/cmd/compile/internal/ssa/reassociate.go b/src/cmd/compile/internal/ssa/ilp.go similarity index 55% rename from src/cmd/compile/internal/ssa/reassociate.go rename to src/cmd/compile/internal/ssa/ilp.go index 2a6a23776e..fa0bbbd960 100644 --- a/src/cmd/compile/internal/ssa/reassociate.go +++ b/src/cmd/compile/internal/ssa/ilp.go @@ -4,10 +4,10 @@ package ssa -// reassociate balances trees of commutative computation -// to better group expressions to expose easy optimizations in -// cse, cancelling/counting/factoring expressions, etc. -func reassociate(f *Func) { +// ilp pass (Instruction Level Parallelism) balances trees of commutative computation +// to help CPU pipeline instructions more efficiently. 
It only works block by block +// so that it doesn't end up pulling loop invariant expressions into tight loops +func ilp(f *Func) { visited := f.newSparseSet(f.NumValues()) for _, b := range f.Postorder() { @@ -22,20 +22,10 @@ func reassociate(f *Func) { // It doesn't truly balance the tree in the sense of a BST, rather it // prioritizes pairing up innermost (rightmost) expressions and their results and only // pairing results of outermost (leftmost) expressions up with them when no other nice pairing exists -func balanceExprTree(v *Value, visited *sparseSet, nodes, leaves []*Value) { - // reset all arguments of nodes to help rebalancing - for i, n := range nodes { +func balanceExprTree(nodes, leaves []*Value) { + // reset all arguments of nodes to reuse them + for _, n := range nodes { n.reset(n.Op) - - // sometimes nodes in the tree are in different blocks - // so pull them in into a common block (v's block) - // to make sure nodes don't end up dominating their leaves TODO(ryan-berger), not necessary - if v.Block != n.Block { - copied := n.copyInto(v.Block) - n.Op = OpInvalid - visited.add(copied.ID) // "revisit" the copied node - nodes[i] = copied - } } // we bfs'ed through the nodes in reverse topological order @@ -60,7 +50,7 @@ func balanceExprTree(v *Value, visited *sparseSet, nodes, leaves []*Value) { } for j := start; j < len(subTrees)-1; j += 2 { - nodes[i].AddArg2(subTrees[j], subTrees[j+1]) + nodes[i].AddArgs(subTrees[j], subTrees[j+1]) nextSubTrees = append(nextSubTrees, nodes[i]) i++ } @@ -69,46 +59,7 @@ func balanceExprTree(v *Value, visited *sparseSet, nodes, leaves []*Value) { } } -func isOr(op Op) bool { - switch op { - case OpOr8, OpOr16, OpOr32, OpOr64: - return true - default: - return false - } -} - -// probablyMemcombine helps find a pattern of leaves that form -// a load that can be widened which looks like: -// -// (l | l << 8 | l << 18 | l << 24) -// -// which cannot be rebalanced or else it won't fire load widening rewrite rules -func 
probablyMemcombine(op Op, leaves []*Value) bool { - if !isOr(op) { - return false - } - - lshCount := 0 - for _, l := range leaves { - switch l.Op { - case OpLsh8x8, OpLsh8x16, OpLsh8x32, OpLsh8x64, - OpLsh16x8, OpLsh16x16, OpLsh16x32, OpLsh16x64, - OpLsh32x8, OpLsh32x16, OpLsh32x32, OpLsh32x64, - OpLsh64x8, OpLsh64x16, OpLsh64x32, OpLsh64x64: - lshCount++ - } - } - - // there are a few algorithms in the std lib expressed as two 32 bit loads - // which can get turned into a 64 bit load - // conservatively estimate that if there are more shifts than not then it is - // some sort of load waiting to be widened - return lshCount > len(leaves)/2 -} - // rebalance balances associative computation to better help CPU instruction pipelining (#49331) -// and groups constants together catch more constant folding opportunities. // // a + b + c + d compiles to to v1:(a + v2:(b + v3:(c + d)) which is an unbalanced expression tree // Which is suboptimal since it requires the CPU to compute v3 before fetching it use its result in @@ -116,13 +67,12 @@ func probablyMemcombine(op Op, leaves []*Value) bool { // // This optimization rebalances this expression tree to look like (a + b) + (c + d) , // which removes such dependencies and frees up the CPU pipeline. -// -// The above optimization is also a good starting point for other sorts of operations such as -// turning a + a + a => 3*a, cancelling pairs a + (-a), collecting up common factors TODO(ryan-berger) func rebalance(v *Value, visited *sparseSet) { - // We cannot apply this optimization to non-commutative operations, + // We cannot apply this optimization to non-commutative operations. + // We also exclude 3+ arg ops because there are 0 opportunities in the std lib, + // and the benefit for maintenance cost is not currently worth it. 
// Try and save time by not revisiting nodes - if visited.contains(v.ID) || !opcodeTable[v.Op].commutative { + if visited.contains(v.ID) || !opcodeTable[v.Op].commutative || len(v.Args) > 2{ return } @@ -142,26 +92,24 @@ func rebalance(v *Value, visited *sparseSet) { visited.add(v.ID) for _, a := range needle.Args { - // If the ops aren't the same or have more than one use it must be a leaf. - if a.Op != v.Op || a.Uses != 1 { + // If the ops aren't the same, have more than one use, or not in the same BB it must be a leaf. + if a.Op != v.Op || a.Uses != 1 || a.Block != v.Block { leaves = append(leaves, a) continue } // nodes in the tree now hold the invariants that: // - they are of a common associative operation as the rest of the tree - // - they have only a single use (this invariant could be removed with further analysis TODO(ryan-berger)) + // - they have only a single use + // - they are in the same basic block haystack = append(haystack, a) } } - minLeaves := len(v.Args) * len(v.Args) - // we need at least args^2 leaves for this expression to be rebalanceable, - // and we can't balance a potential load widening (see memcombine) - if len(leaves) < minLeaves || probablyMemcombine(v.Op, leaves) { + if len(leaves) < 4 { return } - balanceExprTree(v, visited, nodes, leaves) + balanceExprTree(nodes, leaves) } From 3307a8c64358226437ce44bf925b3bd44117b356 Mon Sep 17 00:00:00 2001 From: Ryan Berger Date: Sat, 9 Sep 2023 15:47:39 -0600 Subject: [PATCH 7/9] wip: redo algorithm to find expression roots --- src/cmd/compile/internal/ssa/ilp.go | 105 +++++++++++++++++++--------- 1 file changed, 71 insertions(+), 34 deletions(-) diff --git a/src/cmd/compile/internal/ssa/ilp.go b/src/cmd/compile/internal/ssa/ilp.go index fa0bbbd960..f6619b6f60 100644 --- a/src/cmd/compile/internal/ssa/ilp.go +++ b/src/cmd/compile/internal/ssa/ilp.go @@ -8,39 +8,84 @@ package ssa // to help CPU pipeline instructions more efficiently. 
It only works block by block // so that it doesn't end up pulling loop invariant expressions into tight loops func ilp(f *Func) { - visited := f.newSparseSet(f.NumValues()) - - for _, b := range f.Postorder() { - for i := len(b.Values) - 1; i >= 0; i-- { - val := b.Values[i] - rebalance(val, visited) + for _, b := range f.Blocks { + for _, v := range findRoots(b) { + rebalance(v) } } } +// isILPOp only returns true if the operation is commutative +// and associative, which for our case would be only commutative integer ops +func isILPOp(o Op) bool { + // if the op isn't commutative it won't be useable for ilp + if !opcodeTable[o].commutative { + return false + } + + // filter out float ops because they are not associative + switch o { + case OpAdd32F, OpAdd64F, OpMul32F, OpMul64F: + return false + default: + return true + } +} + +// findRoots looks for the roots of rebalanceable expressions. +// It does this by building a poor man's def-use chain, counting up the uses of +// a given expression as an argument within the block. +// Any roots of rebalanceable expressions should have a count of zero meaning +// they aren't used as arguments in rebalanceable expressions. +func findRoots(b *Block) []*Value { + uses := make(map[*Value]int) + candidates := make([]*Value, 0) + roots := make([]*Value, 0) + + for _, v := range b.Values { + if !isILPOp(v.Op) { + continue + } + + // could be a possible root, add it as a candidate to remove later + candidates = append(candidates, v) + + // mark the arguments of the expression as being used + // if they are also a rebalanceable op (making them a non-root node) + if isILPOp(v.Args[0].Op) && v.Op == v.Args[0].Op { + uses[v.Args[0]]++ + } + + if isILPOp(v.Args[1].Op) && v.Op == v.Args[1].Op { + uses[v.Args[1]]++ + } + } + + for _, c := range candidates { + if uses[c] == 0 { + roots = append(roots, c) + } + } + + return roots +} + // balanceExprTree repurposes all nodes and leaves into a well-balanced expression tree.
// It doesn't truly balance the tree in the sense of a BST, rather it // prioritizes pairing up innermost (rightmost) expressions and their results and only // pairing results of outermost (leftmost) expressions up with them when no other nice pairing exists +// TODO(ryan-berger): implement Huffman Tree-Height Reduction instead? func balanceExprTree(nodes, leaves []*Value) { // reset all arguments of nodes to reuse them for _, n := range nodes { n.reset(n.Op) } - // we bfs'ed through the nodes in reverse topological order - // (expression dominated by all others to expression dominated by none of the others), - // we want to rebuild the tree reverse topological order - for i, j := 0, len(nodes)-1; i <= j; i, j = i+1, j-1 { - nodes[i], nodes[j] = nodes[j], nodes[i] - } - - // rebuild expression trees from the bottom up, prioritizing - // right grouping. + // rebuild expression trees from the bottom up, prioritizing right grouping. // if the number of leaves is not even, skip the first leaf // and add it to be paired up later - i := 0 subTrees := leaves + i := len(nodes)-1 for len(subTrees) != 1 { nextSubTrees := make([]*Value, 0, (len(subTrees)+1)/2) @@ -49,10 +94,11 @@ func balanceExprTree(nodes, leaves []*Value) { nextSubTrees = append(nextSubTrees, subTrees[0]) } + // pair leaves using the last nodes and move towards nodes[0] which is the root for j := start; j < len(subTrees)-1; j += 2 { nodes[i].AddArgs(subTrees[j], subTrees[j+1]) nextSubTrees = append(nextSubTrees, nodes[i]) - i++ + i-- } subTrees = nextSubTrees @@ -61,21 +107,13 @@ func balanceExprTree(nodes, leaves []*Value) { // rebalance balances associative computation to better help CPU instruction pipelining (#49331) // -// a + b + c + d compiles to to v1:(a + v2:(b + v3:(c + d)) which is an unbalanced expression tree -// Which is suboptimal since it requires the CPU to compute v3 before fetching it use its result in +// a + b + c + d compiles to v1:(a + v2:(b + v3:(c + d)) which is an unbalanced 
expression tree. +// It is suboptimal since it requires the CPU to compute v3 before it can use its result in // v2, and v2 before its use in v1 // -// This optimization rebalances this expression tree to look like (a + b) + (c + d) , +// This optimization rebalances this expression tree to look like v1:(v2:(a + b) + v3:(c + d)), // which removes such dependencies and frees up the CPU pipeline. -func rebalance(v *Value, visited *sparseSet) { - // We cannot apply this optimization to non-commutative operations. - // We also exclude 3+ arg ops because there are 0 opportunities in the std lib, - // and the benefit for maintenance cost is not currently worth it. - // Try and save time by not revisiting nodes - if visited.contains(v.ID) || !opcodeTable[v.Op].commutative || len(v.Args) > 2{ - return - } - +func rebalance(v *Value) { // The smallest possible rebalanceable binary expression has 3 nodes and 4 leaves, // so preallocate the lists to save time if it is not rebalanceable leaves := make([]*Value, 0, 4) @@ -85,12 +123,10 @@ func rebalance(v *Value, visited *sparseSet) { haystack := []*Value{v} for i := 0; i < len(haystack); i++ { needle := haystack[i] + // if we are searching a value, it must be a node so add it to our node list nodes = append(nodes, needle) - // Only visit nodes. Leafs may be rebalancable for a different op type - visited.add(v.ID) - for _, a := range needle.Args { // If the ops aren't the same, have more than one use, or not in the same BB it must be a leaf.
if a.Op != v.Op || a.Uses != 1 || a.Block != v.Block { @@ -106,8 +142,9 @@ func rebalance(v *Value, visited *sparseSet) { } } - // we need at least args^2 leaves for this expression to be rebalanceable, - if len(leaves) < 4 { + // we need at least 3 nodes (root, two children) and len(args)^2 leaves + // for this expression to be rebalanceable + if len(nodes) < 3 || len(leaves) < 4 { return } From 0cbf73b126922c6ccfb916ee56e57a37e76def3b Mon Sep 17 00:00:00 2001 From: Ryan Berger Date: Sat, 9 Sep 2023 15:48:13 -0600 Subject: [PATCH 8/9] wip: redo algorithm to find expression roots --- src/cmd/compile/internal/ssa/ilp.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cmd/compile/internal/ssa/ilp.go b/src/cmd/compile/internal/ssa/ilp.go index f6619b6f60..6615c3cd52 100644 --- a/src/cmd/compile/internal/ssa/ilp.go +++ b/src/cmd/compile/internal/ssa/ilp.go @@ -16,7 +16,7 @@ func ilp(f *Func) { } // isILPOp only returns true if the operation is commutative -// and associative, which for our case would be only commutative integer ops +// and associative, which for our case would be only commutative integer ops. 
func isILPOp(o Op) bool { // if the op isn't commutative it won't be useable for ilp if !opcodeTable[o].commutative { From cdffffdb12226e517f93168eec2d5dd0b457c2b9 Mon Sep 17 00:00:00 2001 From: Ryan Berger Date: Sat, 9 Sep 2023 16:23:11 -0600 Subject: [PATCH 9/9] nits, fix error message for moveTo --- src/cmd/compile/internal/ssa/ilp.go | 14 +++++++------- src/cmd/compile/internal/ssa/shortcircuit.go | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/cmd/compile/internal/ssa/ilp.go b/src/cmd/compile/internal/ssa/ilp.go index 6615c3cd52..205f52bf83 100644 --- a/src/cmd/compile/internal/ssa/ilp.go +++ b/src/cmd/compile/internal/ssa/ilp.go @@ -4,7 +4,7 @@ package ssa -// ilp pass (Instruction Level Parallelism) balances trees of commutative computation +// ilp pass (Instruction Level Parallelism) balances trees of associative computation // to help CPU pipeline instructions more efficiently. It only works block by block // so that it doesn't end up pulling loop invariant expressions into tight loops func ilp(f *Func) { @@ -15,15 +15,15 @@ func ilp(f *Func) { } } -// isILPOp only returns true if the operation is commutative -// and associative, which for our case would be only commutative integer ops. +// isILPOp only returns true if the operation is associative, +// which for our case would be only commutative integer ops. 
func isILPOp(o Op) bool { - // if the op isn't commutative it won't be useable for ilp + // if the op isn't a commutative integer op, it won't be associative if !opcodeTable[o].commutative { return false } - // filter out float ops because they are not associative + // filter out float ops because they are not associative, leaving int ops switch o { case OpAdd32F, OpAdd64F, OpMul32F, OpMul64F: return false @@ -87,7 +87,7 @@ func balanceExprTree(nodes, leaves []*Value) { subTrees := leaves i := len(nodes)-1 for len(subTrees) != 1 { - nextSubTrees := make([]*Value, 0, (len(subTrees)+1)/2) + nextSubTrees := subTrees[:0] start := len(subTrees) % 2 if start != 0 { @@ -142,7 +142,7 @@ func rebalance(v *Value) { } } - // we need at least 3 nodes (root, two children) and len(args)^2 leaves + // we need at least 3 nodes (root, two children) and 4 leaves // for this expression to be rebalanceable if len(nodes) < 3 || len(leaves) < 4 { return diff --git a/src/cmd/compile/internal/ssa/shortcircuit.go b/src/cmd/compile/internal/ssa/shortcircuit.go index 9ee55941da..45a9f3354f 100644 --- a/src/cmd/compile/internal/ssa/shortcircuit.go +++ b/src/cmd/compile/internal/ssa/shortcircuit.go @@ -499,7 +499,7 @@ func (v *Value) moveTo(dst *Block, i int) { } src := v.Block if src.Values[i] != v { - v.Fatalf("moveTo bad index %d", i) + v.Fatalf("moveTo bad index %d, src.Values[i] = %v, expected %v", i, src.Values[i], v) } if src == dst { return