diff --git a/src/cmd/compile/internal/ssa/block.go b/src/cmd/compile/internal/ssa/block.go
index 937c757b21..71ca774431 100644
--- a/src/cmd/compile/internal/ssa/block.go
+++ b/src/cmd/compile/internal/ssa/block.go
@@ -358,6 +358,22 @@ func (b *Block) AuxIntString() string {
 	}
 }
 
+// likelyBranch reports whether block b is the likely branch of all of its predecessors.
+func (b *Block) likelyBranch() bool {
+	if len(b.Preds) == 0 {
+		return false
+	}
+	for _, e := range b.Preds {
+		p := e.b
+		if len(p.Succs) == 1 || len(p.Succs) == 2 && (p.Likely == BranchLikely && p.Succs[0].b == b ||
+			p.Likely == BranchUnlikely && p.Succs[1].b == b) {
+			continue
+		}
+		return false
+	}
+	return true
+}
+
 func (b *Block) Logf(msg string, args ...interface{})   { b.Func.Logf(msg, args...) }
 func (b *Block) Log() bool                              { return b.Func.Log() }
 func (b *Block) Fatalf(msg string, args ...interface{}) { b.Func.Fatalf(msg, args...) }
diff --git a/src/cmd/compile/internal/ssa/layout.go b/src/cmd/compile/internal/ssa/layout.go
index 30b7b97d04..a7fd73aead 100644
--- a/src/cmd/compile/internal/ssa/layout.go
+++ b/src/cmd/compile/internal/ssa/layout.go
@@ -41,8 +41,13 @@ func layoutOrder(f *Func) []*Block {
 	indegree := make([]int, f.NumBlocks())
 	posdegree := f.newSparseSet(f.NumBlocks()) // blocks with positive remaining degree
 	defer f.retSparseSet(posdegree)
-	zerodegree := f.newSparseSet(f.NumBlocks()) // blocks with zero remaining degree
-	defer f.retSparseSet(zerodegree)
+	// blocks with zero remaining degree. Use a slice to simulate a LIFO queue, which
+	// implements the depth-first topological sorting algorithm.
+	var zerodegree []ID
+	// LIFO queue. Track the successor blocks of scheduled blocks so that when we
+	// encounter loops, we schedule the successor block of the most recently
+	// scheduled block.
+	var succs []ID
 	exit := f.newSparseSet(f.NumBlocks()) // exit blocks
 	defer f.retSparseSet(exit)
 
@@ -88,7 +93,8 @@ func layoutOrder(f *Func) []*Block {
 		}
 		indegree[b.ID] = len(b.Preds)
 		if len(b.Preds) == 0 {
-			zerodegree.add(b.ID)
+			// Push an element to the tail of the queue.
+			zerodegree = append(zerodegree, b.ID)
 		} else {
 			posdegree.add(b.ID)
 		}
@@ -105,12 +111,24 @@ blockloop:
 			break
 		}
 
-		for _, e := range b.Succs {
-			c := e.b
+		// Here, the order in which we traverse b.Succs affects the direction in which the
+		// topological sort advances in depth. Take the following CFG as an example, ignoring
+		// other factors.
+		//      b1
+		//    0/  \1
+		//   b2    b3
+		// If we traverse b.Succs in order, the right child node b3 is scheduled immediately
+		// after b1; if we traverse in reverse order, the left child node b2 is scheduled
+		// immediately after b1. Test results show that reverse traversal performs a little better.
+		// Note: You need to consider both layout and register allocation when testing performance.
+		for i := len(b.Succs) - 1; i >= 0; i-- {
+			c := b.Succs[i].b
 			indegree[c.ID]--
 			if indegree[c.ID] == 0 {
 				posdegree.remove(c.ID)
-				zerodegree.add(c.ID)
+				zerodegree = append(zerodegree, c.ID)
+			} else {
+				succs = append(succs, c.ID)
 			}
 		}
 
@@ -132,30 +150,30 @@ blockloop:
 		// Use degree for now.
 		bid = 0
 
-		mindegree := f.NumBlocks()
-		for _, e := range order[len(order)-1].Succs {
-			c := e.b
-			if scheduled[c.ID] || c.Kind == BlockExit {
-				continue
-			}
-			if indegree[c.ID] < mindegree {
-				mindegree = indegree[c.ID]
-				bid = c.ID
-			}
-		}
-		if bid != 0 {
-			continue
-		}
 		// TODO: improve this part
 		// No successor of the previously scheduled block works.
 		// Pick a zero-degree block if we can.
-		for zerodegree.size() > 0 {
-			cid := zerodegree.pop()
+		for len(zerodegree) > 0 {
+			// Pop an element from the tail of the queue.
+			cid := zerodegree[len(zerodegree)-1]
+			zerodegree = zerodegree[:len(zerodegree)-1]
 			if !scheduled[cid] {
 				bid = cid
 				continue blockloop
 			}
 		}
+
+		// Still nothing, pick the unscheduled successor block encountered most recently.
+		for len(succs) > 0 {
+			// Pop an element from the tail of the queue.
+			cid := succs[len(succs)-1]
+			succs = succs[:len(succs)-1]
+			if !scheduled[cid] {
+				bid = cid
+				continue blockloop
+			}
+		}
+
 		// Still nothing, pick any non-exit block.
 		for posdegree.size() > 0 {
 			cid := posdegree.pop()
diff --git a/src/cmd/compile/internal/ssa/looprotate.go b/src/cmd/compile/internal/ssa/looprotate.go
index 2e5e421df7..35010a78d8 100644
--- a/src/cmd/compile/internal/ssa/looprotate.go
+++ b/src/cmd/compile/internal/ssa/looprotate.go
@@ -68,12 +68,15 @@ func loopRotate(f *Func) {
 			if nextb == p { // original loop predecessor is next
 				break
 			}
-			if loopnest.b2l[nextb.ID] != loop { // about to leave loop
-				break
+			if loopnest.b2l[nextb.ID] == loop {
+				after[p.ID] = append(after[p.ID], nextb)
 			}
-			after[p.ID] = append(after[p.ID], nextb)
 			b = nextb
 		}
+		// Swap b and p so that we'll handle p before b when moving blocks.
+		f.Blocks[idToIdx[loop.header.ID]] = p
+		f.Blocks[idToIdx[p.ID]] = loop.header
+		idToIdx[loop.header.ID], idToIdx[p.ID] = idToIdx[p.ID], idToIdx[loop.header.ID]
 
 		// Place b after p.
 		for _, b := range after[p.ID] {
@@ -86,21 +89,23 @@ func loopRotate(f *Func) {
 	// before the rest of the loop. And that relies on the
 	// fact that we only identify reducible loops.
 	j := 0
-	for i, b := range f.Blocks {
+	// Some blocks that are not part of a loop may be placed
+	// between loop blocks. To avoid overwriting these blocks,
+	// use a temporary slice.
+	newOrder := make([]*Block, 0, f.NumBlocks())
+	for _, b := range f.Blocks {
 		if _, ok := move[b.ID]; ok {
 			continue
 		}
-		f.Blocks[j] = b
+		newOrder = append(newOrder, b)
 		j++
 		for _, a := range after[b.ID] {
-			if j > i {
-				f.Fatalf("head before tail in loop %s", b)
-			}
-			f.Blocks[j] = a
+			newOrder = append(newOrder, a)
 			j++
 		}
 	}
 	if j != len(f.Blocks) {
 		f.Fatalf("bad reordering in looprotate")
 	}
+	f.Blocks = newOrder
 }
diff --git a/src/cmd/compile/internal/ssa/regalloc.go b/src/cmd/compile/internal/ssa/regalloc.go
index c104a36888..18908681df 100644
--- a/src/cmd/compile/internal/ssa/regalloc.go
+++ b/src/cmd/compile/internal/ssa/regalloc.go
@@ -241,12 +241,6 @@ type regAllocState struct {
 	GReg        register
 	allocatable regMask
 
-	// for each block, its primary predecessor.
-	// A predecessor of b is primary if it is the closest
-	// predecessor that appears before b in the layout order.
-	// We record the index in the Preds list where the primary predecessor sits.
-	primary []int32
-
 	// live values at the end of each block. live[b.ID] is a list of value IDs
 	// which are live at the end of b, together with a count of how many instructions
 	// forward to the next use.
@@ -304,6 +298,9 @@ type regAllocState struct {
 
 	// choose a good order in which to visit blocks for allocation purposes.
 	visitOrder []*Block
+
+	// blockOrder[b.ID] corresponds to the index of block b in visitOrder.
+	blockOrder []int32
 }
 
 type endReg struct {
@@ -636,9 +633,9 @@ func (s *regAllocState) init(f *Func) {
 
 	// Compute block order. This array allows us to distinguish forward edges
 	// from backward edges and compute how far they go.
-	blockOrder := make([]int32, f.NumBlocks())
+	s.blockOrder = make([]int32, f.NumBlocks())
 	for i, b := range s.visitOrder {
-		blockOrder[b.ID] = int32(i)
+		s.blockOrder[b.ID] = int32(i)
 	}
 
 	s.regs = make([]regState, s.numRegs)
@@ -664,22 +661,6 @@ func (s *regAllocState) init(f *Func) {
 	}
 	s.computeLive()
 
-	// Compute primary predecessors.
-	s.primary = make([]int32, f.NumBlocks())
-	for _, b := range s.visitOrder {
-		best := -1
-		for i, e := range b.Preds {
-			p := e.b
-			if blockOrder[p.ID] >= blockOrder[b.ID] {
-				continue // backward edge
-			}
-			if best == -1 || blockOrder[p.ID] > blockOrder[b.Preds[best].b.ID] {
-				best = i
-			}
-		}
-		s.primary[b.ID] = int32(best)
-	}
-
 	s.endRegs = make([][]endReg, f.NumBlocks())
 	s.startRegs = make([][]startReg, f.NumBlocks())
 	s.spillLive = make([][]ID, f.NumBlocks())
 
@@ -957,10 +938,49 @@ func (s *regAllocState) regalloc(f *Func) {
 			// This is the complicated case. We have more than one predecessor,
 			// which means we may have Phi ops.
 
-			// Start with the final register state of the primary predecessor
-			idx := s.primary[b.ID]
+			// Start with the final register state of the predecessor with the fewest spill
+			// values. This is based on the following points:
+			// 1. Fewer spill values indicate that the register pressure on this path is lower,
+			// so the values of this block are more likely to be allocated to registers.
+			// 2. Avoid predecessors that contain function calls, because a predecessor that
+			// contains a function call usually generates a lot of spills and loses the
+			// previous allocation state.
+			// TODO: Improve this part. At least the size of the predecessor's endRegs also
+			// has an impact on code size and compile speed. But it is not easy to find a
+			// simple and efficient method that combines multiple factors.
+			idx := -1
+			for i, p := range b.Preds {
+				// If the predecessor has not been visited yet, skip it because its end state
+				// (endRegs and spillLive) has not been computed yet.
+				pb := p.b
+				if s.blockOrder[pb.ID] >= s.blockOrder[b.ID] {
+					continue
+				}
+				if idx == -1 {
+					idx = i
+					continue
+				}
+				pSel := b.Preds[idx].b
+				if len(s.spillLive[pb.ID]) < len(s.spillLive[pSel.ID]) {
+					idx = i
+				} else if len(s.spillLive[pb.ID]) == len(s.spillLive[pSel.ID]) {
+					// Use a bit of branch-likeliness information. After the critical pass, pb and
+					// pSel must be plain blocks, so check the edge pb.Preds->pb instead of the
+					// edge pb->b.
+					// TODO: improve the prediction of the likely predecessor. The following
+					// method is only suitable for the simplest cases. For complex cases, the
+					// prediction may be inaccurate, but this does not affect the correctness
+					// of the program.
+					// According to the layout algorithm, the predecessor with the smaller
+					// blockOrder is the likely branch, and test results show that it is better
+					// to choose the predecessor with the smaller blockOrder than to make no choice.
+					if pb.likelyBranch() && !pSel.likelyBranch() || s.blockOrder[pb.ID] < s.blockOrder[pSel.ID] {
+						idx = i
+					}
+				}
+			}
 			if idx < 0 {
-				f.Fatalf("block with no primary predecessor %s", b)
+				f.Fatalf("bad visitOrder, no predecessor of %s has been visited before it", b)
 			}
 			p := b.Preds[idx].b
 			s.setState(s.endRegs[p.ID])
@@ -1048,7 +1068,7 @@ func (s *regAllocState) regalloc(f *Func) {
 			// If one of the other inputs of v is in a register, and the register is available,
 			// select this register, which can save some unnecessary copies.
 			for i, pe := range b.Preds {
-				if int32(i) == idx {
+				if i == idx {
 					continue
 				}
 				ri := noRegister
diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go
index 0bdb66a376..dea7e0ba61 100644
--- a/test/codegen/arithmetic.go
+++ b/test/codegen/arithmetic.go
@@ -322,6 +322,9 @@ func NoFix64A(divr int64) (int64, int64) {
 	if divr > 5 {
 		d /= divr // amd64:-"JMP"
 		e %= divr // amd64:-"JMP"
+		// The following statement is here to avoid a conflict between the check above
+		// and the normal JMP generated at the end of the block.
+		d += e
 	}
 	return d, e
 }
@@ -333,6 +336,7 @@ func NoFix64B(divd int64) (int64, int64) {
 	if divd > -9223372036854775808 {
 		d = divd / divr // amd64:-"JMP"
 		e = divd % divr // amd64:-"JMP"
+		d += e
 	}
 	return d, e
 }
@@ -347,6 +351,7 @@ func NoFix32A(divr int32) (int32, int32) {
 		// amd64:-"JMP"
 		// 386:-"JMP"
 		e %= divr
+		d += e
 	}
 	return d, e
 }
@@ -362,6 +367,7 @@ func NoFix32B(divd int32) (int32, int32) {
 		// amd64:-"JMP"
 		// 386:-"JMP"
 		e = divd % divr
+		d += e
 	}
 	return d, e
 }
@@ -376,6 +382,7 @@ func NoFix16A(divr int16) (int16, int16) {
 		// amd64:-"JMP"
 		// 386:-"JMP"
 		e %= divr
+		d += e
 	}
 	return d, e
 }
@@ -391,6 +398,7 @@ func NoFix16B(divd int16) (int16, int16) {
 		// amd64:-"JMP"
 		// 386:-"JMP"
 		e = divd % divr
+		d += e
 	}
 	return d, e
 }
diff --git a/test/codegen/comparisons.go b/test/codegen/comparisons.go
index 02bed38661..719063cdc3 100644
--- a/test/codegen/comparisons.go
+++ b/test/codegen/comparisons.go
@@ -426,7 +426,7 @@ func UintGeqZero(a uint8, b uint16, c uint32, d uint64) int {
 }
 
 func UintGtZero(a uint8, b uint16, c uint32, d uint64) int {
-	// arm64: `CBZW`, `CBNZW`, `CBNZ`, -`(CMPW|CMP|BLS|BHI)`
+	// arm64: `(CBN?ZW)`, `(CBN?Z[^W])`, -`(CMPW|CMP|BLS|BHI)`
 	if a > 0 || b > 0 || c > 0 || d > 0 {
 		return 1
 	}
@@ -434,7 +434,7 @@ func UintGtZero(a uint8, b uint16, c uint32, d uint64) int {
 }
 
 func UintLeqZero(a uint8, b uint16, c uint32, d uint64) int {
-	// arm64: `CBNZW`, `CBZW`, `CBZ`, -`(CMPW|CMP|BHI|BLS)`
+	// arm64: `(CBN?ZW)`, `(CBN?Z[^W])`, -`(CMPW|CMP|BHI|BLS)`
 	if a <= 0 || b <= 0 || c <= 0 || d <= 0 {
 		return 1
 	}
@@ -442,7 +442,7 @@ func UintLeqZero(a uint8, b uint16, c uint32, d uint64) int {
 }
 
 func UintLtOne(a uint8, b uint16, c uint32, d uint64) int {
-	// arm64: `CBNZW`, `CBZW`, `CBZW`, `CBZ`, -`(CMPW|CMP|BHS|BLO)`
+	// arm64: `(CBN?ZW)`, `(CBN?Z[^W])`, -`(CMPW|CMP|BHS|BLO)`
 	if a < 1 || b < 1 || c < 1 || d < 1 {
 		return 1
 	}
@@ -450,7 +450,7 @@ func UintLtOne(a uint8, b uint16, c uint32, d uint64) int {
 }
 
 func UintGeqOne(a uint8, b uint16, c uint32, d uint64) int {
-	// arm64: `CBZW`, `CBNZW`, `CBNZ`, -`(CMPW|CMP|BLO|BHS)`
+	// arm64: `(CBN?ZW)`, `(CBN?Z[^W])`, -`(CMPW|CMP|BLO|BHS)`
 	if a >= 1 || b >= 1 || c >= 1 || d >= 1 {
 		return 1
 	}
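
The layout heuristic above is easier to see outside the compiler. Below is a minimal, self-contained Go sketch of the fallback order introduced in layout.go: a LIFO zerodegree stack drives the depth-first topological sort, successors are pushed in reverse so that the first successor ends up on top of the stack and is laid out immediately after its parent, and a succs stack remembers recently seen successors as the fallback for loops. Block and layoutOrder here are simplified stand-ins for the compiler's types, and the sketch omits the real pass's preference for the likely successor of the previously scheduled block as well as its exit-block handling.

package main

import "fmt"

// Block is a simplified stand-in for the compiler's *ssa.Block.
type Block struct {
	ID    int
	Succs []*Block
	Preds []*Block
}

// layoutOrder mirrors the patch's fallback scheme only: a LIFO zerodegree
// stack for depth-first topological sorting, and a succs stack that tracks
// successors of scheduled blocks for use when loops keep indegrees positive.
func layoutOrder(blocks []*Block) []*Block {
	indegree := make(map[int]int)
	for _, b := range blocks {
		indegree[b.ID] = len(b.Preds)
	}
	var zerodegree, succs []*Block // LIFO stacks
	// Fill in reverse so the entry block ends up on top of the stack.
	for i := len(blocks) - 1; i >= 0; i-- {
		if indegree[blocks[i].ID] == 0 {
			zerodegree = append(zerodegree, blocks[i])
		}
	}
	scheduled := make(map[int]bool)
	var order []*Block
	for len(order) < len(blocks) {
		var b *Block
		// Prefer the most recently pushed zero-degree block (depth first).
		for b == nil && len(zerodegree) > 0 {
			c := zerodegree[len(zerodegree)-1]
			zerodegree = zerodegree[:len(zerodegree)-1]
			if !scheduled[c.ID] {
				b = c
			}
		}
		// Still nothing: fall back to the most recently seen successor (loops).
		for b == nil && len(succs) > 0 {
			c := succs[len(succs)-1]
			succs = succs[:len(succs)-1]
			if !scheduled[c.ID] {
				b = c
			}
		}
		if b == nil {
			break
		}
		scheduled[b.ID] = true
		order = append(order, b)
		// Reverse traversal: the first successor is pushed last, so it is
		// popped first and scheduled immediately after b.
		for i := len(b.Succs) - 1; i >= 0; i-- {
			c := b.Succs[i]
			indegree[c.ID]--
			if indegree[c.ID] == 0 {
				zerodegree = append(zerodegree, c)
			} else {
				succs = append(succs, c)
			}
		}
	}
	return order
}

func main() {
	// The diamond CFG from the layout.go comment: b1 branches to b2 and b3.
	b1, b2, b3, b4 := &Block{ID: 1}, &Block{ID: 2}, &Block{ID: 3}, &Block{ID: 4}
	b1.Succs = []*Block{b2, b3}
	b2.Preds, b3.Preds = []*Block{b1}, []*Block{b1}
	b2.Succs, b3.Succs = []*Block{b4}, []*Block{b4}
	b4.Preds = []*Block{b2, b3}
	for _, b := range layoutOrder([]*Block{b1, b2, b3, b4}) {
		fmt.Printf("b%d ", b.ID)
	}
	fmt.Println() // prints: b1 b2 b3 b4 (left child b2 right after b1)
}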
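In the same spirit, here is a standalone sketch of the predecessor-selection heuristic from the regalloc.go hunk: among predecessors that have already been visited, pick the one with the fewest live spills, breaking ties first by branch likeliness and then by layout order. The pred struct and pickPredecessor are hypothetical names introduced for illustration; in the real allocator the corresponding inputs are s.spillLive, s.blockOrder, and Block.likelyBranch.

package main

import "fmt"

// pred models just the per-predecessor facts the heuristic consults.
type pred struct {
	name         string
	order        int  // position in the visit/layout order (s.blockOrder)
	spillsLive   int  // len(s.spillLive[pb.ID]) in the real allocator
	likelyBranch bool // pb.likelyBranch() in the real allocator
}

// pickPredecessor mirrors the patch's selection loop. It returns the index
// of the chosen predecessor, or -1 if none has been visited yet.
func pickPredecessor(blockOrder int, preds []pred) int {
	idx := -1
	for i, pb := range preds {
		if pb.order >= blockOrder {
			continue // backward edge: end state not computed yet
		}
		if idx == -1 {
			idx = i
			continue
		}
		sel := preds[idx]
		if pb.spillsLive < sel.spillsLive {
			// Fewer live spills suggests lower register pressure on this path.
			idx = i
		} else if pb.spillsLive == sel.spillsLive {
			// Tie: prefer the likely branch, then the earlier block in layout order.
			if pb.likelyBranch && !sel.likelyBranch || pb.order < sel.order {
				idx = i
			}
		}
	}
	return idx
}

func main() {
	// A block at layout position 5 with three predecessors: p1 carries a call
	// (many live spills), p2 is on the likely path, p3 is a backward edge.
	preds := []pred{
		{"p1", 2, 7, false},
		{"p2", 3, 1, true},
		{"p3", 9, 0, false},
	}
	fmt.Println(preds[pickPredecessor(5, preds)].name) // prints: p2
}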