diff --git a/src/cmd/compile/internal/ssa/block.go b/src/cmd/compile/internal/ssa/block.go
index 937c757b21..71ca774431 100644
--- a/src/cmd/compile/internal/ssa/block.go
+++ b/src/cmd/compile/internal/ssa/block.go
@@ -358,6 +358,22 @@ func (b *Block) AuxIntString() string {
 	}
 }
 
+// likelyBranch reports whether block b is the likely branch of all of its predecessors.
+func (b *Block) likelyBranch() bool {
+	if len(b.Preds) == 0 {
+		return false
+	}
+	for _, e := range b.Preds {
+		p := e.b
+		if len(p.Succs) == 1 || len(p.Succs) == 2 && (p.Likely == BranchLikely && p.Succs[0].b == b ||
+			p.Likely == BranchUnlikely && p.Succs[1].b == b) {
+			continue
+		}
+		return false
+	}
+	return true
+}
+
 func (b *Block) Logf(msg string, args ...interface{})   { b.Func.Logf(msg, args...) }
 func (b *Block) Log() bool                              { return b.Func.Log() }
 func (b *Block) Fatalf(msg string, args ...interface{}) { b.Func.Fatalf(msg, args...) }
diff --git a/src/cmd/compile/internal/ssa/layout.go b/src/cmd/compile/internal/ssa/layout.go
index 30b7b97d04..a7fd73aead 100644
--- a/src/cmd/compile/internal/ssa/layout.go
+++ b/src/cmd/compile/internal/ssa/layout.go
@@ -41,8 +41,13 @@ func layoutOrder(f *Func) []*Block {
 	indegree := make([]int, f.NumBlocks())
 	posdegree := f.newSparseSet(f.NumBlocks()) // blocks with positive remaining degree
 	defer f.retSparseSet(posdegree)
-	zerodegree := f.newSparseSet(f.NumBlocks()) // blocks with zero remaining degree
-	defer f.retSparseSet(zerodegree)
+	// blocks with zero remaining degree. Use a slice to simulate a LIFO queue, which
+	// implements the depth-first topological sorting algorithm.
+	var zerodegree []ID
+	// LIFO queue. Track the successor blocks of scheduled blocks so that when we
+	// encounter loops, we schedule the successor block of the most recently
+	// scheduled block.
+	var succs []ID
 	exit := f.newSparseSet(f.NumBlocks()) // exit blocks
 	defer f.retSparseSet(exit)
 
@@ -88,7 +93,8 @@ func layoutOrder(f *Func) []*Block {
 		}
 		indegree[b.ID] = len(b.Preds)
 		if len(b.Preds) == 0 {
-			zerodegree.add(b.ID)
+			// Push an element to the tail of the queue.
+			zerodegree = append(zerodegree, b.ID)
 		} else {
 			posdegree.add(b.ID)
 		}
@@ -105,12 +111,24 @@ blockloop:
 			break
 		}
 
-		for _, e := range b.Succs {
-			c := e.b
+		// Here, the order in which we traverse b.Succs affects the direction in which the
+		// topological sort advances in depth. Take the following CFG as an example, ignoring
+		// other factors.
+		//      b1
+		//    0/  \1
+		//   b2    b3
+		// If we traverse b.Succs in order, the right child node b3 is scheduled immediately
+		// after b1; if we traverse in reverse order, the left child node b2 is scheduled
+		// immediately after b1. Test results show that reverse traversal performs a little better.
+		// Note: You need to consider both layout and register allocation when testing performance.
+		for i := len(b.Succs) - 1; i >= 0; i-- {
+			c := b.Succs[i].b
 			indegree[c.ID]--
 			if indegree[c.ID] == 0 {
 				posdegree.remove(c.ID)
-				zerodegree.add(c.ID)
+				zerodegree = append(zerodegree, c.ID)
+			} else {
+				succs = append(succs, c.ID)
 			}
 		}
 
@@ -132,30 +150,30 @@ blockloop:
 		// Use degree for now.
 		bid = 0
 
-		mindegree := f.NumBlocks()
-		for _, e := range order[len(order)-1].Succs {
-			c := e.b
-			if scheduled[c.ID] || c.Kind == BlockExit {
-				continue
-			}
-			if indegree[c.ID] < mindegree {
-				mindegree = indegree[c.ID]
-				bid = c.ID
-			}
-		}
-		if bid != 0 {
-			continue
-		}
 		// TODO: improve this part
 		// No successor of the previously scheduled block works.
 		// Pick a zero-degree block if we can.
-		for zerodegree.size() > 0 {
-			cid := zerodegree.pop()
+		for len(zerodegree) > 0 {
+			// Pop an element from the tail of the queue.
+			cid := zerodegree[len(zerodegree)-1]
+			zerodegree = zerodegree[:len(zerodegree)-1]
 			if !scheduled[cid] {
 				bid = cid
 				continue blockloop
 			}
 		}
+
+		// Still nothing, pick the unscheduled successor block encountered most recently.
+		for len(succs) > 0 {
+			// Pop an element from the tail of the queue.
+			cid := succs[len(succs)-1]
+			succs = succs[:len(succs)-1]
+			if !scheduled[cid] {
+				bid = cid
+				continue blockloop
+			}
+		}
+
 		// Still nothing, pick any non-exit block.
 		for posdegree.size() > 0 {
 			cid := posdegree.pop()
diff --git a/src/cmd/compile/internal/ssa/looprotate.go b/src/cmd/compile/internal/ssa/looprotate.go
index 2e5e421df7..35010a78d8 100644
--- a/src/cmd/compile/internal/ssa/looprotate.go
+++ b/src/cmd/compile/internal/ssa/looprotate.go
@@ -68,12 +68,15 @@ func loopRotate(f *Func) {
 			if nextb == p { // original loop predecessor is next
 				break
 			}
-			if loopnest.b2l[nextb.ID] != loop { // about to leave loop
-				break
+			if loopnest.b2l[nextb.ID] == loop {
+				after[p.ID] = append(after[p.ID], nextb)
 			}
-			after[p.ID] = append(after[p.ID], nextb)
 			b = nextb
 		}
+		// Swap b and p so that we'll handle p before b when moving blocks.
+		f.Blocks[idToIdx[loop.header.ID]] = p
+		f.Blocks[idToIdx[p.ID]] = loop.header
+		idToIdx[loop.header.ID], idToIdx[p.ID] = idToIdx[p.ID], idToIdx[loop.header.ID]
 
 		// Place b after p.
 		for _, b := range after[p.ID] {
@@ -86,21 +89,23 @@ func loopRotate(f *Func) {
 	// before the rest of the loop. And that relies on the
 	// fact that we only identify reducible loops.
 	j := 0
-	for i, b := range f.Blocks {
+	// Some blocks that are not part of a loop may be placed
+	// between loop blocks. To avoid overwriting these blocks,
+	// use a temporary slice.
+	newOrder := make([]*Block, 0, f.NumBlocks())
+	for _, b := range f.Blocks {
 		if _, ok := move[b.ID]; ok {
 			continue
 		}
-		f.Blocks[j] = b
+		newOrder = append(newOrder, b)
 		j++
 		for _, a := range after[b.ID] {
-			if j > i {
-				f.Fatalf("head before tail in loop %s", b)
-			}
-			f.Blocks[j] = a
+			newOrder = append(newOrder, a)
 			j++
 		}
 	}
 	if j != len(f.Blocks) {
 		f.Fatalf("bad reordering in looprotate")
 	}
+	f.Blocks = newOrder
 }
diff --git a/src/cmd/compile/internal/ssa/regalloc.go b/src/cmd/compile/internal/ssa/regalloc.go
index c104a36888..18908681df 100644
--- a/src/cmd/compile/internal/ssa/regalloc.go
+++ b/src/cmd/compile/internal/ssa/regalloc.go
@@ -241,12 +241,6 @@ type regAllocState struct {
 	GReg        register
 	allocatable regMask
 
-	// for each block, its primary predecessor.
-	// A predecessor of b is primary if it is the closest
-	// predecessor that appears before b in the layout order.
-	// We record the index in the Preds list where the primary predecessor sits.
-	primary []int32
-
 	// live values at the end of each block. live[b.ID] is a list of value IDs
 	// which are live at the end of b, together with a count of how many instructions
 	// forward to the next use.
@@ -304,6 +298,9 @@ type regAllocState struct {
 
 	// choose a good order in which to visit blocks for allocation purposes.
 	visitOrder []*Block
+
+	// blockOrder[b.ID] corresponds to the index of block b in visitOrder.
+	blockOrder []int32
 }
 
 type endReg struct {
@@ -636,9 +633,9 @@ func (s *regAllocState) init(f *Func) {
 
 	// Compute block order. This array allows us to distinguish forward edges
 	// from backward edges and compute how far they go.
-	blockOrder := make([]int32, f.NumBlocks())
+	s.blockOrder = make([]int32, f.NumBlocks())
 	for i, b := range s.visitOrder {
-		blockOrder[b.ID] = int32(i)
+		s.blockOrder[b.ID] = int32(i)
 	}
 
 	s.regs = make([]regState, s.numRegs)
@@ -664,22 +661,6 @@ func (s *regAllocState) init(f *Func) {
 	}
 	s.computeLive()
 
-	// Compute primary predecessors.
-	s.primary = make([]int32, f.NumBlocks())
-	for _, b := range s.visitOrder {
-		best := -1
-		for i, e := range b.Preds {
-			p := e.b
-			if blockOrder[p.ID] >= blockOrder[b.ID] {
-				continue // backward edge
-			}
-			if best == -1 || blockOrder[p.ID] > blockOrder[b.Preds[best].b.ID] {
-				best = i
-			}
-		}
-		s.primary[b.ID] = int32(best)
-	}
-
 	s.endRegs = make([][]endReg, f.NumBlocks())
 	s.startRegs = make([][]startReg, f.NumBlocks())
 	s.spillLive = make([][]ID, f.NumBlocks())
 
@@ -957,10 +938,49 @@ func (s *regAllocState) regalloc(f *Func) {
 			// This is the complicated case. We have more than one predecessor,
 			// which means we may have Phi ops.
 
-			// Start with the final register state of the primary predecessor
-			idx := s.primary[b.ID]
+			// Start with the final register state of the predecessor with the fewest spill
+			// values. This is based on the following points:
+			// 1. Fewer spill values indicate that the register pressure on this path is lower,
+			// so the values of this block are more likely to be allocated to registers.
+			// 2. Avoid predecessors that contain function calls, because a predecessor that
+			// contains a function call usually generates a lot of spills and loses the
+			// previous allocation state.
+			// TODO: Improve this part. At least the size of the predecessor's endRegs also
+			// has an impact on code size and compile speed. But it is not easy to find a
+			// simple and efficient method that combines multiple factors.
+			idx := -1
+			for i, p := range b.Preds {
+				// If the predecessor has not been visited yet, skip it because its end state
+				// (endRegs and spillLive) has not been computed yet.
+				pb := p.b
+				if s.blockOrder[pb.ID] >= s.blockOrder[b.ID] {
+					continue
+				}
+				if idx == -1 {
+					idx = i
+					continue
+				}
+				pSel := b.Preds[idx].b
+				if len(s.spillLive[pb.ID]) < len(s.spillLive[pSel.ID]) {
+					idx = i
+				} else if len(s.spillLive[pb.ID]) == len(s.spillLive[pSel.ID]) {
+					// Use a bit of branch-likeliness information. After the critical pass, pb and
+					// pSel must be plain blocks, so check the edge pb.Preds->pb instead of the
+					// edge pb->b.
+					// TODO: improve the prediction of the likely predecessor. The following
+					// method is only suitable for the simplest cases. For complex cases, the
+					// prediction may be inaccurate, but this does not affect the correctness
+					// of the program.
+					// According to the layout algorithm, the predecessor with the smaller
+					// blockOrder is the likely branch, and test results show that it is better
+					// to choose the predecessor with the smaller blockOrder than to make no choice.
+					if pb.likelyBranch() && !pSel.likelyBranch() || s.blockOrder[pb.ID] < s.blockOrder[pSel.ID] {
+						idx = i
+					}
+				}
+			}
 			if idx < 0 {
-				f.Fatalf("block with no primary predecessor %s", b)
+				f.Fatalf("bad visitOrder, no predecessor of %s has been visited before it", b)
 			}
 			p := b.Preds[idx].b
 			s.setState(s.endRegs[p.ID])
@@ -1048,7 +1068,7 @@ func (s *regAllocState) regalloc(f *Func) {
 			// If one of the other inputs of v is in a register, and the register is available,
 			// select this register, which can save some unnecessary copies.
 			for i, pe := range b.Preds {
-				if int32(i) == idx {
+				if i == idx {
 					continue
 				}
 				ri := noRegister
diff --git a/test/codegen/arithmetic.go b/test/codegen/arithmetic.go
index 0bdb66a376..dea7e0ba61 100644
--- a/test/codegen/arithmetic.go
+++ b/test/codegen/arithmetic.go
@@ -322,6 +322,9 @@ func NoFix64A(divr int64) (int64, int64) {
 	if divr > 5 {
 		d /= divr // amd64:-"JMP"
 		e %= divr // amd64:-"JMP"
+		// The following statement is here to avoid a conflict between the check above
+		// and the normal JMP generated at the end of the block.
+		d += e
 	}
 	return d, e
 }
@@ -333,6 +336,7 @@ func NoFix64B(divd int64) (int64, int64) {
 	if divd > -9223372036854775808 {
 		d = divd / divr // amd64:-"JMP"
 		e = divd % divr // amd64:-"JMP"
+		d += e
 	}
 	return d, e
 }
@@ -347,6 +351,7 @@ func NoFix32A(divr int32) (int32, int32) {
 		// amd64:-"JMP"
 		// 386:-"JMP"
 		e %= divr
+		d += e
 	}
 	return d, e
 }
@@ -362,6 +367,7 @@ func NoFix32B(divd int32) (int32, int32) {
 		// amd64:-"JMP"
 		// 386:-"JMP"
 		e = divd % divr
+		d += e
 	}
 	return d, e
 }
@@ -376,6 +382,7 @@ func NoFix16A(divr int16) (int16, int16) {
 		// amd64:-"JMP"
 		// 386:-"JMP"
 		e %= divr
+		d += e
 	}
 	return d, e
 }
@@ -391,6 +398,7 @@ func NoFix16B(divd int16) (int16, int16) {
 		// amd64:-"JMP"
 		// 386:-"JMP"
 		e = divd % divr
+		d += e
 	}
 	return d, e
 }
diff --git a/test/codegen/comparisons.go b/test/codegen/comparisons.go
index 02bed38661..719063cdc3 100644
--- a/test/codegen/comparisons.go
+++ b/test/codegen/comparisons.go
@@ -426,7 +426,7 @@ func UintGeqZero(a uint8, b uint16, c uint32, d uint64) int {
 }
 
 func UintGtZero(a uint8, b uint16, c uint32, d uint64) int {
-	// arm64: `CBZW`, `CBNZW`, `CBNZ`, -`(CMPW|CMP|BLS|BHI)`
+	// arm64: `(CBN?ZW)`, `(CBN?Z[^W])`, -`(CMPW|CMP|BLS|BHI)`
 	if a > 0 || b > 0 || c > 0 || d > 0 {
 		return 1
 	}
@@ -434,7 +434,7 @@ func UintGtZero(a uint8, b uint16, c uint32, d uint64) int {
 }
 
 func UintLeqZero(a uint8, b uint16, c uint32, d uint64) int {
-	// arm64: `CBNZW`, `CBZW`, `CBZ`, -`(CMPW|CMP|BHI|BLS)`
+	// arm64: `(CBN?ZW)`, `(CBN?Z[^W])`, -`(CMPW|CMP|BHI|BLS)`
 	if a <= 0 || b <= 0 || c <= 0 || d <= 0 {
 		return 1
 	}
@@ -442,7 +442,7 @@ func UintLeqZero(a uint8, b uint16, c uint32, d uint64) int {
 }
 
 func UintLtOne(a uint8, b uint16, c uint32, d uint64) int {
-	// arm64: `CBNZW`, `CBZW`, `CBZW`, `CBZ`, -`(CMPW|CMP|BHS|BLO)`
+	// arm64: `(CBN?ZW)`, `(CBN?Z[^W])`, -`(CMPW|CMP|BHS|BLO)`
 	if a < 1 || b < 1 || c < 1 || d < 1 {
 		return 1
 	}
@@ -450,7 +450,7 @@ func UintLtOne(a uint8, b uint16, c uint32, d uint64) int {
 }
 
 func UintGeqOne(a uint8, b uint16, c uint32, d uint64) int {
-	// arm64: `CBZW`, `CBNZW`, `CBNZ`, -`(CMPW|CMP|BLO|BHS)`
+	// arm64: `(CBN?ZW)`, `(CBN?Z[^W])`, -`(CMPW|CMP|BLO|BHS)`
 	if a >= 1 || b >= 1 || c >= 1 || d >= 1 {
 		return 1
 	}
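
The layout heuristic above is easier to see outside the compiler. Below is a minimal, self-contained Go sketch of the fallback order introduced in layout.go: a LIFO zerodegree stack drives the depth-first topological sort, successors are pushed in reverse so that the first successor ends up on top of the stack and is laid out immediately after its parent, and a succs stack remembers recently seen successors as the fallback for loops. Block and layoutOrder here are simplified stand-ins for the compiler's types, and the sketch omits the real pass's preference for the likely successor of the previously scheduled block as well as its exit-block handling.

package main

import "fmt"

// Block is a simplified stand-in for the compiler's *ssa.Block.
type Block struct {
	ID    int
	Succs []*Block
	Preds []*Block
}

// layoutOrder mirrors the patch's fallback scheme only: a LIFO zerodegree
// stack for depth-first topological sorting, and a succs stack that tracks
// successors of scheduled blocks for use when loops keep indegrees positive.
func layoutOrder(blocks []*Block) []*Block {
	indegree := make(map[int]int)
	for _, b := range blocks {
		indegree[b.ID] = len(b.Preds)
	}
	var zerodegree, succs []*Block // LIFO stacks
	// Fill in reverse so the entry block ends up on top of the stack.
	for i := len(blocks) - 1; i >= 0; i-- {
		if indegree[blocks[i].ID] == 0 {
			zerodegree = append(zerodegree, blocks[i])
		}
	}
	scheduled := make(map[int]bool)
	var order []*Block
	for len(order) < len(blocks) {
		var b *Block
		// Prefer the most recently pushed zero-degree block (depth first).
		for b == nil && len(zerodegree) > 0 {
			c := zerodegree[len(zerodegree)-1]
			zerodegree = zerodegree[:len(zerodegree)-1]
			if !scheduled[c.ID] {
				b = c
			}
		}
		// Still nothing: fall back to the most recently seen successor (loops).
		for b == nil && len(succs) > 0 {
			c := succs[len(succs)-1]
			succs = succs[:len(succs)-1]
			if !scheduled[c.ID] {
				b = c
			}
		}
		if b == nil {
			break
		}
		scheduled[b.ID] = true
		order = append(order, b)
		// Reverse traversal: the first successor is pushed last, so it is
		// popped first and scheduled immediately after b.
		for i := len(b.Succs) - 1; i >= 0; i-- {
			c := b.Succs[i]
			indegree[c.ID]--
			if indegree[c.ID] == 0 {
				zerodegree = append(zerodegree, c)
			} else {
				succs = append(succs, c)
			}
		}
	}
	return order
}

func main() {
	// The diamond CFG from the layout.go comment: b1 branches to b2 and b3.
	b1, b2, b3, b4 := &Block{ID: 1}, &Block{ID: 2}, &Block{ID: 3}, &Block{ID: 4}
	b1.Succs = []*Block{b2, b3}
	b2.Preds, b3.Preds = []*Block{b1}, []*Block{b1}
	b2.Succs, b3.Succs = []*Block{b4}, []*Block{b4}
	b4.Preds = []*Block{b2, b3}
	for _, b := range layoutOrder([]*Block{b1, b2, b3, b4}) {
		fmt.Printf("b%d ", b.ID)
	}
	fmt.Println() // prints: b1 b2 b3 b4 (left child b2 right after b1)
}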
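In the same spirit, here is a standalone sketch of the predecessor-selection heuristic from the regalloc.go hunk: among predecessors that have already been visited, pick the one with the fewest live spills, breaking ties first by branch likeliness and then by layout order. The pred struct and pickPredecessor are hypothetical names introduced for illustration; in the real allocator the corresponding inputs are s.spillLive, s.blockOrder, and Block.likelyBranch.

package main

import "fmt"

// pred models just the per-predecessor facts the heuristic consults.
type pred struct {
	name         string
	order        int  // position in the visit/layout order (s.blockOrder)
	spillsLive   int  // len(s.spillLive[pb.ID]) in the real allocator
	likelyBranch bool // pb.likelyBranch() in the real allocator
}

// pickPredecessor mirrors the patch's selection loop. It returns the index
// of the chosen predecessor, or -1 if none has been visited yet.
func pickPredecessor(blockOrder int, preds []pred) int {
	idx := -1
	for i, pb := range preds {
		if pb.order >= blockOrder {
			continue // backward edge: end state not computed yet
		}
		if idx == -1 {
			idx = i
			continue
		}
		sel := preds[idx]
		if pb.spillsLive < sel.spillsLive {
			// Fewer live spills suggests lower register pressure on this path.
			idx = i
		} else if pb.spillsLive == sel.spillsLive {
			// Tie: prefer the likely branch, then the earlier block in layout order.
			if pb.likelyBranch && !sel.likelyBranch || pb.order < sel.order {
				idx = i
			}
		}
	}
	return idx
}

func main() {
	// A block at layout position 5 with three predecessors: p1 carries a call
	// (many live spills), p2 is on the likely path, p3 is a backward edge.
	preds := []pred{
		{"p1", 2, 7, false},
		{"p2", 3, 1, true},
		{"p3", 9, 0, false},
	}
	fmt.Println(preds[pickPredecessor(5, preds)].name) // prints: p2
}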