diff --git a/src/cmd/compile/internal/ssa/dom.go b/src/cmd/compile/internal/ssa/dom.go
index 4790e3383a..3dae5fbf07 100644
--- a/src/cmd/compile/internal/ssa/dom.go
+++ b/src/cmd/compile/internal/ssa/dom.go
@@ -296,7 +296,8 @@ func dominatorsSimple(f *Func) []*Block {
 // intersect finds the closest dominator of both b and c.
 // It requires a postorder numbering of all the blocks.
 func intersect(b, c *Block, postnum []int, idom []*Block) *Block {
-	// TODO: This loop is O(n^2). See BenchmarkNilCheckDeep*.
+	// TODO: This loop is O(n^2). It used to be used in nilcheck,
+	// see BenchmarkNilCheckDeep*.
 	for b != c {
 		if postnum[b.ID] < postnum[c.ID] {
 			b = idom[b.ID]
diff --git a/src/cmd/compile/internal/ssa/func.go b/src/cmd/compile/internal/ssa/func.go
index b005876096..5dc352e991 100644
--- a/src/cmd/compile/internal/ssa/func.go
+++ b/src/cmd/compile/internal/ssa/func.go
@@ -104,6 +104,31 @@ func (f *Func) newValue(op Op, t Type, b *Block, pos src.XPos) *Value {
 	return v
 }
 
+// newValueNoBlock allocates a new Value with the given fields.
+// The returned value is not placed in any block.  Once the caller
+// decides on a block b, it must set b.Block and append
+// the returned value to b.Values.
+func (f *Func) newValueNoBlock(op Op, t Type, pos src.XPos) *Value {
+	var v *Value
+	if f.freeValues != nil {
+		v = f.freeValues
+		f.freeValues = v.argstorage[0]
+		v.argstorage[0] = nil
+	} else {
+		ID := f.vid.get()
+		if int(ID) < len(f.Config.values) {
+			v = &f.Config.values[ID]
+		} else {
+			v = &Value{ID: ID}
+		}
+	}
+	v.Op = op
+	v.Type = t
+	v.Block = nil // caller must fix this.
+	v.Pos = pos
+	return v
+}
+
 // logPassStat writes a string key and int value as a warning in a
 // tab-separated format easily handled by spreadsheets or awk.
 // file names, lines, and function names are included to provide enough (?)
diff --git a/src/cmd/compile/internal/ssa/regalloc.go b/src/cmd/compile/internal/ssa/regalloc.go
index e40fe17ad4..95f2f7c91a 100644
--- a/src/cmd/compile/internal/ssa/regalloc.go
+++ b/src/cmd/compile/internal/ssa/regalloc.go
@@ -26,28 +26,48 @@
 
 // Spilling
 //
-// For every value, we generate a spill immediately after the value itself.
-//     x = Op y z    : AX
-//     x2 = StoreReg x
-// While AX still holds x, any uses of x will use that value. When AX is needed
-// for another value, we simply reuse AX.  Spill code has already been generated
-// so there is no code generated at "spill" time. When x is referenced
-// subsequently, we issue a load to restore x to a register using x2 as
-//  its argument:
-//    x3 = Restore x2 : CX
-// x3 can then be used wherever x is referenced again.
-// If the spill (x2) is never used, it will be removed at the end of regalloc.
+// During the normal course of the allocator, we might throw a still-live
+// value out of all registers. When that value is subsequently used, we must
+// load it from a slot on the stack. We must also issue an instruction to
+// initialize that stack location with a copy of v.
+//
+// pre-regalloc:
+//   (1) v = Op ...
+//   (2) x = Op ...
+//   (3) ... = Op v ...
+//
+// post-regalloc:
+//   (1) v = Op ...    : AX // computes v, store result in AX
+//       s = StoreReg v     // spill v to a stack slot
+//   (2) x = Op ...    : AX // some other op uses AX
+//       c = LoadReg s : CX // restore v from stack slot
+//   (3) ... = Op c ...     // use the restored value
+//
+// Allocation occurs normally until we reach (3) and we realize we have
+// a use of v and it isn't in any register. At that point, we allocate
+// a spill (a StoreReg) for v. We can't determine the correct place for
+// the spill at this point, so we allocate the spill as blockless initially.
+// The restore is then generated to load v back into a register so it can
+// be used. Subsequent uses of v will use the restored value c instead.
+//
+// What remains is the question of where to schedule the spill.
+// During allocation, we keep track of the dominator of all restores of v.
+// The spill of v must dominate that block. The spill must also be issued at
+// a point where v is still in a register.
+//
+// To find the right place, start at b, the block which dominates all restores.
+//  - If b is v.Block, then issue the spill right after v.
+//    It is known to be in a register at that point, and dominates any restores.
+//  - Otherwise, if v is in a register at the start of b,
+//    put the spill of v at the start of b.
+//  - Otherwise, set b = immediate dominator of b, and repeat.
 //
 // Phi values are special, as always. We define two kinds of phis, those
 // where the merge happens in a register (a "register" phi) and those where
 // the merge happens in a stack location (a "stack" phi).
 //
 // A register phi must have the phi and all of its inputs allocated to the
-// same register. Register phis are spilled similarly to regular ops:
-//     b1: y = ... : AX        b2: z = ... : AX
-//         goto b3                 goto b3
-//     b3: x = phi(y, z) : AX
-//         x2 = StoreReg x
+// same register. Register phis are spilled similarly to regular ops.
 //
 // A stack phi must have the phi and all of its inputs allocated to the same
 // stack location. Stack phis start out life already spilled - each phi
@@ -91,18 +111,6 @@
 // will have no use (so don't run deadcode after regalloc!).
 // TODO: maybe we should introduce these extra phis?
 
-// Additional not-quite-SSA output occurs when spills are sunk out
-// of loops to the targets of exit edges from the loop.  Before sinking,
-// there is one spill site (one StoreReg) targeting stack slot X, after
-// sinking there may be multiple spill sites targeting stack slot X,
-// with no phi functions at any join points reachable by the multiple
-// spill sites.  In addition, uses of the spill from copies of the original
-// will not name the copy in their reference; instead they will name
-// the original, though both will have the same spill location.  The
-// first sunk spill will be the original, but moved, to an exit block,
-// thus ensuring that there is a definition somewhere corresponding to
-// the original spill's uses.
-
 package ssa
 
 import (
@@ -186,14 +194,15 @@ type use struct {
 	next *use     // linked list of uses of a value in nondecreasing dist order
 }
 
+// A valState records the register allocation state for a (pre-regalloc) value.
 type valState struct {
 	regs              regMask // the set of registers holding a Value (usually just one)
 	uses              *use    // list of uses in this block
-	spill             *Value  // spilled copy of the Value
-	spillUsed         bool
-	spillUsedShuffle  bool // true if used in shuffling, after ordinary uses
-	needReg           bool // cached value of !v.Type.IsMemory() && !v.Type.IsVoid() && !.v.Type.IsFlags()
-	rematerializeable bool // cached value of v.rematerializeable()
+	spill             *Value  // spilled copy of the Value (if any)
+	restoreMin        int32   // minimum of all restores' blocks' sdom.entry
+	restoreMax        int32   // maximum of all restores' blocks' sdom.exit
+	needReg           bool    // cached value of !v.Type.IsMemory() && !v.Type.IsVoid() && !.v.Type.IsFlags()
+	rematerializeable bool    // cached value of v.rematerializeable()
 }
 
 type regState struct {
@@ -205,6 +214,7 @@ type regState struct {
 type regAllocState struct {
 	f *Func
 
+	sdom        SparseTree
 	registers   []Register
 	numRegs     register
 	SPReg       register
@@ -271,15 +281,6 @@ type regAllocState struct {
 	loopnest *loopnest
 }
 
-type spillToSink struct {
-	spill *Value // Spill instruction to move (a StoreReg)
-	dests int32  // Bitmask indicating exit blocks from loop in which spill/val is defined. 1<<i set means val is live into loop.exitBlocks[i]
-}
-
-func (sts *spillToSink) spilledValue() *Value {
-	return sts.spill.Args[0]
-}
-
 type endReg struct {
 	r register
 	v *Value // pre-regalloc value held in this register (TODO: can we use ID here?)
@@ -288,7 +289,8 @@ type endReg struct {
 
 type startReg struct {
 	r   register
-	vid ID       // pre-regalloc value needed in this register
+	v   *Value   // pre-regalloc value needed in this register
+	c   *Value   // cached version of the value
 	pos src.XPos // source position of use of this register
 }
 
@@ -405,6 +407,28 @@ func (s *regAllocState) allocReg(mask regMask, v *Value) register {
 	return r
 }
 
+// makeSpill returns a Value which represents the spilled value of v.
+// b is the block in which the spill is used.
+func (s *regAllocState) makeSpill(v *Value, b *Block) *Value {
+	vi := &s.values[v.ID]
+	if vi.spill != nil {
+		// Final block not known - keep track of subtree where restores reside.
+		vi.restoreMin = min32(vi.restoreMin, s.sdom[b.ID].entry)
+		vi.restoreMax = max32(vi.restoreMax, s.sdom[b.ID].exit)
+		return vi.spill
+	}
+	// Make a spill for v. We don't know where we want
+	// to put it yet, so we leave it blockless for now.
+	spill := s.f.newValueNoBlock(OpStoreReg, v.Type, v.Pos)
+	// We also don't know what the spill's arg will be.
+	// Leave it argless for now.
+	s.setOrig(spill, v)
+	vi.spill = spill
+	vi.restoreMin = s.sdom[b.ID].entry
+	vi.restoreMax = s.sdom[b.ID].exit
+	return spill
+}
+
 // allocValToReg allocates v to a register selected from regMask and
 // returns the register copy of v. Any previous user is kicked out and spilled
 // (if necessary). Load code is added at the current pc. If nospill is set the
@@ -442,17 +466,12 @@ func (s *regAllocState) allocValToReg(v *Value, mask regMask, nospill bool, pos
 		// Rematerialize instead of loading from the spill location.
 		c = v.copyInto(s.curBlock)
 	} else {
-		switch {
 		// Load v from its spill location.
-		case vi.spill != nil:
-			if s.f.pass.debug > logSpills {
-				s.f.Config.Warnl(vi.spill.Pos, "load spill for %v from %v", v, vi.spill)
-			}
-			c = s.curBlock.NewValue1(pos, OpLoadReg, v.Type, vi.spill)
-			vi.spillUsed = true
-		default:
-			s.f.Fatalf("attempt to load unspilled value %v", v.LongString())
+		spill := s.makeSpill(v, s.curBlock)
+		if s.f.pass.debug > logSpills {
+			s.f.Config.Warnl(vi.spill.Pos, "load spill for %v from %v", v, spill)
 		}
+		c = s.curBlock.NewValue1(pos, OpLoadReg, v.Type, spill)
 	}
 	s.setOrig(c, v)
 	s.assignReg(r, v, c)
@@ -615,6 +634,7 @@ func (s *regAllocState) init(f *Func) {
 	s.endRegs = make([][]endReg, f.NumBlocks())
 	s.startRegs = make([][]startReg, f.NumBlocks())
 	s.spillLive = make([][]ID, f.NumBlocks())
+	s.sdom = f.sdom()
 }
 
 // Adds a use record for id at distance dist from the start of the block.
@@ -688,22 +708,6 @@ func (s *regAllocState) compatRegs(t Type) regMask {
 	return m & s.allocatable
 }
 
-// loopForBlock returns the loop containing block b,
-// provided that the loop is "interesting" for purposes
-// of improving register allocation (= is inner, and does
-// not contain a call)
-func (s *regAllocState) loopForBlock(b *Block) *loop {
-	loop := s.loopnest.b2l[b.ID]
-
-	// Minor for-the-time-being optimization: nothing happens
-	// unless a loop is both inner and call-free, therefore
-	// don't bother with other loops.
-	if loop != nil && (loop.containsCall || !loop.isInner) {
-		loop = nil
-	}
-	return loop
-}
-
 func (s *regAllocState) regalloc(f *Func) {
 	liveSet := f.newSparseSet(f.NumValues())
 	defer f.retSparseSet(liveSet)
@@ -712,14 +716,6 @@ func (s *regAllocState) regalloc(f *Func) {
 	var phiRegs []register
 	var args []*Value
 
-	// statistics
-	var nSpills int               // # of spills remaining
-	var nSpillsInner int          // # of spills remaining in inner loops
-	var nSpillsSunk int           // # of sunk spills remaining
-	var nSpillsChanged int        // # of sunk spills lost because of register use change
-	var nSpillsSunkUnused int     // # of spills not sunk because they were removed completely
-	var nSpillsNotSunkLateUse int // # of spills not sunk because of very late use (in shuffle)
-
 	// Data structure used for computing desired registers.
 	var desired desiredState
 
@@ -734,24 +730,8 @@ func (s *regAllocState) regalloc(f *Func) {
 		f.Fatalf("entry block must be first")
 	}
 
-	// Get loop nest so that spills in inner loops can be
-	// tracked.  When the last block of a loop is processed,
-	// attempt to move spills out of the loop.
-	s.loopnest.findExits()
-
-	// Spills are moved from one block's slice of values to another's.
-	// This confuses register allocation if it occurs before it is
-	// complete, so candidates are recorded, then rechecked and
-	// moved after all allocation (register and stack) is complete.
-	// Because movement is only within a stack slot's lifetime, it
-	// is safe to do this.
-	var toSink []spillToSink
-	// Will be used to figure out live inputs to exit blocks of inner loops.
-	entryCandidates := newSparseMap(f.NumValues())
-
 	for _, b := range f.Blocks {
 		s.curBlock = b
-		loop := s.loopForBlock(b)
 
 		// Initialize liveSet and uses fields for this block.
 		// Walk backwards through the block doing liveness analysis.
@@ -945,22 +925,11 @@ func (s *regAllocState) regalloc(f *Func) {
 				if r == noRegister {
 					// stack-based phi
 					// Spills will be inserted in all the predecessors below.
-					s.values[v.ID].spill = v        // v starts life spilled
-					s.values[v.ID].spillUsed = true // use is guaranteed
+					s.values[v.ID].spill = v // v starts life spilled
 					continue
 				}
 				// register-based phi
 				s.assignReg(r, v, v)
-				// Spill the phi in case we need to restore it later.
-				spill := b.NewValue1(v.Pos, OpStoreReg, v.Type, v)
-				s.setOrig(spill, v)
-				s.values[v.ID].spill = spill
-				s.values[v.ID].spillUsed = false
-				if loop != nil {
-					loop.spills = append(loop.spills, v)
-					nSpillsInner++
-				}
-				nSpills++
 			}
 
 			// Save the starting state for use by merge edges.
@@ -975,14 +944,14 @@ func (s *regAllocState) regalloc(f *Func) {
 					// specially during merge edge processing.
 					continue
 				}
-				regList = append(regList, startReg{r, v.ID, s.values[v.ID].uses.pos})
+				regList = append(regList, startReg{r, v, s.regs[r].c, s.values[v.ID].uses.pos})
 			}
 			s.startRegs[b.ID] = regList
 
 			if s.f.pass.debug > regDebug {
 				fmt.Printf("after phis\n")
 				for _, x := range s.startRegs[b.ID] {
-					fmt.Printf("  %s: v%d\n", s.registers[x.r].Name(), x.vid)
+					fmt.Printf("  %s: v%d\n", s.registers[x.r].Name(), x.v.ID)
 				}
 			}
 		}
@@ -1005,7 +974,7 @@ func (s *regAllocState) regalloc(f *Func) {
 			succ := e.b
 			// TODO: prioritize likely successor?
 			for _, x := range s.startRegs[succ.ID] {
-				desired.add(x.vid, x.r)
+				desired.add(x.v.ID, x.r)
 			}
 			// Process phi ops in succ.
 			pidx := e.i
@@ -1100,7 +1069,6 @@ func (s *regAllocState) regalloc(f *Func) {
 				// any register here. We just set up the spill pointer to
 				// point at itself and any later user will restore it to use it.
 				s.values[v.ID].spill = v
-				s.values[v.ID].spillUsed = true // use is guaranteed
 				b.Values = append(b.Values, v)
 				s.advanceUses(v)
 				continue
@@ -1109,7 +1077,7 @@ func (s *regAllocState) regalloc(f *Func) {
 				// Make sure the argument to v is still live here.
 				s.advanceUses(v)
 				vi := &s.values[v.Args[0].ID]
-				if vi.spillUsed {
+				if vi.spill != nil {
 					// Use the spill location.
 					v.SetArg(0, vi.spill)
 				} else {
@@ -1373,28 +1341,7 @@ func (s *regAllocState) regalloc(f *Func) {
 			}
 			b.Values = append(b.Values, v)
 
-			// Issue a spill for this value. We issue spills unconditionally,
-			// then at the end of regalloc delete the ones we never use.
-			// TODO: schedule the spill at a point that dominates all restores.
-			// The restore may be off in an unlikely branch somewhere and it
-			// would be better to have the spill in that unlikely branch as well.
-			// v := ...
-			// if unlikely {
-			//     f()
-			// }
-			// It would be good to have both spill and restore inside the IF.
 		issueSpill:
-			if s.values[v.ID].needReg {
-				spill := b.NewValue1(v.Pos, OpStoreReg, v.Type, v)
-				s.setOrig(spill, v)
-				s.values[v.ID].spill = spill
-				s.values[v.ID].spillUsed = false
-				if loop != nil {
-					loop.spills = append(loop.spills, v)
-					nSpillsInner++
-				}
-				nSpills++
-			}
 		}
 
 		// Load control value into reg.
@@ -1504,91 +1451,24 @@ func (s *regAllocState) regalloc(f *Func) {
 		}
 
 		// If a value is live at the end of the block and
-		// isn't in a register, remember that its spill location
-		// is live. We need to remember this information so that
+		// isn't in a register, generate a use for the spill location.
+		// We need to remember this information so that
 		// the liveness analysis in stackalloc is correct.
 		for _, e := range s.live[b.ID] {
-			if s.values[e.ID].regs != 0 {
+			vi := &s.values[e.ID]
+			if vi.regs != 0 {
 				// in a register, we'll use that source for the merge.
 				continue
 			}
-			spill := s.values[e.ID].spill
-			if spill == nil {
-				// rematerializeable values will have spill==nil.
+			if vi.rematerializeable {
+				// we'll rematerialize during the merge.
 				continue
 			}
+			//fmt.Printf("live-at-end spill for %s at %s\n", s.orig[e.ID], b)
+			spill := s.makeSpill(s.orig[e.ID], b)
 			s.spillLive[b.ID] = append(s.spillLive[b.ID], spill.ID)
-			s.values[e.ID].spillUsed = true
 		}
 
-		// Keep track of values that are spilled in the loop, but whose spill
-		// is not used in the loop.  It may be possible to move ("sink") the
-		// spill out of the loop into one or more exit blocks.
-		if loop != nil {
-			loop.scratch++                    // increment count of blocks in this loop that have been processed
-			if loop.scratch == loop.nBlocks { // just processed last block of loop, if it is an inner loop.
-				// This check is redundant with code at the top of the loop.
-				// This is definitive; the one at the top of the loop is an optimization.
-				if loop.isInner && // Common case, easier, most likely to be profitable
-					!loop.containsCall && // Calls force spills, also lead to puzzling spill info.
-					len(loop.exits) <= 32 { // Almost no inner loops have more than 32 exits,
-					// and this allows use of a bitvector and a sparseMap.
-
-					// TODO: exit calculation is messed up for non-inner loops
-					// because of multilevel exits that are not part of the "exit"
-					// count.
-
-					// Compute the set of spill-movement candidates live at entry to exit blocks.
-					// isLoopSpillCandidate filters for
-					// (1) defined in appropriate loop
-					// (2) needs a register
-					// (3) spill not already used (in the loop)
-					// Condition (3) === "in a register at all loop exits"
-
-					entryCandidates.clear()
-
-					for whichExit, ss := range loop.exits {
-						// Start with live at end.
-						for _, li := range s.live[ss.ID] {
-							if s.isLoopSpillCandidate(loop, s.orig[li.ID]) {
-								// s.live contains original IDs, use s.orig above to map back to *Value
-								entryCandidates.setBit(li.ID, uint(whichExit))
-							}
-						}
-						// Control can also be live.
-						if ss.Control != nil && s.orig[ss.Control.ID] != nil && s.isLoopSpillCandidate(loop, s.orig[ss.Control.ID]) {
-							entryCandidates.setBit(s.orig[ss.Control.ID].ID, uint(whichExit))
-						}
-						// Walk backwards, filling in locally live values, removing those defined.
-						for i := len(ss.Values) - 1; i >= 0; i-- {
-							v := ss.Values[i]
-							vorig := s.orig[v.ID]
-							if vorig != nil {
-								entryCandidates.remove(vorig.ID) // Cannot be an issue, only keeps the sets smaller.
-							}
-							for _, a := range v.Args {
-								aorig := s.orig[a.ID]
-								if aorig != nil && s.isLoopSpillCandidate(loop, aorig) {
-									entryCandidates.setBit(aorig.ID, uint(whichExit))
-								}
-							}
-						}
-					}
-
-					for _, e := range loop.spills {
-						whichblocks := entryCandidates.get(e.ID)
-						oldSpill := s.values[e.ID].spill
-						if whichblocks != 0 && whichblocks != -1 { // -1 = not in map.
-							toSink = append(toSink, spillToSink{spill: oldSpill, dests: whichblocks})
-						}
-					}
-
-				} // loop is inner etc
-				loop.scratch = 0 // Don't leave a mess, just in case.
-				loop.spills = nil
-			} // if scratch == nBlocks
-		} // if loop is not nil
-
 		// Clear any final uses.
 		// All that is left should be the pseudo-uses added for values which
 		// are live at the end of b.
@@ -1606,52 +1486,8 @@ func (s *regAllocState) regalloc(f *Func) {
 		}
 	}
 
-	// Erase any spills we never used
-	for i := range s.values {
-		vi := s.values[i]
-		if vi.spillUsed {
-			if s.f.pass.debug > logSpills && vi.spill.Op != OpArg {
-				s.f.Config.Warnl(vi.spill.Pos, "spilled value at %v remains", vi.spill)
-			}
-			continue
-		}
-		spill := vi.spill
-		if spill == nil {
-			// Constants, SP, SB, ...
-			continue
-		}
-		loop := s.loopForBlock(spill.Block)
-		if loop != nil {
-			nSpillsInner--
-		}
-
-		spill.Args[0].Uses--
-		f.freeValue(spill)
-		nSpills--
-	}
-
-	for _, b := range f.Blocks {
-		i := 0
-		for _, v := range b.Values {
-			if v.Op == OpInvalid {
-				continue
-			}
-			b.Values[i] = v
-			i++
-		}
-		b.Values = b.Values[:i]
-		// TODO: zero b.Values[i:], recycle Values
-		// Not important now because this is the last phase that manipulates Values
-	}
-
-	// Must clear these out before any potential recycling, though that's
-	// not currently implemented.
-	for i, ts := range toSink {
-		vsp := ts.spill
-		if vsp.Op == OpInvalid { // This spill was completely eliminated
-			toSink[i].spill = nil
-		}
-	}
+	// Decide where the spills we generated will go.
+	s.placeSpills()
 
 	// Anything that didn't get a register gets a stack location here.
 	// (StoreReg, stack-based phis, inputs, ...)
@@ -1660,114 +1496,6 @@ func (s *regAllocState) regalloc(f *Func) {
 	// Fix up all merge edges.
 	s.shuffle(stacklive)
 
-	// Insert moved spills (that have not been marked invalid above)
-	// at start of appropriate block and remove the originals from their
-	// location within loops.  Notice that this can break SSA form;
-	// if a spill is sunk to multiple exits, there will be no phi for that
-	// spill at a join point downstream of those two exits, though the
-	// two spills will target the same stack slot.  Notice also that this
-	// takes place after stack allocation, so the stack allocator does
-	// not need to process these malformed flow graphs.
-sinking:
-	for _, ts := range toSink {
-		vsp := ts.spill
-		if vsp == nil { // This spill was completely eliminated
-			nSpillsSunkUnused++
-			continue sinking
-		}
-		e := ts.spilledValue()
-		if s.values[e.ID].spillUsedShuffle {
-			nSpillsNotSunkLateUse++
-			continue sinking
-		}
-
-		// move spills to a better (outside of loop) block.
-		// This would be costly if it occurred very often, but it doesn't.
-		b := vsp.Block
-		loop := s.loopnest.b2l[b.ID]
-		dests := ts.dests
-
-		// Pre-check to be sure that spilled value is still in expected register on all exits where live.
-	check_val_still_in_reg:
-		for i := uint(0); i < 32 && dests != 0; i++ {
-
-			if dests&(1<<i) == 0 {
-				continue
-			}
-			dests ^= 1 << i
-			d := loop.exits[i]
-			if len(d.Preds) > 1 {
-				panic("Should be impossible given critical edges removed")
-			}
-			p := d.Preds[0].b // block in loop exiting to d.
-
-			endregs := s.endRegs[p.ID]
-			for _, regrec := range endregs {
-				if regrec.v == e && regrec.r != noRegister && regrec.c == e { // TODO: regrec.c != e implies different spill possible.
-					continue check_val_still_in_reg
-				}
-			}
-			// If here, the register assignment was lost down at least one exit and it can't be sunk
-			if s.f.pass.debug > moveSpills {
-				s.f.Config.Warnl(e.Pos, "lost register assignment for spill %v in %v at exit %v to %v",
-					vsp, b, p, d)
-			}
-			nSpillsChanged++
-			continue sinking
-		}
-
-		nSpillsSunk++
-		nSpillsInner--
-		// don't update nSpills, since spill is only moved, and if it is duplicated, the spills-on-a-path is not increased.
-
-		dests = ts.dests
-
-		// remove vsp from b.Values
-		i := 0
-		for _, w := range b.Values {
-			if vsp == w {
-				continue
-			}
-			b.Values[i] = w
-			i++
-		}
-		b.Values = b.Values[:i]
-
-		first := true
-		for i := uint(0); i < 32 && dests != 0; i++ {
-
-			if dests&(1<<i) == 0 {
-				continue
-			}
-
-			dests ^= 1 << i
-
-			d := loop.exits[i]
-			vspnew := vsp // reuse original for first sunk spill, saves tracking down and renaming uses
-			if !first {   // any sunk spills after first must make a copy
-				vspnew = d.NewValue1(e.Pos, OpStoreReg, e.Type, e)
-				f.setHome(vspnew, f.getHome(vsp.ID)) // copy stack home
-				if s.f.pass.debug > moveSpills {
-					s.f.Config.Warnl(e.Pos, "copied spill %v in %v for %v to %v in %v",
-						vsp, b, e, vspnew, d)
-				}
-			} else {
-				first = false
-				vspnew.Block = d
-				d.Values = append(d.Values, vspnew)
-				if s.f.pass.debug > moveSpills {
-					s.f.Config.Warnl(e.Pos, "moved spill %v in %v for %v to %v in %v",
-						vsp, b, e, vspnew, d)
-				}
-			}
-
-			// shuffle vspnew to the beginning of its block
-			copy(d.Values[1:], d.Values[0:len(d.Values)-1])
-			d.Values[0] = vspnew
-
-		}
-	}
-
 	// Erase any copies we never used.
 	// Also, an unused copy might be the only use of another copy,
 	// so continue erasing until we reach a fixed point.
@@ -1800,38 +1528,143 @@ sinking:
 		}
 		b.Values = b.Values[:i]
 	}
+}
 
-	if f.pass.stats > 0 {
-		f.LogStat("spills_info",
-			nSpills, "spills", nSpillsInner, "inner_spills_remaining", nSpillsSunk, "inner_spills_sunk", nSpillsSunkUnused, "inner_spills_unused", nSpillsNotSunkLateUse, "inner_spills_shuffled", nSpillsChanged, "inner_spills_changed")
+func (s *regAllocState) placeSpills() {
+	f := s.f
+
+	// Precompute some useful info.
+	phiRegs := make([]regMask, f.NumBlocks())
+	for _, b := range f.Blocks {
+		var m regMask
+		for _, v := range b.Values {
+			if v.Op != OpPhi {
+				break
+			}
+			if r, ok := f.getHome(v.ID).(*Register); ok {
+				m |= regMask(1) << uint(r.num)
+			}
+		}
+		phiRegs[b.ID] = m
 	}
-}
 
-// isLoopSpillCandidate indicates whether the spill for v satisfies preliminary
-// spill-sinking conditions just after the last block of loop has been processed.
-// In particular:
-//   v needs a register.
-//   v's spill is not (YET) used.
-//   v's definition is within loop.
-// The spill may be used in the future, either by an outright use
-// in the code, or by shuffling code inserted after stack allocation.
-// Outright uses cause sinking; shuffling (within the loop) inhibits it.
-func (s *regAllocState) isLoopSpillCandidate(loop *loop, v *Value) bool {
-	return s.values[v.ID].needReg && !s.values[v.ID].spillUsed && s.loopnest.b2l[v.Block.ID] == loop
-}
+	// Start maps block IDs to the list of spills
+	// that go at the start of the block (but after any phis).
+	start := map[ID][]*Value{}
+	// After maps value IDs to the list of spills
+	// that go immediately after that value ID.
+	after := map[ID][]*Value{}
 
-// lateSpillUse notes a late (after stack allocation) use of the spill of value with ID vid.
-// This will inhibit spill sinking.
-func (s *regAllocState) lateSpillUse(vid ID) {
-	// TODO investigate why this is necessary.
-	// It appears that an outside-the-loop use of
-	// an otherwise sinkable spill makes the spill
-	// a candidate for shuffling, when it would not
-	// otherwise have been the case (spillUsed was not
-	// true when isLoopSpillCandidate was called, yet
-	// it was shuffled).  Such shuffling cuts the amount
-	// of spill sinking by more than half (in make.bash)
-	s.values[vid].spillUsedShuffle = true
+	for i := range s.values {
+		vi := s.values[i]
+		spill := vi.spill
+		if spill == nil {
+			continue
+		}
+		if spill.Block != nil {
+			// Some spills are already fully set up,
+			// like OpArgs and stack-based phis.
+			continue
+		}
+		v := s.orig[i]
+
+		// Walk down the dominator tree looking for a good place to
+		// put the spill of v.  At the start "best" is the best place
+		// we have found so far.
+		// TODO: find a way to make this O(1) without arbitrary cutoffs.
+		best := v.Block
+		bestArg := v
+		var bestDepth int16
+		if l := s.loopnest.b2l[best.ID]; l != nil {
+			bestDepth = l.depth
+		}
+		b := best
+		const maxSpillSearch = 100
+		for i := 0; i < maxSpillSearch; i++ {
+			// Find the child of b in the dominator tree which
+			// dominates all restores.
+			p := b
+			b = nil
+			for c := s.sdom.Child(p); c != nil && i < maxSpillSearch; c, i = s.sdom.Sibling(c), i+1 {
+				if s.sdom[c.ID].entry <= vi.restoreMin && s.sdom[c.ID].exit >= vi.restoreMax {
+					// c also dominates all restores.  Walk down into c.
+					b = c
+					break
+				}
+			}
+			if b == nil {
+				// Ran out of blocks which dominate all restores.
+				break
+			}
+
+			var depth int16
+			if l := s.loopnest.b2l[b.ID]; l != nil {
+				depth = l.depth
+			}
+			if depth > bestDepth {
+				// Don't push the spill into a deeper loop.
+				continue
+			}
+
+			// If v is in a register at the start of b, we can
+			// place the spill here (after the phis).
+			if len(b.Preds) == 1 {
+				for _, e := range s.endRegs[b.Preds[0].b.ID] {
+					if e.v == v {
+						// Found a better spot for the spill.
+						best = b
+						bestArg = e.c
+						bestDepth = depth
+						break
+					}
+				}
+			} else {
+				for _, e := range s.startRegs[b.ID] {
+					if e.v == v {
+						// Found a better spot for the spill.
+						best = b
+						bestArg = e.c
+						bestDepth = depth
+						break
+					}
+				}
+			}
+		}
+
+		// Put the spill in the best block we found.
+		spill.Block = best
+		spill.AddArg(bestArg)
+		if best == v.Block && v.Op != OpPhi {
+			// Place immediately after v.
+			after[v.ID] = append(after[v.ID], spill)
+		} else {
+			// Place at the start of best block.
+			start[best.ID] = append(start[best.ID], spill)
+		}
+	}
+
+	// Insert spill instructions into the block schedules.
+	var oldSched []*Value
+	for _, b := range f.Blocks {
+		nphi := 0
+		for _, v := range b.Values {
+			if v.Op != OpPhi {
+				break
+			}
+			nphi++
+		}
+		oldSched = append(oldSched[:0], b.Values[nphi:]...)
+		b.Values = b.Values[:nphi]
+		for _, v := range start[b.ID] {
+			b.Values = append(b.Values, v)
+		}
+		for _, v := range oldSched {
+			b.Values = append(b.Values, v)
+			for _, w := range after[v.ID] {
+				b.Values = append(b.Values, w)
+			}
+		}
+	}
 }
 
 // shuffle fixes up all the merge edges (those going into blocks of indegree > 1).
@@ -1919,13 +1752,24 @@ func (e *edgeState) setup(idx int, srcReg []endReg, dstReg []startReg, stacklive
 	for _, spillID := range stacklive {
 		v := e.s.orig[spillID]
 		spill := e.s.values[v.ID].spill
+		if !e.s.sdom.isAncestorEq(spill.Block, e.p) {
+			// Spills were placed that only dominate the uses found
+			// during the first regalloc pass. The edge fixup code
+			// can't use a spill location if the spill doesn't dominate
+			// the edge.
+			// We are guaranteed that if the spill doesn't dominate this edge,
+			// then the value is available in a register (because we called
+			// makeSpill for every value not in a register at the start
+			// of an edge).
+			continue
+		}
 		e.set(e.s.f.getHome(spillID), v.ID, spill, false, src.NoXPos) // don't care the position of the source
 	}
 
 	// Figure out all the destinations we need.
 	dsts := e.destinations[:0]
 	for _, x := range dstReg {
-		dsts = append(dsts, dstRecord{&e.s.registers[x.r], x.vid, nil, x.pos})
+		dsts = append(dsts, dstRecord{&e.s.registers[x.r], x.v.ID, nil, x.pos})
 	}
 	// Phis need their args to end up in a specific location.
 	for _, v := range e.b.Values {
@@ -2010,7 +1854,6 @@ func (e *edgeState) process() {
 		if _, isReg := loc.(*Register); isReg {
 			c = e.p.NewValue1(d.pos, OpCopy, c.Type, c)
 		} else {
-			e.s.lateSpillUse(vid)
 			c = e.p.NewValue1(d.pos, OpLoadReg, c.Type, c)
 		}
 		e.set(r, vid, c, false, d.pos)
@@ -2028,9 +1871,6 @@ func (e *edgeState) processDest(loc Location, vid ID, splice **Value, pos src.XP
 			(*splice).Uses--
 			*splice = occupant.c
 			occupant.c.Uses++
-			if occupant.c.Op == OpStoreReg {
-				e.s.lateSpillUse(vid)
-			}
 		}
 		// Note: if splice==nil then c will appear dead. This is
 		// non-SSA formed code, so be careful after this pass not to run
@@ -2106,7 +1946,6 @@ func (e *edgeState) processDest(loc Location, vid ID, splice **Value, pos src.XP
 			}
 		} else {
 			if dstReg {
-				e.s.lateSpillUse(vid)
 				x = e.p.NewValue1(pos, OpLoadReg, c.Type, c)
 			} else {
 				// mem->mem. Use temp register.
@@ -2124,7 +1963,6 @@ func (e *edgeState) processDest(loc Location, vid ID, splice **Value, pos src.XP
 				e.erase(loc)
 
 				r := e.findRegFor(c.Type)
-				e.s.lateSpillUse(vid)
 				t := e.p.NewValue1(pos, OpLoadReg, c.Type, c)
 				e.set(r, vid, t, false, pos)
 				x = e.p.NewValue1(pos, OpStoreReg, loc.(LocalSlot).Type, t)
@@ -2630,3 +2468,16 @@ func (d *desiredState) merge(x *desiredState) {
 		d.addList(e.ID, e.regs)
 	}
 }
+
+func min32(x, y int32) int32 {
+	if x < y {
+		return x
+	}
+	return y
+}
+func max32(x, y int32) int32 {
+	if x > y {
+		return x
+	}
+	return y
+}