[dev.ssa] cmd/compile: initial 386 SSA port

Basically just copied all the amd64 files, removed all the *Q ops, and rebuilt. Compiles fib successfully. Still need to do: - all the 64->32 bit op translations. - audit for instructions that aren't available on 386. - GO386=387? Update #16358 Change-Id: Ib8c684586416a554a527a5eefa0cff71424e36f5 Reviewed-on: https://go-review.googlesource.com/24912 Reviewed-by: Cherry Zhang <cherryyz@google.com>
2016-07-13 13:43:08 -07:00 · 2016-07-13 13:43:08 -07:00 · 14cf6e2083
parent efefd11725
commit 14cf6e2083
7 changed files with 19340 additions and 2 deletions
--- a/src/cmd/compile/internal/ssa/config.go
+++ b/src/cmd/compile/internal/ssa/config.go
@ -142,8 +142,14 @@ func NewConfig(arch string, fe Frontend, ctxt *obj.Link, optimize bool) *Config
 	case "386":
 		c.IntSize = 4
 		c.PtrSize = 4
-		c.lowerBlock = rewriteBlockAMD64
-		c.lowerValue = rewriteValueAMD64 // TODO(khr): full 32-bit support
+		c.lowerBlock = rewriteBlock386
+		c.lowerValue = rewriteValue386
+		c.registers = registers386[:]
+		c.gpRegMask = gpRegMask386
+		c.fpRegMask = fpRegMask386
+		c.flagRegMask = flagRegMask386
+		c.FPReg = framepointerReg386
+		c.hasGReg = false
 	case "arm":
 		c.IntSize = 4
 		c.PtrSize = 4
--- a/src/cmd/compile/internal/ssa/gen/386.rules
+++ b/src/cmd/compile/internal/ssa/gen/386.rules
--- a/src/cmd/compile/internal/ssa/gen/386Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/386Ops.go
@ -0,0 +1,477 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+import "strings"
+
+// Notes:
+//  - Integer types live in the low portion of registers. Upper portions are junk.
+//  - Boolean types use the low-order byte of a register. 0=false, 1=true.
+//    Upper bytes are junk.
+//  - Floating-point types live in the low natural slot of an sse2 register.
+//    Unused portions are junk.
+//  - We do not use AH,BH,CH,DH registers.
+//  - When doing sub-register operations, we try to write the whole
+//    destination register to avoid a partial-register write.
+//  - Unused portions of AuxInt (or the Val portion of ValAndOff) are
+//    filled by sign-extending the used portion.  Users of AuxInt which interpret
+//    AuxInt as unsigned (e.g. shifts) must be careful.
+
+// Suffixes encode the bit width of various instructions.
+// L (long word) = 32 bit
+// W (word)      = 16 bit
+// B (byte)      = 8 bit
+
+// copied from ../../x86/reg.go
+var regNames386 = []string{
+	"AX",
+	"CX",
+	"DX",
+	"BX",
+	"SP",
+	"BP",
+	"SI",
+	"DI",
+	"X0",
+	"X1",
+	"X2",
+	"X3",
+	"X4",
+	"X5",
+	"X6",
+	"X7",
+
+	// pseudo-registers
+	"SB",
+	"FLAGS",
+}
+
+func init() {
+	// Make map from reg names to reg integers.
+	if len(regNames386) > 64 {
+		panic("too many registers")
+	}
+	num := map[string]int{}
+	for i, name := range regNames386 {
+		num[name] = i
+	}
+	buildReg := func(s string) regMask {
+		m := regMask(0)
+		for _, r := range strings.Split(s, " ") {
+			if n, ok := num[r]; ok {
+				m |= regMask(1) << uint(n)
+				continue
+			}
+			panic("register " + r + " not found")
+		}
+		return m
+	}
+
+	// Common individual register masks
+	var (
+		ax         = buildReg("AX")
+		cx         = buildReg("CX")
+		dx         = buildReg("DX")
+		gp         = buildReg("AX CX DX BX BP SI DI")
+		fp         = buildReg("X0 X1 X2 X3 X4 X5 X6 X7")
+		x7         = buildReg("X7")
+		gpsp       = gp | buildReg("SP")
+		gpspsb     = gpsp | buildReg("SB")
+		flags      = buildReg("FLAGS")
+		callerSave = gp | fp | flags
+	)
+	// Common slices of register masks
+	var (
+		gponly    = []regMask{gp}
+		fponly    = []regMask{fp}
+		flagsonly = []regMask{flags}
+	)
+
+	// Common regInfo
+	var (
+		gp01      = regInfo{inputs: []regMask{}, outputs: gponly}
+		gp11      = regInfo{inputs: []regMask{gp}, outputs: gponly, clobbers: flags}
+		gp11sp    = regInfo{inputs: []regMask{gpsp}, outputs: gponly, clobbers: flags}
+		gp11nf    = regInfo{inputs: []regMask{gpsp}, outputs: gponly} // nf: no flags clobbered
+		gp11sb    = regInfo{inputs: []regMask{gpspsb}, outputs: gponly}
+		gp21      = regInfo{inputs: []regMask{gp, gp}, outputs: gponly, clobbers: flags}
+		gp21sp    = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly, clobbers: flags}
+		gp21sb    = regInfo{inputs: []regMask{gpspsb, gpsp}, outputs: gponly}
+		gp21shift = regInfo{inputs: []regMask{gp, cx}, outputs: []regMask{gp}, clobbers: flags}
+		gp11div   = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{ax},
+			clobbers: dx | flags}
+		gp11hmul = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx},
+			clobbers: ax | flags}
+		gp11mod = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{dx},
+			clobbers: ax | flags}
+
+		gp2flags = regInfo{inputs: []regMask{gpsp, gpsp}, outputs: flagsonly}
+		gp1flags = regInfo{inputs: []regMask{gpsp}, outputs: flagsonly}
+		flagsgp  = regInfo{inputs: flagsonly, outputs: gponly}
+
+		readflags = regInfo{inputs: flagsonly, outputs: gponly}
+		flagsgpax = regInfo{inputs: flagsonly, clobbers: ax | flags, outputs: []regMask{gp &^ ax}}
+
+		gpload    = regInfo{inputs: []regMask{gpspsb, 0}, outputs: gponly}
+		gploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: gponly}
+
+		gpstore         = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
+		gpstoreconst    = regInfo{inputs: []regMask{gpspsb, 0}}
+		gpstoreidx      = regInfo{inputs: []regMask{gpspsb, gpsp, gpsp, 0}}
+		gpstoreconstidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
+
+		fp01   = regInfo{inputs: []regMask{}, outputs: fponly}
+		fp21   = regInfo{inputs: []regMask{fp, fp}, outputs: fponly}
+		fp21x7 = regInfo{inputs: []regMask{fp &^ x7, fp &^ x7},
+			clobbers: x7, outputs: []regMask{fp &^ x7}}
+		fpgp     = regInfo{inputs: fponly, outputs: gponly}
+		gpfp     = regInfo{inputs: gponly, outputs: fponly}
+		fp11     = regInfo{inputs: fponly, outputs: fponly}
+		fp2flags = regInfo{inputs: []regMask{fp, fp}, outputs: flagsonly}
+
+		fpload    = regInfo{inputs: []regMask{gpspsb, 0}, outputs: fponly}
+		fploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: fponly}
+
+		fpstore    = regInfo{inputs: []regMask{gpspsb, fp, 0}}
+		fpstoreidx = regInfo{inputs: []regMask{gpspsb, gpsp, fp, 0}}
+	)
+
+	var _386ops = []opData{
+		// fp ops
+		{name: "ADDSS", argLength: 2, reg: fp21, asm: "ADDSS", commutative: true, resultInArg0: true}, // fp32 add
+		{name: "ADDSD", argLength: 2, reg: fp21, asm: "ADDSD", commutative: true, resultInArg0: true}, // fp64 add
+		{name: "SUBSS", argLength: 2, reg: fp21x7, asm: "SUBSS", resultInArg0: true},                  // fp32 sub
+		{name: "SUBSD", argLength: 2, reg: fp21x7, asm: "SUBSD", resultInArg0: true},                  // fp64 sub
+		{name: "MULSS", argLength: 2, reg: fp21, asm: "MULSS", commutative: true, resultInArg0: true}, // fp32 mul
+		{name: "MULSD", argLength: 2, reg: fp21, asm: "MULSD", commutative: true, resultInArg0: true}, // fp64 mul
+		{name: "DIVSS", argLength: 2, reg: fp21x7, asm: "DIVSS", resultInArg0: true},                  // fp32 div
+		{name: "DIVSD", argLength: 2, reg: fp21x7, asm: "DIVSD", resultInArg0: true},                  // fp64 div
+
+		{name: "MOVSSload", argLength: 2, reg: fpload, asm: "MOVSS", aux: "SymOff"},            // fp32 load
+		{name: "MOVSDload", argLength: 2, reg: fpload, asm: "MOVSD", aux: "SymOff"},            // fp64 load
+		{name: "MOVSSconst", reg: fp01, asm: "MOVSS", aux: "Float32", rematerializeable: true}, // fp32 constant
+		{name: "MOVSDconst", reg: fp01, asm: "MOVSD", aux: "Float64", rematerializeable: true}, // fp64 constant
+		{name: "MOVSSloadidx1", argLength: 3, reg: fploadidx, asm: "MOVSS", aux: "SymOff"},     // fp32 load indexed by i
+		{name: "MOVSSloadidx4", argLength: 3, reg: fploadidx, asm: "MOVSS", aux: "SymOff"},     // fp32 load indexed by 4*i
+		{name: "MOVSDloadidx1", argLength: 3, reg: fploadidx, asm: "MOVSD", aux: "SymOff"},     // fp64 load indexed by i
+		{name: "MOVSDloadidx8", argLength: 3, reg: fploadidx, asm: "MOVSD", aux: "SymOff"},     // fp64 load indexed by 8*i
+
+		{name: "MOVSSstore", argLength: 3, reg: fpstore, asm: "MOVSS", aux: "SymOff"},        // fp32 store
+		{name: "MOVSDstore", argLength: 3, reg: fpstore, asm: "MOVSD", aux: "SymOff"},        // fp64 store
+		{name: "MOVSSstoreidx1", argLength: 4, reg: fpstoreidx, asm: "MOVSS", aux: "SymOff"}, // fp32 indexed by i store
+		{name: "MOVSSstoreidx4", argLength: 4, reg: fpstoreidx, asm: "MOVSS", aux: "SymOff"}, // fp32 indexed by 4i store
+		{name: "MOVSDstoreidx1", argLength: 4, reg: fpstoreidx, asm: "MOVSD", aux: "SymOff"}, // fp64 indexed by i store
+		{name: "MOVSDstoreidx8", argLength: 4, reg: fpstoreidx, asm: "MOVSD", aux: "SymOff"}, // fp64 indexed by 8i store
+
+		// binary ops
+		{name: "ADDL", argLength: 2, reg: gp21sp, asm: "ADDL", commutative: true},                // arg0 + arg1
+		{name: "ADDLconst", argLength: 1, reg: gp11sp, asm: "ADDL", aux: "Int32", typ: "UInt32"}, // arg0 + auxint
+
+		{name: "SUBL", argLength: 2, reg: gp21, asm: "SUBL", resultInArg0: true},                    // arg0 - arg1
+		{name: "SUBLconst", argLength: 1, reg: gp11, asm: "SUBL", aux: "Int32", resultInArg0: true}, // arg0 - auxint
+
+		{name: "MULL", argLength: 2, reg: gp21, asm: "IMULL", commutative: true, resultInArg0: true}, // arg0 * arg1
+		{name: "MULLconst", argLength: 1, reg: gp11, asm: "IMULL", aux: "Int32", resultInArg0: true}, // arg0 * auxint
+
+		{name: "HMULL", argLength: 2, reg: gp11hmul, asm: "IMULL"}, // (arg0 * arg1) >> width
+		{name: "HMULW", argLength: 2, reg: gp11hmul, asm: "IMULW"}, // (arg0 * arg1) >> width
+		{name: "HMULB", argLength: 2, reg: gp11hmul, asm: "IMULB"}, // (arg0 * arg1) >> width
+		{name: "HMULLU", argLength: 2, reg: gp11hmul, asm: "MULL"}, // (arg0 * arg1) >> width
+		{name: "HMULWU", argLength: 2, reg: gp11hmul, asm: "MULW"}, // (arg0 * arg1) >> width
+		{name: "HMULBU", argLength: 2, reg: gp11hmul, asm: "MULB"}, // (arg0 * arg1) >> width
+
+		{name: "DIVL", argLength: 2, reg: gp11div, asm: "IDIVL"}, // arg0 / arg1
+		{name: "DIVW", argLength: 2, reg: gp11div, asm: "IDIVW"}, // arg0 / arg1
+		{name: "DIVLU", argLength: 2, reg: gp11div, asm: "DIVL"}, // arg0 / arg1
+		{name: "DIVWU", argLength: 2, reg: gp11div, asm: "DIVW"}, // arg0 / arg1
+
+		{name: "MODL", argLength: 2, reg: gp11mod, asm: "IDIVL"}, // arg0 % arg1
+		{name: "MODW", argLength: 2, reg: gp11mod, asm: "IDIVW"}, // arg0 % arg1
+		{name: "MODLU", argLength: 2, reg: gp11mod, asm: "DIVL"}, // arg0 % arg1
+		{name: "MODWU", argLength: 2, reg: gp11mod, asm: "DIVW"}, // arg0 % arg1
+
+		{name: "ANDL", argLength: 2, reg: gp21, asm: "ANDL", commutative: true, resultInArg0: true}, // arg0 & arg1
+		{name: "ANDLconst", argLength: 1, reg: gp11, asm: "ANDL", aux: "Int32", resultInArg0: true}, // arg0 & auxint
+
+		{name: "ORL", argLength: 2, reg: gp21, asm: "ORL", commutative: true, resultInArg0: true}, // arg0 | arg1
+		{name: "ORLconst", argLength: 1, reg: gp11, asm: "ORL", aux: "Int32", resultInArg0: true}, // arg0 | auxint
+
+		{name: "XORL", argLength: 2, reg: gp21, asm: "XORL", commutative: true, resultInArg0: true}, // arg0 ^ arg1
+		{name: "XORLconst", argLength: 1, reg: gp11, asm: "XORL", aux: "Int32", resultInArg0: true}, // arg0 ^ auxint
+
+		{name: "CMPL", argLength: 2, reg: gp2flags, asm: "CMPL", typ: "Flags"},                    // arg0 compare to arg1
+		{name: "CMPW", argLength: 2, reg: gp2flags, asm: "CMPW", typ: "Flags"},                    // arg0 compare to arg1
+		{name: "CMPB", argLength: 2, reg: gp2flags, asm: "CMPB", typ: "Flags"},                    // arg0 compare to arg1
+		{name: "CMPLconst", argLength: 1, reg: gp1flags, asm: "CMPL", typ: "Flags", aux: "Int32"}, // arg0 compare to auxint
+		{name: "CMPWconst", argLength: 1, reg: gp1flags, asm: "CMPW", typ: "Flags", aux: "Int16"}, // arg0 compare to auxint
+		{name: "CMPBconst", argLength: 1, reg: gp1flags, asm: "CMPB", typ: "Flags", aux: "Int8"},  // arg0 compare to auxint
+
+		{name: "UCOMISS", argLength: 2, reg: fp2flags, asm: "UCOMISS", typ: "Flags"}, // arg0 compare to arg1, f32
+		{name: "UCOMISD", argLength: 2, reg: fp2flags, asm: "UCOMISD", typ: "Flags"}, // arg0 compare to arg1, f64
+
+		{name: "TESTL", argLength: 2, reg: gp2flags, asm: "TESTL", typ: "Flags"},                    // (arg0 & arg1) compare to 0
+		{name: "TESTW", argLength: 2, reg: gp2flags, asm: "TESTW", typ: "Flags"},                    // (arg0 & arg1) compare to 0
+		{name: "TESTB", argLength: 2, reg: gp2flags, asm: "TESTB", typ: "Flags"},                    // (arg0 & arg1) compare to 0
+		{name: "TESTLconst", argLength: 1, reg: gp1flags, asm: "TESTL", typ: "Flags", aux: "Int32"}, // (arg0 & auxint) compare to 0
+		{name: "TESTWconst", argLength: 1, reg: gp1flags, asm: "TESTW", typ: "Flags", aux: "Int16"}, // (arg0 & auxint) compare to 0
+		{name: "TESTBconst", argLength: 1, reg: gp1flags, asm: "TESTB", typ: "Flags", aux: "Int8"},  // (arg0 & auxint) compare to 0
+
+		{name: "SHLL", argLength: 2, reg: gp21shift, asm: "SHLL", resultInArg0: true},               // arg0 << arg1, shift amount is mod 32
+		{name: "SHLLconst", argLength: 1, reg: gp11, asm: "SHLL", aux: "Int32", resultInArg0: true}, // arg0 << auxint, shift amount 0-31
+		// Note: x86 is weird, the 16 and 8 byte shifts still use all 5 bits of shift amount!
+
+		{name: "SHRL", argLength: 2, reg: gp21shift, asm: "SHRL", resultInArg0: true},               // unsigned arg0 >> arg1, shift amount is mod 32
+		{name: "SHRW", argLength: 2, reg: gp21shift, asm: "SHRW", resultInArg0: true},               // unsigned arg0 >> arg1, shift amount is mod 32
+		{name: "SHRB", argLength: 2, reg: gp21shift, asm: "SHRB", resultInArg0: true},               // unsigned arg0 >> arg1, shift amount is mod 32
+		{name: "SHRLconst", argLength: 1, reg: gp11, asm: "SHRL", aux: "Int32", resultInArg0: true}, // unsigned arg0 >> auxint, shift amount 0-31
+		{name: "SHRWconst", argLength: 1, reg: gp11, asm: "SHRW", aux: "Int16", resultInArg0: true}, // unsigned arg0 >> auxint, shift amount 0-31
+		{name: "SHRBconst", argLength: 1, reg: gp11, asm: "SHRB", aux: "Int8", resultInArg0: true},  // unsigned arg0 >> auxint, shift amount 0-31
+
+		{name: "SARL", argLength: 2, reg: gp21shift, asm: "SARL", resultInArg0: true},               // signed arg0 >> arg1, shift amount is mod 32
+		{name: "SARW", argLength: 2, reg: gp21shift, asm: "SARW", resultInArg0: true},               // signed arg0 >> arg1, shift amount is mod 32
+		{name: "SARB", argLength: 2, reg: gp21shift, asm: "SARB", resultInArg0: true},               // signed arg0 >> arg1, shift amount is mod 32
+		{name: "SARLconst", argLength: 1, reg: gp11, asm: "SARL", aux: "Int32", resultInArg0: true}, // signed arg0 >> auxint, shift amount 0-31
+		{name: "SARWconst", argLength: 1, reg: gp11, asm: "SARW", aux: "Int16", resultInArg0: true}, // signed arg0 >> auxint, shift amount 0-31
+		{name: "SARBconst", argLength: 1, reg: gp11, asm: "SARB", aux: "Int8", resultInArg0: true},  // signed arg0 >> auxint, shift amount 0-31
+
+		{name: "ROLLconst", argLength: 1, reg: gp11, asm: "ROLL", aux: "Int32", resultInArg0: true}, // arg0 rotate left auxint, rotate amount 0-31
+		{name: "ROLWconst", argLength: 1, reg: gp11, asm: "ROLW", aux: "Int16", resultInArg0: true}, // arg0 rotate left auxint, rotate amount 0-15
+		{name: "ROLBconst", argLength: 1, reg: gp11, asm: "ROLB", aux: "Int8", resultInArg0: true},  // arg0 rotate left auxint, rotate amount 0-7
+
+		// unary ops
+		{name: "NEGL", argLength: 1, reg: gp11, asm: "NEGL", resultInArg0: true}, // -arg0
+
+		{name: "NOTL", argLength: 1, reg: gp11, asm: "NOTL", resultInArg0: true}, // ^arg0
+
+		{name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL"}, // arg0 # of low-order zeroes ; undef if zero
+		{name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW"}, // arg0 # of low-order zeroes ; undef if zero
+
+		{name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL"}, // arg0 # of high-order zeroes ; undef if zero
+		{name: "BSRW", argLength: 1, reg: gp11, asm: "BSRW"}, // arg0 # of high-order zeroes ; undef if zero
+
+		{name: "BSWAPL", argLength: 1, reg: gp11, asm: "BSWAPL", resultInArg0: true}, // arg0 swap bytes
+
+		{name: "SQRTSD", argLength: 1, reg: fp11, asm: "SQRTSD"}, // sqrt(arg0)
+
+		{name: "SBBLcarrymask", argLength: 1, reg: flagsgp, asm: "SBBL"}, // (int32)(-1) if carry is set, 0 if carry is clear.
+		// Note: SBBW and SBBB are subsumed by SBBL
+
+		{name: "SETEQ", argLength: 1, reg: readflags, asm: "SETEQ"}, // extract == condition from arg0
+		{name: "SETNE", argLength: 1, reg: readflags, asm: "SETNE"}, // extract != condition from arg0
+		{name: "SETL", argLength: 1, reg: readflags, asm: "SETLT"},  // extract signed < condition from arg0
+		{name: "SETLE", argLength: 1, reg: readflags, asm: "SETLE"}, // extract signed <= condition from arg0
+		{name: "SETG", argLength: 1, reg: readflags, asm: "SETGT"},  // extract signed > condition from arg0
+		{name: "SETGE", argLength: 1, reg: readflags, asm: "SETGE"}, // extract signed >= condition from arg0
+		{name: "SETB", argLength: 1, reg: readflags, asm: "SETCS"},  // extract unsigned < condition from arg0
+		{name: "SETBE", argLength: 1, reg: readflags, asm: "SETLS"}, // extract unsigned <= condition from arg0
+		{name: "SETA", argLength: 1, reg: readflags, asm: "SETHI"},  // extract unsigned > condition from arg0
+		{name: "SETAE", argLength: 1, reg: readflags, asm: "SETCC"}, // extract unsigned >= condition from arg0
+		// Need different opcodes for floating point conditions because
+		// any comparison involving a NaN is always FALSE and thus
+		// the patterns for inverting conditions cannot be used.
+		{name: "SETEQF", argLength: 1, reg: flagsgpax, asm: "SETEQ"}, // extract == condition from arg0
+		{name: "SETNEF", argLength: 1, reg: flagsgpax, asm: "SETNE"}, // extract != condition from arg0
+		{name: "SETORD", argLength: 1, reg: flagsgp, asm: "SETPC"},   // extract "ordered" (No Nan present) condition from arg0
+		{name: "SETNAN", argLength: 1, reg: flagsgp, asm: "SETPS"},   // extract "unordered" (Nan present) condition from arg0
+
+		{name: "SETGF", argLength: 1, reg: flagsgp, asm: "SETHI"},  // extract floating > condition from arg0
+		{name: "SETGEF", argLength: 1, reg: flagsgp, asm: "SETCC"}, // extract floating >= condition from arg0
+
+		{name: "MOVBLSX", argLength: 1, reg: gp11nf, asm: "MOVBLSX"}, // sign extend arg0 from int8 to int32
+		{name: "MOVBLZX", argLength: 1, reg: gp11nf, asm: "MOVBLZX"}, // zero extend arg0 from int8 to int32
+		{name: "MOVWLSX", argLength: 1, reg: gp11nf, asm: "MOVWLSX"}, // sign extend arg0 from int16 to int32
+		{name: "MOVWLZX", argLength: 1, reg: gp11nf, asm: "MOVWLZX"}, // zero extend arg0 from int16 to int32
+
+		{name: "MOVLconst", reg: gp01, asm: "MOVL", typ: "UInt32", aux: "Int32", rematerializeable: true}, // 32 low bits of auxint
+
+		{name: "CVTTSD2SL", argLength: 1, reg: fpgp, asm: "CVTTSD2SL"}, // convert float64 to int32
+		{name: "CVTTSS2SL", argLength: 1, reg: fpgp, asm: "CVTTSS2SL"}, // convert float32 to int32
+		{name: "CVTSL2SS", argLength: 1, reg: gpfp, asm: "CVTSL2SS"},   // convert int32 to float32
+		{name: "CVTSL2SD", argLength: 1, reg: gpfp, asm: "CVTSL2SD"},   // convert int32 to float64
+		{name: "CVTSD2SS", argLength: 1, reg: fp11, asm: "CVTSD2SS"},   // convert float64 to float32
+		{name: "CVTSS2SD", argLength: 1, reg: fp11, asm: "CVTSS2SD"},   // convert float32 to float64
+
+		{name: "PXOR", argLength: 2, reg: fp21, asm: "PXOR", commutative: true, resultInArg0: true}, // exclusive or, applied to X regs for float negation.
+
+		{name: "LEAL", argLength: 1, reg: gp11sb, aux: "SymOff", rematerializeable: true}, // arg0 + auxint + offset encoded in aux
+		{name: "LEAL1", argLength: 2, reg: gp21sb, aux: "SymOff"},                         // arg0 + arg1 + auxint + aux
+		{name: "LEAL2", argLength: 2, reg: gp21sb, aux: "SymOff"},                         // arg0 + 2*arg1 + auxint + aux
+		{name: "LEAL4", argLength: 2, reg: gp21sb, aux: "SymOff"},                         // arg0 + 4*arg1 + auxint + aux
+		{name: "LEAL8", argLength: 2, reg: gp21sb, aux: "SymOff"},                         // arg0 + 8*arg1 + auxint + aux
+		// Note: LEAL{1,2,4,8} must not have OpSB as either argument.
+
+		// auxint+aux == add auxint and the offset of the symbol in aux (if any) to the effective address
+		{name: "MOVBload", argLength: 2, reg: gpload, asm: "MOVBLZX", aux: "SymOff", typ: "UInt8"},  // load byte from arg0+auxint+aux. arg1=mem.  Zero extend.
+		{name: "MOVBLSXload", argLength: 2, reg: gpload, asm: "MOVBLSX", aux: "SymOff"},             // ditto, sign extend to int32
+		{name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVWLZX", aux: "SymOff", typ: "UInt16"}, // load 2 bytes from arg0+auxint+aux. arg1=mem.  Zero extend.
+		{name: "MOVWLSXload", argLength: 2, reg: gpload, asm: "MOVWLSX", aux: "SymOff"},             // ditto, sign extend to int32
+		{name: "MOVLload", argLength: 2, reg: gpload, asm: "MOVL", aux: "SymOff", typ: "UInt32"},    // load 4 bytes from arg0+auxint+aux. arg1=mem.  Zero extend.
+		{name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem"},     // store byte in arg1 to arg0+auxint+aux. arg2=mem
+		{name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem"},     // store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem
+		{name: "MOVLstore", argLength: 3, reg: gpstore, asm: "MOVL", aux: "SymOff", typ: "Mem"},     // store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem
+		{name: "MOVOload", argLength: 2, reg: fpload, asm: "MOVUPS", aux: "SymOff", typ: "Int128"},  // load 16 bytes from arg0+auxint+aux. arg1=mem
+		{name: "MOVOstore", argLength: 3, reg: fpstore, asm: "MOVUPS", aux: "SymOff", typ: "Mem"},   // store 16 bytes in arg1 to arg0+auxint+aux. arg2=mem
+
+		// indexed loads/stores
+		{name: "MOVBloadidx1", argLength: 3, reg: gploadidx, asm: "MOVBLZX", aux: "SymOff"}, // load a byte from arg0+arg1+auxint+aux. arg2=mem
+		{name: "MOVWloadidx1", argLength: 3, reg: gploadidx, asm: "MOVWLZX", aux: "SymOff"}, // load 2 bytes from arg0+arg1+auxint+aux. arg2=mem
+		{name: "MOVWloadidx2", argLength: 3, reg: gploadidx, asm: "MOVWLZX", aux: "SymOff"}, // load 2 bytes from arg0+2*arg1+auxint+aux. arg2=mem
+		{name: "MOVLloadidx1", argLength: 3, reg: gploadidx, asm: "MOVL", aux: "SymOff"},    // load 4 bytes from arg0+arg1+auxint+aux. arg2=mem
+		{name: "MOVLloadidx4", argLength: 3, reg: gploadidx, asm: "MOVL", aux: "SymOff"},    // load 4 bytes from arg0+4*arg1+auxint+aux. arg2=mem
+		// TODO: sign-extending indexed loads
+		{name: "MOVBstoreidx1", argLength: 4, reg: gpstoreidx, asm: "MOVB", aux: "SymOff"}, // store byte in arg2 to arg0+arg1+auxint+aux. arg3=mem
+		{name: "MOVWstoreidx1", argLength: 4, reg: gpstoreidx, asm: "MOVW", aux: "SymOff"}, // store 2 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+		{name: "MOVWstoreidx2", argLength: 4, reg: gpstoreidx, asm: "MOVW", aux: "SymOff"}, // store 2 bytes in arg2 to arg0+2*arg1+auxint+aux. arg3=mem
+		{name: "MOVLstoreidx1", argLength: 4, reg: gpstoreidx, asm: "MOVL", aux: "SymOff"}, // store 4 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
+		{name: "MOVLstoreidx4", argLength: 4, reg: gpstoreidx, asm: "MOVL", aux: "SymOff"}, // store 4 bytes in arg2 to arg0+4*arg1+auxint+aux. arg3=mem
+		// TODO: add size-mismatched indexed loads, like MOVBstoreidx4.
+
+		// For storeconst ops, the AuxInt field encodes both
+		// the value to store and an address offset of the store.
+		// Cast AuxInt to a ValAndOff to extract Val and Off fields.
+		{name: "MOVBstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVB", aux: "SymValAndOff", typ: "Mem"}, // store low byte of ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux.  arg1=mem
+		{name: "MOVWstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVW", aux: "SymValAndOff", typ: "Mem"}, // store low 2 bytes of ...
+		{name: "MOVLstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVL", aux: "SymValAndOff", typ: "Mem"}, // store low 4 bytes of ...
+
+		{name: "MOVBstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVB", aux: "SymValAndOff", typ: "Mem"}, // store low byte of ValAndOff(AuxInt).Val() to arg0+1*arg1+ValAndOff(AuxInt).Off()+aux.  arg2=mem
+		{name: "MOVWstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVW", aux: "SymValAndOff", typ: "Mem"}, // store low 2 bytes of ... arg1 ...
+		{name: "MOVWstoreconstidx2", argLength: 3, reg: gpstoreconstidx, asm: "MOVW", aux: "SymValAndOff", typ: "Mem"}, // store low 2 bytes of ... 2*arg1 ...
+		{name: "MOVLstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem"}, // store low 4 bytes of ... arg1 ...
+		{name: "MOVLstoreconstidx4", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem"}, // store low 4 bytes of ... 4*arg1 ...
+
+		// arg0 = (duff-adjusted) pointer to start of memory to zero
+		// arg1 = value to store (will always be zero)
+		// arg2 = mem
+		// auxint = offset into duffzero code to start executing
+		// returns mem
+		{
+			name:      "DUFFZERO",
+			aux:       "Int64",
+			argLength: 3,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("DI"), buildReg("X0")},
+				clobbers: buildReg("DI FLAGS"),
+			},
+		},
+		{name: "MOVOconst", reg: regInfo{nil, 0, []regMask{fp}}, typ: "Int128", aux: "Int128", rematerializeable: true},
+
+		// arg0 = address of memory to zero
+		// arg1 = # of 4-byte words to zero
+		// arg2 = value to store (will always be zero)
+		// arg3 = mem
+		// returns mem
+		{
+			name:      "REPSTOSL",
+			argLength: 4,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("DI"), buildReg("CX"), buildReg("AX")},
+				clobbers: buildReg("DI CX"),
+			},
+		},
+
+		{name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "SymOff"},                                // call static function aux.(*gc.Sym).  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLclosure", argLength: 3, reg: regInfo{[]regMask{gpsp, buildReg("DX"), 0}, callerSave, nil}, aux: "Int64"}, // call function via closure.  arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
+		{name: "CALLdefer", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "Int64"},                                  // call deferproc.  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLgo", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "Int64"},                                     // call newproc.  arg0=mem, auxint=argsize, returns mem
+		{name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "Int64"},           // call fn by pointer.  arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+
+		// arg0 = destination pointer
+		// arg1 = source pointer
+		// arg2 = mem
+		// auxint = offset from duffcopy symbol to call
+		// returns memory
+		{
+			name:      "DUFFCOPY",
+			aux:       "Int64",
+			argLength: 3,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("DI"), buildReg("SI")},
+				clobbers: buildReg("DI SI X0 FLAGS"), // uses X0 as a temporary
+			},
+		},
+
+		// arg0 = destination pointer
+		// arg1 = source pointer
+		// arg2 = # of 8-byte words to copy
+		// arg3 = mem
+		// returns memory
+		{
+			name:      "REPMOVSL",
+			argLength: 4,
+			reg: regInfo{
+				inputs:   []regMask{buildReg("DI"), buildReg("SI"), buildReg("CX")},
+				clobbers: buildReg("DI SI CX"),
+			},
+		},
+
+		// (InvertFlags (CMPL a b)) == (CMPL b a)
+		// So if we want (SETL (CMPL a b)) but we can't do that because a is a constant,
+		// then we do (SETL (InvertFlags (CMPL b a))) instead.
+		// Rewrites will convert this to (SETG (CMPL b a)).
+		// InvertFlags is a pseudo-op which can't appear in assembly output.
+		{name: "InvertFlags", argLength: 1}, // reverse direction of arg0
+
+		// Pseudo-ops
+		{name: "LoweredGetG", argLength: 1, reg: gp01}, // arg0=mem
+		// Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
+		// and sorts it to the very beginning of the block to prevent other
+		// use of DX (the closure pointer)
+		{name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("DX")}}},
+		//arg0=ptr,arg1=mem, returns void.  Faults if ptr is nil.
+		{name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpsp}, clobbers: flags}},
+
+		// MOVLconvert converts between pointers and integers.
+		// We have a special op for this so as to not confuse GC
+		// (particularly stack maps).  It takes a memory arg so it
+		// gets correctly ordered with respect to GC safepoints.
+		// arg0=ptr/int arg1=mem, output=int/ptr
+		{name: "MOVLconvert", argLength: 2, reg: gp11nf, asm: "MOVL"},
+
+		// Constant flag values. For any comparison, there are 5 possible
+		// outcomes: the three from the signed total order (<,==,>) and the
+		// three from the unsigned total order. The == cases overlap.
+		// Note: there's a sixth "unordered" outcome for floating-point
+		// comparisons, but we don't use such a beast yet.
+		// These ops are for temporary use by rewrite rules. They
+		// cannot appear in the generated assembly.
+		{name: "FlagEQ"},     // equal
+		{name: "FlagLT_ULT"}, // signed < and unsigned <
+		{name: "FlagLT_UGT"}, // signed < and unsigned >
+		{name: "FlagGT_UGT"}, // signed > and unsigned <
+		{name: "FlagGT_ULT"}, // signed > and unsigned >
+	}
+
+	var _386blocks = []blockData{
+		{name: "EQ"},
+		{name: "NE"},
+		{name: "LT"},
+		{name: "LE"},
+		{name: "GT"},
+		{name: "GE"},
+		{name: "ULT"},
+		{name: "ULE"},
+		{name: "UGT"},
+		{name: "UGE"},
+		{name: "EQF"},
+		{name: "NEF"},
+		{name: "ORD"}, // FP, ordered comparison (parity zero)
+		{name: "NAN"}, // FP, unordered comparison (parity one)
+	}
+
+	archs = append(archs, arch{
+		name:            "386",
+		pkg:             "cmd/internal/obj/x86",
+		genfile:         "../../x86/ssa.go",
+		ops:             _386ops,
+		blocks:          _386blocks,
+		regnames:        regNames386,
+		gpregmask:       gp,
+		fpregmask:       fp,
+		flagmask:        flags,
+		framepointerreg: int8(num["BP"]),
+	})
+}
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
--- a/src/cmd/compile/internal/ssa/rewrite386.go
+++ b/src/cmd/compile/internal/ssa/rewrite386.go
--- a/src/cmd/compile/internal/x86/galign.go
+++ b/src/cmd/compile/internal/x86/galign.go
@ -77,6 +77,11 @@ func Main() {
 	gc.Thearch.Doregbits = doregbits
 	gc.Thearch.Regnames = regnames

+	gc.Thearch.SSARegToReg = ssaRegToReg
+	gc.Thearch.SSAMarkMoves = ssaMarkMoves
+	gc.Thearch.SSAGenValue = ssaGenValue
+	gc.Thearch.SSAGenBlock = ssaGenBlock
+
 	gc.Main()
 	gc.Exit(0)
 }
--- a/src/cmd/compile/internal/x86/ssa.go
+++ b/src/cmd/compile/internal/x86/ssa.go
@ -0,0 +1,959 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package x86
+
+import (
+	"fmt"
+	"math"
+
+	"cmd/compile/internal/gc"
+	"cmd/compile/internal/ssa"
+	"cmd/internal/obj"
+	"cmd/internal/obj/x86"
+)
+
+// Smallest possible faulting page at address zero.
+const minZeroPage = 4096
+
+// ssaRegToReg maps ssa register numbers to obj register numbers.
+var ssaRegToReg = []int16{
+	x86.REG_AX,
+	x86.REG_CX,
+	x86.REG_DX,
+	x86.REG_BX,
+	x86.REG_SP,
+	x86.REG_BP,
+	x86.REG_SI,
+	x86.REG_DI,
+	x86.REG_X0,
+	x86.REG_X1,
+	x86.REG_X2,
+	x86.REG_X3,
+	x86.REG_X4,
+	x86.REG_X5,
+	x86.REG_X6,
+	x86.REG_X7,
+	0, // SB isn't a real register.  We fill an Addr.Reg field with 0 in this case.
+}
+
+// markMoves marks any MOVXconst ops that need to avoid clobbering flags.
+func ssaMarkMoves(s *gc.SSAGenState, b *ssa.Block) {
+	flive := b.FlagsLiveAtEnd
+	if b.Control != nil && b.Control.Type.IsFlags() {
+		flive = true
+	}
+	for i := len(b.Values) - 1; i >= 0; i-- {
+		v := b.Values[i]
+		if flive && v.Op == ssa.Op386MOVLconst {
+			// The "mark" is any non-nil Aux value.
+			v.Aux = v
+		}
+		if v.Type.IsFlags() {
+			flive = false
+		}
+		for _, a := range v.Args {
+			if a.Type.IsFlags() {
+				flive = true
+			}
+		}
+	}
+}
+
+// loadByType returns the load instruction of the given type.
+func loadByType(t ssa.Type) obj.As {
+	// Avoid partial register write
+	if !t.IsFloat() && t.Size() <= 2 {
+		if t.Size() == 1 {
+			return x86.AMOVBLZX
+		} else {
+			return x86.AMOVWLZX
+		}
+	}
+	// Otherwise, there's no difference between load and store opcodes.
+	return storeByType(t)
+}
+
+// storeByType returns the store instruction of the given type.
+func storeByType(t ssa.Type) obj.As {
+	width := t.Size()
+	if t.IsFloat() {
+		switch width {
+		case 4:
+			return x86.AMOVSS
+		case 8:
+			return x86.AMOVSD
+		}
+	} else {
+		switch width {
+		case 1:
+			return x86.AMOVB
+		case 2:
+			return x86.AMOVW
+		case 4:
+			return x86.AMOVL
+		}
+	}
+	panic("bad store type")
+}
+
+// moveByType returns the reg->reg move instruction of the given type.
+func moveByType(t ssa.Type) obj.As {
+	if t.IsFloat() {
+		// Moving the whole sse2 register is faster
+		// than moving just the correct low portion of it.
+		// There is no xmm->xmm move with 1 byte opcode,
+		// so use movups, which has 2 byte opcode.
+		return x86.AMOVUPS
+	} else {
+		switch t.Size() {
+		case 1:
+			// Avoids partial register write
+			return x86.AMOVL
+		case 2:
+			return x86.AMOVL
+		case 4:
+			return x86.AMOVL
+		case 16:
+			return x86.AMOVUPS // int128s are in SSE registers
+		default:
+			panic(fmt.Sprintf("bad int register width %d:%s", t.Size(), t))
+		}
+	}
+}
+
+// opregreg emits instructions for
+//     dest := dest(To) op src(From)
+// and also returns the created obj.Prog so it
+// may be further adjusted (offset, scale, etc).
+func opregreg(op obj.As, dest, src int16) *obj.Prog {
+	p := gc.Prog(op)
+	p.From.Type = obj.TYPE_REG
+	p.To.Type = obj.TYPE_REG
+	p.To.Reg = dest
+	p.From.Reg = src
+	return p
+}
+
+func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
+	s.SetLineno(v.Line)
+	switch v.Op {
+	case ssa.Op386ADDL:
+		r := gc.SSARegNum(v)
+		r1 := gc.SSARegNum(v.Args[0])
+		r2 := gc.SSARegNum(v.Args[1])
+		switch {
+		case r == r1:
+			p := gc.Prog(v.Op.Asm())
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = r2
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = r
+		case r == r2:
+			p := gc.Prog(v.Op.Asm())
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = r1
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = r
+		default:
+			p := gc.Prog(x86.ALEAL)
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = r1
+			p.From.Scale = 1
+			p.From.Index = r2
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = r
+		}
+	// 2-address opcode arithmetic
+	case ssa.Op386SUBL,
+		ssa.Op386MULL,
+		ssa.Op386ANDL,
+		ssa.Op386ORL,
+		ssa.Op386XORL,
+		ssa.Op386SHLL,
+		ssa.Op386SHRL, ssa.Op386SHRW, ssa.Op386SHRB,
+		ssa.Op386SARL, ssa.Op386SARW, ssa.Op386SARB,
+		ssa.Op386ADDSS, ssa.Op386ADDSD, ssa.Op386SUBSS, ssa.Op386SUBSD,
+		ssa.Op386MULSS, ssa.Op386MULSD, ssa.Op386DIVSS, ssa.Op386DIVSD,
+		ssa.Op386PXOR:
+		r := gc.SSARegNum(v)
+		if r != gc.SSARegNum(v.Args[0]) {
+			v.Fatalf("input[0] and output not in same register %s", v.LongString())
+		}
+		opregreg(v.Op.Asm(), r, gc.SSARegNum(v.Args[1]))
+
+	case ssa.Op386DIVL, ssa.Op386DIVW,
+		ssa.Op386DIVLU, ssa.Op386DIVWU,
+		ssa.Op386MODL, ssa.Op386MODW,
+		ssa.Op386MODLU, ssa.Op386MODWU:
+
+		// Arg[0] is already in AX as it's the only register we allow
+		// and AX is the only output
+		x := gc.SSARegNum(v.Args[1])
+
+		// CPU faults upon signed overflow, which occurs when most
+		// negative int is divided by -1.
+		var j *obj.Prog
+		if v.Op == ssa.Op386DIVL || v.Op == ssa.Op386DIVW ||
+			v.Op == ssa.Op386MODL || v.Op == ssa.Op386MODW {
+
+			var c *obj.Prog
+			switch v.Op {
+			case ssa.Op386DIVL, ssa.Op386MODL:
+				c = gc.Prog(x86.ACMPL)
+				j = gc.Prog(x86.AJEQ)
+				gc.Prog(x86.ACDQ) //TODO: fix
+
+			case ssa.Op386DIVW, ssa.Op386MODW:
+				c = gc.Prog(x86.ACMPW)
+				j = gc.Prog(x86.AJEQ)
+				gc.Prog(x86.ACWD)
+			}
+			c.From.Type = obj.TYPE_REG
+			c.From.Reg = x
+			c.To.Type = obj.TYPE_CONST
+			c.To.Offset = -1
+
+			j.To.Type = obj.TYPE_BRANCH
+		}
+
+		// for unsigned ints, we sign extend by setting DX = 0
+		// signed ints were sign extended above
+		if v.Op == ssa.Op386DIVLU || v.Op == ssa.Op386MODLU ||
+			v.Op == ssa.Op386DIVWU || v.Op == ssa.Op386MODWU {
+			c := gc.Prog(x86.AXORL)
+			c.From.Type = obj.TYPE_REG
+			c.From.Reg = x86.REG_DX
+			c.To.Type = obj.TYPE_REG
+			c.To.Reg = x86.REG_DX
+		}
+
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = x
+
+		// signed division, rest of the check for -1 case
+		if j != nil {
+			j2 := gc.Prog(obj.AJMP)
+			j2.To.Type = obj.TYPE_BRANCH
+
+			var n *obj.Prog
+			if v.Op == ssa.Op386DIVL || v.Op == ssa.Op386DIVW {
+				// n * -1 = -n
+				n = gc.Prog(x86.ANEGL)
+				n.To.Type = obj.TYPE_REG
+				n.To.Reg = x86.REG_AX
+			} else {
+				// n % -1 == 0
+				n = gc.Prog(x86.AXORL)
+				n.From.Type = obj.TYPE_REG
+				n.From.Reg = x86.REG_DX
+				n.To.Type = obj.TYPE_REG
+				n.To.Reg = x86.REG_DX
+			}
+
+			j.To.Val = n
+			j2.To.Val = s.Pc()
+		}
+
+	case ssa.Op386HMULL, ssa.Op386HMULW, ssa.Op386HMULB,
+		ssa.Op386HMULLU, ssa.Op386HMULWU, ssa.Op386HMULBU:
+		// the frontend rewrites constant division by 8/16/32 bit integers into
+		// HMUL by a constant
+		// SSA rewrites generate the 64 bit versions
+
+		// Arg[0] is already in AX as it's the only register we allow
+		// and DX is the only output we care about (the high bits)
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = gc.SSARegNum(v.Args[1])
+
+		// IMULB puts the high portion in AH instead of DL,
+		// so move it to DL for consistency
+		if v.Type.Size() == 1 {
+			m := gc.Prog(x86.AMOVB)
+			m.From.Type = obj.TYPE_REG
+			m.From.Reg = x86.REG_AH
+			m.To.Type = obj.TYPE_REG
+			m.To.Reg = x86.REG_DX
+		}
+
+	case ssa.Op386ADDLconst:
+		r := gc.SSARegNum(v)
+		a := gc.SSARegNum(v.Args[0])
+		if r == a {
+			if v.AuxInt == 1 {
+				p := gc.Prog(x86.AINCL)
+				p.To.Type = obj.TYPE_REG
+				p.To.Reg = r
+				return
+			}
+			if v.AuxInt == -1 {
+				p := gc.Prog(x86.ADECL)
+				p.To.Type = obj.TYPE_REG
+				p.To.Reg = r
+				return
+			}
+			p := gc.Prog(v.Op.Asm())
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = v.AuxInt
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = r
+			return
+		}
+		p := gc.Prog(x86.ALEAL)
+		p.From.Type = obj.TYPE_MEM
+		p.From.Reg = a
+		p.From.Offset = v.AuxInt
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = r
+
+	case ssa.Op386MULLconst:
+		r := gc.SSARegNum(v)
+		if r != gc.SSARegNum(v.Args[0]) {
+			v.Fatalf("input[0] and output not in same register %s", v.LongString())
+		}
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = v.AuxInt
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = r
+		// TODO: Teach doasm to compile the three-address multiply imul $c, r1, r2
+		// then we don't need to use resultInArg0 for these ops.
+		//p.From3 = new(obj.Addr)
+		//p.From3.Type = obj.TYPE_REG
+		//p.From3.Reg = gc.SSARegNum(v.Args[0])
+
+	case ssa.Op386SUBLconst,
+		ssa.Op386ANDLconst,
+		ssa.Op386ORLconst,
+		ssa.Op386XORLconst,
+		ssa.Op386SHLLconst,
+		ssa.Op386SHRLconst, ssa.Op386SHRWconst, ssa.Op386SHRBconst,
+		ssa.Op386SARLconst, ssa.Op386SARWconst, ssa.Op386SARBconst,
+		ssa.Op386ROLLconst, ssa.Op386ROLWconst, ssa.Op386ROLBconst:
+		r := gc.SSARegNum(v)
+		if r != gc.SSARegNum(v.Args[0]) {
+			v.Fatalf("input[0] and output not in same register %s", v.LongString())
+		}
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = v.AuxInt
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = r
+	case ssa.Op386SBBLcarrymask:
+		r := gc.SSARegNum(v)
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = r
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = r
+	case ssa.Op386LEAL1, ssa.Op386LEAL2, ssa.Op386LEAL4, ssa.Op386LEAL8:
+		r := gc.SSARegNum(v.Args[0])
+		i := gc.SSARegNum(v.Args[1])
+		p := gc.Prog(x86.ALEAL)
+		switch v.Op {
+		case ssa.Op386LEAL1:
+			p.From.Scale = 1
+			if i == x86.REG_SP {
+				r, i = i, r
+			}
+		case ssa.Op386LEAL2:
+			p.From.Scale = 2
+		case ssa.Op386LEAL4:
+			p.From.Scale = 4
+		case ssa.Op386LEAL8:
+			p.From.Scale = 8
+		}
+		p.From.Type = obj.TYPE_MEM
+		p.From.Reg = r
+		p.From.Index = i
+		gc.AddAux(&p.From, v)
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.Op386LEAL:
+		p := gc.Prog(x86.ALEAL)
+		p.From.Type = obj.TYPE_MEM
+		p.From.Reg = gc.SSARegNum(v.Args[0])
+		gc.AddAux(&p.From, v)
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.Op386CMPL, ssa.Op386CMPW, ssa.Op386CMPB,
+		ssa.Op386TESTL, ssa.Op386TESTW, ssa.Op386TESTB:
+		opregreg(v.Op.Asm(), gc.SSARegNum(v.Args[1]), gc.SSARegNum(v.Args[0]))
+	case ssa.Op386UCOMISS, ssa.Op386UCOMISD:
+		// Go assembler has swapped operands for UCOMISx relative to CMP,
+		// must account for that right here.
+		opregreg(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]))
+	case ssa.Op386CMPLconst, ssa.Op386CMPWconst, ssa.Op386CMPBconst:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = gc.SSARegNum(v.Args[0])
+		p.To.Type = obj.TYPE_CONST
+		p.To.Offset = v.AuxInt
+	case ssa.Op386TESTLconst, ssa.Op386TESTWconst, ssa.Op386TESTBconst:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = v.AuxInt
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v.Args[0])
+	case ssa.Op386MOVLconst:
+		x := gc.SSARegNum(v)
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = v.AuxInt
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = x
+		// If flags are live at this instruction, suppress the
+		// MOV $0,AX -> XOR AX,AX optimization.
+		if v.Aux != nil {
+			p.Mark |= x86.PRESERVEFLAGS
+		}
+	case ssa.Op386MOVSSconst, ssa.Op386MOVSDconst:
+		x := gc.SSARegNum(v)
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_FCONST
+		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = x
+	case ssa.Op386MOVSSload, ssa.Op386MOVSDload, ssa.Op386MOVLload, ssa.Op386MOVWload, ssa.Op386MOVBload, ssa.Op386MOVBLSXload, ssa.Op386MOVWLSXload, ssa.Op386MOVOload:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_MEM
+		p.From.Reg = gc.SSARegNum(v.Args[0])
+		gc.AddAux(&p.From, v)
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.Op386MOVSDloadidx8:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_MEM
+		p.From.Reg = gc.SSARegNum(v.Args[0])
+		gc.AddAux(&p.From, v)
+		p.From.Scale = 8
+		p.From.Index = gc.SSARegNum(v.Args[1])
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.Op386MOVLloadidx4, ssa.Op386MOVSSloadidx4:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_MEM
+		p.From.Reg = gc.SSARegNum(v.Args[0])
+		gc.AddAux(&p.From, v)
+		p.From.Scale = 4
+		p.From.Index = gc.SSARegNum(v.Args[1])
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.Op386MOVWloadidx2:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_MEM
+		p.From.Reg = gc.SSARegNum(v.Args[0])
+		gc.AddAux(&p.From, v)
+		p.From.Scale = 2
+		p.From.Index = gc.SSARegNum(v.Args[1])
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.Op386MOVBloadidx1, ssa.Op386MOVWloadidx1, ssa.Op386MOVLloadidx1, ssa.Op386MOVSSloadidx1, ssa.Op386MOVSDloadidx1:
+		r := gc.SSARegNum(v.Args[0])
+		i := gc.SSARegNum(v.Args[1])
+		if i == x86.REG_SP {
+			r, i = i, r
+		}
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_MEM
+		p.From.Reg = r
+		p.From.Scale = 1
+		p.From.Index = i
+		gc.AddAux(&p.From, v)
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.Op386MOVSSstore, ssa.Op386MOVSDstore, ssa.Op386MOVLstore, ssa.Op386MOVWstore, ssa.Op386MOVBstore, ssa.Op386MOVOstore:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = gc.SSARegNum(v.Args[1])
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = gc.SSARegNum(v.Args[0])
+		gc.AddAux(&p.To, v)
+	case ssa.Op386MOVSDstoreidx8:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = gc.SSARegNum(v.Args[2])
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = gc.SSARegNum(v.Args[0])
+		p.To.Scale = 8
+		p.To.Index = gc.SSARegNum(v.Args[1])
+		gc.AddAux(&p.To, v)
+	case ssa.Op386MOVSSstoreidx4, ssa.Op386MOVLstoreidx4:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = gc.SSARegNum(v.Args[2])
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = gc.SSARegNum(v.Args[0])
+		p.To.Scale = 4
+		p.To.Index = gc.SSARegNum(v.Args[1])
+		gc.AddAux(&p.To, v)
+	case ssa.Op386MOVWstoreidx2:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = gc.SSARegNum(v.Args[2])
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = gc.SSARegNum(v.Args[0])
+		p.To.Scale = 2
+		p.To.Index = gc.SSARegNum(v.Args[1])
+		gc.AddAux(&p.To, v)
+	case ssa.Op386MOVBstoreidx1, ssa.Op386MOVWstoreidx1, ssa.Op386MOVLstoreidx1, ssa.Op386MOVSSstoreidx1, ssa.Op386MOVSDstoreidx1:
+		r := gc.SSARegNum(v.Args[0])
+		i := gc.SSARegNum(v.Args[1])
+		if i == x86.REG_SP {
+			r, i = i, r
+		}
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = gc.SSARegNum(v.Args[2])
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = r
+		p.To.Scale = 1
+		p.To.Index = i
+		gc.AddAux(&p.To, v)
+	case ssa.Op386MOVLstoreconst, ssa.Op386MOVWstoreconst, ssa.Op386MOVBstoreconst:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_CONST
+		sc := v.AuxValAndOff()
+		p.From.Offset = sc.Val()
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = gc.SSARegNum(v.Args[0])
+		gc.AddAux2(&p.To, v, sc.Off())
+	case ssa.Op386MOVLstoreconstidx1, ssa.Op386MOVLstoreconstidx4, ssa.Op386MOVWstoreconstidx1, ssa.Op386MOVWstoreconstidx2, ssa.Op386MOVBstoreconstidx1:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_CONST
+		sc := v.AuxValAndOff()
+		p.From.Offset = sc.Val()
+		r := gc.SSARegNum(v.Args[0])
+		i := gc.SSARegNum(v.Args[1])
+		switch v.Op {
+		case ssa.Op386MOVBstoreconstidx1, ssa.Op386MOVWstoreconstidx1, ssa.Op386MOVLstoreconstidx1:
+			p.To.Scale = 1
+			if i == x86.REG_SP {
+				r, i = i, r
+			}
+		case ssa.Op386MOVWstoreconstidx2:
+			p.To.Scale = 2
+		case ssa.Op386MOVLstoreconstidx4:
+			p.To.Scale = 4
+		}
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = r
+		p.To.Index = i
+		gc.AddAux2(&p.To, v, sc.Off())
+	case ssa.Op386MOVWLSX, ssa.Op386MOVBLSX, ssa.Op386MOVWLZX, ssa.Op386MOVBLZX,
+		ssa.Op386CVTSL2SS, ssa.Op386CVTSL2SD,
+		ssa.Op386CVTTSS2SL, ssa.Op386CVTTSD2SL,
+		ssa.Op386CVTSS2SD, ssa.Op386CVTSD2SS:
+		opregreg(v.Op.Asm(), gc.SSARegNum(v), gc.SSARegNum(v.Args[0]))
+	case ssa.Op386DUFFZERO:
+		p := gc.Prog(obj.ADUFFZERO)
+		p.To.Type = obj.TYPE_ADDR
+		p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg))
+		p.To.Offset = v.AuxInt
+	case ssa.Op386MOVOconst:
+		if v.AuxInt != 0 {
+			v.Unimplementedf("MOVOconst can only do constant=0")
+		}
+		r := gc.SSARegNum(v)
+		opregreg(x86.AXORPS, r, r)
+	case ssa.Op386DUFFCOPY:
+		p := gc.Prog(obj.ADUFFCOPY)
+		p.To.Type = obj.TYPE_ADDR
+		p.To.Sym = gc.Linksym(gc.Pkglookup("duffcopy", gc.Runtimepkg))
+		p.To.Offset = v.AuxInt
+
+	case ssa.OpCopy, ssa.Op386MOVLconvert: // TODO: use MOVLreg for reg->reg copies instead of OpCopy?
+		if v.Type.IsMemory() {
+			return
+		}
+		x := gc.SSARegNum(v.Args[0])
+		y := gc.SSARegNum(v)
+		if x != y {
+			opregreg(moveByType(v.Type), y, x)
+		}
+	case ssa.OpLoadReg:
+		if v.Type.IsFlags() {
+			v.Unimplementedf("load flags not implemented: %v", v.LongString())
+			return
+		}
+		p := gc.Prog(loadByType(v.Type))
+		n, off := gc.AutoVar(v.Args[0])
+		p.From.Type = obj.TYPE_MEM
+		p.From.Node = n
+		p.From.Sym = gc.Linksym(n.Sym)
+		p.From.Offset = off
+		if n.Class == gc.PPARAM || n.Class == gc.PPARAMOUT {
+			p.From.Name = obj.NAME_PARAM
+			p.From.Offset += n.Xoffset
+		} else {
+			p.From.Name = obj.NAME_AUTO
+		}
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+
+	case ssa.OpStoreReg:
+		if v.Type.IsFlags() {
+			v.Unimplementedf("store flags not implemented: %v", v.LongString())
+			return
+		}
+		p := gc.Prog(storeByType(v.Type))
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = gc.SSARegNum(v.Args[0])
+		n, off := gc.AutoVar(v)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Node = n
+		p.To.Sym = gc.Linksym(n.Sym)
+		p.To.Offset = off
+		if n.Class == gc.PPARAM || n.Class == gc.PPARAMOUT {
+			p.To.Name = obj.NAME_PARAM
+			p.To.Offset += n.Xoffset
+		} else {
+			p.To.Name = obj.NAME_AUTO
+		}
+	case ssa.OpPhi:
+		gc.CheckLoweredPhi(v)
+	case ssa.OpInitMem:
+		// memory arg needs no code
+	case ssa.OpArg:
+		// input args need no code
+	case ssa.Op386LoweredGetClosurePtr:
+		// Closure pointer is DX.
+		gc.CheckLoweredGetClosurePtr(v)
+	case ssa.Op386LoweredGetG:
+		r := gc.SSARegNum(v)
+		// See the comments in cmd/internal/obj/x86/obj6.go
+		// near CanUse1InsnTLS for a detailed explanation of these instructions.
+		if x86.CanUse1InsnTLS(gc.Ctxt) {
+			// MOVQ (TLS), r
+			p := gc.Prog(x86.AMOVL)
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = x86.REG_TLS
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = r
+		} else {
+			// MOVQ TLS, r
+			// MOVQ (r)(TLS*1), r
+			p := gc.Prog(x86.AMOVL)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = x86.REG_TLS
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = r
+			q := gc.Prog(x86.AMOVL)
+			q.From.Type = obj.TYPE_MEM
+			q.From.Reg = r
+			q.From.Index = x86.REG_TLS
+			q.From.Scale = 1
+			q.To.Type = obj.TYPE_REG
+			q.To.Reg = r
+		}
+	case ssa.Op386CALLstatic:
+		if v.Aux.(*gc.Sym) == gc.Deferreturn.Sym {
+			// Deferred calls will appear to be returning to
+			// the CALL deferreturn(SB) that we are about to emit.
+			// However, the stack trace code will show the line
+			// of the instruction byte before the return PC.
+			// To avoid that being an unrelated instruction,
+			// insert an actual hardware NOP that will have the right line number.
+			// This is different from obj.ANOP, which is a virtual no-op
+			// that doesn't make it into the instruction stream.
+			ginsnop()
+		}
+		p := gc.Prog(obj.ACALL)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = gc.Linksym(v.Aux.(*gc.Sym))
+		if gc.Maxarg < v.AuxInt {
+			gc.Maxarg = v.AuxInt
+		}
+	case ssa.Op386CALLclosure:
+		p := gc.Prog(obj.ACALL)
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v.Args[0])
+		if gc.Maxarg < v.AuxInt {
+			gc.Maxarg = v.AuxInt
+		}
+	case ssa.Op386CALLdefer:
+		p := gc.Prog(obj.ACALL)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = gc.Linksym(gc.Deferproc.Sym)
+		if gc.Maxarg < v.AuxInt {
+			gc.Maxarg = v.AuxInt
+		}
+	case ssa.Op386CALLgo:
+		p := gc.Prog(obj.ACALL)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = gc.Linksym(gc.Newproc.Sym)
+		if gc.Maxarg < v.AuxInt {
+			gc.Maxarg = v.AuxInt
+		}
+	case ssa.Op386CALLinter:
+		p := gc.Prog(obj.ACALL)
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v.Args[0])
+		if gc.Maxarg < v.AuxInt {
+			gc.Maxarg = v.AuxInt
+		}
+	case ssa.Op386NEGL,
+		ssa.Op386BSWAPL,
+		ssa.Op386NOTL:
+		r := gc.SSARegNum(v)
+		if r != gc.SSARegNum(v.Args[0]) {
+			v.Fatalf("input[0] and output not in same register %s", v.LongString())
+		}
+		p := gc.Prog(v.Op.Asm())
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = r
+	case ssa.Op386BSFL, ssa.Op386BSFW,
+		ssa.Op386BSRL, ssa.Op386BSRW,
+		ssa.Op386SQRTSD:
+		p := gc.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = gc.SSARegNum(v.Args[0])
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+	case ssa.OpSP, ssa.OpSB:
+		// nothing to do
+	case ssa.Op386SETEQ, ssa.Op386SETNE,
+		ssa.Op386SETL, ssa.Op386SETLE,
+		ssa.Op386SETG, ssa.Op386SETGE,
+		ssa.Op386SETGF, ssa.Op386SETGEF,
+		ssa.Op386SETB, ssa.Op386SETBE,
+		ssa.Op386SETORD, ssa.Op386SETNAN,
+		ssa.Op386SETA, ssa.Op386SETAE:
+		p := gc.Prog(v.Op.Asm())
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+
+	case ssa.Op386SETNEF:
+		p := gc.Prog(v.Op.Asm())
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+		q := gc.Prog(x86.ASETPS)
+		q.To.Type = obj.TYPE_REG
+		q.To.Reg = x86.REG_AX
+		opregreg(x86.AORL, gc.SSARegNum(v), x86.REG_AX)
+
+	case ssa.Op386SETEQF:
+		p := gc.Prog(v.Op.Asm())
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = gc.SSARegNum(v)
+		q := gc.Prog(x86.ASETPC)
+		q.To.Type = obj.TYPE_REG
+		q.To.Reg = x86.REG_AX
+		opregreg(x86.AANDL, gc.SSARegNum(v), x86.REG_AX)
+
+	case ssa.Op386InvertFlags:
+		v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString())
+	case ssa.Op386FlagEQ, ssa.Op386FlagLT_ULT, ssa.Op386FlagLT_UGT, ssa.Op386FlagGT_ULT, ssa.Op386FlagGT_UGT:
+		v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString())
+	case ssa.Op386REPSTOSL:
+		gc.Prog(x86.AREP)
+		gc.Prog(x86.ASTOSL)
+	case ssa.Op386REPMOVSL:
+		gc.Prog(x86.AREP)
+		gc.Prog(x86.AMOVSL)
+	case ssa.OpVarDef:
+		gc.Gvardef(v.Aux.(*gc.Node))
+	case ssa.OpVarKill:
+		gc.Gvarkill(v.Aux.(*gc.Node))
+	case ssa.OpVarLive:
+		gc.Gvarlive(v.Aux.(*gc.Node))
+	case ssa.OpKeepAlive:
+		if !v.Args[0].Type.IsPtrShaped() {
+			v.Fatalf("keeping non-pointer alive %v", v.Args[0])
+		}
+		n, off := gc.AutoVar(v.Args[0])
+		if n == nil {
+			v.Fatalf("KeepLive with non-spilled value %s %s", v, v.Args[0])
+		}
+		if off != 0 {
+			v.Fatalf("KeepLive with non-zero offset spill location %s:%d", n, off)
+		}
+		gc.Gvarlive(n)
+	case ssa.Op386LoweredNilCheck:
+		// Optimization - if the subsequent block has a load or store
+		// at the same address, we don't need to issue this instruction.
+		mem := v.Args[1]
+		for _, w := range v.Block.Succs[0].Block().Values {
+			if w.Op == ssa.OpPhi {
+				if w.Type.IsMemory() {
+					mem = w
+				}
+				continue
+			}
+			if len(w.Args) == 0 || !w.Args[len(w.Args)-1].Type.IsMemory() {
+				// w doesn't use a store - can't be a memory op.
+				continue
+			}
+			if w.Args[len(w.Args)-1] != mem {
+				v.Fatalf("wrong store after nilcheck v=%s w=%s", v, w)
+			}
+			switch w.Op {
+			case ssa.Op386MOVLload, ssa.Op386MOVWload, ssa.Op386MOVBload,
+				ssa.Op386MOVLstore, ssa.Op386MOVWstore, ssa.Op386MOVBstore,
+				ssa.Op386MOVBLSXload, ssa.Op386MOVWLSXload,
+				ssa.Op386MOVSSload, ssa.Op386MOVSDload, ssa.Op386MOVOload,
+				ssa.Op386MOVSSstore, ssa.Op386MOVSDstore, ssa.Op386MOVOstore:
+				if w.Args[0] == v.Args[0] && w.Aux == nil && w.AuxInt >= 0 && w.AuxInt < minZeroPage {
+					if gc.Debug_checknil != 0 && int(v.Line) > 1 {
+						gc.Warnl(v.Line, "removed nil check")
+					}
+					return
+				}
+			case ssa.Op386MOVLstoreconst, ssa.Op386MOVWstoreconst, ssa.Op386MOVBstoreconst:
+				off := ssa.ValAndOff(v.AuxInt).Off()
+				if w.Args[0] == v.Args[0] && w.Aux == nil && off >= 0 && off < minZeroPage {
+					if gc.Debug_checknil != 0 && int(v.Line) > 1 {
+						gc.Warnl(v.Line, "removed nil check")
+					}
+					return
+				}
+			}
+			if w.Type.IsMemory() {
+				if w.Op == ssa.OpVarDef || w.Op == ssa.OpVarKill || w.Op == ssa.OpVarLive {
+					// these ops are OK
+					mem = w
+					continue
+				}
+				// We can't delay the nil check past the next store.
+				break
+			}
+		}
+		// Issue a load which will fault if the input is nil.
+		// TODO: We currently use the 2-byte instruction TESTB AX, (reg).
+		// Should we use the 3-byte TESTB $0, (reg) instead?  It is larger
+		// but it doesn't have false dependency on AX.
+		// Or maybe allocate an output register and use MOVL (reg),reg2 ?
+		// That trades clobbering flags for clobbering a register.
+		p := gc.Prog(x86.ATESTB)
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = x86.REG_AX
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = gc.SSARegNum(v.Args[0])
+		gc.AddAux(&p.To, v)
+		if gc.Debug_checknil != 0 && v.Line > 1 { // v.Line==1 in generated wrappers
+			gc.Warnl(v.Line, "generated nil check")
+		}
+	default:
+		v.Unimplementedf("genValue not implemented: %s", v.LongString())
+	}
+}
+
+var blockJump = [...]struct {
+	asm, invasm obj.As
+}{
+	ssa.Block386EQ:  {x86.AJEQ, x86.AJNE},
+	ssa.Block386NE:  {x86.AJNE, x86.AJEQ},
+	ssa.Block386LT:  {x86.AJLT, x86.AJGE},
+	ssa.Block386GE:  {x86.AJGE, x86.AJLT},
+	ssa.Block386LE:  {x86.AJLE, x86.AJGT},
+	ssa.Block386GT:  {x86.AJGT, x86.AJLE},
+	ssa.Block386ULT: {x86.AJCS, x86.AJCC},
+	ssa.Block386UGE: {x86.AJCC, x86.AJCS},
+	ssa.Block386UGT: {x86.AJHI, x86.AJLS},
+	ssa.Block386ULE: {x86.AJLS, x86.AJHI},
+	ssa.Block386ORD: {x86.AJPC, x86.AJPS},
+	ssa.Block386NAN: {x86.AJPS, x86.AJPC},
+}
+
+var eqfJumps = [2][2]gc.FloatingEQNEJump{
+	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPS, Index: 1}}, // next == b.Succs[0]
+	{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPC, Index: 0}}, // next == b.Succs[1]
+}
+var nefJumps = [2][2]gc.FloatingEQNEJump{
+	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPC, Index: 1}}, // next == b.Succs[0]
+	{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPS, Index: 0}}, // next == b.Succs[1]
+}
+
+func ssaGenBlock(s *gc.SSAGenState, b, next *ssa.Block) {
+	s.SetLineno(b.Line)
+
+	switch b.Kind {
+	case ssa.BlockPlain, ssa.BlockCall, ssa.BlockCheck:
+		if b.Succs[0].Block() != next {
+			p := gc.Prog(obj.AJMP)
+			p.To.Type = obj.TYPE_BRANCH
+			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
+		}
+	case ssa.BlockDefer:
+		// defer returns in rax:
+		// 0 if we should continue executing
+		// 1 if we should jump to deferreturn call
+		p := gc.Prog(x86.ATESTL)
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = x86.REG_AX
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = x86.REG_AX
+		p = gc.Prog(x86.AJNE)
+		p.To.Type = obj.TYPE_BRANCH
+		s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[1].Block()})
+		if b.Succs[0].Block() != next {
+			p := gc.Prog(obj.AJMP)
+			p.To.Type = obj.TYPE_BRANCH
+			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
+		}
+	case ssa.BlockExit:
+		gc.Prog(obj.AUNDEF) // tell plive.go that we never reach here
+	case ssa.BlockRet:
+		gc.Prog(obj.ARET)
+	case ssa.BlockRetJmp:
+		p := gc.Prog(obj.AJMP)
+		p.To.Type = obj.TYPE_MEM
+		p.To.Name = obj.NAME_EXTERN
+		p.To.Sym = gc.Linksym(b.Aux.(*gc.Sym))
+
+	case ssa.Block386EQF:
+		gc.SSAGenFPJump(s, b, next, &eqfJumps)
+
+	case ssa.Block386NEF:
+		gc.SSAGenFPJump(s, b, next, &nefJumps)
+
+	case ssa.Block386EQ, ssa.Block386NE,
+		ssa.Block386LT, ssa.Block386GE,
+		ssa.Block386LE, ssa.Block386GT,
+		ssa.Block386ULT, ssa.Block386UGT,
+		ssa.Block386ULE, ssa.Block386UGE:
+		jmp := blockJump[b.Kind]
+		likely := b.Likely
+		var p *obj.Prog
+		switch next {
+		case b.Succs[0].Block():
+			p = gc.Prog(jmp.invasm)
+			likely *= -1
+			p.To.Type = obj.TYPE_BRANCH
+			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[1].Block()})
+		case b.Succs[1].Block():
+			p = gc.Prog(jmp.asm)
+			p.To.Type = obj.TYPE_BRANCH
+			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
+		default:
+			p = gc.Prog(jmp.asm)
+			p.To.Type = obj.TYPE_BRANCH
+			s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
+			q := gc.Prog(obj.AJMP)
+			q.To.Type = obj.TYPE_BRANCH
+			s.Branches = append(s.Branches, gc.Branch{P: q, B: b.Succs[1].Block()})
+		}
+
+		// liblink reorders the instruction stream as it sees fit.
+		// Pass along what we know so liblink can make use of it.
+		// TODO: Once we've fully switched to SSA,
+		// make liblink leave our output alone.
+		switch likely {
+		case ssa.BranchUnlikely:
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = 0
+		case ssa.BranchLikely:
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = 1
+		}
+
+	default:
+		b.Unimplementedf("branch not implemented: %s. Control: %s", b.LongString(), b.Control.LongString())
+	}
+}