mirror of https://github.com/golang/go.git
cmd/compile: optimize ARM64's code with MADD/MSUB
MADD does MUL-ADD in a single instruction, and MSUB does the similiar simplification for MUL-SUB. The CL implements the optimization with MADD/MSUB. 1. The total size of pkg/android_arm64/ decreases about 20KB, excluding cmd/compile/. 2. The go1 benchmark shows a little improvement for RegexpMatchHard_32-4 and Template-4, excluding noise. name old time/op new time/op delta BinaryTree17-4 16.3s ± 1% 16.5s ± 1% +1.41% (p=0.000 n=26+28) Fannkuch11-4 8.79s ± 1% 8.76s ± 0% -0.36% (p=0.000 n=26+28) FmtFprintfEmpty-4 172ns ± 0% 172ns ± 0% ~ (all equal) FmtFprintfString-4 362ns ± 1% 364ns ± 0% +0.55% (p=0.000 n=30+30) FmtFprintfInt-4 416ns ± 0% 416ns ± 0% ~ (p=0.099 n=22+30) FmtFprintfIntInt-4 655ns ± 1% 660ns ± 1% +0.76% (p=0.000 n=30+30) FmtFprintfPrefixedInt-4 810ns ± 0% 809ns ± 0% -0.08% (p=0.009 n=29+29) FmtFprintfFloat-4 1.08µs ± 0% 1.09µs ± 0% +0.61% (p=0.000 n=30+29) FmtManyArgs-4 2.70µs ± 0% 2.69µs ± 0% -0.23% (p=0.000 n=29+28) GobDecode-4 32.2ms ± 1% 32.1ms ± 1% -0.39% (p=0.000 n=27+26) GobEncode-4 27.4ms ± 2% 27.4ms ± 1% ~ (p=0.864 n=28+28) Gzip-4 1.53s ± 1% 1.52s ± 1% -0.30% (p=0.031 n=29+29) Gunzip-4 146ms ± 0% 146ms ± 0% -0.14% (p=0.001 n=25+30) HTTPClientServer-4 1.00ms ± 4% 0.98ms ± 6% -1.65% (p=0.001 n=29+30) JSONEncode-4 67.3ms ± 1% 67.2ms ± 1% ~ (p=0.520 n=28+28) JSONDecode-4 329ms ± 5% 330ms ± 4% ~ (p=0.142 n=30+30) Mandelbrot200-4 17.3ms ± 0% 17.3ms ± 0% ~ (p=0.055 n=26+29) GoParse-4 16.9ms ± 1% 17.0ms ± 1% +0.82% (p=0.000 n=30+30) RegexpMatchEasy0_32-4 382ns ± 0% 382ns ± 0% ~ (all equal) RegexpMatchEasy0_1K-4 1.33µs ± 0% 1.33µs ± 0% -0.25% (p=0.000 n=30+27) RegexpMatchEasy1_32-4 361ns ± 0% 361ns ± 0% -0.08% (p=0.002 n=30+28) RegexpMatchEasy1_1K-4 2.11µs ± 0% 2.09µs ± 0% -0.54% (p=0.000 n=30+29) RegexpMatchMedium_32-4 594ns ± 0% 592ns ± 0% -0.32% (p=0.000 n=30+30) RegexpMatchMedium_1K-4 173µs ± 0% 172µs ± 0% -0.77% (p=0.000 n=29+27) RegexpMatchHard_32-4 10.4µs ± 0% 10.1µs ± 0% -3.63% (p=0.000 n=28+27) RegexpMatchHard_1K-4 306µs ± 0% 301µs ± 0% -1.64% (p=0.000 n=29+30) Revcomp-4 2.51s ± 1% 2.52s ± 0% +0.18% (p=0.017 n=26+27) Template-4 394ms ± 3% 382ms ± 3% -3.22% (p=0.000 n=28+28) TimeParse-4 1.67µs ± 0% 1.67µs ± 0% +0.05% (p=0.030 n=27+30) TimeFormat-4 1.72µs ± 0% 1.70µs ± 0% -0.79% (p=0.000 n=28+26) [Geo mean] 259µs 259µs -0.33% name old speed new speed delta GobDecode-4 23.8MB/s ± 1% 23.9MB/s ± 1% +0.40% (p=0.001 n=27+26) GobEncode-4 28.0MB/s ± 2% 28.0MB/s ± 1% ~ (p=0.863 n=28+28) Gzip-4 12.7MB/s ± 1% 12.7MB/s ± 1% +0.32% (p=0.026 n=29+29) Gunzip-4 133MB/s ± 0% 133MB/s ± 0% +0.15% (p=0.001 n=24+30) JSONEncode-4 28.8MB/s ± 1% 28.9MB/s ± 1% ~ (p=0.475 n=28+28) JSONDecode-4 5.89MB/s ± 4% 5.87MB/s ± 5% ~ (p=0.174 n=29+30) GoParse-4 3.43MB/s ± 0% 3.40MB/s ± 1% -0.83% (p=0.000 n=28+30) RegexpMatchEasy0_32-4 83.6MB/s ± 0% 83.6MB/s ± 0% ~ (p=0.848 n=28+29) RegexpMatchEasy0_1K-4 768MB/s ± 0% 770MB/s ± 0% +0.25% (p=0.000 n=30+27) RegexpMatchEasy1_32-4 88.5MB/s ± 0% 88.5MB/s ± 0% ~ (p=0.086 n=29+29) RegexpMatchEasy1_1K-4 486MB/s ± 0% 489MB/s ± 0% +0.54% (p=0.000 n=30+29) RegexpMatchMedium_32-4 1.68MB/s ± 0% 1.69MB/s ± 0% +0.60% (p=0.000 n=30+23) RegexpMatchMedium_1K-4 5.90MB/s ± 0% 5.95MB/s ± 0% +0.85% (p=0.000 n=18+20) RegexpMatchHard_32-4 3.07MB/s ± 0% 3.18MB/s ± 0% +3.72% (p=0.000 n=29+26) RegexpMatchHard_1K-4 3.35MB/s ± 0% 3.40MB/s ± 0% +1.69% (p=0.000 n=30+30) Revcomp-4 101MB/s ± 0% 101MB/s ± 0% -0.18% (p=0.018 n=26+27) Template-4 4.92MB/s ± 4% 5.09MB/s ± 3% +3.31% (p=0.000 n=28+28) [Geo mean] 22.4MB/s 22.6MB/s +0.62% Change-Id: I8f304b272785739f57b3c8f736316f658f8c1b2a Reviewed-on: https://go-review.googlesource.com/129119 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
This commit is contained in:
parent
1018a80fe8
commit
b444215116
|
|
@ -212,7 +212,11 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
|
|||
ssa.OpARM64FMSUBS,
|
||||
ssa.OpARM64FMSUBD,
|
||||
ssa.OpARM64FNMSUBS,
|
||||
ssa.OpARM64FNMSUBD:
|
||||
ssa.OpARM64FNMSUBD,
|
||||
ssa.OpARM64MADD,
|
||||
ssa.OpARM64MADDW,
|
||||
ssa.OpARM64MSUB,
|
||||
ssa.OpARM64MSUBW:
|
||||
rt := v.Reg()
|
||||
ra := v.Args[0].Reg()
|
||||
rm := v.Args[1].Reg()
|
||||
|
|
|
|||
|
|
@ -594,6 +594,34 @@
|
|||
(EQ (CMPWconst [0] x) yes no) -> (ZW x yes no)
|
||||
(NE (CMPWconst [0] x) yes no) -> (NZW x yes no)
|
||||
|
||||
(EQ (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 -> (EQ (CMN a (MUL <x.Type> x y)) yes no)
|
||||
(NE (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 -> (NE (CMN a (MUL <x.Type> x y)) yes no)
|
||||
(LT (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 -> (LT (CMN a (MUL <x.Type> x y)) yes no)
|
||||
(LE (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 -> (LE (CMN a (MUL <x.Type> x y)) yes no)
|
||||
(GT (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 -> (GT (CMN a (MUL <x.Type> x y)) yes no)
|
||||
(GE (CMPconst [0] z:(MADD a x y)) yes no) && z.Uses==1 -> (GE (CMN a (MUL <x.Type> x y)) yes no)
|
||||
|
||||
(EQ (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 -> (EQ (CMP a (MUL <x.Type> x y)) yes no)
|
||||
(NE (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 -> (NE (CMP a (MUL <x.Type> x y)) yes no)
|
||||
(LE (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 -> (LE (CMP a (MUL <x.Type> x y)) yes no)
|
||||
(LT (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 -> (LT (CMP a (MUL <x.Type> x y)) yes no)
|
||||
(GE (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 -> (GE (CMP a (MUL <x.Type> x y)) yes no)
|
||||
(GT (CMPconst [0] z:(MSUB a x y)) yes no) && z.Uses==1 -> (GT (CMP a (MUL <x.Type> x y)) yes no)
|
||||
|
||||
(EQ (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 -> (EQ (CMNW a (MULW <x.Type> x y)) yes no)
|
||||
(NE (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 -> (NE (CMNW a (MULW <x.Type> x y)) yes no)
|
||||
(LE (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 -> (LE (CMNW a (MULW <x.Type> x y)) yes no)
|
||||
(LT (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 -> (LT (CMNW a (MULW <x.Type> x y)) yes no)
|
||||
(GE (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 -> (GE (CMNW a (MULW <x.Type> x y)) yes no)
|
||||
(GT (CMPWconst [0] z:(MADDW a x y)) yes no) && z.Uses==1 -> (GT (CMNW a (MULW <x.Type> x y)) yes no)
|
||||
|
||||
(EQ (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 -> (EQ (CMPW a (MULW <x.Type> x y)) yes no)
|
||||
(NE (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 -> (NE (CMPW a (MULW <x.Type> x y)) yes no)
|
||||
(LE (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 -> (LE (CMPW a (MULW <x.Type> x y)) yes no)
|
||||
(LT (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 -> (LT (CMPW a (MULW <x.Type> x y)) yes no)
|
||||
(GE (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 -> (GE (CMPW a (MULW <x.Type> x y)) yes no)
|
||||
(GT (CMPWconst [0] z:(MSUBW a x y)) yes no) && z.Uses==1 -> (GT (CMPW a (MULW <x.Type> x y)) yes no)
|
||||
|
||||
// Absorb bit-tests into block
|
||||
(Z (ANDconst [c] x) yes no) && oneBit(c) -> (TBZ {ntz(c)} x yes no)
|
||||
(NZ (ANDconst [c] x) yes no) && oneBit(c) -> (TBNZ {ntz(c)} x yes no)
|
||||
|
|
@ -1058,6 +1086,17 @@
|
|||
(MUL (NEG x) y) -> (MNEG x y)
|
||||
(MULW (NEG x) y) -> (MNEGW x y)
|
||||
|
||||
// madd/msub
|
||||
(ADD a l:(MUL x y)) && l.Uses==1 && x.Op!=OpARM64MOVDconst && y.Op!=OpARM64MOVDconst && a.Op!=OpARM64MOVDconst && clobber(l) -> (MADD a x y)
|
||||
(SUB a l:(MUL x y)) && l.Uses==1 && x.Op!=OpARM64MOVDconst && y.Op!=OpARM64MOVDconst && a.Op!=OpARM64MOVDconst && clobber(l) -> (MSUB a x y)
|
||||
(ADD a l:(MNEG x y)) && l.Uses==1 && x.Op!=OpARM64MOVDconst && y.Op!=OpARM64MOVDconst && a.Op!=OpARM64MOVDconst && clobber(l) -> (MSUB a x y)
|
||||
(SUB a l:(MNEG x y)) && l.Uses==1 && x.Op!=OpARM64MOVDconst && y.Op!=OpARM64MOVDconst && a.Op!=OpARM64MOVDconst && clobber(l) -> (MADD a x y)
|
||||
|
||||
(ADD a l:(MULW x y)) && l.Uses==1 && a.Type.Size() != 8 && x.Op!=OpARM64MOVDconst && y.Op!=OpARM64MOVDconst && a.Op!=OpARM64MOVDconst && clobber(l) -> (MADDW a x y)
|
||||
(SUB a l:(MULW x y)) && l.Uses==1 && a.Type.Size() != 8 && x.Op!=OpARM64MOVDconst && y.Op!=OpARM64MOVDconst && a.Op!=OpARM64MOVDconst && clobber(l) -> (MSUBW a x y)
|
||||
(ADD a l:(MNEGW x y)) && l.Uses==1 && a.Type.Size() != 8 && x.Op!=OpARM64MOVDconst && y.Op!=OpARM64MOVDconst && a.Op!=OpARM64MOVDconst && clobber(l) -> (MSUBW a x y)
|
||||
(SUB a l:(MNEGW x y)) && l.Uses==1 && a.Type.Size() != 8 && x.Op!=OpARM64MOVDconst && y.Op!=OpARM64MOVDconst && a.Op!=OpARM64MOVDconst && clobber(l) -> (MADDW a x y)
|
||||
|
||||
// mul by constant
|
||||
(MUL x (MOVDconst [-1])) -> (NEG x)
|
||||
(MUL _ (MOVDconst [0])) -> (MOVDconst [0])
|
||||
|
|
|
|||
|
|
@ -139,6 +139,7 @@ func init() {
|
|||
gp1flags = regInfo{inputs: []regMask{gpg}}
|
||||
gp1flags1 = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}}
|
||||
gp21 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}}
|
||||
gp31 = regInfo{inputs: []regMask{gpg, gpg, gpg}, outputs: []regMask{gp}}
|
||||
gp21nog = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}}
|
||||
gp2flags = regInfo{inputs: []regMask{gpg, gpg}}
|
||||
gp2flags1 = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}}
|
||||
|
|
@ -235,6 +236,10 @@ func init() {
|
|||
{name: "FMSUBD", argLength: 3, reg: fp31, asm: "FMSUBD"}, // +arg0 - (arg1 * arg2)
|
||||
{name: "FNMSUBS", argLength: 3, reg: fp31, asm: "FNMSUBS"}, // -arg0 + (arg1 * arg2)
|
||||
{name: "FNMSUBD", argLength: 3, reg: fp31, asm: "FNMSUBD"}, // -arg0 + (arg1 * arg2)
|
||||
{name: "MADD", argLength: 3, reg: gp31, asm: "MADD"}, // +arg0 + (arg1 * arg2)
|
||||
{name: "MADDW", argLength: 3, reg: gp31, asm: "MADDW"}, // +arg0 + (arg1 * arg2), 32-bit
|
||||
{name: "MSUB", argLength: 3, reg: gp31, asm: "MSUB"}, // +arg0 - (arg1 * arg2)
|
||||
{name: "MSUBW", argLength: 3, reg: gp31, asm: "MSUBW"}, // +arg0 - (arg1 * arg2), 32-bit
|
||||
|
||||
// shifts
|
||||
{name: "SLL", argLength: 2, reg: gp21, asm: "LSL"}, // arg0 << arg1, shift amount is mod 64
|
||||
|
|
|
|||
|
|
@ -1129,6 +1129,10 @@ const (
|
|||
OpARM64FMSUBD
|
||||
OpARM64FNMSUBS
|
||||
OpARM64FNMSUBD
|
||||
OpARM64MADD
|
||||
OpARM64MADDW
|
||||
OpARM64MSUB
|
||||
OpARM64MSUBW
|
||||
OpARM64SLL
|
||||
OpARM64SLLconst
|
||||
OpARM64SRL
|
||||
|
|
@ -14954,6 +14958,66 @@ var opcodeTable = [...]opInfo{
|
|||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "MADD",
|
||||
argLen: 3,
|
||||
asm: arm64.AMADD,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
|
||||
{1, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
|
||||
{2, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "MADDW",
|
||||
argLen: 3,
|
||||
asm: arm64.AMADDW,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
|
||||
{1, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
|
||||
{2, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "MSUB",
|
||||
argLen: 3,
|
||||
asm: arm64.AMSUB,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
|
||||
{1, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
|
||||
{2, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "MSUBW",
|
||||
argLen: 3,
|
||||
asm: arm64.AMSUBW,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
|
||||
{1, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
|
||||
{2, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "SLL",
|
||||
argLen: 2,
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -205,3 +205,9 @@ func AddMul(x int) int {
|
|||
// amd64:"LEAQ\t1"
|
||||
return 2*x + 1
|
||||
}
|
||||
|
||||
func MULA(a, b, c uint32) uint32 {
|
||||
// arm:`MULA`
|
||||
// arm64:`MADDW`
|
||||
return a*b + c
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue