mirror of https://github.com/golang/go.git
bytes: speed up Compare() on amd64
Use AVX2 if available. Results (haswell), below: name old time/op new time/op delta BytesCompare1-6 11.4ns ± 0% 11.4ns ± 0% ~ (all samples are equal) BytesCompare2-6 11.4ns ± 0% 11.4ns ± 0% ~ (all samples are equal) BytesCompare4-6 11.4ns ± 0% 11.4ns ± 0% ~ (all samples are equal) BytesCompare8-6 9.29ns ± 2% 8.76ns ± 0% -5.72% (p=0.000 n=16+17) BytesCompare16-6 9.29ns ± 2% 9.20ns ± 0% -1.02% (p=0.000 n=20+16) BytesCompare32-6 11.4ns ± 1% 11.4ns ± 0% ~ (p=0.191 n=20+20) BytesCompare64-6 14.4ns ± 0% 13.1ns ± 0% -8.68% (p=0.000 n=20+20) BytesCompare128-6 20.2ns ± 0% 18.5ns ± 0% -8.27% (p=0.000 n=16+20) BytesCompare256-6 29.3ns ± 0% 24.5ns ± 0% -16.38% (p=0.000 n=16+16) BytesCompare512-6 46.8ns ± 0% 37.1ns ± 0% -20.78% (p=0.000 n=18+16) BytesCompare1024-6 82.9ns ± 0% 62.3ns ± 0% -24.86% (p=0.000 n=20+14) BytesCompare2048-6 155ns ± 0% 112ns ± 0% -27.74% (p=0.000 n=20+20) CompareBytesEqual-6 10.1ns ± 1% 10.0ns ± 1% ~ (p=0.527 n=20+20) CompareBytesToNil-6 10.0ns ± 2% 9.4ns ± 0% -6.57% (p=0.000 n=20+17) CompareBytesEmpty-6 8.76ns ± 0% 8.76ns ± 0% ~ (all samples are equal) CompareBytesIdentical-6 8.76ns ± 0% 8.76ns ± 0% ~ (all samples are equal) CompareBytesSameLength-6 10.6ns ± 1% 10.6ns ± 1% ~ (p=0.240 n=20+20) CompareBytesDifferentLength-6 10.6ns ± 0% 10.6ns ± 1% ~ (p=1.000 n=20+20) CompareBytesBigUnaligned-6 132µs ± 1% 105µs ± 1% -20.61% (p=0.000 n=20+18) CompareBytesBig-6 125µs ± 1% 105µs ± 1% -16.31% (p=0.000 n=20+20) CompareBytesBigIdentical-6 8.13ns ± 0% 8.13ns ± 0% ~ (all samples are equal) name old speed new speed delta CompareBytesBigUnaligned-6 7.94GB/s ± 1% 10.01GB/s ± 1% +25.96% (p=0.000 n=20+18) CompareBytesBig-6 8.38GB/s ± 1% 10.01GB/s ± 1% +19.48% (p=0.000 n=20+20) CompareBytesBigIdentical-6 129TB/s ± 0% 129TB/s ± 0% +0.01% (p=0.003 n=17+19) Change-Id: I820f31bab4582dd4204b146bb077c0d2f24cd8f5 Reviewed-on: https://go-review.googlesource.com/16434 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> Reviewed-by: Klaus Post <klauspost@gmail.com> TryBot-Result: Gobot 
Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
parent
cf73357e37
commit
0e23ca41d9
|
|
@ -746,6 +746,8 @@ const (
|
||||||
AMOVHDU
|
AMOVHDU
|
||||||
AMOVNTHD
|
AMOVNTHD
|
||||||
AMOVHDA
|
AMOVHDA
|
||||||
|
AVPCMPEQB
|
||||||
|
AVPMOVMSKB
|
||||||
|
|
||||||
// from 386
|
// from 386
|
||||||
AJCXZW
|
AJCXZW
|
||||||
|
|
|
||||||
|
|
@ -687,6 +687,8 @@ var Anames = []string{
|
||||||
"MOVHDU",
|
"MOVHDU",
|
||||||
"MOVNTHD",
|
"MOVNTHD",
|
||||||
"MOVHDA",
|
"MOVHDA",
|
||||||
|
"VPCMPEQB",
|
||||||
|
"VPMOVMSKB",
|
||||||
"JCXZW",
|
"JCXZW",
|
||||||
"FCMOVCC",
|
"FCMOVCC",
|
||||||
"FCMOVCS",
|
"FCMOVCS",
|
||||||
|
|
|
||||||
|
|
@ -195,6 +195,7 @@ const (
|
||||||
Zr_m
|
Zr_m
|
||||||
Zr_m_xm
|
Zr_m_xm
|
||||||
Zr_m_xm_vex
|
Zr_m_xm_vex
|
||||||
|
Zr_r_r_vex
|
||||||
Zrp_
|
Zrp_
|
||||||
Z_ib
|
Z_ib
|
||||||
Z_il
|
Z_il
|
||||||
|
|
@ -630,6 +631,11 @@ var yxr_ml_vex = []ytab{
|
||||||
{Yxr, Ynone, Yml, Zr_m_xm_vex, 1},
|
{Yxr, Ynone, Yml, Zr_m_xm_vex, 1},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var yxm_xm_xm = []ytab{
|
||||||
|
{Yxr, Yxr, Yxr, Zr_r_r_vex, 1},
|
||||||
|
{Yxm, Yxr, Yxr, Zr_r_r_vex, 1},
|
||||||
|
}
|
||||||
|
|
||||||
var ymr = []ytab{
|
var ymr = []ytab{
|
||||||
{Ymr, Ynone, Ymr, Zm_r, 1},
|
{Ymr, Ynone, Ymr, Zm_r, 1},
|
||||||
}
|
}
|
||||||
|
|
@ -725,6 +731,10 @@ var ymskb = []ytab{
|
||||||
{Ymr, Ynone, Yrl, Zm_r_xm, 1},
|
{Ymr, Ynone, Yrl, Zm_r_xm, 1},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var ymskb_vex = []ytab{
|
||||||
|
{Yxr, Ynone, Yrl, Zm_r_xm_vex, 2},
|
||||||
|
}
|
||||||
|
|
||||||
var ycrc32l = []ytab{
|
var ycrc32l = []ytab{
|
||||||
{Yml, Ynone, Yrl, Zlitm_r, 0},
|
{Yml, Ynone, Yrl, Zlitm_r, 0},
|
||||||
}
|
}
|
||||||
|
|
@ -1497,6 +1507,8 @@ var optab =
|
||||||
{AMOVHDU, yxmov_vex, Pvex2, [23]uint8{0x6f, 0x7f}},
|
{AMOVHDU, yxmov_vex, Pvex2, [23]uint8{0x6f, 0x7f}},
|
||||||
{AMOVNTHD, yxr_ml_vex, Pvex1, [23]uint8{0xe7}},
|
{AMOVNTHD, yxr_ml_vex, Pvex1, [23]uint8{0xe7}},
|
||||||
{AMOVHDA, yxmov_vex, Pvex1, [23]uint8{0x6f, 0x7f}},
|
{AMOVHDA, yxmov_vex, Pvex1, [23]uint8{0x6f, 0x7f}},
|
||||||
|
{AVPCMPEQB, yxm_xm_xm, Pvex1, [23]uint8{0x74, 0x74}},
|
||||||
|
{AVPMOVMSKB, ymskb_vex, Pvex1, [23]uint8{0xd7}},
|
||||||
{obj.AUSEFIELD, ynop, Px, [23]uint8{0, 0}},
|
{obj.AUSEFIELD, ynop, Px, [23]uint8{0, 0}},
|
||||||
{obj.ATYPE, nil, 0, [23]uint8{}},
|
{obj.ATYPE, nil, 0, [23]uint8{}},
|
||||||
{obj.AFUNCDATA, yfuncdata, Px, [23]uint8{0, 0}},
|
{obj.AFUNCDATA, yfuncdata, Px, [23]uint8{0, 0}},
|
||||||
|
|
@ -2943,11 +2955,15 @@ var bpduff2 = []byte{
|
||||||
0x48, 0x8b, 0x6d, 0x00, // MOVQ 0(BP), BP
|
0x48, 0x8b, 0x6d, 0x00, // MOVQ 0(BP), BP
|
||||||
}
|
}
|
||||||
|
|
||||||
func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, pref uint8) {
|
// Assemble vex prefix, from 3 operands and prefix.
|
||||||
|
// For details about vex prefix see:
|
||||||
|
// https://en.wikipedia.org/wiki/VEX_prefix#Technical_description
|
||||||
|
func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, from3 *obj.Addr, pref uint8) {
|
||||||
rexR := regrex[to.Reg]
|
rexR := regrex[to.Reg]
|
||||||
rexB := regrex[from.Reg]
|
rexB := regrex[from.Reg]
|
||||||
rexX := regrex[from.Index]
|
rexX := regrex[from.Index]
|
||||||
var prefBit uint8
|
var prefBit uint8
|
||||||
|
// This will go into VEX.PP field.
|
||||||
if pref == Pvex1 {
|
if pref == Pvex1 {
|
||||||
prefBit = 1
|
prefBit = 1
|
||||||
} else if pref == Pvex2 {
|
} else if pref == Pvex2 {
|
||||||
|
|
@ -2955,21 +2971,36 @@ func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, pref uint8) {
|
||||||
} // TODO add Pvex0,Pvex3
|
} // TODO add Pvex0,Pvex3
|
||||||
|
|
||||||
if rexX == 0 && rexB == 0 { // 2-byte vex prefix
|
if rexX == 0 && rexB == 0 { // 2-byte vex prefix
|
||||||
|
// In 2-byte case, first byte is always C5
|
||||||
ctxt.Andptr[0] = 0xc5
|
ctxt.Andptr[0] = 0xc5
|
||||||
ctxt.Andptr = ctxt.Andptr[1:]
|
ctxt.Andptr = ctxt.Andptr[1:]
|
||||||
|
|
||||||
if rexR != 0 {
|
if from3 == nil {
|
||||||
|
// If this is a 2-operand instruction fill VEX.VVVV with 1111
|
||||||
|
// We are also interested only in 256-bit version, so VEX.L=1
|
||||||
ctxt.Andptr[0] = 0x7c
|
ctxt.Andptr[0] = 0x7c
|
||||||
} else {
|
} else {
|
||||||
ctxt.Andptr[0] = 0xfc
|
// VEX.L=1
|
||||||
|
ctxt.Andptr[0] = 0x4
|
||||||
|
// VEX.VVVV (bits 3:6) is an inverted register number
|
||||||
|
ctxt.Andptr[0] |= byte((^(from3.Reg - REG_X0))<<3) & 0x78
|
||||||
|
}
|
||||||
|
|
||||||
|
// VEX encodes REX.R as the inverted upper bit
|
||||||
|
if rexR == 0 {
|
||||||
|
ctxt.Andptr[0] |= 0x80
|
||||||
}
|
}
|
||||||
ctxt.Andptr[0] |= prefBit
|
ctxt.Andptr[0] |= prefBit
|
||||||
ctxt.Andptr = ctxt.Andptr[1:]
|
ctxt.Andptr = ctxt.Andptr[1:]
|
||||||
} else {
|
} else { // 3-byte case
|
||||||
|
// First byte is always C4
|
||||||
ctxt.Andptr[0] = 0xc4
|
ctxt.Andptr[0] = 0xc4
|
||||||
ctxt.Andptr = ctxt.Andptr[1:]
|
ctxt.Andptr = ctxt.Andptr[1:]
|
||||||
|
|
||||||
|
// Encode VEX.mmmmm with prefix value, for now assume 0F,
|
||||||
|
// which encodes as 1.
|
||||||
ctxt.Andptr[0] = 0x1 // TODO handle different prefix
|
ctxt.Andptr[0] = 0x1 // TODO handle different prefix
|
||||||
|
// REX.[RXB] are inverted and encoded in 3 upper bits
|
||||||
if rexR == 0 {
|
if rexR == 0 {
|
||||||
ctxt.Andptr[0] |= 0x80
|
ctxt.Andptr[0] |= 0x80
|
||||||
}
|
}
|
||||||
|
|
@ -2981,7 +3012,13 @@ func vexprefix(ctxt *obj.Link, to *obj.Addr, from *obj.Addr, pref uint8) {
|
||||||
}
|
}
|
||||||
ctxt.Andptr = ctxt.Andptr[1:]
|
ctxt.Andptr = ctxt.Andptr[1:]
|
||||||
|
|
||||||
ctxt.Andptr[0] = 0x7c
|
// Fill VEX.VVVV and VEX.L, same as in the 2-byte VEX prefix case.
|
||||||
|
if from3 == nil {
|
||||||
|
ctxt.Andptr[0] = 0x7c
|
||||||
|
} else {
|
||||||
|
ctxt.Andptr[0] = 0x4
|
||||||
|
ctxt.Andptr[0] |= byte((^(from3.Reg - REG_X0))<<3) & 0x78
|
||||||
|
}
|
||||||
ctxt.Andptr[0] |= prefBit
|
ctxt.Andptr[0] |= prefBit
|
||||||
ctxt.Andptr = ctxt.Andptr[1:]
|
ctxt.Andptr = ctxt.Andptr[1:]
|
||||||
}
|
}
|
||||||
|
|
@ -3222,7 +3259,7 @@ func doasm(ctxt *obj.Link, p *obj.Prog) {
|
||||||
|
|
||||||
case Zm_r_xm_vex:
|
case Zm_r_xm_vex:
|
||||||
ctxt.Vexflag = 1
|
ctxt.Vexflag = 1
|
||||||
vexprefix(ctxt, &p.To, &p.From, o.prefix)
|
vexprefix(ctxt, &p.To, &p.From, nil, o.prefix)
|
||||||
ctxt.Andptr[0] = byte(op)
|
ctxt.Andptr[0] = byte(op)
|
||||||
ctxt.Andptr = ctxt.Andptr[1:]
|
ctxt.Andptr = ctxt.Andptr[1:]
|
||||||
asmand(ctxt, p, &p.From, &p.To)
|
asmand(ctxt, p, &p.From, &p.To)
|
||||||
|
|
@ -3284,11 +3321,18 @@ func doasm(ctxt *obj.Link, p *obj.Prog) {
|
||||||
|
|
||||||
case Zr_m_xm_vex:
|
case Zr_m_xm_vex:
|
||||||
ctxt.Vexflag = 1
|
ctxt.Vexflag = 1
|
||||||
vexprefix(ctxt, &p.From, &p.To, o.prefix)
|
vexprefix(ctxt, &p.From, &p.To, nil, o.prefix)
|
||||||
ctxt.Andptr[0] = byte(op)
|
ctxt.Andptr[0] = byte(op)
|
||||||
ctxt.Andptr = ctxt.Andptr[1:]
|
ctxt.Andptr = ctxt.Andptr[1:]
|
||||||
asmand(ctxt, p, &p.To, &p.From)
|
asmand(ctxt, p, &p.To, &p.From)
|
||||||
|
|
||||||
|
case Zr_r_r_vex:
|
||||||
|
ctxt.Vexflag = 1
|
||||||
|
vexprefix(ctxt, &p.To, &p.From, p.From3, o.prefix)
|
||||||
|
ctxt.Andptr[0] = byte(op)
|
||||||
|
ctxt.Andptr = ctxt.Andptr[1:]
|
||||||
|
asmand(ctxt, p, &p.From, &p.To)
|
||||||
|
|
||||||
case Zr_m_xm:
|
case Zr_m_xm:
|
||||||
mediaop(ctxt, o, op, int(yt.zoffset), z)
|
mediaop(ctxt, o, op, int(yt.zoffset), z)
|
||||||
asmand(ctxt, p, &p.To, &p.From)
|
asmand(ctxt, p, &p.To, &p.From)
|
||||||
|
|
|
||||||
|
|
@ -42,11 +42,37 @@ TEXT runtime·rt0_go(SB),NOSPLIT,$0
|
||||||
JNE notintel
|
JNE notintel
|
||||||
MOVB $1, runtime·lfenceBeforeRdtsc(SB)
|
MOVB $1, runtime·lfenceBeforeRdtsc(SB)
|
||||||
notintel:
|
notintel:
|
||||||
|
// Do nothing.
|
||||||
|
|
||||||
MOVQ $1, AX
|
MOVQ $1, AX
|
||||||
CPUID
|
CPUID
|
||||||
MOVL CX, runtime·cpuid_ecx(SB)
|
MOVL CX, runtime·cpuid_ecx(SB)
|
||||||
MOVL DX, runtime·cpuid_edx(SB)
|
MOVL DX, runtime·cpuid_edx(SB)
|
||||||
|
// Detect AVX and AVX2 as per 14.7.1 Detection of AVX2 chapter of [1]
|
||||||
|
// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
|
||||||
|
// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
|
||||||
|
ANDL $0x18000000, CX // check for OSXSAVE and AVX bits
|
||||||
|
CMPL CX, $0x18000000
|
||||||
|
JNE noavx
|
||||||
|
MOVL $0, CX
|
||||||
|
// For XGETBV, OSXSAVE bit is required and sufficient
|
||||||
|
BYTE $0x0F; BYTE $0x01; BYTE $0xD0
|
||||||
|
ANDL $6, AX
|
||||||
|
CMPL AX, $6 // Check for OS support of YMM registers
|
||||||
|
JNE noavx
|
||||||
|
MOVB $1, runtime·support_avx(SB)
|
||||||
|
MOVL $7, AX
|
||||||
|
MOVL $0, CX
|
||||||
|
CPUID
|
||||||
|
ANDL $0x20, BX // check for AVX2 bit
|
||||||
|
CMPL BX, $0x20
|
||||||
|
JNE noavx2
|
||||||
|
MOVB $1, runtime·support_avx2(SB)
|
||||||
|
JMP nocpuinfo
|
||||||
|
noavx:
|
||||||
|
MOVB $0, runtime·support_avx(SB)
|
||||||
|
noavx2:
|
||||||
|
MOVB $0, runtime·support_avx2(SB)
|
||||||
nocpuinfo:
|
nocpuinfo:
|
||||||
|
|
||||||
// if there is an _cgo_init, call it.
|
// if there is an _cgo_init, call it.
|
||||||
|
|
@ -1508,7 +1534,10 @@ TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
|
||||||
JB small
|
JB small
|
||||||
|
|
||||||
CMPQ R8, $63
|
CMPQ R8, $63
|
||||||
JA big_loop
|
JBE loop
|
||||||
|
CMPB runtime·support_avx2(SB), $1
|
||||||
|
JEQ big_loop_avx2
|
||||||
|
JMP big_loop
|
||||||
loop:
|
loop:
|
||||||
CMPQ R8, $16
|
CMPQ R8, $16
|
||||||
JBE _0through16
|
JBE _0through16
|
||||||
|
|
@ -1657,6 +1686,45 @@ big_loop:
|
||||||
JBE loop
|
JBE loop
|
||||||
JMP big_loop
|
JMP big_loop
|
||||||
|
|
||||||
|
// Compare 64-bytes per loop iteration.
|
||||||
|
// Loop is unrolled and uses AVX2.
|
||||||
|
big_loop_avx2:
|
||||||
|
MOVHDU (SI), X2
|
||||||
|
MOVHDU (DI), X3
|
||||||
|
MOVHDU 32(SI), X4
|
||||||
|
MOVHDU 32(DI), X5
|
||||||
|
VPCMPEQB X2, X3, X0
|
||||||
|
VPMOVMSKB X0, AX
|
||||||
|
XORL $0xffffffff, AX
|
||||||
|
JNE diff32_avx2
|
||||||
|
VPCMPEQB X4, X5, X6
|
||||||
|
VPMOVMSKB X6, AX
|
||||||
|
XORL $0xffffffff, AX
|
||||||
|
JNE diff64_avx2
|
||||||
|
|
||||||
|
ADDQ $64, SI
|
||||||
|
ADDQ $64, DI
|
||||||
|
SUBQ $64, R8
|
||||||
|
CMPQ R8, $64
|
||||||
|
JB big_loop_avx2_exit
|
||||||
|
JMP big_loop_avx2
|
||||||
|
|
||||||
|
// Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
|
||||||
|
diff32_avx2:
|
||||||
|
VZEROUPPER
|
||||||
|
JMP diff16
|
||||||
|
|
||||||
|
// Same as diff32_avx2, but for last 32 bytes.
|
||||||
|
diff64_avx2:
|
||||||
|
VZEROUPPER
|
||||||
|
JMP diff48
|
||||||
|
|
||||||
|
// For <64 bytes remainder jump to normal loop.
|
||||||
|
big_loop_avx2_exit:
|
||||||
|
VZEROUPPER
|
||||||
|
JMP loop
|
||||||
|
|
||||||
|
|
||||||
TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
|
TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
|
||||||
MOVQ s+0(FP), SI
|
MOVQ s+0(FP), SI
|
||||||
MOVQ s_len+8(FP), BX
|
MOVQ s_len+8(FP), BX
|
||||||
|
|
|
||||||
|
|
@ -627,6 +627,8 @@ var (
|
||||||
cpuid_ecx uint32
|
cpuid_ecx uint32
|
||||||
cpuid_edx uint32
|
cpuid_edx uint32
|
||||||
lfenceBeforeRdtsc bool
|
lfenceBeforeRdtsc bool
|
||||||
|
support_avx bool
|
||||||
|
support_avx2 bool
|
||||||
|
|
||||||
goarm uint8 // set by cmd/link on arm systems
|
goarm uint8 // set by cmd/link on arm systems
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue