mirror of https://github.com/golang/go.git
runtime: remove dead code and unnecessary checks for amd64
Use amd64 assembly header to remove unnecessary cpu flags checks and dead code that is guaranteed to not be executed when compiling for specific microarchitectures. name old time/op new time/op delta BytesCompare/1-12 3.88ns ± 1% 3.18ns ± 1% -18.15% (p=0.008 n=5+5) BytesCompare/2-12 3.89ns ± 1% 3.21ns ± 2% -17.66% (p=0.008 n=5+5) BytesCompare/4-12 3.89ns ± 0% 3.17ns ± 0% -18.62% (p=0.008 n=5+5) BytesCompare/8-12 3.44ns ± 2% 3.39ns ± 1% -1.36% (p=0.008 n=5+5) BytesCompare/16-12 3.40ns ± 1% 3.14ns ± 0% -7.77% (p=0.008 n=5+5) BytesCompare/32-12 3.90ns ± 1% 3.65ns ± 0% -6.19% (p=0.008 n=5+5) BytesCompare/64-12 4.96ns ± 1% 4.71ns ± 2% -4.98% (p=0.008 n=5+5) BytesCompare/128-12 6.42ns ± 0% 5.99ns ± 4% -6.75% (p=0.008 n=5+5) BytesCompare/256-12 9.36ns ± 0% 7.40ns ± 0% -20.97% (p=0.008 n=5+5) BytesCompare/512-12 15.9ns ± 1% 11.4ns ± 1% -28.36% (p=0.008 n=5+5) BytesCompare/1024-12 27.0ns ± 0% 19.3ns ± 0% -28.36% (p=0.008 n=5+5) BytesCompare/2048-12 50.2ns ± 0% 43.3ns ± 0% -13.71% (p=0.008 n=5+5) [Geo mean] 7.13ns 6.07ns -14.86% name old speed new speed delta Count/10-12 723MB/s ± 0% 704MB/s ± 1% -2.73% (p=0.008 n=5+5) Count/32-12 2.21GB/s ± 0% 2.12GB/s ± 2% -4.21% (p=0.008 n=5+5) Count/4K-12 1.03GB/s ± 0% 1.03GB/s ± 1% ~ (p=1.000 n=5+5) Count/4M-12 1.04GB/s ± 0% 1.02GB/s ± 2% ~ (p=0.310 n=5+5) Count/64M-12 1.02GB/s ± 0% 1.01GB/s ± 1% -1.00% (p=0.016 n=5+5) CountEasy/10-12 779MB/s ± 0% 768MB/s ± 1% -1.48% (p=0.008 n=5+5) CountEasy/32-12 2.15GB/s ± 0% 2.09GB/s ± 1% -2.71% (p=0.008 n=5+5) CountEasy/4K-12 45.1GB/s ± 1% 45.2GB/s ± 1% ~ (p=0.421 n=5+5) CountEasy/4M-12 36.4GB/s ± 1% 36.5GB/s ± 1% ~ (p=0.690 n=5+5) CountEasy/64M-12 16.1GB/s ± 2% 16.4GB/s ± 1% ~ (p=0.056 n=5+5) CountSingle/10-12 2.15GB/s ± 2% 2.22GB/s ± 1% +3.37% (p=0.008 n=5+5) CountSingle/32-12 5.86GB/s ± 1% 5.76GB/s ± 1% -1.55% (p=0.008 n=5+5) CountSingle/4K-12 54.6GB/s ± 1% 55.0GB/s ± 1% ~ (p=0.548 n=5+5) CountSingle/4M-12 45.9GB/s ± 4% 46.4GB/s ± 2% ~ (p=0.548 n=5+5) CountSingle/64M-12 17.3GB/s ± 1% 17.2GB/s ± 
2% ~ (p=1.000 n=5+5) [Geo mean] 5.11GB/s 5.08GB/s -0.53% name old speed new speed delta Equal/1-12 200MB/s ± 0% 188MB/s ± 1% -6.11% (p=0.008 n=5+5) Equal/6-12 1.20GB/s ± 0% 1.13GB/s ± 1% -6.38% (p=0.008 n=5+5) Equal/9-12 1.67GB/s ± 3% 1.74GB/s ± 1% +3.83% (p=0.008 n=5+5) Equal/15-12 2.82GB/s ± 1% 2.89GB/s ± 1% +2.63% (p=0.008 n=5+5) Equal/16-12 2.96GB/s ± 1% 3.08GB/s ± 1% +3.95% (p=0.008 n=5+5) Equal/20-12 3.33GB/s ± 1% 3.54GB/s ± 1% +6.36% (p=0.008 n=5+5) Equal/32-12 4.57GB/s ± 0% 5.26GB/s ± 1% +15.09% (p=0.008 n=5+5) Equal/4K-12 62.0GB/s ± 1% 65.9GB/s ± 2% +6.29% (p=0.008 n=5+5) Equal/4M-12 23.6GB/s ± 2% 24.8GB/s ± 4% +5.43% (p=0.008 n=5+5) Equal/64M-12 11.1GB/s ± 2% 11.3GB/s ± 1% +1.69% (p=0.008 n=5+5) [Geo mean] 3.91GB/s 4.03GB/s +3.11% name old speed new speed delta IndexByte/10-12 2.64GB/s ± 0% 2.69GB/s ± 0% +1.67% (p=0.008 n=5+5) IndexByte/32-12 6.79GB/s ± 0% 6.27GB/s ± 0% -7.57% (p=0.008 n=5+5) IndexByte/4K-12 56.2GB/s ± 0% 56.9GB/s ± 0% +1.27% (p=0.008 n=5+5) IndexByte/4M-12 40.1GB/s ± 1% 41.7GB/s ± 1% +4.05% (p=0.008 n=5+5) IndexByte/64M-12 17.5GB/s ± 0% 17.7GB/s ± 1% ~ (p=0.095 n=5+5) IndexBytePortable/10-12 2.06GB/s ± 1% 2.16GB/s ± 1% +5.08% (p=0.008 n=5+5) IndexBytePortable/32-12 1.40GB/s ± 1% 1.54GB/s ± 1% +10.05% (p=0.008 n=5+5) IndexBytePortable/4K-12 3.99GB/s ± 0% 4.08GB/s ± 0% +2.16% (p=0.008 n=5+5) IndexBytePortable/4M-12 4.05GB/s ± 1% 4.08GB/s ± 2% ~ (p=0.095 n=5+5) IndexBytePortable/64M-12 3.80GB/s ± 1% 3.81GB/s ± 0% ~ (p=0.421 n=5+5) IndexRune/10-12 746MB/s ± 1% 752MB/s ± 0% +0.85% (p=0.008 n=5+5) IndexRune/32-12 2.33GB/s ± 0% 2.42GB/s ± 0% +3.66% (p=0.008 n=5+5) IndexRune/4K-12 44.4GB/s ± 0% 44.2GB/s ± 0% ~ (p=0.095 n=5+5) IndexRune/4M-12 36.2GB/s ± 1% 36.3GB/s ± 2% ~ (p=0.841 n=5+5) IndexRune/64M-12 16.2GB/s ± 2% 16.3GB/s ± 2% ~ (p=0.548 n=5+5) IndexRuneASCII/10-12 2.57GB/s ± 0% 2.58GB/s ± 0% +0.63% (p=0.008 n=5+5) IndexRuneASCII/32-12 6.00GB/s ± 0% 6.30GB/s ± 1% +4.98% (p=0.008 n=5+5) IndexRuneASCII/4K-12 56.7GB/s ± 0% 56.8GB/s ± 1% ~ 
(p=0.151 n=5+5) IndexRuneASCII/4M-12 41.6GB/s ± 1% 41.7GB/s ± 2% ~ (p=0.151 n=5+5) IndexRuneASCII/64M-12 17.7GB/s ± 1% 17.6GB/s ± 1% ~ (p=0.222 n=5+5) Index/10-12 1.06GB/s ± 1% 1.06GB/s ± 0% ~ (p=0.310 n=5+5) Index/32-12 3.57GB/s ± 0% 3.56GB/s ± 1% ~ (p=0.056 n=5+5) Index/4K-12 1.02GB/s ± 2% 1.03GB/s ± 0% ~ (p=0.690 n=5+5) Index/4M-12 1.04GB/s ± 0% 1.03GB/s ± 1% ~ (p=1.000 n=4+5) Index/64M-12 1.02GB/s ± 0% 1.02GB/s ± 0% ~ (p=0.905 n=5+4) IndexEasy/10-12 1.12GB/s ± 2% 1.15GB/s ± 1% +3.10% (p=0.008 n=5+5) IndexEasy/32-12 3.14GB/s ± 2% 3.13GB/s ± 1% ~ (p=0.310 n=5+5) IndexEasy/4K-12 47.6GB/s ± 1% 47.7GB/s ± 2% ~ (p=0.310 n=5+5) IndexEasy/4M-12 36.4GB/s ± 1% 36.3GB/s ± 2% ~ (p=0.690 n=5+5) IndexEasy/64M-12 16.1GB/s ± 1% 16.4GB/s ± 5% ~ (p=0.151 n=5+5) [Geo mean] 6.39GB/s 6.46GB/s +1.11% Change-Id: Ic1ca62f5cc719d87e2c4aeff25ad73507facff82 Reviewed-on: https://go-review.googlesource.com/c/go/+/397576 Reviewed-by: Keith Randall <khr@google.com> Run-TryBot: Keith Randall <khr@google.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Michael Knyszek <mknyszek@google.com>
This commit is contained in:
parent
c82bbc0e8e
commit
330cffb869
|
|
@ -3,6 +3,7 @@
|
|||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "go_asm.h"
|
||||
#include "asm_amd64.h"
|
||||
#include "textflag.h"
|
||||
|
||||
TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56
|
||||
|
|
@ -44,9 +45,13 @@ TEXT cmpbody<>(SB),NOSPLIT,$0-0
|
|||
|
||||
CMPQ R8, $63
|
||||
JBE loop
|
||||
#ifndef hasAVX2
|
||||
CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
|
||||
JEQ big_loop_avx2
|
||||
JMP big_loop
|
||||
#else
|
||||
JMP big_loop_avx2
|
||||
#endif
|
||||
loop:
|
||||
CMPQ R8, $16
|
||||
JBE _0through16
|
||||
|
|
@ -155,6 +160,7 @@ allsame:
|
|||
RET
|
||||
|
||||
// this works for >= 64 bytes of data.
|
||||
#ifndef hasAVX2
|
||||
big_loop:
|
||||
MOVOU (SI), X0
|
||||
MOVOU (DI), X1
|
||||
|
|
@ -190,6 +196,7 @@ big_loop:
|
|||
CMPQ R8, $64
|
||||
JBE loop
|
||||
JMP big_loop
|
||||
#endif
|
||||
|
||||
// Compare 64-bytes per loop iteration.
|
||||
// Loop is unrolled and uses AVX2.
|
||||
|
|
|
|||
|
|
@ -3,12 +3,15 @@
|
|||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "go_asm.h"
|
||||
#include "asm_amd64.h"
|
||||
#include "textflag.h"
|
||||
|
||||
TEXT ·Count(SB),NOSPLIT,$0-40
|
||||
#ifndef hasPOPCNT
|
||||
CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
|
||||
JEQ 2(PC)
|
||||
JMP ·countGeneric(SB)
|
||||
#endif
|
||||
MOVQ b_base+0(FP), SI
|
||||
MOVQ b_len+8(FP), BX
|
||||
MOVB c+24(FP), AL
|
||||
|
|
@ -16,9 +19,11 @@ TEXT ·Count(SB),NOSPLIT,$0-40
|
|||
JMP countbody<>(SB)
|
||||
|
||||
TEXT ·CountString(SB),NOSPLIT,$0-32
|
||||
#ifndef hasPOPCNT
|
||||
CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
|
||||
JEQ 2(PC)
|
||||
JMP ·countGenericString(SB)
|
||||
#endif
|
||||
MOVQ s_base+0(FP), SI
|
||||
MOVQ s_len+8(FP), BX
|
||||
MOVB c+16(FP), AL
|
||||
|
|
@ -151,8 +156,10 @@ endofpage:
|
|||
RET
|
||||
|
||||
avx2:
|
||||
#ifndef hasAVX2
|
||||
CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
|
||||
JNE sse
|
||||
#endif
|
||||
MOVD AX, X0
|
||||
LEAQ -32(SI)(BX*1), R11
|
||||
VPBROADCASTB X0, Y1
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "go_asm.h"
|
||||
#include "asm_amd64.h"
|
||||
#include "textflag.h"
|
||||
|
||||
// memequal(a, b unsafe.Pointer, size uintptr) bool
|
||||
|
|
@ -46,6 +47,7 @@ TEXT memeqbody<>(SB),NOSPLIT,$0-0
|
|||
JB small
|
||||
CMPQ BX, $64
|
||||
JB bigloop
|
||||
#ifndef hasAVX2
|
||||
CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
|
||||
JE hugeloop_avx2
|
||||
|
||||
|
|
@ -76,6 +78,7 @@ hugeloop:
|
|||
JEQ hugeloop
|
||||
XORQ AX, AX // return 0
|
||||
RET
|
||||
#endif
|
||||
|
||||
// 64 bytes at a time using ymm registers
|
||||
hugeloop_avx2:
|
||||
|
|
|
|||
|
|
@ -233,8 +233,10 @@ success_avx2:
|
|||
VZEROUPPER
|
||||
JMP success
|
||||
sse42:
|
||||
#ifndef hasSSE42
|
||||
CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
|
||||
JNE no_sse42
|
||||
#endif
|
||||
CMPQ AX, $12
|
||||
// PCMPESTRI is slower than normal compare,
|
||||
// so using it makes sense only if we advance 4+ bytes per compare
|
||||
|
|
|
|||
|
|
@ -115,8 +115,10 @@ endofpage:
|
|||
RET
|
||||
|
||||
avx2:
|
||||
#ifndef hasAVX2
|
||||
CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
|
||||
JNE sse
|
||||
#endif
|
||||
MOVD AX, X0
|
||||
LEAQ -32(SI)(BX*1), R11
|
||||
VPBROADCASTB X0, Y1
|
||||
|
|
|
|||
|
|
@ -5,10 +5,21 @@
|
|||
// Define features that are guaranteed to be supported by setting the GOAMD64 variable.
|
||||
// If a feature is supported, there's no need to check it at runtime every time.
|
||||
|
||||
#ifdef GOAMD64_v2
|
||||
#define hasPOPCNT
|
||||
#define hasSSE42
|
||||
#endif
|
||||
|
||||
#ifdef GOAMD64_v3
|
||||
#define hasAVX
|
||||
#define hasAVX2
|
||||
#define hasPOPCNT
|
||||
#define hasSSE42
|
||||
#endif
|
||||
|
||||
#ifdef GOAMD64_v4
|
||||
#define hasAVX
|
||||
#define hasAVX2
|
||||
#define hasPOPCNT
|
||||
#define hasSSE42
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -126,6 +126,9 @@ func header(arch string) {
|
|||
fmt.Fprintf(out, "//go:build %s || %sle\n\n", base, base)
|
||||
}
|
||||
fmt.Fprintf(out, "#include \"go_asm.h\"\n")
|
||||
if arch == "amd64" {
|
||||
fmt.Fprintf(out, "#include \"asm_amd64.h\"\n")
|
||||
}
|
||||
fmt.Fprintf(out, "#include \"textflag.h\"\n\n")
|
||||
fmt.Fprintf(out, "TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0\n")
|
||||
}
|
||||
|
|
@ -267,8 +270,10 @@ func genAMD64() {
|
|||
// Clear the upper bits to get to a clean state. See issue #37174.
|
||||
// It is safe here as Go code doesn't use the upper bits of Y registers.
|
||||
p("#ifdef GOOS_darwin")
|
||||
p("#ifndef hasAVX")
|
||||
p("CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0")
|
||||
p("JE 2(PC)")
|
||||
p("#endif")
|
||||
p("VZEROUPPER")
|
||||
p("#endif")
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
// Code generated by mkpreempt.go; DO NOT EDIT.
|
||||
|
||||
#include "go_asm.h"
|
||||
#include "asm_amd64.h"
|
||||
#include "textflag.h"
|
||||
|
||||
TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
|
||||
|
|
@ -27,8 +28,10 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
|
|||
MOVQ R14, 96(SP)
|
||||
MOVQ R15, 104(SP)
|
||||
#ifdef GOOS_darwin
|
||||
#ifndef hasAVX
|
||||
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0
|
||||
JE 2(PC)
|
||||
#endif
|
||||
VZEROUPPER
|
||||
#endif
|
||||
MOVUPS X0, 112(SP)
|
||||
|
|
|
|||
Loading…
Reference in New Issue