runtime: remove dead code and unnecessary checks for amd64

Use amd64 assembly header to remove unnecessary cpu flags checks
and dead code that is guaranteed to not be executed when compiling
for specific microarchitectures.

name                  old time/op  new time/op  delta
BytesCompare/1-12     3.88ns ± 1%  3.18ns ± 1%  -18.15%  (p=0.008 n=5+5)
BytesCompare/2-12     3.89ns ± 1%  3.21ns ± 2%  -17.66%  (p=0.008 n=5+5)
BytesCompare/4-12     3.89ns ± 0%  3.17ns ± 0%  -18.62%  (p=0.008 n=5+5)
BytesCompare/8-12     3.44ns ± 2%  3.39ns ± 1%   -1.36%  (p=0.008 n=5+5)
BytesCompare/16-12    3.40ns ± 1%  3.14ns ± 0%   -7.77%  (p=0.008 n=5+5)
BytesCompare/32-12    3.90ns ± 1%  3.65ns ± 0%   -6.19%  (p=0.008 n=5+5)
BytesCompare/64-12    4.96ns ± 1%  4.71ns ± 2%   -4.98%  (p=0.008 n=5+5)
BytesCompare/128-12   6.42ns ± 0%  5.99ns ± 4%   -6.75%  (p=0.008 n=5+5)
BytesCompare/256-12   9.36ns ± 0%  7.40ns ± 0%  -20.97%  (p=0.008 n=5+5)
BytesCompare/512-12   15.9ns ± 1%  11.4ns ± 1%  -28.36%  (p=0.008 n=5+5)
BytesCompare/1024-12  27.0ns ± 0%  19.3ns ± 0%  -28.36%  (p=0.008 n=5+5)
BytesCompare/2048-12  50.2ns ± 0%  43.3ns ± 0%  -13.71%  (p=0.008 n=5+5)
[Geo mean]            7.13ns       6.07ns       -14.86%

name                old speed      new speed      delta
Count/10-12          723MB/s ± 0%   704MB/s ± 1%  -2.73%  (p=0.008 n=5+5)
Count/32-12         2.21GB/s ± 0%  2.12GB/s ± 2%  -4.21%  (p=0.008 n=5+5)
Count/4K-12         1.03GB/s ± 0%  1.03GB/s ± 1%    ~     (p=1.000 n=5+5)
Count/4M-12         1.04GB/s ± 0%  1.02GB/s ± 2%    ~     (p=0.310 n=5+5)
Count/64M-12        1.02GB/s ± 0%  1.01GB/s ± 1%  -1.00%  (p=0.016 n=5+5)
CountEasy/10-12      779MB/s ± 0%   768MB/s ± 1%  -1.48%  (p=0.008 n=5+5)
CountEasy/32-12     2.15GB/s ± 0%  2.09GB/s ± 1%  -2.71%  (p=0.008 n=5+5)
CountEasy/4K-12     45.1GB/s ± 1%  45.2GB/s ± 1%    ~     (p=0.421 n=5+5)
CountEasy/4M-12     36.4GB/s ± 1%  36.5GB/s ± 1%    ~     (p=0.690 n=5+5)
CountEasy/64M-12    16.1GB/s ± 2%  16.4GB/s ± 1%    ~     (p=0.056 n=5+5)
CountSingle/10-12   2.15GB/s ± 2%  2.22GB/s ± 1%  +3.37%  (p=0.008 n=5+5)
CountSingle/32-12   5.86GB/s ± 1%  5.76GB/s ± 1%  -1.55%  (p=0.008 n=5+5)
CountSingle/4K-12   54.6GB/s ± 1%  55.0GB/s ± 1%    ~     (p=0.548 n=5+5)
CountSingle/4M-12   45.9GB/s ± 4%  46.4GB/s ± 2%    ~     (p=0.548 n=5+5)
CountSingle/64M-12  17.3GB/s ± 1%  17.2GB/s ± 2%    ~     (p=1.000 n=5+5)
[Geo mean]          5.11GB/s       5.08GB/s       -0.53%

name          old speed      new speed      delta
Equal/1-12     200MB/s ± 0%   188MB/s ± 1%   -6.11%  (p=0.008 n=5+5)
Equal/6-12    1.20GB/s ± 0%  1.13GB/s ± 1%   -6.38%  (p=0.008 n=5+5)
Equal/9-12    1.67GB/s ± 3%  1.74GB/s ± 1%   +3.83%  (p=0.008 n=5+5)
Equal/15-12   2.82GB/s ± 1%  2.89GB/s ± 1%   +2.63%  (p=0.008 n=5+5)
Equal/16-12   2.96GB/s ± 1%  3.08GB/s ± 1%   +3.95%  (p=0.008 n=5+5)
Equal/20-12   3.33GB/s ± 1%  3.54GB/s ± 1%   +6.36%  (p=0.008 n=5+5)
Equal/32-12   4.57GB/s ± 0%  5.26GB/s ± 1%  +15.09%  (p=0.008 n=5+5)
Equal/4K-12   62.0GB/s ± 1%  65.9GB/s ± 2%   +6.29%  (p=0.008 n=5+5)
Equal/4M-12   23.6GB/s ± 2%  24.8GB/s ± 4%   +5.43%  (p=0.008 n=5+5)
Equal/64M-12  11.1GB/s ± 2%  11.3GB/s ± 1%   +1.69%  (p=0.008 n=5+5)
[Geo mean]    3.91GB/s       4.03GB/s        +3.11%

name                              old speed      new speed      delta
IndexByte/10-12                   2.64GB/s ± 0%  2.69GB/s ± 0%   +1.67%  (p=0.008 n=5+5)
IndexByte/32-12                   6.79GB/s ± 0%  6.27GB/s ± 0%   -7.57%  (p=0.008 n=5+5)
IndexByte/4K-12                   56.2GB/s ± 0%  56.9GB/s ± 0%   +1.27%  (p=0.008 n=5+5)
IndexByte/4M-12                   40.1GB/s ± 1%  41.7GB/s ± 1%   +4.05%  (p=0.008 n=5+5)
IndexByte/64M-12                  17.5GB/s ± 0%  17.7GB/s ± 1%     ~     (p=0.095 n=5+5)
IndexBytePortable/10-12           2.06GB/s ± 1%  2.16GB/s ± 1%   +5.08%  (p=0.008 n=5+5)
IndexBytePortable/32-12           1.40GB/s ± 1%  1.54GB/s ± 1%  +10.05%  (p=0.008 n=5+5)
IndexBytePortable/4K-12           3.99GB/s ± 0%  4.08GB/s ± 0%   +2.16%  (p=0.008 n=5+5)
IndexBytePortable/4M-12           4.05GB/s ± 1%  4.08GB/s ± 2%     ~     (p=0.095 n=5+5)
IndexBytePortable/64M-12          3.80GB/s ± 1%  3.81GB/s ± 0%     ~     (p=0.421 n=5+5)
IndexRune/10-12                    746MB/s ± 1%   752MB/s ± 0%   +0.85%  (p=0.008 n=5+5)
IndexRune/32-12                   2.33GB/s ± 0%  2.42GB/s ± 0%   +3.66%  (p=0.008 n=5+5)
IndexRune/4K-12                   44.4GB/s ± 0%  44.2GB/s ± 0%     ~     (p=0.095 n=5+5)
IndexRune/4M-12                   36.2GB/s ± 1%  36.3GB/s ± 2%     ~     (p=0.841 n=5+5)
IndexRune/64M-12                  16.2GB/s ± 2%  16.3GB/s ± 2%     ~     (p=0.548 n=5+5)
IndexRuneASCII/10-12              2.57GB/s ± 0%  2.58GB/s ± 0%   +0.63%  (p=0.008 n=5+5)
IndexRuneASCII/32-12              6.00GB/s ± 0%  6.30GB/s ± 1%   +4.98%  (p=0.008 n=5+5)
IndexRuneASCII/4K-12              56.7GB/s ± 0%  56.8GB/s ± 1%     ~     (p=0.151 n=5+5)
IndexRuneASCII/4M-12              41.6GB/s ± 1%  41.7GB/s ± 2%     ~     (p=0.151 n=5+5)
IndexRuneASCII/64M-12             17.7GB/s ± 1%  17.6GB/s ± 1%     ~     (p=0.222 n=5+5)
Index/10-12                       1.06GB/s ± 1%  1.06GB/s ± 0%     ~     (p=0.310 n=5+5)
Index/32-12                       3.57GB/s ± 0%  3.56GB/s ± 1%     ~     (p=0.056 n=5+5)
Index/4K-12                       1.02GB/s ± 2%  1.03GB/s ± 0%     ~     (p=0.690 n=5+5)
Index/4M-12                       1.04GB/s ± 0%  1.03GB/s ± 1%     ~     (p=1.000 n=4+5)
Index/64M-12                      1.02GB/s ± 0%  1.02GB/s ± 0%     ~     (p=0.905 n=5+4)
IndexEasy/10-12                   1.12GB/s ± 2%  1.15GB/s ± 1%   +3.10%  (p=0.008 n=5+5)
IndexEasy/32-12                   3.14GB/s ± 2%  3.13GB/s ± 1%     ~     (p=0.310 n=5+5)
IndexEasy/4K-12                   47.6GB/s ± 1%  47.7GB/s ± 2%     ~     (p=0.310 n=5+5)
IndexEasy/4M-12                   36.4GB/s ± 1%  36.3GB/s ± 2%     ~     (p=0.690 n=5+5)
IndexEasy/64M-12                  16.1GB/s ± 1%  16.4GB/s ± 5%     ~     (p=0.151 n=5+5)
[Geo mean]                        6.39GB/s       6.46GB/s        +1.11%

Change-Id: Ic1ca62f5cc719d87e2c4aeff25ad73507facff82
Reviewed-on: https://go-review.googlesource.com/c/go/+/397576
Reviewed-by: Keith Randall <khr@google.com>
Run-TryBot: Keith Randall <khr@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
This commit is contained in:
vpachkov 2022-04-01 20:37:30 +03:00 committed by Keith Randall
parent c82bbc0e8e
commit 330cffb869
8 changed files with 40 additions and 0 deletions

View File

@ -3,6 +3,7 @@
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "asm_amd64.h"
#include "textflag.h"
TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56
@ -44,9 +45,13 @@ TEXT cmpbody<>(SB),NOSPLIT,$0-0
CMPQ R8, $63
JBE loop
#ifndef hasAVX2
CMPB internalcpu·X86+const_offsetX86HasAVX2(SB), $1
JEQ big_loop_avx2
JMP big_loop
#else
JMP big_loop_avx2
#endif
loop:
CMPQ R8, $16
JBE _0through16
@ -155,6 +160,7 @@ allsame:
RET
// this works for >= 64 bytes of data.
#ifndef hasAVX2
big_loop:
MOVOU (SI), X0
MOVOU (DI), X1
@ -190,6 +196,7 @@ big_loop:
CMPQ R8, $64
JBE loop
JMP big_loop
#endif
// Compare 64-bytes per loop iteration.
// Loop is unrolled and uses AVX2.

View File

@ -3,12 +3,15 @@
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "asm_amd64.h"
#include "textflag.h"
TEXT ·Count(SB),NOSPLIT,$0-40
#ifndef hasPOPCNT
CMPB internalcpu·X86+const_offsetX86HasPOPCNT(SB), $1
JEQ 2(PC)
JMP ·countGeneric(SB)
#endif
MOVQ b_base+0(FP), SI
MOVQ b_len+8(FP), BX
MOVB c+24(FP), AL
@ -16,9 +19,11 @@ TEXT ·Count(SB),NOSPLIT,$0-40
JMP countbody<>(SB)
TEXT ·CountString(SB),NOSPLIT,$0-32
#ifndef hasPOPCNT
CMPB internalcpu·X86+const_offsetX86HasPOPCNT(SB), $1
JEQ 2(PC)
JMP ·countGenericString(SB)
#endif
MOVQ s_base+0(FP), SI
MOVQ s_len+8(FP), BX
MOVB c+16(FP), AL
@ -151,8 +156,10 @@ endofpage:
RET
avx2:
#ifndef hasAVX2
CMPB internalcpu·X86+const_offsetX86HasAVX2(SB), $1
JNE sse
#endif
MOVD AX, X0
LEAQ -32(SI)(BX*1), R11
VPBROADCASTB X0, Y1

View File

@ -3,6 +3,7 @@
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "asm_amd64.h"
#include "textflag.h"
// memequal(a, b unsafe.Pointer, size uintptr) bool
@ -46,6 +47,7 @@ TEXT memeqbody<>(SB),NOSPLIT,$0-0
JB small
CMPQ BX, $64
JB bigloop
#ifndef hasAVX2
CMPB internalcpu·X86+const_offsetX86HasAVX2(SB), $1
JE hugeloop_avx2
@ -76,6 +78,7 @@ hugeloop:
JEQ hugeloop
XORQ AX, AX // return 0
RET
#endif
// 64 bytes at a time using ymm registers
hugeloop_avx2:

View File

@ -233,8 +233,10 @@ success_avx2:
VZEROUPPER
JMP success
sse42:
#ifndef hasSSE42
CMPB internalcpu·X86+const_offsetX86HasSSE42(SB), $1
JNE no_sse42
#endif
CMPQ AX, $12
// PCMPESTRI is slower than normal compare,
// so using it makes sense only if we advance 4+ bytes per compare

View File

@ -115,8 +115,10 @@ endofpage:
RET
avx2:
#ifndef hasAVX2
CMPB internalcpu·X86+const_offsetX86HasAVX2(SB), $1
JNE sse
#endif
MOVD AX, X0
LEAQ -32(SI)(BX*1), R11
VPBROADCASTB X0, Y1

View File

@ -5,10 +5,21 @@
// Define features that are guaranteed to be supported by setting the AMD64 variable.
// If a feature is supported, there's no need to check it at runtime every time.
#ifdef GOAMD64_v2
#define hasPOPCNT
#define hasSSE42
#endif
#ifdef GOAMD64_v3
#define hasAVX
#define hasAVX2
#define hasPOPCNT
#define hasSSE42
#endif
#ifdef GOAMD64_v4
#define hasAVX
#define hasAVX2
#define hasPOPCNT
#define hasSSE42
#endif

View File

@ -126,6 +126,9 @@ func header(arch string) {
fmt.Fprintf(out, "//go:build %s || %sle\n\n", base, base)
}
fmt.Fprintf(out, "#include \"go_asm.h\"\n")
if arch == "amd64" {
fmt.Fprintf(out, "#include \"asm_amd64.h\"\n")
}
fmt.Fprintf(out, "#include \"textflag.h\"\n\n")
fmt.Fprintf(out, "TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0\n")
}
@ -267,8 +270,10 @@ func genAMD64() {
// Clear the upper bits to get to a clean state. See issue #37174.
// It is safe here as Go code don't use the upper bits of Y registers.
p("#ifdef GOOS_darwin")
p("#ifndef hasAVX")
p("CMPB internalcpu·X86+const_offsetX86HasAVX(SB), $0")
p("JE 2(PC)")
p("#endif")
p("VZEROUPPER")
p("#endif")

View File

@ -1,6 +1,7 @@
// Code generated by mkpreempt.go; DO NOT EDIT.
#include "go_asm.h"
#include "asm_amd64.h"
#include "textflag.h"
TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
@ -27,8 +28,10 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
MOVQ R14, 96(SP)
MOVQ R15, 104(SP)
#ifdef GOOS_darwin
#ifndef hasAVX
CMPB internalcpu·X86+const_offsetX86HasAVX(SB), $0
JE 2(PC)
#endif
VZEROUPPER
#endif
MOVUPS X0, 112(SP)