diff --git a/src/internal/bytealg/compare_amd64.s b/src/internal/bytealg/compare_amd64.s
index 4ccaca5e87..fdd015f560 100644
--- a/src/internal/bytealg/compare_amd64.s
+++ b/src/internal/bytealg/compare_amd64.s
@@ -3,6 +3,7 @@
 // license that can be found in the LICENSE file.
 
 #include "go_asm.h"
+#include "asm_amd64.h"
 #include "textflag.h"
 
 TEXT ·Compare(SB),NOSPLIT,$0-56
@@ -44,9 +45,13 @@ TEXT cmpbody<>(SB),NOSPLIT,$0-0
 
 	CMPQ	R8, $63
 	JBE	loop
+#ifndef hasAVX2
 	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
 	JEQ	big_loop_avx2
 	JMP	big_loop
+#else
+	JMP	big_loop_avx2
+#endif
 loop:
 	CMPQ	R8, $16
 	JBE	_0through16
@@ -155,6 +160,7 @@ allsame:
 	RET
 
 	// this works for >= 64 bytes of data.
+#ifndef hasAVX2
 big_loop:
 	MOVOU	(SI), X0
 	MOVOU	(DI), X1
@@ -190,6 +196,7 @@ big_loop:
 	CMPQ	R8, $64
 	JBE	loop
 	JMP	big_loop
+#endif
 
 // Compare 64-bytes per loop iteration.
 // Loop is unrolled and uses AVX2.
diff --git a/src/internal/bytealg/count_amd64.s b/src/internal/bytealg/count_amd64.s
index fa864c4c76..efb17f84b7 100644
--- a/src/internal/bytealg/count_amd64.s
+++ b/src/internal/bytealg/count_amd64.s
@@ -3,12 +3,15 @@
 // license that can be found in the LICENSE file.
 
 #include "go_asm.h"
+#include "asm_amd64.h"
 #include "textflag.h"
 
 TEXT ·Count(SB),NOSPLIT,$0-40
+#ifndef hasPOPCNT
 	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
 	JEQ	2(PC)
 	JMP	·countGeneric(SB)
+#endif
 	MOVQ	b_base+0(FP), SI
 	MOVQ	b_len+8(FP), BX
 	MOVB	c+24(FP), AL
@@ -16,9 +19,11 @@ TEXT ·Count(SB),NOSPLIT,$0-40
 	JMP	countbody<>(SB)
 
 TEXT ·CountString(SB),NOSPLIT,$0-32
+#ifndef hasPOPCNT
 	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
 	JEQ	2(PC)
 	JMP	·countGenericString(SB)
+#endif
 	MOVQ	s_base+0(FP), SI
 	MOVQ	s_len+8(FP), BX
 	MOVB	c+16(FP), AL
@@ -151,8 +156,10 @@ endofpage:
 	RET
 
 avx2:
+#ifndef hasAVX2
 	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
 	JNE	sse
+#endif
 	MOVD	AX, X0
 	LEAQ	-32(SI)(BX*1), R11
 	VPBROADCASTB	X0, Y1
diff --git a/src/internal/bytealg/equal_amd64.s b/src/internal/bytealg/equal_amd64.s
index dd46e2e0fd..d178a33779 100644
--- a/src/internal/bytealg/equal_amd64.s
+++ b/src/internal/bytealg/equal_amd64.s
@@ -3,6 +3,7 @@
 // license that can be found in the LICENSE file.
 
 #include "go_asm.h"
+#include "asm_amd64.h"
 #include "textflag.h"
 
 // memequal(a, b unsafe.Pointer, size uintptr) bool
@@ -46,6 +47,7 @@ TEXT memeqbody<>(SB),NOSPLIT,$0-0
 	JB	small
 	CMPQ	BX, $64
 	JB	bigloop
+#ifndef hasAVX2
 	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
 	JE	hugeloop_avx2
 
@@ -76,6 +78,7 @@ hugeloop:
 	JEQ	hugeloop
 	XORQ	AX, AX	// return 0
 	RET
+#endif
 
 	// 64 bytes at a time using ymm registers
 hugeloop_avx2:
diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s
index 6193b57239..04314917b8 100644
--- a/src/internal/bytealg/index_amd64.s
+++ b/src/internal/bytealg/index_amd64.s
@@ -233,8 +233,10 @@ success_avx2:
 	VZEROUPPER
 	JMP	success
 sse42:
+#ifndef hasSSE42
 	CMPB	internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
 	JNE	no_sse42
+#endif
 	CMPQ	AX, $12
 	// PCMPESTRI is slower than normal compare,
 	// so using it makes sense only if we advance 4+ bytes per compare
diff --git a/src/internal/bytealg/indexbyte_amd64.s b/src/internal/bytealg/indexbyte_amd64.s
index f78093c539..1ca70e39e2 100644
--- a/src/internal/bytealg/indexbyte_amd64.s
+++ b/src/internal/bytealg/indexbyte_amd64.s
@@ -115,8 +115,10 @@ endofpage:
 	RET
 
 avx2:
+#ifndef hasAVX2
 	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
 	JNE	sse
+#endif
 	MOVD	AX, X0
 	LEAQ	-32(SI)(BX*1), R11
 	VPBROADCASTB	X0, Y1
diff --git a/src/runtime/asm_amd64.h b/src/runtime/asm_amd64.h
index 49e0ee2323..f7a8896db6 100644
--- a/src/runtime/asm_amd64.h
+++ b/src/runtime/asm_amd64.h
@@ -5,10 +5,21 @@
 // Define features that are guaranteed to be supported by setting the AMD64 variable.
 // If a feature is supported, there's no need to check it at runtime every time.
 
+#ifdef GOAMD64_v2
+#define hasPOPCNT
+#define hasSSE42
+#endif
+
 #ifdef GOAMD64_v3
+#define hasAVX
 #define hasAVX2
+#define hasPOPCNT
+#define hasSSE42
 #endif
 
 #ifdef GOAMD64_v4
+#define hasAVX
 #define hasAVX2
+#define hasPOPCNT
+#define hasSSE42
 #endif
diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go
index 28befcbd0d..61d2d0247e 100644
--- a/src/runtime/mkpreempt.go
+++ b/src/runtime/mkpreempt.go
@@ -126,6 +126,9 @@ func header(arch string) {
 		fmt.Fprintf(out, "//go:build %s || %sle\n\n", base, base)
 	}
 	fmt.Fprintf(out, "#include \"go_asm.h\"\n")
+	if arch == "amd64" {
+		fmt.Fprintf(out, "#include \"asm_amd64.h\"\n")
+	}
 	fmt.Fprintf(out, "#include \"textflag.h\"\n\n")
 	fmt.Fprintf(out, "TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0\n")
 }
@@ -267,8 +270,10 @@ func genAMD64() {
 	// Clear the upper bits to get to a clean state. See issue #37174.
 	// It is safe here as Go code don't use the upper bits of Y registers.
 	p("#ifdef GOOS_darwin")
+	p("#ifndef hasAVX")
 	p("CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0")
 	p("JE 2(PC)")
+	p("#endif")
 	p("VZEROUPPER")
 	p("#endif")
 
diff --git a/src/runtime/preempt_amd64.s b/src/runtime/preempt_amd64.s
index 31f7c8b66f..94a84fb74c 100644
--- a/src/runtime/preempt_amd64.s
+++ b/src/runtime/preempt_amd64.s
@@ -1,6 +1,7 @@
 // Code generated by mkpreempt.go; DO NOT EDIT.
 
 #include "go_asm.h"
+#include "asm_amd64.h"
 #include "textflag.h"
 
 TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
@@ -27,8 +28,10 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
 	MOVQ R14, 96(SP)
 	MOVQ R15, 104(SP)
 	#ifdef GOOS_darwin
+	#ifndef hasAVX
 	CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0
 	JE 2(PC)
+	#endif
 	VZEROUPPER
 	#endif
 	MOVUPS X0, 112(SP)