diff --git a/src/internal/cpu/cpu.go b/src/internal/cpu/cpu.go
index 9be280c6ba..4ef43e3efc 100644
--- a/src/internal/cpu/cpu.go
+++ b/src/internal/cpu/cpu.go
@@ -37,6 +37,7 @@ var X86 struct {
 	HasBMI1      bool
 	HasBMI2      bool
 	HasERMS      bool
+	HasFSRM      bool
 	HasFMA       bool
 	HasOSXSAVE   bool
 	HasPCLMULQDQ bool
diff --git a/src/internal/cpu/cpu_x86.go b/src/internal/cpu/cpu_x86.go
index 2b629d4da0..ee812076e9 100644
--- a/src/internal/cpu/cpu_x86.go
+++ b/src/internal/cpu/cpu_x86.go
@@ -40,7 +40,8 @@ const (
 	cpuid_SHA      = 1 << 29
 	cpuid_AVX512BW = 1 << 30
 	cpuid_AVX512VL = 1 << 31
-
+	// edx bits for CPUID 0x00000007
+	cpuid_FSRM = 1 << 4
 	// edx bits for CPUID 0x80000001
 	cpuid_RDTSCP = 1 << 27
 )
@@ -52,6 +53,7 @@ func doinit() {
 		{Name: "adx", Feature: &X86.HasADX},
 		{Name: "aes", Feature: &X86.HasAES},
 		{Name: "erms", Feature: &X86.HasERMS},
+		{Name: "fsrm", Feature: &X86.HasFSRM},
 		{Name: "pclmulqdq", Feature: &X86.HasPCLMULQDQ},
 		{Name: "rdtscp", Feature: &X86.HasRDTSCP},
 		{Name: "sha", Feature: &X86.HasSHA},
@@ -137,7 +139,7 @@ func doinit() {
 		return
 	}
 
-	_, ebx7, _, _ := cpuid(7, 0)
+	_, ebx7, _, edx7 := cpuid(7, 0)
 	X86.HasBMI1 = isSet(ebx7, cpuid_BMI1)
 	X86.HasAVX2 = isSet(ebx7, cpuid_AVX2) && osSupportsAVX
 	X86.HasBMI2 = isSet(ebx7, cpuid_BMI2)
@@ -151,6 +153,8 @@ func doinit() {
 		X86.HasAVX512VL = isSet(ebx7, cpuid_AVX512VL)
 	}
 
+	X86.HasFSRM = isSet(edx7, cpuid_FSRM)
+
 	var maxExtendedInformation uint32
 	maxExtendedInformation, _, _, _ = cpuid(0x80000000, 0)
diff --git a/src/runtime/cpuflags_amd64.go b/src/runtime/cpuflags_amd64.go
index 8cca4bca8f..b6d8c6c1e9 100644
--- a/src/runtime/cpuflags_amd64.go
+++ b/src/runtime/cpuflags_amd64.go
@@ -8,17 +8,31 @@ import (
 	"internal/cpu"
 )
 
-var useAVXmemmove bool
+var memmoveBits uint8
+
+const (
+	// avxSupported indicates that the CPU supports AVX instructions.
+	avxSupported = 1 << 0
+
+	// repmovsPreferred indicates that the REP MOVSx instruction is more
+	// efficient on the CPU.
+	repmovsPreferred = 1 << 1
+)
 
 func init() {
-	// Let's remove stepping and reserved fields
-	processor := processorVersionInfo & 0x0FFF3FF0
-
-	isIntelBridgeFamily := isIntel &&
-		processor == 0x206A0 ||
-		processor == 0x206D0 ||
-		processor == 0x306A0 ||
-		processor == 0x306E0
-
-	useAVXmemmove = cpu.X86.HasAVX && !isIntelBridgeFamily
+	// Here we assume that on modern CPUs with both the FSRM and ERMS features,
+	// copying blocks of 2KB or larger with REP MOVSB is more efficient,
+	// which avoids having to track individual CPU generations.
+	// A block-list mechanism may still be needed in case future
+	// microarchitectures turn out not to fit this assumption.
+	// We enable this on Intel CPUs first and may extend it to more platforms
+	// in the future.
+	isERMSNiceCPU := isIntel
+	useREPMOV := isERMSNiceCPU && cpu.X86.HasERMS && cpu.X86.HasFSRM
+	if cpu.X86.HasAVX {
+		memmoveBits |= avxSupported
+	}
+	if useREPMOV {
+		memmoveBits |= repmovsPreferred
+	}
 }
diff --git a/src/runtime/memmove_amd64.s b/src/runtime/memmove_amd64.s
index 018bb0b19d..8883b55ede 100644
--- a/src/runtime/memmove_amd64.s
+++ b/src/runtime/memmove_amd64.s
@@ -72,9 +72,10 @@ tail:
 	CMPQ	BX, $256
 	JBE	move_129through256
 
-	TESTB	$1, runtime·useAVXmemmove(SB)
-	JNZ	avxUnaligned
-
+	MOVB	runtime·memmoveBits(SB), AX
+	// We have AVX but we don't want to use REP MOVSx.
+	CMPB	AX, $const_avxSupported
+	JEQ	avxUnaligned
 /*
  * check and set for backwards
  */
@@ -82,16 +83,23 @@ tail:
 	JLS	back
 
 /*
  * forward copy loop
  */
 forward:
+	CMPQ	BX, $2048
+	JL	check_avx
+	// REP MOVSx is slow if destination address is unaligned.
+	TESTQ	$15, DI
+	JNZ	check_avx
+	TESTB	$const_repmovsPreferred, AX
+	JNZ	fwdBy8
+	// For backward copy, REP MOVSx performs worse than AVX.
+check_avx:
+	TESTB	$const_avxSupported, AX
+	JNZ	avxUnaligned
+
 	CMPQ	BX, $2048
 	JLS	move_256through2048
-
-	// If REP MOVSB isn't fast, don't use it
-	CMPB	internal∕cpu·X86+const_offsetX86HasERMS(SB), $1 // enhanced REP MOVSB/STOSB
-	JNE	fwdBy8
-
 	// Check alignment
 	MOVL	SI, AX
 	ORL	DI, AX
@@ -104,12 +112,16 @@ forward:
 	RET
 
 fwdBy8:
+	// Loading the last (possibly partially overlapping) word and writing
+	// it at the end.
+	MOVQ	-8(SI)(BX*1), AX
+	LEAQ	-8(DI)(BX*1), DX
 	// Do 8 bytes at a time
-	MOVQ	BX, CX
+	LEAQ	-1(BX), CX
 	SHRQ	$3, CX
-	ANDQ	$7, BX
 	REP;	MOVSQ
-	JMP	tail
+	MOVQ	AX, (DX)
+	RET
 
 back:
 	/*
@@ -119,6 +131,9 @@ back:
 	ADDQ	BX, CX
 	CMPQ	CX, DI
 	JLS	forward
+
+	TESTB	$const_avxSupported, AX
+	JNZ	avxUnaligned
 /*
 * whole thing backwards has
 * adjusted addresses
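
Note: the following is a minimal, illustrative Go sketch, not part of the patch, of the forward-path dispatch that the memmove_amd64.s hunks above implement. The file and function names are hypothetical, and the sketch ignores the small-size tail cases and the overlapping/backward path; it only mirrors how the avxSupported and repmovsPreferred bits plus the 2KB and destination-alignment checks select a copy strategy.

// memmove_dispatch_sketch.go (hypothetical, for illustration only)
package main

import "fmt"

const (
	avxSupported     = 1 << 0 // CPU supports AVX
	repmovsPreferred = 1 << 1 // REP MOVSx considered efficient (Intel with ERMS+FSRM)
)

// selectForwardCopy mirrors the decision made at the forward/check_avx labels
// for a non-overlapping copy of size bytes; dstAligned16 stands for the
// TESTQ $15, DI check on the destination address.
func selectForwardCopy(bits uint8, size int, dstAligned16 bool) string {
	switch {
	case size >= 2048 && dstAligned16 && bits&repmovsPreferred != 0:
		return "REP MOVSQ (fwdBy8)"
	case bits&avxSupported != 0:
		return "AVX (avxUnaligned)"
	default:
		return "unrolled MOV loop (move_256through2048)"
	}
}

func main() {
	bits := uint8(avxSupported | repmovsPreferred)
	for _, size := range []int{512, 2048, 1 << 20} {
		fmt.Printf("size=%-8d aligned=true  -> %s\n", size, selectForwardCopy(bits, size, true))
		fmt.Printf("size=%-8d aligned=false -> %s\n", size, selectForwardCopy(bits, size, false))
	}
}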
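
To sanity-check the 2KB REP MOVSB assumption on a particular machine, a small benchmark along the lines of the sketch below can be run before and after the change; the file name and size list are illustrative, not part of the patch. Because the patch also registers the feature as "fsrm" in internal/cpu's option list, running the same patched binary with GODEBUG=cpu.fsrm=off should clear HasFSRM and therefore disable the repmovsPreferred path, giving a rough A/B comparison on identical hardware.

// memmove_bench_test.go (illustrative; run with: go test -bench=Memmove)
package memmovebench

import (
	"fmt"
	"testing"
)

// Sizes straddling the 2KB threshold used by the assembly above. The real
// crossover point is hardware dependent, so treat the numbers as a sanity
// check rather than a definitive measurement.
var sizes = []int{256, 1024, 2048, 4096, 1 << 16, 1 << 20}

func BenchmarkMemmove(b *testing.B) {
	for _, n := range sizes {
		b.Run(fmt.Sprintf("%dB", n), func(b *testing.B) {
			src := make([]byte, n)
			dst := make([]byte, n)
			b.SetBytes(int64(n))
			for i := 0; i < b.N; i++ {
				// A non-constant-length copy of byte slices goes through
				// runtime.memmove, which is the code path being changed.
				copy(dst, src)
			}
		})
	}
}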