diff --git a/src/runtime/cpuflags.go b/src/runtime/cpuflags.go
index e81e50f5df..06424642c7 100644
--- a/src/runtime/cpuflags.go
+++ b/src/runtime/cpuflags.go
@@ -20,7 +20,8 @@ const (
 	offsetMIPS64XHasMSA = unsafe.Offsetof(cpu.MIPS64X.HasMSA)
 
-	offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX)
+	offsetLOONG64HasLSX  = unsafe.Offsetof(cpu.Loong64.HasLSX)
+	offsetLOONG64HasLASX = unsafe.Offsetof(cpu.Loong64.HasLASX)
 )
 
 var (
diff --git a/src/runtime/memclr_loong64.s b/src/runtime/memclr_loong64.s
index 346b210c8d..76d8fb56bf 100644
--- a/src/runtime/memclr_loong64.s
+++ b/src/runtime/memclr_loong64.s
@@ -14,17 +14,29 @@
 // Algorithm:
 //
-// 1. when count <= 64 bytes, memory alignment check is omitted.
-// The handling is divided into distinct cases based on the size
-// of count: clr_0, clr_1, clr_2, clr_3, clr_4, clr_5through7,
-// clr_8, clr_9through16, clr_17through32, and clr_33through64.
+// 1. if lasx is enabled:
+//        THRESHOLD = 256, ALIGNMENTS = 32, LOOPBLOCKS = 256,
+//    else if lsx is enabled:
+//        THRESHOLD = 128, ALIGNMENTS = 16, LOOPBLOCKS = 128,
+//    else
+//        THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64,
 //
-// 2. when count > 64 bytes, memory alignment check is performed.
-// Unaligned bytes are processed first (that is, 8-(ptr&7)), and
-// then a 64-byte loop is executed to zero out memory.
-// When the number of remaining bytes not cleared is n < 64 bytes,
-// a tail processing is performed, invoking the corresponding case
-// based on the size of n.
+// 2. when 'count <= THRESHOLD' bytes, memory alignment check is omitted.
+// The handling is divided into distinct cases based on the size of count:
+//    a. clr_0, clr_1, clr_2, clr_3, clr_4, clr_5through7, clr_8,
+//       clr_9through16, clr_17through32, clr_33through64,
+//    b. lsx_clr_17through32, lsx_clr_33through64, lsx_clr_65through128,
+//    c. lasx_clr_17through32, lasx_clr_33through64, lasx_clr_65through128,
+//       lasx_clr_129through256
+//
+// 3. when 'count > THRESHOLD' bytes, memory alignment check is performed. Unaligned
+// bytes are processed first (that is, ALIGNMENTS - (ptr & (ALIGNMENTS-1))), and then
+// a LOOPBLOCKS-byte loop is executed to zero out memory.
+// When the number of remaining bytes not cleared is n < LOOPBLOCKS bytes, a tail
+// processing is performed, invoking the corresponding case based on the size of n.
+//
+// example:
+//    THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64
 //
 //    ptr           newptr                           ptrend
 //     |               |<----count after correction---->|
@@ -40,7 +52,6 @@ TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-16
 	BEQ	R5, clr_0
 	ADDV	R4, R5, R6
-
 tail:
 	// <=64 bytes, clear directly, not check aligned
 	SGTU	$2, R5, R7
@@ -57,25 +68,152 @@ tail:
 	BNE	R7, clr_8
 	SGTU	$17, R5, R7
 	BNE	R7, clr_9through16
+
+	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R7
+	BNE	R7, lasx_tail
+	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7
+	BNE	R7, lsx_tail
+
 	SGTU	$33, R5, R7
 	BNE	R7, clr_17through32
 	SGTU	$65, R5, R7
 	BNE	R7, clr_33through64
+	JMP	clr_large
 
-	// n > 64 bytes, check aligned
+lasx_tail:
+	// X0 = 0
+	XVXORV	X0, X0, X0
+
+	SGTU	$33, R5, R7
+	BNE	R7, lasx_clr_17through32
+	SGTU	$65, R5, R7
+	BNE	R7, lasx_clr_33through64
+	SGTU	$129, R5, R7
+	BNE	R7, lasx_clr_65through128
+	SGTU	$257, R5, R7
+	BNE	R7, lasx_clr_129through256
+	JMP	lasx_clr_large
+
+lsx_tail:
+	// V0 = 0
+	VXORV	V0, V0, V0
+
+	SGTU	$33, R5, R7
+	BNE	R7, lsx_clr_17through32
+	SGTU	$65, R5, R7
+	BNE	R7, lsx_clr_33through64
+	SGTU	$129, R5, R7
+	BNE	R7, lsx_clr_65through128
+	JMP	lsx_clr_large
+
+	// use simd 256 instructions to implement memclr
+	// n > 256 bytes, check 32-byte alignment
+lasx_clr_large:
+	AND	$31, R4, R7
+	BEQ	R7, lasx_clr_256loop
+	XVMOVQ	X0, (R4)
+	SUBV	R7, R4
+	ADDV	R7, R5
+	SUBV	$32, R5	// newn = n - (32 - (ptr & 31))
+	ADDV	$32, R4	// newptr = ptr + (32 - (ptr & 31))
+	SGTU	$257, R5, R7
+	BNE	R7, lasx_clr_129through256
+lasx_clr_256loop:
+	SUBV	$256, R5
+	SGTU	$256, R5, R7
+	XVMOVQ	X0, 0(R4)
+	XVMOVQ	X0, 32(R4)
+	XVMOVQ	X0, 64(R4)
+	XVMOVQ	X0, 96(R4)
+	XVMOVQ	X0, 128(R4)
+	XVMOVQ	X0, 160(R4)
+	XVMOVQ	X0, 192(R4)
+	XVMOVQ	X0, 224(R4)
+	ADDV	$256, R4
+	BEQ	R7, lasx_clr_256loop
+
+	// remaining_length is 0
+	BEQ	R5, clr_0
+
+	// 128 < remaining_length < 256
+	SGTU	$129, R5, R7
+	BEQ	R7, lasx_clr_129through256
+
+	// 64 < remaining_length <= 128
+	SGTU	$65, R5, R7
+	BEQ	R7, lasx_clr_65through128
+
+	// 32 < remaining_length <= 64
+	SGTU	$33, R5, R7
+	BEQ	R7, lasx_clr_33through64
+
+	// 16 < remaining_length <= 32
+	SGTU	$17, R5, R7
+	BEQ	R7, lasx_clr_17through32
+
+	// 0 < remaining_length <= 16
+	JMP	tail
+
+	// use simd 128 instructions to implement memclr
+	// n > 128 bytes, check 16-byte alignment
+lsx_clr_large:
+	// check 16-byte alignment
+	AND	$15, R4, R7
+	BEQ	R7, lsx_clr_128loop
+	VMOVQ	V0, (R4)
+	SUBV	R7, R4
+	ADDV	R7, R5
+	SUBV	$16, R5	// newn = n - (16 - (ptr & 15))
+	ADDV	$16, R4	// newptr = ptr + (16 - (ptr & 15))
+	SGTU	$129, R5, R7
+	BNE	R7, lsx_clr_65through128
+lsx_clr_128loop:
+	SUBV	$128, R5
+	SGTU	$128, R5, R7
+	VMOVQ	V0, 0(R4)
+	VMOVQ	V0, 16(R4)
+	VMOVQ	V0, 32(R4)
+	VMOVQ	V0, 48(R4)
+	VMOVQ	V0, 64(R4)
+	VMOVQ	V0, 80(R4)
+	VMOVQ	V0, 96(R4)
+	VMOVQ	V0, 112(R4)
+	ADDV	$128, R4
+	BEQ	R7, lsx_clr_128loop
+
+	// remaining_length is 0
+	BEQ	R5, clr_0
+
+	// 64 < remaining_length <= 128
+	SGTU	$65, R5, R7
+	BEQ	R7, lsx_clr_65through128
+
+	// 32 < remaining_length <= 64
+	SGTU	$33, R5, R7
+	BEQ	R7, lsx_clr_33through64
+
+	// 16 < remaining_length <= 32
+	SGTU	$17, R5, R7
+	BEQ	R7, lsx_clr_17through32
+
+	// 0 < remaining_length <= 16
+	JMP	tail
+
+	// use general instructions to implement memclr
+	// n > 64 bytes, check 8-byte alignment
+clr_large:
 	AND	$7, R4, R7
-	BEQ	R7, body
-
-head:
+	BEQ	R7, clr_64loop
 	MOVV	R0, (R4)
 	SUBV	R7, R4
 	ADDV	R7, R5
 	ADDV	$8, R4	// newptr = ptr + (8 - (ptr & 7))
 	SUBV	$8, R5	// newn = n - (8 - (ptr & 7))
-	SGTU	$65, R5, R7
-	BNE	R7, clr_33through64
-
-body:
+	MOVV	$64, R7
+	BLT	R5, R7, clr_33through64
+clr_64loop:
+	SUBV	$64, R5
+	SGTU	$64, R5, R7
 	MOVV	R0, (R4)
 	MOVV	R0, 8(R4)
 	MOVV	R0, 16(R4)
@@ -84,11 +222,21 @@ body:
 	MOVV	R0, 40(R4)
 	MOVV	R0, 48(R4)
 	MOVV	R0, 56(R4)
-	ADDV	$-64, R5
 	ADDV	$64, R4
-	SGTU	$65, R5, R7
-	BEQ	R7, body
+	BEQ	R7, clr_64loop
+
+	// remaining_length is 0
 	BEQ	R5, clr_0
+
+	// 32 < remaining_length < 64
+	SGTU	$33, R5, R7
+	BEQ	R7, clr_33through64
+
+	// 16 < remaining_length <= 32
+	SGTU	$17, R5, R7
+	BEQ	R7, clr_17through32
+
+	// 0 < remaining_length <= 16
 	JMP	tail
 
 clr_0:
@@ -133,3 +281,49 @@ clr_33through64:
 	MOVV	R0, -16(R6)
 	MOVV	R0, -8(R6)
 	RET
+
+lasx_clr_17through32:
+	VMOVQ	V0, 0(R4)
+	VMOVQ	V0, -16(R6)
+	RET
+lasx_clr_33through64:
+	XVMOVQ	X0, 0(R4)
+	XVMOVQ	X0, -32(R6)
+	RET
+lasx_clr_65through128:
+	XVMOVQ	X0, 0(R4)
+	XVMOVQ	X0, 32(R4)
+	XVMOVQ	X0, -64(R6)
+	XVMOVQ	X0, -32(R6)
+	RET
+lasx_clr_129through256:
+	XVMOVQ	X0, 0(R4)
+	XVMOVQ	X0, 32(R4)
+	XVMOVQ	X0, 64(R4)
+	XVMOVQ	X0, 96(R4)
+	XVMOVQ	X0, -128(R6)
+	XVMOVQ	X0, -96(R6)
+	XVMOVQ	X0, -64(R6)
+	XVMOVQ	X0, -32(R6)
+	RET
+
+lsx_clr_17through32:
+	VMOVQ	V0, 0(R4)
+	VMOVQ	V0, -16(R6)
+	RET
+lsx_clr_33through64:
+	VMOVQ	V0, 0(R4)
+	VMOVQ	V0, 16(R4)
+	VMOVQ	V0, -32(R6)
+	VMOVQ	V0, -16(R6)
+	RET
+lsx_clr_65through128:
+	VMOVQ	V0, 0(R4)
+	VMOVQ	V0, 16(R4)
+	VMOVQ	V0, 32(R4)
+	VMOVQ	V0, 48(R4)
+	VMOVQ	V0, -64(R6)
+	VMOVQ	V0, -48(R6)
+	VMOVQ	V0, -32(R6)
+	VMOVQ	V0, -16(R6)
+	RET
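
For readers following the new algorithm comment, below is a minimal Go sketch (not part of the patch) of the THRESHOLD/ALIGNMENTS/LOOPBLOCKS selection and the head-alignment correction it describes. clearSketch, clearBytes, and the hasLASX/hasLSX parameters are hypothetical stand-ins for the assembly labels and the internal/cpu feature flags; the actual implementation is the hand-written assembly above.

package main

import (
	"fmt"
	"unsafe"
)

// clearSketch mirrors the control flow described in the algorithm comment:
// pick THRESHOLD/ALIGNMENTS/LOOPBLOCKS from the widest available SIMD
// extension, clear small counts directly, and for large counts clear one
// ALIGNMENTS-wide block at the unaligned pointer, round the pointer up to
// the next boundary, run the LOOPBLOCKS loop, and clear the tail.
func clearSketch(b []byte, hasLASX, hasLSX bool) {
	var threshold, alignments, loopblocks uintptr
	switch {
	case hasLASX:
		threshold, alignments, loopblocks = 256, 32, 256
	case hasLSX:
		threshold, alignments, loopblocks = 128, 16, 128
	default:
		threshold, alignments, loopblocks = 64, 8, 64
	}

	n := uintptr(len(b))
	if n == 0 {
		return
	}
	if n <= threshold {
		clearBytes(b) // stands in for the clr_* / lsx_clr_* / lasx_clr_* size cases
		return
	}

	// Head: if ptr is unaligned, store one full ALIGNMENTS-wide block at ptr,
	// then advance by ALIGNMENTS - (ptr & (ALIGNMENTS-1)) so the loop below
	// runs on an aligned pointer. The overlap is harmless: the bytes are zero.
	off := uintptr(0)
	ptr := uintptr(unsafe.Pointer(&b[0]))
	if rem := ptr & (alignments - 1); rem != 0 {
		clearBytes(b[:alignments])
		off = alignments - rem
		n -= alignments - rem
	}

	// Body: the LOOPBLOCKS-byte loop (clr_64loop / lsx_clr_128loop / lasx_clr_256loop).
	for n >= loopblocks {
		clearBytes(b[off : off+loopblocks])
		off += loopblocks
		n -= loopblocks
	}

	// Tail: whatever remains falls back into the size cases (the "JMP tail" paths).
	clearBytes(b[off:])
}

// clearBytes stands in for the vector/integer store sequences.
func clearBytes(b []byte) {
	for i := range b {
		b[i] = 0
	}
}

func main() {
	buf := make([]byte, 512)
	for i := range buf {
		buf[i] = 0xff
	}
	clearSketch(buf[3:483], true, true)             // clear 480 bytes starting at an odd offset
	fmt.Println(buf[2], buf[3], buf[482], buf[483]) // 255 0 0 255
}

The head step matches what the assembly does: rather than clearing the unaligned bytes one by one, it writes one full vector at the unaligned pointer and rounds the pointer up, accepting a small overlap with the first loop iteration since everything is being zeroed anyway.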