diff --git a/src/runtime/cpuflags.go b/src/runtime/cpuflags.go
index e81e50f5df..06424642c7 100644
--- a/src/runtime/cpuflags.go
+++ b/src/runtime/cpuflags.go
@@ -20,7 +20,8 @@ const (
 	offsetMIPS64XHasMSA = unsafe.Offsetof(cpu.MIPS64X.HasMSA)
 
-	offsetLOONG64HasLSX = unsafe.Offsetof(cpu.Loong64.HasLSX)
+	offsetLOONG64HasLSX  = unsafe.Offsetof(cpu.Loong64.HasLSX)
+	offsetLOONG64HasLASX = unsafe.Offsetof(cpu.Loong64.HasLASX)
 )
 
 var (
diff --git a/src/runtime/memclr_loong64.s b/src/runtime/memclr_loong64.s
index 346b210c8d..76d8fb56bf 100644
--- a/src/runtime/memclr_loong64.s
+++ b/src/runtime/memclr_loong64.s
@@ -14,17 +14,29 @@
 // Algorithm:
 //
-// 1. when count <= 64 bytes, memory alignment check is omitted.
-// The handling is divided into distinct cases based on the size
-// of count: clr_0, clr_1, clr_2, clr_3, clr_4, clr_5through7,
-// clr_8, clr_9through16, clr_17through32, and clr_33through64.
+// 1. if lasx is enabled:
+//        THRESHOLD = 256, ALIGNMENTS = 32, LOOPBLOCKS = 256,
+//    else if lsx is enabled:
+//        THRESHOLD = 128, ALIGNMENTS = 16, LOOPBLOCKS = 128,
+//    else
+//        THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64,
 //
-// 2. when count > 64 bytes, memory alignment check is performed.
-// Unaligned bytes are processed first (that is, 8-(ptr&7)), and
-// then a 64-byte loop is executed to zero out memory.
-// When the number of remaining bytes not cleared is n < 64 bytes,
-// a tail processing is performed, invoking the corresponding case
-// based on the size of n.
+// 2. when 'count <= THRESHOLD' bytes, memory alignment check is omitted.
+// The handling is divided into distinct cases based on the size of count:
+//    a. clr_0, clr_1, clr_2, clr_3, clr_4, clr_5through7, clr_8,
+//       clr_9through16, clr_17through32, clr_33through64,
+//    b. lsx_clr_17through32, lsx_clr_33through64, lsx_clr_65through128,
+//    c. lasx_clr_17through32, lasx_clr_33through64, lasx_clr_65through128,
+//       lasx_clr_129through256
+//
+// 3. when 'count > THRESHOLD' bytes, memory alignment check is performed. Unaligned
+// bytes are processed first (that is, ALIGNMENTS - (ptr & (ALIGNMENTS-1))), and then
+// a LOOPBLOCKS-byte loop is executed to zero out memory.
+// When the number of remaining bytes not cleared is n < LOOPBLOCKS bytes, a tail
+// processing is performed, invoking the corresponding case based on the size of n.
+//
+// example:
+//    THRESHOLD = 64, ALIGNMENTS = 8, LOOPBLOCKS = 64
 //
 //    ptr           newptr                           ptrend
 //     |               |<----count after correction---->|
@@ -40,7 +52,6 @@ TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-16
 	BEQ	R5, clr_0
 	ADDV	R4, R5, R6
-
 tail:
 	// <=64 bytes, clear directly, not check aligned
 	SGTU	$2, R5, R7
@@ -57,25 +68,152 @@ tail:
 	BNE	R7, clr_8
 	SGTU	$17, R5, R7
 	BNE	R7, clr_9through16
+
+	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R7
+	BNE	R7, lasx_tail
+	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7
+	BNE	R7, lsx_tail
+
 	SGTU	$33, R5, R7
 	BNE	R7, clr_17through32
 	SGTU	$65, R5, R7
 	BNE	R7, clr_33through64
+	JMP	clr_large
 
-	// n > 64 bytes, check aligned
+lasx_tail:
+	// X0 = 0
+	XVXORV	X0, X0, X0
+
+	SGTU	$33, R5, R7
+	BNE	R7, lasx_clr_17through32
+	SGTU	$65, R5, R7
+	BNE	R7, lasx_clr_33through64
+	SGTU	$129, R5, R7
+	BNE	R7, lasx_clr_65through128
+	SGTU	$257, R5, R7
+	BNE	R7, lasx_clr_129through256
+	JMP	lasx_clr_large
+
+lsx_tail:
+	// V0 = 0
+	VXORV	V0, V0, V0
+
+	SGTU	$33, R5, R7
+	BNE	R7, lsx_clr_17through32
+	SGTU	$65, R5, R7
+	BNE	R7, lsx_clr_33through64
+	SGTU	$129, R5, R7
+	BNE	R7, lsx_clr_65through128
+	JMP	lsx_clr_large
+
+	// use simd 256 instructions to implement memclr
+	// n > 256 bytes, check 32-byte alignment
+lasx_clr_large:
+	AND	$31, R4, R7
+	BEQ	R7, lasx_clr_256loop
+	XVMOVQ	X0, (R4)
+	SUBV	R7, R4
+	ADDV	R7, R5
+	SUBV	$32, R5	// newn = n - (32 - (ptr & 31))
+	ADDV	$32, R4	// newptr = ptr + (32 - (ptr & 31))
+	SGTU	$257, R5, R7
+	BNE	R7, lasx_clr_129through256
+lasx_clr_256loop:
+	SUBV	$256, R5
+	SGTU	$256, R5, R7
+	XVMOVQ	X0, 0(R4)
+	XVMOVQ	X0, 32(R4)
+	XVMOVQ	X0, 64(R4)
+	XVMOVQ	X0, 96(R4)
+	XVMOVQ	X0, 128(R4)
+	XVMOVQ	X0, 160(R4)
+	XVMOVQ	X0, 192(R4)
+	XVMOVQ	X0, 224(R4)
+	ADDV	$256, R4
+	BEQ	R7, lasx_clr_256loop
+
+	// remaining_length is 0
+	BEQ	R5, clr_0
+
+	// 128 < remaining_length < 256
+	SGTU	$129, R5, R7
+	BEQ	R7, lasx_clr_129through256
+
+	// 64 < remaining_length <= 128
+	SGTU	$65, R5, R7
+	BEQ	R7, lasx_clr_65through128
+
+	// 32 < remaining_length <= 64
+	SGTU	$33, R5, R7
+	BEQ	R7, lasx_clr_33through64
+
+	// 16 < remaining_length <= 32
+	SGTU	$17, R5, R7
+	BEQ	R7, lasx_clr_17through32
+
+	// 0 < remaining_length <= 16
+	JMP	tail
+
+	// use simd 128 instructions to implement memclr
+	// n > 128 bytes, check 16-byte alignment
+lsx_clr_large:
+	// check 16-byte alignment
+	AND	$15, R4, R7
+	BEQ	R7, lsx_clr_128loop
+	VMOVQ	V0, (R4)
+	SUBV	R7, R4
+	ADDV	R7, R5
+	SUBV	$16, R5	// newn = n - (16 - (ptr & 15))
+	ADDV	$16, R4	// newptr = ptr + (16 - (ptr & 15))
+	SGTU	$129, R5, R7
+	BNE	R7, lsx_clr_65through128
+lsx_clr_128loop:
+	SUBV	$128, R5
+	SGTU	$128, R5, R7
+	VMOVQ	V0, 0(R4)
+	VMOVQ	V0, 16(R4)
+	VMOVQ	V0, 32(R4)
+	VMOVQ	V0, 48(R4)
+	VMOVQ	V0, 64(R4)
+	VMOVQ	V0, 80(R4)
+	VMOVQ	V0, 96(R4)
+	VMOVQ	V0, 112(R4)
+	ADDV	$128, R4
+	BEQ	R7, lsx_clr_128loop
+
+	// remaining_length is 0
+	BEQ	R5, clr_0
+
+	// 64 < remaining_length <= 128
+	SGTU	$65, R5, R7
+	BEQ	R7, lsx_clr_65through128
+
+	// 32 < remaining_length <= 64
+	SGTU	$33, R5, R7
+	BEQ	R7, lsx_clr_33through64
+
+	// 16 < remaining_length <= 32
+	SGTU	$17, R5, R7
+	BEQ	R7, lsx_clr_17through32
+
+	// 0 < remaining_length <= 16
+	JMP	tail
+
+	// use general instructions to implement memclr
+	// n > 64 bytes, check 8-byte alignment
+clr_large:
 	AND	$7, R4, R7
-	BEQ	R7, body
-
-head:
+	BEQ	R7, clr_64loop
 	MOVV	R0, (R4)
 	SUBV	R7, R4
 	ADDV	R7, R5
 	ADDV	$8, R4	// newptr = ptr + (8 - (ptr & 7))
 	SUBV	$8, R5	// newn = n - (8 - (ptr & 7))
-	SGTU	$65, R5, R7
-	BNE	R7, clr_33through64
-
-body:
+	MOVV	$64, R7
+	BLT	R5, R7, clr_33through64
+clr_64loop:
+	SUBV	$64, R5
+	SGTU	$64, R5, R7
 	MOVV	R0, (R4)
 	MOVV	R0, 8(R4)
 	MOVV	R0, 16(R4)
@@ -84,11 +222,21 @@ body:
 	MOVV	R0, 40(R4)
 	MOVV	R0, 48(R4)
 	MOVV	R0, 56(R4)
-	ADDV	$-64, R5
 	ADDV	$64, R4
-	SGTU	$65, R5, R7
-	BEQ	R7, body
+	BEQ	R7, clr_64loop
+
+	// remaining_length is 0
 	BEQ	R5, clr_0
+
+	// 32 < remaining_length < 64
+	SGTU	$33, R5, R7
+	BEQ	R7, clr_33through64
+
+	// 16 < remaining_length <= 32
+	SGTU	$17, R5, R7
+	BEQ	R7, clr_17through32
+
+	// 0 < remaining_length <= 16
 	JMP	tail
 
 clr_0:
@@ -133,3 +281,49 @@ clr_33through64:
 	MOVV	R0, -16(R6)
 	MOVV	R0, -8(R6)
 	RET
+
+lasx_clr_17through32:
+	VMOVQ	V0, 0(R4)
+	VMOVQ	V0, -16(R6)
+	RET
+lasx_clr_33through64:
+	XVMOVQ	X0, 0(R4)
+	XVMOVQ	X0, -32(R6)
+	RET
+lasx_clr_65through128:
+	XVMOVQ	X0, 0(R4)
+	XVMOVQ	X0, 32(R4)
+	XVMOVQ	X0, -64(R6)
+	XVMOVQ	X0, -32(R6)
+	RET
+lasx_clr_129through256:
+	XVMOVQ	X0, 0(R4)
+	XVMOVQ	X0, 32(R4)
+	XVMOVQ	X0, 64(R4)
+	XVMOVQ	X0, 96(R4)
+	XVMOVQ	X0, -128(R6)
+	XVMOVQ	X0, -96(R6)
+	XVMOVQ	X0, -64(R6)
+	XVMOVQ	X0, -32(R6)
+	RET
+
+lsx_clr_17through32:
+	VMOVQ	V0, 0(R4)
+	VMOVQ	V0, -16(R6)
+	RET
+lsx_clr_33through64:
+	VMOVQ	V0, 0(R4)
+	VMOVQ	V0, 16(R4)
+	VMOVQ	V0, -32(R6)
+	VMOVQ	V0, -16(R6)
+	RET
+lsx_clr_65through128:
+	VMOVQ	V0, 0(R4)
+	VMOVQ	V0, 16(R4)
+	VMOVQ	V0, 32(R4)
+	VMOVQ	V0, 48(R4)
+	VMOVQ	V0, -64(R6)
+	VMOVQ	V0, -48(R6)
+	VMOVQ	V0, -32(R6)
+	VMOVQ	V0, -16(R6)
+	RET
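
For readers following the new algorithm comment, below is a minimal Go sketch (not part of the patch) of the THRESHOLD/ALIGNMENTS/LOOPBLOCKS selection and the head-alignment correction it describes. clearSketch, clearBytes, and the hasLASX/hasLSX parameters are hypothetical stand-ins for the assembly labels and the internal/cpu feature flags; the actual implementation is the hand-written assembly above.

package main

import (
	"fmt"
	"unsafe"
)

// clearSketch mirrors the control flow described in the algorithm comment:
// pick THRESHOLD/ALIGNMENTS/LOOPBLOCKS from the widest available SIMD
// extension, clear small counts directly, and for large counts clear one
// ALIGNMENTS-wide block at the unaligned pointer, round the pointer up to
// the next boundary, run the LOOPBLOCKS loop, and clear the tail.
func clearSketch(b []byte, hasLASX, hasLSX bool) {
	var threshold, alignments, loopblocks uintptr
	switch {
	case hasLASX:
		threshold, alignments, loopblocks = 256, 32, 256
	case hasLSX:
		threshold, alignments, loopblocks = 128, 16, 128
	default:
		threshold, alignments, loopblocks = 64, 8, 64
	}

	n := uintptr(len(b))
	if n == 0 {
		return
	}
	if n <= threshold {
		clearBytes(b) // stands in for the clr_* / lsx_clr_* / lasx_clr_* size cases
		return
	}

	// Head: if ptr is unaligned, store one full ALIGNMENTS-wide block at ptr,
	// then advance by ALIGNMENTS - (ptr & (ALIGNMENTS-1)) so the loop below
	// runs on an aligned pointer. The overlap is harmless: the bytes are zero.
	off := uintptr(0)
	ptr := uintptr(unsafe.Pointer(&b[0]))
	if rem := ptr & (alignments - 1); rem != 0 {
		clearBytes(b[:alignments])
		off = alignments - rem
		n -= alignments - rem
	}

	// Body: the LOOPBLOCKS-byte loop (clr_64loop / lsx_clr_128loop / lasx_clr_256loop).
	for n >= loopblocks {
		clearBytes(b[off : off+loopblocks])
		off += loopblocks
		n -= loopblocks
	}

	// Tail: whatever remains falls back into the size cases (the "JMP tail" paths).
	clearBytes(b[off:])
}

// clearBytes stands in for the vector/integer store sequences.
func clearBytes(b []byte) {
	for i := range b {
		b[i] = 0
	}
}

func main() {
	buf := make([]byte, 512)
	for i := range buf {
		buf[i] = 0xff
	}
	clearSketch(buf[3:483], true, true)             // clear 480 bytes starting at an odd offset
	fmt.Println(buf[2], buf[3], buf[482], buf[483]) // 255 0 0 255
}

The head step matches what the assembly does: rather than clearing the unaligned bytes one by one, it writes one full vector at the unaligned pointer and rounds the pointer up, accepting a small overlap with the first loop iteration since everything is being zeroed anyway.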