diff --git a/src/internal/bytealg/equal_arm64.s b/src/internal/bytealg/equal_arm64.s
index 30abd980c5..dd4840dae1 100644
--- a/src/internal/bytealg/equal_arm64.s
+++ b/src/internal/bytealg/equal_arm64.s
@@ -67,6 +67,7 @@ TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
 	CMP	R3, R4
 	BEQ	eq
 	MOVD	8(R26), R5    // compiler stores size at offset 8 in the closure
+	CBZ	R5, eq
 	MOVD	R3, 8(RSP)
 	MOVD	R4, 16(RSP)
 	MOVD	R5, 24(RSP)
@@ -119,30 +120,41 @@ chunk16:
 	CBZ	R3, tail
 	ADD	R3, R0, R6	// end of chunks
 chunk16_loop:
-	VLD1.P	(R0), [V0.D2]
-	VLD1.P	(R2), [V1.D2]
-	VCMEQ	V0.D2, V1.D2, V2.D2
+	LDP.P	16(R0), (R4, R5)
+	LDP.P	16(R2), (R7, R9)
+	EOR	R4, R7
+	CBNZ	R7, not_equal
+	EOR	R5, R9
+	CBNZ	R9, not_equal
 	CMP	R0, R6
-	VMOV	V2.D[0], R4
-	VMOV	V2.D[1], R5
-	CBZ	R4, not_equal
-	CBZ	R5, not_equal
 	BNE	chunk16_loop
 	AND	$0xf, R1, R1
 	CBZ	R1, equal
 tail:
 	// special compare of tail with length < 16
 	TBZ	$3, R1, lt_8
-	MOVD.P	8(R0), R4
-	MOVD.P	8(R2), R5
-	CMP	R4, R5
-	BNE	not_equal
+	MOVD	(R0), R4
+	MOVD	(R2), R5
+	EOR	R4, R5
+	CBNZ	R5, not_equal
+	SUB	$8, R1, R6	// offset of the last 8 bytes
+	MOVD	(R0)(R6), R4
+	MOVD	(R2)(R6), R5
+	EOR	R4, R5
+	CBNZ	R5, not_equal
+	B	equal
 lt_8:
 	TBZ	$2, R1, lt_4
-	MOVWU.P	4(R0), R4
-	MOVWU.P	4(R2), R5
-	CMP	R4, R5
-	BNE	not_equal
+	MOVWU	(R0), R4
+	MOVWU	(R2), R5
+	EOR	R4, R5
+	CBNZ	R5, not_equal
+	SUB	$4, R1, R6	// offset of the last 4 bytes
+	MOVWU	(R0)(R6), R4
+	MOVWU	(R2)(R6), R5
+	EOR	R4, R5
+	CBNZ	R5, not_equal
+	B	equal
 lt_4:
 	TBZ	$1, R1, lt_2
 	MOVHU.P	2(R0), R4
@@ -150,7 +162,7 @@ lt_4:
 	CMP	R4, R5
 	BNE	not_equal
 lt_2:
-	TBZ	$0, R1, equal
+	TBZ	$0, R1, equal
 one:
 	MOVBU	(R0), R4
 	MOVBU	(R2), R5
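
Note (not part of the patch): the rewritten tail path above avoids a byte-by-byte loop by re-reading the last 8 (or 4) bytes of the buffers through a possibly overlapping load, so any 9..15-byte tail costs exactly two 8-byte compares. A minimal Go sketch of that overlapping-window idea, assuming a hypothetical helper name equalTail and inputs of equal length between 8 and 15 bytes:

	package main

	import (
		"encoding/binary"
		"fmt"
	)

	// equalTail reports whether a and b (equal length, 8 <= len(a) < 16) are
	// equal, using two possibly overlapping 8-byte word compares. This mirrors
	// the MOVD (R0) / MOVD (R0)(R6) pair in the assembly tail path.
	func equalTail(a, b []byte) bool {
		// First 8 bytes.
		if binary.LittleEndian.Uint64(a)^binary.LittleEndian.Uint64(b) != 0 {
			return false
		}
		// Last 8 bytes; this window overlaps the first one when len(a) < 16.
		off := len(a) - 8
		return binary.LittleEndian.Uint64(a[off:])^binary.LittleEndian.Uint64(b[off:]) == 0
	}

	func main() {
		fmt.Println(equalTail([]byte("hello, world"), []byte("hello, world"))) // true
		fmt.Println(equalTail([]byte("hello, world"), []byte("hello, World"))) // false
	}

The same trick appears twice in the patch: MOVD (R0) / MOVD (R0)(R6) for the 8-byte window and MOVWU (R0) / MOVWU (R0)(R6) for the 4-byte window, with R6 holding the offset of the last full word.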