diff --git a/src/runtime/memmove_riscv64.s b/src/runtime/memmove_riscv64.s
index 79d90c1adb..ea622ed951 100644
--- a/src/runtime/memmove_riscv64.s
+++ b/src/runtime/memmove_riscv64.s
@@ -8,91 +8,312 @@
// void runtime·memmove(void*, void*, uintptr)
TEXT runtime·memmove(SB),NOSPLIT,$-0-24
- // A0 = to
- // A1 = from
- // A2 = n
- ADD A1, A2, T5
+ // X10 = to
+ // X11 = from
+ // X12 = n
+ BEQ X10, X11, done
+ BEQZ X12, done

// If the destination is ahead of the source, start at the end of the
// buffer and go backward.
- BLTU A1, A0, b
+ BGTU X10, X11, backward

- // If less than eight bytes, do one byte at a time.
- SLTU $8, A2, T3
- BNE T3, ZERO, f_outcheck
+ // If less than 8 bytes, do single byte copies.
+ MOV $8, X9
+ BLT X12, X9, f_loop4_check

- // Do one byte at a time until from is eight-aligned.
- JMP f_aligncheck
+ // Check alignment - if alignment differs we have to do one byte at a time.
+ AND $7, X10, X5
+ AND $7, X11, X6
+ BNE X5, X6, f_loop8_unaligned_check
+ BEQZ X5, f_loop_check
+
+ // Move one byte at a time until we reach 8 byte alignment.
+ SUB X5, X9, X5
+ SUB X5, X12, X12
f_align:
- MOVB (A1), T3
- MOVB T3, (A0)
- ADD $1, A0
- ADD $1, A1
-f_aligncheck:
- AND $7, A1, T3
- BNE T3, ZERO, f_align
+ ADD $-1, X5
+ MOVB 0(X11), X14
+ MOVB X14, 0(X10)
+ ADD $1, X10
+ ADD $1, X11
+ BNEZ X5, f_align

- // Do eight bytes at a time as long as there is room.
- ADD $-7, T5, T6
- JMP f_wordscheck
-f_words:
- MOV (A1), T3
- MOV T3, (A0)
- ADD $8, A0
- ADD $8, A1
-f_wordscheck:
- SLTU T6, A1, T3
- BNE T3, ZERO, f_words
+f_loop_check:
+ MOV $16, X9
+ BLT X12, X9, f_loop8_check
+ MOV $32, X9
+ BLT X12, X9, f_loop16_check
+ MOV $64, X9
+ BLT X12, X9, f_loop32_check
+f_loop64:
+ MOV 0(X11), X14
+ MOV 8(X11), X15
+ MOV 16(X11), X16
+ MOV 24(X11), X17
+ MOV 32(X11), X18
+ MOV 40(X11), X19
+ MOV 48(X11), X20
+ MOV 56(X11), X21
+ MOV X14, 0(X10)
+ MOV X15, 8(X10)
+ MOV X16, 16(X10)
+ MOV X17, 24(X10)
+ MOV X18, 32(X10)
+ MOV X19, 40(X10)
+ MOV X20, 48(X10)
+ MOV X21, 56(X10)
+ ADD $64, X10
+ ADD $64, X11
+ ADD $-64, X12
+ BGE X12, X9, f_loop64
+ BEQZ X12, done

- // Finish off the remaining partial word.
- JMP f_outcheck
-f_out:
- MOVB (A1), T3
- MOVB T3, (A0)
- ADD $1, A0
- ADD $1, A1
-f_outcheck:
- BNE A1, T5, f_out
+f_loop32_check:
+ MOV $32, X9
+ BLT X12, X9, f_loop16_check
+f_loop32:
+ MOV 0(X11), X14
+ MOV 8(X11), X15
+ MOV 16(X11), X16
+ MOV 24(X11), X17
+ MOV X14, 0(X10)
+ MOV X15, 8(X10)
+ MOV X16, 16(X10)
+ MOV X17, 24(X10)
+ ADD $32, X10
+ ADD $32, X11
+ ADD $-32, X12
+ BGE X12, X9, f_loop32
+ BEQZ X12, done

- RET
+f_loop16_check:
+ MOV $16, X9
+ BLT X12, X9, f_loop8_check
+f_loop16:
+ MOV 0(X11), X14
+ MOV 8(X11), X15
+ MOV X14, 0(X10)
+ MOV X15, 8(X10)
+ ADD $16, X10
+ ADD $16, X11
+ ADD $-16, X12
+ BGE X12, X9, f_loop16
+ BEQZ X12, done

-b:
- ADD A0, A2, T4
- // If less than eight bytes, do one byte at a time.
- SLTU $8, A2, T3
- BNE T3, ZERO, b_outcheck
+f_loop8_check:
+ MOV $8, X9
+ BLT X12, X9, f_loop4_check
+f_loop8:
+ MOV 0(X11), X14
+ MOV X14, 0(X10)
+ ADD $8, X10
+ ADD $8, X11
+ ADD $-8, X12
+ BGE X12, X9, f_loop8
+ BEQZ X12, done
+ JMP f_loop4_check

- // Do one byte at a time until from+n is eight-aligned.
- JMP b_aligncheck
+f_loop8_unaligned_check:
+ MOV $8, X9
+ BLT X12, X9, f_loop4_check
+f_loop8_unaligned:
+ MOVB 0(X11), X14
+ MOVB 1(X11), X15
+ MOVB 2(X11), X16
+ MOVB 3(X11), X17
+ MOVB 4(X11), X18
+ MOVB 5(X11), X19
+ MOVB 6(X11), X20
+ MOVB 7(X11), X21
+ MOVB X14, 0(X10)
+ MOVB X15, 1(X10)
+ MOVB X16, 2(X10)
+ MOVB X17, 3(X10)
+ MOVB X18, 4(X10)
+ MOVB X19, 5(X10)
+ MOVB X20, 6(X10)
+ MOVB X21, 7(X10)
+ ADD $8, X10
+ ADD $8, X11
+ ADD $-8, X12
+ BGE X12, X9, f_loop8_unaligned
+
+f_loop4_check:
+ MOV $4, X9
+ BLT X12, X9, f_loop1
+f_loop4:
+ MOVB 0(X11), X14
+ MOVB 1(X11), X15
+ MOVB 2(X11), X16
+ MOVB 3(X11), X17
+ MOVB X14, 0(X10)
+ MOVB X15, 1(X10)
+ MOVB X16, 2(X10)
+ MOVB X17, 3(X10)
+ ADD $4, X10
+ ADD $4, X11
+ ADD $-4, X12
+ BGE X12, X9, f_loop4
+
+f_loop1:
+ BEQZ X12, done
+ MOVB 0(X11), X14
+ MOVB X14, 0(X10)
+ ADD $1, X10
+ ADD $1, X11
+ ADD $-1, X12
+ JMP f_loop1
+
+backward:
+ ADD X10, X12, X10
+ ADD X11, X12, X11
+
+ // If less than 8 bytes, do single byte copies.
+ MOV $8, X9
+ BLT X12, X9, b_loop4_check
+
+ // Check alignment - if alignment differs we have to do one byte at a time.
+ AND $7, X10, X5
+ AND $7, X11, X6
+ BNE X5, X6, b_loop8_unaligned_check
+ BEQZ X5, b_loop_check
+
+ // Move one byte at a time until we reach 8 byte alignment.
+ SUB X5, X12, X12
b_align:
- ADD $-1, T4
- ADD $-1, T5
- MOVB (T5), T3
- MOVB T3, (T4)
-b_aligncheck:
- AND $7, T5, T3
- BNE T3, ZERO, b_align
+ ADD $-1, X5
+ ADD $-1, X10
+ ADD $-1, X11
+ MOVB 0(X11), X14
+ MOVB X14, 0(X10)
+ BNEZ X5, b_align

- // Do eight bytes at a time as long as there is room.
- ADD $7, A1, T6
- JMP b_wordscheck
-b_words:
- ADD $-8, T4
- ADD $-8, T5
- MOV (T5), T3
- MOV T3, (T4)
-b_wordscheck:
- SLTU T5, T6, T3
- BNE T3, ZERO, b_words
+b_loop_check:
+ MOV $16, X9
+ BLT X12, X9, b_loop8_check
+ MOV $32, X9
+ BLT X12, X9, b_loop16_check
+ MOV $64, X9
+ BLT X12, X9, b_loop32_check
+b_loop64:
+ ADD $-64, X10
+ ADD $-64, X11
+ MOV 0(X11), X14
+ MOV 8(X11), X15
+ MOV 16(X11), X16
+ MOV 24(X11), X17
+ MOV 32(X11), X18
+ MOV 40(X11), X19
+ MOV 48(X11), X20
+ MOV 56(X11), X21
+ MOV X14, 0(X10)
+ MOV X15, 8(X10)
+ MOV X16, 16(X10)
+ MOV X17, 24(X10)
+ MOV X18, 32(X10)
+ MOV X19, 40(X10)
+ MOV X20, 48(X10)
+ MOV X21, 56(X10)
+ ADD $-64, X12
+ BGE X12, X9, b_loop64
+ BEQZ X12, done

- // Finish off the remaining partial word.
- JMP b_outcheck
-b_out:
- ADD $-1, T4
- ADD $-1, T5
- MOVB (T5), T3
- MOVB T3, (T4)
-b_outcheck:
- BNE T5, A1, b_out
+b_loop32_check:
+ MOV $32, X9
+ BLT X12, X9, b_loop16_check
+b_loop32:
+ ADD $-32, X10
+ ADD $-32, X11
+ MOV 0(X11), X14
+ MOV 8(X11), X15
+ MOV 16(X11), X16
+ MOV 24(X11), X17
+ MOV X14, 0(X10)
+ MOV X15, 8(X10)
+ MOV X16, 16(X10)
+ MOV X17, 24(X10)
+ ADD $-32, X12
+ BGE X12, X9, b_loop32
+ BEQZ X12, done
+
+b_loop16_check:
+ MOV $16, X9
+ BLT X12, X9, b_loop8_check
+b_loop16:
+ ADD $-16, X10
+ ADD $-16, X11
+ MOV 0(X11), X14
+ MOV 8(X11), X15
+ MOV X14, 0(X10)
+ MOV X15, 8(X10)
+ ADD $-16, X12
+ BGE X12, X9, b_loop16
+ BEQZ X12, done
+
+b_loop8_check:
+ MOV $8, X9
+ BLT X12, X9, b_loop4_check
+b_loop8:
+ ADD $-8, X10
+ ADD $-8, X11
+ MOV 0(X11), X14
+ MOV X14, 0(X10)
+ ADD $-8, X12
+ BGE X12, X9, b_loop8
+ BEQZ X12, done
+ JMP b_loop4_check
+
+b_loop8_unaligned_check:
+ MOV $8, X9
+ BLT X12, X9, b_loop4_check
+b_loop8_unaligned:
+ ADD $-8, X10
+ ADD $-8, X11
+ MOVB 0(X11), X14
+ MOVB 1(X11), X15
+ MOVB 2(X11), X16
+ MOVB 3(X11), X17
+ MOVB 4(X11), X18
+ MOVB 5(X11), X19
+ MOVB 6(X11), X20
+ MOVB 7(X11), X21
+ MOVB X14, 0(X10)
+ MOVB X15, 1(X10)
+ MOVB X16, 2(X10)
+ MOVB X17, 3(X10)
+ MOVB X18, 4(X10)
+ MOVB X19, 5(X10)
+ MOVB X20, 6(X10)
+ MOVB X21, 7(X10)
+ ADD $-8, X12
+ BGE X12, X9, b_loop8_unaligned
+
+b_loop4_check:
+ MOV $4, X9
+ BLT X12, X9, b_loop1
+b_loop4:
+ ADD $-4, X10
+ ADD $-4, X11
+ MOVB 0(X11), X14
+ MOVB 1(X11), X15
+ MOVB 2(X11), X16
+ MOVB 3(X11), X17
+ MOVB X14, 0(X10)
+ MOVB X15, 1(X10)
+ MOVB X16, 2(X10)
+ MOVB X17, 3(X10)
+ ADD $-4, X12
+ BGE X12, X9, b_loop4
+
+b_loop1:
+ BEQZ X12, done
+ ADD $-1, X10
+ ADD $-1, X11
+ MOVB 0(X11), X14
+ MOVB X14, 0(X10)
+ ADD $-1, X12
+ JMP b_loop1
+
+done:
RET
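
For readers who do not read RISC-V assembly, the following plain-Go sketch (not part of the patch; all names in it are invented for illustration) models the control flow of the new forward path. In the diff, X10 = to, X11 = from, X12 = n, and X9 holds the threshold for the current loop. When source and destination share the same alignment modulo 8, the code byte-copies up to seven bytes to reach an 8-byte boundary, then drains 64-, 32-, 16- and 8-byte blocks with doubleword MOVs; when the alignments differ, it falls back to byte loads and stores, eight per iteration; 4-byte and 1-byte tails finish the job. The backward path runs the same ladder in reverse so overlapping moves stay correct.

package main

import (
	"bytes"
	"fmt"
	"unsafe"
)

// memmoveModel mirrors the forward path of the patched runtime·memmove.
// It assumes len(dst) >= len(src) and non-overlapping buffers; the real
// routine handles overlap by running the same ladder backward.
func memmoveModel(dst, src []byte) {
	n := len(src)
	i := 0
	if n >= 8 {
		da := uintptr(unsafe.Pointer(&dst[0])) & 7
		sa := uintptr(unsafe.Pointer(&src[0])) & 7
		if da == sa {
			// Byte-copy (8 - misalignment) & 7 bytes to reach an 8-byte
			// boundary; the assembly computes this as SUB X5, X9, X5 with X9=8.
			for head := int((8 - da) & 7); head > 0; head-- {
				dst[i] = src[i]
				i++
			}
			// Drain progressively smaller aligned blocks, largest first.
			// Each copy() stands in for blk/8 pairs of 8-byte MOVs.
			for _, blk := range []int{64, 32, 16, 8} {
				for n-i >= blk {
					copy(dst[i:i+blk], src[i:i+blk])
					i += blk
				}
			}
		} else {
			// Mismatched alignment: 8 bytes per iteration, but moved as
			// individual bytes (the 8 MOVB pairs of f_loop8_unaligned).
			for n-i >= 8 {
				copy(dst[i:i+8], src[i:i+8])
				i += 8
			}
		}
	}
	for n-i >= 4 { // f_loop4: 4 MOVB pairs
		copy(dst[i:i+4], src[i:i+4])
		i += 4
	}
	for ; i < n; i++ { // f_loop1: remaining single bytes
		dst[i] = src[i]
	}
}

func main() {
	src := make([]byte, 100)
	for i := range src {
		src[i] = byte(i)
	}
	dst := make([]byte, len(src))
	memmoveModel(dst, src)
	fmt.Println("copy ok:", bytes.Equal(dst, src))
}

Each copy() above stands in for the unrolled MOV/MOVB pairs of one block in the assembly. Checking the largest block first and reloading X9 at each rung keeps big aligned copies almost entirely in 8-byte moves, while the byte-wise fallback avoids unaligned doubleword accesses, which can be slow or trap on some RISC-V cores.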