diff --git a/src/vendor/golang_org/x/crypto/poly1305/sum_arm.s b/src/vendor/golang_org/x/crypto/poly1305/sum_arm.s index 9c3d60f29d..93167b2712 100644 --- a/src/vendor/golang_org/x/crypto/poly1305/sum_arm.s +++ b/src/vendor/golang_org/x/crypto/poly1305/sum_arm.s @@ -20,10 +20,12 @@ GLOBL poly1305_init_constants_armv6<>(SB), 8, $20 // take care and verify that no synthetic instructions use it. TEXT poly1305_init_ext_armv6<>(SB), NOSPLIT, $0 - // Needs 32 bytes of stack and 64 bytes of space pointed to by R0. - // (It might look like it's only 60 bytes of space but the final - // four bytes will be written by another function.) - MOVM.DB.W [R4-R11], (R13) + // Needs 16 bytes of stack and 64 bytes of space pointed to by R0. (It + // might look like it's only 60 bytes of space but the final four bytes + // will be written by another function.) We need to skip over four + // bytes of stack because that's saving the value of 'g'. + ADD $4, R13, R8 + MOVM.IB [R4-R7], (R8) MOVM.IA.W (R1), [R2-R5] MOVW $poly1305_init_constants_armv6<>(SB), R7 MOVW R2, R8 @@ -49,7 +51,8 @@ TEXT poly1305_init_ext_armv6<>(SB), NOSPLIT, $0 MOVM.IA.W [R2-R6], (R0) MOVM.IA.W (R1), [R2-R5] MOVM.IA [R2-R6], (R0) - MOVM.IA.W (R13), [R4-R11] + ADD $20, R13, R0 + MOVM.DA (R0), [R4-R7] RET #define MOVW_UNALIGNED(Rsrc, Rdst, Rtmp, offset) \ @@ -63,29 +66,34 @@ TEXT poly1305_init_ext_armv6<>(SB), NOSPLIT, $0 MOVBU Rtmp, (offset+3)(Rdst) TEXT poly1305_blocks_armv6<>(SB), NOSPLIT, $0 - // Needs 36 + 128 bytes of stack. - MOVM.DB.W [R4, R5, R6, R7, R8, R9, g, R11, R14], (R13) - SUB $128, R13 - MOVW R0, 36(R13) - MOVW R1, 40(R13) - MOVW R2, 44(R13) - MOVW R1, R14 - MOVW R2, R12 - MOVW 56(R0), R8 - WORD $0xe1180008 // TST R8, R8 not working see issue 5921 - EOR R6, R6, R6 - MOVW.EQ $(1<<24), R6 - MOVW R6, 32(R13) - ADD $64, R13, g - MOVM.IA (R0), [R0-R9] - MOVM.IA [R0-R4], (g) - CMP $16, R12 - BLO poly1305_blocks_armv6_done + // Needs 24 bytes of stack for saved registers and then 88 bytes of + // scratch space after that. We assume that 24 bytes at (R13) have + // already been used: four bytes for the link register saved in the + // prelude of poly1305_auth_armv6, four bytes for saving the value of g + // in that function and 16 bytes of scratch space used around + // poly1305_finish_ext_armv6_skip1. + ADD $24, R13, R12 + MOVM.IB [R4-R8, R14], (R12) + MOVW R0, 88(R13) + MOVW R1, 92(R13) + MOVW R2, 96(R13) + MOVW R1, R14 + MOVW R2, R12 + MOVW 56(R0), R8 + WORD $0xe1180008 // TST R8, R8 not working see issue 5921 + EOR R6, R6, R6 + MOVW.EQ $(1<<24), R6 + MOVW R6, 84(R13) + ADD $116, R13, g + MOVM.IA (R0), [R0-R9] + MOVM.IA [R0-R4], (g) + CMP $16, R12 + BLO poly1305_blocks_armv6_done poly1305_blocks_armv6_mainloop: WORD $0xe31e0003 // TST R14, #3 not working see issue 5921 BEQ poly1305_blocks_armv6_mainloop_aligned - ADD $48, R13, g + ADD $100, R13, g MOVW_UNALIGNED(R14, g, R0, 0) MOVW_UNALIGNED(R14, g, R0, 4) MOVW_UNALIGNED(R14, g, R0, 8) @@ -101,21 +109,21 @@ poly1305_blocks_armv6_mainloop_loaded: MOVW R0>>26, g MOVW R1>>20, R11 MOVW R2>>14, R12 - MOVW R14, 40(R13) + MOVW R14, 92(R13) MOVW R3>>8, R4 ORR R1<<6, g, g ORR R2<<12, R11, R11 ORR R3<<18, R12, R12 BIC $0xfc000000, R0, R0 BIC $0xfc000000, g, g - MOVW 32(R13), R3 + MOVW 84(R13), R3 BIC $0xfc000000, R11, R11 BIC $0xfc000000, R12, R12 ADD R0, R5, R5 ADD g, R6, R6 ORR R3, R4, R4 ADD R11, R7, R7 - ADD $64, R13, R14 + ADD $116, R13, R14 ADD R12, R8, R8 ADD R4, R9, R9 MOVM.IA (R14), [R0-R4] @@ -131,10 +139,10 @@ poly1305_blocks_armv6_mainloop_loaded: MULALU R0, R8, (R14, R12) MULALU R0, R9, (R11, g) MULALU R4, R9, (R14, R12) - MOVW g, 24(R13) - MOVW R11, 28(R13) - MOVW R12, 16(R13) - MOVW R14, 20(R13) + MOVW g, 76(R13) + MOVW R11, 80(R13) + MOVW R12, 68(R13) + MOVW R14, 72(R13) MULLU R2, R5, (R11, g) MULLU R1, R5, (R14, R12) MULALU R1, R6, (R11, g) @@ -147,16 +155,17 @@ poly1305_blocks_armv6_mainloop_loaded: MULALU R3, R8, (R14, R12) MULALU R3, R9, (R11, g) MULALU R2, R9, (R14, R12) - MOVW g, 8(R13) - MOVW R11, 12(R13) - MOVW R12, 0(R13) - MOVW R14, w+4(SP) + MOVW g, 60(R13) + MOVW R11, 64(R13) + MOVW R12, 52(R13) + MOVW R14, 56(R13) MULLU R0, R5, (R11, g) MULALU R4, R6, (R11, g) MULALU R3, R7, (R11, g) MULALU R2, R8, (R11, g) MULALU R1, R9, (R11, g) - MOVM.IA (R13), [R0-R7] + ADD $52, R13, R0 + MOVM.IA (R0), [R0-R7] MOVW g>>26, R12 MOVW R4>>26, R14 ORR R11<<6, R12, R12 @@ -187,23 +196,23 @@ poly1305_blocks_armv6_mainloop_loaded: MOVW R4>>26, R12 BIC $0xfc000000, R4, R8 ADD R12, R6, R9 - MOVW w+44(SP), R12 - MOVW w+40(SP), R14 + MOVW 96(R13), R12 + MOVW 92(R13), R14 MOVW R0, R6 CMP $32, R12 SUB $16, R12, R12 - MOVW R12, 44(R13) + MOVW R12, 96(R13) BHS poly1305_blocks_armv6_mainloop poly1305_blocks_armv6_done: - MOVW 36(R13), R12 - MOVW R5, 20(R12) - MOVW R6, 24(R12) - MOVW R7, 28(R12) - MOVW R8, 32(R12) - MOVW R9, 36(R12) - ADD $128, R13, R13 - MOVM.IA.W (R13), [R4, R5, R6, R7, R8, R9, g, R11, R14] + MOVW 88(R13), R12 + MOVW R5, 20(R12) + MOVW R6, 24(R12) + MOVW R7, 28(R12) + MOVW R8, 32(R12) + MOVW R9, 36(R12) + ADD $48, R13, R0 + MOVM.DA (R0), [R4-R8, R14] RET #define MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp) \ @@ -216,26 +225,76 @@ poly1305_blocks_armv6_done: MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp); \ MOVHUP_UNALIGNED(Rsrc, Rdst, Rtmp) -TEXT poly1305_finish_ext_armv6<>(SB), NOSPLIT, $0 - // Needs 36 + 16 bytes of stack. - MOVM.DB.W [R4, R5, R6, R7, R8, R9, g, R11, R14], (R13) - SUB $16, R13, R13 - MOVW R0, R5 - MOVW R1, R6 - MOVW R2, R7 - MOVW R3, R8 - AND.S R2, R2, R2 - BEQ poly1305_finish_ext_armv6_noremaining - EOR R0, R0 - MOVW R13, R9 - MOVW R0, 0(R13) - MOVW R0, 4(R13) - MOVW R0, 8(R13) - MOVW R0, 12(R13) - WORD $0xe3110003 // TST R1, #3 not working see issue 5921 - BEQ poly1305_finish_ext_armv6_aligned - WORD $0xe3120008 // TST R2, #8 not working see issue 5921 - BEQ poly1305_finish_ext_armv6_skip8 +// func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]key) +TEXT ·poly1305_auth_armv6(SB), $196-16 + // The value 196, just above, is the sum of 64 (the size of the context + // structure) and 132 (the amount of stack needed). + // + // At this point, the stack pointer (R13) has been moved down. It + // points to the saved link register and there's 196 bytes of free + // space above it. + // + // The stack for this function looks like: + // + // +--------------------- + // | + // | 64 bytes of context structure + // | + // +--------------------- + // | + // | 112 bytes for poly1305_blocks_armv6 + // | + // +--------------------- + // | 16 bytes of final block, constructed at + // | poly1305_finish_ext_armv6_skip8 + // +--------------------- + // | four bytes of saved 'g' + // +--------------------- + // | lr, saved by prelude <- R13 points here + // +--------------------- + MOVW g, 4(R13) + + MOVW out+0(FP), R4 + MOVW m+4(FP), R5 + MOVW mlen+8(FP), R6 + MOVW key+12(FP), R7 + + ADD $136, R13, R0 // 136 = 4 + 4 + 16 + 112 + MOVW R7, R1 + + // poly1305_init_ext_armv6 will write to the stack from R13+4, but + // that's ok because none of the other values have been written yet. + BL poly1305_init_ext_armv6<>(SB) + BIC.S $15, R6, R2 + BEQ poly1305_auth_armv6_noblocks + ADD $136, R13, R0 + MOVW R5, R1 + ADD R2, R5, R5 + SUB R2, R6, R6 + BL poly1305_blocks_armv6<>(SB) + +poly1305_auth_armv6_noblocks: + ADD $136, R13, R0 + MOVW R5, R1 + MOVW R6, R2 + MOVW R4, R3 + + MOVW R0, R5 + MOVW R1, R6 + MOVW R2, R7 + MOVW R3, R8 + AND.S R2, R2, R2 + BEQ poly1305_finish_ext_armv6_noremaining + EOR R0, R0 + ADD $8, R13, R9 // 8 = offset to 16 byte scratch space + MOVW R0, (R9) + MOVW R0, 4(R9) + MOVW R0, 8(R9) + MOVW R0, 12(R9) + WORD $0xe3110003 // TST R1, #3 not working see issue 5921 + BEQ poly1305_finish_ext_armv6_aligned + WORD $0xe3120008 // TST R2, #8 not working see issue 5921 + BEQ poly1305_finish_ext_armv6_skip8 MOVWP_UNALIGNED(R1, R9, g) MOVWP_UNALIGNED(R1, R9, g) @@ -279,7 +338,7 @@ poly1305_finish_ext_armv6_skip1: MOVBU R11, 0(R9) MOVW R11, 56(R5) MOVW R5, R0 - MOVW R13, R1 + ADD $8, R13, R1 MOVW $16, R2 BL poly1305_blocks_armv6<>(SB) @@ -318,14 +377,14 @@ poly1305_finish_ext_armv6_noremaining: MOVW $-(1<<26), R12 ADD R11>>26, R12, R12 BIC $0xfc000000, R11, R11 - ADD R12, R4, R14 - MOVW R14>>31, R12 + ADD R12, R4, R9 + MOVW R9>>31, R12 SUB $1, R12 AND R12, R6, R6 AND R12, R7, R7 AND R12, g, g AND R12, R11, R11 - AND R12, R14, R14 + AND R12, R9, R9 MVN R12, R12 AND R12, R0, R0 AND R12, R1, R1 @@ -336,7 +395,7 @@ poly1305_finish_ext_armv6_noremaining: ORR R7, R1, R1 ORR g, R2, R2 ORR R11, R3, R3 - ORR R14, R4, R4 + ORR R9, R4, R4 ORR R1<<26, R0, R0 MOVW R1>>6, R1 ORR R2<<20, R1, R1 @@ -364,53 +423,5 @@ poly1305_finish_ext_armv6_noremaining: EOR R7, R7, R7 MOVM.IA.W [R0-R7], (R12) MOVM.IA [R0-R7], (R12) - ADD $16, R13, R13 - MOVM.IA.W (R13), [R4, R5, R6, R7, R8, R9, g, R11, R14] - RET - -// func poly1305_auth_armv6(out *[16]byte, m *byte, mlen uint32, key *[32]key) -TEXT ·poly1305_auth_armv6(SB), $228-16 - // The value 228, just above, is the sum of 64 (the size of the context - // structure) and 164 (the amount of stack that |poly1305_blocks_armv6| - // needs). - // - // At this point, the stack pointer (R13) has been moved down. It - // points to the saved link register and there's 228 bytes of free - // space above it. - MOVW out+0(FP), R4 - MOVW m+4(FP), R5 - MOVW mlen+8(FP), R6 - MOVW key+12(FP), R7 - - // We need to keep a 64-byte structure on the stack and have enough - // space for |poly1305_blocks_armv6| (which needs 164 bytes of stack - // space). This assembly code was written for a C-based world where - // code just assumes that sufficient stack is available below the - // current stack pointer. So the structure is kept at the highest - // addresses of the frame and the stack for other functions exists just - // below it. - // - // (In ARM, R13 points at the value currently at the top of the stack, - // so the structure address and stack pointer are the same value.) - // - // We add 168, not 164, because the link register is saved at *R13. - ADD $168, R13, R13 - MOVW R13, R0 - MOVW R7, R1 - BL poly1305_init_ext_armv6<>(SB) - BIC.S $15, R6, R2 - BEQ poly1305_auth_armv6_noblocks - MOVW R13, R0 - MOVW R5, R1 - ADD R2, R5, R5 - SUB R2, R6, R6 - BL poly1305_blocks_armv6<>(SB) - -poly1305_auth_armv6_noblocks: - MOVW R13, R0 - MOVW R5, R1 - MOVW R6, R2 - MOVW R4, R3 - BL poly1305_finish_ext_armv6<>(SB) - SUB $168, R13, R13 + MOVW 4(R13), g RET