mirror of https://github.com/golang/go.git
crypto/aes: rewrite ppc64le asm-cbc routine
This loads the keys once per call, not once per block. This has the effect of unrolling the inner loop too. This allows decryption to scale better with available hardware. Notably, encryption serializes crypto ops, thus no performance improvement is seen, but neither is it reduced. Care is also taken to explicitly clear keys from registers as was done implicitly in the prior version. Also, fix a couple of typos from copying the asm used to load ESPERM. Performance delta on POWER9: name old time/op new time/op delta AESCBCEncrypt1K 1.10µs ± 0% 1.10µs ± 0% +0.55% AESCBCDecrypt1K 793ns ± 0% 415ns ± 0% -47.70% Change-Id: I52ca939fefa1d776a390a0869e7f4564058942b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/441816 Run-TryBot: Paul Murphy <murp@ibm.com> Reviewed-by: Joedian Reid <joedian@golang.org> Reviewed-by: Michael Knyszek <mknyszek@google.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
This commit is contained in:
parent
74b6a22057
commit
01d12c947c
|
|
@ -102,11 +102,11 @@ TEXT ·expandKeyAsm(SB), NOSPLIT|NOFRAME, $0
|
||||||
MOVD dec+24(FP), OUTDEC
|
MOVD dec+24(FP), OUTDEC
|
||||||
|
|
||||||
#ifdef GOARCH_ppc64le
|
#ifdef GOARCH_ppc64le
|
||||||
MOVD $·rcon(SB), PTR // PTR point to rcon addr
|
MOVD $·rcon(SB), PTR // PTR points to rcon addr
|
||||||
LVX (PTR), ESPERM
|
LVX (PTR), ESPERM
|
||||||
ADD $0x10, PTR
|
ADD $0x10, PTR
|
||||||
#else
|
#else
|
||||||
MOVD $·rcon+0x10(SB), PTR // PTR point to rcon addr (skipping permute vector)
|
MOVD $·rcon+0x10(SB), PTR // PTR points to rcon addr (skipping permute vector)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Get key from memory and write aligned into VR
|
// Get key from memory and write aligned into VR
|
||||||
|
|
@ -500,61 +500,124 @@ Linvalid_key_len:
|
||||||
#undef KEY
|
#undef KEY
|
||||||
#undef TMP
|
#undef TMP
|
||||||
|
|
||||||
// CBC encrypt or decrypt
|
|
||||||
// R3 src
|
|
||||||
// R4 dst
|
|
||||||
// R5 len
|
|
||||||
// R6 key
|
|
||||||
// R7 iv
|
|
||||||
// R8 enc=1 dec=0
|
|
||||||
// Ported from: aes_p8_cbc_encrypt
|
|
||||||
// Register usage:
|
|
||||||
// R9: ROUNDS
|
|
||||||
// R10: Index
|
|
||||||
// V4: IV
|
|
||||||
// V5: SRC
|
|
||||||
// V7: DST
|
|
||||||
|
|
||||||
#define INP R3
|
#define INP R3
|
||||||
#define OUT R4
|
#define OUTP R4
|
||||||
#define LEN R5
|
#define LEN R5
|
||||||
#define KEY R6
|
#define KEYP R6
|
||||||
#define IVP R7
|
#define ROUNDS R7
|
||||||
#define ENC R8
|
#define IVP R8
|
||||||
#define ROUNDS R9
|
#define ENC R9
|
||||||
#define IDX R10
|
|
||||||
|
|
||||||
#define RNDKEY0 V0
|
|
||||||
#define INOUT V2
|
#define INOUT V2
|
||||||
#define TMP V3
|
#define TMP V3
|
||||||
|
|
||||||
#define IVEC V4
|
#define IVEC V4
|
||||||
|
|
||||||
// Vector loads are done using LVX followed by
|
// Load the crypt key into VSRs.
|
||||||
// a VPERM using mask generated from previous
|
//
|
||||||
// LVSL or LVSR instruction, to obtain the correct
|
// The expanded key is stored and loaded using
|
||||||
// bytes if address is unaligned.
|
// STXVD2X/LXVD2X. The in-memory byte ordering
|
||||||
|
// depends on the endianness of the machine. The
|
||||||
|
// expanded keys are generated by expandKeyAsm above.
|
||||||
|
//
|
||||||
|
// Rkeyp holds the key pointer. It is clobbered. Once
|
||||||
|
// the expanded keys are loaded, it is not needed.
|
||||||
|
//
|
||||||
|
// R12,R14-R21 are scratch registers.
|
||||||
|
// For 10 rounds, V6 and V11-V20 hold the expanded key.
|
||||||
|
// For 12 rounds, V6 and V9-V20 hold the expanded key.
|
||||||
|
// For 14 rounds, V6-V20 hold the expanded key.
|
||||||
|
#define LOAD_KEY(Rkeyp) \
|
||||||
|
MOVD $16, R12 \
|
||||||
|
MOVD $32, R14 \
|
||||||
|
MOVD $48, R15 \
|
||||||
|
MOVD $64, R16 \
|
||||||
|
MOVD $80, R17 \
|
||||||
|
MOVD $96, R18 \
|
||||||
|
MOVD $112, R19 \
|
||||||
|
MOVD $128, R20 \
|
||||||
|
MOVD $144, R21 \
|
||||||
|
LXVD2X (R0+Rkeyp), V6 \
|
||||||
|
ADD $16, Rkeyp \
|
||||||
|
BEQ CR1, L_start10 \
|
||||||
|
BEQ CR2, L_start12 \
|
||||||
|
LXVD2X (R0+Rkeyp), V7 \
|
||||||
|
LXVD2X (R12+Rkeyp), V8 \
|
||||||
|
ADD $32, Rkeyp \
|
||||||
|
L_start12: \
|
||||||
|
LXVD2X (R0+Rkeyp), V9 \
|
||||||
|
LXVD2X (R12+Rkeyp), V10 \
|
||||||
|
ADD $32, Rkeyp \
|
||||||
|
L_start10: \
|
||||||
|
LXVD2X (R0+Rkeyp), V11 \
|
||||||
|
LXVD2X (R12+Rkeyp), V12 \
|
||||||
|
LXVD2X (R14+Rkeyp), V13 \
|
||||||
|
LXVD2X (R15+Rkeyp), V14 \
|
||||||
|
LXVD2X (R16+Rkeyp), V15 \
|
||||||
|
LXVD2X (R17+Rkeyp), V16 \
|
||||||
|
LXVD2X (R18+Rkeyp), V17 \
|
||||||
|
LXVD2X (R19+Rkeyp), V18 \
|
||||||
|
LXVD2X (R20+Rkeyp), V19 \
|
||||||
|
LXVD2X (R21+Rkeyp), V20
|
||||||
|
|
||||||
// Encryption is done with VCIPHER and VCIPHERLAST
|
// Perform AES cipher operation for keysize 10/12/14 using the keys
|
||||||
// Decryption is done with VNCIPHER and VNCIPHERLAST
|
// loaded by LOAD_KEY, and key size information held in CR1EQ/CR2EQ.
|
||||||
|
//
|
||||||
|
// Vxor is ideally V6 (Key[0-3]), but for slightly improved encrypting
|
||||||
|
// performance V6 and IVEC can be swapped (xor is both associative and
|
||||||
|
// commutative) during encryption:
|
||||||
|
//
|
||||||
|
// VXOR INOUT, IVEC, INOUT
|
||||||
|
// VXOR INOUT, V6, INOUT
|
||||||
|
//
|
||||||
|
// into
|
||||||
|
//
|
||||||
|
// VXOR INOUT, V6, INOUT
|
||||||
|
// VXOR INOUT, IVEC, INOUT
|
||||||
|
//
|
||||||
|
#define CIPHER_BLOCK(Vin, Vxor, Vout, vcipher, vciphel, label10, label12) \
|
||||||
|
VXOR Vin, Vxor, Vout \
|
||||||
|
BEQ CR1, label10 \
|
||||||
|
BEQ CR2, label12 \
|
||||||
|
vcipher Vout, V7, Vout \
|
||||||
|
vcipher Vout, V8, Vout \
|
||||||
|
label12: \
|
||||||
|
vcipher Vout, V9, Vout \
|
||||||
|
vcipher Vout, V10, Vout \
|
||||||
|
label10: \
|
||||||
|
vcipher Vout, V11, Vout \
|
||||||
|
vcipher Vout, V12, Vout \
|
||||||
|
vcipher Vout, V13, Vout \
|
||||||
|
vcipher Vout, V14, Vout \
|
||||||
|
vcipher Vout, V15, Vout \
|
||||||
|
vcipher Vout, V16, Vout \
|
||||||
|
vcipher Vout, V17, Vout \
|
||||||
|
vcipher Vout, V18, Vout \
|
||||||
|
vcipher Vout, V19, Vout \
|
||||||
|
vciphel Vout, V20, Vout \
|
||||||
|
|
||||||
// Encrypt and decypt is done as follows:
|
#define CLEAR_KEYS() \
|
||||||
// - INOUT value is initialized in outer loop.
|
VXOR V6, V6, V6 \
|
||||||
// - ROUNDS value is adjusted for loop unrolling.
|
VXOR V7, V7, V7 \
|
||||||
// - Encryption/decryption is done in loop based on
|
VXOR V8, V8, V8 \
|
||||||
// adjusted ROUNDS value.
|
VXOR V9, V9, V9 \
|
||||||
// - Final INOUT value is encrypted/decrypted and stored.
|
VXOR V10, V10, V10 \
|
||||||
|
VXOR V11, V11, V11 \
|
||||||
|
VXOR V12, V12, V12 \
|
||||||
|
VXOR V13, V13, V13 \
|
||||||
|
VXOR V14, V14, V14 \
|
||||||
|
VXOR V15, V15, V15 \
|
||||||
|
VXOR V16, V16, V16 \
|
||||||
|
VXOR V17, V17, V17 \
|
||||||
|
VXOR V18, V18, V18 \
|
||||||
|
VXOR V19, V19, V19 \
|
||||||
|
VXOR V20, V20, V20
|
||||||
|
|
||||||
// Note: original implementation had an 8X version
|
// func cryptBlocksChain(src, dst *byte, length int, key *uint32, iv *byte, enc int, nr int)
|
||||||
// for decryption which was omitted to avoid the
|
|
||||||
// complexity.
|
|
||||||
|
|
||||||
// func cryptBlocksChain(src, dst *byte, length int, key *uint32, iv *byte, enc int, nr int)
|
|
||||||
TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0
|
TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0
|
||||||
MOVD src+0(FP), INP
|
MOVD src+0(FP), INP
|
||||||
MOVD dst+8(FP), OUT
|
MOVD dst+8(FP), OUTP
|
||||||
MOVD length+16(FP), LEN
|
MOVD length+16(FP), LEN
|
||||||
MOVD key+24(FP), KEY
|
MOVD key+24(FP), KEYP
|
||||||
MOVD iv+32(FP), IVP
|
MOVD iv+32(FP), IVP
|
||||||
MOVD enc+40(FP), ENC
|
MOVD enc+40(FP), ENC
|
||||||
MOVD nr+48(FP), ROUNDS
|
MOVD nr+48(FP), ROUNDS
|
||||||
|
|
@ -564,91 +627,45 @@ TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0
|
||||||
LVX (R11), ESPERM // Permute value for P8_ macros.
|
LVX (R11), ESPERM // Permute value for P8_ macros.
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
CMPU LEN, $16 // cmpldi r5,16
|
// Assume len > 0 && len % blockSize == 0.
|
||||||
BC 14, 0, LR // bltlr-, return if len < 16.
|
CMPW ENC, $0
|
||||||
CMPW ENC, $0 // cmpwi r8,0
|
P8_LXVB16X(IVP, R0, IVEC)
|
||||||
|
CMPU ROUNDS, $10, CR1
|
||||||
|
CMPU ROUNDS, $12, CR2 // Only sizes 10/12/14 are supported.
|
||||||
|
|
||||||
P8_LXVB16X(IVP, R0, IVEC) // load ivec in BE register order
|
// Setup key in VSRs, and set loop count in CTR.
|
||||||
|
LOAD_KEY(KEYP)
|
||||||
|
SRD $4, LEN
|
||||||
|
MOVD LEN, CTR
|
||||||
|
|
||||||
SRW $1, ROUNDS // rlwinm r9,r9,31,1,31
|
BEQ Lcbc_dec
|
||||||
MOVD $0, IDX // li r10,0
|
|
||||||
ADD $-1, ROUNDS // addi r9,r9,-1
|
|
||||||
BEQ Lcbc_dec // beq
|
|
||||||
PCALIGN $16
|
|
||||||
|
|
||||||
// Outer loop: initialize encrypted value (INOUT)
|
PCALIGN $32
|
||||||
// Load input (INPTAIL) ivec (IVEC)
|
|
||||||
Lcbc_enc:
|
Lcbc_enc:
|
||||||
P8_LXVB16X(INP, R0, INOUT) // load text in BE vreg order
|
P8_LXVB16X(INP, R0, INOUT)
|
||||||
ADD $16, INP // addi r3,r3,16
|
ADD $16, INP
|
||||||
MOVD ROUNDS, CTR // mtctr r9
|
VXOR INOUT, V6, INOUT
|
||||||
ADD $-16, LEN // addi r5,r5,-16
|
CIPHER_BLOCK(INOUT, IVEC, INOUT, VCIPHER, VCIPHERLAST, Lcbc_enc10, Lcbc_enc12)
|
||||||
LXVD2X (KEY+IDX), RNDKEY0 // load first xkey
|
VOR INOUT, INOUT, IVEC // ciphertext (INOUT) is IVEC for next block.
|
||||||
ADD $16, IDX // addi r10,r10,16
|
P8_STXVB16X(INOUT, OUTP, R0)
|
||||||
VXOR INOUT, RNDKEY0, INOUT // vxor v2,v2,v0
|
ADD $16, OUTP
|
||||||
VXOR INOUT, IVEC, INOUT // vxor v2,v2,v4
|
BDNZ Lcbc_enc
|
||||||
|
|
||||||
// Encryption loop of INOUT using RNDKEY0
|
P8_STXVB16X(INOUT, IVP, R0)
|
||||||
Loop_cbc_enc:
|
CLEAR_KEYS()
|
||||||
LXVD2X (KEY+IDX), RNDKEY0 // load next xkey
|
RET
|
||||||
VCIPHER INOUT, RNDKEY0, INOUT // vcipher v2,v2,v1
|
|
||||||
ADD $16, IDX // addi r10,r10,16
|
|
||||||
LXVD2X (KEY+IDX), RNDKEY0 // load next xkey
|
|
||||||
VCIPHER INOUT, RNDKEY0, INOUT // vcipher v2,v2,v1
|
|
||||||
ADD $16, IDX // addi r10,r10,16
|
|
||||||
BDNZ Loop_cbc_enc
|
|
||||||
|
|
||||||
// Encrypt tail values and store INOUT
|
PCALIGN $32
|
||||||
LXVD2X (KEY+IDX), RNDKEY0 // load next xkey
|
|
||||||
VCIPHER INOUT, RNDKEY0, INOUT // vcipher v2,v2,v1
|
|
||||||
ADD $16, IDX // addi r10,r10,16
|
|
||||||
LXVD2X (KEY+IDX), RNDKEY0 // load final xkey
|
|
||||||
VCIPHERLAST INOUT, RNDKEY0, IVEC // vcipherlast v4,v2,v0
|
|
||||||
MOVD $0, IDX // reset key index for next block
|
|
||||||
CMPU LEN, $16 // cmpldi r5,16
|
|
||||||
P8_STXVB16X(IVEC, OUT, R0) // store ciphertext in BE order
|
|
||||||
ADD $16, OUT // addi r4,r4,16
|
|
||||||
BGE Lcbc_enc // bge Lcbc_enc
|
|
||||||
BR Lcbc_done // b Lcbc_done
|
|
||||||
|
|
||||||
// Outer loop: initialize decrypted value (INOUT)
|
|
||||||
// Load input (INPTAIL) ivec (IVEC)
|
|
||||||
Lcbc_dec:
|
Lcbc_dec:
|
||||||
P8_LXVB16X(INP, R0, TMP) // load ciphertext in BE vreg order
|
P8_LXVB16X(INP, R0, TMP)
|
||||||
ADD $16, INP // addi r3,r3,16
|
ADD $16, INP
|
||||||
MOVD ROUNDS, CTR // mtctr r9
|
CIPHER_BLOCK(TMP, V6, INOUT, VNCIPHER, VNCIPHERLAST, Lcbc_dec10, Lcbc_dec12)
|
||||||
ADD $-16, LEN // addi r5,r5,-16
|
VXOR INOUT, IVEC, INOUT
|
||||||
LXVD2X (KEY+IDX), RNDKEY0 // load first xkey
|
VOR TMP, TMP, IVEC // TMP is IVEC for next block.
|
||||||
ADD $16, IDX // addi r10,r10,16
|
P8_STXVB16X(INOUT, OUTP, R0)
|
||||||
VXOR TMP, RNDKEY0, INOUT // vxor v2,v3,v0
|
ADD $16, OUTP
|
||||||
PCALIGN $16
|
BDNZ Lcbc_dec
|
||||||
|
|
||||||
// Decryption loop of INOUT using RNDKEY0
|
|
||||||
Loop_cbc_dec:
|
|
||||||
LXVD2X (KEY+IDX), RNDKEY0 // load next xkey
|
|
||||||
ADD $16, IDX // addi r10,r10,16
|
|
||||||
VNCIPHER INOUT, RNDKEY0, INOUT // vncipher v2,v2,v1
|
|
||||||
LXVD2X (KEY+IDX), RNDKEY0 // load next xkey
|
|
||||||
ADD $16, IDX // addi r10,r10,16
|
|
||||||
VNCIPHER INOUT, RNDKEY0, INOUT // vncipher v2,v2,v0
|
|
||||||
BDNZ Loop_cbc_dec
|
|
||||||
|
|
||||||
// Decrypt tail values and store INOUT
|
|
||||||
LXVD2X (KEY+IDX), RNDKEY0 // load next xkey
|
|
||||||
ADD $16, IDX // addi r10,r10,16
|
|
||||||
VNCIPHER INOUT, RNDKEY0, INOUT // vncipher v2,v2,v1
|
|
||||||
LXVD2X (KEY+IDX), RNDKEY0 // load final xkey
|
|
||||||
MOVD $0, IDX // li r10,0
|
|
||||||
VNCIPHERLAST INOUT, RNDKEY0, INOUT // vncipherlast v2,v2,v0
|
|
||||||
CMPU LEN, $16 // cmpldi r5,16
|
|
||||||
VXOR INOUT, IVEC, INOUT // vxor v2,v2,v4
|
|
||||||
VOR TMP, TMP, IVEC // vor v4,v3,v3
|
|
||||||
P8_STXVB16X(INOUT, OUT, R0) // store text in BE order
|
|
||||||
ADD $16, OUT // addi r4,r4,16
|
|
||||||
BGE Lcbc_dec // bge
|
|
||||||
|
|
||||||
Lcbc_done:
|
|
||||||
VXOR RNDKEY0, RNDKEY0, RNDKEY0 // clear key register
|
|
||||||
P8_STXVB16X(IVEC, R0, IVP) // Save ivec in BE order for next round.
|
|
||||||
RET // bclr 20,lt,0
|
|
||||||
|
|
||||||
|
P8_STXVB16X(IVEC, IVP, R0)
|
||||||
|
CLEAR_KEYS()
|
||||||
|
RET
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue