diff --git a/src/crypto/aes/gcm_ppc64x.go b/src/crypto/aes/gcm_ppc64x.go
index 44b27056d6..3dbf4ba578 100644
--- a/src/crypto/aes/gcm_ppc64x.go
+++ b/src/crypto/aes/gcm_ppc64x.go
@@ -51,6 +51,8 @@ type gcmAsm struct {
 	tagSize int
 }
 
+func counterCryptASM(nr int, out, in []byte, counter *[gcmBlockSize]byte, key *uint32)
+
 // NewGCM returns the AES cipher wrapped in Galois Counter Mode. This is only
 // called by crypto/cipher.NewGCM via the gcmAble interface.
 func (c *aesCipherAsm) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
@@ -114,34 +116,10 @@ func (g *gcmAsm) deriveCounter(counter *[gcmBlockSize]byte, nonce []byte) {
 // into out. counter is the initial count value and will be updated with the next
 // count value. The length of out must be greater than or equal to the length
 // of in.
+// counterCryptASM implements counterCrypt in assembly, which allows the
+// loop to be unrolled and optimized.
 func (g *gcmAsm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte) {
-	var mask [gcmBlockSize]byte
-
-	for len(in) >= gcmBlockSize {
-		// Hint to avoid bounds check
-		_, _ = in[15], out[15]
-		g.cipher.Encrypt(mask[:], counter[:])
-		gcmInc32(counter)
-
-		// XOR 16 bytes each loop iteration in 8 byte chunks
-		in0 := binary.LittleEndian.Uint64(in[0:])
-		in1 := binary.LittleEndian.Uint64(in[8:])
-		m0 := binary.LittleEndian.Uint64(mask[:8])
-		m1 := binary.LittleEndian.Uint64(mask[8:])
-		binary.LittleEndian.PutUint64(out[:8], in0^m0)
-		binary.LittleEndian.PutUint64(out[8:], in1^m1)
-		out = out[16:]
-		in = in[16:]
-	}
-
-	if len(in) > 0 {
-		g.cipher.Encrypt(mask[:], counter[:])
-		gcmInc32(counter)
-		// XOR leftover bytes
-		for i, inb := range in {
-			out[i] = inb ^ mask[i]
-		}
-	}
+	counterCryptASM(len(g.cipher.enc)/4-1, out, in, counter, &g.cipher.enc[0])
 }
 
 // increments the rightmost 32-bits of the count value by 1.
diff --git a/src/crypto/aes/gcm_ppc64x.s b/src/crypto/aes/gcm_ppc64x.s
index 72f0b8e01c..f661b27642 100644
--- a/src/crypto/aes/gcm_ppc64x.s
+++ b/src/crypto/aes/gcm_ppc64x.s
@@ -4,7 +4,7 @@
 
 //go:build ppc64 || ppc64le
 
-// Based on CRYPTOGAMS code with the following comment:
+// Portions based on CRYPTOGAMS code with the following comment:
 // # ====================================================================
 // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 // # project. The module is, however, dual licensed under OpenSSL and
@@ -12,13 +12,17 @@
 // # details see http://www.openssl.org/~appro/cryptogams/.
 // # ====================================================================
 
-// This implementation is based on the ppc64 asm generated by the
-// script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
+// The implementations for gcmHash, gcmInit, and gcmMul are based on the generated asm
+// from the script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
 // from commit d47afb3c.
 // Changes were made due to differences in the ABI and some register usage.
 // Some arguments were changed due to the way the Go code passes them.
+
+// Portions that use the stitched AES-GCM approach in counterCryptASM
+// are based on code found in
+// https://github.com/IBM/ipcri/blob/main/aes/p10_aes_gcm.s
+
 
 #include "textflag.h"
 
 #define XIP R3
@@ -87,6 +91,292 @@
 
 #define VIN0 VIN
 
+#define ESPERM V10
+#define TMP2 V11
+
+// The following macros provide appropriate
+// load and store implementations for the target
+// endianness, as well as ISA-specific variants
+// for POWER8 and POWER9.
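The Go hunk above replaces the byte-slice loop with a single call into counterCryptASM, passing the AES round count (len(g.cipher.enc)/4 - 1, i.e. 10, 12, or 14) and a pointer to the expanded key schedule. The contract the assembly has to preserve is exactly that of the removed loop: encrypt the counter, XOR the keystream with the input, and increment the rightmost 32 bits of the counter as a big-endian value. A minimal Go sketch of that contract, not part of the patch and with the hypothetical name counterCryptRef, is:

package gcmsketch

import (
	"crypto/cipher"
	"encoding/binary"
)

// counterCryptRef is an illustrative reference for the semantics that
// counterCryptASM must preserve: generate keystream by encrypting the
// counter, XOR it with the input, and increment the rightmost 32 bits
// of the counter as a big-endian value (what gcmInc32 does in Go).
func counterCryptRef(b cipher.Block, out, in []byte, counter *[16]byte) {
	var mask [16]byte
	for len(in) > 0 {
		b.Encrypt(mask[:], counter[:])
		n := binary.BigEndian.Uint32(counter[12:])
		binary.BigEndian.PutUint32(counter[12:], n+1)
		// XOR up to one block of keystream with the input.
		k := len(in)
		if k > len(mask) {
			k = len(mask)
		}
		for i := 0; i < k; i++ {
			out[i] = in[i] ^ mask[i]
		}
		out, in = out[k:], in[k:]
	}
}

The assembly below starts by defining the endianness- and ISA-specific load and store macros that the rest of the routine builds on.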
+#ifdef GOARCH_ppc64le
+# ifdef GOPPC64_power9
+#define P8_LXVB16X(RA,RB,VT)  LXVB16X (RA)(RB), VT
+#define P8_STXVB16X(VS,RA,RB) STXVB16X VS, (RA)(RB)
+# else
+#define NEEDS_ESPERM
+#define P8_LXVB16X(RA,RB,VT) \
+	LXVD2X (RA+RB), VT; \
+	VPERM VT, VT, ESPERM, VT
+
+#define P8_STXVB16X(VS,RA,RB) \
+	VPERM VS, VS, ESPERM, TMP2; \
+	STXVD2X TMP2, (RA+RB)
+
+# endif
+#else
+#define P8_LXVB16X(RA,RB,VT) \
+	LXVD2X (RA+RB), VT
+
+#define P8_STXVB16X(VS,RA,RB) \
+	STXVD2X VS, (RA+RB)
+
+#endif
+
+#define MASK_PTR R8
+
+#define MASKV V0
+#define INV V1
+
+// The following macros are used for
+// the stitched implementation within
+// counterCryptASM.
+
+// Load the initial GCM counter value
+// in V30 and set up the counter increment
+// in V31.
+#define SETUP_COUNTER \
+	P8_LXVB16X(COUNTER, R0, V30); \
+	VSPLTISB $1, V28; \
+	VXOR V31, V31, V31; \
+	VSLDOI $1, V31, V28, V31
+
+// These macros set up the initial value
+// for a single encryption, or 4 or 8
+// stitched encryptions implemented
+// with interleaving vciphers.
+//
+// The input value for each encryption
+// is generated by XORing the counter
+// from V30 with the first key in VS0
+// and incrementing the counter.
+//
+// Single encryption in V15
+#define GEN_VCIPHER_INPUT \
+	XXLOR VS0, VS0, V29; \
+	VXOR V30, V29, V15; \
+	VADDUWM V30, V31, V30
+
+// 4 encryptions in V15 - V18
+#define GEN_VCIPHER_4_INPUTS \
+	XXLOR VS0, VS0, V29; \
+	VXOR V30, V29, V15; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V16; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V17; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V18; \
+	VADDUWM V30, V31, V30
+
+// 8 encryptions in V15 - V22
+#define GEN_VCIPHER_8_INPUTS \
+	XXLOR VS0, VS0, V29; \
+	VXOR V30, V29, V15; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V16; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V17; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V18; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V19; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V20; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V21; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V22; \
+	VADDUWM V30, V31, V30
+
+// Load the keys to be used for
+// encryption based on key_len.
+// Keys are in VS0 - VS14
+// depending on key_len.
+// Valid key sizes are verified here;
+// an invalid size forces a crash (store
+// to address 0). CR2 is set and used
+// throughout to check key_len.
+#define LOAD_KEYS(blk_key, key_len) \
+	MOVD $16, R16; \
+	MOVD $32, R17; \
+	MOVD $48, R18; \
+	MOVD $64, R19; \
+	LXVD2X (blk_key)(R0), VS0; \
+	LXVD2X (blk_key)(R16), VS1; \
+	LXVD2X (blk_key)(R17), VS2; \
+	LXVD2X (blk_key)(R18), VS3; \
+	LXVD2X (blk_key)(R19), VS4; \
+	ADD $64, R16; \
+	ADD $64, R17; \
+	ADD $64, R18; \
+	ADD $64, R19; \
+	LXVD2X (blk_key)(R16), VS5; \
+	LXVD2X (blk_key)(R17), VS6; \
+	LXVD2X (blk_key)(R18), VS7; \
+	LXVD2X (blk_key)(R19), VS8; \
+	ADD $64, R16; \
+	ADD $64, R17; \
+	ADD $64, R18; \
+	ADD $64, R19; \
+	LXVD2X (blk_key)(R16), VS9; \
+	LXVD2X (blk_key)(R17), VS10; \
+	CMP key_len, $12, CR2; \
+	CMP key_len, $10; \
+	BEQ keysLoaded; \
+	LXVD2X (blk_key)(R18), VS11; \
+	LXVD2X (blk_key)(R19), VS12; \
+	BEQ CR2, keysLoaded; \
+	ADD $64, R16; \
+	ADD $64, R17; \
+	LXVD2X (blk_key)(R16), VS13; \
+	LXVD2X (blk_key)(R17), VS14; \
+	CMP key_len, $14; \
+	BEQ keysLoaded; \
+	MOVD R0,0(R0); \
+keysLoaded:
+
+// Encrypt 1 value (vin) with the first 9
+// keys from VS1 - VS9.
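Before the per-block encryption macros, it is worth spelling out how the CR2 comparisons set up in LOAD_KEYS map onto AES key sizes: AES-128/192/256 use 10/12/14 rounds, and the expanded schedule holds 4*(rounds+1) 32-bit words, which is why the Go side passes len(enc)/4 - 1 as the round count. A small Go sketch of that mapping (the helper name roundsForKey is hypothetical, not part of the patch):

package main

import "fmt"

// roundsForKey mirrors the key-length checks in LOAD_KEYS: 10, 12, or 14
// rounds, with 4*(rounds+1) expanded key words; anything else is invalid
// (the asm forces a crash via a store to address 0).
func roundsForKey(keyBytes int) (rounds, encWords int) {
	switch keyBytes {
	case 16:
		rounds = 10
	case 24:
		rounds = 12
	case 32:
		rounds = 14
	default:
		panic("invalid AES key size")
	}
	return rounds, 4 * (rounds + 1)
}

func main() {
	for _, k := range []int{16, 24, 32} {
		nr, words := roundsForKey(k)
		fmt.Printf("key=%d bytes: rounds=%d, expanded key words=%d\n", k, nr, words)
	}
}

Since ten rounds are always present and only the tail of the schedule differs, the VCIPHER_1X9_KEYS macro below applies the first nine round keys unconditionally, with the remaining rounds selected by branches on CR2.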
+#define VCIPHER_1X9_KEYS(vin) \ + XXLOR VS1, VS1, V23; \ + XXLOR VS2, VS2, V24; \ + XXLOR VS3, VS3, V25; \ + XXLOR VS4, VS4, V26; \ + XXLOR VS5, VS5, V27; \ + VCIPHER vin, V23, vin; \ + VCIPHER vin, V24, vin; \ + VCIPHER vin, V25, vin; \ + VCIPHER vin, V26, vin; \ + VCIPHER vin, V27, vin; \ + XXLOR VS6, VS6, V23; \ + XXLOR VS7, VS7, V24; \ + XXLOR VS8, VS8, V25; \ + XXLOR VS9, VS9, V26; \ + VCIPHER vin, V23, vin; \ + VCIPHER vin, V24, vin; \ + VCIPHER vin, V25, vin; \ + VCIPHER vin, V26, vin + +// Encrypt 1 value (vin) with +// 2 specified keys +#define VCIPHER_1X2_KEYS(vin, key1, key2) \ + XXLOR key1, key1, V25; \ + XXLOR key2, key2, V26; \ + VCIPHER vin, V25, vin; \ + VCIPHER vin, V26, vin + +// Encrypt 4 values in V15 - V18 +// with the specified key from +// VS1 - VS9. +#define VCIPHER_4X1_KEY(key) \ + XXLOR key, key, V23; \ + VCIPHER V15, V23, V15; \ + VCIPHER V16, V23, V16; \ + VCIPHER V17, V23, V17; \ + VCIPHER V18, V23, V18 + +// Encrypt 8 values in V15 - V22 +// with the specified key, +// assuming it is a VSreg +#define VCIPHER_8X1_KEY(key) \ + XXLOR key, key, V23; \ + VCIPHER V15, V23, V15; \ + VCIPHER V16, V23, V16; \ + VCIPHER V17, V23, V17; \ + VCIPHER V18, V23, V18; \ + VCIPHER V19, V23, V19; \ + VCIPHER V20, V23, V20; \ + VCIPHER V21, V23, V21; \ + VCIPHER V22, V23, V22 + +// Load input block into V1-V4 +// in big endian order and +// update blk_inp by 64. +#define LOAD_INPUT_BLOCK64(blk_inp) \ + MOVD $16, R16; \ + MOVD $32, R17; \ + MOVD $48, R18; \ + P8_LXVB16X(blk_inp,R0,V1); \ + P8_LXVB16X(blk_inp,R16,V2); \ + P8_LXVB16X(blk_inp,R17,V3); \ + P8_LXVB16X(blk_inp,R18,V4); \ + ADD $64, blk_inp + +// Load input block into V1-V8 +// in big endian order and +// Update blk_inp by 128 +#define LOAD_INPUT_BLOCK128(blk_inp) \ + MOVD $16, R16; \ + MOVD $32, R17; \ + MOVD $48, R18; \ + MOVD $64, R19; \ + MOVD $80, R20; \ + MOVD $96, R21; \ + MOVD $112, R22; \ + P8_LXVB16X(blk_inp,R0,V1); \ + P8_LXVB16X(blk_inp,R16,V2); \ + P8_LXVB16X(blk_inp,R17,V3); \ + P8_LXVB16X(blk_inp,R18,V4); \ + P8_LXVB16X(blk_inp,R19,V5); \ + P8_LXVB16X(blk_inp,R20,V6); \ + P8_LXVB16X(blk_inp,R21,V7); \ + P8_LXVB16X(blk_inp,R22,V8); \ + ADD $128, blk_inp + +// Finish encryption on 8 streams and +// XOR with input block +#define VCIPHERLAST8_XOR_INPUT \ + VCIPHERLAST V15, V23, V15; \ + VCIPHERLAST V16, V23, V16; \ + VCIPHERLAST V17, V23, V17; \ + VCIPHERLAST V18, V23, V18; \ + VCIPHERLAST V19, V23, V19; \ + VCIPHERLAST V20, V23, V20; \ + VCIPHERLAST V21, V23, V21; \ + VCIPHERLAST V22, V23, V22; \ + XXLXOR V1, V15, V1; \ + XXLXOR V2, V16, V2; \ + XXLXOR V3, V17, V3; \ + XXLXOR V4, V18, V4; \ + XXLXOR V5, V19, V5; \ + XXLXOR V6, V20, V6; \ + XXLXOR V7, V21, V7; \ + XXLXOR V8, V22, V8 + +// Finish encryption on 4 streams and +// XOR with input block +#define VCIPHERLAST4_XOR_INPUT \ + VCIPHERLAST V15, V23, V15; \ + VCIPHERLAST V16, V23, V16; \ + VCIPHERLAST V17, V23, V17; \ + VCIPHERLAST V18, V23, V18; \ + XXLXOR V1, V15, V1; \ + XXLXOR V2, V16, V2; \ + XXLXOR V3, V17, V3; \ + XXLXOR V4, V18, V4 + +// Store output block from V1-V8 +// in big endian order and +// Update blk_out by 128 +#define STORE_OUTPUT_BLOCK128(blk_out) \ + P8_STXVB16X(V1,blk_out,R0); \ + P8_STXVB16X(V2,blk_out,R16); \ + P8_STXVB16X(V3,blk_out,R17); \ + P8_STXVB16X(V4,blk_out,R18); \ + P8_STXVB16X(V5,blk_out,R19); \ + P8_STXVB16X(V6,blk_out,R20); \ + P8_STXVB16X(V7,blk_out,R21); \ + P8_STXVB16X(V8,blk_out,R22); \ + ADD $128, blk_out + +// Store output block from V1-V4 +// in big endian order and +// Update blk_out by 64 +#define 
STORE_OUTPUT_BLOCK64(blk_out) \
+	P8_STXVB16X(V1,blk_out,R0); \
+	P8_STXVB16X(V2,blk_out,R16); \
+	P8_STXVB16X(V3,blk_out,R17); \
+	P8_STXVB16X(V4,blk_out,R18); \
+	ADD $64, blk_out
+
 // func gcmInit(productTable *[256]byte, h []byte)
 TEXT ·gcmInit(SB), NOSPLIT, $0-32
 	MOVD productTable+0(FP), XIP
@@ -588,3 +878,226 @@ TEXT ·gcmMul(SB), NOSPLIT, $0-32
 #endif
 	STXVD2X VXL, (XIP+R0) // write out Xi
 	RET
+
+#define BLK_INP R3
+#define BLK_OUT R4
+#define BLK_KEY R5
+#define KEY_LEN R6
+#define BLK_IDX R7
+#define IDX R8
+#define IN_LEN R9
+#define COUNTER R10
+#define CONPTR R14
+#define MASK V5
+
+// Implementation of the counterCrypt function in assembler.
+// The original loop is unrolled to allow multiple encryption
+// streams to be done in parallel, which is achieved by interleaving
+// vcipher instructions from each stream. This is also referred to as
+// stitching, and provides significant performance improvements.
+// Macros are defined to enable execution on big or little
+// endian targets as well as different ISA levels.
+//func (g *gcmAsm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte, key[gcmBlockSize]uint32)
+//func counterCryptASM(nr, out, in, counter, key)
+TEXT ·counterCryptASM(SB), NOSPLIT, $16-72
+	MOVD nr+0(FP), KEY_LEN
+	MOVD out+8(FP), BLK_OUT
+	MOVD out_len+16(FP), R8
+	MOVD in+32(FP), BLK_INP
+	MOVD in_len+40(FP), IN_LEN
+	MOVD counter+56(FP), COUNTER
+	MOVD key+64(FP), BLK_KEY
+
+// Set up permute string when needed.
+#ifdef NEEDS_ESPERM
+	MOVD $·rcon(SB), R14
+	LVX (R14), ESPERM		// Permute value for P8_ macros.
+#endif
+	SETUP_COUNTER			// V30 = counter, V31 = BE increment {0, 0, 0, 1}
+	LOAD_KEYS(BLK_KEY, KEY_LEN)	// VS0 - VS10/12/14 based on key size
+	CMP IN_LEN, $128
+	BLT block64
+block128_loop:
+	// Do 8 encryptions in parallel by setting
+	// input values in V15-V22 and executing
+	// vcipher on the updated value and the keys.
+	GEN_VCIPHER_8_INPUTS
+	VCIPHER_8X1_KEY(VS1)
+	VCIPHER_8X1_KEY(VS2)
+	VCIPHER_8X1_KEY(VS3)
+	VCIPHER_8X1_KEY(VS4)
+	VCIPHER_8X1_KEY(VS5)
+	VCIPHER_8X1_KEY(VS6)
+	VCIPHER_8X1_KEY(VS7)
+	VCIPHER_8X1_KEY(VS8)
+	VCIPHER_8X1_KEY(VS9)
+	// Additional encryptions are done based on
+	// the key length, with the last key moved
+	// to V23 for use with VCIPHERLAST.
+	// CR2 = CMP key_len, $12
+	XXLOR VS10, VS10, V23
+	BLT CR2, block128_last		// key_len = 10
+	VCIPHER_8X1_KEY(VS10)
+	VCIPHER_8X1_KEY(VS11)
+	XXLOR VS12, VS12, V23
+	BEQ CR2, block128_last		// key_len = 12
+	VCIPHER_8X1_KEY(VS12)
+	VCIPHER_8X1_KEY(VS13)
+	XXLOR VS14, VS14, V23		// key_len = 14
+block128_last:
+	// vcipher encryptions are in V15-V22 at this
+	// point with vcipherlast remaining to be done.
+	// Load input block into V1-V8, setting index offsets
+	// in R16-R22 to use with the STORE.
+	LOAD_INPUT_BLOCK128(BLK_INP)
+	// Do VCIPHERLAST on the last key for each encryption
+	// stream and XOR the result with the corresponding
+	// value from the input block.
+	VCIPHERLAST8_XOR_INPUT
+	// Store the results (8*16) and update BLK_OUT by 128.
+	STORE_OUTPUT_BLOCK128(BLK_OUT)
+	ADD $-128, IN_LEN		// decrement input size
+	CMP IN_LEN, $128		// check if >= blocksize
+	BGE block128_loop		// next input block
+	CMP IN_LEN, $0
+	BEQ done
+block64:
+	CMP IN_LEN, $64			// Check if >= 64
+	BLT block16_loop
+	// Do 4 encryptions in parallel by setting
+	// input values in V15-V18 and executing
+	// vcipher on the updated value and the keys.
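This 64-byte path repeats the pattern of the 128-byte loop with four interleaved streams, and the single-block and partial-block paths further down handle whatever remains. Taken as a whole, the control flow of counterCryptASM corresponds to the following Go-level outline (a sketch only; the encryptN helper names are hypothetical stand-ins for the interleaved vcipher sequences):

package gcmsketch

// counterCryptOutline sketches the dispatch structure of counterCryptASM:
// descending block granularity, from 8 stitched streams down to a partial
// trailing block.
func counterCryptOutline(out, in []byte) {
	for len(in) >= 128 { // block128_loop: eight blocks per iteration
		encrypt8Blocks(out[:128], in[:128])
		out, in = out[128:], in[128:]
	}
	if len(in) >= 64 { // block64: at most one group of four blocks
		encrypt4Blocks(out[:64], in[:64])
		out, in = out[64:], in[64:]
	}
	for len(in) >= 16 { // block16_loop: one block at a time
		encrypt1Block(out[:16], in[:16])
		out, in = out[16:], in[16:]
	}
	if len(in) > 0 { // final_block: partial trailing block
		encryptPartial(out, in)
	}
}

func encrypt8Blocks(out, in []byte) {}
func encrypt4Blocks(out, in []byte) {}
func encrypt1Block(out, in []byte)  {}
func encryptPartial(out, in []byte) {}

Note that the 4-block group can run at most once per call, since control only reaches block64 when fewer than 128 bytes remain.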
+	GEN_VCIPHER_4_INPUTS
+	VCIPHER_4X1_KEY(VS1)
+	VCIPHER_4X1_KEY(VS2)
+	VCIPHER_4X1_KEY(VS3)
+	VCIPHER_4X1_KEY(VS4)
+	VCIPHER_4X1_KEY(VS5)
+	VCIPHER_4X1_KEY(VS6)
+	VCIPHER_4X1_KEY(VS7)
+	VCIPHER_4X1_KEY(VS8)
+	VCIPHER_4X1_KEY(VS9)
+	// Check key length based on CR2.
+	// Move last key to V23 for use with later vcipherlast.
+	XXLOR VS10, VS10, V23
+	BLT CR2, block64_last		// size = 10
+	VCIPHER_4X1_KEY(VS10)		// Encrypt with next 2 keys
+	VCIPHER_4X1_KEY(VS11)
+	XXLOR VS12, VS12, V23
+	BEQ CR2, block64_last		// size = 12
+	VCIPHER_4X1_KEY(VS12)		// Encrypt with last 2 keys
+	VCIPHER_4X1_KEY(VS13)
+	XXLOR VS14, VS14, V23		// size = 14
+block64_last:
+	LOAD_INPUT_BLOCK64(BLK_INP)	// Load 64 bytes of input
+	// Do VCIPHERLAST on the last key for each encryption
+	// stream and XOR the result with the corresponding
+	// value from the input block.
+	VCIPHERLAST4_XOR_INPUT
+	// Store the results (4*16) and update BLK_OUT by 64.
+	STORE_OUTPUT_BLOCK64(BLK_OUT)
+	ADD $-64, IN_LEN		// decrement input block length
+	CMP IN_LEN, $0			// check for remaining length
+	BEQ done
+block16_loop:
+	CMP IN_LEN, $16			// More input?
+	BLT final_block			// If not, then handle partial block
+	// Single encryption, no stitching
+	GEN_VCIPHER_INPUT		// Generate input value for single encryption
+	VCIPHER_1X9_KEYS(V15)		// Encrypt V15 value with 9 keys
+	XXLOR VS10, VS10, V23		// Last key -> V23 for later vcipherlast
+	// Key length based on CR2. (LT=10, EQ=12, GT=14)
+	BLT CR2, block16_last		// Finish for key size 10
+	VCIPHER_1X2_KEYS(V15, VS10, VS11)	// Encrypt V15 with 2 more keys
+	XXLOR VS12, VS12, V23		// Last key -> V23 for later vcipherlast
+	BEQ CR2, block16_last		// Finish for key size 12
+	VCIPHER_1X2_KEYS(V15, VS12, VS13)	// Encrypt V15 with last 2 keys
+	XXLOR VS14, VS14, V23		// Last key -> V23 for vcipherlast with key size 14
+block16_last:
+	P8_LXVB16X(BLK_INP, R0, V1)	// Load input
+	VCIPHERLAST V15, V23, V15	// Last encryption round with last key in V23
+	XXLXOR V15, V1, V1		// XOR with input
+	P8_STXVB16X(V1,R0,BLK_OUT)	// Store final encryption value to output
+	ADD $16, BLK_INP		// Increment input pointer
+	ADD $16, BLK_OUT		// Increment output pointer
+	ADD $-16, IN_LEN		// Decrement input length
+	BR block16_loop			// Check for next block
+final_block:
+	CMP IN_LEN, $0
+	BEQ done
+	GEN_VCIPHER_INPUT		// Generate input value for partial encryption
+	VCIPHER_1X9_KEYS(V15)		// Encrypt V15 with 9 keys
+	XXLOR VS10, VS10, V23		// Save possible last key
+	BLT CR2, final_block_last
+	VCIPHER_1X2_KEYS(V15, VS10, VS11)	// Encrypt V15 with next 2 keys
+	XXLOR VS12, VS12, V23		// Save possible last key
+	BEQ CR2, final_block_last
+	VCIPHER_1X2_KEYS(V15, VS12, VS13)	// Encrypt V15 with last 2 keys
+	XXLOR VS14, VS14, V23		// Save last key
+final_block_last:
+	VCIPHERLAST V15, V23, V15	// Finish encryption
+#ifdef GOPPC64_power10
+	// Set up length for LXVLL/STXVLL (length in the high-order byte)
+	SLD $56, IN_LEN, R17
+	LXVLL BLK_INP, R17, V25
+	VXOR V25, V15, V25
+	STXVLL V25, BLK_OUT, R17
+#else
+	ADD $32, R1, MASK_PTR
+	MOVD $0, R16
+	P8_STXVB16X(V15, MASK_PTR, R0)
+	CMP IN_LEN, $8
+	BLT next4
+	MOVD 0(MASK_PTR), R14
+	MOVD 0(BLK_INP), R15
+	XOR R14, R15, R14
+	MOVD R14, 0(BLK_OUT)
+	ADD $8, R16
+	ADD $-8, IN_LEN
+next4:
+	CMP IN_LEN, $4
+	BLT next2
+	MOVWZ (BLK_INP)(R16), R15
+	MOVWZ (MASK_PTR)(R16), R14
+	XOR R14, R15, R14
+	MOVW R14, (R16)(BLK_OUT)
+	ADD $4, R16
+	ADD $-4, IN_LEN
+next2:
+	CMP IN_LEN, $2
+	BLT next1
+	MOVHZ (BLK_INP)(R16), R15
+	MOVHZ (MASK_PTR)(R16), R14
+	XOR R14, R15, R14
+	MOVH R14, (R16)(BLK_OUT)
+	ADD $2, R16
+	ADD $-2, IN_LEN
+next1:
+	CMP IN_LEN, $1
+	BLT done
+	MOVBZ (MASK_PTR)(R16), R14
+ MOVBZ (BLK_INP)(R16), R15 + XOR R14, R15, R14 + MOVB R14, (R16)(BLK_OUT) +#endif +done: + // Save the updated counter value + P8_STXVB16X(V30, COUNTER, R0) + // Clear the keys + XXLXOR VS0, VS0, VS0 + XXLXOR VS1, VS1, VS1 + XXLXOR VS2, VS2, VS2 + XXLXOR VS3, VS3, VS3 + XXLXOR VS4, VS4, VS4 + XXLXOR VS5, VS5, VS5 + XXLXOR VS6, VS6, VS6 + XXLXOR VS7, VS7, VS7 + XXLXOR VS8, VS8, VS8 + XXLXOR VS9, VS9, VS9 + XXLXOR VS10, VS10, VS10 + XXLXOR VS11, VS11, VS11 + XXLXOR VS12, VS12, VS12 + XXLXOR VS13, VS13, VS13 + XXLXOR VS14, VS14, VS14 + RET +
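For the trailing partial block, POWER10 uses the length-controlled LXVLL/STXVLL load and store (with the byte count shifted into the high-order byte of R17), while earlier ISAs spill the encrypted counter block to a 16-byte stack buffer and XOR it with the remaining input in descending 8/4/2/1-byte chunks. A rough Go rendering of that pre-POWER10 tail, a sketch only with the hypothetical helper name xorTail, is:

package gcmsketch

import "encoding/binary"

// xorTail mirrors the MOVD/MOVWZ/MOVHZ/MOVBZ sequence at final_block:
// mask holds the encrypted counter block (the stack buffer addressed via
// MASK_PTR), and the remaining len(in) < 16 bytes are XORed in 8/4/2/1-byte
// chunks. Using the same byte order on both operands makes the XOR
// endianness-neutral, just as the native-width loads are in the asm.
func xorTail(out, in []byte, mask *[16]byte) {
	i, n := 0, len(in) // n < 16
	if n >= 8 {
		binary.LittleEndian.PutUint64(out[i:], binary.LittleEndian.Uint64(in[i:])^binary.LittleEndian.Uint64(mask[i:]))
		i, n = i+8, n-8
	}
	if n >= 4 {
		binary.LittleEndian.PutUint32(out[i:], binary.LittleEndian.Uint32(in[i:])^binary.LittleEndian.Uint32(mask[i:]))
		i, n = i+4, n-4
	}
	if n >= 2 {
		binary.LittleEndian.PutUint16(out[i:], binary.LittleEndian.Uint16(in[i:])^binary.LittleEndian.Uint16(mask[i:]))
		i, n = i+2, n-2
	}
	if n == 1 {
		out[i] = in[i] ^ mask[i]
	}
}

After the tail is handled, the routine stores the updated counter back through COUNTER and clears the round keys from VS0-VS14 before returning, as shown in the done block above.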