mirror of https://github.com/golang/go.git
crypto/aes: simplify key expansion in ppc64le asm
The ported CRYPTOGAMS implementation uses a subtle and tricky mechanism built on lvx/vperm/lvsl to load unaligned vectors. This is difficult to read, and may read and write unrelated bytes when accessing an unaligned address. Instead, POWER8 instructions can be used to load from unaligned memory with much less overhead. Alignment interrupts only occur when reading or writing cache-inhibited memory, which we assume isn't used in Go today; otherwise, alignment penalties are usually marginal. Specifically, lxvd2x+xxpermdi and xxpermdi+stxvd2x can be used to emulate unaligned LE bytewise loads and stores, similar to lxv/stxv on POWER9 in little-endian mode. Likewise, a custom permute vector is used to emulate the POWER9 BE bytewise storage operations, lxvb16x/stxvb16x. This greatly simplifies the code, and it makes it much easier to store the keys in reverse (which is exactly how the decrypt keys are expected to be stored). Change-Id: I2334337e31a8fdf8d13ba96231142a039f237098 Reviewed-on: https://go-review.googlesource.com/c/go/+/395494 Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com> Trust: Paul Murphy <murp@ibm.com> Run-TryBot: Paul Murphy <murp@ibm.com> TryBot-Result: Gopher Robot <gobot@golang.org>
This commit is contained in:
parent
6f6942ef7a
commit
8d581f589e
|
|
@ -43,6 +43,10 @@
|
|||
#define OUTHEAD V10
|
||||
#define OUTTAIL V11
|
||||
|
||||
// For P9 instruction emulation
|
||||
#define ESPERM V21 // Endian swapping permute into BE
|
||||
#define TMP2 V22 // Temporary for P8_STXVB16X/P8_STXV
|
||||
|
||||
// For {en,de}cryptBlockAsm
|
||||
#define BLK_INP R3
|
||||
#define BLK_OUT R4
|
||||
|
|
@ -50,15 +54,38 @@
|
|||
#define BLK_ROUNDS R6
|
||||
#define BLK_IDX R7
|
||||
|
||||
DATA ·rcon+0x00(SB)/8, $0x0100000001000000 // RCON
|
||||
DATA ·rcon+0x08(SB)/8, $0x0100000001000000 // RCON
|
||||
DATA ·rcon+0x10(SB)/8, $0x1b0000001b000000
|
||||
DATA ·rcon+0x18(SB)/8, $0x1b0000001b000000
|
||||
DATA ·rcon+0x20(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
|
||||
DATA ·rcon+0x28(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
|
||||
DATA ·rcon+0x30(SB)/8, $0x0000000000000000
|
||||
DATA ·rcon+0x38(SB)/8, $0x0000000000000000
|
||||
GLOBL ·rcon(SB), RODATA, $64
|
||||
// ·rcon constant table, 80 bytes of read-only data, in 16-byte rows:
//   0x00-0x0f  byte permute that doEncryptKeyAsm loads into ESPERM
//   0x10-0x1f  RCON: 0x01 in the top byte of every 32-bit word
//   0x20-0x2f  0x1b in the top byte of every 32-bit word
//   0x30-0x3f  MASK: the 0x0d0e0f0c permute pattern repeated in every word
//   0x40-0x4f  all zeros
// NOTE(review): the consumers of rows 0x20 and 0x40 are outside this view;
// confirm their use before relying on this description.
DATA ·rcon+0x00(SB)/8, $0x0f0e0d0c0b0a0908 // Permute for vector doubleword endian swap
|
||||
DATA ·rcon+0x08(SB)/8, $0x0706050403020100
|
||||
DATA ·rcon+0x10(SB)/8, $0x0100000001000000 // RCON
|
||||
DATA ·rcon+0x18(SB)/8, $0x0100000001000000 // RCON
|
||||
DATA ·rcon+0x20(SB)/8, $0x1b0000001b000000
|
||||
DATA ·rcon+0x28(SB)/8, $0x1b0000001b000000
|
||||
DATA ·rcon+0x30(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
|
||||
DATA ·rcon+0x38(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
|
||||
DATA ·rcon+0x40(SB)/8, $0x0000000000000000
|
||||
DATA ·rcon+0x48(SB)/8, $0x0000000000000000
|
||||
GLOBL ·rcon(SB), RODATA, $80
|
||||
|
||||
// Emulate unaligned BE vector load/stores on LE targets
|
||||
// P8_LXVB16X(RA,RB,VT) is the POWER8 stand-in for POWER9's lxvb16x.
// It loads the 16 bytes at (RA+RB) into VT with LXVD2X, then permutes VT
// through ESPERM (the endian-swapping permute) so VT ends up in BE byte order.
// ESPERM must already hold the permute vector. VT is the only register written.
#define P8_LXVB16X(RA,RB,VT) \
|
||||
LXVD2X (RA+RB), VT \
|
||||
VPERM VT, VT, ESPERM, VT
|
||||
|
||||
// P8_STXVB16X(VS,RA,RB) is the POWER8 stand-in for POWER9's stxvb16x.
// It permutes VS through ESPERM into the scratch register TMP2, then stores
// TMP2 to (RA+RB) with STXVD2X. VS itself is not modified; TMP2 is clobbered.
// ESPERM must already hold the endian-swapping permute vector.
#define P8_STXVB16X(VS,RA,RB) \
|
||||
VPERM VS, VS, ESPERM, TMP2 \
|
||||
STXVD2X TMP2, (RA+RB)
|
||||
|
||||
// P8_STXV(VS,RA,RB) is the POWER8 stand-in for POWER9's stxv in LE mode.
// XXPERMDI with selector $2 writes a doubleword-swapped copy of VS into TMP2,
// and STXVD2X then stores TMP2 to (RA+RB).
// VS itself is not modified; TMP2 is clobbered. ESPERM is not needed here.
#define P8_STXV(VS,RA,RB) \
|
||||
XXPERMDI VS, VS, $2, TMP2 \
|
||||
STXVD2X TMP2, (RA+RB)
|
||||
|
||||
// P8_LXV(RA,RB,VT) is the POWER8 stand-in for POWER9's lxv in LE mode.
// LXVD2X loads the 16 bytes at (RA+RB) into VT, and XXPERMDI with selector $2
// then swaps VT's two doublewords in place.
// VT is the only register written; no scratch register or ESPERM is used.
#define P8_LXV(RA,RB,VT) \
|
||||
LXVD2X (RA+RB), VT \
|
||||
XXPERMDI VT, VT, $2, VT
|
||||
|
||||
// LXSDX_BE(RA,RB,VT) loads a single doubleword from (RA+RB) into VT with
// LXSDX, then permutes VT through ESPERM to put the loaded bytes in BE order.
// ESPERM must already hold the endian-swapping permute vector.
// VT is the only register written.
#define LXSDX_BE(RA,RB,VT) \
|
||||
LXSDX (RA+RB), VT \
|
||||
VPERM VT, VT, ESPERM, VT
|
||||
|
||||
// func setEncryptKeyAsm(key *byte, keylen int, enc *uint32) int
|
||||
TEXT ·setEncryptKeyAsm(SB), NOSPLIT|NOFRAME, $0
|
||||
|
|
@ -87,45 +114,32 @@ TEXT ·doEncryptKeyAsm(SB), NOSPLIT|NOFRAME, $0
|
|||
BC 0x06, 2, enc_key_abort // bne- .Lenc_key_abort
|
||||
|
||||
MOVD $·rcon(SB), PTR // PTR point to rcon addr
|
||||
LVX (PTR), ESPERM
|
||||
ADD $0x10, PTR
|
||||
|
||||
// Get key from memory and write aligned into VR
|
||||
NEG INP, R9 // neg 9,3 R9 is ~INP + 1
|
||||
LVX (INP)(R0), IN0 // lvx 1,0,3 Load key inside IN0
|
||||
ADD $15, INP, INP // addi 3,3,15 Add 15B to INP addr
|
||||
LVSR (R9)(R0), KEY // lvsr 3,0,9
|
||||
P8_LXVB16X(INP, R0, IN0)
|
||||
ADD $0x10, INP, INP
|
||||
MOVD $0x20, R8 // li 8,0x20 R8 = 32
|
||||
|
||||
CMPW BITS, $192 // cmpwi 4,192 Key size == 192?
|
||||
LVX (INP)(R0), IN1 // lvx 2,0,3
|
||||
VSPLTISB $0x0f, MASK// vspltisb 5,0x0f 0x0f0f0f0f... mask
|
||||
LVX (PTR)(R0), RCON // lvx 4,0,6 Load first 16 bytes into RCON
|
||||
VXOR KEY, MASK, KEY // vxor 3,3,5 Adjust for byte swap
|
||||
LVX (PTR)(R8), MASK // lvx 5,8,6
|
||||
ADD $0x10, PTR, PTR // addi 6,6,0x10 PTR to next 16 bytes of RCON
|
||||
VPERM IN0, IN1, KEY, IN0 // vperm 1,1,2,3 Align
|
||||
MOVD $8, CNT // li 7,8 CNT = 8
|
||||
VXOR ZERO, ZERO, ZERO // vxor 0,0,0 Zero to be zero :)
|
||||
MOVD CNT, CTR // mtctr 7 Set the counter to 8 (rounds)
|
||||
|
||||
LVSL (OUT)(R0), OUTPERM // lvsl 8,0,5
|
||||
VSPLTISB $-1, OUTMASK // vspltisb 9,-1
|
||||
LVX (OUT)(R0), OUTHEAD // lvx 10,0,5
|
||||
VPERM OUTMASK, ZERO, OUTPERM, OUTMASK // vperm 9,9,0,8
|
||||
|
||||
BLT loop128 // blt .Loop128
|
||||
ADD $8, INP, INP // addi 3,3,8
|
||||
BEQ l192 // beq .L192
|
||||
ADD $8, INP, INP // addi 3,3,8
|
||||
JMP l256 // b .L256
|
||||
|
||||
loop128:
|
||||
// Key schedule (Round 1 to 8)
|
||||
VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat
|
||||
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
|
||||
VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8 Rotate
|
||||
VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
|
||||
VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
|
||||
P8_STXV(IN0, R0, OUT)
|
||||
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
|
||||
STVX STAGE, (OUT+R0) // stvx 7,0,5 Write to output
|
||||
ADD $16, OUT, OUT // addi 5,5,16 Point to the next round
|
||||
|
||||
VXOR IN0, TMP, IN0 // vxor 1,1,6
|
||||
|
|
@ -142,11 +156,8 @@ loop128:
|
|||
// Key schedule (Round 9)
|
||||
VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat
|
||||
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
|
||||
VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8 Rotate
|
||||
VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
|
||||
VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
|
||||
P8_STXV(IN0, R0, OUT)
|
||||
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
|
||||
STVX STAGE, (OUT+R0) // stvx 7,0,5 Round 9
|
||||
ADD $16, OUT, OUT // addi 5,5,16
|
||||
|
||||
// Key schedule (Round 10)
|
||||
|
|
@ -160,11 +171,8 @@ loop128:
|
|||
|
||||
VPERM IN0, IN0, MASK, KEY // vperm 3,1,1,5 Rotate-n-splat
|
||||
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
|
||||
VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8 Rotate
|
||||
VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
|
||||
VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
|
||||
P8_STXV(IN0, R0, OUT)
|
||||
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
|
||||
STVX STAGE, (OUT+R0) // stvx 7,0,5 Round 10
|
||||
ADD $16, OUT, OUT // addi 5,5,16
|
||||
|
||||
// Key schedule (Round 11)
|
||||
|
|
@ -174,26 +182,18 @@ loop128:
|
|||
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
|
||||
VXOR IN0, TMP, IN0 // vxor 1,1,6
|
||||
VXOR IN0, KEY, IN0 // vxor 1,1,3
|
||||
VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
|
||||
VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
|
||||
VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
|
||||
STVX STAGE, (OUT+R0) // stvx 7,0,5 Round 11
|
||||
P8_STXV(IN0, R0, OUT)
|
||||
|
||||
ADD $15, OUT, INP // addi 3,5,15
|
||||
ADD $0x50, OUT, OUT // addi 5,5,0x50
|
||||
|
||||
MOVD $10, ROUNDS // li 8,10
|
||||
JMP done // b .Ldone
|
||||
|
||||
l192:
|
||||
LVX (INP)(R0), TMP // lvx 6,0,3
|
||||
LXSDX_BE(INP, R0, IN1) // Load next 8 bytes into upper half of VSR in BE order.
|
||||
MOVD $4, CNT // li 7,4
|
||||
VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
|
||||
VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
|
||||
VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
|
||||
STVX STAGE, (OUT+R0) // stvx 7,0,5
|
||||
P8_STXV(IN0, R0, OUT)
|
||||
ADD $16, OUT, OUT // addi 5,5,16
|
||||
VPERM IN1, TMP, KEY, IN1 // vperm 2,2,6,3
|
||||
VSPLTISB $8, KEY // vspltisb 3,8
|
||||
MOVD CNT, CTR // mtctr 7
|
||||
VSUBUBM MASK, KEY, MASK // vsububm 5,5,3
|
||||
|
|
@ -221,23 +221,17 @@ loop192:
|
|||
|
||||
VPERM IN1, IN1, MASK, KEY // vperm 3,2,2,5
|
||||
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
|
||||
VPERM STAGE, STAGE, OUTPERM, OUTTAIL // vperm 11,7,7,8
|
||||
VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
|
||||
VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
|
||||
P8_STXV(STAGE, R0, OUT)
|
||||
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
|
||||
STVX STAGE, (OUT+R0) // stvx 7,0,5
|
||||
ADD $16, OUT, OUT // addi 5,5,16
|
||||
|
||||
VSLDOI $8, IN0, IN1, STAGE // vsldoi 7,1,2,8
|
||||
VXOR IN0, TMP, IN0 // vxor 1,1,6
|
||||
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
|
||||
VPERM STAGE, STAGE, OUTPERM, OUTTAIL // vperm 11,7,7,8
|
||||
VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
|
||||
VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
|
||||
P8_STXV(STAGE, R0, OUT)
|
||||
VXOR IN0, TMP, IN0 // vxor 1,1,6
|
||||
VSLDOI $12, ZERO, TMP, TMP // vsldoi 6,0,6,12
|
||||
VXOR IN0, TMP, IN0 // vxor 1,1,6
|
||||
STVX STAGE, (OUT+R0) // stvx 7,0,5
|
||||
ADD $16, OUT, OUT // addi 5,5,16
|
||||
|
||||
VSPLTW $3, IN0, TMP // vspltw 6,1,3
|
||||
|
|
@ -247,11 +241,7 @@ loop192:
|
|||
VXOR IN1, TMP, IN1 // vxor 2,2,6
|
||||
VXOR IN0, KEY, IN0 // vxor 1,1,3
|
||||
VXOR IN1, KEY, IN1 // vxor 2,2,3
|
||||
VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
|
||||
VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
|
||||
VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
|
||||
STVX STAGE, (OUT+R0) // stvx 7,0,5
|
||||
ADD $15, OUT, INP // addi 3,5,15
|
||||
P8_STXV(IN0, R0, OUT)
|
||||
ADD $16, OUT, OUT // addi 5,5,16
|
||||
BC 0x10, 0, loop192 // bdnz .Loop192
|
||||
|
||||
|
|
@ -260,25 +250,18 @@ loop192:
|
|||
BR done // b .Ldone
|
||||
|
||||
l256:
|
||||
LVX (INP)(R0), TMP // lvx 6,0,3
|
||||
P8_LXVB16X(INP, R0, IN1)
|
||||
MOVD $7, CNT // li 7,7
|
||||
MOVD $14, ROUNDS // li 8,14
|
||||
VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
|
||||
VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
|
||||
VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
|
||||
STVX STAGE, (OUT+R0) // stvx 7,0,5
|
||||
P8_STXV(IN0, R0, OUT)
|
||||
ADD $16, OUT, OUT // addi 5,5,16
|
||||
VPERM IN1, TMP, KEY, IN1 // vperm 2,2,6,3
|
||||
MOVD CNT, CTR // mtctr 7
|
||||
|
||||
loop256:
|
||||
VPERM IN1, IN1, MASK, KEY // vperm 3,2,2,5
|
||||
VSLDOI $12, ZERO, IN0, TMP // vsldoi 6,0,1,12
|
||||
VPERM IN1, IN1, OUTPERM, OUTTAIL // vperm 11,2,2,8
|
||||
VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
|
||||
VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
|
||||
P8_STXV(IN1, R0, OUT)
|
||||
VCIPHERLAST KEY, RCON, KEY // vcipherlast 3,3,4
|
||||
STVX STAGE, (OUT+R0) // stvx 7,0,5
|
||||
ADD $16, OUT, OUT // addi 5,5,16
|
||||
|
||||
VXOR IN0, TMP, IN0 // vxor 1,1,6
|
||||
|
|
@ -288,11 +271,7 @@ loop256:
|
|||
VXOR IN0, TMP, IN0 // vxor 1,1,6
|
||||
VADDUWM RCON, RCON, RCON // vadduwm 4,4,4
|
||||
VXOR IN0, KEY, IN0 // vxor 1,1,3
|
||||
VPERM IN0, IN0, OUTPERM, OUTTAIL // vperm 11,1,1,8
|
||||
VSEL OUTHEAD, OUTTAIL, OUTMASK, STAGE // vsel 7,10,11,9
|
||||
VOR OUTTAIL, OUTTAIL, OUTHEAD // vor 10,11,11
|
||||
STVX STAGE, (OUT+R0) // stvx 7,0,5
|
||||
ADD $15, OUT, INP // addi 3,5,15
|
||||
P8_STXV(IN0, R0, OUT)
|
||||
ADD $16, OUT, OUT // addi 5,5,16
|
||||
BC 0x12, 0, done // bdz .Ldone
|
||||
|
||||
|
|
@ -310,9 +289,6 @@ loop256:
|
|||
JMP loop256 // b .Loop256
|
||||
|
||||
done:
|
||||
LVX (INP)(R0), IN1 // lvx 2,0,3
|
||||
VSEL OUTHEAD, IN1, OUTMASK, IN1 // vsel 2,10,2,9
|
||||
STVX IN1, (INP+R0) // stvx 2,0,3
|
||||
MOVD $0, PTR // li 6,0 set PTR to 0 (exit code 0)
|
||||
MOVW ROUNDS, 0(OUT) // stw 8,0(5)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue