mirror of https://github.com/golang/go.git
cmd/internal/notsha256: revert PPC64 removal, and fix PPC64 asm
This reverts commit a41e37f56a, and
updates the ASM usage to be go1.8 compliant. go 1.18 added support
for using VR's in place of VSR arguments.
The following transformations are made:
XXLOR Vx, Vx, VSy -> XXLORQ VSx+32, VSx+32, VSy
XXLOR VSx, VSx, Vy -> XXLORQ VSx, VSx, VSy+32
XXLOR is broken on 1.8, but XXLORQ is identical and still supported
today.
Change-Id: Icc9cd5511b412c30a14e6afd07a51839aaaf6021
Reviewed-on: https://go-review.googlesource.com/c/go/+/403734
Reviewed-by: Bryan Mills <bcmills@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Paul Murphy <murp@ibm.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: David Chase <drchase@google.com>
This commit is contained in:
parent
89c0dd829f
commit
d24c65d8bf
|
|
@ -2,8 +2,8 @@
|
|||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build 386 || amd64
|
||||
// +build 386 amd64
|
||||
//go:build 386 || amd64 || ppc64le || ppc64
|
||||
// +build 386 amd64 ppc64le ppc64
|
||||
|
||||
package notsha256
|
||||
|
||||
|
|
|
|||
|
|
@ -2,8 +2,8 @@
|
|||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build !amd64 && !386
|
||||
// +build !amd64,!386
|
||||
//go:build !amd64 && !386 && !ppc64le && !ppc64
|
||||
// +build !amd64,!386,!ppc64le,!ppc64
|
||||
|
||||
package notsha256
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,430 @@
|
|||
// Copyright 2016 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//
|
||||
// WARNING: this file is built by the bootstrap compiler, thus
|
||||
// it must maintain compatibility with the oldest supported
|
||||
// bootstrap toolchain.
|
||||
//
|
||||
|
||||
//go:build ppc64 || ppc64le
|
||||
// +build ppc64 ppc64le
|
||||
|
||||
// Based on CRYPTOGAMS code with the following comment:
|
||||
// # ====================================================================
|
||||
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
// # project. The module is, however, dual licensed under OpenSSL and
|
||||
// # CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
// # details see http://www.openssl.org/~appro/cryptogams/.
|
||||
// # ====================================================================
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// SHA256 block routine. See sha256block.go for Go equivalent.
|
||||
//
|
||||
// The algorithm is detailed in FIPS 180-4:
|
||||
//
|
||||
// https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
|
||||
//
|
||||
// Wt = Mt; for 0 <= t <= 15
|
||||
// Wt = SIGMA1(Wt-2) + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
|
||||
//
|
||||
// a = H0
|
||||
// b = H1
|
||||
// c = H2
|
||||
// d = H3
|
||||
// e = H4
|
||||
// f = H5
|
||||
// g = H6
|
||||
// h = H7
|
||||
//
|
||||
// for t = 0 to 63 {
|
||||
// T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
|
||||
// T2 = BIGSIGMA0(a) + Maj(a,b,c)
|
||||
// h = g
|
||||
// g = f
|
||||
// f = e
|
||||
// e = d + T1
|
||||
// d = c
|
||||
// c = b
|
||||
// b = a
|
||||
// a = T1 + T2
|
||||
// }
|
||||
//
|
||||
// H0 = a + H0
|
||||
// H1 = b + H1
|
||||
// H2 = c + H2
|
||||
// H3 = d + H3
|
||||
// H4 = e + H4
|
||||
// H5 = f + H5
|
||||
// H6 = g + H6
|
||||
// H7 = h + H7
|
||||
|
||||
#define CTX R3
|
||||
#define INP R4
|
||||
#define END R5
|
||||
#define TBL R6
|
||||
#define IDX R7
|
||||
#define LEN R9
|
||||
#define TEMP R12
|
||||
|
||||
#define HEX00 R0
|
||||
#define HEX10 R10
|
||||
|
||||
// V0-V7 are A-H
|
||||
// V8-V23 are used for the message schedule
|
||||
#define KI V24
|
||||
#define FUNC V25
|
||||
#define S0 V26
|
||||
#define S1 V27
|
||||
#define s0 V28
|
||||
#define s1 V29
|
||||
#define LEMASK V31 // Permutation control register for little endian
|
||||
|
||||
// 4 copies of each Kt, to fill all 4 words of a vector register
|
||||
DATA ·kcon+0x000(SB)/8, $0x428a2f98428a2f98
|
||||
DATA ·kcon+0x008(SB)/8, $0x428a2f98428a2f98
|
||||
DATA ·kcon+0x010(SB)/8, $0x7137449171374491
|
||||
DATA ·kcon+0x018(SB)/8, $0x7137449171374491
|
||||
DATA ·kcon+0x020(SB)/8, $0xb5c0fbcfb5c0fbcf
|
||||
DATA ·kcon+0x028(SB)/8, $0xb5c0fbcfb5c0fbcf
|
||||
DATA ·kcon+0x030(SB)/8, $0xe9b5dba5e9b5dba5
|
||||
DATA ·kcon+0x038(SB)/8, $0xe9b5dba5e9b5dba5
|
||||
DATA ·kcon+0x040(SB)/8, $0x3956c25b3956c25b
|
||||
DATA ·kcon+0x048(SB)/8, $0x3956c25b3956c25b
|
||||
DATA ·kcon+0x050(SB)/8, $0x59f111f159f111f1
|
||||
DATA ·kcon+0x058(SB)/8, $0x59f111f159f111f1
|
||||
DATA ·kcon+0x060(SB)/8, $0x923f82a4923f82a4
|
||||
DATA ·kcon+0x068(SB)/8, $0x923f82a4923f82a4
|
||||
DATA ·kcon+0x070(SB)/8, $0xab1c5ed5ab1c5ed5
|
||||
DATA ·kcon+0x078(SB)/8, $0xab1c5ed5ab1c5ed5
|
||||
DATA ·kcon+0x080(SB)/8, $0xd807aa98d807aa98
|
||||
DATA ·kcon+0x088(SB)/8, $0xd807aa98d807aa98
|
||||
DATA ·kcon+0x090(SB)/8, $0x12835b0112835b01
|
||||
DATA ·kcon+0x098(SB)/8, $0x12835b0112835b01
|
||||
DATA ·kcon+0x0A0(SB)/8, $0x243185be243185be
|
||||
DATA ·kcon+0x0A8(SB)/8, $0x243185be243185be
|
||||
DATA ·kcon+0x0B0(SB)/8, $0x550c7dc3550c7dc3
|
||||
DATA ·kcon+0x0B8(SB)/8, $0x550c7dc3550c7dc3
|
||||
DATA ·kcon+0x0C0(SB)/8, $0x72be5d7472be5d74
|
||||
DATA ·kcon+0x0C8(SB)/8, $0x72be5d7472be5d74
|
||||
DATA ·kcon+0x0D0(SB)/8, $0x80deb1fe80deb1fe
|
||||
DATA ·kcon+0x0D8(SB)/8, $0x80deb1fe80deb1fe
|
||||
DATA ·kcon+0x0E0(SB)/8, $0x9bdc06a79bdc06a7
|
||||
DATA ·kcon+0x0E8(SB)/8, $0x9bdc06a79bdc06a7
|
||||
DATA ·kcon+0x0F0(SB)/8, $0xc19bf174c19bf174
|
||||
DATA ·kcon+0x0F8(SB)/8, $0xc19bf174c19bf174
|
||||
DATA ·kcon+0x100(SB)/8, $0xe49b69c1e49b69c1
|
||||
DATA ·kcon+0x108(SB)/8, $0xe49b69c1e49b69c1
|
||||
DATA ·kcon+0x110(SB)/8, $0xefbe4786efbe4786
|
||||
DATA ·kcon+0x118(SB)/8, $0xefbe4786efbe4786
|
||||
DATA ·kcon+0x120(SB)/8, $0x0fc19dc60fc19dc6
|
||||
DATA ·kcon+0x128(SB)/8, $0x0fc19dc60fc19dc6
|
||||
DATA ·kcon+0x130(SB)/8, $0x240ca1cc240ca1cc
|
||||
DATA ·kcon+0x138(SB)/8, $0x240ca1cc240ca1cc
|
||||
DATA ·kcon+0x140(SB)/8, $0x2de92c6f2de92c6f
|
||||
DATA ·kcon+0x148(SB)/8, $0x2de92c6f2de92c6f
|
||||
DATA ·kcon+0x150(SB)/8, $0x4a7484aa4a7484aa
|
||||
DATA ·kcon+0x158(SB)/8, $0x4a7484aa4a7484aa
|
||||
DATA ·kcon+0x160(SB)/8, $0x5cb0a9dc5cb0a9dc
|
||||
DATA ·kcon+0x168(SB)/8, $0x5cb0a9dc5cb0a9dc
|
||||
DATA ·kcon+0x170(SB)/8, $0x76f988da76f988da
|
||||
DATA ·kcon+0x178(SB)/8, $0x76f988da76f988da
|
||||
DATA ·kcon+0x180(SB)/8, $0x983e5152983e5152
|
||||
DATA ·kcon+0x188(SB)/8, $0x983e5152983e5152
|
||||
DATA ·kcon+0x190(SB)/8, $0xa831c66da831c66d
|
||||
DATA ·kcon+0x198(SB)/8, $0xa831c66da831c66d
|
||||
DATA ·kcon+0x1A0(SB)/8, $0xb00327c8b00327c8
|
||||
DATA ·kcon+0x1A8(SB)/8, $0xb00327c8b00327c8
|
||||
DATA ·kcon+0x1B0(SB)/8, $0xbf597fc7bf597fc7
|
||||
DATA ·kcon+0x1B8(SB)/8, $0xbf597fc7bf597fc7
|
||||
DATA ·kcon+0x1C0(SB)/8, $0xc6e00bf3c6e00bf3
|
||||
DATA ·kcon+0x1C8(SB)/8, $0xc6e00bf3c6e00bf3
|
||||
DATA ·kcon+0x1D0(SB)/8, $0xd5a79147d5a79147
|
||||
DATA ·kcon+0x1D8(SB)/8, $0xd5a79147d5a79147
|
||||
DATA ·kcon+0x1E0(SB)/8, $0x06ca635106ca6351
|
||||
DATA ·kcon+0x1E8(SB)/8, $0x06ca635106ca6351
|
||||
DATA ·kcon+0x1F0(SB)/8, $0x1429296714292967
|
||||
DATA ·kcon+0x1F8(SB)/8, $0x1429296714292967
|
||||
DATA ·kcon+0x200(SB)/8, $0x27b70a8527b70a85
|
||||
DATA ·kcon+0x208(SB)/8, $0x27b70a8527b70a85
|
||||
DATA ·kcon+0x210(SB)/8, $0x2e1b21382e1b2138
|
||||
DATA ·kcon+0x218(SB)/8, $0x2e1b21382e1b2138
|
||||
DATA ·kcon+0x220(SB)/8, $0x4d2c6dfc4d2c6dfc
|
||||
DATA ·kcon+0x228(SB)/8, $0x4d2c6dfc4d2c6dfc
|
||||
DATA ·kcon+0x230(SB)/8, $0x53380d1353380d13
|
||||
DATA ·kcon+0x238(SB)/8, $0x53380d1353380d13
|
||||
DATA ·kcon+0x240(SB)/8, $0x650a7354650a7354
|
||||
DATA ·kcon+0x248(SB)/8, $0x650a7354650a7354
|
||||
DATA ·kcon+0x250(SB)/8, $0x766a0abb766a0abb
|
||||
DATA ·kcon+0x258(SB)/8, $0x766a0abb766a0abb
|
||||
DATA ·kcon+0x260(SB)/8, $0x81c2c92e81c2c92e
|
||||
DATA ·kcon+0x268(SB)/8, $0x81c2c92e81c2c92e
|
||||
DATA ·kcon+0x270(SB)/8, $0x92722c8592722c85
|
||||
DATA ·kcon+0x278(SB)/8, $0x92722c8592722c85
|
||||
DATA ·kcon+0x280(SB)/8, $0xa2bfe8a1a2bfe8a1
|
||||
DATA ·kcon+0x288(SB)/8, $0xa2bfe8a1a2bfe8a1
|
||||
DATA ·kcon+0x290(SB)/8, $0xa81a664ba81a664b
|
||||
DATA ·kcon+0x298(SB)/8, $0xa81a664ba81a664b
|
||||
DATA ·kcon+0x2A0(SB)/8, $0xc24b8b70c24b8b70
|
||||
DATA ·kcon+0x2A8(SB)/8, $0xc24b8b70c24b8b70
|
||||
DATA ·kcon+0x2B0(SB)/8, $0xc76c51a3c76c51a3
|
||||
DATA ·kcon+0x2B8(SB)/8, $0xc76c51a3c76c51a3
|
||||
DATA ·kcon+0x2C0(SB)/8, $0xd192e819d192e819
|
||||
DATA ·kcon+0x2C8(SB)/8, $0xd192e819d192e819
|
||||
DATA ·kcon+0x2D0(SB)/8, $0xd6990624d6990624
|
||||
DATA ·kcon+0x2D8(SB)/8, $0xd6990624d6990624
|
||||
DATA ·kcon+0x2E0(SB)/8, $0xf40e3585f40e3585
|
||||
DATA ·kcon+0x2E8(SB)/8, $0xf40e3585f40e3585
|
||||
DATA ·kcon+0x2F0(SB)/8, $0x106aa070106aa070
|
||||
DATA ·kcon+0x2F8(SB)/8, $0x106aa070106aa070
|
||||
DATA ·kcon+0x300(SB)/8, $0x19a4c11619a4c116
|
||||
DATA ·kcon+0x308(SB)/8, $0x19a4c11619a4c116
|
||||
DATA ·kcon+0x310(SB)/8, $0x1e376c081e376c08
|
||||
DATA ·kcon+0x318(SB)/8, $0x1e376c081e376c08
|
||||
DATA ·kcon+0x320(SB)/8, $0x2748774c2748774c
|
||||
DATA ·kcon+0x328(SB)/8, $0x2748774c2748774c
|
||||
DATA ·kcon+0x330(SB)/8, $0x34b0bcb534b0bcb5
|
||||
DATA ·kcon+0x338(SB)/8, $0x34b0bcb534b0bcb5
|
||||
DATA ·kcon+0x340(SB)/8, $0x391c0cb3391c0cb3
|
||||
DATA ·kcon+0x348(SB)/8, $0x391c0cb3391c0cb3
|
||||
DATA ·kcon+0x350(SB)/8, $0x4ed8aa4a4ed8aa4a
|
||||
DATA ·kcon+0x358(SB)/8, $0x4ed8aa4a4ed8aa4a
|
||||
DATA ·kcon+0x360(SB)/8, $0x5b9cca4f5b9cca4f
|
||||
DATA ·kcon+0x368(SB)/8, $0x5b9cca4f5b9cca4f
|
||||
DATA ·kcon+0x370(SB)/8, $0x682e6ff3682e6ff3
|
||||
DATA ·kcon+0x378(SB)/8, $0x682e6ff3682e6ff3
|
||||
DATA ·kcon+0x380(SB)/8, $0x748f82ee748f82ee
|
||||
DATA ·kcon+0x388(SB)/8, $0x748f82ee748f82ee
|
||||
DATA ·kcon+0x390(SB)/8, $0x78a5636f78a5636f
|
||||
DATA ·kcon+0x398(SB)/8, $0x78a5636f78a5636f
|
||||
DATA ·kcon+0x3A0(SB)/8, $0x84c8781484c87814
|
||||
DATA ·kcon+0x3A8(SB)/8, $0x84c8781484c87814
|
||||
DATA ·kcon+0x3B0(SB)/8, $0x8cc702088cc70208
|
||||
DATA ·kcon+0x3B8(SB)/8, $0x8cc702088cc70208
|
||||
DATA ·kcon+0x3C0(SB)/8, $0x90befffa90befffa
|
||||
DATA ·kcon+0x3C8(SB)/8, $0x90befffa90befffa
|
||||
DATA ·kcon+0x3D0(SB)/8, $0xa4506ceba4506ceb
|
||||
DATA ·kcon+0x3D8(SB)/8, $0xa4506ceba4506ceb
|
||||
DATA ·kcon+0x3E0(SB)/8, $0xbef9a3f7bef9a3f7
|
||||
DATA ·kcon+0x3E8(SB)/8, $0xbef9a3f7bef9a3f7
|
||||
DATA ·kcon+0x3F0(SB)/8, $0xc67178f2c67178f2
|
||||
DATA ·kcon+0x3F8(SB)/8, $0xc67178f2c67178f2
|
||||
DATA ·kcon+0x400(SB)/8, $0x0000000000000000
|
||||
DATA ·kcon+0x408(SB)/8, $0x0000000000000000
|
||||
|
||||
#ifdef GOARCH_ppc64le
|
||||
DATA ·kcon+0x410(SB)/8, $0x1011121310111213 // permutation control vectors
|
||||
DATA ·kcon+0x418(SB)/8, $0x1011121300010203
|
||||
DATA ·kcon+0x420(SB)/8, $0x1011121310111213
|
||||
DATA ·kcon+0x428(SB)/8, $0x0405060700010203
|
||||
DATA ·kcon+0x430(SB)/8, $0x1011121308090a0b
|
||||
DATA ·kcon+0x438(SB)/8, $0x0405060700010203
|
||||
#else
|
||||
DATA ·kcon+0x410(SB)/8, $0x1011121300010203
|
||||
DATA ·kcon+0x418(SB)/8, $0x1011121310111213 // permutation control vectors
|
||||
DATA ·kcon+0x420(SB)/8, $0x0405060700010203
|
||||
DATA ·kcon+0x428(SB)/8, $0x1011121310111213
|
||||
DATA ·kcon+0x430(SB)/8, $0x0001020304050607
|
||||
DATA ·kcon+0x438(SB)/8, $0x08090a0b10111213
|
||||
#endif
|
||||
|
||||
GLOBL ·kcon(SB), RODATA, $1088
|
||||
|
||||
#define SHA256ROUND0(a, b, c, d, e, f, g, h, xi) \
|
||||
VSEL g, f, e, FUNC; \
|
||||
VSHASIGMAW $15, e, $1, S1; \
|
||||
VADDUWM xi, h, h; \
|
||||
VSHASIGMAW $0, a, $1, S0; \
|
||||
VADDUWM FUNC, h, h; \
|
||||
VXOR b, a, FUNC; \
|
||||
VADDUWM S1, h, h; \
|
||||
VSEL b, c, FUNC, FUNC; \
|
||||
VADDUWM KI, g, g; \
|
||||
VADDUWM h, d, d; \
|
||||
VADDUWM FUNC, S0, S0; \
|
||||
LVX (TBL)(IDX), KI; \
|
||||
ADD $16, IDX; \
|
||||
VADDUWM S0, h, h
|
||||
|
||||
#define SHA256ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14) \
|
||||
VSHASIGMAW $0, xj_1, $0, s0; \
|
||||
VSEL g, f, e, FUNC; \
|
||||
VSHASIGMAW $15, e, $1, S1; \
|
||||
VADDUWM xi, h, h; \
|
||||
VSHASIGMAW $0, a, $1, S0; \
|
||||
VSHASIGMAW $15, xj_14, $0, s1; \
|
||||
VADDUWM FUNC, h, h; \
|
||||
VXOR b, a, FUNC; \
|
||||
VADDUWM xj_9, xj, xj; \
|
||||
VADDUWM S1, h, h; \
|
||||
VSEL b, c, FUNC, FUNC; \
|
||||
VADDUWM KI, g, g; \
|
||||
VADDUWM h, d, d; \
|
||||
VADDUWM FUNC, S0, S0; \
|
||||
VADDUWM s0, xj, xj; \
|
||||
LVX (TBL)(IDX), KI; \
|
||||
ADD $16, IDX; \
|
||||
VADDUWM S0, h, h; \
|
||||
VADDUWM s1, xj, xj
|
||||
|
||||
#ifdef GOARCH_ppc64le
|
||||
#define VPERMLE(va,vb,vc,vt) VPERM va, vb, vc, vt
|
||||
#else
|
||||
#define VPERMLE(va,vb,vc,vt)
|
||||
#endif
|
||||
|
||||
// func block(dig *digest, p []byte)
|
||||
TEXT ·block(SB),0,$0-32
|
||||
MOVD dig+0(FP), CTX
|
||||
MOVD p_base+8(FP), INP
|
||||
MOVD p_len+16(FP), LEN
|
||||
|
||||
SRD $6, LEN
|
||||
SLD $6, LEN
|
||||
ADD INP, LEN, END
|
||||
|
||||
CMP INP, END
|
||||
BEQ end
|
||||
|
||||
MOVD $·kcon(SB), TBL
|
||||
MOVWZ $0x10, HEX10
|
||||
MOVWZ $8, IDX
|
||||
|
||||
#ifdef GOARCH_ppc64le
|
||||
LVSL (IDX)(R0), LEMASK
|
||||
VSPLTISB $0x0F, KI
|
||||
VXOR KI, LEMASK, LEMASK
|
||||
#endif
|
||||
|
||||
LXVW4X (CTX)(HEX00), VS32 // v0 = vs32
|
||||
LXVW4X (CTX)(HEX10), VS36 // v4 = vs36
|
||||
|
||||
// unpack the input values into vector registers
|
||||
VSLDOI $4, V0, V0, V1
|
||||
VSLDOI $8, V0, V0, V2
|
||||
VSLDOI $12, V0, V0, V3
|
||||
VSLDOI $4, V4, V4, V5
|
||||
VSLDOI $8, V4, V4, V6
|
||||
VSLDOI $12, V4, V4, V7
|
||||
|
||||
loop:
|
||||
LVX (TBL)(HEX00), KI
|
||||
MOVWZ $16, IDX
|
||||
|
||||
LXVD2X (INP)(R0), VS40 // load v8 (=vs40) in advance
|
||||
ADD $16, INP
|
||||
|
||||
// Offload to VSR24-31 (aka FPR24-31)
|
||||
XXLORQ VS32, VS32, VS24
|
||||
XXLORQ VS33, VS33, VS25
|
||||
XXLORQ VS34, VS34, VS26
|
||||
XXLORQ VS35, VS35, VS27
|
||||
XXLORQ VS36, VS36, VS28
|
||||
XXLORQ VS37, VS37, VS29
|
||||
XXLORQ VS38, VS38, VS30
|
||||
XXLORQ VS39, VS39, VS31
|
||||
|
||||
VADDUWM KI, V7, V7 // h+K[i]
|
||||
LVX (TBL)(IDX), KI
|
||||
ADD $16, IDX
|
||||
|
||||
VPERMLE(V8, V8, LEMASK, V8)
|
||||
SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8)
|
||||
VSLDOI $4, V8, V8, V9
|
||||
SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9)
|
||||
VSLDOI $4, V9, V9, V10
|
||||
SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10)
|
||||
LXVD2X (INP)(R0), VS44 // load v12 (=vs44) in advance
|
||||
ADD $16, INP, INP
|
||||
VSLDOI $4, V10, V10, V11
|
||||
SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11)
|
||||
VPERMLE(V12, V12, LEMASK, V12)
|
||||
SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12)
|
||||
VSLDOI $4, V12, V12, V13
|
||||
SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13)
|
||||
VSLDOI $4, V13, V13, V14
|
||||
SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14)
|
||||
LXVD2X (INP)(R0), VS48 // load v16 (=vs48) in advance
|
||||
ADD $16, INP, INP
|
||||
VSLDOI $4, V14, V14, V15
|
||||
SHA256ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15)
|
||||
VPERMLE(V16, V16, LEMASK, V16)
|
||||
SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16)
|
||||
VSLDOI $4, V16, V16, V17
|
||||
SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17)
|
||||
VSLDOI $4, V17, V17, V18
|
||||
SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18)
|
||||
VSLDOI $4, V18, V18, V19
|
||||
LXVD2X (INP)(R0), VS52 // load v20 (=vs52) in advance
|
||||
ADD $16, INP, INP
|
||||
SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19)
|
||||
VPERMLE(V20, V20, LEMASK, V20)
|
||||
SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20)
|
||||
VSLDOI $4, V20, V20, V21
|
||||
SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21)
|
||||
VSLDOI $4, V21, V21, V22
|
||||
SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22)
|
||||
VSLDOI $4, V22, V22, V23
|
||||
SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
|
||||
|
||||
MOVWZ $3, TEMP
|
||||
MOVWZ TEMP, CTR
|
||||
|
||||
L16_xx:
|
||||
SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23)
|
||||
SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8)
|
||||
SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9)
|
||||
SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10)
|
||||
SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11)
|
||||
SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12)
|
||||
SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13)
|
||||
SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14)
|
||||
SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15)
|
||||
SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16)
|
||||
SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17)
|
||||
SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18)
|
||||
SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19)
|
||||
SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20)
|
||||
SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21)
|
||||
SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
|
||||
|
||||
BC 0x10, 0, L16_xx // bdnz
|
||||
|
||||
XXLORQ VS24, VS24, VS42
|
||||
|
||||
XXLORQ VS25, VS25, VS43
|
||||
VADDUWM V10, V0, V0
|
||||
XXLORQ VS26, VS26, VS44
|
||||
VADDUWM V11, V1, V1
|
||||
XXLORQ VS27, VS27, VS45
|
||||
VADDUWM V12, V2, V2
|
||||
XXLORQ VS28, VS28, VS46
|
||||
VADDUWM V13, V3, V3
|
||||
XXLORQ VS29, VS29, VS47
|
||||
VADDUWM V14, V4, V4
|
||||
XXLORQ VS30, VS30, VS48
|
||||
VADDUWM V15, V5, V5
|
||||
XXLORQ VS31, VS31, VS49
|
||||
VADDUWM V16, V6, V6
|
||||
VADDUWM V17, V7, V7
|
||||
|
||||
CMPU INP, END
|
||||
BLT loop
|
||||
|
||||
LVX (TBL)(IDX), V8
|
||||
ADD $16, IDX
|
||||
VPERM V0, V1, KI, V0
|
||||
LVX (TBL)(IDX), V9
|
||||
VPERM V4, V5, KI, V4
|
||||
VPERM V0, V2, V8, V0
|
||||
VPERM V4, V6, V8, V4
|
||||
VPERM V0, V3, V9, V0
|
||||
VPERM V4, V7, V9, V4
|
||||
STXVD2X VS32, (CTX+HEX00) // v0 = vs32
|
||||
STXVD2X VS36, (CTX+HEX10) // v4 = vs36
|
||||
|
||||
end:
|
||||
RET
|
||||
|
||||
Loading…
Reference in New Issue