diff --git a/src/crypto/aes/gcm_ppc64x.go b/src/crypto/aes/gcm_ppc64x.go
index 44b27056d6..3dbf4ba578 100644
--- a/src/crypto/aes/gcm_ppc64x.go
+++ b/src/crypto/aes/gcm_ppc64x.go
@@ -51,6 +51,8 @@ type gcmAsm struct {
 	tagSize int
 }
 
+func counterCryptASM(nr int, out, in []byte, counter *[gcmBlockSize]byte, key *uint32)
+
 // NewGCM returns the AES cipher wrapped in Galois Counter Mode. This is only
 // called by crypto/cipher.NewGCM via the gcmAble interface.
 func (c *aesCipherAsm) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
@@ -114,34 +116,10 @@ func (g *gcmAsm) deriveCounter(counter *[gcmBlockSize]byte, nonce []byte) {
 // into out. counter is the initial count value and will be updated with the next
 // count value. The length of out must be greater than or equal to the length
 // of in.
+// counterCryptASM implements counterCrypt in assembly, which allows the
+// loop to be unrolled and optimized.
 func (g *gcmAsm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte) {
-	var mask [gcmBlockSize]byte
-
-	for len(in) >= gcmBlockSize {
-		// Hint to avoid bounds check
-		_, _ = in[15], out[15]
-		g.cipher.Encrypt(mask[:], counter[:])
-		gcmInc32(counter)
-
-		// XOR 16 bytes each loop iteration in 8 byte chunks
-		in0 := binary.LittleEndian.Uint64(in[0:])
-		in1 := binary.LittleEndian.Uint64(in[8:])
-		m0 := binary.LittleEndian.Uint64(mask[:8])
-		m1 := binary.LittleEndian.Uint64(mask[8:])
-		binary.LittleEndian.PutUint64(out[:8], in0^m0)
-		binary.LittleEndian.PutUint64(out[8:], in1^m1)
-		out = out[16:]
-		in = in[16:]
-	}
-
-	if len(in) > 0 {
-		g.cipher.Encrypt(mask[:], counter[:])
-		gcmInc32(counter)
-		// XOR leftover bytes
-		for i, inb := range in {
-			out[i] = inb ^ mask[i]
-		}
-	}
+	counterCryptASM(len(g.cipher.enc)/4-1, out, in, counter, &g.cipher.enc[0])
 }
 
 // increments the rightmost 32-bits of the count value by 1.
diff --git a/src/crypto/aes/gcm_ppc64x.s b/src/crypto/aes/gcm_ppc64x.s
index 72f0b8e01c..f661b27642 100644
--- a/src/crypto/aes/gcm_ppc64x.s
+++ b/src/crypto/aes/gcm_ppc64x.s
@@ -4,7 +4,7 @@
 
 //go:build ppc64 || ppc64le
 
-// Based on CRYPTOGAMS code with the following comment:
+// Portions based on CRYPTOGAMS code with the following comment:
 // # ====================================================================
 // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 // # project. The module is, however, dual licensed under OpenSSL and
@@ -12,13 +12,17 @@
 // # details see http://www.openssl.org/~appro/cryptogams/.
 // # ====================================================================
 
-// This implementation is based on the ppc64 asm generated by the
-// script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
+// The implementations for gcmHash, gcmInit, and gcmMul are based on the generated asm
+// from the script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
 // from commit d47afb3c.
 // Changes were made due to differences in the ABI and some register usage.
 // Some arguments were changed due to the way the Go code passes them.
+
+// Portions that use the stitched AES-GCM approach in counterCryptASM
+// are based on code found in
+// https://github.com/IBM/ipcri/blob/main/aes/p10_aes_gcm.s
+
 
 #include "textflag.h"
 
 #define XIP R3
@@ -87,6 +91,292 @@
 
 #define VIN0 VIN
 
+#define ESPERM V10
+#define TMP2 V11
+
+// The following macros provide appropriate
+// load and store implementations for the target
+// endianness, as well as ISA-specific variants
+// for POWER8 and POWER9.
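The Go hunk above replaces the byte-slice loop with a single call into counterCryptASM, passing the AES round count (len(g.cipher.enc)/4 - 1, i.e. 10, 12, or 14) and a pointer to the expanded key schedule. The contract the assembly has to preserve is exactly that of the removed loop: encrypt the counter, XOR the keystream with the input, and increment the rightmost 32 bits of the counter as a big-endian value. A minimal Go sketch of that contract, not part of the patch and with the hypothetical name counterCryptRef, is:

package gcmsketch

import (
	"crypto/cipher"
	"encoding/binary"
)

// counterCryptRef is an illustrative reference for the semantics that
// counterCryptASM must preserve: generate keystream by encrypting the
// counter, XOR it with the input, and increment the rightmost 32 bits
// of the counter as a big-endian value (what gcmInc32 does in Go).
func counterCryptRef(b cipher.Block, out, in []byte, counter *[16]byte) {
	var mask [16]byte
	for len(in) > 0 {
		b.Encrypt(mask[:], counter[:])
		n := binary.BigEndian.Uint32(counter[12:])
		binary.BigEndian.PutUint32(counter[12:], n+1)
		// XOR up to one block of keystream with the input.
		k := len(in)
		if k > len(mask) {
			k = len(mask)
		}
		for i := 0; i < k; i++ {
			out[i] = in[i] ^ mask[i]
		}
		out, in = out[k:], in[k:]
	}
}

The assembly below starts by defining the endianness- and ISA-specific load and store macros that the rest of the routine builds on.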
+#ifdef GOARCH_ppc64le
+# ifdef GOPPC64_power9
+#define P8_LXVB16X(RA,RB,VT)  LXVB16X (RA)(RB), VT
+#define P8_STXVB16X(VS,RA,RB) STXVB16X VS, (RA)(RB)
+# else
+#define NEEDS_ESPERM
+#define P8_LXVB16X(RA,RB,VT) \
+	LXVD2X (RA+RB), VT; \
+	VPERM VT, VT, ESPERM, VT
+
+#define P8_STXVB16X(VS,RA,RB) \
+	VPERM VS, VS, ESPERM, TMP2; \
+	STXVD2X TMP2, (RA+RB)
+
+# endif
+#else
+#define P8_LXVB16X(RA,RB,VT) \
+	LXVD2X (RA+RB), VT
+
+#define P8_STXVB16X(VS,RA,RB) \
+	STXVD2X VS, (RA+RB)
+
+#endif
+
+#define MASK_PTR R8
+
+#define MASKV V0
+#define INV V1
+
+// The following macros are used for
+// the stitched implementation within
+// counterCryptASM.
+
+// Load the initial GCM counter value
+// in V30 and set up the counter increment
+// in V31.
+#define SETUP_COUNTER \
+	P8_LXVB16X(COUNTER, R0, V30); \
+	VSPLTISB $1, V28; \
+	VXOR V31, V31, V31; \
+	VSLDOI $1, V31, V28, V31
+
+// These macros set up the initial value
+// for a single encryption, or 4 or 8
+// stitched encryptions implemented
+// with interleaving vciphers.
+//
+// The input value for each encryption
+// is generated by XORing the counter
+// from V30 with the first key in VS0
+// and incrementing the counter.
+//
+// Single encryption in V15
+#define GEN_VCIPHER_INPUT \
+	XXLOR VS0, VS0, V29; \
+	VXOR V30, V29, V15; \
+	VADDUWM V30, V31, V30
+
+// 4 encryptions in V15 - V18
+#define GEN_VCIPHER_4_INPUTS \
+	XXLOR VS0, VS0, V29; \
+	VXOR V30, V29, V15; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V16; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V17; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V18; \
+	VADDUWM V30, V31, V30
+
+// 8 encryptions in V15 - V22
+#define GEN_VCIPHER_8_INPUTS \
+	XXLOR VS0, VS0, V29; \
+	VXOR V30, V29, V15; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V16; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V17; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V18; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V19; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V20; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V21; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V22; \
+	VADDUWM V30, V31, V30
+
+// Load the keys to be used for
+// encryption based on key_len.
+// Keys are in VS0 - VS14
+// depending on key_len.
+// Valid key sizes are verified here;
+// an invalid size forces a crash (store
+// to address 0). CR2 is set and used
+// throughout to check key_len.
+#define LOAD_KEYS(blk_key, key_len) \
+	MOVD $16, R16; \
+	MOVD $32, R17; \
+	MOVD $48, R18; \
+	MOVD $64, R19; \
+	LXVD2X (blk_key)(R0), VS0; \
+	LXVD2X (blk_key)(R16), VS1; \
+	LXVD2X (blk_key)(R17), VS2; \
+	LXVD2X (blk_key)(R18), VS3; \
+	LXVD2X (blk_key)(R19), VS4; \
+	ADD $64, R16; \
+	ADD $64, R17; \
+	ADD $64, R18; \
+	ADD $64, R19; \
+	LXVD2X (blk_key)(R16), VS5; \
+	LXVD2X (blk_key)(R17), VS6; \
+	LXVD2X (blk_key)(R18), VS7; \
+	LXVD2X (blk_key)(R19), VS8; \
+	ADD $64, R16; \
+	ADD $64, R17; \
+	ADD $64, R18; \
+	ADD $64, R19; \
+	LXVD2X (blk_key)(R16), VS9; \
+	LXVD2X (blk_key)(R17), VS10; \
+	CMP key_len, $12, CR2; \
+	CMP key_len, $10; \
+	BEQ keysLoaded; \
+	LXVD2X (blk_key)(R18), VS11; \
+	LXVD2X (blk_key)(R19), VS12; \
+	BEQ CR2, keysLoaded; \
+	ADD $64, R16; \
+	ADD $64, R17; \
+	LXVD2X (blk_key)(R16), VS13; \
+	LXVD2X (blk_key)(R17), VS14; \
+	CMP key_len, $14; \
+	BEQ keysLoaded; \
+	MOVD R0,0(R0); \
+keysLoaded:
+
+// Encrypt 1 value (vin) with the first 9
+// keys from VS1 - VS9.
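Before the per-block encryption macros, it is worth spelling out how the CR2 comparisons set up in LOAD_KEYS map onto AES key sizes: AES-128/192/256 use 10/12/14 rounds, and the expanded schedule holds 4*(rounds+1) 32-bit words, which is why the Go side passes len(enc)/4 - 1 as the round count. A small Go sketch of that mapping (the helper name roundsForKey is hypothetical, not part of the patch):

package main

import "fmt"

// roundsForKey mirrors the key-length checks in LOAD_KEYS: 10, 12, or 14
// rounds, with 4*(rounds+1) expanded key words; anything else is invalid
// (the asm forces a crash via a store to address 0).
func roundsForKey(keyBytes int) (rounds, encWords int) {
	switch keyBytes {
	case 16:
		rounds = 10
	case 24:
		rounds = 12
	case 32:
		rounds = 14
	default:
		panic("invalid AES key size")
	}
	return rounds, 4 * (rounds + 1)
}

func main() {
	for _, k := range []int{16, 24, 32} {
		nr, words := roundsForKey(k)
		fmt.Printf("key=%d bytes: rounds=%d, expanded key words=%d\n", k, nr, words)
	}
}

Since ten rounds are always present and only the tail of the schedule differs, the VCIPHER_1X9_KEYS macro below applies the first nine round keys unconditionally, with the remaining rounds selected by branches on CR2.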
+#define VCIPHER_1X9_KEYS(vin) \ + XXLOR VS1, VS1, V23; \ + XXLOR VS2, VS2, V24; \ + XXLOR VS3, VS3, V25; \ + XXLOR VS4, VS4, V26; \ + XXLOR VS5, VS5, V27; \ + VCIPHER vin, V23, vin; \ + VCIPHER vin, V24, vin; \ + VCIPHER vin, V25, vin; \ + VCIPHER vin, V26, vin; \ + VCIPHER vin, V27, vin; \ + XXLOR VS6, VS6, V23; \ + XXLOR VS7, VS7, V24; \ + XXLOR VS8, VS8, V25; \ + XXLOR VS9, VS9, V26; \ + VCIPHER vin, V23, vin; \ + VCIPHER vin, V24, vin; \ + VCIPHER vin, V25, vin; \ + VCIPHER vin, V26, vin + +// Encrypt 1 value (vin) with +// 2 specified keys +#define VCIPHER_1X2_KEYS(vin, key1, key2) \ + XXLOR key1, key1, V25; \ + XXLOR key2, key2, V26; \ + VCIPHER vin, V25, vin; \ + VCIPHER vin, V26, vin + +// Encrypt 4 values in V15 - V18 +// with the specified key from +// VS1 - VS9. +#define VCIPHER_4X1_KEY(key) \ + XXLOR key, key, V23; \ + VCIPHER V15, V23, V15; \ + VCIPHER V16, V23, V16; \ + VCIPHER V17, V23, V17; \ + VCIPHER V18, V23, V18 + +// Encrypt 8 values in V15 - V22 +// with the specified key, +// assuming it is a VSreg +#define VCIPHER_8X1_KEY(key) \ + XXLOR key, key, V23; \ + VCIPHER V15, V23, V15; \ + VCIPHER V16, V23, V16; \ + VCIPHER V17, V23, V17; \ + VCIPHER V18, V23, V18; \ + VCIPHER V19, V23, V19; \ + VCIPHER V20, V23, V20; \ + VCIPHER V21, V23, V21; \ + VCIPHER V22, V23, V22 + +// Load input block into V1-V4 +// in big endian order and +// update blk_inp by 64. +#define LOAD_INPUT_BLOCK64(blk_inp) \ + MOVD $16, R16; \ + MOVD $32, R17; \ + MOVD $48, R18; \ + P8_LXVB16X(blk_inp,R0,V1); \ + P8_LXVB16X(blk_inp,R16,V2); \ + P8_LXVB16X(blk_inp,R17,V3); \ + P8_LXVB16X(blk_inp,R18,V4); \ + ADD $64, blk_inp + +// Load input block into V1-V8 +// in big endian order and +// Update blk_inp by 128 +#define LOAD_INPUT_BLOCK128(blk_inp) \ + MOVD $16, R16; \ + MOVD $32, R17; \ + MOVD $48, R18; \ + MOVD $64, R19; \ + MOVD $80, R20; \ + MOVD $96, R21; \ + MOVD $112, R22; \ + P8_LXVB16X(blk_inp,R0,V1); \ + P8_LXVB16X(blk_inp,R16,V2); \ + P8_LXVB16X(blk_inp,R17,V3); \ + P8_LXVB16X(blk_inp,R18,V4); \ + P8_LXVB16X(blk_inp,R19,V5); \ + P8_LXVB16X(blk_inp,R20,V6); \ + P8_LXVB16X(blk_inp,R21,V7); \ + P8_LXVB16X(blk_inp,R22,V8); \ + ADD $128, blk_inp + +// Finish encryption on 8 streams and +// XOR with input block +#define VCIPHERLAST8_XOR_INPUT \ + VCIPHERLAST V15, V23, V15; \ + VCIPHERLAST V16, V23, V16; \ + VCIPHERLAST V17, V23, V17; \ + VCIPHERLAST V18, V23, V18; \ + VCIPHERLAST V19, V23, V19; \ + VCIPHERLAST V20, V23, V20; \ + VCIPHERLAST V21, V23, V21; \ + VCIPHERLAST V22, V23, V22; \ + XXLXOR V1, V15, V1; \ + XXLXOR V2, V16, V2; \ + XXLXOR V3, V17, V3; \ + XXLXOR V4, V18, V4; \ + XXLXOR V5, V19, V5; \ + XXLXOR V6, V20, V6; \ + XXLXOR V7, V21, V7; \ + XXLXOR V8, V22, V8 + +// Finish encryption on 4 streams and +// XOR with input block +#define VCIPHERLAST4_XOR_INPUT \ + VCIPHERLAST V15, V23, V15; \ + VCIPHERLAST V16, V23, V16; \ + VCIPHERLAST V17, V23, V17; \ + VCIPHERLAST V18, V23, V18; \ + XXLXOR V1, V15, V1; \ + XXLXOR V2, V16, V2; \ + XXLXOR V3, V17, V3; \ + XXLXOR V4, V18, V4 + +// Store output block from V1-V8 +// in big endian order and +// Update blk_out by 128 +#define STORE_OUTPUT_BLOCK128(blk_out) \ + P8_STXVB16X(V1,blk_out,R0); \ + P8_STXVB16X(V2,blk_out,R16); \ + P8_STXVB16X(V3,blk_out,R17); \ + P8_STXVB16X(V4,blk_out,R18); \ + P8_STXVB16X(V5,blk_out,R19); \ + P8_STXVB16X(V6,blk_out,R20); \ + P8_STXVB16X(V7,blk_out,R21); \ + P8_STXVB16X(V8,blk_out,R22); \ + ADD $128, blk_out + +// Store output block from V1-V4 +// in big endian order and +// Update blk_out by 64 +#define 
STORE_OUTPUT_BLOCK64(blk_out) \
+	P8_STXVB16X(V1,blk_out,R0); \
+	P8_STXVB16X(V2,blk_out,R16); \
+	P8_STXVB16X(V3,blk_out,R17); \
+	P8_STXVB16X(V4,blk_out,R18); \
+	ADD $64, blk_out
+
 // func gcmInit(productTable *[256]byte, h []byte)
 TEXT ·gcmInit(SB), NOSPLIT, $0-32
 	MOVD productTable+0(FP), XIP
@@ -588,3 +878,226 @@ TEXT ·gcmMul(SB), NOSPLIT, $0-32
 #endif
 	STXVD2X VXL, (XIP+R0) // write out Xi
 	RET
+
+#define BLK_INP R3
+#define BLK_OUT R4
+#define BLK_KEY R5
+#define KEY_LEN R6
+#define BLK_IDX R7
+#define IDX R8
+#define IN_LEN R9
+#define COUNTER R10
+#define CONPTR R14
+#define MASK V5
+
+// Implementation of the counterCrypt function in assembler.
+// The original loop is unrolled to allow multiple encryption
+// streams to be done in parallel, which is achieved by interleaving
+// vcipher instructions from each stream. This is also referred to as
+// stitching, and provides significant performance improvements.
+// Macros are defined to enable execution on big or little
+// endian targets as well as different ISA levels.
+//func (g *gcmAsm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte, key[gcmBlockSize]uint32)
+//func counterCryptASM(nr, out, in, counter, key)
+TEXT ·counterCryptASM(SB), NOSPLIT, $16-72
+	MOVD nr+0(FP), KEY_LEN
+	MOVD out+8(FP), BLK_OUT
+	MOVD out_len+16(FP), R8
+	MOVD in+32(FP), BLK_INP
+	MOVD in_len+40(FP), IN_LEN
+	MOVD counter+56(FP), COUNTER
+	MOVD key+64(FP), BLK_KEY
+
+// Set up permute string when needed.
+#ifdef NEEDS_ESPERM
+	MOVD $·rcon(SB), R14
+	LVX (R14), ESPERM		// Permute value for P8_ macros.
+#endif
+	SETUP_COUNTER			// V30 = counter, V31 = BE increment {0, 0, 0, 1}
+	LOAD_KEYS(BLK_KEY, KEY_LEN)	// VS0 - VS10/12/14 based on key size
+	CMP IN_LEN, $128
+	BLT block64
+block128_loop:
+	// Do 8 encryptions in parallel by setting
+	// input values in V15-V22 and executing
+	// vcipher on the updated value and the keys.
+	GEN_VCIPHER_8_INPUTS
+	VCIPHER_8X1_KEY(VS1)
+	VCIPHER_8X1_KEY(VS2)
+	VCIPHER_8X1_KEY(VS3)
+	VCIPHER_8X1_KEY(VS4)
+	VCIPHER_8X1_KEY(VS5)
+	VCIPHER_8X1_KEY(VS6)
+	VCIPHER_8X1_KEY(VS7)
+	VCIPHER_8X1_KEY(VS8)
+	VCIPHER_8X1_KEY(VS9)
+	// Additional encryptions are done based on
+	// the key length, with the last key moved
+	// to V23 for use with VCIPHERLAST.
+	// CR2 = CMP key_len, $12
+	XXLOR VS10, VS10, V23
+	BLT CR2, block128_last		// key_len = 10
+	VCIPHER_8X1_KEY(VS10)
+	VCIPHER_8X1_KEY(VS11)
+	XXLOR VS12, VS12, V23
+	BEQ CR2, block128_last		// key_len = 12
+	VCIPHER_8X1_KEY(VS12)
+	VCIPHER_8X1_KEY(VS13)
+	XXLOR VS14, VS14, V23		// key_len = 14
+block128_last:
+	// vcipher encryptions are in V15-V22 at this
+	// point with vcipherlast remaining to be done.
+	// Load input block into V1-V8, setting index offsets
+	// in R16-R22 to use with the STORE.
+	LOAD_INPUT_BLOCK128(BLK_INP)
+	// Do VCIPHERLAST on the last key for each encryption
+	// stream and XOR the result with the corresponding
+	// value from the input block.
+	VCIPHERLAST8_XOR_INPUT
+	// Store the results (8*16) and update BLK_OUT by 128.
+	STORE_OUTPUT_BLOCK128(BLK_OUT)
+	ADD $-128, IN_LEN		// decrement input size
+	CMP IN_LEN, $128		// check if >= blocksize
+	BGE block128_loop		// next input block
+	CMP IN_LEN, $0
+	BEQ done
+block64:
+	CMP IN_LEN, $64			// Check if >= 64
+	BLT block16_loop
+	// Do 4 encryptions in parallel by setting
+	// input values in V15-V18 and executing
+	// vcipher on the updated value and the keys.
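This 64-byte path repeats the pattern of the 128-byte loop with four interleaved streams, and the single-block and partial-block paths further down handle whatever remains. Taken as a whole, the control flow of counterCryptASM corresponds to the following Go-level outline (a sketch only; the encryptN helper names are hypothetical stand-ins for the interleaved vcipher sequences):

package gcmsketch

// counterCryptOutline sketches the dispatch structure of counterCryptASM:
// descending block granularity, from 8 stitched streams down to a partial
// trailing block.
func counterCryptOutline(out, in []byte) {
	for len(in) >= 128 { // block128_loop: eight blocks per iteration
		encrypt8Blocks(out[:128], in[:128])
		out, in = out[128:], in[128:]
	}
	if len(in) >= 64 { // block64: at most one group of four blocks
		encrypt4Blocks(out[:64], in[:64])
		out, in = out[64:], in[64:]
	}
	for len(in) >= 16 { // block16_loop: one block at a time
		encrypt1Block(out[:16], in[:16])
		out, in = out[16:], in[16:]
	}
	if len(in) > 0 { // final_block: partial trailing block
		encryptPartial(out, in)
	}
}

func encrypt8Blocks(out, in []byte) {}
func encrypt4Blocks(out, in []byte) {}
func encrypt1Block(out, in []byte)  {}
func encryptPartial(out, in []byte) {}

Note that the 4-block group can run at most once per call, since control only reaches block64 when fewer than 128 bytes remain.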
+	GEN_VCIPHER_4_INPUTS
+	VCIPHER_4X1_KEY(VS1)
+	VCIPHER_4X1_KEY(VS2)
+	VCIPHER_4X1_KEY(VS3)
+	VCIPHER_4X1_KEY(VS4)
+	VCIPHER_4X1_KEY(VS5)
+	VCIPHER_4X1_KEY(VS6)
+	VCIPHER_4X1_KEY(VS7)
+	VCIPHER_4X1_KEY(VS8)
+	VCIPHER_4X1_KEY(VS9)
+	// Check key length based on CR2.
+	// Move last key to V23 for use with later vcipherlast.
+	XXLOR VS10, VS10, V23
+	BLT CR2, block64_last		// size = 10
+	VCIPHER_4X1_KEY(VS10)		// Encrypt with next 2 keys
+	VCIPHER_4X1_KEY(VS11)
+	XXLOR VS12, VS12, V23
+	BEQ CR2, block64_last		// size = 12
+	VCIPHER_4X1_KEY(VS12)		// Encrypt with last 2 keys
+	VCIPHER_4X1_KEY(VS13)
+	XXLOR VS14, VS14, V23		// size = 14
+block64_last:
+	LOAD_INPUT_BLOCK64(BLK_INP)	// Load 64 bytes of input
+	// Do VCIPHERLAST on the last key for each encryption
+	// stream and XOR the result with the corresponding
+	// value from the input block.
+	VCIPHERLAST4_XOR_INPUT
+	// Store the results (4*16) and update BLK_OUT by 64.
+	STORE_OUTPUT_BLOCK64(BLK_OUT)
+	ADD $-64, IN_LEN		// decrement input block length
+	CMP IN_LEN, $0			// check for remaining length
+	BEQ done
+block16_loop:
+	CMP IN_LEN, $16			// More input?
+	BLT final_block			// If not, then handle partial block
+	// Single encryption, no stitching
+	GEN_VCIPHER_INPUT		// Generate input value for single encryption
+	VCIPHER_1X9_KEYS(V15)		// Encrypt V15 value with 9 keys
+	XXLOR VS10, VS10, V23		// Last key -> V23 for later vcipherlast
+	// Key length based on CR2. (LT=10, EQ=12, GT=14)
+	BLT CR2, block16_last		// Finish for key size 10
+	VCIPHER_1X2_KEYS(V15, VS10, VS11)	// Encrypt V15 with 2 more keys
+	XXLOR VS12, VS12, V23		// Last key -> V23 for later vcipherlast
+	BEQ CR2, block16_last		// Finish for key size 12
+	VCIPHER_1X2_KEYS(V15, VS12, VS13)	// Encrypt V15 with last 2 keys
+	XXLOR VS14, VS14, V23		// Last key -> V23 for vcipherlast with key size 14
+block16_last:
+	P8_LXVB16X(BLK_INP, R0, V1)	// Load input
+	VCIPHERLAST V15, V23, V15	// Last encryption round with last key in V23
+	XXLXOR V15, V1, V1		// XOR with input
+	P8_STXVB16X(V1,R0,BLK_OUT)	// Store final encryption value to output
+	ADD $16, BLK_INP		// Increment input pointer
+	ADD $16, BLK_OUT		// Increment output pointer
+	ADD $-16, IN_LEN		// Decrement input length
+	BR block16_loop			// Check for next block
+final_block:
+	CMP IN_LEN, $0
+	BEQ done
+	GEN_VCIPHER_INPUT		// Generate input value for partial encryption
+	VCIPHER_1X9_KEYS(V15)		// Encrypt V15 with 9 keys
+	XXLOR VS10, VS10, V23		// Save possible last key
+	BLT CR2, final_block_last
+	VCIPHER_1X2_KEYS(V15, VS10, VS11)	// Encrypt V15 with next 2 keys
+	XXLOR VS12, VS12, V23		// Save possible last key
+	BEQ CR2, final_block_last
+	VCIPHER_1X2_KEYS(V15, VS12, VS13)	// Encrypt V15 with last 2 keys
+	XXLOR VS14, VS14, V23		// Save last key
+final_block_last:
+	VCIPHERLAST V15, V23, V15	// Finish encryption
+#ifdef GOPPC64_power10
+	// Set up length for LXVLL/STXVLL (length in the high-order byte)
+	SLD $56, IN_LEN, R17
+	LXVLL BLK_INP, R17, V25
+	VXOR V25, V15, V25
+	STXVLL V25, BLK_OUT, R17
+#else
+	ADD $32, R1, MASK_PTR
+	MOVD $0, R16
+	P8_STXVB16X(V15, MASK_PTR, R0)
+	CMP IN_LEN, $8
+	BLT next4
+	MOVD 0(MASK_PTR), R14
+	MOVD 0(BLK_INP), R15
+	XOR R14, R15, R14
+	MOVD R14, 0(BLK_OUT)
+	ADD $8, R16
+	ADD $-8, IN_LEN
+next4:
+	CMP IN_LEN, $4
+	BLT next2
+	MOVWZ (BLK_INP)(R16), R15
+	MOVWZ (MASK_PTR)(R16), R14
+	XOR R14, R15, R14
+	MOVW R14, (R16)(BLK_OUT)
+	ADD $4, R16
+	ADD $-4, IN_LEN
+next2:
+	CMP IN_LEN, $2
+	BLT next1
+	MOVHZ (BLK_INP)(R16), R15
+	MOVHZ (MASK_PTR)(R16), R14
+	XOR R14, R15, R14
+	MOVH R14, (R16)(BLK_OUT)
+	ADD $2, R16
+	ADD $-2, IN_LEN
+next1:
+	CMP IN_LEN, $1
+	BLT done
+	MOVBZ (MASK_PTR)(R16), R14
+ MOVBZ (BLK_INP)(R16), R15 + XOR R14, R15, R14 + MOVB R14, (R16)(BLK_OUT) +#endif +done: + // Save the updated counter value + P8_STXVB16X(V30, COUNTER, R0) + // Clear the keys + XXLXOR VS0, VS0, VS0 + XXLXOR VS1, VS1, VS1 + XXLXOR VS2, VS2, VS2 + XXLXOR VS3, VS3, VS3 + XXLXOR VS4, VS4, VS4 + XXLXOR VS5, VS5, VS5 + XXLXOR VS6, VS6, VS6 + XXLXOR VS7, VS7, VS7 + XXLXOR VS8, VS8, VS8 + XXLXOR VS9, VS9, VS9 + XXLXOR VS10, VS10, VS10 + XXLXOR VS11, VS11, VS11 + XXLXOR VS12, VS12, VS12 + XXLXOR VS13, VS13, VS13 + XXLXOR VS14, VS14, VS14 + RET +
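For the trailing partial block, POWER10 uses the length-controlled LXVLL/STXVLL load and store (with the byte count shifted into the high-order byte of R17), while earlier ISAs spill the encrypted counter block to a 16-byte stack buffer and XOR it with the remaining input in descending 8/4/2/1-byte chunks. A rough Go rendering of that pre-POWER10 tail, a sketch only with the hypothetical helper name xorTail, is:

package gcmsketch

import "encoding/binary"

// xorTail mirrors the MOVD/MOVWZ/MOVHZ/MOVBZ sequence at final_block:
// mask holds the encrypted counter block (the stack buffer addressed via
// MASK_PTR), and the remaining len(in) < 16 bytes are XORed in 8/4/2/1-byte
// chunks. Using the same byte order on both operands makes the XOR
// endianness-neutral, just as the native-width loads are in the asm.
func xorTail(out, in []byte, mask *[16]byte) {
	i, n := 0, len(in) // n < 16
	if n >= 8 {
		binary.LittleEndian.PutUint64(out[i:], binary.LittleEndian.Uint64(in[i:])^binary.LittleEndian.Uint64(mask[i:]))
		i, n = i+8, n-8
	}
	if n >= 4 {
		binary.LittleEndian.PutUint32(out[i:], binary.LittleEndian.Uint32(in[i:])^binary.LittleEndian.Uint32(mask[i:]))
		i, n = i+4, n-4
	}
	if n >= 2 {
		binary.LittleEndian.PutUint16(out[i:], binary.LittleEndian.Uint16(in[i:])^binary.LittleEndian.Uint16(mask[i:]))
		i, n = i+2, n-2
	}
	if n == 1 {
		out[i] = in[i] ^ mask[i]
	}
}

After the tail is handled, the routine stores the updated counter back through COUNTER and clears the round keys from VS0-VS14 before returning, as shown in the done block above.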