diff --git a/src/internal/bytealg/indexbyte_ppc64x.s b/src/internal/bytealg/indexbyte_ppc64x.s
index 1a6e852d67..b6714f45aa 100644
--- a/src/internal/bytealg/indexbyte_ppc64x.s
+++ b/src/internal/bytealg/indexbyte_ppc64x.s
@@ -11,381 +11,304 @@ TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
 	// R3 = byte array pointer
 	// R4 = length
 	MOVD	R6, R5		// R5 = byte
-	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
 	BR	indexbytebody<>(SB)
 
 TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32
 	// R3 = string
 	// R4 = length
 	// R5 = byte
-	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
 	BR	indexbytebody<>(SB)
 
+#ifndef GOPPC64_power9
+#ifdef GOARCH_ppc64le
+DATA indexbytevbperm<>+0(SB)/8, $0x3830282018100800
+DATA indexbytevbperm<>+8(SB)/8, $0x7870686058504840
+#else
+DATA indexbytevbperm<>+0(SB)/8, $0x0008101820283038
+DATA indexbytevbperm<>+8(SB)/8, $0x4048505860687078
+#endif
+GLOBL indexbytevbperm<>+0(SB), RODATA, $16
+#endif
+
+// Some operations are endian specific; choose the correct opcode based on GOARCH.
+// Note: _VCZBEBB is only available on POWER9 and newer.
+#ifdef GOARCH_ppc64le
+#define _LDBEX	MOVDBR
+#define _LWBEX	MOVWBR
+#define _LHBEX	MOVHBR
+#define _VCZBEBB	VCTZLSBB
+#else
+#define _LDBEX	MOVD
+#define _LWBEX	MOVW
+#define _LHBEX	MOVH
+#define _VCZBEBB	VCLZLSBB
+#endif
+
 // R3 = addr of string
 // R4 = len of string
 // R5 = byte to find
-// R16 = 1 if running on a POWER9 system, 0 otherwise
 // On exit:
 // R3 = return value
 TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
-	MOVD	R3,R17		// Save base address for calculating the index later.
-	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
-	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
-	ADD	R4,R3,R7	// Last acceptable address in R7.
+	CMPU	R4,$32
 
-	RLDIMI	$16,R5,$32,R5
-	CMPU	R4,$32		// Check if it's a small string (≤32 bytes). Those will be processed differently.
-	MOVD	$-1,R9
-	RLWNM	$3,R3,$26,$28,R6	// shift amount for mask (r3&0x7)*8
-	RLDIMI	$32,R5,$0,R5
-	MOVD	R7,R10		// Save last acceptable address in R10 for later.
-	ADD	$-1,R7,R7
-#ifdef GOARCH_ppc64le
-	SLD	R6,R9,R9	// Prepare mask for Little Endian
-#else
-	SRD	R6,R9,R9	// Same for Big Endian
+#ifndef GOPPC64_power9
+	// Load VBPERMQ constant to reduce compare into an ordered bit mask.
+	MOVD	$indexbytevbperm<>+00(SB),R16
+	LXVD2X	(R16),V0	// V0 = the VBPERMQ bit-selection constant
 #endif
-	BLT	small_string	// Jump to the small string case if it's <32 bytes.
-	CMP	R16,$1		// optimize for power8 v power9
-	BNE	power8
-	VSPLTISB	$3,V10	// Use V10 as control for VBPERMQ
+
 	MTVRD	R5,V1
-	LVSL	(R0+R0),V11	// set up the permute vector such that V10 has {0x78, .., 0x8, 0x0}
-	VSLB	V11,V10,V10	// to extract the first bit of match result into GPR
 	VSPLTB	$7,V1,V1	// Replicate byte across V1
-	CMP	R4,$64
+
+	BLT	cmp16		// Jump to the small string case if it's <32 bytes.
+
+	CMP	R4,$64,CR1
 	MOVD	$16,R11
 	MOVD	R3,R8
-	BLT	cmp32
+	BLT	CR1,cmp32	// Special case for length 32 - 63
 	MOVD	$32,R12
 	MOVD	$48,R6
+	RLDICR	$0,R4,$63-6,R9	// R9 = len &^ 63
+	ADD	R3,R9,R9	// R9 = &s[len &^ 63]
+	ANDCC	$63,R4		// R4 = len & 63 (tail length); CR0 is set from the result.
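+	// The loop below scans 64 bytes per iteration, stopping at &s[len &^ 63]
+	// (R9, computed above). The leftover tail (len & 63, left in R4 and CR0
+	// by the ANDCC above) is then finished with overlapping 16B loads.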
+
+	PCALIGN	$16
 loop64:
-	LXVB16X	(R0)(R8),V2	// scan 64 bytes at a time
+	LXVD2X	(R0)(R8),V2	// Scan 64 bytes at a time, starting at &s[0]
 	VCMPEQUBCC	V2,V1,V6
-	BNE	CR6,foundat0	// match found at R8, jump out
+	BNE	CR6,foundat0	// Match found at R8, jump out
 
-	LXVB16X	(R8)(R11),V2
+	LXVD2X	(R11)(R8),V2
 	VCMPEQUBCC	V2,V1,V6
-	BNE	CR6,foundat1	// match found at R8+16 bytes, jump out
+	BNE	CR6,foundat1	// Match found at R8+16 bytes, jump out
 
-	LXVB16X	(R8)(R12),V2
+	LXVD2X	(R12)(R8),V2
 	VCMPEQUBCC	V2,V1,V6
-	BNE	CR6,foundat2	// match found at R8+32 bytes, jump out
+	BNE	CR6,foundat2	// Match found at R8+32 bytes, jump out
 
-	LXVB16X	(R8)(R6),V2
+	LXVD2X	(R6)(R8),V2
 	VCMPEQUBCC	V2,V1,V6
-	BNE	CR6,foundat3	// match found at R8+48 bytes, jump out
+	BNE	CR6,foundat3	// Match found at R8+48 bytes, jump out
+
 	ADD	$64,R8
-	ADD	$-64,R4
-	CMP	R4,$64		// >=64 bytes left to scan?
-	BGE	loop64
-	CMP	R4,$32
-	BLT	rem		// jump to rem if there are < 32 bytes left
-cmp32:
-	LXVB16X	(R0)(R8),V2	// 32-63 bytes left
+	CMPU	R8,R9,CR1
+	BNE	CR1,loop64	// R8 != &s[len &^ 63]?
+
+	PCALIGN	$32
+	BEQ	notfound	// Is tail length 0? CR0 is set before entering loop64.
+
+	CMP	R4,$32		// If tail length >= 32, use the cmp32 path.
+	CMP	R4,$16,CR1
+	BGE	cmp32
+
+	ADD	R8,R4,R9
+	ADD	$-16,R9		// R9 = &s[len-16], base of the final overlapping load
+	BLE	CR1,cmp64_tail_gt0
+
+cmp64_tail_gt16:	// Tail length 17 - 32
+	LXVD2X	(R0)(R8),V2
 	VCMPEQUBCC	V2,V1,V6
-	BNE	CR6,foundat0	// match found at R8
+	BNE	CR6,foundat0
 
-	LXVB16X	(R11)(R8),V2
+cmp64_tail_gt0:	// Tail length 1 - 16
+	MOVD	R9,R8
+	LXVD2X	(R0)(R9),V2
 	VCMPEQUBCC	V2,V1,V6
-	BNE	CR6,foundat1	// match found at R8+16
+	BNE	CR6,foundat0
 
-	ADD	$32,R8
-	ADD	$-32,R4
-rem:
-	RLDICR	$0,R8,$60,R8	// align address to reuse code for tail end processing
-	BR	small_string
+	BR	notfound
 
+cmp32:	// Length 32 - 63
+
+	// Bytes 0 - 15
+	LXVD2X	(R0)(R8),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0
+
+	// Bytes 16 - 31
+	LXVD2X	(R8)(R11),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat1	// Match found at R8+16 bytes, jump out
+
+	BEQ	notfound	// Is the length exactly 32? (CR0 holds len cmp 32 on entry to cmp32.)
+	CMP	R4,$48
+
+	ADD	R4,R8,R9	// Compute &s[len(s)-16]
+	ADD	$32,R8,R8
+	ADD	$-16,R9,R9
+	ISEL	CR0GT,R8,R9,R8	// R8 = len(s) <= 48 ? R9 : R8
+
+	// Bytes 32 - 47
+	LXVD2X	(R0)(R8),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0	// Match found at R8, jump out
+
+	BLE	notfound
+
+	// Bytes 48 - 63
+	MOVD	R9,R8		// R9 holds the final check.
+	LXVD2X	(R0)(R9),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0	// Match found at R8, jump out
+
+	BR	notfound
+
+// If ISA 3.0 instructions are unavailable, account for the extra 16 counted by CNTLZW.
+#ifndef GOPPC64_power9
+#define ADJUST_FOR_CNTLZW	-16
+#else
+#define ADJUST_FOR_CNTLZW	0
+#endif
+
+// Now, find the offset of the 16B vector the match was discovered in. If CNTLZW is
+// used to determine the offset into the 16B vector, it overcounts by 16 (the mask
+// occupies only the low 16 bits of the word); ADJUST_FOR_CNTLZW compensates here.
 foundat3:
-	ADD	$16,R8
+	SUB	R3,R8,R3
+	ADD	$48+ADJUST_FOR_CNTLZW,R3
+	BR	vfound
 foundat2:
-	ADD	$16,R8
+	SUB	R3,R8,R3
+	ADD	$32+ADJUST_FOR_CNTLZW,R3
+	BR	vfound
 foundat1:
-	ADD	$16,R8
+	SUB	R3,R8,R3
+	ADD	$16+ADJUST_FOR_CNTLZW,R3
+	BR	vfound
 foundat0:
-	// Compress the result into a single doubleword and
-	// move it to a GPR for the final calculation.
-	VBPERMQ	V6,V10,V6
-	MFVRD	V6,R3
-	// count leading zeroes upto the match that ends up in low 16 bits
-	// in both endian modes, compute index by subtracting the number by 16
-	CNTLZW	R3,R11
-	ADD	$-16,R11
-	ADD	R8,R11,R3	// Calculate byte address
-	SUB	R17,R3
+	SUB	R3,R8,R3
+	ADD	$0+ADJUST_FOR_CNTLZW,R3
+vfound:
+	// Map equal values into a 16 bit value with earlier matches setting higher bits.
+#ifndef GOPPC64_power9
+	VBPERMQ	V6,V0,V6
+	MFVRD	V6,R4
+	CNTLZW	R4,R4
+#else
+#ifdef GOARCH_ppc64le
+	// Put the value back into LE ordering by swapping doublewords.
+	XXPERMDI	V6,V6,$2,V6
+#endif
+	_VCZBEBB	V6,R4
+#endif
+	ADD	R3,R4,R3
 	RET
 
-power8:
-	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
-	// in V0, V1 and V10, then branch to the preloop.
-	ANDCC	$63,R3,R11
-	BEQ	CR0,qw_align
-	RLDICL	$0,R3,$61,R11
-
-	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
-	CMPB	R12,R5,R3	// Check for a match.
-	AND	R9,R3,R3	// Mask bytes below s_base
-	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
-	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
-	BNE	CR7,done
-	ADD	$8,R8,R8
+cmp16:	// Length 16 - 31
+	CMPU	R4,$16
+	ADD	R4,R3,R9	// R9 = &s[len]
+	BLT	cmp8
+
+	ADD	$-16,R9,R9	// &s[len(s)-16]
+
+	// Bytes 0 - 15
+	LXVD2X	(R0)(R3),V2
+	VCMPEQUBCC	V2,V1,V6
+	MOVD	R3,R8
+	BNE	CR6,foundat0	// Match found at R8, jump out
+
+	BEQ	notfound
+
+	// Bytes 16 - 30, via an overlapping load at &s[len(s)-16]
+	MOVD	R9,R8		// R9 holds the final check.
+	LXVD2X	(R0)(R9),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0	// Match found at R8, jump out
+
+	BR	notfound
+
+cmp8:	// Length 8 - 15
+#ifdef GOPPC64_power10
+	// Load all the bytes into a single VSR in BE order.
+	SLD	$56,R4,R5
+	LXVLL	R3,R5,V2
+	// Compare and count the number which don't match.
+	VCMPEQUB	V2,V1,V6
+	VCLZLSBB	V6,R3
+	// If the count is >= the number of bytes, no match was found.
+	CMPU	R3,R4
+	MOVD	$-1,R5
+	// Otherwise, the count is the index of the first match.
+	ISEL	CR0LT,R3,R5,R3
+	RET
+#else
+	RLDIMI	$8,R5,$48,R5	// Replicate the byte across the register.
+	RLDIMI	$16,R5,$32,R5
+	RLDIMI	$32,R5,$0,R5
+	CMPU	R4,$8
+	BLT	cmp4
+	MOVD	$-8,R11
 	ADD	$-8,R4,R4
+
+	_LDBEX	(R0)(R3),R10	// Load the first 8 bytes in BE order.
+	_LDBEX	(R11)(R9),R11	// Load the last 8 bytes (overlapping read at &s[len-8]).
+	CMPB	R10,R5,R10
+	CMPB	R11,R5,R11
+	CMPU	R10,$0
+	CMPU	R11,$0,CR1
+	CNTLZD	R10,R10
+	CNTLZD	R11,R11
+	SRD	$3,R10,R3
+	SRD	$3,R11,R11
+	BNE	found
+	ADD	R4,R11,R4
 
-	// Check for quadword alignment
-	ANDCC	$15,R8,R11
-	BEQ	CR0,qw_align
-
-	// Not aligned, so handle the next doubleword
-	MOVD	0(R8),R12
-	CMPB	R12,R5,R3
-	CMPU	R3,$0,CR7
-	BNE	CR7,done
-	ADD	$8,R8,R8
-	ADD	$-8,R4,R4
-
-	// Either quadword aligned or 64-byte at this point. We can use LVX.
-qw_align:
-
-	// Set up auxiliary data for the vectorized algorithm.
-	VSPLTISB	$0,V0	// Replicate 0 across V0
-	VSPLTISB	$3,V10	// Use V10 as control for VBPERMQ
-	MTVRD	R5,V1
-	LVSL	(R0+R0),V11
-	VSLB	V11,V10,V10
-	VSPLTB	$7,V1,V1	// Replicate byte across V1
-	CMPU	R4, $64		// If len ≤ 64, don't use the vectorized loop
-	BLE	tail
-
-	// We will load 4 quardwords per iteration in the loop, so check for
-	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
-	ANDCC	$63,R8,R11
-	BEQ	CR0,preloop
-
-	// Not 64-byte aligned. Load one quadword at a time until aligned.
-	LVX	(R8+R0),V4
-	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
-	BNE	CR6,found_qw_align
-	ADD	$16,R8,R8
-	ADD	$-16,R4,R4
-
-	ANDCC	$63,R8,R11
-	BEQ	CR0,preloop
-	LVX	(R8+R0),V4
-	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
-	BNE	CR6,found_qw_align
-	ADD	$16,R8,R8
-	ADD	$-16,R4,R4
-
-	ANDCC	$63,R8,R11
-	BEQ	CR0,preloop
-	LVX	(R8+R0),V4
-	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
-	BNE	CR6,found_qw_align
-	ADD	$-16,R4,R4
-	ADD	$16,R8,R8
-
-	// 64-byte aligned. Prepare for the main loop.
-preloop:
-	CMPU	R4,$64
-	BLE	tail		// If len ≤ 64, don't use the vectorized loop
-
-	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
-	// per loop iteration. The last doubleword is in R10, so our loop counter
-	// starts at (R10-R8)/64.
-	SUB	R8,R10,R6
-	SRD	$6,R6,R9	// Loop counter in R9
-	MOVD	R9,CTR
-
-	ADD	$-64,R8,R8	// Adjust index for loop entry
-	MOVD	$16,R11		// Load offsets for the vector loads
-	MOVD	$32,R9
-	MOVD	$48,R7
-
-	// Main loop we will load 64 bytes per iteration
-loop:
-	ADD	$64,R8,R8	// Fuse addi+lvx for performance
-	LVX	(R8+R0),V2	// Load 4 16-byte vectors
-	LVX	(R8+R11),V3
-	VCMPEQUB	V1,V2,V6	// Look for byte in each vector
-	VCMPEQUB	V1,V3,V7
-
-	LVX	(R8+R9),V4
-	LVX	(R8+R7),V5
-	VCMPEQUB	V1,V4,V8
-	VCMPEQUB	V1,V5,V9
-
-	VOR	V6,V7,V11	// Compress the result in a single vector
-	VOR	V8,V9,V12
-	VOR	V11,V12,V13
-	VCMPEQUBCC	V0,V13,V14	// Check for byte
-	BGE	CR6,found
-	BC	16,0,loop	// bdnz loop
-
-	// Handle the tailing bytes or R4 ≤ 64
-	RLDICL	$0,R6,$58,R4
-	ADD	$64,R8,R8
-tail:
-	CMPU	R4,$0
-	BEQ	notfound
-	LVX	(R8+R0),V4
-	VCMPEQUBCC	V1,V4,V6
-	BNE	CR6,found_qw_align
-	ADD	$16,R8,R8
-	CMPU	R4,$16,CR6
-	BLE	CR6,notfound
-	ADD	$-16,R4,R4
-
-	LVX	(R8+R0),V4
-	VCMPEQUBCC	V1,V4,V6
-	BNE	CR6,found_qw_align
-	ADD	$16,R8,R8
-	CMPU	R4,$16,CR6
-	BLE	CR6,notfound
-	ADD	$-16,R4,R4
-
-	LVX	(R8+R0),V4
-	VCMPEQUBCC	V1,V4,V6
-	BNE	CR6,found_qw_align
-	ADD	$16,R8,R8
-	CMPU	R4,$16,CR6
-	BLE	CR6,notfound
-	ADD	$-16,R4,R4
-
-	LVX	(R8+R0),V4
-	VCMPEQUBCC	V1,V4,V6
-	BNE	CR6,found_qw_align
-
-notfound:
-	MOVD	$-1, R3
+	MOVD	$-1,R3
+	ISEL	CR1EQ,R3,R4,R3
 	RET
 
+cmp4:	// Length 4 - 7
+	CMPU	R4,$4
+	BLT	cmp2
+	MOVD	$-4,R11
+	ADD	$-4,R4,R4
+
+	_LWBEX	(R0)(R3),R10	// Load the first 4 bytes in BE order.
+	_LWBEX	(R11)(R9),R11	// Load the last 4 bytes (overlapping read at &s[len-4]).
+	CMPB	R10,R5,R10
+	CMPB	R11,R5,R11
+	CNTLZW	R10,R10
+	CNTLZW	R11,R11
+	CMPU	R10,$32
+	CMPU	R11,$32,CR1
+	SRD	$3,R10,R3
+	SRD	$3,R11,R11
+	BNE	found
+
+	ADD	R4,R11,R4
+	MOVD	$-1,R3
+	ISEL	CR1EQ,R3,R4,R3
+	RET
+
+cmp2:	// Length 2 - 3
+	CMPU	R4,$2
+	BLT	cmp1
+
+	_LHBEX	(R0)(R3),R10
+	CMPB	R10,R5,R10
+	SLDCC	$48,R10,R10	// Shift the 2B match mask to the top; CR0 != 0 means a match.
+	CNTLZD	R10,R10
+	SRD	$3,R10,R3
+	BNE	found
+
+cmp1:	// Length 1
+	MOVD	$-1,R3
+	ANDCC	$1,R4,R31	// Odd length? Then the last byte is still unchecked.
+	BEQ	found
+
+	MOVBZ	-1(R9),R10	// Load the last byte, s[len-1].
+	CMPB	R10,R5,R10
+	ANDCC	$1,R10
+	ADD	$-1,R4
+	ISEL	CR0EQ,R3,R4,R3
+
 found:
-	// We will now compress the results into a single doubleword,
-	// so it can be moved to a GPR for the final index calculation.
-
-	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
-	// first bit of each byte into bits 48-63.
-	VBPERMQ	V6,V10,V6
-	VBPERMQ	V7,V10,V7
-	VBPERMQ	V8,V10,V8
-	VBPERMQ	V9,V10,V9
-
-	// Shift each 16-bit component into its correct position for
-	// merging into a single doubleword.
-#ifdef GOARCH_ppc64le
-	VSLDOI	$2,V7,V7,V7
-	VSLDOI	$4,V8,V8,V8
-	VSLDOI	$6,V9,V9,V9
-#else
-	VSLDOI	$6,V6,V6,V6
-	VSLDOI	$4,V7,V7,V7
-	VSLDOI	$2,V8,V8,V8
+	RET
 #endif
-	// Merge V6-V9 into a single doubleword and move to a GPR.
-	VOR	V6,V7,V11
-	VOR	V8,V9,V4
-	VOR	V4,V11,V4
-	MFVRD	V4,R3
-
-#ifdef GOARCH_ppc64le
-	ADD	$-1,R3,R11
-	ANDN	R3,R11,R11
-	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
-#else
-	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
-#endif
-	ADD	R8,R11,R3	// Calculate byte address
-
-return:
-	SUB	R17, R3
+notfound:
+	MOVD	$-1,R3
 	RET
 
-found_qw_align:
-	// Use the same algorithm as above. Compress the result into
-	// a single doubleword and move it to a GPR for the final
-	// calculation.
-	VBPERMQ	V6,V10,V6
-
-#ifdef GOARCH_ppc64le
-	MFVRD	V6,R3
-	ADD	$-1,R3,R11
-	ANDN	R3,R11,R11
-	POPCNTD	R11,R11
-#else
-	VSLDOI	$6,V6,V6,V6
-	MFVRD	V6,R3
-	CNTLZD	R3,R11
-#endif
-	ADD	R8,R11,R3
-	CMPU	R11,R4
-	BLT	return
-	BR	notfound
-	PCALIGN	$16
-
-done:
-	ADD	$-1,R10,R6
-	// Offset of last index for the final
-	// doubleword comparison
-	RLDICL	$0,R6,$61,R6
-	// At this point, R3 has 0xFF in the same position as the byte we are
-	// looking for in the doubleword. Use that to calculate the exact index
-	// of the byte.
-#ifdef GOARCH_ppc64le
-	ADD	$-1,R3,R11
-	ANDN	R3,R11,R11
-	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
-#else
-	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
-#endif
-	CMPU	R8,R7		// Check if we are at the last doubleword.
-	SRD	$3,R11		// Convert trailing zeros to bytes.
-	ADD	R11,R8,R3
-	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
-	BNE	return
-	BLE	CR7,return
-	BR	notfound
-
-small_string:
-	// process string of length < 32 bytes
-	// We unroll this loop for better performance.
-	CMPU	R4,$0		// Check for length=0
-	BEQ	notfound
-
-	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
-	CMPB	R12,R5,R3	// Check for a match.
-	AND	R9,R3,R3	// Mask bytes below s_base.
-	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
-	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
-	CMPU	R8,R7
-	BNE	CR7,done
-	BEQ	notfound	// Hit length.
-
-	MOVDU	8(R8),R12
-	CMPB	R12,R5,R3
-	CMPU	R3,$0,CR6
-	CMPU	R8,R7
-	BNE	CR6,done
-	BEQ	notfound
-
-	MOVDU	8(R8),R12
-	CMPB	R12,R5,R3
-	CMPU	R3,$0,CR6
-	CMPU	R8,R7
-	BNE	CR6,done
-	BEQ	notfound
-
-	MOVDU	8(R8),R12
-	CMPB	R12,R5,R3
-	CMPU	R3,$0,CR6
-	CMPU	R8,R7
-	BNE	CR6,done
-	BEQ	notfound
-
-	MOVDU	8(R8),R12
-	CMPB	R12,R5,R3
-	CMPU	R3,$0,CR6
-	BNE	CR6,done
-	BR	notfound
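Two techniques in the new code are worth spelling out for readers who don't write PPC64 assembly. First, the GPR paths turn CMPB's mask (0xFF in every matching byte position) into an index with a count-leading-zeros divided by 8. Second, every tail path (cmp64_tail_gt0, cmp16, cmp8, cmp4) finishes with one overlapping load at &s[len-N] instead of a byte-by-byte loop; the re-read bytes are already known not to match, so the first match reported is still correct. Below is a minimal Go sketch of both ideas using 8-byte words. It is illustrative only, not part of the patch: the helper names indexByteWord/indexByte are ours, and it assumes little-endian word order via encoding/binary rather than the big-endian _LDBEX loads used above.

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// indexByteWord returns the index of the first byte equal to b in the
// little-endian word w, or -1. XORing with a broadcast of b turns matching
// bytes into 0x00; the (w-lo)&^w&hi trick then flags the zero bytes, much
// like the assembly's CMPB followed by CNTLZD.
func indexByteWord(w uint64, b byte) int {
	const lo, hi = 0x0101010101010101, 0x8080808080808080
	w ^= uint64(b) * lo // matching bytes are now 0x00
	m := (w - lo) &^ w & hi
	if m == 0 {
		return -1
	}
	return bits.TrailingZeros64(m) / 8 // lowest flagged byte = first match
}

// indexByte scans 8 bytes at a time and, like the assembly's tail paths,
// finishes with a single overlapping load at &s[len(s)-8]. Any byte the
// overlap re-reads was already checked and did not match, so the first
// match found in the final word is still the correct answer.
func indexByte(s []byte, b byte) int {
	i := 0
	for ; i+8 <= len(s); i += 8 {
		if j := indexByteWord(binary.LittleEndian.Uint64(s[i:]), b); j >= 0 {
			return i + j
		}
	}
	if i == len(s) {
		return -1
	}
	if len(s) < 8 { // too short for even one word: plain byte loop
		for ; i < len(s); i++ {
			if s[i] == b {
				return i
			}
		}
		return -1
	}
	i = len(s) - 8 // overlapping final load
	if j := indexByteWord(binary.LittleEndian.Uint64(s[i:]), b); j >= 0 {
		return i + j
	}
	return -1
}

func main() {
	fmt.Println(indexByte([]byte("hello, world"), 'w')) // 7
	fmt.Println(indexByte([]byte("abc"), 'z'))          // -1
}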