diff --git a/src/internal/bytealg/index_arm64.s b/src/internal/bytealg/index_arm64.s index 3a551a72da..38e0b14e75 100644 --- a/src/internal/bytealg/index_arm64.s +++ b/src/internal/bytealg/index_arm64.s @@ -5,29 +5,30 @@ #include "go_asm.h" #include "textflag.h" -TEXT ·Index(SB),NOSPLIT,$0-56 - MOVD a_base+0(FP), R0 - MOVD a_len+8(FP), R1 - MOVD b_base+24(FP), R2 - MOVD b_len+32(FP), R3 - MOVD $ret+48(FP), R9 - B indexbody<>(SB) - -TEXT ·IndexString(SB),NOSPLIT,$0-40 - MOVD a_base+0(FP), R0 - MOVD a_len+8(FP), R1 - MOVD b_base+16(FP), R2 - MOVD b_len+24(FP), R3 - MOVD $ret+32(FP), R9 - B indexbody<>(SB) - +// func Index(a, b []byte) int // input: -// R0: haystack -// R1: length of haystack -// R2: needle -// R3: length of needle (2 <= len <= 32) -// R9: address to put result -TEXT indexbody<>(SB),NOSPLIT,$0-56 +// R0: a ptr (haystack) +// R1: a len (haystack) +// R2: a cap (haystack) (unused) +// R3: b ptr (needle) +// R4: b len (needle) (2 <= len <= 32) +// R5: b cap (needle) (unused) +// return: +// R0: result +TEXT ·Index(SB),NOSPLIT,$0-56 + MOVD R3, R2 + MOVD R4, R3 + B ·IndexString(SB) + +// func IndexString(a, b string) int +// input: +// R0: a ptr (haystack) +// R1: a len (haystack) +// R2: b ptr (needle) +// R3: b len (needle) (2 <= len <= 32) +// return: +// R0: result +TEXT ·IndexString(SB),NOSPLIT,$0-40 // main idea is to load 'sep' into separate register(s) // to avoid repeatedly re-load it again and again // for sebsequent substring comparisons @@ -136,11 +137,9 @@ loop_2: BNE loop_2 found: SUB R8, R0, R0 - MOVD R0, (R9) RET not_found: MOVD $-1, R0 - MOVD R0, (R9) RET greater_8: SUB $9, R3, R11 // len(sep) - 9, offset of R0 for last 8 bytes diff --git a/src/internal/bytealg/indexbyte_arm64.s b/src/internal/bytealg/indexbyte_arm64.s index 40843fbc5b..92a61a4302 100644 --- a/src/internal/bytealg/indexbyte_arm64.s +++ b/src/internal/bytealg/indexbyte_arm64.s @@ -4,26 +4,26 @@ #include "textflag.h" -TEXT ·IndexByte(SB),NOSPLIT,$0-40 - MOVD b_base+0(FP), R0 
- MOVD b_len+8(FP), R2 - MOVBU c+24(FP), R1 - MOVD $ret+32(FP), R8 - B indexbytebody<>(SB) - -TEXT ·IndexByteString(SB),NOSPLIT,$0-32 - MOVD s_base+0(FP), R0 - MOVD s_len+8(FP), R2 - MOVBU c+16(FP), R1 - MOVD $ret+24(FP), R8 - B indexbytebody<>(SB) - +// func IndexByte(b []byte, c byte) int // input: -// R0: data -// R1: byte to search -// R2: data len -// R8: address to put result -TEXT indexbytebody<>(SB),NOSPLIT,$0 +// R0: b ptr +// R1: b len +// R2: b cap (unused) +// R3: c byte to search +// return: +// R0: result +TEXT ·IndexByte(SB),NOSPLIT,$0-40 + MOVD R3, R2 + B ·IndexByteString(SB) + +// func IndexByteString(s string, c byte) int +// input: +// R0: s ptr +// R1: s len +// R2: c byte to search +// return: +// R0: result +TEXT ·IndexByteString(SB),NOSPLIT,$0-32 // Core algorithm: // For each 32-byte chunk we calculate a 64-bit syndrome value, // with two bits per byte. For each tuple, bit 0 is set if the @@ -33,19 +33,19 @@ TEXT indexbytebody<>(SB),NOSPLIT,$0 // in the original string, counting trailing zeros allows to // identify exactly which byte has matched. - CBZ R2, fail + CBZ R1, fail MOVD R0, R11 // Magic constant 0x40100401 allows us to identify // which lane matches the requested byte. // 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24)) // Different bytes have different bit masks (i.e: 1, 4, 16, 64) MOVD $0x40100401, R5 - VMOV R1, V0.B16 + VMOV R2, V0.B16 // Work with aligned 32-byte chunks BIC $0x1f, R0, R3 VMOV R5, V5.S4 ANDS $0x1f, R0, R9 - AND $0x1f, R2, R10 + AND $0x1f, R1, R10 BEQ loop // Input string is not 32-byte aligned. We calculate the @@ -53,7 +53,7 @@ TEXT indexbytebody<>(SB),NOSPLIT,$0 // the first bytes and mask off the irrelevant part. 
VLD1.P (R3), [V1.B16, V2.B16] SUB $0x20, R9, R4 - ADDS R4, R2, R2 + ADDS R4, R1, R1 VCMEQ V0.B16, V1.B16, V3.B16 VCMEQ V0.B16, V2.B16, V4.B16 VAND V5.B16, V3.B16, V3.B16 @@ -72,7 +72,7 @@ TEXT indexbytebody<>(SB),NOSPLIT,$0 loop: VLD1.P (R3), [V1.B16, V2.B16] - SUBS $0x20, R2, R2 + SUBS $0x20, R1, R1 VCMEQ V0.B16, V1.B16, V3.B16 VCMEQ V0.B16, V2.B16, V4.B16 // If we're out of data we finish regardless of the result @@ -117,10 +117,8 @@ tail: ADD R6>>1, R3, R0 // Compute the offset result SUB R11, R0, R0 - MOVD R0, (R8) RET fail: MOVD $-1, R0 - MOVD R0, (R8) RET