diff --git a/src/internal/bytealg/index_arm64.s b/src/internal/bytealg/index_arm64.s index 3a551a72da..38e0b14e75 100644 --- a/src/internal/bytealg/index_arm64.s +++ b/src/internal/bytealg/index_arm64.s @@ -5,29 +5,30 @@ #include "go_asm.h" #include "textflag.h" -TEXT ·Index(SB),NOSPLIT,$0-56 - MOVD a_base+0(FP), R0 - MOVD a_len+8(FP), R1 - MOVD b_base+24(FP), R2 - MOVD b_len+32(FP), R3 - MOVD $ret+48(FP), R9 - B indexbody<>(SB) - -TEXT ·IndexString(SB),NOSPLIT,$0-40 - MOVD a_base+0(FP), R0 - MOVD a_len+8(FP), R1 - MOVD b_base+16(FP), R2 - MOVD b_len+24(FP), R3 - MOVD $ret+32(FP), R9 - B indexbody<>(SB) - +// func Index(a, b []byte) int // input: -// R0: haystack -// R1: length of haystack -// R2: needle -// R3: length of needle (2 <= len <= 32) -// R9: address to put result -TEXT indexbody<>(SB),NOSPLIT,$0-56 +// R0: a ptr (haystack) +// R1: a len (haystack) +// R2: a cap (haystack) (unused) +// R3: b ptr (needle) +// R4: b len (needle) (2 <= len <= 32) +// R5: b cap (needle) (unused) +// return: +// R0: result +TEXT ·Index(SB),NOSPLIT,$0-56 + MOVD R3, R2 + MOVD R4, R3 + B ·IndexString(SB) + +// func IndexString(a, b string) int +// input: +// R0: a ptr (haystack) +// R1: a len (haystack) +// R2: b ptr (needle) +// R3: b len (needle) (2 <= len <= 32) +// return: +// R0: result +TEXT ·IndexString(SB),NOSPLIT,$0-40 // main idea is to load 'sep' into separate register(s) // to avoid repeatedly re-load it again and again // for sebsequent substring comparisons @@ -136,11 +137,9 @@ loop_2: BNE loop_2 found: SUB R8, R0, R0 - MOVD R0, (R9) RET not_found: MOVD $-1, R0 - MOVD R0, (R9) RET greater_8: SUB $9, R3, R11 // len(sep) - 9, offset of R0 for last 8 bytes diff --git a/src/internal/bytealg/indexbyte_arm64.s b/src/internal/bytealg/indexbyte_arm64.s index 40843fbc5b..92a61a4302 100644 --- a/src/internal/bytealg/indexbyte_arm64.s +++ b/src/internal/bytealg/indexbyte_arm64.s @@ -4,26 +4,26 @@ #include "textflag.h" -TEXT ·IndexByte(SB),NOSPLIT,$0-40 - MOVD b_base+0(FP), R0 
- MOVD b_len+8(FP), R2 - MOVBU c+24(FP), R1 - MOVD $ret+32(FP), R8 - B indexbytebody<>(SB) - -TEXT ·IndexByteString(SB),NOSPLIT,$0-32 - MOVD s_base+0(FP), R0 - MOVD s_len+8(FP), R2 - MOVBU c+16(FP), R1 - MOVD $ret+24(FP), R8 - B indexbytebody<>(SB) - +// func IndexByte(b []byte, c byte) int // input: -// R0: data -// R1: byte to search -// R2: data len -// R8: address to put result -TEXT indexbytebody<>(SB),NOSPLIT,$0 +// R0: b ptr +// R1: b len +// R2: b cap (unused) +// R3: c byte to search +// return: +// R0: result +TEXT ·IndexByte(SB),NOSPLIT,$0-40 + MOVD R3, R2 + B ·IndexByteString(SB) + +// func IndexByteString(s string, c byte) int +// input: +// R0: s ptr +// R1: s len +// R2: c byte to search +// return: +// R0: result +TEXT ·IndexByteString(SB),NOSPLIT,$0-32 // Core algorithm: // For each 32-byte chunk we calculate a 64-bit syndrome value, // with two bits per byte. For each tuple, bit 0 is set if the @@ -33,19 +33,19 @@ TEXT indexbytebody<>(SB),NOSPLIT,$0 // in the original string, counting trailing zeros allows to // identify exactly which byte has matched. - CBZ R2, fail + CBZ R1, fail MOVD R0, R11 // Magic constant 0x40100401 allows us to identify // which lane matches the requested byte. // 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24)) // Different bytes have different bit masks (i.e: 1, 4, 16, 64) MOVD $0x40100401, R5 - VMOV R1, V0.B16 + VMOV R2, V0.B16 // Work with aligned 32-byte chunks BIC $0x1f, R0, R3 VMOV R5, V5.S4 ANDS $0x1f, R0, R9 - AND $0x1f, R2, R10 + AND $0x1f, R1, R10 BEQ loop // Input string is not 32-byte aligned. We calculate the @@ -53,7 +53,7 @@ TEXT indexbytebody<>(SB),NOSPLIT,$0 // the first bytes and mask off the irrelevant part. 
VLD1.P (R3), [V1.B16, V2.B16] SUB $0x20, R9, R4 - ADDS R4, R2, R2 + ADDS R4, R1, R1 VCMEQ V0.B16, V1.B16, V3.B16 VCMEQ V0.B16, V2.B16, V4.B16 VAND V5.B16, V3.B16, V3.B16 @@ -72,7 +72,7 @@ TEXT indexbytebody<>(SB),NOSPLIT,$0 loop: VLD1.P (R3), [V1.B16, V2.B16] - SUBS $0x20, R2, R2 + SUBS $0x20, R1, R1 VCMEQ V0.B16, V1.B16, V3.B16 VCMEQ V0.B16, V2.B16, V4.B16 // If we're out of data we finish regardless of the result @@ -117,10 +117,8 @@ tail: ADD R6>>1, R3, R0 // Compute the offset result SUB R11, R0, R0 - MOVD R0, (R8) RET fail: MOVD $-1, R0 - MOVD R0, (R8) RET