internal/bytealg: optimize Index/IndexString/IndexByte/IndexByteString on arm64

Introduce ABIInternal support for Index/IndexString/IndexByte/IndexByteString

goos: linux
goarch: arm64
pkg: bytes
                              │   base.txt    │                new.txt                │
                              │      B/s      │      B/s       vs base                │
IndexByte/10                     1.090Gi ± 0%    1.313Gi ± 0%  +20.51% (p=0.000 n=10)
IndexByte/32                     3.714Gi ± 0%    4.289Gi ± 0%  +15.47% (p=0.000 n=10)
IndexByte/4K                     22.92Gi ± 0%    23.01Gi ± 0%   +0.37% (p=0.000 n=10)
IndexByte/4M                     20.23Gi ± 0%    20.35Gi ± 0%   +0.60% (p=0.000 n=10)
IndexByte/64M                    23.82Gi ± 0%    23.81Gi ± 0%   -0.01% (p=0.002 n=10)
IndexBytePortable/10             788.5Mi ± 0%    788.5Mi ± 0%        ~ (p=0.722 n=10)
IndexBytePortable/32            1002.3Mi ± 0%   1002.3Mi ± 0%        ~ (p=0.137 n=10)
IndexBytePortable/4K             1.111Gi ± 0%    1.111Gi ± 0%        ~ (p=0.692 n=10)
IndexBytePortable/4M             1.116Gi ± 0%    1.116Gi ± 0%        ~ (p=0.158 n=10)
IndexBytePortable/64M            1.116Gi ± 0%    1.116Gi ± 0%   -0.01% (p=0.000 n=10)
IndexRune/10                     352.1Mi ± 0%    445.0Mi ± 0%  +26.38% (p=0.000 n=10)
IndexRune/32                     1.101Gi ± 0%    1.391Gi ± 0%  +26.43% (p=0.000 n=10)
IndexRune/4K                     21.07Gi ± 0%    21.25Gi ± 0%   +0.82% (p=0.000 n=10)
IndexRune/4M                     23.81Gi ± 0%    23.81Gi ± 0%        ~ (p=0.218 n=10)
IndexRune/64M                    23.81Gi ± 0%    23.81Gi ± 0%        ~ (p=0.271 n=10)
IndexRuneASCII/10                1.038Gi ± 0%    1.190Gi ± 1%  +14.63% (p=0.000 n=10)
IndexRuneASCII/32                3.643Gi ± 2%    4.203Gi ± 0%  +15.38% (p=0.000 n=10)
IndexRuneASCII/4K                22.90Gi ± 0%    22.98Gi ± 0%   +0.34% (p=0.000 n=10)
IndexRuneASCII/4M                23.81Gi ± 0%    23.81Gi ± 0%        ~ (p=0.108 n=10)
IndexRuneASCII/64M               23.82Gi ± 0%    23.81Gi ± 0%        ~ (p=0.105 n=10)
IndexRuneUnicode/Latin/10        404.4Mi ± 0%    493.7Mi ± 0%  +22.10% (p=0.000 n=10)
IndexRuneUnicode/Latin/32        1.261Gi ± 0%    1.543Gi ± 0%  +22.31% (p=0.000 n=10)
IndexRuneUnicode/Latin/4K        6.966Gi ± 0%    8.115Gi ± 0%  +16.50% (p=0.000 n=10)
IndexRuneUnicode/Latin/4M        6.599Gi ± 0%    7.576Gi ± 0%  +14.80% (p=0.000 n=10)
IndexRuneUnicode/Latin/64M       6.297Gi ± 0%    7.070Gi ± 2%  +12.28% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/10     385.9Mi ± 0%    440.1Mi ± 0%  +14.03% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/32     1.206Gi ± 0%    1.375Gi ± 0%  +14.05% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/4K     2.468Gi ± 0%    2.921Gi ± 0%  +18.37% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/4M     2.386Gi ± 0%    2.845Gi ± 0%  +19.23% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/64M    2.280Gi ± 0%    2.717Gi ± 0%  +19.14% (p=0.000 n=10)
IndexRuneUnicode/Han/10          307.1Mi ± 0%    331.5Mi ± 0%   +7.94% (p=0.000 n=10)
IndexRuneUnicode/Han/32          982.2Mi ± 0%   1060.2Mi ± 0%   +7.94% (p=0.000 n=10)
IndexRuneUnicode/Han/4K          4.986Gi ± 0%    5.957Gi ± 0%  +19.48% (p=0.000 n=10)
IndexRuneUnicode/Han/4M          3.822Gi ± 0%    4.198Gi ± 0%   +9.83% (p=0.000 n=10)
IndexRuneUnicode/Han/64M         3.765Gi ± 0%    4.140Gi ± 0%   +9.96% (p=0.000 n=10)
Index/10                         634.6Mi ± 0%    635.2Mi ± 0%   +0.09% (p=0.000 n=10)
Index/32                         375.3Mi ± 0%    385.1Mi ± 0%   +2.63% (p=0.000 n=10)
Index/4K                         754.8Mi ± 0%    755.2Mi ± 0%   +0.04% (p=0.001 n=10)
Index/4M                         746.5Mi ± 0%    746.3Mi ± 0%   -0.03% (p=0.000 n=10)
Index/64M                        746.5Mi ± 0%    746.3Mi ± 0%   -0.03% (p=0.000 n=10)
IndexEasy/10                     714.6Mi ± 0%    714.6Mi ± 0%   +0.00% (p=0.001 n=10)
IndexEasy/32                     1.221Gi ± 0%    1.524Gi ± 0%  +24.81% (p=0.000 n=10)
IndexEasy/4K                     21.06Gi ± 0%    21.47Gi ± 0%   +1.91% (p=0.000 n=10)
IndexEasy/4M                     20.23Gi ± 0%    20.24Gi ± 0%        ~ (p=0.684 n=10)
IndexEasy/64M                    13.07Gi ± 0%    12.58Gi ± 4%   -3.75% (p=0.000 n=10)
IndexHard1                       1.114Gi ± 0%    1.114Gi ± 0%        ~ (p=0.193 n=10)
IndexHard2                       1.111Gi ± 0%    1.112Gi ± 0%   +0.04% (p=0.001 n=10)
IndexHard3                       1.086Gi ± 0%    1.081Gi ± 0%   -0.37% (p=0.000 n=10)
IndexHard4                       607.9Mi ± 0%    607.9Mi ± 0%        ~ (p=0.136 n=10)
geomean                          2.536Gi         2.720Gi        +7.26%

Change-Id: I1fc246783ebb215882d7144d05dbe2433dc66751
Reviewed-on: https://go-review.googlesource.com/c/go/+/662415
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Keith Randall <khr@golang.org>
This commit is contained in:
Vasily Leonenko 2025-04-02 21:31:41 +03:00 committed by Gopher Robot
parent f2e9076764
commit 5b36f61356
2 changed files with 47 additions and 50 deletions

View File

@ -5,29 +5,30 @@
#include "go_asm.h" #include "go_asm.h"
#include "textflag.h" #include "textflag.h"
TEXT ·Index(SB),NOSPLIT,$0-56 // func Index(a, b []byte) int
MOVD a_base+0(FP), R0
MOVD a_len+8(FP), R1
MOVD b_base+24(FP), R2
MOVD b_len+32(FP), R3
MOVD $ret+48(FP), R9
B indexbody<>(SB)
TEXT ·IndexString(SB),NOSPLIT,$0-40
MOVD a_base+0(FP), R0
MOVD a_len+8(FP), R1
MOVD b_base+16(FP), R2
MOVD b_len+24(FP), R3
MOVD $ret+32(FP), R9
B indexbody<>(SB)
// input: // input:
// R0: haystack // R0: a ptr (haystack)
// R1: length of haystack // R1: a len (haystack)
// R2: needle // R2: a cap (haystack) (unused)
// R3: length of needle (2 <= len <= 32) // R3: b ptr (needle)
// R9: address to put result // R4: b len (needle) (2 <= len <= 32)
TEXT indexbody<>(SB),NOSPLIT,$0-56 // R5: b cap (needle) (unused)
// return:
// R0: result
TEXT ·Index<ABIInternal>(SB),NOSPLIT,$0-56
MOVD R3, R2
MOVD R4, R3
B ·IndexString<ABIInternal>(SB)
// func IndexString(a, b string) int
// input:
// R0: a ptr (haystack)
// R1: a len (haystack)
// R2: b ptr (needle)
// R3: b len (needle) (2 <= len <= 32)
// return:
// R0: result
TEXT ·IndexString<ABIInternal>(SB),NOSPLIT,$0-40
// main idea is to load 'sep' into separate register(s) // main idea is to load 'sep' into separate register(s)
// to avoid repeatedly re-load it again and again // to avoid repeatedly re-load it again and again
// for sebsequent substring comparisons // for sebsequent substring comparisons
@ -136,11 +137,9 @@ loop_2:
BNE loop_2 BNE loop_2
found: found:
SUB R8, R0, R0 SUB R8, R0, R0
MOVD R0, (R9)
RET RET
not_found: not_found:
MOVD $-1, R0 MOVD $-1, R0
MOVD R0, (R9)
RET RET
greater_8: greater_8:
SUB $9, R3, R11 // len(sep) - 9, offset of R0 for last 8 bytes SUB $9, R3, R11 // len(sep) - 9, offset of R0 for last 8 bytes

View File

@ -4,26 +4,26 @@
#include "textflag.h" #include "textflag.h"
TEXT ·IndexByte(SB),NOSPLIT,$0-40 // func IndexByte(b []byte, c byte) int
MOVD b_base+0(FP), R0
MOVD b_len+8(FP), R2
MOVBU c+24(FP), R1
MOVD $ret+32(FP), R8
B indexbytebody<>(SB)
TEXT ·IndexByteString(SB),NOSPLIT,$0-32
MOVD s_base+0(FP), R0
MOVD s_len+8(FP), R2
MOVBU c+16(FP), R1
MOVD $ret+24(FP), R8
B indexbytebody<>(SB)
// input: // input:
// R0: data // R0: b ptr
// R1: byte to search // R1: b len
// R2: data len // R2: b cap (unused)
// R8: address to put result // R3: c byte to search
TEXT indexbytebody<>(SB),NOSPLIT,$0 // return
// R0: result
TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT,$0-40
MOVD R3, R2
B ·IndexByteString<ABIInternal>(SB)
// func IndexByteString(s string, c byte) int
// input:
// R0: s ptr
// R1: s len
// R2: c byte to search
// return
// R0: result
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
// Core algorithm: // Core algorithm:
// For each 32-byte chunk we calculate a 64-bit syndrome value, // For each 32-byte chunk we calculate a 64-bit syndrome value,
// with two bits per byte. For each tuple, bit 0 is set if the // with two bits per byte. For each tuple, bit 0 is set if the
@ -33,19 +33,19 @@ TEXT indexbytebody<>(SB),NOSPLIT,$0
// in the original string, counting trailing zeros allows to // in the original string, counting trailing zeros allows to
// identify exactly which byte has matched. // identify exactly which byte has matched.
CBZ R2, fail CBZ R1, fail
MOVD R0, R11 MOVD R0, R11
// Magic constant 0x40100401 allows us to identify // Magic constant 0x40100401 allows us to identify
// which lane matches the requested byte. // which lane matches the requested byte.
// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24)) // 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
// Different bytes have different bit masks (i.e: 1, 4, 16, 64) // Different bytes have different bit masks (i.e: 1, 4, 16, 64)
MOVD $0x40100401, R5 MOVD $0x40100401, R5
VMOV R1, V0.B16 VMOV R2, V0.B16
// Work with aligned 32-byte chunks // Work with aligned 32-byte chunks
BIC $0x1f, R0, R3 BIC $0x1f, R0, R3
VMOV R5, V5.S4 VMOV R5, V5.S4
ANDS $0x1f, R0, R9 ANDS $0x1f, R0, R9
AND $0x1f, R2, R10 AND $0x1f, R1, R10
BEQ loop BEQ loop
// Input string is not 32-byte aligned. We calculate the // Input string is not 32-byte aligned. We calculate the
@ -53,7 +53,7 @@ TEXT indexbytebody<>(SB),NOSPLIT,$0
// the first bytes and mask off the irrelevant part. // the first bytes and mask off the irrelevant part.
VLD1.P (R3), [V1.B16, V2.B16] VLD1.P (R3), [V1.B16, V2.B16]
SUB $0x20, R9, R4 SUB $0x20, R9, R4
ADDS R4, R2, R2 ADDS R4, R1, R1
VCMEQ V0.B16, V1.B16, V3.B16 VCMEQ V0.B16, V1.B16, V3.B16
VCMEQ V0.B16, V2.B16, V4.B16 VCMEQ V0.B16, V2.B16, V4.B16
VAND V5.B16, V3.B16, V3.B16 VAND V5.B16, V3.B16, V3.B16
@ -72,7 +72,7 @@ TEXT indexbytebody<>(SB),NOSPLIT,$0
loop: loop:
VLD1.P (R3), [V1.B16, V2.B16] VLD1.P (R3), [V1.B16, V2.B16]
SUBS $0x20, R2, R2 SUBS $0x20, R1, R1
VCMEQ V0.B16, V1.B16, V3.B16 VCMEQ V0.B16, V1.B16, V3.B16
VCMEQ V0.B16, V2.B16, V4.B16 VCMEQ V0.B16, V2.B16, V4.B16
// If we're out of data we finish regardless of the result // If we're out of data we finish regardless of the result
@ -117,10 +117,8 @@ tail:
ADD R6>>1, R3, R0 ADD R6>>1, R3, R0
// Compute the offset result // Compute the offset result
SUB R11, R0, R0 SUB R11, R0, R0
MOVD R0, (R8)
RET RET
fail: fail:
MOVD $-1, R0 MOVD $-1, R0
MOVD R0, (R8)
RET RET