mirror of https://github.com/golang/go.git
170 lines
3.7 KiB
ArmAsm
170 lines
3.7 KiB
ArmAsm
// Copyright 2018 The Go Authors. All rights reserved.
|
||
// Use of this source code is governed by a BSD-style
|
||
// license that can be found in the LICENSE file.
|
||
|
||
#include "go_asm.h"
|
||
#include "textflag.h"
|
||
|
||
// ·IndexByte reads the slice base, slice length and sought byte from its
// frame, then jumps to indexbytebody<> with the address of the result slot
// in R8. The four loads are independent of one another.
TEXT ·IndexByte(SB), NOSPLIT, $0-40
	LEAQ	ret+32(FP), R8	// R8 = where the result is stored
	MOVB	c+24(FP), AL	// AL = byte sought
	MOVQ	b_len+8(FP), BX	// BX = data len
	MOVQ	b_base+0(FP), SI	// SI = data
	JMP	indexbytebody<>(SB)
|
||
|
||
// ·IndexByteString is the string-argument entry point: it reads the string
// base, string length and sought byte from its frame, then jumps to
// indexbytebody<> with the address of the result slot in R8.
TEXT ·IndexByteString(SB), NOSPLIT, $0-32
	MOVB	c+16(FP), AL	// AL = byte sought
	LEAQ	ret+24(FP), R8	// R8 = where the result is stored
	MOVQ	s_base+0(FP), SI	// SI = data
	MOVQ	s_len+8(FP), BX	// BX = data len
	JMP	indexbytebody<>(SB)
|
||
|
||
// Provide direct access to these functions from other packages.
|
||
// This is the equivalent of doing:
|
||
// package bytes
|
||
// func IndexByte(b []byte, c byte) int {
|
||
// return bytealg.IndexByte(b, c)
|
||
// }
|
||
// but involves no call overhead.
|
||
// TODO: remove this hack when midstack inlining is enabled?
|
||
TEXT bytes·IndexByte(SB), NOSPLIT, $0-40
	// Set up the register arguments expected by indexbytebody<>.
	MOVQ	b_len+8(FP), BX	// BX = data len
	MOVQ	b_base+0(FP), SI	// SI = data
	LEAQ	ret+32(FP), R8	// R8 = where the result is stored
	MOVB	c+24(FP), AL	// AL = byte sought
	JMP	indexbytebody<>(SB)
|
||
|
||
// strings·IndexByte is defined under the strings package's symbol name and
// takes a string argument: it loads the string base, length and sought byte,
// then jumps to indexbytebody<> with the result address in R8.
TEXT strings·IndexByte(SB), NOSPLIT, $0-32
	LEAQ	ret+24(FP), R8	// R8 = where the result is stored
	MOVQ	s_base+0(FP), SI	// SI = data
	MOVQ	s_len+8(FP), BX	// BX = data len
	MOVB	c+16(FP), AL	// AL = byte sought
	JMP	indexbytebody<>(SB)
|
||
|
||
// indexbytebody<> is the shared search routine that the IndexByte entry
// points in this file jump to. It stores, through R8, the index of the first
// byte in the data equal to AL, or -1 if no byte matches.
//
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
//
// Registers written: AX, CX, DX, DI, R11, X0, X1, Y1-Y3 and the flags.
TEXT indexbytebody<>(SB), NOSPLIT, $0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD	AX, X0
	PUNPCKLBW	X0, X0
	PUNPCKLBW	X0, X0
	PSHUFL	$0, X0, X0

	// Fewer than 16 bytes cannot fill a whole 16-byte chunk.
	CMPQ	BX, $16
	JLT	small

	MOVQ	SI, DI	// DI = address of the next chunk to search

	// More than 32 bytes: try the 32-byte AVX2 loop.
	CMPQ	BX, $32
	JA	avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB	X1, DX
	// Find first set bit, if any.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB	X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

failure:
	MOVQ	$-1, (R8)
	RET

// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
	SUBQ	SI, DI	// Compute offset of chunk within data.
	ADDQ	DX, DI	// Add offset of byte within chunk.
	MOVQ	DI, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	failure	// Empty data never matches.

	// Check if we'll load across a page boundary.
	// The 16-byte load below reads past the end of the data, so it is only
	// used when bits 4-11 of SI+16 are not all zero; otherwise the load is
	// taken from the end of the data instead.
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1	// Load data
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX	// Move result bits to integer register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	CMPL	DX, BX
	JAE	failure	// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	// Load the 16 bytes that end exactly at the end of the data, so the
	// data occupies the top BX bytes of X1.
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB	X1, DX	// Move result bits to integer register.
	// Shift left by len, then right by 16: a net right shift of 16-len that
	// discards the mask bits belonging to bytes before the data.
	MOVL	BX, CX
	SHLL	CX, DX
	SHRL	$16, DX	// Shift desired bits down to bottom of register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	MOVQ	DX, (R8)
	RET

avx2:
	// Without the AVX2 feature flag, fall back to the SSE loop.
	CMPB	internal∕cpu·X86+const_x86_HasAVX2(SB), $1
	JNE	sse
	// AL still holds the sought byte here (AX is only overwritten on the
	// sse and small paths); broadcast it to all 32 bytes of Y1.
	MOVD	AX, X0
	LEAQ	-32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB	X0, Y1
avx2_loop:
	VMOVDQU	(DI), Y2	// Load the next 32-byte chunk.
	VPCMPEQB	Y1, Y2, Y3	// Y3 = per-byte equality mask.
	VPTEST	Y3, Y3	// ZF is clear if any byte matched.
	JNZ	avx2success
	ADDQ	$32, DI
	CMPQ	DI, R11
	// DI and R11 are addresses, so compare them unsigned, as the SSE loop does.
	JB	avx2_loop

	// Search the last 32-byte chunk. It may overlap chunks already searched.
	MOVQ	R11, DI
	VMOVDQU	(DI), Y2
	VPCMPEQB	Y1, Y2, Y3
	VPTEST	Y3, Y3
	JNZ	avx2success
	VZEROUPPER	// Zero the upper halves of the YMM registers before returning.
	MOVQ	$-1, (R8)
	RET

// A 32-byte chunk loaded from DI contains the byte; Y3 holds the match mask.
avx2success:
	VPMOVMSKB	Y3, DX	// One mask bit per byte of the chunk.
	BSFL	DX, DX	// DX = index of first match within the chunk.
	SUBQ	SI, DI	// DI = offset of chunk within data.
	ADDQ	DI, DX	// DX = index of match within data.
	MOVQ	DX, (R8)
	VZEROUPPER	// Zero the upper halves of the YMM registers before returning.
	RET
|