diff --git a/src/internal/bytealg/indexbyte_ppc64x.s b/src/internal/bytealg/indexbyte_ppc64x.s
index 1a6e852d67..b6714f45aa 100644
--- a/src/internal/bytealg/indexbyte_ppc64x.s
+++ b/src/internal/bytealg/indexbyte_ppc64x.s
@@ -11,381 +11,304 @@ TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
 	// R3 = byte array pointer
 	// R4 = length
 	MOVD	R6, R5		// R5 = byte
-	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
 	BR	indexbytebody<>(SB)
 
 TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32
 	// R3 = string
 	// R4 = length
 	// R5 = byte
-	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
 	BR	indexbytebody<>(SB)
 
+#ifndef GOPPC64_power9
+#ifdef GOARCH_ppc64le
+DATA indexbytevbperm<>+0(SB)/8, $0x3830282018100800
+DATA indexbytevbperm<>+8(SB)/8, $0x7870686058504840
+#else
+DATA indexbytevbperm<>+0(SB)/8, $0x0008101820283038
+DATA indexbytevbperm<>+8(SB)/8, $0x4048505860687078
+#endif
+GLOBL indexbytevbperm<>+0(SB), RODATA, $16
+#endif
+
+// Some operations are endian specific; choose the correct opcode based on GOARCH.
+// Note: _VCZBEBB is only available on POWER9 and newer.
+#ifdef GOARCH_ppc64le
+#define _LDBEX	MOVDBR
+#define _LWBEX	MOVWBR
+#define _LHBEX	MOVHBR
+#define _VCZBEBB	VCTZLSBB
+#else
+#define _LDBEX	MOVD
+#define _LWBEX	MOVW
+#define _LHBEX	MOVH
+#define _VCZBEBB	VCLZLSBB
+#endif
+
 // R3 = addr of string
 // R4 = len of string
 // R5 = byte to find
-// R16 = 1 if running on a POWER9 system, 0 otherwise
 // On exit:
 // R3 = return value
 TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
-	MOVD	R3,R17		// Save base address for calculating the index later.
-	RLDICR	$0,R3,$60,R8	// Align address to doubleword boundary in R8.
-	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
-	ADD	R4,R3,R7	// Last acceptable address in R7.
+	CMPU	R4,$32
 
-	RLDIMI	$16,R5,$32,R5
-	CMPU	R4,$32		// Check if it's a small string (≤32 bytes). Those will be processed differently.
-	MOVD	$-1,R9
-	RLWNM	$3,R3,$26,$28,R6	// shift amount for mask (r3&0x7)*8
-	RLDIMI	$32,R5,$0,R5
-	MOVD	R7,R10		// Save last acceptable address in R10 for later.
-	ADD	$-1,R7,R7
-#ifdef GOARCH_ppc64le
-	SLD	R6,R9,R9	// Prepare mask for Little Endian
-#else
-	SRD	R6,R9,R9	// Same for Big Endian
+#ifndef GOPPC64_power9
+	// Load VBPERMQ constant to reduce compare into an ordered bit mask.
+	MOVD	$indexbytevbperm<>+00(SB),R16
+	LXVD2X	(R16),V0	// V0 = the VBPERMQ bit-selection constant
 #endif
-	BLT	small_string	// Jump to the small string case if it's <32 bytes.
-	CMP	R16,$1		// optimize for power8 v power9
-	BNE	power8
-	VSPLTISB	$3,V10	// Use V10 as control for VBPERMQ
+
 	MTVRD	R5,V1
-	LVSL	(R0+R0),V11	// set up the permute vector such that V10 has {0x78, .., 0x8, 0x0}
-	VSLB	V11,V10,V10	// to extract the first bit of match result into GPR
 	VSPLTB	$7,V1,V1	// Replicate byte across V1
-	CMP	R4,$64
+
+	BLT	cmp16		// Jump to the small string case if it's <32 bytes.
+
+	CMP	R4,$64,CR1
 	MOVD	$16,R11
 	MOVD	R3,R8
-	BLT	cmp32
+	BLT	CR1,cmp32	// Special case for length 32 - 63
 	MOVD	$32,R12
 	MOVD	$48,R6
+	RLDICR	$0,R4,$63-6,R9	// R9 = len &^ 63
+	ADD	R3,R9,R9	// R9 = &s[len &^ 63]
+	ANDCC	$63,R4		// R4 = len & 63 (tail length); CR0 is set from the result.
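+	// The loop below scans 64 bytes per iteration, stopping at &s[len &^ 63]
+	// (R9, computed above). The leftover tail (len & 63, left in R4 and CR0
+	// by the ANDCC above) is then finished with overlapping 16B loads.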
+
+	PCALIGN	$16
 loop64:
-	LXVB16X	(R0)(R8),V2	// scan 64 bytes at a time
+	LXVD2X	(R0)(R8),V2	// Scan 64 bytes at a time, starting at &s[0]
 	VCMPEQUBCC	V2,V1,V6
-	BNE	CR6,foundat0	// match found at R8, jump out
+	BNE	CR6,foundat0	// Match found at R8, jump out
 
-	LXVB16X	(R8)(R11),V2
+	LXVD2X	(R11)(R8),V2
 	VCMPEQUBCC	V2,V1,V6
-	BNE	CR6,foundat1	// match found at R8+16 bytes, jump out
+	BNE	CR6,foundat1	// Match found at R8+16 bytes, jump out
 
-	LXVB16X	(R8)(R12),V2
+	LXVD2X	(R12)(R8),V2
 	VCMPEQUBCC	V2,V1,V6
-	BNE	CR6,foundat2	// match found at R8+32 bytes, jump out
+	BNE	CR6,foundat2	// Match found at R8+32 bytes, jump out
 
-	LXVB16X	(R8)(R6),V2
+	LXVD2X	(R6)(R8),V2
 	VCMPEQUBCC	V2,V1,V6
-	BNE	CR6,foundat3	// match found at R8+48 bytes, jump out
+	BNE	CR6,foundat3	// Match found at R8+48 bytes, jump out
+
 	ADD	$64,R8
-	ADD	$-64,R4
-	CMP	R4,$64		// >=64 bytes left to scan?
-	BGE	loop64
-	CMP	R4,$32
-	BLT	rem		// jump to rem if there are < 32 bytes left
-cmp32:
-	LXVB16X	(R0)(R8),V2	// 32-63 bytes left
+	CMPU	R8,R9,CR1
+	BNE	CR1,loop64	// R8 != &s[len &^ 63]?
+
+	PCALIGN	$32
+	BEQ	notfound	// Is tail length 0? CR0 is set before entering loop64.
+
+	CMP	R4,$32		// If tail length >= 32, use the cmp32 path.
+	CMP	R4,$16,CR1
+	BGE	cmp32
+
+	ADD	R8,R4,R9
+	ADD	$-16,R9		// R9 = &s[len-16], base of the final overlapping load
+	BLE	CR1,cmp64_tail_gt0
+
+cmp64_tail_gt16:	// Tail length 17 - 32
+	LXVD2X	(R0)(R8),V2
 	VCMPEQUBCC	V2,V1,V6
-	BNE	CR6,foundat0	// match found at R8
+	BNE	CR6,foundat0
 
-	LXVB16X	(R11)(R8),V2
+cmp64_tail_gt0:	// Tail length 1 - 16
+	MOVD	R9,R8
+	LXVD2X	(R0)(R9),V2
 	VCMPEQUBCC	V2,V1,V6
-	BNE	CR6,foundat1	// match found at R8+16
+	BNE	CR6,foundat0
 
-	ADD	$32,R8
-	ADD	$-32,R4
-rem:
-	RLDICR	$0,R8,$60,R8	// align address to reuse code for tail end processing
-	BR	small_string
+	BR	notfound
 
+cmp32:	// Length 32 - 63
+
+	// Bytes 0 - 15
+	LXVD2X	(R0)(R8),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0
+
+	// Bytes 16 - 31
+	LXVD2X	(R8)(R11),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat1	// Match found at R8+16 bytes, jump out
+
+	BEQ	notfound	// Is the length exactly 32? (CR0 holds len cmp 32 on entry to cmp32.)
+	CMP	R4,$48
+
+	ADD	R4,R8,R9	// Compute &s[len(s)-16]
+	ADD	$32,R8,R8
+	ADD	$-16,R9,R9
+	ISEL	CR0GT,R8,R9,R8	// R8 = len(s) <= 48 ? R9 : R8
+
+	// Bytes 32 - 47
+	LXVD2X	(R0)(R8),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0	// Match found at R8, jump out
+
+	BLE	notfound
+
+	// Bytes 48 - 63
+	MOVD	R9,R8		// R9 holds the final check.
+	LXVD2X	(R0)(R9),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0	// Match found at R8, jump out
+
+	BR	notfound
+
+// If ISA 3.0 instructions are unavailable, account for the extra 16 counted by CNTLZW.
+#ifndef GOPPC64_power9
+#define ADJUST_FOR_CNTLZW	-16
+#else
+#define ADJUST_FOR_CNTLZW	0
+#endif
+
+// Now, find the offset of the 16B vector the match was discovered in. If CNTLZW is
+// used to determine the offset into the 16B vector, it overcounts by 16 (the mask
+// occupies only the low 16 bits of the word); ADJUST_FOR_CNTLZW compensates here.
 foundat3:
-	ADD	$16,R8
+	SUB	R3,R8,R3
+	ADD	$48+ADJUST_FOR_CNTLZW,R3
+	BR	vfound
 foundat2:
-	ADD	$16,R8
+	SUB	R3,R8,R3
+	ADD	$32+ADJUST_FOR_CNTLZW,R3
+	BR	vfound
 foundat1:
-	ADD	$16,R8
+	SUB	R3,R8,R3
+	ADD	$16+ADJUST_FOR_CNTLZW,R3
+	BR	vfound
 foundat0:
-	// Compress the result into a single doubleword and
-	// move it to a GPR for the final calculation.
-	VBPERMQ	V6,V10,V6
-	MFVRD	V6,R3
-	// count leading zeroes upto the match that ends up in low 16 bits
-	// in both endian modes, compute index by subtracting the number by 16
-	CNTLZW	R3,R11
-	ADD	$-16,R11
-	ADD	R8,R11,R3	// Calculate byte address
-	SUB	R17,R3
+	SUB	R3,R8,R3
+	ADD	$0+ADJUST_FOR_CNTLZW,R3
+vfound:
+	// Map equal values into a 16 bit value with earlier matches setting higher bits.
+#ifndef GOPPC64_power9
+	VBPERMQ	V6,V0,V6
+	MFVRD	V6,R4
+	CNTLZW	R4,R4
+#else
+#ifdef GOARCH_ppc64le
+	// Put the value back into LE ordering by swapping doublewords.
+	XXPERMDI	V6,V6,$2,V6
+#endif
+	_VCZBEBB	V6,R4
+#endif
+	ADD	R3,R4,R3
 	RET
 
-power8:
-	// If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
-	// in V0, V1 and V10, then branch to the preloop.
-	ANDCC	$63,R3,R11
-	BEQ	CR0,qw_align
-	RLDICL	$0,R3,$61,R11
-
-	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
-	CMPB	R12,R5,R3	// Check for a match.
-	AND	R9,R3,R3	// Mask bytes below s_base
-	RLDICR	$0,R7,$60,R7	// Last doubleword in R7
-	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation
-	BNE	CR7,done
-	ADD	$8,R8,R8
+cmp16:	// Length 16 - 31
+	CMPU	R4,$16
+	ADD	R4,R3,R9	// R9 = &s[len]
+	BLT	cmp8
+
+	ADD	$-16,R9,R9	// &s[len(s)-16]
+
+	// Bytes 0 - 15
+	LXVD2X	(R0)(R3),V2
+	VCMPEQUBCC	V2,V1,V6
+	MOVD	R3,R8
+	BNE	CR6,foundat0	// Match found at R8, jump out
+
+	BEQ	notfound
+
+	// Bytes 16 - 30, via an overlapping load at &s[len(s)-16]
+	MOVD	R9,R8		// R9 holds the final check.
+	LXVD2X	(R0)(R9),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0	// Match found at R8, jump out
+
+	BR	notfound
+
+cmp8:	// Length 8 - 15
+#ifdef GOPPC64_power10
+	// Load all the bytes into a single VSR in BE order.
+	SLD	$56,R4,R5
+	LXVLL	R3,R5,V2
+	// Compare and count the number which don't match.
+	VCMPEQUB	V2,V1,V6
+	VCLZLSBB	V6,R3
+	// If the count is >= the number of bytes, no match was found.
+	CMPU	R3,R4
+	MOVD	$-1,R5
+	// Otherwise, the count is the index of the first match.
+	ISEL	CR0LT,R3,R5,R3
+	RET
+#else
+	RLDIMI	$8,R5,$48,R5	// Replicate the byte across the register.
+	RLDIMI	$16,R5,$32,R5
+	RLDIMI	$32,R5,$0,R5
+	CMPU	R4,$8
+	BLT	cmp4
+	MOVD	$-8,R11
 	ADD	$-8,R4,R4
+
+	_LDBEX	(R0)(R3),R10	// Load the first 8 bytes in BE order.
+	_LDBEX	(R11)(R9),R11	// Load the last 8 bytes (overlapping read at &s[len-8]).
+	CMPB	R10,R5,R10
+	CMPB	R11,R5,R11
+	CMPU	R10,$0
+	CMPU	R11,$0,CR1
+	CNTLZD	R10,R10
+	CNTLZD	R11,R11
+	SRD	$3,R10,R3
+	SRD	$3,R11,R11
+	BNE	found
+	ADD	R4,R11,R4
 
-	// Check for quadword alignment
-	ANDCC	$15,R8,R11
-	BEQ	CR0,qw_align
-
-	// Not aligned, so handle the next doubleword
-	MOVD	0(R8),R12
-	CMPB	R12,R5,R3
-	CMPU	R3,$0,CR7
-	BNE	CR7,done
-	ADD	$8,R8,R8
-	ADD	$-8,R4,R4
-
-	// Either quadword aligned or 64-byte at this point. We can use LVX.
-qw_align:
-
-	// Set up auxiliary data for the vectorized algorithm.
-	VSPLTISB	$0,V0	// Replicate 0 across V0
-	VSPLTISB	$3,V10	// Use V10 as control for VBPERMQ
-	MTVRD	R5,V1
-	LVSL	(R0+R0),V11
-	VSLB	V11,V10,V10
-	VSPLTB	$7,V1,V1	// Replicate byte across V1
-	CMPU	R4, $64		// If len ≤ 64, don't use the vectorized loop
-	BLE	tail
-
-	// We will load 4 quardwords per iteration in the loop, so check for
-	// 64-byte alignment. If 64-byte aligned, then branch to the preloop.
-	ANDCC	$63,R8,R11
-	BEQ	CR0,preloop
-
-	// Not 64-byte aligned. Load one quadword at a time until aligned.
-	LVX	(R8+R0),V4
-	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
-	BNE	CR6,found_qw_align
-	ADD	$16,R8,R8
-	ADD	$-16,R4,R4
-
-	ANDCC	$63,R8,R11
-	BEQ	CR0,preloop
-	LVX	(R8+R0),V4
-	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
-	BNE	CR6,found_qw_align
-	ADD	$16,R8,R8
-	ADD	$-16,R4,R4
-
-	ANDCC	$63,R8,R11
-	BEQ	CR0,preloop
-	LVX	(R8+R0),V4
-	VCMPEQUBCC	V1,V4,V6	// Check for byte in V4
-	BNE	CR6,found_qw_align
-	ADD	$-16,R4,R4
-	ADD	$16,R8,R8
-
-	// 64-byte aligned. Prepare for the main loop.
-preloop:
-	CMPU	R4,$64
-	BLE	tail		// If len ≤ 64, don't use the vectorized loop
-
-	// We are now aligned to a 64-byte boundary. We will load 4 quadwords
-	// per loop iteration. The last doubleword is in R10, so our loop counter
-	// starts at (R10-R8)/64.
-	SUB	R8,R10,R6
-	SRD	$6,R6,R9	// Loop counter in R9
-	MOVD	R9,CTR
-
-	ADD	$-64,R8,R8	// Adjust index for loop entry
-	MOVD	$16,R11		// Load offsets for the vector loads
-	MOVD	$32,R9
-	MOVD	$48,R7
-
-	// Main loop we will load 64 bytes per iteration
-loop:
-	ADD	$64,R8,R8	// Fuse addi+lvx for performance
-	LVX	(R8+R0),V2	// Load 4 16-byte vectors
-	LVX	(R8+R11),V3
-	VCMPEQUB	V1,V2,V6	// Look for byte in each vector
-	VCMPEQUB	V1,V3,V7
-
-	LVX	(R8+R9),V4
-	LVX	(R8+R7),V5
-	VCMPEQUB	V1,V4,V8
-	VCMPEQUB	V1,V5,V9
-
-	VOR	V6,V7,V11	// Compress the result in a single vector
-	VOR	V8,V9,V12
-	VOR	V11,V12,V13
-	VCMPEQUBCC	V0,V13,V14	// Check for byte
-	BGE	CR6,found
-	BC	16,0,loop	// bdnz loop
-
-	// Handle the tailing bytes or R4 ≤ 64
-	RLDICL	$0,R6,$58,R4
-	ADD	$64,R8,R8
-tail:
-	CMPU	R4,$0
-	BEQ	notfound
-	LVX	(R8+R0),V4
-	VCMPEQUBCC	V1,V4,V6
-	BNE	CR6,found_qw_align
-	ADD	$16,R8,R8
-	CMPU	R4,$16,CR6
-	BLE	CR6,notfound
-	ADD	$-16,R4,R4
-
-	LVX	(R8+R0),V4
-	VCMPEQUBCC	V1,V4,V6
-	BNE	CR6,found_qw_align
-	ADD	$16,R8,R8
-	CMPU	R4,$16,CR6
-	BLE	CR6,notfound
-	ADD	$-16,R4,R4
-
-	LVX	(R8+R0),V4
-	VCMPEQUBCC	V1,V4,V6
-	BNE	CR6,found_qw_align
-	ADD	$16,R8,R8
-	CMPU	R4,$16,CR6
-	BLE	CR6,notfound
-	ADD	$-16,R4,R4
-
-	LVX	(R8+R0),V4
-	VCMPEQUBCC	V1,V4,V6
-	BNE	CR6,found_qw_align
-
-notfound:
-	MOVD	$-1, R3
+	MOVD	$-1,R3
+	ISEL	CR1EQ,R3,R4,R3
 	RET
 
+cmp4:	// Length 4 - 7
+	CMPU	R4,$4
+	BLT	cmp2
+	MOVD	$-4,R11
+	ADD	$-4,R4,R4
+
+	_LWBEX	(R0)(R3),R10	// Load the first 4 bytes in BE order.
+	_LWBEX	(R11)(R9),R11	// Load the last 4 bytes (overlapping read at &s[len-4]).
+	CMPB	R10,R5,R10
+	CMPB	R11,R5,R11
+	CNTLZW	R10,R10
+	CNTLZW	R11,R11
+	CMPU	R10,$32
+	CMPU	R11,$32,CR1
+	SRD	$3,R10,R3
+	SRD	$3,R11,R11
+	BNE	found
+
+	ADD	R4,R11,R4
+	MOVD	$-1,R3
+	ISEL	CR1EQ,R3,R4,R3
+	RET
+
+cmp2:	// Length 2 - 3
+	CMPU	R4,$2
+	BLT	cmp1
+
+	_LHBEX	(R0)(R3),R10
+	CMPB	R10,R5,R10
+	SLDCC	$48,R10,R10	// Shift the 2B match mask to the top; CR0 != 0 means a match.
+	CNTLZD	R10,R10
+	SRD	$3,R10,R3
+	BNE	found
+
+cmp1:	// Length 1
+	MOVD	$-1,R3
+	ANDCC	$1,R4,R31	// Odd length? Then the last byte is still unchecked.
+	BEQ	found
+
+	MOVBZ	-1(R9),R10	// Load the last byte, s[len-1].
+	CMPB	R10,R5,R10
+	ANDCC	$1,R10
+	ADD	$-1,R4
+	ISEL	CR0EQ,R3,R4,R3
+
 found:
-	// We will now compress the results into a single doubleword,
-	// so it can be moved to a GPR for the final index calculation.
-
-	// The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
-	// first bit of each byte into bits 48-63.
-	VBPERMQ	V6,V10,V6
-	VBPERMQ	V7,V10,V7
-	VBPERMQ	V8,V10,V8
-	VBPERMQ	V9,V10,V9
-
-	// Shift each 16-bit component into its correct position for
-	// merging into a single doubleword.
-#ifdef GOARCH_ppc64le
-	VSLDOI	$2,V7,V7,V7
-	VSLDOI	$4,V8,V8,V8
-	VSLDOI	$6,V9,V9,V9
-#else
-	VSLDOI	$6,V6,V6,V6
-	VSLDOI	$4,V7,V7,V7
-	VSLDOI	$2,V8,V8,V8
+	RET
 #endif
-	// Merge V6-V9 into a single doubleword and move to a GPR.
-	VOR	V6,V7,V11
-	VOR	V8,V9,V4
-	VOR	V4,V11,V4
-	MFVRD	V4,R3
-
-#ifdef GOARCH_ppc64le
-	ADD	$-1,R3,R11
-	ANDN	R3,R11,R11
-	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
-#else
-	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
-#endif
-	ADD	R8,R11,R3	// Calculate byte address
-
-return:
-	SUB	R17, R3
+notfound:
+	MOVD	$-1,R3
 	RET
 
-found_qw_align:
-	// Use the same algorithm as above. Compress the result into
-	// a single doubleword and move it to a GPR for the final
-	// calculation.
-	VBPERMQ	V6,V10,V6
-
-#ifdef GOARCH_ppc64le
-	MFVRD	V6,R3
-	ADD	$-1,R3,R11
-	ANDN	R3,R11,R11
-	POPCNTD	R11,R11
-#else
-	VSLDOI	$6,V6,V6,V6
-	MFVRD	V6,R3
-	CNTLZD	R3,R11
-#endif
-	ADD	R8,R11,R3
-	CMPU	R11,R4
-	BLT	return
-	BR	notfound
-	PCALIGN	$16
-
-done:
-	ADD	$-1,R10,R6
-	// Offset of last index for the final
-	// doubleword comparison
-	RLDICL	$0,R6,$61,R6
-	// At this point, R3 has 0xFF in the same position as the byte we are
-	// looking for in the doubleword. Use that to calculate the exact index
-	// of the byte.
-#ifdef GOARCH_ppc64le
-	ADD	$-1,R3,R11
-	ANDN	R3,R11,R11
-	POPCNTD	R11,R11		// Count trailing zeros (Little Endian).
-#else
-	CNTLZD	R3,R11		// Count leading zeros (Big Endian).
-#endif
-	CMPU	R8,R7		// Check if we are at the last doubleword.
-	SRD	$3,R11		// Convert trailing zeros to bytes.
-	ADD	R11,R8,R3
-	CMPU	R11,R6,CR7	// If at the last doubleword, check the byte offset.
-	BNE	return
-	BLE	CR7,return
-	BR	notfound
-
-small_string:
-	// process string of length < 32 bytes
-	// We unroll this loop for better performance.
-	CMPU	R4,$0		// Check for length=0
-	BEQ	notfound
-
-	MOVD	0(R8),R12	// Load one doubleword from the aligned address in R8.
-	CMPB	R12,R5,R3	// Check for a match.
-	AND	R9,R3,R3	// Mask bytes below s_base.
-	CMPU	R3,$0,CR7	// If we have a match, jump to the final computation.
-	RLDICR	$0,R7,$60,R7	// Last doubleword in R7.
-	CMPU	R8,R7
-	BNE	CR7,done
-	BEQ	notfound	// Hit length.
-
-	MOVDU	8(R8),R12
-	CMPB	R12,R5,R3
-	CMPU	R3,$0,CR6
-	CMPU	R8,R7
-	BNE	CR6,done
-	BEQ	notfound
-
-	MOVDU	8(R8),R12
-	CMPB	R12,R5,R3
-	CMPU	R3,$0,CR6
-	CMPU	R8,R7
-	BNE	CR6,done
-	BEQ	notfound
-
-	MOVDU	8(R8),R12
-	CMPB	R12,R5,R3
-	CMPU	R3,$0,CR6
-	CMPU	R8,R7
-	BNE	CR6,done
-	BEQ	notfound
-
-	MOVDU	8(R8),R12
-	CMPB	R12,R5,R3
-	CMPU	R3,$0,CR6
-	BNE	CR6,done
-	BR	notfound
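Two techniques in the new code are worth spelling out for readers who don't write PPC64 assembly. First, the GPR paths turn CMPB's mask (0xFF in every matching byte position) into an index with a count-leading-zeros divided by 8. Second, every tail path (cmp64_tail_gt0, cmp16, cmp8, cmp4) finishes with one overlapping load at &s[len-N] instead of a byte-by-byte loop; the re-read bytes are already known not to match, so the first match reported is still correct. Below is a minimal Go sketch of both ideas using 8-byte words. It is illustrative only, not part of the patch: the helper names indexByteWord/indexByte are ours, and it assumes little-endian word order via encoding/binary rather than the big-endian _LDBEX loads used above.

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// indexByteWord returns the index of the first byte equal to b in the
// little-endian word w, or -1. XORing with a broadcast of b turns matching
// bytes into 0x00; the (w-lo)&^w&hi trick then flags the zero bytes, much
// like the assembly's CMPB followed by CNTLZD.
func indexByteWord(w uint64, b byte) int {
	const lo, hi = 0x0101010101010101, 0x8080808080808080
	w ^= uint64(b) * lo // matching bytes are now 0x00
	m := (w - lo) &^ w & hi
	if m == 0 {
		return -1
	}
	return bits.TrailingZeros64(m) / 8 // lowest flagged byte = first match
}

// indexByte scans 8 bytes at a time and, like the assembly's tail paths,
// finishes with a single overlapping load at &s[len(s)-8]. Any byte the
// overlap re-reads was already checked and did not match, so the first
// match found in the final word is still the correct answer.
func indexByte(s []byte, b byte) int {
	i := 0
	for ; i+8 <= len(s); i += 8 {
		if j := indexByteWord(binary.LittleEndian.Uint64(s[i:]), b); j >= 0 {
			return i + j
		}
	}
	if i == len(s) {
		return -1
	}
	if len(s) < 8 { // too short for even one word: plain byte loop
		for ; i < len(s); i++ {
			if s[i] == b {
				return i
			}
		}
		return -1
	}
	i = len(s) - 8 // overlapping final load
	if j := indexByteWord(binary.LittleEndian.Uint64(s[i:]), b); j >= 0 {
		return i + j
	}
	return -1
}

func main() {
	fmt.Println(indexByte([]byte("hello, world"), 'w')) // 7
	fmt.Println(indexByte([]byte("abc"), 'z'))          // -1
}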