diff --git a/src/internal/bytealg/equal_ppc64x.s b/src/internal/bytealg/equal_ppc64x.s
index 8c9443d6fd..f2c7cc10f0 100644
--- a/src/internal/bytealg/equal_ppc64x.s
+++ b/src/internal/bytealg/equal_ppc64x.s
@@ -7,6 +7,21 @@
 #include "go_asm.h"
 #include "textflag.h"
 
+// 4K (smallest case) page size offset mask for PPC64.
+#define PAGE_OFFSET 4095
+
+// TODO: At writing, ISEL and BC do not support CR bit type arguments,
+// define them here for readability.
+#define CR0LT 4*0+0
+#define CR0EQ 4*0+2
+#define CR1LT 4*1+0
+#define CR6LT 4*6+0
+
+// Likewise, the BC opcode is hard to read, and no extended
+// mnemonics are offered for these forms.
+#define BGELR_CR6 BC 4, CR6LT, (LR)
+#define BEQLR BC 12, CR0EQ, (LR)
+
 // memequal(a, b unsafe.Pointer, size uintptr) bool
 TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
 	// R3 = a
 	// R4 = b
@@ -33,66 +48,158 @@ eq:
 // On exit:
 // R3 = return value
 TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
-	MOVD	R5,CTR
-	CMP	R5,$8		// only optimize >=8
-	BLT	simplecheck
-	DCBT	(R3)		// cache hint
-	DCBT	(R4)
-	CMP	R5,$32		// optimize >= 32
-	MOVD	R5,R6		// needed if setup8a branch
-	BLT	setup8a		// 8 byte moves only
-setup32a:			// 8 byte aligned, >= 32 bytes
-	SRADCC	$5,R5,R6	// number of 32 byte chunks to compare
-	MOVD	R6,CTR
-	MOVD	$16,R14		// index for VSX loads and stores
-loop32a:
-	LXVD2X	(R3+R0), VS32	// VS32 = V0
-	LXVD2X	(R4+R0), VS33	// VS33 = V1
+	MOVD	R3, R8		// Move s1 into R8
+	ADD	R5, R3, R9	// &s1[len(s1)]
+	ADD	R5, R4, R10	// &s2[len(s2)]
+	MOVD	$1, R11
+	CMP	R5, $16		// Use GPR checks for len <= 16
+	BLE	check0_16
+	MOVD	$0, R3		// Assume no-match in case BGELR CR6 returns
+	CMP	R5, $32		// Use overlapping VSX loads for len <= 32
+	BLE	check17_32	// Do a pair of overlapping VSR compares
+	CMP	R5, $64
+	BLE	check33_64	// Hybrid check + overlap compare.
+
+setup64:
+	SRD	$6, R5, R6	// number of 64 byte chunks to compare
+	MOVD	R6, CTR
+	MOVD	$16, R14	// index for VSX loads and stores
+	MOVD	$32, R15
+	MOVD	$48, R16
+	ANDCC	$0x3F, R5, R5	// len%64==0?
+
+	PCALIGN	$32
+loop64:
+	LXVD2X	(R8+R0), V0
+	LXVD2X	(R4+R0), V1
 	VCMPEQUBCC	V0, V1, V2	// compare, setting CR6
-	BGE	CR6, noteq
-	LXVD2X	(R3+R14), VS32
-	LXVD2X	(R4+R14), VS33
-	VCMPEQUBCC	V0, V1, V2
-	BGE	CR6, noteq
-	ADD	$32,R3		// bump up to next 32
-	ADD	$32,R4
-	BC	16, 0, loop32a	// br ctr and cr
-	ANDCC	$24,R5,R6	// Any 8 byte chunks?
-	BEQ	leftover	// and result is 0
-setup8a:
-	SRADCC	$3,R6,R6	// get the 8 byte count
-	BEQ	leftover	// shifted value is 0
-	MOVD	R6,CTR
-loop8:
-	MOVD	0(R3),R6	// doublewords to compare
-	ADD	$8,R3
-	MOVD	0(R4),R7
-	ADD	$8,R4
-	CMP	R6,R7		// match?
-	BC	8,2,loop8	// bt ctr <> 0 && cr
-	BNE	noteq
-leftover:
-	ANDCC	$7,R5,R6	// check for leftover bytes
-	BEQ	equal
-	MOVD	R6,CTR
-	BR	simple
-simplecheck:
-	CMP	R5,$0
-	BEQ	equal
-simple:
-	MOVBZ	0(R3), R6
-	ADD	$1,R3
-	MOVBZ	0(R4), R7
-	ADD	$1,R4
-	CMP	R6, R7
-	BNE	noteq
-	BC	8,2,simple
-	BNE	noteq
-	BR	equal
-noteq:
-	MOVD	$0, R3
-	RET
-equal:
-	MOVD	$1, R3
+	BGELR_CR6
+	LXVD2X	(R8+R14), V0
+	LXVD2X	(R4+R14), V1
+	VCMPEQUBCC	V0, V1, V2
+	BGELR_CR6
+	LXVD2X	(R8+R15), V0
+	LXVD2X	(R4+R15), V1
+	VCMPEQUBCC	V0, V1, V2
+	BGELR_CR6
+	LXVD2X	(R8+R16), V0
+	LXVD2X	(R4+R16), V1
+	VCMPEQUBCC	V0, V1, V2
+	BGELR_CR6
+	ADD	$64,R8		// bump up to next 64
+	ADD	$64,R4
+	BDNZ	loop64
+
+	ISEL	$CR0EQ, R11, R3, R3	// If no tail, return 1, otherwise R3 remains 0.
+	BEQLR				// return if no tail.
+
+	ADD	$-64, R9, R8
+	ADD	$-64, R10, R4
+	LXVD2X	(R8+R0), V0
+	LXVD2X	(R4+R0), V1
+	VCMPEQUBCC	V0, V1, V2
+	BGELR_CR6
+	LXVD2X	(R8+R14), V0
+	LXVD2X	(R4+R14), V1
+	VCMPEQUBCC	V0, V1, V2
+	BGELR_CR6
+	LXVD2X	(R8+R15), V0
+	LXVD2X	(R4+R15), V1
+	VCMPEQUBCC	V0, V1, V2
+	BGELR_CR6
+	LXVD2X	(R8+R16), V0
+	LXVD2X	(R4+R16), V1
+	VCMPEQUBCC	V0, V1, V2
+	ISEL	$CR6LT, R11, R0, R3
 	RET
+check33_64:
+	// Bytes 0-15
+	LXVD2X	(R8+R0), V0
+	LXVD2X	(R4+R0), V1
+	VCMPEQUBCC	V0, V1, V2
+	BGELR_CR6
+	ADD	$16, R8
+	ADD	$16, R4
+
+	// Bytes 16-31
+	LXVD2X	(R8+R0), V0
+	LXVD2X	(R4+R0), V1
+	VCMPEQUBCC	V0, V1, V2
+	BGELR_CR6
+
+	// A little tricky, but point R4,R8 to &sx[len-32],
+	// and reuse check17_32 to check the next 1-31 bytes (with some overlap)
+	ADD	$-32, R9, R8
+	ADD	$-32, R10, R4
+	// Fallthrough
+
+check17_32:
+	LXVD2X	(R8+R0), V0
+	LXVD2X	(R4+R0), V1
+	VCMPEQUBCC	V0, V1, V2
+	ISEL	$CR6LT, R11, R0, R5
+
+	// Load sX[len(sX)-16:len(sX)] and compare.
+	ADD	$-16, R9
+	ADD	$-16, R10
+	LXVD2X	(R9+R0), V0
+	LXVD2X	(R10+R0), V1
+	VCMPEQUBCC	V0, V1, V2
+	ISEL	$CR6LT, R5, R0, R3
+	RET
+
+check0_16:
+	CMP	R5, $8
+	BLT	check0_7
+	// Load sX[0:7] and compare.
+	MOVD	(R8), R6
+	MOVD	(R4), R7
+	CMP	R6, R7
+	ISEL	$CR0EQ, R11, R0, R5
+	// Load sX[len(sX)-8:len(sX)] and compare.
+	MOVD	-8(R9), R6
+	MOVD	-8(R10), R7
+	CMP	R6, R7
+	ISEL	$CR0EQ, R5, R0, R3
+	RET
+
+check0_7:
+	CMP	R5,$0
+	MOVD	$1, R3
+	BEQLR		// return if len == 0
+
+	// Check < 8B loads with a single compare, but select the load address
+	// such that it cannot cross a page boundary. Load a few bytes from the
+	// lower address if that does not cross the lower page. Or, load a few
+	// extra bytes from the higher addresses. And align those values
+	// consistently in register as either address may have differing
+	// alignment requirements.
+	ANDCC	$PAGE_OFFSET, R8, R6	// &sX & PAGE_OFFSET
+	ANDCC	$PAGE_OFFSET, R4, R9
+	SUBC	R5, $8, R12		// 8-len
+	SLD	$3, R12, R14		// (8-len)*8
+	CMPU	R6, R12, CR1		// Enough bytes lower in the page to load lower?
+	CMPU	R9, R12, CR0
+	SUB	R12, R8, R6		// compute lower load address
+	SUB	R12, R4, R9
+	ISEL	$CR1LT, R8, R6, R8	// R8 = R6 < 0 ? R8 (&s1) : R6 (&s1 - (8-len))
+	ISEL	$CR0LT, R4, R9, R4	// Similar for s2
+	MOVD	(R8), R15
+	MOVD	(R4), R16
+	SLD	R14, R15, R7
+	SLD	R14, R16, R17
+	SRD	R14, R7, R7		// Clear the upper (8-len) bytes (with 2 shifts)
+	SRD	R14, R17, R17
+	SRD	R14, R15, R6		// Clear the lower (8-len) bytes
+	SRD	R14, R16, R9
+#ifdef GOARCH_ppc64le
+	ISEL	$CR1LT, R7, R6, R8	// Choose the correct len bytes to compare based on alignment
+	ISEL	$CR0LT, R17, R9, R4
+#else
+	ISEL	$CR1LT, R6, R7, R8
+	ISEL	$CR0LT, R9, R17, R4
+#endif
+	CMP	R4, R8
+	ISEL	$CR0EQ, R11, R0, R3
+	RET
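For readers following the new control flow: the setup64/loop64 path compares four 16-byte vector pairs per iteration, and when len is not a multiple of 64 it re-checks the final 64 bytes ending at &sX[len] instead of falling back to a byte loop. A Go-level sketch of that pattern (illustrative only, not the assembly; bytes.Equal stands in for the vector compares):

package main

import (
	"bytes"
	"fmt"
)

// equalBlocks sketches the loop64 strategy for len >= 64: compare full
// 64-byte blocks, then compare the last 64 bytes of each buffer so the
// tail is covered by one overlapping block rather than a byte loop.
func equalBlocks(a, b []byte) bool {
	n := len(a) // assumes len(a) == len(b) && n >= 64
	i := 0
	for ; i+64 <= n; i += 64 {
		if !bytes.Equal(a[i:i+64], b[i:i+64]) { // stand-in for the 4 VCMPEQUBCC pairs
			return false
		}
	}
	if i == n {
		return true // len%64 == 0, no tail
	}
	return bytes.Equal(a[n-64:], b[n-64:]) // overlapping tail compare
}

func main() {
	a := bytes.Repeat([]byte("0123456789abcdef"), 6) // 96 bytes
	b := append([]byte(nil), a...)
	fmt.Println(equalBlocks(a, b)) // true
	b[95] ^= 1
	fmt.Println(equalBlocks(a, b)) // false
}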
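The check17_32 and check33_64 paths avoid a scalar tail the same way: for 17-32 bytes, compare sX[0:16] against each other and sX[len-16:len] against each other; the two windows overlap, so every byte is covered by exactly two vector compares. A rough Go equivalent, a sketch assuming 16 <= len <= 32 and using 8-byte scalar loads in place of the 16-byte VSX loads:

package main

import (
	"encoding/binary"
	"fmt"
)

// equal17to32 compares the first 16 and the last 16 bytes of each buffer;
// for 17..32 byte inputs the two windows overlap and cover everything.
func equal17to32(a, b []byte) bool {
	n := len(a) // assumes len(a) == len(b) && 16 <= n <= 32
	load16 := func(s []byte) (uint64, uint64) {
		return binary.LittleEndian.Uint64(s), binary.LittleEndian.Uint64(s[8:16])
	}
	a0, a1 := load16(a)
	b0, b1 := load16(b)
	a2, a3 := load16(a[n-16:])
	b2, b3 := load16(b[n-16:])
	return a0 == b0 && a1 == b1 && a2 == b2 && a3 == b3
}

func main() {
	a := []byte("abcdefghijklmnopqrstuvwxy") // 25 bytes
	b := append([]byte(nil), a...)
	fmt.Println(equal17to32(a, b)) // true
	b[20] ^= 1
	fmt.Println(equal17to32(a, b)) // false
}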
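check0_7 is the subtle case: a 1-7 byte compare is done with a single 8-byte load per operand, and the load address is chosen so the access cannot cross a 4K page, either 8 bytes ending at &sX[len] when the page offset leaves room below, or 8 bytes starting at &sX, which stays within the page because the offset is small. Two shifts by (8-len)*8 bits then reduce either form to the same len-byte value, so one compare suffices. The shift arithmetic, sketched in Go for the little-endian layout with made-up byte values:

package main

import "fmt"

// keepLow keeps the low n bytes of an 8-byte little-endian load taken at p
// (wanted bytes in the low-order end, garbage above them).
func keepLow(w uint64, n uint) uint64 { s := (8 - n) * 8; return w << s >> s }

// keepHigh keeps the high n bytes of a load taken at p-(8-n)
// (garbage below, wanted bytes in the high-order end).
func keepHigh(w uint64, n uint) uint64 { s := (8 - n) * 8; return w >> s }

func main() {
	const n = 5 // pretend the operand is the 5 bytes 11 22 33 44 55 at address p
	loadAtP := uint64(0xccbbaa5544332211)     // bytes beyond p+n are garbage (aa bb cc)
	loadBeforeP := uint64(0x5544332211ffeedd) // bytes below p are garbage (dd ee ff)
	fmt.Printf("%#x\n", keepLow(loadAtP, n))      // 0x5544332211
	fmt.Printf("%#x\n", keepHigh(loadBeforeP, n)) // 0x5544332211
}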