mirror of https://github.com/golang/go.git
internal/bytealg: optimize cmpbody for ppc64le/ppc64
Vectorize the cmpbody loop for inputs of 32 bytes or more on both POWER8 (LE and BE) and POWER9 (LE and BE), and improve the performance of smaller compares. Performance improves for most sizes with this change on POWER8, POWER9 and POWER10. For very small sizes (up to 8 bytes) the overhead of the function call starts to impact performance. POWER9: name old time/op new time/op delta BytesCompare/1 4.60ns ± 0% 5.49ns ± 0% +19.27% BytesCompare/2 4.68ns ± 0% 5.46ns ± 0% +16.71% BytesCompare/4 6.58ns ± 0% 5.49ns ± 0% -16.58% BytesCompare/8 4.89ns ± 0% 5.46ns ± 0% +11.64% BytesCompare/16 5.21ns ± 0% 4.96ns ± 0% -4.70% BytesCompare/32 5.09ns ± 0% 4.98ns ± 0% -2.14% BytesCompare/64 6.40ns ± 0% 5.96ns ± 0% -6.84% BytesCompare/128 11.3ns ± 0% 8.1ns ± 0% -28.09% BytesCompare/256 15.1ns ± 0% 12.8ns ± 0% -15.16% BytesCompare/512 26.5ns ± 0% 23.3ns ± 5% -12.03% BytesCompare/1024 50.2ns ± 0% 41.6ns ± 2% -17.01% BytesCompare/2048 99.3ns ± 0% 86.5ns ± 0% -12.88% Change-Id: I24f93b2910591e6829ddd8509aa6eeaa6355c609 Reviewed-on: https://go-review.googlesource.com/c/go/+/362797 Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com> Run-TryBot: Archana Ravindar <aravind5@in.ibm.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Ian Lance Taylor <iant@google.com> Reviewed-by: Than McIntosh <thanm@google.com>
This commit is contained in:
parent
1e5987635c
commit
78fb1d03d3
|
|
@ -21,11 +21,12 @@ TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
|
||||||
CMP R5,R6,CR7
|
CMP R5,R6,CR7
|
||||||
CMP R3,R4,CR6
|
CMP R3,R4,CR6
|
||||||
BEQ CR7,equal
|
BEQ CR7,equal
|
||||||
#ifdef GOARCH_ppc64le
|
MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
|
||||||
BR cmpbodyLE<>(SB)
|
CMP R16,$1
|
||||||
#else
|
BNE power8
|
||||||
BR cmpbodyBE<>(SB)
|
BR cmpbodyp9<>(SB)
|
||||||
#endif
|
power8:
|
||||||
|
BR cmpbody<>(SB)
|
||||||
equal:
|
equal:
|
||||||
BEQ CR6,done
|
BEQ CR6,done
|
||||||
MOVD $1, R8
|
MOVD $1, R8
|
||||||
|
|
@ -52,11 +53,12 @@ TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
|
||||||
CMP R5,R6,CR7
|
CMP R5,R6,CR7
|
||||||
CMP R3,R4,CR6
|
CMP R3,R4,CR6
|
||||||
BEQ CR7,equal
|
BEQ CR7,equal
|
||||||
#ifdef GOARCH_ppc64le
|
MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
|
||||||
BR cmpbodyLE<>(SB)
|
CMP R16,$1
|
||||||
#else
|
BNE power8
|
||||||
BR cmpbodyBE<>(SB)
|
BR cmpbodyp9<>(SB)
|
||||||
#endif
|
power8:
|
||||||
|
BR cmpbody<>(SB)
|
||||||
equal:
|
equal:
|
||||||
BEQ CR6,done
|
BEQ CR6,done
|
||||||
MOVD $1, R8
|
MOVD $1, R8
|
||||||
|
|
@ -70,209 +72,431 @@ done:
|
||||||
MOVD $0, R3
|
MOVD $0, R3
|
||||||
RET
|
RET
|
||||||
|
|
||||||
// Do an efficient memcmp for ppc64le
|
#ifdef GOARCH_ppc64le
|
||||||
|
DATA byteswap<>+0(SB)/8, $0x0706050403020100
|
||||||
|
DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
|
||||||
|
GLOBL byteswap<>+0(SB), RODATA, $16
|
||||||
|
#define SWAP V21
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Do an efficient memcmp for ppc64le/ppc64/POWER8
|
||||||
// R3 = a len
|
// R3 = a len
|
||||||
// R4 = b len
|
// R4 = b len
|
||||||
// R5 = a addr
|
// R5 = a addr
|
||||||
// R6 = b addr
|
// R6 = b addr
|
||||||
// On exit:
|
// On exit:
|
||||||
// R3 = return value
|
// R3 = return value
|
||||||
TEXT cmpbodyLE<>(SB),NOSPLIT|NOFRAME,$0-0
|
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
|
||||||
MOVD R3,R8 // set up length
|
MOVD R3,R8 // set up length
|
||||||
CMP R3,R4,CR2 // unequal?
|
CMP R3,R4,CR2 // unequal?
|
||||||
BC 12,8,setuplen // BLT CR2
|
BLT CR2,setuplen // BLT CR2
|
||||||
MOVD R4,R8 // use R4 for comparison len
|
MOVD R4,R8 // use R4 for comparison len
|
||||||
setuplen:
|
setuplen:
|
||||||
MOVD R8,CTR // set up loop counter
|
|
||||||
CMP R8,$8 // only optimize >=8
|
|
||||||
BLT simplecheck
|
|
||||||
DCBT (R5) // cache hint
|
|
||||||
DCBT (R6)
|
|
||||||
CMP R8,$32 // optimize >= 32
|
CMP R8,$32 // optimize >= 32
|
||||||
MOVD R8,R9
|
MOVD R8,R9
|
||||||
BLT setup8a // 8 byte moves only
|
BLT setup8a // optimize < 32
|
||||||
setup32a:
|
MOVD $16,R10 // set offsets to load into vectors
|
||||||
SRADCC $5,R8,R9 // number of 32 byte chunks
|
CMP R8,$64
|
||||||
MOVD R9,CTR
|
BLT cmp32 // process size 32-63
|
||||||
|
|
||||||
// Special processing for 32 bytes or longer.
|
DCBT (R5) // optimize >= 64
|
||||||
// Loading this way is faster and correct as long as the
|
DCBT (R6) // cache hint
|
||||||
// doublewords being compared are equal. Once they
|
MOVD $32,R11 // set offsets to load into vector
|
||||||
// are found unequal, reload them in proper byte order
|
MOVD $48,R12 // set offsets to load into vector
|
||||||
// to determine greater or less than.
|
|
||||||
loop32a:
|
loop64a:// process size 64 and greater
|
||||||
MOVD 0(R5),R9 // doublewords to compare
|
LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
|
||||||
MOVD 0(R6),R10 // get 4 doublewords
|
LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
|
||||||
MOVD 8(R5),R14
|
VCMPEQUDCC V3,V4,V1
|
||||||
MOVD 8(R6),R15
|
BGE CR6,different // jump out if its different
|
||||||
CMPU R9,R10 // bytes equal?
|
|
||||||
MOVD $0,R16 // set up for cmpne
|
LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector
|
||||||
BNE cmpne // further compare for LT or GT
|
LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector
|
||||||
MOVD 16(R5),R9 // get next pair of doublewords
|
|
||||||
MOVD 16(R6),R10
|
VCMPEQUDCC V3,V4,V1
|
||||||
CMPU R14,R15 // bytes match?
|
BGE CR6,different
|
||||||
MOVD $8,R16 // set up for cmpne
|
|
||||||
BNE cmpne // further compare for LT or GT
|
LXVD2X (R5)(R11),V3 // load bytes of A at offset 32 into vector
|
||||||
MOVD 24(R5),R14 // get next pair of doublewords
|
LXVD2X (R6)(R11),V4 // load bytes of B at offset 32 into vector
|
||||||
MOVD 24(R6),R15
|
|
||||||
CMPU R9,R10 // bytes match?
|
VCMPEQUDCC V3,V4,V1
|
||||||
MOVD $16,R16 // set up for cmpne
|
BGE CR6,different
|
||||||
BNE cmpne // further compare for LT or GT
|
|
||||||
MOVD $-8,R16 // for cmpne, R5,R6 already inc by 32
|
LXVD2X (R5)(R12),V3 // load bytes of A at offset 48 into vector
|
||||||
ADD $32,R5 // bump up to next 32
|
LXVD2X (R6)(R12),V4 // load bytes of B at offset 48 into vector
|
||||||
ADD $32,R6
|
|
||||||
CMPU R14,R15 // bytes match?
|
VCMPEQUDCC V3,V4,V1
|
||||||
BC 8,2,loop32a // br ctr and cr
|
BGE CR6,different
|
||||||
BNE cmpne
|
|
||||||
|
ADD $-64,R9,R9 // reduce remaining size by 64
|
||||||
|
ADD $64,R5,R5 // increment to next 64 bytes of A
|
||||||
|
ADD $64,R6,R6 // increment to next 64 bytes of B
|
||||||
|
CMPU R9,$64
|
||||||
|
BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining
|
||||||
|
|
||||||
|
CMPU R9,$32
|
||||||
|
BGE cmp32 // branch to cmp32 if there are 32-63 bytes remaining
|
||||||
|
CMPU R9,$0
|
||||||
|
BNE rem // loop to rem if the remainder is not 0
|
||||||
|
|
||||||
|
BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
|
||||||
|
BLT CR2,less // jump to less if len(A)<len(B)
|
||||||
|
BR greater // jump to greater otherwise
|
||||||
|
cmp32:
|
||||||
|
LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
|
||||||
|
LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
|
||||||
|
|
||||||
|
VCMPEQUDCC V3,V4,V1
|
||||||
|
BGE CR6,different
|
||||||
|
|
||||||
|
LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector
|
||||||
|
LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector
|
||||||
|
|
||||||
|
VCMPEQUDCC V3,V4,V1
|
||||||
|
BGE CR6,different
|
||||||
|
|
||||||
|
ADD $-32,R9,R9 // reduce remaining size by 32
|
||||||
|
ADD $32,R5,R5 // increment to next 32 bytes of A
|
||||||
|
ADD $32,R6,R6 // increment to next 32 bytes of B
|
||||||
|
CMPU R9,$0
|
||||||
|
BNE rem // loop to rem if the remainder is not 0
|
||||||
|
BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
|
||||||
|
BLT CR2,less // jump to less if len(A)<len(B)
|
||||||
|
BR greater // jump to greater otherwise
|
||||||
|
rem:
|
||||||
|
MOVD R9,R8
|
||||||
ANDCC $24,R8,R9 // Any 8 byte chunks?
|
ANDCC $24,R8,R9 // Any 8 byte chunks?
|
||||||
BEQ leftover // and result is 0
|
BEQ leftover // and result is 0
|
||||||
|
BR setup8a
|
||||||
|
|
||||||
|
different:
|
||||||
|
#ifdef GOARCH_ppc64le
|
||||||
|
MOVD $byteswap<>+00(SB), R16
|
||||||
|
LXVD2X (R16)(R0),SWAP // Set up swap string
|
||||||
|
|
||||||
|
VPERM V3,V3,SWAP,V3
|
||||||
|
VPERM V4,V4,SWAP,V4
|
||||||
|
#endif
|
||||||
|
MFVSRD VS35,R16 // move upper doublewords of A and B into GPR for comparison
|
||||||
|
MFVSRD VS36,R10
|
||||||
|
|
||||||
|
CMPU R16,R10
|
||||||
|
BEQ lower
|
||||||
|
BGT greater
|
||||||
|
MOVD $-1,R3 // return value if A < B
|
||||||
|
RET
|
||||||
|
lower:
|
||||||
|
VSLDOI $8,V3,V3,V3 // move lower doublewords of A and B into GPR for comparison
|
||||||
|
MFVSRD VS35,R16
|
||||||
|
VSLDOI $8,V4,V4,V4
|
||||||
|
MFVSRD VS36,R10
|
||||||
|
|
||||||
|
CMPU R16,R10
|
||||||
|
BGT greater
|
||||||
|
MOVD $-1,R3 // return value if A < B
|
||||||
|
RET
|
||||||
setup8a:
|
setup8a:
|
||||||
SRADCC $3,R9,R9 // get the 8 byte count
|
SRADCC $3,R8,R9 // get the 8 byte count
|
||||||
BEQ leftover // shifted value is 0
|
BEQ leftover // shifted value is 0
|
||||||
|
CMPU R8,$8 // optimize 8byte move
|
||||||
|
BEQ size8
|
||||||
|
CMPU R8,$16
|
||||||
|
BEQ size16
|
||||||
MOVD R9,CTR // loop count for doublewords
|
MOVD R9,CTR // loop count for doublewords
|
||||||
loop8:
|
loop8:
|
||||||
MOVDBR (R5+R0),R9 // doublewords to compare
|
#ifdef GOARCH_ppc64le
|
||||||
|
MOVDBR (R5+R0),R16 // doublewords to compare
|
||||||
MOVDBR (R6+R0),R10 // LE compare order
|
MOVDBR (R6+R0),R10 // LE compare order
|
||||||
|
#else
|
||||||
|
MOVD (R5+R0),R16 // doublewords to compare
|
||||||
|
MOVD (R6+R0),R10 // BE compare order
|
||||||
|
#endif
|
||||||
ADD $8,R5
|
ADD $8,R5
|
||||||
ADD $8,R6
|
ADD $8,R6
|
||||||
CMPU R9,R10 // match?
|
CMPU R16,R10 // match?
|
||||||
BC 8,2,loop8 // bt ctr <> 0 && cr
|
BC 8,2,loop8 // bt ctr <> 0 && cr
|
||||||
BGT greater
|
BGT greater
|
||||||
BLT less
|
BLT less
|
||||||
leftover:
|
leftover:
|
||||||
ANDCC $7,R8,R9 // check for leftover bytes
|
ANDCC $7,R8,R9 // check for leftover bytes
|
||||||
MOVD R9,CTR // save the ctr
|
BEQ zeroremainder
|
||||||
BNE simple // leftover bytes
|
|
||||||
BC 12,10,equal // test CR2 for length comparison
|
|
||||||
BC 12,8,less
|
|
||||||
BR greater
|
|
||||||
simplecheck:
|
simplecheck:
|
||||||
CMP R8,$0 // remaining compare length 0
|
MOVD R0,R14
|
||||||
BNE simple // do simple compare
|
CMP R9,$4 // process 4 bytes
|
||||||
BC 12,10,equal // test CR2 for length comparison
|
BLT halfword
|
||||||
BC 12,8,less // 1st len < 2nd len, result less
|
#ifdef GOARCH_ppc64le
|
||||||
BR greater // 1st len > 2nd len must be greater
|
MOVWBR (R5)(R14),R10
|
||||||
simple:
|
MOVWBR (R6)(R14),R11
|
||||||
MOVBZ 0(R5), R9 // get byte from 1st operand
|
#else
|
||||||
ADD $1,R5
|
MOVWZ (R5)(R14),R10
|
||||||
MOVBZ 0(R6), R10 // get byte from 2nd operand
|
MOVWZ (R6)(R14),R11
|
||||||
ADD $1,R6
|
#endif
|
||||||
CMPU R9, R10
|
CMPU R10,R11
|
||||||
BC 8,2,simple // bc ctr <> 0 && cr
|
BGT greater
|
||||||
BGT greater // 1st > 2nd
|
BLT less
|
||||||
BLT less // 1st < 2nd
|
ADD $-4,R9
|
||||||
BC 12,10,equal // test CR2 for length comparison
|
ADD $4,R14
|
||||||
BC 12,9,greater // 2nd len > 1st len
|
PCALIGN $16
|
||||||
BR less // must be less
|
|
||||||
cmpne: // only here is not equal
|
halfword:
|
||||||
MOVDBR (R5+R16),R8 // reload in reverse order
|
CMP R9,$2 // process 2 bytes
|
||||||
MOVDBR (R6+R16),R9
|
BLT byte
|
||||||
CMPU R8,R9 // compare correct endianness
|
#ifdef GOARCH_ppc64le
|
||||||
BGT greater // here only if NE
|
MOVHBR (R5)(R14),R10
|
||||||
less:
|
MOVHBR (R6)(R14),R11
|
||||||
MOVD $-1, R3 // return value if A < B
|
#else
|
||||||
|
MOVHZ (R5)(R14),R10
|
||||||
|
MOVHZ (R6)(R14),R11
|
||||||
|
#endif
|
||||||
|
CMPU R10,R11
|
||||||
|
BGT greater
|
||||||
|
BLT less
|
||||||
|
ADD $-2,R9
|
||||||
|
ADD $2,R14
|
||||||
|
PCALIGN $16
|
||||||
|
byte:
|
||||||
|
CMP R9,$0 // process 1 byte
|
||||||
|
BEQ skip
|
||||||
|
MOVBZ (R5)(R14),R10
|
||||||
|
MOVBZ (R6)(R14),R11
|
||||||
|
CMPU R10,R11
|
||||||
|
BGT greater
|
||||||
|
BLT less
|
||||||
|
PCALIGN $16
|
||||||
|
skip:
|
||||||
|
BEQ CR2,equal
|
||||||
|
BGT CR2,greater
|
||||||
|
|
||||||
|
less: MOVD $-1,R3 // return value if A < B
|
||||||
RET
|
RET
|
||||||
|
size16:
|
||||||
|
LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
|
||||||
|
LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
|
||||||
|
VCMPEQUDCC V3,V4,V1
|
||||||
|
BGE CR6,different
|
||||||
|
zeroremainder:
|
||||||
|
BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
|
||||||
|
BLT CR2,less // jump to less if len(A)<len(B)
|
||||||
|
BR greater // jump to greater otherwise
|
||||||
|
size8:
|
||||||
|
#ifdef GOARCH_ppc64le
|
||||||
|
MOVDBR (R5+R0),R16 // doublewords to compare
|
||||||
|
MOVDBR (R6+R0),R10 // LE compare order
|
||||||
|
#else
|
||||||
|
MOVD (R5+R0),R16 // doublewords to compare
|
||||||
|
MOVD (R6+R0),R10 // BE compare order
|
||||||
|
#endif
|
||||||
|
CMPU R16,R10 // match?
|
||||||
|
BGT greater
|
||||||
|
BLT less
|
||||||
|
BGT CR2,greater // len(A) > len(B)
|
||||||
|
BLT CR2,less // len(A) < len(B)
|
||||||
equal:
|
equal:
|
||||||
MOVD $0, R3 // return value if A == B
|
MOVD $0, R3 // return value if A == B
|
||||||
RET
|
RET
|
||||||
greater:
|
greater:
|
||||||
MOVD $1, R3 // return value if A > B
|
MOVD $1,R3 // return value if A > B
|
||||||
RET
|
RET
|
||||||
|
|
||||||
// Do an efficient memcmp for ppc64 (BE)
|
// Do an efficient memcmp for ppc64le/ppc64/POWER9
|
||||||
// R3 = a len
|
// R3 = a len
|
||||||
// R4 = b len
|
// R4 = b len
|
||||||
// R5 = a addr
|
// R5 = a addr
|
||||||
// R6 = b addr
|
// R6 = b addr
|
||||||
// On exit:
|
// On exit:
|
||||||
// R3 = return value
|
// R3 = return value
|
||||||
TEXT cmpbodyBE<>(SB),NOSPLIT|NOFRAME,$0-0
|
TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0
|
||||||
MOVD R3,R8 // set up length
|
MOVD R3,R8 // set up length
|
||||||
CMP R3,R4,CR2 // unequal?
|
CMP R3,R4,CR2 // unequal?
|
||||||
BC 12,8,setuplen // BLT CR2
|
BLT CR2,setuplen // BLT CR2
|
||||||
MOVD R4,R8 // use R4 for comparison len
|
MOVD R4,R8 // use R4 for comparison len
|
||||||
setuplen:
|
setuplen:
|
||||||
MOVD R8,CTR // set up loop counter
|
CMP R8,$16 // optimize for size<16
|
||||||
CMP R8,$8 // only optimize >=8
|
|
||||||
BLT simplecheck
|
|
||||||
DCBT (R5) // cache hint
|
|
||||||
DCBT (R6)
|
|
||||||
CMP R8,$32 // optimize >= 32
|
|
||||||
MOVD R8,R9
|
MOVD R8,R9
|
||||||
BLT setup8a // 8 byte moves only
|
BLT simplecheck
|
||||||
|
MOVD $16,R10 // set offsets to load into vectors
|
||||||
|
CMP R8,$32 // optimize for size 16-31
|
||||||
|
BLT cmp16
|
||||||
|
CMP R8,$64
|
||||||
|
BLT cmp32 // optimize for size 32-63
|
||||||
|
DCBT (R5) // optimize for size>=64
|
||||||
|
DCBT (R6) // cache hint
|
||||||
|
|
||||||
setup32a:
|
MOVD $32,R11 // set offsets to load into vector
|
||||||
SRADCC $5,R8,R9 // number of 32 byte chunks
|
MOVD $48,R12 // set offsets to load into vector
|
||||||
MOVD R9,CTR
|
|
||||||
loop32a:
|
loop64a:// process size 64 and greater
|
||||||
MOVD 0(R5),R9 // doublewords to compare
|
LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector
|
||||||
MOVD 0(R6),R10 // get 4 doublewords
|
LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector
|
||||||
MOVD 8(R5),R14
|
VCMPNEBCC V3,V4,V1 // record comparison into V1
|
||||||
MOVD 8(R6),R15
|
BNE CR6,different // jump out if its different
|
||||||
CMPU R9,R10 // bytes equal?
|
|
||||||
BLT less // found to be less
|
LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector
|
||||||
BGT greater // found to be greater
|
LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector
|
||||||
MOVD 16(R5),R9 // get next pair of doublewords
|
VCMPNEBCC V3,V4,V1
|
||||||
MOVD 16(R6),R10
|
BNE CR6,different
|
||||||
CMPU R14,R15 // bytes match?
|
|
||||||
BLT less // found less
|
LXVB16X (R11)(R5),V3 // load bytes of A at offset 32 into vector
|
||||||
BGT greater // found greater
|
LXVB16X (R11)(R6),V4 // load bytes of B at offset 32 into vector
|
||||||
MOVD 24(R5),R14 // get next pair of doublewords
|
VCMPNEBCC V3,V4,V1
|
||||||
MOVD 24(R6),R15
|
BNE CR6,different
|
||||||
CMPU R9,R10 // bytes match?
|
|
||||||
BLT less // found to be less
|
LXVB16X (R12)(R5),V3 // load bytes of A at offset 48 into vector
|
||||||
BGT greater // found to be greater
|
LXVB16X (R12)(R6),V4 // load bytes of B at offset 48 into vector
|
||||||
ADD $32,R5 // bump up to next 32
|
VCMPNEBCC V3,V4,V1
|
||||||
ADD $32,R6
|
BNE CR6,different
|
||||||
CMPU R14,R15 // bytes match?
|
|
||||||
BC 8,2,loop32a // br ctr and cr
|
ADD $-64,R9,R9 // reduce remaining size by 64
|
||||||
BLT less // with BE, byte ordering is
|
ADD $64,R5,R5 // increment to next 64 bytes of A
|
||||||
BGT greater // good for compare
|
ADD $64,R6,R6 // increment to next 64 bytes of B
|
||||||
ANDCC $24,R8,R9 // Any 8 byte chunks?
|
CMPU R9,$64
|
||||||
BEQ leftover // and result is 0
|
BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining
|
||||||
setup8a:
|
|
||||||
SRADCC $3,R9,R9 // get the 8 byte count
|
CMPU R9,$32
|
||||||
BEQ leftover // shifted value is 0
|
BGE cmp32 // branch to cmp32 if there are 32-63 bytes remaining
|
||||||
MOVD R9,CTR // loop count for doublewords
|
CMPU R9,$16
|
||||||
loop8:
|
BGE cmp16 // loop to cmp16 if there are 16-31 bytes left
|
||||||
MOVD (R5),R9
|
CMPU R9,$0
|
||||||
MOVD (R6),R10
|
BNE simplecheck // loop to simplecheck for remaining bytes
|
||||||
ADD $8,R5
|
|
||||||
ADD $8,R6
|
BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
|
||||||
CMPU R9,R10 // match?
|
BLT CR2,less // jump to less if len(A)<len(B)
|
||||||
BC 8,2,loop8 // bt ctr <> 0 && cr
|
BR greater // jump to greater otherwise
|
||||||
|
cmp32:
|
||||||
|
LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector
|
||||||
|
LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector
|
||||||
|
|
||||||
|
VCMPNEBCC V3,V4,V1 // record comparison into V1
|
||||||
|
BNE CR6,different // jump out if its different
|
||||||
|
|
||||||
|
LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector
|
||||||
|
LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector
|
||||||
|
VCMPNEBCC V3,V4,V1
|
||||||
|
BNE CR6,different
|
||||||
|
|
||||||
|
ADD $-32,R9,R9 // reduce remaining size by 32
|
||||||
|
ADD $32,R5,R5 // increment to next 32 bytes of A
|
||||||
|
ADD $32,R6,R6 // increment to next 32 bytes of B
|
||||||
|
CMPU R9,$16 // loop to cmp16 if there are 16-31 bytes left
|
||||||
|
BGE cmp16
|
||||||
|
CMPU R9,$0
|
||||||
|
BNE simplecheck // loop to simplecheck for remainder bytes
|
||||||
|
BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
|
||||||
|
BLT CR2,less // jump to less if len(A)<len(B)
|
||||||
|
BR greater // jump to greater otherwise
|
||||||
|
different:
|
||||||
|
|
||||||
|
MFVSRD VS35,R16 // move upper doublewords of A and B into GPR for comparison
|
||||||
|
MFVSRD VS36,R10
|
||||||
|
|
||||||
|
CMPU R16,R10
|
||||||
|
BEQ lower
|
||||||
|
BGT greater
|
||||||
|
MOVD $-1,R3 // return value if A < B
|
||||||
|
RET
|
||||||
|
lower:
|
||||||
|
MFVSRLD VS35,R16 // next move lower doublewords of A and B into GPR for comparison
|
||||||
|
MFVSRLD VS36,R10
|
||||||
|
|
||||||
|
CMPU R16,R10
|
||||||
|
BGT greater
|
||||||
|
MOVD $-1,R3 // return value if A < B
|
||||||
|
RET
|
||||||
|
|
||||||
|
greater:
|
||||||
|
MOVD $1,R3 // return value if A > B
|
||||||
|
RET
|
||||||
|
cmp16:
|
||||||
|
ANDCC $16,R9,R31
|
||||||
|
BEQ tail
|
||||||
|
|
||||||
|
LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector
|
||||||
|
LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector
|
||||||
|
VCMPEQUDCC V3,V4,V1
|
||||||
|
BGE CR6,different
|
||||||
|
|
||||||
|
ADD $16,R5
|
||||||
|
ADD $16,R6
|
||||||
|
tail:
|
||||||
|
ANDCC $15,R9 // Load the last 16 bytes (we know there are at least 32b)
|
||||||
|
BEQ end
|
||||||
|
|
||||||
|
ADD R9,R5
|
||||||
|
ADD R9,R6
|
||||||
|
MOVD $-16,R10
|
||||||
|
|
||||||
|
LXVB16X (R10)(R5),V3 // load bytes of A at offset -16 into vector
|
||||||
|
LXVB16X (R10)(R6),V4 // load bytes of B at offset -16 into vector
|
||||||
|
VCMPEQUDCC V3,V4,V1
|
||||||
|
BGE CR6,different
|
||||||
|
end:
|
||||||
|
BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
|
||||||
|
BLT CR2,less // jump to less if BLT CR2 that is, len(A)<len(B)
|
||||||
|
BR greater // jump to greater otherwise
|
||||||
|
simplecheck:
|
||||||
|
MOVD $0,R14 // process 8 bytes
|
||||||
|
CMP R9,$8
|
||||||
|
BLT word
|
||||||
|
#ifdef GOARCH_ppc64le
|
||||||
|
MOVDBR (R5+R14),R10
|
||||||
|
MOVDBR (R6+R14),R11
|
||||||
|
#else
|
||||||
|
MOVD (R5+R14),R10
|
||||||
|
MOVD (R6+R14),R11
|
||||||
|
#endif
|
||||||
|
CMPU R10,R11
|
||||||
BGT greater
|
BGT greater
|
||||||
BLT less
|
BLT less
|
||||||
leftover:
|
ADD $8,R14
|
||||||
ANDCC $7,R8,R9 // check for leftover bytes
|
ADD $-8,R9
|
||||||
MOVD R9,CTR // save the ctr
|
PCALIGN $16
|
||||||
BNE simple // leftover bytes
|
word:
|
||||||
BC 12,10,equal // test CR2 for length comparison
|
CMP R9,$4 // process 4 bytes
|
||||||
BC 12,8,less
|
BLT halfword
|
||||||
BR greater
|
#ifdef GOARCH_ppc64le
|
||||||
simplecheck:
|
MOVWBR (R5+R14),R10
|
||||||
CMP R8,$0 // remaining compare length 0
|
MOVWBR (R6+R14),R11
|
||||||
BNE simple // do simple compare
|
#else
|
||||||
BC 12,10,equal // test CR2 for length comparison
|
MOVWZ (R5+R14),R10
|
||||||
BC 12,8,less // 1st len < 2nd len, result less
|
MOVWZ (R6+R14),R11
|
||||||
BR greater // same len, must be equal
|
#endif
|
||||||
simple:
|
CMPU R10,R11
|
||||||
MOVBZ 0(R5),R9 // get byte from 1st operand
|
BGT greater
|
||||||
ADD $1,R5
|
BLT less
|
||||||
MOVBZ 0(R6),R10 // get byte from 2nd operand
|
ADD $4,R14
|
||||||
ADD $1,R6
|
ADD $-4,R9
|
||||||
CMPU R9,R10
|
PCALIGN $16
|
||||||
BC 8,2,simple // bc ctr <> 0 && cr
|
halfword:
|
||||||
BGT greater // 1st > 2nd
|
CMP R9,$2 // process 2 bytes
|
||||||
BLT less // 1st < 2nd
|
BLT byte
|
||||||
BC 12,10,equal // test CR2 for length comparison
|
#ifdef GOARCH_ppc64le
|
||||||
BC 12,9,greater // 2nd len > 1st len
|
MOVHBR (R5+R14),R10
|
||||||
|
MOVHBR (R6+R14),R11
|
||||||
|
#else
|
||||||
|
MOVHZ (R5+R14),R10
|
||||||
|
MOVHZ (R6+R14),R11
|
||||||
|
#endif
|
||||||
|
CMPU R10,R11
|
||||||
|
BGT greater
|
||||||
|
BLT less
|
||||||
|
ADD $2,R14
|
||||||
|
ADD $-2,R9
|
||||||
|
PCALIGN $16
|
||||||
|
byte:
|
||||||
|
CMP R9,$0 // process 1 byte
|
||||||
|
BEQ skip
|
||||||
|
MOVBZ (R5+R14),R10
|
||||||
|
MOVBZ (R6+R14),R11
|
||||||
|
CMPU R10,R11
|
||||||
|
BGT greater
|
||||||
|
BLT less
|
||||||
|
PCALIGN $16
|
||||||
|
skip:
|
||||||
|
BEQ CR2,equal
|
||||||
|
BGT CR2,greater
|
||||||
less:
|
less:
|
||||||
MOVD $-1, R3 // return value if A < B
|
MOVD $-1,R3 // return value if A < B
|
||||||
RET
|
RET
|
||||||
equal:
|
equal:
|
||||||
MOVD $0, R3 // return value if A == B
|
MOVD $0, R3 // return value if A == B
|
||||||
RET
|
RET
|
||||||
greater:
|
|
||||||
MOVD $1, R3 // return value if A > B
|
|
||||||
RET
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue