internal/bytealg: optimize cmpbody for ppc64le/ppc64

Vectorize the cmpbody loop for inputs of 32 bytes or more on both
POWER8 (LE and BE) and POWER9 (LE and BE), and improve the
performance of smaller compares.

Performance improves for most sizes with this change on POWER8, POWER9
and POWER10. For the very small sizes (up to 8) the overhead of
calling the function starts to impact performance.

POWER9:
name               old time/op  new time/op  delta
BytesCompare/1     4.60ns ± 0%  5.49ns ± 0%  +19.27%
BytesCompare/2     4.68ns ± 0%  5.46ns ± 0%  +16.71%
BytesCompare/4     6.58ns ± 0%  5.49ns ± 0%  -16.58%
BytesCompare/8     4.89ns ± 0%  5.46ns ± 0%  +11.64%
BytesCompare/16    5.21ns ± 0%  4.96ns ± 0%   -4.70%
BytesCompare/32    5.09ns ± 0%  4.98ns ± 0%   -2.14%
BytesCompare/64    6.40ns ± 0%  5.96ns ± 0%   -6.84%
BytesCompare/128   11.3ns ± 0%   8.1ns ± 0%  -28.09%
BytesCompare/256   15.1ns ± 0%  12.8ns ± 0%  -15.16%
BytesCompare/512   26.5ns ± 0%  23.3ns ± 5%  -12.03%
BytesCompare/1024  50.2ns ± 0%  41.6ns ± 2%  -17.01%
BytesCompare/2048  99.3ns ± 0%  86.5ns ± 0%  -12.88%

Change-Id: I24f93b2910591e6829ddd8509aa6eeaa6355c609
Reviewed-on: https://go-review.googlesource.com/c/go/+/362797
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Archana Ravindar <aravind5@in.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Reviewed-by: Than McIntosh <thanm@google.com>
This commit is contained in:
Archana R 2021-11-10 01:18:42 -06:00 committed by Lynn Boger
parent 1e5987635c
commit 78fb1d03d3
1 changed files with 386 additions and 162 deletions

View File

@ -21,11 +21,12 @@ TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
CMP R5,R6,CR7 CMP R5,R6,CR7
CMP R3,R4,CR6 CMP R3,R4,CR6
BEQ CR7,equal BEQ CR7,equal
#ifdef GOARCH_ppc64le MOVBZ internalcpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
BR cmpbodyLE<>(SB) CMP R16,$1
#else BNE power8
BR cmpbodyBE<>(SB) BR cmpbodyp9<>(SB)
#endif power8:
BR cmpbody<>(SB)
equal: equal:
BEQ CR6,done BEQ CR6,done
MOVD $1, R8 MOVD $1, R8
@ -52,11 +53,12 @@ TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
CMP R5,R6,CR7 CMP R5,R6,CR7
CMP R3,R4,CR6 CMP R3,R4,CR6
BEQ CR7,equal BEQ CR7,equal
#ifdef GOARCH_ppc64le MOVBZ internalcpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
BR cmpbodyLE<>(SB) CMP R16,$1
#else BNE power8
BR cmpbodyBE<>(SB) BR cmpbodyp9<>(SB)
#endif power8:
BR cmpbody<>(SB)
equal: equal:
BEQ CR6,done BEQ CR6,done
MOVD $1, R8 MOVD $1, R8
@ -70,209 +72,431 @@ done:
MOVD $0, R3 MOVD $0, R3
RET RET
// Do an efficient memcmp for ppc64le #ifdef GOARCH_ppc64le
DATA byteswap<>+0(SB)/8, $0x0706050403020100
DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
GLOBL byteswap<>+0(SB), RODATA, $16
#define SWAP V21
#endif
// Do an efficient memcmp for ppc64le/ppc64/POWER8
// R3 = a len // R3 = a len
// R4 = b len // R4 = b len
// R5 = a addr // R5 = a addr
// R6 = b addr // R6 = b addr
// On exit: // On exit:
// R3 = return value // R3 = return value
TEXT cmpbodyLE<>(SB),NOSPLIT|NOFRAME,$0-0 TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
MOVD R3,R8 // set up length MOVD R3,R8 // set up length
CMP R3,R4,CR2 // unequal? CMP R3,R4,CR2 // unequal?
BC 12,8,setuplen // BLT CR2 BLT CR2,setuplen // BLT CR2
MOVD R4,R8 // use R4 for comparison len MOVD R4,R8 // use R4 for comparison len
setuplen: setuplen:
MOVD R8,CTR // set up loop counter
CMP R8,$8 // only optimize >=8
BLT simplecheck
DCBT (R5) // cache hint
DCBT (R6)
CMP R8,$32 // optimize >= 32 CMP R8,$32 // optimize >= 32
MOVD R8,R9 MOVD R8,R9
BLT setup8a // 8 byte moves only BLT setup8a // optimize < 32
setup32a: MOVD $16,R10 // set offsets to load into vectors
SRADCC $5,R8,R9 // number of 32 byte chunks CMP R8,$64
MOVD R9,CTR BLT cmp32 // process size 32-63
// Special processing for 32 bytes or longer. DCBT (R5) // optimize >= 64
// Loading this way is faster and correct as long as the DCBT (R6) // cache hint
// doublewords being compared are equal. Once they MOVD $32,R11 // set offsets to load into vector
// are found unequal, reload them in proper byte order MOVD $48,R12 // set offsets to load into vector
// to determine greater or less than.
loop32a: loop64a:// process size 64 and greater
MOVD 0(R5),R9 // doublewords to compare LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
MOVD 0(R6),R10 // get 4 doublewords LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
MOVD 8(R5),R14 VCMPEQUDCC V3,V4,V1
MOVD 8(R6),R15 BGE CR6,different // jump out if its different
CMPU R9,R10 // bytes equal?
MOVD $0,R16 // set up for cmpne LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector
BNE cmpne // further compare for LT or GT LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector
MOVD 16(R5),R9 // get next pair of doublewords
MOVD 16(R6),R10 VCMPEQUDCC V3,V4,V1
CMPU R14,R15 // bytes match? BGE CR6,different
MOVD $8,R16 // set up for cmpne
BNE cmpne // further compare for LT or GT LXVD2X (R5)(R11),V3 // load bytes of A at offset 32 into vector
MOVD 24(R5),R14 // get next pair of doublewords LXVD2X (R6)(R11),V4 // load bytes of B at offset 32 into vector
MOVD 24(R6),R15
CMPU R9,R10 // bytes match? VCMPEQUDCC V3,V4,V1
MOVD $16,R16 // set up for cmpne BGE CR6,different
BNE cmpne // further compare for LT or GT
MOVD $-8,R16 // for cmpne, R5,R6 already inc by 32 LXVD2X (R5)(R12),V3 // load bytes of A at offset 64 into vector
ADD $32,R5 // bump up to next 32 LXVD2X (R6)(R12),V4 // load bytes of B at offset 64 into vector
ADD $32,R6
CMPU R14,R15 // bytes match? VCMPEQUDCC V3,V4,V1
BC 8,2,loop32a // br ctr and cr BGE CR6,different
BNE cmpne
ADD $-64,R9,R9 // reduce remaining size by 64
ADD $64,R5,R5 // increment to next 64 bytes of A
ADD $64,R6,R6 // increment to next 64 bytes of B
CMPU R9,$64
BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining
CMPU R9,$32
BGE cmp32 // loop to cmp32 if there are 32-64 bytes remaining
CMPU R9,$0
BNE rem // loop to rem if the remainder is not 0
BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
BLT CR2,less // jump to less if len(A)<len(B)
BR greater // jump to greater otherwise
cmp32:
LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
VCMPEQUDCC V3,V4,V1
BGE CR6,different
LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector
LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector
VCMPEQUDCC V3,V4,V1
BGE CR6,different
ADD $-32,R9,R9 // reduce remaining size by 32
ADD $32,R5,R5 // increment to next 32 bytes of A
ADD $32,R6,R6 // increment to next 32 bytes of B
CMPU R9,$0
BNE rem // loop to rem if the remainder is not 0
BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
BLT CR2,less // jump to less if len(A)<len(B)
BR greater // jump to greater otherwise
rem:
MOVD R9,R8
ANDCC $24,R8,R9 // Any 8 byte chunks? ANDCC $24,R8,R9 // Any 8 byte chunks?
BEQ leftover // and result is 0 BEQ leftover // and result is 0
BR setup8a
different:
#ifdef GOARCH_ppc64le
MOVD $byteswap<>+00(SB), R16
LXVD2X (R16)(R0),SWAP // Set up swap string
VPERM V3,V3,SWAP,V3
VPERM V4,V4,SWAP,V4
#endif
MFVSRD VS35,R16 // move upper doublwords of A and B into GPR for comparison
MFVSRD VS36,R10
CMPU R16,R10
BEQ lower
BGT greater
MOVD $-1,R3 // return value if A < B
RET
lower:
VSLDOI $8,V3,V3,V3 // move lower doublwords of A and B into GPR for comparison
MFVSRD VS35,R16
VSLDOI $8,V4,V4,V4
MFVSRD VS36,R10
CMPU R16,R10
BGT greater
MOVD $-1,R3 // return value if A < B
RET
setup8a: setup8a:
SRADCC $3,R9,R9 // get the 8 byte count SRADCC $3,R8,R9 // get the 8 byte count
BEQ leftover // shifted value is 0 BEQ leftover // shifted value is 0
CMPU R8,$8 // optimize 8byte move
BEQ size8
CMPU R8,$16
BEQ size16
MOVD R9,CTR // loop count for doublewords MOVD R9,CTR // loop count for doublewords
loop8: loop8:
MOVDBR (R5+R0),R9 // doublewords to compare #ifdef GOARCH_ppc64le
MOVDBR (R5+R0),R16 // doublewords to compare
MOVDBR (R6+R0),R10 // LE compare order MOVDBR (R6+R0),R10 // LE compare order
#else
MOVD (R5+R0),R16 // doublewords to compare
MOVD (R6+R0),R10 // BE compare order
#endif
ADD $8,R5 ADD $8,R5
ADD $8,R6 ADD $8,R6
CMPU R9,R10 // match? CMPU R16,R10 // match?
BC 8,2,loop8 // bt ctr <> 0 && cr BC 8,2,loop8 // bt ctr <> 0 && cr
BGT greater BGT greater
BLT less BLT less
leftover: leftover:
ANDCC $7,R8,R9 // check for leftover bytes ANDCC $7,R8,R9 // check for leftover bytes
MOVD R9,CTR // save the ctr BEQ zeroremainder
BNE simple // leftover bytes
BC 12,10,equal // test CR2 for length comparison
BC 12,8,less
BR greater
simplecheck: simplecheck:
CMP R8,$0 // remaining compare length 0 MOVD R0,R14
BNE simple // do simple compare CMP R9,$4 // process 4 bytes
BC 12,10,equal // test CR2 for length comparison BLT halfword
BC 12,8,less // 1st len < 2nd len, result less #ifdef GOARCH_ppc64le
BR greater // 1st len > 2nd len must be greater MOVWBR (R5)(R14),R10
simple: MOVWBR (R6)(R14),R11
MOVBZ 0(R5), R9 // get byte from 1st operand #else
ADD $1,R5 MOVWZ (R5)(R14),R10
MOVBZ 0(R6), R10 // get byte from 2nd operand MOVWZ (R6)(R14),R11
ADD $1,R6 #endif
CMPU R9, R10 CMPU R10,R11
BC 8,2,simple // bc ctr <> 0 && cr BGT greater
BGT greater // 1st > 2nd BLT less
BLT less // 1st < 2nd ADD $-4,R9
BC 12,10,equal // test CR2 for length comparison ADD $4,R14
BC 12,9,greater // 2nd len > 1st len PCALIGN $16
BR less // must be less
cmpne: // only here is not equal halfword:
MOVDBR (R5+R16),R8 // reload in reverse order CMP R9,$2 // process 2 bytes
MOVDBR (R6+R16),R9 BLT byte
CMPU R8,R9 // compare correct endianness #ifdef GOARCH_ppc64le
BGT greater // here only if NE MOVHBR (R5)(R14),R10
less: MOVHBR (R6)(R14),R11
MOVD $-1, R3 // return value if A < B #else
MOVHZ (R5)(R14),R10
MOVHZ (R6)(R14),R11
#endif
CMPU R10,R11
BGT greater
BLT less
ADD $-2,R9
ADD $2,R14
PCALIGN $16
byte:
CMP R9,$0 // process 1 byte
BEQ skip
MOVBZ (R5)(R14),R10
MOVBZ (R6)(R14),R11
CMPU R10,R11
BGT greater
BLT less
PCALIGN $16
skip:
BEQ CR2,equal
BGT CR2,greater
less: MOVD $-1,R3 // return value if A < B
RET RET
size16:
LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
VCMPEQUDCC V3,V4,V1
BGE CR6,different
zeroremainder:
BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
BLT CR2,less // jump to less if len(A)<len(B)
BR greater // jump to greater otherwise
size8:
#ifdef GOARCH_ppc64le
MOVDBR (R5+R0),R16 // doublewords to compare
MOVDBR (R6+R0),R10 // LE compare order
#else
MOVD (R5+R0),R16 // doublewords to compare
MOVD (R6+R0),R10 // BE compare order
#endif
CMPU R16,R10 // match?
BGT greater
BLT less
BGT CR2,greater // 2nd len > 1st len
BLT CR2,less // 2nd len < 1st len
equal: equal:
MOVD $0, R3 // return value if A == B MOVD $0, R3 // return value if A == B
RET RET
greater: greater:
MOVD $1, R3 // return value if A > B MOVD $1,R3 // return value if A > B
RET RET
// Do an efficient memcmp for ppc64 (BE) // Do an efficient memcmp for ppc64le/ppc64/POWER9
// R3 = a len // R3 = a len
// R4 = b len // R4 = b len
// R5 = a addr // R5 = a addr
// R6 = b addr // R6 = b addr
// On exit: // On exit:
// R3 = return value // R3 = return value
TEXT cmpbodyBE<>(SB),NOSPLIT|NOFRAME,$0-0 TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0
MOVD R3,R8 // set up length MOVD R3,R8 // set up length
CMP R3,R4,CR2 // unequal? CMP R3,R4,CR2 // unequal?
BC 12,8,setuplen // BLT CR2 BLT CR2,setuplen // BLT CR2
MOVD R4,R8 // use R4 for comparison len MOVD R4,R8 // use R4 for comparison len
setuplen: setuplen:
MOVD R8,CTR // set up loop counter CMP R8,$16 // optimize for size<16
CMP R8,$8 // only optimize >=8
BLT simplecheck
DCBT (R5) // cache hint
DCBT (R6)
CMP R8,$32 // optimize >= 32
MOVD R8,R9 MOVD R8,R9
BLT setup8a // 8 byte moves only BLT simplecheck
MOVD $16,R10 // set offsets to load into vectors
CMP R8,$32 // optimize for size 16-31
BLT cmp16
CMP R8,$64
BLT cmp32 // optimize for size 32-63
DCBT (R5) // optimize for size>=64
DCBT (R6) // cache hint
setup32a: MOVD $32,R11 // set offsets to load into vector
SRADCC $5,R8,R9 // number of 32 byte chunks MOVD $48,R12 // set offsets to load into vector
MOVD R9,CTR
loop32a: loop64a:// process size 64 and greater
MOVD 0(R5),R9 // doublewords to compare LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector
MOVD 0(R6),R10 // get 4 doublewords LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector
MOVD 8(R5),R14 VCMPNEBCC V3,V4,V1 // record comparison into V1
MOVD 8(R6),R15 BNE CR6,different // jump out if its different
CMPU R9,R10 // bytes equal?
BLT less // found to be less LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector
BGT greater // found to be greater LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector
MOVD 16(R5),R9 // get next pair of doublewords VCMPNEBCC V3,V4,V1
MOVD 16(R6),R10 BNE CR6,different
CMPU R14,R15 // bytes match?
BLT less // found less LXVB16X (R11)(R5),V3 // load bytes of A at offset 32 into vector
BGT greater // found greater LXVB16X (R11)(R6),V4 // load bytes of B at offset 32 into vector
MOVD 24(R5),R14 // get next pair of doublewords VCMPNEBCC V3,V4,V1
MOVD 24(R6),R15 BNE CR6,different
CMPU R9,R10 // bytes match?
BLT less // found to be less LXVB16X (R12)(R5),V3 // load bytes of A at offset 48 into vector
BGT greater // found to be greater LXVB16X (R12)(R6),V4 // load bytes of B at offset 48 into vector
ADD $32,R5 // bump up to next 32 VCMPNEBCC V3,V4,V1
ADD $32,R6 BNE CR6,different
CMPU R14,R15 // bytes match?
BC 8,2,loop32a // br ctr and cr ADD $-64,R9,R9 // reduce remaining size by 64
BLT less // with BE, byte ordering is ADD $64,R5,R5 // increment to next 64 bytes of A
BGT greater // good for compare ADD $64,R6,R6 // increment to next 64 bytes of B
ANDCC $24,R8,R9 // Any 8 byte chunks? CMPU R9,$64
BEQ leftover // and result is 0 BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining
setup8a:
SRADCC $3,R9,R9 // get the 8 byte count CMPU R9,$32
BEQ leftover // shifted value is 0 BGE cmp32 // loop to cmp32 if there are 32-64 bytes remaining
MOVD R9,CTR // loop count for doublewords CMPU R9,$16
loop8: BGE cmp16 // loop to cmp16 if there are 16-31 bytes left
MOVD (R5),R9 CMPU R9,$0
MOVD (R6),R10 BNE simplecheck // loop to simplecheck for remaining bytes
ADD $8,R5
ADD $8,R6 BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
CMPU R9,R10 // match? BLT CR2,less // jump to less if len(A)<len(B)
BC 8,2,loop8 // bt ctr <> 0 && cr BR greater // jump to greater otherwise
cmp32:
LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector
LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector
VCMPNEBCC V3,V4,V1 // record comparison into V1
BNE CR6,different // jump out if its different
LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector
LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector
VCMPNEBCC V3,V4,V1
BNE CR6,different
ADD $-32,R9,R9 // reduce remaining size by 32
ADD $32,R5,R5 // increment to next 32 bytes of A
ADD $32,R6,R6 // increment to next 32 bytes of B
CMPU R9,$16 // loop to cmp16 if there are 16-31 bytes left
BGE cmp16
CMPU R9,$0
BNE simplecheck // loop to simplecheck for remainder bytes
BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
BLT CR2,less // jump to less if len(A)<len(B)
BR greater // jump to greater otherwise
different:
MFVSRD VS35,R16 // move upper doublwords of A and B into GPR for comparison
MFVSRD VS36,R10
CMPU R16,R10
BEQ lower
BGT greater
MOVD $-1,R3 // return value if A < B
RET
lower:
MFVSRLD VS35,R16 // next move lower doublewords of A and B into GPR for comparison
MFVSRLD VS36,R10
CMPU R16,R10
BGT greater
MOVD $-1,R3 // return value if A < B
RET
greater:
MOVD $1,R3 // return value if A > B
RET
cmp16:
ANDCC $16,R9,R31
BEQ tail
LXVB16X (R0)(R5),V3 // load bytes of A at offset 16 into vector
LXVB16X (R0)(R6),V4 // load bytes of B at offset 16 into vector
VCMPEQUDCC V3,V4,V1
BGE CR6,different
ADD $16,R5
ADD $16,R6
tail:
ANDCC $15,R9 // Load the last 16 bytes (we know there are at least 32b)
BEQ end
ADD R9,R5
ADD R9,R6
MOVD $-16,R10
LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector
LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector
VCMPEQUDCC V3,V4,V1
BGE CR6,different
end:
BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
BLT CR2,less // jump to less if BLT CR2 that is, len(A)<len(B)
BR greater // jump to greater otherwise
simplecheck:
MOVD $0,R14 // process 8 bytes
CMP R9,$8
BLT word
#ifdef GOARCH_ppc64le
MOVDBR (R5+R14),R10
MOVDBR (R6+R14),R11
#else
MOVD (R5+R14),R10
MOVD (R6+R14),R11
#endif
CMPU R10,R11
BGT greater BGT greater
BLT less BLT less
leftover: ADD $8,R14
ANDCC $7,R8,R9 // check for leftover bytes ADD $-8,R9
MOVD R9,CTR // save the ctr PCALIGN $16
BNE simple // leftover bytes word:
BC 12,10,equal // test CR2 for length comparison CMP R9,$4 // process 4 bytes
BC 12,8,less BLT halfword
BR greater #ifdef GOARCH_ppc64le
simplecheck: MOVWBR (R5+R14),R10
CMP R8,$0 // remaining compare length 0 MOVWBR (R6+R14),R11
BNE simple // do simple compare #else
BC 12,10,equal // test CR2 for length comparison MOVWZ (R5+R14),R10
BC 12,8,less // 1st len < 2nd len, result less MOVWZ (R6+R14),R11
BR greater // same len, must be equal #endif
simple: CMPU R10,R11
MOVBZ 0(R5),R9 // get byte from 1st operand BGT greater
ADD $1,R5 BLT less
MOVBZ 0(R6),R10 // get byte from 2nd operand ADD $4,R14
ADD $1,R6 ADD $-4,R9
CMPU R9,R10 PCALIGN $16
BC 8,2,simple // bc ctr <> 0 && cr halfword:
BGT greater // 1st > 2nd CMP R9,$2 // process 2 bytes
BLT less // 1st < 2nd BLT byte
BC 12,10,equal // test CR2 for length comparison #ifdef GOARCH_ppc64le
BC 12,9,greater // 2nd len > 1st len MOVHBR (R5+R14),R10
MOVHBR (R6+R14),R11
#else
MOVHZ (R5+R14),R10
MOVHZ (R6+R14),R11
#endif
CMPU R10,R11
BGT greater
BLT less
ADD $2,R14
ADD $-2,R9
PCALIGN $16
byte:
CMP R9,$0 // process 1 byte
BEQ skip
MOVBZ (R5+R14),R10
MOVBZ (R6+R14),R11
CMPU R10,R11
BGT greater
BLT less
PCALIGN $16
skip:
BEQ CR2,equal
BGT CR2,greater
less: less:
MOVD $-1, R3 // return value if A < B MOVD $-1,R3 // return value if A < B
RET RET
equal: equal:
MOVD $0, R3 // return value if A == B MOVD $0, R3 // return value if A == B
RET RET
greater:
MOVD $1, R3 // return value if A > B
RET