cmd/internal/obj/ppc64: modify PCALIGN to ensure alignment

The initial purpose of PCALIGN was to identify code
where it would be beneficial to align code for performance,
but avoid cases where too many NOPs were added. On p10, it
is now necessary to enforce a certain alignment in some
cases, so the behavior of PCALIGN needs to be slightly
different.  Code will now be aligned to the value specified
on the PCALIGN instruction regardless of number of NOPs added,
which is more intuitive and consistent with power assembler
alignment directives.

This also adds 64 as a possible alignment value.

The existing values used in PCALIGN were modified according to
the new behavior.

A testcase was updated and performance testing was done to
verify that this does not adversely affect performance.

Change-Id: Iad1cf5ff112e5bfc0514f0805be90e24095e932b
Reviewed-on: https://go-review.googlesource.com/c/go/+/485056
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Archana Ravindar <aravind5@in.ibm.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Paul Murphy <murp@ibm.com>
Reviewed-by: Bryan Mills <bcmills@google.com>
This commit is contained in:
Lynn Boger 2023-04-17 10:02:48 -05:00
parent de788efeac
commit e23322e2cc
10 changed files with 35 additions and 51 deletions

View File

@ -585,40 +585,17 @@ var buildOpCfg = "" // Save the os/cpu/arch tuple used to configure the assem
// padding bytes to add to align code as requested.
func addpad(pc, a int64, ctxt *obj.Link, cursym *obj.LSym) int {
// For 16 and 32 byte alignment, there is a tradeoff
// between aligning the code and adding too many NOPs.
switch a {
case 8:
if pc&7 != 0 {
return 4
case 8, 16, 32, 64:
// By default function alignment is 16. If an alignment > 16 is
// requested then the function alignment must also be promoted.
// The function alignment is not promoted on AIX at this time.
// TODO: Investigate AIX function alignment.
if ctxt.Headtype != objabi.Haix && cursym.Func().Align < int32(a) {
cursym.Func().Align = int32(a)
}
case 16:
// Align to 16 bytes if possible but add at
// most 2 NOPs.
switch pc & 15 {
case 4, 12:
return 4
case 8:
return 8
}
case 32:
// Align to 32 bytes if possible but add at
// most 3 NOPs.
switch pc & 31 {
case 4, 20:
return 12
case 8, 24:
return 8
case 12, 28:
return 4
}
// When 32 byte alignment is requested on Linux,
// promote the function's alignment to 32. On AIX
// the function alignment is not changed which might
// result in 16 byte alignment but that is still fine.
// TODO: alignment on AIX
if ctxt.Headtype != objabi.Haix && cursym.Func().Align < 32 {
cursym.Func().Align = 32
if pc&(a-1) != 0 {
return int(a - (pc & (a - 1)))
}
default:
ctxt.Diag("Unexpected alignment: %d for PCALIGN directive\n", a)

View File

@ -28,7 +28,7 @@ var platformEnvs = [][]string{
const invalidPCAlignSrc = `
TEXT test(SB),0,$0-0
ADD $2, R3
PCALIGN $64
PCALIGN $128
RET
`

View File

@ -187,8 +187,15 @@ exists in PPC64 assembler and is frequently used by PPC64 assembler writers.
PCALIGN $16
PCALIGN $8
Functions in Go are aligned to 16 bytes, as is the case in all other compilers
for PPC64.
By default, functions in Go are aligned to 16 bytes, as is the case in all
other compilers for PPC64. If there is a PCALIGN directive requesting alignment
greater than 16, then the alignment of the containing function must be
promoted to that same alignment or greater.
The behavior of PCALIGN is changed in Go 1.21 to be more straightforward to
ensure the alignment required for some instructions in power10. The acceptable
values are 8, 16, 32 and 64, and the use of those values will always provide the
specified alignment.
6. Shift instructions

View File

@ -644,7 +644,7 @@ TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0
BEQ Lcbc_dec
PCALIGN $32
PCALIGN $16
Lcbc_enc:
P8_LXVB16X(INP, R0, INOUT)
ADD $16, INP
@ -659,7 +659,7 @@ Lcbc_enc:
CLEAR_KEYS()
RET
PCALIGN $32
PCALIGN $16
Lcbc_dec:
P8_LXVB16X(INP, R0, TMP)
ADD $16, INP

View File

@ -118,7 +118,7 @@ cmp64: // >= 64B
MOVD $32,R11 // set offsets to load into vector
MOVD $48,R12 // set offsets to load into vector
PCALIGN $32
PCALIGN $16
cmp64_loop:
LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector

View File

@ -61,7 +61,7 @@ setup64:
MOVD $48, R16
ANDCC $0x3F, R5, R5 // len%64==0?
PCALIGN $32
PCALIGN $16
loop64:
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1

View File

@ -674,7 +674,7 @@ index2to16:
#else
MOVD $3, R17 // Number of bytes beyond 16
#endif
PCALIGN $32
PCALIGN $16
index2to16loop:
@ -776,7 +776,7 @@ short:
MTVSRD R10, V8 // Set up shift
VSLDOI $8, V8, V8, V8
VSLO V1, V8, V1 // Shift by start byte
PCALIGN $32
PCALIGN $16
index2to16next:
VAND V1, SEPMASK, V2 // Just compare size of sep
VCMPEQUBCC V0, V2, V3 // Compare sep and partial string

View File

@ -45,7 +45,7 @@ TEXT ·addVV(SB), NOSPLIT, $0
// gain significant performance as z_len increases (up to
// 1.45x).
PCALIGN $32
PCALIGN $16
loop:
MOVD 8(R8), R11 // R11 = x[i]
MOVD 16(R8), R12 // R12 = x[i+1]
@ -134,7 +134,7 @@ TEXT ·subVV(SB), NOSPLIT, $0
// gain significant performance as z_len increases (up to
// 1.45x).
PCALIGN $32
PCALIGN $16
loop:
MOVD 8(R8), R11 // R11 = x[i]
MOVD 16(R8), R12 // R12 = x[i+1]
@ -216,7 +216,7 @@ TEXT ·addVW(SB), NOSPLIT, $0
CMP R0, R9
MOVD R9, CTR // Set up the loop counter
BEQ tail // If R9 = 0, we can't use the loop
PCALIGN $32
PCALIGN $16
loop:
MOVD 8(R8), R20 // R20 = x[i]
@ -294,7 +294,7 @@ TEXT ·subVW(SB), NOSPLIT, $0
// we don't need to capture CA every iteration because we've already
// done that above.
PCALIGN $32
PCALIGN $16
loop:
MOVD 8(R8), R20
MOVD 16(R8), R21
@ -365,7 +365,7 @@ TEXT ·shlVU(SB), NOSPLIT, $0
CMP R5, R0 // iterate from i=len(z)-1 to 0
BEQ loopexit // Already at end?
MOVD 0(R15),R10 // x[i]
PCALIGN $32
PCALIGN $16
shloop:
SLD R9, R10, R10 // x[i]<<s
MOVDU -8(R15), R14
@ -528,7 +528,7 @@ TEXT ·mulAddVWW(SB), NOSPLIT, $0
CMP R0, R14
MOVD R14, CTR // Set up the loop counter
BEQ tail // If R9 = 0, we can't use the loop
PCALIGN $32
PCALIGN $16
loop:
MOVD 8(R8), R20 // R20 = x[i]
@ -611,7 +611,7 @@ TEXT ·addMulVVW(SB), NOSPLIT, $0
MOVD R0, R4 // R4 = c = 0
MOVD R22, CTR // Initialize loop counter
BEQ done
PCALIGN $32
PCALIGN $16
loop:
MOVD (R8)(R3), R20 // Load x[i]

View File

@ -149,7 +149,7 @@ zero512setup: // setup for dcbz loop
MOVD $128, R9 // index regs for 128 bytes
MOVD $256, R10
MOVD $384, R11
PCALIGN $32
PCALIGN $16
zero512:
DCBZ (R3+R0) // clear first chunk
DCBZ (R3+R9) // clear second chunk

View File

@ -77,7 +77,7 @@ forward64setup:
MOVD OCTWORDS, CTR // Number of 64 byte chunks
MOVD $32, IDX32
MOVD $48, IDX48
PCALIGN $32
PCALIGN $16
forward64:
LXVD2X (R0)(SRC), VS32 // load 64 bytes
@ -206,7 +206,7 @@ backward32setup:
ANDCC $3,DWORDS // Compute remaining DWORDS and compare to 0
MOVD QWORDS, CTR // set up loop ctr
MOVD $16, IDX16 // 32 bytes at a time
PCALIGN $32
PCALIGN $16
backward32loop:
SUB $32, TGT