cmd/compile/internal/ssa: re-adjust CarryChainTail scheduling priority

This needs to be as low as possible while not breaking priority
assumptions of other scores to correctly schedule carry chains.

Prior to the arm64 changes, it was set below ReadTuple. At the time,
this prevented the MulHiLo implementation on PPC64 from occluding
the scheduling of a full carry chain.

Memory scores can also prevent better scheduling, as can be observed
with crypto/internal/edwards25519/field.feMulGeneric.

Fixes #56497

Change-Id: Ia4b54e6dffcce584faf46b1b8d7cea18a3913887
Reviewed-on: https://go-review.googlesource.com/c/go/+/447435
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@google.com>
Run-TryBot: Paul Murphy <murp@ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Bryan Mills <bcmills@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
Paul E. Murphy 2022-10-28 15:59:43 -05:00 committed by Paul Murphy
parent c065bc70ef
commit d031e9e07a
2 changed files with 34 additions and 1 deletions

View File

@ -16,8 +16,8 @@ const (
ScoreNilCheck
ScoreReadTuple
ScoreVarDef
ScoreMemory
ScoreCarryChainTail
ScoreMemory
ScoreReadFlags
ScoreDefault
ScoreFlags

View File

@ -620,6 +620,39 @@ func Add64MPanicOnOverflowGT(a, b [2]uint64) [2]uint64 {
return r
}
// Verify independent carry chain operations are scheduled efficiently
// and do not cause unnecessary save/restore of the CA bit.
//
// This is an example of why CarryChainTail priority must be lower
// (earlier in the block) than Memory. f[0]=f1 could be scheduled
// after the first two lower 64 bit limb adds, but before either
// high 64 bit limbs are added.
//
// This is what happened on PPC64 when compiling
// crypto/internal/edwards25519/field.feMulGeneric.
func Add64MultipleChains(a, b, c, d [2]uint64) {
var cx, d1, d2 uint64
a1, a2 := a[0], a[1]
b1, b2 := b[0], b[1]
c1, c2 := c[0], c[1]
// ppc64: "ADDC\tR\\d+,", -"ADDE", -"MOVD\tXER"
// ppc64le: "ADDC\tR\\d+,", -"ADDE", -"MOVD\tXER"
d1, cx = bits.Add64(a1, b1, 0)
// ppc64: "ADDE", -"ADDC", -"MOVD\t.*, XER"
// ppc64le: "ADDE", -"ADDC", -"MOVD\t.*, XER"
d2, _ = bits.Add64(a2, b2, cx)
// ppc64: "ADDC\tR\\d+,", -"ADDE", -"MOVD\tXER"
// ppc64le: "ADDC\tR\\d+,", -"ADDE", -"MOVD\tXER"
d1, cx = bits.Add64(c1, d1, 0)
// ppc64: "ADDE", -"ADDC", -"MOVD\t.*, XER"
// ppc64le: "ADDE", -"ADDC", -"MOVD\t.*, XER"
d2, _ = bits.Add64(c2, d2, cx)
d[0] = d1
d[1] = d2
}
// --------------- //
// bits.Sub* //
// --------------- //