math/big: clean up GCD a little

The GCD code was setting one *Int to the value of another
by smashing one struct on top of the other, instead of using Set.
That was safe in this one case, but it's not idiomatic in math/big
nor safe in general, so rewrite the code not to do that.
(In one case, by swapping variables around; in another, by calling Set.)

The added Set call does slow down GCDs by a small amount,
since the answer has to be copied out. To compensate for that,
optimize a bit: remove the s, t temporaries entirely and handle
vector x word multiplication directly. The net result is that almost
all GCDs are faster, except for small ones, which are a few
nanoseconds slower.

goos: darwin
goarch: arm64
pkg: math/big
cpu: Apple M3 Pro
                              │ bench.before │             bench.after             │
                              │    sec/op    │   sec/op     vs base                │
GCD10x10/WithoutXY-12            23.80n ± 1%   31.71n ± 1%  +33.24% (p=0.000 n=10)
GCD10x10/WithXY-12              100.40n ± 0%   92.14n ± 1%   -8.22% (p=0.000 n=10)
GCD10x100/WithoutXY-12           63.70n ± 0%   70.73n ± 0%  +11.05% (p=0.000 n=10)
GCD10x100/WithXY-12              278.6n ± 0%   233.1n ± 1%  -16.35% (p=0.000 n=10)
GCD10x1000/WithoutXY-12          153.4n ± 0%   162.2n ± 1%   +5.74% (p=0.000 n=10)
GCD10x1000/WithXY-12             456.0n ± 0%   411.8n ± 1%   -9.69% (p=0.000 n=10)
GCD10x10000/WithoutXY-12         1.002µ ± 1%   1.036µ ± 0%   +3.39% (p=0.000 n=10)
GCD10x10000/WithXY-12            2.330µ ± 1%   2.210µ ± 0%   -5.13% (p=0.000 n=10)
GCD10x100000/WithoutXY-12        8.894µ ± 0%   8.889µ ± 1%        ~ (p=0.754 n=10)
GCD10x100000/WithXY-12           20.84µ ± 0%   20.24µ ± 0%   -2.84% (p=0.000 n=10)
GCD100x100/WithoutXY-12          373.3n ± 3%   314.4n ± 0%  -15.76% (p=0.000 n=10)
GCD100x100/WithXY-12             662.5n ± 0%   572.4n ± 1%  -13.59% (p=0.000 n=10)
GCD100x1000/WithoutXY-12         641.8n ± 0%   598.1n ± 1%   -6.81% (p=0.000 n=10)
GCD100x1000/WithXY-12            1.123µ ± 0%   1.019µ ± 1%   -9.26% (p=0.000 n=10)
GCD100x10000/WithoutXY-12        2.870µ ± 0%   2.831µ ± 0%   -1.38% (p=0.000 n=10)
GCD100x10000/WithXY-12           4.930µ ± 1%   4.675µ ± 0%   -5.16% (p=0.000 n=10)
GCD100x100000/WithoutXY-12       24.08µ ± 0%   23.97µ ± 0%   -0.48% (p=0.007 n=10)
GCD100x100000/WithXY-12          43.66µ ± 0%   42.52µ ± 0%   -2.61% (p=0.001 n=10)
GCD1000x1000/WithoutXY-12        3.999µ ± 0%   3.569µ ± 1%  -10.75% (p=0.000 n=10)
GCD1000x1000/WithXY-12           6.397µ ± 0%   5.534µ ± 0%  -13.49% (p=0.000 n=10)
GCD1000x10000/WithoutXY-12       6.875µ ± 0%   6.450µ ± 0%   -6.18% (p=0.000 n=10)
GCD1000x10000/WithXY-12          20.75µ ± 1%   19.17µ ± 1%   -7.64% (p=0.000 n=10)
GCD1000x100000/WithoutXY-12      36.38µ ± 0%   35.60µ ± 1%   -2.13% (p=0.000 n=10)
GCD1000x100000/WithXY-12         172.1µ ± 0%   174.4µ ± 3%        ~ (p=0.052 n=10)
GCD10000x10000/WithoutXY-12      79.89µ ± 1%   75.16µ ± 2%   -5.92% (p=0.000 n=10)
GCD10000x10000/WithXY-12         160.1µ ± 0%   150.0µ ± 0%   -6.33% (p=0.000 n=10)
GCD10000x100000/WithoutXY-12     213.2µ ± 1%   209.0µ ± 1%   -1.98% (p=0.000 n=10)
GCD10000x100000/WithXY-12        1.399m ± 0%   1.342m ± 3%   -4.08% (p=0.002 n=10)
GCD100000x100000/WithoutXY-12    5.463m ± 1%   5.504m ± 2%        ~ (p=0.190 n=10)
GCD100000x100000/WithXY-12       11.36m ± 0%   11.46m ± 1%   +0.86% (p=0.000 n=10)
geomean                          6.953µ        6.695µ        -3.71%

goos: linux
goarch: amd64
pkg: math/big
cpu: AMD Ryzen 9 7950X 16-Core Processor
                              │ bench.before │             bench.after             │
                              │    sec/op    │   sec/op     vs base                │
GCD10x10/WithoutXY-32           39.66n ±  4%   44.34n ± 4%  +11.77% (p=0.000 n=10)
GCD10x10/WithXY-32              156.7n ± 12%   130.8n ± 2%  -16.53% (p=0.000 n=10)
GCD10x100/WithoutXY-32          115.8n ±  5%   120.2n ± 2%   +3.89% (p=0.000 n=10)
GCD10x100/WithXY-32             465.3n ±  3%   368.1n ± 2%  -20.91% (p=0.000 n=10)
GCD10x1000/WithoutXY-32         201.1n ±  1%   210.8n ± 2%   +4.82% (p=0.000 n=10)
GCD10x1000/WithXY-32            652.9n ±  4%   605.0n ± 1%   -7.32% (p=0.002 n=10)
GCD10x10000/WithoutXY-32        1.046µ ±  2%   1.143µ ± 1%   +9.33% (p=0.000 n=10)
GCD10x10000/WithXY-32           3.360µ ±  1%   3.258µ ± 1%   -3.04% (p=0.000 n=10)
GCD10x100000/WithoutXY-32       9.391µ ±  3%   9.997µ ± 1%   +6.46% (p=0.000 n=10)
GCD10x100000/WithXY-32          27.92µ ±  1%   28.21µ ± 0%   +1.04% (p=0.043 n=10)
GCD100x100/WithoutXY-32         443.7n ±  5%   320.0n ± 2%  -27.88% (p=0.000 n=10)
GCD100x100/WithXY-32            789.9n ±  2%   690.4n ± 1%  -12.60% (p=0.000 n=10)
GCD100x1000/WithoutXY-32        718.4n ±  3%   600.0n ± 1%  -16.48% (p=0.000 n=10)
GCD100x1000/WithXY-32           1.388µ ±  4%   1.175µ ± 1%  -15.28% (p=0.000 n=10)
GCD100x10000/WithoutXY-32       2.750µ ±  1%   2.668µ ± 1%   -2.96% (p=0.000 n=10)
GCD100x10000/WithXY-32          6.016µ ±  1%   5.590µ ± 1%   -7.09% (p=0.000 n=10)
GCD100x100000/WithoutXY-32      21.40µ ±  1%   22.30µ ± 1%   +4.21% (p=0.000 n=10)
GCD100x100000/WithXY-32         47.02µ ±  4%   48.80µ ± 0%   +3.78% (p=0.015 n=10)
GCD1000x1000/WithoutXY-32       3.417µ ±  4%   3.020µ ± 1%  -11.65% (p=0.000 n=10)
GCD1000x1000/WithXY-32          5.752µ ±  0%   5.418µ ± 2%   -5.81% (p=0.000 n=10)
GCD1000x10000/WithoutXY-32      6.150µ ±  0%   6.246µ ± 1%   +1.55% (p=0.000 n=10)
GCD1000x10000/WithXY-32         24.68µ ±  3%   25.07µ ± 1%        ~ (p=0.051 n=10)
GCD1000x100000/WithoutXY-32     34.60µ ±  2%   36.85µ ± 1%   +6.51% (p=0.000 n=10)
GCD1000x100000/WithXY-32        209.5µ ±  4%   227.4µ ± 0%   +8.56% (p=0.000 n=10)
GCD10000x10000/WithoutXY-32     90.69µ ±  0%   88.48µ ± 0%   -2.44% (p=0.000 n=10)
GCD10000x10000/WithXY-32        197.1µ ±  0%   200.5µ ± 0%   +1.73% (p=0.000 n=10)
GCD10000x100000/WithoutXY-32    239.1µ ±  0%   242.5µ ± 0%   +1.42% (p=0.000 n=10)
GCD10000x100000/WithXY-32       1.963m ±  3%   2.028m ± 0%   +3.28% (p=0.000 n=10)
GCD100000x100000/WithoutXY-32   7.466m ±  0%   7.412m ± 0%   -0.71% (p=0.000 n=10)
GCD100000x100000/WithXY-32      16.10m ±  2%   16.47m ± 0%   +2.25% (p=0.000 n=10)
geomean                         8.388µ         8.127µ        -3.12%

Change-Id: I161dc409bad11bcc553bc8116449905ae5b06742
Reviewed-on: https://go-review.googlesource.com/c/go/+/650636
Reviewed-by: Robert Griesemer <gri@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Alan Donovan <adonovan@google.com>
Auto-Submit: Russ Cox <rsc@golang.org>
This commit is contained in:
Russ Cox 2025-01-16 11:23:46 -05:00 committed by Gopher Robot
parent 37e9c5eaba
commit b66d83ccea
1 changed files with 31 additions and 45 deletions

View File

@ -707,42 +707,36 @@ func lehmerSimulate(A, B *Int) (u0, u1, v0, v1 Word, even bool) {
// For even == true: u0, v1 >= 0 && u1, v0 <= 0
// For even == false: u0, v1 <= 0 && u1, v0 >= 0
// q, r, s, t are temporary variables to avoid allocations in the multiplication.
func lehmerUpdate(A, B, q, r, s, t *Int, u0, u1, v0, v1 Word, even bool) {
func lehmerUpdate(A, B, q, r *Int, u0, u1, v0, v1 Word, even bool) {
mulW(q, B, even, v0)
mulW(r, A, even, u1)
mulW(A, A, !even, u0)
mulW(B, B, !even, v1)
A.Add(A, q)
B.Add(B, r)
}
t.abs = t.abs.setWord(u0)
s.abs = s.abs.setWord(v0)
t.neg = !even
s.neg = even
t.Mul(A, t)
s.Mul(B, s)
r.abs = r.abs.setWord(u1)
q.abs = q.abs.setWord(v1)
r.neg = even
q.neg = !even
r.Mul(A, r)
q.Mul(B, q)
A.Add(t, s)
B.Add(r, q)
// mulW sets z = x * (-?)w
// where the minus sign is present when neg is true.
func mulW(z, x *Int, neg bool, w Word) {
z.abs = z.abs.mulAddWW(x.abs, w, 0)
z.neg = x.neg != neg
}
// euclidUpdate performs a single step of the Euclidean GCD algorithm
// if extended is true, it also updates the cosequence Ua, Ub.
func euclidUpdate(A, B, Ua, Ub, q, r, s, t *Int, extended bool) {
q, r = q.QuoRem(A, B, r)
*A, *B, *r = *B, *r, *A
// q and r are used as temporaries; the initial values are ignored.
func euclidUpdate(A, B, Ua, Ub, q, r *Int, extended bool) (nA, nB, nr, nUa, nUb *Int) {
q.QuoRem(A, B, r)
if extended {
// Ua, Ub = Ub, Ua - q*Ub
t.Set(Ub)
s.Mul(Ub, q)
Ub.Sub(Ua, s)
Ua.Set(t)
// Ua, Ub = Ub, Ua-q*Ub
q.Mul(q, Ub)
Ua, Ub = Ub, Ua
Ub.Sub(Ub, q)
}
return B, r, A, Ua, Ub
}
// lehmerGCD sets z to the greatest common divisor of a and b,
@ -772,8 +766,6 @@ func (z *Int) lehmerGCD(x, y, a, b *Int) *Int {
// temp variables for multiprecision update
q := new(Int)
r := new(Int)
s := new(Int)
t := new(Int)
// ensure A >= B
if A.abs.cmp(B.abs) < 0 {
@ -791,18 +783,18 @@ func (z *Int) lehmerGCD(x, y, a, b *Int) *Int {
// Simulate the effect of the single-precision steps using the cosequences.
// A = u0*A + v0*B
// B = u1*A + v1*B
lehmerUpdate(A, B, q, r, s, t, u0, u1, v0, v1, even)
lehmerUpdate(A, B, q, r, u0, u1, v0, v1, even)
if extended {
// Ua = u0*Ua + v0*Ub
// Ub = u1*Ua + v1*Ub
lehmerUpdate(Ua, Ub, q, r, s, t, u0, u1, v0, v1, even)
lehmerUpdate(Ua, Ub, q, r, u0, u1, v0, v1, even)
}
} else {
// Single-digit calculations failed to simulate any quotients.
// Do a standard Euclidean step.
euclidUpdate(A, B, Ua, Ub, q, r, s, t, extended)
A, B, r, Ua, Ub = euclidUpdate(A, B, Ua, Ub, q, r, extended)
}
}
@ -810,7 +802,7 @@ func (z *Int) lehmerGCD(x, y, a, b *Int) *Int {
// extended Euclidean algorithm base case if B is a single Word
if len(A.abs) > 1 {
// A is longer than a single Word, so one update is needed.
euclidUpdate(A, B, Ua, Ub, q, r, s, t, extended)
A, B, r, Ua, Ub = euclidUpdate(A, B, Ua, Ub, q, r, extended)
}
if len(B.abs) > 0 {
// A and B are both a single Word.
@ -828,15 +820,9 @@ func (z *Int) lehmerGCD(x, y, a, b *Int) *Int {
even = !even
}
t.abs = t.abs.setWord(ua)
s.abs = s.abs.setWord(va)
t.neg = !even
s.neg = even
t.Mul(Ua, t)
s.Mul(Ub, s)
Ua.Add(t, s)
mulW(Ua, Ua, !even, ua)
mulW(Ub, Ub, even, va)
Ua.Add(Ua, Ub)
} else {
for bWord != 0 {
aWord, bWord = bWord, aWord%bWord
@ -863,13 +849,13 @@ func (z *Int) lehmerGCD(x, y, a, b *Int) *Int {
}
if x != nil {
*x = *Ua
x.Set(Ua)
if negA {
x.neg = !x.neg
}
}
*z = *A
z.Set(A)
return z
}