math/big: simplify, speed up Karatsuba multiplication

The old Karatsuba implementation only operated on lengths that are
a power of two times a number smaller than karatsubaThreshold.
For example, when karatsubaThreshold = 40, multiplying a pair
of 99-word numbers runs karatsuba on the low 96 (= 39<<2) words
and then has to fix up the answer to include the high 3 words of each.

I suspect this requirement was needed to make the analysis of
how many temporary words to reserve easier, back when the
answer was 3*n and depended on exactly halving the size at
each Karatsuba step.

Now that we have the more flexible temporary allocation stack,
we can change Karatsuba to accept operands of odd length.
Doing so avoids most of the fixup that the old approach required.
For example, multiplying a pair of 99-word numbers runs
karatsuba on all 99 words now.

This is simpler and about the same speed or, for large cases, faster.

goos: linux
goarch: amd64
pkg: math/big
cpu: Intel(R) Xeon(R) CPU @ 3.10GHz
                            │     old     │                 new                 │
                            │   sec/op    │   sec/op     vs base                │
GCD10x10/WithoutXY-16         99.62n ± 3%   99.10n ± 3%        ~ (p=0.009 n=15)
GCD10x10/WithXY-16            243.4n ± 1%   245.2n ± 1%        ~ (p=0.009 n=15)
GCD100x100/WithoutXY-16       921.9n ± 1%   919.2n ± 1%        ~ (p=0.076 n=15)
GCD100x100/WithXY-16          1.527µ ± 1%   1.526µ ± 0%        ~ (p=0.813 n=15)
GCD1000x1000/WithoutXY-16     9.704µ ± 1%   9.696µ ± 0%        ~ (p=0.532 n=15)
GCD1000x1000/WithXY-16        14.03µ ± 1%   13.96µ ± 0%        ~ (p=0.014 n=15)
GCD10000x10000/WithoutXY-16   206.5µ ± 2%   206.5µ ± 0%        ~ (p=0.967 n=15)
GCD10000x10000/WithXY-16      398.0µ ± 1%   397.4µ ± 0%        ~ (p=0.683 n=15)
Div/20/10-16                  22.22n ± 0%   22.23n ± 0%        ~ (p=0.105 n=15)
Div/40/20-16                  22.23n ± 0%   22.23n ± 0%        ~ (p=0.307 n=15)
Div/100/50-16                 55.47n ± 0%   55.47n ± 0%        ~ (p=0.573 n=15)
Div/200/100-16                174.9n ± 1%   174.6n ± 1%        ~ (p=0.814 n=15)
Div/400/200-16                209.5n ± 1%   210.5n ± 1%        ~ (p=0.454 n=15)
Div/1000/500-16               379.9n ± 0%   383.5n ± 2%        ~ (p=0.123 n=15)
Div/2000/1000-16              780.1n ± 0%   784.6n ± 1%   +0.58% (p=0.000 n=15)
Div/20000/10000-16            25.22µ ± 1%   25.15µ ± 0%        ~ (p=0.213 n=15)
Div/200000/100000-16          921.8µ ± 1%   926.1µ ± 0%        ~ (p=0.009 n=15)
Div/2000000/1000000-16        37.91m ± 0%   35.63m ± 0%   -6.02% (p=0.000 n=15)
Div/20000000/10000000-16       1.378 ± 0%    1.336 ± 0%   -3.03% (p=0.000 n=15)
NatMul/10-16                  166.8n ± 4%   168.9n ± 3%        ~ (p=0.008 n=15)
NatMul/100-16                 5.519µ ± 2%   5.548µ ± 4%        ~ (p=0.032 n=15)
NatMul/1000-16                230.4µ ± 1%   220.2µ ± 1%   -4.43% (p=0.000 n=15)
NatMul/10000-16               8.569m ± 1%   8.640m ± 1%        ~ (p=0.005 n=15)
NatMul/100000-16              376.5m ± 1%   334.1m ± 0%  -11.26% (p=0.000 n=15)
NatSqr/1-16                   27.85n ± 5%   28.60n ± 2%        ~ (p=0.123 n=15)
NatSqr/2-16                   47.99n ± 2%   48.84n ± 1%        ~ (p=0.008 n=15)
NatSqr/3-16                   59.41n ± 2%   60.87n ± 2%   +2.46% (p=0.001 n=15)
NatSqr/5-16                   87.27n ± 2%   89.31n ± 3%        ~ (p=0.087 n=15)
NatSqr/8-16                   124.6n ± 3%   128.9n ± 3%        ~ (p=0.006 n=15)
NatSqr/10-16                  166.3n ± 3%   172.7n ± 3%        ~ (p=0.002 n=15)
NatSqr/20-16                  385.2n ± 2%   394.7n ± 3%        ~ (p=0.036 n=15)
NatSqr/30-16                  622.7n ± 3%   642.9n ± 3%        ~ (p=0.032 n=15)
NatSqr/50-16                  1.274µ ± 3%   1.323µ ± 4%        ~ (p=0.003 n=15)
NatSqr/80-16                  2.606µ ± 4%   2.714µ ± 4%        ~ (p=0.044 n=15)
NatSqr/100-16                 3.731µ ± 4%   3.871µ ± 4%        ~ (p=0.038 n=15)
NatSqr/200-16                 12.99µ ± 2%   13.09µ ± 3%        ~ (p=0.838 n=15)
NatSqr/300-16                 22.87µ ± 2%   23.25µ ± 2%        ~ (p=0.285 n=15)
NatSqr/500-16                 58.43µ ± 1%   58.25µ ± 2%        ~ (p=0.345 n=15)
NatSqr/800-16                 115.3µ ± 3%   116.2µ ± 3%        ~ (p=0.126 n=15)
NatSqr/1000-16                173.9µ ± 1%   174.3µ ± 1%        ~ (p=0.935 n=15)
NatSqr/10000-16               6.133m ± 2%   6.034m ± 1%   -1.62% (p=0.000 n=15)
NatSqr/100000-16              253.8m ± 1%   241.5m ± 0%   -4.87% (p=0.000 n=15)
geomean                       7.745µ        7.760µ        +0.19%

goos: linux
goarch: amd64
pkg: math/big
cpu: Intel(R) Xeon(R) Platinum 8481C CPU @ 2.70GHz
                            │     old     │                 new                  │
                            │   sec/op    │    sec/op     vs base                │
GCD10x10/WithoutXY-88         62.17n ± 4%   61.44n ±  0%   -1.17% (p=0.000 n=15)
GCD10x10/WithXY-88            173.4n ± 2%   172.4n ±  4%        ~ (p=0.615 n=15)
GCD100x100/WithoutXY-88       584.0n ± 1%   582.9n ±  0%        ~ (p=0.009 n=15)
GCD100x100/WithXY-88          1.098µ ± 1%   1.091µ ±  2%        ~ (p=0.002 n=15)
GCD1000x1000/WithoutXY-88     6.055µ ± 0%   6.049µ ±  0%        ~ (p=0.007 n=15)
GCD1000x1000/WithXY-88        9.430µ ± 0%   9.417µ ±  1%        ~ (p=0.123 n=15)
GCD10000x10000/WithoutXY-88   153.4µ ± 2%   149.0µ ±  2%   -2.85% (p=0.000 n=15)
GCD10000x10000/WithXY-88      350.6µ ± 3%   349.0µ ±  2%        ~ (p=0.126 n=15)
Div/20/10-88                  13.12n ± 0%   13.12n ±  1%    0.00% (p=0.042 n=15)
Div/40/20-88                  13.12n ± 0%   13.13n ±  0%        ~ (p=0.004 n=15)
Div/100/50-88                 25.49n ± 0%   25.49n ±  0%        ~ (p=0.452 n=15)
Div/200/100-88                115.7n ± 2%   113.8n ±  2%        ~ (p=0.212 n=15)
Div/400/200-88                135.0n ± 1%   136.1n ±  1%        ~ (p=0.005 n=15)
Div/1000/500-88               257.5n ± 1%   259.9n ±  1%        ~ (p=0.004 n=15)
Div/2000/1000-88              567.5n ± 1%   572.4n ±  2%        ~ (p=0.616 n=15)
Div/20000/10000-88            25.65µ ± 0%   25.77µ ±  1%        ~ (p=0.032 n=15)
Div/200000/100000-88          777.4µ ± 1%   754.3µ ±  1%   -2.97% (p=0.000 n=15)
Div/2000000/1000000-88        33.66m ± 0%   31.37m ±  0%   -6.81% (p=0.000 n=15)
Div/20000000/10000000-88       1.320 ± 0%    1.266 ±  0%   -4.04% (p=0.000 n=15)
NatMul/10-88                  151.9n ± 7%   143.3n ±  7%        ~ (p=0.878 n=15)
NatMul/100-88                 4.418µ ± 2%   4.337µ ±  3%        ~ (p=0.512 n=15)
NatMul/1000-88                206.8µ ± 1%   189.8µ ±  1%   -8.25% (p=0.000 n=15)
NatMul/10000-88               8.531m ± 1%   8.095m ±  0%   -5.12% (p=0.000 n=15)
NatMul/100000-88              298.9m ± 0%   260.5m ±  1%  -12.85% (p=0.000 n=15)
NatSqr/1-88                   27.55n ± 6%   28.25n ±  7%        ~ (p=0.024 n=15)
NatSqr/2-88                   44.71n ± 6%   46.21n ±  9%        ~ (p=0.024 n=15)
NatSqr/3-88                   55.44n ± 4%   58.41n ± 10%        ~ (p=0.126 n=15)
NatSqr/5-88                   80.71n ± 5%   81.41n ±  5%        ~ (p=0.032 n=15)
NatSqr/8-88                   115.7n ± 4%   115.4n ±  5%        ~ (p=0.814 n=15)
NatSqr/10-88                  147.4n ± 4%   147.3n ±  4%        ~ (p=0.505 n=15)
NatSqr/20-88                  337.8n ± 3%   337.3n ±  4%        ~ (p=0.814 n=15)
NatSqr/30-88                  556.9n ± 3%   557.6n ±  4%        ~ (p=0.814 n=15)
NatSqr/50-88                  1.208µ ± 4%   1.208µ ±  3%        ~ (p=0.910 n=15)
NatSqr/80-88                  2.591µ ± 3%   2.581µ ±  3%        ~ (p=0.705 n=15)
NatSqr/100-88                 3.870µ ± 3%   3.858µ ±  3%        ~ (p=0.846 n=15)
NatSqr/200-88                 14.43µ ± 3%   14.28µ ±  2%        ~ (p=0.383 n=15)
NatSqr/300-88                 24.68µ ± 2%   24.49µ ±  2%        ~ (p=0.624 n=15)
NatSqr/500-88                 66.27µ ± 1%   66.18µ ±  1%        ~ (p=0.735 n=15)
NatSqr/800-88                 128.7µ ± 1%   127.4µ ±  1%        ~ (p=0.050 n=15)
NatSqr/1000-88                198.7µ ± 1%   197.7µ ±  1%        ~ (p=0.229 n=15)
NatSqr/10000-88               6.582m ± 1%   6.426m ±  1%   -2.37% (p=0.000 n=15)
NatSqr/100000-88              274.3m ± 0%   267.3m ±  0%   -2.57% (p=0.000 n=15)
geomean                       6.518µ        6.438µ         -1.22%

goos: linux
goarch: arm64
pkg: math/big
                            │     old     │                 new                 │
                            │   sec/op    │   sec/op     vs base                │
GCD10x10/WithoutXY-16         61.70n ± 1%   61.32n ± 1%        ~ (p=0.361 n=15)
GCD10x10/WithXY-16            217.3n ± 1%   217.0n ± 1%        ~ (p=0.395 n=15)
GCD100x100/WithoutXY-16       569.7n ± 0%   572.6n ± 2%        ~ (p=0.213 n=15)
GCD100x100/WithXY-16          1.241µ ± 1%   1.236µ ± 1%        ~ (p=0.157 n=15)
GCD1000x1000/WithoutXY-16     5.558µ ± 0%   5.566µ ± 0%        ~ (p=0.228 n=15)
GCD1000x1000/WithXY-16        9.319µ ± 0%   9.326µ ± 0%        ~ (p=0.233 n=15)
GCD10000x10000/WithoutXY-16   126.4µ ± 2%   128.7µ ± 3%        ~ (p=0.081 n=15)
GCD10000x10000/WithXY-16      279.3µ ± 0%   278.3µ ± 5%        ~ (p=0.187 n=15)
Div/20/10-16                  15.12n ± 1%   15.21n ± 1%        ~ (p=0.490 n=15)
Div/40/20-16                  15.11n ± 0%   15.23n ± 1%        ~ (p=0.107 n=15)
Div/100/50-16                 26.53n ± 0%   26.50n ± 0%        ~ (p=0.299 n=15)
Div/200/100-16                123.7n ± 0%   124.0n ± 0%        ~ (p=0.086 n=15)
Div/400/200-16                142.5n ± 0%   142.4n ± 0%        ~ (p=0.039 n=15)
Div/1000/500-16               259.9n ± 1%   261.2n ± 1%        ~ (p=0.044 n=15)
Div/2000/1000-16              539.4n ± 1%   532.3n ± 1%   -1.32% (p=0.001 n=15)
Div/20000/10000-16            22.43µ ± 0%   22.32µ ± 0%   -0.49% (p=0.000 n=15)
Div/200000/100000-16          898.3µ ± 0%   889.6µ ± 0%   -0.96% (p=0.000 n=15)
Div/2000000/1000000-16        38.37m ± 0%   35.11m ± 0%   -8.49% (p=0.000 n=15)
Div/20000000/10000000-16       1.449 ± 0%    1.384 ± 0%   -4.48% (p=0.000 n=15)
NatMul/10-16                  182.0n ± 1%   177.8n ± 1%   -2.31% (p=0.000 n=15)
NatMul/100-16                 5.537µ ± 0%   5.693µ ± 0%   +2.82% (p=0.000 n=15)
NatMul/1000-16                229.9µ ± 0%   224.8µ ± 0%   -2.24% (p=0.000 n=15)
NatMul/10000-16               8.985m ± 0%   8.751m ± 0%   -2.61% (p=0.000 n=15)
NatMul/100000-16              371.1m ± 0%   331.5m ± 0%  -10.66% (p=0.000 n=15)
NatSqr/1-16                   46.77n ± 6%   42.76n ± 1%   -8.57% (p=0.000 n=15)
NatSqr/2-16                   66.99n ± 4%   63.62n ± 1%   -5.03% (p=0.000 n=15)
NatSqr/3-16                   76.79n ± 4%   73.42n ± 1%        ~ (p=0.007 n=15)
NatSqr/5-16                   99.00n ± 3%   95.35n ± 1%   -3.69% (p=0.000 n=15)
NatSqr/8-16                   160.0n ± 3%   155.1n ± 1%   -3.06% (p=0.001 n=15)
NatSqr/10-16                  178.4n ± 2%   175.9n ± 0%   -1.40% (p=0.001 n=15)
NatSqr/20-16                  361.9n ± 2%   361.3n ± 0%        ~ (p=0.083 n=15)
NatSqr/30-16                  584.7n ± 0%   586.8n ± 0%   +0.36% (p=0.000 n=15)
NatSqr/50-16                  1.327µ ± 0%   1.329µ ± 0%        ~ (p=0.349 n=15)
NatSqr/80-16                  2.893µ ± 1%   2.925µ ± 0%   +1.11% (p=0.000 n=15)
NatSqr/100-16                 4.330µ ± 1%   4.381µ ± 0%   +1.18% (p=0.000 n=15)
NatSqr/200-16                 16.25µ ± 1%   16.43µ ± 0%   +1.07% (p=0.000 n=15)
NatSqr/300-16                 27.85µ ± 1%   28.06µ ± 0%   +0.77% (p=0.000 n=15)
NatSqr/500-16                 76.01µ ± 0%   76.34µ ± 0%        ~ (p=0.002 n=15)
NatSqr/800-16                 146.8µ ± 0%   148.1µ ± 0%   +0.83% (p=0.000 n=15)
NatSqr/1000-16                228.2µ ± 0%   228.6µ ± 0%        ~ (p=0.123 n=15)
NatSqr/10000-16               7.524m ± 0%   7.426m ± 0%   -1.31% (p=0.000 n=15)
NatSqr/100000-16              316.7m ± 0%   309.2m ± 0%   -2.36% (p=0.000 n=15)
geomean                       7.264µ        7.172µ        -1.27%

goos: darwin
goarch: arm64
pkg: math/big
cpu: Apple M3 Pro
                            │     old     │                new                 │
                            │   sec/op    │   sec/op     vs base               │
GCD10x10/WithoutXY-12         32.61n ± 1%   32.42n ± 1%       ~ (p=0.021 n=15)
GCD10x10/WithXY-12            87.70n ± 1%   88.42n ± 1%       ~ (p=0.010 n=15)
GCD100x100/WithoutXY-12       305.9n ± 0%   306.4n ± 0%       ~ (p=0.003 n=15)
GCD100x100/WithXY-12          560.3n ± 2%   556.6n ± 1%       ~ (p=0.018 n=15)
GCD1000x1000/WithoutXY-12     3.509µ ± 2%   3.464µ ± 1%       ~ (p=0.145 n=15)
GCD1000x1000/WithXY-12        5.347µ ± 2%   5.372µ ± 1%       ~ (p=0.046 n=15)
GCD10000x10000/WithoutXY-12   73.75µ ± 1%   73.99µ ± 1%       ~ (p=0.004 n=15)
GCD10000x10000/WithXY-12      148.4µ ± 0%   147.8µ ± 1%       ~ (p=0.076 n=15)
Div/20/10-12                  9.481n ± 0%   9.462n ± 1%       ~ (p=0.631 n=15)
Div/40/20-12                  9.457n ± 0%   9.462n ± 1%       ~ (p=0.798 n=15)
Div/100/50-12                 14.91n ± 0%   14.79n ± 1%  -0.80% (p=0.000 n=15)
Div/200/100-12                84.56n ± 1%   84.60n ± 1%       ~ (p=0.271 n=15)
Div/400/200-12                103.8n ± 0%   102.8n ± 0%  -0.96% (p=0.000 n=15)
Div/1000/500-12               181.3n ± 1%   184.2n ± 2%       ~ (p=0.091 n=15)
Div/2000/1000-12              397.5n ± 0%   397.4n ± 0%       ~ (p=0.299 n=15)
Div/20000/10000-12            14.04µ ± 1%   13.99µ ± 0%       ~ (p=0.221 n=15)
Div/200000/100000-12          523.1µ ± 0%   514.0µ ± 3%       ~ (p=0.775 n=15)
Div/2000000/1000000-12        21.58m ± 0%   20.01m ± 1%  -7.29% (p=0.000 n=15)
Div/20000000/10000000-12      813.5m ± 0%   796.2m ± 1%  -2.13% (p=0.000 n=15)
NatMul/10-12                  80.46n ± 1%   80.02n ± 1%       ~ (p=0.063 n=15)
NatMul/100-12                 2.904µ ± 0%   2.979µ ± 1%  +2.58% (p=0.000 n=15)
NatMul/1000-12                127.8µ ± 0%   122.3µ ± 0%  -4.28% (p=0.000 n=15)
NatMul/10000-12               5.141m ± 0%   4.975m ± 1%  -3.23% (p=0.000 n=15)
NatMul/100000-12              208.8m ± 0%   189.6m ± 3%  -9.21% (p=0.000 n=15)
NatSqr/1-12                   11.90n ± 1%   11.76n ± 1%       ~ (p=0.059 n=15)
NatSqr/2-12                   21.33n ± 1%   21.12n ± 0%       ~ (p=0.063 n=15)
NatSqr/3-12                   26.05n ± 1%   25.79n ± 0%       ~ (p=0.002 n=15)
NatSqr/5-12                   37.31n ± 0%   36.98n ± 1%       ~ (p=0.008 n=15)
NatSqr/8-12                   63.07n ± 0%   62.75n ± 1%       ~ (p=0.061 n=15)
NatSqr/10-12                  79.48n ± 0%   79.59n ± 0%       ~ (p=0.455 n=15)
NatSqr/20-12                  173.1n ± 0%   173.2n ± 1%       ~ (p=0.518 n=15)
NatSqr/30-12                  288.6n ± 1%   289.2n ± 0%       ~ (p=0.030 n=15)
NatSqr/50-12                  653.3n ± 0%   653.3n ± 0%       ~ (p=0.361 n=15)
NatSqr/80-12                  1.492µ ± 0%   1.496µ ± 0%       ~ (p=0.018 n=15)
NatSqr/100-12                 2.270µ ± 1%   2.270µ ± 0%       ~ (p=0.326 n=15)
NatSqr/200-12                 8.776µ ± 1%   8.784µ ± 1%       ~ (p=0.083 n=15)
NatSqr/300-12                 15.07µ ± 0%   15.09µ ± 0%       ~ (p=0.455 n=15)
NatSqr/500-12                 41.71µ ± 0%   41.77µ ± 1%       ~ (p=0.305 n=15)
NatSqr/800-12                 80.77µ ± 1%   80.59µ ± 0%       ~ (p=0.113 n=15)
NatSqr/1000-12                126.4µ ± 1%   126.5µ ± 0%       ~ (p=0.683 n=15)
NatSqr/10000-12               4.204m ± 0%   4.119m ± 0%  -2.02% (p=0.000 n=15)
NatSqr/100000-12              177.0m ± 0%   172.9m ± 0%  -2.31% (p=0.000 n=15)
geomean                       3.790µ        3.757µ       -0.87%

Change-Id: Ifc7a9b61f678df216690511ac8bb9143189a795e
Reviewed-on: https://go-review.googlesource.com/c/go/+/652057
Auto-Submit: Russ Cox <rsc@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Robert Griesemer <gri@google.com>
This commit is contained in:
Russ Cox 2025-01-17 17:26:59 -05:00 committed by Gopher Robot
parent 26040b1dd7
commit d037ed62bc
5 changed files with 364 additions and 256 deletions

View File

@ -181,10 +181,14 @@ func (z *Int) Sub(x, y *Int) *Int {
// Mul sets z to the product x*y and returns z. // Mul sets z to the product x*y and returns z.
func (z *Int) Mul(x, y *Int) *Int { func (z *Int) Mul(x, y *Int) *Int {
return z.mul(nil, x, y) z.mul(nil, x, y)
return z
} }
func (z *Int) mul(stk *stack, x, y *Int) *Int { // mul is like Mul but takes an explicit stack to use, for internal use.
// It does not return a *Int because doing so makes the stack-allocated Ints
// used in natmul.go escape to the heap (even though the result is unused).
func (z *Int) mul(stk *stack, x, y *Int) {
// x * y == x * y // x * y == x * y
// x * (-y) == -(x * y) // x * (-y) == -(x * y)
// (-x) * y == -(x * y) // (-x) * y == -(x * y)
@ -192,11 +196,10 @@ func (z *Int) mul(stk *stack, x, y *Int) *Int {
if x == y { if x == y {
z.abs = z.abs.sqr(stk, x.abs) z.abs = z.abs.sqr(stk, x.abs)
z.neg = false z.neg = false
return z return
} }
z.abs = z.abs.mul(stk, x.abs, y.abs) z.abs = z.abs.mul(stk, x.abs, y.abs)
z.neg = len(z.abs) > 0 && x.neg != y.neg // 0 has no sign z.neg = len(z.abs) > 0 && x.neg != y.neg // 0 has no sign
return z
} }
// MulRange sets z to the product of all integers // MulRange sets z to the product of all integers

View File

@ -227,15 +227,14 @@ func alias(x, y nat) bool {
return cap(x) > 0 && cap(y) > 0 && &x[0:cap(x)][cap(x)-1] == &y[0:cap(y)][cap(y)-1] return cap(x) > 0 && cap(y) > 0 && &x[0:cap(x)][cap(x)-1] == &y[0:cap(y)][cap(y)-1]
} }
// addAt implements z += x<<(_W*i); z must be long enough. // addTo implements z += x; z must be long enough.
// (we don't use nat.add because we need z to stay the same // (we don't use nat.add because we need z to stay the same
// slice, and we don't need to normalize z after each addition) // slice, and we don't need to normalize z after each addition)
func addAt(z, x nat, i int) { func addTo(z, x nat) {
if n := len(x); n > 0 { if n := len(x); n > 0 {
if c := addVV(z[i:i+n], z[i:], x); c != 0 { if c := addVV(z[:n], z, x); c != 0 {
j := i + n if n < len(z) {
if j < len(z) { addVW(z[n:], z[n:], c)
addVW(z[j:], z[j:], c)
} }
} }
} }

View File

@ -7,6 +7,8 @@ package big
import ( import (
"fmt" "fmt"
"math" "math"
"math/bits"
"math/rand/v2"
"runtime" "runtime"
"strings" "strings"
"testing" "testing"
@ -56,12 +58,84 @@ var sumNN = []argNN{
{nat{0, 0, 0, 1}, nat{0, 0, _M}, nat{0, 0, 1}}, {nat{0, 0, 0, 1}, nat{0, 0, _M}, nat{0, 0, 1}},
} }
var prodNN = []argNN{ var prodNN = append(prodTests(), prodNNExtra...)
{},
{nil, nil, nil}, func permute[E any](x []E) {
out := make([]E, len(x))
for i, j := range rand.Perm(len(x)) {
out[i] = x[j]
}
copy(x, out)
}
// testMul returns the product of x and y using the grade-school algorithm,
// as a reference implementation.
func testMul(x, y nat) nat {
z := make(nat, len(x)+len(y))
for i, xi := range x {
for j, yj := range y {
hi, lo := bits.Mul(uint(xi), uint(yj))
k := i + j
s, c := bits.Add(uint(z[k]), lo, 0)
z[k] = Word(s)
k++
for hi != 0 || c != 0 {
s, c = bits.Add(uint(z[k]), hi, c)
hi = 0
z[k] = Word(s)
k++
}
}
}
return z.norm()
}
func prodTests() []argNN {
var tests []argNN
for size := range 10 {
var x, y nat
for i := range size {
x = append(x, Word(i+1))
y = append(y, Word(i+1+size))
}
permute(x)
permute(y)
x = x.norm()
y = y.norm()
tests = append(tests, argNN{testMul(x, y), x, y})
}
words := []Word{0, 1, 2, 3, 4, ^Word(0), ^Word(1), ^Word(2), ^Word(3)}
for size := range 10 {
if size == 0 {
continue // already tested the only 0-length possibility above
}
for range 10 {
x := make(nat, size)
y := make(nat, size)
for i := range size {
x[i] = words[rand.N(len(words))]
y[i] = words[rand.N(len(words))]
}
x = x.norm()
y = y.norm()
tests = append(tests, argNN{testMul(x, y), x, y})
}
}
return tests
}
var prodNNExtra = []argNN{
{nil, nat{991}, nil}, {nil, nat{991}, nil},
{nat{991}, nat{991}, nat{1}}, {nat{991}, nat{991}, nat{1}},
{nat{991 * 991}, nat{991}, nat{991}}, {nat{991 * 991}, nat{991}, nat{991}},
{nat{8, 22, 15}, nat{2, 3}, nat{4, 5}},
{nat{10, 27, 52, 45, 28}, nat{2, 3, 4}, nat{5, 6, 7}},
{nat{12, 32, 61, 100, 94, 76, 45}, nat{2, 3, 4, 5}, nat{6, 7, 8, 9}},
{nat{12, 32, 61, 100, 94, 76, 45}, nat{2, 3, 4, 5}, nat{6, 7, 8, 9}},
{nat{14, 37, 70, 114, 170, 166, 148, 115, 66}, nat{2, 3, 4, 5, 6}, nat{7, 8, 9, 10, 11}},
{nat{991 * 991, 991 * 2, 1}, nat{991, 1}, nat{991, 1}},
{nat{991 * 991, 991 * 777 * 2, 777 * 777}, nat{991, 777}, nat{991, 777}},
{nat{0, 0, 991 * 991}, nat{0, 991}, nat{0, 991}}, {nat{0, 0, 991 * 991}, nat{0, 991}, nat{0, 991}},
{nat{1 * 991, 2 * 991, 3 * 991, 4 * 991}, nat{1, 2, 3, 4}, nat{991}}, {nat{1 * 991, 2 * 991, 3 * 991, 4 * 991}, nat{1, 2, 3, 4}, nat{991}},
{nat{4, 11, 20, 30, 20, 11, 4}, nat{1, 2, 3, 4}, nat{4, 3, 2, 1}}, {nat{4, 11, 20, 30, 20, 11, 4}, nat{1, 2, 3, 4}, nat{4, 3, 2, 1}},
@ -114,36 +188,111 @@ func testFunNN(t *testing.T, msg string, f funNN, a argNN) {
} }
func testFunSNN(t *testing.T, msg string, f funSNN, a argNN) { func testFunSNN(t *testing.T, msg string, f funSNN, a argNN) {
t.Helper()
stk := getStack() stk := getStack()
defer stk.free() defer stk.free()
z := f(nil, stk, a.x, a.y) z := f(nil, stk, a.x, a.y)
if z.cmp(a.z) != 0 { if z.cmp(a.z) != 0 {
t.Errorf("%s%+v\n\tgot z = %v; want %v", msg, a, z, a.z) t.Fatalf("%s%+v\n\tgot z = %v; want %v", msg, a, z, a.z)
} }
} }
func TestFunNN(t *testing.T) { func setDuringTest[V any](t *testing.T, p *V, v V) {
old := *p
*p = v
t.Cleanup(func() { *p = old })
}
func TestAdd(t *testing.T) {
for _, a := range sumNN { for _, a := range sumNN {
arg := a testFunNN(t, "add", nat.add, a)
testFunNN(t, "add", nat.add, arg) a.x, a.y = a.y, a.x
testFunNN(t, "add", nat.add, a)
arg = argNN{a.z, a.y, a.x}
testFunNN(t, "add symmetric", nat.add, arg)
arg = argNN{a.x, a.z, a.y}
testFunNN(t, "sub", nat.sub, arg)
arg = argNN{a.y, a.z, a.x}
testFunNN(t, "sub symmetric", nat.sub, arg)
} }
}
func TestSub(t *testing.T) {
for _, a := range sumNN {
a.x, a.z = a.z, a.x
testFunNN(t, "sub", nat.sub, a)
a.y, a.z = a.z, a.y
testFunNN(t, "sub", nat.sub, a)
}
}
func TestNatMul(t *testing.T) {
t.Run("Basic", func(t *testing.T) {
setDuringTest(t, &karatsubaThreshold, 1e9)
for _, a := range prodNN { for _, a := range prodNN {
arg := a if len(a.z) >= 100 {
testFunSNN(t, "mul", nat.mul, arg) continue
arg = argNN{a.z, a.y, a.x}
testFunSNN(t, "mul symmetric", nat.mul, arg)
} }
testFunSNN(t, "mul", nat.mul, a)
a.x, a.y = a.y, a.x
testFunSNN(t, "mul", nat.mul, a)
}
})
t.Run("Karatsuba", func(t *testing.T) {
setDuringTest(t, &karatsubaThreshold, 2)
for _, a := range prodNN {
testFunSNN(t, "mul", nat.mul, a)
a.x, a.y = a.y, a.x
testFunSNN(t, "mul", nat.mul, a)
}
})
t.Run("Mul", func(t *testing.T) {
for _, a := range prodNN {
testFunSNN(t, "mul", nat.mul, a)
a.x, a.y = a.y, a.x
testFunSNN(t, "mul", nat.mul, a)
}
})
}
func testSqr(t *testing.T, x nat) {
stk := getStack()
defer stk.free()
got := make(nat, 2*len(x))
want := make(nat, 2*len(x))
got = got.sqr(stk, x)
want = want.mul(stk, x, x)
if got.cmp(want) != 0 {
t.Errorf("basicSqr(%v), got %v, want %v", x, got, want)
}
}
func TestNatSqr(t *testing.T) {
t.Run("Basic", func(t *testing.T) {
setDuringTest(t, &basicSqrThreshold, 0)
setDuringTest(t, &karatsubaSqrThreshold, 1e9)
for _, a := range prodNN {
if len(a.z) >= 100 {
continue
}
testSqr(t, a.x)
testSqr(t, a.y)
testSqr(t, a.z)
}
})
t.Run("Karatsuba", func(t *testing.T) {
setDuringTest(t, &basicSqrThreshold, 2)
setDuringTest(t, &karatsubaSqrThreshold, 2)
for _, a := range prodNN {
testSqr(t, a.x)
testSqr(t, a.y)
testSqr(t, a.z)
}
})
t.Run("Sqr", func(t *testing.T) {
for _, a := range prodNN {
testSqr(t, a.x)
testSqr(t, a.y)
testSqr(t, a.z)
}
})
} }
var mulRangesN = []struct { var mulRangesN = []struct {
@ -739,33 +888,6 @@ func TestSticky(t *testing.T) {
} }
} }
func testSqr(t *testing.T, x nat) {
stk := getStack()
defer stk.free()
got := make(nat, 2*len(x))
want := make(nat, 2*len(x))
got = got.sqr(stk, x)
want = want.mul(stk, x, x)
if got.cmp(want) != 0 {
t.Errorf("basicSqr(%v), got %v, want %v", x, got, want)
}
}
func TestSqr(t *testing.T) {
for _, a := range prodNN {
if a.x != nil {
testSqr(t, a.x)
}
if a.y != nil {
testSqr(t, a.y)
}
if a.z != nil {
testSqr(t, a.z)
}
}
}
func benchmarkNatSqr(b *testing.B, nwords int) { func benchmarkNatSqr(b *testing.B, nwords int) {
x := rndNat(nwords) x := rndNat(nwords)
var z nat var z nat

View File

@ -831,7 +831,7 @@ func (z nat) divRecursiveStep(stk *stack, u, v nat, depth int) {
if len(qhatv) > s { if len(qhatv) > s {
subVW(qhatv[s:], qhatv[s:], c) subVW(qhatv[s:], qhatv[s:], c)
} }
addAt(uu[s:], v[s:], 0) addTo(uu[s:], v[s:])
} }
if qhatv.cmp(uu.norm()) > 0 { if qhatv.cmp(uu.norm()) > 0 {
panic("impossible") panic("impossible")
@ -840,7 +840,7 @@ func (z nat) divRecursiveStep(stk *stack, u, v nat, depth int) {
if c > 0 { if c > 0 {
subVW(uu[len(qhatv):], uu[len(qhatv):], c) subVW(uu[len(qhatv):], uu[len(qhatv):], c)
} }
addAt(z, qhat, j-B) addTo(z[j-B:], qhat)
j -= B j -= B
stk.restore(mark) stk.restore(mark)
} }
@ -865,7 +865,7 @@ func (z nat) divRecursiveStep(stk *stack, u, v nat, depth int) {
if len(qhatv) > s { if len(qhatv) > s {
subVW(qhatv[s:], qhatv[s:], c) subVW(qhatv[s:], qhatv[s:], c)
} }
addAt(u[s:], v[s:], 0) addTo(u[s:], v[s:])
} }
} }
if qhatv.cmp(u.norm()) > 0 { if qhatv.cmp(u.norm()) > 0 {
@ -880,5 +880,5 @@ func (z nat) divRecursiveStep(stk *stack, u, v nat, depth int) {
} }
// Done! // Done!
addAt(z, qhat.norm(), 0) addTo(z, qhat.norm())
} }

View File

@ -31,71 +31,31 @@ func (z nat) mul(stk *stack, x, y nat) nat {
if alias(z, x) || alias(z, y) { if alias(z, x) || alias(z, y) {
z = nil // z is an alias for x or y - cannot reuse z = nil // z is an alias for x or y - cannot reuse
} }
z = z.make(m + n)
// use basic multiplication if the numbers are small // use basic multiplication if the numbers are small
if n < karatsubaThreshold { if n < karatsubaThreshold {
z = z.make(m + n)
basicMul(z, x, y) basicMul(z, x, y)
return z.norm() return z.norm()
} }
// m >= n && n >= karatsubaThreshold && n >= 2
// determine Karatsuba length k such that
//
// x = xh*b + x0 (0 <= x0 < b)
// y = yh*b + y0 (0 <= y0 < b)
// b = 1<<(_W*k) ("base" of digits xi, yi)
//
k := karatsubaLen(n, karatsubaThreshold)
// k <= n
if stk == nil { if stk == nil {
stk = getStack() stk = getStack()
defer stk.free() defer stk.free()
} }
// multiply x0 and y0 via Karatsuba // Let x = x1:x0 where x0 is the same length as y.
x0 := x[0:k] // x0 is not normalized // Compute z = x0*y and then add in x1*y in sections
y0 := y[0:k] // y0 is not normalized // if needed.
z = z.make(m + n) // enough space for full result of x*y karatsuba(stk, z[:2*n], x[:n], y)
karatsuba(stk, z, x0, y0)
clear(z[2*k:]) // upper portion of z is garbage (and 2*k <= m+n since k <= n <= m)
// If xh != 0 or yh != 0, add the missing terms to z. For if n < m {
// clear(z[2*n:])
// xh = xi*b^i + ... + x2*b^2 + x1*b (0 <= xi < b)
// yh = y1*b (0 <= y1 < b)
//
// the missing terms are
//
// x0*y1*b and xi*y0*b^i, xi*y1*b^(i+1) for i > 0
//
// since all the yi for i > 1 are 0 by choice of k: If any of them
// were > 0, then yh >= b^2 and thus y >= b^2. Then k' = k*2 would
// be a larger valid threshold contradicting the assumption about k.
//
if k < n || m != n {
defer stk.restore(stk.save()) defer stk.restore(stk.save())
t := stk.nat(3 * k) t := stk.nat(2 * n)
for i := n; i < m; i += n {
// add x0*y1*b t = t.mul(stk, x[i:min(i+n, len(x))], y)
x0 := x0.norm() addTo(z[i:], t)
y1 := y[k:] // y1 is normalized because y is
t = t.mul(stk, x0, y1) // update t so we don't lose t's underlying array
addAt(z, t, k)
// add xi*y0<<i, xi*y1*b<<(i+k)
y0 := y0.norm()
for i := k; i < len(x); i += k {
xi := x[i:]
if len(xi) > k {
xi = xi[:k]
}
xi = xi.norm()
t = t.mul(stk, xi, y0)
addAt(z, t, i)
t = t.mul(stk, xi, y1)
addAt(z, t, i+k)
} }
} }
@ -142,28 +102,7 @@ func (z nat) sqr(stk *stack, x nat) nat {
return z.norm() return z.norm()
} }
// Use Karatsuba multiplication optimized for x == y. karatsubaSqr(stk, z, x)
// The algorithm and layout of z are the same as for mul.
// z = (x1*b + x0)^2 = x1^2*b^2 + 2*x1*x0*b + x0^2
k := karatsubaLen(n, karatsubaSqrThreshold)
x0 := x[0:k]
karatsubaSqr(stk, z, x0) // z = x0^2
clear(z[2*k:])
if k < n {
t := stk.nat(2 * k)
x0 := x0.norm()
x1 := x[k:]
t = t.mul(stk, x0, x1)
addAt(z, t, k)
addAt(z, t, k) // z = 2*x1*x0*b + x0^2
t = t.sqr(stk, x1)
addAt(z, t, 2*k) // z = x1^2*b^2 + 2*x1*x0*b + x0^2
}
return z.norm() return z.norm()
} }
@ -188,6 +127,7 @@ func basicSqr(stk *stack, z, x nat) {
addVV(z, z, t) // combine the result addVV(z, z, t) // combine the result
} }
// mulAddWW returns z = x*y + r.
func (z nat) mulAddWW(x nat, y, r Word) nat { func (z nat) mulAddWW(x nat, y, r Word) nat {
m := len(x) m := len(x)
if m == 0 || y == 0 { if m == 0 || y == 0 {
@ -212,155 +152,199 @@ func basicMul(z, x, y nat) {
} }
} }
// karatsubaLen computes an approximation to the maximum k <= n such that // karatsuba multiplies x and y,
// k = p<<i for a number p <= threshold and an i >= 0. Thus, the // writing the (non-normalized) result to z.
// result is the largest number that can be divided repeatedly by 2 before // x and y must have the same length n,
// becoming about the value of threshold. // and z must have length twice that.
func karatsubaLen(n, threshold int) int {
i := uint(0)
for n > threshold {
n >>= 1
i++
}
return n << i
}
// Fast version of z[0:n+n>>1].add(z[0:n+n>>1], x[0:n]) w/o bounds checks.
// Factored out for readability - do not use outside karatsuba.
func karatsubaAdd(z, x nat, n int) {
if c := addVV(z[0:n], z, x); c != 0 {
addVW(z[n:n+n>>1], z[n:], c)
}
}
// Like karatsubaAdd, but does subtract.
func karatsubaSub(z, x nat, n int) {
if c := subVV(z[0:n], z, x); c != 0 {
subVW(z[n:n+n>>1], z[n:], c)
}
}
// karatsuba multiplies x and y and leaves the result in z.
// Both x and y must have the same length n and n must be a
// power of 2. The result vector z must have len(z) == len(x)+len(y).
// The (non-normalized) result is placed in z.
func karatsuba(stk *stack, z, x, y nat) { func karatsuba(stk *stack, z, x, y nat) {
n := len(y) n := len(y)
if len(x) != n || len(z) != 2*n {
panic("bad karatsuba length")
}
// Switch to basic multiplication if numbers are odd or small. // Fall back to basic algorithm if small enough.
// (n is always even if karatsubaThreshold is even, but be if n < karatsubaThreshold || n < 2 {
// conservative)
if n&1 != 0 || n < karatsubaThreshold || n < 2 {
basicMul(z, x, y) basicMul(z, x, y)
return return
} }
// n&1 == 0 && n >= karatsubaThreshold && n >= 2
// Let the notation x1:x0 denote the nat (x1<<N)+x0 for some N,
// and similarly z2:z1:z0 = (z2<<2N)+(z1<<N)+z0.
//
// (Note that z0, z1, z2 might be ≥ 2**N, in which case the high
// bits of, say, z0 are being added to the low bits of z1 in this notation.)
//
// Karatsuba multiplication is based on the observation that // Karatsuba multiplication is based on the observation that
// for two numbers x and y with:
// //
// x = x1*b + x0 // x1:x0 * y1:y0 = x1*y1:(x0*y1+y0*x1):x0*y0
// y = y1*b + y0 // = x1*y1:((x0-x1)*(y1-y0)+x1*y1+x0*y0):x0*y0
// //
// the product x*y can be obtained with 3 products z2, z1, z0 // The second form uses only three half-width multiplications
// instead of 4: // instead of the four that the straightforward first form does.
// //
// x*y = x1*y1*b*b + (x1*y0 + x0*y1)*b + x0*y0 // We call the three pieces z0, z1, z2:
// = z2*b*b + z1*b + z0
// //
// with: // z0 = x0*y0
// // z2 = x1*y1
// xd = x1 - x0 // z1 = (x0-x1)*(y1-y0) + z0 + z2
// yd = y0 - y1
//
// z1 = xd*yd + z2 + z0
// = (x1-x0)*(y0 - y1) + z2 + z0
// = x1*y0 - x1*y1 - x0*y0 + x0*y1 + z2 + z0
// = x1*y0 - z2 - z0 + x0*y1 + z2 + z0
// = x1*y0 + x0*y1
// split x, y into "digits" n2 := (n + 1) / 2
n2 := n >> 1 // n2 >= 1 x0, x1 := &Int{abs: x[:n2].norm()}, &Int{abs: x[n2:].norm()}
x1, x0 := x[n2:], x[0:n2] // x = x1*b + y0 y0, y1 := &Int{abs: y[:n2].norm()}, &Int{abs: y[n2:].norm()}
y1, y0 := y[n2:], y[0:n2] // y = y1*b + y0 z0 := &Int{abs: z[0 : 2*n2]}
z2 := &Int{abs: z[2*n2:]}
// compute z0 and z2 with the result "in place" in z // Allocate temporary storage for z1; repurpose z0 to hold tx and ty.
karatsuba(stk, z, x0, y0) // z0 = x0*y0
karatsuba(stk, z[n:], x1, y1) // z2 = x1*y1
// compute xd, yd (or the negative value if underflow occurs)
s := 1 // sign of product xd*yd
defer stk.restore(stk.save()) defer stk.restore(stk.save())
xd := stk.nat(n2) z1 := &Int{abs: stk.nat(2*n2 + 1)}
yd := stk.nat(n2) tx := &Int{abs: z[0:n2]}
if subVV(xd, x1, x0) != 0 { // x1-x0 ty := &Int{abs: z[n2 : 2*n2]}
s = -s
subVV(xd, x0, x1) // x0-x1 tx.Sub(x0, x1)
ty.Sub(y1, y0)
z1.mul(stk, tx, ty)
clear(z)
z0.mul(stk, x0, y0)
z2.mul(stk, x1, y1)
z1.Add(z1, z0)
z1.Add(z1, z2)
addTo(z[n2:], z1.abs)
// Debug mode: double-check answer and print trace on failure.
const debug = false
if debug {
zz := make(nat, len(z))
basicMul(zz, x, y)
if z.cmp(zz) != 0 {
// All the temps were aliased to z and gone. Recompute.
z0 = new(Int)
z0.mul(stk, x0, y0)
tx = new(Int).Sub(x1, x0)
ty = new(Int).Sub(y0, y1)
z2 = new(Int)
z2.mul(stk, x1, y1)
print("karatsuba wrong\n")
trace("x ", &Int{abs: x})
trace("y ", &Int{abs: y})
trace("z ", &Int{abs: z})
trace("zz", &Int{abs: zz})
trace("x0", x0)
trace("x1", x1)
trace("y0", y0)
trace("y1", y1)
trace("tx", tx)
trace("ty", ty)
trace("z0", z0)
trace("z1", z1)
trace("z2", z2)
panic("karatsuba")
} }
if subVV(yd, y0, y1) != 0 { // y0-y1
s = -s
subVV(yd, y1, y0) // y1-y0
} }
// p = (x1-x0)*(y0-y1) == x1*y0 - x1*y1 - x0*y0 + x0*y1 for s > 0
// p = (x0-x1)*(y0-y1) == x0*y0 - x0*y1 - x1*y0 + x1*y1 for s < 0
p := stk.nat(2 * n2)
karatsuba(stk, p, xd, yd)
// save original z2:z0
// (ok to use upper half of z since we're done recurring)
r := stk.nat(n * 2)
copy(r, z[:n*2])
// add up all partial products
//
// 2*n n 0
// z = [ z2 | z0 ]
// + [ z0 ]
// + [ z2 ]
// + [ p ]
//
karatsubaAdd(z[n2:], r, n)
karatsubaAdd(z[n2:], r[n:], n)
if s > 0 {
karatsubaAdd(z[n2:], p, n)
} else {
karatsubaSub(z[n2:], p, n)
}
} }
// karatsubaSqr squares x and leaves the result in z. // karatsubaSqr squares x,
// len(x) must be a power of 2 and len(z) == 2*len(x). // writing the (non-normalized) result to z.
// The (non-normalized) result is placed in z. // z must have length 2*len(x).
// // It is analogous to [karatsuba] but can run faster
// The algorithm and the layout of z are the same as for karatsuba. // knowing both multiplicands are the same value.
func karatsubaSqr(stk *stack, z, x nat) { func karatsubaSqr(stk *stack, z, x nat) {
n := len(x) n := len(x)
if len(z) != 2*n {
panic("bad karatsubaSqr length")
}
if n&1 != 0 || n < karatsubaSqrThreshold || n < 2 { if n < karatsubaSqrThreshold || n < 2 {
basicSqr(stk, z[:2*n], x) basicSqr(stk, z, x)
return return
} }
n2 := n >> 1 // Recall that for karatsuba we want to compute:
x1, x0 := x[n2:], x[0:n2] //
// x1:x0 * y1:y0 = x1y1:(x0y1+y0x1):x0y0
// = x1y1:((x0-x1)*(y1-y0)+x1y1+x0y0):x0y0
// = z2:z1:z0
// where:
//
// z0 = x0y0
// z2 = x1y1
// z1 = (x0-x1)*(y1-y0) + z0 + z2
//
// When x = y, these simplify to:
//
// z0 = x0²
// z2 = x1²
// z1 = z0 + z2 - (x0-x1)²
karatsubaSqr(stk, z, x0) n2 := (n + 1) / 2
karatsubaSqr(stk, z[n:], x1) x0, x1 := &Int{abs: x[:n2].norm()}, &Int{abs: x[n2:].norm()}
z0 := &Int{abs: z[0 : 2*n2]}
z2 := &Int{abs: z[2*n2:]}
// s = sign(xd*yd) == -1 for xd != 0; s == 1 for xd == 0 // Allocate temporary storage for z1; repurpose z0 to hold tx.
defer stk.restore(stk.save()) defer stk.restore(stk.save())
p := stk.nat(2 * n2) z1 := &Int{abs: stk.nat(2*n2 + 1)}
r := stk.nat(n * 2) tx := &Int{abs: z[0:n2]}
xd := r[:n2]
if subVV(xd, x1, x0) != 0 { tx.Sub(x0, x1)
subVV(xd, x0, x1) z1.abs = z1.abs.sqr(stk, tx.abs)
z1.neg = true
clear(z)
z0.abs = z0.abs.sqr(stk, x0.abs)
z2.abs = z2.abs.sqr(stk, x1.abs)
z1.Add(z1, z0)
z1.Add(z1, z2)
addTo(z[n2:], z1.abs)
// Debug mode: double-check answer and print trace on failure.
const debug = false
if debug {
zz := make(nat, len(z))
basicSqr(stk, zz, x)
if z.cmp(zz) != 0 {
// All the temps were aliased to z and gone. Recompute.
tx = new(Int).Sub(x0, x1)
z0 = new(Int).Mul(x0, x0)
z2 = new(Int).Mul(x1, x1)
z1 = new(Int).Mul(tx, tx)
z1.Neg(z1)
z1.Add(z1, z0)
z1.Add(z1, z2)
print("karatsubaSqr wrong\n")
trace("x ", &Int{abs: x})
trace("z ", &Int{abs: z})
trace("zz", &Int{abs: zz})
trace("x0", x0)
trace("x1", x1)
trace("z0", z0)
trace("z1", z1)
trace("z2", z2)
panic("karatsubaSqr")
}
}
}
// ifmt returns the debug formatting of the Int x: 0xHEX.
func ifmt(x *Int) string {
neg, s, t := "", x.Text(16), ""
if s == "" { // happens for denormalized zero
s = "0x0"
}
if s[0] == '-' {
neg, s = "-", s[1:]
} }
karatsubaSqr(stk, p, xd) // Add _ between words.
copy(r, z[:n*2]) const D = _W / 4 // digits per chunk
for len(s) > D {
karatsubaAdd(z[n2:], r, n) s, t = s[:len(s)-D], s[len(s)-D:]+"_"+t
karatsubaAdd(z[n2:], r[n:], n) }
karatsubaSub(z[n2:], p, n) // s == -1 for p != 0; s == 1 for p == 0 return neg + s + t
}
// trace prints a single debug value.
func trace(name string, x *Int) {
print(name, "=", ifmt(x), "\n")
} }