math/big: update calibration tests and recalibrate

Refactor calibration tests to use the same logic for all.

Choosing thresholds that are broadly appropriate for all systems is part science
but also part guesswork and judgement. We could instead set per-GOOS/GOARCH
thresholds, but that seems like too much work, and even then there would be
variation between different chips within a GOOS/GOARCH.
(For example see the three linux/amd64 systems benchmarked below.)

The thresholds chosen in this CL are:

	karatsubaThreshold = 40 // unchanged
	basicSqrThreshold = 12 // was 20
	karatsubaSqrThreshold = 80 // was 260
	divRecursiveThreshold = 40 // was 100

The new file calibrate.md explains the calibration process and links to graphs
justifying those values. (The graphs are hosted on swtch.com to avoid adding
a megabyte of extra data to the Go repo and Go distributions.)

A rendered copy of calibrate.md is at https://swtch.com/math/big/calibrate.html.

goos: linux
goarch: amd64
pkg: math/big
cpu: Intel(R) Xeon(R) Platinum 8481C CPU @ 2.70GHz
                         │     old      │                 new                 │
                         │    sec/op    │   sec/op     vs base                │
Div/20/10-88                13.13n ± 2%   13.14n ± 2%        ~ (p=0.494 n=15)
Div/40/20-88                13.13n ± 2%   13.14n ± 2%        ~ (p=0.137 n=15)
Div/100/50-88               25.50n ± 0%   25.51n ± 0%        ~ (p=0.038 n=15)
Div/200/100-88              113.1n ± 1%   116.0n ± 3%   +2.56% (p=0.000 n=15)
Div/400/200-88              135.3n ± 0%   137.1n ± 1%        ~ (p=0.004 n=15)
Div/1000/500-88             259.9n ± 1%   259.0n ± 2%        ~ (p=0.182 n=15)
Div/2000/1000-88            568.8n ± 1%   564.7n ± 3%        ~ (p=0.927 n=15)
Div/20000/10000-88          25.79µ ± 1%   22.11µ ± 2%  -14.26% (p=0.000 n=15)
Div/200000/100000-88        755.1µ ± 1%   737.6µ ± 1%   -2.32% (p=0.000 n=15)
Div/2000000/1000000-88      31.30m ± 0%   31.20m ± 1%        ~ (p=0.081 n=15)
Div/20000000/10000000-88     1.268 ± 0%    1.265 ± 0%        ~ (p=0.011 n=15)
NatMul/10-88                142.6n ± 0%   142.9n ± 7%        ~ (p=0.145 n=15)
NatMul/100-88               4.347µ ± 0%   4.350µ ± 3%        ~ (p=0.430 n=15)
NatMul/1000-88              187.6µ ± 0%   188.4µ ± 2%        ~ (p=0.004 n=15)
NatMul/10000-88             8.052m ± 0%   8.057m ± 1%        ~ (p=0.148 n=15)
NatMul/100000-88            260.6m ± 0%   260.7m ± 0%        ~ (p=0.512 n=15)
NatSqr/1-88                 26.58n ± 5%   27.96n ± 8%        ~ (p=0.574 n=15)
NatSqr/2-88                 42.35n ± 7%   44.87n ± 6%        ~ (p=0.690 n=15)
NatSqr/3-88                 53.28n ± 4%   55.62n ± 5%        ~ (p=0.151 n=15)
NatSqr/5-88                 76.26n ± 6%   81.43n ± 6%   +6.78% (p=0.000 n=15)
NatSqr/8-88                 110.8n ± 5%   116.4n ± 6%        ~ (p=0.040 n=15)
NatSqr/10-88                141.4n ± 4%   147.8n ± 4%        ~ (p=0.011 n=15)
NatSqr/20-88                325.8n ± 3%   341.7n ± 4%   +4.88% (p=0.000 n=15)
NatSqr/30-88                536.8n ± 3%   556.1n ± 4%        ~ (p=0.027 n=15)
NatSqr/50-88                1.168µ ± 3%   1.197µ ± 3%        ~ (p=0.442 n=15)
NatSqr/80-88                2.527µ ± 2%   2.480µ ± 2%   -1.86% (p=0.000 n=15)
NatSqr/100-88               3.771µ ± 2%   3.535µ ± 2%   -6.26% (p=0.000 n=15)
NatSqr/200-88               14.03µ ± 2%   10.57µ ± 3%  -24.68% (p=0.000 n=15)
NatSqr/300-88               24.06µ ± 2%   20.57µ ± 2%  -14.52% (p=0.000 n=15)
NatSqr/500-88               65.43µ ± 1%   45.45µ ± 1%  -30.55% (p=0.000 n=15)
NatSqr/800-88              126.41µ ± 1%   94.13µ ± 2%  -25.54% (p=0.000 n=15)
NatSqr/1000-88              196.4µ ± 1%   135.1µ ± 1%  -31.18% (p=0.000 n=15)
NatSqr/10000-88             6.404m ± 0%   5.326m ± 1%  -16.84% (p=0.000 n=15)
NatSqr/100000-88            267.2m ± 0%   198.7m ± 0%  -25.64% (p=0.000 n=15)
geomean                     7.318µ        6.948µ        -5.06%

goos: linux
goarch: amd64
pkg: math/big
cpu: Intel(R) Xeon(R) CPU @ 3.10GHz
                         │     old     │                 new                 │
                         │   sec/op    │   sec/op     vs base                │
Div/20/10-16               22.23n ± 0%   22.23n ± 0%        ~ (p=0.973 n=15)
Div/40/20-16               22.23n ± 0%   22.23n ± 0%        ~ (p=0.226 n=15)
Div/100/50-16              55.27n ± 1%   55.59n ± 0%        ~ (p=0.004 n=15)
Div/200/100-16             174.7n ± 3%   175.9n ± 2%        ~ (p=0.645 n=15)
Div/400/200-16             208.8n ± 1%   209.5n ± 2%        ~ (p=0.169 n=15)
Div/1000/500-16            378.7n ± 2%   380.5n ± 2%        ~ (p=0.091 n=15)
Div/2000/1000-16           778.4n ± 1%   781.1n ± 2%        ~ (p=0.104 n=15)
Div/20000/10000-16         25.16µ ± 1%   24.93µ ± 1%   -0.91% (p=0.000 n=15)
Div/200000/100000-16       926.4µ ± 0%   927.7µ ± 1%        ~ (p=0.436 n=15)
Div/2000000/1000000-16     35.58m ± 0%   35.53m ± 0%        ~ (p=0.267 n=15)
Div/20000000/10000000-16    1.333 ± 0%    1.330 ± 0%        ~ (p=0.126 n=15)
NatMul/10-16               172.6n ± 0%   165.4n ± 0%   -4.17% (p=0.000 n=15)
NatMul/100-16              5.706µ ± 0%   5.503µ ± 0%   -3.56% (p=0.000 n=15)
NatMul/1000-16             220.8µ ± 0%   219.1µ ± 0%   -0.76% (p=0.000 n=15)
NatMul/10000-16            8.688m ± 0%   8.621m ± 0%   -0.77% (p=0.000 n=15)
NatMul/100000-16           333.3m ± 0%   333.5m ± 0%        ~ (p=0.512 n=15)
NatSqr/1-16                28.66n ± 1%   28.42n ± 3%   -0.84% (p=0.000 n=15)
NatSqr/2-16                48.29n ± 2%   48.19n ± 2%        ~ (p=0.042 n=15)
NatSqr/3-16                59.93n ± 0%   59.64n ± 2%   -0.48% (p=0.000 n=15)
NatSqr/5-16                88.05n ± 0%   87.89n ± 3%        ~ (p=0.066 n=15)
NatSqr/8-16                127.7n ± 0%   126.9n ± 3%   -0.63% (p=0.000 n=15)
NatSqr/10-16               170.4n ± 0%   169.7n ± 3%        ~ (p=0.004 n=15)
NatSqr/20-16               388.8n ± 0%   392.9n ± 3%        ~ (p=0.123 n=15)
NatSqr/30-16               635.2n ± 0%   641.7n ± 3%        ~ (p=0.123 n=15)
NatSqr/50-16               1.304µ ± 1%   1.314µ ± 3%        ~ (p=0.927 n=15)
NatSqr/80-16               2.709µ ± 1%   2.899µ ± 4%   +7.01% (p=0.000 n=15)
NatSqr/100-16              3.885µ ± 0%   3.981µ ± 4%        ~ (p=0.123 n=15)
NatSqr/200-16              13.29µ ± 2%   12.14µ ± 4%   -8.67% (p=0.000 n=15)
NatSqr/300-16              23.39µ ± 0%   22.51µ ± 3%   -3.78% (p=0.000 n=15)
NatSqr/500-16              58.13µ ± 1%   50.56µ ± 2%  -13.02% (p=0.000 n=15)
NatSqr/800-16              118.4µ ± 1%   107.6µ ± 2%   -9.11% (p=0.000 n=15)
NatSqr/1000-16             172.7µ ± 1%   151.8µ ± 2%  -12.11% (p=0.000 n=15)
NatSqr/10000-16            6.065m ± 1%   5.757m ± 1%   -5.08% (p=0.000 n=15)
NatSqr/100000-16           240.9m ± 0%   228.1m ± 0%   -5.32% (p=0.000 n=15)
geomean                    8.601µ        8.453µ        -1.71%

goos: linux
goarch: amd64
pkg: math/big
cpu: AMD Ryzen 9 7950X 16-Core Processor
                         │     old      │                  new                  │
                         │    sec/op    │    sec/op      vs base                │
Div/20/10-32               11.11n ±  0%    11.11n ±  1%        ~ (p=0.532 n=15)
Div/40/20-32               11.08n ±  1%    11.11n ±  0%        ~ (p=0.815 n=15)
Div/100/50-32              16.81n ±  0%    16.84n ± 29%        ~ (p=0.020 n=15)
Div/200/100-32             73.91n ±  0%    76.85n ± 11%   +3.98% (p=0.000 n=15)
Div/400/200-32             87.35n ±  0%    88.91n ± 34%   +1.79% (p=0.000 n=15)
Div/1000/500-32            169.3n ±  1%    168.9n ±  1%        ~ (p=0.049 n=15)
Div/2000/1000-32           369.3n ±  0%    369.0n ±  0%        ~ (p=0.108 n=15)
Div/20000/10000-32         15.92µ ±  0%    13.55µ ±  2%  -14.91% (p=0.000 n=15)
Div/200000/100000-32       491.4µ ±  0%    482.4µ ±  1%   -1.84% (p=0.000 n=15)
Div/2000000/1000000-32     20.09m ±  0%    19.96m ±  0%   -0.69% (p=0.000 n=15)
Div/20000000/10000000-32   756.5m ±  0%    755.5m ±  0%        ~ (p=0.089 n=15)
NatMul/10-32               125.4n ±  5%    124.8n ±  1%        ~ (p=0.588 n=15)
NatMul/100-32              2.952µ ±  3%    2.969µ ±  0%        ~ (p=0.237 n=15)
NatMul/1000-32             120.7µ ±  0%    121.1µ ±  0%   +0.30% (p=0.000 n=15)
NatMul/10000-32            4.845m ±  0%    4.839m ±  1%        ~ (p=0.653 n=15)
NatMul/100000-32           173.3m ±  0%    173.3m ±  0%        ~ (p=0.838 n=15)
NatSqr/1-32                31.18n ± 23%    32.08n ±  2%        ~ (p=0.015 n=15)
NatSqr/2-32                57.22n ± 28%    58.88n ±  2%        ~ (p=0.054 n=15)
NatSqr/3-32                61.34n ± 18%    64.33n ±  2%        ~ (p=0.237 n=15)
NatSqr/5-32                72.47n ± 17%    79.81n ±  3%        ~ (p=0.067 n=15)
NatSqr/8-32                83.26n ± 26%   100.10n ±  3%        ~ (p=0.016 n=15)
NatSqr/10-32               87.31n ± 43%   125.50n ±  2%        ~ (p=0.003 n=15)
NatSqr/20-32               193.5n ± 25%    244.4n ± 13%        ~ (p=0.002 n=15)
NatSqr/30-32               323.9n ± 17%    380.9n ±  6%        ~ (p=0.003 n=15)
NatSqr/50-32               713.4n ±  9%    761.7n ±  8%        ~ (p=0.419 n=15)
NatSqr/80-32               1.486µ ±  7%    1.609µ ±  5%   +8.28% (p=0.000 n=15)
NatSqr/100-32              2.115µ ±  9%    2.253µ ±  1%        ~ (p=0.104 n=15)
NatSqr/200-32              7.201µ ±  4%    6.610µ ±  1%   -8.21% (p=0.000 n=15)
NatSqr/300-32              13.08µ ±  2%    12.37µ ±  1%   -5.41% (p=0.000 n=15)
NatSqr/500-32              32.56µ ±  2%    27.83µ ±  2%  -14.52% (p=0.000 n=15)
NatSqr/800-32              66.83µ ±  3%    59.59µ ±  1%  -10.83% (p=0.000 n=15)
NatSqr/1000-32             98.09µ ±  1%    83.59µ ±  1%  -14.78% (p=0.000 n=15)
NatSqr/10000-32            3.445m ±  1%    3.245m ±  0%   -5.81% (p=0.000 n=15)
NatSqr/100000-32           137.3m ±  0%    127.0m ±  0%   -7.54% (p=0.000 n=15)
geomean                    4.897µ          4.972µ         +1.52%

goos: linux
goarch: arm64
pkg: math/big
                         │     old     │                 new                 │
                         │   sec/op    │   sec/op     vs base                │
Div/20/10-16               15.26n ± 2%   15.14n ± 1%        ~ (p=0.212 n=15)
Div/40/20-16               15.22n ± 1%   15.16n ± 0%        ~ (p=0.190 n=15)
Div/100/50-16              26.53n ± 2%   26.42n ± 0%   -0.41% (p=0.000 n=15)
Div/200/100-16             124.3n ± 0%   124.0n ± 0%        ~ (p=0.704 n=15)
Div/400/200-16             142.4n ± 0%   141.8n ± 0%        ~ (p=0.074 n=15)
Div/1000/500-16            262.0n ± 1%   261.3n ± 1%        ~ (p=0.046 n=15)
Div/2000/1000-16           532.6n ± 0%   532.5n ± 1%        ~ (p=0.798 n=15)
Div/20000/10000-16         22.27µ ± 0%   22.88µ ± 0%   +2.73% (p=0.000 n=15)
Div/200000/100000-16       890.4µ ± 0%   902.8µ ± 0%   +1.39% (p=0.000 n=15)
Div/2000000/1000000-16     35.03m ± 0%   35.10m ± 0%        ~ (p=0.305 n=15)
Div/20000000/10000000-16    1.380 ± 0%    1.385 ± 0%        ~ (p=0.019 n=15)
NatMul/10-16               177.6n ± 1%   175.6n ± 3%        ~ (p=0.480 n=15)
NatMul/100-16              5.675µ ± 0%   5.669µ ± 1%        ~ (p=0.705 n=15)
NatMul/1000-16             224.3µ ± 0%   224.6µ ± 0%        ~ (p=0.653 n=15)
NatMul/10000-16            8.735m ± 0%   8.739m ± 0%        ~ (p=0.567 n=15)
NatMul/100000-16           331.6m ± 0%   331.6m ± 1%        ~ (p=0.412 n=15)
NatSqr/1-16                43.69n ± 2%   42.77n ± 6%        ~ (p=0.383 n=15)
NatSqr/2-16                65.26n ± 2%   63.91n ± 5%        ~ (p=0.285 n=15)
NatSqr/3-16                73.95n ± 1%   72.25n ± 6%        ~ (p=0.198 n=15)
NatSqr/5-16                95.06n ± 1%   94.21n ± 3%        ~ (p=0.721 n=15)
NatSqr/8-16                155.5n ± 1%   153.4n ± 4%        ~ (p=0.170 n=15)
NatSqr/10-16               175.4n ± 1%   174.0n ± 2%        ~ (p=0.271 n=15)
NatSqr/20-16               360.8n ± 0%   358.5n ± 2%        ~ (p=0.170 n=15)
NatSqr/30-16               584.7n ± 0%   582.9n ± 1%        ~ (p=0.170 n=15)
NatSqr/50-16               1.323µ ± 0%   1.322µ ± 0%        ~ (p=0.627 n=15)
NatSqr/80-16               2.916µ ± 0%   2.674µ ± 0%   -8.30% (p=0.000 n=15)
NatSqr/100-16              4.365µ ± 0%   3.802µ ± 0%  -12.90% (p=0.000 n=15)
NatSqr/200-16              16.42µ ± 0%   11.29µ ± 0%  -31.26% (p=0.000 n=15)
NatSqr/300-16              28.07µ ± 0%   22.83µ ± 0%  -18.68% (p=0.000 n=15)
NatSqr/500-16              76.30µ ± 0%   50.06µ ± 0%  -34.39% (p=0.000 n=15)
NatSqr/800-16              147.5µ ± 0%   101.2µ ± 1%  -31.41% (p=0.000 n=15)
NatSqr/1000-16             228.6µ ± 0%   149.5µ ± 0%  -34.61% (p=0.000 n=15)
NatSqr/10000-16            7.417m ± 0%   6.025m ± 0%  -18.76% (p=0.000 n=15)
NatSqr/100000-16           309.2m ± 0%   214.9m ± 0%  -30.50% (p=0.000 n=15)
geomean                    8.559µ        7.906µ        -7.63%

goos: darwin
goarch: arm64
pkg: math/big
cpu: Apple M3 Pro
                         │     old      │                 new                 │
                         │    sec/op    │   sec/op     vs base                │
Div/20/10-12                9.577n ± 6%   9.473n ± 5%        ~ (p=0.384 n=15)
Div/40/20-12                9.480n ± 1%   9.430n ± 1%        ~ (p=0.019 n=15)
Div/100/50-12               14.82n ± 0%   14.82n ± 0%        ~ (p=0.845 n=15)
Div/200/100-12              83.94n ± 1%   84.35n ± 4%        ~ (p=0.512 n=15)
Div/400/200-12              102.7n ± 1%   102.9n ± 0%        ~ (p=0.845 n=15)
Div/1000/500-12             185.3n ± 1%   181.9n ± 1%   -1.83% (p=0.000 n=15)
Div/2000/1000-12            397.0n ± 1%   396.7n ± 0%        ~ (p=0.959 n=15)
Div/20000/10000-12          14.05µ ± 0%   13.70µ ± 1%        ~ (p=0.002 n=15)
Div/200000/100000-12        529.4µ ± 3%   526.7µ ± 2%        ~ (p=0.967 n=15)
Div/2000000/1000000-12      20.05m ± 0%   20.05m ± 0%        ~ (p=0.653 n=15)
Div/20000000/10000000-12    788.2m ± 1%   789.0m ± 1%        ~ (p=0.412 n=15)
NatMul/10-12                79.95n ± 1%   80.87n ± 1%   +1.15% (p=0.000 n=15)
NatMul/100-12               2.973µ ± 0%   2.986µ ± 2%        ~ (p=0.051 n=15)
NatMul/1000-12              122.6µ ± 5%   123.0µ ± 1%        ~ (p=0.783 n=15)
NatMul/10000-12             4.990m ± 1%   5.000m ± 1%        ~ (p=0.653 n=15)
NatMul/100000-12            185.3m ± 3%   190.3m ± 1%        ~ (p=0.089 n=15)
NatSqr/1-12                 11.84n ± 1%   11.88n ± 1%        ~ (p=0.735 n=15)
NatSqr/2-12                 21.01n ± 1%   21.44n ± 6%        ~ (p=0.039 n=15)
NatSqr/3-12                 25.59n ± 0%   26.74n ± 9%   +4.49% (p=0.000 n=15)
NatSqr/5-12                 36.78n ± 0%   37.04n ± 1%   +0.71% (p=0.000 n=15)
NatSqr/8-12                 63.09n ± 3%   63.22n ± 1%        ~ (p=0.846 n=15)
NatSqr/10-12                79.98n ± 0%   79.78n ± 0%        ~ (p=0.100 n=15)
NatSqr/20-12                174.0n ± 0%   175.5n ± 1%        ~ (p=0.361 n=15)
NatSqr/30-12                290.0n ± 0%   291.4n ± 0%        ~ (p=0.002 n=15)
NatSqr/50-12                655.2n ± 4%   658.1n ± 0%        ~ (p=0.060 n=15)
NatSqr/80-12                1.506µ ± 0%   1.397µ ± 5%   -7.24% (p=0.000 n=15)
NatSqr/100-12               2.273µ ± 0%   2.005µ ± 5%  -11.79% (p=0.000 n=15)
NatSqr/200-12               8.833µ ± 6%   6.109µ ± 0%  -30.84% (p=0.000 n=15)
NatSqr/300-12               15.15µ ± 4%   12.37µ ± 0%  -18.34% (p=0.000 n=15)
NatSqr/500-12               41.89µ ± 0%   27.70µ ± 1%  -33.88% (p=0.000 n=15)
NatSqr/800-12               80.72µ ± 0%   56.40µ ± 0%  -30.12% (p=0.000 n=15)
NatSqr/1000-12             127.06µ ± 1%   84.06µ ± 1%  -33.84% (p=0.000 n=15)
NatSqr/10000-12             4.130m ± 0%   3.390m ± 0%  -17.91% (p=0.000 n=15)
NatSqr/100000-12            173.2m ± 0%   131.2m ± 6%  -24.25% (p=0.000 n=15)
geomean                     4.489µ        4.189µ        -6.68%

Change-Id: Iaf65fd85457b003ebf07a787c875cda321b40cc9
Reviewed-on: https://go-review.googlesource.com/c/go/+/652058
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Robert Griesemer <gri@google.com>
Reviewed-by: Alan Donovan <adonovan@google.com>
Auto-Submit: Russ Cox <rsc@golang.org>
This commit is contained in:
Russ Cox 2025-01-18 00:17:21 -05:00 committed by Gopher Robot
parent 40c953cd46
commit a812e5f3c3
6 changed files with 735 additions and 148 deletions

180
src/math/big/calibrate.md Normal file
View File

@ -0,0 +1,180 @@
# Calibration of Algorithm Thresholds
This document describes the approach to calibration of algorithmic thresholds in
`math/big`, implemented in [calibrate_test.go](calibrate_test.go).
Basic operations like multiplication and division have many possible implementations.
Most algorithms that are better asymptotically have overheads that make them
run slower for small inputs. When presented with an operation to run, `math/big`
must decide which algorithm to use.
For example, for small inputs, multiplication using the “grade school algorithm” is fastest.
Given multi-digit x, y and a target z: clear z, and then for each digit y[i], z[i:] += x\*y[i].
That last operation, adding a vector times a digit to another vector (including carrying up
the vector during the multiplication and addition), can be implemented in a tight assembly loop.
The overall speed is O(N\*\*2) where N is the number of digits in x and y (assume they match),
but the tight inner loop performs well for small inputs.
[Karatsuba's algorithm](https://en.wikipedia.org/wiki/Karatsuba_algorithm)
multiplies two N-digit numbers by splitting them in half, computing
three N/2-digit products, and then reconstructing the final product using a few more
additions and subtractions. It runs in O(N\*\*log₂ 3) = O(N\*\*1.58) time.
The grade school loop runs faster for small inputs,
but eventually Karatsuba's smaller asymptotic run time wins.
The multiplication implementation must decide which to use.
Under the assumption that once Karatsuba is faster for some N,
it will be larger for all larger N as well,
the rule is to use Karatsuba's algorithm when the input length N ≥ karatsubaThreshold.
Calibration is the process of determining what karatsubaThreshold should be set to.
It doesn't sound like it should be that hard, but it is:
- Theoretical analysis does not help: the answer depends on the actual machines
and the actual constant factors in the two implementations.
- We are picking a single karatsubaThreshold for all systems,
despite them having different relative execution speeds for the operations
in the two algorithms.
(We could in theory pick different thresholds for different architectures,
but there can still be significant variation within a given architecture.)
- The assumption that there is a single N where
an asymptotically better algorithm becomes faster and stays faster
is not true in general.
- Recursive algorithms like Karatsuba's may have different optimal
thresholds for different large input sizes.
- Thresholds can interfere. For example, changing the karatsubaThreshold makes
multiplication faster or slower, which in turn affects the best divRecursiveThreshold
(because divisions use multiplication).
The best we can do is measure the performance of the overall multiplication
algorithm across a variety of inputs and thresholds and look for a threshold
that balances all these concerns reasonably well,
setting thresholds in dependency order (for example, multiplication before division).
The code in `calibrate_test.go` does this measurement of a variety of input sizes
and threshold values and prints the timing results as a CSV file.
The code in `calibrate_graph.go` reads the CSV and writes out an SVG file plotting the data.
For example:
go test -run=Calibrate/KaratsubaMul -timeout=1h -calibrate >kmul.csv
go run calibrate_graph.go kmul.csv >kmul.svg
Any particular input is sensitive to only a few transitions in threshold.
For example, an input of size 320 recurses on inputs of size 160,
which recurses on inputs of size 80,
which recurses on inputs of size 40,
and so on, until falling below the Karatsuba threshold.
Here is what the timing looks like for an input of size 320,
normalized so that 1.0 is the fastest timing observed:
![KaratsubaThreshold on an Apple M3 Pro, N=320 only](https://swtch.com/math/big/_calibrate/KaratsubaMul/cal.mac320.svg)
For this input, all thresholds from 21 to 40 perform optimally and identically: they all mean “recurse at N=40 but not at N=20”.
From the single input of size N=320, we cannot decide which of these 20 thresholds is best.
Other inputs exercise other decision points. For example, here is the timing for N=240:
![KaratsubaThreshold on an Apple M3 Pro, N=240 only](https://swtch.com/math/big/_calibrate/KaratsubaMul/cal.mac240.svg)
In this case, all the thresholds from 31 to 60 perform optimally and identically, recursing at N=60 but not N=30.
If we combine these two into a single graph and then plot the geometric mean of the two lines in blue,
the optimal range becomes a little clearer:
![KaratsubaThreshold on an Apple M3 Pro](https://swtch.com/math/big/_calibrate/KaratsubaMul/cal.mac240+320.svg)
The actual calibration runs all possible inputs from size N=200 to N=400, in increments of 8,
plotting all 26 lines in a faded gray (note the changed y-axis scale, zooming in near 1.0).
![KaratsubaThreshold on an Apple M3 Pro](https://swtch.com/math/big/_calibrate/KaratsubaMul/cal.mac.svg)
Now the optimal value is clear: the best threshold on this chip, with these algorithmic implementations, is 40.
Unfortunately, other chips are different. Here is an Intel Xeon server chip:
![KaratsubaThreshold on an Apple M3 Pro](https://swtch.com/math/big/_calibrate/KaratsubaMul/cal.c2s16.svg)
On this chip, the best threshold is closer to 60. Luckily, 40 is not a terrible choice either: it is only about 2% slower on average.
The rest of this document presents the timings measured for the `math/big` thresholds on a variety of machines
and justifies the final thresholds. The timings used these machines:
- The `gotip-linux-amd64_c3h88-perf_vs_release` gomote, a Google Cloud c3-high-88 machine using an Intel Xeon Platinum 8481C CPU (Emerald Rapids).
- The `gotip-linux-amd64_c2s16-perf_vs_release` gomote, a Google Cloud c2-standard-16 machine using an Intel Xeon Gold 6253CL CPU (Cascade Lake).
- A home server built with an AMD Ryzen 9 7950X CPU.
- The `gotip-linux-arm64_c4as16-perf_vs_release` gomote, a Google Cloud c4a-standard-16 machine using Google's Axiom Arm CPU.
- An Apple MacBook Pro with an Apple M3 Pro CPU.
In general, we break ties in favor of the newer c3h88 x86 perf gomote, then the c4as16 arm64 perf gomote, and then the others.
## Karatsuba Multiplication
Here are the full results for the Karatsuba multiplication threshold.
![KaratsubaThreshold on an Intel Xeon Platium 8481C](https://swtch.com/math/big/_calibrate/KaratsubaMul/cal.c3h88.svg)
![KaratsubaThreshold on an Intel Xeon Gold 6253CL](https://swtch.com/math/big/_calibrate/KaratsubaMul/cal.c2s16.svg)
![KaratsubaThreshold on an AMD Ryzen 9 7950X](https://swtch.com/math/big/_calibrate/KaratsubaMul/cal.s7.svg)
![KaratsubaThreshold on an Axiom Arm](https://swtch.com/math/big/_calibrate/KaratsubaMul/cal.c4as16.svg)
![KaratsubaThreshold on an Apple M3 Pro](https://swtch.com/math/big/_calibrate/KaratsubaMul/cal.mac.svg)
The majority of systems have optimum thresholds near 40, so we chose karatsubaThreshold = 40.
## Basic Squaring
For squaring a number (`z.Mul(x, x)`), math/big uses grade school multiplication
up to basicSqrThreshold, where it switches to a customized algorithm that is
still quadratic but avoids half the word-by-word multiplies
since the two arguments are identical.
That algorithm's inner loops are not as tight as the grade school multiplication,
so it is slower for small inputs. How small?
Here are the timings:
![BasicSqrThreshold on an Intel Xeon Platium 8481C](https://swtch.com/math/big/_calibrate/BasicSqr/cal.c3h88.svg)
![BasicSqrThreshold on an Intel Xeon Gold 6253CL](https://swtch.com/math/big/_calibrate/BasicSqr/cal.c2s16.svg)
![BasicSqrThreshold on an AMD Ryzen 9 7950X](https://swtch.com/math/big/_calibrate/BasicSqr/cal.s7.svg)
![BasicSqrThreshold on an Axiom Arm](https://swtch.com/math/big/_calibrate/BasicSqr/cal.c4as16.svg)
![BasicSqrThreshold on an Apple M3 Pro](https://swtch.com/math/big/_calibrate/BasicSqr/cal.mac.svg)
These inputs are so small that the calibration times batches of 100 instead of individual operations.
There is no one best threshold, even on a single system, because some of the sizes seem to run
the grade school algorithm faster than others.
For example, on the AMD CPU,
for N=14, basic squaring is 4% faster than basic multiplication,
suggesting the threshold has been crossed,
but for N=16, basic multiplication is 9% faster than basic squaring,
probably because the tight assembly can use larger chunks.
It is unclear why the Axiom Arm timings are so incredibly noisy.
We chose basicSqrThreshold = 12.
## Karatsuba Squaring
Beyond the basic squaring threshold, at some point a customized Karatsuba can take over.
It uses three half-sized squarings instead of three half-sized multiplies.
Here are the timings:
![KaratsubaSqrThreshold on an Intel Xeon Platium 8481C](https://swtch.com/math/big/_calibrate/KaratsubaSqr/cal.c3h88.svg)
![KaratsubaSqrThreshold on an Intel Xeon Gold 6253CL](https://swtch.com/math/big/_calibrate/KaratsubaSqr/cal.c2s16.svg)
![KaratsubaSqrThreshold on an AMD Ryzen 9 7950X](https://swtch.com/math/big/_calibrate/KaratsubaSqr/cal.s7.svg)
![KaratsubaSqrThreshold on an Axiom Arm](https://swtch.com/math/big/_calibrate/KaratsubaSqr/cal.c4as16.svg)
![KaratsubaSqrThreshold on an Apple M3 Pro](https://swtch.com/math/big/_calibrate/KaratsubaSqr/cal.mac.svg)
The majority of chips preferred a lower threshold, around 60-70,
but the older Intel Xeon and the AMD prefer a threshold around 100-120.
We chose karatsubaSqrThreshold = 80, which is within 2% of optimal on all the chips.
## Recursive Division
Division uses a recursive divide-and-conquer algorithm for large inputs,
eventually falling back to a more traditional grade-school whole-input trial-and-error division.
Here are the timings for the threshold between the two:
![DivRecursiveThreshold on an Intel Xeon Platium 8481C](https://swtch.com/math/big/_calibrate/DivRecursive/cal.c3h88.svg)
![DivRecursiveThreshold on an Intel Xeon Gold 6253CL](https://swtch.com/math/big/_calibrate/DivRecursive/cal.c2s16.svg)
![DivRecursiveThreshold on an AMD Ryzen 9 7950X](https://swtch.com/math/big/_calibrate/DivRecursive/cal.s7.svg)
![DivRecursiveThreshold on an Axiom Arm](https://swtch.com/math/big/_calibrate/DivRecursive/cal.c4as16.svg)
![DivRecursiveThreshold on an Apple M3 Pro](https://swtch.com/math/big/_calibrate/DivRecursive/cal.mac.svg)
We chose divRecursiveThreshold = 40.

View File

@ -0,0 +1,321 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ignore
// This program converts CSV calibration data printed by
//
// go test -run=Calibrate/Name -calibrate >file.csv
//
// into an SVG file. Invoke as:
//
// go run calibrate_graph.go file.csv >file.svg
//
// See calibrate.md for more details.
package main
import (
"bytes"
"encoding/csv"
"flag"
"fmt"
"log"
"math"
"os"
"strconv"
)
func usage() {
fmt.Fprintf(os.Stderr, "usage: go run calibrate_graph.go file.csv >file.svg\n")
os.Exit(2)
}
// A Point is an X, Y coordinate in the data being plotted.
type Point struct {
X, Y float64
}
// A Graph is a graph to draw as SVG.
type Graph struct {
Title string // title above graph
Geomean []Point // geomean line
Lines [][]Point // normalized data lines
XAxis string // x-axis label
YAxis string // y-axis label
Min Point // min point of data display
Max Point // max point of data display
}
var yMax = flag.Float64("ymax", 1.2, "maximum y axis value")
var alphaNorm = flag.Float64("alphanorm", 0.1, "alpha for a single norm line")
func main() {
flag.Usage = usage
flag.Parse()
if flag.NArg() != 1 {
usage()
}
// Read CSV. It may be enclosed in
// -- name.csv --
// ...
// -- eof --
// framing, in which case remove the framing.
fdata, err := os.ReadFile(flag.Arg(0))
if err != nil {
log.Fatal(err)
}
if _, after, ok := bytes.Cut(fdata, []byte(".csv --\n")); ok {
fdata = after
}
if before, _, ok := bytes.Cut(fdata, []byte("-- eof --\n")); ok {
fdata = before
}
rd := csv.NewReader(bytes.NewReader(fdata))
rd.FieldsPerRecord = -1
records, err := rd.ReadAll()
if err != nil {
log.Fatal(err)
}
// Construct graph from loaded CSV.
// CSV starts with metadata lines like
// goos,darwin
// and then has two tables of timings.
// Each table looks like
// size \ threshold,10,20,30,40
// 100,1,2,3,4
// 200,2,3,4,5
// 300,3,4,5,6
// 400,4,5,6,7
// 500,5,6,7,8
// The header line gives the threshold values and then each row
// gives an input size and the timings for each threshold.
// Omitted timings are empty strings and turn into infinities when parsing.
// The first table gives raw nanosecond timings.
// The second table gives timings normalized relative to the fastest
// possible threshold for a given input size.
// We only want the second table.
// The tables are followed by a list of geomeans of all the normalized
// timings for each threshold:
// geomean,1.2,1.1,1.0,1.4
// We turn each normalized timing row into a line in the graph,
// and we turn the geomean into an overlaid thick line.
// The metadata is used for preparing the titles.
g := &Graph{
YAxis: "Relative Slowdown",
Min: Point{0, 1},
Max: Point{1, 1.2},
}
meta := make(map[string]string)
table := 0 // number of table headers seen
var thresholds []float64
maxNorm := 0.0
for _, rec := range records {
if len(rec) == 0 {
continue
}
if len(rec) == 2 {
meta[rec[0]] = rec[1]
continue
}
if rec[0] == `size \ threshold` {
table++
if table == 2 {
thresholds = parseFloats(rec)
g.Min.X = thresholds[0]
g.Max.X = thresholds[len(thresholds)-1]
}
continue
}
if rec[0] == "geomean" {
table = 3 // end of norms table
geomeans := parseFloats(rec)
g.Geomean = floatsToLine(thresholds, geomeans)
continue
}
if table == 2 {
if _, err := strconv.Atoi(rec[0]); err != nil { // size
log.Fatalf("invalid table line: %q", rec)
}
norms := parseFloats(rec)
if len(norms) > len(thresholds) {
log.Fatalf("too many timings (%d > %d): %q", len(norms), len(thresholds), rec)
}
g.Lines = append(g.Lines, floatsToLine(thresholds, norms))
for _, y := range norms {
maxNorm = max(maxNorm, y)
}
continue
}
}
g.Max.Y = min(*yMax, math.Ceil(maxNorm*100)/100)
g.XAxis = meta["calibrate"] + "Threshold"
g.Title = meta["goos"] + "/" + meta["goarch"] + " " + meta["cpu"]
os.Stdout.Write(g.SVG())
}
// parseFloats parses rec[1:] as floating point values.
// If a field is the empty string, it is represented as +Inf.
func parseFloats(rec []string) []float64 {
floats := make([]float64, 0, len(rec)-1)
for _, v := range rec[1:] {
if v == "" {
floats = append(floats, math.Inf(+1))
continue
}
f, err := strconv.ParseFloat(v, 64)
if err != nil {
log.Fatalf("invalid record: %q (%v)", rec, err)
}
floats = append(floats, f)
}
return floats
}
// floatsToLine converts a sequence of floats into a line, ignoring missing (infinite) values.
func floatsToLine(x, y []float64) []Point {
var line []Point
for i, yi := range y {
if !math.IsInf(yi, 0) {
line = append(line, Point{x[i], yi})
}
}
return line
}
const svgHeader = `<svg width="%d" height="%d" version="1.1" xmlns="http://www.w3.org/2000/svg">
<defs>
<style type="text/css"><![CDATA[
text { stroke-width: 0; white-space: pre; }
text.hjc { text-anchor: middle; }
text.hjl { text-anchor: start; }
text.hjr { text-anchor: end; }
.def { stroke-linecap: round; stroke-linejoin: round; fill: none; stroke: #000000; stroke-width: 1px; }
.tick { stroke: #000000; fill: #000000; font: %dpx Times; }
.title { stroke: #000000; fill: #000000; font: %dpx Times; font-weight: bold; }
.axis { stroke-width: 2px; }
.norm { stroke: rgba(0,0,0,%f); }
.geomean { stroke: #6666ff; stroke-width: 2px; }
]]></style>
</defs>
<g class="def">
`
// Layout constants for drawing graph
const (
DX = 600 // width of graphed data
DY = 150 // height of graphed data
ML = 80 // margin left
MT = 30 // margin top
MR = 10 // margin right
MB = 50 // margin bottom
PS = 14 // point size of text
W = ML + DX + MR // width of overall graph
H = MT + DY + MB // height of overall graph
Tick = 5 // axis tick length
)
// An SVGPoint is a point in the SVG image, in pixel units,
// with Y increasing down the page.
type SVGPoint struct {
X, Y int
}
func (p SVGPoint) String() string {
return fmt.Sprintf("%d,%d", p.X, p.Y)
}
// pt converts an x, y data value (such as from a Point) to an SVGPoint.
func (g *Graph) pt(x, y float64) SVGPoint {
return SVGPoint{
X: ML + int((x-g.Min.X)/(g.Max.X-g.Min.X)*DX),
Y: H - MB - int((y-g.Min.Y)/(g.Max.Y-g.Min.Y)*DY),
}
}
// SVG returns the SVG text for the graph.
func (g *Graph) SVG() []byte {
var svg bytes.Buffer
fmt.Fprintf(&svg, svgHeader, W, H, PS, PS, *alphaNorm)
// Draw data, clipped.
fmt.Fprintf(&svg, "<clipPath id=\"cp\"><path d=\"M %v L %v L %v L %v Z\" /></clipPath>\n",
g.pt(g.Min.X, g.Min.Y), g.pt(g.Max.X, g.Min.Y), g.pt(g.Max.X, g.Max.Y), g.pt(g.Min.X, g.Max.Y))
fmt.Fprintf(&svg, "<g clip-path=\"url(#cp)\">\n")
for _, line := range g.Lines {
if len(line) == 0 {
continue
}
fmt.Fprintf(&svg, "<path class=\"norm\" d=\"M %v", g.pt(line[0].X, line[0].Y))
for _, v := range line[1:] {
fmt.Fprintf(&svg, " L %v", g.pt(v.X, v.Y))
}
fmt.Fprintf(&svg, "\"/>\n")
}
// Draw geomean.
if len(g.Geomean) > 0 {
line := g.Geomean
fmt.Fprintf(&svg, "<path class=\"geomean\" d=\"M %v", g.pt(line[0].X, line[0].Y))
for _, v := range line[1:] {
fmt.Fprintf(&svg, " L %v", g.pt(v.X, v.Y))
}
fmt.Fprintf(&svg, "\"/>\n")
}
fmt.Fprintf(&svg, "</g>\n")
// Draw axes and major and minor tick marks.
fmt.Fprintf(&svg, "<path class=\"axis\" d=\"")
fmt.Fprintf(&svg, " M %v L %v", g.pt(g.Min.X, g.Min.Y), g.pt(g.Max.X, g.Min.Y)) // x axis
fmt.Fprintf(&svg, " M %v L %v", g.pt(g.Min.X, g.Min.Y), g.pt(g.Min.X, g.Max.Y)) // y axis
xscale := 10.0
if g.Max.X-g.Min.X < 100 {
xscale = 1.0
}
for x := int(math.Ceil(g.Min.X / xscale)); float64(x)*xscale <= g.Max.X; x++ {
if x%5 != 0 {
fmt.Fprintf(&svg, " M %v l 0,%d", g.pt(float64(x)*xscale, g.Min.Y), Tick)
} else {
fmt.Fprintf(&svg, " M %v l 0,%d", g.pt(float64(x)*xscale, g.Min.Y), 2*Tick)
}
}
yscale := 100.0
if g.Max.Y-g.Min.Y > 0.5 {
yscale = 10
}
for y := int(math.Ceil(g.Min.Y * yscale)); float64(y) <= g.Max.Y*yscale; y++ {
if y%5 != 0 {
fmt.Fprintf(&svg, " M %v l -%d,0", g.pt(g.Min.X, float64(y)/yscale), Tick)
} else {
fmt.Fprintf(&svg, " M %v l -%d,0", g.pt(g.Min.X, float64(y)/yscale), 2*Tick)
}
}
fmt.Fprintf(&svg, "\"/>\n")
// Draw tick labels on major marks.
for x := int(math.Ceil(g.Min.X / xscale)); float64(x)*xscale <= g.Max.X; x++ {
if x%5 == 0 {
p := g.pt(float64(x)*xscale, g.Min.Y)
fmt.Fprintf(&svg, "<text x=\"%d\" y=\"%d\" class=\"tick hjc\">%d</text>\n", p.X, p.Y+2*Tick+PS, x*int(xscale))
}
}
for y := int(math.Ceil(g.Min.Y * yscale)); float64(y) <= g.Max.Y*yscale; y++ {
if y%5 == 0 {
p := g.pt(g.Min.X, float64(y)/yscale)
fmt.Fprintf(&svg, "<text x=\"%d\" y=\"%d\" class=\"tick hjr\">%.2f</text>\n", p.X-2*Tick-Tick, p.Y+PS/3, float64(y)/yscale)
}
}
// Draw graph title and axis titles.
fmt.Fprintf(&svg, "<text x=\"%d\" y=\"%d\" class=\"title hjc\">%s</text>\n", ML+DX/2, MT-PS/3, g.Title)
fmt.Fprintf(&svg, "<text x=\"%d\" y=\"%d\" class=\"title hjc\">%s</text>\n", ML+DX/2, MT+DY+2*Tick+2*PS+PS/2, g.XAxis)
fmt.Fprintf(&svg, "<g transform=\"translate(%d,%d) rotate(-90)\"><text x=\"0\" y=\"0\" class=\"title hjc\">%s</text></g>\n", ML-Tick-Tick-3*PS, MT+DY/2, g.YAxis)
fmt.Fprintf(&svg, "</g></svg>\n")
return svg.Bytes()
}

View File

@ -2,172 +2,266 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Calibration used to determine thresholds for using
// different algorithms. Ideally, this would be converted
// to go generate to create thresholds.go
// This file prints execution times for the Mul benchmark
// given different Karatsuba thresholds. The result may be
// used to manually fine-tune the threshold constant. The
// results are somewhat fragile; use repeated runs to get
// a clear picture.
// Calculates lower and upper thresholds for when basicSqr
// is faster than standard multiplication.
// Usage: go test -run='^TestCalibrate$' -v -calibrate
// TestCalibrate determines appropriate thresholds for when to use
// different calculation algorithms. To run it, use:
//
// go test -run=Calibrate -calibrate >cal.log
//
// Calibration data is printed in CSV format, along with the normal test output.
// See calibrate.md for more details about using the output.
package big
import (
"flag"
"fmt"
"internal/sysinfo"
"math"
"runtime"
"slices"
"strings"
"sync"
"testing"
"time"
)
var calibrate = flag.Bool("calibrate", false, "run calibration test")
const (
sqrModeMul = "mul(x, x)"
sqrModeBasic = "basicSqr(x)"
sqrModeKaratsuba = "karatsubaSqr(x)"
)
var calibrateOnce sync.Once
func TestCalibrate(t *testing.T) {
if !*calibrate {
return
}
computeKaratsubaThresholds()
t.Run("KaratsubaMul", computeKaratsubaThreshold)
t.Run("BasicSqr", computeBasicSqrThreshold)
t.Run("KaratsubaSqr", computeKaratsubaSqrThreshold)
t.Run("DivRecursive", computeDivRecursiveThreshold)
}
// compute basicSqrThreshold where overhead becomes negligible
minSqr := computeSqrThreshold(10, 30, 1, 3, sqrModeMul, sqrModeBasic)
// compute karatsubaSqrThreshold where karatsuba is faster
maxSqr := computeSqrThreshold(200, 500, 10, 3, sqrModeBasic, sqrModeKaratsuba)
if minSqr != 0 {
fmt.Printf("found basicSqrThreshold = %d\n", minSqr)
} else {
fmt.Println("no basicSqrThreshold found")
}
if maxSqr != 0 {
fmt.Printf("found karatsubaSqrThreshold = %d\n", maxSqr)
} else {
fmt.Println("no karatsubaSqrThreshold found")
func computeKaratsubaThreshold(t *testing.T) {
set := func(n int) { karatsubaThreshold = n }
computeThreshold(t, "karatsuba", set, 0, 4, 200, benchMul, 200, 8, 400)
}
func benchMul(size int) func() {
x := rndNat(size)
y := rndNat(size)
var z nat
return func() {
z.mul(nil, x, y)
}
}
func karatsubaLoad(b *testing.B) {
BenchmarkMul(b)
func computeBasicSqrThreshold(t *testing.T) {
setDuringTest(t, &karatsubaSqrThreshold, 1e9)
set := func(n int) { basicSqrThreshold = n }
computeThreshold(t, "basicSqr", set, 2, 1, 40, benchBasicSqr, 1, 1, 40)
}
// measureKaratsuba returns the time to run a Karatsuba-relevant benchmark
// given Karatsuba threshold th.
func measureKaratsuba(th int) time.Duration {
th, karatsubaThreshold = karatsubaThreshold, th
res := testing.Benchmark(karatsubaLoad)
karatsubaThreshold = th
return time.Duration(res.NsPerOp())
func benchBasicSqr(size int) func() {
x := rndNat(size)
var z nat
return func() {
// Run 100 squarings because 1 is too fast at the small sizes we consider.
// Some systems don't even have precise enough clocks to measure it accurately.
for range 100 {
z.sqr(nil, x)
}
}
}
func computeKaratsubaThresholds() {
fmt.Printf("Multiplication times for varying Karatsuba thresholds\n")
fmt.Printf("(run repeatedly for good results)\n")
func computeKaratsubaSqrThreshold(t *testing.T) {
set := func(n int) { karatsubaSqrThreshold = n }
computeThreshold(t, "karatsubaSqr", set, 0, 4, 200, benchSqr, 200, 8, 400)
}
// determine Tk, the work load execution time using basic multiplication
Tb := measureKaratsuba(1e9) // th == 1e9 => Karatsuba multiplication disabled
fmt.Printf("Tb = %10s\n", Tb)
func benchSqr(size int) func() {
x := rndNat(size)
var z nat
return func() {
z.sqr(nil, x)
}
}
// thresholds
th := 4
th1 := -1
th2 := -1
func computeDivRecursiveThreshold(t *testing.T) {
set := func(n int) { divRecursiveThreshold = n }
computeThreshold(t, "divRecursive", set, 4, 4, 200, benchDiv, 200, 8, 400)
}
var deltaOld time.Duration
for count := -1; count != 0 && th < 128; count-- {
// determine Tk, the work load execution time using Karatsuba multiplication
Tk := measureKaratsuba(th)
func benchDiv(size int) func() {
divx := rndNat(2 * size)
divy := rndNat(size)
var z, r nat
return func() {
z.div(nil, r, divx, divy)
}
}
// improvement over Tb
delta := (Tb - Tk) * 100 / Tb
func computeThreshold(t *testing.T, name string, set func(int), thresholdLo, thresholdStep, thresholdHi int, bench func(int) func(), sizeLo, sizeStep, sizeHi int) {
// Start CSV output; wrapped in txtar framing to separate CSV from other test ouptut.
fmt.Printf("-- calibrate-%s.csv --\n", name)
defer fmt.Printf("-- eof --\n")
fmt.Printf("th = %3d Tk = %10s %4d%%", th, Tk, delta)
fmt.Printf("goos,%s\n", runtime.GOOS)
fmt.Printf("goarch,%s\n", runtime.GOARCH)
fmt.Printf("cpu,%s\n", sysinfo.CPUName())
fmt.Printf("calibrate,%s\n", name)
// determine break-even point
if Tk < Tb && th1 < 0 {
th1 = th
fmt.Print(" break-even point")
// Expand lists of sizes and thresholds we will test.
var sizes, thresholds []int
for size := sizeLo; size <= sizeHi; size += sizeStep {
sizes = append(sizes, size)
}
for thresh := thresholdLo; thresh <= thresholdHi; thresh += thresholdStep {
thresholds = append(thresholds, thresh)
}
fmt.Printf("%s\n", csv("size \\ threshold", thresholds))
// Track minimum time observed for each size, threshold pair.
times := make([][]float64, len(sizes))
for i := range sizes {
times[i] = make([]float64, len(thresholds))
for j := range thresholds {
times[i][j] = math.Inf(+1)
}
}
// For each size, run at most MaxRounds of considering every threshold.
// If we run a threshold Stable times in a row without seeing more
// than a 1% improvement in the observed minimum, move on to the next one.
// After we run Converged rounds (not necessarily in a row)
// without seeing any threshold improve by more than 1%, stop.
const (
MaxRounds = 1600
Stable = 20
Converged = 200
)
for i, size := range sizes {
b := bench(size)
same := 0
for range MaxRounds {
better := false
for j, threshold := range thresholds {
// No point if threshold is far beyond size
if false && threshold > size+2*sizeStep {
continue
}
// BasicSqr is different from the recursive thresholds: it either applies or not,
// without any question of recursive subproblems. Only try the thresholds
// size-1, size, size+1, size+2
// to get two data points using basic multiplication and two using basic squaring.
// This avoids gathering many redundant data points.
// (The others have redundant data points as well, but for them the math is less trivial
// and best not duplicated in the calibration code.)
if false && name == "basicSqr" && (threshold < size-1 || threshold > size+3) {
continue
}
set(threshold)
b() // warm up
b()
tmin := times[i][j]
for k := 0; k < Stable; k++ {
start := time.Now()
b()
t := float64(time.Since(start))
if t < tmin {
if t < tmin*99/100 {
better = true
k = 0
}
tmin = t
}
}
times[i][j] = tmin
}
if !better {
if same++; same >= Converged {
break
}
}
}
// determine diminishing return
if 0 < delta && delta < deltaOld && th2 < 0 {
th2 = th
fmt.Print(" diminishing return")
}
deltaOld = delta
fmt.Println()
// trigger counter
if th1 >= 0 && th2 >= 0 && count < 0 {
count = 10 // this many extra measurements after we got both thresholds
}
th++
fmt.Printf("%s\n", csv(fmt.Sprint(size), times[i]))
}
// For each size, normalize timings by the minimum achieved for that size.
fmt.Printf("%s\n", csv("size \\ threshold", thresholds))
norms := make([][]float64, len(sizes))
for i, times := range times {
m := min(1e100, slices.Min(times)) // make finite so divide preserves inf values
norms[i] = make([]float64, len(times))
for j, d := range times {
norms[i][j] = d / m
}
fmt.Printf("%s\n", csv(fmt.Sprint(sizes[i]), norms[i]))
}
// For each threshold, compute geomean of normalized timings across all sizes.
geomeans := make([]float64, len(thresholds))
for j := range thresholds {
p := 1.0
n := 0
for i := range sizes {
if v := norms[i][j]; !math.IsInf(v, +1) {
p *= v
n++
}
}
if n == 0 {
geomeans[j] = math.Inf(+1)
} else {
geomeans[j] = math.Pow(p, 1/float64(n))
}
}
fmt.Printf("%s\n", csv("geomean", geomeans))
// Add best threshold and smallest, largest within 10% and 5% of best.
var lo10, lo5, best, hi5, hi10 int
for i, g := range geomeans {
if g < geomeans[best] {
best = i
}
}
lo5 = best
for lo5 > 0 && geomeans[lo5-1] <= 1.05 {
lo5--
}
lo10 = lo5
for lo10 > 0 && geomeans[lo10-1] <= 1.10 {
lo10--
}
hi5 = best
for hi5+1 < len(geomeans) && geomeans[hi5+1] <= 1.05 {
hi5++
}
hi10 = hi5
for hi10+1 < len(geomeans) && geomeans[hi10+1] <= 1.10 {
hi10++
}
fmt.Printf("lo10%%,%d\n", thresholds[lo10])
fmt.Printf("lo5%%,%d\n", thresholds[lo5])
fmt.Printf("min,%d\n", thresholds[best])
fmt.Printf("hi5%%,%d\n", thresholds[hi5])
fmt.Printf("hi10%%,%d\n", thresholds[hi10])
set(thresholds[best])
}
func measureSqr(words, nruns int, mode string) time.Duration {
// more runs for better statistics
initBasicSqr, initKaratsubaSqr := basicSqrThreshold, karatsubaSqrThreshold
switch mode {
case sqrModeMul:
basicSqrThreshold = words + 1
case sqrModeBasic:
basicSqrThreshold, karatsubaSqrThreshold = words-1, words+1
case sqrModeKaratsuba:
karatsubaSqrThreshold = words - 1
}
var testval int64
for i := 0; i < nruns; i++ {
res := testing.Benchmark(func(b *testing.B) { benchmarkNatSqr(b, words) })
testval += res.NsPerOp()
}
testval /= int64(nruns)
basicSqrThreshold, karatsubaSqrThreshold = initBasicSqr, initKaratsubaSqr
return time.Duration(testval)
}
func computeSqrThreshold(from, to, step, nruns int, lower, upper string) int {
fmt.Printf("Calibrating threshold between %s and %s\n", lower, upper)
fmt.Printf("Looking for a timing difference for x between %d - %d words by %d step\n", from, to, step)
var initPos bool
var threshold int
for i := from; i <= to; i += step {
baseline := measureSqr(i, nruns, lower)
testval := measureSqr(i, nruns, upper)
pos := baseline > testval
delta := baseline - testval
percent := delta * 100 / baseline
fmt.Printf("words = %3d deltaT = %10s (%4d%%) is %s better: %v", i, delta, percent, upper, pos)
if i == from {
initPos = pos
// csv returns a single csv line starting with name and followed by the values.
// Values that are float64 +infinity, denoting missing data, are replaced by an empty string.
func csv[T int | float64](name string, values []T) string {
line := []string{name}
for _, v := range values {
if math.IsInf(float64(v), +1) {
line = append(line, "")
} else {
line = append(line, fmt.Sprint(v))
}
if threshold == 0 && pos != initPos {
threshold = i
fmt.Printf(" threshold found")
}
fmt.Println()
}
if threshold != 0 {
fmt.Printf("Found threshold = %d between %d - %d\n", threshold, from, to)
} else {
fmt.Printf("Found NO threshold between %d - %d\n", from, to)
}
return threshold
return strings.Join(line, ",")
}

View File

@ -378,19 +378,6 @@ func rndNat1(n int) nat {
return x
}
func BenchmarkMul(b *testing.B) {
stk := getStack()
defer stk.free()
mulx := rndNat(1e4)
muly := rndNat(1e4)
b.ResetTimer()
for i := 0; i < b.N; i++ {
var z nat
z.mul(stk, mulx, muly)
}
}
func benchmarkNatMul(b *testing.B, nwords int) {
x := rndNat(nwords)
y := rndNat(nwords)

View File

@ -722,7 +722,7 @@ func greaterThan(x1, x2, y1, y2 Word) bool {
// divRecursiveThreshold is the number of divisor digits
// at which point divRecursive is faster than divBasic.
const divRecursiveThreshold = 100
var divRecursiveThreshold = 40 // see calibrate_test.go
// divRecursive implements recursive division as described above.
// It overwrites z with ⌊u/v⌋ and overwrites u with the remainder r.

View File

@ -9,7 +9,7 @@ package big
// Operands that are shorter than karatsubaThreshold are multiplied using
// "grade school" multiplication; for longer operands the Karatsuba algorithm
// is used.
var karatsubaThreshold = 40 // computed by calibrate_test.go
var karatsubaThreshold = 40 // see calibrate_test.go
// mul sets z = x*y, using stk for temporary storage.
// The caller may pass stk == nil to request that mul obtain and release one itself.
@ -65,8 +65,8 @@ func (z nat) mul(stk *stack, x, y nat) nat {
// Operands that are shorter than basicSqrThreshold are squared using
// "grade school" multiplication; for operands longer than karatsubaSqrThreshold
// we use the Karatsuba algorithm optimized for x == y.
var basicSqrThreshold = 20 // computed by calibrate_test.go
var karatsubaSqrThreshold = 260 // computed by calibrate_test.go
var basicSqrThreshold = 12 // see calibrate_test.go
var karatsubaSqrThreshold = 80 // see calibrate_test.go
// sqr sets z = x*x, using stk for temporary storage.
// The caller may pass stk == nil to request that sqr obtain and release one itself.
@ -87,7 +87,7 @@ func (z nat) sqr(stk *stack, x nat) nat {
}
z = z.make(2 * n)
if n < basicSqrThreshold {
if n < basicSqrThreshold && n < karatsubaSqrThreshold {
basicMul(z, x, x)
return z.norm()
}
@ -112,6 +112,11 @@ func (z nat) sqr(stk *stack, x nat) nat {
// The (non-normalized) result is placed in z.
func basicSqr(stk *stack, z, x nat) {
n := len(x)
if n < basicSqrThreshold {
basicMul(z, x, x)
return
}
defer stk.restore(stk.save())
t := stk.nat(2 * n)
clear(t)