math/big: update calibration tests and recalibrate

Refactor calibration tests to use the same logic for all. Choosing thresholds that are broadly appropriate for all systems is part science but also part guesswork and judgement. We could instead set per-GOOS/GOARCH thresholds, but that seems like too much work, and even then there would be variation between different chips within a GOOS/GOARCH. (For example see the three linux/amd64 systems benchmarked below.) The thresholds chosen in this CL are: karatsubaThreshold = 40 // unchanged basicSqrThreshold = 12 // was 20 karatsubaSqrThreshold = 80 // was 260 divRecursiveThreshold = 40 // was 100 The new file calibrate.md explains the calibration process and links to graphs justifying those values. (The graphs are hosted on swtch.com to avoid adding a megabyte of extra data to the Go repo and Go distributions.) A rendered copy of calibrate.md is at https://swtch.com/math/big/calibrate.html. goos: linux goarch: amd64 pkg: math/big cpu: Intel(R) Xeon(R) Platinum 8481C CPU @ 2.70GHz │ old │ new │ │ sec/op │ sec/op vs base │ Div/20/10-88 13.13n ± 2% 13.14n ± 2% ~ (p=0.494 n=15) Div/40/20-88 13.13n ± 2% 13.14n ± 2% ~ (p=0.137 n=15) Div/100/50-88 25.50n ± 0% 25.51n ± 0% ~ (p=0.038 n=15) Div/200/100-88 113.1n ± 1% 116.0n ± 3% +2.56% (p=0.000 n=15) Div/400/200-88 135.3n ± 0% 137.1n ± 1% ~ (p=0.004 n=15) Div/1000/500-88 259.9n ± 1% 259.0n ± 2% ~ (p=0.182 n=15) Div/2000/1000-88 568.8n ± 1% 564.7n ± 3% ~ (p=0.927 n=15) Div/20000/10000-88 25.79µ ± 1% 22.11µ ± 2% -14.26% (p=0.000 n=15) Div/200000/100000-88 755.1µ ± 1% 737.6µ ± 1% -2.32% (p=0.000 n=15) Div/2000000/1000000-88 31.30m ± 0% 31.20m ± 1% ~ (p=0.081 n=15) Div/20000000/10000000-88 1.268 ± 0% 1.265 ± 0% ~ (p=0.011 n=15) NatMul/10-88 142.6n ± 0% 142.9n ± 7% ~ (p=0.145 n=15) NatMul/100-88 4.347µ ± 0% 4.350µ ± 3% ~ (p=0.430 n=15) NatMul/1000-88 187.6µ ± 0% 188.4µ ± 2% ~ (p=0.004 n=15) NatMul/10000-88 8.052m ± 0% 8.057m ± 1% ~ (p=0.148 n=15) NatMul/100000-88 260.6m ± 0% 260.7m ± 0% ~ (p=0.512 n=15) NatSqr/1-88 26.58n ± 5% 27.96n ± 8% ~ (p=0.574 n=15) NatSqr/2-88 42.35n ± 7% 44.87n ± 6% ~ (p=0.690 n=15) NatSqr/3-88 53.28n ± 4% 55.62n ± 5% ~ (p=0.151 n=15) NatSqr/5-88 76.26n ± 6% 81.43n ± 6% +6.78% (p=0.000 n=15) NatSqr/8-88 110.8n ± 5% 116.4n ± 6% ~ (p=0.040 n=15) NatSqr/10-88 141.4n ± 4% 147.8n ± 4% ~ (p=0.011 n=15) NatSqr/20-88 325.8n ± 3% 341.7n ± 4% +4.88% (p=0.000 n=15) NatSqr/30-88 536.8n ± 3% 556.1n ± 4% ~ (p=0.027 n=15) NatSqr/50-88 1.168µ ± 3% 1.197µ ± 3% ~ (p=0.442 n=15) NatSqr/80-88 2.527µ ± 2% 2.480µ ± 2% -1.86% (p=0.000 n=15) NatSqr/100-88 3.771µ ± 2% 3.535µ ± 2% -6.26% (p=0.000 n=15) NatSqr/200-88 14.03µ ± 2% 10.57µ ± 3% -24.68% (p=0.000 n=15) NatSqr/300-88 24.06µ ± 2% 20.57µ ± 2% -14.52% (p=0.000 n=15) NatSqr/500-88 65.43µ ± 1% 45.45µ ± 1% -30.55% (p=0.000 n=15) NatSqr/800-88 126.41µ ± 1% 94.13µ ± 2% -25.54% (p=0.000 n=15) NatSqr/1000-88 196.4µ ± 1% 135.1µ ± 1% -31.18% (p=0.000 n=15) NatSqr/10000-88 6.404m ± 0% 5.326m ± 1% -16.84% (p=0.000 n=15) NatSqr/100000-88 267.2m ± 0% 198.7m ± 0% -25.64% (p=0.000 n=15) geomean 7.318µ 6.948µ -5.06% goos: linux goarch: amd64 pkg: math/big cpu: Intel(R) Xeon(R) CPU @ 3.10GHz │ old │ new │ │ sec/op │ sec/op vs base │ Div/20/10-16 22.23n ± 0% 22.23n ± 0% ~ (p=0.973 n=15) Div/40/20-16 22.23n ± 0% 22.23n ± 0% ~ (p=0.226 n=15) Div/100/50-16 55.27n ± 1% 55.59n ± 0% ~ (p=0.004 n=15) Div/200/100-16 174.7n ± 3% 175.9n ± 2% ~ (p=0.645 n=15) Div/400/200-16 208.8n ± 1% 209.5n ± 2% ~ (p=0.169 n=15) Div/1000/500-16 378.7n ± 2% 380.5n ± 2% ~ (p=0.091 n=15) Div/2000/1000-16 778.4n ± 1% 781.1n ± 2% ~ (p=0.104 n=15) Div/20000/10000-16 25.16µ ± 1% 24.93µ ± 1% -0.91% (p=0.000 n=15) Div/200000/100000-16 926.4µ ± 0% 927.7µ ± 1% ~ (p=0.436 n=15) Div/2000000/1000000-16 35.58m ± 0% 35.53m ± 0% ~ (p=0.267 n=15) Div/20000000/10000000-16 1.333 ± 0% 1.330 ± 0% ~ (p=0.126 n=15) NatMul/10-16 172.6n ± 0% 165.4n ± 0% -4.17% (p=0.000 n=15) NatMul/100-16 5.706µ ± 0% 5.503µ ± 0% -3.56% (p=0.000 n=15) NatMul/1000-16 220.8µ ± 0% 219.1µ ± 0% -0.76% (p=0.000 n=15) NatMul/10000-16 8.688m ± 0% 8.621m ± 0% -0.77% (p=0.000 n=15) NatMul/100000-16 333.3m ± 0% 333.5m ± 0% ~ (p=0.512 n=15) NatSqr/1-16 28.66n ± 1% 28.42n ± 3% -0.84% (p=0.000 n=15) NatSqr/2-16 48.29n ± 2% 48.19n ± 2% ~ (p=0.042 n=15) NatSqr/3-16 59.93n ± 0% 59.64n ± 2% -0.48% (p=0.000 n=15) NatSqr/5-16 88.05n ± 0% 87.89n ± 3% ~ (p=0.066 n=15) NatSqr/8-16 127.7n ± 0% 126.9n ± 3% -0.63% (p=0.000 n=15) NatSqr/10-16 170.4n ± 0% 169.7n ± 3% ~ (p=0.004 n=15) NatSqr/20-16 388.8n ± 0% 392.9n ± 3% ~ (p=0.123 n=15) NatSqr/30-16 635.2n ± 0% 641.7n ± 3% ~ (p=0.123 n=15) NatSqr/50-16 1.304µ ± 1% 1.314µ ± 3% ~ (p=0.927 n=15) NatSqr/80-16 2.709µ ± 1% 2.899µ ± 4% +7.01% (p=0.000 n=15) NatSqr/100-16 3.885µ ± 0% 3.981µ ± 4% ~ (p=0.123 n=15) NatSqr/200-16 13.29µ ± 2% 12.14µ ± 4% -8.67% (p=0.000 n=15) NatSqr/300-16 23.39µ ± 0% 22.51µ ± 3% -3.78% (p=0.000 n=15) NatSqr/500-16 58.13µ ± 1% 50.56µ ± 2% -13.02% (p=0.000 n=15) NatSqr/800-16 118.4µ ± 1% 107.6µ ± 2% -9.11% (p=0.000 n=15) NatSqr/1000-16 172.7µ ± 1% 151.8µ ± 2% -12.11% (p=0.000 n=15) NatSqr/10000-16 6.065m ± 1% 5.757m ± 1% -5.08% (p=0.000 n=15) NatSqr/100000-16 240.9m ± 0% 228.1m ± 0% -5.32% (p=0.000 n=15) geomean 8.601µ 8.453µ -1.71% goos: linux goarch: amd64 pkg: math/big cpu: AMD Ryzen 9 7950X 16-Core Processor │ old │ new │ │ sec/op │ sec/op vs base │ Div/20/10-32 11.11n ± 0% 11.11n ± 1% ~ (p=0.532 n=15) Div/40/20-32 11.08n ± 1% 11.11n ± 0% ~ (p=0.815 n=15) Div/100/50-32 16.81n ± 0% 16.84n ± 29% ~ (p=0.020 n=15) Div/200/100-32 73.91n ± 0% 76.85n ± 11% +3.98% (p=0.000 n=15) Div/400/200-32 87.35n ± 0% 88.91n ± 34% +1.79% (p=0.000 n=15) Div/1000/500-32 169.3n ± 1% 168.9n ± 1% ~ (p=0.049 n=15) Div/2000/1000-32 369.3n ± 0% 369.0n ± 0% ~ (p=0.108 n=15) Div/20000/10000-32 15.92µ ± 0% 13.55µ ± 2% -14.91% (p=0.000 n=15) Div/200000/100000-32 491.4µ ± 0% 482.4µ ± 1% -1.84% (p=0.000 n=15) Div/2000000/1000000-32 20.09m ± 0% 19.96m ± 0% -0.69% (p=0.000 n=15) Div/20000000/10000000-32 756.5m ± 0% 755.5m ± 0% ~ (p=0.089 n=15) NatMul/10-32 125.4n ± 5% 124.8n ± 1% ~ (p=0.588 n=15) NatMul/100-32 2.952µ ± 3% 2.969µ ± 0% ~ (p=0.237 n=15) NatMul/1000-32 120.7µ ± 0% 121.1µ ± 0% +0.30% (p=0.000 n=15) NatMul/10000-32 4.845m ± 0% 4.839m ± 1% ~ (p=0.653 n=15) NatMul/100000-32 173.3m ± 0% 173.3m ± 0% ~ (p=0.838 n=15) NatSqr/1-32 31.18n ± 23% 32.08n ± 2% ~ (p=0.015 n=15) NatSqr/2-32 57.22n ± 28% 58.88n ± 2% ~ (p=0.054 n=15) NatSqr/3-32 61.34n ± 18% 64.33n ± 2% ~ (p=0.237 n=15) NatSqr/5-32 72.47n ± 17% 79.81n ± 3% ~ (p=0.067 n=15) NatSqr/8-32 83.26n ± 26% 100.10n ± 3% ~ (p=0.016 n=15) NatSqr/10-32 87.31n ± 43% 125.50n ± 2% ~ (p=0.003 n=15) NatSqr/20-32 193.5n ± 25% 244.4n ± 13% ~ (p=0.002 n=15) NatSqr/30-32 323.9n ± 17% 380.9n ± 6% ~ (p=0.003 n=15) NatSqr/50-32 713.4n ± 9% 761.7n ± 8% ~ (p=0.419 n=15) NatSqr/80-32 1.486µ ± 7% 1.609µ ± 5% +8.28% (p=0.000 n=15) NatSqr/100-32 2.115µ ± 9% 2.253µ ± 1% ~ (p=0.104 n=15) NatSqr/200-32 7.201µ ± 4% 6.610µ ± 1% -8.21% (p=0.000 n=15) NatSqr/300-32 13.08µ ± 2% 12.37µ ± 1% -5.41% (p=0.000 n=15) NatSqr/500-32 32.56µ ± 2% 27.83µ ± 2% -14.52% (p=0.000 n=15) NatSqr/800-32 66.83µ ± 3% 59.59µ ± 1% -10.83% (p=0.000 n=15) NatSqr/1000-32 98.09µ ± 1% 83.59µ ± 1% -14.78% (p=0.000 n=15) NatSqr/10000-32 3.445m ± 1% 3.245m ± 0% -5.81% (p=0.000 n=15) NatSqr/100000-32 137.3m ± 0% 127.0m ± 0% -7.54% (p=0.000 n=15) geomean 4.897µ 4.972µ +1.52% goos: linux goarch: arm64 pkg: math/big │ old │ new │ │ sec/op │ sec/op vs base │ Div/20/10-16 15.26n ± 2% 15.14n ± 1% ~ (p=0.212 n=15) Div/40/20-16 15.22n ± 1% 15.16n ± 0% ~ (p=0.190 n=15) Div/100/50-16 26.53n ± 2% 26.42n ± 0% -0.41% (p=0.000 n=15) Div/200/100-16 124.3n ± 0% 124.0n ± 0% ~ (p=0.704 n=15) Div/400/200-16 142.4n ± 0% 141.8n ± 0% ~ (p=0.074 n=15) Div/1000/500-16 262.0n ± 1% 261.3n ± 1% ~ (p=0.046 n=15) Div/2000/1000-16 532.6n ± 0% 532.5n ± 1% ~ (p=0.798 n=15) Div/20000/10000-16 22.27µ ± 0% 22.88µ ± 0% +2.73% (p=0.000 n=15) Div/200000/100000-16 890.4µ ± 0% 902.8µ ± 0% +1.39% (p=0.000 n=15) Div/2000000/1000000-16 35.03m ± 0% 35.10m ± 0% ~ (p=0.305 n=15) Div/20000000/10000000-16 1.380 ± 0% 1.385 ± 0% ~ (p=0.019 n=15) NatMul/10-16 177.6n ± 1% 175.6n ± 3% ~ (p=0.480 n=15) NatMul/100-16 5.675µ ± 0% 5.669µ ± 1% ~ (p=0.705 n=15) NatMul/1000-16 224.3µ ± 0% 224.6µ ± 0% ~ (p=0.653 n=15) NatMul/10000-16 8.735m ± 0% 8.739m ± 0% ~ (p=0.567 n=15) NatMul/100000-16 331.6m ± 0% 331.6m ± 1% ~ (p=0.412 n=15) NatSqr/1-16 43.69n ± 2% 42.77n ± 6% ~ (p=0.383 n=15) NatSqr/2-16 65.26n ± 2% 63.91n ± 5% ~ (p=0.285 n=15) NatSqr/3-16 73.95n ± 1% 72.25n ± 6% ~ (p=0.198 n=15) NatSqr/5-16 95.06n ± 1% 94.21n ± 3% ~ (p=0.721 n=15) NatSqr/8-16 155.5n ± 1% 153.4n ± 4% ~ (p=0.170 n=15) NatSqr/10-16 175.4n ± 1% 174.0n ± 2% ~ (p=0.271 n=15) NatSqr/20-16 360.8n ± 0% 358.5n ± 2% ~ (p=0.170 n=15) NatSqr/30-16 584.7n ± 0% 582.9n ± 1% ~ (p=0.170 n=15) NatSqr/50-16 1.323µ ± 0% 1.322µ ± 0% ~ (p=0.627 n=15) NatSqr/80-16 2.916µ ± 0% 2.674µ ± 0% -8.30% (p=0.000 n=15) NatSqr/100-16 4.365µ ± 0% 3.802µ ± 0% -12.90% (p=0.000 n=15) NatSqr/200-16 16.42µ ± 0% 11.29µ ± 0% -31.26% (p=0.000 n=15) NatSqr/300-16 28.07µ ± 0% 22.83µ ± 0% -18.68% (p=0.000 n=15) NatSqr/500-16 76.30µ ± 0% 50.06µ ± 0% -34.39% (p=0.000 n=15) NatSqr/800-16 147.5µ ± 0% 101.2µ ± 1% -31.41% (p=0.000 n=15) NatSqr/1000-16 228.6µ ± 0% 149.5µ ± 0% -34.61% (p=0.000 n=15) NatSqr/10000-16 7.417m ± 0% 6.025m ± 0% -18.76% (p=0.000 n=15) NatSqr/100000-16 309.2m ± 0% 214.9m ± 0% -30.50% (p=0.000 n=15) geomean 8.559µ 7.906µ -7.63% goos: darwin goarch: arm64 pkg: math/big cpu: Apple M3 Pro │ old │ new │ │ sec/op │ sec/op vs base │ Div/20/10-12 9.577n ± 6% 9.473n ± 5% ~ (p=0.384 n=15) Div/40/20-12 9.480n ± 1% 9.430n ± 1% ~ (p=0.019 n=15) Div/100/50-12 14.82n ± 0% 14.82n ± 0% ~ (p=0.845 n=15) Div/200/100-12 83.94n ± 1% 84.35n ± 4% ~ (p=0.512 n=15) Div/400/200-12 102.7n ± 1% 102.9n ± 0% ~ (p=0.845 n=15) Div/1000/500-12 185.3n ± 1% 181.9n ± 1% -1.83% (p=0.000 n=15) Div/2000/1000-12 397.0n ± 1% 396.7n ± 0% ~ (p=0.959 n=15) Div/20000/10000-12 14.05µ ± 0% 13.70µ ± 1% ~ (p=0.002 n=15) Div/200000/100000-12 529.4µ ± 3% 526.7µ ± 2% ~ (p=0.967 n=15) Div/2000000/1000000-12 20.05m ± 0% 20.05m ± 0% ~ (p=0.653 n=15) Div/20000000/10000000-12 788.2m ± 1% 789.0m ± 1% ~ (p=0.412 n=15) NatMul/10-12 79.95n ± 1% 80.87n ± 1% +1.15% (p=0.000 n=15) NatMul/100-12 2.973µ ± 0% 2.986µ ± 2% ~ (p=0.051 n=15) NatMul/1000-12 122.6µ ± 5% 123.0µ ± 1% ~ (p=0.783 n=15) NatMul/10000-12 4.990m ± 1% 5.000m ± 1% ~ (p=0.653 n=15) NatMul/100000-12 185.3m ± 3% 190.3m ± 1% ~ (p=0.089 n=15) NatSqr/1-12 11.84n ± 1% 11.88n ± 1% ~ (p=0.735 n=15) NatSqr/2-12 21.01n ± 1% 21.44n ± 6% ~ (p=0.039 n=15) NatSqr/3-12 25.59n ± 0% 26.74n ± 9% +4.49% (p=0.000 n=15) NatSqr/5-12 36.78n ± 0% 37.04n ± 1% +0.71% (p=0.000 n=15) NatSqr/8-12 63.09n ± 3% 63.22n ± 1% ~ (p=0.846 n=15) NatSqr/10-12 79.98n ± 0% 79.78n ± 0% ~ (p=0.100 n=15) NatSqr/20-12 174.0n ± 0% 175.5n ± 1% ~ (p=0.361 n=15) NatSqr/30-12 290.0n ± 0% 291.4n ± 0% ~ (p=0.002 n=15) NatSqr/50-12 655.2n ± 4% 658.1n ± 0% ~ (p=0.060 n=15) NatSqr/80-12 1.506µ ± 0% 1.397µ ± 5% -7.24% (p=0.000 n=15) NatSqr/100-12 2.273µ ± 0% 2.005µ ± 5% -11.79% (p=0.000 n=15) NatSqr/200-12 8.833µ ± 6% 6.109µ ± 0% -30.84% (p=0.000 n=15) NatSqr/300-12 15.15µ ± 4% 12.37µ ± 0% -18.34% (p=0.000 n=15) NatSqr/500-12 41.89µ ± 0% 27.70µ ± 1% -33.88% (p=0.000 n=15) NatSqr/800-12 80.72µ ± 0% 56.40µ ± 0% -30.12% (p=0.000 n=15) NatSqr/1000-12 127.06µ ± 1% 84.06µ ± 1% -33.84% (p=0.000 n=15) NatSqr/10000-12 4.130m ± 0% 3.390m ± 0% -17.91% (p=0.000 n=15) NatSqr/100000-12 173.2m ± 0% 131.2m ± 6% -24.25% (p=0.000 n=15) geomean 4.489µ 4.189µ -6.68% Change-Id: Iaf65fd85457b003ebf07a787c875cda321b40cc9 Reviewed-on: https://go-review.googlesource.com/c/go/+/652058 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Robert Griesemer <gri@google.com> Reviewed-by: Alan Donovan <adonovan@google.com> Auto-Submit: Russ Cox <rsc@golang.org>
2025-01-18 00:17:21 -05:00 · 2025-01-18 00:17:21 -05:00 · a812e5f3c3
parent 40c953cd46
commit a812e5f3c3
6 changed files with 735 additions and 148 deletions
--- a/src/math/big/calibrate.md
+++ b/src/math/big/calibrate.md
@ -0,0 +1,180 @@
+# Calibration of Algorithm Thresholds
+
+This document describes the approach to calibration of algorithmic thresholds in
+`math/big`, implemented in [calibrate_test.go](calibrate_test.go).
+
+Basic operations like multiplication and division have many possible implementations.
+Most algorithms that are better asymptotically have overheads that make them
+run slower for small inputs. When presented with an operation to run, `math/big`
+must decide which algorithm to use.
+
+For example, for small inputs, multiplication using the “grade school algorithm” is fastest.
+Given multi-digit x, y and a target z: clear z, and then for each digit y[i], z[i:] += x\*y[i].
+That last operation, adding a vector times a digit to another vector (including carrying up
+the vector during the multiplication and addition), can be implemented in a tight assembly loop.
+The overall speed is O(N\*\*2) where N is the number of digits in x and y (assume they match),
+but the tight inner loop performs well for small inputs.
+
+[Karatsuba's algorithm](https://en.wikipedia.org/wiki/Karatsuba_algorithm)
+multiplies two N-digit numbers by splitting them in half, computing
+three N/2-digit products, and then reconstructing the final product using a few more
+additions and subtractions. It runs in O(N\*\*log₂ 3) = O(N\*\*1.58) time.
+The grade school loop runs faster for small inputs,
+but eventually Karatsuba's smaller asymptotic run time wins.
+
+The multiplication implementation must decide which to use.
+Under the assumption that once Karatsuba is faster for some N,
+it will be larger for all larger N as well,
+the rule is to use Karatsuba's algorithm when the input length N ≥ karatsubaThreshold.
+
+Calibration is the process of determining what karatsubaThreshold should be set to.
+It doesn't sound like it should be that hard, but it is:
+- Theoretical analysis does not help: the answer depends on the actual machines
+and the actual constant factors in the two implementations.
+- We are picking a single karatsubaThreshold for all systems,
+despite them having different relative execution speeds for the operations
+in the two algorithms.
+(We could in theory pick different thresholds for different architectures,
+but there can still be significant variation within a given architecture.)
+- The assumption that there is a single N where
+an asymptotically better algorithm becomes faster and stays faster
+is not true in general.
+- Recursive algorithms like Karatsuba's may have  different optimal
+thresholds for different large input sizes.
+- Thresholds can interfere. For example, changing the karatsubaThreshold makes
+multiplication faster or slower, which in turn affects the best divRecursiveThreshold
+(because divisions use multiplication).
+
+The best we can do is measure the performance of the overall multiplication
+algorithm across a variety of inputs and thresholds and look for a threshold
+that balances all these concerns reasonably well,
+setting thresholds in dependency order (for example, multiplication before division).
+
+The code in `calibrate_test.go` does this measurement of a variety of input sizes
+and threshold values and prints the timing results as a CSV file.
+The code in `calibrate_graph.go` reads the CSV and writes out an SVG file plotting the data.
+For example:
+
+	go test -run=Calibrate/KaratsubaMul -timeout=1h -calibrate >kmul.csv
+	go run calibrate_graph.go kmul.csv >kmul.svg
+
+Any particular input is sensitive to only a few transitions in threshold.
+For example, an input of size 320 recurses on inputs of size 160,
+which recurses on inputs of size 80,
+which recurses on inputs of size 40,
+and so on, until falling below the Karatsuba threshold.
+Here is what the timing looks like for an input of size 320,
+normalized so that 1.0 is the fastest timing observed:
+
+![KaratsubaThreshold on an Apple M3 Pro, N=320 only](https://swtch.com/math/big/_calibrate/KaratsubaMul/cal.mac320.svg)
+
+For this input, all thresholds from 21 to 40 perform optimally and identically: they all mean “recurse at N=40 but not at N=20”.
+From the single input of size N=320, we cannot decide which of these 20 thresholds is best.
+
+Other inputs exercise other decision points. For example, here is the timing for N=240:
+
+![KaratsubaThreshold on an Apple M3 Pro, N=240 only](https://swtch.com/math/big/_calibrate/KaratsubaMul/cal.mac240.svg)
+
+In this case, all the thresholds from 31 to 60 perform optimally and identically, recursing at N=60 but not N=30.
+
+If we combine these two into a single graph and then plot the geometric mean of the two lines in blue,
+the optimal range becomes a little clearer:
+
+![KaratsubaThreshold on an Apple M3 Pro](https://swtch.com/math/big/_calibrate/KaratsubaMul/cal.mac240+320.svg)
+
+The actual calibration runs all possible inputs from size N=200 to N=400, in increments of 8,
+plotting all 26 lines in a faded gray (note the changed y-axis scale, zooming in near 1.0).
+
+![KaratsubaThreshold on an Apple M3 Pro](https://swtch.com/math/big/_calibrate/KaratsubaMul/cal.mac.svg)
+
+Now the optimal value is clear: the best threshold on this chip, with these algorithmic implementations, is 40.
+
+Unfortunately, other chips are different. Here is an Intel Xeon server chip:
+
+![KaratsubaThreshold on an Apple M3 Pro](https://swtch.com/math/big/_calibrate/KaratsubaMul/cal.c2s16.svg)
+
+On this chip, the best threshold is closer to 60. Luckily, 40 is not a terrible choice either: it is only about 2% slower on average.
+
+The rest of this document presents the timings measured for the `math/big` thresholds on a variety of machines
+and justifies the final thresholds. The timings used these machines:
+
+- The `gotip-linux-amd64_c3h88-perf_vs_release` gomote, a Google Cloud c3-high-88 machine using an Intel Xeon Platinum 8481C CPU (Emerald Rapids).
+- The `gotip-linux-amd64_c2s16-perf_vs_release` gomote, a Google Cloud c2-standard-16 machine using an Intel Xeon Gold 6253CL CPU (Cascade Lake).
+- A home server built with an AMD Ryzen 9 7950X CPU.
+- The `gotip-linux-arm64_c4as16-perf_vs_release` gomote, a Google Cloud c4a-standard-16 machine using Google's Axiom Arm CPU.
+- An Apple MacBook Pro with an Apple M3 Pro CPU.
+
+In general, we break ties in favor of the newer c3h88 x86 perf gomote, then the c4as16 arm64 perf gomote, and then the others.
+
+## Karatsuba Multiplication
+
+Here are the full results for the Karatsuba multiplication threshold.
+
+![KaratsubaThreshold on an Intel Xeon Platium 8481C](https://swtch.com/math/big/_calibrate/KaratsubaMul/cal.c3h88.svg)
+![KaratsubaThreshold on an Intel Xeon Gold 6253CL](https://swtch.com/math/big/_calibrate/KaratsubaMul/cal.c2s16.svg)
+![KaratsubaThreshold on an AMD Ryzen 9 7950X](https://swtch.com/math/big/_calibrate/KaratsubaMul/cal.s7.svg)
+![KaratsubaThreshold on an Axiom Arm](https://swtch.com/math/big/_calibrate/KaratsubaMul/cal.c4as16.svg)
+![KaratsubaThreshold on an Apple M3 Pro](https://swtch.com/math/big/_calibrate/KaratsubaMul/cal.mac.svg)
+
+The majority of systems have optimum thresholds near 40, so we chose karatsubaThreshold = 40.
+
+## Basic Squaring
+
+For squaring a number (`z.Mul(x, x)`), math/big uses grade school multiplication
+up to basicSqrThreshold, where it switches to a customized algorithm that is
+still quadratic but avoids half the word-by-word multiplies
+since the two arguments are identical.
+That algorithm's inner loops are not as tight as the grade school multiplication,
+so it is slower for small inputs. How small?
+
+Here are the timings:
+
+![BasicSqrThreshold on an Intel Xeon Platium 8481C](https://swtch.com/math/big/_calibrate/BasicSqr/cal.c3h88.svg)
+![BasicSqrThreshold on an Intel Xeon Gold 6253CL](https://swtch.com/math/big/_calibrate/BasicSqr/cal.c2s16.svg)
+![BasicSqrThreshold on an AMD Ryzen 9 7950X](https://swtch.com/math/big/_calibrate/BasicSqr/cal.s7.svg)
+![BasicSqrThreshold on an Axiom Arm](https://swtch.com/math/big/_calibrate/BasicSqr/cal.c4as16.svg)
+![BasicSqrThreshold on an Apple M3 Pro](https://swtch.com/math/big/_calibrate/BasicSqr/cal.mac.svg)
+
+These inputs are so small that the calibration times batches of 100 instead of individual operations.
+There is no one best threshold, even on a single system, because some of the sizes seem to run
+the grade school algorithm faster than others.
+For example, on the AMD CPU,
+for N=14, basic squaring is 4% faster than basic multiplication,
+suggesting the threshold has been crossed,
+but for N=16, basic multiplication is 9% faster than basic squaring,
+probably because the tight assembly can use larger chunks.
+
+It is unclear why the Axiom Arm timings are so incredibly noisy.
+
+We chose basicSqrThreshold = 12.
+
+## Karatsuba Squaring
+
+Beyond the basic squaring threshold, at some point a customized Karatsuba can take over.
+It uses three half-sized squarings instead of three half-sized multiplies.
+Here are the timings:
+
+![KaratsubaSqrThreshold on an Intel Xeon Platium 8481C](https://swtch.com/math/big/_calibrate/KaratsubaSqr/cal.c3h88.svg)
+![KaratsubaSqrThreshold on an Intel Xeon Gold 6253CL](https://swtch.com/math/big/_calibrate/KaratsubaSqr/cal.c2s16.svg)
+![KaratsubaSqrThreshold on an AMD Ryzen 9 7950X](https://swtch.com/math/big/_calibrate/KaratsubaSqr/cal.s7.svg)
+![KaratsubaSqrThreshold on an Axiom Arm](https://swtch.com/math/big/_calibrate/KaratsubaSqr/cal.c4as16.svg)
+![KaratsubaSqrThreshold on an Apple M3 Pro](https://swtch.com/math/big/_calibrate/KaratsubaSqr/cal.mac.svg)
+
+The majority of chips preferred a lower threshold, around 60-70,
+but the older Intel Xeon and the AMD prefer a threshold around 100-120.
+
+We chose karatsubaSqrThreshold = 80, which is within 2% of optimal on all the chips.
+
+## Recursive Division
+
+Division uses a recursive divide-and-conquer algorithm for large inputs,
+eventually falling back to a more traditional grade-school whole-input trial-and-error division.
+Here are the timings for the threshold between the two:
+
+![DivRecursiveThreshold on an Intel Xeon Platium 8481C](https://swtch.com/math/big/_calibrate/DivRecursive/cal.c3h88.svg)
+![DivRecursiveThreshold on an Intel Xeon Gold 6253CL](https://swtch.com/math/big/_calibrate/DivRecursive/cal.c2s16.svg)
+![DivRecursiveThreshold on an AMD Ryzen 9 7950X](https://swtch.com/math/big/_calibrate/DivRecursive/cal.s7.svg)
+![DivRecursiveThreshold on an Axiom Arm](https://swtch.com/math/big/_calibrate/DivRecursive/cal.c4as16.svg)
+![DivRecursiveThreshold on an Apple M3 Pro](https://swtch.com/math/big/_calibrate/DivRecursive/cal.mac.svg)
+
+We chose divRecursiveThreshold = 40.
--- a/src/math/big/calibrate_graph.go
+++ b/src/math/big/calibrate_graph.go
@ -0,0 +1,321 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ignore
+
+// This program converts CSV calibration data printed by
+//
+//	go test -run=Calibrate/Name -calibrate >file.csv
+//
+// into an SVG file. Invoke as:
+//
+//	go run calibrate_graph.go file.csv >file.svg
+//
+// See calibrate.md for more details.
+
+package main
+
+import (
+	"bytes"
+	"encoding/csv"
+	"flag"
+	"fmt"
+	"log"
+	"math"
+	"os"
+	"strconv"
+)
+
+func usage() {
+	fmt.Fprintf(os.Stderr, "usage: go run calibrate_graph.go file.csv >file.svg\n")
+	os.Exit(2)
+}
+
+// A Point is an X, Y coordinate in the data being plotted.
+type Point struct {
+	X, Y float64
+}
+
+// A Graph is a graph to draw as SVG.
+type Graph struct {
+	Title   string    // title above graph
+	Geomean []Point   // geomean line
+	Lines   [][]Point // normalized data lines
+	XAxis   string    // x-axis label
+	YAxis   string    // y-axis label
+	Min     Point     // min point of data display
+	Max     Point     // max point of data display
+}
+
+var yMax = flag.Float64("ymax", 1.2, "maximum y axis value")
+var alphaNorm = flag.Float64("alphanorm", 0.1, "alpha for a single norm line")
+
+func main() {
+	flag.Usage = usage
+	flag.Parse()
+	if flag.NArg() != 1 {
+		usage()
+	}
+
+	// Read CSV. It may be enclosed in
+	//	-- name.csv --
+	//	...
+	//	-- eof --
+	// framing, in which case remove the framing.
+	fdata, err := os.ReadFile(flag.Arg(0))
+	if err != nil {
+		log.Fatal(err)
+	}
+	if _, after, ok := bytes.Cut(fdata, []byte(".csv --\n")); ok {
+		fdata = after
+	}
+	if before, _, ok := bytes.Cut(fdata, []byte("-- eof --\n")); ok {
+		fdata = before
+	}
+	rd := csv.NewReader(bytes.NewReader(fdata))
+	rd.FieldsPerRecord = -1
+	records, err := rd.ReadAll()
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	// Construct graph from loaded CSV.
+	// CSV starts with metadata lines like
+	//	goos,darwin
+	// and then has two tables of timings.
+	// Each table looks like
+	//	size \ threshold,10,20,30,40
+	//	100,1,2,3,4
+	//	200,2,3,4,5
+	//	300,3,4,5,6
+	//	400,4,5,6,7
+	//	500,5,6,7,8
+	// The header line gives the threshold values and then each row
+	// gives an input size and the timings for each threshold.
+	// Omitted timings are empty strings and turn into infinities when parsing.
+	// The first table gives raw nanosecond timings.
+	// The second table gives timings normalized relative to the fastest
+	// possible threshold for a given input size.
+	// We only want the second table.
+	// The tables are followed by a list of geomeans of all the normalized
+	// timings for each threshold:
+	//	geomean,1.2,1.1,1.0,1.4
+	// We turn each normalized timing row into a line in the graph,
+	// and we turn the geomean into an overlaid thick line.
+	// The metadata is used for preparing the titles.
+	g := &Graph{
+		YAxis: "Relative Slowdown",
+		Min:   Point{0, 1},
+		Max:   Point{1, 1.2},
+	}
+	meta := make(map[string]string)
+	table := 0 // number of table headers seen
+	var thresholds []float64
+	maxNorm := 0.0
+	for _, rec := range records {
+		if len(rec) == 0 {
+			continue
+		}
+		if len(rec) == 2 {
+			meta[rec[0]] = rec[1]
+			continue
+		}
+		if rec[0] == `size \ threshold` {
+			table++
+			if table == 2 {
+				thresholds = parseFloats(rec)
+				g.Min.X = thresholds[0]
+				g.Max.X = thresholds[len(thresholds)-1]
+			}
+			continue
+		}
+		if rec[0] == "geomean" {
+			table = 3 // end of norms table
+			geomeans := parseFloats(rec)
+			g.Geomean = floatsToLine(thresholds, geomeans)
+			continue
+		}
+		if table == 2 {
+			if _, err := strconv.Atoi(rec[0]); err != nil { // size
+				log.Fatalf("invalid table line: %q", rec)
+			}
+			norms := parseFloats(rec)
+			if len(norms) > len(thresholds) {
+				log.Fatalf("too many timings (%d > %d): %q", len(norms), len(thresholds), rec)
+			}
+			g.Lines = append(g.Lines, floatsToLine(thresholds, norms))
+			for _, y := range norms {
+				maxNorm = max(maxNorm, y)
+			}
+			continue
+		}
+	}
+
+	g.Max.Y = min(*yMax, math.Ceil(maxNorm*100)/100)
+	g.XAxis = meta["calibrate"] + "Threshold"
+	g.Title = meta["goos"] + "/" + meta["goarch"] + " " + meta["cpu"]
+
+	os.Stdout.Write(g.SVG())
+}
+
+// parseFloats parses rec[1:] as floating point values.
+// If a field is the empty string, it is represented as +Inf.
+func parseFloats(rec []string) []float64 {
+	floats := make([]float64, 0, len(rec)-1)
+	for _, v := range rec[1:] {
+		if v == "" {
+			floats = append(floats, math.Inf(+1))
+			continue
+		}
+		f, err := strconv.ParseFloat(v, 64)
+		if err != nil {
+			log.Fatalf("invalid record: %q (%v)", rec, err)
+		}
+		floats = append(floats, f)
+	}
+	return floats
+}
+
+// floatsToLine converts a sequence of floats into a line, ignoring missing (infinite) values.
+func floatsToLine(x, y []float64) []Point {
+	var line []Point
+	for i, yi := range y {
+		if !math.IsInf(yi, 0) {
+			line = append(line, Point{x[i], yi})
+		}
+	}
+	return line
+}
+
+const svgHeader = `<svg width="%d" height="%d" version="1.1" xmlns="http://www.w3.org/2000/svg">
+  <defs>
+    <style type="text/css"><![CDATA[
+      text { stroke-width: 0; white-space: pre; }
+      text.hjc { text-anchor: middle; }
+      text.hjl { text-anchor: start; }
+      text.hjr { text-anchor: end; }
+      .def { stroke-linecap: round; stroke-linejoin: round; fill: none; stroke: #000000; stroke-width: 1px; }
+      .tick { stroke: #000000; fill: #000000; font: %dpx Times; }
+      .title { stroke: #000000; fill: #000000; font: %dpx Times; font-weight: bold; }
+      .axis { stroke-width: 2px; }
+      .norm { stroke: rgba(0,0,0,%f); }
+      .geomean { stroke: #6666ff; stroke-width: 2px; }
+    ]]></style>
+  </defs>
+  <g class="def">
+`
+
+// Layout constants for drawing graph
+const (
+	DX   = 600          // width of graphed data
+	DY   = 150          // height of graphed data
+	ML   = 80           // margin left
+	MT   = 30           // margin top
+	MR   = 10           // margin right
+	MB   = 50           // margin bottom
+	PS   = 14           // point size of text
+	W    = ML + DX + MR // width of overall graph
+	H    = MT + DY + MB // height of overall graph
+	Tick = 5            // axis tick length
+)
+
+// An SVGPoint is a point in the SVG image, in pixel units,
+// with Y increasing down the page.
+type SVGPoint struct {
+	X, Y int
+}
+
+func (p SVGPoint) String() string {
+	return fmt.Sprintf("%d,%d", p.X, p.Y)
+}
+
+// pt converts an x, y data value (such as from a Point) to an SVGPoint.
+func (g *Graph) pt(x, y float64) SVGPoint {
+	return SVGPoint{
+		X: ML + int((x-g.Min.X)/(g.Max.X-g.Min.X)*DX),
+		Y: H - MB - int((y-g.Min.Y)/(g.Max.Y-g.Min.Y)*DY),
+	}
+}
+
+// SVG returns the SVG text for the graph.
+func (g *Graph) SVG() []byte {
+
+	var svg bytes.Buffer
+	fmt.Fprintf(&svg, svgHeader, W, H, PS, PS, *alphaNorm)
+
+	// Draw data, clipped.
+	fmt.Fprintf(&svg, "<clipPath id=\"cp\"><path d=\"M %v L %v L %v L %v Z\" /></clipPath>\n",
+		g.pt(g.Min.X, g.Min.Y), g.pt(g.Max.X, g.Min.Y), g.pt(g.Max.X, g.Max.Y), g.pt(g.Min.X, g.Max.Y))
+	fmt.Fprintf(&svg, "<g clip-path=\"url(#cp)\">\n")
+	for _, line := range g.Lines {
+		if len(line) == 0 {
+			continue
+		}
+		fmt.Fprintf(&svg, "<path class=\"norm\" d=\"M %v", g.pt(line[0].X, line[0].Y))
+		for _, v := range line[1:] {
+			fmt.Fprintf(&svg, " L %v", g.pt(v.X, v.Y))
+		}
+		fmt.Fprintf(&svg, "\"/>\n")
+	}
+	// Draw geomean.
+	if len(g.Geomean) > 0 {
+		line := g.Geomean
+		fmt.Fprintf(&svg, "<path class=\"geomean\" d=\"M %v", g.pt(line[0].X, line[0].Y))
+		for _, v := range line[1:] {
+			fmt.Fprintf(&svg, " L %v", g.pt(v.X, v.Y))
+		}
+		fmt.Fprintf(&svg, "\"/>\n")
+	}
+	fmt.Fprintf(&svg, "</g>\n")
+
+	// Draw axes and major and minor tick marks.
+	fmt.Fprintf(&svg, "<path class=\"axis\" d=\"")
+	fmt.Fprintf(&svg, " M %v L %v", g.pt(g.Min.X, g.Min.Y), g.pt(g.Max.X, g.Min.Y)) // x axis
+	fmt.Fprintf(&svg, " M %v L %v", g.pt(g.Min.X, g.Min.Y), g.pt(g.Min.X, g.Max.Y)) // y axis
+	xscale := 10.0
+	if g.Max.X-g.Min.X < 100 {
+		xscale = 1.0
+	}
+	for x := int(math.Ceil(g.Min.X / xscale)); float64(x)*xscale <= g.Max.X; x++ {
+		if x%5 != 0 {
+			fmt.Fprintf(&svg, " M %v l 0,%d", g.pt(float64(x)*xscale, g.Min.Y), Tick)
+		} else {
+			fmt.Fprintf(&svg, " M %v l 0,%d", g.pt(float64(x)*xscale, g.Min.Y), 2*Tick)
+		}
+	}
+	yscale := 100.0
+	if g.Max.Y-g.Min.Y > 0.5 {
+		yscale = 10
+	}
+	for y := int(math.Ceil(g.Min.Y * yscale)); float64(y) <= g.Max.Y*yscale; y++ {
+		if y%5 != 0 {
+			fmt.Fprintf(&svg, " M %v l -%d,0", g.pt(g.Min.X, float64(y)/yscale), Tick)
+		} else {
+			fmt.Fprintf(&svg, " M %v l -%d,0", g.pt(g.Min.X, float64(y)/yscale), 2*Tick)
+		}
+	}
+	fmt.Fprintf(&svg, "\"/>\n")
+
+	// Draw tick labels on major marks.
+	for x := int(math.Ceil(g.Min.X / xscale)); float64(x)*xscale <= g.Max.X; x++ {
+		if x%5 == 0 {
+			p := g.pt(float64(x)*xscale, g.Min.Y)
+			fmt.Fprintf(&svg, "<text x=\"%d\" y=\"%d\" class=\"tick hjc\">%d</text>\n", p.X, p.Y+2*Tick+PS, x*int(xscale))
+		}
+	}
+	for y := int(math.Ceil(g.Min.Y * yscale)); float64(y) <= g.Max.Y*yscale; y++ {
+		if y%5 == 0 {
+			p := g.pt(g.Min.X, float64(y)/yscale)
+			fmt.Fprintf(&svg, "<text x=\"%d\" y=\"%d\" class=\"tick hjr\">%.2f</text>\n", p.X-2*Tick-Tick, p.Y+PS/3, float64(y)/yscale)
+		}
+	}
+
+	// Draw graph title and axis titles.
+	fmt.Fprintf(&svg, "<text x=\"%d\" y=\"%d\" class=\"title hjc\">%s</text>\n", ML+DX/2, MT-PS/3, g.Title)
+	fmt.Fprintf(&svg, "<text x=\"%d\" y=\"%d\" class=\"title hjc\">%s</text>\n", ML+DX/2, MT+DY+2*Tick+2*PS+PS/2, g.XAxis)
+	fmt.Fprintf(&svg, "<g transform=\"translate(%d,%d) rotate(-90)\"><text x=\"0\" y=\"0\" class=\"title hjc\">%s</text></g>\n", ML-Tick-Tick-3*PS, MT+DY/2, g.YAxis)
+
+	fmt.Fprintf(&svg, "</g></svg>\n")
+	return svg.Bytes()
+}
--- a/src/math/big/calibrate_test.go
+++ b/src/math/big/calibrate_test.go
@ -2,172 +2,266 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-// Calibration used to determine thresholds for using
-// different algorithms.  Ideally, this would be converted
-// to go generate to create thresholds.go
-
-// This file prints execution times for the Mul benchmark
-// given different Karatsuba thresholds. The result may be
-// used to manually fine-tune the threshold constant. The
-// results are somewhat fragile; use repeated runs to get
-// a clear picture.
-
-// Calculates lower and upper thresholds for when basicSqr
-// is faster than standard multiplication.
-
-// Usage: go test -run='^TestCalibrate$' -v -calibrate
+// TestCalibrate determines appropriate thresholds for when to use
+// different calculation algorithms. To run it, use:
+//
+//	go test -run=Calibrate -calibrate >cal.log
+//
+// Calibration data is printed in CSV format, along with the normal test output.
+// See calibrate.md for more details about using the output.

 package big

 import (
 	"flag"
 	"fmt"
+	"internal/sysinfo"
+	"math"
+	"runtime"
+	"slices"
+	"strings"
+	"sync"
 	"testing"
 	"time"
 )

 var calibrate = flag.Bool("calibrate", false, "run calibration test")
-
-const (
-	sqrModeMul       = "mul(x, x)"
-	sqrModeBasic     = "basicSqr(x)"
-	sqrModeKaratsuba = "karatsubaSqr(x)"
-)
+var calibrateOnce sync.Once

 func TestCalibrate(t *testing.T) {
 	if !*calibrate {
 		return
 	}

-	computeKaratsubaThresholds()
+	t.Run("KaratsubaMul", computeKaratsubaThreshold)
+	t.Run("BasicSqr", computeBasicSqrThreshold)
+	t.Run("KaratsubaSqr", computeKaratsubaSqrThreshold)
+	t.Run("DivRecursive", computeDivRecursiveThreshold)
+}

-	// compute basicSqrThreshold where overhead becomes negligible
-	minSqr := computeSqrThreshold(10, 30, 1, 3, sqrModeMul, sqrModeBasic)
-	// compute karatsubaSqrThreshold where karatsuba is faster
-	maxSqr := computeSqrThreshold(200, 500, 10, 3, sqrModeBasic, sqrModeKaratsuba)
-	if minSqr != 0 {
-		fmt.Printf("found basicSqrThreshold = %d\n", minSqr)
-	} else {
-		fmt.Println("no basicSqrThreshold found")
-	}
-	if maxSqr != 0 {
-		fmt.Printf("found karatsubaSqrThreshold = %d\n", maxSqr)
-	} else {
-		fmt.Println("no karatsubaSqrThreshold found")
+func computeKaratsubaThreshold(t *testing.T) {
+	set := func(n int) { karatsubaThreshold = n }
+	computeThreshold(t, "karatsuba", set, 0, 4, 200, benchMul, 200, 8, 400)
+}
+
+func benchMul(size int) func() {
+	x := rndNat(size)
+	y := rndNat(size)
+	var z nat
+	return func() {
+		z.mul(nil, x, y)
 	}
 }

-func karatsubaLoad(b *testing.B) {
-	BenchmarkMul(b)
+func computeBasicSqrThreshold(t *testing.T) {
+	setDuringTest(t, &karatsubaSqrThreshold, 1e9)
+	set := func(n int) { basicSqrThreshold = n }
+	computeThreshold(t, "basicSqr", set, 2, 1, 40, benchBasicSqr, 1, 1, 40)
 }

-// measureKaratsuba returns the time to run a Karatsuba-relevant benchmark
-// given Karatsuba threshold th.
-func measureKaratsuba(th int) time.Duration {
-	th, karatsubaThreshold = karatsubaThreshold, th
-	res := testing.Benchmark(karatsubaLoad)
-	karatsubaThreshold = th
-	return time.Duration(res.NsPerOp())
+func benchBasicSqr(size int) func() {
+	x := rndNat(size)
+	var z nat
+	return func() {
+		// Run 100 squarings because 1 is too fast at the small sizes we consider.
+		// Some systems don't even have precise enough clocks to measure it accurately.
+		for range 100 {
+			z.sqr(nil, x)
+		}
+	}
 }

-func computeKaratsubaThresholds() {
-	fmt.Printf("Multiplication times for varying Karatsuba thresholds\n")
-	fmt.Printf("(run repeatedly for good results)\n")
+func computeKaratsubaSqrThreshold(t *testing.T) {
+	set := func(n int) { karatsubaSqrThreshold = n }
+	computeThreshold(t, "karatsubaSqr", set, 0, 4, 200, benchSqr, 200, 8, 400)
+}

-	// determine Tk, the work load execution time using basic multiplication
-	Tb := measureKaratsuba(1e9) // th == 1e9 => Karatsuba multiplication disabled
-	fmt.Printf("Tb = %10s\n", Tb)
+func benchSqr(size int) func() {
+	x := rndNat(size)
+	var z nat
+	return func() {
+		z.sqr(nil, x)
+	}
+}

-	// thresholds
-	th := 4
-	th1 := -1
-	th2 := -1
+func computeDivRecursiveThreshold(t *testing.T) {
+	set := func(n int) { divRecursiveThreshold = n }
+	computeThreshold(t, "divRecursive", set, 4, 4, 200, benchDiv, 200, 8, 400)
+}

-	var deltaOld time.Duration
-	for count := -1; count != 0 && th < 128; count-- {
-		// determine Tk, the work load execution time using Karatsuba multiplication
-		Tk := measureKaratsuba(th)
+func benchDiv(size int) func() {
+	divx := rndNat(2 * size)
+	divy := rndNat(size)
+	var z, r nat
+	return func() {
+		z.div(nil, r, divx, divy)
+	}
+}

-		// improvement over Tb
-		delta := (Tb - Tk) * 100 / Tb
+func computeThreshold(t *testing.T, name string, set func(int), thresholdLo, thresholdStep, thresholdHi int, bench func(int) func(), sizeLo, sizeStep, sizeHi int) {
+	// Start CSV output; wrapped in txtar framing to separate CSV from other test ouptut.
+	fmt.Printf("-- calibrate-%s.csv --\n", name)
+	defer fmt.Printf("-- eof --\n")

-		fmt.Printf("th = %3d  Tk = %10s  %4d%%", th, Tk, delta)
+	fmt.Printf("goos,%s\n", runtime.GOOS)
+	fmt.Printf("goarch,%s\n", runtime.GOARCH)
+	fmt.Printf("cpu,%s\n", sysinfo.CPUName())
+	fmt.Printf("calibrate,%s\n", name)

-		// determine break-even point
-		if Tk < Tb && th1 < 0 {
-			th1 = th
-			fmt.Print("  break-even point")
+	// Expand lists of sizes and thresholds we will test.
+	var sizes, thresholds []int
+	for size := sizeLo; size <= sizeHi; size += sizeStep {
+		sizes = append(sizes, size)
+	}
+	for thresh := thresholdLo; thresh <= thresholdHi; thresh += thresholdStep {
+		thresholds = append(thresholds, thresh)
+	}
+
+	fmt.Printf("%s\n", csv("size \\ threshold", thresholds))
+
+	// Track minimum time observed for each size, threshold pair.
+	times := make([][]float64, len(sizes))
+	for i := range sizes {
+		times[i] = make([]float64, len(thresholds))
+		for j := range thresholds {
+			times[i][j] = math.Inf(+1)
+		}
+	}
+
+	// For each size, run at most MaxRounds of considering every threshold.
+	// If we run a threshold Stable times in a row without seeing more
+	// than a 1% improvement in the observed minimum, move on to the next one.
+	// After we run Converged rounds (not necessarily in a row)
+	// without seeing any threshold improve by more than 1%, stop.
+	const (
+		MaxRounds = 1600
+		Stable    = 20
+		Converged = 200
+	)
+
+	for i, size := range sizes {
+		b := bench(size)
+		same := 0
+		for range MaxRounds {
+			better := false
+			for j, threshold := range thresholds {
+				// No point if threshold is far beyond size
+				if false && threshold > size+2*sizeStep {
+					continue
+				}
+
+				// BasicSqr is different from the recursive thresholds: it either applies or not,
+				// without any question of recursive subproblems. Only try the thresholds
+				//	size-1, size, size+1, size+2
+				// to get two data points using basic multiplication and two using basic squaring.
+				// This avoids gathering many redundant data points.
+				// (The others have redundant data points as well, but for them the math is less trivial
+				// and best not duplicated in the calibration code.)
+				if false && name == "basicSqr" && (threshold < size-1 || threshold > size+3) {
+					continue
+				}
+
+				set(threshold)
+				b() // warm up
+				b()
+				tmin := times[i][j]
+				for k := 0; k < Stable; k++ {
+					start := time.Now()
+					b()
+					t := float64(time.Since(start))
+					if t < tmin {
+						if t < tmin*99/100 {
+							better = true
+							k = 0
+						}
+						tmin = t
+					}
+				}
+				times[i][j] = tmin
+			}
+			if !better {
+				if same++; same >= Converged {
+					break
+				}
+			}
 		}

-		// determine diminishing return
-		if 0 < delta && delta < deltaOld && th2 < 0 {
-			th2 = th
-			fmt.Print("  diminishing return")
-		}
-		deltaOld = delta
-
-		fmt.Println()
-
-		// trigger counter
-		if th1 >= 0 && th2 >= 0 && count < 0 {
-			count = 10 // this many extra measurements after we got both thresholds
-		}
-
-		th++
+		fmt.Printf("%s\n", csv(fmt.Sprint(size), times[i]))
 	}
+
+	// For each size, normalize timings by the minimum achieved for that size.
+	fmt.Printf("%s\n", csv("size \\ threshold", thresholds))
+	norms := make([][]float64, len(sizes))
+	for i, times := range times {
+		m := min(1e100, slices.Min(times)) // make finite so divide preserves inf values
+		norms[i] = make([]float64, len(times))
+		for j, d := range times {
+			norms[i][j] = d / m
+		}
+		fmt.Printf("%s\n", csv(fmt.Sprint(sizes[i]), norms[i]))
+	}
+
+	// For each threshold, compute geomean of normalized timings across all sizes.
+	geomeans := make([]float64, len(thresholds))
+	for j := range thresholds {
+		p := 1.0
+		n := 0
+		for i := range sizes {
+			if v := norms[i][j]; !math.IsInf(v, +1) {
+				p *= v
+				n++
+			}
+		}
+		if n == 0 {
+			geomeans[j] = math.Inf(+1)
+		} else {
+			geomeans[j] = math.Pow(p, 1/float64(n))
+		}
+	}
+	fmt.Printf("%s\n", csv("geomean", geomeans))
+
+	// Add best threshold and smallest, largest within 10% and 5% of best.
+	var lo10, lo5, best, hi5, hi10 int
+	for i, g := range geomeans {
+		if g < geomeans[best] {
+			best = i
+		}
+	}
+	lo5 = best
+	for lo5 > 0 && geomeans[lo5-1] <= 1.05 {
+		lo5--
+	}
+	lo10 = lo5
+	for lo10 > 0 && geomeans[lo10-1] <= 1.10 {
+		lo10--
+	}
+	hi5 = best
+	for hi5+1 < len(geomeans) && geomeans[hi5+1] <= 1.05 {
+		hi5++
+	}
+	hi10 = hi5
+	for hi10+1 < len(geomeans) && geomeans[hi10+1] <= 1.10 {
+		hi10++
+	}
+	fmt.Printf("lo10%%,%d\n", thresholds[lo10])
+	fmt.Printf("lo5%%,%d\n", thresholds[lo5])
+	fmt.Printf("min,%d\n", thresholds[best])
+	fmt.Printf("hi5%%,%d\n", thresholds[hi5])
+	fmt.Printf("hi10%%,%d\n", thresholds[hi10])
+
+	set(thresholds[best])
 }

-func measureSqr(words, nruns int, mode string) time.Duration {
-	// more runs for better statistics
-	initBasicSqr, initKaratsubaSqr := basicSqrThreshold, karatsubaSqrThreshold
-
-	switch mode {
-	case sqrModeMul:
-		basicSqrThreshold = words + 1
-	case sqrModeBasic:
-		basicSqrThreshold, karatsubaSqrThreshold = words-1, words+1
-	case sqrModeKaratsuba:
-		karatsubaSqrThreshold = words - 1
-	}
-
-	var testval int64
-	for i := 0; i < nruns; i++ {
-		res := testing.Benchmark(func(b *testing.B) { benchmarkNatSqr(b, words) })
-		testval += res.NsPerOp()
-	}
-	testval /= int64(nruns)
-
-	basicSqrThreshold, karatsubaSqrThreshold = initBasicSqr, initKaratsubaSqr
-
-	return time.Duration(testval)
-}
-
-func computeSqrThreshold(from, to, step, nruns int, lower, upper string) int {
-	fmt.Printf("Calibrating threshold between %s and %s\n", lower, upper)
-	fmt.Printf("Looking for a timing difference for x between %d - %d words by %d step\n", from, to, step)
-	var initPos bool
-	var threshold int
-	for i := from; i <= to; i += step {
-		baseline := measureSqr(i, nruns, lower)
-		testval := measureSqr(i, nruns, upper)
-		pos := baseline > testval
-		delta := baseline - testval
-		percent := delta * 100 / baseline
-		fmt.Printf("words = %3d deltaT = %10s (%4d%%) is %s better: %v", i, delta, percent, upper, pos)
-		if i == from {
-			initPos = pos
+// csv returns a single csv line starting with name and followed by the values.
+// Values that are float64 +infinity, denoting missing data, are replaced by an empty string.
+func csv[T int | float64](name string, values []T) string {
+	line := []string{name}
+	for _, v := range values {
+		if math.IsInf(float64(v), +1) {
+			line = append(line, "")
+		} else {
+			line = append(line, fmt.Sprint(v))
 		}
-		if threshold == 0 && pos != initPos {
-			threshold = i
-			fmt.Printf("  threshold  found")
-		}
-		fmt.Println()
-
 	}
-	if threshold != 0 {
-		fmt.Printf("Found threshold = %d between %d - %d\n", threshold, from, to)
-	} else {
-		fmt.Printf("Found NO threshold between %d - %d\n", from, to)
-	}
-	return threshold
+	return strings.Join(line, ",")
 }
--- a/src/math/big/nat_test.go
+++ b/src/math/big/nat_test.go
@ -378,19 +378,6 @@ func rndNat1(n int) nat {
 	return x
 }

-func BenchmarkMul(b *testing.B) {
-	stk := getStack()
-	defer stk.free()
-
-	mulx := rndNat(1e4)
-	muly := rndNat(1e4)
-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		var z nat
-		z.mul(stk, mulx, muly)
-	}
-}
-
 func benchmarkNatMul(b *testing.B, nwords int) {
 	x := rndNat(nwords)
 	y := rndNat(nwords)
--- a/src/math/big/natdiv.go
+++ b/src/math/big/natdiv.go
@ -722,7 +722,7 @@ func greaterThan(x1, x2, y1, y2 Word) bool {

 // divRecursiveThreshold is the number of divisor digits
 // at which point divRecursive is faster than divBasic.
-const divRecursiveThreshold = 100
+var divRecursiveThreshold = 40 // see calibrate_test.go

 // divRecursive implements recursive division as described above.
 // It overwrites z with ⌊u/v⌋ and overwrites u with the remainder r.
--- a/src/math/big/natmul.go
+++ b/src/math/big/natmul.go
@ -9,7 +9,7 @@ package big
 // Operands that are shorter than karatsubaThreshold are multiplied using
 // "grade school" multiplication; for longer operands the Karatsuba algorithm
 // is used.
-var karatsubaThreshold = 40 // computed by calibrate_test.go
+var karatsubaThreshold = 40 // see calibrate_test.go

 // mul sets z = x*y, using stk for temporary storage.
 // The caller may pass stk == nil to request that mul obtain and release one itself.
@ -65,8 +65,8 @@ func (z nat) mul(stk *stack, x, y nat) nat {
 // Operands that are shorter than basicSqrThreshold are squared using
 // "grade school" multiplication; for operands longer than karatsubaSqrThreshold
 // we use the Karatsuba algorithm optimized for x == y.
-var basicSqrThreshold = 20      // computed by calibrate_test.go
-var karatsubaSqrThreshold = 260 // computed by calibrate_test.go
+var basicSqrThreshold = 12     // see calibrate_test.go
+var karatsubaSqrThreshold = 80 // see calibrate_test.go

 // sqr sets z = x*x, using stk for temporary storage.
 // The caller may pass stk == nil to request that sqr obtain and release one itself.
@ -87,7 +87,7 @@ func (z nat) sqr(stk *stack, x nat) nat {
 	}
 	z = z.make(2 * n)

-	if n < basicSqrThreshold {
+	if n < basicSqrThreshold && n < karatsubaSqrThreshold {
 		basicMul(z, x, x)
 		return z.norm()
 	}
@ -112,6 +112,11 @@ func (z nat) sqr(stk *stack, x nat) nat {
 // The (non-normalized) result is placed in z.
 func basicSqr(stk *stack, z, x nat) {
 	n := len(x)
+	if n < basicSqrThreshold {
+		basicMul(z, x, x)
+		return
+	}
+
 	defer stk.restore(stk.save())
 	t := stk.nat(2 * n)
 	clear(t)