mirror of https://github.com/golang/go.git
Division is much slower than multiplication. Replacing each division with a multiplication by a precomputed reciprocal can make the divWVW algorithm up to three times faster, and it also speeds up division of nats. The benchmark results on arm64 are as follows: name old time/op new time/op delta DivWVW/1-4 13.1ns ± 4% 13.3ns ± 4% ~ (p=0.444 n=5+5) DivWVW/2-4 48.6ns ± 1% 51.2ns ± 2% +5.39% (p=0.008 n=5+5) DivWVW/3-4 82.0ns ± 1% 69.7ns ± 1% -15.03% (p=0.008 n=5+5) DivWVW/4-4 116ns ± 1% 71ns ± 2% -38.88% (p=0.008 n=5+5) DivWVW/5-4 152ns ± 1% 84ns ± 4% -44.70% (p=0.008 n=5+5) DivWVW/10-4 319ns ± 1% 155ns ± 4% -51.50% (p=0.008 n=5+5) DivWVW/100-4 3.44µs ± 3% 1.30µs ± 8% -62.30% (p=0.008 n=5+5) DivWVW/1000-4 33.8µs ± 0% 10.9µs ± 1% -67.74% (p=0.008 n=5+5) DivWVW/10000-4 343µs ± 4% 111µs ± 5% -67.63% (p=0.008 n=5+5) DivWVW/100000-4 3.35ms ± 1% 1.25ms ± 3% -62.79% (p=0.008 n=5+5) QuoRem-4 3.08µs ± 2% 2.21µs ± 4% -28.40% (p=0.008 n=5+5) ModSqrt225_Tonelli-4 444µs ± 2% 457µs ± 3% ~ (p=0.095 n=5+5) ModSqrt225_3Mod4-4 136µs ± 1% 138µs ± 3% ~ (p=0.151 n=5+5) ModSqrt231_Tonelli-4 473µs ± 3% 483µs ± 4% ~ (p=0.548 n=5+5) ModSqrt231_5Mod8-4 164µs ± 9% 169µs ±12% ~ (p=0.421 n=5+5) Sqrt-4 36.8µs ± 1% 28.6µs ± 0% -22.17% (p=0.016 n=5+4) Div/20/10-4 50.0ns ± 3% 51.3ns ± 6% ~ (p=0.238 n=5+5) Div/40/20-4 49.8ns ± 2% 51.3ns ± 6% ~ (p=0.222 n=5+5) Div/100/50-4 85.8ns ± 4% 86.5ns ± 5% ~ (p=0.246 n=5+5) Div/200/100-4 335ns ± 3% 296ns ± 2% -11.60% (p=0.008 n=5+5) Div/400/200-4 442ns ± 2% 359ns ± 5% -18.81% (p=0.008 n=5+5) Div/1000/500-4 858ns ± 3% 643ns ± 6% -25.06% (p=0.008 n=5+5) Div/2000/1000-4 1.70µs ± 3% 1.28µs ± 4% -24.80% (p=0.008 n=5+5) Div/20000/10000-4 45.0µs ± 5% 41.8µs ± 4% -7.17% (p=0.016 n=5+5) Div/200000/100000-4 1.51ms ± 7% 1.43ms ± 3% -5.42% (p=0.016 n=5+5) Div/2000000/1000000-4 57.6ms ± 4% 57.5ms ± 3% ~ (p=1.000 n=5+5) Div/20000000/10000000-4 2.08s ± 3% 2.04s ± 1% ~ (p=0.095 n=5+5) name old speed new speed delta DivWVW/1-4 4.87GB/s ± 4% 
4.80GB/s ± 4% ~ (p=0.310 n=5+5) DivWVW/2-4 2.63GB/s ± 1% 2.50GB/s ± 2% -5.07% (p=0.008 n=5+5) DivWVW/3-4 2.34GB/s ± 1% 2.76GB/s ± 1% +17.70% (p=0.008 n=5+5) DivWVW/4-4 2.21GB/s ± 1% 3.61GB/s ± 2% +63.42% (p=0.008 n=5+5) DivWVW/5-4 2.10GB/s ± 2% 3.81GB/s ± 4% +80.89% (p=0.008 n=5+5) DivWVW/10-4 2.01GB/s ± 0% 4.13GB/s ± 4% +105.91% (p=0.008 n=5+5) DivWVW/100-4 1.86GB/s ± 2% 4.95GB/s ± 7% +165.63% (p=0.008 n=5+5) DivWVW/1000-4 1.89GB/s ± 0% 5.86GB/s ± 1% +209.96% (p=0.008 n=5+5) DivWVW/10000-4 1.87GB/s ± 4% 5.76GB/s ± 5% +208.96% (p=0.008 n=5+5) DivWVW/100000-4 1.91GB/s ± 1% 5.14GB/s ± 3% +168.85% (p=0.008 n=5+5) Change-Id: I049f1196562b20800e6ef8a6493fd147f93ad830 Reviewed-on: https://go-review.googlesource.com/c/go/+/250417 Trust: Giovanni Bajo <rasky@develer.com> Trust: Keith Randall <khr@golang.org> Run-TryBot: Giovanni Bajo <rasky@develer.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org> |
||
|---|---|---|
| .. | ||
| big | ||
| bits | ||
| cmplx | ||
| rand | ||
| abs.go | ||
| acos_s390x.s | ||
| acosh.go | ||
| acosh_s390x.s | ||
| all_test.go | ||
| arith_s390x.go | ||
| arith_s390x_test.go | ||
| asin.go | ||
| asin_386.s | ||
| asin_s390x.s | ||
| asinh.go | ||
| asinh_s390x.s | ||
| atan.go | ||
| atan2.go | ||
| atan2_386.s | ||
| atan2_s390x.s | ||
| atan_386.s | ||
| atan_s390x.s | ||
| atanh.go | ||
| atanh_s390x.s | ||
| bits.go | ||
| cbrt.go | ||
| cbrt_s390x.s | ||
| const.go | ||
| copysign.go | ||
| cosh_s390x.s | ||
| dim.go | ||
| dim_amd64.s | ||
| dim_arm64.s | ||
| dim_riscv64.s | ||
| dim_s390x.s | ||
| erf.go | ||
| erf_s390x.s | ||
| erfc_s390x.s | ||
| erfinv.go | ||
| example_test.go | ||
| exp.go | ||
| exp2_386.s | ||
| exp_amd64.s | ||
| exp_arm64.s | ||
| exp_asm.go | ||
| exp_s390x.s | ||
| expm1.go | ||
| expm1_386.s | ||
| expm1_s390x.s | ||
| export_s390x_test.go | ||
| export_test.go | ||
| floor.go | ||
| floor_386.s | ||
| floor_amd64.s | ||
| floor_arm64.s | ||
| floor_ppc64x.s | ||
| floor_s390x.s | ||
| floor_wasm.s | ||
| fma.go | ||
| frexp.go | ||
| frexp_386.s | ||
| gamma.go | ||
| huge_test.go | ||
| hypot.go | ||
| hypot_386.s | ||
| hypot_amd64.s | ||
| j0.go | ||
| j1.go | ||
| jn.go | ||
| ldexp.go | ||
| ldexp_386.s | ||
| lgamma.go | ||
| log.go | ||
| log1p.go | ||
| log1p_386.s | ||
| log1p_s390x.s | ||
| log10.go | ||
| log10_386.s | ||
| log10_s390x.s | ||
| log_386.s | ||
| log_amd64.s | ||
| log_s390x.s | ||
| logb.go | ||
| mod.go | ||
| mod_386.s | ||
| modf.go | ||
| modf_386.s | ||
| modf_arm64.s | ||
| modf_ppc64x.s | ||
| nextafter.go | ||
| pow.go | ||
| pow10.go | ||
| pow_s390x.s | ||
| remainder.go | ||
| remainder_386.s | ||
| signbit.go | ||
| sin.go | ||
| sin_s390x.s | ||
| sincos.go | ||
| sinh.go | ||
| sinh_s390x.s | ||
| sqrt.go | ||
| sqrt_386.s | ||
| sqrt_amd64.s | ||
| sqrt_arm.s | ||
| sqrt_arm64.s | ||
| sqrt_mipsx.s | ||
| sqrt_ppc64x.s | ||
| sqrt_riscv64.s | ||
| sqrt_s390x.s | ||
| sqrt_wasm.s | ||
| stubs_386.s | ||
| stubs_amd64.s | ||
| stubs_arm.s | ||
| stubs_arm64.s | ||
| stubs_mips64x.s | ||
| stubs_mipsx.s | ||
| stubs_ppc64x.s | ||
| stubs_riscv64.s | ||
| stubs_s390x.s | ||
| stubs_wasm.s | ||
| tan.go | ||
| tan_s390x.s | ||
| tanh.go | ||
| tanh_s390x.s | ||
| trig_reduce.go | ||
| unsafe.go | ||