math: implement func archExp and archExp2 in assembly on loong64

goos: linux goarch: loong64 pkg: math cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Exp 26.30n ± 0% 12.93n ± 0% -50.85% (p=0.000 n=10) ExpGo 26.86n ± 0% 26.92n ± 0% +0.22% (p=0.000 n=10) Expm1 16.76n ± 0% 16.75n ± 0% ~ (p=0.060 n=10) Exp2 23.05n ± 0% 12.12n ± 0% -47.42% (p=0.000 n=10) Exp2Go 23.41n ± 0% 23.47n ± 0% +0.28% (p=0.000 n=10) geomean 22.97n 17.54n -23.64% goos: linux goarch: loong64 pkg: math/cmplx cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Exp 51.32n ± 0% 35.41n ± 0% -30.99% (p=0.000 n=10) goos: linux goarch: loong64 pkg: math cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Exp 50.27n ± 0% 48.75n ± 1% -3.01% (p=0.000 n=10) ExpGo 50.72n ± 0% 50.44n ± 0% -0.55% (p=0.000 n=10) Expm1 28.40n ± 0% 28.32n ± 0% ~ (p=0.360 n=10) Exp2 50.09n ± 0% 21.49n ± 1% -57.10% (p=0.000 n=10) Exp2Go 50.05n ± 0% 49.69n ± 0% -0.72% (p=0.000 n=10) geomean 44.85n 37.52n -16.35% goos: linux goarch: loong64 pkg: math/cmplx cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Exp 88.56n ± 0% 67.29n ± 0% -24.03% (p=0.000 n=10) Change-Id: I89e456d26fc075d83335ee4a31227d2aface5714 Reviewed-on: https://go-review.googlesource.com/c/go/+/653935 Reviewed-by: Junyang Shao <shaojunyang@google.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Michael Pratt <mpratt@google.com>
2024-12-31 21:02:47 +08:00 · 2024-12-31 21:02:47 +08:00 · 6ba91df153
parent 39d7a49736
commit 6ba91df153
5 changed files with 240 additions and 4 deletions
--- a/src/math/exp2_asm.go
+++ b/src/math/exp2_asm.go
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-//go:build arm64
+//go:build arm64 || loong64

 package math

--- a/src/math/exp2_noasm.go
+++ b/src/math/exp2_noasm.go
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-//go:build !arm64
+//go:build !arm64 && !loong64

 package math

--- a/src/math/exp_asm.go
+++ b/src/math/exp_asm.go
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-//go:build amd64 || arm64 || s390x
+//go:build amd64 || arm64 || loong64 || s390x

 package math

--- a/src/math/exp_loong64.s
+++ b/src/math/exp_loong64.s
@ -0,0 +1,236 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+#define NearZero	0x3e30000000000000	// 2**-28
+#define PosInf		0x7ff0000000000000
+#define FracMask	0x000fffffffffffff
+#define C1		0x3cb0000000000000	// 2**-52
+
+DATA exprodata<>+0(SB)/8, $0.0
+DATA exprodata<>+8(SB)/8, $0.5
+DATA exprodata<>+16(SB)/8, $1.0
+DATA exprodata<>+24(SB)/8, $2.0
+DATA exprodata<>+32(SB)/8, $6.93147180369123816490e-01	// Ln2Hi
+DATA exprodata<>+40(SB)/8, $1.90821492927058770002e-10	// Ln2Lo
+DATA exprodata<>+48(SB)/8, $1.44269504088896338700e+00	// Log2e
+DATA exprodata<>+56(SB)/8, $7.09782712893383973096e+02	// Overflow
+DATA exprodata<>+64(SB)/8, $-7.45133219101941108420e+02	// Underflow
+DATA exprodata<>+72(SB)/8, $1.0239999999999999e+03	// Overflow2
+DATA exprodata<>+80(SB)/8, $-1.0740e+03			// Underflow2
+DATA exprodata<>+88(SB)/8, $3.7252902984619141e-09	// NearZero
+GLOBL exprodata<>+0(SB), NOPTR|RODATA, $96
+
+DATA expmultirodata<>+0(SB)/8, $1.66666666666666657415e-01	// P1
+DATA expmultirodata<>+8(SB)/8, $-2.77777777770155933842e-03	// P2
+DATA expmultirodata<>+16(SB)/8, $6.61375632143793436117e-05	// P3
+DATA expmultirodata<>+24(SB)/8, $-1.65339022054652515390e-06	// P4
+DATA expmultirodata<>+32(SB)/8, $4.13813679705723846039e-08	// P5
+GLOBL expmultirodata<>+0(SB), NOPTR|RODATA, $40
+
+// Exp returns e**x, the base-e exponential of x.
+// This is an assembly implementation of the method used for function Exp in file exp.go.
+//
+// func Exp(x float64) float64
+TEXT ·archExp(SB),$0-16
+	MOVD	x+0(FP), F0	// F0 = x
+
+	MOVV	$exprodata<>+0(SB), R10
+	MOVD	56(R10), F1	// Overflow
+	MOVD	64(R10), F2	// Underflow
+	MOVD	88(R10), F3	// NearZero
+	MOVD	16(R10), F17	// 1.0
+
+	CMPEQD	F0, F0, FCC0
+	BFPF	isNaN		// x = NaN, return NaN
+
+	CMPGTD	F0, F1, FCC0
+	BFPT	overflow	// x > Overflow, return PosInf
+
+	CMPGTD	F2, F0, FCC0
+	BFPT	underflow	// x < Underflow, return 0
+
+	ABSD	F0, F5
+	CMPGTD	F3, F5, FCC0
+	BFPT	nearzero	// fabs(x) < NearZero, return 1 + x
+
+	// argument reduction, x = k*ln2 + r,  |r| <= 0.5*ln2
+	// computed as r = hi - lo for extra precision.
+	MOVD	0(R10), F5
+	MOVD	8(R10), F3
+	MOVD	48(R10), F2
+	CMPGTD	F0, F5, FCC0
+	BFPT	add		// x > 0
+sub:
+	FMSUBD	F3, F2, F0, F3	// Log2e*x - 0.5
+	JMP	2(PC)
+add:
+	FMADDD	F3, F2, F0, F3	// Log2e*x + 0.5
+
+	FTINTRZVD F3, F4	// float64 -> int64
+	MOVV	F4, R5		// R5 = int(k)
+	FFINTDV	F4, F3		// int64 -> float64
+
+	MOVD	32(R10), F4
+	MOVD	40(R10), F5
+	FNMSUBD	F0, F3, F4, F4
+	MULD	F3, F5, F5
+	SUBD	F5, F4, F6
+	MULD	F6, F6, F7
+
+	// compute c
+	MOVV	$expmultirodata<>+0(SB), R11
+	MOVD	32(R11), F8
+	MOVD	24(R11), F9
+	FMADDD	F9, F8, F7, F13
+	MOVD	16(R11), F10
+	FMADDD	F10, F13, F7, F13
+	MOVD	8(R11), F11
+	FMADDD	F11, F13, F7, F13
+	MOVD	0(R11), F12
+	FMADDD	F12, F13, F7, F13
+	FNMSUBD	F6, F13, F7, F13
+
+	// compute y
+	MOVD	24(R10), F14
+	SUBD	F13, F14, F14
+	MULD	F6, F13, F15
+	DIVD	F14, F15, F15
+	SUBD	F15, F5, F15
+	SUBD	F4, F15, F15
+	SUBD	F15, F17, F16
+
+	// inline Ldexp(y, k), benefit:
+	// 1, no parameter pass overhead.
+	// 2, skip unnecessary checks for Inf/NaN/Zero
+	MOVV	F16, R4
+	MOVV	$FracMask, R9
+	AND	R9, R4, R6	// fraction
+	SRLV	$52, R4, R7	// exponent
+	ADDV	R5, R7
+	MOVV	$1, R12
+	BGE	R7, R12, normal
+	ADDV	$52, R7		// denormal
+	MOVV	$C1, R8
+	MOVV	R8, F17
+normal:
+	SLLV	$52, R7
+	OR	R7, R6, R4
+	MOVV	R4, F0
+	MULD	F17, F0		// return m * x
+	MOVD	F0, ret+8(FP)
+	RET
+nearzero:
+	ADDD	F17, F0, F0
+isNaN:
+	MOVD	F0, ret+8(FP)
+	RET
+underflow:
+	MOVV	R0, ret+8(FP)
+	RET
+overflow:
+	MOVV	$PosInf, R4
+	MOVV	R4, ret+8(FP)
+	RET
+
+
+// Exp2 returns 2**x, the base-2 exponential of x.
+// This is an assembly implementation of the method used for function Exp2 in file exp.go.
+//
+// func Exp2(x float64) float64
+TEXT ·archExp2(SB),$0-16
+	MOVD	x+0(FP), F0	// F0 = x
+
+	MOVV	$exprodata<>+0(SB), R10
+	MOVD	72(R10), F1	// Overflow2
+	MOVD	80(R10), F2	// Underflow2
+	MOVD	88(R10), F3	// NearZero
+
+	CMPEQD	F0, F0, FCC0
+	BFPF	isNaN		// x = NaN, return NaN
+
+	CMPGTD	F0, F1, FCC0
+	BFPT	overflow	// x > Overflow, return PosInf
+
+	CMPGTD	F2, F0, FCC0
+	BFPT	underflow	// x < Underflow, return 0
+
+	// argument reduction; x = r*lg(e) + k with |r| <= ln(2)/2
+	// computed as r = hi - lo for extra precision.
+	MOVD	0(R10), F10
+	MOVD	8(R10), F2
+	CMPGTD	F0, F10, FCC0
+	BFPT	add
+sub:
+	SUBD	F2, F0, F3	// x - 0.5
+	JMP	2(PC)
+add:
+	ADDD	F2, F0, F3	// x + 0.5
+
+	FTINTRZVD F3, F4
+	MOVV	F4, R5
+	FFINTDV	F4, F3
+
+	MOVD	32(R10), F4
+	MOVD	40(R10), F5
+	SUBD	F3, F0, F3
+	MULD	F3, F4
+	FNMSUBD	F10, F3, F5, F5
+	SUBD	F5, F4, F6
+	MULD	F6, F6, F7
+
+	// compute c
+	MOVV	$expmultirodata<>+0(SB), R11
+	MOVD	32(R11), F8
+	MOVD	24(R11), F9
+	FMADDD	F9, F8, F7, F13
+	MOVD	16(R11), F10
+	FMADDD	F10, F13, F7, F13
+	MOVD	8(R11), F11
+	FMADDD	F11, F13, F7, F13
+	MOVD	0(R11), F12
+	FMADDD	F12, F13, F7, F13
+	FNMSUBD	F6, F13, F7, F13
+
+	// compute y
+	MOVD	24(R10), F14
+	SUBD	F13, F14, F14
+	MULD	F6, F13, F15
+	DIVD	F14, F15
+
+	MOVD	16(R10), F17
+	SUBD	F15, F5, F15
+	SUBD	F4, F15, F15
+	SUBD	F15, F17, F16
+
+	// inline Ldexp(y, k), benefit:
+	// 1, no parameter pass overhead.
+	// 2, skip unnecessary checks for Inf/NaN/Zero
+	MOVV	F16, R4
+	MOVV	$FracMask, R9
+	SRLV	$52, R4, R7	// exponent
+	AND	R9, R4, R6	// fraction
+	ADDV	R5, R7
+	MOVV	$1, R12
+	BGE	R7, R12, normal
+
+	ADDV	$52, R7		// denormal
+	MOVV	$C1, R8
+	MOVV	R8, F17
+normal:
+	SLLV	$52, R7
+	OR	R7, R6, R4
+	MOVV	R4, F0
+	MULD	F17, F0
+isNaN:
+	MOVD	F0, ret+8(FP)
+	RET
+underflow:
+	MOVV	R0, ret+8(FP)
+	RET
+overflow:
+	MOVV	$PosInf, R4
+	MOVV	R4, ret+8(FP)
+	RET
--- a/src/math/exp_noasm.go
+++ b/src/math/exp_noasm.go
@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-//go:build !amd64 && !arm64 && !s390x
+//go:build !amd64 && !arm64 && !loong64 && !s390x

 package math