cmd/compiler,internal/runtime/atomic: optimize Load{64,32,8} on loong64

The LoadAcquire barrier on Loong64 is "dbar 0x14", using the correct barrier in Load{8,32,64} implementation can improve performance. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicLoad64 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20) AtomicLoad64-2 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20) AtomicLoad64-4 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20) AtomicLoad 17.220n ± 0% 4.402n ± 0% -74.44% (p=0.000 n=20) AtomicLoad-2 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20) AtomicLoad-4 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20) AtomicLoad8 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20) AtomicLoad8-2 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20) AtomicLoad8-4 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20) geomean 17.21n 4.402n -74.42% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicLoad64 18.82n ± 0% 10.41n ± 0% -44.69% (p=0.000 n=20) AtomicLoad64-2 18.81n ± 0% 10.41n ± 0% -44.66% (p=0.000 n=20) AtomicLoad64-4 18.82n ± 0% 10.41n ± 0% -44.69% (p=0.000 n=20) AtomicLoad 18.81n ± 0% 10.41n ± 0% -44.66% (p=0.000 n=20) AtomicLoad-2 18.82n ± 0% 10.41n ± 0% -44.69% (p=0.000 n=20) AtomicLoad-4 18.81n ± 0% 10.42n ± 0% -44.63% (p=0.000 n=20) AtomicLoad8 18.82n ± 0% 10.41n ± 0% -44.69% (p=0.000 n=20) AtomicLoad8-2 18.82n ± 0% 10.41n ± 0% -44.70% (p=0.000 n=20) AtomicLoad8-4 18.82n ± 0% 10.41n ± 0% -44.69% (p=0.000 n=20) geomean 18.82n 10.41n -44.68% Change-Id: I9d47c9d6f359c4f2e41035ca656429aade2e7847 Reviewed-on: https://go-review.googlesource.com/c/go/+/581357 Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Michael Knyszek <mknyszek@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2024-04-23 17:01:28 +08:00 · 2024-04-23 17:01:28 +08:00 · 3214129a83
parent 01ab9a016a
commit 3214129a83
3 changed files with 18 additions and 14 deletions
--- a/src/cmd/compile/internal/loong64/ssa.go
+++ b/src/cmd/compile/internal/loong64/ssa.go
@ -468,6 +468,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		p.To.Sym = ssagen.BoundsCheckFunc[v.AuxInt]
 		s.UseArgs(16) // space used in callee args area by assembly stubs
 	case ssa.OpLOONG64LoweredAtomicLoad8, ssa.OpLOONG64LoweredAtomicLoad32, ssa.OpLOONG64LoweredAtomicLoad64:
 		// MOVB	(Rarg0), Rout
 		// DBAR	0x14
 		as := loong64.AMOVV
 		switch v.Op {
 		case ssa.OpLOONG64LoweredAtomicLoad8:
@ -475,13 +477,15 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		case ssa.OpLOONG64LoweredAtomicLoad32:
 			as = loong64.AMOVW
 		}
 		s.Prog(loong64.ADBAR)
 		p := s.Prog(as)
 		p.From.Type = obj.TYPE_MEM
 		p.From.Reg = v.Args[0].Reg()
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = v.Reg0()
-		s.Prog(loong64.ADBAR)
+		p1 := s.Prog(loong64.ADBAR)
 		p1.From.Type = obj.TYPE_CONST
 		p1.From.Offset = 0x14
 	case ssa.OpLOONG64LoweredAtomicStore8, ssa.OpLOONG64LoweredAtomicStore32, ssa.OpLOONG64LoweredAtomicStore64:
 		as := loong64.AMOVV
 		switch v.Op {
--- a/src/internal/runtime/atomic/atomic_loong64.s
+++ b/src/internal/runtime/atomic/atomic_loong64.s
@ -319,38 +319,30 @@ TEXT ·Oruintptr(SB), NOSPLIT, $0-24
 // uint32 internal∕runtime∕atomic·Load(uint32 volatile* ptr)
 TEXT ·Load(SB),NOSPLIT|NOFRAME,$0-12
 	MOVV	ptr+0(FP), R19
 	DBAR
 	MOVWU	0(R19), R19
-	DBAR
+	DBAR	$0x14	// LoadAcquire barrier
 	MOVW	R19, ret+8(FP)
 	RET
 // uint8 internal∕runtime∕atomic·Load8(uint8 volatile* ptr)
 TEXT ·Load8(SB),NOSPLIT|NOFRAME,$0-9
 	MOVV	ptr+0(FP), R19
 	DBAR
 	MOVBU	0(R19), R19
-	DBAR
+	DBAR	$0x14
 	MOVB	R19, ret+8(FP)
 	RET
 // uint64 internal∕runtime∕atomic·Load64(uint64 volatile* ptr)
 TEXT ·Load64(SB),NOSPLIT|NOFRAME,$0-16
 	MOVV	ptr+0(FP), R19
 	DBAR
 	MOVV	0(R19), R19
-	DBAR
+	DBAR	$0x14
 	MOVV	R19, ret+8(FP)
 	RET
 // void *internal∕runtime∕atomic·Loadp(void *volatile *ptr)
 TEXT ·Loadp(SB),NOSPLIT|NOFRAME,$0-16
-	MOVV	ptr+0(FP), R19
+	JMP     ·Load64(SB)
 	DBAR
 	MOVV	0(R19), R19
 	DBAR
 	MOVV	R19, ret+8(FP)
 	RET
 // uint32 internal∕runtime∕atomic·LoadAcq(uint32 volatile* ptr)
 TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-12
--- a/src/internal/runtime/atomic/bench_test.go
+++ b/src/internal/runtime/atomic/bench_test.go
@ -43,6 +43,14 @@ func BenchmarkAtomicStore(b *testing.B) {
 	}
 }
 func BenchmarkAtomicLoad8(b *testing.B) {
 	var x uint8
 	sink = &x
 	for i := 0; i < b.N; i++ {
 		atomic.Load8(&x)
 	}
 }
 func BenchmarkAnd8(b *testing.B) {
 	var x [512]uint8 // give byte its own cache line
 	sink = &x