mirror of https://github.com/golang/go.git
cmd/compile,internal/runtime/atomic: optimize Load{64,32,8} on loong64
The LoadAcquire barrier on Loong64 is "dbar 0x14". Using this barrier after the
load in the Load{8,32,64} implementations, instead of a plain DBAR before and after it, improves performance.
goos: linux
goarch: loong64
pkg: internal/runtime/atomic
cpu: Loongson-3A6000-HV @ 2500.00MHz
| bench.old | bench.new |
| sec/op | sec/op vs base |
AtomicLoad64 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20)
AtomicLoad64-2 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20)
AtomicLoad64-4 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20)
AtomicLoad 17.220n ± 0% 4.402n ± 0% -74.44% (p=0.000 n=20)
AtomicLoad-2 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20)
AtomicLoad-4 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20)
AtomicLoad8 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20)
AtomicLoad8-2 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20)
AtomicLoad8-4 17.210n ± 0% 4.402n ± 0% -74.42% (p=0.000 n=20)
geomean 17.21n 4.402n -74.42%
goos: linux
goarch: loong64
pkg: internal/runtime/atomic
cpu: Loongson-3A5000 @ 2500.00MHz
| bench.old | bench.new |
| sec/op | sec/op vs base |
AtomicLoad64 18.82n ± 0% 10.41n ± 0% -44.69% (p=0.000 n=20)
AtomicLoad64-2 18.81n ± 0% 10.41n ± 0% -44.66% (p=0.000 n=20)
AtomicLoad64-4 18.82n ± 0% 10.41n ± 0% -44.69% (p=0.000 n=20)
AtomicLoad 18.81n ± 0% 10.41n ± 0% -44.66% (p=0.000 n=20)
AtomicLoad-2 18.82n ± 0% 10.41n ± 0% -44.69% (p=0.000 n=20)
AtomicLoad-4 18.81n ± 0% 10.42n ± 0% -44.63% (p=0.000 n=20)
AtomicLoad8 18.82n ± 0% 10.41n ± 0% -44.69% (p=0.000 n=20)
AtomicLoad8-2 18.82n ± 0% 10.41n ± 0% -44.70% (p=0.000 n=20)
AtomicLoad8-4 18.82n ± 0% 10.41n ± 0% -44.69% (p=0.000 n=20)
geomean 18.82n 10.41n -44.68%
Change-Id: I9d47c9d6f359c4f2e41035ca656429aade2e7847
Reviewed-on: https://go-review.googlesource.com/c/go/+/581357
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
parent
01ab9a016a
commit
3214129a83
|
|
@ -468,6 +468,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
||||||
p.To.Sym = ssagen.BoundsCheckFunc[v.AuxInt]
|
p.To.Sym = ssagen.BoundsCheckFunc[v.AuxInt]
|
||||||
s.UseArgs(16) // space used in callee args area by assembly stubs
|
s.UseArgs(16) // space used in callee args area by assembly stubs
|
||||||
case ssa.OpLOONG64LoweredAtomicLoad8, ssa.OpLOONG64LoweredAtomicLoad32, ssa.OpLOONG64LoweredAtomicLoad64:
|
case ssa.OpLOONG64LoweredAtomicLoad8, ssa.OpLOONG64LoweredAtomicLoad32, ssa.OpLOONG64LoweredAtomicLoad64:
|
||||||
|
// MOVB (Rarg0), Rout
|
||||||
|
// DBAR 0x14
|
||||||
as := loong64.AMOVV
|
as := loong64.AMOVV
|
||||||
switch v.Op {
|
switch v.Op {
|
||||||
case ssa.OpLOONG64LoweredAtomicLoad8:
|
case ssa.OpLOONG64LoweredAtomicLoad8:
|
||||||
|
|
@ -475,13 +477,15 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
||||||
case ssa.OpLOONG64LoweredAtomicLoad32:
|
case ssa.OpLOONG64LoweredAtomicLoad32:
|
||||||
as = loong64.AMOVW
|
as = loong64.AMOVW
|
||||||
}
|
}
|
||||||
s.Prog(loong64.ADBAR)
|
|
||||||
p := s.Prog(as)
|
p := s.Prog(as)
|
||||||
p.From.Type = obj.TYPE_MEM
|
p.From.Type = obj.TYPE_MEM
|
||||||
p.From.Reg = v.Args[0].Reg()
|
p.From.Reg = v.Args[0].Reg()
|
||||||
p.To.Type = obj.TYPE_REG
|
p.To.Type = obj.TYPE_REG
|
||||||
p.To.Reg = v.Reg0()
|
p.To.Reg = v.Reg0()
|
||||||
s.Prog(loong64.ADBAR)
|
p1 := s.Prog(loong64.ADBAR)
|
||||||
|
p1.From.Type = obj.TYPE_CONST
|
||||||
|
p1.From.Offset = 0x14
|
||||||
|
|
||||||
case ssa.OpLOONG64LoweredAtomicStore8, ssa.OpLOONG64LoweredAtomicStore32, ssa.OpLOONG64LoweredAtomicStore64:
|
case ssa.OpLOONG64LoweredAtomicStore8, ssa.OpLOONG64LoweredAtomicStore32, ssa.OpLOONG64LoweredAtomicStore64:
|
||||||
as := loong64.AMOVV
|
as := loong64.AMOVV
|
||||||
switch v.Op {
|
switch v.Op {
|
||||||
|
|
|
||||||
|
|
@ -319,38 +319,30 @@ TEXT ·Oruintptr(SB), NOSPLIT, $0-24
|
||||||
// uint32 internal∕runtime∕atomic·Load(uint32 volatile* ptr)
|
// uint32 internal∕runtime∕atomic·Load(uint32 volatile* ptr)
|
||||||
TEXT ·Load(SB),NOSPLIT|NOFRAME,$0-12
|
TEXT ·Load(SB),NOSPLIT|NOFRAME,$0-12
|
||||||
MOVV ptr+0(FP), R19
|
MOVV ptr+0(FP), R19
|
||||||
DBAR
|
|
||||||
MOVWU 0(R19), R19
|
MOVWU 0(R19), R19
|
||||||
DBAR
|
DBAR $0x14 // LoadAcquire barrier
|
||||||
MOVW R19, ret+8(FP)
|
MOVW R19, ret+8(FP)
|
||||||
RET
|
RET
|
||||||
|
|
||||||
// uint8 internal∕runtime∕atomic·Load8(uint8 volatile* ptr)
|
// uint8 internal∕runtime∕atomic·Load8(uint8 volatile* ptr)
|
||||||
TEXT ·Load8(SB),NOSPLIT|NOFRAME,$0-9
|
TEXT ·Load8(SB),NOSPLIT|NOFRAME,$0-9
|
||||||
MOVV ptr+0(FP), R19
|
MOVV ptr+0(FP), R19
|
||||||
DBAR
|
|
||||||
MOVBU 0(R19), R19
|
MOVBU 0(R19), R19
|
||||||
DBAR
|
DBAR $0x14
|
||||||
MOVB R19, ret+8(FP)
|
MOVB R19, ret+8(FP)
|
||||||
RET
|
RET
|
||||||
|
|
||||||
// uint64 internal∕runtime∕atomic·Load64(uint64 volatile* ptr)
|
// uint64 internal∕runtime∕atomic·Load64(uint64 volatile* ptr)
|
||||||
TEXT ·Load64(SB),NOSPLIT|NOFRAME,$0-16
|
TEXT ·Load64(SB),NOSPLIT|NOFRAME,$0-16
|
||||||
MOVV ptr+0(FP), R19
|
MOVV ptr+0(FP), R19
|
||||||
DBAR
|
|
||||||
MOVV 0(R19), R19
|
MOVV 0(R19), R19
|
||||||
DBAR
|
DBAR $0x14
|
||||||
MOVV R19, ret+8(FP)
|
MOVV R19, ret+8(FP)
|
||||||
RET
|
RET
|
||||||
|
|
||||||
// void *internal∕runtime∕atomic·Loadp(void *volatile *ptr)
|
// void *internal∕runtime∕atomic·Loadp(void *volatile *ptr)
|
||||||
TEXT ·Loadp(SB),NOSPLIT|NOFRAME,$0-16
|
TEXT ·Loadp(SB),NOSPLIT|NOFRAME,$0-16
|
||||||
MOVV ptr+0(FP), R19
|
JMP ·Load64(SB)
|
||||||
DBAR
|
|
||||||
MOVV 0(R19), R19
|
|
||||||
DBAR
|
|
||||||
MOVV R19, ret+8(FP)
|
|
||||||
RET
|
|
||||||
|
|
||||||
// uint32 internal∕runtime∕atomic·LoadAcq(uint32 volatile* ptr)
|
// uint32 internal∕runtime∕atomic·LoadAcq(uint32 volatile* ptr)
|
||||||
TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-12
|
TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-12
|
||||||
|
|
|
||||||
|
|
@ -43,6 +43,14 @@ func BenchmarkAtomicStore(b *testing.B) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func BenchmarkAtomicLoad8(b *testing.B) {
|
||||||
|
var x uint8
|
||||||
|
sink = &x
|
||||||
|
for i := 0; i < b.N; i++ {
|
||||||
|
atomic.Load8(&x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func BenchmarkAnd8(b *testing.B) {
|
func BenchmarkAnd8(b *testing.B) {
|
||||||
var x [512]uint8 // give byte its own cache line
|
var x [512]uint8 // give byte its own cache line
|
||||||
sink = &x
|
sink = &x
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue