runtime: update and restore g0 stack bounds at cgocallback

Currently, at a cgo callback where there is already a Go frame on
the stack (i.e. C->Go->C->Go), we require that at the inner Go
callback the SP is within the g0's stack bounds set by a previous
callback. This is to prevent that the C code switches stack while
having a Go frame on the stack, which we don't really support. But
this could also happen when we cannot get accurate stack bounds,
e.g. when pthread_getattr_np is not available. Since the stack
bounds are just estimates based on the current SP, if there are
multiple C->Go callbacks with various stack depth, it is possible
that the SP of a later callback falls out of a previous call's
estimate. This leads to runtime throw in a seemingly reasonable
program.

This CL changes it to save the old g0 stack bounds at cgocallback,
update the bounds, and restore the old bounds at return. So each
callback will get its own stack bounds based on the current SP,
and when it returns, the outer callback has the its old stack
bounds restored.

Also, at a cgo callback when there is no Go frame on the stack,
we currently always get new stack bounds. We do this because if
we can only get estimated bounds based on the SP, and the stack
depth varies a lot between two C->Go calls, the previous
estimates may be off and we fall out or nearly fall out of the
previous bounds. But this causes a performance problem: the
pthread API to get accurate stack bounds (pthread_getattr_np) is
very slow when called on the main thread. Getting the stack bounds
every time significantly slows down repeated C->Go calls on the
main thread.

This CL fixes it by "caching" the stack bounds if they are
accurate. I.e. at the second time Go calls into C, if the previous
stack bounds are accurate, and the current SP is in bounds, we can
be sure it is the same stack and we don't need to update the bounds.
This avoids the repeated calls to pthread_getattr_np. If we cannot
get the accurate bounds, we continue to update the stack bounds
based on the SP, and that operation is very cheap.

On a Linux/AMD64 machine with glibc:

name                     old time/op  new time/op  delta
CgoCallbackMainThread-8  96.4µs ± 3%   0.1µs ± 2%  -99.92%  (p=0.000 n=10+9)

Fixes #68285.
Fixes #68587.

Change-Id: I3422badd5ad8ff63e1a733152d05fb7a44d5d435
Reviewed-on: https://go-review.googlesource.com/c/go/+/600296
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
This commit is contained in:
Cherry Mui 2024-07-22 16:23:43 -04:00
parent 555ef55460
commit 76a8409eb8
9 changed files with 202 additions and 95 deletions

View File

@ -33,7 +33,7 @@ import (
"unicode" "unicode"
) )
var globalSkip = func(t *testing.T) {} var globalSkip = func(t testing.TB) {}
// Program to run. // Program to run.
var bin []string var bin []string
@ -59,12 +59,12 @@ func TestMain(m *testing.M) {
func testMain(m *testing.M) int { func testMain(m *testing.M) int {
if testing.Short() && os.Getenv("GO_BUILDER_NAME") == "" { if testing.Short() && os.Getenv("GO_BUILDER_NAME") == "" {
globalSkip = func(t *testing.T) { t.Skip("short mode and $GO_BUILDER_NAME not set") } globalSkip = func(t testing.TB) { t.Skip("short mode and $GO_BUILDER_NAME not set") }
return m.Run() return m.Run()
} }
if runtime.GOOS == "linux" { if runtime.GOOS == "linux" {
if _, err := os.Stat("/etc/alpine-release"); err == nil { if _, err := os.Stat("/etc/alpine-release"); err == nil {
globalSkip = func(t *testing.T) { t.Skip("skipping failing test on alpine - go.dev/issue/19938") } globalSkip = func(t testing.TB) { t.Skip("skipping failing test on alpine - go.dev/issue/19938") }
return m.Run() return m.Run()
} }
} }
@ -1291,8 +1291,8 @@ func TestPreemption(t *testing.T) {
} }
} }
// Issue 59294. Test calling Go function from C after using some // Issue 59294 and 68285. Test calling Go function from C after with
// stack space. // various stack space.
func TestDeepStack(t *testing.T) { func TestDeepStack(t *testing.T) {
globalSkip(t) globalSkip(t)
testenv.MustHaveGoBuild(t) testenv.MustHaveGoBuild(t)
@ -1350,6 +1350,53 @@ func TestDeepStack(t *testing.T) {
} }
} }
func BenchmarkCgoCallbackMainThread(b *testing.B) {
// Benchmark for calling into Go fron C main thread.
// See issue #68587.
//
// It uses a subprocess, which is a C binary that calls
// Go on the main thread b.N times. There is some overhead
// for launching the subprocess. It is probably fine when
// b.N is large.
globalSkip(b)
testenv.MustHaveGoBuild(b)
testenv.MustHaveCGO(b)
testenv.MustHaveBuildMode(b, "c-archive")
if !testWork {
defer func() {
os.Remove("testp10" + exeSuffix)
os.Remove("libgo10.a")
os.Remove("libgo10.h")
}()
}
cmd := exec.Command("go", "build", "-buildmode=c-archive", "-o", "libgo10.a", "./libgo10")
out, err := cmd.CombinedOutput()
b.Logf("%v\n%s", cmd.Args, out)
if err != nil {
b.Fatal(err)
}
ccArgs := append(cc, "-o", "testp10"+exeSuffix, "main10.c", "libgo10.a")
out, err = exec.Command(ccArgs[0], ccArgs[1:]...).CombinedOutput()
b.Logf("%v\n%s", ccArgs, out)
if err != nil {
b.Fatal(err)
}
argv := cmdToRun("./testp10")
argv = append(argv, fmt.Sprint(b.N))
cmd = exec.Command(argv[0], argv[1:]...)
b.ResetTimer()
err = cmd.Run()
if err != nil {
b.Fatal(err)
}
}
func TestSharedObject(t *testing.T) { func TestSharedObject(t *testing.T) {
// Test that we can put a Go c-archive into a C shared object. // Test that we can put a Go c-archive into a C shared object.
globalSkip(t) globalSkip(t)

View File

@ -0,0 +1,12 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package main
import "C"
//export GoF
func GoF() {}
func main() {}

View File

@ -6,9 +6,29 @@ package main
import "runtime" import "runtime"
// extern void callGoWithVariousStack(int);
import "C" import "C"
func main() {} func main() {}
//export GoF //export GoF
func GoF() { runtime.GC() } func GoF(p int32) {
runtime.GC()
if p != 0 {
panic("panic")
}
}
//export callGoWithVariousStackAndGoFrame
func callGoWithVariousStackAndGoFrame(p int32) {
if p != 0 {
defer func() {
e := recover()
if e == nil {
panic("did not panic")
}
runtime.GC()
}()
}
C.callGoWithVariousStack(C.int(p));
}

View File

@ -0,0 +1,22 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <stdio.h>
#include <stdlib.h>
#include "libgo10.h"
int main(int argc, char **argv) {
int n, i;
if (argc != 2) {
perror("wrong arg");
return 2;
}
n = atoi(argv[1]);
for (i = 0; i < n; i++)
GoF();
return 0;
}

View File

@ -6,19 +6,27 @@
void use(int *x) { (*x)++; } void use(int *x) { (*x)++; }
void callGoFWithDeepStack() { void callGoFWithDeepStack(int p) {
int x[10000]; int x[10000];
use(&x[0]); use(&x[0]);
use(&x[9999]); use(&x[9999]);
GoF(); GoF(p);
use(&x[0]); use(&x[0]);
use(&x[9999]); use(&x[9999]);
} }
int main() { void callGoWithVariousStack(int p) {
GoF(); // call GoF without using much stack GoF(0); // call GoF without using much stack
callGoFWithDeepStack(); // call GoF with a deep stack callGoFWithDeepStack(p); // call GoF with a deep stack
GoF(0); // again on a shallow stack
}
int main() {
callGoWithVariousStack(0);
callGoWithVariousStackAndGoFrame(0); // normal execution
callGoWithVariousStackAndGoFrame(1); // panic and recover
} }

View File

@ -31,10 +31,11 @@ x_cgo_getstackbound(uintptr bounds[2])
pthread_attr_get_np(pthread_self(), &attr); pthread_attr_get_np(pthread_self(), &attr);
pthread_attr_getstack(&attr, &addr, &size); // low address pthread_attr_getstack(&attr, &addr, &size); // low address
#else #else
// We don't know how to get the current stacks, so assume they are the // We don't know how to get the current stacks, leave it as
// same as the default stack bounds. // 0 and the caller will use an estimate based on the current
pthread_attr_getstacksize(&attr, &size); // SP.
addr = __builtin_frame_address(0) + 4096 - size; addr = 0;
size = 0;
#endif #endif
pthread_attr_destroy(&attr); pthread_attr_destroy(&attr);

View File

@ -231,34 +231,6 @@ func cgocall(fn, arg unsafe.Pointer) int32 {
func callbackUpdateSystemStack(mp *m, sp uintptr, signal bool) { func callbackUpdateSystemStack(mp *m, sp uintptr, signal bool) {
g0 := mp.g0 g0 := mp.g0
inBound := sp > g0.stack.lo && sp <= g0.stack.hi
if mp.ncgo > 0 && !inBound {
// ncgo > 0 indicates that this M was in Go further up the stack
// (it called C and is now receiving a callback).
//
// !inBound indicates that we were called with SP outside the
// expected system stack bounds (C changed the stack out from
// under us between the cgocall and cgocallback?).
//
// It is not safe for the C call to change the stack out from
// under us, so throw.
// Note that this case isn't possible for signal == true, as
// that is always passing a new M from needm.
// Stack is bogus, but reset the bounds anyway so we can print.
hi := g0.stack.hi
lo := g0.stack.lo
g0.stack.hi = sp + 1024
g0.stack.lo = sp - 32*1024
g0.stackguard0 = g0.stack.lo + stackGuard
g0.stackguard1 = g0.stackguard0
print("M ", mp.id, " procid ", mp.procid, " runtime: cgocallback with sp=", hex(sp), " out of bounds [", hex(lo), ", ", hex(hi), "]")
print("\n")
exit(2)
}
if !mp.isextra { if !mp.isextra {
// We allocated the stack for standard Ms. Don't replace the // We allocated the stack for standard Ms. Don't replace the
// stack bounds with estimated ones when we already initialized // stack bounds with estimated ones when we already initialized
@ -266,26 +238,37 @@ func callbackUpdateSystemStack(mp *m, sp uintptr, signal bool) {
return return
} }
// This M does not have Go further up the stack. However, it may have inBound := sp > g0.stack.lo && sp <= g0.stack.hi
// previously called into Go, initializing the stack bounds. Between if inBound && mp.g0StackAccurate {
// that call returning and now the stack may have changed (perhaps the // This M has called into Go before and has the stack bounds
// C thread is running a coroutine library). We need to update the // initialized. We have the accurate stack bounds, and the SP
// stack bounds for this case. // is in bounds. We expect it continues to run within the same
// bounds.
return
}
// We don't have an accurate stack bounds (either it never calls
// into Go before, or we couldn't get the accurate bounds), or the
// current SP is not within the previous bounds (the stack may have
// changed between calls). We need to update the stack bounds.
// //
// N.B. we need to update the stack bounds even if SP appears to // N.B. we need to update the stack bounds even if SP appears to
// already be in bounds. Our "bounds" may actually be estimated dummy // already be in bounds, if our bounds are estimated dummy bounds
// bounds (below). The actual stack bounds could have shifted but still // (below). We may be in a different region within the same actual
// have partial overlap with our dummy bounds. If we failed to update // stack bounds, but our estimates were not accurate. Or the actual
// in that case, we could find ourselves seemingly called near the // stack bounds could have shifted but still have partial overlap with
// bottom of the stack bounds, where we quickly run out of space. // our dummy bounds. If we failed to update in that case, we could find
// ourselves seemingly called near the bottom of the stack bounds, where
// we quickly run out of space.
// Set the stack bounds to match the current stack. If we don't // Set the stack bounds to match the current stack. If we don't
// actually know how big the stack is, like we don't know how big any // actually know how big the stack is, like we don't know how big any
// scheduling stack is, but we assume there's at least 32 kB. If we // scheduling stack is, but we assume there's at least 32 kB. If we
// can get a more accurate stack bound from pthread, use that, provided // can get a more accurate stack bound from pthread, use that, provided
// it actually contains SP.. // it actually contains SP.
g0.stack.hi = sp + 1024 g0.stack.hi = sp + 1024
g0.stack.lo = sp - 32*1024 g0.stack.lo = sp - 32*1024
mp.g0StackAccurate = false
if !signal && _cgo_getstackbound != nil { if !signal && _cgo_getstackbound != nil {
// Don't adjust if called from the signal handler. // Don't adjust if called from the signal handler.
// We are on the signal stack, not the pthread stack. // We are on the signal stack, not the pthread stack.
@ -296,12 +279,16 @@ func callbackUpdateSystemStack(mp *m, sp uintptr, signal bool) {
asmcgocall(_cgo_getstackbound, unsafe.Pointer(&bounds)) asmcgocall(_cgo_getstackbound, unsafe.Pointer(&bounds))
// getstackbound is an unsupported no-op on Windows. // getstackbound is an unsupported no-op on Windows.
// //
// On Unix systems, if the API to get accurate stack bounds is
// not available, it returns zeros.
//
// Don't use these bounds if they don't contain SP. Perhaps we // Don't use these bounds if they don't contain SP. Perhaps we
// were called by something not using the standard thread // were called by something not using the standard thread
// stack. // stack.
if bounds[0] != 0 && sp > bounds[0] && sp <= bounds[1] { if bounds[0] != 0 && sp > bounds[0] && sp <= bounds[1] {
g0.stack.lo = bounds[0] g0.stack.lo = bounds[0]
g0.stack.hi = bounds[1] g0.stack.hi = bounds[1]
mp.g0StackAccurate = true
} }
} }
g0.stackguard0 = g0.stack.lo + stackGuard g0.stackguard0 = g0.stack.lo + stackGuard
@ -319,6 +306,8 @@ func cgocallbackg(fn, frame unsafe.Pointer, ctxt uintptr) {
} }
sp := gp.m.g0.sched.sp // system sp saved by cgocallback. sp := gp.m.g0.sched.sp // system sp saved by cgocallback.
oldStack := gp.m.g0.stack
oldAccurate := gp.m.g0StackAccurate
callbackUpdateSystemStack(gp.m, sp, false) callbackUpdateSystemStack(gp.m, sp, false)
// The call from C is on gp.m's g0 stack, so we must ensure // The call from C is on gp.m's g0 stack, so we must ensure
@ -380,6 +369,12 @@ func cgocallbackg(fn, frame unsafe.Pointer, ctxt uintptr) {
reentersyscall(savedpc, uintptr(savedsp), uintptr(savedbp)) reentersyscall(savedpc, uintptr(savedsp), uintptr(savedbp))
gp.m.winsyscall = winsyscall gp.m.winsyscall = winsyscall
// Restore the old g0 stack bounds
gp.m.g0.stack = oldStack
gp.m.g0.stackguard0 = oldStack.lo + stackGuard
gp.m.g0.stackguard1 = gp.m.g0.stackguard0
gp.m.g0StackAccurate = oldAccurate
} }
func cgocallbackg1(fn, frame unsafe.Pointer, ctxt uintptr) { func cgocallbackg1(fn, frame unsafe.Pointer, ctxt uintptr) {

View File

@ -2550,6 +2550,7 @@ func dropm() {
g0.stack.lo = 0 g0.stack.lo = 0
g0.stackguard0 = 0 g0.stackguard0 = 0
g0.stackguard1 = 0 g0.stackguard1 = 0
mp.g0StackAccurate = false
putExtraM(mp) putExtraM(mp)

View File

@ -530,47 +530,48 @@ type m struct {
_ uint32 // align next field to 8 bytes _ uint32 // align next field to 8 bytes
// Fields not known to debuggers. // Fields not known to debuggers.
procid uint64 // for debuggers, but offset not hard-coded procid uint64 // for debuggers, but offset not hard-coded
gsignal *g // signal-handling g gsignal *g // signal-handling g
goSigStack gsignalStack // Go-allocated signal handling stack goSigStack gsignalStack // Go-allocated signal handling stack
sigmask sigset // storage for saved signal mask sigmask sigset // storage for saved signal mask
tls [tlsSlots]uintptr // thread-local storage (for x86 extern register) tls [tlsSlots]uintptr // thread-local storage (for x86 extern register)
mstartfn func() mstartfn func()
curg *g // current running goroutine curg *g // current running goroutine
caughtsig guintptr // goroutine running during fatal signal caughtsig guintptr // goroutine running during fatal signal
p puintptr // attached p for executing go code (nil if not executing go code) p puintptr // attached p for executing go code (nil if not executing go code)
nextp puintptr nextp puintptr
oldp puintptr // the p that was attached before executing a syscall oldp puintptr // the p that was attached before executing a syscall
id int64 id int64
mallocing int32 mallocing int32
throwing throwType throwing throwType
preemptoff string // if != "", keep curg running on this m preemptoff string // if != "", keep curg running on this m
locks int32 locks int32
dying int32 dying int32
profilehz int32 profilehz int32
spinning bool // m is out of work and is actively looking for work spinning bool // m is out of work and is actively looking for work
blocked bool // m is blocked on a note blocked bool // m is blocked on a note
newSigstack bool // minit on C thread called sigaltstack newSigstack bool // minit on C thread called sigaltstack
printlock int8 printlock int8
incgo bool // m is executing a cgo call incgo bool // m is executing a cgo call
isextra bool // m is an extra m isextra bool // m is an extra m
isExtraInC bool // m is an extra m that is not executing Go code isExtraInC bool // m is an extra m that is not executing Go code
isExtraInSig bool // m is an extra m in a signal handler isExtraInSig bool // m is an extra m in a signal handler
freeWait atomic.Uint32 // Whether it is safe to free g0 and delete m (one of freeMRef, freeMStack, freeMWait) freeWait atomic.Uint32 // Whether it is safe to free g0 and delete m (one of freeMRef, freeMStack, freeMWait)
needextram bool needextram bool
traceback uint8 g0StackAccurate bool // whether the g0 stack has accurate bounds
ncgocall uint64 // number of cgo calls in total traceback uint8
ncgo int32 // number of cgo calls currently in progress ncgocall uint64 // number of cgo calls in total
cgoCallersUse atomic.Uint32 // if non-zero, cgoCallers in use temporarily ncgo int32 // number of cgo calls currently in progress
cgoCallers *cgoCallers // cgo traceback if crashing in cgo call cgoCallersUse atomic.Uint32 // if non-zero, cgoCallers in use temporarily
park note cgoCallers *cgoCallers // cgo traceback if crashing in cgo call
alllink *m // on allm park note
schedlink muintptr alllink *m // on allm
lockedg guintptr schedlink muintptr
createstack [32]uintptr // stack that created this thread, it's used for StackRecord.Stack0, so it must align with it. lockedg guintptr
lockedExt uint32 // tracking for external LockOSThread createstack [32]uintptr // stack that created this thread, it's used for StackRecord.Stack0, so it must align with it.
lockedInt uint32 // tracking for internal lockOSThread lockedExt uint32 // tracking for external LockOSThread
nextwaitm muintptr // next m waiting for lock lockedInt uint32 // tracking for internal lockOSThread
nextwaitm muintptr // next m waiting for lock
mLockProfile mLockProfile // fields relating to runtime.lock contention mLockProfile mLockProfile // fields relating to runtime.lock contention
profStack []uintptr // used for memory/block/mutex stack traces profStack []uintptr // used for memory/block/mutex stack traces