diff --git a/src/runtime/extern.go b/src/runtime/extern.go index bf0d0f71a6..ac07119cb9 100644 --- a/src/runtime/extern.go +++ b/src/runtime/extern.go @@ -56,13 +56,14 @@ It is a comma-separated list of name=val pairs setting these named variables: requires a rebuild), see https://pkg.go.dev/internal/goexperiment for details. dontfreezetheworld: by default, the start of a fatal panic or throw - "freezes the world", stopping all goroutines, which makes it possible - to traceback all goroutines (running goroutines cannot be traced), and + "freezes the world", preempting all threads to stop all running + goroutines, which makes it possible to traceback all goroutines, and keeps their state close to the point of panic. Setting - dontfreezetheworld=1 disables freeze, allowing goroutines to continue - executing during panic processing. This can be useful when debugging - the runtime scheduler, as freezetheworld perturbs scheduler state and - thus may hide problems. + dontfreezetheworld=1 disables this preemption, allowing goroutines to + continue executing during panic processing. Note that goroutines that + naturally enter the scheduler will still stop. This can be useful when + debugging the runtime scheduler, as freezetheworld perturbs scheduler + state and thus may hide problems. efence: setting efence=1 causes the allocator to run in a mode where each object is allocated on a unique page and addresses are diff --git a/src/runtime/panic.go b/src/runtime/panic.go index 6d6b05b201..64fa272385 100644 --- a/src/runtime/panic.go +++ b/src/runtime/panic.go @@ -1247,9 +1247,6 @@ func startpanic_m() bool { if debug.schedtrace > 0 || debug.scheddetail > 0 { schedtrace(true) } - if debug.dontfreezetheworld > 0 { - return true - } freezetheworld() return true case 1: diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 9a252cfcf5..3cecd1a057 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -921,6 +921,35 @@ var freezing atomic.Bool // This function must not lock any mutexes. func freezetheworld() { freezing.Store(true) + if debug.dontfreezetheworld > 0 { + // Don't prempt Ps to stop goroutines. That will perturb + // scheduler state, making debugging more difficult. Instead, + // allow goroutines to continue execution. + // + // fatalpanic will tracebackothers to trace all goroutines. It + // is unsafe to trace a running goroutine, so tracebackothers + // will skip running goroutines. That is OK and expected, we + // expect users of dontfreezetheworld to use core files anyway. + // + // However, allowing the scheduler to continue running free + // introduces a race: a goroutine may be stopped when + // tracebackothers checks its status, and then start running + // later when we are in the middle of traceback, potentially + // causing a crash. + // + // To mitigate this, when an M naturally enters the scheduler, + // schedule checks if freezing is set and if so stops + // execution. This guarantees that while Gs can transition from + // running to stopped, they can never transition from stopped + // to running. + // + // The sleep here allows racing Ms that missed freezing and are + // about to run a G to complete the transition to running + // before we start traceback. + usleep(1000) + return + } + // stopwait and preemption requests can be lost // due to races with concurrently executing threads, // so try several times @@ -3552,6 +3581,18 @@ top: gp, inheritTime, tryWakeP := findRunnable() // blocks until work is available + if debug.dontfreezetheworld > 0 && freezing.Load() { + // See comment in freezetheworld. We don't want to perturb + // scheduler state, so we didn't gcstopm in findRunnable, but + // also don't want to allow new goroutines to run. + // + // Deadlock here rather than in the findRunnable loop so if + // findRunnable is stuck in a loop we don't perturb that + // either. + lock(&deadlock) + lock(&deadlock) + } + // This thread is going to run a goroutine and is not spinning anymore, // so if it was marked as spinning we need to reset it now and potentially // start a new spinning M.