runtime: refactor runtime->tracer API to appear more like a lock

Currently, the execution tracer synchronizes with itself using very
heavyweight operations. As a result, it's totally fine for most of the
tracer code to look like:

    if traceEnabled() {
        traceXXX(...)
    }

However, if we want to make that synchronization more lightweight (as
issue #60773 proposes), this is insufficient. In particular, we need
to make sure the tracer can't observe an inconsistency between a g's
atomicstatus and the event that would be emitted for a particular
g transition. This means making each g status change and the write of
its corresponding trace event appear atomic from the perspective of
the tracer.

This requires a change in API to something more like a lock. While we're
here, we might as well make sure that trace events can *only* be emitted
while this lock is held. This change introduces such an API:
traceAcquire, which returns a value that can emit events, and
traceRelease, which consumes the value returned by traceAcquire. In
practice, this won't be a real lock; it'll be more like a seqlock.

For the current tracer, this API is complete overkill: the value
returned by traceAcquire does little more than check trace.enabled. But
it's necessary for the tracer described in #60773, and we can implement
that more cleanly if we do this refactoring now instead of later.

For #60773.

Change-Id: Ibb9ff5958376339fafc2b5180aef65cf2ba18646
Reviewed-on: https://go-review.googlesource.com/c/go/+/515635
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
Author: Michael Anthony Knyszek (2023-07-27 19:04:04 +00:00), committed by Gopher Robot
Parent: e3585c6757
Commit: f119abb65d
9 changed files with 388 additions and 177 deletions

src/runtime/debugcall.go

@ -166,10 +166,12 @@ func debugCallWrap(dispatch uintptr) {
gp.schedlink = 0
// Park the calling goroutine.
if traceEnabled() {
traceGoPark(traceBlockDebugCall, 1)
}
trace := traceAcquire()
casGToWaiting(gp, _Grunning, waitReasonDebugCall)
if trace.ok() {
trace.GoPark(traceBlockDebugCall, 1)
traceRelease(trace)
}
dropg()
// Directly execute the new goroutine. The debug
@ -225,19 +227,23 @@ func debugCallWrap1() {
// Switch back to the calling goroutine. At some point
// the scheduler will schedule us again and we'll
// finish exiting.
if traceEnabled() {
traceGoSched()
}
trace := traceAcquire()
casgstatus(gp, _Grunning, _Grunnable)
if trace.ok() {
trace.GoSched()
traceRelease(trace)
}
dropg()
lock(&sched.lock)
globrunqput(gp)
unlock(&sched.lock)
if traceEnabled() {
traceGoUnpark(callingG, 0)
}
trace = traceAcquire()
casgstatus(callingG, _Gwaiting, _Grunnable)
if trace.ok() {
trace.GoUnpark(callingG, 0)
traceRelease(trace)
}
execute(callingG, true)
})
}

src/runtime/mcentral.go

@ -84,8 +84,10 @@ func (c *mcentral) cacheSpan() *mspan {
deductSweepCredit(spanBytes, 0)
traceDone := false
if traceEnabled() {
traceGCSweepStart()
trace := traceAcquire()
if trace.ok() {
trace.GCSweepStart()
traceRelease(trace)
}
// If we sweep spanBudget spans without finding any free
@ -157,9 +159,11 @@ func (c *mcentral) cacheSpan() *mspan {
}
sweep.active.end(sl)
}
if traceEnabled() {
traceGCSweepDone()
trace = traceAcquire()
if trace.ok() {
trace.GCSweepDone()
traceDone = true
traceRelease(trace)
}
// We failed to get a span from the mcentral so get one from mheap.
@ -170,8 +174,12 @@ func (c *mcentral) cacheSpan() *mspan {
// At this point s is a span that should have free slots.
havespan:
if traceEnabled() && !traceDone {
traceGCSweepDone()
if !traceDone {
trace := traceAcquire()
if trace.ok() {
trace.GCSweepDone()
traceRelease(trace)
}
}
n := int(s.nelems) - int(s.allocCount)
if n == 0 || s.freeindex == s.nelems || s.allocCount == s.nelems {

src/runtime/mgc.go

@ -647,8 +647,10 @@ func gcStart(trigger gcTrigger) {
// Update it under gcsema to avoid gctrace getting wrong values.
work.userForced = trigger.kind == gcTriggerCycle
if traceEnabled() {
traceGCStart()
trace := traceAcquire()
if trace.ok() {
trace.GCStart()
traceRelease(trace)
}
// Check that all Ps have finished deferred mcache flushes.
@ -989,8 +991,10 @@ func gcMarkTermination() {
mp.traceback = 0
casgstatus(curgp, _Gwaiting, _Grunning)
if traceEnabled() {
traceGCDone()
trace := traceAcquire()
if trace.ok() {
trace.GCDone()
traceRelease(trace)
}
// all done

src/runtime/mgcmark.go

@ -420,7 +420,11 @@ retry:
// If the CPU limiter is enabled, intentionally don't
// assist to reduce the amount of CPU time spent in the GC.
if traced {
traceGCMarkAssistDone()
trace := traceAcquire()
if trace.ok() {
trace.GCMarkAssistDone()
traceRelease(trace)
}
}
return
}
@ -461,15 +465,22 @@ retry:
// We were able to steal all of the credit we
// needed.
if traced {
traceGCMarkAssistDone()
trace := traceAcquire()
if trace.ok() {
trace.GCMarkAssistDone()
traceRelease(trace)
}
}
return
}
}
if traceEnabled() && !traced {
traced = true
traceGCMarkAssistStart()
if !traced {
trace := traceAcquire()
if trace.ok() {
traced = true
trace.GCMarkAssistStart()
traceRelease(trace)
}
}
// Perform assist work
@ -515,7 +526,11 @@ retry:
// this G's assist debt, or the GC cycle is over.
}
if traced {
traceGCMarkAssistDone()
trace := traceAcquire()
if trace.ok() {
trace.GCMarkAssistDone()
traceRelease(trace)
}
}
}

src/runtime/mgcpacer.go

@ -807,9 +807,11 @@ func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) {
// Run the background mark worker.
gp := node.gp.ptr()
trace := traceAcquire()
casgstatus(gp, _Gwaiting, _Grunnable)
if traceEnabled() {
traceGoUnpark(gp, 0)
if trace.ok() {
trace.GoUnpark(gp, 0)
traceRelease(trace)
}
return gp, now
}
@ -828,8 +830,10 @@ func (c *gcControllerState) resetLive(bytesMarked uint64) {
c.triggered = ^uint64(0) // Reset triggered.
// heapLive was updated, so emit a trace event.
if traceEnabled() {
traceHeapAlloc(bytesMarked)
trace := traceAcquire()
if trace.ok() {
trace.HeapAlloc(bytesMarked)
traceRelease(trace)
}
}
@ -856,10 +860,12 @@ func (c *gcControllerState) markWorkerStop(mode gcMarkWorkerMode, duration int64
func (c *gcControllerState) update(dHeapLive, dHeapScan int64) {
if dHeapLive != 0 {
trace := traceAcquire()
live := gcController.heapLive.Add(dHeapLive)
if traceEnabled() {
if trace.ok() {
// gcController.heapLive changed.
traceHeapAlloc(live)
trace.HeapAlloc(live)
traceRelease(trace)
}
}
if gcBlackenEnabled == 0 {
@ -1428,8 +1434,10 @@ func gcControllerCommit() {
// TODO(mknyszek): This isn't really accurate any longer because the heap
// goal is computed dynamically. Still useful to snapshot, but not as useful.
if traceEnabled() {
traceHeapGoal()
trace := traceAcquire()
if trace.ok() {
trace.HeapGoal()
traceRelease(trace)
}
trigger, heapGoal := gcController.trigger()

src/runtime/mgcsweep.go

@ -516,8 +516,10 @@ func (sl *sweepLocked) sweep(preserve bool) bool {
throw("mspan.sweep: bad span state")
}
if traceEnabled() {
traceGCSweepSpan(s.npages * _PageSize)
trace := traceAcquire()
if trace.ok() {
trace.GCSweepSpan(s.npages * _PageSize)
traceRelease(trace)
}
mheap_.pagesSwept.Add(int64(s.npages))
@ -889,8 +891,10 @@ func deductSweepCredit(spanBytes uintptr, callerSweepPages uintptr) {
return
}
if traceEnabled() {
traceGCSweepStart()
trace := traceAcquire()
if trace.ok() {
trace.GCSweepStart()
traceRelease(trace)
}
// Fix debt if necessary.
@ -929,8 +933,10 @@ retry:
}
}
if traceEnabled() {
traceGCSweepDone()
trace = traceAcquire()
if trace.ok() {
trace.GCSweepDone()
traceRelease(trace)
}
}

src/runtime/mheap.go

@ -791,8 +791,10 @@ func (h *mheap) reclaim(npage uintptr) {
// traceGCSweepStart/Done pair on the P.
mp := acquirem()
if traceEnabled() {
traceGCSweepStart()
trace := traceAcquire()
if trace.ok() {
trace.GCSweepStart()
traceRelease(trace)
}
arenas := h.sweepArenas
@ -839,8 +841,10 @@ func (h *mheap) reclaim(npage uintptr) {
unlock(&h.lock)
}
if traceEnabled() {
traceGCSweepDone()
trace = traceAcquire()
if trace.ok() {
trace.GCSweepDone()
traceRelease(trace)
}
releasem(mp)
}
@ -911,10 +915,12 @@ func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr {
n -= uintptr(len(inUse) * 8)
}
sweep.active.end(sl)
if traceEnabled() {
trace := traceAcquire()
if trace.ok() {
unlock(&h.lock)
// Account for pages scanned but not reclaimed.
traceGCSweepSpan((n0 - nFreed) * pageSize)
trace.GCSweepSpan((n0 - nFreed) * pageSize)
traceRelease(trace)
lock(&h.lock)
}

src/runtime/proc.go

@ -946,10 +946,6 @@ func fastrandinit() {
// Mark gp ready to run.
func ready(gp *g, traceskip int, next bool) {
if traceEnabled() {
traceGoUnpark(gp, traceskip)
}
status := readgstatus(gp)
// Mark runnable.
@ -960,7 +956,12 @@ func ready(gp *g, traceskip int, next bool) {
}
// status is Gwaiting or Gscanwaiting, make Grunnable and put on runq
trace := traceAcquire()
casgstatus(gp, _Gwaiting, _Grunnable)
if trace.ok() {
trace.GoUnpark(gp, traceskip)
traceRelease(trace)
}
runqput(mp.p.ptr(), gp, next)
wakep()
releasem(mp)
@ -1407,8 +1408,10 @@ var gcsema uint32 = 1
// Holding worldsema causes any other goroutines invoking
// stopTheWorld to block.
func stopTheWorldWithSema(reason stwReason) {
if traceEnabled() {
traceSTWStart(reason)
trace := traceAcquire()
if trace.ok() {
trace.STWStart(reason)
traceRelease(trace)
}
gp := getg()
@ -1426,17 +1429,22 @@ func stopTheWorldWithSema(reason stwReason) {
gp.m.p.ptr().status = _Pgcstop // Pgcstop is only diagnostic.
sched.stopwait--
// try to retake all P's in Psyscall status
trace = traceAcquire()
for _, pp := range allp {
s := pp.status
if s == _Psyscall && atomic.Cas(&pp.status, s, _Pgcstop) {
if traceEnabled() {
traceGoSysBlock(pp)
traceProcStop(pp)
if trace.ok() {
trace.GoSysBlock(pp)
trace.ProcStop(pp)
}
pp.syscalltick++
sched.stopwait--
}
}
if trace.ok() {
traceRelease(trace)
}
// stop idle P's
now := nanotime()
for {
@ -1533,8 +1541,10 @@ func startTheWorldWithSema() int64 {
// Capture start-the-world time before doing clean-up tasks.
startTime := nanotime()
if traceEnabled() {
traceSTWDone()
trace := traceAcquire()
if trace.ok() {
trace.STWDone()
traceRelease(trace)
}
// Wakeup an additional proc in case we have excessive runnable goroutines
@ -1853,17 +1863,21 @@ func forEachP(fn func(*p)) {
// Force Ps currently in _Psyscall into _Pidle and hand them
// off to induce safe point function execution.
trace := traceAcquire()
for _, p2 := range allp {
s := p2.status
if s == _Psyscall && p2.runSafePointFn == 1 && atomic.Cas(&p2.status, s, _Pidle) {
if traceEnabled() {
traceGoSysBlock(p2)
traceProcStop(p2)
if trace.ok() {
trace.GoSysBlock(p2)
trace.ProcStop(p2)
}
p2.syscalltick++
handoffp(p2)
}
}
if trace.ok() {
traceRelease(trace)
}
// Wait for remaining Ps to run fn.
if wait {
@ -2172,8 +2186,10 @@ func oneNewExtraM() {
if raceenabled {
gp.racectx = racegostart(abi.FuncPCABIInternal(newextram) + sys.PCQuantum)
}
if traceEnabled() {
traceOneNewExtraM(gp)
trace := traceAcquire()
if trace.ok() {
trace.OneNewExtraM(gp)
traceRelease(trace)
}
// put on allg for garbage collector
allgadd(gp)
@ -2921,13 +2937,15 @@ func execute(gp *g, inheritTime bool) {
setThreadCPUProfiler(hz)
}
if traceEnabled() {
trace := traceAcquire()
if trace.ok() {
// GoSysExit has to happen when we have a P, but before GoStart.
// So we emit it here.
if gp.syscallsp != 0 {
traceGoSysExit()
trace.GoSysExit()
}
traceGoStart()
trace.GoStart()
traceRelease(trace)
}
gogo(&gp.sched)
@ -2964,8 +2982,12 @@ top:
if traceEnabled() || traceShuttingDown() {
gp := traceReader()
if gp != nil {
trace := traceAcquire()
casgstatus(gp, _Gwaiting, _Grunnable)
traceGoUnpark(gp, 0)
if trace.ok() {
trace.GoUnpark(gp, 0)
traceRelease(trace)
}
return gp, false, true
}
}
@ -3028,9 +3050,11 @@ top:
gp := list.pop()
injectglist(&list)
netpollAdjustWaiters(delta)
trace := traceAcquire()
casgstatus(gp, _Gwaiting, _Grunnable)
if traceEnabled() {
traceGoUnpark(gp, 0)
if trace.ok() {
trace.GoUnpark(gp, 0)
traceRelease(trace)
}
return gp, false, false
}
@ -3073,9 +3097,12 @@ top:
if node != nil {
pp.gcMarkWorkerMode = gcMarkWorkerIdleMode
gp := node.gp.ptr()
trace := traceAcquire()
casgstatus(gp, _Gwaiting, _Grunnable)
if traceEnabled() {
traceGoUnpark(gp, 0)
if trace.ok() {
trace.GoUnpark(gp, 0)
traceRelease(trace)
}
return gp, false, false
}
@ -3088,9 +3115,11 @@ top:
// until a callback was triggered.
gp, otherReady := beforeIdle(now, pollUntil)
if gp != nil {
trace := traceAcquire()
casgstatus(gp, _Gwaiting, _Grunnable)
if traceEnabled() {
traceGoUnpark(gp, 0)
if trace.ok() {
trace.GoUnpark(gp, 0)
traceRelease(trace)
}
return gp, false, false
}
@ -3216,9 +3245,11 @@ top:
// Run the idle worker.
pp.gcMarkWorkerMode = gcMarkWorkerIdleMode
trace := traceAcquire()
casgstatus(gp, _Gwaiting, _Grunnable)
if traceEnabled() {
traceGoUnpark(gp, 0)
if trace.ok() {
trace.GoUnpark(gp, 0)
traceRelease(trace)
}
return gp, false, false
}
@ -3278,9 +3309,11 @@ top:
gp := list.pop()
injectglist(&list)
netpollAdjustWaiters(delta)
trace := traceAcquire()
casgstatus(gp, _Gwaiting, _Grunnable)
if traceEnabled() {
traceGoUnpark(gp, 0)
if trace.ok() {
trace.GoUnpark(gp, 0)
traceRelease(trace)
}
return gp, false, false
}
@ -3548,10 +3581,12 @@ func injectglist(glist *gList) {
if glist.empty() {
return
}
if traceEnabled() {
trace := traceAcquire()
if trace.ok() {
for gp := glist.head.ptr(); gp != nil; gp = gp.schedlink.ptr() {
traceGoUnpark(gp, 0)
trace.GoUnpark(gp, 0)
}
traceRelease(trace)
}
// Mark all the goroutines as runnable before we put them
@ -3791,13 +3826,16 @@ func parkunlock_c(gp *g, lock unsafe.Pointer) bool {
func park_m(gp *g) {
mp := getg().m
if traceEnabled() {
traceGoPark(mp.waitTraceBlockReason, mp.waitTraceSkip)
}
trace := traceAcquire()
// N.B. Not using casGToWaiting here because the waitreason is
// set by park_m's caller.
casgstatus(gp, _Grunning, _Gwaiting)
if trace.ok() {
trace.GoPark(mp.waitTraceBlockReason, mp.waitTraceSkip)
traceRelease(trace)
}
dropg()
if fn := mp.waitunlockf; fn != nil {
@ -3805,23 +3843,35 @@ func park_m(gp *g) {
mp.waitunlockf = nil
mp.waitlock = nil
if !ok {
if traceEnabled() {
traceGoUnpark(gp, 2)
}
trace := traceAcquire()
casgstatus(gp, _Gwaiting, _Grunnable)
if trace.ok() {
trace.GoUnpark(gp, 2)
traceRelease(trace)
}
execute(gp, true) // Schedule it back, never returns.
}
}
schedule()
}
func goschedImpl(gp *g) {
func goschedImpl(gp *g, preempted bool) {
trace := traceAcquire()
status := readgstatus(gp)
if status&^_Gscan != _Grunning {
dumpgstatus(gp)
throw("bad g status")
}
casgstatus(gp, _Grunning, _Grunnable)
if trace.ok() {
if preempted {
trace.GoPreempt()
} else {
trace.GoSched()
}
traceRelease(trace)
}
dropg()
lock(&sched.lock)
globrunqput(gp)
@ -3836,39 +3886,25 @@ func goschedImpl(gp *g) {
// Gosched continuation on g0.
func gosched_m(gp *g) {
if traceEnabled() {
traceGoSched()
}
goschedImpl(gp)
goschedImpl(gp, false)
}
// goschedguarded is a forbidden-states-avoided version of gosched_m.
func goschedguarded_m(gp *g) {
if !canPreemptM(gp.m) {
gogo(&gp.sched) // never return
}
if traceEnabled() {
traceGoSched()
}
goschedImpl(gp)
goschedImpl(gp, false)
}
func gopreempt_m(gp *g) {
if traceEnabled() {
traceGoPreempt()
}
goschedImpl(gp)
goschedImpl(gp, true)
}
// preemptPark parks gp and puts it in _Gpreempted.
//
//go:systemstack
func preemptPark(gp *g) {
if traceEnabled() {
traceGoPark(traceBlockPreempted, 0)
}
status := readgstatus(gp)
if status&^_Gscan != _Grunning {
dumpgstatus(gp)
@ -3897,7 +3933,30 @@ func preemptPark(gp *g) {
// transitions until we can dropg.
casGToPreemptScan(gp, _Grunning, _Gscan|_Gpreempted)
dropg()
// Be careful about how we trace this next event. The ordering
// is subtle.
//
// The moment we CAS into _Gpreempted, suspendG could CAS to
// _Gwaiting, do its work, and ready the goroutine. All of
// this could happen before we even get the chance to emit
// an event. The end result is that the events could appear
// out of order, and the tracer generally assumes the scheduler
// takes care of the ordering between GoPark and GoUnpark.
//
// The answer here is simple: emit the event while we still hold
// the _Gscan bit on the goroutine. We still need to traceAcquire
// and traceRelease across the CAS because the tracer could be
// what's calling suspendG in the first place, and we want the
// CAS and event emission to appear atomic to the tracer.
trace := traceAcquire()
if trace.ok() {
trace.GoPark(traceBlockPreempted, 0)
}
casfrom_Gscanstatus(gp, _Gscan|_Gpreempted, _Gpreempted)
if trace.ok() {
traceRelease(trace)
}
schedule()
}
@ -3910,11 +3969,13 @@ func goyield() {
}
func goyield_m(gp *g) {
if traceEnabled() {
traceGoPreempt()
}
trace := traceAcquire()
pp := gp.m.p.ptr()
casgstatus(gp, _Grunning, _Grunnable)
if trace.ok() {
trace.GoPreempt()
traceRelease(trace)
}
dropg()
runqput(pp, gp, false)
schedule()
@ -3925,8 +3986,10 @@ func goexit1() {
if raceenabled {
racegoend()
}
if traceEnabled() {
traceGoEnd()
trace := traceAcquire()
if trace.ok() {
trace.GoEnd()
traceRelease(trace)
}
mcall(goexit0)
}
@ -4065,6 +4128,7 @@ func save(pc, sp uintptr) {
//
//go:nosplit
func reentersyscall(pc, sp uintptr) {
trace := traceAcquire()
gp := getg()
// Disable preemption because during this function g is in Gsyscall status,
@ -4095,8 +4159,11 @@ func reentersyscall(pc, sp uintptr) {
})
}
if traceEnabled() {
systemstack(traceGoSysCall)
if trace.ok() {
systemstack(func() {
trace.GoSysCall()
traceRelease(trace)
})
// systemstack itself clobbers g.sched.{pc,sp} and we might
// need them later when the G is genuinely blocked in a
// syscall
@ -4153,9 +4220,11 @@ func entersyscall_gcwait() {
lock(&sched.lock)
if sched.stopwait > 0 && atomic.Cas(&pp.status, _Psyscall, _Pgcstop) {
if traceEnabled() {
traceGoSysBlock(pp)
traceProcStop(pp)
trace := traceAcquire()
if trace.ok() {
trace.GoSysBlock(pp)
trace.ProcStop(pp)
traceRelease(trace)
}
pp.syscalltick++
if sched.stopwait--; sched.stopwait == 0 {
@ -4209,9 +4278,11 @@ func entersyscallblock() {
}
func entersyscallblock_handoff() {
if traceEnabled() {
traceGoSysCall()
traceGoSysBlock(getg().m.p.ptr())
trace := traceAcquire()
if trace.ok() {
trace.GoSysCall()
trace.GoSysBlock(getg().m.p.ptr())
traceRelease(trace)
}
handoffp(releasep())
}
@ -4250,15 +4321,21 @@ func exitsyscall() {
tryRecordGoroutineProfileWB(gp)
})
}
if traceEnabled() {
trace := traceAcquire()
if trace.ok() {
if oldp != gp.m.p.ptr() || gp.m.syscalltick != gp.m.p.ptr().syscalltick {
systemstack(traceGoStart)
systemstack(func() {
trace.GoStart()
})
}
}
// There's a cpu for us, so we can run.
gp.m.p.ptr().syscalltick++
// We need to cas the status and scan before resuming...
casgstatus(gp, _Gsyscall, _Grunning)
if trace.ok() {
traceRelease(trace)
}
// Garbage collector isn't running (since we are),
// so okay to clear syscallsp.
@ -4281,7 +4358,8 @@ func exitsyscall() {
return
}
if traceEnabled() {
trace := traceAcquire()
if trace.ok() {
// Wait till traceGoSysBlock event is emitted.
// This ensures consistency of the trace (the goroutine is started after it is blocked).
for oldp != nil && oldp.syscalltick == gp.m.syscalltick {
@ -4292,6 +4370,7 @@ func exitsyscall() {
// So instead we remember the syscall exit time and emit the event
// in execute when we have a P.
gp.trace.sysExitTime = traceClockNow()
traceRelease(trace)
}
gp.m.locks--
@ -4332,15 +4411,19 @@ func exitsyscallfast(oldp *p) bool {
var ok bool
systemstack(func() {
ok = exitsyscallfast_pidle()
if ok && traceEnabled() {
if oldp != nil {
// Wait till traceGoSysBlock event is emitted.
// This ensures consistency of the trace (the goroutine is started after it is blocked).
for oldp.syscalltick == gp.m.syscalltick {
osyield()
if ok {
trace := traceAcquire()
if trace.ok() {
if oldp != nil {
// Wait till traceGoSysBlock event is emitted.
// This ensures consistency of the trace (the goroutine is started after it is blocked).
for oldp.syscalltick == gp.m.syscalltick {
osyield()
}
}
trace.GoSysExit()
traceRelease(trace)
}
traceGoSysExit()
}
})
if ok {
@ -4358,15 +4441,17 @@ func exitsyscallfast(oldp *p) bool {
func exitsyscallfast_reacquired() {
gp := getg()
if gp.m.syscalltick != gp.m.p.ptr().syscalltick {
if traceEnabled() {
trace := traceAcquire()
if trace.ok() {
// The p was retaken and then entered a syscall again (since gp.m.syscalltick has changed).
// traceGoSysBlock for this syscall was already emitted,
// but here we effectively retake the p from the new syscall running on the same p.
systemstack(func() {
// Denote blocking of the new syscall.
traceGoSysBlock(gp.m.p.ptr())
trace.GoSysBlock(gp.m.p.ptr())
// Denote completion of the current syscall.
traceGoSysExit()
trace.GoSysExit()
traceRelease(trace)
})
}
gp.m.p.ptr().syscalltick++
@ -4631,9 +4716,11 @@ func newproc1(fn *funcval, callergp *g, callerpc uintptr) *g {
if newg.trackingSeq%gTrackingPeriod == 0 {
newg.tracking = true
}
casgstatus(newg, _Gdead, _Grunnable)
gcController.addScannableStack(pp, int64(newg.stack.hi-newg.stack.lo))
// Get a goid and switch to runnable. Make all this atomic to the tracer.
trace := traceAcquire()
casgstatus(newg, _Gdead, _Grunnable)
if pp.goidcache == pp.goidcacheend {
// Sched.goidgen is the last allocated id,
// this batch must be [sched.goidgen+1, sched.goidgen+GoidCacheBatch].
@ -4644,6 +4731,12 @@ func newproc1(fn *funcval, callergp *g, callerpc uintptr) *g {
}
newg.goid = pp.goidcache
pp.goidcache++
if trace.ok() {
trace.GoCreate(newg, newg.startpc)
traceRelease(trace)
}
// Set up race context.
if raceenabled {
newg.racectx = racegostart(callerpc)
newg.raceignore = 0
@ -4653,9 +4746,6 @@ func newproc1(fn *funcval, callergp *g, callerpc uintptr) *g {
racereleasemergeg(newg, unsafe.Pointer(&labelSync))
}
}
if traceEnabled() {
traceGoCreate(newg, newg.startpc)
}
releasem(mp)
return newg
@ -5264,8 +5354,10 @@ func procresize(nprocs int32) *p {
if old < 0 || nprocs <= 0 {
throw("procresize: invalid arg")
}
if traceEnabled() {
traceGomaxprocs(nprocs)
trace := traceAcquire()
if trace.ok() {
trace.Gomaxprocs(nprocs)
traceRelease(trace)
}
// update statistics
@ -5330,12 +5422,14 @@ func procresize(nprocs int32) *p {
// because p.destroy itself has write barriers, so we
// need to do that from a valid P.
if gp.m.p != 0 {
if traceEnabled() {
trace := traceAcquire()
if trace.ok() {
// Pretend that we were descheduled
// and then scheduled again to keep
// the trace sane.
traceGoSched()
traceProcStop(gp.m.p.ptr())
trace.GoSched()
trace.ProcStop(gp.m.p.ptr())
traceRelease(trace)
}
gp.m.p.ptr().m = 0
}
@ -5344,8 +5438,10 @@ func procresize(nprocs int32) *p {
pp.m = 0
pp.status = _Pidle
acquirep(pp)
if traceEnabled() {
traceGoStart()
trace := traceAcquire()
if trace.ok() {
trace.GoStart()
traceRelease(trace)
}
}
@ -5409,8 +5505,10 @@ func acquirep(pp *p) {
// from a potentially stale mcache.
pp.mcache.prepareForSweep()
if traceEnabled() {
traceProcStart()
trace := traceAcquire()
if trace.ok() {
trace.ProcStart()
traceRelease(trace)
}
}
@ -5451,8 +5549,10 @@ func releasep() *p {
print("releasep: m=", gp.m, " m->p=", gp.m.p.ptr(), " p->m=", hex(pp.m), " p->status=", pp.status, "\n")
throw("releasep: invalid p state")
}
if traceEnabled() {
traceProcStop(gp.m.p.ptr())
trace := traceAcquire()
if trace.ok() {
trace.ProcStop(gp.m.p.ptr())
traceRelease(trace)
}
gp.m.p = 0
pp.m = 0
@ -5799,9 +5899,11 @@ func retake(now int64) uint32 {
// increment nmidle and report deadlock.
incidlelocked(-1)
if atomic.Cas(&pp.status, s, _Pidle) {
if traceEnabled() {
traceGoSysBlock(pp)
traceProcStop(pp)
trace := traceAcquire()
if trace.ok() {
trace.GoSysBlock(pp)
trace.ProcStop(pp)
traceRelease(trace)
}
n++
pp.syscalltick++

src/runtime/trace.go

@ -258,6 +258,8 @@ func traceBufPtrOf(b *traceBuf) traceBufPtr {
// traceEnabled returns true if the trace is currently enabled.
//
// nosplit because it's called on the syscall path when stack movement is forbidden.
//
//go:nosplit
func traceEnabled() bool {
return trace.enabled
@ -270,6 +272,52 @@ func traceShuttingDown() bool {
return trace.shutdown
}
// traceLocker represents an M writing trace events. While a traceLocker value
// is valid, the tracer observes all operations on the G/M/P or trace events being
// written as happening atomically.
//
// This doesn't do much for the current tracer, because the current tracer doesn't
// need atomicity around non-trace runtime operations. All the state it needs it
// collects carefully during a STW.
type traceLocker struct {
enabled bool
}
// traceAcquire prepares this M for writing one or more trace events.
//
// This exists for compatibility with the upcoming new tracer; it doesn't do much
// in the current tracer.
//
// nosplit because it's called on the syscall path when stack movement is forbidden.
//
//go:nosplit
func traceAcquire() traceLocker {
if !traceEnabled() {
return traceLocker{false}
}
return traceLocker{true}
}
// ok returns true if the traceLocker is valid (i.e. tracing is enabled).
//
// nosplit because it's called on the syscall path when stack movement is forbidden.
//
//go:nosplit
func (tl traceLocker) ok() bool {
return tl.enabled
}
// traceRelease indicates that this M is done writing trace events.
//
// This exists for compatibility with the upcoming new tracer; it doesn't do anything
// in the current tracer.
//
// nosplit because it's called on the syscall path when stack movement is forbidden.
//
//go:nosplit
func traceRelease(tl traceLocker) {
}
// StartTrace enables tracing for the current process.
// While tracing, the data will be buffered and available via ReadTrace.
// StartTrace returns an error if tracing is already enabled.
@ -367,8 +415,10 @@ func StartTrace() error {
gp.trace.tracedSyscallEnter = false
}
})
traceProcStart()
traceGoStart()
// Use a dummy traceLocker. The trace isn't enabled yet, but we can still write events.
tl := traceLocker{}
tl.ProcStart()
tl.GoStart()
// Note: startTicks needs to be set after we emit traceEvGoInSyscall events.
// If we do it the other way around, it is possible that exitsyscall will
// query sysExitTime after startTicks but before traceEvGoInSyscall timestamp.
@ -401,7 +451,10 @@ func StartTrace() error {
unlock(&sched.sysmonlock)
// Record the current state of HeapGoal to avoid information loss in trace.
traceHeapGoal()
//
// Use the same dummy trace locker. The trace can't end until after we start
// the world, and we can safely trace from here.
tl.HeapGoal()
startTheWorldGC()
return nil
@ -427,7 +480,10 @@ func StopTrace() {
return
}
traceGoSched()
// Trace GoSched for us, and use a dummy locker. The world is stopped
// and we control whether the trace is enabled, so this is safe.
tl := traceLocker{}
tl.GoSched()
atomicstorep(unsafe.Pointer(&trace.cpuLogWrite), nil)
trace.cpuLogRead.close()
@ -847,7 +903,7 @@ func traceEventLocked(extraBytes int, mp *m, pid int32, bufp *traceBufPtr, ev by
// profiling buffer. It is called from a signal handler, so is limited in what
// it can do.
func traceCPUSample(gp *g, pp *p, stk []uintptr) {
if !trace.enabled {
if !traceEnabled() {
// Tracing is usually turned off; don't spend time acquiring the signal
// lock unless it's active.
return
@ -1475,15 +1531,15 @@ func (a *traceAlloc) drop() {
// The following functions write specific events to trace.
func traceGomaxprocs(procs int32) {
func (_ traceLocker) Gomaxprocs(procs int32) {
traceEvent(traceEvGomaxprocs, 1, uint64(procs))
}
func traceProcStart() {
func (_ traceLocker) ProcStart() {
traceEvent(traceEvProcStart, -1, uint64(getg().m.id))
}
func traceProcStop(pp *p) {
func (_ traceLocker) ProcStop(pp *p) {
// Sysmon and stopTheWorld can stop Ps blocked in syscalls,
// to handle this we temporarily employ the P.
mp := acquirem()
@ -1494,16 +1550,16 @@ func traceProcStop(pp *p) {
releasem(mp)
}
func traceGCStart() {
func (_ traceLocker) GCStart() {
traceEvent(traceEvGCStart, 3, trace.seqGC)
trace.seqGC++
}
func traceGCDone() {
func (_ traceLocker) GCDone() {
traceEvent(traceEvGCDone, -1)
}
func traceSTWStart(reason stwReason) {
func (_ traceLocker) STWStart(reason stwReason) {
// Don't trace if this STW is for trace start/stop, since traceEnabled
// switches during a STW.
if reason == stwStartTrace || reason == stwStopTrace {
@ -1513,7 +1569,7 @@ func traceSTWStart(reason stwReason) {
traceEvent(traceEvSTWStart, -1, uint64(reason))
}
func traceSTWDone() {
func (_ traceLocker) STWDone() {
mp := getg().m
if !mp.trace.tracedSTWStart {
return
@ -1527,7 +1583,7 @@ func traceSTWDone() {
//
// traceGCSweepStart must be paired with traceGCSweepDone and there
// must be no preemption points between these two calls.
func traceGCSweepStart() {
func (_ traceLocker) GCSweepStart() {
// Delay the actual GCSweepStart event until the first span
// sweep. If we don't sweep anything, don't emit any events.
pp := getg().m.p.ptr()
@ -1541,7 +1597,7 @@ func traceGCSweepStart() {
//
// This may be called outside a traceGCSweepStart/traceGCSweepDone
// pair; however, it will not emit any trace events in this case.
func traceGCSweepSpan(bytesSwept uintptr) {
func (_ traceLocker) GCSweepSpan(bytesSwept uintptr) {
pp := getg().m.p.ptr()
if pp.trace.inSweep {
if pp.trace.swept == 0 {
@ -1551,7 +1607,7 @@ func traceGCSweepSpan(bytesSwept uintptr) {
}
}
func traceGCSweepDone() {
func (_ traceLocker) GCSweepDone() {
pp := getg().m.p.ptr()
if !pp.trace.inSweep {
throw("missing traceGCSweepStart")
@ -1562,15 +1618,15 @@ func traceGCSweepDone() {
pp.trace.inSweep = false
}
func traceGCMarkAssistStart() {
func (_ traceLocker) GCMarkAssistStart() {
traceEvent(traceEvGCMarkAssistStart, 1)
}
func traceGCMarkAssistDone() {
func (_ traceLocker) GCMarkAssistDone() {
traceEvent(traceEvGCMarkAssistDone, -1)
}
func traceGoCreate(newg *g, pc uintptr) {
func (_ traceLocker) GoCreate(newg *g, pc uintptr) {
newg.trace.seq = 0
newg.trace.lastP = getg().m.p
// +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum.
@ -1578,7 +1634,7 @@ func traceGoCreate(newg *g, pc uintptr) {
traceEvent(traceEvGoCreate, 2, newg.goid, uint64(id))
}
func traceGoStart() {
func (_ traceLocker) GoStart() {
gp := getg().m.curg
pp := gp.m.p
gp.trace.seq++
@ -1592,29 +1648,29 @@ func traceGoStart() {
}
}
func traceGoEnd() {
func (_ traceLocker) GoEnd() {
traceEvent(traceEvGoEnd, -1)
}
func traceGoSched() {
func (_ traceLocker) GoSched() {
gp := getg()
gp.trace.lastP = gp.m.p
traceEvent(traceEvGoSched, 1)
}
func traceGoPreempt() {
func (_ traceLocker) GoPreempt() {
gp := getg()
gp.trace.lastP = gp.m.p
traceEvent(traceEvGoPreempt, 1)
}
func traceGoPark(reason traceBlockReason, skip int) {
func (_ traceLocker) GoPark(reason traceBlockReason, skip int) {
// Convert the block reason directly to a trace event type.
// See traceBlockReason for more information.
traceEvent(byte(reason), skip)
}
func traceGoUnpark(gp *g, skip int) {
func (_ traceLocker) GoUnpark(gp *g, skip int) {
pp := getg().m.p
gp.trace.seq++
if gp.trace.lastP == pp {
@ -1625,7 +1681,7 @@ func traceGoUnpark(gp *g, skip int) {
}
}
func traceGoSysCall() {
func (_ traceLocker) GoSysCall() {
var skip int
switch {
case tracefpunwindoff():
@ -1646,7 +1702,7 @@ func traceGoSysCall() {
traceEvent(traceEvGoSysCall, skip)
}
func traceGoSysExit() {
func (_ traceLocker) GoSysExit() {
gp := getg().m.curg
if !gp.trace.tracedSyscallEnter {
// There was no syscall entry traced for us at all, so there's definitely
@ -1673,7 +1729,7 @@ func traceGoSysExit() {
traceEvent(traceEvGoSysExit, -1, gp.goid, gp.trace.seq, uint64(ts))
}
func traceGoSysBlock(pp *p) {
func (_ traceLocker) GoSysBlock(pp *p) {
// Sysmon and stopTheWorld can declare syscalls running on remote Ps as blocked,
// to handle this we temporarily employ the P.
mp := acquirem()
@ -1684,11 +1740,11 @@ func traceGoSysBlock(pp *p) {
releasem(mp)
}
func traceHeapAlloc(live uint64) {
func (_ traceLocker) HeapAlloc(live uint64) {
traceEvent(traceEvHeapAlloc, -1, live)
}
func traceHeapGoal() {
func (_ traceLocker) HeapGoal() {
heapGoal := gcController.heapGoal()
if heapGoal == ^uint64(0) {
// Heap-based triggering is disabled.
@ -1789,15 +1845,15 @@ func startPCforTrace(pc uintptr) uintptr {
return f.datap.textAddr(*(*uint32)(w))
}
// traceOneNewExtraM registers the fact that a new extra M was created with
// OneNewExtraM registers the fact that a new extra M was created with
// the tracer. This matters if the M (which has an attached G) is used while
// the trace is still active because if it is, we need the fact that it exists
// to show up in the final trace.
func traceOneNewExtraM(gp *g) {
func (tl traceLocker) OneNewExtraM(gp *g) {
// Trigger two trace events for the locked g in the extra m,
// since the next event of the g will be traceEvGoSysExit in exitsyscall,
// while calling from C thread to Go.
traceGoCreate(gp, 0) // no start pc
tl.GoCreate(gp, 0) // no start pc
gp.trace.seq++
traceEvent(traceEvGoInSyscall, -1, gp.goid)
}