diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index 353610d50c..943a7233ae 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -501,7 +501,7 @@ func (c *gcControllerState) findRunnable(_p_ *p) *g {
 		// else for a while, so kick everything out of its run
 		// queue.
 	} else {
-		if _p_.m.ptr().currentwbuf == 0 && work.full == 0 && work.partial == 0 {
+		if _p_.gcw.wbuf == 0 && work.full == 0 && work.partial == 0 {
 			// No work to be done right now. This can
 			// happen at the end of the mark phase when
 			// there are still assists tapering off. Don't
@@ -1026,7 +1026,6 @@ func gcBgMarkWorker(p *p) {
 	// is set, this puts itself into _Gwaiting to be woken up by
 	// gcController.findRunnable at the appropriate time.
 	notewakeup(&work.bgMarkReady)
-	var gcw gcWork
 	for {
 		// Go to sleep until woken by gcController.findRunnable.
 		// We can't releasem yet since even the call to gopark
@@ -1055,18 +1054,19 @@
 		done := false
 		switch p.gcMarkWorkerMode {
 		case gcMarkWorkerDedicatedMode:
-			gcDrain(&gcw, gcBgCreditSlack)
+			gcDrain(&p.gcw, gcBgCreditSlack)
 			// gcDrain did the xadd(&work.nwait +1) to
 			// match the decrement above. It only returns
 			// at a mark completion point.
 			done = true
 		case gcMarkWorkerFractionalMode, gcMarkWorkerIdleMode:
-			gcDrainUntilPreempt(&gcw, gcBgCreditSlack)
+			gcDrainUntilPreempt(&p.gcw, gcBgCreditSlack)
 			// Was this the last worker and did we run out
 			// of work?
 			done = xadd(&work.nwait, +1) == work.nproc && work.full == 0 && work.partial == 0
 		}
-		gcw.dispose()
+		// We're not in mark termination, so there's no need
+		// to dispose p.gcw.
 
 		// If this worker reached a background mark completion
 		// point, signal the main GC goroutine.
@@ -1121,6 +1121,14 @@ func gcMark(start_time int64) {
 	gcCopySpans() // TODO(rlh): should this be hoisted and done only once? Right now it is done for normal marking and also for checkmarking.
 
+	// Gather all cached GC work. All other Ps are stopped, so
+	// it's safe to manipulate their GC work caches. During mark
+	// termination, these caches can still be used temporarily,
+	// but must be disposed to the global lists immediately.
+	for i := 0; i < int(gomaxprocs); i++ {
+		allp[i].gcw.dispose()
+	}
+
 	work.nwait = 0
 	work.ndone = 0
 	work.nproc = uint32(gcprocs())
@@ -1135,9 +1143,9 @@
 		helpgc(int32(work.nproc))
 	}
 
-	harvestwbufs() // move local workbufs onto global queues where the GC can find them
 	gchelperstart()
 	parfordo(work.markfor)
+	var gcw gcWork
 	gcDrain(&gcw, -1)
 	gcw.dispose()
 
@@ -1153,6 +1161,12 @@
 		notesleep(&work.alldone)
 	}
 
+	for i := 0; i < int(gomaxprocs); i++ {
+		if allp[i].gcw.wbuf != 0 {
+			throw("P has cached GC work at end of mark termination")
+		}
+	}
+
 	if trace.enabled {
 		traceGCScanDone()
 	}
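
The gcMark changes above establish an invariant worth stating plainly: under STW, every P's cache is flushed before the helpers run; a write barrier that fires during mark termination may briefly refill a cache but must dispose it immediately (see shade in mgcmark.go below); and every cache must be empty again by the end. A minimal toy model of that invariant, with invented types and names rather than the runtime's real ones:

    package main

    import "fmt"

    // gcWork stands in for the runtime's per-P work cache.
    type gcWork struct{ wbuf []uintptr }

    // globalFull stands in for the global work.full/work.partial lists.
    var globalFull [][]uintptr

    // dispose flushes any cached buffer to the global list, like the
    // runtime's gcWork.dispose.
    func (w *gcWork) dispose() {
        if len(w.wbuf) > 0 {
            globalFull = append(globalFull, w.wbuf)
            w.wbuf = nil
        }
    }

    type p struct{ gcw gcWork }

    var allp = []*p{new(p), new(p), new(p)}

    func gcMark() {
        // Gather all cached GC work. The world is stopped, so it's
        // safe to touch every P's cache.
        for _, pp := range allp {
            pp.gcw.dispose()
        }

        // ... mark termination drains globalFull here; any write barrier
        // that fires now refills a cache and must dispose immediately ...

        // The check at the end of gcMark: no P may still hold work.
        for i, pp := range allp {
            if pp.gcw.wbuf != nil {
                panic(fmt.Sprintf("P %d has cached GC work at end of mark termination", i))
            }
        }
    }

    func main() {
        // Pretend a write barrier queued work on P 0 during concurrent mark.
        allp[0].gcw.wbuf = append(allp[0].gcw.wbuf, 0x1234)
        gcMark()
        fmt.Println("buffers flushed to the global list:", len(globalFull))
    }
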
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index 5d5a0dab75..2b6e9a37d3 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -55,6 +55,7 @@ var oneptr = [...]uint8{typePointer}
 //go:nowritebarrier
 func markroot(desc *parfor, i uint32) {
+	// TODO: Consider using getg().m.p.ptr().gcw.
 	var gcw gcWork
 
 	// Note: if you add a case here, please also update heapdump.go:dumproots.
@@ -172,6 +173,8 @@
 // allocations performed by this mutator since the last assist.
 //
 // It should only be called during gcphase == _GCmark.
+//
+// This must be called with preemption disabled.
 //go:nowritebarrier
 func gcAssistAlloc(size uintptr, allowAssist bool) {
 	// Find the G responsible for this assist.
@@ -228,19 +231,14 @@ func gcAssistAlloc(size uintptr, allowAssist bool) {
 	xadd(&work.nwait, -1)
 
-	// drain own current wbuf first in the hopes that it
+	// Drain our own cached work first in the hope that it
 	// will be more cache friendly.
-	var gcw gcWork
-	gcw.initFromCache()
+	gcw := &getg().m.p.ptr().gcw
 	startScanWork := gcw.scanWork
-	gcDrainN(&gcw, scanWork)
+	gcDrainN(gcw, scanWork)
 	// Record that we did this much scan work.
 	gp.gcscanwork += gcw.scanWork - startScanWork
-	// TODO(austin): This is the vast majority of our
-	// disposes. Instead of constantly disposing, keep a
-	// per-P gcWork cache (probably combined with the
-	// write barrier wbuf cache).
-	gcw.dispose()
+	// No need to dispose since we're not in mark termination.
 
 	// If this is the last worker and we ran out of work,
 	// signal a completion point.
@@ -315,21 +313,24 @@ func scanstack(gp *g) {
 		throw("can't scan gchelper stack")
 	}
 
-	var gcw gcWork
-	gcw.initFromCache()
+	gcw := &getg().m.p.ptr().gcw
+	origBytesMarked := gcw.bytesMarked
+	origScanWork := gcw.scanWork
 	scanframe := func(frame *stkframe, unused unsafe.Pointer) bool {
 		// Pick up gcw as free variable so gentraceback and friends can
 		// keep the same signature.
-		scanframeworker(frame, unused, &gcw)
+		scanframeworker(frame, unused, gcw)
 		return true
 	}
 	gentraceback(^uintptr(0), ^uintptr(0), 0, gp, 0, nil, 0x7fffffff, scanframe, nil, 0)
 	tracebackdefers(gp, scanframe, nil)
 	// Stacks aren't part of the heap, so don't count them toward
 	// marked heap bytes.
-	gcw.bytesMarked = 0
-	gcw.scanWork = 0
-	gcw.disposeToCache()
+	gcw.bytesMarked = origBytesMarked
+	gcw.scanWork = origScanWork
+	if gcphase == _GCmarktermination {
+		gcw.dispose()
+	}
 	gp.gcscanvalid = true
 }
@@ -462,7 +463,6 @@ func gcDrain(gcw *gcWork, flushScanCredit int64) {
 		credit := gcw.scanWork - lastScanFlush
 		xaddint64(&gcController.bgScanCredit, credit)
 	}
-	checknocurrentwbuf()
 }
 
 // gcDrainUntilPreempt blackens grey objects until g.preempt is set.
@@ -524,7 +524,6 @@
 // scanning is always done in whole object increments.
 //go:nowritebarrier
 func gcDrainN(gcw *gcWork, scanWork int64) {
-	checknocurrentwbuf()
 	targetScanWork := gcw.scanWork + scanWork
 	for gcw.scanWork < targetScanWork {
 		// This might be a good place to add prefetch code...
@@ -646,33 +645,16 @@ func scanobject(b, n uintptr, ptrmask *uint8, gcw *gcWork) {
 
 // Shade the object if it isn't already.
 // The object is not nil and known to be in the heap.
+// Preemption must be disabled.
 //go:nowritebarrier
 func shade(b uintptr) {
 	if obj, hbits, span := heapBitsForObject(b); obj != 0 {
-		// TODO: this would be a great place to put a check to see
-		// if we are harvesting and if we are then we should
-		// figure out why there is a call to shade when the
-		// harvester thinks we are in a STW.
-		// if atomicload(&harvestingwbufs) == uint32(1) {
-		// 	// Throw here to discover write barriers
-		// 	// being executed during a STW.
-		// 	throw("shade during harvest")
-		// }
-
-		var gcw gcWork
-		greyobject(obj, 0, 0, hbits, span, &gcw)
-		// This is part of the write barrier so put the wbuf back.
+		gcw := &getg().m.p.ptr().gcw
+		greyobject(obj, 0, 0, hbits, span, gcw)
 		if gcphase == _GCmarktermination {
+			// Ps aren't allowed to cache work during mark
+			// termination.
 			gcw.dispose()
-		} else {
-			// If we added any pointers to the gcw, then
-			// currentwbuf must be nil because 1)
-			// greyobject got its wbuf from currentwbuf
-			// and 2) shade runs on the systemstack, so
-			// we're still on the same M. If either of
-			// these becomes no longer true, we need to
-			// rethink this.
-			gcw.disposeToCache()
 		}
 	}
 }
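
One subtlety in the mgcmark.go changes: the per-P gcWork persists across uses, so its counters accumulate. gcAssistAlloc therefore credits only the delta of scanWork it produced, and scanstack saves and restores the counters instead of zeroing them as the old code did. A small self-contained sketch of that accounting (invented names, not runtime APIs):

    package main

    import "fmt"

    type gcWork struct {
        bytesMarked uint64
        scanWork    int64
    }

    type p struct{ gcw gcWork }

    // curp stands in for getg().m.p.ptr(); preemption is assumed disabled.
    var curp = &p{gcw: gcWork{scanWork: 100}} // the cache already holds old credit

    // drainN stands in for gcDrainN: it scans and accumulates scan work
    // into the shared, persistent cache.
    func drainN(gcw *gcWork, n int64) { gcw.scanWork += n }

    // assist mirrors gcAssistAlloc's accounting: only the delta since
    // startScanWork is credited to this goroutine.
    func assist(n int64) int64 {
        gcw := &curp.gcw
        startScanWork := gcw.scanWork
        drainN(gcw, n)
        return gcw.scanWork - startScanWork
    }

    // scanStack mirrors scanstack: stacks aren't heap, so whatever the scan
    // added is rolled back, while the counters the cache held before survive
    // (the old code could simply zero the fields of its private gcWork).
    func scanStack(gcw *gcWork) {
        origBytesMarked, origScanWork := gcw.bytesMarked, gcw.scanWork
        gcw.bytesMarked += 512 // pretend the stack scan marked objects
        gcw.scanWork += 64
        gcw.bytesMarked, gcw.scanWork = origBytesMarked, origScanWork
    }

    func main() {
        fmt.Println("assist credit:", assist(25)) // 25, not 125
        scanStack(&curp.gcw)
        fmt.Println("cached scanWork:", curp.gcw.scanWork) // still 125
    }
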
diff --git a/src/runtime/mgcwork.go b/src/runtime/mgcwork.go
index fbe4d03adf..e7d1a104b8 100644
--- a/src/runtime/mgcwork.go
+++ b/src/runtime/mgcwork.go
@@ -38,7 +38,7 @@ func (wp wbufptr) ptr() *workbuf {
 // A gcWork provides the interface to produce and consume work for the
 // garbage collector.
 //
-// The usual pattern for using gcWork is:
+// A gcWork can be used on the stack as follows:
 //
 //	var gcw gcWork
 //	disable preemption
@@ -46,6 +46,15 @@ func (wp wbufptr) ptr() *workbuf {
 //	gcw.dispose()
 //	enable preemption
 //
+// Or from the per-P gcWork cache:
+//
+//	(preemption must be disabled)
+//	gcw := &getg().m.p.ptr().gcw
+//	.. call gcw.put() to produce and gcw.get() to consume ..
+//	if gcphase == _GCmarktermination {
+//		gcw.dispose()
+//	}
+//
 // It's important that any use of gcWork during the mark phase prevent
 // the garbage collector from transitioning to mark termination since
 // gcWork may locally hold GC work buffers. This can be done by
@@ -63,17 +72,6 @@ type gcWork struct {
 	scanWork int64
 }
 
-// initFromCache fetches work from this M's currentwbuf cache.
-//go:nowritebarrier
-func (w *gcWork) initFromCache() {
-	// TODO: Instead of making gcWork pull from the currentwbuf
-	// cache, use a gcWork as the cache and make shade pass around
-	// that gcWork.
-	if w.wbuf == 0 {
-		w.wbuf = wbufptr(xchguintptr(&getg().m.currentwbuf, 0))
-	}
-}
-
 // put enqueues a pointer for the garbage collector to trace.
 //go:nowritebarrier
 func (ww *gcWork) put(obj uintptr) {
@@ -174,27 +172,6 @@ func (w *gcWork) dispose() {
 	}
 }
 
-// disposeToCache returns any cached pointers to this M's currentwbuf.
-// It calls throw if currentwbuf is non-nil.
-//go:nowritebarrier
-func (w *gcWork) disposeToCache() {
-	if wbuf := w.wbuf; wbuf != 0 {
-		wbuf = wbufptr(xchguintptr(&getg().m.currentwbuf, uintptr(wbuf)))
-		if wbuf != 0 {
-			throw("m.currentwbuf non-nil in disposeToCache")
-		}
-		w.wbuf = 0
-	}
-	if w.bytesMarked != 0 {
-		xadd64(&work.bytesMarked, int64(w.bytesMarked))
-		w.bytesMarked = 0
-	}
-	if w.scanWork != 0 {
-		xaddint64(&gcController.scanWork, w.scanWork)
-		w.scanWork = 0
-	}
-}
-
 // balance moves some work that's cached in this gcWork back on the
 // global queue.
 //go:nowritebarrier
@@ -222,7 +199,7 @@ type workbuf struct {
 }
 
 // workbuf factory routines. These funcs are used to manage the
-// workbufs. They cache workbuf in the m struct field currentwbuf.
+// workbufs.
 // If the GC asks for some work these are the only routines that
 // make partially full wbufs available to the GC.
 // Each of the gets and puts also takes a distinct integer that is used
@@ -283,13 +260,6 @@ func (b *workbuf) checkempty() {
 	}
 }
 
-// checknocurrentwbuf checks that the m's currentwbuf field is empty
-func checknocurrentwbuf() {
-	if getg().m.currentwbuf != 0 {
-		throw("unexpected currentwbuf")
-	}
-}
-
 // getempty pops an empty work buffer off the work.empty list,
 // allocating new buffers if none are available.
 // entry is used to record a brief history of ownership.
@@ -335,21 +305,7 @@ func putfull(b *workbuf, entry int) {
 // indicating two line numbers in the call chain.
 //go:nowritebarrier
 func getpartialorempty(entry int) *workbuf {
-	var b *workbuf
-	// If this m has a buf in currentwbuf then as an optimization
-	// simply return that buffer. If it turns out currentwbuf
-	// is full, put it on the work.full queue and get another
-	// workbuf off the partial or empty queue.
-	if getg().m.currentwbuf != 0 {
-		b = (*workbuf)(unsafe.Pointer(xchguintptr(&getg().m.currentwbuf, 0)))
-		if b != nil {
-			if b.nobj <= len(b.obj) {
-				return b
-			}
-			putfull(b, entry+80100000)
-		}
-	}
-	b = (*workbuf)(lfstackpop(&work.partial))
+	b := (*workbuf)(lfstackpop(&work.partial))
 	if b != nil {
 		b.logget(entry)
 		return b
 	}
@@ -395,21 +351,6 @@ func trygetfull(entry int) *workbuf {
 		b.checknonempty()
 		return b
 	}
-	// full and partial are both empty so see if there
-	// is an work available on currentwbuf.
-	// This is an optimization to shift
-	// processing from the STW marktermination phase into
-	// the concurrent mark phase.
-	if getg().m.currentwbuf != 0 {
-		b = (*workbuf)(unsafe.Pointer(xchguintptr(&getg().m.currentwbuf, 0)))
-		if b != nil {
-			if b.nobj != 0 {
-				return b
-			}
-			putempty(b, 839)
-			b = nil
-		}
-	}
 	return b
 }
 
@@ -438,19 +379,6 @@ func getfull(entry int) *workbuf {
 		b.logget(entry)
 		return b
 	}
-	// Make sure that currentwbuf is also not a source for pointers to be
-	// processed. This is an optimization that shifts processing
-	// from the mark termination STW phase to the concurrent mark phase.
-	if getg().m.currentwbuf != 0 {
-		b = (*workbuf)(unsafe.Pointer(xchguintptr(&getg().m.currentwbuf, 0)))
-		if b != nil {
-			if b.nobj != 0 {
-				return b
-			}
-			putempty(b, 877)
-			b = nil
-		}
-	}
 
 	xadd(&work.nwait, +1)
 	for i := 0; ; i++ {
@@ -500,35 +428,3 @@ func handoff(b *workbuf) *workbuf {
 	putfull(b, 942)
 	return b1
 }
-
-// 1 when you are harvesting so that the write buffer code shade can
-// detect calls during a presumable STW write barrier.
-var harvestingwbufs uint32
-
-// harvestwbufs moves non-empty workbufs to work.full from m.currentwuf
-// Must be in a STW phase.
-// xchguintptr is used since there are write barrier calls from the GC helper
-// routines even during a STW phase.
-// TODO: chase down write barrier calls in STW phase and understand and eliminate
-// them.
-//go:nowritebarrier
-func harvestwbufs() {
-	// announce to write buffer that you are harvesting the currentwbufs
-	atomicstore(&harvestingwbufs, 1)
-
-	for mp := allm; mp != nil; mp = mp.alllink {
-		wbuf := (*workbuf)(unsafe.Pointer(xchguintptr(&mp.currentwbuf, 0)))
-		// TODO: beat write barriers out of the mark termination and eliminate xchg
-		// tempwbuf := (*workbuf)(unsafe.Pointer(tempm.currentwbuf))
-		// tempm.currentwbuf = 0
-		if wbuf != nil {
-			if wbuf.nobj == 0 {
-				putempty(wbuf, 945)
-			} else {
-				putfull(wbuf, 947) //use full instead of partial so GC doesn't compete to get wbuf
-			}
-		}
-	}
-
-	atomicstore(&harvestingwbufs, 0)
-}
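
With initFromCache, disposeToCache, checknocurrentwbuf, and harvestwbufs gone, gcWork.dispose is the single remaining path from a cache to global state. A rough sketch of what that path flushes, using invented stand-ins for work.full, work.bytesMarked, and gcController.scanWork (the runtime uses putfull/putpartial and xadd64/xaddint64 rather than these names):

    package main

    import (
        "fmt"
        "sync/atomic"
    )

    type workbuf struct{ obj []uintptr }

    var (
        fullList        []*workbuf // stand-in for the lock-free work.full stack
        workBytesMarked uint64     // stand-in for work.bytesMarked
        ctrlScanWork    int64      // stand-in for gcController.scanWork
    )

    type gcWork struct {
        wbuf        *workbuf
        bytesMarked uint64
        scanWork    int64
    }

    // dispose publishes everything the cache holds and leaves it empty.
    func (w *gcWork) dispose() {
        if w.wbuf != nil {
            fullList = append(fullList, w.wbuf) // runtime: putfull/putpartial
            w.wbuf = nil
        }
        if w.bytesMarked != 0 {
            atomic.AddUint64(&workBytesMarked, w.bytesMarked) // runtime: xadd64
            w.bytesMarked = 0
        }
        if w.scanWork != 0 {
            atomic.AddInt64(&ctrlScanWork, w.scanWork) // runtime: xaddint64
            w.scanWork = 0
        }
    }

    func main() {
        w := gcWork{wbuf: &workbuf{obj: []uintptr{1, 2, 3}}, bytesMarked: 96, scanWork: 96}
        w.dispose()
        fmt.Println(len(fullList), workBytesMarked, ctrlScanWork) // 1 96 96
    }
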
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index 476108e36c..998a159887 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -304,7 +304,6 @@ type m struct {
 	waitsemacount uint32
 	waitsemalock  uint32
 	gcstats       gcstats
-	currentwbuf   uintptr // use locks or atomic operations such as xchguinptr to access.
 	needextram    bool
 	traceback     uint8
 	waitunlockf   unsafe.Pointer // todo go func(*g, unsafe.pointer) bool
@@ -384,6 +383,11 @@ type p struct {
 	gcBgMarkWorker   *g
 	gcMarkWorkerMode gcMarkWorkerMode
 
+	// gcw is this P's GC work buffer cache. The work buffer is
+	// filled by write barriers, drained by mutator assists, and
+	// disposed on certain GC state transitions.
+	gcw gcWork
+
 	pad [64]byte
 }
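
A note on the design choice visible in runtime2.go: m.currentwbuf had to be accessed with atomic exchanges because any thread could harvest it, while p.gcw is touched only by code running on that P with preemption disabled, or during STW, so plain field access suffices. A toy contrast (invented types; atomic.Pointer stands in for the runtime's xchguintptr):

    package main

    import "sync/atomic"

    type workbuf struct{ nobj int }

    // Old scheme: the cache hangs off the M and can be stolen by other
    // threads (getfull, harvestwbufs), so every access is an atomic swap.
    type m struct{ currentwbuf atomic.Pointer[workbuf] }

    func (mp *m) takeWbuf() *workbuf { return mp.currentwbuf.Swap(nil) }

    // New scheme: the cache hangs off the P. Only the goroutine running on
    // this P (with preemption disabled) or STW code touches it, so plain
    // loads and stores are enough.
    type p struct{ wbuf *workbuf }

    func (pp *p) takeWbuf() *workbuf {
        b := pp.wbuf
        pp.wbuf = nil
        return b
    }

    func main() {
        var mp m
        mp.currentwbuf.Store(&workbuf{nobj: 1})
        _ = mp.takeWbuf()

        pp := &p{wbuf: &workbuf{nobj: 1}}
        _ = pp.takeWbuf()
    }

Cross-P access still happens (gcMark's gather loop and findRunnable's emptiness check above), but only while the cache's owner is stopped, which is what makes the plain accesses safe.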