diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index 2bdb21af99..31e8e4caee 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -1302,18 +1302,22 @@ func gcMarkTermination() {
 	sweep.nbgsweep = 0
 	sweep.npausesweep = 0
 
+	// If gcSweep didn't do it, finish the current heap profiling
+	// cycle and start a new heap profiling cycle. We do this
+	// before starting the world so events don't leak into the
+	// wrong cycle.
+	needProfCycle := _ConcurrentSweep && work.mode != gcForceBlockMode
+	if needProfCycle {
+		mProf_NextCycle()
+	}
+
 	systemstack(startTheWorldWithSema)
 
-	// Update heap profile stats if gcSweep didn't do it. This is
-	// relatively expensive, so we don't want to do it while the
-	// world is stopped, but it needs to happen ASAP after
-	// starting the world to prevent too many allocations from the
-	// next cycle leaking in. It must happen before releasing
-	// worldsema since there are applications that do a
-	// runtime.GC() to update the heap profile and then
-	// immediately collect the profile.
-	if _ConcurrentSweep && work.mode != gcForceBlockMode {
-		mProf_GC()
+	// Flush the heap profile so we can start a new cycle next GC.
+	// This is relatively expensive, so we don't do it with the
+	// world stopped.
+	if needProfCycle {
+		mProf_Flush()
 	}
 
 	// Free stack spans. This must be done between GC cycles.
@@ -1759,9 +1763,12 @@ func gcSweep(mode gcMode) {
 		for sweepone() != ^uintptr(0) {
 			sweep.npausesweep++
 		}
-		// Do an additional mProf_GC, because all 'free' events are now real as well.
-		mProf_GC()
-		mProf_GC()
+		// All "free" events are now real, so flush everything
+		// into the published profile.
+		mProf_NextCycle()
+		mProf_Flush()
+		mProf_NextCycle()
+		mProf_Flush()
 		return
 	}

diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go
index 524598edaa..cd781c4416 100644
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -77,36 +77,43 @@ type memRecord struct {
 	//
 	//       alloc → ▲ ← free
 	//               ┠┅┅┅┅┅┅┅┅┅┅┅P
-	//       r_a → p_a → allocs
-	//             p_f → frees
+	//       C+2 → C+1 → C
 	//
 	//                   alloc → ▲ ← free
 	//                           ┠┅┅┅┅┅┅┅┅┅┅┅P
-	//                  r_a → p_a → alloc
-	//                        p_f → frees
+	//                  C+2 → C+1 → C
 	//
 	// Since we can't publish a consistent snapshot until all of
 	// the sweep frees are accounted for, we wait until the next
 	// mark termination ("MT" above) to publish the previous mark
-	// termination's snapshot ("P" above). To do this, information
-	// is delayed through "recent" and "prev" stages ("r_*" and
-	// "p_*" above). Specifically:
+	// termination's snapshot ("P" above). To do this, allocation
+	// and free events are accounted to *future* heap profile
+	// cycles ("C+n" above) and we only publish a cycle once all
+	// of the events from that cycle must be done. Specifically:
 	//
-	// Mallocs are accounted in recent stats.
-	// Explicit frees are accounted in recent stats.
-	// GC frees are accounted in prev stats.
-	// After GC prev stats are added to final stats and
-	// recent stats are moved into prev stats.
+	// Mallocs are accounted to cycle C+2.
+	// Explicit frees are accounted to cycle C+2.
+	// GC frees (done during sweeping) are accounted to cycle C+1.
+	//
+	// After mark termination, we increment the global heap
+	// profile cycle counter and accumulate the stats from cycle C
+	// into the active profile.

 	// active is the currently published profile. A profiling
 	// cycle can be accumulated into active once its complete.
 	active memRecordCycle

-	// changes between next-to-last GC and last GC
-	prev memRecordCycle
-
-	// changes since last GC
-	recent memRecordCycle
+	// future records the profile events we're counting for cycles
+	// that have not yet been published. This is a ring buffer
+	// indexed by the global heap profile cycle C and stores
+	// cycles C, C+1, and C+2. Unlike active, these counts are
+	// only for a single cycle; they are not cumulative across
+	// cycles.
+	//
+	// We store cycle C here because there's a window between when
+	// C becomes the active cycle and when we've flushed it to
+	// active.
+	future [3]memRecordCycle
 }

 // memRecordCycle
@@ -136,8 +143,21 @@ var (
 	xbuckets  *bucket // mutex profile buckets
 	buckhash  *[179999]*bucket
 	bucketmem uintptr
+
+	mProf struct {
+		// All fields in mProf are protected by proflock.
+
+		// cycle is the global heap profile cycle. This wraps
+		// at mProfCycleWrap.
+		cycle uint32
+		// flushed indicates that future[cycle] in all buckets
+		// has been flushed to the active profile.
+		flushed bool
+	}
 )

+const mProfCycleWrap = uint32(len(memRecord{}.future)) * (2 << 24)
+
 // newBucket allocates a bucket with the given type and number of stack entries.
 func newBucket(typ bucketType, nstk int) *bucket {
 	size := unsafe.Sizeof(bucket{}) + uintptr(nstk)*unsafe.Sizeof(uintptr(0))
@@ -248,32 +268,64 @@ func eqslice(x, y []uintptr) bool {
 	return true
 }

-func mprof_GC() {
+// mProf_NextCycle publishes the next heap profile cycle and creates a
+// fresh heap profile cycle. This operation is fast and can be done
+// during STW. The caller must call mProf_Flush before calling
+// mProf_NextCycle again.
+//
+// This is called by mark termination during STW so allocations and
+// frees after the world is started again count towards a new heap
+// profiling cycle.
+func mProf_NextCycle() {
+	lock(&proflock)
+	// We explicitly wrap mProf.cycle rather than depending on
+	// uint wraparound because the memRecord.future ring does not
+	// itself wrap at a power of two.
+	mProf.cycle = (mProf.cycle + 1) % mProfCycleWrap
+	mProf.flushed = false
+	unlock(&proflock)
+}
+
+// mProf_Flush flushes the events from the current heap profiling
+// cycle into the active profile. After this it is safe to start a new
+// heap profiling cycle with mProf_NextCycle.
+//
+// This is called by GC after mark termination starts the world. In
+// contrast with mProf_NextCycle, this is somewhat expensive, but safe
+// to do concurrently.
+func mProf_Flush() {
+	lock(&proflock)
+	if !mProf.flushed {
+		mProf_FlushLocked()
+		mProf.flushed = true
+	}
+	unlock(&proflock)
+}
+
+func mProf_FlushLocked() {
+	c := mProf.cycle
 	for b := mbuckets; b != nil; b = b.allnext {
 		mp := b.mp()
-		mp.active.add(&mp.prev)
-		mp.prev = mp.recent
-		mp.recent = memRecordCycle{}
+		// Flush cycle C into the published profile and clear
+		// it for reuse.
+		mpc := &mp.future[c%uint32(len(mp.future))]
+		mp.active.add(mpc)
+		*mpc = memRecordCycle{}
 	}
 }

-// Record that a gc just happened: all the 'recent' statistics are now real.
-func mProf_GC() {
-	lock(&proflock)
-	mprof_GC()
-	unlock(&proflock)
-}
-
 // Called by malloc to record a profiled block.
 func mProf_Malloc(p unsafe.Pointer, size uintptr) {
 	var stk [maxStack]uintptr
 	nstk := callers(4, stk[:])
 	lock(&proflock)
 	b := stkbucket(memProfile, size, stk[:nstk], true)
+	c := mProf.cycle
 	mp := b.mp()
-	mp.recent.allocs++
-	mp.recent.alloc_bytes += size
+	mpc := &mp.future[(c+2)%uint32(len(mp.future))]
+	mpc.allocs++
+	mpc.alloc_bytes += size
 	unlock(&proflock)

 	// Setprofilebucket locks a bunch of other mutexes, so we call it outside of proflock.
@@ -288,9 +340,11 @@ func mProf_Malloc(p unsafe.Pointer, size uintptr) {
 // Called when freeing a profiled block.
 func mProf_Free(b *bucket, size uintptr) {
 	lock(&proflock)
+	c := mProf.cycle
 	mp := b.mp()
-	mp.prev.frees++
-	mp.prev.free_bytes += size
+	mpc := &mp.future[(c+1)%uint32(len(mp.future))]
+	mpc.frees++
+	mpc.free_bytes += size
 	unlock(&proflock)
 }

@@ -467,6 +521,10 @@ func (r *MemProfileRecord) Stack() []uintptr {
 // of calling MemProfile directly.
 func MemProfile(p []MemProfileRecord, inuseZero bool) (n int, ok bool) {
 	lock(&proflock)
+	// If we're between mProf_NextCycle and mProf_Flush, take care
+	// of flushing to the active profile so we only have to look
+	// at the active profile below.
+	mProf_FlushLocked()
 	clear := true
 	for b := mbuckets; b != nil; b = b.allnext {
 		mp := b.mp()
@@ -481,12 +539,14 @@ func MemProfile(p []MemProfileRecord, inuseZero bool) (n int, ok bool) {
 		// Absolutely no data, suggesting that a garbage collection
 		// has not yet happened. In order to allow profiling when
 		// garbage collection is disabled from the beginning of execution,
-		// accumulate stats as if a GC just happened, and recount buckets.
-		mprof_GC()
-		mprof_GC()
+		// accumulate all of the cycles, and recount buckets.
 		n = 0
 		for b := mbuckets; b != nil; b = b.allnext {
 			mp := b.mp()
+			for c := range mp.future {
+				mp.active.add(&mp.future[c])
+				mp.future[c] = memRecordCycle{}
+			}
 			if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes {
 				n++
 			}
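
The cycle arithmetic in the patch is easier to follow in isolation. Below is a minimal, self-contained Go sketch of the same three-slot scheme: allocations are charged to cycle C+2, sweep frees to cycle C+1, and a flush folds cycle C into the published totals. Every name in it (cycleStats, profile, recordAlloc, recordSweepFree, nextCycle, flush) is invented for illustration and is not a runtime API; only the indexing mirrors the diff, and details like mProfCycleWrap and the flushed flag are omitted.

// Toy model of the three-cycle heap profile accounting described above.
package main

import "fmt"

// cycleStats plays the role of memRecordCycle: event counts for a
// single heap profile cycle.
type cycleStats struct {
	allocs, frees int
}

// profile plays the role of memRecord: cumulative published totals
// plus a ring of not-yet-published cycles indexed by cycle number.
type profile struct {
	active cycleStats    // published profile (cumulative)
	future [3]cycleStats // cycles C, C+1, and C+2
}

// cycle is the global heap profile cycle C, like mProf.cycle.
var cycle uint32

// recordAlloc charges an allocation to cycle C+2, as mProf_Malloc does.
func (p *profile) recordAlloc() {
	p.future[(cycle+2)%3].allocs++
}

// recordSweepFree charges a GC free to cycle C+1, as mProf_Free does
// when called from the sweeper.
func (p *profile) recordSweepFree() {
	p.future[(cycle+1)%3].frees++
}

// nextCycle advances C, like mProf_NextCycle at mark termination
// (cheap enough to run with the world stopped).
func nextCycle() {
	cycle++
}

// flush folds the now-complete cycle C into the published totals and
// clears its slot for reuse, like mProf_Flush/mProf_FlushLocked after
// the world is restarted.
func (p *profile) flush() {
	slot := &p.future[cycle%3]
	p.active.allocs += slot.allocs
	p.active.frees += slot.frees
	*slot = cycleStats{}
}

func main() {
	var p profile

	// During cycle C=0: two allocations are charged to cycle 2 and
	// one sweep free is charged to cycle 1.
	p.recordAlloc()
	p.recordAlloc()
	p.recordSweepFree()

	// Mark termination: advance to C=1 while "stopped", so events
	// from here on are charged to cycles 2 and 3 and cannot leak
	// into the cycle about to be published.
	nextCycle()
	p.recordAlloc() // charged to cycle 3, not cycle 1

	// Concurrent flush of cycle 1: only the sweep free recorded
	// before nextCycle is published; everything else stays queued.
	p.flush()
	fmt.Printf("published after cycle 1: %+v\n", p.active)
	fmt.Printf("still pending: %+v\n", p.future)
}

Advancing the cycle with the world stopped and flushing afterwards is what the gcMarkTermination change relies on: events recorded after nextCycle land in cycles C+1 and C+2, so the concurrent flush of cycle C can never pick them up, which is how the patch keeps events from leaking into the wrong cycle.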