diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c
index 32e030c518..a60684168b 100644
--- a/src/pkg/runtime/mgc0.c
+++ b/src/pkg/runtime/mgc0.c
@@ -8,16 +8,19 @@
 #include "arch_GOARCH.h"
 #include "malloc.h"
 #include "stack.h"
+#include "mgc0.h"
 #include "race.h"
 
 enum {
 	Debug = 0,
 	DebugMark = 0,  // run second pass to check mark
-	DataBlock = 8*1024,
 
 	// Four bits per word (see #defines below).
 	wordsPerBitmapWord = sizeof(void*)*8/4,
 	bitShift = sizeof(void*)*8/4,
+
+	handoffThreshold = 4,
+	IntermediateBufferCapacity = 64,
 };
 
 // Bits in per-word bitmap.
@@ -70,12 +73,24 @@ uint32 runtime·worldsema = 1;
 
 static int32	gctrace;
 
+typedef struct Obj Obj;
+struct Obj
+{
+	byte	*p;	// data pointer
+	uintptr	n;	// size of data in bytes
+	uintptr	ti;	// type info
+};
+
+// The size of Workbuf is N*PageSize.
 typedef struct Workbuf Workbuf;
 struct Workbuf
 {
-	LFNode node; // must be first
+#define SIZE (2*PageSize-sizeof(LFNode)-sizeof(uintptr))
+	LFNode	node; // must be first
 	uintptr	nobj;
-	byte *obj[512-(sizeof(LFNode)+sizeof(uintptr))/sizeof(byte*)];
+	Obj	obj[SIZE/sizeof(Obj) - 1];
+	uint8	_padding[SIZE%sizeof(Obj) + sizeof(Obj)];
+#undef SIZE
 };
 
 typedef struct Finalizer Finalizer;
@@ -97,9 +112,13 @@ struct FinBlock
 };
 
 extern byte data[];
-extern byte etext[];
+extern byte edata[];
+extern byte bss[];
 extern byte ebss[];
 
+extern byte gcdata[];
+extern byte gcbss[];
+
 static G *fing;
 static FinBlock	*finq;	// list of finalizers that are to be executed
 static FinBlock	*finc;	// cache of free blocks
@@ -113,13 +132,6 @@ static Workbuf* getfull(Workbuf*);
 static void	putempty(Workbuf*);
 static Workbuf* handoff(Workbuf*);
 
-typedef struct GcRoot GcRoot;
-struct GcRoot
-{
-	byte *p;
-	uintptr n;
-};
-
 static struct {
 	uint64	full;  // lock-free list of full blocks
 	uint64	empty; // lock-free list of empty blocks
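
To make the Workbuf arithmetic in the hunk above concrete: with 4 KB pages and 8-byte words, SIZE works out to 2*4096-16-8 = 8168 bytes, so obj[] gets 339 slots and the _padding array restores the struct to exactly two pages. A standalone sketch of that arithmetic (the LFNode and Obj layouts here are illustrative stand-ins, not the runtime's definitions):

    #include <stdio.h>

    /* Illustrative stand-ins: the real LFNode, PageSize and integer widths
       come from the runtime headers and may differ. */
    enum { PageSize = 4096 };
    typedef struct { void *next; unsigned long pushcnt; } LFNode; /* 16 bytes here */
    typedef struct { char *p; unsigned long n, ti; } Obj;         /* 24 bytes here */

    int
    main(void)
    {
        unsigned long size, entries, padding;

        /* SIZE = 2*PageSize - sizeof(LFNode) - sizeof(uintptr) = 8192-16-8 = 8168 */
        size = 2*PageSize - sizeof(LFNode) - sizeof(unsigned long);
        entries = size/sizeof(Obj) - 1;            /* 339 under these assumptions */
        padding = size%sizeof(Obj) + sizeof(Obj);  /* 8 + 24 = 32 */

        /* node, nobj, obj[] and _padding add back up to exactly 2*PageSize,
           which the new scanblock verifies at startup. */
        printf("entries=%lu padding=%lu total=%lu\n", entries, padding,
                sizeof(LFNode) + sizeof(unsigned long)
                + entries*sizeof(Obj) + padding);
        return 0;
    }
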
@@ -136,77 +148,122 @@ static struct {
 	byte	*chunk;
 	uintptr	nchunk;
 
-	GcRoot	*roots;
+	Obj	*roots;
 	uint32	nroot;
 	uint32	rootcap;
 } work;
 
-// scanblock scans a block of n bytes starting at pointer b for references
-// to other objects, scanning any it finds recursively until there are no
-// unscanned objects left. Instead of using an explicit recursion, it keeps
-// a work list in the Workbuf* structures and loops in the main function
-// body. Keeping an explicit work list is easier on the stack allocator and
-// more efficient.
-static void
-scanblock(byte *b, uintptr n)
+enum {
+	// TODO(atom): to be expanded in a next CL
+	GC_DEFAULT_PTR = GC_NUM_INSTR,
+};
+
+// PtrTarget and BitTarget are structures used by intermediate buffers.
+// The intermediate buffers hold GC data before it
+// is moved/flushed to the work buffer (Workbuf).
+// The size of an intermediate buffer is very small,
+// such as 32 or 64 elements.
+struct PtrTarget
 {
-	byte *obj, *arena_start, *arena_used, *p;
-	void **vp;
-	uintptr size, *bitp, bits, shift, i, j, x, xbits, off, nobj, nproc;
+	void *p;
+	uintptr ti;
+};
+
+struct BitTarget
+{
+	void *p;
+	uintptr ti;
+	uintptr *bitp, shift;
+};
+
+struct BufferList
+{
+	struct PtrTarget ptrtarget[IntermediateBufferCapacity];
+	struct BitTarget bittarget[IntermediateBufferCapacity];
+	struct BufferList *next;
+};
+static struct BufferList *bufferList;
+
+static Lock lock;
+
+// flushptrbuf moves data from the PtrTarget buffer to the work buffer.
+// The PtrTarget buffer contains blocks irrespective of whether the blocks have been marked or scanned,
+// while the work buffer contains blocks which have been marked
+// and are prepared to be scanned by the garbage collector.
+//
+// _wp, _wbuf, _nobj are input/output parameters; they specify the work buffer.
+// bitbuf holds temporary data generated by this function.
+//
+// A simplified drawing explaining how the todo list moves from one structure to another:
+//
+//     scanblock
+//  (find pointers)
+//    Obj ------> PtrTarget (pointer targets)
+//     ↑          |
+//     |          | flushptrbuf (1st part,
+//     |          | find block start)
+//     |          ↓
+//     `--------- BitTarget (pointer targets and the corresponding locations in bitmap)
+//  flushptrbuf
+//  (2nd part, mark and enqueue)
+static void
+flushptrbuf(struct PtrTarget *ptrbuf, uintptr n, Obj **_wp, Workbuf **_wbuf, uintptr *_nobj, struct BitTarget *bitbuf)
+{
+	byte *p, *arena_start, *obj;
+	uintptr size, *bitp, bits, shift, j, x, xbits, off, nobj, ti;
 	MSpan *s;
 	PageID k;
-	void **wp;
+	Obj *wp;
 	Workbuf *wbuf;
-	bool keepworking;
+	struct PtrTarget *ptrbuf_end;
+	struct BitTarget *bitbufpos, *bt;
 
-	if((intptr)n < 0) {
-		runtime·printf("scanblock %p %D\n", b, (int64)n);
-		runtime·throw("scanblock");
-	}
-
-	// Memory arena parameters.
 	arena_start = runtime·mheap.arena_start;
-	arena_used = runtime·mheap.arena_used;
-	nproc = work.nproc;
 
-	wbuf = nil;  // current work buffer
-	wp = nil;  // storage for next queued pointer (write pointer)
-	nobj = 0;  // number of queued objects
+	wp = *_wp;
+	wbuf = *_wbuf;
+	nobj = *_nobj;
 
-	// Scanblock helpers pass b==nil.
-	// Procs needs to return to make more
-	// calls to scanblock.  But if work.nproc==1 then
-	// might as well process blocks as soon as we
-	// have them.
-	keepworking = b == nil || work.nproc == 1;
+	ptrbuf_end = ptrbuf + n;
 
-	// Align b to a word boundary.
-	off = (uintptr)b & (PtrSize-1);
-	if(off != 0) {
-		b += PtrSize - off;
-		n -= PtrSize - off;
+	// If buffer is nearly full, get a new one.
+	if(wbuf == nil || nobj+n >= nelem(wbuf->obj)) {
+		if(wbuf != nil)
+			wbuf->nobj = nobj;
+		wbuf = getempty(wbuf);
+		wp = wbuf->obj;
+		nobj = 0;
+
+		if(n >= nelem(wbuf->obj))
+			runtime·throw("ptrbuf has to be smaller than WorkBuf");
 	}
 
-	for(;;) {
-		// Each iteration scans the block b of length n, queueing pointers in
-		// the work buffer.
-		if(Debug > 1)
-			runtime·printf("scanblock %p %D\n", b, (int64)n);
+	// TODO(atom): This block is a branch of an if-then-else statement.
+	// The single-threaded branch may be added in a next CL.
+	{
+		// Multi-threaded version.
 
-		vp = (void**)b;
-		n >>= (2+PtrSize/8);	/* n /= PtrSize (4 or 8) */
-		for(i=0; i<n; i++) {
-			obj = (byte*)vp[i];
-
-			// Words outside the arena cannot be pointers.
-			if((byte*)obj < arena_start || (byte*)obj >= arena_used)
-				continue;
+		bitbufpos = bitbuf;
+
+		while(ptrbuf < ptrbuf_end) {
+			obj = ptrbuf->p;
+			ti = ptrbuf->ti;
+			ptrbuf++;
+
+			// obj belongs to interval [mheap.arena_start, mheap.arena_used).
+			if(Debug > 1) {
+				if(obj < runtime·mheap.arena_start || obj >= runtime·mheap.arena_used)
+					runtime·throw("object is outside of mheap");
+			}
 
 			// obj may be a pointer to a live object.
 			// Try to find the beginning of the object.
 
 			// Round down to word boundary.
-			obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1));
+			if(((uintptr)obj & ((uintptr)PtrSize-1)) != 0) {
+				obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1));
+				ti = 0;
+			}
 
 			// Find bits for this word.
 			off = (uintptr*)obj - (uintptr*)arena_start;
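
The drawing in flushptrbuf's comment describes a two-phase flush: resolve pointer targets to block starts first, then take the lock once to mark and enqueue the whole batch. A toy model of that flow, with an invented 16-object heap and a byte-per-object mark array standing in for the real four-bits-per-word bitmap:

    #include <stdio.h>

    enum { NOBJ = 16, OBJSIZE = 32 };   /* invented toy heap geometry */

    static char heap[NOBJ*OBJSIZE];
    static char marked[NOBJ];           /* stands in for the mark bitmap */

    int
    main(void)
    {
        /* interior pointers collected by the scan (PtrTarget contents) */
        void *ptrbuf[4] = { heap+5, heap+70, heap+70, heap+480 };
        int bitbuf[4], nbit = 0, i, idx;

        /* Phase 1: resolve each target to its block start (lock-free). */
        for(i = 0; i < 4; i++) {
            idx = (int)(((char*)ptrbuf[i] - heap) / OBJSIZE);
            bitbuf[nbit++] = idx;
        }

        /* Phase 2: mark and enqueue the whole batch; in flushptrbuf this
           is the only part that runs under the lock, and duplicates are
           filtered by the already-marked check. */
        for(i = 0; i < nbit; i++) {
            if(marked[bitbuf[i]])
                continue;
            marked[bitbuf[i]] = 1;
            printf("enqueue object %d\n", bitbuf[i]);
        }
        return 0;
    }
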
@@ -219,6 +276,8 @@ scanblock(byte *b, uintptr n)
 			if((bits & (bitAllocated|bitBlockBoundary)) != 0)
 				goto found;
 
+			ti = 0;
+
 			// Pointing just past the beginning?
 			// Scan backward a little to find a block boundary.
 			for(j=shift; j-->0; ) {
@@ -239,13 +298,13 @@ scanblock(byte *b, uintptr n)
 			s = runtime·mheap.map[x];
 			if(s == nil || k < s->start || k - s->start >= s->npages || s->state != MSpanInUse)
 				continue;
-			p =  (byte*)((uintptr)s->start<<PageShift);
+			p = (byte*)((uintptr)s->start<<PageShift);
 			if(s->sizeclass == 0) {
 				obj = p;
 			} else {
 				if((byte*)obj >= (byte*)s->limit)
 					continue;
-				size = runtime·class_to_size[s->sizeclass];
+				size = s->elemsize;
 				int32 i = ((byte*)obj - p)/size;
 				obj = p+i*size;
 			}
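
The interior-pointer rounding in the hunk above is plain index arithmetic once the span base p and the element size are known. A worked example with invented numbers (a span mapped at 0x100000 holding 48-byte objects):

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        uintptr_t p = 0x100000;           /* (uintptr)s->start << PageShift */
        uintptr_t size = 48;              /* s->elemsize */
        uintptr_t obj = p + 3*size + 20;  /* interior pointer into object 3 */
        int i;

        i = (int)((obj - p) / size);      /* int32 i = ((byte*)obj - p)/size; */
        obj = p + i*size;                 /* obj = p+i*size; */
        printf("object %d starts at %#lx\n", i, (unsigned long)obj);
        return 0;
    }
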
@@ -258,81 +317,203 @@ scanblock(byte *b, uintptr n)
 			bits = xbits >> shift;
 
 		found:
-			// If another proc wants a pointer, give it some.
-			if(work.nwait > 0 && nobj > 4 && work.full == 0) {
-				wbuf->nobj = nobj;
-				wbuf = handoff(wbuf);
-				nobj = wbuf->nobj;
-				wp = wbuf->obj + nobj;
-			}
-
 			// Now we have bits, bitp, and shift correct for
 			// obj pointing at the base of the object.
 			// Only care about allocated and not marked.
 			if((bits & (bitAllocated|bitMarked)) != bitAllocated)
 				continue;
-			if(nproc == 1)
-				*bitp |= bitMarked<<shift;
-			else {
-				for(;;) {
-					x = *bitp;
-					if(x & (bitMarked<<shift))
-						goto continue_obj;
-					if(runtime·casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift))))
-						break;
-				}
-			}
+
+			*bitbufpos = (struct BitTarget){obj, ti, bitp, shift};
+			bitbufpos++;
+		}
+
+		runtime·lock(&lock);
+
+		for(bt=bitbuf; bt<bitbufpos; bt++){
+			xbits = *bt->bitp;
+			bits = xbits >> bt->shift;
+			if((bits & bitMarked) != 0)
+				continue;
+
+			// Mark the block
+			*bt->bitp = xbits | (bitMarked << bt->shift);
 
 			// If object has no pointers, don't need to scan further.
 			if((bits & bitNoPointers) != 0)
 				continue;
 
+			obj = bt->p;
+
+			// Ask span about size class.
+			// (Manually inlined copy of MHeap_Lookup.)
+			x = (uintptr)obj >> PageShift;
+			if(sizeof(void*) == 8)
+				x -= (uintptr)arena_start>>PageShift;
+			s = runtime·mheap.map[x];
+
 			PREFETCH(obj);
 
-			// If buffer is full, get a new one.
-			if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
-				if(wbuf != nil)
-					wbuf->nobj = nobj;
-				wbuf = getempty(wbuf);
-				wp = wbuf->obj;
-				nobj = 0;
-			}
-			*wp++ = obj;
+			*wp = (Obj){obj, s->elemsize, bt->ti};
+			wp++;
 			nobj++;
-		continue_obj:;
 		}
+		runtime·unlock(&lock);
+
+		// If another proc wants a pointer, give it some.
+		if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) {
+			wbuf->nobj = nobj;
+			wbuf = handoff(wbuf);
+			nobj = wbuf->nobj;
+			wp = wbuf->obj + nobj;
+		}
+	}
+
+	*_wp = wp;
+	*_wbuf = wbuf;
+	*_nobj = nobj;
+}
+
+// Program that scans the whole block and treats every block element as a potential pointer.
+static uintptr defaultProg[2] = {PtrSize, GC_DEFAULT_PTR};
+
+// scanblock scans a block of n bytes starting at pointer b for references
+// to other objects, scanning any it finds recursively until there are no
+// unscanned objects left. Instead of using an explicit recursion, it keeps
+// a work list in the Workbuf* structures and loops in the main function
+// body. Keeping an explicit work list is easier on the stack allocator and
+// more efficient.
+//
+// wbuf: current work buffer
+// wp:   storage for next queued pointer (write pointer)
+// nobj: number of queued objects
+static void
+scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking)
+{
+	byte *b, *arena_start, *arena_used;
+	uintptr n, i, end_b;
+	void *obj;
+
+	// TODO(atom): to be expanded in a next CL
+	struct Frame {uintptr count, b; uintptr *loop_or_ret;};
+	struct Frame stack_top;
+
+	uintptr *pc;
+
+	struct BufferList *scanbuffers;
+	struct PtrTarget *ptrbuf, *ptrbuf_end;
+	struct BitTarget *bitbuf;
+
+	struct PtrTarget *ptrbufpos;
+
+	// End of local variable declarations.
+
+	if(sizeof(Workbuf) % PageSize != 0)
+		runtime·throw("scanblock: size of Workbuf is suboptimal");
+
+	// Memory arena parameters.
+	arena_start = runtime·mheap.arena_start;
+	arena_used = runtime·mheap.arena_used;
+
+	// Allocate ptrbuf, bitbuf
+	{
+		runtime·lock(&lock);
+
+		if(bufferList == nil) {
+			bufferList = runtime·SysAlloc(sizeof(*bufferList));
+			bufferList->next = nil;
+		}
+		scanbuffers = bufferList;
+		bufferList = bufferList->next;
+
+		ptrbuf = &scanbuffers->ptrtarget[0];
+		ptrbuf_end = &scanbuffers->ptrtarget[0] + nelem(scanbuffers->ptrtarget);
+		bitbuf = &scanbuffers->bittarget[0];
+
+		runtime·unlock(&lock);
+	}
+
+	ptrbufpos = ptrbuf;
+
+	goto next_block;
+
+	for(;;) {
+		// Each iteration scans the block b of length n, queueing pointers in
+		// the work buffer.
+		if(Debug > 1) {
+			runtime·printf("scanblock %p %D\n", b, (int64)n);
+		}
+
+		// TODO(atom): to be replaced in a next CL
+		pc = defaultProg;
+
+		pc++;
+		stack_top.b = (uintptr)b;
+
+		end_b = (uintptr)b + n - PtrSize;
+
+	next_instr:
+		// TODO(atom): to be expanded in a next CL
+		switch(pc[0]) {
+		case GC_DEFAULT_PTR:
+			while(true) {
+				i = stack_top.b;
+				if(i > end_b)
+					goto next_block;
+				stack_top.b += PtrSize;
+
+				obj = *(byte**)i;
+				if(obj >= arena_start && obj < arena_used) {
+					*ptrbufpos = (struct PtrTarget){obj, 0};
+					ptrbufpos++;
+					if(ptrbufpos == ptrbuf_end)
+						goto flush_buffers;
+				}
+			}
+
+		default:
+			runtime·throw("scanblock: invalid GC instruction");
+			return;
+		}
+
+	flush_buffers:
+		flushptrbuf(ptrbuf, ptrbufpos-ptrbuf, &wp, &wbuf, &nobj, bitbuf);
+		ptrbufpos = ptrbuf;
+		goto next_instr;
+
+	next_block:
 		// Done scanning [b, b+n).  Prepare for the next iteration of
-		// the loop by setting b and n to the parameters for the next block.
+		// the loop by setting b, n to the parameters for the next block.
+
+		if(nobj == 0) {
+			flushptrbuf(ptrbuf, ptrbufpos-ptrbuf, &wp, &wbuf, &nobj, bitbuf);
+			ptrbufpos = ptrbuf;
+
+			if(nobj == 0) {
+				if(!keepworking) {
+					if(wbuf)
+						putempty(wbuf);
+					goto endscan;
+				}
+				// Emptied our buffer: refill.
+				wbuf = getfull(wbuf);
+				if(wbuf == nil)
+					goto endscan;
+				nobj = wbuf->nobj;
+				wp = wbuf->obj + wbuf->nobj;
+			}
+		}
 
 		// Fetch b from the work buffer.
-		if(nobj == 0) {
-			if(!keepworking) {
-				if(wbuf)
-					putempty(wbuf);
-				return;
-			}
-			// Emptied our buffer: refill.
-			wbuf = getfull(wbuf);
-			if(wbuf == nil)
-				return;
-			nobj = wbuf->nobj;
-			wp = wbuf->obj + wbuf->nobj;
-		}
-		b = *--wp;
+		--wp;
+		b = wp->p;
+		n = wp->n;
 		nobj--;
-
-		// Ask span about size class.
-		// (Manually inlined copy of MHeap_Lookup.)
-		x = (uintptr)b>>PageShift;
-		if(sizeof(void*) == 8)
-			x -= (uintptr)arena_start>>PageShift;
-		s = runtime·mheap.map[x];
-		if(s->sizeclass == 0)
-			n = s->npages<<PageShift;
-		else
-			n = runtime·class_to_size[s->sizeclass];
 	}
+
+endscan:
+	runtime·lock(&lock);
+	scanbuffers->next = bufferList;
+	bufferList = scanbuffers;
+	runtime·unlock(&lock);
 }
 
 // debug_scanblock is the debug copy of scanblock.
@@ -379,13 +560,12 @@ debug_scanblock(byte *b, uintptr n)
 			continue;
 
 		p = (byte*)((uintptr)s->start<<PageShift);
+		size = s->elemsize;
 		if(s->sizeclass == 0) {
 			obj = p;
-			size = (uintptr)s->npages<<PageShift;
 		} else {
 			if((byte*)obj >= (byte*)s->limit)
 				continue;
-			size = runtime·class_to_size[s->sizeclass];
 			int32 i = ((byte*)obj - p)/size;
 			obj = p+i*size;
 		}
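
scanblock's comment above promises recursion-free scanning: a block is popped with --wp, and anything it references goes back through the buffers instead of a recursive call. The same discipline in miniature (Node and the fixed-size array are invented stand-ins for Obj and the chained Workbufs; the point is the pop/push loop, not the types):

    #include <stdio.h>

    typedef struct Node Node;
    struct Node {
        Node *left, *right;
        int mark, id;
    };

    int
    main(void)
    {
        Node d = {0, 0, 0, 3}, c = {0, 0, 0, 2};
        Node b = {&d, 0, 0, 1}, a = {&b, &c, 0, 0};
        Node *work[16], *n;
        int nobj = 0;

        work[nobj++] = &a;              /* the root, as markroot enqueues it */
        while(nobj > 0) {
            n = work[--nobj];           /* like "--wp; b = wp->p" above */
            if(n->mark)
                continue;
            n->mark = 1;
            printf("scan node %d\n", n->id);
            if(n->left)
                work[nobj++] = n->left;   /* enqueue instead of recursing */
            if(n->right)
                work[nobj++] = n->right;
        }
        return 0;
    }
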
@@ -414,11 +594,74 @@ debug_scanblock(byte *b, uintptr n)
 	}
 }
 
+// Append obj to the work buffer.
+// _wbuf, _wp, _nobj are input/output parameters; they specify the work buffer.
+static void
+enqueue(Obj obj, Workbuf **_wbuf, Obj **_wp, uintptr *_nobj)
+{
+	uintptr nobj, off;
+	Obj *wp;
+	Workbuf *wbuf;
+
+	if(Debug > 1)
+		runtime·printf("append obj(%p %D %p)\n", obj.p, (int64)obj.n, obj.ti);
+
+	// Align obj.p to a word boundary.
+	off = (uintptr)obj.p & (PtrSize-1);
+	if(off != 0) {
+		obj.p += PtrSize - off;
+		obj.n -= PtrSize - off;
+		obj.ti = 0;
+	}
+
+	if(obj.p == nil || obj.n == 0)
+		return;
+
+	// Load work buffer state
+	wp = *_wp;
+	wbuf = *_wbuf;
+	nobj = *_nobj;
+
+	// If another proc wants a pointer, give it some.
+	if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) {
+		wbuf->nobj = nobj;
+		wbuf = handoff(wbuf);
+		nobj = wbuf->nobj;
+		wp = wbuf->obj + nobj;
+	}
+
+	// If buffer is full, get a new one.
+	if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
+		if(wbuf != nil)
+			wbuf->nobj = nobj;
+		wbuf = getempty(wbuf);
+		wp = wbuf->obj;
+		nobj = 0;
+	}
+
+	*wp = obj;
+	wp++;
+	nobj++;
+
+	// Save work buffer state
+	*_wp = wp;
+	*_wbuf = wbuf;
+	*_nobj = nobj;
+}
+
 static void
 markroot(ParFor *desc, uint32 i)
 {
+	Obj *wp;
+	Workbuf *wbuf;
+	uintptr nobj;
+
 	USED(&desc);
-	scanblock(work.roots[i].p, work.roots[i].n);
+	wp = nil;
+	wbuf = nil;
+	nobj = 0;
+	enqueue(work.roots[i], &wbuf, &wp, &nobj);
+	scanblock(wbuf, wp, nobj, false);
 }
 
 // Get an empty work buffer off the work.empty list,
@@ -508,25 +751,24 @@ handoff(Workbuf *b)
 }
 
 static void
-addroot(byte *p, uintptr n)
+addroot(Obj obj)
 {
 	uint32 cap;
-	GcRoot *new;
+	Obj *new;
 
 	if(work.nroot >= work.rootcap) {
-		cap = PageSize/sizeof(GcRoot);
+		cap = PageSize/sizeof(Obj);
 		if(cap < 2*work.rootcap)
 			cap = 2*work.rootcap;
-		new = (GcRoot*)runtime·SysAlloc(cap*sizeof(GcRoot));
+		new = (Obj*)runtime·SysAlloc(cap*sizeof(Obj));
 		if(work.roots != nil) {
-			runtime·memmove(new, work.roots, work.rootcap*sizeof(GcRoot));
-			runtime·SysFree(work.roots, work.rootcap*sizeof(GcRoot));
+			runtime·memmove(new, work.roots, work.rootcap*sizeof(Obj));
+			runtime·SysFree(work.roots, work.rootcap*sizeof(Obj));
 		}
 		work.roots = new;
 		work.rootcap = cap;
 	}
-	work.roots[work.nroot].p = p;
-	work.roots[work.nroot].n = n;
+	work.roots[work.nroot] = obj;
 	work.nroot++;
 }
 
@@ -570,7 +812,7 @@ addstackroots(G *gp)
 			runtime·printf("scanstack inconsistent: g%D#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk);
 			runtime·throw("scanstack");
 		}
-		addroot(sp, (byte*)stk - sp);
+		addroot((Obj){sp, (byte*)stk - sp, 0});
 		sp = (byte*)stk->gobuf.sp;
 		guard = stk->stackguard;
 		stk = (Stktop*)stk->stackbase;
@@ -588,7 +830,7 @@ addfinroots(void *v)
 		runtime·throw("mark - finalizer inconsistency");
 
 	// do not mark the finalizer block itself.  just mark the things it points at.
-	addroot(v, size);
+	addroot((Obj){v, size, 0});
 }
 
 static void
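
Every root added in the hunks above eventually passes through the word-alignment fix-up at the top of enqueue. A small standalone demonstration, with an invented 3-byte misalignment:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint64_t storage[8];               /* word-aligned backing store */
        char *p = (char*)storage + 3;      /* deliberately misaligned base */
        uintptr_t n = 40, off, PtrSize = sizeof(void*);

        off = (uintptr_t)p & (PtrSize-1);
        if(off != 0) {
            p += PtrSize - off;            /* round base up to a word boundary */
            n -= PtrSize - off;            /* shrink the length to match */
            /* enqueue also clears obj.ti here: the type info described the
               original base, so it no longer applies to the shifted one */
        }
        printf("base moved %lu bytes, %lu bytes left to scan\n",
            (unsigned long)(p - ((char*)storage + 3)), (unsigned long)n);
        return 0;
    }
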
@@ -596,15 +838,15 @@ addroots(void)
 {
 	G *gp;
 	FinBlock *fb;
-	byte *p;
 	MSpan *s, **allspans;
 	uint32 spanidx;
 
 	work.nroot = 0;
 
-	// mark data+bss.
-	for(p=data; p<ebss; p+=DataBlock)
-		addroot(p, p+DataBlock < ebss ? DataBlock : ebss-p);
+	// data & bss
+	// TODO(atom): load balancing
+	addroot((Obj){data, edata - data, (uintptr)gcdata});
+	addroot((Obj){bss, ebss - bss, (uintptr)gcbss});
 
 	// MSpan.types
 	allspans = runtime·mheap.allspans;
@@ -617,12 +859,14 @@ addroots(void)
 				break;
 			case MTypes_Words:
 			case MTypes_Bytes:
-				addroot((byte*)&s->types.data, sizeof(void*));
+				// TODO(atom): consider using defaultProg instead of 0
+				addroot((Obj){(byte*)&s->types.data, sizeof(void*), 0});
 				break;
 			}
 		}
 	}
 
+	// stacks
 	for(gp=runtime·allg; gp!=nil; gp=gp->alllink) {
 		switch(gp->status){
 		default:
@@ -646,7 +890,7 @@ addroots(void)
 	runtime·walkfintab(addfinroots);
 
 	for(fb=allfin; fb; fb=fb->alllink)
-		addroot((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]));
+		addroot((Obj){(byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]), 0});
 }
 
 static bool
@@ -887,8 +1131,9 @@ runtime·gchelper(void)
 {
 	// parallel mark for over gc roots
 	runtime·parfordo(work.markfor);
+
 	// help other threads scan secondary blocks
-	scanblock(nil, 0);
+	scanblock(nil, nil, 0, true);
 
 	if(DebugMark) {
 		// wait while the main thread executes mark(debug_scanblock)
@@ -1050,26 +1295,27 @@ gc(struct gc_args *args)
 		obj0 = mstats.nmalloc - mstats.nfree;
 	}
 
+	m->locks++;	// disable gc during mallocs in parforalloc
+	if(work.markfor == nil)
+		work.markfor = runtime·parforalloc(MaxGcproc);
+	if(work.sweepfor == nil)
+		work.sweepfor = runtime·parforalloc(MaxGcproc);
+	m->locks--;
+
 	work.nwait = 0;
 	work.ndone = 0;
 	work.debugmarkdone = 0;
 	work.nproc = runtime·gcprocs();
 	addroots();
-	m->locks++;	// disable gc during mallocs in parforalloc
-	if(work.markfor == nil)
-		work.markfor = runtime·parforalloc(MaxGcproc);
 	runtime·parforsetup(work.markfor, work.nproc, work.nroot, nil, false, markroot);
-	if(work.sweepfor == nil)
-		work.sweepfor = runtime·parforalloc(MaxGcproc);
 	runtime·parforsetup(work.sweepfor, work.nproc, runtime·mheap.nspan, nil, true, sweepspan);
-	m->locks--;
 
 	if(work.nproc > 1) {
 		runtime·noteclear(&work.alldone);
 		runtime·helpgc(work.nproc);
 	}
 
 	runtime·parfordo(work.markfor);
-	scanblock(nil, 0);
+	scanblock(nil, nil, 0, true);
 
 	if(DebugMark) {
 		for(i=0; i<work.nroot; i++)
 			debug_scanblock(work.roots[i].p, work.roots[i].n);
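
The handoffThreshold guard now shared by flushptrbuf and enqueue is a simple work-sharing policy: donate part of a buffer only when another worker is idle, no full buffer is available globally, and there is enough local work to be worth splitting. A toy rendering of the policy (plain ints stand in for work.nwait and work.full; the real handoff() moves half of the entries into a fresh Workbuf):

    #include <stdio.h>

    enum { handoffThreshold = 4 };

    int
    main(void)
    {
        int nobj = 10;   /* objects queued locally */
        int nwait = 2;   /* idle helpers, like work.nwait */
        int full = 0;    /* no full buffers available, like work.full == 0 */

        if(nwait > 0 && nobj > handoffThreshold && full == 0) {
            int give = nobj/2;   /* split the buffer, keep half */
            nobj -= give;
            printf("handed off %d objects, kept %d\n", give, nobj);
        }
        return 0;
    }
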