diff --git a/src/pkg/runtime/darwin/386/sys.s b/src/pkg/runtime/darwin/386/sys.s index 87fbdbb79e..15eaf93bc3 100644 --- a/src/pkg/runtime/darwin/386/sys.s +++ b/src/pkg/runtime/darwin/386/sys.s @@ -97,7 +97,7 @@ TEXT runtime·sigtramp(SB),7,$40 // save g MOVL g(CX), DI MOVL DI, 20(SP) - + // g = m->gsignal MOVL m(CX), BP MOVL m_gsignal(BP), BP @@ -111,7 +111,7 @@ TEXT runtime·sigtramp(SB),7,$40 MOVL context+16(FP), BX MOVL BX, 8(SP) MOVL DI, 12(SP) - + MOVL handler+0(FP), BX CALL BX @@ -138,6 +138,26 @@ TEXT runtime·sigaltstack(SB),7,$0 CALL runtime·notok(SB) RET +TEXT runtime·usleep(SB),7,$32 + MOVL $0, DX + MOVL usec+0(FP), AX + MOVL $1000000, CX + DIVL CX + MOVL AX, 24(SP) // sec + MOVL DX, 28(SP) // usec + + // select(0, 0, 0, 0, &tv) + MOVL $0, 0(SP) // "return PC" - ignored + MOVL $0, 4(SP) + MOVL $0, 8(SP) + MOVL $0, 12(SP) + MOVL $0, 16(SP) + LEAL 24(SP), AX + MOVL AX, 20(SP) + MOVL $93, AX + INT $0x80 + RET + // void bsdthread_create(void *stk, M *m, G *g, void (*fn)(void)) // System call args are: func arg stack pthread flags. TEXT runtime·bsdthread_create(SB),7,$32 @@ -309,3 +329,12 @@ TEXT runtime·setldt(SB),7,$32 XORL AX, AX MOVW GS, AX RET + +TEXT runtime·sysctl(SB),7,$0 + MOVL $202, AX + INT $0x80 + JAE 3(PC) + NEGL AX + RET + MOVL $0, AX + RET diff --git a/src/pkg/runtime/darwin/amd64/sys.s b/src/pkg/runtime/darwin/amd64/sys.s index 8d1b20f118..7c79f18c49 100644 --- a/src/pkg/runtime/darwin/amd64/sys.s +++ b/src/pkg/runtime/darwin/amd64/sys.s @@ -81,11 +81,11 @@ TEXT runtime·sigaction(SB),7,$0 TEXT runtime·sigtramp(SB),7,$64 get_tls(BX) - + // save g MOVQ g(BX), R10 MOVQ R10, 48(SP) - + // g = m->gsignal MOVQ m(BX), BP MOVQ m_gsignal(BP), BP @@ -146,6 +146,24 @@ TEXT runtime·sigaltstack(SB),7,$0 CALL runtime·notok(SB) RET +TEXT runtime·usleep(SB),7,$16 + MOVL $0, DX + MOVL usec+0(FP), AX + MOVL $1000000, CX + DIVL CX + MOVQ AX, 0(SP) // sec + MOVL DX, 8(SP) // usec + + // select(0, 0, 0, 0, &tv) + MOVL $0, DI + MOVL $0, SI + MOVL $0, DX + MOVL $0, R10 + MOVQ SP, R8 + MOVL $(0x2000000+93), AX + SYSCALL + RET + // void bsdthread_create(void *stk, M *m, G *g, void (*fn)(void)) TEXT runtime·bsdthread_create(SB),7,$0 // Set up arguments to bsdthread_create system call.
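Both Darwin usleep stubs above encode the same idea: split the microsecond count into a timeval and invoke select(2) with zero file descriptors, so only the timeout matters. A rough C equivalent of what the assembly does (the helper name is illustrative, not part of the runtime):

#include <sys/select.h>

// DIVL leaves the quotient in AX (whole seconds) and the
// remainder in DX (leftover microseconds); the stubs store
// those two values into a timeval and give select nothing
// to wait on except the timeout.
static void
usleep_via_select(unsigned int usec)
{
    struct timeval tv;

    tv.tv_sec = usec / 1000000;
    tv.tv_usec = usec % 1000000;
    select(0, 0, 0, 0, &tv);    // zero fds: a pure timed sleep
}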
@@ -189,7 +207,7 @@ TEXT runtime·bsdthread_start(SB),7,$0 POPQ SI POPQ CX POPQ DX - + get_tls(BX) MOVQ CX, m(BX) MOVQ SI, m_procid(CX) // thread port is m->procid @@ -293,3 +311,18 @@ TEXT runtime·settls(SB),7,$32 MOVL $(0x3000000+3), AX // thread_fast_set_cthread_self - machdep call #3 SYSCALL RET + +TEXT runtime·sysctl(SB),7,$0 + MOVQ 8(SP), DI + MOVL 16(SP), SI + MOVQ 24(SP), DX + MOVQ 32(SP), R10 + MOVQ 40(SP), R8 + MOVQ 48(SP), R9 + MOVL $(0x2000000+202), AX // syscall entry + SYSCALL + JCC 3(PC) + NEGL AX + RET + MOVL $0, AX + RET diff --git a/src/pkg/runtime/darwin/os.h b/src/pkg/runtime/darwin/os.h index db3c2e8a7c..37160f779c 100644 --- a/src/pkg/runtime/darwin/os.h +++ b/src/pkg/runtime/darwin/os.h @@ -18,6 +18,7 @@ uint32 runtime·mach_task_self(void); uint32 runtime·mach_task_self(void); uint32 runtime·mach_thread_self(void); uint32 runtime·mach_thread_self(void); +int32 runtime·sysctl(uint32*, uint32, byte*, uintptr*, byte*, uintptr); struct Sigaction; void runtime·sigaction(uintptr, struct Sigaction*, struct Sigaction*); diff --git a/src/pkg/runtime/darwin/thread.c b/src/pkg/runtime/darwin/thread.c index 6733e815e8..c5d8ba4d3e 100644 --- a/src/pkg/runtime/darwin/thread.c +++ b/src/pkg/runtime/darwin/thread.c @@ -148,6 +148,20 @@ runtime·osinit(void) if(!runtime·iscgo) runtime·bsdthread_register(); runtime·destroylock = destroylock; + + // Use sysctl to fetch hw.ncpu. + uint32 mib[2]; + uint32 out; + int32 ret; + uintptr nout; + + mib[0] = 6; + mib[1] = 3; + nout = sizeof out; + out = 0; + ret = runtime·sysctl(mib, 2, (byte*)&out, &nout, nil, 0); + if(ret >= 0) + runtime·ncpu = out; } void diff --git a/src/pkg/runtime/linux/386/sys.s b/src/pkg/runtime/linux/386/sys.s index f87420f788..7549c04798 100644 --- a/src/pkg/runtime/linux/386/sys.s +++ b/src/pkg/runtime/linux/386/sys.s @@ -52,6 +52,25 @@ TEXT runtime·read(SB),7,$0 CALL *runtime·_vdso(SB) RET +TEXT runtime·usleep(SB),7,$28 + MOVL $0, DX + MOVL usec+0(FP), AX + MOVL $1000000, CX + DIVL CX + MOVL AX, 20(SP) + MOVL DX, 24(SP) + + // select(0, 0, 0, 0, &tv) + MOVL $0, 0(SP) + MOVL $0, 4(SP) + MOVL $0, 8(SP) + MOVL $0, 12(SP) + LEAL 20(SP), AX + MOVL AX, 16(SP) + MOVL $82, AX + INT $0x80 + RET + TEXT runtime·raisesigpipe(SB),7,$12 MOVL $224, AX // syscall - gettid CALL *runtime·_vdso(SB) @@ -105,16 +124,16 @@ TEXT runtime·rt_sigaction(SB),7,$0 TEXT runtime·sigtramp(SB),7,$44 get_tls(CX) - + // save g MOVL g(CX), DI MOVL DI, 20(SP) - + // g = m->gsignal MOVL m(CX), BX MOVL m_gsignal(BX), BX MOVL BX, g(CX) - + // copy arguments for call to sighandler MOVL sig+0(FP), BX MOVL BX, 0(SP) @@ -125,12 +144,12 @@ TEXT runtime·sigtramp(SB),7,$44 MOVL DI, 12(SP) CALL runtime·sighandler(SB) - + // restore g get_tls(CX) MOVL 20(SP), BX MOVL BX, g(CX) - + RET TEXT runtime·sigignore(SB),7,$0 @@ -202,7 +221,7 @@ TEXT runtime·clone(SB),7,$0 MOVL $1234, 12(CX) // cannot use CALL *runtime·_vdso(SB) here, because - // the stack changes during the system call (after + // the stack changes during the system call (after // CALL *runtime·_vdso(SB), the child is still using // the parent's stack when executing its RET instruction).
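The new runtime·sysctl wrapper exists for exactly one query: osinit's hw.ncpu lookup, where mib[0]=6 and mib[1]=3 are CTL_HW and HW_NCPU. A hypothetical userspace rendering of that lookup:

#include <stddef.h>
#include <sys/types.h>
#include <sys/sysctl.h>

// Ask Darwin how many CPUs the machine has; on failure report
// one, mirroring the runtime's "leave ncpu alone" fallback.
static int
ncpu(void)
{
    int mib[2] = { CTL_HW, HW_NCPU };   // {6, 3}, as in osinit
    unsigned int out = 0;
    size_t nout = sizeof out;

    if(sysctl(mib, 2, &out, &nout, NULL, 0) < 0)
        return 1;
    return (int)out;
}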
INT $0x80 diff --git a/src/pkg/runtime/linux/amd64/sys.s b/src/pkg/runtime/linux/amd64/sys.s index 8b4dcd921e..3174af2cb0 100644 --- a/src/pkg/runtime/linux/amd64/sys.s +++ b/src/pkg/runtime/linux/amd64/sys.s @@ -50,6 +50,24 @@ TEXT runtime·read(SB),7,$0-24 SYSCALL RET +TEXT runtime·usleep(SB),7,$16 + MOVL $0, DX + MOVL usec+0(FP), AX + MOVL $1000000, CX + DIVL CX + MOVQ AX, 0(SP) + MOVQ DX, 8(SP) + + // select(0, 0, 0, 0, &tv) + MOVL $0, DI + MOVL $0, SI + MOVL $0, DX + MOVL $0, R10 + MOVQ SP, R8 + MOVL $23, AX + SYSCALL + RET + TEXT runtime·raisesigpipe(SB),7,$12 MOVL $186, AX // syscall - gettid SYSCALL @@ -195,10 +213,10 @@ TEXT runtime·clone(SB),7,$0 CMPQ AX, $0 JEQ 2(PC) RET - + // In child, on new stack. MOVQ SI, SP - + // Initialize m->procid to Linux tid MOVL $186, AX // gettid SYSCALL diff --git a/src/pkg/runtime/linux/arm/sys.s b/src/pkg/runtime/linux/arm/sys.s index 8619f0945c..764e779fdd 100644 --- a/src/pkg/runtime/linux/arm/sys.s +++ b/src/pkg/runtime/linux/arm/sys.s @@ -33,6 +33,7 @@ #define SYS_gettid (SYS_BASE + 224) #define SYS_tkill (SYS_BASE + 238) #define SYS_sched_yield (SYS_BASE + 158) +#define SYS_select (SYS_BASE + 142) // newselect #define ARM_BASE (SYS_BASE + 0x0f0000) #define SYS_ARM_cacheflush (ARM_BASE + 2) @@ -254,7 +255,7 @@ TEXT runtime·sigtramp(SB),7,$24 // save g MOVW g, R3 MOVW g, 20(R13) - + // g = m->gsignal MOVW m_gsignal(m), g @@ -265,7 +266,7 @@ TEXT runtime·sigtramp(SB),7,$24 MOVW R3, 16(R13) BL runtime·sighandler(SB) - + // restore g MOVW 20(R13), g @@ -285,6 +286,23 @@ TEXT runtime·sigreturn(SB),7,$0 SWI $0 RET +TEXT runtime·usleep(SB),7,$12 + MOVW usec+0(FP), R0 + MOVW R0, R1 + MOVW $1000000, R2 + DIV R2, R0 + MOD R2, R1 + MOVW R0, 4(SP) + MOVW R1, 8(SP) + MOVW $0, R0 + MOVW $0, R1 + MOVW $0, R2 + MOVW $0, R3 + MOVW $4(SP), R4 + MOVW $SYS_select, R7 + SWI $0 + RET + // Use kernel version instead of native armcas in ../../arm.s. // See ../../../sync/atomic/asm_linux_arm.s for details. TEXT cas<>(SB),7,$0 diff --git a/src/pkg/runtime/linux/thread.c b/src/pkg/runtime/linux/thread.c index 4878a00f25..bf3b0947d6 100644 --- a/src/pkg/runtime/linux/thread.c +++ b/src/pkg/runtime/linux/thread.c @@ -8,7 +8,6 @@ #include "stack.h" extern SigTab runtime·sigtab[]; -static int32 proccount; int32 runtime·open(uint8*, int32, int32); int32 runtime·close(int32); @@ -136,13 +135,10 @@ futexlock(Lock *l) // its wakeup call. wait = v; - if(proccount == 0) - proccount = getproccount(); - // On uniprocessor's, no point spinning. // On multiprocessors, spin for ACTIVE_SPIN attempts. spin = 0; - if(proccount > 1) + if(runtime·ncpu > 1) spin = ACTIVE_SPIN; for(;;) { @@ -276,6 +272,7 @@ runtime·newosproc(M *m, G *g, void *stk, void (*fn)(void)) void runtime·osinit(void) { + runtime·ncpu = getproccount(); } void diff --git a/src/pkg/runtime/malloc.h b/src/pkg/runtime/malloc.h index 5bc80f4df9..f22cae4b05 100644 --- a/src/pkg/runtime/malloc.h +++ b/src/pkg/runtime/malloc.h @@ -120,6 +120,13 @@ enum #else MHeapMap_Bits = 20, #endif + + // Max number of threads to run garbage collection. + // 2, 3, and 4 are all plausible maximums depending + // on the hardware details of the machine. The second + // proc is the one that helps the most (after the first), + // so start with just 2 for now. + MaxGcproc = 2, }; // A generic linked list of blocks. (Typically the block is bigger than sizeof(MLink).) @@ -192,7 +199,7 @@ struct MStats uint64 nlookup; // number of pointer lookups uint64 nmalloc; // number of mallocs uint64 nfree; // number of frees - + // Statistics about malloc heap.
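On Linux the CPU count comes from getproccount, which predates this change and is not shown in the hunk; it derives the count from the process scheduling mask. For readers who want the moral equivalent in plain C (glibc API, hypothetical helper name):

#define _GNU_SOURCE
#include <sched.h>

// Count the CPUs this process is allowed to run on.
static int
proccount(void)
{
    cpu_set_t set;
    int i, n;

    if(sched_getaffinity(0, sizeof set, &set) < 0)
        return 1;               // assume a uniprocessor on error
    n = 0;
    for(i = 0; i < CPU_SETSIZE; i++)
        if(CPU_ISSET(i, &set))
            n++;
    return n > 0 ? n : 1;
}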
// protected by mheap.Lock uint64 heap_alloc; // bytes allocated and still in use @@ -210,7 +217,7 @@ struct MStats uint64 mcache_inuse; // MCache structures uint64 mcache_sys; uint64 buckhash_sys; // profiling bucket hash table - + // Statistics about garbage collector. // Protected by stopping the world during GC. uint64 next_gc; // next GC (in heap_alloc time) @@ -219,7 +226,7 @@ struct MStats uint32 numgc; bool enablegc; bool debuggc; - + // Statistics about allocation size classes. struct { uint32 size; @@ -240,7 +247,7 @@ extern MStats mstats; // // class_to_size[i] = largest size in class i // class_to_allocnpages[i] = number of pages to allocate when -// making new objects in class i +// making new objects in class i // class_to_transfercount[i] = number of objects to move when // taking a bunch of objects out of the central lists // and putting them in the thread free list. @@ -279,7 +286,7 @@ struct MCache int64 nmalloc; int64 nfree; } local_by_size[NumSizeClasses]; - + }; void* runtime·MCache_Alloc(MCache *c, int32 sizeclass, uintptr size, int32 zeroed); @@ -352,7 +359,7 @@ struct MHeap byte *arena_start; byte *arena_used; byte *arena_end; - + // central free lists for small size classes. // the union makes sure that the MCentrals are // spaced 64 bytes apart, so that each MCentral.Lock @@ -400,6 +407,8 @@ enum void runtime·MProf_Malloc(void*, uintptr); void runtime·MProf_Free(void*, uintptr); +int32 runtime·helpgc(void); +void runtime·gchelper(void); // Malloc profiling settings. // Must match definition in extern.go. diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c index 03d6f7d629..eaa056da0b 100644 --- a/src/pkg/runtime/mgc0.c +++ b/src/pkg/runtime/mgc0.c @@ -10,9 +10,9 @@ enum { Debug = 0, - UseCas = 1, PtrSize = sizeof(void*), - + DebugMark = 0, // run second pass to check mark + // Four bits per word (see #defines below). wordsPerBitmapWord = sizeof(void*)*8/4, bitShift = sizeof(void*)*8/4, @@ -51,17 +51,20 @@ enum { #define bitMask (bitBlockBoundary | bitAllocated | bitMarked | bitSpecial) +// TODO: Make these per-M. static uint64 nlookup; static uint64 nsizelookup; static uint64 naddrlookup; +static uint64 nhandoff; + static int32 gctrace; typedef struct Workbuf Workbuf; struct Workbuf { Workbuf *next; - uintptr nw; - byte *w[2048-2]; + uintptr nobj; + byte *obj[512-2]; }; extern byte data[]; @@ -75,6 +78,26 @@ static int32 fingwait; static void runfinq(void); static Workbuf* getempty(Workbuf*); static Workbuf* getfull(Workbuf*); +static void putempty(Workbuf*); +static Workbuf* handoff(Workbuf*); + +static struct { + Lock fmu; + Workbuf *full; + Lock emu; + Workbuf *empty; + uint32 nproc; + volatile uint32 nwait; + volatile uint32 ndone; + Note alldone; + Lock markgate; + Lock sweepgate; + MSpan *spans; + + Lock; + byte *chunk; + uintptr nchunk; +} work; // scanblock scans a block of n bytes starting at pointer b for references // to other objects, scanning any it finds recursively until there are no @@ -85,13 +108,14 @@ static Workbuf* getfull(Workbuf*); static void scanblock(byte *b, int64 n) { - byte *obj, *arena_start, *p; + byte *obj, *arena_start, *arena_used, *p; void **vp; - uintptr size, *bitp, bits, shift, i, j, x, xbits, off; + uintptr size, *bitp, bits, shift, i, j, x, xbits, off, nobj, nproc; MSpan *s; PageID k; - void **bw, **w, **ew; + void **wp; Workbuf *wbuf; + bool keepworking; if((int64)(uintptr)n != n || n < 0) { runtime·printf("scanblock %p %D\n", b, n); @@ -100,11 +124,19 @@ scanblock(byte *b, int64 n) // Memory arena parameters. 
arena_start = runtime·mheap.arena_start; - + arena_used = runtime·mheap.arena_used; + nproc = work.nproc; + wbuf = nil; // current work buffer - ew = nil; // end of work buffer - bw = nil; // beginning of work buffer - w = nil; // current pointer into work buffer + wp = nil; // storage for next queued pointer (write pointer) + nobj = 0; // number of queued objects + + // Scanblock helpers pass b==nil. + // The main proc needs to return to make more + // calls to scanblock. But if work.nproc==1 then + // might as well process blocks as soon as we + // have them. + keepworking = b == nil || work.nproc == 1; // Align b to a word boundary. off = (uintptr)b & (PtrSize-1); @@ -120,17 +152,17 @@ scanblock(byte *b, int64 n) runtime·printf("scanblock %p %D\n", b, n); vp = (void**)b; - n /= PtrSize; + n >>= (2+PtrSize/8); /* n /= PtrSize (4 or 8) */ for(i=0; i<n; i++) { obj = (byte*)vp[i]; // Words outside the arena cannot be pointers. - if((byte*)obj < arena_start || (byte*)obj >= runtime·mheap.arena_used) + if((byte*)obj < arena_start || (byte*)obj >= arena_used) continue; - + // obj may be a pointer to a live object. // Try to find the beginning of the object. - + // Round down to word boundary. obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1)); @@ -188,47 +220,72 @@ scanblock(byte *b, int64 n) found: // Now we have bits, bitp, and shift correct for // obj pointing at the base of the object. - // If not allocated or already marked, done. - if((bits & bitAllocated) == 0 || (bits & bitMarked) != 0) + // Only care about allocated and not marked. + if((bits & (bitAllocated|bitMarked)) != bitAllocated) continue; - *bitp |= bitMarked<<shift; + if(nproc == 1) + *bitp |= bitMarked<<shift; + else { + for(;;) { + x = *bitp; + if(x & (bitMarked<<shift)) + goto continue_obj; + if(runtime·casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift)))) + break; + } + } // If object has no pointers, don't need to scan further. if((bits & bitNoPointers) != 0) continue; - // If buffer is full, get a new one. - if(w >= ew) { - wbuf = getempty(wbuf); - bw = wbuf->w; - w = bw; - ew = bw + nelem(wbuf->w); + // If another proc wants a pointer, give it some. + if(nobj > 4 && work.nwait > 0 && work.full == nil) { + wbuf->nobj = nobj; + wbuf = handoff(wbuf); + nobj = wbuf->nobj; + wp = wbuf->obj + nobj; } - *w++ = obj; + + // If buffer is full, get a new one. + if(wbuf == nil || nobj >= nelem(wbuf->obj)) { + if(wbuf != nil) + wbuf->nobj = nobj; + wbuf = getempty(wbuf); + wp = wbuf->obj; + nobj = 0; + } + *wp++ = obj; + nobj++; + continue_obj:; } - + // Done scanning [b, b+n). Prepare for the next iteration of // the loop by setting b and n to the parameters for the next block. - // Fetch b from the work buffers. - if(w <= bw) { + // Fetch b from the work buffer. + if(nobj == 0) { + if(!keepworking) { + putempty(wbuf); + return; + } // Emptied our buffer: refill. wbuf = getfull(wbuf); if(wbuf == nil) - break; - bw = wbuf->w; - ew = wbuf->w + nelem(wbuf->w); - w = bw+wbuf->nw; + return; + nobj = wbuf->nobj; + wp = wbuf->obj + wbuf->nobj; } - b = *--w; - + b = *--wp; + nobj--; + // Figure out n = size of b. Start by loading bits for b. off = (uintptr*)b - (uintptr*)arena_start; bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; xbits = *bitp; bits = xbits >> shift; - + // Might be small; look for nearby block boundary. // A block boundary is marked by either bitBlockBoundary // or bitAllocated being set (see notes near their definition). @@ -247,12 +304,12 @@ scanblock(byte *b, int64 n) // apply a mask to keep only the bits corresponding // to shift+j < bitShift aka j < bitShift-shift. bits &= (boundary<<(bitShift-shift)) - boundary; - + // A block boundary j words before b is indicated by // xbits>>(shift-j) & boundary // (assuming shift >= j). There is no cleverness here // avoid the test, because when j gets too large the shift - // turns negative, which is undefined in C. + // turns negative, which is undefined in C.
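Pulled out of the diff noise, the queueing discipline that replaced the old bw/w/ew pointer trio reads as follows. This is condensed from the added lines above, not new logic; nelem and the work fields are the runtime's own:

// Feed a starving helper before queueing locally: more than a
// handful of pending objects, someone waiting, nothing on the
// shared full list.
if(nobj > 4 && work.nwait > 0 && work.full == nil) {
    wbuf->nobj = nobj;
    wbuf = handoff(wbuf);       // donates half, returns the rest
    nobj = wbuf->nobj;
    wp = wbuf->obj + nobj;
}

// Buffer full (or absent): trade it for an empty one.
if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
    if(wbuf != nil)
        wbuf->nobj = nobj;
    wbuf = getempty(wbuf);
    wp = wbuf->obj;
    nobj = 0;
}
*wp++ = obj;
nobj++;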
for(j=1; j<bitShift; j++) { if(((xbits>>j)&boundary) != 0 || shift>=j && ((xbits>>(shift-j))&boundary) != 0) { @@ -260,7 +317,7 @@ scanblock(byte *b, int64 n) goto scan; } } - + // Fall back to asking span about size class. // (Manually inlined copy of MHeap_Lookup.) nlookup++; @@ -277,29 +334,123 @@ } } -static struct { - Workbuf *full; - Workbuf *empty; - byte *chunk; - uintptr nchunk; -} work; +// debug_scanblock is the debug copy of scanblock. +// it is simpler, slower, single-threaded, recursive, +// and uses bitSpecial as the mark bit. +static void +debug_scanblock(byte *b, int64 n) +{ + byte *obj, *p; + void **vp; + uintptr size, *bitp, bits, shift, i, xbits, off; + MSpan *s; + + if(!DebugMark) + runtime·throw("debug_scanblock without DebugMark"); + + if((int64)(uintptr)n != n || n < 0) { + runtime·printf("debug_scanblock %p %D\n", b, n); + runtime·throw("debug_scanblock"); + } + + // Align b to a word boundary. + off = (uintptr)b & (PtrSize-1); + if(off != 0) { + b += PtrSize - off; + n -= PtrSize - off; + } + + vp = (void**)b; + n /= PtrSize; + for(i=0; i<n; i++) { + obj = (byte*)vp[i]; + + // Words outside the arena cannot be pointers. + if((byte*)obj < runtime·mheap.arena_start || (byte*)obj >= runtime·mheap.arena_used) + continue; + + // Round down to word boundary. + obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1)); + + // Consult span table to find beginning. + s = runtime·MHeap_LookupMaybe(&runtime·mheap, obj); + if(s == nil) + continue; + + + p = (byte*)((uintptr)s->start<<PageShift); + if(s->sizeclass == 0) { + obj = p; + size = (uintptr)s->npages<<PageShift; + } else { + if((byte*)obj >= (byte*)s->limit) + continue; + size = runtime·class_to_size[s->sizeclass]; + int32 i = ((byte*)obj - p)/size; + obj = p+i*size; + } + + // Now that we know the object header, reload bits. + off = (uintptr*)obj - (uintptr*)runtime·mheap.arena_start; + bitp = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1; + shift = off % wordsPerBitmapWord; + xbits = *bitp; + bits = xbits >> shift; + + // Now we have bits, bitp, and shift correct for + // obj pointing at the base of the object. + // If not allocated or already marked, done. + if((bits & bitAllocated) == 0 || (bits & bitSpecial) != 0) // NOTE: bitSpecial not bitMarked + continue; + *bitp |= bitSpecial<<shift; + if(!(bits & bitMarked)) + runtime·printf("found unmarked block %p in %p\n", obj, vp+i); + + // If object has no pointers, don't need to scan further. + if((bits & bitNoPointers) != 0) + continue; + + debug_scanblock(obj, size); + } +} // Get an empty work buffer off the work.empty list, // allocating new buffers as needed. static Workbuf* getempty(Workbuf *b) { - if(b != nil) { - b->nw = nelem(b->w); - b->next = work.full; - work.full = b; + if(work.nproc == 1) { + // Put b on full list. + if(b != nil) { + b->next = work.full; + work.full = b; + } + // Grab from empty list if possible. + b = work.empty; + if(b != nil) { + work.empty = b->next; + goto haveb; + } + } else { + // Put b on full list. + if(b != nil) { + runtime·lock(&work.fmu); + b->next = work.full; + work.full = b; + runtime·unlock(&work.fmu); + } + // Grab from empty list if possible. + runtime·lock(&work.emu); + b = work.empty; + if(b != nil) + work.empty = b->next; + runtime·unlock(&work.emu); + if(b != nil) + goto haveb; } - b = work.empty; - if(b != nil) { - work.empty = b->next; - return b; - } - + + // Need to allocate. + runtime·lock(&work); if(work.nchunk < sizeof *b) { work.nchunk = 1<<20; work.chunk = runtime·SysAlloc(work.nchunk); @@ -307,27 +458,122 @@ getempty(Workbuf *b) b = (Workbuf*)work.chunk; work.chunk += sizeof *b; work.nchunk -= sizeof *b; + runtime·unlock(&work); + +haveb: + b->nobj = 0; return b; } +static void +putempty(Workbuf *b) +{ + if(b == nil) + return; + + if(work.nproc == 1) { + b->next = work.empty; + work.empty = b; + return; + } + + runtime·lock(&work.emu); + b->next = work.empty; + work.empty = b; + runtime·unlock(&work.emu); +} + // Get a full work buffer off the work.full list, or return nil.
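getempty and putempty maintain two singly linked free lists under two independent locks, fmu for full buffers and emu for empty ones, and bypass locking entirely when work.nproc is 1. The push half of that pattern, written as a hypothetical generic helper rather than actual runtime code:

// Push b onto *list, locking only when other procs exist.
static void
wbufpush(Workbuf **list, Workbuf *b, Lock *mu, uint32 nproc)
{
    if(nproc == 1) {            // no one to race against
        b->next = *list;
        *list = b;
        return;
    }
    runtime·lock(mu);
    b->next = *list;
    *list = b;
    runtime·unlock(mu);
}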
static Workbuf* getfull(Workbuf *b) { - if(b != nil) { - b->nw = 0; - b->next = work.empty; - work.empty = b; + int32 i; + Workbuf *b1; + + if(work.nproc == 1) { + // Put b on empty list. + if(b != nil) { + b->next = work.empty; + work.empty = b; + } + // Grab from full list if possible. + // Since work.nproc==1, no one else is + // going to give us work. + b = work.full; + if(b != nil) + work.full = b->next; + return b; } - b = work.full; - if(b != nil) - work.full = b->next; - return b; + + putempty(b); + + // Grab buffer from full list if possible. + for(;;) { + b1 = work.full; + if(b1 == nil) + break; + runtime·lock(&work.fmu); + if(work.full != nil) { + b1 = work.full; + work.full = b1->next; + runtime·unlock(&work.fmu); + return b1; + } + runtime·unlock(&work.fmu); + } + + runtime·xadd(&work.nwait, +1); + for(i=0;; i++) { + b1 = work.full; + if(b1 != nil) { + runtime·lock(&work.fmu); + if(work.full != nil) { + runtime·xadd(&work.nwait, -1); + b1 = work.full; + work.full = b1->next; + runtime·unlock(&work.fmu); + return b1; + } + runtime·unlock(&work.fmu); + continue; + } + if(work.nwait == work.nproc) + return nil; + if(i < 10) + runtime·procyield(20); + else if(i < 20) + runtime·osyield(); + else + runtime·usleep(100); + } +} + +static Workbuf* +handoff(Workbuf *b) +{ + int32 n; + Workbuf *b1; + + // Make new buffer with half of b's pointers. + b1 = getempty(nil); + n = b->nobj/2; + b->nobj -= n; + b1->nobj = n; + runtime·memmove(b1->obj, b->obj+b->nobj, n*sizeof b1->obj[0]); + nhandoff += n; + + // Put b on full list - let first half of b get stolen. + runtime·lock(&work.fmu); + b->next = work.full; + work.full = b; + runtime·unlock(&work.fmu); + + return b1; } // Scanstack calls scanblock on each of gp's stack segments. static void -scanstack(G *gp) +scanstack(void (*scanblock)(byte*, int64), G *gp) { int32 n; Stktop *stk; @@ -339,6 +585,9 @@ scanstack(G *gp) if(gp == g) { // Scanning our own stack: start at &gp. sp = (byte*)&gp; + } else if(gp->m != nil && gp->m->helpgc) { + // Gc helper scans its own stack. + return; } else { // Scanning another goroutine's stack. // The goroutine is usually asleep (the world is stopped). @@ -387,17 +636,27 @@ markfin(void *v) scanblock(v, size); } -// Mark static void -mark(void) +debug_markfin(void *v) +{ + uintptr size; + + if(!runtime·mlookup(v, &v, &size, nil)) + runtime·throw("debug_mark - finalizer inconsistency"); + debug_scanblock(v, size); +} + +// Mark +static void +mark(void (*scan)(byte*, int64)) { G *gp; // mark data+bss. // skip runtime·mheap itself, which has no interesting pointers // and is mostly zeroed and would not otherwise be paged in. - scanblock(data, (byte*)&runtime·mheap - data); - scanblock((byte*)(&runtime·mheap+1), end - (byte*)(&runtime·mheap+1)); + scan(data, (byte*)&runtime·mheap - data); + scan((byte*)(&runtime·mheap+1), end - (byte*)(&runtime·mheap+1)); // mark stacks for(gp=runtime·allg; gp!=nil; gp=gp->alllink) { @@ -410,18 +669,24 @@ mark(void) case Grunning: if(gp != g) runtime·throw("mark - world not stopped"); - scanstack(gp); + scanstack(scan, gp); break; case Grunnable: case Gsyscall: case Gwaiting: - scanstack(gp); + scanstack(scan, gp); break; } } // mark things pointed at by objects with finalizers - runtime·walkfintab(markfin); + if(scan == debug_scanblock) + runtime·walkfintab(debug_markfin); + else + runtime·walkfintab(markfin); + + // in multiproc mode, join in the queued work. + scan(nil, 0); } // Sweep frees or calls finalizers for blocks not marked in the mark phase. 
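The idle loop in getfull above doubles as the mark-phase termination detector: work.nwait counts stalled procs, and once nwait reaches nproc there is provably no work left anywhere, so returning nil ends the phase. While waiting, a proc escalates through three levels of politeness (excerpted from the loop):

if(i < 10)
    runtime·procyield(20);      // busy-spin briefly
else if(i < 20)
    runtime·osyield();          // give up the CPU slice
else
    runtime·usleep(100);        // sleep 100us via the new stub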
@@ -435,8 +700,17 @@ sweep(void) byte *p; MCache *c; Finalizer *f; + byte *arena_start; + + arena_start = runtime·mheap.arena_start; + + for(;;) { + s = work.spans; + if(s == nil) + break; + if(!runtime·casp(&work.spans, s, s->allnext)) + continue; - for(s = runtime·mheap.allspans; s != nil; s = s->allnext) { if(s->state != MSpanInUse) continue; @@ -451,13 +725,15 @@ npages = runtime·class_to_allocnpages[cl]; n = (npages << PageShift) / size; } - - // sweep through n objects of given size starting at p. + + // Sweep through n objects of given size starting at p. + // This thread owns the span now, so it can manipulate + // the block bitmap without atomic operations. for(; n > 0; n--, p += size) { uintptr off, *bitp, shift, bits; - off = (uintptr*)p - (uintptr*)runtime·mheap.arena_start; - bitp = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1; + off = (uintptr*)p - (uintptr*)arena_start; + bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; bits = *bitp>>shift; @@ -465,17 +741,27 @@ continue; if((bits & bitMarked) != 0) { + if(DebugMark) { + if(!(bits & bitSpecial)) + runtime·printf("found spurious mark on %p\n", p); + *bitp &= ~(bitSpecial<<shift); + } *bitp &= ~(bitMarked<<shift); continue; } - if((bits & bitSpecial) != 0) { + if(DebugMark || (bits & bitSpecial) != 0) { // Special means it has a finalizer or is being profiled. f = runtime·getfinalizer(p, 1); if(f != nil) { f->arg = p; - f->next = finq; - finq = f; + for(;;) { + f->next = finq; + if(runtime·casp(&finq, f->next, f)) + break; + } continue; } runtime·MProf_Free(p, size); @@ -503,6 +789,23 @@ } } +void +runtime·gchelper(void) +{ + // Wait until main proc is ready for mark help. + runtime·lock(&work.markgate); + runtime·unlock(&work.markgate); + scanblock(nil, 0); + + // Wait until main proc is ready for sweep help. + runtime·lock(&work.sweepgate); + runtime·unlock(&work.sweepgate); + sweep(); + + if(runtime·xadd(&work.ndone, +1) == work.nproc-1) + runtime·notewakeup(&work.alldone); +} + // Semaphore, not Lock, so that the goroutine // reschedules when there is contention rather // than spinning. @@ -523,7 +826,7 @@ static void stealcache(void) { M *m; - + for(m=runtime·allm; m; m=m->alllink) runtime·MCache_ReleaseAll(m->mcache); } @@ -562,6 +865,7 @@ runtime·gc(int32 force) uint64 heap0, heap1, obj0, obj1; byte *p; Finalizer *fp; + bool extra; // The gc is turned off (via enablegc) until // the bootstrap has completed.
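The markgate and sweepgate Locks are used as one-shot barriers tying runtime·gchelper above to the driver in runtime·gc below: helpers park on lock(markgate), and the main proc releases them all by unlocking it, then repeats the trick for sweep. Condensed from the gc() changes that follow (tracing and the DebugMark pass omitted):

runtime·lock(&work.markgate);           // helpers park here
runtime·lock(&work.sweepgate);          // ...and later here

work.nproc = 1;
if(runtime·gomaxprocs > 1 && runtime·ncpu > 1)
    work.nproc += runtime·helpgc();     // recruit idle m's

runtime·unlock(&work.markgate);         // gate opens: parallel mark
mark(scanblock);

work.spans = runtime·mheap.allspans;
runtime·unlock(&work.sweepgate);        // gate opens: parallel sweep
sweep();
if(work.nproc > 1)
    runtime·notesleep(&work.alldone);   // wait for helpers to drain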
@@ -582,7 +886,7 @@ runtime·gc(int32 force) gcpercent = -1; else gcpercent = runtime·atoi(p); - + p = runtime·getenv("GOGCTRACE"); if(p != nil) gctrace = runtime·atoi(p); @@ -600,6 +904,7 @@ runtime·gc(int32 force) nlookup = 0; nsizelookup = 0; naddrlookup = 0; + nhandoff = 0; m->gcing = 1; runtime·stoptheworld(); @@ -608,10 +913,30 @@ runtime·gc(int32 force) heap0 = mstats.heap_alloc; obj0 = mstats.nmalloc - mstats.nfree; - mark(); + runtime·lock(&work.markgate); + runtime·lock(&work.sweepgate); + + work.nproc = 1; + if(runtime·gomaxprocs > 1 && runtime·ncpu > 1) { + runtime·noteclear(&work.alldone); + work.nproc += runtime·helpgc(); + } + work.nwait = 0; + work.ndone = 0; + + runtime·unlock(&work.markgate); // let the helpers in + mark(scanblock); + if(DebugMark) + mark(debug_scanblock); t1 = runtime·nanotime(); + + work.spans = runtime·mheap.allspans; + runtime·unlock(&work.sweepgate); // let the helpers in sweep(); + if(work.nproc > 1) + runtime·notesleep(&work.alldone); t2 = runtime·nanotime(); + stealcache(); cachestats(); @@ -641,22 +966,32 @@ runtime·gc(int32 force) mstats.numgc++; if(mstats.debuggc) runtime·printf("pause %D\n", t3-t0); - + if(gctrace) { - runtime·printf("gc%d: %D+%D+%D ms %D -> %D MB %D -> %D (%D-%D) objects %D pointer lookups (%D size, %D addr)\n", - mstats.numgc, (t1-t0)/1000000, (t2-t1)/1000000, (t3-t2)/1000000, + runtime·printf("gc%d(%d): %D+%D+%D ms %D -> %D MB %D -> %D (%D-%D) objects %D pointer lookups (%D size, %D addr) %D handoff\n", + mstats.numgc, work.nproc, (t1-t0)/1000000, (t2-t1)/1000000, (t3-t2)/1000000, heap0>>20, heap1>>20, obj0, obj1, mstats.nmalloc, mstats.nfree, - nlookup, nsizelookup, naddrlookup); + nlookup, nsizelookup, naddrlookup, nhandoff); } runtime·semrelease(&gcsema); - runtime·starttheworld(); - + + // If we could have used another helper proc, start one now, + // in the hope that it will be available next time. + // It would have been even better to start it before the collection, + // but doing so requires allocating memory, so it's tricky to + // coordinate. This lazy approach works out in practice: + // we don't mind if the first couple gc rounds don't have quite + // the maximum number of procs. 
+ extra = work.nproc < runtime·gomaxprocs && work.nproc < MaxGcproc; + + runtime·starttheworld(extra); + // give the queued finalizers, if any, a chance to run if(fp != nil) runtime·gosched(); - + if(gctrace > 1 && !force) runtime·gc(1); } @@ -674,7 +1009,7 @@ runtime·UpdateMemStats(void) cachestats(); m->gcing = 0; runtime·semrelease(&gcsema); - runtime·starttheworld(); + runtime·starttheworld(0); } static void @@ -858,6 +1193,9 @@ runtime·blockspecial(void *v) { uintptr *b, off, shift; + if(DebugMark) + return true; + off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start; b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; @@ -870,6 +1208,9 @@ runtime·setblockspecial(void *v) { uintptr *b, off, shift, bits, obits; + if(DebugMark) + return; + off = (uintptr*)v - (uintptr*)runtime·mheap.arena_start; b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1; shift = off % wordsPerBitmapWord; @@ -887,7 +1228,7 @@ runtime·setblockspecial(void *v) } } } - + void runtime·MHeap_MapBits(MHeap *h) { @@ -898,7 +1239,7 @@ runtime·MHeap_MapBits(MHeap *h) bitmapChunk = 8192 }; uintptr n; - + n = (h->arena_used - h->arena_start) / wordsPerBitmapWord; n = (n+bitmapChunk-1) & ~(bitmapChunk-1); if(h->bitmap_mapped >= n) diff --git a/src/pkg/runtime/print.c b/src/pkg/runtime/print.c index 3ce7794957..0d8caaf912 100644 --- a/src/pkg/runtime/print.c +++ b/src/pkg/runtime/print.c @@ -51,7 +51,7 @@ vprintf(int8 *s, byte *base) uintptr arg, narg; byte *v; -// lock(&debuglock); + //runtime·lock(&debuglock); lp = p = s; arg = 0; @@ -152,7 +152,7 @@ vprintf(int8 *s, byte *base) if(p > lp) runtime·write(2, lp, p-lp); -// unlock(&debuglock); + //runtime·unlock(&debuglock); } #pragma textflag 7 @@ -348,4 +348,4 @@ runtime·typestring(Eface e, String s) s = *e.type->string; FLUSH(&s); } - + diff --git a/src/pkg/runtime/proc.c b/src/pkg/runtime/proc.c index cc075741d1..39e3fa0230 100644 --- a/src/pkg/runtime/proc.c +++ b/src/pkg/runtime/proc.c @@ -15,6 +15,7 @@ static void unwindstack(G*, byte*); static void schedule(G*); static void acquireproc(void); static void releaseproc(void); +static M *startm(void); typedef struct Sched Sched; @@ -323,6 +324,9 @@ mcommoninit(M *m) m->fastrand = 0x49f6428aUL + m->id; m->stackalloc = runtime·malloc(sizeof(*m->stackalloc)); runtime·FixAlloc_Init(m->stackalloc, FixedStack, runtime·SysAlloc, nil, nil); + + if(m->mcache == nil) + m->mcache = runtime·allocmcache(); } // Try to increment mcpu. Report whether succeeded. @@ -422,7 +426,7 @@ mget(G *g) M *m; // if g has its own m, use it. - if((m = g->lockedm) != nil) + if(g && (m = g->lockedm) != nil) return m; // otherwise use general m pool. @@ -507,6 +511,7 @@ nextgandunlock(void) G *gp; uint32 v; +top: if(atomic_mcpu(runtime·sched.atomic) >= maxgomaxprocs) runtime·throw("negative mcpu"); @@ -584,12 +589,49 @@ nextgandunlock(void) schedunlock(); runtime·notesleep(&m->havenextg); + if(m->helpgc) { + runtime·gchelper(); + m->helpgc = 0; + runtime·lock(&runtime·sched); + goto top; + } if((gp = m->nextg) == nil) runtime·throw("bad m->nextg in nextgoroutine"); m->nextg = nil; return gp; } +int32 +runtime·helpgc(void) +{ + M *m; + int32 n, max; + + // Figure out how many CPUs to use. + // Limited by gomaxprocs, number of actual CPUs, and MaxGcproc. + max = runtime·gomaxprocs; + if(max > runtime·ncpu) + max = runtime·ncpu; + if(max > MaxGcproc) + max = MaxGcproc; + + + // We're going to use one CPU no matter what. + // Figure out the max number of additional CPUs. 
+ max--; + + runtime·lock(&runtime·sched); + n = 0; + while(n < max && (m = mget(nil)) != nil) { + n++; + m->helpgc = 1; + m->waitnextg = 0; + runtime·notewakeup(&m->havenextg); + } + runtime·unlock(&runtime·sched); + return n; +} + void runtime·stoptheworld(void) { @@ -626,15 +668,28 @@ runtime·stoptheworld(void) schedunlock(); } -// TODO(rsc): Remove. This is only temporary, -// for the mark and sweep collector. void -runtime·starttheworld(void) +runtime·starttheworld(bool extra) { + M *m; + schedlock(); runtime·gcwaiting = 0; setmcpumax(runtime·gomaxprocs); matchmg(); + if(extra && canaddmcpu()) { + // Start a new m that will (we hope) be idle + // and so available to help when the next + // garbage collection happens. + // canaddmcpu above did mcpu++ + // (necessary, because m will be doing various + // initialization work so is definitely running), + // but m is not running a specific goroutine, + // so set the helpgc flag as a signal to m's + // first schedule(nil) to mcpu--. + m = startm(); + m->helpgc = 1; + } schedunlock(); } @@ -644,8 +699,6 @@ runtime·mstart(void) { if(g != m->g0) runtime·throw("bad runtime·mstart"); - if(m->mcache == nil) - m->mcache = runtime·allocmcache(); // Record top of stack for use by mcall. // Once we call schedule we're never coming back, @@ -677,46 +730,55 @@ struct CgoThreadStart static void matchmg(void) { - G *g; + G *gp; + M *mp; if(m->mallocing || m->gcing) return; while(haveg() && canaddmcpu()) { - g = gget(); - if(g == nil) + gp = gget(); + if(gp == nil) runtime·throw("gget inconsistency"); - // Find the m that will run g. - M *m; - if((m = mget(g)) == nil){ - m = runtime·malloc(sizeof(M)); - mcommoninit(m); - - if(runtime·iscgo) { - CgoThreadStart ts; - - if(libcgo_thread_start == nil) - runtime·throw("libcgo_thread_start missing"); - // pthread_create will make us a stack. - m->g0 = runtime·malg(-1); - ts.m = m; - ts.g = m->g0; - ts.fn = runtime·mstart; - runtime·asmcgocall(libcgo_thread_start, &ts); - } else { - if(Windows) - // windows will layout sched stack on os stack - m->g0 = runtime·malg(-1); - else - m->g0 = runtime·malg(8192); - runtime·newosproc(m, m->g0, m->g0->stackbase, runtime·mstart); - } - } - mnextg(m, g); + // Find the m that will run gp. + if((mp = mget(gp)) == nil) + mp = startm(); + mnextg(mp, gp); } } +static M* +startm(void) +{ + M *m; + + m = runtime·malloc(sizeof(M)); + mcommoninit(m); + + if(runtime·iscgo) { + CgoThreadStart ts; + + if(libcgo_thread_start == nil) + runtime·throw("libcgo_thread_start missing"); + // pthread_create will make us a stack. + m->g0 = runtime·malg(-1); + ts.m = m; + ts.g = m->g0; + ts.fn = runtime·mstart; + runtime·asmcgocall(libcgo_thread_start, &ts); + } else { + if(Windows) + // windows will layout sched stack on os stack + m->g0 = runtime·malg(-1); + else + m->g0 = runtime·malg(8192); + runtime·newosproc(m, m->g0, m->g0->stackbase, runtime·mstart); + } + + return m; +} + // One round of scheduler: find a goroutine and run it. // The argument is the goroutine that was running before // schedule was called, or nil if this is the first call. @@ -767,6 +829,12 @@ schedule(G *gp) gp->readyonstop = 0; readylocked(gp); } + } else if(m->helpgc) { + // atomic { mcpu-- } + v = runtime·xadd(&runtime·sched.atomic, -1<<mcpuShift); + if(atomic_mcpu(v) > maxgomaxprocs) + runtime·throw("negative mcpu in scheduler"); + m->helpgc = 0; } // Find (or wait for) g to run. Unlocks runtime·sched.
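Three ceilings govern how large the crew in runtime·helpgc can get, and since the proc that called runtime·gc always takes part, only max-1 additional m's are ever woken. The cap arithmetic, condensed from the function above:

max = runtime·gomaxprocs;       // what the user asked for
if(max > runtime·ncpu)
    max = runtime·ncpu;         // no benefit beyond real CPUs
if(max > MaxGcproc)
    max = MaxGcproc;            // empirical cap, currently 2
max--;                          // the gc() proc is worker number one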
@@ -1097,7 +1165,7 @@ runtime·newproc1(byte *fn, byte *argp, int32 narg, int32 nret, void *callerpc) //printf("newproc1 %p %p narg=%d nret=%d\n", fn, argp, narg, nret); siz = narg + nret; siz = (siz+7) & ~7; - + // We could instead create a secondary stack frame // and make it look like goexit was on the original but // the call to the actual goroutine function was split. diff --git a/src/pkg/runtime/runtime.h b/src/pkg/runtime/runtime.h index 999511ac28..63f7d65dfb 100644 --- a/src/pkg/runtime/runtime.h +++ b/src/pkg/runtime/runtime.h @@ -57,7 +57,7 @@ typedef struct String String; typedef struct Usema Usema; typedef struct SigTab SigTab; typedef struct MCache MCache; -typedef struct FixAlloc FixAlloc; +typedef struct FixAlloc FixAlloc; typedef struct Iface Iface; typedef struct Itab Itab; typedef struct Eface Eface; @@ -238,6 +238,7 @@ struct M int32 waitnextg; int32 dying; int32 profilehz; + int32 helpgc; uint32 fastrand; uint64 ncgocall; Note havenextg; @@ -406,6 +407,7 @@ extern bool runtime·singleproc; extern uint32 runtime·panicking; extern int32 runtime·gcwaiting; // gc is waiting to run int8* runtime·goos; +int32 runtime·ncpu; extern bool runtime·iscgo; extern void (*runtime·destroylock)(Lock*); @@ -515,6 +517,7 @@ void runtime·startpanic(void); void runtime·sigprof(uint8 *pc, uint8 *sp, uint8 *lr, G *gp); void runtime·resetcpuprofiler(int32); void runtime·setcpuprofilerate(void(*)(uintptr*, int32), int32); +void runtime·usleep(uint32); #pragma varargck argpos runtime·printf 1 #pragma varargck type "d" int32 @@ -534,7 +537,7 @@ void runtime·setcpuprofilerate(void(*)(uintptr*, int32), int32); // TODO(rsc): Remove. These are only temporary, // for the mark and sweep collector. void runtime·stoptheworld(void); -void runtime·starttheworld(void); +void runtime·starttheworld(bool); /* * mutual exclusion locks. in the uncontended case, diff --git a/test/garbage/Makefile b/test/garbage/Makefile index e833843826..acf98a7dc4 100644 --- a/test/garbage/Makefile +++ b/test/garbage/Makefile @@ -18,7 +18,7 @@ all: $(addsuffix .out, $(ALL)) $(LD) -o $@ $*.$O %.bench: %.out - ./$*.out + time ./$*.out bench: $(addsuffix .bench, $(ALL)) diff --git a/test/garbage/parser.go b/test/garbage/parser.go index 19a96bc63b..06cc48384a 100644 --- a/test/garbage/parser.go +++ b/test/garbage/parser.go @@ -73,10 +73,6 @@ func parseDir(dirpath string) map[string]*ast.Package { } func main() { - runtime.GOMAXPROCS(4) - go func() {}() - go func() {}() - go func() {}() st := &runtime.MemStats packages = append(packages, packages...) packages = append(packages, packages...) @@ -132,7 +128,6 @@ func main() { } } - var packages = []string{ "archive/tar", "asn1", @@ -148,7 +143,6 @@ var packages = []string{ "container/ring", "container/vector", "crypto/aes", - "crypto/block", "crypto/blowfish", "crypto/hmac", "crypto/md4", @@ -167,7 +161,6 @@ var packages = []string{ "debug/macho", "debug/elf", "debug/gosym", - "debug/proc", "ebnf", "encoding/ascii85", "encoding/base64", @@ -177,9 +170,6 @@ var packages = []string{ "encoding/pem", "exec", "exp/datafmt", - "exp/draw", - "exp/eval", - "exp/iterable", "expvar", "flag", "fmt",