From d491e550c39bfa73c6daeb314c310ea8c399d5d2 Mon Sep 17 00:00:00 2001
From: Austin Clements
Date: Tue, 9 Feb 2016 17:53:07 -0500
Subject: [PATCH] [dev.garbage] runtime: separate spans of noscan objects

Currently, we mix objects with pointers and objects without pointers
("noscan" objects) together in memory. As a result, for every object
we grey, we have to check that object's heap bits to find out if it's
noscan, which adds to the per-object cost of GC. This also hurts the
TLB footprint of the garbage collector because it decreases the
density of scannable objects at the page level.

This commit improves the situation by using separate spans for noscan
objects. This will allow a much simpler noscan check (in a follow up
CL), eliminate the need to clear the bitmap of noscan objects (in a
follow up CL), and improve TLB footprint by increasing the density of
scannable objects.

This is also a step toward eliminating dead bits, since the current
noscan check depends on checking the dead bit of the first word.

This has no effect on the heap size of the garbage benchmark. We'll
measure the performance change of this after the follow-up
optimizations.

Change-Id: I13bdc4869538ece5649a8d2a41c6605371618e40
Reviewed-on: https://go-review.googlesource.com/23700
Reviewed-by: Rick Hudson
---
 src/runtime/malloc.go   | 23 +++++++------
 src/runtime/mbitmap.go  |  1 +
 src/runtime/mcache.go   | 15 +++++----
 src/runtime/mcentral.go | 14 ++++----
 src/runtime/mfinal.go   |  2 +-
 src/runtime/mgcmark.go  |  2 +-
 src/runtime/mgcsweep.go | 10 +++---
 src/runtime/mheap.go    | 74 ++++++++++++++++++++++++++++-------------
 src/runtime/mstats.go   |  4 +--
 src/runtime/stubs.go    |  7 ++++
 10 files changed, 94 insertions(+), 58 deletions(-)

diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index b079a07d51..e94c879184 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -491,8 +491,8 @@ func nextFreeFast(s *mspan) gclinkptr {
 // weight allocation. If it is a heavy weight allocation the caller must
 // determine whether a new GC cycle needs to be started or if the GC is active
 // whether this goroutine needs to assist the GC.
-func (c *mcache) nextFree(sizeclass int8) (v gclinkptr, s *mspan, shouldhelpgc bool) {
-	s = c.alloc[sizeclass]
+func (c *mcache) nextFree(spc spanClass) (v gclinkptr, s *mspan, shouldhelpgc bool) {
+	s = c.alloc[spc]
 	shouldhelpgc = false
 	freeIndex := s.nextFreeIndex()
 	if freeIndex == s.nelems {
@@ -502,10 +502,10 @@ func (c *mcache) nextFree(sizeclass int8) (v gclinkptr, s *mspan, shouldhelpgc b
 			throw("s.allocCount != s.nelems && freeIndex == s.nelems")
 		}
 		systemstack(func() {
-			c.refill(int32(sizeclass))
+			c.refill(spc)
 		})
 		shouldhelpgc = true
-		s = c.alloc[sizeclass]
+		s = c.alloc[spc]
 
 		freeIndex = s.nextFreeIndex()
 	}
@@ -629,10 +629,10 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 				return x
 			}
 			// Allocate a new maxTinySize block.
-			span := c.alloc[tinySizeClass]
+			span := c.alloc[tinySpanClass]
 			v := nextFreeFast(span)
 			if v == 0 {
-				v, _, shouldhelpgc = c.nextFree(tinySizeClass)
+				v, _, shouldhelpgc = c.nextFree(tinySpanClass)
 			}
 			x = unsafe.Pointer(v)
 			(*[2]uint64)(x)[0] = 0
@@ -652,10 +652,11 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 				sizeclass = size_to_class128[(size-1024+127)>>7]
 			}
 			size = uintptr(class_to_size[sizeclass])
-			span := c.alloc[sizeclass]
+			spc := makeSpanClass(sizeclass, noscan)
+			span := c.alloc[spc]
 			v := nextFreeFast(span)
 			if v == 0 {
-				v, span, shouldhelpgc = c.nextFree(sizeclass)
+				v, span, shouldhelpgc = c.nextFree(spc)
 			}
 			x = unsafe.Pointer(v)
 			if needzero && span.needzero != 0 {
@@ -666,7 +667,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 		var s *mspan
 		shouldhelpgc = true
 		systemstack(func() {
-			s = largeAlloc(size, needzero)
+			s = largeAlloc(size, needzero, noscan)
 		})
 		s.freeindex = 1
 		s.allocCount = 1
@@ -755,7 +756,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 	return x
 }
 
-func largeAlloc(size uintptr, needzero bool) *mspan {
+func largeAlloc(size uintptr, needzero bool, noscan bool) *mspan {
 	// print("largeAlloc size=", size, "\n")
 
 	if size+_PageSize < size {
@@ -771,7 +772,7 @@ func largeAlloc(size uintptr, needzero bool) *mspan {
 	// pays the debt down to npage pages.
 	deductSweepCredit(npages*_PageSize, npages)
 
-	s := mheap_.alloc(npages, 0, true, needzero)
+	s := mheap_.alloc(npages, makeSpanClass(0, noscan), true, needzero)
 	if s == nil {
 		throw("out of memory")
 	}
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index ccefbcd8d6..4014bd3554 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -509,6 +509,7 @@ func (h heapBits) isPointer() bool {
 // It must be told how large the object at h is for efficiency.
 // h must describe the initial word of the object.
 func (h heapBits) hasPointers(size uintptr) bool {
+	// TODO: Use span.noScan instead of the heap bitmap.
 	if size == sys.PtrSize { // 1-word objects are always pointers
 		return true
 	}
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index 5938e53ca8..f50941894f 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -31,7 +31,8 @@ type mcache struct {
 	local_tinyallocs uintptr // number of tiny allocs not counted in other stats
 
 	// The rest is not accessed on every malloc.
-	alloc [_NumSizeClasses]*mspan // spans to allocate from
+
+	alloc [numSpanClasses]*mspan // spans to allocate from, indexed by spanClass
 
 	stackcache [_NumStackOrders]stackfreelist
 
@@ -76,7 +77,7 @@ func allocmcache() *mcache {
 	c := (*mcache)(mheap_.cachealloc.alloc())
 	unlock(&mheap_.lock)
 	memclr(unsafe.Pointer(c), unsafe.Sizeof(*c))
-	for i := 0; i < _NumSizeClasses; i++ {
+	for i := range c.alloc {
 		c.alloc[i] = &emptymspan
 	}
 	c.next_sample = nextSample()
@@ -102,12 +103,12 @@ func freemcache(c *mcache) {
 
 // Gets a span that has a free object in it and assigns it
 // to be the cached span for the given sizeclass. Returns this span.
-func (c *mcache) refill(sizeclass int32) *mspan {
+func (c *mcache) refill(spc spanClass) *mspan {
 	_g_ := getg()
 
 	_g_.m.locks++
 	// Return the current cached span to the central lists.
-	s := c.alloc[sizeclass]
+	s := c.alloc[spc]
 
 	if uintptr(s.allocCount) != s.nelems {
 		throw("refill of span with free space remaining")
@@ -118,7 +119,7 @@ func (c *mcache) refill(sizeclass int32) *mspan {
 	}
 
 	// Get a new cached span from the central lists.
-	s = mheap_.central[sizeclass].mcentral.cacheSpan()
+	s = mheap_.central[spc].mcentral.cacheSpan()
 	if s == nil {
 		throw("out of memory")
 	}
@@ -127,13 +128,13 @@ func (c *mcache) refill(sizeclass int32) *mspan {
 		throw("span has no free space")
 	}
 
-	c.alloc[sizeclass] = s
+	c.alloc[spc] = s
 	_g_.m.locks--
 	return s
 }
 
 func (c *mcache) releaseAll() {
-	for i := 0; i < _NumSizeClasses; i++ {
+	for i := range c.alloc {
 		s := c.alloc[i]
 		if s != &emptymspan {
 			mheap_.central[i].mcentral.uncacheSpan(s)
diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go
index 7b63110460..85bace24d1 100644
--- a/src/runtime/mcentral.go
+++ b/src/runtime/mcentral.go
@@ -17,14 +17,14 @@ import "runtime/internal/atomic"
 // Central list of free objects of a given size.
 type mcentral struct {
 	lock      mutex
-	sizeclass int32
+	spanclass spanClass
 	nonempty  mSpanList // list of spans with a free object, ie a nonempty free list
 	empty     mSpanList // list of spans with no free objects (or cached in an mcache)
 }
 
 // Initialize a single central free list.
-func (c *mcentral) init(sizeclass int32) {
-	c.sizeclass = sizeclass
+func (c *mcentral) init(spc spanClass) {
+	c.spanclass = spc
 	c.nonempty.init()
 	c.empty.init()
 }
@@ -32,7 +32,7 @@ func (c *mcentral) init(sizeclass int32) {
 // Allocate a span to use in an MCache.
 func (c *mcentral) cacheSpan() *mspan {
 	// Deduct credit for this span allocation and sweep if necessary.
-	spanBytes := uintptr(class_to_allocnpages[c.sizeclass]) * _PageSize
+	spanBytes := uintptr(class_to_allocnpages[c.spanclass.sizeclass()]) * _PageSize
 	deductSweepCredit(spanBytes, 0)
 
 	lock(&c.lock)
@@ -203,11 +203,11 @@ func (c *mcentral) freeSpan(s *mspan, preserve bool, wasempty bool) bool {
 
 // grow allocates a new empty span from the heap and initializes it for c's size class.
 func (c *mcentral) grow() *mspan {
-	npages := uintptr(class_to_allocnpages[c.sizeclass])
-	size := uintptr(class_to_size[c.sizeclass])
+	npages := uintptr(class_to_allocnpages[c.spanclass.sizeclass()])
+	size := uintptr(class_to_size[c.spanclass.sizeclass()])
 	n := (npages << _PageShift) / size
 
-	s := mheap_.alloc(npages, c.sizeclass, false, true)
+	s := mheap_.alloc(npages, c.spanclass, false, true)
 	if s == nil {
 		return nil
 	}
diff --git a/src/runtime/mfinal.go b/src/runtime/mfinal.go
index 1a744e4a51..cd650b2462 100644
--- a/src/runtime/mfinal.go
+++ b/src/runtime/mfinal.go
@@ -429,7 +429,7 @@ func findObject(v unsafe.Pointer) (s *mspan, x unsafe.Pointer, n uintptr) {
 	}
 
 	n = s.elemsize
-	if s.sizeclass != 0 {
+	if s.spanclass.sizeclass() != 0 {
 		x = add(x, (uintptr(v)-uintptr(x))/n*n)
 	}
 	return
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index 00b96fd00b..c6de4d07e5 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -1260,7 +1260,7 @@ func gcDumpObject(label string, obj, off uintptr) {
 		print(" s=nil\n")
 		return
 	}
-	print(" s.base()=", hex(s.base()), " s.limit=", hex(s.limit), " s.sizeclass=", s.sizeclass, " s.elemsize=", s.elemsize, "\n")
+	print(" s.base()=", hex(s.base()), " s.limit=", hex(s.limit), " s.spanclass=", s.spanclass, " s.elemsize=", s.elemsize, "\n")
 	skipped := false
 	for i := uintptr(0); i < s.elemsize; i += sys.PtrSize {
 		// For big objects, just print the beginning (because
diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go
index 947c38e400..4bacfe2c38 100644
--- a/src/runtime/mgcsweep.go
+++ b/src/runtime/mgcsweep.go
@@ -182,7 +182,7 @@ func (s *mspan) sweep(preserve bool) bool {
 
 	atomic.Xadd64(&mheap_.pagesSwept, int64(s.npages))
 
-	cl := s.sizeclass
+	spc := s.spanclass
 	size := s.elemsize
 	res := false
 	nfree := 0
@@ -276,7 +276,7 @@ func (s *mspan) sweep(preserve bool) bool {
 
 		// Count the number of free objects in this span.
 		nfree = s.countFree()
-		if cl == 0 && nfree != 0 {
+		if spc.sizeclass() == 0 && nfree != 0 {
 			s.needzero = 1
 			freeToHeap = true
 		}
@@ -317,9 +317,9 @@ func (s *mspan) sweep(preserve bool) bool {
 		atomic.Store(&s.sweepgen, sweepgen)
 	}
 
-	if nfreed > 0 && cl != 0 {
-		c.local_nsmallfree[cl] += uintptr(nfreed)
-		res = mheap_.central[cl].mcentral.freeSpan(s, preserve, wasempty)
+	if nfreed > 0 && spc.sizeclass() != 0 {
+		c.local_nsmallfree[spc.sizeclass()] += uintptr(nfreed)
+		res = mheap_.central[spc].mcentral.freeSpan(s, preserve, wasempty)
 		// MCentral_FreeSpan updates sweepgen
 	} else if freeToHeap {
 		// Free large span to heap
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 4093288a7c..c85e93a997 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -57,7 +57,8 @@ type mheap struct {
 	// the padding makes sure that the MCentrals are
 	// spaced CacheLineSize bytes apart, so that each MCentral.lock
 	// gets its own cache line.
-	central [_NumSizeClasses]struct {
+	// central is indexed by spanClass.
+	central [numSpanClasses]struct {
 		mcentral mcentral
 		pad      [sys.CacheLineSize]byte
 	}
@@ -181,21 +182,21 @@ type mspan struct {
 	// h->sweepgen is incremented by 2 after every GC
 
 	sweepgen    uint32
-	divMul      uint32   // for divide by elemsize - divMagic.mul
-	allocCount  uint16   // capacity - number of objects in freelist
-	sizeclass   uint8    // size class
-	incache     bool     // being used by an mcache
-	state       uint8    // mspaninuse etc
-	needzero    uint8    // needs to be zeroed before allocation
-	divShift    uint8    // for divide by elemsize - divMagic.shift
-	divShift2   uint8    // for divide by elemsize - divMagic.shift2
-	elemsize    uintptr  // computed from sizeclass or from npages
-	unusedsince int64    // first time spotted by gc in mspanfree state
-	npreleased  uintptr  // number of pages released to the os
-	limit       uintptr  // end of data in span
-	speciallock mutex    // guards specials list
-	specials    *special // linked list of special records sorted by offset.
-	baseMask    uintptr  // if non-0, elemsize is a power of 2, & this will get object allocation base
+	divMul      uint32    // for divide by elemsize - divMagic.mul
+	allocCount  uint16    // capacity - number of objects in freelist
+	spanclass   spanClass // size class and noscan (uint8)
+	incache     bool      // being used by an mcache
+	state       uint8     // mspaninuse etc
+	needzero    uint8     // needs to be zeroed before allocation
+	divShift    uint8     // for divide by elemsize - divMagic.shift
+	divShift2   uint8     // for divide by elemsize - divMagic.shift2
+	elemsize    uintptr   // computed from sizeclass or from npages
+	unusedsince int64     // first time spotted by gc in mspanfree state
+	npreleased  uintptr   // number of pages released to the os
+	limit       uintptr   // end of data in span
+	speciallock mutex     // guards specials list
+	specials    *special  // linked list of special records sorted by offset.
+	baseMask    uintptr   // if non-0, elemsize is a power of 2, & this will get object allocation base
 }
 
 func (s *mspan) base() uintptr {
@@ -251,6 +252,31 @@ func recordspan(vh unsafe.Pointer, p unsafe.Pointer) {
 	h.nspan = uint32(len(h_allspans))
 }
 
+// A spanClass represents the size class and noscan-ness of a span.
+//
+// Each size class has a noscan spanClass and a scan spanClass. The
+// noscan spanClass contains only noscan objects, which do not contain
+// pointers and thus do not need to be scanned by the garbage
+// collector.
+type spanClass uint8
+
+const (
+	numSpanClasses = _NumSizeClasses << 1
+	tinySpanClass  = tinySizeClass<<1 | 1
+)
+
+func makeSpanClass(sizeclass int8, noscan bool) spanClass {
+	return spanClass(sizeclass<<1) | spanClass(bool2int(noscan))
+}
+
+func (sc spanClass) sizeclass() int8 {
+	return int8(sc >> 1)
+}
+
+func (sc spanClass) noscan() bool {
+	return sc&1 != 0
+}
+
 // inheap reports whether b is a pointer into a (potentially dead) heap object.
 // It returns false for pointers into stack spans.
 // Non-preemptible because it is used by write barriers.
@@ -335,7 +361,7 @@ func mlookup(v uintptr, base *uintptr, size *uintptr, sp **mspan) int32 {
 	}
 
 	p := s.base()
-	if s.sizeclass == 0 {
+	if s.spanclass.sizeclass() == 0 {
 		// Large object.
 		if base != nil {
 			*base = p
@@ -374,7 +400,7 @@ func (h *mheap) init(spans_size uintptr) {
 	h.freelarge.init()
 	h.busylarge.init()
 	for i := range h.central {
-		h.central[i].mcentral.init(int32(i))
+		h.central[i].mcentral.init(spanClass(i))
 	}
 
 	sp := (*slice)(unsafe.Pointer(&h_spans))
@@ -481,7 +507,7 @@ func (h *mheap) reclaim(npage uintptr) {
 
 // Allocate a new span of npage pages from the heap for GC'd memory
 // and record its size class in the HeapMap and HeapMapCache.
-func (h *mheap) alloc_m(npage uintptr, sizeclass int32, large bool) *mspan {
+func (h *mheap) alloc_m(npage uintptr, spanclass spanClass, large bool) *mspan {
 	_g_ := getg()
 	if _g_ != _g_.m.g0 {
 		throw("_mheap_alloc not on g0 stack")
@@ -514,8 +540,8 @@ func (h *mheap) alloc_m(npage uintptr, sizeclass int32, large bool) *mspan {
 		atomic.Store(&s.sweepgen, h.sweepgen)
 		s.state = _MSpanInUse
 		s.allocCount = 0
-		s.sizeclass = uint8(sizeclass)
-		if sizeclass == 0 {
+		s.spanclass = spanclass
+		if sizeclass := spanclass.sizeclass(); sizeclass == 0 {
 			s.elemsize = s.npages << _PageShift
 			s.divShift = 0
 			s.divMul = 0
@@ -565,13 +591,13 @@ func (h *mheap) alloc_m(npage uintptr, sizeclass int32, large bool) *mspan {
 	return s
 }
 
-func (h *mheap) alloc(npage uintptr, sizeclass int32, large bool, needzero bool) *mspan {
+func (h *mheap) alloc(npage uintptr, spanclass spanClass, large bool, needzero bool) *mspan {
 	// Don't do any operations that lock the heap on the G stack.
 	// It might trigger stack growth, and the stack growth code needs
 	// to be able to allocate heap.
 	var s *mspan
 	systemstack(func() {
-		s = h.alloc_m(npage, sizeclass, large)
+		s = h.alloc_m(npage, spanclass, large)
 	})
 
 	if s != nil {
@@ -964,7 +990,7 @@ func (span *mspan) init(base uintptr, npages uintptr) {
 	span.startAddr = base
 	span.npages = npages
 	span.allocCount = 0
-	span.sizeclass = 0
+	span.spanclass = 0
 	span.incache = false
 	span.elemsize = 0
 	span.state = _MSpanDead
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index 2d75d2fef1..4d12f40108 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -291,12 +291,12 @@ func updatememstats(stats *gcstats) {
 		if s.state != mSpanInUse {
 			continue
 		}
-		if s.sizeclass == 0 {
+		if sizeclass := s.spanclass.sizeclass(); sizeclass == 0 {
 			memstats.nmalloc++
 			memstats.alloc += uint64(s.elemsize)
 		} else {
 			memstats.nmalloc += uint64(s.allocCount)
-			memstats.by_size[s.sizeclass].nmalloc += uint64(s.allocCount)
+			memstats.by_size[sizeclass].nmalloc += uint64(s.allocCount)
 			memstats.alloc += uint64(s.allocCount) * uint64(s.elemsize)
 		}
 	}
diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go
index 6c28fd2e78..f568a68856 100644
--- a/src/runtime/stubs.go
+++ b/src/runtime/stubs.go
@@ -273,3 +273,10 @@ func round(n, a uintptr) uintptr {
 
 // checkASM returns whether assembly runtime checks have passed.
 func checkASM() bool
+
+// bool2int returns 0 if x is false or 1 if x is true.
+func bool2int(x bool) int {
+	// Avoid branches. In the SSA compiler, this compiles to
+	// exactly what you would want it to.
+	return int(uint8(*(*uint8)(unsafe.Pointer(&x))))
+}
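
For reference, the short program below is a standalone sketch of the spanClass
encoding that the mheap.go hunk above introduces: the size class lives in the
upper seven bits of a single byte and the noscan flag in bit 0, which is why
mcache.alloc and mheap.central grow from _NumSizeClasses to numSpanClasses
entries and can be indexed by spanClass directly. It is not part of the patch;
numSizeClasses and tinySizeClass below are illustrative stand-in values for the
runtime's _NumSizeClasses and tinySizeClass, and a plain branch stands in for
the branch-free bool2int helper the patch adds to stubs.go.

// spanclass_sketch.go: standalone illustration of the spanClass encoding.
package main

import "fmt"

const (
	// Illustrative stand-ins for the runtime's _NumSizeClasses and
	// tinySizeClass; the exact values are not the point here.
	numSizeClasses = 67
	tinySizeClass  = 2

	// Mirrors the constants added to mheap.go: every size class gets a
	// scan and a noscan variant.
	numSpanClasses = numSizeClasses << 1
	tinySpanClass  = tinySizeClass<<1 | 1
)

// spanClass packs a size class and a noscan flag into one byte:
// the size class in the upper seven bits, the noscan flag in bit 0.
type spanClass uint8

// makeSpanClass mirrors the helper added to mheap.go, but uses a plain
// branch instead of the patch's branch-free bool2int.
func makeSpanClass(sizeclass int8, noscan bool) spanClass {
	sc := spanClass(sizeclass) << 1
	if noscan {
		sc |= 1
	}
	return sc
}

func (sc spanClass) sizeclass() int8 { return int8(sc >> 1) }
func (sc spanClass) noscan() bool    { return sc&1 != 0 }

func main() {
	// The scan and noscan variants of size class 5 differ only in bit 0,
	// so per-class tables (mcache.alloc, mheap.central) can be indexed
	// by spanClass directly.
	scanClass := makeSpanClass(5, false)
	noscanClass := makeSpanClass(5, true)
	fmt.Println(scanClass, scanClass.sizeclass(), scanClass.noscan())       // 10 5 false
	fmt.Println(noscanClass, noscanClass.sizeclass(), noscanClass.noscan()) // 11 5 true
	fmt.Println(numSpanClasses, tinySpanClass)                              // 134 5
}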