From 830ce3f1ed2771a94ed768816304ce2dd1017da0 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Mon, 11 Jan 2016 21:10:46 -0500 Subject: [PATCH 01/23] [dev.garbage] dev.garbage: create new dev.garbage branch This is for a GC experiment that may or may not go anywhere. Change-Id: I46a4535cc768ce8bbe33c72961f1fa87658493f7 Reviewed-on: https://go-review.googlesource.com/18534 Reviewed-by: Rick Hudson --- dev.garbage | 1 + 1 file changed, 1 insertion(+) create mode 100644 dev.garbage diff --git a/dev.garbage b/dev.garbage new file mode 100644 index 0000000000..b8c3a3fcb7 --- /dev/null +++ b/dev.garbage @@ -0,0 +1 @@ +Reviving dev.garbage branch for use in new garbage collection experiment. From 2ac8bdc52ae1ea0418df465de3f1ef36f49e2274 Mon Sep 17 00:00:00 2001 From: Rick Hudson Date: Thu, 4 Feb 2016 11:41:48 -0500 Subject: [PATCH 02/23] [dev.garbage] runtime: bitmap allocation data structs The bitmap allocation data structure prototypes. Before this is released these underlying data structures need to be more performant but the signatures of helper functions utilizing these structures will remain stable. Change-Id: I5ace12f2fb512a7038a52bbde2bfb7e98783bcbe Reviewed-on: https://go-review.googlesource.com/19221 Reviewed-by: Austin Clements Run-TryBot: Austin Clements TryBot-Result: Gobot Gobot --- src/runtime/malloc.go | 3 +++ src/runtime/mheap.go | 38 +++++++++++++++++++++++++++++++++++--- src/runtime/msize.go | 28 ++++++++++++++++++---------- 3 files changed, 56 insertions(+), 13 deletions(-) diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 5f1e2f64c0..fe13b8b9a3 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -97,6 +97,9 @@ const ( pageShift = _PageShift pageSize = _PageSize pageMask = _PageMask + // By construction, single page spans of the smallest object class + // have the most objects per span. + maxObjsPerSpan = pageSize / 8 mSpanInUse = _MSpanInUse diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 895af9f07c..8c843be946 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -117,9 +117,41 @@ type mspan struct { prev **mspan // previous span's next field, or list head's first field if none list *mSpanList // For debugging. TODO: Remove. - start pageID // starting page number - npages uintptr // number of pages in span - freelist gclinkptr // list of free objects + start pageID // starting page number + npages uintptr // number of pages in span + freelist gclinkptr // list of free objects for _MSpanInUse + stackfreelist gclinkptr // list of free stacks, avoids overloading freelist for _MSpanStack + + // freeindex is the slot index between 0 and nelems at which to begin scanning + // for the next free object in this span. + // Each allocation scans allocBits starting at freeindex until it encounters a 0 + // indicating a free object. freeindex is then adjusted so that subsequent scans begin + // just past the the newly discovered free object. + // + // If freeindex == nelem, this span has no free objects. + // + // allocBits is a bitmap of objects in this span. + // If n >= freeindex and allocBits[n/8] & (1<<(n%8)) is 0 + // then object n is free; + // otherwise, object n is allocated. Bits starting at nelem are + // undefined and should never be referenced. + // + // Object n starts at address n*elemsize + (start << pageShift). + freeindex uintptr + allocBits *[maxObjsPerSpan / 8]uint8 + gcmarkBits *[maxObjsPerSpan / 8]uint8 + nelems uintptr // number of object in the span. 
+ // TODO(rlh) consider moving some of these fields into seperate arrays. + // Put another way is an array of structs a better idea than a struct of arrays. + + // allocBits and gcmarkBits currently point to either markbits1 + // or markbits2. At the end of a GC cycle allocBits and + // gcmarkBits swap roles simply by swapping pointers. + // This level of indirection also facilitates an implementation + // where markbits1 and markbits2 are not inlined in mspan. + markbits1 [maxObjsPerSpan / 8]uint8 // A bit for each obj. + markbits2 [maxObjsPerSpan / 8]uint8 // A bit for each obj. + // sweep generation: // if sweepgen == h->sweepgen - 2, the span needs sweeping // if sweepgen == h->sweepgen - 1, the span is currently being swept diff --git a/src/runtime/msize.go b/src/runtime/msize.go index 21fe2f4c61..18577b309b 100644 --- a/src/runtime/msize.go +++ b/src/runtime/msize.go @@ -55,7 +55,7 @@ var size_to_class128 [(_MaxSmallSize-1024)/128 + 1]int8 func sizeToClass(size int32) int32 { if size > _MaxSmallSize { - throw("SizeToClass - invalid size") + throw("invalid size") } if size > 1024-8 { return int32(size_to_class128[(size-1024+127)>>7]) @@ -79,7 +79,7 @@ func initSizes() { } } if align&(align-1) != 0 { - throw("InitSizes - bug") + throw("incorrect alignment") } // Make the allocnpages big enough that @@ -106,10 +106,18 @@ func initSizes() { sizeclass++ } if sizeclass != _NumSizeClasses { - print("sizeclass=", sizeclass, " NumSizeClasses=", _NumSizeClasses, "\n") - throw("InitSizes - bad NumSizeClasses") + print("runtime: sizeclass=", sizeclass, " NumSizeClasses=", _NumSizeClasses, "\n") + throw("bad NumSizeClasses") + } + // Check maxObjsPerSpan => number of objects invariant. + for i, size := range class_to_size { + if size != 0 && class_to_allocnpages[i]*pageSize/size > maxObjsPerSpan { + throw("span contains too many objects") + } + if size == 0 && i != 0 { + throw("size is 0 but class is not 0") + } } - // Initialize the size_to_class tables. 
nextsize := 0 for sizeclass = 1; sizeclass < _NumSizeClasses; sizeclass++ { @@ -128,12 +136,12 @@ func initSizes() { for n := int32(0); n < _MaxSmallSize; n++ { sizeclass := sizeToClass(n) if sizeclass < 1 || sizeclass >= _NumSizeClasses || class_to_size[sizeclass] < n { - print("size=", n, " sizeclass=", sizeclass, " runtime·class_to_size=", class_to_size[sizeclass], "\n") + print("runtime: size=", n, " sizeclass=", sizeclass, " runtime·class_to_size=", class_to_size[sizeclass], "\n") print("incorrect SizeToClass\n") goto dump } if sizeclass > 1 && class_to_size[sizeclass-1] >= n { - print("size=", n, " sizeclass=", sizeclass, " runtime·class_to_size=", class_to_size[sizeclass], "\n") + print("runtime: size=", n, " sizeclass=", sizeclass, " runtime·class_to_size=", class_to_size[sizeclass], "\n") print("SizeToClass too big\n") goto dump } @@ -155,18 +163,18 @@ func initSizes() { dump: if true { - print("NumSizeClasses=", _NumSizeClasses, "\n") + print("runtime: NumSizeClasses=", _NumSizeClasses, "\n") print("runtime·class_to_size:") for sizeclass = 0; sizeclass < _NumSizeClasses; sizeclass++ { print(" ", class_to_size[sizeclass], "") } print("\n\n") - print("size_to_class8:") + print("runtime: size_to_class8:") for i := 0; i < len(size_to_class8); i++ { print(" ", i*8, "=>", size_to_class8[i], "(", class_to_size[size_to_class8[i]], ")\n") } print("\n") - print("size_to_class128:") + print("runtime: size_to_class128:") for i := 0; i < len(size_to_class128); i++ { print(" ", i*128, "=>", size_to_class128[i], "(", class_to_size[size_to_class128[i]], ")\n") } From aed861038f876643a67c2297b384b6be140c46c1 Mon Sep 17 00:00:00 2001 From: Rick Hudson Date: Mon, 8 Feb 2016 09:53:14 -0500 Subject: [PATCH 03/23] [dev.garbage] runtime: add stackfreelist The freelist for normal objects and the freelist for stacks share the same mspan field for holding the list head but are operated on by different code sequences. This overloading complicates the use of bit vectors for allocation of normal objects. This change refactors the use of the stackfreelist out from the use of freelist. 
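To make the split concrete: both fields rely on the same ordinary intrusive singly linked free list; only the field it hangs off changes for stack spans. A minimal toy sketch (illustrative only — node, span, pushStack and popStack are simplified stand-ins, not the runtime's gclinkptr/mspan code):

package main

import "fmt"

// node is a toy stand-in for a free object whose first word links to the next one.
type node struct{ next *node }

// span mimics the two now-independent intrusive free lists an mspan keeps:
// one for heap objects (_MSpanInUse) and one for stack chunks (_MSpanStack).
type span struct {
	freelist      *node // free heap objects
	stackfreelist *node // free stack chunks
}

// popStack takes one free stack chunk, touching only stackfreelist.
func (s *span) popStack() *node {
	x := s.stackfreelist
	if x != nil {
		s.stackfreelist = x.next
	}
	return x
}

// pushStack returns a stack chunk to the span's stack free list.
func (s *span) pushStack(x *node) {
	x.next = s.stackfreelist
	s.stackfreelist = x
}

func main() {
	s := &span{}
	for i := 0; i < 3; i++ {
		s.pushStack(&node{})
	}
	fmt.Println(s.popStack() != nil) // true: a free stack chunk was available
}

Keeping the stack list in its own field lets the later CLs replace freelist with bit-vector allocation without disturbing the stack allocator.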
Change-Id: I5b155b5b8a1fcd8e24c12ee1eb0800ad9b6b4fa0 Reviewed-on: https://go-review.googlesource.com/19315 Reviewed-by: Austin Clements --- src/runtime/mheap.go | 2 +- src/runtime/stack.go | 23 ++++++++++++----------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 8c843be946..a3d34a360e 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -552,7 +552,7 @@ func (h *mheap) allocStack(npage uintptr) *mspan { s := h.allocSpanLocked(npage) if s != nil { s.state = _MSpanStack - s.freelist = 0 + s.stackfreelist = 0 s.ref = 0 memstats.stacks_inuse += uint64(s.npages << _PageShift) } diff --git a/src/runtime/stack.go b/src/runtime/stack.go index fdd6710bad..5e373f1b94 100644 --- a/src/runtime/stack.go +++ b/src/runtime/stack.go @@ -194,23 +194,23 @@ func stackpoolalloc(order uint8) gclinkptr { if s.ref != 0 { throw("bad ref") } - if s.freelist.ptr() != nil { - throw("bad freelist") + if s.stackfreelist.ptr() != nil { + throw("bad stackfreelist") } for i := uintptr(0); i < _StackCacheSize; i += _FixedStack << order { x := gclinkptr(uintptr(s.start)<<_PageShift + i) - x.ptr().next = s.freelist - s.freelist = x + x.ptr().next = s.stackfreelist + s.stackfreelist = x } list.insert(s) } - x := s.freelist + x := s.stackfreelist if x.ptr() == nil { throw("span has no free stacks") } - s.freelist = x.ptr().next + s.stackfreelist = x.ptr().next s.ref++ - if s.freelist.ptr() == nil { + if s.stackfreelist.ptr() == nil { // all stacks in s are allocated. list.remove(s) } @@ -223,12 +223,12 @@ func stackpoolfree(x gclinkptr, order uint8) { if s.state != _MSpanStack { throw("freeing stack not in a stack span") } - if s.freelist.ptr() == nil { + if s.stackfreelist.ptr() == nil { // s will now have a free stack stackpool[order].insert(s) } - x.ptr().next = s.freelist - s.freelist = x + x.ptr().next = s.stackfreelist + s.stackfreelist = x s.ref-- if gcphase == _GCoff && s.ref == 0 { // Span is completely free. Return it to the heap @@ -247,7 +247,7 @@ func stackpoolfree(x gclinkptr, order uint8) { // // By not freeing, we prevent step #4 until GC is done. stackpool[order].remove(s) - s.freelist = 0 + s.stackfreelist = 0 mheap_.freeStack(s) } } @@ -1138,6 +1138,7 @@ func freeStackSpans() { if s.ref == 0 { list.remove(s) s.freelist = 0 + s.stackfreelist = 0 mheap_.freeStack(s) } s = next From e1c4e9a754833e169a41ea98a49c3712513879ab Mon Sep 17 00:00:00 2001 From: Rick Hudson Date: Mon, 8 Feb 2016 12:36:23 -0500 Subject: [PATCH 04/23] [dev.garbage] runtime: refactor next free object In preparation for changing how the next free object is chosen refactor and consolidate code into a single function. Change-Id: I6836cd88ed7cbf0b2df87abd7c1c3b9fabc1cbd8 Reviewed-on: https://go-review.googlesource.com/19317 Reviewed-by: Austin Clements --- src/runtime/malloc.go | 59 ++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index fe13b8b9a3..70e7358e88 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -496,6 +496,32 @@ const ( _FlagNoZero = 1 << 1 // don't zero memory ) +// nextFree returns the next free object from the cached span if one is available. +// Otherwise it refills the cache with a span with an available object and +// returns that object along with a flag indicating that this was a heavy +// weight allocation. 
If it is a heavy weight allocation the caller must +// determine whether a new GC cycle needs to be started or if the GC is active +// whether this goroutine needs to assist the GC. +// https://golang.org/cl/5350 motivates why this routine should preform a +// prefetch. +func (c *mcache) nextFree(sizeclass int8) (v gclinkptr, shouldhelpgc bool) { + s := c.alloc[sizeclass] + v = s.freelist + if v.ptr() == nil { + systemstack(func() { + c.refill(int32(sizeclass)) + }) + shouldhelpgc = true + s = c.alloc[sizeclass] + v = s.freelist + } + s.freelist = v.ptr().next + s.ref++ + // prefetchnta offers best performance, see change list message. + prefetchnta(uintptr(v.ptr().next)) + return +} + // Allocate an object of size bytes. // Small objects are allocated from the per-P cache's free lists. // Large objects (> 32 kB) are allocated straight from the heap. @@ -554,7 +580,6 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { shouldhelpgc := false dataSize := size c := gomcache() - var s *mspan var x unsafe.Pointer if size <= maxSmallSize { if flags&flagNoScan != 0 && size < maxTinySize { @@ -606,20 +631,8 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { return x } // Allocate a new maxTinySize block. - s = c.alloc[tinySizeClass] - v := s.freelist - if v.ptr() == nil { - systemstack(func() { - c.refill(tinySizeClass) - }) - shouldhelpgc = true - s = c.alloc[tinySizeClass] - v = s.freelist - } - s.freelist = v.ptr().next - s.ref++ - // prefetchnta offers best performance, see change list message. - prefetchnta(uintptr(v.ptr().next)) + var v gclinkptr + v, shouldhelpgc = c.nextFree(tinySizeClass) x = unsafe.Pointer(v) (*[2]uint64)(x)[0] = 0 (*[2]uint64)(x)[1] = 0 @@ -638,20 +651,8 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { sizeclass = size_to_class128[(size-1024+127)>>7] } size = uintptr(class_to_size[sizeclass]) - s = c.alloc[sizeclass] - v := s.freelist - if v.ptr() == nil { - systemstack(func() { - c.refill(int32(sizeclass)) - }) - shouldhelpgc = true - s = c.alloc[sizeclass] - v = s.freelist - } - s.freelist = v.ptr().next - s.ref++ - // prefetchnta offers best performance, see change list message. - prefetchnta(uintptr(v.ptr().next)) + var v gclinkptr + v, shouldhelpgc = c.nextFree(sizeclass) x = unsafe.Pointer(v) if flags&flagNoZero == 0 { v.ptr().next = 0 From dc65a82eff0a3af5a26f6c6d31c53bdac9b31168 Mon Sep 17 00:00:00 2001 From: Rick Hudson Date: Tue, 9 Feb 2016 09:38:44 -0500 Subject: [PATCH 05/23] [dev.garbage] runtime: mark/allocation helper functions The gcmarkBits is a bit vector used by the GC to mark reachable objects. Once a GC cycle is complete the gcmarkBits swap places with the allocBits. allocBits is then used directly by malloc to locate free objects, thus avoiding the construction of a linked free list. This CL introduces a set of helper functions for manipulating gcmarkBits and allocBits that will be used by later CLs to realize the actual algorithm. Minimal attempts have been made to optimize these helper routines. 
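Reduced to essentials, the helpers are plain bit-vector indexing plus a pointer swap at the end of the cycle. A condensed, self-contained sketch (illustrative only — toySpan, isFree, setMarked and finishCycle are simplified stand-ins for the real mspan methods):

package main

import "fmt"

const maxObjs = 1024 // stand-in for maxObjsPerSpan (pageSize/8)

// toySpan models only the bitmap-related mspan fields.
type toySpan struct {
	nelems     uintptr
	allocBits  *[maxObjs / 8]uint8
	gcmarkBits *[maxObjs / 8]uint8
	bits1      [maxObjs / 8]uint8
	bits2      [maxObjs / 8]uint8
}

// isFree reports whether object index has not been allocated.
func (s *toySpan) isFree(index uintptr) bool {
	return s.allocBits[index/8]&(1<<(index%8)) == 0
}

// setMarked records that object index was found reachable by the GC.
func (s *toySpan) setMarked(index uintptr) {
	s.gcmarkBits[index/8] |= 1 << (index % 8)
}

// finishCycle swaps the roles of the two bitmaps: the marks just produced
// become the allocation bitmap, and the old allocation bitmap is cleared
// to serve as the next cycle's mark bitmap.
func (s *toySpan) finishCycle() {
	s.allocBits, s.gcmarkBits = s.gcmarkBits, s.allocBits
	for i := range s.gcmarkBits {
		s.gcmarkBits[i] = 0
	}
}

func main() {
	s := &toySpan{nelems: 64}
	s.allocBits, s.gcmarkBits = &s.bits1, &s.bits2
	s.setMarked(3) // object 3 survives the cycle
	s.finishCycle()
	fmt.Println(s.isFree(3), s.isFree(4)) // false true
}

Swapping the two pointers, rather than copying bits, is what keeps the end-of-cycle handoff from gcmarkBits to allocBits cheap.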
Change-Id: I55ad6240ca32cd456e8ed4973c6970b3b882dd34 Reviewed-on: https://go-review.googlesource.com/19420 Reviewed-by: Austin Clements Run-TryBot: Rick Hudson TryBot-Result: Gobot Gobot --- src/runtime/malloc.go | 2 +- src/runtime/mbitmap.go | 184 ++++++++++++++++++++++++++++++++++++++-- src/runtime/mcentral.go | 2 +- 3 files changed, 180 insertions(+), 8 deletions(-) diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 70e7358e88..528a5b73ba 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -772,7 +772,7 @@ func largeAlloc(size uintptr, flag uint32) *mspan { throw("out of memory") } s.limit = uintptr(s.start)<<_PageShift + size - heapBitsForSpan(s.base()).initSpan(s.layout()) + heapBitsForSpan(s.base()).initSpan(s) return s } diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go index 685c29066b..a78efdc034 100644 --- a/src/runtime/mbitmap.go +++ b/src/runtime/mbitmap.go @@ -94,6 +94,8 @@ func addb(p *byte, n uintptr) *byte { } // subtractb returns the byte pointer p-n. +// subtractb is typically used when traversing the pointer tables referred to by hbits +// which are arranged in reverse order. //go:nowritebarrier func subtractb(p *byte, n uintptr) *byte { // Note: wrote out full expression instead of calling add(p, -n) @@ -112,6 +114,8 @@ func add1(p *byte) *byte { } // subtract1 returns the byte pointer p-1. +// subtract1 is typically used when traversing the pointer tables referred to by hbits +// which are arranged in reverse order. //go:nowritebarrier // // nosplit because it is used during write barriers and must not be preempted. @@ -158,6 +162,151 @@ type heapBits struct { shift uint32 } +// markBits provides access to the mark bit for an object in the heap. +// bytep points to the byte holding the mark bit. +// mask is a byte with a single bit set that can be &ed with *bytep +// to see if the bit has been set. +// *m.byte&m.mask != 0 indicates the mark bit is set. +// index can be used along with span information to generate +// the address of the object in the heap. +// We maintain one set of mark bits for allocation and one for +// marking purposes. +type markBits struct { + bytep *uint8 + mask uint8 + index uintptr +} + +//go:nosplit +func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits { + whichByte := allocBitIndex / 8 + whichBit := allocBitIndex % 8 + return markBits{&s.allocBits[whichByte], uint8(1 << whichBit), allocBitIndex} +} + +// nextFreeIndex returns the index of the next free object in s at or +// after the index'th object. +// There are hardware instructions that can be used to make this +// faster if profiling warrants it. +func (s *mspan) nextFreeIndex(index uintptr) uintptr { + var mask uint8 + if index == s.nelems { + return index + } + if index > s.nelems { + throw("index > s.nelems") + } + whichByte := index / 8 + theByte := s.allocBits[whichByte] + // Optimize for the first byte holding a free object. + if theByte != 0xff { + mask = 1 << (index % 8) + for index < s.nelems { + if mask&theByte == 0 { + return index + } + if mask == 1<<7 { + break + } + mask = mask << 1 + index++ + } + } + maxByteIndex := (s.nelems - 1) / 8 + theByte = 0xff // Free bit not found in this byte above so set to 0xff. + // If there was a 0 bit before incoming index then the byte would not be 0xff. 
+ for theByte == 0xff { + whichByte++ + if whichByte > maxByteIndex { + return s.nelems + } + if uintptr(len(s.allocBits)) <= whichByte { + throw("whichByte > len(s.allocBits") + } + theByte = s.allocBits[whichByte] + } + index = whichByte * 8 + mask = uint8(1) + + for index < s.nelems { + if mask&theByte == 0 { + return index + } + if mask == 1<<7 { + break + } + mask = mask << 1 + index++ + } + return index +} + +func (s *mspan) isFree(index uintptr) bool { + whichByte := index / 8 + whichBit := index % 8 + return s.allocBits[whichByte]&uint8(1<= mheap_.arena_used { + throw("heapBitsForSpan: base out of range") + } + mbits = markBitsForAddr(base) + if mbits.mask != 1 { + throw("markBitsForSpan: unaligned start") + } + return mbits +} + // heapBitsForAddr returns the heapBits for the address addr. // The caller must have already checked that addr is in the range [mheap_.arena_start, mheap_.arena_used). // @@ -174,11 +323,7 @@ func heapBitsForSpan(base uintptr) (hbits heapBits) { if base < mheap_.arena_start || base >= mheap_.arena_used { throw("heapBitsForSpan: base out of range") } - hbits = heapBitsForAddr(base) - if hbits.shift != 0 { - throw("heapBitsForSpan: unaligned start") - } - return hbits + return heapBitsForAddr(base) } // heapBitsForObject returns the base address for the heap object @@ -487,6 +632,22 @@ func typeBitsBulkBarrier(typ *_type, p, size uintptr) { } } +func (s *mspan) clearGCMarkBits() { + bytesInMarkBits := (s.nelems + 7) / 8 + bits := s.gcmarkBits[:bytesInMarkBits] + for i := range bits { + bits[i] = 0 + } +} + +func (s *mspan) clearAllocBits() { + bytesInMarkBits := (s.nelems + 7) / 8 + bits := s.allocBits[:bytesInMarkBits] + for i := range bits { + bits[i] = 0 + } +} + // The methods operating on spans all require that h has been returned // by heapBitsForSpan and that size, n, total are the span layout description // returned by the mspan's layout method. @@ -500,7 +661,18 @@ func typeBitsBulkBarrier(typ *_type, p, size uintptr) { // If this is a span of pointer-sized objects, it initializes all // words to pointer (and there are no dead bits). // Otherwise, it initializes all words to scalar/dead. -func (h heapBits) initSpan(size, n, total uintptr) { +func (h heapBits) initSpan(s *mspan) { + size, n, total := s.layout() + + // Init the markbit structures + s.allocBits = &s.markbits1 + s.gcmarkBits = &s.markbits2 + s.freeindex = 0 + s.nelems = n + s.clearAllocBits() + s.clearGCMarkBits() + + // Clear bits corresponding to objects. if total%heapBitmapScale != 0 { throw("initSpan: unaligned length") } diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go index 4f0b86c228..baca157db9 100644 --- a/src/runtime/mcentral.go +++ b/src/runtime/mcentral.go @@ -225,6 +225,6 @@ func (c *mcentral) grow() *mspan { } tail.ptr().next = 0 s.freelist = head - heapBitsForSpan(s.base()).initSpan(s.layout()) + heapBitsForSpan(s.base()).initSpan(s) return s } From 3479b065d43f2990ac12e7b00ddff6f63a876ca9 Mon Sep 17 00:00:00 2001 From: Rick Hudson Date: Thu, 11 Feb 2016 13:57:58 -0500 Subject: [PATCH 06/23] [dev.garbage] runtime: allocate directly from GC mark bits Instead of building a freelist from the mark bits generated by the GC this CL allocates directly from the mark bits. The approach moves the mark bits from the pointer/no pointer heap structures into their own per span data structures. The mark/allocation vectors consist of a single mark bit per object. Two vectors are maintained, one for allocation and one for the GC's mark phase. 
During the GC cycle's sweep phase the interpretation of the vectors is swapped. The mark vector becomes the allocation vector and the old allocation vector is cleared and becomes the mark vector that the next GC cycle will use. Marked entries in the allocation vector indicate that the object is not free. Each allocation vector maintains a boundary between areas of the span already allocated from and areas not yet allocated from. As objects are allocated this boundary is moved until it reaches the end of the span. At this point further allocations will be done from another span. Since we no longer sweep a span inspecting each freed object the responsibility for maintaining pointer/scalar bits in the heapBitMap containing is now the responsibility of the the routines doing the actual allocation. This CL is functionally complete and ready for performance tuning. Change-Id: I336e0fc21eef1066e0b68c7067cc71b9f3d50e04 Reviewed-on: https://go-review.googlesource.com/19470 Reviewed-by: Austin Clements --- src/runtime/heapdump.go | 10 +- src/runtime/malloc.go | 36 +++-- src/runtime/mbitmap.go | 308 +++++++++++++++++++++++++--------------- src/runtime/mcache.go | 13 +- src/runtime/mcentral.go | 35 ++--- src/runtime/mgcmark.go | 10 +- src/runtime/mgcsweep.go | 72 ++++------ src/runtime/mheap.go | 16 ++- src/runtime/stack.go | 1 - 9 files changed, 282 insertions(+), 219 deletions(-) diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go index e6a41f7f97..96dd6ff867 100644 --- a/src/runtime/heapdump.go +++ b/src/runtime/heapdump.go @@ -472,9 +472,13 @@ func dumpobjs() { if n > uintptr(len(freemark)) { throw("freemark array doesn't have enough entries") } - for l := s.freelist; l.ptr() != nil; l = l.ptr().next { - freemark[(uintptr(l)-p)/size] = true + + for freeIndex := s.freeindex; freeIndex < s.nelems; freeIndex++ { + if s.isFree(freeIndex) { + freemark[freeIndex] = true + } } + for j := uintptr(0); j < n; j, p = j+1, p+size { if freemark[j] { freemark[j] = false @@ -709,7 +713,7 @@ func makeheapobjbv(p uintptr, size uintptr) bitvector { i := uintptr(0) hbits := heapBitsForAddr(p) for ; i < nptr; i++ { - if i >= 2 && !hbits.isMarked() { + if i >= 2 && !hbits.morePointers() { break // end of object } if hbits.isPointer() { diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 528a5b73ba..e635682cae 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -502,23 +502,34 @@ const ( // weight allocation. If it is a heavy weight allocation the caller must // determine whether a new GC cycle needs to be started or if the GC is active // whether this goroutine needs to assist the GC. -// https://golang.org/cl/5350 motivates why this routine should preform a -// prefetch. func (c *mcache) nextFree(sizeclass int8) (v gclinkptr, shouldhelpgc bool) { s := c.alloc[sizeclass] - v = s.freelist - if v.ptr() == nil { + shouldhelpgc = false + freeIndex := s.nextFreeIndex(s.freeindex) + + if freeIndex == s.nelems { + // The span is full. + if uintptr(s.ref) != s.nelems { + throw("s.ref != s.nelems && freeIndex == s.nelems") + } systemstack(func() { c.refill(int32(sizeclass)) }) shouldhelpgc = true s = c.alloc[sizeclass] - v = s.freelist + freeIndex = s.nextFreeIndex(s.freeindex) } - s.freelist = v.ptr().next + if freeIndex >= s.nelems { + throw("freeIndex is not valid") + } + + v = gclinkptr(freeIndex*s.elemsize + s.base()) + // Advance the freeIndex. + s.freeindex = freeIndex + 1 s.ref++ - // prefetchnta offers best performance, see change list message. 
- prefetchnta(uintptr(v.ptr().next)) + if uintptr(s.ref) > s.nelems { + throw("s.ref > s.nelems") + } return } @@ -655,10 +666,8 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { v, shouldhelpgc = c.nextFree(sizeclass) x = unsafe.Pointer(v) if flags&flagNoZero == 0 { - v.ptr().next = 0 - if size > 2*sys.PtrSize && ((*[2]uintptr)(x))[1] != 0 { - memclr(unsafe.Pointer(v), size) - } + memclr(unsafe.Pointer(v), size) + // TODO:(rlh) Only clear if object is not known to be zeroed. } } } else { @@ -667,12 +676,13 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { systemstack(func() { s = largeAlloc(size, flags) }) + s.freeindex = 1 x = unsafe.Pointer(uintptr(s.start << pageShift)) size = s.elemsize } if flags&flagNoScan != 0 { - // All objects are pre-marked as noscan. Nothing to do. + heapBitsSetTypeNoScan(uintptr(x), size) } else { // If allocating a defer+arg block, now that we've picked a malloc size // large enough to hold everything, cut the "asked for" size down to diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go index a78efdc034..10446fee42 100644 --- a/src/runtime/mbitmap.go +++ b/src/runtime/mbitmap.go @@ -24,7 +24,7 @@ // In each 2-bit entry, the lower bit holds the same information as in the 1-bit // bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC. // The meaning of the high bit depends on the position of the word being described -// in its allocated object. In the first word, the high bit is the GC ``marked'' bit. +// in its allocated object. In the first word, the high bit is unused. // In the second word, the high bit is the GC ``checkmarked'' bit (see below). // In the third and later words, the high bit indicates that the object is still // being described. In these words, if a bit pair with a high bit 0 is encountered, @@ -33,12 +33,13 @@ // in the object are uninteresting to the garbage collector. // // The 2-bit entries are split when written into the byte, so that the top half -// of the byte contains 4 mark bits and the bottom half contains 4 pointer bits. +// of the byte contains 4 high bits and the bottom half contains 4 low (pointer) +// bits. // This form allows a copy from the 1-bit to the 4-bit form to keep the // pointer bits contiguous, instead of having to space them out. // // The code makes use of the fact that the zero value for a heap bitmap -// has no live pointer bit set and is (depending on position), not marked, +// has no live pointer bit set and is (depending on position), not used, // not checkmarked, and is the dead encoding. // These properties must be preserved when modifying the encoding. // @@ -63,6 +64,7 @@ // It is still used in general, except in checkmark the type bit is repurposed // as the checkmark bit and then reinitialized (to 1) as the type bit when // finished. +// package runtime @@ -254,16 +256,20 @@ func markBitsForAddr(p uintptr) markBits { func (s *mspan) markBitsForAddr(p uintptr) markBits { byteOffset := p - s.base() - markBitIndex := byteOffset / s.elemsize // TODO if hot spot use fancy divide.... 
- return s.markBitsForIndex(markBitIndex) -} - -func (s *mspan) markBitsForIndex(markBitIndex uintptr) markBits { + markBitIndex := uintptr(0) + if byteOffset != 0 { + // markBitIndex := (p - s.base()) / s.elemsize, using division by multiplication + markBitIndex = uintptr(uint64(byteOffset) >> s.divShift * uint64(s.divMul) >> s.divShift2) + } whichByte := markBitIndex / 8 whichBit := markBitIndex % 8 return markBits{&s.gcmarkBits[whichByte], uint8(1 << whichBit), markBitIndex} } +func (s *mspan) markBitsForBase() markBits { + return markBits{&s.gcmarkBits[0], uint8(1), 0} +} + // isMarked reports whether mark bit m is set. func (m markBits) isMarked() bool { return *m.bytep&m.mask != 0 @@ -307,6 +313,17 @@ func markBitsForSpan(base uintptr) (mbits markBits) { return mbits } +// advance advances the markBits to the next object in the span. +func (m *markBits) advance() { + if m.mask == 1<<7 { + m.bytep = (*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(m.bytep)) + 1)) + m.mask = 1 + } else { + m.mask = m.mask << 1 + } + m.index++ +} + // heapBitsForAddr returns the heapBits for the address addr. // The caller must have already checked that addr is in the range [mheap_.arena_start, mheap_.arena_used). // @@ -440,28 +457,13 @@ func (h heapBits) bits() uint32 { return uint32(*h.bitp) >> (h.shift & 31) } -// isMarked reports whether the heap bits have the marked bit set. -// h must describe the initial word of the object. -func (h heapBits) isMarked() bool { +// morePointers returns true if this word and all remaining words in this object +// are scalars. +// h must not describe the first or second word of the object. +func (h heapBits) morePointers() bool { return *h.bitp&(bitMarked<= s.freeindex && s.allocBits[mbits.index/8]&mbits.mask == 0) { + if doCall { f(base + i*sys.PtrSize) } - if x&(bitMarked<= s.freeindex && s.allocBits[mbits.index/8]&mbits.mask == 0) { + if doCall { f(base + (i+1)*sys.PtrSize) } - if x&(bitMarked<<(2*heapBitsShift)) != 0 { - x &^= bitMarked << (2 * heapBitsShift) - } else { + if cl != 0 { + nfree++ + } + } + mbits.advance() + if !(mbits.isMarked() || mbits.index >= s.freeindex && s.allocBits[mbits.index/8]&mbits.mask == 0) { + if doCall { f(base + (i+2)*sys.PtrSize) } - if x&(bitMarked<<(3*heapBitsShift)) != 0 { - x &^= bitMarked << (3 * heapBitsShift) - } else { + if cl != 0 { + nfree++ + } + } + mbits.advance() + if !(mbits.isMarked() || mbits.index >= s.freeindex && s.allocBits[mbits.index/8]&mbits.mask == 0) { + if doCall { f(base + (i+3)*sys.PtrSize) } - *bitp = uint8(x) - bitp = subtract1(bitp) - } - - case size%(4*sys.PtrSize) == 0: - // Mark bit is in first word of each object. - // Each object starts at bit 0 of a heap bitmap byte. - bitp := h.bitp - step := size / heapBitmapScale - for i := uintptr(0); i < n; i++ { - x := uint32(*bitp) - if x&bitMarked != 0 { - x &^= bitMarked - } else { - x = 0 - f(base + i*size) + if cl != 0 { + nfree++ } - *bitp = uint8(x) - bitp = subtractb(bitp, step) } + mbits.advance() + } + return +} - case size%(4*sys.PtrSize) == 2*sys.PtrSize: - // Mark bit is in first word of each object, - // but every other object starts halfway through a heap bitmap byte. - // Unroll loop 2x to handle alternating shift count and step size. 
- bitp := h.bitp - step := size / heapBitmapScale - var i uintptr - for i = uintptr(0); i < n; i += 2 { - x := uint32(*bitp) - if x&bitMarked != 0 { - x &^= bitMarked - } else { - x &^= bitMarked | bitPointer | (bitMarked|bitPointer)< 2*sys.PtrSize { - x = 0 +func (m *markBits) nextFreed(maxIndex uintptr, s *mspan) bool { + mByte := *m.bytep + for { + for mByte == 0xff { + if m.index >= maxIndex { + return false + } + m.index = (m.index + 8) &^ (8 - 1) + m.mask = 1 + m.bytep = add1(m.bytep) + mByte = *m.bytep + } + if m.index >= maxIndex { + return false + } + for m.index < maxIndex { + if m.mask&mByte == 0 { + if m.index < s.freeindex { + return true + } + if s.allocBits[m.index/8]&m.mask != 0 { + return true } } - *bitp = uint8(x) - if i+1 >= n { + if m.mask == 1<<7 { + m.mask = 1 + m.bytep = add1(m.bytep) + mByte = *m.bytep + m.index++ break - } - bitp = subtractb(bitp, step) - x = uint32(*bitp) - if x&(bitMarked<<(2*heapBitsShift)) != 0 { - x &^= bitMarked << (2 * heapBitsShift) } else { - x &^= (bitMarked|bitPointer)<<(2*heapBitsShift) | (bitMarked|bitPointer)<<(3*heapBitsShift) - f(base + (i+1)*size) - if size > 2*sys.PtrSize { - *subtract1(bitp) = 0 - } + m.mask = m.mask << 1 + m.index++ } - *bitp = uint8(x) - bitp = subtractb(bitp, step+1) } } + return false +} + +func heapBitsSweepMap(h heapBits, s *mspan, base, size, n uintptr, cl uint8, doCall bool, f func(uintptr)) (nfree int) { + twobits := s.markBitsForBase() + for twobits.nextFreed(n, s) { + if doCall { + f(base + twobits.index*size) + } + if cl != 0 { + nfree++ + } + twobits.advance() + } + return } // heapBitsSetType records that the new allocation [x, x+size) @@ -862,7 +892,7 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { // size is sizeof(_defer{}) (at least 6 words) and dataSize may be // arbitrarily larger. // - // The checks for size == ptrSize and size == 2*ptrSize can therefore + // The checks for size == sys.PtrSize and size == 2*sys.PtrSize can therefore // assume that dataSize == size without checking it explicitly. if sys.PtrSize == 8 && size == sys.PtrSize { @@ -902,10 +932,13 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { // (In general the number of instances of typ being allocated is // dataSize/typ.size.) if sys.PtrSize == 4 && dataSize == sys.PtrSize { - // 1 pointer. + // 1 pointer object. On 32-bit machines clear the bit for the + // unused second word. if gcphase == _GCoff { + *h.bitp &^= (bitPointer | bitMarked | ((bitPointer | bitMarked) << heapBitsShift)) << h.shift *h.bitp |= bitPointer << h.shift } else { + atomic.And8(h.bitp, ^uint8((bitPointer|bitMarked|((bitPointer|bitMarked)<>= 2 nb -= 2 // Note: no bitMarker in hb because the first two words don't get markers from us. 
if gcphase == _GCoff { + *hbitp &^= uint8((bitPointer | (bitPointer << heapBitsShift)) << (2 * heapBitsShift)) *hbitp |= uint8(hb) } else { + atomic.And8(hbitp, ^(uint8(bitPointer|bitPointer< 2*sys.PtrSize { + *bitp &^= (bitPointer | bitMarked) << (2 * heapBitsShift) + } + } else if h.shift == 2 { + *bitp &^= bitPointer<<(2*heapBitsShift) | bitPointer<<(3*heapBitsShift) + if size > 2*sys.PtrSize { + bitp = subtract1(bitp) + *bitp &^= bitPointer | bitMarked + } + } else { + throw("Type has unrecognized size") + } + } else { + throw("Type has unrecognized size") + } +} + var debugPtrmask struct { lock mutex data *byte @@ -1424,7 +1506,7 @@ func heapBitsSetTypeGCProg(h heapBits, progSize, elemSize, dataSize, allocSize u // progToPointerMask returns the 1-bit pointer mask output by the GC program prog. // size the size of the region described by prog, in bytes. -// The resulting bitvector will have no more than size/ptrSize bits. +// The resulting bitvector will have no more than size/sys.PtrSize bits. func progToPointerMask(prog *byte, size uintptr) bitvector { n := (size/sys.PtrSize + 7) / 8 x := (*[1 << 30]byte)(persistentalloc(n+1, 1, &memstats.buckhash_sys))[:n+1] @@ -1560,7 +1642,7 @@ Run: // into a register and use that register for the entire loop // instead of repeatedly reading from memory. // Handling fewer than 8 bits here makes the general loop simpler. - // The cutoff is ptrSize*8 - 7 to guarantee that when we add + // The cutoff is sys.PtrSize*8 - 7 to guarantee that when we add // the pattern to a bit buffer holding at most 7 bits (a partial byte) // it will not overflow. src := dst @@ -1855,7 +1937,7 @@ func getgcmask(ep interface{}) (mask []byte) { if hbits.isPointer() { mask[i/sys.PtrSize] = 1 } - if i >= 2*sys.PtrSize && !hbits.isMarked() { + if i >= 2*sys.PtrSize && !hbits.morePointers() { mask = mask[:i/sys.PtrSize] break } diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index 2230c5c200..424fa0efac 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -108,9 +108,11 @@ func (c *mcache) refill(sizeclass int32) *mspan { _g_.m.locks++ // Return the current cached span to the central lists. s := c.alloc[sizeclass] - if s.freelist.ptr() != nil { - throw("refill on a nonempty span") + + if uintptr(s.ref) != s.nelems { + throw("refill of span with free space remaining") } + if s != &emptymspan { s.incache = false } @@ -120,10 +122,11 @@ func (c *mcache) refill(sizeclass int32) *mspan { if s == nil { throw("out of memory") } - if s.freelist.ptr() == nil { - println(s.ref, (s.npages<<_PageShift)/s.elemsize) - throw("empty span") + + if uintptr(s.ref) == s.nelems { + throw("span has no free space") } + c.alloc[sizeclass] = s _g_.m.locks-- return s diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go index baca157db9..47d3ae2f81 100644 --- a/src/runtime/mcentral.go +++ b/src/runtime/mcentral.go @@ -18,7 +18,7 @@ import "runtime/internal/atomic" type mcentral struct { lock mutex sizeclass int32 - nonempty mSpanList // list of spans with a free object + nonempty mSpanList // list of spans with a free object, ie a nonempty free list empty mSpanList // list of spans with no free objects (or cached in an mcache) } @@ -67,7 +67,9 @@ retry: c.empty.insertBack(s) unlock(&c.lock) s.sweep(true) - if s.freelist.ptr() != nil { + freeIndex := s.nextFreeIndex(0) + if freeIndex != s.nelems { + s.freeindex = freeIndex goto havespan } lock(&c.lock) @@ -115,9 +117,6 @@ havespan: // heap_live changed. 
gcController.revise() } - if s.freelist.ptr() == nil { - throw("freelist empty") - } s.incache = true return s } @@ -150,15 +149,11 @@ func (c *mcentral) uncacheSpan(s *mspan) { // the latest generation. // If preserve=true, don't return the span to heap nor relink in MCentral lists; // caller takes care of it. -func (c *mcentral) freeSpan(s *mspan, n int32, start gclinkptr, end gclinkptr, preserve bool) bool { +func (c *mcentral) freeSpan(s *mspan, n int32, start gclinkptr, end gclinkptr, preserve bool, wasempty bool) bool { if s.incache { - throw("freespan into cached span") + throw("freeSpan given cached span") } - // Add the objects back to s's free list. - wasempty := s.freelist.ptr() == nil - end.ptr().next = s.freelist - s.freelist = start s.ref -= uint16(n) if preserve { @@ -190,16 +185,14 @@ func (c *mcentral) freeSpan(s *mspan, n int32, start gclinkptr, end gclinkptr, p return false } - // s is completely freed, return it to the heap. c.nonempty.remove(s) s.needzero = 1 - s.freelist = 0 unlock(&c.lock) mheap_.freeSpan(s, 0) return true } -// Fetch a new span from the heap and carve into objects for the free list. +// grow allocates a new empty span from the heap and initializes it for c's size class. func (c *mcentral) grow() *mspan { npages := uintptr(class_to_allocnpages[c.sizeclass]) size := uintptr(class_to_size[c.sizeclass]) @@ -212,19 +205,7 @@ func (c *mcentral) grow() *mspan { p := uintptr(s.start << _PageShift) s.limit = p + size*n - head := gclinkptr(p) - tail := gclinkptr(p) - // i==0 iteration already done - for i := uintptr(1); i < n; i++ { - p += size - tail.ptr().next = gclinkptr(p) - tail = gclinkptr(p) - } - if s.freelist.ptr() != nil { - throw("freelist not empty") - } - tail.ptr().next = 0 - s.freelist = head + heapBitsForSpan(s.base()).initSpan(s) return s } diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index 66d61bae1e..fe8a56460b 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -1044,9 +1044,9 @@ func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork if obj&(sys.PtrSize-1) != 0 { throw("greyobject: obj not pointer-aligned") } - + mbits := span.markBitsForAddr(obj) if useCheckmark { - if !hbits.isMarked() { + if !mbits.isMarked() { printlock() print("runtime:greyobject: checkmarks finds unexpected unmarked object obj=", hex(obj), "\n") print("runtime: found obj at *(", hex(base), "+", hex(off), ")\n") @@ -1068,10 +1068,10 @@ func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork } } else { // If marked we have nothing to do. - if hbits.isMarked() { + if mbits.isMarked() { return } - hbits.setMarked() + mbits.setMarked() // If this is a noscan object, fast-track it to black // instead of greying it. @@ -1138,7 +1138,7 @@ func gcmarknewobject_m(obj, size uintptr) { if useCheckmark && !gcBlackenPromptly { // The world should be stopped so this should not happen. throw("gcmarknewobject called while doing checkmark") } - heapBitsForAddr(obj).setMarked() + markBitsForAddr(obj).setMarked() atomic.Xadd64(&work.bytesMarked, int64(size)) } diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go index 31d1a80183..7a1a76cbad 100644 --- a/src/runtime/mgcsweep.go +++ b/src/runtime/mgcsweep.go @@ -192,16 +192,13 @@ func (s *mspan) sweep(preserve bool) bool { c := _g_.m.mcache freeToHeap := false - // Mark any free objects in this span so we don't collect them. 
- sstart := uintptr(s.start << _PageShift) - for link := s.freelist; link.ptr() != nil; link = link.ptr().next { - if uintptr(link) < sstart || s.limit <= uintptr(link) { - // Free list is corrupted. - dumpFreeList(s) - throw("free list corrupted") - } - heapBitsForAddr(uintptr(link)).setMarkedNonAtomic() - } + // The allocBits indicate which unmarked objects don't need to be + // processed since they were free at the end of the last GC cycle + // and were not allocated since then. + // If the allocBits index is >= s.freeindex and the bit + // is not marked then the object remains unallocated + // since the last GC. + // This situation is analogous to being on a freelist. // Unlink & free special records for any objects we're about to free. // Two complications here: @@ -216,8 +213,8 @@ func (s *mspan) sweep(preserve bool) bool { for special != nil { // A finalizer can be set for an inner byte of an object, find object beginning. p := uintptr(s.start<<_PageShift) + uintptr(special.offset)/size*size - hbits := heapBitsForAddr(p) - if !hbits.isMarked() { + mbits := s.markBitsForAddr(p) + if !mbits.isMarked() { // This object is not marked and has at least one special record. // Pass 1: see if it has at least one finalizer. hasFin := false @@ -225,7 +222,7 @@ func (s *mspan) sweep(preserve bool) bool { for tmp := special; tmp != nil && uintptr(tmp.offset) < endOffset; tmp = tmp.next { if tmp.kind == _KindSpecialFinalizer { // Stop freeing of object if it has a finalizer. - hbits.setMarkedNonAtomic() + mbits.setMarkedNonAtomic() hasFin = true break } @@ -259,8 +256,7 @@ func (s *mspan) sweep(preserve bool) bool { // This thread owns the span now, so it can manipulate // the block bitmap without atomic operations. - size, n, _ := s.layout() - heapBitsSweepSpan(s.base(), size, n, func(p uintptr) { + nfree = heapBitsSweepSpan(s, func(p uintptr) { // At this point we know that we are looking at garbage object // that needs to be collected. if debug.allocfreetrace != 0 { @@ -288,17 +284,18 @@ func (s *mspan) sweep(preserve bool) bool { } else if size > sys.PtrSize { *(*uintptr)(unsafe.Pointer(p + sys.PtrSize)) = 0 } - if head.ptr() == nil { - head = gclinkptr(p) - } else { - end.ptr().next = gclinkptr(p) - } - end = gclinkptr(p) - end.ptr().next = gclinkptr(0x0bade5) - nfree++ } }) + wasempty := s.nextFreeIndex(s.freeindex) == s.nelems + + s.freeindex = 0 // reset allocation index to start of span. + + // Swap role of allocBits with gcmarkBits + // Clear gcmarkBits in preparation for next GC + s.allocBits, s.gcmarkBits = s.gcmarkBits, s.allocBits + s.clearGCMarkBits() // prepare for next GC + // We need to set s.sweepgen = h.sweepgen only when all blocks are swept, // because of the potential for a concurrent free/SetFinalizer. // But we need to set it before we make the span available for allocation @@ -311,11 +308,14 @@ func (s *mspan) sweep(preserve bool) bool { print("MSpan_Sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n") throw("MSpan_Sweep: bad span state after sweep") } + // Serialization point. + // At this point the mark bits are cleared and allocation ready + // to go so release the span. 
atomic.Store(&s.sweepgen, sweepgen) } if nfree > 0 { c.local_nsmallfree[cl] += uintptr(nfree) - res = mheap_.central[cl].mcentral.freeSpan(s, int32(nfree), head, end, preserve) + res = mheap_.central[cl].mcentral.freeSpan(s, int32(nfree), head, end, preserve, wasempty) // MCentral_FreeSpan updates sweepgen } else if freeToHeap { // Free large span to heap @@ -399,27 +399,3 @@ func reimburseSweepCredit(unusableBytes uintptr) { throw("spanBytesAlloc underflow") } } - -func dumpFreeList(s *mspan) { - printlock() - print("runtime: free list of span ", s, ":\n") - sstart := uintptr(s.start << _PageShift) - link := s.freelist - for i := 0; i < int(s.npages*_PageSize/s.elemsize); i++ { - if i != 0 { - print(" -> ") - } - print(hex(link)) - if link.ptr() == nil { - break - } - if uintptr(link) < sstart || s.limit <= uintptr(link) { - // Bad link. Stop walking before we crash. - print(" (BAD)") - break - } - link = link.ptr().next - } - print("\n") - printunlock() -} diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index a3d34a360e..d5dde5e72e 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -119,8 +119,7 @@ type mspan struct { start pageID // starting page number npages uintptr // number of pages in span - freelist gclinkptr // list of free objects for _MSpanInUse - stackfreelist gclinkptr // list of free stacks, avoids overloading freelist for _MSpanStack + stackfreelist gclinkptr // list of free stacks, avoids overloading freelist // freeindex is the slot index between 0 and nelems at which to begin scanning // for the next free object in this span. @@ -472,7 +471,6 @@ func (h *mheap) alloc_m(npage uintptr, sizeclass int32, large bool) *mspan { // able to map interior pointer to containing span. atomic.Store(&s.sweepgen, h.sweepgen) s.state = _MSpanInUse - s.freelist = 0 s.ref = 0 s.sizeclass = uint8(sizeclass) if sizeclass == 0 { @@ -914,7 +912,6 @@ func (span *mspan) init(start pageID, npages uintptr) { span.list = nil span.start = start span.npages = npages - span.freelist = 0 span.ref = 0 span.sizeclass = 0 span.incache = false @@ -925,6 +922,17 @@ func (span *mspan) init(start pageID, npages uintptr) { span.speciallock.key = 0 span.specials = nil span.needzero = 0 + span.freeindex = 0 + span.allocBits = &span.markbits1 + span.gcmarkBits = &span.markbits2 + // determine if this is actually needed. It is once / span so it + // isn't expensive. This is to be replaced by an arena + // based system where things can be cleared all at once so + // don't worry about optimizing this. + for i := 0; i < len(span.markbits1); i++ { + span.allocBits[i] = 0 + span.gcmarkBits[i] = 0 + } } func (span *mspan) inList() bool { diff --git a/src/runtime/stack.go b/src/runtime/stack.go index 5e373f1b94..8fd7ef2bcf 100644 --- a/src/runtime/stack.go +++ b/src/runtime/stack.go @@ -1137,7 +1137,6 @@ func freeStackSpans() { next := s.next if s.ref == 0 { list.remove(s) - s.freelist = 0 s.stackfreelist = 0 mheap_.freeStack(s) } From e4ac2d4acc8cb44df2107e3fa1067755feaaa005 Mon Sep 17 00:00:00 2001 From: Rick Hudson Date: Tue, 16 Feb 2016 17:16:43 -0500 Subject: [PATCH 07/23] [dev.garbage] runtime: replace ref with allocCount This is a renaming of the field ref to the more appropriate allocCount. The field holds the number of objects in the span that are currently allocated. Some throws strings were adjusted to more accurately convey the meaning of allocCount. 
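The renamed field carries a simple invariant that the adjusted throw strings check: 0 <= allocCount <= nelems, with equality meaning the span has no free object left. A tiny sketch of that check (spanFull is a hypothetical helper, not part of the patch):

package main

import "fmt"

// spanFull reports whether a span with nelems object slots and allocCount
// allocated objects has no free slot left; allocCount exceeding nelems
// would indicate a corrupted span, matching the new runtime throws.
func spanFull(allocCount, nelems uintptr) bool {
	if allocCount > nelems {
		panic("s.allocCount > s.nelems")
	}
	return allocCount == nelems
}

func main() {
	fmt.Println(spanFull(8, 8), spanFull(3, 8)) // true false
}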
Change-Id: I10daf44e3e9cc24a10912638c7de3c1984ef8efe Reviewed-on: https://go-review.googlesource.com/19518 Reviewed-by: Austin Clements --- src/runtime/malloc.go | 10 +++++----- src/runtime/mcache.go | 4 ++-- src/runtime/mcentral.go | 16 ++++++++-------- src/runtime/mheap.go | 14 +++++++------- src/runtime/mstats.go | 6 +++--- src/runtime/stack.go | 12 ++++++------ 6 files changed, 31 insertions(+), 31 deletions(-) diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index e635682cae..6db323a8d3 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -509,8 +509,8 @@ func (c *mcache) nextFree(sizeclass int8) (v gclinkptr, shouldhelpgc bool) { if freeIndex == s.nelems { // The span is full. - if uintptr(s.ref) != s.nelems { - throw("s.ref != s.nelems && freeIndex == s.nelems") + if uintptr(s.allocCount) != s.nelems { + throw("s.allocCount != s.nelems && freeIndex == s.nelems") } systemstack(func() { c.refill(int32(sizeclass)) @@ -526,9 +526,9 @@ func (c *mcache) nextFree(sizeclass int8) (v gclinkptr, shouldhelpgc bool) { v = gclinkptr(freeIndex*s.elemsize + s.base()) // Advance the freeIndex. s.freeindex = freeIndex + 1 - s.ref++ - if uintptr(s.ref) > s.nelems { - throw("s.ref > s.nelems") + s.allocCount++ + if uintptr(s.allocCount) > s.nelems { + throw("s.allocCount > s.nelems") } return } diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index 424fa0efac..5938e53ca8 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -109,7 +109,7 @@ func (c *mcache) refill(sizeclass int32) *mspan { // Return the current cached span to the central lists. s := c.alloc[sizeclass] - if uintptr(s.ref) != s.nelems { + if uintptr(s.allocCount) != s.nelems { throw("refill of span with free space remaining") } @@ -123,7 +123,7 @@ func (c *mcache) refill(sizeclass int32) *mspan { throw("out of memory") } - if uintptr(s.ref) == s.nelems { + if uintptr(s.allocCount) == s.nelems { throw("span has no free space") } diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go index 47d3ae2f81..5dafa28450 100644 --- a/src/runtime/mcentral.go +++ b/src/runtime/mcentral.go @@ -100,11 +100,11 @@ retry: // c is unlocked. havespan: cap := int32((s.npages << _PageShift) / s.elemsize) - n := cap - int32(s.ref) + n := cap - int32(s.allocCount) if n == 0 { - throw("empty span") + throw("span has no free objects") } - usedBytes := uintptr(s.ref) * s.elemsize + usedBytes := uintptr(s.allocCount) * s.elemsize if usedBytes > 0 { reimburseSweepCredit(usedBytes) } @@ -127,12 +127,12 @@ func (c *mcentral) uncacheSpan(s *mspan) { s.incache = false - if s.ref == 0 { - throw("uncaching full span") + if s.allocCount == 0 { + throw("uncaching span but s.allocCount == 0") } cap := int32((s.npages << _PageShift) / s.elemsize) - n := cap - int32(s.ref) + n := cap - int32(s.allocCount) if n > 0 { c.empty.remove(s) c.nonempty.insert(s) @@ -154,7 +154,7 @@ func (c *mcentral) freeSpan(s *mspan, n int32, start gclinkptr, end gclinkptr, p throw("freeSpan given cached span") } - s.ref -= uint16(n) + s.allocCount -= uint16(n) if preserve { // preserve is set only when called from MCentral_CacheSpan above, @@ -180,7 +180,7 @@ func (c *mcentral) freeSpan(s *mspan, n int32, start gclinkptr, end gclinkptr, p // lock of c above.) 
atomic.Store(&s.sweepgen, mheap_.sweepgen) - if s.ref != 0 { + if s.allocCount != 0 { unlock(&c.lock) return false } diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index d5dde5e72e..cd35acb6dd 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -159,7 +159,7 @@ type mspan struct { sweepgen uint32 divMul uint32 // for divide by elemsize - divMagic.mul - ref uint16 // capacity - number of objects in freelist + allocCount uint16 // capacity - number of objects in freelist sizeclass uint8 // size class incache bool // being used by an mcache state uint8 // mspaninuse etc @@ -471,7 +471,7 @@ func (h *mheap) alloc_m(npage uintptr, sizeclass int32, large bool) *mspan { // able to map interior pointer to containing span. atomic.Store(&s.sweepgen, h.sweepgen) s.state = _MSpanInUse - s.ref = 0 + s.allocCount = 0 s.sizeclass = uint8(sizeclass) if sizeclass == 0 { s.elemsize = s.npages << _PageShift @@ -551,7 +551,7 @@ func (h *mheap) allocStack(npage uintptr) *mspan { if s != nil { s.state = _MSpanStack s.stackfreelist = 0 - s.ref = 0 + s.allocCount = 0 memstats.stacks_inuse += uint64(s.npages << _PageShift) } @@ -773,12 +773,12 @@ func (h *mheap) freeStack(s *mspan) { func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool, unusedsince int64) { switch s.state { case _MSpanStack: - if s.ref != 0 { + if s.allocCount != 0 { throw("MHeap_FreeSpanLocked - invalid stack free") } case _MSpanInUse: - if s.ref != 0 || s.sweepgen != h.sweepgen { - print("MHeap_FreeSpanLocked - span ", s, " ptr ", hex(s.start<<_PageShift), " ref ", s.ref, " sweepgen ", s.sweepgen, "/", h.sweepgen, "\n") + if s.allocCount != 0 || s.sweepgen != h.sweepgen { + print("MHeap_FreeSpanLocked - span ", s, " ptr ", hex(s.start<<_PageShift), " allocCount ", s.allocCount, " sweepgen ", s.sweepgen, "/", h.sweepgen, "\n") throw("MHeap_FreeSpanLocked - invalid free") } h.pagesInUse -= uint64(s.npages) @@ -912,7 +912,7 @@ func (span *mspan) init(start pageID, npages uintptr) { span.list = nil span.start = start span.npages = npages - span.ref = 0 + span.allocCount = 0 span.sizeclass = 0 span.incache = false span.elemsize = 0 diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index 84a79e312c..2d75d2fef1 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -295,9 +295,9 @@ func updatememstats(stats *gcstats) { memstats.nmalloc++ memstats.alloc += uint64(s.elemsize) } else { - memstats.nmalloc += uint64(s.ref) - memstats.by_size[s.sizeclass].nmalloc += uint64(s.ref) - memstats.alloc += uint64(s.ref) * uint64(s.elemsize) + memstats.nmalloc += uint64(s.allocCount) + memstats.by_size[s.sizeclass].nmalloc += uint64(s.allocCount) + memstats.alloc += uint64(s.allocCount) * uint64(s.elemsize) } } unlock(&mheap_.lock) diff --git a/src/runtime/stack.go b/src/runtime/stack.go index 8fd7ef2bcf..1ca737e920 100644 --- a/src/runtime/stack.go +++ b/src/runtime/stack.go @@ -191,8 +191,8 @@ func stackpoolalloc(order uint8) gclinkptr { if s == nil { throw("out of memory") } - if s.ref != 0 { - throw("bad ref") + if s.allocCount != 0 { + throw("bad allocCount") } if s.stackfreelist.ptr() != nil { throw("bad stackfreelist") @@ -209,7 +209,7 @@ func stackpoolalloc(order uint8) gclinkptr { throw("span has no free stacks") } s.stackfreelist = x.ptr().next - s.ref++ + s.allocCount++ if s.stackfreelist.ptr() == nil { // all stacks in s are allocated. 
list.remove(s) @@ -229,8 +229,8 @@ func stackpoolfree(x gclinkptr, order uint8) { } x.ptr().next = s.stackfreelist s.stackfreelist = x - s.ref-- - if gcphase == _GCoff && s.ref == 0 { + s.allocCount-- + if gcphase == _GCoff && s.allocCount == 0 { // Span is completely free. Return it to the heap // immediately if we're sweeping. // @@ -1135,7 +1135,7 @@ func freeStackSpans() { list := &stackpool[order] for s := list.first; s != nil; { next := s.next - if s.ref == 0 { + if s.allocCount == 0 { list.remove(s) s.stackfreelist = 0 mheap_.freeStack(s) From 44fe90d0b393c961e3fb1b4c37e93ce268da46bc Mon Sep 17 00:00:00 2001 From: Rick Hudson Date: Wed, 17 Feb 2016 11:27:52 -0500 Subject: [PATCH 08/23] [dev.garbage] runtime: logic that uses count trailing zero (ctz) Most (all?) processors that Go supports supply a hardware instruction that takes a byte and returns the number of zeros trailing the first 1 encountered, or 8 if no ones are found. This is the index within the byte of the first 1 encountered. CTZ should improve the performance of the nextFreeIndex function. Since nextFreeIndex wants the next unmarked (0) bit a bit-wise complement is needed before calling ctz. Furthermore unmarked bits associated with previously allocated objects need to be ignored. Instead of writing a 1 as we allocate the code masks all bits less than the freeindex after loading the byte. While this CL does not actual execute a CTZ instruction it supplies a ctz function with the appropiate signature along with the logic to execute it. Change-Id: I5c55ce0ed48ca22c21c4dd9f969b0819b4eadaa7 Reviewed-on: https://go-review.googlesource.com/20169 Reviewed-by: Keith Randall Reviewed-by: Austin Clements --- src/runtime/mbitmap.go | 71 ++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 37 deletions(-) diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go index 10446fee42..f02558bed0 100644 --- a/src/runtime/mbitmap.go +++ b/src/runtime/mbitmap.go @@ -186,12 +186,22 @@ func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits { return markBits{&s.allocBits[whichByte], uint8(1 << whichBit), allocBitIndex} } +// A temporary stand in for the count trailing zero ctz instruction. +func ctz(markBits byte) uint8 { + tz := uint8(0) // trailing zero count. + if markBits == 0 { + return 8 // 8 + } + for mask := byte(1); mask&markBits == 0; mask, tz = mask<<1, tz+1 { + } + return tz +} + // nextFreeIndex returns the index of the next free object in s at or // after the index'th object. // There are hardware instructions that can be used to make this // faster if profiling warrants it. func (s *mspan) nextFreeIndex(index uintptr) uintptr { - var mask uint8 if index == s.nelems { return index } @@ -200,47 +210,34 @@ func (s *mspan) nextFreeIndex(index uintptr) uintptr { } whichByte := index / 8 theByte := s.allocBits[whichByte] - // Optimize for the first byte holding a free object. - if theByte != 0xff { - mask = 1 << (index % 8) - for index < s.nelems { - if mask&theByte == 0 { - return index - } - if mask == 1<<7 { - break - } - mask = mask << 1 - index++ - } - } - maxByteIndex := (s.nelems - 1) / 8 - theByte = 0xff // Free bit not found in this byte above so set to 0xff. - // If there was a 0 bit before incoming index then the byte would not be 0xff. - for theByte == 0xff { - whichByte++ - if whichByte > maxByteIndex { + + theBitMask := uint8(1<<(index%8) - 1) + // theBitMask holds a 1 for every bit < index which have already been allocated. + // Flip the masked marked bits so 1 means a free bit. 
+ theByte = ^(theByte | theBitMask) + tz := ctz(theByte) + if tz != 8 { + result := uintptr(tz) + whichByte*8 + if result >= s.nelems { return s.nelems } - if uintptr(len(s.allocBits)) <= whichByte { - throw("whichByte > len(s.allocBits") - } - theByte = s.allocBits[whichByte] + return result } - index = whichByte * 8 - mask = uint8(1) - - for index < s.nelems { - if mask&theByte == 0 { - return index + whichByte++ + index = (whichByte) * 8 + for ; index < s.nelems; index += 8 { + theByte = ^s.allocBits[whichByte] + tz = ctz(theByte) + if tz != 8 { + result := uintptr(tz) + whichByte*8 + if result >= s.nelems { + return s.nelems + } + return result } - if mask == 1<<7 { - break - } - mask = mask << 1 - index++ + whichByte++ } - return index + return s.nelems } func (s *mspan) isFree(index uintptr) bool { From 4093481523b1e064e998d5d586276db45f4d11a7 Mon Sep 17 00:00:00 2001 From: Rick Hudson Date: Wed, 24 Feb 2016 14:36:30 -0500 Subject: [PATCH 09/23] [dev.garbage] runtime: add bit and cache ctz64 (count trailing zero) Add to each span a 64 bit cache (allocCache) of the allocBits at freeindex. allocCache is shifted such that the lowest bit corresponds to the bit freeindex. allocBits uses a 0 to indicate an object is free, on the other hand allocCache uses a 1 to indicate an object is free. This facilitates ctz64 (count trailing zero) which counts the number of 0s trailing the least significant 1. This is also the index of the least significant 1. Each span maintains a freeindex indicating the boundary between allocated objects and unallocated objects. allocCache is shifted as freeindex is incremented such that the low bit in allocCache corresponds to the bit a freeindex in the allocBits array. Currently ctz64 is written in Go using a for loop so it is not very efficient. Use of the hardware instruction will follow. With this in mind comparisons of the garbage benchmark are as follows. 1.6 release 2.8 seconds dev:garbage branch 3.1 seconds. Profiling shows the go implementation of ctz64 takes up 1% of the total time. Change-Id: If084ed9c3b1eda9f3c6ab2e794625cb870b8167f Reviewed-on: https://go-review.googlesource.com/20200 Reviewed-by: Austin Clements --- src/runtime/malloc.go | 15 ++-- src/runtime/mbitmap.go | 178 ++++++++++++++++++++++------------------ src/runtime/mcentral.go | 31 ++++--- src/runtime/mgcsweep.go | 13 +-- src/runtime/mheap.go | 14 +++- 5 files changed, 143 insertions(+), 108 deletions(-) diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 6db323a8d3..574ce3dafc 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -505,29 +505,30 @@ const ( func (c *mcache) nextFree(sizeclass int8) (v gclinkptr, shouldhelpgc bool) { s := c.alloc[sizeclass] shouldhelpgc = false - freeIndex := s.nextFreeIndex(s.freeindex) - + freeIndex := s.nextFreeIndex() if freeIndex == s.nelems { // The span is full. - if uintptr(s.allocCount) != s.nelems { - throw("s.allocCount != s.nelems && freeIndex == s.nelems") + if uintptr(s.allocCount) > s.nelems { + println("runtime: s.allocCount=", s.allocCount, "s.nelems=", s.nelems) + throw("s.allocCount > s.nelems && freeIndex == s.nelems") } systemstack(func() { c.refill(int32(sizeclass)) }) shouldhelpgc = true s = c.alloc[sizeclass] - freeIndex = s.nextFreeIndex(s.freeindex) + + freeIndex = s.nextFreeIndex() } + if freeIndex >= s.nelems { throw("freeIndex is not valid") } v = gclinkptr(freeIndex*s.elemsize + s.base()) - // Advance the freeIndex. 
- s.freeindex = freeIndex + 1 s.allocCount++ if uintptr(s.allocCount) > s.nelems { + println("s.allocCount=", s.allocCount, "s.nelems=", s.nelems) throw("s.allocCount > s.nelems") } return diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go index f02558bed0..910c4fa844 100644 --- a/src/runtime/mbitmap.go +++ b/src/runtime/mbitmap.go @@ -187,57 +187,84 @@ func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits { } // A temporary stand in for the count trailing zero ctz instruction. -func ctz(markBits byte) uint8 { - tz := uint8(0) // trailing zero count. +// IA bsf works on 64 bit non-zero word. +func ctz64(markBits uint64) uint64 { if markBits == 0 { - return 8 // 8 + return 64 // bits in 64 bit word, ensures loop terminates } - for mask := byte(1); mask&markBits == 0; mask, tz = mask<<1, tz+1 { + // tz holds trailing zero count. + tz := uint64(0) + for mask := uint64(1); mask&markBits == 0; mask, tz = mask<<1, tz+1 { } return tz } -// nextFreeIndex returns the index of the next free object in s at or -// after the index'th object. +// refillAllocCache takes 8 bytes s.allocBits starting at whichByte +// and negates them so that ctz (count trailing zeros) instructions +// can be used. It then places these 8 bytes into the cached 64 bit +// s.allocCache. +func (s *mspan) refillAllocCache(whichByte uintptr) { + bytes := s.allocBits[whichByte : whichByte+8] + aCache := uint64(0) + aCache |= uint64(bytes[0]) + aCache |= uint64(bytes[1]) << (1 * 8) + aCache |= uint64(bytes[2]) << (2 * 8) + aCache |= uint64(bytes[3]) << (3 * 8) + aCache |= uint64(bytes[4]) << (4 * 8) + aCache |= uint64(bytes[5]) << (5 * 8) + aCache |= uint64(bytes[6]) << (6 * 8) + aCache |= uint64(bytes[7]) << (7 * 8) + s.allocCache = ^aCache +} + +// nextFreeIndex returns the index of the next free object in s at +// or after s.freeindex. // There are hardware instructions that can be used to make this // faster if profiling warrants it. -func (s *mspan) nextFreeIndex(index uintptr) uintptr { - if index == s.nelems { - return index +func (s *mspan) nextFreeIndex() uintptr { + if s.freeindex == s.nelems { + return s.freeindex } - if index > s.nelems { - throw("index > s.nelems") + if s.freeindex > s.nelems { + throw("s.freeindex > s.nelems") } - whichByte := index / 8 - theByte := s.allocBits[whichByte] - theBitMask := uint8(1<<(index%8) - 1) - // theBitMask holds a 1 for every bit < index which have already been allocated. - // Flip the masked marked bits so 1 means a free bit. - theByte = ^(theByte | theBitMask) - tz := ctz(theByte) - if tz != 8 { - result := uintptr(tz) + whichByte*8 - if result >= s.nelems { - return s.nelems + aCache := s.allocCache + bitIndex := ctz64(aCache) + for bitIndex == 64 { + // Move index to start of next cached bits. + s.freeindex = (s.freeindex + 64) &^ (64 - 1) + if s.freeindex >= s.nelems { + s.freeindex = s.nelems + return s.freeindex } - return result + whichByte := s.freeindex / 8 + // Refill s.allocCache with the next 64 alloc bits. + // Unlike in allocBits a 1 in s.allocCache means + // the object is not marked. + s.refillAllocCache(whichByte) + aCache = s.allocCache + bitIndex = ctz64(aCache) + // Nothing was available try again now allocCache has been refilled. 
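A self-contained sketch of the 64-bit cache described above, assuming a plain byte slice stands in for allocBits and using math/bits.TrailingZeros64 (which postdates this patch series) in place of the runtime's ctz64:

package main

import (
	"fmt"
	"math/bits"
)

// nextFreeInWindow builds a 64-bit cache from the 8 bytes of allocBits that
// cover freeindex, complements it so a 1 bit means "free", discards the bits
// below freeindex, and uses a trailing zero count to find the next free slot.
func nextFreeInWindow(allocBits []byte, freeindex uint) (uint, bool) {
	base := freeindex &^ 63 // first object covered by this 64-bit window
	var cache uint64
	for i := uint(0); i < 8; i++ {
		cache |= uint64(allocBits[base/8+i]) << (8 * i)
	}
	cache = ^cache             // 1 now means free, matching allocCache
	cache >>= freeindex - base // low bit now corresponds to freeindex
	tz := bits.TrailingZeros64(cache)
	if tz == 64 {
		return 0, false // no free object in this window
	}
	return freeindex + uint(tz), true
}

func main() {
	alloc := make([]byte, 8)
	alloc[0] = 0xff                         // objects 0..7 allocated
	fmt.Println(nextFreeInWindow(alloc, 3)) // prints "8 true"
}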
} - whichByte++ - index = (whichByte) * 8 - for ; index < s.nelems; index += 8 { - theByte = ^s.allocBits[whichByte] - tz = ctz(theByte) - if tz != 8 { - result := uintptr(tz) + whichByte*8 - if result >= s.nelems { - return s.nelems - } - return result - } - whichByte++ + result := s.freeindex + uintptr(bitIndex) + if result >= s.nelems { + s.freeindex = s.nelems + return s.freeindex } - return s.nelems + s.allocCache >>= bitIndex + 1 + s.freeindex = result + 1 + + if s.freeindex%64 == 0 && s.freeindex != s.nelems { + // We just incremented s.freeindex so it isn't 0. + // As each 1 in s.allocCache was encountered and used for allocation + // it was shifted away. At this point s.allocCache contains all 0s. + // Refill s.allocCache so that it corresponds + // to the bits at s.allocBits starting at s.freeindex. + whichByte := s.freeindex / 8 + s.refillAllocCache(whichByte) + } + return result } func (s *mspan) isFree(index uintptr) bool { @@ -667,6 +694,7 @@ func (h heapBits) initSpan(s *mspan) { s.allocBits = &s.markbits1 s.gcmarkBits = &s.markbits2 s.freeindex = 0 + s.allocCache = ^uint64(0) // all 1s indicating all free. s.nelems = n s.clearAllocBits() s.clearGCMarkBits() @@ -746,7 +774,6 @@ func heapBitsSweepSpan(s *mspan, f func(uintptr)) (nfree int) { n := s.nelems cl := s.sizeclass doCall := debug.allocfreetrace != 0 || msanenabled || cl == 0 - h := heapBitsForSpan(base) switch { default: @@ -763,69 +790,58 @@ func heapBitsSweepSpan(s *mspan, f func(uintptr)) (nfree int) { func heapBitsSweep8BitPtrs(h heapBits, s *mspan, base, n uintptr, cl uint8, doCall bool, f func(uintptr)) (nfree int) { mbits := s.markBitsForBase() - for i := uintptr(0); i < n; i += 4 { + // Consider mark bits in all four 2-bit entries of each bitmap byte. + if cl == 0 { + throw("8BitPtrs are not in cl 0") + } + // Consider mark bits in all four 2-bit entries of each bitmap byte. + for i := uintptr(0); i < n; i++ { // Note that unlike the other size cases, we leave the pointer bits set here. // These are initialized during initSpan when the span is created and left // in place the whole time the span is used for pointer-sized objects. // That lets heapBitsSetType avoid an atomic update to set the pointer bit // during allocation. - if !(mbits.isMarked() || mbits.index >= s.freeindex && s.allocBits[mbits.index/8]&mbits.mask == 0) { - if doCall { + if !mbits.isMarked() { + nfree++ + if mbits.index < s.freeindex { + f(base + i*sys.PtrSize) + } else if s.allocBits[mbits.index/8]&mbits.mask == 1 { + // it was marked in the previous cycle but not this cycle + // if it wasn't marked in the prvious cycle the call would be redundant. f(base + i*sys.PtrSize) - } - if cl != 0 { - nfree++ - } - } - mbits.advance() - if !(mbits.isMarked() || mbits.index >= s.freeindex && s.allocBits[mbits.index/8]&mbits.mask == 0) { - if doCall { - f(base + (i+1)*sys.PtrSize) - } - if cl != 0 { - nfree++ - } - } - mbits.advance() - if !(mbits.isMarked() || mbits.index >= s.freeindex && s.allocBits[mbits.index/8]&mbits.mask == 0) { - if doCall { - f(base + (i+2)*sys.PtrSize) - } - if cl != 0 { - nfree++ - } - } - mbits.advance() - if !(mbits.isMarked() || mbits.index >= s.freeindex && s.allocBits[mbits.index/8]&mbits.mask == 0) { - if doCall { - f(base + (i+3)*sys.PtrSize) - } - if cl != 0 { - nfree++ } } mbits.advance() } - return + return nfree } -func (m *markBits) nextFreed(maxIndex uintptr, s *mspan) bool { +// nextFreed returns the next object that is being freed during this GC cycle. +// If the mark bit is set then the object is free. 
If it is < s.freeindex +// then either the object was freed during by this GC cycle. +// If it is >= freeindex then if the allocBit is set then it was +// freed during this GC cycle. If the allocBit is 0 it was freed +// during a previous cycle so is not considered a freed. +func (m *markBits) nextFreed(nelems uintptr, s *mspan, totalFree *int) bool { mByte := *m.bytep for { for mByte == 0xff { - if m.index >= maxIndex { + if m.index >= nelems { return false } m.index = (m.index + 8) &^ (8 - 1) m.mask = 1 m.bytep = add1(m.bytep) mByte = *m.bytep + // Nothing free found totalFree remains the same. } - if m.index >= maxIndex { + if m.index >= nelems { return false } - for m.index < maxIndex { + for m.index < nelems { if m.mask&mByte == 0 { + // At this point we have a free object so update totalFree + *totalFree++ if m.index < s.freeindex { return true } @@ -848,18 +864,16 @@ func (m *markBits) nextFreed(maxIndex uintptr, s *mspan) bool { return false } -func heapBitsSweepMap(h heapBits, s *mspan, base, size, n uintptr, cl uint8, doCall bool, f func(uintptr)) (nfree int) { +func heapBitsSweepMap(h heapBits, s *mspan, base, size, n uintptr, cl uint8, doCall bool, f func(uintptr)) int { + totalFree := 0 twobits := s.markBitsForBase() - for twobits.nextFreed(n, s) { + for twobits.nextFreed(n, s, &totalFree) { if doCall { f(base + twobits.index*size) } - if cl != 0 { - nfree++ - } twobits.advance() } - return + return totalFree } // heapBitsSetType records that the new allocation [x, x+size) diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go index 5dafa28450..d5f05ae639 100644 --- a/src/runtime/mcentral.go +++ b/src/runtime/mcentral.go @@ -67,7 +67,7 @@ retry: c.empty.insertBack(s) unlock(&c.lock) s.sweep(true) - freeIndex := s.nextFreeIndex(0) + freeIndex := s.nextFreeIndex() if freeIndex != s.nelems { s.freeindex = freeIndex goto havespan @@ -101,7 +101,7 @@ retry: havespan: cap := int32((s.npages << _PageShift) / s.elemsize) n := cap - int32(s.allocCount) - if n == 0 { + if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems { throw("span has no free objects") } usedBytes := uintptr(s.allocCount) * s.elemsize @@ -118,6 +118,15 @@ havespan: gcController.revise() } s.incache = true + freeByteBase := s.freeindex &^ (64 - 1) + whichByte := freeByteBase / 8 + // Init alloc bits cache. + s.refillAllocCache(whichByte) + + // Adjust the allocCache so that s.freeindex corresponds to the low bit in + // s.allocCache. + s.allocCache >>= s.freeindex % 64 + return s } @@ -143,19 +152,19 @@ func (c *mcentral) uncacheSpan(s *mspan) { unlock(&c.lock) } -// Free n objects from a span s back into the central free list c. -// Called during sweep. -// Returns true if the span was returned to heap. Sets sweepgen to -// the latest generation. -// If preserve=true, don't return the span to heap nor relink in MCentral lists; -// caller takes care of it. -func (c *mcentral) freeSpan(s *mspan, n int32, start gclinkptr, end gclinkptr, preserve bool, wasempty bool) bool { +// freeSpan updates c and s after sweeping s. +// It sets s's sweepgen to the latest generation, +// and, based on the number of free objects in s, +// moves s to the appropriate list of c or returns it +// to the heap. +// freeSpan returns true if s was returned to the heap. +// If preserve=true, it does not move s (the caller +// must take care of it). 
+func (c *mcentral) freeSpan(s *mspan, start gclinkptr, end gclinkptr, preserve bool, wasempty bool) bool { if s.incache { throw("freeSpan given cached span") } - s.allocCount -= uint16(n) - if preserve { // preserve is set only when called from MCentral_CacheSpan above, // the span must be in the empty list. diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go index 7a1a76cbad..c217ee8d86 100644 --- a/src/runtime/mgcsweep.go +++ b/src/runtime/mgcsweep.go @@ -257,7 +257,7 @@ func (s *mspan) sweep(preserve bool) bool { // the block bitmap without atomic operations. nfree = heapBitsSweepSpan(s, func(p uintptr) { - // At this point we know that we are looking at garbage object + // At this point we know that we are looking at a garbage object // that needs to be collected. if debug.allocfreetrace != 0 { tracefree(unsafe.Pointer(p), size) @@ -286,8 +286,8 @@ func (s *mspan) sweep(preserve bool) bool { } } }) - - wasempty := s.nextFreeIndex(s.freeindex) == s.nelems + s.allocCount = uint16(s.nelems) - uint16(nfree) + wasempty := s.nextFreeIndex() == s.nelems s.freeindex = 0 // reset allocation index to start of span. @@ -295,6 +295,8 @@ func (s *mspan) sweep(preserve bool) bool { // Clear gcmarkBits in preparation for next GC s.allocBits, s.gcmarkBits = s.gcmarkBits, s.allocBits s.clearGCMarkBits() // prepare for next GC + // Initialize alloc bits cache. + s.refillAllocCache(0) // We need to set s.sweepgen = h.sweepgen only when all blocks are swept, // because of the potential for a concurrent free/SetFinalizer. @@ -313,9 +315,10 @@ func (s *mspan) sweep(preserve bool) bool { // to go so release the span. atomic.Store(&s.sweepgen, sweepgen) } - if nfree > 0 { + + if nfree > 0 && cl != 0 { c.local_nsmallfree[cl] += uintptr(nfree) - res = mheap_.central[cl].mcentral.freeSpan(s, int32(nfree), head, end, preserve, wasempty) + res = mheap_.central[cl].mcentral.freeSpan(s, head, end, preserve, wasempty) // MCentral_FreeSpan updates sweepgen } else if freeToHeap { // Free large span to heap diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index cd35acb6dd..4be503315b 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -136,7 +136,15 @@ type mspan struct { // undefined and should never be referenced. // // Object n starts at address n*elemsize + (start << pageShift). - freeindex uintptr + freeindex uintptr + + // Cache of the allocBits at freeindex. allocCache is shifted + // such that the lowest bit corresponds to the bit freeindex. + // allocCache holds the complement of allocBits, thus allowing + // ctz64 (count trailing zero) to use it directly. + // allocCache may contain bits beyond s.nelems; the caller must ignore + // these. + allocCache uint64 allocBits *[maxObjsPerSpan / 8]uint8 gcmarkBits *[maxObjsPerSpan / 8]uint8 nelems uintptr // number of object in the span. 
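Condensing the sweep-time bookkeeping above into a minimal sketch with assumed types (the real code is mspan.sweep, which additionally refills allocCache): the mark bits from the finished cycle become the allocation bits, a zeroed bitmap takes over as the mark bits, and the allocation cursor restarts at the front of the span.

// spanBits models only the fields that the end of sweep touches.
type spanBits struct {
	allocBits, gcmarkBits []uint8
	freeindex             uintptr
	allocCount, nelems    uint16
}

func (s *spanBits) finishSweep(nfree uint16) {
	s.allocCount = s.nelems - nfree                       // marked objects stay allocated
	s.allocBits, s.gcmarkBits = s.gcmarkBits, s.allocBits // marks become alloc bits
	for i := range s.gcmarkBits {
		s.gcmarkBits[i] = 0 // fresh mark bits for the next GC cycle
	}
	s.freeindex = 0 // allocation scans restart at the beginning of the span
}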
@@ -947,7 +955,7 @@ func (list *mSpanList) init() { func (list *mSpanList) remove(span *mspan) { if span.prev == nil || span.list != list { - println("failed MSpanList_Remove", span, span.prev, span.list, list) + println("runtime: failed MSpanList_Remove", span, span.prev, span.list, list) throw("MSpanList_Remove") } if span.next != nil { @@ -969,7 +977,7 @@ func (list *mSpanList) isEmpty() bool { func (list *mSpanList) insert(span *mspan) { if span.next != nil || span.prev != nil || span.list != nil { - println("failed MSpanList_Insert", span, span.next, span.prev, span.list) + println("runtime: failed MSpanList_Insert", span, span.next, span.prev, span.list) throw("MSpanList_Insert") } span.next = list.first From 8dda1c4c08adf8b2107dec1c0d70d24443269ccd Mon Sep 17 00:00:00 2001 From: Rick Hudson Date: Wed, 2 Mar 2016 12:15:02 -0500 Subject: [PATCH 10/23] [dev.garbage] runtime: remove heapBitsSweepSpan Prior to this CL the sweep phase was responsible for locating all objects that were about to be freed and calling a function to process the object. This was done by the function heapBitsSweepSpan. Part of processing included calls to tracefree and msanfree as well as counting how many objects were freed. The calls to tracefree and msanfree have been moved into the gcmalloc routine and called when the object is about to be reallocated. The counting of free objects has been optimized using an array based popcnt algorithm and if all the objects in a span are free then span is freed. Similarly the code to locate the next free object has been optimized to use an array based ctz (count trailing zero). Various hot paths in the allocation logic have been optimized. At this point the garbage benchmark is within 3% of the 1.6 release. Change-Id: I00643c442e2ada1685c010c3447e4ea8537d2dfa Reviewed-on: https://go-review.googlesource.com/20201 Reviewed-by: Austin Clements --- src/runtime/malloc.go | 59 +++++++++- src/runtime/mbitmap.go | 250 +++++++++++++++++++--------------------- src/runtime/mgcsweep.go | 40 +------ src/runtime/mheap.go | 12 +- 4 files changed, 187 insertions(+), 174 deletions(-) diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 574ce3dafc..2da13f2073 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -496,6 +496,33 @@ const ( _FlagNoZero = 1 << 1 // don't zero memory ) +// nextFreeFast returns the next free object if one is quickly available. +// Otherwise it returns 0. +func (c *mcache) nextFreeFast(sizeclass int8) gclinkptr { + s := c.alloc[sizeclass] + ctzIndex := uint8(s.allocCache & 0xff) + if ctzIndex != 0 { + theBit := uint64(ctzVals[ctzIndex]) + freeidx := s.freeindex // help the pre ssa compiler out here with cse. + result := freeidx + uintptr(theBit) + if result < s.nelems { + s.allocCache >>= (theBit + 1) + freeidx = result + 1 + if freeidx%64 == 0 && freeidx != s.nelems { + // We just incremented s.freeindex so it isn't 0 + // so we are moving to the next aCache. + whichByte := freeidx / 8 + s.refillAllocCache(whichByte) + } + s.freeindex = freeidx + v := gclinkptr(result*s.elemsize + s.base()) + s.allocCount++ + return v + } + } + return 0 +} + // nextFree returns the next free object from the cached span if one is available. 
// Otherwise it refills the cache with a span with an available object and // returns that object along with a flag indicating that this was a heavy @@ -508,9 +535,9 @@ func (c *mcache) nextFree(sizeclass int8) (v gclinkptr, shouldhelpgc bool) { freeIndex := s.nextFreeIndex() if freeIndex == s.nelems { // The span is full. - if uintptr(s.allocCount) > s.nelems { + if uintptr(s.allocCount) != s.nelems { println("runtime: s.allocCount=", s.allocCount, "s.nelems=", s.nelems) - throw("s.allocCount > s.nelems && freeIndex == s.nelems") + throw("s.allocCount != s.nelems && freeIndex == s.nelems") } systemstack(func() { c.refill(int32(sizeclass)) @@ -644,7 +671,10 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { } // Allocate a new maxTinySize block. var v gclinkptr - v, shouldhelpgc = c.nextFree(tinySizeClass) + v = c.nextFreeFast(tinySizeClass) + if v == 0 { + v, shouldhelpgc = c.nextFree(tinySizeClass) + } x = unsafe.Pointer(v) (*[2]uint64)(x)[0] = 0 (*[2]uint64)(x)[1] = 0 @@ -664,7 +694,10 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { } size = uintptr(class_to_size[sizeclass]) var v gclinkptr - v, shouldhelpgc = c.nextFree(sizeclass) + v = c.nextFreeFast(sizeclass) + if v == 0 { + v, shouldhelpgc = c.nextFree(sizeclass) + } x = unsafe.Pointer(v) if flags&flagNoZero == 0 { memclr(unsafe.Pointer(v), size) @@ -725,9 +758,27 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { }) } + // The object x is about to be reused but tracefree and msanfree + // need to be informed. + // TODO:(rlh) It is quite possible that this object is being allocated + // out of a fresh span and that there is no preceding call to + // tracealloc with this object. If this is an issue then initialization + // of the fresh span needs to leave some crumbs around that can be used to + // avoid these calls. Furthermore these crumbs a likely the same as + // those needed to determine if the object needs to be zeroed. + // In the case of msanfree it does not make sense to call msanfree + // followed by msanmalloc. msanfree indicates that the bytes are not + // initialized but msanmalloc is about to indicate that they are. + // It makes no difference whether msanmalloc has been called on these + // bytes or not. + if debug.allocfreetrace != 0 { + tracefree(unsafe.Pointer(x), size) + } + if raceenabled { racemalloc(x, size) } + if msanenabled { msanmalloc(x, size) } diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go index 910c4fa844..ea398904e3 100644 --- a/src/runtime/mbitmap.go +++ b/src/runtime/mbitmap.go @@ -186,17 +186,59 @@ func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits { return markBits{&s.allocBits[whichByte], uint8(1 << whichBit), allocBitIndex} } +// ctzVals contains the count of trailing zeros for the +// index. 0 returns 8 indicating 8 zeros. 
+var ctzVals = [256]int8{ + 8, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0, + 5, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0, + 6, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0, + 5, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0, + 7, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0, + 5, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0, + 6, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0, + 5, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0, + 4, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0} + // A temporary stand in for the count trailing zero ctz instruction. // IA bsf works on 64 bit non-zero word. func ctz64(markBits uint64) uint64 { - if markBits == 0 { + ctz8 := ctzVals[markBits&0xff] + if ctz8 != 8 { + return uint64(ctz8) + } else if markBits == 0 { // low byte is zero check fill word. return 64 // bits in 64 bit word, ensures loop terminates } - // tz holds trailing zero count. - tz := uint64(0) - for mask := uint64(1); mask&markBits == 0; mask, tz = mask<<1, tz+1 { + result := uint64(8) + markBits >>= 8 + for ctz8 = ctzVals[markBits&0xff]; ctz8 == 8; ctz8 = ctzVals[markBits&0xff] { + result += 8 + markBits >>= 8 } - return tz + result += uint64(ctz8) + return result } // refillAllocCache takes 8 bytes s.allocBits starting at whichByte @@ -222,10 +264,12 @@ func (s *mspan) refillAllocCache(whichByte uintptr) { // There are hardware instructions that can be used to make this // faster if profiling warrants it. func (s *mspan) nextFreeIndex() uintptr { - if s.freeindex == s.nelems { - return s.freeindex + sfreeindex := s.freeindex + snelems := s.nelems + if sfreeindex == snelems { + return sfreeindex } - if s.freeindex > s.nelems { + if sfreeindex > snelems { throw("s.freeindex > s.nelems") } @@ -233,37 +277,37 @@ func (s *mspan) nextFreeIndex() uintptr { bitIndex := ctz64(aCache) for bitIndex == 64 { // Move index to start of next cached bits. - s.freeindex = (s.freeindex + 64) &^ (64 - 1) - if s.freeindex >= s.nelems { - s.freeindex = s.nelems - return s.freeindex + sfreeindex = (sfreeindex + 64) &^ (64 - 1) + if sfreeindex >= snelems { + s.freeindex = snelems + return snelems } - whichByte := s.freeindex / 8 + whichByte := sfreeindex / 8 // Refill s.allocCache with the next 64 alloc bits. - // Unlike in allocBits a 1 in s.allocCache means - // the object is not marked. s.refillAllocCache(whichByte) aCache = s.allocCache bitIndex = ctz64(aCache) // Nothing was available try again now allocCache has been refilled. } - result := s.freeindex + uintptr(bitIndex) - if result >= s.nelems { - s.freeindex = s.nelems - return s.freeindex + result := sfreeindex + uintptr(bitIndex) + if result >= snelems { + s.freeindex = snelems + return snelems } - s.allocCache >>= bitIndex + 1 - s.freeindex = result + 1 - if s.freeindex%64 == 0 && s.freeindex != s.nelems { + s.allocCache >>= (bitIndex + 1) + sfreeindex = result + 1 + + if sfreeindex%64 == 0 && sfreeindex != snelems { // We just incremented s.freeindex so it isn't 0. // As each 1 in s.allocCache was encountered and used for allocation // it was shifted away. At this point s.allocCache contains all 0s. 
// Refill s.allocCache so that it corresponds // to the bits at s.allocBits starting at s.freeindex. - whichByte := s.freeindex / 8 + whichByte := sfreeindex / 8 s.refillAllocCache(whichByte) } + s.freeindex = sfreeindex return result } @@ -760,120 +804,60 @@ func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) { } } -// heapBitsSweepSpan coordinates the sweeping of a span and inspects -// each freed object. If objects are being traced or if msan is enabled -// then heapBitsSweepSpan calls f(p), where p is the object's base address. -// When not tracing and msan is not enabled heapBitsSweepSpan is lightweight. -// heapBitsSweepSpan never alters the pointer/scalar heapBit maps. HeapBit map -// maintenance is the responsibility of the allocation routines. -// TODO:(rlh) Deal with the checkmark bits but moving them -// out of heap bitmap thus enabling bulk clearing. -func heapBitsSweepSpan(s *mspan, f func(uintptr)) (nfree int) { - base := s.base() - size := s.elemsize - n := s.nelems - cl := s.sizeclass - doCall := debug.allocfreetrace != 0 || msanenabled || cl == 0 - h := heapBitsForSpan(base) - switch { - default: - throw("heapBitsSweepSpan") - case sys.PtrSize == 8 && size == sys.PtrSize: - nfree = heapBitsSweep8BitPtrs(h, s, base, n, cl, doCall, f) - case size%(4*sys.PtrSize) == 0: - nfree = heapBitsSweepMap(h, s, base, size, n, cl, doCall, f) - case size%(4*sys.PtrSize) == 2*sys.PtrSize: - nfree = heapBitsSweepMap(h, s, base, size, n, cl, doCall, f) - } - return -} +// oneBitCount is indexed by byte and produces the +// number of 1 bits in that byte. For example 128 has 1 bit set +// and oneBitCount[128] will holds 1. +var oneBitCount = [256]uint8{ + 0, 1, 1, 2, 1, 2, 2, 3, + 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, + 5, 6, 6, 7, 6, 7, 7, 8} -func heapBitsSweep8BitPtrs(h heapBits, s *mspan, base, n uintptr, cl uint8, doCall bool, f func(uintptr)) (nfree int) { - mbits := s.markBitsForBase() - // Consider mark bits in all four 2-bit entries of each bitmap byte. - if cl == 0 { - throw("8BitPtrs are not in cl 0") +// countFree runs through the mark bits in a span and counts the number of free objects +// in the span. +// TODO:(rlh) Use popcount intrinsic. +func (s *mspan) countFree() int { + count := 0 + maxIndex := s.nelems / 8 + for i := uintptr(0); i < maxIndex; i++ { + count += int(oneBitCount[s.gcmarkBits[i]]) } - // Consider mark bits in all four 2-bit entries of each bitmap byte. - for i := uintptr(0); i < n; i++ { - // Note that unlike the other size cases, we leave the pointer bits set here. - // These are initialized during initSpan when the span is created and left - // in place the whole time the span is used for pointer-sized objects. - // That lets heapBitsSetType avoid an atomic update to set the pointer bit - // during allocation. 
- if !mbits.isMarked() { - nfree++ - if mbits.index < s.freeindex { - f(base + i*sys.PtrSize) - } else if s.allocBits[mbits.index/8]&mbits.mask == 1 { - // it was marked in the previous cycle but not this cycle - // if it wasn't marked in the prvious cycle the call would be redundant. - f(base + i*sys.PtrSize) - } - } - mbits.advance() - } - return nfree -} -// nextFreed returns the next object that is being freed during this GC cycle. -// If the mark bit is set then the object is free. If it is < s.freeindex -// then either the object was freed during by this GC cycle. -// If it is >= freeindex then if the allocBit is set then it was -// freed during this GC cycle. If the allocBit is 0 it was freed -// during a previous cycle so is not considered a freed. -func (m *markBits) nextFreed(nelems uintptr, s *mspan, totalFree *int) bool { - mByte := *m.bytep - for { - for mByte == 0xff { - if m.index >= nelems { - return false - } - m.index = (m.index + 8) &^ (8 - 1) - m.mask = 1 - m.bytep = add1(m.bytep) - mByte = *m.bytep - // Nothing free found totalFree remains the same. - } - if m.index >= nelems { - return false - } - for m.index < nelems { - if m.mask&mByte == 0 { - // At this point we have a free object so update totalFree - *totalFree++ - if m.index < s.freeindex { - return true - } - if s.allocBits[m.index/8]&m.mask != 0 { - return true - } - } - if m.mask == 1<<7 { - m.mask = 1 - m.bytep = add1(m.bytep) - mByte = *m.bytep - m.index++ - break - } else { - m.mask = m.mask << 1 - m.index++ - } - } + if bitsInLastByte := s.nelems % 8; bitsInLastByte != 0 { + markBits := uint8(s.gcmarkBits[maxIndex]) + mask := uint8((1 << bitsInLastByte) - 1) + bits := markBits & mask + count += int(oneBitCount[bits]) } - return false -} - -func heapBitsSweepMap(h heapBits, s *mspan, base, size, n uintptr, cl uint8, doCall bool, f func(uintptr)) int { - totalFree := 0 - twobits := s.markBitsForBase() - for twobits.nextFreed(n, s, &totalFree) { - if doCall { - f(base + twobits.index*size) - } - twobits.advance() - } - return totalFree + return int(s.nelems) - count } // heapBitsSetType records that the new allocation [x, x+size) diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go index c217ee8d86..1a6be6634d 100644 --- a/src/runtime/mgcsweep.go +++ b/src/runtime/mgcsweep.go @@ -8,7 +8,6 @@ package runtime import ( "runtime/internal/atomic" - "runtime/internal/sys" "unsafe" ) @@ -252,40 +251,13 @@ func (s *mspan) sweep(preserve bool) bool { } } - // Sweep through n objects of given size starting at p. - // This thread owns the span now, so it can manipulate - // the block bitmap without atomic operations. + // Count the number of free objects in this span. + nfree = s.countFree() + if cl == 0 && nfree != 0 { + s.needzero = 1 + freeToHeap = true + } - nfree = heapBitsSweepSpan(s, func(p uintptr) { - // At this point we know that we are looking at a garbage object - // that needs to be collected. - if debug.allocfreetrace != 0 { - tracefree(unsafe.Pointer(p), size) - } - if msanenabled { - msanfree(unsafe.Pointer(p), size) - } - - // Reset to allocated+noscan. - if cl == 0 { - // Free large span. - if preserve { - throw("can't preserve large span") - } - s.needzero = 1 - - // Free the span after heapBitsSweepSpan - // returns, since it's not done with the span. - freeToHeap = true - } else { - // Free small object. 
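The ctzVals and oneBitCount lookup tables added in these patches are mechanical; as a hedged aside, byte-wide tables like these can be regenerated and cross-checked with a few lines of Go (assumed helper, not part of the patch):

// buildTables regenerates byte-wide trailing-zero and popcount tables that
// follow the same conventions as ctzVals and oneBitCount above.
func buildTables() (ctz [256]int8, pop [256]uint8) {
	ctz[0] = 8 // convention: a zero byte reports 8 trailing zeros
	for i := 1; i < 256; i++ {
		n := int8(0)
		for v := i; v&1 == 0; v >>= 1 {
			n++
		}
		ctz[i] = n
	}
	for i := 0; i < 256; i++ {
		c := uint8(0)
		for v := i; v != 0; v &= v - 1 { // clear the lowest set bit
			c++
		}
		pop[i] = c
	}
	return
}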
- if size > 2*sys.PtrSize { - *(*uintptr)(unsafe.Pointer(p + sys.PtrSize)) = uintptrMask & 0xdeaddeaddeaddead // mark as "needs to be zeroed" - } else if size > sys.PtrSize { - *(*uintptr)(unsafe.Pointer(p + sys.PtrSize)) = 0 - } - } - }) s.allocCount = uint16(s.nelems) - uint16(nfree) wasempty := s.nextFreeIndex() == s.nelems diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 4be503315b..b0b3bbd957 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -137,6 +137,9 @@ type mspan struct { // // Object n starts at address n*elemsize + (start << pageShift). freeindex uintptr + // TODO: Look up nelems from sizeclass and remove this field if it + // helps performance. + nelems uintptr // number of object in the span. // Cache of the allocBits at freeindex. allocCache is shifted // such that the lowest bit corresponds to the bit freeindex. @@ -147,9 +150,6 @@ type mspan struct { allocCache uint64 allocBits *[maxObjsPerSpan / 8]uint8 gcmarkBits *[maxObjsPerSpan / 8]uint8 - nelems uintptr // number of object in the span. - // TODO(rlh) consider moving some of these fields into seperate arrays. - // Put another way is an array of structs a better idea than a struct of arrays. // allocBits and gcmarkBits currently point to either markbits1 // or markbits2. At the end of a GC cycle allocBits and @@ -753,6 +753,12 @@ func (h *mheap) freeSpan(s *mspan, acct int32) { mp.mcache.local_scan = 0 memstats.tinyallocs += uint64(mp.mcache.local_tinyallocs) mp.mcache.local_tinyallocs = 0 + if msanenabled { + // Tell msan that this entire span is no longer in use. + base := unsafe.Pointer(s.base()) + bytes := s.npages << _PageShift + msanfree(base, bytes) + } if acct != 0 { memstats.heap_objects-- } From f8d0d4fd59b6cb6f875eac7753f036b10a28f995 Mon Sep 17 00:00:00 2001 From: Rick Hudson Date: Mon, 14 Mar 2016 12:02:02 -0400 Subject: [PATCH 11/23] [dev.garbage] runtime: cleanup and optimize span.base() Prior to this CL the base of a span was calculated in various places using shifts or calls to base(). This CL now always calls base() which has been optimized to calculate the base of the span when the span is initialized and store that value in the span structure. 
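A rough illustration of the trade-off this CL makes, with assumed names and an assumed 8 KiB page size rather than the runtime's real mspan:

const pageShift = 13 // assumed page size of 8 KiB for this sketch

type span struct {
	start     uintptr // starting page number
	startAddr uintptr // start << pageShift, computed once at initialization
}

func newSpan(start uintptr) *span {
	return &span{start: start, startAddr: start << pageShift}
}

// baseSlow mirrors the old pattern: a shift at every call site.
func (s *span) baseSlow() uintptr { return s.start << pageShift }

// base mirrors the optimized pattern: return the cached address.
func (s *span) base() uintptr { return s.startAddr }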
Change-Id: I661f2bfa21e3748a249cdf049ef9062db6e78100 Reviewed-on: https://go-review.googlesource.com/20703 Reviewed-by: Austin Clements --- src/runtime/malloc.go | 4 ++-- src/runtime/mbitmap.go | 2 +- src/runtime/mcentral.go | 2 +- src/runtime/mgcmark.go | 2 +- src/runtime/mgcsweep.go | 8 ++++---- src/runtime/mheap.go | 13 ++++++++----- 6 files changed, 17 insertions(+), 14 deletions(-) diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 2da13f2073..31335dae80 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -711,7 +711,7 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { s = largeAlloc(size, flags) }) s.freeindex = 1 - x = unsafe.Pointer(uintptr(s.start << pageShift)) + x = unsafe.Pointer(s.base()) size = s.elemsize } @@ -833,7 +833,7 @@ func largeAlloc(size uintptr, flag uint32) *mspan { if s == nil { throw("out of memory") } - s.limit = uintptr(s.start)<<_PageShift + size + s.limit = s.base() + size heapBitsForSpan(s.base()).initSpan(s) return s } diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go index ea398904e3..b342de600e 100644 --- a/src/runtime/mbitmap.go +++ b/src/runtime/mbitmap.go @@ -457,7 +457,7 @@ func heapBitsForObject(p, refBase, refOff uintptr) (base uintptr, hbits heapBits } else { print(" to unused region of span") } - print("idx=", hex(idx), " span.start=", hex(s.start<<_PageShift), " span.limit=", hex(s.limit), " span.state=", s.state, "\n") + print("idx=", hex(idx), " span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", s.state, "\n") if refBase != 0 { print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n") gcDumpObject("object", refBase, refOff) diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go index d5f05ae639..bbbfb18fbf 100644 --- a/src/runtime/mcentral.go +++ b/src/runtime/mcentral.go @@ -212,7 +212,7 @@ func (c *mcentral) grow() *mspan { return nil } - p := uintptr(s.start << _PageShift) + p := s.base() s.limit = p + size*n heapBitsForSpan(s.base()).initSpan(s) diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index fe8a56460b..47456857e9 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -287,7 +287,7 @@ func markrootSpans(gcw *gcWork, shard int) { // retain everything it points to. spf := (*specialfinalizer)(unsafe.Pointer(sp)) // A finalizer can be set for an inner byte of an object, find object beginning. - p := uintptr(s.start<<_PageShift) + uintptr(spf.special.offset)/s.elemsize*s.elemsize + p := s.base() + uintptr(spf.special.offset)/s.elemsize*s.elemsize // Mark everything that can be reached from // the object (but *not* the object itself or diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go index 1a6be6634d..9316cc6f49 100644 --- a/src/runtime/mgcsweep.go +++ b/src/runtime/mgcsweep.go @@ -211,13 +211,13 @@ func (s *mspan) sweep(preserve bool) bool { special := *specialp for special != nil { // A finalizer can be set for an inner byte of an object, find object beginning. - p := uintptr(s.start<<_PageShift) + uintptr(special.offset)/size*size + p := s.base() + uintptr(special.offset)/size*size mbits := s.markBitsForAddr(p) if !mbits.isMarked() { // This object is not marked and has at least one special record. // Pass 1: see if it has at least one finalizer. 
hasFin := false - endOffset := p - uintptr(s.start<<_PageShift) + size + endOffset := p - s.base() + size for tmp := special; tmp != nil && uintptr(tmp.offset) < endOffset; tmp = tmp.next { if tmp.kind == _KindSpecialFinalizer { // Stop freeing of object if it has a finalizer. @@ -230,7 +230,7 @@ func (s *mspan) sweep(preserve bool) bool { for special != nil && uintptr(special.offset) < endOffset { // Find the exact byte for which the special was setup // (as opposed to object beginning). - p := uintptr(s.start<<_PageShift) + uintptr(special.offset) + p := s.base() + uintptr(special.offset) if special.kind == _KindSpecialFinalizer || !hasFin { // Splice out special record. y := special @@ -311,7 +311,7 @@ func (s *mspan) sweep(preserve bool) bool { // implement and then call some kind of MHeap_DeleteSpan. if debug.efence > 0 { s.limit = 0 // prevent mlookup from finding this span - sysFault(unsafe.Pointer(uintptr(s.start<<_PageShift)), size) + sysFault(unsafe.Pointer(s.base()), size) } else { mheap_.freeSpan(s, 1) } diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index b0b3bbd957..9f07dfbb99 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -116,7 +116,8 @@ type mspan struct { next *mspan // next span in list, or nil if none prev **mspan // previous span's next field, or list head's first field if none list *mSpanList // For debugging. TODO: Remove. - + //TODO:(rlh) Eliminate start field and use startAddr >> PageShift instead. + startAddr uintptr // uintptr(s.start << _PageShift) aka s.base() start pageID // starting page number npages uintptr // number of pages in span stackfreelist gclinkptr // list of free stacks, avoids overloading freelist @@ -184,7 +185,7 @@ type mspan struct { } func (s *mspan) base() uintptr { - return uintptr(s.start << _PageShift) + return s.startAddr } func (s *mspan) layout() (size, n, total uintptr) { @@ -300,7 +301,7 @@ func mlookup(v uintptr, base *uintptr, size *uintptr, sp **mspan) int32 { return 0 } - p := uintptr(s.start) << _PageShift + p := s.base() if s.sizeclass == 0 { // Large object. if base != nil { @@ -542,7 +543,7 @@ func (h *mheap) alloc(npage uintptr, sizeclass int32, large bool, needzero bool) if s != nil { if needzero && s.needzero != 0 { - memclr(unsafe.Pointer(s.start<<_PageShift), s.npages<<_PageShift) + memclr(unsafe.Pointer(s.base()), s.npages<<_PageShift) } s.needzero = 0 } @@ -610,7 +611,7 @@ HaveSpan: throw("still in list") } if s.npreleased > 0 { - sysUsed(unsafe.Pointer(s.start<<_PageShift), s.npages<<_PageShift) + sysUsed(unsafe.Pointer(s.base()), s.npages<<_PageShift) memstats.heap_released -= uint64(s.npreleased << _PageShift) s.npreleased = 0 } @@ -826,6 +827,7 @@ func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool, unusedsince i t := h_spans[p-1] if t != nil && t.state == _MSpanFree { s.start = t.start + s.startAddr = uintptr(s.start << _PageShift) s.npages += t.npages s.npreleased = t.npreleased // absorb released pages s.needzero |= t.needzero @@ -925,6 +927,7 @@ func (span *mspan) init(start pageID, npages uintptr) { span.prev = nil span.list = nil span.start = start + span.startAddr = uintptr(start << _PageShift) span.npages = npages span.allocCount = 0 span.sizeclass = 0 From 1354b32cd70f2702381764fd595dd2faa996840c Mon Sep 17 00:00:00 2001 From: Rick Hudson Date: Mon, 14 Mar 2016 12:17:48 -0400 Subject: [PATCH 12/23] [dev.garbage] runtime: add gc work buffer tryGet and put fast paths The complexity of the GC work buffers put and tryGet prevented them from being inlined. 
This CL simplifies the fast path thus enabling inlining. If the fast path does not succeed the previous put and tryGet functions are called. Change-Id: I6da6495d0dadf42bd0377c110b502274cc01acf5 Reviewed-on: https://go-review.googlesource.com/20704 Reviewed-by: Austin Clements --- src/runtime/mgcmark.go | 16 ++++++++++++---- src/runtime/mgcwork.go | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index 47456857e9..d05ad6549f 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -831,7 +831,10 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) { if blocking { b = gcw.get() } else { - b = gcw.tryGet() + b = gcw.tryGetFast() + if b == 0 { + b = gcw.tryGet() + } } if b == 0 { // work barrier reached or tryGet failed. @@ -894,7 +897,11 @@ func gcDrainN(gcw *gcWork, scanWork int64) int64 { // PREFETCH(wbuf->obj[wbuf.nobj - 3]; // } // - b := gcw.tryGet() + b := gcw.tryGetFast() + if b == 0 { + b = gcw.tryGet() + } + if b == 0 { break } @@ -1087,8 +1094,9 @@ func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork // Previously we put the obj in an 8 element buffer that is drained at a rate // to give the PREFETCH time to do its work. // Use of PREFETCHNTA might be more appropriate than PREFETCH - - gcw.put(obj) + if !gcw.putFast(obj) { + gcw.put(obj) + } } // gcDumpObject dumps the contents of obj for debugging and marks the diff --git a/src/runtime/mgcwork.go b/src/runtime/mgcwork.go index 63a3ade3a6..d04840b686 100644 --- a/src/runtime/mgcwork.go +++ b/src/runtime/mgcwork.go @@ -116,6 +116,22 @@ func (w *gcWork) put(obj uintptr) { wbuf.nobj++ } +// putFast does a put and returns true if it can be done quickly +// otherwise it returns false and the caller needs to call put. +//go:nowritebarrier +func (w *gcWork) putFast(obj uintptr) bool { + wbuf := w.wbuf1.ptr() + if wbuf == nil { + return false + } else if wbuf.nobj == len(wbuf.obj) { + return false + } + + wbuf.obj[wbuf.nobj] = obj + wbuf.nobj++ + return true +} + // tryGet dequeues a pointer for the garbage collector to trace. // // If there are no pointers remaining in this gcWork or in the global @@ -147,6 +163,23 @@ func (w *gcWork) tryGet() uintptr { return wbuf.obj[wbuf.nobj] } +// tryGetFast dequeues a pointer for the garbage collector to trace +// if one is readily available. Otherwise it returns 0 and +// the caller is expected to call tryGet(). +//go:nowritebarrier +func (w *gcWork) tryGetFast() uintptr { + wbuf := w.wbuf1.ptr() + if wbuf == nil { + return 0 + } + if wbuf.nobj == 0 { + return 0 + } + + wbuf.nobj-- + return wbuf.obj[wbuf.nobj] +} + // get dequeues a pointer for the garbage collector to trace, blocking // if necessary to ensure all pointers from all queues and caches have // been retrieved. get returns 0 if there are no pointers remaining. From 2063d5d903718962de58a86a692626fe89919a4d Mon Sep 17 00:00:00 2001 From: Rick Hudson Date: Mon, 14 Mar 2016 12:17:48 -0400 Subject: [PATCH 13/23] [dev.garbage] runtime: restructure alloc and mark bits Two changes are included here that are dependent on the other. The first is that allocBits and gcamrkBits are changed to a *uint8 which points to the first byte of that span's mark and alloc bits. Several places were altered to perform pointer arithmetic to locate the byte corresponding to an object in the span. The actual bit corresponding to an object is indexed in the byte by using the lower three bits of the objects index. 
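In other words, with assumed names and a plain byte slice standing in for the span's bitmaps:

// bitForIndex locates object n's bit in a bitmap laid out one bit per object.
func bitForIndex(bitmap []uint8, n uintptr) (bytep *uint8, mask uint8) {
	bytep = &bitmap[n/8]       // n >> 3 selects the byte
	mask = uint8(1 << (n % 8)) // the low three bits of n select the bit
	return
}

// isMarked reports whether object n's bit is set in the given bitmap.
func isMarked(bitmap []uint8, n uintptr) bool {
	bytep, mask := bitForIndex(bitmap, n)
	return *bytep&mask != 0
}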
The second change avoids the redundant calculation of an object's index. The index is returned from heapBitsForObject and then used by the functions indexing allocBits and gcmarkBits. Finally we no longer allocate the gc bits in the span structures. Instead we use an arena based allocation scheme that allows for a more compact bit map as well as recycling and bulk clearing of the mark bits. Change-Id: If4d04b2021c092ec39a4caef5937a8182c64dfef Reviewed-on: https://go-review.googlesource.com/20705 Reviewed-by: Austin Clements --- src/runtime/cgocall.go | 2 +- src/runtime/malloc.go | 2 +- src/runtime/mbitmap.go | 86 +++++++++++----------- src/runtime/mgcmark.go | 21 +++--- src/runtime/mgcsweep.go | 16 ++-- src/runtime/mheap.go | 159 +++++++++++++++++++++++++++++++++++----- 6 files changed, 204 insertions(+), 82 deletions(-) diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go index c6000bf98f..be234345d1 100644 --- a/src/runtime/cgocall.go +++ b/src/runtime/cgocall.go @@ -529,7 +529,7 @@ func cgoCheckUnknownPointer(p unsafe.Pointer, msg string) (base, i uintptr) { return } - b, hbits, span := heapBitsForObject(uintptr(p), 0, 0) + b, hbits, span, _ := heapBitsForObject(uintptr(p), 0, 0) base = b if base == 0 { return diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 6fe4656603..86fdb3fdbb 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -491,7 +491,7 @@ var zerobase uintptr // Otherwise it returns 0. func (c *mcache) nextFreeFast(sizeclass int8) gclinkptr { s := c.alloc[sizeclass] - ctzIndex := uint8(s.allocCache & 0xff) + ctzIndex := uint8(s.allocCache) if ctzIndex != 0 { theBit := uint64(ctzVals[ctzIndex]) freeidx := s.freeindex // help the pre ssa compiler out here with cse. diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go index af89577703..387fb8535d 100644 --- a/src/runtime/mbitmap.go +++ b/src/runtime/mbitmap.go @@ -186,7 +186,8 @@ type markBits struct { func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits { whichByte := allocBitIndex / 8 whichBit := allocBitIndex % 8 - return markBits{&s.allocBits[whichByte], uint8(1 << whichBit), allocBitIndex} + bytePtr := addb(s.allocBits, whichByte) + return markBits{bytePtr, uint8(1 << whichBit), allocBitIndex} } // ctzVals contains the count of trailing zeros for the @@ -249,7 +250,7 @@ func ctz64(markBits uint64) uint64 { // can be used. It then places these 8 bytes into the cached 64 bit // s.allocCache. 
func (s *mspan) refillAllocCache(whichByte uintptr) { - bytes := s.allocBits[whichByte : whichByte+8] + bytes := (*[8]uint8)(unsafe.Pointer(addb(s.allocBits, whichByte))) aCache := uint64(0) aCache |= uint64(bytes[0]) aCache |= uint64(bytes[1]) << (1 * 8) @@ -317,28 +318,37 @@ func (s *mspan) nextFreeIndex() uintptr { func (s *mspan) isFree(index uintptr) bool { whichByte := index / 8 whichBit := index % 8 - return s.allocBits[whichByte]&uint8(1<> s.divShift + } + return uintptr(((uint64(byteOffset) >> s.divShift) * uint64(s.divMul)) >> s.divShift2) } func markBitsForAddr(p uintptr) markBits { s := spanOf(p) - return s.markBitsForAddr(p) + objIndex := s.objIndex(p) + return s.markBitsForIndex(objIndex) } -func (s *mspan) markBitsForAddr(p uintptr) markBits { - byteOffset := p - s.base() - markBitIndex := uintptr(0) - if byteOffset != 0 { - // markBitIndex := (p - s.base()) / s.elemsize, using division by multiplication - markBitIndex = uintptr(uint64(byteOffset) >> s.divShift * uint64(s.divMul) >> s.divShift2) - } - whichByte := markBitIndex / 8 - whichBit := markBitIndex % 8 - return markBits{&s.gcmarkBits[whichByte], uint8(1 << whichBit), markBitIndex} +func (s *mspan) markBitsForIndex(objIndex uintptr) markBits { + whichByte := objIndex / 8 + bitMask := uint8(1 << (objIndex % 8)) // low 3 bits hold the bit index + bytePtr := addb(s.gcmarkBits, whichByte) + return markBits{bytePtr, bitMask, objIndex} } func (s *mspan) markBitsForBase() markBits { - return markBits{&s.gcmarkBits[0], uint8(1), 0} + return markBits{s.gcmarkBits, uint8(1), 0} } // isMarked reports whether mark bit m is set. @@ -346,7 +356,9 @@ func (m markBits) isMarked() bool { return *m.bytep&m.mask != 0 } -// setMarked sets the marked bit in the markbits, atomically. +// setMarked sets the marked bit in the markbits, atomically. Some compilers +// are not able to inline atomic.Or8 function so if it appears as a hot spot consider +// inlining it manually. func (m markBits) setMarked() { // Might be racing with other updates, so use atomic update always. // We used to be clever here and use a non-atomic update in certain @@ -415,7 +427,8 @@ func heapBitsForSpan(base uintptr) (hbits heapBits) { } // heapBitsForObject returns the base address for the heap object -// containing the address p, along with the heapBits for base. +// containing the address p, the heapBits for base, +// the object's span, and of the index of the object in s. // If p does not point into a heap object, // return base == 0 // otherwise return the base of the object. @@ -423,7 +436,7 @@ func heapBitsForSpan(base uintptr) (hbits heapBits) { // refBase and refOff optionally give the base address of the object // in which the pointer p was found and the byte offset at which it // was found. These are used for error reporting. -func heapBitsForObject(p, refBase, refOff uintptr) (base uintptr, hbits heapBits, s *mspan) { +func heapBitsForObject(p, refBase, refOff uintptr) (base uintptr, hbits heapBits, s *mspan, objIndex uintptr) { arenaStart := mheap_.arena_start if p < arenaStart || p >= mheap_.arena_used { return @@ -475,6 +488,7 @@ func heapBitsForObject(p, refBase, refOff uintptr) (base uintptr, hbits heapBits // optimize for power of 2 sized objects. base = s.base() base = base + (p-base)&s.baseMask + objIndex = (base - s.base()) >> s.divShift // base = p & s.baseMask is faster for small spans, // but doesn't work for large spans. // Overall, it's faster to use the more general computation above. 
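The objIndex helper above leans on the span's divMagic fields to avoid a hardware divide. A self-contained illustration of the same trick for a single size class follows; the constants are assumptions chosen for the example, not values taken from the runtime's tables.

// objIndexFor48 maps a byte offset within a span of 48-byte objects to an
// object index without dividing: 48 = 3 << 4, so shift out the power of two
// and divide by 3 with a precomputed reciprocal (21846, roughly 2^16/3). The
// approximation is exact for the offsets that occur inside a small span.
func objIndexFor48(byteOffset uintptr) uintptr {
	const divShift, divMul, divShift2 = 4, 21846, 16
	return (byteOffset >> divShift) * divMul >> divShift2
}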
@@ -482,8 +496,8 @@ func heapBitsForObject(p, refBase, refOff uintptr) (base uintptr, hbits heapBits base = s.base() if p-base >= s.elemsize { // n := (p - base) / s.elemsize, using division by multiplication - n := uintptr(uint64(p-base) >> s.divShift * uint64(s.divMul) >> s.divShift2) - base += n * s.elemsize + objIndex = uintptr(uint64(p-base) >> s.divShift * uint64(s.divMul) >> s.divShift2) + base += objIndex * s.elemsize } } // Now that we know the actual base, compute heapBits to return to caller. @@ -751,22 +765,6 @@ func typeBitsBulkBarrier(typ *_type, p, size uintptr) { } } -func (s *mspan) clearGCMarkBits() { - bytesInMarkBits := (s.nelems + 7) / 8 - bits := s.gcmarkBits[:bytesInMarkBits] - for i := range bits { - bits[i] = 0 - } -} - -func (s *mspan) clearAllocBits() { - bytesInMarkBits := (s.nelems + 7) / 8 - bits := s.allocBits[:bytesInMarkBits] - for i := range bits { - bits[i] = 0 - } -} - // The methods operating on spans all require that h has been returned // by heapBitsForSpan and that size, n, total are the span layout description // returned by the mspan's layout method. @@ -784,13 +782,13 @@ func (h heapBits) initSpan(s *mspan) { size, n, total := s.layout() // Init the markbit structures - s.allocBits = &s.markbits1 - s.gcmarkBits = &s.markbits2 s.freeindex = 0 s.allocCache = ^uint64(0) // all 1s indicating all free. s.nelems = n - s.clearAllocBits() - s.clearGCMarkBits() + s.allocBits = nil + s.gcmarkBits = nil + s.gcmarkBits = newMarkBits(s.nelems) + s.allocBits = newAllocBits(s.nelems) // Clear bits corresponding to objects. if total%heapBitmapScale != 0 { @@ -897,13 +895,13 @@ func (s *mspan) countFree() int { count := 0 maxIndex := s.nelems / 8 for i := uintptr(0); i < maxIndex; i++ { - count += int(oneBitCount[s.gcmarkBits[i]]) + mrkBits := *addb(s.gcmarkBits, i) + count += int(oneBitCount[mrkBits]) } - if bitsInLastByte := s.nelems % 8; bitsInLastByte != 0 { - markBits := uint8(s.gcmarkBits[maxIndex]) + mrkBits := *addb(s.gcmarkBits, maxIndex) mask := uint8((1 << bitsInLastByte) - 1) - bits := markBits & mask + bits := mrkBits & mask count += int(oneBitCount[bits]) } return int(s.nelems) - count diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index 3704164527..18f930f89a 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -1082,8 +1082,8 @@ func scanblock(b0, n0 uintptr, ptrmask *uint8, gcw *gcWork) { // Same work as in scanobject; see comments there. obj := *(*uintptr)(unsafe.Pointer(b + i)) if obj != 0 && arena_start <= obj && obj < arena_used { - if obj, hbits, span := heapBitsForObject(obj, b, i); obj != 0 { - greyobject(obj, b, i, hbits, span, gcw) + if obj, hbits, span, objIndex := heapBitsForObject(obj, b, i); obj != 0 { + greyobject(obj, b, i, hbits, span, gcw, objIndex) } } } @@ -1148,8 +1148,8 @@ func scanobject(b uintptr, gcw *gcWork) { // Check if it points into heap and not back at the current object. if obj != 0 && arena_start <= obj && obj < arena_used && obj-b >= n { // Mark the object. - if obj, hbits, span := heapBitsForObject(obj, b, i); obj != 0 { - greyobject(obj, b, i, hbits, span, gcw) + if obj, hbits, span, objIndex := heapBitsForObject(obj, b, i); obj != 0 { + greyobject(obj, b, i, hbits, span, gcw, objIndex) } } } @@ -1162,9 +1162,9 @@ func scanobject(b uintptr, gcw *gcWork) { // Preemption must be disabled. 
//go:nowritebarrier func shade(b uintptr) { - if obj, hbits, span := heapBitsForObject(b, 0, 0); obj != 0 { + if obj, hbits, span, objIndex := heapBitsForObject(b, 0, 0); obj != 0 { gcw := &getg().m.p.ptr().gcw - greyobject(obj, 0, 0, hbits, span, gcw) + greyobject(obj, 0, 0, hbits, span, gcw, objIndex) if gcphase == _GCmarktermination || gcBlackenPromptly { // Ps aren't allowed to cache work during mark // termination. @@ -1177,12 +1177,13 @@ func shade(b uintptr) { // If it isn't already marked, mark it and enqueue into gcw. // base and off are for debugging only and could be removed. //go:nowritebarrierrec -func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork) { +func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork, objIndex uintptr) { // obj should be start of allocation, and so must be at least pointer-aligned. if obj&(sys.PtrSize-1) != 0 { throw("greyobject: obj not pointer-aligned") } - mbits := span.markBitsForAddr(obj) + mbits := span.markBitsForIndex(objIndex) + if useCheckmark { if !mbits.isMarked() { printlock() @@ -1209,8 +1210,8 @@ func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork if mbits.isMarked() { return } - mbits.setMarked() - + // mbits.setMarked() // Avoid extra call overhead with manual inlining. + atomic.Or8(mbits.bytep, mbits.mask) // If this is a noscan object, fast-track it to black // instead of greying it. if !hbits.hasPointers(span.elemsize) { diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go index 9316cc6f49..b1d6234af4 100644 --- a/src/runtime/mgcsweep.go +++ b/src/runtime/mgcsweep.go @@ -51,6 +51,7 @@ func finishsweep_m(stw bool) { } } } + nextMarkBitArenaEpoch() } func bgsweep(c chan int) { @@ -211,8 +212,9 @@ func (s *mspan) sweep(preserve bool) bool { special := *specialp for special != nil { // A finalizer can be set for an inner byte of an object, find object beginning. - p := s.base() + uintptr(special.offset)/size*size - mbits := s.markBitsForAddr(p) + objIndex := uintptr(special.offset) / size + p := s.base() + objIndex*size + mbits := s.markBitsForIndex(objIndex) if !mbits.isMarked() { // This object is not marked and has at least one special record. // Pass 1: see if it has at least one finalizer. @@ -260,13 +262,13 @@ func (s *mspan) sweep(preserve bool) bool { s.allocCount = uint16(s.nelems) - uint16(nfree) wasempty := s.nextFreeIndex() == s.nelems - s.freeindex = 0 // reset allocation index to start of span. - // Swap role of allocBits with gcmarkBits - // Clear gcmarkBits in preparation for next GC - s.allocBits, s.gcmarkBits = s.gcmarkBits, s.allocBits - s.clearGCMarkBits() // prepare for next GC + // gcmarkBits becomes the allocBits. + // get a fresh cleared gcmarkBits in preparation for next GC + s.allocBits = s.gcmarkBits + s.gcmarkBits = newMarkBits(s.nelems) + // Initialize alloc bits cache. s.refillAllocCache(0) diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 1333dd696b..7d85891617 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -149,16 +149,31 @@ type mspan struct { // allocCache may contain bits beyond s.nelems; the caller must ignore // these. allocCache uint64 - allocBits *[maxObjsPerSpan / 8]uint8 - gcmarkBits *[maxObjsPerSpan / 8]uint8 - // allocBits and gcmarkBits currently point to either markbits1 - // or markbits2. At the end of a GC cycle allocBits and - // gcmarkBits swap roles simply by swapping pointers. 
- // This level of indirection also facilitates an implementation - // where markbits1 and markbits2 are not inlined in mspan. - markbits1 [maxObjsPerSpan / 8]uint8 // A bit for each obj. - markbits2 [maxObjsPerSpan / 8]uint8 // A bit for each obj. + // allocBits and gcmarkBits hold pointers to a span's mark and + // allocation bits. The pointers are 8 byte aligned. + // There are three arenas where this data is held. + // free: Dirty arenas that are no longer accessed + // and can be reused. + // next: Holds information to be used in the next GC cycle. + // current: Information being used during this GC cycle. + // previous: Information being used during the last GC cycle. + // A new GC cycle starts with the call to finishsweep_m. + // finishsweep_m moves the previous arena to the free arena, + // the current arena to the previous arena, and + // the next arena to the current arena. + // The next arena is populated as the spans request + // memory to hold gcmarkBits for the next GC cycle as well + // as allocBits for newly allocated spans. + // + // The pointer arithmetic is done "by hand" instead of using + // arrays to avoid bounds checks along critical performance + // paths. + // The sweep will free the old allocBits and set allocBits to the + // gcmarkBits. The gcmarkBits are replaced with a fresh zeroed + // out memory. + allocBits *uint8 + gcmarkBits *uint8 // sweep generation: // if sweepgen == h->sweepgen - 2, the span needs sweeping @@ -950,16 +965,8 @@ func (span *mspan) init(start pageID, npages uintptr) { span.specials = nil span.needzero = 0 span.freeindex = 0 - span.allocBits = &span.markbits1 - span.gcmarkBits = &span.markbits2 - // determine if this is actually needed. It is once / span so it - // isn't expensive. This is to be replaced by an arena - // based system where things can be cleared all at once so - // don't worry about optimizing this. - for i := 0; i < len(span.markbits1); i++ { - span.allocBits[i] = 0 - span.gcmarkBits[i] = 0 - } + span.allocBits = nil + span.gcmarkBits = nil } func (span *mspan) inList() bool { @@ -1226,3 +1233,117 @@ func freespecial(s *special, p unsafe.Pointer, size uintptr) { panic("not reached") } } + +const gcBitsChunkBytes = uintptr(1 << 16) +const gcBitsHeaderBytes = unsafe.Sizeof(gcBitsHeader{}) + +type gcBitsHeader struct { + free uintptr // free is the index into bits of the next free byte. + next uintptr // *gcBits triggers recursive type bug. (issue 14620) +} + +type gcBits struct { + // gcBitsHeader // side step recursive type bug (issue 14620) by including fields by hand. + free uintptr // free is the index into bits of the next free byte. + next *gcBits + bits [gcBitsChunkBytes - gcBitsHeaderBytes]uint8 +} + +var gcBitsArenas struct { + lock mutex + free *gcBits + next *gcBits + current *gcBits + previous *gcBits +} + +// newMarkBits returns a pointer to 8 byte aligned bytes +// to be used for a span's mark bits. +func newMarkBits(nelems uintptr) *uint8 { + lock(&gcBitsArenas.lock) + blocksNeeded := uintptr((nelems + 63) / 64) + bytesNeeded := blocksNeeded * 8 + if gcBitsArenas.next == nil || + gcBitsArenas.next.free+bytesNeeded > uintptr(len(gcBits{}.bits)) { + // Allocate a new arena. 
+ fresh := newArena() + fresh.next = gcBitsArenas.next + gcBitsArenas.next = fresh + } + if gcBitsArenas.next.free >= gcBitsChunkBytes { + println("runtime: gcBitsArenas.next.free=", gcBitsArenas.next.free, gcBitsChunkBytes) + throw("markBits overflow") + } + result := &gcBitsArenas.next.bits[gcBitsArenas.next.free] + gcBitsArenas.next.free += bytesNeeded + unlock(&gcBitsArenas.lock) + return result +} + +// newAllocBits returns a pointer to 8 byte aligned bytes +// to be used for this span's alloc bits. +// newAllocBits is used to provide newly initialized spans +// allocation bits. For spans not being initialized the +// the mark bits are repurposed as allocation bits when +// the span is swept. +func newAllocBits(nelems uintptr) *uint8 { + return newMarkBits(nelems) +} + +// nextMarkBitArenaEpoch establishes a new epoch for the arenas +// holding the mark bits. The arenas are named relative to the +// current GC cycle which is demarcated by the call to finishweep_m. +// +// All current spans have been swept. +// During that sweep each span allocated room for its gcmarkBits in +// gcBitsArenas.next block. gcBitsArenas.next becomes the gcBitsArenas.current +// where the GC will mark objects and after each span is swept these bits +// will be used to allocate objects. +// gcBitsArenas.current becomes gcBitsArenas.previous where the span's +// gcAllocBits live until all the spans have been swept during this GC cycle. +// The span's sweep extinguishes all the references to gcBitsArenas.previous +// by pointing gcAllocBits into the gcBitsArenas.current. +// The gcBitsArenas.previous is released to the gcBitsArenas.free list. +func nextMarkBitArenaEpoch() { + lock(&gcBitsArenas.lock) + if gcBitsArenas.previous != nil { + if gcBitsArenas.free == nil { + gcBitsArenas.free = gcBitsArenas.previous + } else { + // Find end of previous arenas. + last := gcBitsArenas.previous + for last = gcBitsArenas.previous; last.next != nil; last = last.next { + } + last.next = gcBitsArenas.free + gcBitsArenas.free = gcBitsArenas.previous + } + } + gcBitsArenas.previous = gcBitsArenas.current + gcBitsArenas.current = gcBitsArenas.next + gcBitsArenas.next = nil // newMarkBits calls newArena when needed + unlock(&gcBitsArenas.lock) +} + +// newArena allocates and zeroes a gcBits arena. +func newArena() *gcBits { + var result *gcBits + if gcBitsArenas.free == nil { + result = (*gcBits)(sysAlloc(gcBitsChunkBytes, &memstats.gc_sys)) + if result == nil { + throw("runtime: cannot allocate memory") + } + } else { + result = gcBitsArenas.free + gcBitsArenas.free = gcBitsArenas.free.next + memclr(unsafe.Pointer(result), gcBitsChunkBytes) + } + result.next = nil + // If result.bits is not 8 byte aligned adjust index so + // that &result.bits[result.free] is 8 byte aligned. + if uintptr(unsafe.Offsetof(gcBits{}.bits))&7 == 0 { + result.free = 0 + } else { + result.free = 8 - (uintptr(unsafe.Pointer(&result.bits[0])) & 7) + } + return result +} From 2fb75ea6c65d03c3fda89c8e954712a2fa97b052 Mon Sep 17 00:00:00 2001 From: Rick Hudson Date: Thu, 31 Mar 2016 10:45:36 -0400 Subject: [PATCH 14/23] [dev.garbage] runtime: use sys.Ctz64 intrinsic Our compilers now provides instrinsics including sys.Ctz64 that support CTZ (count trailing zero) instructions. This CL replaces the Go versions of CTZ with the compiler intrinsic. Count trailing zeros CTZ finds the least significant 1 in a word and returns the number of less significant 0s in the word. Allocation uses the bitmap created by the garbage collector to locate an unmarked object. 
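The gcBits machinery above is a bump allocator over 64 KB arenas plus a per-cycle rotation of the free/next/current/previous lists. A condensed sketch of both pieces, using ordinary heap-allocated arenas instead of sysAlloc'd chunks and omitting the gcBitsArenas.lock that the runtime holds around these operations:

package main

import "fmt"

const arenaBytes = 1 << 16

type bitsArena struct {
    free uintptr // index of the next unused byte in bits
    next *bitsArena
    bits [arenaBytes]uint8
}

type arenaSet struct {
    free, next, current, previous *bitsArena
}

// newMarkBits bump-allocates storage for nelems mark bits, rounded up
// to whole 8-byte blocks (one uint64 per 64 objects).
func (a *arenaSet) newMarkBits(nelems uintptr) []uint8 {
    bytesNeeded := (nelems + 63) / 64 * 8
    if a.next == nil || a.next.free+bytesNeeded > arenaBytes {
        a.next = &bitsArena{next: a.next} // start a fresh arena
    }
    b := a.next.bits[a.next.free : a.next.free+bytesNeeded]
    a.next.free += bytesNeeded
    return b
}

// nextEpoch rotates the lists at the start of a GC cycle: "next"
// becomes "current", "current" becomes "previous", and the old
// "previous" chain is recycled onto the free list.
func (a *arenaSet) nextEpoch() {
    if a.previous != nil {
        last := a.previous
        for ; last.next != nil; last = last.next {
        }
        last.next = a.free
        a.free = a.previous
    }
    a.previous = a.current
    a.current = a.next
    a.next = nil // repopulated lazily by newMarkBits
}

func main() {
    var a arenaSet
    fmt.Println(len(a.newMarkBits(512))) // 64 bytes
    fmt.Println(len(a.newMarkBits(3)))   // 8 bytes
    a.nextEpoch()
    fmt.Println(a.current != nil, a.next == nil) // true true
}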
The logic takes a word of the bitmap, complements, and then caches it. It then uses CTZ to locate an available unmarked object. It then shifts marked bits out of the bitmap word preparing it for the next search. Once all the unmarked objects are used in the cached work the bitmap gets another word and repeats the process. Change-Id: Id2fc42d1d4b9893efaa2e1bd01896985b7e42f82 Reviewed-on: https://go-review.googlesource.com/21366 Reviewed-by: Austin Clements --- src/runtime/malloc.go | 11 ++++--- src/runtime/mbitmap.go | 65 ++++-------------------------------------- src/runtime/mheap.go | 2 +- 3 files changed, 12 insertions(+), 66 deletions(-) diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 86fdb3fdbb..ec4939f1dd 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -491,14 +491,13 @@ var zerobase uintptr // Otherwise it returns 0. func (c *mcache) nextFreeFast(sizeclass int8) gclinkptr { s := c.alloc[sizeclass] - ctzIndex := uint8(s.allocCache) - if ctzIndex != 0 { - theBit := uint64(ctzVals[ctzIndex]) - freeidx := s.freeindex // help the pre ssa compiler out here with cse. - result := freeidx + uintptr(theBit) + + theBit := sys.Ctz64(s.allocCache) // Is there a free object in the allocCache? + if theBit < 64 { + result := s.freeindex + uintptr(theBit) if result < s.nelems { s.allocCache >>= (theBit + 1) - freeidx = result + 1 + freeidx := result + 1 if freeidx%64 == 0 && freeidx != s.nelems { // We just incremented s.freeindex so it isn't 0 // so we are moving to the next aCache. diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go index 387fb8535d..f2a5238c31 100644 --- a/src/runtime/mbitmap.go +++ b/src/runtime/mbitmap.go @@ -190,62 +190,7 @@ func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits { return markBits{bytePtr, uint8(1 << whichBit), allocBitIndex} } -// ctzVals contains the count of trailing zeros for the -// index. 0 returns 8 indicating 8 zeros. -var ctzVals = [256]int8{ - 8, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 6, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 7, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 6, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, - 3, 0, 1, 0, 2, 0, 1, 0} - -// A temporary stand in for the count trailing zero ctz instruction. -// IA bsf works on 64 bit non-zero word. -func ctz64(markBits uint64) uint64 { - ctz8 := ctzVals[markBits&0xff] - if ctz8 != 8 { - return uint64(ctz8) - } else if markBits == 0 { // low byte is zero check fill word. 
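The commit message describes the allocation fast path: cache the complement of a bitmap word, CTZ it to find the next free slot, then shift the consumed bits out. A standalone sketch of that scan, using math/bits.TrailingZeros64 as a stand-in for the sys.Ctz64 intrinsic (math/bits postdates this patch series and is used here only to keep the sketch self-contained):

package main

import (
    "fmt"
    "math/bits"
)

// spanCache is a toy stand-in for the relevant mspan fields: the cached
// complement of 64 alloc bits and the object index the cache starts at.
type spanCache struct {
    allocCache uint64
    freeindex  uint
}

// nextFree returns the index of the next free object, or 64 when the
// cached word is exhausted and the caller must refill the cache.
func (s *spanCache) nextFree() uint {
    theBit := uint(bits.TrailingZeros64(s.allocCache))
    if theBit == 64 {
        return 64
    }
    result := s.freeindex + theBit
    s.allocCache >>= theBit + 1 // shift the consumed bits out
    s.freeindex = result + 1
    return result
}

func main() {
    // Alloc bits 0101: objects 0 and 2 are allocated, the rest free;
    // the cache stores the complement.
    s := &spanCache{allocCache: ^uint64(0x5)}
    fmt.Println(s.nextFree(), s.nextFree(), s.nextFree()) // 1 3 4
}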
- return 64 // bits in 64 bit word, ensures loop terminates - } - result := uint64(8) - markBits >>= 8 - for ctz8 = ctzVals[markBits&0xff]; ctz8 == 8; ctz8 = ctzVals[markBits&0xff] { - result += 8 - markBits >>= 8 - } - result += uint64(ctz8) - return result -} - -// refillAllocCache takes 8 bytes s.allocBits starting at whichByte +// refillaCache takes 8 bytes s.allocBits starting at whichByte // and negates them so that ctz (count trailing zeros) instructions // can be used. It then places these 8 bytes into the cached 64 bit // s.allocCache. @@ -278,7 +223,8 @@ func (s *mspan) nextFreeIndex() uintptr { } aCache := s.allocCache - bitIndex := ctz64(aCache) + + bitIndex := sys.Ctz64(aCache) for bitIndex == 64 { // Move index to start of next cached bits. sfreeindex = (sfreeindex + 64) &^ (64 - 1) @@ -290,8 +236,9 @@ func (s *mspan) nextFreeIndex() uintptr { // Refill s.allocCache with the next 64 alloc bits. s.refillAllocCache(whichByte) aCache = s.allocCache - bitIndex = ctz64(aCache) - // Nothing was available try again now allocCache has been refilled. + bitIndex = sys.Ctz64(aCache) + // nothing available in cached bits + // grab the next 8 bytes and try again. } result := sfreeindex + uintptr(bitIndex) if result >= snelems { diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 7d85891617..e4946ff8e9 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -145,7 +145,7 @@ type mspan struct { // Cache of the allocBits at freeindex. allocCache is shifted // such that the lowest bit corresponds to the bit freeindex. // allocCache holds the complement of allocBits, thus allowing - // ctz64 (count trailing zero) to use it directly. + // ctz (count trailing zero) to use it directly. // allocCache may contain bits beyond s.nelems; the caller must ignore // these. allocCache uint64 From 15744c92de5e6a2295bfbae2126b19c124bbb46a Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Thu, 28 Apr 2016 10:53:25 -0400 Subject: [PATCH 15/23] [dev.garbage] runtime: remove unused head/end arguments from freeSpan These used to be used for the list of newly freed objects, but that's no longer a thing. Change-Id: I5a4503137b74ec0eae5372ca271b1aa0b32df074 Reviewed-on: https://go-review.googlesource.com/22557 Reviewed-by: Rick Hudson Run-TryBot: Austin Clements TryBot-Result: Gobot Gobot --- src/runtime/mcentral.go | 2 +- src/runtime/mgcsweep.go | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go index bbbfb18fbf..5baaef99f8 100644 --- a/src/runtime/mcentral.go +++ b/src/runtime/mcentral.go @@ -160,7 +160,7 @@ func (c *mcentral) uncacheSpan(s *mspan) { // freeSpan returns true if s was returned to the heap. // If preserve=true, it does not move s (the caller // must take care of it). 
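refillAllocCache is the other half of the fast path: nextFreeIndex calls it to pull in the next 8 bytes of allocBits, negated so the CTZ scan sees free slots as 1 bits. A minimal sketch of that refill; the bytes are packed little-endian here so that cache bit i corresponds to object whichByte*8 + i, which is the correspondence the scan relies on.

package main

import (
    "fmt"
    "math/bits"
)

// refillAllocCache builds the 64-bit cache covering objects
// [whichByte*8, whichByte*8+64): each byte of allocBits is complemented
// so that a set bit in the cache marks a free slot.
func refillAllocCache(allocBits []uint8, whichByte uintptr) uint64 {
    var cache uint64
    for i := uintptr(0); i < 8; i++ {
        cache |= uint64(^allocBits[whichByte+i]) << (8 * i)
    }
    return cache
}

func main() {
    allocBits := make([]uint8, 16)
    allocBits[8] = 0xFF // objects 64 through 71 are allocated
    cache := refillAllocCache(allocBits, 8)
    // The first free object in this 64-object window is object 72.
    fmt.Println(64 + bits.TrailingZeros64(cache)) // 72
}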
-func (c *mcentral) freeSpan(s *mspan, start gclinkptr, end gclinkptr, preserve bool, wasempty bool) bool { +func (c *mcentral) freeSpan(s *mspan, preserve bool, wasempty bool) bool { if s.incache { throw("freeSpan given cached span") } diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go index b1d6234af4..084d0a71c1 100644 --- a/src/runtime/mgcsweep.go +++ b/src/runtime/mgcsweep.go @@ -187,8 +187,6 @@ func (s *mspan) sweep(preserve bool) bool { res := false nfree := 0 - var head, end gclinkptr - c := _g_.m.mcache freeToHeap := false @@ -292,7 +290,7 @@ func (s *mspan) sweep(preserve bool) bool { if nfree > 0 && cl != 0 { c.local_nsmallfree[cl] += uintptr(nfree) - res = mheap_.central[cl].mcentral.freeSpan(s, head, end, preserve, wasempty) + res = mheap_.central[cl].mcentral.freeSpan(s, preserve, wasempty) // MCentral_FreeSpan updates sweepgen } else if freeToHeap { // Free large span to heap From 2e8b74b69574e969b5565e69cb54d39064b2dba1 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Thu, 28 Apr 2016 11:19:53 -0400 Subject: [PATCH 16/23] [dev.garbage] runtime: document sysAlloc In particular, it always returns an aligned pointer. Change-Id: I763789a539a4bfd8b0efb36a39a80be1a479d3e2 Reviewed-on: https://go-review.googlesource.com/22558 Reviewed-by: Rick Hudson Run-TryBot: Austin Clements TryBot-Result: Gobot Gobot --- src/runtime/malloc.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index ec4939f1dd..2d8905b88d 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -387,6 +387,10 @@ func sysReserveHigh(n uintptr, reserved *bool) unsafe.Pointer { return sysReserve(nil, n, reserved) } +// sysAlloc allocates the next n bytes from the heap arena. The +// returned pointer is always _PageSize aligned and between +// h.arena_start and h.arena_end. sysAlloc returns nil on failure. +// There is no corresponding free function. func (h *mheap) sysAlloc(n uintptr) unsafe.Pointer { if n > h.arena_end-h.arena_used { // We are in 32-bit mode, maybe we didn't use all possible address space yet. From b7adc41fbacac446c1daf0cb282cb2a921d4a15b Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Thu, 28 Apr 2016 10:59:00 -0400 Subject: [PATCH 17/23] [dev.garbage] runtime: use s.base() everywhere it makes sense Currently we have lots of (s.start << _PageShift) and variants. We now have an s.base() function that returns this. It's faster and more readable, so use it. 
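For reference, the accessor this message talks about amounts to caching the page-to-address shift once per span. The sketch below is illustrative only; the names follow the patches that come next (startAddr, _PageShift) rather than copying the runtime declarations.

package main

import "fmt"

const _PageShift = 13

type pageID uintptr

// Old style: every call site recomputed the address from the page number.
func baseFromStart(start pageID) uintptr {
    return uintptr(start) << _PageShift
}

// New style: the span caches the address once, and s.base() returns it.
type mspan struct {
    startAddr uintptr // set in init; equals the old start << _PageShift
}

func (s *mspan) base() uintptr { return s.startAddr }

func main() {
    s := &mspan{startAddr: baseFromStart(512)}
    fmt.Println(s.base() == 512<<_PageShift) // true
}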
Change-Id: I888060a9dae15ea75ca8cc1c2b31c905e71b452b Reviewed-on: https://go-review.googlesource.com/22559 Reviewed-by: Rick Hudson Run-TryBot: Austin Clements --- src/runtime/heapdump.go | 6 +++--- src/runtime/mfinal.go | 2 +- src/runtime/mgcmark.go | 2 +- src/runtime/mheap.go | 8 ++++---- src/runtime/stack.go | 6 +++--- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go index 6085c6866c..4afe663418 100644 --- a/src/runtime/heapdump.go +++ b/src/runtime/heapdump.go @@ -447,7 +447,7 @@ func dumproots() { continue } spf := (*specialfinalizer)(unsafe.Pointer(sp)) - p := unsafe.Pointer((uintptr(s.start) << _PageShift) + uintptr(spf.special.offset)) + p := unsafe.Pointer(s.base() + uintptr(spf.special.offset)) dumpfinalizer(p, spf.fn, spf.fint, spf.ot) } } @@ -467,7 +467,7 @@ func dumpobjs() { if s.state != _MSpanInUse { continue } - p := uintptr(s.start << _PageShift) + p := s.base() size := s.elemsize n := (s.npages << _PageShift) / size if n > uintptr(len(freemark)) { @@ -619,7 +619,7 @@ func dumpmemprof() { continue } spp := (*specialprofile)(unsafe.Pointer(sp)) - p := uintptr(s.start<<_PageShift) + uintptr(spp.special.offset) + p := s.base() + uintptr(spp.special.offset) dumpint(tagAllocSample) dumpint(uint64(p)) dumpint(uint64(uintptr(unsafe.Pointer(spp.b)))) diff --git a/src/runtime/mfinal.go b/src/runtime/mfinal.go index e81650d842..6dce6d7501 100644 --- a/src/runtime/mfinal.go +++ b/src/runtime/mfinal.go @@ -402,7 +402,7 @@ func findObject(v unsafe.Pointer) (s *mspan, x unsafe.Pointer, n uintptr) { if s == nil { return } - x = unsafe.Pointer(uintptr(s.start) << pageShift) + x = unsafe.Pointer(s.base()) if uintptr(v) < uintptr(x) || uintptr(v) >= uintptr(unsafe.Pointer(s.limit)) || s.state != mSpanInUse { s = nil diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index 18f930f89a..14449c3d4b 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -1247,7 +1247,7 @@ func gcDumpObject(label string, obj, off uintptr) { print(" s=nil\n") return } - print(" s.start*_PageSize=", hex(s.start*_PageSize), " s.limit=", hex(s.limit), " s.sizeclass=", s.sizeclass, " s.elemsize=", s.elemsize, "\n") + print(" s.base()=", hex(s.base()), " s.limit=", hex(s.limit), " s.sizeclass=", s.sizeclass, " s.elemsize=", s.elemsize, "\n") skipped := false for i := uintptr(0); i < s.elemsize; i += sys.PtrSize { // For big objects, just print the beginning (because diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index e4946ff8e9..40ed466038 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -808,7 +808,7 @@ func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool, unusedsince i } case _MSpanInUse: if s.allocCount != 0 || s.sweepgen != h.sweepgen { - print("MHeap_FreeSpanLocked - span ", s, " ptr ", hex(s.start<<_PageShift), " allocCount ", s.allocCount, " sweepgen ", s.sweepgen, "/", h.sweepgen, "\n") + print("MHeap_FreeSpanLocked - span ", s, " ptr ", hex(s.base()), " allocCount ", s.allocCount, " sweepgen ", s.sweepgen, "/", h.sweepgen, "\n") throw("MHeap_FreeSpanLocked - invalid free") } h.pagesInUse -= uint64(s.npages) @@ -892,7 +892,7 @@ func scavengelist(list *mSpanList, now, limit uint64) uintptr { var sumreleased uintptr for s := list.first; s != nil; s = s.next { if (now-uint64(s.unusedsince)) > limit && s.npreleased != s.npages { - start := uintptr(s.start) << _PageShift + start := s.base() end := start + s.npages<<_PageShift if sys.PhysPageSize > _PageSize { // We can only release pages in 
@@ -1062,7 +1062,7 @@ func addspecial(p unsafe.Pointer, s *special) bool { mp := acquirem() span.ensureSwept() - offset := uintptr(p) - uintptr(span.start<<_PageShift) + offset := uintptr(p) - span.base() kind := s.kind lock(&span.speciallock) @@ -1110,7 +1110,7 @@ func removespecial(p unsafe.Pointer, kind uint8) *special { mp := acquirem() span.ensureSwept() - offset := uintptr(p) - uintptr(span.start<<_PageShift) + offset := uintptr(p) - span.base() lock(&span.speciallock) t := &span.specials diff --git a/src/runtime/stack.go b/src/runtime/stack.go index ac4efc114b..f68c513fd6 100644 --- a/src/runtime/stack.go +++ b/src/runtime/stack.go @@ -198,7 +198,7 @@ func stackpoolalloc(order uint8) gclinkptr { throw("bad stackfreelist") } for i := uintptr(0); i < _StackCacheSize; i += _FixedStack << order { - x := gclinkptr(uintptr(s.start)<<_PageShift + i) + x := gclinkptr(s.base() + i) x.ptr().next = s.stackfreelist s.stackfreelist = x } @@ -391,7 +391,7 @@ func stackalloc(n uint32) (stack, []stkbar) { throw("out of memory") } } - v = unsafe.Pointer(s.start << _PageShift) + v = unsafe.Pointer(s.base()) } if raceenabled { @@ -456,7 +456,7 @@ func stackfree(stk stack, n uintptr) { } else { s := mheap_.lookup(v) if s.state != _MSpanStack { - println(hex(s.start<<_PageShift), v) + println(hex(s.base()), v) throw("bad span state") } if gcphase == _GCoff { From 3e2462387f39db99a9a2b551c444c22fae460949 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Thu, 28 Apr 2016 11:21:01 -0400 Subject: [PATCH 18/23] [dev.garbage] runtime: eliminate mspan.start This converts all remaining uses of mspan.start to instead use mspan.base(). In many cases, this actually reduces the complexity of the code. Change-Id: If113840e00d3345a6cf979637f6a152e6344aee7 Reviewed-on: https://go-review.googlesource.com/22590 Reviewed-by: Rick Hudson Run-TryBot: Austin Clements --- src/runtime/malloc.go | 3 --- src/runtime/mbitmap.go | 3 +-- src/runtime/mheap.go | 45 +++++++++++++++--------------------------- 3 files changed, 17 insertions(+), 34 deletions(-) diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 2d8905b88d..5210b3d910 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -170,9 +170,6 @@ const ( _MaxGcproc = 32 ) -// Page number (address>>pageShift) -type pageID uintptr - const _MaxArena32 = 2 << 30 // OS-defined helpers: diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go index f2a5238c31..9df64cb168 100644 --- a/src/runtime/mbitmap.go +++ b/src/runtime/mbitmap.go @@ -392,9 +392,8 @@ func heapBitsForObject(p, refBase, refOff uintptr) (base uintptr, hbits heapBits idx := off >> _PageShift // p points into the heap, but possibly to the middle of an object. // Consult the span table to find the block beginning. - k := p >> _PageShift s = h_spans[idx] - if s == nil || pageID(k) < s.start || p >= s.limit || s.state != mSpanInUse { + if s == nil || p < s.base() || p >= s.limit || s.state != mSpanInUse { if s == nil || s.state == _MSpanStack { // If s is nil, the virtual address has never been part of the heap. // This pointer may be to some mmap'd region, so we allow it. diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 40ed466038..1f732c2111 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -116,9 +116,8 @@ type mspan struct { next *mspan // next span in list, or nil if none prev **mspan // previous span's next field, or list head's first field if none list *mSpanList // For debugging. TODO: Remove. 
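Patch 18 also rewrites the pointer-to-span lookups (heapBitsForObject above, inheap and lookupMaybe below) to index h_spans directly with (addr - arena_start) >> _PageShift and validate against s.base() and s.limit. A simplified, self-contained version of that lookup, with the heap reduced to a slice of per-page span pointers:

package main

import "fmt"

const pageShift = 13 // 8 KB pages

type span struct {
    startAddr uintptr
    limit     uintptr
    inUse     bool
}

type heap struct {
    arenaStart uintptr
    spans      []*span // one entry per arena page, like h_spans
}

// spanOf maps a heap pointer to the span containing it, or nil if the
// address does not fall inside an in-use span.
func (h *heap) spanOf(p uintptr) *span {
    if p < h.arenaStart {
        return nil
    }
    idx := (p - h.arenaStart) >> pageShift
    if idx >= uintptr(len(h.spans)) {
        return nil
    }
    s := h.spans[idx]
    if s == nil || p < s.startAddr || p >= s.limit || !s.inUse {
        return nil
    }
    return s
}

func main() {
    arena := uintptr(1 << 20)
    s := &span{startAddr: arena, limit: arena + 2<<pageShift, inUse: true}
    h := &heap{arenaStart: arena, spans: []*span{s, s, nil, nil}}
    fmt.Println(h.spanOf(arena+100) == s)     // true
    fmt.Println(h.spanOf(arena + 3<<pageShift)) // <nil>
}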
- //TODO:(rlh) Eliminate start field and use startAddr >> PageShift instead. - startAddr uintptr // uintptr(s.start << _PageShift) aka s.base() - start pageID // starting page number + + startAddr uintptr // address of first byte of span aka s.base() npages uintptr // number of pages in span stackfreelist gclinkptr // list of free stacks, avoids overloading freelist @@ -262,11 +261,8 @@ func inheap(b uintptr) bool { return false } // Not a beginning of a block, consult span table to find the block beginning. - k := b >> _PageShift - x := k - x -= mheap_.arena_start >> _PageShift - s := h_spans[x] - if s == nil || pageID(k) < s.start || b >= s.limit || s.state != mSpanInUse { + s := h_spans[(b-mheap_.arena_start)>>_PageShift] + if s == nil || b < s.base() || b >= s.limit || s.state != mSpanInUse { return false } return true @@ -634,10 +630,9 @@ HaveSpan: if s.npages > npage { // Trim extra and put it back in the heap. t := (*mspan)(h.spanalloc.alloc()) - t.init(s.start+pageID(npage), s.npages-npage) + t.init(s.base()+npage<<_PageShift, s.npages-npage) s.npages = npage - p := uintptr(t.start) - p -= (h.arena_start >> _PageShift) + p := (t.base() - h.arena_start) >> _PageShift if p > 0 { h_spans[p-1] = s } @@ -651,8 +646,7 @@ HaveSpan: } s.unusedsince = 0 - p := uintptr(s.start) - p -= (h.arena_start >> _PageShift) + p := (s.base() - h.arena_start) >> _PageShift for n := uintptr(0); n < npage; n++ { h_spans[p+n] = s } @@ -680,7 +674,7 @@ func bestFit(list *mSpanList, npage uintptr, best *mspan) *mspan { if s.npages < npage { continue } - if best == nil || s.npages < best.npages || (s.npages == best.npages && s.start < best.start) { + if best == nil || s.npages < best.npages || (s.npages == best.npages && s.base() < best.base()) { best = s } } @@ -717,9 +711,8 @@ func (h *mheap) grow(npage uintptr) bool { // Create a fake "in use" span and free it, so that the // right coalescing happens. s := (*mspan)(h.spanalloc.alloc()) - s.init(pageID(uintptr(v)>>_PageShift), ask>>_PageShift) - p := uintptr(s.start) - p -= (h.arena_start >> _PageShift) + s.init(uintptr(v), ask>>_PageShift) + p := (s.base() - h.arena_start) >> _PageShift for i := p; i < p+s.npages; i++ { h_spans[i] = s } @@ -750,11 +743,8 @@ func (h *mheap) lookupMaybe(v unsafe.Pointer) *mspan { if uintptr(v) < h.arena_start || uintptr(v) >= h.arena_used { return nil } - p := uintptr(v) >> _PageShift - q := p - q -= h.arena_start >> _PageShift - s := h_spans[q] - if s == nil || p < uintptr(s.start) || uintptr(v) >= uintptr(unsafe.Pointer(s.limit)) || s.state != _MSpanInUse { + s := h_spans[(uintptr(v)-h.arena_start)>>_PageShift] + if s == nil || uintptr(v) < s.base() || uintptr(v) >= uintptr(unsafe.Pointer(s.limit)) || s.state != _MSpanInUse { return nil } return s @@ -836,13 +826,11 @@ func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool, unusedsince i s.npreleased = 0 // Coalesce with earlier, later spans. - p := uintptr(s.start) - p -= h.arena_start >> _PageShift + p := (s.base() - h.arena_start) >> _PageShift if p > 0 { t := h_spans[p-1] if t != nil && t.state == _MSpanFree { - s.start = t.start - s.startAddr = uintptr(s.start << _PageShift) + s.startAddr = t.startAddr s.npages += t.npages s.npreleased = t.npreleased // absorb released pages s.needzero |= t.needzero @@ -947,12 +935,11 @@ func runtime_debug_freeOSMemory() { } // Initialize a new span with the given start and npages. 
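The freeSpanLocked hunk above coalesces a newly freed span with an adjacent earlier free span by adopting its startAddr and adding its pages. A small sketch of that merge together with the page-map bookkeeping; for simplicity this version just rewrites every covered page entry.

package main

import "fmt"

const pageShift = 13

type span struct {
    startAddr uintptr
    npages    uintptr
    free      bool
}

// coalesceEarlier merges s with an immediately preceding free span, if
// there is one, and points the covered page-map entries at the merged
// span. pageMap[i] describes the page at arenaStart + i<<pageShift.
func coalesceEarlier(pageMap []*span, arenaStart uintptr, s *span) {
    p := (s.startAddr - arenaStart) >> pageShift
    if p == 0 {
        return // no earlier neighbor
    }
    t := pageMap[p-1]
    if t == nil || !t.free {
        return
    }
    s.startAddr = t.startAddr // absorb the earlier span
    s.npages += t.npages
    first := (s.startAddr - arenaStart) >> pageShift
    for i := first; i < first+s.npages; i++ {
        pageMap[i] = s
    }
}

func main() {
    arena := uintptr(1 << 20)
    t := &span{startAddr: arena, npages: 2, free: true}
    s := &span{startAddr: arena + 2<<pageShift, npages: 1, free: true}
    pageMap := []*span{t, t, s, nil}
    coalesceEarlier(pageMap, arena, s)
    fmt.Println(s.npages, pageMap[0] == s) // 3 true
}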
-func (span *mspan) init(start pageID, npages uintptr) { +func (span *mspan) init(base uintptr, npages uintptr) { span.next = nil span.prev = nil span.list = nil - span.start = start - span.startAddr = uintptr(start << _PageShift) + span.startAddr = base span.npages = npages span.allocCount = 0 span.sizeclass = 0 From 38f674687a5dbce63af60a0a52892f666d7c626c Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Thu, 28 Apr 2016 15:32:01 -0400 Subject: [PATCH 19/23] [dev.garbage] runtime: reintroduce no-zeroing optimization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we always zero objects when we allocate them. We used to have an optimization that would not zero objects that had not been allocated since the whole span was last zeroed (either by getting it from the system or by getting it from the heap, which does a bulk zero), but this depended on the sweeper clobbering the first two words of each object. Hence, we lost this optimization when the bitmap sweeper went away. Re-introduce this optimization using a different mechanism. Each span already keeps a flag indicating that it just came from the OS or was just bulk zeroed by the mheap. We can simply use this flag to know when we don't need to zero an object. This is slightly less efficient than the old optimization: if a span gets allocated and partially used, then GC happens and the span gets returned to the mcentral, then the span gets re-acquired, the old optimization knew that it only had to re-zero the objects that had been reclaimed, whereas this optimization will re-zero everything. However, in this case, you're already paying for the garbage collection, and you've only wasted one zeroing of the span, so in practice there seems to be little difference. (If we did want to revive the full optimization, each span could keep track of a frontier beyond which all free slots are zeroed. I prototyped this and it didn't obvious do any better than the much simpler approach in this commit.) This significantly improves BinaryTree17, which is allocation-heavy (and runs first, so most pages are already zeroed), and slightly improves everything else. 
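In code, the optimization this message describes reduces to one extra flag check on the allocation path; the malloc.go hunk later in this patch is the real change. The toy below only illustrates the decision, with a byte-slice clear standing in for the runtime's memclr.

package main

import "fmt"

type span struct {
    needzero bool // false only while the span's memory is known to be all zero
}

// allocate models the zeroing decision in mallocgc: clear the slot only
// when the caller wants zeroed memory and the span cannot vouch for it.
func allocate(s *span, obj []byte, needzero bool) {
    if needzero && s.needzero {
        for i := range obj {
            obj[i] = 0
        }
    }
}

func main() {
    fresh := &span{needzero: false}   // just mapped or bulk zeroed by the heap
    recycled := &span{needzero: true} // handed back to mcentral earlier
    buf := []byte{1, 2, 3}
    // A truly fresh span's memory is already zero; the stale bytes here
    // only make the skipped clear visible.
    allocate(fresh, buf, true)
    fmt.Println(buf) // [1 2 3]
    allocate(recycled, buf, true)
    fmt.Println(buf) // [0 0 0]
}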
name old time/op new time/op delta XBenchGarbage-12 2.15ms ± 1% 2.14ms ± 1% -0.80% (p=0.000 n=17+17) name old time/op new time/op delta BinaryTree17-12 2.71s ± 1% 2.56s ± 1% -5.73% (p=0.000 n=18+19) DivconstI64-12 1.70ns ± 1% 1.70ns ± 1% ~ (p=0.562 n=18+18) DivconstU64-12 1.74ns ± 2% 1.74ns ± 1% ~ (p=0.394 n=20+20) DivconstI32-12 1.74ns ± 0% 1.74ns ± 0% ~ (all samples are equal) DivconstU32-12 1.66ns ± 1% 1.66ns ± 0% ~ (p=0.516 n=15+16) DivconstI16-12 1.84ns ± 0% 1.84ns ± 0% ~ (all samples are equal) DivconstU16-12 1.82ns ± 0% 1.82ns ± 0% ~ (all samples are equal) DivconstI8-12 1.79ns ± 0% 1.79ns ± 0% ~ (all samples are equal) DivconstU8-12 1.60ns ± 0% 1.60ns ± 1% ~ (p=0.603 n=17+19) Fannkuch11-12 2.11s ± 1% 2.11s ± 0% ~ (p=0.333 n=16+19) FmtFprintfEmpty-12 45.1ns ± 4% 45.4ns ± 5% ~ (p=0.111 n=20+20) FmtFprintfString-12 134ns ± 0% 129ns ± 0% -3.45% (p=0.000 n=18+16) FmtFprintfInt-12 131ns ± 1% 129ns ± 1% -1.54% (p=0.000 n=16+18) FmtFprintfIntInt-12 205ns ± 2% 203ns ± 0% -0.56% (p=0.014 n=20+18) FmtFprintfPrefixedInt-12 200ns ± 2% 197ns ± 1% -1.48% (p=0.000 n=20+18) FmtFprintfFloat-12 256ns ± 1% 256ns ± 0% -0.21% (p=0.008 n=18+20) FmtManyArgs-12 805ns ± 0% 804ns ± 0% -0.19% (p=0.001 n=18+18) GobDecode-12 7.21ms ± 1% 7.14ms ± 1% -0.92% (p=0.000 n=19+20) GobEncode-12 5.88ms ± 1% 5.88ms ± 1% ~ (p=0.641 n=18+19) Gzip-12 218ms ± 1% 218ms ± 1% ~ (p=0.271 n=19+18) Gunzip-12 37.1ms ± 0% 36.9ms ± 0% -0.29% (p=0.000 n=18+17) HTTPClientServer-12 78.1µs ± 2% 77.4µs ± 2% ~ (p=0.070 n=19+19) JSONEncode-12 15.5ms ± 1% 15.5ms ± 0% ~ (p=0.063 n=20+18) JSONDecode-12 56.1ms ± 0% 55.4ms ± 1% -1.18% (p=0.000 n=19+18) Mandelbrot200-12 4.05ms ± 0% 4.06ms ± 0% +0.29% (p=0.001 n=18+18) GoParse-12 3.28ms ± 1% 3.21ms ± 1% -2.30% (p=0.000 n=20+20) RegexpMatchEasy0_32-12 69.4ns ± 2% 69.3ns ± 1% ~ (p=0.205 n=18+16) RegexpMatchEasy0_1K-12 239ns ± 0% 239ns ± 0% ~ (all samples are equal) RegexpMatchEasy1_32-12 69.4ns ± 1% 69.4ns ± 1% ~ (p=0.620 n=15+18) RegexpMatchEasy1_1K-12 370ns ± 1% 369ns ± 2% ~ (p=0.088 n=20+20) RegexpMatchMedium_32-12 108ns ± 0% 108ns ± 0% ~ (all samples are equal) RegexpMatchMedium_1K-12 33.6µs ± 3% 33.5µs ± 3% ~ (p=0.718 n=20+20) RegexpMatchHard_32-12 1.68µs ± 1% 1.67µs ± 2% ~ (p=0.316 n=20+20) RegexpMatchHard_1K-12 50.5µs ± 3% 50.4µs ± 3% ~ (p=0.659 n=20+20) Revcomp-12 381ms ± 1% 381ms ± 1% ~ (p=0.916 n=19+18) Template-12 66.5ms ± 1% 65.8ms ± 2% -1.08% (p=0.000 n=20+20) TimeParse-12 317ns ± 0% 319ns ± 0% +0.48% (p=0.000 n=19+12) TimeFormat-12 338ns ± 0% 338ns ± 0% ~ (p=0.124 n=19+18) [Geo mean] 5.99µs 5.96µs -0.54% Change-Id: I638ffd9d9f178835bbfa499bac20bd7224f1a907 Reviewed-on: https://go-review.googlesource.com/22591 Reviewed-by: Rick Hudson --- src/runtime/malloc.go | 23 ++++++++++------------- src/runtime/mcentral.go | 2 +- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 5210b3d910..d5061b55ba 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -490,9 +490,7 @@ var zerobase uintptr // nextFreeFast returns the next free object if one is quickly available. // Otherwise it returns 0. -func (c *mcache) nextFreeFast(sizeclass int8) gclinkptr { - s := c.alloc[sizeclass] - +func nextFreeFast(s *mspan) gclinkptr { theBit := sys.Ctz64(s.allocCache) // Is there a free object in the allocCache? if theBit < 64 { result := s.freeindex + uintptr(theBit) @@ -520,8 +518,8 @@ func (c *mcache) nextFreeFast(sizeclass int8) gclinkptr { // weight allocation. 
If it is a heavy weight allocation the caller must // determine whether a new GC cycle needs to be started or if the GC is active // whether this goroutine needs to assist the GC. -func (c *mcache) nextFree(sizeclass int8) (v gclinkptr, shouldhelpgc bool) { - s := c.alloc[sizeclass] +func (c *mcache) nextFree(sizeclass int8) (v gclinkptr, s *mspan, shouldhelpgc bool) { + s = c.alloc[sizeclass] shouldhelpgc = false freeIndex := s.nextFreeIndex() if freeIndex == s.nelems { @@ -658,10 +656,10 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { return x } // Allocate a new maxTinySize block. - var v gclinkptr - v = c.nextFreeFast(tinySizeClass) + span := c.alloc[tinySizeClass] + v := nextFreeFast(span) if v == 0 { - v, shouldhelpgc = c.nextFree(tinySizeClass) + v, _, shouldhelpgc = c.nextFree(tinySizeClass) } x = unsafe.Pointer(v) (*[2]uint64)(x)[0] = 0 @@ -681,15 +679,14 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { sizeclass = size_to_class128[(size-1024+127)>>7] } size = uintptr(class_to_size[sizeclass]) - var v gclinkptr - v = c.nextFreeFast(sizeclass) + span := c.alloc[sizeclass] + v := nextFreeFast(span) if v == 0 { - v, shouldhelpgc = c.nextFree(sizeclass) + v, span, shouldhelpgc = c.nextFree(sizeclass) } x = unsafe.Pointer(v) - if needzero { + if needzero && span.needzero != 0 { memclr(unsafe.Pointer(v), size) - // TODO:(rlh) Only clear if object is not known to be zeroed. } } } else { diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go index 5baaef99f8..7b63110460 100644 --- a/src/runtime/mcentral.go +++ b/src/runtime/mcentral.go @@ -164,6 +164,7 @@ func (c *mcentral) freeSpan(s *mspan, preserve bool, wasempty bool) bool { if s.incache { throw("freeSpan given cached span") } + s.needzero = 1 if preserve { // preserve is set only when called from MCentral_CacheSpan above, @@ -195,7 +196,6 @@ func (c *mcentral) freeSpan(s *mspan, preserve bool, wasempty bool) bool { } c.nonempty.remove(s) - s.needzero = 1 unlock(&c.lock) mheap_.freeSpan(s, 0) return true From 6d11490539e3aa459066b94c6587f5429dfe7a31 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Thu, 28 Apr 2016 15:49:39 -0400 Subject: [PATCH 20/23] [dev.garbage] runtime: fix allocfreetrace We broke tracing of freed objects in GODEBUG=allocfreetrace=1 mode when we removed the sweep over the mark bitmap. Fix it by re-introducing the sweep over the bitmap specifically if we're in allocfreetrace mode. This doesn't have to be even remotely efficient, since the overhead of allocfreetrace is huge anyway, so we can keep the code for this down to just a few lines. Change-Id: I9e176b3b04c73608a0ea3068d5d0cd30760ebd40 Reviewed-on: https://go-review.googlesource.com/22592 Run-TryBot: Austin Clements TryBot-Result: Gobot Gobot Reviewed-by: Rick Hudson --- src/runtime/malloc.go | 17 ----------------- src/runtime/mgcsweep.go | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index d5061b55ba..2ac504f9dc 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -743,23 +743,6 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { gcmarknewobject(uintptr(x), size, scanSize) } - // The object x is about to be reused but tracefree and msanfree - // need to be informed. - // TODO:(rlh) It is quite possible that this object is being allocated - // out of a fresh span and that there is no preceding call to - // tracealloc with this object. 
If this is an issue then initialization - // of the fresh span needs to leave some crumbs around that can be used to - // avoid these calls. Furthermore these crumbs a likely the same as - // those needed to determine if the object needs to be zeroed. - // In the case of msanfree it does not make sense to call msanfree - // followed by msanmalloc. msanfree indicates that the bytes are not - // initialized but msanmalloc is about to indicate that they are. - // It makes no difference whether msanmalloc has been called on these - // bytes or not. - if debug.allocfreetrace != 0 { - tracefree(unsafe.Pointer(x), size) - } - if raceenabled { racemalloc(x, size) } diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go index 084d0a71c1..c9ef63547a 100644 --- a/src/runtime/mgcsweep.go +++ b/src/runtime/mgcsweep.go @@ -251,6 +251,21 @@ func (s *mspan) sweep(preserve bool) bool { } } + if debug.allocfreetrace != 0 { + // Find all newly freed objects. This doesn't have to + // efficient; allocfreetrace has massive overhead. + mbits := s.markBitsForBase() + abits := s.allocBitsForIndex(0) + for i := uintptr(0); i < s.nelems; i++ { + if !mbits.isMarked() && (abits.index < s.freeindex || abits.isMarked()) { + x := s.base() + i*s.elemsize + tracefree(unsafe.Pointer(x), size) + } + mbits.advance() + abits.advance() + } + } + // Count the number of free objects in this span. nfree = s.countFree() if cl == 0 && nfree != 0 { From d97625ae9e7195a68d1c9f2b2ff54eb85545982e Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Fri, 29 Apr 2016 09:44:53 -0400 Subject: [PATCH 21/23] [dev.garbage] runtime: fix nfree accounting Commit 8dda1c4 changed the meaning of "nfree" in sweep from the number of newly freed objects to the total number of free objects in the span, but didn't update where sweep added nfree to c.local_nsmallfree. Hence, we're over-accounting the number of frees. This is causing TestArrayHash to fail with "too many allocs NNN - hash not balanced". Fix this by computing the number of newly freed objects and adding that to c.local_nsmallfree, so it behaves like it used to. Computing this requires a small tweak to mallocgc: apparently we've never set s.allocCount when allocating a large object; fix this by setting it to 1 so sweep doesn't get confused. 
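A small worked example of the accounting this fix restores, mirroring the sweep hunk that follows: nfree counts every free slot after marking, nalloc is what remains live, and only nfreed, the newly freed objects, should be credited to local_nsmallfree.

package main

import "fmt"

func main() {
    // A span with 10 slots; 7 were allocated before this sweep.
    nelems := uint16(10)
    prevAllocCount := uint16(7)

    // Marking left 5 slots unmarked (free), so 5 remain live.
    nfree := uint16(5)
    nalloc := nelems - nfree          // 5 objects still allocated
    nfreed := prevAllocCount - nalloc // 2 objects newly freed by this sweep

    if nalloc > prevAllocCount {
        panic("sweep increased allocation count") // would indicate a bug
    }

    // Only the newly freed objects are credited to local_nsmallfree.
    fmt.Println(nalloc, nfreed) // 5 2
}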
Change-Id: I31902ffd310110da4ffd807c5c06f1117b872dc8 Reviewed-on: https://go-review.googlesource.com/22595 Reviewed-by: Rick Hudson Run-TryBot: Austin Clements --- src/runtime/malloc.go | 1 + src/runtime/mgcsweep.go | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 2ac504f9dc..438cd06161 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -696,6 +696,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { s = largeAlloc(size, needzero) }) s.freeindex = 1 + s.allocCount = 1 x = unsafe.Pointer(s.base()) size = s.elemsize } diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go index c9ef63547a..82537edaaa 100644 --- a/src/runtime/mgcsweep.go +++ b/src/runtime/mgcsweep.go @@ -272,8 +272,14 @@ func (s *mspan) sweep(preserve bool) bool { s.needzero = 1 freeToHeap = true } + nalloc := uint16(s.nelems) - uint16(nfree) + nfreed := s.allocCount - nalloc + if nalloc > s.allocCount { + print("runtime: nelems=", s.nelems, " nfree=", nfree, " nalloc=", nalloc, " previous allocCount=", s.allocCount, " nfreed=", nfreed, "\n") + throw("sweep increased allocation count") + } - s.allocCount = uint16(s.nelems) - uint16(nfree) + s.allocCount = nalloc wasempty := s.nextFreeIndex() == s.nelems s.freeindex = 0 // reset allocation index to start of span. @@ -304,7 +310,7 @@ func (s *mspan) sweep(preserve bool) bool { } if nfree > 0 && cl != 0 { - c.local_nsmallfree[cl] += uintptr(nfree) + c.local_nsmallfree[cl] += uintptr(nfreed) res = mheap_.central[cl].mcentral.freeSpan(s, preserve, wasempty) // MCentral_FreeSpan updates sweepgen } else if freeToHeap { From b3579c095e00f89d8c92c2aa4fb4af222a96f429 Mon Sep 17 00:00:00 2001 From: Austin Clements Date: Fri, 29 Apr 2016 10:57:06 -0400 Subject: [PATCH 22/23] [dev.garbage] runtime: revive sweep fast path sweep used to skip mcental.freeSpan (and its locking) if it didn't find any new free objects. We lost that optimization when the freed-object counting changed in dad83f7 to count total free objects instead of newly freed objects. The previous commit brings back counting of newly freed objects, so we can easily revive this optimization by checking that count (like we used to) instead of the total free objects count. Change-Id: I43658707a1c61674d0366124d5976b00d98741a9 Reviewed-on: https://go-review.googlesource.com/22596 Run-TryBot: Austin Clements Reviewed-by: Rick Hudson --- src/runtime/mgcsweep.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go index 82537edaaa..b8e33897c1 100644 --- a/src/runtime/mgcsweep.go +++ b/src/runtime/mgcsweep.go @@ -296,7 +296,7 @@ func (s *mspan) sweep(preserve bool) bool { // But we need to set it before we make the span available for allocation // (return it to heap or mcentral), because allocation code assumes that a // span is already swept if available for allocation. - if freeToHeap || nfree == 0 { + if freeToHeap || nfreed == 0 { // The span must be in our exclusive ownership until we update sweepgen, // check for potential races. 
if s.state != mSpanInUse || s.sweepgen != sweepgen-1 { @@ -309,7 +309,7 @@ func (s *mspan) sweep(preserve bool) bool { atomic.Store(&s.sweepgen, sweepgen) } - if nfree > 0 && cl != 0 { + if nfreed > 0 && cl != 0 { c.local_nsmallfree[cl] += uintptr(nfreed) res = mheap_.central[cl].mcentral.freeSpan(s, preserve, wasempty) // MCentral_FreeSpan updates sweepgen From e9eaa181fcadc2162baa62ccd8bfeb610acfdd55 Mon Sep 17 00:00:00 2001 From: Rick Hudson Date: Fri, 29 Apr 2016 12:09:36 -0400 Subject: [PATCH 23/23] [dev.garbage] runtime: simplify nextFreeFast so it is inlined nextFreeFast is currently not inlined by the compiler due to its size and complexity. This CL simplifies nextFreeFast by letting the slow path handle (nextFree) handle a corner cases. Change-Id: Ia9c5d1a7912bcb4bec072f5fd240f0e0bafb20e4 Reviewed-on: https://go-review.googlesource.com/22598 Reviewed-by: Austin Clements Run-TryBot: Austin Clements --- src/runtime/malloc.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 438cd06161..c9cc82192d 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -495,14 +495,11 @@ func nextFreeFast(s *mspan) gclinkptr { if theBit < 64 { result := s.freeindex + uintptr(theBit) if result < s.nelems { - s.allocCache >>= (theBit + 1) freeidx := result + 1 if freeidx%64 == 0 && freeidx != s.nelems { - // We just incremented s.freeindex so it isn't 0 - // so we are moving to the next aCache. - whichByte := freeidx / 8 - s.refillAllocCache(whichByte) + return 0 } + s.allocCache >>= (theBit + 1) s.freeindex = freeidx v := gclinkptr(result*s.elemsize + s.base()) s.allocCount++