diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 574ce3dafc..2da13f2073 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -496,6 +496,33 @@ const (
 	_FlagNoZero = 1 << 1 // don't zero memory
 )

+// nextFreeFast returns the next free object if one is quickly available.
+// Otherwise it returns 0.
+func (c *mcache) nextFreeFast(sizeclass int8) gclinkptr {
+	s := c.alloc[sizeclass]
+	ctzIndex := uint8(s.allocCache & 0xff)
+	if ctzIndex != 0 {
+		theBit := uint64(ctzVals[ctzIndex])
+		freeidx := s.freeindex // help the pre-SSA compiler out here with CSE.
+		result := freeidx + uintptr(theBit)
+		if result < s.nelems {
+			s.allocCache >>= (theBit + 1)
+			freeidx = result + 1
+			if freeidx%64 == 0 && freeidx != s.nelems {
+				// We just incremented s.freeindex so it isn't 0,
+				// so we are moving to the next allocCache word.
+				whichByte := freeidx / 8
+				s.refillAllocCache(whichByte)
+			}
+			s.freeindex = freeidx
+			v := gclinkptr(result*s.elemsize + s.base())
+			s.allocCount++
+			return v
+		}
+	}
+	return 0
+}
+
 // nextFree returns the next free object from the cached span if one is available.
 // Otherwise it refills the cache with a span with an available object and
 // returns that object along with a flag indicating that this was a heavy
@@ -508,9 +535,9 @@ func (c *mcache) nextFree(sizeclass int8) (v gclinkptr, shouldhelpgc bool) {
 	freeIndex := s.nextFreeIndex()
 	if freeIndex == s.nelems {
 		// The span is full.
-		if uintptr(s.allocCount) > s.nelems {
+		if uintptr(s.allocCount) != s.nelems {
 			println("runtime: s.allocCount=", s.allocCount, "s.nelems=", s.nelems)
-			throw("s.allocCount > s.nelems && freeIndex == s.nelems")
+			throw("s.allocCount != s.nelems && freeIndex == s.nelems")
 		}
 		systemstack(func() {
 			c.refill(int32(sizeclass))
@@ -644,7 +671,10 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
 			}
 			// Allocate a new maxTinySize block.
 			var v gclinkptr
-			v, shouldhelpgc = c.nextFree(tinySizeClass)
+			v = c.nextFreeFast(tinySizeClass)
+			if v == 0 {
+				v, shouldhelpgc = c.nextFree(tinySizeClass)
+			}
 			x = unsafe.Pointer(v)
 			(*[2]uint64)(x)[0] = 0
 			(*[2]uint64)(x)[1] = 0
@@ -664,7 +694,10 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
 			}
 			size = uintptr(class_to_size[sizeclass])
 			var v gclinkptr
-			v, shouldhelpgc = c.nextFree(sizeclass)
+			v = c.nextFreeFast(sizeclass)
+			if v == 0 {
+				v, shouldhelpgc = c.nextFree(sizeclass)
+			}
 			x = unsafe.Pointer(v)
 			if flags&flagNoZero == 0 {
 				memclr(unsafe.Pointer(v), size)
@@ -725,9 +758,27 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
 		})
 	}

+	// The object x is about to be reused but tracefree and msanfree
+	// need to be informed.
+	// TODO:(rlh) It is quite possible that this object is being allocated
+	// out of a fresh span and that there is no preceding call to
+	// tracealloc with this object. If this is an issue then initialization
+	// of the fresh span needs to leave some crumbs around that can be used to
+	// avoid these calls. Furthermore these crumbs are likely the same as
+	// those needed to determine if the object needs to be zeroed.
+	// In the case of msanfree it does not make sense to call msanfree
+	// followed by msanmalloc. msanfree indicates that the bytes are not
+	// initialized but msanmalloc is about to indicate that they are.
+	// It makes no difference whether msanmalloc has been called on these
+	// bytes or not.
+	if debug.allocfreetrace != 0 {
+		tracefree(unsafe.Pointer(x), size)
+	}
+
 	if raceenabled {
 		racemalloc(x, size)
 	}
+
 	if msanenabled {
 		msanmalloc(x, size)
 	}
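Reviewer note: a minimal standalone sketch of the fast path above, under the allocCache invariant the patch relies on (the cache is shifted so bit 0 corresponds to freeindex, and a 1 bit means the object is free). math/bits stands in for the ctzVals table introduced in mbitmap.go below, and the names here are illustrative, not runtime code.

package main

import (
	"fmt"
	"math/bits"
)

// nextFreeFastIdea mirrors the shape of nextFreeFast: look only at the
// low byte of the cache; if it has a set bit, the next free object is
// freeindex plus that bit's position. Anything else falls back to the
// slow path (nextFree in the patch).
func nextFreeFastIdea(allocCache uint64, freeindex, nelems uintptr) (uintptr, bool) {
	if allocCache&0xff == 0 {
		return 0, false // no free object within the low byte; take the slow path
	}
	theBit := uintptr(bits.TrailingZeros64(allocCache)) // at most 7 here
	result := freeindex + theBit
	if result >= nelems {
		return 0, false // the next free slot lies past the end of the span
	}
	// The real code also shifts allocCache, advances freeindex, and
	// bumps allocCount; omitted here to keep the sketch focused.
	return result, true
}

func main() {
	cache := uint64(0xb4) // 0b10110100: offsets 2, 4, 5, 7 are free
	fmt.Println(nextFreeFastIdea(cache, 0, 512)) // 2 true
}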
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index 910c4fa844..ea398904e3 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -186,17 +186,59 @@ func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits {
 	return markBits{&s.allocBits[whichByte], uint8(1 << whichBit), allocBitIndex}
 }

+// ctzVals[i] holds the count of trailing zero bits in the byte i.
+// Index 0 maps to 8, indicating a byte with no bits set.
+var ctzVals = [256]int8{
+	8, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	4, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	5, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	4, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	6, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	4, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	5, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	4, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	7, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	4, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	5, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	4, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	6, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	4, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	5, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	4, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0}
+
 // A temporary stand in for the count trailing zero ctz instruction.
 // IA bsf works on 64 bit non-zero word.
 func ctz64(markBits uint64) uint64 {
-	if markBits == 0 {
+	ctz8 := ctzVals[markBits&0xff]
+	if ctz8 != 8 {
+		return uint64(ctz8)
+	} else if markBits == 0 { // low byte is zero, check full word.
 		return 64 // bits in 64 bit word, ensures loop terminates
 	}
-	// tz holds trailing zero count.
-	tz := uint64(0)
-	for mask := uint64(1); mask&markBits == 0; mask, tz = mask<<1, tz+1 {
+	result := uint64(8)
+	markBits >>= 8
+	for ctz8 = ctzVals[markBits&0xff]; ctz8 == 8; ctz8 = ctzVals[markBits&0xff] {
+		result += 8
+		markBits >>= 8
 	}
-	return tz
+	result += uint64(ctz8)
+	return result
 }

 // refillAllocCache takes 8 bytes s.allocBits starting at whichByte
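Reviewer note: the ctzVals table above can be generated and spot-checked mechanically. A throwaway sketch (not part of the patch), with index 0 mapped to the sentinel 8:

package main

import "fmt"

func main() {
	var ctzVals [256]int8
	for i := 1; i < 256; i++ {
		tz := int8(0)
		for b := i; b&1 == 0; b >>= 1 {
			tz++ // count low zero bits of the byte value i
		}
		ctzVals[i] = tz
	}
	ctzVals[0] = 8 // sentinel: a zero byte has no set bits
	fmt.Println(ctzVals[0], ctzVals[1], ctzVals[8], ctzVals[0x80]) // 8 0 3 7
}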
@@ -222,10 +264,12 @@ func (s *mspan) refillAllocCache(whichByte uintptr) {
 // There are hardware instructions that can be used to make this
 // faster if profiling warrants it.
 func (s *mspan) nextFreeIndex() uintptr {
-	if s.freeindex == s.nelems {
-		return s.freeindex
+	sfreeindex := s.freeindex
+	snelems := s.nelems
+	if sfreeindex == snelems {
+		return sfreeindex
 	}
-	if s.freeindex > s.nelems {
+	if sfreeindex > snelems {
 		throw("s.freeindex > s.nelems")
 	}

@@ -233,37 +277,37 @@ func (s *mspan) nextFreeIndex() uintptr {
 	aCache := s.allocCache
 	bitIndex := ctz64(aCache)
 	for bitIndex == 64 {
 		// Move index to start of next cached bits.
-		s.freeindex = (s.freeindex + 64) &^ (64 - 1)
-		if s.freeindex >= s.nelems {
-			s.freeindex = s.nelems
-			return s.freeindex
+		sfreeindex = (sfreeindex + 64) &^ (64 - 1)
+		if sfreeindex >= snelems {
+			s.freeindex = snelems
+			return snelems
 		}
-		whichByte := s.freeindex / 8
+		whichByte := sfreeindex / 8
 		// Refill s.allocCache with the next 64 alloc bits.
-		// Unlike in allocBits a 1 in s.allocCache means
-		// the object is not marked.
 		s.refillAllocCache(whichByte)
 		aCache = s.allocCache
 		bitIndex = ctz64(aCache)
 		// Nothing was available try again now allocCache has been refilled.
 	}
-	result := s.freeindex + uintptr(bitIndex)
-	if result >= s.nelems {
-		s.freeindex = s.nelems
-		return s.freeindex
+	result := sfreeindex + uintptr(bitIndex)
+	if result >= snelems {
+		s.freeindex = snelems
+		return snelems
 	}
-	s.allocCache >>= bitIndex + 1
-	s.freeindex = result + 1
-	if s.freeindex%64 == 0 && s.freeindex != s.nelems {
+	s.allocCache >>= (bitIndex + 1)
+	sfreeindex = result + 1
+
+	if sfreeindex%64 == 0 && sfreeindex != snelems {
 		// We just incremented s.freeindex so it isn't 0.
 		// As each 1 in s.allocCache was encountered and used for allocation
 		// it was shifted away. At this point s.allocCache contains all 0s.
 		// Refill s.allocCache so that it corresponds
 		// to the bits at s.allocBits starting at s.freeindex.
-		whichByte := s.freeindex / 8
+		whichByte := sfreeindex / 8
 		s.refillAllocCache(whichByte)
 	}
+	s.freeindex = sfreeindex
 	return result
 }

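Reviewer note: a self-contained toy model of the scan/shift/refill discipline nextFreeIndex uses above. It assumes the same invariants (the cached word is the free bitmap at freeindex, shifted so bit 0 is freeindex, refilled at 64-object boundaries); math/bits stands in for ctz64, and the types and names are illustrative, not the runtime's.

package main

import (
	"fmt"
	"math/bits"
)

type toySpan struct {
	freeindex  uintptr
	nelems     uintptr
	allocCache uint64
	words      []uint64 // one word of "free" bits per 64 objects
}

func (s *toySpan) refill() {
	s.allocCache = s.words[s.freeindex/64] >> (s.freeindex % 64)
}

func (s *toySpan) nextFreeIndex() uintptr {
	for {
		if s.freeindex >= s.nelems {
			return s.nelems
		}
		if tz := uintptr(bits.TrailingZeros64(s.allocCache)); tz != 64 {
			result := s.freeindex + tz
			if result >= s.nelems {
				return s.nelems
			}
			s.allocCache >>= tz + 1 // consume the bit just found
			s.freeindex = result + 1
			if s.freeindex%64 == 0 && s.freeindex != s.nelems {
				s.refill() // cache exhausted exactly at a word boundary
			}
			return result
		}
		// No free bits left in the cached word: jump to the next
		// 64-object boundary and refill, as in the loop above.
		s.freeindex = (s.freeindex + 64) &^ 63
		if s.freeindex < s.nelems {
			s.refill()
		}
	}
}

func main() {
	s := &toySpan{nelems: 128, words: []uint64{0, 1 << 3}} // only object 67 free
	s.refill()
	fmt.Println(s.nextFreeIndex(), s.nextFreeIndex()) // 67 128
}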
@@ -760,120 +804,60 @@ func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
 	}
 }

-// heapBitsSweepSpan coordinates the sweeping of a span and inspects
-// each freed object. If objects are being traced or if msan is enabled
-// then heapBitsSweepSpan calls f(p), where p is the object's base address.
-// When not tracing and msan is not enabled heapBitsSweepSpan is lightweight.
-// heapBitsSweepSpan never alters the pointer/scalar heapBit maps. HeapBit map
-// maintenance is the responsibility of the allocation routines.
-// TODO:(rlh) Deal with the checkmark bits but moving them
-// out of heap bitmap thus enabling bulk clearing.
-func heapBitsSweepSpan(s *mspan, f func(uintptr)) (nfree int) {
-	base := s.base()
-	size := s.elemsize
-	n := s.nelems
-	cl := s.sizeclass
-	doCall := debug.allocfreetrace != 0 || msanenabled || cl == 0
-	h := heapBitsForSpan(base)
-	switch {
-	default:
-		throw("heapBitsSweepSpan")
-	case sys.PtrSize == 8 && size == sys.PtrSize:
-		nfree = heapBitsSweep8BitPtrs(h, s, base, n, cl, doCall, f)
-	case size%(4*sys.PtrSize) == 0:
-		nfree = heapBitsSweepMap(h, s, base, size, n, cl, doCall, f)
-	case size%(4*sys.PtrSize) == 2*sys.PtrSize:
-		nfree = heapBitsSweepMap(h, s, base, size, n, cl, doCall, f)
-	}
-	return
-}
+// oneBitCount is indexed by byte and produces the
+// number of 1 bits in that byte. For example 128 has 1 bit set
+// and oneBitCount[128] holds 1.
+var oneBitCount = [256]uint8{
+	0, 1, 1, 2, 1, 2, 2, 3,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	5, 6, 6, 7, 6, 7, 7, 8}

-func heapBitsSweep8BitPtrs(h heapBits, s *mspan, base, n uintptr, cl uint8, doCall bool, f func(uintptr)) (nfree int) {
-	mbits := s.markBitsForBase()
-	// Consider mark bits in all four 2-bit entries of each bitmap byte.
-	if cl == 0 {
-		throw("8BitPtrs are not in cl 0")
+// countFree runs through the mark bits in a span and counts the number of free objects
+// in the span.
+// TODO:(rlh) Use popcount intrinsic.
+func (s *mspan) countFree() int {
+	count := 0
+	maxIndex := s.nelems / 8
+	for i := uintptr(0); i < maxIndex; i++ {
+		count += int(oneBitCount[s.gcmarkBits[i]])
 	}
-	// Consider mark bits in all four 2-bit entries of each bitmap byte.
-	for i := uintptr(0); i < n; i++ {
-		// Note that unlike the other size cases, we leave the pointer bits set here.
-		// These are initialized during initSpan when the span is created and left
-		// in place the whole time the span is used for pointer-sized objects.
-		// That lets heapBitsSetType avoid an atomic update to set the pointer bit
-		// during allocation.
-		if !mbits.isMarked() {
-			nfree++
-			if mbits.index < s.freeindex {
-				f(base + i*sys.PtrSize)
-			} else if s.allocBits[mbits.index/8]&mbits.mask == 1 {
-				// it was marked in the previous cycle but not this cycle
-				// if it wasn't marked in the prvious cycle the call would be redundant.
-				f(base + i*sys.PtrSize)
-			}
-		}
-		mbits.advance()
-	}
-	return nfree
-}
-// nextFreed returns the next object that is being freed during this GC cycle.
-// If the mark bit is set then the object is free. If it is < s.freeindex
-// then either the object was freed during by this GC cycle.
-// If it is >= freeindex then if the allocBit is set then it was
-// freed during this GC cycle. If the allocBit is 0 it was freed
-// during a previous cycle so is not considered a freed.
-func (m *markBits) nextFreed(nelems uintptr, s *mspan, totalFree *int) bool {
-	mByte := *m.bytep
-	for {
-		for mByte == 0xff {
-			if m.index >= nelems {
-				return false
-			}
-			m.index = (m.index + 8) &^ (8 - 1)
-			m.mask = 1
-			m.bytep = add1(m.bytep)
-			mByte = *m.bytep
-			// Nothing free found totalFree remains the same.
-		}
-		if m.index >= nelems {
-			return false
-		}
-		for m.index < nelems {
-			if m.mask&mByte == 0 {
-				// At this point we have a free object so update totalFree
-				*totalFree++
-				if m.index < s.freeindex {
-					return true
-				}
-				if s.allocBits[m.index/8]&m.mask != 0 {
-					return true
-				}
-			}
-			if m.mask == 1<<7 {
-				m.mask = 1
-				m.bytep = add1(m.bytep)
-				mByte = *m.bytep
-				m.index++
-				break
-			} else {
-				m.mask = m.mask << 1
-				m.index++
-			}
-		}
+	if bitsInLastByte := s.nelems % 8; bitsInLastByte != 0 {
+		markBits := uint8(s.gcmarkBits[maxIndex])
+		mask := uint8((1 << bitsInLastByte) - 1)
+		bits := markBits & mask
+		count += int(oneBitCount[bits])
 	}
-	return false
-}
-
-func heapBitsSweepMap(h heapBits, s *mspan, base, size, n uintptr, cl uint8, doCall bool, f func(uintptr)) int {
-	totalFree := 0
-	twobits := s.markBitsForBase()
-	for twobits.nextFreed(n, s, &totalFree) {
-		if doCall {
-			f(base + twobits.index*size)
-		}
-		twobits.advance()
-	}
-	return totalFree
+	return int(s.nelems) - count
 }

 // heapBitsSetType records that the new allocation [x, x+size)
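Reviewer note: countFree above, restated as a runnable sketch with math/bits.OnesCount8 playing the role of the oneBitCount table (effectively the TODO's popcount). The masking of the partial final byte is the subtle part: bits at or beyond nelems must not be counted.

package main

import (
	"fmt"
	"math/bits"
)

func countFree(gcmarkBits []uint8, nelems uintptr) int {
	count := 0
	maxIndex := nelems / 8
	for i := uintptr(0); i < maxIndex; i++ {
		count += bits.OnesCount8(gcmarkBits[i])
	}
	if last := nelems % 8; last != 0 {
		mask := uint8((1 << last) - 1) // keep only the in-range bits
		count += bits.OnesCount8(gcmarkBits[maxIndex] & mask)
	}
	return int(nelems) - count
}

func main() {
	// 10 objects; marks set for objects 0, 3, and 9.
	marks := []uint8{0b0000_1001, 0b0000_0010}
	fmt.Println(countFree(marks, 10)) // 7 free
}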
diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go
index c217ee8d86..1a6be6634d 100644
--- a/src/runtime/mgcsweep.go
+++ b/src/runtime/mgcsweep.go
@@ -8,7 +8,6 @@ package runtime

 import (
 	"runtime/internal/atomic"
-	"runtime/internal/sys"
 	"unsafe"
 )

@@ -252,40 +251,13 @@ func (s *mspan) sweep(preserve bool) bool {
 		}
 	}

-	// Sweep through n objects of given size starting at p.
-	// This thread owns the span now, so it can manipulate
-	// the block bitmap without atomic operations.
+	// Count the number of free objects in this span.
+	nfree = s.countFree()
+	if cl == 0 && nfree != 0 {
+		s.needzero = 1
+		freeToHeap = true
+	}

-	nfree = heapBitsSweepSpan(s, func(p uintptr) {
-		// At this point we know that we are looking at a garbage object
-		// that needs to be collected.
-		if debug.allocfreetrace != 0 {
-			tracefree(unsafe.Pointer(p), size)
-		}
-		if msanenabled {
-			msanfree(unsafe.Pointer(p), size)
-		}
-
-		// Reset to allocated+noscan.
-		if cl == 0 {
-			// Free large span.
-			if preserve {
-				throw("can't preserve large span")
-			}
-			s.needzero = 1
-
-			// Free the span after heapBitsSweepSpan
-			// returns, since it's not done with the span.
-			freeToHeap = true
-		} else {
-			// Free small object.
-			if size > 2*sys.PtrSize {
-				*(*uintptr)(unsafe.Pointer(p + sys.PtrSize)) = uintptrMask & 0xdeaddeaddeaddead // mark as "needs to be zeroed"
-			} else if size > sys.PtrSize {
-				*(*uintptr)(unsafe.Pointer(p + sys.PtrSize)) = 0
-			}
-		}
-	})
 	s.allocCount = uint16(s.nelems) - uint16(nfree)
 	wasempty := s.nextFreeIndex() == s.nelems
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 4be503315b..b0b3bbd957 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -137,6 +137,9 @@ type mspan struct {
 	//
 	// Object n starts at address n*elemsize + (start << pageShift).
 	freeindex uintptr
+	// TODO: Look up nelems from sizeclass and remove this field if it
+	// helps performance.
+	nelems uintptr // number of objects in the span.

 	// Cache of the allocBits at freeindex. allocCache is shifted
 	// such that the lowest bit corresponds to the bit freeindex.
@@ -147,9 +150,6 @@ type mspan struct {
 	allocCache uint64
 	allocBits  *[maxObjsPerSpan / 8]uint8
 	gcmarkBits *[maxObjsPerSpan / 8]uint8
-	nelems     uintptr // number of object in the span.
-	// TODO(rlh) consider moving some of these fields into seperate arrays.
-	// Put another way is an array of structs a better idea than a struct of arrays.

 	// allocBits and gcmarkBits currently point to either markbits1
 	// or markbits2. At the end of a GC cycle allocBits and
@@ -753,6 +753,12 @@ func (h *mheap) freeSpan(s *mspan, acct int32) {
 		mp.mcache.local_scan = 0
 		memstats.tinyallocs += uint64(mp.mcache.local_tinyallocs)
 		mp.mcache.local_tinyallocs = 0
+		if msanenabled {
+			// Tell msan that this entire span is no longer in use.
+			base := unsafe.Pointer(s.base())
+			bytes := s.npages << _PageShift
+			msanfree(base, bytes)
+		}
 		if acct != 0 {
 			memstats.heap_objects--
 		}
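Reviewer note: the mspan context above mentions that allocBits and gcmarkBits point at markbits1 or markbits2 and swap at the end of a GC cycle. A conceptual sketch of that double buffering (illustrative names and sizes, not the runtime's actual code): once marking is done, the mark bitmap is exactly the new allocation bitmap, so a pointer swap replaces any per-object copying.

package main

import "fmt"

type toySpan struct {
	markbits1, markbits2 [8]uint8
	allocBits            *[8]uint8 // 1 bit per object: allocated
	gcmarkBits           *[8]uint8 // 1 bit per object: marked this cycle
}

func (s *toySpan) finishSweep() {
	// Survivors of marking are precisely the allocated objects.
	s.allocBits, s.gcmarkBits = s.gcmarkBits, s.allocBits
	for i := range s.gcmarkBits {
		s.gcmarkBits[i] = 0 // fresh mark bits for the next cycle
	}
}

func main() {
	s := &toySpan{}
	s.allocBits, s.gcmarkBits = &s.markbits1, &s.markbits2
	s.gcmarkBits[0] = 0b101 // objects 0 and 2 survived marking
	s.finishSweep()
	fmt.Printf("%03b\n", s.allocBits[0]) // 101: now the allocation bitmap
}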