diff --git a/dev.garbage b/dev.garbage new file mode 100644 index 0000000000..b8c3a3fcb7 --- /dev/null +++ b/dev.garbage @@ -0,0 +1 @@ +Reviving dev.garbage branch for use in new garbage collection experiment. diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go index c6000bf98f..be234345d1 100644 --- a/src/runtime/cgocall.go +++ b/src/runtime/cgocall.go @@ -529,7 +529,7 @@ func cgoCheckUnknownPointer(p unsafe.Pointer, msg string) (base, i uintptr) { return } - b, hbits, span := heapBitsForObject(uintptr(p), 0, 0) + b, hbits, span, _ := heapBitsForObject(uintptr(p), 0, 0) base = b if base == 0 { return diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go index 0afab09095..4afe663418 100644 --- a/src/runtime/heapdump.go +++ b/src/runtime/heapdump.go @@ -447,7 +447,7 @@ func dumproots() { continue } spf := (*specialfinalizer)(unsafe.Pointer(sp)) - p := unsafe.Pointer((uintptr(s.start) << _PageShift) + uintptr(spf.special.offset)) + p := unsafe.Pointer(s.base() + uintptr(spf.special.offset)) dumpfinalizer(p, spf.fn, spf.fint, spf.ot) } } @@ -467,15 +467,19 @@ func dumpobjs() { if s.state != _MSpanInUse { continue } - p := uintptr(s.start << _PageShift) + p := s.base() size := s.elemsize n := (s.npages << _PageShift) / size if n > uintptr(len(freemark)) { throw("freemark array doesn't have enough entries") } - for l := s.freelist; l.ptr() != nil; l = l.ptr().next { - freemark[(uintptr(l)-p)/size] = true + + for freeIndex := s.freeindex; freeIndex < s.nelems; freeIndex++ { + if s.isFree(freeIndex) { + freemark[freeIndex] = true + } } + for j := uintptr(0); j < n; j, p = j+1, p+size { if freemark[j] { freemark[j] = false @@ -615,7 +619,7 @@ func dumpmemprof() { continue } spp := (*specialprofile)(unsafe.Pointer(sp)) - p := uintptr(s.start<<_PageShift) + uintptr(spp.special.offset) + p := s.base() + uintptr(spp.special.offset) dumpint(tagAllocSample) dumpint(uint64(p)) dumpint(uint64(uintptr(unsafe.Pointer(spp.b)))) @@ -710,7 +714,7 @@ func makeheapobjbv(p uintptr, size uintptr) bitvector { i := uintptr(0) hbits := heapBitsForAddr(p) for ; i < nptr; i++ { - if i >= 2 && !hbits.isMarked() { + if i >= 2 && !hbits.morePointers() { break // end of object } if hbits.isPointer() { diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index 081d1419cb..c9cc82192d 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -94,6 +94,9 @@ const ( pageShift = _PageShift pageSize = _PageSize pageMask = _PageMask + // By construction, single page spans of the smallest object class + // have the most objects per span. + maxObjsPerSpan = pageSize / 8 mSpanInUse = _MSpanInUse @@ -167,9 +170,6 @@ const ( _MaxGcproc = 32 ) -// Page number (address>>pageShift) -type pageID uintptr - const _MaxArena32 = 2 << 30 // OS-defined helpers: @@ -384,6 +384,10 @@ func sysReserveHigh(n uintptr, reserved *bool) unsafe.Pointer { return sysReserve(nil, n, reserved) } +// sysAlloc allocates the next n bytes from the heap arena. The +// returned pointer is always _PageSize aligned and between +// h.arena_start and h.arena_end. sysAlloc returns nil on failure. +// There is no corresponding free function. func (h *mheap) sysAlloc(n uintptr) unsafe.Pointer { if n > h.arena_end-h.arena_used { // We are in 32-bit mode, maybe we didn't use all possible address space yet. @@ -484,6 +488,65 @@ func (h *mheap) sysAlloc(n uintptr) unsafe.Pointer { // base address for all 0-byte allocations var zerobase uintptr +// nextFreeFast returns the next free object if one is quickly available. +// Otherwise it returns 0. 
+func nextFreeFast(s *mspan) gclinkptr { + theBit := sys.Ctz64(s.allocCache) // Is there a free object in the allocCache? + if theBit < 64 { + result := s.freeindex + uintptr(theBit) + if result < s.nelems { + freeidx := result + 1 + if freeidx%64 == 0 && freeidx != s.nelems { + return 0 + } + s.allocCache >>= (theBit + 1) + s.freeindex = freeidx + v := gclinkptr(result*s.elemsize + s.base()) + s.allocCount++ + return v + } + } + return 0 +} + +// nextFree returns the next free object from the cached span if one is available. +// Otherwise it refills the cache with a span with an available object and +// returns that object along with a flag indicating that this was a heavy +// weight allocation. If it is a heavy weight allocation the caller must +// determine whether a new GC cycle needs to be started or if the GC is active +// whether this goroutine needs to assist the GC. +func (c *mcache) nextFree(sizeclass int8) (v gclinkptr, s *mspan, shouldhelpgc bool) { + s = c.alloc[sizeclass] + shouldhelpgc = false + freeIndex := s.nextFreeIndex() + if freeIndex == s.nelems { + // The span is full. + if uintptr(s.allocCount) != s.nelems { + println("runtime: s.allocCount=", s.allocCount, "s.nelems=", s.nelems) + throw("s.allocCount != s.nelems && freeIndex == s.nelems") + } + systemstack(func() { + c.refill(int32(sizeclass)) + }) + shouldhelpgc = true + s = c.alloc[sizeclass] + + freeIndex = s.nextFreeIndex() + } + + if freeIndex >= s.nelems { + throw("freeIndex is not valid") + } + + v = gclinkptr(freeIndex*s.elemsize + s.base()) + s.allocCount++ + if uintptr(s.allocCount) > s.nelems { + println("s.allocCount=", s.allocCount, "s.nelems=", s.nelems) + throw("s.allocCount > s.nelems") + } + return +} + // Allocate an object of size bytes. // Small objects are allocated from the per-P cache's free lists. // Large objects (> 32 kB) are allocated straight from the heap. @@ -538,7 +601,6 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { shouldhelpgc := false dataSize := size c := gomcache() - var s *mspan var x unsafe.Pointer noscan := typ == nil || typ.kind&kindNoPointers != 0 if size <= maxSmallSize { @@ -591,20 +653,11 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { return x } // Allocate a new maxTinySize block. - s = c.alloc[tinySizeClass] - v := s.freelist - if v.ptr() == nil { - systemstack(func() { - c.refill(tinySizeClass) - }) - shouldhelpgc = true - s = c.alloc[tinySizeClass] - v = s.freelist + span := c.alloc[tinySizeClass] + v := nextFreeFast(span) + if v == 0 { + v, _, shouldhelpgc = c.nextFree(tinySizeClass) } - s.freelist = v.ptr().next - s.ref++ - // prefetchnta offers best performance, see change list message. - prefetchnta(uintptr(v.ptr().next)) x = unsafe.Pointer(v) (*[2]uint64)(x)[0] = 0 (*[2]uint64)(x)[1] = 0 @@ -623,26 +676,14 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { sizeclass = size_to_class128[(size-1024+127)>>7] } size = uintptr(class_to_size[sizeclass]) - s = c.alloc[sizeclass] - v := s.freelist - if v.ptr() == nil { - systemstack(func() { - c.refill(int32(sizeclass)) - }) - shouldhelpgc = true - s = c.alloc[sizeclass] - v = s.freelist + span := c.alloc[sizeclass] + v := nextFreeFast(span) + if v == 0 { + v, span, shouldhelpgc = c.nextFree(sizeclass) } - s.freelist = v.ptr().next - s.ref++ - // prefetchnta offers best performance, see change list message. 
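The allocCache fast path above depends on the cache being the bit-complement of allocBits, so a count-trailing-zeros instruction finds the next free slot directly. The following standalone sketch models that idea with math/bits.TrailingZeros64 standing in for sys.Ctz64 and a simplified span type; it omits the 64-slot refill boundary check that the real nextFreeFast performs.

package main

import (
	"fmt"
	"math/bits"
)

type span struct {
	freeindex  uintptr // first slot worth considering
	nelems     uintptr // number of object slots in the span
	allocCache uint64  // complement of allocBits, low bit corresponds to freeindex
}

// nextFreeFast returns the index of the next free slot if the cached bits can
// answer, or nelems to signal that the caller must take the slow path.
func (s *span) nextFreeFast() uintptr {
	theBit := bits.TrailingZeros64(s.allocCache) // 64 when the cache is exhausted
	if theBit == 64 {
		return s.nelems
	}
	result := s.freeindex + uintptr(theBit)
	if result >= s.nelems {
		return s.nelems
	}
	// Consume the found bit plus any skipped allocated slots, keeping the
	// cache aligned with freeindex.
	s.allocCache >>= uint(theBit + 1)
	s.freeindex = result + 1
	return result
}

func main() {
	s := &span{nelems: 64, allocCache: ^uint64(0b1011)} // slots 0, 1 and 3 allocated
	fmt.Println(s.nextFreeFast()) // 2
	fmt.Println(s.nextFreeFast()) // 4
}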
- prefetchnta(uintptr(v.ptr().next)) x = unsafe.Pointer(v) - if needzero { - v.ptr().next = 0 - if size > 2*sys.PtrSize && ((*[2]uintptr)(x))[1] != 0 { - memclr(unsafe.Pointer(v), size) - } + if needzero && span.needzero != 0 { + memclr(unsafe.Pointer(v), size) } } } else { @@ -651,13 +692,15 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { systemstack(func() { s = largeAlloc(size, needzero) }) - x = unsafe.Pointer(uintptr(s.start << pageShift)) + s.freeindex = 1 + s.allocCount = 1 + x = unsafe.Pointer(s.base()) size = s.elemsize } var scanSize uintptr if noscan { - // All objects are pre-marked as noscan. Nothing to do. + heapBitsSetTypeNoScan(uintptr(x), size) } else { // If allocating a defer+arg block, now that we've picked a malloc size // large enough to hold everything, cut the "asked for" size down to @@ -701,6 +744,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer { if raceenabled { racemalloc(x, size) } + if msanenabled { msanmalloc(x, size) } @@ -755,8 +799,8 @@ func largeAlloc(size uintptr, needzero bool) *mspan { if s == nil { throw("out of memory") } - s.limit = uintptr(s.start)<<_PageShift + size - heapBitsForSpan(s.base()).initSpan(s.layout()) + s.limit = s.base() + size + heapBitsForSpan(s.base()).initSpan(s) return s } diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go index 3df697ee5c..9df64cb168 100644 --- a/src/runtime/mbitmap.go +++ b/src/runtime/mbitmap.go @@ -24,7 +24,7 @@ // In each 2-bit entry, the lower bit holds the same information as in the 1-bit // bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC. // The meaning of the high bit depends on the position of the word being described -// in its allocated object. In the first word, the high bit is the GC ``marked'' bit. +// in its allocated object. In the first word, the high bit is unused. // In the second word, the high bit is the GC ``checkmarked'' bit (see below). // In the third and later words, the high bit indicates that the object is still // being described. In these words, if a bit pair with a high bit 0 is encountered, @@ -33,12 +33,13 @@ // in the object are uninteresting to the garbage collector. // // The 2-bit entries are split when written into the byte, so that the top half -// of the byte contains 4 mark bits and the bottom half contains 4 pointer bits. +// of the byte contains 4 high bits and the bottom half contains 4 low (pointer) +// bits. // This form allows a copy from the 1-bit to the 4-bit form to keep the // pointer bits contiguous, instead of having to space them out. // // The code makes use of the fact that the zero value for a heap bitmap -// has no live pointer bit set and is (depending on position), not marked, +// has no live pointer bit set and is (depending on position), not used, // not checkmarked, and is the dead encoding. // These properties must be preserved when modifying the encoding. // @@ -63,6 +64,7 @@ // It is still used in general, except in checkmark the type bit is repurposed // as the checkmark bit and then reinitialized (to 1) as the type bit when // finished. +// package runtime @@ -95,6 +97,8 @@ func addb(p *byte, n uintptr) *byte { } // subtractb returns the byte pointer p-n. +// subtractb is typically used when traversing the pointer tables referred to by hbits +// which are arranged in reverse order. //go:nowritebarrier //go:nosplit func subtractb(p *byte, n uintptr) *byte { @@ -115,6 +119,8 @@ func add1(p *byte) *byte { } // subtract1 returns the byte pointer p-1. 
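A small illustration of the byte layout the mbitmap.go comment above describes: each heap-bitmap byte covers four heap words, with the four pointer bits in the low half and the four high (scan/checkmark) bits in the top half. The constant values mirror bitPointer and bitMarked as used in this patch; the decoder itself is illustrative only.

package main

import "fmt"

const (
	bitPointer = 1 << 0 // low-half bit for word 0
	bitMarked  = 1 << 4 // high-half bit for word 0, four positions up
)

// decode reports the pointer bit and the high bit for word i (0..3) of the
// four-word group described by bitmap byte b.
func decode(b uint8, i uint) (isPointer, high bool) {
	return b&(bitPointer<<i) != 0, b&(bitMarked<<i) != 0
}

func main() {
	// Word 0 is a pointer; word 2 is a pointer and its high bit records that
	// the object description continues past it.
	var b uint8 = bitPointer<<0 | bitPointer<<2 | bitMarked<<2
	for i := uint(0); i < 4; i++ {
		p, h := decode(b, i)
		fmt.Printf("word %d: pointer=%v high=%v\n", i, p, h)
	}
}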
+// subtract1 is typically used when traversing the pointer tables referred to by hbits +// which are arranged in reverse order. //go:nowritebarrier // // nosplit because it is used during write barriers and must not be preempted. @@ -161,6 +167,193 @@ type heapBits struct { shift uint32 } +// markBits provides access to the mark bit for an object in the heap. +// bytep points to the byte holding the mark bit. +// mask is a byte with a single bit set that can be &ed with *bytep +// to see if the bit has been set. +// *m.byte&m.mask != 0 indicates the mark bit is set. +// index can be used along with span information to generate +// the address of the object in the heap. +// We maintain one set of mark bits for allocation and one for +// marking purposes. +type markBits struct { + bytep *uint8 + mask uint8 + index uintptr +} + +//go:nosplit +func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits { + whichByte := allocBitIndex / 8 + whichBit := allocBitIndex % 8 + bytePtr := addb(s.allocBits, whichByte) + return markBits{bytePtr, uint8(1 << whichBit), allocBitIndex} +} + +// refillaCache takes 8 bytes s.allocBits starting at whichByte +// and negates them so that ctz (count trailing zeros) instructions +// can be used. It then places these 8 bytes into the cached 64 bit +// s.allocCache. +func (s *mspan) refillAllocCache(whichByte uintptr) { + bytes := (*[8]uint8)(unsafe.Pointer(addb(s.allocBits, whichByte))) + aCache := uint64(0) + aCache |= uint64(bytes[0]) + aCache |= uint64(bytes[1]) << (1 * 8) + aCache |= uint64(bytes[2]) << (2 * 8) + aCache |= uint64(bytes[3]) << (3 * 8) + aCache |= uint64(bytes[4]) << (4 * 8) + aCache |= uint64(bytes[5]) << (5 * 8) + aCache |= uint64(bytes[6]) << (6 * 8) + aCache |= uint64(bytes[7]) << (7 * 8) + s.allocCache = ^aCache +} + +// nextFreeIndex returns the index of the next free object in s at +// or after s.freeindex. +// There are hardware instructions that can be used to make this +// faster if profiling warrants it. +func (s *mspan) nextFreeIndex() uintptr { + sfreeindex := s.freeindex + snelems := s.nelems + if sfreeindex == snelems { + return sfreeindex + } + if sfreeindex > snelems { + throw("s.freeindex > s.nelems") + } + + aCache := s.allocCache + + bitIndex := sys.Ctz64(aCache) + for bitIndex == 64 { + // Move index to start of next cached bits. + sfreeindex = (sfreeindex + 64) &^ (64 - 1) + if sfreeindex >= snelems { + s.freeindex = snelems + return snelems + } + whichByte := sfreeindex / 8 + // Refill s.allocCache with the next 64 alloc bits. + s.refillAllocCache(whichByte) + aCache = s.allocCache + bitIndex = sys.Ctz64(aCache) + // nothing available in cached bits + // grab the next 8 bytes and try again. + } + result := sfreeindex + uintptr(bitIndex) + if result >= snelems { + s.freeindex = snelems + return snelems + } + + s.allocCache >>= (bitIndex + 1) + sfreeindex = result + 1 + + if sfreeindex%64 == 0 && sfreeindex != snelems { + // We just incremented s.freeindex so it isn't 0. + // As each 1 in s.allocCache was encountered and used for allocation + // it was shifted away. At this point s.allocCache contains all 0s. + // Refill s.allocCache so that it corresponds + // to the bits at s.allocBits starting at s.freeindex. 
+ whichByte := sfreeindex / 8 + s.refillAllocCache(whichByte) + } + s.freeindex = sfreeindex + return result +} + +func (s *mspan) isFree(index uintptr) bool { + whichByte := index / 8 + whichBit := index % 8 + byteVal := *addb(s.allocBits, whichByte) + return byteVal&uint8(1<> s.divShift + } + return uintptr(((uint64(byteOffset) >> s.divShift) * uint64(s.divMul)) >> s.divShift2) +} + +func markBitsForAddr(p uintptr) markBits { + s := spanOf(p) + objIndex := s.objIndex(p) + return s.markBitsForIndex(objIndex) +} + +func (s *mspan) markBitsForIndex(objIndex uintptr) markBits { + whichByte := objIndex / 8 + bitMask := uint8(1 << (objIndex % 8)) // low 3 bits hold the bit index + bytePtr := addb(s.gcmarkBits, whichByte) + return markBits{bytePtr, bitMask, objIndex} +} + +func (s *mspan) markBitsForBase() markBits { + return markBits{s.gcmarkBits, uint8(1), 0} +} + +// isMarked reports whether mark bit m is set. +func (m markBits) isMarked() bool { + return *m.bytep&m.mask != 0 +} + +// setMarked sets the marked bit in the markbits, atomically. Some compilers +// are not able to inline atomic.Or8 function so if it appears as a hot spot consider +// inlining it manually. +func (m markBits) setMarked() { + // Might be racing with other updates, so use atomic update always. + // We used to be clever here and use a non-atomic update in certain + // cases, but it's not worth the risk. + atomic.Or8(m.bytep, m.mask) +} + +// setMarkedNonAtomic sets the marked bit in the markbits, non-atomically. +func (m markBits) setMarkedNonAtomic() { + *m.bytep |= m.mask +} + +// clearMarked clears the marked bit in the markbits, atomically. +func (m markBits) clearMarked() { + // Might be racing with other updates, so use atomic update always. + // We used to be clever here and use a non-atomic update in certain + // cases, but it's not worth the risk. + atomic.And8(m.bytep, ^m.mask) +} + +// clearMarkedNonAtomic clears the marked bit non-atomically. +func (m markBits) clearMarkedNonAtomic() { + *m.bytep ^= m.mask +} + +// markBitsForSpan returns the markBits for the span base address base. +func markBitsForSpan(base uintptr) (mbits markBits) { + if base < mheap_.arena_start || base >= mheap_.arena_used { + throw("heapBitsForSpan: base out of range") + } + mbits = markBitsForAddr(base) + if mbits.mask != 1 { + throw("markBitsForSpan: unaligned start") + } + return mbits +} + +// advance advances the markBits to the next object in the span. +func (m *markBits) advance() { + if m.mask == 1<<7 { + m.bytep = (*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(m.bytep)) + 1)) + m.mask = 1 + } else { + m.mask = m.mask << 1 + } + m.index++ +} + // heapBitsForAddr returns the heapBits for the address addr. // The caller must have already checked that addr is in the range [mheap_.arena_start, mheap_.arena_used). // @@ -177,15 +370,12 @@ func heapBitsForSpan(base uintptr) (hbits heapBits) { if base < mheap_.arena_start || base >= mheap_.arena_used { throw("heapBitsForSpan: base out of range") } - hbits = heapBitsForAddr(base) - if hbits.shift != 0 { - throw("heapBitsForSpan: unaligned start") - } - return hbits + return heapBitsForAddr(base) } // heapBitsForObject returns the base address for the heap object -// containing the address p, along with the heapBits for base. +// containing the address p, the heapBits for base, +// the object's span, and of the index of the object in s. // If p does not point into a heap object, // return base == 0 // otherwise return the base of the object. 
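A minimal model of the per-object allocation bitmap used by isFree and allocBitsForIndex above: object index n maps to byte n/8 and bit n%8. The freeindex guard reflects the invariant documented in the mspan comments later in the patch (slots below freeindex are known to be allocated); the patch's isFree itself only tests the bit. A Go slice stands in for the raw, 8-byte-aligned bitmap memory.

package main

import "fmt"

type span struct {
	allocBits []uint8
	freeindex uintptr
	nelems    uintptr
}

// isFree reports whether object index is unallocated.
func (s *span) isFree(index uintptr) bool {
	if index < s.freeindex {
		return false // everything below freeindex has been handed out
	}
	return s.allocBits[index/8]&(1<<(index%8)) == 0
}

func main() {
	s := &span{allocBits: make([]uint8, 2), freeindex: 3, nelems: 16}
	s.allocBits[0] = 0b0001_1111 // objects 0..4 allocated
	for _, i := range []uintptr{2, 4, 5, 9} {
		fmt.Printf("object %d free: %v\n", i, s.isFree(i))
	}
}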
@@ -193,7 +383,7 @@ func heapBitsForSpan(base uintptr) (hbits heapBits) { // refBase and refOff optionally give the base address of the object // in which the pointer p was found and the byte offset at which it // was found. These are used for error reporting. -func heapBitsForObject(p, refBase, refOff uintptr) (base uintptr, hbits heapBits, s *mspan) { +func heapBitsForObject(p, refBase, refOff uintptr) (base uintptr, hbits heapBits, s *mspan, objIndex uintptr) { arenaStart := mheap_.arena_start if p < arenaStart || p >= mheap_.arena_used { return @@ -202,9 +392,8 @@ func heapBitsForObject(p, refBase, refOff uintptr) (base uintptr, hbits heapBits idx := off >> _PageShift // p points into the heap, but possibly to the middle of an object. // Consult the span table to find the block beginning. - k := p >> _PageShift s = h_spans[idx] - if s == nil || pageID(k) < s.start || p >= s.limit || s.state != mSpanInUse { + if s == nil || p < s.base() || p >= s.limit || s.state != mSpanInUse { if s == nil || s.state == _MSpanStack { // If s is nil, the virtual address has never been part of the heap. // This pointer may be to some mmap'd region, so we allow it. @@ -230,7 +419,7 @@ func heapBitsForObject(p, refBase, refOff uintptr) (base uintptr, hbits heapBits } else { print(" to unused region of span") } - print("idx=", hex(idx), " span.start=", hex(s.start<<_PageShift), " span.limit=", hex(s.limit), " span.state=", s.state, "\n") + print("idx=", hex(idx), " span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", s.state, "\n") if refBase != 0 { print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n") gcDumpObject("object", refBase, refOff) @@ -245,6 +434,7 @@ func heapBitsForObject(p, refBase, refOff uintptr) (base uintptr, hbits heapBits // optimize for power of 2 sized objects. base = s.base() base = base + (p-base)&s.baseMask + objIndex = (base - s.base()) >> s.divShift // base = p & s.baseMask is faster for small spans, // but doesn't work for large spans. // Overall, it's faster to use the more general computation above. @@ -252,8 +442,8 @@ func heapBitsForObject(p, refBase, refOff uintptr) (base uintptr, hbits heapBits base = s.base() if p-base >= s.elemsize { // n := (p - base) / s.elemsize, using division by multiplication - n := uintptr(uint64(p-base) >> s.divShift * uint64(s.divMul) >> s.divShift2) - base += n * s.elemsize + objIndex = uintptr(uint64(p-base) >> s.divShift * uint64(s.divMul) >> s.divShift2) + base += objIndex * s.elemsize } } // Now that we know the actual base, compute heapBits to return to caller. @@ -298,28 +488,13 @@ func (h heapBits) bits() uint32 { return uint32(*h.bitp) >> (h.shift & 31) } -// isMarked reports whether the heap bits have the marked bit set. -// h must describe the initial word of the object. -func (h heapBits) isMarked() bool { +// morePointers returns true if this word and all remaining words in this object +// are scalars. +// h must not describe the first or second word of the object. 
+func (h heapBits) morePointers() bool { return *h.bitp&(bitMarked< 2*sys.PtrSize { - x = 0 - } - } - *bitp = uint8(x) - if i+1 >= n { - break - } - bitp = subtractb(bitp, step) - x = uint32(*bitp) - if x&(bitMarked<<(2*heapBitsShift)) != 0 { - x &^= bitMarked << (2 * heapBitsShift) - } else { - x &^= (bitMarked|bitPointer)<<(2*heapBitsShift) | (bitMarked|bitPointer)<<(3*heapBitsShift) - f(base + (i+1)*size) - if size > 2*sys.PtrSize { - *subtract1(bitp) = 0 - } - } - *bitp = uint8(x) - bitp = subtractb(bitp, step+1) - } +// countFree runs through the mark bits in a span and counts the number of free objects +// in the span. +// TODO:(rlh) Use popcount intrinsic. +func (s *mspan) countFree() int { + count := 0 + maxIndex := s.nelems / 8 + for i := uintptr(0); i < maxIndex; i++ { + mrkBits := *addb(s.gcmarkBits, i) + count += int(oneBitCount[mrkBits]) } + if bitsInLastByte := s.nelems % 8; bitsInLastByte != 0 { + mrkBits := *addb(s.gcmarkBits, maxIndex) + mask := uint8((1 << bitsInLastByte) - 1) + bits := mrkBits & mask + count += int(oneBitCount[bits]) + } + return int(s.nelems) - count } // heapBitsSetType records that the new allocation [x, x+size) @@ -739,7 +880,7 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { // size is sizeof(_defer{}) (at least 6 words) and dataSize may be // arbitrarily larger. // - // The checks for size == ptrSize and size == 2*ptrSize can therefore + // The checks for size == sys.PtrSize and size == 2*sys.PtrSize can therefore // assume that dataSize == size without checking it explicitly. if sys.PtrSize == 8 && size == sys.PtrSize { @@ -779,10 +920,13 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { // (In general the number of instances of typ being allocated is // dataSize/typ.size.) if sys.PtrSize == 4 && dataSize == sys.PtrSize { - // 1 pointer. + // 1 pointer object. On 32-bit machines clear the bit for the + // unused second word. if gcphase == _GCoff { + *h.bitp &^= (bitPointer | bitMarked | ((bitPointer | bitMarked) << heapBitsShift)) << h.shift *h.bitp |= bitPointer << h.shift } else { + atomic.And8(h.bitp, ^uint8((bitPointer|bitMarked|((bitPointer|bitMarked)<>= 2 nb -= 2 // Note: no bitMarker in hb because the first two words don't get markers from us. if gcphase == _GCoff { + *hbitp &^= uint8((bitPointer | (bitPointer << heapBitsShift)) << (2 * heapBitsShift)) *hbitp |= uint8(hb) } else { + atomic.And8(hbitp, ^(uint8(bitPointer|bitPointer< 2*sys.PtrSize { + *bitp &^= (bitPointer | bitMarked) << (2 * heapBitsShift) + } + } else if h.shift == 2 { + *bitp &^= bitPointer<<(2*heapBitsShift) | bitPointer<<(3*heapBitsShift) + if size > 2*sys.PtrSize { + bitp = subtract1(bitp) + *bitp &^= bitPointer | bitMarked + } + } else { + throw("Type has unrecognized size") + } + } else { + throw("Type has unrecognized size") + } +} + var debugPtrmask struct { lock mutex data *byte @@ -1301,7 +1494,7 @@ func heapBitsSetTypeGCProg(h heapBits, progSize, elemSize, dataSize, allocSize u // progToPointerMask returns the 1-bit pointer mask output by the GC program prog. // size the size of the region described by prog, in bytes. -// The resulting bitvector will have no more than size/ptrSize bits. +// The resulting bitvector will have no more than size/sys.PtrSize bits. 
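The countFree logic above amounts to a population count over gcmarkBits, with the partial final byte masked off. This sketch expresses the same computation with math/bits.OnesCount8 in place of the oneBitCount lookup table (the popcount-intrinsic TODO in the patch); a slice stands in for the raw bitmap pointer.

package main

import (
	"fmt"
	"math/bits"
)

// countFree returns the number of unmarked objects among nelems objects
// whose mark bits are stored in gcmarkBits.
func countFree(gcmarkBits []uint8, nelems uintptr) int {
	marked := 0
	for i := uintptr(0); i < nelems/8; i++ {
		marked += bits.OnesCount8(gcmarkBits[i])
	}
	if rem := nelems % 8; rem != 0 {
		mask := uint8(1)<<rem - 1 // ignore bits past nelems in the last byte
		marked += bits.OnesCount8(gcmarkBits[nelems/8] & mask)
	}
	return int(nelems) - marked
}

func main() {
	marks := []uint8{0b0000_1001, 0b0000_0010} // objects 0, 3 and 9 marked
	fmt.Println(countFree(marks, 10))          // 7
}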
func progToPointerMask(prog *byte, size uintptr) bitvector { n := (size/sys.PtrSize + 7) / 8 x := (*[1 << 30]byte)(persistentalloc(n+1, 1, &memstats.buckhash_sys))[:n+1] @@ -1437,7 +1630,7 @@ Run: // into a register and use that register for the entire loop // instead of repeatedly reading from memory. // Handling fewer than 8 bits here makes the general loop simpler. - // The cutoff is ptrSize*8 - 7 to guarantee that when we add + // The cutoff is sys.PtrSize*8 - 7 to guarantee that when we add // the pattern to a bit buffer holding at most 7 bits (a partial byte) // it will not overflow. src := dst @@ -1732,7 +1925,7 @@ func getgcmask(ep interface{}) (mask []byte) { if hbits.isPointer() { mask[i/sys.PtrSize] = 1 } - if i >= 2*sys.PtrSize && !hbits.isMarked() { + if i >= 2*sys.PtrSize && !hbits.morePointers() { mask = mask[:i/sys.PtrSize] break } diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index 2230c5c200..5938e53ca8 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -108,9 +108,11 @@ func (c *mcache) refill(sizeclass int32) *mspan { _g_.m.locks++ // Return the current cached span to the central lists. s := c.alloc[sizeclass] - if s.freelist.ptr() != nil { - throw("refill on a nonempty span") + + if uintptr(s.allocCount) != s.nelems { + throw("refill of span with free space remaining") } + if s != &emptymspan { s.incache = false } @@ -120,10 +122,11 @@ func (c *mcache) refill(sizeclass int32) *mspan { if s == nil { throw("out of memory") } - if s.freelist.ptr() == nil { - println(s.ref, (s.npages<<_PageShift)/s.elemsize) - throw("empty span") + + if uintptr(s.allocCount) == s.nelems { + throw("span has no free space") } + c.alloc[sizeclass] = s _g_.m.locks-- return s diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go index 4f0b86c228..7b63110460 100644 --- a/src/runtime/mcentral.go +++ b/src/runtime/mcentral.go @@ -18,7 +18,7 @@ import "runtime/internal/atomic" type mcentral struct { lock mutex sizeclass int32 - nonempty mSpanList // list of spans with a free object + nonempty mSpanList // list of spans with a free object, ie a nonempty free list empty mSpanList // list of spans with no free objects (or cached in an mcache) } @@ -67,7 +67,9 @@ retry: c.empty.insertBack(s) unlock(&c.lock) s.sweep(true) - if s.freelist.ptr() != nil { + freeIndex := s.nextFreeIndex() + if freeIndex != s.nelems { + s.freeindex = freeIndex goto havespan } lock(&c.lock) @@ -98,11 +100,11 @@ retry: // c is unlocked. havespan: cap := int32((s.npages << _PageShift) / s.elemsize) - n := cap - int32(s.ref) - if n == 0 { - throw("empty span") + n := cap - int32(s.allocCount) + if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems { + throw("span has no free objects") } - usedBytes := uintptr(s.ref) * s.elemsize + usedBytes := uintptr(s.allocCount) * s.elemsize if usedBytes > 0 { reimburseSweepCredit(usedBytes) } @@ -115,10 +117,16 @@ havespan: // heap_live changed. gcController.revise() } - if s.freelist.ptr() == nil { - throw("freelist empty") - } s.incache = true + freeByteBase := s.freeindex &^ (64 - 1) + whichByte := freeByteBase / 8 + // Init alloc bits cache. + s.refillAllocCache(whichByte) + + // Adjust the allocCache so that s.freeindex corresponds to the low bit in + // s.allocCache. 
+ s.allocCache >>= s.freeindex % 64 + return s } @@ -128,12 +136,12 @@ func (c *mcentral) uncacheSpan(s *mspan) { s.incache = false - if s.ref == 0 { - throw("uncaching full span") + if s.allocCount == 0 { + throw("uncaching span but s.allocCount == 0") } cap := int32((s.npages << _PageShift) / s.elemsize) - n := cap - int32(s.ref) + n := cap - int32(s.allocCount) if n > 0 { c.empty.remove(s) c.nonempty.insert(s) @@ -144,22 +152,19 @@ func (c *mcentral) uncacheSpan(s *mspan) { unlock(&c.lock) } -// Free n objects from a span s back into the central free list c. -// Called during sweep. -// Returns true if the span was returned to heap. Sets sweepgen to -// the latest generation. -// If preserve=true, don't return the span to heap nor relink in MCentral lists; -// caller takes care of it. -func (c *mcentral) freeSpan(s *mspan, n int32, start gclinkptr, end gclinkptr, preserve bool) bool { +// freeSpan updates c and s after sweeping s. +// It sets s's sweepgen to the latest generation, +// and, based on the number of free objects in s, +// moves s to the appropriate list of c or returns it +// to the heap. +// freeSpan returns true if s was returned to the heap. +// If preserve=true, it does not move s (the caller +// must take care of it). +func (c *mcentral) freeSpan(s *mspan, preserve bool, wasempty bool) bool { if s.incache { - throw("freespan into cached span") + throw("freeSpan given cached span") } - - // Add the objects back to s's free list. - wasempty := s.freelist.ptr() == nil - end.ptr().next = s.freelist - s.freelist = start - s.ref -= uint16(n) + s.needzero = 1 if preserve { // preserve is set only when called from MCentral_CacheSpan above, @@ -185,21 +190,18 @@ func (c *mcentral) freeSpan(s *mspan, n int32, start gclinkptr, end gclinkptr, p // lock of c above.) atomic.Store(&s.sweepgen, mheap_.sweepgen) - if s.ref != 0 { + if s.allocCount != 0 { unlock(&c.lock) return false } - // s is completely freed, return it to the heap. c.nonempty.remove(s) - s.needzero = 1 - s.freelist = 0 unlock(&c.lock) mheap_.freeSpan(s, 0) return true } -// Fetch a new span from the heap and carve into objects for the free list. +// grow allocates a new empty span from the heap and initializes it for c's size class. 
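When mcentral hands a partially used span to an mcache above, it reloads the 64 alloc bits covering freeindex and shifts them so the cache's low bit lines up with freeindex. Below is a sketch of that alignment, assembling the eight bytes with encoding/binary the same way refillAllocCache does (byte 0 in the low bits); a slice stands in for the raw allocBits pointer.

package main

import (
	"encoding/binary"
	"fmt"
)

// initAllocCache returns the complemented alloc bits covering freeindex,
// shifted so that bit 0 of the result corresponds to object freeindex.
func initAllocCache(allocBits []uint8, freeindex uintptr) uint64 {
	whichByte := (freeindex &^ 63) / 8 // start of the 64-bit block holding freeindex
	cache := ^binary.LittleEndian.Uint64(allocBits[whichByte : whichByte+8])
	return cache >> (freeindex % 64)
}

func main() {
	allocBits := make([]uint8, 16)
	allocBits[0] = 0xff // objects 0..7 allocated
	allocBits[1] = 0x03 // objects 8 and 9 allocated
	cache := initAllocCache(allocBits, 10)
	fmt.Println(cache&1 == 1) // true: object 10 is free
}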
func (c *mcentral) grow() *mspan { npages := uintptr(class_to_allocnpages[c.sizeclass]) size := uintptr(class_to_size[c.sizeclass]) @@ -210,21 +212,9 @@ func (c *mcentral) grow() *mspan { return nil } - p := uintptr(s.start << _PageShift) + p := s.base() s.limit = p + size*n - head := gclinkptr(p) - tail := gclinkptr(p) - // i==0 iteration already done - for i := uintptr(1); i < n; i++ { - p += size - tail.ptr().next = gclinkptr(p) - tail = gclinkptr(p) - } - if s.freelist.ptr() != nil { - throw("freelist not empty") - } - tail.ptr().next = 0 - s.freelist = head - heapBitsForSpan(s.base()).initSpan(s.layout()) + + heapBitsForSpan(s.base()).initSpan(s) return s } diff --git a/src/runtime/mfinal.go b/src/runtime/mfinal.go index e81650d842..6dce6d7501 100644 --- a/src/runtime/mfinal.go +++ b/src/runtime/mfinal.go @@ -402,7 +402,7 @@ func findObject(v unsafe.Pointer) (s *mspan, x unsafe.Pointer, n uintptr) { if s == nil { return } - x = unsafe.Pointer(uintptr(s.start) << pageShift) + x = unsafe.Pointer(s.base()) if uintptr(v) < uintptr(x) || uintptr(v) >= uintptr(unsafe.Pointer(s.limit)) || s.state != mSpanInUse { s = nil diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index b5a9ff9b56..14449c3d4b 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -360,7 +360,7 @@ func markrootSpans(gcw *gcWork, shard int) { // retain everything it points to. spf := (*specialfinalizer)(unsafe.Pointer(sp)) // A finalizer can be set for an inner byte of an object, find object beginning. - p := uintptr(s.start<<_PageShift) + uintptr(spf.special.offset)/s.elemsize*s.elemsize + p := s.base() + uintptr(spf.special.offset)/s.elemsize*s.elemsize // Mark everything that can be reached from // the object (but *not* the object itself or @@ -962,7 +962,10 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) { if blocking { b = gcw.get() } else { - b = gcw.tryGet() + b = gcw.tryGetFast() + if b == 0 { + b = gcw.tryGet() + } } if b == 0 { // work barrier reached or tryGet failed. @@ -1025,7 +1028,11 @@ func gcDrainN(gcw *gcWork, scanWork int64) int64 { // PREFETCH(wbuf->obj[wbuf.nobj - 3]; // } // - b := gcw.tryGet() + b := gcw.tryGetFast() + if b == 0 { + b = gcw.tryGet() + } + if b == 0 { break } @@ -1075,8 +1082,8 @@ func scanblock(b0, n0 uintptr, ptrmask *uint8, gcw *gcWork) { // Same work as in scanobject; see comments there. obj := *(*uintptr)(unsafe.Pointer(b + i)) if obj != 0 && arena_start <= obj && obj < arena_used { - if obj, hbits, span := heapBitsForObject(obj, b, i); obj != 0 { - greyobject(obj, b, i, hbits, span, gcw) + if obj, hbits, span, objIndex := heapBitsForObject(obj, b, i); obj != 0 { + greyobject(obj, b, i, hbits, span, gcw, objIndex) } } } @@ -1141,8 +1148,8 @@ func scanobject(b uintptr, gcw *gcWork) { // Check if it points into heap and not back at the current object. if obj != 0 && arena_start <= obj && obj < arena_used && obj-b >= n { // Mark the object. - if obj, hbits, span := heapBitsForObject(obj, b, i); obj != 0 { - greyobject(obj, b, i, hbits, span, gcw) + if obj, hbits, span, objIndex := heapBitsForObject(obj, b, i); obj != 0 { + greyobject(obj, b, i, hbits, span, gcw, objIndex) } } } @@ -1155,9 +1162,9 @@ func scanobject(b uintptr, gcw *gcWork) { // Preemption must be disabled. 
//go:nowritebarrier func shade(b uintptr) { - if obj, hbits, span := heapBitsForObject(b, 0, 0); obj != 0 { + if obj, hbits, span, objIndex := heapBitsForObject(b, 0, 0); obj != 0 { gcw := &getg().m.p.ptr().gcw - greyobject(obj, 0, 0, hbits, span, gcw) + greyobject(obj, 0, 0, hbits, span, gcw, objIndex) if gcphase == _GCmarktermination || gcBlackenPromptly { // Ps aren't allowed to cache work during mark // termination. @@ -1170,14 +1177,15 @@ func shade(b uintptr) { // If it isn't already marked, mark it and enqueue into gcw. // base and off are for debugging only and could be removed. //go:nowritebarrierrec -func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork) { +func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork, objIndex uintptr) { // obj should be start of allocation, and so must be at least pointer-aligned. if obj&(sys.PtrSize-1) != 0 { throw("greyobject: obj not pointer-aligned") } + mbits := span.markBitsForIndex(objIndex) if useCheckmark { - if !hbits.isMarked() { + if !mbits.isMarked() { printlock() print("runtime:greyobject: checkmarks finds unexpected unmarked object obj=", hex(obj), "\n") print("runtime: found obj at *(", hex(base), "+", hex(off), ")\n") @@ -1199,11 +1207,11 @@ func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork } } else { // If marked we have nothing to do. - if hbits.isMarked() { + if mbits.isMarked() { return } - hbits.setMarked() - + // mbits.setMarked() // Avoid extra call overhead with manual inlining. + atomic.Or8(mbits.bytep, mbits.mask) // If this is a noscan object, fast-track it to black // instead of greying it. if !hbits.hasPointers(span.elemsize) { @@ -1218,8 +1226,9 @@ func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork // Previously we put the obj in an 8 element buffer that is drained at a rate // to give the PREFETCH time to do its work. // Use of PREFETCHNTA might be more appropriate than PREFETCH - - gcw.put(obj) + if !gcw.putFast(obj) { + gcw.put(obj) + } } // gcDumpObject dumps the contents of obj for debugging and marks the @@ -1238,7 +1247,7 @@ func gcDumpObject(label string, obj, off uintptr) { print(" s=nil\n") return } - print(" s.start*_PageSize=", hex(s.start*_PageSize), " s.limit=", hex(s.limit), " s.sizeclass=", s.sizeclass, " s.elemsize=", s.elemsize, "\n") + print(" s.base()=", hex(s.base()), " s.limit=", hex(s.limit), " s.sizeclass=", s.sizeclass, " s.elemsize=", s.elemsize, "\n") skipped := false for i := uintptr(0); i < s.elemsize; i += sys.PtrSize { // For big objects, just print the beginning (because @@ -1274,7 +1283,7 @@ func gcmarknewobject(obj, size, scanSize uintptr) { if useCheckmark && !gcBlackenPromptly { // The world should be stopped so this should not happen. 
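In greyobject above, the mark bit is now derived from the object's index within its span and set with the runtime-internal atomic.Or8 (manually inlined). The sketch below models the same check-then-mark step with a compare-and-swap loop from sync/atomic, using 32-bit words only so the standard package applies; the types are stand-ins, not the runtime's.

package main

import (
	"fmt"
	"sync/atomic"
)

type span struct {
	gcmarkBits []uint32 // the runtime uses bytes; words keep sync/atomic happy here
}

// markIfUnmarked sets the mark bit for objIndex and reports whether this call
// performed the marking (i.e. the object still needs to be queued for scanning).
func (s *span) markIfUnmarked(objIndex uintptr) bool {
	word := &s.gcmarkBits[objIndex/32]
	mask := uint32(1) << (objIndex % 32)
	for {
		old := atomic.LoadUint32(word)
		if old&mask != 0 {
			return false // somebody else already marked it
		}
		if atomic.CompareAndSwapUint32(word, old, old|mask) {
			return true
		}
	}
}

func main() {
	s := &span{gcmarkBits: make([]uint32, 2)}
	fmt.Println(s.markIfUnmarked(5)) // true: we marked it
	fmt.Println(s.markIfUnmarked(5)) // false: already marked
}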
throw("gcmarknewobject called while doing checkmark") } - heapBitsForAddr(obj).setMarked() + markBitsForAddr(obj).setMarked() gcw := &getg().m.p.ptr().gcw gcw.bytesMarked += uint64(size) gcw.scanWork += int64(scanSize) diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go index 31d1a80183..b8e33897c1 100644 --- a/src/runtime/mgcsweep.go +++ b/src/runtime/mgcsweep.go @@ -8,7 +8,6 @@ package runtime import ( "runtime/internal/atomic" - "runtime/internal/sys" "unsafe" ) @@ -52,6 +51,7 @@ func finishsweep_m(stw bool) { } } } + nextMarkBitArenaEpoch() } func bgsweep(c chan int) { @@ -187,21 +187,16 @@ func (s *mspan) sweep(preserve bool) bool { res := false nfree := 0 - var head, end gclinkptr - c := _g_.m.mcache freeToHeap := false - // Mark any free objects in this span so we don't collect them. - sstart := uintptr(s.start << _PageShift) - for link := s.freelist; link.ptr() != nil; link = link.ptr().next { - if uintptr(link) < sstart || s.limit <= uintptr(link) { - // Free list is corrupted. - dumpFreeList(s) - throw("free list corrupted") - } - heapBitsForAddr(uintptr(link)).setMarkedNonAtomic() - } + // The allocBits indicate which unmarked objects don't need to be + // processed since they were free at the end of the last GC cycle + // and were not allocated since then. + // If the allocBits index is >= s.freeindex and the bit + // is not marked then the object remains unallocated + // since the last GC. + // This situation is analogous to being on a freelist. // Unlink & free special records for any objects we're about to free. // Two complications here: @@ -215,17 +210,18 @@ func (s *mspan) sweep(preserve bool) bool { special := *specialp for special != nil { // A finalizer can be set for an inner byte of an object, find object beginning. - p := uintptr(s.start<<_PageShift) + uintptr(special.offset)/size*size - hbits := heapBitsForAddr(p) - if !hbits.isMarked() { + objIndex := uintptr(special.offset) / size + p := s.base() + objIndex*size + mbits := s.markBitsForIndex(objIndex) + if !mbits.isMarked() { // This object is not marked and has at least one special record. // Pass 1: see if it has at least one finalizer. hasFin := false - endOffset := p - uintptr(s.start<<_PageShift) + size + endOffset := p - s.base() + size for tmp := special; tmp != nil && uintptr(tmp.offset) < endOffset; tmp = tmp.next { if tmp.kind == _KindSpecialFinalizer { // Stop freeing of object if it has a finalizer. - hbits.setMarkedNonAtomic() + mbits.setMarkedNonAtomic() hasFin = true break } @@ -234,7 +230,7 @@ func (s *mspan) sweep(preserve bool) bool { for special != nil && uintptr(special.offset) < endOffset { // Find the exact byte for which the special was setup // (as opposed to object beginning). - p := uintptr(s.start<<_PageShift) + uintptr(special.offset) + p := s.base() + uintptr(special.offset) if special.kind == _KindSpecialFinalizer || !hasFin { // Splice out special record. y := special @@ -255,67 +251,67 @@ func (s *mspan) sweep(preserve bool) bool { } } - // Sweep through n objects of given size starting at p. - // This thread owns the span now, so it can manipulate - // the block bitmap without atomic operations. - - size, n, _ := s.layout() - heapBitsSweepSpan(s.base(), size, n, func(p uintptr) { - // At this point we know that we are looking at garbage object - // that needs to be collected. - if debug.allocfreetrace != 0 { - tracefree(unsafe.Pointer(p), size) - } - if msanenabled { - msanfree(unsafe.Pointer(p), size) - } - - // Reset to allocated+noscan. 
- if cl == 0 { - // Free large span. - if preserve { - throw("can't preserve large span") + if debug.allocfreetrace != 0 { + // Find all newly freed objects. This doesn't have to + // efficient; allocfreetrace has massive overhead. + mbits := s.markBitsForBase() + abits := s.allocBitsForIndex(0) + for i := uintptr(0); i < s.nelems; i++ { + if !mbits.isMarked() && (abits.index < s.freeindex || abits.isMarked()) { + x := s.base() + i*s.elemsize + tracefree(unsafe.Pointer(x), size) } - s.needzero = 1 - - // Free the span after heapBitsSweepSpan - // returns, since it's not done with the span. - freeToHeap = true - } else { - // Free small object. - if size > 2*sys.PtrSize { - *(*uintptr)(unsafe.Pointer(p + sys.PtrSize)) = uintptrMask & 0xdeaddeaddeaddead // mark as "needs to be zeroed" - } else if size > sys.PtrSize { - *(*uintptr)(unsafe.Pointer(p + sys.PtrSize)) = 0 - } - if head.ptr() == nil { - head = gclinkptr(p) - } else { - end.ptr().next = gclinkptr(p) - } - end = gclinkptr(p) - end.ptr().next = gclinkptr(0x0bade5) - nfree++ + mbits.advance() + abits.advance() } - }) + } + + // Count the number of free objects in this span. + nfree = s.countFree() + if cl == 0 && nfree != 0 { + s.needzero = 1 + freeToHeap = true + } + nalloc := uint16(s.nelems) - uint16(nfree) + nfreed := s.allocCount - nalloc + if nalloc > s.allocCount { + print("runtime: nelems=", s.nelems, " nfree=", nfree, " nalloc=", nalloc, " previous allocCount=", s.allocCount, " nfreed=", nfreed, "\n") + throw("sweep increased allocation count") + } + + s.allocCount = nalloc + wasempty := s.nextFreeIndex() == s.nelems + s.freeindex = 0 // reset allocation index to start of span. + + // gcmarkBits becomes the allocBits. + // get a fresh cleared gcmarkBits in preparation for next GC + s.allocBits = s.gcmarkBits + s.gcmarkBits = newMarkBits(s.nelems) + + // Initialize alloc bits cache. + s.refillAllocCache(0) // We need to set s.sweepgen = h.sweepgen only when all blocks are swept, // because of the potential for a concurrent free/SetFinalizer. // But we need to set it before we make the span available for allocation // (return it to heap or mcentral), because allocation code assumes that a // span is already swept if available for allocation. - if freeToHeap || nfree == 0 { + if freeToHeap || nfreed == 0 { // The span must be in our exclusive ownership until we update sweepgen, // check for potential races. if s.state != mSpanInUse || s.sweepgen != sweepgen-1 { print("MSpan_Sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n") throw("MSpan_Sweep: bad span state after sweep") } + // Serialization point. + // At this point the mark bits are cleared and allocation ready + // to go so release the span. atomic.Store(&s.sweepgen, sweepgen) } - if nfree > 0 { - c.local_nsmallfree[cl] += uintptr(nfree) - res = mheap_.central[cl].mcentral.freeSpan(s, int32(nfree), head, end, preserve) + + if nfreed > 0 && cl != 0 { + c.local_nsmallfree[cl] += uintptr(nfreed) + res = mheap_.central[cl].mcentral.freeSpan(s, preserve, wasempty) // MCentral_FreeSpan updates sweepgen } else if freeToHeap { // Free large span to heap @@ -336,7 +332,7 @@ func (s *mspan) sweep(preserve bool) bool { // implement and then call some kind of MHeap_DeleteSpan. 
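The sweep bookkeeping above boils down to: survivors are exactly the marked objects, the mark bitmap becomes the new allocation bitmap, allocation restarts at index 0, and a zeroed mark bitmap is installed for the next cycle. A simplified sketch, with slices in place of the gcBits arena memory and a fabricated span type:

package main

import (
	"fmt"
	"math/bits"
)

type span struct {
	nelems     uintptr
	allocCount uint16
	freeindex  uintptr
	allocBits  []uint8 // consulted by the allocator
	gcmarkBits []uint8 // produced by the GC that just finished marking
}

// sweep recomputes allocCount from the mark bits, swaps the bitmaps and
// returns the number of objects freed by this cycle.
func (s *span) sweep() (nfreed uint16) {
	marked := 0
	for _, b := range s.gcmarkBits {
		marked += bits.OnesCount8(b)
	}
	nalloc := uint16(marked)
	nfreed = s.allocCount - nalloc

	s.allocCount = nalloc
	s.freeindex = 0
	s.allocBits = s.gcmarkBits
	s.gcmarkBits = make([]uint8, (s.nelems+7)/8) // the runtime takes this from a gcBits arena
	return nfreed
}

func main() {
	s := &span{
		nelems:     16,
		allocCount: 5,
		allocBits:  []uint8{0b0001_1111, 0},
		gcmarkBits: []uint8{0b0000_0101, 0}, // only objects 0 and 2 were reached
	}
	fmt.Println("freed:", s.sweep()) // freed: 3
}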
if debug.efence > 0 { s.limit = 0 // prevent mlookup from finding this span - sysFault(unsafe.Pointer(uintptr(s.start<<_PageShift)), size) + sysFault(unsafe.Pointer(s.base()), size) } else { mheap_.freeSpan(s, 1) } @@ -399,27 +395,3 @@ func reimburseSweepCredit(unusableBytes uintptr) { throw("spanBytesAlloc underflow") } } - -func dumpFreeList(s *mspan) { - printlock() - print("runtime: free list of span ", s, ":\n") - sstart := uintptr(s.start << _PageShift) - link := s.freelist - for i := 0; i < int(s.npages*_PageSize/s.elemsize); i++ { - if i != 0 { - print(" -> ") - } - print(hex(link)) - if link.ptr() == nil { - break - } - if uintptr(link) < sstart || s.limit <= uintptr(link) { - // Bad link. Stop walking before we crash. - print(" (BAD)") - break - } - link = link.ptr().next - } - print("\n") - printunlock() -} diff --git a/src/runtime/mgcwork.go b/src/runtime/mgcwork.go index 63a3ade3a6..d04840b686 100644 --- a/src/runtime/mgcwork.go +++ b/src/runtime/mgcwork.go @@ -116,6 +116,22 @@ func (w *gcWork) put(obj uintptr) { wbuf.nobj++ } +// putFast does a put and returns true if it can be done quickly +// otherwise it returns false and the caller needs to call put. +//go:nowritebarrier +func (w *gcWork) putFast(obj uintptr) bool { + wbuf := w.wbuf1.ptr() + if wbuf == nil { + return false + } else if wbuf.nobj == len(wbuf.obj) { + return false + } + + wbuf.obj[wbuf.nobj] = obj + wbuf.nobj++ + return true +} + // tryGet dequeues a pointer for the garbage collector to trace. // // If there are no pointers remaining in this gcWork or in the global @@ -147,6 +163,23 @@ func (w *gcWork) tryGet() uintptr { return wbuf.obj[wbuf.nobj] } +// tryGetFast dequeues a pointer for the garbage collector to trace +// if one is readily available. Otherwise it returns 0 and +// the caller is expected to call tryGet(). +//go:nowritebarrier +func (w *gcWork) tryGetFast() uintptr { + wbuf := w.wbuf1.ptr() + if wbuf == nil { + return 0 + } + if wbuf.nobj == 0 { + return 0 + } + + wbuf.nobj-- + return wbuf.obj[wbuf.nobj] +} + // get dequeues a pointer for the garbage collector to trace, blocking // if necessary to ensure all pointers from all queues and caches have // been retrieved. get returns 0 if there are no pointers remaining. diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 99f7b54fc8..1f732c2111 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -117,9 +117,63 @@ type mspan struct { prev **mspan // previous span's next field, or list head's first field if none list *mSpanList // For debugging. TODO: Remove. - start pageID // starting page number - npages uintptr // number of pages in span - freelist gclinkptr // list of free objects + startAddr uintptr // address of first byte of span aka s.base() + npages uintptr // number of pages in span + stackfreelist gclinkptr // list of free stacks, avoids overloading freelist + + // freeindex is the slot index between 0 and nelems at which to begin scanning + // for the next free object in this span. + // Each allocation scans allocBits starting at freeindex until it encounters a 0 + // indicating a free object. freeindex is then adjusted so that subsequent scans begin + // just past the the newly discovered free object. + // + // If freeindex == nelem, this span has no free objects. + // + // allocBits is a bitmap of objects in this span. + // If n >= freeindex and allocBits[n/8] & (1<<(n%8)) is 0 + // then object n is free; + // otherwise, object n is allocated. 
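putFast and tryGetFast above only touch the already-wired primary work buffer and fall back (by returning false or 0) whenever the slow path would need to swap buffers or talk to the global work lists. A self-contained sketch of that fast/slow split over a tiny buffer:

package main

import "fmt"

type workbuf struct {
	obj  [4]uintptr // tiny buffer for illustration; the runtime's is much larger
	nobj int
}

type gcWork struct{ wbuf1 *workbuf }

func (w *gcWork) putFast(obj uintptr) bool {
	wbuf := w.wbuf1
	if wbuf == nil || wbuf.nobj == len(wbuf.obj) {
		return false // caller must take the slow path (put)
	}
	wbuf.obj[wbuf.nobj] = obj
	wbuf.nobj++
	return true
}

func (w *gcWork) tryGetFast() uintptr {
	wbuf := w.wbuf1
	if wbuf == nil || wbuf.nobj == 0 {
		return 0 // caller must take the slow path (tryGet)
	}
	wbuf.nobj--
	return wbuf.obj[wbuf.nobj]
}

func main() {
	w := &gcWork{wbuf1: &workbuf{}}
	fmt.Println(w.putFast(0x1000), w.putFast(0x2000)) // true true
	fmt.Println(w.tryGetFast() != 0)                  // true
}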
Bits starting at nelem are + // undefined and should never be referenced. + // + // Object n starts at address n*elemsize + (start << pageShift). + freeindex uintptr + // TODO: Look up nelems from sizeclass and remove this field if it + // helps performance. + nelems uintptr // number of object in the span. + + // Cache of the allocBits at freeindex. allocCache is shifted + // such that the lowest bit corresponds to the bit freeindex. + // allocCache holds the complement of allocBits, thus allowing + // ctz (count trailing zero) to use it directly. + // allocCache may contain bits beyond s.nelems; the caller must ignore + // these. + allocCache uint64 + + // allocBits and gcmarkBits hold pointers to a span's mark and + // allocation bits. The pointers are 8 byte aligned. + // There are three arenas where this data is held. + // free: Dirty arenas that are no longer accessed + // and can be reused. + // next: Holds information to be used in the next GC cycle. + // current: Information being used during this GC cycle. + // previous: Information being used during the last GC cycle. + // A new GC cycle starts with the call to finishsweep_m. + // finishsweep_m moves the previous arena to the free arena, + // the current arena to the previous arena, and + // the next arena to the current arena. + // The next arena is populated as the spans request + // memory to hold gcmarkBits for the next GC cycle as well + // as allocBits for newly allocated spans. + // + // The pointer arithmetic is done "by hand" instead of using + // arrays to avoid bounds checks along critical performance + // paths. + // The sweep will free the old allocBits and set allocBits to the + // gcmarkBits. The gcmarkBits are replaced with a fresh zeroed + // out memory. + allocBits *uint8 + gcmarkBits *uint8 + // sweep generation: // if sweepgen == h->sweepgen - 2, the span needs sweeping // if sweepgen == h->sweepgen - 1, the span is currently being swept @@ -128,7 +182,7 @@ type mspan struct { sweepgen uint32 divMul uint32 // for divide by elemsize - divMagic.mul - ref uint16 // capacity - number of objects in freelist + allocCount uint16 // capacity - number of objects in freelist sizeclass uint8 // size class incache bool // being used by an mcache state uint8 // mspaninuse etc @@ -145,7 +199,7 @@ type mspan struct { } func (s *mspan) base() uintptr { - return uintptr(s.start << _PageShift) + return s.startAddr } func (s *mspan) layout() (size, n, total uintptr) { @@ -207,11 +261,8 @@ func inheap(b uintptr) bool { return false } // Not a beginning of a block, consult span table to find the block beginning. - k := b >> _PageShift - x := k - x -= mheap_.arena_start >> _PageShift - s := h_spans[x] - if s == nil || pageID(k) < s.start || b >= s.limit || s.state != mSpanInUse { + s := h_spans[(b-mheap_.arena_start)>>_PageShift] + if s == nil || b < s.base() || b >= s.limit || s.state != mSpanInUse { return false } return true @@ -261,7 +312,7 @@ func mlookup(v uintptr, base *uintptr, size *uintptr, sp **mspan) int32 { return 0 } - p := uintptr(s.start) << _PageShift + p := s.base() if s.sizeclass == 0 { // Large object. if base != nil { @@ -440,8 +491,7 @@ func (h *mheap) alloc_m(npage uintptr, sizeclass int32, large bool) *mspan { // able to map interior pointer to containing span. 
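The mspan comment above pins down the address arithmetic: object n lives at base + n*elemsize, and heapBitsForObject earlier in the patch inverts it, rounding an interior pointer down with baseMask and recovering the index with divShift for power-of-two classes (the general case uses the divMul multiply trick). A sketch of the power-of-two case under an assumed span base; baseMask is written directly as ^(elemsize-1) here, whereas the runtime derives it during size-class setup.

package main

import "fmt"

const (
	spanBase = uintptr(0x100000) // assumed s.base() for the example
	elemsize = uintptr(1) << 5   // 32-byte objects
	divShift = 5                 // log2(elemsize)
	baseMask = ^(elemsize - 1)   // rounds offsets down to an object boundary
)

// objectFor maps an interior pointer to the base address and index of the
// object that contains it.
func objectFor(p uintptr) (base, index uintptr) {
	base = spanBase + (p-spanBase)&baseMask
	index = (base - spanBase) >> divShift
	return base, index
}

func main() {
	base, idx := objectFor(spanBase + 3*elemsize + 7) // points into object 3
	fmt.Printf("base=%#x index=%d\n", base, idx)
}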
atomic.Store(&s.sweepgen, h.sweepgen) s.state = _MSpanInUse - s.freelist = 0 - s.ref = 0 + s.allocCount = 0 s.sizeclass = uint8(sizeclass) if sizeclass == 0 { s.elemsize = s.npages << _PageShift @@ -504,7 +554,7 @@ func (h *mheap) alloc(npage uintptr, sizeclass int32, large bool, needzero bool) if s != nil { if needzero && s.needzero != 0 { - memclr(unsafe.Pointer(s.start<<_PageShift), s.npages<<_PageShift) + memclr(unsafe.Pointer(s.base()), s.npages<<_PageShift) } s.needzero = 0 } @@ -520,8 +570,8 @@ func (h *mheap) allocStack(npage uintptr) *mspan { s := h.allocSpanLocked(npage) if s != nil { s.state = _MSpanStack - s.freelist = 0 - s.ref = 0 + s.stackfreelist = 0 + s.allocCount = 0 memstats.stacks_inuse += uint64(s.npages << _PageShift) } @@ -572,7 +622,7 @@ HaveSpan: throw("still in list") } if s.npreleased > 0 { - sysUsed(unsafe.Pointer(s.start<<_PageShift), s.npages<<_PageShift) + sysUsed(unsafe.Pointer(s.base()), s.npages<<_PageShift) memstats.heap_released -= uint64(s.npreleased << _PageShift) s.npreleased = 0 } @@ -580,10 +630,9 @@ HaveSpan: if s.npages > npage { // Trim extra and put it back in the heap. t := (*mspan)(h.spanalloc.alloc()) - t.init(s.start+pageID(npage), s.npages-npage) + t.init(s.base()+npage<<_PageShift, s.npages-npage) s.npages = npage - p := uintptr(t.start) - p -= (h.arena_start >> _PageShift) + p := (t.base() - h.arena_start) >> _PageShift if p > 0 { h_spans[p-1] = s } @@ -597,8 +646,7 @@ HaveSpan: } s.unusedsince = 0 - p := uintptr(s.start) - p -= (h.arena_start >> _PageShift) + p := (s.base() - h.arena_start) >> _PageShift for n := uintptr(0); n < npage; n++ { h_spans[p+n] = s } @@ -626,7 +674,7 @@ func bestFit(list *mSpanList, npage uintptr, best *mspan) *mspan { if s.npages < npage { continue } - if best == nil || s.npages < best.npages || (s.npages == best.npages && s.start < best.start) { + if best == nil || s.npages < best.npages || (s.npages == best.npages && s.base() < best.base()) { best = s } } @@ -663,9 +711,8 @@ func (h *mheap) grow(npage uintptr) bool { // Create a fake "in use" span and free it, so that the // right coalescing happens. s := (*mspan)(h.spanalloc.alloc()) - s.init(pageID(uintptr(v)>>_PageShift), ask>>_PageShift) - p := uintptr(s.start) - p -= (h.arena_start >> _PageShift) + s.init(uintptr(v), ask>>_PageShift) + p := (s.base() - h.arena_start) >> _PageShift for i := p; i < p+s.npages; i++ { h_spans[i] = s } @@ -696,11 +743,8 @@ func (h *mheap) lookupMaybe(v unsafe.Pointer) *mspan { if uintptr(v) < h.arena_start || uintptr(v) >= h.arena_used { return nil } - p := uintptr(v) >> _PageShift - q := p - q -= h.arena_start >> _PageShift - s := h_spans[q] - if s == nil || p < uintptr(s.start) || uintptr(v) >= uintptr(unsafe.Pointer(s.limit)) || s.state != _MSpanInUse { + s := h_spans[(uintptr(v)-h.arena_start)>>_PageShift] + if s == nil || uintptr(v) < s.base() || uintptr(v) >= uintptr(unsafe.Pointer(s.limit)) || s.state != _MSpanInUse { return nil } return s @@ -715,6 +759,12 @@ func (h *mheap) freeSpan(s *mspan, acct int32) { mp.mcache.local_scan = 0 memstats.tinyallocs += uint64(mp.mcache.local_tinyallocs) mp.mcache.local_tinyallocs = 0 + if msanenabled { + // Tell msan that this entire span is no longer in use. 
+ base := unsafe.Pointer(s.base()) + bytes := s.npages << _PageShift + msanfree(base, bytes) + } if acct != 0 { memstats.heap_objects-- } @@ -743,12 +793,12 @@ func (h *mheap) freeStack(s *mspan) { func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool, unusedsince int64) { switch s.state { case _MSpanStack: - if s.ref != 0 { + if s.allocCount != 0 { throw("MHeap_FreeSpanLocked - invalid stack free") } case _MSpanInUse: - if s.ref != 0 || s.sweepgen != h.sweepgen { - print("MHeap_FreeSpanLocked - span ", s, " ptr ", hex(s.start<<_PageShift), " ref ", s.ref, " sweepgen ", s.sweepgen, "/", h.sweepgen, "\n") + if s.allocCount != 0 || s.sweepgen != h.sweepgen { + print("MHeap_FreeSpanLocked - span ", s, " ptr ", hex(s.base()), " allocCount ", s.allocCount, " sweepgen ", s.sweepgen, "/", h.sweepgen, "\n") throw("MHeap_FreeSpanLocked - invalid free") } h.pagesInUse -= uint64(s.npages) @@ -776,12 +826,11 @@ func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool, unusedsince i s.npreleased = 0 // Coalesce with earlier, later spans. - p := uintptr(s.start) - p -= h.arena_start >> _PageShift + p := (s.base() - h.arena_start) >> _PageShift if p > 0 { t := h_spans[p-1] if t != nil && t.state == _MSpanFree { - s.start = t.start + s.startAddr = t.startAddr s.npages += t.npages s.npreleased = t.npreleased // absorb released pages s.needzero |= t.needzero @@ -831,7 +880,7 @@ func scavengelist(list *mSpanList, now, limit uint64) uintptr { var sumreleased uintptr for s := list.first; s != nil; s = s.next { if (now-uint64(s.unusedsince)) > limit && s.npreleased != s.npages { - start := uintptr(s.start) << _PageShift + start := s.base() end := start + s.npages<<_PageShift if sys.PhysPageSize > _PageSize { // We can only release pages in @@ -886,14 +935,13 @@ func runtime_debug_freeOSMemory() { } // Initialize a new span with the given start and npages. 
-func (span *mspan) init(start pageID, npages uintptr) { +func (span *mspan) init(base uintptr, npages uintptr) { span.next = nil span.prev = nil span.list = nil - span.start = start + span.startAddr = base span.npages = npages - span.freelist = 0 - span.ref = 0 + span.allocCount = 0 span.sizeclass = 0 span.incache = false span.elemsize = 0 @@ -903,6 +951,9 @@ func (span *mspan) init(start pageID, npages uintptr) { span.speciallock.key = 0 span.specials = nil span.needzero = 0 + span.freeindex = 0 + span.allocBits = nil + span.gcmarkBits = nil } func (span *mspan) inList() bool { @@ -917,7 +968,7 @@ func (list *mSpanList) init() { func (list *mSpanList) remove(span *mspan) { if span.prev == nil || span.list != list { - println("failed MSpanList_Remove", span, span.prev, span.list, list) + println("runtime: failed MSpanList_Remove", span, span.prev, span.list, list) throw("MSpanList_Remove") } if span.next != nil { @@ -939,7 +990,7 @@ func (list *mSpanList) isEmpty() bool { func (list *mSpanList) insert(span *mspan) { if span.next != nil || span.prev != nil || span.list != nil { - println("failed MSpanList_Insert", span, span.next, span.prev, span.list) + println("runtime: failed MSpanList_Insert", span, span.next, span.prev, span.list) throw("MSpanList_Insert") } span.next = list.first @@ -998,7 +1049,7 @@ func addspecial(p unsafe.Pointer, s *special) bool { mp := acquirem() span.ensureSwept() - offset := uintptr(p) - uintptr(span.start<<_PageShift) + offset := uintptr(p) - span.base() kind := s.kind lock(&span.speciallock) @@ -1046,7 +1097,7 @@ func removespecial(p unsafe.Pointer, kind uint8) *special { mp := acquirem() span.ensureSwept() - offset := uintptr(p) - uintptr(span.start<<_PageShift) + offset := uintptr(p) - span.base() lock(&span.speciallock) t := &span.specials @@ -1169,3 +1220,117 @@ func freespecial(s *special, p unsafe.Pointer, size uintptr) { panic("not reached") } } + +const gcBitsChunkBytes = uintptr(1 << 16) +const gcBitsHeaderBytes = unsafe.Sizeof(gcBitsHeader{}) + +type gcBitsHeader struct { + free uintptr // free is the index into bits of the next free byte. + next uintptr // *gcBits triggers recursive type bug. (issue 14620) +} + +type gcBits struct { + // gcBitsHeader // side step recursive type bug (issue 14620) by including fields by hand. + free uintptr // free is the index into bits of the next free byte. + next *gcBits + bits [gcBitsChunkBytes - gcBitsHeaderBytes]uint8 +} + +var gcBitsArenas struct { + lock mutex + free *gcBits + next *gcBits + current *gcBits + previous *gcBits +} + +// newMarkBits returns a pointer to 8 byte aligned bytes +// to be used for a span's mark bits. +func newMarkBits(nelems uintptr) *uint8 { + lock(&gcBitsArenas.lock) + blocksNeeded := uintptr((nelems + 63) / 64) + bytesNeeded := blocksNeeded * 8 + if gcBitsArenas.next == nil || + gcBitsArenas.next.free+bytesNeeded > uintptr(len(gcBits{}.bits)) { + // Allocate a new arena. + fresh := newArena() + fresh.next = gcBitsArenas.next + gcBitsArenas.next = fresh + } + if gcBitsArenas.next.free >= gcBitsChunkBytes { + println("runtime: gcBitsArenas.next.free=", gcBitsArenas.next.free, gcBitsChunkBytes) + throw("markBits overflow") + } + result := &gcBitsArenas.next.bits[gcBitsArenas.next.free] + gcBitsArenas.next.free += bytesNeeded + unlock(&gcBitsArenas.lock) + return result +} + +// newAllocBits returns a pointer to 8 byte aligned bytes +// to be used for this span's alloc bits. +// newAllocBits is used to provide newly initialized spans +// allocation bits. 
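newMarkBits above is a simple bump allocator: round the object count up to whole 64-bit words, carve that many bytes out of the current arena, and start a fresh arena when the request does not fit. A sketch with byte slices standing in for the fixed-size gcBits chunks; the 1 KiB arena size here is illustrative, the patch uses 64 KiB chunks obtained from a free list or sysAlloc.

package main

import "fmt"

const arenaBytes = 1 << 10 // illustrative; gcBitsChunkBytes is 1<<16 in the patch

type arena struct {
	free uintptr
	bits [arenaBytes]uint8
}

type bitsAllocator struct{ next *arena }

// newMarkBits returns zeroed, 8-byte-granular bitmap space for nelems objects.
func (a *bitsAllocator) newMarkBits(nelems uintptr) []uint8 {
	bytesNeeded := ((nelems + 63) / 64) * 8 // whole 64-bit words
	if a.next == nil || a.next.free+bytesNeeded > uintptr(len(a.next.bits)) {
		a.next = &arena{} // the runtime reuses a free arena or allocates a new one
	}
	p := a.next.bits[a.next.free : a.next.free+bytesNeeded]
	a.next.free += bytesNeeded
	return p
}

func main() {
	var a bitsAllocator
	fmt.Println(len(a.newMarkBits(512))) // 64 bytes for 512 objects
	fmt.Println(len(a.newMarkBits(10)))  // 8 bytes: still a whole 64-bit word
}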
+
+// newAllocBits returns a pointer to 8 byte aligned bytes
+// to be used for this span's alloc bits.
+// newAllocBits is used to provide newly initialized spans
+// with allocation bits. For spans not being initialized, the
+// mark bits are repurposed as allocation bits when
+// the span is swept.
+func newAllocBits(nelems uintptr) *uint8 {
+	return newMarkBits(nelems)
+}
+
+// nextMarkBitArenaEpoch establishes a new epoch for the arenas
+// holding the mark bits. The arenas are named relative to the
+// current GC cycle, which is demarcated by the call to finishweep_m.
+//
+// All current spans have been swept.
+// During that sweep each span allocated room for its gcmarkBits in
+// the gcBitsArenas.next block. gcBitsArenas.next becomes gcBitsArenas.current,
+// where the GC will mark objects; after each span is swept, these bits
+// will be used to allocate objects.
+// gcBitsArenas.current becomes gcBitsArenas.previous, where the span's
+// gcAllocBits live until all the spans have been swept during this GC cycle.
+// The span's sweep extinguishes all the references to gcBitsArenas.previous
+// by pointing gcAllocBits into gcBitsArenas.current.
+// gcBitsArenas.previous is then released to the gcBitsArenas.free list.
+func nextMarkBitArenaEpoch() {
+	lock(&gcBitsArenas.lock)
+	if gcBitsArenas.previous != nil {
+		if gcBitsArenas.free == nil {
+			gcBitsArenas.free = gcBitsArenas.previous
+		} else {
+			// Find end of previous arenas.
+			last := gcBitsArenas.previous
+			for last = gcBitsArenas.previous; last.next != nil; last = last.next {
+			}
+			last.next = gcBitsArenas.free
+			gcBitsArenas.free = gcBitsArenas.previous
+		}
+	}
+	gcBitsArenas.previous = gcBitsArenas.current
+	gcBitsArenas.current = gcBitsArenas.next
+	gcBitsArenas.next = nil // newMarkBits calls newArena when needed
+	unlock(&gcBitsArenas.lock)
+}
+
+// newArena allocates and zeroes a gcBits arena.
+func newArena() *gcBits {
+	var result *gcBits
+	if gcBitsArenas.free == nil {
+		result = (*gcBits)(sysAlloc(gcBitsChunkBytes, &memstats.gc_sys))
+		if result == nil {
+			throw("runtime: cannot allocate memory")
+		}
+	} else {
+		result = gcBitsArenas.free
+		gcBitsArenas.free = gcBitsArenas.free.next
+		memclr(unsafe.Pointer(result), gcBitsChunkBytes)
+	}
+	result.next = nil
+	// If result.bits is not 8 byte aligned adjust index so
+	// that &result.bits[result.free] is 8 byte aligned.
+	if uintptr(unsafe.Offsetof(gcBits{}.bits))&7 == 0 {
+		result.free = 0
+	} else {
+		result.free = 8 - (uintptr(unsafe.Pointer(&result.bits[0])) & 7)
+	}
+	return result
+}
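nextMarkBitArenaEpoch only shuffles four list heads. A toy model of the rotation, using an invented arena/arenaSet pair rather than the runtime's gcBits types, may make the hand-off order easier to follow.

// Toy model (assumption, not runtime code) of the arena rotation performed by
// nextMarkBitArenaEpoch: next becomes current, current becomes previous, and
// the old previous chain is prepended to the free list.
package main

import "fmt"

type arena struct {
	id   int
	next *arena
}

type arenaSet struct {
	free, next, current, previous *arena
}

func (a *arenaSet) rotate() {
	if a.previous != nil {
		// Append the existing free list to the end of the previous chain,
		// then make previous the new head of free.
		last := a.previous
		for ; last.next != nil; last = last.next {
		}
		last.next = a.free
		a.free = a.previous
	}
	a.previous = a.current
	a.current = a.next
	a.next = nil // a fresh arena is allocated lazily, as in newMarkBits
}

func main() {
	a := &arenaSet{
		next:     &arena{id: 3},
		current:  &arena{id: 2},
		previous: &arena{id: 1},
	}
	a.rotate()
	fmt.Printf("current=%d previous=%d free=%d next=%v\n",
		a.current.id, a.previous.id, a.free.id, a.next)
}

The real function does the same work under gcBitsArenas.lock, and newMarkBits allocates a fresh next arena on demand.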
diff --git a/src/runtime/msize.go b/src/runtime/msize.go
index 21fe2f4c61..18577b309b 100644
--- a/src/runtime/msize.go
+++ b/src/runtime/msize.go
@@ -55,7 +55,7 @@ var size_to_class128 [(_MaxSmallSize-1024)/128 + 1]int8
 
 func sizeToClass(size int32) int32 {
 	if size > _MaxSmallSize {
-		throw("SizeToClass - invalid size")
+		throw("invalid size")
 	}
 	if size > 1024-8 {
 		return int32(size_to_class128[(size-1024+127)>>7])
@@ -79,7 +79,7 @@ func initSizes() {
 			}
 		}
 		if align&(align-1) != 0 {
-			throw("InitSizes - bug")
+			throw("incorrect alignment")
 		}
 
 		// Make the allocnpages big enough that
@@ -106,10 +106,18 @@ func initSizes() {
 		sizeclass++
 	}
 	if sizeclass != _NumSizeClasses {
-		print("sizeclass=", sizeclass, " NumSizeClasses=", _NumSizeClasses, "\n")
-		throw("InitSizes - bad NumSizeClasses")
+		print("runtime: sizeclass=", sizeclass, " NumSizeClasses=", _NumSizeClasses, "\n")
+		throw("bad NumSizeClasses")
+	}
+	// Check that maxObjsPerSpan bounds the number of objects in any span.
+	for i, size := range class_to_size {
+		if size != 0 && class_to_allocnpages[i]*pageSize/size > maxObjsPerSpan {
+			throw("span contains too many objects")
+		}
+		if size == 0 && i != 0 {
+			throw("size is 0 but class is not 0")
+		}
 	}
-
 	// Initialize the size_to_class tables.
 	nextsize := 0
 	for sizeclass = 1; sizeclass < _NumSizeClasses; sizeclass++ {
@@ -128,12 +136,12 @@ func initSizes() {
 	for n := int32(0); n < _MaxSmallSize; n++ {
 		sizeclass := sizeToClass(n)
 		if sizeclass < 1 || sizeclass >= _NumSizeClasses || class_to_size[sizeclass] < n {
-			print("size=", n, " sizeclass=", sizeclass, " runtime·class_to_size=", class_to_size[sizeclass], "\n")
+			print("runtime: size=", n, " sizeclass=", sizeclass, " runtime·class_to_size=", class_to_size[sizeclass], "\n")
 			print("incorrect SizeToClass\n")
 			goto dump
 		}
 		if sizeclass > 1 && class_to_size[sizeclass-1] >= n {
-			print("size=", n, " sizeclass=", sizeclass, " runtime·class_to_size=", class_to_size[sizeclass], "\n")
+			print("runtime: size=", n, " sizeclass=", sizeclass, " runtime·class_to_size=", class_to_size[sizeclass], "\n")
 			print("SizeToClass too big\n")
 			goto dump
 		}
@@ -155,18 +163,18 @@ func initSizes() {
 
 dump:
 	if true {
-		print("NumSizeClasses=", _NumSizeClasses, "\n")
+		print("runtime: NumSizeClasses=", _NumSizeClasses, "\n")
 		print("runtime·class_to_size:")
 		for sizeclass = 0; sizeclass < _NumSizeClasses; sizeclass++ {
 			print(" ", class_to_size[sizeclass], "")
 		}
 		print("\n\n")
-		print("size_to_class8:")
+		print("runtime: size_to_class8:")
 		for i := 0; i < len(size_to_class8); i++ {
 			print(" ", i*8, "=>", size_to_class8[i], "(", class_to_size[size_to_class8[i]], ")\n")
 		}
 		print("\n")
-		print("size_to_class128:")
+		print("runtime: size_to_class128:")
 		for i := 0; i < len(size_to_class128); i++ {
 			print(" ", i*128, "=>", size_to_class128[i], "(", class_to_size[size_to_class128[i]], ")\n")
 		}
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index 84a79e312c..2d75d2fef1 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -295,9 +295,9 @@ func updatememstats(stats *gcstats) {
 			memstats.nmalloc++
 			memstats.alloc += uint64(s.elemsize)
 		} else {
-			memstats.nmalloc += uint64(s.ref)
-			memstats.by_size[s.sizeclass].nmalloc += uint64(s.ref)
-			memstats.alloc += uint64(s.ref) * uint64(s.elemsize)
+			memstats.nmalloc += uint64(s.allocCount)
+			memstats.by_size[s.sizeclass].nmalloc += uint64(s.allocCount)
+			memstats.alloc += uint64(s.allocCount) * uint64(s.elemsize)
 		}
 	}
 	unlock(&mheap_.lock)
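The updatememstats hunk above now derives live-object counts from allocCount rather than walking a free list. A rough sketch of that accounting, using invented spanStat and countLive names instead of the runtime's types:

// Rough sketch (not the runtime's types) of the accounting change in
// updatememstats.
package main

import "fmt"

type spanStat struct {
	sizeclass  int
	elemsize   uint64
	allocCount uint64 // objects currently allocated in the span
}

// countLive sums allocations the way the patched loop does: large spans
// (sizeclass 0) count as a single object, small-object spans contribute
// allocCount objects of elemsize bytes each.
func countLive(spans []spanStat) (nmalloc, alloc uint64) {
	for _, s := range spans {
		if s.sizeclass == 0 {
			nmalloc++
			alloc += s.elemsize
			continue
		}
		nmalloc += s.allocCount
		alloc += s.allocCount * s.elemsize
	}
	return
}

func main() {
	spans := []spanStat{
		{sizeclass: 0, elemsize: 65536},               // one large object
		{sizeclass: 5, elemsize: 64, allocCount: 100}, // small objects
	}
	n, a := countLive(spans)
	fmt.Println("objects:", n, "bytes:", a)
}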
diff --git a/src/runtime/stack.go b/src/runtime/stack.go
index c4b1fb862e..f68c513fd6 100644
--- a/src/runtime/stack.go
+++ b/src/runtime/stack.go
@@ -191,26 +191,26 @@ func stackpoolalloc(order uint8) gclinkptr {
 		if s == nil {
 			throw("out of memory")
 		}
-		if s.ref != 0 {
-			throw("bad ref")
+		if s.allocCount != 0 {
+			throw("bad allocCount")
 		}
-		if s.freelist.ptr() != nil {
-			throw("bad freelist")
+		if s.stackfreelist.ptr() != nil {
+			throw("bad stackfreelist")
 		}
 		for i := uintptr(0); i < _StackCacheSize; i += _FixedStack << order {
-			x := gclinkptr(uintptr(s.start)<<_PageShift + i)
-			x.ptr().next = s.freelist
-			s.freelist = x
+			x := gclinkptr(s.base() + i)
+			x.ptr().next = s.stackfreelist
+			s.stackfreelist = x
 		}
 		list.insert(s)
 	}
-	x := s.freelist
+	x := s.stackfreelist
 	if x.ptr() == nil {
 		throw("span has no free stacks")
 	}
-	s.freelist = x.ptr().next
-	s.ref++
-	if s.freelist.ptr() == nil {
+	s.stackfreelist = x.ptr().next
+	s.allocCount++
+	if s.stackfreelist.ptr() == nil {
 		// all stacks in s are allocated.
 		list.remove(s)
 	}
@@ -223,14 +223,14 @@ func stackpoolfree(x gclinkptr, order uint8) {
 	if s.state != _MSpanStack {
 		throw("freeing stack not in a stack span")
 	}
-	if s.freelist.ptr() == nil {
+	if s.stackfreelist.ptr() == nil {
 		// s will now have a free stack
 		stackpool[order].insert(s)
 	}
-	x.ptr().next = s.freelist
-	s.freelist = x
-	s.ref--
-	if gcphase == _GCoff && s.ref == 0 {
+	x.ptr().next = s.stackfreelist
+	s.stackfreelist = x
+	s.allocCount--
+	if gcphase == _GCoff && s.allocCount == 0 {
 		// Span is completely free. Return it to the heap
 		// immediately if we're sweeping.
 		//
@@ -247,7 +247,7 @@ func stackpoolfree(x gclinkptr, order uint8) {
 		//
 		// By not freeing, we prevent step #4 until GC is done.
 		stackpool[order].remove(s)
-		s.freelist = 0
+		s.stackfreelist = 0
 		mheap_.freeStack(s)
 	}
 }
@@ -391,7 +391,7 @@ func stackalloc(n uint32) (stack, []stkbar) {
 				throw("out of memory")
 			}
 		}
-		v = unsafe.Pointer(s.start << _PageShift)
+		v = unsafe.Pointer(s.base())
 	}
 
 	if raceenabled {
@@ -456,7 +456,7 @@ func stackfree(stk stack, n uintptr) {
 	} else {
 		s := mheap_.lookup(v)
 		if s.state != _MSpanStack {
-			println(hex(s.start<<_PageShift), v)
+			println(hex(s.base()), v)
 			throw("bad span state")
 		}
 		if gcphase == _GCoff {
@@ -1136,9 +1136,9 @@ func freeStackSpans() {
 		list := &stackpool[order]
 		for s := list.first; s != nil; {
 			next := s.next
-			if s.ref == 0 {
+			if s.allocCount == 0 {
 				list.remove(s)
-				s.freelist = 0
+				s.stackfreelist = 0
 				mheap_.freeStack(s)
 			}
 			s = next