diff --git a/src/cmd/compile/internal/test/inl_test.go b/src/cmd/compile/internal/test/inl_test.go index 49ee88eaec..9926985c58 100644 --- a/src/cmd/compile/internal/test/inl_test.go +++ b/src/cmd/compile/internal/test/inl_test.go @@ -72,11 +72,7 @@ func TestIntendedInlining(t *testing.T) { "cgoInRange", "gclinkptr.ptr", "guintptr.ptr", - "heapBits.bits", - "heapBits.isPointer", - "heapBits.morePointers", - "heapBits.next", - "heapBitsForAddr", + "writeHeapBitsForAddr", "markBits.isMarked", "muintptr.ptr", "puintptr.ptr", @@ -224,6 +220,8 @@ func TestIntendedInlining(t *testing.T) { // On loong64, mips64x and riscv64, Ctz64 is not intrinsified and causes nextFreeFast too expensive // to inline (Issue 22239). want["runtime"] = append(want["runtime"], "nextFreeFast") + // Same behavior for heapBits.nextFast. + want["runtime"] = append(want["runtime"], "heapBits.nextFast") } if runtime.GOARCH != "386" { // As explained above, Ctz64 and Ctz32 are not Go code on 386. diff --git a/src/reflect/all_test.go b/src/reflect/all_test.go index 0e493fb5e2..aa620bf0ee 100644 --- a/src/reflect/all_test.go +++ b/src/reflect/all_test.go @@ -6989,8 +6989,21 @@ func TestFuncLayout(t *testing.T) { } } +// trimBitmap removes trailing 0 elements from b and returns the result. +func trimBitmap(b []byte) []byte { + for len(b) > 0 && b[len(b)-1] == 0 { + b = b[:len(b)-1] + } + return b +} + func verifyGCBits(t *testing.T, typ Type, bits []byte) { heapBits := GCBits(New(typ).Interface()) + + // Trim scalars at the end, as bits might end in zero, + // e.g. with rep(2, lit(1, 0)). + bits = trimBitmap(bits) + if !bytes.Equal(heapBits, bits) { _, _, line, _ := runtime.Caller(1) t.Errorf("line %d: heapBits incorrect for %v\nhave %v\nwant %v", line, typ, heapBits, bits) @@ -7007,12 +7020,10 @@ func verifyGCBitsSlice(t *testing.T, typ Type, cap int, bits []byte) { heapBits := GCBits(data.Interface()) // Repeat the bitmap for the slice size, trimming scalars in // the last element. - bits = rep(cap, bits) - for len(bits) > 0 && bits[len(bits)-1] == 0 { - bits = bits[:len(bits)-1] - } + bits = trimBitmap(rep(cap, bits)) if !bytes.Equal(heapBits, bits) { - t.Errorf("heapBits incorrect for make(%v, 0, %v)\nhave %v\nwant %v", typ, cap, heapBits, bits) + _, _, line, _ := runtime.Caller(1) + t.Errorf("line %d: heapBits incorrect for make(%v, 0, %v)\nhave %v\nwant %v", line, typ, cap, heapBits, bits) } } diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go index 892654ed5b..dd9de9d247 100644 --- a/src/runtime/cgocall.go +++ b/src/runtime/cgocall.go @@ -568,17 +568,16 @@ func cgoCheckUnknownPointer(p unsafe.Pointer, msg string) (base, i uintptr) { if base == 0 { return } - hbits := heapBitsForAddr(base) n := span.elemsize - for i = uintptr(0); i < n; i += goarch.PtrSize { - if !hbits.morePointers() { - // No more possible pointers. + hbits := heapBitsForAddr(base, n) + for { + var addr uintptr + if hbits, addr = hbits.next(); addr == 0 { break } - if hbits.isPointer() && cgoIsGoPointer(*(*unsafe.Pointer)(unsafe.Pointer(base + i))) { + if cgoIsGoPointer(*(*unsafe.Pointer)(unsafe.Pointer(addr))) { panic(errorString(msg)) } - hbits = hbits.next() } return diff --git a/src/runtime/cgocheck.go b/src/runtime/cgocheck.go index 6b492093ea..84e7516758 100644 --- a/src/runtime/cgocheck.go +++ b/src/runtime/cgocheck.go @@ -153,16 +153,16 @@ func cgoCheckTypedBlock(typ *_type, src unsafe.Pointer, off, size uintptr) { // src must be in the regular heap. 
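Aside on the reflect/all_test.go hunk above: with the 1-bit bitmap, GCBits stops reporting at the last pointer word, so expected bitmaps must have their trailing scalar entries trimmed before comparison. The following self-contained sketch only illustrates that arithmetic; lit and rep mirror the shape of the existing test helpers and are reproduced here purely for illustration.

package main

import "fmt"

// lit builds a literal bitmap, one byte per heap word (1=pointer, 0=scalar).
func lit(x ...byte) []byte { return x }

// rep repeats a bitmap n times, as for an array or slice of elements.
func rep(n int, b []byte) []byte {
	var r []byte
	for i := 0; i < n; i++ {
		r = append(r, b...)
	}
	return r
}

// trimBitmap removes trailing scalar entries, matching the helper added above.
func trimBitmap(b []byte) []byte {
	for len(b) > 0 && b[len(b)-1] == 0 {
		b = b[:len(b)-1]
	}
	return b
}

func main() {
	// rep(2, lit(1, 0)) describes two {pointer, scalar} elements.
	// GCBits no longer reports the final scalar word, so the expected
	// value must be trimmed to [1 0 1] before comparing.
	fmt.Println(trimBitmap(rep(2, lit(1, 0))))
}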
- hbits := heapBitsForAddr(uintptr(src)) - for i := uintptr(0); i < off+size; i += goarch.PtrSize { - bits := hbits.bits() - if i >= off && bits&bitPointer != 0 { - v := *(*unsafe.Pointer)(add(src, i)) - if cgoIsGoPointer(v) { - throw(cgoWriteBarrierFail) - } + hbits := heapBitsForAddr(uintptr(src), size) + for { + var addr uintptr + if hbits, addr = hbits.next(); addr == 0 { + break + } + v := *(*unsafe.Pointer)(unsafe.Pointer(addr)) + if cgoIsGoPointer(v) { + throw(cgoWriteBarrierFail) } - hbits = hbits.next() } } diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go index 0601e38f2a..a3d817105b 100644 --- a/src/runtime/heapdump.go +++ b/src/runtime/heapdump.go @@ -737,16 +737,16 @@ func makeheapobjbv(p uintptr, size uintptr) bitvector { for i := uintptr(0); i < nptr/8+1; i++ { tmpbuf[i] = 0 } - i := uintptr(0) - hbits := heapBitsForAddr(p) - for ; i < nptr; i++ { - if !hbits.morePointers() { - break // end of object + + hbits := heapBitsForAddr(p, size) + for { + var addr uintptr + hbits, addr = hbits.next() + if addr == 0 { + break } - if hbits.isPointer() { - tmpbuf[i/8] |= 1 << (i % 8) - } - hbits = hbits.next() + i := (addr - p) / goarch.PtrSize + tmpbuf[i/8] |= 1 << (i % 8) } - return bitvector{int32(i), &tmpbuf[0]} + return bitvector{int32(nptr), &tmpbuf[0]} } diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go index b044e29d95..0219401c83 100644 --- a/src/runtime/malloc.go +++ b/src/runtime/malloc.go @@ -247,13 +247,15 @@ const ( // memory. heapArenaBytes = 1 << logHeapArenaBytes + heapArenaWords = heapArenaBytes / goarch.PtrSize + // logHeapArenaBytes is log_2 of heapArenaBytes. For clarity, // prefer using heapArenaBytes where possible (we need the // constant to compute some other constants). logHeapArenaBytes = (6+20)*(_64bit*(1-goos.IsWindows)*(1-goarch.IsWasm)*(1-goos.IsIos*goarch.IsArm64)) + (2+20)*(_64bit*goos.IsWindows) + (2+20)*(1-_64bit) + (2+20)*goarch.IsWasm + (2+20)*goos.IsIos*goarch.IsArm64 - // heapArenaBitmapBytes is the size of each heap arena's bitmap. - heapArenaBitmapBytes = heapArenaBytes / (goarch.PtrSize * 8 / 2) + // heapArenaBitmapWords is the size of each heap arena's bitmap in uintptrs. + heapArenaBitmapWords = heapArenaWords / (8 * goarch.PtrSize) pagesPerArena = heapArenaBytes / pageSize @@ -353,10 +355,10 @@ func mallocinit() { throw("bad TinySizeClass") } - if heapArenaBitmapBytes&(heapArenaBitmapBytes-1) != 0 { + if heapArenaBitmapWords&(heapArenaBitmapWords-1) != 0 { // heapBits expects modular arithmetic on bitmap // addresses to work. - throw("heapArenaBitmapBytes not a power of 2") + throw("heapArenaBitmapWords not a power of 2") } // Check physPageSize. diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go index fcf59b8b3c..1c7ae8a68e 100644 --- a/src/runtime/mbitmap.go +++ b/src/runtime/mbitmap.go @@ -14,34 +14,30 @@ // // Heap bitmap // -// The heap bitmap comprises 2 bits for each pointer-sized word in the heap, -// stored in the heapArena metadata backing each heap arena. -// That is, if ha is the heapArena for the arena starting a start, -// then ha.bitmap[0] holds the 2-bit entries for the four words start -// through start+3*ptrSize, ha.bitmap[1] holds the entries for -// start+4*ptrSize through start+7*ptrSize, and so on. +// The heap bitmap comprises 1 bit for each pointer-sized word in the heap, +// recording whether a pointer is stored in that word or not. This bitmap +// is stored in the heapArena metadata backing each heap arena. 
+// That is, if ha is the heapArena for the arena starting at "start", +// then ha.bitmap[0] holds the 64 bits for the 64 words "start" +// through start+63*ptrSize, ha.bitmap[1] holds the entries for +// start+64*ptrSize through start+127*ptrSize, and so on. +// Bits correspond to words in little-endian order. ha.bitmap[0]&1 represents +// the word at "start", ha.bitmap[0]>>1&1 represents the word at start+8, etc. +// (For 32-bit platforms, s/64/32/.) // -// In each 2-bit entry, the lower bit is a pointer/scalar bit, just -// like in the stack/data bitmaps described above. The upper bit -// indicates scan/dead: a "1" value ("scan") indicates that there may -// be pointers in later words of the allocation, and a "0" value -// ("dead") indicates there are no more pointers in the allocation. If -// the upper bit is 0, the lower bit must also be 0, and this -// indicates scanning can ignore the rest of the allocation. +// We also keep a noMorePtrs bitmap which allows us to stop scanning +// the heap bitmap early in certain situations. If ha.noMorePtrs[i]>>j&1 +// is 1, then the object containing the last word described by ha.bitmap[8*i+j] +// has no more pointers beyond those described by ha.bitmap[8*i+j]. +// If ha.noMorePtrs[i]>>j&1 is set, the entries in ha.bitmap[8*i+j+1] and +// beyond must all be zero until the start of the next object. // -// The 2-bit entries are split when written into the byte, so that the top half -// of the byte contains 4 high (scan) bits and the bottom half contains 4 low -// (pointer) bits. This form allows a copy from the 1-bit to the 4-bit form to -// keep the pointer bits contiguous, instead of having to space them out. -// -// The code makes use of the fact that the zero value for a heap -// bitmap means scalar/dead. This property must be preserved when -// modifying the encoding. -// -// The bitmap for noscan spans is not maintained. Code must ensure -// that an object is scannable before consulting its bitmap by +// The bitmap for noscan spans is not maintained (can be junk). Code must +// ensure that an object is scannable before consulting its bitmap by // checking either the noscan bit in the span or by consulting its // type's information. +// +// The bitmap for unallocated objects is also not maintained. package runtime @@ -52,18 +48,6 @@ import ( "unsafe" ) -const ( - bitPointer = 1 << 0 - bitScan = 1 << 4 - - heapBitsShift = 1 // shift offset between successive bitPointer or bitScan entries - wordsPerBitmapByte = 8 / 2 // heap words described by one bitmap byte - - // all scan/pointer bits in a byte - bitScanAll = bitScan | bitScan<>63)*32)) @@ -433,121 +376,132 @@ func reflect_verifyNotInHeapPtr(p uintptr) bool { return spanOf(p) == nil && p != clobberdeadPtr } -// next returns the heapBits describing the next pointer-sized word in memory. -// That is, if h describes address p, h.next() describes p+ptrSize. +const ptrBits = 8 * goarch.PtrSize + +// heapBits provides access to the bitmap bits for a single heap word. +// The methods on heapBits take value receivers so that the compiler +// can more easily inline calls to those methods and registerize the +// struct fields independently. +type heapBits struct { + // heapBits will report on pointers in the range [addr,addr+size). + // The low bit of mask contains the pointerness of the word at addr + // (assuming valid>0). + addr, size uintptr + + // The next few pointer bits representing words starting at addr. + // Those bits already returned by next() are zeroed. 
+ mask uintptr + // Number of bits in mask that are valid. mask is always less than 1<> off + valid := ptrBits - off + + // Process depending on where the object ends. + nptr := size / goarch.PtrSize + if nptr < valid { + // Bits for this object end before the end of this bitmap word. + // Squash bits for the following objects. + mask &= 1<<(nptr&(ptrBits-1)) - 1 + valid = nptr + } else if nptr == valid { + // Bits for this object end at exactly the end of this bitmap word. + // All good. + } else { + // Bits for this object extend into the next bitmap word. See if there + // may be any pointers recorded there. + if uintptr(ha.noMorePtrs[idx/8])>>(idx%8)&1 != 0 { + // No more pointers in this object after this bitmap word. + // Update size so we know not to look there. + size = valid * goarch.PtrSize + } + } + + return heapBits{addr: addr, size: size, mask: mask, valid: valid} +} + +// Returns the (absolute) address of the next known pointer and +// a heapBits iterator representing any remaining pointers. +// If there are no more pointers, returns address 0. // Note that next does not modify h. The caller must record the result. // // nosplit because it is used during write barriers and must not be preempted. // //go:nosplit -func (h heapBits) next() heapBits { - if h.shift < 3*heapBitsShift { - h.shift += heapBitsShift - } else if h.bitp != h.last { - h.bitp, h.shift = add1(h.bitp), 0 +func (h heapBits) next() (heapBits, uintptr) { + for { + if h.mask != 0 { + var i int + if goarch.PtrSize == 8 { + i = sys.Ctz64(uint64(h.mask)) + } else { + i = sys.Ctz32(uint32(h.mask)) + } + h.mask ^= uintptr(1) << (i & (ptrBits - 1)) + return h, h.addr + uintptr(i)*goarch.PtrSize + } + + // Skip words that we've already processed. + h.addr += h.valid * goarch.PtrSize + h.size -= h.valid * goarch.PtrSize + if h.size == 0 { + return h, 0 // no more pointers + } + + // Grab more bits and try again. + h = heapBitsForAddr(h.addr, h.size) + } +} + +// nextFast is like next, but can return 0 even when there are more pointers +// to be found. Callers should call next if nextFast returns 0 as its second +// return value. +// if addr, h = h.nextFast(); addr == 0 { +// if addr, h = h.next(); addr == 0 { +// ... no more pointers ... +// } +// } +// ... process pointer at addr ... +// nextFast is designed to be inlineable. +// +//go:nosplit +func (h heapBits) nextFast() (heapBits, uintptr) { + // TESTQ/JEQ + if h.mask == 0 { + return h, 0 + } + // BSFQ + var i int + if goarch.PtrSize == 8 { + i = sys.Ctz64(uint64(h.mask)) } else { - // Move to the next arena. - return h.nextArena() + i = sys.Ctz32(uint32(h.mask)) } - return h -} - -// nextArena advances h to the beginning of the next heap arena. -// -// This is a slow-path helper to next. gc's inliner knows that -// heapBits.next can be inlined even though it calls this. This is -// marked noinline so it doesn't get inlined into next and cause next -// to be too big to inline. -// -//go:nosplit -//go:noinline -func (h heapBits) nextArena() heapBits { - h.arena++ - ai := arenaIdx(h.arena) - l2 := mheap_.arenas[ai.l1()] - if l2 == nil { - // We just passed the end of the object, which - // was also the end of the heap. Poison h. It - // should never be dereferenced at this point. - return heapBits{} - } - ha := l2[ai.l2()] - if ha == nil { - return heapBits{} - } - h.bitp, h.shift = &ha.bitmap[0], 0 - h.last = &ha.bitmap[len(ha.bitmap)-1] - return h -} - -// forward returns the heapBits describing n pointer-sized words ahead of h in memory. 
-// That is, if h describes address p, h.forward(n) describes p+n*ptrSize. -// h.forward(1) is equivalent to h.next(), just slower. -// Note that forward does not modify h. The caller must record the result. -// bits returns the heap bits for the current word. -// -//go:nosplit -func (h heapBits) forward(n uintptr) heapBits { - n += uintptr(h.shift) / heapBitsShift - nbitp := uintptr(unsafe.Pointer(h.bitp)) + n/4 - h.shift = uint32(n%4) * heapBitsShift - if nbitp <= uintptr(unsafe.Pointer(h.last)) { - h.bitp = (*uint8)(unsafe.Pointer(nbitp)) - return h - } - - // We're in a new heap arena. - past := nbitp - (uintptr(unsafe.Pointer(h.last)) + 1) - h.arena += 1 + uint32(past/heapArenaBitmapBytes) - ai := arenaIdx(h.arena) - if l2 := mheap_.arenas[ai.l1()]; l2 != nil && l2[ai.l2()] != nil { - a := l2[ai.l2()] - h.bitp = &a.bitmap[past%heapArenaBitmapBytes] - h.last = &a.bitmap[len(a.bitmap)-1] - } else { - h.bitp, h.last = nil, nil - } - return h -} - -// forwardOrBoundary is like forward, but stops at boundaries between -// contiguous sections of the bitmap. It returns the number of words -// advanced over, which will be <= n. -func (h heapBits) forwardOrBoundary(n uintptr) (heapBits, uintptr) { - maxn := 4 * ((uintptr(unsafe.Pointer(h.last)) + 1) - uintptr(unsafe.Pointer(h.bitp))) - if n > maxn { - n = maxn - } - return h.forward(n), n -} - -// The caller can test morePointers and isPointer by &-ing with bitScan and bitPointer. -// The result includes in its higher bits the bits for subsequent words -// described by the same bitmap byte. -// -// nosplit because it is used during write barriers and must not be preempted. -// -//go:nosplit -func (h heapBits) bits() uint32 { - // The (shift & 31) eliminates a test and conditional branch - // from the generated code. - return uint32(*h.bitp) >> (h.shift & 31) -} - -// morePointers reports whether this word and all remaining words in this object -// are scalars. -// h must not describe the second word of the object. -func (h heapBits) morePointers() bool { - return h.bits()&bitScan != 0 -} - -// isPointer reports whether the heap bits describe a pointer word. -// -// nosplit because it is used during write barriers and must not be preempted. 
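For orientation, every former morePointers/isPointer caller in this change is converted to the same idiom: take a heapBits value for the object and pull pointer addresses out of next() until it returns 0, with a count-trailing-zeros locating each set bit. The toy model below is only a sketch of that idiom outside the runtime; toyBits and its single 64-bit mask are illustrative stand-ins, not runtime types.

package main

import (
	"fmt"
	"math/bits"
)

const ptrSize = 8

// toyBits is an illustrative stand-in for the patch's heapBits iterator:
// bit i of mask set means the word at addr+i*ptrSize holds a pointer.
type toyBits struct {
	addr uintptr
	mask uint64
}

// next finds the lowest set bit with a count-trailing-zeros, clears it,
// and returns the corresponding word address; 0 means no more pointers.
func (h toyBits) next() (toyBits, uintptr) {
	if h.mask == 0 {
		return h, 0
	}
	i := bits.TrailingZeros64(h.mask)
	h.mask &^= 1 << uint(i)
	return h, h.addr + uintptr(i)*ptrSize
}

func main() {
	h := toyBits{addr: 0x1000, mask: 0b1011} // pointers in words 0, 1 and 3
	for {
		var addr uintptr
		if h, addr = h.next(); addr == 0 {
			break
		}
		fmt.Printf("pointer word at %#x\n", addr)
	}
}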
-// -//go:nosplit -func (h heapBits) isPointer() bool { - return h.bits()&bitPointer != 0 + // BTCQ + h.mask ^= uintptr(1) << (i & (ptrBits - 1)) + // LEAQ (XX)(XX*8) + return h, h.addr + uintptr(i)*goarch.PtrSize } // bulkBarrierPreWrite executes a write barrier @@ -611,27 +565,29 @@ func bulkBarrierPreWrite(dst, src, size uintptr) { } buf := &getg().m.p.ptr().wbBuf - h := heapBitsForAddr(dst) + h := heapBitsForAddr(dst, size) if src == 0 { - for i := uintptr(0); i < size; i += goarch.PtrSize { - if h.isPointer() { - dstx := (*uintptr)(unsafe.Pointer(dst + i)) - if !buf.putFast(*dstx, 0) { - wbBufFlush(nil, 0) - } + for { + var addr uintptr + if h, addr = h.next(); addr == 0 { + break + } + dstx := (*uintptr)(unsafe.Pointer(addr)) + if !buf.putFast(*dstx, 0) { + wbBufFlush(nil, 0) } - h = h.next() } } else { - for i := uintptr(0); i < size; i += goarch.PtrSize { - if h.isPointer() { - dstx := (*uintptr)(unsafe.Pointer(dst + i)) - srcx := (*uintptr)(unsafe.Pointer(src + i)) - if !buf.putFast(*dstx, *srcx) { - wbBufFlush(nil, 0) - } + for { + var addr uintptr + if h, addr = h.next(); addr == 0 { + break + } + dstx := (*uintptr)(unsafe.Pointer(addr)) + srcx := (*uintptr)(unsafe.Pointer(src + (addr - dst))) + if !buf.putFast(*dstx, *srcx) { + wbBufFlush(nil, 0) } - h = h.next() } } } @@ -654,15 +610,16 @@ func bulkBarrierPreWriteSrcOnly(dst, src, size uintptr) { return } buf := &getg().m.p.ptr().wbBuf - h := heapBitsForAddr(dst) - for i := uintptr(0); i < size; i += goarch.PtrSize { - if h.isPointer() { - srcx := (*uintptr)(unsafe.Pointer(src + i)) - if !buf.putFast(0, *srcx) { - wbBufFlush(nil, 0) - } + h := heapBitsForAddr(dst, size) + for { + var addr uintptr + if h, addr = h.next(); addr == 0 { + break + } + srcx := (*uintptr)(unsafe.Pointer(addr - dst + src)) + if !buf.putFast(0, *srcx) { + wbBufFlush(nil, 0) } - h = h.next() } } @@ -759,43 +716,21 @@ func typeBitsBulkBarrier(typ *_type, dst, src, size uintptr) { } } -// The methods operating on spans all require that h has been returned -// by heapBitsForSpan and that size, n, total are the span layout description -// returned by the mspan's layout method. -// If total > size*n, it means that there is extra leftover memory in the span, -// usually due to rounding. -// -// TODO(rsc): Perhaps introduce a different heapBitsSpan type. - -// initSpan initializes the heap bitmap for a span. -// If this is a span of pointer-sized objects, it initializes all -// words to pointer/scan. -// Otherwise, it initializes all words to scalar/dead. -func (h heapBits) initSpan(s *mspan) { - // Clear bits corresponding to objects. - nw := (s.npages << _PageShift) / goarch.PtrSize - if nw%wordsPerBitmapByte != 0 { - throw("initSpan: unaligned length") - } - if h.shift != 0 { - throw("initSpan: unaligned base") - } +// initHeapBits initializes the heap bitmap for a span. +// If this is a span of single pointer allocations, it initializes all +// words to pointer. 
+func (s *mspan) initHeapBits() { isPtrs := goarch.PtrSize == 8 && s.elemsize == goarch.PtrSize - for nw > 0 { - hNext, anw := h.forwardOrBoundary(nw) - nbyte := anw / wordsPerBitmapByte - if isPtrs { - bitp := h.bitp - for i := uintptr(0); i < nbyte; i++ { - *bitp = bitPointerAll | bitScanAll - bitp = add1(bitp) - } - } else { - memclrNoHeapPointers(unsafe.Pointer(h.bitp), nbyte) - } - h = hNext - nw -= anw + if !isPtrs { + return // nothing to do } + h := writeHeapBitsForAddr(s.base()) + size := s.npages * pageSize + nptrs := size / goarch.PtrSize + for i := uintptr(0); i < nptrs; i += ptrBits { + h = h.write(^uintptr(0), ptrBits) + } + h.flush(s.base(), size) } // countAlloc returns the number of objects allocated in span s by @@ -818,6 +753,146 @@ func (s *mspan) countAlloc() int { return count } +type writeHeapBits struct { + addr uintptr // address that the low bit of mask represents the pointer state of. + mask uintptr // some pointer bits starting at the address addr. + valid uintptr // number of bits in buf that are valid (including low) + low uintptr // number of low-order bits to not overwrite +} + +func writeHeapBitsForAddr(addr uintptr) (h writeHeapBits) { + // We start writing bits maybe in the middle of a heap bitmap word. + // Remember how many bits into the word we started, so we can be sure + // not to overwrite the previous bits. + h.low = addr / goarch.PtrSize % ptrBits + + // round down to heap word that starts the bitmap word. + h.addr = addr - h.low*goarch.PtrSize + + // We don't have any bits yet. + h.mask = 0 + h.valid = h.low + + return +} + +// write appends the pointerness of the next valid pointer slots +// using the low valid bits of bits. 1=pointer, 0=scalar. +func (h writeHeapBits) write(bits, valid uintptr) writeHeapBits { + if h.valid+valid <= ptrBits { + // Fast path - just accumulate the bits. + h.mask |= bits << h.valid + h.valid += valid + return h + } + // Too many bits to fit in this word. Write the current word + // out and move on to the next word. + + data := h.mask | bits<> (ptrBits - h.valid) // leftover for next word + h.valid += valid - ptrBits // have h.valid+valid bits, writing ptrBits of them + + // Flush mask to the memory bitmap. + // TODO: figure out how to cache arena lookup. + ai := arenaIndex(h.addr) + ha := mheap_.arenas[ai.l1()][ai.l2()] + idx := h.addr / (ptrBits * goarch.PtrSize) % heapArenaBitmapWords + m := uintptr(1)< ptrBits { + h = h.write(0, ptrBits) + words -= ptrBits + } + return h.write(0, words) +} + +// Flush the bits that have been written, and add zeros as needed +// to cover the full object [addr, addr+size). +func (h writeHeapBits) flush(addr, size uintptr) { + // zeros counts the number of bits needed to represent the object minus the + // number of bits we've already written. This is the number of 0 bits + // that need to be added. + zeros := (addr+size-h.addr)/goarch.PtrSize - h.valid + + // Add zero bits up to the bitmap word boundary + if zeros > 0 { + z := ptrBits - h.valid + if z > zeros { + z = zeros + } + h.valid += z + zeros -= z + } + + // Find word in bitmap that we're going to write. + ai := arenaIndex(h.addr) + ha := mheap_.arenas[ai.l1()][ai.l2()] + idx := h.addr / (ptrBits * goarch.PtrSize) % heapArenaBitmapWords + + // Write remaining bits. + if h.valid != h.low { + m := uintptr(1)<> 1 - - // For h.shift > 1 heap bits cross a byte boundary and need to be written part - // to h.bitp and part to the next h.bitp. 
- switch h.shift { - case 0: - *h.bitp &^= mask3 << 0 - *h.bitp |= hb << 0 - case 1: - *h.bitp &^= mask3 << 1 - *h.bitp |= hb << 1 - case 2: - *h.bitp &^= mask2 << 2 - *h.bitp |= (hb & mask2) << 2 - // Two words written to the first byte. - // Advance two words to get to the next byte. - h = h.next().next() - *h.bitp &^= mask1 - *h.bitp |= (hb >> 2) & mask1 - case 3: - *h.bitp &^= mask1 << 3 - *h.bitp |= (hb & mask1) << 3 - // One word written to the first byte. - // Advance one word to get to the next byte. - h = h.next() - *h.bitp &^= mask2 - *h.bitp |= (hb >> 1) & mask2 - } - return - } - - // Copy from 1-bit ptrmask into 2-bit bitmap. - // The basic approach is to use a single uintptr as a bit buffer, - // alternating between reloading the buffer and writing bitmap bytes. - // In general, one load can supply two bitmap byte writes. - // This is a lot of lines of code, but it compiles into relatively few - // machine instructions. - - outOfPlace := false - if arenaIndex(x+size-1) != arenaIdx(h.arena) || (doubleCheck && fastrandn(2) == 0) { - // This object spans heap arenas, so the bitmap may be - // discontiguous. Unroll it into the object instead - // and then copy it out. - // - // In doubleCheck mode, we randomly do this anyway to - // stress test the bitmap copying path. - outOfPlace = true - h.bitp = (*uint8)(unsafe.Pointer(x)) - h.last = nil - } - - var ( - // Ptrmask input. - p *byte // last ptrmask byte read - b uintptr // ptrmask bits already loaded - nb uintptr // number of bits in b at next read - endp *byte // final ptrmask byte to read (then repeat) - endnb uintptr // number of valid bits in *endp - pbits uintptr // alternate source of bits - - // Heap bitmap output. - w uintptr // words processed - nw uintptr // number of words to process - hbitp *byte // next heap bitmap byte to write - hb uintptr // bits being prepared for *hbitp - ) - - hbitp = h.bitp - - // Handle GC program. Delayed until this part of the code - // so that we can use the same double-checking mechanism - // as the 1-bit case. Nothing above could have encountered - // GC programs: the cases were all too small. + // Handle GC program. if typ.kind&kindGCProg != 0 { - heapBitsSetTypeGCProg(h, typ.ptrdata, typ.size, dataSize, size, addb(typ.gcdata, 4)) - if doubleCheck { - // Double-check the heap bits written by GC program - // by running the GC program to create a 1-bit pointer mask - // and then jumping to the double-check code below. - // This doesn't catch bugs shared between the 1-bit and 4-bit - // GC program execution, but it does catch mistakes specific - // to just one of those and bugs in heapBitsSetTypeGCProg's - // implementation of arrays. - lock(&debugPtrmask.lock) - if debugPtrmask.data == nil { - debugPtrmask.data = (*byte)(persistentalloc(1<<20, 1, &memstats.other_sys)) + // Expand the gc program into the storage we're going to use for the actual object. + obj := (*uint8)(unsafe.Pointer(x)) + n := runGCProg(addb(typ.gcdata, 4), obj) + // Use the expanded program to set the heap bits. + for i := uintptr(0); true; i += typ.size { + // Copy expanded program to heap bitmap. + p := obj + j := n + for j > 8 { + h = h.write(uintptr(*p), 8) + p = add1(p) + j -= 8 } - ptrmask = debugPtrmask.data - runGCProg(addb(typ.gcdata, 4), nil, ptrmask, 1) + h = h.write(uintptr(*p), j) + + if i+typ.size == dataSize { + break // no padding after last element + } + + // Pad with zeros to the start of the next element. 
+ h = h.pad(typ.size - n*goarch.PtrSize) } - goto Phase4 + + h.flush(x, size) + + // Erase the expanded GC program. + memclrNoHeapPointers(unsafe.Pointer(obj), (n+7)/8) + return } // Note about sizes: @@ -1061,424 +987,52 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) { // to scan the buffer's heap bitmap at all. // The 1-bit ptrmasks are sized to contain only bits for // the typ.ptrdata prefix, zero padded out to a full byte - // of bitmap. This code sets nw (below) so that heap bitmap - // bits are only written for the typ.ptrdata prefix; if there is - // more room in the allocated object, the next heap bitmap - // entry is a 00, indicating that there are no more pointers - // to scan. So only the ptrmask for the ptrdata bytes is needed. + // of bitmap. If there is more room in the allocated object, + // that space is pointerless. The noMorePtrs bitmap will prevent + // scanning large pointerless tails of an object. // // Replicated copies are not as nice: if there is an array of // objects with scalar tails, all but the last tail does have to // be initialized, because there is no way to say "skip forward". - // However, because of the possibility of a repeated type with - // size not a multiple of 4 pointers (one heap bitmap byte), - // the code already must handle the last ptrmask byte specially - // by treating it as containing only the bits for endnb pointers, - // where endnb <= 4. We represent large scalar tails that must - // be expanded in the replication by setting endnb larger than 4. - // This will have the effect of reading many bits out of b, - // but once the real bits are shifted out, b will supply as many - // zero bits as we try to read, which is exactly what we need. - p = ptrmask - if typ.size < dataSize { - // Filling in bits for an array of typ. - // Set up for repetition of ptrmask during main loop. - // Note that ptrmask describes only a prefix of - const maxBits = goarch.PtrSize*8 - 7 - if typ.ptrdata/goarch.PtrSize <= maxBits { - // Entire ptrmask fits in uintptr with room for a byte fragment. - // Load into pbits and never read from ptrmask again. - // This is especially important when the ptrmask has - // fewer than 8 bits in it; otherwise the reload in the middle - // of the Phase 2 loop would itself need to loop to gather - // at least 8 bits. - - // Accumulate ptrmask into b. - // ptrmask is sized to describe only typ.ptrdata, but we record - // it as describing typ.size bytes, since all the high bits are zero. - nb = typ.ptrdata / goarch.PtrSize - for i := uintptr(0); i < nb; i += 8 { - b |= uintptr(*p) << i - p = add1(p) - } - nb = typ.size / goarch.PtrSize - - // Replicate ptrmask to fill entire pbits uintptr. - // Doubling and truncating is fewer steps than - // iterating by nb each time. (nb could be 1.) - // Since we loaded typ.ptrdata/goarch.PtrSize bits - // but are pretending to have typ.size/goarch.PtrSize, - // there might be no replication necessary/possible. - pbits = b - endnb = nb - if nb+nb <= maxBits { - for endnb <= goarch.PtrSize*8 { - pbits |= pbits << endnb - endnb += endnb - } - // Truncate to a multiple of original ptrmask. - // Because nb+nb <= maxBits, nb fits in a byte. - // Byte division is cheaper than uintptr division. - endnb = uintptr(maxBits/byte(nb)) * nb - pbits &= 1<= nw { - goto Phase3 - } - *hbitp = uint8(hb) - hbitp = add1(hbitp) - b >>= 4 - nb -= 4 - - case h.shift == 2: - // Ptrmask and heap bitmap are misaligned. 
- // - // On 32 bit architectures only the 6-word object that corresponds - // to a 24 bytes size class can start with h.shift of 2 here since - // all other non 16 byte aligned size classes have been handled by - // special code paths at the beginning of heapBitsSetType on 32 bit. - // - // Many size classes are only 16 byte aligned. On 64 bit architectures - // this results in a heap bitmap position starting with a h.shift of 2. - // - // The bits for the first two words are in a byte shared - // with another object, so we must be careful with the bits - // already there. - // - // We took care of 1-word, 2-word, and 3-word objects above, - // so this is at least a 6-word object. - hb = (b & (bitPointer | bitPointer< 1 { - hb |= bitScan << (3 * heapBitsShift) - } - b >>= 2 - nb -= 2 - *hbitp &^= uint8((bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << (2 * heapBitsShift)) - *hbitp |= uint8(hb) - hbitp = add1(hbitp) - if w += 2; w >= nw { - // We know that there is more data, because we handled 2-word and 3-word objects above. - // This must be at least a 6-word object. If we're out of pointer words, - // mark no scan in next bitmap byte and finish. - hb = 0 - w += 4 - goto Phase3 - } - } - - // Phase 2: Full bytes in bitmap, up to but not including write to last byte (full or partial) in bitmap. - // The loop computes the bits for that last write but does not execute the write; - // it leaves the bits in hb for processing by phase 3. - // To avoid repeated adjustment of nb, we subtract out the 4 bits we're going to - // use in the first half of the loop right now, and then we only adjust nb explicitly - // if the 8 bits used by each iteration isn't balanced by 8 bits loaded mid-loop. - nb -= 4 - for { - // Emit bitmap byte. - // b has at least nb+4 bits, with one exception: - // if w+4 >= nw, then b has only nw-w bits, - // but we'll stop at the break and then truncate - // appropriately in Phase 3. - hb = b & bitPointerAll - hb |= bitScanAll - if w += 4; w >= nw { - break - } - *hbitp = uint8(hb) - hbitp = add1(hbitp) - b >>= 4 - - // Load more bits. b has nb right now. - if p != endp { - // Fast path: keep reading from ptrmask. - // nb unmodified: we just loaded 8 bits, - // and the next iteration will consume 8 bits, - // leaving us with the same nb the next time we're here. - if nb < 8 { - b |= uintptr(*p) << nb - p = add1(p) - } else { - // Reduce the number of bits in b. - // This is important if we skipped - // over a scalar tail, since nb could - // be larger than the bit width of b. - nb -= 8 - } - } else if p == nil { - // Almost as fast path: track bit count and refill from pbits. - // For short repetitions. - if nb < 8 { - b |= pbits << nb - nb += endnb - } - nb -= 8 // for next iteration - } else { - // Slow path: reached end of ptrmask. - // Process final partial byte and rewind to start. - b |= uintptr(*p) << nb - nb += endnb - if nb < 8 { - b |= uintptr(*ptrmask) << nb - p = add1(ptrmask) - } else { - nb -= 8 - p = ptrmask - } - } - - // Emit bitmap byte. - hb = b & bitPointerAll - hb |= bitScanAll - if w += 4; w >= nw { - break - } - *hbitp = uint8(hb) - hbitp = add1(hbitp) - b >>= 4 - } - -Phase3: - // Phase 3: Write last byte or partial byte and zero the rest of the bitmap entries. - if w > nw { - // Counting the 4 entries in hb not yet written to memory, - // there are more entries than possible pointer slots. - // Discard the excess entries (can't be more than 3). 
- mask := uintptr(1)<<(4-(w-nw)) - 1 - hb &= mask | mask<<4 // apply mask to both pointer bits and scan bits - } - - // Change nw from counting possibly-pointer words to total words in allocation. - nw = size / goarch.PtrSize - - // Write whole bitmap bytes. - // The first is hb, the rest are zero. - if w <= nw { - *hbitp = uint8(hb) - hbitp = add1(hbitp) - hb = 0 // for possible final half-byte below - for w += 4; w <= nw; w += 4 { - *hbitp = 0 - hbitp = add1(hbitp) - } - } - - // Write final partial bitmap byte if any. - // We know w > nw, or else we'd still be in the loop above. - // It can be bigger only due to the 4 entries in hb that it counts. - // If w == nw+4 then there's nothing left to do: we wrote all nw entries - // and can discard the 4 sitting in hb. - // But if w == nw+2, we need to write first two in hb. - // The byte is shared with the next object, so be careful with - // existing bits. - if w == nw+2 { - *hbitp = *hbitp&^(bitPointer|bitScan|(bitPointer|bitScan)<= 4 { - // This loop processes four words at a time, - // so round cnw down accordingly. - hNext, words := h.forwardOrBoundary(cnw / 4 * 4) - - // n is the number of bitmap bytes to copy. - n := words / 4 - memmove(unsafe.Pointer(h.bitp), unsafe.Pointer(src), n) - cnw -= words - h = hNext - src = addb(src, n) - } - if doubleCheck && h.shift != 0 { - print("cnw=", cnw, " h.shift=", h.shift, "\n") - throw("bad shift after block copy") - } - // Handle the last byte if it's shared. - if cnw == 2 { - *h.bitp = *h.bitp&^(bitPointer|bitScan|(bitPointer|bitScan)< x+size { - throw("copy exceeded object size") - } - if !(cnw == 0 || cnw == 2) { - print("x=", x, " size=", size, " cnw=", cnw, "\n") - throw("bad number of remaining words") - } - // Set up hbitp so doubleCheck code below can check it. - hbitp = h.bitp - } - // Zero the object where we wrote the bitmap. - memclrNoHeapPointers(unsafe.Pointer(x), uintptr(unsafe.Pointer(src))-x) - } - - // Double check the whole bitmap. if doubleCheck { - // x+size may not point to the heap, so back up one - // word and then advance it the way we do above. - end := heapBitsForAddr(x + size - goarch.PtrSize) - if outOfPlace { - // In out-of-place copying, we just advance - // using next. - end = end.next() - } else { - // Don't use next because that may advance to - // the next arena and the in-place logic - // doesn't do that. - end.shift += heapBitsShift - if end.shift == 4*heapBitsShift { - end.bitp, end.shift = add1(end.bitp), 0 + h := heapBitsForAddr(x, size) + for i := uintptr(0); i < size; i += goarch.PtrSize { + // Compute the pointer bit we want at offset i. 
+ want := false + if i < dataSize { + off := i % typ.size + if off < typ.ptrdata { + j := off / goarch.PtrSize + want = *addb(typ.gcdata, j/8)>>(j%8)&1 != 0 + } + } + if want { + var addr uintptr + h, addr = h.next() + if addr != x+i { + throw("heapBitsSetType: pointer entry not correct") + } } } - if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) { - println("ended at wrong bitmap byte for", typ.string(), "x", dataSize/typ.size) - print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n") - print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n") - h0 := heapBitsForAddr(x) - print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n") - print("ended at hbitp=", hbitp, " but next starts at bitp=", end.bitp, " shift=", end.shift, "\n") - throw("bad heapBitsSetType") - } - - // Double-check that bits to be written were written correctly. - // Does not check that other bits were not written, unfortunately. - h := heapBitsForAddr(x) - nptr := typ.ptrdata / goarch.PtrSize - ndata := typ.size / goarch.PtrSize - count := dataSize / typ.size - totalptr := ((count-1)*typ.size + typ.ptrdata) / goarch.PtrSize - for i := uintptr(0); i < size/goarch.PtrSize; i++ { - j := i % ndata - var have, want uint8 - have = (*h.bitp >> h.shift) & (bitPointer | bitScan) - if i >= totalptr { - if typ.kind&kindGCProg != 0 && i < (totalptr+3)/4*4 { - // heapBitsSetTypeGCProg always fills - // in full nibbles of bitScan. - want = bitScan - } - } else { - if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 { - want |= bitPointer - } - want |= bitScan - } - if have != want { - println("mismatch writing bits for", typ.string(), "x", dataSize/typ.size) - print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n") - print("kindGCProg=", typ.kind&kindGCProg != 0, " outOfPlace=", outOfPlace, "\n") - print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n") - h0 := heapBitsForAddr(x) - print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n") - print("current bits h.bitp=", h.bitp, " h.shift=", h.shift, " *h.bitp=", hex(*h.bitp), "\n") - print("ptrmask=", ptrmask, " p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n") - println("at word", i, "offset", i*goarch.PtrSize, "have", hex(have), "want", hex(want)) - if typ.kind&kindGCProg != 0 { - println("GC program:") - dumpGCProg(addb(typ.gcdata, 4)) - } - throw("bad heapBitsSetType") - } - h = h.next() - } - if ptrmask == debugPtrmask.data { - unlock(&debugPtrmask.lock) + if _, addr := h.next(); addr != 0 { + throw("heapBitsSetType: extra pointer") } } } @@ -1488,92 +1042,6 @@ var debugPtrmask struct { data *byte } -// heapBitsSetTypeGCProg implements heapBitsSetType using a GC program. -// progSize is the size of the memory described by the program. -// elemSize is the size of the element that the GC program describes (a prefix of). -// dataSize is the total size of the intended data, a multiple of elemSize. -// allocSize is the total size of the allocated memory. -// -// GC programs are only used for large allocations. -// heapBitsSetType requires that allocSize is a multiple of 4 words, -// so that the relevant bitmap bytes are not shared with surrounding -// objects. 
-func heapBitsSetTypeGCProg(h heapBits, progSize, elemSize, dataSize, allocSize uintptr, prog *byte) { - if goarch.PtrSize == 8 && allocSize%(4*goarch.PtrSize) != 0 { - // Alignment will be wrong. - throw("heapBitsSetTypeGCProg: small allocation") - } - var totalBits uintptr - if elemSize == dataSize { - totalBits = runGCProg(prog, nil, h.bitp, 2) - if totalBits*goarch.PtrSize != progSize { - println("runtime: heapBitsSetTypeGCProg: total bits", totalBits, "but progSize", progSize) - throw("heapBitsSetTypeGCProg: unexpected bit count") - } - } else { - count := dataSize / elemSize - - // Piece together program trailer to run after prog that does: - // literal(0) - // repeat(1, elemSize-progSize-1) // zeros to fill element size - // repeat(elemSize, count-1) // repeat that element for count - // This zero-pads the data remaining in the first element and then - // repeats that first element to fill the array. - var trailer [40]byte // 3 varints (max 10 each) + some bytes - i := 0 - if n := elemSize/goarch.PtrSize - progSize/goarch.PtrSize; n > 0 { - // literal(0) - trailer[i] = 0x01 - i++ - trailer[i] = 0 - i++ - if n > 1 { - // repeat(1, n-1) - trailer[i] = 0x81 - i++ - n-- - for ; n >= 0x80; n >>= 7 { - trailer[i] = byte(n | 0x80) - i++ - } - trailer[i] = byte(n) - i++ - } - } - // repeat(elemSize/ptrSize, count-1) - trailer[i] = 0x80 - i++ - n := elemSize / goarch.PtrSize - for ; n >= 0x80; n >>= 7 { - trailer[i] = byte(n | 0x80) - i++ - } - trailer[i] = byte(n) - i++ - n = count - 1 - for ; n >= 0x80; n >>= 7 { - trailer[i] = byte(n | 0x80) - i++ - } - trailer[i] = byte(n) - i++ - trailer[i] = 0 - i++ - - runGCProg(prog, &trailer[0], h.bitp, 2) - - // Even though we filled in the full array just now, - // record that we only filled in up to the ptrdata of the - // last element. This will cause the code below to - // memclr the dead section of the final array element, - // so that scanobject can stop early in the final element. - totalBits = (elemSize*(count-1) + progSize) / goarch.PtrSize - } - endProg := unsafe.Pointer(addb(h.bitp, (totalBits+3)/4)) - endAlloc := unsafe.Pointer(addb(h.bitp, allocSize/goarch.PtrSize/wordsPerBitmapByte)) - memclrNoHeapPointers(endProg, uintptr(endAlloc)-uintptr(endProg)) -} - // progToPointerMask returns the 1-bit pointer mask output by the GC program prog. // size the size of the region described by prog, in bytes. // The resulting bitvector will have no more than size/goarch.PtrSize bits. @@ -1581,7 +1049,7 @@ func progToPointerMask(prog *byte, size uintptr) bitvector { n := (size/goarch.PtrSize + 7) / 8 x := (*[1 << 30]byte)(persistentalloc(n+1, 1, &memstats.buckhash_sys))[:n+1] x[len(x)-1] = 0xa1 // overflow check sentinel - n = runGCProg(prog, nil, &x[0], 1) + n = runGCProg(prog, &x[0]) if x[len(x)-1] != 0xa1 { throw("progToPointerMask: overflow") } @@ -1602,15 +1070,8 @@ func progToPointerMask(prog *byte, size uintptr) bitvector { // 10000000 n c: repeat the previous n bits c times; n, c are varints // 1nnnnnnn c: repeat the previous n bits c times; c is a varint -// runGCProg executes the GC program prog, and then trailer if non-nil, -// writing to dst with entries of the given size. -// If size == 1, dst is a 1-bit pointer mask laid out moving forward from dst. -// If size == 2, dst is the 2-bit heap bitmap, and writes move backward -// starting at dst (because the heap bitmap does). In this case, the caller guarantees -// that only whole bytes in dst need to be written. 
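To make the new write path in heapBitsSetType (earlier in this file) easier to follow: pointer bits are accumulated per object with write, interior scalar tails of array elements are filled in with pad, and only the final tail is left for flush and the noMorePtrs bitmap. The model below is a simplified illustration of that accumulation only; toyWriter just collects one byte per heap word instead of packing bits into heapArena.bitmap.

package main

import "fmt"

// toyWriter is an illustrative stand-in for the patch's writeHeapBits:
// it records one byte per heap word (1=pointer, 0=scalar).
type toyWriter struct {
	bits []byte
}

// write appends the low 'valid' bits of mask, one per word.
func (w *toyWriter) write(mask uint64, valid int) {
	for i := 0; i < valid; i++ {
		w.bits = append(w.bits, byte(mask>>uint(i)&1))
	}
}

// pad appends scalar (zero) words, as between array elements.
func (w *toyWriter) pad(words int) { w.write(0, words) }

func main() {
	// An element laid out as {pointer, pointer, scalar, scalar}: the
	// ptrmask covers only the 2-word ptrdata prefix.
	const elemWords, ptrWords = 4, 2
	const mask uint64 = 0b11

	var w toyWriter
	const count = 3 // elements in the allocation
	for i := 0; i < count; i++ {
		w.write(mask, ptrWords)
		if i != count-1 {
			// Interior scalar tails must be written explicitly;
			// there is no way to say "skip forward" mid-object.
			w.pad(elemWords - ptrWords)
		}
	}
	// The final tail is left implicit (flush/noMorePtrs), so the
	// recorded bits end at the last pointer: [1 1 0 0 1 1 0 0 1 1]
	fmt.Println(w.bits)
}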
-// -// runGCProg returns the number of 1- or 2-bit entries written to memory. -func runGCProg(prog, trailer, dst *byte, size int) uintptr { +// runGCProg returns the number of 1-bit entries written to memory. +func runGCProg(prog, dst *byte) uintptr { dstStart := dst // Bits waiting to be written to memory. @@ -1623,20 +1084,9 @@ Run: // Flush accumulated full bytes. // The rest of the loop assumes that nbits <= 7. for ; nbits >= 8; nbits -= 8 { - if size == 1 { - *dst = uint8(bits) - dst = add1(dst) - bits >>= 8 - } else { - v := bits&bitPointerAll | bitScanAll - *dst = uint8(v) - dst = add1(dst) - bits >>= 4 - v = bits&bitPointerAll | bitScanAll - *dst = uint8(v) - dst = add1(dst) - bits >>= 4 - } + *dst = uint8(bits) + dst = add1(dst) + bits >>= 8 } // Process one instruction. @@ -1646,32 +1096,16 @@ Run: if inst&0x80 == 0 { // Literal bits; n == 0 means end of program. if n == 0 { - // Program is over; continue in trailer if present. - if trailer != nil { - p = trailer - trailer = nil - continue - } + // Program is over. break Run } nbyte := n / 8 for i := uintptr(0); i < nbyte; i++ { bits |= uintptr(*p) << nbits p = add1(p) - if size == 1 { - *dst = uint8(bits) - dst = add1(dst) - bits >>= 8 - } else { - v := bits&0xf | bitScanAll - *dst = uint8(v) - dst = add1(dst) - bits >>= 4 - v = bits&0xf | bitScanAll - *dst = uint8(v) - dst = add1(dst) - bits >>= 4 - } + *dst = uint8(bits) + dst = add1(dst) + bits >>= 8 } if n %= 8; n > 0 { bits |= uintptr(*p) << nbits @@ -1720,22 +1154,12 @@ Run: npattern := nbits // If we need more bits, fetch them from memory. - if size == 1 { + src = subtract1(src) + for npattern < n { + pattern <<= 8 + pattern |= uintptr(*src) src = subtract1(src) - for npattern < n { - pattern <<= 8 - pattern |= uintptr(*src) - src = subtract1(src) - npattern += 8 - } - } else { - src = subtract1(src) - for npattern < n { - pattern <<= 4 - pattern |= uintptr(*src) & 0xf - src = subtract1(src) - npattern += 4 - } + npattern += 8 } // We started with the whole bit output buffer, @@ -1785,20 +1209,11 @@ Run: for ; c >= npattern; c -= npattern { bits |= pattern << nbits nbits += npattern - if size == 1 { - for nbits >= 8 { - *dst = uint8(bits) - dst = add1(dst) - bits >>= 8 - nbits -= 8 - } - } else { - for nbits >= 4 { - *dst = uint8(bits&0xf | bitScanAll) - dst = add1(dst) - bits >>= 4 - nbits -= 4 - } + for nbits >= 8 { + *dst = uint8(bits) + dst = add1(dst) + bits >>= 8 + nbits -= 8 } } @@ -1815,75 +1230,38 @@ Run: // Since nbits <= 7, we know the first few bytes of repeated data // are already written to memory. off := n - nbits // n > nbits because n > maxBits and nbits <= 7 - if size == 1 { - // Leading src fragment. - src = subtractb(src, (off+7)/8) - if frag := off & 7; frag != 0 { - bits |= uintptr(*src) >> (8 - frag) << nbits - src = add1(src) - nbits += frag - c -= frag - } - // Main loop: load one byte, write another. - // The bits are rotating through the bit buffer. - for i := c / 8; i > 0; i-- { - bits |= uintptr(*src) << nbits - src = add1(src) - *dst = uint8(bits) - dst = add1(dst) - bits >>= 8 - } - // Final src fragment. - if c %= 8; c > 0 { - bits |= (uintptr(*src) & (1<> (4 - frag) << nbits - src = add1(src) - nbits += frag - c -= frag - } - // Main loop: load one byte, write another. - // The bits are rotating through the bit buffer. - for i := c / 4; i > 0; i-- { - bits |= (uintptr(*src) & 0xf) << nbits - src = add1(src) - *dst = uint8(bits&0xf | bitScanAll) - dst = add1(dst) - bits >>= 4 - } - // Final src fragment. 
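The 1-bit masks produced by runGCProg and progToPointerMask (and typ.gcdata itself) are indexed the same way the doubleCheck loop earlier in this file does: bit j lives in byte j/8 at position j%8, low bits first. A tiny standalone lookup, with ptrBit as an illustrative helper name rather than a runtime function:

package main

import "fmt"

// ptrBit reports whether word j is a pointer according to a 1-bit mask
// laid out like typ.gcdata: byte j/8, bit j%8, low bits first.
func ptrBit(mask []byte, j uintptr) bool {
	return mask[j/8]>>(j%8)&1 != 0
}

func main() {
	mask := []byte{0b0000_0101} // pointers at words 0 and 2
	for j := uintptr(0); j < 4; j++ {
		fmt.Printf("word %d: pointer=%v\n", j, ptrBit(mask, j))
	}
}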
- if c %= 4; c > 0 { - bits |= (uintptr(*src) & (1<> (8 - frag) << nbits + src = add1(src) + nbits += frag + c -= frag } - } - - // Write any final bits out, using full-byte writes, even for the final byte. - var totalBits uintptr - if size == 1 { - totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*8 + nbits - nbits += -nbits & 7 - for ; nbits > 0; nbits -= 8 { + // Main loop: load one byte, write another. + // The bits are rotating through the bit buffer. + for i := c / 8; i > 0; i-- { + bits |= uintptr(*src) << nbits + src = add1(src) *dst = uint8(bits) dst = add1(dst) bits >>= 8 } - } else { - totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*4 + nbits - nbits += -nbits & 3 - for ; nbits > 0; nbits -= 4 { - v := bits&0xf | bitScanAll - *dst = uint8(v) - dst = add1(dst) - bits >>= 4 + // Final src fragment. + if c %= 8; c > 0 { + bits |= (uintptr(*src) & (1< 0; nbits -= 8 { + *dst = uint8(bits) + dst = add1(dst) + bits >>= 8 + } return totalBits } @@ -1898,7 +1276,7 @@ func materializeGCProg(ptrdata uintptr, prog *byte) *mspan { // Compute the number of pages needed for bitmapBytes. pages := divRoundUp(bitmapBytes, pageSize) s := mheap_.allocManual(pages, spanAllocPtrScalarBits) - runGCProg(addb(prog, 4), nil, (*byte)(unsafe.Pointer(s.startAddr)), 1) + runGCProg(addb(prog, 4), (*byte)(unsafe.Pointer(s.startAddr))) return s } func dematerializeGCProg(s *mspan) { @@ -1966,13 +1344,7 @@ func getgcmaskcb(frame *stkframe, ctxt unsafe.Pointer) bool { // //go:linkname reflect_gcbits reflect.gcbits func reflect_gcbits(x any) []byte { - ret := getgcmask(x) - typ := (*ptrtype)(unsafe.Pointer(efaceOf(&x)._type)).elem - nptr := typ.ptrdata / goarch.PtrSize - for uintptr(len(ret)) > nptr && ret[len(ret)-1] == 0 { - ret = ret[:len(ret)-1] - } - return ret + return getgcmask(x) } // Returns GC type info for the pointer stored in ep for testing. @@ -2011,18 +1383,22 @@ func getgcmask(ep any) (mask []byte) { // heap if base, s, _ := findObject(uintptr(p), 0, 0); base != 0 { - hbits := heapBitsForAddr(base) + if s.spanclass.noscan() { + return nil + } n := s.elemsize + hbits := heapBitsForAddr(base, n) mask = make([]byte, n/goarch.PtrSize) - for i := uintptr(0); i < n; i += goarch.PtrSize { - if hbits.isPointer() { - mask[i/goarch.PtrSize] = 1 - } - if !hbits.morePointers() { - mask = mask[:i/goarch.PtrSize] + for { + var addr uintptr + if hbits, addr = hbits.next(); addr == 0 { break } - hbits = hbits.next() + mask[(addr-base)/goarch.PtrSize] = 1 + } + // Callers expect this mask to end at the last pointer. + for len(mask) > 0 && mask[len(mask)-1] == 0 { + mask = mask[:len(mask)-1] } return } diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go index 1f484fb9b6..40674d8939 100644 --- a/src/runtime/mcache.go +++ b/src/runtime/mcache.go @@ -251,7 +251,7 @@ func (c *mcache) allocLarge(size uintptr, noscan bool) *mspan { // visible to the background sweeper. 
mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s) s.limit = s.base() + size - heapBitsForAddr(s.base()).initSpan(s) + s.initHeapBits() return s } diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go index e4bdf35071..c7ce573da6 100644 --- a/src/runtime/mcentral.go +++ b/src/runtime/mcentral.go @@ -250,6 +250,6 @@ func (c *mcentral) grow() *mspan { // n := (npages << _PageShift) / size n := s.divideByElemSize(npages << _PageShift) s.limit = s.base() + size*n - heapBitsForAddr(s.base()).initSpan(s) + s.initHeapBits() return s } diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go index c38b725d4b..d4d7c93ba9 100644 --- a/src/runtime/mgcmark.go +++ b/src/runtime/mgcmark.go @@ -1267,7 +1267,6 @@ func scanobject(b uintptr, gcw *gcWork) { // b is either the beginning of an object, in which case this // is the size of the object to scan, or it points to an // oblet, in which case we compute the size to scan below. - hbits := heapBitsForAddr(b) s := spanOfUnchecked(b) n := s.elemsize if n == 0 { @@ -1302,20 +1301,24 @@ func scanobject(b uintptr, gcw *gcWork) { } } - var i uintptr - for i = 0; i < n; i, hbits = i+goarch.PtrSize, hbits.next() { - // Load bits once. See CL 22712 and issue 16973 for discussion. - bits := hbits.bits() - if bits&bitScan == 0 { - break // no more pointers in this object - } - if bits&bitPointer == 0 { - continue // not a pointer + hbits := heapBitsForAddr(b, n) + var scanSize uintptr + for { + var addr uintptr + if hbits, addr = hbits.nextFast(); addr == 0 { + if hbits, addr = hbits.next(); addr == 0 { + break + } } + // Keep track of farthest pointer we found, so we can + // update heapScanWork. TODO: is there a better metric, + // now that we can skip scalar portions pretty efficiently? + scanSize = addr - b + goarch.PtrSize + // Work here is duplicated in scanblock and above. // If you make changes here, make changes there too. - obj := *(*uintptr)(unsafe.Pointer(b + i)) + obj := *(*uintptr)(unsafe.Pointer(addr)) // At this point we have extracted the next potential pointer. // Quickly filter out nil and pointers back to the current object. @@ -1329,13 +1332,13 @@ func scanobject(b uintptr, gcw *gcWork) { // heap. In this case, we know the object was // just allocated and hence will be marked by // allocation itself. - if obj, span, objIndex := findObject(obj, b, i); obj != 0 { - greyobject(obj, b, i, span, gcw, objIndex) + if obj, span, objIndex := findObject(obj, b, addr-b); obj != 0 { + greyobject(obj, b, addr-b, span, gcw, objIndex) } } } gcw.bytesMarked += uint64(n) - gcw.heapScanWork += int64(i) + gcw.heapScanWork += int64(scanSize) } // scanConservative scans block [b, b+n) conservatively, treating any diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 5d4297617d..12307594f0 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -221,9 +221,22 @@ var mheap_ mheap //go:notinheap type heapArena struct { // bitmap stores the pointer/scalar bitmap for the words in - // this arena. See mbitmap.go for a description. Use the - // heapBits type to access this. - bitmap [heapArenaBitmapBytes]byte + // this arena. See mbitmap.go for a description. + // This array uses 1 bit per word of heap, or 1.6% of the heap size (for 64-bit). + bitmap [heapArenaBitmapWords]uintptr + + // If the ith bit of noMorePtrs is true, then there are no more + // pointers for the object containing the word described by the + // high bit of bitmap[i]. + // In that case, bitmap[i+1], ... must be zero until the start + // of the next object. 
+ // We never operate on these entries using bit-parallel techniques, + // so it is ok if they are small. Also, they can't be bigger than + // uint16 because at that size a single noMorePtrs entry + // represents 8K of memory, the minimum size of a span. Any larger + // and we'd have to worry about concurrent updates. + // This array uses 1 bit per word of bitmap, or .024% of the heap size (for 64-bit). + noMorePtrs [heapArenaBitmapWords / 8]uint8 // spans maps from virtual address page ID within this arena to *mspan. // For allocated spans, their pages map to the span itself. diff --git a/src/runtime/slice.go b/src/runtime/slice.go index 5c2edd9fe1..89f5343c34 100644 --- a/src/runtime/slice.go +++ b/src/runtime/slice.go @@ -260,12 +260,14 @@ func growslice(et *_type, old slice, cap int) slice { capmem = roundupsize(uintptr(newcap) << shift) overflow = uintptr(newcap) > (maxAlloc >> shift) newcap = int(capmem >> shift) + capmem = uintptr(newcap) << shift default: lenmem = uintptr(old.len) * et.size newlenmem = uintptr(cap) * et.size capmem, overflow = math.MulUintptr(et.size, uintptr(newcap)) capmem = roundupsize(capmem) newcap = int(capmem / et.size) + capmem = uintptr(newcap) * et.size } // The check of overflow in addition to capmem > maxAlloc is needed
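Closing note on the growslice hunk directly above: after roundupsize rounds the request up to a size class, newcap is recomputed from capmem, and capmem is then recomputed from newcap so that the size handed to the allocator is an exact multiple of the element size, leaving no partial element at the end of the allocation. A worked-arithmetic sketch follows; the 24-byte element and the 704-byte size class are assumed example values, and roundupsize below is only a stand-in for the runtime's size-class rounding.

package main

import "fmt"

// roundupsize stands in for the runtime's size-class rounding; 704 is an
// assumed example class for requests in the 641..704 byte range.
func roundupsize(n uintptr) uintptr {
	if n > 640 && n <= 704 {
		return 704
	}
	return n
}

func main() {
	const elemSize = 24 // hypothetical 3-word element
	newcap := 28        // hypothetical grown capacity

	capmem := roundupsize(uintptr(newcap) * elemSize) // 672 -> 704
	newcap = int(capmem / elemSize)                   // 29 elements now fit
	capmem = uintptr(newcap) * elemSize               // 696: whole elements only

	fmt.Println(newcap, capmem) // 29 696
}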