runtime: don't call span.heapBits in writeHeapBitsSmall

For whatever reason, span.heapBits is kind of slow. It accounts for
about a quarter of the cost of writeHeapBitsSmall, which is absurd. We
get a nice speed improvement for small allocations by eliminating this
call.

                   │   before    │               after               │
                   │   sec/op    │   sec/op     vs base              │
MallocTypeInfo16-4   29.47n ± 1%   27.02n ± 1%  -8.31% (p=0.002 n=6)

Change-Id: I6270e26902e5a9254cf1503fac81c3c799c59d6a
Reviewed-on: https://go-review.googlesource.com/c/go/+/614255
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@google.com>
Auto-Submit: Michael Knyszek <mknyszek@google.com>
commit 56fb8350c8 (parent d94b7a1876)
Author:    Michael Anthony Knyszek
Date:      2024-09-18 01:58:50 +0000
Committed: Gopher Robot
2 changed files with 19 additions and 6 deletions
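
The heart of the change is in writeHeapBitsSmall (the mbitmap.go hunks below): instead of calling span.heapBits(), which materializes a []uintptr slice header for the span's bitmap, the new code computes the bitmap's address with plain pointer arithmetic. For a one-page span, the heap bitmap occupies the last pageSize/goarch.PtrSize/8 bytes of the page, one bit per pointer-sized word; on a 64-bit platform with 8 KiB pages that is 8192/8/8 = 128 bytes, so the bitmap starts at span.base() + 8064. A minimal standalone sketch of that arithmetic follows; the constants are assumptions for a typical 64-bit platform, not values imported from the runtime:

package main

import "fmt"

// Assumed values for a typical 64-bit platform; the runtime's real
// pageSize and goarch.PtrSize may differ per target.
const (
	pageSize = 8192 // assumed runtime page size
	ptrSize  = 8    // assumed goarch.PtrSize
)

// bitmapBase mirrors span.base() + pageSize - pageSize/goarch.PtrSize/8:
// a one-page span keeps one bitmap bit per pointer-sized word, packed
// 8 bits per byte, in the last pageSize/ptrSize/8 bytes of the page.
func bitmapBase(spanBase uintptr) uintptr {
	return spanBase + pageSize - pageSize/ptrSize/8
}

func main() {
	fmt.Println(bitmapBase(0)) // 8064: the bitmap is the last 128 bytes
}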

src/runtime/malloc.go

@@ -427,8 +427,15 @@ func mallocinit() {
 	// Check that the minimum size (exclusive) for a malloc header is also
 	// a size class boundary. This is important to making sure checks align
 	// across different parts of the runtime.
+	//
+	// While we're here, also check to make sure all these size classes'
+	// span sizes are one page. Some code relies on this.
 	minSizeForMallocHeaderIsSizeClass := false
+	sizeClassesUpToMinSizeForMallocHeaderAreOnePage := true
 	for i := 0; i < len(class_to_size); i++ {
+		if class_to_allocnpages[i] > 1 {
+			sizeClassesUpToMinSizeForMallocHeaderAreOnePage = false
+		}
 		if minSizeForMallocHeader == uintptr(class_to_size[i]) {
 			minSizeForMallocHeaderIsSizeClass = true
 			break
@@ -437,6 +444,9 @@ func mallocinit() {
 	if !minSizeForMallocHeaderIsSizeClass {
 		throw("min size of malloc header is not a size class boundary")
 	}
+	if !sizeClassesUpToMinSizeForMallocHeaderAreOnePage {
+		throw("expected all size classes up to min size for malloc header to fit in one-page spans")
+	}
 	// Check that the pointer bitmap for all small sizes without a malloc header
 	// fits in a word.
 	if minSizeForMallocHeader/goarch.PtrSize > 8*goarch.PtrSize {
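This startup check exists because the fast path in writeHeapBitsSmall now assumes the bitmap sits at a fixed offset from span.base() within a single page; a multi-page span would put its bitmap somewhere else entirely. A toy restatement of the invariant, with made-up tables standing in for the runtime's class_to_size and class_to_allocnpages:

package main

import "fmt"

// Toy version of the new mallocinit invariant: every size class up to
// the malloc-header threshold must use a one-page span. The tables and
// threshold below are invented for illustration, not the real ones.
func main() {
	const minSizeForMallocHeader = 512 // assumed threshold
	classToSize := []uintptr{0, 8, 16, 32, 512}
	classToAllocNPages := []int{0, 1, 1, 1, 1}

	ok := true
	for i := range classToSize {
		if classToAllocNPages[i] > 1 {
			ok = false // a span in this range spans multiple pages
		}
		if classToSize[i] == minSizeForMallocHeader {
			break // larger classes carry malloc headers instead
		}
	}
	fmt.Println("one-page invariant holds:", ok)
}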

src/runtime/mbitmap.go

@@ -642,8 +642,7 @@ func (span *mspan) writeHeapBitsSmall(x, dataSize uintptr, typ *_type) (scanSize uintptr) {
 	// The objects here are always really small, so a single load is sufficient.
 	src0 := readUintptr(typ.GCData)
 
-	// Create repetitions of the bitmap if we have a small array.
-	bits := span.elemsize / goarch.PtrSize
+	// Create repetitions of the bitmap if we have a small slice backing store.
 	scanSize = typ.PtrBytes
 	src := src0
 	switch typ.Size_ {
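The `bits` computation simply moves down to its only use in the next hunk. For context, the replication step this comment introduces stamps the per-element pointer bitmap once per element when a single allocation holds several elements (a small array or slice backing store). A simplified sketch of that replication, with assumed element and allocation sizes:

package main

import "fmt"

// Sketch of bitmap replication for a small slice backing store; the
// sizes are assumptions, not the runtime's actual code. Each bit of
// src0 describes one pointer-sized word of a single element, and the
// loop ORs in one shifted copy of that pattern per element.
func main() {
	const (
		ptrSize  = 8  // assumed goarch.PtrSize
		elemSize = 16 // element layout: {pointer word, scalar word}
		dataSize = 48 // backing store holds three elements
	)
	src0 := uintptr(0b01) // per-element bitmap: only word 0 is a pointer
	src := src0
	for off := uintptr(elemSize); off < dataSize; off += elemSize {
		src |= src0 << (off / ptrSize)
	}
	fmt.Printf("%06b\n", src) // 010101: pointers in words 0, 2, 4
}
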
@@ -658,19 +657,23 @@ func (span *mspan) writeHeapBitsSmall(x, dataSize uintptr, typ *_type) (scanSize uintptr) {
 
 	// Since we're never writing more than one uintptr's worth of bits, we're either going
 	// to do one or two writes.
-	dst := span.heapBits()
+	dst := unsafe.Pointer(span.base() + pageSize - pageSize/goarch.PtrSize/8)
 	o := (x - span.base()) / goarch.PtrSize
 	i := o / ptrBits
 	j := o % ptrBits
+	bits := span.elemsize / goarch.PtrSize
 	if j+bits > ptrBits {
 		// Two writes.
 		bits0 := ptrBits - j
 		bits1 := bits - bits0
-		dst[i+0] = dst[i+0]&(^uintptr(0)>>bits0) | (src << j)
-		dst[i+1] = dst[i+1]&^((1<<bits1)-1) | (src >> bits0)
+		dst0 := (*uintptr)(add(dst, (i+0)*goarch.PtrSize))
+		dst1 := (*uintptr)(add(dst, (i+1)*goarch.PtrSize))
+		*dst0 = (*dst0)&(^uintptr(0)>>bits0) | (src << j)
+		*dst1 = (*dst1)&^((1<<bits1)-1) | (src >> bits0)
 	} else {
 		// One write.
-		dst[i] = (dst[i] &^ (((1 << bits) - 1) << j)) | (src << j)
+		dst := (*uintptr)(add(dst, i*goarch.PtrSize))
+		*dst = (*dst)&^(((1<<bits)-1)<<j) | (src << j)
 	}
 
 	const doubleCheck = false
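
The two branches implement a bit-field splice: clear `bits` bits of the bitmap at bit offset `o`, then OR in `src`, splitting into two word writes when the field straddles a word boundary. The masks are unchanged from the old version; the commit only replaces slice indexing through span.heapBits() with add() pointer arithmetic, avoiding the slice construction. A slice-based sketch of the same masking, as a hypothetical standalone helper:

package main

import "fmt"

const ptrBits = 64 // assumed bits per uintptr (64-bit platform)

// spliceBits writes the low n bits of src into dst at bit offset o,
// clearing the destination field first and splitting the write when it
// straddles a word boundary -- the same masking scheme the two branches
// above implement with raw pointer writes.
func spliceBits(dst []uintptr, o, n uint, src uintptr) {
	i, j := o/ptrBits, o%ptrBits
	if j+n > ptrBits {
		// Two writes: the low bits0 bits of src land in dst[i],
		// the remaining bits1 bits in dst[i+1].
		bits0 := ptrBits - j
		bits1 := n - bits0
		dst[i] = dst[i]&(^uintptr(0)>>bits0) | (src << j)
		dst[i+1] = dst[i+1]&^((1<<bits1)-1) | (src >> bits0)
	} else {
		// One write: clear the n-bit field at offset j, then OR in src.
		dst[i] = dst[i]&^(((1<<n)-1)<<j) | (src << j)
	}
}

func main() {
	dst := make([]uintptr, 2)
	spliceBits(dst, 60, 8, 0b10101010)      // field straddles the word boundary
	fmt.Printf("%#x %#x\n", dst[0], dst[1]) // 0xa000000000000000 0xa
}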