diff --git a/src/runtime/defs_linux_386.go b/src/runtime/defs_linux_386.go
index 72339f4aa5..5fef55610f 100644
--- a/src/runtime/defs_linux_386.go
+++ b/src/runtime/defs_linux_386.go
@@ -23,6 +23,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19
 
 	_SA_RESTART = 0x10000000
 	_SA_ONSTACK = 0x8000000
diff --git a/src/runtime/defs_linux_amd64.go b/src/runtime/defs_linux_amd64.go
index 298f3ebf7c..dce7799b6a 100644
--- a/src/runtime/defs_linux_amd64.go
+++ b/src/runtime/defs_linux_amd64.go
@@ -23,6 +23,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19
 
 	_SA_RESTART = 0x10000000
 	_SA_ONSTACK = 0x8000000
diff --git a/src/runtime/defs_linux_arm.go b/src/runtime/defs_linux_arm.go
index 6fee57dacf..71cf8c6d50 100644
--- a/src/runtime/defs_linux_arm.go
+++ b/src/runtime/defs_linux_arm.go
@@ -25,6 +25,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19
 
 	_SA_RESTART = 0x10000000
 	_SA_ONSTACK = 0x8000000
diff --git a/src/runtime/defs_linux_arm64.go b/src/runtime/defs_linux_arm64.go
index 0216096301..606cd70494 100644
--- a/src/runtime/defs_linux_arm64.go
+++ b/src/runtime/defs_linux_arm64.go
@@ -23,6 +23,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19
 
 	_SA_RESTART = 0x10000000
 	_SA_ONSTACK = 0x8000000
diff --git a/src/runtime/defs_linux_loong64.go b/src/runtime/defs_linux_loong64.go
index 6eca18bdae..692d8c78e9 100644
--- a/src/runtime/defs_linux_loong64.go
+++ b/src/runtime/defs_linux_loong64.go
@@ -24,6 +24,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19
 
 	_SA_RESTART = 0x10000000
 	_SA_ONSTACK = 0x8000000
diff --git a/src/runtime/defs_linux_mips64x.go b/src/runtime/defs_linux_mips64x.go
index 2e8c4056ba..8a0af41234 100644
--- a/src/runtime/defs_linux_mips64x.go
+++ b/src/runtime/defs_linux_mips64x.go
@@ -26,6 +26,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19
 
 	_SA_RESTART = 0x10000000
 	_SA_ONSTACK = 0x8000000
diff --git a/src/runtime/defs_linux_mipsx.go b/src/runtime/defs_linux_mipsx.go
index 7593600cc6..8322beab2b 100644
--- a/src/runtime/defs_linux_mipsx.go
+++ b/src/runtime/defs_linux_mipsx.go
@@ -26,6 +26,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19
 
 	_SA_RESTART = 0x10000000
 	_SA_ONSTACK = 0x8000000
diff --git a/src/runtime/defs_linux_ppc64.go b/src/runtime/defs_linux_ppc64.go
index bb3ac016e5..f87924affe 100644
--- a/src/runtime/defs_linux_ppc64.go
+++ b/src/runtime/defs_linux_ppc64.go
@@ -23,6 +23,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19
 
 	_SA_RESTART = 0x10000000
 	_SA_ONSTACK = 0x8000000
diff --git a/src/runtime/defs_linux_ppc64le.go b/src/runtime/defs_linux_ppc64le.go
index bb3ac016e5..f87924affe 100644
--- a/src/runtime/defs_linux_ppc64le.go
+++ b/src/runtime/defs_linux_ppc64le.go
@@ -23,6 +23,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19
 
 	_SA_RESTART = 0x10000000
 	_SA_ONSTACK = 0x8000000
diff --git a/src/runtime/defs_linux_riscv64.go b/src/runtime/defs_linux_riscv64.go
index ce4a7f36cd..29b1ef2a50 100644
--- a/src/runtime/defs_linux_riscv64.go
+++ b/src/runtime/defs_linux_riscv64.go
@@ -24,6 +24,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19
 
 	_SA_RESTART = 0x10000000
 	_SA_ONSTACK = 0x8000000
diff --git a/src/runtime/defs_linux_s390x.go b/src/runtime/defs_linux_s390x.go
index 36497dd40d..b0280213b3 100644
--- a/src/runtime/defs_linux_s390x.go
+++ b/src/runtime/defs_linux_s390x.go
@@ -24,6 +24,7 @@ const (
 	_MADV_FREE       = 0x8
 	_MADV_HUGEPAGE   = 0xe
 	_MADV_NOHUGEPAGE = 0xf
+	_MADV_COLLAPSE   = 0x19
 
 	_SA_RESTART = 0x10000000
 	_SA_ONSTACK = 0x8000000
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index b1f5045110..c43c5d0551 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -1820,8 +1820,8 @@ func (s *ScavengeIndex) SetEmpty(ci ChunkIdx) {
 	s.i.setEmpty(chunkIdx(ci))
 }
 
-func (s *ScavengeIndex) SetNoHugePage(ci ChunkIdx) bool {
-	return s.i.setNoHugePage(chunkIdx(ci))
+func (s *ScavengeIndex) SetNoHugePage(ci ChunkIdx) {
+	s.i.setNoHugePage(chunkIdx(ci))
 }
 
 func CheckPackScavChunkData(gen uint32, inUse, lastInUse uint16, flags uint8) bool {
diff --git a/src/runtime/mem.go b/src/runtime/mem.go
index 7b01905224..22688d51d5 100644
--- a/src/runtime/mem.go
+++ b/src/runtime/mem.go
@@ -91,6 +91,12 @@ func sysNoHugePage(v unsafe.Pointer, n uintptr) {
 	sysNoHugePageOS(v, n)
 }
 
+// sysHugePageCollapse attempts to immediately back the provided memory region
+// with huge pages. It is best-effort and may fail silently.
+func sysHugePageCollapse(v unsafe.Pointer, n uintptr) {
+	sysHugePageCollapseOS(v, n)
+}
+
 // sysFree transitions a memory region from any state to None. Therefore, it
 // returns memory unconditionally. It is used if an out-of-memory error has been
 // detected midway through an allocation or to carve out an aligned section of
diff --git a/src/runtime/mem_aix.go b/src/runtime/mem_aix.go
index deae61635c..dff2756d97 100644
--- a/src/runtime/mem_aix.go
+++ b/src/runtime/mem_aix.go
@@ -41,6 +41,9 @@ func sysHugePageOS(v unsafe.Pointer, n uintptr) {
 func sysNoHugePageOS(v unsafe.Pointer, n uintptr) {
 }
 
+func sysHugePageCollapseOS(v unsafe.Pointer, n uintptr) {
+}
+
 // Don't split the stack as this function may be invoked without a valid G,
 // which prevents us from allocating more stack.
 //
diff --git a/src/runtime/mem_bsd.go b/src/runtime/mem_bsd.go
index a9025ad015..78128aedf7 100644
--- a/src/runtime/mem_bsd.go
+++ b/src/runtime/mem_bsd.go
@@ -39,6 +39,9 @@ func sysHugePageOS(v unsafe.Pointer, n uintptr) {
 func sysNoHugePageOS(v unsafe.Pointer, n uintptr) {
 }
 
+func sysHugePageCollapseOS(v unsafe.Pointer, n uintptr) {
+}
+
 // Don't split the stack as this function may be invoked without a valid G,
 // which prevents us from allocating more stack.
 //
diff --git a/src/runtime/mem_darwin.go b/src/runtime/mem_darwin.go
index 1e3e53d45b..ae8487127c 100644
--- a/src/runtime/mem_darwin.go
+++ b/src/runtime/mem_darwin.go
@@ -39,6 +39,9 @@ func sysHugePageOS(v unsafe.Pointer, n uintptr) {
 func sysNoHugePageOS(v unsafe.Pointer, n uintptr) {
 }
 
+func sysHugePageCollapseOS(v unsafe.Pointer, n uintptr) {
+}
+
 // Don't split the stack as this function may be invoked without a valid G,
 // which prevents us from allocating more stack.
 //
diff --git a/src/runtime/mem_linux.go b/src/runtime/mem_linux.go
index bdfab13fed..c9823d3011 100644
--- a/src/runtime/mem_linux.go
+++ b/src/runtime/mem_linux.go
@@ -116,6 +116,31 @@ func sysNoHugePageOS(v unsafe.Pointer, n uintptr) {
 	madvise(v, n, _MADV_NOHUGEPAGE)
 }
 
+func sysHugePageCollapseOS(v unsafe.Pointer, n uintptr) {
+	if uintptr(v)&(physPageSize-1) != 0 {
+		// The Linux implementation requires that the address
+		// addr be page-aligned, and allows length to be zero.
+		throw("unaligned sysHugePageCollapseOS")
+	}
+	if physHugePageSize == 0 {
+		return
+	}
+	// N.B. If you find yourself debugging this code, note that
+	// this call can fail with EAGAIN because it's best-effort.
+	// Also, when it returns an error, it's only for the last
+	// huge page in the region requested.
+	//
+	// It can also sometimes return EINVAL if the corresponding
+	// region hasn't been backed by physical memory. This is
+	// difficult to guarantee in general, and it also means
+	// there's no way to distinguish whether this syscall is
+	// actually available. Oops.
+	//
+	// Anyway, that's why this call just doesn't bother checking
+	// any errors.
+	madvise(v, n, _MADV_COLLAPSE)
+}
+
 // Don't split the stack as this function may be invoked without a valid G,
 // which prevents us from allocating more stack.
 //
diff --git a/src/runtime/mem_sbrk.go b/src/runtime/mem_sbrk.go
index c8f50e7bd5..dc0a764a2c 100644
--- a/src/runtime/mem_sbrk.go
+++ b/src/runtime/mem_sbrk.go
@@ -163,6 +163,9 @@ func sysHugePageOS(v unsafe.Pointer, n uintptr) {
 func sysNoHugePageOS(v unsafe.Pointer, n uintptr) {
 }
 
+func sysHugePageCollapseOS(v unsafe.Pointer, n uintptr) {
+}
+
 func sysMapOS(v unsafe.Pointer, n uintptr) {
 }
 
diff --git a/src/runtime/mem_windows.go b/src/runtime/mem_windows.go
index c11abc17ad..477d898870 100644
--- a/src/runtime/mem_windows.go
+++ b/src/runtime/mem_windows.go
@@ -97,6 +97,9 @@ func sysHugePageOS(v unsafe.Pointer, n uintptr) {
 func sysNoHugePageOS(v unsafe.Pointer, n uintptr) {
 }
 
+func sysHugePageCollapseOS(v unsafe.Pointer, n uintptr) {
+}
+
 // Don't split the stack as this function may be invoked without a valid G,
 // which prevents us from allocating more stack.
 //
diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go
index 82a94be22a..4c6d6be4f0 100644
--- a/src/runtime/mgcscavenge.go
+++ b/src/runtime/mgcscavenge.go
@@ -771,7 +771,7 @@ func (p *pageAlloc) scavengeOne(ci chunkIdx, searchIdx uint, max uintptr) uintpt
 
 			// Grab whether the chunk is hugepage backed and if it is,
 			// clear it. We're about to break up this huge page.
-			shouldNoHugePage := p.scav.index.setNoHugePage(ci)
+			p.scav.index.setNoHugePage(ci)
 
 			// With that done, it's safe to unlock.
 			unlock(p.mheapLock)
@@ -781,9 +781,6 @@ func (p *pageAlloc) scavengeOne(ci chunkIdx, searchIdx uint, max uintptr) uintpt
 
 				// Only perform sys* operations if we're not in a test.
 				// It's dangerous to do so otherwise.
-				if shouldNoHugePage {
-					sysNoHugePage(unsafe.Pointer(chunkBase(ci)), pallocChunkBytes)
-				}
 				sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize)
 
 				// Update global accounting only when not in test, otherwise
@@ -1134,17 +1131,34 @@ func (s *scavengeIndex) find(force bool) (chunkIdx, uint) {
 }
 
 // alloc updates metadata for chunk at index ci with the fact that
-// an allocation of npages occurred.
+// an allocation of npages occurred. It also eagerly attempts to collapse
+// the chunk's memory into hugepage if the chunk has become sufficiently
+// dense and we're not allocating the whole chunk at once (which suggests
+// the allocation is part of a bigger one and it's probably not worth
+// eagerly collapsing).
 //
 // alloc may only run concurrently with find.
 func (s *scavengeIndex) alloc(ci chunkIdx, npages uint) {
 	sc := s.chunks[ci].load()
 	sc.alloc(npages, s.gen)
 	if !sc.isHugePage() && sc.inUse > scavChunkHiOccPages {
-		// Mark dense chunks as specifically backed by huge pages.
+		// Mark that we're considering this chunk as backed by huge pages.
 		sc.setHugePage()
-		if !s.test {
-			sysHugePage(unsafe.Pointer(chunkBase(ci)), pallocChunkBytes)
+
+		// Collapse dense chunks into huge pages and mark that
+		// we did that, but only if we're not allocating to
+		// use the entire chunk. If we're allocating an entire chunk,
+		// this is likely part of a much bigger allocation. For
+		// instance, if the caller is allocating a 1 GiB slice of bytes, we
+		// don't want to go and manually collapse all those pages; we want
+		// them to be demand-paged. If the caller is actually going to use
+		// all that memory, it'll naturally get backed by huge pages later.
+		//
+		// This also avoids having sysHugePageCollapse fail. On Linux,
+		// the call requires that some part of the huge page being collapsed
+		// is already paged in.
+		if !s.test && npages < pallocChunkPages {
+			sysHugePageCollapse(unsafe.Pointer(chunkBase(ci)), pallocChunkBytes)
 		}
 	}
 	s.chunks[ci].store(sc)
@@ -1204,14 +1218,13 @@ func (s *scavengeIndex) setEmpty(ci chunkIdx) {
 // Returns true if the set was successful (not already backed by huge pages).
 //
 // setNoHugePage may only run concurrently with find.
-func (s *scavengeIndex) setNoHugePage(ci chunkIdx) bool {
+func (s *scavengeIndex) setNoHugePage(ci chunkIdx) {
 	val := s.chunks[ci].load()
 	if !val.isHugePage() {
-		return false
+		return
 	}
 	val.setNoHugePage()
 	s.chunks[ci].store(val)
-	return true
 }
 
 // atomicScavChunkData is an atomic wrapper around a scavChunkData
@@ -1282,8 +1295,9 @@ const (
 	// file. The reason we say "HasFree" here is so the zero value is
 	// correct for a newly-grown chunk. (New memory is scavenged.)
 	scavChunkHasFree scavChunkFlags = 1 << iota
-	// scavChunkNoHugePage indicates whether this chunk has been marked
-	// sysNoHugePage. If not set, it means the chunk is marked sysHugePage.
+	// scavChunkNoHugePage indicates whether this chunk has had any huge
+	// pages broken by the scavenger.
+	//
 	// The negative here is unfortunate, but necessary to make it so that
 	// the zero value of scavChunkData accurately represents the state of
 	// a newly-grown chunk. (New memory is marked as backed by huge pages.)
diff --git a/src/runtime/mpagealloc.go b/src/runtime/mpagealloc.go
index ed53a5672b..3e789ab85c 100644
--- a/src/runtime/mpagealloc.go
+++ b/src/runtime/mpagealloc.go
@@ -426,11 +426,6 @@ func (p *pageAlloc) grow(base, size uintptr) {
 	// we need to ensure this newly-free memory is visible in the
 	// summaries.
 	p.update(base, size/pageSize, true, false)
-
-	// Mark all new memory as huge page eligible.
-	if !p.test {
-		sysHugePage(unsafe.Pointer(base), size)
-	}
 }
 
 // enableChunkHugePages enables huge pages for the chunk bitmap mappings (disabled by default).
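
Note (not part of the patch): the best-effort contract that sysHugePageCollapseOS documents above can be reproduced from user space. The sketch below is illustrative only; it assumes golang.org/x/sys/unix, which exports MADV_COLLAPSE (0x19, recognized by Linux 6.1+ kernels) alongside Madvise. It maps an anonymous region, touches the pages so at least part of each candidate huge page is resident (avoiding the EINVAL case the comment describes), then asks the kernel to collapse the region, treating failure as non-fatal just as the runtime does:

	package main

	import (
		"fmt"
		"os"

		"golang.org/x/sys/unix"
	)

	func main() {
		const size = 4 << 20 // spans at least one 2 MiB huge page on x86-64

		// Anonymous private mapping, loosely analogous to the heap regions
		// the runtime manages. The returned slice is page-aligned.
		mem, err := unix.Mmap(-1, 0, size,
			unix.PROT_READ|unix.PROT_WRITE,
			unix.MAP_PRIVATE|unix.MAP_ANONYMOUS)
		if err != nil {
			panic(err)
		}
		defer unix.Munmap(mem)

		// Touch the pages first: MADV_COLLAPSE can return EINVAL when no
		// part of a candidate huge page is resident, which is one reason
		// sysHugePageCollapseOS ignores errors.
		page := os.Getpagesize()
		for i := 0; i < len(mem); i += page {
			mem[i] = 1
		}

		// Best-effort collapse, mirroring madvise(v, n, _MADV_COLLAPSE).
		// EAGAIN and EINVAL are expected failure modes (memory pressure,
		// or a pre-6.1 kernel that doesn't know the advice value), so the
		// error is only reported, never treated as fatal.
		if err := unix.Madvise(mem, unix.MADV_COLLAPSE); err != nil {
			fmt.Println("MADV_COLLAPSE failed (best-effort):", err)
		}
	}

This mirrors the ordering the patch relies on in scavengeIndex.alloc: collapse is only requested once a chunk is densely occupied, so some of its memory is certainly paged in and the call has a chance of succeeding.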