// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Trace buffer management.

package runtime

import (
	"internal/runtime/sys"
	"internal/trace/tracev2"
	"unsafe"
)

// Maximum number of bytes required to encode uint64 in base-128.
const traceBytesPerNumber = 10

// traceWriter is the interface for writing all trace data.
//
// This type is passed around as a value, and all of its methods return
// a new traceWriter. This allows for chaining together calls in a fluent-style
// API. This is partly stylistic, and very slightly for performance, since
// the compiler can destructure this value and pass it between calls as
// just regular arguments. However, this style is not load-bearing, and
// we can change it if it's deemed too error-prone.
type traceWriter struct {
	traceLocker
	exp tracev2.Experiment
	*traceBuf
}

// writer returns a traceWriter that writes into the current M's stream.
//
// Once this is called, the caller must guard against stack growth until
// end is called on it. Therefore, it's highly recommended to use this
// API in a "fluent" style, for example tl.writer().event(...).end().
// Better yet, callers just looking to write events should use eventWriter
// when possible, which is a much safer wrapper around this function.
//
// nosplit to allow for safe reentrant tracing from stack growth paths.
//
//go:nosplit
func (tl traceLocker) writer() traceWriter {
	if debugTraceReentrancy {
		// Checks that the invariants of this function are being upheld.
		gp := getg()
		if gp == gp.m.curg {
			tl.mp.trace.oldthrowsplit = gp.throwsplit
			gp.throwsplit = true
		}
	}
	return traceWriter{traceLocker: tl, traceBuf: tl.mp.trace.buf[tl.gen%2][tracev2.NoExperiment]}
}
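
// exampleFluentWrite is an illustrative sketch, not part of the original
// file, showing the fluent pattern recommended above: acquire a writer,
// emit a single event, and immediately hand the buffer back via end so
// the stack-growth restriction stays narrowly scoped. The function name
// and parameters are hypothetical; callers that just want to write events
// should prefer eventWriter.
func exampleFluentWrite(tl traceLocker, ev tracev2.EventType, args ...traceArg) {
	// No stack growth may occur between writer() and end().
	tl.writer().event(ev, args...).end()
}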

// unsafeTraceWriter produces a traceWriter that doesn't lock the trace.
//
// It should only be used in contexts where either:
//   - Another traceLocker is held.
//   - trace.gen is prevented from advancing.
//
// This does not have the same stack growth restrictions as traceLocker.writer.
//
// buf may be nil.
func unsafeTraceWriter(gen uintptr, buf *traceBuf) traceWriter {
	return traceWriter{traceLocker: traceLocker{gen: gen}, traceBuf: buf}
}

// event writes out the bytes of an event into the event stream.
//
// nosplit because it's part of writing an event for an M, which must not
// have any stack growth.
//
//go:nosplit
func (w traceWriter) event(ev tracev2.EventType, args ...traceArg) traceWriter {
	// N.B. Everything in this call must be nosplit to maintain
	// the stack growth related invariants for writing events.

	// Make sure we have room.
	w, _ = w.ensure(1 + (len(args)+1)*traceBytesPerNumber)

	// Compute the timestamp diff that we'll put in the trace.
	ts := traceClockNow()
	if ts <= w.traceBuf.lastTime {
		ts = w.traceBuf.lastTime + 1
	}
	tsDiff := uint64(ts - w.traceBuf.lastTime)
	w.traceBuf.lastTime = ts

	// Write out event.
	w.byte(byte(ev))
	w.varint(tsDiff)
	for _, arg := range args {
		w.varint(uint64(arg))
	}
	return w
}
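
// As an illustrative worked example (not part of the original file), an
// event with type 42, a timestamp delta of 300, and a single argument 7
// is laid out in the stream by event as:
//
//	0x2a        event type byte
//	0xac 0x02   timestamp delta 300 as a base-128 varint
//	0x07        argument 7 as a base-128 varint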

// end writes the buffer back into the M.
//
// nosplit because it's part of writing an event for an M, which must not
// have any stack growth.
//
//go:nosplit
func (w traceWriter) end() {
	if w.mp == nil {
		// Tolerate a nil mp. It makes code that creates traceWriters directly
		// less error-prone.
		return
	}
	w.mp.trace.buf[w.gen%2][w.exp] = w.traceBuf
	if debugTraceReentrancy {
		// The writer is no longer live, so we can drop throwsplit (if it wasn't
		// already set upon entry).
		gp := getg()
		if gp == gp.m.curg {
			gp.throwsplit = w.mp.trace.oldthrowsplit
		}
	}
}

// ensure makes sure that at least maxSize bytes are available to write.
//
// Returns whether the buffer was flushed.
//
// nosplit because it's part of writing an event for an M, which must not
// have any stack growth.
//
//go:nosplit
func (w traceWriter) ensure(maxSize int) (traceWriter, bool) {
	refill := w.traceBuf == nil || !w.available(maxSize)
	if refill {
		w = w.refill()
	}
	return w, refill
}

// flush puts w.traceBuf on the queue of full buffers.
//
// nosplit because it's part of writing an event for an M, which must not
// have any stack growth.
//
//go:nosplit
func (w traceWriter) flush() traceWriter {
	systemstack(func() {
		lock(&trace.lock)
		if w.traceBuf != nil {
			traceBufFlush(w.traceBuf, w.gen)
		}
		unlock(&trace.lock)
	})
	w.traceBuf = nil
	return w
}

// refill puts w.traceBuf on the queue of full buffers and refreshes w's buffer.
func (w traceWriter) refill() traceWriter {
	systemstack(func() {
		lock(&trace.lock)
		if w.traceBuf != nil {
			traceBufFlush(w.traceBuf, w.gen)
		}
		if trace.empty != nil {
			w.traceBuf = trace.empty
			trace.empty = w.traceBuf.link
			unlock(&trace.lock)
		} else {
			unlock(&trace.lock)
			w.traceBuf = (*traceBuf)(sysAlloc(unsafe.Sizeof(traceBuf{}), &memstats.other_sys))
			if w.traceBuf == nil {
				throw("trace: out of memory")
			}
		}
	})

	// Initialize the buffer.
	ts := traceClockNow()
	if ts <= w.traceBuf.lastTime {
		ts = w.traceBuf.lastTime + 1
	}
	w.traceBuf.lastTime = ts
	w.traceBuf.link = nil
	w.traceBuf.pos = 0

	// Tolerate a nil mp.
	mID := ^uint64(0)
	if w.mp != nil {
		mID = uint64(w.mp.procid)
	}

	// Write the buffer's header.
	if w.exp == tracev2.NoExperiment {
		w.byte(byte(tracev2.EvEventBatch))
	} else {
		w.byte(byte(tracev2.EvExperimentalBatch))
		w.byte(byte(w.exp))
	}
	w.varint(uint64(w.gen))
	w.varint(uint64(mID))
	w.varint(uint64(ts))
	w.traceBuf.lenPos = w.varintReserve()
	return w
}
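
// As an illustrative summary (not part of the original file), a
// non-experimental batch written by refill therefore begins with:
//
//	EvEventBatch                    one byte
//	generation                      varint
//	M ID (^uint64(0) if unknown)    varint
//	base timestamp                  varint
//	batch length                    traceBytesPerNumber reserved bytes,
//	                                patched in later by traceBufFlush
//
// Experimental batches instead start with EvExperimentalBatch followed
// by the experiment ID byte.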

// expWriter returns a traceWriter that writes into the current M's stream for
// the given experiment.
func (tl traceLocker) expWriter(exp tracev2.Experiment) traceWriter {
	return traceWriter{traceLocker: tl, traceBuf: tl.mp.trace.buf[tl.gen%2][exp], exp: exp}
}

// unsafeTraceExpWriter produces a traceWriter for experimental trace batches
// that doesn't lock the trace. Data written to experimental batches need not
// conform to the standard trace format.
//
// It should only be used in contexts where either:
//   - Another traceLocker is held.
//   - trace.gen is prevented from advancing.
//
// This does not have the same stack growth restrictions as traceLocker.writer.
//
// buf may be nil.
func unsafeTraceExpWriter(gen uintptr, buf *traceBuf, exp tracev2.Experiment) traceWriter {
	return traceWriter{traceLocker: traceLocker{gen: gen}, traceBuf: buf, exp: exp}
}

// traceBufQueue is a FIFO of traceBufs.
type traceBufQueue struct {
	head, tail *traceBuf
}

// push queues buf onto the queue of buffers.
func (q *traceBufQueue) push(buf *traceBuf) {
	buf.link = nil
	if q.head == nil {
		q.head = buf
	} else {
		q.tail.link = buf
	}
	q.tail = buf
}

// pop dequeues from the queue of buffers.
func (q *traceBufQueue) pop() *traceBuf {
	buf := q.head
	if buf == nil {
		return nil
	}
	q.head = buf.link
	if q.head == nil {
		q.tail = nil
	}
	buf.link = nil
	return buf
}

func (q *traceBufQueue) empty() bool {
	return q.head == nil
}

// traceBufHeader is the header of a per-M tracing buffer.
type traceBufHeader struct {
	link     *traceBuf // in trace.empty/full
	lastTime traceTime // when we wrote the last event
	pos      int       // next write offset in arr
	lenPos   int       // position of batch length value
}

// traceBuf is a per-M tracing buffer.
//
// TODO(mknyszek): Rename traceBuf to traceBatch, since they map 1:1 with event batches.
type traceBuf struct {
	_ sys.NotInHeap
	traceBufHeader
	arr [tracev2.MaxBatchSize - unsafe.Sizeof(traceBufHeader{})]byte // underlying buffer for trace data
}

// byte appends v to buf.
//
// nosplit because it's part of writing an event for an M, which must not
// have any stack growth.
//
//go:nosplit
func (buf *traceBuf) byte(v byte) {
	buf.arr[buf.pos] = v
	buf.pos++
}

// varint appends v to buf in little-endian-base-128 encoding.
//
// nosplit because it's part of writing an event for an M, which must not
// have any stack growth.
//
//go:nosplit
func (buf *traceBuf) varint(v uint64) {
	pos := buf.pos
	arr := buf.arr[pos : pos+traceBytesPerNumber]
	for i := range arr {
		if v < 0x80 {
			pos += i + 1
			arr[i] = byte(v)
			break
		}
		arr[i] = 0x80 | byte(v)
		v >>= 7
	}
	buf.pos = pos
}
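
// exampleAppendVarint is an illustrative sketch, not part of the original
// file, of the same little-endian base-128 encoding that varint uses,
// written against a plain byte slice so the format is easier to see in
// isolation. For example, 300 encodes as [0xac, 0x02], and any value below
// 0x80 encodes as a single byte.
func exampleAppendVarint(dst []byte, v uint64) []byte {
	for v >= 0x80 {
		dst = append(dst, 0x80|byte(v)) // low 7 bits with the continuation bit set
		v >>= 7
	}
	return append(dst, byte(v)) // final byte has the continuation bit clear
}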

// varintReserve reserves enough space in buf to hold any varint.
//
// Space reserved this way can be filled in with the varintAt method.
//
// nosplit because it's part of writing an event for an M, which must not
// have any stack growth.
//
//go:nosplit
func (buf *traceBuf) varintReserve() int {
	p := buf.pos
	buf.pos += traceBytesPerNumber
	return p
}

// stringData appends s's data directly to buf.
//
// nosplit because it's part of writing an event for an M, which must not
// have any stack growth.
//
//go:nosplit
func (buf *traceBuf) stringData(s string) {
	buf.pos += copy(buf.arr[buf.pos:], s)
}

// available reports whether buf has at least size bytes of space remaining.
//
// nosplit because it's part of writing an event for an M, which must not
// have any stack growth.
//
//go:nosplit
func (buf *traceBuf) available(size int) bool {
	return len(buf.arr)-buf.pos >= size
}

// varintAt writes varint v at byte position pos in buf. This always
// consumes traceBytesPerNumber bytes. This is intended for when the caller
// needs to reserve space for a varint but can't populate it until later.
// Use varintReserve to reserve this space.
//
// nosplit because it's part of writing an event for an M, which must not
// have any stack growth.
//
//go:nosplit
func (buf *traceBuf) varintAt(pos int, v uint64) {
	for i := 0; i < traceBytesPerNumber; i++ {
		if i < traceBytesPerNumber-1 {
			buf.arr[pos] = 0x80 | byte(v)
		} else {
			buf.arr[pos] = byte(v)
		}
		v >>= 7
		pos++
	}
	if v != 0 {
		throw("v could not fit in traceBytesPerNumber")
	}
}
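
// exampleReserveThenPatch is an illustrative sketch, not part of the
// original file, of the reserve-then-patch pattern that varintReserve and
// varintAt support: when a length prefix must be written before the data
// it describes, a fixed-width slot is reserved up front and filled in once
// the final size is known. The function name and the "payload" string are
// hypothetical; traceBufFlush below uses the same pattern for batch lengths.
func exampleReserveThenPatch(buf *traceBuf) {
	lenPos := buf.varintReserve() // reserve traceBytesPerNumber bytes for the length
	start := buf.pos
	buf.stringData("payload")                   // write data whose size isn't known up front
	buf.varintAt(lenPos, uint64(buf.pos-start)) // patch the length into the reserved slot
}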

// traceBufFlush flushes a trace buffer.
//
// Must run on the system stack because trace.lock must be held.
//
//go:systemstack
func traceBufFlush(buf *traceBuf, gen uintptr) {
	assertLockHeld(&trace.lock)

	// Write out the non-header length of the batch in the header.
	//
	// Note: the length of the header is not included to make it easier
	// to calculate this value when deserializing and reserializing the
	// trace. Varints can have additional padding of zero bits that is
	// quite difficult to preserve, and if we include the header we
	// force serializers to do more work. Nothing else actually needs
	// padding.
	buf.varintAt(buf.lenPos, uint64(buf.pos-(buf.lenPos+traceBytesPerNumber)))
	trace.full[gen%2].push(buf)

	// Notify the scheduler that there's work available and that the trace
	// reader should be scheduled.
	if !trace.workAvailable.Load() {
		trace.workAvailable.Store(true)
	}
}