diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go
index 2f3c609907..694b3e66cd 100644
--- a/src/runtime/cgocall.go
+++ b/src/runtime/cgocall.go
@@ -291,6 +291,13 @@ func cgocallbackg1(fn, frame unsafe.Pointer, ctxt uintptr) {
 		<-main_init_done
 	}
 
+	// Check whether the profiler needs to be turned on or off; this route to
+	// run Go code does not use runtime.execute, so bypasses the check there.
+	hz := sched.profilehz
+	if gp.m.profilehz != hz {
+		setThreadCPUProfiler(hz)
+	}
+
 	// Add entry to defer stack in case of panic.
 	restore := true
 	defer unwindm(&restore)
diff --git a/src/runtime/os_linux.go b/src/runtime/os_linux.go
index d307c472b8..a4646577cb 100644
--- a/src/runtime/os_linux.go
+++ b/src/runtime/os_linux.go
@@ -7,10 +7,21 @@ package runtime
 import (
 	"internal/abi"
 	"internal/goarch"
+	"runtime/internal/atomic"
 	"unsafe"
 )
 
-type mOS struct{}
+type mOS struct {
+	// profileTimer holds the ID of the POSIX interval timer for profiling CPU
+	// usage on this thread.
+	//
+	// It is valid when the profileTimerValid field is non-zero. A thread
+	// creates and manages its own timer, and these fields are read and written
+	// only by this thread. But because some of the reads on profileTimerValid
+	// are in signal handling code, access to that field uses atomic operations.
+	profileTimer      int32
+	profileTimerValid uint32
+}
 
 //go:noescape
 func futex(addr unsafe.Pointer, op int32, val uint32, ts, addr2 unsafe.Pointer, val3 uint32) int32
@@ -518,18 +529,136 @@ func signalM(mp *m, sig int) {
 	tgkill(getpid(), int(mp.procid), sig)
 }
 
+// go118UseTimerCreateProfiler enables the per-thread CPU profiler. Platforms
+// with support for SetCgoTraceback do some signal handling in assembly; do not
+// enable it for them until the changes to those code paths are in place.
+const go118UseTimerCreateProfiler = GOARCH != "amd64" && GOARCH != "ppc64le"
+
+// validSIGPROF compares this signal delivery's code against the signal sources
+// that the profiler uses, returning whether the delivery should be processed.
+// To be processed, a signal delivery from a known profiling mechanism should
+// correspond to the best profiling mechanism available to this thread. Signals
+// from other sources are always considered valid.
+//
+//go:nosplit
+func validSIGPROF(mp *m, c *sigctxt) bool {
+	code := int32(c.sigcode())
+	setitimer := code == _SI_KERNEL
+	timer_create := code == _SI_TIMER
+
+	if !(setitimer || timer_create) {
+		// The signal doesn't correspond to a profiling mechanism that the
+		// runtime enables itself. There's no reason to process it, but there's
+		// no reason to ignore it either.
+		return true
+	}
+
+	if mp == nil {
+		// Since we don't have an M, we can't check if there's an active
+		// per-thread timer for this thread. We don't know how long this thread
+		// has been around, and if it happened to interact with the Go scheduler
+		// at a time when profiling was active (causing it to have a per-thread
+		// timer). But it may have never interacted with the Go scheduler, or
+		// never while profiling was active. To avoid double-counting, process
+		// only signals from setitimer.
+		//
+		// When a custom cgo traceback function has been registered (on
+		// platforms that support runtime.SetCgoTraceback), SIGPROF signals
+		// delivered to a thread that cannot find a matching M do this check in
+		// the assembly implementations of runtime.cgoSigtramp.
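+		//
+		// In other words: with no M, a SIGPROF whose si_code reports a
+		// timer_create source is dropped here, while a delivery from the
+		// process-wide setitimer is still processed.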
+		return setitimer
+	}
+
+	// Having an M means the thread interacts with the Go scheduler, and we can
+	// check whether there's an active per-thread timer for this thread.
+	if atomic.Load(&mp.profileTimerValid) != 0 {
+		// If this M has its own per-thread CPU profiling interval timer, we
+		// should track the SIGPROF signals that come from that timer (for
+		// accurate reporting of its CPU usage; see issue 35057) and ignore any
+		// that it gets from the process-wide setitimer (to not over-count its
+		// CPU consumption).
+		return timer_create
+	}
+
+	// No active per-thread timer means the only valid profiler is setitimer.
+	return setitimer
+}
+
 func setProcessCPUProfiler(hz int32) {
 	setProcessCPUProfilerTimer(hz)
 }
 
 func setThreadCPUProfiler(hz int32) {
-	setThreadCPUProfilerHz(hz)
-}
+	mp := getg().m
+	mp.profilehz = hz
 
-//go:nosplit
-func validSIGPROF(mp *m, c *sigctxt) bool {
-	// Once the profiler for Linux uses timer_create, this function will need to
-	// do more than return true. But so long as the only profiling timer in use
-	// is setitimer, returning true is correct.
-	return true
+	if !go118UseTimerCreateProfiler {
+		return
+	}
+
+	// destroy any active timer
+	if atomic.Load(&mp.profileTimerValid) != 0 {
+		timerid := mp.profileTimer
+		atomic.Store(&mp.profileTimerValid, 0)
+		mp.profileTimer = 0
+
+		ret := timer_delete(timerid)
+		if ret != 0 {
+			print("runtime: failed to disable profiling timer; timer_delete(", timerid, ") errno=", -ret, "\n")
+			throw("timer_delete")
+		}
+	}
+
+	if hz == 0 {
+		// If the goal was to disable profiling for this thread, then the job's done.
+		return
+	}
+
+	// The period of the timer should be 1/Hz. For every "1/Hz" of additional
+	// work, the user should expect one additional sample in the profile.
+	//
+	// But to scale down to very small amounts of application work, to observe
+	// even CPU usage of "one tenth" of the requested period, set the initial
+	// timing delay in a different way: So that "one tenth" of a period of CPU
+	// spend shows up as a 10% chance of one sample (for an expected value of
+	// 0.1 samples), and so that "two and six tenths" periods of CPU spend show
+	// up as a 60% chance of 3 samples and a 40% chance of 2 samples (for an
+	// expected value of 2.6). Set the initial delay to a value in the uniform
+	// random distribution between 0 and the desired period. And because "0"
+	// means "disable timer", add 1 so the half-open interval [0,period) turns
+	// into (0,period].
+	//
+	// Otherwise, this would show up as a bias away from short-lived threads and
+	// from threads that are only occasionally active: for example, when the
+	// garbage collector runs on a mostly-idle system, the additional threads it
+	// activates may do a couple milliseconds of GC-related work and nothing
+	// else in the few seconds that the profiler observes.
+	spec := new(itimerspec)
+	spec.it_value.setNsec(1 + int64(fastrandn(uint32(1e9/hz))))
+	spec.it_interval.setNsec(1e9 / int64(hz))
+
+	var timerid int32
+	sevp := &sigevent{
+		notify:                 _SIGEV_THREAD_ID,
+		signo:                  _SIGPROF,
+		sigev_notify_thread_id: int32(mp.procid),
+	}
+	ret := timer_create(_CLOCK_THREAD_CPUTIME_ID, sevp, &timerid)
+	if ret != 0 {
+		// If we cannot create a timer for this M, leave profileTimerValid false
+		// to fall back to the process-wide setitimer profiler.
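+		// (timer_create can fail, for example, when the kernel cannot
+		// allocate a timer; see EAGAIN and ENOMEM in timer_create(2).)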
+		return
+	}
+
+	ret = timer_settime(timerid, 0, spec, nil)
+	if ret != 0 {
+		print("runtime: failed to configure profiling timer; timer_settime(", timerid,
+			", 0, {interval: {",
+			spec.it_interval.tv_sec, "s + ", spec.it_interval.tv_nsec, "ns} value: {",
+			spec.it_value.tv_sec, "s + ", spec.it_value.tv_nsec, "ns}}, nil) errno=", -ret, "\n")
+		throw("timer_settime")
+	}
+
+	mp.profileTimer = timerid
+	atomic.Store(&mp.profileTimerValid, 1)
 }
diff --git a/src/runtime/pprof/pprof_test.go b/src/runtime/pprof/pprof_test.go
index 5b9ad50746..8c34d39ad5 100644
--- a/src/runtime/pprof/pprof_test.go
+++ b/src/runtime/pprof/pprof_test.go
@@ -79,6 +79,10 @@ func cpuHog2(x int) int {
 	return foo
 }
 
+func cpuHog3(x int) int {
+	return cpuHog0(x, 1e5)
+}
+
 // Return a list of functions that we don't want to ever appear in CPU
 // profiles. For gccgo, that list includes the sigprof handler itself.
 func avoidFunctions() []string {
@@ -107,6 +111,93 @@ func TestCPUProfileMultithreaded(t *testing.T) {
 	})
 }
 
+func TestCPUProfileMultithreadMagnitude(t *testing.T) {
+	if runtime.GOOS != "linux" {
+		t.Skip("issue 35057 is only confirmed on Linux")
+	}
+
+	// Run a workload in a single goroutine, then run copies of the same
+	// workload in several goroutines. For both the serial and parallel cases,
+	// the CPU time the process measures with its own profiler should match the
+	// total CPU usage that the OS reports.
+	//
+	// We could also check that increases in parallelism (GOMAXPROCS) lead to a
+	// linear increase in the CPU usage reported by both the OS and the
+	// profiler, but without a guarantee of exclusive access to CPU resources
+	// that is likely to be a flaky test.
+
+	// Require the smaller value to be within 10%, or 40% in short mode.
+	maxDiff := 0.10
+	if testing.Short() {
+		maxDiff = 0.40
+	}
+
+	parallelism := runtime.GOMAXPROCS(0)
+
+	var cpuTime1, cpuTimeN time.Duration
+	p := testCPUProfile(t, stackContains, []string{"runtime/pprof.cpuHog1", "runtime/pprof.cpuHog3"}, avoidFunctions(), func(dur time.Duration) {
+		cpuTime1 = diffCPUTime(t, func() {
+			// Consume CPU in one goroutine
+			cpuHogger(cpuHog1, &salt1, dur)
+		})
+
+		cpuTimeN = diffCPUTime(t, func() {
+			// Next, consume CPU in several goroutines
+			var wg sync.WaitGroup
+			var once sync.Once
+			for i := 0; i < parallelism; i++ {
+				wg.Add(1)
+				go func() {
+					defer wg.Done()
+					var salt = 0
+					cpuHogger(cpuHog3, &salt, dur)
+					once.Do(func() { salt1 = salt })
+				}()
+			}
+			wg.Wait()
+		})
+	})
+
+	for i, unit := range []string{"count", "nanoseconds"} {
+		if have, want := p.SampleType[i].Unit, unit; have != want {
+			t.Errorf("pN SampleType[%d]; %q != %q", i, have, want)
+		}
+	}
+
+	var value1, valueN time.Duration
+	for _, sample := range p.Sample {
+		if stackContains("runtime/pprof.cpuHog1", uintptr(sample.Value[0]), sample.Location, sample.Label) {
+			value1 += time.Duration(sample.Value[1]) * time.Nanosecond
+		}
+		if stackContains("runtime/pprof.cpuHog3", uintptr(sample.Value[0]), sample.Location, sample.Label) {
+			valueN += time.Duration(sample.Value[1]) * time.Nanosecond
+		}
+	}
+
+	compare := func(a, b time.Duration, maxDiff float64) func(*testing.T) {
+		return func(t *testing.T) {
+			t.Logf("compare %s vs %s", a, b)
+			if a <= 0 || b <= 0 {
+				t.Errorf("Expected both time reports to be positive")
+				return
+			}
+
+			if a < b {
+				a, b = b, a
+			}
+
+			diff := float64(a-b) / float64(a)
+			if diff > maxDiff {
+				t.Errorf("CPU usage reports are too different (limit -%.1f%%, got -%.1f%%)", maxDiff*100, diff*100)
+			}
+		}
+	}
+
+	// check that the OS's perspective matches what the Go runtime measures
+	t.Run("serial execution OS vs pprof", compare(cpuTime1, value1, maxDiff))
+	t.Run("parallel execution OS vs pprof", compare(cpuTimeN, valueN, maxDiff))
+}
+
 // containsInlinedCall reports whether the function body for the function f is
 // known to contain an inlined function call within the first maxBytes bytes.
 func containsInlinedCall(f interface{}, maxBytes int) bool {
@@ -350,6 +441,16 @@ func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []stri
 	return nil
 }
 
+var diffCPUTimeImpl func(f func()) time.Duration
+
+func diffCPUTime(t *testing.T, f func()) time.Duration {
+	if fn := diffCPUTimeImpl; fn != nil {
+		return fn(f)
+	}
+	t.Fatalf("cannot measure CPU time on GOOS=%s GOARCH=%s", runtime.GOOS, runtime.GOARCH)
+	return 0
+}
+
 func contains(slice []string, s string) bool {
 	for i := range slice {
 		if slice[i] == s {
diff --git a/src/runtime/pprof/rusage_test.go b/src/runtime/pprof/rusage_test.go
new file mode 100644
index 0000000000..66b1b3bbe1
--- /dev/null
+++ b/src/runtime/pprof/rusage_test.go
@@ -0,0 +1,40 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build darwin || freebsd || linux || netbsd || openbsd
+// +build darwin freebsd linux netbsd openbsd
+
+package pprof
+
+import (
+	"syscall"
+	"time"
+)
+
+func init() {
+	diffCPUTimeImpl = diffCPUTimeRUsage
+}
+
+func diffCPUTimeRUsage(f func()) time.Duration {
+	ok := true
+	var before, after syscall.Rusage
+
+	err := syscall.Getrusage(syscall.RUSAGE_SELF, &before)
+	if err != nil {
+		ok = false
+	}
+
+	f()
+
+	err = syscall.Getrusage(syscall.RUSAGE_SELF, &after)
+	if err != nil {
+		ok = false
+	}
+
+	if !ok {
+		return 0
+	}
+
+	return time.Duration((after.Utime.Nano() + after.Stime.Nano()) - (before.Utime.Nano() + before.Stime.Nano()))
+}
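
As a quick check of the sampling argument in the setThreadCPUProfiler comment above: drawing the timer's first expiry uniformly from (0, period] makes the expected number of samples for a CPU burst of length t equal to t/period, that is 0.1 samples for a tenth of a period and 2.6 samples for two and six tenths of a period. The standalone Go program below is a minimal sketch, separate from the diff itself, that checks this expectation by simulation; the package name, period, and trial count are illustrative only.

package main

import (
	"fmt"
	"math/rand"
)

func main() {
	const period = 10.0 // profiling period in milliseconds, i.e. hz = 100
	const trials = 1_000_000
	// "one tenth" and "two and six tenths" of a period, as in the runtime comment.
	for _, cpu := range []float64{1.0, 26.0} {
		total := 0
		for i := 0; i < trials; i++ {
			// First expiry is uniform in (0, period]; later expiries fire every period.
			first := period * (1 - rand.Float64())
			samples := 0
			for t := first; t <= cpu; t += period {
				samples++
			}
			total += samples
		}
		fmt.Printf("cpu=%.0fms period=%.0fms expected=%.1f simulated=%.3f\n",
			cpu, period, cpu/period, float64(total)/trials)
	}
}

With a large trial count the simulated averages should land near 0.1 and 2.6, the expected values named in the comment.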